USER-INTEL upgrade from M Brown
[Binary image file updated (14 KiB before and after) - not shown]
@@ -30,8 +30,8 @@ Dihedral Styles: charmm, harmonic, opls :l
Fixes: nve, npt, nvt, nvt/sllod :l
Improper Styles: cvff, harmonic :l
Pair Styles: buck/coul/cut, buck/coul/long, buck, eam, gayberne,
-charmm/coul/long, lj/cut, lj/cut/coul/long, sw, tersoff :l
+charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, sw, tersoff :l
-K-Space Styles: pppm :l
+K-Space Styles: pppm, pppm/disp :l
:ule

[Speed-ups to expect:]
@@ -42,62 +42,88 @@ precision mode. Performance improvements are shown compared to
LAMMPS {without using other acceleration packages} as these are
under active development (and subject to performance changes). The
measurements were performed using the input files available in
-the src/USER-INTEL/TEST directory. These are scalable in size; the
-results given are with 512K particles (524K for Liquid Crystal).
-Most of the simulations are standard LAMMPS benchmarks (indicated
-by the filename extension in parenthesis) with modifications to the
-run length and to add a warmup run (for use with offload
-benchmarks).
+the src/USER-INTEL/TEST directory with the provided run script.
+These are scalable in size; the results given are with 512K
+particles (524K for Liquid Crystal). Most of the simulations are
+standard LAMMPS benchmarks (indicated by the filename extension in
+parentheses) with modifications to the run length and to add a
+warmup run (for use with offload benchmarks).

:c,image(JPG/user_intel.png)

Results are speedups obtained on Intel Xeon E5-2697v4 processors
(code-named Broadwell) and Intel Xeon Phi 7250 processors
-(code-named Knights Landing) with "18 Jun 2016" LAMMPS built with
-Intel Parallel Studio 2016 update 3. Results are with 1 MPI task
+(code-named Knights Landing) with "June 2017" LAMMPS built with
+Intel Parallel Studio 2017 update 2. Results are with 1 MPI task
per physical core. See {src/USER-INTEL/TEST/README} for the raw
simulation rates and instructions to reproduce.

:line
+[Accuracy and order of operations:]
+
+In most molecular dynamics software, parallelization parameters
+(# of MPI, OpenMP, and vectorization) can change the results due
+to changing the order of operations with finite-precision
+calculations. The USER-INTEL package is deterministic. This means
+that the results should be reproducible from run to run with the
+{same} parallel configurations and when using deterministic
+libraries or library settings (MPI, OpenMP, FFT). However, there
+are differences in the USER-INTEL package that can change the
+order of operations compared to LAMMPS without acceleration:
+
+Neighbor lists can be created in a different order :ulb,l
+Bins used for sorting atoms can be oriented differently :l
+The default stencil order for PPPM is 7. By default, LAMMPS will
+calculate other PPPM parameters to fit the desired accuracy with
+this order :l
+The {newton} setting applies to all atoms, not just atoms shared
+between MPI tasks :l
+Vectorization can change the order for adding pairwise forces :l
+:ule
+
+The precision mode (described below) used with the USER-INTEL
+package can change the {accuracy} of the calculations. For the
+default {mixed} precision option, calculations between pairs or
+triplets of atoms are performed in single precision, intended to
+be within the inherent error of MD simulations. All accumulation
+is performed in double precision to prevent the error from growing
+with the number of atoms in the simulation. {Single} precision
+mode should not be used without appropriate validation.
+
+:line
[Quick Start for Experienced Users:]

LAMMPS should be built with the USER-INTEL package installed.
Simulations should be run with 1 MPI task per physical {core},
not {hardware thread}.

-For Intel Xeon CPUs:

Edit src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi as necessary. :ulb,l
-If using {kspace_style pppm} in the input script, add "neigh_modify binsize cutoff" and "kspace_modify diff ad" to the input script for better
-performance. Cutoff should be roughly the neighbor list cutoff. By
-default the binsize is half the neighbor list cutoff. :l
-"-pk intel 0 omp 2 -sf intel" added to LAMMPS command-line :l
+Set the environment variable KMP_BLOCKTIME=0 :l
+"-pk intel 0 omp $t -sf intel" added to LAMMPS command-line :l
+$t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l
+For some of the simple 2-body potentials without long-range
+electrostatics, performance and scalability can be better with
+the "newton off" setting added to the input script :l
+If using {kspace_style pppm} in the input script, add
+"kspace_modify diff ad" for better performance :l
:ule
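A possible launch line for the CPU case above (an illustrative sketch only: the binary name lmp_intel_cpu_intelmpi, the 28-core node, and the input file name in.script are assumptions; the package flags come from the list above):

export KMP_BLOCKTIME=0
mpirun -np 28 ./lmp_intel_cpu_intelmpi -in in.script -pk intel 0 omp 2 -sf intel :pre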

-For Intel Xeon Phi CPUs for simulations without {kspace_style
-pppm} in the input script :
+For Intel Xeon Phi CPUs:

-Edit src/MAKE/OPTIONS/Makefile.knl as necessary. :ulb,l
-Runs should be performed using MCDRAM. :l
-"-pk intel 0 omp 2 -sf intel" {or} "-pk intel 0 omp 4 -sf intel"
-should be added to the LAMMPS command-line. Choice for best
-performance will depend on the simulation. :l
+Runs should be performed using MCDRAM. :ulb,l
:ule

-For Intel Xeon Phi CPUs for simulations with {kspace_style
-pppm} in the input script:
+For simulations using {kspace_style pppm} on Intel CPUs
+supporting AVX-512:

-Edit src/MAKE/OPTIONS/Makefile.knl as necessary. :ulb,l
-Runs should be performed using MCDRAM. :l
-Add "neigh_modify binsize 3" to the input script for better
-performance. :l
-Add "kspace_modify diff ad" to the input script for better
-performance. :l
-export KMP_AFFINITY=none :l
-"-pk intel 0 omp 3 lrt yes -sf intel" or "-pk intel 0 omp 1 lrt yes
--sf intel" added to LAMMPS command-line. Choice for best performance
-will depend on the simulation. :l
+Add "kspace_modify diff ad" to the input script :ulb,l
+The command-line option should be changed to
+"-pk intel 0 omp $r lrt yes -sf intel" where $r is the number of
+threads minus 1. :l
+Do not use thread affinity (set KMP_AFFINITY=none) :l
+The "newton off" setting may provide better scalability :l
:ule
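An illustrative launch for the AVX-512/PPPM case above (assumptions: a Xeon Phi 7250 node, 64 MPI tasks, 4 hardware threads per core so $r = 3, a binary named lmp_knl built from Makefile.knl, and an input that uses kspace_style pppm):

export KMP_AFFINITY=none
mpirun -np 64 ./lmp_knl -in in.script -pk intel 0 omp 3 lrt yes -sf intel :pre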

For Intel Xeon Phi coprocessors (Offload):
@@ -169,6 +195,10 @@ cat /proc/cpuinfo :pre

[Building LAMMPS with the USER-INTEL package:]

+NOTE: See the src/USER-INTEL/README file for additional flags that
+might be needed for best performance on Intel server processors
+code-named "Skylake".
+
The USER-INTEL package must be installed into the source directory:

make yes-user-intel :pre
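A sketch of the corresponding build commands (the target name intel_cpu_intelmpi follows the Makefile named above; the -j value is arbitrary):

cd src
make yes-user-intel
make -j 8 intel_cpu_intelmpi :pre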
@@ -322,8 +352,8 @@ follow in the input script.

NOTE: The USER-INTEL package will perform better with modifications
to the input script when "PPPM"_kspace_style.html is used:
-"kspace_modify diff ad"_kspace_modify.html and "neigh_modify binsize
-3"_neigh_modify.html should be added to the input script.
+"kspace_modify diff ad"_kspace_modify.html should be added to the
+input script.

Long-Range Thread (LRT) mode is an option to the "package
intel"_package.html command that can improve performance when using
@@ -342,6 +372,10 @@ would normally perform best with "-pk intel 0 omp 4", instead use
environment variable "KMP_AFFINITY=none". LRT mode is not supported
when using offload.
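For illustration, a minimal input fragment matching that NOTE (the cutoff and accuracy values are placeholders):

pair_style lj/cut/coul/long 10.0
kspace_style pppm 1.0e-4
kspace_modify diff ad :pre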

+NOTE: Changing the "newton"_newton.html setting to off can improve
+performance and/or scalability for simple 2-body potentials such as
+lj/cut or when using LRT mode on processors supporting AVX-512.
+
Not all styles are supported in the USER-INTEL package. You can mix
the USER-INTEL package with styles from the "OPT"_accelerate_opt.html
package or the "USER-OMP package"_accelerate_omp.html. Of course,
@@ -467,7 +501,7 @@ supported.

Brown, W.M., Carrillo, J.-M.Y., Mishra, B., Gavhane, N., Thakker, F.M., De Kraker, A.R., Yamada, M., Ang, J.A., Plimpton, S.J., "Optimizing Classical Molecular Dynamics in LAMMPS," in Intel Xeon Phi Processor High Performance Programming: Knights Landing Edition, J. Jeffers, J. Reinders, A. Sodani, Eds. Morgan Kaufmann. :ulb,l

-Brown, W. M., Semin, A., Hebenstreit, M., Khvostov, S., Raman, K., Plimpton, S.J. Increasing Molecular Dynamics Simulation Rates with an 8-Fold Increase in Electrical Power Efficiency. 2016 International Conference for High Performance Computing. In press. :l
+Brown, W. M., Semin, A., Hebenstreit, M., Khvostov, S., Raman, K., Plimpton, S.J. "Increasing Molecular Dynamics Simulation Rates with an 8-Fold Increase in Electrical Power Efficiency."_http://dl.acm.org/citation.cfm?id=3014915 2016 High Performance Computing, Networking, Storage and Analysis, SC16: International Conference (pp. 82-95). :l

Brown, W.M., Carrillo, J.-M.Y., Gavhane, N., Thakkar, F.M., Plimpton, S.J. Optimizing Legacy Molecular Dynamics Software with Directive-Based Offload. Computer Physics Communications. 2015. 195: p. 95-101. :l
:ule
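A sketch of how the newton NOTE can be combined with the package settings inside an input script instead of on the command line (thread count and the LRT choice are illustrative):

package intel 0 omp 2 lrt yes
suffix intel
newton off :pre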
@@ -14,152 +14,178 @@ fix ID group-ID neb Kspring keyword value :pre

ID, group-ID are documented in "fix"_fix.html command :ulb,l
neb = style name of this fix command :l
-Kspring = parallel spring constant (force/distance units or force units) :l
+Kspring = parallel spring constant (force/distance units or force units, see nudge keyword) :l
zero or more keyword/value pairs may be appended :l
-keyword = {nudg_style} or {perp} or {freend} or {freend_k_spring} :l
+keyword = {nudge} or {perp} or {end} :l
-  {nudg_style} value = {neigh} or {idealpos}
-    {neigh} = the parallel nudging force is calculated from the distances to neighbouring replicas (in this case, Kspring is in force/distance units)
-    {idealpos} = the parallel nudging force is proportional to the distance between the replica and its interpolated ideal position (in this case Kspring is in force units)
-  {perp} value {none} or kspring2
-    {none} = no perpendicular spring force is applied
-    {kspring2} = spring constant for the perpendicular nudging force (in force/distance units)
-  {freeend} value = {none} or {ini} or {final} or {finaleini} or {final2eini}
-    {none} = no nudging force is applied to the first and last replicas
-    {ini} = set the first replica to be a free end
-    {final} = set the last replica to be a free end
-    {finaleini} = set the last replica to be a free end and set its target energy as that of the first replica
-    {final2eini} = same as {finaleini} plus prevent intermediate replicas to have a lower energy than the first replica
-  {freeend_kspring} value = kspring3
-    kspring3 = spring constant of the perpendicular spring force (per distance units) :pre
+  {nudge} value = {neigh} or {ideal}
+    {neigh} = parallel nudging force based on distance to neighbor replicas (Kspring = force/distance units)
+    {ideal} = parallel nudging force based on interpolated ideal position (Kspring = force units)
+  {perp} value = {Kspring2}
+    {Kspring2} = spring constant for perpendicular nudging force (force/distance units)
+  {end} values = estyle Kspring3
+    {estyle} = {first} or {last} or {last/efirst} or {last/efirst/middle}
+    {first} = apply force to first replica
+    {last} = apply force to last replica
+    {last/efirst} = apply force to last replica and set its target energy to that of first replica
+    {last/efirst/middle} = same as {last/efirst} plus prevent middle replicas having lower energy than first replica
+    {Kspring3} = spring constant for target energy term (1/distance units) :pre

[Examples:]

fix 1 active neb 10.0
-fix 2 all neb 1.0 perp 1.0 freeend final
-fix 1 all neb 1.0 nudg_style idealpos freeend final2eini freend_kspring 1:pre
+fix 2 all neb 1.0 perp 1.0 end last
+fix 2 all neb 1.0 perp 1.0 end first end last
+fix 1 all neb 1.0 nudge ideal end last/efirst 1 :pre
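An additional illustrative command (values are placeholders) showing that, per the syntax above, each {end} keyword takes both an estyle and a Kspring3 value, and that {nudge} and {perp} can be combined with it:

fix 2 all neb 1.0 nudge ideal perp 1.0 end first 1.0 end last/efirst 1.0 :pre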

[Description:]

-Add a nudging force to atoms in the group for a multi-replica
+Add nudging forces to atoms in the group for a multi-replica
simulation run via the "neb"_neb.html command to perform a nudged
elastic band (NEB) calculation for finding the transition state.
High-level explanations of NEB are given with the "neb"_neb.html command
and in "Section_howto 5"_Section_howto.html#howto_5 of the manual.
The fix neb command must be used with the "neb" command and defines
-how nudging inter-replica forces are computed. A NEB calculation is
+how inter-replica nudging forces are computed. A NEB calculation is
divided into two stages. In the first stage n replicas are relaxed
-toward a MEP and in a second stage, the climbing image scheme (see
-"(Henkelman2)"_#Henkelman2) is turned on so that the replica having
-the highest energy relaxes toward the saddle point (i.e. the point of
-highest energy along the MEP).
+toward a MEP until convergence. In the second stage, the climbing
+image scheme (see "(Henkelman2)"_#Henkelman2) is enabled, so that the
+replica having the highest energy relaxes toward the saddle point
+(i.e. the point of highest energy along the MEP), and a second
+relaxation is performed.

-One purpose of the nudging forces is to keep the replicas equally
-spaced. During the NEB, the 3N-length vector of interatomic force Fi
-= -Grad(V) of replicas i is altered. For all intermediate replicas
-(i.e. for 1<i<n) but the climbing replica the force vector
-becomes:
+A key purpose of the nudging forces is to keep the replicas equally
+spaced. During the NEB calculation, the 3N-length vector of
+interatomic force Fi = -Grad(V) for each replica I is altered. For
+all intermediate replicas (i.e. for 1 < I < N, except the climbing
+replica) the force vector becomes:

-Fi = -Grad(V) + (Grad(V) dot That) That + Fnudgparallel + Fspringperp :pre
+Fi = -Grad(V) + (Grad(V) dot T') T' + Fnudge_parallel + Fspring_perp :pre
-That is the unit "tangent" vector for replica i and is a function of
-Ri, Ri-1, Ri+1, and the potential energy of the 3 replicas; it points
-roughly in the direction of (Ri+i - Ri-1) (see the
-"(Henkelman1)"_#Henkelman1 paper for details). Ri are the atomic
-coordinates of replica i; Ri-1 and Ri+1 are the coordinates of its
-neighbor replicas. The term (Grad(V) dot That) is used to remove the
+T' is the unit "tangent" vector for replica I and is a function of Ri,
+Ri-1, Ri+1, and the potential energy of the 3 replicas; it points
+roughly in the direction of (Ri+1 - Ri-1); see the
+"(Henkelman1)"_#Henkelman1 paper for details. Ri are the atomic
+coordinates of replica I; Ri-1 and Ri+1 are the coordinates of its
+neighbor replicas. The term (Grad(V) dot T') is used to remove the
component of the gradient parallel to the path which would tend to
-distribute the replica unevenly along the path. Fnudgparallel is an
-artificial nudging force which is applied only in the tangent direction
-and which maintains the replicas equally spaced (see below for more
-information). Fspringperp is an optinal artificial spring which is
-applied only perpendicular to the tangent and which prevent the paths
-from forming too acute kinks (see below for more information).
+distribute the replicas unevenly along the path. Fnudge_parallel is an
+artificial nudging force which is applied only in the tangent
+direction and which maintains the equal spacing between replicas (see
+below for more information). Fspring_perp is an optional artificial
+spring which is applied only perpendicular to the tangent and which
+prevents the paths from forming acute kinks (see below for more
+information).
-The keyword {nudg_style} allow to specify how to parallel
-nudging force is computed. With a value of idealpos, the spring
-force is computed as suggested in "(E)"_#E :
+In the second stage of the NEB calculation, the interatomic force Fi
+for the climbing replica (the replica of highest energy after the
+first stage) is changed to:

-Fnudgparallel=-{Kspring}* (RD-RDideal)/(2 meanDist) :pre
+Fi = -Grad(V) + 2 (Grad(V) dot T') T' :pre

+and the relaxation procedure is continued to a new converged MEP.
+
+:line
+
+The keyword {nudge} specifies how the parallel nudging force is
+computed. With a value of {neigh}, the parallel nudging force is
+computed as in "(Henkelman1)"_#Henkelman1 by connecting each
+intermediate replica with the previous and the next image:
+
+Fnudge_parallel = {Kspring} * (|Ri+1 - Ri| - |Ri - Ri-1|) :pre
+
+Note that in this case the specified {Kspring} is in force/distance
+units.
+
+With a value of {ideal}, the spring force is computed as suggested in
+"(WeinenE)"_#WeinenE :
+
+Fnudge_parallel = -{Kspring} * (RD-RDideal) / (2 * meanDist) :pre

where RD is the "reaction coordinate" (see "neb"_neb.html), and
-RDideal is the ideal RD for which all the images are equally spaced
-(i.e. RDideal = (i-1)*meanDist when the climbing image is off, where i
-is the replica number). The meanDist is the average distance between
-replicas.
+RDideal is the ideal RD for which all the images are equally spaced
+(i.e. RDideal = (I-1)*meanDist when the climbing replica is off, where
+I is the replica number). The meanDist is the average distance
+between replicas. Note that in this case the specified {Kspring} is
+in force units.

-When {nudg_style} has a value of neigh (or by default), the parallel
-nudging force is computed as in "(Henkelman1)"_#Henkelman1 by
-connecting each intermediate replica with the previous and the next
-image:
-
-Fnudgparallel= {Kspring}* (|Ri+1 - Ri| - |Ri - Ri-1|) :pre
-
-The parallel nudging force associated with the key word idealpos should
-usually be more efficient at keeping the images equally spaced.
+Note that the {ideal} form of nudging can often be more effective at
+keeping the replicas equally spaced.

:line
-The keyword {perp} allows to add a spring force perpendicular to the
-path in order to prevent the path from becoming too kinky. It can
-improve significantly the convergence of the NEB when the resolution
-is poor (i.e. when too few images are used) (see "(Maras)"_#Maras1).
+The keyword {perp} adds a spring force perpendicular to the path in
+order to prevent the path from becoming too kinky, with a magnitude
+set by {Kspring2}. It can significantly improve the convergence of
+the NEB calculation when the resolution is poor (i.e. when too few
+replicas are used); see "(Maras)"_#Maras1 for details.

The perpendicular spring force is given by

-Fspringperp = {Kspringperp} * f(Ri-1,Ri,Ri+1) (Ri+1 + Ri-1 - 2 Ri) :pre
+Fspring_perp = {Kspring2} * F(Ri-1,Ri,Ri+1) (Ri+1 + Ri-1 - 2 Ri) :pre

-f(Ri-1 Ri R+1) is a smooth scalar function of the angle Ri-1 Ri
-Ri+1. It is equal to 0 when the path is straight and is equal to 1
-when the angle Ri-1 Ri Ri+1 is accute. f(Ri-1 Ri R+1) is defined in
-"(Jonsson)"_#Jonsson
+where {Kspring2} is the specified value. F(Ri-1,Ri,Ri+1) is a smooth
+scalar function of the angle Ri-1 Ri Ri+1. It is equal to 0.0 when
+the path is straight and is equal to 1 when the angle Ri-1 Ri Ri+1 is
+acute. F(Ri-1,Ri,Ri+1) is defined in "(Jonsson)"_#Jonsson.

+If {Kspring2} is set to 0.0 (the default) then no perpendicular spring
+force is added.

:line

-By default, the force acting on the first and last replicas is not
-altered so that during the NEB relaxation, these ending replicas relax
-toward local minima. However it is possible to use the key word
-{freeend} to allow either the initial or the final replica to relax
-toward a MEP while constraining its energy. The interatomic force Fi
-for the free end image becomes :
+By default, no nudging forces act on the first and last replicas during
+the NEB relaxation, so these replicas simply relax toward their
+respective local minima. By using the keyword {end}, additional forces
+can be applied to the first or last replica, to enable them to relax
+toward a MEP while constraining their energy.

-Fi = -Grad(V)+ (Grad(V) dot That + (E-ETarget)*kspring3) That, {when} Grad(V) dot That < 0
-Fi = -Grad(V)+ (Grad(V) dot That + (ETarget- E)*kspring3) That, {when} Grad(V) dot That > 0
+The interatomic force Fi for the specified replica becomes:
+
+Fi = -Grad(V) + (Grad(V) dot T' + (E-ETarget)*Kspring3) T', {when} Grad(V) dot T' < 0
+Fi = -Grad(V) + (Grad(V) dot T' + (ETarget- E)*Kspring3) T', {when} Grad(V) dot T' > 0
:pre

-where E is the energy of the free end replica and ETarget is the
-target energy.
+where E is the current energy of the replica and ETarget is the target
+energy. The "spring" constant on the difference in energies is the
+specified {Kspring3} value.

-When the value {ini} ({final}) is used after the keyword {freeend},
-the first (last) replica is considered as a free end. The target
-energy is set to the energy of the replica at starting of the NEB
-calculation. When the value {finaleini} or {final2eini} is used the
-last image is considered as a free end and the target energy is equal
-to the energy of the first replica (which can evolve during the NEB
-relaxation). With the value {finaleini}, when the initial path is too
-far from the MEP, an intermediate replica might relax "faster" and
-get a lower energy than the last replica. The benefit of the free end
-is then lost since this intermediate replica will relax toward a local
-minima. This behavior can be prevented by using the value {final2eini}
-which remove entirely the contribution of the gradient for all
-intermediate replica which have a lower energy than the initial one
-thus preventing these replicae to over-relax. After converging a NEB
-with the {final2eini} value it is recommended to check that all
-intermediate replica have a larger energy than the initial
-replica. Finally note that if the last replica converges toward a
-local minimum with a larger energy than the energy of the first
-replica, a free end neb calculation with the value {finaleini} or
-{final2eini} cannot reach the convergence criteria.
-
-:line
-
-In the second stage of the NEB, the interatomic force Fi for the
-climbing replica (which is the replica of highest energy) becomes:
-
-Fi = -Grad(V) + 2 (Grad(V) dot That) That :pre
+When {estyle} is specified as {first}, the force is applied to the
+first replica. When {estyle} is specified as {last}, the force is
+applied to the last replica. Note that the {end} keyword can be used
+twice to add forces to both the first and last replicas.
+
+For both these {estyle} settings, the target energy {ETarget} is set
+to the initial energy of the replica (at the start of the NEB
+calculation).
+
+If the {estyle} is specified as {last/efirst} or {last/efirst/middle},
+force is applied to the last replica, but the target energy {ETarget}
+is continuously set to the energy of the first replica, as it evolves
+during the NEB relaxation.
+
+The difference between these two {estyle} options is as follows. When
+{estyle} is specified as {last/efirst}, no change is made to the
+inter-replica force applied to the intermediate replicas (neither
+first nor last). If the initial path is too far from the MEP, an
+intermediate replica may relax "faster" and reach a lower energy than
+the last replica. In this case the intermediate replica will be
+relaxing toward its own local minimum. This behavior can be prevented
+by specifying {estyle} as {last/efirst/middle} which will alter the
+inter-replica force applied to intermediate replicas by removing the
+contribution of the gradient to the inter-replica force. This will
+only be done if a particular intermediate replica has a lower energy
+than the first replica. This should effectively prevent the
+intermediate replicas from over-relaxing.
+
+After converging a NEB calculation using an {estyle} of {last/efirst},
+you should check that all intermediate replicas have a larger energy
+than the first replica. If not, then repeat the calculation with an
+{estyle} of {last/efirst/middle}.
+
+Finally, note that if the last replica converges toward a local
+minimum which has a larger energy than the energy of the first
+replica, a NEB calculation using an {estyle} of {last/efirst} or
+{last/efirst/middle} cannot reach final convergence.
[Restart, fix_modify, output, run start/stop, minimize info:]

@@ -186,7 +212,8 @@ for more info on packages.

[Default:]

-The option defaults are nudg_style = neigh, perp = none, freeend = none and freend_kspring = 1.
+The option defaults are nudge = neigh, perp = 0.0, end is not
+specified (no inter-replica force on the end replicas).
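Illustrative only: with these defaults, the bare command and a fully spelled-out equivalent are

fix 1 all neb 1.0
fix 1 all neb 1.0 nudge neigh perp 0.0 :pre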
:line

@@ -197,14 +224,14 @@ The option defaults are nudg_style = neigh, perp = none, freeend = none and free
[(Henkelman2)] Henkelman, Uberuaga, Jonsson, J Chem Phys, 113,
9901-9904 (2000).

-:link(E)
-[(E)] E, Ren, Vanden-Eijnden, Phys Rev B, 66, 052301 (2002)
+:link(WeinenE)
+[(WeinenE)] E, Ren, Vanden-Eijnden, Phys Rev B, 66, 052301 (2002).

:link(Jonsson)
[(Jonsson)] Jonsson, Mills and Jacobsen, in Classical and Quantum
-Dynamics in Condensed Phase Simulations, edited by Berne, Ciccotti, and Coker
-World Scientific, Singapore, 1998, p. 385
+Dynamics in Condensed Phase Simulations, edited by Berne, Ciccotti,
+and Coker, World Scientific, Singapore, 1998, p 385.

:link(Maras1)
[(Maras)] Maras, Trushin, Stukowski, Ala-Nissila, Jonsson,
-Comp Phys Comm, 205, 13-21 (2016)
+Comp Phys Comm, 205, 13-21 (2016).
@@ -308,7 +308,8 @@ The option defaults are mesh = mesh/disp = 0 0 0, order = order/disp =
gewald = gewald/disp = 0.0, slab = 1.0, compute = yes, cutoff/adjust =
yes (MSM), pressure/scalar = yes (MSM), fftbench = yes (PPPM), diff = ik
(PPPM), mix/disp = pair, force/disp/real = -1.0, force/disp/kspace = -1.0,
-split = 0, tol = 1.0e-6, and disp/auto = no.
+split = 0, tol = 1.0e-6, and disp/auto = no. For pppm/intel, order =
+order/disp = 7.

:line
@@ -33,12 +33,16 @@ style = {none} or {ewald} or {ewald/disp} or {ewald/omp} or {pppm} or {pppm/cg}
    accuracy = desired relative error in forces
  {pppm/gpu} value = accuracy
    accuracy = desired relative error in forces
+  {pppm/intel} value = accuracy
+    accuracy = desired relative error in forces
  {pppm/kk} value = accuracy
    accuracy = desired relative error in forces
  {pppm/omp} value = accuracy
    accuracy = desired relative error in forces
  {pppm/cg/omp} value = accuracy
    accuracy = desired relative error in forces
+  {pppm/disp/intel} value = accuracy
+    accuracy = desired relative error in forces
  {pppm/tip4p/omp} value = accuracy
    accuracy = desired relative error in forces
  {pppm/stagger} value = accuracy
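For illustration, the new Intel K-space styles are requested like the other accelerated variants (the accuracy value is a placeholder):

kspace_style pppm/intel 1.0e-4
kspace_style pppm/disp/intel 1.0e-4 :pre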
@@ -7,6 +7,7 @@
:line

pair_style lj/long/coul/long command :h3
+pair_style lj/long/coul/long/intel command :h3
pair_style lj/long/coul/long/omp command :h3
pair_style lj/long/coul/long/opt command :h3
pair_style lj/long/tip4p/long command :h3
@@ -51,7 +51,7 @@ set group nebatoms type 3
group nonneb subtract all nebatoms

fix 1 lower setforce 0.0 0.0 0.0
-fix 2 nebatoms neb 1.0 nudg_style idealpos
+fix 2 nebatoms neb 1.0 #nudge ideal
fix 3 all enforce2d

thermo 100
@@ -15,7 +15,7 @@ variable u uloop 20
lattice hex 0.9
region box block 0 20 0 10 -0.25 0.25

-read_data initial.hop1freeend
+read_data initial.hop1.end

# LJ potentials

@@ -41,7 +41,7 @@ set group nebatoms type 3
group nonneb subtract all nebatoms

fix 1 lower setforce 0.0 0.0 0.0
-fix 2 nebatoms neb 1.0 nudg_style idealpos freeend ini
+fix 2 nebatoms neb 1.0 nudge ideal end first 1.0
fix 3 all enforce2d

thermo 100
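A sketch of how such a multi-replica input is typically launched (the executable name lmp_mpi and the 8x1 partition layout are assumptions):

mpirun -np 8 ./lmp_mpi -partition 8x1 -in in.neb.hop1.end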
@@ -8,7 +8,7 @@ SHELL = /bin/sh

CC = mpiicpc
OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
+CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
          -fno-alias -ansi-alias -restrict $(OPTFLAGS)
SHFLAGS = -fPIC
DEPFLAGS = -M
@@ -8,7 +8,7 @@ SHELL = /bin/sh

CC = mpiicpc
MIC_OPT = -qoffload-arch=mic-avx512 -fp-model fast=2
-CCFLAGS = -g -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
+CCFLAGS = -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
          -xHost -fno-alias -ansi-alias -restrict \
          -qoverride-limits $(MIC_OPT)
SHFLAGS = -fPIC
@@ -8,7 +8,7 @@ SHELL = /bin/sh

CC = mpiicpc
OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
+CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
          -fno-alias -ansi-alias -restrict $(OPTFLAGS)
SHFLAGS = -fPIC
DEPFLAGS = -M
@@ -34,6 +34,9 @@ using namespace FixConst;
using namespace MathConst;

enum{SINGLE_PROC_DIRECT,SINGLE_PROC_MAP,MULTI_PROC};

+#define BUFSIZE 8
+
/* ---------------------------------------------------------------------- */

FixNEB::FixNEB(LAMMPS *lmp, int narg, char **arg) :
@@ -45,56 +48,62 @@ FixNEB::FixNEB(LAMMPS *lmp, int narg, char **arg) :
  tagsendall(NULL), tagrecvall(NULL), counts(NULL),
  displacements(NULL)
{
-  NEBLongRange=false;
-  StandardNEB=true;
-  PerpSpring=FreeEndIni=FreeEndFinal=false;
-  FreeEndFinalWithRespToEIni=FinalAndInterWithRespToEIni=false;
-
-  kspringPerp=0.0;
-  kspring2=1.0;
-  if (narg < 4)
-    error->all(FLERR,"Illegal fix neb command, argument missing");
+  if (narg < 4) error->all(FLERR,"Illegal fix neb command");

  kspring = force->numeric(FLERR,arg[3]);
-  if (kspring <= 0.0)
-    error->all(FLERR,"Illegal fix neb command."
-               " The spring force was not provided properly");
+  if (kspring <= 0.0) error->all(FLERR,"Illegal fix neb command");

-  int iarg =4;
+  // optional params
+
+  NEBLongRange = false;
+  StandardNEB = true;
+  PerpSpring = FreeEndIni = FreeEndFinal = false;
+  FreeEndFinalWithRespToEIni = FinalAndInterWithRespToEIni = false;
+  kspringPerp = 0.0;
+  kspring2 = 1.0;
+
+  int iarg = 4;
  while (iarg < narg) {
-    if (strcmp (arg[iarg],"nudg_style")==0) {
-      if (strcmp (arg[iarg+1],"idealpos")==0) {
-        NEBLongRange = true;
-        iarg+=2;}
-      else if (strcmp (arg[iarg+1],"neigh")==0) {
-        NEBLongRange = false;
-        StandardNEB = true;
-        iarg+=2;}
-      else error->all(FLERR,"Illegal fix neb command. Unknown keyword");}
-    else if (strcmp (arg[iarg],"perp")==0) {
-      PerpSpring=true;
-      kspringPerp = force->numeric(FLERR,arg[iarg+1]);
-      if (kspringPerp < 0.0)
-        error->all(FLERR,"Illegal fix neb command. "
-                   "The perpendicular spring force was not provided properly");
-      iarg+=2;}
-    else if (strcmp (arg[iarg],"freeend")==0) {
-      if (strcmp (arg[iarg+1],"ini")==0)
-        FreeEndIni=true;
-      else if (strcmp (arg[iarg+1],"final")==0)
-        FreeEndFinal=true;
-      else if (strcmp (arg[iarg+1],"finaleini")==0)
-        FreeEndFinalWithRespToEIni=true;
-      else if (strcmp (arg[iarg+1],"final2eini")==0) {
-        FinalAndInterWithRespToEIni=true;
-        FreeEndFinalWithRespToEIni=true;}
-      else if (strcmp (arg[iarg+1],"none")!=0) error->all(FLERR,"Illegal fix neb command. Unknown keyword");
-      iarg+=2;}
-    else if (strcmp (arg[iarg],"freeend_kspring")==0) {
-      kspring2=force->numeric(FLERR,arg[iarg+1]);
-      iarg+=2; }
-    else error->all(FLERR,"Illegal fix neb command. Unknown keyword");
+    if (strcmp(arg[iarg],"nudge") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal fix neb command");
+      if (strcmp(arg[iarg+1],"ideal") == 0) {
+        NEBLongRange = true;
+        StandardNEB = false;
+      } else if (strcmp(arg[iarg+1],"neigh") == 0) {
+        NEBLongRange = false;
+        StandardNEB = true;
+      } else error->all(FLERR,"Illegal fix neb command");
+      iarg += 2;
+
+    } else if (strcmp(arg[iarg],"perp") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal fix neb command");
+      PerpSpring = true;
+      kspringPerp = force->numeric(FLERR,arg[iarg+1]);
+      if (kspringPerp == 0.0) PerpSpring = false;
+      if (kspringPerp < 0.0) error->all(FLERR,"Illegal fix neb command");
+      iarg += 2;
+
+    } else if (strcmp (arg[iarg],"end") == 0) {
+      if (iarg+3 > narg) error->all(FLERR,"Illegal fix neb command");
+      if (strcmp(arg[iarg+1],"first") == 0) {
+        FreeEndIni = true;
+      } else if (strcmp(arg[iarg+1],"last") == 0) {
+        FreeEndFinal = true;
+        FinalAndInterWithRespToEIni = false;
+        FreeEndFinalWithRespToEIni = false;
+      } else if (strcmp(arg[iarg+1],"last/efirst") == 0) {
+        FreeEndFinal = false;
+        FinalAndInterWithRespToEIni = false;
+        FreeEndFinalWithRespToEIni = true;
+      } else if (strcmp(arg[iarg+1],"last/efirst/middle") == 0) {
+        FreeEndFinal = false;
+        FinalAndInterWithRespToEIni = true;
+        FreeEndFinalWithRespToEIni = true;
+      } else error->all(FLERR,"Illegal fix neb command");
+      kspring2 = force->numeric(FLERR,arg[iarg+2]);
+      iarg += 3;
+
+    } else error->all(FLERR,"Illegal fix neb command");
  }

  // nreplica = number of partitions
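As a usage sketch, the fix neb commands accepted by this rewritten parser look like the following (numeric values are illustrative):

fix 1 all neb 1.0
fix 1 all neb 1.0 nudge ideal
fix 1 all neb 1.0 perp 0.5 end last/efirst/middle 1.0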
@@ -119,12 +128,12 @@ FixNEB::FixNEB(LAMMPS *lmp, int narg, char **arg) :
  MPI_Group uworldgroup,rootgroup;
  if (NEBLongRange) {
    for (int i=0; i<nreplica; i++)
-      iroots[i]=universe->root_proc[i];
+      iroots[i] = universe->root_proc[i];
    MPI_Comm_group(uworld, &uworldgroup);
    MPI_Group_incl(uworldgroup, nreplica, iroots, &rootgroup);
    MPI_Comm_create(uworld, rootgroup, &rootworld);
  }
-  delete[] iroots;
+  delete [] iroots;

  // create a new compute pe style
  // id = fix-ID + pe, compute group = all
@@ -256,11 +265,11 @@ void FixNEB::min_post_force(int vflag)
  double delxp,delyp,delzp,delxn,delyn,delzn;
  double vIni=0.0;

-  vprev=vnext=veng=pe->compute_scalar();
+  vprev = vnext = veng = pe->compute_scalar();

-  if (ireplica < nreplica-1 && me ==0)
+  if (ireplica < nreplica-1 && me == 0)
    MPI_Send(&veng,1,MPI_DOUBLE,procnext,0,uworld);
-  if (ireplica > 0 && me ==0)
+  if (ireplica > 0 && me == 0)
    MPI_Recv(&vprev,1,MPI_DOUBLE,procprev,0,uworld,MPI_STATUS_IGNORE);

  if (ireplica > 0 && me == 0)
@@ -297,6 +306,7 @@ void FixNEB::min_post_force(int vflag)
  }

  // communicate atoms to/from adjacent replicas to fill xprev,xnext

  inter_replica_comm();

  // trigger potential energy computation on next timestep
@@ -335,10 +345,10 @@ void FixNEB::min_post_force(int vflag)
        tangent[i][0]=delxp;
        tangent[i][1]=delyp;
        tangent[i][2]=delzp;
-        tlen += tangent[i][0]*tangent[i][0]
-          + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
-        dot += f[i][0]*tangent[i][0]
-          + f[i][1]*tangent[i][1] + f[i][2]*tangent[i][2];
+        tlen += tangent[i][0]*tangent[i][0] +
+          tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
+        dot += f[i][0]*tangent[i][0] + f[i][1]*tangent[i][1] +
+          f[i][2]*tangent[i][2];
      }
    }
@@ -360,10 +370,10 @@ void FixNEB::min_post_force(int vflag)
        tangent[i][0]=delxn;
        tangent[i][1]=delyn;
        tangent[i][2]=delzn;
-        tlen += tangent[i][0]*tangent[i][0]
-          + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
-        dot += f[i][0]*tangent[i][0]
-          + f[i][1]*tangent[i][1] + f[i][2]*tangent[i][2];
+        tlen += tangent[i][0]*tangent[i][0] +
+          tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
+        dot += f[i][0]*tangent[i][0] + f[i][1]*tangent[i][1] +
+          f[i][2]*tangent[i][2];
      }
    }
  } else {
@@ -388,13 +398,13 @@ void FixNEB::min_post_force(int vflag)
      domain->minimum_image(delxn,delyn,delzn);

      if (vnext > veng && veng > vprev) {
-        tangent[i][0]=delxn;
-        tangent[i][1]=delyn;
-        tangent[i][2]=delzn;
+        tangent[i][0] = delxn;
+        tangent[i][1] = delyn;
+        tangent[i][2] = delzn;
      } else if (vnext < veng && veng < vprev) {
-        tangent[i][0]=delxp;
-        tangent[i][1]=delyp;
-        tangent[i][2]=delzp;
+        tangent[i][0] = delxp;
+        tangent[i][1] = delyp;
+        tangent[i][2] = delzp;
      } else {
        if (vnext > vprev) {
          tangent[i][0] = vmax*delxn + vmin*delxp;
@@ -408,24 +418,23 @@ void FixNEB::min_post_force(int vflag)
      }

      nlen += delxn*delxn + delyn*delyn + delzn*delzn;
-      tlen += tangent[i][0]*tangent[i][0]
-        + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
+      tlen += tangent[i][0]*tangent[i][0] +
+        tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
      gradlen += f[i][0]*f[i][0] + f[i][1]*f[i][1] + f[i][2]*f[i][2];
      dotpath += delxp*delxn + delyp*delyn + delzp*delzn;
-      dottangrad += tangent[i][0]* f[i][0]
-        + tangent[i][1]*f[i][1] + tangent[i][2]*f[i][2];
-      gradnextlen += fnext[i][0]*fnext[i][0]
-        + fnext[i][1]*fnext[i][1] +fnext[i][2] * fnext[i][2];
-      dotgrad += f[i][0]*fnext[i][0]
-        + f[i][1]*fnext[i][1] + f[i][2]*fnext[i][2];
+      dottangrad += tangent[i][0]*f[i][0] +
+        tangent[i][1]*f[i][1] + tangent[i][2]*f[i][2];
+      gradnextlen += fnext[i][0]*fnext[i][0] +
+        fnext[i][1]*fnext[i][1] +fnext[i][2] * fnext[i][2];
+      dotgrad += f[i][0]*fnext[i][0] + f[i][1]*fnext[i][1] +
+        f[i][2]*fnext[i][2];

-      springF[i][0]=kspringPerp*(delxn-delxp);
-      springF[i][1]=kspringPerp*(delyn-delyp);
-      springF[i][2]=kspringPerp*(delzn-delzp);
+      springF[i][0] = kspringPerp*(delxn-delxp);
+      springF[i][1] = kspringPerp*(delyn-delyp);
+      springF[i][2] = kspringPerp*(delzn-delzp);
    }
  }

-#define BUFSIZE 8
  double bufin[BUFSIZE], bufout[BUFSIZE];
  bufin[0] = nlen;
  bufin[1] = plen;
@@ -459,7 +468,7 @@ void FixNEB::min_post_force(int vflag)

  // first or last replica has no change to forces, just return

-  if(ireplica>0 && ireplica<nreplica-1)
+  if (ireplica > 0 && ireplica < nreplica-1)
    dottangrad = dottangrad/(tlen*gradlen);
  if (ireplica == 0)
    dottangrad = dottangrad/(nlen*gradlen);
@@ -468,7 +477,6 @@ void FixNEB::min_post_force(int vflag)
  if (ireplica < nreplica-1)
    dotgrad = dotgrad /(gradlen*gradnextlen);

  if (FreeEndIni && ireplica == 0) {
    if (tlen > 0.0) {
      double dotall;
@@ -568,14 +576,15 @@ void FixNEB::min_post_force(int vflag)

  for (int i = 0; i < nlocal; i++) {
    if (mask[i] & groupbit) {
-      dot += f[i][0]*tangent[i][0]
-        + f[i][1]*tangent[i][1] + f[i][2]*tangent[i][2];
-      dotSpringTangent += springF[i][0]*tangent[i][0]
-        +springF[i][1]*tangent[i][1]+springF[i][2]*tangent[i][2];}
+      dot += f[i][0]*tangent[i][0] + f[i][1]*tangent[i][1] +
+        f[i][2]*tangent[i][2];
+      dotSpringTangent += springF[i][0]*tangent[i][0] +
+        springF[i][1]*tangent[i][1] + springF[i][2]*tangent[i][2];}
  }

  double dotSpringTangentall;
-  MPI_Allreduce(&dotSpringTangent,&dotSpringTangentall,1,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(&dotSpringTangent,&dotSpringTangentall,1,
+                MPI_DOUBLE,MPI_SUM,world);
  dotSpringTangent=dotSpringTangentall;
  double dotall;
  MPI_Allreduce(&dot,&dotall,1,MPI_DOUBLE,MPI_SUM,world);
@@ -603,12 +612,12 @@ void FixNEB::min_post_force(int vflag)

  for (int i = 0; i < nlocal; i++)
    if (mask[i] & groupbit) {
-      f[i][0] += prefactor*tangent[i][0]
-        +AngularContr*(springF[i][0] -dotSpringTangent*tangent[i][0]);
-      f[i][1] += prefactor*tangent[i][1]
-        + AngularContr*(springF[i][1] - dotSpringTangent*tangent[i][1]);
-      f[i][2] += prefactor*tangent[i][2]
-        + AngularContr*(springF[i][2] - dotSpringTangent*tangent[i][2]);
+      f[i][0] += prefactor*tangent[i][0] +
+        AngularContr*(springF[i][0] - dotSpringTangent*tangent[i][0]);
+      f[i][1] += prefactor*tangent[i][1] +
+        AngularContr*(springF[i][1] - dotSpringTangent*tangent[i][1]);
+      f[i][2] += prefactor*tangent[i][2] +
+        AngularContr*(springF[i][2] - dotSpringTangent*tangent[i][2]);
    }
  }
@@ -827,7 +836,6 @@ void FixNEB::inter_replica_comm()
    }
  }

/* ----------------------------------------------------------------------
  reallocate xprev,xnext,tangent arrays if necessary
  reallocate communication arrays if necessary
@@ -4,6 +4,7 @@
--------------------------------

W. Michael Brown (Intel) michael.w.brown at intel.com
+William McDoniel (RWTH Aachen University)
Rodrigo Canales (RWTH Aachen University)
Markus Höhnerbach (RWTH Aachen University)
Stan Moore (Sandia)
@@ -14,15 +15,25 @@

-----------------------------------------------------------------------------

-This package is based on the USER-OMP package and provides LAMMPS styles that:
+This package provides LAMMPS styles that:

1. include support for single and mixed precision in addition to double.
2. include modifications to support vectorization for key routines
+3. include modifications for data layouts to improve cache efficiency
4. include modifications to support offload to Intel(R) Xeon Phi(TM)
   coprocessors

-----------------------------------------------------------------------------

+For Intel server processors codenamed "Skylake", the following flags should
+be added or changed in the Makefile depending on the version:
+
+2017 update 2 - No changes needed
+2017 updates 3 or 4 - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
+2018 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high
+
+-----------------------------------------------------------------------------
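An illustrative sketch of applying the Skylake guidance above to the OPTFLAGS line of Makefile.intel_cpu_intelmpi shown earlier (which compiler versions need which flags is as listed above):

# Intel Parallel Studio 2017 update 3 or 4:
OPTFLAGS = -xCOMMON-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
# Intel compilers 2018 or newer:
OPTFLAGS = -xCORE-AVX512 -qopt-zmm-usage=high -O2 -fp-model fast=2 -no-prec-div -qoverride-limits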
When using the suffix command with "intel", intel styles will be used if they
exist. If the suffix command is used with "hybrid intel omp", USER-OMP
styles will be used whenever USER-INTEL styles are not available. This
@@ -4,6 +4,7 @@
# in.intel.lj - Atomic fluid (LJ Benchmark)
# in.intel.rhodo - Protein (Rhodopsin Benchmark)
# in.intel.lc - Liquid Crystal w/ Gay-Berne potential
+# in.intel.eam - Copper benchmark with Embedded Atom Method
# in.intel.sw - Silicon benchmark with Stillinger-Weber
# in.intel.tersoff - Silicon benchmark with Tersoff
# in.intel.water - Coarse-grain water benchmark using Stillinger-Weber
@@ -11,19 +12,26 @@
#############################################################################

#############################################################################
-# Expected Timesteps/second with turbo on and HT enabled, LAMMPS 18-Jun-2016
+# Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017
+# - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi
#
#                     Xeon E5-2697v4    Xeon Phi 7250
#
-# in.intel.lj -       162.764           179.148
-# in.intel.rhodo -    11.633            13.668
-# in.intel.lc -       19.136            24.863
-# in.intel.sw -       139.048           152.026
-# in.intel.tersoff -  82.663            92.985
-# in.intel.water -    59.838            85.704
+# in.intel.lj -       199.5             282.3
+# in.intel.rhodo -    12.4              17.5
+# in.intel.lc -       19.0              25.7
+# in.intel.eam -      59.4              92.8
+# in.intel.sw -       132.4             161.9
+# in.intel.tersoff -  83.3              101.1
+# in.intel.water -    53.4              90.3
#
#############################################################################

+#############################################################################
+# For Skylake server (Xeon) architectures, see notes in the USER-INTEL/README
+# for build flags that should be used.
+#############################################################################
+
#############################################################################
# For Haswell (Xeon v3) architectures, depending on the compiler version,
# it may give better performance to compile for an AVX target (with -xAVX
@ -42,7 +50,18 @@
|
|||||||
# -v m 0.5 # Run for half as long
|
# -v m 0.5 # Run for half as long
|
||||||
#############################################################################
|
#############################################################################
|
||||||
|
|
||||||
# Example for running benchmarks:
|
#############################################################################
|
||||||
|
# The LAMMPS newton setting can be controlled from the commandline for the
|
||||||
|
# benchmarks with the N variable:
|
||||||
|
#
|
||||||
|
# -v N on # newton on
|
||||||
|
# -v N off # newton off
|
||||||
|
#
|
||||||
|
# The default is on for all of the benchmarks except for LJ where the off
|
||||||
|
# setting performs best with the USER-INTEL package
|
||||||
|
#############################################################################
|
||||||
|
|
||||||
|
# Example for running benchmarks (see run_benchmarks.sh for script):
|
||||||
|
|
||||||
# Number of physical cores per node not including hyperthreads
|
# Number of physical cores per node not including hyperthreads
|
||||||
export LMP_CORES=28
|
export LMP_CORES=28
|
||||||
@ -57,26 +76,35 @@ export LMP_BIN=../../lmp_intel_cpu
|
|||||||
# LAMMPS root directory
|
# LAMMPS root directory
|
||||||
export LMP_ROOT=../../../
|
export LMP_ROOT=../../../
|
||||||
|
|
||||||
source /opt/intel/parallel_studio_xe_2016.2.062/psxevars.sh
|
source source /opt/intel/parallel_studio_xe_2017.2.050/psxevars.sh
|
||||||
|
export KMP_BLOCKTIME=0
|
||||||
export I_MPI_PIN_DOMAIN=core
|
export I_MPI_PIN_DOMAIN=core
|
||||||
export I_MPI_FABRICS=shm # For single node
|
export I_MPI_FABRICS=shm # For single node
|
||||||
|
|
||||||
|
# ONLY FOR INTEL XEON PHI x200 SERIES PROCESSORS
|
||||||
|
export I_MPI_SHM_LMT=shm
|
||||||
|
|
||||||
# Generate the restart file for use with liquid crystal benchmark
|
# Generate the restart file for use with liquid crystal benchmark
|
||||||
mpirun -np $LMP_CORES $LMP_BIN -in in.lc_generate_restart -log none
|
mpirun -np $LMP_CORES $LMP_BIN -in in.lc_generate_restart -log none
|
||||||
|
|
||||||
# Benchmark to run
|
# Benchmark to run
|
||||||
export bench=in.intel.lj
|
export bench=in.intel.lj
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
# For Intel Xeon Phi x200 series processors best performance is achieved by
|
||||||
|
# using MCDRAM. In flat mode, this can be achieved with numactl,
|
||||||
|
# MPI environment variables, or other options provided by batch schedulers
|
||||||
|
#############################################################################
|
||||||
|
|
||||||
#############################################################################
|
#############################################################################
|
||||||
# To run without a optimization package
|
# To run without a optimization package
|
||||||
#############################################################################
|
#############################################################################
|
||||||
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none
|
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -v N on
|
||||||
|
|
||||||
#############################################################################
|
#############################################################################
|
||||||
# To run with USER-OMP package
|
# To run with USER-OMP package
|
||||||
#############################################################################
|
#############################################################################
|
||||||
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk omp 0 -sf omp
|
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk omp 0 -sf omp -v N on
|
||||||
|
|
||||||
#############################################################################
|
#############################################################################
|
||||||
# To run with USER-INTEL package and no coprocessor
|
# To run with USER-INTEL package and no coprocessor
|
||||||
@ -89,6 +117,9 @@ mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 -sf intel
|
|||||||
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 1 -sf intel
|
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 1 -sf intel
|
||||||
|
|
||||||
#############################################################################
|
#############################################################################
|
||||||
# If using PPPM (in.intel.rhodo) on Intel Xeon Phi x200 series processors
|
# If using PPPM (e.g. in.intel.rhodo) on Intel Xeon Phi x200 series
|
||||||
|
# or Skylake processors
|
||||||
#############################################################################
|
#############################################################################
|
||||||
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 omp 3 lrt yes -sf intel
|
export KMP_AFFINITY=none
|
||||||
|
rthreads=$((OMP_NUM_THREADS-1))
|
||||||
|
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 omp $rthreads lrt yes -sf intel
|
||||||
|
|||||||
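As an example of the MCDRAM note above (a sketch; on most flat-mode Xeon Phi x200 nodes MCDRAM is exposed as NUMA node 1, but check your system with "numactl -H"):

  # bind allocations preferentially to MCDRAM, then launch as usual
  numactl -p 1 mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none \
          -pk intel 0 -sf intel -v N on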
@ -1,4 +1,6 @@
 # bulk Cu lattice
+
+variable N index on             # Newton Setting
 variable w index 10             # Warmup Timesteps
 variable t index 3100           # Main Run Timesteps
 variable m index 1              # Main Run Timestep Multiplier
@ -13,6 +15,7 @@ variable z index 2
 variable rr equal floor($t*$m)
 variable root getenv LMP_ROOT
 
+newton $N
 if "$n > 0" then "processors * * * grid numa"
 
 variable xx equal 20*$x
@ -3,6 +3,7 @@
 # shape: 2 1.5 1
 # cutoff 4.0 with skin 0.8
 
+variable N index on             # Newton Setting
 variable w index 10             # Warmup Timesteps
 variable t index 840            # Main Run Timesteps
 variable m index 1              # Main Run Timestep Multiplier
@ -15,6 +16,7 @@ variable z index 2
 
 variable rr equal floor($t*$m)
 
+newton $N
 if "$n > 0" then "processors * * * grid numa"
 
 units lj
@ -1,5 +1,6 @@
 # 3d Lennard-Jones melt
 
+variable N index off            # Newton Setting
 variable w index 10             # Warmup Timesteps
 variable t index 7900           # Main Run Timesteps
 variable m index 1              # Main Run Timestep Multiplier
@ -15,6 +16,7 @@ variable yy equal 20*$y
 variable zz equal 20*$z
 variable rr equal floor($t*$m)
 
+newton $N
 if "$n > 0" then "processors * * * grid numa"
 
 units lj
@ -1,5 +1,6 @@
 # Rhodopsin model
 
+variable N index on             # Newton Setting
 variable w index 10             # Warmup Timesteps
 variable t index 520            # Main Run Timesteps
 variable m index 1              # Main Run Timestep Multiplier
@ -16,10 +17,11 @@ variable z index 2
 variable rr equal floor($t*$m)
 variable root getenv LMP_ROOT
 
+newton $N
 if "$n > 0" then "processors * * * grid numa"
 
 units real
-neigh_modify delay 5 every 1 binsize $b
+neigh_modify delay 5 every 1
 
 atom_style full
 bond_style harmonic
@ -1,5 +1,6 @@
 # bulk Si via Stillinger-Weber
 
+variable N index on             # Newton Setting
 variable w index 10             # Warmup Timesteps
 variable t index 6200           # Main Run Timesteps
 variable m index 1              # Main Run Timestep Multiplier
@ -16,6 +17,7 @@ variable zz equal 10*$z
 variable rr equal floor($t*$m)
 variable root getenv LMP_ROOT
 
+newton $N
 if "$n > 0" then "processors * * * grid numa"
 
 units metal
@ -1,5 +1,6 @@
 # bulk Si via Tersoff
 
+variable N index on             # Newton Setting
 variable w index 10             # Warmup Timesteps
 variable t index 2420           # Main Run Timesteps
 variable m index 1              # Main Run Timestep Multiplier
@ -16,6 +17,7 @@ variable zz equal 10*$z
 variable rr equal floor($t*$m)
 variable root getenv LMP_ROOT
 
+newton $N
 if "$n > 0" then "processors * * * grid numa"
 
 units metal
@ -1,5 +1,6 @@
 # Coarse-grain water simulation using Stillinger-Weber
 
+variable N index on             # Newton Setting
 variable w index 10             # Warmup Timesteps
 variable t index 2600           # Main Run Timesteps
 variable m index 1              # Main Run Timestep Multiplier
@ -11,6 +12,7 @@ variable y index 2
 variable z index 2
 variable rr equal floor($t*$m)
 
+newton $N
 if "$n > 0" then "processors * * * grid numa"
 
 units real
@ -4,13 +4,13 @@
 # cutoff 4.0 with skin 0.8
 # NPT, T=2.4, P=8.0
 
-variable x index 1
-variable y index 1
-variable z index 1
+variable xt index 1
+variable yt index 1
+variable zt index 1
 
-variable i equal $x*32
-variable j equal $y*32
-variable k equal $z*32
+variable i equal ${xt}*32
+variable j equal ${yt}*32
+variable k equal ${zt}*32
 
 units lj
 atom_style ellipsoid
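Each benchmark script above now reads its newton setting from the N index variable, so it can be toggled per run from the command line without editing the script (a sketch, using the -v switches documented in the README above):

  # newton off, half-length run of the LJ benchmark
  mpirun -np $LMP_CORES $LMP_BIN -in in.intel.lj -log none \
         -pk intel 0 -sf intel -v N off -v m 0.5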
src/USER-INTEL/TEST/run_benchmarks.sh (new executable file, 86 lines)
@ -0,0 +1,86 @@
#!/bin/bash

#########################################################################
# Adjust settings below for your system
#########################################################################

# --------------------- MPI Launch Command

export MPI="mpirun"
#export MPI="numactl -p 1 mpirun"    # -- Systems w/ MCDRAM in flat mode

# ------------- Name and location of the LAMMPS binary

export LMP_BIN=../../lmp_intel_cpu_intelmpi
#export LMP_BIN=../../lmp_knl

# ------------- Directory containing the LAMMPS installation

export LMP_ROOT=../../../

# ------------- Number of physical cores (not HW threads)

export LMP_CORES=36                  # -- For Intel Xeon E5-2697v4 SKU
#export LMP_CORES=68                 # -- For Intel Xeon Phi x200 7250 SKU

# ------------- Number of HW threads to use in tests

export LMP_THREAD_LIST="2"           # -- For 2 threads per core w/ HT enabled
#export LMP_THREAD_LIST="2 4"        # -- For 2 threads per core w/ HT enabled

# ------------- MPI Tuning Parameters

#export I_MPI_SHM_LMT=shm            # -- Uncomment for Xeon Phi x200 series

# ------------- Library locations for build

#source /opt/intel/parallel_studio_xe_2017.2.050/psxevars.sh

#########################################################################
# End settings for your system
#########################################################################

export WORKLOADS="lj rhodo rhodo_lrt lc sw water eam"
export LMP_ARGS="-pk intel 0 -sf intel -screen none -v d 1"
export RLMP_ARGS="-pk intel 0 lrt yes -sf intel -screen none -v d 1"

export LOG_DIR_HEADER=`echo $LMP_BIN | sed 's/\.\.\///g' | sed 's/\.\///g'`
export LOG_DIR_HOST=`hostname`
export DATE_STRING=`date +%s`
export LOG_DIR=$LOG_DIR_HOST"_"$LOG_DIR_HEADER"_"$DATE_STRING
mkdir $LOG_DIR

export I_MPI_PIN_DOMAIN=core
export I_MPI_FABRICS=shm
export KMP_BLOCKTIME=0

echo -n "Creating restart file...."
$MPI -np $LMP_CORES $LMP_BIN -in in.lc_generate_restart -log none $LMP_ARGS
echo "Done."
for threads in $LMP_THREAD_LIST
do
  export OMP_NUM_THREADS=$threads
  for workload in $WORKLOADS
  do
    export LOGFILE=$LOG_DIR/$workload.$LMP_CORES"c"$threads"t".log
    echo "Running $LOGFILE"
    cmd="$MPI -np $LMP_CORES $LMP_BIN -in in.intel.$workload -log $LOGFILE $LMP_ARGS";
    rthreads=$threads
    unset KMP_AFFINITY
    $cmd

    # - For benchmarks with PPPM, also try LRT mode
    if [ $workload = "rhodo" ]; then
      export LOGFILE=$LOG_DIR/$workload"_lrt".$LMP_CORES"c"$threads"t".log
      cmd="$MPI -np $LMP_CORES $LMP_BIN -in in.intel.$workload -log $LOGFILE $RLMP_ARGS";
      rthreads=$((threads-1))
      export KMP_AFFINITY=none
      export OMP_NUM_THREADS=$rthreads
      echo " $cmd" >> $LOG_DIR/commands.info
      $cmd
    fi
  done
done

# Performance reported by LAMMPS (Timesteps/second ignoring warm-up run)
grep Perf $LOG_DIR/*.log | awk 'BEGIN{n=1}n%2==0{print $0}{n++}' | sed 's/\/day//g' | sed 's/steps\/s/steps_s/g' | sed 's/hours\/ns//g' | sed 's/.*\///g' | sed 's/\.log:Performance://g' | awk '{c=NF-1; print $1,$c}'
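A typical invocation of the script above (a sketch; the log directory name is assembled from the hostname, the binary name, and a timestamp exactly as coded in the script):

  cd src/USER-INTEL/TEST
  ./run_benchmarks.sh
  # per-run logs land in <hostname>_lmp_intel_cpu_intelmpi_<epoch>/ as
  # <workload>.<cores>c<threads>t.log; the final grep prints timesteps/second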
@ -81,16 +81,16 @@ void AngleCharmmIntel::compute(int eflag, int vflag,
   else evflag = 0;
 
   if (evflag) {
-    if (eflag) {
+    if (vflag && !eflag) {
+      if (force->newton_bond)
+        eval<0,1,1>(vflag, buffers, fc);
+      else
+        eval<0,1,0>(vflag, buffers, fc);
+    } else {
       if (force->newton_bond)
         eval<1,1,1>(vflag, buffers, fc);
       else
         eval<1,1,0>(vflag, buffers, fc);
-    } else {
-      if (force->newton_bond)
-        eval<1,0,1>(vflag, buffers, fc);
-      else
-        eval<1,0,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@ -102,7 +102,7 @@ void AngleCharmmIntel::compute(int eflag, int vflag,
 
 /* ---------------------------------------------------------------------- */
 
-template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void AngleCharmmIntel::eval(const int vflag,
                             IntelBuffers<flt_t,acc_t> *buffers,
                             const ForceConst<flt_t> &fc)
@ -126,12 +126,9 @@ void AngleCharmmIntel::eval(const int vflag,
   const int nthreads = tc;
 
   acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5;
-  if (EVFLAG) {
-    if (EFLAG)
-      oeangle = (acc_t)0.0;
-    if (vflag) {
-      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
-    }
+  if (EFLAG) oeangle = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
 #if defined(_OPENMP)
@ -140,8 +137,12 @@ void AngleCharmmIntel::eval(const int vflag,
     reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
 #endif
   {
-    int nfrom, nto, tid;
+    int nfrom, npl, nto, tid;
+#ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+#else
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+#endif
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
@ -150,7 +151,17 @@ void AngleCharmmIntel::eval(const int vflag,
     const int4_t * _noalias const anglelist =
       (int4_t *) neighbor->anglelist[0];
 
-    for (int n = nfrom; n < nto; n++) {
+#ifdef LMP_INTEL_USE_SIMDOFF
+    acc_t seangle, sv0, sv1, sv2, sv3, sv4, sv5;
+    if (EFLAG) seangle = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
+    }
+#pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
+    for (int n = nfrom; n < nto; n ++) {
+#else
+    for (int n = nfrom; n < nto; n += npl) {
+#endif
       const int i1 = anglelist[n].a;
       const int i2 = anglelist[n].b;
       const int i3 = anglelist[n].c;
@ -229,40 +240,58 @@ void AngleCharmmIntel::eval(const int vflag,
 
       // apply force to each of 3 atoms
 
-      if (NEWTON_BOND || i1 < nlocal) {
-        f[i1].x += f1x;
-        f[i1].y += f1y;
-        f[i1].z += f1z;
-      }
-
-      if (NEWTON_BOND || i2 < nlocal) {
-        f[i2].x -= f1x + f3x;
-        f[i2].y -= f1y + f3y;
-        f[i2].z -= f1z + f3z;
-      }
-
-      if (NEWTON_BOND || i3 < nlocal) {
-        f[i3].x += f3x;
-        f[i3].y += f3y;
-        f[i3].z += f3z;
-      }
-
-      if (EVFLAG) {
-        IP_PRE_ev_tally_angle(EFLAG, eatom, vflag, eangle, i1, i2, i3,f1x,
-                              f1y, f1z, f3x, f3y, f3z, delx1, dely1, delz1,
-                              delx2, dely2, delz2, oeangle, f, NEWTON_BOND,
-                              nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
+#ifdef LMP_INTEL_USE_SIMDOFF
+#pragma simdoff
+#endif
+      {
+        if (NEWTON_BOND || i1 < nlocal) {
+          f[i1].x += f1x;
+          f[i1].y += f1y;
+          f[i1].z += f1z;
+        }
+
+        if (NEWTON_BOND || i2 < nlocal) {
+          f[i2].x -= f1x + f3x;
+          f[i2].y -= f1y + f3y;
+          f[i2].z -= f1z + f3z;
+        }
+
+        if (NEWTON_BOND || i3 < nlocal) {
+          f[i3].x += f3x;
+          f[i3].y += f3y;
+          f[i3].z += f3z;
+        }
+      }
+
+      if (EFLAG || VFLAG) {
+#ifdef LMP_INTEL_USE_SIMDOFF
+        IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2,
+                              i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1,
+                              dely1, delz1, delx2, dely2, delz2, seangle,
+                              f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3,
+                              sv4, sv5);
+#else
+        IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2,
+                              i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1,
+                              dely1, delz1, delx2, dely2, delz2, oeangle,
+                              f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3,
+                              ov4, ov5);
+#endif
       }
     } // for n
+#ifdef LMP_INTEL_USE_SIMDOFF
+    if (EFLAG) oeangle += seangle;
+    if (VFLAG && vflag) {
+      ov0 += sv0; ov1 += sv1; ov2 += sv2;
+      ov3 += sv3; ov4 += sv4; ov5 += sv5;
+    }
+#endif
   } // omp parallel
 
-  if (EVFLAG) {
-    if (EFLAG)
-      energy += oeangle;
-    if (vflag) {
-      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
-    }
+  if (EFLAG) energy += oeangle;
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   fix->set_reduce_flag();
@ -81,16 +81,16 @@ void AngleHarmonicIntel::compute(int eflag, int vflag,
|
|||||||
else evflag = 0;
|
else evflag = 0;
|
||||||
|
|
||||||
if (evflag) {
|
if (evflag) {
|
||||||
if (eflag) {
|
if (vflag && !eflag) {
|
||||||
|
if (force->newton_bond)
|
||||||
|
eval<0,1,1>(vflag, buffers, fc);
|
||||||
|
else
|
||||||
|
eval<0,1,0>(vflag, buffers, fc);
|
||||||
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<1,1,1>(vflag, buffers, fc);
|
eval<1,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<1,1,0>(vflag, buffers, fc);
|
eval<1,1,0>(vflag, buffers, fc);
|
||||||
} else {
|
|
||||||
if (force->newton_bond)
|
|
||||||
eval<1,0,1>(vflag, buffers, fc);
|
|
||||||
else
|
|
||||||
eval<1,0,0>(vflag, buffers, fc);
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
@ -102,7 +102,7 @@ void AngleHarmonicIntel::compute(int eflag, int vflag,
|
|||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void AngleHarmonicIntel::eval(const int vflag,
|
void AngleHarmonicIntel::eval(const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
@ -126,12 +126,9 @@ void AngleHarmonicIntel::eval(const int vflag,
|
|||||||
const int nthreads = tc;
|
const int nthreads = tc;
|
||||||
|
|
||||||
acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EVFLAG) {
|
if (EFLAG) oeangle = (acc_t)0.0;
|
||||||
if (EFLAG)
|
if (VFLAG && vflag) {
|
||||||
oeangle = (acc_t)0.0;
|
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||||
if (vflag) {
|
|
||||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
@ -140,8 +137,12 @@ void AngleHarmonicIntel::eval(const int vflag,
|
|||||||
reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
|
reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int nfrom, nto, tid;
|
int nfrom, npl, nto, tid;
|
||||||
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||||
|
#else
|
||||||
|
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
|
||||||
|
#endif
|
||||||
|
|
||||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||||
if (fix->need_zero(tid))
|
if (fix->need_zero(tid))
|
||||||
@ -150,7 +151,17 @@ void AngleHarmonicIntel::eval(const int vflag,
|
|||||||
const int4_t * _noalias const anglelist =
|
const int4_t * _noalias const anglelist =
|
||||||
(int4_t *) neighbor->anglelist[0];
|
(int4_t *) neighbor->anglelist[0];
|
||||||
|
|
||||||
for (int n = nfrom; n < nto; n++) {
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
|
acc_t seangle, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||||
|
if (EFLAG) seangle = (acc_t)0.0;
|
||||||
|
if (VFLAG && vflag) {
|
||||||
|
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||||
|
}
|
||||||
|
#pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||||
|
for (int n = nfrom; n < nto; n ++) {
|
||||||
|
#else
|
||||||
|
for (int n = nfrom; n < nto; n += npl) {
|
||||||
|
#endif
|
||||||
const int i1 = anglelist[n].a;
|
const int i1 = anglelist[n].a;
|
||||||
const int i2 = anglelist[n].b;
|
const int i2 = anglelist[n].b;
|
||||||
const int i3 = anglelist[n].c;
|
const int i3 = anglelist[n].c;
|
||||||
@ -211,40 +222,58 @@ void AngleHarmonicIntel::eval(const int vflag,
|
|||||||
|
|
||||||
// apply force to each of 3 atoms
|
// apply force to each of 3 atoms
|
||||||
|
|
||||||
if (NEWTON_BOND || i1 < nlocal) {
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
f[i1].x += f1x;
|
#pragma simdoff
|
||||||
f[i1].y += f1y;
|
#endif
|
||||||
f[i1].z += f1z;
|
{
|
||||||
|
if (NEWTON_BOND || i1 < nlocal) {
|
||||||
|
f[i1].x += f1x;
|
||||||
|
f[i1].y += f1y;
|
||||||
|
f[i1].z += f1z;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (NEWTON_BOND || i2 < nlocal) {
|
||||||
|
f[i2].x -= f1x + f3x;
|
||||||
|
f[i2].y -= f1y + f3y;
|
||||||
|
f[i2].z -= f1z + f3z;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (NEWTON_BOND || i3 < nlocal) {
|
||||||
|
f[i3].x += f3x;
|
||||||
|
f[i3].y += f3y;
|
||||||
|
f[i3].z += f3z;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i2 < nlocal) {
|
if (EFLAG || VFLAG) {
|
||||||
f[i2].x -= f1x + f3x;
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
f[i2].y -= f1y + f3y;
|
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
|
||||||
f[i2].z -= f1z + f3z;
|
f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1,
|
||||||
}
|
delz1, delx2, dely2, delz2, seangle, f,
|
||||||
|
NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4,
|
||||||
if (NEWTON_BOND || i3 < nlocal) {
|
sv5);
|
||||||
f[i3].x += f3x;
|
#else
|
||||||
f[i3].y += f3y;
|
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
|
||||||
f[i3].z += f3z;
|
f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1,
|
||||||
}
|
delz1, delx2, dely2, delz2, oeangle, f,
|
||||||
|
NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4,
|
||||||
if (EVFLAG) {
|
ov5);
|
||||||
IP_PRE_ev_tally_angle(EFLAG, eatom, vflag, eangle, i1, i2, i3,f1x,
|
#endif
|
||||||
f1y, f1z, f3x, f3y, f3z, delx1, dely1, delz1,
|
|
||||||
delx2, dely2, delz2, oeangle, f, NEWTON_BOND,
|
|
||||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
|
||||||
}
|
}
|
||||||
} // for n
|
} // for n
|
||||||
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
|
if (EFLAG) oeangle += seangle;
|
||||||
|
if (VFLAG && vflag) {
|
||||||
|
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||||
|
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
} // omp parallel
|
} // omp parallel
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG) energy += oeangle;
|
||||||
if (EFLAG)
|
if (VFLAG && vflag) {
|
||||||
energy += oeangle;
|
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||||
if (vflag) {
|
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
|
||||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fix->set_reduce_flag();
|
fix->set_reduce_flag();
|
||||||
|
|||||||
@ -77,16 +77,16 @@ void BondFENEIntel::compute(int eflag, int vflag,
|
|||||||
else evflag = 0;
|
else evflag = 0;
|
||||||
|
|
||||||
if (evflag) {
|
if (evflag) {
|
||||||
if (eflag) {
|
if (vflag && !eflag) {
|
||||||
|
if (force->newton_bond)
|
||||||
|
eval<0,1,1>(vflag, buffers, fc);
|
||||||
|
else
|
||||||
|
eval<0,1,0>(vflag, buffers, fc);
|
||||||
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<1,1,1>(vflag, buffers, fc);
|
eval<1,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<1,1,0>(vflag, buffers, fc);
|
eval<1,1,0>(vflag, buffers, fc);
|
||||||
} else {
|
|
||||||
if (force->newton_bond)
|
|
||||||
eval<1,0,1>(vflag, buffers, fc);
|
|
||||||
else
|
|
||||||
eval<1,0,0>(vflag, buffers, fc);
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
@ -96,10 +96,10 @@ void BondFENEIntel::compute(int eflag, int vflag,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void BondFENEIntel::eval(const int vflag,
|
void BondFENEIntel::eval(const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
{
|
{
|
||||||
const int inum = neighbor->nbondlist;
|
const int inum = neighbor->nbondlist;
|
||||||
if (inum == 0) return;
|
if (inum == 0) return;
|
||||||
@ -119,23 +119,23 @@ void BondFENEIntel::eval(const int vflag,
|
|||||||
const int nthreads = tc;
|
const int nthreads = tc;
|
||||||
|
|
||||||
acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EVFLAG) {
|
if (EFLAG) oebond = (acc_t)0.0;
|
||||||
if (EFLAG)
|
if (VFLAG && vflag) {
|
||||||
oebond = (acc_t)0.0;
|
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||||
if (vflag) {
|
|
||||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel default(none) \
|
||||||
shared(f_start,f_stride,fc) \
|
shared(f_start,f_stride,fc) \
|
||||||
reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
|
reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int nfrom, nto, tid;
|
int nfrom, npl, nto, tid;
|
||||||
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||||
|
#else
|
||||||
|
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
|
||||||
|
#endif
|
||||||
|
|
||||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||||
if (fix->need_zero(tid))
|
if (fix->need_zero(tid))
|
||||||
@ -144,7 +144,17 @@ void BondFENEIntel::eval(const int vflag,
|
|||||||
const int3_t * _noalias const bondlist =
|
const int3_t * _noalias const bondlist =
|
||||||
(int3_t *) neighbor->bondlist[0];
|
(int3_t *) neighbor->bondlist[0];
|
||||||
|
|
||||||
for (int n = nfrom; n < nto; n++) {
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
|
acc_t sebond, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||||
|
if (EFLAG) sebond = (acc_t)0.0;
|
||||||
|
if (VFLAG && vflag) {
|
||||||
|
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||||
|
}
|
||||||
|
#pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||||
|
for (int n = nfrom; n < nto; n ++) {
|
||||||
|
#else
|
||||||
|
for (int n = nfrom; n < nto; n += npl) {
|
||||||
|
#endif
|
||||||
const int i1 = bondlist[n].a;
|
const int i1 = bondlist[n].a;
|
||||||
const int i2 = bondlist[n].b;
|
const int i2 = bondlist[n].b;
|
||||||
const int type = bondlist[n].t;
|
const int type = bondlist[n].t;
|
||||||
@ -199,33 +209,48 @@ void BondFENEIntel::eval(const int vflag,
|
|||||||
|
|
||||||
// apply force to each of 2 atoms
|
// apply force to each of 2 atoms
|
||||||
|
|
||||||
if (NEWTON_BOND || i1 < nlocal) {
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
f[i1].x += delx*fbond;
|
#pragma simdoff
|
||||||
f[i1].y += dely*fbond;
|
#endif
|
||||||
f[i1].z += delz*fbond;
|
{
|
||||||
|
if (NEWTON_BOND || i1 < nlocal) {
|
||||||
|
f[i1].x += delx*fbond;
|
||||||
|
f[i1].y += dely*fbond;
|
||||||
|
f[i1].z += delz*fbond;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (NEWTON_BOND || i2 < nlocal) {
|
||||||
|
f[i2].x -= delx*fbond;
|
||||||
|
f[i2].y -= dely*fbond;
|
||||||
|
f[i2].z -= delz*fbond;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i2 < nlocal) {
|
if (EFLAG || VFLAG) {
|
||||||
f[i2].x -= delx*fbond;
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
f[i2].y -= dely*fbond;
|
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond,
|
||||||
f[i2].z -= delz*fbond;
|
delx, dely, delz, sebond, f, NEWTON_BOND,
|
||||||
}
|
nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
|
||||||
|
#else
|
||||||
if (EVFLAG) {
|
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond,
|
||||||
IP_PRE_ev_tally_bond(EFLAG, eatom, vflag, ebond, i1, i2, fbond,
|
|
||||||
delx, dely, delz, oebond, f, NEWTON_BOND,
|
delx, dely, delz, oebond, f, NEWTON_BOND,
|
||||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
} // for n
|
} // for n
|
||||||
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
|
if (EFLAG) oebond += sebond;
|
||||||
|
if (VFLAG && vflag) {
|
||||||
|
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||||
|
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
} // omp parallel
|
} // omp parallel
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG) energy += oebond;
|
||||||
if (EFLAG)
|
if (VFLAG && vflag) {
|
||||||
energy += oebond;
|
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||||
if (vflag) {
|
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
|
||||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fix->set_reduce_flag();
|
fix->set_reduce_flag();
|
||||||
|
|||||||
@ -77,16 +77,16 @@ void BondHarmonicIntel::compute(int eflag, int vflag,
|
|||||||
else evflag = 0;
|
else evflag = 0;
|
||||||
|
|
||||||
if (evflag) {
|
if (evflag) {
|
||||||
if (eflag) {
|
if (vflag && !eflag) {
|
||||||
|
if (force->newton_bond)
|
||||||
|
eval<0,1,1>(vflag, buffers, fc);
|
||||||
|
else
|
||||||
|
eval<0,1,0>(vflag, buffers, fc);
|
||||||
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<1,1,1>(vflag, buffers, fc);
|
eval<1,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<1,1,0>(vflag, buffers, fc);
|
eval<1,1,0>(vflag, buffers, fc);
|
||||||
} else {
|
|
||||||
if (force->newton_bond)
|
|
||||||
eval<1,0,1>(vflag, buffers, fc);
|
|
||||||
else
|
|
||||||
eval<1,0,0>(vflag, buffers, fc);
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
@ -96,7 +96,7 @@ void BondHarmonicIntel::compute(int eflag, int vflag,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void BondHarmonicIntel::eval(const int vflag,
|
void BondHarmonicIntel::eval(const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
@ -119,12 +119,9 @@ void BondHarmonicIntel::eval(const int vflag,
|
|||||||
const int nthreads = tc;
|
const int nthreads = tc;
|
||||||
|
|
||||||
acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EVFLAG) {
|
if (EFLAG) oebond = (acc_t)0.0;
|
||||||
if (EFLAG)
|
if (VFLAG && vflag) {
|
||||||
oebond = (acc_t)0.0;
|
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||||
if (vflag) {
|
|
||||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
@ -133,8 +130,12 @@ void BondHarmonicIntel::eval(const int vflag,
|
|||||||
reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
|
reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int nfrom, nto, tid;
|
int nfrom, npl, nto, tid;
|
||||||
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||||
|
#else
|
||||||
|
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
|
||||||
|
#endif
|
||||||
|
|
||||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||||
if (fix->need_zero(tid))
|
if (fix->need_zero(tid))
|
||||||
@ -143,7 +144,17 @@ void BondHarmonicIntel::eval(const int vflag,
|
|||||||
const int3_t * _noalias const bondlist =
|
const int3_t * _noalias const bondlist =
|
||||||
(int3_t *) neighbor->bondlist[0];
|
(int3_t *) neighbor->bondlist[0];
|
||||||
|
|
||||||
for (int n = nfrom; n < nto; n++) {
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
|
acc_t sebond, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||||
|
if (EFLAG) sebond = (acc_t)0.0;
|
||||||
|
if (VFLAG && vflag) {
|
||||||
|
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||||
|
}
|
||||||
|
#pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||||
|
for (int n = nfrom; n < nto; n ++) {
|
||||||
|
#else
|
||||||
|
for (int n = nfrom; n < nto; n += npl) {
|
||||||
|
#endif
|
||||||
const int i1 = bondlist[n].a;
|
const int i1 = bondlist[n].a;
|
||||||
const int i2 = bondlist[n].b;
|
const int i2 = bondlist[n].b;
|
||||||
const int type = bondlist[n].t;
|
const int type = bondlist[n].t;
|
||||||
@ -167,33 +178,50 @@ void BondHarmonicIntel::eval(const int vflag,
|
|||||||
if (EFLAG) ebond = rk*dr;
|
if (EFLAG) ebond = rk*dr;
|
||||||
|
|
||||||
// apply force to each of 2 atoms
|
// apply force to each of 2 atoms
|
||||||
if (NEWTON_BOND || i1 < nlocal) {
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
f[i1].x += delx*fbond;
|
#pragma simdoff
|
||||||
f[i1].y += dely*fbond;
|
#endif
|
||||||
f[i1].z += delz*fbond;
|
{
|
||||||
|
if (NEWTON_BOND || i1 < nlocal) {
|
||||||
|
f[i1].x += delx*fbond;
|
||||||
|
f[i1].y += dely*fbond;
|
||||||
|
f[i1].z += delz*fbond;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (NEWTON_BOND || i2 < nlocal) {
|
||||||
|
f[i2].x -= delx*fbond;
|
||||||
|
f[i2].y -= dely*fbond;
|
||||||
|
f[i2].z -= delz*fbond;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NEWTON_BOND || i2 < nlocal) {
|
if (EFLAG || VFLAG) {
|
||||||
f[i2].x -= delx*fbond;
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
f[i2].y -= dely*fbond;
|
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2,
|
||||||
f[i2].z -= delz*fbond;
|
fbond, delx, dely, delz, sebond, f,
|
||||||
}
|
NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3,
|
||||||
|
sv4, sv5);
|
||||||
if (EVFLAG) {
|
#else
|
||||||
IP_PRE_ev_tally_bond(EFLAG, eatom, vflag, ebond, i1, i2, fbond,
|
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2,
|
||||||
delx, dely, delz, oebond, f, NEWTON_BOND,
|
fbond, delx, dely, delz, oebond, f,
|
||||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3,
|
||||||
|
ov4, ov5);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
} // for n
|
} // for n
|
||||||
|
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||||
|
if (EFLAG) oebond += sebond;
|
||||||
|
if (VFLAG && vflag) {
|
||||||
|
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||||
|
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
} // omp parallel
|
} // omp parallel
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG) energy += oebond;
|
||||||
if (EFLAG)
|
if (VFLAG && vflag) {
|
||||||
energy += oebond;
|
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||||
if (vflag) {
|
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
|
||||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fix->set_reduce_flag();
|
fix->set_reduce_flag();
|
||||||
|
|||||||
@ -93,16 +93,16 @@ void DihedralCharmmIntel::compute(int eflag, int vflag,
|
|||||||
force->pair->vflag_either = force->pair->vflag_global = 1;
|
force->pair->vflag_either = force->pair->vflag_global = 1;
|
||||||
|
|
||||||
if (evflag) {
|
if (evflag) {
|
||||||
if (eflag) {
|
if (vflag && !eflag) {
|
||||||
|
if (force->newton_bond)
|
||||||
|
eval<0,1,1>(vflag, buffers, fc);
|
||||||
|
else
|
||||||
|
eval<0,1,0>(vflag, buffers, fc);
|
||||||
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
eval<1,1,1>(vflag, buffers, fc);
|
eval<1,1,1>(vflag, buffers, fc);
|
||||||
else
|
else
|
||||||
eval<1,1,0>(vflag, buffers, fc);
|
eval<1,1,0>(vflag, buffers, fc);
|
||||||
} else {
|
|
||||||
if (force->newton_bond)
|
|
||||||
eval<1,0,1>(vflag, buffers, fc);
|
|
||||||
else
|
|
||||||
eval<1,0,0>(vflag, buffers, fc);
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_bond)
|
if (force->newton_bond)
|
||||||
@ -114,7 +114,7 @@ void DihedralCharmmIntel::compute(int eflag, int vflag,
|
|||||||
|
|
||||||
#ifndef LMP_USE_AVXCD_DHC
|
#ifndef LMP_USE_AVXCD_DHC
|
||||||
|
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void DihedralCharmmIntel::eval(const int vflag,
|
void DihedralCharmmIntel::eval(const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
@ -140,13 +140,10 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
|
|
||||||
acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5;
|
acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5;
|
||||||
if (EVFLAG) {
|
if (EFLAG) oevdwl = oecoul = oedihedral = (acc_t)0.0;
|
||||||
if (EFLAG)
|
if (VFLAG && vflag) {
|
||||||
oevdwl = oecoul = oedihedral = (acc_t)0.0;
|
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||||
if (vflag) {
|
opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
|
||||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
|
||||||
opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
@ -156,8 +153,13 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
opv0,opv1,opv2,opv3,opv4,opv5)
|
opv0,opv1,opv2,opv3,opv4,opv5)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
|
#if defined(LMP_SIMD_COMPILER_TEST)
|
||||||
int nfrom, nto, tid;
|
int nfrom, nto, tid;
|
||||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||||
|
#else
|
||||||
|
int nfrom, npl, nto, tid;
|
||||||
|
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
|
||||||
|
#endif
|
||||||
|
|
||||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||||
if (fix->need_zero(tid))
|
if (fix->need_zero(tid))
|
||||||
@ -169,21 +171,19 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
|
|
||||||
acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
|
acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||||
acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5;
|
acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5;
|
||||||
if (EVFLAG) {
|
if (EFLAG) sevdwl = secoul = sedihedral = (acc_t)0.0;
|
||||||
if (EFLAG)
|
if (VFLAG && vflag) {
|
||||||
sevdwl = secoul = sedihedral = (acc_t)0.0;
|
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||||
if (vflag) {
|
spv0 = spv1 = spv2 = spv3 = spv4 = spv5 = (acc_t)0.0;
|
||||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
|
||||||
spv0 = spv1 = spv2 = spv3 = spv4 = spv5 = (acc_t)0.0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER_TEST)
|
#if defined(LMP_SIMD_COMPILER_TEST)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
|
#pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
|
||||||
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5)
|
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5)
|
||||||
#endif
|
|
||||||
for (int n = nfrom; n < nto; n++) {
|
for (int n = nfrom; n < nto; n++) {
|
||||||
|
#endif
|
||||||
|
for (int n = nfrom; n < nto; n += npl) {
|
||||||
const int i1 = dihedrallist[n].a;
|
const int i1 = dihedrallist[n].a;
|
||||||
const int i2 = dihedrallist[n].b;
|
const int i2 = dihedrallist[n].b;
|
||||||
const int i3 = dihedrallist[n].c;
|
const int i3 = dihedrallist[n].c;
|
||||||
@ -333,14 +333,14 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
const flt_t f3y = -sy2 - f4y;
|
const flt_t f3y = -sy2 - f4y;
|
||||||
const flt_t f3z = -sz2 - f4z;
|
const flt_t f3z = -sz2 - f4z;
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG || VFLAG) {
|
||||||
flt_t deng;
|
flt_t deng;
|
||||||
if (EFLAG) deng = tk * p;
|
if (EFLAG) deng = tk * p;
|
||||||
IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, deng, i1, i2, i3, i4, f1x,
|
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3,
|
||||||
f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x,
|
i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
|
||||||
vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y,
|
f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
|
||||||
vb3z, sedihedral, f, NEWTON_BOND, nlocal,
|
vb3x, vb3y, vb3z, sedihedral, f, NEWTON_BOND,
|
||||||
sv0, sv1, sv2, sv3, sv4, sv5);
|
nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -387,7 +387,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
f4z -= delz*fpair;
|
f4z -= delz*fpair;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG || VFLAG) {
|
||||||
flt_t ev_pre = (flt_t)0;
|
flt_t ev_pre = (flt_t)0;
|
||||||
if (NEWTON_BOND || i1 < nlocal)
|
if (NEWTON_BOND || i1 < nlocal)
|
||||||
ev_pre += (flt_t)0.5;
|
ev_pre += (flt_t)0.5;
|
||||||
@ -412,7 +412,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
}
|
}
|
||||||
// IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
|
// IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
|
||||||
// delx, dely, delz);
|
// delx, dely, delz);
|
||||||
if (vflag) {
|
if (VFLAG && vflag) {
|
||||||
spv0 += ev_pre * delx * delx * fpair;
|
spv0 += ev_pre * delx * delx * fpair;
|
||||||
spv1 += ev_pre * dely * dely * fpair;
|
spv1 += ev_pre * dely * dely * fpair;
|
||||||
spv2 += ev_pre * delz * delz * fpair;
|
spv2 += ev_pre * delz * delz * fpair;
|
||||||
@ -440,36 +440,32 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} // for n
|
} // for n
|
||||||
if (EVFLAG) {
|
if (EFLAG) {
|
||||||
if (EFLAG) {
|
oedihedral += sedihedral;
|
||||||
oedihedral += sedihedral;
|
oecoul += secoul;
|
||||||
oecoul += secoul;
|
oevdwl += sevdwl;
|
||||||
oevdwl += sevdwl;
|
}
|
||||||
}
|
if (VFLAG && vflag) {
|
||||||
if (vflag) {
|
ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||||
ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
opv0 += spv0; opv1 += spv1; opv2 += spv2;
|
||||||
opv0 += spv0; opv1 += spv1; opv2 += spv2;
|
opv3 += spv3; opv4 += spv4; opv5 += spv5;
|
||||||
opv3 += spv3; opv4 += spv4; opv5 += spv5;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} // omp parallel
|
} // omp parallel
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG) {
|
||||||
if (EFLAG) {
|
energy += oedihedral;
|
||||||
energy += oedihedral;
|
force->pair->eng_vdwl += oevdwl;
|
||||||
force->pair->eng_vdwl += oevdwl;
|
force->pair->eng_coul += oecoul;
|
||||||
force->pair->eng_coul += oecoul;
|
}
|
||||||
}
|
if (VFLAG && vflag) {
|
||||||
if (vflag) {
|
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
force->pair->virial[0] += opv0;
|
||||||
force->pair->virial[0] += opv0;
|
force->pair->virial[1] += opv1;
|
||||||
force->pair->virial[1] += opv1;
|
force->pair->virial[2] += opv2;
|
||||||
force->pair->virial[2] += opv2;
|
force->pair->virial[3] += opv3;
|
||||||
force->pair->virial[3] += opv3;
|
force->pair->virial[4] += opv4;
|
||||||
force->pair->virial[4] += opv4;
|
force->pair->virial[5] += opv5;
|
||||||
force->pair->virial[5] += opv5;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fix->set_reduce_flag();
|
fix->set_reduce_flag();
|
||||||
@ -488,7 +484,7 @@ authors for more details.
|
|||||||
|
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||||
void DihedralCharmmIntel::eval(const int vflag,
|
void DihedralCharmmIntel::eval(const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc)
|
const ForceConst<flt_t> &fc)
|
||||||
@ -518,13 +514,10 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
|
|
||||||
acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5;
|
acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5;
|
||||||
if (EVFLAG) {
|
if (EFLAG) oevdwl = oecoul = oedihedral = (acc_t)0.0;
|
||||||
if (EFLAG)
|
if (VFLAG && vflag) {
|
||||||
oevdwl = oecoul = oedihedral = (acc_t)0.0;
|
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||||
if (vflag) {
|
opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
|
||||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
|
||||||
opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
@ -534,8 +527,9 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
opv0,opv1,opv2,opv3,opv4,opv5)
|
opv0,opv1,opv2,opv3,opv4,opv5)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int nfrom, nto, tid;
|
int nfrom, npl, nto, tid;
|
||||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
IP_PRE_omp_stride_id_vec(nfrom, npl, nto, tid, inum, nthreads,
|
||||||
|
swidth);
|
||||||
|
|
||||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||||
if (fix->need_zero(tid))
|
if (fix->need_zero(tid))
|
||||||
@ -559,26 +553,24 @@ void DihedralCharmmIntel::eval(const int vflag,
|
|||||||
|
|
||||||
SIMD_acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
|
SIMD_acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||||
SIMD_acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5;
|
--- a/src/USER-INTEL/dihedral_charmm_intel.cpp
+++ b/src/USER-INTEL/dihedral_charmm_intel.cpp
       SIMD_acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5;
-      if (EVFLAG) {
-        if (EFLAG) {
-          sevdwl = SIMD_set((acc_t)0.0);
-          secoul = SIMD_set((acc_t)0.0);
-          sedihedral = SIMD_set((acc_t)0.0);
-        }
-        if (vflag) {
-          sv0 = SIMD_set((acc_t)0.0);
-          sv1 = SIMD_set((acc_t)0.0);
-          sv2 = SIMD_set((acc_t)0.0);
-          sv3 = SIMD_set((acc_t)0.0);
-          sv4 = SIMD_set((acc_t)0.0);
-          sv5 = SIMD_set((acc_t)0.0);
-          spv0 = SIMD_set((acc_t)0.0);
-          spv1 = SIMD_set((acc_t)0.0);
-          spv2 = SIMD_set((acc_t)0.0);
-          spv3 = SIMD_set((acc_t)0.0);
-          spv4 = SIMD_set((acc_t)0.0);
-          spv5 = SIMD_set((acc_t)0.0);
-        }
-      }
+      if (EFLAG) {
+        sevdwl = SIMD_set((acc_t)0.0);
+        secoul = SIMD_set((acc_t)0.0);
+        sedihedral = SIMD_set((acc_t)0.0);
+      }
+      if (VFLAG && vflag) {
+        sv0 = SIMD_set((acc_t)0.0);
+        sv1 = SIMD_set((acc_t)0.0);
+        sv2 = SIMD_set((acc_t)0.0);
+        sv3 = SIMD_set((acc_t)0.0);
+        sv4 = SIMD_set((acc_t)0.0);
+        sv5 = SIMD_set((acc_t)0.0);
+        spv0 = SIMD_set((acc_t)0.0);
+        spv1 = SIMD_set((acc_t)0.0);
+        spv2 = SIMD_set((acc_t)0.0);
+        spv3 = SIMD_set((acc_t)0.0);
+        spv4 = SIMD_set((acc_t)0.0);
+        spv5 = SIMD_set((acc_t)0.0);
+      }

       SIMD_int n_offset = SIMD_set(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
@@ -588,7 +580,7 @@ void DihedralCharmmIntel::eval(const int vflag,
     const SIMD_int simd_nlocals4 = SIMD_set(nlocals4);
     const int ntypes = atom->ntypes + 1;

-    for (int n = nfrom; n < nto; n += swidth) {
+    for (int n = nfrom; n < nto; n += npl) {
       SIMD_mask nmask = n_offset < nto5;
       SIMD_int i1 = SIMD_gather(nmask, dihedrallist, n_offset);
       const SIMD_flt_t q1 = SIMD_gather(nmask, q, i1);
@@ -601,7 +593,7 @@ void DihedralCharmmIntel::eval(const int vflag,
       SIMD_int type = SIMD_gather(nmask, dihedrallist+4, n_offset);
       const SIMD_flt_t tweight = SIMD_gather(nmask, weight, type);
       type = type << 2;
-      n_offset = n_offset + swidth * 5;
+      n_offset = n_offset + npl * 5;

       // 1st bond

@@ -747,7 +739,7 @@ void DihedralCharmmIntel::eval(const int vflag,
       SIMD_flt_t f3z = -sz2 - f4z;

       SIMD_flt_t qdeng;
-      if (EVFLAG) {
+      if (EFLAG || VFLAG) {
         SIMD_flt_t ev_pre;
         if (NEWTON_BOND) ev_pre = one;
         else {
@@ -774,7 +766,7 @@ void DihedralCharmmIntel::eval(const int vflag,
             SIMD_jeng_update(newton_mask, featom, i3, ieng);
           }
         }
-        if (vflag) {
+        if (VFLAG && vflag) {
           sv0 = SIMD_ev_add(sv0, ev_pre*(vb1x*f1x-vb2xm*f3x+(vb3x-vb2xm)*f4x));
           sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y));
           sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z));
@@ -816,7 +808,7 @@ void DihedralCharmmIntel::eval(const int vflag,
       f4y = f4y - dely * fpair;
       f4z = f4z - delz * fpair;

-      if (EVFLAG) {
+      if (EFLAG || VFLAG) {
         SIMD_flt_t ev_pre;
         if (NEWTON_BOND) ev_pre = one;
         else {
@@ -848,7 +840,7 @@ void DihedralCharmmIntel::eval(const int vflag,
             SIMD_jeng_update(newton_mask, featom, i4, ieng);
           }
         }
-        if (vflag) {
+        if (VFLAG && vflag) {
           spv0 = SIMD_ev_add(spv0, ev_pre * delx * delx * fpair);
           spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair);
           spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair);
@@ -865,45 +857,41 @@ void DihedralCharmmIntel::eval(const int vflag,
       SIMD_safe_jforce(newton_mask, pforce, i4, f4x, f4y, f4z);
     } // for n

-    if (EVFLAG) {
-      if (EFLAG) {
-        oedihedral += SIMD_sum(sedihedral);
-        oecoul += SIMD_sum(secoul);
-        oevdwl += SIMD_sum(sevdwl);
-      }
-      if (vflag) {
-        ov0 += SIMD_sum(sv0);
-        ov1 += SIMD_sum(sv1);
-        ov2 += SIMD_sum(sv2);
-        ov3 += SIMD_sum(sv3);
-        ov4 += SIMD_sum(sv4);
-        ov5 += SIMD_sum(sv5);
-        opv0 += SIMD_sum(spv0);
-        opv1 += SIMD_sum(spv1);
-        opv2 += SIMD_sum(spv2);
-        opv3 += SIMD_sum(spv3);
-        opv4 += SIMD_sum(spv4);
-        opv5 += SIMD_sum(spv5);
-      }
-    }
+    if (EFLAG) {
+      oedihedral += SIMD_sum(sedihedral);
+      oecoul += SIMD_sum(secoul);
+      oevdwl += SIMD_sum(sevdwl);
+    }
+    if (VFLAG && vflag) {
+      ov0 += SIMD_sum(sv0);
+      ov1 += SIMD_sum(sv1);
+      ov2 += SIMD_sum(sv2);
+      ov3 += SIMD_sum(sv3);
+      ov4 += SIMD_sum(sv4);
+      ov5 += SIMD_sum(sv5);
+      opv0 += SIMD_sum(spv0);
+      opv1 += SIMD_sum(spv1);
+      opv2 += SIMD_sum(spv2);
+      opv3 += SIMD_sum(spv3);
+      opv4 += SIMD_sum(spv4);
+      opv5 += SIMD_sum(spv5);
+    }
   } // omp parallel

-  if (EVFLAG) {
-    if (EFLAG) {
-      energy += oedihedral;
-      force->pair->eng_vdwl += oevdwl;
-      force->pair->eng_coul += oecoul;
-    }
-    if (vflag) {
-      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
-      force->pair->virial[0] += opv0;
-      force->pair->virial[1] += opv1;
-      force->pair->virial[2] += opv2;
-      force->pair->virial[3] += opv3;
-      force->pair->virial[4] += opv4;
-      force->pair->virial[5] += opv5;
-    }
-  }
+  if (EFLAG) {
+    energy += oedihedral;
+    force->pair->eng_vdwl += oevdwl;
+    force->pair->eng_coul += oecoul;
+  }
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
+    force->pair->virial[0] += opv0;
+    force->pair->virial[1] += opv1;
+    force->pair->virial[2] += opv2;
+    force->pair->virial[3] += opv3;
+    force->pair->virial[4] += opv4;
+    force->pair->virial[5] += opv5;
+  }

   fix->set_reduce_flag();
@@ -953,12 +941,14 @@ void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
   fc.set_ntypes(tp1,bp1,memory);
   buffers->set_ntypes(tp1);

-  for (int i = 0; i < tp1; i++) {
-    for (int j = 0; j < tp1; j++) {
-      fc.ljp[i][j].lj1 = lj14_1[i][j];
-      fc.ljp[i][j].lj2 = lj14_2[i][j];
-      fc.ljp[i][j].lj3 = lj14_3[i][j];
-      fc.ljp[i][j].lj4 = lj14_4[i][j];
-    }
-  }
+  if (weightflag) {
+    for (int i = 0; i < tp1; i++) {
+      for (int j = 0; j < tp1; j++) {
+        fc.ljp[i][j].lj1 = lj14_1[i][j];
+        fc.ljp[i][j].lj2 = lj14_2[i][j];
+        fc.ljp[i][j].lj3 = lj14_3[i][j];
+        fc.ljp[i][j].lj4 = lj14_4[i][j];
+      }
+    }
+  }
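The hunks above replace the single EVFLAG template parameter with independent EFLAG and VFLAG compile-time flags, so energy and virial accumulation are each compiled in only when requested. The sketch below is a minimal, standalone illustration of that idea; the names (accumulate, Result) are illustrative and are not the LAMMPS API.

    // Compile-time gating of energy/virial accumulation (illustrative sketch).
    #include <cstdio>

    struct Result { double energy; double virial; };

    template <int EFLAG, int VFLAG>
    void accumulate(double e_term, double v_term, Result &out) {
      // EFLAG/VFLAG are known at compile time, so the unused branch is
      // eliminated and the hot loop carries no runtime test.
      if (EFLAG) out.energy += e_term;
      if (VFLAG) out.virial += v_term;
    }

    int main() {
      Result r = {0.0, 0.0};
      for (int i = 0; i < 4; i++)
        accumulate<1,0>(1.5, 2.0, r);   // energy only; virial code compiled out
      std::printf("energy %g virial %g\n", r.energy, r.virial);
      return 0;
    }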
--- a/src/USER-INTEL/dihedral_harmonic_intel.cpp
+++ b/src/USER-INTEL/dihedral_harmonic_intel.cpp
@@ -77,16 +77,16 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag,
   } else evflag = 0;

   if (evflag) {
-    if (eflag) {
+    if (vflag && !eflag) {
+      if (force->newton_bond)
+        eval<0,1,1>(vflag, buffers, fc);
+      else
+        eval<0,1,0>(vflag, buffers, fc);
+    } else {
       if (force->newton_bond)
         eval<1,1,1>(vflag, buffers, fc);
       else
         eval<1,1,0>(vflag, buffers, fc);
-    } else {
-      if (force->newton_bond)
-        eval<1,0,1>(vflag, buffers, fc);
-      else
-        eval<1,0,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -96,7 +96,7 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag,
   }
 }

-template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void DihedralHarmonicIntel::eval(const int vflag,
                                  IntelBuffers<flt_t,acc_t> *buffers,
                                  const ForceConst<flt_t> &fc)
@@ -120,12 +120,9 @@ void DihedralHarmonicIntel::eval(const int vflag,
   const int nthreads = tc;

   acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
-  if (EVFLAG) {
-    if (EFLAG)
-      oedihedral = (acc_t)0.0;
-    if (vflag) {
-      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
-    }
-  }
+  if (EFLAG) oedihedral = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+  }

 #if defined(_OPENMP)
@@ -134,8 +131,12 @@ void DihedralHarmonicIntel::eval(const int vflag,
                      reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
 #endif
   {
-    int nfrom, nto, tid;
+    int nfrom, npl, nto, tid;
+#ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+#else
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+#endif

     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
@@ -144,16 +145,17 @@ void DihedralHarmonicIntel::eval(const int vflag,
     const int5_t * _noalias const dihedrallist =
       (int5_t *) neighbor->dihedrallist[0];

+#ifdef LMP_INTEL_USE_SIMDOFF
     acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
-    if (EVFLAG) {
-      if (EFLAG)
-        sedihedral = (acc_t)0.0;
-      if (vflag) {
-        sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
-      }
-    }
-    for (int n = nfrom; n < nto; n++) {
+    if (EFLAG) sedihedral = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
+    }
+#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
+    for (int n = nfrom; n < nto; n ++) {
+#else
+    for (int n = nfrom; n < nto; n += npl) {
+#endif
       const int i1 = dihedrallist[n].a;
       const int i2 = dihedrallist[n].b;
       const int i3 = dihedrallist[n].c;
@@ -203,6 +205,7 @@ void DihedralHarmonicIntel::eval(const int vflag,
       const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);

       // error check
+#ifndef LMP_INTEL_USE_SIMDOFF
       if (c > PTOLERANCE || c < MTOLERANCE) {
         int me = comm->me;

@@ -224,6 +227,7 @@ void DihedralHarmonicIntel::eval(const int vflag,
                   me,x[i4].x,x[i4].y,x[i4].z);
         }
       }
+#endif

       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
@@ -292,16 +296,27 @@ void DihedralHarmonicIntel::eval(const int vflag,
       const flt_t f3y = -sy2 - f4y;
       const flt_t f3z = -sz2 - f4z;

-      if (EVFLAG) {
+      if (EFLAG || VFLAG) {
         flt_t deng;
         if (EFLAG) deng = tk * p;
-        IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, deng, i1, i2, i3, i4, f1x,
-                              f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x,
-                              vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y,
-                              vb3z, sedihedral, f, NEWTON_BOND, nlocal,
-                              sv0, sv1, sv2, sv3, sv4, sv5);
+#ifdef LMP_INTEL_USE_SIMDOFF
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
+                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
+                              vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
+                              vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
+                              sv0, sv1, sv2, sv3, sv4, sv5);
+#else
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
+                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
+                              vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
+                              vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
+                              ov0, ov1, ov2, ov3, ov4, ov5);
+#endif
       }

+#ifdef LMP_INTEL_USE_SIMDOFF
+#pragma simdoff
+#endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += f1x;
@@ -328,20 +343,19 @@ void DihedralHarmonicIntel::eval(const int vflag,
         }
       }
     } // for n
-    if (EVFLAG) {
+#ifdef LMP_INTEL_USE_SIMDOFF
       if (EFLAG) oedihedral += sedihedral;
-      if (vflag) {
-        ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
-      }
-    }
+      if (VFLAG && vflag) {
+        ov0 += sv0; ov1 += sv1; ov2 += sv2;
+        ov3 += sv3; ov4 += sv4; ov5 += sv5;
+      }
+#endif
   } // omp parallel

-  if (EVFLAG) {
-    if (EFLAG) energy += oedihedral;
-    if (vflag) {
-      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
-    }
-  }
+  if (EFLAG) energy += oedihedral;
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
+  }

   fix->set_reduce_flag();
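The eval() changes above switch the thread decomposition from a contiguous range (IP_PRE_omp_range_id) to a strided split (IP_PRE_omp_stride_id) whenever the simdoff path is not used, which is why the loop body becomes "n += npl". The sketch below contrasts the two splits with hypothetical helper functions; it is not the LAMMPS macro implementation.

    // Contiguous-range vs strided (round-robin) work splitting (sketch).
    #include <cstdio>

    // Contiguous block: thread tid owns [nfrom, nto).
    void range_split(int inum, int nthreads, int tid, int &nfrom, int &nto) {
      int chunk = (inum + nthreads - 1) / nthreads;
      nfrom = tid * chunk;
      nto = (nfrom + chunk > inum) ? inum : nfrom + chunk;
    }

    // Strided: thread tid starts at its own offset and steps by nthreads,
    // matching a loop of the form "for (int n = nfrom; n < nto; n += npl)".
    void stride_split(int inum, int nthreads, int tid,
                      int &nfrom, int &npl, int &nto) {
      nfrom = tid;
      npl = nthreads;
      nto = inum;
    }

    int main() {
      const int inum = 10, nthreads = 3;
      for (int tid = 0; tid < nthreads; tid++) {
        int nfrom, npl, nto;
        stride_split(inum, nthreads, tid, nfrom, npl, nto);
        std::printf("thread %d handles:", tid);
        for (int n = nfrom; n < nto; n += npl) std::printf(" %d", n);
        std::printf("\n");
      }
      return 0;
    }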
--- a/src/USER-INTEL/dihedral_opls_intel.cpp
+++ b/src/USER-INTEL/dihedral_opls_intel.cpp
@@ -81,16 +81,16 @@ void DihedralOPLSIntel::compute(int eflag, int vflag,
   } else evflag = 0;

   if (evflag) {
-    if (eflag) {
+    if (vflag && !eflag) {
+      if (force->newton_bond)
+        eval<0,1,1>(vflag, buffers, fc);
+      else
+        eval<0,1,0>(vflag, buffers, fc);
+    } else {
       if (force->newton_bond)
         eval<1,1,1>(vflag, buffers, fc);
       else
         eval<1,1,0>(vflag, buffers, fc);
-    } else {
-      if (force->newton_bond)
-        eval<1,0,1>(vflag, buffers, fc);
-      else
-        eval<1,0,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -100,7 +100,7 @@ void DihedralOPLSIntel::compute(int eflag, int vflag,
   }
 }

-template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void DihedralOPLSIntel::eval(const int vflag,
                              IntelBuffers<flt_t,acc_t> *buffers,
                              const ForceConst<flt_t> &fc)
@@ -124,12 +124,9 @@ void DihedralOPLSIntel::eval(const int vflag,
   const int nthreads = tc;

   acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
-  if (EVFLAG) {
-    if (EFLAG)
-      oedihedral = (acc_t)0.0;
-    if (vflag) {
-      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
-    }
-  }
+  if (EFLAG) oedihedral = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+  }

 #if defined(_OPENMP)
@@ -138,8 +135,12 @@ void DihedralOPLSIntel::eval(const int vflag,
                      reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
 #endif
   {
-    int nfrom, nto, tid;
+    int nfrom, npl, nto, tid;
+#ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+#else
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+#endif

     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
@@ -148,16 +149,17 @@ void DihedralOPLSIntel::eval(const int vflag,
     const int5_t * _noalias const dihedrallist =
       (int5_t *) neighbor->dihedrallist[0];

+#ifdef LMP_INTEL_USE_SIMDOFF
     acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
-    if (EVFLAG) {
-      if (EFLAG)
-        sedihedral = (acc_t)0.0;
-      if (vflag) {
-        sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
-      }
-    }
-    for (int n = nfrom; n < nto; n++) {
+    if (EFLAG) sedihedral = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
+    }
+#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
+    for (int n = nfrom; n < nto; n ++) {
+#else
+    for (int n = nfrom; n < nto; n += npl) {
+#endif
       const int i1 = dihedrallist[n].a;
       const int i2 = dihedrallist[n].b;
       const int i3 = dihedrallist[n].c;
@@ -236,6 +238,7 @@ void DihedralOPLSIntel::eval(const int vflag,
       const flt_t dx = (cx*vb3x + cy*vb3y + cz*vb3z)*cmag*rb3;

       // error check
+#ifndef LMP_INTEL_USE_SIMDOFF
       if (c > PTOLERANCE || c < MTOLERANCE) {
         int me = comm->me;

@@ -257,6 +260,7 @@ void DihedralOPLSIntel::eval(const int vflag,
                   me,x[i4].x,x[i4].y,x[i4].z);
         }
       }
+#endif

       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
@@ -321,14 +325,25 @@ void DihedralOPLSIntel::eval(const int vflag,
       const flt_t f3y = sy2 - f4y;
       const flt_t f3z = sz2 - f4z;

-      if (EVFLAG) {
-        IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, edihed, i1, i2, i3, i4, f1x,
-                              f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x,
-                              vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y,
-                              vb3z, sedihedral, f, NEWTON_BOND, nlocal,
-                              sv0, sv1, sv2, sv3, sv4, sv5);
+      if (EFLAG || VFLAG) {
+#ifdef LMP_INTEL_USE_SIMDOFF
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3,
+                              i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
+                              vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
+                              vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
+                              sv0, sv1, sv2, sv3, sv4, sv5);
+#else
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3,
+                              i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
+                              vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
+                              vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
+                              ov0, ov1, ov2, ov3, ov4, ov5);
+#endif
       }

+#ifdef LMP_INTEL_USE_SIMDOFF
+#pragma simdoff
+#endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += f1x;
@@ -355,20 +370,19 @@ void DihedralOPLSIntel::eval(const int vflag,
         }
       }
     } // for n
-    if (EVFLAG) {
+#ifdef LMP_INTEL_USE_SIMDOFF
       if (EFLAG) oedihedral += sedihedral;
-      if (vflag) {
-        ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
-      }
-    }
+      if (VFLAG && vflag) {
+        ov0 += sv0; ov1 += sv1; ov2 += sv2;
+        ov3 += sv3; ov4 += sv4; ov5 += sv5;
+      }
+#endif
   } // omp parallel

-  if (EVFLAG) {
-    if (EFLAG) energy += oedihedral;
-    if (vflag) {
-      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
-    }
-  }
+  if (EFLAG) energy += oedihedral;
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
+  }

   fix->set_reduce_flag();
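In the vectorized simdoff path shown above, per-iteration energy and virial terms are first summed into thread-private accumulators (sedihedral, sv0..sv5) under a SIMD reduction, and only folded into the per-run totals once after the loop. The standalone sketch below illustrates that accumulate-then-merge pattern in plain C++; the Intel-specific #pragma simd hint from the diff is omitted and the variable names are illustrative.

    // Accumulate-then-merge reduction pattern (sketch).
    #include <cstdio>

    int main() {
      const int n = 1000;
      double senergy = 0.0, sv0 = 0.0;        // loop-private partial sums
      for (int i = 0; i < n; i++) {           // the vectorizable inner loop
        double e = 0.001 * i, v = 0.002 * i;
        senergy += e;
        sv0 += v;
      }
      double oenergy = 0.0, ov0 = 0.0;        // per-thread/run totals
      oenergy += senergy;                      // single merge after the loop
      ov0 += sv0;
      std::printf("energy %g virial %g\n", oenergy, ov0);
      return 0;
    }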
--- a/src/USER-INTEL/fix_intel.cpp
+++ b/src/USER-INTEL/fix_intel.cpp
@@ -61,6 +61,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
   int ncops = force->inumeric(FLERR,arg[3]);

   _nbor_pack_width = 1;
+  _three_body_neighbor = 0;

   _precision_mode = PREC_MODE_MIXED;
   _offload_balance = -1.0;
@@ -326,12 +327,18 @@ void FixIntel::init()
       "Currently, cannot use more than one intel style with hybrid.");

   check_neighbor_intel();
-  if (_precision_mode == PREC_MODE_SINGLE)
+  int off_mode = 0;
+  if (_offload_balance != 0.0) off_mode = 1;
+  if (_precision_mode == PREC_MODE_SINGLE) {
     _single_buffers->zero_ev();
-  else if (_precision_mode == PREC_MODE_MIXED)
+    _single_buffers->grow_ncache(off_mode,_nthreads);
+  } else if (_precision_mode == PREC_MODE_MIXED) {
     _mixed_buffers->zero_ev();
-  else
+    _mixed_buffers->grow_ncache(off_mode,_nthreads);
+  } else {
     _double_buffers->zero_ev();
+    _double_buffers->grow_ncache(off_mode,_nthreads);
+  }

   _need_reduce = 0;
 }
@@ -367,8 +374,6 @@ void FixIntel::pair_init_check(const bool cdmessage)
 {
 #ifdef INTEL_VMASK
   atom->sortfreq = 1;
-  if (neighbor->binsizeflag && atom->userbinsize <= 0.0)
-    atom->userbinsize = neighbor->binsize_user;
 #endif

   _nbor_pack_width = 1;
@@ -376,9 +381,8 @@ void FixIntel::pair_init_check(const bool cdmessage)
 #ifdef _LMP_INTEL_OFFLOAD
   if (_offload_balance != 0.0) atom->sortfreq = 1;

-  if (force->newton_pair == 0)
-    _offload_noghost = 0;
-  else if (_offload_ghost == 0)
+  _offload_noghost = 0;
+  if (force->newton_pair && _offload_ghost == 0)
     _offload_noghost = 1;

   set_offload_affinity();
@@ -535,24 +539,24 @@ void FixIntel::pre_reverse(int eflag, int vflag)
 {
   if (_force_array_m != 0) {
     if (_need_reduce) {
-      reduce_results(_force_array_m);
+      reduce_results(&_force_array_m[0].x);
       _need_reduce = 0;
     }
-    add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom, 0);
+    add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom,0);
     _force_array_m = 0;
   } else if (_force_array_d != 0) {
     if (_need_reduce) {
-      reduce_results(_force_array_d);
+      reduce_results(&_force_array_d[0].x);
       _need_reduce = 0;
     }
-    add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom, 0);
+    add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom,0);
     _force_array_d = 0;
   } else if (_force_array_s != 0) {
     if (_need_reduce) {
-      reduce_results(_force_array_s);
+      reduce_results(&_force_array_s[0].x);
       _need_reduce = 0;
     }
-    add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom, 0);
+    add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom,0);
     _force_array_s = 0;
   }

@@ -563,47 +567,56 @@ void FixIntel::pre_reverse(int eflag, int vflag)

 /* ---------------------------------------------------------------------- */

-template <class ft>
-void FixIntel::reduce_results(ft * _noalias const f_start)
+template <class acc_t>
+void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
 {
   int o_range, f_stride;
   if (force->newton_pair)
     o_range = atom->nlocal + atom->nghost;
   else
     o_range = atom->nlocal;
-  IP_PRE_get_stride(f_stride, o_range, sizeof(ft), lmp->atom->torque);
+  IP_PRE_get_stride(f_stride, o_range, (sizeof(acc_t)*4), lmp->atom->torque);

-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) shared(o_range, f_stride)
-  #endif
-  {
-    int iifrom, iito, tid;
-    IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads,
-                              sizeof(ft));
-
-    int t_off = f_stride;
-    if (_results_eatom) {
-      for (int t = 1; t < _nthreads; t++) {
-        _use_simd_pragma("vector nontemporal")
-        _use_simd_pragma("novector")
-        for (int n = iifrom; n < iito; n++) {
-          f_start[n].x += f_start[n + t_off].x;
-          f_start[n].y += f_start[n + t_off].y;
-          f_start[n].z += f_start[n + t_off].z;
-          f_start[n].w += f_start[n + t_off].w;
-        }
-        t_off += f_stride;
-      }
-    } else {
-      for (int t = 1; t < _nthreads; t++) {
-        _use_simd_pragma("vector nontemporal")
-        _use_simd_pragma("novector")
-        for (int n = iifrom; n < iito; n++) {
-          f_start[n].x += f_start[n + t_off].x;
-          f_start[n].y += f_start[n + t_off].y;
-          f_start[n].z += f_start[n + t_off].z;
-        }
-        t_off += f_stride;
-      }
-    }
-  }
-}
+  o_range *= 4;
+  const int f_stride4 = f_stride * 4;
+
+  if (_nthreads <= INTEL_HTHREADS) {
+    acc_t *f_scalar2 = f_scalar + f_stride4;
+    if (_nthreads == 4) {
+      acc_t *f_scalar3 = f_scalar2 + f_stride4;
+      acc_t *f_scalar4 = f_scalar3 + f_stride4;
+      _use_simd_pragma("vector aligned")
+      _use_simd_pragma("simd")
+      for (int n = 0; n < o_range; n++)
+        f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];
+    } else if (_nthreads == 2) {
+      _use_simd_pragma("vector aligned")
+      _use_simd_pragma("simd")
+      for (int n = 0; n < o_range; n++)
+        f_scalar[n] += f_scalar2[n];
+    } else {
+      acc_t *f_scalar3 = f_scalar2 + f_stride4;
+      _use_simd_pragma("vector aligned")
+      _use_simd_pragma("simd")
+      for (int n = 0; n < o_range; n++)
+        f_scalar[n] += f_scalar2[n] + f_scalar3[n];
+    }
+  } else {
+    #if defined(_OPENMP)
+    #pragma omp parallel
+    #endif
+    {
+      int iifrom, iito, tid;
+      IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads,
+                                sizeof(acc_t));
+
+      acc_t *f_scalar2 = f_scalar + f_stride4;
+      for (int t = 1; t < _nthreads; t++) {
+        _use_simd_pragma("vector aligned")
+        _use_simd_pragma("simd")
+        for (int n = iifrom; n < iito; n++)
+          f_scalar[n] += f_scalar2[n];
+        f_scalar2 += f_stride4;
+      }
+    }
+  }
+}
@@ -641,40 +654,59 @@ void FixIntel::add_results(const ft * _noalias const f_in,
 #ifdef _LMP_INTEL_OFFLOAD
   if (_separate_buffers) {
     if (offload) {
-      add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
       if (force->newton_pair) {
+        add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
         const acc_t * _noalias const enull = 0;
         int offset = _offload_nlocal;
         if (atom->torque) offset *= 2;
         add_oresults(f_in + offset, enull, eatom, vatom,
                      _offload_min_ghost, _offload_nghost);
-      }
+      } else
+        add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair());
     } else {
-      add_oresults(f_in, ev_global, eatom, vatom,
-                   _host_min_local, _host_used_local);
       if (force->newton_pair) {
+        add_oresults(f_in, ev_global, eatom, vatom,
+                     _host_min_local, _host_used_local);
         const acc_t * _noalias const enull = 0;
         int offset = _host_used_local;
         if (atom->torque) offset *= 2;
         add_oresults(f_in + offset, enull, eatom,
                      vatom, _host_min_ghost, _host_used_ghost);
+      } else {
+        int start = host_start_pair();
+        add_oresults(f_in, ev_global, eatom, vatom, start, atom->nlocal-start);
       }
     }
     stop_watch(TIME_PACK);
     return;
   }
-  if (force->newton_pair && (_offload_noghost == 0 || offload == 0))
-    f_length = atom->nlocal + atom->nghost;
-  else
-    f_length = atom->nlocal;
+  int start;
+  if (offload) {
+    start = 0;
+    if (force->newton_pair) {
+      if (_offload_noghost == 0)
+        f_length = atom->nlocal + atom->nghost;
+      else
+        f_length = atom->nlocal;
+    } else
+      f_length = offload_end_pair();
+  } else {
+    if (force->newton_pair) {
+      start = 0;
+      f_length = atom->nlocal + atom->nghost;
+    } else {
+      start = host_start_pair();
+      f_length = atom->nlocal - start;
+    }
+  }
+  add_oresults(f_in, ev_global, eatom, vatom, start, f_length);
 #else
   if (force->newton_pair)
     f_length = atom->nlocal + atom->nghost;
   else
     f_length = atom->nlocal;
-#endif

   add_oresults(f_in, ev_global, eatom, vatom, 0, f_length);
+#endif
   stop_watch(TIME_PACK);
 }
@@ -695,8 +727,11 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
                "Sphere particles not yet supported for gayberne/intel");
   }

+  int packthreads;
+  if (_nthreads > INTEL_HTHREADS) packthreads = _nthreads;
+  else packthreads = 1;
 #if defined(_OPENMP)
-#pragma omp parallel default(none)
+#pragma omp parallel if(packthreads > 1)
 #endif
   {
 #if defined(_OPENMP)
@@ -705,7 +740,7 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
     const int tid = 0;
 #endif
     int ifrom, ito;
-    IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t));
+    IP_PRE_omp_range_align(ifrom, ito, tid, nall, packthreads, sizeof(acc_t));
     if (atom->torque) {
       int ii = ifrom * 2;
       lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
@@ -833,6 +868,11 @@ void FixIntel::add_off_results(const ft * _noalias const f_in,
       _offload_nlocal;
   }

+  if (atom->torque)
+    if (f_in[1].w < 0.0)
+      error->all(FLERR, "Bad matrix inversion in mldivide3");
+  add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1);
+
   // Load balance?
   if (_offload_balance < 0.0) {
     if (neighbor->ago == 0)
@@ -860,10 +900,6 @@ void FixIntel::add_off_results(const ft * _noalias const f_in,
   stop_watch(TIME_IMBALANCE);
 #endif
   acc_timers();
-  if (atom->torque)
-    if (f_in[1].w < 0.0)
-      error->all(FLERR, "Bad matrix inversion in mldivide3");
-  add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1);
 }

 /* ---------------------------------------------------------------------- */
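The rewritten reduce_results() above treats each thread's private force buffer as one flat array of scalars (four per atom, hence the factor of 4 in the stride) and sums the later thread blocks element-wise into block 0, with unrolled special cases for two and four threads. The sketch below reproduces only the generic element-wise part of that idea; the array sizes, thread count, and function name are illustrative, not the LAMMPS code.

    // Flat per-thread force reduction (sketch).
    #include <cstdio>
    #include <vector>

    void reduce_blocks(double *f, int stride4, int nthreads, int range4) {
      // Fold each extra thread's block of 4*stride scalars into block 0.
      for (int t = 1; t < nthreads; t++) {
        const double *src = f + t * stride4;
        for (int n = 0; n < range4; n++)
          f[n] += src[n];
      }
    }

    int main() {
      const int natoms = 3, nthreads = 2;
      const int stride4 = natoms * 4;            // x, y, z, w per atom
      std::vector<double> f(stride4 * nthreads, 1.0);
      reduce_blocks(f.data(), stride4, nthreads, stride4);
      std::printf("atom 0 fx after reduction: %g\n", f[0]);  // prints 2
      return 0;
    }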
--- a/src/USER-INTEL/fix_intel.h
+++ b/src/USER-INTEL/fix_intel.h
@@ -70,23 +70,32 @@ class FixIntel : public Fix {

   inline int nbor_pack_width() const { return _nbor_pack_width; }
   inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
+  inline int three_body_neighbor() { return _three_body_neighbor; }
+  inline void three_body_neighbor(const int i) { _three_body_neighbor = 1; }

   inline int need_zero(const int tid) {
     if (_need_reduce == 0 && tid > 0) return 1;
     return 0;
   }
-  inline void set_reduce_flag() { _need_reduce = 1; }
+  inline void set_reduce_flag() { if (_nthreads > 1) _need_reduce = 1; }
   inline int lrt() {
     if (force->kspace_match("pppm/intel", 0)) return _lrt;
     else return 0;
   }
+  inline int pppm_table() {
+    if (force->kspace_match("pppm/intel", 0) ||
+        force->kspace_match("pppm/disp/intel",0))
+      return INTEL_P3M_TABLE;
+    else return 0;
+  }
+

  protected:
   IntelBuffers<float,float> *_single_buffers;
   IntelBuffers<float,double> *_mixed_buffers;
   IntelBuffers<double,double> *_double_buffers;

-  int _precision_mode, _nthreads, _nbor_pack_width;
+  int _precision_mode, _nthreads, _nbor_pack_width, _three_body_neighbor;

  public:
   inline int* get_overflow_flag() { return _overflow_flag; }
@@ -241,7 +250,10 @@ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
   } else {
     nlocal = atom->nlocal;
     nall = _host_nall;
-    minlocal = _host_min_local;
+    if (force->newton)
+      minlocal = _host_min_local;
+    else
+      minlocal = host_start_pair();
   }
   return;
 }
@@ -275,7 +287,7 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
   _results_eatom = eatom;
   _results_vatom = vatom;
 #ifndef _LMP_INTEL_OFFLOAD
-  if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
+  if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1;
 #endif

   if (_overflow_flag[LMP_OVERFLOW])
@@ -303,7 +315,7 @@ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
   _results_eatom = eatom;
   _results_vatom = vatom;
 #ifndef _LMP_INTEL_OFFLOAD
-  if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
+  if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1;
 #endif

   if (_overflow_flag[LMP_OVERFLOW])
@@ -331,7 +343,7 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
   _results_eatom = eatom;
   _results_vatom = vatom;
 #ifndef _LMP_INTEL_OFFLOAD
-  if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
+  if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1;
 #endif

   if (_overflow_flag[LMP_OVERFLOW])
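The header changes above tighten the reduce-flag handshake: per-thread force copies only need to be zeroed and later reduced when more than one thread wrote them (and, in add_result_array, when Newton contributions can land on remote/ghost rows). The sketch below is a standalone illustration of that handshake; the struct and member names are illustrative and not the LAMMPS API.

    // Reduce-flag handshake between styles and the fix (sketch).
    #include <cstdio>

    struct ReduceState {
      int nthreads = 4;
      int need_reduce = 0;
      // A style announces that it accumulated into per-thread buffers;
      // with a single thread there is nothing to reduce.
      void set_reduce_flag() { if (nthreads > 1) need_reduce = 1; }
      // Threads other than 0 must zero their buffer if no reduction is pending.
      int need_zero(int tid) const { return (need_reduce == 0 && tid > 0); }
    };

    int main() {
      ReduceState s;
      std::printf("thread 2 must zero its buffer? %d\n", s.need_zero(2));
      s.set_reduce_flag();
      std::printf("after set_reduce_flag, need_reduce = %d\n", s.need_reduce);
      return 0;
    }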
--- a/src/USER-INTEL/improper_cvff_intel.cpp
+++ b/src/USER-INTEL/improper_cvff_intel.cpp
@@ -87,16 +87,16 @@ void ImproperCvffIntel::compute(int eflag, int vflag,
   else evflag = 0;

   if (evflag) {
-    if (eflag) {
+    if (vflag && !eflag) {
+      if (force->newton_bond)
+        eval<0,1,1>(vflag, buffers, fc);
+      else
+        eval<0,1,0>(vflag, buffers, fc);
+    } else {
       if (force->newton_bond)
         eval<1,1,1>(vflag, buffers, fc);
       else
         eval<1,1,0>(vflag, buffers, fc);
-    } else {
-      if (force->newton_bond)
-        eval<1,0,1>(vflag, buffers, fc);
-      else
-        eval<1,0,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -108,7 +108,7 @@ void ImproperCvffIntel::compute(int eflag, int vflag,

 /* ---------------------------------------------------------------------- */

-template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void ImproperCvffIntel::eval(const int vflag,
                              IntelBuffers<flt_t,acc_t> *buffers,
                              const ForceConst<flt_t> &fc)
@@ -131,12 +131,9 @@ void ImproperCvffIntel::eval(const int vflag,
   const int nthreads = tc;

   acc_t oeimproper, ov0, ov1, ov2, ov3, ov4, ov5;
-  if (EVFLAG) {
-    if (EFLAG)
-      oeimproper = (acc_t)0.0;
-    if (vflag) {
-      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
-    }
-  }
+  if (EFLAG) oeimproper = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+  }

 #if defined(_OPENMP)
@@ -145,8 +142,12 @@ void ImproperCvffIntel::eval(const int vflag,
                      reduction(+:oeimproper,ov0,ov1,ov2,ov3,ov4,ov5)
 #endif
   {
-    int nfrom, nto, tid;
+    int nfrom, npl, nto, tid;
+#ifdef LMP_INTEL_USE_SIMDOFF_FIX
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+#else
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+#endif

     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
@@ -155,7 +156,17 @@ void ImproperCvffIntel::eval(const int vflag,
     const int5_t * _noalias const improperlist =
       (int5_t *) neighbor->improperlist[0];

+#ifdef LMP_INTEL_USE_SIMDOFF_FIX
+    acc_t seimproper, sv0, sv1, sv2, sv3, sv4, sv5;
+    if (EFLAG) seimproper = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
+    }
+#pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
     for (int n = nfrom; n < nto; n++) {
+#else
+    for (int n = nfrom; n < nto; n += npl) {
+#endif
       const int i1 = improperlist[n].a;
       const int i2 = improperlist[n].b;
       const int i3 = improperlist[n].c;
@@ -216,7 +227,7 @@ void ImproperCvffIntel::eval(const int vflag,
       flt_t c = (c0 + c1mag*c2mag) * s12;

       // error check
+#ifndef LMP_INTEL_USE_SIMDOFF_FIX
       if (c > PTOLERANCE || c < MTOLERANCE) {
         int me;
         MPI_Comm_rank(world,&me);
@@ -238,6 +249,7 @@ void ImproperCvffIntel::eval(const int vflag,
                   me,x[i4].x,x[i4].y,x[i4].z);
         }
       }
+#endif

       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
@@ -250,31 +262,36 @@ void ImproperCvffIntel::eval(const int vflag,
       const int m = fc.fc[type].multiplicity;

       flt_t p, pd;
-      if (m == 2) {
-        p = (flt_t)2.0*c*c;
-        pd = (flt_t)2.0*c;
-      } else if (m == 3) {
-        const flt_t rc2 = c*c;
-        p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0;
-        pd = (flt_t)6.0*rc2 - (flt_t)1.5;
-      } else if (m == 4) {
-        const flt_t rc2 = c*c;
-        p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0;
-        pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c;
-      } else if (m == 6) {
-        const flt_t rc2 = c*c;
-        p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2;
-        pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c;
-      } else if (m == 1) {
-        p = c + (flt_t)1.0;
-        pd = (flt_t)0.5;
-      } else if (m == 5) {
-        const flt_t rc2 = c*c;
-        p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0;
-        pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5;
-      } else if (m == 0) {
-        p = (flt_t)2.0;
-        pd = (flt_t)0.0;
+#ifdef LMP_INTEL_USE_SIMDOFF_FIX
+#pragma simdoff
+#endif
+      {
+        if (m == 2) {
+          p = (flt_t)2.0*c*c;
+          pd = (flt_t)2.0*c;
+        } else if (m == 3) {
+          const flt_t rc2 = c*c;
+          p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0;
+          pd = (flt_t)6.0*rc2 - (flt_t)1.5;
+        } else if (m == 4) {
+          const flt_t rc2 = c*c;
+          p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0;
+          pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c;
+        } else if (m == 6) {
+          const flt_t rc2 = c*c;
+          p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2;
+          pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c;
+        } else if (m == 1) {
+          p = c + (flt_t)1.0;
+          pd = (flt_t)0.5;
+        } else if (m == 5) {
+          const flt_t rc2 = c*c;
+          p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0;
+          pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5;
+        } else if (m == 0) {
+          p = (flt_t)2.0;
+          pd = (flt_t)0.0;
+        }
       }

       if (fc.fc[type].sign == -1) {
@@ -317,46 +334,63 @@ void ImproperCvffIntel::eval(const int vflag,

       // apply force to each of 4 atoms

-      if (NEWTON_BOND || i1 < nlocal) {
-        f[i1].x += f1x;
-        f[i1].y += f1y;
-        f[i1].z += f1z;
-      }
-
-      if (NEWTON_BOND || i2 < nlocal) {
-        f[i2].x += f2x;
-        f[i2].y += f2y;
-        f[i2].z += f2z;
-      }
-
-      if (NEWTON_BOND || i3 < nlocal) {
-        f[i3].x += f3x;
-        f[i3].y += f3y;
-        f[i3].z += f3z;
-      }
-
-      if (NEWTON_BOND || i4 < nlocal) {
-        f[i4].x += f4x;
-        f[i4].y += f4y;
-        f[i4].z += f4z;
-      }
-
-      if (EVFLAG) {
-        IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, eimproper, i1, i2, i3, i4,
-                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
-                              vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
-                              vb3y, vb3z, oeimproper, f, NEWTON_BOND, nlocal,
-                              ov0, ov1, ov2, ov3, ov4, ov5);
+#ifdef LMP_INTEL_USE_SIMDOFF_FIX
+#pragma simdoff
+#endif
+      {
+        if (NEWTON_BOND || i1 < nlocal) {
+          f[i1].x += f1x;
+          f[i1].y += f1y;
+          f[i1].z += f1z;
+        }
+
+        if (NEWTON_BOND || i2 < nlocal) {
+          f[i2].x += f2x;
+          f[i2].y += f2y;
+          f[i2].z += f2z;
+        }
+
+        if (NEWTON_BOND || i3 < nlocal) {
+          f[i3].x += f3x;
+          f[i3].y += f3y;
+          f[i3].z += f3z;
+        }
+
+        if (NEWTON_BOND || i4 < nlocal) {
+          f[i4].x += f4x;
+          f[i4].y += f4y;
+          f[i4].z += f4z;
+        }
+      }
+
+      if (EFLAG || VFLAG) {
+#ifdef LMP_INTEL_USE_SIMDOFF_FIX
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
+                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
+                              f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
+                              vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND,
+                              nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
+#else
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
+                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
+                              f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
+                              vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND,
+                              nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
+#endif
       }
     } // for n
-  } // omp parallel
-  if (EVFLAG) {
-    if (EFLAG)
-      energy += oeimproper;
-    if (vflag) {
-      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
-    }
-  }
+#ifdef LMP_INTEL_USE_SIMDOFF_FIX
+    if (EFLAG) oeimproper += seimproper;
+    if (VFLAG && vflag) {
+      ov0 += sv0; ov1 += sv1; ov2 += sv2;
+      ov3 += sv3; ov4 += sv4; ov5 += sv5;
+    }
+#endif
+  } // omp parallel
+  if (EFLAG) energy += oeimproper;
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
+  }

   fix->set_reduce_flag();
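The multiplicity branches retained above evaluate 1 + cos(m*phi) as a polynomial in c = cos(phi). The short self-contained check below verifies the m == 3 branch, p(c) = (4c^2 - 3)c + 1, against the trig identity cos(3*phi) = 4cos^3(phi) - 3cos(phi); it is a numerical sanity check, not LAMMPS code.

    // Check the cvff m == 3 polynomial against 1 + cos(3*phi).
    #include <cmath>
    #include <cstdio>

    int main() {
      for (double phi = 0.0; phi < 3.2; phi += 0.8) {
        double c = std::cos(phi);
        double p3 = (4.0*c*c - 3.0)*c + 1.0;   // cvff m == 3 branch
        double ref = 1.0 + std::cos(3.0*phi);   // identity reference
        std::printf("phi=%4.1f  p=%8.5f  1+cos(3phi)=%8.5f\n", phi, p3, ref);
      }
      return 0;
    }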
--- a/src/USER-INTEL/improper_harmonic_intel.cpp
+++ b/src/USER-INTEL/improper_harmonic_intel.cpp
@@ -88,16 +88,16 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag,
   else evflag = 0;

   if (evflag) {
-    if (eflag) {
+    if (vflag && !eflag) {
+      if (force->newton_bond)
+        eval<0,1,1>(vflag, buffers, fc);
+      else
+        eval<0,1,0>(vflag, buffers, fc);
+    } else {
       if (force->newton_bond)
         eval<1,1,1>(vflag, buffers, fc);
       else
         eval<1,1,0>(vflag, buffers, fc);
-    } else {
-      if (force->newton_bond)
-        eval<1,0,1>(vflag, buffers, fc);
-      else
-        eval<1,0,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -109,7 +109,7 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag,

 /* ---------------------------------------------------------------------- */

-template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void ImproperHarmonicIntel::eval(const int vflag,
                                  IntelBuffers<flt_t,acc_t> *buffers,
                                  const ForceConst<flt_t> &fc)
@@ -132,12 +132,9 @@ void ImproperHarmonicIntel::eval(const int vflag,
   const int nthreads = tc;

   acc_t oeimproper, ov0, ov1, ov2, ov3, ov4, ov5;
-  if (EVFLAG) {
-    if (EFLAG)
-      oeimproper = (acc_t)0.0;
-    if (vflag) {
-      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
-    }
-  }
+  if (EFLAG) oeimproper = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+  }

 #if defined(_OPENMP)
@@ -146,8 +143,12 @@ void ImproperHarmonicIntel::eval(const int vflag,
                      reduction(+:oeimproper,ov0,ov1,ov2,ov3,ov4,ov5)
 #endif
   {
-    int nfrom, nto, tid;
+    int nfrom, npl, nto, tid;
+#ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+#else
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+#endif

     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
@@ -156,7 +157,17 @@ void ImproperHarmonicIntel::eval(const int vflag,
     const int5_t * _noalias const improperlist =
       (int5_t *) neighbor->improperlist[0];

+#ifdef LMP_INTEL_USE_SIMDOFF
+    acc_t seimproper, sv0, sv1, sv2, sv3, sv4, sv5;
+    if (EFLAG) seimproper = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
+    }
+#pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
     for (int n = nfrom; n < nto; n++) {
+#else
+    for (int n = nfrom; n < nto; n += npl) {
+#endif
       const int i1 = improperlist[n].a;
       const int i2 = improperlist[n].b;
       const int i3 = improperlist[n].c;
@@ -207,7 +218,7 @@ void ImproperHarmonicIntel::eval(const int vflag,
       flt_t c = (c1*c2 + c0) * s12;

       // error check
+#ifndef LMP_INTEL_USE_SIMDOFF
       if (c > PTOLERANCE || c < MTOLERANCE) {
         int me;
         MPI_Comm_rank(world,&me);
@@ -229,6 +240,7 @@ void ImproperHarmonicIntel::eval(const int vflag,
                   me,x[i4].x,x[i4].y,x[i4].z);
         }
       }
+#endif

       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
@@ -278,46 +290,63 @@ void ImproperHarmonicIntel::eval(const int vflag,

       // apply force to each of 4 atoms

-      if (NEWTON_BOND || i1 < nlocal) {
-        f[i1].x += f1x;
-        f[i1].y += f1y;
-        f[i1].z += f1z;
-      }
-
-      if (NEWTON_BOND || i2 < nlocal) {
-        f[i2].x += f2x;
-        f[i2].y += f2y;
-        f[i2].z += f2z;
-      }
-
-      if (NEWTON_BOND || i3 < nlocal) {
-        f[i3].x += f3x;
-        f[i3].y += f3y;
-        f[i3].z += f3z;
-      }
-
-      if (NEWTON_BOND || i4 < nlocal) {
-        f[i4].x += f4x;
-        f[i4].y += f4y;
-        f[i4].z += f4z;
-      }
-
-      if (EVFLAG) {
-        IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, eimproper, i1, i2, i3, i4,
-                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
-                              vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, vb3x, vb3y,
-                              vb3z, oeimproper, f, NEWTON_BOND, nlocal,
-                              ov0, ov1, ov2, ov3, ov4, ov5);
+#ifdef LMP_INTEL_USE_SIMDOFF
+#pragma simdoff
+#endif
+      {
+        if (NEWTON_BOND || i1 < nlocal) {
+          f[i1].x += f1x;
+          f[i1].y += f1y;
+          f[i1].z += f1z;
+        }
+
+        if (NEWTON_BOND || i2 < nlocal) {
+          f[i2].x += f2x;
+          f[i2].y += f2y;
+          f[i2].z += f2z;
+        }
+
+        if (NEWTON_BOND || i3 < nlocal) {
+          f[i3].x += f3x;
+          f[i3].y += f3y;
+          f[i3].z += f3z;
+        }
+
+        if (NEWTON_BOND || i4 < nlocal) {
+          f[i4].x += f4x;
+          f[i4].y += f4y;
+          f[i4].z += f4z;
+        }
+      }
+
+      if (EFLAG || VFLAG) {
+#ifdef LMP_INTEL_USE_SIMDOFF
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
+                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,
+                              f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,
+                              vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND,
+                              nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
+#else
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
+                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,
+                              f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,
+                              vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND,
+                              nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
+#endif
       }
     } // for n
-  } // omp parallel
-  if (EVFLAG) {
-    if (EFLAG)
-      energy += oeimproper;
-    if (vflag) {
-      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
-    }
-  }
+#ifdef LMP_INTEL_USE_SIMDOFF
+    if (EFLAG) oeimproper += seimproper;
+    if (VFLAG && vflag) {
+      ov0 += sv0; ov1 += sv1; ov2 += sv2;
+      ov3 += sv3; ov4 += sv4; ov5 += sv5;
+    }
+#endif
+  } // omp parallel
+  if (EFLAG) energy += oeimproper;
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
+  }

   fix->set_reduce_flag();
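In both improper styles above, the per-atom force accumulation is fenced with #pragma simdoff so that, inside an otherwise vectorized loop, the scatter-type "+=" updates are applied one entry at a time. The sketch below shows why that matters when different list entries can address the same atom; it is a plain serial illustration with made-up indices, not LAMMPS code.

    // Why the force scatter must not be blindly vectorized (sketch).
    #include <cstdio>

    int main() {
      double f[3] = {0.0, 0.0, 0.0};
      const int idx[4] = {0, 2, 2, 1};            // atom 2 appears twice
      const double df[4] = {1.0, 0.5, 0.5, 2.0};
      for (int n = 0; n < 4; n++)                 // serialized scatter-add
        f[idx[n]] += df[n];
      std::printf("f = %g %g %g\n", f[0], f[1], f[2]);  // prints 1 2 1
      return 0;
    }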
@ -12,6 +12,7 @@
|
|||||||
Contributing author: W. Michael Brown (Intel)
|
Contributing author: W. Michael Brown (Intel)
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
#include <math.h>
|
||||||
#include "intel_buffers.h"
|
#include "intel_buffers.h"
|
||||||
#include "force.h"
|
#include "force.h"
|
||||||
#include "memory.h"
|
#include "memory.h"
|
||||||
@ -28,6 +29,7 @@ IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
|
|||||||
_ntypes = 0;
|
_ntypes = 0;
|
||||||
_off_map_listlocal = 0;
|
_off_map_listlocal = 0;
|
||||||
_ccachex = 0;
|
_ccachex = 0;
|
||||||
|
_ncache_alloc = 0;
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
_separate_buffers = 0;
|
_separate_buffers = 0;
|
||||||
_off_f = 0;
|
_off_f = 0;
|
||||||
@ -36,6 +38,7 @@ IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
|
|||||||
_off_list_alloc = false;
|
_off_list_alloc = false;
|
||||||
_off_threads = 0;
|
_off_threads = 0;
|
||||||
_off_ccache = 0;
|
_off_ccache = 0;
|
||||||
|
_off_ncache = 0;
|
||||||
_host_nmax = 0;
|
_host_nmax = 0;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -111,15 +114,20 @@ void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
 _buf_local_size = _buf_size;
 else
 _buf_local_size = static_cast<double>(nlocal) * 1.1 + 1;
-if (lmp->atom->torque)
-_buf_local_size *= 2;
 const int f_stride = get_stride(_buf_local_size);
 lmp->memory->create(_x, _buf_size,"intel_x");
 if (lmp->atom->q != NULL)
 lmp->memory->create(_q, _buf_size, "intel_q");
 if (lmp->atom->ellipsoid != NULL)
 lmp->memory->create(_quat, _buf_size, "intel_quat");
-lmp->memory->create(_f, f_stride * nthreads, "intel_f");
+#ifdef _LMP_INTEL_OFFLOAD
+if (lmp->force->newton_pair)
+#else
+if (lmp->force->newton_pair || lmp->atom->molecular)
+#endif
+lmp->memory->create(_f, f_stride * nthreads, "intel_f");
+else
+lmp->memory->create(_f, f_stride, "intel_f");

 #ifdef _LMP_INTEL_OFFLOAD
 if (_separate_buffers) {
@ -131,7 +139,10 @@ void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
 }

 if (offload_end > 0) {
-lmp->memory->create(_off_f, f_stride * _off_threads, "intel_off_f");
+int fm;
+if (lmp->force->newton_pair) fm = _off_threads;
+else fm = 1;
+lmp->memory->create(_off_f, f_stride * fm, "intel_off_f");
 const atom_t * const x = get_x();
 const flt_t * const q = get_q();
 const vec3_acc_t * f_start = get_off_f();
@ -140,14 +151,14 @@ void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
 if (x != NULL && q != NULL && f_start != NULL && ev_global != NULL) {
 #pragma offload_transfer target(mic:_cop) \
 nocopy(x,q:length(_buf_size) alloc_if(1) free_if(0)) \
-nocopy(f_start:length(f_stride*_off_threads) alloc_if(1) free_if(0))\
+nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
 nocopy(ev_global:length(8) alloc_if(1) free_if(0))
 }
 } else {
 if (x != NULL && f_start != NULL && ev_global != NULL) {
 #pragma offload_transfer target(mic:_cop) \
 nocopy(x:length(_buf_size) alloc_if(1) free_if(0)) \
-nocopy(f_start:length(f_stride*_off_threads) alloc_if(1) free_if(0))\
+nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
 nocopy(ev_global:length(8) alloc_if(1) free_if(0))
 }
 }
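The force-buffer sizing above allocates one f_stride-sized force copy per thread only when different threads may scatter to the same atoms (newton_pair on an offload build; newton_pair or molecular otherwise), and applies the same rule to the offload buffer through fm. A minimal stand-alone sketch of that decision follows; ForceBufPlan and plan_force_buffers() are illustrative names, not LAMMPS API.

// Sketch of the buffer-sizing rule in the _grow() hunk above.
#include <cstddef>

struct ForceBufPlan {
  std::size_t host_copies;   // f_stride-sized force copies on the host
  std::size_t off_copies;    // copies in the offload force buffer
};

inline ForceBufPlan plan_force_buffers(bool newton_pair, bool molecular,
                                       bool offload_build,
                                       std::size_t nthreads,
                                       std::size_t off_threads) {
  ForceBufPlan p;
  // Per-thread private copies are only needed when different threads can
  // write forces to the same atom: Newton on, or (without offload) bonded
  // styles that scatter to neighbor atoms.
  const bool host_private =
      offload_build ? newton_pair : (newton_pair || molecular);
  p.host_copies = host_private ? nthreads : 1;
  // The offload buffer follows the same rule keyed on newton_pair alone.
  p.off_copies = newton_pair ? off_threads : 1;
  return p;
}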
@ -427,6 +438,115 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,

 /* ---------------------------------------------------------------------- */

+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_ncache()
+{
+if (_ncache_alloc) {
+flt_t *ncachex = _ncachex;
+flt_t *ncachey = _ncachey;
+flt_t *ncachez = _ncachez;
+int *ncachej = _ncachej;
+int *ncachejtype = _ncachejtype;
+
+#ifdef _LMP_INTEL_OFFLOAD
+if (_off_ncache) {
+#pragma offload_transfer target(mic:_cop) \
+nocopy(ncachex,ncachey,ncachez,ncachej:alloc_if(0) free_if(1)) \
+nocopy(ncachejtype:alloc_if(0) free_if(1))
+}
+_off_ncache = 0;
+#endif
+
+lmp->memory->destroy(ncachex);
+lmp->memory->destroy(ncachey);
+lmp->memory->destroy(ncachez);
+lmp->memory->destroy(ncachej);
+lmp->memory->destroy(ncachejtype);
+
+_ncache_alloc = 0;
+}
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
+const int nthreads)
+{
+const int nsize = get_max_nbors() * 3;
+int esize = MIN(sizeof(int), sizeof(flt_t));
+IP_PRE_get_stride(_ncache_stride, nsize, esize, 0);
+int nt = MAX(nthreads, _off_threads);
+const int vsize = _ncache_stride * nt;
+
+if (_ncache_alloc) {
+if (vsize > _ncache_alloc)
+free_ncache();
+#ifdef _LMP_INTEL_OFFLOAD
+else if (off_flag && _off_ncache == 0)
+free_ncache();
+#endif
+else
+return;
+}
+
+lmp->memory->create(_ncachex, vsize, "_ncachex");
+lmp->memory->create(_ncachey, vsize, "_ncachey");
+lmp->memory->create(_ncachez, vsize, "_ncachez");
+lmp->memory->create(_ncachej, vsize, "_ncachej");
+lmp->memory->create(_ncachejtype, vsize, "_ncachejtype");
+
+_ncache_alloc = vsize;
+
+#ifdef _LMP_INTEL_OFFLOAD
+if (off_flag) {
+flt_t *ncachex = _ncachex;
+flt_t *ncachey = _ncachey;
+flt_t *ncachez = _ncachez;
+int *ncachej = _ncachej;
+int *ncachejtype = _ncachejtype;
+
+if (ncachex != NULL && ncachey !=NULL && ncachez != NULL &&
+ncachej != NULL && ncachejtype != NULL) {
+#pragma offload_transfer target(mic:_cop) \
+nocopy(ncachex,ncachey:length(vsize) alloc_if(1) free_if(0)) \
+nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \
+nocopy(ncachejtype:length(vsize) alloc_if(1) free_if(0))
+}
+_off_ncache = 1;
+}
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+#ifndef _LMP_INTEL_OFFLOAD
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt,
+const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1,
+acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5)
+{
+IP_PRE_fdotr_acc_force_l5(lf, lt, 0, nthreads, _f, f_stride, _x, ov0,
+ov1, ov2, ov3, ov4, ov5);
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#ifndef _LMP_INTEL_OFFLOAD
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::fdotr_reduce(const int nall,
+const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1,
+acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5)
+{
+int iifrom, iito, tid;
+IP_PRE_fdotr_acc_force(nall, 0, nthreads, _f, f_stride, _x, 0, 2,
+ov0, ov1, ov2, ov3, ov4, ov5);
+}
+#endif
+
+/* ---------------------------------------------------------------------- */

 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
 {
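grow_ncache() above sizes the new neighbor-cache scratch arrays as one cache-line-aligned stride of get_max_nbors()*3 elements per thread, taking the larger of the host and offload thread counts. A rough stand-alone rendering of that sizing follows; the 64-byte alignment mirrors INTEL_DATA_ALIGN, but the helper names are illustrative only.

// Rough sketch of the sizing performed by grow_ncache() above.
#include <algorithm>
#include <cstddef>

constexpr std::size_t kDataAlign = 64;

// Round an element count up so every per-thread block starts on a 64-byte
// boundary for elements of size esize.
inline std::size_t aligned_stride(std::size_t nelems, std::size_t esize) {
  const std::size_t per_line = kDataAlign / esize;
  return ((nelems + per_line - 1) / per_line) * per_line;
}

inline std::size_t ncache_elements(std::size_t max_nbors, std::size_t esize,
                                   std::size_t nthreads,
                                   std::size_t off_threads) {
  const std::size_t nsize = max_nbors * 3;                 // as in the diff
  const std::size_t stride = aligned_stride(nsize, esize); // _ncache_stride
  return stride * std::max(nthreads, off_threads);         // vsize
}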
@ -78,6 +78,7 @@ class IntelBuffers {
 free_nbor_list();
 free_nmax();
 free_list_local();
+free_ncache();
 }

 inline void grow_list(NeighList *list, const int nlocal, const int nthreads,
@ -106,6 +107,15 @@ class IntelBuffers {
 inline acc_t * get_ccachef() { return _ccachef; }
 #endif

+void free_ncache();
+void grow_ncache(const int off_flag, const int nthreads);
+inline int ncache_stride() { return _ncache_stride; }
+inline flt_t * get_ncachex() { return _ncachex; }
+inline flt_t * get_ncachey() { return _ncachey; }
+inline flt_t * get_ncachez() { return _ncachez; }
+inline int * get_ncachej() { return _ncachej; }
+inline int * get_ncachejtype() { return _ncachejtype; }
+
 inline int get_max_nbors() {
 int mn = lmp->neighbor->oneatom * sizeof(int) /
 (INTEL_ONEATOM_FACTOR * INTEL_DATA_ALIGN);
@ -180,6 +190,15 @@ class IntelBuffers {
 }
 }

+#ifndef _LMP_INTEL_OFFLOAD
+void fdotr_reduce_l5(const int lf, const int lt, const int nthreads,
+const int f_stride, acc_t &ov0, acc_t &ov1,
+acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5);
+void fdotr_reduce(const int nall, const int nthreads, const int f_stride,
+acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3,
+acc_t &ov4, acc_t &ov5);
+#endif
+
 #ifdef _LMP_INTEL_OFFLOAD
 inline void thr_pack_cop(const int ifrom, const int ito,
 const int offset, const bool dotype = false) {
@ -263,6 +282,10 @@ class IntelBuffers {
 int _ccache_stride;
 flt_t *_ccachex, *_ccachey, *_ccachez, *_ccachew;
 int *_ccachei, *_ccachej;
+
+int _ncache_stride, _ncache_alloc;
+flt_t *_ncachex, *_ncachey, *_ncachez;
+int *_ncachej, *_ncachejtype;
 #ifdef LMP_USE_AVXCD
 int _ccache_stride3;
 acc_t * _ccachef;
@ -274,7 +297,7 @@ class IntelBuffers {
 flt_t *_host_q;
 quat_t *_host_quat;
 vec3_acc_t *_off_f;
-int _off_map_nmax, _cop, _off_ccache;
+int _off_map_nmax, _cop, _off_ccache, _off_ncache;
 int *_off_map_ilist;
 int *_off_map_special, *_off_map_nspecial, *_off_map_tag;
 int *_off_map_numneigh;
@ -17,6 +17,9 @@

 #ifdef __INTEL_COMPILER
 #define LMP_SIMD_COMPILER
+#if (__INTEL_COMPILER_BUILD_DATE > 20160720)
+#define LMP_INTEL_USE_SIMDOFF
+#endif
 #endif

 #ifdef __INTEL_OFFLOAD
@ -65,7 +68,10 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 #define INTEL_MAX_STENCIL 256
 // INTEL_MAX_STENCIL * sqrt(INTEL_MAX_STENCIL)
 #define INTEL_MAX_STENCIL_CHECK 4096
-#define INTEL_P3M_MAXORDER 5
+#define INTEL_P3M_MAXORDER 7
+#define INTEL_P3M_ALIGNED_MAXORDER 8
+// PRECOMPUTE VALUES IN TABLE (DOESN'T AFFECT ACCURACY)
+#define INTEL_P3M_TABLE 1

 #ifdef __INTEL_COMPILER
 #ifdef __AVX__
@ -87,24 +93,36 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 #ifdef __MIC__
 #define INTEL_V512 1
 #define INTEL_VMASK 1
+#define INTEL_HTHREADS 4
 #endif
 #endif

+#ifdef __AVX512ER__
+#define INTEL_HTHREADS 4
+#endif
+
 #ifdef __AVX512CD__
 #ifndef _LMP_INTEL_OFFLOAD
 #define LMP_USE_AVXCD
 #endif
 #endif

+#ifdef __MIC__
+#define INTEL_COMPILE_WIDTH INTEL_MIC_VECTOR_WIDTH
+#else
+#define INTEL_COMPILE_WIDTH INTEL_VECTOR_WIDTH
+#endif
+
 #else

 #undef INTEL_VECTOR_WIDTH
 #define INTEL_VECTOR_WIDTH 1
+#define INTEL_COMPILE_WIDTH 1

 #endif

 #define INTEL_DATA_ALIGN 64
-#define INTEL_ONEATOM_FACTOR 2
+#define INTEL_ONEATOM_FACTOR 1
 #define INTEL_MIC_NBOR_PAD INTEL_MIC_VECTOR_WIDTH
 #define INTEL_NBOR_PAD INTEL_VECTOR_WIDTH
 #define INTEL_LB_MEAN_WEIGHT 0.1
@ -112,6 +130,10 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 #define INTEL_MAX_HOST_CORE_COUNT 512
 #define INTEL_MAX_COI_CORES 36

+#ifndef INTEL_HTHREADS
+#define INTEL_HTHREADS 2
+#endif
+
 #define IP_PRE_get_stride(stride, n, datasize, torque) \
 { \
 int blength = n; \
@ -125,9 +147,17 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,

 #define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \
 { \
-const int idelta = 1 + inum/nthreads; \
+int idelta = inum/nthreads; \
+const int imod = inum % nthreads; \
 ifrom = tid * idelta; \
-ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta; \
+ito = ifrom + idelta; \
+if (tid < imod) { \
+ito+=tid+1; \
+ifrom+=tid; \
+} else { \
+ito+=imod; \
+ifrom+=imod; \
+} \
 }

 #define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) \
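The revised IP_PRE_omp_range above replaces the old `1 + inum/nthreads` chunking, which could leave the last thread with little or no work, by an even split that hands the first inum % nthreads threads one extra element. The same arithmetic written as a plain function, so it can be checked in isolation (the function and test below are illustrative, not package code):

#include <cassert>

inline void omp_block_range(int tid, int inum, int nthreads,
                            int &ifrom, int &ito) {
  const int idelta = inum / nthreads;   // base chunk size
  const int imod = inum % nthreads;     // leftover elements
  ifrom = tid * idelta;
  ito = ifrom + idelta;
  if (tid < imod) {                     // first 'imod' threads take one extra
    ifrom += tid;
    ito += tid + 1;
  } else {                              // the rest shift past the extras
    ifrom += imod;
    ito += imod;
  }
}

int main() {
  // Every index in [0, inum) is covered exactly once and chunk sizes differ
  // by at most one element.
  const int inum = 103, nthreads = 8;
  int next = 0;
  for (int t = 0; t < nthreads; t++) {
    int lo, hi;
    omp_block_range(t, inum, nthreads, lo, hi);
    assert(lo == next && hi - lo >= inum / nthreads);
    next = hi;
  }
  assert(next == inum);
  return 0;
}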
@ -136,12 +166,37 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
|||||||
IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads); \
|
IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr) \
|
||||||
|
{ \
|
||||||
|
if (nthr <= INTEL_HTHREADS) { \
|
||||||
|
ifrom = tid; \
|
||||||
|
ito = inum; \
|
||||||
|
ip = nthr; \
|
||||||
|
} else if (nthr % INTEL_HTHREADS == 0) { \
|
||||||
|
int nd = nthr / INTEL_HTHREADS; \
|
||||||
|
int td = tid / INTEL_HTHREADS; \
|
||||||
|
int tm = tid % INTEL_HTHREADS; \
|
||||||
|
IP_PRE_omp_range(ifrom, ito, td, inum, nd); \
|
||||||
|
ifrom += tm; \
|
||||||
|
ip = INTEL_HTHREADS; \
|
||||||
|
} else { \
|
||||||
|
IP_PRE_omp_range(ifrom, ito, tid, inum, nthr); \
|
||||||
|
ip = 1; \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr) \
|
||||||
|
{ \
|
||||||
|
tid = omp_get_thread_num(); \
|
||||||
|
IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr); \
|
||||||
|
}
|
||||||
|
|
||||||
#define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
|
#define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
|
||||||
datasize) \
|
datasize) \
|
||||||
{ \
|
{ \
|
||||||
int chunk_size = INTEL_DATA_ALIGN / datasize; \
|
int chunk_size = INTEL_DATA_ALIGN / datasize; \
|
||||||
int idelta = static_cast<int>(static_cast<float>(inum) \
|
int idelta = static_cast<int>(ceil(static_cast<float>(inum) \
|
||||||
/chunk_size/nthreads) + 1; \
|
/chunk_size/nthreads)); \
|
||||||
idelta *= chunk_size; \
|
idelta *= chunk_size; \
|
||||||
ifrom = tid*idelta; \
|
ifrom = tid*idelta; \
|
||||||
ito = ifrom + idelta; \
|
ito = ifrom + idelta; \
|
||||||
@ -168,6 +223,29 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
|||||||
if (ito > inum) ito = inum; \
|
if (ito > inum) ito = inum; \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \
|
||||||
|
nthr, vecsize) \
|
||||||
|
{ \
|
||||||
|
tid = omp_get_thread_num(); \
|
||||||
|
if (nthr <= INTEL_HTHREADS) { \
|
||||||
|
ifrom = tid*vecsize; \
|
||||||
|
ito = inum; \
|
||||||
|
ip = nthr*vecsize; \
|
||||||
|
} else if (nthr % INTEL_HTHREADS == 0) { \
|
||||||
|
int nd = nthr / INTEL_HTHREADS; \
|
||||||
|
int td = tid / INTEL_HTHREADS; \
|
||||||
|
int tm = tid % INTEL_HTHREADS; \
|
||||||
|
IP_PRE_omp_range_id_vec(ifrom, ito, td, inum, nd, \
|
||||||
|
vecsize); \
|
||||||
|
ifrom += tm * vecsize; \
|
||||||
|
ip = INTEL_HTHREADS * vecsize; \
|
||||||
|
} else { \
|
||||||
|
IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum, nthr, \
|
||||||
|
vecsize); \
|
||||||
|
ip = vecsize; \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \
|
#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \
|
||||||
@ -183,6 +261,21 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
|||||||
ito = inum; \
|
ito = inum; \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define IP_PRE_omp_range(ifrom, ip, ito, tid, inum, nthreads) \
|
||||||
|
{ \
|
||||||
|
ifrom = 0; \
|
||||||
|
ito = inum; \
|
||||||
|
ip = 1; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr) \
|
||||||
|
{ \
|
||||||
|
tid = 0; \
|
||||||
|
ifrom = 0; \
|
||||||
|
ito = inum; \
|
||||||
|
ip = 1; \
|
||||||
|
}
|
||||||
|
|
||||||
#define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
|
#define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
|
||||||
datasize) \
|
datasize) \
|
||||||
{ \
|
{ \
|
||||||
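The IP_PRE_omp_stride and IP_PRE_omp_stride_id macros introduced in the hunk above hand out interleaved indices instead of contiguous blocks: with at most INTEL_HTHREADS threads every thread strides over the full range, and when the thread count is a multiple of INTEL_HTHREADS each core gets a contiguous block whose hardware threads interleave within it. A sketch of that pattern under those assumptions (the names below are mine, not package code); a thread then visits i = ifrom, ifrom + ip, ... while i < ito.

constexpr int kHThreads = 2;   // stand-in for INTEL_HTHREADS (default 2)

struct StridePlan { int ifrom, ito, ip; };

inline StridePlan omp_stride(int tid, int inum, int nthr) {
  StridePlan s;
  if (nthr <= kHThreads) {               // few threads: pure interleaving
    s.ifrom = tid; s.ito = inum; s.ip = nthr;
  } else if (nthr % kHThreads == 0) {    // block per core, stride within it
    const int ncores = nthr / kHThreads;
    const int core = tid / kHThreads;
    const int lane = tid % kHThreads;
    const int idelta = inum / ncores, imod = inum % ncores;
    s.ifrom = core * idelta + (core < imod ? core : imod);
    s.ito   = s.ifrom + idelta + (core < imod ? 1 : 0);
    s.ifrom += lane;                     // hardware threads interleave
    s.ip = kHThreads;
  } else {                               // fallback: plain block partition
    const int idelta = inum / nthr, imod = inum % nthr;
    s.ifrom = tid * idelta + (tid < imod ? tid : imod);
    s.ito   = s.ifrom + idelta + (tid < imod ? 1 : 0);
    s.ip = 1;
  }
  return s;
}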
@ -202,14 +295,215 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
|||||||
nthreads, vecsize) \
|
nthreads, vecsize) \
|
||||||
{ \
|
{ \
|
||||||
tid = 0; \
|
tid = 0; \
|
||||||
int idelta = static_cast<int>(ceil(static_cast<float>(inum) \
|
|
||||||
/vecsize)); \
|
|
||||||
ifrom = 0; \
|
ifrom = 0; \
|
||||||
ito = inum; \
|
ito = inum; \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define IP_PRE_omp_range_id_vec(ifrom, ip, ito, tid, inum, \
|
||||||
|
nthreads, vecsize) \
|
||||||
|
{ \
|
||||||
|
tid = 0; \
|
||||||
|
ifrom = 0; \
|
||||||
|
ito = inum; \
|
||||||
|
ip = vecsize; \
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \
|
||||||
|
f_stride, pos, ov0, ov1, ov2, \
|
||||||
|
ov3, ov4, ov5) \
|
||||||
|
{ \
|
||||||
|
acc_t *f_scalar = &f_start[0].x; \
|
||||||
|
flt_t *x_scalar = &pos[minlocal].x; \
|
||||||
|
int f_stride4 = f_stride * 4; \
|
||||||
|
_alignvar(acc_t ovv[INTEL_COMPILE_WIDTH],64); \
|
||||||
|
int vwidth; \
|
||||||
|
if (sizeof(acc_t) == sizeof(double)) \
|
||||||
|
vwidth = INTEL_COMPILE_WIDTH/2; \
|
||||||
|
else \
|
||||||
|
vwidth = INTEL_COMPILE_WIDTH; \
|
||||||
|
if (vwidth < 4) vwidth = 4; \
|
||||||
|
_use_simd_pragma("vector aligned") \
|
||||||
|
_use_simd_pragma("simd") \
|
||||||
|
for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0; \
|
||||||
|
int remainder = lt % vwidth; \
|
||||||
|
if (lf > lt) remainder = 0; \
|
||||||
|
const int v_range = lt - remainder; \
|
||||||
|
if (nthreads == 2) { \
|
||||||
|
acc_t *f_scalar2 = f_scalar + f_stride4; \
|
||||||
|
for (int n = lf; n < v_range; n += vwidth) { \
|
||||||
|
_use_simd_pragma("vector aligned") \
|
||||||
|
_use_simd_pragma("simd") \
|
||||||
|
for (int v = 0; v < vwidth; v++) { \
|
||||||
|
f_scalar[n+v] += f_scalar2[n+v]; \
|
||||||
|
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
|
||||||
|
} \
|
||||||
|
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
|
||||||
|
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
|
||||||
|
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
|
||||||
|
if (vwidth > 4) { \
|
||||||
|
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
|
||||||
|
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
|
||||||
|
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
|
||||||
|
} \
|
||||||
|
if (vwidth > 8) { \
|
||||||
|
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
|
||||||
|
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
|
||||||
|
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
|
||||||
|
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
|
||||||
|
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
|
||||||
|
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
_use_simd_pragma("vector aligned") \
|
||||||
|
_use_simd_pragma("ivdep") \
|
||||||
|
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
|
||||||
|
for (int n = v_range; n < lt; n++) \
|
||||||
|
f_scalar[n] += f_scalar2[n]; \
|
||||||
|
} else if (nthreads==4) { \
|
||||||
|
acc_t *f_scalar2 = f_scalar + f_stride4; \
|
||||||
|
acc_t *f_scalar3 = f_scalar2 + f_stride4; \
|
||||||
|
acc_t *f_scalar4 = f_scalar3 + f_stride4; \
|
||||||
|
for (int n = lf; n < v_range; n += vwidth) { \
|
||||||
|
_use_simd_pragma("vector aligned") \
|
||||||
|
_use_simd_pragma("simd") \
|
||||||
|
for (int v = 0; v < vwidth; v++) { \
|
||||||
|
f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] + \
|
||||||
|
f_scalar4[n+v]; \
|
||||||
|
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
|
||||||
|
} \
|
||||||
|
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
|
||||||
|
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
|
||||||
|
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
|
||||||
|
if (vwidth > 4) { \
|
||||||
|
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
|
||||||
|
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
|
||||||
|
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
|
||||||
|
} \
|
||||||
|
if (vwidth > 8) { \
|
||||||
|
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
|
||||||
|
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
|
||||||
|
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
|
||||||
|
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
|
||||||
|
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
|
||||||
|
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
_use_simd_pragma("vector aligned") \
|
||||||
|
_use_simd_pragma("ivdep") \
|
||||||
|
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
|
||||||
|
for (int n = v_range; n < lt; n++) \
|
||||||
|
f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; \
|
||||||
|
} else if (nthreads==1) { \
|
||||||
|
for (int n = lf; n < v_range; n += vwidth) { \
|
||||||
|
_use_simd_pragma("vector aligned") \
|
||||||
|
_use_simd_pragma("simd") \
|
||||||
|
for (int v = 0; v < vwidth; v++) \
|
||||||
|
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
|
||||||
|
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
|
||||||
|
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
|
||||||
|
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
|
||||||
|
if (vwidth > 4) { \
|
||||||
|
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
|
||||||
|
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
|
||||||
|
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
|
||||||
|
} \
|
||||||
|
if (vwidth > 8) { \
|
||||||
|
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
|
||||||
|
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
|
||||||
|
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
|
||||||
|
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
|
||||||
|
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
|
||||||
|
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} else if (nthreads==3) { \
|
||||||
|
acc_t *f_scalar2 = f_scalar + f_stride4; \
|
||||||
|
acc_t *f_scalar3 = f_scalar2 + f_stride4; \
|
||||||
|
for (int n = lf; n < v_range; n += vwidth) { \
|
||||||
|
_use_simd_pragma("vector aligned") \
|
||||||
|
_use_simd_pragma("simd") \
|
||||||
|
for (int v = 0; v < vwidth; v++) { \
|
||||||
|
f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v]; \
|
||||||
|
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
|
||||||
|
} \
|
||||||
|
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
|
||||||
|
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
|
||||||
|
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
|
||||||
|
if (vwidth > 4) { \
|
||||||
|
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
|
||||||
|
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
|
||||||
|
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
|
||||||
|
} \
|
||||||
|
if (vwidth > 8) { \
|
||||||
|
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
|
||||||
|
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
|
||||||
|
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
|
||||||
|
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
|
||||||
|
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
|
||||||
|
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
_use_simd_pragma("vector aligned") \
|
||||||
|
_use_simd_pragma("ivdep") \
|
||||||
|
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
|
||||||
|
for (int n = v_range; n < lt; n++) \
|
||||||
|
f_scalar[n] += f_scalar2[n] + f_scalar3[n]; \
|
||||||
|
} \
|
||||||
|
for (int n = v_range; n < lt; n += 4) { \
|
||||||
|
_use_simd_pragma("vector aligned") \
|
||||||
|
_use_simd_pragma("ivdep") \
|
||||||
|
for (int v = 0; v < 4; v++) \
|
||||||
|
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
|
||||||
|
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
|
||||||
|
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
|
||||||
|
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
|
||||||
|
} \
|
||||||
|
ov0 += ovv[0]; \
|
||||||
|
ov1 += ovv[1]; \
|
||||||
|
ov2 += ovv[2]; \
|
||||||
|
if (vwidth > 4) { \
|
||||||
|
ov0 += ovv[4]; \
|
||||||
|
ov1 += ovv[5]; \
|
||||||
|
ov2 += ovv[6]; \
|
||||||
|
} \
|
||||||
|
if (vwidth > 8) { \
|
||||||
|
ov0 += ovv[8] + ovv[12]; \
|
||||||
|
ov1 += ovv[9] + ovv[13]; \
|
||||||
|
ov2 += ovv[10] + ovv[14]; \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \
|
||||||
|
f_stride, pos, offload, vflag, ov0, ov1, \
|
||||||
|
ov2, ov3, ov4, ov5) \
|
||||||
|
{ \
|
||||||
|
int o_range = (nall - minlocal) * 4; \
|
||||||
|
IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads, \
|
||||||
|
sizeof(acc_t)); \
|
||||||
|
\
|
||||||
|
acc_t *f_scalar = &f_start[0].x; \
|
||||||
|
int f_stride4 = f_stride * 4; \
|
||||||
|
int t; \
|
||||||
|
if (vflag == 2) t = 4; else t = 1; \
|
||||||
|
acc_t *f_scalar2 = f_scalar + f_stride4 * t; \
|
||||||
|
for ( ; t < nthreads; t++) { \
|
||||||
|
_use_simd_pragma("vector aligned") \
|
||||||
|
_use_simd_pragma("simd") \
|
||||||
|
for (int n = iifrom; n < iito; n++) \
|
||||||
|
f_scalar[n] += f_scalar2[n]; \
|
||||||
|
f_scalar2 += f_stride4; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
if (vflag == 2) { \
|
||||||
|
int nt_min = MIN(4,nthreads); \
|
||||||
|
IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start, \
|
||||||
|
f_stride, pos, ov0, ov1, ov2, ov3, ov4, \
|
||||||
|
ov5); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
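The IP_PRE_fdotr_acc_force_l5 and IP_PRE_fdotr_acc_force macros above fold the per-thread force copies back into the first copy and, when the virial is requested, tally its six components from the reduced forces and the positions. A scalar reference version of that reduction (types and names are illustrative; the macros above do the same work with aligned SIMD loops):

#include <cstddef>

struct Vec4 { double x, y, z, w; };   // force components plus per-atom energy

inline void fdotr_reduce_ref(Vec4 *f, const Vec4 *x, std::size_t nall,
                             std::size_t nthreads, std::size_t f_stride,
                             double v[6]) {
  for (std::size_t i = 0; i < nall; i++) {
    for (std::size_t t = 1; t < nthreads; t++) {   // sum private copies
      f[i].x += f[i + t * f_stride].x;
      f[i].y += f[i + t * f_stride].y;
      f[i].z += f[i + t * f_stride].z;
    }
    v[0] += f[i].x * x[i].x;   // xx
    v[1] += f[i].y * x[i].y;   // yy
    v[2] += f[i].z * x[i].z;   // zz
    v[3] += f[i].y * x[i].x;   // xy
    v[4] += f[i].z * x[i].x;   // xz
    v[5] += f[i].z * x[i].y;   // yz
  }
}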
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
#include <sys/time.h>
|
#include <sys/time.h>
|
||||||
|
|
||||||
@ -229,17 +523,19 @@ inline double MIC_Wtime() {
|
|||||||
if (fix->separate_buffers() && ago != 0) { \
|
if (fix->separate_buffers() && ago != 0) { \
|
||||||
fix->start_watch(TIME_PACK); \
|
fix->start_watch(TIME_PACK); \
|
||||||
if (offload) { \
|
if (offload) { \
|
||||||
_use_omp_pragma("omp parallel default(none) shared(buffers,nlocal,nall)") \
|
int packthreads; \
|
||||||
|
if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads;\
|
||||||
|
else packthreads = 1; \
|
||||||
|
_use_omp_pragma("omp parallel if(packthreads > 1)") \
|
||||||
{ \
|
{ \
|
||||||
int ifrom, ito, tid; \
|
int ifrom, ito, tid; \
|
||||||
int nthreads = comm->nthreads; \
|
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal, \
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal, \
|
||||||
nthreads, sizeof(flt_t)); \
|
packthreads, sizeof(flt_t)); \
|
||||||
buffers->thr_pack_cop(ifrom, ito, 0); \
|
buffers->thr_pack_cop(ifrom, ito, 0); \
|
||||||
int nghost = nall - nlocal; \
|
int nghost = nall - nlocal; \
|
||||||
if (nghost) { \
|
if (nghost) { \
|
||||||
IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal, \
|
IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal, \
|
||||||
nthreads, sizeof(flt_t)); \
|
packthreads, sizeof(flt_t)); \
|
||||||
buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal, \
|
buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal, \
|
||||||
fix->offload_min_ghost() - nlocal, \
|
fix->offload_min_ghost() - nlocal, \
|
||||||
ago == 1); \
|
ago == 1); \
|
||||||
@ -254,7 +550,7 @@ inline double MIC_Wtime() {
|
|||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define IP_PRE_get_transfern(ago, newton, evflag, eflag, vflag, \
|
#define IP_PRE_get_transfern(ago, newton, eflag, vflag, \
|
||||||
buffers, offload, fix, separate_flag, \
|
buffers, offload, fix, separate_flag, \
|
||||||
x_size, q_size, ev_size, f_stride) \
|
x_size, q_size, ev_size, f_stride) \
|
||||||
{ \
|
{ \
|
||||||
@ -276,17 +572,12 @@ inline double MIC_Wtime() {
|
|||||||
q_size = 0; \
|
q_size = 0; \
|
||||||
} \
|
} \
|
||||||
ev_size = 0; \
|
ev_size = 0; \
|
||||||
if (evflag) { \
|
if (eflag) ev_size = 2; \
|
||||||
if (eflag) ev_size = 2; \
|
if (vflag) ev_size = 8; \
|
||||||
if (vflag) ev_size = 8; \
|
|
||||||
} \
|
|
||||||
int f_length; \
|
|
||||||
if (newton) \
|
if (newton) \
|
||||||
f_length = nall; \
|
f_stride = buffers->get_stride(nall); \
|
||||||
else \
|
else \
|
||||||
f_length = nlocal; \
|
f_stride = buffers->get_stride(inum); \
|
||||||
f_length -= minlocal; \
|
|
||||||
f_stride = buffers->get_stride(f_length); \
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \
|
#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \
|
||||||
@ -337,6 +628,20 @@ inline double MIC_Wtime() {
|
|||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads, \
|
||||||
|
f_start, f_stride, x, offload, vflag, \
|
||||||
|
ov0, ov1, ov2, ov3, ov4, ov5) \
|
||||||
|
{ \
|
||||||
|
if (newton) { \
|
||||||
|
_use_omp_pragma("omp barrier"); \
|
||||||
|
IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \
|
||||||
|
f_stride, x, offload, vflag, ov0, ov1, ov2, \
|
||||||
|
ov3, ov4, ov5); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag, \
|
||||||
|
ov0, ov1, ov2, ov3, ov4, ov5)
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
@ -344,7 +649,7 @@ inline double MIC_Wtime() {
|
|||||||
#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, \
|
#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, \
|
||||||
nlocal, nall)
|
nlocal, nall)
|
||||||
|
|
||||||
#define IP_PRE_get_transfern(ago, newton, evflag, eflag, vflag, \
|
#define IP_PRE_get_transfern(ago, newton, eflag, vflag, \
|
||||||
buffers, offload, fix, separate_flag, \
|
buffers, offload, fix, separate_flag, \
|
||||||
x_size, q_size, ev_size, f_stride) \
|
x_size, q_size, ev_size, f_stride) \
|
||||||
{ \
|
{ \
|
||||||
@ -369,18 +674,54 @@ inline double MIC_Wtime() {
|
|||||||
#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall, \
|
#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall, \
|
||||||
f_stride, x, q)
|
f_stride, x, q)
|
||||||
|
|
||||||
|
#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads, \
|
||||||
|
f_start, f_stride, x, offload, vflag, \
|
||||||
|
ov0, ov1, ov2, ov3, ov4, ov5) \
|
||||||
|
{ \
|
||||||
|
if (newton) { \
|
||||||
|
if (vflag == 2 && nthreads > INTEL_HTHREADS) { \
|
||||||
|
_use_omp_pragma("omp barrier"); \
|
||||||
|
buffers->fdotr_reduce(nall, nthreads, f_stride, ov0, ov1, ov2, \
|
||||||
|
ov3, ov4, ov5); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag, \
|
||||||
|
ov0, ov1, ov2, ov3, ov4, ov5) \
|
||||||
|
{ \
|
||||||
|
if (newton) { \
|
||||||
|
if (vflag == 2 && nthreads <= INTEL_HTHREADS) { \
|
||||||
|
int lt = nall * 4; \
|
||||||
|
buffers->fdotr_reduce_l5(0, lt, nthreads, f_stride, ov0, ov1, \
|
||||||
|
ov2, ov3, ov4, ov5); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz) \
|
#define IP_PRE_ev_tally_nbor(vflag, fpair, delx, dely, delz) \
|
||||||
{ \
|
{ \
|
||||||
if (vflag == 1) { \
|
if (vflag == 1) { \
|
||||||
sv0 += ev_pre * delx * delx * fpair; \
|
sv0 += delx * delx * fpair; \
|
||||||
sv1 += ev_pre * dely * dely * fpair; \
|
sv1 += dely * dely * fpair; \
|
||||||
sv2 += ev_pre * delz * delz * fpair; \
|
sv2 += delz * delz * fpair; \
|
||||||
sv3 += ev_pre * delx * dely * fpair; \
|
sv3 += delx * dely * fpair; \
|
||||||
sv4 += ev_pre * delx * delz * fpair; \
|
sv4 += delx * delz * fpair; \
|
||||||
sv5 += ev_pre * dely * delz * fpair; \
|
sv5 += dely * delz * fpair; \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define IP_PRE_ev_tally_nborv(vflag, dx, dy, dz, fpx, fpy, fpz) \
|
||||||
|
{ \
|
||||||
|
if (vflag == 1) { \
|
||||||
|
sv0 += dx * fpx; \
|
||||||
|
sv1 += dy * fpy; \
|
||||||
|
sv2 += dz * fpz; \
|
||||||
|
sv3 += dx * fpy; \
|
||||||
|
sv4 += dx * fpz; \
|
||||||
|
sv5 += dy * fpz; \
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
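IP_PRE_ev_tally_nborv, added just above, accumulates the per-pair virial directly from the separation vector and the per-direction force components. Written out as a small helper (illustrative only), the six accumulators map to the xx, yy, zz, xy, xz, yz terms:

// Illustrative helper mirroring IP_PRE_ev_tally_nborv.
template <typename flt_t>
inline void tally_pair_virial(int vflag, flt_t dx, flt_t dy, flt_t dz,
                              flt_t fpx, flt_t fpy, flt_t fpz, flt_t sv[6]) {
  if (vflag == 1) {
    sv[0] += dx * fpx;
    sv[1] += dy * fpy;
    sv[2] += dz * fpz;
    sv[3] += dx * fpy;
    sv[4] += dx * fpz;
    sv[5] += dy * fpz;
  }
}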
|
|
||||||
@ -408,9 +749,10 @@ inline double MIC_Wtime() {
|
|||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define IP_PRE_ev_tally_bond(eflag, eatom, vflag, ebond, i1, i2, fbond, \
|
#define IP_PRE_ev_tally_bond(eflag, VFLAG, eatom, vflag, ebond, i1, i2, \
|
||||||
delx, dely, delz, obond, force, newton, \
|
fbond, delx, dely, delz, obond, force, \
|
||||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5) \
|
newton, nlocal, ov0, ov1, ov2, ov3, ov4, \
|
||||||
|
ov5) \
|
||||||
{ \
|
{ \
|
||||||
flt_t ev_pre; \
|
flt_t ev_pre; \
|
||||||
if (newton) ev_pre = (flt_t)1.0; \
|
if (newton) ev_pre = (flt_t)1.0; \
|
||||||
@ -421,7 +763,7 @@ inline double MIC_Wtime() {
|
|||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
if (eflag) { \
|
if (eflag) { \
|
||||||
oebond += ev_pre * ebond; \
|
obond += ev_pre * ebond; \
|
||||||
if (eatom) { \
|
if (eatom) { \
|
||||||
flt_t halfeng = ebond * (flt_t)0.5; \
|
flt_t halfeng = ebond * (flt_t)0.5; \
|
||||||
if (newton || i1 < nlocal) f[i1].w += halfeng; \
|
if (newton || i1 < nlocal) f[i1].w += halfeng; \
|
||||||
@ -429,7 +771,7 @@ inline double MIC_Wtime() {
|
|||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
if (vflag) { \
|
if (VFLAG && vflag) { \
|
||||||
ov0 += ev_pre * (delx * delx * fbond); \
|
ov0 += ev_pre * (delx * delx * fbond); \
|
||||||
ov1 += ev_pre * (dely * dely * fbond); \
|
ov1 += ev_pre * (dely * dely * fbond); \
|
||||||
ov2 += ev_pre * (delz * delz * fbond); \
|
ov2 += ev_pre * (delz * delz * fbond); \
|
||||||
@ -439,9 +781,9 @@ inline double MIC_Wtime() {
|
|||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define IP_PRE_ev_tally_angle(eflag, eatom, vflag, eangle, i1, i2, i3, \
|
#define IP_PRE_ev_tally_angle(eflag, VFLAG, eatom, vflag, eangle, i1, \
|
||||||
f1x, f1y, f1z, f3x, f3y, f3z, delx1, \
|
i2, i3, f1x, f1y, f1z, f3x, f3y, f3z, \
|
||||||
dely1, delz1, delx2, dely2, delz2, \
|
delx1, dely1, delz1, delx2, dely2, delz2, \
|
||||||
oeangle, force, newton, nlocal, ov0, ov1, \
|
oeangle, force, newton, nlocal, ov0, ov1, \
|
||||||
ov2, ov3, ov4, ov5) \
|
ov2, ov3, ov4, ov5) \
|
||||||
{ \
|
{ \
|
||||||
@ -464,20 +806,20 @@ inline double MIC_Wtime() {
|
|||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
if (vflag) { \
|
if (VFLAG && vflag) { \
|
||||||
ov0 += ev_pre * (delx1 * f1x + delx2 * f3x); \
|
ov0 += ev_pre * (delx1 * f1x + delx2 * f3x); \
|
||||||
ov1 += ev_pre * (dely1 * f1y + dely2 * f3y); \
|
ov1 += ev_pre * (dely1 * f1y + dely2 * f3y); \
|
||||||
ov2 += ev_pre * (delz1 * f1z + delz2 * f3z); \
|
ov2 += ev_pre * (delz1 * f1z + delz2 * f3z); \
|
||||||
ov3 += ev_pre * (delx1 * f1y + delx2 * f3y); \
|
ov3 += ev_pre * (delx1 * f1y + delx2 * f3y); \
|
||||||
ov4 += ev_pre * (delx1 * f1z + delx2 * f3z); \
|
ov4 += ev_pre * (delx1 * f1z + delx2 * f3z); \
|
||||||
ov5 += ev_pre * (dely1 * f1z + dely2 * f3z); \
|
ov5 += ev_pre * (dely1 * f1z + dely2 * f3z); \
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define IP_PRE_ev_tally_dihed(eflag, eatom, vflag, deng, i1, i2, i3, i4,\
|
#define IP_PRE_ev_tally_dihed(eflag, VFLAG, eatom, vflag, deng, i1, i2, \
|
||||||
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, \
|
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,\
|
||||||
f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, \
|
f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, \
|
||||||
vb3x, vb3y, vb3z,oedihedral, force, \
|
vb2z, vb3x, vb3y, vb3z, oedihedral, force,\
|
||||||
newton, nlocal, ov0, ov1, ov2, ov3, ov4, \
|
newton, nlocal, ov0, ov1, ov2, ov3, ov4, \
|
||||||
ov5) \
|
ov5) \
|
||||||
{ \
|
{ \
|
||||||
@ -502,7 +844,7 @@ inline double MIC_Wtime() {
|
|||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
if (vflag) { \
|
if (VFLAG && vflag) { \
|
||||||
ov0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x); \
|
ov0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x); \
|
||||||
ov1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y); \
|
ov1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y); \
|
||||||
ov2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z); \
|
ov2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z); \
|
||||||
@ -512,96 +854,36 @@ inline double MIC_Wtime() {
|
|||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define IP_PRE_ev_tally_atom(evflag, eflag, vflag, f, fwtmp) \
|
#define IP_PRE_ev_tally_atom(newton, eflag, vflag, f, fwtmp) \
|
||||||
{ \
|
{ \
|
||||||
if (evflag) { \
|
if (eflag) { \
|
||||||
if (eflag) { \
|
f[i].w += fwtmp; \
|
||||||
f[i].w += fwtmp; \
|
oevdwl += sevdwl; \
|
||||||
oevdwl += sevdwl; \
|
} \
|
||||||
} \
|
if (newton == 0 && vflag == 1) { \
|
||||||
if (vflag == 1) { \
|
ov0 += sv0; \
|
||||||
ov0 += sv0; \
|
ov1 += sv1; \
|
||||||
ov1 += sv1; \
|
ov2 += sv2; \
|
||||||
ov2 += sv2; \
|
ov3 += sv3; \
|
||||||
ov3 += sv3; \
|
ov4 += sv4; \
|
||||||
ov4 += sv4; \
|
ov5 += sv5; \
|
||||||
ov5 += sv5; \
|
|
||||||
} \
|
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define IP_PRE_ev_tally_atomq(evflag, eflag, vflag, f, fwtmp) \
|
#define IP_PRE_ev_tally_atomq(newton, eflag, vflag, f, fwtmp) \
|
||||||
{ \
|
{ \
|
||||||
if (evflag) { \
|
if (eflag) { \
|
||||||
if (eflag) { \
|
f[i].w += fwtmp; \
|
||||||
f[i].w += fwtmp; \
|
oevdwl += sevdwl; \
|
||||||
oevdwl += sevdwl; \
|
oecoul += secoul; \
|
||||||
oecoul += secoul; \
|
|
||||||
} \
|
|
||||||
if (vflag == 1) { \
|
|
||||||
ov0 += sv0; \
|
|
||||||
ov1 += sv1; \
|
|
||||||
ov2 += sv2; \
|
|
||||||
ov3 += sv3; \
|
|
||||||
ov4 += sv4; \
|
|
||||||
ov5 += sv5; \
|
|
||||||
} \
|
|
||||||
} \
|
} \
|
||||||
}
|
if (newton == 0 && vflag == 1) { \
|
||||||
|
ov0 += sv0; \
|
||||||
#define IP_PRE_fdotr_acc_force(newton, evflag, eflag, vflag, eatom, \
|
ov1 += sv1; \
|
||||||
nall, nlocal, minlocal, nthreads, \
|
ov2 += sv2; \
|
||||||
f_start, f_stride, x, offload) \
|
ov3 += sv3; \
|
||||||
{ \
|
ov4 += sv4; \
|
||||||
int o_range; \
|
ov5 += sv5; \
|
||||||
if (newton) \
|
|
||||||
o_range = nall; \
|
|
||||||
else \
|
|
||||||
o_range = nlocal; \
|
|
||||||
if (offload == 0) o_range -= minlocal; \
|
|
||||||
IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, \
|
|
||||||
sizeof(acc_t)); \
|
|
||||||
\
|
|
||||||
int t_off = f_stride; \
|
|
||||||
if (eflag && eatom) { \
|
|
||||||
for (int t = 1; t < nthreads; t++) { \
|
|
||||||
_use_simd_pragma("vector nontemporal") \
|
|
||||||
_use_simd_pragma("novector") \
|
|
||||||
for (int n = iifrom; n < iito; n++) { \
|
|
||||||
f_start[n].x += f_start[n + t_off].x; \
|
|
||||||
f_start[n].y += f_start[n + t_off].y; \
|
|
||||||
f_start[n].z += f_start[n + t_off].z; \
|
|
||||||
f_start[n].w += f_start[n + t_off].w; \
|
|
||||||
} \
|
|
||||||
t_off += f_stride; \
|
|
||||||
} \
|
|
||||||
} else { \
|
|
||||||
for (int t = 1; t < nthreads; t++) { \
|
|
||||||
_use_simd_pragma("vector nontemporal") \
|
|
||||||
_use_simd_pragma("novector") \
|
|
||||||
for (int n = iifrom; n < iito; n++) { \
|
|
||||||
f_start[n].x += f_start[n + t_off].x; \
|
|
||||||
f_start[n].y += f_start[n + t_off].y; \
|
|
||||||
f_start[n].z += f_start[n + t_off].z; \
|
|
||||||
} \
|
|
||||||
t_off += f_stride; \
|
|
||||||
} \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
if (evflag) { \
|
|
||||||
if (vflag == 2) { \
|
|
||||||
const ATOM_T * _noalias const xo = x + minlocal; \
|
|
||||||
_use_simd_pragma("vector nontemporal") \
|
|
||||||
_use_simd_pragma("novector") \
|
|
||||||
for (int n = iifrom; n < iito; n++) { \
|
|
||||||
ov0 += f_start[n].x * xo[n].x; \
|
|
||||||
ov1 += f_start[n].y * xo[n].y; \
|
|
||||||
ov2 += f_start[n].z * xo[n].z; \
|
|
||||||
ov3 += f_start[n].y * xo[n].x; \
|
|
||||||
ov4 += f_start[n].z * xo[n].x; \
|
|
||||||
ov5 += f_start[n].z * xo[n].y; \
|
|
||||||
} \
|
|
||||||
} \
|
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1778,7 +1778,7 @@ namespace ip_simd {
|
|||||||
inline void SIMD_iforce_update(const SIMD_mask &m, float *force,
|
inline void SIMD_iforce_update(const SIMD_mask &m, float *force,
|
||||||
const SIMD_int &i, const SIMD_float &fx,
|
const SIMD_int &i, const SIMD_float &fx,
|
||||||
const SIMD_float &fy, const SIMD_float &fz,
|
const SIMD_float &fy, const SIMD_float &fz,
|
||||||
const int EVFLAG, const int eatom,
|
const int EFLAG, const int eatom,
|
||||||
const SIMD_float &fwtmp) {
|
const SIMD_float &fwtmp) {
|
||||||
SIMD_float jfrc;
|
SIMD_float jfrc;
|
||||||
jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force,
|
jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force,
|
||||||
@ -1793,7 +1793,7 @@ namespace ip_simd {
|
|||||||
_MM_SCALE_1);
|
_MM_SCALE_1);
|
||||||
jfrc = jfrc + fz;
|
jfrc = jfrc + fz;
|
||||||
_mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1);
|
_mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1);
|
||||||
if (EVFLAG) {
|
if (EFLAG) {
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 3,
|
jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 3,
|
||||||
_MM_SCALE_1);
|
_MM_SCALE_1);
|
||||||
@ -1806,7 +1806,7 @@ namespace ip_simd {
|
|||||||
inline void SIMD_iforce_update(const SIMD_mask &m, double *force,
|
inline void SIMD_iforce_update(const SIMD_mask &m, double *force,
|
||||||
const SIMD_int &i, const SIMD_double &fx,
|
const SIMD_int &i, const SIMD_double &fx,
|
||||||
const SIMD_double &fy, const SIMD_double &fz,
|
const SIMD_double &fy, const SIMD_double &fz,
|
||||||
const int EVFLAG, const int eatom,
|
const int EFLAG, const int eatom,
|
||||||
const SIMD_double &fwtmp) {
|
const SIMD_double &fwtmp) {
|
||||||
SIMD_double jfrc;
|
SIMD_double jfrc;
|
||||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
|
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
|
||||||
@ -1821,7 +1821,7 @@ namespace ip_simd {
|
|||||||
_MM_SCALE_2);
|
_MM_SCALE_2);
|
||||||
jfrc = jfrc + fz;
|
jfrc = jfrc + fz;
|
||||||
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
|
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
|
||||||
if (EVFLAG) {
|
if (EFLAG) {
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i,
|
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i,
|
||||||
force + 3, _MM_SCALE_2);
|
force + 3, _MM_SCALE_2);
|
||||||
|
|||||||
@ -71,7 +71,7 @@ void NBinIntel::bin_atoms_setup(int nall)
|
|||||||
if (_offload_alloc) {
|
if (_offload_alloc) {
|
||||||
const int * binhead = this->binhead;
|
const int * binhead = this->binhead;
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(binhead:alloc_if(0) free_if(1))
|
nocopy(binhead:alloc_if(0) free_if(1))
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -99,7 +99,7 @@ void NBinIntel::bin_atoms_setup(int nall)
|
|||||||
const int * _atombin = this->_atombin;
|
const int * _atombin = this->_atombin;
|
||||||
const int * _binpacked = this->_binpacked;
|
const int * _binpacked = this->_binpacked;
|
||||||
#pragma offload_transfer target(mic:_cop) \
|
#pragma offload_transfer target(mic:_cop) \
|
||||||
nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1))
|
nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1))
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
memory->destroy(bins);
|
memory->destroy(bins);
|
||||||
@ -158,9 +158,9 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
|
|||||||
const flt_t dy = (INTEL_BIGP - bboxhi[1]);
|
const flt_t dy = (INTEL_BIGP - bboxhi[1]);
|
||||||
const flt_t dz = (INTEL_BIGP - bboxhi[2]);
|
const flt_t dz = (INTEL_BIGP - bboxhi[2]);
|
||||||
if (dx * dx + dy * dy + dz * dz <
|
if (dx * dx + dy * dy + dz * dz <
|
||||||
static_cast<flt_t>(neighbor->cutneighmaxsq))
|
static_cast<flt_t>(neighbor->cutneighmaxsq))
|
||||||
error->one(FLERR,
|
error->one(FLERR,
|
||||||
"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
|
"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------- Grow and cast/pack buffers -------------
|
// ---------- Grow and cast/pack buffers -------------
|
||||||
@ -174,14 +174,16 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
|
|||||||
biga.w = 1;
|
biga.w = 1;
|
||||||
buffers->get_x()[nall] = biga;
|
buffers->get_x()[nall] = biga;
|
||||||
|
|
||||||
const int nthreads = comm->nthreads;
|
int nthreads;
|
||||||
|
if (comm->nthreads > INTEL_HTHREADS) nthreads = comm->nthreads;
|
||||||
|
else nthreads = 1;
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) shared(buffers)
|
#pragma omp parallel if(nthreads > INTEL_HTHREADS)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
|
||||||
sizeof(ATOM_T));
|
sizeof(ATOM_T));
|
||||||
buffers->thr_pack(ifrom, ito, 0);
|
buffers->thr_pack(ifrom, ito, 0);
|
||||||
}
|
}
|
||||||
_fix->stop_watch(TIME_PACK);
|
_fix->stop_watch(TIME_PACK);
|
||||||
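The packing change above only spawns an OpenMP team when more threads are available than INTEL_HTHREADS; otherwise the cast/pack loop runs serially and skips the fork/join overhead. A compact sketch of the same pattern in plain OpenMP (pack_atoms() and kHThreads are illustrative, not package code):

#include <omp.h>

constexpr int kHThreads = 2;   // stand-in for INTEL_HTHREADS

template <typename PackFn>
void pack_atoms(int nall, int avail_threads, PackFn pack) {
  const int nthreads = (avail_threads > kHThreads) ? avail_threads : 1;
  #pragma omp parallel if(nthreads > kHThreads) num_threads(nthreads)
  {
    const int tid = omp_get_thread_num();
    const int nt = omp_get_num_threads();
    // even block split of [0, nall) across the team (one block if serial)
    const int chunk = nall / nt, rem = nall % nt;
    const int from = tid * chunk + (tid < rem ? tid : rem);
    const int to = from + chunk + (tid < rem ? 1 : 0);
    pack(from, to);   // e.g. buffers->thr_pack(from, to, 0) in the package
  }
}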
|
|||||||
@ -70,483 +70,62 @@ fbi(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end,
|
buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end,
|
||||||
_fix->nbor_pack_width());
|
_fix->nbor_pack_width());
|
||||||
|
|
||||||
int need_ic = 0;
|
int need_ic = 0;
|
||||||
if (atom->molecular)
|
if (atom->molecular)
|
||||||
dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
|
dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
|
||||||
neighbor->cutneighmax);
|
neighbor->cutneighmax);
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
if (need_ic) {
|
if (_fix->three_body_neighbor()) {
|
||||||
if (offload_noghost) {
|
if (need_ic) {
|
||||||
fbi<flt_t,acc_t,1,1>(1, list, buffers, 0, off_end);
|
if (offload_noghost) {
|
||||||
fbi<flt_t,acc_t,1,1>(0, list, buffers, host_start, nlocal, off_end);
|
bin_newton<flt_t,acc_t,1,1,1,0,1>(1, list, buffers, 0, off_end);
|
||||||
|
bin_newton<flt_t,acc_t,1,1,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
|
||||||
|
} else {
|
||||||
|
bin_newton<flt_t,acc_t,0,1,1,0,1>(1, list, buffers, 0, off_end);
|
||||||
|
bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
fbi<flt_t,acc_t,0,1>(1, list, buffers, 0, off_end);
|
if (offload_noghost) {
|
||||||
fbi<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
|
bin_newton<flt_t,acc_t,1,0,1,0,1>(1, list, buffers, 0, off_end);
|
||||||
|
bin_newton<flt_t,acc_t,1,0,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
|
||||||
|
} else {
|
||||||
|
bin_newton<flt_t,acc_t,0,0,1,0,1>(1, list, buffers, 0, off_end);
|
||||||
|
bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (offload_noghost) {
|
if (need_ic) {
|
||||||
fbi<flt_t,acc_t,1,0>(1, list, buffers, 0, off_end);
|
if (offload_noghost) {
|
||||||
fbi<flt_t,acc_t,1,0>(0, list, buffers, host_start, nlocal, off_end);
|
bin_newton<flt_t,acc_t,1,1,1,0,0>(1, list, buffers, 0, off_end);
|
||||||
|
bin_newton<flt_t,acc_t,1,1,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
|
||||||
|
} else {
|
||||||
|
bin_newton<flt_t,acc_t,0,1,1,0,0>(1, list, buffers, 0, off_end);
|
||||||
|
bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
fbi<flt_t,acc_t,0,0>(1, list, buffers, 0, off_end);
|
if (offload_noghost) {
|
||||||
fbi<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
|
bin_newton<flt_t,acc_t,1,0,1,0,0>(1, list, buffers, 0, off_end);
|
||||||
|
bin_newton<flt_t,acc_t,1,0,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
|
||||||
|
} else {
|
||||||
|
bin_newton<flt_t,acc_t,0,0,1,0,0>(1, list, buffers, 0, off_end);
|
||||||
|
bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
if (need_ic)
|
if (_fix->three_body_neighbor()) {
|
||||||
fbi<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
|
if (need_ic)
|
||||||
else
|
bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal);
|
||||||
fbi<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
|
else
|
||||||
#endif
|
bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal);
|
||||||
}
|
} else {
|
||||||
|
if (need_ic)
|
||||||
template <class flt_t, class acc_t, int offload_noghost, int need_ic>
|
bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal);
|
||||||
void NPairFullBinIntel::
|
else
|
||||||
fbi(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers,
|
bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal);
|
||||||
const int astart, const int aend, const int offload_end) {
|
}
|
||||||
|
|
||||||
if (aend-astart == 0) return;
|
|
||||||
|
|
||||||
const int nall = atom->nlocal + atom->nghost;
|
|
||||||
int pad = 1;
|
|
||||||
int nall_t = nall;
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload_noghost && offload) nall_t = atom->nlocal;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
const int pack_width = _fix->nbor_pack_width();
|
|
||||||
const int pad_width = pad;
|
|
||||||
|
|
||||||
const ATOM_T * _noalias const x = buffers->get_x();
|
|
||||||
int * _noalias const firstneigh = buffers->firstneigh(list);
|
|
||||||
const int e_nall = nall_t;
|
|
||||||
|
|
||||||
const int molecular = atom->molecular;
|
|
||||||
int *ns = NULL;
|
|
||||||
tagint *s = NULL;
|
|
||||||
int tag_size = 0, special_size;
|
|
||||||
if (buffers->need_tag()) tag_size = e_nall;
|
|
||||||
if (molecular) {
|
|
||||||
s = atom->special[0];
|
|
||||||
ns = atom->nspecial[0];
|
|
||||||
special_size = aend;
|
|
||||||
} else {
|
|
||||||
s = &buffers->_special_holder;
|
|
||||||
ns = &buffers->_nspecial_holder;
|
|
||||||
special_size = 0;
|
|
||||||
}
|
|
||||||
const tagint * _noalias const special = s;
|
|
||||||
const int * _noalias const nspecial = ns;
|
|
||||||
const int maxspecial = atom->maxspecial;
|
|
||||||
const tagint * _noalias const tag = atom->tag;
|
|
||||||
|
|
||||||
int * _noalias const ilist = list->ilist;
|
|
||||||
int * _noalias numneigh = list->numneigh;
|
|
||||||
int * _noalias const cnumneigh = buffers->cnumneigh(list);
|
|
||||||
const int nstencil = this->nstencil;
|
|
||||||
const int * _noalias const stencil = this->stencil;
|
|
||||||
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
|
|
||||||
const int ntypes = atom->ntypes + 1;
|
|
||||||
const int nlocal = atom->nlocal;
|
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
|
||||||
int * const mask = atom->mask;
|
|
||||||
tagint * const molecule = atom->molecule;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int tnum;
|
|
||||||
int *overflow;
|
|
||||||
double *timer_compute;
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload) {
|
|
||||||
timer_compute = _fix->off_watch_neighbor();
|
|
||||||
tnum = buffers->get_off_threads();
|
|
||||||
overflow = _fix->get_off_overflow_flag();
|
|
||||||
_fix->stop_watch(TIME_HOST_NEIGHBOR);
|
|
||||||
_fix->start_watch(TIME_OFFLOAD_LATENCY);
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
tnum = comm->nthreads;
|
|
||||||
overflow = _fix->get_overflow_flag();
|
|
||||||
}
|
|
||||||
const int nthreads = tnum;
|
|
||||||
const int maxnbors = buffers->get_max_nbors();
|
|
||||||
int * _noalias const atombin = buffers->get_atombin();
|
|
||||||
const int * _noalias const binpacked = buffers->get_binpacked();
|
|
||||||
|
|
||||||
const int xperiodic = domain->xperiodic;
|
|
||||||
const int yperiodic = domain->yperiodic;
|
|
||||||
const int zperiodic = domain->zperiodic;
|
|
||||||
const flt_t xprd_half = domain->xprd_half;
|
|
||||||
const flt_t yprd_half = domain->yprd_half;
|
|
||||||
const flt_t zprd_half = domain->zprd_half;
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
const int * _noalias const binhead = this->binhead;
|
|
||||||
const int * _noalias const bins = this->bins;
|
|
||||||
const int cop = _fix->coprocessor_number();
|
|
||||||
const int separate_buffers = _fix->separate_buffers();
|
|
||||||
#pragma offload target(mic:cop) if(offload) \
|
|
||||||
in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
|
|
||||||
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
|
|
||||||
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
|
|
||||||
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
|
|
||||||
in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
|
|
||||||
in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
|
|
||||||
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
out(numneigh:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
in(ilist:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
in(atombin:length(aend) alloc_if(0) free_if(0)) \
|
|
||||||
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
|
|
||||||
in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pack_width) \
|
|
||||||
in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \
|
|
||||||
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
|
|
||||||
out(overflow:length(5) alloc_if(0) free_if(0)) \
|
|
||||||
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
|
||||||
signal(tag)
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
|
||||||
*timer_compute = MIC_Wtime();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
overflow[LMP_LOCAL_MIN] = astart;
|
|
||||||
overflow[LMP_LOCAL_MAX] = aend - 1;
|
|
||||||
overflow[LMP_GHOST_MIN] = e_nall;
|
|
||||||
overflow[LMP_GHOST_MAX] = -1;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int nstencilp = 0;
|
|
||||||
int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
|
|
||||||
for (int k = 0; k < nstencil; k++) {
|
|
||||||
binstart[nstencilp] = stencil[k];
|
|
||||||
int end = stencil[k] + 1;
|
|
||||||
for (int kk = k + 1; kk < nstencil; kk++) {
|
|
||||||
if (stencil[kk-1]+1 == stencil[kk]) {
|
|
||||||
end++;
|
|
||||||
k++;
|
|
||||||
} else break;
|
|
||||||
}
|
|
||||||
binend[nstencilp] = end;
|
|
||||||
nstencilp++;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(_OPENMP)
|
|
||||||
#pragma omp parallel default(none) \
|
|
||||||
shared(numneigh, overflow, nstencilp, binstart, binend)
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
const int num = aend - astart;
|
|
||||||
int tid, ifrom, ito;
|
|
||||||
|
|
||||||
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
|
|
||||||
ifrom += astart;
|
|
||||||
ito += astart;
|
|
||||||
int e_ito = ito;
|
|
||||||
if (ito == num) {
|
|
||||||
int imod = ito % pack_width;
|
|
||||||
if (imod) e_ito += pack_width - imod;
|
|
||||||
}
|
|
||||||
const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
|
|
||||||
int which;
|
|
||||||
int pack_offset = maxnbors * pack_width;
|
|
||||||
int ct = (ifrom + tid * 2) * maxnbors;
|
|
||||||
int *neighptr = firstneigh + ct;
|
|
||||||
const int obound = pack_offset + maxnbors * 2;
|
|
||||||
|
|
||||||
int max_chunk = 0;
|
|
||||||
int lane = 0;
|
|
||||||
for (int i = ifrom; i < ito; i++) {
|
|
||||||
const flt_t xtmp = x[i].x;
|
|
||||||
const flt_t ytmp = x[i].y;
|
|
||||||
const flt_t ztmp = x[i].z;
|
|
||||||
const int itype = x[i].w;
|
|
||||||
const tagint itag = tag[i];
|
|
||||||
const int ioffset = ntypes * itype;
|
|
||||||
|
|
||||||
const int ibin = atombin[i];
|
|
||||||
int raw_count = pack_offset;
|
|
||||||
|
|
||||||
// loop over all atoms in surrounding bins in stencil including self
|
|
||||||
// skip i = j
|
|
||||||
if (exclude) {
|
|
||||||
for (int k = 0; k < nstencilp; k++) {
|
|
||||||
const int bstart = binhead[ibin + binstart[k]];
|
|
||||||
const int bend = binhead[ibin + binend[k]];
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
|
||||||
#ifdef INTEL_VMASK
|
|
||||||
#pragma simd
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
for (int jj = bstart; jj < bend; jj++) {
|
|
||||||
int j = binpacked[jj];
|
|
||||||
|
|
||||||
if (i == j) j=e_nall;
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload_noghost) {
|
|
||||||
if (j < nlocal) {
|
|
||||||
if (i < offload_end) continue;
|
|
||||||
} else if (offload) continue;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
|
||||||
const int jtype = x[j].w;
|
|
||||||
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
neighptr[raw_count++] = j;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int k = 0; k < nstencilp; k++) {
|
|
||||||
const int bstart = binhead[ibin + binstart[k]];
|
|
||||||
const int bend = binhead[ibin + binend[k]];
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
|
||||||
#ifdef INTEL_VMASK
|
|
||||||
#pragma simd
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
for (int jj = bstart; jj < bend; jj++) {
|
|
||||||
int j = binpacked[jj];
|
|
||||||
|
|
||||||
if (i == j) j=e_nall;
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload_noghost) {
|
|
||||||
if (j < nlocal) {
|
|
||||||
if (i < offload_end) continue;
|
|
||||||
} else if (offload) continue;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
neighptr[raw_count++] = j;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (raw_count > obound) *overflow = 1;
|
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
|
|
||||||
#if __INTEL_COMPILER+0 > 1499
|
|
||||||
#pragma vector aligned
|
|
||||||
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
|
|
||||||
#endif
|
|
||||||
#else
|
|
||||||
#pragma vector aligned
|
|
||||||
#pragma simd
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
for (int u = pack_offset; u < raw_count; u++) {
|
|
||||||
int j = neighptr[u];
|
|
||||||
const flt_t delx = xtmp - x[j].x;
|
|
||||||
const flt_t dely = ytmp - x[j].y;
|
|
||||||
const flt_t delz = ztmp - x[j].z;
|
|
||||||
const int jtype = x[j].w;
|
|
||||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
|
||||||
if (rsq > cutneighsq[ioffset + jtype])
|
|
||||||
neighptr[u] = e_nall;
|
|
||||||
else {
|
|
||||||
if (need_ic) {
|
|
||||||
int no_special;
|
|
||||||
ominimum_image_check(no_special, delx, dely, delz);
|
|
||||||
if (no_special)
|
|
||||||
neighptr[u] = -j - 1;
|
|
||||||
}
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (j < nlocal) {
|
|
||||||
if (j < vlmin) vlmin = j;
|
|
||||||
if (j > vlmax) vlmax = j;
|
|
||||||
} else {
|
|
||||||
if (j < vgmin) vgmin = j;
|
|
||||||
if (j > vgmax) vgmax = j;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
lmin = MIN(lmin,vlmin);
|
|
||||||
gmin = MIN(gmin,vgmin);
|
|
||||||
lmax = MAX(lmax,vlmax);
|
|
||||||
gmax = MAX(gmax,vgmax);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int n = lane, n2 = pack_offset;
|
|
||||||
for (int u = pack_offset; u < raw_count; u++) {
|
|
||||||
const int j = neighptr[u];
|
|
||||||
int pj = j;
|
|
||||||
if (pj < e_nall) {
|
|
||||||
if (need_ic)
|
|
||||||
if (pj < 0) pj = -pj - 1;
|
|
||||||
|
|
||||||
const int jtag = tag[pj];
|
|
||||||
int flist = 0;
|
|
||||||
if (itag > jtag) {
|
|
||||||
if ((itag+jtag) % 2 == 0) flist = 1;
|
|
||||||
} else if (itag < jtag) {
|
|
||||||
if ((itag+jtag) % 2 == 1) flist = 1;
|
|
||||||
} else {
|
|
||||||
if (x[pj].z < ztmp) flist = 1;
|
|
||||||
else if (x[pj].z == ztmp && x[pj].y < ytmp) flist = 1;
|
|
||||||
else if (x[pj].z == ztmp && x[pj].y == ytmp && x[pj].x < xtmp)
|
|
||||||
flist = 1;
|
|
||||||
}
|
|
||||||
if (flist) {
|
|
||||||
neighptr[n2++] = j;
|
|
||||||
} else {
|
|
||||||
neighptr[n] = j;
|
|
||||||
n += pack_width;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
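// The loop above splits the raw neighbor list in two: entries selected by the
// tag-parity / coordinate test (flist) are collected from pack_offset up to n2
// and appended behind the SIMD-packed block below, while the remaining entries
// are stored with stride pack_width so each vector lane walks one atom's list;
// atombin[i] is then reused to hold the size of the packed front block.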
|
|
||||||
int ns = (n - lane) / pack_width;
|
|
||||||
atombin[i] = ns;
|
|
||||||
for (int u = pack_offset; u < n2; u++) {
|
|
||||||
neighptr[n] = neighptr[u];
|
|
||||||
n += pack_width;
|
|
||||||
}
|
|
||||||
|
|
||||||
ilist[i] = i;
|
|
||||||
cnumneigh[i] = ct + lane;
|
|
||||||
ns += n2 - pack_offset;
|
|
||||||
numneigh[i] = ns;
|
|
||||||
|
|
||||||
if (ns > max_chunk) max_chunk = ns;
|
|
||||||
lane++;
|
|
||||||
if (lane == pack_width) {
|
|
||||||
ct += max_chunk * pack_width;
|
|
||||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
|
||||||
const int edge = (ct % alignb);
|
|
||||||
if (edge) ct += alignb - edge;
|
|
||||||
neighptr = firstneigh + ct;
|
|
||||||
max_chunk = 0;
|
|
||||||
pack_offset = maxnbors * pack_width;
|
|
||||||
lane = 0;
|
|
||||||
if (ct + obound > list_size) {
|
|
||||||
if (i < ito - 1) {
|
|
||||||
*overflow = 1;
|
|
||||||
ct = (ifrom + tid * 2) * maxnbors;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
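// Lane bookkeeping: once pack_width atoms have filled their lanes, ct advances
// past the longest per-lane chunk, is realigned to INTEL_DATA_ALIGN, and the
// overflow flag is raised if the next chunk could run past this thread's slice
// of the firstneigh buffer.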
|
|
||||||
}
|
|
||||||
|
|
||||||
if (*overflow == 1)
|
|
||||||
for (int i = ifrom; i < ito; i++)
|
|
||||||
numneigh[i] = 0;
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (separate_buffers) {
|
|
||||||
#if defined(_OPENMP)
|
|
||||||
#pragma omp critical
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
|
|
||||||
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
|
|
||||||
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
|
|
||||||
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
|
|
||||||
}
|
|
||||||
#pragma omp barrier
|
|
||||||
}
|
|
||||||
|
|
||||||
int ghost_offset = 0, nall_offset = e_nall;
|
|
||||||
if (separate_buffers) {
|
|
||||||
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
|
|
||||||
if (nghost < 0) nghost = 0;
|
|
||||||
if (offload) {
|
|
||||||
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
|
|
||||||
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
|
|
||||||
} else {
|
|
||||||
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
|
|
||||||
nall_offset = nlocal + nghost;
|
|
||||||
}
|
|
||||||
}
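// When host and coprocessor use separate buffers, the ghost atoms touched by
// this list are renumbered into a compact block: ghost_offset shifts their
// indices to follow the local atoms and nall_offset becomes the new
// out-of-range sentinel used when the j-lists are rewritten below.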
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (molecular) {
|
|
||||||
for (int i = ifrom; i < ito; ++i) {
|
|
||||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
|
||||||
const int jnum = numneigh[i];
|
|
||||||
|
|
||||||
const int trip = jnum * pack_width;
|
|
||||||
for (int jj = 0; jj < trip; jj+=pack_width) {
|
|
||||||
const int j = jlist[jj];
|
|
||||||
if (need_ic && j < 0) {
|
|
||||||
which = 0;
|
|
||||||
jlist[jj] = -j - 1;
|
|
||||||
} else
|
|
||||||
ofind_special(which, special, nspecial, i, tag[j]);
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (j >= nlocal) {
|
|
||||||
if (j == e_nall)
|
|
||||||
jlist[jj] = nall_offset;
|
|
||||||
else if (which)
|
|
||||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
|
||||||
else jlist[jj]-=ghost_offset;
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
if (which) jlist[jj] = j ^ (which << SBBITS);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
else if (separate_buffers) {
|
|
||||||
for (int i = ifrom; i < ito; ++i) {
|
|
||||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
|
||||||
const int jnum = numneigh[i];
|
|
||||||
int jj = 0;
|
|
||||||
for (jj = 0; jj < jnum; jj++) {
|
|
||||||
if (jlist[jj] >= nlocal) {
|
|
||||||
if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
|
|
||||||
else jlist[jj] -= ghost_offset;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
} // end omp
|
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
|
||||||
#endif
|
|
||||||
} // end offload
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload) {
|
|
||||||
_fix->stop_watch(TIME_OFFLOAD_LATENCY);
|
|
||||||
_fix->start_watch(TIME_HOST_NEIGHBOR);
|
|
||||||
for (int n = 0; n < aend; n++) {
|
|
||||||
ilist[n] = n;
|
|
||||||
numneigh[n] = 0;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int i = astart; i < aend; i++)
|
|
||||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
|
||||||
if (separate_buffers) {
|
|
||||||
_fix->start_watch(TIME_PACK);
|
|
||||||
_fix->set_neighbor_host_sizes();
|
|
||||||
buffers->pack_sep_from_single(_fix->host_min_local(),
|
|
||||||
_fix->host_used_local(),
|
|
||||||
_fix->host_min_ghost(),
|
|
||||||
_fix->host_used_ghost());
|
|
||||||
_fix->stop_watch(TIME_PACK);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
for (int i = astart; i < aend; i++)
|
|
||||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|||||||
@ -36,9 +36,6 @@ class NPairFullBinIntel : public NPairIntel {
  private:
   template <class flt_t, class acc_t>
   void fbi(NeighList *, IntelBuffers<flt_t,acc_t> *);
-  template <class flt_t, class acc_t, int, int>
-  void fbi(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int,
-           const int, const int offload_end = 0);
 };

 }
@ -1,451 +0,0 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */

#include "npair_half_bin_newtoff_intel.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "atom.h"
#include "comm.h"
#include "group.h"

using namespace LAMMPS_NS;

/* ---------------------------------------------------------------------- */

NPairHalfBinNewtoffIntel::NPairHalfBinNewtoffIntel(LAMMPS *lmp) :
  NPairIntel(lmp) {}

/* ----------------------------------------------------------------------
   binned neighbor list construction with partial Newton's 3rd law
   each owned atom i checks own bin and other bins in stencil
   pair stored once if i,j are both owned and i < j
   pair stored by me if j is ghost (also stored by proc owning j)
------------------------------------------------------------------------- */

void NPairHalfBinNewtoffIntel::build(NeighList *list)
{
  if (nstencil > INTEL_MAX_STENCIL_CHECK)
    error->all(FLERR, "Too many neighbor bins for USER-INTEL package.");

  #ifdef _LMP_INTEL_OFFLOAD
  if (exclude)
    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
  #endif

  if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
    hbnni(list, _fix->get_mixed_buffers());
  else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
    hbnni(list, _fix->get_double_buffers());
  else
    hbnni(list, _fix->get_single_buffers());

  _fix->stop_watch(TIME_HOST_NEIGHBOR);
}

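// Minimal sketch (not part of the original file) of the Newton-off acceptance
// test that hbnni() applies below, assuming local atoms occupy indices
// [0, nlocal) and ghosts [nlocal, nall); "cutsq" stands in for the
// cutneighsq[itype*ntypes + jtype] lookup.
static inline bool accept_newtoff_sketch(const int i, const int j,
                                         const double delx, const double dely,
                                         const double delz,
                                         const double cutsq) {
  if (j <= i) return false;            // own/own pairs stored once (i < j);
                                       // own/ghost pairs kept on both procs
  const double rsq = delx*delx + dely*dely + delz*delz;
  return rsq <= cutsq;                 // within the neighbor cutoff
}
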
template <class flt_t, class acc_t>
|
|
||||||
void NPairHalfBinNewtoffIntel::
|
|
||||||
hbnni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
|
|
||||||
const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
|
|
||||||
list->inum = nlocal;
|
|
||||||
|
|
||||||
const int off_end = _fix->offload_end_neighbor();
|
|
||||||
int host_start = off_end;
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (off_end) grow_stencil();
|
|
||||||
if (_fix->full_host_list()) host_start = 0;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end);
|
|
||||||
|
|
||||||
int need_ic = 0;
|
|
||||||
if (atom->molecular)
|
|
||||||
dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
|
|
||||||
neighbor->cutneighmax);
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (need_ic) {
|
|
||||||
hbnni<flt_t,acc_t,1>(1, list, buffers, 0, off_end);
|
|
||||||
hbnni<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal);
|
|
||||||
} else {
|
|
||||||
hbnni<flt_t,acc_t,0>(1, list, buffers, 0, off_end);
|
|
||||||
hbnni<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
if (need_ic)
|
|
||||||
hbnni<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal);
|
|
||||||
else
|
|
||||||
hbnni<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class flt_t, class acc_t, int need_ic>
|
|
||||||
void NPairHalfBinNewtoffIntel::
|
|
||||||
hbnni(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers,
|
|
||||||
const int astart, const int aend) {
|
|
||||||
|
|
||||||
if (aend-astart == 0) return;
|
|
||||||
|
|
||||||
const int nall = atom->nlocal + atom->nghost;
|
|
||||||
int pad = 1;
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload) {
|
|
||||||
if (INTEL_MIC_NBOR_PAD > 1)
|
|
||||||
pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
if (INTEL_NBOR_PAD > 1)
|
|
||||||
pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
|
||||||
const int pad_width = pad;
|
|
||||||
|
|
||||||
const ATOM_T * _noalias const x = buffers->get_x();
|
|
||||||
int * _noalias const firstneigh = buffers->firstneigh(list);
|
|
||||||
|
|
||||||
const int molecular = atom->molecular;
|
|
||||||
int *ns = NULL;
|
|
||||||
tagint *s = NULL;
|
|
||||||
int tag_size = 0, special_size;
|
|
||||||
if (buffers->need_tag()) tag_size = nall;
|
|
||||||
if (molecular) {
|
|
||||||
s = atom->special[0];
|
|
||||||
ns = atom->nspecial[0];
|
|
||||||
special_size = aend;
|
|
||||||
} else {
|
|
||||||
s = &buffers->_special_holder;
|
|
||||||
ns = &buffers->_nspecial_holder;
|
|
||||||
special_size = 0;
|
|
||||||
}
|
|
||||||
const tagint * _noalias const special = s;
|
|
||||||
const int * _noalias const nspecial = ns;
|
|
||||||
const int maxspecial = atom->maxspecial;
|
|
||||||
const tagint * _noalias const tag = atom->tag;
|
|
||||||
|
|
||||||
int * _noalias const ilist = list->ilist;
|
|
||||||
int * _noalias numneigh = list->numneigh;
|
|
||||||
int * _noalias const cnumneigh = buffers->cnumneigh(list);
|
|
||||||
const int nstencil = this->nstencil;
|
|
||||||
const int * _noalias const stencil = this->stencil;
|
|
||||||
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
|
|
||||||
const int ntypes = atom->ntypes + 1;
|
|
||||||
const int nlocal = atom->nlocal;
|
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
|
||||||
int * const mask = atom->mask;
|
|
||||||
tagint * const molecule = atom->molecule;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int tnum;
|
|
||||||
int *overflow;
|
|
||||||
double *timer_compute;
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload) {
|
|
||||||
timer_compute = _fix->off_watch_neighbor();
|
|
||||||
tnum = buffers->get_off_threads();
|
|
||||||
overflow = _fix->get_off_overflow_flag();
|
|
||||||
_fix->stop_watch(TIME_HOST_NEIGHBOR);
|
|
||||||
_fix->start_watch(TIME_OFFLOAD_LATENCY);
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
tnum = comm->nthreads;
|
|
||||||
overflow = _fix->get_overflow_flag();
|
|
||||||
}
|
|
||||||
const int nthreads = tnum;
|
|
||||||
const int maxnbors = buffers->get_max_nbors();
|
|
||||||
int * _noalias const atombin = buffers->get_atombin();
|
|
||||||
const int * _noalias const binpacked = buffers->get_binpacked();
|
|
||||||
|
|
||||||
const int xperiodic = domain->xperiodic;
|
|
||||||
const int yperiodic = domain->yperiodic;
|
|
||||||
const int zperiodic = domain->zperiodic;
|
|
||||||
const flt_t xprd_half = domain->xprd_half;
|
|
||||||
const flt_t yprd_half = domain->yprd_half;
|
|
||||||
const flt_t zprd_half = domain->zprd_half;
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
const int * _noalias const binhead = this->binhead;
|
|
||||||
const int * _noalias const bins = this->bins;
|
|
||||||
const int cop = _fix->coprocessor_number();
|
|
||||||
const int separate_buffers = _fix->separate_buffers();
|
|
||||||
#pragma offload target(mic:cop) if(offload) \
|
|
||||||
in(x:length(nall+1) alloc_if(0) free_if(0)) \
|
|
||||||
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
|
|
||||||
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
|
|
||||||
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
|
|
||||||
in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
|
|
||||||
in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
|
|
||||||
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
out(numneigh:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
in(ilist:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
in(atombin:length(aend) alloc_if(0) free_if(0)) \
|
|
||||||
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
|
|
||||||
in(maxnbors,nthreads,maxspecial,nstencil,pad_width,offload,nall) \
|
|
||||||
in(separate_buffers, astart, aend, nlocal, molecular, ntypes) \
|
|
||||||
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
|
|
||||||
out(overflow:length(5) alloc_if(0) free_if(0)) \
|
|
||||||
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
|
||||||
signal(tag)
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
|
||||||
*timer_compute = MIC_Wtime();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
overflow[LMP_LOCAL_MIN] = astart;
|
|
||||||
overflow[LMP_LOCAL_MAX] = aend - 1;
|
|
||||||
overflow[LMP_GHOST_MIN] = nall;
|
|
||||||
overflow[LMP_GHOST_MAX] = -1;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int nstencilp = 0;
|
|
||||||
int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
|
|
||||||
for (int k = 0; k < nstencil; k++) {
|
|
||||||
binstart[nstencilp] = stencil[k];
|
|
||||||
int end = stencil[k] + 1;
|
|
||||||
for (int kk = k + 1; kk < nstencil; kk++) {
|
|
||||||
if (stencil[kk-1]+1 == stencil[kk]) {
|
|
||||||
end++;
|
|
||||||
k++;
|
|
||||||
} else break;
|
|
||||||
}
|
|
||||||
binend[nstencilp] = end;
|
|
||||||
nstencilp++;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(_OPENMP)
|
|
||||||
#pragma omp parallel default(none) \
|
|
||||||
shared(numneigh, overflow, nstencilp, binstart, binend)
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
int lmin = nall, lmax = -1, gmin = nall, gmax = -1;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
const int num = aend - astart;
|
|
||||||
int tid, ifrom, ito;
|
|
||||||
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
|
|
||||||
ifrom += astart;
|
|
||||||
ito += astart;
|
|
||||||
|
|
||||||
int which;
|
|
||||||
|
|
||||||
const int list_size = (ito + tid + 1) * maxnbors;
|
|
||||||
int ct = (ifrom + tid) * maxnbors;
|
|
||||||
int *neighptr = firstneigh + ct;
|
|
||||||
|
|
||||||
for (int i = ifrom; i < ito; i++) {
|
|
||||||
int j, k, n, n2, itype, jtype, ibin;
|
|
||||||
double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
|
|
||||||
|
|
||||||
n = 0;
|
|
||||||
n2 = maxnbors;
|
|
||||||
|
|
||||||
xtmp = x[i].x;
|
|
||||||
ytmp = x[i].y;
|
|
||||||
ztmp = x[i].z;
|
|
||||||
itype = x[i].w;
|
|
||||||
const int ioffset = ntypes*itype;
|
|
||||||
|
|
||||||
// loop over all atoms in other bins in stencil including self
|
|
||||||
// only store pair if i < j
|
|
||||||
// stores own/own pairs only once
|
|
||||||
// stores own/ghost pairs on both procs
|
|
||||||
|
|
||||||
ibin = atombin[i];
|
|
||||||
|
|
||||||
for (k = 0; k < nstencilp; k++) {
|
|
||||||
const int bstart = binhead[ibin + binstart[k]];
|
|
||||||
const int bend = binhead[ibin + binend[k]];
|
|
||||||
for (int jj = bstart; jj < bend; jj++) {
|
|
||||||
const int j = binpacked[jj];
|
|
||||||
if (j <= i) continue;
|
|
||||||
|
|
||||||
jtype = x[j].w;
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
|
||||||
if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
delx = xtmp - x[j].x;
|
|
||||||
dely = ytmp - x[j].y;
|
|
||||||
delz = ztmp - x[j].z;
|
|
||||||
rsq = delx * delx + dely * dely + delz * delz;
|
|
||||||
if (rsq <= cutneighsq[ioffset + jtype]) {
|
|
||||||
if (j < nlocal) {
|
|
||||||
if (need_ic) {
|
|
||||||
int no_special;
|
|
||||||
ominimum_image_check(no_special, delx, dely, delz);
|
|
||||||
if (no_special)
|
|
||||||
neighptr[n++] = -j - 1;
|
|
||||||
else
|
|
||||||
neighptr[n++] = j;
|
|
||||||
} else
|
|
||||||
neighptr[n++] = j;
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (j < lmin) lmin = j;
|
|
||||||
if (j > lmax) lmax = j;
|
|
||||||
#endif
|
|
||||||
} else {
|
|
||||||
if (need_ic) {
|
|
||||||
int no_special;
|
|
||||||
ominimum_image_check(no_special, delx, dely, delz);
|
|
||||||
if (no_special)
|
|
||||||
neighptr[n2++] = -j - 1;
|
|
||||||
else
|
|
||||||
neighptr[n2++] = j;
|
|
||||||
} else
|
|
||||||
neighptr[n2++] = j;
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (j < gmin) gmin = j;
|
|
||||||
if (j > gmax) gmax = j;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ilist[i] = i;
|
|
||||||
|
|
||||||
cnumneigh[i] = ct;
|
|
||||||
if (n > maxnbors) *overflow = 1;
|
|
||||||
for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
|
|
||||||
|
|
||||||
const int edge = (n % pad_width);
|
|
||||||
if (edge) {
|
|
||||||
const int pad_end = n + (pad_width - edge);
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
|
||||||
#pragma loop_count min=1, max=15, avg=8
|
|
||||||
#endif
|
|
||||||
for ( ; n < pad_end; n++)
|
|
||||||
neighptr[n] = nall;
|
|
||||||
}
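// The padding above rounds the neighbor count up to a multiple of pad_width
// using the index nall, which refers to a dummy atom slot (the x buffer holds
// nall+1 entries); vectorized loops over the list can then process whole
// chunks without a scalar remainder pass.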
|
|
||||||
numneigh[i] = n;
|
|
||||||
while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
|
|
||||||
ct += n;
|
|
||||||
neighptr += n;
|
|
||||||
if (ct + n + maxnbors > list_size) {
|
|
||||||
*overflow = 1;
|
|
||||||
ct = (ifrom + tid) * maxnbors;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (*overflow == 1)
|
|
||||||
for (int i = ifrom; i < ito; i++)
|
|
||||||
numneigh[i] = 0;
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (separate_buffers) {
|
|
||||||
#if defined(_OPENMP)
|
|
||||||
#pragma omp critical
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
|
|
||||||
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
|
|
||||||
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
|
|
||||||
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
|
|
||||||
}
|
|
||||||
#pragma omp barrier
|
|
||||||
}
|
|
||||||
|
|
||||||
int ghost_offset = 0, nall_offset = nall;
|
|
||||||
if (separate_buffers) {
|
|
||||||
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
|
|
||||||
if (nghost < 0) nghost = 0;
|
|
||||||
if (offload) {
|
|
||||||
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
|
|
||||||
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
|
|
||||||
} else {
|
|
||||||
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
|
|
||||||
nall_offset = nlocal + nghost;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (molecular) {
|
|
||||||
for (int i = ifrom; i < ito; ++i) {
|
|
||||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
|
||||||
const int jnum = numneigh[i];
|
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
|
||||||
const int j = jlist[jj];
|
|
||||||
if (need_ic && j < 0) {
|
|
||||||
which = 0;
|
|
||||||
jlist[jj] = -j - 1;
|
|
||||||
} else
|
|
||||||
ofind_special(which, special, nspecial, i, tag[j]);
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (j >= nlocal) {
|
|
||||||
if (j == nall)
|
|
||||||
jlist[jj] = nall_offset;
|
|
||||||
else if (which)
|
|
||||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
|
||||||
else jlist[jj]-=ghost_offset;
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
if (which) jlist[jj] = j ^ (which << SBBITS);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
else if (separate_buffers) {
|
|
||||||
for (int i = ifrom; i < ito; ++i) {
|
|
||||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
|
||||||
const int jnum = numneigh[i];
|
|
||||||
int jj = 0;
|
|
||||||
for (jj = 0; jj < jnum; jj++)
|
|
||||||
if (jlist[jj] >= nlocal) break;
|
|
||||||
while (jj < jnum) {
|
|
||||||
if (jlist[jj] == nall) jlist[jj] = nall_offset;
|
|
||||||
else jlist[jj] -= ghost_offset;
|
|
||||||
jj++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
} // end omp
|
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
|
||||||
#endif
|
|
||||||
} // end offload
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload) {
|
|
||||||
_fix->stop_watch(TIME_OFFLOAD_LATENCY);
|
|
||||||
_fix->start_watch(TIME_HOST_NEIGHBOR);
|
|
||||||
for (int n = 0; n < aend; n++) {
|
|
||||||
ilist[n] = n;
|
|
||||||
numneigh[n] = 0;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int i = astart; i < aend; i++)
|
|
||||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
|
||||||
if (separate_buffers) {
|
|
||||||
_fix->start_watch(TIME_PACK);
|
|
||||||
_fix->set_neighbor_host_sizes();
|
|
||||||
buffers->pack_sep_from_single(_fix->host_min_local(),
|
|
||||||
_fix->host_used_local(),
|
|
||||||
_fix->host_min_ghost(),
|
|
||||||
_fix->host_used_ghost());
|
|
||||||
_fix->stop_watch(TIME_PACK);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
for (int i = astart; i < aend; i++)
|
|
||||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
@ -1,52 +0,0 @@
/* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

#ifdef NPAIR_CLASS

NPairStyle(half/bin/newtoff/intel,
           NPairHalfBinNewtoffIntel,
           NP_HALF | NP_BIN | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_INTEL)

#else

#ifndef LMP_NPAIR_HALF_BIN_NEWTOFF_INTEL_H
#define LMP_NPAIR_HALF_BIN_NEWTOFF_INTEL_H

#include "npair_intel.h"
#include "fix_intel.h"

namespace LAMMPS_NS {

class NPairHalfBinNewtoffIntel : public NPairIntel {
 public:
  NPairHalfBinNewtoffIntel(class LAMMPS *);
  ~NPairHalfBinNewtoffIntel() {}
  void build(class NeighList *);

 private:
  template <class flt_t, class acc_t>
  void hbnni(NeighList *, IntelBuffers<flt_t,acc_t> *);
  template <class flt_t, class acc_t, int>
  void hbnni(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int,
             const int);
};

}

#endif
#endif

/* ERROR/WARNING messages:

*/
@ -75,536 +75,32 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
   int need_ic = 0;
   if (atom->molecular)
     dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
                          neighbor->cutneighmax);

   #ifdef _LMP_INTEL_OFFLOAD
   if (need_ic) {
     if (offload_noghost) {
-      hbni<flt_t,acc_t,1,1>(1, list, buffers, 0, off_end);
-      hbni<flt_t,acc_t,1,1>(0, list, buffers, host_start, nlocal, off_end);
+      bin_newton<flt_t,acc_t,1,1,0,0,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,1,1,0,0,0>(0, list, buffers, host_start, nlocal,
+                                        off_end);
     } else {
-      hbni<flt_t,acc_t,0,1>(1, list, buffers, 0, off_end);
-      hbni<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
+      bin_newton<flt_t,acc_t,0,1,0,0,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
     }
   } else {
     if (offload_noghost) {
-      hbni<flt_t,acc_t,1,0>(1, list, buffers, 0, off_end);
-      hbni<flt_t,acc_t,1,0>(0, list, buffers, host_start, nlocal, off_end);
+      bin_newton<flt_t,acc_t,1,0,0,0,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,1,0,0,0,0>(0, list, buffers, host_start, nlocal,
+                                        off_end);
     } else {
-      hbni<flt_t,acc_t,0,0>(1, list, buffers, 0, off_end);
-      hbni<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
+      bin_newton<flt_t,acc_t,0,0,0,0,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal);
     }
   }
   #else
   if (need_ic)
-    hbni<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
+    bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
   else
-    hbni<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
+    bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal);
   #endif
 }

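// The dispatch above routes what used to be a per-style hbni() implementation
// into a shared bin_newton() template (presumably provided by the NPairIntel
// base class), selecting a specialization at compile time. A minimal sketch of
// the pattern, with hypothetical names rather than the LAMMPS templates:
// integer template parameters replace runtime flags so each variant is
// compiled without per-iteration branch overhead.
template <int OFFLOAD_NOGHOST, int NEED_IC>
static void build_variant_sketch() {
  if (OFFLOAD_NOGHOST) { /* skip-ghost path compiled in only when flag is 1 */ }
  if (NEED_IC) { /* minimum-image checks compiled in only when flag is 1 */ }
}
// e.g. build_variant_sketch<1,0>() instantiates the no-ghost, no-check path.
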
template <class flt_t, class acc_t, int offload_noghost, int need_ic>
|
|
||||||
void NPairHalfBinNewtonIntel::
|
|
||||||
hbni(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers,
|
|
||||||
const int astart, const int aend, const int offload_end) {
|
|
||||||
|
|
||||||
if (aend-astart == 0) return;
|
|
||||||
|
|
||||||
const int nall = atom->nlocal + atom->nghost;
|
|
||||||
int pad = 1;
|
|
||||||
int nall_t = nall;
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload_noghost && offload) nall_t = atom->nlocal;
|
|
||||||
if (offload) {
|
|
||||||
if (INTEL_MIC_NBOR_PAD > 1)
|
|
||||||
pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
if (INTEL_NBOR_PAD > 1)
|
|
||||||
pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
|
||||||
const int pad_width = pad;
|
|
||||||
|
|
||||||
const ATOM_T * _noalias const x = buffers->get_x();
|
|
||||||
int * _noalias const firstneigh = buffers->firstneigh(list);
|
|
||||||
const int e_nall = nall_t;
|
|
||||||
|
|
||||||
const int molecular = atom->molecular;
|
|
||||||
int *ns = NULL;
|
|
||||||
tagint *s = NULL;
|
|
||||||
int tag_size = 0, special_size;
|
|
||||||
if (buffers->need_tag()) tag_size = e_nall;
|
|
||||||
if (molecular) {
|
|
||||||
s = atom->special[0];
|
|
||||||
ns = atom->nspecial[0];
|
|
||||||
special_size = aend;
|
|
||||||
} else {
|
|
||||||
s = &buffers->_special_holder;
|
|
||||||
ns = &buffers->_nspecial_holder;
|
|
||||||
special_size = 0;
|
|
||||||
}
|
|
||||||
const tagint * _noalias const special = s;
|
|
||||||
const int * _noalias const nspecial = ns;
|
|
||||||
const int maxspecial = atom->maxspecial;
|
|
||||||
const tagint * _noalias const tag = atom->tag;
|
|
||||||
|
|
||||||
int * _noalias const ilist = list->ilist;
|
|
||||||
int * _noalias numneigh = list->numneigh;
|
|
||||||
int * _noalias const cnumneigh = buffers->cnumneigh(list);
|
|
||||||
const int nstencil = this->nstencil;
|
|
||||||
const int * _noalias const stencil = this->stencil;
|
|
||||||
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
|
|
||||||
const int ntypes = atom->ntypes + 1;
|
|
||||||
const int nlocal = atom->nlocal;
|
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
|
||||||
int * const mask = atom->mask;
|
|
||||||
tagint * const molecule = atom->molecule;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int tnum;
|
|
||||||
int *overflow;
|
|
||||||
double *timer_compute;
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload) {
|
|
||||||
timer_compute = _fix->off_watch_neighbor();
|
|
||||||
tnum = buffers->get_off_threads();
|
|
||||||
overflow = _fix->get_off_overflow_flag();
|
|
||||||
_fix->stop_watch(TIME_HOST_NEIGHBOR);
|
|
||||||
_fix->start_watch(TIME_OFFLOAD_LATENCY);
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
tnum = comm->nthreads;
|
|
||||||
overflow = _fix->get_overflow_flag();
|
|
||||||
}
|
|
||||||
const int nthreads = tnum;
|
|
||||||
const int maxnbors = buffers->get_max_nbors();
|
|
||||||
int * _noalias const atombin = buffers->get_atombin();
|
|
||||||
const int * _noalias const binpacked = buffers->get_binpacked();
|
|
||||||
|
|
||||||
const int xperiodic = domain->xperiodic;
|
|
||||||
const int yperiodic = domain->yperiodic;
|
|
||||||
const int zperiodic = domain->zperiodic;
|
|
||||||
const flt_t xprd_half = domain->xprd_half;
|
|
||||||
const flt_t yprd_half = domain->yprd_half;
|
|
||||||
const flt_t zprd_half = domain->zprd_half;
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
const int * _noalias const binhead = this->binhead;
|
|
||||||
const int * _noalias const bins = this->bins;
|
|
||||||
const int cop = _fix->coprocessor_number();
|
|
||||||
const int separate_buffers = _fix->separate_buffers();
|
|
||||||
#pragma offload target(mic:cop) if(offload) \
|
|
||||||
in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
|
|
||||||
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
|
|
||||||
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
|
|
||||||
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
|
|
||||||
in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
|
|
||||||
in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
|
|
||||||
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
out(numneigh:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
in(ilist:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
in(atombin:length(aend) alloc_if(0) free_if(0)) \
|
|
||||||
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
|
|
||||||
in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pad_width) \
|
|
||||||
in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \
|
|
||||||
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
|
|
||||||
out(overflow:length(5) alloc_if(0) free_if(0)) \
|
|
||||||
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
|
||||||
signal(tag)
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
|
||||||
*timer_compute = MIC_Wtime();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
overflow[LMP_LOCAL_MIN] = astart;
|
|
||||||
overflow[LMP_LOCAL_MAX] = aend - 1;
|
|
||||||
overflow[LMP_GHOST_MIN] = e_nall;
|
|
||||||
overflow[LMP_GHOST_MAX] = -1;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int nstencilp = 0;
|
|
||||||
int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
|
|
||||||
for (int k = 0; k < nstencil; k++) {
|
|
||||||
binstart[nstencilp] = stencil[k];
|
|
||||||
int end = stencil[k] + 1;
|
|
||||||
for (int kk = k + 1; kk < nstencil; kk++) {
|
|
||||||
if (stencil[kk-1]+1 == stencil[kk]) {
|
|
||||||
end++;
|
|
||||||
k++;
|
|
||||||
} else break;
|
|
||||||
}
|
|
||||||
binend[nstencilp] = end;
|
|
||||||
nstencilp++;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(_OPENMP)
|
|
||||||
#pragma omp parallel default(none) \
|
|
||||||
shared(numneigh, overflow, nstencilp, binstart, binend)
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
const int num = aend - astart;
|
|
||||||
int tid, ifrom, ito;
|
|
||||||
|
|
||||||
#ifdef OUTER_CHUNK
|
|
||||||
const int swidth = ip_simd::SIMD_type<flt_t>::width();
|
|
||||||
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, swidth);
|
|
||||||
ifrom += astart;
|
|
||||||
ito += astart;
|
|
||||||
int e_ito = ito;
|
|
||||||
if (ito == num) {
|
|
||||||
int imod = ito % swidth;
|
|
||||||
if (imod) e_ito += swidth - imod;
|
|
||||||
}
|
|
||||||
const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
|
|
||||||
#else
|
|
||||||
const int swidth = 1;
|
|
||||||
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
|
|
||||||
ifrom += astart;
|
|
||||||
ito += astart;
|
|
||||||
const int list_size = (ito + tid * 2 + 2) * maxnbors;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int which;
|
|
||||||
|
|
||||||
int pack_offset = maxnbors * swidth;
|
|
||||||
int ct = (ifrom + tid * 2) * maxnbors;
|
|
||||||
int *neighptr = firstneigh + ct;
|
|
||||||
const int obound = pack_offset + maxnbors * 2;
|
|
||||||
|
|
||||||
int max_chunk = 0;
|
|
||||||
int lane = 0;
|
|
||||||
for (int i = ifrom; i < ito; i++) {
|
|
||||||
const flt_t xtmp = x[i].x;
|
|
||||||
const flt_t ytmp = x[i].y;
|
|
||||||
const flt_t ztmp = x[i].z;
|
|
||||||
const int itype = x[i].w;
|
|
||||||
const int ioffset = ntypes * itype;
|
|
||||||
|
|
||||||
// loop over rest of atoms in i's bin, ghosts are at end of linked list
|
|
||||||
// if j is owned atom, store it, since j is beyond i in linked list
|
|
||||||
// if j is ghost, only store if j coords are "above/to the right" of i
|
|
||||||
|
|
||||||
int raw_count = pack_offset;
|
|
||||||
for (int j = bins[i]; j >= 0; j = bins[j]) {
|
|
||||||
if (j >= nlocal) {
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload_noghost && offload) continue;
|
|
||||||
#endif
|
|
||||||
if (x[j].z < ztmp) continue;
|
|
||||||
if (x[j].z == ztmp) {
|
|
||||||
if (x[j].y < ytmp) continue;
|
|
||||||
if (x[j].y == ytmp && x[j].x < xtmp) continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
else if (offload_noghost && i < offload_end) continue;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
|
||||||
if (exclude) {
|
|
||||||
const int jtype = x[j].w;
|
|
||||||
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
neighptr[raw_count++] = j;
|
|
||||||
}
|
|
||||||
|
|
||||||
// loop over all atoms in other bins in stencil, store every pair
|
|
||||||
|
|
||||||
const int ibin = atombin[i];
|
|
||||||
if (exclude) {
|
|
||||||
for (int k = 0; k < nstencilp; k++) {
|
|
||||||
const int bstart = binhead[ibin + binstart[k]];
|
|
||||||
const int bend = binhead[ibin + binend[k]];
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
|
||||||
#ifdef INTEL_VMASK
|
|
||||||
#pragma simd
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
for (int jj = bstart; jj < bend; jj++) {
|
|
||||||
const int j = binpacked[jj];
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload_noghost) {
|
|
||||||
if (j < nlocal) {
|
|
||||||
if (i < offload_end) continue;
|
|
||||||
} else if (offload) continue;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
|
||||||
const int jtype = x[j].w;
|
|
||||||
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
neighptr[raw_count++] = j;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int k = 0; k < nstencilp; k++) {
|
|
||||||
const int bstart = binhead[ibin + binstart[k]];
|
|
||||||
const int bend = binhead[ibin + binend[k]];
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
|
||||||
#ifdef INTEL_VMASK
|
|
||||||
#pragma simd
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
for (int jj = bstart; jj < bend; jj++) {
|
|
||||||
const int j = binpacked[jj];
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload_noghost) {
|
|
||||||
if (j < nlocal) {
|
|
||||||
if (i < offload_end) continue;
|
|
||||||
} else if (offload) continue;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
neighptr[raw_count++] = j;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (raw_count > obound) *overflow = 1;
|
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
|
|
||||||
#if __INTEL_COMPILER+0 > 1499
|
|
||||||
#pragma vector aligned
|
|
||||||
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
|
|
||||||
#endif
|
|
||||||
#else
|
|
||||||
#pragma vector aligned
|
|
||||||
#pragma simd
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
for (int u = pack_offset; u < raw_count; u++) {
|
|
||||||
int j = neighptr[u];
|
|
||||||
const flt_t delx = xtmp - x[j].x;
|
|
||||||
const flt_t dely = ytmp - x[j].y;
|
|
||||||
const flt_t delz = ztmp - x[j].z;
|
|
||||||
const int jtype = x[j].w;
|
|
||||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
|
||||||
if (rsq > cutneighsq[ioffset + jtype])
|
|
||||||
neighptr[u] = e_nall;
|
|
||||||
else {
|
|
||||||
if (need_ic) {
|
|
||||||
int no_special;
|
|
||||||
ominimum_image_check(no_special, delx, dely, delz);
|
|
||||||
if (no_special)
|
|
||||||
neighptr[u] = -j - 1;
|
|
||||||
}
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (j < nlocal) {
|
|
||||||
if (j < vlmin) vlmin = j;
|
|
||||||
if (j > vlmax) vlmax = j;
|
|
||||||
} else {
|
|
||||||
if (j < vgmin) vgmin = j;
|
|
||||||
if (j > vgmax) vgmax = j;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
lmin = MIN(lmin,vlmin);
|
|
||||||
gmin = MIN(gmin,vgmin);
|
|
||||||
lmax = MAX(lmax,vlmax);
|
|
||||||
gmax = MAX(gmax,vgmax);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int n = lane, n2 = pack_offset;
|
|
||||||
for (int u = pack_offset; u < raw_count; u++) {
|
|
||||||
const int j = neighptr[u];
|
|
||||||
int pj = j;
|
|
||||||
if (pj < e_nall) {
|
|
||||||
if (need_ic)
|
|
||||||
if (pj < 0) pj = -pj - 1;
|
|
||||||
|
|
||||||
if (pj < nlocal) {
|
|
||||||
neighptr[n] = j;
|
|
||||||
n += swidth;
|
|
||||||
} else
|
|
||||||
neighptr[n2++] = j;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
int ns = (n - lane) / swidth;
|
|
||||||
for (int u = pack_offset; u < n2; u++) {
|
|
||||||
neighptr[n] = neighptr[u];
|
|
||||||
n += swidth;
|
|
||||||
}
|
|
||||||
|
|
||||||
ilist[i] = i;
|
|
||||||
cnumneigh[i] = ct + lane;
|
|
||||||
ns += n2 - pack_offset;
|
|
||||||
#ifndef OUTER_CHUNK
|
|
||||||
int edge = (ns % pad_width);
|
|
||||||
if (edge) {
|
|
||||||
const int pad_end = ns + (pad_width - edge);
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
|
||||||
#pragma loop_count min=1, max=15, avg=8
|
|
||||||
#endif
|
|
||||||
for ( ; ns < pad_end; ns++)
|
|
||||||
neighptr[ns] = e_nall;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
numneigh[i] = ns;
|
|
||||||
|
|
||||||
#ifdef OUTER_CHUNK
|
|
||||||
if (ns > max_chunk) max_chunk = ns;
|
|
||||||
lane++;
|
|
||||||
if (lane == swidth) {
|
|
||||||
ct += max_chunk * swidth;
|
|
||||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
|
||||||
int edge = (ct % alignb);
|
|
||||||
if (edge) ct += alignb - edge;
|
|
||||||
neighptr = firstneigh + ct;
|
|
||||||
max_chunk = 0;
|
|
||||||
pack_offset = maxnbors * swidth;
|
|
||||||
lane = 0;
|
|
||||||
if (ct + obound > list_size) {
|
|
||||||
if (i < ito - 1) {
|
|
||||||
*overflow = 1;
|
|
||||||
ct = (ifrom + tid * 2) * maxnbors;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
ct += ns;
|
|
||||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
|
||||||
edge = (ct % alignb);
|
|
||||||
if (edge) ct += alignb - edge;
|
|
||||||
neighptr = firstneigh + ct;
|
|
||||||
if (ct + obound > list_size) {
|
|
||||||
if (i < ito - 1) {
|
|
||||||
*overflow = 1;
|
|
||||||
ct = (ifrom + tid * 2) * maxnbors;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
if (*overflow == 1)
|
|
||||||
for (int i = ifrom; i < ito; i++)
|
|
||||||
numneigh[i] = 0;
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (separate_buffers) {
|
|
||||||
#if defined(_OPENMP)
|
|
||||||
#pragma omp critical
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
|
|
||||||
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
|
|
||||||
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
|
|
||||||
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
|
|
||||||
}
|
|
||||||
#pragma omp barrier
|
|
||||||
}
|
|
||||||
|
|
||||||
int ghost_offset = 0, nall_offset = e_nall;
|
|
||||||
if (separate_buffers) {
|
|
||||||
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
|
|
||||||
if (nghost < 0) nghost = 0;
|
|
||||||
if (offload) {
|
|
||||||
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
|
|
||||||
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
|
|
||||||
} else {
|
|
||||||
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
|
|
||||||
nall_offset = nlocal + nghost;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (molecular) {
|
|
||||||
for (int i = ifrom; i < ito; ++i) {
|
|
||||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
|
||||||
const int jnum = numneigh[i];
|
|
||||||
#ifndef OUTER_CHUNK
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
|
||||||
#pragma vector aligned
|
|
||||||
#pragma simd
|
|
||||||
#endif
|
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
|
||||||
#else
|
|
||||||
const int trip = jnum * swidth;
|
|
||||||
for (int jj = 0; jj < trip; jj+= swidth) {
|
|
||||||
#endif
|
|
||||||
const int j = jlist[jj];
|
|
||||||
if (need_ic && j < 0) {
|
|
||||||
which = 0;
|
|
||||||
jlist[jj] = -j - 1;
|
|
||||||
} else
|
|
||||||
ofind_special(which, special, nspecial, i, tag[j]);
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (j >= nlocal) {
|
|
||||||
if (j == e_nall)
|
|
||||||
jlist[jj] = nall_offset;
|
|
||||||
else if (which)
|
|
||||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
|
||||||
else jlist[jj]-=ghost_offset;
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
if (which) jlist[jj] = j ^ (which << SBBITS);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
else if (separate_buffers) {
|
|
||||||
for (int i = ifrom; i < ito; ++i) {
|
|
||||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
|
||||||
const int jnum = numneigh[i];
|
|
||||||
int jj = 0;
|
|
||||||
for (jj = 0; jj < jnum; jj++)
|
|
||||||
if (jlist[jj] >= nlocal) break;
|
|
||||||
while (jj < jnum) {
|
|
||||||
if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
|
|
||||||
else jlist[jj] -= ghost_offset;
|
|
||||||
jj++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
} // end omp
|
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
|
||||||
#endif
|
|
||||||
} // end offload
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload) {
|
|
||||||
_fix->stop_watch(TIME_OFFLOAD_LATENCY);
|
|
||||||
_fix->start_watch(TIME_HOST_NEIGHBOR);
|
|
||||||
for (int n = 0; n < aend; n++) {
|
|
||||||
ilist[n] = n;
|
|
||||||
numneigh[n] = 0;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int i = astart; i < aend; i++)
|
|
||||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
|
||||||
if (separate_buffers) {
|
|
||||||
_fix->start_watch(TIME_PACK);
|
|
||||||
_fix->set_neighbor_host_sizes();
|
|
||||||
buffers->pack_sep_from_single(_fix->host_min_local(),
|
|
||||||
_fix->host_used_local(),
|
|
||||||
_fix->host_min_ghost(),
|
|
||||||
_fix->host_used_ghost());
|
|
||||||
_fix->stop_watch(TIME_PACK);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
for (int i = astart; i < aend; i++)
|
|
||||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|||||||
@ -36,9 +36,6 @@ class NPairHalfBinNewtonIntel : public NPairIntel {
  private:
   template <class flt_t, class acc_t>
   void hbni(NeighList *, IntelBuffers<flt_t,acc_t> *);
-  template <class flt_t, class acc_t, int, int>
-  void hbni(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int,
-            const int, const int offload_end = 0);
 };

 }
@ -75,439 +75,32 @@ hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
   int need_ic = 0;
   if (atom->molecular)
     dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
                          neighbor->cutneighmax);

   #ifdef _LMP_INTEL_OFFLOAD
   if (need_ic) {
     if (offload_noghost) {
-      hbnti<flt_t,acc_t,1,1>(1, list, buffers, 0, off_end);
-      hbnti<flt_t,acc_t,1,1>(0, list, buffers, host_start, nlocal, off_end);
+      bin_newton<flt_t,acc_t,1,1,0,1,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,1,1,0,1,0>(0, list, buffers, host_start, nlocal,
+                                        off_end);
     } else {
-      hbnti<flt_t,acc_t,0,1>(1, list, buffers, 0, off_end);
-      hbnti<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
+      bin_newton<flt_t,acc_t,0,1,0,1,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal);
     }
   } else {
     if (offload_noghost) {
-      hbnti<flt_t,acc_t,1,0>(1, list, buffers, 0, off_end);
-      hbnti<flt_t,acc_t,1,0>(0, list, buffers, host_start, nlocal, off_end);
+      bin_newton<flt_t,acc_t,1,0,0,1,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,1,0,0,1,0>(0, list, buffers, host_start, nlocal,
+                                        off_end);
     } else {
-      hbnti<flt_t,acc_t,0,0>(1, list, buffers, 0, off_end);
-      hbnti<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
+      bin_newton<flt_t,acc_t,0,0,0,1,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal);
     }
   }
   #else
   if (need_ic)
-    hbnti<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
+    bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal);
   else
-    hbnti<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
+    bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal);
   #endif
 }

template <class flt_t, class acc_t, int offload_noghost, int need_ic>
|
|
||||||
void NPairHalfBinNewtonTriIntel::
|
|
||||||
hbnti(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers,
|
|
||||||
const int astart, const int aend, const int offload_end) {
|
|
||||||
if (aend-astart == 0) return;
|
|
||||||
|
|
||||||
const int nall = atom->nlocal + atom->nghost;
|
|
||||||
int pad = 1;
|
|
||||||
int nall_t = nall;
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload_noghost && offload) nall_t = atom->nlocal;
|
|
||||||
if (offload) {
|
|
||||||
if (INTEL_MIC_NBOR_PAD > 1)
|
|
||||||
pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
if (INTEL_NBOR_PAD > 1)
|
|
||||||
pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
|
||||||
const int pad_width = pad;
|
|
||||||
|
|
||||||
const ATOM_T * _noalias const x = buffers->get_x();
|
|
||||||
int * _noalias const firstneigh = buffers->firstneigh(list);
|
|
||||||
const int e_nall = nall_t;
|
|
||||||
|
|
||||||
const int molecular = atom->molecular;
|
|
||||||
int *ns = NULL;
|
|
||||||
tagint *s = NULL;
|
|
||||||
int tag_size = 0, special_size;
|
|
||||||
if (buffers->need_tag()) tag_size = e_nall;
|
|
||||||
if (molecular) {
|
|
||||||
s = atom->special[0];
|
|
||||||
ns = atom->nspecial[0];
|
|
||||||
special_size = aend;
|
|
||||||
} else {
|
|
||||||
s = &buffers->_special_holder;
|
|
||||||
ns = &buffers->_nspecial_holder;
|
|
||||||
special_size = 0;
|
|
||||||
}
|
|
||||||
const tagint * _noalias const special = s;
|
|
||||||
const int * _noalias const nspecial = ns;
|
|
||||||
const int maxspecial = atom->maxspecial;
|
|
||||||
const tagint * _noalias const tag = atom->tag;
|
|
||||||
|
|
||||||
int * _noalias const ilist = list->ilist;
|
|
||||||
int * _noalias numneigh = list->numneigh;
|
|
||||||
int * _noalias const cnumneigh = buffers->cnumneigh(list);
|
|
||||||
const int nstencil = this->nstencil;
|
|
||||||
const int * _noalias const stencil = this->stencil;
|
|
||||||
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
|
|
||||||
const int ntypes = atom->ntypes + 1;
|
|
||||||
const int nlocal = atom->nlocal;
|
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
|
||||||
int * const mask = atom->mask;
|
|
||||||
tagint * const molecule = atom->molecule;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int tnum;
|
|
||||||
int *overflow;
|
|
||||||
double *timer_compute;
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload) {
|
|
||||||
timer_compute = _fix->off_watch_neighbor();
|
|
||||||
tnum = buffers->get_off_threads();
|
|
||||||
overflow = _fix->get_off_overflow_flag();
|
|
||||||
_fix->stop_watch(TIME_HOST_NEIGHBOR);
|
|
||||||
_fix->start_watch(TIME_OFFLOAD_LATENCY);
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
tnum = comm->nthreads;
|
|
||||||
overflow = _fix->get_overflow_flag();
|
|
||||||
}
|
|
||||||
const int nthreads = tnum;
|
|
||||||
const int maxnbors = buffers->get_max_nbors();
|
|
||||||
int * _noalias const atombin = buffers->get_atombin();
|
|
||||||
const int * _noalias const binpacked = buffers->get_binpacked();
|
|
||||||
|
|
||||||
const int xperiodic = domain->xperiodic;
|
|
||||||
const int yperiodic = domain->yperiodic;
|
|
||||||
const int zperiodic = domain->zperiodic;
|
|
||||||
const flt_t xprd_half = domain->xprd_half;
|
|
||||||
const flt_t yprd_half = domain->yprd_half;
|
|
||||||
const flt_t zprd_half = domain->zprd_half;
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
const int * _noalias const binhead = this->binhead;
|
|
||||||
const int * _noalias const bins = this->bins;
|
|
||||||
const int cop = _fix->coprocessor_number();
|
|
||||||
const int separate_buffers = _fix->separate_buffers();
|
|
||||||
#pragma offload target(mic:cop) if(offload) \
|
|
||||||
in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
|
|
||||||
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
|
|
||||||
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
|
|
||||||
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
|
|
||||||
in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
|
|
||||||
in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
|
|
||||||
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
out(numneigh:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
in(ilist:length(0) alloc_if(0) free_if(0)) \
|
|
||||||
in(atombin:length(aend) alloc_if(0) free_if(0)) \
|
|
||||||
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
|
|
||||||
in(maxnbors,nthreads,maxspecial,nstencil,offload_end,pad_width,e_nall) \
|
|
||||||
in(offload,separate_buffers, astart, aend, nlocal, molecular, ntypes) \
|
|
||||||
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
|
|
||||||
out(overflow:length(5) alloc_if(0) free_if(0)) \
|
|
||||||
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
|
||||||
signal(tag)
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
|
||||||
*timer_compute = MIC_Wtime();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
overflow[LMP_LOCAL_MIN] = astart;
|
|
||||||
overflow[LMP_LOCAL_MAX] = aend - 1;
|
|
||||||
overflow[LMP_GHOST_MIN] = e_nall;
|
|
||||||
overflow[LMP_GHOST_MAX] = -1;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int nstencilp = 0;
|
|
||||||
int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
|
|
||||||
for (int k = 0; k < nstencil; k++) {
|
|
||||||
binstart[nstencilp] = stencil[k];
|
|
||||||
int end = stencil[k] + 1;
|
|
||||||
for (int kk = k + 1; kk < nstencil; kk++) {
|
|
||||||
if (stencil[kk-1]+1 == stencil[kk]) {
|
|
||||||
end++;
|
|
||||||
k++;
|
|
||||||
} else break;
|
|
||||||
}
|
|
||||||
binend[nstencilp] = end;
|
|
||||||
nstencilp++;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(_OPENMP)
|
|
||||||
#pragma omp parallel default(none) \
|
|
||||||
shared(numneigh, overflow, nstencilp, binstart, binend)
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
const int num = aend - astart;
|
|
||||||
int tid, ifrom, ito;
|
|
||||||
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
|
|
||||||
ifrom += astart;
|
|
||||||
ito += astart;
|
|
||||||
|
|
||||||
int which;
|
|
||||||
|
|
||||||
const int list_size = (ito + tid * 2 + 2) * maxnbors;
|
|
||||||
int ct = (ifrom + tid * 2) * maxnbors;
|
|
||||||
int *neighptr = firstneigh + ct;
|
|
||||||
const int obound = maxnbors * 3;
|
|
||||||
|
|
||||||
for (int i = ifrom; i < ito; i++) {
|
|
||||||
const flt_t xtmp = x[i].x;
|
|
||||||
const flt_t ytmp = x[i].y;
|
|
||||||
const flt_t ztmp = x[i].z;
|
|
||||||
const int itype = x[i].w;
|
|
||||||
const int ioffset = ntypes * itype;
|
|
||||||
|
|
||||||
// loop over all atoms in bins in stencil
|
|
||||||
// pairs for atoms j "below" i are excluded
|
|
||||||
// below = lower z or (equal z and lower y) or (equal zy and lower x)
|
|
||||||
// (equal zyx and j <= i)
|
|
||||||
// latter excludes self-self interaction but allows superposed atoms
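      // equivalently: skip j when (x[j].z, x[j].y, x[j].x) is lexicographically
      // less than (ztmp, ytmp, xtmp), or when all three are equal and j <= i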
|
|
||||||
|
|
||||||
const int ibin = atombin[i];
|
|
||||||
|
|
||||||
int raw_count = maxnbors;
|
|
||||||
for (int k = 0; k < nstencilp; k++) {
|
|
||||||
const int bstart = binhead[ibin + binstart[k]];
|
|
||||||
const int bend = binhead[ibin + binend[k]];
|
|
||||||
for (int jj = bstart; jj < bend; jj++) {
|
|
||||||
const int j = binpacked[jj];
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload_noghost) {
|
|
||||||
if (j < nlocal) {
|
|
||||||
if (i < offload_end) continue;
|
|
||||||
} else if (offload) continue;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (x[j].z < ztmp) continue;
|
|
||||||
if (x[j].z == ztmp) {
|
|
||||||
if (x[j].y < ytmp) continue;
|
|
||||||
if (x[j].y == ytmp) {
|
|
||||||
if (x[j].x < xtmp) continue;
|
|
||||||
if (x[j].x == xtmp && j <= i) continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
|
||||||
if (exclude) {
|
|
||||||
const int jtype = x[j].w;
|
|
||||||
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
neighptr[raw_count++] = j;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (raw_count > obound)
|
|
||||||
*overflow = 1;
|
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
|
|
||||||
#if __INTEL_COMPILER+0 > 1499
|
|
||||||
#pragma vector aligned
|
|
||||||
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
|
|
||||||
#endif
|
|
||||||
#else
|
|
||||||
#pragma vector aligned
|
|
||||||
#pragma simd
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
for (int u = maxnbors; u < raw_count; u++) {
|
|
||||||
int j = neighptr[u];
|
|
||||||
const flt_t delx = xtmp - x[j].x;
|
|
||||||
const flt_t dely = ytmp - x[j].y;
|
|
||||||
const flt_t delz = ztmp - x[j].z;
|
|
||||||
const int jtype = x[j].w;
|
|
||||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
|
||||||
if (rsq > cutneighsq[ioffset + jtype])
|
|
||||||
neighptr[u] = e_nall;
|
|
||||||
else {
|
|
||||||
if (need_ic) {
|
|
||||||
int no_special;
|
|
||||||
ominimum_image_check(no_special, delx, dely, delz);
|
|
||||||
if (no_special)
|
|
||||||
neighptr[u] = -j - 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (j < nlocal) {
|
|
||||||
if (j < vlmin) vlmin = j;
|
|
||||||
if (j > vlmax) vlmax = j;
|
|
||||||
} else {
|
|
||||||
if (j < vgmin) vgmin = j;
|
|
||||||
if (j > vgmax) vgmax = j;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int n = 0, n2 = maxnbors;
|
|
||||||
for (int u = maxnbors; u < raw_count; u++) {
|
|
||||||
const int j = neighptr[u];
|
|
||||||
int pj = j;
|
|
||||||
if (pj < e_nall) {
|
|
||||||
if (need_ic)
|
|
||||||
if (pj < 0) pj = -pj - 1;
|
|
||||||
|
|
||||||
if (pj < nlocal)
|
|
||||||
neighptr[n++] = j;
|
|
||||||
else
|
|
||||||
neighptr[n2++] = j;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
int ns = n;
|
|
||||||
for (int u = maxnbors; u < n2; u++)
|
|
||||||
neighptr[n++] = neighptr[u];
|
|
||||||
|
|
||||||
ilist[i] = i;
|
|
||||||
cnumneigh[i] = ct;
|
|
||||||
ns += n2 - maxnbors;
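      // pad the per-atom neighbor count up to a multiple of pad_width with
      // dummy indices (e_nall) so downstream vectorized loops can work on
      // whole chunks without a remainder pass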
|
|
||||||
|
|
||||||
int edge = (ns % pad_width);
|
|
||||||
if (edge) {
|
|
||||||
const int pad_end = ns + (pad_width - edge);
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
|
||||||
#pragma loop_count min=1, max=15, avg=8
|
|
||||||
#endif
|
|
||||||
for ( ; ns < pad_end; ns++)
|
|
||||||
neighptr[ns] = e_nall;
|
|
||||||
}
|
|
||||||
numneigh[i] = ns;
|
|
||||||
|
|
||||||
ct += ns;
|
|
||||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
|
||||||
edge = (ct % alignb);
|
|
||||||
if (edge) ct += alignb - edge;
|
|
||||||
neighptr = firstneigh + ct;
|
|
||||||
if (ct + obound > list_size) {
|
|
||||||
if (i < ito - 1) {
|
|
||||||
*overflow = 1;
|
|
||||||
ct = (ifrom + tid * 2) * maxnbors;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (*overflow == 1)
|
|
||||||
for (int i = ifrom; i < ito; i++)
|
|
||||||
numneigh[i] = 0;
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (separate_buffers) {
|
|
||||||
#if defined(_OPENMP)
|
|
||||||
#pragma omp critical
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
|
|
||||||
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
|
|
||||||
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
|
|
||||||
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
|
|
||||||
}
|
|
||||||
#pragma omp barrier
|
|
||||||
}
|
|
||||||
|
|
||||||
int ghost_offset = 0, nall_offset = e_nall;
|
|
||||||
if (separate_buffers) {
|
|
||||||
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
|
|
||||||
if (nghost < 0) nghost = 0;
|
|
||||||
if (offload) {
|
|
||||||
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
|
|
||||||
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
|
|
||||||
} else {
|
|
||||||
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
|
|
||||||
nall_offset = nlocal + nghost;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (molecular) {
|
|
||||||
for (int i = ifrom; i < ito; ++i) {
|
|
||||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
|
||||||
const int jnum = numneigh[i];
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
|
||||||
#pragma vector aligned
|
|
||||||
#pragma simd
|
|
||||||
#endif
|
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
|
||||||
const int j = jlist[jj];
|
|
||||||
if (need_ic && j < 0) {
|
|
||||||
which = 0;
|
|
||||||
jlist[jj] = -j - 1;
|
|
||||||
} else
|
|
||||||
ofind_special(which, special, nspecial, i, tag[j]);
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (j >= nlocal) {
|
|
||||||
if (j == e_nall)
|
|
||||||
jlist[jj] = nall_offset;
|
|
||||||
else if (which)
|
|
||||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
|
||||||
else jlist[jj]-=ghost_offset;
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
if (which) jlist[jj] = j ^ (which << SBBITS);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
else if (separate_buffers) {
|
|
||||||
for (int i = ifrom; i < ito; ++i) {
|
|
||||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
|
||||||
const int jnum = numneigh[i];
|
|
||||||
int jj = 0;
|
|
||||||
for (jj = 0; jj < jnum; jj++)
|
|
||||||
if (jlist[jj] >= nlocal) break;
|
|
||||||
while (jj < jnum) {
|
|
||||||
if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
|
|
||||||
else jlist[jj] -= ghost_offset;
|
|
||||||
jj++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
} // end omp
|
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
|
||||||
#endif
|
|
||||||
} // end offload
|
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
|
||||||
if (offload) {
|
|
||||||
_fix->stop_watch(TIME_OFFLOAD_LATENCY);
|
|
||||||
_fix->start_watch(TIME_HOST_NEIGHBOR);
|
|
||||||
for (int n = 0; n < aend; n++) {
|
|
||||||
ilist[n] = n;
|
|
||||||
numneigh[n] = 0;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int i = astart; i < aend; i++)
|
|
||||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
|
||||||
if (separate_buffers) {
|
|
||||||
_fix->start_watch(TIME_PACK);
|
|
||||||
_fix->set_neighbor_host_sizes();
|
|
||||||
buffers->pack_sep_from_single(_fix->host_min_local(),
|
|
||||||
_fix->host_used_local(),
|
|
||||||
_fix->host_min_ghost(),
|
|
||||||
_fix->host_used_ghost());
|
|
||||||
_fix->stop_watch(TIME_PACK);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
for (int i = astart; i < aend; i++)
|
|
||||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -36,9 +36,6 @@ class NPairHalfBinNewtonTriIntel : public NPairIntel {
 private:
  template <class flt_t, class acc_t>
  void hbnti(NeighList *, IntelBuffers<flt_t,acc_t> *);
  template <class flt_t, class acc_t, int, int>
  void hbnti(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int,
             const int, const int offload_end = 0);
};

}
@@ -48,6 +48,678 @@ NPairIntel::~NPairIntel() {

/* ---------------------------------------------------------------------- */

template <class flt_t, class acc_t, int offload_noghost, int need_ic,
          int FULL, int TRI, int THREE>
void NPairIntel::bin_newton(const int offload, NeighList *list,
                            IntelBuffers<flt_t,acc_t> *buffers,
                            const int astart, const int aend,
                            const int offload_end) {

  if (aend-astart == 0) return;

  const int nall = atom->nlocal + atom->nghost;
  int pad = 1;
  int nall_t = nall;

#ifdef _LMP_INTEL_OFFLOAD
  if (offload_noghost && offload) nall_t = atom->nlocal;
  if (THREE == 0 && offload) {
    if (INTEL_MIC_NBOR_PAD > 1)
      pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
  } else
#endif
  if (THREE == 0 && INTEL_NBOR_PAD > 1)
    pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
  const int pad_width = pad;
  const int pack_width = _fix->nbor_pack_width();
|
|
||||||
|
const ATOM_T * _noalias const x = buffers->get_x();
|
||||||
|
int * _noalias const firstneigh = buffers->firstneigh(list);
|
||||||
|
const int e_nall = nall_t;
|
||||||
|
|
||||||
|
const int molecular = atom->molecular;
|
||||||
|
int *ns = NULL;
|
||||||
|
tagint *s = NULL;
|
||||||
|
int tag_size = 0, special_size;
|
||||||
|
if (buffers->need_tag()) tag_size = e_nall;
|
||||||
|
if (molecular) {
|
||||||
|
s = atom->special[0];
|
||||||
|
ns = atom->nspecial[0];
|
||||||
|
special_size = aend;
|
||||||
|
} else {
|
||||||
|
s = &buffers->_special_holder;
|
||||||
|
ns = &buffers->_nspecial_holder;
|
||||||
|
special_size = 0;
|
||||||
|
}
|
||||||
|
const tagint * _noalias const special = s;
|
||||||
|
const int * _noalias const nspecial = ns;
|
||||||
|
const int maxspecial = atom->maxspecial;
|
||||||
|
const tagint * _noalias const tag = atom->tag;
|
||||||
|
|
||||||
|
int * _noalias const ilist = list->ilist;
|
||||||
|
int * _noalias numneigh = list->numneigh;
|
||||||
|
int * _noalias const cnumneigh = buffers->cnumneigh(list);
|
||||||
|
const int nstencil = this->nstencil;
|
||||||
|
const int * _noalias const stencil = this->stencil;
|
||||||
|
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
|
||||||
|
const int ntypes = atom->ntypes + 1;
|
||||||
|
const int nlocal = atom->nlocal;
|
||||||
|
|
||||||
|
#ifndef _LMP_INTEL_OFFLOAD
|
||||||
|
int * const mask = atom->mask;
|
||||||
|
tagint * const molecule = atom->molecule;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int tnum;
|
||||||
|
int *overflow;
|
||||||
|
double *timer_compute;
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
if (offload) {
|
||||||
|
timer_compute = _fix->off_watch_neighbor();
|
||||||
|
tnum = buffers->get_off_threads();
|
||||||
|
overflow = _fix->get_off_overflow_flag();
|
||||||
|
_fix->stop_watch(TIME_HOST_NEIGHBOR);
|
||||||
|
_fix->start_watch(TIME_OFFLOAD_LATENCY);
|
||||||
|
} else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
tnum = comm->nthreads;
|
||||||
|
overflow = _fix->get_overflow_flag();
|
||||||
|
}
|
||||||
|
const int nthreads = tnum;
|
||||||
|
const int maxnbors = buffers->get_max_nbors();
|
||||||
|
int * _noalias const atombin = buffers->get_atombin();
|
||||||
|
const int * _noalias const binpacked = buffers->get_binpacked();
|
||||||
|
|
||||||
|
const int xperiodic = domain->xperiodic;
|
||||||
|
const int yperiodic = domain->yperiodic;
|
||||||
|
const int zperiodic = domain->zperiodic;
|
||||||
|
const flt_t xprd_half = domain->xprd_half;
|
||||||
|
const flt_t yprd_half = domain->yprd_half;
|
||||||
|
const flt_t zprd_half = domain->zprd_half;
|
||||||
|
|
||||||
|
flt_t * _noalias const ncachex = buffers->get_ncachex();
|
||||||
|
flt_t * _noalias const ncachey = buffers->get_ncachey();
|
||||||
|
flt_t * _noalias const ncachez = buffers->get_ncachez();
|
||||||
|
int * _noalias const ncachej = buffers->get_ncachej();
|
||||||
|
int * _noalias const ncachejtype = buffers->get_ncachejtype();
|
||||||
|
const int ncache_stride = buffers->ncache_stride();
|
||||||
|
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
const int * _noalias const binhead = this->binhead;
|
||||||
|
const int * _noalias const bins = this->bins;
|
||||||
|
const int cop = _fix->coprocessor_number();
|
||||||
|
const int separate_buffers = _fix->separate_buffers();
|
||||||
|
#pragma offload target(mic:cop) if(offload) \
|
||||||
|
in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
|
||||||
|
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
|
||||||
|
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
|
||||||
|
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
|
||||||
|
in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
|
||||||
|
in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
|
||||||
|
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
|
||||||
|
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
|
||||||
|
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
|
||||||
|
out(numneigh:length(0) alloc_if(0) free_if(0)) \
|
||||||
|
in(ilist:length(0) alloc_if(0) free_if(0)) \
|
||||||
|
in(atombin:length(aend) alloc_if(0) free_if(0)) \
|
||||||
|
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
|
||||||
|
in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
|
||||||
|
in(ncachejtype:length(0) alloc_if(0) free_if(0)) \
|
||||||
|
in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
|
||||||
|
in(pad_width,offload_end,separate_buffers,astart,aend,nlocal,molecular) \
|
||||||
|
in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
|
||||||
|
in(pack_width) \
|
||||||
|
out(overflow:length(5) alloc_if(0) free_if(0)) \
|
||||||
|
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
||||||
|
signal(tag)
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
|
*timer_compute = MIC_Wtime();
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
overflow[LMP_LOCAL_MIN] = astart;
|
||||||
|
overflow[LMP_LOCAL_MAX] = aend - 1;
|
||||||
|
overflow[LMP_GHOST_MIN] = e_nall;
|
||||||
|
overflow[LMP_GHOST_MAX] = -1;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
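    // collapse the stencil into ranges of consecutive bin offsets so the
    // gather loops below can sweep contiguous [binstart, binend) spans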
    int nstencilp = 0;
    int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
    for (int k = 0; k < nstencil; k++) {
      binstart[nstencilp] = stencil[k];
      int end = stencil[k] + 1;
      for (int kk = k + 1; kk < nstencil; kk++) {
        if (stencil[kk-1]+1 == stencil[kk]) {
          end++;
          k++;
        } else break;
      }
      binend[nstencilp] = end;
      nstencilp++;
    }
|
||||||
|
|
||||||
|
#if defined(_OPENMP)
|
||||||
|
#pragma omp parallel default(none) \
|
||||||
|
shared(numneigh, overflow, nstencilp, binstart, binend)
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
const int num = aend - astart;
|
||||||
|
int tid, ifrom, ito;
|
||||||
|
|
||||||
|
if (THREE) {
|
||||||
|
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
|
||||||
|
} else {
|
||||||
|
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
|
||||||
|
}
|
||||||
|
ifrom += astart;
|
||||||
|
ito += astart;
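      // for 3-body (THREE) lists, round the upper bound used to size the
      // per-thread list storage up to a multiple of pack_width so a full
      // packed group of atoms always fits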
|
||||||
|
int e_ito = ito;
|
||||||
|
if (THREE && ito == num) {
|
||||||
|
int imod = ito % pack_width;
|
||||||
|
if (imod) e_ito += pack_width - imod;
|
||||||
|
}
|
||||||
|
const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
|
||||||
|
|
||||||
|
int which;
|
||||||
|
|
||||||
|
int pack_offset = maxnbors;
|
||||||
|
if (THREE) pack_offset *= pack_width;
|
||||||
|
int ct = (ifrom + tid * 2) * maxnbors;
|
||||||
|
int *neighptr = firstneigh + ct;
|
||||||
|
const int obound = pack_offset + maxnbors * 2;
|
||||||
|
|
||||||
|
const int toffs = tid * ncache_stride;
|
||||||
|
flt_t * _noalias const tx = ncachex + toffs;
|
||||||
|
flt_t * _noalias const ty = ncachey + toffs;
|
||||||
|
flt_t * _noalias const tz = ncachez + toffs;
|
||||||
|
int * _noalias const tj = ncachej + toffs;
|
||||||
|
int * _noalias const tjtype = ncachejtype + toffs;
|
||||||
|
|
||||||
|
flt_t * _noalias itx;
|
||||||
|
flt_t * _noalias ity;
|
||||||
|
flt_t * _noalias itz;
|
||||||
|
int * _noalias itj;
|
||||||
|
int * _noalias itjtype;
|
||||||
|
|
||||||
|
// loop over all atoms in other bins in stencil, store every pair
|
||||||
|
int istart, icount, ncount, oldbin = -9999999, lane, max_chunk;
|
||||||
|
if (THREE) {
|
||||||
|
lane = 0;
|
||||||
|
max_chunk = 0;
|
||||||
|
}
|
||||||
|
for (int i = ifrom; i < ito; i++) {
|
||||||
|
const flt_t xtmp = x[i].x;
|
||||||
|
const flt_t ytmp = x[i].y;
|
||||||
|
const flt_t ztmp = x[i].z;
|
||||||
|
const int itype = x[i].w;
|
||||||
|
tagint itag;
|
||||||
|
if (THREE) itag = tag[i];
|
||||||
|
const int ioffset = ntypes * itype;
|
||||||
|
|
||||||
|
const int ibin = atombin[i];
|
||||||
|
if (ibin != oldbin) {
|
||||||
|
oldbin = ibin;
|
||||||
|
ncount = 0;
|
||||||
|
for (int k = 0; k < nstencilp; k++) {
|
||||||
|
const int bstart = binhead[ibin + binstart[k]];
|
||||||
|
const int bend = binhead[ibin + binend[k]];
|
||||||
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma simd
|
||||||
|
#endif
|
||||||
|
for (int jj = bstart; jj < bend; jj++)
|
||||||
|
tj[ncount++] = binpacked[jj];
|
||||||
|
}
|
||||||
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma simd
|
||||||
|
#endif
|
||||||
|
for (int u = 0; u < ncount; u++) {
|
||||||
|
const int j = tj[u];
|
||||||
|
tx[u] = x[j].x;
|
||||||
|
ty[u] = x[j].y;
|
||||||
|
tz[u] = x[j].z;
|
||||||
|
tjtype[u] = x[j].w;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (FULL == 0 || TRI == 1) {
|
||||||
|
icount = 0;
|
||||||
|
istart = ncount;
|
||||||
|
const int alignb = INTEL_DATA_ALIGN / sizeof(int);
|
||||||
|
int nedge = istart % alignb;
|
||||||
|
if (nedge) istart += (alignb - nedge);
|
||||||
|
itx = tx + istart;
|
||||||
|
ity = ty + istart;
|
||||||
|
itz = tz + istart;
|
||||||
|
itj = tj + istart;
|
||||||
|
itjtype = tjtype + istart;
|
||||||
|
|
||||||
|
const int bstart = binhead[ibin];
|
||||||
|
const int bend = binhead[ibin + 1];
|
||||||
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma simd
|
||||||
|
#endif
|
||||||
|
for (int jj = bstart; jj < bend; jj++) {
|
||||||
|
const int j = binpacked[jj];
|
||||||
|
itj[icount] = j;
|
||||||
|
itx[icount] = x[j].x;
|
||||||
|
ity[icount] = x[j].y;
|
||||||
|
itz[icount] = x[j].z;
|
||||||
|
itjtype[icount] = x[j].w;
|
||||||
|
icount++;
|
||||||
|
}
|
||||||
|
if (icount + istart > obound) *overflow = 1;
|
||||||
|
} else
|
||||||
|
if (ncount > obound) *overflow = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------- Loop over i bin
|
||||||
|
|
||||||
|
int n = 0;
|
||||||
|
if (FULL == 0 || TRI == 1) {
|
||||||
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma ivdep
|
||||||
|
#endif
|
||||||
|
for (int u = 0; u < icount; u++) {
|
||||||
|
int addme = 1;
|
||||||
|
int j = itj[u];
|
||||||
|
|
||||||
|
// Cutoff Check
|
||||||
|
const flt_t delx = xtmp - itx[u];
|
||||||
|
const flt_t dely = ytmp - ity[u];
|
||||||
|
const flt_t delz = ztmp - itz[u];
|
||||||
|
const int jtype = itjtype[u];
|
||||||
|
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||||
|
if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
|
||||||
|
|
||||||
|
// i bin (half) check and offload ghost check
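            // for local j in the home bin, keep each (i,j) pair on exactly
            // one side by alternating on (i + j) parity; i == j is dropped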
|
||||||
|
if (j < nlocal) {
|
||||||
|
const int ijmod = (i + j) % 2;
|
||||||
|
if (i > j) {
|
||||||
|
if (ijmod == 0) addme = 0;
|
||||||
|
} else if (i < j) {
|
||||||
|
if (ijmod == 1) addme = 0;
|
||||||
|
} else
|
||||||
|
addme = 0;
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
if (offload_noghost && i < offload_end) addme = 0;
|
||||||
|
#endif
|
||||||
|
} else {
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
if (offload_noghost && offload) addme = 0;
|
||||||
|
#endif
|
||||||
|
if (itz[u] < ztmp) addme = 0;
|
||||||
|
if (itz[u] == ztmp) {
|
||||||
|
if (ity[u] < ytmp) addme = 0;
|
||||||
|
if (ity[u] == ytmp && itx[u] < xtmp) addme = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (need_ic) {
|
||||||
|
int no_special;
|
||||||
|
ominimum_image_check(no_special, delx, dely, delz);
|
||||||
|
if (no_special)
|
||||||
|
j = -j - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (addme)
|
||||||
|
neighptr[n++] = j;
|
||||||
|
}
|
||||||
|
} // if FULL==0
|
||||||
|
|
||||||
|
// ---------------------- Loop over other bins
|
||||||
|
|
||||||
|
int n2, *neighptr2;
|
||||||
|
if (THREE) {
|
||||||
|
n = pack_offset;
|
||||||
|
n2 = pack_offset + maxnbors;
|
||||||
|
neighptr2 = neighptr;
|
||||||
|
}
|
||||||
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma ivdep
|
||||||
|
#endif
|
||||||
|
for (int u = 0; u < ncount; u++) {
|
||||||
|
int addme = 1;
|
||||||
|
int j = tj[u];
|
||||||
|
|
||||||
|
if (FULL)
|
||||||
|
if (i == j) addme = 0;
|
||||||
|
|
||||||
|
// Cutoff Check
|
||||||
|
const flt_t delx = xtmp - tx[u];
|
||||||
|
const flt_t dely = ytmp - ty[u];
|
||||||
|
const flt_t delz = ztmp - tz[u];
|
||||||
|
const int jtype = tjtype[u];
|
||||||
|
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||||
|
if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
|
||||||
|
|
||||||
|
// Triclinic
|
||||||
|
if (TRI) {
|
||||||
|
if (tz[u] < ztmp) addme = 0;
|
||||||
|
if (tz[u] == ztmp) {
|
||||||
|
if (ty[u] < ytmp) addme = 0;
|
||||||
|
if (ty[u] == ytmp) {
|
||||||
|
if (tx[u] < xtmp) addme = 0;
|
||||||
|
if (tx[u] == xtmp && j <= i) addme = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// offload ghost check
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
if (offload_noghost) {
|
||||||
|
if (j < nlocal) {
|
||||||
|
if (i < offload_end) addme = 0;
|
||||||
|
} else if (offload) addme = 0;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int pj;
|
||||||
|
if (THREE) pj = j;
|
||||||
|
if (need_ic) {
|
||||||
|
int no_special;
|
||||||
|
ominimum_image_check(no_special, delx, dely, delz);
|
||||||
|
if (no_special)
|
||||||
|
j = -j - 1;
|
||||||
|
}
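              // for 3-body lists the tag-parity test below decides which of
              // the two packed sublists receives j (neighptr vs. neighptr2);
              // both sublists are merged, lane-packed, into one list per atom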
|
||||||
|
|
||||||
|
if (THREE) {
|
||||||
|
const int jtag = tag[pj];
|
||||||
|
int flist = 0;
|
||||||
|
if (itag > jtag) {
|
||||||
|
if ((itag+jtag) % 2 == 0) flist = 1;
|
||||||
|
} else if (itag < jtag) {
|
||||||
|
if ((itag+jtag) % 2 == 1) flist = 1;
|
||||||
|
} else {
|
||||||
|
if (tz[u] < ztmp) flist = 1;
|
||||||
|
else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
|
||||||
|
else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp)
|
||||||
|
flist = 1;
|
||||||
|
}
|
||||||
|
if (addme) {
|
||||||
|
if (flist)
|
||||||
|
neighptr2[n2++] = j;
|
||||||
|
else
|
||||||
|
neighptr[n++] = j;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (addme)
|
||||||
|
neighptr[n++] = j;
|
||||||
|
}
|
||||||
|
} // for u
|
||||||
|
|
||||||
|
#ifndef _LMP_INTEL_OFFLOAD
|
||||||
|
if (exclude) {
|
||||||
|
int alln = n;
|
||||||
|
if (THREE) n = pack_offset;
|
||||||
|
else n = 0;
|
||||||
|
for (int u = pack_offset; u < alln; u++) {
|
||||||
|
const int j = neighptr[u];
|
||||||
|
int pj = j;
|
||||||
|
if (need_ic)
|
||||||
|
if (pj < 0) pj = -j - 1;
|
||||||
|
const int jtype = x[pj].w;
|
||||||
|
if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
|
||||||
|
neighptr[n++] = j;
|
||||||
|
}
|
||||||
|
if (THREE) {
|
||||||
|
alln = n2;
|
||||||
|
n2 = pack_offset + maxnbors;
|
||||||
|
for (int u = pack_offset + maxnbors; u < alln; u++) {
|
||||||
|
const int j = neighptr[u];
|
||||||
|
int pj = j;
|
||||||
|
if (need_ic)
|
||||||
|
if (pj < 0) pj = -j - 1;
|
||||||
|
const int jtype = x[pj].w;
|
||||||
|
if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
|
||||||
|
neighptr[n2++] = j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
int ns;
|
||||||
|
if (THREE) {
|
||||||
|
int alln = n;
|
||||||
|
ns = n - pack_offset;
|
||||||
|
atombin[i] = ns;
|
||||||
|
n = lane;
|
||||||
|
for (int u = pack_offset; u < alln; u++) {
|
||||||
|
neighptr[n] = neighptr[u];
|
||||||
|
n += pack_width;
|
||||||
|
}
|
||||||
|
ns += n2 - pack_offset - maxnbors;
|
||||||
|
for (int u = pack_offset + maxnbors; u < n2; u++) {
|
||||||
|
neighptr[n] = neighptr[u];
|
||||||
|
n += pack_width;
|
||||||
|
}
|
||||||
|
if (ns > maxnbors) *overflow = 1;
|
||||||
|
} else
|
||||||
|
if (n > maxnbors) *overflow = 1;
|
||||||
|
|
||||||
|
ilist[i] = i;
|
||||||
|
cnumneigh[i] = ct;
|
||||||
|
if (THREE) {
|
||||||
|
cnumneigh[i] += lane;
|
||||||
|
numneigh[i] = ns;
|
||||||
|
} else {
|
||||||
|
int edge = (n % pad_width);
|
||||||
|
if (edge) {
|
||||||
|
const int pad_end = n + (pad_width - edge);
|
||||||
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
|
||||||
|
avg=INTEL_COMPILE_WIDTH/2
|
||||||
|
#endif
|
||||||
|
for ( ; n < pad_end; n++)
|
||||||
|
neighptr[n] = e_nall;
|
||||||
|
}
|
||||||
|
numneigh[i] = n;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (THREE) {
|
||||||
|
if (ns > max_chunk) max_chunk = ns;
|
||||||
|
lane++;
|
||||||
|
if (lane == pack_width) {
|
||||||
|
ct += max_chunk * pack_width;
|
||||||
|
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||||
|
const int edge = (ct % alignb);
|
||||||
|
if (edge) ct += alignb - edge;
|
||||||
|
neighptr = firstneigh + ct;
|
||||||
|
max_chunk = 0;
|
||||||
|
pack_offset = maxnbors * pack_width;
|
||||||
|
lane = 0;
|
||||||
|
if (ct + obound > list_size) {
|
||||||
|
if (i < ito - 1) {
|
||||||
|
*overflow = 1;
|
||||||
|
ct = (ifrom + tid * 2) * maxnbors;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
ct += n;
|
||||||
|
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||||
|
const int edge = (ct % alignb);
|
||||||
|
if (edge) ct += alignb - edge;
|
||||||
|
neighptr = firstneigh + ct;
|
||||||
|
if (ct + obound > list_size) {
|
||||||
|
if (i < ito - 1) {
|
||||||
|
*overflow = 1;
|
||||||
|
ct = (ifrom + tid * 2) * maxnbors;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (*overflow == 1)
|
||||||
|
for (int i = ifrom; i < ito; i++)
|
||||||
|
numneigh[i] = 0;
|
||||||
|
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
|
||||||
|
int ghost_offset = 0, nall_offset = e_nall;
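        // with separate host/coprocessor buffers, shift ghost indices down so
        // they follow the local atoms contiguously; entries equal to e_nall
        // are remapped to nall_offset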
|
||||||
|
if (separate_buffers) {
|
||||||
|
for (int i = ifrom; i < ito; ++i) {
|
||||||
|
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||||
|
const int jnum = numneigh[i];
|
||||||
|
#if __INTEL_COMPILER+0 > 1499
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
|
||||||
|
#endif
|
||||||
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
|
int j = jlist[jj];
|
||||||
|
if (need_ic && j < 0) j = -j - 1;
|
||||||
|
if (j < nlocal) {
|
||||||
|
if (j < vlmin) vlmin = j;
|
||||||
|
if (j > vlmax) vlmax = j;
|
||||||
|
} else {
|
||||||
|
if (j < vgmin) vgmin = j;
|
||||||
|
if (j > vgmax) vgmax = j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lmin = MIN(lmin,vlmin);
|
||||||
|
gmin = MIN(gmin,vgmin);
|
||||||
|
lmax = MAX(lmax,vlmax);
|
||||||
|
gmax = MAX(gmax,vgmax);
|
||||||
|
|
||||||
|
#if defined(_OPENMP)
|
||||||
|
#pragma omp critical
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
|
||||||
|
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
|
||||||
|
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
|
||||||
|
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
|
||||||
|
}
|
||||||
|
#pragma omp barrier
|
||||||
|
|
||||||
|
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
|
||||||
|
if (nghost < 0) nghost = 0;
|
||||||
|
if (offload) {
|
||||||
|
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
|
||||||
|
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
|
||||||
|
} else {
|
||||||
|
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
|
||||||
|
nall_offset = nlocal + nghost;
|
||||||
|
}
|
||||||
|
} // if separate_buffers
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (molecular) {
|
||||||
|
for (int i = ifrom; i < ito; ++i) {
|
||||||
|
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||||
|
const int jnum = numneigh[i];
|
||||||
|
|
||||||
|
if (THREE) {
|
||||||
|
const int trip = jnum * pack_width;
|
||||||
|
for (int jj = 0; jj < trip; jj+=pack_width) {
|
||||||
|
const int j = jlist[jj];
|
||||||
|
if (need_ic && j < 0) {
|
||||||
|
which = 0;
|
||||||
|
jlist[jj] = -j - 1;
|
||||||
|
} else
|
||||||
|
ofind_special(which, special, nspecial, i, tag[j]);
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
if (j >= nlocal) {
|
||||||
|
if (j == e_nall)
|
||||||
|
jlist[jj] = nall_offset;
|
||||||
|
else if (which)
|
||||||
|
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||||
|
else jlist[jj]-=ghost_offset;
|
||||||
|
} else
|
||||||
|
#endif
|
||||||
|
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma simd
|
||||||
|
#endif
|
||||||
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
|
const int j = jlist[jj];
|
||||||
|
if (need_ic && j < 0) {
|
||||||
|
which = 0;
|
||||||
|
jlist[jj] = -j - 1;
|
||||||
|
} else
|
||||||
|
ofind_special(which, special, nspecial, i, tag[j]);
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
if (j >= nlocal) {
|
||||||
|
if (j == e_nall)
|
||||||
|
jlist[jj] = nall_offset;
|
||||||
|
else if (which)
|
||||||
|
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||||
|
else jlist[jj]-=ghost_offset;
|
||||||
|
} else
|
||||||
|
#endif
|
||||||
|
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // for i
|
||||||
|
} // if molecular
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
else if (separate_buffers) {
|
||||||
|
for (int i = ifrom; i < ito; ++i) {
|
||||||
|
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||||
|
const int jnum = numneigh[i];
|
||||||
|
int jj = 0;
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma simd
|
||||||
|
for (jj = 0; jj < jnum; jj++) {
|
||||||
|
if (jlist[jj] >= nlocal) {
|
||||||
|
if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
|
||||||
|
else jlist[jj] -= ghost_offset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
} // end omp
|
||||||
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
|
#endif
|
||||||
|
} // end offload
|
||||||
|
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
if (offload) {
|
||||||
|
_fix->stop_watch(TIME_OFFLOAD_LATENCY);
|
||||||
|
_fix->start_watch(TIME_HOST_NEIGHBOR);
|
||||||
|
for (int n = 0; n < aend; n++) {
|
||||||
|
ilist[n] = n;
|
||||||
|
numneigh[n] = 0;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int i = astart; i < aend; i++)
|
||||||
|
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
||||||
|
if (separate_buffers) {
|
||||||
|
_fix->start_watch(TIME_PACK);
|
||||||
|
_fix->set_neighbor_host_sizes();
|
||||||
|
buffers->pack_sep_from_single(_fix->host_min_local(),
|
||||||
|
_fix->host_used_local(),
|
||||||
|
_fix->host_min_ghost(),
|
||||||
|
_fix->host_used_ghost());
|
||||||
|
_fix->stop_watch(TIME_PACK);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma simd
|
||||||
|
for (int i = astart; i < aend; i++)
|
||||||
|
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
void NPairIntel::grow_stencil()
{
@@ -65,3 +737,201 @@ void NPairIntel::grow_stencil()
  }
}
#endif

|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
// ---- Half, no IC
|
||||||
|
|
||||||
|
template void NPairIntel::bin_newton<float, float, 0, 0, 0, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<float, double, 0, 0, 0, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<double, double, 0, 0, 0, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
|
||||||
|
// ---- Half, IC
|
||||||
|
|
||||||
|
template void NPairIntel::bin_newton<float, float, 0, 1, 0, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<float, double, 0, 1, 0, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<double, double, 0, 1, 0, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
|
||||||
|
// ---- Tri, no IC
|
||||||
|
|
||||||
|
template void NPairIntel::bin_newton<float, float, 0, 0, 0, 1, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<float, double, 0, 0, 0, 1, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<double, double, 0, 0, 0, 1, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
|
||||||
|
// ---- Tri, IC
|
||||||
|
|
||||||
|
template void NPairIntel::bin_newton<float, float, 0, 1, 0, 1, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<float, double, 0, 1, 0, 1, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<double, double, 0, 1, 0, 1, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
|
||||||
|
// ---- Full, no IC
|
||||||
|
|
||||||
|
template void NPairIntel::bin_newton<float, float, 0, 0, 1, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<float, double, 0, 0, 1, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<double, double, 0, 0, 1, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
|
||||||
|
// ---- Full, IC
|
||||||
|
|
||||||
|
template void NPairIntel::bin_newton<float, float, 0, 1, 1, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<float, double, 0, 1, 1, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<double, double, 0, 1, 1, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
|
||||||
|
// ---- 3-body, no IC
|
||||||
|
|
||||||
|
template void NPairIntel::bin_newton<float, float, 0, 0, 1, 0, 1>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<float, double, 0, 0, 1, 0, 1>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<double, double, 0, 0, 1, 0, 1>
|
||||||
|
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
|
||||||
|
// ---- 3-body, IC
|
||||||
|
|
||||||
|
template void NPairIntel::bin_newton<float, float, 0, 1, 1, 0, 1>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<float, double, 0, 1, 1, 0, 1>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<double, double, 0, 1, 1, 0, 1>
|
||||||
|
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
|
||||||
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
|
||||||
|
// ---- Half, no IC, no ghost
|
||||||
|
|
||||||
|
template void NPairIntel::bin_newton<float, float, 1, 0, 0, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<float, double, 1, 0, 0, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<double, double, 1, 0, 0, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
|
||||||
|
// ---- Half, IC, no ghost
|
||||||
|
|
||||||
|
template void NPairIntel::bin_newton<float, float, 1, 1, 0, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<float, double, 1, 1, 0, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<double, double, 1, 1, 0, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
|
||||||
|
// ---- Tri, no IC, no ghost
|
||||||
|
|
||||||
|
template void NPairIntel::bin_newton<float, float, 1, 0, 0, 1, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<float, double, 1, 0, 0, 1, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<double, double, 1, 0, 0, 1, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
|
||||||
|
// ---- Tri, IC, no ghost
|
||||||
|
|
||||||
|
template void NPairIntel::bin_newton<float, float, 1, 1, 0, 1, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<float, double, 1, 1, 0, 1, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<double, double, 1, 1, 0, 1, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
|
||||||
|
// ---- Full, no IC, no ghost
|
||||||
|
|
||||||
|
template void NPairIntel::bin_newton<float, float, 1, 0, 1, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<float, double, 1, 0, 1, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<double, double, 1, 0, 1, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
|
||||||
|
// ---- Full, IC, no ghost
|
||||||
|
|
||||||
|
template void NPairIntel::bin_newton<float, float, 1, 1, 1, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<float, double, 1, 1, 1, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<double, double, 1, 1, 1, 0, 0>
|
||||||
|
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
|
||||||
|
// ---- 3-body, no IC, no ghost
|
||||||
|
|
||||||
|
template void NPairIntel::bin_newton<float, float, 1, 0, 1, 0, 1>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<float, double, 1, 0, 1, 0, 1>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<double, double, 1, 0, 1, 0, 1>
|
||||||
|
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
|
||||||
|
// ---- 3-body, IC, no ghost
|
||||||
|
|
||||||
|
template void NPairIntel::bin_newton<float, float, 1, 1, 1, 0, 1>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<float, double, 1, 1, 1, 0, 1>
|
||||||
|
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
template void NPairIntel::bin_newton<double, double, 1, 1, 1, 0, 1>
|
||||||
|
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||||
|
const int);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|||||||
@@ -25,10 +25,6 @@
#include "intel_simd.h"
#endif

#ifdef OUTER_CHUNK
#include "intel_simd.h"
#endif

#ifdef _LMP_INTEL_OFFLOAD
#pragma offload_attribute(push,target(mic))
#endif
@@ -87,6 +83,10 @@ class NPairIntel : public NPair {
 protected:
  FixIntel *_fix;

  template <class flt_t, class acc_t, int, int, int, int, int>
  void bin_newton(const int, NeighList *, IntelBuffers<flt_t,acc_t> *,
                  const int, const int, const int offload_end = 0);

#ifdef _LMP_INTEL_OFFLOAD
  int _cop;
  int *_off_map_stencil;
@@ -85,53 +85,47 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag,

  if (ago != 0 && fix->separate_buffers() == 0) {
    fix->start_watch(TIME_PACK);

    int packthreads;
    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
    else packthreads = 1;
    #if defined(_OPENMP)
    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
    #pragma omp parallel if(packthreads > 1)
    #endif
    {
      int ifrom, ito, tid;
      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
                                nthreads, sizeof(ATOM_T));
|
                                packthreads, sizeof(ATOM_T));
      buffers->thr_pack(ifrom,ito,ago);
    }
    fix->stop_watch(TIME_PACK);
  }

if (evflag || vflag_fdotr) {
|
int ovflag = 0;
|
||||||
int ovflag = 0;
|
if (vflag_fdotr) ovflag = 2;
|
||||||
if (vflag_fdotr) ovflag = 2;
|
else if (vflag) ovflag = 1;
|
||||||
else if (vflag) ovflag = 1;
|
if (eflag) {
|
||||||
if (eflag) {
|
if (force->newton_pair) {
|
||||||
if (force->newton_pair) {
|
eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
} else {
|
|
||||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
} else {
|
|
||||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
|
eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
|
eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||||
void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc,
|
const ForceConst<flt_t> &fc,
|
||||||
@@ -165,7 +159,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
|||||||
|
|
||||||
// Determine how much data to transfer
|
// Determine how much data to transfer
|
||||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
|
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||||
buffers, offload, fix, separate_flag,
|
buffers, offload, fix, separate_flag,
|
||||||
x_size, q_size, ev_size, f_stride);
|
x_size, q_size, ev_size, f_stride);
|
||||||
|
|
||||||
@@ -208,27 +202,26 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
|||||||
f_stride, x, q);
|
f_stride, x, q);
|
||||||
|
|
||||||
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EVFLAG) {
|
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
||||||
oevdwl = oecoul = (acc_t)0;
|
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// loop over neighbors of my atoms
|
// loop over neighbors of my atoms
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
|
||||||
reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
|
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int iifrom, iito, tid;
|
int iifrom, iip, iito, tid;
|
||||||
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
|
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||||
iifrom += astart;
|
iifrom += astart;
|
||||||
iito += astart;
|
iito += astart;
|
||||||
|
|
||||||
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
|
int foff;
|
||||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
|
||||||
|
else foff = -minlocal;
|
||||||
|
FORCE_T * _noalias const f = f_start + foff;
|
||||||
|
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
for (int i = iifrom; i < iito; ++i) {
|
for (int i = iifrom; i < iito; i += iip) {
|
||||||
const int itype = x[i].w;
|
const int itype = x[i].w;
|
||||||
|
|
||||||
const int ptr_off = itype * ntypes;
|
const int ptr_off = itype * ntypes;
|
||||||
@@ -246,10 +239,9 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
|||||||
const flt_t ztmp = x[i].z;
|
const flt_t ztmp = x[i].z;
|
||||||
const flt_t qtmp = q[i];
|
const flt_t qtmp = q[i];
|
||||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||||
if (EVFLAG) {
|
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
if (NEWTON_PAIR == 0)
|
||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
@@ -319,71 +311,72 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
|||||||
if (rsq < c_cuti[jtype].cutsq) {
|
if (rsq < c_cuti[jtype].cutsq) {
|
||||||
#endif
|
#endif
|
||||||
const flt_t fpair = (forcecoul + forcebuck) * r2inv;
|
const flt_t fpair = (forcecoul + forcebuck) * r2inv;
|
||||||
fxtmp += delx * fpair;
|
const flt_t fpx = fpair * delx;
|
||||||
fytmp += dely * fpair;
|
fxtmp += fpx;
|
||||||
fztmp += delz * fpair;
|
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||||
if (NEWTON_PAIR || j < nlocal) {
|
const flt_t fpy = fpair * dely;
|
||||||
f[j].x -= delx * fpair;
|
fytmp += fpy;
|
||||||
f[j].y -= dely * fpair;
|
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||||
f[j].z -= delz * fpair;
|
const flt_t fpz = fpair * delz;
|
||||||
}
|
fztmp += fpz;
|
||||||
|
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||||
|
|
||||||
if (EVFLAG) {
|
|
||||||
flt_t ev_pre = (flt_t)0;
|
|
||||||
if (NEWTON_PAIR || i < nlocal)
|
|
||||||
ev_pre += (flt_t)0.5;
|
|
||||||
if (NEWTON_PAIR || j < nlocal)
|
|
||||||
ev_pre += (flt_t)0.5;
|
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
sevdwl += ev_pre * evdwl;
|
sevdwl += evdwl;
|
||||||
secoul += ev_pre * ecoul;
|
secoul += ecoul;
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
if (NEWTON_PAIR || i < nlocal)
|
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||||
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
if (NEWTON_PAIR)
|
||||||
if (NEWTON_PAIR || j < nlocal)
|
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||||
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
|
}
|
||||||
}
|
if (NEWTON_PAIR == 0)
|
||||||
|
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // for jj
|
} // for jj
|
||||||
|
if (NEWTON_PAIR) {
|
||||||
f[i].x += fxtmp;
|
f[i].x += fxtmp;
|
||||||
f[i].y += fytmp;
|
f[i].y += fytmp;
|
||||||
f[i].z += fztmp;
|
f[i].z += fztmp;
|
||||||
|
} else {
|
||||||
IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
|
f[i].x = fxtmp;
|
||||||
|
f[i].y = fytmp;
|
||||||
|
f[i].z = fztmp;
|
||||||
|
}
|
||||||
|
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||||
} // for ii
|
} // for ii
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||||
if (vflag == 2)
|
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||||
#endif
|
ov4, ov5);
|
||||||
{
|
|
||||||
#if defined(_OPENMP)
|
|
||||||
#pragma omp barrier
|
|
||||||
#endif
|
|
||||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
|
||||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
|
||||||
x, offload);
|
|
||||||
}
|
|
||||||
} // end of omp parallel region
|
} // end of omp parallel region
|
||||||
if (EVFLAG) {
|
|
||||||
if (EFLAG) {
|
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||||
ev_global[0] = oevdwl;
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
ev_global[1] = oecoul;
|
|
||||||
}
|
if (EFLAG) {
|
||||||
if (vflag) {
|
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
||||||
ev_global[2] = ov0;
|
ev_global[0] = oevdwl;
|
||||||
ev_global[3] = ov1;
|
ev_global[1] = oecoul;
|
||||||
ev_global[4] = ov2;
|
}
|
||||||
ev_global[5] = ov3;
|
if (vflag) {
|
||||||
ev_global[6] = ov4;
|
if (NEWTON_PAIR == 0) {
|
||||||
ev_global[7] = ov5;
|
ov0 *= (acc_t)0.5;
|
||||||
|
ov1 *= (acc_t)0.5;
|
||||||
|
ov2 *= (acc_t)0.5;
|
||||||
|
ov3 *= (acc_t)0.5;
|
||||||
|
ov4 *= (acc_t)0.5;
|
||||||
|
ov5 *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
|
ev_global[2] = ov0;
|
||||||
|
ev_global[3] = ov1;
|
||||||
|
ev_global[4] = ov2;
|
||||||
|
ev_global[5] = ov3;
|
||||||
|
ev_global[6] = ov4;
|
||||||
|
ev_global[7] = ov5;
|
||||||
}
|
}
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
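A note on the kernel change above: with NEWTON_PAIR set the kernel keeps a half neighbor list and writes the reaction force to atom j; with it unset the list is full, every i-j pair is visited from both sides, only f[i] is accumulated, and the per-pair energy and virial sums are halved afterwards (the oevdwl and ov* scaling). A minimal stand-alone sketch of that idea, assuming plain arrays instead of the packed LAMMPS buffers (Atom and accumulate_pairs are hypothetical names, not part of the commit):

    #include <utility>
    #include <vector>

    struct Atom { double x, y, z, fx, fy, fz; };

    template <int NEWTON_PAIR>
    void accumulate_pairs(std::vector<Atom> &a,
                          const std::vector<std::pair<int,int> > &list,
                          double fpair) {
      for (std::size_t n = 0; n < list.size(); ++n) {
        Atom &ai = a[list[n].first];
        Atom &aj = a[list[n].second];
        const double fpx = fpair * (ai.x - aj.x);
        const double fpy = fpair * (ai.y - aj.y);
        const double fpz = fpair * (ai.z - aj.z);
        ai.fx += fpx; ai.fy += fpy; ai.fz += fpz;
        if (NEWTON_PAIR) {       // half list: apply the opposite force to j now
          aj.fx -= fpx; aj.fy -= fpy; aj.fz -= fpz;
        }                        // full list: the j,i entry supplies it instead
      }
    }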
@@ -395,7 +388,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
 else
 fix->stop_watch(TIME_HOST_PAIR);

-if (EVFLAG)
+if (EFLAG || vflag)
 fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
 else
 fix->add_result_array(f_start, 0, offload);
@@ -406,6 +399,10 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
 void PairBuckCoulCutIntel::init_style()
 {
 PairBuckCoulCut::init_style();
+if (force->newton_pair == 0) {
+neighbor->requests[neighbor->nrequest-1]->half = 0;
+neighbor->requests[neighbor->nrequest-1]->full = 1;
+}
 neighbor->requests[neighbor->nrequest-1]->intel = 1;

 int ifix = modify->find_fix("package_intel");
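The init_style() hunk above switches the neighbor-list request from half to full whenever newton_pair is off, because the rewritten kernels then never touch forces on atoms they do not own. The same lines again, with editorial comments (the comments are not in the source):

    if (force->newton_pair == 0) {
      // no reverse communication of pair forces, so every atom
      // must see all of its neighbors itself
      neighbor->requests[neighbor->nrequest-1]->half = 0;
      neighbor->requests[neighbor->nrequest-1]->full = 1;
    }
    neighbor->requests[neighbor->nrequest-1]->intel = 1;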
@@ -49,7 +49,7 @@ class PairBuckCoulCutIntel : public PairBuckCoulCut {
 void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers,
 const ForceConst<flt_t> &fc);

-template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void eval(const int offload, const int vflag,
 IntelBuffers<flt_t,acc_t> * buffers,
 const ForceConst<flt_t> &fc, const int astart, const int aend);
@@ -85,53 +85,47 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag,

 if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
 fix->start_watch(TIME_PACK);

+int packthreads;
+if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+else packthreads = 1;
 #if defined(_OPENMP)
-#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+#pragma omp parallel if(packthreads > 1)
 #endif
 {
 int ifrom, ito, tid;
 IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
-nthreads, sizeof(ATOM_T));
+packthreads, sizeof(ATOM_T));
 buffers->thr_pack(ifrom,ito,ago);
 }
 fix->stop_watch(TIME_PACK);
 }

-if (evflag || vflag_fdotr) {
-int ovflag = 0;
-if (vflag_fdotr) ovflag = 2;
-else if (vflag) ovflag = 1;
-if (eflag) {
-if (force->newton_pair) {
-eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
-} else {
-eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
-}
-} else {
-if (force->newton_pair) {
-eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
-} else {
-eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
-}
-}
-} else {
-if (force->newton_pair) {
-eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
-eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
-} else {
-eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
-eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
-}
-}
+int ovflag = 0;
+if (vflag_fdotr) ovflag = 2;
+else if (vflag) ovflag = 1;
+if (eflag) {
+if (force->newton_pair) {
+eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
+eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
+} else {
+eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
+eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
+}
+} else {
+if (force->newton_pair) {
+eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
+eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
+} else {
+eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
+eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
+}
+}
 }

 /* ---------------------------------------------------------------------- */

-template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
 IntelBuffers<flt_t,acc_t> *buffers,
 const ForceConst<flt_t> &fc,
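The compute() hunk above folds the runtime eflag and newton_pair settings into the template arguments of eval<EFLAG,NEWTON_PAIR>, so each combination is compiled as its own specialized loop with no per-iteration flag tests. A minimal sketch of the pattern, with hypothetical names (run_kernel, dispatch), not claiming to reproduce the actual eval() signature:

    template <int EFLAG, int NEWTON_PAIR>
    double run_kernel(int n) {
      double energy = 0.0;
      for (int i = 0; i < n; ++i) {
        // ... force work ...
        if (EFLAG) energy += 1.0;   // dead code when EFLAG == 0, removed by the compiler
        if (!NEWTON_PAIR) { /* skip updates to neighbor-owned atoms */ }
      }
      return energy;
    }

    double dispatch(int n, bool eflag, bool newton) {
      if (eflag) return newton ? run_kernel<1,1>(n) : run_kernel<1,0>(n);
      return newton ? run_kernel<0,1>(n) : run_kernel<0,0>(n);
    }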
@@ -170,9 +164,17 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
 const int ntypes = atom->ntypes + 1;
 const int eatom = this->eflag_atom;

+flt_t * _noalias const ccachex = buffers->get_ccachex();
+flt_t * _noalias const ccachey = buffers->get_ccachey();
+flt_t * _noalias const ccachez = buffers->get_ccachez();
+flt_t * _noalias const ccachew = buffers->get_ccachew();
+int * _noalias const ccachei = buffers->get_ccachei();
+int * _noalias const ccachej = buffers->get_ccachej();
+const int ccache_stride = _ccache_stride;
+
 // Determine how much data to transfer
 int x_size, q_size, f_stride, ev_size, separate_flag;
-IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
 buffers, offload, fix, separate_flag,
 x_size, q_size, ev_size, f_stride);
@@ -208,8 +210,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
 in(x:length(x_size) alloc_if(0) free_if(0)) \
 in(q:length(q_size) alloc_if(0) free_if(0)) \
 in(overflow:length(0) alloc_if(0) free_if(0)) \
+in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
+in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
 in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
-in(f_stride,nlocal,minlocal,separate_flag,offload) \
+in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \
 out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
 out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
 out(timer_compute:length(1) alloc_if(0) free_if(0)) \
@@ -224,27 +228,34 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
 f_stride, x, q);

 acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
-if (EVFLAG) {
-oevdwl = oecoul = (acc_t)0;
-if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
-}
+if (EFLAG) oevdwl = oecoul = (acc_t)0;
+if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;

 // loop over neighbors of my atoms
 #if defined(_OPENMP)
-#pragma omp parallel default(none) \
-shared(f_start,f_stride,nlocal,nall,minlocal) \
-reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+#pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
 #endif
 {
-int iifrom, iito, tid;
-IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+int iifrom, iip, iito, tid;
+IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
 iifrom += astart;
 iito += astart;

-FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
-memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+int foff;
+if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
+else foff = -minlocal;
+FORCE_T * _noalias const f = f_start + foff;
+if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));

-for (int i = iifrom; i < iito; ++i) {
+const int toffs = tid * ccache_stride;
+flt_t * _noalias const tdelx = ccachex + toffs;
+flt_t * _noalias const tdely = ccachey + toffs;
+flt_t * _noalias const tdelz = ccachez + toffs;
+flt_t * _noalias const trsq = ccachew + toffs;
+int * _noalias const tj = ccachei + toffs;
+int * _noalias const tjtype = ccachej + toffs;
+
+for (int i = iifrom; i < iito; i += iip) {
 const int itype = x[i].w;
 const int ptr_off = itype * ntypes;
 const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
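Two things change in the thread setup above: the iteration range comes from IP_PRE_omp_stride_id rather than IP_PRE_omp_range_id, and the private per-thread force block (the foff offset plus memset) is only used when NEWTON_PAIR is set, since without newton each thread writes only its own f[i] entries. Assuming the stride macro hands each thread the indices iifrom, iifrom+iip, iifrom+2*iip, and so on (the macro body is not part of this diff), the plain-OpenMP equivalent of that decomposition looks roughly like:

    #include <omp.h>

    void zero_strided(int inum, double *out) {
      #pragma omp parallel
      {
        const int tid = omp_get_thread_num();
        const int nthreads = omp_get_num_threads();
        const int iifrom = tid;     // first index owned by this thread
        const int iip = nthreads;   // stride between owned indices
        for (int i = iifrom; i < inum; i += iip)
          out[i] = 0.0;             // each i is touched by exactly one thread
      }
    }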
@ -262,85 +273,98 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
const flt_t ztmp = x[i].z;
|
const flt_t ztmp = x[i].z;
|
||||||
const flt_t qtmp = q[i];
|
const flt_t qtmp = q[i];
|
||||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||||
if (EVFLAG) {
|
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
if (NEWTON_PAIR == 0)
|
||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
if (vflag == 1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||||
|
|
||||||
|
int ej = 0;
|
||||||
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma ivdep
|
||||||
|
#endif
|
||||||
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
|
const int j = jlist[jj] & NEIGHMASK;
|
||||||
|
const flt_t delx = xtmp - x[j].x;
|
||||||
|
const flt_t dely = ytmp - x[j].y;
|
||||||
|
const flt_t delz = ztmp - x[j].z;
|
||||||
|
const int jtype = x[j].w;
|
||||||
|
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||||
|
|
||||||
|
if (rsq < c_forcei[jtype].cutsq) {
|
||||||
|
trsq[ej]=rsq;
|
||||||
|
tdelx[ej]=delx;
|
||||||
|
tdely[ej]=dely;
|
||||||
|
tdelz[ej]=delz;
|
||||||
|
tjtype[ej]=jtype;
|
||||||
|
tj[ej]=jlist[jj];
|
||||||
|
ej++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
|
||||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
for (int jj = 0; jj < ej; jj++) {
|
||||||
flt_t forcecoul, forcebuck, evdwl, ecoul;
|
flt_t forcecoul, forcebuck, evdwl, ecoul;
|
||||||
forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0;
|
forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0;
|
||||||
|
|
||||||
const int sbindex = jlist[jj] >> SBBITS & 3;
|
const int j = tj[jj] & NEIGHMASK;
|
||||||
const int j = jlist[jj] & NEIGHMASK;
|
const int sbindex = tj[jj] >> SBBITS & 3;
|
||||||
|
const int jtype = tjtype[jj];
|
||||||
const flt_t delx = xtmp - x[j].x;
|
const flt_t rsq = trsq[jj];
|
||||||
const flt_t dely = ytmp - x[j].y;
|
|
||||||
const flt_t delz = ztmp - x[j].z;
|
|
||||||
const int jtype = x[j].w;
|
|
||||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
|
||||||
const flt_t r2inv = (flt_t)1.0 / rsq;
|
const flt_t r2inv = (flt_t)1.0 / rsq;
|
||||||
const flt_t r = (flt_t)1.0 / sqrt(r2inv);
|
const flt_t r = (flt_t)1.0 / sqrt(r2inv);
|
||||||
|
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_ALLOW_TABLE
|
||||||
if (rsq < c_forcei[jtype].cutsq) {
|
if (!ncoultablebits || rsq <= tabinnersq) {
|
||||||
#endif
|
#endif
|
||||||
#ifdef INTEL_ALLOW_TABLE
|
const flt_t A1 = 0.254829592;
|
||||||
if (!ncoultablebits || rsq <= tabinnersq) {
|
const flt_t A2 = -0.284496736;
|
||||||
#endif
|
const flt_t A3 = 1.421413741;
|
||||||
const flt_t A1 = 0.254829592;
|
const flt_t A4 = -1.453152027;
|
||||||
const flt_t A2 = -0.284496736;
|
const flt_t A5 = 1.061405429;
|
||||||
const flt_t A3 = 1.421413741;
|
const flt_t EWALD_F = 1.12837917;
|
||||||
const flt_t A4 = -1.453152027;
|
const flt_t INV_EWALD_P = 1.0 / 0.3275911;
|
||||||
const flt_t A5 = 1.061405429;
|
|
||||||
const flt_t EWALD_F = 1.12837917;
|
|
||||||
const flt_t INV_EWALD_P = 1.0 / 0.3275911;
|
|
||||||
|
|
||||||
const flt_t grij = g_ewald * r;
|
const flt_t grij = g_ewald * r;
|
||||||
const flt_t expm2 = exp(-grij * grij);
|
const flt_t expm2 = exp(-grij * grij);
|
||||||
const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
|
const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
|
||||||
const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||||
const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
|
const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
|
||||||
forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||||||
if (EFLAG) ecoul = prefactor * erfc;
|
if (EFLAG) ecoul = prefactor * erfc;
|
||||||
|
|
||||||
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
|
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
|
||||||
|
prefactor;
|
||||||
|
forcecoul -= adjust;
|
||||||
|
if (EFLAG) ecoul -= adjust;
|
||||||
|
|
||||||
|
#ifdef INTEL_ALLOW_TABLE
|
||||||
|
} else {
|
||||||
|
float rsq_lookup = rsq;
|
||||||
|
const int itable = (__intel_castf32_u32(rsq_lookup) &
|
||||||
|
ncoulmask) >> ncoulshiftbits;
|
||||||
|
const flt_t fraction = (rsq_lookup - table[itable].r) *
|
||||||
|
table[itable].dr;
|
||||||
|
|
||||||
|
const flt_t tablet = table[itable].f +
|
||||||
|
fraction * table[itable].df;
|
||||||
|
forcecoul = qtmp * q[j] * tablet;
|
||||||
|
if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
|
||||||
|
fraction * detable[itable]);
|
||||||
|
if (sbindex) {
|
||||||
|
const flt_t table2 = ctable[itable] +
|
||||||
|
fraction * dctable[itable];
|
||||||
|
const flt_t prefactor = qtmp * q[j] * table2;
|
||||||
|
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
|
||||||
prefactor;
|
prefactor;
|
||||||
forcecoul -= adjust;
|
forcecoul -= adjust;
|
||||||
if (EFLAG) ecoul -= adjust;
|
if (EFLAG) ecoul -= adjust;
|
||||||
|
|
||||||
#ifdef INTEL_ALLOW_TABLE
|
|
||||||
} else {
|
|
||||||
float rsq_lookup = rsq;
|
|
||||||
const int itable = (__intel_castf32_u32(rsq_lookup) &
|
|
||||||
ncoulmask) >> ncoulshiftbits;
|
|
||||||
const flt_t fraction = (rsq_lookup - table[itable].r) *
|
|
||||||
table[itable].dr;
|
|
||||||
|
|
||||||
const flt_t tablet = table[itable].f +
|
|
||||||
fraction * table[itable].df;
|
|
||||||
forcecoul = qtmp * q[j] * tablet;
|
|
||||||
if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
|
|
||||||
fraction * detable[itable]);
|
|
||||||
if (sbindex) {
|
|
||||||
const flt_t table2 = ctable[itable] +
|
|
||||||
fraction * dctable[itable];
|
|
||||||
const flt_t prefactor = qtmp * q[j] * table2;
|
|
||||||
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
|
|
||||||
prefactor;
|
|
||||||
forcecoul -= adjust;
|
|
||||||
if (EFLAG) ecoul -= adjust;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
#ifdef INTEL_VMASK
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
if (rsq < c_forcei[jtype].cut_ljsq) {
|
if (rsq < c_forcei[jtype].cut_ljsq) {
|
||||||
@ -361,80 +385,74 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
if (rsq > c_forcei[jtype].cutsq)
|
|
||||||
{ forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
|
|
||||||
if (rsq > c_forcei[jtype].cut_ljsq)
|
if (rsq > c_forcei[jtype].cut_ljsq)
|
||||||
{ forcebuck = (flt_t)0.0; evdwl = (flt_t)0.0; }
|
{ forcebuck = (flt_t)0.0; evdwl = (flt_t)0.0; }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef INTEL_VMASK
|
const flt_t fpair = (forcecoul + forcebuck) * r2inv;
|
||||||
if (rsq < c_forcei[jtype].cutsq) {
|
const flt_t fpx = fpair * tdelx[jj];
|
||||||
#endif
|
fxtmp += fpx;
|
||||||
const flt_t fpair = (forcecoul + forcebuck) * r2inv;
|
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||||
fxtmp += delx * fpair;
|
const flt_t fpy = fpair * tdely[jj];
|
||||||
fytmp += dely * fpair;
|
fytmp += fpy;
|
||||||
fztmp += delz * fpair;
|
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||||
if (NEWTON_PAIR || j < nlocal) {
|
const flt_t fpz = fpair * tdelz[jj];
|
||||||
f[j].x -= delx * fpair;
|
fztmp += fpz;
|
||||||
f[j].y -= dely * fpair;
|
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||||
f[j].z -= delz * fpair;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG) {
|
||||||
flt_t ev_pre = (flt_t)0;
|
sevdwl += evdwl;
|
||||||
if (NEWTON_PAIR || i < nlocal)
|
secoul += ecoul;
|
||||||
ev_pre += (flt_t)0.5;
|
if (eatom) {
|
||||||
if (NEWTON_PAIR || j < nlocal)
|
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||||
ev_pre += (flt_t)0.5;
|
if (NEWTON_PAIR)
|
||||||
|
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||||
if (EFLAG) {
|
|
||||||
sevdwl += ev_pre * evdwl;
|
|
||||||
secoul += ev_pre * ecoul;
|
|
||||||
if (eatom) {
|
|
||||||
if (NEWTON_PAIR || i < nlocal)
|
|
||||||
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
|
||||||
if (NEWTON_PAIR || j < nlocal)
|
|
||||||
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
|
|
||||||
}
|
}
|
||||||
#ifdef INTEL_VMASK
|
}
|
||||||
}
|
if (NEWTON_PAIR == 0)
|
||||||
#endif
|
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
|
||||||
|
fpx, fpy, fpz);
|
||||||
} // for jj
|
} // for jj
|
||||||
|
if (NEWTON_PAIR) {
|
||||||
f[i].x += fxtmp;
|
f[i].x += fxtmp;
|
||||||
f[i].y += fytmp;
|
f[i].y += fytmp;
|
||||||
f[i].z += fztmp;
|
f[i].z += fztmp;
|
||||||
IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
|
} else {
|
||||||
|
f[i].x = fxtmp;
|
||||||
|
f[i].y = fytmp;
|
||||||
|
f[i].z = fztmp;
|
||||||
|
}
|
||||||
|
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||||
} // for ii
|
} // for ii
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||||
if (vflag == 2)
|
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||||
#endif
|
ov4, ov5);
|
||||||
{
|
|
||||||
#if defined(_OPENMP)
|
|
||||||
#pragma omp barrier
|
|
||||||
#endif
|
|
||||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
|
||||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
|
||||||
x, offload);
|
|
||||||
}
|
|
||||||
} // end of omp parallel region
|
} // end of omp parallel region
|
||||||
if (EVFLAG) {
|
|
||||||
if (EFLAG) {
|
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||||
ev_global[0] = oevdwl;
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
ev_global[1] = oecoul;
|
|
||||||
}
|
if (EFLAG) {
|
||||||
if (vflag) {
|
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
||||||
ev_global[2] = ov0;
|
ev_global[0] = oevdwl;
|
||||||
ev_global[3] = ov1;
|
ev_global[1] = oecoul;
|
||||||
ev_global[4] = ov2;
|
}
|
||||||
ev_global[5] = ov3;
|
if (vflag) {
|
||||||
ev_global[6] = ov4;
|
if (NEWTON_PAIR == 0) {
|
||||||
ev_global[7] = ov5;
|
ov0 *= (acc_t)0.5;
|
||||||
|
ov1 *= (acc_t)0.5;
|
||||||
|
ov2 *= (acc_t)0.5;
|
||||||
|
ov3 *= (acc_t)0.5;
|
||||||
|
ov4 *= (acc_t)0.5;
|
||||||
|
ov5 *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
|
ev_global[2] = ov0;
|
||||||
|
ev_global[3] = ov1;
|
||||||
|
ev_global[4] = ov2;
|
||||||
|
ev_global[5] = ov3;
|
||||||
|
ev_global[6] = ov4;
|
||||||
|
ev_global[7] = ov5;
|
||||||
}
|
}
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
@@ -446,7 +464,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
 else
 fix->stop_watch(TIME_HOST_PAIR);

-if (EVFLAG)
+if (EFLAG || vflag)
 fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
 else
 fix->add_result_array(f_start, 0, offload);
@@ -457,6 +475,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
 void PairBuckCoulLongIntel::init_style()
 {
 PairBuckCoulLong::init_style();
+if (force->newton_pair == 0) {
+neighbor->requests[neighbor->nrequest-1]->half = 0;
+neighbor->requests[neighbor->nrequest-1]->full = 1;
+}
 neighbor->requests[neighbor->nrequest-1]->intel = 1;

 int ifix = modify->find_fix("package_intel");
@@ -484,6 +506,13 @@ template <class flt_t, class acc_t>
 void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
 IntelBuffers<flt_t,acc_t> *buffers)
 {
+int off_ccache = 0;
+#ifdef _LMP_INTEL_OFFLOAD
+if (_cop >= 0) off_ccache = 1;
+#endif
+buffers->grow_ccache(off_ccache, comm->nthreads, 1);
+_ccache_stride = buffers->ccache_stride();
+
 int tp1 = atom->ntypes + 1;
 int ntable = 1;
 if (ncoultablebits)
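pack_force_const() now grows the per-thread coordinate cache ("ccache") and records its stride; the eval() hunks use one stride-sized slice per thread to stage the neighbors that pass the cutoff test before a second, vectorized pass. A rough sketch of that layout, with hypothetical names (NeighborCache, slice_delx) and none of the real alignment handling:

    #include <vector>

    struct NeighborCache {
      int stride;                  // capacity per thread, padded for SIMD alignment in the real code
      std::vector<float> delx, dely, delz, rsq;
      std::vector<int> j, jtype;
      NeighborCache(int nthreads, int max_nbors) : stride(max_nbors) {
        const std::size_t n = static_cast<std::size_t>(stride) * nthreads;
        delx.resize(n); dely.resize(n); delz.resize(n); rsq.resize(n);
        j.resize(n); jtype.resize(n);
      }
      // thread tid works in the slice [tid*stride, (tid+1)*stride)
      float *slice_delx(int tid) { return delx.data() + static_cast<std::size_t>(tid) * stride; }
    };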
@@ -518,6 +547,9 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,

 for (int i = 0; i < tp1; i++) {
 for (int j = 0; j < tp1; j++) {
+if (cutsq[i][j] < cut_ljsq[i][j])
+error->all(FLERR,
+"Intel variant of lj/buck/coul/long expects lj cutoff<=coulombic");
 fc.c_force[i][j].cutsq = cutsq[i][j];
 fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j];
 fc.c_force[i][j].buck1 = buck1[i][j];
@@ -40,7 +40,7 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong {

 private:
 FixIntel *fix;
-int _cop, _lrt;
+int _cop, _lrt, _ccache_stride;

 template <class flt_t> class ForceConst;

@@ -48,7 +48,7 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong {
 void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers,
 const ForceConst<flt_t> &fc);

-template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void eval(const int offload, const int vflag,
 IntelBuffers<flt_t,acc_t> * buffers,
 const ForceConst<flt_t> &fc, const int astart, const int aend);
@ -78,57 +78,51 @@ void PairBuckIntel::compute(int eflag, int vflag,
|
|||||||
|
|
||||||
if (ago != 0 && fix->separate_buffers() == 0) {
|
if (ago != 0 && fix->separate_buffers() == 0) {
|
||||||
fix->start_watch(TIME_PACK);
|
fix->start_watch(TIME_PACK);
|
||||||
|
|
||||||
|
int packthreads;
|
||||||
|
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||||
|
else packthreads = 1;
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
#pragma omp parallel if(packthreads > 1)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||||
nthreads, sizeof(ATOM_T));
|
packthreads, sizeof(ATOM_T));
|
||||||
buffers->thr_pack(ifrom,ito,ago);
|
buffers->thr_pack(ifrom,ito,ago);
|
||||||
}
|
}
|
||||||
fix->stop_watch(TIME_PACK);
|
fix->stop_watch(TIME_PACK);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (evflag || vflag_fdotr) {
|
int ovflag = 0;
|
||||||
int ovflag = 0;
|
if (vflag_fdotr) ovflag = 2;
|
||||||
if (vflag_fdotr) ovflag = 2;
|
else if (vflag) ovflag = 1;
|
||||||
else if (vflag) ovflag = 1;
|
if (eflag) {
|
||||||
if (eflag) {
|
if (force->newton_pair) {
|
||||||
if (force->newton_pair) {
|
eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
} else {
|
|
||||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
} else {
|
|
||||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
|
eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
|
eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||||
void PairBuckIntel::eval(const int offload, const int vflag,
|
void PairBuckIntel::eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc,
|
const ForceConst<flt_t> &fc,
|
||||||
const int astart, const int aend)
|
const int astart, const int aend)
|
||||||
{
|
{
|
||||||
const int inum = aend - astart;
|
const int inum = aend - astart;
|
||||||
if (inum == 0) return;
|
if (inum == 0) return;
|
||||||
@@ -152,7 +146,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,

 // Determine how much data to transfer
 int x_size, q_size, f_stride, ev_size, separate_flag;
-IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
 buffers, offload, fix, separate_flag,
 x_size, q_size, ev_size, f_stride);

||||||
@ -192,27 +186,26 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
|||||||
f_stride, x, 0);
|
f_stride, x, 0);
|
||||||
|
|
||||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EVFLAG) {
|
if (EFLAG) oevdwl = (acc_t)0;
|
||||||
oevdwl = (acc_t)0;
|
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// loop over neighbors of my atoms
|
// loop over neighbors of my atoms
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
|
||||||
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int iifrom, iito, tid;
|
int iifrom, iip, iito, tid;
|
||||||
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
|
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||||
iifrom += astart;
|
iifrom += astart;
|
||||||
iito += astart;
|
iito += astart;
|
||||||
|
|
||||||
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
|
int foff;
|
||||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
|
||||||
|
else foff = -minlocal;
|
||||||
|
FORCE_T * _noalias const f = f_start + foff;
|
||||||
|
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
for (int i = iifrom; i < iito; ++i) {
|
for (int i = iifrom; i < iito; i += iip) {
|
||||||
const int itype = x[i].w;
|
const int itype = x[i].w;
|
||||||
|
|
||||||
const int ptr_off = itype * ntypes;
|
const int ptr_off = itype * ntypes;
|
||||||
@@ -228,10 +221,9 @@ void PairBuckIntel::eval(const int offload, const int vflag,
 const flt_t ytmp = x[i].y;
 const flt_t ztmp = x[i].z;
 fxtmp = fytmp = fztmp = (acc_t)0;
-if (EVFLAG) {
-if (EFLAG) fwtmp = sevdwl = (acc_t)0;
+if (EFLAG) fwtmp = sevdwl = (acc_t)0;
+if (NEWTON_PAIR == 0)
 if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
-}

 #if defined(LMP_SIMD_COMPILER)
 #pragma vector aligned
||||||
@ -284,69 +276,70 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
|||||||
evdwl *= factor_lj;
|
evdwl *= factor_lj;
|
||||||
}
|
}
|
||||||
const flt_t fpair = forcebuck * r2inv;
|
const flt_t fpair = forcebuck * r2inv;
|
||||||
fxtmp += delx * fpair;
|
const flt_t fpx = fpair * delx;
|
||||||
fytmp += dely * fpair;
|
fxtmp += fpx;
|
||||||
fztmp += delz * fpair;
|
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||||
if (NEWTON_PAIR || j < nlocal) {
|
const flt_t fpy = fpair * dely;
|
||||||
f[j].x -= delx * fpair;
|
fytmp += fpy;
|
||||||
f[j].y -= dely * fpair;
|
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||||
f[j].z -= delz * fpair;
|
const flt_t fpz = fpair * delz;
|
||||||
}
|
fztmp += fpz;
|
||||||
|
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG) {
|
||||||
flt_t ev_pre = (flt_t)0;
|
sevdwl += evdwl;
|
||||||
if (NEWTON_PAIR || i < nlocal)
|
if (eatom) {
|
||||||
ev_pre += (flt_t)0.5;
|
fwtmp += (flt_t)0.5 * evdwl;
|
||||||
if (NEWTON_PAIR || j < nlocal)
|
if (NEWTON_PAIR)
|
||||||
ev_pre += (flt_t)0.5;
|
f[j].w += (flt_t)0.5 * evdwl;
|
||||||
|
}
|
||||||
if (EFLAG) {
|
}
|
||||||
sevdwl += ev_pre * evdwl;
|
if (NEWTON_PAIR == 0)
|
||||||
if (eatom) {
|
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
|
||||||
if (NEWTON_PAIR || i < nlocal)
|
|
||||||
fwtmp += (flt_t)0.5 * evdwl;
|
|
||||||
if (NEWTON_PAIR || j < nlocal)
|
|
||||||
f[j].w += (flt_t)0.5 * evdwl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
|
|
||||||
}
|
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
} // for jj
|
} // for jj
|
||||||
|
if (NEWTON_PAIR) {
|
||||||
f[i].x += fxtmp;
|
f[i].x += fxtmp;
|
||||||
f[i].y += fytmp;
|
f[i].y += fytmp;
|
||||||
f[i].z += fztmp;
|
f[i].z += fztmp;
|
||||||
IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
|
} else {
|
||||||
|
f[i].x = fxtmp;
|
||||||
|
f[i].y = fytmp;
|
||||||
|
f[i].z = fztmp;
|
||||||
|
}
|
||||||
|
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||||
} // for ii
|
} // for ii
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||||
if (vflag == 2)
|
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||||
#endif
|
ov4, ov5);
|
||||||
{
|
|
||||||
#if defined(_OPENMP)
|
|
||||||
#pragma omp barrier
|
|
||||||
#endif
|
|
||||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
|
||||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
|
||||||
x, offload);
|
|
||||||
}
|
|
||||||
} // end of omp parallel region
|
} // end of omp parallel region
|
||||||
if (EVFLAG) {
|
|
||||||
if (EFLAG) {
|
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||||
ev_global[0] = oevdwl;
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
ev_global[1] = (acc_t)0;
|
|
||||||
}
|
if (EFLAG) {
|
||||||
if (vflag) {
|
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
||||||
ev_global[2] = ov0;
|
ev_global[0] = oevdwl;
|
||||||
ev_global[3] = ov1;
|
ev_global[1] = (acc_t)0;
|
||||||
ev_global[4] = ov2;
|
}
|
||||||
ev_global[5] = ov3;
|
if (vflag) {
|
||||||
ev_global[6] = ov4;
|
if (NEWTON_PAIR == 0) {
|
||||||
ev_global[7] = ov5;
|
ov0 *= (acc_t)0.5;
|
||||||
|
ov1 *= (acc_t)0.5;
|
||||||
|
ov2 *= (acc_t)0.5;
|
||||||
|
ov3 *= (acc_t)0.5;
|
||||||
|
ov4 *= (acc_t)0.5;
|
||||||
|
ov5 *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
|
ev_global[2] = ov0;
|
||||||
|
ev_global[3] = ov1;
|
||||||
|
ev_global[4] = ov2;
|
||||||
|
ev_global[5] = ov3;
|
||||||
|
ev_global[6] = ov4;
|
||||||
|
ev_global[7] = ov5;
|
||||||
}
|
}
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
@@ -358,7 +351,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
 else
 fix->stop_watch(TIME_HOST_PAIR);

-if (EVFLAG)
+if (EFLAG || vflag)
 fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
 else
 fix->add_result_array(f_start, 0, offload);
||||||
@@ -367,6 +360,10 @@ void PairBuckIntel::eval(const int offload, const int vflag,
 void PairBuckIntel::init_style()
 {
 PairBuck::init_style();
+if (force->newton_pair == 0) {
+neighbor->requests[neighbor->nrequest-1]->half = 0;
+neighbor->requests[neighbor->nrequest-1]->full = 1;
+}
 neighbor->requests[neighbor->nrequest-1]->intel = 1;

 int ifix = modify->find_fix("package_intel");
|||||||
@@ -48,7 +48,7 @@ private:
 void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers,
 const ForceConst<flt_t> &fc);

-template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void eval(const int offload, const int vflag,
 IntelBuffers<flt_t,acc_t> * buffers,
 const ForceConst<flt_t> &fc, const int astart, const int aend);
|||||||
@ -90,78 +90,58 @@ void PairEAMIntel::compute(int eflag, int vflag,
|
|||||||
if (ago != 0 && fix->separate_buffers() == 0) {
|
if (ago != 0 && fix->separate_buffers() == 0) {
|
||||||
fix->start_watch(TIME_PACK);
|
fix->start_watch(TIME_PACK);
|
||||||
|
|
||||||
|
int packthreads;
|
||||||
|
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||||
|
else packthreads = 1;
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
#pragma omp parallel if(packthreads > 1)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||||
nthreads, sizeof(ATOM_T));
|
packthreads, sizeof(ATOM_T));
|
||||||
buffers->thr_pack(ifrom,ito,ago);
|
buffers->thr_pack(ifrom,ito,ago);
|
||||||
}
|
}
|
||||||
fix->stop_watch(TIME_PACK);
|
fix->stop_watch(TIME_PACK);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ovflag = 0;
|
||||||
|
if (vflag_fdotr) ovflag = 2;
|
||||||
|
else if (vflag) ovflag = 1;
|
||||||
if (_onetype) {
|
if (_onetype) {
|
||||||
if (evflag || vflag_fdotr) {
|
if (eflag) {
|
||||||
int ovflag = 0;
|
if (force->newton_pair) {
|
||||||
if (vflag_fdotr) ovflag = 2;
|
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
else if (vflag) ovflag = 1;
|
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
if (eflag) {
|
|
||||||
if (force->newton_pair) {
|
|
||||||
eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
} else {
|
|
||||||
eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
eval<1,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
} else {
|
|
||||||
eval<1,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<1,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum);
|
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum);
|
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (evflag || vflag_fdotr) {
|
if (eflag) {
|
||||||
int ovflag = 0;
|
if (force->newton_pair) {
|
||||||
if (vflag_fdotr) ovflag = 2;
|
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
else if (vflag) ovflag = 1;
|
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
if (eflag) {
|
|
||||||
if (force->newton_pair) {
|
|
||||||
eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
} else {
|
|
||||||
eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
eval<0,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
} else {
|
|
||||||
eval<0,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<0,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum);
|
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum);
|
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -169,8 +149,7 @@ void PairEAMIntel::compute(int eflag, int vflag,

 /* ---------------------------------------------------------------------- */

-template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t,
-class acc_t>
+template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairEAMIntel::eval(const int offload, const int vflag,
 IntelBuffers<flt_t,acc_t> *buffers,
 const ForceConst<flt_t> &fc,
||||||
@@ -186,7 +165,10 @@ void PairEAMIntel::eval(const int offload, const int vflag,
 nmax = atom->nmax;
 int edge = (nmax * sizeof(acc_t)) % INTEL_DATA_ALIGN;
 if (edge) nmax += (INTEL_DATA_ALIGN - edge) / sizeof(acc_t);
-memory->create(rho,nmax*comm->nthreads,"pair:rho");
+if (NEWTON_PAIR)
+memory->create(rho,nmax*comm->nthreads,"pair:rho");
+else
+memory->create(rho,nmax,"pair:rho");
 memory->create(fp,nmax,"pair:fp");
 // Use single precision allocation for single/mixed mode
 // Keep double version for single and swap_eam
||||||
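For eam/intel the electron-density scratch array follows the same newton split as the force buffers: with newton on every thread accumulates density contributions into its own copy of rho, which is reduced afterwards, while with newton off a single copy suffices because each thread only writes the rho entries of the atoms it iterates over itself. The allocation lines from the hunk above, with editorial comments:

    if (NEWTON_PAIR)
      memory->create(rho, nmax*comm->nthreads, "pair:rho");  // one copy per thread
    else
      memory->create(rho, nmax, "pair:rho");                 // threads write disjoint rows
    memory->create(fp, nmax, "pair:fp");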
@ -222,9 +204,17 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
|||||||
const int ntypes = atom->ntypes + 1;
|
const int ntypes = atom->ntypes + 1;
|
||||||
const int eatom = this->eflag_atom;
|
const int eatom = this->eflag_atom;
|
||||||
|
|
||||||
|
flt_t * _noalias const ccachex = buffers->get_ccachex();
|
||||||
|
flt_t * _noalias const ccachey = buffers->get_ccachey();
|
||||||
|
flt_t * _noalias const ccachez = buffers->get_ccachez();
|
||||||
|
flt_t * _noalias const ccachew = buffers->get_ccachew();
|
||||||
|
int * _noalias const ccachei = buffers->get_ccachei();
|
||||||
|
int * _noalias const ccachej = buffers->get_ccachej();
|
||||||
|
const int ccache_stride = _ccache_stride;
|
||||||
|
|
||||||
// Determine how much data to transfer
|
// Determine how much data to transfer
|
||||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
|
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||||
buffers, offload, fix, separate_flag,
|
buffers, offload, fix, separate_flag,
|
||||||
x_size, q_size, ev_size, f_stride);
|
x_size, q_size, ev_size, f_stride);
|
||||||
|
|
||||||
@ -252,16 +242,12 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
|||||||
f_stride, x, 0);
|
f_stride, x, 0);
|
||||||
|
|
||||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EVFLAG) {
|
if (EFLAG) oevdwl = (acc_t)0;
|
||||||
oevdwl = (acc_t)0;
|
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// loop over neighbors of my atoms
|
// loop over neighbors of my atoms
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
shared(fp_f, f_start,f_stride,nlocal,nall,minlocal) \
|
|
||||||
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int iifrom, iito, tid;
|
int iifrom, iito, tid;
|
||||||
@ -270,12 +256,25 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
|||||||
iifrom += astart;
|
iifrom += astart;
|
||||||
iito += astart;
|
iito += astart;
|
||||||
|
|
||||||
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
|
int foff;
|
||||||
double * _noalias const trho = rho + tid*nmax;
|
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
|
||||||
if (NEWTON_PAIR)
|
else foff = -minlocal;
|
||||||
|
FORCE_T * _noalias const f = f_start + foff;
|
||||||
|
if (NEWTON_PAIR) foff = tid * nmax;
|
||||||
|
else foff = 0;
|
||||||
|
double * _noalias const trho = rho + foff;
|
||||||
|
if (NEWTON_PAIR) {
|
||||||
|
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||||
memset(trho, 0, nall * sizeof(double));
|
memset(trho, 0, nall * sizeof(double));
|
||||||
else
|
}
|
||||||
memset(trho, 0, nlocal * sizeof(double));
|
|
||||||
|
const int toffs = tid * ccache_stride;
|
||||||
|
flt_t * _noalias const tdelx = ccachex + toffs;
|
||||||
|
flt_t * _noalias const tdely = ccachey + toffs;
|
||||||
|
flt_t * _noalias const tdelz = ccachez + toffs;
|
||||||
|
flt_t * _noalias const trsq = ccachew + toffs;
|
||||||
|
int * _noalias const tj = ccachei + toffs;
|
||||||
|
int * _noalias const tjtype = ccachej + toffs;
|
||||||
|
|
||||||
flt_t oscale;
|
flt_t oscale;
|
||||||
int rhor_joff, frho_ioff;
|
int rhor_joff, frho_ioff;
|
||||||
@ -300,53 +299,67 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
|||||||
const flt_t ztmp = x[i].z;
|
const flt_t ztmp = x[i].z;
|
||||||
|
|
||||||
acc_t rhoi = (acc_t)0.0;
|
acc_t rhoi = (acc_t)0.0;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
int ej = 0;
|
||||||
#pragma vector aligned
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma simd reduction(+:rhoi)
|
#pragma vector aligned
|
||||||
|
#pragma ivdep
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
int j, jtype;
|
const int j = jlist[jj] & NEIGHMASK;
|
||||||
j = jlist[jj] & NEIGHMASK;
|
|
||||||
|
|
||||||
const flt_t delx = xtmp - x[j].x;
|
const flt_t delx = xtmp - x[j].x;
|
||||||
const flt_t dely = ytmp - x[j].y;
|
const flt_t dely = ytmp - x[j].y;
|
||||||
const flt_t delz = ztmp - x[j].z;
|
const flt_t delz = ztmp - x[j].z;
|
||||||
const flt_t rsq = delx*delx + dely*dely + delz*delz;
|
const flt_t rsq = delx*delx + dely*dely + delz*delz;
|
||||||
|
|
||||||
if (rsq < fcutforcesq) {
|
if (rsq < fcutforcesq) {
|
||||||
if (!ONETYPE) jtype = x[j].w;
|
trsq[ej]=rsq;
|
||||||
flt_t p = sqrt(rsq)*frdr + (flt_t)1.0;
|
if (!ONETYPE) tjtype[ej]=x[j].w;
|
||||||
int m = static_cast<int> (p);
|
tj[ej]=jlist[jj];
|
||||||
m = MIN(m,nr-1);
|
ej++;
|
||||||
p -= m;
|
|
||||||
p = MIN(p,(flt_t)1.0);
|
|
||||||
if (!ONETYPE)
|
|
||||||
rhor_joff = rhor_ioff + jtype * jstride;
|
|
||||||
const int joff = rhor_joff + m;
|
|
||||||
flt_t ra;
|
|
||||||
ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p +
|
|
||||||
rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d;
|
|
||||||
rhoi += ra;
|
|
||||||
if (NEWTON_PAIR || j < nlocal) {
|
|
||||||
if (!ONETYPE) {
|
|
||||||
const int ioff = jtype * istride + itype * jstride + m;
|
|
||||||
ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p +
|
|
||||||
rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d;
|
|
||||||
}
|
|
||||||
trho[j] += ra;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma simd reduction(+:rhoi)
|
||||||
|
#endif
|
||||||
|
for (int jj = 0; jj < ej; jj++) {
|
||||||
|
int jtype;
|
||||||
|
const int j = tj[jj] & NEIGHMASK;
|
||||||
|
if (!ONETYPE) jtype = tjtype[jj];
|
||||||
|
const flt_t rsq = trsq[jj];
|
||||||
|
flt_t p = sqrt(rsq)*frdr + (flt_t)1.0;
|
||||||
|
int m = static_cast<int> (p);
|
||||||
|
m = MIN(m,nr-1);
|
||||||
|
p -= m;
|
||||||
|
p = MIN(p,(flt_t)1.0);
|
||||||
|
if (!ONETYPE)
|
||||||
|
rhor_joff = rhor_ioff + jtype * jstride;
|
||||||
|
const int joff = rhor_joff + m;
|
||||||
|
flt_t ra;
|
||||||
|
ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p +
|
||||||
|
rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d;
|
||||||
|
rhoi += ra;
|
||||||
|
if (NEWTON_PAIR) {
|
||||||
|
if (!ONETYPE) {
|
||||||
|
const int ioff = jtype * istride + itype * jstride + m;
|
||||||
|
ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p +
|
||||||
|
rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d;
|
||||||
|
}
|
||||||
|
trho[j] += ra;
|
||||||
|
}
|
||||||
} // for jj
|
} // for jj
|
||||||
trho[i] += rhoi;
|
if (NEWTON_PAIR)
|
||||||
|
trho[i] += rhoi;
|
||||||
|
else
|
||||||
|
trho[i] = rhoi;
|
||||||
} // for i
|
} // for i
|
||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
if (nthreads > 1) {
|
if (NEWTON_PAIR && nthreads > 1) {
|
||||||
#pragma omp barrier
|
#pragma omp barrier
|
||||||
if (tid == 0) {
|
if (tid == 0) {
|
||||||
int rcount;
|
const int rcount = nall;
|
||||||
if (NEWTON_PAIR) rcount = nall;
|
|
||||||
else rcount = nlocal;
|
|
||||||
if (nthreads == 2) {
|
if (nthreads == 2) {
|
||||||
double *trho2 = rho + nmax;
|
double *trho2 = rho + nmax;
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
@@ -431,10 +444,9 @@ void PairEAMIntel::eval(const int offload, const int vflag,
 #pragma omp barrier
 #endif

-if (tid == 0) {
+if (tid == 0)
 comm->forward_comm_pair(this);
-memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
-} else
+if (NEWTON_PAIR)
 memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));

 #if defined(_OPENMP)
||||||
@ -462,124 +474,142 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
|||||||
const flt_t ytmp = x[i].y;
|
const flt_t ytmp = x[i].y;
|
||||||
const flt_t ztmp = x[i].z;
|
const flt_t ztmp = x[i].z;
|
||||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||||
if (EVFLAG) {
|
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
if (NEWTON_PAIR == 0)
|
||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
int ej = 0;
|
||||||
#pragma vector aligned
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
#pragma vector aligned
|
||||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
#pragma ivdep
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
int j, jtype;
|
const int j = jlist[jj] & NEIGHMASK;
|
||||||
j = jlist[jj] & NEIGHMASK;
|
|
||||||
|
|
||||||
const flt_t delx = xtmp - x[j].x;
|
const flt_t delx = xtmp - x[j].x;
|
||||||
const flt_t dely = ytmp - x[j].y;
|
const flt_t dely = ytmp - x[j].y;
|
||||||
const flt_t delz = ztmp - x[j].z;
|
const flt_t delz = ztmp - x[j].z;
|
||||||
const flt_t rsq = delx*delx + dely*dely + delz*delz;
|
const flt_t rsq = delx*delx + dely*dely + delz*delz;
|
||||||
|
|
||||||
|
|
||||||
if (rsq < fcutforcesq) {
|
if (rsq < fcutforcesq) {
|
||||||
if (!ONETYPE) jtype = x[j].w;
|
trsq[ej]=rsq;
|
||||||
const flt_t r = sqrt(rsq);
|
tdelx[ej]=delx;
|
||||||
flt_t p = r*frdr + (flt_t)1.0;
|
tdely[ej]=dely;
|
||||||
int m = static_cast<int> (p);
|
tdelz[ej]=delz;
|
||||||
m = MIN(m,nr-1);
|
if (!ONETYPE) tjtype[ej]=x[j].w;
|
||||||
p -= m;
|
tj[ej]=jlist[jj];
|
||||||
p = MIN(p,(flt_t)1.0);
|
ej++;
|
||||||
if (!ONETYPE)
|
}
|
||||||
rhor_joff = rhor_ioff + jtype * jstride;
|
}
|
||||||
const int joff = rhor_joff + m;
|
|
||||||
const flt_t rhojp = (rhor_spline_f[joff].a*p +
|
|
||||||
rhor_spline_f[joff].b)*p +
|
|
||||||
rhor_spline_f[joff].c;
|
|
||||||
flt_t rhoip;
|
|
||||||
if (!ONETYPE) {
|
|
||||||
const int ioff = jtype * istride + itype * jstride + m;
|
|
||||||
rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p +
|
|
||||||
rhor_spline_f[ioff].c;
|
|
||||||
} else
|
|
||||||
rhoip = rhojp;
|
|
||||||
const flt_t z2p = (z2r_spline_t[joff].a*p +
|
|
||||||
z2r_spline_t[joff].b)*p +
|
|
||||||
z2r_spline_t[joff].c;
|
|
||||||
const flt_t z2 = ((z2r_spline_t[joff].d*p +
|
|
||||||
z2r_spline_t[joff].e)*p +
|
|
||||||
z2r_spline_t[joff].f)*p +
|
|
||||||
z2r_spline_t[joff].g;
|
|
||||||
|
|
||||||
const flt_t recip = (flt_t)1.0/r;
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
const flt_t phi = z2*recip;
|
#pragma vector aligned
|
||||||
const flt_t phip = z2p*recip - phi*recip;
|
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||||
const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip;
|
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||||
if (!ONETYPE)
|
#endif
|
||||||
oscale = scale_fi[jtype];
|
for (int jj = 0; jj < ej; jj++) {
|
||||||
const flt_t fpair = -oscale*psip*recip;
|
int jtype;
|
||||||
|
const int j = tj[jj] & NEIGHMASK;
|
||||||
|
if (!ONETYPE) jtype = tjtype[jj];
|
||||||
|
const flt_t rsq = trsq[jj];
|
||||||
|
const flt_t r = sqrt(rsq);
|
||||||
|
flt_t p = r*frdr + (flt_t)1.0;
|
||||||
|
int m = static_cast<int> (p);
|
||||||
|
m = MIN(m,nr-1);
|
||||||
|
p -= m;
|
||||||
|
p = MIN(p,(flt_t)1.0);
|
||||||
|
if (!ONETYPE)
|
||||||
|
rhor_joff = rhor_ioff + jtype * jstride;
|
||||||
|
const int joff = rhor_joff + m;
|
||||||
|
const flt_t rhojp = (rhor_spline_f[joff].a*p +
|
||||||
|
rhor_spline_f[joff].b)*p +
|
||||||
|
rhor_spline_f[joff].c;
|
||||||
|
flt_t rhoip;
|
||||||
|
if (!ONETYPE) {
|
||||||
|
const int ioff = jtype * istride + itype * jstride + m;
|
||||||
|
rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p +
|
||||||
|
rhor_spline_f[ioff].c;
|
||||||
|
} else
|
||||||
|
rhoip = rhojp;
|
||||||
|
const flt_t z2p = (z2r_spline_t[joff].a*p +
|
||||||
|
z2r_spline_t[joff].b)*p +
|
||||||
|
z2r_spline_t[joff].c;
|
||||||
|
const flt_t z2 = ((z2r_spline_t[joff].d*p +
|
||||||
|
z2r_spline_t[joff].e)*p +
|
||||||
|
z2r_spline_t[joff].f)*p +
|
||||||
|
z2r_spline_t[joff].g;
|
||||||
|
|
||||||
fxtmp += delx*fpair;
|
const flt_t recip = (flt_t)1.0/r;
|
||||||
fytmp += dely*fpair;
|
const flt_t phi = z2*recip;
|
||||||
fztmp += delz*fpair;
|
const flt_t phip = z2p*recip - phi*recip;
|
||||||
if (NEWTON_PAIR || j < nlocal) {
|
const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip;
|
||||||
f[j].x -= delx*fpair;
|
if (!ONETYPE)
|
||||||
f[j].y -= dely*fpair;
|
oscale = scale_fi[jtype];
|
||||||
f[j].z -= delz*fpair;
|
const flt_t fpair = -oscale*psip*recip;
|
||||||
}
|
|
||||||
|
|
||||||
if (EVFLAG) {
|
const flt_t fpx = fpair * tdelx[jj];
|
||||||
flt_t ev_pre = (flt_t)0;
|
fxtmp += fpx;
|
||||||
if (NEWTON_PAIR || i<nlocal)
|
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||||
ev_pre += (flt_t)0.5;
|
const flt_t fpy = fpair * tdely[jj];
|
||||||
if (NEWTON_PAIR || j<nlocal)
|
fytmp += fpy;
|
||||||
ev_pre += (flt_t)0.5;
|
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||||
|
const flt_t fpz = fpair * tdelz[jj];
|
||||||
|
fztmp += fpz;
|
||||||
|
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||||
|
|
||||||
if (EFLAG) {
|
if (EFLAG) {
|
||||||
const flt_t evdwl = oscale*phi;
|
const flt_t evdwl = oscale*phi;
|
||||||
sevdwl += ev_pre * evdwl;
|
sevdwl += evdwl;
|
||||||
if (eatom) {
|
if (eatom) {
|
||||||
if (NEWTON_PAIR || i < nlocal)
|
fwtmp += (flt_t)0.5 * evdwl;
|
||||||
fwtmp += (flt_t)0.5 * evdwl;
|
if (NEWTON_PAIR)
|
||||||
if (NEWTON_PAIR || j < nlocal)
|
f[j].w += (flt_t)0.5 * evdwl;
|
||||||
f[j].w += (flt_t)0.5 * evdwl;
|
}
|
||||||
}
|
}
|
||||||
}
|
if (NEWTON_PAIR == 0)
|
||||||
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
|
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
|
||||||
delx, dely, delz);
|
fpx, fpy, fpz);
|
||||||
}
|
|
||||||
} // if rsq
|
|
||||||
} // for jj
|
} // for jj
|
||||||
f[i].x += fxtmp;
|
if (NEWTON_PAIR) {
|
||||||
f[i].y += fytmp;
|
f[i].x += fxtmp;
|
||||||
f[i].z += fztmp;
|
f[i].y += fytmp;
|
||||||
|
f[i].z += fztmp;
|
||||||
|
} else {
|
||||||
|
f[i].x = fxtmp;
|
||||||
|
f[i].y = fytmp;
|
||||||
|
f[i].z = fztmp;
|
||||||
|
sevdwl *= (acc_t)0.5;
|
||||||
|
}
|
||||||
|
|
||||||
IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
|
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||||
} // for i
|
} // for i
|
||||||
|
|
||||||
if (vflag == 2) {
|
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||||
#if defined(_OPENMP)
|
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||||
#pragma omp barrier
|
ov4, ov5);
|
||||||
#endif
|
|
||||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
|
||||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
|
||||||
x, offload);
|
|
||||||
}
|
|
||||||
|
|
||||||
} /// omp
|
} /// omp
|
||||||
if (EVFLAG) {
|
|
||||||
if (EFLAG) {
|
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||||
ev_global[0] = oevdwl;
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
ev_global[1] = (acc_t)0.0;
|
|
||||||
}
|
if (EFLAG) {
|
||||||
if (vflag) {
|
ev_global[0] = oevdwl;
|
||||||
ev_global[2] = ov0;
|
ev_global[1] = (acc_t)0.0;
|
||||||
ev_global[3] = ov1;
|
}
|
||||||
ev_global[4] = ov2;
|
if (vflag) {
|
||||||
ev_global[5] = ov3;
|
if (NEWTON_PAIR == 0) {
|
||||||
ev_global[6] = ov4;
|
ov0 *= (acc_t)0.5;
|
||||||
ev_global[7] = ov5;
|
ov1 *= (acc_t)0.5;
|
||||||
|
ov2 *= (acc_t)0.5;
|
||||||
|
ov3 *= (acc_t)0.5;
|
||||||
|
ov4 *= (acc_t)0.5;
|
||||||
|
ov5 *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
|
ev_global[2] = ov0;
|
||||||
|
ev_global[3] = ov1;
|
||||||
|
ev_global[4] = ov2;
|
||||||
|
ev_global[5] = ov3;
|
||||||
|
ev_global[6] = ov4;
|
||||||
|
ev_global[7] = ov5;
|
||||||
}
|
}
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
@@ -591,7 +621,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
   else
     fix->stop_watch(TIME_HOST_PAIR);

-  if (EVFLAG)
+  if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
@@ -604,6 +634,10 @@ void PairEAMIntel::eval(const int offload, const int vflag,
 void PairEAMIntel::init_style()
 {
   PairEAM::init_style();
+  if (force->newton_pair == 0) {
+    neighbor->requests[neighbor->nrequest-1]->half = 0;
+    neighbor->requests[neighbor->nrequest-1]->full = 1;
+  }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;

   int ifix = modify->find_fix("package_intel");
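When newton_pair is off, the hunk above requests a full neighbor list and the reworked kernels stop updating ghost-atom forces; every i-j pair is then visited from both atoms, which is why the new code halves the accumulated energy and virial terms (the 0.5 factors added further down in this diff). A minimal self-contained sketch of that double-count-then-halve bookkeeping, with hypothetical names (pair_energy, accumulate_energy, full_list) standing in for the real kernels:

// Sketch only (not from the commit): with newton off every rank walks a
// full neighbor list, so each (i,j) pair is visited twice (once from i,
// once from j) and the accumulated sum is halved at the end.
#include <cstddef>
#include <vector>

static double pair_energy(int, int) { return 1.0; }   // hypothetical stand-in

double accumulate_energy(const std::vector<std::vector<int> > &full_list) {
  double e = 0.0;
  for (std::size_t i = 0; i < full_list.size(); ++i)
    for (int j : full_list[i])
      e += pair_energy(static_cast<int>(i), j);
  return 0.5 * e;                                      // undo the double count
}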
@@ -633,6 +667,13 @@ template <class flt_t, class acc_t>
 void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
                                     IntelBuffers<flt_t,acc_t> *buffers)
 {
+  int off_ccache = 0;
+#ifdef _LMP_INTEL_OFFLOAD
+  if (_cop >= 0) off_ccache = 1;
+#endif
+  buffers->grow_ccache(off_ccache, comm->nthreads, 1);
+  _ccache_stride = buffers->ccache_stride();
+
   int tp1 = atom->ntypes + 1;
   fc.set_ntypes(tp1,nr,nrho,memory,_cop);
   buffers->set_ntypes(tp1);
@@ -41,7 +41,7 @@ class PairEAMIntel : public PairEAM {
  protected:

   FixIntel *fix;
-  int _cop, _onetype;
+  int _cop, _onetype, _ccache_stride;
   float *fp_float;

   template <class flt_t>
@@ -53,7 +53,7 @@ class PairEAMIntel : public PairEAM {
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t,
+  template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t,
             class acc_t>
   void eval(const int offload, const int vflag,
             IntelBuffers<flt_t,acc_t> * buffers,
@@ -88,12 +88,16 @@ void PairGayBerneIntel::compute(int eflag, int vflag,
     const AtomVecEllipsoid::Bonus * const bonus = avec->bonus;
     const int * const ellipsoid = atom->ellipsoid;
     QUAT_T * _noalias const quat = buffers->get_quat();
+
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
 #if defined(_OPENMP)
-    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #pragma omp parallel if(packthreads > 1)
 #endif
     {
       int ifrom, ito, tid;
-      IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, packthreads,
                                 sizeof(ATOM_T));
       if (ago != 0) buffers->thr_pack(ifrom,ito,ago);

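The change just above swaps the explicitly shared OpenMP pragma for an if-clause: the data-packing region only forks a thread team when more than INTEL_HTHREADS hardware threads are available, otherwise it runs serially through the same code path. A minimal sketch of the idiom (pack_buffers and hthreads_cutoff are illustrative names, not part of the commit):

// Sketch only: the if-clause makes the parallel region collapse to a single
// thread when packthreads is 1, so the serial case pays no fork/join cost.
#include <cstdio>
#include <omp.h>

void pack_buffers(int nthreads, int hthreads_cutoff) {
  const int packthreads = (nthreads > hthreads_cutoff) ? nthreads : 1;
  #pragma omp parallel if(packthreads > 1)
  {
    std::printf("packing on thread %d of %d\n",
                omp_get_thread_num(), omp_get_num_threads());
  }
}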
@@ -114,39 +118,29 @@ void PairGayBerneIntel::compute(int eflag, int vflag,
     fix->stop_watch(TIME_PACK);
   }

-  if (evflag || vflag_fdotr) {
-    int ovflag = 0;
-    if (vflag_fdotr) ovflag = 2;
-    else if (vflag) ovflag = 1;
-    if (eflag) {
-      if (force->newton_pair) {
-        eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-        eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
-      } else {
-        eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-        eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
-      }
-    } else {
-      if (force->newton_pair) {
-        eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-        eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
-      } else {
-        eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-        eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
-      }
-    }
-  } else {
-    if (force->newton_pair) {
-      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
-    } else {
-      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
-    }
-  }
-}
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
+  if (eflag) {
+    if (force->newton_pair) {
+      eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
+    } else {
+      eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
+    }
+  } else {
+    if (force->newton_pair) {
+      eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
+    } else {
+      eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
+    }
+  }
+}

-template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairGayBerneIntel::eval(const int offload, const int vflag,
                              IntelBuffers<flt_t,acc_t> *buffers,
                              const ForceConst<flt_t> &fc,
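Across this commit the EVFLAG template parameter is dropped and eval() is instantiated only on EFLAG and NEWTON_PAIR; compute() picks the right instantiation at run time, so the flag tests inside the vector loops are resolved at compile time. A minimal sketch of that dispatch idiom (kernel() and dispatch() are stand-ins for the real eval()/compute(), not the package's API):

// Sketch only: run-time flags select one of four instantiations, so the
// EFLAG / NEWTON_PAIR tests inside the hot loops are folded away.
template <int EFLAG, int NEWTON_PAIR>
void kernel(double &energy) {
  if (EFLAG) energy += 1.0;            // compiled out when EFLAG == 0
  if (NEWTON_PAIR == 0) energy *= 0.5; // compiled out when NEWTON_PAIR == 1
}

void dispatch(int eflag, int newton_pair, double &energy) {
  if (eflag) {
    if (newton_pair) kernel<1,1>(energy);
    else             kernel<1,0>(energy);
  } else {
    if (newton_pair) kernel<0,1>(energy);
    else             kernel<0,0>(energy);
  }
}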
@@ -167,8 +161,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
   if (fix->separate_buffers()) {
     fix->start_watch(TIME_PACK);
     if (offload) {
-      #pragma omp parallel default(none) \
-        shared(buffers,nlocal,nall,bonus,ellipsoid)
+      #pragma omp parallel
       {
         int ifrom, ito, tid;
         int nthreads = comm->nthreads;
@@ -258,7 +251,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,

   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
-  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);

@@ -334,6 +327,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
     x[nall].x = (flt_t)INTEL_BIGP;
     x[nall].y = (flt_t)INTEL_BIGP;
     x[nall].z = (flt_t)INTEL_BIGP;
+    x[nall].w = 1;
     quat[nall].w = (flt_t)1.0;
     quat[nall].i = (flt_t)0.0;
     quat[nall].j = (flt_t)0.0;
@ -342,25 +336,25 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EVFLAG) {
|
if (EFLAG) oevdwl = (acc_t)0.0;
|
||||||
oevdwl = (acc_t)0.0;
|
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
if (NEWTON_PAIR == 0) f_start[1].w = 0;
|
||||||
}
|
|
||||||
|
|
||||||
// loop over neighbors of my atoms
|
// loop over neighbors of my atoms
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
|
||||||
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int iifrom, iito, tid;
|
int iifrom, iip, iito, tid;
|
||||||
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
|
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||||
iifrom += astart;
|
iifrom += astart;
|
||||||
iito += astart;
|
iito += astart;
|
||||||
|
|
||||||
FORCE_T * _noalias const f = f_start - minlocal * 2 + (tid * f_stride);
|
int foff;
|
||||||
memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));
|
if (NEWTON_PAIR) foff = tid * f_stride - minlocal * 2;
|
||||||
|
else foff = minlocal*-2;
|
||||||
|
FORCE_T * _noalias const f = f_start + foff;
|
||||||
|
if (NEWTON_PAIR) memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
flt_t * _noalias const rsq_form = rsq_formi + tid * max_nbors;
|
flt_t * _noalias const rsq_form = rsq_formi + tid * max_nbors;
|
||||||
flt_t * _noalias const delx_form = delx_formi + tid * max_nbors;
|
flt_t * _noalias const delx_form = delx_formi + tid * max_nbors;
|
||||||
@ -370,7 +364,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
int * _noalias const jlist_form = jlist_formi + tid * max_nbors;
|
int * _noalias const jlist_form = jlist_formi + tid * max_nbors;
|
||||||
|
|
||||||
int ierror = 0;
|
int ierror = 0;
|
||||||
for (int i = iifrom; i < iito; ++i) {
|
for (int i = iifrom; i < iito; i += iip) {
|
||||||
// const int i = ilist[ii];
|
// const int i = ilist[ii];
|
||||||
const int itype = x[i].w;
|
const int itype = x[i].w;
|
||||||
const int ptr_off = itype * ntypes;
|
const int ptr_off = itype * ntypes;
|
||||||
@ -401,14 +395,17 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
|
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||||
fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
|
fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
|
||||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
|
if (NEWTON_PAIR == 0)
|
||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||||
}
|
|
||||||
|
|
||||||
bool multiple_forms = false;
|
bool multiple_forms = false;
|
||||||
int packed_j = 0;
|
int packed_j = 0;
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma ivdep
|
||||||
|
#endif
|
||||||
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
int jm = jlist[jj];
|
int jm = jlist[jj];
|
||||||
int j = jm & NEIGHMASK;
|
int j = jm & NEIGHMASK;
|
||||||
const int jtype = x[j].w;
|
const int jtype = x[j].w;
|
||||||
@ -573,7 +570,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
ME_cross3(tempv, tempv2, dUr);
|
ME_cross3(tempv, tempv2, dUr);
|
||||||
flt_t dUr2_0, dUr2_1, dUr2_2;
|
flt_t dUr2_0, dUr2_1, dUr2_2;
|
||||||
|
|
||||||
if (NEWTON_PAIR || j < nlocal) {
|
if (NEWTON_PAIR) {
|
||||||
ME_vecmat(kappa, g2, tempv2);
|
ME_vecmat(kappa, g2, tempv2);
|
||||||
ME_cross3(tempv, tempv2, dUr2);
|
ME_cross3(tempv, tempv2, dUr2);
|
||||||
}
|
}
|
||||||
@ -588,7 +585,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
dchi_2 *= temp1;
|
dchi_2 *= temp1;
|
||||||
flt_t dchi2_0, dchi2_1, dchi2_2;
|
flt_t dchi2_0, dchi2_1, dchi2_2;
|
||||||
|
|
||||||
if (NEWTON_PAIR || j < nlocal) {
|
if (NEWTON_PAIR) {
|
||||||
ME_vecmat(iota, b2, tempv);
|
ME_vecmat(iota, b2, tempv);
|
||||||
ME_cross3(tempv, iota, dchi2);
|
ME_cross3(tempv, iota, dchi2);
|
||||||
dchi2_0 *= temp1;
|
dchi2_0 *= temp1;
|
||||||
@ -630,7 +627,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
// compute d_eta for particle 2
|
// compute d_eta for particle 2
|
||||||
|
|
||||||
flt_t deta2_0, deta2_1, deta2_2;
|
flt_t deta2_0, deta2_1, deta2_2;
|
||||||
if (NEWTON_PAIR || j < nlocal) {
|
if (NEWTON_PAIR) {
|
||||||
deta2_0 = deta2_1 = deta2_2 = (flt_t)0.0;
|
deta2_0 = deta2_1 = deta2_2 = (flt_t)0.0;
|
||||||
ME_compute_eta_torque(g12, a2, shape2, temp);
|
ME_compute_eta_torque(g12, a2, shape2, temp);
|
||||||
|
|
||||||
@ -672,7 +669,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) *
|
ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) *
|
||||||
(flt_t)-1.0;
|
(flt_t)-1.0;
|
||||||
|
|
||||||
if (NEWTON_PAIR || j < nlocal) {
|
if (NEWTON_PAIR) {
|
||||||
rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) *
|
rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) *
|
||||||
(flt_t)-1.0;
|
(flt_t)-1.0;
|
||||||
rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) *
|
rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) *
|
||||||
@ -714,7 +711,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
t2tmp += ttor_1;
|
t2tmp += ttor_1;
|
||||||
t3tmp += ttor_2;
|
t3tmp += ttor_2;
|
||||||
|
|
||||||
if (NEWTON_PAIR || j < nlocal) {
|
if (NEWTON_PAIR) {
|
||||||
rtor_0 *= factor_lj;
|
rtor_0 *= factor_lj;
|
||||||
rtor_1 *= factor_lj;
|
rtor_1 *= factor_lj;
|
||||||
rtor_2 *= factor_lj;
|
rtor_2 *= factor_lj;
|
||||||
@ -728,34 +725,26 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
f[jp].z += rtor_2;
|
f[jp].z += rtor_2;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG) {
|
||||||
flt_t ev_pre = (flt_t)0.0;
|
evdwl = factor_lj * one_eng;
|
||||||
if (NEWTON_PAIR || i < nlocal)
|
sevdwl += evdwl;
|
||||||
ev_pre += (flt_t)0.5;
|
if (eatom) {
|
||||||
if (NEWTON_PAIR || j < nlocal)
|
fwtmp += (flt_t)0.5 * evdwl;
|
||||||
ev_pre += (flt_t)0.5;
|
if (NEWTON_PAIR)
|
||||||
|
f[j*2].w += (flt_t)0.5 * evdwl;
|
||||||
if (EFLAG) {
|
|
||||||
evdwl = factor_lj * one_eng;
|
|
||||||
sevdwl += ev_pre * evdwl;
|
|
||||||
if (eatom) {
|
|
||||||
if (NEWTON_PAIR || i < nlocal)
|
|
||||||
fwtmp += (flt_t)0.5 * evdwl;
|
|
||||||
if (NEWTON_PAIR || j < nlocal)
|
|
||||||
f[j*2].w += (flt_t)0.5 * evdwl;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (NEWTON_PAIR == 0) {
|
||||||
if (vflag == 1) {
|
if (vflag == 1) {
|
||||||
ev_pre *= (flt_t)-1.0;
|
sv0 += delx_form[jj] * fforce_0;
|
||||||
sv0 += ev_pre * delx_form[jj] * fforce_0;
|
sv1 += dely_form[jj] * fforce_1;
|
||||||
sv1 += ev_pre * dely_form[jj] * fforce_1;
|
sv2 += delz_form[jj] * fforce_2;
|
||||||
sv2 += ev_pre * delz_form[jj] * fforce_2;
|
sv3 += delx_form[jj] * fforce_1;
|
||||||
sv3 += ev_pre * delx_form[jj] * fforce_1;
|
sv4 += delx_form[jj] * fforce_2;
|
||||||
sv4 += ev_pre * delx_form[jj] * fforce_2;
|
sv5 += dely_form[jj] * fforce_2;
|
||||||
sv5 += ev_pre * dely_form[jj] * fforce_2;
|
|
||||||
}
|
}
|
||||||
} // EVFLAG
|
} // EVFLAG
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -767,19 +756,29 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
ierror = 2;
|
ierror = 2;
|
||||||
|
|
||||||
int ip = i * 2;
|
int ip = i * 2;
|
||||||
f[ip].x += fxtmp;
|
if (NEWTON_PAIR) {
|
||||||
f[ip].y += fytmp;
|
f[ip].x += fxtmp;
|
||||||
f[ip].z += fztmp;
|
f[ip].y += fytmp;
|
||||||
ip++;
|
f[ip].z += fztmp;
|
||||||
f[ip].x += t1tmp;
|
ip++;
|
||||||
f[ip].y += t2tmp;
|
f[ip].x += t1tmp;
|
||||||
f[ip].z += t3tmp;
|
f[ip].y += t2tmp;
|
||||||
|
f[ip].z += t3tmp;
|
||||||
|
} else {
|
||||||
|
f[ip].x = fxtmp;
|
||||||
|
f[ip].y = fytmp;
|
||||||
|
f[ip].z = fztmp;
|
||||||
|
ip++;
|
||||||
|
f[ip].x = t1tmp;
|
||||||
|
f[ip].y = t2tmp;
|
||||||
|
f[ip].z = t3tmp;
|
||||||
|
}
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG) {
|
||||||
if (EFLAG) {
|
oevdwl += sevdwl;
|
||||||
if (eatom) f[i * 2].w += fwtmp;
|
if (eatom) f[i * 2].w += fwtmp;
|
||||||
oevdwl += sevdwl;
|
}
|
||||||
}
|
if (NEWTON_PAIR == 0) {
|
||||||
if (vflag == 1) {
|
if (vflag == 1) {
|
||||||
ov0 += sv0;
|
ov0 += sv0;
|
||||||
ov1 += sv1;
|
ov1 += sv1;
|
||||||
@ -791,56 +790,31 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
}
|
}
|
||||||
} // for i
|
} // for i
|
||||||
int o_range;
|
int o_range;
|
||||||
if (NEWTON_PAIR)
|
if (NEWTON_PAIR) {
|
||||||
o_range = nall;
|
o_range = nall;
|
||||||
else
|
if (offload == 0) o_range -= minlocal;
|
||||||
o_range = nlocal;
|
IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
|
||||||
if (offload == 0) o_range -= minlocal;
|
sizeof(FORCE_T));
|
||||||
IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
|
const int sto = iito * 8;
|
||||||
sizeof(FORCE_T));
|
const int fst4 = f_stride * 4;
|
||||||
const int two_iito = iito * 2;
|
#if defined(_OPENMP)
|
||||||
|
#pragma omp barrier
|
||||||
acc_t *facc = &(f_start[0].x);
|
#endif
|
||||||
const int sto = two_iito * 4;
|
acc_t *f_scalar = &f_start[0].x;
|
||||||
const int fst4 = f_stride * 4;
|
acc_t *f_scalar2 = f_scalar + fst4;
|
||||||
#if defined(_OPENMP)
|
for (int t = 1; t < nthreads; t++) {
|
||||||
#pragma omp barrier
|
|
||||||
#endif
|
|
||||||
int t_off = f_stride;
|
|
||||||
if (EFLAG && eatom) {
|
|
||||||
for (int t = 1; t < nthreads; t++) {
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector nontemporal
|
#pragma vector aligned
|
||||||
#pragma novector
|
#pragma simd
|
||||||
#endif
|
#endif
|
||||||
for (int n = iifrom * 2; n < two_iito; n++) {
|
for (int n = iifrom * 8; n < sto; n++)
|
||||||
f_start[n].x += f_start[n + t_off].x;
|
f_scalar[n] += f_scalar2[n];
|
||||||
f_start[n].y += f_start[n + t_off].y;
|
f_scalar2 += fst4;
|
||||||
f_start[n].z += f_start[n + t_off].z;
|
|
||||||
f_start[n].w += f_start[n + t_off].w;
|
|
||||||
}
|
|
||||||
t_off += f_stride;
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
for (int t = 1; t < nthreads; t++) {
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
|
||||||
#pragma vector nontemporal
|
|
||||||
#pragma novector
|
|
||||||
#endif
|
|
||||||
for (int n = iifrom * 2; n < two_iito; n++) {
|
|
||||||
f_start[n].x += f_start[n + t_off].x;
|
|
||||||
f_start[n].y += f_start[n + t_off].y;
|
|
||||||
f_start[n].z += f_start[n + t_off].z;
|
|
||||||
}
|
|
||||||
t_off += f_stride;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (EVFLAG) {
|
|
||||||
if (vflag==2) {
|
if (vflag==2) {
|
||||||
const ATOM_T * _noalias const xo = x + minlocal;
|
const ATOM_T * _noalias const xo = x + minlocal;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector nontemporal
|
|
||||||
#pragma novector
|
#pragma novector
|
||||||
#endif
|
#endif
|
||||||
for (int n = iifrom; n < iito; n++) {
|
for (int n = iifrom; n < iito; n++) {
|
||||||
@ -852,26 +826,33 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
|||||||
ov4 += f_start[nt2].z * xo[n].x;
|
ov4 += f_start[nt2].z * xo[n].x;
|
||||||
ov5 += f_start[nt2].z * xo[n].y;
|
ov5 += f_start[nt2].z * xo[n].y;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ierror)
|
if (ierror)
|
||||||
f_start[1].w = ierror;
|
f_start[1].w = ierror;
|
||||||
} // omp
|
} // omp
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG) {
|
||||||
if (EFLAG) {
|
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
||||||
ev_global[0] = oevdwl;
|
ev_global[0] = oevdwl;
|
||||||
ev_global[1] = (acc_t)0.0;
|
ev_global[1] = (acc_t)0.0;
|
||||||
}
|
}
|
||||||
if (vflag) {
|
if (vflag) {
|
||||||
ev_global[2] = ov0;
|
if (NEWTON_PAIR == 0) {
|
||||||
ev_global[3] = ov1;
|
ov0 *= (acc_t)-0.5;
|
||||||
ev_global[4] = ov2;
|
ov1 *= (acc_t)-0.5;
|
||||||
ev_global[5] = ov3;
|
ov2 *= (acc_t)-0.5;
|
||||||
ev_global[6] = ov4;
|
ov3 *= (acc_t)-0.5;
|
||||||
ev_global[7] = ov5;
|
ov4 *= (acc_t)-0.5;
|
||||||
|
ov5 *= (acc_t)-0.5;
|
||||||
}
|
}
|
||||||
|
ev_global[2] = ov0;
|
||||||
|
ev_global[3] = ov1;
|
||||||
|
ev_global[4] = ov2;
|
||||||
|
ev_global[5] = ov3;
|
||||||
|
ev_global[6] = ov4;
|
||||||
|
ev_global[7] = ov5;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
@@ -884,7 +865,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
   else
     fix->stop_watch(TIME_HOST_PAIR);

-  if (EVFLAG)
+  if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, 2);
   else
     fix->add_result_array(f_start, 0, offload, 0, 0, 2);
@@ -895,6 +876,10 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 void PairGayBerneIntel::init_style()
 {
   PairGayBerne::init_style();
+  if (force->newton_pair == 0) {
+    neighbor->requests[neighbor->nrequest-1]->half = 0;
+    neighbor->requests[neighbor->nrequest-1]->full = 1;
+  }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;

   int ifix = modify->find_fix("package_intel");
@@ -43,7 +43,7 @@ class PairGayBerneIntel : public PairGayBerne {
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
             IntelBuffers<flt_t,acc_t> * buffers,
             const ForceConst<flt_t> &fc, const int astart, const int aend);
@ -82,54 +82,48 @@ void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag,
|
|||||||
|
|
||||||
if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
|
if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
|
||||||
fix->start_watch(TIME_PACK);
|
fix->start_watch(TIME_PACK);
|
||||||
|
|
||||||
|
int packthreads;
|
||||||
|
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||||
|
else packthreads = 1;
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
#pragma omp parallel if(packthreads > 1)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
|
||||||
nthreads, sizeof(ATOM_T));
|
packthreads, sizeof(ATOM_T));
|
||||||
buffers->thr_pack(ifrom,ito,ago);
|
buffers->thr_pack(ifrom,ito,ago);
|
||||||
}
|
}
|
||||||
fix->stop_watch(TIME_PACK);
|
fix->stop_watch(TIME_PACK);
|
||||||
}
|
}
|
||||||
|
|
||||||
// -------------------- Regular version
|
// -------------------- Regular version
|
||||||
if (evflag || vflag_fdotr) {
|
int ovflag = 0;
|
||||||
int ovflag = 0;
|
if (vflag_fdotr) ovflag = 2;
|
||||||
if (vflag_fdotr) ovflag = 2;
|
else if (vflag) ovflag = 1;
|
||||||
else if (vflag) ovflag = 1;
|
if (eflag) {
|
||||||
if (eflag) {
|
if (force->newton_pair) {
|
||||||
if (force->newton_pair) {
|
eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
} else {
|
|
||||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
} else {
|
|
||||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
|
eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
|
eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||||
void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc,
|
const ForceConst<flt_t> &fc,
|
||||||
@ -182,7 +176,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
|
|
||||||
// Determine how much data to transfer
|
// Determine how much data to transfer
|
||||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
|
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||||
buffers, offload, fix, separate_flag,
|
buffers, offload, fix, separate_flag,
|
||||||
x_size, q_size, ev_size, f_stride);
|
x_size, q_size, ev_size, f_stride);
|
||||||
|
|
||||||
@ -236,25 +230,24 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
f_stride, x, q);
|
f_stride, x, q);
|
||||||
|
|
||||||
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EVFLAG) {
|
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
||||||
oevdwl = oecoul = (acc_t)0;
|
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// loop over neighbors of my atoms
|
// loop over neighbors of my atoms
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
|
||||||
reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
|
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int iifrom, iito, tid;
|
int iifrom, iip, iito, tid;
|
||||||
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
|
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||||
iifrom += astart;
|
iifrom += astart;
|
||||||
iito += astart;
|
iito += astart;
|
||||||
|
|
||||||
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
|
int foff;
|
||||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
|
||||||
|
else foff = -minlocal;
|
||||||
|
FORCE_T * _noalias const f = f_start + foff;
|
||||||
|
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||||
flt_t cutboth = cut_coulsq;
|
flt_t cutboth = cut_coulsq;
|
||||||
|
|
||||||
const int toffs = tid * ccache_stride;
|
const int toffs = tid * ccache_stride;
|
||||||
@ -265,7 +258,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
int * _noalias const tj = ccachei + toffs;
|
int * _noalias const tj = ccachei + toffs;
|
||||||
int * _noalias const tjtype = ccachej + toffs;
|
int * _noalias const tjtype = ccachej + toffs;
|
||||||
|
|
||||||
for (int i = iifrom; i < iito; ++i) {
|
for (int i = iifrom; i < iito; i += iip) {
|
||||||
// const int i = ilist[ii];
|
// const int i = ilist[ii];
|
||||||
const int itype = x[i].w;
|
const int itype = x[i].w;
|
||||||
|
|
||||||
@ -284,10 +277,9 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
const flt_t ztmp = x[i].z;
|
const flt_t ztmp = x[i].z;
|
||||||
const flt_t qtmp = q[i];
|
const flt_t qtmp = q[i];
|
||||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||||
if (EVFLAG) {
|
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
if (NEWTON_PAIR == 0)
|
||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||||
}
|
|
||||||
|
|
||||||
int ej = 0;
|
int ej = 0;
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
@ -421,77 +413,76 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
if (rsq > cut_coulsq) { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
|
|
||||||
if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
|
if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const flt_t delx = tdelx[jj];
|
|
||||||
const flt_t dely = tdely[jj];
|
|
||||||
const flt_t delz = tdelz[jj];
|
|
||||||
const flt_t fpair = (forcecoul + forcelj) * r2inv;
|
const flt_t fpair = (forcecoul + forcelj) * r2inv;
|
||||||
fxtmp += delx * fpair;
|
const flt_t fpx = fpair * tdelx[jj];
|
||||||
fytmp += dely * fpair;
|
fxtmp += fpx;
|
||||||
fztmp += delz * fpair;
|
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||||
if (NEWTON_PAIR || j < nlocal) {
|
const flt_t fpy = fpair * tdely[jj];
|
||||||
f[j].x -= delx * fpair;
|
fytmp += fpy;
|
||||||
f[j].y -= dely * fpair;
|
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||||
f[j].z -= delz * fpair;
|
const flt_t fpz = fpair * tdelz[jj];
|
||||||
}
|
fztmp += fpz;
|
||||||
|
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG) {
|
||||||
flt_t ev_pre = (flt_t)0;
|
sevdwl += evdwl;
|
||||||
if (NEWTON_PAIR || i < nlocal)
|
secoul += ecoul;
|
||||||
ev_pre += (flt_t)0.5;
|
if (eatom) {
|
||||||
if (NEWTON_PAIR || j < nlocal)
|
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||||
ev_pre += (flt_t)0.5;
|
if (NEWTON_PAIR)
|
||||||
|
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||||
if (EFLAG) {
|
|
||||||
sevdwl += ev_pre * evdwl;
|
|
||||||
secoul += ev_pre * ecoul;
|
|
||||||
if (eatom) {
|
|
||||||
if (NEWTON_PAIR || i < nlocal)
|
|
||||||
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
|
||||||
if (NEWTON_PAIR || j < nlocal)
|
|
||||||
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
|
|
||||||
delx, dely, delz);
|
|
||||||
}
|
}
|
||||||
|
if (NEWTON_PAIR == 0)
|
||||||
|
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
|
||||||
|
fpx, fpy, fpz);
|
||||||
} // for jj
|
} // for jj
|
||||||
f[i].x += fxtmp;
|
if (NEWTON_PAIR) {
|
||||||
f[i].y += fytmp;
|
f[i].x += fxtmp;
|
||||||
f[i].z += fztmp;
|
f[i].y += fytmp;
|
||||||
|
f[i].z += fztmp;
|
||||||
IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
|
} else {
|
||||||
|
f[i].x = fxtmp;
|
||||||
|
f[i].y = fytmp;
|
||||||
|
f[i].z = fztmp;
|
||||||
|
}
|
||||||
|
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||||
} // for ii
|
} // for ii
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||||
if (vflag == 2)
|
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||||
#endif
|
ov4, ov5);
|
||||||
{
|
|
||||||
#if defined(_OPENMP)
|
|
||||||
#pragma omp barrier
|
|
||||||
#endif
|
|
||||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
|
||||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
|
||||||
x, offload);
|
|
||||||
}
|
|
||||||
} // end of omp parallel region
|
} // end of omp parallel region
|
||||||
if (EVFLAG) {
|
|
||||||
if (EFLAG) {
|
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||||
ev_global[0] = oevdwl;
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
ev_global[1] = oecoul;
|
|
||||||
|
if (EFLAG) {
|
||||||
|
if (NEWTON_PAIR == 0) {
|
||||||
|
oevdwl *= (acc_t)0.5;
|
||||||
|
oecoul *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
if (vflag) {
|
ev_global[0] = oevdwl;
|
||||||
ev_global[2] = ov0;
|
ev_global[1] = oecoul;
|
||||||
ev_global[3] = ov1;
|
}
|
||||||
ev_global[4] = ov2;
|
if (vflag) {
|
||||||
ev_global[5] = ov3;
|
if (NEWTON_PAIR == 0) {
|
||||||
ev_global[6] = ov4;
|
ov0 *= (acc_t)0.5;
|
||||||
ev_global[7] = ov5;
|
ov1 *= (acc_t)0.5;
|
||||||
|
ov2 *= (acc_t)0.5;
|
||||||
|
ov3 *= (acc_t)0.5;
|
||||||
|
ov4 *= (acc_t)0.5;
|
||||||
|
ov5 *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
|
ev_global[2] = ov0;
|
||||||
|
ev_global[3] = ov1;
|
||||||
|
ev_global[4] = ov2;
|
||||||
|
ev_global[5] = ov3;
|
||||||
|
ev_global[6] = ov4;
|
||||||
|
ev_global[7] = ov5;
|
||||||
}
|
}
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
@@ -503,7 +494,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
   else
     fix->stop_watch(TIME_HOST_PAIR);

-  if (EVFLAG)
+  if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
@@ -514,6 +505,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
 void PairLJCharmmCoulLongIntel::init_style()
 {
   PairLJCharmmCoulLong::init_style();
+  if (force->newton_pair == 0) {
+    neighbor->requests[neighbor->nrequest-1]->half = 0;
+    neighbor->requests[neighbor->nrequest-1]->full = 1;
+  }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;

   int ifix = modify->find_fix("package_intel");
@@ -541,11 +536,6 @@ template <class flt_t, class acc_t>
 void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
                                                  IntelBuffers<flt_t,acc_t> *buffers)
 {
-  int tp1 = atom->ntypes + 1;
-  int ntable = 1;
-  if (ncoultablebits)
-    for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
-
   int off_ccache = 0;
 #ifdef _LMP_INTEL_OFFLOAD
   if (_cop >= 0) off_ccache = 1;
@@ -553,6 +543,11 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
   buffers->grow_ccache(off_ccache, comm->nthreads, 1);
   _ccache_stride = buffers->ccache_stride();

+  int tp1 = atom->ntypes + 1;
+  int ntable = 1;
+  if (ncoultablebits)
+    for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
+
   fc.set_ntypes(tp1, ntable, memory, _cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();
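The grow_ccache()/_ccache_stride additions above (and the matching ones in the EAM and lj/cut/coul/long pair styles) provide per-thread scratch arrays that the reworked kernels use to gather in-cutoff neighbors first and then run a packed, branch-free inner loop that the compiler can vectorize. A minimal self-contained sketch of that two-pass pattern, with illustrative names (Atom, lj_sketch) rather than the package's own types:

// Sketch only: pass 1 gathers neighbors inside the cutoff into a packed
// scratch array (the role of the ccache buffers); pass 2 runs over the
// packed data with no cutoff branch, which is what gets vectorized.
#include <vector>

struct Atom { double x, y, z; };

double lj_sketch(const Atom &xi, const std::vector<Atom> &neigh,
                 double cutsq, double coeff) {
  std::vector<double> trsq;              // per-thread "ccache" stand-in
  trsq.reserve(neigh.size());
  for (const Atom &xj : neigh) {         // pass 1: gather within the cutoff
    const double dx = xi.x - xj.x, dy = xi.y - xj.y, dz = xi.z - xj.z;
    const double rsq = dx * dx + dy * dy + dz * dz;
    if (rsq < cutsq) trsq.push_back(rsq);
  }
  double e = 0.0;
  for (double rsq : trsq) {              // pass 2: packed, branch-free
    const double r6inv = 1.0 / (rsq * rsq * rsq);
    e += coeff * r6inv * (r6inv - 1.0);
  }
  return e;
}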
@@ -48,7 +48,7 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
             IntelBuffers<flt_t,acc_t> * buffers,
             const ForceConst<flt_t> &fc, const int astart, const int aend);
@ -83,57 +83,50 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
|
|||||||
|
|
||||||
if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
|
if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
|
||||||
fix->start_watch(TIME_PACK);
|
fix->start_watch(TIME_PACK);
|
||||||
|
int packthreads;
|
||||||
|
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||||
|
else packthreads = 1;
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
#pragma omp parallel if(packthreads > 1)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||||
nthreads, sizeof(ATOM_T));
|
packthreads, sizeof(ATOM_T));
|
||||||
buffers->thr_pack(ifrom,ito,ago);
|
buffers->thr_pack(ifrom,ito,ago);
|
||||||
}
|
}
|
||||||
fix->stop_watch(TIME_PACK);
|
fix->stop_watch(TIME_PACK);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (evflag || vflag_fdotr) {
|
int ovflag = 0;
|
||||||
int ovflag = 0;
|
if (vflag_fdotr) ovflag = 2;
|
||||||
if (vflag_fdotr) ovflag = 2;
|
else if (vflag) ovflag = 1;
|
||||||
else if (vflag) ovflag = 1;
|
if (eflag) {
|
||||||
if (eflag) {
|
if (force->newton_pair) {
|
||||||
if (force->newton_pair) {
|
eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
} else {
|
|
||||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
} else {
|
|
||||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
|
eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
|
eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||||
void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc,
|
const ForceConst<flt_t> &fc,
|
||||||
const int astart, const int aend)
|
const int astart, const int aend)
|
||||||
{
|
{
|
||||||
const int inum = aend - astart;
|
const int inum = aend - astart;
|
||||||
if (inum == 0) return;
|
if (inum == 0) return;
|
||||||
@ -167,9 +160,17 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
const int ntypes = atom->ntypes + 1;
|
const int ntypes = atom->ntypes + 1;
|
||||||
const int eatom = this->eflag_atom;
|
const int eatom = this->eflag_atom;
|
||||||
|
|
||||||
|
flt_t * _noalias const ccachex = buffers->get_ccachex();
|
||||||
|
flt_t * _noalias const ccachey = buffers->get_ccachey();
|
||||||
|
flt_t * _noalias const ccachez = buffers->get_ccachez();
|
||||||
|
flt_t * _noalias const ccachew = buffers->get_ccachew();
|
||||||
|
int * _noalias const ccachei = buffers->get_ccachei();
|
||||||
|
int * _noalias const ccachej = buffers->get_ccachej();
|
||||||
|
const int ccache_stride = _ccache_stride;
|
||||||
|
|
||||||
// Determine how much data to transfer
|
// Determine how much data to transfer
|
||||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
|
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||||
buffers, offload, fix, separate_flag,
|
buffers, offload, fix, separate_flag,
|
||||||
x_size, q_size, ev_size, f_stride);
|
x_size, q_size, ev_size, f_stride);
|
||||||
|
|
||||||
@ -204,8 +205,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
in(x:length(x_size) alloc_if(0) free_if(0)) \
|
in(x:length(x_size) alloc_if(0) free_if(0)) \
|
||||||
in(q:length(q_size) alloc_if(0) free_if(0)) \
|
in(q:length(q_size) alloc_if(0) free_if(0)) \
|
||||||
in(overflow:length(0) alloc_if(0) free_if(0)) \
|
in(overflow:length(0) alloc_if(0) free_if(0)) \
|
||||||
|
in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
|
||||||
|
in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
|
||||||
in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
|
in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
|
||||||
in(f_stride,nlocal,minlocal,separate_flag,offload) \
|
in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \
|
||||||
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
|
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
|
||||||
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
|
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
|
||||||
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
||||||
@ -220,27 +223,34 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
f_stride, x, q);
|
f_stride, x, q);
|
||||||
|
|
||||||
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EVFLAG) {
|
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
||||||
oevdwl = oecoul = (acc_t)0;
|
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// loop over neighbors of my atoms
|
// loop over neighbors of my atoms
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
|
||||||
reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
|
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int iifrom, iito, tid;
|
int iifrom, iip, iito, tid;
|
||||||
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
|
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||||
iifrom += astart;
|
iifrom += astart;
|
||||||
iito += astart;
|
iito += astart;
|
||||||
|
|
||||||
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
|
int foff;
|
||||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
|
||||||
|
else foff = -minlocal;
|
||||||
|
FORCE_T * _noalias const f = f_start + foff;
|
||||||
|
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
for (int i = iifrom; i < iito; ++i) {
|
const int toffs = tid * ccache_stride;
|
||||||
|
flt_t * _noalias const tdelx = ccachex + toffs;
|
||||||
|
flt_t * _noalias const tdely = ccachey + toffs;
|
||||||
|
flt_t * _noalias const tdelz = ccachez + toffs;
|
||||||
|
flt_t * _noalias const trsq = ccachew + toffs;
|
||||||
|
int * _noalias const tj = ccachei + toffs;
|
||||||
|
int * _noalias const tjtype = ccachej + toffs;
|
||||||
|
|
||||||
|
for (int i = iifrom; i < iito; i += iip) {
|
||||||
const int itype = x[i].w;
|
const int itype = x[i].w;
|
||||||
|
|
||||||
const int ptr_off = itype * ntypes;
|
const int ptr_off = itype * ntypes;
|
||||||
@ -258,86 +268,98 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
|||||||
const flt_t ztmp = x[i].z;
|
const flt_t ztmp = x[i].z;
|
||||||
const flt_t qtmp = q[i];
|
const flt_t qtmp = q[i];
|
||||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||||
if (EVFLAG) {
|
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
if (NEWTON_PAIR == 0)
|
||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||||
|
|
||||||
|
int ej = 0;
|
||||||
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma ivdep
|
||||||
|
#endif
|
||||||
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
|
const int j = jlist[jj] & NEIGHMASK;
|
||||||
|
+        const flt_t delx = xtmp - x[j].x;
+        const flt_t dely = ytmp - x[j].y;
+        const flt_t delz = ztmp - x[j].z;
+        const int jtype = x[j].w;
+        const flt_t rsq = delx * delx + dely * dely + delz * delz;
+
+        if (rsq < c_forcei[jtype].cutsq) {
+          trsq[ej]=rsq;
+          tdelx[ej]=delx;
+          tdely[ej]=dely;
+          tdelz[ej]=delz;
+          tjtype[ej]=jtype;
+          tj[ej]=jlist[jj];
+          ej++;
+        }
       }

 #if defined(LMP_SIMD_COMPILER)
 #pragma vector aligned
 #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
                        sv0, sv1, sv2, sv3, sv4, sv5)
 #endif
-      for (int jj = 0; jj < jnum; jj++) {
+      for (int jj = 0; jj < ej; jj++) {
         flt_t forcecoul, forcelj, evdwl, ecoul;
         forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;

-        const int sbindex = jlist[jj] >> SBBITS & 3;
-        const int j = jlist[jj] & NEIGHMASK;
-        const flt_t delx = xtmp - x[j].x;
-        const flt_t dely = ytmp - x[j].y;
-        const flt_t delz = ztmp - x[j].z;
-        const int jtype = x[j].w;
-        const flt_t rsq = delx * delx + dely * dely + delz * delz;
+        const int j = tj[jj] & NEIGHMASK;
+        const int sbindex = tj[jj] >> SBBITS & 3;
+        const int jtype = tjtype[jj];
+        const flt_t rsq = trsq[jj];

         const flt_t r2inv = (flt_t)1.0 / rsq;

-#ifdef INTEL_VMASK
-        if (rsq < c_forcei[jtype].cutsq) {
-#endif
 #ifdef INTEL_ALLOW_TABLE
         if (!ncoultablebits || rsq <= tabinnersq) {
 #endif
           const flt_t A1 = 0.254829592;
           const flt_t A2 = -0.284496736;
           const flt_t A3 = 1.421413741;
           const flt_t A4 = -1.453152027;
           const flt_t A5 = 1.061405429;
           const flt_t EWALD_F = 1.12837917;
           const flt_t INV_EWALD_P = 1.0 / 0.3275911;

           const flt_t r = (flt_t)1.0 / sqrt(r2inv);
           const flt_t grij = g_ewald * r;
           const flt_t expm2 = exp(-grij * grij);
           const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
           const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
           const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
           forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
           if (EFLAG) ecoul = prefactor * erfc;

           const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
             prefactor;
           forcecoul -= adjust;
           if (EFLAG) ecoul -= adjust;

 #ifdef INTEL_ALLOW_TABLE
         } else {
           float rsq_lookup = rsq;
           const int itable = (__intel_castf32_u32(rsq_lookup) &
                               ncoulmask) >> ncoulshiftbits;
           const flt_t fraction = (rsq_lookup - table[itable].r) *
             table[itable].dr;

           const flt_t tablet = table[itable].f +
             fraction * table[itable].df;
           forcecoul = qtmp * q[j] * tablet;
           if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
                                             fraction * detable[itable]);
           if (sbindex) {
             const flt_t table2 = ctable[itable] +
               fraction * dctable[itable];
             const flt_t prefactor = qtmp * q[j] * table2;
             const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
               prefactor;
             forcecoul -= adjust;
             if (EFLAG) ecoul -= adjust;
           }
         }
 #endif
-#ifdef INTEL_VMASK
-        }
-#endif

 #ifdef INTEL_VMASK
         if (rsq < c_forcei[jtype].cut_ljsq) {
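The added gather pass above collects only the neighbors inside the cutoff into the temporary trsq/tdelx/tdely/tdelz/tjtype/tj arrays, so the vectorized force loop runs over an already-compacted list. The sketch below is not taken from the patch; it is a minimal, self-contained C++ illustration of the same filter-then-vectorize idea, with invented names and a toy kernel in place of the real pair interaction.

#include <cstdio>
#include <vector>

// Toy "filter then compute" pass: first compact the neighbors that pass the
// cutoff test into dense scratch arrays, then run a tight loop over the
// compacted list.  All names here are illustrative, not from the patch.
int main() {
  const double cutsq = 1.0;
  const std::vector<double> xj = {0.2, 1.5, 0.4, 2.0, 0.9};  // toy neighbor coordinates (1D)
  const double xi = 0.0;

  std::vector<double> trsq;   // compacted squared distances
  std::vector<int>    tj;     // compacted neighbor indices

  // Pass 1: cheap scalar cutoff filter.
  for (int jj = 0; jj < (int)xj.size(); ++jj) {
    const double del = xi - xj[jj];
    const double rsq = del * del;
    if (rsq < cutsq) { trsq.push_back(rsq); tj.push_back(jj); }
  }

  // Pass 2: kernel loop over the compacted list only; no cutoff branch is
  // left in the hot loop, so it vectorizes without per-lane masking.
  double f = 0.0;
  for (int jj = 0; jj < (int)trsq.size(); ++jj)
    f += 1.0 / trsq[jj];                 // stand-in for the pair kernel

  std::printf("kept %zu of %zu neighbors, f = %g\n", trsq.size(), xj.size(), f);
  return 0;
}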
@@ -357,80 +379,79 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
 #ifdef INTEL_VMASK
         }
 #else
-        if (rsq > c_forcei[jtype].cutsq)
-          { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
         if (rsq > c_forcei[jtype].cut_ljsq)
           { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
 #endif

-#ifdef INTEL_VMASK
-        if (rsq < c_forcei[jtype].cutsq) {
-#endif
-        const flt_t fpair = (forcecoul + forcelj) * r2inv;
-        fxtmp += delx * fpair;
-        fytmp += dely * fpair;
-        fztmp += delz * fpair;
-        if (NEWTON_PAIR || j < nlocal) {
-          f[j].x -= delx * fpair;
-          f[j].y -= dely * fpair;
-          f[j].z -= delz * fpair;
-        }
-
-        if (EVFLAG) {
-          flt_t ev_pre = (flt_t)0;
-          if (NEWTON_PAIR || i < nlocal)
-            ev_pre += (flt_t)0.5;
-          if (NEWTON_PAIR || j < nlocal)
-            ev_pre += (flt_t)0.5;
-          if (EFLAG) {
-            sevdwl += ev_pre * evdwl;
-            secoul += ev_pre * ecoul;
-            if (eatom) {
-              if (NEWTON_PAIR || i < nlocal)
-                fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
-              if (NEWTON_PAIR || j < nlocal)
-                f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
-            }
-          }
-          IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
-        }
-#ifdef INTEL_VMASK
-      }
-#endif
+        const flt_t fpair = (forcecoul + forcelj) * r2inv;
+        const flt_t fpx = fpair * tdelx[jj];
+        fxtmp += fpx;
+        if (NEWTON_PAIR) f[j].x -= fpx;
+        const flt_t fpy = fpair * tdely[jj];
+        fytmp += fpy;
+        if (NEWTON_PAIR) f[j].y -= fpy;
+        const flt_t fpz = fpair * tdelz[jj];
+        fztmp += fpz;
+        if (NEWTON_PAIR) f[j].z -= fpz;
+
+        if (EFLAG) {
+          sevdwl += evdwl;
+          secoul += ecoul;
+          if (eatom) {
+            fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+            if (NEWTON_PAIR)
+              f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+          }
+        }
+        if (NEWTON_PAIR == 0)
+          IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
+                                fpx, fpy, fpz);
       } // for jj

-      f[i].x += fxtmp;
-      f[i].y += fytmp;
-      f[i].z += fztmp;
-      IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
+      if (NEWTON_PAIR) {
+        f[i].x += fxtmp;
+        f[i].y += fytmp;
+        f[i].z += fztmp;
+      } else {
+        f[i].x = fxtmp;
+        f[i].y = fytmp;
+        f[i].z = fztmp;
+      }
+
+      IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
     } // for ii

-#ifndef _LMP_INTEL_OFFLOAD
-    if (vflag == 2)
-#endif
-    {
-#if defined(_OPENMP)
-#pragma omp barrier
-#endif
-      IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
-                             nlocal, minlocal, nthreads, f_start, f_stride,
-                             x, offload);
-    }
+    IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
+                            f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                            ov4, ov5);
   } // end of omp parallel region

-  if (EVFLAG) {
-    if (EFLAG) {
-      ev_global[0] = oevdwl;
-      ev_global[1] = oecoul;
-    }
-    if (vflag) {
-      ev_global[2] = ov0;
-      ev_global[3] = ov1;
-      ev_global[4] = ov2;
-      ev_global[5] = ov3;
-      ev_global[6] = ov4;
-      ev_global[7] = ov5;
-    }
-  }
+  IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
+                      ov0, ov1, ov2, ov3, ov4, ov5);
+
+  if (EFLAG) {
+    if (NEWTON_PAIR == 0) {
+      oevdwl *= (acc_t)0.5;
+      oecoul *= (acc_t)0.5;
+    }
+    ev_global[0] = oevdwl;
+    ev_global[1] = oecoul;
+  }
+  if (vflag) {
+    if (NEWTON_PAIR == 0) {
+      ov0 *= (acc_t)0.5;
+      ov1 *= (acc_t)0.5;
+      ov2 *= (acc_t)0.5;
+      ov3 *= (acc_t)0.5;
+      ov4 *= (acc_t)0.5;
+      ov5 *= (acc_t)0.5;
+    }
+    ev_global[2] = ov0;
+    ev_global[3] = ov1;
+    ev_global[4] = ov2;
+    ev_global[5] = ov3;
+    ev_global[6] = ov4;
+    ev_global[7] = ov5;
+  }
 #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
   *timer_compute = MIC_Wtime() - *timer_compute;
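Several of the hunks above halve the accumulated energies and virials when NEWTON_PAIR is zero and skip the f[j] update in that case. A toy illustration of why the factor of one half appears when a full neighbor list visits every pair from both sides is sketched below; the names and numbers are invented and are not taken from LAMMPS.

#include <cstdio>

// Illustrative only (not from the patch): accumulate a pair energy over a
// half list (each pair once, "newton on") and over a full list (each pair
// twice, "newton off"), then halve the full-list total at the end.
int main() {
  const double e[3] = {1.0, 2.0, 3.0};   // energies of three hypothetical pairs

  double e_half = 0.0;                   // half list: each pair visited once
  for (int p = 0; p < 3; ++p) e_half += e[p];

  double e_full = 0.0;                   // full list: each pair visited twice
  for (int p = 0; p < 3; ++p) e_full += e[p];
  for (int p = 0; p < 3; ++p) e_full += e[p];
  e_full *= 0.5;                         // the trailing *= 0.5 seen in the hunks

  std::printf("half-list total = %g, halved full-list total = %g\n",
              e_half, e_full);
  return 0;
}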
@@ -442,7 +463,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
   else
     fix->stop_watch(TIME_HOST_PAIR);

-  if (EVFLAG)
+  if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
@@ -453,6 +474,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
 void PairLJCutCoulLongIntel::init_style()
 {
   PairLJCutCoulLong::init_style();
+  if (force->newton_pair == 0) {
+    neighbor->requests[neighbor->nrequest-1]->half = 0;
+    neighbor->requests[neighbor->nrequest-1]->full = 1;
+  }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;

   int ifix = modify->find_fix("package_intel");
@@ -480,6 +505,13 @@ template <class flt_t, class acc_t>
 void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
                                               IntelBuffers<flt_t,acc_t> *buffers)
 {
+  int off_ccache = 0;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop >= 0) off_ccache = 1;
+  #endif
+  buffers->grow_ccache(off_ccache, comm->nthreads, 1);
+  _ccache_stride = buffers->ccache_stride();
+
   int tp1 = atom->ntypes + 1;
   int ntable = 1;
   if (ncoultablebits)
@@ -514,6 +546,9 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,

   for (int i = 0; i < tp1; i++) {
     for (int j = 0; j < tp1; j++) {
+      if (cutsq[i][j] < cut_ljsq[i][j])
+        error->all(FLERR,
+          "Intel variant of lj/cut/coul/long expects lj cutoff<=coulombic");
       fc.c_force[i][j].cutsq = cutsq[i][j];
       fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j];
       fc.c_force[i][j].lj1 = lj1[i][j];
@@ -42,13 +42,13 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong {

  private:
   FixIntel *fix;
-  int _cop, _lrt;
+  int _cop, _lrt, _ccache_stride;

   template <class flt_t> class ForceConst;
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
             IntelBuffers<flt_t,acc_t> * buffers,
             const ForceConst<flt_t> &fc, const int astart, const int aend);
|
|||||||
@ -75,85 +75,64 @@ void PairLJCutIntel::compute(int eflag, int vflag,
|
|||||||
if (ago != 0 && fix->separate_buffers() == 0) {
|
if (ago != 0 && fix->separate_buffers() == 0) {
|
||||||
fix->start_watch(TIME_PACK);
|
fix->start_watch(TIME_PACK);
|
||||||
|
|
||||||
|
int packthreads;
|
||||||
|
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||||
|
else packthreads = 1;
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
#pragma omp parallel if(packthreads > 1)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||||
nthreads, sizeof(ATOM_T));
|
packthreads, sizeof(ATOM_T));
|
||||||
buffers->thr_pack(ifrom,ito,ago);
|
buffers->thr_pack(ifrom,ito,ago);
|
||||||
}
|
}
|
||||||
fix->stop_watch(TIME_PACK);
|
fix->stop_watch(TIME_PACK);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ovflag = 0;
|
||||||
|
if (vflag_fdotr) ovflag = 2;
|
||||||
|
else if (vflag) ovflag = 1;
|
||||||
if (_onetype) {
|
if (_onetype) {
|
||||||
if (evflag || vflag_fdotr) {
|
if (eflag) {
|
||||||
int ovflag = 0;
|
if (force->newton_pair) {
|
||||||
if (vflag_fdotr) ovflag = 2;
|
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
else if (vflag) ovflag = 1;
|
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
if (eflag) {
|
|
||||||
if (force->newton_pair) {
|
|
||||||
eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
} else {
|
|
||||||
eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
eval<1,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
} else {
|
|
||||||
eval<1,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<1,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<1,0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,0,0,1>(0, 0, buffers, fc, host_start, inum);
|
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum);
|
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (evflag || vflag_fdotr) {
|
if (eflag) {
|
||||||
int ovflag = 0;
|
if (force->newton_pair) {
|
||||||
if (vflag_fdotr) ovflag = 2;
|
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
else if (vflag) ovflag = 1;
|
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
if (eflag) {
|
|
||||||
if (force->newton_pair) {
|
|
||||||
eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
} else {
|
|
||||||
eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
eval<0,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
} else {
|
|
||||||
eval<0,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
|
||||||
eval<0,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (force->newton_pair) {
|
if (force->newton_pair) {
|
||||||
eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum);
|
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
} else {
|
} else {
|
||||||
eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||||
eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum);
|
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t,
|
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||||
class acc_t>
|
|
||||||
void PairLJCutIntel::eval(const int offload, const int vflag,
|
void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc,
|
const ForceConst<flt_t> &fc,
|
||||||
@ -181,7 +160,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
|
|
||||||
// Determine how much data to transfer
|
// Determine how much data to transfer
|
||||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
|
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||||
buffers, offload, fix, separate_flag,
|
buffers, offload, fix, separate_flag,
|
||||||
x_size, q_size, ev_size, f_stride);
|
x_size, q_size, ev_size, f_stride);
|
||||||
|
|
||||||
@ -200,25 +179,24 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
f_stride, x, 0);
|
f_stride, x, 0);
|
||||||
|
|
||||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EVFLAG) {
|
if (EFLAG) oevdwl = (acc_t)0;
|
||||||
oevdwl = (acc_t)0;
|
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// loop over neighbors of my atoms
|
// loop over neighbors of my atoms
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
|
||||||
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int iifrom, iito, tid;
|
int iifrom, iip, iito, tid;
|
||||||
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
|
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||||
iifrom += astart;
|
iifrom += astart;
|
||||||
iito += astart;
|
iito += astart;
|
||||||
|
|
||||||
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
|
int foff;
|
||||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
|
||||||
|
else foff = -minlocal;
|
||||||
|
FORCE_T * _noalias const f = f_start + foff;
|
||||||
|
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||||
|
|
||||||
flt_t cutsq, lj1, lj2, lj3, lj4, offset;
|
flt_t cutsq, lj1, lj2, lj3, lj4, offset;
|
||||||
if (ONETYPE) {
|
if (ONETYPE) {
|
||||||
@ -229,7 +207,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
lj4 = lj34[3].lj4;
|
lj4 = lj34[3].lj4;
|
||||||
offset = ljc12o[3].offset;
|
offset = ljc12o[3].offset;
|
||||||
}
|
}
|
||||||
for (int i = iifrom; i < iito; ++i) {
|
for (int i = iifrom; i < iito; i += iip) {
|
||||||
int itype, ptr_off;
|
int itype, ptr_off;
|
||||||
const FC_PACKED1_T * _noalias ljc12oi;
|
const FC_PACKED1_T * _noalias ljc12oi;
|
||||||
const FC_PACKED2_T * _noalias lj34i;
|
const FC_PACKED2_T * _noalias lj34i;
|
||||||
@ -250,10 +228,9 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
const flt_t ytmp = x[i].y;
|
const flt_t ytmp = x[i].y;
|
||||||
const flt_t ztmp = x[i].z;
|
const flt_t ztmp = x[i].z;
|
||||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||||
if (EVFLAG) {
|
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
if (NEWTON_PAIR == 0)
|
||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
@ -301,83 +278,84 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
else
|
else
|
||||||
fpair = forcelj * r2inv;
|
fpair = forcelj * r2inv;
|
||||||
|
|
||||||
fxtmp += delx * fpair;
|
const flt_t fpx = fpair * delx;
|
||||||
fytmp += dely * fpair;
|
fxtmp += fpx;
|
||||||
fztmp += delz * fpair;
|
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||||
if (NEWTON_PAIR || j < nlocal) {
|
const flt_t fpy = fpair * dely;
|
||||||
f[j].x -= delx * fpair;
|
fytmp += fpy;
|
||||||
f[j].y -= dely * fpair;
|
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||||
f[j].z -= delz * fpair;
|
const flt_t fpz = fpair * delz;
|
||||||
}
|
fztmp += fpz;
|
||||||
|
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG) {
|
||||||
flt_t ev_pre = (flt_t)0;
|
if (!ONETYPE) {
|
||||||
if (NEWTON_PAIR || i<nlocal)
|
lj3 = lj34i[jtype].lj3;
|
||||||
ev_pre += (flt_t)0.5;
|
lj4 = lj34i[jtype].lj4;
|
||||||
if (NEWTON_PAIR || j<nlocal)
|
offset = ljc12oi[jtype].offset;
|
||||||
ev_pre += (flt_t)0.5;
|
}
|
||||||
|
evdwl = r6inv * (lj3 * r6inv - lj4);
|
||||||
if (EFLAG) {
|
#ifdef INTEL_VMASK
|
||||||
if (!ONETYPE) {
|
evdwl -= offset;
|
||||||
lj3 = lj34i[jtype].lj3;
|
#else
|
||||||
lj4 = lj34i[jtype].lj4;
|
if (rsq < cutsq) evdwl -= offset;
|
||||||
offset = ljc12oi[jtype].offset;
|
#endif
|
||||||
}
|
if (!ONETYPE) evdwl *= factor_lj;
|
||||||
evdwl = r6inv * (lj3 * r6inv - lj4);
|
sevdwl += evdwl;
|
||||||
#ifdef INTEL_VMASK
|
if (eatom) {
|
||||||
evdwl -= offset;
|
fwtmp += (flt_t)0.5 * evdwl;
|
||||||
#else
|
if (NEWTON_PAIR)
|
||||||
if (rsq < cutsq) evdwl -= offset;
|
f[j].w += (flt_t)0.5 * evdwl;
|
||||||
#endif
|
|
||||||
if (!ONETYPE) evdwl *= factor_lj;
|
|
||||||
sevdwl += ev_pre*evdwl;
|
|
||||||
if (eatom) {
|
|
||||||
if (NEWTON_PAIR || i < nlocal)
|
|
||||||
fwtmp += 0.5 * evdwl;
|
|
||||||
if (NEWTON_PAIR || j < nlocal)
|
|
||||||
f[j].w += 0.5 * evdwl;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
|
if (NEWTON_PAIR == 0)
|
||||||
delx, dely, delz);
|
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
|
||||||
}
|
|
||||||
#ifdef INTEL_VMASK
|
#ifdef INTEL_VMASK
|
||||||
} // if rsq
|
} // if rsq
|
||||||
#endif
|
#endif
|
||||||
} // for jj
|
} // for jj
|
||||||
f[i].x += fxtmp;
|
if (NEWTON_PAIR) {
|
||||||
f[i].y += fytmp;
|
f[i].x += fxtmp;
|
||||||
f[i].z += fztmp;
|
f[i].y += fytmp;
|
||||||
|
f[i].z += fztmp;
|
||||||
|
} else {
|
||||||
|
f[i].x = fxtmp;
|
||||||
|
f[i].y = fytmp;
|
||||||
|
f[i].z = fztmp;
|
||||||
|
}
|
||||||
|
|
||||||
IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
|
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||||
} // for ii
|
} // for ii
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||||
if (vflag == 2)
|
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||||
#endif
|
ov4, ov5);
|
||||||
{
|
|
||||||
#if defined(_OPENMP)
|
|
||||||
#pragma omp barrier
|
|
||||||
#endif
|
|
||||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
|
||||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
|
||||||
x, offload);
|
|
||||||
}
|
|
||||||
} // end omp
|
} // end omp
|
||||||
if (EVFLAG) {
|
|
||||||
if (EFLAG) {
|
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||||
ev_global[0] = oevdwl;
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
ev_global[1] = (acc_t)0.0;
|
|
||||||
}
|
if (EFLAG) {
|
||||||
if (vflag) {
|
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
||||||
ev_global[2] = ov0;
|
ev_global[0] = oevdwl;
|
||||||
ev_global[3] = ov1;
|
ev_global[1] = (acc_t)0.0;
|
||||||
ev_global[4] = ov2;
|
}
|
||||||
ev_global[5] = ov3;
|
if (vflag) {
|
||||||
ev_global[6] = ov4;
|
if (NEWTON_PAIR == 0) {
|
||||||
ev_global[7] = ov5;
|
ov0 *= (acc_t)0.5;
|
||||||
|
ov1 *= (acc_t)0.5;
|
||||||
|
ov2 *= (acc_t)0.5;
|
||||||
|
ov3 *= (acc_t)0.5;
|
||||||
|
ov4 *= (acc_t)0.5;
|
||||||
|
ov5 *= (acc_t)0.5;
|
||||||
}
|
}
|
||||||
|
ev_global[2] = ov0;
|
||||||
|
ev_global[3] = ov1;
|
||||||
|
ev_global[4] = ov2;
|
||||||
|
ev_global[5] = ov3;
|
||||||
|
ev_global[6] = ov4;
|
||||||
|
ev_global[7] = ov5;
|
||||||
}
|
}
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
@ -389,7 +367,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
else
|
else
|
||||||
fix->stop_watch(TIME_HOST_PAIR);
|
fix->stop_watch(TIME_HOST_PAIR);
|
||||||
|
|
||||||
if (EVFLAG)
|
if (EFLAG || vflag)
|
||||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||||
else
|
else
|
||||||
fix->add_result_array(f_start, 0, offload);
|
fix->add_result_array(f_start, 0, offload);
|
||||||
@ -400,6 +378,10 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
|||||||
void PairLJCutIntel::init_style()
|
void PairLJCutIntel::init_style()
|
||||||
{
|
{
|
||||||
PairLJCut::init_style();
|
PairLJCut::init_style();
|
||||||
|
if (force->newton_pair == 0) {
|
||||||
|
neighbor->requests[neighbor->nrequest-1]->half = 0;
|
||||||
|
neighbor->requests[neighbor->nrequest-1]->full = 1;
|
||||||
|
}
|
||||||
neighbor->requests[neighbor->nrequest-1]->intel = 1;
|
neighbor->requests[neighbor->nrequest-1]->intel = 1;
|
||||||
|
|
||||||
int ifix = modify->find_fix("package_intel");
|
int ifix = modify->find_fix("package_intel");
|
||||||
|
|||||||
@@ -45,8 +45,7 @@ class PairLJCutIntel : public PairLJCut {
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t,
-            class acc_t>
+  template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
             IntelBuffers<flt_t,acc_t> * buffers,
             const ForceConst<flt_t> &fc, const int astart, const int aend);
 50  src/USER-INTEL/pair_lj_long_coul_long_intel.cpp  Normal file
@@ -0,0 +1,50 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: William McDoniel (RWTH Aachen University)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include "pair_lj_long_coul_long_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "group.h"
+#include "kspace.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "memory.h"
+#include "suffix.h"
+
+
+using namespace LAMMPS_NS;
+
+#define C_FORCE_T typename ForceConst<flt_t>::c_force_t
+#define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
+#define TABLE_T typename ForceConst<flt_t>::table_t
+
+PairLJLongCoulLongIntel::PairLJLongCoulLongIntel(LAMMPS *lmp) :
+  PairLJLongCoulLong(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+  respa_enable = 0;
+  cut_respa = NULL;
+}
+
+
+PairLJLongCoulLongIntel::~PairLJLongCoulLongIntel()
+{
+}
 39  src/USER-INTEL/pair_lj_long_coul_long_intel.h  Normal file
@@ -0,0 +1,39 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: William McDoniel (RWTH Aachen University)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/long/coul/long/intel,PairLJLongCoulLongIntel)
+
+#else
+
+#ifndef LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H
+#define LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H
+
+#include "pair_lj_long_coul_long.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+class PairLJLongCoulLongIntel : public PairLJLongCoulLong {
+ public:
+  PairLJLongCoulLongIntel(class LAMMPS *);
+  virtual ~PairLJLongCoulLongIntel();
+
+};
+}
+#endif
+#endif
@ -109,85 +109,59 @@ void PairSWIntel::compute(int eflag, int vflag,
|
|||||||
if (ago != 0 && fix->separate_buffers() == 0) {
|
if (ago != 0 && fix->separate_buffers() == 0) {
|
||||||
fix->start_watch(TIME_PACK);
|
fix->start_watch(TIME_PACK);
|
||||||
|
|
||||||
|
int packthreads;
|
||||||
|
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||||
|
else packthreads = 1;
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
#pragma omp parallel if(packthreads > 1)
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int ifrom, ito, tid;
|
int ifrom, ito, tid;
|
||||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||||
nthreads, sizeof(ATOM_T));
|
packthreads, sizeof(ATOM_T));
|
||||||
buffers->thr_pack(ifrom, ito, ago);
|
buffers->thr_pack(ifrom, ito, ago);
|
||||||
}
|
}
|
||||||
|
|
||||||
fix->stop_watch(TIME_PACK);
|
fix->stop_watch(TIME_PACK);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ovflag = 0;
|
||||||
|
if (vflag_fdotr) ovflag = 2;
|
||||||
|
else if (vflag) ovflag = 1;
|
||||||
if (_onetype) {
|
if (_onetype) {
|
||||||
if (_spq) {
|
if (_spq) {
|
||||||
if (evflag || vflag_fdotr) {
|
if (eflag) {
|
||||||
int ovflag = 0;
|
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||||
if (vflag_fdotr) ovflag = 2;
|
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||||
else if (vflag) ovflag = 1;
|
|
||||||
if (eflag) {
|
|
||||||
eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
|
||||||
eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
|
||||||
} else {
|
|
||||||
eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
|
||||||
eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
eval<1,1,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
|
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||||
eval<1,1,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
|
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (evflag || vflag_fdotr) {
|
if (eflag) {
|
||||||
int ovflag = 0;
|
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||||
if (vflag_fdotr) ovflag = 2;
|
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||||
else if (vflag) ovflag = 1;
|
|
||||||
if (eflag) {
|
|
||||||
eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
|
||||||
eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
|
||||||
} else {
|
|
||||||
eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
|
||||||
eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
eval<0,1,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
|
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||||
eval<0,1,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
|
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (_spq) {
|
if (_spq) {
|
||||||
if (evflag || vflag_fdotr) {
|
if (eflag) {
|
||||||
int ovflag = 0;
|
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||||
if (vflag_fdotr) ovflag = 2;
|
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||||
else if (vflag) ovflag = 1;
|
|
||||||
if (eflag) {
|
|
||||||
eval<1,0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
|
||||||
eval<1,0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
|
||||||
} else {
|
|
||||||
eval<1,0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
|
||||||
eval<1,0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
|
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||||
eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
|
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (evflag || vflag_fdotr) {
|
if (eflag) {
|
||||||
int ovflag = 0;
|
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||||
if (vflag_fdotr) ovflag = 2;
|
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||||
else if (vflag) ovflag = 1;
|
|
||||||
if (eflag) {
|
|
||||||
eval<0,0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
|
||||||
eval<0,0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
|
||||||
} else {
|
|
||||||
eval<0,0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
|
||||||
eval<0,0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
|
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||||
eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
|
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -196,7 +170,7 @@ void PairSWIntel::compute(int eflag, int vflag,
|
|||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
#ifndef LMP_USE_AVXCD
|
#ifndef LMP_USE_AVXCD
|
||||||
|
|
||||||
template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t>
|
template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t>
|
||||||
void PairSWIntel::eval(const int offload, const int vflag,
|
void PairSWIntel::eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc, const int astart,
|
const ForceConst<flt_t> &fc, const int astart,
|
||||||
@ -235,7 +209,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
|
|
||||||
// Determine how much data to transfer
|
// Determine how much data to transfer
|
||||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||||
IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EVFLAG, EFLAG, vflag,
|
IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EFLAG, vflag,
|
||||||
buffers, offload, fix, separate_flag,
|
buffers, offload, fix, separate_flag,
|
||||||
x_size, q_size, ev_size, f_stride);
|
x_size, q_size, ev_size, f_stride);
|
||||||
|
|
||||||
@ -276,19 +250,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
f_stride, x, 0);
|
f_stride, x, 0);
|
||||||
|
|
||||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EVFLAG) {
|
if (EFLAG) oevdwl = (acc_t)0;
|
||||||
oevdwl = (acc_t)0;
|
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
|
||||||
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int iifrom, iito, tid;
|
int iifrom, iip, iito, tid;
|
||||||
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
|
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||||
iifrom += astart;
|
iifrom += astart;
|
||||||
iito += astart;
|
iito += astart;
|
||||||
|
|
||||||
@ -328,7 +298,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = iifrom; i < iito; ++i) {
|
for (int i = iifrom; i < iito; i += iip) {
|
||||||
int itype, itype_offset;
|
int itype, itype_offset;
|
||||||
const flt_t xtmp = x[i].x;
|
const flt_t xtmp = x[i].x;
|
||||||
const flt_t ytmp = x[i].y;
|
const flt_t ytmp = x[i].y;
|
||||||
@ -344,14 +314,13 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
const int jnumhalf = numneighhalf[i];
|
const int jnumhalf = numneighhalf[i];
|
||||||
|
|
||||||
acc_t fxtmp, fytmp, fztmp, fwtmp;
|
acc_t fxtmp, fytmp, fztmp, fwtmp;
|
||||||
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
|
acc_t sevdwl;
|
||||||
fxtmp = fytmp = fztmp = (acc_t)0.0;
|
fxtmp = fytmp = fztmp = (acc_t)0.0;
|
||||||
if (EVFLAG) {
|
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
|
||||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int ejnum = 0, ejnumhalf = 0;
|
int ejnum = 0, ejnumhalf = 0;
|
||||||
|
#pragma vector aligned
|
||||||
|
#pragma ivdep
|
||||||
for (int jj = 0; jj < jnum; jj++) {
|
for (int jj = 0; jj < jnum; jj++) {
|
||||||
int j = jlist[jj];
|
int j = jlist[jj];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
@ -390,8 +359,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
|
|
||||||
#if defined(LMP_SIMD_COMPILER)
|
#if defined(LMP_SIMD_COMPILER)
|
||||||
#pragma vector aligned
|
#pragma vector aligned
|
||||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl)
|
||||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
|
||||||
#endif
|
#endif
|
||||||
for (int jj = 0; jj < ejnum_pad; jj++) {
|
for (int jj = 0; jj < ejnum_pad; jj++) {
|
||||||
acc_t fjxtmp, fjytmp, fjztmp, fjtmp;
|
acc_t fjxtmp, fjytmp, fjztmp, fjtmp;
|
||||||
@ -399,9 +367,6 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
if (EFLAG) fjtmp = (acc_t)0.0;
|
if (EFLAG) fjtmp = (acc_t)0.0;
|
||||||
int ijtype;
|
int ijtype;
|
||||||
|
|
||||||
const flt_t delx = tdelx[jj];
|
|
||||||
const flt_t dely = tdely[jj];
|
|
||||||
const flt_t delz = tdelz[jj];
|
|
||||||
if (!ONETYPE) ijtype = tjtype[jj] + itype_offset;
|
if (!ONETYPE) ijtype = tjtype[jj] + itype_offset;
|
||||||
const flt_t rsq1 = trsq[jj];
|
const flt_t rsq1 = trsq[jj];
|
||||||
|
|
||||||
@ -440,29 +405,31 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
const flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) *
|
const flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) *
|
||||||
rainvsq) * expsrainv * rinvsq1;
|
rainvsq) * expsrainv * rinvsq1;
|
||||||
|
|
||||||
fxtmp -= delx * fpair;
|
const flt_t delx = tdelx[jj];
|
||||||
fytmp -= dely * fpair;
|
const flt_t dely = tdely[jj];
|
||||||
fztmp -= delz * fpair;
|
const flt_t delz = tdelz[jj];
|
||||||
fjxtmp += delx * fpair;
|
const flt_t fpx = fpair * delx;
|
||||||
fjytmp += dely * fpair;
|
fxtmp -= fpx;
|
||||||
fjztmp += delz * fpair;
|
fjxtmp += fpx;
|
||||||
|
const flt_t fpy = fpair * dely;
|
||||||
|
fytmp -= fpy;
|
||||||
|
fjytmp += fpy;
|
||||||
|
const flt_t fpz = fpair * delz;
|
||||||
|
fztmp -= fpz;
|
||||||
|
fjztmp += fpz;
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG) {
|
||||||
if (EFLAG) {
|
flt_t evdwl;
|
||||||
flt_t evdwl;
|
if (!ONETYPE) {
|
||||||
if (!ONETYPE) {
|
c5 = p2e[ijtype].c5;
|
||||||
c5 = p2e[ijtype].c5;
|
c6 = p2e[ijtype].c6;
|
||||||
c6 = p2e[ijtype].c6;
|
}
|
||||||
}
|
evdwl = (c5 * rp - c6 * rq) * expsrainv;
|
||||||
evdwl = (c5 * rp - c6 * rq) * expsrainv;
|
sevdwl += evdwl;
|
||||||
sevdwl += evdwl;
|
if (eatom) {
|
||||||
if (eatom) {
|
fwtmp += (flt_t)0.5 * evdwl;
|
||||||
fwtmp += (acc_t)0.5 * evdwl;
|
fjtmp += (flt_t)0.5 * evdwl;
|
||||||
fjtmp += (acc_t)0.5 * evdwl;
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
IP_PRE_ev_tally_nbor(vflag, (flt_t)1.0, fpair,
|
|
||||||
-delx, -dely, -delz);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*---------------------------------------------*/
|
/*---------------------------------------------*/
|
||||||
@ -533,17 +500,13 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
fjytmp += fjy;
|
fjytmp += fjy;
|
||||||
fjztmp += fjz;
|
fjztmp += fjz;
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG) {
|
||||||
if (EFLAG) {
|
const flt_t evdwl = facrad * (flt_t)0.5;
|
||||||
const flt_t evdwl = facrad * (flt_t)0.5;
|
sevdwl += evdwl;
|
||||||
sevdwl += evdwl;
|
if (eatom) {
|
||||||
if (eatom) {
|
fwtmp += (acc_t)0.33333333 * evdwl;
|
||||||
fwtmp += (acc_t)0.33333333 * evdwl;
|
fjtmp += (acc_t)0.33333333 * facrad;
|
||||||
fjtmp += (acc_t)0.33333333 * facrad;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
IP_PRE_ev_tally_nbor3v(vflag, fjx, fjy, fjz,
|
|
||||||
delx, dely, delz);
|
|
||||||
}
|
}
|
||||||
} // for kk
|
} // for kk
|
||||||
const int j = tj[jj];
|
const int j = tj[jj];
|
||||||
@ -557,34 +520,31 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
f[i].x += fxtmp;
|
f[i].x += fxtmp;
|
||||||
f[i].y += fytmp;
|
f[i].y += fytmp;
|
||||||
f[i].z += fztmp;
|
f[i].z += fztmp;
|
||||||
IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
|
|
||||||
|
if (EFLAG) {
|
||||||
|
f[i].w += fwtmp;
|
||||||
|
oevdwl += sevdwl;
|
||||||
|
}
|
||||||
} // for ii
|
} // for ii
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride,
|
||||||
if (vflag == 2)
|
x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
#endif
|
|
||||||
{
|
|
||||||
#if defined(_OPENMP)
|
|
||||||
#pragma omp barrier
|
|
||||||
#endif
|
|
||||||
IP_PRE_fdotr_acc_force(1, EVFLAG, EFLAG, vflag, eatom, nall,
|
|
||||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
|
||||||
x, offload);
|
|
||||||
}
|
|
||||||
} // end omp
|
} // end omp
|
||||||
if (EVFLAG) {
|
|
||||||
if (EFLAG) {
|
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
|
||||||
ev_global[0] = oevdwl;
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
ev_global[1] = (acc_t)0.0;
|
|
||||||
}
|
if (EFLAG) {
|
||||||
if (vflag) {
|
ev_global[0] = oevdwl;
|
||||||
ev_global[2] = ov0;
|
ev_global[1] = (acc_t)0.0;
|
||||||
ev_global[3] = ov1;
|
}
|
||||||
ev_global[4] = ov2;
|
if (vflag) {
|
||||||
ev_global[5] = ov3;
|
ev_global[2] = ov0;
|
||||||
ev_global[6] = ov4;
|
ev_global[3] = ov1;
|
||||||
ev_global[7] = ov5;
|
ev_global[4] = ov2;
|
||||||
}
|
ev_global[5] = ov3;
|
||||||
|
ev_global[6] = ov4;
|
||||||
|
ev_global[7] = ov5;
|
||||||
}
|
}
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
@ -595,7 +555,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
else
|
else
|
||||||
fix->stop_watch(TIME_HOST_PAIR);
|
fix->stop_watch(TIME_HOST_PAIR);
|
||||||
|
|
||||||
if (EVFLAG)
|
if (EFLAG || vflag)
|
||||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||||
else
|
else
|
||||||
fix->add_result_array(f_start, 0, offload);
|
fix->add_result_array(f_start, 0, offload);
|
||||||
@ -614,7 +574,7 @@ authors for more details.
|
|||||||
|
|
||||||
------------------------------------------------------------------------- */
|
------------------------------------------------------------------------- */
|
||||||
|
|
||||||
template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t>
|
template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t>
|
||||||
void PairSWIntel::eval(const int offload, const int vflag,
|
void PairSWIntel::eval(const int offload, const int vflag,
|
||||||
IntelBuffers<flt_t,acc_t> *buffers,
|
IntelBuffers<flt_t,acc_t> *buffers,
|
||||||
const ForceConst<flt_t> &fc, const int astart,
|
const ForceConst<flt_t> &fc, const int astart,
|
||||||
@ -659,7 +619,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
|
|
||||||
// Determine how much data to transfer
|
// Determine how much data to transfer
|
||||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||||
IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EVFLAG, EFLAG, vflag,
|
IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EFLAG, vflag,
|
||||||
buffers, offload, fix, separate_flag,
|
buffers, offload, fix, separate_flag,
|
||||||
x_size, q_size, ev_size, f_stride);
|
x_size, q_size, ev_size, f_stride);
|
||||||
|
|
||||||
@ -701,19 +661,17 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
f_stride, x, 0);
|
f_stride, x, 0);
|
||||||
|
|
||||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||||
if (EVFLAG) {
|
if (EFLAG) oevdwl = (acc_t)0;
|
||||||
oevdwl = (acc_t)0;
|
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(_OPENMP)
|
#if defined(_OPENMP)
|
||||||
#pragma omp parallel default(none) \
|
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
|
||||||
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
int iifrom, iito, tid;
|
int iifrom, iip, iito, tid;
|
||||||
IP_PRE_omp_range_id_vec(iifrom, iito, tid, inum, nthreads, swidth);
|
IP_PRE_omp_stride_id_vec(iifrom, iip, iito, tid, inum, nthreads,
|
||||||
|
swidth);
|
||||||
|
|
||||||
iifrom += astart;
|
iifrom += astart;
|
||||||
iito += astart;
|
iito += astart;
|
||||||
|
|
||||||
@ -760,7 +718,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
144,160,176,192,208,224,240);
|
144,160,176,192,208,224,240);
|
||||||
ilist = ilist + iifrom;
|
ilist = ilist + iifrom;
|
||||||
acc_t * const dforce = &(f[0].x);
|
acc_t * const dforce = &(f[0].x);
|
||||||
for (int i = iifrom; i < iito; i += swidth) {
|
for (int i = iifrom; i < iito; i += iip) {
|
||||||
SIMD_mask imask = ilist < iito;
|
SIMD_mask imask = ilist < iito;
|
||||||
SIMD_flt_t xtmp, ytmp, ztmp;
|
SIMD_flt_t xtmp, ytmp, ztmp;
|
||||||
SIMD_int itype, itype_offset;
|
SIMD_int itype, itype_offset;
|
||||||
@ -793,20 +751,10 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
if (EFLAG) fwtmp2 = SIMD_set((acc_t)0);
|
if (EFLAG) fwtmp2 = SIMD_set((acc_t)0);
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMD_acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
|
SIMD_acc_t sevdwl;
|
||||||
if (EVFLAG) {
|
if (EFLAG) {
|
||||||
if (EFLAG) {
|
fwtmp = SIMD_set((acc_t)0);
|
||||||
fwtmp = SIMD_set((acc_t)0);
|
sevdwl = SIMD_set((acc_t)0);
|
||||||
sevdwl = SIMD_set((acc_t)0);
|
|
||||||
}
|
|
||||||
if (vflag==1) {
|
|
||||||
sv0 = SIMD_set((acc_t)0);
|
|
||||||
sv1 = SIMD_set((acc_t)0);
|
|
||||||
sv2 = SIMD_set((acc_t)0);
|
|
||||||
sv3 = SIMD_set((acc_t)0);
|
|
||||||
sv4 = SIMD_set((acc_t)0);
|
|
||||||
sv5 = SIMD_set((acc_t)0);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMD_int ejnum = SIMD_set(0);
|
SIMD_int ejnum = SIMD_set(0);
|
||||||
@ -930,19 +878,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2,
|
fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2,
|
||||||
fztmp2, fjxtmp2, fjytmp2, fjztmp2);
|
fztmp2, fjxtmp2, fjytmp2, fjztmp2);
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG) {
|
||||||
if (EFLAG) {
|
if (!ONETYPE) {
|
||||||
if (!ONETYPE) {
|
c5 = SIMD_gather(&(p2e[0].c5), ijtype);
|
||||||
c5 = SIMD_gather(&(p2e[0].c5), ijtype);
|
c6 = SIMD_gather(&(p2e[0].c6), ijtype);
|
||||||
c6 = SIMD_gather(&(p2e[0].c6), ijtype);
|
|
||||||
}
|
|
||||||
SIMD_flt_t evdwl;
|
|
||||||
evdwl = (c5 * rp - c6 * rq) * expsrainv;
|
|
||||||
SIMD_acc_energy3(hmask, evdwl, eatom, sevdwl, fwtmp, fjtmp,
|
|
||||||
fwtmp2, fjtmp2);
|
|
||||||
}
|
}
|
||||||
SIMD_ev_tally_nbor(hmask, vflag, (flt_t)1.0, fpair, delx, dely,
|
SIMD_flt_t evdwl;
|
||||||
delz, sv0, sv1, sv2, sv3, sv4, sv5);
|
evdwl = (c5 * rp - c6 * rq) * expsrainv;
|
||||||
|
SIMD_acc_energy3(hmask, evdwl, eatom, sevdwl, fwtmp, fjtmp,
|
||||||
|
fwtmp2, fjtmp2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1012,21 +956,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
fztmp2, fjxtmp2, fjytmp2, fjztmp2,
|
fztmp2, fjxtmp2, fjytmp2, fjztmp2,
|
||||||
tf + kcoffset * 3, swidth);
|
tf + kcoffset * 3, swidth);
|
||||||
|
|
||||||
if (EVFLAG) {
|
if (EFLAG) {
|
||||||
if (EFLAG) {
|
SIMD_int k;
|
||||||
SIMD_int k;
|
if (eatom) {
|
||||||
if (eatom) {
|
k = SIMD_load(tj + kcoffset);
|
||||||
k = SIMD_load(tj + kcoffset);
|
k = k << 4;
|
||||||
k = k << 4;
|
|
||||||
}
|
|
||||||
SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp,
|
|
||||||
fwtmp2, fjtmp2, k, dforce);
|
|
||||||
}
|
}
|
||||||
SIMD_ev_tally_nbor3v(kmask, vflag, fjx, fjy, fjz, fkx, fky, fkz,
|
SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp,
|
||||||
delx, dely, delz, delr2x, delr2y, delr2z,
|
fwtmp2, fjtmp2, k, dforce);
|
||||||
sv0, sv1, sv2, sv3, sv4, sv5);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // for kk
|
} // for kk
|
||||||
if (is_same<flt_t,acc_t>::value == 1)
|
if (is_same<flt_t,acc_t>::value == 1)
|
||||||
SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp);
|
SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp);
|
||||||
@ -1087,52 +1025,34 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
} // for jj second loop
|
} // for jj second loop
|
||||||
|
|
||||||
SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp,
|
SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp,
|
||||||
EVFLAG, eatom, fwtmp);
|
EFLAG, eatom, fwtmp);
|
||||||
if (is_same<flt_t,acc_t>::value == 0) {
|
if (is_same<flt_t,acc_t>::value == 0) {
|
||||||
imask = imask >> 8;
|
imask = imask >> 8;
|
||||||
SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2,
|
SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2,
|
||||||
fztmp2, EVFLAG, eatom, fwtmp2);
|
fztmp2, EFLAG, eatom, fwtmp2);
|
||||||
}
|
}
|
||||||
if (EVFLAG) {
|
if (EFLAG) oevdwl += SIMD_sum(sevdwl);
|
||||||
if (EFLAG) oevdwl += SIMD_sum(sevdwl);
|
ilist = ilist + iip;
|
||||||
if (vflag == 1) {
|
|
||||||
ov0 += SIMD_sum(sv0);
|
|
||||||
ov1 += SIMD_sum(sv1);
|
|
||||||
ov2 += SIMD_sum(sv2);
|
|
||||||
ov3 += SIMD_sum(sv3);
|
|
||||||
ov4 += SIMD_sum(sv4);
|
|
||||||
ov5 += SIMD_sum(sv5);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ilist = ilist + swidth;
|
|
||||||
} // for ii
|
} // for ii
|
||||||
|
|
||||||
#ifndef _LMP_INTEL_OFFLOAD
|
IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride,
|
||||||
if (vflag == 2)
|
x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
#endif
|
|
||||||
{
|
|
||||||
#if defined(_OPENMP)
|
|
||||||
#pragma omp barrier
|
|
||||||
#endif
|
|
||||||
IP_PRE_fdotr_acc_force(1, EVFLAG, EFLAG, vflag, eatom, nall, nlocal,
|
|
||||||
minlocal, nthreads, f_start, f_stride, x,
|
|
||||||
offload);
|
|
||||||
}
|
|
||||||
} // end omp
|
} // end omp
|
||||||
|
|
||||||
if (EVFLAG) {
|
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
|
||||||
if (EFLAG) {
|
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||||
ev_global[0] = oevdwl;
|
|
||||||
ev_global[1] = (acc_t)0.0;
|
if (EFLAG) {
|
||||||
}
|
ev_global[0] = oevdwl;
|
||||||
if (vflag) {
|
ev_global[1] = (acc_t)0.0;
|
||||||
ev_global[2] = ov0;
|
}
|
||||||
ev_global[3] = ov1;
|
if (vflag) {
|
||||||
ev_global[4] = ov2;
|
ev_global[2] = ov0;
|
||||||
ev_global[5] = ov3;
|
ev_global[3] = ov1;
|
||||||
ev_global[6] = ov4;
|
ev_global[4] = ov2;
|
||||||
ev_global[7] = ov5;
|
ev_global[5] = ov3;
|
||||||
}
|
ev_global[6] = ov4;
|
||||||
|
ev_global[7] = ov5;
|
||||||
}
|
}
|
||||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||||
@ -1143,7 +1063,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
|||||||
else
|
else
|
||||||
fix->stop_watch(TIME_HOST_PAIR);
|
fix->stop_watch(TIME_HOST_PAIR);
|
||||||
|
|
||||||
if (EVFLAG)
|
if (EFLAG || vflag)
|
||||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||||
else
|
else
|
||||||
fix->add_result_array(f_start, 0, offload);
|
fix->add_result_array(f_start, 0, offload);
|
||||||
@ -1212,6 +1132,7 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
|
|||||||
#ifdef LMP_USE_AVXCD
|
#ifdef LMP_USE_AVXCD
|
||||||
fix->nbor_pack_width(SIMD_type<flt_t>::width());
|
fix->nbor_pack_width(SIMD_type<flt_t>::width());
|
||||||
#endif
|
#endif
|
||||||
|
fix->three_body_neighbor(1);
|
||||||
|
|
||||||
int off_ccache = 0;
|
int off_ccache = 0;
|
||||||
#ifdef _LMP_INTEL_OFFLOAD
|
#ifdef _LMP_INTEL_OFFLOAD
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ class PairSWIntel : public PairSW {
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t>
+  template <int SPQ, int ONETYPE, int EFLAG, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
             IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc,
             const int astart, const int aend, const int pad_width);
@@ -119,32 +119,30 @@ void PairTersoffIntel::compute(int eflag, int vflag,

   if (ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
-                                nthreads, sizeof(ATOM_T));
+                                packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }

-  if (evflag || vflag_fdotr) {
-    int ovflag = 0;
-    if (vflag_fdotr) ovflag = 2;
-    else if (vflag) ovflag = 1;
-    if (eflag) {
-      eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-      eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
-    } else {
-      eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-      eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
-    }
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
+  if (eflag) {
+    eval<1>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1>(0, ovflag, buffers, fc, host_start, inum);
   } else {
-    eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
-    eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+    eval<0>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<0>(0, ovflag, buffers, fc, host_start, inum);
   }
 }

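The packthreads logic above turns the buffer-pack region into a conditional parallel region: with few hardware threads the packing runs serially and skips the fork/join overhead. A small stand-alone sketch of the idiom; the threshold value and the printed work are illustrative only.

#include <omp.h>
#include <cstdio>

int main() {
  const int nthreads = omp_get_max_threads();
  const int threshold = 4;                  // stand-in for INTEL_HTHREADS
  const int packthreads = (nthreads > threshold) ? nthreads : 1;

  #pragma omp parallel if(packthreads > 1)
  {
    // When packthreads == 1 the region runs with a team of one thread.
    #pragma omp critical
    std::printf("packing on thread %d of %d\n",
                omp_get_thread_num(), omp_get_num_threads());
  }
  return 0;
}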
@@ -202,7 +200,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
   );

   // perform the actual computation
-  template<bool EVFLAG, bool EFLAG>
+  template<bool EFLAG>
   static void kernel(
     int iito, int iifrom, int eatom, int vflag,
     const int * _noalias const numneigh,
@@ -213,11 +211,11 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
     const c_inner_t * _noalias const c_inner,
     const c_outer_t * _noalias const c_outer,
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
-    acc_t *evdwl, acc_t *ov0, acc_t * ov1, acc_t *ov2, acc_t* ov3, acc_t *ov4, acc_t *ov5
+    acc_t *evdwl
   );

   // perform one step of calculation, pass in i-j pairs of atoms (is, js)
-  template<int EVFLAG, int EFLAG>
+  template<int EFLAG>
   static void kernel_step(
     int eatom, int vflag,
     const int * _noalias const numneigh,
@@ -228,13 +226,12 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
     const c_inner_t * _noalias const c_inner,
     const c_outer_t * _noalias const c_outer,
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
-    avec *vsevdwl, avec *vsv0, avec * vsv1, avec *vsv2, avec* vsv3, avec *vsv4, avec *vsv5,
-    int compress_idx, iarr is, iarr js, bvec vmask_repulsive
+    avec *vsevdwl, int compress_idx, iarr is, iarr js, bvec vmask_repulsive
   );

   // perform one step of calculation, as opposed to the previous method now
   // with fixed i and a number of js
-  template<int EVFLAG, int EFLAG>
+  template<int EFLAG>
   static void kernel_step_const_i(
     int eatom, int vflag,
     const int * _noalias const numneigh, const int * _noalias const cnumneigh,
@@ -243,8 +240,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
     const c_inner_t * _noalias const c_inner,
     const c_outer_t * _noalias const c_outer,
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
-    avec *vsevdwl, avec *vsv0, avec *vsv1, avec *vsv2, avec *vsv3, avec *vsv4, avec *vsv5,
-    int compress_idx, int i, iarr js, bvec vmask_repulsive
+    avec *vsevdwl, int compress_idx, int i, iarr js, bvec vmask_repulsive
   );
 };

@@ -257,7 +253,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
 // Dispatch to correct kernel instatiation and perform all the work neccesary
 // for offloading. In this routine we enter the Phi.
 // This method is nearly identical to what happens in the other /intel styles
-template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+template <int EFLAG, class flt_t, class acc_t>
 void PairTersoffIntel::eval(const int offload, const int vflag,
                             IntelBuffers<flt_t,acc_t> *buffers,
                             const ForceConst<flt_t> &fc,
@@ -292,7 +288,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag,

   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
-  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+  IP_PRE_get_transfern(ago, 1, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);

@@ -330,20 +326,16 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
 #endif
 #endif

-    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
+    IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall,
                               f_stride, x, 0);

     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EVFLAG) {
-      oevdwl = oecoul = (acc_t)0;
-      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
-    }
+    if (EFLAG) oevdwl = oecoul = (acc_t)0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;

     // loop over neighbors of my atoms
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) \
-      shared(f_start,f_stride,nlocal,nall,minlocal) \
-      reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+    #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iito, tid;
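The simplified pragma leans on an OpenMP reduction to combine the per-thread energy and virial accumulators when the parallel region ends. A tiny sketch of the same construct, with illustrative values:

#include <omp.h>
#include <cstdio>

int main() {
  double oevdwl = 0.0, ov0 = 0.0;

  #pragma omp parallel reduction(+:oevdwl,ov0)
  {
    // Each thread accumulates into private copies; OpenMP sums them at the end.
    oevdwl += 1.0;
    ov0 += 0.25;
  }

  std::printf("energy %g  virial[0] %g\n", oevdwl, ov0);
  return 0;
}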
@@ -355,10 +347,10 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
       memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));

       {
-        acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
-        sevdwl = sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = 0.;
+        acc_t sevdwl;
+        sevdwl = 0.;
 #define ARGS iito, iifrom, eatom, vflag, numneigh, numneighhalf, cnumneigh, \
-  firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl, &sv0, &sv1, &sv2, &sv3, &sv4, &sv5
+  firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl
         // Pick the variable i algorithm under specific conditions
         // do use scalar algorithm with very short vectors
         int VL = lmp_intel::vector_routines<flt_t,acc_t,lmp_intel::mode>::VL;
@@ -366,50 +358,34 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
           lmp_intel::vector_traits<lmp_intel::mode>::support_integer_and_gather_ops;
         bool use_scalar = VL < 4;
         if (use_scalar) {
-          IntelKernelTersoff<flt_t,acc_t,lmp_intel::NONE,false>::kernel<EVFLAG,EFLAG>(ARGS);
+          IntelKernelTersoff<flt_t,acc_t,lmp_intel::NONE,false>::kernel<EFLAG>(ARGS);
         } else if (pack_i) {
-          IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,true >::kernel<EVFLAG,EFLAG>(ARGS);
+          IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,true >::kernel<EFLAG>(ARGS);
         } else {
-          IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EVFLAG,EFLAG>(ARGS);
-        }
-        if (EVFLAG) {
-          if (EFLAG) oevdwl += sevdwl;
-          if (vflag == 1) {
-            ov0 += sv0;
-            ov1 += sv1;
-            ov2 += sv2;
-            ov3 += sv3;
-            ov4 += sv4;
-            ov5 += sv5;
-          }
+          IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EFLAG>(ARGS);
         }
+        if (EFLAG) oevdwl += sevdwl;
       }

-      #ifndef _LMP_INTEL_OFFLOAD
-      if (vflag == 2)
-      #endif
-      {
-        #if defined(_OPENMP)
-        #pragma omp barrier
-        #endif
-        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
-                               nlocal, minlocal, nthreads, f_start, f_stride,
-                               x, offload);
-      }
+      IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start,
+                              f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                              ov4, ov5);
     } // end of omp parallel region
-    if (EVFLAG) {
-      if (EFLAG) {
-        ev_global[0] = oevdwl;
-        ev_global[1] = 0.0;
-      }
-      if (vflag) {
-        ev_global[2] = ov0;
-        ev_global[3] = ov1;
-        ev_global[4] = ov2;
-        ev_global[5] = ov3;
-        ev_global[6] = ov4;
-        ev_global[7] = ov5;
-      }
+
+    IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
+                        ov0, ov1, ov2, ov3, ov4, ov5);
+
+    if (EFLAG) {
+      ev_global[0] = oevdwl;
+      ev_global[1] = 0.0;
+    }
+    if (vflag) {
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
     }

 #ifdef _LMP_INTEL_OFFLOAD
@@ -424,7 +400,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
   else
     fix->stop_watch(TIME_HOST_PAIR);

-  if (EVFLAG)
+  if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
@@ -457,6 +433,7 @@ void PairTersoffIntel::init_style()
   fix = static_cast<FixIntel *>(modify->fix[ifix]);

   fix->pair_init_check();
+  fix->three_body_neighbor(1);
 #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
 #endif
@@ -663,7 +640,7 @@ void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
 static const int N_CACHE = 8;

 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
-template<int EVFLAG, int EFLAG>
+template<int EFLAG>
 void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
   int eatom, int vflag,
   const int * _noalias const numneigh, const int * _noalias const cnumneigh,
@@ -673,12 +650,6 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
   const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
   typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
   avec *vsevdwl,
-  avec *vsv0,
-  avec *vsv1,
-  avec *vsv2,
-  avec* vsv3,
-  avec *vsv4,
-  avec *vsv5,
   int compress_idx,
   iarr is,
   iarr js,
@@ -829,20 +800,10 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
     vfjytmp = vfjytmp * vprefactor - vdy_ij * vfpair;
     vfjztmp = vfjztmp * vprefactor - vdz_ij * vfpair;

-    if (EVFLAG) {
-      if (EFLAG) {
-        *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
-        if (eatom) {
-          v::store(fw, (v_0_5 * vevdwl));
-        }
-      }
-      if (vflag == 1) {
-        *vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair);
-        *vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair);
-        *vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair);
-        *vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair);
-        *vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair);
-        *vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair);
+    if (EFLAG) {
+      *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
+      if (eatom) {
+        v::store(fw, (v_0_5 * vevdwl));
       }
     }
     {
@@ -933,7 +894,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
       f[t_].x += fx[t];
       f[t_].y += fy[t];
       f[t_].z += fz[t];
-      if (EVFLAG && EFLAG && eatom) {
+      if (EFLAG && eatom) {
        f[t_].w += fw[t];
       }
     }
@@ -945,7 +906,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
       f[t_].x += fx[t];
       f[t_].y += fy[t];
       f[t_].z += fz[t];
-      if (EVFLAG && EFLAG && eatom) {
+      if (EFLAG && eatom) {
        f[t_].w += fw[t];
       }
     }
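The v::acc_mask_add calls that remain express a masked accumulation: only SIMD lanes whose mask bit is set contribute to the running sums. A scalar sketch of the same operation; the vector length and the values are made up for illustration.

#include <cstdio>

int main() {
  const int VL = 8;                       // stand-in for the SIMD width
  double vevdwl[VL] = {1, 2, 3, 4, 5, 6, 7, 8};
  bool   vmask[VL]  = {true, false, true, true, false, true, true, false};

  double vsevdwl = 0.0;                   // what acc_mask_add accumulates into
  for (int lane = 0; lane < VL; ++lane)
    if (vmask[lane]) vsevdwl += vevdwl[lane];

  std::printf("masked energy sum = %g\n", vsevdwl);   // 1+3+4+6+7 = 21
  return 0;
}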
@@ -954,7 +915,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
 // Specialized kernel step for fixed i, means that we don't have to use the
 // convoluted iteration scheme above, as the loop variables are uniform.
 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
-template<int EVFLAG, int EFLAG>
+template<int EFLAG>
 void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
   int eatom, int vflag,
   const int * _noalias const numneigh, const int * _noalias const cnumneigh,
@@ -964,12 +925,6 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
   const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
   typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
   avec *vsevdwl,
-  avec *vsv0,
-  avec *vsv1,
-  avec *vsv2,
-  avec* vsv3,
-  avec *vsv4,
-  avec *vsv5,
   int compress_idx,
   int i,
   iarr js,
@@ -1097,21 +1052,11 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
     vfjytmp = vfjytmp * vaprefactor - avec(vdy_ij * vfpair);
     vfjztmp = vfjztmp * vaprefactor - avec(vdz_ij * vfpair);

-    if (EVFLAG) {
-      if (EFLAG) {
-        *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
-        if (eatom) {
-          vfwtmp = v_0_5 * vevdwl;
-          v::store(fw, vfwtmp);
-        }
-      }
-      if (vflag == 1) {
-        *vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair);
-        *vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair);
-        *vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair);
-        *vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair);
-        *vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair);
-        *vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair);
+    if (EFLAG) {
+      *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
+      if (eatom) {
+        vfwtmp = v_0_5 * vevdwl;
+        v::store(fw, vfwtmp);
       }
     }
     while (cache_idx-- > 0) {
@@ -1169,20 +1114,20 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
       f[t_].x += fx[t];
       f[t_].y += fy[t];
       f[t_].z += fz[t];
-      if (EVFLAG && EFLAG && eatom) {
+      if (EFLAG && eatom) {
        f[t_].w += fw[t];
       }
     }
     f[i].x += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfxtmp, v::zero()));
     f[i].y += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfytmp, v::zero()));
     f[i].z += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfztmp, v::zero()));
-    if (EVFLAG && EFLAG && eatom) {
+    if (EFLAG && eatom) {
      f[i].z += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfwtmp, v::zero()));
     }
   }

 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
-template<bool EVFLAG, bool EFLAG>
+template<bool EFLAG>
 void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
   int iito, int iifrom, int eatom, int vflag,
   const int * _noalias const numneigh,
@@ -1193,14 +1138,12 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
   const c_inner_t * _noalias const c_inner,
   const c_outer_t * _noalias const c_outer,
   typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
-  acc_t *evdwl, acc_t *ov0, acc_t * ov1, acc_t *ov2, acc_t* ov3, acc_t *ov4, acc_t *ov5
+  acc_t *evdwl
 ) {
   int compress_idx = 0;
   int ii, jj;
   iarr is, js;
   avec vsevdwl = v::acc_zero();
-  avec vsv0 = v::acc_zero(), vsv1 = v::acc_zero(), vsv2 = v::acc_zero();
-  avec vsv3 = v::acc_zero(), vsv4 = v::acc_zero(), vsv5 = v::acc_zero();
   ivec v_i4floats(static_cast<int>(sizeof(typename v::fscal) * 4));
   ivec vj, v_NEIGHMASK(NEIGHMASK);
   bvec vmask_repulsive(0);
@@ -1237,11 +1180,11 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
     if (pack_i) {
       if (compress_idx == v::VL) {
         vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
-        kernel_step<EVFLAG,EFLAG>(
+        kernel_step<EFLAG>(
           eatom, vflag,
           numneigh, cnumneigh, firstneigh, ntypes,
           x, c_inner, c_outer, f,
-          &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx,
+          &vsevdwl, compress_idx,
           is, js, vmask_repulsive
         );
         compress_idx = 0;
@@ -1250,11 +1193,11 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
     } else {
       if (compress_idx == v::VL || (compress_idx > 0 && jj == jnum-1)) {
         vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
-        kernel_step_const_i<EVFLAG,EFLAG>(
+        kernel_step_const_i<EFLAG>(
           eatom, vflag,
           numneigh, cnumneigh, firstneigh, ntypes,
           x, c_inner, c_outer, f,
-          &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx,
+          &vsevdwl, compress_idx,
           i, js, vmask_repulsive
         );
         compress_idx = 0;
@@ -1265,26 +1208,16 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
   }
   if (compress_idx > 0) {
     vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
-    IntelKernelTersoff::kernel_step<EVFLAG,EFLAG>(
+    IntelKernelTersoff::kernel_step<EFLAG>(
       eatom, vflag,
       numneigh, cnumneigh, firstneigh, ntypes,
       x, c_inner, c_outer, f,
-      &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx,
+      &vsevdwl, compress_idx,
       is, js, vmask_repulsive
     );
   }
-  if (EVFLAG) {
-    if (EFLAG) {
-      *evdwl += v::acc_reduce_add(vsevdwl);
-    }
-    if (vflag == 1) {
-      *ov0 += v::acc_reduce_add(vsv0);
-      *ov1 += v::acc_reduce_add(vsv1);
-      *ov2 += v::acc_reduce_add(vsv2);
-      *ov3 += v::acc_reduce_add(vsv3);
-      *ov4 += v::acc_reduce_add(vsv4);
-      *ov5 += v::acc_reduce_add(vsv5);
-    }
+  if (EFLAG) {
+    *evdwl += v::acc_reduce_add(vsevdwl);
   }
 }

@@ -79,7 +79,7 @@ class PairTersoffIntel : public PairTersoff {
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  template <int EFLAG, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
             IntelBuffers<flt_t,acc_t> * buffers,
             const ForceConst<flt_t> &fc, const int astart, const int aend);
src/USER-INTEL/pppm_disp_intel.cpp (new file, 3034 lines): diff suppressed because it is too large.

src/USER-INTEL/pppm_disp_intel.h (new file, 238 lines):
@@ -0,0 +1,238 @@
/* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: William McDoniel (RWTH Aachen University)
------------------------------------------------------------------------- */

#ifdef KSPACE_CLASS

KSpaceStyle(pppm/disp/intel,PPPMDispIntel)

#else

#ifndef LMP_PPPMINTEL_DISP_H
#define LMP_PPPMINTEL_DISP_H

#include "pppm_disp.h"
#include "fix_intel.h"

namespace LAMMPS_NS {

class PPPMDispIntel : public PPPMDisp {
 public:
  PPPMDispIntel(class LAMMPS *, int, char **);
  virtual ~PPPMDispIntel();
  virtual void init();
  virtual void compute(int, int);

#ifdef _LMP_INTEL_OFFLOAD
  int use_base();
#endif

 protected:
  FixIntel *fix;

  int _use_lrt;
  FFT_SCALAR **perthread_density;
  FFT_SCALAR *particle_ekx;
  FFT_SCALAR *particle_eky;
  FFT_SCALAR *particle_ekz;
  FFT_SCALAR *particle_ekx0;
  FFT_SCALAR *particle_eky0;
  FFT_SCALAR *particle_ekz0;
  FFT_SCALAR *particle_ekx1;
  FFT_SCALAR *particle_eky1;
  FFT_SCALAR *particle_ekz1;
  FFT_SCALAR *particle_ekx2;
  FFT_SCALAR *particle_eky2;
  FFT_SCALAR *particle_ekz2;
  FFT_SCALAR *particle_ekx3;
  FFT_SCALAR *particle_eky3;
  FFT_SCALAR *particle_ekz3;
  FFT_SCALAR *particle_ekx4;
  FFT_SCALAR *particle_eky4;
  FFT_SCALAR *particle_ekz4;
  FFT_SCALAR *particle_ekx5;
  FFT_SCALAR *particle_eky5;
  FFT_SCALAR *particle_ekz5;
  FFT_SCALAR *particle_ekx6;
  FFT_SCALAR *particle_eky6;
  FFT_SCALAR *particle_ekz6;

  int _use_table;
  int rho_points;
  FFT_SCALAR **rho_lookup;
  FFT_SCALAR **rho6_lookup;
  FFT_SCALAR **drho_lookup;
  FFT_SCALAR **drho6_lookup;
  FFT_SCALAR half_rho_scale, half_rho_scale_plus;

  int _use_packing;

#ifdef _LMP_INTEL_OFFLOAD
  int _use_base;
#endif

  template<class flt_t, class acc_t>
  void particle_map(double, double, double,
                    double, int **, int, int,
                    int, int, int,
                    int, int, int,
                    IntelBuffers<flt_t,acc_t> *buffers);

  template<class flt_t, class acc_t, int use_table>
  void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers);
  template<class flt_t, class acc_t>
  void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers) {
    if (_use_table == 1) {
      make_rho_c<flt_t,acc_t,1>(buffers);
    } else {
      make_rho_c<flt_t,acc_t,0>(buffers);
    }
  }

  template<class flt_t, class acc_t, int use_table>
  void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers);
  template<class flt_t, class acc_t>
  void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers) {
    if (_use_table == 1) {
      make_rho_g<flt_t,acc_t,1>(buffers);
    } else {
      make_rho_g<flt_t,acc_t,0>(buffers);
    }
  }

  template<class flt_t, class acc_t, int use_table>
  void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers);
  template<class flt_t, class acc_t>
  void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers) {
    if (_use_table == 1) {
      make_rho_a<flt_t,acc_t,1>(buffers);
    } else {
      make_rho_a<flt_t,acc_t,0>(buffers);
    }
  }

  template<class flt_t, class acc_t, int use_table>
  void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers);
  template<class flt_t, class acc_t>
  void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers) {
    if (_use_table == 1) {
      make_rho_none<flt_t,acc_t,1>(buffers);
    } else {
      make_rho_none<flt_t,acc_t,0>(buffers);
    }
  }

  template<class flt_t, class acc_t, int use_table>
  void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers);
  template<class flt_t, class acc_t>
  void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers) {
    if (_use_table == 1) {
      fieldforce_c_ik<flt_t,acc_t,1>(buffers);
    } else {
      fieldforce_c_ik<flt_t,acc_t,0>(buffers);
    }
  }

  template<class flt_t, class acc_t, int use_table>
  void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers);
  template<class flt_t, class acc_t>
  void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers) {
    if (_use_table == 1) {
      fieldforce_c_ad<flt_t,acc_t,1>(buffers);
    } else {
      fieldforce_c_ad<flt_t,acc_t,0>(buffers);
    }
  }

  template<class flt_t, class acc_t, int use_table>
  void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers);
  template<class flt_t, class acc_t>
  void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers) {
    if (_use_table == 1) {
      fieldforce_g_ik<flt_t,acc_t,1>(buffers);
    } else {
      fieldforce_g_ik<flt_t,acc_t,0>(buffers);
    }
  }

  template<class flt_t, class acc_t, int use_table>
  void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers);
  template<class flt_t, class acc_t>
  void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers) {
    if (_use_table == 1) {
      fieldforce_g_ad<flt_t,acc_t,1>(buffers);
    } else {
      fieldforce_g_ad<flt_t,acc_t,0>(buffers);
    }
  }

  template<class flt_t, class acc_t, int use_table>
  void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers);
  template<class flt_t, class acc_t>
  void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers) {
    if (_use_table == 1) {
      fieldforce_a_ik<flt_t,acc_t,1>(buffers);
    } else {
      fieldforce_a_ik<flt_t,acc_t,0>(buffers);
    }
  }

  template<class flt_t, class acc_t, int use_table>
  void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers);
  template<class flt_t, class acc_t>
  void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers) {
    if (_use_table == 1) {
      fieldforce_a_ad<flt_t,acc_t,1>(buffers);
    } else {
      fieldforce_a_ad<flt_t,acc_t,0>(buffers);
    }
  }

  template<class flt_t, class acc_t, int use_table>
  void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers);
  template<class flt_t, class acc_t>
  void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers) {
    if (_use_table == 1) {
      fieldforce_none_ik<flt_t,acc_t,1>(buffers);
    } else {
      fieldforce_none_ik<flt_t,acc_t,0>(buffers);
    }
  }

  template<class flt_t, class acc_t, int use_table>
  void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers);
  template<class flt_t, class acc_t>
  void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers) {
    if (_use_table == 1) {
      fieldforce_none_ad<flt_t,acc_t,1>(buffers);
    } else {
      fieldforce_none_ad<flt_t,acc_t,0>(buffers);
    }
  }

  void precompute_rho();

};

}
#endif
#endif

[diff of a further source file suppressed in this view because it is too large]
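The header above consists largely of pairs of declarations plus inline wrappers that convert the stored runtime option _use_table into a compile-time template argument, so the charge-spreading and interpolation loops get compiled separately for the tabulated and non-tabulated cases. A reduced sketch of the idiom with hypothetical names, not the LAMMPS classes:

#include <cstdio>

struct Spreader {
  int _use_table = 1;                     // runtime option, set at init time

  template <int use_table>
  void make_rho() {
    if (use_table) std::printf("stencil weights from lookup table\n");
    else           std::printf("stencil weights computed on the fly\n");
  }

  void make_rho() {                       // runtime flag -> compile-time constant
    if (_use_table == 1) make_rho<1>();
    else                 make_rho<0>();
  }
};

int main() {
  Spreader s;
  s.make_rho();
  return 0;
}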
@@ -1,4 +1,4 @@
-/* -*- c++ -*- ----------------------------------------------------------
+/* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
@@ -12,7 +12,9 @@
 ------------------------------------------------------------------------- */

 /* ----------------------------------------------------------------------
-   Contributing authors: Rodrigo Canales (RWTH Aachen University)
+   Contributing authors: William McDoniel (RWTH Aachen University)
+                         Rodrigo Canales (RWTH Aachen University)
+                         Markus Hoehnerbach (RWTH Aachen University)
                          W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */

@@ -36,6 +38,9 @@ class PPPMIntel : public PPPM {
   virtual ~PPPMIntel();
   virtual void init();
   virtual void compute(int, int);
+  virtual void pack_forward(int, FFT_SCALAR *, int, int *);
+  virtual void unpack_forward(int, FFT_SCALAR *, int, int *);
+  virtual double memory_usage();
   void compute_first(int, int);
   void compute_second(int, int);
   void pack_buffers();
@@ -47,18 +52,74 @@ class PPPMIntel : public PPPM {
  protected:
   FixIntel *fix;

+  int _use_lrt;
+  FFT_SCALAR **perthread_density;
+  FFT_SCALAR *particle_ekx;
+  FFT_SCALAR *particle_eky;
+  FFT_SCALAR *particle_ekz;
+
+  int _use_table;
+  int rho_points;
+  FFT_SCALAR **rho_lookup;
+  FFT_SCALAR **drho_lookup;
+  FFT_SCALAR half_rho_scale, half_rho_scale_plus;
+
+  int _use_packing;
+  FFT_SCALAR ***vdxy_brick;
+  FFT_SCALAR ***vdz0_brick;
+  FFT_SCALAR *work3;
+  class GridComm *cg_pack;
+
 #ifdef _LMP_INTEL_OFFLOAD
   int _use_base;
 #endif

+  template<class flt_t, class acc_t>
+  void test_function(IntelBuffers<flt_t,acc_t> *buffers);
+
+  void precompute_rho();
   template<class flt_t, class acc_t>
   void particle_map(IntelBuffers<flt_t,acc_t> *buffers);
-  template<class flt_t, class acc_t>
+  template<class flt_t, class acc_t, int use_table>
   void make_rho(IntelBuffers<flt_t,acc_t> *buffers);
   template<class flt_t, class acc_t>
+  void make_rho(IntelBuffers<flt_t,acc_t> *buffers) {
+    if (_use_table == 1) {
+      make_rho<flt_t,acc_t,1>(buffers);
+    } else {
+      make_rho<flt_t,acc_t,0>(buffers);
+    }
+  }
+  void poisson_ik_intel();
+  template<class flt_t, class acc_t, int use_table, int use_packing>
   void fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers);
   template<class flt_t, class acc_t>
+  void fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers) {
+    if (_use_table == 1) {
+      if (_use_packing == 1) {
+        fieldforce_ik<flt_t, acc_t, 1, 1>(buffers);
+      } else {
+        fieldforce_ik<flt_t, acc_t, 1, 0>(buffers);
+      }
+    } else {
+      if (_use_packing == 1) {
+        fieldforce_ik<flt_t, acc_t, 0, 1>(buffers);
+      } else {
+        fieldforce_ik<flt_t, acc_t, 0, 0>(buffers);
+      }
+    }
+  }
+  template<class flt_t, class acc_t, int use_table>
   void fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers);
+  template<class flt_t, class acc_t>
+  void fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) {
+    if (_use_table == 1) {
+      fieldforce_ad<flt_t,acc_t,1>(buffers);
+    } else {
+      fieldforce_ad<flt_t,acc_t,0>(buffers);
+    }
+  }
 };

 }
@@ -78,17 +78,17 @@ void VerletLRTIntel::init()
    setup before run
 ------------------------------------------------------------------------- */

-void VerletLRTIntel::setup()
+void VerletLRTIntel::setup(int flag)
 {
   if (_intel_kspace == 0) {
-    Verlet::setup();
+    Verlet::setup(flag);
     return;
   }

 #ifdef _LMP_INTEL_OFFLOAD
   if (_intel_kspace->use_base()) {
     _intel_kspace = 0;
-    Verlet::setup();
+    Verlet::setup(flag);
     return;
   }
 #endif
@@ -42,7 +42,7 @@ class VerletLRTIntel : public Verlet {
   VerletLRTIntel(class LAMMPS *, int, char **);
   virtual ~VerletLRTIntel();
   virtual void init();
-  virtual void setup();
+  virtual void setup(int flag = 1);
   virtual void run(int);

  protected:
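The setup() override gains an int parameter with a default of 1 so that its signature continues to match the base-class Verlet::setup(int) and the method keeps overriding the virtual. A generic sketch of the pattern, using illustrative class names rather than the LAMMPS ones:

#include <cstdio>

struct Integrator {
  virtual void setup(int flag = 1) { std::printf("base setup, flag=%d\n", flag); }
  virtual ~Integrator() = default;
};

struct LRTIntegrator : Integrator {
  void setup(int flag = 1) override {   // must take int to keep overriding
    std::printf("derived pre-setup\n");
    Integrator::setup(flag);            // forward the flag to the base class
  }
};

int main() {
  LRTIntegrator v;
  Integrator *p = &v;
  p->setup();   // note: the default argument is taken from the static type
  return 0;
}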
src/atom.cpp (51 added lines)
@@ -40,6 +40,10 @@
 #include "memory.h"
 #include "error.h"

+#ifdef LMP_USER_INTEL
+#include "neigh_request.h"
+#endif
+
 using namespace LAMMPS_NS;
 using namespace MathConst;

@@ -1882,6 +1886,53 @@ void Atom::setup_sort_bins()
   bininvy = nbiny / (bboxhi[1]-bboxlo[1]);
   bininvz = nbinz / (bboxhi[2]-bboxlo[2]);

+#ifdef LMP_USER_INTEL
+  int intel_neigh = 0;
+  if (neighbor->nrequest) {
+    if (neighbor->requests[0]->intel) intel_neigh = 1;
+  } else if (neighbor->old_nrequest)
+    if (neighbor->old_requests[0]->intel) intel_neigh = 1;
+  if (intel_neigh && userbinsize == 0.0) {
+    if (neighbor->binsizeflag) bininv = 1.0/neighbor->binsize_user;
+
+    double nx_low = neighbor->bboxlo[0];
+    double ny_low = neighbor->bboxlo[1];
+    double nz_low = neighbor->bboxlo[2];
+    double nxbbox = neighbor->bboxhi[0] - nx_low;
+    double nybbox = neighbor->bboxhi[1] - ny_low;
+    double nzbbox = neighbor->bboxhi[2] - nz_low;
+    int nnbinx = static_cast<int> (nxbbox * bininv);
+    int nnbiny = static_cast<int> (nybbox * bininv);
+    int nnbinz = static_cast<int> (nzbbox * bininv);
+    if (domain->dimension == 2) nnbinz = 1;
+
+    if (nnbinx == 0) nnbinx = 1;
+    if (nnbiny == 0) nnbiny = 1;
+    if (nnbinz == 0) nnbinz = 1;
+
+    double binsizex = nxbbox/nnbinx;
+    double binsizey = nybbox/nnbiny;
+    double binsizez = nzbbox/nnbinz;
+
+    bininvx = 1.0 / binsizex;
+    bininvy = 1.0 / binsizey;
+    bininvz = 1.0 / binsizez;
+
+    int lxo = (bboxlo[0] - nx_low) * bininvx;
+    int lyo = (bboxlo[1] - ny_low) * bininvy;
+    int lzo = (bboxlo[2] - nz_low) * bininvz;
+    bboxlo[0] = nx_low + static_cast<double>(lxo) / bininvx;
+    bboxlo[1] = ny_low + static_cast<double>(lyo) / bininvy;
+    bboxlo[2] = nz_low + static_cast<double>(lzo) / bininvz;
+    nbinx = static_cast<int>((bboxhi[0] - bboxlo[0]) * bininvx) + 1;
+    nbiny = static_cast<int>((bboxhi[1] - bboxlo[1]) * bininvy) + 1;
+    nbinz = static_cast<int>((bboxhi[2] - bboxlo[2]) * bininvz) + 1;
+    bboxhi[0] = bboxlo[0] + static_cast<double>(nbinx) / bininvx;
+    bboxhi[1] = bboxlo[1] + static_cast<double>(nbiny) / bininvy;
+    bboxhi[2] = bboxlo[2] + static_cast<double>(nbinz) / bininvz;
+  }
+#endif
+
   if (1.0*nbinx*nbiny*nbinz > INT_MAX)
     error->one(FLERR,"Too many atom sorting bins");

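The added block makes the atom-sorting grid commensurate with the neighbor-list bins when USER-INTEL neighboring is active: it derives the bin count from the neighbor bounding box and then snaps bboxlo and bboxhi onto neighbor-bin edges. A small one-dimensional numeric sketch of the same arithmetic, with all values invented for illustration:

#include <cstdio>

int main() {
  double nlow = -0.30, nhigh = 10.10;    // neighbor (ghost-extended) bounds
  double blow = 0.00,  bhigh = 10.00;    // local sort-bin bounds
  double binsize = 2.0;                  // neighbor bin size
  double bininv = 1.0 / binsize;

  int nnbin = static_cast<int>((nhigh - nlow) * bininv);    // 5 neighbor bins
  double bininvx = nnbin / (nhigh - nlow);                  // aligned inverse size

  int lo = static_cast<int>((blow - nlow) * bininvx);       // bin index of blow
  double alow = nlow + static_cast<double>(lo) / bininvx;   // snap origin to a bin edge
  int nbin = static_cast<int>((bhigh - alow) * bininvx) + 1;
  double ahigh = alow + static_cast<double>(nbin) / bininvx;

  std::printf("aligned grid: [%g, %g), %d bins of width %g\n",
              alow, ahigh, nbin, 1.0 / bininvx);
  return 0;
}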