USER-INTEL upgrade from M Brown
This commit is contained in:
Binary file not shown.
|
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 14 KiB |
@ -30,8 +30,8 @@ Dihedral Styles: charmm, harmonic, opls :l
|
||||
Fixes: nve, npt, nvt, nvt/sllod :l
|
||||
Improper Styles: cvff, harmonic :l
|
||||
Pair Styles: buck/coul/cut, buck/coul/long, buck, eam, gayberne,
|
||||
charmm/coul/long, lj/cut, lj/cut/coul/long, sw, tersoff :l
|
||||
K-Space Styles: pppm :l
|
||||
charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, sw, tersoff :l
|
||||
K-Space Styles: pppm, pppm/disp :l
|
||||
:ule
|
||||
|
||||
[Speed-ups to expect:]
|
||||
@ -42,62 +42,88 @@ precision mode. Performance improvements are shown compared to
|
||||
LAMMPS {without using other acceleration packages} as these are
|
||||
under active development (and subject to performance changes). The
|
||||
measurements were performed using the input files available in
|
||||
the src/USER-INTEL/TEST directory. These are scalable in size; the
|
||||
results given are with 512K particles (524K for Liquid Crystal).
|
||||
Most of the simulations are standard LAMMPS benchmarks (indicated
|
||||
by the filename extension in parenthesis) with modifications to the
|
||||
run length and to add a warmup run (for use with offload
|
||||
benchmarks).
|
||||
the src/USER-INTEL/TEST directory with the provided run script.
|
||||
These are scalable in size; the results given are with 512K
|
||||
particles (524K for Liquid Crystal). Most of the simulations are
|
||||
standard LAMMPS benchmarks (indicated by the filename extension in
|
||||
parenthesis) with modifications to the run length and to add a
|
||||
warmup run (for use with offload benchmarks).
|
||||
|
||||
:c,image(JPG/user_intel.png)
|
||||
|
||||
Results are speedups obtained on Intel Xeon E5-2697v4 processors
|
||||
(code-named Broadwell) and Intel Xeon Phi 7250 processors
|
||||
(code-named Knights Landing) with "18 Jun 2016" LAMMPS built with
|
||||
Intel Parallel Studio 2016 update 3. Results are with 1 MPI task
|
||||
(code-named Knights Landing) with "June 2017" LAMMPS built with
|
||||
Intel Parallel Studio 2017 update 2. Results are with 1 MPI task
|
||||
per physical core. See {src/USER-INTEL/TEST/README} for the raw
|
||||
simulation rates and instructions to reproduce.
|
||||
|
||||
:line
|
||||
|
||||
[Accuracy and order of operations:]
|
||||
|
||||
In most molecular dynamics software, parallelization parameters
|
||||
(# of MPI, OpenMP, and vectorization) can change the results due
|
||||
to changing the order of operations with finite-precision
|
||||
calculations. The USER-INTEL package is deterministic. This means
|
||||
that the results should be reproducible from run to run with the
|
||||
{same} parallel configurations and when using deterministic
|
||||
libraries or library settings (MPI, OpenMP, FFT). However, there
|
||||
are differences in the USER-INTEL package that can change the
|
||||
order of operations compared to LAMMPS without acceleration:
|
||||
|
||||
Neighbor lists can be created in a different order :ulb,l
|
||||
Bins used for sorting atoms can be oriented differently :l
|
||||
The default stencil order for PPPM is 7. By default, LAMMPS will
|
||||
calculate other PPPM parameters to fit the desired accuracy with
|
||||
this order :l
|
||||
The {newton} setting applies to all atoms, not just atoms shared
|
||||
between MPI tasks :l
|
||||
Vectorization can change the order for adding pairwise forces :l
|
||||
:ule
|
||||
|
||||
The precision mode (described below) used with the USER-INTEL
|
||||
package can change the {accuracy} of the calculations. For the
|
||||
default {mixed} precision option, calculations between pairs or
|
||||
triplets of atoms are performed in single precision, intended to
|
||||
be within the inherent error of MD simulations. All accumulation
|
||||
is performed in double precision to prevent the error from growing
|
||||
with the number of atoms in the simulation. {Single} precision
|
||||
mode should not be used without appropriate validation.
|
||||
|
||||
:line
|
||||
|
||||
[Quick Start for Experienced Users:]
|
||||
|
||||
LAMMPS should be built with the USER-INTEL package installed.
|
||||
Simulations should be run with 1 MPI task per physical {core},
|
||||
not {hardware thread}.
|
||||
|
||||
For Intel Xeon CPUs:
|
||||
|
||||
Edit src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi as necessary. :ulb,l
|
||||
If using {kspace_style pppm} in the input script, add "neigh_modify binsize cutoff" and "kspace_modify diff ad" to the input script for better
|
||||
performance. Cutoff should be roughly the neighbor list cutoff. By
|
||||
default the binsize is half the neighbor list cutoff. :l
|
||||
"-pk intel 0 omp 2 -sf intel" added to LAMMPS command-line :l
|
||||
Set the environment variable KMP_BLOCKTIME=0 :l
|
||||
"-pk intel 0 omp $t -sf intel" added to LAMMPS command-line :l
|
||||
$t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l
|
||||
For some of the simple 2-body potentials without long-range
|
||||
electrostatics, performance and scalability can be better with
|
||||
the "newton off" setting added to the input script :l
|
||||
If using {kspace_style pppm} in the input script, add
|
||||
"kspace_modify diff ad" for better performance :l
|
||||
:ule
|
||||
|
||||
For Intel Xeon Phi CPUs for simulations without {kspace_style
|
||||
pppm} in the input script :
|
||||
For Intel Xeon Phi CPUs:
|
||||
|
||||
Edit src/MAKE/OPTIONS/Makefile.knl as necessary. :ulb,l
|
||||
Runs should be performed using MCDRAM. :l
|
||||
"-pk intel 0 omp 2 -sf intel" {or} "-pk intel 0 omp 4 -sf intel"
|
||||
should be added to the LAMMPS command-line. Choice for best
|
||||
performance will depend on the simulation. :l
|
||||
Runs should be performed using MCDRAM. :ulb,l
|
||||
:ule
|
||||
|
||||
For Intel Xeon Phi CPUs for simulations with {kspace_style
|
||||
pppm} in the input script:
|
||||
For simulations using {kspace_style pppm} on Intel CPUs
|
||||
supporting AVX-512:
|
||||
|
||||
Edit src/MAKE/OPTIONS/Makefile.knl as necessary. :ulb,l
|
||||
Runs should be performed using MCDRAM. :l
|
||||
Add "neigh_modify binsize 3" to the input script for better
|
||||
performance. :l
|
||||
Add "kspace_modify diff ad" to the input script for better
|
||||
performance. :l
|
||||
export KMP_AFFINITY=none :l
|
||||
"-pk intel 0 omp 3 lrt yes -sf intel" or "-pk intel 0 omp 1 lrt yes
|
||||
-sf intel" added to LAMMPS command-line. Choice for best performance
|
||||
will depend on the simulation. :l
|
||||
Add "kspace_modify diff ad" to the input script :ulb,l
|
||||
The command-line option should be changed to
|
||||
"-pk intel 0 omp $r lrt yes -sf intel" where $r is the number of
|
||||
threads minus 1. :l
|
||||
Do not use thread affinity (set KMP_AFFINITY=none) :l
|
||||
The "newton off" setting may provide better scalability :l
|
||||
:ule
|
||||
|
||||
For Intel Xeon Phi coprocessors (Offload):
|
||||
@ -169,6 +195,10 @@ cat /proc/cpuinfo :pre
|
||||
|
||||
[Building LAMMPS with the USER-INTEL package:]
|
||||
|
||||
NOTE: See the src/USER-INTEL/README file for additional flags that
|
||||
might be needed for best performance on Intel server processors
|
||||
code-named "Skylake".
|
||||
|
||||
The USER-INTEL package must be installed into the source directory:
|
||||
|
||||
make yes-user-intel :pre
|
||||
@ -322,8 +352,8 @@ follow in the input script.
|
||||
|
||||
NOTE: The USER-INTEL package will perform better with modifications
|
||||
to the input script when "PPPM"_kspace_style.html is used:
|
||||
"kspace_modify diff ad"_kspace_modify.html and "neigh_modify binsize
|
||||
3"_neigh_modify.html should be added to the input script.
|
||||
"kspace_modify diff ad"_kspace_modify.html should be added to the
|
||||
input script.
|
||||
|
||||
Long-Range Thread (LRT) mode is an option to the "package
|
||||
intel"_package.html command that can improve performance when using
|
||||
@ -342,6 +372,10 @@ would normally perform best with "-pk intel 0 omp 4", instead use
|
||||
environment variable "KMP_AFFINITY=none". LRT mode is not supported
|
||||
when using offload.
|
||||
|
||||
NOTE: Changing the "newton"_newton.html setting to off can improve
|
||||
performance and/or scalability for simple 2-body potentials such as
|
||||
lj/cut or when using LRT mode on processors supporting AVX-512.
|
||||
|
||||
Not all styles are supported in the USER-INTEL package. You can mix
|
||||
the USER-INTEL package with styles from the "OPT"_accelerate_opt.html
|
||||
package or the "USER-OMP package"_accelerate_omp.html. Of course,
|
||||
@ -467,7 +501,7 @@ supported.
|
||||
|
||||
Brown, W.M., Carrillo, J.-M.Y., Mishra, B., Gavhane, N., Thakker, F.M., De Kraker, A.R., Yamada, M., Ang, J.A., Plimpton, S.J., "Optimizing Classical Molecular Dynamics in LAMMPS," in Intel Xeon Phi Processor High Performance Programming: Knights Landing Edition, J. Jeffers, J. Reinders, A. Sodani, Eds. Morgan Kaufmann. :ulb,l
|
||||
|
||||
Brown, W. M., Semin, A., Hebenstreit, M., Khvostov, S., Raman, K., Plimpton, S.J. Increasing Molecular Dynamics Simulation Rates with an 8-Fold Increase in Electrical Power Efficiency. 2016 International Conference for High Performance Computing. In press. :l
|
||||
Brown, W. M., Semin, A., Hebenstreit, M., Khvostov, S., Raman, K., Plimpton, S.J. "Increasing Molecular Dynamics Simulation Rates with an 8-Fold Increase in Electrical Power Efficiency."_http://dl.acm.org/citation.cfm?id=3014915 2016 High Performance Computing, Networking, Storage and Analysis, SC16: International Conference (pp. 82-95). :l
|
||||
|
||||
Brown, W.M., Carrillo, J.-M.Y., Gavhane, N., Thakkar, F.M., Plimpton, S.J. Optimizing Legacy Molecular Dynamics Software with Directive-Based Offload. Computer Physics Communications. 2015. 195: p. 95-101. :l
|
||||
:ule
|
||||
|
||||
@ -14,152 +14,178 @@ fix ID group-ID neb Kspring keyword value :pre
|
||||
|
||||
ID, group-ID are documented in "fix"_fix.html command :ulb,l
|
||||
neb = style name of this fix command :l
|
||||
Kspring = parallel spring constant (force/distance units or force units) :l
|
||||
Kspring = parallel spring constant (force/distance units or force units, see nudge keyword) :l
|
||||
zero or more keyword/value pairs may be appended :l
|
||||
keyword = {nudg_style} or {perp} or {freeend} or {freeend_kspring} :l
|
||||
{nudg_style} value = {neigh} or {idealpos}
|
||||
{neigh} = the parallel nudging force is calculated from the distances to neighbouring replicas (in this case, Kspring is in force/distance units)
|
||||
{idealpos} = the parallel nudging force is proportional to the distance between the replica and its interpolated ideal position (in this case Kspring is in force units)
|
||||
{perp} value {none} or kspring2
|
||||
{none} = no perpendicular spring force is applied
|
||||
{kspring2} = spring constant for the perpendicular nudging force (in force/distance units)
|
||||
{freeend} value = {none} or {ini} or {final} or {finaleini} or {final2eini}
|
||||
{none} = no nudging force is applied to the first and last replicas
|
||||
{ini} = set the first replica to be a free end
|
||||
{final} = set the last replica to be a free end
|
||||
{finaleini} = set the last replica to be a free end and set its target energy as that of the first replica
|
||||
{final2eini} = same as {finaleini} plus prevent intermediate replicas to have a lower energy than the first replica
|
||||
{freeend_kspring} value = kspring3
|
||||
kspring3 = spring constant of the perpendicular spring force (per distance units)
|
||||
:pre
|
||||
keyword = {nudge} or {perp} or {ends} :l
|
||||
{nudge} value = {neigh} or {ideal}
|
||||
{neigh} = parallel nudging force based on distance to neighbor replicas (Kspring = force/distance units)
|
||||
{ideal} = parallel nudging force based on interpolated ideal position (Kspring = force units)
|
||||
{perp} value = {Kspring2}
|
||||
{Kspring2} = spring constant for perpendicular nudging force (force/distance units)
|
||||
{end} values = estyle Kspring3
|
||||
{estyle} = {first} or {last} or {last/efirst} or {last/efirst/middle}
|
||||
{first} = apply force to first replica
|
||||
{last} = apply force to last replica
|
||||
{last/efirst} = apply force to last replica and set its target energy to that of first replica
|
||||
{last/efirst/middle} = same as {last/efirst} plus prevent middle replicas having lower energy than first replica
|
||||
{Kspring3} = spring constant for target energy term (1/distance units) :pre
|
||||
|
||||
[Examples:]
|
||||
|
||||
fix 1 active neb 10.0
|
||||
fix 2 all neb 1.0 perp 1.0 freeend final
|
||||
fix 1 all neb 1.0 nudg_style idealpos freeend final2eini freend_kspring 1:pre
|
||||
fix 2 all neb 1.0 perp 1.0 end last
|
||||
fix 2 all neb 1.0 perp 1.0 end first end last
|
||||
fix 1 all neb 1.0 nudge ideal end last/efirst 1 :pre
|
||||
|
||||
[Description:]
|
||||
|
||||
Add a nudging force to atoms in the group for a multi-replica
|
||||
Add nudging forces to atoms in the group for a multi-replica
|
||||
simulation run via the "neb"_neb.html command to perform a nudged
|
||||
elastic band (NEB) calculation for finding the transition state.
|
||||
Hi-level explanations of NEB are given with the "neb"_neb.html command
|
||||
and in "Section_howto 5"_Section_howto.html#howto_5 of the manual.
|
||||
The fix neb command must be used with the "neb" command and defines
|
||||
how nudging inter-replica forces are computed. A NEB calculation is
|
||||
how inter-replica nudging forces are computed. A NEB calculation is
|
||||
divided in two stages. In the first stage n replicas are relaxed
|
||||
toward a MEP and in a second stage, the climbing image scheme (see
|
||||
"(Henkelman2)"_#Henkelman2) is turned on so that the replica having
|
||||
the highest energy relaxes toward the saddle point (i.e. the point of
|
||||
highest energy along the MEP).
|
||||
toward a MEP until convergence. In the second stage, the climbing
|
||||
image scheme (see "(Henkelman2)"_#Henkelman2) is enabled, so that the
|
||||
replica having the highest energy relaxes toward the saddle point
|
||||
(i.e. the point of highest energy along the MEP), and a second
|
||||
relaxation is performed.
|
||||
|
||||
One purpose of the nudging forces is to keep the replicas equally
|
||||
spaced. During the NEB, the 3N-length vector of interatomic force Fi
|
||||
= -Grad(V) of replicas i is altered. For all intermediate replicas
|
||||
(i.e. for 1<i<n) but the climbing replica the force vector
|
||||
becomes:
|
||||
A key purpose of the nudging forces is to keep the replicas equally
|
||||
spaced. During the NEB calculation, the 3N-length vector of
|
||||
interatomic force Fi = -Grad(V) for each replica I is altered. For
|
||||
all intermediate replicas (i.e. for 1 < I < N, except the climbing
|
||||
replica) the force vector becomes:
|
||||
|
||||
Fi = -Grad(V) + (Grad(V) dot That) That + Fnudgparallel + Fspringperp :pre
|
||||
Fi = -Grad(V) + (Grad(V) dot T') T' + Fnudge_parallel + Fspring_perp :pre
|
||||
|
||||
That is the unit "tangent" vector for replica i and is a function of
|
||||
Ri, Ri-1, Ri+1, and the potential energy of the 3 replicas; it points
|
||||
roughly in the direction of (Ri+i - Ri-1) (see the
|
||||
"(Henkelman1)"_#Henkelman1 paper for details). Ri are the atomic
|
||||
coordinates of replica i; Ri-1 and Ri+1 are the coordinates of its
|
||||
neighbor replicas. The term (Grad(V) dot That) is used to remove the
|
||||
T' is the unit "tangent" vector for replica I and is a function of Ri,
|
||||
Ri-1, Ri+1, and the potential energy of the 3 replicas; it points
|
||||
roughly in the direction of (Ri+i - Ri-1); see the
|
||||
"(Henkelman1)"_#Henkelman1 paper for details. Ri are the atomic
|
||||
coordinates of replica I; Ri-1 and Ri+1 are the coordinates of its
|
||||
neighbor replicas. The term (Grad(V) dot T') is used to remove the
|
||||
component of the gradient parallel to the path which would tend to
|
||||
distribute the replica unevenly along the path. Fnudgparallel is an
|
||||
artificial nudging force which is applied only in the tangent direction
|
||||
and which maintains the replicas equally spaced (see below for more
|
||||
information). Fspringperp is an optional artificial spring which is
|
||||
applied only perpendicular to the tangent and which prevents the paths
|
||||
from forming too acute kinks (see below for more information).
|
||||
distribute the replica unevenly along the path. Fnudge_parallel is an
|
||||
artificial nudging force which is applied only in the tangent
|
||||
direction and which maintains the equal spacing between replicas (see
|
||||
below for more information). Fspring_perp is an optional artificial
|
||||
spring which is applied only perpendicular to the tangent and which
|
||||
prevent the paths from forming acute kinks (see below for more
|
||||
information).
|
||||
|
||||
The keyword {nudg_style} allows specifying how the parallel
|
||||
nudging force is computed. With a value of idealpos, the spring
|
||||
force is computed as suggested in "(E)"_#E :
|
||||
In the second stage of the NEB calculation, the interatomic force Fi
|
||||
for the climbing replica (the replica of highest energy after the
|
||||
first stage) is changed to:
|
||||
|
||||
Fnudgparallel=-{Kspring}* (RD-RDideal)/(2 meanDist) :pre
|
||||
Fi = -Grad(V) + 2 (Grad(V) dot T') T' :pre
|
||||
|
||||
and the relaxation procedure is continued to a new converged MEP.
|
||||
|
||||
:line
|
||||
|
||||
The keyword {nudge} specifies how the parallel nudging force is
|
||||
computed. With a value of {neigh}, the parallel nudging force is
|
||||
computed as in "(Henkelman1)"_#Henkelman1 by connecting each
|
||||
intermediate replica with the previous and the next image:
|
||||
|
||||
Fnudge_parallel = {Kspring} * (|Ri+1 - Ri| - |Ri - Ri-1|) :pre
|
||||
|
||||
Note that in this case the specified {Kspring} is in force/distance
|
||||
units.
|
||||
|
||||
With a value of {ideal}, the spring force is computed as suggested in
|
||||
"(WeinenE)"_#WeinenE :
|
||||
|
||||
Fnudge_parallel = -{Kspring} * (RD-RDideal) / (2 * meanDist) :pre
|
||||
|
||||
where RD is the "reaction coordinate" see "neb"_neb.html section, and
|
||||
RDideal is the ideal RD for which all the images are equally spaced
|
||||
(i.e. RDideal = (i-1)*meanDist when the climbing image is off, where i
|
||||
is the replica number). The meanDist is the average distance between
|
||||
replicas.
|
||||
RDideal is the ideal RD for which all the images are equally spaced.
|
||||
I.e. RDideal = (I-1)*meanDist when the climbing replica is off, where
|
||||
I is the replica number. The meanDist is the average distance
|
||||
between replicas. Note that in this case the specified {Kspring} is
|
||||
in force units.
|
||||
|
||||
When {nudg_style} has a value of neigh (or by default), the parallel
|
||||
nudging force is computed as in "(Henkelman1)"_#Henkelman1 by
|
||||
connecting each intermediate replica with the previous and the next
|
||||
image:
|
||||
|
||||
Fnudgparallel= {Kspring}* (|Ri+1 - Ri| - |Ri - Ri-1|) :pre
|
||||
|
||||
The parallel nudging force associated with the key word idealpos should
|
||||
usually be more efficient at keeping the images equally spaced.
|
||||
Note that the {ideal} form of nudging can often be more effective at
|
||||
keeping the replicas equally spaced.
|
||||
|
||||
:line
|
||||
|
||||
The keyword {perp} allows to add a spring force perpendicular to the
|
||||
path in order to prevent the path from becoming too kinky. It can
|
||||
improve significantly the convergence of the NEB when the resolution
|
||||
is poor (i.e. when too few images are used) (see "(Maras)"_#Maras1).
|
||||
The keyword {perp} adds a spring force perpendicular to the path in
|
||||
order to prevent the path from becoming too kinky. It
|
||||
can significantly improve the convergence of the NEB calculation when
|
||||
the resolution is poor. I.e. when too few replicas are used; see
|
||||
"(Maras)"_#Maras1 for details.
|
||||
|
||||
The perpendicular spring force is given by
|
||||
|
||||
Fspringperp = {Kspringperp} * f(Ri-1,Ri,Ri+1) (Ri+1 + Ri-1 - 2 Ri) :pre
|
||||
Fspring_perp = {Kspring2} * F(Ri-1,Ri,Ri+1) (Ri+1 + Ri-1 - 2 Ri) :pre
|
||||
|
||||
f(Ri-1 Ri R+1) is a smooth scalar function of the angle Ri-1 Ri
|
||||
Ri+1. It is equal to 0 when the path is straight and is equal to 1
|
||||
when the angle Ri-1 Ri Ri+1 is acute. f(Ri-1 Ri R+1) is defined in
|
||||
"(Jonsson)"_#Jonsson
|
||||
where {Kspring2} is the specified value. F(Ri-1 Ri R+1) is a smooth
|
||||
scalar function of the angle Ri-1 Ri Ri+1. It is equal to 0.0 when
|
||||
the path is straight and is equal to 1 when the angle Ri-1 Ri Ri+1 is
|
||||
acute. F(Ri-1 Ri R+1) is defined in "(Jonsson)"_#Jonsson.
|
||||
|
||||
If {Kspring2} is set to 0.0 (the default) then no perpendicular spring
|
||||
force is added.
|
||||
|
||||
:line
|
||||
|
||||
By default, the force acting on the first and last replicas is not
|
||||
altered so that during the NEB relaxation, these ending replicas relax
|
||||
toward local minima. However it is possible to use the key word
|
||||
{freeend} to allow either the initial or the final replica to relax
|
||||
toward a MEP while constraining its energy. The interatomic force Fi
|
||||
for the free end image becomes :
|
||||
By default, no forces act on the first and last replicas during the
|
||||
NEB relaxation, so these replicas simply relax toward their respective
|
||||
local minima. By using the key word {end}, additional forces can be
|
||||
applied to the first or last replica, to enable them to relax toward a
|
||||
MEP while constraining their energy.
|
||||
|
||||
Fi = -Grad(V)+ (Grad(V) dot That + (E-ETarget)*kspring3) That, {when} Grad(V) dot That < 0
|
||||
Fi = -Grad(V)+ (Grad(V) dot That + (ETarget- E)*kspring3) That, {when} Grad(V) dot That > 0
|
||||
The interatomic force Fi for the specified replica becomes:
|
||||
|
||||
Fi = -Grad(V) + (Grad(V) dot T' + (E-ETarget)*Kspring3) T', {when} Grad(V) dot T' < 0
|
||||
Fi = -Grad(V) + (Grad(V) dot T' + (ETarget- E)*Kspring3) T', {when} Grad(V) dot T' > 0
|
||||
:pre
|
||||
|
||||
where E is the energy of the free end replica and ETarget is the
|
||||
target energy.
|
||||
where E is the current energy of the replica and ETarget is the target
|
||||
energy. The "spring" constant on the difference in energies is the
|
||||
specified {Kspring3} value.
|
||||
|
||||
When the value {ini} ({final}) is used after the keyword {freeend},
|
||||
the first (last) replica is considered as a free end. The target
|
||||
energy is set to the energy of the replica at the start of the NEB
|
||||
calculation. When the value {finaleini} or {final2eini} is used the
|
||||
last image is considered as a free end and the target energy is equal
|
||||
to the energy of the first replica (which can evolve during the NEB
|
||||
relaxation). With the value {finaleini}, when the initial path is too
|
||||
far from the MEP, an intermediate replica might relax "faster" and
|
||||
get a lower energy than the last replica. The benefit of the free end
|
||||
is then lost since this intermediate replica will relax toward a local
|
||||
minima. This behavior can be prevented by using the value {final2eini}
|
||||
which removes entirely the contribution of the gradient for all
|
||||
intermediate replicas which have a lower energy than the initial one
|
||||
thus preventing these replicas from over-relaxing. After converging a NEB
|
||||
with the {final2eini} value it is recommended to check that all
|
||||
intermediate replicas have a larger energy than the initial
|
||||
replica. Finally note that if the last replica converges toward a
|
||||
local minimum with a larger energy than the energy of the first
|
||||
replica, a free end neb calculation with the value {finaleini} or
|
||||
{final2eini} cannot reach the convergence criteria.
|
||||
When {estyle} is specified as {first}, the force is applied to the
|
||||
first replica. When {estyle} is specified as {last}, the force is
|
||||
applied to the last replica. Note that the {end} keyword can be used
|
||||
twice to add forces to both the first and last replicas.
|
||||
|
||||
:line
|
||||
For both these {estyle} settings, the target energy {ETarget} is set
|
||||
to the initial energy of the replica (at the start of the NEB
|
||||
calculation).
|
||||
|
||||
If the {estyle} is specified as {last/efirst} or {last/efirst/middle},
|
||||
force is applied to the last replica, but the target energy {ETarget}
|
||||
is continuously set to the energy of the first replica, as it evolves
|
||||
during the NEB relaxation.
|
||||
|
||||
The difference between these two {estyle} options is as follows. When
|
||||
{estyle} is specified as {last/efirst}, no change is made to the
|
||||
inter-replica force applied to the intermediate replicas (neither
|
||||
first or last). If the initial path is too far from the MEP, an
|
||||
intermediate replica may relax "faster" and reach a lower energy than
|
||||
the last replica. In this case the intermediate replica will be
|
||||
relaxing toward its own local minima. This behavior can be prevented
|
||||
by specifying {estyle} as {last/efirst/middle} which will alter the
|
||||
inter-replica force applied to intermediate replicas by removing the
|
||||
contribution of the gradient to the inter-replica force. This will
|
||||
only be done if a particular intermediate replica has a lower energy
|
||||
than the first replica. This should effectively prevent the
|
||||
intermediate replicas from over-relaxing.
|
||||
|
||||
In the second stage of the NEB, the interatomic force Fi for the
|
||||
climbing replica (which is the replica of highest energy) becomes:
|
||||
|
||||
Fi = -Grad(V) + 2 (Grad(V) dot That) That :pre
|
||||
|
||||
After converging a NEB calculation using an {estyle} of {last/efirst},
|
||||
you should check that all intermediate replicas have a larger energy
|
||||
than the first replica. If not, then repeat the calculation with an
|
||||
{estyle} of {last/efirst/middle}.
|
||||
|
||||
Finally, note that if the last replica converges toward a local
|
||||
minimum which has a larger energy than the energy of the first
|
||||
replica, a NEB calculation using an {estyle} of {last/efirst} or
|
||||
{last/efirst/middle} cannot reach final convergence.
|
||||
|
||||
[Restart, fix_modify, output, run start/stop, minimize info:]
|
||||
|
||||
@ -186,7 +212,8 @@ for more info on packages.
|
||||
|
||||
[Default:]
|
||||
|
||||
The option defaults are nudg_style = neigh, perp = none, freeend = none and freend_kspring = 1.
|
||||
The option defaults are nudge = neigh, perp = 0.0, ends is not
|
||||
specified (no inter-replica force on the end replicas).
|
||||
|
||||
:line
|
||||
|
||||
@ -197,14 +224,14 @@ The option defaults are nudg_style = neigh, perp = none, freeend = none and free
|
||||
[(Henkelman2)] Henkelman, Uberuaga, Jonsson, J Chem Phys, 113,
|
||||
9901-9904 (2000).
|
||||
|
||||
:link(E)
|
||||
[(E)] E, Ren, Vanden-Eijnden, Phys Rev B, 66, 052301 (2002)
|
||||
:link(WeinenE)
|
||||
[(WeinenE)] E, Ren, Vanden-Eijnden, Phys Rev B, 66, 052301 (2002).
|
||||
|
||||
:link(Jonsson)
|
||||
[(Jonsson)] Jonsson, Mills and Jacobsen, in Classical and Quantum
|
||||
Dynamics in Condensed Phase Simulations, edited by Berne, Ciccotti, and Coker
|
||||
World Scientific, Singapore, 1998, p. 385
|
||||
Dynamics in Condensed Phase Simulations, edited by Berne, Ciccotti,
|
||||
and Coker World Scientific, Singapore, 1998, p 385.
|
||||
|
||||
:link(Maras1)
|
||||
[(Maras)] Maras, Trushin, Stukowski, Ala-Nissila, Jonsson,
|
||||
Comp Phys Comm, 205, 13-21 (2016)
|
||||
Comp Phys Comm, 205, 13-21 (2016).
|
||||
|
||||
@ -308,7 +308,8 @@ The option defaults are mesh = mesh/disp = 0 0 0, order = order/disp =
|
||||
gewald = gewald/disp = 0.0, slab = 1.0, compute = yes, cutoff/adjust =
|
||||
yes (MSM), pressure/scalar = yes (MSM), fftbench = yes (PPPM), diff = ik
|
||||
(PPPM), mix/disp = pair, force/disp/real = -1.0, force/disp/kspace = -1.0,
|
||||
split = 0, tol = 1.0e-6, and disp/auto = no.
|
||||
split = 0, tol = 1.0e-6, and disp/auto = no. For pppm/intel, order =
|
||||
order/disp = 7.
|
||||
|
||||
:line
|
||||
|
||||
|
||||
@ -33,12 +33,16 @@ style = {none} or {ewald} or {ewald/disp} or {ewald/omp} or {pppm} or {pppm/cg}
|
||||
accuracy = desired relative error in forces
|
||||
{pppm/gpu} value = accuracy
|
||||
accuracy = desired relative error in forces
|
||||
{pppm/intel} value = accuracy
|
||||
accuracy = desired relative error in forces
|
||||
{pppm/kk} value = accuracy
|
||||
accuracy = desired relative error in forces
|
||||
{pppm/omp} value = accuracy
|
||||
accuracy = desired relative error in forces
|
||||
{pppm/cg/omp} value = accuracy
|
||||
accuracy = desired relative error in forces
|
||||
{pppm/disp/intel} value = accuracy
|
||||
accuracy = desired relative error in forces
|
||||
{pppm/tip4p/omp} value = accuracy
|
||||
accuracy = desired relative error in forces
|
||||
{pppm/stagger} value = accuracy
|
||||
|
||||
@ -7,6 +7,7 @@
|
||||
:line
|
||||
|
||||
pair_style lj/long/coul/long command :h3
|
||||
pair_style lj/long/coul/long/intel command :h3
|
||||
pair_style lj/long/coul/long/omp command :h3
|
||||
pair_style lj/long/coul/long/opt command :h3
|
||||
pair_style lj/long/tip4p/long command :h3
|
||||
|
||||
@ -51,7 +51,7 @@ set group nebatoms type 3
|
||||
group nonneb subtract all nebatoms
|
||||
|
||||
fix 1 lower setforce 0.0 0.0 0.0
|
||||
fix 2 nebatoms neb 1.0 nudg_style idealpos
|
||||
fix 2 nebatoms neb 1.0 #nudge ideal
|
||||
fix 3 all enforce2d
|
||||
|
||||
thermo 100
|
||||
|
||||
@ -15,7 +15,7 @@ variable u uloop 20
|
||||
lattice hex 0.9
|
||||
region box block 0 20 0 10 -0.25 0.25
|
||||
|
||||
read_data initial.hop1freeend
|
||||
read_data initial.hop1.end
|
||||
|
||||
# LJ potentials
|
||||
|
||||
@ -41,7 +41,7 @@ set group nebatoms type 3
|
||||
group nonneb subtract all nebatoms
|
||||
|
||||
fix 1 lower setforce 0.0 0.0 0.0
|
||||
fix 2 nebatoms neb 1.0 nudg_style idealpos freeend ini
|
||||
fix 2 nebatoms neb 1.0 nudge ideal end first 1.0
|
||||
fix 3 all enforce2d
|
||||
|
||||
thermo 100
|
||||
@ -8,7 +8,7 @@ SHELL = /bin/sh
|
||||
|
||||
CC = mpiicpc
|
||||
OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
|
||||
CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
|
||||
CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
|
||||
-fno-alias -ansi-alias -restrict $(OPTFLAGS)
|
||||
SHFLAGS = -fPIC
|
||||
DEPFLAGS = -M
|
||||
|
||||
@ -8,7 +8,7 @@ SHELL = /bin/sh
|
||||
|
||||
CC = mpiicpc
|
||||
MIC_OPT = -qoffload-arch=mic-avx512 -fp-model fast=2
|
||||
CCFLAGS = -g -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
|
||||
CCFLAGS = -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
|
||||
-xHost -fno-alias -ansi-alias -restrict \
|
||||
-qoverride-limits $(MIC_OPT)
|
||||
SHFLAGS = -fPIC
|
||||
|
||||
@ -8,7 +8,7 @@ SHELL = /bin/sh
|
||||
|
||||
CC = mpiicpc
|
||||
OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
|
||||
CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
|
||||
CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
|
||||
-fno-alias -ansi-alias -restrict $(OPTFLAGS)
|
||||
SHFLAGS = -fPIC
|
||||
DEPFLAGS = -M
|
||||
|
||||
@ -34,6 +34,9 @@ using namespace FixConst;
|
||||
using namespace MathConst;
|
||||
|
||||
enum{SINGLE_PROC_DIRECT,SINGLE_PROC_MAP,MULTI_PROC};
|
||||
|
||||
#define BUFSIZE 8
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
FixNEB::FixNEB(LAMMPS *lmp, int narg, char **arg) :
|
||||
@ -45,56 +48,62 @@ FixNEB::FixNEB(LAMMPS *lmp, int narg, char **arg) :
|
||||
tagsendall(NULL), tagrecvall(NULL), counts(NULL),
|
||||
displacements(NULL)
|
||||
{
|
||||
|
||||
NEBLongRange=false;
|
||||
StandardNEB=true;
|
||||
PerpSpring=FreeEndIni=FreeEndFinal=false;
|
||||
FreeEndFinalWithRespToEIni=FinalAndInterWithRespToEIni=false;
|
||||
|
||||
kspringPerp=0.0;
|
||||
kspring2=1.0;
|
||||
if (narg < 4)
|
||||
error->all(FLERR,"Illegal fix neb command, argument missing");
|
||||
if (narg < 4) error->all(FLERR,"Illegal fix neb command");
|
||||
|
||||
kspring = force->numeric(FLERR,arg[3]);
|
||||
if (kspring <= 0.0)
|
||||
error->all(FLERR,"Illegal fix neb command."
|
||||
" The spring force was not provided properly");
|
||||
if (kspring <= 0.0) error->all(FLERR,"Illegal fix neb command");
|
||||
|
||||
int iarg =4;
|
||||
// optional params
|
||||
|
||||
NEBLongRange = false;
|
||||
StandardNEB = true;
|
||||
PerpSpring = FreeEndIni = FreeEndFinal = false;
|
||||
FreeEndFinalWithRespToEIni = FinalAndInterWithRespToEIni = false;
|
||||
kspringPerp = 0.0;
|
||||
kspring2 = 1.0;
|
||||
|
||||
int iarg = 4;
|
||||
while (iarg < narg) {
|
||||
if (strcmp (arg[iarg],"nudg_style")==0) {
|
||||
if (strcmp (arg[iarg+1],"idealpos")==0) {
|
||||
NEBLongRange = true;
|
||||
iarg+=2;}
|
||||
else if (strcmp (arg[iarg+1],"neigh")==0) {
|
||||
NEBLongRange = false;
|
||||
StandardNEB = true;
|
||||
iarg+=2;}
|
||||
else error->all(FLERR,"Illegal fix neb command. Unknown keyword");}
|
||||
else if (strcmp (arg[iarg],"perp")==0) {
|
||||
PerpSpring=true;
|
||||
if (strcmp(arg[iarg],"nudge") == 0) {
|
||||
if (iarg+2 > narg) error->all(FLERR,"Illegal fix neb command");
|
||||
if (strcmp(arg[iarg+1],"ideal") == 0) {
|
||||
NEBLongRange = true;
|
||||
StandardNEB = false;
|
||||
} else if (strcmp(arg[iarg+1],"neigh") == 0) {
|
||||
NEBLongRange = false;
|
||||
StandardNEB = true;
|
||||
} else error->all(FLERR,"Illegal fix neb command");
|
||||
iarg += 2;
|
||||
|
||||
} else if (strcmp(arg[iarg],"perp") == 0) {
|
||||
if (iarg+2 > narg) error->all(FLERR,"Illegal fix neb command");
|
||||
PerpSpring = true;
|
||||
kspringPerp = force->numeric(FLERR,arg[iarg+1]);
|
||||
if (kspringPerp < 0.0)
|
||||
error->all(FLERR,"Illegal fix neb command. "
|
||||
"The perpendicular spring force was not provided properly");
|
||||
iarg+=2;}
|
||||
else if (strcmp (arg[iarg],"freeend")==0) {
|
||||
if (strcmp (arg[iarg+1],"ini")==0)
|
||||
FreeEndIni=true;
|
||||
else if (strcmp (arg[iarg+1],"final")==0)
|
||||
FreeEndFinal=true;
|
||||
else if (strcmp (arg[iarg+1],"finaleini")==0)
|
||||
FreeEndFinalWithRespToEIni=true;
|
||||
else if (strcmp (arg[iarg+1],"final2eini")==0) {
|
||||
FinalAndInterWithRespToEIni=true;
|
||||
FreeEndFinalWithRespToEIni=true;}
|
||||
else if (strcmp (arg[iarg+1],"none")!=0) error->all(FLERR,"Illegal fix neb command. Unknown keyword");
|
||||
iarg+=2;}
|
||||
else if (strcmp (arg[iarg],"freeend_kspring")==0) {
|
||||
kspring2=force->numeric(FLERR,arg[iarg+1]);
|
||||
iarg+=2; }
|
||||
else error->all(FLERR,"Illegal fix neb command. Unknown keyword");
|
||||
if (kspringPerp == 0.0) PerpSpring = false;
|
||||
if (kspringPerp < 0.0) error->all(FLERR,"Illegal fix neb command");
|
||||
iarg += 2;
|
||||
|
||||
} else if (strcmp (arg[iarg],"end") == 0) {
|
||||
if (iarg+3 > narg) error->all(FLERR,"Illegal fix neb command");
|
||||
if (strcmp(arg[iarg+1],"first") == 0) {
|
||||
FreeEndIni = true;
|
||||
} else if (strcmp(arg[iarg+1],"last") == 0) {
|
||||
FreeEndFinal = true;
|
||||
FinalAndInterWithRespToEIni = false;
|
||||
FreeEndFinalWithRespToEIni = false;
|
||||
} else if (strcmp(arg[iarg+1],"last/efirst") == 0) {
|
||||
FreeEndFinal = false;
|
||||
FinalAndInterWithRespToEIni = false;
|
||||
FreeEndFinalWithRespToEIni = true;
|
||||
} else if (strcmp(arg[iarg+1],"last/efirst/middle") == 0) {
|
||||
FreeEndFinal = false;
|
||||
FinalAndInterWithRespToEIni = true;
|
||||
FreeEndFinalWithRespToEIni = true;
|
||||
} else error->all(FLERR,"Illegal fix neb command");
|
||||
kspring2 = force->numeric(FLERR,arg[iarg+2]);
|
||||
iarg += 3;
|
||||
|
||||
} else error->all(FLERR,"Illegal fix neb command");
|
||||
}
|
||||
|
||||
// nreplica = number of partitions
|
||||
@ -119,12 +128,12 @@ FixNEB::FixNEB(LAMMPS *lmp, int narg, char **arg) :
|
||||
MPI_Group uworldgroup,rootgroup;
|
||||
if (NEBLongRange) {
|
||||
for (int i=0; i<nreplica; i++)
|
||||
iroots[i]=universe->root_proc[i];
|
||||
iroots[i] = universe->root_proc[i];
|
||||
MPI_Comm_group(uworld, &uworldgroup);
|
||||
MPI_Group_incl(uworldgroup, nreplica, iroots, &rootgroup);
|
||||
MPI_Comm_create(uworld, rootgroup, &rootworld);
|
||||
}
|
||||
delete[] iroots;
|
||||
delete [] iroots;
|
||||
|
||||
// create a new compute pe style
|
||||
// id = fix-ID + pe, compute group = all
|
||||
@ -256,11 +265,11 @@ void FixNEB::min_post_force(int vflag)
|
||||
double delxp,delyp,delzp,delxn,delyn,delzn;
|
||||
double vIni=0.0;
|
||||
|
||||
vprev=vnext=veng=pe->compute_scalar();
|
||||
vprev = vnext = veng = pe->compute_scalar();
|
||||
|
||||
if (ireplica < nreplica-1 && me ==0)
|
||||
if (ireplica < nreplica-1 && me == 0)
|
||||
MPI_Send(&veng,1,MPI_DOUBLE,procnext,0,uworld);
|
||||
if (ireplica > 0 && me ==0)
|
||||
if (ireplica > 0 && me == 0)
|
||||
MPI_Recv(&vprev,1,MPI_DOUBLE,procprev,0,uworld,MPI_STATUS_IGNORE);
|
||||
|
||||
if (ireplica > 0 && me == 0)
|
||||
@ -297,6 +306,7 @@ void FixNEB::min_post_force(int vflag)
|
||||
}
|
||||
|
||||
// communicate atoms to/from adjacent replicas to fill xprev,xnext
|
||||
|
||||
inter_replica_comm();
|
||||
|
||||
// trigger potential energy computation on next timestep
|
||||
@ -335,10 +345,10 @@ void FixNEB::min_post_force(int vflag)
|
||||
tangent[i][0]=delxp;
|
||||
tangent[i][1]=delyp;
|
||||
tangent[i][2]=delzp;
|
||||
tlen += tangent[i][0]*tangent[i][0]
|
||||
+ tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
|
||||
dot += f[i][0]*tangent[i][0]
|
||||
+ f[i][1]*tangent[i][1] + f[i][2]*tangent[i][2];
|
||||
tlen += tangent[i][0]*tangent[i][0] +
|
||||
tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
|
||||
dot += f[i][0]*tangent[i][0] + f[i][1]*tangent[i][1] +
|
||||
f[i][2]*tangent[i][2];
|
||||
}
|
||||
}
|
||||
|
||||
@ -360,10 +370,10 @@ void FixNEB::min_post_force(int vflag)
|
||||
tangent[i][0]=delxn;
|
||||
tangent[i][1]=delyn;
|
||||
tangent[i][2]=delzn;
|
||||
tlen += tangent[i][0]*tangent[i][0]
|
||||
+ tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
|
||||
dot += f[i][0]*tangent[i][0]
|
||||
+ f[i][1]*tangent[i][1] + f[i][2]*tangent[i][2];
|
||||
tlen += tangent[i][0]*tangent[i][0] +
|
||||
tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
|
||||
dot += f[i][0]*tangent[i][0] + f[i][1]*tangent[i][1] +
|
||||
f[i][2]*tangent[i][2];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -388,13 +398,13 @@ void FixNEB::min_post_force(int vflag)
|
||||
domain->minimum_image(delxn,delyn,delzn);
|
||||
|
||||
if (vnext > veng && veng > vprev) {
|
||||
tangent[i][0]=delxn;
|
||||
tangent[i][1]=delyn;
|
||||
tangent[i][2]=delzn;
|
||||
tangent[i][0] = delxn;
|
||||
tangent[i][1] = delyn;
|
||||
tangent[i][2] = delzn;
|
||||
} else if (vnext < veng && veng < vprev) {
|
||||
tangent[i][0]=delxp;
|
||||
tangent[i][1]=delyp;
|
||||
tangent[i][2]=delzp;
|
||||
tangent[i][0] = delxp;
|
||||
tangent[i][1] = delyp;
|
||||
tangent[i][2] = delzp;
|
||||
} else {
|
||||
if (vnext > vprev) {
|
||||
tangent[i][0] = vmax*delxn + vmin*delxp;
|
||||
@ -408,24 +418,23 @@ void FixNEB::min_post_force(int vflag)
|
||||
}
|
||||
|
||||
nlen += delxn*delxn + delyn*delyn + delzn*delzn;
|
||||
tlen += tangent[i][0]*tangent[i][0]
|
||||
+ tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
|
||||
tlen += tangent[i][0]*tangent[i][0] +
|
||||
tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
|
||||
gradlen += f[i][0]*f[i][0] + f[i][1]*f[i][1] + f[i][2]*f[i][2];
|
||||
dotpath += delxp*delxn + delyp*delyn + delzp*delzn;
|
||||
dottangrad += tangent[i][0]* f[i][0]
|
||||
+ tangent[i][1]*f[i][1] + tangent[i][2]*f[i][2];
|
||||
gradnextlen += fnext[i][0]*fnext[i][0]
|
||||
+ fnext[i][1]*fnext[i][1] +fnext[i][2] * fnext[i][2];
|
||||
dotgrad += f[i][0]*fnext[i][0]
|
||||
+ f[i][1]*fnext[i][1] + f[i][2]*fnext[i][2];
|
||||
dottangrad += tangent[i][0]*f[i][0] +
|
||||
tangent[i][1]*f[i][1] + tangent[i][2]*f[i][2];
|
||||
gradnextlen += fnext[i][0]*fnext[i][0] +
|
||||
fnext[i][1]*fnext[i][1] +fnext[i][2] * fnext[i][2];
|
||||
dotgrad += f[i][0]*fnext[i][0] + f[i][1]*fnext[i][1] +
|
||||
f[i][2]*fnext[i][2];
|
||||
|
||||
springF[i][0]=kspringPerp*(delxn-delxp);
|
||||
springF[i][1]=kspringPerp*(delyn-delyp);
|
||||
springF[i][2]=kspringPerp*(delzn-delzp);
|
||||
springF[i][0] = kspringPerp*(delxn-delxp);
|
||||
springF[i][1] = kspringPerp*(delyn-delyp);
|
||||
springF[i][2] = kspringPerp*(delzn-delzp);
|
||||
}
|
||||
}
|
||||
|
||||
#define BUFSIZE 8
|
||||
double bufin[BUFSIZE], bufout[BUFSIZE];
|
||||
bufin[0] = nlen;
|
||||
bufin[1] = plen;
|
||||
@ -459,7 +468,7 @@ void FixNEB::min_post_force(int vflag)
|
||||
|
||||
// first or last replica has no change to forces, just return
|
||||
|
||||
if(ireplica>0 && ireplica<nreplica-1)
|
||||
if (ireplica > 0 && ireplica < nreplica-1)
|
||||
dottangrad = dottangrad/(tlen*gradlen);
|
||||
if (ireplica == 0)
|
||||
dottangrad = dottangrad/(nlen*gradlen);
|
||||
@ -468,7 +477,6 @@ void FixNEB::min_post_force(int vflag)
|
||||
if (ireplica < nreplica-1)
|
||||
dotgrad = dotgrad /(gradlen*gradnextlen);
|
||||
|
||||
|
||||
if (FreeEndIni && ireplica == 0) {
|
||||
if (tlen > 0.0) {
|
||||
double dotall;
|
||||
@ -568,14 +576,15 @@ void FixNEB::min_post_force(int vflag)
|
||||
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
if (mask[i] & groupbit) {
|
||||
dot += f[i][0]*tangent[i][0]
|
||||
+ f[i][1]*tangent[i][1] + f[i][2]*tangent[i][2];
|
||||
dotSpringTangent += springF[i][0]*tangent[i][0]
|
||||
+springF[i][1]*tangent[i][1]+springF[i][2]*tangent[i][2];}
|
||||
dot += f[i][0]*tangent[i][0] + f[i][1]*tangent[i][1] +
|
||||
f[i][2]*tangent[i][2];
|
||||
dotSpringTangent += springF[i][0]*tangent[i][0] +
|
||||
springF[i][1]*tangent[i][1] + springF[i][2]*tangent[i][2];}
|
||||
}
|
||||
|
||||
double dotSpringTangentall;
|
||||
MPI_Allreduce(&dotSpringTangent,&dotSpringTangentall,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
MPI_Allreduce(&dotSpringTangent,&dotSpringTangentall,1,
|
||||
MPI_DOUBLE,MPI_SUM,world);
|
||||
dotSpringTangent=dotSpringTangentall;
|
||||
double dotall;
|
||||
MPI_Allreduce(&dot,&dotall,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
@ -603,12 +612,12 @@ void FixNEB::min_post_force(int vflag)
|
||||
|
||||
for (int i = 0; i < nlocal; i++)
|
||||
if (mask[i] & groupbit) {
|
||||
f[i][0] += prefactor*tangent[i][0]
|
||||
+AngularContr*(springF[i][0] -dotSpringTangent*tangent[i][0]);
|
||||
f[i][1] += prefactor*tangent[i][1]
|
||||
+ AngularContr*(springF[i][1] - dotSpringTangent*tangent[i][1]);
|
||||
f[i][2] += prefactor*tangent[i][2]
|
||||
+ AngularContr*(springF[i][2] - dotSpringTangent*tangent[i][2]);
|
||||
f[i][0] += prefactor*tangent[i][0] +
|
||||
AngularContr*(springF[i][0] - dotSpringTangent*tangent[i][0]);
|
||||
f[i][1] += prefactor*tangent[i][1] +
|
||||
AngularContr*(springF[i][1] - dotSpringTangent*tangent[i][1]);
|
||||
f[i][2] += prefactor*tangent[i][2] +
|
||||
AngularContr*(springF[i][2] - dotSpringTangent*tangent[i][2]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -827,7 +836,6 @@ void FixNEB::inter_replica_comm()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
reallocate xprev,xnext,tangent arrays if necessary
|
||||
reallocate communication arrays if necessary
|
||||
|
||||
@ -4,6 +4,7 @@
|
||||
--------------------------------
|
||||
|
||||
W. Michael Brown (Intel) michael.w.brown at intel.com
|
||||
William McDoniel (RWTH Aachen University)
|
||||
Rodrigo Canales (RWTH Aachen University)
|
||||
Markus H<>hnerbach (RWTH Aachen University)
|
||||
Stan Moore (Sandia)
|
||||
@ -14,15 +15,25 @@
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
This package is based on the USER-OMP package and provides LAMMPS styles that:
|
||||
This package provides LAMMPS styles that:
|
||||
|
||||
1. include support for single and mixed precision in addition to double.
|
||||
2. include modifications to support vectorization for key routines
|
||||
3. include modifications for data layouts to improve cache efficiency
|
||||
3. include modifications to support offload to Intel(R) Xeon Phi(TM)
|
||||
coprocessors
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
For Intel server processors codenamed "Skylake", the following flags should
|
||||
be added or changed in the Makefile depending on the version:
|
||||
|
||||
2017 update 2 - No changes needed
|
||||
2017 updates 3 or 4 - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
|
||||
2018 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
When using the suffix command with "intel", intel styles will be used if they
|
||||
exist. If the suffix command is used with "hybrid intel omp" and the USER-OMP
|
||||
USER-OMP styles will be used whenever USER-INTEL styles are not available. This
|
||||
|
||||
@ -4,6 +4,7 @@
|
||||
# in.intel.lj - Atomic fluid (LJ Benchmark)
|
||||
# in.intel.rhodo - Protein (Rhodopsin Benchmark)
|
||||
# in.intel.lc - Liquid Crystal w/ Gay-Berne potential
|
||||
# in.intel.eam - Copper benchmark with Embedded Atom Method
|
||||
# in.intel.sw - Silicon benchmark with Stillinger-Weber
|
||||
# in.intel.tersoff - Silicon benchmark with Tersoff
|
||||
# in.intel.water - Coarse-grain water benchmark using Stillinger-Weber
|
||||
@ -11,19 +12,26 @@
|
||||
#############################################################################
|
||||
|
||||
#############################################################################
|
||||
# Expected Timesteps/second with turbo on and HT enabled, LAMMPS 18-Jun-2016
|
||||
# Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017
|
||||
# - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi
|
||||
#
|
||||
# Xeon E5-2697v4 Xeon Phi 7250
|
||||
#
|
||||
# in.intel.lj - 162.764 179.148
|
||||
# in.intel.rhodo - 11.633 13.668
|
||||
# in.intel.lc - 19.136 24.863
|
||||
# in.intel.sw - 139.048 152.026
|
||||
# in.intel.tersoff - 82.663 92.985
|
||||
# in.intel.water - 59.838 85.704
|
||||
# in.intel.lj - 199.5 282.3
|
||||
# in.intel.rhodo - 12.4 17.5
|
||||
# in.intel.lc - 19.0 25.7
|
||||
# in.intel.eam - 59.4 92.8
|
||||
# in.intel.sw - 132.4 161.9
|
||||
# in.intel.tersoff - 83.3 101.1
|
||||
# in.intel.water - 53.4 90.3
|
||||
#
|
||||
#############################################################################
|
||||
|
||||
#############################################################################
|
||||
# For Skylake server (Xeon) architectures, see notes in the USER-INTEL/README
|
||||
# for build flags that should be used.
|
||||
#############################################################################
|
||||
|
||||
#############################################################################
|
||||
# For Haswell (Xeon v3) architectures, depending on the compiler version,
|
||||
# it may give better performance to compile for an AVX target (with -xAVX
|
||||
@ -42,7 +50,18 @@
|
||||
# -v m 0.5 # Run for half as long
|
||||
#############################################################################
|
||||
|
||||
# Example for running benchmarks:
|
||||
#############################################################################
|
||||
# The LAMMPS newton setting can be controlled from the commandline for the
|
||||
# benchmarks with the N variable:
|
||||
#
|
||||
# -v N on # newton on
|
||||
# -v N off # newton off
|
||||
#
|
||||
# The default is on for all of the benchmarks except for LJ where the off
|
||||
# setting performs best with the USER-INTEL package
|
||||
#############################################################################
|
||||
|
||||
# Example for running benchmarks (see run_benchmarks.sh for script):
|
||||
|
||||
# Number of physical cores per node not including hyperthreads
|
||||
export LMP_CORES=28
|
||||
@ -57,26 +76,35 @@ export LMP_BIN=../../lmp_intel_cpu
|
||||
# LAMMPS root directory
|
||||
export LMP_ROOT=../../../
|
||||
|
||||
source /opt/intel/parallel_studio_xe_2016.2.062/psxevars.sh
|
||||
source source /opt/intel/parallel_studio_xe_2017.2.050/psxevars.sh
|
||||
export KMP_BLOCKTIME=0
|
||||
export I_MPI_PIN_DOMAIN=core
|
||||
export I_MPI_FABRICS=shm # For single node
|
||||
|
||||
# ONLY FOR INTEL XEON PHI x200 SERIES PROCESSORS
|
||||
export I_MPI_SHM_LMT=shm
|
||||
|
||||
# Generate the restart file for use with liquid crystal benchmark
|
||||
mpirun -np $LMP_CORES $LMP_BIN -in in.lc_generate_restart -log none
|
||||
|
||||
# Benchmark to run
|
||||
export bench=in.intel.lj
|
||||
|
||||
#############################################################################
|
||||
# For Intel Xeon Phi x200 series processors best performance is achieved by
|
||||
# using MCDRAM. In flat mode, this can be achieved with numactl,
|
||||
# MPI environment variables, or other options provided by batch schedulers
|
||||
#############################################################################
|
||||
|
||||
#############################################################################
|
||||
# To run without a optimization package
|
||||
#############################################################################
|
||||
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none
|
||||
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -v N on
|
||||
|
||||
#############################################################################
|
||||
# To run with USER-OMP package
|
||||
#############################################################################
|
||||
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk omp 0 -sf omp
|
||||
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk omp 0 -sf omp -v N on
|
||||
|
||||
#############################################################################
|
||||
# To run with USER-INTEL package and no coprocessor
|
||||
@ -89,6 +117,9 @@ mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 -sf intel
|
||||
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 1 -sf intel
|
||||
|
||||
#############################################################################
|
||||
# If using PPPM (in.intel.rhodo) on Intel Xeon Phi x200 series processors
|
||||
# If using PPPM (e.g. in.intel.rhodo) on Intel Xeon Phi x200 series
|
||||
# or Skylake processors
|
||||
#############################################################################
|
||||
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 omp 3 lrt yes -sf intel
|
||||
export KMP_AFFINITY=none
|
||||
rthreads=$((OMP_NUM_THREADS-1))
|
||||
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 omp $rthreads lrt yes -sf intel
|
||||
|
||||
@ -1,4 +1,6 @@
|
||||
# bulk Cu lattice
|
||||
|
||||
variable N index on # Newton Setting
|
||||
variable w index 10 # Warmup Timesteps
|
||||
variable t index 3100 # Main Run Timesteps
|
||||
variable m index 1 # Main Run Timestep Multiplier
|
||||
@ -13,6 +15,7 @@ variable z index 2
|
||||
variable rr equal floor($t*$m)
|
||||
variable root getenv LMP_ROOT
|
||||
|
||||
newton $N
|
||||
if "$n > 0" then "processors * * * grid numa"
|
||||
|
||||
variable xx equal 20*$x
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
# shape: 2 1.5 1
|
||||
# cutoff 4.0 with skin 0.8
|
||||
|
||||
variable N index on # Newton Setting
|
||||
variable w index 10 # Warmup Timesteps
|
||||
variable t index 840 # Main Run Timesteps
|
||||
variable m index 1 # Main Run Timestep Multiplier
|
||||
@ -15,6 +16,7 @@ variable z index 2
|
||||
|
||||
variable rr equal floor($t*$m)
|
||||
|
||||
newton $N
|
||||
if "$n > 0" then "processors * * * grid numa"
|
||||
|
||||
units lj
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# 3d Lennard-Jones melt
|
||||
|
||||
variable N index off # Newton Setting
|
||||
variable w index 10 # Warmup Timesteps
|
||||
variable t index 7900 # Main Run Timesteps
|
||||
variable m index 1 # Main Run Timestep Multiplier
|
||||
@ -15,6 +16,7 @@ variable yy equal 20*$y
|
||||
variable zz equal 20*$z
|
||||
variable rr equal floor($t*$m)
|
||||
|
||||
newton $N
|
||||
if "$n > 0" then "processors * * * grid numa"
|
||||
|
||||
units lj
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# Rhodopsin model
|
||||
|
||||
variable N index on # Newton Setting
|
||||
variable w index 10 # Warmup Timesteps
|
||||
variable t index 520 # Main Run Timesteps
|
||||
variable m index 1 # Main Run Timestep Multiplier
|
||||
@ -16,10 +17,11 @@ variable z index 2
|
||||
variable rr equal floor($t*$m)
|
||||
variable root getenv LMP_ROOT
|
||||
|
||||
newton $N
|
||||
if "$n > 0" then "processors * * * grid numa"
|
||||
|
||||
units real
|
||||
neigh_modify delay 5 every 1 binsize $b
|
||||
neigh_modify delay 5 every 1
|
||||
|
||||
atom_style full
|
||||
bond_style harmonic
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# bulk Si via Stillinger-Weber
|
||||
|
||||
variable N index on # Newton Setting
|
||||
variable w index 10 # Warmup Timesteps
|
||||
variable t index 6200 # Main Run Timesteps
|
||||
variable m index 1 # Main Run Timestep Multiplier
|
||||
@ -16,6 +17,7 @@ variable zz equal 10*$z
|
||||
variable rr equal floor($t*$m)
|
||||
variable root getenv LMP_ROOT
|
||||
|
||||
newton $N
|
||||
if "$n > 0" then "processors * * * grid numa"
|
||||
|
||||
units metal
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# bulk Si via Tersoff
|
||||
|
||||
variable N index on # Newton Setting
|
||||
variable w index 10 # Warmup Timesteps
|
||||
variable t index 2420 # Main Run Timesteps
|
||||
variable m index 1 # Main Run Timestep Multiplier
|
||||
@ -16,6 +17,7 @@ variable zz equal 10*$z
|
||||
variable rr equal floor($t*$m)
|
||||
variable root getenv LMP_ROOT
|
||||
|
||||
newton $N
|
||||
if "$n > 0" then "processors * * * grid numa"
|
||||
|
||||
units metal
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# Coarse-grain water simulation using Stillinger-Weber
|
||||
|
||||
variable N index on # Newton Setting
|
||||
variable w index 10 # Warmup Timesteps
|
||||
variable t index 2600 # Main Run Timesteps
|
||||
variable m index 1 # Main Run Timestep Multiplier
|
||||
@ -11,6 +12,7 @@ variable y index 2
|
||||
variable z index 2
|
||||
variable rr equal floor($t*$m)
|
||||
|
||||
newton $N
|
||||
if "$n > 0" then "processors * * * grid numa"
|
||||
|
||||
units real
|
||||
|
||||
@ -4,13 +4,13 @@
|
||||
# cutoff 4.0 with skin 0.8
|
||||
# NPT, T=2.4, P=8.0
|
||||
|
||||
variable x index 1
|
||||
variable y index 1
|
||||
variable z index 1
|
||||
variable xt index 1
|
||||
variable yt index 1
|
||||
variable zt index 1
|
||||
|
||||
variable i equal $x*32
|
||||
variable j equal $y*32
|
||||
variable k equal $z*32
|
||||
variable i equal ${xt}*32
|
||||
variable j equal ${yt}*32
|
||||
variable k equal ${zt}*32
|
||||
|
||||
units lj
|
||||
atom_style ellipsoid
|
||||
|
||||
86
src/USER-INTEL/TEST/run_benchmarks.sh
Executable file
86
src/USER-INTEL/TEST/run_benchmarks.sh
Executable file
@ -0,0 +1,86 @@
|
||||
#!/bin/bash
|
||||
|
||||
#########################################################################
|
||||
# Adjust settings below for your system
|
||||
#########################################################################
|
||||
|
||||
# --------------------- MPI Launch Command
|
||||
|
||||
export MPI="mpirun"
|
||||
#export MPI="numactl -p 1 mpirun" # -- Systems w/ MCDRAM in flat mode
|
||||
|
||||
# ------------- Name and location of the LAMMPS binary
|
||||
|
||||
export LMP_BIN=../../lmp_intel_cpu_intelmpi
|
||||
#export LMP_BIN=../../lmp_knl
|
||||
|
||||
# ------------- Directory containing the LAMMPS installation
|
||||
|
||||
export LMP_ROOT=../../../
|
||||
|
||||
# ------------- Number of physical cores (not HW threads)
|
||||
|
||||
export LMP_CORES=36 # -- For Intel Xeon E5-2697v4 SKU
|
||||
#export LMP_CORES=68 # -- For Intel Xeon Phi x200 7250 SKU
|
||||
|
||||
# ------------- Number of HW threads to use in tests
|
||||
|
||||
export LMP_THREAD_LIST="2" # -- For 2 threads per core w/ HT enabled
|
||||
#export LMP_THREAD_LIST="2 4" # -- For 2 threads per core w/ HT enabled
|
||||
|
||||
# ------------- MPI Tuning Parameters
|
||||
|
||||
#export I_MPI_SHM_LMT=shm # -- Uncomment for Xeon Phi x200 series
|
||||
|
||||
# ------------- Library locations for build
|
||||
|
||||
#source /opt/intel/parallel_studio_xe_2017.2.050/psxevars.sh
|
||||
|
||||
#########################################################################
|
||||
# End settings for your system
|
||||
#########################################################################
|
||||
|
||||
export WORKLOADS="lj rhodo rhodo_lrt lc sw water eam"
|
||||
export LMP_ARGS="-pk intel 0 -sf intel -screen none -v d 1"
|
||||
export RLMP_ARGS="-pk intel 0 lrt yes -sf intel -screen none -v d 1"
|
||||
|
||||
export LOG_DIR_HEADER=`echo $LMP_BIN | sed 's/\.\.\///g' | sed 's/\.\///g'`
|
||||
export LOG_DIR_HOST=`hostname`
|
||||
export DATE_STRING=`date +%s`
|
||||
export LOG_DIR=$LOG_DIR_HOST"_"$LOG_DIR_HEADER"_"$DATE_STRING
|
||||
mkdir $LOG_DIR
|
||||
|
||||
export I_MPI_PIN_DOMAIN=core
|
||||
export I_MPI_FABRICS=shm
|
||||
export KMP_BLOCKTIME=0
|
||||
|
||||
echo -n "Creating restart file...."
|
||||
$MPI -np $LMP_CORES $LMP_BIN -in in.lc_generate_restart -log none $LMP_ARGS
|
||||
echo "Done."
|
||||
for threads in $LMP_THREAD_LIST
|
||||
do
|
||||
export OMP_NUM_THREADS=$threads
|
||||
for workload in $WORKLOADS
|
||||
do
|
||||
export LOGFILE=$LOG_DIR/$workload.$LMP_CORES"c"$threads"t".log
|
||||
echo "Running $LOGFILE"
|
||||
cmd="$MPI -np $LMP_CORES $LMP_BIN -in in.intel.$workload -log $LOGFILE $LMP_ARGS";
|
||||
rthreads=$threads
|
||||
unset KMP_AFFINITY
|
||||
$cmd
|
||||
|
||||
# - For benchmarks with PPPM, also try LRT mode
|
||||
if [ $workload = "rhodo" ]; then
|
||||
export LOGFILE=$LOG_DIR/$workload"_lrt".$LMP_CORES"c"$threads"t".log
|
||||
cmd="$MPI -np $LMP_CORES $LMP_BIN -in in.intel.$workload -log $LOGFILE $RLMP_ARGS";
|
||||
rthreads=$((threads-1))
|
||||
export KMP_AFFINITY=none
|
||||
export OMP_NUM_THREADS=$rthreads
|
||||
echo " $cmd" >> $LOG_DIR/commands.info
|
||||
$cmd
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
||||
# Performance reported by LAMMPS (Timesteps/second ignoring warm-up run)
|
||||
grep Perf $LOG_DIR/*.log | awk 'BEGIN{n=1}n%2==0{print $0}{n++}' | sed 's/\/day//g' | sed 's/steps\/s/steps_s/g' | sed 's/hours\/ns//g' | sed 's/.*\///g' | sed 's/\.log:Performance://g' | awk '{c=NF-1; print $1,$c}'
|
||||
@ -81,16 +81,16 @@ void AngleCharmmIntel::compute(int eflag, int vflag,
|
||||
else evflag = 0;
|
||||
|
||||
if (evflag) {
|
||||
if (eflag) {
|
||||
if (vflag && !eflag) {
|
||||
if (force->newton_bond)
|
||||
eval<0,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
@ -102,7 +102,7 @@ void AngleCharmmIntel::compute(int eflag, int vflag,
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void AngleCharmmIntel::eval(const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
@ -126,12 +126,9 @@ void AngleCharmmIntel::eval(const int vflag,
|
||||
const int nthreads = tc;
|
||||
|
||||
acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
oeangle = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
if (EFLAG) oeangle = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
@ -140,8 +137,12 @@ void AngleCharmmIntel::eval(const int vflag,
|
||||
reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int nfrom, nto, tid;
|
||||
int nfrom, npl, nto, tid;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||
#else
|
||||
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
|
||||
#endif
|
||||
|
||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||
if (fix->need_zero(tid))
|
||||
@ -150,7 +151,17 @@ void AngleCharmmIntel::eval(const int vflag,
|
||||
const int4_t * _noalias const anglelist =
|
||||
(int4_t *) neighbor->anglelist[0];
|
||||
|
||||
for (int n = nfrom; n < nto; n++) {
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
acc_t seangle, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
if (EFLAG) seangle = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
}
|
||||
#pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
for (int n = nfrom; n < nto; n ++) {
|
||||
#else
|
||||
for (int n = nfrom; n < nto; n += npl) {
|
||||
#endif
|
||||
const int i1 = anglelist[n].a;
|
||||
const int i2 = anglelist[n].b;
|
||||
const int i3 = anglelist[n].c;
|
||||
@ -229,40 +240,58 @@ void AngleCharmmIntel::eval(const int vflag,
|
||||
|
||||
// apply force to each of 3 atoms
|
||||
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += f1x;
|
||||
f[i1].y += f1y;
|
||||
f[i1].z += f1z;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
#pragma simdoff
|
||||
#endif
|
||||
{
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += f1x;
|
||||
f[i1].y += f1y;
|
||||
f[i1].z += f1z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x -= f1x + f3x;
|
||||
f[i2].y -= f1y + f3y;
|
||||
f[i2].z -= f1z + f3z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i3 < nlocal) {
|
||||
f[i3].x += f3x;
|
||||
f[i3].y += f3y;
|
||||
f[i3].z += f3z;
|
||||
}
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x -= f1x + f3x;
|
||||
f[i2].y -= f1y + f3y;
|
||||
f[i2].z -= f1z + f3z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i3 < nlocal) {
|
||||
f[i3].x += f3x;
|
||||
f[i3].y += f3y;
|
||||
f[i3].z += f3z;
|
||||
}
|
||||
|
||||
if (EVFLAG) {
|
||||
IP_PRE_ev_tally_angle(EFLAG, eatom, vflag, eangle, i1, i2, i3,f1x,
|
||||
f1y, f1z, f3x, f3y, f3z, delx1, dely1, delz1,
|
||||
delx2, dely2, delz2, oeangle, f, NEWTON_BOND,
|
||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
if (EFLAG || VFLAG) {
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2,
|
||||
i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1,
|
||||
dely1, delz1, delx2, dely2, delz2, seangle,
|
||||
f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3,
|
||||
sv4, sv5);
|
||||
#else
|
||||
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2,
|
||||
i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1,
|
||||
dely1, delz1, delx2, dely2, delz2, oeangle,
|
||||
f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3,
|
||||
ov4, ov5);
|
||||
#endif
|
||||
}
|
||||
} // for n
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
if (EFLAG) oeangle += seangle;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||
}
|
||||
#endif
|
||||
} // omp parallel
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
energy += oeangle;
|
||||
if (vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
if (EFLAG) energy += oeangle;
|
||||
if (VFLAG && vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
|
||||
fix->set_reduce_flag();
|
||||
|
||||
@ -81,16 +81,16 @@ void AngleHarmonicIntel::compute(int eflag, int vflag,
|
||||
else evflag = 0;
|
||||
|
||||
if (evflag) {
|
||||
if (eflag) {
|
||||
if (vflag && !eflag) {
|
||||
if (force->newton_bond)
|
||||
eval<0,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
@ -102,7 +102,7 @@ void AngleHarmonicIntel::compute(int eflag, int vflag,
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void AngleHarmonicIntel::eval(const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
@ -126,12 +126,9 @@ void AngleHarmonicIntel::eval(const int vflag,
|
||||
const int nthreads = tc;
|
||||
|
||||
acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
oeangle = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
if (EFLAG) oeangle = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
@ -140,8 +137,12 @@ void AngleHarmonicIntel::eval(const int vflag,
|
||||
reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int nfrom, nto, tid;
|
||||
int nfrom, npl, nto, tid;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||
#else
|
||||
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
|
||||
#endif
|
||||
|
||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||
if (fix->need_zero(tid))
|
||||
@ -150,7 +151,17 @@ void AngleHarmonicIntel::eval(const int vflag,
|
||||
const int4_t * _noalias const anglelist =
|
||||
(int4_t *) neighbor->anglelist[0];
|
||||
|
||||
for (int n = nfrom; n < nto; n++) {
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
acc_t seangle, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
if (EFLAG) seangle = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
}
|
||||
#pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
for (int n = nfrom; n < nto; n ++) {
|
||||
#else
|
||||
for (int n = nfrom; n < nto; n += npl) {
|
||||
#endif
|
||||
const int i1 = anglelist[n].a;
|
||||
const int i2 = anglelist[n].b;
|
||||
const int i3 = anglelist[n].c;
|
||||
@ -211,40 +222,58 @@ void AngleHarmonicIntel::eval(const int vflag,
|
||||
|
||||
// apply force to each of 3 atoms
|
||||
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += f1x;
|
||||
f[i1].y += f1y;
|
||||
f[i1].z += f1z;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
#pragma simdoff
|
||||
#endif
|
||||
{
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += f1x;
|
||||
f[i1].y += f1y;
|
||||
f[i1].z += f1z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x -= f1x + f3x;
|
||||
f[i2].y -= f1y + f3y;
|
||||
f[i2].z -= f1z + f3z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i3 < nlocal) {
|
||||
f[i3].x += f3x;
|
||||
f[i3].y += f3y;
|
||||
f[i3].z += f3z;
|
||||
}
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x -= f1x + f3x;
|
||||
f[i2].y -= f1y + f3y;
|
||||
f[i2].z -= f1z + f3z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i3 < nlocal) {
|
||||
f[i3].x += f3x;
|
||||
f[i3].y += f3y;
|
||||
f[i3].z += f3z;
|
||||
}
|
||||
|
||||
if (EVFLAG) {
|
||||
IP_PRE_ev_tally_angle(EFLAG, eatom, vflag, eangle, i1, i2, i3,f1x,
|
||||
f1y, f1z, f3x, f3y, f3z, delx1, dely1, delz1,
|
||||
delx2, dely2, delz2, oeangle, f, NEWTON_BOND,
|
||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
if (EFLAG || VFLAG) {
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
|
||||
f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1,
|
||||
delz1, delx2, dely2, delz2, seangle, f,
|
||||
NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4,
|
||||
sv5);
|
||||
#else
|
||||
IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
|
||||
f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1,
|
||||
delz1, delx2, dely2, delz2, oeangle, f,
|
||||
NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4,
|
||||
ov5);
|
||||
#endif
|
||||
}
|
||||
} // for n
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
if (EFLAG) oeangle += seangle;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||
}
|
||||
#endif
|
||||
} // omp parallel
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
energy += oeangle;
|
||||
if (vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
if (EFLAG) energy += oeangle;
|
||||
if (VFLAG && vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
|
||||
fix->set_reduce_flag();
|
||||
|
||||
@ -77,16 +77,16 @@ void BondFENEIntel::compute(int eflag, int vflag,
|
||||
else evflag = 0;
|
||||
|
||||
if (evflag) {
|
||||
if (eflag) {
|
||||
if (vflag && !eflag) {
|
||||
if (force->newton_bond)
|
||||
eval<0,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
@ -96,10 +96,10 @@ void BondFENEIntel::compute(int eflag, int vflag,
|
||||
}
|
||||
}
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void BondFENEIntel::eval(const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
{
|
||||
const int inum = neighbor->nbondlist;
|
||||
if (inum == 0) return;
|
||||
@ -119,23 +119,23 @@ void BondFENEIntel::eval(const int vflag,
|
||||
const int nthreads = tc;
|
||||
|
||||
acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
oebond = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
if (EFLAG) oebond = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,fc) \
|
||||
reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int nfrom, nto, tid;
|
||||
int nfrom, npl, nto, tid;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||
#else
|
||||
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
|
||||
#endif
|
||||
|
||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||
if (fix->need_zero(tid))
|
||||
@ -144,7 +144,17 @@ void BondFENEIntel::eval(const int vflag,
|
||||
const int3_t * _noalias const bondlist =
|
||||
(int3_t *) neighbor->bondlist[0];
|
||||
|
||||
for (int n = nfrom; n < nto; n++) {
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
acc_t sebond, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
if (EFLAG) sebond = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
}
|
||||
#pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
for (int n = nfrom; n < nto; n ++) {
|
||||
#else
|
||||
for (int n = nfrom; n < nto; n += npl) {
|
||||
#endif
|
||||
const int i1 = bondlist[n].a;
|
||||
const int i2 = bondlist[n].b;
|
||||
const int type = bondlist[n].t;
|
||||
@ -199,33 +209,48 @@ void BondFENEIntel::eval(const int vflag,
|
||||
|
||||
// apply force to each of 2 atoms
|
||||
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += delx*fbond;
|
||||
f[i1].y += dely*fbond;
|
||||
f[i1].z += delz*fbond;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
#pragma simdoff
|
||||
#endif
|
||||
{
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += delx*fbond;
|
||||
f[i1].y += dely*fbond;
|
||||
f[i1].z += delz*fbond;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x -= delx*fbond;
|
||||
f[i2].y -= dely*fbond;
|
||||
f[i2].z -= delz*fbond;
|
||||
}
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x -= delx*fbond;
|
||||
f[i2].y -= dely*fbond;
|
||||
f[i2].z -= delz*fbond;
|
||||
}
|
||||
|
||||
if (EVFLAG) {
|
||||
IP_PRE_ev_tally_bond(EFLAG, eatom, vflag, ebond, i1, i2, fbond,
|
||||
if (EFLAG || VFLAG) {
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond,
|
||||
delx, dely, delz, sebond, f, NEWTON_BOND,
|
||||
nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
|
||||
#else
|
||||
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond,
|
||||
delx, dely, delz, oebond, f, NEWTON_BOND,
|
||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
#endif
|
||||
}
|
||||
} // for n
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
if (EFLAG) oebond += sebond;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||
}
|
||||
#endif
|
||||
} // omp parallel
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
energy += oebond;
|
||||
if (vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
if (EFLAG) energy += oebond;
|
||||
if (VFLAG && vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
|
||||
fix->set_reduce_flag();
|
||||
|
||||
@ -77,16 +77,16 @@ void BondHarmonicIntel::compute(int eflag, int vflag,
|
||||
else evflag = 0;
|
||||
|
||||
if (evflag) {
|
||||
if (eflag) {
|
||||
if (vflag && !eflag) {
|
||||
if (force->newton_bond)
|
||||
eval<0,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
@ -96,7 +96,7 @@ void BondHarmonicIntel::compute(int eflag, int vflag,
|
||||
}
|
||||
}
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void BondHarmonicIntel::eval(const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
@ -119,12 +119,9 @@ void BondHarmonicIntel::eval(const int vflag,
|
||||
const int nthreads = tc;
|
||||
|
||||
acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
oebond = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
if (EFLAG) oebond = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
@ -133,8 +130,12 @@ void BondHarmonicIntel::eval(const int vflag,
|
||||
reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int nfrom, nto, tid;
|
||||
int nfrom, npl, nto, tid;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||
#else
|
||||
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
|
||||
#endif
|
||||
|
||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||
if (fix->need_zero(tid))
|
||||
@ -143,7 +144,17 @@ void BondHarmonicIntel::eval(const int vflag,
|
||||
const int3_t * _noalias const bondlist =
|
||||
(int3_t *) neighbor->bondlist[0];
|
||||
|
||||
for (int n = nfrom; n < nto; n++) {
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
acc_t sebond, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
if (EFLAG) sebond = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
}
|
||||
#pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
for (int n = nfrom; n < nto; n ++) {
|
||||
#else
|
||||
for (int n = nfrom; n < nto; n += npl) {
|
||||
#endif
|
||||
const int i1 = bondlist[n].a;
|
||||
const int i2 = bondlist[n].b;
|
||||
const int type = bondlist[n].t;
|
||||
@ -167,33 +178,50 @@ void BondHarmonicIntel::eval(const int vflag,
|
||||
if (EFLAG) ebond = rk*dr;
|
||||
|
||||
// apply force to each of 2 atoms
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += delx*fbond;
|
||||
f[i1].y += dely*fbond;
|
||||
f[i1].z += delz*fbond;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
#pragma simdoff
|
||||
#endif
|
||||
{
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += delx*fbond;
|
||||
f[i1].y += dely*fbond;
|
||||
f[i1].z += delz*fbond;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x -= delx*fbond;
|
||||
f[i2].y -= dely*fbond;
|
||||
f[i2].z -= delz*fbond;
|
||||
}
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x -= delx*fbond;
|
||||
f[i2].y -= dely*fbond;
|
||||
f[i2].z -= delz*fbond;
|
||||
}
|
||||
|
||||
if (EVFLAG) {
|
||||
IP_PRE_ev_tally_bond(EFLAG, eatom, vflag, ebond, i1, i2, fbond,
|
||||
delx, dely, delz, oebond, f, NEWTON_BOND,
|
||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
if (EFLAG || VFLAG) {
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2,
|
||||
fbond, delx, dely, delz, sebond, f,
|
||||
NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3,
|
||||
sv4, sv5);
|
||||
#else
|
||||
IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2,
|
||||
fbond, delx, dely, delz, oebond, f,
|
||||
NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3,
|
||||
ov4, ov5);
|
||||
#endif
|
||||
}
|
||||
} // for n
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
if (EFLAG) oebond += sebond;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||
}
|
||||
#endif
|
||||
} // omp parallel
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
energy += oebond;
|
||||
if (vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
if (EFLAG) energy += oebond;
|
||||
if (VFLAG && vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
|
||||
fix->set_reduce_flag();
|
||||
|
||||
@ -93,16 +93,16 @@ void DihedralCharmmIntel::compute(int eflag, int vflag,
|
||||
force->pair->vflag_either = force->pair->vflag_global = 1;
|
||||
|
||||
if (evflag) {
|
||||
if (eflag) {
|
||||
if (vflag && !eflag) {
|
||||
if (force->newton_bond)
|
||||
eval<0,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
@ -114,7 +114,7 @@ void DihedralCharmmIntel::compute(int eflag, int vflag,
|
||||
|
||||
#ifndef LMP_USE_AVXCD_DHC
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void DihedralCharmmIntel::eval(const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
@ -140,13 +140,10 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
|
||||
acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
oevdwl = oecoul = oedihedral = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
|
||||
}
|
||||
if (EFLAG) oevdwl = oecoul = oedihedral = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
@ -156,8 +153,13 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
opv0,opv1,opv2,opv3,opv4,opv5)
|
||||
#endif
|
||||
{
|
||||
#if defined(LMP_SIMD_COMPILER_TEST)
|
||||
int nfrom, nto, tid;
|
||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||
#else
|
||||
int nfrom, npl, nto, tid;
|
||||
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
|
||||
#endif
|
||||
|
||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||
if (fix->need_zero(tid))
|
||||
@ -169,21 +171,19 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
|
||||
acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
sevdwl = secoul = sedihedral = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
spv0 = spv1 = spv2 = spv3 = spv4 = spv5 = (acc_t)0.0;
|
||||
}
|
||||
if (EFLAG) sevdwl = secoul = sedihedral = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
spv0 = spv1 = spv2 = spv3 = spv4 = spv5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER_TEST)
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
|
||||
sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5)
|
||||
#endif
|
||||
for (int n = nfrom; n < nto; n++) {
|
||||
#endif
|
||||
for (int n = nfrom; n < nto; n += npl) {
|
||||
const int i1 = dihedrallist[n].a;
|
||||
const int i2 = dihedrallist[n].b;
|
||||
const int i3 = dihedrallist[n].c;
|
||||
@ -333,14 +333,14 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
const flt_t f3y = -sy2 - f4y;
|
||||
const flt_t f3z = -sz2 - f4z;
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG || VFLAG) {
|
||||
flt_t deng;
|
||||
if (EFLAG) deng = tk * p;
|
||||
IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, deng, i1, i2, i3, i4, f1x,
|
||||
f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x,
|
||||
vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y,
|
||||
vb3z, sedihedral, f, NEWTON_BOND, nlocal,
|
||||
sv0, sv1, sv2, sv3, sv4, sv5);
|
||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3,
|
||||
i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
|
||||
f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
|
||||
vb3x, vb3y, vb3z, sedihedral, f, NEWTON_BOND,
|
||||
nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
|
||||
}
|
||||
|
||||
|
||||
@ -387,7 +387,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
f4z -= delz*fpair;
|
||||
}
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG || VFLAG) {
|
||||
flt_t ev_pre = (flt_t)0;
|
||||
if (NEWTON_BOND || i1 < nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
@ -412,7 +412,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
}
|
||||
// IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
|
||||
// delx, dely, delz);
|
||||
if (vflag) {
|
||||
if (VFLAG && vflag) {
|
||||
spv0 += ev_pre * delx * delx * fpair;
|
||||
spv1 += ev_pre * dely * dely * fpair;
|
||||
spv2 += ev_pre * delz * delz * fpair;
|
||||
@ -440,36 +440,32 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
}
|
||||
}
|
||||
} // for n
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
oedihedral += sedihedral;
|
||||
oecoul += secoul;
|
||||
oevdwl += sevdwl;
|
||||
}
|
||||
if (vflag) {
|
||||
ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||
opv0 += spv0; opv1 += spv1; opv2 += spv2;
|
||||
opv3 += spv3; opv4 += spv4; opv5 += spv5;
|
||||
}
|
||||
if (EFLAG) {
|
||||
oedihedral += sedihedral;
|
||||
oecoul += secoul;
|
||||
oevdwl += sevdwl;
|
||||
}
|
||||
if (VFLAG && vflag) {
|
||||
ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||
opv0 += spv0; opv1 += spv1; opv2 += spv2;
|
||||
opv3 += spv3; opv4 += spv4; opv5 += spv5;
|
||||
}
|
||||
} // omp parallel
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
energy += oedihedral;
|
||||
force->pair->eng_vdwl += oevdwl;
|
||||
force->pair->eng_coul += oecoul;
|
||||
}
|
||||
if (vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
force->pair->virial[0] += opv0;
|
||||
force->pair->virial[1] += opv1;
|
||||
force->pair->virial[2] += opv2;
|
||||
force->pair->virial[3] += opv3;
|
||||
force->pair->virial[4] += opv4;
|
||||
force->pair->virial[5] += opv5;
|
||||
}
|
||||
if (EFLAG) {
|
||||
energy += oedihedral;
|
||||
force->pair->eng_vdwl += oevdwl;
|
||||
force->pair->eng_coul += oecoul;
|
||||
}
|
||||
if (VFLAG && vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
force->pair->virial[0] += opv0;
|
||||
force->pair->virial[1] += opv1;
|
||||
force->pair->virial[2] += opv2;
|
||||
force->pair->virial[3] += opv3;
|
||||
force->pair->virial[4] += opv4;
|
||||
force->pair->virial[5] += opv5;
|
||||
}
|
||||
|
||||
fix->set_reduce_flag();
|
||||
@ -488,7 +484,7 @@ authors for more details.
|
||||
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void DihedralCharmmIntel::eval(const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
@ -518,13 +514,10 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
|
||||
acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
oevdwl = oecoul = oedihedral = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
|
||||
}
|
||||
if (EFLAG) oevdwl = oecoul = oedihedral = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
@ -534,8 +527,9 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
opv0,opv1,opv2,opv3,opv4,opv5)
|
||||
#endif
|
||||
{
|
||||
int nfrom, nto, tid;
|
||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||
int nfrom, npl, nto, tid;
|
||||
IP_PRE_omp_stride_id_vec(nfrom, npl, nto, tid, inum, nthreads,
|
||||
swidth);
|
||||
|
||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||
if (fix->need_zero(tid))
|
||||
@ -559,26 +553,24 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
|
||||
SIMD_acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
SIMD_acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
sevdwl = SIMD_set((acc_t)0.0);
|
||||
secoul = SIMD_set((acc_t)0.0);
|
||||
sedihedral = SIMD_set((acc_t)0.0);
|
||||
}
|
||||
if (vflag) {
|
||||
sv0 = SIMD_set((acc_t)0.0);
|
||||
sv1 = SIMD_set((acc_t)0.0);
|
||||
sv2 = SIMD_set((acc_t)0.0);
|
||||
sv3 = SIMD_set((acc_t)0.0);
|
||||
sv4 = SIMD_set((acc_t)0.0);
|
||||
sv5 = SIMD_set((acc_t)0.0);
|
||||
spv0 = SIMD_set((acc_t)0.0);
|
||||
spv1 = SIMD_set((acc_t)0.0);
|
||||
spv2 = SIMD_set((acc_t)0.0);
|
||||
spv3 = SIMD_set((acc_t)0.0);
|
||||
spv4 = SIMD_set((acc_t)0.0);
|
||||
spv5 = SIMD_set((acc_t)0.0);
|
||||
}
|
||||
if (EFLAG) {
|
||||
sevdwl = SIMD_set((acc_t)0.0);
|
||||
secoul = SIMD_set((acc_t)0.0);
|
||||
sedihedral = SIMD_set((acc_t)0.0);
|
||||
}
|
||||
if (VFLAG && vflag) {
|
||||
sv0 = SIMD_set((acc_t)0.0);
|
||||
sv1 = SIMD_set((acc_t)0.0);
|
||||
sv2 = SIMD_set((acc_t)0.0);
|
||||
sv3 = SIMD_set((acc_t)0.0);
|
||||
sv4 = SIMD_set((acc_t)0.0);
|
||||
sv5 = SIMD_set((acc_t)0.0);
|
||||
spv0 = SIMD_set((acc_t)0.0);
|
||||
spv1 = SIMD_set((acc_t)0.0);
|
||||
spv2 = SIMD_set((acc_t)0.0);
|
||||
spv3 = SIMD_set((acc_t)0.0);
|
||||
spv4 = SIMD_set((acc_t)0.0);
|
||||
spv5 = SIMD_set((acc_t)0.0);
|
||||
}
|
||||
|
||||
SIMD_int n_offset = SIMD_set(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
|
||||
@ -588,7 +580,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
const SIMD_int simd_nlocals4 = SIMD_set(nlocals4);
|
||||
const int ntypes = atom->ntypes + 1;
|
||||
|
||||
for (int n = nfrom; n < nto; n += swidth) {
|
||||
for (int n = nfrom; n < nto; n += npl) {
|
||||
SIMD_mask nmask = n_offset < nto5;
|
||||
SIMD_int i1 = SIMD_gather(nmask, dihedrallist, n_offset);
|
||||
const SIMD_flt_t q1 = SIMD_gather(nmask, q, i1);
|
||||
@ -601,7 +593,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
SIMD_int type = SIMD_gather(nmask, dihedrallist+4, n_offset);
|
||||
const SIMD_flt_t tweight = SIMD_gather(nmask, weight, type);
|
||||
type = type << 2;
|
||||
n_offset = n_offset + swidth * 5;
|
||||
n_offset = n_offset + npl * 5;
|
||||
|
||||
// 1st bond
|
||||
|
||||
@ -747,7 +739,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
SIMD_flt_t f3z = -sz2 - f4z;
|
||||
|
||||
SIMD_flt_t qdeng;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG || VFLAG) {
|
||||
SIMD_flt_t ev_pre;
|
||||
if (NEWTON_BOND) ev_pre = one;
|
||||
else {
|
||||
@ -774,7 +766,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
SIMD_jeng_update(newton_mask, featom, i3, ieng);
|
||||
}
|
||||
}
|
||||
if (vflag) {
|
||||
if (VFLAG && vflag) {
|
||||
sv0 = SIMD_ev_add(sv0, ev_pre*(vb1x*f1x-vb2xm*f3x+(vb3x-vb2xm)*f4x));
|
||||
sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y));
|
||||
sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z));
|
||||
@ -816,7 +808,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
f4y = f4y - dely * fpair;
|
||||
f4z = f4z - delz * fpair;
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG || VFLAG) {
|
||||
SIMD_flt_t ev_pre;
|
||||
if (NEWTON_BOND) ev_pre = one;
|
||||
else {
|
||||
@ -848,7 +840,7 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
SIMD_jeng_update(newton_mask, featom, i4, ieng);
|
||||
}
|
||||
}
|
||||
if (vflag) {
|
||||
if (VFLAG && vflag) {
|
||||
spv0 = SIMD_ev_add(spv0, ev_pre * delx * delx * fpair);
|
||||
spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair);
|
||||
spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair);
|
||||
@ -865,45 +857,41 @@ void DihedralCharmmIntel::eval(const int vflag,
|
||||
SIMD_safe_jforce(newton_mask, pforce, i4, f4x, f4y, f4z);
|
||||
} // for n
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
oedihedral += SIMD_sum(sedihedral);
|
||||
oecoul += SIMD_sum(secoul);
|
||||
oevdwl += SIMD_sum(sevdwl);
|
||||
}
|
||||
if (vflag) {
|
||||
ov0 += SIMD_sum(sv0);
|
||||
ov1 += SIMD_sum(sv1);
|
||||
ov2 += SIMD_sum(sv2);
|
||||
ov3 += SIMD_sum(sv3);
|
||||
ov4 += SIMD_sum(sv4);
|
||||
ov5 += SIMD_sum(sv5);
|
||||
opv0 += SIMD_sum(spv0);
|
||||
opv1 += SIMD_sum(spv1);
|
||||
opv2 += SIMD_sum(spv2);
|
||||
opv3 += SIMD_sum(spv3);
|
||||
opv4 += SIMD_sum(spv4);
|
||||
opv5 += SIMD_sum(spv5);
|
||||
}
|
||||
if (EFLAG) {
|
||||
oedihedral += SIMD_sum(sedihedral);
|
||||
oecoul += SIMD_sum(secoul);
|
||||
oevdwl += SIMD_sum(sevdwl);
|
||||
}
|
||||
if (VFLAG && vflag) {
|
||||
ov0 += SIMD_sum(sv0);
|
||||
ov1 += SIMD_sum(sv1);
|
||||
ov2 += SIMD_sum(sv2);
|
||||
ov3 += SIMD_sum(sv3);
|
||||
ov4 += SIMD_sum(sv4);
|
||||
ov5 += SIMD_sum(sv5);
|
||||
opv0 += SIMD_sum(spv0);
|
||||
opv1 += SIMD_sum(spv1);
|
||||
opv2 += SIMD_sum(spv2);
|
||||
opv3 += SIMD_sum(spv3);
|
||||
opv4 += SIMD_sum(spv4);
|
||||
opv5 += SIMD_sum(spv5);
|
||||
}
|
||||
} // omp parallel
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
energy += oedihedral;
|
||||
force->pair->eng_vdwl += oevdwl;
|
||||
force->pair->eng_coul += oecoul;
|
||||
}
|
||||
if (vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
force->pair->virial[0] += opv0;
|
||||
force->pair->virial[1] += opv1;
|
||||
force->pair->virial[2] += opv2;
|
||||
force->pair->virial[3] += opv3;
|
||||
force->pair->virial[4] += opv4;
|
||||
force->pair->virial[5] += opv5;
|
||||
}
|
||||
if (EFLAG) {
|
||||
energy += oedihedral;
|
||||
force->pair->eng_vdwl += oevdwl;
|
||||
force->pair->eng_coul += oecoul;
|
||||
}
|
||||
if (VFLAG && vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
force->pair->virial[0] += opv0;
|
||||
force->pair->virial[1] += opv1;
|
||||
force->pair->virial[2] += opv2;
|
||||
force->pair->virial[3] += opv3;
|
||||
force->pair->virial[4] += opv4;
|
||||
force->pair->virial[5] += opv5;
|
||||
}
|
||||
|
||||
fix->set_reduce_flag();
|
||||
@ -953,12 +941,14 @@ void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
fc.set_ntypes(tp1,bp1,memory);
|
||||
buffers->set_ntypes(tp1);
|
||||
|
||||
for (int i = 0; i < tp1; i++) {
|
||||
for (int j = 0; j < tp1; j++) {
|
||||
fc.ljp[i][j].lj1 = lj14_1[i][j];
|
||||
fc.ljp[i][j].lj2 = lj14_2[i][j];
|
||||
fc.ljp[i][j].lj3 = lj14_3[i][j];
|
||||
fc.ljp[i][j].lj4 = lj14_4[i][j];
|
||||
if (weightflag) {
|
||||
for (int i = 0; i < tp1; i++) {
|
||||
for (int j = 0; j < tp1; j++) {
|
||||
fc.ljp[i][j].lj1 = lj14_1[i][j];
|
||||
fc.ljp[i][j].lj2 = lj14_2[i][j];
|
||||
fc.ljp[i][j].lj3 = lj14_3[i][j];
|
||||
fc.ljp[i][j].lj4 = lj14_4[i][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -77,16 +77,16 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag,
|
||||
} else evflag = 0;
|
||||
|
||||
if (evflag) {
|
||||
if (eflag) {
|
||||
if (vflag && !eflag) {
|
||||
if (force->newton_bond)
|
||||
eval<0,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
@ -96,7 +96,7 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag,
|
||||
}
|
||||
}
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void DihedralHarmonicIntel::eval(const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
@ -120,12 +120,9 @@ void DihedralHarmonicIntel::eval(const int vflag,
|
||||
const int nthreads = tc;
|
||||
|
||||
acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
oedihedral = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
if (EFLAG) oedihedral = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
@ -134,8 +131,12 @@ void DihedralHarmonicIntel::eval(const int vflag,
|
||||
reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int nfrom, nto, tid;
|
||||
int nfrom, npl, nto, tid;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||
#else
|
||||
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
|
||||
#endif
|
||||
|
||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||
if (fix->need_zero(tid))
|
||||
@ -144,16 +145,17 @@ void DihedralHarmonicIntel::eval(const int vflag,
|
||||
const int5_t * _noalias const dihedrallist =
|
||||
(int5_t *) neighbor->dihedrallist[0];
|
||||
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
sedihedral = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
}
|
||||
if (EFLAG) sedihedral = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
for (int n = nfrom; n < nto; n++) {
|
||||
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
for (int n = nfrom; n < nto; n ++) {
|
||||
#else
|
||||
for (int n = nfrom; n < nto; n += npl) {
|
||||
#endif
|
||||
const int i1 = dihedrallist[n].a;
|
||||
const int i2 = dihedrallist[n].b;
|
||||
const int i3 = dihedrallist[n].c;
|
||||
@ -203,6 +205,7 @@ void DihedralHarmonicIntel::eval(const int vflag,
|
||||
const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
|
||||
|
||||
// error check
|
||||
#ifndef LMP_INTEL_USE_SIMDOFF
|
||||
if (c > PTOLERANCE || c < MTOLERANCE) {
|
||||
int me = comm->me;
|
||||
|
||||
@ -224,6 +227,7 @@ void DihedralHarmonicIntel::eval(const int vflag,
|
||||
me,x[i4].x,x[i4].y,x[i4].z);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (c > (flt_t)1.0) c = (flt_t)1.0;
|
||||
if (c < (flt_t)-1.0) c = (flt_t)-1.0;
|
||||
@ -292,16 +296,27 @@ void DihedralHarmonicIntel::eval(const int vflag,
|
||||
const flt_t f3y = -sy2 - f4y;
|
||||
const flt_t f3z = -sz2 - f4z;
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG || VFLAG) {
|
||||
flt_t deng;
|
||||
if (EFLAG) deng = tk * p;
|
||||
IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, deng, i1, i2, i3, i4, f1x,
|
||||
f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x,
|
||||
vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y,
|
||||
vb3z, sedihedral, f, NEWTON_BOND, nlocal,
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
|
||||
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
||||
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
||||
vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
|
||||
sv0, sv1, sv2, sv3, sv4, sv5);
|
||||
#else
|
||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
|
||||
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
||||
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
||||
vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
#pragma simdoff
|
||||
#endif
|
||||
{
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += f1x;
|
||||
@ -328,20 +343,19 @@ void DihedralHarmonicIntel::eval(const int vflag,
|
||||
}
|
||||
}
|
||||
} // for n
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) oedihedral += sedihedral;
|
||||
if (vflag) {
|
||||
ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||
}
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
if (EFLAG) oedihedral += sedihedral;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||
}
|
||||
#endif
|
||||
} // omp parallel
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) energy += oedihedral;
|
||||
if (vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
if (EFLAG) energy += oedihedral;
|
||||
if (VFLAG && vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
|
||||
fix->set_reduce_flag();
|
||||
|
||||
@ -81,16 +81,16 @@ void DihedralOPLSIntel::compute(int eflag, int vflag,
|
||||
} else evflag = 0;
|
||||
|
||||
if (evflag) {
|
||||
if (eflag) {
|
||||
if (vflag && !eflag) {
|
||||
if (force->newton_bond)
|
||||
eval<0,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
@ -100,7 +100,7 @@ void DihedralOPLSIntel::compute(int eflag, int vflag,
|
||||
}
|
||||
}
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void DihedralOPLSIntel::eval(const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
@ -124,12 +124,9 @@ void DihedralOPLSIntel::eval(const int vflag,
|
||||
const int nthreads = tc;
|
||||
|
||||
acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
oedihedral = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
if (EFLAG) oedihedral = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
@ -138,8 +135,12 @@ void DihedralOPLSIntel::eval(const int vflag,
|
||||
reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int nfrom, nto, tid;
|
||||
int nfrom, npl, nto, tid;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||
#else
|
||||
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
|
||||
#endif
|
||||
|
||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||
if (fix->need_zero(tid))
|
||||
@ -148,16 +149,17 @@ void DihedralOPLSIntel::eval(const int vflag,
|
||||
const int5_t * _noalias const dihedrallist =
|
||||
(int5_t *) neighbor->dihedrallist[0];
|
||||
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
sedihedral = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
}
|
||||
if (EFLAG) sedihedral = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
for (int n = nfrom; n < nto; n++) {
|
||||
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
for (int n = nfrom; n < nto; n ++) {
|
||||
#else
|
||||
for (int n = nfrom; n < nto; n += npl) {
|
||||
#endif
|
||||
const int i1 = dihedrallist[n].a;
|
||||
const int i2 = dihedrallist[n].b;
|
||||
const int i3 = dihedrallist[n].c;
|
||||
@ -236,6 +238,7 @@ void DihedralOPLSIntel::eval(const int vflag,
|
||||
const flt_t dx = (cx*vb3x + cy*vb3y + cz*vb3z)*cmag*rb3;
|
||||
|
||||
// error check
|
||||
#ifndef LMP_INTEL_USE_SIMDOFF
|
||||
if (c > PTOLERANCE || c < MTOLERANCE) {
|
||||
int me = comm->me;
|
||||
|
||||
@ -257,6 +260,7 @@ void DihedralOPLSIntel::eval(const int vflag,
|
||||
me,x[i4].x,x[i4].y,x[i4].z);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (c > (flt_t)1.0) c = (flt_t)1.0;
|
||||
if (c < (flt_t)-1.0) c = (flt_t)-1.0;
|
||||
@ -321,14 +325,25 @@ void DihedralOPLSIntel::eval(const int vflag,
|
||||
const flt_t f3y = sy2 - f4y;
|
||||
const flt_t f3z = sz2 - f4z;
|
||||
|
||||
if (EVFLAG) {
|
||||
IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, edihed, i1, i2, i3, i4, f1x,
|
||||
f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x,
|
||||
vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y,
|
||||
vb3z, sedihedral, f, NEWTON_BOND, nlocal,
|
||||
if (EFLAG || VFLAG) {
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3,
|
||||
i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
||||
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
||||
vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
|
||||
sv0, sv1, sv2, sv3, sv4, sv5);
|
||||
#else
|
||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3,
|
||||
i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
||||
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
||||
vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
#pragma simdoff
|
||||
#endif
|
||||
{
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += f1x;
|
||||
@ -355,20 +370,19 @@ void DihedralOPLSIntel::eval(const int vflag,
|
||||
}
|
||||
}
|
||||
} // for n
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) oedihedral += sedihedral;
|
||||
if (vflag) {
|
||||
ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||
}
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
if (EFLAG) oedihedral += sedihedral;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||
}
|
||||
#endif
|
||||
} // omp parallel
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) energy += oedihedral;
|
||||
if (vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
if (EFLAG) energy += oedihedral;
|
||||
if (VFLAG && vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
|
||||
fix->set_reduce_flag();
|
||||
|
||||
@ -61,6 +61,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
|
||||
int ncops = force->inumeric(FLERR,arg[3]);
|
||||
|
||||
_nbor_pack_width = 1;
|
||||
_three_body_neighbor = 0;
|
||||
|
||||
_precision_mode = PREC_MODE_MIXED;
|
||||
_offload_balance = -1.0;
|
||||
@ -326,12 +327,18 @@ void FixIntel::init()
|
||||
"Currently, cannot use more than one intel style with hybrid.");
|
||||
|
||||
check_neighbor_intel();
|
||||
if (_precision_mode == PREC_MODE_SINGLE)
|
||||
int off_mode = 0;
|
||||
if (_offload_balance != 0.0) off_mode = 1;
|
||||
if (_precision_mode == PREC_MODE_SINGLE) {
|
||||
_single_buffers->zero_ev();
|
||||
else if (_precision_mode == PREC_MODE_MIXED)
|
||||
_single_buffers->grow_ncache(off_mode,_nthreads);
|
||||
} else if (_precision_mode == PREC_MODE_MIXED) {
|
||||
_mixed_buffers->zero_ev();
|
||||
else
|
||||
_mixed_buffers->grow_ncache(off_mode,_nthreads);
|
||||
} else {
|
||||
_double_buffers->zero_ev();
|
||||
_double_buffers->grow_ncache(off_mode,_nthreads);
|
||||
}
|
||||
|
||||
_need_reduce = 0;
|
||||
}
|
||||
@ -367,8 +374,6 @@ void FixIntel::pair_init_check(const bool cdmessage)
|
||||
{
|
||||
#ifdef INTEL_VMASK
|
||||
atom->sortfreq = 1;
|
||||
if (neighbor->binsizeflag && atom->userbinsize <= 0.0)
|
||||
atom->userbinsize = neighbor->binsize_user;
|
||||
#endif
|
||||
|
||||
_nbor_pack_width = 1;
|
||||
@ -376,9 +381,8 @@ void FixIntel::pair_init_check(const bool cdmessage)
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_offload_balance != 0.0) atom->sortfreq = 1;
|
||||
|
||||
if (force->newton_pair == 0)
|
||||
_offload_noghost = 0;
|
||||
else if (_offload_ghost == 0)
|
||||
_offload_noghost = 0;
|
||||
if (force->newton_pair && _offload_ghost == 0)
|
||||
_offload_noghost = 1;
|
||||
|
||||
set_offload_affinity();
|
||||
@ -535,24 +539,24 @@ void FixIntel::pre_reverse(int eflag, int vflag)
|
||||
{
|
||||
if (_force_array_m != 0) {
|
||||
if (_need_reduce) {
|
||||
reduce_results(_force_array_m);
|
||||
reduce_results(&_force_array_m[0].x);
|
||||
_need_reduce = 0;
|
||||
}
|
||||
add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom, 0);
|
||||
add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom,0);
|
||||
_force_array_m = 0;
|
||||
} else if (_force_array_d != 0) {
|
||||
if (_need_reduce) {
|
||||
reduce_results(_force_array_d);
|
||||
reduce_results(&_force_array_d[0].x);
|
||||
_need_reduce = 0;
|
||||
}
|
||||
add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom, 0);
|
||||
add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom,0);
|
||||
_force_array_d = 0;
|
||||
} else if (_force_array_s != 0) {
|
||||
if (_need_reduce) {
|
||||
reduce_results(_force_array_s);
|
||||
reduce_results(&_force_array_s[0].x);
|
||||
_need_reduce = 0;
|
||||
}
|
||||
add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom, 0);
|
||||
add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom,0);
|
||||
_force_array_s = 0;
|
||||
}
|
||||
|
||||
@ -563,47 +567,56 @@ void FixIntel::pre_reverse(int eflag, int vflag)
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class ft>
|
||||
void FixIntel::reduce_results(ft * _noalias const f_start)
|
||||
template <class acc_t>
|
||||
void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
|
||||
{
|
||||
int o_range, f_stride;
|
||||
if (force->newton_pair)
|
||||
o_range = atom->nlocal + atom->nghost;
|
||||
else
|
||||
o_range = atom->nlocal;
|
||||
IP_PRE_get_stride(f_stride, o_range, sizeof(ft), lmp->atom->torque);
|
||||
IP_PRE_get_stride(f_stride, o_range, (sizeof(acc_t)*4), lmp->atom->torque);
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) shared(o_range, f_stride)
|
||||
#endif
|
||||
{
|
||||
int iifrom, iito, tid;
|
||||
IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads,
|
||||
sizeof(ft));
|
||||
o_range *= 4;
|
||||
const int f_stride4 = f_stride * 4;
|
||||
|
||||
int t_off = f_stride;
|
||||
if (_results_eatom) {
|
||||
for (int t = 1; t < _nthreads; t++) {
|
||||
_use_simd_pragma("vector nontemporal")
|
||||
_use_simd_pragma("novector")
|
||||
for (int n = iifrom; n < iito; n++) {
|
||||
f_start[n].x += f_start[n + t_off].x;
|
||||
f_start[n].y += f_start[n + t_off].y;
|
||||
f_start[n].z += f_start[n + t_off].z;
|
||||
f_start[n].w += f_start[n + t_off].w;
|
||||
}
|
||||
t_off += f_stride;
|
||||
}
|
||||
if (_nthreads <= INTEL_HTHREADS) {
|
||||
acc_t *f_scalar2 = f_scalar + f_stride4;
|
||||
if (_nthreads == 4) {
|
||||
acc_t *f_scalar3 = f_scalar2 + f_stride4;
|
||||
acc_t *f_scalar4 = f_scalar3 + f_stride4;
|
||||
_use_simd_pragma("vector aligned")
|
||||
_use_simd_pragma("simd")
|
||||
for (int n = 0; n < o_range; n++)
|
||||
f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];
|
||||
} else if (_nthreads == 2) {
|
||||
_use_simd_pragma("vector aligned")
|
||||
_use_simd_pragma("simd")
|
||||
for (int n = 0; n < o_range; n++)
|
||||
f_scalar[n] += f_scalar2[n];
|
||||
} else {
|
||||
acc_t *f_scalar3 = f_scalar2 + f_stride4;
|
||||
_use_simd_pragma("vector aligned")
|
||||
_use_simd_pragma("simd")
|
||||
for (int n = 0; n < o_range; n++)
|
||||
f_scalar[n] += f_scalar2[n] + f_scalar3[n];
|
||||
}
|
||||
} else {
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel
|
||||
#endif
|
||||
{
|
||||
int iifrom, iito, tid;
|
||||
IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads,
|
||||
sizeof(acc_t));
|
||||
|
||||
acc_t *f_scalar2 = f_scalar + f_stride4;
|
||||
for (int t = 1; t < _nthreads; t++) {
|
||||
_use_simd_pragma("vector nontemporal")
|
||||
_use_simd_pragma("novector")
|
||||
for (int n = iifrom; n < iito; n++) {
|
||||
f_start[n].x += f_start[n + t_off].x;
|
||||
f_start[n].y += f_start[n + t_off].y;
|
||||
f_start[n].z += f_start[n + t_off].z;
|
||||
}
|
||||
t_off += f_stride;
|
||||
_use_simd_pragma("vector aligned")
|
||||
_use_simd_pragma("simd")
|
||||
for (int n = iifrom; n < iito; n++)
|
||||
f_scalar[n] += f_scalar2[n];
|
||||
f_scalar2 += f_stride4;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -641,40 +654,59 @@ void FixIntel::add_results(const ft * _noalias const f_in,
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_separate_buffers) {
|
||||
if (offload) {
|
||||
add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
|
||||
if (force->newton_pair) {
|
||||
add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
|
||||
const acc_t * _noalias const enull = 0;
|
||||
int offset = _offload_nlocal;
|
||||
if (atom->torque) offset *= 2;
|
||||
add_oresults(f_in + offset, enull, eatom, vatom,
|
||||
_offload_min_ghost, _offload_nghost);
|
||||
}
|
||||
} else
|
||||
add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair());
|
||||
} else {
|
||||
add_oresults(f_in, ev_global, eatom, vatom,
|
||||
_host_min_local, _host_used_local);
|
||||
if (force->newton_pair) {
|
||||
add_oresults(f_in, ev_global, eatom, vatom,
|
||||
_host_min_local, _host_used_local);
|
||||
const acc_t * _noalias const enull = 0;
|
||||
int offset = _host_used_local;
|
||||
if (atom->torque) offset *= 2;
|
||||
add_oresults(f_in + offset, enull, eatom,
|
||||
vatom, _host_min_ghost, _host_used_ghost);
|
||||
} else {
|
||||
int start = host_start_pair();
|
||||
add_oresults(f_in, ev_global, eatom, vatom, start, atom->nlocal-start);
|
||||
}
|
||||
}
|
||||
stop_watch(TIME_PACK);
|
||||
return;
|
||||
}
|
||||
if (force->newton_pair && (_offload_noghost == 0 || offload == 0))
|
||||
f_length = atom->nlocal + atom->nghost;
|
||||
else
|
||||
f_length = atom->nlocal;
|
||||
int start;
|
||||
if (offload) {
|
||||
start = 0;
|
||||
if (force->newton_pair) {
|
||||
if (_offload_noghost == 0)
|
||||
f_length = atom->nlocal + atom->nghost;
|
||||
else
|
||||
f_length = atom->nlocal;
|
||||
} else
|
||||
f_length = offload_end_pair();
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
start = 0;
|
||||
f_length = atom->nlocal + atom->nghost;
|
||||
} else {
|
||||
start = host_start_pair();
|
||||
f_length = atom->nlocal - start;
|
||||
}
|
||||
}
|
||||
add_oresults(f_in, ev_global, eatom, vatom, start, f_length);
|
||||
#else
|
||||
if (force->newton_pair)
|
||||
f_length = atom->nlocal + atom->nghost;
|
||||
else
|
||||
f_length = atom->nlocal;
|
||||
#endif
|
||||
|
||||
add_oresults(f_in, ev_global, eatom, vatom, 0, f_length);
|
||||
#endif
|
||||
stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
@ -695,8 +727,11 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
|
||||
"Sphere particles not yet supported for gayberne/intel");
|
||||
}
|
||||
|
||||
int packthreads;
|
||||
if (_nthreads > INTEL_HTHREADS) packthreads = _nthreads;
|
||||
else packthreads = 1;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none)
|
||||
#pragma omp parallel if(packthreads > 1)
|
||||
#endif
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
@ -705,7 +740,7 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
|
||||
const int tid = 0;
|
||||
#endif
|
||||
int ifrom, ito;
|
||||
IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t));
|
||||
IP_PRE_omp_range_align(ifrom, ito, tid, nall, packthreads, sizeof(acc_t));
|
||||
if (atom->torque) {
|
||||
int ii = ifrom * 2;
|
||||
lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
|
||||
@ -833,6 +868,11 @@ void FixIntel::add_off_results(const ft * _noalias const f_in,
|
||||
_offload_nlocal;
|
||||
}
|
||||
|
||||
if (atom->torque)
|
||||
if (f_in[1].w < 0.0)
|
||||
error->all(FLERR, "Bad matrix inversion in mldivide3");
|
||||
add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1);
|
||||
|
||||
// Load balance?
|
||||
if (_offload_balance < 0.0) {
|
||||
if (neighbor->ago == 0)
|
||||
@ -860,10 +900,6 @@ void FixIntel::add_off_results(const ft * _noalias const f_in,
|
||||
stop_watch(TIME_IMBALANCE);
|
||||
#endif
|
||||
acc_timers();
|
||||
if (atom->torque)
|
||||
if (f_in[1].w < 0.0)
|
||||
error->all(FLERR, "Bad matrix inversion in mldivide3");
|
||||
add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
@ -70,23 +70,32 @@ class FixIntel : public Fix {
|
||||
|
||||
inline int nbor_pack_width() const { return _nbor_pack_width; }
|
||||
inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
|
||||
inline int three_body_neighbor() { return _three_body_neighbor; }
|
||||
inline void three_body_neighbor(const int i) { _three_body_neighbor = 1; }
|
||||
|
||||
inline int need_zero(const int tid) {
|
||||
if (_need_reduce == 0 && tid > 0) return 1;
|
||||
return 0;
|
||||
}
|
||||
inline void set_reduce_flag() { _need_reduce = 1; }
|
||||
inline void set_reduce_flag() { if (_nthreads > 1) _need_reduce = 1; }
|
||||
inline int lrt() {
|
||||
if (force->kspace_match("pppm/intel", 0)) return _lrt;
|
||||
else return 0;
|
||||
}
|
||||
inline int pppm_table() {
|
||||
if (force->kspace_match("pppm/intel", 0) ||
|
||||
force->kspace_match("pppm/disp/intel",0))
|
||||
return INTEL_P3M_TABLE;
|
||||
else return 0;
|
||||
}
|
||||
|
||||
|
||||
protected:
|
||||
IntelBuffers<float,float> *_single_buffers;
|
||||
IntelBuffers<float,double> *_mixed_buffers;
|
||||
IntelBuffers<double,double> *_double_buffers;
|
||||
|
||||
int _precision_mode, _nthreads, _nbor_pack_width;
|
||||
int _precision_mode, _nthreads, _nbor_pack_width, _three_body_neighbor;
|
||||
|
||||
public:
|
||||
inline int* get_overflow_flag() { return _overflow_flag; }
|
||||
@ -241,7 +250,10 @@ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
|
||||
} else {
|
||||
nlocal = atom->nlocal;
|
||||
nall = _host_nall;
|
||||
minlocal = _host_min_local;
|
||||
if (force->newton)
|
||||
minlocal = _host_min_local;
|
||||
else
|
||||
minlocal = host_start_pair();
|
||||
}
|
||||
return;
|
||||
}
|
||||
@ -275,7 +287,7 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
|
||||
_results_eatom = eatom;
|
||||
_results_vatom = vatom;
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
|
||||
if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1;
|
||||
#endif
|
||||
|
||||
if (_overflow_flag[LMP_OVERFLOW])
|
||||
@ -303,7 +315,7 @@ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
|
||||
_results_eatom = eatom;
|
||||
_results_vatom = vatom;
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
|
||||
if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1;
|
||||
#endif
|
||||
|
||||
if (_overflow_flag[LMP_OVERFLOW])
|
||||
@ -331,7 +343,7 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
|
||||
_results_eatom = eatom;
|
||||
_results_vatom = vatom;
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
|
||||
if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1;
|
||||
#endif
|
||||
|
||||
if (_overflow_flag[LMP_OVERFLOW])
|
||||
|
||||
@ -87,16 +87,16 @@ void ImproperCvffIntel::compute(int eflag, int vflag,
|
||||
else evflag = 0;
|
||||
|
||||
if (evflag) {
|
||||
if (eflag) {
|
||||
if (vflag && !eflag) {
|
||||
if (force->newton_bond)
|
||||
eval<0,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
@ -108,7 +108,7 @@ void ImproperCvffIntel::compute(int eflag, int vflag,
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void ImproperCvffIntel::eval(const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
@ -131,12 +131,9 @@ void ImproperCvffIntel::eval(const int vflag,
|
||||
const int nthreads = tc;
|
||||
|
||||
acc_t oeimproper, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
oeimproper = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
if (EFLAG) oeimproper = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
@ -145,8 +142,12 @@ void ImproperCvffIntel::eval(const int vflag,
|
||||
reduction(+:oeimproper,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int nfrom, nto, tid;
|
||||
int nfrom, npl, nto, tid;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
|
||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||
#else
|
||||
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
|
||||
#endif
|
||||
|
||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||
if (fix->need_zero(tid))
|
||||
@ -155,7 +156,17 @@ void ImproperCvffIntel::eval(const int vflag,
|
||||
const int5_t * _noalias const improperlist =
|
||||
(int5_t *) neighbor->improperlist[0];
|
||||
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
|
||||
acc_t seimproper, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
if (EFLAG) seimproper = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
}
|
||||
#pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
for (int n = nfrom; n < nto; n++) {
|
||||
#else
|
||||
for (int n = nfrom; n < nto; n += npl) {
|
||||
#endif
|
||||
const int i1 = improperlist[n].a;
|
||||
const int i2 = improperlist[n].b;
|
||||
const int i3 = improperlist[n].c;
|
||||
@ -216,7 +227,7 @@ void ImproperCvffIntel::eval(const int vflag,
|
||||
flt_t c = (c0 + c1mag*c2mag) * s12;
|
||||
|
||||
// error check
|
||||
|
||||
#ifndef LMP_INTEL_USE_SIMDOFF_FIX
|
||||
if (c > PTOLERANCE || c < MTOLERANCE) {
|
||||
int me;
|
||||
MPI_Comm_rank(world,&me);
|
||||
@ -238,6 +249,7 @@ void ImproperCvffIntel::eval(const int vflag,
|
||||
me,x[i4].x,x[i4].y,x[i4].z);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (c > (flt_t)1.0) c = (flt_t)1.0;
|
||||
if (c < (flt_t)-1.0) c = (flt_t)-1.0;
|
||||
@ -250,31 +262,36 @@ void ImproperCvffIntel::eval(const int vflag,
|
||||
const int m = fc.fc[type].multiplicity;
|
||||
|
||||
flt_t p, pd;
|
||||
if (m == 2) {
|
||||
p = (flt_t)2.0*c*c;
|
||||
pd = (flt_t)2.0*c;
|
||||
} else if (m == 3) {
|
||||
const flt_t rc2 = c*c;
|
||||
p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0;
|
||||
pd = (flt_t)6.0*rc2 - (flt_t)1.5;
|
||||
} else if (m == 4) {
|
||||
const flt_t rc2 = c*c;
|
||||
p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0;
|
||||
pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c;
|
||||
} else if (m == 6) {
|
||||
const flt_t rc2 = c*c;
|
||||
p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2;
|
||||
pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c;
|
||||
} else if (m == 1) {
|
||||
p = c + (flt_t)1.0;
|
||||
pd = (flt_t)0.5;
|
||||
} else if (m == 5) {
|
||||
const flt_t rc2 = c*c;
|
||||
p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0;
|
||||
pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5;
|
||||
} else if (m == 0) {
|
||||
p = (flt_t)2.0;
|
||||
pd = (flt_t)0.0;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
|
||||
#pragma simdoff
|
||||
#endif
|
||||
{
|
||||
if (m == 2) {
|
||||
p = (flt_t)2.0*c*c;
|
||||
pd = (flt_t)2.0*c;
|
||||
} else if (m == 3) {
|
||||
const flt_t rc2 = c*c;
|
||||
p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0;
|
||||
pd = (flt_t)6.0*rc2 - (flt_t)1.5;
|
||||
} else if (m == 4) {
|
||||
const flt_t rc2 = c*c;
|
||||
p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0;
|
||||
pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c;
|
||||
} else if (m == 6) {
|
||||
const flt_t rc2 = c*c;
|
||||
p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2;
|
||||
pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c;
|
||||
} else if (m == 1) {
|
||||
p = c + (flt_t)1.0;
|
||||
pd = (flt_t)0.5;
|
||||
} else if (m == 5) {
|
||||
const flt_t rc2 = c*c;
|
||||
p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0;
|
||||
pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5;
|
||||
} else if (m == 0) {
|
||||
p = (flt_t)2.0;
|
||||
pd = (flt_t)0.0;
|
||||
}
|
||||
}
|
||||
|
||||
if (fc.fc[type].sign == -1) {
|
||||
@ -317,46 +334,63 @@ void ImproperCvffIntel::eval(const int vflag,
|
||||
|
||||
// apply force to each of 4 atoms
|
||||
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += f1x;
|
||||
f[i1].y += f1y;
|
||||
f[i1].z += f1z;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
|
||||
#pragma simdoff
|
||||
#endif
|
||||
{
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += f1x;
|
||||
f[i1].y += f1y;
|
||||
f[i1].z += f1z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x += f2x;
|
||||
f[i2].y += f2y;
|
||||
f[i2].z += f2z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i3 < nlocal) {
|
||||
f[i3].x += f3x;
|
||||
f[i3].y += f3y;
|
||||
f[i3].z += f3z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i4 < nlocal) {
|
||||
f[i4].x += f4x;
|
||||
f[i4].y += f4y;
|
||||
f[i4].z += f4z;
|
||||
}
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x += f2x;
|
||||
f[i2].y += f2y;
|
||||
f[i2].z += f2z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i3 < nlocal) {
|
||||
f[i3].x += f3x;
|
||||
f[i3].y += f3y;
|
||||
f[i3].z += f3z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i4 < nlocal) {
|
||||
f[i4].x += f4x;
|
||||
f[i4].y += f4y;
|
||||
f[i4].z += f4z;
|
||||
}
|
||||
|
||||
if (EVFLAG) {
|
||||
IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, eimproper, i1, i2, i3, i4,
|
||||
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
||||
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
||||
vb3y, vb3z, oeimproper, f, NEWTON_BOND, nlocal,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
if (EFLAG || VFLAG) {
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
|
||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
|
||||
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
|
||||
f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
|
||||
vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND,
|
||||
nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
|
||||
#else
|
||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
|
||||
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
|
||||
f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
|
||||
vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND,
|
||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
#endif
|
||||
}
|
||||
} // for n
|
||||
} // omp parallel
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
energy += oeimproper;
|
||||
if (vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF_FIX
|
||||
if (EFLAG) oeimproper += seimproper;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||
}
|
||||
#endif
|
||||
} // omp parallel
|
||||
if (EFLAG) energy += oeimproper;
|
||||
if (VFLAG && vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
|
||||
fix->set_reduce_flag();
|
||||
|
||||
@ -88,16 +88,16 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag,
|
||||
else evflag = 0;
|
||||
|
||||
if (evflag) {
|
||||
if (eflag) {
|
||||
if (vflag && !eflag) {
|
||||
if (force->newton_bond)
|
||||
eval<0,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
@ -109,7 +109,7 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag,
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void ImproperHarmonicIntel::eval(const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
@ -132,12 +132,9 @@ void ImproperHarmonicIntel::eval(const int vflag,
|
||||
const int nthreads = tc;
|
||||
|
||||
acc_t oeimproper, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
oeimproper = (acc_t)0.0;
|
||||
if (vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
if (EFLAG) oeimproper = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
@ -146,8 +143,12 @@ void ImproperHarmonicIntel::eval(const int vflag,
|
||||
reduction(+:oeimproper,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int nfrom, nto, tid;
|
||||
int nfrom, npl, nto, tid;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||
#else
|
||||
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
|
||||
#endif
|
||||
|
||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||
if (fix->need_zero(tid))
|
||||
@ -156,7 +157,17 @@ void ImproperHarmonicIntel::eval(const int vflag,
|
||||
const int5_t * _noalias const improperlist =
|
||||
(int5_t *) neighbor->improperlist[0];
|
||||
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
acc_t seimproper, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
if (EFLAG) seimproper = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
}
|
||||
#pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
for (int n = nfrom; n < nto; n++) {
|
||||
#else
|
||||
for (int n = nfrom; n < nto; n += npl) {
|
||||
#endif
|
||||
const int i1 = improperlist[n].a;
|
||||
const int i2 = improperlist[n].b;
|
||||
const int i3 = improperlist[n].c;
|
||||
@ -207,7 +218,7 @@ void ImproperHarmonicIntel::eval(const int vflag,
|
||||
flt_t c = (c1*c2 + c0) * s12;
|
||||
|
||||
// error check
|
||||
|
||||
#ifndef LMP_INTEL_USE_SIMDOFF
|
||||
if (c > PTOLERANCE || c < MTOLERANCE) {
|
||||
int me;
|
||||
MPI_Comm_rank(world,&me);
|
||||
@ -229,6 +240,7 @@ void ImproperHarmonicIntel::eval(const int vflag,
|
||||
me,x[i4].x,x[i4].y,x[i4].z);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (c > (flt_t)1.0) c = (flt_t)1.0;
|
||||
if (c < (flt_t)-1.0) c = (flt_t)-1.0;
|
||||
@ -278,46 +290,63 @@ void ImproperHarmonicIntel::eval(const int vflag,
|
||||
|
||||
// apply force to each of 4 atoms
|
||||
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += f1x;
|
||||
f[i1].y += f1y;
|
||||
f[i1].z += f1z;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
#pragma simdoff
|
||||
#endif
|
||||
{
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += f1x;
|
||||
f[i1].y += f1y;
|
||||
f[i1].z += f1z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x += f2x;
|
||||
f[i2].y += f2y;
|
||||
f[i2].z += f2z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i3 < nlocal) {
|
||||
f[i3].x += f3x;
|
||||
f[i3].y += f3y;
|
||||
f[i3].z += f3z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i4 < nlocal) {
|
||||
f[i4].x += f4x;
|
||||
f[i4].y += f4y;
|
||||
f[i4].z += f4z;
|
||||
}
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x += f2x;
|
||||
f[i2].y += f2y;
|
||||
f[i2].z += f2z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i3 < nlocal) {
|
||||
f[i3].x += f3x;
|
||||
f[i3].y += f3y;
|
||||
f[i3].z += f3z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i4 < nlocal) {
|
||||
f[i4].x += f4x;
|
||||
f[i4].y += f4y;
|
||||
f[i4].z += f4z;
|
||||
}
|
||||
|
||||
if (EVFLAG) {
|
||||
IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, eimproper, i1, i2, i3, i4,
|
||||
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
||||
vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, vb3x, vb3y,
|
||||
vb3z, oeimproper, f, NEWTON_BOND, nlocal,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
if (EFLAG || VFLAG) {
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
|
||||
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,
|
||||
f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,
|
||||
vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND,
|
||||
nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
|
||||
#else
|
||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
|
||||
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,
|
||||
f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,
|
||||
vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND,
|
||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
#endif
|
||||
}
|
||||
} // for n
|
||||
} // omp parallel
|
||||
if (EVFLAG) {
|
||||
if (EFLAG)
|
||||
energy += oeimproper;
|
||||
if (vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
if (EFLAG) oeimproper += seimproper;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||
}
|
||||
#endif
|
||||
} // omp parallel
|
||||
if (EFLAG) energy += oeimproper;
|
||||
if (VFLAG && vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
|
||||
fix->set_reduce_flag();
|
||||
|
||||
@ -12,6 +12,7 @@
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <math.h>
|
||||
#include "intel_buffers.h"
|
||||
#include "force.h"
|
||||
#include "memory.h"
|
||||
@ -28,6 +29,7 @@ IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
|
||||
_ntypes = 0;
|
||||
_off_map_listlocal = 0;
|
||||
_ccachex = 0;
|
||||
_ncache_alloc = 0;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
_separate_buffers = 0;
|
||||
_off_f = 0;
|
||||
@ -36,6 +38,7 @@ IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
|
||||
_off_list_alloc = false;
|
||||
_off_threads = 0;
|
||||
_off_ccache = 0;
|
||||
_off_ncache = 0;
|
||||
_host_nmax = 0;
|
||||
#endif
|
||||
}
|
||||
@ -111,15 +114,20 @@ void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
|
||||
_buf_local_size = _buf_size;
|
||||
else
|
||||
_buf_local_size = static_cast<double>(nlocal) * 1.1 + 1;
|
||||
if (lmp->atom->torque)
|
||||
_buf_local_size *= 2;
|
||||
const int f_stride = get_stride(_buf_local_size);
|
||||
lmp->memory->create(_x, _buf_size,"intel_x");
|
||||
if (lmp->atom->q != NULL)
|
||||
lmp->memory->create(_q, _buf_size, "intel_q");
|
||||
if (lmp->atom->ellipsoid != NULL)
|
||||
lmp->memory->create(_quat, _buf_size, "intel_quat");
|
||||
lmp->memory->create(_f, f_stride * nthreads, "intel_f");
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (lmp->force->newton_pair)
|
||||
#else
|
||||
if (lmp->force->newton_pair || lmp->atom->molecular)
|
||||
#endif
|
||||
lmp->memory->create(_f, f_stride * nthreads, "intel_f");
|
||||
else
|
||||
lmp->memory->create(_f, f_stride, "intel_f");
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_separate_buffers) {
|
||||
@ -131,7 +139,10 @@ void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
|
||||
}
|
||||
|
||||
if (offload_end > 0) {
|
||||
lmp->memory->create(_off_f, f_stride * _off_threads, "intel_off_f");
|
||||
int fm;
|
||||
if (lmp->force->newton_pair) fm = _off_threads;
|
||||
else fm = 1;
|
||||
lmp->memory->create(_off_f, f_stride * fm, "intel_off_f");
|
||||
const atom_t * const x = get_x();
|
||||
const flt_t * const q = get_q();
|
||||
const vec3_acc_t * f_start = get_off_f();
|
||||
@ -140,14 +151,14 @@ void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
|
||||
if (x != NULL && q != NULL && f_start != NULL && ev_global != NULL) {
|
||||
#pragma offload_transfer target(mic:_cop) \
|
||||
nocopy(x,q:length(_buf_size) alloc_if(1) free_if(0)) \
|
||||
nocopy(f_start:length(f_stride*_off_threads) alloc_if(1) free_if(0))\
|
||||
nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
|
||||
nocopy(ev_global:length(8) alloc_if(1) free_if(0))
|
||||
}
|
||||
} else {
|
||||
if (x != NULL && f_start != NULL && ev_global != NULL) {
|
||||
#pragma offload_transfer target(mic:_cop) \
|
||||
nocopy(x:length(_buf_size) alloc_if(1) free_if(0)) \
|
||||
nocopy(f_start:length(f_stride*_off_threads) alloc_if(1) free_if(0))\
|
||||
nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
|
||||
nocopy(ev_global:length(8) alloc_if(1) free_if(0))
|
||||
}
|
||||
}
|
||||
@ -427,6 +438,115 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void IntelBuffers<flt_t, acc_t>::free_ncache()
|
||||
{
|
||||
if (_ncache_alloc) {
|
||||
flt_t *ncachex = _ncachex;
|
||||
flt_t *ncachey = _ncachey;
|
||||
flt_t *ncachez = _ncachez;
|
||||
int *ncachej = _ncachej;
|
||||
int *ncachejtype = _ncachejtype;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_off_ncache) {
|
||||
#pragma offload_transfer target(mic:_cop) \
|
||||
nocopy(ncachex,ncachey,ncachez,ncachej:alloc_if(0) free_if(1)) \
|
||||
nocopy(ncachejtype:alloc_if(0) free_if(1))
|
||||
}
|
||||
_off_ncache = 0;
|
||||
#endif
|
||||
|
||||
lmp->memory->destroy(ncachex);
|
||||
lmp->memory->destroy(ncachey);
|
||||
lmp->memory->destroy(ncachez);
|
||||
lmp->memory->destroy(ncachej);
|
||||
lmp->memory->destroy(ncachejtype);
|
||||
|
||||
_ncache_alloc = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
|
||||
const int nthreads)
|
||||
{
|
||||
const int nsize = get_max_nbors() * 3;
|
||||
int esize = MIN(sizeof(int), sizeof(flt_t));
|
||||
IP_PRE_get_stride(_ncache_stride, nsize, esize, 0);
|
||||
int nt = MAX(nthreads, _off_threads);
|
||||
const int vsize = _ncache_stride * nt;
|
||||
|
||||
if (_ncache_alloc) {
|
||||
if (vsize > _ncache_alloc)
|
||||
free_ncache();
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
else if (off_flag && _off_ncache == 0)
|
||||
free_ncache();
|
||||
#endif
|
||||
else
|
||||
return;
|
||||
}
|
||||
|
||||
lmp->memory->create(_ncachex, vsize, "_ncachex");
|
||||
lmp->memory->create(_ncachey, vsize, "_ncachey");
|
||||
lmp->memory->create(_ncachez, vsize, "_ncachez");
|
||||
lmp->memory->create(_ncachej, vsize, "_ncachej");
|
||||
lmp->memory->create(_ncachejtype, vsize, "_ncachejtype");
|
||||
|
||||
_ncache_alloc = vsize;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (off_flag) {
|
||||
flt_t *ncachex = _ncachex;
|
||||
flt_t *ncachey = _ncachey;
|
||||
flt_t *ncachez = _ncachez;
|
||||
int *ncachej = _ncachej;
|
||||
int *ncachejtype = _ncachejtype;
|
||||
|
||||
if (ncachex != NULL && ncachey !=NULL && ncachez != NULL &&
|
||||
ncachej != NULL && ncachejtype != NULL) {
|
||||
#pragma offload_transfer target(mic:_cop) \
|
||||
nocopy(ncachex,ncachey:length(vsize) alloc_if(1) free_if(0)) \
|
||||
nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \
|
||||
nocopy(ncachejtype:length(vsize) alloc_if(1) free_if(0))
|
||||
}
|
||||
_off_ncache = 1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
template <class flt_t, class acc_t>
|
||||
void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt,
|
||||
const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1,
|
||||
acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5)
|
||||
{
|
||||
IP_PRE_fdotr_acc_force_l5(lf, lt, 0, nthreads, _f, f_stride, _x, ov0,
|
||||
ov1, ov2, ov3, ov4, ov5);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
template <class flt_t, class acc_t>
|
||||
void IntelBuffers<flt_t, acc_t>::fdotr_reduce(const int nall,
|
||||
const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1,
|
||||
acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5)
|
||||
{
|
||||
int iifrom, iito, tid;
|
||||
IP_PRE_fdotr_acc_force(nall, 0, nthreads, _f, f_stride, _x, 0, 2,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
|
||||
{
|
||||
|
||||
@ -78,6 +78,7 @@ class IntelBuffers {
|
||||
free_nbor_list();
|
||||
free_nmax();
|
||||
free_list_local();
|
||||
free_ncache();
|
||||
}
|
||||
|
||||
inline void grow_list(NeighList *list, const int nlocal, const int nthreads,
|
||||
@ -106,6 +107,15 @@ class IntelBuffers {
|
||||
inline acc_t * get_ccachef() { return _ccachef; }
|
||||
#endif
|
||||
|
||||
void free_ncache();
|
||||
void grow_ncache(const int off_flag, const int nthreads);
|
||||
inline int ncache_stride() { return _ncache_stride; }
|
||||
inline flt_t * get_ncachex() { return _ncachex; }
|
||||
inline flt_t * get_ncachey() { return _ncachey; }
|
||||
inline flt_t * get_ncachez() { return _ncachez; }
|
||||
inline int * get_ncachej() { return _ncachej; }
|
||||
inline int * get_ncachejtype() { return _ncachejtype; }
|
||||
|
||||
inline int get_max_nbors() {
|
||||
int mn = lmp->neighbor->oneatom * sizeof(int) /
|
||||
(INTEL_ONEATOM_FACTOR * INTEL_DATA_ALIGN);
|
||||
@ -180,6 +190,15 @@ class IntelBuffers {
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
void fdotr_reduce_l5(const int lf, const int lt, const int nthreads,
|
||||
const int f_stride, acc_t &ov0, acc_t &ov1,
|
||||
acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5);
|
||||
void fdotr_reduce(const int nall, const int nthreads, const int f_stride,
|
||||
acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3,
|
||||
acc_t &ov4, acc_t &ov5);
|
||||
#endif
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
inline void thr_pack_cop(const int ifrom, const int ito,
|
||||
const int offset, const bool dotype = false) {
|
||||
@ -263,6 +282,10 @@ class IntelBuffers {
|
||||
int _ccache_stride;
|
||||
flt_t *_ccachex, *_ccachey, *_ccachez, *_ccachew;
|
||||
int *_ccachei, *_ccachej;
|
||||
|
||||
int _ncache_stride, _ncache_alloc;
|
||||
flt_t *_ncachex, *_ncachey, *_ncachez;
|
||||
int *_ncachej, *_ncachejtype;
|
||||
#ifdef LMP_USE_AVXCD
|
||||
int _ccache_stride3;
|
||||
acc_t * _ccachef;
|
||||
@ -274,7 +297,7 @@ class IntelBuffers {
|
||||
flt_t *_host_q;
|
||||
quat_t *_host_quat;
|
||||
vec3_acc_t *_off_f;
|
||||
int _off_map_nmax, _cop, _off_ccache;
|
||||
int _off_map_nmax, _cop, _off_ccache, _off_ncache;
|
||||
int *_off_map_ilist;
|
||||
int *_off_map_special, *_off_map_nspecial, *_off_map_tag;
|
||||
int *_off_map_numneigh;
|
||||
|
||||
@ -17,6 +17,9 @@
|
||||
|
||||
#ifdef __INTEL_COMPILER
|
||||
#define LMP_SIMD_COMPILER
|
||||
#if (__INTEL_COMPILER_BUILD_DATE > 20160720)
|
||||
#define LMP_INTEL_USE_SIMDOFF
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __INTEL_OFFLOAD
|
||||
@ -65,7 +68,10 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
||||
#define INTEL_MAX_STENCIL 256
|
||||
// INTEL_MAX_STENCIL * sqrt(INTEL_MAX_STENCIL)
|
||||
#define INTEL_MAX_STENCIL_CHECK 4096
|
||||
#define INTEL_P3M_MAXORDER 5
|
||||
#define INTEL_P3M_MAXORDER 7
|
||||
#define INTEL_P3M_ALIGNED_MAXORDER 8
|
||||
// PRECOMPUTE VALUES IN TABLE (DOESN'T AFFECT ACCURACY)
|
||||
#define INTEL_P3M_TABLE 1
|
||||
|
||||
#ifdef __INTEL_COMPILER
|
||||
#ifdef __AVX__
|
||||
@ -87,24 +93,36 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
||||
#ifdef __MIC__
|
||||
#define INTEL_V512 1
|
||||
#define INTEL_VMASK 1
|
||||
#define INTEL_HTHREADS 4
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __AVX512ER__
|
||||
#define INTEL_HTHREADS 4
|
||||
#endif
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
#define LMP_USE_AVXCD
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __MIC__
|
||||
#define INTEL_COMPILE_WIDTH INTEL_MIC_VECTOR_WIDTH
|
||||
#else
|
||||
#define INTEL_COMPILE_WIDTH INTEL_VECTOR_WIDTH
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#undef INTEL_VECTOR_WIDTH
|
||||
#define INTEL_VECTOR_WIDTH 1
|
||||
#define INTEL_COMPILE_WIDTH 1
|
||||
|
||||
#endif
|
||||
|
||||
#define INTEL_DATA_ALIGN 64
|
||||
#define INTEL_ONEATOM_FACTOR 2
|
||||
#define INTEL_ONEATOM_FACTOR 1
|
||||
#define INTEL_MIC_NBOR_PAD INTEL_MIC_VECTOR_WIDTH
|
||||
#define INTEL_NBOR_PAD INTEL_VECTOR_WIDTH
|
||||
#define INTEL_LB_MEAN_WEIGHT 0.1
|
||||
@ -112,6 +130,10 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
||||
#define INTEL_MAX_HOST_CORE_COUNT 512
|
||||
#define INTEL_MAX_COI_CORES 36
|
||||
|
||||
#ifndef INTEL_HTHREADS
|
||||
#define INTEL_HTHREADS 2
|
||||
#endif
|
||||
|
||||
#define IP_PRE_get_stride(stride, n, datasize, torque) \
|
||||
{ \
|
||||
int blength = n; \
|
||||
@ -125,9 +147,17 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
||||
|
||||
#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \
|
||||
{ \
|
||||
const int idelta = 1 + inum/nthreads; \
|
||||
int idelta = inum/nthreads; \
|
||||
const int imod = inum % nthreads; \
|
||||
ifrom = tid * idelta; \
|
||||
ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta; \
|
||||
ito = ifrom + idelta; \
|
||||
if (tid < imod) { \
|
||||
ito+=tid+1; \
|
||||
ifrom+=tid; \
|
||||
} else { \
|
||||
ito+=imod; \
|
||||
ifrom+=imod; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) \
|
||||
@ -136,12 +166,37 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
||||
IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads); \
|
||||
}
|
||||
|
||||
#define IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr) \
|
||||
{ \
|
||||
if (nthr <= INTEL_HTHREADS) { \
|
||||
ifrom = tid; \
|
||||
ito = inum; \
|
||||
ip = nthr; \
|
||||
} else if (nthr % INTEL_HTHREADS == 0) { \
|
||||
int nd = nthr / INTEL_HTHREADS; \
|
||||
int td = tid / INTEL_HTHREADS; \
|
||||
int tm = tid % INTEL_HTHREADS; \
|
||||
IP_PRE_omp_range(ifrom, ito, td, inum, nd); \
|
||||
ifrom += tm; \
|
||||
ip = INTEL_HTHREADS; \
|
||||
} else { \
|
||||
IP_PRE_omp_range(ifrom, ito, tid, inum, nthr); \
|
||||
ip = 1; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr) \
|
||||
{ \
|
||||
tid = omp_get_thread_num(); \
|
||||
IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr); \
|
||||
}
|
||||
|
||||
#define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
|
||||
datasize) \
|
||||
{ \
|
||||
int chunk_size = INTEL_DATA_ALIGN / datasize; \
|
||||
int idelta = static_cast<int>(static_cast<float>(inum) \
|
||||
/chunk_size/nthreads) + 1; \
|
||||
int idelta = static_cast<int>(ceil(static_cast<float>(inum) \
|
||||
/chunk_size/nthreads)); \
|
||||
idelta *= chunk_size; \
|
||||
ifrom = tid*idelta; \
|
||||
ito = ifrom + idelta; \
|
||||
@ -168,6 +223,29 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
||||
if (ito > inum) ito = inum; \
|
||||
}
|
||||
|
||||
#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \
|
||||
nthr, vecsize) \
|
||||
{ \
|
||||
tid = omp_get_thread_num(); \
|
||||
if (nthr <= INTEL_HTHREADS) { \
|
||||
ifrom = tid*vecsize; \
|
||||
ito = inum; \
|
||||
ip = nthr*vecsize; \
|
||||
} else if (nthr % INTEL_HTHREADS == 0) { \
|
||||
int nd = nthr / INTEL_HTHREADS; \
|
||||
int td = tid / INTEL_HTHREADS; \
|
||||
int tm = tid % INTEL_HTHREADS; \
|
||||
IP_PRE_omp_range_id_vec(ifrom, ito, td, inum, nd, \
|
||||
vecsize); \
|
||||
ifrom += tm * vecsize; \
|
||||
ip = INTEL_HTHREADS * vecsize; \
|
||||
} else { \
|
||||
IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum, nthr, \
|
||||
vecsize); \
|
||||
ip = vecsize; \
|
||||
} \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \
|
||||
@ -183,6 +261,21 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
||||
ito = inum; \
|
||||
}
|
||||
|
||||
#define IP_PRE_omp_range(ifrom, ip, ito, tid, inum, nthreads) \
|
||||
{ \
|
||||
ifrom = 0; \
|
||||
ito = inum; \
|
||||
ip = 1; \
|
||||
}
|
||||
|
||||
#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr) \
|
||||
{ \
|
||||
tid = 0; \
|
||||
ifrom = 0; \
|
||||
ito = inum; \
|
||||
ip = 1; \
|
||||
}
|
||||
|
||||
#define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
|
||||
datasize) \
|
||||
{ \
|
||||
@ -202,14 +295,215 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
||||
nthreads, vecsize) \
|
||||
{ \
|
||||
tid = 0; \
|
||||
int idelta = static_cast<int>(ceil(static_cast<float>(inum) \
|
||||
/vecsize)); \
|
||||
ifrom = 0; \
|
||||
ito = inum; \
|
||||
}
|
||||
|
||||
#define IP_PRE_omp_range_id_vec(ifrom, ip, ito, tid, inum, \
|
||||
nthreads, vecsize) \
|
||||
{ \
|
||||
tid = 0; \
|
||||
ifrom = 0; \
|
||||
ito = inum; \
|
||||
ip = vecsize; \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \
|
||||
f_stride, pos, ov0, ov1, ov2, \
|
||||
ov3, ov4, ov5) \
|
||||
{ \
|
||||
acc_t *f_scalar = &f_start[0].x; \
|
||||
flt_t *x_scalar = &pos[minlocal].x; \
|
||||
int f_stride4 = f_stride * 4; \
|
||||
_alignvar(acc_t ovv[INTEL_COMPILE_WIDTH],64); \
|
||||
int vwidth; \
|
||||
if (sizeof(acc_t) == sizeof(double)) \
|
||||
vwidth = INTEL_COMPILE_WIDTH/2; \
|
||||
else \
|
||||
vwidth = INTEL_COMPILE_WIDTH; \
|
||||
if (vwidth < 4) vwidth = 4; \
|
||||
_use_simd_pragma("vector aligned") \
|
||||
_use_simd_pragma("simd") \
|
||||
for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0; \
|
||||
int remainder = lt % vwidth; \
|
||||
if (lf > lt) remainder = 0; \
|
||||
const int v_range = lt - remainder; \
|
||||
if (nthreads == 2) { \
|
||||
acc_t *f_scalar2 = f_scalar + f_stride4; \
|
||||
for (int n = lf; n < v_range; n += vwidth) { \
|
||||
_use_simd_pragma("vector aligned") \
|
||||
_use_simd_pragma("simd") \
|
||||
for (int v = 0; v < vwidth; v++) { \
|
||||
f_scalar[n+v] += f_scalar2[n+v]; \
|
||||
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
|
||||
} \
|
||||
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
|
||||
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
|
||||
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
|
||||
if (vwidth > 4) { \
|
||||
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
|
||||
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
|
||||
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
|
||||
} \
|
||||
if (vwidth > 8) { \
|
||||
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
|
||||
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
|
||||
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
|
||||
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
|
||||
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
|
||||
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
|
||||
} \
|
||||
} \
|
||||
_use_simd_pragma("vector aligned") \
|
||||
_use_simd_pragma("ivdep") \
|
||||
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
|
||||
for (int n = v_range; n < lt; n++) \
|
||||
f_scalar[n] += f_scalar2[n]; \
|
||||
} else if (nthreads==4) { \
|
||||
acc_t *f_scalar2 = f_scalar + f_stride4; \
|
||||
acc_t *f_scalar3 = f_scalar2 + f_stride4; \
|
||||
acc_t *f_scalar4 = f_scalar3 + f_stride4; \
|
||||
for (int n = lf; n < v_range; n += vwidth) { \
|
||||
_use_simd_pragma("vector aligned") \
|
||||
_use_simd_pragma("simd") \
|
||||
for (int v = 0; v < vwidth; v++) { \
|
||||
f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] + \
|
||||
f_scalar4[n+v]; \
|
||||
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
|
||||
} \
|
||||
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
|
||||
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
|
||||
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
|
||||
if (vwidth > 4) { \
|
||||
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
|
||||
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
|
||||
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
|
||||
} \
|
||||
if (vwidth > 8) { \
|
||||
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
|
||||
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
|
||||
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
|
||||
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
|
||||
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
|
||||
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
|
||||
} \
|
||||
} \
|
||||
_use_simd_pragma("vector aligned") \
|
||||
_use_simd_pragma("ivdep") \
|
||||
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
|
||||
for (int n = v_range; n < lt; n++) \
|
||||
f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; \
|
||||
} else if (nthreads==1) { \
|
||||
for (int n = lf; n < v_range; n += vwidth) { \
|
||||
_use_simd_pragma("vector aligned") \
|
||||
_use_simd_pragma("simd") \
|
||||
for (int v = 0; v < vwidth; v++) \
|
||||
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
|
||||
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
|
||||
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
|
||||
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
|
||||
if (vwidth > 4) { \
|
||||
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
|
||||
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
|
||||
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
|
||||
} \
|
||||
if (vwidth > 8) { \
|
||||
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
|
||||
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
|
||||
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
|
||||
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
|
||||
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
|
||||
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
|
||||
} \
|
||||
} \
|
||||
} else if (nthreads==3) { \
|
||||
acc_t *f_scalar2 = f_scalar + f_stride4; \
|
||||
acc_t *f_scalar3 = f_scalar2 + f_stride4; \
|
||||
for (int n = lf; n < v_range; n += vwidth) { \
|
||||
_use_simd_pragma("vector aligned") \
|
||||
_use_simd_pragma("simd") \
|
||||
for (int v = 0; v < vwidth; v++) { \
|
||||
f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v]; \
|
||||
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
|
||||
} \
|
||||
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
|
||||
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
|
||||
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
|
||||
if (vwidth > 4) { \
|
||||
ov3 += f_scalar[n+5] * x_scalar[n+4]; \
|
||||
ov4 += f_scalar[n+6] * x_scalar[n+4]; \
|
||||
ov5 += f_scalar[n+6] * x_scalar[n+5]; \
|
||||
} \
|
||||
if (vwidth > 8) { \
|
||||
ov3 += f_scalar[n+9] * x_scalar[n+8]; \
|
||||
ov3 += f_scalar[n+13] * x_scalar[n+12]; \
|
||||
ov4 += f_scalar[n+10] * x_scalar[n+8]; \
|
||||
ov4 += f_scalar[n+14] * x_scalar[n+12]; \
|
||||
ov5 += f_scalar[n+10] * x_scalar[n+9]; \
|
||||
ov5 += f_scalar[n+14] * x_scalar[n+13]; \
|
||||
} \
|
||||
} \
|
||||
_use_simd_pragma("vector aligned") \
|
||||
_use_simd_pragma("ivdep") \
|
||||
_use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
|
||||
for (int n = v_range; n < lt; n++) \
|
||||
f_scalar[n] += f_scalar2[n] + f_scalar3[n]; \
|
||||
} \
|
||||
for (int n = v_range; n < lt; n += 4) { \
|
||||
_use_simd_pragma("vector aligned") \
|
||||
_use_simd_pragma("ivdep") \
|
||||
for (int v = 0; v < 4; v++) \
|
||||
ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
|
||||
ov3 += f_scalar[n+1] * x_scalar[n+0]; \
|
||||
ov4 += f_scalar[n+2] * x_scalar[n+0]; \
|
||||
ov5 += f_scalar[n+2] * x_scalar[n+1]; \
|
||||
} \
|
||||
ov0 += ovv[0]; \
|
||||
ov1 += ovv[1]; \
|
||||
ov2 += ovv[2]; \
|
||||
if (vwidth > 4) { \
|
||||
ov0 += ovv[4]; \
|
||||
ov1 += ovv[5]; \
|
||||
ov2 += ovv[6]; \
|
||||
} \
|
||||
if (vwidth > 8) { \
|
||||
ov0 += ovv[8] + ovv[12]; \
|
||||
ov1 += ovv[9] + ovv[13]; \
|
||||
ov2 += ovv[10] + ovv[14]; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \
|
||||
f_stride, pos, offload, vflag, ov0, ov1, \
|
||||
ov2, ov3, ov4, ov5) \
|
||||
{ \
|
||||
int o_range = (nall - minlocal) * 4; \
|
||||
IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads, \
|
||||
sizeof(acc_t)); \
|
||||
\
|
||||
acc_t *f_scalar = &f_start[0].x; \
|
||||
int f_stride4 = f_stride * 4; \
|
||||
int t; \
|
||||
if (vflag == 2) t = 4; else t = 1; \
|
||||
acc_t *f_scalar2 = f_scalar + f_stride4 * t; \
|
||||
for ( ; t < nthreads; t++) { \
|
||||
_use_simd_pragma("vector aligned") \
|
||||
_use_simd_pragma("simd") \
|
||||
for (int n = iifrom; n < iito; n++) \
|
||||
f_scalar[n] += f_scalar2[n]; \
|
||||
f_scalar2 += f_stride4; \
|
||||
} \
|
||||
\
|
||||
if (vflag == 2) { \
|
||||
int nt_min = MIN(4,nthreads); \
|
||||
IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start, \
|
||||
f_stride, pos, ov0, ov1, ov2, ov3, ov4, \
|
||||
ov5); \
|
||||
} \
|
||||
}
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
#include <sys/time.h>
|
||||
|
||||
@ -229,17 +523,19 @@ inline double MIC_Wtime() {
|
||||
if (fix->separate_buffers() && ago != 0) { \
|
||||
fix->start_watch(TIME_PACK); \
|
||||
if (offload) { \
|
||||
_use_omp_pragma("omp parallel default(none) shared(buffers,nlocal,nall)") \
|
||||
int packthreads; \
|
||||
if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads;\
|
||||
else packthreads = 1; \
|
||||
_use_omp_pragma("omp parallel if(packthreads > 1)") \
|
||||
{ \
|
||||
int ifrom, ito, tid; \
|
||||
int nthreads = comm->nthreads; \
|
||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal, \
|
||||
nthreads, sizeof(flt_t)); \
|
||||
packthreads, sizeof(flt_t)); \
|
||||
buffers->thr_pack_cop(ifrom, ito, 0); \
|
||||
int nghost = nall - nlocal; \
|
||||
if (nghost) { \
|
||||
IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal, \
|
||||
nthreads, sizeof(flt_t)); \
|
||||
packthreads, sizeof(flt_t)); \
|
||||
buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal, \
|
||||
fix->offload_min_ghost() - nlocal, \
|
||||
ago == 1); \
|
||||
@ -254,7 +550,7 @@ inline double MIC_Wtime() {
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_get_transfern(ago, newton, evflag, eflag, vflag, \
|
||||
#define IP_PRE_get_transfern(ago, newton, eflag, vflag, \
|
||||
buffers, offload, fix, separate_flag, \
|
||||
x_size, q_size, ev_size, f_stride) \
|
||||
{ \
|
||||
@ -276,17 +572,12 @@ inline double MIC_Wtime() {
|
||||
q_size = 0; \
|
||||
} \
|
||||
ev_size = 0; \
|
||||
if (evflag) { \
|
||||
if (eflag) ev_size = 2; \
|
||||
if (vflag) ev_size = 8; \
|
||||
} \
|
||||
int f_length; \
|
||||
if (eflag) ev_size = 2; \
|
||||
if (vflag) ev_size = 8; \
|
||||
if (newton) \
|
||||
f_length = nall; \
|
||||
f_stride = buffers->get_stride(nall); \
|
||||
else \
|
||||
f_length = nlocal; \
|
||||
f_length -= minlocal; \
|
||||
f_stride = buffers->get_stride(f_length); \
|
||||
f_stride = buffers->get_stride(inum); \
|
||||
}
|
||||
|
||||
#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \
|
||||
@ -337,6 +628,20 @@ inline double MIC_Wtime() {
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads, \
|
||||
f_start, f_stride, x, offload, vflag, \
|
||||
ov0, ov1, ov2, ov3, ov4, ov5) \
|
||||
{ \
|
||||
if (newton) { \
|
||||
_use_omp_pragma("omp barrier"); \
|
||||
IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \
|
||||
f_stride, x, offload, vflag, ov0, ov1, ov2, \
|
||||
ov3, ov4, ov5); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag, \
|
||||
ov0, ov1, ov2, ov3, ov4, ov5)
|
||||
|
||||
#else
|
||||
|
||||
@ -344,7 +649,7 @@ inline double MIC_Wtime() {
|
||||
#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, \
|
||||
nlocal, nall)
|
||||
|
||||
#define IP_PRE_get_transfern(ago, newton, evflag, eflag, vflag, \
|
||||
#define IP_PRE_get_transfern(ago, newton, eflag, vflag, \
|
||||
buffers, offload, fix, separate_flag, \
|
||||
x_size, q_size, ev_size, f_stride) \
|
||||
{ \
|
||||
@ -369,18 +674,54 @@ inline double MIC_Wtime() {
|
||||
#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall, \
|
||||
f_stride, x, q)
|
||||
|
||||
#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads, \
|
||||
f_start, f_stride, x, offload, vflag, \
|
||||
ov0, ov1, ov2, ov3, ov4, ov5) \
|
||||
{ \
|
||||
if (newton) { \
|
||||
if (vflag == 2 && nthreads > INTEL_HTHREADS) { \
|
||||
_use_omp_pragma("omp barrier"); \
|
||||
buffers->fdotr_reduce(nall, nthreads, f_stride, ov0, ov1, ov2, \
|
||||
ov3, ov4, ov5); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag, \
|
||||
ov0, ov1, ov2, ov3, ov4, ov5) \
|
||||
{ \
|
||||
if (newton) { \
|
||||
if (vflag == 2 && nthreads <= INTEL_HTHREADS) { \
|
||||
int lt = nall * 4; \
|
||||
buffers->fdotr_reduce_l5(0, lt, nthreads, f_stride, ov0, ov1, \
|
||||
ov2, ov3, ov4, ov5); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#define IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz) \
|
||||
#define IP_PRE_ev_tally_nbor(vflag, fpair, delx, dely, delz) \
|
||||
{ \
|
||||
if (vflag == 1) { \
|
||||
sv0 += ev_pre * delx * delx * fpair; \
|
||||
sv1 += ev_pre * dely * dely * fpair; \
|
||||
sv2 += ev_pre * delz * delz * fpair; \
|
||||
sv3 += ev_pre * delx * dely * fpair; \
|
||||
sv4 += ev_pre * delx * delz * fpair; \
|
||||
sv5 += ev_pre * dely * delz * fpair; \
|
||||
sv0 += delx * delx * fpair; \
|
||||
sv1 += dely * dely * fpair; \
|
||||
sv2 += delz * delz * fpair; \
|
||||
sv3 += delx * dely * fpair; \
|
||||
sv4 += delx * delz * fpair; \
|
||||
sv5 += dely * delz * fpair; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_ev_tally_nborv(vflag, dx, dy, dz, fpx, fpy, fpz) \
|
||||
{ \
|
||||
if (vflag == 1) { \
|
||||
sv0 += dx * fpx; \
|
||||
sv1 += dy * fpy; \
|
||||
sv2 += dz * fpz; \
|
||||
sv3 += dx * fpy; \
|
||||
sv4 += dx * fpz; \
|
||||
sv5 += dy * fpz; \
|
||||
} \
|
||||
}
|
||||
|
||||
@ -408,9 +749,10 @@ inline double MIC_Wtime() {
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_ev_tally_bond(eflag, eatom, vflag, ebond, i1, i2, fbond, \
|
||||
delx, dely, delz, obond, force, newton, \
|
||||
nlocal, ov0, ov1, ov2, ov3, ov4, ov5) \
|
||||
#define IP_PRE_ev_tally_bond(eflag, VFLAG, eatom, vflag, ebond, i1, i2, \
|
||||
fbond, delx, dely, delz, obond, force, \
|
||||
newton, nlocal, ov0, ov1, ov2, ov3, ov4, \
|
||||
ov5) \
|
||||
{ \
|
||||
flt_t ev_pre; \
|
||||
if (newton) ev_pre = (flt_t)1.0; \
|
||||
@ -421,7 +763,7 @@ inline double MIC_Wtime() {
|
||||
} \
|
||||
\
|
||||
if (eflag) { \
|
||||
oebond += ev_pre * ebond; \
|
||||
obond += ev_pre * ebond; \
|
||||
if (eatom) { \
|
||||
flt_t halfeng = ebond * (flt_t)0.5; \
|
||||
if (newton || i1 < nlocal) f[i1].w += halfeng; \
|
||||
@ -429,7 +771,7 @@ inline double MIC_Wtime() {
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if (vflag) { \
|
||||
if (VFLAG && vflag) { \
|
||||
ov0 += ev_pre * (delx * delx * fbond); \
|
||||
ov1 += ev_pre * (dely * dely * fbond); \
|
||||
ov2 += ev_pre * (delz * delz * fbond); \
|
||||
@ -439,9 +781,9 @@ inline double MIC_Wtime() {
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_ev_tally_angle(eflag, eatom, vflag, eangle, i1, i2, i3, \
|
||||
f1x, f1y, f1z, f3x, f3y, f3z, delx1, \
|
||||
dely1, delz1, delx2, dely2, delz2, \
|
||||
#define IP_PRE_ev_tally_angle(eflag, VFLAG, eatom, vflag, eangle, i1, \
|
||||
i2, i3, f1x, f1y, f1z, f3x, f3y, f3z, \
|
||||
delx1, dely1, delz1, delx2, dely2, delz2, \
|
||||
oeangle, force, newton, nlocal, ov0, ov1, \
|
||||
ov2, ov3, ov4, ov5) \
|
||||
{ \
|
||||
@ -464,20 +806,20 @@ inline double MIC_Wtime() {
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if (vflag) { \
|
||||
ov0 += ev_pre * (delx1 * f1x + delx2 * f3x); \
|
||||
ov1 += ev_pre * (dely1 * f1y + dely2 * f3y); \
|
||||
ov2 += ev_pre * (delz1 * f1z + delz2 * f3z); \
|
||||
ov3 += ev_pre * (delx1 * f1y + delx2 * f3y); \
|
||||
ov4 += ev_pre * (delx1 * f1z + delx2 * f3z); \
|
||||
ov5 += ev_pre * (dely1 * f1z + dely2 * f3z); \
|
||||
if (VFLAG && vflag) { \
|
||||
ov0 += ev_pre * (delx1 * f1x + delx2 * f3x); \
|
||||
ov1 += ev_pre * (dely1 * f1y + dely2 * f3y); \
|
||||
ov2 += ev_pre * (delz1 * f1z + delz2 * f3z); \
|
||||
ov3 += ev_pre * (delx1 * f1y + delx2 * f3y); \
|
||||
ov4 += ev_pre * (delx1 * f1z + delx2 * f3z); \
|
||||
ov5 += ev_pre * (dely1 * f1z + dely2 * f3z); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_ev_tally_dihed(eflag, eatom, vflag, deng, i1, i2, i3, i4,\
|
||||
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, \
|
||||
f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, \
|
||||
vb3x, vb3y, vb3z,oedihedral, force, \
|
||||
#define IP_PRE_ev_tally_dihed(eflag, VFLAG, eatom, vflag, deng, i1, i2, \
|
||||
i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,\
|
||||
f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, \
|
||||
vb2z, vb3x, vb3y, vb3z, oedihedral, force,\
|
||||
newton, nlocal, ov0, ov1, ov2, ov3, ov4, \
|
||||
ov5) \
|
||||
{ \
|
||||
@ -502,7 +844,7 @@ inline double MIC_Wtime() {
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if (vflag) { \
|
||||
if (VFLAG && vflag) { \
|
||||
ov0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x); \
|
||||
ov1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y); \
|
||||
ov2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z); \
|
||||
@ -512,96 +854,36 @@ inline double MIC_Wtime() {
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_ev_tally_atom(evflag, eflag, vflag, f, fwtmp) \
|
||||
#define IP_PRE_ev_tally_atom(newton, eflag, vflag, f, fwtmp) \
|
||||
{ \
|
||||
if (evflag) { \
|
||||
if (eflag) { \
|
||||
f[i].w += fwtmp; \
|
||||
oevdwl += sevdwl; \
|
||||
} \
|
||||
if (vflag == 1) { \
|
||||
ov0 += sv0; \
|
||||
ov1 += sv1; \
|
||||
ov2 += sv2; \
|
||||
ov3 += sv3; \
|
||||
ov4 += sv4; \
|
||||
ov5 += sv5; \
|
||||
} \
|
||||
if (eflag) { \
|
||||
f[i].w += fwtmp; \
|
||||
oevdwl += sevdwl; \
|
||||
} \
|
||||
if (newton == 0 && vflag == 1) { \
|
||||
ov0 += sv0; \
|
||||
ov1 += sv1; \
|
||||
ov2 += sv2; \
|
||||
ov3 += sv3; \
|
||||
ov4 += sv4; \
|
||||
ov5 += sv5; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_ev_tally_atomq(evflag, eflag, vflag, f, fwtmp) \
|
||||
#define IP_PRE_ev_tally_atomq(newton, eflag, vflag, f, fwtmp) \
|
||||
{ \
|
||||
if (evflag) { \
|
||||
if (eflag) { \
|
||||
f[i].w += fwtmp; \
|
||||
oevdwl += sevdwl; \
|
||||
oecoul += secoul; \
|
||||
} \
|
||||
if (vflag == 1) { \
|
||||
ov0 += sv0; \
|
||||
ov1 += sv1; \
|
||||
ov2 += sv2; \
|
||||
ov3 += sv3; \
|
||||
ov4 += sv4; \
|
||||
ov5 += sv5; \
|
||||
} \
|
||||
if (eflag) { \
|
||||
f[i].w += fwtmp; \
|
||||
oevdwl += sevdwl; \
|
||||
oecoul += secoul; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define IP_PRE_fdotr_acc_force(newton, evflag, eflag, vflag, eatom, \
|
||||
nall, nlocal, minlocal, nthreads, \
|
||||
f_start, f_stride, x, offload) \
|
||||
{ \
|
||||
int o_range; \
|
||||
if (newton) \
|
||||
o_range = nall; \
|
||||
else \
|
||||
o_range = nlocal; \
|
||||
if (offload == 0) o_range -= minlocal; \
|
||||
IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, \
|
||||
sizeof(acc_t)); \
|
||||
\
|
||||
int t_off = f_stride; \
|
||||
if (eflag && eatom) { \
|
||||
for (int t = 1; t < nthreads; t++) { \
|
||||
_use_simd_pragma("vector nontemporal") \
|
||||
_use_simd_pragma("novector") \
|
||||
for (int n = iifrom; n < iito; n++) { \
|
||||
f_start[n].x += f_start[n + t_off].x; \
|
||||
f_start[n].y += f_start[n + t_off].y; \
|
||||
f_start[n].z += f_start[n + t_off].z; \
|
||||
f_start[n].w += f_start[n + t_off].w; \
|
||||
} \
|
||||
t_off += f_stride; \
|
||||
} \
|
||||
} else { \
|
||||
for (int t = 1; t < nthreads; t++) { \
|
||||
_use_simd_pragma("vector nontemporal") \
|
||||
_use_simd_pragma("novector") \
|
||||
for (int n = iifrom; n < iito; n++) { \
|
||||
f_start[n].x += f_start[n + t_off].x; \
|
||||
f_start[n].y += f_start[n + t_off].y; \
|
||||
f_start[n].z += f_start[n + t_off].z; \
|
||||
} \
|
||||
t_off += f_stride; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if (evflag) { \
|
||||
if (vflag == 2) { \
|
||||
const ATOM_T * _noalias const xo = x + minlocal; \
|
||||
_use_simd_pragma("vector nontemporal") \
|
||||
_use_simd_pragma("novector") \
|
||||
for (int n = iifrom; n < iito; n++) { \
|
||||
ov0 += f_start[n].x * xo[n].x; \
|
||||
ov1 += f_start[n].y * xo[n].y; \
|
||||
ov2 += f_start[n].z * xo[n].z; \
|
||||
ov3 += f_start[n].y * xo[n].x; \
|
||||
ov4 += f_start[n].z * xo[n].x; \
|
||||
ov5 += f_start[n].z * xo[n].y; \
|
||||
} \
|
||||
} \
|
||||
if (newton == 0 && vflag == 1) { \
|
||||
ov0 += sv0; \
|
||||
ov1 += sv1; \
|
||||
ov2 += sv2; \
|
||||
ov3 += sv3; \
|
||||
ov4 += sv4; \
|
||||
ov5 += sv5; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
@ -1778,7 +1778,7 @@ namespace ip_simd {
|
||||
inline void SIMD_iforce_update(const SIMD_mask &m, float *force,
|
||||
const SIMD_int &i, const SIMD_float &fx,
|
||||
const SIMD_float &fy, const SIMD_float &fz,
|
||||
const int EVFLAG, const int eatom,
|
||||
const int EFLAG, const int eatom,
|
||||
const SIMD_float &fwtmp) {
|
||||
SIMD_float jfrc;
|
||||
jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force,
|
||||
@ -1793,7 +1793,7 @@ namespace ip_simd {
|
||||
_MM_SCALE_1);
|
||||
jfrc = jfrc + fz;
|
||||
_mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1);
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
if (eatom) {
|
||||
jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 3,
|
||||
_MM_SCALE_1);
|
||||
@ -1806,7 +1806,7 @@ namespace ip_simd {
|
||||
inline void SIMD_iforce_update(const SIMD_mask &m, double *force,
|
||||
const SIMD_int &i, const SIMD_double &fx,
|
||||
const SIMD_double &fy, const SIMD_double &fz,
|
||||
const int EVFLAG, const int eatom,
|
||||
const int EFLAG, const int eatom,
|
||||
const SIMD_double &fwtmp) {
|
||||
SIMD_double jfrc;
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
|
||||
@ -1821,7 +1821,7 @@ namespace ip_simd {
|
||||
_MM_SCALE_2);
|
||||
jfrc = jfrc + fz;
|
||||
_mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
if (eatom) {
|
||||
jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i,
|
||||
force + 3, _MM_SCALE_2);
|
||||
|
||||
@ -71,7 +71,7 @@ void NBinIntel::bin_atoms_setup(int nall)
|
||||
if (_offload_alloc) {
|
||||
const int * binhead = this->binhead;
|
||||
#pragma offload_transfer target(mic:_cop) \
|
||||
nocopy(binhead:alloc_if(0) free_if(1))
|
||||
nocopy(binhead:alloc_if(0) free_if(1))
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -99,7 +99,7 @@ void NBinIntel::bin_atoms_setup(int nall)
|
||||
const int * _atombin = this->_atombin;
|
||||
const int * _binpacked = this->_binpacked;
|
||||
#pragma offload_transfer target(mic:_cop) \
|
||||
nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1))
|
||||
nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1))
|
||||
}
|
||||
#endif
|
||||
memory->destroy(bins);
|
||||
@ -158,9 +158,9 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
|
||||
const flt_t dy = (INTEL_BIGP - bboxhi[1]);
|
||||
const flt_t dz = (INTEL_BIGP - bboxhi[2]);
|
||||
if (dx * dx + dy * dy + dz * dz <
|
||||
static_cast<flt_t>(neighbor->cutneighmaxsq))
|
||||
static_cast<flt_t>(neighbor->cutneighmaxsq))
|
||||
error->one(FLERR,
|
||||
"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
|
||||
"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
|
||||
}
|
||||
|
||||
// ---------- Grow and cast/pack buffers -------------
|
||||
@ -174,14 +174,16 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
|
||||
biga.w = 1;
|
||||
buffers->get_x()[nall] = biga;
|
||||
|
||||
const int nthreads = comm->nthreads;
|
||||
int nthreads;
|
||||
if (comm->nthreads > INTEL_HTHREADS) nthreads = comm->nthreads;
|
||||
else nthreads = 1;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) shared(buffers)
|
||||
#pragma omp parallel if(nthreads > INTEL_HTHREADS)
|
||||
#endif
|
||||
{
|
||||
int ifrom, ito, tid;
|
||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
|
||||
sizeof(ATOM_T));
|
||||
sizeof(ATOM_T));
|
||||
buffers->thr_pack(ifrom, ito, 0);
|
||||
}
|
||||
_fix->stop_watch(TIME_PACK);
|
||||
|
||||
@ -70,483 +70,62 @@ fbi(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
#endif
|
||||
|
||||
buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end,
|
||||
_fix->nbor_pack_width());
|
||||
_fix->nbor_pack_width());
|
||||
|
||||
int need_ic = 0;
|
||||
if (atom->molecular)
|
||||
dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
|
||||
neighbor->cutneighmax);
|
||||
neighbor->cutneighmax);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (need_ic) {
|
||||
if (offload_noghost) {
|
||||
fbi<flt_t,acc_t,1,1>(1, list, buffers, 0, off_end);
|
||||
fbi<flt_t,acc_t,1,1>(0, list, buffers, host_start, nlocal, off_end);
|
||||
if (_fix->three_body_neighbor()) {
|
||||
if (need_ic) {
|
||||
if (offload_noghost) {
|
||||
bin_newton<flt_t,acc_t,1,1,1,0,1>(1, list, buffers, 0, off_end);
|
||||
bin_newton<flt_t,acc_t,1,1,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
|
||||
} else {
|
||||
bin_newton<flt_t,acc_t,0,1,1,0,1>(1, list, buffers, 0, off_end);
|
||||
bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal);
|
||||
}
|
||||
} else {
|
||||
fbi<flt_t,acc_t,0,1>(1, list, buffers, 0, off_end);
|
||||
fbi<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
|
||||
if (offload_noghost) {
|
||||
bin_newton<flt_t,acc_t,1,0,1,0,1>(1, list, buffers, 0, off_end);
|
||||
bin_newton<flt_t,acc_t,1,0,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
|
||||
} else {
|
||||
bin_newton<flt_t,acc_t,0,0,1,0,1>(1, list, buffers, 0, off_end);
|
||||
bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (offload_noghost) {
|
||||
fbi<flt_t,acc_t,1,0>(1, list, buffers, 0, off_end);
|
||||
fbi<flt_t,acc_t,1,0>(0, list, buffers, host_start, nlocal, off_end);
|
||||
if (need_ic) {
|
||||
if (offload_noghost) {
|
||||
bin_newton<flt_t,acc_t,1,1,1,0,0>(1, list, buffers, 0, off_end);
|
||||
bin_newton<flt_t,acc_t,1,1,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
|
||||
} else {
|
||||
bin_newton<flt_t,acc_t,0,1,1,0,0>(1, list, buffers, 0, off_end);
|
||||
bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal);
|
||||
}
|
||||
} else {
|
||||
fbi<flt_t,acc_t,0,0>(1, list, buffers, 0, off_end);
|
||||
fbi<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
|
||||
if (offload_noghost) {
|
||||
bin_newton<flt_t,acc_t,1,0,1,0,0>(1, list, buffers, 0, off_end);
|
||||
bin_newton<flt_t,acc_t,1,0,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
|
||||
} else {
|
||||
bin_newton<flt_t,acc_t,0,0,1,0,0>(1, list, buffers, 0, off_end);
|
||||
bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (need_ic)
|
||||
fbi<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
|
||||
else
|
||||
fbi<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class flt_t, class acc_t, int offload_noghost, int need_ic>
|
||||
void NPairFullBinIntel::
|
||||
fbi(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const int astart, const int aend, const int offload_end) {
|
||||
|
||||
if (aend-astart == 0) return;
|
||||
|
||||
const int nall = atom->nlocal + atom->nghost;
|
||||
int pad = 1;
|
||||
int nall_t = nall;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload_noghost && offload) nall_t = atom->nlocal;
|
||||
#endif
|
||||
|
||||
const int pack_width = _fix->nbor_pack_width();
|
||||
const int pad_width = pad;
|
||||
|
||||
const ATOM_T * _noalias const x = buffers->get_x();
|
||||
int * _noalias const firstneigh = buffers->firstneigh(list);
|
||||
const int e_nall = nall_t;
|
||||
|
||||
const int molecular = atom->molecular;
|
||||
int *ns = NULL;
|
||||
tagint *s = NULL;
|
||||
int tag_size = 0, special_size;
|
||||
if (buffers->need_tag()) tag_size = e_nall;
|
||||
if (molecular) {
|
||||
s = atom->special[0];
|
||||
ns = atom->nspecial[0];
|
||||
special_size = aend;
|
||||
} else {
|
||||
s = &buffers->_special_holder;
|
||||
ns = &buffers->_nspecial_holder;
|
||||
special_size = 0;
|
||||
}
|
||||
const tagint * _noalias const special = s;
|
||||
const int * _noalias const nspecial = ns;
|
||||
const int maxspecial = atom->maxspecial;
|
||||
const tagint * _noalias const tag = atom->tag;
|
||||
|
||||
int * _noalias const ilist = list->ilist;
|
||||
int * _noalias numneigh = list->numneigh;
|
||||
int * _noalias const cnumneigh = buffers->cnumneigh(list);
|
||||
const int nstencil = this->nstencil;
|
||||
const int * _noalias const stencil = this->stencil;
|
||||
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
|
||||
const int ntypes = atom->ntypes + 1;
|
||||
const int nlocal = atom->nlocal;
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
int * const mask = atom->mask;
|
||||
tagint * const molecule = atom->molecule;
|
||||
#endif
|
||||
|
||||
int tnum;
|
||||
int *overflow;
|
||||
double *timer_compute;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload) {
|
||||
timer_compute = _fix->off_watch_neighbor();
|
||||
tnum = buffers->get_off_threads();
|
||||
overflow = _fix->get_off_overflow_flag();
|
||||
_fix->stop_watch(TIME_HOST_NEIGHBOR);
|
||||
_fix->start_watch(TIME_OFFLOAD_LATENCY);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
tnum = comm->nthreads;
|
||||
overflow = _fix->get_overflow_flag();
|
||||
}
|
||||
const int nthreads = tnum;
|
||||
const int maxnbors = buffers->get_max_nbors();
|
||||
int * _noalias const atombin = buffers->get_atombin();
|
||||
const int * _noalias const binpacked = buffers->get_binpacked();
|
||||
|
||||
const int xperiodic = domain->xperiodic;
|
||||
const int yperiodic = domain->yperiodic;
|
||||
const int zperiodic = domain->zperiodic;
|
||||
const flt_t xprd_half = domain->xprd_half;
|
||||
const flt_t yprd_half = domain->yprd_half;
|
||||
const flt_t zprd_half = domain->zprd_half;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
const int * _noalias const binhead = this->binhead;
|
||||
const int * _noalias const bins = this->bins;
|
||||
const int cop = _fix->coprocessor_number();
|
||||
const int separate_buffers = _fix->separate_buffers();
|
||||
#pragma offload target(mic:cop) if(offload) \
|
||||
in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
|
||||
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
|
||||
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
|
||||
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
|
||||
in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
|
||||
in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
|
||||
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
|
||||
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
|
||||
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
|
||||
out(numneigh:length(0) alloc_if(0) free_if(0)) \
|
||||
in(ilist:length(0) alloc_if(0) free_if(0)) \
|
||||
in(atombin:length(aend) alloc_if(0) free_if(0)) \
|
||||
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
|
||||
in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pack_width) \
|
||||
in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \
|
||||
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
|
||||
out(overflow:length(5) alloc_if(0) free_if(0)) \
|
||||
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
||||
signal(tag)
|
||||
#endif
|
||||
{
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime();
|
||||
#endif
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
overflow[LMP_LOCAL_MIN] = astart;
|
||||
overflow[LMP_LOCAL_MAX] = aend - 1;
|
||||
overflow[LMP_GHOST_MIN] = e_nall;
|
||||
overflow[LMP_GHOST_MAX] = -1;
|
||||
#endif
|
||||
|
||||
int nstencilp = 0;
|
||||
int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
|
||||
for (int k = 0; k < nstencil; k++) {
|
||||
binstart[nstencilp] = stencil[k];
|
||||
int end = stencil[k] + 1;
|
||||
for (int kk = k + 1; kk < nstencil; kk++) {
|
||||
if (stencil[kk-1]+1 == stencil[kk]) {
|
||||
end++;
|
||||
k++;
|
||||
} else break;
|
||||
}
|
||||
binend[nstencilp] = end;
|
||||
nstencilp++;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(numneigh, overflow, nstencilp, binstart, binend)
|
||||
#endif
|
||||
{
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
|
||||
#endif
|
||||
|
||||
const int num = aend - astart;
|
||||
int tid, ifrom, ito;
|
||||
|
||||
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
|
||||
ifrom += astart;
|
||||
ito += astart;
|
||||
int e_ito = ito;
|
||||
if (ito == num) {
|
||||
int imod = ito % pack_width;
|
||||
if (imod) e_ito += pack_width - imod;
|
||||
}
|
||||
const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
|
||||
int which;
|
||||
int pack_offset = maxnbors * pack_width;
|
||||
int ct = (ifrom + tid * 2) * maxnbors;
|
||||
int *neighptr = firstneigh + ct;
|
||||
const int obound = pack_offset + maxnbors * 2;
|
||||
|
||||
int max_chunk = 0;
|
||||
int lane = 0;
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
const flt_t xtmp = x[i].x;
|
||||
const flt_t ytmp = x[i].y;
|
||||
const flt_t ztmp = x[i].z;
|
||||
const int itype = x[i].w;
|
||||
const tagint itag = tag[i];
|
||||
const int ioffset = ntypes * itype;
|
||||
|
||||
const int ibin = atombin[i];
|
||||
int raw_count = pack_offset;
|
||||
|
||||
// loop over all atoms in surrounding bins in stencil including self
|
||||
// skip i = j
|
||||
if (exclude) {
|
||||
for (int k = 0; k < nstencilp; k++) {
|
||||
const int bstart = binhead[ibin + binstart[k]];
|
||||
const int bend = binhead[ibin + binend[k]];
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
#ifdef INTEL_VMASK
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++) {
|
||||
int j = binpacked[jj];
|
||||
|
||||
if (i == j) j=e_nall;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload_noghost) {
|
||||
if (j < nlocal) {
|
||||
if (i < offload_end) continue;
|
||||
} else if (offload) continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
const int jtype = x[j].w;
|
||||
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
||||
#endif
|
||||
|
||||
neighptr[raw_count++] = j;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int k = 0; k < nstencilp; k++) {
|
||||
const int bstart = binhead[ibin + binstart[k]];
|
||||
const int bend = binhead[ibin + binend[k]];
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
#ifdef INTEL_VMASK
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++) {
|
||||
int j = binpacked[jj];
|
||||
|
||||
if (i == j) j=e_nall;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload_noghost) {
|
||||
if (j < nlocal) {
|
||||
if (i < offload_end) continue;
|
||||
} else if (offload) continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
neighptr[raw_count++] = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (raw_count > obound) *overflow = 1;
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
|
||||
#if __INTEL_COMPILER+0 > 1499
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
|
||||
#endif
|
||||
#else
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
for (int u = pack_offset; u < raw_count; u++) {
|
||||
int j = neighptr[u];
|
||||
const flt_t delx = xtmp - x[j].x;
|
||||
const flt_t dely = ytmp - x[j].y;
|
||||
const flt_t delz = ztmp - x[j].z;
|
||||
const int jtype = x[j].w;
|
||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||
if (rsq > cutneighsq[ioffset + jtype])
|
||||
neighptr[u] = e_nall;
|
||||
else {
|
||||
if (need_ic) {
|
||||
int no_special;
|
||||
ominimum_image_check(no_special, delx, dely, delz);
|
||||
if (no_special)
|
||||
neighptr[u] = -j - 1;
|
||||
}
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j < nlocal) {
|
||||
if (j < vlmin) vlmin = j;
|
||||
if (j > vlmax) vlmax = j;
|
||||
} else {
|
||||
if (j < vgmin) vgmin = j;
|
||||
if (j > vgmax) vgmax = j;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
lmin = MIN(lmin,vlmin);
|
||||
gmin = MIN(gmin,vgmin);
|
||||
lmax = MAX(lmax,vlmax);
|
||||
gmax = MAX(gmax,vgmax);
|
||||
#endif
|
||||
|
||||
int n = lane, n2 = pack_offset;
|
||||
for (int u = pack_offset; u < raw_count; u++) {
|
||||
const int j = neighptr[u];
|
||||
int pj = j;
|
||||
if (pj < e_nall) {
|
||||
if (need_ic)
|
||||
if (pj < 0) pj = -pj - 1;
|
||||
|
||||
const int jtag = tag[pj];
|
||||
int flist = 0;
|
||||
if (itag > jtag) {
|
||||
if ((itag+jtag) % 2 == 0) flist = 1;
|
||||
} else if (itag < jtag) {
|
||||
if ((itag+jtag) % 2 == 1) flist = 1;
|
||||
} else {
|
||||
if (x[pj].z < ztmp) flist = 1;
|
||||
else if (x[pj].z == ztmp && x[pj].y < ytmp) flist = 1;
|
||||
else if (x[pj].z == ztmp && x[pj].y == ytmp && x[pj].x < xtmp)
|
||||
flist = 1;
|
||||
}
|
||||
if (flist) {
|
||||
neighptr[n2++] = j;
|
||||
} else {
|
||||
neighptr[n] = j;
|
||||
n += pack_width;
|
||||
}
|
||||
}
|
||||
}
|
||||
int ns = (n - lane) / pack_width;
|
||||
atombin[i] = ns;
|
||||
for (int u = pack_offset; u < n2; u++) {
|
||||
neighptr[n] = neighptr[u];
|
||||
n += pack_width;
|
||||
}
|
||||
|
||||
ilist[i] = i;
|
||||
cnumneigh[i] = ct + lane;
|
||||
ns += n2 - pack_offset;
|
||||
numneigh[i] = ns;
|
||||
|
||||
if (ns > max_chunk) max_chunk = ns;
|
||||
lane++;
|
||||
if (lane == pack_width) {
|
||||
ct += max_chunk * pack_width;
|
||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||
const int edge = (ct % alignb);
|
||||
if (edge) ct += alignb - edge;
|
||||
neighptr = firstneigh + ct;
|
||||
max_chunk = 0;
|
||||
pack_offset = maxnbors * pack_width;
|
||||
lane = 0;
|
||||
if (ct + obound > list_size) {
|
||||
if (i < ito - 1) {
|
||||
*overflow = 1;
|
||||
ct = (ifrom + tid * 2) * maxnbors;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (*overflow == 1)
|
||||
for (int i = ifrom; i < ito; i++)
|
||||
numneigh[i] = 0;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (separate_buffers) {
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp critical
|
||||
#endif
|
||||
{
|
||||
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
|
||||
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
|
||||
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
|
||||
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
|
||||
}
|
||||
#pragma omp barrier
|
||||
}
|
||||
|
||||
int ghost_offset = 0, nall_offset = e_nall;
|
||||
if (separate_buffers) {
|
||||
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
|
||||
if (nghost < 0) nghost = 0;
|
||||
if (offload) {
|
||||
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
|
||||
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
|
||||
} else {
|
||||
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
|
||||
nall_offset = nlocal + nghost;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (molecular) {
|
||||
for (int i = ifrom; i < ito; ++i) {
|
||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||
const int jnum = numneigh[i];
|
||||
|
||||
const int trip = jnum * pack_width;
|
||||
for (int jj = 0; jj < trip; jj+=pack_width) {
|
||||
const int j = jlist[jj];
|
||||
if (need_ic && j < 0) {
|
||||
which = 0;
|
||||
jlist[jj] = -j - 1;
|
||||
} else
|
||||
ofind_special(which, special, nspecial, i, tag[j]);
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j >= nlocal) {
|
||||
if (j == e_nall)
|
||||
jlist[jj] = nall_offset;
|
||||
else if (which)
|
||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||
else jlist[jj]-=ghost_offset;
|
||||
} else
|
||||
#endif
|
||||
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
else if (separate_buffers) {
|
||||
for (int i = ifrom; i < ito; ++i) {
|
||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||
const int jnum = numneigh[i];
|
||||
int jj = 0;
|
||||
for (jj = 0; jj < jnum; jj++) {
|
||||
if (jlist[jj] >= nlocal) {
|
||||
if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
|
||||
else jlist[jj] -= ghost_offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} // end omp
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
#endif
|
||||
} // end offload
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload) {
|
||||
_fix->stop_watch(TIME_OFFLOAD_LATENCY);
|
||||
_fix->start_watch(TIME_HOST_NEIGHBOR);
|
||||
for (int n = 0; n < aend; n++) {
|
||||
ilist[n] = n;
|
||||
numneigh[n] = 0;
|
||||
}
|
||||
} else {
|
||||
for (int i = astart; i < aend; i++)
|
||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
||||
if (separate_buffers) {
|
||||
_fix->start_watch(TIME_PACK);
|
||||
_fix->set_neighbor_host_sizes();
|
||||
buffers->pack_sep_from_single(_fix->host_min_local(),
|
||||
_fix->host_used_local(),
|
||||
_fix->host_min_ghost(),
|
||||
_fix->host_used_ghost());
|
||||
_fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
}
|
||||
#else
|
||||
for (int i = astart; i < aend; i++)
|
||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
||||
if (_fix->three_body_neighbor()) {
|
||||
if (need_ic)
|
||||
bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal);
|
||||
else
|
||||
bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal);
|
||||
} else {
|
||||
if (need_ic)
|
||||
bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal);
|
||||
else
|
||||
bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -36,9 +36,6 @@ class NPairFullBinIntel : public NPairIntel {
|
||||
private:
|
||||
template <class flt_t, class acc_t>
|
||||
void fbi(NeighList *, IntelBuffers<flt_t,acc_t> *);
|
||||
template <class flt_t, class acc_t, int, int>
|
||||
void fbi(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int,
|
||||
const int, const int offload_end = 0);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@ -1,451 +0,0 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "npair_half_bin_newtoff_intel.h"
|
||||
#include "neighbor.h"
|
||||
#include "neigh_list.h"
|
||||
#include "atom.h"
|
||||
#include "comm.h"
|
||||
#include "group.h"
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
NPairHalfBinNewtoffIntel::NPairHalfBinNewtoffIntel(LAMMPS *lmp) :
|
||||
NPairIntel(lmp) {}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
binned neighbor list construction with partial Newton's 3rd law
|
||||
each owned atom i checks own bin and other bins in stencil
|
||||
pair stored once if i,j are both owned and i < j
|
||||
pair stored by me if j is ghost (also stored by proc owning j)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void NPairHalfBinNewtoffIntel::build(NeighList *list)
|
||||
{
|
||||
if (nstencil > INTEL_MAX_STENCIL_CHECK)
|
||||
error->all(FLERR, "Too many neighbor bins for USER-INTEL package.");
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (exclude)
|
||||
error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
|
||||
#endif
|
||||
|
||||
if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
hbnni(list, _fix->get_mixed_buffers());
|
||||
else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
hbnni(list, _fix->get_double_buffers());
|
||||
else
|
||||
hbnni(list, _fix->get_single_buffers());
|
||||
|
||||
_fix->stop_watch(TIME_HOST_NEIGHBOR);
|
||||
}
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void NPairHalfBinNewtoffIntel::
|
||||
hbnni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
|
||||
list->inum = nlocal;
|
||||
|
||||
const int off_end = _fix->offload_end_neighbor();
|
||||
int host_start = off_end;;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (off_end) grow_stencil();
|
||||
if (_fix->full_host_list()) host_start = 0;
|
||||
#endif
|
||||
|
||||
buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end);
|
||||
|
||||
int need_ic = 0;
|
||||
if (atom->molecular)
|
||||
dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
|
||||
neighbor->cutneighmax);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (need_ic) {
|
||||
hbnni<flt_t,acc_t,1>(1, list, buffers, 0, off_end);
|
||||
hbnni<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal);
|
||||
} else {
|
||||
hbnni<flt_t,acc_t,0>(1, list, buffers, 0, off_end);
|
||||
hbnni<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal);
|
||||
}
|
||||
#else
|
||||
if (need_ic)
|
||||
hbnni<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal);
|
||||
else
|
||||
hbnni<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class flt_t, class acc_t, int need_ic>
|
||||
void NPairHalfBinNewtoffIntel::
|
||||
hbnni(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const int astart, const int aend) {
|
||||
|
||||
if (aend-astart == 0) return;
|
||||
|
||||
const int nall = atom->nlocal + atom->nghost;
|
||||
int pad = 1;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload) {
|
||||
if (INTEL_MIC_NBOR_PAD > 1)
|
||||
pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
||||
} else
|
||||
#endif
|
||||
if (INTEL_NBOR_PAD > 1)
|
||||
pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
||||
const int pad_width = pad;
|
||||
|
||||
const ATOM_T * _noalias const x = buffers->get_x();
|
||||
int * _noalias const firstneigh = buffers->firstneigh(list);
|
||||
|
||||
const int molecular = atom->molecular;
|
||||
int *ns = NULL;
|
||||
tagint *s = NULL;
|
||||
int tag_size = 0, special_size;
|
||||
if (buffers->need_tag()) tag_size = nall;
|
||||
if (molecular) {
|
||||
s = atom->special[0];
|
||||
ns = atom->nspecial[0];
|
||||
special_size = aend;
|
||||
} else {
|
||||
s = &buffers->_special_holder;
|
||||
ns = &buffers->_nspecial_holder;
|
||||
special_size = 0;
|
||||
}
|
||||
const tagint * _noalias const special = s;
|
||||
const int * _noalias const nspecial = ns;
|
||||
const int maxspecial = atom->maxspecial;
|
||||
const tagint * _noalias const tag = atom->tag;
|
||||
|
||||
int * _noalias const ilist = list->ilist;
|
||||
int * _noalias numneigh = list->numneigh;
|
||||
int * _noalias const cnumneigh = buffers->cnumneigh(list);
|
||||
const int nstencil = this->nstencil;
|
||||
const int * _noalias const stencil = this->stencil;
|
||||
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
|
||||
const int ntypes = atom->ntypes + 1;
|
||||
const int nlocal = atom->nlocal;
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
int * const mask = atom->mask;
|
||||
tagint * const molecule = atom->molecule;
|
||||
#endif
|
||||
|
||||
int tnum;
|
||||
int *overflow;
|
||||
double *timer_compute;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload) {
|
||||
timer_compute = _fix->off_watch_neighbor();
|
||||
tnum = buffers->get_off_threads();
|
||||
overflow = _fix->get_off_overflow_flag();
|
||||
_fix->stop_watch(TIME_HOST_NEIGHBOR);
|
||||
_fix->start_watch(TIME_OFFLOAD_LATENCY);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
tnum = comm->nthreads;
|
||||
overflow = _fix->get_overflow_flag();
|
||||
}
|
||||
const int nthreads = tnum;
|
||||
const int maxnbors = buffers->get_max_nbors();
|
||||
int * _noalias const atombin = buffers->get_atombin();
|
||||
const int * _noalias const binpacked = buffers->get_binpacked();
|
||||
|
||||
const int xperiodic = domain->xperiodic;
|
||||
const int yperiodic = domain->yperiodic;
|
||||
const int zperiodic = domain->zperiodic;
|
||||
const flt_t xprd_half = domain->xprd_half;
|
||||
const flt_t yprd_half = domain->yprd_half;
|
||||
const flt_t zprd_half = domain->zprd_half;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
const int * _noalias const binhead = this->binhead;
|
||||
const int * _noalias const bins = this->bins;
|
||||
const int cop = _fix->coprocessor_number();
|
||||
const int separate_buffers = _fix->separate_buffers();
|
||||
#pragma offload target(mic:cop) if(offload) \
|
||||
in(x:length(nall+1) alloc_if(0) free_if(0)) \
|
||||
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
|
||||
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
|
||||
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
|
||||
in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
|
||||
in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
|
||||
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
|
||||
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
|
||||
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
|
||||
out(numneigh:length(0) alloc_if(0) free_if(0)) \
|
||||
in(ilist:length(0) alloc_if(0) free_if(0)) \
|
||||
in(atombin:length(aend) alloc_if(0) free_if(0)) \
|
||||
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
|
||||
in(maxnbors,nthreads,maxspecial,nstencil,pad_width,offload,nall) \
|
||||
in(separate_buffers, astart, aend, nlocal, molecular, ntypes) \
|
||||
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
|
||||
out(overflow:length(5) alloc_if(0) free_if(0)) \
|
||||
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
||||
signal(tag)
|
||||
#endif
|
||||
{
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime();
|
||||
#endif
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
overflow[LMP_LOCAL_MIN] = astart;
|
||||
overflow[LMP_LOCAL_MAX] = aend - 1;
|
||||
overflow[LMP_GHOST_MIN] = nall;
|
||||
overflow[LMP_GHOST_MAX] = -1;
|
||||
#endif
|
||||
|
||||
int nstencilp = 0;
|
||||
int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
|
||||
for (int k = 0; k < nstencil; k++) {
|
||||
binstart[nstencilp] = stencil[k];
|
||||
int end = stencil[k] + 1;
|
||||
for (int kk = k + 1; kk < nstencil; kk++) {
|
||||
if (stencil[kk-1]+1 == stencil[kk]) {
|
||||
end++;
|
||||
k++;
|
||||
} else break;
|
||||
}
|
||||
binend[nstencilp] = end;
|
||||
nstencilp++;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(numneigh, overflow, nstencilp, binstart, binend)
|
||||
#endif
|
||||
{
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int lmin = nall, lmax = -1, gmin = nall, gmax = -1;
|
||||
#endif
|
||||
|
||||
const int num = aend - astart;
|
||||
int tid, ifrom, ito;
|
||||
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
|
||||
ifrom += astart;
|
||||
ito += astart;
|
||||
|
||||
int which;
|
||||
|
||||
const int list_size = (ito + tid + 1) * maxnbors;
|
||||
int ct = (ifrom + tid) * maxnbors;
|
||||
int *neighptr = firstneigh + ct;
|
||||
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
int j, k, n, n2, itype, jtype, ibin;
|
||||
double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
|
||||
|
||||
n = 0;
|
||||
n2 = maxnbors;
|
||||
|
||||
xtmp = x[i].x;
|
||||
ytmp = x[i].y;
|
||||
ztmp = x[i].z;
|
||||
itype = x[i].w;
|
||||
const int ioffset = ntypes*itype;
|
||||
|
||||
// loop over all atoms in other bins in stencil including self
|
||||
// only store pair if i < j
|
||||
// stores own/own pairs only once
|
||||
// stores own/ghost pairs on both procs
|
||||
|
||||
ibin = atombin[i];
|
||||
|
||||
for (k = 0; k < nstencilp; k++) {
|
||||
const int bstart = binhead[ibin + binstart[k]];
|
||||
const int bend = binhead[ibin + binend[k]];
|
||||
for (int jj = bstart; jj < bend; jj++) {
|
||||
const int j = binpacked[jj];
|
||||
if (j <= i) continue;
|
||||
|
||||
jtype = x[j].w;
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
||||
#endif
|
||||
|
||||
delx = xtmp - x[j].x;
|
||||
dely = ytmp - x[j].y;
|
||||
delz = ztmp - x[j].z;
|
||||
rsq = delx * delx + dely * dely + delz * delz;
|
||||
if (rsq <= cutneighsq[ioffset + jtype]) {
|
||||
if (j < nlocal) {
|
||||
if (need_ic) {
|
||||
int no_special;
|
||||
ominimum_image_check(no_special, delx, dely, delz);
|
||||
if (no_special)
|
||||
neighptr[n++] = -j - 1;
|
||||
else
|
||||
neighptr[n++] = j;
|
||||
} else
|
||||
neighptr[n++] = j;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j < lmin) lmin = j;
|
||||
if (j > lmax) lmax = j;
|
||||
#endif
|
||||
} else {
|
||||
if (need_ic) {
|
||||
int no_special;
|
||||
ominimum_image_check(no_special, delx, dely, delz);
|
||||
if (no_special)
|
||||
neighptr[n2++] = -j - 1;
|
||||
else
|
||||
neighptr[n2++] = j;
|
||||
} else
|
||||
neighptr[n2++] = j;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j < gmin) gmin = j;
|
||||
if (j > gmax) gmax = j;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
ilist[i] = i;
|
||||
|
||||
cnumneigh[i] = ct;
|
||||
if (n > maxnbors) *overflow = 1;
|
||||
for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
|
||||
|
||||
const int edge = (n % pad_width);
|
||||
if (edge) {
|
||||
const int pad_end = n + (pad_width - edge);
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count min=1, max=15, avg=8
|
||||
#endif
|
||||
for ( ; n < pad_end; n++)
|
||||
neighptr[n] = nall;
|
||||
}
|
||||
numneigh[i] = n;
|
||||
while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
|
||||
ct += n;
|
||||
neighptr += n;
|
||||
if (ct + n + maxnbors > list_size) {
|
||||
*overflow = 1;
|
||||
ct = (ifrom + tid) * maxnbors;
|
||||
}
|
||||
}
|
||||
|
||||
if (*overflow == 1)
|
||||
for (int i = ifrom; i < ito; i++)
|
||||
numneigh[i] = 0;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (separate_buffers) {
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp critical
|
||||
#endif
|
||||
{
|
||||
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
|
||||
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
|
||||
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
|
||||
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
|
||||
}
|
||||
#pragma omp barrier
|
||||
}
|
||||
|
||||
int ghost_offset = 0, nall_offset = nall;
|
||||
if (separate_buffers) {
|
||||
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
|
||||
if (nghost < 0) nghost = 0;
|
||||
if (offload) {
|
||||
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
|
||||
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
|
||||
} else {
|
||||
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
|
||||
nall_offset = nlocal + nghost;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (molecular) {
|
||||
for (int i = ifrom; i < ito; ++i) {
|
||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||
const int jnum = numneigh[i];
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
const int j = jlist[jj];
|
||||
if (need_ic && j < 0) {
|
||||
which = 0;
|
||||
jlist[jj] = -j - 1;
|
||||
} else
|
||||
ofind_special(which, special, nspecial, i, tag[j]);
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j >= nlocal) {
|
||||
if (j == nall)
|
||||
jlist[jj] = nall_offset;
|
||||
else if (which)
|
||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||
else jlist[jj]-=ghost_offset;
|
||||
} else
|
||||
#endif
|
||||
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
else if (separate_buffers) {
|
||||
for (int i = ifrom; i < ito; ++i) {
|
||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||
const int jnum = numneigh[i];
|
||||
int jj = 0;
|
||||
for (jj = 0; jj < jnum; jj++)
|
||||
if (jlist[jj] >= nlocal) break;
|
||||
while (jj < jnum) {
|
||||
if (jlist[jj] == nall) jlist[jj] = nall_offset;
|
||||
else jlist[jj] -= ghost_offset;
|
||||
jj++;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} // end omp
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
#endif
|
||||
} // end offload
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload) {
|
||||
_fix->stop_watch(TIME_OFFLOAD_LATENCY);
|
||||
_fix->start_watch(TIME_HOST_NEIGHBOR);
|
||||
for (int n = 0; n < aend; n++) {
|
||||
ilist[n] = n;
|
||||
numneigh[n] = 0;
|
||||
}
|
||||
} else {
|
||||
for (int i = astart; i < aend; i++)
|
||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
||||
if (separate_buffers) {
|
||||
_fix->start_watch(TIME_PACK);
|
||||
_fix->set_neighbor_host_sizes();
|
||||
buffers->pack_sep_from_single(_fix->host_min_local(),
|
||||
_fix->host_used_local(),
|
||||
_fix->host_min_ghost(),
|
||||
_fix->host_used_ghost());
|
||||
_fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
}
|
||||
#else
|
||||
for (int i = astart; i < aend; i++)
|
||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
||||
#endif
|
||||
}
|
||||
@ -1,52 +0,0 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef NPAIR_CLASS
|
||||
|
||||
NPairStyle(half/bin/newtoff/intel,
|
||||
NPairHalfBinNewtoffIntel,
|
||||
NP_HALF | NP_BIN | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_INTEL)
|
||||
|
||||
#else
|
||||
|
||||
#ifndef LMP_NPAIR_HALF_BIN_NEWTOFF_INTEL_H
|
||||
#define LMP_NPAIR_HALF_BIN_NEWTOFF_INTEL_H
|
||||
|
||||
#include "npair_intel.h"
|
||||
#include "fix_intel.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class NPairHalfBinNewtoffIntel : public NPairIntel {
|
||||
public:
|
||||
NPairHalfBinNewtoffIntel(class LAMMPS *);
|
||||
~NPairHalfBinNewtoffIntel() {}
|
||||
void build(class NeighList *);
|
||||
|
||||
private:
|
||||
template <class flt_t, class acc_t>
|
||||
void hbnni(NeighList *, IntelBuffers<flt_t,acc_t> *);
|
||||
template <class flt_t, class acc_t, int>
|
||||
void hbnni(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int,
|
||||
const int);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* ERROR/WARNING messages:
|
||||
|
||||
|
||||
*/
|
||||
@ -75,536 +75,32 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
int need_ic = 0;
|
||||
if (atom->molecular)
|
||||
dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
|
||||
neighbor->cutneighmax);
|
||||
neighbor->cutneighmax);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (need_ic) {
|
||||
if (offload_noghost) {
|
||||
hbni<flt_t,acc_t,1,1>(1, list, buffers, 0, off_end);
|
||||
hbni<flt_t,acc_t,1,1>(0, list, buffers, host_start, nlocal, off_end);
|
||||
bin_newton<flt_t,acc_t,1,1,0,0,0>(1, list, buffers, 0, off_end);
|
||||
bin_newton<flt_t,acc_t,1,1,0,0,0>(0, list, buffers, host_start, nlocal,
|
||||
off_end);
|
||||
} else {
|
||||
hbni<flt_t,acc_t,0,1>(1, list, buffers, 0, off_end);
|
||||
hbni<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
|
||||
bin_newton<flt_t,acc_t,0,1,0,0,0>(1, list, buffers, 0, off_end);
|
||||
bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
|
||||
}
|
||||
} else {
|
||||
if (offload_noghost) {
|
||||
hbni<flt_t,acc_t,1,0>(1, list, buffers, 0, off_end);
|
||||
hbni<flt_t,acc_t,1,0>(0, list, buffers, host_start, nlocal, off_end);
|
||||
bin_newton<flt_t,acc_t,1,0,0,0,0>(1, list, buffers, 0, off_end);
|
||||
bin_newton<flt_t,acc_t,1,0,0,0,0>(0, list, buffers, host_start, nlocal,
|
||||
off_end);
|
||||
} else {
|
||||
hbni<flt_t,acc_t,0,0>(1, list, buffers, 0, off_end);
|
||||
hbni<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
|
||||
bin_newton<flt_t,acc_t,0,0,0,0,0>(1, list, buffers, 0, off_end);
|
||||
bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal);
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (need_ic)
|
||||
hbni<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
|
||||
bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
|
||||
else
|
||||
hbni<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class flt_t, class acc_t, int offload_noghost, int need_ic>
|
||||
void NPairHalfBinNewtonIntel::
|
||||
hbni(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const int astart, const int aend, const int offload_end) {
|
||||
|
||||
if (aend-astart == 0) return;
|
||||
|
||||
const int nall = atom->nlocal + atom->nghost;
|
||||
int pad = 1;
|
||||
int nall_t = nall;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload_noghost && offload) nall_t = atom->nlocal;
|
||||
if (offload) {
|
||||
if (INTEL_MIC_NBOR_PAD > 1)
|
||||
pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
||||
} else
|
||||
#endif
|
||||
if (INTEL_NBOR_PAD > 1)
|
||||
pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
||||
const int pad_width = pad;
|
||||
|
||||
const ATOM_T * _noalias const x = buffers->get_x();
|
||||
int * _noalias const firstneigh = buffers->firstneigh(list);
|
||||
const int e_nall = nall_t;
|
||||
|
||||
const int molecular = atom->molecular;
|
||||
int *ns = NULL;
|
||||
tagint *s = NULL;
|
||||
int tag_size = 0, special_size;
|
||||
if (buffers->need_tag()) tag_size = e_nall;
|
||||
if (molecular) {
|
||||
s = atom->special[0];
|
||||
ns = atom->nspecial[0];
|
||||
special_size = aend;
|
||||
} else {
|
||||
s = &buffers->_special_holder;
|
||||
ns = &buffers->_nspecial_holder;
|
||||
special_size = 0;
|
||||
}
|
||||
const tagint * _noalias const special = s;
|
||||
const int * _noalias const nspecial = ns;
|
||||
const int maxspecial = atom->maxspecial;
|
||||
const tagint * _noalias const tag = atom->tag;
|
||||
|
||||
int * _noalias const ilist = list->ilist;
|
||||
int * _noalias numneigh = list->numneigh;
|
||||
int * _noalias const cnumneigh = buffers->cnumneigh(list);
|
||||
const int nstencil = this->nstencil;
|
||||
const int * _noalias const stencil = this->stencil;
|
||||
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
|
||||
const int ntypes = atom->ntypes + 1;
|
||||
const int nlocal = atom->nlocal;
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
int * const mask = atom->mask;
|
||||
tagint * const molecule = atom->molecule;
|
||||
#endif
|
||||
|
||||
int tnum;
|
||||
int *overflow;
|
||||
double *timer_compute;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload) {
|
||||
timer_compute = _fix->off_watch_neighbor();
|
||||
tnum = buffers->get_off_threads();
|
||||
overflow = _fix->get_off_overflow_flag();
|
||||
_fix->stop_watch(TIME_HOST_NEIGHBOR);
|
||||
_fix->start_watch(TIME_OFFLOAD_LATENCY);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
tnum = comm->nthreads;
|
||||
overflow = _fix->get_overflow_flag();
|
||||
}
|
||||
const int nthreads = tnum;
|
||||
const int maxnbors = buffers->get_max_nbors();
|
||||
int * _noalias const atombin = buffers->get_atombin();
|
||||
const int * _noalias const binpacked = buffers->get_binpacked();
|
||||
|
||||
const int xperiodic = domain->xperiodic;
|
||||
const int yperiodic = domain->yperiodic;
|
||||
const int zperiodic = domain->zperiodic;
|
||||
const flt_t xprd_half = domain->xprd_half;
|
||||
const flt_t yprd_half = domain->yprd_half;
|
||||
const flt_t zprd_half = domain->zprd_half;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
const int * _noalias const binhead = this->binhead;
|
||||
const int * _noalias const bins = this->bins;
|
||||
const int cop = _fix->coprocessor_number();
|
||||
const int separate_buffers = _fix->separate_buffers();
|
||||
#pragma offload target(mic:cop) if(offload) \
|
||||
in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
|
||||
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
|
||||
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
|
||||
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
|
||||
in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
|
||||
in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
|
||||
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
|
||||
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
|
||||
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
|
||||
out(numneigh:length(0) alloc_if(0) free_if(0)) \
|
||||
in(ilist:length(0) alloc_if(0) free_if(0)) \
|
||||
in(atombin:length(aend) alloc_if(0) free_if(0)) \
|
||||
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
|
||||
in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pad_width) \
|
||||
in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \
|
||||
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
|
||||
out(overflow:length(5) alloc_if(0) free_if(0)) \
|
||||
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
||||
signal(tag)
|
||||
#endif
|
||||
{
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime();
|
||||
#endif
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
overflow[LMP_LOCAL_MIN] = astart;
|
||||
overflow[LMP_LOCAL_MAX] = aend - 1;
|
||||
overflow[LMP_GHOST_MIN] = e_nall;
|
||||
overflow[LMP_GHOST_MAX] = -1;
|
||||
#endif
|
||||
|
||||
int nstencilp = 0;
|
||||
int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
|
||||
for (int k = 0; k < nstencil; k++) {
|
||||
binstart[nstencilp] = stencil[k];
|
||||
int end = stencil[k] + 1;
|
||||
for (int kk = k + 1; kk < nstencil; kk++) {
|
||||
if (stencil[kk-1]+1 == stencil[kk]) {
|
||||
end++;
|
||||
k++;
|
||||
} else break;
|
||||
}
|
||||
binend[nstencilp] = end;
|
||||
nstencilp++;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(numneigh, overflow, nstencilp, binstart, binend)
|
||||
#endif
|
||||
{
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
|
||||
#endif
|
||||
|
||||
const int num = aend - astart;
|
||||
int tid, ifrom, ito;
|
||||
|
||||
#ifdef OUTER_CHUNK
|
||||
const int swidth = ip_simd::SIMD_type<flt_t>::width();
|
||||
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, swidth);
|
||||
ifrom += astart;
|
||||
ito += astart;
|
||||
int e_ito = ito;
|
||||
if (ito == num) {
|
||||
int imod = ito % swidth;
|
||||
if (imod) e_ito += swidth - imod;
|
||||
}
|
||||
const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
|
||||
#else
|
||||
const int swidth = 1;
|
||||
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
|
||||
ifrom += astart;
|
||||
ito += astart;
|
||||
const int list_size = (ito + tid * 2 + 2) * maxnbors;
|
||||
#endif
|
||||
|
||||
int which;
|
||||
|
||||
int pack_offset = maxnbors * swidth;
|
||||
int ct = (ifrom + tid * 2) * maxnbors;
|
||||
int *neighptr = firstneigh + ct;
|
||||
const int obound = pack_offset + maxnbors * 2;
|
||||
|
||||
int max_chunk = 0;
|
||||
int lane = 0;
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
const flt_t xtmp = x[i].x;
|
||||
const flt_t ytmp = x[i].y;
|
||||
const flt_t ztmp = x[i].z;
|
||||
const int itype = x[i].w;
|
||||
const int ioffset = ntypes * itype;
|
||||
|
||||
// loop over rest of atoms in i's bin, ghosts are at end of linked list
|
||||
// if j is owned atom, store it, since j is beyond i in linked list
|
||||
// if j is ghost, only store if j coords are "above/to the right" of i
|
||||
|
||||
int raw_count = pack_offset;
|
||||
for (int j = bins[i]; j >= 0; j = bins[j]) {
|
||||
if (j >= nlocal) {
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload_noghost && offload) continue;
|
||||
#endif
|
||||
if (x[j].z < ztmp) continue;
|
||||
if (x[j].z == ztmp) {
|
||||
if (x[j].y < ytmp) continue;
|
||||
if (x[j].y == ytmp && x[j].x < xtmp) continue;
|
||||
}
|
||||
}
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
else if (offload_noghost && i < offload_end) continue;
|
||||
#endif
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (exclude) {
|
||||
const int jtype = x[j].w;
|
||||
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
neighptr[raw_count++] = j;
|
||||
}
|
||||
|
||||
// loop over all atoms in other bins in stencil, store every pair
|
||||
|
||||
const int ibin = atombin[i];
|
||||
if (exclude) {
|
||||
for (int k = 0; k < nstencilp; k++) {
|
||||
const int bstart = binhead[ibin + binstart[k]];
|
||||
const int bend = binhead[ibin + binend[k]];
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
#ifdef INTEL_VMASK
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++) {
|
||||
const int j = binpacked[jj];
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload_noghost) {
|
||||
if (j < nlocal) {
|
||||
if (i < offload_end) continue;
|
||||
} else if (offload) continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
const int jtype = x[j].w;
|
||||
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
||||
#endif
|
||||
|
||||
neighptr[raw_count++] = j;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int k = 0; k < nstencilp; k++) {
|
||||
const int bstart = binhead[ibin + binstart[k]];
|
||||
const int bend = binhead[ibin + binend[k]];
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
#ifdef INTEL_VMASK
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++) {
|
||||
const int j = binpacked[jj];
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload_noghost) {
|
||||
if (j < nlocal) {
|
||||
if (i < offload_end) continue;
|
||||
} else if (offload) continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
neighptr[raw_count++] = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (raw_count > obound) *overflow = 1;
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
|
||||
#if __INTEL_COMPILER+0 > 1499
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
|
||||
#endif
|
||||
#else
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
for (int u = pack_offset; u < raw_count; u++) {
|
||||
int j = neighptr[u];
|
||||
const flt_t delx = xtmp - x[j].x;
|
||||
const flt_t dely = ytmp - x[j].y;
|
||||
const flt_t delz = ztmp - x[j].z;
|
||||
const int jtype = x[j].w;
|
||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||
if (rsq > cutneighsq[ioffset + jtype])
|
||||
neighptr[u] = e_nall;
|
||||
else {
|
||||
if (need_ic) {
|
||||
int no_special;
|
||||
ominimum_image_check(no_special, delx, dely, delz);
|
||||
if (no_special)
|
||||
neighptr[u] = -j - 1;
|
||||
}
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j < nlocal) {
|
||||
if (j < vlmin) vlmin = j;
|
||||
if (j > vlmax) vlmax = j;
|
||||
} else {
|
||||
if (j < vgmin) vgmin = j;
|
||||
if (j > vgmax) vgmax = j;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
lmin = MIN(lmin,vlmin);
|
||||
gmin = MIN(gmin,vgmin);
|
||||
lmax = MAX(lmax,vlmax);
|
||||
gmax = MAX(gmax,vgmax);
|
||||
#endif
|
||||
|
||||
int n = lane, n2 = pack_offset;
|
||||
for (int u = pack_offset; u < raw_count; u++) {
|
||||
const int j = neighptr[u];
|
||||
int pj = j;
|
||||
if (pj < e_nall) {
|
||||
if (need_ic)
|
||||
if (pj < 0) pj = -pj - 1;
|
||||
|
||||
if (pj < nlocal) {
|
||||
neighptr[n] = j;
|
||||
n += swidth;
|
||||
} else
|
||||
neighptr[n2++] = j;
|
||||
}
|
||||
}
|
||||
int ns = (n - lane) / swidth;
|
||||
for (int u = pack_offset; u < n2; u++) {
|
||||
neighptr[n] = neighptr[u];
|
||||
n += swidth;
|
||||
}
|
||||
|
||||
ilist[i] = i;
|
||||
cnumneigh[i] = ct + lane;
|
||||
ns += n2 - pack_offset;
|
||||
#ifndef OUTER_CHUNK
|
||||
int edge = (ns % pad_width);
|
||||
if (edge) {
|
||||
const int pad_end = ns + (pad_width - edge);
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count min=1, max=15, avg=8
|
||||
#endif
|
||||
for ( ; ns < pad_end; ns++)
|
||||
neighptr[ns] = e_nall;
|
||||
}
|
||||
#endif
|
||||
numneigh[i] = ns;
|
||||
|
||||
#ifdef OUTER_CHUNK
|
||||
if (ns > max_chunk) max_chunk = ns;
|
||||
lane++;
|
||||
if (lane == swidth) {
|
||||
ct += max_chunk * swidth;
|
||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||
int edge = (ct % alignb);
|
||||
if (edge) ct += alignb - edge;
|
||||
neighptr = firstneigh + ct;
|
||||
max_chunk = 0;
|
||||
pack_offset = maxnbors * swidth;
|
||||
lane = 0;
|
||||
if (ct + obound > list_size) {
|
||||
if (i < ito - 1) {
|
||||
*overflow = 1;
|
||||
ct = (ifrom + tid * 2) * maxnbors;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
ct += ns;
|
||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||
edge = (ct % alignb);
|
||||
if (edge) ct += alignb - edge;
|
||||
neighptr = firstneigh + ct;
|
||||
if (ct + obound > list_size) {
|
||||
if (i < ito - 1) {
|
||||
*overflow = 1;
|
||||
ct = (ifrom + tid * 2) * maxnbors;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
if (*overflow == 1)
|
||||
for (int i = ifrom; i < ito; i++)
|
||||
numneigh[i] = 0;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (separate_buffers) {
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp critical
|
||||
#endif
|
||||
{
|
||||
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
|
||||
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
|
||||
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
|
||||
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
|
||||
}
|
||||
#pragma omp barrier
|
||||
}
|
||||
|
||||
int ghost_offset = 0, nall_offset = e_nall;
|
||||
if (separate_buffers) {
|
||||
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
|
||||
if (nghost < 0) nghost = 0;
|
||||
if (offload) {
|
||||
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
|
||||
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
|
||||
} else {
|
||||
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
|
||||
nall_offset = nlocal + nghost;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (molecular) {
|
||||
for (int i = ifrom; i < ito; ++i) {
|
||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||
const int jnum = numneigh[i];
|
||||
#ifndef OUTER_CHUNK
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
#else
|
||||
const int trip = jnum * swidth;
|
||||
for (int jj = 0; jj < trip; jj+= swidth) {
|
||||
#endif
|
||||
const int j = jlist[jj];
|
||||
if (need_ic && j < 0) {
|
||||
which = 0;
|
||||
jlist[jj] = -j - 1;
|
||||
} else
|
||||
ofind_special(which, special, nspecial, i, tag[j]);
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j >= nlocal) {
|
||||
if (j == e_nall)
|
||||
jlist[jj] = nall_offset;
|
||||
else if (which)
|
||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||
else jlist[jj]-=ghost_offset;
|
||||
} else
|
||||
#endif
|
||||
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
else if (separate_buffers) {
|
||||
for (int i = ifrom; i < ito; ++i) {
|
||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||
const int jnum = numneigh[i];
|
||||
int jj = 0;
|
||||
for (jj = 0; jj < jnum; jj++)
|
||||
if (jlist[jj] >= nlocal) break;
|
||||
while (jj < jnum) {
|
||||
if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
|
||||
else jlist[jj] -= ghost_offset;
|
||||
jj++;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} // end omp
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
#endif
|
||||
} // end offload
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload) {
|
||||
_fix->stop_watch(TIME_OFFLOAD_LATENCY);
|
||||
_fix->start_watch(TIME_HOST_NEIGHBOR);
|
||||
for (int n = 0; n < aend; n++) {
|
||||
ilist[n] = n;
|
||||
numneigh[n] = 0;
|
||||
}
|
||||
} else {
|
||||
for (int i = astart; i < aend; i++)
|
||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
||||
if (separate_buffers) {
|
||||
_fix->start_watch(TIME_PACK);
|
||||
_fix->set_neighbor_host_sizes();
|
||||
buffers->pack_sep_from_single(_fix->host_min_local(),
|
||||
_fix->host_used_local(),
|
||||
_fix->host_min_ghost(),
|
||||
_fix->host_used_ghost());
|
||||
_fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
}
|
||||
#else
|
||||
for (int i = astart; i < aend; i++)
|
||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
||||
bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -36,9 +36,6 @@ class NPairHalfBinNewtonIntel : public NPairIntel {
|
||||
private:
|
||||
template <class flt_t, class acc_t>
|
||||
void hbni(NeighList *, IntelBuffers<flt_t,acc_t> *);
|
||||
template <class flt_t, class acc_t, int, int>
|
||||
void hbni(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int,
|
||||
const int, const int offload_end = 0);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@ -75,439 +75,32 @@ hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
int need_ic = 0;
|
||||
if (atom->molecular)
|
||||
dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
|
||||
neighbor->cutneighmax);
|
||||
neighbor->cutneighmax);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (need_ic) {
|
||||
if (offload_noghost) {
|
||||
hbnti<flt_t,acc_t,1,1>(1, list, buffers, 0, off_end);
|
||||
hbnti<flt_t,acc_t,1,1>(0, list, buffers, host_start, nlocal, off_end);
|
||||
bin_newton<flt_t,acc_t,1,1,0,1,0>(1, list, buffers, 0, off_end);
|
||||
bin_newton<flt_t,acc_t,1,1,0,1,0>(0, list, buffers, host_start, nlocal,
|
||||
off_end);
|
||||
} else {
|
||||
hbnti<flt_t,acc_t,0,1>(1, list, buffers, 0, off_end);
|
||||
hbnti<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
|
||||
bin_newton<flt_t,acc_t,0,1,0,1,0>(1, list, buffers, 0, off_end);
|
||||
bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal);
|
||||
}
|
||||
} else {
|
||||
if (offload_noghost) {
|
||||
hbnti<flt_t,acc_t,1,0>(1, list, buffers, 0, off_end);
|
||||
hbnti<flt_t,acc_t,1,0>(0, list, buffers, host_start, nlocal, off_end);
|
||||
bin_newton<flt_t,acc_t,1,0,0,1,0>(1, list, buffers, 0, off_end);
|
||||
bin_newton<flt_t,acc_t,1,0,0,1,0>(0, list, buffers, host_start, nlocal,
|
||||
off_end);
|
||||
} else {
|
||||
hbnti<flt_t,acc_t,0,0>(1, list, buffers, 0, off_end);
|
||||
hbnti<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
|
||||
bin_newton<flt_t,acc_t,0,0,0,1,0>(1, list, buffers, 0, off_end);
|
||||
bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal);
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (need_ic)
|
||||
hbnti<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
|
||||
bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal);
|
||||
else
|
||||
hbnti<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class flt_t, class acc_t, int offload_noghost, int need_ic>
|
||||
void NPairHalfBinNewtonTriIntel::
|
||||
hbnti(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const int astart, const int aend, const int offload_end) {
|
||||
if (aend-astart == 0) return;
|
||||
|
||||
const int nall = atom->nlocal + atom->nghost;
|
||||
int pad = 1;
|
||||
int nall_t = nall;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload_noghost && offload) nall_t = atom->nlocal;
|
||||
if (offload) {
|
||||
if (INTEL_MIC_NBOR_PAD > 1)
|
||||
pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
||||
} else
|
||||
#endif
|
||||
if (INTEL_NBOR_PAD > 1)
|
||||
pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
||||
const int pad_width = pad;
|
||||
|
||||
const ATOM_T * _noalias const x = buffers->get_x();
|
||||
int * _noalias const firstneigh = buffers->firstneigh(list);
|
||||
const int e_nall = nall_t;
|
||||
|
||||
const int molecular = atom->molecular;
|
||||
int *ns = NULL;
|
||||
tagint *s = NULL;
|
||||
int tag_size = 0, special_size;
|
||||
if (buffers->need_tag()) tag_size = e_nall;
|
||||
if (molecular) {
|
||||
s = atom->special[0];
|
||||
ns = atom->nspecial[0];
|
||||
special_size = aend;
|
||||
} else {
|
||||
s = &buffers->_special_holder;
|
||||
ns = &buffers->_nspecial_holder;
|
||||
special_size = 0;
|
||||
}
|
||||
const tagint * _noalias const special = s;
|
||||
const int * _noalias const nspecial = ns;
|
||||
const int maxspecial = atom->maxspecial;
|
||||
const tagint * _noalias const tag = atom->tag;
|
||||
|
||||
int * _noalias const ilist = list->ilist;
|
||||
int * _noalias numneigh = list->numneigh;
|
||||
int * _noalias const cnumneigh = buffers->cnumneigh(list);
|
||||
const int nstencil = this->nstencil;
|
||||
const int * _noalias const stencil = this->stencil;
|
||||
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
|
||||
const int ntypes = atom->ntypes + 1;
|
||||
const int nlocal = atom->nlocal;
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
int * const mask = atom->mask;
|
||||
tagint * const molecule = atom->molecule;
|
||||
#endif
|
||||
|
||||
int tnum;
|
||||
int *overflow;
|
||||
double *timer_compute;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload) {
|
||||
timer_compute = _fix->off_watch_neighbor();
|
||||
tnum = buffers->get_off_threads();
|
||||
overflow = _fix->get_off_overflow_flag();
|
||||
_fix->stop_watch(TIME_HOST_NEIGHBOR);
|
||||
_fix->start_watch(TIME_OFFLOAD_LATENCY);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
tnum = comm->nthreads;
|
||||
overflow = _fix->get_overflow_flag();
|
||||
}
|
||||
const int nthreads = tnum;
|
||||
const int maxnbors = buffers->get_max_nbors();
|
||||
int * _noalias const atombin = buffers->get_atombin();
|
||||
const int * _noalias const binpacked = buffers->get_binpacked();
|
||||
|
||||
const int xperiodic = domain->xperiodic;
|
||||
const int yperiodic = domain->yperiodic;
|
||||
const int zperiodic = domain->zperiodic;
|
||||
const flt_t xprd_half = domain->xprd_half;
|
||||
const flt_t yprd_half = domain->yprd_half;
|
||||
const flt_t zprd_half = domain->zprd_half;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
const int * _noalias const binhead = this->binhead;
|
||||
const int * _noalias const bins = this->bins;
|
||||
const int cop = _fix->coprocessor_number();
|
||||
const int separate_buffers = _fix->separate_buffers();
|
||||
#pragma offload target(mic:cop) if(offload) \
|
||||
in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
|
||||
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
|
||||
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
|
||||
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
|
||||
in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
|
||||
in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
|
||||
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
|
||||
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
|
||||
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
|
||||
out(numneigh:length(0) alloc_if(0) free_if(0)) \
|
||||
in(ilist:length(0) alloc_if(0) free_if(0)) \
|
||||
in(atombin:length(aend) alloc_if(0) free_if(0)) \
|
||||
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
|
||||
in(maxnbors,nthreads,maxspecial,nstencil,offload_end,pad_width,e_nall) \
|
||||
in(offload,separate_buffers, astart, aend, nlocal, molecular, ntypes) \
|
||||
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
|
||||
out(overflow:length(5) alloc_if(0) free_if(0)) \
|
||||
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
||||
signal(tag)
|
||||
#endif
|
||||
{
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime();
|
||||
#endif
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
overflow[LMP_LOCAL_MIN] = astart;
|
||||
overflow[LMP_LOCAL_MAX] = aend - 1;
|
||||
overflow[LMP_GHOST_MIN] = e_nall;
|
||||
overflow[LMP_GHOST_MAX] = -1;
|
||||
#endif
|
||||
|
||||
int nstencilp = 0;
|
||||
int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
|
||||
for (int k = 0; k < nstencil; k++) {
|
||||
binstart[nstencilp] = stencil[k];
|
||||
int end = stencil[k] + 1;
|
||||
for (int kk = k + 1; kk < nstencil; kk++) {
|
||||
if (stencil[kk-1]+1 == stencil[kk]) {
|
||||
end++;
|
||||
k++;
|
||||
} else break;
|
||||
}
|
||||
binend[nstencilp] = end;
|
||||
nstencilp++;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(numneigh, overflow, nstencilp, binstart, binend)
|
||||
#endif
|
||||
{
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
|
||||
#endif
|
||||
|
||||
const int num = aend - astart;
|
||||
int tid, ifrom, ito;
|
||||
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
|
||||
ifrom += astart;
|
||||
ito += astart;
|
||||
|
||||
int which;
|
||||
|
||||
const int list_size = (ito + tid * 2 + 2) * maxnbors;
|
||||
int ct = (ifrom + tid * 2) * maxnbors;
|
||||
int *neighptr = firstneigh + ct;
|
||||
const int obound = maxnbors * 3;
|
||||
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
const flt_t xtmp = x[i].x;
|
||||
const flt_t ytmp = x[i].y;
|
||||
const flt_t ztmp = x[i].z;
|
||||
const int itype = x[i].w;
|
||||
const int ioffset = ntypes * itype;
|
||||
|
||||
// loop over all atoms in bins in stencil
|
||||
// pairs for atoms j "below" i are excluded
|
||||
// below = lower z or (equal z and lower y) or (equal zy and lower x)
|
||||
// (equal zyx and j <= i)
|
||||
// latter excludes self-self interaction but allows superposed atoms
|
||||
|
||||
const int ibin = atombin[i];
|
||||
|
||||
int raw_count = maxnbors;
|
||||
for (int k = 0; k < nstencilp; k++) {
|
||||
const int bstart = binhead[ibin + binstart[k]];
|
||||
const int bend = binhead[ibin + binend[k]];
|
||||
for (int jj = bstart; jj < bend; jj++) {
|
||||
const int j = binpacked[jj];
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload_noghost) {
|
||||
if (j < nlocal) {
|
||||
if (i < offload_end) continue;
|
||||
} else if (offload) continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (x[j].z < ztmp) continue;
|
||||
if (x[j].z == ztmp) {
|
||||
if (x[j].y < ytmp) continue;
|
||||
if (x[j].y == ytmp) {
|
||||
if (x[j].x < xtmp) continue;
|
||||
if (x[j].x == xtmp && j <= i) continue;
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (exclude) {
|
||||
const int jtype = x[j].w;
|
||||
if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
neighptr[raw_count++] = j;
|
||||
}
|
||||
}
|
||||
if (raw_count > obound)
|
||||
*overflow = 1;
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
|
||||
#if __INTEL_COMPILER+0 > 1499
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
|
||||
#endif
|
||||
#else
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
#endif
|
||||
for (int u = maxnbors; u < raw_count; u++) {
|
||||
int j = neighptr[u];
|
||||
const flt_t delx = xtmp - x[j].x;
|
||||
const flt_t dely = ytmp - x[j].y;
|
||||
const flt_t delz = ztmp - x[j].z;
|
||||
const int jtype = x[j].w;
|
||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||
if (rsq > cutneighsq[ioffset + jtype])
|
||||
neighptr[u] = e_nall;
|
||||
else {
|
||||
if (need_ic) {
|
||||
int no_special;
|
||||
ominimum_image_check(no_special, delx, dely, delz);
|
||||
if (no_special)
|
||||
neighptr[u] = -j - 1;
|
||||
}
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j < nlocal) {
|
||||
if (j < vlmin) vlmin = j;
|
||||
if (j > vlmax) vlmax = j;
|
||||
} else {
|
||||
if (j < vgmin) vgmin = j;
|
||||
if (j > vgmax) vgmax = j;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
int n = 0, n2 = maxnbors;
|
||||
for (int u = maxnbors; u < raw_count; u++) {
|
||||
const int j = neighptr[u];
|
||||
int pj = j;
|
||||
if (pj < e_nall) {
|
||||
if (need_ic)
|
||||
if (pj < 0) pj = -pj - 1;
|
||||
|
||||
if (pj < nlocal)
|
||||
neighptr[n++] = j;
|
||||
else
|
||||
neighptr[n2++] = j;
|
||||
}
|
||||
}
|
||||
int ns = n;
|
||||
for (int u = maxnbors; u < n2; u++)
|
||||
neighptr[n++] = neighptr[u];
|
||||
|
||||
ilist[i] = i;
|
||||
cnumneigh[i] = ct;
|
||||
ns += n2 - maxnbors;
|
||||
|
||||
int edge = (ns % pad_width);
|
||||
if (edge) {
|
||||
const int pad_end = ns + (pad_width - edge);
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count min=1, max=15, avg=8
|
||||
#endif
|
||||
for ( ; ns < pad_end; ns++)
|
||||
neighptr[ns] = e_nall;
|
||||
}
|
||||
numneigh[i] = ns;
|
||||
|
||||
ct += ns;
|
||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||
edge = (ct % alignb);
|
||||
if (edge) ct += alignb - edge;
|
||||
neighptr = firstneigh + ct;
|
||||
if (ct + obound > list_size) {
|
||||
if (i < ito - 1) {
|
||||
*overflow = 1;
|
||||
ct = (ifrom + tid * 2) * maxnbors;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (*overflow == 1)
|
||||
for (int i = ifrom; i < ito; i++)
|
||||
numneigh[i] = 0;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (separate_buffers) {
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp critical
|
||||
#endif
|
||||
{
|
||||
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
|
||||
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
|
||||
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
|
||||
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
|
||||
}
|
||||
#pragma omp barrier
|
||||
}
|
||||
|
||||
int ghost_offset = 0, nall_offset = e_nall;
|
||||
if (separate_buffers) {
|
||||
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
|
||||
if (nghost < 0) nghost = 0;
|
||||
if (offload) {
|
||||
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
|
||||
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
|
||||
} else {
|
||||
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
|
||||
nall_offset = nlocal + nghost;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (molecular) {
|
||||
for (int i = ifrom; i < ito; ++i) {
|
||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||
const int jnum = numneigh[i];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
const int j = jlist[jj];
|
||||
if (need_ic && j < 0) {
|
||||
which = 0;
|
||||
jlist[jj] = -j - 1;
|
||||
} else
|
||||
ofind_special(which, special, nspecial, i, tag[j]);
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j >= nlocal) {
|
||||
if (j == e_nall)
|
||||
jlist[jj] = nall_offset;
|
||||
else if (which)
|
||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||
else jlist[jj]-=ghost_offset;
|
||||
} else
|
||||
#endif
|
||||
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
else if (separate_buffers) {
|
||||
for (int i = ifrom; i < ito; ++i) {
|
||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||
const int jnum = numneigh[i];
|
||||
int jj = 0;
|
||||
for (jj = 0; jj < jnum; jj++)
|
||||
if (jlist[jj] >= nlocal) break;
|
||||
while (jj < jnum) {
|
||||
if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
|
||||
else jlist[jj] -= ghost_offset;
|
||||
jj++;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} // end omp
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
#endif
|
||||
} // end offload
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload) {
|
||||
_fix->stop_watch(TIME_OFFLOAD_LATENCY);
|
||||
_fix->start_watch(TIME_HOST_NEIGHBOR);
|
||||
for (int n = 0; n < aend; n++) {
|
||||
ilist[n] = n;
|
||||
numneigh[n] = 0;
|
||||
}
|
||||
} else {
|
||||
for (int i = astart; i < aend; i++)
|
||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
||||
if (separate_buffers) {
|
||||
_fix->start_watch(TIME_PACK);
|
||||
_fix->set_neighbor_host_sizes();
|
||||
buffers->pack_sep_from_single(_fix->host_min_local(),
|
||||
_fix->host_used_local(),
|
||||
_fix->host_min_ghost(),
|
||||
_fix->host_used_ghost());
|
||||
_fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
}
|
||||
#else
|
||||
for (int i = astart; i < aend; i++)
|
||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
||||
bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -36,9 +36,6 @@ class NPairHalfBinNewtonTriIntel : public NPairIntel {
|
||||
private:
|
||||
template <class flt_t, class acc_t>
|
||||
void hbnti(NeighList *, IntelBuffers<flt_t,acc_t> *);
|
||||
template <class flt_t, class acc_t, int, int>
|
||||
void hbnti(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int,
|
||||
const int, const int offload_end = 0);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@ -48,6 +48,678 @@ NPairIntel::~NPairIntel() {
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t, int offload_noghost, int need_ic,
|
||||
int FULL, int TRI, int THREE>
|
||||
void NPairIntel::bin_newton(const int offload, NeighList *list,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const int astart, const int aend,
|
||||
const int offload_end) {
|
||||
|
||||
if (aend-astart == 0) return;
|
||||
|
||||
const int nall = atom->nlocal + atom->nghost;
|
||||
int pad = 1;
|
||||
int nall_t = nall;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload_noghost && offload) nall_t = atom->nlocal;
|
||||
if (THREE == 0 && offload) {
|
||||
if (INTEL_MIC_NBOR_PAD > 1)
|
||||
pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
||||
} else
|
||||
#endif
|
||||
if (THREE == 0 && INTEL_NBOR_PAD > 1)
|
||||
pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
|
||||
const int pad_width = pad;
|
||||
const int pack_width = _fix->nbor_pack_width();
|
||||
|
||||
const ATOM_T * _noalias const x = buffers->get_x();
|
||||
int * _noalias const firstneigh = buffers->firstneigh(list);
|
||||
const int e_nall = nall_t;
|
||||
|
||||
const int molecular = atom->molecular;
|
||||
int *ns = NULL;
|
||||
tagint *s = NULL;
|
||||
int tag_size = 0, special_size;
|
||||
if (buffers->need_tag()) tag_size = e_nall;
|
||||
if (molecular) {
|
||||
s = atom->special[0];
|
||||
ns = atom->nspecial[0];
|
||||
special_size = aend;
|
||||
} else {
|
||||
s = &buffers->_special_holder;
|
||||
ns = &buffers->_nspecial_holder;
|
||||
special_size = 0;
|
||||
}
|
||||
const tagint * _noalias const special = s;
|
||||
const int * _noalias const nspecial = ns;
|
||||
const int maxspecial = atom->maxspecial;
|
||||
const tagint * _noalias const tag = atom->tag;
|
||||
|
||||
int * _noalias const ilist = list->ilist;
|
||||
int * _noalias numneigh = list->numneigh;
|
||||
int * _noalias const cnumneigh = buffers->cnumneigh(list);
|
||||
const int nstencil = this->nstencil;
|
||||
const int * _noalias const stencil = this->stencil;
|
||||
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
|
||||
const int ntypes = atom->ntypes + 1;
|
||||
const int nlocal = atom->nlocal;
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
int * const mask = atom->mask;
|
||||
tagint * const molecule = atom->molecule;
|
||||
#endif
|
||||
|
||||
int tnum;
|
||||
int *overflow;
|
||||
double *timer_compute;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload) {
|
||||
timer_compute = _fix->off_watch_neighbor();
|
||||
tnum = buffers->get_off_threads();
|
||||
overflow = _fix->get_off_overflow_flag();
|
||||
_fix->stop_watch(TIME_HOST_NEIGHBOR);
|
||||
_fix->start_watch(TIME_OFFLOAD_LATENCY);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
tnum = comm->nthreads;
|
||||
overflow = _fix->get_overflow_flag();
|
||||
}
|
||||
const int nthreads = tnum;
|
||||
const int maxnbors = buffers->get_max_nbors();
|
||||
int * _noalias const atombin = buffers->get_atombin();
|
||||
const int * _noalias const binpacked = buffers->get_binpacked();
|
||||
|
||||
const int xperiodic = domain->xperiodic;
|
||||
const int yperiodic = domain->yperiodic;
|
||||
const int zperiodic = domain->zperiodic;
|
||||
const flt_t xprd_half = domain->xprd_half;
|
||||
const flt_t yprd_half = domain->yprd_half;
|
||||
const flt_t zprd_half = domain->zprd_half;
|
||||
|
||||
flt_t * _noalias const ncachex = buffers->get_ncachex();
|
||||
flt_t * _noalias const ncachey = buffers->get_ncachey();
|
||||
flt_t * _noalias const ncachez = buffers->get_ncachez();
|
||||
int * _noalias const ncachej = buffers->get_ncachej();
|
||||
int * _noalias const ncachejtype = buffers->get_ncachejtype();
|
||||
const int ncache_stride = buffers->ncache_stride();
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
const int * _noalias const binhead = this->binhead;
|
||||
const int * _noalias const bins = this->bins;
|
||||
const int cop = _fix->coprocessor_number();
|
||||
const int separate_buffers = _fix->separate_buffers();
|
||||
#pragma offload target(mic:cop) if(offload) \
|
||||
in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
|
||||
in(tag:length(tag_size) alloc_if(0) free_if(0)) \
|
||||
in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
|
||||
in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
|
||||
in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
|
||||
in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
|
||||
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
|
||||
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
|
||||
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
|
||||
out(numneigh:length(0) alloc_if(0) free_if(0)) \
|
||||
in(ilist:length(0) alloc_if(0) free_if(0)) \
|
||||
in(atombin:length(aend) alloc_if(0) free_if(0)) \
|
||||
in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
|
||||
in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
|
||||
in(ncachejtype:length(0) alloc_if(0) free_if(0)) \
|
||||
in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
|
||||
in(pad_width,offload_end,separate_buffers,astart,aend,nlocal,molecular) \
|
||||
in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
|
||||
in(pack_width) \
|
||||
out(overflow:length(5) alloc_if(0) free_if(0)) \
|
||||
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
||||
signal(tag)
|
||||
#endif
|
||||
{
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime();
|
||||
#endif
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
overflow[LMP_LOCAL_MIN] = astart;
|
||||
overflow[LMP_LOCAL_MAX] = aend - 1;
|
||||
overflow[LMP_GHOST_MIN] = e_nall;
|
||||
overflow[LMP_GHOST_MAX] = -1;
|
||||
#endif
|
||||
|
||||
int nstencilp = 0;
|
||||
int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
|
||||
for (int k = 0; k < nstencil; k++) {
|
||||
binstart[nstencilp] = stencil[k];
|
||||
int end = stencil[k] + 1;
|
||||
for (int kk = k + 1; kk < nstencil; kk++) {
|
||||
if (stencil[kk-1]+1 == stencil[kk]) {
|
||||
end++;
|
||||
k++;
|
||||
} else break;
|
||||
}
|
||||
binend[nstencilp] = end;
|
||||
nstencilp++;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(numneigh, overflow, nstencilp, binstart, binend)
|
||||
#endif
|
||||
{
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
|
||||
#endif
|
||||
|
||||
const int num = aend - astart;
|
||||
int tid, ifrom, ito;
|
||||
|
||||
if (THREE) {
|
||||
IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
|
||||
} else {
|
||||
IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
|
||||
}
|
||||
ifrom += astart;
|
||||
ito += astart;
|
||||
int e_ito = ito;
|
||||
if (THREE && ito == num) {
|
||||
int imod = ito % pack_width;
|
||||
if (imod) e_ito += pack_width - imod;
|
||||
}
|
||||
const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
|
||||
|
||||
int which;
|
||||
|
||||
int pack_offset = maxnbors;
|
||||
if (THREE) pack_offset *= pack_width;
|
||||
int ct = (ifrom + tid * 2) * maxnbors;
|
||||
int *neighptr = firstneigh + ct;
|
||||
const int obound = pack_offset + maxnbors * 2;
|
||||
|
||||
const int toffs = tid * ncache_stride;
|
||||
flt_t * _noalias const tx = ncachex + toffs;
|
||||
flt_t * _noalias const ty = ncachey + toffs;
|
||||
flt_t * _noalias const tz = ncachez + toffs;
|
||||
int * _noalias const tj = ncachej + toffs;
|
||||
int * _noalias const tjtype = ncachejtype + toffs;
|
||||
|
||||
flt_t * _noalias itx;
|
||||
flt_t * _noalias ity;
|
||||
flt_t * _noalias itz;
|
||||
int * _noalias itj;
|
||||
int * _noalias itjtype;
|
||||
|
||||
// loop over all atoms in other bins in stencil, store every pair
|
||||
int istart, icount, ncount, oldbin = -9999999, lane, max_chunk;
|
||||
if (THREE) {
|
||||
lane = 0;
|
||||
max_chunk = 0;
|
||||
}
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
const flt_t xtmp = x[i].x;
|
||||
const flt_t ytmp = x[i].y;
|
||||
const flt_t ztmp = x[i].z;
|
||||
const int itype = x[i].w;
|
||||
tagint itag;
|
||||
if (THREE) itag = tag[i];
|
||||
const int ioffset = ntypes * itype;
|
||||
|
||||
const int ibin = atombin[i];
|
||||
if (ibin != oldbin) {
|
||||
oldbin = ibin;
|
||||
ncount = 0;
|
||||
for (int k = 0; k < nstencilp; k++) {
|
||||
const int bstart = binhead[ibin + binstart[k]];
|
||||
const int bend = binhead[ibin + binend[k]];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++)
|
||||
tj[ncount++] = binpacked[jj];
|
||||
}
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int u = 0; u < ncount; u++) {
|
||||
const int j = tj[u];
|
||||
tx[u] = x[j].x;
|
||||
ty[u] = x[j].y;
|
||||
tz[u] = x[j].z;
|
||||
tjtype[u] = x[j].w;
|
||||
}
|
||||
|
||||
if (FULL == 0 || TRI == 1) {
|
||||
icount = 0;
|
||||
istart = ncount;
|
||||
const int alignb = INTEL_DATA_ALIGN / sizeof(int);
|
||||
int nedge = istart % alignb;
|
||||
if (nedge) istart + (alignb - nedge);
|
||||
itx = tx + istart;
|
||||
ity = ty + istart;
|
||||
itz = tz + istart;
|
||||
itj = tj + istart;
|
||||
itjtype = tjtype + istart;
|
||||
|
||||
const int bstart = binhead[ibin];
|
||||
const int bend = binhead[ibin + 1];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++) {
|
||||
const int j = binpacked[jj];
|
||||
itj[icount] = j;
|
||||
itx[icount] = x[j].x;
|
||||
ity[icount] = x[j].y;
|
||||
itz[icount] = x[j].z;
|
||||
itjtype[icount] = x[j].w;
|
||||
icount++;
|
||||
}
|
||||
if (icount + istart > obound) *overflow = 1;
|
||||
} else
|
||||
if (ncount > obound) *overflow = 1;
|
||||
}
|
||||
|
||||
// ---------------------- Loop over i bin
|
||||
|
||||
int n = 0;
|
||||
if (FULL == 0 || TRI == 1) {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int u = 0; u < icount; u++) {
|
||||
int addme = 1;
|
||||
int j = itj[u];
|
||||
|
||||
// Cutoff Check
|
||||
const flt_t delx = xtmp - itx[u];
|
||||
const flt_t dely = ytmp - ity[u];
|
||||
const flt_t delz = ztmp - itz[u];
|
||||
const int jtype = itjtype[u];
|
||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||
if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
|
||||
|
||||
// i bin (half) check and offload ghost check
|
||||
if (j < nlocal) {
|
||||
const int ijmod = (i + j) % 2;
|
||||
if (i > j) {
|
||||
if (ijmod == 0) addme = 0;
|
||||
} else if (i < j) {
|
||||
if (ijmod == 1) addme = 0;
|
||||
} else
|
||||
addme = 0;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload_noghost && i < offload_end) addme = 0;
|
||||
#endif
|
||||
} else {
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload_noghost && offload) addme = 0;
|
||||
#endif
|
||||
if (itz[u] < ztmp) addme = 0;
|
||||
if (itz[u] == ztmp) {
|
||||
if (ity[u] < ytmp) addme = 0;
|
||||
if (ity[u] == ytmp && itx[u] < xtmp) addme = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (need_ic) {
|
||||
int no_special;
|
||||
ominimum_image_check(no_special, delx, dely, delz);
|
||||
if (no_special)
|
||||
j = -j - 1;
|
||||
}
|
||||
|
||||
if (addme)
|
||||
neighptr[n++] = j;
|
||||
}
|
||||
} // if FULL==0
|
||||
|
||||
// ---------------------- Loop over other bins
|
||||
|
||||
int n2, *neighptr2;
|
||||
if (THREE) {
|
||||
n = pack_offset;
|
||||
n2 = pack_offset + maxnbors;
|
||||
neighptr2 = neighptr;
|
||||
}
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int u = 0; u < ncount; u++) {
|
||||
int addme = 1;
|
||||
int j = tj[u];
|
||||
|
||||
if (FULL)
|
||||
if (i == j) addme = 0;
|
||||
|
||||
// Cutoff Check
|
||||
const flt_t delx = xtmp - tx[u];
|
||||
const flt_t dely = ytmp - ty[u];
|
||||
const flt_t delz = ztmp - tz[u];
|
||||
const int jtype = tjtype[u];
|
||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||
if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
|
||||
|
||||
// Triclinic
|
||||
if (TRI) {
|
||||
if (tz[u] < ztmp) addme = 0;
|
||||
if (tz[u] == ztmp) {
|
||||
if (ty[u] < ytmp) addme = 0;
|
||||
if (ty[u] == ytmp) {
|
||||
if (tx[u] < xtmp) addme = 0;
|
||||
if (tx[u] == xtmp && j <= i) addme = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// offload ghost check
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload_noghost) {
|
||||
if (j < nlocal) {
|
||||
if (i < offload_end) addme = 0;
|
||||
} else if (offload) addme = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
int pj;
|
||||
if (THREE) pj = j;
|
||||
if (need_ic) {
|
||||
int no_special;
|
||||
ominimum_image_check(no_special, delx, dely, delz);
|
||||
if (no_special)
|
||||
j = -j - 1;
|
||||
}
|
||||
|
||||
if (THREE) {
|
||||
const int jtag = tag[pj];
|
||||
int flist = 0;
|
||||
if (itag > jtag) {
|
||||
if ((itag+jtag) % 2 == 0) flist = 1;
|
||||
} else if (itag < jtag) {
|
||||
if ((itag+jtag) % 2 == 1) flist = 1;
|
||||
} else {
|
||||
if (tz[u] < ztmp) flist = 1;
|
||||
else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
|
||||
else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp)
|
||||
flist = 1;
|
||||
}
|
||||
if (addme) {
|
||||
if (flist)
|
||||
neighptr2[n2++] = j;
|
||||
else
|
||||
neighptr[n++] = j;
|
||||
}
|
||||
} else {
|
||||
if (addme)
|
||||
neighptr[n++] = j;
|
||||
}
|
||||
} // for u
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (exclude) {
|
||||
int alln = n;
|
||||
if (THREE) n = pack_offset;
|
||||
else n = 0;
|
||||
for (int u = pack_offset; u < alln; u++) {
|
||||
const int j = neighptr[u];
|
||||
int pj = j;
|
||||
if (need_ic)
|
||||
if (pj < 0) pj = -j - 1;
|
||||
const int jtype = x[pj].w;
|
||||
if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
|
||||
neighptr[n++] = j;
|
||||
}
|
||||
if (THREE) {
|
||||
alln = n2;
|
||||
n2 = pack_offset + maxnbors;
|
||||
for (int u = pack_offset + maxnbors; u < alln; u++) {
|
||||
const int j = neighptr[u];
|
||||
int pj = j;
|
||||
if (need_ic)
|
||||
if (pj < 0) pj = -j - 1;
|
||||
const int jtype = x[pj].w;
|
||||
if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
|
||||
neighptr[n2++] = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
int ns;
|
||||
if (THREE) {
|
||||
int alln = n;
|
||||
ns = n - pack_offset;
|
||||
atombin[i] = ns;
|
||||
n = lane;
|
||||
for (int u = pack_offset; u < alln; u++) {
|
||||
neighptr[n] = neighptr[u];
|
||||
n += pack_width;
|
||||
}
|
||||
ns += n2 - pack_offset - maxnbors;
|
||||
for (int u = pack_offset + maxnbors; u < n2; u++) {
|
||||
neighptr[n] = neighptr[u];
|
||||
n += pack_width;
|
||||
}
|
||||
if (ns > maxnbors) *overflow = 1;
|
||||
} else
|
||||
if (n > maxnbors) *overflow = 1;
|
||||
|
||||
ilist[i] = i;
|
||||
cnumneigh[i] = ct;
|
||||
if (THREE) {
|
||||
cnumneigh[i] += lane;
|
||||
numneigh[i] = ns;
|
||||
} else {
|
||||
int edge = (n % pad_width);
|
||||
if (edge) {
|
||||
const int pad_end = n + (pad_width - edge);
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
|
||||
avg=INTEL_COMPILE_WIDTH/2
|
||||
#endif
|
||||
for ( ; n < pad_end; n++)
|
||||
neighptr[n] = e_nall;
|
||||
}
|
||||
numneigh[i] = n;
|
||||
}
|
||||
|
||||
if (THREE) {
|
||||
if (ns > max_chunk) max_chunk = ns;
|
||||
lane++;
|
||||
if (lane == pack_width) {
|
||||
ct += max_chunk * pack_width;
|
||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||
const int edge = (ct % alignb);
|
||||
if (edge) ct += alignb - edge;
|
||||
neighptr = firstneigh + ct;
|
||||
max_chunk = 0;
|
||||
pack_offset = maxnbors * pack_width;
|
||||
lane = 0;
|
||||
if (ct + obound > list_size) {
|
||||
if (i < ito - 1) {
|
||||
*overflow = 1;
|
||||
ct = (ifrom + tid * 2) * maxnbors;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ct += n;
|
||||
const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
|
||||
const int edge = (ct % alignb);
|
||||
if (edge) ct += alignb - edge;
|
||||
neighptr = firstneigh + ct;
|
||||
if (ct + obound > list_size) {
|
||||
if (i < ito - 1) {
|
||||
*overflow = 1;
|
||||
ct = (ifrom + tid * 2) * maxnbors;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (*overflow == 1)
|
||||
for (int i = ifrom; i < ito; i++)
|
||||
numneigh[i] = 0;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
|
||||
int ghost_offset = 0, nall_offset = e_nall;
|
||||
if (separate_buffers) {
|
||||
for (int i = ifrom; i < ito; ++i) {
|
||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||
const int jnum = numneigh[i];
|
||||
#if __INTEL_COMPILER+0 > 1499
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
int j = jlist[jj];
|
||||
if (need_ic && j < 0) j = -j - 1;
|
||||
if (j < nlocal) {
|
||||
if (j < vlmin) vlmin = j;
|
||||
if (j > vlmax) vlmax = j;
|
||||
} else {
|
||||
if (j < vgmin) vgmin = j;
|
||||
if (j > vgmax) vgmax = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
lmin = MIN(lmin,vlmin);
|
||||
gmin = MIN(gmin,vgmin);
|
||||
lmax = MAX(lmax,vlmax);
|
||||
gmax = MAX(gmax,vgmax);
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp critical
|
||||
#endif
|
||||
{
|
||||
if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
|
||||
if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
|
||||
if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
|
||||
if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
|
||||
}
|
||||
#pragma omp barrier
|
||||
|
||||
int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
|
||||
if (nghost < 0) nghost = 0;
|
||||
if (offload) {
|
||||
ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
|
||||
nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
|
||||
} else {
|
||||
ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
|
||||
nall_offset = nlocal + nghost;
|
||||
}
|
||||
} // if separate_buffers
|
||||
#endif
|
||||
|
||||
if (molecular) {
|
||||
for (int i = ifrom; i < ito; ++i) {
|
||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||
const int jnum = numneigh[i];
|
||||
|
||||
if (THREE) {
|
||||
const int trip = jnum * pack_width;
|
||||
for (int jj = 0; jj < trip; jj+=pack_width) {
|
||||
const int j = jlist[jj];
|
||||
if (need_ic && j < 0) {
|
||||
which = 0;
|
||||
jlist[jj] = -j - 1;
|
||||
} else
|
||||
ofind_special(which, special, nspecial, i, tag[j]);
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j >= nlocal) {
|
||||
if (j == e_nall)
|
||||
jlist[jj] = nall_offset;
|
||||
else if (which)
|
||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||
else jlist[jj]-=ghost_offset;
|
||||
} else
|
||||
#endif
|
||||
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||
}
|
||||
} else {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
const int j = jlist[jj];
|
||||
if (need_ic && j < 0) {
|
||||
which = 0;
|
||||
jlist[jj] = -j - 1;
|
||||
} else
|
||||
ofind_special(which, special, nspecial, i, tag[j]);
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (j >= nlocal) {
|
||||
if (j == e_nall)
|
||||
jlist[jj] = nall_offset;
|
||||
else if (which)
|
||||
jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
|
||||
else jlist[jj]-=ghost_offset;
|
||||
} else
|
||||
#endif
|
||||
if (which) jlist[jj] = j ^ (which << SBBITS);
|
||||
}
|
||||
}
|
||||
} // for i
|
||||
} // if molecular
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
else if (separate_buffers) {
|
||||
for (int i = ifrom; i < ito; ++i) {
|
||||
int * _noalias jlist = firstneigh + cnumneigh[i];
|
||||
const int jnum = numneigh[i];
|
||||
int jj = 0;
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
for (jj = 0; jj < jnum; jj++) {
|
||||
if (jlist[jj] >= nlocal) {
|
||||
if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
|
||||
else jlist[jj] -= ghost_offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} // end omp
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
#endif
|
||||
} // end offload
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (offload) {
|
||||
_fix->stop_watch(TIME_OFFLOAD_LATENCY);
|
||||
_fix->start_watch(TIME_HOST_NEIGHBOR);
|
||||
for (int n = 0; n < aend; n++) {
|
||||
ilist[n] = n;
|
||||
numneigh[n] = 0;
|
||||
}
|
||||
} else {
|
||||
for (int i = astart; i < aend; i++)
|
||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
||||
if (separate_buffers) {
|
||||
_fix->start_watch(TIME_PACK);
|
||||
_fix->set_neighbor_host_sizes();
|
||||
buffers->pack_sep_from_single(_fix->host_min_local(),
|
||||
_fix->host_used_local(),
|
||||
_fix->host_min_ghost(),
|
||||
_fix->host_used_ghost());
|
||||
_fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
}
|
||||
#else
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
for (int i = astart; i < aend; i++)
|
||||
list->firstneigh[i] = firstneigh + cnumneigh[i];
|
||||
#endif
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
void NPairIntel::grow_stencil()
|
||||
{
|
||||
@ -65,3 +737,201 @@ void NPairIntel::grow_stencil()
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
// ---- Half, no IC
|
||||
|
||||
template void NPairIntel::bin_newton<float, float, 0, 0, 0, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<float, double, 0, 0, 0, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<double, double, 0, 0, 0, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||
const int);
|
||||
|
||||
// ---- Half, IC
|
||||
|
||||
template void NPairIntel::bin_newton<float, float, 0, 1, 0, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<float, double, 0, 1, 0, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<double, double, 0, 1, 0, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||
const int);
|
||||
|
||||
// ---- Tri, no IC
|
||||
|
||||
template void NPairIntel::bin_newton<float, float, 0, 0, 0, 1, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<float, double, 0, 0, 0, 1, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<double, double, 0, 0, 0, 1, 0>
|
||||
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||
const int);
|
||||
|
||||
// ---- Tri, IC
|
||||
|
||||
template void NPairIntel::bin_newton<float, float, 0, 1, 0, 1, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<float, double, 0, 1, 0, 1, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<double, double, 0, 1, 0, 1, 0>
|
||||
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||
const int);
|
||||
|
||||
// ---- Full, no IC
|
||||
|
||||
template void NPairIntel::bin_newton<float, float, 0, 0, 1, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<float, double, 0, 0, 1, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<double, double, 0, 0, 1, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||
const int);
|
||||
|
||||
// ---- Full, IC
|
||||
|
||||
template void NPairIntel::bin_newton<float, float, 0, 1, 1, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<float, double, 0, 1, 1, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<double, double, 0, 1, 1, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||
const int);
|
||||
|
||||
// ---- 3-body, no IC
|
||||
|
||||
template void NPairIntel::bin_newton<float, float, 0, 0, 1, 0, 1>
|
||||
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<float, double, 0, 0, 1, 0, 1>
|
||||
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<double, double, 0, 0, 1, 0, 1>
|
||||
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||
const int);
|
||||
|
||||
// ---- 3-body, IC
|
||||
|
||||
template void NPairIntel::bin_newton<float, float, 0, 1, 1, 0, 1>
|
||||
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<float, double, 0, 1, 1, 0, 1>
|
||||
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<double, double, 0, 1, 1, 0, 1>
|
||||
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||
const int);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
|
||||
// ---- Half, no IC, no ghost
|
||||
|
||||
template void NPairIntel::bin_newton<float, float, 1, 0, 0, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<float, double, 1, 0, 0, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<double, double, 1, 0, 0, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||
const int);
|
||||
|
||||
// ---- Half, IC, no ghost
|
||||
|
||||
template void NPairIntel::bin_newton<float, float, 1, 1, 0, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<float, double, 1, 1, 0, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<double, double, 1, 1, 0, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||
const int);
|
||||
|
||||
// ---- Tri, no IC, no ghost
|
||||
|
||||
template void NPairIntel::bin_newton<float, float, 1, 0, 0, 1, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<float, double, 1, 0, 0, 1, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<double, double, 1, 0, 0, 1, 0>
|
||||
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||
const int);
|
||||
|
||||
// ---- Tri, IC, no ghost
|
||||
|
||||
template void NPairIntel::bin_newton<float, float, 1, 1, 0, 1, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<float, double, 1, 1, 0, 1, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<double, double, 1, 1, 0, 1, 0>
|
||||
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||
const int);
|
||||
|
||||
// ---- Full, no IC, no ghost
|
||||
|
||||
template void NPairIntel::bin_newton<float, float, 1, 0, 1, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<float, double, 1, 0, 1, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<double, double, 1, 0, 1, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||
const int);
|
||||
|
||||
// ---- Full, IC, no ghost
|
||||
|
||||
template void NPairIntel::bin_newton<float, float, 1, 1, 1, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<float, double, 1, 1, 1, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<double, double, 1, 1, 1, 0, 0>
|
||||
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||
const int);
|
||||
|
||||
// ---- 3-body, no IC, no ghost
|
||||
|
||||
template void NPairIntel::bin_newton<float, float, 1, 0, 1, 0, 1>
|
||||
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<float, double, 1, 0, 1, 0, 1>
|
||||
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<double, double, 1, 0, 1, 0, 1>
|
||||
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||
const int);
|
||||
|
||||
// ---- 3-body, IC, no ghost
|
||||
|
||||
template void NPairIntel::bin_newton<float, float, 1, 1, 1, 0, 1>
|
||||
(const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<float, double, 1, 1, 1, 0, 1>
|
||||
(const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
|
||||
const int);
|
||||
template void NPairIntel::bin_newton<double, double, 1, 1, 1, 0, 1>
|
||||
(const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
|
||||
const int);
|
||||
|
||||
#endif
|
||||
|
||||
@ -25,10 +25,6 @@
|
||||
#include "intel_simd.h"
|
||||
#endif
|
||||
|
||||
#ifdef OUTER_CHUNK
|
||||
#include "intel_simd.h"
|
||||
#endif
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
#pragma offload_attribute(push,target(mic))
|
||||
#endif
|
||||
@ -87,6 +83,10 @@ class NPairIntel : public NPair {
|
||||
protected:
|
||||
FixIntel *_fix;
|
||||
|
||||
template <class flt_t, class acc_t, int, int, int, int, int>
|
||||
void bin_newton(const int, NeighList *, IntelBuffers<flt_t,acc_t> *,
|
||||
const int, const int, const int offload_end = 0);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int _cop;
|
||||
int *_off_map_stencil;
|
||||
|
||||
@ -85,53 +85,47 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag,
|
||||
|
||||
if (ago != 0 && fix->separate_buffers() == 0) {
|
||||
fix->start_watch(TIME_PACK);
|
||||
|
||||
int packthreads;
|
||||
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||
else packthreads = 1;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
||||
#pragma omp parallel if(packthreads > 1)
|
||||
#endif
|
||||
{
|
||||
int ifrom, ito, tid;
|
||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||
nthreads, sizeof(ATOM_T));
|
||||
packthreads, sizeof(ATOM_T));
|
||||
buffers->thr_pack(ifrom,ito,ago);
|
||||
}
|
||||
fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc,
|
||||
@ -165,7 +159,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
||||
|
||||
// Determine how much data to transfer
|
||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||
buffers, offload, fix, separate_flag,
|
||||
x_size, q_size, ev_size, f_stride);
|
||||
|
||||
@ -208,27 +202,26 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
||||
f_stride, x, q);
|
||||
|
||||
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
oevdwl = oecoul = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
}
|
||||
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
|
||||
// loop over neighbors of my atoms
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
||||
reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int iifrom, iito, tid;
|
||||
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
|
||||
int iifrom, iip, iito, tid;
|
||||
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||
iifrom += astart;
|
||||
iito += astart;
|
||||
|
||||
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
|
||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
int foff;
|
||||
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
|
||||
else foff = -minlocal;
|
||||
FORCE_T * _noalias const f = f_start + foff;
|
||||
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
for (int i = iifrom; i < iito; ++i) {
|
||||
for (int i = iifrom; i < iito; i += iip) {
|
||||
const int itype = x[i].w;
|
||||
|
||||
const int ptr_off = itype * ntypes;
|
||||
@ -246,10 +239,9 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
||||
const flt_t ztmp = x[i].z;
|
||||
const flt_t qtmp = q[i];
|
||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
}
|
||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||
if (NEWTON_PAIR == 0)
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
@ -319,71 +311,72 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
||||
if (rsq < c_cuti[jtype].cutsq) {
|
||||
#endif
|
||||
const flt_t fpair = (forcecoul + forcebuck) * r2inv;
|
||||
fxtmp += delx * fpair;
|
||||
fytmp += dely * fpair;
|
||||
fztmp += delz * fpair;
|
||||
if (NEWTON_PAIR || j < nlocal) {
|
||||
f[j].x -= delx * fpair;
|
||||
f[j].y -= dely * fpair;
|
||||
f[j].z -= delz * fpair;
|
||||
}
|
||||
const flt_t fpx = fpair * delx;
|
||||
fxtmp += fpx;
|
||||
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||
const flt_t fpy = fpair * dely;
|
||||
fytmp += fpy;
|
||||
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||
const flt_t fpz = fpair * delz;
|
||||
fztmp += fpz;
|
||||
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||
|
||||
if (EVFLAG) {
|
||||
flt_t ev_pre = (flt_t)0;
|
||||
if (NEWTON_PAIR || i < nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
if (NEWTON_PAIR || j < nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
|
||||
if (EFLAG) {
|
||||
sevdwl += ev_pre * evdwl;
|
||||
secoul += ev_pre * ecoul;
|
||||
if (eatom) {
|
||||
if (NEWTON_PAIR || i < nlocal)
|
||||
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||
if (NEWTON_PAIR || j < nlocal)
|
||||
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||
}
|
||||
if (EFLAG) {
|
||||
sevdwl += evdwl;
|
||||
secoul += ecoul;
|
||||
if (eatom) {
|
||||
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||
if (NEWTON_PAIR)
|
||||
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||
}
|
||||
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
|
||||
}
|
||||
}
|
||||
if (NEWTON_PAIR == 0)
|
||||
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
|
||||
#ifdef INTEL_VMASK
|
||||
}
|
||||
#endif
|
||||
} // for jj
|
||||
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
|
||||
IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
|
||||
if (NEWTON_PAIR) {
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
} else {
|
||||
f[i].x = fxtmp;
|
||||
f[i].y = fytmp;
|
||||
f[i].z = fztmp;
|
||||
}
|
||||
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||
} // for ii
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||
ov4, ov5);
|
||||
} // end of omp parallel region
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = oecoul;
|
||||
}
|
||||
if (vflag) {
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
|
||||
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
|
||||
if (EFLAG) {
|
||||
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = oecoul;
|
||||
}
|
||||
if (vflag) {
|
||||
if (NEWTON_PAIR == 0) {
|
||||
ov0 *= (acc_t)0.5;
|
||||
ov1 *= (acc_t)0.5;
|
||||
ov2 *= (acc_t)0.5;
|
||||
ov3 *= (acc_t)0.5;
|
||||
ov4 *= (acc_t)0.5;
|
||||
ov5 *= (acc_t)0.5;
|
||||
}
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
@ -395,7 +388,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
||||
else
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
if (EFLAG || vflag)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
@ -406,6 +399,10 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
|
||||
void PairBuckCoulCutIntel::init_style()
|
||||
{
|
||||
PairBuckCoulCut::init_style();
|
||||
if (force->newton_pair == 0) {
|
||||
neighbor->requests[neighbor->nrequest-1]->half = 0;
|
||||
neighbor->requests[neighbor->nrequest-1]->full = 1;
|
||||
}
|
||||
neighbor->requests[neighbor->nrequest-1]->intel = 1;
|
||||
|
||||
int ifix = modify->find_fix("package_intel");
|
||||
|
||||
@ -49,7 +49,7 @@ class PairBuckCoulCutIntel : public PairBuckCoulCut {
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||
|
||||
@ -85,53 +85,47 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag,
|
||||
|
||||
if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
|
||||
fix->start_watch(TIME_PACK);
|
||||
|
||||
int packthreads;
|
||||
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||
else packthreads = 1;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
||||
#pragma omp parallel if(packthreads > 1)
|
||||
#endif
|
||||
{
|
||||
int ifrom, ito, tid;
|
||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||
nthreads, sizeof(ATOM_T));
|
||||
packthreads, sizeof(ATOM_T));
|
||||
buffers->thr_pack(ifrom,ito,ago);
|
||||
}
|
||||
fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc,
|
||||
@ -170,9 +164,17 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
||||
const int ntypes = atom->ntypes + 1;
|
||||
const int eatom = this->eflag_atom;
|
||||
|
||||
flt_t * _noalias const ccachex = buffers->get_ccachex();
|
||||
flt_t * _noalias const ccachey = buffers->get_ccachey();
|
||||
flt_t * _noalias const ccachez = buffers->get_ccachez();
|
||||
flt_t * _noalias const ccachew = buffers->get_ccachew();
|
||||
int * _noalias const ccachei = buffers->get_ccachei();
|
||||
int * _noalias const ccachej = buffers->get_ccachej();
|
||||
const int ccache_stride = _ccache_stride;
|
||||
|
||||
// Determine how much data to transfer
|
||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||
buffers, offload, fix, separate_flag,
|
||||
x_size, q_size, ev_size, f_stride);
|
||||
|
||||
@ -208,8 +210,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
||||
in(x:length(x_size) alloc_if(0) free_if(0)) \
|
||||
in(q:length(q_size) alloc_if(0) free_if(0)) \
|
||||
in(overflow:length(0) alloc_if(0) free_if(0)) \
|
||||
in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
|
||||
in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
|
||||
in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
|
||||
in(f_stride,nlocal,minlocal,separate_flag,offload) \
|
||||
in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \
|
||||
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
|
||||
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
|
||||
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
||||
@ -224,27 +228,34 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
||||
f_stride, x, q);
|
||||
|
||||
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
oevdwl = oecoul = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
}
|
||||
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
|
||||
// loop over neighbors of my atoms
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
||||
reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int iifrom, iito, tid;
|
||||
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
|
||||
int iifrom, iip, iito, tid;
|
||||
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||
iifrom += astart;
|
||||
iito += astart;
|
||||
|
||||
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
|
||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
int foff;
|
||||
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
|
||||
else foff = -minlocal;
|
||||
FORCE_T * _noalias const f = f_start + foff;
|
||||
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
for (int i = iifrom; i < iito; ++i) {
|
||||
const int toffs = tid * ccache_stride;
|
||||
flt_t * _noalias const tdelx = ccachex + toffs;
|
||||
flt_t * _noalias const tdely = ccachey + toffs;
|
||||
flt_t * _noalias const tdelz = ccachez + toffs;
|
||||
flt_t * _noalias const trsq = ccachew + toffs;
|
||||
int * _noalias const tj = ccachei + toffs;
|
||||
int * _noalias const tjtype = ccachej + toffs;
|
||||
|
||||
for (int i = iifrom; i < iito; i += iip) {
|
||||
const int itype = x[i].w;
|
||||
const int ptr_off = itype * ntypes;
|
||||
const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
|
||||
@ -262,85 +273,98 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
||||
const flt_t ztmp = x[i].z;
|
||||
const flt_t qtmp = q[i];
|
||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||
if (NEWTON_PAIR == 0)
|
||||
if (vflag == 1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
|
||||
int ej = 0;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
const int j = jlist[jj] & NEIGHMASK;
|
||||
const flt_t delx = xtmp - x[j].x;
|
||||
const flt_t dely = ytmp - x[j].y;
|
||||
const flt_t delz = ztmp - x[j].z;
|
||||
const int jtype = x[j].w;
|
||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
if (rsq < c_forcei[jtype].cutsq) {
|
||||
trsq[ej]=rsq;
|
||||
tdelx[ej]=delx;
|
||||
tdely[ej]=dely;
|
||||
tdelz[ej]=delz;
|
||||
tjtype[ej]=jtype;
|
||||
tj[ej]=jlist[jj];
|
||||
ej++;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
for (int jj = 0; jj < ej; jj++) {
|
||||
flt_t forcecoul, forcebuck, evdwl, ecoul;
|
||||
forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0;
|
||||
|
||||
const int sbindex = jlist[jj] >> SBBITS & 3;
|
||||
const int j = jlist[jj] & NEIGHMASK;
|
||||
|
||||
const flt_t delx = xtmp - x[j].x;
|
||||
const flt_t dely = ytmp - x[j].y;
|
||||
const flt_t delz = ztmp - x[j].z;
|
||||
const int jtype = x[j].w;
|
||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||
const int j = tj[jj] & NEIGHMASK;
|
||||
const int sbindex = tj[jj] >> SBBITS & 3;
|
||||
const int jtype = tjtype[jj];
|
||||
const flt_t rsq = trsq[jj];
|
||||
const flt_t r2inv = (flt_t)1.0 / rsq;
|
||||
const flt_t r = (flt_t)1.0 / sqrt(r2inv);
|
||||
|
||||
#ifdef INTEL_VMASK
|
||||
if (rsq < c_forcei[jtype].cutsq) {
|
||||
#ifdef INTEL_ALLOW_TABLE
|
||||
if (!ncoultablebits || rsq <= tabinnersq) {
|
||||
#endif
|
||||
#ifdef INTEL_ALLOW_TABLE
|
||||
if (!ncoultablebits || rsq <= tabinnersq) {
|
||||
#endif
|
||||
const flt_t A1 = 0.254829592;
|
||||
const flt_t A2 = -0.284496736;
|
||||
const flt_t A3 = 1.421413741;
|
||||
const flt_t A4 = -1.453152027;
|
||||
const flt_t A5 = 1.061405429;
|
||||
const flt_t EWALD_F = 1.12837917;
|
||||
const flt_t INV_EWALD_P = 1.0 / 0.3275911;
|
||||
const flt_t A1 = 0.254829592;
|
||||
const flt_t A2 = -0.284496736;
|
||||
const flt_t A3 = 1.421413741;
|
||||
const flt_t A4 = -1.453152027;
|
||||
const flt_t A5 = 1.061405429;
|
||||
const flt_t EWALD_F = 1.12837917;
|
||||
const flt_t INV_EWALD_P = 1.0 / 0.3275911;
|
||||
|
||||
const flt_t grij = g_ewald * r;
|
||||
const flt_t expm2 = exp(-grij * grij);
|
||||
const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
|
||||
const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
|
||||
forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||||
if (EFLAG) ecoul = prefactor * erfc;
|
||||
const flt_t grij = g_ewald * r;
|
||||
const flt_t expm2 = exp(-grij * grij);
|
||||
const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
|
||||
const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
|
||||
forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||||
if (EFLAG) ecoul = prefactor * erfc;
|
||||
|
||||
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
|
||||
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
|
||||
prefactor;
|
||||
forcecoul -= adjust;
|
||||
if (EFLAG) ecoul -= adjust;
|
||||
|
||||
#ifdef INTEL_ALLOW_TABLE
|
||||
} else {
|
||||
float rsq_lookup = rsq;
|
||||
const int itable = (__intel_castf32_u32(rsq_lookup) &
|
||||
ncoulmask) >> ncoulshiftbits;
|
||||
const flt_t fraction = (rsq_lookup - table[itable].r) *
|
||||
table[itable].dr;
|
||||
|
||||
const flt_t tablet = table[itable].f +
|
||||
fraction * table[itable].df;
|
||||
forcecoul = qtmp * q[j] * tablet;
|
||||
if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
|
||||
fraction * detable[itable]);
|
||||
if (sbindex) {
|
||||
const flt_t table2 = ctable[itable] +
|
||||
fraction * dctable[itable];
|
||||
const flt_t prefactor = qtmp * q[j] * table2;
|
||||
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
|
||||
prefactor;
|
||||
forcecoul -= adjust;
|
||||
if (EFLAG) ecoul -= adjust;
|
||||
|
||||
#ifdef INTEL_ALLOW_TABLE
|
||||
} else {
|
||||
float rsq_lookup = rsq;
|
||||
const int itable = (__intel_castf32_u32(rsq_lookup) &
|
||||
ncoulmask) >> ncoulshiftbits;
|
||||
const flt_t fraction = (rsq_lookup - table[itable].r) *
|
||||
table[itable].dr;
|
||||
|
||||
const flt_t tablet = table[itable].f +
|
||||
fraction * table[itable].df;
|
||||
forcecoul = qtmp * q[j] * tablet;
|
||||
if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
|
||||
fraction * detable[itable]);
|
||||
if (sbindex) {
|
||||
const flt_t table2 = ctable[itable] +
|
||||
fraction * dctable[itable];
|
||||
const flt_t prefactor = qtmp * q[j] * table2;
|
||||
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
|
||||
prefactor;
|
||||
forcecoul -= adjust;
|
||||
if (EFLAG) ecoul -= adjust;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef INTEL_VMASK
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef INTEL_VMASK
|
||||
if (rsq < c_forcei[jtype].cut_ljsq) {
|
||||
@ -361,80 +385,74 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
||||
#ifdef INTEL_VMASK
|
||||
}
|
||||
#else
|
||||
if (rsq > c_forcei[jtype].cutsq)
|
||||
{ forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
|
||||
if (rsq > c_forcei[jtype].cut_ljsq)
|
||||
{ forcebuck = (flt_t)0.0; evdwl = (flt_t)0.0; }
|
||||
#endif
|
||||
|
||||
#ifdef INTEL_VMASK
|
||||
if (rsq < c_forcei[jtype].cutsq) {
|
||||
#endif
|
||||
const flt_t fpair = (forcecoul + forcebuck) * r2inv;
|
||||
fxtmp += delx * fpair;
|
||||
fytmp += dely * fpair;
|
||||
fztmp += delz * fpair;
|
||||
if (NEWTON_PAIR || j < nlocal) {
|
||||
f[j].x -= delx * fpair;
|
||||
f[j].y -= dely * fpair;
|
||||
f[j].z -= delz * fpair;
|
||||
}
|
||||
const flt_t fpair = (forcecoul + forcebuck) * r2inv;
|
||||
const flt_t fpx = fpair * tdelx[jj];
|
||||
fxtmp += fpx;
|
||||
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||
const flt_t fpy = fpair * tdely[jj];
|
||||
fytmp += fpy;
|
||||
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||
const flt_t fpz = fpair * tdelz[jj];
|
||||
fztmp += fpz;
|
||||
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||
|
||||
if (EVFLAG) {
|
||||
flt_t ev_pre = (flt_t)0;
|
||||
if (NEWTON_PAIR || i < nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
if (NEWTON_PAIR || j < nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
|
||||
if (EFLAG) {
|
||||
sevdwl += ev_pre * evdwl;
|
||||
secoul += ev_pre * ecoul;
|
||||
if (eatom) {
|
||||
if (NEWTON_PAIR || i < nlocal)
|
||||
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||
if (NEWTON_PAIR || j < nlocal)
|
||||
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||
}
|
||||
}
|
||||
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
|
||||
if (EFLAG) {
|
||||
sevdwl += evdwl;
|
||||
secoul += ecoul;
|
||||
if (eatom) {
|
||||
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||
if (NEWTON_PAIR)
|
||||
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||
}
|
||||
#ifdef INTEL_VMASK
|
||||
}
|
||||
#endif
|
||||
}
|
||||
if (NEWTON_PAIR == 0)
|
||||
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
|
||||
fpx, fpy, fpz);
|
||||
} // for jj
|
||||
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
|
||||
if (NEWTON_PAIR) {
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
} else {
|
||||
f[i].x = fxtmp;
|
||||
f[i].y = fytmp;
|
||||
f[i].z = fztmp;
|
||||
}
|
||||
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||
} // for ii
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||
ov4, ov5);
|
||||
} // end of omp parallel region
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = oecoul;
|
||||
}
|
||||
if (vflag) {
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
|
||||
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
|
||||
if (EFLAG) {
|
||||
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = oecoul;
|
||||
}
|
||||
if (vflag) {
|
||||
if (NEWTON_PAIR == 0) {
|
||||
ov0 *= (acc_t)0.5;
|
||||
ov1 *= (acc_t)0.5;
|
||||
ov2 *= (acc_t)0.5;
|
||||
ov3 *= (acc_t)0.5;
|
||||
ov4 *= (acc_t)0.5;
|
||||
ov5 *= (acc_t)0.5;
|
||||
}
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
@ -446,7 +464,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
||||
else
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
if (EFLAG || vflag)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
@ -457,6 +475,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
|
||||
void PairBuckCoulLongIntel::init_style()
|
||||
{
|
||||
PairBuckCoulLong::init_style();
|
||||
if (force->newton_pair == 0) {
|
||||
neighbor->requests[neighbor->nrequest-1]->half = 0;
|
||||
neighbor->requests[neighbor->nrequest-1]->full = 1;
|
||||
}
|
||||
neighbor->requests[neighbor->nrequest-1]->intel = 1;
|
||||
|
||||
int ifix = modify->find_fix("package_intel");
|
||||
@ -484,6 +506,13 @@ template <class flt_t, class acc_t>
|
||||
void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t,acc_t> *buffers)
|
||||
{
|
||||
int off_ccache = 0;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_cop >= 0) off_ccache = 1;
|
||||
#endif
|
||||
buffers->grow_ccache(off_ccache, comm->nthreads, 1);
|
||||
_ccache_stride = buffers->ccache_stride();
|
||||
|
||||
int tp1 = atom->ntypes + 1;
|
||||
int ntable = 1;
|
||||
if (ncoultablebits)
|
||||
@ -518,6 +547,9 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
|
||||
for (int i = 0; i < tp1; i++) {
|
||||
for (int j = 0; j < tp1; j++) {
|
||||
if (cutsq[i][j] < cut_ljsq[i][j])
|
||||
error->all(FLERR,
|
||||
"Intel variant of lj/buck/coul/long expects lj cutoff<=coulombic");
|
||||
fc.c_force[i][j].cutsq = cutsq[i][j];
|
||||
fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j];
|
||||
fc.c_force[i][j].buck1 = buck1[i][j];
|
||||
|
||||
@ -40,7 +40,7 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong {
|
||||
|
||||
private:
|
||||
FixIntel *fix;
|
||||
int _cop, _lrt;
|
||||
int _cop, _lrt, _ccache_stride;
|
||||
|
||||
template <class flt_t> class ForceConst;
|
||||
|
||||
@ -48,7 +48,7 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong {
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||
|
||||
@ -78,57 +78,51 @@ void PairBuckIntel::compute(int eflag, int vflag,
|
||||
|
||||
if (ago != 0 && fix->separate_buffers() == 0) {
|
||||
fix->start_watch(TIME_PACK);
|
||||
|
||||
int packthreads;
|
||||
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||
else packthreads = 1;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
||||
#pragma omp parallel if(packthreads > 1)
|
||||
#endif
|
||||
{
|
||||
int ifrom, ito, tid;
|
||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||
nthreads, sizeof(ATOM_T));
|
||||
packthreads, sizeof(ATOM_T));
|
||||
buffers->thr_pack(ifrom,ito,ago);
|
||||
}
|
||||
fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void PairBuckIntel::eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc,
|
||||
const int astart, const int aend)
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc,
|
||||
const int astart, const int aend)
|
||||
{
|
||||
const int inum = aend - astart;
|
||||
if (inum == 0) return;
|
||||
@ -152,7 +146,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
||||
|
||||
// Determine how much data to transfer
|
||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||
buffers, offload, fix, separate_flag,
|
||||
x_size, q_size, ev_size, f_stride);
|
||||
|
||||
@ -192,27 +186,26 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
||||
f_stride, x, 0);
|
||||
|
||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
oevdwl = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
}
|
||||
if (EFLAG) oevdwl = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
|
||||
// loop over neighbors of my atoms
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
||||
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int iifrom, iito, tid;
|
||||
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
|
||||
int iifrom, iip, iito, tid;
|
||||
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||
iifrom += astart;
|
||||
iito += astart;
|
||||
|
||||
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
|
||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
int foff;
|
||||
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
|
||||
else foff = -minlocal;
|
||||
FORCE_T * _noalias const f = f_start + foff;
|
||||
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
for (int i = iifrom; i < iito; ++i) {
|
||||
for (int i = iifrom; i < iito; i += iip) {
|
||||
const int itype = x[i].w;
|
||||
|
||||
const int ptr_off = itype * ntypes;
|
||||
@ -228,10 +221,9 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
||||
const flt_t ytmp = x[i].y;
|
||||
const flt_t ztmp = x[i].z;
|
||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||
if (NEWTON_PAIR == 0)
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
}
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
@ -284,69 +276,70 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
||||
evdwl *= factor_lj;
|
||||
}
|
||||
const flt_t fpair = forcebuck * r2inv;
|
||||
fxtmp += delx * fpair;
|
||||
fytmp += dely * fpair;
|
||||
fztmp += delz * fpair;
|
||||
if (NEWTON_PAIR || j < nlocal) {
|
||||
f[j].x -= delx * fpair;
|
||||
f[j].y -= dely * fpair;
|
||||
f[j].z -= delz * fpair;
|
||||
}
|
||||
const flt_t fpx = fpair * delx;
|
||||
fxtmp += fpx;
|
||||
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||
const flt_t fpy = fpair * dely;
|
||||
fytmp += fpy;
|
||||
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||
const flt_t fpz = fpair * delz;
|
||||
fztmp += fpz;
|
||||
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||
|
||||
if (EVFLAG) {
|
||||
flt_t ev_pre = (flt_t)0;
|
||||
if (NEWTON_PAIR || i < nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
if (NEWTON_PAIR || j < nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
|
||||
if (EFLAG) {
|
||||
sevdwl += ev_pre * evdwl;
|
||||
if (eatom) {
|
||||
if (NEWTON_PAIR || i < nlocal)
|
||||
fwtmp += (flt_t)0.5 * evdwl;
|
||||
if (NEWTON_PAIR || j < nlocal)
|
||||
f[j].w += (flt_t)0.5 * evdwl;
|
||||
}
|
||||
}
|
||||
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
|
||||
}
|
||||
if (EFLAG) {
|
||||
sevdwl += evdwl;
|
||||
if (eatom) {
|
||||
fwtmp += (flt_t)0.5 * evdwl;
|
||||
if (NEWTON_PAIR)
|
||||
f[j].w += (flt_t)0.5 * evdwl;
|
||||
}
|
||||
}
|
||||
if (NEWTON_PAIR == 0)
|
||||
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
|
||||
#ifdef INTEL_VMASK
|
||||
}
|
||||
#endif
|
||||
} // for jj
|
||||
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
|
||||
if (NEWTON_PAIR) {
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
} else {
|
||||
f[i].x = fxtmp;
|
||||
f[i].y = fytmp;
|
||||
f[i].z = fztmp;
|
||||
}
|
||||
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||
} // for ii
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||
ov4, ov5);
|
||||
} // end of omp parallel region
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = (acc_t)0;
|
||||
}
|
||||
if (vflag) {
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
|
||||
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
|
||||
if (EFLAG) {
|
||||
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = (acc_t)0;
|
||||
}
|
||||
if (vflag) {
|
||||
if (NEWTON_PAIR == 0) {
|
||||
ov0 *= (acc_t)0.5;
|
||||
ov1 *= (acc_t)0.5;
|
||||
ov2 *= (acc_t)0.5;
|
||||
ov3 *= (acc_t)0.5;
|
||||
ov4 *= (acc_t)0.5;
|
||||
ov5 *= (acc_t)0.5;
|
||||
}
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
@ -358,7 +351,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
||||
else
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
if (EFLAG || vflag)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
@ -367,6 +360,10 @@ void PairBuckIntel::eval(const int offload, const int vflag,
|
||||
void PairBuckIntel::init_style()
|
||||
{
|
||||
PairBuck::init_style();
|
||||
if (force->newton_pair == 0) {
|
||||
neighbor->requests[neighbor->nrequest-1]->half = 0;
|
||||
neighbor->requests[neighbor->nrequest-1]->full = 1;
|
||||
}
|
||||
neighbor->requests[neighbor->nrequest-1]->intel = 1;
|
||||
|
||||
int ifix = modify->find_fix("package_intel");
|
||||
|
||||
@ -48,7 +48,7 @@ private:
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||
|
||||
@ -90,78 +90,58 @@ void PairEAMIntel::compute(int eflag, int vflag,
|
||||
if (ago != 0 && fix->separate_buffers() == 0) {
|
||||
fix->start_watch(TIME_PACK);
|
||||
|
||||
int packthreads;
|
||||
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||
else packthreads = 1;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
||||
#pragma omp parallel if(packthreads > 1)
|
||||
#endif
|
||||
{
|
||||
int ifrom, ito, tid;
|
||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||
nthreads, sizeof(ATOM_T));
|
||||
packthreads, sizeof(ATOM_T));
|
||||
buffers->thr_pack(ifrom,ito,ago);
|
||||
}
|
||||
fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (_onetype) {
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<0,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -169,8 +149,7 @@ void PairEAMIntel::compute(int eflag, int vflag,
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t,
|
||||
class acc_t>
|
||||
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void PairEAMIntel::eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc,
|
||||
@ -186,7 +165,10 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
||||
nmax = atom->nmax;
|
||||
int edge = (nmax * sizeof(acc_t)) % INTEL_DATA_ALIGN;
|
||||
if (edge) nmax += (INTEL_DATA_ALIGN - edge) / sizeof(acc_t);
|
||||
memory->create(rho,nmax*comm->nthreads,"pair:rho");
|
||||
if (NEWTON_PAIR)
|
||||
memory->create(rho,nmax*comm->nthreads,"pair:rho");
|
||||
else
|
||||
memory->create(rho,nmax,"pair:rho");
|
||||
memory->create(fp,nmax,"pair:fp");
|
||||
// Use single precision allocation for single/mixed mode
|
||||
// Keep double version for single and swap_eam
|
||||
@ -222,9 +204,17 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
||||
const int ntypes = atom->ntypes + 1;
|
||||
const int eatom = this->eflag_atom;
|
||||
|
||||
flt_t * _noalias const ccachex = buffers->get_ccachex();
|
||||
flt_t * _noalias const ccachey = buffers->get_ccachey();
|
||||
flt_t * _noalias const ccachez = buffers->get_ccachez();
|
||||
flt_t * _noalias const ccachew = buffers->get_ccachew();
|
||||
int * _noalias const ccachei = buffers->get_ccachei();
|
||||
int * _noalias const ccachej = buffers->get_ccachej();
|
||||
const int ccache_stride = _ccache_stride;
|
||||
|
||||
// Determine how much data to transfer
|
||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||
buffers, offload, fix, separate_flag,
|
||||
x_size, q_size, ev_size, f_stride);
|
||||
|
||||
@ -252,16 +242,12 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
||||
f_stride, x, 0);
|
||||
|
||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
oevdwl = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
}
|
||||
if (EFLAG) oevdwl = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
|
||||
// loop over neighbors of my atoms
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(fp_f, f_start,f_stride,nlocal,nall,minlocal) \
|
||||
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int iifrom, iito, tid;
|
||||
@ -270,12 +256,25 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
||||
iifrom += astart;
|
||||
iito += astart;
|
||||
|
||||
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
|
||||
double * _noalias const trho = rho + tid*nmax;
|
||||
if (NEWTON_PAIR)
|
||||
int foff;
|
||||
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
|
||||
else foff = -minlocal;
|
||||
FORCE_T * _noalias const f = f_start + foff;
|
||||
if (NEWTON_PAIR) foff = tid * nmax;
|
||||
else foff = 0;
|
||||
double * _noalias const trho = rho + foff;
|
||||
if (NEWTON_PAIR) {
|
||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
memset(trho, 0, nall * sizeof(double));
|
||||
else
|
||||
memset(trho, 0, nlocal * sizeof(double));
|
||||
}
|
||||
|
||||
const int toffs = tid * ccache_stride;
|
||||
flt_t * _noalias const tdelx = ccachex + toffs;
|
||||
flt_t * _noalias const tdely = ccachey + toffs;
|
||||
flt_t * _noalias const tdelz = ccachez + toffs;
|
||||
flt_t * _noalias const trsq = ccachew + toffs;
|
||||
int * _noalias const tj = ccachei + toffs;
|
||||
int * _noalias const tjtype = ccachej + toffs;
|
||||
|
||||
flt_t oscale;
|
||||
int rhor_joff, frho_ioff;
|
||||
@ -300,53 +299,67 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
||||
const flt_t ztmp = x[i].z;
|
||||
|
||||
acc_t rhoi = (acc_t)0.0;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(+:rhoi)
|
||||
int ej = 0;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
int j, jtype;
|
||||
j = jlist[jj] & NEIGHMASK;
|
||||
|
||||
const int j = jlist[jj] & NEIGHMASK;
|
||||
const flt_t delx = xtmp - x[j].x;
|
||||
const flt_t dely = ytmp - x[j].y;
|
||||
const flt_t delz = ztmp - x[j].z;
|
||||
const flt_t rsq = delx*delx + dely*dely + delz*delz;
|
||||
|
||||
if (rsq < fcutforcesq) {
|
||||
if (!ONETYPE) jtype = x[j].w;
|
||||
flt_t p = sqrt(rsq)*frdr + (flt_t)1.0;
|
||||
int m = static_cast<int> (p);
|
||||
m = MIN(m,nr-1);
|
||||
p -= m;
|
||||
p = MIN(p,(flt_t)1.0);
|
||||
if (!ONETYPE)
|
||||
rhor_joff = rhor_ioff + jtype * jstride;
|
||||
const int joff = rhor_joff + m;
|
||||
flt_t ra;
|
||||
ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p +
|
||||
rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d;
|
||||
rhoi += ra;
|
||||
if (NEWTON_PAIR || j < nlocal) {
|
||||
if (!ONETYPE) {
|
||||
const int ioff = jtype * istride + itype * jstride + m;
|
||||
ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p +
|
||||
rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d;
|
||||
}
|
||||
trho[j] += ra;
|
||||
}
|
||||
trsq[ej]=rsq;
|
||||
if (!ONETYPE) tjtype[ej]=x[j].w;
|
||||
tj[ej]=jlist[jj];
|
||||
ej++;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(+:rhoi)
|
||||
#endif
|
||||
for (int jj = 0; jj < ej; jj++) {
|
||||
int jtype;
|
||||
const int j = tj[jj] & NEIGHMASK;
|
||||
if (!ONETYPE) jtype = tjtype[jj];
|
||||
const flt_t rsq = trsq[jj];
|
||||
flt_t p = sqrt(rsq)*frdr + (flt_t)1.0;
|
||||
int m = static_cast<int> (p);
|
||||
m = MIN(m,nr-1);
|
||||
p -= m;
|
||||
p = MIN(p,(flt_t)1.0);
|
||||
if (!ONETYPE)
|
||||
rhor_joff = rhor_ioff + jtype * jstride;
|
||||
const int joff = rhor_joff + m;
|
||||
flt_t ra;
|
||||
ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p +
|
||||
rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d;
|
||||
rhoi += ra;
|
||||
if (NEWTON_PAIR) {
|
||||
if (!ONETYPE) {
|
||||
const int ioff = jtype * istride + itype * jstride + m;
|
||||
ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p +
|
||||
rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d;
|
||||
}
|
||||
trho[j] += ra;
|
||||
}
|
||||
} // for jj
|
||||
trho[i] += rhoi;
|
||||
if (NEWTON_PAIR)
|
||||
trho[i] += rhoi;
|
||||
else
|
||||
trho[i] = rhoi;
|
||||
} // for i
|
||||
|
||||
#if defined(_OPENMP)
|
||||
if (nthreads > 1) {
|
||||
if (NEWTON_PAIR && nthreads > 1) {
|
||||
#pragma omp barrier
|
||||
if (tid == 0) {
|
||||
int rcount;
|
||||
if (NEWTON_PAIR) rcount = nall;
|
||||
else rcount = nlocal;
|
||||
const int rcount = nall;
|
||||
if (nthreads == 2) {
|
||||
double *trho2 = rho + nmax;
|
||||
#pragma vector aligned
|
||||
@ -431,10 +444,9 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
|
||||
if (tid == 0) {
|
||||
if (tid == 0)
|
||||
comm->forward_comm_pair(this);
|
||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
} else
|
||||
if (NEWTON_PAIR)
|
||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
#if defined(_OPENMP)
|
||||
@ -462,124 +474,142 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
||||
const flt_t ytmp = x[i].y;
|
||||
const flt_t ztmp = x[i].z;
|
||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
}
|
||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||
if (NEWTON_PAIR == 0)
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
int ej = 0;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
int j, jtype;
|
||||
j = jlist[jj] & NEIGHMASK;
|
||||
|
||||
const int j = jlist[jj] & NEIGHMASK;
|
||||
const flt_t delx = xtmp - x[j].x;
|
||||
const flt_t dely = ytmp - x[j].y;
|
||||
const flt_t delz = ztmp - x[j].z;
|
||||
const flt_t rsq = delx*delx + dely*dely + delz*delz;
|
||||
|
||||
|
||||
if (rsq < fcutforcesq) {
|
||||
if (!ONETYPE) jtype = x[j].w;
|
||||
const flt_t r = sqrt(rsq);
|
||||
flt_t p = r*frdr + (flt_t)1.0;
|
||||
int m = static_cast<int> (p);
|
||||
m = MIN(m,nr-1);
|
||||
p -= m;
|
||||
p = MIN(p,(flt_t)1.0);
|
||||
if (!ONETYPE)
|
||||
rhor_joff = rhor_ioff + jtype * jstride;
|
||||
const int joff = rhor_joff + m;
|
||||
const flt_t rhojp = (rhor_spline_f[joff].a*p +
|
||||
rhor_spline_f[joff].b)*p +
|
||||
rhor_spline_f[joff].c;
|
||||
flt_t rhoip;
|
||||
if (!ONETYPE) {
|
||||
const int ioff = jtype * istride + itype * jstride + m;
|
||||
rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p +
|
||||
rhor_spline_f[ioff].c;
|
||||
} else
|
||||
rhoip = rhojp;
|
||||
const flt_t z2p = (z2r_spline_t[joff].a*p +
|
||||
z2r_spline_t[joff].b)*p +
|
||||
z2r_spline_t[joff].c;
|
||||
const flt_t z2 = ((z2r_spline_t[joff].d*p +
|
||||
z2r_spline_t[joff].e)*p +
|
||||
z2r_spline_t[joff].f)*p +
|
||||
z2r_spline_t[joff].g;
|
||||
trsq[ej]=rsq;
|
||||
tdelx[ej]=delx;
|
||||
tdely[ej]=dely;
|
||||
tdelz[ej]=delz;
|
||||
if (!ONETYPE) tjtype[ej]=x[j].w;
|
||||
tj[ej]=jlist[jj];
|
||||
ej++;
|
||||
}
|
||||
}
|
||||
|
||||
const flt_t recip = (flt_t)1.0/r;
|
||||
const flt_t phi = z2*recip;
|
||||
const flt_t phip = z2p*recip - phi*recip;
|
||||
const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip;
|
||||
if (!ONETYPE)
|
||||
oscale = scale_fi[jtype];
|
||||
const flt_t fpair = -oscale*psip*recip;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#endif
|
||||
for (int jj = 0; jj < ej; jj++) {
|
||||
int jtype;
|
||||
const int j = tj[jj] & NEIGHMASK;
|
||||
if (!ONETYPE) jtype = tjtype[jj];
|
||||
const flt_t rsq = trsq[jj];
|
||||
const flt_t r = sqrt(rsq);
|
||||
flt_t p = r*frdr + (flt_t)1.0;
|
||||
int m = static_cast<int> (p);
|
||||
m = MIN(m,nr-1);
|
||||
p -= m;
|
||||
p = MIN(p,(flt_t)1.0);
|
||||
if (!ONETYPE)
|
||||
rhor_joff = rhor_ioff + jtype * jstride;
|
||||
const int joff = rhor_joff + m;
|
||||
const flt_t rhojp = (rhor_spline_f[joff].a*p +
|
||||
rhor_spline_f[joff].b)*p +
|
||||
rhor_spline_f[joff].c;
|
||||
flt_t rhoip;
|
||||
if (!ONETYPE) {
|
||||
const int ioff = jtype * istride + itype * jstride + m;
|
||||
rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p +
|
||||
rhor_spline_f[ioff].c;
|
||||
} else
|
||||
rhoip = rhojp;
|
||||
const flt_t z2p = (z2r_spline_t[joff].a*p +
|
||||
z2r_spline_t[joff].b)*p +
|
||||
z2r_spline_t[joff].c;
|
||||
const flt_t z2 = ((z2r_spline_t[joff].d*p +
|
||||
z2r_spline_t[joff].e)*p +
|
||||
z2r_spline_t[joff].f)*p +
|
||||
z2r_spline_t[joff].g;
|
||||
|
||||
fxtmp += delx*fpair;
|
||||
fytmp += dely*fpair;
|
||||
fztmp += delz*fpair;
|
||||
if (NEWTON_PAIR || j < nlocal) {
|
||||
f[j].x -= delx*fpair;
|
||||
f[j].y -= dely*fpair;
|
||||
f[j].z -= delz*fpair;
|
||||
}
|
||||
const flt_t recip = (flt_t)1.0/r;
|
||||
const flt_t phi = z2*recip;
|
||||
const flt_t phip = z2p*recip - phi*recip;
|
||||
const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip;
|
||||
if (!ONETYPE)
|
||||
oscale = scale_fi[jtype];
|
||||
const flt_t fpair = -oscale*psip*recip;
|
||||
|
||||
if (EVFLAG) {
|
||||
flt_t ev_pre = (flt_t)0;
|
||||
if (NEWTON_PAIR || i<nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
if (NEWTON_PAIR || j<nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
const flt_t fpx = fpair * tdelx[jj];
|
||||
fxtmp += fpx;
|
||||
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||
const flt_t fpy = fpair * tdely[jj];
|
||||
fytmp += fpy;
|
||||
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||
const flt_t fpz = fpair * tdelz[jj];
|
||||
fztmp += fpz;
|
||||
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||
|
||||
if (EFLAG) {
|
||||
const flt_t evdwl = oscale*phi;
|
||||
sevdwl += ev_pre * evdwl;
|
||||
if (eatom) {
|
||||
if (NEWTON_PAIR || i < nlocal)
|
||||
fwtmp += (flt_t)0.5 * evdwl;
|
||||
if (NEWTON_PAIR || j < nlocal)
|
||||
f[j].w += (flt_t)0.5 * evdwl;
|
||||
}
|
||||
}
|
||||
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
|
||||
delx, dely, delz);
|
||||
}
|
||||
} // if rsq
|
||||
if (EFLAG) {
|
||||
const flt_t evdwl = oscale*phi;
|
||||
sevdwl += evdwl;
|
||||
if (eatom) {
|
||||
fwtmp += (flt_t)0.5 * evdwl;
|
||||
if (NEWTON_PAIR)
|
||||
f[j].w += (flt_t)0.5 * evdwl;
|
||||
}
|
||||
}
|
||||
if (NEWTON_PAIR == 0)
|
||||
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
|
||||
fpx, fpy, fpz);
|
||||
} // for jj
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
if (NEWTON_PAIR) {
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
} else {
|
||||
f[i].x = fxtmp;
|
||||
f[i].y = fytmp;
|
||||
f[i].z = fztmp;
|
||||
sevdwl *= (acc_t)0.5;
|
||||
}
|
||||
|
||||
IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
|
||||
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||
} // for i
|
||||
|
||||
if (vflag == 2) {
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
|
||||
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||
ov4, ov5);
|
||||
} /// omp
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = (acc_t)0.0;
|
||||
}
|
||||
if (vflag) {
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
|
||||
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
|
||||
if (EFLAG) {
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = (acc_t)0.0;
|
||||
}
|
||||
if (vflag) {
|
||||
if (NEWTON_PAIR == 0) {
|
||||
ov0 *= (acc_t)0.5;
|
||||
ov1 *= (acc_t)0.5;
|
||||
ov2 *= (acc_t)0.5;
|
||||
ov3 *= (acc_t)0.5;
|
||||
ov4 *= (acc_t)0.5;
|
||||
ov5 *= (acc_t)0.5;
|
||||
}
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
@ -591,7 +621,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
||||
else
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
if (EFLAG || vflag)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
@ -604,6 +634,10 @@ void PairEAMIntel::eval(const int offload, const int vflag,
|
||||
void PairEAMIntel::init_style()
|
||||
{
|
||||
PairEAM::init_style();
|
||||
if (force->newton_pair == 0) {
|
||||
neighbor->requests[neighbor->nrequest-1]->half = 0;
|
||||
neighbor->requests[neighbor->nrequest-1]->full = 1;
|
||||
}
|
||||
neighbor->requests[neighbor->nrequest-1]->intel = 1;
|
||||
|
||||
int ifix = modify->find_fix("package_intel");
|
||||
@ -633,6 +667,13 @@ template <class flt_t, class acc_t>
|
||||
void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t,acc_t> *buffers)
|
||||
{
|
||||
int off_ccache = 0;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_cop >= 0) off_ccache = 1;
|
||||
#endif
|
||||
buffers->grow_ccache(off_ccache, comm->nthreads, 1);
|
||||
_ccache_stride = buffers->ccache_stride();
|
||||
|
||||
int tp1 = atom->ntypes + 1;
|
||||
fc.set_ntypes(tp1,nr,nrho,memory,_cop);
|
||||
buffers->set_ntypes(tp1);
|
||||
|
||||
@ -41,7 +41,7 @@ class PairEAMIntel : public PairEAM {
|
||||
protected:
|
||||
|
||||
FixIntel *fix;
|
||||
int _cop, _onetype;
|
||||
int _cop, _onetype, _ccache_stride;
|
||||
float *fp_float;
|
||||
|
||||
template <class flt_t>
|
||||
@ -53,7 +53,7 @@ class PairEAMIntel : public PairEAM {
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t,
|
||||
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t,
|
||||
class acc_t>
|
||||
void eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> * buffers,
|
||||
|
||||
@ -88,12 +88,16 @@ void PairGayBerneIntel::compute(int eflag, int vflag,
|
||||
const AtomVecEllipsoid::Bonus * const bonus = avec->bonus;
|
||||
const int * const ellipsoid = atom->ellipsoid;
|
||||
QUAT_T * _noalias const quat = buffers->get_quat();
|
||||
|
||||
int packthreads;
|
||||
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||
else packthreads = 1;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
||||
#pragma omp parallel if(packthreads > 1)
|
||||
#endif
|
||||
{
|
||||
int ifrom, ito, tid;
|
||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
|
||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, packthreads,
|
||||
sizeof(ATOM_T));
|
||||
if (ago != 0) buffers->thr_pack(ifrom,ito,ago);
|
||||
|
||||
@ -114,39 +118,29 @@ void PairGayBerneIntel::compute(int eflag, int vflag,
|
||||
fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc,
|
||||
@ -167,8 +161,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
if (fix->separate_buffers()) {
|
||||
fix->start_watch(TIME_PACK);
|
||||
if (offload) {
|
||||
#pragma omp parallel default(none) \
|
||||
shared(buffers,nlocal,nall,bonus,ellipsoid)
|
||||
#pragma omp parallel
|
||||
{
|
||||
int ifrom, ito, tid;
|
||||
int nthreads = comm->nthreads;
|
||||
@ -258,7 +251,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
|
||||
// Determine how much data to transfer
|
||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||
buffers, offload, fix, separate_flag,
|
||||
x_size, q_size, ev_size, f_stride);
|
||||
|
||||
@ -334,6 +327,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
x[nall].x = (flt_t)INTEL_BIGP;
|
||||
x[nall].y = (flt_t)INTEL_BIGP;
|
||||
x[nall].z = (flt_t)INTEL_BIGP;
|
||||
x[nall].w = 1;
|
||||
quat[nall].w = (flt_t)1.0;
|
||||
quat[nall].i = (flt_t)0.0;
|
||||
quat[nall].j = (flt_t)0.0;
|
||||
@ -342,25 +336,25 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
#endif
|
||||
|
||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
oevdwl = (acc_t)0.0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
if (EFLAG) oevdwl = (acc_t)0.0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
if (NEWTON_PAIR == 0) f_start[1].w = 0;
|
||||
|
||||
// loop over neighbors of my atoms
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
||||
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int iifrom, iito, tid;
|
||||
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
|
||||
int iifrom, iip, iito, tid;
|
||||
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||
iifrom += astart;
|
||||
iito += astart;
|
||||
|
||||
FORCE_T * _noalias const f = f_start - minlocal * 2 + (tid * f_stride);
|
||||
memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));
|
||||
int foff;
|
||||
if (NEWTON_PAIR) foff = tid * f_stride - minlocal * 2;
|
||||
else foff = minlocal*-2;
|
||||
FORCE_T * _noalias const f = f_start + foff;
|
||||
if (NEWTON_PAIR) memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
flt_t * _noalias const rsq_form = rsq_formi + tid * max_nbors;
|
||||
flt_t * _noalias const delx_form = delx_formi + tid * max_nbors;
|
||||
@ -370,7 +364,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
int * _noalias const jlist_form = jlist_formi + tid * max_nbors;
|
||||
|
||||
int ierror = 0;
|
||||
for (int i = iifrom; i < iito; ++i) {
|
||||
for (int i = iifrom; i < iito; i += iip) {
|
||||
// const int i = ilist[ii];
|
||||
const int itype = x[i].w;
|
||||
const int ptr_off = itype * ntypes;
|
||||
@ -401,14 +395,17 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
|
||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
|
||||
if (NEWTON_PAIR == 0)
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
bool multiple_forms = false;
|
||||
int packed_j = 0;
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
int jm = jlist[jj];
|
||||
int j = jm & NEIGHMASK;
|
||||
const int jtype = x[j].w;
|
||||
@ -573,7 +570,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
ME_cross3(tempv, tempv2, dUr);
|
||||
flt_t dUr2_0, dUr2_1, dUr2_2;
|
||||
|
||||
if (NEWTON_PAIR || j < nlocal) {
|
||||
if (NEWTON_PAIR) {
|
||||
ME_vecmat(kappa, g2, tempv2);
|
||||
ME_cross3(tempv, tempv2, dUr2);
|
||||
}
|
||||
@ -588,7 +585,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
dchi_2 *= temp1;
|
||||
flt_t dchi2_0, dchi2_1, dchi2_2;
|
||||
|
||||
if (NEWTON_PAIR || j < nlocal) {
|
||||
if (NEWTON_PAIR) {
|
||||
ME_vecmat(iota, b2, tempv);
|
||||
ME_cross3(tempv, iota, dchi2);
|
||||
dchi2_0 *= temp1;
|
||||
@ -630,7 +627,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
// compute d_eta for particle 2
|
||||
|
||||
flt_t deta2_0, deta2_1, deta2_2;
|
||||
if (NEWTON_PAIR || j < nlocal) {
|
||||
if (NEWTON_PAIR) {
|
||||
deta2_0 = deta2_1 = deta2_2 = (flt_t)0.0;
|
||||
ME_compute_eta_torque(g12, a2, shape2, temp);
|
||||
|
||||
@ -672,7 +669,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) *
|
||||
(flt_t)-1.0;
|
||||
|
||||
if (NEWTON_PAIR || j < nlocal) {
|
||||
if (NEWTON_PAIR) {
|
||||
rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) *
|
||||
(flt_t)-1.0;
|
||||
rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) *
|
||||
@ -714,7 +711,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
t2tmp += ttor_1;
|
||||
t3tmp += ttor_2;
|
||||
|
||||
if (NEWTON_PAIR || j < nlocal) {
|
||||
if (NEWTON_PAIR) {
|
||||
rtor_0 *= factor_lj;
|
||||
rtor_1 *= factor_lj;
|
||||
rtor_2 *= factor_lj;
|
||||
@ -728,34 +725,26 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
f[jp].z += rtor_2;
|
||||
}
|
||||
|
||||
if (EVFLAG) {
|
||||
flt_t ev_pre = (flt_t)0.0;
|
||||
if (NEWTON_PAIR || i < nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
if (NEWTON_PAIR || j < nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
|
||||
if (EFLAG) {
|
||||
evdwl = factor_lj * one_eng;
|
||||
sevdwl += ev_pre * evdwl;
|
||||
if (eatom) {
|
||||
if (NEWTON_PAIR || i < nlocal)
|
||||
fwtmp += (flt_t)0.5 * evdwl;
|
||||
if (NEWTON_PAIR || j < nlocal)
|
||||
f[j*2].w += (flt_t)0.5 * evdwl;
|
||||
}
|
||||
if (EFLAG) {
|
||||
evdwl = factor_lj * one_eng;
|
||||
sevdwl += evdwl;
|
||||
if (eatom) {
|
||||
fwtmp += (flt_t)0.5 * evdwl;
|
||||
if (NEWTON_PAIR)
|
||||
f[j*2].w += (flt_t)0.5 * evdwl;
|
||||
}
|
||||
}
|
||||
|
||||
if (NEWTON_PAIR == 0) {
|
||||
if (vflag == 1) {
|
||||
ev_pre *= (flt_t)-1.0;
|
||||
sv0 += ev_pre * delx_form[jj] * fforce_0;
|
||||
sv1 += ev_pre * dely_form[jj] * fforce_1;
|
||||
sv2 += ev_pre * delz_form[jj] * fforce_2;
|
||||
sv3 += ev_pre * delx_form[jj] * fforce_1;
|
||||
sv4 += ev_pre * delx_form[jj] * fforce_2;
|
||||
sv5 += ev_pre * dely_form[jj] * fforce_2;
|
||||
sv0 += delx_form[jj] * fforce_0;
|
||||
sv1 += dely_form[jj] * fforce_1;
|
||||
sv2 += delz_form[jj] * fforce_2;
|
||||
sv3 += delx_form[jj] * fforce_1;
|
||||
sv4 += delx_form[jj] * fforce_2;
|
||||
sv5 += dely_form[jj] * fforce_2;
|
||||
}
|
||||
} // EVFLAG
|
||||
} // EVFLAG
|
||||
#ifdef INTEL_VMASK
|
||||
}
|
||||
#endif
|
||||
@ -767,19 +756,29 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
ierror = 2;
|
||||
|
||||
int ip = i * 2;
|
||||
f[ip].x += fxtmp;
|
||||
f[ip].y += fytmp;
|
||||
f[ip].z += fztmp;
|
||||
ip++;
|
||||
f[ip].x += t1tmp;
|
||||
f[ip].y += t2tmp;
|
||||
f[ip].z += t3tmp;
|
||||
if (NEWTON_PAIR) {
|
||||
f[ip].x += fxtmp;
|
||||
f[ip].y += fytmp;
|
||||
f[ip].z += fztmp;
|
||||
ip++;
|
||||
f[ip].x += t1tmp;
|
||||
f[ip].y += t2tmp;
|
||||
f[ip].z += t3tmp;
|
||||
} else {
|
||||
f[ip].x = fxtmp;
|
||||
f[ip].y = fytmp;
|
||||
f[ip].z = fztmp;
|
||||
ip++;
|
||||
f[ip].x = t1tmp;
|
||||
f[ip].y = t2tmp;
|
||||
f[ip].z = t3tmp;
|
||||
}
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
if (eatom) f[i * 2].w += fwtmp;
|
||||
oevdwl += sevdwl;
|
||||
}
|
||||
if (EFLAG) {
|
||||
oevdwl += sevdwl;
|
||||
if (eatom) f[i * 2].w += fwtmp;
|
||||
}
|
||||
if (NEWTON_PAIR == 0) {
|
||||
if (vflag == 1) {
|
||||
ov0 += sv0;
|
||||
ov1 += sv1;
|
||||
@ -791,56 +790,31 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
}
|
||||
} // for i
|
||||
int o_range;
|
||||
if (NEWTON_PAIR)
|
||||
if (NEWTON_PAIR) {
|
||||
o_range = nall;
|
||||
else
|
||||
o_range = nlocal;
|
||||
if (offload == 0) o_range -= minlocal;
|
||||
IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
|
||||
sizeof(FORCE_T));
|
||||
const int two_iito = iito * 2;
|
||||
|
||||
acc_t *facc = &(f_start[0].x);
|
||||
const int sto = two_iito * 4;
|
||||
const int fst4 = f_stride * 4;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
int t_off = f_stride;
|
||||
if (EFLAG && eatom) {
|
||||
for (int t = 1; t < nthreads; t++) {
|
||||
if (offload == 0) o_range -= minlocal;
|
||||
IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
|
||||
sizeof(FORCE_T));
|
||||
const int sto = iito * 8;
|
||||
const int fst4 = f_stride * 4;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
acc_t *f_scalar = &f_start[0].x;
|
||||
acc_t *f_scalar2 = f_scalar + fst4;
|
||||
for (int t = 1; t < nthreads; t++) {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector nontemporal
|
||||
#pragma novector
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int n = iifrom * 2; n < two_iito; n++) {
|
||||
f_start[n].x += f_start[n + t_off].x;
|
||||
f_start[n].y += f_start[n + t_off].y;
|
||||
f_start[n].z += f_start[n + t_off].z;
|
||||
f_start[n].w += f_start[n + t_off].w;
|
||||
}
|
||||
t_off += f_stride;
|
||||
for (int n = iifrom * 8; n < sto; n++)
|
||||
f_scalar[n] += f_scalar2[n];
|
||||
f_scalar2 += fst4;
|
||||
}
|
||||
} else {
|
||||
for (int t = 1; t < nthreads; t++) {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector nontemporal
|
||||
#pragma novector
|
||||
#endif
|
||||
for (int n = iifrom * 2; n < two_iito; n++) {
|
||||
f_start[n].x += f_start[n + t_off].x;
|
||||
f_start[n].y += f_start[n + t_off].y;
|
||||
f_start[n].z += f_start[n + t_off].z;
|
||||
}
|
||||
t_off += f_stride;
|
||||
}
|
||||
}
|
||||
|
||||
if (EVFLAG) {
|
||||
if (vflag==2) {
|
||||
const ATOM_T * _noalias const xo = x + minlocal;
|
||||
const ATOM_T * _noalias const xo = x + minlocal;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector nontemporal
|
||||
#pragma novector
|
||||
#endif
|
||||
for (int n = iifrom; n < iito; n++) {
|
||||
@ -852,26 +826,33 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
ov4 += f_start[nt2].z * xo[n].x;
|
||||
ov5 += f_start[nt2].z * xo[n].y;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (ierror)
|
||||
f_start[1].w = ierror;
|
||||
} // omp
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = (acc_t)0.0;
|
||||
}
|
||||
if (vflag) {
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
if (EFLAG) {
|
||||
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = (acc_t)0.0;
|
||||
}
|
||||
if (vflag) {
|
||||
if (NEWTON_PAIR == 0) {
|
||||
ov0 *= (acc_t)-0.5;
|
||||
ov1 *= (acc_t)-0.5;
|
||||
ov2 *= (acc_t)-0.5;
|
||||
ov3 *= (acc_t)-0.5;
|
||||
ov4 *= (acc_t)-0.5;
|
||||
ov5 *= (acc_t)-0.5;
|
||||
}
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
@ -884,7 +865,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
else
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
if (EFLAG || vflag)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, 2);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload, 0, 0, 2);
|
||||
@ -895,6 +876,10 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
|
||||
void PairGayBerneIntel::init_style()
|
||||
{
|
||||
PairGayBerne::init_style();
|
||||
if (force->newton_pair == 0) {
|
||||
neighbor->requests[neighbor->nrequest-1]->half = 0;
|
||||
neighbor->requests[neighbor->nrequest-1]->full = 1;
|
||||
}
|
||||
neighbor->requests[neighbor->nrequest-1]->intel = 1;
|
||||
|
||||
int ifix = modify->find_fix("package_intel");
|
||||
|
||||
@ -43,7 +43,7 @@ class PairGayBerneIntel : public PairGayBerne {
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||
|
||||
@ -82,54 +82,48 @@ void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag,
|
||||
|
||||
if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
|
||||
fix->start_watch(TIME_PACK);
|
||||
|
||||
int packthreads;
|
||||
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||
else packthreads = 1;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
||||
#pragma omp parallel if(packthreads > 1)
|
||||
#endif
|
||||
{
|
||||
int ifrom, ito, tid;
|
||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
|
||||
nthreads, sizeof(ATOM_T));
|
||||
packthreads, sizeof(ATOM_T));
|
||||
buffers->thr_pack(ifrom,ito,ago);
|
||||
}
|
||||
fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
// -------------------- Regular version
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc,
|
||||
@ -182,7 +176,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
||||
|
||||
// Determine how much data to transfer
|
||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||
buffers, offload, fix, separate_flag,
|
||||
x_size, q_size, ev_size, f_stride);
|
||||
|
||||
@ -236,25 +230,24 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
||||
f_stride, x, q);
|
||||
|
||||
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
oevdwl = oecoul = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
}
|
||||
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
|
||||
// loop over neighbors of my atoms
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
||||
reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int iifrom, iito, tid;
|
||||
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
|
||||
int iifrom, iip, iito, tid;
|
||||
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||
iifrom += astart;
|
||||
iito += astart;
|
||||
|
||||
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
|
||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
int foff;
|
||||
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
|
||||
else foff = -minlocal;
|
||||
FORCE_T * _noalias const f = f_start + foff;
|
||||
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
flt_t cutboth = cut_coulsq;
|
||||
|
||||
const int toffs = tid * ccache_stride;
|
||||
@ -265,7 +258,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
||||
int * _noalias const tj = ccachei + toffs;
|
||||
int * _noalias const tjtype = ccachej + toffs;
|
||||
|
||||
for (int i = iifrom; i < iito; ++i) {
|
||||
for (int i = iifrom; i < iito; i += iip) {
|
||||
// const int i = ilist[ii];
|
||||
const int itype = x[i].w;
|
||||
|
||||
@ -284,10 +277,9 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
||||
const flt_t ztmp = x[i].z;
|
||||
const flt_t qtmp = q[i];
|
||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||
if (NEWTON_PAIR == 0)
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
}
|
||||
|
||||
int ej = 0;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
@ -421,77 +413,76 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
||||
#ifdef INTEL_VMASK
|
||||
}
|
||||
#else
|
||||
if (rsq > cut_coulsq) { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
|
||||
if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
|
||||
#endif
|
||||
|
||||
const flt_t delx = tdelx[jj];
|
||||
const flt_t dely = tdely[jj];
|
||||
const flt_t delz = tdelz[jj];
|
||||
const flt_t fpair = (forcecoul + forcelj) * r2inv;
|
||||
fxtmp += delx * fpair;
|
||||
fytmp += dely * fpair;
|
||||
fztmp += delz * fpair;
|
||||
if (NEWTON_PAIR || j < nlocal) {
|
||||
f[j].x -= delx * fpair;
|
||||
f[j].y -= dely * fpair;
|
||||
f[j].z -= delz * fpair;
|
||||
}
|
||||
const flt_t fpx = fpair * tdelx[jj];
|
||||
fxtmp += fpx;
|
||||
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||
const flt_t fpy = fpair * tdely[jj];
|
||||
fytmp += fpy;
|
||||
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||
const flt_t fpz = fpair * tdelz[jj];
|
||||
fztmp += fpz;
|
||||
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||
|
||||
if (EVFLAG) {
|
||||
flt_t ev_pre = (flt_t)0;
|
||||
if (NEWTON_PAIR || i < nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
if (NEWTON_PAIR || j < nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
|
||||
if (EFLAG) {
|
||||
sevdwl += ev_pre * evdwl;
|
||||
secoul += ev_pre * ecoul;
|
||||
if (eatom) {
|
||||
if (NEWTON_PAIR || i < nlocal)
|
||||
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||
if (NEWTON_PAIR || j < nlocal)
|
||||
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||
}
|
||||
if (EFLAG) {
|
||||
sevdwl += evdwl;
|
||||
secoul += ecoul;
|
||||
if (eatom) {
|
||||
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||
if (NEWTON_PAIR)
|
||||
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||
}
|
||||
|
||||
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
|
||||
delx, dely, delz);
|
||||
}
|
||||
if (NEWTON_PAIR == 0)
|
||||
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
|
||||
fpx, fpy, fpz);
|
||||
} // for jj
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
|
||||
IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
|
||||
if (NEWTON_PAIR) {
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
} else {
|
||||
f[i].x = fxtmp;
|
||||
f[i].y = fytmp;
|
||||
f[i].z = fztmp;
|
||||
}
|
||||
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||
} // for ii
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||
ov4, ov5);
|
||||
} // end of omp parallel region
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = oecoul;
|
||||
|
||||
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
|
||||
if (EFLAG) {
|
||||
if (NEWTON_PAIR == 0) {
|
||||
oevdwl *= (acc_t)0.5;
|
||||
oecoul *= (acc_t)0.5;
|
||||
}
|
||||
if (vflag) {
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = oecoul;
|
||||
}
|
||||
if (vflag) {
|
||||
if (NEWTON_PAIR == 0) {
|
||||
ov0 *= (acc_t)0.5;
|
||||
ov1 *= (acc_t)0.5;
|
||||
ov2 *= (acc_t)0.5;
|
||||
ov3 *= (acc_t)0.5;
|
||||
ov4 *= (acc_t)0.5;
|
||||
ov5 *= (acc_t)0.5;
|
||||
}
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
@ -503,7 +494,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
||||
else
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
if (EFLAG || vflag)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
@ -514,6 +505,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
|
||||
void PairLJCharmmCoulLongIntel::init_style()
|
||||
{
|
||||
PairLJCharmmCoulLong::init_style();
|
||||
if (force->newton_pair == 0) {
|
||||
neighbor->requests[neighbor->nrequest-1]->half = 0;
|
||||
neighbor->requests[neighbor->nrequest-1]->full = 1;
|
||||
}
|
||||
neighbor->requests[neighbor->nrequest-1]->intel = 1;
|
||||
|
||||
int ifix = modify->find_fix("package_intel");
|
||||
@ -541,11 +536,6 @@ template <class flt_t, class acc_t>
|
||||
void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t,acc_t> *buffers)
|
||||
{
|
||||
int tp1 = atom->ntypes + 1;
|
||||
int ntable = 1;
|
||||
if (ncoultablebits)
|
||||
for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
|
||||
|
||||
int off_ccache = 0;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_cop >= 0) off_ccache = 1;
|
||||
@ -553,6 +543,11 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
buffers->grow_ccache(off_ccache, comm->nthreads, 1);
|
||||
_ccache_stride = buffers->ccache_stride();
|
||||
|
||||
int tp1 = atom->ntypes + 1;
|
||||
int ntable = 1;
|
||||
if (ncoultablebits)
|
||||
for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
|
||||
|
||||
fc.set_ntypes(tp1, ntable, memory, _cop);
|
||||
buffers->set_ntypes(tp1);
|
||||
flt_t **cutneighsq = buffers->get_cutneighsq();
|
||||
|
||||
@ -48,7 +48,7 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||
|
||||
@ -83,57 +83,50 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
|
||||
|
||||
if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
|
||||
fix->start_watch(TIME_PACK);
|
||||
int packthreads;
|
||||
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||
else packthreads = 1;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
||||
#pragma omp parallel if(packthreads > 1)
|
||||
#endif
|
||||
{
|
||||
int ifrom, ito, tid;
|
||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||
nthreads, sizeof(ATOM_T));
|
||||
packthreads, sizeof(ATOM_T));
|
||||
buffers->thr_pack(ifrom,ito,ago);
|
||||
}
|
||||
fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc,
|
||||
const int astart, const int aend)
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc,
|
||||
const int astart, const int aend)
|
||||
{
|
||||
const int inum = aend - astart;
|
||||
if (inum == 0) return;
|
||||
@ -167,9 +160,17 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
const int ntypes = atom->ntypes + 1;
|
||||
const int eatom = this->eflag_atom;
|
||||
|
||||
flt_t * _noalias const ccachex = buffers->get_ccachex();
|
||||
flt_t * _noalias const ccachey = buffers->get_ccachey();
|
||||
flt_t * _noalias const ccachez = buffers->get_ccachez();
|
||||
flt_t * _noalias const ccachew = buffers->get_ccachew();
|
||||
int * _noalias const ccachei = buffers->get_ccachei();
|
||||
int * _noalias const ccachej = buffers->get_ccachej();
|
||||
const int ccache_stride = _ccache_stride;
|
||||
|
||||
// Determine how much data to transfer
|
||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||
buffers, offload, fix, separate_flag,
|
||||
x_size, q_size, ev_size, f_stride);
|
||||
|
||||
@ -204,8 +205,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
in(x:length(x_size) alloc_if(0) free_if(0)) \
|
||||
in(q:length(q_size) alloc_if(0) free_if(0)) \
|
||||
in(overflow:length(0) alloc_if(0) free_if(0)) \
|
||||
in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
|
||||
in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
|
||||
in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
|
||||
in(f_stride,nlocal,minlocal,separate_flag,offload) \
|
||||
in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \
|
||||
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
|
||||
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
|
||||
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
|
||||
@ -220,27 +223,34 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
f_stride, x, q);
|
||||
|
||||
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
oevdwl = oecoul = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
}
|
||||
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
|
||||
// loop over neighbors of my atoms
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
||||
reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int iifrom, iito, tid;
|
||||
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
|
||||
int iifrom, iip, iito, tid;
|
||||
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||
iifrom += astart;
|
||||
iito += astart;
|
||||
|
||||
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
|
||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
int foff;
|
||||
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
|
||||
else foff = -minlocal;
|
||||
FORCE_T * _noalias const f = f_start + foff;
|
||||
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
for (int i = iifrom; i < iito; ++i) {
|
||||
const int toffs = tid * ccache_stride;
|
||||
flt_t * _noalias const tdelx = ccachex + toffs;
|
||||
flt_t * _noalias const tdely = ccachey + toffs;
|
||||
flt_t * _noalias const tdelz = ccachez + toffs;
|
||||
flt_t * _noalias const trsq = ccachew + toffs;
|
||||
int * _noalias const tj = ccachei + toffs;
|
||||
int * _noalias const tjtype = ccachej + toffs;
|
||||
|
||||
for (int i = iifrom; i < iito; i += iip) {
|
||||
const int itype = x[i].w;
|
||||
|
||||
const int ptr_off = itype * ntypes;
|
||||
@ -258,86 +268,98 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
const flt_t ztmp = x[i].z;
|
||||
const flt_t qtmp = q[i];
|
||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||
if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
|
||||
if (NEWTON_PAIR == 0)
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
|
||||
int ej = 0;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
const int j = jlist[jj] & NEIGHMASK;
|
||||
const flt_t delx = xtmp - x[j].x;
|
||||
const flt_t dely = ytmp - x[j].y;
|
||||
const flt_t delz = ztmp - x[j].z;
|
||||
const int jtype = x[j].w;
|
||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
if (rsq < c_forcei[jtype].cutsq) {
|
||||
trsq[ej]=rsq;
|
||||
tdelx[ej]=delx;
|
||||
tdely[ej]=dely;
|
||||
tdelz[ej]=delz;
|
||||
tjtype[ej]=jtype;
|
||||
tj[ej]=jlist[jj];
|
||||
ej++;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
for (int jj = 0; jj < ej; jj++) {
|
||||
flt_t forcecoul, forcelj, evdwl, ecoul;
|
||||
forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
|
||||
|
||||
const int sbindex = jlist[jj] >> SBBITS & 3;
|
||||
const int j = jlist[jj] & NEIGHMASK;
|
||||
|
||||
const flt_t delx = xtmp - x[j].x;
|
||||
const flt_t dely = ytmp - x[j].y;
|
||||
const flt_t delz = ztmp - x[j].z;
|
||||
const int jtype = x[j].w;
|
||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
const int j = tj[jj] & NEIGHMASK;
|
||||
const int sbindex = tj[jj] >> SBBITS & 3;
|
||||
const int jtype = tjtype[jj];
|
||||
const flt_t rsq = trsq[jj];
|
||||
const flt_t r2inv = (flt_t)1.0 / rsq;
|
||||
|
||||
#ifdef INTEL_VMASK
|
||||
if (rsq < c_forcei[jtype].cutsq) {
|
||||
#ifdef INTEL_ALLOW_TABLE
|
||||
if (!ncoultablebits || rsq <= tabinnersq) {
|
||||
#endif
|
||||
#ifdef INTEL_ALLOW_TABLE
|
||||
if (!ncoultablebits || rsq <= tabinnersq) {
|
||||
#endif
|
||||
const flt_t A1 = 0.254829592;
|
||||
const flt_t A2 = -0.284496736;
|
||||
const flt_t A3 = 1.421413741;
|
||||
const flt_t A4 = -1.453152027;
|
||||
const flt_t A5 = 1.061405429;
|
||||
const flt_t EWALD_F = 1.12837917;
|
||||
const flt_t INV_EWALD_P = 1.0 / 0.3275911;
|
||||
const flt_t A1 = 0.254829592;
|
||||
const flt_t A2 = -0.284496736;
|
||||
const flt_t A3 = 1.421413741;
|
||||
const flt_t A4 = -1.453152027;
|
||||
const flt_t A5 = 1.061405429;
|
||||
const flt_t EWALD_F = 1.12837917;
|
||||
const flt_t INV_EWALD_P = 1.0 / 0.3275911;
|
||||
|
||||
const flt_t r = (flt_t)1.0 / sqrt(r2inv);
|
||||
const flt_t grij = g_ewald * r;
|
||||
const flt_t expm2 = exp(-grij * grij);
|
||||
const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
|
||||
const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
|
||||
forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||||
if (EFLAG) ecoul = prefactor * erfc;
|
||||
const flt_t r = (flt_t)1.0 / sqrt(r2inv);
|
||||
const flt_t grij = g_ewald * r;
|
||||
const flt_t expm2 = exp(-grij * grij);
|
||||
const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
|
||||
const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
|
||||
forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||||
if (EFLAG) ecoul = prefactor * erfc;
|
||||
|
||||
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
|
||||
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
|
||||
prefactor;
|
||||
forcecoul -= adjust;
|
||||
if (EFLAG) ecoul -= adjust;
|
||||
|
||||
#ifdef INTEL_ALLOW_TABLE
|
||||
} else {
|
||||
float rsq_lookup = rsq;
|
||||
const int itable = (__intel_castf32_u32(rsq_lookup) &
|
||||
ncoulmask) >> ncoulshiftbits;
|
||||
const flt_t fraction = (rsq_lookup - table[itable].r) *
|
||||
table[itable].dr;
|
||||
|
||||
const flt_t tablet = table[itable].f +
|
||||
fraction * table[itable].df;
|
||||
forcecoul = qtmp * q[j] * tablet;
|
||||
if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
|
||||
fraction * detable[itable]);
|
||||
if (sbindex) {
|
||||
const flt_t table2 = ctable[itable] +
|
||||
fraction * dctable[itable];
|
||||
const flt_t prefactor = qtmp * q[j] * table2;
|
||||
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
|
||||
prefactor;
|
||||
forcecoul -= adjust;
|
||||
if (EFLAG) ecoul -= adjust;
|
||||
|
||||
#ifdef INTEL_ALLOW_TABLE
|
||||
} else {
|
||||
float rsq_lookup = rsq;
|
||||
const int itable = (__intel_castf32_u32(rsq_lookup) &
|
||||
ncoulmask) >> ncoulshiftbits;
|
||||
const flt_t fraction = (rsq_lookup - table[itable].r) *
|
||||
table[itable].dr;
|
||||
|
||||
const flt_t tablet = table[itable].f +
|
||||
fraction * table[itable].df;
|
||||
forcecoul = qtmp * q[j] * tablet;
|
||||
if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
|
||||
fraction * detable[itable]);
|
||||
if (sbindex) {
|
||||
const flt_t table2 = ctable[itable] +
|
||||
fraction * dctable[itable];
|
||||
const flt_t prefactor = qtmp * q[j] * table2;
|
||||
const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
|
||||
prefactor;
|
||||
forcecoul -= adjust;
|
||||
if (EFLAG) ecoul -= adjust;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef INTEL_VMASK
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef INTEL_VMASK
|
||||
if (rsq < c_forcei[jtype].cut_ljsq) {
|
||||
@ -357,80 +379,79 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
#ifdef INTEL_VMASK
|
||||
}
|
||||
#else
|
||||
if (rsq > c_forcei[jtype].cutsq)
|
||||
{ forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
|
||||
if (rsq > c_forcei[jtype].cut_ljsq)
|
||||
{ forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
|
||||
#endif
|
||||
|
||||
#ifdef INTEL_VMASK
|
||||
if (rsq < c_forcei[jtype].cutsq) {
|
||||
#endif
|
||||
const flt_t fpair = (forcecoul + forcelj) * r2inv;
|
||||
fxtmp += delx * fpair;
|
||||
fytmp += dely * fpair;
|
||||
fztmp += delz * fpair;
|
||||
if (NEWTON_PAIR || j < nlocal) {
|
||||
f[j].x -= delx * fpair;
|
||||
f[j].y -= dely * fpair;
|
||||
f[j].z -= delz * fpair;
|
||||
}
|
||||
const flt_t fpair = (forcecoul + forcelj) * r2inv;
|
||||
const flt_t fpx = fpair * tdelx[jj];
|
||||
fxtmp += fpx;
|
||||
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||
const flt_t fpy = fpair * tdely[jj];
|
||||
fytmp += fpy;
|
||||
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||
const flt_t fpz = fpair * tdelz[jj];
|
||||
fztmp += fpz;
|
||||
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||
|
||||
if (EVFLAG) {
|
||||
flt_t ev_pre = (flt_t)0;
|
||||
if (NEWTON_PAIR || i < nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
if (NEWTON_PAIR || j < nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
|
||||
if (EFLAG) {
|
||||
sevdwl += ev_pre * evdwl;
|
||||
secoul += ev_pre * ecoul;
|
||||
if (eatom) {
|
||||
if (NEWTON_PAIR || i < nlocal)
|
||||
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||
if (NEWTON_PAIR || j < nlocal)
|
||||
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||
}
|
||||
}
|
||||
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
|
||||
if (EFLAG) {
|
||||
sevdwl += evdwl;
|
||||
secoul += ecoul;
|
||||
if (eatom) {
|
||||
fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||
if (NEWTON_PAIR)
|
||||
f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
|
||||
}
|
||||
#ifdef INTEL_VMASK
|
||||
}
|
||||
#endif
|
||||
}
|
||||
if (NEWTON_PAIR == 0)
|
||||
IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
|
||||
fpx, fpy, fpz);
|
||||
} // for jj
|
||||
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
|
||||
if (NEWTON_PAIR) {
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
} else {
|
||||
f[i].x = fxtmp;
|
||||
f[i].y = fytmp;
|
||||
f[i].z = fztmp;
|
||||
}
|
||||
|
||||
IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||
} // for ii
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||
ov4, ov5);
|
||||
} // end of omp parallel region
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = oecoul;
|
||||
|
||||
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
|
||||
if (EFLAG) {
|
||||
if (NEWTON_PAIR == 0) {
|
||||
oevdwl *= (acc_t)0.5;
|
||||
oecoul *= (acc_t)0.5;
|
||||
}
|
||||
if (vflag) {
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = oecoul;
|
||||
}
|
||||
if (vflag) {
|
||||
if (NEWTON_PAIR == 0) {
|
||||
ov0 *= (acc_t)0.5;
|
||||
ov1 *= (acc_t)0.5;
|
||||
ov2 *= (acc_t)0.5;
|
||||
ov3 *= (acc_t)0.5;
|
||||
ov4 *= (acc_t)0.5;
|
||||
ov5 *= (acc_t)0.5;
|
||||
}
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
@ -442,7 +463,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
else
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
if (EFLAG || vflag)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
@ -453,6 +474,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
|
||||
void PairLJCutCoulLongIntel::init_style()
|
||||
{
|
||||
PairLJCutCoulLong::init_style();
|
||||
if (force->newton_pair == 0) {
|
||||
neighbor->requests[neighbor->nrequest-1]->half = 0;
|
||||
neighbor->requests[neighbor->nrequest-1]->full = 1;
|
||||
}
|
||||
neighbor->requests[neighbor->nrequest-1]->intel = 1;
|
||||
|
||||
int ifix = modify->find_fix("package_intel");
|
||||
@ -480,6 +505,13 @@ template <class flt_t, class acc_t>
|
||||
void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t,acc_t> *buffers)
|
||||
{
|
||||
int off_ccache = 0;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_cop >= 0) off_ccache = 1;
|
||||
#endif
|
||||
buffers->grow_ccache(off_ccache, comm->nthreads, 1);
|
||||
_ccache_stride = buffers->ccache_stride();
|
||||
|
||||
int tp1 = atom->ntypes + 1;
|
||||
int ntable = 1;
|
||||
if (ncoultablebits)
|
||||
@ -514,6 +546,9 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
|
||||
for (int i = 0; i < tp1; i++) {
|
||||
for (int j = 0; j < tp1; j++) {
|
||||
if (cutsq[i][j] < cut_ljsq[i][j])
|
||||
error->all(FLERR,
|
||||
"Intel variant of lj/cut/coul/long expects lj cutoff<=coulombic");
|
||||
fc.c_force[i][j].cutsq = cutsq[i][j];
|
||||
fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j];
|
||||
fc.c_force[i][j].lj1 = lj1[i][j];
|
||||
|
||||
@ -42,13 +42,13 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong {
|
||||
|
||||
private:
|
||||
FixIntel *fix;
|
||||
int _cop, _lrt;
|
||||
int _cop, _lrt, _ccache_stride;
|
||||
|
||||
template <class flt_t> class ForceConst;
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||
|
||||
@ -75,85 +75,64 @@ void PairLJCutIntel::compute(int eflag, int vflag,
|
||||
if (ago != 0 && fix->separate_buffers() == 0) {
|
||||
fix->start_watch(TIME_PACK);
|
||||
|
||||
int packthreads;
|
||||
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||
else packthreads = 1;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
||||
#pragma omp parallel if(packthreads > 1)
|
||||
#endif
|
||||
{
|
||||
int ifrom, ito, tid;
|
||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||
nthreads, sizeof(ATOM_T));
|
||||
packthreads, sizeof(ATOM_T));
|
||||
buffers->thr_pack(ifrom,ito,ago);
|
||||
}
|
||||
fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (_onetype) {
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<1,0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<1,0,0,1>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<0,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t,
|
||||
class acc_t>
|
||||
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc,
|
||||
@ -181,7 +160,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||
|
||||
// Determine how much data to transfer
|
||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||
buffers, offload, fix, separate_flag,
|
||||
x_size, q_size, ev_size, f_stride);
|
||||
|
||||
@ -200,25 +179,24 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||
f_stride, x, 0);
|
||||
|
||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
oevdwl = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
}
|
||||
if (EFLAG) oevdwl = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
|
||||
// loop over neighbors of my atoms
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
||||
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int iifrom, iito, tid;
|
||||
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
|
||||
int iifrom, iip, iito, tid;
|
||||
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||
iifrom += astart;
|
||||
iito += astart;
|
||||
|
||||
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
|
||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
int foff;
|
||||
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
|
||||
else foff = -minlocal;
|
||||
FORCE_T * _noalias const f = f_start + foff;
|
||||
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
flt_t cutsq, lj1, lj2, lj3, lj4, offset;
|
||||
if (ONETYPE) {
|
||||
@ -229,7 +207,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||
lj4 = lj34[3].lj4;
|
||||
offset = ljc12o[3].offset;
|
||||
}
|
||||
for (int i = iifrom; i < iito; ++i) {
|
||||
for (int i = iifrom; i < iito; i += iip) {
|
||||
int itype, ptr_off;
|
||||
const FC_PACKED1_T * _noalias ljc12oi;
|
||||
const FC_PACKED2_T * _noalias lj34i;
|
||||
@ -250,10 +228,9 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||
const flt_t ytmp = x[i].y;
|
||||
const flt_t ztmp = x[i].z;
|
||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
}
|
||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||
if (NEWTON_PAIR == 0)
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
@ -301,83 +278,84 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||
else
|
||||
fpair = forcelj * r2inv;
|
||||
|
||||
fxtmp += delx * fpair;
|
||||
fytmp += dely * fpair;
|
||||
fztmp += delz * fpair;
|
||||
if (NEWTON_PAIR || j < nlocal) {
|
||||
f[j].x -= delx * fpair;
|
||||
f[j].y -= dely * fpair;
|
||||
f[j].z -= delz * fpair;
|
||||
}
|
||||
const flt_t fpx = fpair * delx;
|
||||
fxtmp += fpx;
|
||||
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||
const flt_t fpy = fpair * dely;
|
||||
fytmp += fpy;
|
||||
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||
const flt_t fpz = fpair * delz;
|
||||
fztmp += fpz;
|
||||
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||
|
||||
if (EVFLAG) {
|
||||
flt_t ev_pre = (flt_t)0;
|
||||
if (NEWTON_PAIR || i<nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
if (NEWTON_PAIR || j<nlocal)
|
||||
ev_pre += (flt_t)0.5;
|
||||
|
||||
if (EFLAG) {
|
||||
if (!ONETYPE) {
|
||||
lj3 = lj34i[jtype].lj3;
|
||||
lj4 = lj34i[jtype].lj4;
|
||||
offset = ljc12oi[jtype].offset;
|
||||
}
|
||||
evdwl = r6inv * (lj3 * r6inv - lj4);
|
||||
#ifdef INTEL_VMASK
|
||||
evdwl -= offset;
|
||||
#else
|
||||
if (rsq < cutsq) evdwl -= offset;
|
||||
#endif
|
||||
if (!ONETYPE) evdwl *= factor_lj;
|
||||
sevdwl += ev_pre*evdwl;
|
||||
if (eatom) {
|
||||
if (NEWTON_PAIR || i < nlocal)
|
||||
fwtmp += 0.5 * evdwl;
|
||||
if (NEWTON_PAIR || j < nlocal)
|
||||
f[j].w += 0.5 * evdwl;
|
||||
}
|
||||
if (EFLAG) {
|
||||
if (!ONETYPE) {
|
||||
lj3 = lj34i[jtype].lj3;
|
||||
lj4 = lj34i[jtype].lj4;
|
||||
offset = ljc12oi[jtype].offset;
|
||||
}
|
||||
evdwl = r6inv * (lj3 * r6inv - lj4);
|
||||
#ifdef INTEL_VMASK
|
||||
evdwl -= offset;
|
||||
#else
|
||||
if (rsq < cutsq) evdwl -= offset;
|
||||
#endif
|
||||
if (!ONETYPE) evdwl *= factor_lj;
|
||||
sevdwl += evdwl;
|
||||
if (eatom) {
|
||||
fwtmp += (flt_t)0.5 * evdwl;
|
||||
if (NEWTON_PAIR)
|
||||
f[j].w += (flt_t)0.5 * evdwl;
|
||||
}
|
||||
}
|
||||
|
||||
IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
|
||||
delx, dely, delz);
|
||||
}
|
||||
if (NEWTON_PAIR == 0)
|
||||
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
|
||||
#ifdef INTEL_VMASK
|
||||
} // if rsq
|
||||
#endif
|
||||
} // for jj
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
if (NEWTON_PAIR) {
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
} else {
|
||||
f[i].x = fxtmp;
|
||||
f[i].y = fytmp;
|
||||
f[i].z = fztmp;
|
||||
}
|
||||
|
||||
IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
|
||||
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||
} // for ii
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||
ov4, ov5);
|
||||
} // end omp
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = (acc_t)0.0;
|
||||
}
|
||||
if (vflag) {
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
|
||||
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
|
||||
if (EFLAG) {
|
||||
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = (acc_t)0.0;
|
||||
}
|
||||
if (vflag) {
|
||||
if (NEWTON_PAIR == 0) {
|
||||
ov0 *= (acc_t)0.5;
|
||||
ov1 *= (acc_t)0.5;
|
||||
ov2 *= (acc_t)0.5;
|
||||
ov3 *= (acc_t)0.5;
|
||||
ov4 *= (acc_t)0.5;
|
||||
ov5 *= (acc_t)0.5;
|
||||
}
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
@ -389,7 +367,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||
else
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
if (EFLAG || vflag)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
@ -400,6 +378,10 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
|
||||
void PairLJCutIntel::init_style()
|
||||
{
|
||||
PairLJCut::init_style();
|
||||
if (force->newton_pair == 0) {
|
||||
neighbor->requests[neighbor->nrequest-1]->half = 0;
|
||||
neighbor->requests[neighbor->nrequest-1]->full = 1;
|
||||
}
|
||||
neighbor->requests[neighbor->nrequest-1]->intel = 1;
|
||||
|
||||
int ifix = modify->find_fix("package_intel");
|
||||
|
||||
@ -45,8 +45,7 @@ class PairLJCutIntel : public PairLJCut {
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t,
|
||||
class acc_t>
|
||||
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||
|
||||
50
src/USER-INTEL/pair_lj_long_coul_long_intel.cpp
Normal file
50
src/USER-INTEL/pair_lj_long_coul_long_intel.cpp
Normal file
@ -0,0 +1,50 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: William McDoniel (RWTH Aachen University)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <math.h>
|
||||
#include "pair_lj_long_coul_long_intel.h"
|
||||
#include "atom.h"
|
||||
#include "comm.h"
|
||||
#include "force.h"
|
||||
#include "group.h"
|
||||
#include "kspace.h"
|
||||
#include "memory.h"
|
||||
#include "neighbor.h"
|
||||
#include "neigh_list.h"
|
||||
#include "neigh_request.h"
|
||||
#include "memory.h"
|
||||
#include "suffix.h"
|
||||
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
#define C_FORCE_T typename ForceConst<flt_t>::c_force_t
|
||||
#define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
|
||||
#define TABLE_T typename ForceConst<flt_t>::table_t
|
||||
|
||||
PairLJLongCoulLongIntel::PairLJLongCoulLongIntel(LAMMPS *lmp) :
|
||||
PairLJLongCoulLong(lmp)
|
||||
{
|
||||
suffix_flag |= Suffix::INTEL;
|
||||
respa_enable = 0;
|
||||
cut_respa = NULL;
|
||||
}
|
||||
|
||||
|
||||
PairLJLongCoulLongIntel::~PairLJLongCoulLongIntel()
|
||||
{
|
||||
}
|
||||
39
src/USER-INTEL/pair_lj_long_coul_long_intel.h
Normal file
39
src/USER-INTEL/pair_lj_long_coul_long_intel.h
Normal file
@ -0,0 +1,39 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: William McDoniel (RWTH Aachen University)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef PAIR_CLASS
|
||||
|
||||
PairStyle(lj/long/coul/long/intel,PairLJLongCoulLongIntel)
|
||||
|
||||
#else
|
||||
|
||||
#ifndef LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H
|
||||
#define LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H
|
||||
|
||||
#include "pair_lj_long_coul_long.h"
|
||||
#include "fix_intel.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
class PairLJLongCoulLongIntel : public PairLJLongCoulLong {
|
||||
public:
|
||||
PairLJLongCoulLongIntel(class LAMMPS *);
|
||||
virtual ~PairLJLongCoulLongIntel();
|
||||
|
||||
};
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
@ -109,85 +109,59 @@ void PairSWIntel::compute(int eflag, int vflag,
|
||||
if (ago != 0 && fix->separate_buffers() == 0) {
|
||||
fix->start_watch(TIME_PACK);
|
||||
|
||||
int packthreads;
|
||||
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||
else packthreads = 1;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
||||
#pragma omp parallel if(packthreads > 1)
|
||||
#endif
|
||||
{
|
||||
int ifrom, ito, tid;
|
||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||
nthreads, sizeof(ATOM_T));
|
||||
packthreads, sizeof(ATOM_T));
|
||||
buffers->thr_pack(ifrom, ito, ago);
|
||||
}
|
||||
|
||||
fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (_onetype) {
|
||||
if (_spq) {
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||
} else {
|
||||
eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||
}
|
||||
if (eflag) {
|
||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||
} else {
|
||||
eval<1,1,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<1,1,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
|
||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||
}
|
||||
} else {
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||
} else {
|
||||
eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||
}
|
||||
if (eflag) {
|
||||
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||
} else {
|
||||
eval<0,1,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<0,1,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
|
||||
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (_spq) {
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
eval<1,0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<1,0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||
} else {
|
||||
eval<1,0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<1,0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||
}
|
||||
if (eflag) {
|
||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||
} else {
|
||||
eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
|
||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||
}
|
||||
} else {
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
eval<0,0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<0,0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||
} else {
|
||||
eval<0,0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<0,0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||
}
|
||||
if (eflag) {
|
||||
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||
} else {
|
||||
eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
|
||||
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
|
||||
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -196,7 +170,7 @@ void PairSWIntel::compute(int eflag, int vflag,
|
||||
/* ---------------------------------------------------------------------- */
|
||||
#ifndef LMP_USE_AVXCD
|
||||
|
||||
template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t>
|
||||
template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t>
|
||||
void PairSWIntel::eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc, const int astart,
|
||||
@ -235,7 +209,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
|
||||
// Determine how much data to transfer
|
||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||
IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EVFLAG, EFLAG, vflag,
|
||||
IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EFLAG, vflag,
|
||||
buffers, offload, fix, separate_flag,
|
||||
x_size, q_size, ev_size, f_stride);
|
||||
|
||||
@ -276,19 +250,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
f_stride, x, 0);
|
||||
|
||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
oevdwl = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
}
|
||||
if (EFLAG) oevdwl = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
||||
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int iifrom, iito, tid;
|
||||
IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
|
||||
int iifrom, iip, iito, tid;
|
||||
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||
iifrom += astart;
|
||||
iito += astart;
|
||||
|
||||
@ -328,7 +298,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = iifrom; i < iito; ++i) {
|
||||
for (int i = iifrom; i < iito; i += iip) {
|
||||
int itype, itype_offset;
|
||||
const flt_t xtmp = x[i].x;
|
||||
const flt_t ytmp = x[i].y;
|
||||
@ -344,14 +314,13 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
const int jnumhalf = numneighhalf[i];
|
||||
|
||||
acc_t fxtmp, fytmp, fztmp, fwtmp;
|
||||
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
acc_t sevdwl;
|
||||
fxtmp = fytmp = fztmp = (acc_t)0.0;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
}
|
||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||
|
||||
int ejnum = 0, ejnumhalf = 0;
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
int j = jlist[jj];
|
||||
j &= NEIGHMASK;
|
||||
@ -390,8 +359,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl)
|
||||
#endif
|
||||
for (int jj = 0; jj < ejnum_pad; jj++) {
|
||||
acc_t fjxtmp, fjytmp, fjztmp, fjtmp;
|
||||
@ -399,9 +367,6 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
if (EFLAG) fjtmp = (acc_t)0.0;
|
||||
int ijtype;
|
||||
|
||||
const flt_t delx = tdelx[jj];
|
||||
const flt_t dely = tdely[jj];
|
||||
const flt_t delz = tdelz[jj];
|
||||
if (!ONETYPE) ijtype = tjtype[jj] + itype_offset;
|
||||
const flt_t rsq1 = trsq[jj];
|
||||
|
||||
@ -440,29 +405,31 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
const flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) *
|
||||
rainvsq) * expsrainv * rinvsq1;
|
||||
|
||||
fxtmp -= delx * fpair;
|
||||
fytmp -= dely * fpair;
|
||||
fztmp -= delz * fpair;
|
||||
fjxtmp += delx * fpair;
|
||||
fjytmp += dely * fpair;
|
||||
fjztmp += delz * fpair;
|
||||
const flt_t delx = tdelx[jj];
|
||||
const flt_t dely = tdely[jj];
|
||||
const flt_t delz = tdelz[jj];
|
||||
const flt_t fpx = fpair * delx;
|
||||
fxtmp -= fpx;
|
||||
fjxtmp += fpx;
|
||||
const flt_t fpy = fpair * dely;
|
||||
fytmp -= fpy;
|
||||
fjytmp += fpy;
|
||||
const flt_t fpz = fpair * delz;
|
||||
fztmp -= fpz;
|
||||
fjztmp += fpz;
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
flt_t evdwl;
|
||||
if (!ONETYPE) {
|
||||
c5 = p2e[ijtype].c5;
|
||||
c6 = p2e[ijtype].c6;
|
||||
}
|
||||
evdwl = (c5 * rp - c6 * rq) * expsrainv;
|
||||
sevdwl += evdwl;
|
||||
if (eatom) {
|
||||
fwtmp += (acc_t)0.5 * evdwl;
|
||||
fjtmp += (acc_t)0.5 * evdwl;
|
||||
}
|
||||
}
|
||||
IP_PRE_ev_tally_nbor(vflag, (flt_t)1.0, fpair,
|
||||
-delx, -dely, -delz);
|
||||
if (EFLAG) {
|
||||
flt_t evdwl;
|
||||
if (!ONETYPE) {
|
||||
c5 = p2e[ijtype].c5;
|
||||
c6 = p2e[ijtype].c6;
|
||||
}
|
||||
evdwl = (c5 * rp - c6 * rq) * expsrainv;
|
||||
sevdwl += evdwl;
|
||||
if (eatom) {
|
||||
fwtmp += (flt_t)0.5 * evdwl;
|
||||
fjtmp += (flt_t)0.5 * evdwl;
|
||||
}
|
||||
}
|
||||
|
||||
/*---------------------------------------------*/
|
||||
@ -533,17 +500,13 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
fjytmp += fjy;
|
||||
fjztmp += fjz;
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
const flt_t evdwl = facrad * (flt_t)0.5;
|
||||
sevdwl += evdwl;
|
||||
if (eatom) {
|
||||
fwtmp += (acc_t)0.33333333 * evdwl;
|
||||
fjtmp += (acc_t)0.33333333 * facrad;
|
||||
}
|
||||
if (EFLAG) {
|
||||
const flt_t evdwl = facrad * (flt_t)0.5;
|
||||
sevdwl += evdwl;
|
||||
if (eatom) {
|
||||
fwtmp += (acc_t)0.33333333 * evdwl;
|
||||
fjtmp += (acc_t)0.33333333 * facrad;
|
||||
}
|
||||
IP_PRE_ev_tally_nbor3v(vflag, fjx, fjy, fjz,
|
||||
delx, dely, delz);
|
||||
}
|
||||
} // for kk
|
||||
const int j = tj[jj];
|
||||
@ -557,34 +520,31 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
|
||||
|
||||
if (EFLAG) {
|
||||
f[i].w += fwtmp;
|
||||
oevdwl += sevdwl;
|
||||
}
|
||||
} // for ii
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(1, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
} // end omp
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = (acc_t)0.0;
|
||||
}
|
||||
if (vflag) {
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
|
||||
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
|
||||
if (EFLAG) {
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = (acc_t)0.0;
|
||||
}
|
||||
if (vflag) {
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
@ -595,7 +555,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
else
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
if (EFLAG || vflag)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
@ -614,7 +574,7 @@ authors for more details.
|
||||
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t>
|
||||
template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t>
|
||||
void PairSWIntel::eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc, const int astart,
|
||||
@ -659,7 +619,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
|
||||
// Determine how much data to transfer
|
||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||
IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EVFLAG, EFLAG, vflag,
|
||||
IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EFLAG, vflag,
|
||||
buffers, offload, fix, separate_flag,
|
||||
x_size, q_size, ev_size, f_stride);
|
||||
|
||||
@ -701,19 +661,17 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
f_stride, x, 0);
|
||||
|
||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
oevdwl = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
}
|
||||
if (EFLAG) oevdwl = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
||||
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int iifrom, iito, tid;
|
||||
IP_PRE_omp_range_id_vec(iifrom, iito, tid, inum, nthreads, swidth);
|
||||
int iifrom, iip, iito, tid;
|
||||
IP_PRE_omp_stride_id_vec(iifrom, iip, iito, tid, inum, nthreads,
|
||||
swidth);
|
||||
|
||||
iifrom += astart;
|
||||
iito += astart;
|
||||
|
||||
@ -760,7 +718,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
144,160,176,192,208,224,240);
|
||||
ilist = ilist + iifrom;
|
||||
acc_t * const dforce = &(f[0].x);
|
||||
for (int i = iifrom; i < iito; i += swidth) {
|
||||
for (int i = iifrom; i < iito; i += iip) {
|
||||
SIMD_mask imask = ilist < iito;
|
||||
SIMD_flt_t xtmp, ytmp, ztmp;
|
||||
SIMD_int itype, itype_offset;
|
||||
@ -793,20 +751,10 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
if (EFLAG) fwtmp2 = SIMD_set((acc_t)0);
|
||||
}
|
||||
|
||||
SIMD_acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
fwtmp = SIMD_set((acc_t)0);
|
||||
sevdwl = SIMD_set((acc_t)0);
|
||||
}
|
||||
if (vflag==1) {
|
||||
sv0 = SIMD_set((acc_t)0);
|
||||
sv1 = SIMD_set((acc_t)0);
|
||||
sv2 = SIMD_set((acc_t)0);
|
||||
sv3 = SIMD_set((acc_t)0);
|
||||
sv4 = SIMD_set((acc_t)0);
|
||||
sv5 = SIMD_set((acc_t)0);
|
||||
}
|
||||
SIMD_acc_t sevdwl;
|
||||
if (EFLAG) {
|
||||
fwtmp = SIMD_set((acc_t)0);
|
||||
sevdwl = SIMD_set((acc_t)0);
|
||||
}
|
||||
|
||||
SIMD_int ejnum = SIMD_set(0);
|
||||
@ -930,19 +878,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2,
|
||||
fztmp2, fjxtmp2, fjytmp2, fjztmp2);
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
if (!ONETYPE) {
|
||||
c5 = SIMD_gather(&(p2e[0].c5), ijtype);
|
||||
c6 = SIMD_gather(&(p2e[0].c6), ijtype);
|
||||
}
|
||||
SIMD_flt_t evdwl;
|
||||
evdwl = (c5 * rp - c6 * rq) * expsrainv;
|
||||
SIMD_acc_energy3(hmask, evdwl, eatom, sevdwl, fwtmp, fjtmp,
|
||||
fwtmp2, fjtmp2);
|
||||
if (EFLAG) {
|
||||
if (!ONETYPE) {
|
||||
c5 = SIMD_gather(&(p2e[0].c5), ijtype);
|
||||
c6 = SIMD_gather(&(p2e[0].c6), ijtype);
|
||||
}
|
||||
SIMD_ev_tally_nbor(hmask, vflag, (flt_t)1.0, fpair, delx, dely,
|
||||
delz, sv0, sv1, sv2, sv3, sv4, sv5);
|
||||
SIMD_flt_t evdwl;
|
||||
evdwl = (c5 * rp - c6 * rq) * expsrainv;
|
||||
SIMD_acc_energy3(hmask, evdwl, eatom, sevdwl, fwtmp, fjtmp,
|
||||
fwtmp2, fjtmp2);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1012,21 +956,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
fztmp2, fjxtmp2, fjytmp2, fjztmp2,
|
||||
tf + kcoffset * 3, swidth);
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
SIMD_int k;
|
||||
if (eatom) {
|
||||
k = SIMD_load(tj + kcoffset);
|
||||
k = k << 4;
|
||||
}
|
||||
SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp,
|
||||
fwtmp2, fjtmp2, k, dforce);
|
||||
if (EFLAG) {
|
||||
SIMD_int k;
|
||||
if (eatom) {
|
||||
k = SIMD_load(tj + kcoffset);
|
||||
k = k << 4;
|
||||
}
|
||||
SIMD_ev_tally_nbor3v(kmask, vflag, fjx, fjy, fjz, fkx, fky, fkz,
|
||||
delx, dely, delz, delr2x, delr2y, delr2z,
|
||||
sv0, sv1, sv2, sv3, sv4, sv5);
|
||||
SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp,
|
||||
fwtmp2, fjtmp2, k, dforce);
|
||||
}
|
||||
|
||||
} // for kk
|
||||
if (is_same<flt_t,acc_t>::value == 1)
|
||||
SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp);
|
||||
@ -1087,52 +1025,34 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
} // for jj second loop
|
||||
|
||||
SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp,
|
||||
EVFLAG, eatom, fwtmp);
|
||||
EFLAG, eatom, fwtmp);
|
||||
if (is_same<flt_t,acc_t>::value == 0) {
|
||||
imask = imask >> 8;
|
||||
SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2,
|
||||
fztmp2, EVFLAG, eatom, fwtmp2);
|
||||
fztmp2, EFLAG, eatom, fwtmp2);
|
||||
}
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) oevdwl += SIMD_sum(sevdwl);
|
||||
if (vflag == 1) {
|
||||
ov0 += SIMD_sum(sv0);
|
||||
ov1 += SIMD_sum(sv1);
|
||||
ov2 += SIMD_sum(sv2);
|
||||
ov3 += SIMD_sum(sv3);
|
||||
ov4 += SIMD_sum(sv4);
|
||||
ov5 += SIMD_sum(sv5);
|
||||
}
|
||||
}
|
||||
ilist = ilist + swidth;
|
||||
if (EFLAG) oevdwl += SIMD_sum(sevdwl);
|
||||
ilist = ilist + iip;
|
||||
} // for ii
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(1, EVFLAG, EFLAG, vflag, eatom, nall, nlocal,
|
||||
minlocal, nthreads, f_start, f_stride, x,
|
||||
offload);
|
||||
}
|
||||
IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
} // end omp
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = (acc_t)0.0;
|
||||
}
|
||||
if (vflag) {
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
|
||||
if (EFLAG) {
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = (acc_t)0.0;
|
||||
}
|
||||
if (vflag) {
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
@ -1143,7 +1063,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
|
||||
else
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
if (EFLAG || vflag)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
@ -1212,6 +1132,7 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
#ifdef LMP_USE_AVXCD
|
||||
fix->nbor_pack_width(SIMD_type<flt_t>::width());
|
||||
#endif
|
||||
fix->three_body_neighbor(1);
|
||||
|
||||
int off_ccache = 0;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
|
||||
@ -46,7 +46,7 @@ class PairSWIntel : public PairSW {
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t>
|
||||
template <int SPQ, int ONETYPE, int EFLAG, class flt_t, class acc_t>
|
||||
void eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc,
|
||||
const int astart, const int aend, const int pad_width);
|
||||
|
||||
@ -119,32 +119,30 @@ void PairTersoffIntel::compute(int eflag, int vflag,
|
||||
|
||||
if (ago != 0 && fix->separate_buffers() == 0) {
|
||||
fix->start_watch(TIME_PACK);
|
||||
int packthreads;
|
||||
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||
else packthreads = 1;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
|
||||
#pragma omp parallel if(packthreads > 1)
|
||||
#endif
|
||||
{
|
||||
int ifrom, ito, tid;
|
||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||
nthreads, sizeof(ATOM_T));
|
||||
packthreads, sizeof(ATOM_T));
|
||||
buffers->thr_pack(ifrom,ito,ago);
|
||||
}
|
||||
fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
if (evflag || vflag_fdotr) {
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (eflag) {
|
||||
eval<1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
|
||||
eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
|
||||
eval<0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
|
||||
@ -202,7 +200,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
|
||||
);
|
||||
|
||||
// perform the actual computation
|
||||
template<bool EVFLAG, bool EFLAG>
|
||||
template<bool EFLAG>
|
||||
static void kernel(
|
||||
int iito, int iifrom, int eatom, int vflag,
|
||||
const int * _noalias const numneigh,
|
||||
@ -213,11 +211,11 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
|
||||
const c_inner_t * _noalias const c_inner,
|
||||
const c_outer_t * _noalias const c_outer,
|
||||
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
||||
acc_t *evdwl, acc_t *ov0, acc_t * ov1, acc_t *ov2, acc_t* ov3, acc_t *ov4, acc_t *ov5
|
||||
acc_t *evdwl
|
||||
);
|
||||
|
||||
// perform one step of calculation, pass in i-j pairs of atoms (is, js)
|
||||
template<int EVFLAG, int EFLAG>
|
||||
template<int EFLAG>
|
||||
static void kernel_step(
|
||||
int eatom, int vflag,
|
||||
const int * _noalias const numneigh,
|
||||
@ -228,13 +226,12 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
|
||||
const c_inner_t * _noalias const c_inner,
|
||||
const c_outer_t * _noalias const c_outer,
|
||||
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
||||
avec *vsevdwl, avec *vsv0, avec * vsv1, avec *vsv2, avec* vsv3, avec *vsv4, avec *vsv5,
|
||||
int compress_idx, iarr is, iarr js, bvec vmask_repulsive
|
||||
avec *vsevdwl, int compress_idx, iarr is, iarr js, bvec vmask_repulsive
|
||||
);
|
||||
|
||||
// perform one step of calculation, as opposed to the previous method now
|
||||
// with fixed i and a number of js
|
||||
template<int EVFLAG, int EFLAG>
|
||||
template<int EFLAG>
|
||||
static void kernel_step_const_i(
|
||||
int eatom, int vflag,
|
||||
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
|
||||
@ -243,8 +240,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
|
||||
const c_inner_t * _noalias const c_inner,
|
||||
const c_outer_t * _noalias const c_outer,
|
||||
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
||||
avec *vsevdwl, avec *vsv0, avec *vsv1, avec *vsv2, avec *vsv3, avec *vsv4, avec *vsv5,
|
||||
int compress_idx, int i, iarr js, bvec vmask_repulsive
|
||||
avec *vsevdwl, int compress_idx, int i, iarr js, bvec vmask_repulsive
|
||||
);
|
||||
};
|
||||
|
||||
@ -257,7 +253,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
|
||||
// Dispatch to correct kernel instatiation and perform all the work neccesary
|
||||
// for offloading. In this routine we enter the Phi.
|
||||
// This method is nearly identical to what happens in the other /intel styles
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
template <int EFLAG, class flt_t, class acc_t>
|
||||
void PairTersoffIntel::eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc,
|
||||
@ -292,7 +288,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
|
||||
|
||||
// Determine how much data to transfer
|
||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
|
||||
IP_PRE_get_transfern(ago, 1, EFLAG, vflag,
|
||||
buffers, offload, fix, separate_flag,
|
||||
x_size, q_size, ev_size, f_stride);
|
||||
|
||||
@ -330,20 +326,16 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
|
||||
#endif
|
||||
#endif
|
||||
|
||||
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
|
||||
f_stride, x, 0);
|
||||
IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall,
|
||||
f_stride, x, 0);
|
||||
|
||||
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EVFLAG) {
|
||||
oevdwl = oecoul = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
}
|
||||
if (EFLAG) oevdwl = oecoul = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
|
||||
// loop over neighbors of my atoms
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,nlocal,nall,minlocal) \
|
||||
reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int iifrom, iito, tid;
|
||||
@ -355,10 +347,10 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
|
||||
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
{
|
||||
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
sevdwl = sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = 0.;
|
||||
acc_t sevdwl;
|
||||
sevdwl = 0.;
|
||||
#define ARGS iito, iifrom, eatom, vflag, numneigh, numneighhalf, cnumneigh, \
|
||||
firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl, &sv0, &sv1, &sv2, &sv3, &sv4, &sv5
|
||||
firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl
|
||||
// Pick the variable i algorithm under specific conditions
|
||||
// do use scalar algorithm with very short vectors
|
||||
int VL = lmp_intel::vector_routines<flt_t,acc_t,lmp_intel::mode>::VL;
|
||||
@ -366,50 +358,34 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
|
||||
lmp_intel::vector_traits<lmp_intel::mode>::support_integer_and_gather_ops;
|
||||
bool use_scalar = VL < 4;
|
||||
if (use_scalar) {
|
||||
IntelKernelTersoff<flt_t,acc_t,lmp_intel::NONE,false>::kernel<EVFLAG,EFLAG>(ARGS);
|
||||
IntelKernelTersoff<flt_t,acc_t,lmp_intel::NONE,false>::kernel<EFLAG>(ARGS);
|
||||
} else if (pack_i) {
|
||||
IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,true >::kernel<EVFLAG,EFLAG>(ARGS);
|
||||
IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,true >::kernel<EFLAG>(ARGS);
|
||||
} else {
|
||||
IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EVFLAG,EFLAG>(ARGS);
|
||||
}
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) oevdwl += sevdwl;
|
||||
if (vflag == 1) {
|
||||
ov0 += sv0;
|
||||
ov1 += sv1;
|
||||
ov2 += sv2;
|
||||
ov3 += sv3;
|
||||
ov4 += sv4;
|
||||
ov5 += sv5;
|
||||
}
|
||||
IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EFLAG>(ARGS);
|
||||
}
|
||||
if (EFLAG) oevdwl += sevdwl;
|
||||
}
|
||||
|
||||
#ifndef _LMP_INTEL_OFFLOAD
|
||||
if (vflag == 2)
|
||||
#endif
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp barrier
|
||||
#endif
|
||||
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall,
|
||||
nlocal, minlocal, nthreads, f_start, f_stride,
|
||||
x, offload);
|
||||
}
|
||||
IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start,
|
||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||
ov4, ov5);
|
||||
} // end of omp parallel region
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = 0.0;
|
||||
}
|
||||
if (vflag) {
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
|
||||
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
|
||||
if (EFLAG) {
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = 0.0;
|
||||
}
|
||||
if (vflag) {
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
@ -424,7 +400,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
|
||||
else
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EVFLAG)
|
||||
if (EFLAG || vflag)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
@ -457,6 +433,7 @@ void PairTersoffIntel::init_style()
|
||||
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||
|
||||
fix->pair_init_check();
|
||||
fix->three_body_neighbor(1);
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
_cop = fix->coprocessor_number();
|
||||
#endif
|
||||
@ -663,7 +640,7 @@ void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
||||
static const int N_CACHE = 8;
|
||||
|
||||
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
|
||||
template<int EVFLAG, int EFLAG>
|
||||
template<int EFLAG>
|
||||
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
|
||||
int eatom, int vflag,
|
||||
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
|
||||
@ -673,12 +650,6 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
|
||||
const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
|
||||
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
||||
avec *vsevdwl,
|
||||
avec *vsv0,
|
||||
avec *vsv1,
|
||||
avec *vsv2,
|
||||
avec* vsv3,
|
||||
avec *vsv4,
|
||||
avec *vsv5,
|
||||
int compress_idx,
|
||||
iarr is,
|
||||
iarr js,
|
||||
@ -829,20 +800,10 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
|
||||
vfjytmp = vfjytmp * vprefactor - vdy_ij * vfpair;
|
||||
vfjztmp = vfjztmp * vprefactor - vdz_ij * vfpair;
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
*vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
|
||||
if (eatom) {
|
||||
v::store(fw, (v_0_5 * vevdwl));
|
||||
}
|
||||
}
|
||||
if (vflag == 1) {
|
||||
*vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair);
|
||||
*vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair);
|
||||
*vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair);
|
||||
*vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair);
|
||||
*vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair);
|
||||
*vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair);
|
||||
if (EFLAG) {
|
||||
*vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
|
||||
if (eatom) {
|
||||
v::store(fw, (v_0_5 * vevdwl));
|
||||
}
|
||||
}
|
||||
{
|
||||
@ -933,7 +894,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
|
||||
f[t_].x += fx[t];
|
||||
f[t_].y += fy[t];
|
||||
f[t_].z += fz[t];
|
||||
if (EVFLAG && EFLAG && eatom) {
|
||||
if (EFLAG && eatom) {
|
||||
f[t_].w += fw[t];
|
||||
}
|
||||
}
|
||||
@ -945,7 +906,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
|
||||
f[t_].x += fx[t];
|
||||
f[t_].y += fy[t];
|
||||
f[t_].z += fz[t];
|
||||
if (EVFLAG && EFLAG && eatom) {
|
||||
if (EFLAG && eatom) {
|
||||
f[t_].w += fw[t];
|
||||
}
|
||||
}
|
||||
@ -954,7 +915,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
|
||||
// Specialized kernel step for fixed i, means that we don't have to use the
|
||||
// convoluted iteration scheme above, as the loop variables are uniform.
|
||||
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
|
||||
template<int EVFLAG, int EFLAG>
|
||||
template<int EFLAG>
|
||||
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
|
||||
int eatom, int vflag,
|
||||
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
|
||||
@ -964,12 +925,6 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
|
||||
const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
|
||||
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
||||
avec *vsevdwl,
|
||||
avec *vsv0,
|
||||
avec *vsv1,
|
||||
avec *vsv2,
|
||||
avec* vsv3,
|
||||
avec *vsv4,
|
||||
avec *vsv5,
|
||||
int compress_idx,
|
||||
int i,
|
||||
iarr js,
|
||||
@ -1097,21 +1052,11 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
|
||||
vfjytmp = vfjytmp * vaprefactor - avec(vdy_ij * vfpair);
|
||||
vfjztmp = vfjztmp * vaprefactor - avec(vdz_ij * vfpair);
|
||||
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
*vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
|
||||
if (eatom) {
|
||||
vfwtmp = v_0_5 * vevdwl;
|
||||
v::store(fw, vfwtmp);
|
||||
}
|
||||
}
|
||||
if (vflag == 1) {
|
||||
*vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair);
|
||||
*vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair);
|
||||
*vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair);
|
||||
*vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair);
|
||||
*vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair);
|
||||
*vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair);
|
||||
if (EFLAG) {
|
||||
*vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
|
||||
if (eatom) {
|
||||
vfwtmp = v_0_5 * vevdwl;
|
||||
v::store(fw, vfwtmp);
|
||||
}
|
||||
}
|
||||
while (cache_idx-- > 0) {
|
||||
@ -1169,20 +1114,20 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
|
||||
f[t_].x += fx[t];
|
||||
f[t_].y += fy[t];
|
||||
f[t_].z += fz[t];
|
||||
if (EVFLAG && EFLAG && eatom) {
|
||||
if (EFLAG && eatom) {
|
||||
f[t_].w += fw[t];
|
||||
}
|
||||
}
|
||||
f[i].x += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfxtmp, v::zero()));
|
||||
f[i].y += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfytmp, v::zero()));
|
||||
f[i].z += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfztmp, v::zero()));
|
||||
if (EVFLAG && EFLAG && eatom) {
|
||||
if (EFLAG && eatom) {
|
||||
f[i].z += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfwtmp, v::zero()));
|
||||
}
|
||||
}
|
||||
|
||||
template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
|
||||
template<bool EVFLAG, bool EFLAG>
|
||||
template<bool EFLAG>
|
||||
void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
|
||||
int iito, int iifrom, int eatom, int vflag,
|
||||
const int * _noalias const numneigh,
|
||||
@ -1193,14 +1138,12 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
|
||||
const c_inner_t * _noalias const c_inner,
|
||||
const c_outer_t * _noalias const c_outer,
|
||||
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
|
||||
acc_t *evdwl, acc_t *ov0, acc_t * ov1, acc_t *ov2, acc_t* ov3, acc_t *ov4, acc_t *ov5
|
||||
acc_t *evdwl
|
||||
) {
|
||||
int compress_idx = 0;
|
||||
int ii, jj;
|
||||
iarr is, js;
|
||||
avec vsevdwl = v::acc_zero();
|
||||
avec vsv0 = v::acc_zero(), vsv1 = v::acc_zero(), vsv2 = v::acc_zero();
|
||||
avec vsv3 = v::acc_zero(), vsv4 = v::acc_zero(), vsv5 = v::acc_zero();
|
||||
ivec v_i4floats(static_cast<int>(sizeof(typename v::fscal) * 4));
|
||||
ivec vj, v_NEIGHMASK(NEIGHMASK);
|
||||
bvec vmask_repulsive(0);
|
||||
@ -1237,11 +1180,11 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
|
||||
if (pack_i) {
|
||||
if (compress_idx == v::VL) {
|
||||
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
|
||||
kernel_step<EVFLAG,EFLAG>(
|
||||
kernel_step<EFLAG>(
|
||||
eatom, vflag,
|
||||
numneigh, cnumneigh, firstneigh, ntypes,
|
||||
x, c_inner, c_outer, f,
|
||||
&vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx,
|
||||
&vsevdwl, compress_idx,
|
||||
is, js, vmask_repulsive
|
||||
);
|
||||
compress_idx = 0;
|
||||
@ -1250,11 +1193,11 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
|
||||
} else {
|
||||
if (compress_idx == v::VL || (compress_idx > 0 && jj == jnum-1)) {
|
||||
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
|
||||
kernel_step_const_i<EVFLAG,EFLAG>(
|
||||
kernel_step_const_i<EFLAG>(
|
||||
eatom, vflag,
|
||||
numneigh, cnumneigh, firstneigh, ntypes,
|
||||
x, c_inner, c_outer, f,
|
||||
&vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx,
|
||||
&vsevdwl, compress_idx,
|
||||
i, js, vmask_repulsive
|
||||
);
|
||||
compress_idx = 0;
|
||||
@ -1265,26 +1208,16 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
|
||||
}
|
||||
if (compress_idx > 0) {
|
||||
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
|
||||
IntelKernelTersoff::kernel_step<EVFLAG,EFLAG>(
|
||||
IntelKernelTersoff::kernel_step<EFLAG>(
|
||||
eatom, vflag,
|
||||
numneigh, cnumneigh, firstneigh, ntypes,
|
||||
x, c_inner, c_outer, f,
|
||||
&vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx,
|
||||
&vsevdwl, compress_idx,
|
||||
is, js, vmask_repulsive
|
||||
);
|
||||
}
|
||||
if (EVFLAG) {
|
||||
if (EFLAG) {
|
||||
*evdwl += v::acc_reduce_add(vsevdwl);
|
||||
}
|
||||
if (vflag == 1) {
|
||||
*ov0 += v::acc_reduce_add(vsv0);
|
||||
*ov1 += v::acc_reduce_add(vsv1);
|
||||
*ov2 += v::acc_reduce_add(vsv2);
|
||||
*ov3 += v::acc_reduce_add(vsv3);
|
||||
*ov4 += v::acc_reduce_add(vsv4);
|
||||
*ov5 += v::acc_reduce_add(vsv5);
|
||||
}
|
||||
if (EFLAG) {
|
||||
*evdwl += v::acc_reduce_add(vsevdwl);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -79,7 +79,7 @@ class PairTersoffIntel : public PairTersoff {
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
template <int EFLAG, class flt_t, class acc_t>
|
||||
void eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||
|
||||
3034
src/USER-INTEL/pppm_disp_intel.cpp
Normal file
3034
src/USER-INTEL/pppm_disp_intel.cpp
Normal file
File diff suppressed because it is too large
Load Diff
238
src/USER-INTEL/pppm_disp_intel.h
Normal file
238
src/USER-INTEL/pppm_disp_intel.h
Normal file
@ -0,0 +1,238 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: William McDoniel (RWTH Aachen University)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef KSPACE_CLASS
|
||||
|
||||
KSpaceStyle(pppm/disp/intel,PPPMDispIntel)
|
||||
|
||||
#else
|
||||
|
||||
#ifndef LMP_PPPMINTEL_DISP_H
|
||||
#define LMP_PPPMINTEL_DISP_H
|
||||
|
||||
#include "pppm_disp.h"
|
||||
#include "fix_intel.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class PPPMDispIntel : public PPPMDisp {
|
||||
public:
|
||||
PPPMDispIntel(class LAMMPS *, int, char **);
|
||||
virtual ~PPPMDispIntel();
|
||||
virtual void init();
|
||||
virtual void compute(int, int);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int use_base();
|
||||
#endif
|
||||
|
||||
protected:
|
||||
FixIntel *fix;
|
||||
|
||||
int _use_lrt;
|
||||
FFT_SCALAR **perthread_density;
|
||||
FFT_SCALAR *particle_ekx;
|
||||
FFT_SCALAR *particle_eky;
|
||||
FFT_SCALAR *particle_ekz;
|
||||
FFT_SCALAR *particle_ekx0;
|
||||
FFT_SCALAR *particle_eky0;
|
||||
FFT_SCALAR *particle_ekz0;
|
||||
FFT_SCALAR *particle_ekx1;
|
||||
FFT_SCALAR *particle_eky1;
|
||||
FFT_SCALAR *particle_ekz1;
|
||||
FFT_SCALAR *particle_ekx2;
|
||||
FFT_SCALAR *particle_eky2;
|
||||
FFT_SCALAR *particle_ekz2;
|
||||
FFT_SCALAR *particle_ekx3;
|
||||
FFT_SCALAR *particle_eky3;
|
||||
FFT_SCALAR *particle_ekz3;
|
||||
FFT_SCALAR *particle_ekx4;
|
||||
FFT_SCALAR *particle_eky4;
|
||||
FFT_SCALAR *particle_ekz4;
|
||||
FFT_SCALAR *particle_ekx5;
|
||||
FFT_SCALAR *particle_eky5;
|
||||
FFT_SCALAR *particle_ekz5;
|
||||
FFT_SCALAR *particle_ekx6;
|
||||
FFT_SCALAR *particle_eky6;
|
||||
FFT_SCALAR *particle_ekz6;
|
||||
|
||||
|
||||
|
||||
int _use_table;
|
||||
int rho_points;
|
||||
FFT_SCALAR **rho_lookup;
|
||||
FFT_SCALAR **rho6_lookup;
|
||||
FFT_SCALAR **drho_lookup;
|
||||
FFT_SCALAR **drho6_lookup;
|
||||
FFT_SCALAR half_rho_scale, half_rho_scale_plus;
|
||||
|
||||
int _use_packing;
|
||||
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int _use_base;
|
||||
#endif
|
||||
|
||||
template<class flt_t, class acc_t>
|
||||
void particle_map(double, double, double,
|
||||
double, int **, int, int,
|
||||
int, int, int,
|
||||
int, int, int,
|
||||
IntelBuffers<flt_t,acc_t> *buffers);
|
||||
|
||||
template<class flt_t, class acc_t, int use_table>
|
||||
void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
template<class flt_t, class acc_t>
|
||||
void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
if (_use_table == 1) {
|
||||
make_rho_c<flt_t,acc_t,1>(buffers);
|
||||
} else {
|
||||
make_rho_c<flt_t,acc_t,0>(buffers);
|
||||
}
|
||||
}
|
||||
|
||||
template<class flt_t, class acc_t, int use_table>
|
||||
void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
template<class flt_t, class acc_t>
|
||||
void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
if (_use_table == 1) {
|
||||
make_rho_g<flt_t,acc_t,1>(buffers);
|
||||
} else {
|
||||
make_rho_g<flt_t,acc_t,0>(buffers);
|
||||
}
|
||||
}
|
||||
|
||||
template<class flt_t, class acc_t, int use_table>
|
||||
void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
template<class flt_t, class acc_t>
|
||||
void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
if (_use_table == 1) {
|
||||
make_rho_a<flt_t,acc_t,1>(buffers);
|
||||
} else {
|
||||
make_rho_a<flt_t,acc_t,0>(buffers);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class flt_t, class acc_t, int use_table>
|
||||
void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
template<class flt_t, class acc_t>
|
||||
void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
if (_use_table == 1) {
|
||||
make_rho_none<flt_t,acc_t,1>(buffers);
|
||||
} else {
|
||||
make_rho_none<flt_t,acc_t,0>(buffers);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class flt_t, class acc_t, int use_table>
|
||||
void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
template<class flt_t, class acc_t>
|
||||
void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
if (_use_table == 1) {
|
||||
fieldforce_c_ik<flt_t,acc_t,1>(buffers);
|
||||
} else {
|
||||
fieldforce_c_ik<flt_t,acc_t,0>(buffers);
|
||||
}
|
||||
}
|
||||
|
||||
template<class flt_t, class acc_t, int use_table>
|
||||
void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
template<class flt_t, class acc_t>
|
||||
void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
if (_use_table == 1) {
|
||||
fieldforce_c_ad<flt_t,acc_t,1>(buffers);
|
||||
} else {
|
||||
fieldforce_c_ad<flt_t,acc_t,0>(buffers);
|
||||
}
|
||||
}
|
||||
|
||||
template<class flt_t, class acc_t, int use_table>
|
||||
void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
template<class flt_t, class acc_t>
|
||||
void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
if (_use_table == 1) {
|
||||
fieldforce_g_ik<flt_t,acc_t,1>(buffers);
|
||||
} else {
|
||||
fieldforce_g_ik<flt_t,acc_t,0>(buffers);
|
||||
}
|
||||
}
|
||||
|
||||
template<class flt_t, class acc_t, int use_table>
|
||||
void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
template<class flt_t, class acc_t>
|
||||
void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
if (_use_table == 1) {
|
||||
fieldforce_g_ad<flt_t,acc_t,1>(buffers);
|
||||
} else {
|
||||
fieldforce_g_ad<flt_t,acc_t,0>(buffers);
|
||||
}
|
||||
}
|
||||
|
||||
template<class flt_t, class acc_t, int use_table>
|
||||
void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
template<class flt_t, class acc_t>
|
||||
void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
if (_use_table == 1) {
|
||||
fieldforce_a_ik<flt_t,acc_t,1>(buffers);
|
||||
} else {
|
||||
fieldforce_a_ik<flt_t,acc_t,0>(buffers);
|
||||
}
|
||||
}
|
||||
|
||||
template<class flt_t, class acc_t, int use_table>
|
||||
void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
template<class flt_t, class acc_t>
|
||||
void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
if (_use_table == 1) {
|
||||
fieldforce_a_ad<flt_t,acc_t,1>(buffers);
|
||||
} else {
|
||||
fieldforce_a_ad<flt_t,acc_t,0>(buffers);
|
||||
}
|
||||
}
|
||||
template<class flt_t, class acc_t, int use_table>
|
||||
void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
template<class flt_t, class acc_t>
|
||||
void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
if (_use_table == 1) {
|
||||
fieldforce_none_ik<flt_t,acc_t,1>(buffers);
|
||||
} else {
|
||||
fieldforce_none_ik<flt_t,acc_t,0>(buffers);
|
||||
}
|
||||
}
|
||||
|
||||
template<class flt_t, class acc_t, int use_table>
|
||||
void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
template<class flt_t, class acc_t>
|
||||
void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
if (_use_table == 1) {
|
||||
fieldforce_none_ad<flt_t,acc_t,1>(buffers);
|
||||
} else {
|
||||
fieldforce_none_ad<flt_t,acc_t,0>(buffers);
|
||||
}
|
||||
}
|
||||
|
||||
void precompute_rho();
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,4 +1,4 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
@ -12,7 +12,9 @@
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Rodrigo Canales (RWTH Aachen University)
|
||||
Contributing authors: William McDoniel (RWTH Aachen University)
|
||||
Rodrigo Canales (RWTH Aachen University)
|
||||
Markus Hoehnerbach (RWTH Aachen University)
|
||||
W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
@ -36,6 +38,9 @@ class PPPMIntel : public PPPM {
|
||||
virtual ~PPPMIntel();
|
||||
virtual void init();
|
||||
virtual void compute(int, int);
|
||||
virtual void pack_forward(int, FFT_SCALAR *, int, int *);
|
||||
virtual void unpack_forward(int, FFT_SCALAR *, int, int *);
|
||||
virtual double memory_usage();
|
||||
void compute_first(int, int);
|
||||
void compute_second(int, int);
|
||||
void pack_buffers();
|
||||
@ -47,18 +52,74 @@ class PPPMIntel : public PPPM {
|
||||
protected:
|
||||
FixIntel *fix;
|
||||
|
||||
int _use_lrt;
|
||||
FFT_SCALAR **perthread_density;
|
||||
FFT_SCALAR *particle_ekx;
|
||||
FFT_SCALAR *particle_eky;
|
||||
FFT_SCALAR *particle_ekz;
|
||||
|
||||
int _use_table;
|
||||
int rho_points;
|
||||
FFT_SCALAR **rho_lookup;
|
||||
FFT_SCALAR **drho_lookup;
|
||||
FFT_SCALAR half_rho_scale, half_rho_scale_plus;
|
||||
|
||||
int _use_packing;
|
||||
FFT_SCALAR ***vdxy_brick;
|
||||
FFT_SCALAR ***vdz0_brick;
|
||||
FFT_SCALAR *work3;
|
||||
class GridComm *cg_pack;
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int _use_base;
|
||||
#endif
|
||||
|
||||
template<class flt_t, class acc_t>
|
||||
void test_function(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
|
||||
|
||||
void precompute_rho();
|
||||
template<class flt_t, class acc_t>
|
||||
void particle_map(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
template<class flt_t, class acc_t>
|
||||
template<class flt_t, class acc_t, int use_table>
|
||||
void make_rho(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
template<class flt_t, class acc_t>
|
||||
void make_rho(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
if (_use_table == 1) {
|
||||
make_rho<flt_t,acc_t,1>(buffers);
|
||||
} else {
|
||||
make_rho<flt_t,acc_t,0>(buffers);
|
||||
}
|
||||
}
|
||||
void poisson_ik_intel();
|
||||
template<class flt_t, class acc_t, int use_table, int use_packing>
|
||||
void fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
template<class flt_t, class acc_t>
|
||||
void fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
if (_use_table == 1) {
|
||||
if (_use_packing == 1) {
|
||||
fieldforce_ik<flt_t, acc_t, 1, 1>(buffers);
|
||||
} else {
|
||||
fieldforce_ik<flt_t, acc_t, 1, 0>(buffers);
|
||||
}
|
||||
} else {
|
||||
if (_use_packing == 1) {
|
||||
fieldforce_ik<flt_t, acc_t, 0, 1>(buffers);
|
||||
} else {
|
||||
fieldforce_ik<flt_t, acc_t, 0, 0>(buffers);
|
||||
}
|
||||
}
|
||||
}
|
||||
template<class flt_t, class acc_t, int use_table>
|
||||
void fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
template<class flt_t, class acc_t>
|
||||
void fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) {
|
||||
if (_use_table == 1) {
|
||||
fieldforce_ad<flt_t,acc_t,1>(buffers);
|
||||
} else {
|
||||
fieldforce_ad<flt_t,acc_t,0>(buffers);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@ -78,17 +78,17 @@ void VerletLRTIntel::init()
|
||||
setup before run
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void VerletLRTIntel::setup()
|
||||
void VerletLRTIntel::setup(int flag)
|
||||
{
|
||||
if (_intel_kspace == 0) {
|
||||
Verlet::setup();
|
||||
Verlet::setup(flag);
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_intel_kspace->use_base()) {
|
||||
_intel_kspace = 0;
|
||||
Verlet::setup();
|
||||
Verlet::setup(flag);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -42,7 +42,7 @@ class VerletLRTIntel : public Verlet {
|
||||
VerletLRTIntel(class LAMMPS *, int, char **);
|
||||
virtual ~VerletLRTIntel();
|
||||
virtual void init();
|
||||
virtual void setup();
|
||||
virtual void setup(int flag = 1);
|
||||
virtual void run(int);
|
||||
|
||||
protected:
|
||||
|
||||
51
src/atom.cpp
51
src/atom.cpp
@ -40,6 +40,10 @@
|
||||
#include "memory.h"
|
||||
#include "error.h"
|
||||
|
||||
#ifdef LMP_USER_INTEL
|
||||
#include "neigh_request.h"
|
||||
#endif
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
using namespace MathConst;
|
||||
|
||||
@ -1882,6 +1886,53 @@ void Atom::setup_sort_bins()
|
||||
bininvy = nbiny / (bboxhi[1]-bboxlo[1]);
|
||||
bininvz = nbinz / (bboxhi[2]-bboxlo[2]);
|
||||
|
||||
#ifdef LMP_USER_INTEL
|
||||
int intel_neigh = 0;
|
||||
if (neighbor->nrequest) {
|
||||
if (neighbor->requests[0]->intel) intel_neigh = 1;
|
||||
} else if (neighbor->old_nrequest)
|
||||
if (neighbor->old_requests[0]->intel) intel_neigh = 1;
|
||||
if (intel_neigh && userbinsize == 0.0) {
|
||||
if (neighbor->binsizeflag) bininv = 1.0/neighbor->binsize_user;
|
||||
|
||||
double nx_low = neighbor->bboxlo[0];
|
||||
double ny_low = neighbor->bboxlo[1];
|
||||
double nz_low = neighbor->bboxlo[2];
|
||||
double nxbbox = neighbor->bboxhi[0] - nx_low;
|
||||
double nybbox = neighbor->bboxhi[1] - ny_low;
|
||||
double nzbbox = neighbor->bboxhi[2] - nz_low;
|
||||
int nnbinx = static_cast<int> (nxbbox * bininv);
|
||||
int nnbiny = static_cast<int> (nybbox * bininv);
|
||||
int nnbinz = static_cast<int> (nzbbox * bininv);
|
||||
if (domain->dimension == 2) nnbinz = 1;
|
||||
|
||||
if (nnbinx == 0) nnbinx = 1;
|
||||
if (nnbiny == 0) nnbiny = 1;
|
||||
if (nnbinz == 0) nnbinz = 1;
|
||||
|
||||
double binsizex = nxbbox/nnbinx;
|
||||
double binsizey = nybbox/nnbiny;
|
||||
double binsizez = nzbbox/nnbinz;
|
||||
|
||||
bininvx = 1.0 / binsizex;
|
||||
bininvy = 1.0 / binsizey;
|
||||
bininvz = 1.0 / binsizez;
|
||||
|
||||
int lxo = (bboxlo[0] - nx_low) * bininvx;
|
||||
int lyo = (bboxlo[1] - ny_low) * bininvy;
|
||||
int lzo = (bboxlo[2] - nz_low) * bininvz;
|
||||
bboxlo[0] = nx_low + static_cast<double>(lxo) / bininvx;
|
||||
bboxlo[1] = ny_low + static_cast<double>(lyo) / bininvy;
|
||||
bboxlo[2] = nz_low + static_cast<double>(lzo) / bininvz;
|
||||
nbinx = static_cast<int>((bboxhi[0] - bboxlo[0]) * bininvx) + 1;
|
||||
nbiny = static_cast<int>((bboxhi[1] - bboxlo[1]) * bininvy) + 1;
|
||||
nbinz = static_cast<int>((bboxhi[2] - bboxlo[2]) * bininvz) + 1;
|
||||
bboxhi[0] = bboxlo[0] + static_cast<double>(nbinx) / bininvx;
|
||||
bboxhi[1] = bboxlo[1] + static_cast<double>(nbiny) / bininvy;
|
||||
bboxhi[2] = bboxlo[2] + static_cast<double>(nbinz) / bininvz;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (1.0*nbinx*nbiny*nbinz > INT_MAX)
|
||||
error->one(FLERR,"Too many atom sorting bins");
|
||||
|
||||
|
||||
Reference in New Issue
Block a user