Merge pull request #674 from wmbrownIntel/user-intel-update

Mike Brown has added DPD to the USER-INTEL package with speedups over 3X for Xeon Phi and over 1.7X for some Xeon processors.
This commit is contained in:
Steve Plimpton
2017-10-06 09:12:39 -06:00
committed by GitHub
28 changed files with 1397 additions and 192 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 20 KiB

After

Width:  |  Height:  |  Size: 19 KiB

View File

@ -25,14 +25,14 @@ LAMMPS to run on the CPU cores and coprocessor cores simultaneously.
[Currently Available USER-INTEL Styles:]
Angle Styles: charmm, harmonic :ulb,l
Bond Styles: fene, harmonic :l
Bond Styles: fene, fourier, harmonic :l
Dihedral Styles: charmm, harmonic, opls :l
Fixes: nve, npt, nvt, nvt/sllod :l
Fixes: nve, npt, nvt, nvt/sllod, nve/asphere :l
Improper Styles: cvff, harmonic :l
Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long,
buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
sw, tersoff :l
buck, dpd, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long,
rebo, sw, tersoff :l
K-Space Styles: pppm, pppm/disp :l
:ule
@ -54,11 +54,12 @@ warmup run (for use with offload benchmarks).
:c,image(JPG/user_intel.png)
Results are speedups obtained on Intel Xeon E5-2697v4 processors
(code-named Broadwell) and Intel Xeon Phi 7250 processors
(code-named Knights Landing) with "June 2017" LAMMPS built with
Intel Parallel Studio 2017 update 2. Results are with 1 MPI task
per physical core. See {src/USER-INTEL/TEST/README} for the raw
simulation rates and instructions to reproduce.
(code-named Broadwell), Intel Xeon Phi 7250 processors (code-named
Knights Landing), and Intel Xeon Gold 6148 processors (code-named
Skylake) with "June 2017" LAMMPS built with Intel Parallel Studio
2017 update 2. Results are with 1 MPI task per physical core. See
{src/USER-INTEL/TEST/README} for the raw simulation rates and
instructions to reproduce.
:line
@ -82,6 +83,11 @@ this order :l
The {newton} setting applies to all atoms, not just atoms shared
between MPI tasks :l
Vectorization can change the order for adding pairwise forces :l
When using the -DLMP_USE_MKL_RNG define (all included intel optimized
makefiles do) at build time, the random number generator for
dissipative particle dynamics (pair style dpd/intel) uses the Mersenne
Twister generator included in the Intel MKL library (that should be
more robust than the default Masaglia random number generator) :l
:ule
The precision mode (described below) used with the USER-INTEL
@ -119,8 +125,8 @@ For Intel Xeon Phi CPUs:
Runs should be performed using MCDRAM. :ulb,l
:ule
For simulations using {kspace_style pppm} on Intel CPUs
supporting AVX-512:
For simulations using {kspace_style pppm} on Intel CPUs supporting
AVX-512:
Add "kspace_modify diff ad" to the input script :ulb,l
The command-line option should be changed to
@ -237,14 +243,17 @@ However, if you do not have coprocessors on your system, building
without offload support will produce a smaller binary.
The general requirements for Makefiles with the USER-INTEL package
are as follows. "-DLAMMPS_MEMALIGN=64" is required for CCFLAGS. When
using Intel compilers, "-restrict" is required and "-qopenmp" is
highly recommended for CCFLAGS and LINKFLAGS. LIB should include
"-ltbbmalloc". For builds supporting offload, "-DLMP_INTEL_OFFLOAD"
is required for CCFLAGS and "-qoffload" is required for LINKFLAGS.
Other recommended CCFLAG options for best performance are
"-O2 -fno-alias -ansi-alias -qoverride-limits fp-model fast=2
-no-prec-div".
are as follows. When using Intel compilers, "-restrict" is required
and "-qopenmp" is highly recommended for CCFLAGS and LINKFLAGS.
CCFLAGS should include "-DLMP_INTEL_USELRT" (unless POSIX Threads
are not supported in the build environment) and "-DLMP_USE_MKL_RNG"
(unless Intel Math Kernel Library (MKL) is not available in the build
environment). For Intel compilers, LIB should include "-ltbbmalloc"
or if the library is not available, "-DLMP_INTEL_NO_TBB" can be added
to CCFLAGS. For builds supporting offload, "-DLMP_INTEL_OFFLOAD" is
required for CCFLAGS and "-qoffload" is required for LINKFLAGS. Other
recommended CCFLAG options for best performance are "-O2 -fno-alias
-ansi-alias -qoverride-limits fp-model fast=2 -no-prec-div".
NOTE: The vectorization and math capabilities can differ depending on
the CPU. For Intel compilers, the "-x" flag specifies the type of

View File

@ -7,6 +7,7 @@
:line
dihedral_style fourier command :h3
dihedral_style fourier/intel command :h3
dihedral_style fourier/omp command :h3
[Syntax:]

View File

@ -8,6 +8,7 @@
pair_style dpd command :h3
pair_style dpd/gpu command :h3
pair_style dpd/intel command :h3
pair_style dpd/omp command :h3
pair_style dpd/tstat command :h3
pair_style dpd/tstat/gpu command :h3

View File

@ -15,13 +15,14 @@ SHELL = /bin/sh
CC = CC
OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
-fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_NO_TBB
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG -DLMP_INTEL_NO_TBB \
$(OPTFLAGS)
SHFLAGS = -fPIC
DEPFLAGS = -M
LINK = CC
LINKFLAGS = -g -qopenmp $(OPTFLAGS)
LINKFLAGS = -qopenmp $(OPTFLAGS)
LIB =
SIZE = size

View File

@ -10,7 +10,7 @@ CC = mpiicpc
MIC_OPT = -qoffload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\""
CCFLAGS = -g -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
-xHost -fno-alias -ansi-alias -restrict -DLMP_INTEL_USELRT \
-qoverride-limits $(MIC_OPT)
-qoverride-limits $(MIC_OPT) -DLMP_USE_MKL_RNG
SHFLAGS = -fPIC
DEPFLAGS = -M

8
src/MAKE/OPTIONS/Makefile.intel_cpu Executable file → Normal file
View File

@ -8,14 +8,14 @@ SHELL = /bin/sh
CC = mpiicpc
OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
-fno-alias -ansi-alias -restrict $(OPTFLAGS)
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
SHFLAGS = -fPIC
DEPFLAGS = -M
LINK = mpiicpc
LINKFLAGS = -g -qopenmp $(OPTFLAGS)
LIB = -ltbbmalloc -ltbbmalloc_proxy
LINKFLAGS = -qopenmp $(OPTFLAGS)
LIB = -ltbbmalloc
SIZE = size
ARCHIVE = ar

View File

@ -8,8 +8,8 @@ SHELL = /bin/sh
CC = mpiicpc
OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
-fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
SHFLAGS = -fPIC
DEPFLAGS = -M

View File

@ -8,13 +8,13 @@ SHELL = /bin/sh
CC = mpicxx -cxx=icc
OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
-fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
SHFLAGS = -fPIC
DEPFLAGS = -M
LINK = mpicxx -cxx=icc
LINKFLAGS = -g -qopenmp $(OPTFLAGS)
LINKFLAGS = -qopenmp $(OPTFLAGS)
LIB = -ltbbmalloc
SIZE = size

View File

@ -9,14 +9,14 @@ SHELL = /bin/sh
export OMPI_CXX = icc
CC = mpicxx
OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
-fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
SHFLAGS = -fPIC
DEPFLAGS = -M
LINK = mpicxx
LINKFLAGS = -g -qopenmp $(OPTFLAGS)
LIB = -ltbbmalloc -ltbbmalloc_proxy
LINKFLAGS = -qopenmp $(OPTFLAGS)
LIB = -ltbbmalloc
SIZE = size
ARCHIVE = ar

View File

@ -1,123 +0,0 @@
# intel_phi = USER-INTEL with Phi x200 (KNL) offload support,Intel MPI,MKL FFT
SHELL = /bin/sh
# ---------------------------------------------------------------------
# compiler/linker settings
# specify flags and libraries needed for your compiler
CC = mpiicpc
MIC_OPT = -qoffload-arch=mic-avx512 -fp-model fast=2
CCFLAGS = -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
-xHost -fno-alias -ansi-alias -restrict \
-qoverride-limits $(MIC_OPT) -DLMP_INTEL_USELRT
SHFLAGS = -fPIC
DEPFLAGS = -M
LINK = mpiicpc
LINKFLAGS = -g -O3 -xHost -qopenmp -qoffload $(MIC_OPT)
LIB = -ltbbmalloc
SIZE = size
ARCHIVE = ar
ARFLAGS = -rc
SHLIBFLAGS = -shared
# ---------------------------------------------------------------------
# LAMMPS-specific settings, all OPTIONAL
# specify settings for LAMMPS features you will use
# if you change any -D setting, do full re-compile after "make clean"
# LAMMPS ifdef settings
# see possible settings in Section 2.2 (step 4) of manual
LMP_INC = -DLAMMPS_GZIP -DLAMMPS_JPEG
# MPI library
# see discussion in Section 2.2 (step 5) of manual
# MPI wrapper compiler/linker can provide this info
# can point to dummy MPI library in src/STUBS as in Makefile.serial
# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts
# INC = path for mpi.h, MPI compiler settings
# PATH = path for MPI library
# LIB = name of MPI library
MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1
MPI_PATH =
MPI_LIB =
# FFT library
# see discussion in Section 2.2 (step 6) of manaul
# can be left blank to use provided KISS FFT library
# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
# PATH = path for FFT library
# LIB = name of FFT library
FFT_INC = -DFFT_MKL -DFFT_SINGLE
FFT_PATH =
FFT_LIB = -L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core
# JPEG and/or PNG library
# see discussion in Section 2.2 (step 7) of manual
# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
# INC = path(s) for jpeglib.h and/or png.h
# PATH = path(s) for JPEG library and/or PNG library
# LIB = name(s) of JPEG library and/or PNG library
JPG_INC =
JPG_PATH =
JPG_LIB = -ljpeg
# ---------------------------------------------------------------------
# build rules and dependencies
# do not edit this section
include Makefile.package.settings
include Makefile.package
EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS)
EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS)
# Path to src files
vpath %.cpp ..
vpath %.h ..
# Link target
$(EXE): $(OBJ) $(EXTRA_LINK_DEPENDS)
$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
$(SIZE) $(EXE)
# Library targets
lib: $(OBJ) $(EXTRA_LINK_DEPENDS)
$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
shlib: $(OBJ) $(EXTRA_LINK_DEPENDS)
$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
$(OBJ) $(EXTRA_LIB) $(LIB)
# Compilation rules
%.o:%.cpp $(EXTRA_CPP_DEPENDS)
$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
%.d:%.cpp $(EXTRA_CPP_DEPENDS)
$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
%.o:%.cu $(EXTRA_CPP_DEPENDS)
$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
# Individual dependencies
depend : fastdep.exe $(SRC)
@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1
fastdep.exe: ../DEPEND/fastdep.c
cc -O -o $@ $<
sinclude .depend

View File

@ -8,13 +8,13 @@ SHELL = /bin/sh
CC = mpiicpc
OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
-fno-alias -ansi-alias -restrict $(OPTFLAGS)
CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
-DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
SHFLAGS = -fPIC
DEPFLAGS = -M
LINK = mpiicpc
LINKFLAGS = -g -qopenmp $(OPTFLAGS)
LINKFLAGS = -qopenmp $(OPTFLAGS)
LIB = -ltbbmalloc
SIZE = size

View File

@ -46,7 +46,7 @@ action nbin_intel.h
action nbin_intel.cpp
action npair_intel.h
action npair_intel.cpp
action intel_simd.h pair_sw_intel.cpp
action intel_simd.h
action intel_intrinsics.h pair_tersoff_intel.cpp
action intel_intrinsics_airebo.h pair_airebo_intel.cpp

View File

@ -30,28 +30,37 @@ be added or changed in the Makefile depending on the version:
2017 update 2 - No changes needed
2017 updates 3 or 4 - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
2018 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high
2018 inital release - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
2018u1 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high
-----------------------------------------------------------------------------
When using the suffix command with "intel", intel styles will be used if they
exist. If the suffix command is used with "hybrid intel omp" and the USER-OMP
USER-OMP styles will be used whenever USER-INTEL styles are not available. This
allow for running most styles in LAMMPS with threading.
is installed, USER-OMP styles will be used whenever USER-INTEL styles are not
available. This allow for running most styles in LAMMPS with threading.
-----------------------------------------------------------------------------
The Long-Range Thread mode (LRT) in the Intel package currently uses
pthreads by default. If pthreads are not supported in the build environment,
the compile flag "-DLMP_INTEL_NOLRT" will disable the feature to allow for
builds without pthreads. Alternatively, "-DLMP_INTEL_LRT11" can be used to
build with compilers that support threads using the C++11 standard. When using
The Long-Range Thread mode (LRT) in the Intel package is enabled through the
-DLMP_INTEL_USELRT define at compile time. All intel optimized makefiles
include this define. This feature will use pthreads by default.
Alternatively, "-DLMP_INTEL_LRT11" can be used to build with compilers that
support threads intrinsically using the C++11 standard. When using
LRT mode, you might need to disable OpenMP affinity settings (e.g.
export KMP_AFFINITY=none). LAMMPS will generate a warning if the settings
need to be changed.
-----------------------------------------------------------------------------
Unless Intel Math Kernel Library (MKL) is unavailable, -DLMP_USE_MKL_RNG
should be added to the compile flags. This will enable using the MKL Mersenne
Twister random number generator (RNG) for Dissipative Particle Dynamics
(DPD). This RNG can allow significantly faster performance and it also has a
significantly longer period than the standard RNG for DPD.
-----------------------------------------------------------------------------
In order to use offload to Intel(R) Xeon Phi(TM) coprocessors, the flag
-DLMP_INTEL_OFFLOAD should be set in the Makefile. Offload requires the use of
Intel compilers.

View File

@ -9,6 +9,7 @@
# in.intel.tersoff - Silicon benchmark with Tersoff
# in.intel.water - Coarse-grain water benchmark using Stillinger-Weber
# in.intel.airebo - Polyethelene benchmark with AIREBO
# in.intel.dpd - Dissipative Particle Dynamics
#
#############################################################################
@ -16,16 +17,17 @@
# Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017
# - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi
#
# Xeon E5-2697v4 Xeon Phi 7250
# Xeon E5-2697v4 Xeon Phi 7250 Xeon Gold 6148
#
# in.intel.lj - 199.5 282.3
# in.intel.rhodo - 12.4 17.5
# in.intel.lc - 19.0 25.7
# in.intel.eam - 59.4 92.8
# in.intel.sw - 132.4 161.9
# in.intel.tersoff - 83.3 101.1
# in.intel.water - 53.4 90.3
# in.intel.airebo - 7.3 11.8
# in.intel.lj - 199.5 282.3 317.3
# in.intel.rhodo - 12.4 17.5 24.4
# in.intel.lc - 19.0 25.7 26.8
# in.intel.eam - 59.4 92.8 105.6
# in.intel.sw - 132.4 161.9 213.8
# in.intel.tersoff - 83.3 101.1 109.6
# in.intel.water - 53.4 90.3 105.5
# in.intel.airebo - 7.3 11.8 17.6
# in.intel.dpd - 74.5 100.4 148.1
#
#############################################################################

View File

@ -0,0 +1,48 @@
# DPD benchmark
variable N index on # Newton Setting
variable w index 10 # Warmup Timesteps
variable t index 4000 # Main Run Timesteps
variable m index 1 # Main Run Timestep Multiplier
variable n index 0 # Use NUMA Mapping for Multi-Node
variable p index 0 # Use Power Measurement
variable x index 4
variable y index 2
variable z index 2
variable xx equal 20*$x
variable yy equal 20*$y
variable zz equal 20*$z
variable rr equal floor($t*$m)
newton $N
if "$n > 0" then "processors * * * grid numa"
units lj
atom_style atomic
comm_modify mode single vel yes
lattice fcc 3.0
region box block 0 ${xx} 0 ${yy} 0 ${zz}
create_box 1 box
create_atoms 1 box
mass 1 1.0
velocity all create 1.0 87287 loop geom
pair_style dpd 1.0 1.0 928948
pair_coeff 1 1 25.0 4.5
neighbor 0.5 bin
neigh_modify delay 0 every 1
fix 1 all nve
timestep 0.04
thermo 1000
if "$p > 0" then "run_style verlet/power"
if "$w > 0" then "run $w"
run ${rr}

View File

@ -0,0 +1,441 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#include <mpi.h>
#include <math.h>
#include "dihedral_fourier_intel.h"
#include "atom.h"
#include "comm.h"
#include "memory.h"
#include "neighbor.h"
#include "domain.h"
#include "force.h"
#include "pair.h"
#include "update.h"
#include "error.h"
#include "suffix.h"
using namespace LAMMPS_NS;
#define PTOLERANCE (flt_t)1.05
#define MTOLERANCE (flt_t)-1.05
typedef struct { int a,b,c,d,t; } int5_t;
/* ---------------------------------------------------------------------- */
DihedralFourierIntel::DihedralFourierIntel(class LAMMPS *lmp)
: DihedralFourier(lmp)
{
suffix_flag |= Suffix::INTEL;
}
/* ---------------------------------------------------------------------- */
void DihedralFourierIntel::compute(int eflag, int vflag)
{
#ifdef _LMP_INTEL_OFFLOAD
if (_use_base) {
DihedralFourier::compute(eflag, vflag);
return;
}
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single);
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
force_const_double);
else
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
force_const_single);
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void DihedralFourierIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
if (eflag || vflag) {
ev_setup(eflag,vflag);
} else evflag = 0;
if (evflag) {
if (vflag && !eflag) {
if (force->newton_bond)
eval<0,1,1>(vflag, buffers, fc);
else
eval<0,1,0>(vflag, buffers, fc);
} else {
if (force->newton_bond)
eval<1,1,1>(vflag, buffers, fc);
else
eval<1,1,0>(vflag, buffers, fc);
}
} else {
if (force->newton_bond)
eval<0,0,1>(vflag, buffers, fc);
else
eval<0,0,0>(vflag, buffers, fc);
}
}
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void DihedralFourierIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
const int inum = neighbor->ndihedrallist;
if (inum == 0) return;
ATOM_T * _noalias const x = buffers->get_x(0);
const int nlocal = atom->nlocal;
const int nall = nlocal + atom->nghost;
int f_stride;
if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
else f_stride = buffers->get_stride(nlocal);
int tc;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oedihedral = (acc_t)0.0;
if (VFLAG && vflag) {
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
}
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(f_start,f_stride,fc) \
reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int nfrom, npl, nto, tid;
#ifdef LMP_INTEL_USE_SIMDOFF
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
#else
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
#endif
FORCE_T * _noalias const f = f_start + (tid * f_stride);
if (fix->need_zero(tid))
memset(f, 0, f_stride * sizeof(FORCE_T));
const int5_t * _noalias const dihedrallist =
(int5_t *) neighbor->dihedrallist[0];
#ifdef LMP_INTEL_USE_SIMDOFF
acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
if (EFLAG) sedihedral = (acc_t)0.0;
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
#endif
const int i1 = dihedrallist[n].a;
const int i2 = dihedrallist[n].b;
const int i3 = dihedrallist[n].c;
const int i4 = dihedrallist[n].d;
const int type = dihedrallist[n].t;
// 1st bond
const flt_t vb1x = x[i1].x - x[i2].x;
const flt_t vb1y = x[i1].y - x[i2].y;
const flt_t vb1z = x[i1].z - x[i2].z;
// 2nd bond
const flt_t vb2xm = x[i2].x - x[i3].x;
const flt_t vb2ym = x[i2].y - x[i3].y;
const flt_t vb2zm = x[i2].z - x[i3].z;
// 3rd bond
const flt_t vb3x = x[i4].x - x[i3].x;
const flt_t vb3y = x[i4].y - x[i3].y;
const flt_t vb3z = x[i4].z - x[i3].z;
// c,s calculation
const flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
const flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
const flt_t az = vb1x*vb2ym - vb1y*vb2xm;
const flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
const flt_t by = vb3z*vb2xm - vb3x*vb2zm;
const flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
const flt_t rasq = ax*ax + ay*ay + az*az;
const flt_t rbsq = bx*bx + by*by + bz*bz;
const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
const flt_t rg = sqrt(rgsq);
flt_t rginv, ra2inv, rb2inv;
rginv = ra2inv = rb2inv = (flt_t)0.0;
if (rg > 0) rginv = (flt_t)1.0/rg;
if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
const flt_t rabinv = sqrt(ra2inv*rb2inv);
flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
// error check
#ifndef LMP_INTEL_USE_SIMDOFF
if (c > PTOLERANCE || c < MTOLERANCE) {
int me = comm->me;
if (screen) {
char str[128];
sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT,
me,tid,update->ntimestep,
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
error->warning(FLERR,str,0);
fprintf(screen," 1st atom: %d %g %g %g\n",
me,x[i1].x,x[i1].y,x[i1].z);
fprintf(screen," 2nd atom: %d %g %g %g\n",
me,x[i2].x,x[i2].y,x[i2].z);
fprintf(screen," 3rd atom: %d %g %g %g\n",
me,x[i3].x,x[i3].y,x[i3].z);
fprintf(screen," 4th atom: %d %g %g %g\n",
me,x[i4].x,x[i4].y,x[i4].z);
}
}
#endif
if (c > (flt_t)1.0) c = (flt_t)1.0;
if (c < (flt_t)-1.0) c = (flt_t)-1.0;
flt_t deng;
flt_t df = (flt_t)0.0;
if (EFLAG) deng = (flt_t)0.0;
for (int j = 0; j < nterms[type]; j++) {
const flt_t tcos_shift = fc.bp[j][type].cos_shift;
const flt_t tsin_shift = fc.bp[j][type].sin_shift;
const flt_t tk = fc.bp[j][type].k;
const int m = fc.bp[j][type].multiplicity;
flt_t p = (flt_t)1.0;
flt_t ddf1, df1;
ddf1 = df1 = (flt_t)0.0;
for (int i = 0; i < m; i++) {
ddf1 = p*c - df1*s;
df1 = p*s + df1*c;
p = ddf1;
}
p = p*tcos_shift + df1*tsin_shift;
df1 = df1*tcos_shift - ddf1*tsin_shift;
df1 *= -m;
p += (flt_t)1.0;
if (m == 0) {
p = (flt_t)1.0 + tcos_shift;
df1 = (flt_t)0.0;
}
if (EFLAG) deng += tk * p;
df -= tk * df1;
}
const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
const flt_t fga = fg*ra2inv*rginv;
const flt_t hgb = hg*rb2inv*rginv;
const flt_t gaa = -ra2inv*rg;
const flt_t gbb = rb2inv*rg;
const flt_t dtfx = gaa*ax;
const flt_t dtfy = gaa*ay;
const flt_t dtfz = gaa*az;
const flt_t dtgx = fga*ax - hgb*bx;
const flt_t dtgy = fga*ay - hgb*by;
const flt_t dtgz = fga*az - hgb*bz;
const flt_t dthx = gbb*bx;
const flt_t dthy = gbb*by;
const flt_t dthz = gbb*bz;
const flt_t sx2 = df*dtgx;
const flt_t sy2 = df*dtgy;
const flt_t sz2 = df*dtgz;
flt_t f1x = df*dtfx;
flt_t f1y = df*dtfy;
flt_t f1z = df*dtfz;
const flt_t f2x = sx2 - f1x;
const flt_t f2y = sy2 - f1y;
const flt_t f2z = sz2 - f1z;
flt_t f4x = df*dthx;
flt_t f4y = df*dthy;
flt_t f4z = df*dthz;
const flt_t f3x = -sx2 - f4x;
const flt_t f3y = -sy2 - f4y;
const flt_t f3z = -sz2 - f4z;
if (EFLAG || VFLAG) {
#ifdef LMP_INTEL_USE_SIMDOFF
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
sv0, sv1, sv2, sv3, sv4, sv5);
#else
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
ov0, ov1, ov2, ov3, ov4, ov5);
#endif
}
#ifdef LMP_INTEL_USE_SIMDOFF
#pragma simdoff
#endif
{
if (NEWTON_BOND || i1 < nlocal) {
f[i1].x += f1x;
f[i1].y += f1y;
f[i1].z += f1z;
}
if (NEWTON_BOND || i2 < nlocal) {
f[i2].x += f2x;
f[i2].y += f2y;
f[i2].z += f2z;
}
if (NEWTON_BOND || i3 < nlocal) {
f[i3].x += f3x;
f[i3].y += f3y;
f[i3].z += f3z;
}
if (NEWTON_BOND || i4 < nlocal) {
f[i4].x += f4x;
f[i4].y += f4y;
f[i4].z += f4z;
}
}
} // for n
#ifdef LMP_INTEL_USE_SIMDOFF
if (EFLAG) oedihedral += sedihedral;
if (VFLAG && vflag) {
ov0 += sv0; ov1 += sv1; ov2 += sv2;
ov3 += sv3; ov4 += sv4; ov5 += sv5;
}
#endif
} // omp parallel
if (EFLAG) energy += oedihedral;
if (VFLAG && vflag) {
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
}
fix->set_reduce_flag();
}
/* ---------------------------------------------------------------------- */
void DihedralFourierIntel::init_style()
{
DihedralFourier::init_style();
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]);
#ifdef _LMP_INTEL_OFFLOAD
_use_base = 0;
if (fix->offload_balance() != 0.0) {
_use_base = 1;
return;
}
#endif
fix->bond_init_check();
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void DihedralFourierIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
const int bp1 = atom->ndihedraltypes + 1;
fc.set_ntypes(bp1, setflag, nterms, memory);
for (int i = 1; i < bp1; i++) {
if (setflag[i]) {
for (int j = 0; j < nterms[i]; j++) {
fc.bp[j][i].cos_shift = cos_shift[i][j];
fc.bp[j][i].sin_shift = sin_shift[i][j];
fc.bp[j][i].k = k[i][j];
fc.bp[j][i].multiplicity = multiplicity[i][j];
}
}
}
}
/* ---------------------------------------------------------------------- */
template <class flt_t>
void DihedralFourierIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
int *setflag,
int *nterms,
Memory *memory) {
if (nbondtypes != _nbondtypes) {
if (_nbondtypes > 0)
_memory->destroy(bp);
if (nbondtypes > 0) {
_maxnterms = 1;
for (int i = 1; i <= nbondtypes; i++)
if (setflag[i]) _maxnterms = MAX(_maxnterms, nterms[i]);
_memory->create(bp, _maxnterms, nbondtypes, "dihedralfourierintel.bp");
}
}
_nbondtypes = nbondtypes;
_memory = memory;
}

View File

@ -0,0 +1,82 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#ifdef DIHEDRAL_CLASS
DihedralStyle(fourier/intel,DihedralFourierIntel)
#else
#ifndef LMP_DIHEDRAL_FOURIER_INTEL_H
#define LMP_DIHEDRAL_FOURIER_INTEL_H
#include "dihedral_fourier.h"
#include "fix_intel.h"
namespace LAMMPS_NS {
class DihedralFourierIntel : public DihedralFourier {
public:
DihedralFourierIntel(class LAMMPS *lmp);
virtual void compute(int, int);
void init_style();
private:
FixIntel *fix;
template <class flt_t> class ForceConst;
template <class flt_t, class acc_t>
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc);
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc);
template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers);
#ifdef _LMP_INTEL_OFFLOAD
int _use_base;
#endif
template <class flt_t>
class ForceConst {
public:
typedef struct { flt_t cos_shift, sin_shift, k;
int multiplicity; } fc_packed1;
fc_packed1 **bp;
ForceConst() : _nbondtypes(0) {}
~ForceConst() { set_ntypes(0, NULL, NULL, NULL); }
void set_ntypes(const int nbondtypes, int *setflag, int *nterms,
Memory *memory);
private:
int _nbondtypes, _maxnterms;
Memory *_memory;
};
ForceConst<float> force_const_single;
ForceConst<double> force_const_double;
};
}
#endif
#endif

View File

@ -285,6 +285,7 @@ int FixIntel::setmask()
{
int mask = 0;
mask |= PRE_REVERSE;
mask |= MIN_PRE_REVERSE;
#ifdef _LMP_INTEL_OFFLOAD
mask |= POST_FORCE;
mask |= MIN_POST_FORCE;

View File

@ -43,6 +43,7 @@ class FixIntel : public Fix {
virtual int setmask();
virtual void init();
virtual void setup(int);
inline void min_setup(int in) { setup(in); }
void setup_pre_reverse(int eflag = 0, int vflag = 0);
void pair_init_check(const bool cdmessage=false);
@ -50,6 +51,8 @@ class FixIntel : public Fix {
void kspace_init_check();
void pre_reverse(int eflag = 0, int vflag = 0);
inline void min_pre_reverse(int eflag = 0, int vflag = 0)
{ pre_reverse(eflag, vflag); }
// Get all forces, calculation results from coprocesser
void sync_coprocessor();

View File

@ -409,6 +409,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0);
lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
#endif
memset(_ccachei, 0, vsize * sizeof(int));
memset(_ccachej, 0, vsize * sizeof(int));
#ifdef _LMP_INTEL_OFFLOAD
@ -425,7 +426,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
#pragma offload_transfer target(mic:_cop) \
nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
in(ccachei:length(vsize) alloc_if(1) free_if(0)) \
in(ccachej:length(vsize) alloc_if(1) free_if(0))
}
#ifdef LMP_USE_AVXCD

View File

@ -292,6 +292,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
ito = inum; \
}
#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \
nthr, vecsize) \
{ \
tid = 0; \
ifrom = 0; \
ip = 1; \
ito = inum; \
}
#endif
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \

View File

@ -319,7 +319,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = bstart; jj < bend; jj++)
@ -341,7 +340,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
const int bstart = binhead[ibin + stencil[k]];
const int bend = binhead[ibin + stencil[k] + 1];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = bstart; jj < bend; jj++)

View File

@ -273,7 +273,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = bstart; jj < bend; jj++)
@ -307,7 +306,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
const int bstart = binhead[ibin];
const int bend = binhead[ibin + 1];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = bstart; jj < bend; jj++) {

View File

@ -0,0 +1,617 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
This software is distributed under the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
Shun Xu (Computer Network Information Center, CAS)
------------------------------------------------------------------------- */
#include <math.h>
#include "pair_dpd_intel.h"
#include "atom.h"
#include "comm.h"
#include "force.h"
#include "memory.h"
#include "modify.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "suffix.h"
using namespace LAMMPS_NS;
#define LMP_MKL_RNG VSL_BRNG_MT19937
#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
#define IEPSILON 1.0e10
/* ---------------------------------------------------------------------- */
PairDPDIntel::PairDPDIntel(LAMMPS *lmp) :
PairDPD(lmp)
{
suffix_flag |= Suffix::INTEL;
respa_enable = 0;
random_thread = NULL;
_nrandom_thread = 0;
}
/* ---------------------------------------------------------------------- */
PairDPDIntel::~PairDPDIntel()
{
#if defined(_OPENMP)
if (_nrandom_thread) {
#ifdef LMP_USE_MKL_RNG
for (int i = 0; i < _nrandom_thread; i++)
vslDeleteStream(&random_thread[i]);
#else
for (int i = 1; i < _nrandom_thread; i++)
delete random_thread[i];
#endif
}
#endif
delete []random_thread;
}
/* ---------------------------------------------------------------------- */
void PairDPDIntel::compute(int eflag, int vflag)
{
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single);
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
force_const_double);
else
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
force_const_single);
fix->balance_stamp();
vflag_fdotr = 0;
}
template <class flt_t, class acc_t>
void PairDPDIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
if (eflag || vflag) {
ev_setup(eflag, vflag);
} else evflag = vflag_fdotr = 0;
const int inum = list->inum;
const int nthreads = comm->nthreads;
const int host_start = fix->host_start_pair();
const int offload_end = fix->offload_end_pair();
const int ago = neighbor->ago;
if (ago != 0 && fix->separate_buffers() == 0) {
fix->start_watch(TIME_PACK);
int packthreads;
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
else packthreads = 1;
#if defined(_OPENMP)
#pragma omp parallel if(packthreads > 1)
#endif
{
int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
packthreads, sizeof(ATOM_T));
buffers->thr_pack(ifrom,ito,ago);
}
fix->stop_watch(TIME_PACK);
}
int ovflag = 0;
if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1;
if (_onetype) {
if (eflag) {
if (force->newton_pair) {
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (force->newton_pair) {
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
} else {
if (eflag) {
if (force->newton_pair) {
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (force->newton_pair) {
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
}
}
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void PairDPDIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc,
const int astart, const int aend)
{
const int inum = aend - astart;
if (inum == 0) return;
int nlocal, nall, minlocal;
fix->get_buffern(offload, nlocal, nall, minlocal);
const int ago = neighbor->ago;
IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
ATOM_T * _noalias const x = buffers->get_x(offload);
typedef struct { double x, y, z; } lmp_vt;
lmp_vt *v = (lmp_vt *)atom->v[0];
const flt_t dtinvsqrt = 1.0/sqrt(update->dt);
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const FC_PACKED1_T * _noalias const param = fc.param[0];
const flt_t * _noalias const special_lj = fc.special_lj;
int * _noalias const rngi_thread = fc.rngi;
const int rng_size = buffers->get_max_nbors();
const int ntypes = atom->ntypes + 1;
const int eatom = this->eflag_atom;
// Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride);
int tc;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
int *overflow = fix->get_off_overflow_flag();
{
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
// loop over neighbors of my atoms
#if defined(_OPENMP)
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int iifrom, iip, iito, tid;
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
iifrom += astart;
iito += astart;
#ifdef LMP_USE_MKL_RNG
VSLStreamStatePtr *my_random = &(random_thread[tid]);
#else
RanMars *my_random = random_thread[tid];
#endif
flt_t *my_rand_buffer = fc.rand_buffer_thread[tid];
int rngi = rngi_thread[tid];
int foff;
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
else foff = -minlocal;
FORCE_T * _noalias const f = f_start + foff;
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
flt_t icut, a0, gamma, sigma;
if (ONETYPE) {
icut = param[3].icut;
a0 = param[3].a0;
gamma = param[3].gamma;
sigma = param[3].sigma;
}
for (int i = iifrom; i < iito; i += iip) {
int itype, ptr_off;
const FC_PACKED1_T * _noalias parami;
if (!ONETYPE) {
itype = x[i].w;
ptr_off = itype * ntypes;
parami = param + ptr_off;
}
const int * _noalias const jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
acc_t fxtmp, fytmp, fztmp, fwtmp;
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z;
const flt_t vxtmp = v[i].x;
const flt_t vytmp = v[i].y;
const flt_t vztmp = v[i].z;
fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
if (NEWTON_PAIR == 0)
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
if (rngi + jnum > rng_size) {
#ifdef LMP_USE_MKL_RNG
if (sizeof(flt_t) == sizeof(float))
vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi,
(float*)my_rand_buffer, (float)0.0, (float)1.0 );
else
vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi,
(double*)my_rand_buffer, 0.0, 1.0 );
#else
for (int jj = 0; jj < rngi; jj++)
my_rand_buffer[jj] = my_random->gaussian();
#endif
rngi = 0;
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcelj, evdwl;
forcelj = evdwl = (flt_t)0.0;
int j, jtype, sbindex;
if (!ONETYPE) {
sbindex = jlist[jj] >> SBBITS & 3;
j = jlist[jj] & NEIGHMASK;
} else
j = jlist[jj];
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
if (!ONETYPE) {
jtype = x[j].w;
icut = parami[jtype].icut;
}
const flt_t rsq = delx * delx + dely * dely + delz * delz;
const flt_t rinv = (flt_t)1.0/sqrt(rsq);
if (rinv > icut) {
flt_t factor_dpd;
if (!ONETYPE) factor_dpd = special_lj[sbindex];
flt_t delvx = vxtmp - v[j].x;
flt_t delvy = vytmp - v[j].y;
flt_t delvz = vztmp - v[j].z;
flt_t dot = delx*delvx + dely*delvy + delz*delvz;
flt_t randnum = my_rand_buffer[jj];
flt_t iwd = rinv - icut;
if (rinv > (flt_t)IEPSILON) iwd = (flt_t)0.0;
if (!ONETYPE) {
a0 = parami[jtype].a0;
gamma = parami[jtype].gamma;
sigma = parami[jtype].sigma;
}
flt_t fpair = a0 - iwd * gamma * dot + sigma * randnum * dtinvsqrt;
if (!ONETYPE) fpair *= factor_dpd;
fpair *= iwd;
const flt_t fpx = fpair * delx;
fxtmp += fpx;
if (NEWTON_PAIR) f[j].x -= fpx;
const flt_t fpy = fpair * dely;
fytmp += fpy;
if (NEWTON_PAIR) f[j].y -= fpy;
const flt_t fpz = fpair * delz;
fztmp += fpz;
if (NEWTON_PAIR) f[j].z -= fpz;
if (EFLAG) {
flt_t cut = (flt_t)1.0/icut;
flt_t r = (flt_t)1.0/rinv;
evdwl = (flt_t)0.5 * a0 * (cut - (flt_t)2.0*r + rsq * icut);
if (!ONETYPE) evdwl *= factor_dpd;
sevdwl += evdwl;
if (eatom) {
fwtmp += (flt_t)0.5 * evdwl;
if (NEWTON_PAIR)
f[j].w += (flt_t)0.5 * evdwl;
}
}
if (NEWTON_PAIR == 0)
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
} // if rsq
} // for jj
if (NEWTON_PAIR) {
f[i].x += fxtmp;
f[i].y += fytmp;
f[i].z += fztmp;
} else {
f[i].x = fxtmp;
f[i].y = fytmp;
f[i].z = fztmp;
}
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
rngi += jnum;
} // for ii
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
ov4, ov5);
rngi_thread[tid] = rngi;
} // end omp
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
ov3 *= (acc_t)0.5;
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
ev_global[5] = ov3;
ev_global[6] = ov4;
ev_global[7] = ov5;
}
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end offload
if (offload)
fix->stop_watch(TIME_OFFLOAD_LATENCY);
else
fix->stop_watch(TIME_HOST_PAIR);
if (EFLAG || vflag)
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
else
fix->add_result_array(f_start, 0, offload);
}
/* ----------------------------------------------------------------------
global settings
------------------------------------------------------------------------- */
void PairDPDIntel::settings(int narg, char **arg) {
#if defined(_OPENMP)
if (_nrandom_thread) {
#ifdef LMP_USE_MKL_RNG
for (int i = 0; i < _nrandom_thread; i++)
vslDeleteStream(&random_thread[i]);
#else
for (int i = 1; i < _nrandom_thread; i++)
delete random_thread[i];
#endif
}
delete []random_thread;
#endif
PairDPD::settings(narg,arg);
_nrandom_thread = comm->nthreads;
#ifdef LMP_USE_MKL_RNG
random_thread=new VSLStreamStatePtr[comm->nthreads];
#if defined(_OPENMP)
#pragma omp parallel
{
int tid = omp_get_thread_num();
vslNewStream(&random_thread[tid], LMP_MKL_RNG,
seed + comm->me + comm->nprocs * tid );
}
#endif
#else
random_thread =new RanMars*[comm->nthreads];
random_thread[0] = random;
#if defined(_OPENMP)
#pragma omp parallel
{
int tid = omp_get_thread_num();
if (tid > 0)
random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
}
#endif
#endif
}
/* ---------------------------------------------------------------------- */
void PairDPDIntel::init_style()
{
PairDPD::init_style();
if (force->newton_pair == 0) {
neighbor->requests[neighbor->nrequest-1]->half = 0;
neighbor->requests[neighbor->nrequest-1]->full = 1;
}
neighbor->requests[neighbor->nrequest-1]->intel = 1;
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]);
fix->pair_init_check();
#ifdef _LMP_INTEL_OFFLOAD
if (fix->offload_balance() != 0.0)
error->all(FLERR,
"Offload for dpd/intel is not yet available. Set balance to 0.");
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void PairDPDIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
_onetype = 0;
if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
double icut = 1.0 / cut;
fc.param[i][j].icut = fc.param[j][i].icut = icut;
} else {
cut = init_one(i,j);
double icut = 1.0 / cut;
fc.param[i][j].icut = fc.param[j][i].icut = icut;
}
}
}
for (int i = 0; i < 4; i++) {
fc.special_lj[i] = force->special_lj[i];
fc.special_lj[0] = 1.0;
}
for (int i = 0; i < tp1; i++) {
for (int j = 0; j < tp1; j++) {
fc.param[i][j].a0 = a0[i][j];
fc.param[i][j].gamma = gamma[i][j];
fc.param[i][j].sigma = sigma[i][j];
}
}
}
/* ---------------------------------------------------------------------- */
template <class flt_t>
void PairDPDIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
const int nthreads,
const int max_nbors,
Memory *memory,
const int cop) {
if (ntypes != _ntypes) {
if (_ntypes > 0) {
_memory->destroy(param);
_memory->destroy(rand_buffer_thread);
_memory->destroy(rngi);
}
if (ntypes > 0) {
_cop = cop;
memory->create(param,ntypes,ntypes,"fc.param");
memory->create(rand_buffer_thread, nthreads, max_nbors,
"fc.rand_buffer_thread");
memory->create(rngi,nthreads,"fc.param");
for (int i = 0; i < nthreads; i++) rngi[i] = max_nbors;
}
}
_ntypes = ntypes;
_memory = memory;
}
/* ----------------------------------------------------------------------
proc 0 reads from restart file, bcasts
------------------------------------------------------------------------- */
void PairDPDIntel::read_restart_settings(FILE *fp)
{
#if defined(_OPENMP)
if (_nrandom_thread) {
#ifdef LMP_USE_MKL_RNG
for (int i = 0; i < _nrandom_thread; i++)
vslDeleteStream(&random_thread[i]);
#else
for (int i = 1; i < _nrandom_thread; i++)
delete random_thread[i];
#endif
}
delete []random_thread;
#endif
PairDPD::read_restart_settings(fp);
_nrandom_thread = comm->nthreads;
#ifdef LMP_USE_MKL_RNG
random_thread=new VSLStreamStatePtr[comm->nthreads];
#if defined(_OPENMP)
#pragma omp parallel
{
int tid = omp_get_thread_num();
vslNewStream(&random_thread[tid], LMP_MKL_RNG,
seed + comm->me + comm->nprocs * tid );
}
#endif
#else
random_thread =new RanMars*[comm->nthreads];
random_thread[0] = random;
#if defined(_OPENMP)
#pragma omp parallel
{
int tid = omp_get_thread_num();
if (tid > 0)
random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
}
#endif
#endif
}

View File

@ -0,0 +1,110 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
Shun Xu (Computer Network Information Center, CAS)
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(dpd/intel,PairDPDIntel)
#else
#ifndef LMP_PAIR_DPD_INTEL_H
#define LMP_PAIR_DPD_INTEL_H
#include "pair_dpd.h"
#include "fix_intel.h"
#ifdef LMP_USE_MKL_RNG
#include "mkl_vsl.h"
#else
#include "random_mars.h"
#endif
namespace LAMMPS_NS {
class PairDPDIntel : public PairDPD {
public:
PairDPDIntel(class LAMMPS *);
~PairDPDIntel();
virtual void compute(int, int);
void settings(int, char **);
void init_style();
void read_restart_settings(FILE *);
private:
FixIntel *fix;
int _cop, _onetype, _nrandom_thread;
#ifdef LMP_USE_MKL_RNG
VSLStreamStatePtr *random_thread;
#else
RanMars **random_thread;
#endif
template <class flt_t> class ForceConst;
template <class flt_t, class acc_t>
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc);
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc, const int astart, const int aend);
template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers);
// ----------------------------------------------------------------------
template <class flt_t>
class ForceConst {
public:
typedef struct { flt_t icut, a0, gamma, sigma; } fc_packed1;
_alignvar(flt_t special_lj[4],64);
fc_packed1 **param;
flt_t **rand_buffer_thread;
int *rngi;
ForceConst() : _ntypes(0) {}
~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
void set_ntypes(const int ntypes, const int nthreads, const int max_nbors,
Memory *memory, const int cop);
private:
int _ntypes, _cop;
Memory *_memory;
};
ForceConst<float> force_const_single;
ForceConst<double> force_const_double;
};
}
#endif
#endif
/* ERROR/WARNING messages:
E: The 'package intel' command is required for /intel styles
Self-explanatory.
*/

View File

@ -68,7 +68,7 @@ void VerletLRTIntel::init()
_intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0));
#ifdef LMP_INTEL_NOLRT
#ifndef LMP_INTEL_USELRT
error->all(FLERR,
"LRT otion for Intel package disabled at compile time");
#endif

View File

@ -23,10 +23,7 @@ IntegrateStyle(verlet/lrt/intel,VerletLRTIntel)
#include "verlet.h"
#include "pppm_intel.h"
#ifndef LMP_INTEL_USELRT
#define LMP_INTEL_NOLRT
#else
#ifdef LMP_INTEL_USELRT
#ifdef LMP_INTEL_LRT11
#define _LMP_INTEL_LRT_11
#include <thread>