diff --git a/doc/src/JPG/user_intel.png b/doc/src/JPG/user_intel.png index 7ec83b3207..5061f1af2e 100755 Binary files a/doc/src/JPG/user_intel.png and b/doc/src/JPG/user_intel.png differ diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt index 83e17b4f27..aaa38d7de2 100644 --- a/doc/src/accelerate_intel.txt +++ b/doc/src/accelerate_intel.txt @@ -25,14 +25,14 @@ LAMMPS to run on the CPU cores and coprocessor cores simultaneously. [Currently Available USER-INTEL Styles:] Angle Styles: charmm, harmonic :ulb,l -Bond Styles: fene, harmonic :l +Bond Styles: fene, fourier, harmonic :l Dihedral Styles: charmm, harmonic, opls :l -Fixes: nve, npt, nvt, nvt/sllod :l +Fixes: nve, npt, nvt, nvt/sllod, nve/asphere :l Improper Styles: cvff, harmonic :l Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long, -buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, -lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo, -sw, tersoff :l +buck, dpd, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, +lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, +rebo, sw, tersoff :l K-Space Styles: pppm, pppm/disp :l :ule @@ -54,11 +54,12 @@ warmup run (for use with offload benchmarks). :c,image(JPG/user_intel.png) Results are speedups obtained on Intel Xeon E5-2697v4 processors -(code-named Broadwell) and Intel Xeon Phi 7250 processors -(code-named Knights Landing) with "June 2017" LAMMPS built with -Intel Parallel Studio 2017 update 2. Results are with 1 MPI task -per physical core. See {src/USER-INTEL/TEST/README} for the raw -simulation rates and instructions to reproduce. +(code-named Broadwell), Intel Xeon Phi 7250 processors (code-named +Knights Landing), and Intel Xeon Gold 6148 processors (code-named +Skylake) with "June 2017" LAMMPS built with Intel Parallel Studio +2017 update 2. Results are with 1 MPI task per physical core. See +{src/USER-INTEL/TEST/README} for the raw simulation rates and +instructions to reproduce. :line @@ -82,6 +83,11 @@ this order :l The {newton} setting applies to all atoms, not just atoms shared between MPI tasks :l Vectorization can change the order for adding pairwise forces :l +When using the -DLMP_USE_MKL_RNG define (all included intel optimized +makefiles do) at build time, the random number generator for +dissipative particle dynamics (pair style dpd/intel) uses the Mersenne +Twister generator included in the Intel MKL library (that should be +more robust than the default Masaglia random number generator) :l :ule The precision mode (described below) used with the USER-INTEL @@ -108,7 +114,7 @@ $t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l For some of the simple 2-body potentials without long-range electrostatics, performance and scalability can be better with the "newton off" setting added to the input script :l -For simulations on higher node counts, add "processors * * * grid +For simulations on higher node counts, add "processors * * * grid numa" to the beginning of the input script for better scalability :l If using {kspace_style pppm} in the input script, add "kspace_modify diff ad" for better performance :l @@ -119,8 +125,8 @@ For Intel Xeon Phi CPUs: Runs should be performed using MCDRAM. :ulb,l :ule -For simulations using {kspace_style pppm} on Intel CPUs -supporting AVX-512: +For simulations using {kspace_style pppm} on Intel CPUs supporting +AVX-512: Add "kspace_modify diff ad" to the input script :ulb,l The command-line option should be changed to @@ -237,14 +243,17 @@ However, if you do not have coprocessors on your system, building without offload support will produce a smaller binary. The general requirements for Makefiles with the USER-INTEL package -are as follows. "-DLAMMPS_MEMALIGN=64" is required for CCFLAGS. When -using Intel compilers, "-restrict" is required and "-qopenmp" is -highly recommended for CCFLAGS and LINKFLAGS. LIB should include -"-ltbbmalloc". For builds supporting offload, "-DLMP_INTEL_OFFLOAD" -is required for CCFLAGS and "-qoffload" is required for LINKFLAGS. -Other recommended CCFLAG options for best performance are -"-O2 -fno-alias -ansi-alias -qoverride-limits fp-model fast=2 --no-prec-div". +are as follows. When using Intel compilers, "-restrict" is required +and "-qopenmp" is highly recommended for CCFLAGS and LINKFLAGS. +CCFLAGS should include "-DLMP_INTEL_USELRT" (unless POSIX Threads +are not supported in the build environment) and "-DLMP_USE_MKL_RNG" +(unless Intel Math Kernel Library (MKL) is not available in the build +environment). For Intel compilers, LIB should include "-ltbbmalloc" +or if the library is not available, "-DLMP_INTEL_NO_TBB" can be added +to CCFLAGS. For builds supporting offload, "-DLMP_INTEL_OFFLOAD" is +required for CCFLAGS and "-qoffload" is required for LINKFLAGS. Other +recommended CCFLAG options for best performance are "-O2 -fno-alias +-ansi-alias -qoverride-limits fp-model fast=2 -no-prec-div". NOTE: The vectorization and math capabilities can differ depending on the CPU. For Intel compilers, the "-x" flag specifies the type of diff --git a/doc/src/dihedral_fourier.txt b/doc/src/dihedral_fourier.txt index da892b59da..0accbb22bf 100644 --- a/doc/src/dihedral_fourier.txt +++ b/doc/src/dihedral_fourier.txt @@ -7,6 +7,7 @@ :line dihedral_style fourier command :h3 +dihedral_style fourier/intel command :h3 dihedral_style fourier/omp command :h3 [Syntax:] diff --git a/doc/src/pair_dpd.txt b/doc/src/pair_dpd.txt index 8d194bb092..9e29e93430 100644 --- a/doc/src/pair_dpd.txt +++ b/doc/src/pair_dpd.txt @@ -8,6 +8,7 @@ pair_style dpd command :h3 pair_style dpd/gpu command :h3 +pair_style dpd/intel command :h3 pair_style dpd/omp command :h3 pair_style dpd/tstat command :h3 pair_style dpd/tstat/gpu command :h3 diff --git a/src/MAKE/MACHINES/Makefile.cori2 b/src/MAKE/MACHINES/Makefile.cori2 index a367d54080..45e1ab1f8a 100755 --- a/src/MAKE/MACHINES/Makefile.cori2 +++ b/src/MAKE/MACHINES/Makefile.cori2 @@ -15,13 +15,14 @@ SHELL = /bin/sh CC = CC OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_NO_TBB +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG -DLMP_INTEL_NO_TBB \ + $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = CC -LINKFLAGS = -g -qopenmp $(OPTFLAGS) +LINKFLAGS = -qopenmp $(OPTFLAGS) LIB = SIZE = size diff --git a/src/MAKE/OPTIONS/Makefile.intel_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_coprocessor index a717be93ff..75e4d89170 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_coprocessor +++ b/src/MAKE/OPTIONS/Makefile.intel_coprocessor @@ -10,7 +10,7 @@ CC = mpiicpc MIC_OPT = -qoffload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\"" CCFLAGS = -g -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \ -xHost -fno-alias -ansi-alias -restrict -DLMP_INTEL_USELRT \ - -qoverride-limits $(MIC_OPT) + -qoverride-limits $(MIC_OPT) -DLMP_USE_MKL_RNG SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu b/src/MAKE/OPTIONS/Makefile.intel_cpu old mode 100755 new mode 100644 index b7db064574..41d0f959fe --- a/src/MAKE/OPTIONS/Makefile.intel_cpu +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu @@ -8,14 +8,14 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpiicpc -LINKFLAGS = -g -qopenmp $(OPTFLAGS) -LIB = -ltbbmalloc -ltbbmalloc_proxy +LINKFLAGS = -qopenmp $(OPTFLAGS) +LIB = -ltbbmalloc SIZE = size ARCHIVE = ar diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi index 8a45b781f8..ef514f43c6 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi @@ -8,8 +8,8 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich index 7ca59e7b1c..e4dc74d79b 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich @@ -8,13 +8,13 @@ SHELL = /bin/sh CC = mpicxx -cxx=icc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpicxx -cxx=icc -LINKFLAGS = -g -qopenmp $(OPTFLAGS) +LINKFLAGS = -qopenmp $(OPTFLAGS) LIB = -ltbbmalloc SIZE = size diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi index fe1be99e58..457a64b223 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi @@ -9,14 +9,14 @@ SHELL = /bin/sh export OMPI_CXX = icc CC = mpicxx OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpicxx -LINKFLAGS = -g -qopenmp $(OPTFLAGS) -LIB = -ltbbmalloc -ltbbmalloc_proxy +LINKFLAGS = -qopenmp $(OPTFLAGS) +LIB = -ltbbmalloc SIZE = size ARCHIVE = ar diff --git a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor deleted file mode 100644 index 406e98b36d..0000000000 --- a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor +++ /dev/null @@ -1,123 +0,0 @@ -# intel_phi = USER-INTEL with Phi x200 (KNL) offload support,Intel MPI,MKL FFT - -SHELL = /bin/sh - -# --------------------------------------------------------------------- -# compiler/linker settings -# specify flags and libraries needed for your compiler - -CC = mpiicpc -MIC_OPT = -qoffload-arch=mic-avx512 -fp-model fast=2 -CCFLAGS = -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \ - -xHost -fno-alias -ansi-alias -restrict \ - -qoverride-limits $(MIC_OPT) -DLMP_INTEL_USELRT -SHFLAGS = -fPIC -DEPFLAGS = -M - -LINK = mpiicpc -LINKFLAGS = -g -O3 -xHost -qopenmp -qoffload $(MIC_OPT) -LIB = -ltbbmalloc -SIZE = size - -ARCHIVE = ar -ARFLAGS = -rc -SHLIBFLAGS = -shared - -# --------------------------------------------------------------------- -# LAMMPS-specific settings, all OPTIONAL -# specify settings for LAMMPS features you will use -# if you change any -D setting, do full re-compile after "make clean" - -# LAMMPS ifdef settings -# see possible settings in Section 2.2 (step 4) of manual - -LMP_INC = -DLAMMPS_GZIP -DLAMMPS_JPEG - -# MPI library -# see discussion in Section 2.2 (step 5) of manual -# MPI wrapper compiler/linker can provide this info -# can point to dummy MPI library in src/STUBS as in Makefile.serial -# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts -# INC = path for mpi.h, MPI compiler settings -# PATH = path for MPI library -# LIB = name of MPI library - -MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -MPI_PATH = -MPI_LIB = - -# FFT library -# see discussion in Section 2.2 (step 6) of manaul -# can be left blank to use provided KISS FFT library -# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings -# PATH = path for FFT library -# LIB = name of FFT library - -FFT_INC = -DFFT_MKL -DFFT_SINGLE -FFT_PATH = -FFT_LIB = -L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core - -# JPEG and/or PNG library -# see discussion in Section 2.2 (step 7) of manual -# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC -# INC = path(s) for jpeglib.h and/or png.h -# PATH = path(s) for JPEG library and/or PNG library -# LIB = name(s) of JPEG library and/or PNG library - -JPG_INC = -JPG_PATH = -JPG_LIB = -ljpeg - -# --------------------------------------------------------------------- -# build rules and dependencies -# do not edit this section - -include Makefile.package.settings -include Makefile.package - -EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC) -EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH) -EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) -EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS) -EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS) - -# Path to src files - -vpath %.cpp .. -vpath %.h .. - -# Link target - -$(EXE): $(OBJ) $(EXTRA_LINK_DEPENDS) - $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) - $(SIZE) $(EXE) - -# Library targets - -lib: $(OBJ) $(EXTRA_LINK_DEPENDS) - $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) - -shlib: $(OBJ) $(EXTRA_LINK_DEPENDS) - $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ - $(OBJ) $(EXTRA_LIB) $(LIB) - -# Compilation rules - -%.o:%.cpp $(EXTRA_CPP_DEPENDS) - $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< - -%.d:%.cpp $(EXTRA_CPP_DEPENDS) - $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ - -%.o:%.cu $(EXTRA_CPP_DEPENDS) - $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< - -# Individual dependencies - -depend : fastdep.exe $(SRC) - @./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1 - -fastdep.exe: ../DEPEND/fastdep.c - cc -O -o $@ $< - -sinclude .depend diff --git a/src/MAKE/OPTIONS/Makefile.knl b/src/MAKE/OPTIONS/Makefile.knl index 881c51f0e4..8e266a4fce 100644 --- a/src/MAKE/OPTIONS/Makefile.knl +++ b/src/MAKE/OPTIONS/Makefile.knl @@ -8,13 +8,13 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ - -fno-alias -ansi-alias -restrict $(OPTFLAGS) +CCFLAGS = -qopenmp -qno-offload -fno-alias -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpiicpc -LINKFLAGS = -g -qopenmp $(OPTFLAGS) +LINKFLAGS = -qopenmp $(OPTFLAGS) LIB = -ltbbmalloc SIZE = size diff --git a/src/USER-INTEL/Install.sh b/src/USER-INTEL/Install.sh index f7163e6791..da553d158a 100644 --- a/src/USER-INTEL/Install.sh +++ b/src/USER-INTEL/Install.sh @@ -46,7 +46,7 @@ action nbin_intel.h action nbin_intel.cpp action npair_intel.h action npair_intel.cpp -action intel_simd.h pair_sw_intel.cpp +action intel_simd.h action intel_intrinsics.h pair_tersoff_intel.cpp action intel_intrinsics_airebo.h pair_airebo_intel.cpp diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README index 3b84446057..871d881f39 100644 --- a/src/USER-INTEL/README +++ b/src/USER-INTEL/README @@ -30,28 +30,37 @@ be added or changed in the Makefile depending on the version: 2017 update 2 - No changes needed 2017 updates 3 or 4 - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512 -2018 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high +2018 inital release - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512 +2018u1 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high ----------------------------------------------------------------------------- When using the suffix command with "intel", intel styles will be used if they exist. If the suffix command is used with "hybrid intel omp" and the USER-OMP -USER-OMP styles will be used whenever USER-INTEL styles are not available. This -allow for running most styles in LAMMPS with threading. +is installed, USER-OMP styles will be used whenever USER-INTEL styles are not +available. This allow for running most styles in LAMMPS with threading. ----------------------------------------------------------------------------- -The Long-Range Thread mode (LRT) in the Intel package currently uses -pthreads by default. If pthreads are not supported in the build environment, -the compile flag "-DLMP_INTEL_NOLRT" will disable the feature to allow for -builds without pthreads. Alternatively, "-DLMP_INTEL_LRT11" can be used to -build with compilers that support threads using the C++11 standard. When using +The Long-Range Thread mode (LRT) in the Intel package is enabled through the +-DLMP_INTEL_USELRT define at compile time. All intel optimized makefiles +include this define. This feature will use pthreads by default. +Alternatively, "-DLMP_INTEL_LRT11" can be used to build with compilers that +support threads intrinsically using the C++11 standard. When using LRT mode, you might need to disable OpenMP affinity settings (e.g. export KMP_AFFINITY=none). LAMMPS will generate a warning if the settings need to be changed. ----------------------------------------------------------------------------- +Unless Intel Math Kernel Library (MKL) is unavailable, -DLMP_USE_MKL_RNG +should be added to the compile flags. This will enable using the MKL Mersenne +Twister random number generator (RNG) for Dissipative Particle Dynamics +(DPD). This RNG can allow significantly faster performance and it also has a +significantly longer period than the standard RNG for DPD. + +----------------------------------------------------------------------------- + In order to use offload to Intel(R) Xeon Phi(TM) coprocessors, the flag -DLMP_INTEL_OFFLOAD should be set in the Makefile. Offload requires the use of Intel compilers. diff --git a/src/USER-INTEL/TEST/README b/src/USER-INTEL/TEST/README index 434189dd26..62602d5920 100644 --- a/src/USER-INTEL/TEST/README +++ b/src/USER-INTEL/TEST/README @@ -9,6 +9,7 @@ # in.intel.tersoff - Silicon benchmark with Tersoff # in.intel.water - Coarse-grain water benchmark using Stillinger-Weber # in.intel.airebo - Polyethelene benchmark with AIREBO +# in.intel.dpd - Dissipative Particle Dynamics # ############################################################################# @@ -16,16 +17,17 @@ # Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017 # - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi # -# Xeon E5-2697v4 Xeon Phi 7250 +# Xeon E5-2697v4 Xeon Phi 7250 Xeon Gold 6148 # -# in.intel.lj - 199.5 282.3 -# in.intel.rhodo - 12.4 17.5 -# in.intel.lc - 19.0 25.7 -# in.intel.eam - 59.4 92.8 -# in.intel.sw - 132.4 161.9 -# in.intel.tersoff - 83.3 101.1 -# in.intel.water - 53.4 90.3 -# in.intel.airebo - 7.3 11.8 +# in.intel.lj - 199.5 282.3 317.3 +# in.intel.rhodo - 12.4 17.5 24.4 +# in.intel.lc - 19.0 25.7 26.8 +# in.intel.eam - 59.4 92.8 105.6 +# in.intel.sw - 132.4 161.9 213.8 +# in.intel.tersoff - 83.3 101.1 109.6 +# in.intel.water - 53.4 90.3 105.5 +# in.intel.airebo - 7.3 11.8 17.6 +# in.intel.dpd - 74.5 100.4 148.1 # ############################################################################# diff --git a/src/USER-INTEL/TEST/in.intel.dpd b/src/USER-INTEL/TEST/in.intel.dpd new file mode 100644 index 0000000000..e257d91f84 --- /dev/null +++ b/src/USER-INTEL/TEST/in.intel.dpd @@ -0,0 +1,48 @@ +# DPD benchmark + +variable N index on # Newton Setting +variable w index 10 # Warmup Timesteps +variable t index 4000 # Main Run Timesteps +variable m index 1 # Main Run Timestep Multiplier +variable n index 0 # Use NUMA Mapping for Multi-Node +variable p index 0 # Use Power Measurement + +variable x index 4 +variable y index 2 +variable z index 2 + +variable xx equal 20*$x +variable yy equal 20*$y +variable zz equal 20*$z +variable rr equal floor($t*$m) + +newton $N +if "$n > 0" then "processors * * * grid numa" + +units lj +atom_style atomic +comm_modify mode single vel yes + +lattice fcc 3.0 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +create_box 1 box +create_atoms 1 box +mass 1 1.0 + +velocity all create 1.0 87287 loop geom + +pair_style dpd 1.0 1.0 928948 +pair_coeff 1 1 25.0 4.5 + +neighbor 0.5 bin +neigh_modify delay 0 every 1 + +fix 1 all nve +timestep 0.04 + +thermo 1000 + +if "$p > 0" then "run_style verlet/power" + +if "$w > 0" then "run $w" +run ${rr} diff --git a/src/USER-INTEL/dihedral_fourier_intel.cpp b/src/USER-INTEL/dihedral_fourier_intel.cpp new file mode 100644 index 0000000000..805ffc0e25 --- /dev/null +++ b/src/USER-INTEL/dihedral_fourier_intel.cpp @@ -0,0 +1,441 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include +#include +#include "dihedral_fourier_intel.h" +#include "atom.h" +#include "comm.h" +#include "memory.h" +#include "neighbor.h" +#include "domain.h" +#include "force.h" +#include "pair.h" +#include "update.h" +#include "error.h" + +#include "suffix.h" +using namespace LAMMPS_NS; + +#define PTOLERANCE (flt_t)1.05 +#define MTOLERANCE (flt_t)-1.05 +typedef struct { int a,b,c,d,t; } int5_t; + +/* ---------------------------------------------------------------------- */ + +DihedralFourierIntel::DihedralFourierIntel(class LAMMPS *lmp) + : DihedralFourier(lmp) +{ + suffix_flag |= Suffix::INTEL; +} + +/* ---------------------------------------------------------------------- */ + +void DihedralFourierIntel::compute(int eflag, int vflag) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_use_base) { + DihedralFourier::compute(eflag, vflag); + return; + } + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralFourierIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) { + ev_setup(eflag,vflag); + } else evflag = 0; + + if (evflag) { + if (vflag && !eflag) { + if (force->newton_bond) + eval<0,1,1>(vflag, buffers, fc); + else + eval<0,1,0>(vflag, buffers, fc); + } else { + if (force->newton_bond) + eval<1,1,1>(vflag, buffers, fc); + else + eval<1,1,0>(vflag, buffers, fc); + } + } else { + if (force->newton_bond) + eval<0,0,1>(vflag, buffers, fc); + else + eval<0,0,0>(vflag, buffers, fc); + } +} + +template +void DihedralFourierIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) + +{ + const int inum = neighbor->ndihedrallist; + if (inum == 0) return; + + ATOM_T * _noalias const x = buffers->get_x(0); + const int nlocal = atom->nlocal; + const int nall = nlocal + atom->nghost; + + int f_stride; + if (NEWTON_BOND) f_stride = buffers->get_stride(nall); + else f_stride = buffers->get_stride(nlocal); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + + acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5; + if (EFLAG) oedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + } + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(f_start,f_stride,fc) \ + reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif + + FORCE_T * _noalias const f = f_start + (tid * f_stride); + if (fix->need_zero(tid)) + memset(f, 0, f_stride * sizeof(FORCE_T)); + + const int5_t * _noalias const dihedrallist = + (int5_t *) neighbor->dihedrallist[0]; + + #ifdef LMP_INTEL_USE_SIMDOFF + acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5; + if (EFLAG) sedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) + for (int n = nfrom; n < nto; n ++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif + const int i1 = dihedrallist[n].a; + const int i2 = dihedrallist[n].b; + const int i3 = dihedrallist[n].c; + const int i4 = dihedrallist[n].d; + const int type = dihedrallist[n].t; + + // 1st bond + + const flt_t vb1x = x[i1].x - x[i2].x; + const flt_t vb1y = x[i1].y - x[i2].y; + const flt_t vb1z = x[i1].z - x[i2].z; + + // 2nd bond + + const flt_t vb2xm = x[i2].x - x[i3].x; + const flt_t vb2ym = x[i2].y - x[i3].y; + const flt_t vb2zm = x[i2].z - x[i3].z; + + // 3rd bond + + const flt_t vb3x = x[i4].x - x[i3].x; + const flt_t vb3y = x[i4].y - x[i3].y; + const flt_t vb3z = x[i4].z - x[i3].z; + + // c,s calculation + + const flt_t ax = vb1y*vb2zm - vb1z*vb2ym; + const flt_t ay = vb1z*vb2xm - vb1x*vb2zm; + const flt_t az = vb1x*vb2ym - vb1y*vb2xm; + const flt_t bx = vb3y*vb2zm - vb3z*vb2ym; + const flt_t by = vb3z*vb2xm - vb3x*vb2zm; + const flt_t bz = vb3x*vb2ym - vb3y*vb2xm; + + const flt_t rasq = ax*ax + ay*ay + az*az; + const flt_t rbsq = bx*bx + by*by + bz*bz; + const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm; + const flt_t rg = sqrt(rgsq); + + flt_t rginv, ra2inv, rb2inv; + rginv = ra2inv = rb2inv = (flt_t)0.0; + if (rg > 0) rginv = (flt_t)1.0/rg; + if (rasq > 0) ra2inv = (flt_t)1.0/rasq; + if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq; + const flt_t rabinv = sqrt(ra2inv*rb2inv); + + flt_t c = (ax*bx + ay*by + az*bz)*rabinv; + const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z); + + // error check + #ifndef LMP_INTEL_USE_SIMDOFF + if (c > PTOLERANCE || c < MTOLERANCE) { + int me = comm->me; + + if (screen) { + char str[128]; + sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + me,tid,update->ntimestep, + atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", + me,x[i1].x,x[i1].y,x[i1].z); + fprintf(screen," 2nd atom: %d %g %g %g\n", + me,x[i2].x,x[i2].y,x[i2].z); + fprintf(screen," 3rd atom: %d %g %g %g\n", + me,x[i3].x,x[i3].y,x[i3].z); + fprintf(screen," 4th atom: %d %g %g %g\n", + me,x[i4].x,x[i4].y,x[i4].z); + } + } + #endif + + if (c > (flt_t)1.0) c = (flt_t)1.0; + if (c < (flt_t)-1.0) c = (flt_t)-1.0; + + flt_t deng; + flt_t df = (flt_t)0.0; + if (EFLAG) deng = (flt_t)0.0; + + for (int j = 0; j < nterms[type]; j++) { + const flt_t tcos_shift = fc.bp[j][type].cos_shift; + const flt_t tsin_shift = fc.bp[j][type].sin_shift; + const flt_t tk = fc.bp[j][type].k; + const int m = fc.bp[j][type].multiplicity; + + flt_t p = (flt_t)1.0; + flt_t ddf1, df1; + ddf1 = df1 = (flt_t)0.0; + + for (int i = 0; i < m; i++) { + ddf1 = p*c - df1*s; + df1 = p*s + df1*c; + p = ddf1; + } + + p = p*tcos_shift + df1*tsin_shift; + df1 = df1*tcos_shift - ddf1*tsin_shift; + df1 *= -m; + p += (flt_t)1.0; + + if (m == 0) { + p = (flt_t)1.0 + tcos_shift; + df1 = (flt_t)0.0; + } + + if (EFLAG) deng += tk * p; + df -= tk * df1; + } + + const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm; + const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm; + const flt_t fga = fg*ra2inv*rginv; + const flt_t hgb = hg*rb2inv*rginv; + const flt_t gaa = -ra2inv*rg; + const flt_t gbb = rb2inv*rg; + + const flt_t dtfx = gaa*ax; + const flt_t dtfy = gaa*ay; + const flt_t dtfz = gaa*az; + const flt_t dtgx = fga*ax - hgb*bx; + const flt_t dtgy = fga*ay - hgb*by; + const flt_t dtgz = fga*az - hgb*bz; + const flt_t dthx = gbb*bx; + const flt_t dthy = gbb*by; + const flt_t dthz = gbb*bz; + + const flt_t sx2 = df*dtgx; + const flt_t sy2 = df*dtgy; + const flt_t sz2 = df*dtgz; + + flt_t f1x = df*dtfx; + flt_t f1y = df*dtfy; + flt_t f1z = df*dtfz; + + const flt_t f2x = sx2 - f1x; + const flt_t f2y = sy2 - f1y; + const flt_t f2z = sz2 - f1z; + + flt_t f4x = df*dthx; + flt_t f4y = df*dthy; + flt_t f4z = df*dthz; + + const flt_t f3x = -sx2 - f4x; + const flt_t f3y = -sy2 - f4y; + const flt_t f3z = -sz2 - f4z; + + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, + f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal, + sv0, sv1, sv2, sv3, sv4, sv5); + #else + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, + f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal, + ov0, ov1, ov2, ov3, ov4, ov5); + #endif + } + + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } + + if (NEWTON_BOND || i4 < nlocal) { + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; + } + } + } // for n + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oedihedral += sedihedral; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; + } + #endif + } // omp parallel + + if (EFLAG) energy += oedihedral; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + } + + fix->set_reduce_flag(); +} + +/* ---------------------------------------------------------------------- */ + +void DihedralFourierIntel::init_style() +{ + DihedralFourier::init_style(); + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + _use_base = 0; + if (fix->offload_balance() != 0.0) { + _use_base = 1; + return; + } + #endif + + fix->bond_init_check(); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + pack_force_const(force_const_single, fix->get_mixed_buffers()); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + pack_force_const(force_const_double, fix->get_double_buffers()); + else + pack_force_const(force_const_single, fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralFourierIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + const int bp1 = atom->ndihedraltypes + 1; + fc.set_ntypes(bp1, setflag, nterms, memory); + + for (int i = 1; i < bp1; i++) { + if (setflag[i]) { + for (int j = 0; j < nterms[i]; j++) { + fc.bp[j][i].cos_shift = cos_shift[i][j]; + fc.bp[j][i].sin_shift = sin_shift[i][j]; + fc.bp[j][i].k = k[i][j]; + fc.bp[j][i].multiplicity = multiplicity[i][j]; + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +void DihedralFourierIntel::ForceConst::set_ntypes(const int nbondtypes, + int *setflag, + int *nterms, + Memory *memory) { + if (nbondtypes != _nbondtypes) { + if (_nbondtypes > 0) + _memory->destroy(bp); + + if (nbondtypes > 0) { + _maxnterms = 1; + for (int i = 1; i <= nbondtypes; i++) + if (setflag[i]) _maxnterms = MAX(_maxnterms, nterms[i]); + + _memory->create(bp, _maxnterms, nbondtypes, "dihedralfourierintel.bp"); + } + } + _nbondtypes = nbondtypes; + _memory = memory; +} diff --git a/src/USER-INTEL/dihedral_fourier_intel.h b/src/USER-INTEL/dihedral_fourier_intel.h new file mode 100644 index 0000000000..a775e129f4 --- /dev/null +++ b/src/USER-INTEL/dihedral_fourier_intel.h @@ -0,0 +1,82 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef DIHEDRAL_CLASS + +DihedralStyle(fourier/intel,DihedralFourierIntel) + +#else + +#ifndef LMP_DIHEDRAL_FOURIER_INTEL_H +#define LMP_DIHEDRAL_FOURIER_INTEL_H + +#include "dihedral_fourier.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + +class DihedralFourierIntel : public DihedralFourier { + + public: + DihedralFourierIntel(class LAMMPS *lmp); + virtual void compute(int, int); + void init_style(); + + private: + FixIntel *fix; + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + #ifdef _LMP_INTEL_OFFLOAD + int _use_base; + #endif + + template + class ForceConst { + public: + typedef struct { flt_t cos_shift, sin_shift, k; + int multiplicity; } fc_packed1; + + fc_packed1 **bp; + + ForceConst() : _nbondtypes(0) {} + ~ForceConst() { set_ntypes(0, NULL, NULL, NULL); } + + void set_ntypes(const int nbondtypes, int *setflag, int *nterms, + Memory *memory); + + private: + int _nbondtypes, _maxnterms; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp index 637fc0d06e..eac48b8510 100644 --- a/src/USER-INTEL/fix_intel.cpp +++ b/src/USER-INTEL/fix_intel.cpp @@ -285,6 +285,7 @@ int FixIntel::setmask() { int mask = 0; mask |= PRE_REVERSE; + mask |= MIN_PRE_REVERSE; #ifdef _LMP_INTEL_OFFLOAD mask |= POST_FORCE; mask |= MIN_POST_FORCE; diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h index 068e5ed890..d7093e79bb 100644 --- a/src/USER-INTEL/fix_intel.h +++ b/src/USER-INTEL/fix_intel.h @@ -43,6 +43,7 @@ class FixIntel : public Fix { virtual int setmask(); virtual void init(); virtual void setup(int); + inline void min_setup(int in) { setup(in); } void setup_pre_reverse(int eflag = 0, int vflag = 0); void pair_init_check(const bool cdmessage=false); @@ -50,6 +51,8 @@ class FixIntel : public Fix { void kspace_init_check(); void pre_reverse(int eflag = 0, int vflag = 0); + inline void min_pre_reverse(int eflag = 0, int vflag = 0) + { pre_reverse(eflag, vflag); } // Get all forces, calculation results from coprocesser void sync_coprocessor(); diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp index b4b664cb94..ac208f5a0c 100644 --- a/src/USER-INTEL/intel_buffers.cpp +++ b/src/USER-INTEL/intel_buffers.cpp @@ -409,6 +409,7 @@ void IntelBuffers::grow_ccache(const int off_flag, IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0); lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef"); #endif + memset(_ccachei, 0, vsize * sizeof(int)); memset(_ccachej, 0, vsize * sizeof(int)); #ifdef _LMP_INTEL_OFFLOAD @@ -425,7 +426,7 @@ void IntelBuffers::grow_ccache(const int off_flag, #pragma offload_transfer target(mic:_cop) \ nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \ nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \ - nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \ + in(ccachei:length(vsize) alloc_if(1) free_if(0)) \ in(ccachej:length(vsize) alloc_if(1) free_if(0)) } #ifdef LMP_USE_AVXCD diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h index a7663d54a6..d49d0d8b00 100644 --- a/src/USER-INTEL/intel_preprocess.h +++ b/src/USER-INTEL/intel_preprocess.h @@ -292,6 +292,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, ito = inum; \ } +#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \ + nthr, vecsize) \ + { \ + tid = 0; \ + ifrom = 0; \ + ip = 1; \ + ito = inum; \ + } + #endif #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \ diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp index 12101712f1..e6d45d7b2c 100644 --- a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp +++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp @@ -319,7 +319,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, const int bstart = binhead[ibin + binstart[k]]; const int bend = binhead[ibin + binend[k]]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #pragma simd #endif for (int jj = bstart; jj < bend; jj++) @@ -341,7 +340,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, const int bstart = binhead[ibin + stencil[k]]; const int bend = binhead[ibin + stencil[k] + 1]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #pragma simd #endif for (int jj = bstart; jj < bend; jj++) diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp index 79dc75366e..0068e02635 100644 --- a/src/USER-INTEL/npair_intel.cpp +++ b/src/USER-INTEL/npair_intel.cpp @@ -273,7 +273,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, const int bstart = binhead[ibin + binstart[k]]; const int bend = binhead[ibin + binend[k]]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #pragma simd #endif for (int jj = bstart; jj < bend; jj++) @@ -307,7 +306,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, const int bstart = binhead[ibin]; const int bend = binhead[ibin + 1]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned #pragma simd #endif for (int jj = bstart; jj < bend; jj++) { diff --git a/src/USER-INTEL/pair_dpd_intel.cpp b/src/USER-INTEL/pair_dpd_intel.cpp new file mode 100644 index 0000000000..c7cddfccc1 --- /dev/null +++ b/src/USER-INTEL/pair_dpd_intel.cpp @@ -0,0 +1,617 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + This software is distributed under the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) + Shun Xu (Computer Network Information Center, CAS) +------------------------------------------------------------------------- */ + +#include +#include "pair_dpd_intel.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "memory.h" +#include "modify.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "suffix.h" +using namespace LAMMPS_NS; + +#define LMP_MKL_RNG VSL_BRNG_MT19937 +#define FC_PACKED1_T typename ForceConst::fc_packed1 +#define IEPSILON 1.0e10 + +/* ---------------------------------------------------------------------- */ + +PairDPDIntel::PairDPDIntel(LAMMPS *lmp) : + PairDPD(lmp) +{ + suffix_flag |= Suffix::INTEL; + respa_enable = 0; + random_thread = NULL; + _nrandom_thread = 0; +} + +/* ---------------------------------------------------------------------- */ + +PairDPDIntel::~PairDPDIntel() +{ + #if defined(_OPENMP) + if (_nrandom_thread) { + #ifdef LMP_USE_MKL_RNG + for (int i = 0; i < _nrandom_thread; i++) + vslDeleteStream(&random_thread[i]); + #else + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; + #endif + } + #endif + delete []random_thread; +} + +/* ---------------------------------------------------------------------- */ + +void PairDPDIntel::compute(int eflag, int vflag) +{ + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); + + fix->balance_stamp(); + vflag_fdotr = 0; +} + +template +void PairDPDIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) { + ev_setup(eflag, vflag); + } else evflag = vflag_fdotr = 0; + + const int inum = list->inum; + const int nthreads = comm->nthreads; + const int host_start = fix->host_start_pair(); + const int offload_end = fix->offload_end_pair(); + const int ago = neighbor->ago; + + if (ago != 0 && fix->separate_buffers() == 0) { + fix->start_watch(TIME_PACK); + + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; + #if defined(_OPENMP) + #pragma omp parallel if(packthreads > 1) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, + packthreads, sizeof(ATOM_T)); + buffers->thr_pack(ifrom,ito,ago); + } + fix->stop_watch(TIME_PACK); + } + + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (_onetype) { + if (eflag) { + if (force->newton_pair) { + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); + } + } else { + if (force->newton_pair) { + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); + } + } + } else { + if (eflag) { + if (force->newton_pair) { + eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum); + } + } else { + if (force->newton_pair) { + eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum); + } + } + } +} + +template +void PairDPDIntel::eval(const int offload, const int vflag, + IntelBuffers *buffers, + const ForceConst &fc, + const int astart, const int aend) +{ + const int inum = aend - astart; + if (inum == 0) return; + int nlocal, nall, minlocal; + fix->get_buffern(offload, nlocal, nall, minlocal); + + const int ago = neighbor->ago; + IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall); + + ATOM_T * _noalias const x = buffers->get_x(offload); + typedef struct { double x, y, z; } lmp_vt; + lmp_vt *v = (lmp_vt *)atom->v[0]; + const flt_t dtinvsqrt = 1.0/sqrt(update->dt); + + const int * _noalias const numneigh = list->numneigh; + const int * _noalias const cnumneigh = buffers->cnumneigh(list); + const int * _noalias const firstneigh = buffers->firstneigh(list); + const FC_PACKED1_T * _noalias const param = fc.param[0]; + const flt_t * _noalias const special_lj = fc.special_lj; + int * _noalias const rngi_thread = fc.rngi; + const int rng_size = buffers->get_max_nbors(); + + const int ntypes = atom->ntypes + 1; + const int eatom = this->eflag_atom; + + // Determine how much data to transfer + int x_size, q_size, f_stride, ev_size, separate_flag; + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + int *overflow = fix->get_off_overflow_flag(); + { + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime(); + #endif + + IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, + f_stride, x, 0); + + acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; + if (EFLAG) oevdwl = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + + // loop over neighbors of my atoms + #if defined(_OPENMP) + #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); + iifrom += astart; + iito += astart; + + #ifdef LMP_USE_MKL_RNG + VSLStreamStatePtr *my_random = &(random_thread[tid]); + #else + RanMars *my_random = random_thread[tid]; + #endif + flt_t *my_rand_buffer = fc.rand_buffer_thread[tid]; + int rngi = rngi_thread[tid]; + + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + + flt_t icut, a0, gamma, sigma; + if (ONETYPE) { + icut = param[3].icut; + a0 = param[3].a0; + gamma = param[3].gamma; + sigma = param[3].sigma; + } + for (int i = iifrom; i < iito; i += iip) { + int itype, ptr_off; + const FC_PACKED1_T * _noalias parami; + if (!ONETYPE) { + itype = x[i].w; + ptr_off = itype * ntypes; + parami = param + ptr_off; + } + + const int * _noalias const jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + + acc_t fxtmp, fytmp, fztmp, fwtmp; + acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; + + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + const flt_t vxtmp = v[i].x; + const flt_t vytmp = v[i].y; + const flt_t vztmp = v[i].z; + fxtmp = fytmp = fztmp = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + + if (rngi + jnum > rng_size) { + #ifdef LMP_USE_MKL_RNG + if (sizeof(flt_t) == sizeof(float)) + vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, + (float*)my_rand_buffer, (float)0.0, (float)1.0 ); + else + vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, + (double*)my_rand_buffer, 0.0, 1.0 ); + #else + for (int jj = 0; jj < rngi; jj++) + my_rand_buffer[jj] = my_random->gaussian(); + #endif + rngi = 0; + } + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) + #endif + for (int jj = 0; jj < jnum; jj++) { + flt_t forcelj, evdwl; + forcelj = evdwl = (flt_t)0.0; + + int j, jtype, sbindex; + if (!ONETYPE) { + sbindex = jlist[jj] >> SBBITS & 3; + j = jlist[jj] & NEIGHMASK; + } else + j = jlist[jj]; + + const flt_t delx = xtmp - x[j].x; + const flt_t dely = ytmp - x[j].y; + const flt_t delz = ztmp - x[j].z; + if (!ONETYPE) { + jtype = x[j].w; + icut = parami[jtype].icut; + } + const flt_t rsq = delx * delx + dely * dely + delz * delz; + const flt_t rinv = (flt_t)1.0/sqrt(rsq); + + if (rinv > icut) { + flt_t factor_dpd; + if (!ONETYPE) factor_dpd = special_lj[sbindex]; + + flt_t delvx = vxtmp - v[j].x; + flt_t delvy = vytmp - v[j].y; + flt_t delvz = vztmp - v[j].z; + flt_t dot = delx*delvx + dely*delvy + delz*delvz; + flt_t randnum = my_rand_buffer[jj]; + + flt_t iwd = rinv - icut; + if (rinv > (flt_t)IEPSILON) iwd = (flt_t)0.0; + + if (!ONETYPE) { + a0 = parami[jtype].a0; + gamma = parami[jtype].gamma; + sigma = parami[jtype].sigma; + } + flt_t fpair = a0 - iwd * gamma * dot + sigma * randnum * dtinvsqrt; + if (!ONETYPE) fpair *= factor_dpd; + fpair *= iwd; + + const flt_t fpx = fpair * delx; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * dely; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * delz; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + if (EFLAG) { + flt_t cut = (flt_t)1.0/icut; + flt_t r = (flt_t)1.0/rinv; + evdwl = (flt_t)0.5 * a0 * (cut - (flt_t)2.0*r + rsq * icut); + if (!ONETYPE) evdwl *= factor_dpd; + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl; + } + } + + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); + } // if rsq + } // for jj + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } + + IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); + rngi += jnum; + } // for ii + + IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); + rngi_thread[tid] = rngi; + } // end omp + + IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; + } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; + } + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime() - *timer_compute; + #endif + } // end offload + + if (offload) + fix->stop_watch(TIME_OFFLOAD_LATENCY); + else + fix->stop_watch(TIME_HOST_PAIR); + + if (EFLAG || vflag) + fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); + else + fix->add_result_array(f_start, 0, offload); +} + +/* ---------------------------------------------------------------------- + global settings + ------------------------------------------------------------------------- */ + +void PairDPDIntel::settings(int narg, char **arg) { + #if defined(_OPENMP) + if (_nrandom_thread) { + #ifdef LMP_USE_MKL_RNG + for (int i = 0; i < _nrandom_thread; i++) + vslDeleteStream(&random_thread[i]); + #else + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; + #endif + } + delete []random_thread; + #endif + PairDPD::settings(narg,arg); + _nrandom_thread = comm->nthreads; + + #ifdef LMP_USE_MKL_RNG + + random_thread=new VSLStreamStatePtr[comm->nthreads]; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + vslNewStream(&random_thread[tid], LMP_MKL_RNG, + seed + comm->me + comm->nprocs * tid ); + } + #endif + + #else + + random_thread =new RanMars*[comm->nthreads]; + random_thread[0] = random; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + if (tid > 0) + random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid); + } + #endif + + #endif +} + +/* ---------------------------------------------------------------------- */ + +void PairDPDIntel::init_style() +{ + PairDPD::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } + neighbor->requests[neighbor->nrequest-1]->intel = 1; + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + fix->pair_init_check(); + #ifdef _LMP_INTEL_OFFLOAD + if (fix->offload_balance() != 0.0) + error->all(FLERR, + "Offload for dpd/intel is not yet available. Set balance to 0."); + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + pack_force_const(force_const_single, fix->get_mixed_buffers()); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) + pack_force_const(force_const_double, fix->get_double_buffers()); + else + pack_force_const(force_const_single, fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template +void PairDPDIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + _onetype = 0; + if (atom->ntypes == 1 && !atom->molecular) _onetype = 1; + + int tp1 = atom->ntypes + 1; + fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop); + buffers->set_ntypes(tp1); + flt_t **cutneighsq = buffers->get_cutneighsq(); + + // Repeat cutsq calculation because done after call to init_style + double cut, cutneigh; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cutneigh = cut + neighbor->skin; + cutsq[i][j] = cutsq[j][i] = cut*cut; + cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; + double icut = 1.0 / cut; + fc.param[i][j].icut = fc.param[j][i].icut = icut; + } else { + cut = init_one(i,j); + double icut = 1.0 / cut; + fc.param[i][j].icut = fc.param[j][i].icut = icut; + } + } + } + + for (int i = 0; i < 4; i++) { + fc.special_lj[i] = force->special_lj[i]; + fc.special_lj[0] = 1.0; + } + + for (int i = 0; i < tp1; i++) { + for (int j = 0; j < tp1; j++) { + fc.param[i][j].a0 = a0[i][j]; + fc.param[i][j].gamma = gamma[i][j]; + fc.param[i][j].sigma = sigma[i][j]; + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +void PairDPDIntel::ForceConst::set_ntypes(const int ntypes, + const int nthreads, + const int max_nbors, + Memory *memory, + const int cop) { + if (ntypes != _ntypes) { + if (_ntypes > 0) { + _memory->destroy(param); + _memory->destroy(rand_buffer_thread); + _memory->destroy(rngi); + } + if (ntypes > 0) { + _cop = cop; + memory->create(param,ntypes,ntypes,"fc.param"); + memory->create(rand_buffer_thread, nthreads, max_nbors, + "fc.rand_buffer_thread"); + memory->create(rngi,nthreads,"fc.param"); + for (int i = 0; i < nthreads; i++) rngi[i] = max_nbors; + } + } + _ntypes = ntypes; + _memory = memory; +} + +/* ---------------------------------------------------------------------- + proc 0 reads from restart file, bcasts + ------------------------------------------------------------------------- */ + +void PairDPDIntel::read_restart_settings(FILE *fp) +{ + #if defined(_OPENMP) + if (_nrandom_thread) { + #ifdef LMP_USE_MKL_RNG + for (int i = 0; i < _nrandom_thread; i++) + vslDeleteStream(&random_thread[i]); + #else + for (int i = 1; i < _nrandom_thread; i++) + delete random_thread[i]; + #endif + } + delete []random_thread; + #endif + PairDPD::read_restart_settings(fp); + _nrandom_thread = comm->nthreads; + + #ifdef LMP_USE_MKL_RNG + + random_thread=new VSLStreamStatePtr[comm->nthreads]; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + vslNewStream(&random_thread[tid], LMP_MKL_RNG, + seed + comm->me + comm->nprocs * tid ); + } + #endif + + #else + + random_thread =new RanMars*[comm->nthreads]; + random_thread[0] = random; + #if defined(_OPENMP) + #pragma omp parallel + { + int tid = omp_get_thread_num(); + if (tid > 0) + random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid); + } + #endif + + #endif +} diff --git a/src/USER-INTEL/pair_dpd_intel.h b/src/USER-INTEL/pair_dpd_intel.h new file mode 100644 index 0000000000..416d873c00 --- /dev/null +++ b/src/USER-INTEL/pair_dpd_intel.h @@ -0,0 +1,110 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) + Shun Xu (Computer Network Information Center, CAS) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(dpd/intel,PairDPDIntel) + +#else + +#ifndef LMP_PAIR_DPD_INTEL_H +#define LMP_PAIR_DPD_INTEL_H + +#include "pair_dpd.h" +#include "fix_intel.h" + +#ifdef LMP_USE_MKL_RNG +#include "mkl_vsl.h" +#else +#include "random_mars.h" +#endif + +namespace LAMMPS_NS { + +class PairDPDIntel : public PairDPD { + + public: + PairDPDIntel(class LAMMPS *); + ~PairDPDIntel(); + + virtual void compute(int, int); + void settings(int, char **); + void init_style(); + void read_restart_settings(FILE *); + + private: + FixIntel *fix; + int _cop, _onetype, _nrandom_thread; + + #ifdef LMP_USE_MKL_RNG + VSLStreamStatePtr *random_thread; + #else + RanMars **random_thread; + #endif + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int offload, const int vflag, + IntelBuffers * buffers, + const ForceConst &fc, const int astart, const int aend); + + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + // ---------------------------------------------------------------------- + + template + class ForceConst { + public: + typedef struct { flt_t icut, a0, gamma, sigma; } fc_packed1; + + _alignvar(flt_t special_lj[4],64); + fc_packed1 **param; + flt_t **rand_buffer_thread; + int *rngi; + + ForceConst() : _ntypes(0) {} + ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); } + + void set_ntypes(const int ntypes, const int nthreads, const int max_nbors, + Memory *memory, const int cop); + + private: + int _ntypes, _cop; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: The 'package intel' command is required for /intel styles + +Self-explanatory. + +*/ diff --git a/src/USER-INTEL/verlet_lrt_intel.cpp b/src/USER-INTEL/verlet_lrt_intel.cpp index 81f4586143..9ff5f85176 100644 --- a/src/USER-INTEL/verlet_lrt_intel.cpp +++ b/src/USER-INTEL/verlet_lrt_intel.cpp @@ -68,7 +68,7 @@ void VerletLRTIntel::init() _intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0)); - #ifdef LMP_INTEL_NOLRT + #ifndef LMP_INTEL_USELRT error->all(FLERR, "LRT otion for Intel package disabled at compile time"); #endif diff --git a/src/USER-INTEL/verlet_lrt_intel.h b/src/USER-INTEL/verlet_lrt_intel.h index 813cd53605..0d7154ff64 100644 --- a/src/USER-INTEL/verlet_lrt_intel.h +++ b/src/USER-INTEL/verlet_lrt_intel.h @@ -23,10 +23,7 @@ IntegrateStyle(verlet/lrt/intel,VerletLRTIntel) #include "verlet.h" #include "pppm_intel.h" -#ifndef LMP_INTEL_USELRT -#define LMP_INTEL_NOLRT -#else - +#ifdef LMP_INTEL_USELRT #ifdef LMP_INTEL_LRT11 #define _LMP_INTEL_LRT_11 #include