diff --git a/src/Depend.sh b/src/Depend.sh index 3786172d64..632aa4a3e8 100644 --- a/src/Depend.sh +++ b/src/Depend.sh @@ -44,6 +44,7 @@ depend () { if (test $1 = "ASPHERE") then depend GPU depend USER-OMP + depend USER-INTEL fi if (test $1 = "CLASS2") then @@ -72,6 +73,7 @@ if (test $1 = "KSPACE") then depend OPT depend USER-CUDA depend USER-OMP + depend USER-INTEL depend USER-PHONON fi @@ -88,6 +90,7 @@ if (test $1 = "MOLECULE") then depend USER-CUDA depend USER-MISC depend USER-OMP + depend USER-INTEL fi if (test $1 = "PERI") then diff --git a/src/GRANULAR/pair_gran_hooke_history.cpp b/src/GRANULAR/pair_gran_hooke_history.cpp index 77d2f0d70a..7f98cfa4e7 100644 --- a/src/GRANULAR/pair_gran_hooke_history.cpp +++ b/src/GRANULAR/pair_gran_hooke_history.cpp @@ -45,7 +45,6 @@ PairGranHookeHistory::PairGranHookeHistory(LAMMPS *lmp) : Pair(lmp) no_virial_fdotr_compute = 1; history = 1; fix_history = NULL; - suffix = NULL; single_extra = 4; svector = new double[4]; @@ -67,7 +66,6 @@ PairGranHookeHistory::~PairGranHookeHistory() { delete [] svector; if (fix_history) modify->delete_fix("SHEAR_HISTORY"); - if (suffix) delete[] suffix; if (allocated) { memory->destroy(setflag); @@ -436,7 +434,7 @@ void PairGranHookeHistory::init_style() fixarg[0] = (char *) "SHEAR_HISTORY"; fixarg[1] = (char *) "all"; fixarg[2] = (char *) "SHEAR_HISTORY"; - modify->add_fix(3,fixarg,suffix); + modify->add_fix(3,fixarg,1); delete [] fixarg; fix_history = (FixShearHistory *) modify->fix[modify->nfix-1]; fix_history->pair = this; diff --git a/src/GRANULAR/pair_gran_hooke_history.h b/src/GRANULAR/pair_gran_hooke_history.h index 4e2e51a4cc..25762ca65d 100644 --- a/src/GRANULAR/pair_gran_hooke_history.h +++ b/src/GRANULAR/pair_gran_hooke_history.h @@ -54,7 +54,6 @@ class PairGranHookeHistory : public Pair { int freeze_group_bit; int history; - char *suffix; int neighprev; double *onerad_dynamic,*onerad_frozen; double *maxrad_dynamic,*maxrad_frozen; diff --git a/src/KSPACE/fix_tune_kspace.cpp 
b/src/KSPACE/fix_tune_kspace.cpp index 9abfc9d1b4..e3d5a5b5cd 100644 --- a/src/KSPACE/fix_tune_kspace.cpp +++ b/src/KSPACE/fix_tune_kspace.cpp @@ -218,7 +218,8 @@ void FixTuneKspace::store_old_kspace_settings() update the pair style if necessary, preserving the settings ------------------------------------------------------------------------- */ -void FixTuneKspace::update_pair_style(char *new_pair_style, double pair_cut_coul) +void FixTuneKspace::update_pair_style(char *new_pair_style, + double pair_cut_coul) { int itmp; double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp); @@ -235,7 +236,7 @@ void FixTuneKspace::update_pair_style(char *new_pair_style, double pair_cut_coul cout << "Creating new pair style: " << new_pair_style << endl; // delete old pair style and create new one - force->create_pair(new_pair_style,lmp->suffix); + force->create_pair(new_pair_style,1); // restore current pair settings from temporary file force->pair->read_restart(p_pair_settings_file); @@ -252,7 +253,8 @@ void FixTuneKspace::update_pair_style(char *new_pair_style, double pair_cut_coul update the kspace style if necessary ------------------------------------------------------------------------- */ -void FixTuneKspace::update_kspace_style(char *new_kspace_style, char *new_acc_str) +void FixTuneKspace::update_kspace_style(char *new_kspace_style, + char *new_acc_str) { // create kspace style char string @@ -269,8 +271,7 @@ void FixTuneKspace::update_kspace_style(char *new_kspace_style, char *new_acc_st // delete old kspace style and create new one - force->create_kspace(narg,arg,lmp->suffix); - + force->create_kspace(narg,arg,1); force->kspace->differentiation_flag = old_differentiation_flag; force->kspace->slabflag = old_slabflag; force->kspace->slab_volfactor = old_slab_volfactor; diff --git a/src/MAKE/Makefile.beacon b/src/MAKE/Makefile.beacon new file mode 100755 index 0000000000..98e816a430 --- /dev/null +++ b/src/MAKE/Makefile.beacon @@ -0,0 +1,109 @@ +# linux = 
RedHat Linux box, Intel icc, MPICH2, FFTW + +SHELL = /bin/sh + +# --------------------------------------------------------------------- +# compiler/linker settings +# specify flags and libraries needed for your compiler + +CC = mpiicpc -openmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 +MIC_OPT = -offload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\"" +CCFLAGS = -O3 -xAVX -fno-alias -ansi-alias -restrict -override-limits $(MIC_OPT) +SHFLAGS = -fPIC +DEPFLAGS = -M + +LINK = mpiicpc -openmp +LINKFLAGS = -O3 -xAVX +LIB = +SIZE = size + +ARCHIVE = ar +ARFLAGS = -rc +SHLIBFLAGS = -shared + +# --------------------------------------------------------------------- +# LAMMPS-specific settings +# specify settings for LAMMPS features you will use +# if you change any -D setting, do full re-compile after "make clean" + +# LAMMPS ifdef settings, OPTIONAL +# see possible settings in doc/Section_start.html#2_2 (step 4) + +LMP_INC = -DLAMMPS_GZIP -DLAMMPS_JPEG + +# MPI library, REQUIRED +# see discussion in doc/Section_start.html#2_2 (step 5) +# can point to dummy MPI library in src/STUBS as in Makefile.serial +# INC = path for mpi.h, MPI compiler settings +# PATH = path for MPI library +# LIB = name of MPI library + +MPI_INC = -DMPICH_SKIP_MPICXX +MPI_PATH = +MPI_LIB = + +# FFT library, OPTIONAL +# see discussion in doc/Section_start.html#2_2 (step 6) +# can be left blank to use provided KISS FFT library +# INC = -DFFT setting, e.g. 
-DFFT_FFTW, FFT compiler settings +# PATH = path for FFT library +# LIB = name of FFT library + +FFT_INC = -DFFT_MKL -DFFT_SINGLE -I$(MKLROOT) +FFT_PATH = +FFT_LIB = -L$(MKLROOT) -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core + +# JPEG and/or PNG library, OPTIONAL +# see discussion in doc/Section_start.html#2_2 (step 7) +# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC +# INC = path(s) for jpeglib.h and/or png.h +# PATH = path(s) for JPEG library and/or PNG library +# LIB = name(s) of JPEG library and/or PNG library + +JPG_INC = +JPG_PATH = +JPG_LIB = -ljpeg + +# --------------------------------------------------------------------- +# build rules and dependencies +# no need to edit this section + +include Makefile.package.settings +include Makefile.package + +EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC) +EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH) +EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) + +# Path to src files + +vpath %.cpp .. +vpath %.h .. 
+ +# Link target + +$(EXE): $(OBJ) + $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) + $(SIZE) $(EXE) + +# Library targets + +lib: $(OBJ) + $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) + +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + +# Compilation rules + +%.o:%.cpp + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< + +%.d:%.cpp + $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ + +# Individual dependencies + +DEPENDS = $(OBJ:.o=.d) +sinclude $(DEPENDS) diff --git a/src/MAKE/Makefile.g++_openmpi b/src/MAKE/Makefile.g++_openmpi new file mode 100755 index 0000000000..c8912f1713 --- /dev/null +++ b/src/MAKE/Makefile.g++_openmpi @@ -0,0 +1,108 @@ +# g++ = RedHat Linux box, g++4, OpenMPI, FFTW + +SHELL = /bin/sh + +# --------------------------------------------------------------------- +# compiler/linker settings +# specify flags and libraries needed for your compiler + +CC = g++ +CCFLAGS = -g -O # -Wunused +SHFLAGS = -fPIC +DEPFLAGS = -M + +LINK = g++ +LINKFLAGS = -g -O +LIB = +SIZE = size + +ARCHIVE = ar +ARFLAGS = -rc +SHLIBFLAGS = -shared + +# --------------------------------------------------------------------- +# LAMMPS-specific settings +# specify settings for LAMMPS features you will use +# if you change any -D setting, do full re-compile after "make clean" + +# LAMMPS ifdef settings, OPTIONAL +# see possible settings in doc/Section_start.html#2_2 (step 4) + +LMP_INC = -DLAMMPS_GZIP -DLAMMPS_JPEG + +# MPI library, REQUIRED +# see discussion in doc/Section_start.html#2_2 (step 5) +# can point to dummy MPI library in src/STUBS as in Makefile.serial +# INC = path for mpi.h, MPI compiler settings +# PATH = path for MPI library +# LIB = name of MPI library + +MPI_INC = -DMPICH_SKIP_MPICXX -I/usr/local/openmpi/include +MPI_PATH = -L/usr/local/openmpi/lib +MPI_LIB = -lmpi -lmpi_cxx + +# FFT library, OPTIONAL +# see discussion in doc/Section_start.html#2_2 (step 6) +# can be left blank 
to use provided KISS FFT library +# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings +# PATH = path for FFT library +# LIB = name of FFT library + +FFT_INC = -DFFT_FFTW +FFT_PATH = +FFT_LIB = -lfftw + +# JPEG and/or PNG library, OPTIONAL +# see discussion in doc/Section_start.html#2_2 (step 7) +# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC +# INC = path(s) for jpeglib.h and/or png.h +# PATH = path(s) for JPEG library and/or PNG library +# LIB = name(s) of JPEG library and/or PNG library + +JPG_INC = +JPG_PATH = +JPG_LIB = -ljpeg + +# --------------------------------------------------------------------- +# build rules and dependencies +# no need to edit this section + +include Makefile.package.settings +include Makefile.package + +EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC) +EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH) +EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) + +# Path to src files + +vpath %.cpp .. +vpath %.h .. 
+ +# Link target + +$(EXE): $(OBJ) + $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) + $(SIZE) $(EXE) + +# Library targets + +lib: $(OBJ) + $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) + +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + +# Compilation rules + +%.o:%.cpp + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< + +%.d:%.cpp + $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ + +# Individual dependencies + +DEPENDS = $(OBJ:.o=.d) +sinclude $(DEPENDS) diff --git a/src/MAKE/Makefile.intel b/src/MAKE/Makefile.intel new file mode 100755 index 0000000000..2b209e27b0 --- /dev/null +++ b/src/MAKE/Makefile.intel @@ -0,0 +1,108 @@ +# Intel compiler, Intel MPI, MKL FFT, no offload to coprocessor + +SHELL = /bin/sh + +# --------------------------------------------------------------------- +# compiler/linker settings +# specify flags and libraries needed for your compiler + +CC = mpiicpc -openmp -DLAMMPS_MEMALIGN=64 -no-offload +CCFLAGS = -O3 -xHost -fno-alias -ansi-alias -restrict -override-limits +SHFLAGS = -fPIC +DEPFLAGS = -M + +LINK = mpiicpc -openmp +LINKFLAGS = -O3 -xHost +LIB = +SIZE = size + +ARCHIVE = ar +ARFLAGS = -rc +SHLIBFLAGS = -shared + +# --------------------------------------------------------------------- +# LAMMPS-specific settings +# specify settings for LAMMPS features you will use +# if you change any -D setting, do full re-compile after "make clean" + +# LAMMPS ifdef settings, OPTIONAL +# see possible settings in doc/Section_start.html#2_2 (step 4) + +LMP_INC = -DLAMMPS_GZIP -DLAMMPS_JPEG + +# MPI library, REQUIRED +# see discussion in doc/Section_start.html#2_2 (step 5) +# can point to dummy MPI library in src/STUBS as in Makefile.serial +# INC = path for mpi.h, MPI compiler settings +# PATH = path for MPI library +# LIB = name of MPI library + +MPI_INC = -DMPICH_SKIP_MPICXX +MPI_PATH = +MPI_LIB = + +# FFT library, OPTIONAL +# see discussion in 
doc/Section_start.html#2_2 (step 6) +# can be left blank to use provided KISS FFT library +# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings +# PATH = path for FFT library +# LIB = name of FFT library + +FFT_INC = -DFFT_MKL -DFFT_SINGLE +FFT_PATH = +FFT_LIB = -L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core + +# JPEG and/or PNG library, OPTIONAL +# see discussion in doc/Section_start.html#2_2 (step 7) +# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC +# INC = path(s) for jpeglib.h and/or png.h +# PATH = path(s) for JPEG library and/or PNG library +# LIB = name(s) of JPEG library and/or PNG library + +JPG_INC = +JPG_PATH = +JPG_LIB = -ljpeg + +# --------------------------------------------------------------------- +# build rules and dependencies +# no need to edit this section + +include Makefile.package.settings +include Makefile.package + +EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC) +EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH) +EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) + +# Path to src files + +vpath %.cpp .. +vpath %.h .. 
+ +# Link target + +$(EXE): $(OBJ) + $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) + $(SIZE) $(EXE) + +# Library targets + +lib: $(OBJ) + $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) + +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + +# Compilation rules + +%.o:%.cpp + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< + +%.d:%.cpp + $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ + +# Individual dependencies + +DEPENDS = $(OBJ:.o=.d) +sinclude $(DEPENDS) diff --git a/src/MAKE/Makefile.intel_offload b/src/MAKE/Makefile.intel_offload new file mode 100755 index 0000000000..eb4415fc8a --- /dev/null +++ b/src/MAKE/Makefile.intel_offload @@ -0,0 +1,109 @@ +# Intel compiler, Intel MPI, MKL FFT, with offload to coprocessor + +SHELL = /bin/sh + +# --------------------------------------------------------------------- +# compiler/linker settings +# specify flags and libraries needed for your compiler + +CC = mpiicpc -openmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 +MIC_OPT = -offload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\"" +CCFLAGS = -g -O3 -xHost -fno-alias -ansi-alias -restrict -override-limits $(MIC_OPT) +SHFLAGS = -fPIC +DEPFLAGS = -M + +LINK = mpiicpc -openmp -offload +LINKFLAGS = -O3 -xHost +LIB = +SIZE = size + +ARCHIVE = ar +ARFLAGS = -rc +SHLIBFLAGS = -shared + +# --------------------------------------------------------------------- +# LAMMPS-specific settings +# specify settings for LAMMPS features you will use +# if you change any -D setting, do full re-compile after "make clean" + +# LAMMPS ifdef settings, OPTIONAL +# see possible settings in doc/Section_start.html#2_2 (step 4) + +LMP_INC = -DLAMMPS_GZIP -DLAMMPS_JPEG + +# MPI library, REQUIRED +# see discussion in doc/Section_start.html#2_2 (step 5) +# can point to dummy MPI library in src/STUBS as in Makefile.serial +# INC = path for mpi.h, MPI compiler settings +# PATH = 
path for MPI library +# LIB = name of MPI library + +MPI_INC = -DMPICH_SKIP_MPICXX +MPI_PATH = +MPI_LIB = + +# FFT library, OPTIONAL +# see discussion in doc/Section_start.html#2_2 (step 6) +# can be left blank to use provided KISS FFT library +# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings +# PATH = path for FFT library +# LIB = name of FFT library + +FFT_INC = -DFFT_MKL -DFFT_SINGLE +FFT_PATH = +FFT_LIB = -L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core + +# JPEG and/or PNG library, OPTIONAL +# see discussion in doc/Section_start.html#2_2 (step 7) +# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC +# INC = path(s) for jpeglib.h and/or png.h +# PATH = path(s) for JPEG library and/or PNG library +# LIB = name(s) of JPEG library and/or PNG library + +JPG_INC = +JPG_PATH = +JPG_LIB = -ljpeg + +# --------------------------------------------------------------------- +# build rules and dependencies +# no need to edit this section + +include Makefile.package.settings +include Makefile.package + +EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC) +EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH) +EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) + +# Path to src files + +vpath %.cpp .. +vpath %.h .. 
+ +# Link target + +$(EXE): $(OBJ) + $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) + $(SIZE) $(EXE) + +# Library targets + +lib: $(OBJ) + $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) + +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + +# Compilation rules + +%.o:%.cpp + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< + +%.d:%.cpp + $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ + +# Individual dependencies + +DEPENDS = $(OBJ:.o=.d) +sinclude $(DEPENDS) diff --git a/src/MAKE/Makefile.linux b/src/MAKE/Makefile.linux index c4264dc223..d835bed045 100755 --- a/src/MAKE/Makefile.linux +++ b/src/MAKE/Makefile.linux @@ -7,12 +7,12 @@ SHELL = /bin/sh # specify flags and libraries needed for your compiler CC = icc -CCFLAGS = -O +CCFLAGS = -O -DLAMMPS_MEMALIGN=64 -openmp -restrict SHFLAGS = -fPIC DEPFLAGS = -M LINK = icc -LINKFLAGS = -O +LINKFLAGS = -O -openmp LIB = -lstdc++ SIZE = size diff --git a/src/MAKE/Makefile.stampede b/src/MAKE/Makefile.stampede new file mode 100755 index 0000000000..8c9591d112 --- /dev/null +++ b/src/MAKE/Makefile.stampede @@ -0,0 +1,109 @@ +# Stampede, Intel Compiler, MKL FFT, Offload to Xeon Phi + +SHELL = /bin/sh + +# --------------------------------------------------------------------- +# compiler/linker settings +# specify flags and libraries needed for your compiler + +CC = mpicc -openmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 +MIC_OPT = -offload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\"" +CCFLAGS = -O3 -xAVX -fno-alias -ansi-alias -restrict -override-limits $(MIC_OPT) +SHFLAGS = -fPIC +DEPFLAGS = -M + +LINK = mpicc -openmp +LINKFLAGS = -O3 -xAVX +LIB = +SIZE = size + +ARCHIVE = ar +ARFLAGS = -rc +SHLIBFLAGS = -shared + +# --------------------------------------------------------------------- +# LAMMPS-specific settings +# specify settings for LAMMPS features you will use +# if you change any -D 
setting, do full re-compile after "make clean" + +# LAMMPS ifdef settings, OPTIONAL +# see possible settings in doc/Section_start.html#2_2 (step 4) + +LMP_INC = -DLAMMPS_GZIP -DLAMMPS_JPEG + +# MPI library, REQUIRED +# see discussion in doc/Section_start.html#2_2 (step 5) +# can point to dummy MPI library in src/STUBS as in Makefile.serial +# INC = path for mpi.h, MPI compiler settings +# PATH = path for MPI library +# LIB = name of MPI library + +MPI_INC = -DMPICH_SKIP_MPICXX +MPI_PATH = +MPI_LIB = + +# FFT library, OPTIONAL +# see discussion in doc/Section_start.html#2_2 (step 6) +# can be left blank to use provided KISS FFT library +# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings +# PATH = path for FFT library +# LIB = name of FFT library + +FFT_INC = -DFFT_MKL -DFFT_SINGLE -I$(TACC_MKL_INC) +FFT_PATH = +FFT_LIB = -L$(TACC_MKL_LIB) -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core + +# JPEG and/or PNG library, OPTIONAL +# see discussion in doc/Section_start.html#2_2 (step 7) +# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC +# INC = path(s) for jpeglib.h and/or png.h +# PATH = path(s) for JPEG library and/or PNG library +# LIB = name(s) of JPEG library and/or PNG library + +JPG_INC = +JPG_PATH = +JPG_LIB = -ljpeg + +# --------------------------------------------------------------------- +# build rules and dependencies +# no need to edit this section + +include Makefile.package.settings +include Makefile.package + +EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC) +EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH) +EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) + +# Path to src files + +vpath %.cpp .. +vpath %.h .. 
+ +# Link target + +$(EXE): $(OBJ) + $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) + $(SIZE) $(EXE) + +# Library targets + +lib: $(OBJ) + $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) + +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + +# Compilation rules + +%.o:%.cpp + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< + +%.d:%.cpp + $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ + +# Individual dependencies + +DEPENDS = $(OBJ:.o=.d) +sinclude $(DEPENDS) diff --git a/src/Makefile b/src/Makefile index f8e70a94dc..2c4bb15fa2 100755 --- a/src/Makefile +++ b/src/Makefile @@ -18,8 +18,8 @@ PACKAGE = asphere body class2 colloid dipole fld gpu granular kim \ reax replica rigid shock srd voronoi xtc PACKUSER = user-atc user-awpmd user-cg-cmm user-colvars \ - user-cuda user-eff user-fep user-lb user-misc user-molfile \ - user-omp user-phonon user-qmmm user-reaxc user-sph + user-cuda user-eff user-fep user-intel user-lb user-misc \ + user-molfile user-omp user-phonon user-qmmm user-reaxc user-sph PACKLIB = gpu kim meam poems reax voronoi \ user-atc user-awpmd user-colvars user-qmmm user-cuda user-molfile diff --git a/src/USER-INTEL/Install.sh b/src/USER-INTEL/Install.sh new file mode 100644 index 0000000000..70fc48306a --- /dev/null +++ b/src/USER-INTEL/Install.sh @@ -0,0 +1,107 @@ +# Install/unInstall package files in LAMMPS +# mode = 0/1/2 for uninstall/install/update + +mode=$1 + +# arg1 = file, arg2 = file it depends on + +action () { + if (test $mode = 0) then + rm -f ../$1 + elif (! cmp -s $1 ../$1) then + if (test -z "$2" || test -e ../$2) then + cp $1 .. + if (test $mode = 2) then + echo " updating src/$1" + fi + fi + elif (test -n "$2") then + if (test ! -e ../$2) then + rm -f ../$1 + fi + fi +} + +# step 1: process all *_intel.cpp and *_intel.h files. 
+# do not install child files if parent does not exist + +for file in *_intel.cpp; do + test $file = thr_intel.cpp && continue + dep=`echo $file | sed 's/neigh_full_intel/neigh_full/g' | \ + sed 's/_offload_intel//g' | sed 's/_intel//g'` + action $file $dep +done + +for file in *_intel.h; do + test $file = thr_intel.h && continue + dep=`echo $file | sed 's/_offload_intel//g' | sed 's/_intel//g'` + action $file $dep +done + +action intel_preprocess.h +action intel_buffers.h +action intel_buffers.cpp +action math_extra_intel.h + +# step 2: handle cases and tasks not handled in step 1. + +if (test $mode = 1) then + + if (test -e ../Makefile.package) then + sed -i -e 's/[^ \t]*INTEL[^ \t]* //' ../Makefile.package + sed -i -e 's|^PKG_INC =[ \t]*|&-DLMP_USER_INTEL |' ../Makefile.package + fi + + # force rebuild of files with LMP_USER_INTEL switch + + touch ../accelerator_intel.h + +elif (test $mode = 0) then + + if (test -e ../Makefile.package) then + sed -i -e 's/[^ \t]*INTEL[^ \t]* //' ../Makefile.package + fi + + # force rebuild of files with LMP_USER_INTEL switch + + touch ../accelerator_intel.h + +fi + +# step 3: map omp styles that are not in the intel package to intel suffix + +#if (test $mode = 0) then +# +# rm -f ../*ompinto_intel* +# +#else +# +# echo " The 'intel' suffix will use the USER-OMP package for all" +# echo " angle, bond, dihedral, kspace, and improper styles:" +# stylelist="pair fix angle bond dihedral improper" +# for header in $stylelist; do +# HEADER=`echo $header | sed 's/\(.*\)/\U\1/'` +# outfile=../$header"_ompinto_intel.h" +# echo " Creating $header style map: $outfile" +# echo -n "// -- Header to map USER-OMP " > $outfile +# echo "styles to the intel suffix" >> $outfile +# echo >> $outfile +# echo "#ifdef "$HEADER"_CLASS" >> $outfile +# grep -h 'Style(' ../$header*_omp.h | grep -v 'charmm/coul/long' | \ +# grep -v 'lj/cut' | grep -v 'gayberne' | \ +# sed 's/\/omp/\/intel/g' >> $outfile +# echo "#endif" >> $outfile +# done +# +# 
header="kspace" +# HEADER="KSPACE" +# outfile=../$header"_ompinto_intel.h" +# echo " Creating $header style map: $outfile" +# echo -n "// -- Header to map USER-OMP " > $outfile +# echo "styles to the intel suffix" >> $outfile +# echo >> $outfile +# echo "#ifdef "$HEADER"_CLASS" >> $outfile +# grep -h 'KSpaceStyle(' ../*_omp.h | sed 's/\/omp/\/intel/g' >> $outfile +# echo "#endif" >> $outfile +# +#fi diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README new file mode 100644 index 0000000000..0b38928b2e --- /dev/null +++ b/src/USER-INTEL/README @@ -0,0 +1,35 @@ + + -------------------------------- + LAMMPS Intel Package + -------------------------------- + + W. Michael Brown (Intel) + michael.w.brown at intel.com + +----------------------------------------------------------------------------- + +This package is based on the USER-OMP package and provides LAMMPS styles that: + + 1. include support for single and mixed precision in addition to double. + 2. include modifications to support vectorization for key routines + 3. include modifications to support offload to Xeon Phi coprocessors + +----------------------------------------------------------------------------- + +When using the suffix command with "intel", intel styles will be used if they +exist; if they do not, and an omp version exists, that style will be used. +This is accomplished through the files *ompinto_intel.h that are created +in the src directory when the intel package is installed. For example, + + kspace_style pppm/intel 1e-4 + +is equivalent to: + + kspace_style pppm/omp 1e-4 + +because no pppm style has been implemented for the Intel package. + +----------------------------------------------------------------------------- + +In order to use offload to Xeon Phi, the flag -DLMP_INTEL_OFFLOAD should be +set in the Makefile. Offload requires the use of Intel compilers. 
diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp new file mode 100644 index 0000000000..8fd3003b49 --- /dev/null +++ b/src/USER-INTEL/fix_intel.cpp @@ -0,0 +1,530 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "comm.h" +#include "error.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_request.h" +#include "pair.h" +#include "pair_hybrid.h" +#include "pair_hybrid_overlay.h" +#include "timer.h" +#include "universe.h" +#include "update.h" +#include "fix_intel.h" + +#include +#include +#include + +#include "suffix.h" + +using namespace LAMMPS_NS; +using namespace FixConst; + +#ifdef __INTEL_OFFLOAD +#ifndef _LMP_INTEL_OFFLOAD +#warning "Not building Intel package with Xeon Phi offload support." 
+#endif +#endif + +enum{NSQ,BIN,MULTI}; + +/* ---------------------------------------------------------------------- */ + +FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) +{ + if (narg < 4) + error->all(FLERR, "Illegal package intel command"); + if (strcmp(arg[1],"all") != 0) + error->all(FLERR, "fix Intel has to operate on group 'all'"); + + _precision_mode = PREC_MODE_MIXED; + _offload_balance = 1.0; + _overflow_flag[LMP_OVERFLOW] = 0; + _off_overflow_flag[LMP_OVERFLOW] = 0; + + _offload_affinity_balanced = 0; + _offload_threads = 1; + _offload_tpc = 4; + + #ifdef _LMP_INTEL_OFFLOAD + _offload_affinity_set = 0; + _off_force_array_s = 0; + _off_force_array_m = 0; + _off_force_array_d = 0; + _off_ev_array_s = 0; + _off_ev_array_d = 0; + _balance_fixed = 0.0; + + _cop = 0; + + int max_offload_threads, offload_cores; + #pragma offload target(mic:_cop) mandatory \ + out(max_offload_threads,offload_cores) + { + offload_cores = omp_get_num_procs(); + omp_set_num_threads(offload_cores); + max_offload_threads = omp_get_max_threads(); + } + _max_offload_threads = max_offload_threads; + _offload_cores = offload_cores; + _offload_threads = offload_cores; + #endif + int ncops = 1; + _allow_separate_buffers = 1; + _offload_ghost = -1; + + int iarg = 4; + while (iarg < narg) { + if (strcmp(arg[iarg], "mixed") == 0) + _precision_mode = PREC_MODE_MIXED; + else if (strcmp(arg[iarg], "double") == 0) + _precision_mode = PREC_MODE_DOUBLE; + else if (strcmp(arg[iarg], "single") == 0) + _precision_mode = PREC_MODE_SINGLE; + else if (strcmp(arg[iarg], "offload_affinity_balanced") == 0) + _offload_affinity_balanced = 1; + else if (strcmp(arg[iarg], "balance") == 0) { + if (iarg == narg - 1) + error->all(FLERR, "Illegal package intel mode requested"); + ++iarg; + _offload_balance = force->numeric(FLERR,arg[iarg]); + } else if (strcmp(arg[iarg], "offload_threads") == 0) { + if (iarg == narg - 1) + error->all(FLERR, "Illegal package intel mode requested"); + 
++iarg; + _offload_threads = atoi(arg[iarg]); + } else if (strcmp(arg[iarg], "offload_tpc") == 0) { + if (iarg == narg - 1) + error->all(FLERR, "Illegal package intel mode requested"); + ++iarg; + _offload_tpc = atoi(arg[iarg]); + } else if (strcmp(arg[iarg], "offload_cards") == 0) { + if (iarg == narg - 1) + error->all(FLERR, "Illegal package intel mode requested"); + ++iarg; + ncops = atoi(arg[iarg]); + } else if (strcmp(arg[iarg], "buffers") == 0) { + if (iarg == narg - 1) + error->all(FLERR, "Illegal package intel mode requested"); + ++iarg; + _allow_separate_buffers = atoi(arg[iarg]); + } else if (strcmp(arg[iarg], "offload_ghost") == 0) { + if (iarg == narg - 1) + error->all(FLERR, "Illegal package intel mode requested"); + ++iarg; + _offload_ghost = atoi(arg[iarg]); + } else + error->all(FLERR, "Illegal package intel mode requested"); + ++iarg; + } + + if (_offload_balance > 1.0 || _offload_threads <= 0 || + _offload_tpc <= 0 || _offload_tpc > 4) + error->all(FLERR, "Illegal package intel mode requested"); + + #ifdef _LMP_INTEL_OFFLOAD + _ncops = ncops; + if (_offload_balance < 0.0) { + _balance_neighbor = 0.9; + _balance_pair = 0.9; + } else { + _balance_neighbor = _offload_balance; + _balance_pair = _offload_balance; + } + + _tscreen = screen; + zero_timers(); + _setup_time_cleared = false; + _timers_allocated = false; + #else + _offload_balance = 0.0; + #endif + + if (_precision_mode == PREC_MODE_SINGLE) + _single_buffers = new IntelBuffers(lmp); + else if (_precision_mode == PREC_MODE_MIXED) + _mixed_buffers = new IntelBuffers(lmp); + else + _double_buffers = new IntelBuffers(lmp); +} + +/* ---------------------------------------------------------------------- */ + +FixIntel::~FixIntel() +{ + #ifdef _LMP_INTEL_OFFLOAD + output_timing_data(); + if (_timers_allocated) { + double *time1 = off_watch_pair(); + double *time2 = off_watch_neighbor(); + int *overflow = get_off_overflow_flag(); + if (time1 != NULL && time2 != NULL && overflow != NULL) { + #pragma 
offload_transfer target(mic:_cop) \ + nocopy(time1,time2,overflow:alloc_if(0) free_if(1)) + } + } + #endif + + if (_precision_mode == PREC_MODE_SINGLE) + delete _single_buffers; + else if (_precision_mode == PREC_MODE_MIXED) + delete _mixed_buffers; + else + delete _double_buffers; +} + +/* ---------------------------------------------------------------------- */ + +int FixIntel::setmask() +{ + int mask = 0; + return mask; +} + +/* ---------------------------------------------------------------------- */ + +void FixIntel::init() +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_offload_balance != 0.0) atom->sortfreq = 1; + + if (force->newton_pair == 0) + _offload_noghost = 0; + else if (_offload_ghost == 0) + _offload_noghost = 1; + + set_offload_affinity(); + + output_timing_data(); + if (!_timers_allocated) { + double *time1 = off_watch_pair(); + double *time2 = off_watch_neighbor(); + int *overflow = get_off_overflow_flag(); + if (time1 != NULL && time2 != NULL && overflow != NULL) { + #pragma offload_transfer target(mic:_cop) \ + nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \ + in(overflow:length(5) alloc_if(1) free_if(0)) + } + _timers_allocated = true; + } + + char kmode[80]; + if (_precision_mode == PREC_MODE_SINGLE) + strcpy(kmode, "single"); + else if (_precision_mode == PREC_MODE_MIXED) + strcpy(kmode, "mixed"); + else + strcpy(kmode, "double"); + + // print summary of settings + if (comm->me == 0) { + if (screen) { + #ifdef _LMP_INTEL_OFFLOAD + if (_offload_balance != 0.0) { + fprintf(screen,"using offload with %d threads per core, ",_offload_tpc); + fprintf(screen,"%d threads per task\n",_offload_threads); + } + #endif + } + } + if (update->whichflag == 2 && _offload_balance != 0.0) { + if (_offload_balance == 1.0 && _offload_noghost == 0) + _sync_at_pair = 1; + else + _sync_at_pair = 2; + } else { + _sync_at_pair = 0; + if (strstr(update->integrate_style,"intel") == 0) + error->all(FLERR, + "Specified run_style does not support the Intel package."); + } + 
#endif + + if (neighbor->style != BIN) + error->all(FLERR, + "Currently, neighbor style BIN must be used with Intel package."); + if (neighbor->exclude_setting() != 0) + error->all(FLERR, + "Currently, cannot use neigh_modify exclude with Intel package."); + int nstyles = 0; + if (force->pair_match("hybrid", 1) != NULL) { + PairHybrid *hybrid = (PairHybrid *) force->pair; + for (int i = 0; i < hybrid->nstyles; i++) + if (strstr(hybrid->keywords[i], "/intel") == NULL) + nstyles++; + } else if (force->pair_match("hybrid/overlay", 1) != NULL) { + PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair; + for (int i = 0; i < hybrid->nstyles; i++) + if (strstr(hybrid->keywords[i], "/intel") == NULL) + nstyles++; + else + force->pair->no_virial_fdotr_compute = 1; + } + if (nstyles > 1) + error->all(FLERR, + "Currently, cannot use more than one intel style with hybrid."); + + neighbor->fix_intel = (void *)this; + _nthreads = comm->nthreads; + + check_neighbor_intel(); + if (_precision_mode == PREC_MODE_SINGLE) + _single_buffers->zero_ev(); + else if (_precision_mode == PREC_MODE_MIXED) + _mixed_buffers->zero_ev(); + else + _double_buffers->zero_ev(); +} + +/* ---------------------------------------------------------------------- */ + +void FixIntel::check_neighbor_intel() +{ + #ifdef _LMP_INTEL_OFFLOAD + _full_host_list = 0; + #endif + const int nrequest = neighbor->nrequest; + + for (int i = 0; i < nrequest; ++i) { + #ifdef _LMP_INTEL_OFFLOAD + if (_offload_balance != 0.0 && neighbor->requests[i]->intel == 0) { + _full_host_list = 1; + _offload_noghost = 0; + } + #endif + if (neighbor->requests[i]->skip) + error->all(FLERR, "Cannot yet use hybrid styles with Intel package."); + } +} + +/* ---------------------------------------------------------------------- */ + +void FixIntel::sync_coprocessor() +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_offload_balance != 0.0) { + if (_off_force_array_m != 0) { + add_off_results(_off_force_array_m, _off_ev_array_d); + 
_off_force_array_m = 0; + } else if (_off_force_array_d != 0) { + add_off_results(_off_force_array_d, _off_ev_array_d); + _off_force_array_d = 0; + } else if (_off_force_array_s != 0) { + add_off_results(_off_force_array_s, _off_ev_array_s); + _off_force_array_s = 0; + } + } + #endif +} + +/* ---------------------------------------------------------------------- */ + +double FixIntel::memory_usage() +{ + double bytes; + if (_precision_mode == PREC_MODE_SINGLE) + bytes = _single_buffers->memory_usage(_nthreads); + else if (_precision_mode == PREC_MODE_MIXED) + bytes = _mixed_buffers->memory_usage(_nthreads); + else + bytes = _double_buffers->memory_usage(_nthreads); + + return bytes; +} + +/* ---------------------------------------------------------------------- */ + +#ifdef _LMP_INTEL_OFFLOAD + +void FixIntel::output_timing_data() { + if (_im_real_space_task == 0 || _offload_affinity_set == 0) return; + + double timer_total = 0.0; + int size, rank; + double timers[NUM_ITIMERS]; + MPI_Comm_size(_real_space_comm, &size); + MPI_Comm_rank(_real_space_comm, &rank); + MPI_Allreduce(&_timers, &timers, NUM_ITIMERS, MPI_DOUBLE, MPI_SUM, + _real_space_comm); + for (int i=0; i < NUM_ITIMERS; i++) { + timers[i] /= size; + timer_total += timers[i]; + } + #ifdef TIME_BALANCE + double timers_min[NUM_ITIMERS], timers_max[NUM_ITIMERS]; + MPI_Allreduce(&_timers, &timers_max, NUM_ITIMERS, MPI_DOUBLE, MPI_MAX, + _real_space_comm); + MPI_Allreduce(&_timers, &timers_min, NUM_ITIMERS, MPI_DOUBLE, MPI_MIN, + _real_space_comm); + #endif + + if (timer_total > 0.0) { + double balance_out[2], balance_in[2]; + balance_out[0] = _balance_pair; + balance_out[1] = _balance_neighbor; + MPI_Reduce(balance_out, balance_in, 2, MPI_DOUBLE, MPI_SUM, + 0, _real_space_comm); + balance_in[0] /= size; + balance_in[1] /= size; + + if (rank == 0 && _tscreen) { + fprintf(_tscreen, "\n------------------------------------------------\n"); + fprintf(_tscreen, " Offload Timing Data\n"); + fprintf(_tscreen, 
"------------------------------------------------\n"); + fprintf(_tscreen, " Data Pack/Cast Seconds %f\n", + timers[TIME_PACK]); + if (_offload_balance != 0.0) { + fprintf(_tscreen, " Host Neighbor Seconds %f\n", + timers[TIME_HOST_NEIGHBOR]); + fprintf(_tscreen, " Host Pair Seconds %f\n", + timers[TIME_HOST_PAIR]); + fprintf(_tscreen, " Offload Neighbor Seconds %f\n", + timers[TIME_OFFLOAD_NEIGHBOR]); + fprintf(_tscreen, " Offload Pair Seconds %f\n", + timers[TIME_OFFLOAD_PAIR]); + fprintf(_tscreen, " Offload Wait Seconds %f\n", + timers[TIME_OFFLOAD_WAIT]); + fprintf(_tscreen, " Offload Latency Seconds %f\n", + timers[TIME_OFFLOAD_LATENCY]); + fprintf(_tscreen, " Offload Neighbor Balance %f\n", + balance_in[1]); + fprintf(_tscreen, " Offload Pair Balance %f\n", + balance_in[0]); + fprintf(_tscreen, " Offload Ghost Atoms "); + if (_offload_noghost) fprintf(_tscreen,"No\n"); + else fprintf(_tscreen,"Yes\n"); + #ifdef TIME_BALANCE + fprintf(_tscreen, " Offload Imbalance Seconds %f\n", + timers[TIME_IMBALANCE]); + fprintf(_tscreen, " Offload Min/Max Seconds "); + for (int i = 0; i < NUM_ITIMERS; i++) + fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]); + fprintf(_tscreen, "\n"); + #endif + } + fprintf(_tscreen, "------------------------------------------------\n"); + } + zero_timers(); + _setup_time_cleared = false; + } +} + +/* ---------------------------------------------------------------------- */ + +int FixIntel::get_ppn(int &node_rank) { + int nprocs; + int rank; + MPI_Comm_size(_real_space_comm, &nprocs); + MPI_Comm_rank(_real_space_comm, &rank); + + int name_length; + char node_name[MPI_MAX_PROCESSOR_NAME]; + MPI_Get_processor_name(node_name,&name_length); + node_name[name_length] = '\0'; + char *node_names = new char[MPI_MAX_PROCESSOR_NAME*nprocs]; + MPI_Allgather(node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, node_names, + MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm); + int ppn = 0; + node_rank = 0; + for (int i = 0; i < nprocs; i++) { + if 
(strcmp(node_name, node_names + i * MPI_MAX_PROCESSOR_NAME) == 0) { + ppn++; + if (i < rank) + node_rank++; + } + } + + return ppn; +} + +/* ---------------------------------------------------------------------- */ + +void FixIntel::set_offload_affinity() +{ + _separate_buffers = 0; + if (_allow_separate_buffers) + if (_offload_balance != 0.0 && _offload_balance < 1.0) + _separate_buffers = 1; + + _im_real_space_task = 1; + if (strncmp(update->integrate_style,"verlet/split",12) == 0) { + _real_space_comm = world; + if (universe->iworld != 0) { + _im_real_space_task = 0; + return; + } + } else + _real_space_comm = universe->uworld; + + if (_offload_balance == 0.0) _cop = -1; + if (_offload_balance == 0.0 || _offload_affinity_set == 1) + return; + + _offload_affinity_set = 1; + int node_rank; + int ppn = get_ppn(node_rank); + + if (ppn % _ncops != 0) + error->all(FLERR, "MPI tasks per node must be multiple of offload_cards"); + ppn = ppn / _ncops; + _cop = node_rank / ppn; + node_rank = node_rank % ppn; + + int max_threads_per_task = _offload_cores / 4 * _offload_tpc / ppn; + if (_offload_threads > max_threads_per_task) + _offload_threads = max_threads_per_task; + if (_offload_threads > _max_offload_threads) + _offload_threads = _max_offload_threads; + + int offload_threads = _offload_threads; + int offload_tpc = _offload_tpc; + int offload_affinity_balanced = _offload_affinity_balanced; + #pragma offload target(mic:_cop) mandatory \ + in(node_rank,offload_threads,offload_tpc,offload_affinity_balanced) + { + omp_set_num_threads(offload_threads); + #pragma omp parallel + { + int tnum = omp_get_thread_num(); + kmp_affinity_mask_t mask; + kmp_create_affinity_mask(&mask); + int proc; + if (offload_affinity_balanced) { + proc = offload_threads * node_rank + tnum; + proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1; + } else { + proc = offload_threads * node_rank + tnum; + proc += (proc / 4) * (4 - offload_tpc) + 1; + } + kmp_set_affinity_mask_proc(proc, &mask); + if 
(kmp_set_affinity(&mask) != 0) + printf("Could not set affinity on rank %d thread %d to %d\n", + node_rank, tnum, proc); + } + } + if (_precision_mode == PREC_MODE_SINGLE) + _single_buffers->set_off_params(offload_threads, _cop, _separate_buffers); + else if (_precision_mode == PREC_MODE_MIXED) + _mixed_buffers->set_off_params(offload_threads, _cop, _separate_buffers); + else + _double_buffers->set_off_params(offload_threads, _cop, _separate_buffers); +} + +#endif diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h new file mode 100644 index 0000000000..82ebc734a2 --- /dev/null +++ b/src/USER-INTEL/fix_intel.h @@ -0,0 +1,593 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(Intel,FixIntel) + +#else + +#ifndef LMP_FIX_INTEL_H +#define LMP_FIX_INTEL_H + +#include "fix.h" +#include "intel_buffers.h" +#include "force.h" +#include "pair.h" +#include "error.h" +#include "update.h" + +namespace LAMMPS_NS { + +class IntelData; +template class IntelBuffers; + +class FixIntel : public Fix { + public: + FixIntel(class LAMMPS *, int, char **); + virtual ~FixIntel(); + virtual int setmask(); + virtual void init(); + + // Get all forces, calculation results from coprocesser + void sync_coprocessor(); + + double memory_usage(); + + typedef struct { double x,y,z; } lmp_ft; + + enum {PREC_MODE_SINGLE, PREC_MODE_MIXED, PREC_MODE_DOUBLE}; + + inline int precision() { return _precision_mode; } + inline IntelBuffers * get_single_buffers() + { return _single_buffers; } + inline IntelBuffers * get_mixed_buffers() + { return _mixed_buffers; } + inline IntelBuffers * get_double_buffers() + { return _double_buffers; } + + protected: + IntelBuffers *_single_buffers; + IntelBuffers *_mixed_buffers; + IntelBuffers *_double_buffers; + + int _precision_mode, _nthreads; + + public: + inline int* get_overflow_flag() { return _overflow_flag; } + inline int* get_off_overflow_flag() { return _off_overflow_flag; } + inline void add_result_array(IntelBuffers::vec3_acc_t *f_in, + double *ev_in, const int offload, + const int eatom = 0, const int vatom = 0); + inline void add_result_array(IntelBuffers::vec3_acc_t *f_in, + double *ev_in, const int offload, + const int eatom = 0, const int vatom = 0); + inline void add_result_array(IntelBuffers::vec3_acc_t *f_in, + float *ev_in, const int offload, + const int eatom = 0, const int vatom = 0); + inline void get_buffern(const int offload, int &nlocal, int &nall, + int &minlocal); + + #ifdef _LMP_INTEL_OFFLOAD + inline int coprocessor_number() { return _cop; } + inline int full_host_list() { return _full_host_list; } + 
void set_offload_affinity(); + inline double offload_balance() { return _offload_balance; } + inline int offload_end_neighbor() { return _balance_neighbor * atom->nlocal; } + inline int offload_end_pair(); + inline int host_start_neighbor() + { if (_offload_noghost) return 0; else return offload_end_neighbor(); } + inline int host_start_pair() + { if (_offload_noghost) return 0; else return offload_end_pair(); } + inline int offload_nlocal() { return _offload_nlocal; } + inline int offload_nall() { return _offload_nall; } + inline int offload_min_ghost() { return _offload_min_ghost; } + inline int host_min_local() { return _host_min_local; } + inline int host_min_ghost() { return _host_min_ghost; } + inline int host_used_local() { return _host_used_local; } + inline int host_used_ghost() { return _host_used_ghost; } + inline int host_nall() { return _host_nall; } + inline int separate_buffers() { return _separate_buffers; } + inline int offload_noghost() { return _offload_noghost; } + inline void set_offload_noghost(const int v) + { if (_offload_ghost < 0) _offload_noghost = v; } + inline void set_neighbor_host_sizes(); + + inline void zero_timers() + { memset(_timers, 0, sizeof(double) * NUM_ITIMERS); } + inline void start_watch(const int which) { _stopwatch[which] = MPI_Wtime(); } + inline double stop_watch(const int which); + inline double * off_watch_pair() { return _stopwatch_offload_pair; } + inline double * off_watch_neighbor() { return _stopwatch_offload_neighbor; } + inline void balance_stamp(); + inline void acc_timers(); + #else + inline int offload_end_neighbor() { return 0; } + inline int offload_end_pair() { return 0; } + inline int host_start_neighbor() { return 0; } + inline int host_start_pair() { return 0; } + inline void zero_timers() {} + inline void start_watch(const int which) {} + inline double stop_watch(const int which) { return 0.0; } + double * off_watch_pair() { return NULL; } + double * off_watch_neighbor() { return NULL; } + inline 
void balance_stamp() {} + inline void acc_timers() {} + inline int separate_buffers() { return 0; } + #endif + + protected: + int _overflow_flag[5]; + __declspec(align(64)) int _off_overflow_flag[5]; + int _allow_separate_buffers, _offload_ghost; + #ifdef _LMP_INTEL_OFFLOAD + double _balance_pair_time, _balance_other_time; + int _offload_nlocal, _offload_nall, _offload_min_ghost, _offload_nghost; + int _host_min_local, _host_min_ghost, _host_nall; + int _host_used_local, _host_used_ghost; + int _separate_buffers, _offload_noghost, _sync_at_pair; + bool _setup_time_cleared, _timers_allocated; + void output_timing_data(); + FILE *_tscreen; + + IntelBuffers::vec3_acc_t *_off_force_array_s; + IntelBuffers::vec3_acc_t *_off_force_array_m; + IntelBuffers::vec3_acc_t *_off_force_array_d; + float *_off_ev_array_s; + double *_off_ev_array_d; + int _off_results_eatom, _off_results_vatom; + int _full_host_list, _cop, _ncops; + + int get_ppn(int &); + #endif + void check_neighbor_intel(); + + double _offload_balance, _balance_neighbor, _balance_pair, _balance_fixed; + double _timers[NUM_ITIMERS]; + double _stopwatch[NUM_ITIMERS]; + __declspec(align(64)) double _stopwatch_offload_neighbor[1]; + __declspec(align(64)) double _stopwatch_offload_pair[1]; + + template + inline void add_results(const ft * restrict const f_in, + const acc_t * restrict const ev_global, + const int eatom, const int vatom, + const int offload); + + template + inline void add_oresults(const ft * restrict const f_in, + const acc_t * restrict const ev_global, + const int eatom, const int vatom, + const int out_offset, const int nall); + + int _offload_affinity_balanced, _offload_threads, _offload_tpc; + #ifdef _LMP_INTEL_OFFLOAD + int _max_offload_threads, _offload_cores, _offload_affinity_set; + int _im_real_space_task; + MPI_Comm _real_space_comm; + template + inline void add_off_results(const ft * restrict const f_in, + const acc_t * restrict const ev_global); + #endif +}; + +/* 
---------------------------------------------------------------------- */ + +void FixIntel::get_buffern(const int offload, int &nlocal, int &nall, + int &minlocal) { + #ifdef _LMP_INTEL_OFFLOAD + if (_separate_buffers) { + if (offload) { + if (neighbor->ago != 0) { + nlocal = _offload_nlocal; + nall = _offload_nall; + } else { + nlocal = atom->nlocal; + nall = nlocal + atom->nghost; + } + minlocal = 0; + } else { + nlocal = atom->nlocal; + nall = _host_nall; + minlocal = _host_min_local; + } + return; + } + if (_offload_noghost && offload) + nall = atom->nlocal; + else + #endif + nall = atom->nlocal + atom->nghost; + nlocal = atom->nlocal; + minlocal = 0; +} + +/* ---------------------------------------------------------------------- */ + +void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, + double *ev_in, const int offload, + const int eatom, const int vatom) { + #ifdef _LMP_INTEL_OFFLOAD + if (offload) { + _off_results_eatom = eatom; + _off_results_vatom = vatom; + _off_force_array_d = f_in; + _off_ev_array_d = ev_in; + if (_sync_at_pair == 1) sync_coprocessor(); + return; + } + #endif + add_results(f_in, ev_in, eatom, vatom, 0); + if (_overflow_flag[LMP_OVERFLOW]) + error->one(FLERR, "Neighbor list overflow, boost neigh_modify one"); + #ifdef _LMP_INTEL_OFFLOAD + if (_sync_at_pair) sync_coprocessor(); + #endif +} + +/* ---------------------------------------------------------------------- */ + +void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, + double *ev_in, const int offload, + const int eatom, const int vatom) { + #ifdef _LMP_INTEL_OFFLOAD + if (offload) { + _off_results_eatom = eatom; + _off_results_vatom = vatom; + _off_force_array_m = f_in; + _off_ev_array_d = ev_in; + if (_sync_at_pair == 1) sync_coprocessor(); + return; + } + #endif + add_results(f_in, ev_in, eatom, vatom, 0); + if (_overflow_flag[LMP_OVERFLOW]) + error->one(FLERR, "Neighbor list overflow, boost neigh_modify one"); + #ifdef _LMP_INTEL_OFFLOAD + if 
(_sync_at_pair) sync_coprocessor(); + #endif +} + +/* ---------------------------------------------------------------------- */ + +void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, + float *ev_in, const int offload, + const int eatom, const int vatom) { + #ifdef _LMP_INTEL_OFFLOAD + if (offload) { + _off_results_eatom = eatom; + _off_results_vatom = vatom; + _off_force_array_s = f_in; + _off_ev_array_s = ev_in; + if (_sync_at_pair == 1) sync_coprocessor(); + return; + } + #endif + add_results(f_in, ev_in, eatom, vatom, 0); + if (_overflow_flag[LMP_OVERFLOW]) + error->one(FLERR, "Neighbor list overflow, boost neigh_modify one"); + #ifdef _LMP_INTEL_OFFLOAD + if (_sync_at_pair) sync_coprocessor(); + #endif +} + +/* ---------------------------------------------------------------------- */ + +template +void FixIntel::add_results(const ft * restrict const f_in, + const acc_t * restrict const ev_global, + const int eatom, const int vatom, + const int offload) { + start_watch(TIME_PACK); + int f_length; + #ifdef _LMP_INTEL_OFFLOAD + if (_separate_buffers) { + if (offload) { + add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal); + if (force->newton_pair) { + const acc_t * restrict const enull = 0; + int offset = _offload_nlocal; + if (atom->torque) offset *= 2; + add_oresults(f_in + offset, enull, eatom, vatom, + _offload_min_ghost, _offload_nghost); + } + } else { + add_oresults(f_in, ev_global, eatom, vatom, + _host_min_local, _host_used_local); + if (force->newton_pair) { + const acc_t * restrict const enull = 0; + int offset = _host_used_local; + if (atom->torque) offset *= 2; + add_oresults(f_in + offset, enull, eatom, + vatom, _host_min_ghost, _host_used_ghost); + } + } + stop_watch(TIME_PACK); + return; + } + if (force->newton_pair && (_offload_noghost == 0 || offload == 0)) + f_length = atom->nlocal + atom->nghost; + else + f_length = atom->nlocal; + #else + if (force->newton_pair) + f_length = atom->nlocal + atom->nghost; + else + 
f_length = atom->nlocal; + #endif + + add_oresults(f_in, ev_global, eatom, vatom, 0, f_length); + stop_watch(TIME_PACK); +} + +/* ---------------------------------------------------------------------- */ + +template +void FixIntel::add_oresults(const ft * restrict const f_in, + const acc_t * restrict const ev_global, + const int eatom, const int vatom, + const int out_offset, const int nall) { + lmp_ft * restrict const f = (lmp_ft *) lmp->atom->f[0] + out_offset; + if (atom->torque) { + if (f_in[1].w) + if (f_in[1].w == 1) + error->all(FLERR,"Bad matrix inversion in mldivide3"); + else + error->all(FLERR, + "Sphere particles not yet supported for gayberne/intel"); + } + + #if defined(_OPENMP) + #pragma omp parallel default(none) + #endif + { + const int tid = omp_get_thread_num(); + int ifrom, ito; + IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t)); + if (atom->torque) { + int ii = ifrom * 2; + lmp_ft * restrict const tor = (lmp_ft *) lmp->atom->torque[0] + + out_offset; + if (eatom) { + for (int i = ifrom; i < ito; i++) { + f[i].x += f_in[ii].x; + f[i].y += f_in[ii].y; + f[i].z += f_in[ii].z; + force->pair->eatom[i] += f_in[ii].w; + tor[i].x += f_in[ii+1].x; + tor[i].y += f_in[ii+1].y; + tor[i].z += f_in[ii+1].z; + ii += 2; + } + } else { + for (int i = ifrom; i < ito; i++) { + f[i].x += f_in[ii].x; + f[i].y += f_in[ii].y; + f[i].z += f_in[ii].z; + tor[i].x += f_in[ii+1].x; + tor[i].y += f_in[ii+1].y; + tor[i].z += f_in[ii+1].z; + ii += 2; + } + } + } else { + if (eatom) { + for (int i = ifrom; i < ito; i++) { + f[i].x += f_in[i].x; + f[i].y += f_in[i].y; + f[i].z += f_in[i].z; + force->pair->eatom[i] += f_in[i].w; + } + } else { + for (int i = ifrom; i < ito; i++) { + f[i].x += f_in[i].x; + f[i].y += f_in[i].y; + f[i].z += f_in[i].z; + } + } + } + } + + if (ev_global != NULL) { + force->pair->eng_vdwl += ev_global[0]; + force->pair->eng_coul += ev_global[1]; + force->pair->virial[0] += ev_global[2]; + force->pair->virial[1] += ev_global[3]; 
+ force->pair->virial[2] += ev_global[4]; + force->pair->virial[3] += ev_global[5]; + force->pair->virial[4] += ev_global[6]; + force->pair->virial[5] += ev_global[7]; + } +} + +#ifdef _LMP_INTEL_OFFLOAD + +/* ---------------------------------------------------------------------- */ + +int FixIntel::offload_end_pair() { + if (neighbor->ago == 0) return _balance_neighbor * atom->nlocal; + else return _balance_pair * atom->nlocal; +} + +/* ---------------------------------------------------------------------- */ + +double FixIntel::stop_watch(const int which) { + double elapsed = MPI_Wtime() - _stopwatch[which]; + _timers[which] += elapsed; + return elapsed; +} + +/* ---------------------------------------------------------------------- */ + +void FixIntel::balance_stamp() { + if (_offload_balance < 0.0) { + double ct = MPI_Wtime(); + _balance_other_time = ct; + _balance_pair_time = ct - _stopwatch[TIME_HOST_PAIR]; + } +} + +/* ---------------------------------------------------------------------- */ + +void FixIntel::acc_timers() { + if (neighbor->ago == 0) { + _timers[TIME_OFFLOAD_NEIGHBOR] += *_stopwatch_offload_neighbor; + if (_setup_time_cleared == false) { + zero_timers(); + _setup_time_cleared = true; + } + } + _timers[TIME_OFFLOAD_PAIR] += *_stopwatch_offload_pair; +} + +/* ---------------------------------------------------------------------- */ + +void FixIntel::set_neighbor_host_sizes() { + _host_min_local = _overflow_flag[LMP_LOCAL_MIN]; + _host_min_ghost = _overflow_flag[LMP_GHOST_MIN]; + _host_used_local = atom->nlocal - _host_min_local; + _host_used_ghost = _overflow_flag[LMP_GHOST_MAX] + 1 - _host_min_ghost; + if (_host_used_ghost < 0) _host_used_ghost = 0; + _host_nall = atom->nlocal + _host_used_ghost; +} + +/* ---------------------------------------------------------------------- */ + +template +void FixIntel::add_off_results(const ft * restrict const f_in, + const acc_t * restrict const ev_global) { + if (_offload_balance < 0.0) + 
_balance_other_time = MPI_Wtime() - _balance_other_time; + + start_watch(TIME_OFFLOAD_WAIT); + #ifdef _LMP_INTEL_OFFLOAD + #pragma offload_wait target(mic:_cop) wait(f_in) + #endif + double wait_time = stop_watch(TIME_OFFLOAD_WAIT); + + if (neighbor->ago == 0) { + if (_off_overflow_flag[LMP_OVERFLOW]) + error->one(FLERR, "Neighbor list overflow, boost neigh_modify one"); + _offload_nlocal = _off_overflow_flag[LMP_LOCAL_MAX] + 1; + _offload_min_ghost = _off_overflow_flag[LMP_GHOST_MIN]; + _offload_nghost = _off_overflow_flag[LMP_GHOST_MAX] + 1 - + _offload_min_ghost; + if (_offload_nghost < 0) _offload_nghost = 0; + _offload_nall = _offload_nlocal + _offload_nghost; + _offload_nlocal; + } + + int nlocal = atom->nlocal; + // Load balance? + if (_offload_balance < 0.0) { + if (neighbor->ago == 0) + _balance_pair = _balance_neighbor; + double mic_time; + mic_time = *_stopwatch_offload_pair; + if (_balance_pair_time + _balance_other_time < mic_time) { + double ft = _balance_pair_time + _balance_other_time + wait_time - + mic_time; + _balance_fixed = (1.0 - INTEL_LB_MEAN_WEIGHT) * _balance_fixed + + INTEL_LB_MEAN_WEIGHT * ft; + } + + double ctps = _balance_pair_time / (1.0-_balance_pair); + double otps = mic_time / _balance_pair; + double new_balance = (ctps + _balance_other_time - _balance_fixed) / + (otps + ctps); + if (new_balance < 0.01) new_balance = 0.01; + else if (new_balance > 0.99) new_balance = 0.99; + _balance_neighbor = (1.0 - INTEL_LB_MEAN_WEIGHT) *_balance_neighbor + + INTEL_LB_MEAN_WEIGHT * new_balance; + } + + #ifdef TIME_BALANCE + start_watch(TIME_IMBALANCE); + MPI_Barrier(_real_space_comm); + stop_watch(TIME_IMBALANCE); + #endif + acc_timers(); + if (atom->torque) + if (f_in[1].w < 0.0) + error->all(FLERR, "Bad matrix inversion in mldivide3"); + add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1); +} + +#endif + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: The 'package intel' command is required for /intel styles + 
+Self-explanatory. + +E: Neighbor list overflow, boost neigh_modify one + +Increase the value for neigh_modify one to allow for larger allocations for +neighbor list builds. The value required can be different for the Intel +package in order to support offload to a coprocessor. + +E: Bad matrix inversion in mldivide3 + +This error should not occur unless the matrix is badly formed. + +E: Illegal package intel command + +The format for the package intel command is incorrect. Please see the +documentation. + +E: fix intel has to operate on group 'all' + +Self-explanatory. + +E: Illegal package intel mode requested + +The format for the package intel command is incorrect. Please see the +documentation. + +E: Specified run_style does not support the Intel package. + +When using offload to a coprocessor, the Intel package requires a run style +with the intel suffix. + +E: Currently, neighbor style BIN must be used with Intel package. + +This is the only neighbor style that has been implemented for the Intel +package. + +E: Currently, cannot use neigh_modify exclude with Intel package. + +This is a current restriction of the Intel package. + +E: Currently, cannot use more than one intel style with hybrid. + +Currently, hybrid pair styles can only use the intel suffix for one of the +pair styles. + +E: Cannot yet use hybrid styles with Intel package. + +The hybrid pair style configuration is not yet supported by the Intel +package. Support is limited to hybrid/overlay or a hybrid style that does +not require a skip list. + +E: MPI tasks per node must be multiple of offload_cards + +For offload to multiple coprocessors on a single node, the Intel package +requires that each coprocessor is used by the same number of MPI tasks. 
+
+*/
diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp
new file mode 100644
index 0000000000..a541f0f359
--- /dev/null
+++ b/src/USER-INTEL/intel_buffers.cpp
@@ -0,0 +1,469 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "intel_buffers.h"
+#include "force.h"
+#include "memory.h"
+
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   start with every buffer unallocated and all size counters at zero
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
+  lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _off_threads(0),
+  _buf_size(0), _buf_local_size(0) {  // same order as the member declarations
+  _list_alloc_atoms = 0;
+  _ntypes = 0;
+  _off_map_maxlocal = 0;
+  #ifdef _LMP_INTEL_OFFLOAD
+  _separate_buffers = 0;
+  _off_f = 0;
+  _off_map_ilist = 0;
+  _off_map_nmax = 0;
+  _off_map_maxhead = 0;
+  _off_list_alloc = false;
+  _off_threads = 0;
+  #endif
+}
+
+/* ----------------------------------------------------------------------
+   release atom/force buffers, neighbor buffers and the cutoff table
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+IntelBuffers<flt_t, acc_t>::~IntelBuffers()
+{
+  free_buffers();
+  free_all_nbor_buffers();
+  set_ntypes(0);
+}
+
+/* ----------------------------------------------------------------------
+   free the atom (x, q, quat) and force buffers on host and coprocessor
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_buffers()
+{
+  if (_buf_size > 0) {
+    atom_t * x = get_x();
+    flt_t * q = get_q();
+    quat_t * quat = get_quat();
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    vec3_acc_t * f_start = get_off_f();
+    if (f_start != 0) {
+      acc_t * ev_global = get_ev_global();
+      if (ev_global != 0) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(x:alloc_if(0) free_if(1)) \
+          nocopy(f_start:alloc_if(0) free_if(1)) \
+          nocopy(ev_global:alloc_if(0) free_if(1))
+      }
+
+      if (q != 0) {
+        #pragma offload_transfer target (mic:_cop) \
+          nocopy(q:alloc_if(0) free_if(1))
+      }
+      if (quat != 0) {
+        #pragma offload_transfer target (mic:_cop) \
+          nocopy(quat:alloc_if(0) free_if(1))
+      }
+      lmp->memory->destroy(_off_f);  // member, not the local copy: no stale pointer
+    }
+
+    if (_separate_buffers) {
+      lmp->memory->destroy(_host_x);
+      if (q != 0) lmp->memory->destroy(_host_q);
+      if (quat != 0) lmp->memory->destroy(_host_quat);
+    }
+    #endif
+
+    // x, q and quat alias _x, _q and _quat (get_*() with the default argument);
+    // destroying through the members keeps them from dangling afterwards
+    lmp->memory->destroy(_x);
+    if (q != 0) lmp->memory->destroy(_q);
+    if (quat != 0) lmp->memory->destroy(_quat);
+    lmp->memory->destroy(_f);
+    _buf_size = _buf_local_size = 0;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   (re)allocate the atom and force buffers with roughly 10% headroom over
+   nall/nlocal; when offload_end > 0 also allocate them on the coprocessor
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
+                                       const int nthreads,
+                                       const int offload_end)
+{
+  free_buffers();
+  _buf_size = static_cast<double>(nall) * 1.1 + 1;
+  if (lmp->force->newton_pair)
+    _buf_local_size = _buf_size;
+  else
+    _buf_local_size = static_cast<double>(nlocal) * 1.1 + 1;
+  if (lmp->atom->torque)
+    _buf_local_size *= 2;
+  const int f_stride = get_stride(_buf_local_size);
+  lmp->memory->create(_x, _buf_size,"intel_x");
+  if (lmp->atom->q != NULL)
+    lmp->memory->create(_q, _buf_size, "intel_q");
+  if (lmp->atom->ellipsoid != NULL)
+    lmp->memory->create(_quat, _buf_size, "intel_quat");
+  lmp->memory->create(_f, f_stride * nthreads, "intel_f");  // one block per thread
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_separate_buffers) {
+    lmp->memory->create(_host_x, _buf_size,"intel_host_x");
+    if (lmp->atom->q != NULL)
+      lmp->memory->create(_host_q, _buf_size, "intel_host_q");
+    if (lmp->atom->ellipsoid != NULL)
+      lmp->memory->create(_host_quat, _buf_size, "intel_host_quat");
+  }
+
+  if (offload_end > 0) {
+    lmp->memory->create(_off_f, f_stride * _off_threads, "intel_off_f");
+    const atom_t * const x = get_x();
+    const flt_t * const q = get_q();
+    const vec3_acc_t * f_start = get_off_f();
+    acc_t * ev_global = get_ev_global();
+    if (lmp->atom->q != NULL) {
+      if (x != NULL && q != NULL && f_start != NULL && ev_global != NULL) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(x,q:length(_buf_size) alloc_if(1) free_if(0)) \
+          nocopy(f_start:length(f_stride*_off_threads) alloc_if(1) free_if(0))\
+          nocopy(ev_global:length(8) alloc_if(1) free_if(0))
+      }
+    } else {
+      if (x != NULL && f_start != NULL && ev_global != NULL) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(x:length(_buf_size) alloc_if(1) free_if(0)) \
+          nocopy(f_start:length(f_stride*_off_threads) alloc_if(1) free_if(0))\
+          nocopy(ev_global:length(8) alloc_if(1) free_if(0))
+      }
+    }
+    if (lmp->atom->ellipsoid != NULL) {
+      const quat_t * const quat = get_quat();
+      if (quat != NULL) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(quat:length(_buf_size) alloc_if(1) free_if(0))
+      }
+    }
+  }
+  #endif
+}
+
+/* ----------------------------------------------------------------------
+   free the coprocessor copies of tag, special, nspecial and bins
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_nmax()
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_off_map_nmax > 0) {
+    const int * tag = _off_map_tag;
+    const int * special = _off_map_special;
+    const int * nspecial = _off_map_nspecial;
+    const int * bins = _off_map_bins;
+    if (tag != 0 && special != 0 && nspecial !=0 && bins != 0) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(tag:alloc_if(0) free_if(1)) \
+        nocopy(special,nspecial:alloc_if(0) free_if(1)) \
+        nocopy(bins:alloc_if(0) free_if(1))
+    }
+    _off_map_nmax = 0;
+  }
+  #endif
+}
+
+/* ----------------------------------------------------------------------
+   allocate coprocessor storage for the per-atom arrays sized by atom->nmax
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow_nmax()
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  free_nmax();
+  int *special, *nspecial;
+  int tag_length, special_length, nspecial_length;
+  int size = lmp->atom->nmax;
+  if (lmp->atom->molecular) {
+    special = lmp->atom->special[0];
+    nspecial = lmp->atom->nspecial[0];
+    special_length = size * lmp->atom->maxspecial;
+    nspecial_length = size * 3;
+    tag_length = size;
+  } else {
+    special = &_special_holder;  // 1-element placeholders for non-molecular runs
+    nspecial = &_nspecial_holder;
+    special_length = 1;
+    nspecial_length = 1;
+    tag_length = 1;
+  }
+  int *tag = lmp->atom->tag;
+  int *bins = lmp->neighbor->bins;
+  #pragma offload_transfer target(mic:_cop) \
+    nocopy(bins:length(size) alloc_if(1) free_if(0)) \
+    nocopy(tag:length(tag_length) alloc_if(1) free_if(0)) \
+    nocopy(special:length(special_length) alloc_if(1) free_if(0)) \
+    nocopy(nspecial:length(nspecial_length) alloc_if(1) free_if(0))
+  _off_map_tag = tag;  // remember the host addresses so free_nmax() can release them
+  _off_map_special = special;
+  _off_map_nspecial = nspecial;
+  _off_map_nmax = size;
+  _off_map_bins = bins;
+  #endif
+}
+
+/* ----------------------------------------------------------------------
+   free cnumneigh and the coprocessor copies of ilist and numneigh
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_local()
+{
+  if (_off_map_maxlocal > 0) {
+    int * cnumneigh = _cnumneigh;
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_off_map_ilist != NULL) {
+      const int * ilist = _off_map_ilist;
+      const int * numneigh = _off_map_numneigh;
+      _off_map_ilist = NULL;
+      if (numneigh != 0 && ilist != 0) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(ilist,numneigh,cnumneigh:alloc_if(0) free_if(1))
+      }
+    }
+    #endif
+    lmp->memory->destroy(cnumneigh);
+    _off_map_maxlocal = 0;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   allocate cnumneigh with list->get_maxlocal() entries (plus offload copies)
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow_local(NeighList *list,
+                                             const int offload_end)
+{
+  free_local();
+  int size = list->get_maxlocal();
+  lmp->memory->create(_cnumneigh, size, "_cnumneigh");
+  _off_map_maxlocal = size;
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload_end > 0) {
+    int * numneigh = list->numneigh;
+    int * ilist = list->ilist;
+    int * cnumneigh = _cnumneigh;
+    if (cnumneigh != 0) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(ilist:length(size) alloc_if(1) free_if(0)) \
+        nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \
+        nocopy(cnumneigh:length(size) alloc_if(1) free_if(0))
+    }
+    _off_map_ilist = ilist;
+    _off_map_numneigh = numneigh;
+  }
+  #endif
+}
+
+/* ----------------------------------------------------------------------
+   free the coprocessor copy of the neighbor bin head array
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_binhead()
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_off_map_maxhead > 0) {
+    const int * binhead = _off_map_binhead;
+    if (binhead !=0) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(binhead:alloc_if(0) free_if(1))
+    }
+    _off_map_maxhead = 0;
+  }
+  #endif
+}
+
+/* ----------------------------------------------------------------------
+   allocate coprocessor storage for neighbor->binhead (maxhead entries)
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow_binhead()
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  free_binhead();
+  int * binhead = lmp->neighbor->binhead;
+  const int maxhead = lmp->neighbor->maxhead;
+  #pragma offload_transfer target(mic:_cop) \
+    nocopy(binhead:length(maxhead) alloc_if(1) free_if(0))
+  _off_map_binhead = binhead;
+  _off_map_maxhead = maxhead;
+  #endif
+}
+
+/* ----------------------------------------------------------------------
+   free the neighbor list storage; the coprocessor copy is released first,
+   while _list_alloc still holds the address it was allocated under
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_nbor_list()
+{
+  if (_list_alloc_atoms > 0) {
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_off_list_alloc) {
+      int * list_alloc = _list_alloc;
+      int * special_flag = lmp->neighbor->special_flag_alloc();
+      int * stencil = _off_map_stencil;
+      if (list_alloc != 0 && special_flag != 0 && stencil != 0) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(special_flag,stencil:alloc_if(0) free_if(1)) \
+          nocopy(list_alloc:alloc_if(0) free_if(1))
+      }
+      _off_list_alloc = false;
+    }
+    #endif
+
+    lmp->memory->destroy(_list_alloc);  // host array last (see header comment)
+    _list_alloc_atoms = 0;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   allocate neighbor list storage for 1.1 * nlocal atoms (plus offload copy)
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
+                                                 const int nlocal,
+                                                 const int offload_end)
+{
+  free_nbor_list();
+  _list_alloc_atoms = 1.10 * nlocal;
+  int list_alloc_size = (_list_alloc_atoms + _off_threads) * get_max_nbors();
+  lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc");
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload_end > 0) {
+    int * list_alloc =_list_alloc;
+    int * special_flag = lmp->neighbor->special_flag;
+    int * stencil = list->stencil;
+
+    if (special_flag != NULL && list_alloc != NULL) {
+      #pragma offload_transfer target(mic:_cop) \
+        in(special_flag:length(4) alloc_if(1) free_if(0)) \
+        in(stencil:length(list->maxstencil) alloc_if(1) free_if(0)) \
+        nocopy(list_alloc:length(list_alloc_size) alloc_if(1) free_if(0))
+      _off_map_stencil = stencil;
+      _off_list_alloc = true;
+    }
+  }
+  #endif
+}
+
+/* ----------------------------------------------------------------------
+   replace the coprocessor stencil copy with the list's current stencil
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow_stencil(NeighList *list)
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  int * stencil = _off_map_stencil;
+  #pragma offload_transfer target(mic:_cop) \
+    nocopy(stencil:alloc_if(0) free_if(1))
+  stencil = list->stencil;
+  #pragma offload_transfer target(mic:_cop) \
+    in(stencil:length(list->maxstencil) alloc_if(1) free_if(0))
+  _off_map_stencil = stencil;
+  #endif
+}
+
+/* ----------------------------------------------------------------------
+   resize the ntypes x ntypes cutneighsq table; ntypes = 0 only frees it
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
+{
+  if (ntypes != _ntypes) {
+    if (_ntypes > 0) {
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * cutneighsqo = _cutneighsq[0];
+      if (cutneighsqo != 0) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(cutneighsqo:alloc_if(0) free_if(1))
+      }
+      #endif
+      lmp->memory->destroy(_cutneighsq);
+    }
+    if (ntypes > 0) {
+      lmp->memory->create(_cutneighsq, ntypes, ntypes, "_cutneighsq");
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * cutneighsqo = _cutneighsq[0];
+      if (cutneighsqo != NULL) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(cutneighsqo:length(ntypes * ntypes) alloc_if(1) free_if(0))
+      }
+      #endif
+    }
+    _ntypes = ntypes;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   return the approximate number of bytes held by the buffers
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+double IntelBuffers<flt_t, acc_t>::memory_usage(const int nthreads)
+{
+  double tmem = sizeof(atom_t);
+  if (lmp->atom->q) tmem += sizeof(flt_t);
+  if (lmp->atom->torque) tmem += sizeof(quat_t);
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_separate_buffers) tmem *= 2;
+  #endif
+  tmem *= _buf_size;
+
+  const int fstride = get_stride(_buf_local_size);
+  tmem += fstride * nthreads * sizeof(vec3_acc_t);
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_off_f) tmem += fstride*_off_threads * sizeof(vec3_acc_t);
+  #endif
+
+  tmem += _off_map_maxlocal * sizeof(int);
+  tmem += (_list_alloc_atoms + _off_threads) * get_max_nbors() * sizeof(int);
+  tmem += _ntypes * _ntypes * sizeof(int);
+  return tmem;
+}
+
+/* ----------------------------------------------------------------------
+   explicit instantiations of the supported (flt_t, acc_t) combinations
+------------------------------------------------------------------------- */
+
+template class IntelBuffers<float,float>;  // NOTE(review): argument lists were lost in this copy and are reconstructed - confirm
+template class IntelBuffers<float,double>;
+template class IntelBuffers<double,double>;
diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h
new file mode 100644
index 0000000000..bc1ca9e3b8
--- /dev/null
+++ b/src/USER-INTEL/intel_buffers.h
@@ -0,0 +1,284 @@
+/* -*- c++ -*- -------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. 
Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifndef LMP_INTEL_BUFFERS_H
+#define LMP_INTEL_BUFFERS_H
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+#include "atom.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "intel_preprocess.h"
+#include <cstring>  // memcpy, used by pack_sep_from_single()
+
+namespace LAMMPS_NS {
+
+#define ATOM_T typename IntelBuffers<flt_t,flt_t>::atom_t  // NOTE(review): template arguments of these three macros were lost in this copy and are reconstructed - confirm
+#define QUAT_T typename IntelBuffers<flt_t,flt_t>::quat_t
+#define FORCE_T typename IntelBuffers<flt_t,acc_t>::vec3_acc_t
+
+// May not need a separate force array for mixed/double
+template <class flt_t, class acc_t>  // flt_t: type of the packed atom data, acc_t: type of the force/energy accumulators
+class IntelBuffers {
+ public:
+  typedef struct { flt_t x,y,z; int w; } atom_t;  // packed position; w carries the atom type (see thr_pack)
+  typedef struct { flt_t w,i,j,k; } quat_t;
+  typedef struct { flt_t x,y,z,w; } vec3_t;
+  typedef struct { flt_t x,y,z,w; } vec4_t;
+  typedef struct { acc_t x,y,z,w; } vec3_acc_t;  // element type of the force buffers
+
+  IntelBuffers(class LAMMPS *lmp_in);
+  ~IntelBuffers();
+
+  inline int get_stride(int nall) {  // padded length of one thread's block in the force buffer
+    int stride;
+    IP_PRE_get_stride(stride, nall, sizeof(vec3_acc_t),
+                      lmp->atom->torque);
+    return stride;
+  }
+
+  void free_buffers();  // release the atom and force buffers
+
+  inline void grow(const int nall, const int nlocal, const int nthreads,  // reallocate only when the current capacity is reached
+                   const int offload_end) {
+    if (nall >= _buf_size || nlocal >= _buf_local_size)
+      _grow(nall, nlocal, nthreads, offload_end);
+  }
+
+  inline void free_all_nbor_buffers() {  // release every neighbor-related buffer
+    free_nbor_list();
+    free_nmax();
+    free_binhead();
+    free_local();
+  }
+
+  inline void grow_nbor(NeighList *list, const int nlocal,  // grow all neighbor-related buffers as needed
+                        const int offload_end) {
+    grow_local(list, offload_end);
+    if (offload_end) {  // nmax and binhead copies exist only for offload
+      grow_nmax();
+      grow_binhead();
+    }
+    grow_nbor_list(list, nlocal, offload_end);
+  }
+
+  void free_nmax();
+
+  inline void grow_nmax() {  // no-op unless built with _LMP_INTEL_OFFLOAD
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (lmp->atom->nmax > _off_map_nmax)
+      _grow_nmax();
+    #endif
+  }
+
+  void free_local();
+
+  inline void grow_local(NeighList *list, const int offload_end) {  // grows when list->get_maxlocal() exceeds the current size
+    if (list->get_maxlocal() > _off_map_maxlocal)
+      _grow_local(list, offload_end);
+  }
+
+  void free_binhead();
+
+  inline void grow_binhead() {  // no-op unless built with _LMP_INTEL_OFFLOAD
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (lmp->neighbor->maxhead > _off_map_maxhead)
+      _grow_binhead();
+    #endif
+  }
+
+  inline int get_max_nbors() {  // ints reserved per atom: oneatom / INTEL_ONEATOM_FACTOR, rounded down to whole INTEL_DATA_ALIGN-byte chunks
+    int mn = lmp->neighbor->oneatom * sizeof(int) /
+        (INTEL_ONEATOM_FACTOR * INTEL_DATA_ALIGN);
+    return mn * INTEL_DATA_ALIGN / sizeof(int);
+  }
+
+  void free_nbor_list();
+
+  inline void grow_nbor_list(NeighList *list, const int nlocal,  // reallocate the list storage or, with offload, just refresh the stencil
+                             const int offload_end) {
+    if (nlocal > _list_alloc_atoms)
+      _grow_nbor_list(list, nlocal, offload_end);
+    #ifdef _LMP_INTEL_OFFLOAD
+    else if (offload_end > 0 && _off_map_stencil != list->stencil)  // list now points at a different stencil array
+      _grow_stencil(list);
+    #endif
+  }
+
+  void set_ntypes(const int ntypes);  // (re)size the ntypes x ntypes cutneighsq table
+
+  inline int * firstneigh(const NeighList *list) { return _list_alloc; }  // list argument is not used
+  inline int * cnumneigh(const NeighList *list) { return _cnumneigh; }  // list argument is not used
+
+  inline atom_t * get_x(const int offload = 1) {  // offload == 0 selects the separate host copy when separate buffers are enabled
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_separate_buffers && offload == 0) return _host_x;
+    #endif
+    return _x;
+  }
+  inline flt_t * get_q(const int offload = 1) {  // same selection rule as get_x()
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_separate_buffers && offload == 0) return _host_q;
+    #endif
+    return _q;
+  }
+  inline quat_t * get_quat(const int offload = 1) {  // same selection rule as get_x()
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_separate_buffers && offload == 0) return _host_quat;
+    #endif
+    return _quat;
+  }
+  inline vec3_acc_t * get_f() { return _f; }
+  inline acc_t * get_ev_global() { return _ev_global; }
+  inline acc_t * get_ev_global_host() { return _ev_global_host; }
+  inline void zero_ev()  // clear both 8-element accumulator arrays
+  { for (int i = 0; i < 8; i++) _ev_global[i] = _ev_global_host[i] = 0.0; }
+  inline flt_t ** get_cutneighsq() { return _cutneighsq; }
+  inline int get_off_threads() { return _off_threads; }
+  #ifdef _LMP_INTEL_OFFLOAD
+  inline void set_off_params(const int n, const int cop,  // n: offload thread count, cop: coprocessor id used in the offload pragmas
+                             const int separate_buffers)
+  { _off_threads = n; _cop = cop; _separate_buffers = separate_buffers; }
+  inline vec3_acc_t * get_off_f() { return _off_f; }
+  #endif
+
+  inline void thr_pack(const int ifrom, const int ito, const int ago) {  // copy atoms [ifrom,ito) into _x; types and charges only when ago == 0
+    if (ago == 0) {
+      for (int i = ifrom; i < ito; i++) {
+        _x[i].x = lmp->atom->x[i][0];
+        _x[i].y = lmp->atom->x[i][1];
+        _x[i].z = lmp->atom->x[i][2];
+        _x[i].w = lmp->atom->type[i];
+      }
+      if (lmp->atom->q != NULL)
+        for (int i = ifrom; i < ito; i++)
+          _q[i] = lmp->atom->q[i];
+    } else {
+      for (int i = ifrom; i < ito; i++) {  // positions only
+        _x[i].x = lmp->atom->x[i][0];
+        _x[i].y = lmp->atom->x[i][1];
+        _x[i].z = lmp->atom->x[i][2];
+      }
+    }
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  inline void thr_pack_cop(const int ifrom, const int ito,  // fill _x[i] from atom->x[i + offset]; types too when dotype is set
+                           const int offset, const bool dotype = false) {
+    double ** x = lmp->atom->x + offset;
+    if (dotype == false) {
+      #pragma vector nontemporal
+      for (int i = ifrom; i < ito; i++) {
+        _x[i].x = x[i][0];
+        _x[i].y = x[i][1];
+        _x[i].z = x[i][2];
+      }
+    } else {
+      int *type = lmp->atom->type + offset;
+      #pragma vector nontemporal
+      for (int i = ifrom; i < ito; i++) {
+        _x[i].x = x[i][0];
+        _x[i].y = x[i][1];
+        _x[i].z = x[i][2];
+        _x[i].w = type[i];
+      }
+    }
+  }
+
+  inline void thr_pack_host(const int ifrom, const int ito,  // fill _host_x[i] from atom->x[i + offset], positions only
+                            const int offset) {
+    double ** x = lmp->atom->x + offset;
+    for (int i = ifrom; i < ito; i++) {
+      _host_x[i].x = x[i][0];
+      _host_x[i].y = x[i][1];
+      _host_x[i].z = x[i][2];
+    }
+  }
+
+  inline void pack_sep_from_single(const int host_min_local,  // copy the used local and ghost ranges of _x/_q into the host buffers
+                                   const int used_local,
+                                   const int host_min_ghost,
+                                   const int used_ghost) {
+    memcpy(_host_x + host_min_local, _x + host_min_local,
+           used_local * sizeof(atom_t));
+    memcpy(_host_x + host_min_local + used_local, _x + host_min_ghost,  // ghosts are placed right after the locals
+           used_ghost * sizeof(atom_t));
+    int nall = used_local + used_ghost + host_min_local;
+    _host_x[nall].x = INTEL_BIGP;  // extra atom at INTEL_BIGP appended after the copied ones
+    _host_x[nall].y = INTEL_BIGP;
+    _host_x[nall].z = INTEL_BIGP;
+    _host_x[nall].w = 1;
+    if (lmp->atom->q != NULL) {
+      memcpy(_host_q + host_min_local, _q + host_min_local,
+             used_local * sizeof(flt_t));
+      memcpy(_host_q + host_min_local + used_local, _q + host_min_ghost,
+             used_ghost * sizeof(flt_t));
+    }
+  }
+  #endif
+
+  double memory_usage(const int nthreads);  // approximate size of the buffers in bytes
+
+  int _special_holder, _nspecial_holder;  // 1-element stand-ins offloaded by _grow_nmax() for non-molecular systems
+
+ protected:
+  LAMMPS *lmp;
+  atom_t *_x;
+  flt_t *_q;
+  quat_t *_quat;
+  vec3_acc_t * _f;
+  int _off_threads, _off_map_maxlocal;
+
+  int _list_alloc_atoms;
+  int * _list_alloc;
+  int * _cnumneigh;
+
+  flt_t **_cutneighsq;
+  int _ntypes;
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  int _separate_buffers;
+  atom_t *_host_x;
+  flt_t *_host_q;
+  quat_t *_host_quat;
+  vec3_acc_t *_off_f;
+  int _off_map_nmax, _off_map_maxhead, _cop;
+  int *_off_map_ilist;  // the _off_map_* pointers record host addresses that have a coprocessor allocation
+  int *_off_map_stencil, *_off_map_special, *_off_map_nspecial, *_off_map_tag;
+  int *_off_map_binhead, *_off_map_bins, *_off_map_numneigh;
+  bool _off_list_alloc;
+  #endif
+
+  int _buf_size, _buf_local_size;
+  __declspec(align(64)) acc_t _ev_global[8];
+  __declspec(align(64)) acc_t _ev_global_host[8];
+
+  void _grow(const int nall, const int nlocal, const int nthreads,
+             const int offload_end);
+  void _grow_nmax();
+  void _grow_local(NeighList *list, const int offload_end);
+  void _grow_binhead();
+  void _grow_nbor_list(NeighList *list, const int nlocal,
+                       const int offload_end);
+  void _grow_stencil(NeighList *list);
+};
+
+}
+
+#endif
diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h
new file mode 100644
index 0000000000..49e3413e0a
--- /dev/null
+++ b/src/USER-INTEL/intel_preprocess.h
@@ -0,0 +1,391 @@
+/* -*- c++ -*- -------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef __INTEL_OFFLOAD
+#ifdef LMP_INTEL_OFFLOAD
+#define _LMP_INTEL_OFFLOAD
+#endif
+#endif
+
+#ifndef LMP_INTEL_PREPROCESS_H
+#define LMP_INTEL_PREPROCESS_H
+
+#ifndef LAMMPS_MEMALIGN
+#error Please set -DLAMMPS_MEMALIGN=64 in CCFLAGS for your LAMMPS makefile.
+#endif
+
+namespace LAMMPS_NS {
+
+enum {LMP_OVERFLOW, LMP_LOCAL_MIN, LMP_LOCAL_MAX, LMP_GHOST_MIN,  // indices into the caller's overflow[] array
+      LMP_GHOST_MAX};
+enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,  // timer ids passed to start_watch()/stop_watch()
+      TIME_OFFLOAD_PAIR, TIME_OFFLOAD_WAIT, TIME_OFFLOAD_LATENCY,
+      TIME_IMBALANCE};
+#define NUM_ITIMERS ( TIME_IMBALANCE + 1 )
+
+#define INTEL_DATA_ALIGN 64  // alignment unit in bytes used when padding buffers
+#define INTEL_ONEATOM_FACTOR 2
+#define INTEL_MIC_VECTOR_WIDTH 16
+#define INTEL_MIC_NBOR_PAD INTEL_MIC_VECTOR_WIDTH
+#define INTEL_VECTOR_WIDTH 8
+#define INTEL_NBOR_PAD INTEL_VECTOR_WIDTH
+#define INTEL_LB_MEAN_WEIGHT 0.1
+#define INTEL_BIGP 1e15  // large coordinate written into the extra atom stored after the last packed one
+
+#define IP_PRE_get_stride(stride, n, datasize, torque) /* stride = n elements (doubled if torque) plus padding up to the next INTEL_DATA_ALIGN-byte boundary */ \
+  { \
+    int blength = (n); \
+    if (torque) blength *= 2; \
+    const int bytes = blength * (datasize); \
+    stride = INTEL_DATA_ALIGN - (bytes % INTEL_DATA_ALIGN); \
+    stride = blength + stride / (datasize); \
+  }
+
+#if defined(_OPENMP)
+
+#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) /* [ifrom,ito) = contiguous share of inum items for thread tid */ \
+  { \
+    const int idelta = 1 + (inum)/(nthreads); \
+    ifrom = (tid) * idelta; \
+    ito = ((ifrom + idelta) > (inum)) ? (inum) : ifrom + idelta; \
+  }
+
+#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) /* as IP_PRE_omp_range, but first sets tid from omp_get_thread_num() */ \
+  { \
+    tid = omp_get_thread_num(); \
+    IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads); \
+  }
+
+#define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, /* share of inum items whose length is a multiple of INTEL_DATA_ALIGN / datasize */ \
+                             datasize) \
+{ \
+  int chunk_size = INTEL_DATA_ALIGN / (datasize); \
+  int idelta = static_cast<int>(static_cast<float>(inum) /* NOTE(review): cast types were lost in this copy and are reconstructed - confirm */ \
+                                /chunk_size/(nthreads)) + 1; \
+  idelta *= chunk_size; \
+  ifrom = (tid)*idelta; \
+  ito = ifrom + idelta; \
+  if (ito > (inum)) ito = (inum); \
+}
+
+#define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum, /* as IP_PRE_omp_range_align, but first sets tid from omp_get_thread_num() */ \
+                                nthreads, datasize) \
+  { \
+    tid = omp_get_thread_num(); \
+    IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
+                           datasize); \
+  }
+
+#else
+
+#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) /* without OpenMP the single thread gets the whole range */ \
+  { \
+    ifrom = 0; \
+    ito = (inum); \
+  }
+
+#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) \
+  { \
+    tid = 0; \
+    ifrom = 0; \
+    ito = (inum); \
+  }
+
+#define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
+                             datasize) \
+{ \
+  ifrom = 0; \
+  ito = (inum); \
+}
+
+#define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum, \
+                                nthreads, datasize) \
+{ \
+  tid = 0; \
+  ifrom = 0; \
+  ito = (inum); \
+}
+
+#endif
+
+#ifdef _LMP_INTEL_OFFLOAD
+#include <sys/time.h>
+
+__declspec( target (mic))
+inline double MIC_Wtime() {  // wall-clock time in seconds from gettimeofday()
+  double time;
+  struct timeval tv;
+
+  gettimeofday(&tv, NULL);
+  time = 1.0 * tv.tv_sec + 1.0e-6 * tv.tv_usec;
+  return time;
+}
+
+#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, /* when separate buffers are on and ago != 0, repack positions for the offload or the host part; uses comm and flt_t from the caller's scope */ \
+                                     nlocal, nall) \
+{ \
+  if (fix->separate_buffers() && ago != 0) { \
+    fix->start_watch(TIME_PACK); \
+    if (offload) { \
+      _Pragma("omp parallel default(none) shared(buffers,nlocal,nall)") \
+      { \
+        int ifrom, ito, tid; \
+        int nthreads = comm->nthreads; \
+        IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal, \
+                                  nthreads, sizeof(flt_t)); \
+        buffers->thr_pack_cop(ifrom, ito, 0); \
+        int nghost = nall - nlocal; \
+        if (nghost) { \
+          IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal, \
+                                 nthreads, sizeof(flt_t)); \
+          buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal, \
+                                fix->offload_min_ghost() - nlocal, \
+                                ago == 1); \
+        } \
+      } \
+    } else { \
+      buffers->thr_pack_host(fix->host_min_local(), nlocal, 0); \
+      buffers->thr_pack_host(nlocal, nall, \
+                             fix->host_min_ghost()-nlocal); \
+    } \
+    fix->stop_watch(TIME_PACK); \
+  } \
+}
+
+#define IP_PRE_get_transfern(ago, newton, evflag, eflag, vflag, /* sets separate_flag, the x/q/ev transfer sizes and f_stride; uses nall, nlocal, minlocal and lmp from the caller's scope */ \
+                             buffers, offload, fix, separate_flag, \
+                             x_size, q_size, ev_size, f_stride) \
+{ \
+  separate_flag = 0; \
+  if (ago == 0) { \
+    x_size = 0; \
+    q_size = nall; \
+    if (offload) { \
+      if (fix->separate_buffers()) { \
+        if (lmp->atom->torque) \
+          separate_flag = 2; \
+        else \
+          separate_flag = 1; \
+      } else \
+        separate_flag = 3; \
+    } \
+  } else { \
+    x_size = nall; \
+    q_size = 0; \
+  } \
+  ev_size = 0; \
+  if (evflag) { \
+    if (eflag) ev_size = 2; \
+    if (vflag) ev_size = 8; \
+  } \
+  int f_length; \
+  if (newton) \
+    f_length = nall; \
+  else \
+    f_length = nlocal; \
+  f_length -= minlocal; \
+  f_stride = buffers->get_stride(f_length); \
+}
+
+#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, /* select thread count, force buffer and ev accumulator for the offload or the host pass; the host pass also starts TIME_HOST_PAIR */ \
+                           ev_global) \
+{ \
+  if (offload) { \
+    tc = buffers->get_off_threads(); \
+    f_start = buffers->get_off_f(); \
+    ev_global = buffers->get_ev_global(); \
+  } else { \
+    tc = comm->nthreads; \
+    f_start = buffers->get_f(); \
+    fix->start_watch(TIME_HOST_PAIR); \
+    ev_global = buffers->get_ev_global_host(); \
+  } \
+}
+
+#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall, /* compact used local and ghost atoms so they are contiguous; reads overflow[] and NEWTON_PAIR from the caller's scope */ \
+                                  f_stride, x, q) \
+{ \
+  if (separate_flag) { \
+    if (separate_flag < 3) { \
+      int all_local = nlocal; \
+      int ghost_min = overflow[LMP_GHOST_MIN]; \
+      nlocal = overflow[LMP_LOCAL_MAX] + 1; \
+      int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min; \
+      if (nghost < 0) nghost = 0; \
+      nall = nlocal + nghost; \
+      separate_flag--; \
+      int flength; \
+      if (NEWTON_PAIR) flength = nall; \
+      else flength = nlocal; \
+      IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T), \
+                        separate_flag); \
+      if (nghost) { \
+        if (nlocal < all_local || ghost_min > all_local) { \
+          memmove(x + nlocal, x + ghost_min, \
+                  (nall - nlocal) * sizeof(ATOM_T)); \
+          if (q != 0) \
+            memmove((void *)(q + nlocal), (void *)(q + ghost_min), \
+                    (nall - nlocal) * sizeof(flt_t)); \
+        } \
+      } \
+    } \
+    x[nall].x = INTEL_BIGP; \
+    x[nall].y = INTEL_BIGP; \
+    x[nall].z = INTEL_BIGP; \
+  } \
+}
+
+
+#else
+
+#define MIC_Wtime MPI_Wtime
+#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, /* expands to nothing without offload support */ \
+                                     nlocal, nall)
+
+#define IP_PRE_get_transfern(ago, newton, evflag, eflag, vflag, /* host-only build: just clears separate_flag and computes f_stride */ \
+                             buffers, offload, fix, separate_flag, \
+                             x_size, q_size, ev_size, f_stride) \
+{ \
+  separate_flag = 0; \
+  int f_length; \
+  if (newton) \
+    f_length = nall; \
+  else \
+    f_length = nlocal; \
+  f_stride = buffers->get_stride(f_length); \
+}
+
+#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, /* host-only build: always the host buffers */ \
+                           ev_global) \
+{ \
+  tc = comm->nthreads; \
+  f_start = buffers->get_f(); \
+  fix->start_watch(TIME_HOST_PAIR); \
+  ev_global = buffers->get_ev_global_host(); \
+}
+
+#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall, /* expands to nothing without offload support */ \
+                                  f_stride, x, q)
+
+
+#endif
+
+#define IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz) /* add one pair's contribution to the caller's sv0..sv5 when vflag == 1 */ \
+{ \
+  if (vflag == 1) { \
+    sv0 += ev_pre * delx * delx * fpair; \
+    sv1 += ev_pre * dely * dely * fpair; \
+    sv2 += ev_pre * delz * delz * fpair; \
+    sv3 += ev_pre * delx * dely * fpair; \
+    sv4 += ev_pre * delx * delz * fpair; \
+    sv5 += ev_pre * dely * delz * fpair; \
+  } \
+}
+
+#define IP_PRE_ev_tally_atom(evflag, eflag, vflag, f, fwtmp) /* fold atom i's sums (sevdwl, sv0..sv5) into the caller's oevdwl and ov0..ov5 */ \
+{ \
+  if (evflag) { \
+    if (eflag) { \
+      f[i].w += fwtmp; \
+      oevdwl += sevdwl; \
+    } \
+    if (vflag == 1) { \
+      ov0 += sv0; \
+      ov1 += sv1; \
+      ov2 += sv2; \
+      ov3 += sv3; \
+      ov4 += sv4; \
+      ov5 += sv5; \
+    } \
+  } \
+}
+
+#define IP_PRE_ev_tally_atomq(evflag, eflag, vflag, f, fwtmp) /* as IP_PRE_ev_tally_atom, additionally adding secoul to oecoul */ \
+{ \
+  if (evflag) { \
+    if (eflag) { \
+      f[i].w += fwtmp; \
+      oevdwl += sevdwl; \
+      oecoul += secoul; \
+    } \
+    if (vflag == 1) { \
+      ov0 += sv0; \
+      ov1 += sv1; \
+      ov2 += sv2; \
+      ov3 += sv3; \
+      ov4 += sv4; \
+      ov5 += sv5; \
+    } \
+  } \
+}
+
+#define IP_PRE_fdotr_acc_force(newton, evflag, eflag, vflag, eatom, /* sum the per-thread force blocks into block 0 and, for vflag == 2, accumulate f*x into ov0..ov5; uses offload, tid, iifrom, iito from the caller's scope */ \
+                               nall, nlocal, minlocal, nthreads, \
+                               f_start, f_stride, x) \
+{ \
+  int o_range; \
+  if (newton) \
+    o_range = nall; \
+  else \
+    o_range = nlocal; \
+  if (offload == 0) o_range -= minlocal; \
+  IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, \
+                         sizeof(acc_t)); \
+  \
+  int t_off = f_stride; \
+  if (eflag && eatom) { \
+    for (int t = 1; t < nthreads; t++) { \
+      _Pragma("vector nontemporal") \
+      for (int n = iifrom; n < iito; n++) { \
+        f_start[n].x += f_start[n + t_off].x; \
+        f_start[n].y += f_start[n + t_off].y; \
+        f_start[n].z += f_start[n + t_off].z; \
+        f_start[n].w += f_start[n + t_off].w; \
+      } \
+      t_off += f_stride; \
+    } \
+  } else { \
+    for (int t = 1; t < nthreads; t++) { \
+      _Pragma("vector nontemporal") \
+      for (int n = iifrom; n < iito; n++) { \
+        f_start[n].x += f_start[n + t_off].x; \
+        f_start[n].y += f_start[n + t_off].y; \
+        f_start[n].z += f_start[n + t_off].z; \
+      } \
+      t_off += f_stride; \
+    } \
+  } \
+  \
+  if (evflag) { \
+    if (vflag == 2) { \
+      const ATOM_T * restrict const xo = x + minlocal; \
+      _Pragma("vector nontemporal") \
+      for (int n = iifrom; n < iito; n++) { \
+        ov0 += f_start[n].x * xo[n].x; \
+        ov1 += f_start[n].y * xo[n].y; \
+        ov2 += f_start[n].z * xo[n].z; \
+        ov3 += f_start[n].y * xo[n].x; \
+        ov4 += f_start[n].z * xo[n].x; \
+        ov5 += f_start[n].z * xo[n].y; \
+      } \
+    } \
+  } \
+}
+
+}
+
+#endif
diff --git a/src/USER-INTEL/math_extra_intel.h b/src/USER-INTEL/math_extra_intel.h
new file mode 100644
index 0000000000..62163b3f60
--- /dev/null
+++ b/src/USER-INTEL/math_extra_intel.h
@@ -0,0 +1,354 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National 
Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifndef LMP_MATH_EXTRA_INTEL_H +#define LMP_MATH_EXTRA_INTEL_H + +#define ME_quat_to_mat_trans(quat, mat) \ +{ \ + flt_t quat_w = quat.w; \ + flt_t quat_i = quat.i; \ + flt_t quat_j = quat.j; \ + flt_t quat_k = quat.k; \ + flt_t w2 = quat_w * quat_w; \ + flt_t i2 = quat_i * quat_i; \ + flt_t j2 = quat_j * quat_j; \ + flt_t k2 = quat_k * quat_k; \ + flt_t twoij = (flt_t)2.0 * quat_i * quat_j; \ + flt_t twoik = (flt_t)2.0 * quat_i * quat_k; \ + flt_t twojk = (flt_t)2.0 * quat_j * quat_k; \ + flt_t twoiw = (flt_t)2.0 * quat_i * quat_w; \ + flt_t twojw = (flt_t)2.0 * quat_j * quat_w; \ + flt_t twokw = (flt_t)2.0 * quat_k * quat_w; \ + \ + mat##_0 = w2 + i2 - j2 - k2; \ + mat##_3 = twoij - twokw; \ + mat##_6 = twojw + twoik; \ + \ + mat##_1 = twoij + twokw; \ + mat##_4 = w2 - i2 + j2 - k2; \ + mat##_7 = twojk - twoiw; \ + \ + mat##_2 = twoik - twojw; \ + mat##_5 = twojk + twoiw; \ + mat##_8 = w2 - i2 - j2 + k2; \ +} + +/* ---------------------------------------------------------------------- + diagonal matrix times a full matrix +------------------------------------------------------------------------- */ + +#define ME_diag_times3(d, m, ans) \ + { \ + ans##_0 = d[0] * m##_0; \ + ans##_1 = d[0] * m##_1; \ + ans##_2 = d[0] * m##_2; \ + ans##_3 = d[1] * m##_3; \ + ans##_4 = d[1] * m##_4; \ + ans##_5 = d[1] * m##_5; \ + ans##_6 = d[2] * m##_6; \ + 
ans##_7 = d[2] * m##_7; \ + ans##_8 = d[2] * m##_8; \ +} + +#define ME_diag_times3a(d, m, ans) \ + { \ + ans##_0 = d##_0 * m##_0; \ + ans##_1 = d##_0 * m##_1; \ + ans##_2 = d##_0 * m##_2; \ + ans##_3 = d##_1 * m##_3; \ + ans##_4 = d##_1 * m##_4; \ + ans##_5 = d##_1 * m##_5; \ + ans##_6 = d##_2 * m##_6; \ + ans##_7 = d##_2 * m##_7; \ + ans##_8 = d##_2 * m##_8; \ +} + +/* ---------------------------------------------------------------------- + multiply the transpose of mat1 times mat2 +------------------------------------------------------------------------- */ + +#define ME_transpose_times3(m1, m2, ans) \ +{ \ + ans##_0 = m1##_0*m2##_0 + m1##_3*m2##_3 + m1##_6*m2##_6; \ + ans##_1 = m1##_0*m2##_1 + m1##_3*m2##_4 + m1##_6*m2##_7; \ + ans##_2 = m1##_0*m2##_2 + m1##_3*m2##_5 + m1##_6*m2##_8; \ + ans##_3 = m1##_1*m2##_0 + m1##_4*m2##_3 + m1##_7*m2##_6; \ + ans##_4 = m1##_1*m2##_1 + m1##_4*m2##_4 + m1##_7*m2##_7; \ + ans##_5 = m1##_1*m2##_2 + m1##_4*m2##_5 + m1##_7*m2##_8; \ + ans##_6 = m1##_2*m2##_0 + m1##_5*m2##_3 + m1##_8*m2##_6; \ + ans##_7 = m1##_2*m2##_1 + m1##_5*m2##_4 + m1##_8*m2##_7; \ + ans##_8 = m1##_2*m2##_2 + m1##_5*m2##_5 + m1##_8*m2##_8; \ +} + +/* ---------------------------------------------------------------------- + normalize a vector, return in ans +------------------------------------------------------------------------- */ + +#define ME_normalize3(v0, v1, v2, ans) \ +{ \ + flt_t scale = (flt_t)1.0 / sqrt(v0*v0+v1*v1+v2*v2); \ + ans##_0 = v0 * scale; \ + ans##_1 = v1 * scale; \ + ans##_2 = v2 * scale; \ +} + +/* ---------------------------------------------------------------------- + add two matrices +------------------------------------------------------------------------- */ + +#define ME_plus3(m1, m2, ans) \ +{ \ + ans##_0 = m1##_0 + m2##_0; \ + ans##_1 = m1##_1 + m2##_1; \ + ans##_2 = m1##_2 + m2##_2; \ + ans##_3 = m1##_3 + m2##_3; \ + ans##_4 = m1##_4 + m2##_4; \ + ans##_5 = m1##_5 + m2##_5; \ + ans##_6 = m1##_6 + m2##_6; \ + ans##_7 = m1##_7 + 
m2##_7; \ + ans##_8 = m1##_8 + m2##_8; \ +} + +/* ---------------------------------------------------------------------- + dot product of 2 vectors +------------------------------------------------------------------------- */ + +#define ME_dot3(v1, v2) \ + (v1##_0*v2##_0 + v1##_1 * v2##_1 + v1##_2 * v2##_2) + +/* ---------------------------------------------------------------------- + determinant of a matrix +------------------------------------------------------------------------- */ + +#define ME_det3(m) \ + ( m##_0 * m##_4 * m##_8 - m##_0 * m##_5 * m##_7 - \ + m##_3 * m##_1 * m##_8 + m##_3 * m##_2 * m##_7 + \ + m##_6 * m##_1 * m##_5 - m##_6 * m##_2 * m##_4 ) + +/* ---------------------------------------------------------------------- + row vector times matrix +------------------------------------------------------------------------- */ + +#define ME_vecmat(v, m, ans) \ +{ \ + ans##_0 = v##_0 * m##_0 + v##_1 * m##_3 + v##_2 * m##_6; \ + ans##_1 = v##_0 * m##_1 + v##_1 * m##_4 + v##_2 * m##_7; \ + ans##_2 = v##_0 * m##_2 + v##_1 * m##_5 + v##_2 * m##_8; \ +} + +/* ---------------------------------------------------------------------- + cross product of 2 vectors +------------------------------------------------------------------------- */ + +#define ME_cross3(v1, v2, ans) \ +{ \ + ans##_0 = v1##_1 * v2##_2 - v1##_2 * v2##_1; \ + ans##_1 = v1##_2 * v2##_0 - v1##_0 * v2##_2; \ + ans##_2 = v1##_0 * v2##_1 - v1##_1 * v2##_0; \ +} + +/* ---------------------------------------------------------------------- + cross product of 2 vectors +------------------------------------------------------------------------- */ + +#define ME_mv0_cross3(m1, v2, ans) \ +{ \ + ans##_0 = m1##_1 * v2##_2 - m1##_2 * v2##_1; \ + ans##_1 = m1##_2 * v2##_0 - m1##_0 * v2##_2; \ + ans##_2 = m1##_0 * v2##_1 - m1##_1 * v2##_0; \ +} + +#define ME_mv1_cross3(m1, v2, ans) \ +{ \ + ans##_0 = m1##_4 * v2##_2 - m1##_5 * v2##_1; \ + ans##_1 = m1##_5 * v2##_0 - m1##_3 * v2##_2; \ + ans##_2 = m1##_3 * 
v2##_1 - m1##_4 * v2##_0; \ +} + +#define ME_mv2_cross3(m1, v2, ans) \ +{ \ + ans##_0 = m1##_7 * v2##_2 - m1##_8 * v2##_1; \ + ans##_1 = m1##_8 * v2##_0 - m1##_6 * v2##_2; \ + ans##_2 = m1##_6 * v2##_1 - m1##_7 * v2##_0; \ +} + + +#define ME_compute_eta_torque(m1, m2, s1, ans) \ +{ \ + flt_t den = m1##_3*m1##_2*m1##_7-m1##_0*m1##_5*m1##_7- \ + m1##_2*m1##_6*m1##_4+m1##_1*m1##_6*m1##_5- \ + m1##_3*m1##_1*m1##_8+m1##_0*m1##_4*m1##_8; \ + den = (flt_t)1.0 / den; \ + \ + ans##_0 = s1##_0*(m1##_5*m1##_1*m2##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_0- \ + m1##_4*m2##_2*m1##_2-(flt_t)2.0*m1##_5*m2##_0*m1##_7+ \ + m2##_1*m1##_2*m1##_7-m2##_1*m1##_1*m1##_8- \ + m1##_3*m1##_8*m2##_1+m1##_6*m1##_5*m2##_1+ \ + m1##_3*m2##_2*m1##_7-m2##_2*m1##_6*m1##_4)*den; \ + \ + ans##_1 = s1##_0*(m1##_2*m2##_0*m1##_7-m1##_8*m2##_0*m1##_1+ \ + (flt_t)2.0*m1##_0*m1##_8*m2##_1-m1##_0*m2##_2*m1##_5- \ + (flt_t)2.0*m1##_6*m1##_2*m2##_1+m2##_2*m1##_3*m1##_2- \ + m1##_8*m1##_3*m2##_0+m1##_6*m2##_0*m1##_5+ \ + m1##_6*m2##_2*m1##_1-m2##_2*m1##_0*m1##_7)*den; \ + \ + ans##_2 = s1##_0*(m1##_1*m1##_5*m2##_0-m1##_2*m2##_0*m1##_4- \ + m1##_0*m1##_5*m2##_1+m1##_3*m1##_2*m2##_1- \ + m2##_1*m1##_0*m1##_7-m1##_6*m1##_4*m2##_0+ \ + (flt_t)2.0*m1##_4*m1##_0*m2##_2- \ + (flt_t)2.0*m1##_3*m2##_2*m1##_1+ \ + m1##_3*m1##_7*m2##_0+m1##_6*m2##_1*m1##_1)*den; \ + \ + ans##_3 = s1##_1*(-m1##_4*m2##_5*m1##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_3+ \ + m1##_5*m1##_1*m2##_5-(flt_t)2.0*m1##_5*m2##_3*m1##_7+ \ + m2##_4*m1##_2*m1##_7-m2##_4*m1##_1*m1##_8- \ + m1##_3*m1##_8*m2##_4+m1##_6*m1##_5*m2##_4- \ + m2##_5*m1##_6*m1##_4+m1##_3*m2##_5*m1##_7)*den; \ + \ + ans##_4 = s1##_1*(m1##_2*m2##_3*m1##_7-m1##_1*m1##_8*m2##_3+ \ + (flt_t)2.0*m1##_8*m1##_0*m2##_4-m2##_5*m1##_0*m1##_5- \ + (flt_t)2.0*m1##_6*m2##_4*m1##_2-m1##_3*m1##_8*m2##_3+ \ + m1##_6*m1##_5*m2##_3+m1##_3*m2##_5*m1##_2- \ + m1##_0*m2##_5*m1##_7+m2##_5*m1##_1*m1##_6)*den; \ + \ + ans##_5 = s1##_1*(m1##_1*m1##_5*m2##_3-m1##_2*m2##_3*m1##_4- \ + 
m1##_0*m1##_5*m2##_4+m1##_3*m1##_2*m2##_4+ \ + (flt_t)2.0*m1##_4*m1##_0*m2##_5-m1##_0*m2##_4*m1##_7+ \ + m1##_1*m1##_6*m2##_4-m2##_3*m1##_6*m1##_4- \ + (flt_t)2.0*m1##_3*m1##_1*m2##_5+m1##_3*m2##_3*m1##_7)* \ + den; \ + \ + ans##_6 = s1##_2*(-m1##_4*m1##_2*m2##_8+m1##_1*m1##_5*m2##_8+ \ + (flt_t)2.0*m1##_4*m2##_6*m1##_8-m1##_1*m2##_7*m1##_8+ \ + m1##_2*m1##_7*m2##_7-(flt_t)2.0*m2##_6*m1##_7*m1##_5- \ + m1##_3*m2##_7*m1##_8+m1##_5*m1##_6*m2##_7- \ + m1##_4*m1##_6*m2##_8+m1##_7*m1##_3*m2##_8)*den; \ + \ + ans##_7 = s1##_2*-(m1##_1*m1##_8*m2##_6-m1##_2*m2##_6*m1##_7- \ + (flt_t)2.0*m2##_7*m1##_0*m1##_8+m1##_5*m2##_8*m1##_0+ \ + (flt_t)2.0*m2##_7*m1##_2*m1##_6+m1##_3*m2##_6*m1##_8- \ + m1##_3*m1##_2*m2##_8-m1##_5*m1##_6*m2##_6+ \ + m1##_0*m2##_8*m1##_7-m2##_8*m1##_1*m1##_6)*den; \ + \ + ans##_8 = s1##_2*(m1##_1*m1##_5*m2##_6-m1##_2*m2##_6*m1##_4- \ + m1##_0*m1##_5*m2##_7+m1##_3*m1##_2*m2##_7- \ + m1##_4*m1##_6*m2##_6-m1##_7*m2##_7*m1##_0+ \ + (flt_t)2.0*m1##_4*m2##_8*m1##_0+m1##_7*m1##_3*m2##_6+ \ + m1##_6*m1##_1*m2##_7-(flt_t)2.0*m2##_8*m1##_3*m1##_1)* \ + den; \ +} + +#define ME_vcopy4(dst,src) \ + dst##_0 = src##_0; \ + dst##_1 = src##_1; \ + dst##_2 = src##_2; \ + dst##_3 = src##_3; + +#define ME_mldivide3(m1, v_0, v_1, v_2, ans, error) \ +{ \ + flt_t aug_0, aug_1, aug_2, aug_3, aug_4, aug_5; \ + flt_t aug_6, aug_7, aug_8, aug_9, aug_10, aug_11, t; \ + \ + aug_3 = v_0; \ + aug_0 = m1##_0; \ + aug_1 = m1##_1; \ + aug_2 = m1##_2; \ + aug_7 = v_1; \ + aug_4 = m1##_3; \ + aug_5 = m1##_4; \ + aug_6 = m1##_5; \ + aug_11 = v_2; \ + aug_8 = m1##_6; \ + aug_9 = m1##_7; \ + aug_10 = m1##_8; \ + \ + if (fabs(aug_4) > fabs(aug_0)) { \ + flt_t swapt; \ + swapt = aug_0; aug_0 = aug_4; aug_4 = swapt; \ + swapt = aug_1; aug_1 = aug_5; aug_5 = swapt; \ + swapt = aug_2; aug_2 = aug_6; aug_6 = swapt; \ + swapt = aug_3; aug_3 = aug_7; aug_7 = swapt; \ + } \ + if (fabs(aug_8) > fabs(aug_0)) { \ + flt_t swapt; \ + swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \ + swapt = aug_1; aug_1 = 
aug_9; aug_9 = swapt; \ + swapt = aug_2; aug_2 = aug_10; aug_10 = swapt; \ + swapt = aug_3; aug_3 = aug_11; aug_11 = swapt; \ + } \ + \ + if (aug_0 != (flt_t)0.0) { \ + } else if (aug_4 != (flt_t)0.0) { \ + flt_t swapt; \ + swapt = aug_0; aug_0 = aug_4; aug_4 = swapt; \ + swapt = aug_1; aug_1 = aug_5; aug_5 = swapt; \ + swapt = aug_2; aug_2 = aug_6; aug_6 = swapt; \ + swapt = aug_3; aug_3 = aug_7; aug_7 = swapt; \ + } else if (aug_8 != (flt_t)0.0) { \ + flt_t swapt; \ + swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \ + swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \ + swapt = aug_2; aug_2 = aug_10; aug_10 = swapt; \ + swapt = aug_3; aug_3 = aug_11; aug_11 = swapt; \ + } else \ + error = 1; \ + \ + t = aug_4 / aug_0; \ + aug_5 -= t * aug_1; \ + aug_6 -= t * aug_2; \ + aug_7 -= t * aug_3; \ + t = aug_8 / aug_0; \ + aug_9 -= t * aug_1; \ + aug_10 -= t * aug_2; \ + aug_11 -= t * aug_3; \ + \ + if (fabs(aug_9) > fabs(aug_5)) { \ + flt_t swapt; \ + swapt = aug_4; aug_4 = aug_8; aug_8 = swapt; \ + swapt = aug_5; aug_5 = aug_9; aug_9 = swapt; \ + swapt = aug_6; aug_6 = aug_10; aug_10 = swapt; \ + swapt = aug_7; aug_7 = aug_11; aug_11 = swapt; \ + } \ + \ + if (aug_5 != (flt_t)0.0) { \ + } else if (aug_9 != (flt_t)0.0) { \ + flt_t swapt; \ + swapt = aug_4; aug_4 = aug_8; aug_8 = swapt; \ + swapt = aug_5; aug_5 = aug_9; aug_9 = swapt; \ + swapt = aug_6; aug_6 = aug_10; aug_10 = swapt; \ + swapt = aug_7; aug_7 = aug_11; aug_11 = swapt; \ + } \ + \ + t = aug_9 / aug_5; \ + aug_10 -= t * aug_6; \ + aug_11 -= t * aug_7; \ + \ + if (aug_10 == (flt_t)0.0) \ + error = 1; \ + \ + ans##_2 = aug_11/aug_10; \ + t = (flt_t)0.0; \ + t += aug_6 * ans##_2; \ + ans##_1 = (aug_7-t) / aug_5; \ + t = (flt_t)0.0; \ + t += aug_1 * ans##_1; \ + t += aug_2 * ans##_2; \ + ans##_0 = (aug_3 - t) / aug_0; \ +} + +#endif diff --git a/src/USER-INTEL/neigh_half_bin_intel.cpp b/src/USER-INTEL/neigh_half_bin_intel.cpp new file mode 100644 index 0000000000..a5f12a56f9 --- /dev/null +++ 
b/src/USER-INTEL/neigh_half_bin_intel.cpp @@ -0,0 +1,1453 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "neighbor.h" +#include "neigh_list.h" +#include "atom.h" +#include "comm.h" +#include "group.h" +#include "fix_intel.h" + +#if defined(_OPENMP) +#include +#endif + +using namespace LAMMPS_NS; + +#ifdef _LMP_INTEL_OFFLOAD +#pragma offload_attribute(push,target(mic)) +#endif + +template +inline int mcoord2bin(const flt_t x0, const flt_t x1, const flt_t x2, + const flt_t bboxlo0, const flt_t bboxlo1, + const flt_t bboxlo2, const flt_t bboxhi0, + const flt_t bboxhi1, const flt_t bboxhi2, + const flt_t bininvx, const flt_t bininvy, + const flt_t bininvz, const int nbinx, const int nbiny, + const int nbinz, const int mbinx, const int mbiny, + const int mbinz, const int mbinxlo, const int mbinylo, + const int mbinzlo) +{ + int ix, iy, iz; + + if (x0 >= bboxhi0) + ix = static_cast ((x0 - bboxhi0) * bininvx) + nbinx; + else if (x0 >= bboxlo0) { + ix = static_cast ((x0 - bboxlo0) * bininvx); + ix = MIN(ix, nbinx-1); + } else + ix = static_cast ((x0 - bboxlo0) * bininvx) - 1; + + if (x1 >= bboxhi1) + iy = static_cast ((x1 - bboxhi1) * bininvy) + nbiny; + else if (x1 >= bboxlo1) { + iy = 
static_cast ((x1 - bboxlo1) * bininvy); + iy = MIN(iy, nbiny-1); + } else + iy = static_cast ((x1 - bboxlo1) * bininvy) - 1; + + if (x2 >= bboxhi2) + iz = static_cast ((x2 - bboxhi2) * bininvz) + nbinz; + else if (x2 >= bboxlo2) { + iz = static_cast ((x2 - bboxlo2) * bininvz); + iz = MIN(iz, nbinz - 1); + } else + iz = static_cast ((x2 - bboxlo2) * bininvz) - 1; + + return (iz - mbinzlo) * mbiny * mbinx + (iy - mbinylo) * mbinx + + (ix - mbinxlo); +} + +#define ofind_special(which, special, nspecial, i, tag, special_flag) \ +{ \ + which = 0; \ + const int n1 = nspecial[i * 3]; \ + const int n2 = nspecial[i * 3 + 1]; \ + const int n3 = nspecial[i * 3 + 2]; \ + const int *sptr = special + i * maxspecial; \ + for (int s = 0; s < n3; s++) { \ + if (sptr[s] == tag) { \ + if (s < n1) { \ + if (special_flag[1] == 0) which = -1; \ + else if (special_flag[1] == 1) which = 0; \ + else which = 1; \ + } else if (s < n2) { \ + if (special_flag[2] == 0) which = -1; \ + else if (special_flag[2] == 1) which = 0; \ + else which = 2; \ + } else { \ + if (special_flag[3] == 0) which = -1; \ + else if (special_flag[3] == 1) which = 0; \ + else which = 3; \ + } \ + } \ + } \ +} + +#ifdef _LMP_INTEL_OFFLOAD +#pragma offload_attribute(pop) +#endif + +template +void Neighbor::bin_atoms(void * xin) { + const ATOM_T * restrict const x = (const ATOM_T * restrict const)xin; + int nlocal = atom->nlocal; + const int nall = nlocal + atom->nghost; + + const flt_t bininvx = this->bininvx; + const flt_t bininvy = this->bininvy; + const flt_t bininvz = this->bininvz; + const flt_t bboxlo0 = this->bboxlo[0]; + const flt_t bboxlo1 = this->bboxlo[1]; + const flt_t bboxlo2 = this->bboxlo[2]; + const flt_t bboxhi0 = this->bboxhi[0]; + const flt_t bboxhi1 = this->bboxhi[1]; + const flt_t bboxhi2 = this->bboxhi[2]; + + int i, ibin; + + for (i = 0; i < mbins; i++) binhead[i] = -1; + + int *mask = atom->mask; + + if (includegroup) { + int bitmask = group->bitmask[includegroup]; + for (i = nall-1; i >= 
nlocal; i--) { + if (mask[i] & bitmask) { + ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2, + bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz, nbinx, nbiny, + nbinz, mbinx, mbiny, mbinz, mbinxlo, mbinylo, mbinzlo); + bins[i] = binhead[ibin]; + binhead[ibin] = i; + } + } + for (i = atom->nfirst-1; i >= 0; i--) { + ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2, + bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz, nbinx, nbiny, + nbinz, mbinx, mbiny, mbinz, mbinxlo, mbinylo, mbinzlo); + bins[i] = binhead[ibin]; + binhead[ibin] = i; + } + } else { + for (i = nall-1; i >= 0; i--) { + ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2, + bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz, nbinx, nbiny, + nbinz, mbinx, mbiny, mbinz, mbinxlo, mbinylo, mbinzlo); + bins[i] = binhead[ibin]; + binhead[ibin] = i; + } + } +} + +/* ---------------------------------------------------------------------- + binned neighbor list construction with partial Newton's 3rd law + each owned atom i checks own bin and other bins in stencil + pair stored once if i,j are both owned and i < j + pair stored by me if j is ghost (also stored by proc owning j) +------------------------------------------------------------------------- */ + +void Neighbor::half_bin_no_newton_intel(NeighList *list) +{ + const int nlocal = (includegroup) ? 
atom->nfirst : atom->nlocal; + list->inum = nlocal; + + // Get fix for intel stuff + FixIntel *fix = static_cast(fix_intel); + + const int off_end = fix->offload_end_neighbor(); + int host_start = off_end;; + #ifdef _LMP_INTEL_OFFLOAD + if (fix->full_host_list()) host_start = 0; + if (exclude) + error->all(FLERR, "Exclusion lists not yet supported for Intel offload"); + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + hbnni(1, list, fix->get_mixed_buffers(), + 0, off_end, fix); + hbnni(0, list, fix->get_mixed_buffers(), + host_start, nlocal,fix); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + hbnni(1, list, fix->get_double_buffers(), + 0, off_end, fix); + hbnni(0, list, fix->get_double_buffers(), + host_start, nlocal, fix); + } else { + hbnni(1, list, fix->get_single_buffers(), + 0, off_end, fix); + hbnni(0, list, fix->get_single_buffers(), + host_start, nlocal, fix); + } +} + +template +void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in, + const int astart, const int aend, void *fix_in) { + IntelBuffers *buffers = (IntelBuffers *)buffers_in; + FixIntel *fix = (FixIntel *)fix_in; + const int nall = atom->nlocal + atom->nghost; + int pad = 1; + + if (offload) { + fix->start_watch(TIME_PACK); + buffers->grow(nall, atom->nlocal, comm->nthreads, aend); + buffers->grow_nbor(list, atom->nlocal, aend); + + ATOM_T biga; + biga.x = INTEL_BIGP; + biga.y = INTEL_BIGP; + biga.z = INTEL_BIGP; + biga.w = 1; + buffers->get_x()[nall] = biga; + + const int nthreads = comm->nthreads; + #if defined(_OPENMP) + #pragma omp parallel default(none) shared(buffers) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads, + sizeof(ATOM_T)); + buffers->thr_pack(ifrom, ito, 0); + } + fix->stop_watch(TIME_PACK); + + fix->start_watch(TIME_HOST_NEIGHBOR); + bin_atoms(buffers->get_x()); + if (INTEL_MIC_NBOR_PAD > 1) + pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); + } else { + 
fix->start_watch(TIME_HOST_NEIGHBOR); + if (INTEL_NBOR_PAD > 1) + pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); + } + const int pad_width = pad; + + if (aend-astart == 0) { + fix->stop_watch(TIME_HOST_NEIGHBOR); + return; + } + + const ATOM_T * restrict const x = buffers->get_x(); + int * restrict const firstneigh = buffers->firstneigh(list); + + const int molecular = atom->molecular; + int *ns = NULL, *s = NULL; + int tag_size, special_size; + if (molecular) { + s = atom->special[0]; + ns = atom->nspecial[0]; + tag_size = nall; + special_size = aend; + } else { + s = &buffers->_special_holder; + ns = &buffers->_nspecial_holder; + tag_size = 0; + special_size = 0; + } + const int * restrict const special = s; + const int * restrict const nspecial = ns; + const int maxspecial = atom->maxspecial; + const int * restrict const tag = atom->tag; + + int * restrict const ilist = list->ilist; + int * restrict numneigh = list->numneigh; + int * restrict const cnumneigh = buffers->cnumneigh(list); + const int nstencil = list->nstencil; + const int * restrict const stencil = list->stencil; + const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0]; + const int ntypes = atom->ntypes + 1; + const int nlocal = atom->nlocal; + + #ifndef _LMP_INTEL_OFFLOAD + int * const mask = atom->mask; + int * const molecule = atom->molecule; + #endif + + int tnum; + int *overflow; + double *timer_compute; + if (offload) { + timer_compute = fix->off_watch_neighbor(); + tnum = buffers->get_off_threads(); + overflow = fix->get_off_overflow_flag(); + fix->stop_watch(TIME_HOST_NEIGHBOR); + fix->start_watch(TIME_OFFLOAD_LATENCY); + } else { + tnum = comm->nthreads; + overflow = fix->get_overflow_flag(); + } + const int nthreads = tnum; + const int maxnbors = buffers->get_max_nbors(); + + const flt_t bboxlo0 = this->bboxlo[0]; + const flt_t bboxlo1 = this->bboxlo[1]; + const flt_t bboxlo2 = this->bboxlo[2]; + const flt_t bboxhi0 = this->bboxhi[0]; + const flt_t bboxhi1 = 
this->bboxhi[1]; + const flt_t bboxhi2 = this->bboxhi[2]; + const flt_t bininvx = this->bininvx; + const flt_t bininvy = this->bininvy; + const flt_t bininvz = this->bininvz; + + // Make sure dummy coordinates to eliminate loop remainder not within cutoff + { + const flt_t dx = (INTEL_BIGP - bboxhi0); + const flt_t dy = (INTEL_BIGP - bboxhi1); + const flt_t dz = (INTEL_BIGP - bboxhi2); + if (dx * dx + dy * dy + dz * dz < static_cast(cutneighmaxsq)) + error->one(FLERR, + "Intel package expects no atoms within cutoff of {1e15,1e15,1e15}."); + } + + #ifdef _LMP_INTEL_OFFLOAD + const int * restrict const binhead = this->binhead; + const int * restrict const special_flag = this->special_flag; + const int nbinx = this->nbinx; + const int nbiny = this->nbiny; + const int nbinz = this->nbinz; + const int mbinxlo = this->mbinxlo; + const int mbinylo = this->mbinylo; + const int mbinzlo = this->mbinzlo; + const int mbinx = this->mbinx; + const int mbiny = this->mbiny; + const int mbinz = this->mbinz; + const int * restrict const bins = this->bins; + const int cop = fix->coprocessor_number(); + const int separate_buffers = fix->separate_buffers(); + #pragma offload target(mic:cop) if(offload) \ + in(x:length(nall+1) alloc_if(0) free_if(0)) \ + in(tag:length(tag_size) alloc_if(0) free_if(0)) \ + in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \ + in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \ + in(bins:length(nall) alloc_if(0) free_if(0)) \ + in(binhead:length(mbins) alloc_if(0) free_if(0)) \ + in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ + in(firstneigh:length(0) alloc_if(0) free_if(0)) \ + in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ + out(numneigh:length(0) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) free_if(0)) \ + in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ + in(special_flag:length(0) alloc_if(0) free_if(0)) \ + in(maxnbors,nthreads,maxspecial,nstencil,nbinx,nbiny,nbinz) \ + 
in(mbinxlo,mbinylo,mbinzlo,mbinx,mbiny,mbinz,pad_width,offload) \ + in(bininvx,bininvy,bininvz,bboxlo0,bboxlo1,bboxlo2,separate_buffers) \ + in(bboxhi0, bboxhi1, bboxhi2, astart, aend, nlocal, molecular, ntypes) \ + out(overflow:length(5) alloc_if(0) free_if(0)) \ + out(timer_compute:length(1) alloc_if(0) free_if(0)) \ + signal(numneigh) + #endif + { + #ifdef __MIC__ + *timer_compute = MIC_Wtime(); + #endif + + #ifdef _LMP_INTEL_OFFLOAD + overflow[LMP_LOCAL_MIN] = astart; + overflow[LMP_LOCAL_MAX] = aend - 1; + overflow[LMP_GHOST_MIN] = nall; + overflow[LMP_GHOST_MAX] = -1; + #endif + + #if defined(_OPENMP) + #pragma omp parallel default(none) shared(numneigh,overflow) + #endif + { + #ifdef _LMP_INTEL_OFFLOAD + int lmin = nall, lmax = -1, gmin = nall, gmax = -1; + #endif + + const int num = aend - astart; + int tid, ifrom, ito; + IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads); + ifrom += astart; + ito += astart; + + int which; + + const int list_size = (ito + tid + 1) * maxnbors; + int ct = (ifrom + tid) * maxnbors; + int *neighptr = firstneigh + ct; + for (int i = ifrom; i < ito; i++) { + int j, k, n, n2, itype, jtype, ibin; + double xtmp, ytmp, ztmp, delx, dely, delz, rsq; + + n = 0; + n2 = maxnbors; + + xtmp = x[i].x; + ytmp = x[i].y; + ztmp = x[i].z; + itype = x[i].w; + const int ioffset = ntypes*itype; + + // loop over all atoms in other bins in stencil including self + // only store pair if i < j + // stores own/own pairs only once + // stores own/ghost pairs on both procs + + ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2, + bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz, + nbinx, nbiny, nbinz, mbinx, mbiny, mbinz, + mbinxlo, mbinylo, mbinzlo); + + for (k = 0; k < nstencil; k++) { + for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) { + if (j <= i) continue; + + jtype = x[j].w; + #ifndef _LMP_INTEL_OFFLOAD + if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue; + #endif + + delx = xtmp - x[j].x; + dely = 
ytmp - x[j].y; + delz = ztmp - x[j].z; + rsq = delx * delx + dely * dely + delz * delz; + if (rsq <= cutneighsq[ioffset + jtype]) { + if (j < nlocal) { + neighptr[n++] = j; + #ifdef _LMP_INTEL_OFFLOAD + if (j < lmin) lmin = j; + if (j > lmax) lmax = j; + #endif + } else { + neighptr[n2++] = j; + #ifdef _LMP_INTEL_OFFLOAD + if (j < gmin) gmin = j; + if (j > gmax) gmax = j; + #endif + } + } + } + } + ilist[i] = i; + + cnumneigh[i] = ct; + if (n > maxnbors) *overflow = 1; + for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k]; + while( (n % pad_width) != 0 ) neighptr[n++] = nall; + numneigh[i] = n; + while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++; + ct += n; + neighptr += n; + if (ct + n + maxnbors > list_size) { + *overflow = 1; + ct = (ifrom + tid) * maxnbors; + } + } + + if (*overflow == 1) + for (int i = ifrom; i < ito; i++) + numneigh[i] = 0; + + #ifdef _LMP_INTEL_OFFLOAD + if (separate_buffers) { + #if defined(_OPENMP) + #pragma omp critical + #endif + { + if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin; + if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax; + if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin; + if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax; + } + #pragma omp barrier + } + + int ghost_offset = 0, nall_offset = nall; + if (separate_buffers) { + int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; + if (nghost < 0) nghost = 0; + if (offload) { + ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; + nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; + } else { + ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; + nall_offset = nlocal + nghost; + } + } + #endif + + if (molecular) { + for (int i = ifrom; i < ito; ++i) { + int * restrict jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + for (int jj = 0; jj < jnum; jj++) { + const int j = jlist[jj]; + ofind_special(which, special, nspecial, i, tag[j], special_flag); 
+ #ifdef _LMP_INTEL_OFFLOAD + if (j >= nlocal) { + if (j == nall) + jlist[jj] = nall_offset; + else if (which > 0) + jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); + else jlist[jj]-=ghost_offset; + } else + #endif + if (which > 0) jlist[jj] = j ^ (which << SBBITS); + } + } + } + #ifdef _LMP_INTEL_OFFLOAD + else if (separate_buffers) { + for (int i = ifrom; i < ito; ++i) { + int * restrict jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + int jj = 0; + for (jj = 0; jj < jnum; jj++) + if (jlist[jj] >= nlocal) break; + while (jj < jnum) { + if (jlist[jj] == nall) jlist[jj] = nall_offset; + else jlist[jj] -= ghost_offset; + jj++; + } + } + } + #endif + } // end omp + #ifdef __MIC__ + *timer_compute = MIC_Wtime() - *timer_compute; + #endif + } // end offload + + if (offload) { + fix->stop_watch(TIME_OFFLOAD_LATENCY); + #ifdef _LMP_INTEL_OFFLOAD + for (int n = 0; n < aend; n++) { + ilist[n] = n; + numneigh[n] = 0; + } + #endif + } else { + for (int i = astart; i < aend; i++) + list->firstneigh[i] = firstneigh + cnumneigh[i]; + fix->stop_watch(TIME_HOST_NEIGHBOR); + #ifdef _LMP_INTEL_OFFLOAD + if (separate_buffers) { + fix->start_watch(TIME_PACK); + fix->set_neighbor_host_sizes(); + buffers->pack_sep_from_single(fix->host_min_local(), + fix->host_used_local(), + fix->host_min_ghost(), + fix->host_used_ghost()); + fix->stop_watch(TIME_PACK); + } + #endif + } +} + +/* ---------------------------------------------------------------------- + binned neighbor list construction with full Newton's 3rd law + each owned atom i checks its own bin and other bins in Newton stencil + every pair stored exactly once by some processor +------------------------------------------------------------------------- */ + +void Neighbor::half_bin_newton_intel(NeighList *list) +{ + const int nlocal = (includegroup) ? 
atom->nfirst : atom->nlocal; + list->inum = nlocal; + + // Get fix for intel stuff + FixIntel *fix = static_cast(fix_intel); + + const int off_end = fix->offload_end_neighbor(); + int host_start = fix->host_start_neighbor();; + int offload_noghost = 0; + #ifdef _LMP_INTEL_OFFLOAD + if (fix->full_host_list()) host_start = 0; + offload_noghost = fix->offload_noghost(); + if (exclude) + error->all(FLERR, "Exclusion lists not yet supported for Intel offload"); + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + if (offload_noghost) { + hbni(1, list, fix->get_mixed_buffers(), + 0, off_end, fix); + hbni(0, list, fix->get_mixed_buffers(), + host_start, nlocal, fix, off_end); + } else { + hbni(1, list, fix->get_mixed_buffers(), + 0, off_end, fix); + hbni(0, list, fix->get_mixed_buffers(), + host_start, nlocal, fix); + } + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + if (offload_noghost) { + hbni(1, list, fix->get_double_buffers(), + 0, off_end, fix); + hbni(0, list, fix->get_double_buffers(), + host_start, nlocal, fix, off_end); + } else { + hbni(1, list, fix->get_double_buffers(), + 0, off_end, fix); + hbni(0, list, fix->get_double_buffers(), + host_start, nlocal, fix); + } + } else { + if (offload_noghost) { + hbni(1, list, fix->get_single_buffers(), 0, off_end, fix); + hbni(0, list, fix->get_single_buffers(), + host_start, nlocal, fix, off_end); + } else { + hbni(1, list, fix->get_single_buffers(), 0, off_end, fix); + hbni(0, list, fix->get_single_buffers(), + host_start, nlocal, fix); + } + } +} + +template +void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, + const int astart, const int aend, void *fix_in, + const int offload_end) { + IntelBuffers *buffers = (IntelBuffers *)buffers_in; + FixIntel *fix = (FixIntel *)fix_in; + const int nall = atom->nlocal + atom->nghost; + int pad = 1; + + if (offload) { + fix->start_watch(TIME_PACK); + buffers->grow(nall, atom->nlocal, comm->nthreads, aend); + 
buffers->grow_nbor(list, atom->nlocal, aend); + + ATOM_T biga; + biga.x = INTEL_BIGP; + biga.y = INTEL_BIGP; + biga.z = INTEL_BIGP; + biga.w = 1; + buffers->get_x()[nall]=biga; + + const int nthreads = comm->nthreads; + #if defined(_OPENMP) + #pragma omp parallel default(none) shared(buffers) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads, + sizeof(ATOM_T)); + buffers->thr_pack(ifrom, ito, 0); + } + fix->stop_watch(TIME_PACK); + + fix->start_watch(TIME_HOST_NEIGHBOR); + bin_atoms(buffers->get_x()); + if (INTEL_MIC_NBOR_PAD > 1) + pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); + } else { + fix->start_watch(TIME_HOST_NEIGHBOR); + if (INTEL_NBOR_PAD > 1) + pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); + } + const int pad_width = pad; + + if (aend-astart == 0) { + fix->stop_watch(TIME_HOST_NEIGHBOR); + return; + } + + const ATOM_T * restrict const x = buffers->get_x(); + int * restrict const firstneigh = buffers->firstneigh(list); + int nall_t = nall; + if (offload_noghost && offload) nall_t = atom->nlocal; + const int e_nall = nall_t; + + const int molecular = atom->molecular; + int *ns = NULL, *s = NULL; + int tag_size, special_size; + if (molecular) { + s = atom->special[0]; + ns = atom->nspecial[0]; + tag_size = e_nall; + special_size = aend; + } else { + s = &buffers->_special_holder; + ns = &buffers->_nspecial_holder; + tag_size = 0; + special_size = 0; + } + const int * restrict const special = s; + const int * restrict const nspecial = ns; + const int maxspecial = atom->maxspecial; + const int * restrict const tag = atom->tag; + + int * restrict const ilist = list->ilist; + int * restrict numneigh = list->numneigh; + int * restrict const cnumneigh = buffers->cnumneigh(list); + const int nstencil = list->nstencil; + const int * restrict const stencil = list->stencil; + const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0]; + const int ntypes = atom->ntypes + 1; + const int 
nlocal = atom->nlocal; + + #ifndef _LMP_INTEL_OFFLOAD + int * const mask = atom->mask; + int * const molecule = atom->molecule; + #endif + + int tnum; + int *overflow; + double *timer_compute; + if (offload) { + timer_compute = fix->off_watch_neighbor(); + tnum = buffers->get_off_threads(); + overflow = fix->get_off_overflow_flag(); + fix->stop_watch(TIME_HOST_NEIGHBOR); + fix->start_watch(TIME_OFFLOAD_LATENCY); + } else { + tnum = comm->nthreads; + overflow = fix->get_overflow_flag(); + } + const int nthreads = tnum; + const int maxnbors = buffers->get_max_nbors(); + + const flt_t bboxlo0 = this->bboxlo[0]; + const flt_t bboxlo1 = this->bboxlo[1]; + const flt_t bboxlo2 = this->bboxlo[2]; + const flt_t bboxhi0 = this->bboxhi[0]; + const flt_t bboxhi1 = this->bboxhi[1]; + const flt_t bboxhi2 = this->bboxhi[2]; + const flt_t bininvx = this->bininvx; + const flt_t bininvy = this->bininvy; + const flt_t bininvz = this->bininvz; + // Make sure dummy coordinates to eliminate loop remainder not within cutoff + { + const flt_t dx = (INTEL_BIGP - bboxhi0); + const flt_t dy = (INTEL_BIGP - bboxhi1); + const flt_t dz = (INTEL_BIGP - bboxhi2); + if (dx * dx + dy * dy + dz * dz < static_cast(cutneighmaxsq)) + error->one(FLERR, + "Intel package expects no atoms within cutoff of {1e15,1e15,1e15}."); + } + + #ifdef _LMP_INTEL_OFFLOAD + const int * restrict const binhead = this->binhead; + const int * restrict const special_flag = this->special_flag; + const int nbinx = this->nbinx; + const int nbiny = this->nbiny; + const int nbinz = this->nbinz; + const int mbinxlo = this->mbinxlo; + const int mbinylo = this->mbinylo; + const int mbinzlo = this->mbinzlo; + const int mbinx = this->mbinx; + const int mbiny = this->mbiny; + const int mbinz = this->mbinz; + const int * restrict const bins = this->bins; + const int cop = fix->coprocessor_number(); + const int separate_buffers = fix->separate_buffers(); + #pragma offload target(mic:cop) if(offload) \ + in(x:length(e_nall+1) alloc_if(0) 
free_if(0)) \ + in(tag:length(tag_size) alloc_if(0) free_if(0)) \ + in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \ + in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \ + in(bins:length(nall) alloc_if(0) free_if(0)) \ + in(binhead:length(mbins) alloc_if(0) free_if(0)) \ + in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ + in(firstneigh:length(0) alloc_if(0) free_if(0)) \ + in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ + out(numneigh:length(0) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) free_if(0)) \ + in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ + in(special_flag:length(0) alloc_if(0) free_if(0)) \ + in(maxnbors,nthreads,maxspecial,nstencil,nbinx,nbiny,nbinz,e_nall,offload)\ + in(mbinxlo,mbinylo,mbinzlo,mbinx,mbiny,mbinz,pad_width,offload_end) \ + in(bininvx,bininvy,bininvz,bboxlo0,bboxlo1,bboxlo2,separate_buffers) \ + in(bboxhi0, bboxhi1, bboxhi2, astart, aend, nlocal, molecular, ntypes) \ + out(overflow:length(5) alloc_if(0) free_if(0)) \ + out(timer_compute:length(1) alloc_if(0) free_if(0)) \ + signal(numneigh) + #endif + { + #ifdef __MIC__ + *timer_compute = MIC_Wtime(); + #endif + + #ifdef _LMP_INTEL_OFFLOAD + overflow[LMP_LOCAL_MIN] = astart; + overflow[LMP_LOCAL_MAX] = aend - 1; + overflow[LMP_GHOST_MIN] = e_nall; + overflow[LMP_GHOST_MAX] = -1; + #endif + + #if defined(_OPENMP) + #pragma omp parallel default(none) shared(numneigh, overflow) + #endif + { + #ifdef _LMP_INTEL_OFFLOAD + int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1; + #endif + + const int num = aend - astart; + int tid, ifrom, ito; + IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads); + ifrom += astart; + ito += astart; + + int which; + + const int list_size = (ito + tid + 1) * maxnbors; + int ct = (ifrom + tid) * maxnbors; + int *neighptr = firstneigh + ct; + for (int i = ifrom; i < ito; i++) { + int j, k, n, n2, itype, jtype, ibin; + double xtmp, ytmp, ztmp, delx, dely, delz, rsq; + + n = 0; + n2 = maxnbors; + + xtmp = 
x[i].x; + ytmp = x[i].y; + ztmp = x[i].z; + itype = x[i].w; + const int ioffset = ntypes * itype; + + // loop over rest of atoms in i's bin, ghosts are at end of linked list + // if j is owned atom, store it, since j is beyond i in linked list + // if j is ghost, only store if j coords are "above/to the right" of i + + for (j = bins[i]; j >= 0; j = bins[j]) { + if (j >= nlocal) { + if (offload_noghost && offload) continue; + if (x[j].z < ztmp) continue; + if (x[j].z == ztmp) { + if (x[j].y < ytmp) continue; + if (x[j].y == ytmp && x[j].x < xtmp) continue; + } + } else if (offload_noghost && i < offload_end) continue; + + jtype = x[j].w; + #ifndef _LMP_INTEL_OFFLOAD + if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue; + #endif + + delx = xtmp - x[j].x; + dely = ytmp - x[j].y; + delz = ztmp - x[j].z; + rsq = delx * delx + dely * dely + delz * delz; + + if (rsq <= cutneighsq[ioffset + jtype]) { + if (j < nlocal) { + neighptr[n++] = j; + #ifdef _LMP_INTEL_OFFLOAD + if (j < lmin) lmin = j; + if (j > lmax) lmax = j; + #endif + } else { + neighptr[n2++] = j; + #ifdef _LMP_INTEL_OFFLOAD + if (j < gmin) gmin = j; + if (j > gmax) gmax = j; + #endif + } + } + } + // loop over all atoms in other bins in stencil, store every pair + + ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2, + bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz, + nbinx, nbiny, nbinz, mbinx, mbiny, mbinz, + mbinxlo, mbinylo, mbinzlo); + + for (k = 0; k < nstencil; k++) { + for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) { + if (offload_noghost) { + if (j < nlocal) { + if (i < offload_end) continue; + } else if (offload) continue; + } + + jtype = x[j].w; + #ifndef _LMP_INTEL_OFFLOAD + if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue; + #endif + + delx = xtmp - x[j].x; + dely = ytmp - x[j].y; + delz = ztmp - x[j].z; + rsq = delx * delx + dely * dely + delz * delz; + if (rsq <= cutneighsq[ioffset + jtype]) { + if (j < nlocal) { + neighptr[n++] 
= j; + #ifdef _LMP_INTEL_OFFLOAD + if (j < lmin) lmin = j; + if (j > lmax) lmax = j; + #endif + } else { + neighptr[n2++] = j; + #ifdef _LMP_INTEL_OFFLOAD + if (j < gmin) gmin = j; + if (j > gmax) gmax = j; + #endif + } + } + } + } + ilist[i] = i; + + cnumneigh[i] = ct; + if (n > maxnbors) *overflow = 1; + for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k]; + while( (n % pad_width) != 0 ) neighptr[n++] = e_nall; + numneigh[i] = n; + while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++; + ct += n; + neighptr += n; + if (ct + n + maxnbors > list_size) { + *overflow = 1; + ct = (ifrom + tid) * maxnbors; + } + } + + if (*overflow == 1) + for (int i = ifrom; i < ito; i++) + numneigh[i] = 0; + + #ifdef _LMP_INTEL_OFFLOAD + if (separate_buffers) { + #if defined(_OPENMP) + #pragma omp critical + #endif + { + if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin; + if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax; + if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin; + if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax; + } + #pragma omp barrier + } + + int ghost_offset = 0, nall_offset = e_nall; + if (separate_buffers) { + int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; + if (nghost < 0) nghost = 0; + if (offload) { + ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; + nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; + } else { + ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; + nall_offset = nlocal + nghost; + } + } + #endif + + if (molecular) { + for (int i = ifrom; i < ito; ++i) { + int * restrict jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + for (int jj = 0; jj < jnum; jj++) { + const int j = jlist[jj]; + ofind_special(which, special, nspecial, i, tag[j], + special_flag); + #ifdef _LMP_INTEL_OFFLOAD + if (j >= nlocal) { + if (j == e_nall) + jlist[jj] = nall_offset; + else if (which > 0) + jlist[jj] = (j-ghost_offset) ^ (which << 
SBBITS); + else jlist[jj]-=ghost_offset; + } else + #endif + if (which > 0) jlist[jj] = j ^ (which << SBBITS); + } + } + } + #ifdef _LMP_INTEL_OFFLOAD + else if (separate_buffers) { + for (int i = ifrom; i < ito; ++i) { + int * restrict jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + int jj = 0; + for (jj = 0; jj < jnum; jj++) + if (jlist[jj] >= nlocal) break; + while (jj < jnum) { + if (jlist[jj] == e_nall) jlist[jj] = nall_offset; + else jlist[jj] -= ghost_offset; + jj++; + } + } + } + #endif + } // end omp + #ifdef __MIC__ + *timer_compute = MIC_Wtime() - *timer_compute; + #endif + } // end offload + + if (offload) { + fix->stop_watch(TIME_OFFLOAD_LATENCY); + #ifdef _LMP_INTEL_OFFLOAD + for (int n = 0; n < aend; n++) { + ilist[n] = n; + numneigh[n] = 0; + } + #endif + } else { + for (int i = astart; i < aend; i++) + list->firstneigh[i] = firstneigh + cnumneigh[i]; + fix->stop_watch(TIME_HOST_NEIGHBOR); + #ifdef _LMP_INTEL_OFFLOAD + if (separate_buffers) { + fix->start_watch(TIME_PACK); + fix->set_neighbor_host_sizes(); + buffers->pack_sep_from_single(fix->host_min_local(), + fix->host_used_local(), + fix->host_min_ghost(), + fix->host_used_ghost()); + fix->stop_watch(TIME_PACK); + } + #endif + } +} + +/* ---------------------------------------------------------------------- + binned neighbor list construction with Newton's 3rd law for triclinic + each owned atom i checks its own bin and other bins in triclinic stencil + every pair stored exactly once by some processor +------------------------------------------------------------------------- */ + +void Neighbor::half_bin_newton_tri_intel(NeighList *list) +{ + const int nlocal = (includegroup) ? 
atom->nfirst : atom->nlocal; + list->inum = nlocal; + + // Get fix for intel stuff + FixIntel *fix = static_cast(fix_intel); + + const int off_end = fix->offload_end_neighbor(); + int host_start = fix->host_start_neighbor(); + int offload_noghost = 0; + #ifdef _LMP_INTEL_OFFLOAD + if (fix->full_host_list()) host_start = 0; + offload_noghost = fix->offload_noghost(); + if (exclude) + error->all(FLERR, "Exclusion lists not yet supported for Intel offload"); + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + if (offload_noghost) { + hbnti(1, list, fix->get_mixed_buffers(), + 0, off_end, fix); + hbnti(0, list, fix->get_mixed_buffers(), + host_start, nlocal, fix, off_end); + } else { + hbnti(1, list, fix->get_mixed_buffers(), + 0, off_end, fix); + hbnti(0, list, fix->get_mixed_buffers(), + host_start, nlocal, fix); + } + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + if (offload_noghost) { + hbnti(1, list, fix->get_double_buffers(), + 0, off_end, fix); + hbnti(0, list, fix->get_double_buffers(), + host_start, nlocal, fix, off_end); + } else { + hbnti(1, list, fix->get_double_buffers(), + 0, off_end, fix); + hbnti(0, list, fix->get_double_buffers(), + host_start, nlocal, fix); + } + } else { + if (offload_noghost) { + hbnti(1, list, fix->get_single_buffers(), + 0, off_end, fix); + hbnti(0, list, fix->get_single_buffers(), + host_start, nlocal, fix, off_end); + } else { + hbnti(1, list, fix->get_single_buffers(), + 0, off_end, fix); + hbnti(0, list, fix->get_single_buffers(), + host_start, nlocal, fix); + } + } +} + +template +void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in, + const int astart, const int aend, void *fix_in, + const int offload_end) { + IntelBuffers *buffers = (IntelBuffers *)buffers_in; + FixIntel *fix = (FixIntel *)fix_in; + const int nall = atom->nlocal + atom->nghost; + int pad = 1; + + if (offload) { + fix->start_watch(TIME_PACK); + buffers->grow(nall, atom->nlocal, comm->nthreads, aend); + 
buffers->grow_nbor(list, atom->nlocal, aend); + + ATOM_T biga; + biga.x = INTEL_BIGP; + biga.y = INTEL_BIGP; + biga.z = INTEL_BIGP; + biga.w = 1; + buffers->get_x()[nall]=biga; + + const int nthreads = comm->nthreads; + #if defined(_OPENMP) + #pragma omp parallel default(none) shared(buffers) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads, + sizeof(ATOM_T)); + buffers->thr_pack(ifrom, ito, 0); + } + fix->stop_watch(TIME_PACK); + + fix->start_watch(TIME_HOST_NEIGHBOR); + bin_atoms(buffers->get_x()); + if (INTEL_MIC_NBOR_PAD > 1) + pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); + } else { + fix->start_watch(TIME_HOST_NEIGHBOR); + if (INTEL_NBOR_PAD > 1) + pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); + } + const int pad_width = pad; + + if (aend-astart == 0) { + fix->stop_watch(TIME_HOST_NEIGHBOR); + return; + } + + const ATOM_T * restrict const x = buffers->get_x(); + int * restrict const firstneigh = buffers->firstneigh(list); + int nall_t = nall; + if (offload_noghost && offload) nall_t = atom->nlocal; + const int e_nall = nall_t; + + const int molecular = atom->molecular; + int *ns = NULL, *s = NULL; + int tag_size, special_size; + if (molecular) { + s = atom->special[0]; + ns = atom->nspecial[0]; + tag_size = e_nall; + special_size = aend; + } else { + s = &buffers->_special_holder; + ns = &buffers->_nspecial_holder; + tag_size = 0; + special_size = 0; + } + const int * restrict const special = s; + const int * restrict const nspecial = ns; + const int maxspecial = atom->maxspecial; + const int * restrict const tag = atom->tag; + + int * restrict const ilist = list->ilist; + int * restrict numneigh = list->numneigh; + int * restrict const cnumneigh = buffers->cnumneigh(list); + const int nstencil = list->nstencil; + const int * restrict const stencil = list->stencil; + const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0]; + const int ntypes = atom->ntypes + 1; + const int 
nlocal = atom->nlocal; + + #ifndef _LMP_INTEL_OFFLOAD + int * const mask = atom->mask; + int * const molecule = atom->molecule; + #endif + + int tnum; + int *overflow; + double *timer_compute; + if (offload) { + timer_compute = fix->off_watch_neighbor(); + tnum = buffers->get_off_threads(); + overflow = fix->get_off_overflow_flag(); + fix->stop_watch(TIME_HOST_NEIGHBOR); + fix->start_watch(TIME_OFFLOAD_LATENCY); + } else { + tnum = comm->nthreads; + overflow = fix->get_overflow_flag(); + } + const int nthreads = tnum; + const int maxnbors = buffers->get_max_nbors(); + + const flt_t bboxlo0 = this->bboxlo[0]; + const flt_t bboxlo1 = this->bboxlo[1]; + const flt_t bboxlo2 = this->bboxlo[2]; + const flt_t bboxhi0 = this->bboxhi[0]; + const flt_t bboxhi1 = this->bboxhi[1]; + const flt_t bboxhi2 = this->bboxhi[2]; + const flt_t bininvx = this->bininvx; + const flt_t bininvy = this->bininvy; + const flt_t bininvz = this->bininvz; + // Make sure dummy coordinates to eliminate loop remainder not within cutoff + { + const flt_t dx = (INTEL_BIGP - bboxhi0); + const flt_t dy = (INTEL_BIGP - bboxhi1); + const flt_t dz = (INTEL_BIGP - bboxhi2); + if (dx * dx + dy * dy + dz * dz < static_cast(cutneighmaxsq)) + error->one(FLERR, + "Intel package expects no atoms within cutoff of {1e15,1e15,1e15}."); + } + + #ifdef _LMP_INTEL_OFFLOAD + const int * restrict const binhead = this->binhead; + const int * restrict const special_flag = this->special_flag; + const int nbinx = this->nbinx; + const int nbiny = this->nbiny; + const int nbinz = this->nbinz; + const int mbinxlo = this->mbinxlo; + const int mbinylo = this->mbinylo; + const int mbinzlo = this->mbinzlo; + const int mbinx = this->mbinx; + const int mbiny = this->mbiny; + const int mbinz = this->mbinz; + const int * restrict const bins = this->bins; + const int cop = fix->coprocessor_number(); + const int separate_buffers = fix->separate_buffers(); + #pragma offload target(mic:cop) if(offload) \ + in(x:length(e_nall+1) alloc_if(0) 
free_if(0)) \ + in(tag:length(tag_size) alloc_if(0) free_if(0)) \ + in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \ + in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \ + in(bins:length(nall) alloc_if(0) free_if(0)) \ + in(binhead:length(mbins) alloc_if(0) free_if(0)) \ + in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ + in(firstneigh:length(0) alloc_if(0) free_if(0)) \ + in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ + out(numneigh:length(0) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) free_if(0)) \ + in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ + in(special_flag:length(0) alloc_if(0) free_if(0)) \ + in(maxnbors,nthreads,maxspecial,nstencil,nbinx,nbiny,nbinz,offload_end) \ + in(mbinxlo,mbinylo,mbinzlo,mbinx,mbiny,mbinz,pad_width,e_nall,offload) \ + in(bininvx,bininvy,bininvz,bboxlo0,bboxlo1,bboxlo2,separate_buffers) \ + in(bboxhi0, bboxhi1, bboxhi2, astart, aend, nlocal, molecular, ntypes) \ + out(overflow:length(5) alloc_if(0) free_if(0)) \ + out(timer_compute:length(1) alloc_if(0) free_if(0)) \ + signal(numneigh) + #endif + { + #ifdef __MIC__ + *timer_compute = MIC_Wtime(); + #endif + + #ifdef _LMP_INTEL_OFFLOAD + overflow[LMP_LOCAL_MIN] = astart; + overflow[LMP_LOCAL_MAX] = aend - 1; + overflow[LMP_GHOST_MIN] = e_nall; + overflow[LMP_GHOST_MAX] = -1; + #endif + + #if defined(_OPENMP) + #pragma omp parallel default(none) shared(numneigh, overflow) + #endif + { + #ifdef _LMP_INTEL_OFFLOAD + int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1; + #endif + + const int num = aend-astart; + int tid, ifrom, ito; + IP_PRE_omp_range_id(ifrom,ito,tid,num,nthreads); + ifrom += astart; + ito += astart; + + int which; + + const int list_size = (ito + tid + 1) * maxnbors; + int ct = (ifrom + tid) * maxnbors; + int *neighptr = firstneigh + ct; + for (int i = ifrom; i < ito; i++) { + int j, k, n, n2, itype, jtype, ibin; + double xtmp, ytmp, ztmp, delx, dely, delz, rsq; + + n = 0; + n2 = maxnbors; + + xtmp = x[i].x; + 
ytmp = x[i].y; + ztmp = x[i].z; + itype = x[i].w; + const int ioffset = ntypes * itype; + + // loop over all atoms in bins in stencil + // pairs for atoms j "below" i are excluded + // below = lower z or (equal z and lower y) or (equal zy and lower x) + // (equal zyx and j <= i) + // latter excludes self-self interaction but allows superposed atoms + + ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2, + bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz, + nbinx, nbiny, nbinz, mbinx, mbiny, mbinz, + mbinxlo, mbinylo, mbinzlo); + + for (k = 0; k < nstencil; k++) { + for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) { + if (offload_noghost) { + if (j < nlocal) { + if (i < offload_end) continue; + } else if (offload) continue; + } + + if (x[j].z < ztmp) continue; + if (x[j].z == ztmp) { + if (x[j].y < ytmp) continue; + if (x[j].y == ytmp) { + if (x[j].x < xtmp) continue; + if (x[j].x == xtmp && j <= i) continue; + } + } + + jtype = x[j].w; + #ifndef _LMP_INTEL_OFFLOAD + if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue; + #endif + + delx = xtmp - x[j].x; + dely = ytmp - x[j].y; + delz = ztmp - x[j].z; + rsq = delx * delx + dely * dely + delz * delz; + if (rsq <= cutneighsq[ioffset + jtype]) { + if (j < nlocal) { + neighptr[n++] = j; + #ifdef _LMP_INTEL_OFFLOAD + if (j < lmin) lmin = j; + if (j > lmax) lmax = j; + #endif + } else { + neighptr[n2++] = j; + #ifdef _LMP_INTEL_OFFLOAD + if (j < gmin) gmin = j; + if (j > gmax) gmax = j; + #endif + } + } + } + } + ilist[i] = i; + + cnumneigh[i] = ct; + if (n > maxnbors) *overflow = 1; + for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k]; + while( (n % pad_width) != 0 ) neighptr[n++] = e_nall; + numneigh[i] = n; + while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++; + ct += n; + neighptr += n; + if (ct + n + maxnbors > list_size) { + *overflow = 1; + ct = (ifrom + tid) * maxnbors; + } + } + + if (*overflow == 1) + for (int i = ifrom; i < ito; i++) + numneigh[i] = 0; + + 
#ifdef _LMP_INTEL_OFFLOAD + if (separate_buffers) { + #if defined(_OPENMP) + #pragma omp critical + #endif + { + if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin; + if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax; + if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin; + if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax; + } + #pragma omp barrier + } + + int ghost_offset = 0, nall_offset = e_nall; + if (separate_buffers) { + int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; + if (nghost < 0) nghost = 0; + if (offload) { + ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; + nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; + } else { + ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; + nall_offset = nlocal + nghost; + } + } + #endif + + if (molecular) { + for (int i = ifrom; i < ito; ++i) { + int * restrict jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + for (int jj = 0; jj < jnum; jj++) { + const int j = jlist[jj]; + ofind_special(which, special, nspecial, i, tag[j], special_flag); + #ifdef _LMP_INTEL_OFFLOAD + if (j >= nlocal) { + if (j == e_nall) + jlist[jj] = nall_offset; + else if (which > 0) + jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); + else jlist[jj]-=ghost_offset; + } else + #endif + if (which > 0) jlist[jj] = j ^ (which << SBBITS); + } + } + } + #ifdef _LMP_INTEL_OFFLOAD + else if (separate_buffers) { + for (int i = ifrom; i < ito; ++i) { + int * restrict jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + int jj = 0; + for (jj = 0; jj < jnum; jj++) + if (jlist[jj] >= nlocal) break; + while (jj < jnum) { + if (jlist[jj] == e_nall) jlist[jj] = nall_offset; + else jlist[jj] -= ghost_offset; + jj++; + } + } + } + #endif + } // end omp + #ifdef __MIC__ + *timer_compute = MIC_Wtime() - *timer_compute; + #endif + } // end offload + + if (offload) { + fix->stop_watch(TIME_OFFLOAD_LATENCY); + #ifdef 
_LMP_INTEL_OFFLOAD + for (int n = 0; n < aend; n++) { + ilist[n] = n; + numneigh[n] = 0; + } + #endif + } else { + for (int i = astart; i < aend; i++) + list->firstneigh[i] = firstneigh + cnumneigh[i]; + fix->stop_watch(TIME_HOST_NEIGHBOR); + #ifdef _LMP_INTEL_OFFLOAD + if (separate_buffers) { + fix->start_watch(TIME_PACK); + fix->set_neighbor_host_sizes(); + buffers->pack_sep_from_single(fix->host_min_local(), + fix->host_used_local(), + fix->host_min_ghost(), + fix->host_used_ghost()); + fix->stop_watch(TIME_PACK); + } + #endif + } +} diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp new file mode 100644 index 0000000000..46e608c92f --- /dev/null +++ b/src/USER-INTEL/pair_gayberne_intel.cpp @@ -0,0 +1,1075 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + This software is distributed under the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. 
Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "math.h" +#include "pair_gayberne_intel.h" +#include "math_extra_intel.h" +#include "atom.h" +#include "comm.h" +#include "atom_vec_ellipsoid.h" +#include "force.h" +#include "memory.h" +#include "modify.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" + +#include "suffix.h" +using namespace LAMMPS_NS; + +#define FC_PACKED1_T typename ForceConst::fc_packed1 +#define FC_PACKED2_T typename ForceConst::fc_packed2 +#define FC_PACKED3_T typename ForceConst::fc_packed3 + +/* ---------------------------------------------------------------------- */ + +PairGayBerneIntel::PairGayBerneIntel(LAMMPS *lmp) : + PairGayBerne(lmp) +{ + suffix_flag |= Suffix::INTEL; + respa_enable = 0; +} + +/* ---------------------------------------------------------------------- */ + +void PairGayBerneIntel::compute(int eflag, int vflag) +{ + if (fix->precision()==FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); + + fix->balance_stamp(); + vflag_fdotr = 0; +} + +template +void PairGayBerneIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) { + ev_setup(eflag, vflag); + } else evflag = vflag_fdotr = 0; + + const int inum = list->inum; + const int nall = atom->nlocal + atom->nghost; + const int nthreads = comm->nthreads; + const int host_start = fix->host_start_pair(); + const int offload_end = fix->offload_end_pair(); + const int ago = neighbor->ago; + + if (fix->separate_buffers() == 0) { + fix->start_watch(TIME_PACK); + const AtomVecEllipsoid::Bonus * const bonus = avec->bonus; + const int * const ellipsoid = atom->ellipsoid; + QUAT_T * 
restrict const quat = buffers->get_quat(); + #if defined(_OPENMP) + #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads, + sizeof(ATOM_T)); + if (ago != 0) buffers->thr_pack(ifrom,ito,ago); + + for (int i = ifrom; i < ito; i++) { + int qi = ellipsoid[i]; + if (qi > -1) { + quat[i].w = bonus[qi].quat[0]; + quat[i].i = bonus[qi].quat[1]; + quat[i].j = bonus[qi].quat[2]; + quat[i].k = bonus[qi].quat[3]; + } + } + } + quat[nall].w = (flt_t)1.0; + quat[nall].i = (flt_t)0.0; + quat[nall].j = (flt_t)0.0; + quat[nall].k = (flt_t)0.0; + fix->stop_watch(TIME_PACK); + } + + if (evflag || vflag_fdotr) { + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + if (force->newton_pair) { + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); + } + } else { + if (force->newton_pair) { + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); + } + } + } else { + if (force->newton_pair) { + eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); + eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + } else { + eval<0,0,0>(1, 0, buffers, fc, 0, offload_end); + eval<0,0,0>(0, 0, buffers, fc, host_start, inum); + } + } +} + +template +void PairGayBerneIntel::eval(const int offload, const int vflag, + IntelBuffers *buffers, + const ForceConst &fc, + const int astart, const int aend) +{ + const int inum = aend - astart; + if (inum == 0) return; + int nlocal, nall, minlocal; + fix->get_buffern(offload, nlocal, nall, minlocal); + + const int ago = neighbor->ago; + ATOM_T * restrict const x = 
buffers->get_x(offload); + QUAT_T * restrict const quat = buffers->get_quat(offload); + const AtomVecEllipsoid::Bonus *bonus = avec->bonus; + const int *ellipsoid = atom->ellipsoid; + + #ifdef _LMP_INTEL_OFFLOAD + if (fix->separate_buffers()) { + fix->start_watch(TIME_PACK); + if (offload) { + #pragma omp parallel default(none) \ + shared(buffers,nlocal,nall,bonus,ellipsoid) + { + int ifrom, ito, tid; + int nthreads = comm->nthreads; + IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal, + nthreads, sizeof(ATOM_T)); + if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0); + for (int i = ifrom; i < ito; i++) { + int qi = ellipsoid[i]; + if (qi > -1) { + quat[i].w = bonus[qi].quat[0]; + quat[i].i = bonus[qi].quat[1]; + quat[i].j = bonus[qi].quat[2]; + quat[i].k = bonus[qi].quat[3]; + } + } + int nghost = nall - nlocal; + if (nghost) { + IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal, + nthreads, sizeof(ATOM_T)); + int offset = 0; + ifrom += nlocal; + ito += nlocal; + if (ago != 0) { + offset = fix->offload_min_ghost() - nlocal; + buffers->thr_pack_cop(ifrom, ito, offset, ago == 1); + } + for (int i = ifrom; i < ito; i++) { + int qi = ellipsoid[i + offset]; + if (qi > -1) { + quat[i].w = bonus[qi].quat[0]; + quat[i].i = bonus[qi].quat[1]; + quat[i].j = bonus[qi].quat[2]; + quat[i].k = bonus[qi].quat[3]; + } + } + } + } + } else { + if (ago != 0) buffers->thr_pack_host(fix->host_min_local(), nlocal, 0); + for (int i = fix->host_min_local(); i < nlocal; i++) { + int qi = ellipsoid[i]; + if (qi > -1) { + quat[i].w = bonus[qi].quat[0]; + quat[i].i = bonus[qi].quat[1]; + quat[i].j = bonus[qi].quat[2]; + quat[i].k = bonus[qi].quat[3]; + } + } + int offset = fix->host_min_ghost() - nlocal; + if (ago != 0) buffers->thr_pack_host(nlocal, nall, offset); + for (int i = nlocal; i < nall; i++) { + int qi = ellipsoid[i + offset]; + if (qi > -1) { + quat[i].w = bonus[qi].quat[0]; + quat[i].i = bonus[qi].quat[1]; + quat[i].j = bonus[qi].quat[2]; + quat[i].k = bonus[qi].quat[3]; + } 
+ } + } + fix->stop_watch(TIME_PACK); + } + #endif + + // const int * restrict const ilist = list->ilist; + const int * restrict const numneigh = list->numneigh; + const int * restrict const cnumneigh = buffers->cnumneigh(list); + const int * restrict const firstneigh = buffers->firstneigh(list); + const flt_t * restrict const special_lj = fc.special_lj; + + const FC_PACKED1_T * restrict const ijc = fc.ijc[0]; + const FC_PACKED2_T * restrict const lj34 = fc.lj34[0]; + const FC_PACKED3_T * restrict const ic = fc.ic; + const flt_t mu = fc.mu; + const flt_t gamma = fc.gamma; + const flt_t upsilon = fc.upsilon; + + flt_t * const rsq_formi = fc.rsq_form[0]; + flt_t * const delx_formi = fc.delx_form[0]; + flt_t * const dely_formi = fc.dely_form[0]; + flt_t * const delz_formi = fc.delz_form[0]; + int * const jtype_formi = fc.jtype_form[0]; + int * const jlist_formi = fc.jlist_form[0]; + + const int ntypes = atom->ntypes + 1; + const int eatom = this->eflag_atom; + + // Determine how much data to transfer + int x_size, q_size, f_stride, ev_size, separate_flag; + IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); + + int tc; + FORCE_T * restrict f_start; + acc_t * restrict ev_global; + IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); + const int max_nbors = _max_nbors; + const int nthreads = tc; + + int pad = 1; + if (offload) { + if (INTEL_MIC_NBOR_PAD > 1) + pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); + } else { + if (INTEL_NBOR_PAD > 1) + pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); + } + const int pad_width = pad; + + #ifdef _LMP_INTEL_OFFLOAD + int *overflow = fix->get_off_overflow_flag(); + double *timer_compute = fix->off_watch_pair(); + + if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY); + #pragma offload target(mic:_cop) if(offload) \ + in(special_lj:length(0) alloc_if(0) free_if(0)) \ + in(ijc,lj34,ic:length(0) alloc_if(0) free_if(0)) 
\ + in(rsq_formi, delx_formi, dely_formi: length(0) alloc_if(0) free_if(0)) \ + in(delz_formi, jtype_formi, jlist_formi: length(0) alloc_if(0) free_if(0))\ + in(firstneigh:length(0) alloc_if(0) free_if(0)) \ + in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ + in(numneigh:length(0) alloc_if(0) free_if(0)) \ + in(x:length(x_size) alloc_if(0) free_if(0)) \ + in(quat:length(nall+1) alloc_if(0) free_if(0)) \ + in(overflow:length(0) alloc_if(0) free_if(0)) \ + in(nthreads,inum,nall,ntypes,vflag,eatom,minlocal,separate_flag) \ + in(astart,nlocal,f_stride,max_nbors,mu,gamma,upsilon,offload,pad_width) \ + out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ + out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ + out(timer_compute:length(1) alloc_if(0) free_if(0)) \ + signal(f_start) + #endif + { + #ifdef __MIC__ + *timer_compute=MIC_Wtime(); + #endif + + #ifdef _LMP_INTEL_OFFLOAD + if (separate_flag) { + if (separate_flag < 3) { + int all_local = nlocal; + int ghost_min = overflow[LMP_GHOST_MIN]; + nlocal = overflow[LMP_LOCAL_MAX] + 1; + int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min; + if (nghost < 0) nghost = 0; + nall = nlocal + nghost; + separate_flag--; + int flength; + if (NEWTON_PAIR) flength = nall; + else flength = nlocal; + IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T), + separate_flag); + if (nghost) { + if (nlocal < all_local || ghost_min > all_local) { + memmove(x + nlocal, x + ghost_min, + (nall - nlocal) * sizeof(ATOM_T)); + memmove(quat + nlocal, quat + ghost_min, + (nall - nlocal) * sizeof(QUAT_T)); + } + } + } + x[nall].x = (flt_t)INTEL_BIGP; + x[nall].y = (flt_t)INTEL_BIGP; + x[nall].z = (flt_t)INTEL_BIGP; + quat[nall].w = (flt_t)1.0; + quat[nall].i = (flt_t)0.0; + quat[nall].j = (flt_t)0.0; + quat[nall].k = (flt_t)0.0; + } + #endif + + acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; + if (EVFLAG) { + oevdwl = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + } + + // loop over neighbors of my atoms + #if 
defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(f_start,f_stride,nlocal,nall,minlocal) \ + reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int iifrom, iito, tid; + IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + iifrom += astart; + iito += astart; + + FORCE_T * restrict const f = f_start - minlocal * 2 + (tid * f_stride); + memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T)); + + flt_t * restrict const rsq_form = rsq_formi + tid * max_nbors; + flt_t * restrict const delx_form = delx_formi + tid * max_nbors; + flt_t * restrict const dely_form = dely_formi + tid * max_nbors; + flt_t * restrict const delz_form = delz_formi + tid * max_nbors; + int * restrict const jtype_form = jtype_formi + tid * max_nbors; + int * restrict const jlist_form = jlist_formi + tid * max_nbors; + + int ierror = 0; + for (int i = iifrom; i < iito; ++i) { + // const int i = ilist[ii]; + const int itype = x[i].w; + const int ptr_off = itype * ntypes; + const FC_PACKED1_T * restrict const ijci = ijc + ptr_off; + const FC_PACKED2_T * restrict const lj34i = lj34 + ptr_off; + + const int * restrict const jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + + flt_t a1_0, a1_1, a1_2, a1_3, a1_4, a1_5, a1_6, a1_7, a1_8; + flt_t b1_0, b1_1, b1_2, b1_3, b1_4, b1_5, b1_6, b1_7, b1_8; + flt_t g1_0, g1_1, g1_2, g1_3, g1_4, g1_5, g1_6, g1_7, g1_8; + + if (ijci[itype].form == ELLIPSE_ELLIPSE) { + flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8; + ME_quat_to_mat_trans(quat[i],a1); + ME_diag_times3(ic[itype].well,a1,temp); + ME_transpose_times3(a1,temp,b1); + ME_diag_times3(ic[itype].shape2,a1,temp); + ME_transpose_times3(a1,temp,g1); + } + + acc_t fxtmp, fytmp, fztmp, fwtmp, t1tmp, t2tmp, t3tmp; + acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; + fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0; + + if (EVFLAG) { + if (EFLAG) fwtmp = 
sevdwl = (acc_t)0; + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + } + + bool multiple_forms = false; + int packed_j = 0; + for (int jj = 0; jj < jnum; jj++) { + int jm = jlist[jj]; + int j = jm & NEIGHMASK; + const int jtype = x[j].w; + + if (ijci[jtype].form == ELLIPSE_ELLIPSE) { + flt_t delx = x[j].x-xtmp; + flt_t dely = x[j].y-ytmp; + flt_t delz = x[j].z-ztmp; + flt_t rsq = delx * delx + dely * dely + delz * delz; + + if (rsq < ijci[jtype].cutsq) { + rsq_form[packed_j] = rsq; + delx_form[packed_j] = delx; + dely_form[packed_j] = dely; + delz_form[packed_j] = delz; + jtype_form[packed_j] = jtype; + jlist_form[packed_j] = jm; + packed_j++; + } + } else + multiple_forms = true; + } + while( (packed_j % pad_width) != 0 ) + jlist_form[packed_j++] = nall; + + // ------------------------------------------------------------- + + #ifdef __MIC__ + __assume(packed_j % INTEL_VECTOR_WIDTH == 0); + __assume(packed_j % 8 == 0); + __assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0); + #endif + #pragma vector aligned + #pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \ + sevdwl,sv0,sv1,sv2,sv3,sv4,sv5) + for (int jj = 0; jj < packed_j; jj++) { + flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8; + flt_t b2_0, b2_1, b2_2, b2_3, b2_4, b2_5, b2_6, b2_7, b2_8; + flt_t g2_0, g2_1, g2_2, g2_3, g2_4, g2_5, g2_6, g2_7, g2_8; + flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8; + flt_t fforce_0, fforce_1, fforce_2, ttor_0, ttor_1, ttor_2; + flt_t rtor_0, rtor_1, rtor_2; + + const int sbindex = jlist_form[jj] >> SBBITS & 3; + const int j = jlist_form[jj] & NEIGHMASK; + flt_t factor_lj = special_lj[sbindex]; + const int jtype = jtype_form[jj]; + const flt_t sigma = ijci[jtype].sigma; + const flt_t epsilon = ijci[jtype].epsilon; + const flt_t shape2_0 = ic[jtype].shape2[0]; + const flt_t shape2_1 = ic[jtype].shape2[1]; + const flt_t shape2_2 = ic[jtype].shape2[2]; + flt_t one_eng, evdwl; + + ME_quat_to_mat_trans(quat[j], a2); + 
ME_diag_times3(ic[jtype].well, a2, temp); + ME_transpose_times3(a2, temp, b2); + ME_diag_times3a(shape2, a2, temp); + ME_transpose_times3(a2, temp, g2); + + flt_t tempv_0, tempv_1, tempv_2, tempv2_0, tempv2_1, tempv2_2; + flt_t temp1, temp2, temp3; + + flt_t r12hat_0, r12hat_1, r12hat_2; + ME_normalize3(delx_form[jj], dely_form[jj], delz_form[jj], r12hat); + flt_t r = sqrt(rsq_form[jj]); + + // compute distance of closest approach + + flt_t g12_0, g12_1, g12_2, g12_3, g12_4, g12_5, g12_6, g12_7, g12_8; + ME_plus3(g1, g2, g12); + flt_t kappa_0, kappa_1, kappa_2; + ME_mldivide3(g12, delx_form[jj], dely_form[jj], delz_form[jj], + kappa, ierror); + + // tempv = G12^-1*r12hat + + flt_t inv_r = (flt_t)1.0 / r; + tempv_0 = kappa_0 * inv_r; + tempv_1 = kappa_1 * inv_r; + tempv_2 = kappa_2 * inv_r; + flt_t sigma12 = ME_dot3(r12hat, tempv); + sigma12 = pow((flt_t)0.5 * sigma12,(flt_t) - 0.5); + flt_t h12 = r - sigma12; + + // energy + // compute u_r + + flt_t varrho = sigma / (h12 + gamma * sigma); + flt_t varrho6 = pow(varrho, (flt_t)6.0); + flt_t varrho12 = varrho6 * varrho6; + flt_t u_r = (flt_t)4.0 * epsilon * (varrho12 - varrho6); + + // compute eta_12 + + flt_t eta = (flt_t)2.0 * ijci[jtype].lshape; + flt_t det_g12 = ME_det3(g12); + eta = pow(eta / det_g12, upsilon); + + // compute chi_12 + + flt_t b12_0, b12_1, b12_2, b12_3, b12_4, b12_5, b12_6, b12_7, b12_8; + flt_t iota_0, iota_1, iota_2; + ME_plus3(b1, b2, b12); + ME_mldivide3(b12, delx_form[jj], dely_form[jj], delz_form[jj], + iota, ierror); + + // tempv = G12^-1*r12hat + + tempv_0 = iota_0 * inv_r; + tempv_1 = iota_1 * inv_r; + tempv_2 = iota_2 * inv_r; + flt_t chi = ME_dot3(r12hat, tempv); + chi = pow(chi * (flt_t)2.0, mu); + + // force + // compute dUr/dr + + temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) / + sigma; + temp1 = temp1 * (flt_t)24.0 * epsilon; + flt_t u_slj = temp1 * pow(sigma12, (flt_t)3.0) * (flt_t)0.5; + flt_t dUr_0, dUr_1, dUr_2; + temp2 = ME_dot3(kappa, r12hat); + flt_t uslj_rsq = 
u_slj / rsq_form[jj]; + dUr_0 = temp1 * r12hat_0 + uslj_rsq * (kappa_0 - temp2 * r12hat_0); + dUr_1 = temp1 * r12hat_1 + uslj_rsq * (kappa_1 - temp2 * r12hat_1); + dUr_2 = temp1 * r12hat_2 + uslj_rsq * (kappa_2 - temp2 * r12hat_2); + + // compute dChi_12/dr + + flt_t dchi_0, dchi_1, dchi_2; + temp1 = ME_dot3(iota, r12hat); + temp2 = (flt_t)-4.0 / rsq_form[jj] * mu * + pow(chi, (mu - (flt_t)1.0) / mu); + dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0); + dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1); + dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2); + + temp1 = -eta * u_r; + temp2 = eta * chi; + fforce_0 = temp1 * dchi_0 - temp2 * dUr_0; + fforce_1 = temp1 * dchi_1 - temp2 * dUr_1; + fforce_2 = temp1 * dchi_2 - temp2 * dUr_2; + + // torque for particle 1 and 2 + // compute dUr + + tempv_0 = -uslj_rsq * kappa_0; + tempv_1 = -uslj_rsq * kappa_1; + tempv_2 = -uslj_rsq * kappa_2; + ME_vecmat(kappa, g1, tempv2); + ME_cross3(tempv, tempv2, dUr); + flt_t dUr2_0, dUr2_1, dUr2_2; + + if (NEWTON_PAIR || j < nlocal) { + ME_vecmat(kappa, g2, tempv2); + ME_cross3(tempv, tempv2, dUr2); + } + + // compute d_chi + + ME_vecmat(iota, b1, tempv); + ME_cross3(tempv, iota, dchi); + temp1 = (flt_t)-4.0 / rsq_form[jj]; + dchi_0 *= temp1; + dchi_1 *= temp1; + dchi_2 *= temp1; + flt_t dchi2_0, dchi2_1, dchi2_2; + + if (NEWTON_PAIR || j < nlocal) { + ME_vecmat(iota, b2, tempv); + ME_cross3(tempv, iota, dchi2); + dchi2_0 *= temp1; + dchi2_1 *= temp1; + dchi2_2 *= temp1; + } + + // compute d_eta + + flt_t deta_0, deta_1, deta_2; + deta_0 = deta_1 = deta_2 = (flt_t)0.0; + ME_compute_eta_torque(g12, a1, shape2, temp); + temp1 = -eta * upsilon; + + tempv_0 = temp1 * temp_0; + tempv_1 = temp1 * temp_1; + tempv_2 = temp1 * temp_2; + ME_mv0_cross3(a1, tempv, tempv2); + deta_0 += tempv2_0; + deta_1 += tempv2_1; + deta_2 += tempv2_2; + + tempv_0 = temp1 * temp_3; + tempv_1 = temp1 * temp_4; + tempv_2 = temp1 * temp_5; + ME_mv1_cross3(a1, tempv, tempv2); + deta_0 += tempv2_0; + deta_1 += tempv2_1; + deta_2 
+= tempv2_2; + + tempv_0 = temp1 * temp_6; + tempv_1 = temp1 * temp_7; + tempv_2 = temp1 * temp_8; + ME_mv2_cross3(a1, tempv, tempv2); + deta_0 += tempv2_0; + deta_1 += tempv2_1; + deta_2 += tempv2_2; + + // compute d_eta for particle 2 + + flt_t deta2_0, deta2_1, deta2_2; + if (NEWTON_PAIR || j < nlocal) { + deta2_0 = deta2_1 = deta2_2 = (flt_t)0.0; + ME_compute_eta_torque(g12, a2, shape2, temp); + + tempv_0 = temp1 * temp_0; + tempv_1 = temp1 * temp_1; + tempv_2 = temp1 * temp_2; + ME_mv0_cross3(a2, tempv, tempv2); + deta2_0 += tempv2_0; + deta2_1 += tempv2_1; + deta2_2 += tempv2_2; + + tempv_0 = temp1 * temp_3; + tempv_1 = temp1 * temp_4; + tempv_2 = temp1 * temp_5; + ME_mv1_cross3(a2, tempv, tempv2); + deta2_0 += tempv2_0; + deta2_1 += tempv2_1; + deta2_2 += tempv2_2; + + tempv_0 = temp1 * temp_6; + tempv_1 = temp1 * temp_7; + tempv_2 = temp1 * temp_8; + ME_mv2_cross3(a2, tempv, tempv2); + deta2_0 += tempv2_0; + deta2_1 += tempv2_1; + deta2_2 += tempv2_2; + } + + // torque + + temp1 = u_r * eta; + temp2 = u_r * chi; + temp3 = chi * eta; + + ttor_0 = (temp1 * dchi_0 + temp2 * deta_0 + temp3 * dUr_0) * + (flt_t)-1.0; + ttor_1 = (temp1 * dchi_1 + temp2 * deta_1 + temp3 * dUr_1) * + (flt_t)-1.0; + ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) * + (flt_t)-1.0; + + if (NEWTON_PAIR || j < nlocal) { + rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) * + (flt_t)-1.0; + rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) * + (flt_t)-1.0; + rtor_2 = (temp1 * dchi2_2 + temp2 * deta2_2 + temp3 * dUr2_2) * + (flt_t)-1.0; + } + + one_eng = temp1 * chi; + #ifndef __MIC__ + if (jlist_form[jj] == nall) { + one_eng = (flt_t)0.0; + fforce_0 = 0.0; + fforce_1 = 0.0; + fforce_2 = 0.0; + ttor_0 = 0.0; + ttor_1 = 0.0; + ttor_2 = 0.0; + rtor_0 = 0.0; + rtor_1 = 0.0; + rtor_2 = 0.0; + } + #endif + + fforce_0 *= factor_lj; + fforce_1 *= factor_lj; + fforce_2 *= factor_lj; + ttor_0 *= factor_lj; + ttor_1 *= factor_lj; + ttor_2 *= factor_lj; + + #ifdef 
__MIC__ + if (jlist_form[jj] < nall) { + #endif + fxtmp += fforce_0; + fytmp += fforce_1; + fztmp += fforce_2; + t1tmp += ttor_0; + t2tmp += ttor_1; + t3tmp += ttor_2; + + if (NEWTON_PAIR || j < nlocal) { + rtor_0 *= factor_lj; + rtor_1 *= factor_lj; + rtor_2 *= factor_lj; + int jp = j * 2; + f[jp].x -= fforce_0; + f[jp].y -= fforce_1; + f[jp].z -= fforce_2; + jp++; + f[jp].x += rtor_0; + f[jp].y += rtor_1; + f[jp].z += rtor_2; + } + + if (EVFLAG) { + flt_t ev_pre = (flt_t)0; + if (NEWTON_PAIR || i < nlocal) + ev_pre += (flt_t)0.5; + if (NEWTON_PAIR || j < nlocal) + ev_pre += (flt_t)0.5; + + if (EFLAG) { + evdwl = factor_lj * one_eng; + sevdwl += ev_pre * evdwl; + if (eatom) { + if (NEWTON_PAIR || i < nlocal) + fwtmp += (flt_t)0.5 * evdwl; + if (NEWTON_PAIR || j < nlocal) + f[j*2].w += (flt_t)0.5 * evdwl; + } + } + + if (vflag == 1) { + ev_pre *= (flt_t)-1.0; + sv0 += ev_pre * delx_form[jj] * fforce_0; + sv1 += ev_pre * dely_form[jj] * fforce_1; + sv2 += ev_pre * delz_form[jj] * fforce_2; + sv3 += ev_pre * delx_form[jj] * fforce_1; + sv4 += ev_pre * delx_form[jj] * fforce_2; + sv5 += ev_pre * dely_form[jj] * fforce_2; + } + } // EVFLAG + #ifdef __MIC__ + } + #endif + } // for jj + + // ------------------------------------------------------------- + + if (multiple_forms) + ierror = 2; + + int ip = i * 2; + f[ip].x += fxtmp; + f[ip].y += fytmp; + f[ip].z += fztmp; + ip++; + f[ip].x += t1tmp; + f[ip].y += t2tmp; + f[ip].z += t3tmp; + + if (EVFLAG) { + if (EFLAG) { + if (eatom) f[i * 2].w += fwtmp; + oevdwl += sevdwl; + } + if (vflag == 1) { + ov0 += sv0; + ov1 += sv1; + ov2 += sv2; + ov3 += sv3; + ov4 += sv4; + ov5 += sv5; + } + } + } // for i + int o_range; + if (NEWTON_PAIR) + o_range = nall; + else + o_range = nlocal; + if (offload == 0) o_range -= minlocal; + IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, + sizeof(FORCE_T)); + const int two_iito = iito * 2; + + #if defined(_OPENMP) + #pragma omp barrier + #endif + + acc_t *facc = &(f_start[0].x); + 
const int sto = two_iito * 4; + const int fst4 = f_stride * 4; + #if defined(_OPENMP) + #pragma omp barrier + #endif + int t_off = f_stride; + if (EFLAG && eatom) { + for (int t = 1; t < nthreads; t++) { + #pragma vector nontemporal + for (int n = iifrom * 2; n < two_iito; n++) { + f_start[n].x += f_start[n + t_off].x; + f_start[n].y += f_start[n + t_off].y; + f_start[n].z += f_start[n + t_off].z; + f_start[n].w += f_start[n + t_off].w; + } + t_off += f_stride; + } + } else { + for (int t = 1; t < nthreads; t++) { + #pragma vector nontemporal + for (int n = iifrom * 2; n < two_iito; n++) { + f_start[n].x += f_start[n + t_off].x; + f_start[n].y += f_start[n + t_off].y; + f_start[n].z += f_start[n + t_off].z; + } + t_off += f_stride; + } + } + + if (EVFLAG) { + if (vflag==2) { + const ATOM_T * restrict const xo = x + minlocal; + #pragma vector nontemporal + for (int n = iifrom; n < iito; n++) { + const int nt2 = n * 2; + ov0 += f_start[nt2].x * xo[n].x; + ov1 += f_start[nt2].y * xo[n].y; + ov2 += f_start[nt2].z * xo[n].z; + ov3 += f_start[nt2].y * xo[n].x; + ov4 += f_start[nt2].z * xo[n].x; + ov5 += f_start[nt2].z * xo[n].y; + } + } + } + + if (ierror) + f_start[1].w = ierror; + } // omp + + if (EVFLAG) { + if (EFLAG) { + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; + } + if (vflag) { + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; + } + } + + #ifdef __MIC__ + *timer_compute = MIC_Wtime() - *timer_compute; + #endif + } // offload + + if (offload) + fix->stop_watch(TIME_OFFLOAD_LATENCY); + else + fix->stop_watch(TIME_HOST_PAIR); + + if (EVFLAG) + fix->add_result_array(f_start, ev_global, offload,eatom); + else + fix->add_result_array(f_start, 0, offload); +} + +/* ---------------------------------------------------------------------- */ + +void PairGayBerneIntel::init_style() +{ + PairGayBerne::init_style(); + neighbor->requests[neighbor->nrequest-1]->intel = 1; + + int ifix = 
modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + fix->set_offload_affinity(); + if (force->newton_pair) fix->set_offload_noghost(1); + _cop = fix->coprocessor_number(); + #endif + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fix->get_mixed_buffers()->free_all_nbor_buffers(); + pack_force_const(force_const_single, fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fix->get_double_buffers()->free_all_nbor_buffers(); + pack_force_const(force_const_double, fix->get_double_buffers()); + } else { + fix->get_single_buffers()->free_all_nbor_buffers(); + pack_force_const(force_const_single, fix->get_single_buffers()); + } +} + +/* ---------------------------------------------------------------------- */ + +template +void PairGayBerneIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + int tp1 = atom->ntypes + 1; + _max_nbors = buffers->get_max_nbors(); + int mthreads = comm->nthreads; + if (mthreads < buffers->get_off_threads()) + mthreads = buffers->get_off_threads(); + fc.set_ntypes(tp1, _max_nbors, mthreads, memory, _cop); + buffers->set_ntypes(tp1); + flt_t **cutneighsq = buffers->get_cutneighsq(); + + // Repeat cutsq calculation because done after call to init_style + double cut, cutneigh; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cutneigh = cut + neighbor->skin; + cutsq[i][j] = cutsq[j][i] = cut*cut; + cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; + } + } + } + + for (int i = 0; i < 4; i++) { + fc.special_lj[i] = force->special_lj[i]; + fc.special_lj[0] = 1.0; + } + fc.gamma = gamma; + fc.upsilon = upsilon; + fc.mu = mu; + + for (int i = 0; i < tp1; i++) { + for (int j = 0; j < tp1; j++) { + 
fc.ijc[i][j].lj1 = lj1[i][j]; + fc.ijc[i][j].lj2 = lj2[i][j]; + fc.ijc[i][j].cutsq = cutsq[i][j]; + fc.ijc[i][j].offset = offset[i][j]; + fc.ijc[i][j].sigma = sigma[i][j]; + fc.ijc[i][j].epsilon = epsilon[i][j]; + fc.ijc[i][j].form = form[i][j]; + fc.ijc[i][j].lshape = lshape[i] * lshape[j]; + fc.lj34[i][j].lj3 = lj3[i][j]; + fc.lj34[i][j].lj4 = lj4[i][j]; + } + for (int j = 0; j < 4; j++) { + fc.ic[i].shape2[j] = shape2[i][j]; + fc.ic[i].well[j] = well[i][j]; + } + } + + #ifdef _LMP_INTEL_OFFLOAD + if (_cop < 0) return; + flt_t * special_lj = fc.special_lj; + FC_PACKED1_T *oijc = fc.ijc[0]; + FC_PACKED2_T *olj34 = fc.lj34[0]; + FC_PACKED3_T *oic = fc.ic; + flt_t * ocutneighsq = cutneighsq[0]; + int tp1sq = tp1 * tp1; + if (oijc != NULL && oic != NULL) { + #pragma offload_transfer target(mic:_cop) \ + in(special_lj: length(4) alloc_if(0) free_if(0)) \ + in(oijc,olj34: length(tp1sq) alloc_if(0) free_if(0)) \ + in(oic: length(tp1) alloc_if(0) free_if(0)) \ + in(ocutneighsq: length(tp1sq)) + } + #endif +} + +/* ---------------------------------------------------------------------- */ + +template +void PairGayBerneIntel::ForceConst::set_ntypes(const int ntypes, + const int one_length, + const int nthreads, + Memory *memory, + const int cop) { + if (ntypes != _ntypes) { + if (_ntypes > 0) { + fc_packed3 *oic = ic; + + #ifdef _LMP_INTEL_OFFLOAD + flt_t * ospecial_lj = special_lj; + fc_packed1 *oijc = ijc[0]; + fc_packed2 *olj34 = lj34[0]; + flt_t * orsq_form = rsq_form[0]; + flt_t * odelx_form = delx_form[0]; + flt_t * odely_form = dely_form[0]; + flt_t * odelz_form = delz_form[0]; + int * ojtype_form = jtype_form[0]; + int * ojlist_form = jlist_form[0]; + + if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL && + orsq_form != NULL && odelx_form != NULL && odely_form != NULL && + odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL && + _cop >= 0) { + #pragma offload_transfer target(mic:_cop) \ + nocopy(ospecial_lj, oijc, olj34, oic: alloc_if(0) 
free_if(1)) \ + nocopy(orsq_form, odelx_form, odely_form: alloc_if(0) free_if(1)) \ + nocopy(odelz_form, ojtype_form, ojlist_form: alloc_if(0) free_if(1)) + } + #endif + + _memory->destroy(oic); + _memory->destroy(ijc); + _memory->destroy(lj34); + _memory->destroy(rsq_form); + _memory->destroy(delx_form); + _memory->destroy(dely_form); + _memory->destroy(delz_form); + _memory->destroy(jtype_form); + _memory->destroy(jlist_form); + } + + if (ntypes > 0) { + _cop = cop; + memory->create(ijc, ntypes, ntypes, "fc.ijc"); + memory->create(lj34, ntypes, ntypes, "fc.lj34"); + memory->create(ic, ntypes, "fc.ic"); + memory->create(rsq_form, nthreads, one_length, "rsq_form"); + memory->create(delx_form, nthreads, one_length, "delx_form"); + memory->create(dely_form, nthreads, one_length, "dely_form"); + memory->create(delz_form, nthreads, one_length, "delz_form"); + memory->create(jtype_form, nthreads, one_length, "jtype_form"); + memory->create(jlist_form, nthreads, one_length, "jlist_form"); + + for (int zn = 0; zn < nthreads; zn++) + for (int zo = 0; zo < one_length; zo++) { + rsq_form[zn][zo] = 10.0; + delx_form[zn][zo] = 10.0; + dely_form[zn][zo] = 10.0; + delz_form[zn][zo] = 10.0; + jtype_form[zn][zo] = 1; + jlist_form[zn][zo] = 0; + } + + #ifdef _LMP_INTEL_OFFLOAD + flt_t * ospecial_lj = special_lj; + fc_packed1 *oijc = ijc[0]; + fc_packed2 *olj34 = lj34[0]; + fc_packed3 *oic = ic; + flt_t * orsq_form = rsq_form[0]; + flt_t * odelx_form = delx_form[0]; + flt_t * odely_form = dely_form[0]; + flt_t * odelz_form = delz_form[0]; + int * ojtype_form = jtype_form[0]; + int * ojlist_form = jlist_form[0]; + int off_onel = one_length * nthreads; + + int tp1sq = ntypes*ntypes; + if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL && + oic != NULL && orsq_form != NULL && odelx_form != NULL && + odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL && + ojlist_form !=NULL && cop >= 0) { + #pragma offload_transfer target(mic:cop) \ + nocopy(ospecial_lj: length(4) 
alloc_if(1) free_if(0)) \ + nocopy(oijc,olj34: length(tp1sq) alloc_if(1) free_if(0)) \ + nocopy(oic: length(ntypes) alloc_if(1) free_if(0)) \ + in(orsq_form: length(off_onel) alloc_if(1) free_if(0)) \ + in(odelx_form: length(off_onel) alloc_if(1) free_if(0)) \ + in(odely_form: length(off_onel) alloc_if(1) free_if(0)) \ + in(odelz_form: length(off_onel) alloc_if(1) free_if(0)) \ + in(ojtype_form: length(off_onel) alloc_if(1) free_if(0)) \ + in(ojlist_form: length(off_onel) alloc_if(1) free_if(0)) + } + #endif + } + } + _ntypes = ntypes; + _memory = memory; +} diff --git a/src/USER-INTEL/pair_gayberne_intel.h b/src/USER-INTEL/pair_gayberne_intel.h new file mode 100644 index 0000000000..eb055e151e --- /dev/null +++ b/src/USER-INTEL/pair_gayberne_intel.h @@ -0,0 +1,99 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. 
Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(gayberne/intel,PairGayBerneIntel)
+
+#else
+
+#ifndef LMP_PAIR_GAYBERNE_INTEL_H
+#define LMP_PAIR_GAYBERNE_INTEL_H
+
+#include "pair_gayberne.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+// Gay-Berne pair style registered as gayberne/intel (needs 'package intel').
+class PairGayBerneIntel : public PairGayBerne {
+
+ public:
+  PairGayBerneIntel(class LAMMPS *);
+
+  virtual void compute(int, int);
+  void init_style();  // binds the package_intel fix, packs force constants
+
+ private:
+  template <class flt_t> class ForceConst;
+  // flt_t: type of the packed per-atom/per-type data; acc_t: accumulator type.
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t,acc_t> *buffers);
+  // Packed per-type constants plus [nthreads][one_length] *_form scratch.
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct {
+      flt_t cutsq, lj1, lj2, offset, sigma, epsilon, lshape;
+      int form;
+    } fc_packed1;
+    typedef struct { flt_t lj3, lj4; } fc_packed2;
+    typedef struct { flt_t shape2[4], well[4]; } fc_packed3;
+
+    __declspec(align(64)) flt_t special_lj[4], gamma, upsilon, mu;
+    fc_packed1 **ijc;
+    fc_packed2 **lj34;
+    fc_packed3 *ic;
+
+    flt_t **rsq_form, **delx_form, **dely_form, **delz_form;
+    int **jtype_form, **jlist_form;
+    // _cop < 0 means "no coprocessor"; set here so the dtor reads a defined value.
+    ForceConst() : _ntypes(0), _cop(-1), _memory(NULL) {}
+    ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
+    // Frees and reallocates every array when ntypes changes; 0 only frees.
+    void set_ntypes(const int ntypes, const int one_length,
+                    const int nthreads, Memory *memory, const int cop);
+
+   private:
+    int _ntypes, _cop;
+    Memory *_memory;
+  };
+
+  ForceConst<float> force_const_single;    // used for mixed and single modes
+  ForceConst<double> force_const_double;   // used for double mode
+  int _max_nbors;
+
+  double gayberne_lj(const int i, const int j, double a1[3][3],
+                     double b1[3][3], double g1[3][3], double *r12,
+                     const double rsq, double *fforce, double *ttor);
+
+  FixIntel *fix;
+  int _cop;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp new file mode 100644 index 0000000000..576d5b21c7 --- /dev/null +++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp @@ -0,0 +1,675 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + This software is distributed under the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "math.h" +#include "pair_lj_charmm_coul_long_intel.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "group.h" +#include "kspace.h" +#include "memory.h" +#include "modify.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "memory.h" +#include "suffix.h" +using namespace LAMMPS_NS; + +#define LJ_T typename IntelBuffers::vec4_t +#define TABLE_T typename ForceConst::table_t + +/* ---------------------------------------------------------------------- */ + +PairLJCharmmCoulLongIntel::PairLJCharmmCoulLongIntel(LAMMPS *lmp) : + PairLJCharmmCoulLong(lmp) +{ + suffix_flag |= Suffix::INTEL; + respa_enable = 0; + cut_respa = NULL; +} + +/* ---------------------------------------------------------------------- */ + +PairLJCharmmCoulLongIntel::~PairLJCharmmCoulLongIntel() +{ +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag) +{ + if (fix->precision()==FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if 
(fix->precision()==FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); + + fix->balance_stamp(); + vflag_fdotr = 0; +} + +template +void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) { + ev_setup(eflag,vflag); + } else evflag = vflag_fdotr = 0; + + const int inum = list->inum; + const int nthreads = comm->nthreads; + const int host_start = fix->host_start_pair(); + const int offload_end = fix->offload_end_pair(); + const int ago = neighbor->ago; + + if (ago != 0 && fix->separate_buffers() == 0) { + fix->start_watch(TIME_PACK); + #if defined(_OPENMP) + #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost, + nthreads, sizeof(ATOM_T)); + buffers->thr_pack(ifrom,ito,ago); + } + fix->stop_watch(TIME_PACK); + } + + // -------------------- Regular version + if (evflag || vflag_fdotr) { + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + if (force->newton_pair) { + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); + } + } else { + if (force->newton_pair) { + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); + } + } + } else { + if (force->newton_pair) { + eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); + eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + } else { + eval<0,0,0>(1, 0, buffers, fc, 0, offload_end); + eval<0,0,0>(0, 0, buffers, 
fc, host_start, inum); + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, + IntelBuffers *buffers, + const ForceConst &fc, + const int astart, const int aend) +{ + const int inum = aend - astart; + if (inum == 0) return; + int nlocal, nall, minlocal; + fix->get_buffern(offload, nlocal, nall, minlocal); + + const int ago = neighbor->ago; + IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall); + + ATOM_T * restrict const x = buffers->get_x(offload); + flt_t * restrict const q = buffers->get_q(offload); + + const int * restrict const numneigh = list->numneigh; + const int * restrict const cnumneigh = buffers->cnumneigh(list); + const int * restrict const firstneigh = buffers->firstneigh(list); + + const flt_t * restrict const special_coul = fc.special_coul; + const flt_t * restrict const special_lj = fc.special_lj; + const flt_t qqrd2e = force->qqrd2e; + const flt_t inv_denom_lj = (flt_t)1.0/denom_lj; + + const flt_t * restrict const cutsq = fc.cutsq[0]; + const LJ_T * restrict const lj = fc.lj[0]; + const TABLE_T * restrict const table = fc.table; + const flt_t * restrict const etable = fc.etable; + const flt_t * restrict const detable = fc.detable; + const flt_t * restrict const ctable = fc.ctable; + const flt_t * restrict const dctable = fc.dctable; + const flt_t cut_ljsq = fc.cut_ljsq; + const flt_t cut_lj_innersq = fc.cut_lj_innersq; + const flt_t cut_coulsq = fc.cut_coulsq; + const flt_t g_ewald = fc.g_ewald; + const flt_t tabinnersq = fc.tabinnersq; + + const int ntypes = atom->ntypes + 1; + const int eatom = this->eflag_atom; + + // Determine how much data to transfer + int x_size, q_size, f_stride, ev_size, separate_flag; + IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); + + int tc; + FORCE_T * restrict f_start; + acc_t * 
restrict ev_global; + IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); + + const int nthreads = tc; + #ifdef _LMP_INTEL_OFFLOAD + int *overflow = fix->get_off_overflow_flag(); + double *timer_compute = fix->off_watch_pair(); + // Redeclare as local variables for offload + const int ncoultablebits = this->ncoultablebits; + const int ncoulmask = this->ncoulmask; + const int ncoulshiftbits = this->ncoulshiftbits; + #ifdef INTEL_ALLOW_TABLE + #define ITABLE_IN in(table,etable,detable:length(0) alloc_if(0) free_if(0)) \ + in(ctable,dctable:length(0) alloc_if(0) free_if(0)) \ + in(ncoultablebits,tabinnersq,ncoulmask,ncoulshiftbits) + #else + #define ITABLE_IN + #endif + + if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY); + #pragma offload target(mic:_cop) if(offload) \ + in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \ + in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \ + in(firstneigh:length(0) alloc_if(0) free_if(0)) \ + in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ + in(numneigh:length(0) alloc_if(0) free_if(0)) \ + in(x:length(x_size) alloc_if(0) free_if(0)) \ + in(q:length(q_size) alloc_if(0) free_if(0)) \ + in(overflow:length(0) alloc_if(0) free_if(0)) \ + in(nthreads,qqrd2e,g_ewald,inum,nall,ntypes,cut_coulsq,vflag,eatom) \ + in(f_stride,separate_flag,offload) \ + in(astart,cut_ljsq,cut_lj_innersq,nlocal,inv_denom_lj,minlocal) \ + out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ + out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ + out(timer_compute:length(1) alloc_if(0) free_if(0)) \ + ITABLE_IN signal(f_start) + #endif + { + #ifdef __MIC__ + *timer_compute = MIC_Wtime(); + #endif + + IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, + f_stride, x, q); + + acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; + if (EVFLAG) { + oevdwl = oecoul = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + } + + // loop over neighbors of my atoms + #if defined(_OPENMP) + #pragma omp parallel 
default(none) \ + shared(f_start,f_stride,nlocal,nall,minlocal) \ + reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int iifrom, iito, tid; + IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + iifrom += astart; + iito += astart; + + FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride); + memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + flt_t cutboth = cut_coulsq; + + for (int i = iifrom; i < iito; ++i) { + // const int i = ilist[ii]; + const int itype = x[i].w; + + const int ptr_off = itype * ntypes; + const flt_t * restrict const cutsqi = cutsq + ptr_off; + const LJ_T * restrict const lji = lj + ptr_off; + + const int * restrict const jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + + acc_t fxtmp,fytmp,fztmp,fwtmp; + acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; + + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + const flt_t qtmp = q[i]; + fxtmp = fytmp = fztmp = (acc_t)0; + if (EVFLAG) { + if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + } + + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ + sv0, sv1, sv2, sv3, sv4, sv5) + for (int jj = 0; jj < jnum; jj++) { + flt_t forcecoul, forcelj, evdwl, ecoul; + forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0; + + const int sbindex = jlist[jj] >> SBBITS & 3; + const int j = jlist[jj] & NEIGHMASK; + + const flt_t delx = xtmp - x[j].x; + const flt_t dely = ytmp - x[j].y; + const flt_t delz = ztmp - x[j].z; + const int jtype = x[j].w; + const flt_t rsq = delx * delx + dely * dely + delz * delz; + const flt_t r2inv = (flt_t)1.0 / rsq; + + #ifdef __MIC__ + if (rsq < cut_coulsq) { + #endif + #ifdef INTEL_ALLOW_TABLE + if (!ncoultablebits || rsq <= tabinnersq) { + #endif + const flt_t A1 = 0.254829592; + const flt_t A2 = -0.284496736; + const flt_t A3 = 1.421413741; + const flt_t A4 = -1.453152027; + const flt_t 
A5 = 1.061405429; + const flt_t EWALD_F = 1.12837917; + const flt_t INV_EWALD_P = 1.0 / 0.3275911; + + const flt_t r = sqrt(rsq); + const flt_t grij = g_ewald * r; + const flt_t expm2 = exp(-grij * grij); + const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); + const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const flt_t prefactor = qqrd2e * qtmp * q[j] / r; + forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + if (EFLAG) ecoul = prefactor * erfc; + if (sbindex) { + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; + } + #ifdef INTEL_ALLOW_TABLE + } else { + float rsq_lookup = rsq; + const int itable = (__intel_castf32_u32(rsq_lookup) & + ncoulmask) >> ncoulshiftbits; + const flt_t fraction = (rsq_lookup - table[itable].r) * + table[itable].dr; + + const flt_t tablet = table[itable].f + + fraction * table[itable].df; + forcecoul = qtmp * q[j] * tablet; + if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + + fraction * detable[itable]); + if (sbindex) { + const flt_t table2 = ctable[itable] + + fraction * dctable[itable]; + const flt_t prefactor = qtmp * q[j] * table2; + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; + } + } + #endif + #ifdef __MIC__ + } + #endif + + #ifdef __MIC__ + if (rsq < cut_ljsq) { + #endif + flt_t r6inv = r2inv * r2inv * r2inv; + forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y); + if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w); + + #ifdef __MIC__ + if (rsq > cut_lj_innersq) { + #endif + const flt_t drsq = cut_ljsq - rsq; + const flt_t cut2 = (rsq - cut_lj_innersq) * drsq; + const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) * + inv_denom_lj; + const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj; + if (EFLAG) { + #ifndef __MIC__ + if (rsq > cut_lj_innersq) { + #endif + forcelj = forcelj * switch1 + evdwl * switch2; + evdwl *= 
switch1; + #ifndef __MIC__ + } + #endif + } else { + const flt_t philj = r6inv * (lji[jtype].z*r6inv - + lji[jtype].w); + #ifndef __MIC__ + if (rsq > cut_lj_innersq) + #endif + forcelj = forcelj * switch1 + philj * switch2; + } + #ifdef __MIC__ + } + #endif + + if (sbindex) { + const flt_t factor_lj = special_lj[sbindex]; + forcelj *= factor_lj; + if (EFLAG) evdwl *= factor_lj; + } + #ifdef __MIC__ + } + #else + if (rsq > cut_coulsq) { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; } + if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } + #endif + + #ifdef __MIC__ + if (rsq < cut_coulsq) { + #endif + const flt_t fpair = (forcecoul + forcelj) * r2inv; + fxtmp += delx * fpair; + fytmp += dely * fpair; + fztmp += delz * fpair; + if (NEWTON_PAIR || j < nlocal) { + f[j].x -= delx * fpair; + f[j].y -= dely * fpair; + f[j].z -= delz * fpair; + } + + if (EVFLAG) { + flt_t ev_pre = (flt_t)0; + if (NEWTON_PAIR || i < nlocal) + ev_pre += (flt_t)0.5; + if (NEWTON_PAIR || j < nlocal) + ev_pre += (flt_t)0.5; + + if (EFLAG) { + sevdwl += ev_pre * evdwl; + secoul += ev_pre * ecoul; + if (eatom) { + if (NEWTON_PAIR || i < nlocal) + fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + if (NEWTON_PAIR || j < nlocal) + f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + } + } + + IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, + delx, dely, delz); + } + #ifdef __MIC__ + } + #endif + } // for jj + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + + IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp); + } // for ii + + #if defined(_OPENMP) + #pragma omp barrier + #endif + IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, + nlocal, minlocal, nthreads, f_start, f_stride, + x); + } // end of omp parallel region + if (EVFLAG) { + if (EFLAG) { + ev_global[0] = oevdwl; + ev_global[1] = oecoul; + } + if (vflag) { + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; + } + } + 
#ifdef __MIC__ + *timer_compute = MIC_Wtime() - *timer_compute; + #endif + } // end of offload region + + if (offload) + fix->stop_watch(TIME_OFFLOAD_LATENCY); + else + fix->stop_watch(TIME_HOST_PAIR); + + if (EVFLAG) + fix->add_result_array(f_start, ev_global, offload, eatom); + else + fix->add_result_array(f_start, 0, offload); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCharmmCoulLongIntel::init_style() +{ + PairLJCharmmCoulLong::init_style(); + neighbor->requests[neighbor->nrequest-1]->intel = 1; + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + fix->set_offload_affinity(); + _cop = fix->coprocessor_number(); + #endif + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fix->get_mixed_buffers()->free_all_nbor_buffers(); + pack_force_const(force_const_single, fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fix->get_double_buffers()->free_all_nbor_buffers(); + pack_force_const(force_const_double, fix->get_double_buffers()); + } else { + fix->get_single_buffers()->free_all_nbor_buffers(); + pack_force_const(force_const_single, fix->get_single_buffers()); + } +} + +template +void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + int tp1 = atom->ntypes + 1; + int ntable = 1; + if (ncoultablebits) + for (int i = 0; i < ncoultablebits; i++) ntable *= 2; + + fc.set_ntypes(tp1, ntable, memory, _cop); + buffers->set_ntypes(tp1); + flt_t **cutneighsq = buffers->get_cutneighsq(); + + // Repeat cutsq calculation because done after call to init_style + double cut, cutneigh; + if (cut_lj > cut_coul) + error->all(FLERR, + "Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic"); + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; 
j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i, j); + cutneigh = cut + neighbor->skin; + cutsq[i][j] = cutsq[j][i] = cut*cut; + cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; + } + } + } + + cut_lj_innersq = cut_lj_inner * cut_lj_inner; + cut_ljsq = cut_lj * cut_lj; + cut_coulsq = cut_coul * cut_coul; + cut_bothsq = MAX(cut_ljsq, cut_coulsq); + + fc.g_ewald = force->kspace->g_ewald; + fc.tabinnersq = tabinnersq; + fc.cut_coulsq = cut_coulsq; + fc.cut_ljsq = cut_ljsq; + fc.cut_lj_innersq = cut_lj_innersq; + + for (int i = 0; i < 4; i++) { + fc.special_lj[i] = force->special_lj[i]; + fc.special_coul[i] = force->special_coul[i]; + fc.special_coul[0] = 1.0; + fc.special_lj[0] = 1.0; + } + + for (int i = 0; i < tp1; i++) { + for (int j = 0; j < tp1; j++) { + fc.lj[i][j].x = lj1[i][j]; + fc.lj[i][j].y = lj2[i][j]; + fc.lj[i][j].z = lj3[i][j]; + fc.lj[i][j].w = lj4[i][j]; + fc.cutsq[i][j] = cutsq[i][j]; + } + } + + if (ncoultablebits) { + for (int i = 0; i < ntable; i++) { + fc.table[i].r = rtable[i]; + fc.table[i].dr = drtable[i]; + fc.table[i].f = ftable[i]; + fc.table[i].df = dftable[i]; + fc.etable[i] = etable[i]; + fc.detable[i] = detable[i]; + fc.ctable[i] = ctable[i]; + fc.dctable[i] = dctable[i]; + } + } + + #ifdef _LMP_INTEL_OFFLOAD + if (_cop < 0) return; + flt_t * special_lj = fc.special_lj; + flt_t * special_coul = fc.special_coul; + flt_t * cutsq = fc.cutsq[0]; + LJ_T * lj = fc.lj[0]; + TABLE_T * table = fc.table; + flt_t * etable = fc.etable; + flt_t * detable = fc.detable; + flt_t * ctable = fc.ctable; + flt_t * dctable = fc.dctable; + flt_t * ocutneighsq = cutneighsq[0]; + int tp1sq = tp1 * tp1; + #pragma offload_transfer target(mic:_cop) \ + in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \ + in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \ + in(table: length(ntable) alloc_if(0) free_if(0)) \ + in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \ + 
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) + #endif +} + +/* ---------------------------------------------------------------------- */ + +template +void PairLJCharmmCoulLongIntel::ForceConst::set_ntypes(const int ntypes, + const int ntable, + Memory *memory, + const int cop) { + if ( (ntypes != _ntypes || ntable != _ntable) ) { + if (_ntypes > 0) { + #ifdef _LMP_INTEL_OFFLOAD + flt_t * ospecial_lj = special_lj; + flt_t * ospecial_coul = special_coul; + flt_t * ocutsq = cutsq[0]; + typename IntelBuffers::vec4_t * olj = lj[0]; + table_t * otable = table; + flt_t * oetable = etable; + flt_t * odetable = detable; + flt_t * octable = ctable; + flt_t * odctable = dctable; + if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL && + otable != NULL && oetable != NULL && odetable != NULL && + octable != NULL && odctable != NULL && ospecial_coul != NULL && + cop >= 0) { + #pragma offload_transfer target(mic:cop) \ + nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \ + nocopy(ocutsq, olj: alloc_if(0) free_if(1)) \ + nocopy(otable: alloc_if(0) free_if(1)) \ + nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1)) + } + #endif + + _memory->destroy(cutsq); + _memory->destroy(lj); + _memory->destroy(table); + _memory->destroy(etable); + _memory->destroy(detable); + _memory->destroy(ctable); + _memory->destroy(dctable); + } + if (ntypes > 0) { + _cop = cop; + memory->create(cutsq,ntypes,ntypes,"fc.cutsq"); + memory->create(lj,ntypes,ntypes,"fc.lj"); + memory->create(table,ntable,"pair:fc.table"); + memory->create(etable,ntable,"pair:fc.etable"); + memory->create(detable,ntable,"pair:fc.detable"); + memory->create(ctable,ntable,"pair:fc.ctable"); + memory->create(dctable,ntable,"pair:fc.dctable"); + + #ifdef _LMP_INTEL_OFFLOAD + flt_t * ospecial_lj = special_lj; + flt_t * ospecial_coul = special_coul; + flt_t * ocutsq = cutsq[0]; + typename IntelBuffers::vec4_t * olj = lj[0]; + table_t * otable = table; + flt_t * oetable = etable; + flt_t * 
odetable = detable; + flt_t * octable = ctable; + flt_t * odctable = dctable; + int tp1sq = ntypes*ntypes; + if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL && + otable !=NULL && oetable != NULL && odetable != NULL && + octable != NULL && odctable != NULL && ospecial_coul != NULL && + cop >= 0) { + #pragma offload_transfer target(mic:cop) \ + nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \ + nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \ + nocopy(ocutsq,olj: length(tp1sq) alloc_if(1) free_if(0)) \ + nocopy(otable: length(ntable) alloc_if(1) free_if(0)) \ + nocopy(oetable,odetable: length(ntable) alloc_if(1) free_if(0)) \ + nocopy(octable,odctable: length(ntable) alloc_if(1) free_if(0)) + } + #endif + } + } + _ntypes=ntypes; + _ntable=ntable; + _memory=memory; +} diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h new file mode 100644 index 0000000000..ad66c786b6 --- /dev/null +++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h @@ -0,0 +1,104 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. 
Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/charmm/coul/long/intel,PairLJCharmmCoulLongIntel) + +#else + +#ifndef LMP_PAIR_LJ_CHARMM_COUL_LONG_INTEL_H +#define LMP_PAIR_LJ_CHARMM_COUL_LONG_INTEL_H + +#include "pair_lj_charmm_coul_long.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + +class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong { + + public: + PairLJCharmmCoulLongIntel(class LAMMPS *); + virtual ~PairLJCharmmCoulLongIntel(); + + virtual void compute(int, int); + void init_style(); + + typedef struct { float x,y,z; int w; } sng4_t; + + private: + FixIntel *fix; + int _cop; + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int offload, const int vflag, + IntelBuffers * buffers, + const ForceConst &fc, const int astart, const int aend); + + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + // ---------------------------------------------------------------------- + template + class ForceConst { + public: + typedef struct { flt_t r, dr, f, df; } table_t; + __declspec(align(64)) flt_t special_coul[4]; + __declspec(align(64)) flt_t special_lj[4]; + flt_t **cutsq, g_ewald, tabinnersq; + flt_t cut_coulsq, cut_ljsq; + flt_t cut_lj_innersq; + table_t *table; + flt_t *etable, *detable, *ctable, *dctable; + typename IntelBuffers::vec4_t **lj; + + ForceConst() : _ntypes(0), _ntable(0) {} + ~ForceConst() { set_ntypes(0,0,NULL,_cop); } + + void set_ntypes(const int ntypes, const int ntable, Memory *memory, + const int cop); + + private: + int _ntypes, _ntable, _cop; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: The 'package intel' command is required for /intel styles + +Self-explanatory. 
+ +E: Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic + +The intel accelerated version of the CHARMM style requires that the +Lennard-Jones cutoff is not greater than the coulombic cutoff. + +*/ diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp new file mode 100644 index 0000000000..4163a1f7d2 --- /dev/null +++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp @@ -0,0 +1,634 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + This software is distributed under the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. 
Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "math.h" +#include "pair_lj_cut_coul_long_intel.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "group.h" +#include "kspace.h" +#include "memory.h" +#include "modify.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "memory.h" +#include "suffix.h" +using namespace LAMMPS_NS; + +#define C_FORCE_T typename ForceConst<flt_t>::c_force_t +#define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t +#define TABLE_T typename ForceConst<flt_t>::table_t + +/* ---------------------------------------------------------------------- */ + +/* Constructor: sets the Suffix::INTEL bit in suffix_flag and clears respa_enable and cut_respa. */ PairLJCutCoulLongIntel::PairLJCutCoulLongIntel(LAMMPS *lmp) : + PairLJCutCoulLong(lmp) +{ + suffix_flag |= Suffix::INTEL; + respa_enable = 0; + cut_respa = NULL; +} + +/* ---------------------------------------------------------------------- */ + +PairLJCutCoulLongIntel::~PairLJCutCoulLongIntel() +{ +} + +/* ---------------------------------------------------------------------- */ + +/* compute: dispatch to the templated compute() that matches the precision mode reported by the package fix (mixed and single modes both use force_const_single), then call fix->balance_stamp() and clear vflag_fdotr. */ void PairLJCutCoulLongIntel::compute(int eflag, int vflag) +{ + if (fix->precision()==FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) + compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); + + fix->balance_stamp(); + vflag_fdotr = 0; +} + +/* Templated driver: sets up energy/virial tallying, packs atom data with buffers->thr_pack() when neighbor->ago != 0 and the fix reports no separate buffers, then runs eval() twice: offload=1 on [0, offload_end) and offload=0 on [host_start, inum). The eval flags are <EVFLAG, EFLAG, NEWTON_PAIR>; the virial argument is 2 when vflag_fdotr is set, 1 when only vflag is set, 0 otherwise. */ template <class flt_t, class acc_t> +void PairLJCutCoulLongIntel::compute(int eflag, int vflag, + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) +{ + if (eflag || vflag) { + ev_setup(eflag,vflag); + } else evflag = vflag_fdotr = 0; + + const int inum = list->inum; + const int nthreads = comm->nthreads; + const int host_start = fix->host_start_pair(); + const int offload_end = fix->offload_end_pair(); + const int ago = neighbor->ago; + + if (ago != 0 && fix->separate_buffers() == 0) { + 
fix->start_watch(TIME_PACK); + #if defined(_OPENMP) + #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, + nthreads, sizeof(ATOM_T)); + buffers->thr_pack(ifrom,ito,ago); + } + fix->stop_watch(TIME_PACK); + } + + if (evflag || vflag_fdotr) { + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + if (force->newton_pair) { + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); + } + } else { + if (force->newton_pair) { + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); + } + } + } else { + if (force->newton_pair) { + eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); + eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + } else { + eval<0,0,0>(1, 0, buffers, fc, 0, offload_end); + eval<0,0,0>(0, 0, buffers, fc, host_start, inum); + } + } +} + +/* ---------------------------------------------------------------------- */ + +/* eval: force/energy kernel for atoms [astart, aend); returns at once when the range is empty. For every neighbor pair it evaluates the erfc-damped Coulomb term (polynomial erfc approximation, or a table lookup when INTEL_ALLOW_TABLE is defined, ncoultablebits is set and rsq > tabinnersq) and the 12-6 LJ term, applies the special-bond factors, drops contributions beyond the per-type cutoffs and accumulates forces plus, when requested, energy and virial. The results are handed to fix->add_result_array(). */ template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> +void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc, + const int astart, const int aend) +{ + const int inum = aend - astart; + if (inum == 0) return; + int nlocal, nall, minlocal; + fix->get_buffern(offload, nlocal, nall, minlocal); + + const int ago = neighbor->ago; + IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall); + + ATOM_T * restrict const x = buffers->get_x(offload); + flt_t * restrict const q = buffers->get_q(offload); + + const int * restrict const numneigh = list->numneigh; + const int * restrict const cnumneigh = 
buffers->cnumneigh(list); + const int * restrict const firstneigh = buffers->firstneigh(list); + + const flt_t * restrict const special_coul = fc.special_coul; + const flt_t * restrict const special_lj = fc.special_lj; + const flt_t qqrd2e = force->qqrd2e; + + const C_FORCE_T * restrict const c_force = fc.c_force[0]; + const C_ENERGY_T * restrict const c_energy = fc.c_energy[0]; + const TABLE_T * restrict const table = fc.table; + const flt_t * restrict const etable = fc.etable; + const flt_t * restrict const detable = fc.detable; + const flt_t * restrict const ctable = fc.ctable; + const flt_t * restrict const dctable = fc.dctable; + const flt_t g_ewald = fc.g_ewald; + const flt_t tabinnersq = fc.tabinnersq; + + const int ntypes = atom->ntypes + 1; + const int eatom = this->eflag_atom; + + // Determine how much data to transfer + int x_size, q_size, f_stride, ev_size, separate_flag; + IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); + + int tc; + FORCE_T * restrict f_start; + acc_t * restrict ev_global; + IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); + + const int nthreads = tc; + #ifdef _LMP_INTEL_OFFLOAD + int *overflow = fix->get_off_overflow_flag(); + double *timer_compute = fix->off_watch_pair(); + // Redeclare as local variables for offload + const int ncoultablebits = this->ncoultablebits; + const int ncoulmask = this->ncoulmask; + const int ncoulshiftbits = this->ncoulshiftbits; + #ifdef INTEL_ALLOW_TABLE + #define ITABLE_IN in(table,etable,detable:length(0) alloc_if(0) free_if(0)) \ + in(ctable,dctable:length(0) alloc_if(0) free_if(0)) \ + in(ncoultablebits,tabinnersq,ncoulmask,ncoulshiftbits) + #else + #define ITABLE_IN + #endif + + if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY); + #pragma offload target(mic:_cop) if(offload) \ + in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \ + in(c_force, c_energy:length(0) alloc_if(0) 
free_if(0)) \ + in(firstneigh:length(0) alloc_if(0) free_if(0)) \ + in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ + in(numneigh:length(0) alloc_if(0) free_if(0)) \ + in(x:length(x_size) alloc_if(0) free_if(0)) \ + in(q:length(q_size) alloc_if(0) free_if(0)) \ + in(overflow:length(0) alloc_if(0) free_if(0)) \ + in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \ + in(f_stride,nlocal,minlocal,separate_flag,offload) \ + out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ + out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ + out(timer_compute:length(1) alloc_if(0) free_if(0)) \ + ITABLE_IN signal(f_start) + #endif + { + #ifdef __MIC__ + *timer_compute = MIC_Wtime(); + #endif + + IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, + f_stride, x, q); + + acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; + if (EVFLAG) { + oevdwl = oecoul = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + } + + // loop over neighbors of my atoms + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(f_start,f_stride,nlocal,nall,minlocal) \ + reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int iifrom, iito, tid; + IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + iifrom += astart; + iito += astart; + + FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride); + memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + + for (int i = iifrom; i < iito; ++i) { + const int itype = x[i].w; + + const int ptr_off = itype * ntypes; + const C_FORCE_T * restrict const c_forcei = c_force + ptr_off; + const C_ENERGY_T * restrict const c_energyi = c_energy + ptr_off; + + const int * restrict const jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + + acc_t fxtmp,fytmp,fztmp,fwtmp; + acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; + + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + const flt_t qtmp = q[i]; + fxtmp = fytmp = fztmp = 
(acc_t)0; + if (EVFLAG) { + if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + } + + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ + sv0, sv1, sv2, sv3, sv4, sv5) + for (int jj = 0; jj < jnum; jj++) { + flt_t forcecoul, forcelj, evdwl, ecoul; + forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0; + + const int sbindex = jlist[jj] >> SBBITS & 3; + const int j = jlist[jj] & NEIGHMASK; + + const flt_t delx = xtmp - x[j].x; + const flt_t dely = ytmp - x[j].y; + const flt_t delz = ztmp - x[j].z; + const int jtype = x[j].w; + const flt_t rsq = delx * delx + dely * dely + delz * delz; + + const flt_t r2inv = (flt_t)1.0 / rsq; + + #ifdef __MIC__ + if (rsq < c_forcei[jtype].cutsq) { + #endif + #ifdef INTEL_ALLOW_TABLE + if (!ncoultablebits || rsq <= tabinnersq) { + #endif + const flt_t A1 = 0.254829592; + const flt_t A2 = -0.284496736; + const flt_t A3 = 1.421413741; + const flt_t A4 = -1.453152027; + const flt_t A5 = 1.061405429; + const flt_t EWALD_F = 1.12837917; + const flt_t INV_EWALD_P = 1.0 / 0.3275911; + + const flt_t r = sqrt(rsq); + const flt_t grij = g_ewald * r; + const flt_t expm2 = exp(-grij * grij); + const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); + const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const flt_t prefactor = qqrd2e * qtmp * q[j] / r; + forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + if (EFLAG) ecoul = prefactor * erfc; + if (sbindex) { + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; + } + #ifdef INTEL_ALLOW_TABLE + } else { + float rsq_lookup = rsq; + const int itable = (__intel_castf32_u32(rsq_lookup) & + ncoulmask) >> ncoulshiftbits; + const flt_t fraction = (rsq_lookup - table[itable].r) * + table[itable].dr; + + const flt_t tablet = table[itable].f + + fraction * table[itable].df; + forcecoul = qtmp * q[j] * tablet; + if (EFLAG) 
ecoul = qtmp * q[j] * (etable[itable] + + fraction * detable[itable]); + if (sbindex) { + const flt_t table2 = ctable[itable] + + fraction * dctable[itable]; + const flt_t prefactor = qtmp * q[j] * table2; + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; + } + } + #endif + #ifdef __MIC__ + } + #endif + + #ifdef __MIC__ + if (rsq < c_forcei[jtype].cut_ljsq) { + #endif + flt_t r6inv = r2inv * r2inv * r2inv; + forcelj = r6inv * (c_forcei[jtype].lj1 * r6inv - + c_forcei[jtype].lj2); + if (EFLAG) evdwl = r6inv*(c_energyi[jtype].lj3 * r6inv - + c_energyi[jtype].lj4) - + c_energyi[jtype].offset; + + if (sbindex) { + const flt_t factor_lj = special_lj[sbindex]; + forcelj *= factor_lj; + if (EFLAG) evdwl *= factor_lj; + } + #ifdef __MIC__ + } + #else + if (rsq > c_forcei[jtype].cutsq) + { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; } + if (rsq > c_forcei[jtype].cut_ljsq) + { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } + #endif + + #ifdef __MIC__ + if (rsq < c_forcei[jtype].cutsq) { + #endif + const flt_t fpair = (forcecoul + forcelj) * r2inv; + fxtmp += delx * fpair; + fytmp += dely * fpair; + fztmp += delz * fpair; + if (NEWTON_PAIR || j < nlocal) { + f[j].x -= delx * fpair; + f[j].y -= dely * fpair; + f[j].z -= delz * fpair; + } + + if (EVFLAG) { + flt_t ev_pre = (flt_t)0; + if (NEWTON_PAIR || i < nlocal) + ev_pre += (flt_t)0.5; + if (NEWTON_PAIR || j < nlocal) + ev_pre += (flt_t)0.5; + + if (EFLAG) { + sevdwl += ev_pre * evdwl; + secoul += ev_pre * ecoul; + if (eatom) { + if (NEWTON_PAIR || i < nlocal) + fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + if (NEWTON_PAIR || j < nlocal) + f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + } + } + IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz); + } + #ifdef __MIC__ + } + #endif + } // for jj + + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp); + } // for ii + #if 
defined(_OPENMP) + #pragma omp barrier + #endif + IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, + nlocal, minlocal, nthreads, f_start, f_stride, + x); + } // end of omp parallel region + if (EVFLAG) { + if (EFLAG) { + ev_global[0] = oevdwl; + ev_global[1] = oecoul; + } + if (vflag) { + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; + } + } + #ifdef __MIC__ + *timer_compute = MIC_Wtime() - *timer_compute; + #endif + } // end of offload region + + if (offload) + fix->stop_watch(TIME_OFFLOAD_LATENCY); + else + fix->stop_watch(TIME_HOST_PAIR); + + if (EVFLAG) + fix->add_result_array(f_start, ev_global, offload, eatom); + else + fix->add_result_array(f_start, 0, offload); +} + +/* ---------------------------------------------------------------------- */ + +/* init_style: run the base-class setup, flag the most recent neighbor request as an intel list, look up the "package_intel" fix (fatal error when it is missing) and pack the force constants into the buffers that match the fix's precision mode. */ void PairLJCutCoulLongIntel::init_style() +{ + PairLJCutCoulLong::init_style(); + neighbor->requests[neighbor->nrequest-1]->intel = 1; + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast<FixIntel *>(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + fix->set_offload_affinity(); + _cop = fix->coprocessor_number(); + #endif + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fix->get_mixed_buffers()->free_all_nbor_buffers(); + pack_force_const(force_const_single, fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fix->get_double_buffers()->free_all_nbor_buffers(); + pack_force_const(force_const_double, fix->get_double_buffers()); + } else { + fix->get_single_buffers()->free_all_nbor_buffers(); + pack_force_const(force_const_single, fix->get_single_buffers()); + } +} + +/* pack_force_const: size fc for ntypes+1 types and 2^ncoultablebits table entries, recompute cutsq/cutneighsq for every type pair whose coefficients are set, then copy g_ewald, tabinnersq, the special-bond factors, the per-pair LJ data and (when ncoultablebits is non-zero) the Coulomb tables into fc. In offload builds with _cop >= 0 the packed arrays are also transferred to the coprocessor. */ template <class flt_t, class acc_t> +void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, + IntelBuffers<flt_t,acc_t> *buffers) +{ + int tp1 = atom->ntypes + 1; + int ntable = 1; + if (ncoultablebits) + for (int i = 0; i < 
ncoultablebits; i++) ntable *= 2; + + fc.set_ntypes(tp1, ntable, memory, _cop); + buffers->set_ntypes(tp1); + flt_t **cutneighsq = buffers->get_cutneighsq(); + + // Repeat cutsq calculation because done after call to init_style + double cut, cutneigh; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i, j); + cutneigh = cut + neighbor->skin; + cutsq[i][j] = cutsq[j][i] = cut*cut; + cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; + } + } + } + + fc.g_ewald = force->kspace->g_ewald; + fc.tabinnersq = tabinnersq; + + for (int i = 0; i < 4; i++) { + fc.special_lj[i] = force->special_lj[i]; + fc.special_coul[i] = force->special_coul[i]; + } + fc.special_coul[0] = 1.0; /* entry 0 is always forced to 1.0; set once, after the copy loop */ + fc.special_lj[0] = 1.0; + + for (int i = 0; i < tp1; i++) { + for (int j = 0; j < tp1; j++) { + fc.c_force[i][j].cutsq = cutsq[i][j]; + fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j]; + fc.c_force[i][j].lj1 = lj1[i][j]; + fc.c_force[i][j].lj2 = lj2[i][j]; + fc.c_energy[i][j].lj3 = lj3[i][j]; + fc.c_energy[i][j].lj4 = lj4[i][j]; + fc.c_energy[i][j].offset = offset[i][j]; + } + } + + if (ncoultablebits) { + for (int i = 0; i < ntable; i++) { + fc.table[i].r = rtable[i]; + fc.table[i].dr = drtable[i]; + fc.table[i].f = ftable[i]; + fc.table[i].df = dftable[i]; + fc.etable[i] = etable[i]; + fc.detable[i] = detable[i]; + fc.ctable[i] = ctable[i]; + fc.dctable[i] = dctable[i]; + } + } + + #ifdef _LMP_INTEL_OFFLOAD + if (_cop < 0) return; + flt_t * special_lj = fc.special_lj; + flt_t * special_coul = fc.special_coul; + C_FORCE_T * c_force = fc.c_force[0]; + C_ENERGY_T * c_energy = fc.c_energy[0]; + TABLE_T * table = fc.table; + flt_t * etable = fc.etable; + flt_t * detable = fc.detable; + flt_t * ctable = fc.ctable; + flt_t * dctable = fc.dctable; + flt_t * ocutneighsq = cutneighsq[0]; + int tp1sq = tp1 * tp1; + #pragma offload_transfer target(mic:_cop) \ + in(special_lj, 
special_coul: length(4) alloc_if(0) free_if(0)) \ + in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \ + in(table: length(ntable) alloc_if(0) free_if(0)) \ + in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \ + in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) + #endif +} + +/* ---------------------------------------------------------------------- */ + +/* set_ntypes: (re)allocate the per-type and table arrays when ntypes or ntable differ from the stored sizes. Arrays from a previous call are released first, so ntypes == 0 simply frees everything. The sizes and the Memory pointer are recorded on every call. */ template <class flt_t> +void PairLJCutCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, + const int ntable, + Memory *memory, + const int cop) { + if ( (ntypes != _ntypes || ntable != _ntable) ) { + if (_ntypes > 0) { + #ifdef _LMP_INTEL_OFFLOAD + flt_t * ospecial_lj = special_lj; + flt_t * ospecial_coul = special_coul; + c_force_t * oc_force = c_force[0]; + c_energy_t * oc_energy = c_energy[0]; + table_t * otable = table; + flt_t * oetable = etable; + flt_t * odetable = detable; + flt_t * octable = ctable; + flt_t * odctable = dctable; + /* guard and target both use _cop, the device recorded when these buffers were allocated */ if (ospecial_lj != NULL && oc_force != NULL && + oc_energy != NULL && otable != NULL && oetable != NULL && + odetable != NULL && octable != NULL && odctable != NULL && + ospecial_coul != NULL && _cop >= 0) { + #pragma offload_transfer target(mic:_cop) \ + nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \ + nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \ + nocopy(otable: alloc_if(0) free_if(1)) \ + nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1)) + } + #endif + + _memory->destroy(c_force); + _memory->destroy(c_energy); + _memory->destroy(table); + _memory->destroy(etable); + _memory->destroy(detable); + _memory->destroy(ctable); + _memory->destroy(dctable); + } + if (ntypes > 0) { + _cop = cop; + memory->create(c_force,ntypes,ntypes,"fc.c_force"); + memory->create(c_energy,ntypes,ntypes,"fc.c_energy"); + memory->create(table,ntable,"pair:fc.table"); + memory->create(etable,ntable,"pair:fc.etable"); + memory->create(detable,ntable,"pair:fc.detable"); + memory->create(ctable,ntable,"pair:fc.ctable"); + 
memory->create(dctable,ntable,"pair:fc.dctable"); + + #ifdef _LMP_INTEL_OFFLOAD + flt_t * ospecial_lj = special_lj; + flt_t * ospecial_coul = special_coul; + c_force_t * oc_force = c_force[0]; + c_energy_t * oc_energy = c_energy[0]; + table_t * otable = table; + flt_t * oetable = etable; + flt_t * odetable = detable; + flt_t * octable = ctable; + flt_t * odctable = dctable; + int tp1sq = ntypes*ntypes; + if (ospecial_lj != NULL && oc_force != NULL && + oc_energy != NULL && otable !=NULL && oetable != NULL && + odetable != NULL && octable != NULL && odctable != NULL && + ospecial_coul != NULL && cop >= 0) { + #pragma offload_transfer target(mic:cop) \ + nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \ + nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \ + nocopy(oc_force: length(tp1sq) alloc_if(1) free_if(0)) \ + nocopy(oc_energy: length(tp1sq) alloc_if(1) free_if(0)) \ + nocopy(otable: length(ntable) alloc_if(1) free_if(0)) \ + nocopy(oetable,odetable: length(ntable) alloc_if(1) free_if(0)) \ + nocopy(octable,odctable: length(ntable) alloc_if(1) free_if(0)) + } + #endif + } + } + _ntypes=ntypes; + _ntable=ntable; + _memory=memory; +} diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h new file mode 100644 index 0000000000..d7b4282a99 --- /dev/null +++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h @@ -0,0 +1,100 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/cut/coul/long/intel,PairLJCutCoulLongIntel) + +#else + +#ifndef LMP_PAIR_LJ_CUT_COUL_LONG_INTEL_H +#define LMP_PAIR_LJ_CUT_COUL_LONG_INTEL_H + +#include "pair_lj_cut_coul_long.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + +class PairLJCutCoulLongIntel : public PairLJCutCoulLong { + + public: + PairLJCutCoulLongIntel(class LAMMPS *); + virtual ~PairLJCutCoulLongIntel(); + + virtual void compute(int, int); + void init_style(); + + typedef struct { float x,y,z; int w; } sng4_t; + + private: + FixIntel *fix; + int _cop; + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int offload, const int vflag, + IntelBuffers * buffers, + const ForceConst &fc, const int astart, const int aend); + + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + // ---------------------------------------------------------------------- + template + class ForceConst { + public: + typedef struct { flt_t cutsq, cut_ljsq, lj1, lj2; } c_force_t; + typedef struct { flt_t lj3, lj4, offset, pad; } c_energy_t; + typedef struct { flt_t r, dr, f, df; } table_t; + __declspec(align(64)) flt_t special_coul[4]; + __declspec(align(64)) flt_t special_lj[4]; + flt_t g_ewald, tabinnersq; + c_force_t **c_force; + c_energy_t **c_energy; + table_t *table; + flt_t *etable, *detable, *ctable, *dctable; + + ForceConst() : _ntypes(0), _ntable(0) {} + ~ForceConst() { set_ntypes(0,0,NULL,_cop); } + + void set_ntypes(const int ntypes, const int ntable, Memory *memory, + const int cop); + + private: + int _ntypes, _ntable, _cop; + Memory *_memory; + }; + ForceConst 
force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: The 'package intel' command is required for /intel styles + +Self-explanatory. + +*/ diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp new file mode 100644 index 0000000000..bca3a73493 --- /dev/null +++ b/src/USER-INTEL/pair_lj_cut_intel.cpp @@ -0,0 +1,412 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + This software is distributed under the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "math.h" +#include "pair_lj_cut_intel.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "memory.h" +#include "modify.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" + +#include "suffix.h" +using namespace LAMMPS_NS; + +#define FC_PACKED1_T typename ForceConst::fc_packed1 +#define FC_PACKED2_T typename ForceConst::fc_packed2 + +/* ---------------------------------------------------------------------- */ + +PairLJCutIntel::PairLJCutIntel(LAMMPS *lmp) : + PairLJCut(lmp), fix(NULL), _cop(-1) // fix is looked up in init_style(); _cop is never assigned elsewhere in this file, so give it a defined "no coprocessor" value before pack_force_const() reads it +{ + suffix_flag |= Suffix::INTEL; + respa_enable = 0; + cut_respa = NULL; +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutIntel::compute(int eflag, int vflag) +{ + if (fix->precision() == FixIntel::PREC_MODE_MIXED) + compute(eflag, vflag, fix->get_mixed_buffers(), + force_const_single); + else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+ compute(eflag, vflag, fix->get_double_buffers(), + force_const_double); + else + compute(eflag, vflag, fix->get_single_buffers(), + force_const_single); + + fix->balance_stamp(); + vflag_fdotr = 0; +} + +template +void PairLJCutIntel::compute(int eflag, int vflag, + IntelBuffers *buffers, + const ForceConst &fc) +{ + if (eflag || vflag) { + ev_setup(eflag, vflag); + } else evflag = vflag_fdotr = 0; + + const int inum = list->inum; + const int nthreads = comm->nthreads; + const int host_start = fix->host_start_pair(); + const int offload_end = fix->offload_end_pair(); + const int ago = neighbor->ago; + + if (ago != 0 && fix->separate_buffers() == 0) { + fix->start_watch(TIME_PACK); + if (ago != 0) { + #if defined(_OPENMP) + #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, + nthreads, sizeof(ATOM_T)); + buffers->thr_pack(ifrom,ito,ago); + } + } + fix->stop_watch(TIME_PACK); + } + + if (evflag || vflag_fdotr) { + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + if (force->newton_pair) { + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); + } + } else { + if (force->newton_pair) { + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); + } else { + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); + } + } + } else { + if (force->newton_pair) { + eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); + eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + } else { + eval<0,0,0>(1, 0, buffers, fc, 0, offload_end); + eval<0,0,0>(0, 0, buffers, fc, host_start, inum); + } + } +} + +template +void 
PairLJCutIntel::eval(const int offload, const int vflag, + IntelBuffers *buffers, + const ForceConst &fc, + const int astart, const int aend) +{ + const int inum = aend - astart; + if (inum == 0) return; + int nlocal, nall, minlocal; + fix->get_buffern(offload, nlocal, nall, minlocal); + + const int ago = neighbor->ago; + IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall); + + ATOM_T * restrict const x = buffers->get_x(offload); + + const int * restrict const numneigh = list->numneigh; + const int * restrict const cnumneigh = buffers->cnumneigh(list); + const int * restrict const firstneigh = buffers->firstneigh(list); + const flt_t * restrict const special_lj = fc.special_lj; + const FC_PACKED1_T * restrict const ljc12o = fc.ljc12o[0]; + const FC_PACKED2_T * restrict const lj34 = fc.lj34[0]; + + const int ntypes = atom->ntypes + 1; + const int eatom = this->eflag_atom; + + // Determine how much data to transfer + int x_size, q_size, f_stride, ev_size, separate_flag; + IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); + + int tc; + FORCE_T * restrict f_start; + acc_t * restrict ev_global; + IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); + const int nthreads = tc; + int *overflow = fix->get_off_overflow_flag(); + { + #ifdef __MIC__ + *timer_compute = MIC_Wtime(); + #endif + + IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, + f_stride, x, 0); + + acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; + if (EVFLAG) { + oevdwl = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + } + + // loop over neighbors of my atoms + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(f_start,f_stride,nlocal,nall,minlocal) \ + reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int iifrom, iito, tid; + IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + iifrom += astart; + iito += astart; + + 
FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride); + memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + + for (int i = iifrom; i < iito; ++i) { + const int itype = x[i].w; + + const int ptr_off = itype * ntypes; + const FC_PACKED1_T * restrict const ljc12oi = ljc12o + ptr_off; + const FC_PACKED2_T * restrict const lj34i = lj34 + ptr_off; + + const int * restrict const jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + + acc_t fxtmp, fytmp, fztmp, fwtmp; + acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; + + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + fxtmp = fytmp = fztmp = (acc_t)0; + if (EVFLAG) { + if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + } + + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) + for (int jj = 0; jj < jnum; jj++) { + flt_t forcelj, evdwl; + forcelj = evdwl = (flt_t)0.0; + + const int sbindex = jlist[jj] >> SBBITS & 3; + const int j = jlist[jj] & NEIGHMASK; + const flt_t delx = xtmp - x[j].x; + const flt_t dely = ytmp - x[j].y; + const flt_t delz = ztmp - x[j].z; + const int jtype = x[j].w; + const flt_t rsq = delx * delx + dely * dely + delz * delz; + + #ifdef __MIC__ + if (rsq < ljc12oi[jtype].cutsq) { + #endif + flt_t factor_lj = special_lj[sbindex]; + flt_t r2inv = 1.0 / rsq; + flt_t r6inv = r2inv * r2inv * r2inv; + #ifndef __MIC__ + if (rsq > ljc12oi[jtype].cutsq) r6inv = (flt_t)0.0; + #endif + forcelj = r6inv * (ljc12oi[jtype].lj1 * r6inv - ljc12oi[jtype].lj2); + flt_t fpair = factor_lj * forcelj * r2inv; + + fxtmp += delx * fpair; + fytmp += dely * fpair; + fztmp += delz * fpair; + if (NEWTON_PAIR || j < nlocal) { + f[j].x -= delx * fpair; + f[j].y -= dely * fpair; + f[j].z -= delz * fpair; + } + + if (EVFLAG) { + flt_t ev_pre = (flt_t)0; + if (NEWTON_PAIR || istop_watch(TIME_OFFLOAD_LATENCY); + else + fix->stop_watch(TIME_HOST_PAIR); + 
+ if (EVFLAG) + fix->add_result_array(f_start, ev_global, offload, eatom); + else + fix->add_result_array(f_start, 0, offload); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutIntel::init_style() +{ + PairLJCut::init_style(); + neighbor->requests[neighbor->nrequest-1]->intel = 1; + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + if (fix->offload_balance() != 0.0) + error->all(FLERR, + "Offload for lj/cut/intel is not yet available. Set balance to 0."); + #endif + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fix->get_mixed_buffers()->free_all_nbor_buffers(); + pack_force_const(force_const_single, fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fix->get_double_buffers()->free_all_nbor_buffers(); + pack_force_const(force_const_double, fix->get_double_buffers()); + } else { + fix->get_single_buffers()->free_all_nbor_buffers(); + pack_force_const(force_const_single, fix->get_single_buffers()); + } +} + +/* ---------------------------------------------------------------------- */ + +template +void PairLJCutIntel::pack_force_const(ForceConst &fc, + IntelBuffers *buffers) +{ + int tp1 = atom->ntypes + 1; + fc.set_ntypes(tp1,memory,_cop); + buffers->set_ntypes(tp1); + flt_t **cutneighsq = buffers->get_cutneighsq(); + + // Repeat cutsq calculation because done after call to init_style + double cut, cutneigh; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cutneigh = cut + neighbor->skin; + cutsq[i][j] = cutsq[j][i] = cut*cut; + cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; + } + } + } + + for (int i = 0; i < 4; i++) { + fc.special_lj[i] = 
force->special_lj[i]; + fc.special_lj[0] = 1.0; + } + + for (int i = 0; i < tp1; i++) { + for (int j = 0; j < tp1; j++) { + fc.ljc12o[i][j].lj1 = lj1[i][j]; + fc.ljc12o[i][j].lj2 = lj2[i][j]; + fc.lj34[i][j].lj3 = lj3[i][j]; + fc.lj34[i][j].lj4 = lj4[i][j]; + fc.ljc12o[i][j].cutsq = cutsq[i][j]; + fc.ljc12o[i][j].offset = offset[i][j]; + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +void PairLJCutIntel::ForceConst::set_ntypes(const int ntypes, + Memory *memory, + const int cop) { + if (ntypes != _ntypes) { + if (_ntypes > 0) { + // release through the 2-D handles that create() filled in, so the + // row-pointer arrays are freed too and not only the data at [0] + + _memory->destroy(ljc12o); + _memory->destroy(lj34); + } + if (ntypes > 0) { + _cop = cop; + memory->create(ljc12o,ntypes,ntypes,"fc.c12o"); + memory->create(lj34,ntypes,ntypes,"fc.lj34"); + } + } + _ntypes = ntypes; + _memory = memory; +} diff --git a/src/USER-INTEL/pair_lj_cut_intel.h b/src/USER-INTEL/pair_lj_cut_intel.h new file mode 100644 index 0000000000..a40e39af56 --- /dev/null +++ b/src/USER-INTEL/pair_lj_cut_intel.h @@ -0,0 +1,93 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W.
Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/cut/intel,PairLJCutIntel) + +#else + +#ifndef LMP_PAIR_LJ_CUT_INTEL_H +#define LMP_PAIR_LJ_CUT_INTEL_H + +#include "pair_lj_cut.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + +class PairLJCutIntel : public PairLJCut { + + public: + PairLJCutIntel(class LAMMPS *); + + virtual void compute(int, int); + void init_style(); + + private: + FixIntel *fix; + int _cop; + + template class ForceConst; + template + void compute(int eflag, int vflag, IntelBuffers *buffers, + const ForceConst &fc); + template + void eval(const int offload, const int vflag, + IntelBuffers * buffers, + const ForceConst &fc, const int astart, const int aend); + + template + void pack_force_const(ForceConst &fc, + IntelBuffers *buffers); + + // ---------------------------------------------------------------------- + + template + class ForceConst { + public: + typedef struct { flt_t cutsq, lj1, lj2, offset; } fc_packed1; + typedef struct { flt_t lj3, lj4; } fc_packed2; + + __declspec(align(64)) flt_t special_lj[4]; + fc_packed1 **ljc12o; + fc_packed2 **lj34; + + // start with nothing allocated, no coprocessor (negative id) and no allocator, so the destructor never reads an indeterminate _cop + ForceConst() : _ntypes(0), _cop(-1), _memory(NULL) {} + ~ForceConst() { set_ntypes(0, NULL, _cop); } + + void set_ntypes(const int ntypes, Memory *memory, const int cop); + + private: + int _ntypes, _cop; + Memory *_memory; + }; + ForceConst force_const_single; + ForceConst force_const_double; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: The 'package intel' command is required for /intel styles + +Self-explanatory.
+ +*/ diff --git a/src/USER-INTEL/verlet_intel.cpp b/src/USER-INTEL/verlet_intel.cpp new file mode 100644 index 0000000000..64177e0f05 --- /dev/null +++ b/src/USER-INTEL/verlet_intel.cpp @@ -0,0 +1,486 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "string.h" +#include "verlet_intel.h" +#include "neighbor.h" +#include "domain.h" +#include "comm.h" +#include "atom.h" +#include "force.h" +#include "pair.h" +#include "bond.h" +#include "angle.h" +#include "dihedral.h" +#include "improper.h" +#include "kspace.h" +#include "output.h" +#include "update.h" +#include "modify.h" +#include "compute.h" +#include "fix.h" +#include "timer.h" +#include "memory.h" +#include "error.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +VerletIntel::VerletIntel(LAMMPS *lmp, int narg, char **arg) : + Integrate(lmp, narg, arg) {} + +/* ---------------------------------------------------------------------- + initialization before run +------------------------------------------------------------------------- */ + +void VerletIntel::init() +{ + Integrate::init(); + + // warn if no fixes + + if (modify->nfix == 0 && comm->me == 0) + error->warning(FLERR,"No fixes defined, atoms won't move"); + + // virial_style: + // 1 if computed explicitly by pair->compute via sum over pair interactions + // 2 if computed implicitly by pair->virial_fdotr_compute 
via sum over ghosts + + if (force->newton_pair) virial_style = 2; + else virial_style = 1; + + // setup lists of computes for global and per-atom PE and pressure + + ev_setup(); + + // detect if fix omp is present for clearing force arrays + + int ifix = modify->find_fix("package_omp"); + if (ifix >= 0) external_force_clear = 1; + + if (nvlist_atom) + error->all(FLERR, + "Cannot currently get per-atom virials with Intel package."); + #ifdef _LMP_INTEL_OFFLOAD + ifix = modify->find_fix("package_intel"); + if (ifix >= 0) fix_intel = static_cast(modify->fix[ifix]); + else fix_intel = 0; + #endif + + // set flags for what arrays to clear in force_clear() + // need to clear additionals arrays if they exist + + torqueflag = 0; + if (atom->torque_flag) torqueflag = 1; + erforceflag = 0; + if (atom->erforce_flag) erforceflag = 1; + e_flag = 0; + if (atom->e_flag) e_flag = 1; + rho_flag = 0; + if (atom->rho_flag) rho_flag = 1; + + // orthogonal vs triclinic simulation box + + triclinic = domain->triclinic; +} + +/* ---------------------------------------------------------------------- + setup before run +------------------------------------------------------------------------- */ + +void VerletIntel::setup() +{ + if (comm->me == 0 && screen) fprintf(screen,"Setting up run ...\n"); + + update->setupflag = 1; + + // setup domain, communication and neighboring + // acquire ghosts + // build neighbor lists + + atom->setup(); + modify->setup_pre_exchange(); + if (triclinic) domain->x2lamda(atom->nlocal); + domain->pbc(); + domain->reset_box(); + comm->setup(); + if (neighbor->style) neighbor->setup_bins(); + comm->exchange(); + if (atom->sortfreq > 0) atom->sort(); + comm->borders(); + if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost); + domain->image_check(); + domain->box_too_small_check(); + modify->setup_pre_neighbor(); + neighbor->build(); + neighbor->ncalls = 0; + + // compute all forces + + ev_set(update->ntimestep); + force_clear(); + 
modify->setup_pre_force(vflag); + + if (pair_compute_flag) force->pair->compute(eflag,vflag); + else if (force->pair) force->pair->compute_dummy(eflag,vflag); + + if (atom->molecular) { + if (force->bond) force->bond->compute(eflag,vflag); + if (force->angle) force->angle->compute(eflag,vflag); + if (force->dihedral) force->dihedral->compute(eflag,vflag); + if (force->improper) force->improper->compute(eflag,vflag); + } + + if (force->kspace) { + force->kspace->setup(); + if (kspace_compute_flag) force->kspace->compute(eflag,vflag); + else force->kspace->compute_dummy(eflag,vflag); + } + + #ifdef _LMP_INTEL_OFFLOAD + sync_mode = 0; + if (fix_intel) { + if (fix_intel->offload_balance() != 0.0) { + if (fix_intel->offload_noghost()) + sync_mode = 2; + else + sync_mode = 1; + } + } + + if (sync_mode == 1) fix_intel->sync_coprocessor(); + #endif + + if (force->newton) comm->reverse_comm(); + + #ifdef _LMP_INTEL_OFFLOAD + if (sync_mode == 2) fix_intel->sync_coprocessor(); + #endif + + modify->setup(vflag); + output->setup(); + update->setupflag = 0; +} + +/* ---------------------------------------------------------------------- + setup without output + flag = 0 = just force calculation + flag = 1 = reneighbor and force calculation +------------------------------------------------------------------------- */ + +void VerletIntel::setup_minimal(int flag) +{ + update->setupflag = 1; + + // setup domain, communication and neighboring + // acquire ghosts + // build neighbor lists + + if (flag) { + modify->setup_pre_exchange(); + if (triclinic) domain->x2lamda(atom->nlocal); + domain->pbc(); + domain->reset_box(); + comm->setup(); + if (neighbor->style) neighbor->setup_bins(); + comm->exchange(); + comm->borders(); + if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost); + domain->image_check(); + domain->box_too_small_check(); + modify->setup_pre_neighbor(); + neighbor->build(); + neighbor->ncalls = 0; + } + + // compute all forces + + ev_set(update->ntimestep); + 
force_clear(); + modify->setup_pre_force(vflag); + + if (pair_compute_flag) force->pair->compute(eflag,vflag); + else if (force->pair) force->pair->compute_dummy(eflag,vflag); + + if (atom->molecular) { + if (force->bond) force->bond->compute(eflag,vflag); + if (force->angle) force->angle->compute(eflag,vflag); + if (force->dihedral) force->dihedral->compute(eflag,vflag); + if (force->improper) force->improper->compute(eflag,vflag); + } + + if (force->kspace) { + force->kspace->setup(); + if (kspace_compute_flag) force->kspace->compute(eflag,vflag); + else force->kspace->compute_dummy(eflag,vflag); + } + + #ifdef _LMP_INTEL_OFFLOAD + sync_mode = 0; + if (fix_intel) { + if (fix_intel->offload_balance() != 0.0) { + if (fix_intel->offload_noghost()) + sync_mode = 2; + else + sync_mode = 1; + } + } + + if (sync_mode == 1) fix_intel->sync_coprocessor(); + #endif + + if (force->newton) comm->reverse_comm(); + + #ifdef _LMP_INTEL_OFFLOAD + if (sync_mode == 2) fix_intel->sync_coprocessor(); + #endif + + modify->setup(vflag); + update->setupflag = 0; +} + +/* ---------------------------------------------------------------------- + run for N steps +------------------------------------------------------------------------- */ + +void VerletIntel::run(int n) +{ + bigint ntimestep; + int nflag,sortflag; + + int n_post_integrate = modify->n_post_integrate; + int n_pre_exchange = modify->n_pre_exchange; + int n_pre_neighbor = modify->n_pre_neighbor; + int n_pre_force = modify->n_pre_force; + int n_post_force = modify->n_post_force; + int n_end_of_step = modify->n_end_of_step; + + if (atom->sortfreq > 0) sortflag = 1; + else sortflag = 0; + + for (int i = 0; i < n; i++) { + + ntimestep = ++update->ntimestep; + ev_set(ntimestep); + + // initial time integration + + modify->initial_integrate(vflag); + if (n_post_integrate) modify->post_integrate(); + + // regular communication vs neighbor list rebuild + + nflag = neighbor->decide(); + + if (nflag == 0) { + timer->stamp(); + 
comm->forward_comm(); + timer->stamp(TIME_COMM); + } else { + if (n_pre_exchange) modify->pre_exchange(); + if (triclinic) domain->x2lamda(atom->nlocal); + domain->pbc(); + if (domain->box_change) { + domain->reset_box(); + comm->setup(); + if (neighbor->style) neighbor->setup_bins(); + } + timer->stamp(); + comm->exchange(); + if (sortflag && ntimestep >= atom->nextsort) atom->sort(); + comm->borders(); + if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost); + timer->stamp(TIME_COMM); + if (n_pre_neighbor) modify->pre_neighbor(); + neighbor->build(); + timer->stamp(TIME_NEIGHBOR); + } + + // force computations + // important for pair to come before bonded contributions + // since some bonded potentials tally pairwise energy/virial + // and Pair:ev_tally() needs to be called before any tallying + + force_clear(); + if (n_pre_force) modify->pre_force(vflag); + + timer->stamp(); + + if (pair_compute_flag) { + force->pair->compute(eflag,vflag); + timer->stamp(TIME_PAIR); + } + + if (atom->molecular) { + if (force->bond) force->bond->compute(eflag,vflag); + if (force->angle) force->angle->compute(eflag,vflag); + if (force->dihedral) force->dihedral->compute(eflag,vflag); + if (force->improper) force->improper->compute(eflag,vflag); + timer->stamp(TIME_BOND); + } + + if (kspace_compute_flag) { + force->kspace->compute(eflag,vflag); + timer->stamp(TIME_KSPACE); + } + + #ifdef _LMP_INTEL_OFFLOAD + if (sync_mode == 1) { + fix_intel->sync_coprocessor(); + timer->stamp(TIME_PAIR); + } + #endif + + // reverse communication of forces + + if (force->newton) { + comm->reverse_comm(); + timer->stamp(TIME_COMM); + } + + #ifdef _LMP_INTEL_OFFLOAD + if (sync_mode == 2) { + fix_intel->sync_coprocessor(); + timer->stamp(TIME_PAIR); + } + #endif + + // force modifications, final time integration, diagnostics + + if (n_post_force) modify->post_force(vflag); + modify->final_integrate(); + if (n_end_of_step) modify->end_of_step(); + + // all output + + if (ntimestep == output->next) 
{ + timer->stamp(); + output->write(ntimestep); + timer->stamp(TIME_OUTPUT); + } + } +} + +/* ---------------------------------------------------------------------- */ + +void VerletIntel::cleanup() +{ + modify->post_run(); + domain->box_too_small_check(); + update->update_time(); +} + +/* ---------------------------------------------------------------------- + clear force on own & ghost atoms + clear other arrays as needed +------------------------------------------------------------------------- */ + +void VerletIntel::force_clear() +{ + int i; + + if (external_force_clear) return; + + // clear force on all particles + // if either newton flag is set, also include ghosts + // when using threads always clear all forces. + + if (neighbor->includegroup == 0) { + int nall; + if (force->newton) nall = atom->nlocal + atom->nghost; + else nall = atom->nlocal; + + size_t nbytes = sizeof(double) * nall; + + if (nbytes) { + memset(&(atom->f[0][0]),0,3*nbytes); + if (torqueflag) memset(&(atom->torque[0][0]),0,3*nbytes); + if (erforceflag) memset(&(atom->erforce[0]), 0, nbytes); + if (e_flag) memset(&(atom->de[0]), 0, nbytes); + if (rho_flag) memset(&(atom->drho[0]), 0, nbytes); + } + + // neighbor includegroup flag is set + // clear force only on initial nfirst particles + // if either newton flag is set, also include ghosts + + } else { + int nall = atom->nfirst; + + double **f = atom->f; + for (i = 0; i < nall; i++) { + f[i][0] = 0.0; + f[i][1] = 0.0; + f[i][2] = 0.0; + } + + if (torqueflag) { + double **torque = atom->torque; + for (i = 0; i < nall; i++) { + torque[i][0] = 0.0; + torque[i][1] = 0.0; + torque[i][2] = 0.0; + } + } + + if (erforceflag) { + double *erforce = atom->erforce; + for (i = 0; i < nall; i++) erforce[i] = 0.0; + } + + if (e_flag) { + double *de = atom->de; + for (i = 0; i < nall; i++) de[i] = 0.0; + } + + if (rho_flag) { + double *drho = atom->drho; + for (i = 0; i < nall; i++) drho[i] = 0.0; + } + + if (force->newton) { + nall = atom->nlocal + 
atom->nghost; + + for (i = atom->nlocal; i < nall; i++) { + f[i][0] = 0.0; + f[i][1] = 0.0; + f[i][2] = 0.0; + } + + if (torqueflag) { + double **torque = atom->torque; + for (i = atom->nlocal; i < nall; i++) { + torque[i][0] = 0.0; + torque[i][1] = 0.0; + torque[i][2] = 0.0; + } + } + + if (erforceflag) { + double *erforce = atom->erforce; + for (i = atom->nlocal; i < nall; i++) erforce[i] = 0.0; + } + + if (e_flag) { + double *de = atom->de; + for (i = atom->nlocal; i < nall; i++) de[i] = 0.0; // ghosts only, same range as f/torque/erforce above + } + + if (rho_flag) { + double *drho = atom->drho; + for (i = atom->nlocal; i < nall; i++) drho[i] = 0.0; // ghosts only, same range as f/torque/erforce above + } + } + } +} diff --git a/src/USER-INTEL/verlet_intel.h b/src/USER-INTEL/verlet_intel.h new file mode 100644 index 0000000000..de4231431d --- /dev/null +++ b/src/USER-INTEL/verlet_intel.h @@ -0,0 +1,68 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */ + +#ifdef INTEGRATE_CLASS + +IntegrateStyle(verlet/intel,VerletIntel) + +#else + +#ifndef LMP_VERLET_INTEL_H +#define LMP_VERLET_INTEL_H + +#include "integrate.h" +#ifdef LMP_INTEL_OFFLOAD +#include "fix_intel.h" +#endif + +namespace LAMMPS_NS { + +class VerletIntel : public Integrate { + public: + VerletIntel(class LAMMPS *, int, char **); + virtual ~VerletIntel() {} + virtual void init(); + virtual void setup(); + virtual void setup_minimal(int); + virtual void run(int); + void cleanup(); + + protected: + int triclinic; // 0 if domain is orthog, 1 if triclinic + int torqueflag,erforceflag; // 1 if force_clear() must also zero atom->torque / atom->erforce + int e_flag,rho_flag; // 1 if force_clear() must also zero atom->de / atom->drho + + virtual void force_clear(); + #ifdef _LMP_INTEL_OFFLOAD + FixIntel *fix_intel; // the "package_intel" fix found in init(), or 0 if none is defined + int sync_mode; // 0 = no coprocessor sync, 1 = sync before reverse_comm, 2 = sync after reverse_comm + #endif +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +W: No fixes defined, atoms won't move + +If you are not using a fix like nve, nvt, npt then atom velocities and +coordinates will not be updated during timestepping. + +E: Cannot currently get per-atom virials with Intel package. + +The Intel package does not yet support per-atom virial calculation. + +*/ diff --git a/src/USER-INTEL/verlet_split_intel.cpp b/src/USER-INTEL/verlet_split_intel.cpp new file mode 100644 index 0000000000..3976607b18 --- /dev/null +++ b/src/USER-INTEL/verlet_split_intel.cpp @@ -0,0 +1,589 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Yuxing Peng and Chris Knight (U Chicago) +------------------------------------------------------------------------- */ + +#include "string.h" +#include "verlet_split_intel.h" +#include "universe.h" +#include "neighbor.h" +#include "domain.h" +#include "comm.h" +#include "atom.h" +#include "atom_vec.h" +#include "force.h" +#include "pair.h" +#include "bond.h" +#include "angle.h" +#include "dihedral.h" +#include "improper.h" +#include "kspace.h" +#include "output.h" +#include "update.h" +#include "fix.h" +#include "modify.h" +#include "timer.h" +#include "memory.h" +#include "error.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +VerletSplitIntel::VerletSplitIntel(LAMMPS *lmp, int narg, char **arg) : + VerletIntel(lmp, narg, arg) +{ + // error checks on partitions + + if (universe->nworlds != 2) + error->universe_all(FLERR,"Verlet/split requires 2 partitions"); + if (universe->procs_per_world[0] % universe->procs_per_world[1]) + error->universe_all(FLERR,"Verlet/split requires Rspace partition " + "size be multiple of Kspace partition size"); + + // master = 1 for Rspace procs, 0 for Kspace procs + + if (universe->iworld == 0) master = 1; + else master = 0; + + ratio = universe->procs_per_world[0] / universe->procs_per_world[1]; + + // Kspace root proc broadcasts info about Kspace proc layout to Rspace procs + + int kspace_procgrid[3]; + + if (universe->me == universe->root_proc[1]) { + kspace_procgrid[0] = comm->procgrid[0]; + kspace_procgrid[1] = comm->procgrid[1]; + kspace_procgrid[2] = comm->procgrid[2]; + } + MPI_Bcast(kspace_procgrid,3,MPI_INT,universe->root_proc[1],universe->uworld); + + int ***kspace_grid2proc; + memory->create(kspace_grid2proc,kspace_procgrid[0], + kspace_procgrid[1],kspace_procgrid[2], + 
"verlet/split:kspace_grid2proc"); + + if (universe->me == universe->root_proc[1]) { + for (int i = 0; i < comm->procgrid[0]; i++) + for (int j = 0; j < comm->procgrid[1]; j++) + for (int k = 0; k < comm->procgrid[2]; k++) + kspace_grid2proc[i][j][k] = comm->grid2proc[i][j][k]; + } + MPI_Bcast(&kspace_grid2proc[0][0][0], + kspace_procgrid[0]*kspace_procgrid[1]*kspace_procgrid[2],MPI_INT, + universe->root_proc[1],universe->uworld); + + // Rspace partition must be multiple of Kspace partition in each dim + // so atoms of one Kspace proc coincide with atoms of several Rspace procs + + if (master) { + int flag = 0; + if (comm->procgrid[0] % kspace_procgrid[0]) flag = 1; + if (comm->procgrid[1] % kspace_procgrid[1]) flag = 1; + if (comm->procgrid[2] % kspace_procgrid[2]) flag = 1; + if (flag) + error->one(FLERR, + "Verlet/split requires Rspace partition layout be " + "multiple of Kspace partition layout in each dim"); + } + + // block = 1 Kspace proc with set of Rspace procs it overlays + // me_block = 0 for Kspace proc + // me_block = 1 to ratio for Rspace procs + // block = MPI communicator for that set of procs + + int iblock,key; + + if (!master) { + iblock = comm->me; + key = 0; + } else { + int kpx = comm->myloc[0] / (comm->procgrid[0]/kspace_procgrid[0]); + int kpy = comm->myloc[1] / (comm->procgrid[1]/kspace_procgrid[1]); + int kpz = comm->myloc[2] / (comm->procgrid[2]/kspace_procgrid[2]); + iblock = kspace_grid2proc[kpx][kpy][kpz]; + key = 1; + } + + MPI_Comm_split(universe->uworld,iblock,key,&block); + MPI_Comm_rank(block,&me_block); + + // output block groupings to universe screen/logfile + // bmap is ordered by block and then by proc within block + + int *bmap = new int[universe->nprocs]; + for (int i = 0; i < universe->nprocs; i++) bmap[i] = -1; + bmap[iblock*(ratio+1)+me_block] = universe->me; + + int *bmapall = new int[universe->nprocs]; + MPI_Allreduce(bmap,bmapall,universe->nprocs,MPI_INT,MPI_MAX,universe->uworld); + + if (universe->me == 0) { + if 
(universe->uscreen) { + fprintf(universe->uscreen, + "Per-block Rspace/Kspace proc IDs (original proc IDs):\n"); + int m = 0; + for (int i = 0; i < universe->nprocs/(ratio+1); i++) { + fprintf(universe->uscreen," block %d:",i); + int kspace_proc = bmapall[m]; + for (int j = 1; j <= ratio; j++) + fprintf(universe->uscreen," %d",bmapall[m+j]); + fprintf(universe->uscreen," %d",kspace_proc); + kspace_proc = bmapall[m]; + for (int j = 1; j <= ratio; j++) { + if (j == 1) fprintf(universe->uscreen," ("); + else fprintf(universe->uscreen," "); + fprintf(universe->uscreen,"%d", + universe->uni2orig[bmapall[m+j]]); + } + fprintf(universe->uscreen," %d)\n",universe->uni2orig[kspace_proc]); + m += ratio + 1; + } + } + if (universe->ulogfile) { + fprintf(universe->ulogfile, + "Per-block Rspace/Kspace proc IDs (original proc IDs):\n"); + int m = 0; + for (int i = 0; i < universe->nprocs/(ratio+1); i++) { + fprintf(universe->ulogfile," block %d:",i); + int kspace_proc = bmapall[m]; + for (int j = 1; j <= ratio; j++) + fprintf(universe->ulogfile," %d",bmapall[m+j]); + + fprintf(universe->ulogfile," %d",kspace_proc); + kspace_proc = bmapall[m]; + for (int j = 1; j <= ratio; j++) { + if (j == 1) fprintf(universe->ulogfile," ("); + else fprintf(universe->ulogfile," "); + fprintf(universe->ulogfile,"%d", + universe->uni2orig[bmapall[m+j]]); + } + fprintf(universe->ulogfile," %d)\n",universe->uni2orig[kspace_proc]); + m += ratio + 1; + } + } + } + + memory->destroy(kspace_grid2proc); + delete [] bmap; + delete [] bmapall; + + // size/disp = vectors for MPI gather/scatter within block + + qsize = new int[ratio+1]; + qdisp = new int[ratio+1]; + xsize = new int[ratio+1]; + xdisp = new int[ratio+1]; + + // f_kspace = Rspace copy of Kspace forces + // allocate dummy version for Kspace partition + + maxatom = 0; + f_kspace = NULL; + if (!master) memory->create(f_kspace,1,1,"verlet/split:f_kspace"); +} + +/* ---------------------------------------------------------------------- */ + 
+VerletSplitIntel::~VerletSplitIntel() +{ + delete [] qsize; + delete [] qdisp; + delete [] xsize; + delete [] xdisp; + memory->destroy(f_kspace); + MPI_Comm_free(&block); +} + +/* ---------------------------------------------------------------------- + initialization before run +------------------------------------------------------------------------- */ + +void VerletSplitIntel::init() +{ + if (!force->kspace && comm->me == 0) + error->warning(FLERR,"No Kspace calculation with verlet/split"); + + if (force->kspace_match("tip4p",0)) tip4p_flag = 1; + else tip4p_flag = 0; + + // currently TIP4P does not work with verlet/split, so generate error + // see Axel email on this, also other TIP4P notes below + + if (tip4p_flag) error->all(FLERR,"Verlet/split does not yet support TIP4P"); + + VerletIntel::init(); +} + +/* ---------------------------------------------------------------------- + setup before run + servant partition only sets up KSpace calculation +------------------------------------------------------------------------- */ + +void VerletSplitIntel::setup() +{ + if (comm->me == 0 && screen) fprintf(screen,"Setting up run ...\n"); + + if (!master) force->kspace->setup(); + else { + VerletIntel::setup(); + } +} + +/* ---------------------------------------------------------------------- + setup without output + flag = 0 = just force calculation + flag = 1 = reneighbor and force calculation + servant partition only sets up KSpace calculation +------------------------------------------------------------------------- */ + +void VerletSplitIntel::setup_minimal(int flag) +{ + if (!master) force->kspace->setup(); + else { + VerletIntel::setup_minimal(flag); + } +} + +/* ---------------------------------------------------------------------- + run for N steps + master partition does everything but Kspace + servant partition does just Kspace + communicate back and forth every step: + atom coords from master -> servant + kspace forces from servant -> master + also box 
bounds from master -> servant if necessary +------------------------------------------------------------------------- */ + +void VerletSplitIntel::run(int n) +{ + bigint ntimestep; + int nflag,sortflag; + + // sync both partitions before start timer + + MPI_Barrier(universe->uworld); + timer->init(); + timer->barrier_start(TIME_LOOP); + + // setup initial Rspace <-> Kspace comm params + + rk_setup(); + + // check if OpenMP support fix defined + + Fix *fix_omp; + int ifix = modify->find_fix("package_omp"); + if (ifix < 0) fix_omp = NULL; + else fix_omp = modify->fix[ifix]; + + // flags for timestepping iterations + + int n_post_integrate = modify->n_post_integrate; + int n_pre_exchange = modify->n_pre_exchange; + int n_pre_neighbor = modify->n_pre_neighbor; + int n_pre_force = modify->n_pre_force; + int n_post_force = modify->n_post_force; + int n_end_of_step = modify->n_end_of_step; + + if (atom->sortfreq > 0) sortflag = 1; + else sortflag = 0; + + for (int i = 0; i < n; i++) { + + ntimestep = ++update->ntimestep; + ev_set(ntimestep); + + // initial time integration + + if (master) { + modify->initial_integrate(vflag); + if (n_post_integrate) modify->post_integrate(); + } + + // regular communication vs neighbor list rebuild + + if (master) nflag = neighbor->decide(); + MPI_Bcast(&nflag,1,MPI_INT,1,block); + + if (master) { + if (nflag == 0) { + timer->stamp(); + comm->forward_comm(); + timer->stamp(TIME_COMM); + } else { + if (n_pre_exchange) modify->pre_exchange(); + if (triclinic) domain->x2lamda(atom->nlocal); + domain->pbc(); + if (domain->box_change) { + domain->reset_box(); + comm->setup(); + if (neighbor->style) neighbor->setup_bins(); + } + timer->stamp(); + comm->exchange(); + if (sortflag && ntimestep >= atom->nextsort) atom->sort(); + comm->borders(); + if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost); + timer->stamp(TIME_COMM); + if (n_pre_neighbor) modify->pre_neighbor(); + neighbor->build(); + timer->stamp(TIME_NEIGHBOR); + } + } + + // if 
reneighboring occurred, re-setup Rspace <-> Kspace comm params + // comm Rspace atom coords to Kspace procs + + if (nflag) rk_setup(); + r2k_comm(); + + // force computations + + force_clear(); + + if (master) { + if (n_pre_force) modify->pre_force(vflag); + + timer->stamp(); + if (force->pair) { + force->pair->compute(eflag,vflag); + timer->stamp(TIME_PAIR); + } + + if (atom->molecular) { + if (force->bond) force->bond->compute(eflag,vflag); + if (force->angle) force->angle->compute(eflag,vflag); + if (force->dihedral) force->dihedral->compute(eflag,vflag); + if (force->improper) force->improper->compute(eflag,vflag); + timer->stamp(TIME_BOND); + } + + #ifdef _LMP_INTEL_OFFLOAD + if (sync_mode == 1) { + fix_intel->sync_coprocessor(); + timer->stamp(TIME_PAIR); + } + #endif + + if (force->newton) { + comm->reverse_comm(); + timer->stamp(TIME_COMM); + } + + #ifdef _LMP_INTEL_OFFLOAD + if (sync_mode == 2) { + fix_intel->sync_coprocessor(); + timer->stamp(TIME_PAIR); + } + #endif + + } else { + + // run FixOMP as sole pre_force fix, if defined + + if (fix_omp) fix_omp->pre_force(vflag); + + if (force->kspace) { + timer->stamp(); + force->kspace->compute(eflag,vflag); + timer->stamp(TIME_KSPACE); + } + + // TIP4P PPPM puts forces on ghost atoms, so must reverse_comm() + + if (tip4p_flag && force->newton) { + comm->reverse_comm(); + timer->stamp(TIME_COMM); + } + } + + // comm and sum Kspace forces back to Rspace procs + + k2r_comm(); + + // force modifications, final time integration, diagnostics + // all output + + if (master) { + if (n_post_force) modify->post_force(vflag); + modify->final_integrate(); + if (n_end_of_step) modify->end_of_step(); + + if (ntimestep == output->next) { + timer->stamp(); + output->write(ntimestep); + timer->stamp(TIME_OUTPUT); + } + } + } +} + +/* ---------------------------------------------------------------------- + setup params for Rspace <-> Kspace communication + called initially and after every reneighbor + also communcicate atom 
charges from Rspace to KSpace since static +------------------------------------------------------------------------- */ + +void VerletSplitIntel::rk_setup() +{ + // grow f_kspace array on master procs if necessary + + if (master) { + if (atom->nlocal > maxatom) { + memory->destroy(f_kspace); + maxatom = atom->nmax; + memory->create(f_kspace,maxatom,3,"verlet/split:f_kspace"); + } + } + + // qsize = # of atoms owned by each master proc in block + + int n = 0; + if (master) n = atom->nlocal; + MPI_Gather(&n,1,MPI_INT,qsize,1,MPI_INT,0,block); + + // setup qdisp, xsize, xdisp based on qsize + // only needed by Kspace proc + // set Kspace nlocal to sum of Rspace nlocals + // insure Kspace atom arrays are large enough + + if (!master) { + qsize[0] = qdisp[0] = xsize[0] = xdisp[0] = 0; + for (int i = 1; i <= ratio; i++) { + qdisp[i] = qdisp[i-1]+qsize[i-1]; + xsize[i] = 3*qsize[i]; + xdisp[i] = xdisp[i-1]+xsize[i-1]; + } + + atom->nlocal = qdisp[ratio] + qsize[ratio]; + while (atom->nmax <= atom->nlocal) atom->avec->grow(0); + atom->nghost = 0; + } + + // one-time gather of Rspace atom charges to Kspace proc + + MPI_Gatherv(atom->q,n,MPI_DOUBLE,atom->q,qsize,qdisp,MPI_DOUBLE,0,block); + + // for TIP4P also need to send atom type and tag + // KSpace procs need to acquire ghost atoms and map all their atoms + // map_clear() call is in lieu of comm->exchange() which performs map_clear + // borders() call acquires ghost atoms and maps them + // NOTE: do atom coords need to be communicated here before borders() call? 
+ // could do this by calling r2k_comm() here and not again from run() + // except that forward_comm() in r2k_comm() is wrong + + if (tip4p_flag) { + //r2k_comm(); + MPI_Gatherv(atom->type,n,MPI_INT,atom->type,qsize,qdisp,MPI_INT,0,block); + MPI_Gatherv(atom->tag,n,MPI_LMP_TAGINT, + atom->tag,qsize,qdisp,MPI_LMP_TAGINT,0,block); + if (!master) { + if (triclinic) domain->x2lamda(atom->nlocal); + if (domain->box_change) comm->setup(); + timer->stamp(); + atom->map_clear(); + comm->borders(); + if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost); + timer->stamp(TIME_COMM); + } + } +} + +/* ---------------------------------------------------------------------- + communicate Rspace atom coords to Kspace + also eflag,vflag and box bounds if needed +------------------------------------------------------------------------- */ + +void VerletSplitIntel::r2k_comm() +{ + MPI_Status status; + + int n = 0; + if (master) n = atom->nlocal; + MPI_Gatherv(atom->x[0],n*3,MPI_DOUBLE,atom->x[0],xsize,xdisp, + MPI_DOUBLE,0,block); + + // send eflag,vflag from Rspace to Kspace + + if (me_block == 1) { + int flags[2]; + flags[0] = eflag; flags[1] = vflag; + MPI_Send(flags,2,MPI_INT,0,0,block); + } else if (!master) { + int flags[2]; + MPI_Recv(flags,2,MPI_INT,1,0,block,&status); + eflag = flags[0]; vflag = flags[1]; + } + + // send box bounds from Rspace to Kspace if simulation box is dynamic + + if (domain->box_change) { + if (me_block == 1) { + MPI_Send(domain->boxlo,3,MPI_DOUBLE,0,0,block); + MPI_Send(domain->boxhi,3,MPI_DOUBLE,0,0,block); + } else if (!master) { + MPI_Recv(domain->boxlo,3,MPI_DOUBLE,1,0,block,&status); + MPI_Recv(domain->boxhi,3,MPI_DOUBLE,1,0,block,&status); + domain->set_global_box(); + domain->set_local_box(); + force->kspace->setup(); + } + } + + // for TIP4P, Kspace partition needs to update its ghost atoms + + if (tip4p_flag && !master) { + timer->stamp(); + comm->forward_comm(); + timer->stamp(TIME_COMM); + } +} + +/* 
---------------------------------------------------------------------- + communicate and sum Kspace atom forces back to Rspace +------------------------------------------------------------------------- */ + +void VerletSplitIntel::k2r_comm() +{ + if (eflag) MPI_Bcast(&force->kspace->energy,1,MPI_DOUBLE,0,block); + if (vflag) MPI_Bcast(force->kspace->virial,6,MPI_DOUBLE,0,block); + + int n = 0; + if (master) n = atom->nlocal; + MPI_Scatterv(atom->f[0],xsize,xdisp,MPI_DOUBLE, + f_kspace[0],n*3,MPI_DOUBLE,0,block); + + if (master) { + double **f = atom->f; + int nlocal = atom->nlocal; + for (int i = 0; i < nlocal; i++) { + f[i][0] += f_kspace[i][0]; + f[i][1] += f_kspace[i][1]; + f[i][2] += f_kspace[i][2]; + } + } +} + +/* ---------------------------------------------------------------------- + memory usage of Kspace force array on master procs +------------------------------------------------------------------------- */ + +bigint VerletSplitIntel::memory_usage() +{ + bigint bytes = maxatom*3 * sizeof(double); + return bytes; +} diff --git a/src/USER-INTEL/verlet_split_intel.h b/src/USER-INTEL/verlet_split_intel.h new file mode 100644 index 0000000000..3f81d41a97 --- /dev/null +++ b/src/USER-INTEL/verlet_split_intel.h @@ -0,0 +1,89 @@ +/* ------------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef INTEGRATE_CLASS + +IntegrateStyle(verlet/split/intel,VerletSplitIntel) + +#else + +#ifndef LMP_VERLET_SPLIT_INTEL_H +#define LMP_VERLET_SPLIT_INTEL_H + +#include "verlet_intel.h" +#ifdef LMP_INTEL_OFFLOAD +#include "fix_intel.h" +#endif + +namespace LAMMPS_NS { + +class VerletSplitIntel : public VerletIntel { + public: + VerletSplitIntel(class LAMMPS *, int, char **); + ~VerletSplitIntel(); + void init(); + void setup(); + void setup_minimal(int); + void run(int); + bigint memory_usage(); + + private: + int master; // 1 if an Rspace proc, 0 if Kspace + int me_block; // proc ID within Rspace/Kspace block + int ratio; // ratio of Rspace procs to Kspace procs + int *qsize,*qdisp,*xsize,*xdisp; // MPI gather/scatter params for block comm + MPI_Comm block; // communicator within one block + int tip4p_flag; // 1 if PPPM/tip4p so do extra comm + + double **f_kspace; // copy of Kspace forces on Rspace procs + int maxatom; + + void rk_setup(); + void r2k_comm(); + void k2r_comm(); +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Verlet/split requires 2 partitions + +See the -partition command-line switch. + +E: Verlet/split requires Rspace partition size be multiple of Kspace partition size + +This is so there is an equal number of Rspace processors for every +Kspace processor. + +E: Verlet/split requires Rspace partition layout be multiple of Kspace partition layout in each dim + +This is controlled by the processors command. + +W: No Kspace calculation with verlet/split + +The 2nd partition performs a kspace calculation so the kspace_style +command must be used. + +E: Verlet/split does not yet support TIP4P + +This is a current limitation. + +E: Cannot currently get per-atom virials with Intel package. + +The Intel package does not yet support per-atom virial calculation. 
+ +*/ diff --git a/src/USER-OMP/pair_gran_hooke_history_omp.cpp b/src/USER-OMP/pair_gran_hooke_history_omp.cpp index 57827adf28..afe287c601 100644 --- a/src/USER-OMP/pair_gran_hooke_history_omp.cpp +++ b/src/USER-OMP/pair_gran_hooke_history_omp.cpp @@ -35,9 +35,6 @@ PairGranHookeHistoryOMP::PairGranHookeHistoryOMP(LAMMPS *lmp) : { suffix_flag |= Suffix::OMP; respa_enable = 0; - // trigger use of OpenMP version of FixShearHistory - suffix = new char[4]; - memcpy(suffix,"omp",4); } /* ---------------------------------------------------------------------- */ diff --git a/src/angle_hybrid.cpp b/src/angle_hybrid.cpp index 6f1cceba00..1780c4344e 100644 --- a/src/angle_hybrid.cpp +++ b/src/angle_hybrid.cpp @@ -208,7 +208,7 @@ void AngleHybrid::settings(int narg, char **arg) // one exception is 1st arg of style "table", which is non-numeric // need a better way to skip these exceptions - int dummy; + int sflag; nstyles = 0; i = 0; @@ -221,9 +221,10 @@ void AngleHybrid::settings(int narg, char **arg) error->all(FLERR,"Angle style hybrid cannot have hybrid as an argument"); if (strcmp(arg[i],"none") == 0) error->all(FLERR,"Angle style hybrid cannot have none as an argument"); - styles[nstyles] = force->new_angle(arg[i],lmp->suffix,dummy); - keywords[nstyles] = new char[strlen(arg[i])+1]; - strcpy(keywords[nstyles],arg[i]); + + styles[nstyles] = force->new_angle(arg[i],1,sflag); + force->store_style(keywords[nstyles],arg[i],sflag); + istyle = i; if (strcmp(arg[i],"table") == 0) i++; i++; @@ -346,7 +347,7 @@ void AngleHybrid::read_restart(FILE *fp) keywords[m] = new char[n]; if (me == 0) fread(keywords[m],sizeof(char),n,fp); MPI_Bcast(keywords[m],n,MPI_CHAR,0,world); - styles[m] = force->new_angle(keywords[m],lmp->suffix,dummy); + styles[m] = force->new_angle(keywords[m],0,dummy); } } diff --git a/src/atom.cpp b/src/atom.cpp index 7efbf4740f..550b959f22 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -333,7 +333,7 @@ void Atom::settings(Atom *old) called from lammps.cpp, input 
script, restart file, replicate ------------------------------------------------------------------------- */ -void Atom::create_avec(const char *style, int narg, char **arg, char *suffix) +void Atom::create_avec(const char *style, int narg, char **arg, int trysuffix) { delete [] atom_style; if (avec) delete avec; @@ -362,14 +362,15 @@ void Atom::create_avec(const char *style, int narg, char **arg, char *suffix) // so that x[0][0] can always be referenced even if proc has no atoms int sflag; - avec = new_avec(style,suffix,sflag); + avec = new_avec(style,trysuffix,sflag); avec->store_args(narg,arg); avec->process_args(narg,arg); avec->grow(1); if (sflag) { char estyle[256]; - sprintf(estyle,"%s/%s",style,suffix); + if (sflag == 1) sprintf(estyle,"%s/%s",style,lmp->suffix); + else sprintf(estyle,"%s/%s",style,lmp->suffix2); int n = strlen(estyle) + 1; atom_style = new char[n]; strcpy(atom_style,estyle); @@ -394,26 +395,41 @@ void Atom::create_avec(const char *style, int narg, char **arg, char *suffix) generate an AtomVec class, first with suffix appended ------------------------------------------------------------------------- */ -AtomVec *Atom::new_avec(const char *style, char *suffix, int &sflag) +AtomVec *Atom::new_avec(const char *style, int trysuffix, int &sflag) { - if (suffix && lmp->suffix_enable) { - sflag = 1; - char estyle[256]; - sprintf(estyle,"%s/%s",style,suffix); + if (trysuffix && lmp->suffix_enable) { + if (lmp->suffix) { + sflag = 1; + char estyle[256]; + sprintf(estyle,"%s/%s",style,lmp->suffix); - if (0) return NULL; + if (0) return NULL; #define ATOM_CLASS #define AtomStyle(key,Class) \ - else if (strcmp(estyle,#key) == 0) return new Class(lmp); + else if (strcmp(estyle,#key) == 0) return new Class(lmp); #include "style_atom.h" #undef AtomStyle #undef ATOM_CLASS + } + if (lmp->suffix2) { + sflag = 2; + char estyle[256]; + sprintf(estyle,"%s/%s",style,lmp->suffix2); + + if (0) return NULL; + +#define ATOM_CLASS +#define AtomStyle(key,Class) \ + 
else if (strcmp(estyle,#key) == 0) return new Class(lmp); +#include "style_atom.h" +#undef AtomStyle +#undef ATOM_CLASS + } } sflag = 0; - if (0) return NULL; #define ATOM_CLASS @@ -423,7 +439,6 @@ AtomVec *Atom::new_avec(const char *style, char *suffix, int &sflag) #undef ATOM_CLASS else error->all(FLERR,"Invalid atom style"); - return NULL; } diff --git a/src/atom.h b/src/atom.h index c6bebe88a9..2f21fee722 100644 --- a/src/atom.h +++ b/src/atom.h @@ -171,8 +171,8 @@ class Atom : protected Pointers { ~Atom(); void settings(class Atom *); - void create_avec(const char *, int, char **, char *suffix = NULL); - class AtomVec *new_avec(const char *, char *, int &); + void create_avec(const char *, int, char **, int); + class AtomVec *new_avec(const char *, int, int &); void init(); void setup(); diff --git a/src/bond_hybrid.cpp b/src/bond_hybrid.cpp index 4545008611..63357a12e5 100644 --- a/src/bond_hybrid.cpp +++ b/src/bond_hybrid.cpp @@ -207,7 +207,7 @@ void BondHybrid::settings(int narg, char **arg) // one exception is 1st arg of style "table", which is non-numeric // need a better way to skip these exceptions - int dummy; + int sflag; nstyles = 0; i = 0; @@ -219,9 +219,10 @@ void BondHybrid::settings(int narg, char **arg) error->all(FLERR,"Bond style hybrid cannot have hybrid as an argument"); if (strcmp(arg[i],"none") == 0) error->all(FLERR,"Bond style hybrid cannot have none as an argument"); - styles[nstyles] = force->new_bond(arg[i],lmp->suffix,dummy); - keywords[nstyles] = new char[strlen(arg[i])+1]; - strcpy(keywords[nstyles],arg[i]); + + styles[nstyles] = force->new_bond(arg[i],1,sflag); + force->store_style(keywords[nstyles],arg[i],sflag); + istyle = i; if (strcmp(arg[i],"table") == 0) i++; i++; @@ -330,7 +331,7 @@ void BondHybrid::read_restart(FILE *fp) keywords[m] = new char[n]; if (me == 0) fread(keywords[m],sizeof(char),n,fp); MPI_Bcast(keywords[m],n,MPI_CHAR,0,world); - styles[m] = force->new_bond(keywords[m],lmp->suffix,dummy); + styles[m] = 
force->new_bond(keywords[m],0,dummy); } } diff --git a/src/delete_bonds.cpp b/src/delete_bonds.cpp index 3b2e9a528b..b380508ee9 100644 --- a/src/delete_bonds.cpp +++ b/src/delete_bonds.cpp @@ -167,7 +167,7 @@ void DeleteBonds::command(int narg, char **arg) else if (style == ATOM) { if (tlist[type[i]] || tlist[type[atom1]]) flag = 1; } else if (style == BOND) { - itype = static_cast<int> (fabs(bond_type[i][m])); + itype = abs(bond_type[i][m]); if (tlist[itype]) flag = 1; } if (flag) { @@ -205,7 +205,7 @@ void DeleteBonds::command(int narg, char **arg) if (tlist[type[atom1]] || tlist[type[atom2]] || tlist[type[atom3]]) flag = 1; } else if (style == ANGLE) { - itype = static_cast<int> (fabs(angle_type[i][m])); + itype = abs(angle_type[i][m]); if (tlist[itype]) flag = 1; } if (flag) { @@ -245,7 +245,7 @@ void DeleteBonds::command(int narg, char **arg) if (tlist[type[atom1]] || tlist[type[atom2]] || tlist[type[atom3]] || tlist[type[atom4]]) flag = 1; } else if (style == DIHEDRAL) { - itype = static_cast<int> (fabs(dihedral_type[i][m])); + itype = abs(dihedral_type[i][m]); if (tlist[itype]) flag = 1; } if (flag) { @@ -285,7 +285,7 @@ void DeleteBonds::command(int narg, char **arg) if (tlist[type[atom1]] || tlist[type[atom2]] || tlist[type[atom3]] || tlist[type[atom4]]) flag = 1; } else if (style == IMPROPER) { - itype = static_cast<int> (fabs(improper_type[i][m])); + itype = abs(improper_type[i][m]); if (tlist[itype]) flag = 1; } if (flag) { diff --git a/src/dihedral_hybrid.cpp b/src/dihedral_hybrid.cpp index 7b0dea64db..6f8ef5a093 100644 --- a/src/dihedral_hybrid.cpp +++ b/src/dihedral_hybrid.cpp @@ -209,7 +209,7 @@ void DihedralHybrid::settings(int narg, char **arg) // one exception is 1st arg of style "table", which is non-numeric // need a better way to skip these exceptions - int dummy; + int sflag; nstyles = 0; i = 0; @@ -223,9 +223,10 @@ void DihedralHybrid::settings(int narg, char **arg) "Dihedral style hybrid cannot have hybrid as an argument"); if (strcmp(arg[i],"none") == 0) 
error->all(FLERR,"Dihedral style hybrid cannot have none as an argument"); - styles[nstyles] = force->new_dihedral(arg[i],lmp->suffix,dummy); - keywords[nstyles] = new char[strlen(arg[i])+1]; - strcpy(keywords[nstyles],arg[i]); + + styles[nstyles] = force->new_dihedral(arg[i],1,sflag); + force->store_style(keywords[nstyles],arg[i],sflag); + istyle = i; if (strcmp(arg[i],"table") == 0) i++; i++; @@ -331,7 +332,7 @@ void DihedralHybrid::read_restart(FILE *fp) keywords[m] = new char[n]; if (me == 0) fread(keywords[m],sizeof(char),n,fp); MPI_Bcast(keywords[m],n,MPI_CHAR,0,world); - styles[m] = force->new_dihedral(keywords[m],lmp->suffix,dummy); + styles[m] = force->new_dihedral(keywords[m],0,dummy); } } diff --git a/src/force.cpp b/src/force.cpp index c316c04a52..832133548a 100644 --- a/src/force.cpp +++ b/src/force.cpp @@ -125,47 +125,46 @@ void Force::init() create a pair style, called from input script or restart file ------------------------------------------------------------------------- */ -void Force::create_pair(const char *style, const char *suffix) +void Force::create_pair(const char *style, int trysuffix) { delete [] pair_style; if (pair) delete pair; int sflag; - pair = new_pair(style,suffix,sflag); - - if (sflag) { - char estyle[256]; - sprintf(estyle,"%s/%s",style,suffix); - int n = strlen(estyle) + 1; - pair_style = new char[n]; - strcpy(pair_style,estyle); - } else { - int n = strlen(style) + 1; - pair_style = new char[n]; - strcpy(pair_style,style); - } + pair = new_pair(style,trysuffix,sflag); + store_style(pair_style,style,sflag); } /* ---------------------------------------------------------------------- generate a pair class - try first with suffix appended + if trysuffix = 1, try first with suffix1/2 appended + return sflag = 0 for no suffix added, 1 or 2 for suffix1/2 added ------------------------------------------------------------------------- */ -Pair *Force::new_pair(const char *style, const char *suffix, int &sflag) +Pair 
*Force::new_pair(const char *style, int trysuffix, int &sflag) { - if (suffix && lmp->suffix_enable) { - sflag = 1; - char estyle[256]; - sprintf(estyle,"%s/%s",style,suffix); - - if (pair_map->find(estyle) != pair_map->end()) { - PairCreator pair_creator = (*pair_map)[estyle]; - return pair_creator(lmp); + if (trysuffix && lmp->suffix_enable) { + if (lmp->suffix) { + sflag = 1; + char estyle[256]; + sprintf(estyle,"%s/%s",style,lmp->suffix); + if (pair_map->find(estyle) != pair_map->end()) { + PairCreator pair_creator = (*pair_map)[estyle]; + return pair_creator(lmp); + } + } + if (lmp->suffix2) { + sflag = 2; + char estyle[256]; + sprintf(estyle,"%s/%s",style,lmp->suffix2); + if (pair_map->find(estyle) != pair_map->end()) { + PairCreator pair_creator = (*pair_map)[estyle]; + return pair_creator(lmp); + } } } sflag = 0; - if (strcmp(style,"none") == 0) return NULL; if (pair_map->find(style) != pair_map->end()) { PairCreator pair_creator = (*pair_map)[style]; @@ -230,50 +229,55 @@ Pair *Force::pair_match(const char *word, int exact) create a bond style, called from input script or restart file ------------------------------------------------------------------------- */ -void Force::create_bond(const char *style, const char *suffix) +void Force::create_bond(const char *style, int trysuffix) { delete [] bond_style; if (bond) delete bond; int sflag; - bond = new_bond(style,suffix,sflag); - - if (sflag) { - char estyle[256]; - sprintf(estyle,"%s/%s",style,suffix); - int n = strlen(estyle) + 1; - bond_style = new char[n]; - strcpy(bond_style,estyle); - } else { - int n = strlen(style) + 1; - bond_style = new char[n]; - strcpy(bond_style,style); - } + bond = new_bond(style,trysuffix,sflag); + store_style(bond_style,style,sflag); } /* ---------------------------------------------------------------------- generate a bond class, fist with suffix appended ------------------------------------------------------------------------- */ -Bond *Force::new_bond(const char *style, 
const char *suffix, int &sflag) +Bond *Force::new_bond(const char *style, int trysuffix, int &sflag) { - if (suffix && lmp->suffix_enable) { - sflag = 1; - char estyle[256]; - sprintf(estyle,"%s/%s",style,suffix); - - if (0) return NULL; + if (trysuffix && lmp->suffix_enable) { + if (lmp->suffix) { + sflag = 1; + char estyle[256]; + sprintf(estyle,"%s/%s",style,lmp->suffix); + + if (0) return NULL; #define BOND_CLASS #define BondStyle(key,Class) \ - else if (strcmp(estyle,#key) == 0) return new Class(lmp); + else if (strcmp(estyle,#key) == 0) return new Class(lmp); #include "style_bond.h" #undef BondStyle #undef BOND_CLASS + } + + if (lmp->suffix2) { + sflag = 2; + char estyle[256]; + sprintf(estyle,"%s/%s",style,lmp->suffix2); + + if (0) return NULL; + +#define BOND_CLASS +#define BondStyle(key,Class) \ + else if (strcmp(estyle,#key) == 0) return new Class(lmp); +#include "style_bond.h" +#undef BondStyle +#undef BOND_CLASS + } } sflag = 0; - if (strcmp(style,"none") == 0) return NULL; #define BOND_CLASS @@ -305,51 +309,55 @@ Bond *Force::bond_match(const char *style) create an angle style, called from input script or restart file ------------------------------------------------------------------------- */ -void Force::create_angle(const char *style, const char *suffix) +void Force::create_angle(const char *style, int trysuffix) { delete [] angle_style; if (angle) delete angle; int sflag; - angle = new_angle(style,suffix,sflag); - - if (sflag) { - char estyle[256]; - sprintf(estyle,"%s/%s",style,suffix); - int n = strlen(estyle) + 1; - angle_style = new char[n]; - strcpy(angle_style,estyle); - } else { - int n = strlen(style) + 1; - angle_style = new char[n]; - strcpy(angle_style,style); - } + angle = new_angle(style,trysuffix,sflag); + store_style(angle_style,style,sflag); } /* ---------------------------------------------------------------------- generate an angle class ------------------------------------------------------------------------- */ -Angle 
*Force::new_angle(const char *style, const char *suffix, int &sflag) +Angle *Force::new_angle(const char *style, int trysuffix, int &sflag) { - if (suffix && lmp->suffix_enable) { - sflag = 1; - char estyle[256]; - sprintf(estyle,"%s/%s",style,suffix); - - if (0) return NULL; + if (trysuffix && lmp->suffix_enable) { + if (lmp->suffix) { + sflag = 1; + char estyle[256]; + sprintf(estyle,"%s/%s",style,lmp->suffix); + + if (0) return NULL; #define ANGLE_CLASS #define AngleStyle(key,Class) \ - else if (strcmp(estyle,#key) == 0) return new Class(lmp); + else if (strcmp(estyle,#key) == 0) return new Class(lmp); #include "style_angle.h" #undef AngleStyle #undef ANGLE_CLASS + } + if (lmp->suffix2) { + sflag = 2; + char estyle[256]; + sprintf(estyle,"%s/%s",style,lmp->suffix2); // 2nd pass looks up style/suffix2 + + if (0) return NULL; + +#define ANGLE_CLASS +#define AngleStyle(key,Class) \ + else if (strcmp(estyle,#key) == 0) return new Class(lmp); +#include "style_angle.h" +#undef AngleStyle +#undef ANGLE_CLASS + } } sflag = 0; - if (strcmp(style,"none") == 0) return NULL; #define ANGLE_CLASS @@ -366,51 +374,55 @@ Angle *Force::new_angle(const char *style, const char *suffix, int &sflag) create a dihedral style, called from input script or restart file ------------------------------------------------------------------------- */ -void Force::create_dihedral(const char *style, const char *suffix) +void Force::create_dihedral(const char *style, int trysuffix) { delete [] dihedral_style; if (dihedral) delete dihedral; int sflag; - dihedral = new_dihedral(style,suffix,sflag); - - if (sflag) { - char estyle[256]; - sprintf(estyle,"%s/%s",style,suffix); - int n = strlen(estyle) + 1; - dihedral_style = new char[n]; - strcpy(dihedral_style,estyle); - } else { - int n = strlen(style) + 1; - dihedral_style = new char[n]; - strcpy(dihedral_style,style); - } + dihedral = new_dihedral(style,trysuffix,sflag); + store_style(dihedral_style,style,sflag); } /*
---------------------------------------------------------------------- generate a dihedral class ------------------------------------------------------------------------- */ -Dihedral *Force::new_dihedral(const char *style, const char *suffix, int &sflag) +Dihedral *Force::new_dihedral(const char *style, int trysuffix, int &sflag) { - if (suffix && lmp->suffix_enable) { - sflag = 1; - char estyle[256]; - sprintf(estyle,"%s/%s",style,suffix); + if (trysuffix && lmp->suffix_enable) { + if (lmp->suffix) { + sflag = 1; + char estyle[256]; + sprintf(estyle,"%s/%s",style,lmp->suffix); - if (0) return NULL; + if (0) return NULL; #define DIHEDRAL_CLASS #define DihedralStyle(key,Class) \ - else if (strcmp(estyle,#key) == 0) return new Class(lmp); + else if (strcmp(estyle,#key) == 0) return new Class(lmp); #include "style_dihedral.h" #undef DihedralStyle #undef DIHEDRAL_CLASS + } + if (lmp->suffix2) { // guard the pointer that is formatted below + sflag = 2; + char estyle[256]; + sprintf(estyle,"%s/%s",style,lmp->suffix2); + + if (0) return NULL; + +#define DIHEDRAL_CLASS +#define DihedralStyle(key,Class) \ + else if (strcmp(estyle,#key) == 0) return new Class(lmp); +#include "style_dihedral.h" +#undef DihedralStyle +#undef DIHEDRAL_CLASS + } } sflag = 0; - if (strcmp(style,"none") == 0) return NULL; #define DIHEDRAL_CLASS @@ -428,51 +440,55 @@ Dihedral *Force::new_dihedral(const char *style, const char *suffix, int &sflag) create an improper style, called from input script or restart file ------------------------------------------------------------------------- */ -void Force::create_improper(const char *style, const char *suffix) +void Force::create_improper(const char *style, int trysuffix) { delete [] improper_style; if (improper) delete improper; int sflag; - improper = new_improper(style,suffix,sflag); - - if (sflag) { - char estyle[256]; - sprintf(estyle,"%s/%s",style,suffix); - int n = strlen(estyle) + 1; - improper_style = new char[n]; - strcpy(improper_style,estyle); - } else { - int n = strlen(style) + 1; -
improper_style = new char[n]; - strcpy(improper_style,style); - } + improper = new_improper(style,trysuffix,sflag); + store_style(improper_style,style,sflag); } /* ---------------------------------------------------------------------- generate a improper class ------------------------------------------------------------------------- */ -Improper *Force::new_improper(const char *style, const char *suffix, int &sflag) +Improper *Force::new_improper(const char *style, int trysuffix, int &sflag) { - if (suffix && lmp->suffix_enable) { - sflag = 1; - char estyle[256]; - sprintf(estyle,"%s/%s",style,suffix); + if (trysuffix && lmp->suffix_enable) { + if (lmp->suffix) { + sflag = 1; + char estyle[256]; + sprintf(estyle,"%s/%s",style,lmp->suffix); - if (0) return NULL; + if (0) return NULL; #define IMPROPER_CLASS #define ImproperStyle(key,Class) \ - else if (strcmp(estyle,#key) == 0) return new Class(lmp); + else if (strcmp(estyle,#key) == 0) return new Class(lmp); #include "style_improper.h" #undef ImproperStyle #undef IMPROPER_CLASS + } + if (lmp->suffix2) { + sflag = 2; + char estyle[256]; + sprintf(estyle,"%s/%s",style,lmp->suffix2); + + if (0) return NULL; + +#define IMPROPER_CLASS +#define ImproperStyle(key,Class) \ + else if (strcmp(estyle,#key) == 0) return new Class(lmp); +#include "style_improper.h" +#undef ImproperStyle +#undef IMPROPER_CLASS + } } sflag = 0; - if (strcmp(style,"none") == 0) return NULL; #define IMPROPER_CLASS @@ -504,25 +520,14 @@ Improper *Force::improper_match(const char *style) new kspace style ------------------------------------------------------------------------- */ -void Force::create_kspace(int narg, char **arg, const char *suffix) +void Force::create_kspace(int narg, char **arg, int trysuffix) { delete [] kspace_style; if (kspace) delete kspace; int sflag; - kspace = new_kspace(narg,arg,suffix,sflag); - - if (sflag) { - char estyle[256]; - sprintf(estyle,"%s/%s",arg[0],suffix); - int n = strlen(estyle) + 1; - kspace_style = new 
char[n]; - strcpy(kspace_style,estyle); - } else { - int n = strlen(arg[0]) + 1; - kspace_style = new char[n]; - strcpy(kspace_style,arg[0]); - } + kspace = new_kspace(narg,arg,trysuffix,sflag); + store_style(kspace_style,arg[0],sflag); if (comm->style == 1 && !kspace_match("ewald",0)) error->all(FLERR, @@ -533,26 +538,41 @@ void Force::create_kspace(int narg, char **arg, const char *suffix) generate a kspace class ------------------------------------------------------------------------- */ -KSpace *Force::new_kspace(int narg, char **arg, const char *suffix, int &sflag) +KSpace *Force::new_kspace(int narg, char **arg, int trysuffix, int &sflag) { - if (suffix && lmp->suffix_enable) { - sflag = 1; - char estyle[256]; - sprintf(estyle,"%s/%s",arg[0],suffix); + if (trysuffix && lmp->suffix_enable) { + if (lmp->suffix) { + sflag = 1; + char estyle[256]; + sprintf(estyle,"%s/%s",arg[0],lmp->suffix); - if (0) return NULL; + if (0) return NULL; #define KSPACE_CLASS #define KSpaceStyle(key,Class) \ - else if (strcmp(estyle,#key) == 0) return new Class(lmp,narg-1,&arg[1]); + else if (strcmp(estyle,#key) == 0) return new Class(lmp,narg-1,&arg[1]); #include "style_kspace.h" #undef KSpaceStyle #undef KSPACE_CLASS + } + if (lmp->suffix2) { + sflag = 2; // tells store_style() to append suffix2 + char estyle[256]; + sprintf(estyle,"%s/%s",arg[0],lmp->suffix2); + + if (0) return NULL; + +#define KSPACE_CLASS +#define KSpaceStyle(key,Class) \ + else if (strcmp(estyle,#key) == 0) return new Class(lmp,narg-1,&arg[1]); +#include "style_kspace.h" +#undef KSpaceStyle +#undef KSPACE_CLASS + } } sflag = 0; - if (strcmp(arg[0],"none") == 0) return NULL; #define KSPACE_CLASS @@ -579,6 +599,28 @@ KSpace *Force::kspace_match(const char *word, int exact) return NULL; } +/* ---------------------------------------------------------------------- + store style name in str allocated here + if sflag = 0, no suffix + if sflag = 1/2, append suffix or suffix2 to style +-------------------------------------------------------------------------
*/ + +void Force::store_style(char *&str, const char *style, int sflag) +{ + if (sflag) { + char estyle[256]; + if (sflag == 1) sprintf(estyle,"%s/%s",style,lmp->suffix); + else sprintf(estyle,"%s/%s",style,lmp->suffix2); + int n = strlen(estyle) + 1; + str = new char[n]; + strcpy(str,estyle); + } else { + int n = strlen(style) + 1; + str = new char[n]; + strcpy(str,style); + } +} + /* ---------------------------------------------------------------------- set special bond values ------------------------------------------------------------------------- */ diff --git a/src/force.h b/src/force.h index bf364f253b..f857c1a119 100644 --- a/src/force.h +++ b/src/force.h @@ -77,28 +77,29 @@ class Force : protected Pointers { ~Force(); void init(); - void create_pair(const char *, const char *suffix = NULL); - class Pair *new_pair(const char *, const char *, int &); + void create_pair(const char *, int); + class Pair *new_pair(const char *, int, int &); class Pair *pair_match(const char *, int); - void create_bond(const char *, const char *suffix = NULL); - class Bond *new_bond(const char *, const char *, int &); + void create_bond(const char *, int); + class Bond *new_bond(const char *, int, int &); class Bond *bond_match(const char *); - void create_angle(const char *, const char *suffix = NULL); - class Angle *new_angle(const char *, const char *, int &); + void create_angle(const char *, int); + class Angle *new_angle(const char *, int, int &); - void create_dihedral(const char *, const char *suffix = NULL); - class Dihedral *new_dihedral(const char *, const char *, int &); + void create_dihedral(const char *, int); + class Dihedral *new_dihedral(const char *, int, int &); - void create_improper(const char *, const char *suffix = NULL); - class Improper *new_improper(const char *, const char *, int &); + void create_improper(const char *, int); + class Improper *new_improper(const char *, int, int &); class Improper *improper_match(const char *); - void 
create_kspace(int, char **, const char *suffix = NULL); - class KSpace *new_kspace(int, char **, const char *, int &); + void create_kspace(int, char **, int); + class KSpace *new_kspace(int, char **, int, int &); class KSpace *kspace_match(const char *, int); + void store_style(char *&, const char *, int); void set_special(int, char **); void bounds(char *, int, int &, int &, int nmin=1); void boundsbig(char *, bigint, bigint &, bigint &, bigint nmin=1); diff --git a/src/improper_hybrid.cpp b/src/improper_hybrid.cpp index 9212051e38..09e73ac9b5 100644 --- a/src/improper_hybrid.cpp +++ b/src/improper_hybrid.cpp @@ -209,7 +209,7 @@ void ImproperHybrid::settings(int narg, char **arg) // one exception is 1st arg of style "table", which is non-numeric // need a better way to skip these exceptions - int dummy; + int sflag; nstyles = 0; i = 0; @@ -223,9 +223,10 @@ void ImproperHybrid::settings(int narg, char **arg) "Improper style hybrid cannot have hybrid as an argument"); if (strcmp(arg[i],"none") == 0) error->all(FLERR,"Improper style hybrid cannot have none as an argument"); - styles[nstyles] = force->new_improper(arg[i],lmp->suffix,dummy); - keywords[nstyles] = new char[strlen(arg[i])+1]; - strcpy(keywords[nstyles],arg[i]); + + styles[nstyles] = force->new_improper(arg[i],1,sflag); + force->store_style(keywords[nstyles],arg[i],sflag); + istyle = i; if (strcmp(arg[i],"table") == 0) i++; i++; @@ -319,7 +320,7 @@ void ImproperHybrid::read_restart(FILE *fp) keywords[m] = new char[n]; if (me == 0) fread(keywords[m],sizeof(char),n,fp); MPI_Bcast(keywords[m],n,MPI_CHAR,0,world); - styles[m] = force->new_improper(keywords[m],lmp->suffix,dummy); + styles[m] = force->new_improper(keywords[m],0,dummy); } } diff --git a/src/input.cpp b/src/input.cpp index 785929bb21..ac91ab5467 100644 --- a/src/input.cpp +++ b/src/input.cpp @@ -1083,7 +1083,7 @@ void Input::angle_style() if (narg < 1) error->all(FLERR,"Illegal angle_style command"); if (atom->avec->angles_allow == 0) 
error->all(FLERR,"Angle_style command when no angles allowed"); - force->create_angle(arg[0],lmp->suffix); + force->create_angle(arg[0],1); if (force->angle) force->angle->settings(narg-1,&arg[1]); } @@ -1101,7 +1101,7 @@ void Input::atom_style() if (narg < 1) error->all(FLERR,"Illegal atom_style command"); if (domain->box_exist) error->all(FLERR,"Atom_style command after simulation box is defined"); - atom->create_avec(arg[0],narg-1,&arg[1],lmp->suffix); + atom->create_avec(arg[0],narg-1,&arg[1],1); } /* ---------------------------------------------------------------------- */ @@ -1124,7 +1124,7 @@ void Input::bond_style() if (narg < 1) error->all(FLERR,"Illegal bond_style command"); if (atom->avec->bonds_allow == 0) error->all(FLERR,"Bond_style command when no bonds allowed"); - force->create_bond(arg[0],lmp->suffix); + force->create_bond(arg[0],1); if (force->bond) force->bond->settings(narg-1,&arg[1]); } @@ -1175,7 +1175,7 @@ void Input::comm_style() void Input::compute() { - modify->add_compute(narg,arg,lmp->suffix); + modify->add_compute(narg,arg,1); } /* ---------------------------------------------------------------------- */ @@ -1213,7 +1213,7 @@ void Input::dihedral_style() if (narg < 1) error->all(FLERR,"Illegal dihedral_style command"); if (atom->avec->dihedrals_allow == 0) error->all(FLERR,"Dihedral_style command when no dihedrals allowed"); - force->create_dihedral(arg[0],lmp->suffix); + force->create_dihedral(arg[0],1); if (force->dihedral) force->dihedral->settings(narg-1,&arg[1]); } @@ -1253,7 +1253,7 @@ void Input::dump_modify() void Input::fix() { - modify->add_fix(narg,arg,lmp->suffix); + modify->add_fix(narg,arg,1); } /* ---------------------------------------------------------------------- */ @@ -1290,7 +1290,7 @@ void Input::improper_style() if (narg < 1) error->all(FLERR,"Illegal improper_style command"); if (atom->avec->impropers_allow == 0) error->all(FLERR,"Improper_style command when no impropers allowed"); - 
force->create_improper(arg[0],lmp->suffix); + force->create_improper(arg[0],1); if (force->improper) force->improper->settings(narg-1,&arg[1]); } @@ -1307,7 +1307,7 @@ void Input::kspace_modify() void Input::kspace_style() { - force->create_kspace(narg,arg,lmp->suffix); + force->create_kspace(narg,arg,1); } /* ---------------------------------------------------------------------- */ @@ -1412,7 +1412,7 @@ void Input::package() fixarg[1] = (char *) "all"; fixarg[2] = (char *) "GPU"; for (int i = 1; i < narg; i++) fixarg[i+2] = arg[i]; - modify->add_fix(2+narg,fixarg,NULL); + modify->add_fix(2+narg,fixarg); delete [] fixarg; force->newton_pair = 0; @@ -1427,9 +1427,54 @@ void Input::package() fixarg[1] = (char *) "all"; fixarg[2] = (char *) "OMP"; for (int i = 1; i < narg; i++) fixarg[i+2] = arg[i]; - modify->add_fix(2+narg,fixarg,NULL); + modify->add_fix(2+narg,fixarg); delete [] fixarg; + } else if (strcmp(arg[0],"intel") == 0) { + + // add omp package for non-pair routines + + /* + char **fixarg = new char*[2+narg]; + fixarg[0] = (char *) "package_omp"; + fixarg[1] = (char *) "all"; + fixarg[2] = (char *) "OMP"; + int omp_narg = 3; + if (narg > 1) { + fixarg[3] = arg[1]; + omp_narg++; + if (narg > 2) + for (int i = 2; i < narg; i++) + if (strcmp(arg[i],"mixed") == 0) { + fixarg[4] = arg[i]; + omp_narg++; + } + } + modify->add_fix(omp_narg,fixarg); + + // add intel package for neighbor and pair routines + */ + + char **fixarg = new char*[2+narg]; + fixarg[0] = (char *) "package_intel"; + fixarg[1] = (char *) "all"; + fixarg[2] = (char *) "Intel"; + for (int i = 1; i < narg; i++) fixarg[i+2] = arg[i]; + modify->add_fix(2+narg,fixarg); + delete [] fixarg; + + /* + // if running with offload, set run_style to verlet/intel + + #ifdef LMP_INTEL_OFFLOAD + #ifdef __INTEL_OFFLOAD + char *str; + str = (char *) "verlet/intel"; + update->create_integrate(1,&str,0); + #endif + #endif + */ + } else error->all(FLERR,"Illegal package command"); } @@ -1461,11 +1506,27 @@ void 
Input::pair_modify() void Input::pair_style() { if (narg < 1) error->all(FLERR,"Illegal pair_style command"); - if (force->pair && strcmp(arg[0],force->pair_style) == 0) { - force->pair->settings(narg-1,&arg[1]); - return; + if (force->pair) { + int match = 0; + if (strcmp(arg[0],force->pair_style) == 0) match = 1; + if (!match && lmp->suffix_enable) { + char estyle[256]; + if (lmp->suffix) { + sprintf(estyle,"%s/%s",arg[0],lmp->suffix); + if (strcmp(estyle,force->pair_style) == 0) match = 1; + } + if (lmp->suffix2) { + sprintf(estyle,"%s/%s",arg[0],lmp->suffix2); + if (strcmp(estyle,force->pair_style) == 0) match = 1; + } + } + if (match) { + force->pair->settings(narg-1,&arg[1]); + return; + } } - force->create_pair(arg[0],lmp->suffix); + + force->create_pair(arg[0],1); if (force->pair) force->pair->settings(narg-1,&arg[1]); } @@ -1514,7 +1575,7 @@ void Input::run_style() { if (domain->box_exist == 0) error->all(FLERR,"Run_style command before simulation box is defined"); - update->create_integrate(narg,arg,lmp->suffix); + update->create_integrate(narg,arg,1); } /* ---------------------------------------------------------------------- */ @@ -1561,6 +1622,12 @@ void Input::suffix() int n = strlen(arg[0]) + 1; lmp->suffix = new char[n]; strcpy(lmp->suffix,arg[0]); + // set 2nd suffix = "omp" when suffix = "intel" + if (strcmp(lmp->suffix,"intel") == 0) { + delete [] lmp->suffix2; + lmp->suffix2 = new char[4]; + strcpy(lmp->suffix2,"omp"); + } lmp->suffix_enable = 1; } } diff --git a/src/lammps.cpp b/src/lammps.cpp index d1e84cf9b3..69945a805f 100644 --- a/src/lammps.cpp +++ b/src/lammps.cpp @@ -45,6 +45,7 @@ #include "accelerator_cuda.h" #include "accelerator_kokkos.h" #include "accelerator_omp.h" +#include "accelerator_intel.h" #include "timer.h" #include "memory.h" #include "error.h" @@ -84,7 +85,7 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator) int citeflag = 1; int helpflag = 0; - suffix = NULL; + suffix = suffix2 = NULL; suffix_enable = 0; char 
*rfile = NULL; char *dfile = NULL; @@ -172,6 +173,11 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator) int n = strlen(arg[iarg+1]) + 1; suffix = new char[n]; strcpy(suffix,arg[iarg+1]); + // set 2nd suffix = "omp" when suffix = "intel" + if (strcmp(suffix,"intel") == 0) { + suffix2 = new char[4]; + strcpy(suffix2,"omp"); + } suffix_enable = 1; iarg += 2; } else if (strcmp(arg[iarg],"-reorder") == 0 || @@ -535,6 +541,7 @@ LAMMPS::~LAMMPS() delete cuda; delete kokkos; delete [] suffix; + delete [] suffix2; delete input; delete universe; @@ -571,7 +578,7 @@ void LAMMPS::create() if (kokkos) atom = new AtomKokkos(this); else atom = new Atom(this); - atom->create_avec("atomic",0,NULL,suffix); + atom->create_avec("atomic",0,NULL,1); group = new Group(this); force = new Force(this); // must be after group, to create temperature @@ -590,13 +597,20 @@ void LAMMPS::create() invoke package-specific setup commands called from LAMMPS constructor and after clear() command only invoke if suffix is set and enabled + also check if suffix2 is set ------------------------------------------------------------------------- */ void LAMMPS::post_create() { - if (suffix && suffix_enable) { + if (!suffix_enable) return; + if (suffix) { if (strcmp(suffix,"gpu") == 0) input->one("package gpu force/neigh 0 0 1"); if (strcmp(suffix,"omp") == 0) input->one("package omp *"); + if (strcmp(suffix,"intel") == 0) + input->one("package intel * mixed balance -1"); + } + if (suffix2) { + if (strcmp(suffix2,"omp") == 0) input->one("package omp *"); // keyed on suffix2, not suffix } } diff --git a/src/lammps.h b/src/lammps.h index 44c7921bd8..8ff0eca067 100644 --- a/src/lammps.h +++ b/src/lammps.h @@ -42,11 +42,14 @@ class LAMMPS { FILE *screen; // screen output FILE *logfile; // logfile - char *suffix; // suffix to add to input script style names - int suffix_enable; // 1 if suffix enabled, 0 if disabled + char *suffix,*suffix2; // suffixes to add to input script style names + int suffix_enable; // 1 if suffixes are
enabled, 0 if disabled int cite_enable; // 1 if generating log.cite, 0 if disabled class Cuda *cuda; // CUDA accelerator class + //class GPU *gpu; // GPU accelerator class + //class Intel *intel; // Intel accelerator class + //class OMP *omp; // OMP accelerator class class KokkosLMP *kokkos; // KOKKOS accelerator class class CiteMe *citeme; // citation info diff --git a/src/modify.cpp b/src/modify.cpp index b55f368fef..065838f4ce 100644 --- a/src/modify.cpp +++ b/src/modify.cpp @@ -31,7 +31,7 @@ using namespace FixConst; #define DELTA 4 #define BIG 1.0e20 -#define NEXCEPT 4 // change when add to exceptions in add_fix() +#define NEXCEPT 5 // change when add to exceptions in add_fix() /* ---------------------------------------------------------------------- */ @@ -649,7 +649,7 @@ int Modify::min_reset_ref() add a new fix or replace one with same ID ------------------------------------------------------------------------- */ -void Modify::add_fix(int narg, char **arg, char *suffix) +void Modify::add_fix(int narg, char **arg, int trysuffix) { if (narg < 3) error->all(FLERR,"Illegal fix command"); @@ -658,9 +658,10 @@ void Modify::add_fix(int narg, char **arg, char *suffix) // but can't think of better way // too late if instantiate fix, then check flag set in fix constructor, // since some fixes access domain settings in their constructor - // change NEXCEPT above when add new fix to this list + // MUST change NEXCEPT above when add new fix to this list - const char *exceptions[NEXCEPT] = {"GPU","OMP","property/atom","cmap"}; + const char *exceptions[NEXCEPT] = + {"GPU","OMP","Intel","property/atom","cmap"}; if (domain->box_exist == 0) { int m; @@ -694,12 +695,27 @@ void Modify::add_fix(int narg, char **arg, char *suffix) if (ifix < nfix) { newflag = 0; - if (strcmp(arg[2],fix[ifix]->style) != 0) - error->all(FLERR,"Replacing a fix, but new style != old style"); + + int match = 0; + if (strcmp(arg[2],fix[ifix]->style) == 0) match = 1; + if (!match && trysuffix && 
lmp->suffix_enable) { + char estyle[256]; + if (lmp->suffix) { + sprintf(estyle,"%s/%s",arg[2],lmp->suffix); + if (strcmp(estyle,fix[ifix]->style) == 0) match = 1; + } + if (lmp->suffix2) { + sprintf(estyle,"%s/%s",arg[2],lmp->suffix2); + if (strcmp(estyle,fix[ifix]->style) == 0) match = 1; + } + } + if (!match) error->all(FLERR,"Replacing a fix, but new style != old style"); + if (fix[ifix]->igroup != igroup && comm->me == 0) error->warning(FLERR,"Replacing a fix, but new group != old group"); delete fix[ifix]; fix[ifix] = NULL; + } else { newflag = 1; if (nfix == maxfix) { @@ -714,12 +730,22 @@ void Modify::add_fix(int narg, char **arg, char *suffix) fix[ifix] = NULL; - if (suffix && lmp->suffix_enable) { - char estyle[256]; - sprintf(estyle,"%s/%s",arg[2],suffix); - if (fix_map->find(estyle) != fix_map->end()) { - FixCreator fix_creator = (*fix_map)[estyle]; - fix[ifix] = fix_creator(lmp,narg,arg); + if (trysuffix && lmp->suffix_enable) { + if (lmp->suffix) { + char estyle[256]; + sprintf(estyle,"%s/%s",arg[2],lmp->suffix); + if (fix_map->find(estyle) != fix_map->end()) { + FixCreator fix_creator = (*fix_map)[estyle]; + fix[ifix] = fix_creator(lmp,narg,arg); + } + } + if (fix[ifix] == NULL && lmp->suffix2) { + char estyle[256]; + sprintf(estyle,"%s/%s",arg[2],lmp->suffix2); + if (fix_map->find(estyle) != fix_map->end()) { + FixCreator fix_creator = (*fix_map)[estyle]; + fix[ifix] = fix_creator(lmp,narg,arg); + } } } @@ -838,7 +864,7 @@ int Modify::find_fix(const char *id) add a new compute ------------------------------------------------------------------------- */ -void Modify::add_compute(int narg, char **arg, char *suffix) +void Modify::add_compute(int narg, char **arg, int trysuffix) { if (narg < 3) error->all(FLERR,"Illegal compute command"); @@ -861,12 +887,22 @@ void Modify::add_compute(int narg, char **arg, char *suffix) compute[ncompute] = NULL; - if (suffix && lmp->suffix_enable) { - char estyle[256]; - sprintf(estyle,"%s/%s",arg[2],suffix); - if 
(compute_map->find(estyle) != compute_map->end()) { - ComputeCreator compute_creator = (*compute_map)[estyle]; - compute[ncompute] = compute_creator(lmp,narg,arg); + if (trysuffix && lmp->suffix_enable) { + if (lmp->suffix) { + char estyle[256]; + sprintf(estyle,"%s/%s",arg[2],lmp->suffix); + if (compute_map->find(estyle) != compute_map->end()) { + ComputeCreator compute_creator = (*compute_map)[estyle]; + compute[ncompute] = compute_creator(lmp,narg,arg); + } + } + if (compute[ncompute] == NULL && lmp->suffix2) { + char estyle[256]; + sprintf(estyle,"%s/%s",arg[2],lmp->suffix2); + if (compute_map->find(estyle) != compute_map->end()) { + ComputeCreator compute_creator = (*compute_map)[estyle]; + compute[ncompute] = compute_creator(lmp,narg,arg); + } } } diff --git a/src/modify.h b/src/modify.h index 422c77d2fe..ba7101a934 100644 --- a/src/modify.h +++ b/src/modify.h @@ -82,12 +82,12 @@ class Modify : protected Pointers { virtual int min_dof(); virtual int min_reset_ref(); - void add_fix(int, char **, char *suffix = NULL); + void add_fix(int, char **, int trysuffix=0); void modify_fix(int, char **); void delete_fix(const char *); int find_fix(const char *); - void add_compute(int, char **, char *suffix = NULL); + void add_compute(int, char **, int trysuffix=0); void modify_compute(int, char **); void delete_compute(const char *); int find_compute(const char *); diff --git a/src/neigh_list.cpp b/src/neigh_list.cpp index dc925a6f90..0de4c0cb5f 100644 --- a/src/neigh_list.cpp +++ b/src/neigh_list.cpp @@ -246,6 +246,7 @@ void NeighList::print_attributes() printf(" %d = occasional\n",rq->occasional); printf(" %d = dnum\n",rq->dnum); printf(" %d = omp\n",rq->omp); + printf(" %d = intel\n",rq->intel); printf(" %d = ghost\n",rq->ghost); printf(" %d = cudable\n",rq->cudable); printf(" %d = omp\n",rq->omp); diff --git a/src/neigh_request.cpp b/src/neigh_request.cpp index 643d11b3bb..95ddc01517 100644 --- a/src/neigh_request.cpp +++ b/src/neigh_request.cpp @@ -56,6 +56,7 @@ 
NeighRequest::NeighRequest(LAMMPS *lmp) : Pointers(lmp) ghost = 0; cudable = 0; omp = 0; + intel = 0; kokkos_host = kokkos_device = 0; // default is no copy or skip @@ -126,6 +127,7 @@ int NeighRequest::identical(NeighRequest *other) if (ghost != other->ghost) same = 0; if (cudable != other->cudable) same = 0; if (omp != other->omp) same = 0; + if (intel != other->intel) same = 0; if (copy != other->copy_original) same = 0; if (same_skip(other) == 0) same = 0; @@ -155,6 +157,7 @@ int NeighRequest::same_kind(NeighRequest *other) if (ghost != other->ghost) same = 0; if (cudable != other->cudable) same = 0; if (omp != other->omp) same = 0; + if (intel != other->intel) same = 0; return same; } @@ -205,4 +208,5 @@ void NeighRequest::copy_request(NeighRequest *other) ghost = other->ghost; cudable = other->cudable; omp = other->omp; + intel = other->intel; } diff --git a/src/neigh_request.h b/src/neigh_request.h index 769d5354bf..41fa951fee 100644 --- a/src/neigh_request.h +++ b/src/neigh_request.h @@ -79,9 +79,10 @@ class NeighRequest : protected Pointers { int cudable; - // 1 if using multi-threaded neighbor list build + // 1 if using multi-threaded neighbor list build for USER-OMP or USER-INTEL int omp; + int intel; // 1 if using Kokkos neighbor build diff --git a/src/neighbor.cpp b/src/neighbor.cpp index 705887ba03..28c051313c 100644 --- a/src/neighbor.cpp +++ b/src/neighbor.cpp @@ -920,7 +920,7 @@ void Neighbor::choose_build(int index, NeighRequest *rq) { PairPtr pb = NULL; - if (rq->omp == 0) { + if (rq->omp == 0 && rq->intel == 0) { if (rq->copy) pb = &Neighbor::copy_from; @@ -1076,21 +1076,33 @@ void Neighbor::choose_build(int index, NeighRequest *rq) } else if (style == BIN) { if (rq->newton == 0) { if (newton_pair == 0) { - if (rq->ghost == 0) pb = &Neighbor::half_bin_no_newton_omp; - else if (includegroup) + if (rq->ghost == 0) { + if (rq->intel) pb = &Neighbor::half_bin_no_newton_intel; + else pb = &Neighbor::half_bin_no_newton_omp; + } else if (includegroup) 
error->all(FLERR,"Neighbor include group not allowed " "with ghost neighbors"); else pb = &Neighbor::half_bin_no_newton_ghost_omp; } else if (triclinic == 0) { - pb = &Neighbor::half_bin_newton_omp; - } else if (triclinic == 1) - pb = &Neighbor::half_bin_newton_tri_omp; + if (rq->intel) pb = &Neighbor::half_bin_newton_intel; + else pb = &Neighbor::half_bin_newton_omp; + } else if (triclinic == 1) { + if (rq->intel) pb = &Neighbor::half_bin_newton_tri_intel; + else pb = &Neighbor::half_bin_newton_tri_omp; + } } else if (rq->newton == 1) { - if (triclinic == 0) pb = &Neighbor::half_bin_newton_omp; - else if (triclinic == 1) pb = &Neighbor::half_bin_newton_tri_omp; + if (triclinic == 0) { + if (rq->intel) pb = &Neighbor::half_bin_newton_intel; + else pb = &Neighbor::half_bin_newton_omp; + } else if (triclinic == 1) { + if (rq->intel) pb = &Neighbor::half_bin_newton_tri_intel; + else pb = &Neighbor::half_bin_newton_tri_omp; + } } else if (rq->newton == 2) { - if (rq->ghost == 0) pb = &Neighbor::half_bin_no_newton_omp; - else if (includegroup) + if (rq->ghost == 0) { + if (rq->intel) pb = &Neighbor::half_bin_no_newton_intel; + else pb = &Neighbor::half_bin_no_newton_omp; + } else if (includegroup) error->all(FLERR,"Neighbor include group not allowed " "with ghost neighbors"); else pb = &Neighbor::half_bin_no_newton_ghost_omp; diff --git a/src/neighbor.h b/src/neighbor.h index 3c0c4af889..05a8622d04 100644 --- a/src/neighbor.h +++ b/src/neighbor.h @@ -237,6 +237,7 @@ class Neighbor : protected Pointers { #define LMP_INSIDE_NEIGHBOR_H #include "accelerator_omp.h" +#include "accelerator_intel.h" #undef LMP_INSIDE_NEIGHBOR_H // pairwise stencil creation functions diff --git a/src/output.cpp b/src/output.cpp index 0383dfe84d..6a9223604e 100644 --- a/src/output.cpp +++ b/src/output.cpp @@ -50,18 +50,18 @@ Output::Output(LAMMPS *lmp) : Pointers(lmp) newarg[0] = (char *) "thermo_temp"; newarg[1] = (char *) "all"; newarg[2] = (char *) "temp"; - 
modify->add_compute(3,newarg,lmp->suffix); + modify->add_compute(3,newarg,1); newarg[0] = (char *) "thermo_press"; newarg[1] = (char *) "all"; newarg[2] = (char *) "pressure"; newarg[3] = (char *) "thermo_temp"; - modify->add_compute(4,newarg,lmp->suffix); + modify->add_compute(4,newarg,1); newarg[0] = (char *) "thermo_pe"; newarg[1] = (char *) "all"; newarg[2] = (char *) "pe"; - modify->add_compute(3,newarg,lmp->suffix); + modify->add_compute(3,newarg,1); delete [] newarg; diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp index e15d4b00d8..15de14db13 100644 --- a/src/pair_hybrid.cpp +++ b/src/pair_hybrid.cpp @@ -219,7 +219,7 @@ void PairHybrid::settings(int narg, char **arg) // call settings() with set of args that are not pair style names // use force->pair_map to determine which args these are - int iarg,jarg,dummy; + int iarg,jarg,sflag; iarg = 0; nstyles = 0; @@ -228,10 +228,10 @@ void PairHybrid::settings(int narg, char **arg) error->all(FLERR,"Pair style hybrid cannot have hybrid as an argument"); if (strcmp(arg[iarg],"none") == 0) error->all(FLERR,"Pair style hybrid cannot have none as an argument"); - styles[nstyles] = force->new_pair(arg[iarg],lmp->suffix,dummy); - int n = strlen(arg[iarg]) + 1; - keywords[nstyles] = new char[n]; - strcpy(keywords[nstyles],arg[iarg]); + + styles[nstyles] = force->new_pair(arg[iarg],1,sflag); + force->store_style(keywords[nstyles],arg[iarg],sflag); + jarg = iarg + 1; while (jarg < narg && !force->pair_map->count(arg[jarg])) jarg++; styles[nstyles]->settings(jarg-iarg-1,&arg[iarg+1]); @@ -637,7 +637,7 @@ void PairHybrid::read_restart(FILE *fp) keywords[m] = new char[n]; if (me == 0) fread(keywords[m],sizeof(char),n,fp); MPI_Bcast(keywords[m],n,MPI_CHAR,0,world); - styles[m] = force->new_pair(keywords[m],lmp->suffix,dummy); + styles[m] = force->new_pair(keywords[m],0,dummy); styles[m]->read_restart_settings(fp); } diff --git a/src/read_restart.cpp b/src/read_restart.cpp index 749bff7f31..441ecbfcd2 100644 --- 
a/src/read_restart.cpp +++ b/src/read_restart.cpp @@ -797,7 +797,7 @@ void ReadRestart::header(int incompatible) char **argcopy = new char*[nargcopy]; for (int i = 0; i < nargcopy; i++) argcopy[i] = read_string(); - atom->create_avec(style,nargcopy,argcopy); + atom->create_avec(style,nargcopy,argcopy,0); for (int i = 0; i < nargcopy; i++) delete [] argcopy[i]; delete [] argcopy; delete [] style; @@ -891,31 +891,31 @@ void ReadRestart::force_fields() if (flag == PAIR) { style = read_string(); - force->create_pair(style); + force->create_pair(style,0); delete [] style; force->pair->read_restart(fp); } else if (flag == BOND) { style = read_string(); - force->create_bond(style); + force->create_bond(style,0); delete [] style; force->bond->read_restart(fp); } else if (flag == ANGLE) { style = read_string(); - force->create_angle(style); + force->create_angle(style,0); delete [] style; force->angle->read_restart(fp); } else if (flag == DIHEDRAL) { style = read_string(); - force->create_dihedral(style); + force->create_dihedral(style,0); delete [] style; force->dihedral->read_restart(fp); } else if (flag == IMPROPER) { style = read_string(); - force->create_improper(style); + force->create_improper(style,0); delete [] style; force->improper->read_restart(fp); diff --git a/src/replicate.cpp b/src/replicate.cpp index 26f3fca7ed..7300da0968 100644 --- a/src/replicate.cpp +++ b/src/replicate.cpp @@ -116,7 +116,7 @@ void Replicate::command(int narg, char **arg) Atom *old = atom; atom = new Atom(lmp); atom->settings(old); - atom->create_avec(old->atom_style,old->avec->nargcopy,old->avec->argcopy); + atom->create_avec(old->atom_style,old->avec->nargcopy,old->avec->argcopy,0); // check that new system will not be too large // new tags cannot exceed MAXTAGINT diff --git a/src/suffix.h b/src/suffix.h index 2a150ed5ef..43493d6203 100644 --- a/src/suffix.h +++ b/src/suffix.h @@ -22,6 +22,7 @@ namespace Suffix { static const int GPU = 1<<1; static const int CUDA = 1<<2; static const 
int OMP = 1<<3; + static const int INTEL = 1<<4; } } diff --git a/src/update.cpp b/src/update.cpp index a2017db066..610cce1abd 100644 --- a/src/update.cpp +++ b/src/update.cpp @@ -62,7 +62,7 @@ Update::Update(LAMMPS *lmp) : Pointers(lmp) minimize = NULL; str = (char *) "verlet"; - create_integrate(1,&str,lmp->suffix); + create_integrate(1,&str,1); str = (char *) "cg"; create_minimize(1,&str); @@ -293,7 +293,7 @@ void Update::set_units(const char *style) /* ---------------------------------------------------------------------- */ -void Update::create_integrate(int narg, char **arg, char *suffix) +void Update::create_integrate(int narg, char **arg, int trysuffix) { if (narg < 1) error->all(FLERR,"Illegal run_style command"); @@ -301,11 +301,12 @@ void Update::create_integrate(int narg, char **arg, char *suffix) delete integrate; int sflag; - new_integrate(arg[0],narg-1,&arg[1],suffix,sflag); + new_integrate(arg[0],narg-1,&arg[1],trysuffix,sflag); if (sflag) { char estyle[256]; - sprintf(estyle,"%s/%s",arg[0],suffix); + if (sflag == 1) sprintf(estyle,"%s/%s",arg[0],lmp->suffix); + else sprintf(estyle,"%s/%s",arg[0],lmp->suffix2); int n = strlen(estyle) + 1; integrate_style = new char[n]; strcpy(integrate_style,estyle); @@ -321,42 +322,59 @@ void Update::create_integrate(int narg, char **arg, char *suffix) ------------------------------------------------------------------------- */ void Update::new_integrate(char *style, int narg, char **arg, - char *suffix, int &sflag) + int trysuffix, int &sflag) { - int success = 0; + if (trysuffix && lmp->suffix_enable) { + if (lmp->suffix) { + sflag = 1; + char estyle[256]; + sprintf(estyle,"%s/%s",style,lmp->suffix); + int success = 1; - if (suffix && lmp->suffix_enable) { - sflag = 1; - char estyle[256]; - sprintf(estyle,"%s/%s",style,suffix); - success = 1; - - if (0) return; + if (0) return; #define INTEGRATE_CLASS #define IntegrateStyle(key,Class) \ - else if (strcmp(estyle,#key) == 0) integrate = new Class(lmp,narg,arg); + 
else if (strcmp(estyle,#key) == 0) integrate = new Class(lmp,narg,arg); +#include "style_integrate.h" +#undef IntegrateStyle +#undef INTEGRATE_CLASS + + else success = 0; + if (success) return; + } + + if (lmp->suffix2) { + sflag = 2; + char estyle[256]; + sprintf(estyle,"%s/%s",style,lmp->suffix2); + int success = 1; + + if (0) return; + +#define INTEGRATE_CLASS +#define IntegrateStyle(key,Class) \ + else if (strcmp(estyle,#key) == 0) integrate = new Class(lmp,narg,arg); +#include "style_integrate.h" +#undef IntegrateStyle +#undef INTEGRATE_CLASS + + else success = 0; + if (success) return; + } + } + + sflag = 0; + if (0) return; + +#define INTEGRATE_CLASS +#define IntegrateStyle(key,Class) \ + else if (strcmp(style,#key) == 0) integrate = new Class(lmp,narg,arg); #include "style_integrate.h" #undef IntegrateStyle #undef INTEGRATE_CLASS - else success = 0; - } - - if (!success) { - sflag = 0; - - if (0) return; - -#define INTEGRATE_CLASS -#define IntegrateStyle(key,Class) \ - else if (strcmp(style,#key) == 0) integrate = new Class(lmp,narg,arg); -#include "style_integrate.h" -#undef IntegrateStyle -#undef INTEGRATE_CLASS - - else error->all(FLERR,"Illegal integrate style"); - } + else error->all(FLERR,"Illegal integrate style"); } /* ---------------------------------------------------------------------- */ diff --git a/src/update.h b/src/update.h index 53bf041dea..5c6c15bad7 100644 --- a/src/update.h +++ b/src/update.h @@ -50,7 +50,7 @@ class Update : protected Pointers { ~Update(); void init(); void set_units(const char *); - void create_integrate(int, char **, char *); + void create_integrate(int, char **, int); void create_minimize(int, char **); void reset_timestep(int, char **); void reset_timestep(bigint); @@ -58,7 +58,7 @@ class Update : protected Pointers { bigint memory_usage(); private: - void new_integrate(char *, int, char **, char *, int &); + void new_integrate(char *, int, char **, int, int &); };