Feb2021 GPU Package Update - GPU Package Files

This commit is contained in:
Michael Brown
2021-02-15 08:20:50 -08:00
parent 16004e8f45
commit e7e2d2323b
345 changed files with 13424 additions and 7708 deletions

View File

@ -51,7 +51,7 @@ BIN2C = $(CUDA_HOME)/bin/bin2c
# host code compiler and settings # host code compiler and settings
CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC CUDR_CPP = mpicxx -fopenmp -fopenmp-simd -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC
CUDR_OPTS = -O2 $(LMP_INC) CUDR_OPTS = -O2 $(LMP_INC)
CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PROXY) $(CUDA_PRECISION) $(CUDA_INCLUDE) \ CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PROXY) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
$(CUDPP_OPT) $(CUDPP_OPT)

View File

@ -17,7 +17,7 @@ LMP_INC = -DLAMMPS_SMALLBIG
HIP_PRECISION = -D_SINGLE_DOUBLE HIP_PRECISION = -D_SINGLE_DOUBLE
HIP_OPTS = -O3 HIP_OPTS = -O3
HIP_HOST_OPTS = -Wno-deprecated-declarations HIP_HOST_OPTS = -Wno-deprecated-declarations -fopenmp -fopenmp-simd
HIP_HOST_INCLUDE = HIP_HOST_INCLUDE =
# use device sort # use device sort

View File

@ -1,5 +1,5 @@
# Settings that the LAMMPS build will import when this package library is used # Settings that the LAMMPS build will import when this package library is used
gpu_SYSINC = gpu_SYSINC = -DFFT_SINGLE
gpu_SYSLIB = -framework OpenCL gpu_SYSLIB = -framework OpenCL
gpu_SYSPATH = gpu_SYSPATH =

View File

@ -1,25 +1,21 @@
# /* ---------------------------------------------------------------------- # /* ----------------------------------------------------------------------
# Generic Linux Makefile for OpenCL # Generic Linux Makefile for OpenCL - Mixed precision
# ------------------------------------------------------------------------- */ # ------------------------------------------------------------------------- */
# which file will be copied to Makefile.lammps # which file will be copied to Makefile.lammps
EXTRAMAKE = Makefile.lammps.opencl EXTRAMAKE = Makefile.lammps.opencl
# OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi
# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler
# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress
OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device
# this setting should match LAMMPS Makefile # this setting should match LAMMPS Makefile
# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL # one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
LMP_INC = -DLAMMPS_SMALLBIG LMP_INC = -DLAMMPS_SMALLBIG
OCL_INC = -I/usr/local/cuda/include # Path to CL directory OCL_INC =
OCL_CPP = mpic++ $(DEFAULT_DEVICE) -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC) -std=c++11 OCL_CPP = mpic++ -std=c++11 -O3 -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC)
OCL_LINK = -L/usr/local/cuda/lib64 -lOpenCL OCL_LINK = -lOpenCL
OCL_PREC = -D_SINGLE_DOUBLE OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -fopenmp -fopenmp-simd -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT
BIN_DIR = ./ BIN_DIR = ./
OBJ_DIR = ./ OBJ_DIR = ./
@ -28,4 +24,3 @@ AR = ar
BSH = /bin/sh BSH = /bin/sh
include Opencl.makefile include Opencl.makefile

View File

@ -1,19 +1,17 @@
# /* ---------------------------------------------------------------------- # /* ----------------------------------------------------------------------
# Generic Mac Makefile for OpenCL # Generic Mac Makefile for OpenCL - Single precision with FFT_SINGLE
# ------------------------------------------------------------------------- */ # ------------------------------------------------------------------------- */
# which file will be copied to Makefile.lammps # which file will be copied to Makefile.lammps
EXTRAMAKE = Makefile.lammps.mac_ocl EXTRAMAKE = Makefile.lammps.mac_ocl
OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi LMP_INC = -DLAMMPS_SMALLBIG
# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler
# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress
# OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device
OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT OCL_CPP = clang++ -std=c++11 -O3 -I../../src/STUBS
OCL_LINK = -framework OpenCL OCL_LINK = -framework OpenCL
OCL_PREC = -D_SINGLE_SINGLE OCL_PREC = -D_SINGLE_SINGLE
OCL_TUNE = -DUCL_NO_EXIT
BIN_DIR = ./ BIN_DIR = ./
OBJ_DIR = ./ OBJ_DIR = ./

View File

@ -0,0 +1,23 @@
# /* ----------------------------------------------------------------------
# Generic Mac Makefile for OpenCL - Single precision with FFT_SINGLE
# ------------------------------------------------------------------------- */
# which file will be copied to Makefile.lammps
EXTRAMAKE = Makefile.lammps.mac_ocl
LMP_INC = -DLAMMPS_SMALLBIG
OCL_CPP = mpicxx -std=c++11 -O3 -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1
OCL_LINK = -framework OpenCL
OCL_PREC = -D_SINGLE_SINGLE
OCL_TUNE = -DUCL_NO_EXIT -DMPI_GERYON
BIN_DIR = ./
OBJ_DIR = ./
LIB_DIR = ./
AR = ar
BSH = /bin/sh
include Opencl.makefile

26
lib/gpu/Makefile.oneapi Normal file
View File

@ -0,0 +1,26 @@
# /* ----------------------------------------------------------------------
# Generic Linux Makefile for OpenCL
# ------------------------------------------------------------------------- */
# which file will be copied to Makefile.lammps
EXTRAMAKE = Makefile.lammps.opencl
# this setting should match LAMMPS Makefile
# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
LMP_INC = -DLAMMPS_SMALLBIG
OCL_INC =
OCL_CPP = mpiicpc -std=c++11 -xHost -O2 -qopenmp -qopenmp-simd -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC)
OCL_LINK = -lOpenCL
OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -fp-model fast=2 -no-prec-div
BIN_DIR = ./
OBJ_DIR = ./
LIB_DIR = ./
AR = ar
BSH = /bin/sh
include Opencl.makefile

View File

@ -1,6 +1,7 @@
# Headers for Geryon # Headers for Geryon
UCL_H = $(wildcard ./geryon/ucl*.h) UCL_H = $(wildcard ./geryon/ucl*.h)
NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h \
lal_pre_cuda_hip.h
ALL_H = $(NVD_H) $(wildcard ./lal_*.h) ALL_H = $(NVD_H) $(wildcard ./lal_*.h)
# Source files # Source files
@ -39,17 +40,21 @@ BIN2C = $(CUDA_HOME)/bin/bin2c
# device code compilation # device code compilation
$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h $(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \
lal_pre_cuda_hip.h
$(CUDA) --fatbin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu $(CUDA) --fatbin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu
$(OBJ_DIR)/pppm_f_cubin.h: $(OBJ_DIR)/pppm_f.cubin $(OBJ_DIR)/pppm_f_cubin.h: $(OBJ_DIR)/pppm_f.cubin
$(BIN2C) -c -n pppm_f $(OBJ_DIR)/pppm_f.cubin > $(OBJ_DIR)/pppm_f_cubin.h $(BIN2C) -c -n pppm_f $(OBJ_DIR)/pppm_f.cubin > $(OBJ_DIR)/pppm_f_cubin.h
rm $(OBJ_DIR)/pppm_f.cubin
$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h $(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \
lal_pre_cuda_hip.h
$(CUDA) --fatbin -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu $(CUDA) --fatbin -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu
$(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin
$(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h $(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h
rm $(OBJ_DIR)/pppm_d.cubin
$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H) $(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H)
$(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu $(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu
@ -93,7 +98,7 @@ $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda $(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda
clean: clean:
-rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CUHS) *.linkinfo -rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CUHS) *.cubin *.linkinfo
veryclean: clean veryclean: clean
-rm -rf *~ *.linkinfo -rm -rf *~ *.linkinfo

View File

@ -1,8 +1,15 @@
# Common headers for kernels
PRE1_H = lal_preprocessor.h lal_aux_fun1.h
# Headers for Geryon # Headers for Geryon
UCL_H = $(wildcard ./geryon/ucl*.h) UCL_H = $(wildcard ./geryon/ucl*.h)
OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_preprocessor.h OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_precision.h
PRE1_H = lal_preprocessor.h lal_aux_fun1.h
ALL_H = $(OCL_H) $(wildcard ./lal_*.h) # Headers for Host files
HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h \
lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \
lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \
lal_neighbor_shared.h lal_pre_ocl_config.h $(OCL_H)
# Source files # Source files
SRCS := $(wildcard ./lal_*.cpp) SRCS := $(wildcard ./lal_*.cpp)
@ -28,12 +35,75 @@ OCL = $(OCL_CPP) $(OCL_PREC) $(OCL_TUNE) -DUSE_OPENCL
# device code compilation # device code compilation
$(OBJ_DIR)/atom_cl.h: lal_atom.cu lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh atom lal_preprocessor.h lal_atom.cu $(OBJ_DIR)/atom_cl.h
$(OBJ_DIR)/neighbor_cpu_cl.h: lal_neighbor_cpu.cu lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh neighbor_cpu lal_preprocessor.h lal_neighbor_cpu.cu $(OBJ_DIR)/neighbor_cpu_cl.h
$(OBJ_DIR)/neighbor_gpu_cl.h: lal_neighbor_gpu.cu lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh neighbor_gpu lal_preprocessor.h lal_neighbor_gpu.cu $(OBJ_DIR)/neighbor_gpu_cl.h
$(OBJ_DIR)/device_cl.h: lal_device.cu lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh device lal_preprocessor.h lal_device.cu $(OBJ_DIR)/device_cl.h
$(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h;
$(OBJ_DIR)/ellipsoid_nbor_cl.h: lal_ellipsoid_nbor.cu lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh ellipsoid_nbor lal_preprocessor.h lal_ellipsoid_nbor.cu $(OBJ_DIR)/ellipsoid_nbor_cl.h
$(OBJ_DIR)/gayberne_cl.h: lal_gayberne.cu $(PRE1_H) lal_ellipsoid_extra.h
$(BSH) ./geryon/file_to_cstr.sh gayberne $(PRE1_H) lal_ellipsoid_extra.h lal_gayberne.cu $(OBJ_DIR)/gayberne_cl.h;
$(OBJ_DIR)/gayberne_lj_cl.h: lal_gayberne_lj.cu $(PRE1_H) lal_ellipsoid_extra.h
$(BSH) ./geryon/file_to_cstr.sh gayberne_lj $(PRE1_H) lal_ellipsoid_extra.h lal_gayberne_lj.cu $(OBJ_DIR)/gayberne_lj_cl.h;
$(OBJ_DIR)/re_squared_cl.h: lal_re_squared.cu $(PRE1_H) lal_ellipsoid_extra.h
$(BSH) ./geryon/file_to_cstr.sh re_squared $(PRE1_H) lal_ellipsoid_extra.h lal_re_squared.cu $(OBJ_DIR)/re_squared_cl.h;
$(OBJ_DIR)/re_squared_lj_cl.h: lal_re_squared_lj.cu $(PRE1_H) lal_ellipsoid_extra.h
$(BSH) ./geryon/file_to_cstr.sh re_squared_lj $(PRE1_H) lal_ellipsoid_extra.h lal_re_squared_lj.cu $(OBJ_DIR)/re_squared_lj_cl.h;
$(OBJ_DIR)/tersoff_cl.h: lal_tersoff.cu $(PRE1_H) lal_tersoff_extra.h
$(BSH) ./geryon/file_to_cstr.sh tersoff $(PRE1_H) lal_tersoff_extra.h lal_tersoff.cu $(OBJ_DIR)/tersoff_cl.h;
$(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu $(PRE1_H) lal_tersoff_mod_extra.h
$(BSH) ./geryon/file_to_cstr.sh tersoff_mod $(PRE1_H) lal_tersoff_mod_extra.h lal_tersoff_mod.cu $(OBJ_DIR)/tersoff_mod_cl.h;
$(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu $(PRE1_H) lal_tersoff_zbl_extra.h
$(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h;
$(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H) $(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@; $(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@;
# host code compilation # host code compilation
$(OBJ_DIR)/lal_%.o: lal_%.cpp $(KERS) $(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
$(OCL) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
$(OCL) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
$(OCL) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
$(OCL) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
$(OCL) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
$(OCL) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
$(OCL) -o $@ -c $< -I$(OBJ_DIR)
$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
$(OCL) -o $@ -c $< -I$(OBJ_DIR)
$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cl.h $(HOST_H)
$(OCL) -o $@ -c $< -I$(OBJ_DIR) $(OCL) -o $@ -c $< -I$(OBJ_DIR)
$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL_H) $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL_H)

View File

@ -4,18 +4,107 @@
W. Michael Brown (ORNL) W. Michael Brown (ORNL)
Trung Dac Nguyen (ORNL/Northwestern) Trung Dac Nguyen (ORNL/Northwestern)
Peng Wang (NVIDIA) Nitin Dhamankar (Intel)
Axel Kohlmeyer (Temple) Axel Kohlmeyer (Temple)
Peng Wang (NVIDIA)
Anders Hafreager (UiO)
V. Nikolskiy (HSE)
Maurice de Koning (Unicamp/Brazil)
Rodolfo Paula Leite (Unicamp/Brazil)
Steve Plimpton (SNL) Steve Plimpton (SNL)
Inderaj Bains (NVIDIA) Inderaj Bains (NVIDIA)
-------------------------------------------------------------------
This directory has source files to build a library that LAMMPS ------------------------------------------------------------------------------
links against when using the GPU package.
This library must be built with a C++ compiler, before LAMMPS is This directory has source files to build a library that LAMMPS links against
built, so LAMMPS can link against it. when using the GPU package.
This library must be built with a C++ compiler along with CUDA, HIP, or OpenCL
before LAMMPS is built, so LAMMPS can link against it.
This library, libgpu.a, provides routines for acceleration of certain
LAMMPS styles and neighbor list builds using CUDA, OpenCL, or ROCm HIP.
Pair styles supported by this library are marked in the list of Pair style
potentials with a "g". See the online version at:
https://lammps.sandia.gov/doc/Commands_pair.html
In addition the (plain) pppm kspace style is supported as well.
------------------------------------------------------------------------------
DEVICE QUERY
------------------------------------------------------------------------------
The gpu library includes binaries to check for available GPUs and their
properties. It is a good idea to run this on first use to make sure the
system and build is setup properly. Additionally, the GPU numbering for
specific selection of devices should be taken from this output. The GPU
library may split some accelerators into separate virtual accelerators for
efficient use with MPI.
After building the GPU library, for OpenCL:
./ocl_get_devices
and for CUDA
./nvc_get_devices
------------------------------------------------------------------------------
QUICK START
------------------------------------------------------------------------------
OpenCL: Mac without MPI:
make -f Makefile.mac_opencl -j; cd ../../src/; make mpi-stubs
make g++_serial -j
./lmp_g++_serial -in ../bench/in.lj -log none -sf gpu
OpenCL: Mac with MPI:
make -f Makefile.mac_opencl_mpi -j; cd ../../src/; make g++_openmpi -j
mpirun -np $NUM_MPI ./lmp_g++_openmpi -in ../bench/in.lj -log none -sf gpu
OpenCL: Linux with Intel oneAPI:
make -f Makefile.oneapi -j; cd ../../src; make oneapi -j
export OMP_NUM_THREADS=$NUM_THREADS
mpirun -np $NUM_MPI ./lmp_oneapi -in ../bench/in.lj -log none -sf gpu
OpenCL: Linux with MPI:
make -f Makefile.linux_opencl -j; cd ../../src; make omp -j
export OMP_NUM_THREADS=$NUM_THREADS
mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu
NVIDIA CUDA:
make -f Makefile.cuda_mps -j; cd ../../src; make omp -j
export CUDA_MPS_LOG_DIRECTORY=/tmp; export CUDA_MPS_PIPE_DIRECTORY=/tmp
nvidia-smi -i 0 -c EXCLUSIVE_PROCESS
export OMP_NUM_THREADS=$NUM_THREADS
mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu
echo quit | /usr/bin/nvidia-cuda-mps-control
AMD HIP:
make -f Makefile.hip -j; cd ../../src; make omp -j
export OMP_NUM_THREADS=$NUM_THREADS
mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu
------------------------------------------------------------------------------
Installing oneAPI, OpenCL, CUDA, or ROCm
------------------------------------------------------------------------------
The easiest approach is to use the Linux package manager to perform the
installation from Intel, NVIDIA, etc. repositories. All are available for
free. The oneAPI installation includes Intel optimized MPI and C++ compilers,
along with many libraries. Alternatively, Intel OpenCL can also be installed
separately from the Intel repository.
NOTE: Installation of the CUDA SDK is not required, only the CUDA toolkit.
See:
https://software.intel.com/content/www/us/en/develop/tools/oneapi/hpc-toolkit.html
https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
https://github.com/RadeonOpenCompute/ROCm
------------------------------------------------------------------------------
Build Intro
------------------------------------------------------------------------------
You can type "make lib-gpu" from the src directory to see help on how You can type "make lib-gpu" from the src directory to see help on how
to build this library via make commands, or you can do the same thing to build this library via make commands, or you can do the same thing
@ -25,7 +114,7 @@ do it manually by following the instructions below.
Build the library using one of the provided Makefile.* files or create Build the library using one of the provided Makefile.* files or create
your own, specific to your compiler and system. For example: your own, specific to your compiler and system. For example:
make -f Makefile.linux make -f Makefile.linux_opencl
When you are done building this library, two files should When you are done building this library, two files should
exist in this directory: exist in this directory:
@ -45,33 +134,132 @@ IMPORTANT: If you re-build the library, e.g. for a different precision
Makefile.linux clean, to insure all previous derived files are removed Makefile.linux clean, to insure all previous derived files are removed
before the new build is done. before the new build is done.
Makefile.lammps has settings for 3 variables: NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG,
or LAMMPS_SMALLSMALL if specified when building LAMMPS (i.e. in
src/MAKE/Makefile.foo) should be consistent with that specified
when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar).
user-gpu_SYSINC = leave blank for this package
user-gpu_SYSLIB = CUDA libraries needed by this package
user-gpu_SYSPATH = path(s) to where those libraries are
Because you have the CUDA compilers on your system, you should have ------------------------------------------------------------------------------
the needed libraries. If the CUDA development tools were installed PRECISION MODES
in the standard manner, the settings in the Makefile.lammps.standard ------------------------------------------------------------------------------
file should work. The GPU library supports 3 precision modes: single, double, and mixed, with
the latter being the default for most Makefiles aside from Mac specific
Makefiles due to the more restrictive nature of the Apple OpenCL for some
devices.
------------------------------------------------------------------- To specify the precision mode (output to the screen before LAMMPS runs for
verification), set either CUDA_PRECISION, OCL_PREC, or HIP_PRECISION to one
of -D_SINGLE_SINGLE, -D_DOUBLE_DOUBLE, or -D_SINGLE_DOUBLE.
GENERAL NOTES Some accelerators or OpenCL implementations only support single precision.
-------------------------------- This mode should be used with care and appropriate validation as the errors
can scale with system size in this implementation. This can be useful for
accelerating test runs when setting up a simulation for production runs on
another machine. In the case where only single precision is supported, either
LAMMPS must be compiled with -DFFT_SINGLE to use PPPM with GPU acceleration
or GPU acceleration should be disabled for PPPM (e.g. suffix off or pair/only
as described in the LAMMPS documentation).
This library, libgpu.a, provides routines for GPU acceleration
of certain LAMMPS styles and neighbor list builds. Compilation of this
library requires installing the CUDA GPU driver and CUDA toolkit for
your operating system. Installation of the CUDA SDK is not necessary.
In addition to the LAMMPS library, the binary nvc_get_devices will also
be built. This can be used to query the names and properties of GPU
devices on your system. A Makefile for OpenCL and ROCm HIP compilation
is provided, but support for it is not currently provided by the developers.
Details of the implementation are provided in:
---- ------------------------------------------------------------------------------
CUDA BUILD NOTES
------------------------------------------------------------------------------
NOTE: when compiling with CMake, all of the considerations listed below
are considered within the CMake configuration process, so no separate
compilation of the gpu library is required. Also this will build in support
for all compute architectures that are supported by the CUDA toolkit version
used to build the gpu library.
If you do not want to use a fat binary, that supports multiple CUDA
architectures, the CUDA_ARCH must be set to match the GPU architecture. This
is reported by nvc_get_devices executable created by the build process and
a detailed list of GPU architectures and CUDA compatible GPUs can be found
e.g. here: https://en.wikipedia.org/wiki/CUDA#GPUs_supported
The CUDA_HOME variable should be set to the location of the CUDA toolkit.
To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME variables in one of
the Makefiles. CUDA_ARCH should be set based on the compute capability of
your GPU. This can be verified by running the nvc_get_devices executable after
the build is complete. Additionally, the GPU package must be installed and
compiled for LAMMPS. This may require editing the gpu_SYSPATH variable in the
LAMMPS makefile.
Please note that the GPU library accesses the CUDA driver library directly,
so it needs to be linked with the CUDA driver library (libcuda.so) that ships
with the Nvidia driver. If you are compiling LAMMPS on the head node of a GPU
cluster, this library may not be installed, so you may need to copy it over
from one of the compute nodes (best into this directory). Recent CUDA toolkits
starting from CUDA 9 provide a dummy libcuda.so library (typically under
$(CUDA_HOME)/lib64/stubs), that can be used for linking.
Best performance with the GPU library is typically with multiple MPI processes
sharing the same GPU cards. For NVIDIA, this is most efficient with CUDA
MPS enabled. To prevent runtime errors for GPUs configured in exclusive process
mode with MPS, the GPU library should be built with either of the equivalent
-DCUDA_MPS_SUPPORT or -DCUDA_PROXY flags.
------------------------------------------------------------------------------
HIP BUILD NOTES
------------------------------------------------------------------------------
1. GPU sorting requires installing hipcub
(https://github.com/ROCmSoftwarePlatform/hipCUB). The HIP CUDA-backend
additionally requires cub (https://nvlabs.github.io/cub). Download and
extract the cub directory to lammps/lib/gpu/ or specify an appropriate
path in lammps/lib/gpu/Makefile.hip.
2. In Makefile.hip it is possible to specify the target platform via
export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc as well as the target
architecture (gfx803, gfx900, gfx906 etc.)
3. If your MPI implementation does not support `mpicxx --showme` command,
it is required to specify the corresponding MPI compiler and linker flags
in lammps/lib/gpu/Makefile.hip and in lammps/src/MAKE/OPTIONS/Makefile.hip.
------------------------------------------------------------------------------
OPENCL BUILD NOTES
------------------------------------------------------------------------------
If GERYON_NUMA_FISSION is defined at build time, LAMMPS will consider separate
NUMA nodes on GPUs or accelerators as separate devices. For example, a 2-socket
CPU would appear as two separate devices for OpenCL (and LAMMPS would require
two MPI processes to use both sockets with the GPU library - each with its
own device ID as output by ocl_get_devices).
For a debug build, use "-DUCL_DEBUG -DGERYON_KERNEL_DUMP" and remove
"-DUCL_NO_EXIT" and "-DMPI_GERYON" from the build options.
------------------------------------------------------------------------------
ALL PREPROCESSOR OPTIONS (For Advanced Users)
------------------------------------------------------------------------------
_SINGLE_SINGLE Build library for single precision mode
_SINGLE_DOUBLE Build library for mixed precision mode
_DOUBLE_DOUBLE Build library for double precision mode
CUDA_MPS_SUPPORT Do not generate errors for exclusive mode for CUDA
CUDA_PROXY Same as above
MPI_GERYON Library should use MPI_Abort for unhandled errors
GERYON_NUMA_FISSION Accelerators with main memory NUMA are split into
multiple virtual accelerators for each NUMA node
LAL_USE_OMP=0 Disable OpenMP in lib, regardless of compiler setting
LAL_USE_OMP_SIMD=0 Disable OpenMP SIMD in lib, regardless of compiler setting
GERYON_OCL_FLUSH For OpenCL, flush queue after every enqueue
LAL_NO_OCL_EV_JIT Turn off JIT specialization for kernels in OpenCL
LAL_USE_OLD_NEIGHBOR Use old neighbor list algorithm
USE_CUDPP Enable GPU binning in neighbor builds (not recommended)
USE_HIP_DEVICE_SORT Enable GPU binning for HIP builds
(only w/ LAL_USE_OLD_NEIGHBOR)
LAL_NO_BLOCK_REDUCE Use host for energy/virial accumulation
LAL_OCL_EXTRA_ARGS Supply extra args for OpenCL compiler delimited with :
UCL_NO_EXIT LAMMPS should handle errors instead of Geryon lib
UCL_DEBUG Debug build for Geryon
GERYON_KERNEL_DUMP Dump all compiled OpenCL programs with compiler
flags and build logs
GPU_CAST Casting performed on GPU, untested recently
THREE_CONCURRENT Concurrent 3-body calcs in separate queues, untested
------------------------------------------------------------------------------
References for Details
------------------------------------------------------------------------------
Brown, W.M., Wang, P. Plimpton, S.J., Tharrington, A.N. Implementing Brown, W.M., Wang, P. Plimpton, S.J., Tharrington, A.N. Implementing
Molecular Dynamics on Hybrid High Performance Computers - Short Range Molecular Dynamics on Hybrid High Performance Computers - Short Range
@ -89,116 +277,3 @@ Brown, W.M., Masako, Y. Implementing Molecular Dynamics on Hybrid High
Performance Computers - Three-Body Potentials. Computer Physics Communications. Performance Computers - Three-Body Potentials. Computer Physics Communications.
2013. 184: p. 2785-2793. 2013. 184: p. 2785-2793.
----
NOTE: Installation of the CUDA SDK is not required, only the CUDA
toolkit itself or an OpenCL 1.2 compatible header and library.
Pair styles supporting GPU acceleration with this library
are marked in the list of Pair style potentials with a "g".
See the online version at: https://lammps.sandia.gov/doc/Commands_pair.html
In addition the (plain) pppm kspace style is supported as well.
MULTIPLE LAMMPS PROCESSES
--------------------------------
Multiple LAMMPS MPI processes can share GPUs on the system, but multiple
GPUs cannot be utilized by a single MPI process. In many cases, the
best performance will be obtained by running as many MPI processes as
CPU cores available with the condition that the number of MPI processes
is an integer multiple of the number of GPUs being used. See the
LAMMPS user manual for details on running with GPU acceleration.
BUILDING AND PRECISION MODES
--------------------------------
To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME variables in one of
the Makefiles. CUDA_ARCH should be set based on the compute capability of
your GPU. This can be verified by running the nvc_get_devices executable after
the build is complete. Additionally, the GPU package must be installed and
compiled for LAMMPS. This may require editing the gpu_SYSPATH variable in the
LAMMPS makefile.
Please note that the GPU library accesses the CUDA driver library directly,
so it needs to be linked not only to the CUDA runtime library (libcudart.so)
that ships with the CUDA toolkit, but also with the CUDA driver library
(libcuda.so) that ships with the Nvidia driver. If you are compiling LAMMPS
on the head node of a GPU cluster, this library may not be installed,
so you may need to copy it over from one of the compute nodes (best into
this directory). Recent CUDA toolkits starting from CUDA 9 provide a dummy
libcuda.so library (typically under $(CUDA_HOME)/lib64/stubs), that can be used for
linking.
The gpu library supports 3 precision modes as determined by
the CUDA_PRECISION variable:
CUDA_PRECISION = -D_SINGLE_SINGLE # Single precision for all calculations
CUDA_PRECISION = -D_DOUBLE_DOUBLE # Double precision for all calculations
CUDA_PRECISION = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double
As of CUDA 7.5 only GPUs with compute capability 2.0 (Fermi) or newer are
supported and as of CUDA 9.0 only compute capability 3.0 (Kepler) or newer
are supported. There are some limitations of this library for GPUs older
than that, which require additional preprocessor flag, and limit features,
but they are kept for historical reasons. There is no value in trying to
use those GPUs for production calculations.
You have to make sure that you set a CUDA_ARCH line suitable for your
hardware and CUDA toolkit version: e.g. -arch=sm_35 for Tesla K20 or K40
or -arch=sm_52 GeForce GTX Titan X. A detailed list of GPU architectures
and CUDA compatible GPUs can be found e.g. here:
https://en.wikipedia.org/wiki/CUDA#GPUs_supported
NOTE: when compiling with CMake, all of the considerations listed below
are considered within the CMake configuration process, so no separate
compilation of the gpu library is required. Also this will build in support
for all compute architectures that are supported by the CUDA toolkit version
used to build the gpu library.
Please note the CUDA_CODE settings in Makefile.linux_multi, which allows
to compile this library with support for multiple GPUs. This list can be
extended for newer GPUs with newer CUDA toolkits and should allow to build
a single GPU library compatible with all GPUs that are worth using for
GPU acceleration and supported by the current CUDA toolkits and drivers.
NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG,
or LAMMPS_SMALLSMALL if specified when building LAMMPS (i.e. in
src/MAKE/Makefile.foo) should be consistent with that specified
when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar).
BUILDING FOR HIP FRAMEWORK
--------------------------------
1. Install the latest ROCm framework (https://github.com/RadeonOpenCompute/ROCm).
2. GPU sorting requires installing hipcub
(https://github.com/ROCmSoftwarePlatform/hipCUB). The HIP CUDA-backend
additionally requires cub (https://nvlabs.github.io/cub). Download and
extract the cub directory to lammps/lib/gpu/ or specify an appropriate
path in lammps/lib/gpu/Makefile.hip.
3. In Makefile.hip it is possible to specify the target platform via
export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc as well as the target
architecture (gfx803, gfx900, gfx906 etc.)
4. If your MPI implementation does not support the `mpicxx --showme` command,
you must specify the corresponding MPI compiler and linker flags
in lammps/lib/gpu/Makefile.hip and in lammps/src/MAKE/OPTIONS/Makefile.hip.
5. Building the GPU library (libgpu.a):
cd lammps/lib/gpu; make -f Makefile.hip -j
6. Building the LAMMPS executable (lmp_hip):
cd ../../src; make hip -j
EXAMPLE CONVENTIONAL BUILD PROCESS
--------------------------------
cd ~/lammps/lib/gpu
emacs Makefile.linux
make -f Makefile.linux
./nvc_get_devices
cd ../../src
emacs ./MAKE/Makefile.linux
make yes-asphere
make yes-kspace
make yes-gpu
make linux

View File

@ -24,6 +24,8 @@ namespace ucl_hip {
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
typedef hipStream_t command_queue; typedef hipStream_t command_queue;
inline void ucl_flush(command_queue &cq) {}
inline void ucl_sync(hipStream_t &stream) { inline void ucl_sync(hipStream_t &stream) {
CU_SAFE_CALL(hipStreamSynchronize(stream)); CU_SAFE_CALL(hipStreamSynchronize(stream));
} }
@ -143,15 +145,26 @@ class UCL_Device {
inline std::string device_type_name(const int i) { return "GPU"; } inline std::string device_type_name(const int i) { return "GPU"; }
/// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type() { return device_type(_device); } inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); }
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i) { return UCL_GPU; } inline enum UCL_DEVICE_TYPE device_type(const int i) { return UCL_GPU; }
/// Returns true if host memory is efficiently addressable from device /// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); } inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device /// Returns true if host memory is efficiently addressable from device
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
/// Returns preferred vector width
inline int preferred_fp32_width() { return preferred_fp32_width(_device); }
/// Returns preferred vector width
inline int preferred_fp32_width(const int i)
{return _properties[i].SIMDWidth;}
/// Returns preferred vector width
inline int preferred_fp64_width() { return preferred_fp64_width(_device); }
/// Returns preferred vector width
inline int preferred_fp64_width(const int i)
{return _properties[i].SIMDWidth;}
/// Returns true if double precision is support for the current device /// Returns true if double precision is support for the current device
inline bool double_precision() { return double_precision(_device); } inline bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is support for the device /// Returns true if double precision is support for the device
@ -215,6 +228,18 @@ class UCL_Device {
/// Get the maximum number of threads per block /// Get the maximum number of threads per block
inline size_t group_size(const int i) inline size_t group_size(const int i)
{ return _properties[i].maxThreadsPerBlock; } { return _properties[i].maxThreadsPerBlock; }
/// Get the maximum number of threads per block in dimension 'dim'
inline size_t group_size_dim(const int dim)
{ return group_size_dim(_device, dim); }
/// Get the maximum number of threads per block in dimension 'dim'
inline size_t group_size_dim(const int i, const int dim)
{ return _properties[i].maxThreadsDim[dim];}
/// Get the shared local memory size in bytes
inline size_t slm_size() { return slm_size(_device); }
/// Get the shared local memory size in bytes
inline size_t slm_size(const int i)
{ return _properties[i].sharedMemPerBlock; }
/// Return the maximum memory pitch in bytes for current device /// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); } inline size_t max_pitch() { return max_pitch(_device); }
@ -255,11 +280,20 @@ class UCL_Device {
inline int max_sub_devices(const int i) inline int max_sub_devices(const int i)
{ return 0; } { return 0; }
/// True if the device supports shuffle intrinsics
inline bool has_shuffle_support()
{ return has_shuffle_support(_device); }
/// True if the device supports shuffle intrinsics
inline bool has_shuffle_support(const int i)
{ return arch(i)>=3.0; }
/// List all devices along with all properties /// List all devices along with all properties
inline void print_all(std::ostream &out); inline void print_all(std::ostream &out);
/// Select the platform that has accelerators (for compatibility with OpenCL) /// For compatibility with OCL API
inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; } inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU,
const std::string vendor="")
{ return set_platform(0); }
inline int load_module(const void* program, hipModule_t& module, std::string *log=nullptr){ inline int load_module(const void* program, hipModule_t& module, std::string *log=nullptr){
auto it = _loaded_modules.emplace(program, hipModule_t()); auto it = _loaded_modules.emplace(program, hipModule_t());

View File

@ -14,6 +14,7 @@
#include <fstream> #include <fstream>
#include <string> #include <string>
#include <iostream> #include <iostream>
#include <cstdio>
namespace ucl_hip { namespace ucl_hip {
@ -64,7 +65,7 @@ class UCL_Program {
} }
/// Load a program from a string and compile with flags /// Load a program from a string and compile with flags
inline int load_string(const void *program, const char *flags="", std::string *log=nullptr) { inline int load_string(const void *program, const char *flags="", std::string *log=nullptr, FILE* foutput=nullptr) {
return _device_ptr->load_module(program, _module, log); return _device_ptr->load_module(program, _module, log);
} }
@ -73,6 +74,7 @@ class UCL_Program {
hipModule_t _module; hipModule_t _module;
hipStream_t _cq; hipStream_t _cq;
friend class UCL_Texture; friend class UCL_Texture;
friend class UCL_Const;
}; };
/// Class for dealing with CUDA Driver kernels /// Class for dealing with CUDA Driver kernels

View File

@ -107,6 +107,37 @@ class UCL_Texture {
} }
}; };
/// Class storing a const global memory reference
class UCL_Const {
public:
UCL_Const() {}
~UCL_Const() {}
/// Construct with a specified global reference
inline UCL_Const(UCL_Program &prog, const char *global_name)
{ get_global(prog,global_name); }
/// Set the global reference for this object
inline void get_global(UCL_Program &prog, const char *global_name) {
_cq=prog.cq();
CU_SAFE_CALL(hipModuleGetGlobal(&_global, &_global_bytes, prog._module,
global_name));
}
/// Copy from array on host to const memory
template <class numtyp>
inline void update_device(UCL_H_Vec<numtyp> &src, const int numel) {
CU_SAFE_CALL(hipMemcpyHtoDAsync(_global, src.begin(), numel*sizeof(numtyp),
_cq));
}
/// Get device ptr associated with object
inline const void* begin() const { return &_global; }
inline void clear() {}
private:
hipStream_t _cq;
void* _global;
size_t _global_bytes;
friend class UCL_Kernel;
};
} // namespace } // namespace
#endif #endif

View File

@ -37,6 +37,8 @@ namespace ucl_cudadr {
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
typedef CUstream command_queue; typedef CUstream command_queue;
inline void ucl_flush(command_queue &cq) {}
inline void ucl_sync(CUstream &stream) { inline void ucl_sync(CUstream &stream) {
CU_SAFE_CALL(cuStreamSynchronize(stream)); CU_SAFE_CALL(cuStreamSynchronize(stream));
} }
@ -156,15 +158,26 @@ class UCL_Device {
inline std::string device_type_name(const int i) { return "GPU"; } inline std::string device_type_name(const int i) { return "GPU"; }
/// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type() { return device_type(_device); } inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); }
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i) { return UCL_GPU; } inline enum UCL_DEVICE_TYPE device_type(const int i) { return UCL_GPU; }
/// Returns true if host memory is efficiently addressable from device /// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); } inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device /// Returns true if host memory is efficiently addressable from device
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
/// Returns preferred vector width
inline int preferred_fp32_width() { return preferred_fp32_width(_device); }
/// Returns preferred vector width
inline int preferred_fp32_width(const int i)
{return _properties[i].SIMDWidth;}
/// Returns preferred vector width
inline int preferred_fp64_width() { return preferred_fp64_width(_device); }
/// Returns preferred vector width
inline int preferred_fp64_width(const int i)
{return _properties[i].SIMDWidth;}
/// Returns true if double precision is support for the current device /// Returns true if double precision is support for the current device
inline bool double_precision() { return double_precision(_device); } inline bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is support for the device /// Returns true if double precision is support for the device
@ -228,6 +241,18 @@ class UCL_Device {
/// Get the maximum number of threads per block /// Get the maximum number of threads per block
inline size_t group_size(const int i) inline size_t group_size(const int i)
{ return _properties[i].maxThreadsPerBlock; } { return _properties[i].maxThreadsPerBlock; }
/// Get the maximum number of threads per block in dimension 'dim'
inline size_t group_size_dim(const int dim)
{ return group_size_dim(_device, dim); }
/// Get the maximum number of threads per block in dimension 'dim'
inline size_t group_size_dim(const int i, const int dim)
{ return _properties[i].maxThreadsDim[dim]; }
/// Get the shared local memory size in bytes
inline size_t slm_size() { return slm_size(_device); }
/// Get the shared local memory size in bytes
inline size_t slm_size(const int i)
{ return _properties[i].sharedMemPerBlock; }
/// Return the maximum memory pitch in bytes for current device /// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); } inline size_t max_pitch() { return max_pitch(_device); }
@ -268,11 +293,22 @@ class UCL_Device {
inline int max_sub_devices(const int i) inline int max_sub_devices(const int i)
{ return 0; } { return 0; }
/// True if the device supports shuffle intrinsics
inline bool has_shuffle_support()
{ return has_shuffle_support(_device); }
/// True if the device supports shuffle intrinsics
inline bool has_shuffle_support(const int i)
{ return arch(i)>=3.0; }
/// List all devices along with all properties /// List all devices along with all properties
inline void print_all(std::ostream &out); inline void print_all(std::ostream &out);
/// Select the platform that has accelerators (for compatibility with OpenCL) /// For compatibility with OCL API
inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; } inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU,
const std::string vendor="",
const int ndevices=-1,
const int first_device=-1)
{ return set_platform(0); }
private: private:
int _device, _num_devices; int _device, _num_devices;

View File

@ -26,6 +26,7 @@
#include "nvd_device.h" #include "nvd_device.h"
#include <fstream> #include <fstream>
#include <cstdio>
namespace ucl_cudadr { namespace ucl_cudadr {
@ -77,7 +78,7 @@ class UCL_Program {
/// Load a program from a string and compile with flags /// Load a program from a string and compile with flags
inline int load_string(const void *program, const char *flags="", inline int load_string(const void *program, const char *flags="",
std::string *log=nullptr) { std::string *log=nullptr, FILE* foutput=nullptr) {
if (std::string(flags)=="BINARY") if (std::string(flags)=="BINARY")
return load_binary((const char *)program); return load_binary((const char *)program);
const unsigned int num_opts=2; const unsigned int num_opts=2;
@ -100,12 +101,25 @@ class UCL_Program {
if (err != CUDA_SUCCESS) { if (err != CUDA_SUCCESS) {
#ifndef UCL_NO_EXIT #ifndef UCL_NO_EXIT
std::cerr << std::endl std::cerr << std::endl << std::endl
<< "----------------------------------------------------------\n" << "----------------------------------------------------------\n"
<< " UCL Error: Error compiling PTX Program...\n" << " UCL Error: Error compiling PTX Program...\n"
<< "----------------------------------------------------------\n"; << "----------------------------------------------------------\n";
std::cerr << log << std::endl; std::cerr << log << std::endl
<< "----------------------------------------------------------\n\n";
#endif #endif
if (foutput != NULL) {
fprintf(foutput,"\n\n");
fprintf(foutput,
"----------------------------------------------------------\n");
fprintf(foutput," UCL Error: Error compiling PTX Program...\n");
fprintf(foutput,
"----------------------------------------------------------\n");
fprintf(foutput,"%s\n",log);
fprintf(foutput,
"----------------------------------------------------------\n");
fprintf(foutput,"\n\n");
}
return UCL_COMPILE_ERROR; return UCL_COMPILE_ERROR;
} }
@ -139,11 +153,15 @@ class UCL_Program {
return UCL_SUCCESS; return UCL_SUCCESS;
} }
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return _cq; }
friend class UCL_Kernel; friend class UCL_Kernel;
private: private:
CUmodule _module; CUmodule _module;
CUstream _cq; CUstream _cq;
friend class UCL_Texture; friend class UCL_Texture;
friend class UCL_Const;
}; };
/// Class for dealing with CUDA Driver kernels /// Class for dealing with CUDA Driver kernels

View File

@ -38,8 +38,11 @@ class UCL_Texture {
inline UCL_Texture(UCL_Program &prog, const char *texture_name) inline UCL_Texture(UCL_Program &prog, const char *texture_name)
{ get_texture(prog,texture_name); } { get_texture(prog,texture_name); }
/// Set the texture reference for this object /// Set the texture reference for this object
inline void get_texture(UCL_Program &prog, const char *texture_name) inline void get_texture(UCL_Program &prog, const char *texture_name) {
{ CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); } #if (CUDA_VERSION < 11000)
CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name));
#endif
}
/// Bind a float array where each fetch grabs a vector of length numel /// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp> template<class numtyp>
@ -72,11 +75,14 @@ class UCL_Texture {
} }
private: private:
#if (CUDA_VERSION < 11000)
CUtexref _tex; CUtexref _tex;
#endif
friend class UCL_Kernel; friend class UCL_Kernel;
template<class mat_typ> template<class mat_typ>
inline void _bind_float(mat_typ &vec, const unsigned numel) { inline void _bind_float(mat_typ &vec, const unsigned numel) {
#if (CUDA_VERSION < 11000)
#ifdef UCL_DEBUG #ifdef UCL_DEBUG
assert(numel!=0 && numel<5); assert(numel!=0 && numel<5);
#endif #endif
@ -90,10 +96,42 @@ class UCL_Texture {
else else
CU_SAFE_CALL(cuTexRefSetFormat(_tex,CU_AD_FORMAT_SIGNED_INT32,numel*2)); CU_SAFE_CALL(cuTexRefSetFormat(_tex,CU_AD_FORMAT_SIGNED_INT32,numel*2));
} }
#endif
} }
}; };
/// Class storing a const global memory reference
class UCL_Const {
public:
UCL_Const() {}
~UCL_Const() {}
/// Construct with a specified global reference
inline UCL_Const(UCL_Program &prog, const char *global_name)
{ get_global(prog,global_name); }
/// Set the global reference for this object
inline void get_global(UCL_Program &prog, const char *global_name) {
_cq=prog.cq();
CU_SAFE_CALL(cuModuleGetGlobal(&_global, &_global_bytes, prog._module,
global_name));
}
/// Copy from array on host to const memory
template <class numtyp>
inline void update_device(UCL_H_Vec<numtyp> &src, const int numel) {
CU_SAFE_CALL(cuMemcpyHtoDAsync(_global, src.begin(), numel*sizeof(numtyp),
_cq));
}
/// Get device ptr associated with object
inline const CUdeviceptr * begin() const { return &_global; }
inline void clear() {}
private:
CUstream _cq;
CUdeviceptr _global;
size_t _global_bytes;
friend class UCL_Kernel;
};
} // namespace } // namespace
#endif #endif

View File

@ -28,14 +28,6 @@
#include <vector> #include <vector>
#include <iostream> #include <iostream>
/* We default to OpenCL 1.2 as target version for now as
* there are known issues with OpenCL 2.0 and later.
* This is also to silence warnings from generic OpenCL headers */
#if !defined(CL_TARGET_OPENCL_VERSION)
#define CL_TARGET_OPENCL_VERSION 120
#endif
#ifdef __APPLE__ #ifdef __APPLE__
#include <OpenCL/cl.h> #include <OpenCL/cl.h>
#include <OpenCL/cl_platform.h> #include <OpenCL/cl_platform.h>
@ -55,17 +47,36 @@ namespace ucl_opencl {
typedef cl_command_queue command_queue; typedef cl_command_queue command_queue;
typedef cl_context context_type; typedef cl_context context_type;
inline void ucl_flush(command_queue &cq) { CL_SAFE_CALL(clFlush(cq)); }
inline void ucl_sync(cl_command_queue &cq) { inline void ucl_sync(cl_command_queue &cq) {
CL_SAFE_CALL(clFinish(cq)); CL_SAFE_CALL(clFinish(cq));
} }
inline bool _shared_mem_device(cl_device_type &device_type) { #if defined(GERYON_FORCE_SHARED_MAIN_MEM_ON)
inline bool _shared_mem_device(cl_device_id &device) { return true; }
#elif defined(GERYON_FORCE_SHARED_MAIN_MEM_OFF)
inline bool _shared_mem_device(cl_device_id &device) { return false; }
#else
inline bool _shared_mem_device(cl_device_id &device) {
#ifdef CL_VERSION_1_2
cl_bool br;
CL_SAFE_CALL(clGetDeviceInfo(device, CL_DEVICE_HOST_UNIFIED_MEMORY,
sizeof(cl_bool), &br,NULL));
return (br == CL_TRUE);
#else
cl_device_type device_type;
CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE,
sizeof(device_type),&device_type,NULL));
return (device_type==CL_DEVICE_TYPE_CPU); return (device_type==CL_DEVICE_TYPE_CPU);
#endif
} }
#endif
struct OCLProperties { struct OCLProperties {
std::string name; std::string name;
cl_device_type device_type; cl_device_type device_type;
bool is_subdevice;
cl_ulong global_mem; cl_ulong global_mem;
cl_ulong shared_mem; cl_ulong shared_mem;
cl_ulong const_mem; cl_ulong const_mem;
@ -74,12 +85,16 @@ struct OCLProperties {
size_t work_group_size; size_t work_group_size;
size_t work_item_size[3]; size_t work_item_size[3];
bool double_precision; bool double_precision;
int preferred_vector_width32, preferred_vector_width64;
int alignment; int alignment;
size_t timer_resolution; size_t timer_resolution;
bool ecc_support; bool ecc_support;
std::string c_version; std::string c_version;
bool partition_equal, partition_counts, partition_affinity; bool partition_equal, partition_counts, partition_affinity;
cl_uint max_sub_devices; cl_uint max_sub_devices;
int cl_device_version;
bool has_subgroup_support;
bool has_shuffle_support;
}; };
/// Class for looking at data parallel device properties /// Class for looking at data parallel device properties
@ -182,15 +197,26 @@ class UCL_Device {
inline std::string device_type_name(const int i); inline std::string device_type_name(const int i);
/// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type() { return device_type(_device); } inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); }
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i); inline enum UCL_DEVICE_TYPE device_type(const int i);
/// Returns true if host memory is efficiently addressable from device /// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); } inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device /// Returns true if host memory is efficiently addressable from device
inline bool shared_memory(const int i) inline bool shared_memory(const int i)
{ return _shared_mem_device(_properties[i].device_type); } { return _shared_mem_device(_cl_devices[i]); }
/// Returns preferred vector width
inline int preferred_fp32_width() { return preferred_fp32_width(_device); }
/// Returns preferred vector width
inline int preferred_fp32_width(const int i)
{return _properties[i].preferred_vector_width32;}
/// Returns preferred vector width
inline int preferred_fp64_width() { return preferred_fp64_width(_device); }
/// Returns preferred vector width
inline int preferred_fp64_width(const int i)
{return _properties[i].preferred_vector_width64;}
/// Returns true if double precision is support for the current device /// Returns true if double precision is support for the current device
inline bool double_precision() { return double_precision(_device); } inline bool double_precision() { return double_precision(_device); }
@ -242,6 +268,18 @@ class UCL_Device {
/// Get the maximum number of threads per block /// Get the maximum number of threads per block
inline size_t group_size(const int i) inline size_t group_size(const int i)
{ return _properties[i].work_group_size; } { return _properties[i].work_group_size; }
/// Get the maximum number of threads per block in dimension 'dim'
inline size_t group_size_dim(const int dim)
{ return group_size_dim(_device, dim); }
/// Get the maximum number of threads per block in dimension 'dim'
inline size_t group_size_dim(const int i, const int dim)
{ return _properties[i].work_item_size[dim]; }
/// Get the shared local memory size in bytes
inline size_t slm_size() { return slm_size(_device); }
/// Get the shared local memory size in bytes
inline size_t slm_size(const int i)
{ return _properties[i].shared_mem; }
/// Return the maximum memory pitch in bytes for current device /// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); } inline size_t max_pitch() { return max_pitch(_device); }
@ -256,6 +294,12 @@ class UCL_Device {
inline bool sharing_supported(const int i) inline bool sharing_supported(const int i)
{ return true; } { return true; }
/// True if the device is a sub-device
inline bool is_subdevice()
{ return is_subdevice(_device); }
/// True if the device is a sub-device
inline bool is_subdevice(const int i)
{ return _properties[i].is_subdevice; }
/// True if splitting device into equal subdevices supported /// True if splitting device into equal subdevices supported
inline bool fission_equal() inline bool fission_equal()
{ return fission_equal(_device); } { return fission_equal(_device); }
@ -274,6 +318,18 @@ class UCL_Device {
/// True if splitting device into subdevices by affinity domains supported /// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity(const int i) inline bool fission_by_affinity(const int i)
{ return _properties[i].partition_affinity; } { return _properties[i].partition_affinity; }
/// True if the device has subgroup support
inline bool has_subgroup_support()
{ return has_subgroup_support(_device); }
/// True if the device has subgroup support
inline bool has_subgroup_support(const int i)
{ return _properties[i].has_subgroup_support; }
/// True if the device supports shuffle intrinsics
inline bool has_shuffle_support()
{ return has_shuffle_support(_device); }
/// True if the device supports shuffle intrinsics
inline bool has_shuffle_support(const int i)
{ return _properties[i].has_shuffle_support; }
/// Maximum number of subdevices allowed from device fission /// Maximum number of subdevices allowed from device fission
inline int max_sub_devices() inline int max_sub_devices()
@ -281,6 +337,12 @@ class UCL_Device {
/// Maximum number of subdevices allowed from device fission /// Maximum number of subdevices allowed from device fission
inline int max_sub_devices(const int i) inline int max_sub_devices(const int i)
{ return _properties[i].max_sub_devices; } { return _properties[i].max_sub_devices; }
/// OpenCL version supported by the device
inline int cl_device_version()
{ return cl_device_version(_device); }
/// OpenCL version supported by the device
inline int cl_device_version(const int i)
{ return _properties[i].cl_device_version; }
/// List all devices along with all properties /// List all devices along with all properties
inline void print_all(std::ostream &out); inline void print_all(std::ostream &out);
@ -288,8 +350,14 @@ class UCL_Device {
/// Return the OpenCL type for the device /// Return the OpenCL type for the device
inline cl_device_id & cl_device() { return _cl_device; } inline cl_device_id & cl_device() { return _cl_device; }
/// Select the platform that has accelerators /// Automatically set the platform by type, vendor, and/or CU count
inline int set_platform_accelerator(int pid=-1); /** If first_device is positive, search restricted to platforms containing
* this device IDs. If ndevices is positive, search is restricted
* to platforms with at least that many devices **/
inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU,
const std::string vendor="",
const int ndevices=-1,
const int first_device=-1);
private: private:
int _num_platforms; // Number of platforms int _num_platforms; // Number of platforms
@ -322,8 +390,7 @@ UCL_Device::UCL_Device() {
return; return;
} else } else
_num_platforms=static_cast<int>(nplatforms); _num_platforms=static_cast<int>(nplatforms);
// note that platform 0 may not necessarily be associated with accelerators set_platform(0);
set_platform_accelerator();
} }
UCL_Device::~UCL_Device() { UCL_Device::~UCL_Device() {
@ -332,6 +399,14 @@ UCL_Device::~UCL_Device() {
void UCL_Device::clear() { void UCL_Device::clear() {
_properties.clear(); _properties.clear();
#ifdef GERYON_NUMA_FISSION
#ifdef CL_VERSION_1_2
for (int i=0; i<_cl_devices.size(); i++)
CL_DESTRUCT_CALL(clReleaseDevice(_cl_devices[i]));
#endif
#endif
_cl_devices.clear(); _cl_devices.clear();
if (_device>-1) { if (_device>-1) {
for (size_t i=0; i<_cq.size(); i++) { for (size_t i=0; i<_cq.size(); i++) {
@ -341,6 +416,7 @@ void UCL_Device::clear() {
CL_DESTRUCT_CALL(clReleaseContext(_context)); CL_DESTRUCT_CALL(clReleaseContext(_context));
} }
_device=-1; _device=-1;
_num_devices=0;
} }
int UCL_Device::set_platform(int pid) { int UCL_Device::set_platform(int pid) {
@ -370,11 +446,51 @@ int UCL_Device::set_platform(int pid) {
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list, CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list,
&n)); &n));
#ifndef GERYON_NUMA_FISSION
// --- Store properties for each device // --- Store properties for each device
for (int i=0; i<_num_devices; i++) { for (int i=0; i<_num_devices; i++) {
_cl_devices.push_back(device_list[i]); _cl_devices.push_back(device_list[i]);
add_properties(device_list[i]); add_properties(device_list[i]);
} }
#else
// --- Create sub-devices for anything partitionable by NUMA and store props
int num_unpart = _num_devices;
_num_devices = 0;
for (int i=0; i<num_unpart; i++) {
cl_uint num_subdevices = 1;
cl_device_id *subdevice_list = device_list + i;
#ifdef CL_VERSION_1_2
cl_device_affinity_domain adomain;
CL_SAFE_CALL(clGetDeviceInfo(device_list[i],
CL_DEVICE_PARTITION_AFFINITY_DOMAIN,
sizeof(cl_device_affinity_domain),
&adomain,NULL));
cl_device_partition_property props[3];
props[0]=CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN;
props[1]=CL_DEVICE_AFFINITY_DOMAIN_NUMA;
props[2]=0;
if (adomain & CL_DEVICE_AFFINITY_DOMAIN_NUMA)
CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, 0, NULL,
&num_subdevices));
if (num_subdevices > 1) {
subdevice_list = new cl_device_id[num_subdevices];
CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, num_subdevices,
subdevice_list, &num_subdevices));
}
#endif
for (int j=0; j<num_subdevices; j++) {
_num_devices++;
_cl_devices.push_back(subdevice_list[j]);
add_properties(subdevice_list[j]);
}
if (num_subdevices > 1) delete[] subdevice_list;
} // for i
#endif
delete[] device_list; delete[] device_list;
return UCL_SUCCESS; return UCL_SUCCESS;
} }
@ -429,11 +545,18 @@ void UCL_Device::add_properties(cl_device_id device_list) {
sizeof(cl_uint),&op.alignment,nullptr)); sizeof(cl_uint),&op.alignment,nullptr));
op.alignment/=8; op.alignment/=8;
cl_uint float_width;
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
sizeof(float_width),&float_width,nullptr));
op.preferred_vector_width32=float_width;
// Determine if double precision is supported // Determine if double precision is supported
cl_uint double_width; cl_uint double_width;
CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
sizeof(double_width),&double_width,nullptr)); sizeof(double_width),&double_width,nullptr));
op.preferred_vector_width64=double_width;
if (double_width==0) if (double_width==0)
op.double_precision=false; op.double_precision=false;
else else
@ -452,9 +575,14 @@ void UCL_Device::add_properties(cl_device_id device_list) {
op.ecc_support=true; op.ecc_support=true;
op.c_version=""; op.c_version="";
op.is_subdevice=false;
op.partition_equal=false; op.partition_equal=false;
op.partition_counts=false; op.partition_counts=false;
op.partition_affinity=false; op.partition_affinity=false;
op.max_sub_devices=1;
op.cl_device_version=0;
op.has_subgroup_support=false;
op.has_shuffle_support=false;
#ifdef CL_VERSION_1_2 #ifdef CL_VERSION_1_2
size_t return_bytes; size_t return_bytes;
@ -463,6 +591,13 @@ void UCL_Device::add_properties(cl_device_id device_list) {
op.c_version=buffer; op.c_version=buffer;
cl_device_partition_property pinfo[4]; cl_device_partition_property pinfo[4];
CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_TYPE,
4*sizeof(cl_device_partition_property),
&pinfo, &return_bytes));
if (return_bytes == 0) op.is_subdevice=false;
else if (pinfo[0]) op.is_subdevice=true;
else op.is_subdevice=false;
CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PARTITION_PROPERTIES, CL_DEVICE_PARTITION_PROPERTIES,
4*sizeof(cl_device_partition_property), 4*sizeof(cl_device_partition_property),
@ -480,6 +615,46 @@ void UCL_Device::add_properties(cl_device_id device_list) {
CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PARTITION_MAX_SUB_DEVICES, CL_DEVICE_PARTITION_MAX_SUB_DEVICES,
sizeof(cl_uint),&op.max_sub_devices,nullptr)); sizeof(cl_uint),&op.max_sub_devices,nullptr));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_VERSION,1024,buffer,nullptr));
int cl_version_maj = buffer[7] - '0';
int cl_version_min = buffer[9] - '0';
op.cl_device_version = cl_version_maj * 100 + cl_version_min * 10;
size_t ext_str_size_ret;
CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_EXTENSIONS, 0, nullptr,
&ext_str_size_ret));
char buffer2[ext_str_size_ret];
CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_EXTENSIONS,
ext_str_size_ret, buffer2, nullptr));
#if defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)
if (op.cl_device_version >= 210) {
if ((std::string(buffer2).find("cl_khr_subgroups") != std::string::npos) ||
(std::string(buffer2).find("cl_intel_subgroups") != std::string::npos))
op.has_subgroup_support=true;
if (std::string(buffer2).find("cl_intel_subgroups") != std::string::npos)
op.has_shuffle_support=true;
}
#endif
if (std::string(buffer2).find("cl_nv_device_attribute_query") !=
std::string::npos) {
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#endif
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#endif
cl_uint major, minor;
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
sizeof(cl_uint), &major, nullptr));
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
sizeof(cl_uint), &minor, nullptr));
double arch = static_cast<double>(minor)/10+major;
if (arch >= 3.0)
op.has_shuffle_support=true;
}
#endif #endif
_properties.push_back(op); _properties.push_back(op);
@ -516,7 +691,7 @@ std::string UCL_Device::device_type_name(const int i) {
} }
// Get a string telling the type of the device // Get a string telling the type of the device
int UCL_Device::device_type(const int i) { enum UCL_DEVICE_TYPE UCL_Device::device_type(const int i) {
if (_properties[i].device_type==CL_DEVICE_TYPE_CPU) if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
return UCL_CPU; return UCL_CPU;
else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU) else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
@ -529,14 +704,8 @@ int UCL_Device::device_type(const int i) {
// Set the CUDA device to the specified device number // Set the CUDA device to the specified device number
int UCL_Device::set(int num) { int UCL_Device::set(int num) {
cl_device_id *device_list = new cl_device_id[_num_devices];
cl_uint n;
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices,
device_list,&n));
_device=num; _device=num;
_cl_device=device_list[_device]; _cl_device=_cl_devices[_device];
delete[] device_list;
return create_context(); return create_context();
} }
@ -555,6 +724,11 @@ void UCL_Device::print_all(std::ostream &out) {
out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n"; out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n";
out << " Type of device: " out << " Type of device: "
<< device_type_name(i).c_str() << std::endl; << device_type_name(i).c_str() << std::endl;
out << " Is a subdevice: ";
if (is_subdevice(i))
out << "Yes\n";
else
out << "No\n";
out << " Double precision support: "; out << " Double precision support: ";
if (double_precision(i)) if (double_precision(i))
out << "Yes\n"; out << "Yes\n";
@ -613,31 +787,91 @@ void UCL_Device::print_all(std::ostream &out) {
out << "No\n"; out << "No\n";
out << " Maximum subdevices from fission: " out << " Maximum subdevices from fission: "
<< max_sub_devices(i) << std::endl; << max_sub_devices(i) << std::endl;
out << " Shared memory system: ";
if (shared_memory(i))
out << "Yes\n";
else
out << "No\n";
} }
} }
} }
// Select the platform that is associated with accelerators int UCL_Device::auto_set_platform(const enum UCL_DEVICE_TYPE type,
// if pid < 0, select the first platform const std::string vendor,
int UCL_Device::set_platform_accelerator(int pid) { const int ndevices,
if (pid < 0) { const int first_device) {
int found = 0; if (_num_platforms < 2) return set_platform(0);
int last_device = -1;
if (first_device > -1) {
if (ndevices)
last_device = first_device + ndevices - 1;
else
last_device = first_device;
}
bool vendor_match=false;
bool type_match=false;
int max_cus=0;
int best_platform=0;
std::string vendor_upper=vendor;
for (int i=0; i<vendor.length(); i++)
if (vendor_upper[i]<='z' && vendor_upper[i]>='a')
vendor_upper[i]=toupper(vendor_upper[i]);
for (int n=0; n<_num_platforms; n++) { for (int n=0; n<_num_platforms; n++) {
set_platform(n); set_platform(n);
for (int i=0; i<num_devices(); i++) { if (last_device > -1 && last_device >= num_devices()) continue;
if ((_properties[i].device_type & CL_DEVICE_TYPE_CPU) || if (ndevices > num_devices()) continue;
(_properties[i].device_type & CL_DEVICE_TYPE_GPU) ||
(_properties[i].device_type & CL_DEVICE_TYPE_ACCELERATOR)) { int first_id=0;
found = 1; int last_id=num_devices()-1;
break; if (last_device > -1) {
first_id=first_device;
last_id=last_device;
}
if (vendor_upper!="") {
std::string pname = platform_name();
for (int i=0; i<pname.length(); i++)
if (pname[i]<='z' && pname[i]>='a')
pname[i]=toupper(pname[i]);
if (pname.find(vendor_upper)!=std::string::npos) {
if (vendor_match == false) {
best_platform=n;
max_cus=0;
vendor_match=true;
}
} else if (vendor_match)
continue;
}
if (type != UCL_DEFAULT) {
bool ptype_matched=false;
for (int d=first_id; d<=last_id; d++) {
if (type==device_type(d)) {
if (type_match == false) {
best_platform=n;
max_cus=0;
type_match=true;
ptype_matched=true;
} }
} }
if (found) return UCL_SUCCESS;
} }
return UCL_ERROR; if (type_match==true && ptype_matched==false)
} else { continue;
return set_platform(pid);
} }
for (int d=first_id; d<=last_id; d++) {
if (cus(d) > max_cus) {
best_platform=n;
max_cus=cus(d);
}
}
}
return set_platform(best_platform);
} }
} // namespace ucl_opencl } // namespace ucl_opencl

View File

@ -2,6 +2,7 @@
ocl_kernel.h ocl_kernel.h
------------------- -------------------
W. Michael Brown W. Michael Brown
Nitin Dhamankar (Intel)
Utilities for dealing with OpenCL kernels Utilities for dealing with OpenCL kernels
@ -26,6 +27,7 @@
#include "ocl_device.h" #include "ocl_device.h"
#include <fstream> #include <fstream>
#include <cstdio>
namespace ucl_opencl { namespace ucl_opencl {
@ -93,7 +95,7 @@ class UCL_Program {
/// Load a program from a string and compile with flags /// Load a program from a string and compile with flags
inline int load_string(const void *program, const char *flags="", inline int load_string(const void *program, const char *flags="",
std::string *log=nullptr) { std::string *log=nullptr, FILE* foutput=nullptr) {
cl_int error_flag; cl_int error_flag;
const char *prog=(const char *)program; const char *prog=(const char *)program;
_program=clCreateProgramWithSource(_context,1,&prog,nullptr,&error_flag); _program=clCreateProgramWithSource(_context,1,&prog,nullptr,&error_flag);
@ -107,26 +109,65 @@ class UCL_Program {
sizeof(cl_build_status),&build_status, sizeof(cl_build_status),&build_status,
nullptr)); nullptr));
if (build_status != CL_SUCCESS || log!=nullptr) { #ifdef GERYON_KERNEL_DUMP
{
size_t ms; size_t ms;
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0, CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,
nullptr, &ms)); 0,NULL,&ms));
char *build_log = new char[ms]; char *build_log = new char[ms];
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms, CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,
build_log, nullptr)); ms,build_log, NULL));
std::cout << std::endl << std::endl
<< "--------------------------------------------------------\n"
<< " UCL PROGRAM DUMP\n"
<< "--------------------------------------------------------\n"
<< flags << std::endl
<< "--------------------------------------------------------\n"
<< prog << std::endl
<< "--------------------------------------------------------\n"
<< build_log
<< "--------------------------------------------------------\n"
<< std::endl << std::endl;
}
#endif
if (build_status != CL_SUCCESS || log!=NULL) {
size_t ms;
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,
0,NULL,&ms));
char *build_log = new char[ms];
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,
ms,build_log, NULL));
if (log!=nullptr) if (log!=nullptr)
*log=std::string(build_log); *log=std::string(build_log);
if (build_status != CL_SUCCESS) { if (build_status != CL_SUCCESS) {
#ifndef UCL_NO_EXIT #ifndef UCL_NO_EXIT
std::cerr << std::endl std::cerr << std::endl << std::endl
<< "----------------------------------------------------------\n" << "----------------------------------------------------------\n"
<< " UCL Error: Error compiling OpenCL Program (" << " UCL Error: Error compiling OpenCL Program ("
<< build_status << ") ...\n" << build_status << ") ...\n"
<< "----------------------------------------------------------\n"; << "----------------------------------------------------------\n";
std::cerr << build_log << std::endl; std::cerr << build_log << std::endl;
std::cerr <<
"----------------------------------------------------------\n"
<< std::endl << std::endl;
#endif #endif
if (foutput != NULL) {
fprintf(foutput,"\n\n");
fprintf(foutput,
"----------------------------------------------------------\n");
fprintf(foutput,
" UCL Error: Error compiling OpenCL Program (%d) ...\n",
build_status);
fprintf(foutput,
"----------------------------------------------------------\n");
fprintf(foutput,"%s\n",build_log);
fprintf(foutput,
"----------------------------------------------------------\n");
fprintf(foutput,"\n\n");
}
delete[] build_log; delete[] build_log;
return UCL_COMPILE_ERROR; return UCL_COMPILE_ERROR;
} else delete[] build_log; } else delete[] build_log;
@ -141,6 +182,7 @@ class UCL_Program {
inline void cq(command_queue &cq_in) { _cq=cq_in; } inline void cq(command_queue &cq_in) { _cq=cq_in; }
friend class UCL_Kernel; friend class UCL_Kernel;
friend class UCL_Const;
private: private:
bool _init_done; bool _init_done;
cl_program _program; cl_program _program;
@ -322,9 +364,45 @@ class UCL_Kernel {
inline void cq(command_queue &cq_in) { _cq=cq_in; } inline void cq(command_queue &cq_in) { _cq=cq_in; }
#include "ucl_arg_kludge.h" #include "ucl_arg_kludge.h"
#if defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)
inline size_t max_subgroup_size(const size_t block_size_x) {
size_t block_size = block_size_x;
CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device,
CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
sizeof(block_size), (void *) &block_size,
sizeof(size_t), (void *) &_mx_subgroup_sz,
NULL));
return _mx_subgroup_sz;
}
inline size_t max_subgroup_size(const size_t block_size_x,
const size_t block_size_y) {
size_t block_size[2] { block_size_x, block_size_y };
CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device,
CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
sizeof(block_size), (void *) &block_size,
sizeof(size_t), (void *) &_mx_subgroup_sz,
NULL));
return _mx_subgroup_sz;
}
inline size_t max_subgroup_size(const size_t block_size_x,
const size_t block_size_y,
const size_t block_size_z) {
size_t block_size[3] { block_size_x, block_size_y, block_size_z };
CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device,
CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
sizeof(block_size), (void *) &block_size,
sizeof(size_t), (void *) &_mx_subgroup_sz,
NULL));
return _mx_subgroup_sz;
}
#endif
private: private:
cl_kernel _kernel; cl_kernel _kernel;
cl_program _program; cl_program _program;
cl_device_id _device;
cl_uint _dimensions; cl_uint _dimensions;
size_t _block_size[3]; size_t _block_size[3];
size_t _num_blocks[3]; size_t _num_blocks[3];
@ -338,6 +416,11 @@ class UCL_Kernel {
unsigned _kernel_info_nargs; unsigned _kernel_info_nargs;
//std::string _kernel_info_args[256]; //std::string _kernel_info_args[256];
#endif #endif
#ifdef CL_VERSION_2_1
size_t _mx_subgroup_sz; // Maximum sub-group size for this kernel
#endif
}; };
inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) { inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) {
@ -347,6 +430,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
CL_SAFE_CALL(clRetainCommandQueue(_cq)); CL_SAFE_CALL(clRetainCommandQueue(_cq));
_program=program._program; _program=program._program;
CL_SAFE_CALL(clRetainProgram(_program)); CL_SAFE_CALL(clRetainProgram(_program));
_device=program._device;
cl_int error_flag; cl_int error_flag;
_kernel=clCreateKernel(program._program,function,&error_flag); _kernel=clCreateKernel(program._program,function,&error_flag);
@ -380,8 +464,11 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
} }
void UCL_Kernel::run() { void UCL_Kernel::run() {
CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,nullptr, CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL,
_num_blocks,_block_size,0,nullptr,nullptr)); _num_blocks,_block_size,0,NULL,NULL));
#ifdef GERYON_OCL_FLUSH
ucl_flush(_cq);
#endif
} }
} // namespace } // namespace

View File

@ -4,14 +4,6 @@
#include <cstdio> #include <cstdio>
#include <cassert> #include <cassert>
/* We default to OpenCL 1.2 as target version for now as
* there are known issues with OpenCL 2.0 and later.
* This is also to silence warnings from generic OpenCL headers */
#if !defined(CL_TARGET_OPENCL_VERSION)
#define CL_TARGET_OPENCL_VERSION 120
#endif
#ifdef __APPLE__ #ifdef __APPLE__
#include <OpenCL/cl.h> #include <OpenCL/cl.h>
#else #else

View File

@ -108,7 +108,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
return UCL_MEMORY_ERROR; return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*) *mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE, clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
map_perm,0,n,0,nullptr,nullptr,nullptr); map_perm,0,n,0,NULL,NULL,NULL);
mat.cq()=cm.cq(); mat.cq()=cm.cq();
CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
@ -116,18 +116,15 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
} }
template <class mat_type, class copy_type> template <class mat_type, class copy_type>
inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) { inline int _host_view(mat_type &mat, copy_type &cm, const size_t o,
const size_t n) {
cl_int error_flag; cl_int error_flag;
cl_context context; cl_buffer_region subbuffer;
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context), subbuffer.origin = o;
&context,nullptr)); subbuffer.size = n;
cl_mem_flags orig_flags; mat.cbegin()=clCreateSubBuffer(cm.cbegin(), 0,
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags), CL_BUFFER_CREATE_TYPE_REGION, &subbuffer,
&orig_flags,nullptr)); &error_flag);
orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;
mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
*mat.host_ptr(), &error_flag);
CL_CHECK_ERR(error_flag); CL_CHECK_ERR(error_flag);
CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
@ -470,6 +467,9 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
size_t kn=n/sizeof(typename mat_type::data_type); size_t kn=n/sizeof(typename mat_type::data_type);
CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,0,&kn,0,0,0,0)); CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,0,&kn,0,0,0,0));
#endif #endif
#ifdef GERYON_OCL_FLUSH
ucl_flush(cq);
#endif
} }
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
@ -585,7 +585,10 @@ template <> struct _ucl_memcpy<1,0> {
std::cerr << "UCL_COPY 1NS\n"; std::cerr << "UCL_COPY 1NS\n";
#endif #endif
CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n, CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n,
dst.begin(),0,nullptr,nullptr)); dst.begin(),0,NULL,NULL));
#ifdef GERYON_OCL_FLUSH
if (block==CL_FALSE) ucl_flush(cq);
#endif
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
@ -617,6 +620,9 @@ template <> struct _ucl_memcpy<1,0> {
src_offset+=spitch; src_offset+=spitch;
dst_offset+=dpitch; dst_offset+=dpitch;
} }
#ifdef GERYON_OCL_FLUSH
if (block==CL_FALSE) ucl_flush(cq);
#endif
} }
}; };
@ -637,7 +643,10 @@ template <> struct _ucl_memcpy<0,1> {
std::cerr << "UCL_COPY 3NS\n"; std::cerr << "UCL_COPY 3NS\n";
#endif #endif
CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n, CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n,
src.begin(),0,nullptr,nullptr)); src.begin(),0,NULL,NULL));
#ifdef GERYON_OCL_FLUSH
if (block==CL_FALSE) ucl_flush(cq);
#endif
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
@ -669,6 +678,9 @@ template <> struct _ucl_memcpy<0,1> {
src_offset+=spitch; src_offset+=spitch;
dst_offset+=dpitch; dst_offset+=dpitch;
} }
#ifdef GERYON_OCL_FLUSH
if (block==CL_FALSE) ucl_flush(cq);
#endif
} }
}; };
@ -690,6 +702,9 @@ template <int mem1, int mem2> struct _ucl_memcpy {
#endif #endif
if (block==CL_TRUE) ucl_sync(cq); if (block==CL_TRUE) ucl_sync(cq);
#ifdef GERYON_OCL_FLUSH
else ucl_flush(cq);
#endif
} }
template <class p1, class p2> template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
@ -720,6 +735,9 @@ template <int mem1, int mem2> struct _ucl_memcpy {
#endif #endif
if (block==CL_TRUE) ucl_sync(cq); if (block==CL_TRUE) ucl_sync(cq);
#ifdef GERYON_OCL_FLUSH
else ucl_flush(cq);
#endif
} }
}; };

View File

@ -53,6 +53,59 @@ class UCL_Texture {
friend class UCL_Kernel; friend class UCL_Kernel;
}; };
/// Class storing a const global memory reference
class UCL_Const {
public:
UCL_Const() : _global_bytes(0), _active(false) {}
~UCL_Const() { clear(); }
/// Construct with a specified global reference
inline UCL_Const(UCL_Program &prog, const char *global_name)
{ get_global(prog,global_name); }
/// Set the global reference for this object
inline void get_global(UCL_Program &prog, const char *global_name) {
if (_active) {
CL_DESTRUCT_CALL(clReleaseContext(_context));
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
}
_active = true;
_context = prog._context;
_cq = prog._cq;
CL_SAFE_CALL(clRetainContext(_context));
CL_SAFE_CALL(clRetainCommandQueue(_cq));
}
/// Copy from array on host to const memory
template <class numtyp>
inline void update_device(UCL_H_Vec<numtyp> &src, const int numel) {
const int bytes=numel*sizeof(numtyp);
if (_global_bytes < bytes) {
if (_global_bytes) CL_SAFE_CALL(clReleaseMemObject(_global));
cl_int e;
_global = clCreateBuffer(_context, CL_MEM_READ_ONLY, bytes, NULL, &e);
CL_SAFE_CALL(e);
}
CL_SAFE_CALL(clEnqueueWriteBuffer(_cq, _global, CL_FALSE, 0, bytes,
(void *)src.begin(), 0, NULL, NULL));
}
/// Get device ptr associated with object
inline const cl_mem * begin() const { return &_global; }
inline void clear() {
if (_global_bytes) CL_SAFE_CALL(clReleaseMemObject(_global));
if (_active) {
CL_DESTRUCT_CALL(clReleaseContext(_context));
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
}
_global_bytes=0;
_active=false;
}
private:
cl_mem _global;
size_t _global_bytes;
cl_context _context;
cl_command_queue _cq;
bool _active;
};
} // namespace } // namespace
#endif #endif

View File

@ -61,7 +61,6 @@ class UCL_Timer {
/// Initialize command queue for timing /// Initialize command queue for timing
inline void init(UCL_Device &dev, command_queue &cq) { inline void init(UCL_Device &dev, command_queue &cq) {
clear(); clear();
t_factor=dev.timer_resolution()/1000000000.0;
_cq=cq; _cq=cq;
clRetainCommandQueue(_cq); clRetainCommandQueue(_cq);
_initialized=true; _initialized=true;
@ -124,17 +123,17 @@ class UCL_Timer {
clReleaseEvent(start_event); clReleaseEvent(start_event);
clReleaseEvent(stop_event); clReleaseEvent(stop_event);
has_measured_time = false; has_measured_time = false;
return (tend-tstart)*t_factor; return (tend-tstart)*1e-6;
} }
/// Return the time (s) of last start to stop - Forces synchronization /// Return the time (s) of last start to stop - Forces synchronization
inline double seconds() { return time()/1000.0; } inline double seconds() { return time()*1e-3; }
/// Return the total time in ms /// Return the total time in ms
inline double total_time() { return _total_time; } inline double total_time() { return _total_time; }
/// Return the total time in seconds /// Return the total time in seconds
inline double total_seconds() { return _total_time/1000.0; } inline double total_seconds() { return _total_time*1e-3; }
private: private:
cl_event start_event, stop_event; cl_event start_event, stop_event;

View File

@ -69,17 +69,17 @@ class UCL_BaseMat {
/// Return the type/permissions of memory allocation /// Return the type/permissions of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED
* or UCL_VIEW **/ * or UCL_VIEW **/
/// Assert that any ops in associate command queue have been issued to device
inline void flush() { ucl_flush(_cq); }
inline enum UCL_MEMOPT kind() const { return _kind; } inline enum UCL_MEMOPT kind() const { return _kind; }
inline bool shared_mem_device() { inline bool shared_mem_device() {
#ifdef _OCL_MAT #ifdef _OCL_MAT
cl_device_id device; cl_device_id device;
CL_SAFE_CALL(clGetCommandQueueInfo(_cq,CL_QUEUE_DEVICE, CL_SAFE_CALL(clGetCommandQueueInfo(_cq,CL_QUEUE_DEVICE,
sizeof(cl_device_id),&device,nullptr)); sizeof(cl_device_id),&device,NULL));
cl_device_type device_type; return _shared_mem_device(device);
CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE,
sizeof(device_type),&device_type,nullptr));
return _shared_mem_device(device_type);
#else #else
return false; return false;
#endif #endif

View File

@ -39,7 +39,7 @@ class UCL_D_Vec : public UCL_BaseMat {
}; };
typedef numtyp data_type; typedef numtyp data_type;
UCL_D_Vec() : _cols(0) {} UCL_D_Vec() : _cols(0), _row_bytes(0) {}
~UCL_D_Vec() { _device_free(*this); } ~UCL_D_Vec() { _device_free(*this); }
/// Construct with n columns /// Construct with n columns

View File

@ -44,10 +44,8 @@ using namespace ucl_hip;
int main(int argc, char** argv) { int main(int argc, char** argv) {
UCL_Device cop; UCL_Device cop;
std::cout << "Found " << cop.num_platforms() << " platform(s).\n"; std::cout << "Found " << cop.num_platforms() << " platform(s).\n";
if (cop.num_platforms()>0) { if (cop.num_platforms()>0)
std::cout << "Using platform: " << cop.platform_name() << std::endl;
cop.print_all(std::cout); cop.print_all(std::cout);
}
return 0; return 0;
} }

View File

@ -241,7 +241,7 @@ class UCL_H_Mat : public UCL_BaseMat {
_array=input.begin()+offset; _array=input.begin()+offset;
_end=_array+_cols; _end=_array+_cols;
#ifdef _OCL_MAT #ifdef _OCL_MAT
_host_view(*this,input,_row_bytes*_rows); _host_view(*this,input,offset*sizeof(numtyp),_row_bytes*_rows);
#endif #endif
} }

View File

@ -39,7 +39,7 @@ class UCL_H_Vec : public UCL_BaseMat {
}; };
typedef numtyp data_type; typedef numtyp data_type;
UCL_H_Vec() : _cols(0) { UCL_H_Vec() : _cols(0), _row_bytes(0) {
#ifdef _OCL_MAT #ifdef _OCL_MAT
_carray=(cl_mem)(0); _carray=(cl_mem)(0);
#endif #endif
@ -135,7 +135,7 @@ class UCL_H_Vec : public UCL_BaseMat {
_cols=cols; _cols=cols;
_row_bytes=_cols*sizeof(numtyp); _row_bytes=_cols*sizeof(numtyp);
this->_cq=input.cq(); this->_cq=input.cq();
_array=input.begin(); _array=(numtyp *)input.begin();
_end=_array+_cols; _end=_array+_cols;
#ifdef _OCL_MAT #ifdef _OCL_MAT
_carray=input.cbegin(); _carray=input.cbegin();
@ -240,10 +240,10 @@ class UCL_H_Vec : public UCL_BaseMat {
_cols=cols; _cols=cols;
_row_bytes=_cols*sizeof(numtyp); _row_bytes=_cols*sizeof(numtyp);
this->_cq=input.cq(); this->_cq=input.cq();
_array=input.begin()+offset; _array=(numtyp *)input.begin()+offset;
_end=_array+_cols; _end=_array+_cols;
#ifdef _OCL_MAT #ifdef _OCL_MAT
_host_view(*this,input,_row_bytes); _host_view(*this,input,offset*sizeof(numtyp),_row_bytes);
#endif #endif
} }

View File

@ -162,6 +162,8 @@ class UCL_Vector {
inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); } inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); }
/// Block until command_queue associated with matrix is complete /// Block until command_queue associated with matrix is complete
inline void sync() { host.sync(); } inline void sync() { host.sync(); }
/// Assert that any ops in associate command queue have been issued to device
inline void flush() { ucl_flush(host.cq()); }
///Get the size of a row on the host (including any padding) in elements ///Get the size of a row on the host (including any padding) in elements
inline size_t row_size() const { return host.row_size(); } inline size_t row_size() const { return host.row_size(); }

View File

@ -14,6 +14,9 @@
***************************************************************************/ ***************************************************************************/
#include "lal_answer.h" #include "lal_answer.h"
#if (LAL_USE_OMP == 1)
#include <omp.h>
#endif
namespace LAMMPS_AL { namespace LAMMPS_AL {
#define AnswerT Answer<numtyp,acctyp> #define AnswerT Answer<numtyp,acctyp>
@ -81,6 +84,10 @@ bool AnswerT::init(const int inum, const bool charge, const bool rot,
_time_cast=0.0; _time_cast=0.0;
_time_cpu_idle=0.0; _time_cpu_idle=0.0;
success=success && (error_flag.alloc(1,*dev,UCL_READ_WRITE,
UCL_WRITE_ONLY)==UCL_SUCCESS);
if (success) error_flag.zero();
return success && alloc(ef_inum); return success && alloc(ef_inum);
} }
@ -111,6 +118,7 @@ bool AnswerT::add_fields(const bool charge, const bool rot) {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void AnswerT::clear() { void AnswerT::clear() {
_gpu_bytes=0; _gpu_bytes=0;
error_flag.clear();
if (!_allocated) if (!_allocated)
return; return;
_allocated=false; _allocated=false;
@ -138,12 +146,21 @@ double AnswerT::host_memory_usage() const {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void AnswerT::copy_answers(const bool eflag, const bool vflag, void AnswerT::copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom) { const bool ef_atom, const bool vf_atom,
const int red_blocks) {
time_answer.start(); time_answer.start();
_eflag=eflag; _eflag=eflag;
_vflag=vflag; _vflag=vflag;
_ef_atom=ef_atom; _ef_atom=ef_atom;
_vf_atom=vf_atom; _vf_atom=vf_atom;
#ifdef LAL_NO_BLOCK_REDUCE
_ev_stride=_inum;
#else
if (ef_atom || vf_atom)
_ev_stride=_inum;
else
_ev_stride=red_blocks;
#endif
int csize=_ev_fields; int csize=_ev_fields;
if (!eflag) if (!eflag)
@ -152,20 +169,24 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
csize-=6; csize-=6;
if (csize>0) if (csize>0)
engv.update_host(_inum*csize,true); engv.update_host(_ev_stride*csize,true);
if (_rot) if (_rot)
force.update_host(_inum*4*2,true); force.update_host(_inum*4*2,true);
else else
force.update_host(_inum*4,true); force.update_host(_inum*4,true);
time_answer.stop(); time_answer.stop();
#ifndef GERYON_OCL_FLUSH
force.flush();
#endif
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void AnswerT::copy_answers(const bool eflag, const bool vflag, void AnswerT::copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom, const bool ef_atom, const bool vf_atom,
int *ilist) { int *ilist, const int red_blocks) {
_ilist=ilist; _ilist=ilist;
copy_answers(eflag,vflag,ef_atom,vf_atom); copy_answers(eflag,vflag,ef_atom,vf_atom,red_blocks);
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -177,21 +198,24 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
double evdwl=0.0; double evdwl=0.0;
int vstart=0; int vstart=0;
if (_eflag) { if (_eflag) {
for (int i=0; i<_inum; i++) #if (LAL_USE_OMP_SIMD == 1)
#pragma omp simd reduction(+:evdwl)
#endif
for (int i=0; i<_ev_stride; i++)
evdwl+=engv[i]; evdwl+=engv[i];
if (_ef_atom) { if (_ef_atom) {
if (_ilist==nullptr) { if (_ilist==nullptr) {
for (int i=0; i<_inum; i++) for (int i=0; i<_ev_stride; i++)
eatom[i]+=engv[i]; eatom[i]+=engv[i];
} else { } else {
for (int i=0; i<_inum; i++) for (int i=0; i<_ev_stride; i++)
eatom[_ilist[i]]+=engv[i]; eatom[_ilist[i]]+=engv[i];
} }
} }
vstart=_inum; vstart=_ev_stride;
} }
if (_vflag) { if (_vflag) {
int iend=vstart+_inum; int iend=vstart+_ev_stride;
for (int j=0; j<6; j++) { for (int j=0; j<6; j++) {
for (int i=vstart; i<iend; i++) for (int i=vstart; i<iend; i++)
virial[j]+=engv[i]; virial[j]+=engv[i];
@ -206,8 +230,8 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
vatom[_ilist[ii++]][j]+=engv[i]; vatom[_ilist[ii++]][j]+=engv[i];
} }
} }
vstart+=_inum; vstart+=_ev_stride;
iend+=_inum; iend+=_ev_stride;
} }
} }
@ -224,28 +248,36 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
return energy_virial(eatom,vatom,virial); return energy_virial(eatom,vatom,virial);
double evdwl=0.0; double evdwl=0.0;
int ii, vstart=0, iend=_inum; int ii, vstart=0, iend=_ev_stride;
if (_eflag) { if (_eflag) {
iend=_inum*2; iend=_ev_stride*2;
for (int i=0; i<_inum; i++) #if (LAL_USE_OMP_SIMD == 1)
#pragma omp simd reduction(+:evdwl)
#endif
for (int i=0; i<_ev_stride; i++)
evdwl+=engv[i]; evdwl+=engv[i];
for (int i=_inum; i<iend; i++) double ecv=0.0;
ecoul+=engv[i]; #if (LAL_USE_OMP_SIMD == 1)
#pragma omp simd reduction(+:ecv)
#endif
for (int i=_ev_stride; i<iend; i++)
ecv+=engv[i];
ecoul+=ecv;
if (_ef_atom) { if (_ef_atom) {
if (_ilist==nullptr) { if (_ilist==nullptr) {
for (int i=0; i<_inum; i++) for (int i=0; i<_ev_stride; i++)
eatom[i]+=engv[i]; eatom[i]+=engv[i];
for (int i=_inum; i<iend; i++) for (int i=_ev_stride; i<iend; i++)
eatom[i]+=engv[i]; eatom[i]+=engv[i];
} else { } else {
for (int i=0, ii=0; i<_inum; i++) for (int i=0, ii=0; i<_ev_stride; i++)
eatom[_ilist[ii++]]+=engv[i]; eatom[_ilist[ii++]]+=engv[i];
for (int i=_inum, ii=0; i<iend; i++) for (int i=_ev_stride, ii=0; i<iend; i++)
eatom[_ilist[ii++]]+=engv[i]; eatom[_ilist[ii++]]+=engv[i];
} }
} }
vstart=iend; vstart=iend;
iend+=_inum; iend+=_ev_stride;
} }
if (_vflag) { if (_vflag) {
for (int j=0; j<6; j++) { for (int j=0; j<6; j++) {
@ -260,8 +292,8 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
vatom[_ilist[ii++]][j]+=engv[i]; vatom[_ilist[ii++]][j]+=engv[i];
} }
} }
vstart+=_inum; vstart+=_ev_stride;
iend+=_inum; iend+=_ev_stride;
} }
} }
@ -270,24 +302,63 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void AnswerT::get_answers(double **f, double **tor) { void AnswerT::get_answers(double **f, double **tor) {
int fl=0;
if (_ilist==nullptr) { if (_ilist==nullptr) {
for (int i=0; i<_inum; i++) { typedef struct { double x,y,z; } vec3d;
f[i][0]+=force[fl]; typedef struct { acctyp x,y,z,w; } vec4d_t;
f[i][1]+=force[fl+1]; vec3d *fp=reinterpret_cast<vec3d*>(&(f[0][0]));
f[i][2]+=force[fl+2]; vec4d_t *forcep=reinterpret_cast<vec4d_t*>(&(force[0]));
fl+=4;
#if (LAL_USE_OMP == 1)
#pragma omp parallel
#endif
{
#if (LAL_USE_OMP == 1)
const int nthreads = omp_get_num_threads();
const int tid = omp_get_thread_num();
const int idelta = _inum / nthreads + 1;
const int ifrom = tid * idelta;
const int ito = std::min(ifrom + idelta, _inum);
#else
const int tid = 0;
const int ifrom = 0;
const int ito = _inum;
#endif
for (int i=ifrom; i<ito; i++) {
fp[i].x+=forcep[i].x;
fp[i].y+=forcep[i].y;
fp[i].z+=forcep[i].z;
} }
if (_rot) { if (_rot) {
for (int i=0; i<_inum; i++) { vec3d *torp=reinterpret_cast<vec3d*>(&(tor[0][0]));
tor[i][0]+=force[fl]; forcep=reinterpret_cast<vec4d_t*>(&(force[_inum*4]));
tor[i][1]+=force[fl+1]; for (int i=ifrom; i<ito; i++) {
tor[i][2]+=force[fl+2]; torp[i].x+=forcep[i].x;
fl+=4; torp[i].y+=forcep[i].y;
torp[i].z+=forcep[i].z;
}
} }
} }
} else { } else {
for (int i=0; i<_inum; i++) { #if (LAL_USE_OMP == 1)
#pragma omp parallel
#endif
{
#if (LAL_USE_OMP == 1)
const int nthreads = omp_get_num_threads();
const int tid = omp_get_thread_num();
const int idelta = _inum / nthreads + 1;
const int ifrom = tid * idelta;
const int ito = std::min(ifrom + idelta, _inum);
int fl=ifrom*4;
#else
const int tid = 0;
const int ifrom = 0;
const int ito = _inum;
int fl=0;
#endif
for (int i=ifrom; i<ito; i++) {
int ii=_ilist[i]; int ii=_ilist[i];
f[ii][0]+=force[fl]; f[ii][0]+=force[fl];
f[ii][1]+=force[fl+1]; f[ii][1]+=force[fl+1];
@ -295,7 +366,8 @@ void AnswerT::get_answers(double **f, double **tor) {
fl+=4; fl+=4;
} }
if (_rot) { if (_rot) {
for (int i=0; i<_inum; i++) { fl=_inum*4 + ifrom*4;
for (int i=ifrom; i<ito; i++) {
int ii=_ilist[i]; int ii=_ilist[i];
tor[ii][0]+=force[fl]; tor[ii][0]+=force[fl];
tor[ii][1]+=force[fl+1]; tor[ii][1]+=force[fl+1];
@ -305,6 +377,7 @@ void AnswerT::get_answers(double **f, double **tor) {
} }
} }
} }
}
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void AnswerT::cq(const int cq_index) { void AnswerT::cq(const int cq_index) {

View File

@ -110,12 +110,12 @@ class Answer {
// -------------------------COPY FROM GPU ------------------------------- // -------------------------COPY FROM GPU -------------------------------
/// Copy answers from device into read buffer asynchronously /// Copy answers from device into read buffer asynchronously
void copy_answers(const bool eflag, const bool vflag, void copy_answers(const bool eflag, const bool vflag, const bool ef_atom,
const bool ef_atom, const bool vf_atom); const bool vf_atom, const int red_blocks);
/// Copy answers from device into read buffer asynchronously /// Copy answers from device into read buffer asynchronously
void copy_answers(const bool eflag, const bool vflag, void copy_answers(const bool eflag, const bool vflag, const bool ef_atom,
const bool ef_atom, const bool vf_atom, int *ilist); const bool vf_atom, int *ilist, const int red_blocks);
/// Copy energy and virial data into LAMMPS memory /// Copy energy and virial data into LAMMPS memory
double energy_virial(double *eatom, double **vatom, double *virial); double energy_virial(double *eatom, double **vatom, double *virial);
@ -128,11 +128,13 @@ class Answer {
void get_answers(double **f, double **tor); void get_answers(double **f, double **tor);
inline double get_answers(double **f, double **tor, double *eatom, inline double get_answers(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) { double **vatom, double *virial, double &ecoul,
int &error_flag_in) {
double ta=MPI_Wtime(); double ta=MPI_Wtime();
time_answer.sync_stop(); time_answer.sync_stop();
_time_cpu_idle+=MPI_Wtime()-ta; _time_cpu_idle+=MPI_Wtime()-ta;
double ts=MPI_Wtime(); double ts=MPI_Wtime();
if (error_flag[0]) error_flag_in=error_flag[0];
double evdw=energy_virial(eatom,vatom,virial,ecoul); double evdw=energy_virial(eatom,vatom,virial,ecoul);
get_answers(f,tor); get_answers(f,tor);
_time_cast+=MPI_Wtime()-ts; _time_cast+=MPI_Wtime()-ts;
@ -151,6 +153,8 @@ class Answer {
UCL_Vector<acctyp,acctyp> force; UCL_Vector<acctyp,acctyp> force;
/// Energy and virial per-atom storage /// Energy and virial per-atom storage
UCL_Vector<acctyp,acctyp> engv; UCL_Vector<acctyp,acctyp> engv;
/// Error flag
UCL_Vector<int,int> error_flag;
/// Device timers /// Device timers
UCL_Timer time_answer; UCL_Timer time_answer;
@ -162,7 +166,7 @@ class Answer {
bool alloc(const int inum); bool alloc(const int inum);
bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other; bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
int _max_local, _inum, _e_fields, _ev_fields, _ans_fields; int _max_local, _inum, _e_fields, _ev_fields, _ans_fields, _ev_stride;
int *_ilist; int *_ilist;
double _time_cast, _time_cpu_idle; double _time_cast, _time_cpu_idle;

View File

@ -414,9 +414,9 @@ const char *atom=0;
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void AtomT::compile_kernels(UCL_Device &dev) { void AtomT::compile_kernels(UCL_Device &dev) {
std::string flags = "-D"+std::string(OCL_VENDOR); std::string flags = "";
atom_program=new UCL_Program(dev); atom_program=new UCL_Program(dev);
atom_program->load_string(atom,flags); atom_program->load_string(atom,flags,nullptr,screen);
k_cast_x.set_function(*atom_program,"kernel_cast_x"); k_cast_x.set_function(*atom_program,"kernel_cast_x");
_compiled=true; _compiled=true;
} }

View File

@ -24,6 +24,9 @@
#include "geryon/ocl_mat.h" #include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h" #include "geryon/ocl_kernel.h"
using namespace ucl_opencl; using namespace ucl_opencl;
#ifndef LAL_NO_OCL_EV_JIT
#define LAL_OCL_EV_JIT
#endif
#elif defined(USE_CUDART) #elif defined(USE_CUDART)
#include "geryon/nvc_timer.h" #include "geryon/nvc_timer.h"
#include "geryon/nvc_mat.h" #include "geryon/nvc_mat.h"
@ -178,7 +181,7 @@ class Atom {
ii+=m_size-n; ii+=m_size-n;
} }
UCL_H_Vec<dev_typ> view; UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); view.view_offset(0,buffer,m_size*m_size);
ucl_copy(dev_v,view,false); ucl_copy(dev_v,view,false);
} }
@ -197,7 +200,26 @@ class Atom {
ii+=m_size-n; ii+=m_size-n;
} }
UCL_H_Vec<dev_typ> view; UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); view.view_offset(0,buffer,m_size*m_size);
ucl_copy(dev_v,view,false);
}
/// Pack LAMMPS atom type constants into 2 vectors and copy to device
template <class dev_typ, class t1, class t2>
inline void type_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
UCL_H_Vec<numtyp> &buffer, t1 ***one, t2 ***two) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
for (int k=0; k<n; k++) {
buffer[ii*2]=static_cast<numtyp>(one[i][j][k]);
buffer[ii*2+1]=static_cast<numtyp>(two[i][j][k]);
ii++;
}
}
}
UCL_H_Vec<dev_typ> view;
view.view_offset(0,buffer,n*n*n);
ucl_copy(dev_v,view,false); ucl_copy(dev_v,view,false);
} }
@ -217,7 +239,7 @@ class Atom {
ii+=m_size-n; ii+=m_size-n;
} }
UCL_H_Vec<dev_typ> view; UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); view.view_offset(0,buffer,m_size*m_size);
ucl_copy(dev_v,view,false); ucl_copy(dev_v,view,false);
} }
@ -238,7 +260,7 @@ class Atom {
ii+=m_size-n; ii+=m_size-n;
} }
UCL_H_Vec<dev_typ> view; UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); view.view_offset(0,buffer,m_size*m_size);
ucl_copy(dev_v,view,false); ucl_copy(dev_v,view,false);
} }
@ -251,7 +273,7 @@ class Atom {
buffer[i*2+1]=static_cast<numtyp>(two[i][i]); buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
} }
UCL_H_Vec<dev_typ> view; UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),n,*dev); view.view_offset(0,buffer,n);
ucl_copy(dev_v,view,false); ucl_copy(dev_v,view,false);
} }
@ -261,6 +283,9 @@ class Atom {
inline void data_unavail() inline void data_unavail()
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; } { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; }
typedef struct { double x,y,z; } vec3d;
typedef struct { numtyp x,y,z,w; } vec4d_t;
/// Cast positions and types to write buffer /// Cast positions and types to write buffer
inline void cast_x_data(double **host_ptr, const int *host_type) { inline void cast_x_data(double **host_ptr, const int *host_type) {
if (_x_avail==false) { if (_x_avail==false) {
@ -269,13 +294,16 @@ class Atom {
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int)); memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
#else #else
int wl=0; vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
vec4d_t *xp=reinterpret_cast<vec4d_t*>(&(x[0]));
#if (LAL_USE_OMP == 1)
#pragma omp parallel for schedule(static)
#endif
for (int i=0; i<_nall; i++) { for (int i=0; i<_nall; i++) {
x[wl]=host_ptr[i][0]; xp[i].x=host_p[i].x;
x[wl+1]=host_ptr[i][1]; xp[i].y=host_p[i].y;
x[wl+2]=host_ptr[i][2]; xp[i].z=host_p[i].z;
x[wl+3]=host_type[i]; xp[i].w=host_type[i];
wl+=4;
} }
#endif #endif
_time_cast+=MPI_Wtime()-t; _time_cast+=MPI_Wtime()-t;
@ -320,6 +348,11 @@ class Atom {
} else if (sizeof(numtyp)==sizeof(double)) } else if (sizeof(numtyp)==sizeof(double))
memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp)); memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp));
else else
#if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
#pragma omp parallel for simd schedule(static)
#elif (LAL_USE_OMP_SIMD == 1)
#pragma omp simd
#endif
for (int i=0; i<_nall; i++) q[i]=host_ptr[i]; for (int i=0; i<_nall; i++) q[i]=host_ptr[i];
_time_cast+=MPI_Wtime()-t; _time_cast+=MPI_Wtime()-t;
} }
@ -346,6 +379,11 @@ class Atom {
} else if (sizeof(numtyp)==sizeof(double)) } else if (sizeof(numtyp)==sizeof(double))
memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp)); memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
else else
#if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
#pragma omp parallel for simd schedule(static)
#elif (LAL_USE_OMP_SIMD == 1)
#pragma omp simd
#endif
for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i]; for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
_time_cast+=MPI_Wtime()-t; _time_cast+=MPI_Wtime()-t;
} }
@ -370,13 +408,16 @@ class Atom {
memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int)); memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int));
#else #else
int wl=0; vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(v[0]));
#if (LAL_USE_OMP == 1)
#pragma omp parallel for schedule(static)
#endif
for (int i=0; i<_nall; i++) { for (int i=0; i<_nall; i++) {
v[wl]=host_ptr[i][0]; vp[i].x=host_p[i].x;
v[wl+1]=host_ptr[i][1]; vp[i].y=host_p[i].y;
v[wl+2]=host_ptr[i][2]; vp[i].z=host_p[i].z;
v[wl+3]=host_tag[i]; vp[i].w=host_tag[i];
wl+=4;
} }
#endif #endif
_time_cast+=MPI_Wtime()-t; _time_cast+=MPI_Wtime()-t;

View File

@ -40,170 +40,521 @@
nbor_begin+=offset; \ nbor_begin+=offset; \
} }
#if (ARCH < 300) #define nbor_info_p(nbor_mem, nbor_stride, t_per_atom, ii, offset, \
i, numj, stride, nbor_end, nbor_begin) \
i=nbor_mem[ii]; \
nbor_begin=ii+nbor_stride; \
numj=nbor_mem[nbor_begin]; \
nbor_begin+=nbor_stride+ii*(t_per_atom-1); \
stride=fast_mul(t_per_atom,nbor_stride); \
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & \
(t_per_atom-1)); \
nbor_begin+=offset;
#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ #if (SHUFFLE_AVAIL == 0)
eflag, vflag, ans, engv) \
if (t_per_atom>1) { \ #define simd_reduce_add1(width, local, offset, tid, one) \
__local acctyp red_acc[6][BLOCK_PAIR]; \ local[0][tid]=one; \
red_acc[0][tid]=f.x; \ for (unsigned int s=width/2; s>0; s>>=1) { \
red_acc[1][tid]=f.y; \ simdsync(); \
red_acc[2][tid]=f.z; \ if (offset < s) local[0][tid] += local[0][tid+s]; \
red_acc[3][tid]=energy; \ } \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ if (offset==0) one=local[0][tid];
#define simd_reduce_add2(width, local, offset, tid, one, two) \
local[0][tid]=one; \
local[1][tid]=two; \
for (unsigned int s=width/2; s>0; s>>=1) { \
simdsync(); \
if (offset < s) { \ if (offset < s) { \
for (int r=0; r<4; r++) \ local[0][tid] += local[0][tid+s]; \
red_acc[r][tid] += red_acc[r][tid+s]; \ local[1][tid] += local[1][tid+s]; \
} \
} \
f.x=red_acc[0][tid]; \
f.y=red_acc[1][tid]; \
f.z=red_acc[2][tid]; \
energy=red_acc[3][tid]; \
if (vflag>0) { \
for (int r=0; r<6; r++) \
red_acc[r][tid]=virial[r]; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<6; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
for (int r=0; r<6; r++) \
virial[r]=red_acc[r][tid]; \
} \ } \
} \ } \
if (offset==0) { \ if (offset==0) { \
one=local[0][tid]; \
two=local[1][tid]; \
}
#define simd_reduce_add3(width, local, offset, tid, one, two, three) \
local[0][tid]=one; \
local[1][tid]=two; \
local[2][tid]=three; \
for (unsigned int s=width/2; s>0; s>>=1) { \
simdsync(); \
if (offset < s) { \
local[0][tid] += local[0][tid+s]; \
local[1][tid] += local[1][tid+s]; \
local[2][tid] += local[2][tid+s]; \
} \
} \
if (offset==0) { \
one=local[0][tid]; \
two=local[1][tid]; \
three=local[2][tid]; \
}
#define simd_reduce_add6(width, local, offset, tid, one, two, three, \
four, five, six) \
local[0][tid]=one; \
local[1][tid]=two; \
local[2][tid]=three; \
local[3][tid]=four; \
local[4][tid]=five; \
local[5][tid]=six; \
for (unsigned int s=width/2; s>0; s>>=1) { \
simdsync(); \
if (offset < s) { \
local[0][tid] += local[0][tid+s]; \
local[1][tid] += local[1][tid+s]; \
local[2][tid] += local[2][tid+s]; \
local[3][tid] += local[3][tid+s]; \
local[4][tid] += local[4][tid+s]; \
local[5][tid] += local[5][tid+s]; \
} \
} \
if (offset==0) { \
one=local[0][tid]; \
two=local[1][tid]; \
three=local[2][tid]; \
four=local[3][tid]; \
five=local[4][tid]; \
six=local[5][tid]; \
}
#define simd_reduce_arr(trip, width, local, offset, tid, arr) \
for (int r=0; r<trip; r++) \
local[r][tid]=arr[r]; \
for (unsigned int s=width/2; s>0; s>>=1) { \
simdsync(); \
if (offset < s) { \
for (int r=0; r<trip; r++) \
local[r][tid] += local[r][tid+s]; \
} \
} \
if (offset==0) { \
for (int r=0; r<trip; r++) \
arr[r]=local[r][tid]; \
}
#define block_reduce_add1(width, local, tid, one) \
local[0][tid]=one; \
for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) { \
__syncthreads(); \
if (tid < s) local[0][tid] += local[0][tid+s]; \
} \
if (tid<width) { \
for (unsigned int s=width/2; s>0; s>>=1) { \
simdsync(); \
if (tid < s) local[0][tid] += local[0][tid+s]; \
} \
if (tid==0) one=local[0][tid]; \
}
#define block_reduce_add2(width, local, tid, one, two) \
local[0][tid]=one; \
local[1][tid]=two; \
for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) { \
__syncthreads(); \
if (tid < s) { \
local[0][tid] += local[0][tid+s]; \
local[1][tid] += local[1][tid+s]; \
} \
} \
if (tid<width) { \
for (unsigned int s=width/2; s>0; s>>=1) { \
simdsync(); \
if (tid < s) { \
local[0][tid] += local[0][tid+s]; \
local[1][tid] += local[1][tid+s]; \
} \
} \
if (tid==0) { \
one=local[0][tid]; \
two=local[1][tid]; \
} \
}
#define block_reduce_arr(trip, width, local, tid, arr) \
for (int r=0; r<trip; r++) \
local[r][tid]=arr[r]; \
for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) { \
__syncthreads(); \
if (tid < s) { \
for (int r=0; r<trip; r++) \
local[r][tid] += local[r][tid+s]; \
} \
} \
if (tid<width) { \
for (unsigned int s=width/2; s>0; s>>=1) { \
simdsync(); \
if (tid < s) { \
for (int r=0; r<trip; r++) \
local[r][tid] += local[r][tid+s]; \
} \
} \
if (tid==0) { \
for (int r=0; r<trip; r++) \
arr[r]=local[r][tid]; \
} \
}
#define local_allocate_store_pair() \
__local acctyp red_acc[6][BLOCK_PAIR];
#define local_allocate_store_charge() \
__local acctyp red_acc[6][BLOCK_PAIR];
#define local_allocate_store_bio() \
__local acctyp red_acc[6][BLOCK_BIO_PAIR];
#define local_allocate_store_ellipse() \
__local acctyp red_acc[6][BLOCK_ELLIPSE];
#define local_allocate_store_three() \
__local acctyp red_acc[6][BLOCK_ELLIPSE];
#define store_answers(f, energy, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \
if (EVFLAG && (vflag==2 || eflag==2)) { \
if (eflag) { \
simdsync(); \
simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \
} \
if (vflag) { \
simdsync(); \
simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \
} \
} \
} \
if (offset==0 && ii<inum) ans[ii]=f; \
if (EVFLAG && (eflag || vflag)) { \
int ei=BLOCK_ID_X; \
if (eflag!=2 && vflag!=2) { \
const int ev_stride=NUM_BLOCKS_X; \
if (eflag) { \
simdsync(); \
block_reduce_add1(simd_size(), red_acc, tid, energy); \
if (vflag) __syncthreads(); \
if (tid==0) { \
engv[ei]=energy*(acctyp)0.5; \
ei+=ev_stride; \
} \
} \
if (vflag) { \
simdsync(); \
block_reduce_arr(6, simd_size(), red_acc, tid, virial); \
if (tid==0) { \
for (int r=0; r<6; r++) { \
engv[ei]=virial[r]*(acctyp)0.5; \
ei+=ev_stride; \
} \
} \
} \
} else if (offset==0 && ii<inum) { \
int ei=ii; \ int ei=ii; \
if (eflag>0) { \ if (EVFLAG && eflag) { \
engv[ei]=energy*(acctyp)0.5; \ engv[ei]=energy*(acctyp)0.5; \
ei+=inum; \ ei+=inum; \
} \ } \
if (vflag>0) { \ if (EVFLAG && vflag) { \
for (int i=0; i<6; i++) { \ for (int i=0; i<6; i++) { \
engv[ei]=virial[i]*(acctyp)0.5; \ engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \ ei+=inum; \
} \ } \
} \ } \
ans[ii]=f; \ } \
} }
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \ #define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \ t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \ if (t_per_atom>1) { \
__local acctyp red_acc[6][BLOCK_PAIR]; \ simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \
red_acc[0][tid]=f.x; \ if (EVFLAG && (vflag==2 || eflag==2)) { \
red_acc[1][tid]=f.y; \ if (eflag) { \
red_acc[2][tid]=f.z; \ simdsync(); \
red_acc[3][tid]=energy; \ simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \
red_acc[4][tid]=e_coul; \ } \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ if (vflag) { \
if (offset < s) { \ simdsync(); \
for (int r=0; r<5; r++) \ simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \ } \
} \ } \
f.x=red_acc[0][tid]; \ } \
f.y=red_acc[1][tid]; \ if (offset==0 && ii<inum) ans[ii]=f; \
f.z=red_acc[2][tid]; \ if (EVFLAG && (eflag || vflag)) { \
energy=red_acc[3][tid]; \ int ei=BLOCK_ID_X; \
e_coul=red_acc[4][tid]; \ const int ev_stride=NUM_BLOCKS_X; \
if (vflag>0) { \ if (eflag!=2 && vflag!=2) { \
for (int r=0; r<6; r++) \ if (eflag) { \
red_acc[r][tid]=virial[r]; \ simdsync(); \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul); \
if (offset < s) { \ if (vflag) __syncthreads(); \
for (int r=0; r<6; r++) \ if (tid==0) { \
red_acc[r][tid] += red_acc[r][tid+s]; \ engv[ei]=energy*(acctyp)0.5; \
ei+=ev_stride; \
engv[ei]=e_coul*(acctyp)0.5; \
ei+=ev_stride; \
} \ } \
} \ } \
for (int r=0; r<6; r++) \ if (vflag) { \
virial[r]=red_acc[r][tid]; \ simdsync(); \
block_reduce_arr(6, simd_size(), red_acc, tid, virial); \
if (tid==0) { \
for (int r=0; r<6; r++) { \
engv[ei]=virial[r]*(acctyp)0.5; \
ei+=ev_stride; \
} \ } \
} \ } \
if (offset==0) { \ } \
} else if (offset==0 && ii<inum) { \
int ei=ii; \ int ei=ii; \
if (eflag>0) { \ if (EVFLAG && eflag) { \
engv[ei]=energy*(acctyp)0.5; \ engv[ei]=energy*(acctyp)0.5; \
ei+=inum; \ ei+=inum; \
engv[ei]=e_coul*(acctyp)0.5; \ engv[ei]=e_coul*(acctyp)0.5; \
ei+=inum; \ ei+=inum; \
} \ } \
if (vflag>0) { \ if (EVFLAG && vflag) { \
for (int i=0; i<6; i++) { \ for (int i=0; i<6; i++) { \
engv[ei]=virial[i]*(acctyp)0.5; \ engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \ ei+=inum; \
} \ } \
} \ } \
ans[ii]=f; \ } \
} }
#else #else
#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ #define simd_reduce_add1(width, one) \
eflag, vflag, ans, engv) \ for (unsigned int s=width/2; s>0; s>>=1) one += shfl_down(one, s, width);
#define simd_reduce_add2(width, one, two) \
for (unsigned int s=width/2; s>0; s>>=1) { \
one += shfl_down(one, s, width); \
two += shfl_down(two, s, width); \
}
#define simd_reduce_add3(width, one, two, three) \
for (unsigned int s=width/2; s>0; s>>=1) { \
one += shfl_down(one, s, width); \
two += shfl_down(two, s, width); \
three += shfl_down(three, s, width); \
}
#define simd_reduce_add6(width, one, two, three, four, five, six) \
for (unsigned int s=width/2; s>0; s>>=1) { \
one += shfl_down(one, s, width); \
two += shfl_down(two, s, width); \
three += shfl_down(three, s, width); \
four += shfl_down(four, s, width); \
five += shfl_down(five, s, width); \
six += shfl_down(six, s, width); \
}
#define simd_reduce_arr(trip, width, arr) \
for (unsigned int s=width/2; s>0; s>>=1) { \
for (int r=0; r<trip; r++) \
arr[r] += shfl_down(arr[r], s, width); \
}
#if (EVFLAG == 1)
#define local_allocate_store_pair() \
__local acctyp red_acc[7][BLOCK_PAIR / SIMD_SIZE];
#define local_allocate_store_charge() \
__local acctyp red_acc[8][BLOCK_PAIR / SIMD_SIZE];
#define local_allocate_store_bio() \
__local acctyp red_acc[8][BLOCK_BIO_PAIR / SIMD_SIZE];
#define local_allocate_store_ellipse()
#define local_allocate_store_three() \
__local acctyp red_acc[7][BLOCK_ELLIPSE / SIMD_SIZE];
#define store_answers(f, energy, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \ if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
f.x += shfl_xor(f.x, s, t_per_atom); \ if (vflag==2 || eflag==2) { \
f.y += shfl_xor(f.y, s, t_per_atom); \ if (eflag) \
f.z += shfl_xor(f.z, s, t_per_atom); \ simd_reduce_add1(t_per_atom,energy); \
energy += shfl_xor(energy, s, t_per_atom); \ if (vflag) \
simd_reduce_arr(6, t_per_atom,virial); \
} \ } \
if (vflag>0) { \ } \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ if (offset==0 && ii<inum) ans[ii]=f; \
for (int r=0; r<6; r++) \ if (eflag || vflag) { \
virial[r] += shfl_xor(virial[r], s, t_per_atom); \ if (eflag!=2 && vflag!=2) { \
const int vwidth = simd_size(); \
const int voffset = tid & (simd_size() - 1); \
const int bnum = tid/simd_size(); \
int active_subgs = BLOCK_SIZE_X/simd_size(); \
for ( ; active_subgs > 1; active_subgs /= vwidth) { \
if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \
if (bnum < active_subgs) { \
if (eflag) { \
simd_reduce_add1(vwidth, energy); \
if (voffset==0) red_acc[6][bnum] = energy; \
} \
if (vflag) { \
simd_reduce_arr(6, vwidth, virial); \
if (voffset==0) \
for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \
} \
} \
\
__syncthreads(); \
if (tid < active_subgs) { \
if (eflag) energy = red_acc[6][tid]; \
if (vflag) \
for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \
} else { \
if (eflag) energy = (acctyp)0; \
if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \
} \
} \
\
if (bnum == 0) { \
int ei=BLOCK_ID_X; \
const int ev_stride=NUM_BLOCKS_X; \
if (eflag) { \
simd_reduce_add1(vwidth, energy); \
if (tid==0) { \
engv[ei]=energy*(acctyp)0.5; \
ei+=ev_stride; \
} \
} \
if (vflag) { \
simd_reduce_arr(6, vwidth, virial); \
if (tid==0) { \
for (int r=0; r<6; r++) { \
engv[ei]=virial[r]*(acctyp)0.5; \
ei+=ev_stride; \
} \ } \
} \ } \
} \ } \
if (offset==0) { \ } \
} else if (offset==0 && ii<inum) { \
int ei=ii; \ int ei=ii; \
if (eflag>0) { \ if (eflag) { \
engv[ei]=energy*(acctyp)0.5; \ engv[ei]=energy*(acctyp)0.5; \
ei+=inum; \ ei+=inum; \
} \ } \
if (vflag>0) { \ if (vflag) { \
for (int i=0; i<6; i++) { \ for (int i=0; i<6; i++) { \
engv[ei]=virial[i]*(acctyp)0.5; \ engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \ ei+=inum; \
} \ } \
} \ } \
ans[ii]=f; \ } \
} }
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \ #define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \ t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \ if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
f.x += shfl_xor(f.x, s, t_per_atom); \ if (vflag==2 || eflag==2) { \
f.y += shfl_xor(f.y, s, t_per_atom); \ if (eflag) \
f.z += shfl_xor(f.z, s, t_per_atom); \ simd_reduce_add2(t_per_atom,energy,e_coul); \
energy += shfl_xor(energy, s, t_per_atom); \ if (vflag) \
e_coul += shfl_xor(e_coul, s, t_per_atom); \ simd_reduce_arr(6, t_per_atom,virial); \
} \ } \
if (vflag>0) { \ } \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ if (offset==0 && ii<inum) ans[ii]=f; \
for (int r=0; r<6; r++) \ if (eflag || vflag) { \
virial[r] += shfl_xor(virial[r], s, t_per_atom); \ if (eflag!=2 && vflag!=2) { \
const int vwidth = simd_size(); \
const int voffset = tid & (simd_size() - 1); \
const int bnum = tid/simd_size(); \
int active_subgs = BLOCK_SIZE_X/simd_size(); \
for ( ; active_subgs > 1; active_subgs /= vwidth) { \
if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \
if (bnum < active_subgs) { \
if (eflag) { \
simd_reduce_add2(vwidth, energy, e_coul); \
if (voffset==0) { \
red_acc[6][bnum] = energy; \
red_acc[7][bnum] = e_coul; \
} \
} \
if (vflag) { \
simd_reduce_arr(6, vwidth, virial); \
if (voffset==0) \
for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \
} \
} \
\
__syncthreads(); \
if (tid < active_subgs) { \
if (eflag) { \
energy = red_acc[6][tid]; \
e_coul = red_acc[7][tid]; \
} \
if (vflag) \
for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \
} else { \
if (eflag) energy = e_coul = (acctyp)0; \
if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \
} \
} \
\
if (bnum == 0) { \
int ei=BLOCK_ID_X; \
const int ev_stride=NUM_BLOCKS_X; \
if (eflag) { \
simd_reduce_add2(vwidth, energy, e_coul); \
if (tid==0) { \
engv[ei]=energy*(acctyp)0.5; \
ei+=ev_stride; \
engv[ei]=e_coul*(acctyp)0.5; \
ei+=ev_stride; \
} \
} \
if (vflag) { \
simd_reduce_arr(6, vwidth, virial); \
if (tid==0) { \
for (int r=0; r<6; r++) { \
engv[ei]=virial[r]*(acctyp)0.5; \
ei+=ev_stride; \
} \ } \
} \ } \
} \ } \
if (offset==0) { \ } \
} else if (offset==0 && ii<inum) { \
int ei=ii; \ int ei=ii; \
if (eflag>0) { \ if (eflag) { \
engv[ei]=energy*(acctyp)0.5; \ engv[ei]=energy*(acctyp)0.5; \
ei+=inum; \ ei+=inum; \
engv[ei]=e_coul*(acctyp)0.5; \ engv[ei]=e_coul*(acctyp)0.5; \
ei+=inum; \ ei+=inum; \
} \ } \
if (vflag>0) { \ if (vflag) { \
for (int i=0; i<6; i++) { \ for (int i=0; i<6; i++) { \
engv[ei]=virial[i]*(acctyp)0.5; \ engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \ ei+=inum; \
} \ } \
} \ } \
ans[ii]=f; \ } \
} }
#else
#define local_allocate_store_pair()
#define local_allocate_store_charge()
#define local_allocate_store_bio()
#define local_allocate_store_ellipse()
#define local_allocate_store_three()
#define store_answers(f, energy, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) \
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
if (offset==0 && ii<inum) ans[ii]=f;
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) \
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
if (offset==0 && ii<inum) ans[ii]=f;
#endif
#endif #endif

View File

@ -21,12 +21,15 @@ namespace LAMMPS_AL {
extern Device<PRECISION,ACC_PRECISION> global_device; extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
BaseAtomicT::BaseAtomic() : _compiled(false), _max_bytes(0) { BaseAtomicT::BaseAtomic() : _compiled(false), _max_bytes(0), _onetype(0) {
device=&global_device; device=&global_device;
ans=new Answer<numtyp,acctyp>(); ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor(); nbor=new Neighbor();
pair_program=nullptr; pair_program=nullptr;
ucl_device=nullptr; ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
pair_program_noev=nullptr;
#endif
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -36,6 +39,10 @@ BaseAtomicT::~BaseAtomic() {
k_pair_fast.clear(); k_pair_fast.clear();
k_pair.clear(); k_pair.clear();
if (pair_program) delete pair_program; if (pair_program) delete pair_program;
#if defined(LAL_OCL_EV_JIT)
k_pair_noev.clear();
if (pair_program_noev) delete pair_program_noev;
#endif
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -49,7 +56,7 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial, const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split, const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program, FILE *_screen, const void *pair_program,
const char *k_name) { const char *k_name, const int onetype) {
screen=_screen; screen=_screen;
int gpu_nbor=0; int gpu_nbor=0;
@ -64,28 +71,29 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
_gpu_host=1; _gpu_host=1;
_threads_per_atom=device->threads_per_atom(); _threads_per_atom=device->threads_per_atom();
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,false,false,nlocal,nall,maxspecial); int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
if (success!=0) if (success!=0)
return success; return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;
if (ucl_device!=device->gpu) _compiled=false; if (ucl_device!=device->gpu) _compiled=false;
ucl_device=device->gpu; ucl_device=device->gpu;
atom=&device->atom; atom=&device->atom;
_block_size=device->pair_block_size(); _block_size=device->pair_block_size();
compile_kernels(*ucl_device,pair_program,k_name); compile_kernels(*ucl_device,pair_program,k_name,onetype);
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;
// Initialize host-device load balancer // Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split); hd_balancer.init(device,gpu_nbor,gpu_split);
@ -102,8 +110,8 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BaseAtomicT::estimate_gpu_overhead() { void BaseAtomicT::estimate_gpu_overhead(const int add_kernels) {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -164,8 +172,8 @@ inline void BaseAtomicT::build_nbor_list(const int inum, const int host_inum,
atom->cast_copy_x(host_x,host_type); atom->cast_copy_x(host_x,host_type);
int mn; int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
nspecial, special, success, mn); tag, nspecial, special, success, mn, ans->error_flag);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes) if (bytes>_max_an_bytes)
@ -179,11 +187,25 @@ template <class numtyp, class acctyp>
void BaseAtomicT::compute(const int f_ago, const int inum_full, void BaseAtomicT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, int &host_start, const double cpu_time,
bool &success) { bool &success) {
acc_timers(); acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
set_kernel(eflag,vflag);
if (inum_full==0) { if (inum_full==0) {
host_start=0; host_start=0;
// Make sure textures are correct if realloc by a different hybrid style // Make sure textures are correct if realloc by a different hybrid style
@ -207,8 +229,8 @@ void BaseAtomicT::compute(const int f_ago, const int inum_full,
hd_balancer.start_timer(); hd_balancer.start_timer();
atom->add_x_data(host_x,host_type); atom->add_x_data(host_x,host_type);
loop(eflag,vflag); const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans); device->add_ans_object(ans);
hd_balancer.stop_timer(); hd_balancer.stop_timer();
} }
@ -220,12 +242,26 @@ template <class numtyp, class acctyp>
int ** BaseAtomicT::compute(const int ago, const int inum_full, int ** BaseAtomicT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag, int **nspecial, tagint **special,
const bool vflag, const bool eatom, const bool eflag_in, const bool vflag_in,
const bool vatom, int &host_start, const bool eatom, const bool vatom,
int **ilist, int **jnum, int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success) { const double cpu_time, bool &success) {
acc_timers(); acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
set_kernel(eflag,vflag);
if (inum_full==0) { if (inum_full==0) {
host_start=0; host_start=0;
// Make sure textures are correct if realloc by a different hybrid style // Make sure textures are correct if realloc by a different hybrid style
@ -254,8 +290,8 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full,
*ilist=nbor->host_ilist.begin(); *ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin(); *jnum=nbor->host_acc.begin();
loop(eflag,vflag); const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans); device->add_ans_object(ans);
hd_balancer.stop_timer(); hd_balancer.stop_timer();
@ -270,19 +306,46 @@ double BaseAtomicT::host_memory_usage_atomic() const {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str, void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *kname) { const char *kname, const int onetype) {
if (_compiled) if (_compiled && _onetype==onetype)
return; return;
_onetype=onetype;
std::string s_fast=std::string(kname)+"_fast"; std::string s_fast=std::string(kname)+"_fast";
if (pair_program) delete pair_program; if (pair_program) delete pair_program;
pair_program=new UCL_Program(dev); pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str()); std::string oclstring = device->compile_string()+" -DEVFLAG=1";
if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype);
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname); k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex"); pos_tex.get_texture(*pair_program,"pos_tex");
#if defined(LAL_OCL_EV_JIT)
oclstring = device->compile_string()+" -DEVFLAG=0";
if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype);
if (pair_program_noev) delete pair_program_noev;
pair_program_noev=new UCL_Program(dev);
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_noev.set_function(*pair_program_noev,s_fast.c_str());
#else
k_pair_sel = &k_pair_fast;
#endif
_compiled=true; _compiled=true;
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif
} }
template class BaseAtomic<PRECISION,ACC_PRECISION>; template class BaseAtomic<PRECISION,ACC_PRECISION>;

View File

@ -53,10 +53,11 @@ class BaseAtomic {
int init_atomic(const int nlocal, const int nall, const int max_nbors, int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, const double gpu_split, FILE *screen,
const void *pair_program, const char *k_name); const void *pair_program, const char *k_name,
const int onetype=0);
/// Estimate the overhead for GPU context changes and CPU driver /// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead(); void estimate_gpu_overhead(const int add_kernels=0);
/// Check if there is enough storage for atom arrays and realloc if not /// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/ /** \param success set to false if insufficient memory **/
@ -100,7 +101,7 @@ class BaseAtomic {
/// Accumulate timers /// Accumulate timers
inline void acc_timers() { inline void acc_timers() {
if (device->time_device()) { if (device->time_device()) {
nbor->acc_timers(); nbor->acc_timers(screen);
time_pair.add_to_total(); time_pair.add_to_total();
atom->acc_timers(); atom->acc_timers();
ans->acc_timers(); ans->acc_timers();
@ -179,23 +180,31 @@ class BaseAtomic {
Neighbor *nbor; Neighbor *nbor;
// ------------------------- DEVICE KERNELS ------------------------- // ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program; UCL_Program *pair_program, *pair_program_noev;
UCL_Kernel k_pair_fast, k_pair; UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel;
inline int block_size() { return _block_size; } inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (eflag || vflag) k_pair_sel = &k_pair_fast;
else k_pair_sel = &k_pair_noev;
#endif
}
// --------------------------- TEXTURES ----------------------------- // --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex; UCL_Texture pos_tex;
protected: protected:
bool _compiled; bool _compiled;
int _block_size, _threads_per_atom; int _block_size, _threads_per_atom, _onetype;
double _max_bytes, _max_an_bytes; double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead; double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data; UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k,
const int onetype);
virtual void loop(const bool _eflag, const bool _vflag) = 0; virtual int loop(const int eflag, const int vflag) = 0;
}; };
} }

View File

@ -27,6 +27,9 @@ BaseChargeT::BaseCharge() : _compiled(false), _max_bytes(0) {
nbor=new Neighbor(); nbor=new Neighbor();
pair_program=nullptr; pair_program=nullptr;
ucl_device=nullptr; ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
pair_program_noev=nullptr;
#endif
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -36,6 +39,10 @@ BaseChargeT::~BaseCharge() {
k_pair_fast.clear(); k_pair_fast.clear();
k_pair.clear(); k_pair.clear();
if (pair_program) delete pair_program; if (pair_program) delete pair_program;
#if defined(LAL_OCL_EV_JIT)
k_pair_noev.clear();
if (pair_program_noev) delete pair_program_noev;
#endif
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -64,21 +71,11 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
_gpu_host=1; _gpu_host=1;
_threads_per_atom=device->threads_per_charge(); _threads_per_atom=device->threads_per_charge();
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,false,nlocal,nall,maxspecial); int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
if (success!=0) if (success!=0)
return success; return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;
if (ucl_device!=device->gpu) _compiled=false; if (ucl_device!=device->gpu) _compiled=false;
ucl_device=device->gpu; ucl_device=device->gpu;
@ -88,6 +85,17 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
_block_bio_size=device->block_bio_pair(); _block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program,k_name); compile_kernels(*ucl_device,pair_program,k_name);
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;
// Initialize host-device load balancer // Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split); hd_balancer.init(device,gpu_nbor,gpu_split);
@ -104,8 +112,8 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BaseChargeT::estimate_gpu_overhead() { void BaseChargeT::estimate_gpu_overhead(const int add_kernels) {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -166,8 +174,8 @@ inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum,
atom->cast_copy_x(host_x,host_type); atom->cast_copy_x(host_x,host_type);
int mn; int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
nspecial, special, success, mn); tag, nspecial, special, success, mn, ans->error_flag);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes) if (bytes>_max_an_bytes)
@ -181,12 +189,26 @@ template <class numtyp, class acctyp>
void BaseChargeT::compute(const int f_ago, const int inum_full, void BaseChargeT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, int &host_start, const double cpu_time,
bool &success, double *host_q, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) { const int nlocal, double *boxlo, double *prd) {
acc_timers(); acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
set_kernel(eflag,vflag);
if (inum_full==0) { if (inum_full==0) {
host_start=0; host_start=0;
// Make sure textures are correct if realloc by a different hybrid style // Make sure textures are correct if realloc by a different hybrid style
@ -215,8 +237,8 @@ void BaseChargeT::compute(const int f_ago, const int inum_full,
device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
boxlo, prd); boxlo, prd);
loop(eflag,vflag); const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans); device->add_ans_object(ans);
hd_balancer.stop_timer(); hd_balancer.stop_timer();
} }
@ -228,13 +250,27 @@ template <class numtyp, class acctyp>
int** BaseChargeT::compute(const int ago, const int inum_full, int** BaseChargeT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag, int **nspecial, tagint **special,
const bool vflag, const bool eatom, const bool eflag_in, const bool vflag_in,
const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, int **ilist, int **jnum,
const double cpu_time, bool &success, const double cpu_time, bool &success,
double *host_q, double *boxlo, double *prd) { double *host_q, double *boxlo, double *prd) {
acc_timers(); acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
set_kernel(eflag,vflag);
if (inum_full==0) { if (inum_full==0) {
host_start=0; host_start=0;
// Make sure textures are correct if realloc by a different hybrid style // Make sure textures are correct if realloc by a different hybrid style
@ -269,8 +305,8 @@ int** BaseChargeT::compute(const int ago, const int inum_full,
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd); boxlo, prd);
loop(eflag,vflag); const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans); device->add_ans_object(ans);
hd_balancer.stop_timer(); hd_balancer.stop_timer();
@ -292,13 +328,37 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
std::string s_fast=std::string(kname)+"_fast"; std::string s_fast=std::string(kname)+"_fast";
if (pair_program) delete pair_program; if (pair_program) delete pair_program;
pair_program=new UCL_Program(dev); pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str()); std::string oclstring = device->compile_string()+" -DEVFLAG=1";
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname); k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex"); pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex"); q_tex.get_texture(*pair_program,"q_tex");
#if defined(LAL_OCL_EV_JIT)
oclstring = device->compile_string()+" -DEVFLAG=0";
if (pair_program_noev) delete pair_program_noev;
pair_program_noev=new UCL_Program(dev);
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_noev.set_function(*pair_program_noev,s_fast.c_str());
#else
k_pair_sel = &k_pair_fast;
#endif
_compiled=true; _compiled=true;
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif
} }
template class BaseCharge<PRECISION,ACC_PRECISION>; template class BaseCharge<PRECISION,ACC_PRECISION>;

View File

@ -57,7 +57,7 @@ class BaseCharge {
const void *pair_program, const char *k_name); const void *pair_program, const char *k_name);
/// Estimate the overhead for GPU context changes and CPU driver /// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead(); void estimate_gpu_overhead(const int add_kernels=0);
/// Check if there is enough storage for atom arrays and realloc if not /// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/ /** \param success set to false if insufficient memory **/
@ -103,7 +103,7 @@ class BaseCharge {
/// Accumulate timers /// Accumulate timers
inline void acc_timers() { inline void acc_timers() {
if (device->time_device()) { if (device->time_device()) {
nbor->acc_timers(); nbor->acc_timers(screen);
time_pair.add_to_total(); time_pair.add_to_total();
atom->acc_timers(); atom->acc_timers();
ans->acc_timers(); ans->acc_timers();
@ -177,9 +177,15 @@ class BaseCharge {
Neighbor *nbor; Neighbor *nbor;
// ------------------------- DEVICE KERNELS ------------------------- // ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program; UCL_Program *pair_program, *pair_program_noev;
UCL_Kernel k_pair_fast, k_pair; UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel;
inline int block_size() { return _block_size; } inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (eflag || vflag) k_pair_sel = &k_pair_fast;
else k_pair_sel = &k_pair_noev;
#endif
}
// --------------------------- TEXTURES ----------------------------- // --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex; UCL_Texture pos_tex;
@ -194,7 +200,7 @@ class BaseCharge {
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
virtual void loop(const bool _eflag, const bool _vflag) = 0; virtual int loop(const int eflag, const int vflag) = 0;
}; };
} }

View File

@ -27,6 +27,9 @@ BaseDipoleT::BaseDipole() : _compiled(false), _max_bytes(0) {
nbor=new Neighbor(); nbor=new Neighbor();
pair_program=nullptr; pair_program=nullptr;
ucl_device=nullptr; ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
pair_program_noev=nullptr;
#endif
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -36,6 +39,10 @@ BaseDipoleT::~BaseDipole() {
k_pair_fast.clear(); k_pair_fast.clear();
k_pair.clear(); k_pair.clear();
if (pair_program) delete pair_program; if (pair_program) delete pair_program;
#if defined(LAL_OCL_EV_JIT)
k_pair_noev.clear();
if (pair_program_noev) delete pair_program_noev;
#endif
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -65,30 +72,30 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
_gpu_host=1; _gpu_host=1;
_threads_per_atom=device->threads_per_charge(); _threads_per_atom=device->threads_per_charge();
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,true,nlocal,nall,maxspecial); int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
if (success!=0) if (success!=0)
return success; return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;
if (ucl_device!=device->gpu) _compiled=false; if (ucl_device!=device->gpu) _compiled=false;
ucl_device=device->gpu; ucl_device=device->gpu;
atom=&device->atom; atom=&device->atom;
_block_size=device->pair_block_size(); _block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program,k_name); compile_kernels(*ucl_device,pair_program,k_name);
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;
// Initialize host-device load balancer // Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split); hd_balancer.init(device,gpu_nbor,gpu_split);
@ -168,8 +175,8 @@ inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum,
atom->cast_copy_x(host_x,host_type); atom->cast_copy_x(host_x,host_type);
int mn; int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
nspecial, special, success, mn); tag, nspecial, special, success, mn, ans->error_flag);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes) if (bytes>_max_an_bytes)
@ -183,12 +190,26 @@ template <class numtyp, class acctyp>
void BaseDipoleT::compute(const int f_ago, const int inum_full, void BaseDipoleT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, int &host_start, const double cpu_time,
bool &success, double *host_q, double **host_mu, bool &success, double *host_q, double **host_mu,
const int nlocal, double *boxlo, double *prd) { const int nlocal, double *boxlo, double *prd) {
acc_timers(); acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
set_kernel(eflag,vflag);
if (inum_full==0) { if (inum_full==0) {
host_start=0; host_start=0;
// Make sure textures are correct if realloc by a different hybrid style // Make sure textures are correct if realloc by a different hybrid style
@ -219,8 +240,8 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full,
device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
boxlo, prd); boxlo, prd);
loop(eflag,vflag); const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans); device->add_ans_object(ans);
hd_balancer.stop_timer(); hd_balancer.stop_timer();
} }
@ -232,14 +253,28 @@ template <class numtyp, class acctyp>
int** BaseDipoleT::compute(const int ago, const int inum_full, int** BaseDipoleT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag, int **nspecial, tagint **special,
const bool vflag, const bool eatom, const bool eflag_in, const bool vflag_in,
const bool vatom, int &host_start, const bool eatom, const bool vatom,
int **ilist, int **jnum, int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success, const double cpu_time, bool &success,
double *host_q, double **host_mu, double *host_q, double **host_mu,
double *boxlo, double *prd) { double *boxlo, double *prd) {
acc_timers(); acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
set_kernel(eflag,vflag);
if (inum_full==0) { if (inum_full==0) {
host_start=0; host_start=0;
// Make sure textures are correct if realloc by a different hybrid style // Make sure textures are correct if realloc by a different hybrid style
@ -277,8 +312,8 @@ int** BaseDipoleT::compute(const int ago, const int inum_full,
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd); boxlo, prd);
loop(eflag,vflag); const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans); device->add_ans_object(ans);
hd_balancer.stop_timer(); hd_balancer.stop_timer();
@ -300,14 +335,38 @@ void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str,
std::string s_fast=std::string(kname)+"_fast"; std::string s_fast=std::string(kname)+"_fast";
if (pair_program) delete pair_program; if (pair_program) delete pair_program;
pair_program=new UCL_Program(dev); pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str()); std::string oclstring = device->compile_string()+" -DEVFLAG=1";
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname); k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex"); pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex"); q_tex.get_texture(*pair_program,"q_tex");
mu_tex.get_texture(*pair_program,"mu_tex"); mu_tex.get_texture(*pair_program,"mu_tex");
#if defined(LAL_OCL_EV_JIT)
oclstring = device->compile_string()+" -DEVFLAG=0";
if (pair_program_noev) delete pair_program_noev;
pair_program_noev=new UCL_Program(dev);
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_noev.set_function(*pair_program_noev,s_fast.c_str());
#else
k_pair_sel = &k_pair_fast;
#endif
_compiled=true; _compiled=true;
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif
} }
template class BaseDipole<PRECISION,ACC_PRECISION>; template class BaseDipole<PRECISION,ACC_PRECISION>;

View File

@ -102,7 +102,7 @@ class BaseDipole {
/// Accumulate timers /// Accumulate timers
inline void acc_timers() { inline void acc_timers() {
if (device->time_device()) { if (device->time_device()) {
nbor->acc_timers(); nbor->acc_timers(screen);
time_pair.add_to_total(); time_pair.add_to_total();
atom->acc_timers(); atom->acc_timers();
ans->acc_timers(); ans->acc_timers();
@ -176,9 +176,16 @@ class BaseDipole {
Neighbor *nbor; Neighbor *nbor;
// ------------------------- DEVICE KERNELS ------------------------- // ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program; UCL_Program *pair_program, *pair_program_noev;
UCL_Kernel k_pair_fast, k_pair; UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel;
inline int block_size() { return _block_size; } inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (eflag || vflag) k_pair_sel = &k_pair_fast;
else k_pair_sel = &k_pair_noev;
#endif
}
// --------------------------- TEXTURES ----------------------------- // --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex; UCL_Texture pos_tex;
@ -187,14 +194,14 @@ class BaseDipole {
protected: protected:
bool _compiled; bool _compiled;
int _block_size, _block_bio_size, _threads_per_atom; int _block_size, _threads_per_atom;
double _max_bytes, _max_an_bytes; double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead; double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data; UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
virtual void loop(const bool _eflag, const bool _vflag) = 0; virtual int loop(const int eflag, const int vflag) = 0;
}; };
} }

View File

@ -27,6 +27,9 @@ BaseDPDT::BaseDPD() : _compiled(false), _max_bytes(0) {
nbor=new Neighbor(); nbor=new Neighbor();
pair_program=nullptr; pair_program=nullptr;
ucl_device=nullptr; ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
pair_program_noev=nullptr;
#endif
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -36,6 +39,10 @@ BaseDPDT::~BaseDPD() {
k_pair_fast.clear(); k_pair_fast.clear();
k_pair.clear(); k_pair.clear();
if (pair_program) delete pair_program; if (pair_program) delete pair_program;
#if defined(LAL_OCL_EV_JIT)
k_pair_noev.clear();
if (pair_program_noev) delete pair_program_noev;
#endif
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -47,9 +54,9 @@ int BaseDPDT::bytes_per_atom_atomic(const int max_nbors) const {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int BaseDPDT::init_atomic(const int nlocal, const int nall, int BaseDPDT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial, const int max_nbors, const int maxspecial,
const double cell_size, const double cell_size, const double gpu_split,
const double gpu_split, FILE *_screen, FILE *_screen, const void *pair_program,
const void *pair_program, const char *k_name) { const char *k_name, const int onetype) {
screen=_screen; screen=_screen;
int gpu_nbor=0; int gpu_nbor=0;
@ -63,31 +70,30 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
if (host_nlocal>0) if (host_nlocal>0)
_gpu_host=1; _gpu_host=1;
_threads_per_atom=device->threads_per_charge(); _threads_per_atom=device->threads_per_atom();
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true); int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
if (success!=0) if (success!=0)
return success; return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;
if (ucl_device!=device->gpu) _compiled=false; if (ucl_device!=device->gpu) _compiled=false;
ucl_device=device->gpu; ucl_device=device->gpu;
atom=&device->atom; atom=&device->atom;
_block_size=device->pair_block_size(); _block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair(); compile_kernels(*ucl_device,pair_program,k_name,onetype);
compile_kernels(*ucl_device,pair_program,k_name);
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;
// Initialize host-device load balancer // Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split); hd_balancer.init(device,gpu_nbor,gpu_split);
@ -167,8 +173,8 @@ inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum,
atom->cast_copy_x(host_x,host_type); atom->cast_copy_x(host_x,host_type);
int mn; int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
nspecial, special, success, mn); tag, nspecial, special, success, mn, ans->error_flag);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes) if (bytes>_max_an_bytes)
@ -179,16 +185,30 @@ inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum,
// Copy nbor list from host if necessary and then calculate forces, virials,.. // Copy nbor list from host if necessary and then calculate forces, virials,..
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BaseDPDT::compute(const int f_ago, const int inum_full, void BaseDPDT::compute(const int f_ago, const int inum_full, const int nall,
const int nall, double **host_x, int *host_type, double **host_x, int *host_type, int *ilist, int *numj,
int *ilist, int *numj, int **firstneigh, int **firstneigh, const bool eflag_in,
const bool eflag, const bool vflag, const bool vflag_in, const bool eatom,
const bool eatom, const bool vatom, const bool vatom, int &host_start,
int &host_start, const double cpu_time, const double cpu_time, bool &success, tagint *tag,
bool &success, tagint *tag, double **host_v, double **host_v, const double dtinvsqrt,
const double dtinvsqrt, const int seed, const int timestep, const int seed, const int timestep,
const int nlocal, double *boxlo, double *prd) { const int nlocal, double *boxlo, double *prd) {
acc_timers(); acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
set_kernel(eflag,vflag);
if (inum_full==0) { if (inum_full==0) {
host_start=0; host_start=0;
// Make sure textures are correct if realloc by a different hybrid style // Make sure textures are correct if realloc by a different hybrid style
@ -218,8 +238,8 @@ void BaseDPDT::compute(const int f_ago, const int inum_full,
_seed = seed; _seed = seed;
_timestep = timestep; _timestep = timestep;
loop(eflag,vflag); const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans); device->add_ans_object(ans);
hd_balancer.stop_timer(); hd_balancer.stop_timer();
} }
@ -231,8 +251,8 @@ template <class numtyp, class acctyp>
int** BaseDPDT::compute(const int ago, const int inum_full, int** BaseDPDT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag, int **nspecial, tagint **special, const bool eflag_in,
const bool vflag, const bool eatom, const bool vflag_in, const bool eatom,
const bool vatom, int &host_start, const bool vatom, int &host_start,
int **ilist, int **jnum, int **ilist, int **jnum,
const double cpu_time, bool &success, const double cpu_time, bool &success,
@ -240,6 +260,20 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
const int seed, const int timestep, const int seed, const int timestep,
double *boxlo, double *prd) { double *boxlo, double *prd) {
acc_timers(); acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
set_kernel(eflag,vflag);
if (inum_full==0) { if (inum_full==0) {
host_start=0; host_start=0;
// Make sure textures are correct if realloc by a different hybrid style // Make sure textures are correct if realloc by a different hybrid style
@ -275,8 +309,8 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
_seed = seed; _seed = seed;
_timestep = timestep; _timestep = timestep;
loop(eflag,vflag); const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans); device->add_ans_object(ans);
hd_balancer.stop_timer(); hd_balancer.stop_timer();
@ -291,20 +325,48 @@ double BaseDPDT::host_memory_usage_atomic() const {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BaseDPDT::compile_kernels(UCL_Device &dev, const void *pair_str, void BaseDPDT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *kname) { const char *kname, const int onetype) {
if (_compiled) if (_compiled && _onetype==onetype)
return; return;
_onetype=onetype;
std::string s_fast=std::string(kname)+"_fast"; std::string s_fast=std::string(kname)+"_fast";
if (pair_program) delete pair_program; if (pair_program) delete pair_program;
pair_program=new UCL_Program(dev); pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str()); std::string oclstring = device->compile_string()+" -DEVFLAG=1";
if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype);
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname); k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex"); pos_tex.get_texture(*pair_program,"pos_tex");
vel_tex.get_texture(*pair_program,"vel_tex"); vel_tex.get_texture(*pair_program,"vel_tex");
#if defined(LAL_OCL_EV_JIT)
oclstring = device->compile_string()+" -DEVFLAG=0";
if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype);
if (pair_program_noev) delete pair_program_noev;
pair_program_noev=new UCL_Program(dev);
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_noev.set_function(*pair_program_noev,s_fast.c_str());
#else
k_pair_sel = &k_pair_fast;
#endif
_compiled=true; _compiled=true;
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif
} }
template class BaseDPD<PRECISION,ACC_PRECISION>; template class BaseDPD<PRECISION,ACC_PRECISION>;

View File

@ -52,7 +52,8 @@ class BaseDPD {
int init_atomic(const int nlocal, const int nall, const int max_nbors, int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, const double gpu_split, FILE *screen,
const void *pair_program, const char *k_name); const void *pair_program, const char *k_name,
const int onetype=0);
/// Estimate the overhead for GPU context changes and CPU driver /// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead(); void estimate_gpu_overhead();
@ -101,7 +102,7 @@ class BaseDPD {
/// Accumulate timers /// Accumulate timers
inline void acc_timers() { inline void acc_timers() {
if (device->time_device()) { if (device->time_device()) {
nbor->acc_timers(); nbor->acc_timers(screen);
time_pair.add_to_total(); time_pair.add_to_total();
atom->acc_timers(); atom->acc_timers();
ans->acc_timers(); ans->acc_timers();
@ -177,9 +178,16 @@ class BaseDPD {
Neighbor *nbor; Neighbor *nbor;
// ------------------------- DEVICE KERNELS ------------------------- // ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program; UCL_Program *pair_program, *pair_program_noev;
UCL_Kernel k_pair_fast, k_pair; UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel;
inline int block_size() { return _block_size; } inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (eflag || vflag) k_pair_sel = &k_pair_fast;
else k_pair_sel = &k_pair_noev;
#endif
}
// --------------------------- TEXTURES ----------------------------- // --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex; UCL_Texture pos_tex;
@ -191,13 +199,14 @@ class BaseDPD {
protected: protected:
bool _compiled; bool _compiled;
int _block_size, _block_bio_size, _threads_per_atom; int _block_size, _threads_per_atom, _onetype;
double _max_bytes, _max_an_bytes; double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead; double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data; UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); void compile_kernels(UCL_Device &dev, const void *pair_string,
virtual void loop(const bool _eflag, const bool _vflag) = 0; const char *k, const int onetype);
virtual int loop(const int eflag, const int vflag) = 0;
}; };
} }

View File

@ -29,7 +29,8 @@ const char *ellipsoid_nbor=0;
extern Device<PRECISION,ACC_PRECISION> global_device; extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) { BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0),
host_olist_size(0) {
device=&global_device; device=&global_device;
ans=new Answer<numtyp,acctyp>(); ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor(); nbor=new Neighbor();
@ -37,6 +38,10 @@ BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) {
ellipsoid_program=nullptr; ellipsoid_program=nullptr;
lj_program=nullptr; lj_program=nullptr;
ucl_device=nullptr; ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
ellipsoid_program_noev=nullptr;
lj_program_noev=nullptr;
#endif
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -53,6 +58,14 @@ BaseEllipsoidT::~BaseEllipsoid() {
if (nbor_program) delete nbor_program; if (nbor_program) delete nbor_program;
if (ellipsoid_program) delete ellipsoid_program; if (ellipsoid_program) delete ellipsoid_program;
if (lj_program) delete lj_program; if (lj_program) delete lj_program;
#if defined(LAL_OCL_EV_JIT)
k_ellipsoid_noev.clear();
k_ellipsoid_sphere_noev.clear();
k_sphere_ellipsoid_noev.clear();
k_lj_fast.clear();
if (ellipsoid_program_noev) delete ellipsoid_program_noev;
if (lj_program_noev) delete lj_program_noev;
#endif
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -89,11 +102,6 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
if (success!=0) if (success!=0)
return success; return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,true,1);
if (success!=0)
return success;
if (ucl_device!=device->gpu) _compiled=false; if (ucl_device!=device->gpu) _compiled=false;
ucl_device=device->gpu; ucl_device=device->gpu;
@ -102,6 +110,11 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
_block_size=device->block_ellipse(); _block_size=device->block_ellipse();
compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere); compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere);
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,true,1);
if (success!=0)
return success;
// Initialize host-device load balancer // Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split); hd_balancer.init(device,gpu_nbor,gpu_split);
@ -133,12 +146,11 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
if (_multiple_forms && gpu_nbor!=0) if (_multiple_forms && gpu_nbor!=0)
return -9; return -9;
if (_multiple_forms) if (_multiple_forms) {
ans->force.zero(); ans->force.zero();
host_olist_size = nbor->max_atoms();
// Memory for ilist ordered by particle type host_olist = new int[nbor->max_atoms()];
if (host_olist.alloc(nbor->max_atoms(),*ucl_device)!=UCL_SUCCESS) }
return -3;
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
@ -160,7 +172,10 @@ template <class numtyp, class acctyp>
void BaseEllipsoidT::clear_base() { void BaseEllipsoidT::clear_base() {
// Output any timing information // Output any timing information
output_times(); output_times();
host_olist.clear(); if (host_olist_size) {
host_olist_size = 0;
delete []host_olist;
}
time_nbor1.clear(); time_nbor1.clear();
time_ellipsoid.clear(); time_ellipsoid.clear();
@ -206,10 +221,14 @@ void BaseEllipsoidT::output_times() {
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0, MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
device->replica()); device->replica());
double max_mb=mpi_max_bytes/(1024*1024); double max_mb=mpi_max_bytes/(1024*1024);
double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5];
#ifdef USE_OPENCL
// Workaround for timing issue on Intel OpenCL
if (times[3] > 80e6) times[3]=0.0;
#endif
if (device->replica_me()==0) if (device->replica_me()==0)
if (screen && times[5]>0.0) { if (screen && times[7]>0.0) {
int replica_size=device->replica_size(); int replica_size=device->replica_size();
fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"\n\n-------------------------------------");
@ -218,9 +237,8 @@ void BaseEllipsoidT::output_times() {
fprintf(screen,"\n-------------------------------------"); fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n"); fprintf(screen,"--------------------------------\n");
if (device->procs_per_gpu()==1 && t_time>0) { if (device->procs_per_gpu()==1 && times[3]>0) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size); fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size);
if (nbor->gpu_nbor()>0) if (nbor->gpu_nbor()>0)
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size); fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size);
@ -229,13 +247,15 @@ void BaseEllipsoidT::output_times() {
fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size); fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size); fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
} }
if (nbor->gpu_nbor()==2)
fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[9]/replica_size);
if (times[6]>0) if (times[6]>0)
fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom);
fprintf(screen,"Vector width: %d.\n", device->simd_size());
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
if (nbor->gpu_nbor()==2)
fprintf(screen,"CPU Neighbor: %.4f s.\n",times[9]/replica_size);
fprintf(screen,"CPU Cast/Pack: %.4f s.\n",times[5]/replica_size);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
fprintf(screen,"-------------------------------------"); fprintf(screen,"-------------------------------------");
@ -256,11 +276,13 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
if (shared_types) { if (shared_types) {
k_nbor_fast.set_size(GX,BX); k_nbor_fast.set_size(GX,BX);
k_nbor_fast.run(&atom->x, &cut_form, &nbor->dev_nbor, &stride, &start, k_nbor_fast.run(&atom->x, &cut_form, &nbor->dev_nbor, &stride, &start,
&inum, &nbor->dev_packed, &form_low, &form_high); &inum, &nbor->dev_packed, &form_low, &form_high,
&_threads_per_atom);
} else { } else {
k_nbor.set_size(GX,BX); k_nbor.set_size(GX,BX);
k_nbor.run(&atom->x, &cut_form, &ntypes, &nbor->dev_nbor, &stride, k_nbor.run(&atom->x, &cut_form, &ntypes, &nbor->dev_nbor, &stride,
&start, &inum, &nbor->dev_packed, &form_low, &form_high); &start, &inum, &nbor->dev_packed, &form_low, &form_high,
&_threads_per_atom);
} }
} }
@ -298,7 +320,7 @@ void BaseEllipsoidT::reset_nbors(const int nall, const int inum,
p++; p++;
} }
} }
nbor->get_host(inum,host_olist.begin(),numj,firstneigh,block_size()); nbor->get_host(inum,host_olist,numj,firstneigh,block_size());
nbor->copy_unpacked(inum,mn); nbor->copy_unpacked(inum,mn);
return; return;
} }
@ -330,8 +352,8 @@ inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum,
atom->cast_copy_x(host_x,host_type); atom->cast_copy_x(host_x,host_type);
int mn; int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
nspecial, special, success, mn); tag, nspecial, special, success, mn, ans->error_flag);
nbor->copy_unpacked(inum,mn); nbor->copy_unpacked(inum,mn);
_last_ellipse=inum; _last_ellipse=inum;
_max_last_ellipse=inum; _max_last_ellipse=inum;
@ -348,11 +370,18 @@ template <class numtyp, class acctyp>
int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, int &host_start, const double cpu_time,
bool &success, double **host_quat) { bool &success, double **host_quat) {
acc_timers(); acc_timers();
int eflag, vflag;
if (eflag_in) eflag=2;
else eflag=0;
if (vflag_in) vflag=2;
else vflag=0;
set_kernel(eflag,vflag);
if (inum_full==0) { if (inum_full==0) {
host_start=0; host_start=0;
zero_timers(); zero_timers();
@ -373,7 +402,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
} }
int *list; int *list;
if (_multiple_forms) if (_multiple_forms)
list=host_olist.begin(); list=host_olist;
else else
list=ilist; list=ilist;
@ -384,7 +413,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
atom->add_quat_data(); atom->add_quat_data();
loop(eflag,vflag); loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,list); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,list,inum);
device->add_ans_object(ans); device->add_ans_object(ans);
hd_balancer.stop_timer(); hd_balancer.stop_timer();
return list; return list;
@ -394,15 +423,23 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
// Reneighbor on GPU if necessary and then compute forces, virials, energies // Reneighbor on GPU if necessary and then compute forces, virials, energies
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall, int** BaseEllipsoidT::compute(const int ago, const int inum_full,
double **host_x, int *host_type, double *sublo, const int nall, double **host_x, int *host_type,
double *subhi, tagint *tag, int **nspecial, double *sublo, double *subhi, tagint *tag,
tagint **special, const bool eflag, const bool vflag, int **nspecial, tagint **special,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum, int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success, const double cpu_time, bool &success,
double **host_quat) { double **host_quat) {
acc_timers(); acc_timers();
int eflag, vflag;
if (eflag_in) eflag=2;
else eflag=0;
if (vflag_in) vflag=2;
else vflag=0;
set_kernel(eflag,vflag);
if (inum_full==0) { if (inum_full==0) {
host_start=0; host_start=0;
zero_timers(); zero_timers();
@ -435,7 +472,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall
*jnum=nbor->host_acc.begin(); *jnum=nbor->host_acc.begin();
loop(eflag,vflag); loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,inum);
device->add_ans_object(ans); device->add_ans_object(ans);
hd_balancer.stop_timer(); hd_balancer.stop_timer();
@ -462,25 +499,26 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
std::string s_lj=kns+"_lj"; std::string s_lj=kns+"_lj";
std::string s_lj_fast=kns+"_lj_fast"; std::string s_lj_fast=kns+"_lj_fast";
std::string flags=device->compile_string(); std::string oclstring = device->compile_string()+" -DEVFLAG=1";
if (nbor_program) delete nbor_program; if (nbor_program) delete nbor_program;
nbor_program=new UCL_Program(dev); nbor_program=new UCL_Program(dev);
nbor_program->load_string(ellipsoid_nbor,flags.c_str()); nbor_program->load_string(ellipsoid_nbor,oclstring.c_str(),nullptr,screen);
k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast"); k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast");
k_nbor.set_function(*nbor_program,"kernel_nbor"); k_nbor.set_function(*nbor_program,"kernel_nbor");
neigh_tex.get_texture(*nbor_program,"pos_tex"); neigh_tex.get_texture(*nbor_program,"pos_tex");
if (ellipsoid_program) delete ellipsoid_program; if (ellipsoid_program) delete ellipsoid_program;
ellipsoid_program=new UCL_Program(dev); ellipsoid_program=new UCL_Program(dev);
ellipsoid_program->load_string(ellipsoid_string,flags.c_str()); ellipsoid_program->load_string(ellipsoid_string,oclstring.c_str(),
nullptr,screen);
k_ellipsoid.set_function(*ellipsoid_program,kname); k_ellipsoid.set_function(*ellipsoid_program,kname);
pos_tex.get_texture(*ellipsoid_program,"pos_tex"); pos_tex.get_texture(*ellipsoid_program,"pos_tex");
quat_tex.get_texture(*ellipsoid_program,"quat_tex"); quat_tex.get_texture(*ellipsoid_program,"quat_tex");
if (lj_program) delete lj_program; if (lj_program) delete lj_program;
lj_program=new UCL_Program(dev); lj_program=new UCL_Program(dev);
lj_program->load_string(lj_string,flags.c_str()); lj_program->load_string(lj_string,oclstring.c_str(),nullptr,screen);
k_sphere_ellipsoid.set_function(*lj_program,s_sphere_ellipsoid.c_str()); k_sphere_ellipsoid.set_function(*lj_program,s_sphere_ellipsoid.c_str());
k_lj_fast.set_function(*lj_program,s_lj_fast.c_str()); k_lj_fast.set_function(*lj_program,s_lj_fast.c_str());
k_lj.set_function(*lj_program,s_lj.c_str()); k_lj.set_function(*lj_program,s_lj.c_str());
@ -489,7 +527,52 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
lj_pos_tex.get_texture(*lj_program,"pos_tex"); lj_pos_tex.get_texture(*lj_program,"pos_tex");
lj_quat_tex.get_texture(*lj_program,"quat_tex"); lj_quat_tex.get_texture(*lj_program,"quat_tex");
#if defined(LAL_OCL_EV_JIT)
oclstring = device->compile_string()+" -DEVFLAG=0";
if (ellipsoid_program_noev) delete ellipsoid_program_noev;
ellipsoid_program_noev=new UCL_Program(dev);
ellipsoid_program_noev->load_string(ellipsoid_string,oclstring.c_str(),
nullptr,screen);
k_ellipsoid_noev.set_function(*ellipsoid_program_noev,kname);
if (lj_program_noev) delete lj_program_noev;
lj_program_noev=new UCL_Program(dev);
lj_program_noev->load_string(lj_string,oclstring.c_str(),nullptr,screen);
k_sphere_ellipsoid_noev.set_function(*lj_program_noev,
s_sphere_ellipsoid.c_str());
k_lj_fast_noev.set_function(*lj_program_noev,s_lj_fast.c_str());
if (e_s)
k_ellipsoid_sphere_noev.set_function(*lj_program_noev,
s_ellipsoid_sphere.c_str());
#else
k_elps_sel = &k_ellipsoid;
k_elps_sphere_sel = &k_ellipsoid_sphere;
k_sphere_elps_sel = &k_sphere_ellipsoid;
k_lj_sel = &k_lj_fast;
#endif
_compiled=true; _compiled=true;
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_lj_fast.max_subgroup_size(_block_size);
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid.max_subgroup_size(_block_size));
if (e_s)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_sphere.max_subgroup_size(_block_size));
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_lj_fast_noev.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_noev.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid_noev.max_subgroup_size(_block_size));
if (e_s)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_sphere_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif
} }
template class BaseEllipsoid<PRECISION,ACC_PRECISION>; template class BaseEllipsoid<PRECISION,ACC_PRECISION>;

View File

@ -88,10 +88,10 @@ class BaseEllipsoid {
ans->resize(nlocal, success); ans->resize(nlocal, success);
if (_multiple_forms) ans->force.zero(); if (_multiple_forms) ans->force.zero();
if (olist_size>static_cast<int>(host_olist.numel())) { if (olist_size>host_olist_size) {
host_olist.clear(); if (host_olist_size) delete []host_olist;
int new_size=static_cast<int>(static_cast<double>(olist_size)*1.10); host_olist_size=static_cast<int>(static_cast<double>(olist_size)*1.10);
success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS); host_olist = new int[host_olist_size];
} }
nbor->resize(nlocal,host_inum,max_nbors,success); nbor->resize(nlocal,host_inum,max_nbors,success);
@ -116,7 +116,7 @@ class BaseEllipsoid {
/// Accumulate timers /// Accumulate timers
inline void acc_timers() { inline void acc_timers() {
if (device->time_device()) { if (device->time_device()) {
nbor->acc_timers(); nbor->acc_timers(screen);
time_nbor1.add_to_total(); time_nbor1.add_to_total();
time_ellipsoid.add_to_total(); time_ellipsoid.add_to_total();
if (_multiple_forms) { if (_multiple_forms) {
@ -223,14 +223,40 @@ class BaseEllipsoid {
/// Neighbor data /// Neighbor data
Neighbor *nbor; Neighbor *nbor;
/// ilist with particles sorted by type /// ilist with particles sorted by type
UCL_H_Vec<int> host_olist; int *host_olist;
int host_olist_size;
// ------------------------- DEVICE KERNELS ------------------------- // ------------------------- DEVICE KERNELS -------------------------
UCL_Program *nbor_program, *ellipsoid_program, *lj_program; UCL_Program *nbor_program, *ellipsoid_program, *lj_program;
UCL_Program *ellipsoid_program_noev, *lj_program_noev;
UCL_Kernel k_nbor_fast, k_nbor; UCL_Kernel k_nbor_fast, k_nbor;
UCL_Kernel k_ellipsoid, k_ellipsoid_sphere, k_sphere_ellipsoid; UCL_Kernel k_ellipsoid, k_ellipsoid_sphere, k_sphere_ellipsoid;
UCL_Kernel k_lj_fast, k_lj; UCL_Kernel k_lj_fast, k_lj;
UCL_Kernel k_ellipsoid_noev, k_ellipsoid_sphere_noev;
UCL_Kernel k_sphere_ellipsoid_noev, k_lj_fast_noev;
UCL_Kernel *k_elps_sel, *k_elps_sphere_sel, *k_sphere_elps_sel, *k_lj_sel;
inline int block_size() { return _block_size; } inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (_multiple_forms == false) {
if (eflag || vflag) k_elps_sel = &k_ellipsoid;
else k_elps_sel = &k_ellipsoid_noev;
} else {
if (eflag || vflag) {
k_elps_sel = &k_ellipsoid;
k_elps_sphere_sel = &k_ellipsoid_sphere;
k_sphere_elps_sel = &k_sphere_ellipsoid;
k_lj_sel = &k_lj_fast;
} else {
k_elps_sel = &k_ellipsoid_noev;
k_elps_sphere_sel = &k_ellipsoid_sphere_noev;
k_sphere_elps_sel = &k_sphere_ellipsoid_noev;
k_lj_sel = &k_lj_fast_noev;
}
}
#endif
}
// --------------------------- TEXTURES ----------------------------- // --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex, quat_tex, lj_pos_tex, lj_quat_tex, neigh_tex; UCL_Texture pos_tex, quat_tex, lj_pos_tex, lj_quat_tex, neigh_tex;
@ -240,7 +266,6 @@ class BaseEllipsoid {
int _block_size, _threads_per_atom; int _block_size, _threads_per_atom;
double _max_bytes, _max_an_bytes; double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead; double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
// True if we want to use fast GB-sphere or sphere-sphere calculations // True if we want to use fast GB-sphere or sphere-sphere calculations
bool _multiple_forms; bool _multiple_forms;
@ -250,7 +275,7 @@ class BaseEllipsoid {
void compile_kernels(UCL_Device &dev, const void *ellipsoid_string, void compile_kernels(UCL_Device &dev, const void *ellipsoid_string,
const void *lj_string, const char *kname,const bool e_s); const void *lj_string, const char *kname,const bool e_s);
virtual void loop(const bool _eflag, const bool _vflag) = 0; virtual int loop(const int eflag, const int vflag) = 0;
}; };
} }

View File

@ -20,7 +20,7 @@ namespace LAMMPS_AL {
extern Device<PRECISION,ACC_PRECISION> global_device; extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) { BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0), _onetype(-1) {
device=&global_device; device=&global_device;
ans=new Answer<numtyp,acctyp>(); ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor(); nbor=new Neighbor();
@ -29,6 +29,9 @@ BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
#endif #endif
pair_program=nullptr; pair_program=nullptr;
ucl_device=nullptr; ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
pair_program_noev=nullptr;
#endif
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -44,6 +47,12 @@ BaseThreeT::~BaseThree() {
k_pair.clear(); k_pair.clear();
k_short_nbor.clear(); k_short_nbor.clear();
if (pair_program) delete pair_program; if (pair_program) delete pair_program;
#if defined(LAL_OCL_EV_JIT)
k_three_center_noev.clear();
k_three_end_noev.clear();
k_pair_noev.clear();
if (pair_program_noev) delete pair_program_noev;
#endif
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -62,7 +71,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
const double cell_size, const double gpu_split, const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program, FILE *_screen, const void *pair_program,
const char *two, const char *three_center, const char *two, const char *three_center,
const char *three_end, const char *short_nbor) { const char *three_end, const char *short_nbor,
const int onetype, const int onetype3,
const int spq, const int tpa_override) {
screen=_screen; screen=_screen;
int gpu_nbor=0; int gpu_nbor=0;
@ -77,24 +88,16 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
if (host_nlocal>0) if (host_nlocal>0)
_gpu_host=1; _gpu_host=1;
_threads_per_atom=device->threads_per_atom(); // Allow forcing threads per atom to 1 for tersoff due to subg sync issue
if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1 if (tpa_override)
nbor->packing(true); _threads_per_atom=tpa_override;
_nbor_data=&(nbor->dev_packed); else
} else // neigh yes or tpa == 1 _threads_per_atom=device->threads_per_three();
_nbor_data=&(nbor->dev_nbor);
if (_threads_per_atom*_threads_per_atom>device->warp_size())
return -10;
int success=device->init(*ans,false,false,nlocal,nall,maxspecial); int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
if (success!=0) if (success!=0)
return success; return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;
if (ucl_device!=device->gpu) _compiled=false; if (ucl_device!=device->gpu) _compiled=false;
ucl_device=device->gpu; ucl_device=device->gpu;
@ -110,7 +113,19 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_block_pair=device->pair_block_size(); _block_pair=device->pair_block_size();
_block_size=device->block_ellipse(); _block_size=device->block_ellipse();
compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor); compile_kernels(*ucl_device,pair_program,two,three_center,three_end,
short_nbor,onetype,onetype3,spq);
while (_threads_per_atom*_threads_per_atom>device->simd_size())
_threads_per_atom = _threads_per_atom / 2;
if (_threads_per_atom*_threads_per_atom>device->simd_size())
return -10;
success = device->init_nbor(nbor,nall,host_nlocal,nall,maxspecial,
_gpu_host,max_nbors,cell_size,true,1,true);
if (success!=0)
return success;
// Initialize host-device load balancer // Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split); hd_balancer.init(device,gpu_nbor,gpu_split);
@ -121,22 +136,21 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
pos_tex.bind_float(atom->x,4); pos_tex.bind_float(atom->x,4);
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT
_max_an_bytes+=ans2->gpu_bytes(); _max_an_bytes+=ans2->gpu_bytes();
#endif #endif
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
return 0; return 0;
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BaseThreeT::estimate_gpu_overhead() { void BaseThreeT::estimate_gpu_overhead(const int add_kernels) {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); device->estimate_gpu_overhead(4+add_kernels,_gpu_overhead,_driver_overhead);
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -152,7 +166,6 @@ void BaseThreeT::clear_atomic() {
time_pair.clear(); time_pair.clear();
hd_balancer.clear(); hd_balancer.clear();
dev_short_nbor.clear();
nbor->clear(); nbor->clear();
ans->clear(); ans->clear();
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT
@ -186,6 +199,7 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
// now the requirement is removed, allowing to work within pair hybrid // now the requirement is removed, allowing to work within pair hybrid
nbor->get_host(nlist,ilist,numj,firstneigh,block_size()); nbor->get_host(nlist,ilist,numj,firstneigh,block_size());
nbor->copy_unpacked(nlist,mn);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT
@ -201,7 +215,7 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
// Build neighbor list on device // Build neighbor list on device
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum, inline void BaseThreeT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, const int nall, double **host_x,
int *host_type, double *sublo, int *host_type, double *sublo,
double *subhi, tagint *tag, double *subhi, tagint *tag,
@ -211,14 +225,22 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
resize_atom(inum,nall,success); resize_atom(inum,nall,success);
resize_local(nall,host_inum,nbor->max_nbors(),success); resize_local(nall,host_inum,nbor->max_nbors(),success);
if (!success) if (!success)
return 0; return;
atom->cast_copy_x(host_x,host_type); atom->cast_copy_x(host_x,host_type);
_nall = nall; _nall = nall;
// Increase the effective sub-domain size for neighbors of ghosts
// This is still inefficient because we are calculating neighbors for more
// ghosts than necessary due to increased ghost cutoff
const double ncut=nbor->cutoff()*2.0;
for (int i=0; i<3; i++) sublo[i]-=ncut;
for (int i=0; i<3; i++) subhi[i]+=ncut;
int mn; int mn;
nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag, nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi,
nspecial, special, success, mn); tag, nspecial, special, success, mn, ans->error_flag);
nbor->copy_unpacked(nall,mn);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT
@ -226,7 +248,6 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
#endif #endif
if (bytes>_max_an_bytes) if (bytes>_max_an_bytes)
_max_an_bytes=bytes; _max_an_bytes=bytes;
return mn;
} }
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
@ -236,10 +257,24 @@ template <class numtyp, class acctyp>
void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type, const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom, const bool eflag_in, const bool vflag_in,
const bool vatom, int &host_start, const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) { const double cpu_time, bool &success) {
acc_timers(); acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
set_kernel(eflag,vflag);
if (inum_full==0) { if (inum_full==0) {
host_start=0; host_start=0;
// Make sure textures are correct if realloc by a different hybrid style // Make sure textures are correct if realloc by a different hybrid style
@ -260,19 +295,12 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success) if (!success)
return; return;
_max_nbors = nbor->max_nbor_loop(nlist,numj,ilist);
} }
atom->cast_x_data(host_x,host_type); atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer(); hd_balancer.start_timer();
atom->add_x_data(host_x,host_type); atom->add_x_data(host_x,host_type);
// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}
// _ainum to be used in loop() for short neighbor list build // _ainum to be used in loop() for short neighbor list build
_ainum = nlist; _ainum = nlist;
@ -282,11 +310,11 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT
ucl_device->sync(); ucl_device->sync();
#endif #endif
loop(eflag,vflag,evatom); const int red_blocks=loop(eflag,vflag,evatom,success);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans); device->add_ans_object(ans);
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT
ans2->copy_answers(eflag,vflag,eatom,vatom,ilist); ans2->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans2); device->add_ans_object(ans2);
#endif #endif
hd_balancer.stop_timer(); hd_balancer.stop_timer();
@ -296,15 +324,29 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
// Reneighbor on GPU if necessary and then compute forces, virials, energies // Reneighbor on GPU if necessary and then compute forces, virials, energies
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
int ** BaseThreeT::compute(const int ago, const int inum_full, int ** BaseThreeT::compute(const int ago, const int inum_full, const int nall,
const int nall, double **host_x, int *host_type, double **host_x, int *host_type, double *sublo,
double *sublo, double *subhi, tagint *tag, double *subhi, tagint *tag, int **nspecial,
int **nspecial, tagint **special, const bool eflag, tagint **special, const bool eflag_in,
const bool vflag, const bool eatom, const bool vflag_in, const bool eatom,
const bool vatom, int &host_start, const bool vatom, int &host_start,
int **ilist, int **jnum, int **ilist, int **jnum,
const double cpu_time, bool &success) { const double cpu_time, bool &success) {
acc_timers(); acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
set_kernel(eflag,vflag);
if (inum_full==0) { if (inum_full==0) {
host_start=0; host_start=0;
// Make sure textures are correct if realloc by a different hybrid style // Make sure textures are correct if realloc by a different hybrid style
@ -323,7 +365,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
// Build neighbor list on GPU if necessary // Build neighbor list on GPU if necessary
if (ago==0) { if (ago==0) {
_max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success); sublo, subhi, tag, nspecial, special, success);
if (!success) if (!success)
return nullptr; return nullptr;
@ -336,12 +378,6 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
*ilist=nbor->host_ilist.begin(); *ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin(); *jnum=nbor->host_acc.begin();
// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}
// _ainum to be used in loop() for short neighbor list build // _ainum to be used in loop() for short neighbor list build
_ainum = nall; _ainum = nall;
@ -351,11 +387,11 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT
ucl_device->sync(); ucl_device->sync();
#endif #endif
loop(eflag,vflag,evatom); const int red_blocks=loop(eflag,vflag,evatom,success);
ans->copy_answers(eflag,vflag,eatom,vatom); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans); device->add_ans_object(ans);
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT
ans2->copy_answers(eflag,vflag,eatom,vatom); ans2->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans2); device->add_ans_object(ans2);
#endif #endif
hd_balancer.stop_timer(); hd_balancer.stop_timer();
@ -372,14 +408,24 @@ double BaseThreeT::host_memory_usage_atomic() const {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str, void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *two, const char *three_center, const char *two, const char *three_center,
const char *three_end, const char* short_nbor) { const char *three_end, const char* short_nbor,
if (_compiled) const int onetype, const int onetype3,
const int spq) {
if (_compiled && _onetype==onetype && _onetype3==onetype3 && _spq==spq)
return; return;
_onetype=onetype;
_onetype3=onetype3;
_spq=spq;
std::string vatom_name=std::string(three_end)+"_vatom"; std::string vatom_name=std::string(three_end)+"_vatom";
if (pair_program) delete pair_program; if (pair_program) delete pair_program;
pair_program=new UCL_Program(dev); pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str()); std::string oclstring = device->compile_string()+" -DEVFLAG=1";
if (_onetype>=0) oclstring+=" -DONETYPE="+device->toa(_onetype)+
" -DONETYPE3="+device->toa(_onetype3);
if (_spq) oclstring+=" -DSPQ="+device->toa(_spq);
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_three_center.set_function(*pair_program,three_center); k_three_center.set_function(*pair_program,three_center);
k_three_end.set_function(*pair_program,three_end); k_three_end.set_function(*pair_program,three_end);
k_three_end_vatom.set_function(*pair_program,vatom_name.c_str()); k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
@ -387,12 +433,50 @@ void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
k_short_nbor.set_function(*pair_program,short_nbor); k_short_nbor.set_function(*pair_program,short_nbor);
pos_tex.get_texture(*pair_program,"pos_tex"); pos_tex.get_texture(*pair_program,"pos_tex");
#if defined(LAL_OCL_EV_JIT)
oclstring = device->compile_string()+" -DEVFLAG=0";
if (_onetype>=0) oclstring+=" -DONETYPE="+device->toa(_onetype)+
" -DONETYPE3="+device->toa(_onetype3);
if (_spq) oclstring+=" -DSPQ="+device->toa(_spq);
if (pair_program_noev) delete pair_program_noev;
pair_program_noev=new UCL_Program(dev);
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_three_center_noev.set_function(*pair_program_noev,three_center);
k_three_end_noev.set_function(*pair_program_noev,three_end);
k_pair_noev.set_function(*pair_program_noev,two);
#else
k_sel = &k_pair;
k_3center_sel = &k_three_center;
k_3end_sel = &k_three_end;
#endif
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT
k_three_end.cq(ucl_device->cq(_end_command_queue)); k_three_end.cq(ucl_device->cq(_end_command_queue));
k_three_end_vatom.cq(ucl_device->cq(_end_command_queue)); k_three_end_vatom.cq(ucl_device->cq(_end_command_queue));
#if defined(LAL_OCL_EV_JIT)
k_three_end_noev.cq(ucl_device->cq(_end_command_queue));
#endif
#endif #endif
_compiled=true; _compiled=true;
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_pair.max_subgroup_size(_block_size);
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_center.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end_vatom.max_subgroup_size(_block_size));
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_center_noev.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif
} }
template class BaseThree<PRECISION,ACC_PRECISION>; template class BaseThree<PRECISION,ACC_PRECISION>;

View File

@ -59,10 +59,12 @@ class BaseThree {
const double gpu_split, FILE *screen, const double gpu_split, FILE *screen,
const void *pair_program, const char *k_two, const void *pair_program, const char *k_two,
const char *k_three_center, const char *k_three_end, const char *k_three_center, const char *k_three_end,
const char *k_short_nbor=nullptr); const char *k_short_nbor=nullptr, const int onetype=-1,
const int onetype3=-1, const int spq=0,
const int tpa_override=0);
/// Estimate the overhead for GPU context changes and CPU driver /// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead(); void estimate_gpu_overhead(const int add_kernels=0);
/// Check if there is enough storage for atom arrays and realloc if not /// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/ /** \param success set to false if insufficient memory **/
@ -109,7 +111,7 @@ class BaseThree {
/// Accumulate timers /// Accumulate timers
inline void acc_timers() { inline void acc_timers() {
if (device->time_device()) { if (device->time_device()) {
nbor->acc_timers(); nbor->acc_timers(screen);
time_pair.add_to_total(); time_pair.add_to_total();
atom->acc_timers(); atom->acc_timers();
ans->acc_timers(); ans->acc_timers();
@ -134,9 +136,9 @@ class BaseThree {
int *numj, int **firstneigh, bool &success); int *numj, int **firstneigh, bool &success);
/// Build neighbor list on device /// Build neighbor list on device
int build_nbor_list(const int inum, const int host_inum, void build_nbor_list(const int inum, const int host_inum, const int nall,
const int nall, double **host_x, int *host_type, double **host_x, int *host_type, double *sublo,
double *sublo, double *subhi, tagint *tag, int **nspecial, double *subhi, tagint *tag, int **nspecial,
tagint **special, bool &success); tagint **special, bool &success);
/// Pair loop with host neighboring /// Pair loop with host neighboring
@ -147,12 +149,12 @@ class BaseThree {
int &host_start, const double cpu_time, bool &success); int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring /// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full, int ** compute(const int ago, const int inum_full, const int nall,
const int nall, double **host_x, int *host_type, double *sublo, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial, double *subhi, tagint *tag, int **nspecial, tagint **special,
tagint **special, const bool eflag, const bool vflag, const bool eflag, const bool vflag, const bool eatom,
const bool eatom, const bool vatom, int &host_start, const bool vatom, int &host_start, int **ilist,
int **ilist, int **numj, const double cpu_time, bool &success); int **numj, const double cpu_time, bool &success);
// -------------------------- DEVICE DATA ------------------------- // -------------------------- DEVICE DATA -------------------------
@ -188,14 +190,29 @@ class BaseThree {
/// Neighbor data /// Neighbor data
Neighbor *nbor; Neighbor *nbor;
UCL_D_Vec<int> dev_short_nbor;
UCL_Kernel k_short_nbor; UCL_Kernel k_short_nbor;
// ------------------------- DEVICE KERNELS ------------------------- // ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program; UCL_Program *pair_program, *pair_program_noev;
UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom; UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
UCL_Kernel k_pair_noev, k_three_center_noev, k_three_end_noev;
UCL_Kernel *k_sel, *k_3center_sel, *k_3end_sel;
inline int block_pair() { return _block_pair; } inline int block_pair() { return _block_pair; }
inline int block_size() { return _block_size; } inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (eflag || vflag) {
k_sel = &k_pair;
k_3center_sel = &k_three_center;
k_3end_sel = &k_three_end;
} else {
k_sel = &k_pair_noev;
k_3center_sel = &k_three_center_noev;
k_3end_sel = &k_three_end_noev;
}
#endif
}
// --------------------------- TEXTURES ----------------------------- // --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex; UCL_Texture pos_tex;
@ -203,18 +220,19 @@ class BaseThree {
protected: protected:
bool _compiled; bool _compiled;
int _block_pair, _block_size, _threads_per_atom, _end_command_queue; int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
int _gpu_nbor; int _gpu_nbor, _onetype, _onetype3, _spq;
double _max_bytes, _max_an_bytes; double _max_bytes, _max_an_bytes;
int _max_nbors, _ainum, _nall; int _ainum, _nall;
double _gpu_overhead, _driver_overhead; double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string, void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *two, const char *three_center, const char *two, const char *three_center,
const char *three_end, const char* short_nbor); const char *three_end, const char* short_nbor,
const int onetype, const int onetype3,
const int spq);
virtual void loop(const bool _eflag, const bool _vflag, virtual int loop(const int eflag, const int vflag, const int evatom,
const int evatom) = 0; bool &success) = 0;
}; };
} }

View File

@ -113,20 +113,9 @@ double BeckT::host_memory_usage() const {
// Calculate energies, forces, and torques // Calculate energies, forces, and torques
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BeckT::loop(const bool _eflag, const bool _vflag) { int BeckT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy // Compute the block size and grid size to keep all cores busy
const int BX=this->block_size(); const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -134,8 +123,8 @@ void BeckT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch(); int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start(); this->time_pair.start();
if (shared_types) { if (shared_types) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_sel->set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &beck1, &beck2, &sp_lj, this->k_pair_sel->run(&this->atom->x, &beck1, &beck2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom); &ainum, &nbor_pitch, &this->_threads_per_atom);
@ -147,6 +136,7 @@ void BeckT::loop(const bool _eflag, const bool _vflag) {
&ainum, &nbor_pitch, &this->_threads_per_atom); &ainum, &nbor_pitch, &this->_threads_per_atom);
} }
this->time_pair.stop(); this->time_pair.stop();
return GX;
} }
template class Beck<PRECISION,ACC_PRECISION>; template class Beck<PRECISION,ACC_PRECISION>;

View File

@ -39,22 +39,25 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4]; __local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();
sp_lj[0]=sp_lj_in[0]; sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1]; sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -98,14 +101,14 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
numtyp term6 = pow(term1,(numtyp)-3); numtyp term6 = pow(term1,(numtyp)-3);
numtyp term1inv = ucl_recip(term1); numtyp term1inv = ucl_recip(term1);
numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4); numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4);
e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv); e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv);
energy+=factor_lj*e; energy+=factor_lj*e;
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -116,9 +119,9 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv); ans,engv);
} // if ii
} }
__kernel void k_beck_fast(const __global numtyp4 *restrict x_, __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
@ -137,6 +140,9 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
__local numtyp4 beck1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 beck1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 beck2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 beck2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4]; __local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();
if (tid<4) if (tid<4)
sp_lj[tid]=sp_lj_in[tid]; sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
@ -144,19 +150,19 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
beck2[tid]=beck2_in[tid]; beck2[tid]=beck2_in[tid];
} }
acctyp energy=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -200,14 +206,14 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
numtyp term6 = pow(term1,(numtyp)-3); numtyp term6 = pow(term1,(numtyp)-3);
numtyp term1inv = ucl_recip(term1); numtyp term1inv = ucl_recip(term1);
numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4); numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4);
e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv); e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv);
energy+=factor_lj*e; energy+=factor_lj*e;
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -218,8 +224,8 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv); ans,engv);
} // if ii
} }

View File

@ -72,7 +72,7 @@ class Beck : public BaseAtomic<numtyp, acctyp> {
private: private:
bool _allocated; bool _allocated;
void loop(const bool _eflag, const bool _vflag); int loop(const int eflag, const int vflag);
}; };
} }

View File

@ -55,7 +55,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta,
AA, BB, special_lj, inum, nall, 300, AA, BB, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen); maxspecial, cell_size, gpu_split, screen);
BLMF.device->world_barrier(); BLMF.device->world_barrier();
@ -73,7 +73,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
} }
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, AA, BB, init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, AA, BB,
special_lj, inum, nall, 300, maxspecial, special_lj, inum, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen); cell_size, gpu_split, screen);
BLMF.device->gpu_barrier(); BLMF.device->gpu_barrier();

View File

@ -138,20 +138,9 @@ double BornT::host_memory_usage() const {
// Calculate energies, forces, and torques // Calculate energies, forces, and torques
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BornT::loop(const bool _eflag, const bool _vflag) { int BornT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy // Compute the block size and grid size to keep all cores busy
const int BX=this->block_size(); const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -159,8 +148,8 @@ void BornT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch(); int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start(); this->time_pair.start();
if (shared_types) { if (shared_types) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_sel->set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1,&coeff2, this->k_pair_sel->run(&this->atom->x, &coeff1,&coeff2,
&cutsq_sigma, &sp_lj, &cutsq_sigma, &sp_lj,
&this->nbor->dev_nbor, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->_nbor_data->begin(),
@ -176,6 +165,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) {
&nbor_pitch, &this->_threads_per_atom); &nbor_pitch, &this->_threads_per_atom);
} }
this->time_pair.stop(); this->time_pair.stop();
return GX;
} }
template class Born<PRECISION,ACC_PRECISION>; template class Born<PRECISION,ACC_PRECISION>;

View File

@ -40,22 +40,25 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4]; __local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();
sp_lj[0]=sp_lj_in[0]; sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1]; sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -92,12 +95,12 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv; + coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w); energy+=factor_lj*(e-coeff2[mtype].w);
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -108,9 +111,9 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv); ans,engv);
} // if ii
} }
__kernel void k_born_fast(const __global numtyp4 *restrict x_, __kernel void k_born_fast(const __global numtyp4 *restrict x_,
@ -130,27 +133,30 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4]; __local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();
if (tid<4) if (tid<4)
sp_lj[tid]=sp_lj_in[tid]; sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid]; coeff1[tid]=coeff1_in[tid];
if (eflag>0) if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp energy=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -187,12 +193,12 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv; + coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w); energy+=factor_lj*(e-coeff2[mtype].w);
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -203,8 +209,8 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv); ans,engv);
} // if ii
} }

View File

@ -82,7 +82,7 @@ class Born : public BaseAtomic<numtyp, acctyp> {
private: private:
bool _allocated; bool _allocated;
void loop(const bool _eflag, const bool _vflag); int loop(const int eflag, const int vflag);
}; };
} }

View File

@ -129,20 +129,9 @@ double BornCoulLongT::host_memory_usage() const {
// Calculate energies, forces, and torques // Calculate energies, forces, and torques
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BornCoulLongT::loop(const bool _eflag, const bool _vflag) { int BornCoulLongT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy // Compute the block size and grid size to keep all cores busy
const int BX=this->block_size(); const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -150,8 +139,8 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch(); int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start(); this->time_pair.start();
if (shared_types) { if (shared_types) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_sel->set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->_nbor_data->begin(),
&this->ans->force, &this->ans->force,
@ -170,6 +159,7 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
&_qqrd2e, &_g_ewald, &this->_threads_per_atom); &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
} }
this->time_pair.stop(); this->time_pair.stop();
return GX;
} }
template class BornCoulLong<PRECISION,ACC_PRECISION>; template class BornCoulLong<PRECISION,ACC_PRECISION>;

View File

@ -48,6 +48,9 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8]; __local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();
sp_lj[0]=sp_lj_in[0]; sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1]; sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
@ -57,18 +60,18 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -124,7 +127,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
if (rsq < cut_coulsq) if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul); e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cutsq_sigma[mtype].y) { if (rsq < cutsq_sigma[mtype].y) {
@ -133,7 +136,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w); energy+=factor_lj*(e-coeff2[mtype].w);
} }
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -144,9 +147,9 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }
__kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_, __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
@ -169,28 +172,31 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8]; __local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();
if (tid<8) if (tid<8)
sp_lj[tid]=sp_lj_in[tid]; sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid]; coeff1[tid]=coeff1_in[tid];
if (eflag>0) if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -246,7 +252,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
if (rsq < cut_coulsq) if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul); e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cutsq_sigma[mtype].y) { if (rsq < cutsq_sigma[mtype].y) {
@ -255,7 +261,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w); energy+=factor_lj*(e-coeff2[mtype].w);
} }
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -266,8 +272,8 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }

View File

@ -80,7 +80,7 @@ class BornCoulLong : public BaseCharge<numtyp, acctyp> {
protected: protected:
bool _allocated; bool _allocated;
void loop(const bool _eflag, const bool _vflag); int loop(const int eflag, const int vflag);
}; };
} }

View File

@ -63,6 +63,9 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8]; __local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();
sp_lj[0]=sp_lj_in[0]; sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1]; sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
@ -72,18 +75,18 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -155,7 +158,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
if (rsq < cut_coulsq) { if (rsq < cut_coulsq) {
numtyp e = prefactor*_erfc; numtyp e = prefactor*_erfc;
if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -167,7 +170,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w); energy+=factor_lj*(e-coeff2[mtype].w);
} }
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -178,9 +181,9 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }
__kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_, __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
@ -203,28 +206,31 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8]; __local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();
if (tid<8) if (tid<8)
sp_lj[tid]=sp_lj_in[tid]; sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid]; coeff1[tid]=coeff1_in[tid];
if (eflag>0) if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -296,7 +302,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
if (rsq < cut_coulsq) { if (rsq < cut_coulsq) {
numtyp e = prefactor*_erfc; numtyp e = prefactor*_erfc;
if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -308,7 +314,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w); energy+=factor_lj*(e-coeff2[mtype].w);
} }
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -319,8 +325,8 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }

View File

@ -60,7 +60,7 @@ int bornclcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0) if (world_me==0)
init_ok=BCLCSMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, init_ok=BCLCSMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset, host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size, special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq, gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald); host_special_coul, qqrd2e, g_ewald);
@ -80,7 +80,7 @@ int bornclcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=BCLCSMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, init_ok=BCLCSMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset, host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size, special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq, gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald); host_special_coul, qqrd2e, g_ewald);

View File

@ -60,7 +60,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0) if (world_me==0)
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset, host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size, special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq, gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald); host_special_coul, qqrd2e, g_ewald);
@ -80,7 +80,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset, host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size, special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq, gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald); host_special_coul, qqrd2e, g_ewald);

View File

@ -131,20 +131,9 @@ double BornCoulWolfT::host_memory_usage() const {
// Calculate energies, forces, and torques // Calculate energies, forces, and torques
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { int BornCoulWolfT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy // Compute the block size and grid size to keep all cores busy
const int BX=this->block_size(); const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -152,8 +141,8 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch(); int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start(); this->time_pair.start();
if (shared_types) { if (shared_types) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_sel->set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &ainum, &nbor_pitch, &this->atom->q,
@ -171,6 +160,7 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
&this->_threads_per_atom); &this->_threads_per_atom);
} }
this->time_pair.stop(); this->time_pair.stop();
return GX;
} }
template class BornCoulWolf<PRECISION,ACC_PRECISION>; template class BornCoulWolf<PRECISION,ACC_PRECISION>;

View File

@ -51,6 +51,9 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8]; __local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();
sp_lj[0]=sp_lj_in[0]; sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1]; sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
@ -60,18 +63,18 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -79,7 +82,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w; int itype=ix.w;
if (eflag>0) { if (EVFLAG && eflag) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self; e_coul += (acctyp)2.0*e_self;
@ -137,7 +140,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
if (rsq < cut_coulsq) { if (rsq < cut_coulsq) {
numtyp e=v_sh; numtyp e=v_sh;
if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -149,7 +152,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w); energy+=factor_lj*(e-coeff2[mtype].w);
} }
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -160,9 +163,9 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }
__kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_, __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
@ -186,28 +189,31 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8]; __local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();
if (tid<8) if (tid<8)
sp_lj[tid]=sp_lj_in[tid]; sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid]; coeff1[tid]=coeff1_in[tid];
if (eflag>0) if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -216,7 +222,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
int iw=ix.w; int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
if (eflag>0) { if (EVFLAG && eflag) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self; e_coul += (acctyp)2.0*e_self;
@ -273,7 +279,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
if (rsq < cut_coulsq) { if (rsq < cut_coulsq) {
numtyp e=v_sh; numtyp e=v_sh;
if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -285,7 +291,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w); energy+=factor_lj*(e-coeff2[mtype].w);
} }
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -296,8 +302,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }

View File

@ -81,7 +81,7 @@ class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
protected: protected:
bool _allocated; bool _allocated;
void loop(const bool _eflag, const bool _vflag); int loop(const int eflag, const int vflag);
}; };
} }

View File

@ -52,6 +52,9 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8]; __local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();
sp_lj[0]=sp_lj_in[0]; sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1]; sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
@ -61,18 +64,18 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -80,7 +83,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w; int itype=ix.w;
if (eflag>0) { if (EVFLAG && eflag) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self; e_coul += (acctyp)2.0*e_self;
@ -139,7 +142,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
if (rsq < cut_coulsq) { if (rsq < cut_coulsq) {
acctyp e=v_sh; acctyp e=v_sh;
if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -151,7 +154,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w); energy+=factor_lj*(e-coeff2[mtype].w);
} }
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -162,9 +165,9 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }
__kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_, __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
@ -188,28 +191,31 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8]; __local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();
if (tid<8) if (tid<8)
sp_lj[tid]=sp_lj_in[tid]; sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid]; coeff1[tid]=coeff1_in[tid];
if (eflag>0) if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -218,7 +224,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
int iw=ix.w; int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw); int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
if (eflag>0) { if (EVFLAG && eflag) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self; e_coul += (acctyp)2.0*e_self;
@ -276,7 +282,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
if (rsq < cut_coulsq) { if (rsq < cut_coulsq) {
acctyp e=v_sh; acctyp e=v_sh;
if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -288,7 +294,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w); energy+=factor_lj*(e-coeff2[mtype].w);
} }
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -299,8 +305,8 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }

View File

@ -60,7 +60,7 @@ int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0) if (world_me==0)
init_ok=BornCWCST.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, init_ok=BornCWCST.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift); alf, e_shift, f_shift);
@ -81,7 +81,7 @@ int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=BornCWCST.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, init_ok=BornCWCST.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift); alf, e_shift, f_shift);

View File

@ -60,7 +60,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0) if (world_me==0)
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift); alf, e_shift, f_shift);
@ -81,7 +81,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift); alf, e_shift, f_shift);

View File

@ -58,7 +58,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0) if (world_me==0)
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen); maxspecial, cell_size, gpu_split, screen);
BORNMF.device->world_barrier(); BORNMF.device->world_barrier();
@ -77,7 +77,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen); maxspecial, cell_size, gpu_split, screen);
BORNMF.device->gpu_barrier(); BORNMF.device->gpu_barrier();

View File

@ -130,20 +130,9 @@ double BuckT::host_memory_usage() const {
// Calculate energies, forces, and torques // Calculate energies, forces, and torques
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BuckT::loop(const bool _eflag, const bool _vflag) { int BuckT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy // Compute the block size and grid size to keep all cores busy
const int BX=this->block_size(); const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -151,8 +140,8 @@ void BuckT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch(); int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start(); this->time_pair.start();
if (shared_types) { if (shared_types) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_sel->set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &vflag, &ainum, &nbor_pitch,
@ -165,6 +154,7 @@ void BuckT::loop(const bool _eflag, const bool _vflag) {
&ainum, &nbor_pitch, &this->_threads_per_atom); &ainum, &nbor_pitch, &this->_threads_per_atom);
} }
this->time_pair.stop(); this->time_pair.stop();
return GX;
} }
template class Buck<PRECISION,ACC_PRECISION>; template class Buck<PRECISION,ACC_PRECISION>;

View File

@ -39,22 +39,25 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4]; __local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();
sp_lj[0]=sp_lj_in[0]; sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1]; sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -91,11 +94,11 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z); energy+=factor_lj*(e-coeff2[mtype].z);
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -106,9 +109,9 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv); ans,engv);
} // if ii
} }
__kernel void k_buck_fast(const __global numtyp4 *restrict x_, __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
@ -127,27 +130,30 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4]; __local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();
if (tid<4) if (tid<4)
sp_lj[tid]=sp_lj_in[tid]; sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid]; coeff1[tid]=coeff1_in[tid];
if (eflag>0) if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp energy=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -184,11 +190,11 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z); energy+=factor_lj*(e-coeff2[mtype].z);
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -199,8 +205,8 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv); ans,engv);
} // if ii
} }

View File

@ -77,7 +77,7 @@ class Buck : public BaseAtomic<numtyp, acctyp> {
private: private:
bool _allocated; bool _allocated;
void loop(const bool _eflag, const bool _vflag); int loop(const int eflag, const int vflag);
}; };
} }

View File

@ -122,20 +122,9 @@ double BuckCoulT::host_memory_usage() const {
// Calculate energies, forces, and torques // Calculate energies, forces, and torques
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BuckCoulT::loop(const bool _eflag, const bool _vflag) { int BuckCoulT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy // Compute the block size and grid size to keep all cores busy
const int BX=this->block_size(); const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -143,8 +132,8 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch(); int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start(); this->time_pair.start();
if (shared_types) { if (shared_types) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_sel->set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q, &vflag, &ainum, &nbor_pitch, &this->atom->q,
@ -158,6 +147,7 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
&cutsq, &_qqrd2e, &this->_threads_per_atom); &cutsq, &_qqrd2e, &this->_threads_per_atom);
} }
this->time_pair.stop(); this->time_pair.stop();
return GX;
} }
template class BuckCoul<PRECISION,ACC_PRECISION>; template class BuckCoul<PRECISION,ACC_PRECISION>;

View File

@ -47,6 +47,9 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8]; __local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();
sp_lj[0]=sp_lj_in[0]; sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1]; sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
@ -56,18 +59,18 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -119,14 +122,14 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
e_coul += forcecoul; e_coul += forcecoul;
if (rsq < cutsq[mtype].y) { if (rsq < cutsq[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z); energy+=factor_lj*(e-coeff2[mtype].z);
} }
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -137,9 +140,9 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }
__kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
@ -162,29 +165,32 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8]; __local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();
if (tid<8) if (tid<8)
sp_lj[tid]=sp_lj_in[tid]; sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid]; coeff1[tid]=coeff1_in[tid];
cutsq[tid]=_cutsq[tid]; cutsq[tid]=_cutsq[tid];
if (eflag>0) if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -236,14 +242,14 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
e_coul += forcecoul; e_coul += forcecoul;
if (rsq < cutsq[mtype].y) { if (rsq < cutsq[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z); energy+=factor_lj*(e-coeff2[mtype].z);
} }
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -254,8 +260,8 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }

View File

@ -78,7 +78,7 @@ class BuckCoul : public BaseCharge<numtyp, acctyp> {
private: private:
bool _allocated; bool _allocated;
void loop(const bool _eflag, const bool _vflag); int loop(const int eflag, const int vflag);
}; };
} }

View File

@ -58,7 +58,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300, host_a, host_c, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e); host_special_coul, qqrd2e);
@ -78,7 +78,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
} }
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300, host_a, host_c, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e); host_special_coul, qqrd2e);

View File

@ -126,20 +126,9 @@ double BuckCoulLongT::host_memory_usage() const {
// Calculate energies, forces, and torques // Calculate energies, forces, and torques
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { int BuckCoulLongT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy // Compute the block size and grid size to keep all cores busy
const int BX=this->block_size(); const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -147,8 +136,8 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch(); int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start(); this->time_pair.start();
if (shared_types) { if (shared_types) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_sel->set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q, &vflag, &ainum, &nbor_pitch, &this->atom->q,
@ -163,6 +152,7 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
&_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
} }
this->time_pair.stop(); this->time_pair.stop();
return GX;
} }
template class BuckCoulLong<PRECISION,ACC_PRECISION>; template class BuckCoulLong<PRECISION,ACC_PRECISION>;

View File

@ -48,6 +48,9 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8]; __local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();
sp_lj[0]=sp_lj_in[0]; sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1]; sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
@ -57,18 +60,18 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6]; sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7]; sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -126,7 +129,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
if (rsq < cut_coulsq) if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul); e_coul += prefactor*(_erfc-factor_coul);
if (rsq < coeff1[mtype].w) { if (rsq < coeff1[mtype].w) {
@ -134,7 +137,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].z); energy+=factor_lj*(e-coeff2[mtype].z);
} }
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -145,9 +148,9 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }
__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
@ -171,28 +174,31 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8]; __local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();
if (tid<8) if (tid<8)
sp_lj[tid]=sp_lj_in[tid]; sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid]; coeff1[tid]=coeff1_in[tid];
if (eflag>0) if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid]; coeff2[tid]=coeff2_in[tid];
} }
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -250,7 +256,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
if (rsq < cut_coulsq) if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul); e_coul += prefactor*(_erfc-factor_coul);
if (rsq < coeff1[mtype].w) { if (rsq < coeff1[mtype].w) {
@ -258,7 +264,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].z); energy+=factor_lj*(e-coeff2[mtype].z);
} }
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -269,8 +275,8 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }

View File

@ -78,7 +78,7 @@ class BuckCoulLong : public BaseCharge<numtyp, acctyp> {
private: private:
bool _allocated; bool _allocated;
void loop(const bool _eflag, const bool _vflag); int loop(const int eflag, const int vflag);
}; };
} }

View File

@ -59,7 +59,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300, host_a, host_c, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
@ -78,7 +78,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
} }
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300, host_a, host_c, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);

View File

@ -56,7 +56,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300, host_a, host_c, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen); maxspecial, cell_size, gpu_split, screen);
BUCKMF.device->world_barrier(); BUCKMF.device->world_barrier();
@ -74,7 +74,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
} }
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300, host_a, host_c, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen); maxspecial, cell_size, gpu_split, screen);
BUCKMF.device->gpu_barrier(); BUCKMF.device->gpu_barrier();

166
lib/gpu/lal_charmm.cpp Normal file
View File

@ -0,0 +1,166 @@
/***************************************************************************
charmm.cpp
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the charmm/coul pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#if defined(USE_OPENCL)
#include "charmm_cl.h"
#elif defined(USE_CUDART)
const char *charmm_long=0;
#else
#include "charmm_cubin.h"
#endif
#include "lal_charmm.h"
#include <cassert>
namespace LAMMPS_AL {
#define CHARMMT CHARMM<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
// Default constructor: start from the charge-style accelerator base with
// no per-type coefficient tables allocated yet (init() performs allocation).
template <class numtyp, class acctyp>
CHARMMT::CHARMM() : BaseCharge<numtyp,acctyp>(),
                    _allocated(false) {
}
// Destructor: release any device buffers owned by this object via clear().
template <class numtyp, class acctyp>
CHARMMT::~CHARMM() {
  clear();
}
// Estimate of device memory required per atom for this pair style,
// delegated to the atomic base-class accounting (neighbor-list storage
// scales with max_nbors).
template <class numtyp, class acctyp>
int CHARMMT::bytes_per_atom(const int max_nbors) const {
  return this->bytes_per_atom_atomic(max_nbors);
}
// Initialize device storage and kernels for the charmm/coul pair style.
//
// host_lj1..host_lj4 : per-type-pair LJ coefficient tables (packed into lj1)
// host_special_lj / host_special_coul : 4 special-bond scale factors each,
//                      packed together into the 8-entry sp_lj device vector
// epsilon / sigma    : per-type parameters used only by the "fast" shared
//                      kernel path when geometric/arithmetic mixing applies
// Remaining scalars are cached in members for kernel launches.
//
// Returns 0 on success, or the nonzero error code from init_atomic().
template <class numtyp, class acctyp>
int CHARMMT::init(const int ntypes, double host_cut_bothsq, double **host_lj1,
                  double **host_lj2, double **host_lj3, double **host_lj4,
                  double *host_special_lj, const int nlocal, const int nall,
                  const int max_nbors, const int maxspecial,
                  const double cell_size, const double gpu_split,
                  FILE *_screen, double host_cut_ljsq,
                  const double host_cut_coulsq, double *host_special_coul,
                  const double qqrd2e, const double cut_lj_innersq,
                  const double cut_coul_innersq, const double denom_lj,
                  const double denom_coul, double **epsilon,
                  double **sigma, const bool mix_arithmetic) {
  int success;
  // Compile/load the "k_charmm" kernel and size base-class buffers.
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
                            gpu_split,_screen,charmm,"k_charmm");
  if (success!=0)
    return success;

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  int max_bio_shared_types=this->device->max_bio_shared_types();
  // NOTE(review): fast path additionally requires a block size >= 64 and
  // mixable (arithmetic) per-type parameters -- confirm against kernel.
  if (this->_block_bio_size>=64 && mix_arithmetic &&
      lj_types<=max_bio_shared_types)
    shared_types=true;
  _lj_types=lj_types;

  // Allocate a host write buffer for data initialization
  int h_size=lj_types*lj_types;
  if (h_size<max_bio_shared_types)
    h_size=max_bio_shared_types;

  // *32 oversizes the staging buffer; zero it so unused slots are defined.
  UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
                               UCL_WRITE_ONLY);
  for (int i=0; i<h_size*32; i++)
    host_write[i]=0.0;

  // Pack the four LJ coefficient tables into one numtyp4 per type pair.
  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
                         host_lj3,host_lj4);

  // Fast kernel mixes epsilon/sigma on the fly from per-type values.
  if (shared_types) {
    ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
    this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
  }

  // Special-bond factors: LJ in slots 0-3, Coulomb in slots 4-7.
  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<4; i++) {
    host_write[i]=host_special_lj[i];
    host_write[i+4]=host_special_coul[i];
  }
  ucl_copy(sp_lj,host_write,8,false);

  // Cache cutoffs and prefactors for kernel launch arguments.
  _cut_bothsq = host_cut_bothsq;
  _cut_coulsq = host_cut_coulsq;
  _cut_ljsq = host_cut_ljsq;
  _cut_lj_innersq = cut_lj_innersq;
  _cut_coul_innersq = cut_coul_innersq;
  _qqrd2e=qqrd2e;
  _denom_lj=denom_lj;
  _denom_coul=denom_coul;

  _allocated=true;
  this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
  return 0;
}
// Free the per-type coefficient buffers and the base-class storage.
// Safe to call repeatedly; a no-op unless init() completed.
template <class numtyp, class acctyp>
void CHARMMT::clear() {
  if (_allocated) {
    _allocated = false;
    // Device-side coefficient tables
    lj1.clear();
    ljd.clear();
    sp_lj.clear();
    // Base-class neighbor/atom/answer storage
    this->clear_atomic();
  }
}
// Total host memory consumed by this pair style: base-class buffers plus
// the object itself.
template <class numtyp, class acctyp>
double CHARMMT::host_memory_usage() const {
  double bytes = this->host_memory_usage_atomic();
  bytes += sizeof(CHARMM<numtyp,acctyp>);
  return bytes;
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
// Launch the pair kernel for one timestep and return the grid size used.
// k_pair_sel points at the shared-memory ("fast") kernel variant selected
// during init; k_pair is the general kernel.
template <class numtyp, class acctyp>
int CHARMMT::loop(const int eflag, const int vflag) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=this->_block_bio_size;
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  // NOTE(review): both branches pass &ljd, but init() only allocates ljd
  // when shared_types is true -- verify the non-shared launch.
  if (shared_types) {
    this->k_pair_sel->set_size(GX,BX);
    this->k_pair_sel->run(&this->atom->x, &ljd, &sp_lj,
                          &this->nbor->dev_nbor, this->_nbor_data,
                          &this->ans->force, &this->ans->engv, &eflag,
                          &vflag, &ainum, &nbor_pitch, &this->atom->q,
                          &_cut_coulsq, &_qqrd2e, &_denom_lj, &_denom_coul,
                          &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
                          &_cut_coul_innersq, &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->x, &ljd, &sp_lj,
                     &this->nbor->dev_nbor, this->_nbor_data,
                     &this->ans->force, &this->ans->engv, &eflag,
                     &vflag, &ainum, &nbor_pitch, &this->atom->q,
                     &_cut_coulsq, &_qqrd2e, &_denom_lj, &_denom_coul,
                     &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
                     &_cut_coul_innersq, &this->_threads_per_atom);
  }
  this->time_pair.stop();
  return GX;
}
template class CHARMM<PRECISION,ACC_PRECISION>;
}

303
lib/gpu/lal_charmm.cu Normal file
View File

@ -0,0 +1,303 @@
// **************************************************************************
// charmm.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for acceleration of the charmm/coul pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#endif
// General pair kernel for lj/charmm/coul/charmm.  LJ coefficients are mixed
// on the fly from the per-type table ljd (x=epsilon, y=sigma): geometric
// mixing for epsilon, arithmetic for sigma.  Both the LJ and Coulomb terms
// use CHARMM energy switching between their inner cutoffs and
// cut_ljsq/cut_coulsq respectively.
__kernel void k_charmm(const __global numtyp4 *restrict x_,
                       const __global numtyp2 *restrict ljd,
                       const __global numtyp *restrict sp_lj,
                       const __global int *dev_nbor,
                       const __global int *dev_packed,
                       __global acctyp4 *restrict ans,
                       __global acctyp *restrict engv,
                       const int eflag, const int vflag,
                       const int inum, const int nbor_pitch,
                       const __global numtyp *restrict q_,
                       const numtyp cut_coulsq, const numtyp qqrd2e,
                       const numtyp denom_lj,
                       const numtyp denom_coul,
                       const numtyp cut_bothsq,
                       const numtyp cut_ljsq,
                       const numtyp cut_lj_innersq,
                       const numtyp cut_coul_innersq,
                       const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  int n_stride;
  local_allocate_store_bio();

  // Per-thread force and (optional) energy/virial accumulators
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
    energy=(acctyp)0;
    e_coul=(acctyp)0;
    for (int i=0; i<6; i++) virial[i]=(acctyp)0;
  }

  if (ii<inum) {
    int nbor, nbor_end;
    int i, numj;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
      int j=dev_packed[nbor];

      // High bits of j encode the special-bond category; sp_lj holds the
      // LJ factors in [0..3] and the Coulomb factors in [4..7]
      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];
      factor_coul = sp_lj[sbmask(j)+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq<cut_bothsq) {
        numtyp r2inv=ucl_recip(rsq);
        numtyp forcecoul, force_lj, force, switch1;
        numtyp lj3, lj4;

        if (rsq < cut_ljsq) {
          // On-the-fly mixing: eps=sqrt(eps_i*eps_j), sigma=(sig_i+sig_j)/2
          numtyp eps = ucl_sqrt(ljd[itype].x*ljd[jtype].x);
          numtyp sig6 = (numtyp)0.5 * (ljd[itype].y+ljd[jtype].y);
          numtyp sig_r_6 = sig6*sig6*r2inv;
          sig_r_6 = sig_r_6*sig_r_6*sig_r_6;
          lj4 = (numtyp)4.0*eps*sig_r_6;
          lj3 = lj4*sig_r_6;
          force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4);
          if (rsq > cut_lj_innersq) {
            // CHARMM switching region: switch1 scales the energy,
            // switch2 adds the derivative contribution to the force
            switch1 = (cut_ljsq-rsq);
            numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)*
              denom_lj;
            switch1 *= switch1;
            switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)*
              denom_lj;
            switch2 *= lj3-lj4;
            force_lj = force_lj*switch1+switch2;
          }
        } else
          force_lj = (numtyp)0.0;

        if (rsq < cut_coulsq) {
          // forcecoul holds qqrd2e*qi*qj/r (the Coulomb energy); the final
          // multiply by r2inv below converts it to force over r
          numtyp rinv = ucl_rsqrt(rsq);
          fetch(forcecoul,j,q_tex);
          forcecoul *= factor_coul * qqrd2e * qtmp * rinv;
          if (rsq > cut_coul_innersq) {
            numtyp switch3 = (cut_coulsq-rsq) * (cut_coulsq-rsq) *
              (cut_coulsq + (numtyp)2.0*rsq - (numtyp)3.0*cut_coul_innersq) *
              denom_coul;
            forcecoul *= switch3;
          }
        } else
          forcecoul = (numtyp)0.0;

        force = (force_lj + forcecoul) * r2inv;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (EVFLAG && eflag) {
          e_coul += forcecoul;
          if (rsq < cut_ljsq) {
            numtyp e=lj3-lj4;
            if (rsq > cut_lj_innersq)
              e *= switch1;
            energy+=factor_lj*e;
          }
        }
        if (EVFLAG && vflag) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }
    } // for nbor
  } // if ii

  // Reduce per-thread partials and write force/energy/virial answers
  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                  vflag,ans,engv);
}
// "Fast" variant of k_charmm: the per-type epsilon/sigma table and the
// special-bond factors are first staged in local (shared) memory by the
// work-group; the host only selects this kernel when the type count fits
// (see the shared_types test in the host init()).
__kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
                            const __global numtyp2 *restrict ljd_in,
                            const __global numtyp *restrict sp_lj_in,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
                            __global acctyp4 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag, const int vflag,
                            const int inum, const int nbor_pitch,
                            const __global numtyp *restrict q_,
                            const numtyp cut_coulsq, const numtyp qqrd2e,
                            const numtyp denom_lj,
                            const numtyp denom_coul,
                            const numtyp cut_bothsq,
                            const numtyp cut_ljsq,
                            const numtyp cut_lj_innersq,
                            const numtyp cut_coul_innersq,
                            const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Stage coefficient tables in local memory; two strided loads cover up
  // to MAX_BIO_SHARED_TYPES entries per block of BLOCK_BIO_PAIR threads
  __local numtyp2 ljd[MAX_BIO_SHARED_TYPES];
  __local numtyp sp_lj[8];
  int n_stride;
  local_allocate_store_bio();

  if (tid<8)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_BIO_SHARED_TYPES)
    ljd[tid]=ljd_in[tid];
  if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
    ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];

  // Per-thread force and (optional) energy/virial accumulators
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
    energy=(acctyp)0;
    e_coul=(acctyp)0;
    for (int i=0; i<6; i++) virial[i]=(acctyp)0;
  }

  // All threads must pass the barrier before the staged tables are read
  __syncthreads();

  if (ii<inum) {
    int nbor, nbor_end;
    int i, numj;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
      int j=dev_packed[nbor];

      // Special-bond factors from the staged sp_lj table
      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];
      factor_coul = sp_lj[sbmask(j)+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq<cut_bothsq) {
        numtyp r2inv=ucl_recip(rsq);
        numtyp forcecoul, force_lj, force, switch1;
        numtyp lj3, lj4;

        if (rsq < cut_ljsq) {
          // On-the-fly mixing from the staged table (eps geometric,
          // sigma arithmetic)
          numtyp eps = ucl_sqrt(ljd[itype].x*ljd[jtype].x);
          numtyp sig6 = (numtyp)0.5 * (ljd[itype].y+ljd[jtype].y);
          numtyp sig_r_6 = sig6*sig6*r2inv;
          sig_r_6 = sig_r_6*sig_r_6*sig_r_6;
          lj4 = (numtyp)4.0*eps*sig_r_6;
          lj3 = lj4*sig_r_6;
          force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4);
          if (rsq > cut_lj_innersq) {
            // CHARMM LJ switching: switch1 scales energy, switch2 adds
            // the derivative term to the force
            switch1 = (cut_ljsq-rsq);
            numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)*
              denom_lj;
            switch1 *= switch1;
            switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)*
              denom_lj;
            switch2 *= lj3-lj4;
            force_lj = force_lj*switch1+switch2;
          }
        } else
          force_lj = (numtyp)0.0;

        if (rsq < cut_coulsq) {
          // forcecoul holds the Coulomb energy qqrd2e*qi*qj/r; the later
          // multiply by r2inv converts it to force over r
          numtyp rinv = ucl_rsqrt(rsq);
          fetch(forcecoul,j,q_tex);
          forcecoul *= factor_coul * qqrd2e * qtmp * rinv;
          if (rsq > cut_coul_innersq) {
            numtyp switch3 = (cut_coulsq-rsq) * (cut_coulsq-rsq) *
              (cut_coulsq + (numtyp)2.0*rsq - (numtyp)3.0*cut_coul_innersq) *
              denom_coul;
            forcecoul *= switch3;
          }
        } else
          forcecoul = (numtyp)0.0;

        force = (force_lj + forcecoul) * r2inv;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (EVFLAG && eflag) {
          e_coul += forcecoul;
          if (rsq < cut_ljsq) {
            numtyp e=lj3-lj4;
            if (rsq > cut_lj_innersq)
              e *= switch1;
            energy+=factor_lj*e;
          }
        }
        if (EVFLAG && vflag) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }
    } // for nbor
  } // if ii

  // Reduce per-thread partials and write force/energy/virial answers
  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                  vflag,ans,engv);
}

89
lib/gpu/lal_charmm.h Normal file
View File

@ -0,0 +1,89 @@
/***************************************************************************
charmm.h
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the charmm/coul pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_CHARMM_
#define LAL_CHARMM_
#include "lal_base_charge.h"
namespace LAMMPS_AL {
// Accelerator class for the lj/charmm/coul/charmm pair style; inherits the
// charge-aware neighbor/compute machinery from BaseCharge.
template <class numtyp, class acctyp>
class CHARMM : public BaseCharge<numtyp, acctyp> {
 public:
  CHARMM();
  ~CHARMM();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    *
    * Returns:
    * - 0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init(const int ntypes, double host_cut_bothsq,
           double **host_lj1, double **host_lj2, double **host_lj3,
           double **host_lj4, double *host_special_lj,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen, double host_cut_ljsq,
           const double host_cut_coulsq, double *host_special_coul,
           const double qqrd2e, const double cut_lj_innersq,
           const double cut_coul_innersq, const double denom_lj,
           const double denom_coul, double **epsilon, double **sigma,
           const bool mix_arithmetic);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// x = lj1, y = lj2, z = lj3, w = lj4
  UCL_D_Vec<numtyp4> lj1;
  /// x = epsilon, y = sigma
  UCL_D_Vec<numtyp2> ljd;
  /// Special LJ values [0-3] and Special Coul values [4-7]
  UCL_D_Vec<numtyp> sp_lj;
  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;
  /// Number of atom types
  int _lj_types;
  /// Coulomb prefactor and LJ/Coulomb switching-function denominators
  numtyp _qqrd2e, _denom_lj, _denom_coul;
  /// Squared cutoffs (overall, LJ, Coulomb) and inner switching bounds
  numtyp _cut_coulsq, _cut_bothsq, _cut_ljsq, _cut_lj_innersq;
  numtyp _cut_coul_innersq;

 private:
  /// Set when init() has allocated the device buffers above
  bool _allocated;
  /// Launch the pair kernel; returns the grid size used
  int loop(const int eflag, const int vflag);
};
}
#endif

137
lib/gpu/lal_charmm_ext.cpp Normal file
View File

@ -0,0 +1,137 @@
/***************************************************************************
charmm_ext.cpp
-------------------
W. Michael Brown (ORNL)
Functions for LAMMPS access to charmm/coul/charmm acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <cmath>
#include "lal_charmm.h"
using namespace std;
using namespace LAMMPS_AL;
static CHARMM<PRECISION,ACC_PRECISION> CRMMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
// Initialize the lj/charmm/coul/charmm GPU accelerator for all MPI ranks.
// World rank 0 initializes (and compiles the kernels) first; the remaining
// ranks then initialize one per-GPU "core" at a time, separated by
// barriers, so device setup and progress messages stay ordered.
// gpu_mode receives the device's configured acceleration mode.
// Returns 0 on success or the error code from CHARMM::init().
int crm_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
                 double **host_lj2, double **host_lj3, double **host_lj4,
                 double *special_lj, const int inum,
                 const int nall, const int max_nbors, const int maxspecial,
                 const double cell_size, int &gpu_mode, FILE *screen,
                 double host_cut_ljsq, double host_cut_coulsq,
                 double *host_special_coul, const double qqrd2e,
                 const double cut_lj_innersq, const double cut_coul_innersq,
                 const double denom_lj, const double denom_coul,
                 double **epsilon, double **sigma,
                 const bool mix_arithmetic) {
  CRMMF.clear();
  gpu_mode=CRMMF.device->gpu_mode();
  double gpu_split=CRMMF.device->particle_split();
  int first_gpu=CRMMF.device->first_device();
  int last_gpu=CRMMF.device->last_device();
  int world_me=CRMMF.device->world_me();
  int gpu_rank=CRMMF.device->gpu_rank();
  int procs_per_gpu=CRMMF.device->procs_per_gpu();

  CRMMF.device->init_message(screen,"lj/charmm/coul/charmm",first_gpu,
                             last_gpu);

  // Only replica 0 with a valid screen prints progress messages
  bool message=false;
  if (CRMMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

  // Rank 0 goes first so kernel compilation happens once before other
  // ranks attach.
  // NOTE(review): the return value of this first init() call is discarded
  // (init_ok stays 0) -- verify a failure on process 0 is caught elsewhere.
  int init_ok=0;
  if (world_me==0)
    CRMMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4,
               special_lj, inum, nall, max_nbors, maxspecial, cell_size,
               gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
               host_special_coul, qqrd2e, cut_lj_innersq, cut_coul_innersq,
               denom_lj, denom_coul, epsilon, sigma, mix_arithmetic);

  CRMMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  // Remaining ranks initialize round-robin by their rank on the GPU
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=CRMMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
                         host_lj4, special_lj, inum, nall, max_nbors,
                         maxspecial, cell_size, gpu_split, screen,
                         host_cut_ljsq, host_cut_coulsq, host_special_coul,
                         qqrd2e, cut_lj_innersq, cut_coul_innersq, denom_lj,
                         denom_coul, epsilon, sigma, mix_arithmetic);

    CRMMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    CRMMF.estimate_gpu_overhead();
  return init_ok;
}
// Free all host and device memory held by the pair style accelerator.
void crm_gpu_clear() {
  CRMMF.clear();
}
// Compute forces, building the neighbor list on the device; forwards to
// BaseCharge::compute.
// NOTE(review): the meaning of the int** return and the ilist/jnum/
// host_start outputs comes from BaseCharge::compute, which is not visible
// here -- confirm against lal_base_charge.
int** crm_gpu_compute_n(const int ago, const int inum_full, const int nall,
                        double **host_x, int *host_type, double *sublo,
                        double *subhi, tagint *tag, int **nspecial,
                        tagint **special, const bool eflag, const bool vflag,
                        const bool eatom, const bool vatom, int &host_start,
                        int **ilist, int **jnum, const double cpu_time,
                        bool &success, double *host_q, double *boxlo,
                        double *prd) {
  return CRMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                       subhi, tag, nspecial, special, eflag, vflag, eatom,
                       vatom, host_start, ilist, jnum, cpu_time, success,
                       host_q, boxlo, prd);
}
// Compute forces using a caller-supplied neighbor list (ilist/numj/
// firstneigh -- presumably built on the host; see BaseCharge::compute);
// forwards to BaseCharge::compute.
void crm_gpu_compute(const int ago, const int inum_full, const int nall,
                     double **host_x, int *host_type, int *ilist, int *numj,
                     int **firstneigh, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom, int &host_start,
                     const double cpu_time, bool &success, double *host_q,
                     const int nlocal, double *boxlo, double *prd) {
  CRMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
                eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q,
                nlocal,boxlo,prd);
}
// Report the host memory (bytes) used by the pair style accelerator.
double crm_gpu_bytes() {
  return CRMMF.host_memory_usage();
}

View File

@ -131,20 +131,9 @@ double CHARMMLongT::host_memory_usage() const {
// Calculate energies, forces, and torques // Calculate energies, forces, and torques
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { int CHARMMLongT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy // Compute the block size and grid size to keep all cores busy
const int BX=this->_block_bio_size; const int BX=this->_block_bio_size;
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -152,8 +141,8 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch(); int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start(); this->time_pair.start();
if (shared_types) { if (shared_types) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_sel->set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj, this->k_pair_sel->run(&this->atom->x, &ljd, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q, &vflag, &ainum, &nbor_pitch, &this->atom->q,
@ -171,6 +160,7 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
&this->_threads_per_atom); &this->_threads_per_atom);
} }
this->time_pair.stop(); this->time_pair.stop();
return GX;
} }
template class CHARMMLong<PRECISION,ACC_PRECISION>; template class CHARMMLong<PRECISION,ACC_PRECISION>;

View File

@ -47,18 +47,21 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
int tid, ii, offset; int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
acctyp energy=(acctyp)0; int n_stride;
acctyp e_coul=(acctyp)0; local_allocate_store_bio();
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -122,7 +125,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
if (rsq < cut_coulsq) if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul); e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cut_ljsq) { if (rsq < cut_ljsq) {
@ -132,7 +135,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
energy+=factor_lj*e; energy+=factor_lj*e;
} }
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -143,9 +146,9 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }
__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
@ -168,6 +171,9 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
__local numtyp2 ljd[MAX_BIO_SHARED_TYPES]; __local numtyp2 ljd[MAX_BIO_SHARED_TYPES];
__local numtyp sp_lj[8]; __local numtyp sp_lj[8];
int n_stride;
local_allocate_store_bio();
if (tid<8) if (tid<8)
sp_lj[tid]=sp_lj_in[tid]; sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_BIO_SHARED_TYPES) if (tid<MAX_BIO_SHARED_TYPES)
@ -175,20 +181,20 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES) if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR]; ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -258,7 +264,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
if (rsq < cut_coulsq) if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul); e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cut_ljsq) { if (rsq < cut_ljsq) {
@ -268,7 +274,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*e; energy+=factor_lj*e;
} }
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -277,10 +283,9 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
virial[5] += dely*delz*force; virial[5] += dely*delz*force;
} }
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }

View File

@ -79,7 +79,7 @@ class CHARMMLong : public BaseCharge<numtyp, acctyp> {
private: private:
bool _allocated; bool _allocated;
void loop(const bool _eflag, const bool _vflag); int loop(const int eflag, const int vflag);
}; };
} }

View File

@ -60,7 +60,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4, CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial, cell_size, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq, gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj, host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
epsilon,sigma,mix_arithmetic); epsilon,sigma,mix_arithmetic);
@ -80,7 +80,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
} }
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300, host_lj4, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul, host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon, qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,

View File

@ -140,20 +140,9 @@ double ColloidT::host_memory_usage() const {
// Calculate energies, forces, and torques // Calculate energies, forces, and torques
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void ColloidT::loop(const bool _eflag, const bool _vflag) { int ColloidT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy // Compute the block size and grid size to keep all cores busy
const int BX=this->block_size(); const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -161,8 +150,8 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch(); int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start(); this->time_pair.start();
if (shared_types) { if (shared_types) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_sel->set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj,
&colloid1, &colloid2, &form, &colloid1, &colloid2, &form,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &this->ans->force, &this->ans->engv, &eflag, &vflag,
@ -176,6 +165,7 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) {
&ainum, &nbor_pitch, &this->_threads_per_atom); &ainum, &nbor_pitch, &this->_threads_per_atom);
} }
this->time_pair.stop(); this->time_pair.stop();
return GX;
} }
template class Colloid<PRECISION,ACC_PRECISION>; template class Colloid<PRECISION,ACC_PRECISION>;

View File

@ -42,22 +42,25 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4]; __local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();
sp_lj[0]=sp_lj_in[0]; sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1]; sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -146,7 +149,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
numtyp e=(numtyp)0.0; numtyp e=(numtyp)0.0;
if (form[mtype]==0) { if (form[mtype]==0) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
@ -160,7 +163,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
} }
energy+=factor_lj*(e-lj3[mtype].z); energy+=factor_lj*(e-lj3[mtype].z);
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -171,9 +174,9 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv); ans,engv);
} // if ii
} }
__kernel void k_colloid_fast(const __global numtyp4 *restrict x_, __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
@ -198,6 +201,9 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
__local numtyp4 colloid2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 colloid2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4]; __local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();
if (tid<4) if (tid<4)
sp_lj[tid]=sp_lj_in[tid]; sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
@ -205,23 +211,23 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
colloid1[tid]=colloid1_in[tid]; colloid1[tid]=colloid1_in[tid];
colloid2[tid]=colloid2_in[tid]; colloid2[tid]=colloid2_in[tid];
form[tid]=form_in[tid]; form[tid]=form_in[tid];
if (eflag>0) if (EVFLAG && eflag)
lj3[tid]=lj3_in[tid]; lj3[tid]=lj3_in[tid];
} }
acctyp energy=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -310,7 +316,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
numtyp e=(numtyp)0.0; numtyp e=(numtyp)0.0;
if (form[mtype]==0) { if (form[mtype]==0) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
@ -325,7 +331,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
} }
energy+=factor_lj*(e-lj3[mtype].z); energy+=factor_lj*(e-lj3[mtype].z);
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -336,8 +342,8 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv); ans,engv);
} // if ii
} }

View File

@ -81,7 +81,7 @@ class Colloid : public BaseAtomic<numtyp, acctyp> {
private: private:
bool _allocated; bool _allocated;
void loop(const bool _eflag, const bool _vflag); int loop(const int eflag, const int vflag);
}; };
} }

View File

@ -60,7 +60,7 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, host_a12, host_a1, host_lj4, offset, special_lj, host_a12, host_a1,
host_a2, host_d1, host_d2, host_sigma3, host_a2, host_d1, host_d2, host_sigma3,
host_sigma6, host_form, inum, nall, 300, host_sigma6, host_form, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen); maxspecial, cell_size, gpu_split, screen);
COLLMF.device->world_barrier(); COLLMF.device->world_barrier();
@ -80,7 +80,7 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, host_a12, host_a1, host_a2, offset, special_lj, host_a12, host_a1, host_a2,
host_d1, host_d2, host_sigma3, host_sigma6, host_form, host_d1, host_d2, host_sigma3, host_sigma6, host_form,
inum, nall, 300, maxspecial, inum, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen); cell_size, gpu_split, screen);
COLLMF.device->gpu_barrier(); COLLMF.device->gpu_barrier();

View File

@ -125,20 +125,9 @@ double CoulT::host_memory_usage() const {
// Calculate energies, forces, and torques // Calculate energies, forces, and torques
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void CoulT::loop(const bool _eflag, const bool _vflag) { int CoulT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy // Compute the block size and grid size to keep all cores busy
const int BX=this->block_size(); const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -146,8 +135,8 @@ void CoulT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch(); int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start(); this->time_pair.start();
if (shared_types) { if (shared_types) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_sel->set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, this->k_pair_sel->run(&this->atom->x, &scale, &sp_cl,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q, &vflag, &ainum, &nbor_pitch, &this->atom->q,
@ -161,6 +150,7 @@ void CoulT::loop(const bool _eflag, const bool _vflag) {
&cutsq, &_qqrd2e, &this->_threads_per_atom); &cutsq, &_qqrd2e, &this->_threads_per_atom);
} }
this->time_pair.stop(); this->time_pair.stop();
return GX;
} }
template class Coul<PRECISION,ACC_PRECISION>; template class Coul<PRECISION,ACC_PRECISION>;

View File

@ -46,22 +46,25 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_cl[8]; __local numtyp sp_cl[8];
int n_stride;
local_allocate_store_charge();
sp_cl[0]=sp_cl_in[0]; sp_cl[0]=sp_cl_in[0];
sp_cl[1]=sp_cl_in[1]; sp_cl[1]=sp_cl_in[1];
sp_cl[2]=sp_cl_in[2]; sp_cl[2]=sp_cl_in[2];
sp_cl[3]=sp_cl_in[3]; sp_cl[3]=sp_cl_in[3];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
if (ii<inum) { if (ii<inum) {
int i, numj, nbor, nbor_end; int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -98,10 +101,10 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
e_coul += forcecoul; e_coul += forcecoul;
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -112,9 +115,9 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }
__kernel void k_coul_fast(const __global numtyp4 *restrict x_, __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
@ -134,25 +137,28 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_cl[4]; __local numtyp sp_cl[4];
int n_stride;
local_allocate_store_charge();
if (tid<4) if (tid<4)
sp_cl[tid]=sp_cl_in[tid]; sp_cl[tid]=sp_cl_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
cutsq[tid]=_cutsq[tid]; cutsq[tid]=_cutsq[tid];
} }
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int i, numj, nbor, nbor_end; int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -189,10 +195,10 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
e_coul += forcecoul; e_coul += forcecoul;
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -203,8 +209,8 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }

View File

@ -75,7 +75,7 @@ class Coul : public BaseCharge<numtyp, acctyp> {
private: private:
bool _allocated; bool _allocated;
void loop(const bool _eflag, const bool _vflag); int loop(const int eflag, const int vflag);
}; };
} }

View File

@ -126,20 +126,9 @@ double CoulDebyeT::host_memory_usage() const {
// Calculate energies, forces, and torques // Calculate energies, forces, and torques
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { int CoulDebyeT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy // Compute the block size and grid size to keep all cores busy
const int BX=this->block_size(); const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -147,8 +136,8 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch(); int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start(); this->time_pair.start();
if (shared_types) { if (shared_types) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_sel->set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, this->k_pair_sel->run(&this->atom->x, &scale, &sp_cl,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &cutsq, &ainum, &nbor_pitch, &this->atom->q, &cutsq,
@ -162,6 +151,7 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) {
&_qqrd2e, &_kappa, &this->_threads_per_atom); &_qqrd2e, &_kappa, &this->_threads_per_atom);
} }
this->time_pair.stop(); this->time_pair.stop();
return GX;
} }
template class CoulDebye<PRECISION,ACC_PRECISION>; template class CoulDebye<PRECISION,ACC_PRECISION>;

View File

@ -47,22 +47,25 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_cl[4]; __local numtyp sp_cl[4];
int n_stride;
local_allocate_store_charge();
sp_cl[0]=sp_cl_in[0]; sp_cl[0]=sp_cl_in[0];
sp_cl[1]=sp_cl_in[1]; sp_cl[1]=sp_cl_in[1];
sp_cl[2]=sp_cl_in[2]; sp_cl[2]=sp_cl_in[2];
sp_cl[3]=sp_cl_in[3]; sp_cl[3]=sp_cl_in[3];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
if (ii<inum) { if (ii<inum) {
int i, numj, nbor, nbor_end; int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -102,10 +105,10 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
e_coul+=qqrd2e*scale[mtype]*qtmp*rinv*screening*factor_coul; e_coul+=qqrd2e*scale[mtype]*qtmp*rinv*screening*factor_coul;
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -116,9 +119,9 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }
__kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_, __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
@ -140,6 +143,9 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
__local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_cl[4]; __local numtyp sp_cl[4];
int n_stride;
local_allocate_store_charge();
if (tid<4) if (tid<4)
sp_cl[tid]=sp_cl_in[tid]; sp_cl[tid]=sp_cl_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) { if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
@ -147,19 +153,19 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
cutsq[tid]=_cutsq[tid]; cutsq[tid]=_cutsq[tid];
} }
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int i, numj, nbor, nbor_end; int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -199,10 +205,10 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
e_coul+=qqrd2e*scale[mtype]*qtmp*rinv*screening*factor_coul; e_coul+=qqrd2e*scale[mtype]*qtmp*rinv*screening*factor_coul;
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -213,8 +219,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }

View File

@ -76,7 +76,7 @@ class CoulDebye : public BaseCharge<numtyp, acctyp> {
private: private:
bool _allocated; bool _allocated;
void loop(const bool _eflag, const bool _vflag); int loop(const int eflag, const int vflag);
}; };
} }

View File

@ -54,7 +54,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq,
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, 300, init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa); maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa);
CDEMF.device->world_barrier(); CDEMF.device->world_barrier();
@ -71,7 +71,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq,
fflush(screen); fflush(screen);
} }
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, 300, init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa); maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa);
CDEMF.device->gpu_barrier(); CDEMF.device->gpu_barrier();

View File

@ -110,20 +110,9 @@ double CoulDSFT::host_memory_usage() const {
// Calculate energies, forces, and torques // Calculate energies, forces, and torques
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void CoulDSFT::loop(const bool _eflag, const bool _vflag) { int CoulDSFT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy // Compute the block size and grid size to keep all cores busy
const int BX=this->block_size(); const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
@ -131,8 +120,8 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch(); int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start(); this->time_pair.start();
if (shared_types) { if (shared_types) {
this->k_pair_fast.set_size(GX,BX); this->k_pair_sel->set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &sp_lj, this->k_pair_sel->run(&this->atom->x, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q, &vflag, &ainum, &nbor_pitch, &this->atom->q,
@ -148,6 +137,7 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) {
&this->_threads_per_atom); &this->_threads_per_atom);
} }
this->time_pair.stop(); this->time_pair.stop();
return GX;
} }
template class CoulDSF<PRECISION,ACC_PRECISION>; template class CoulDSF<PRECISION,ACC_PRECISION>;

View File

@ -48,30 +48,33 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4]; __local numtyp sp_lj[4];
int n_stride;
local_allocate_store_charge();
sp_lj[0]=sp_lj_in[0]; sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1]; sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2]; sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3]; sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
if (eflag>0) { if (EVFLAG && eflag) {
acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self; e_coul += (acctyp)2.0*e_self;
@ -111,11 +114,11 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul); numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul);
e_coul += e; e_coul += e;
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -126,9 +129,9 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }
__kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
@ -147,30 +150,33 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset); atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4]; __local numtyp sp_lj[4];
int n_stride;
local_allocate_store_charge();
if (tid<4) if (tid<4)
sp_lj[tid]=sp_lj_in[tid]; sp_lj[tid]=sp_lj_in[tid];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f; acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6]; acctyp energy, e_coul, virial[6];
for (int i=0; i<6; i++) if (EVFLAG) {
virial[i]=(acctyp)0; energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
__syncthreads(); __syncthreads();
if (ii<inum) { if (ii<inum) {
int nbor, nbor_end; int nbor, nbor_end;
int i, numj; int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex); numtyp qtmp; fetch(qtmp,i,q_tex);
if (eflag>0) { if (EVFLAG && eflag) {
acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self; e_coul += (acctyp)2.0*e_self;
@ -210,11 +216,11 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force; f.y+=dely*force;
f.z+=delz*force; f.z+=delz*force;
if (eflag>0) { if (EVFLAG && eflag) {
numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul); numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul);
e_coul += e; e_coul += e;
} }
if (vflag>0) { if (EVFLAG && vflag) {
virial[0] += delx*delx*force; virial[0] += delx*delx*force;
virial[1] += dely*dely*force; virial[1] += dely*dely*force;
virial[2] += delz*delz*force; virial[2] += delz*delz*force;
@ -225,8 +231,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
} }
} // for nbor } // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv); vflag,ans,engv);
} // if ii
} }

View File

@ -70,7 +70,7 @@ class CoulDSF : public BaseCharge<numtyp, acctyp> {
private: private:
bool _allocated; bool _allocated;
numtyp _e_shift, _f_shift, _alpha, _cut_coulsq; numtyp _e_shift, _f_shift, _alpha, _cut_coulsq;
void loop(const bool _eflag, const bool _vflag); int loop(const int eflag, const int vflag);
}; };
} }

View File

@ -55,7 +55,7 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall,
int init_ok=0; int init_ok=0;
if (world_me==0) if (world_me==0)
init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, init_ok=CDMF.init(ntypes, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_coulsq, host_special_coul, gpu_split, screen, host_cut_coulsq, host_special_coul,
qqrd2e, e_shift, f_shift, alpha); qqrd2e, e_shift, f_shift, alpha);
@ -73,7 +73,7 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall,
fflush(screen); fflush(screen);
} }
if (gpu_rank==i && world_me!=0) if (gpu_rank==i && world_me!=0)
init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, init_ok=CDMF.init(ntypes, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_coulsq, host_special_coul, gpu_split, screen, host_cut_coulsq, host_special_coul,
qqrd2e, e_shift, f_shift, alpha); qqrd2e, e_shift, f_shift, alpha);

Some files were not shown because too many files have changed in this diff Show More