git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6053 f3b2605a-c512-4ea7-a41b-209d697bcdaa
@@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include
 CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON
 
 BIN_DIR = ./
@@ -17,16 +17,16 @@
 # Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
-CUDA_HOME = /sw/analysis-x64/cuda/3.0/sl5.0_binary/
+CUDA_HOME = /sw/analysis-x64/cuda/3.2/centos5.5_binary/
 NVCC = nvcc
 
 CUDA_ARCH = -arch=sm_13
-CUDA_PRECISION = -D_SINGLE_SINGLE
+CUDA_PRECISION = -D_SINGLE_DOUBLE
 CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON -openmp
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -openmp
 CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias
 
 BIN_DIR = ./
@@ -24,7 +24,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT
 CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops
 
 BIN_DIR = ./
@@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
 
 BIN_DIR = ./
@@ -17,7 +17,7 @@
 # Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
-OCL_CPP = mpic++ -I./geryon/opencl -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 OCL_LINK = -lOpenCL
 OCL_PREC = -D_SINGLE_SINGLE
 
@@ -23,7 +23,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(TACC_CUDA_LIB) -Wl,-rpath,$(TACC_CUDA_LIB)
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
 
 BIN_DIR = ./
@@ -24,7 +24,7 @@ CUDA_ARCH = -arch=sm_11
 CUDA_PRECISION = -D_SINGLE_SINGLE
 CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib
-CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -m32
+CUDA_OPTS = -DUNIX -DUCL_NO_EXIT -O3 -Xptxas -v --use_fast_math -m32
 
 CUDR_CPP = mpic++
 CUDR_OPTS = -O2 -m32 -g
@@ -17,7 +17,7 @@
 # Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
-OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON
+OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON -DUCL_NO_EXIT
 OCL_LINK = -framework OpenCL
 OCL_PREC = -D_SINGLE_SINGLE
 
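Every machine Makefile in this commit adds -DUCL_NO_EXIT to the host compiler flags. The macro name comes straight from the diff; its apparent purpose, consistent with the bool-to-int init_atomic() change later in the commit, is to make the Geryon layer report failures through return codes instead of terminating the process. A minimal C++ sketch of that compile-time switch, with the macro body assumed for illustration only (this is not Geryon's actual source):

#include <cstdio>
#include <cstdlib>

// Assumed shape of the switch -DUCL_NO_EXIT selects: with the macro defined,
// a failed call becomes a status code the caller can propagate; without it,
// the failure aborts the whole process.
#ifdef UCL_NO_EXIT
#define UCL_CHECK(ok, code) do { if (!(ok)) return (code); } while (0)
#else
#define UCL_CHECK(ok, code) \
  do { if (!(ok)) { std::fprintf(stderr, "UCL error\n"); std::exit(1); } } while (0)
#endif

// Hypothetical caller: returns 0 on success, a negative code on failure.
int init_device_sketch(bool device_found) {
  UCL_CHECK(device_found, -1);
  return 0;
}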
@@ -14,6 +14,7 @@
 # /* ----------------------------------------------------------------------
 # Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 #                       Peng Wang (Nvidia), penwang@nvidia.com
+#                       Inderaj Bains (NVIDIA), ibains@nvidia.com
 #                       Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
@@ -28,10 +29,11 @@ GPU_LIB = $(LIB_DIR)/libgpu.a
 # Headers for Geryon
 UCL_H = $(wildcard ./geryon/ucl*.h)
 NVC_H = $(wildcard ./geryon/nvc*.h) $(UCL_H)
-NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H)
+NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) nv_kernel_def.h
 # Headers for Pair Stuff
-PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
-         pair_gpu_device.h pair_gpu_balance.h
+PAIR_H = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
+         pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
+         pair_gpu_balance.h pppm_gpu_memory.h
 
 ALL_H = $(NVD_H) $(PAIR_H)
 
@@ -39,28 +41,37 @@ EXECS = $(BIN_DIR)/nvc_get_devices
 CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
         $(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \
         $(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
-OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
-       $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
-       $(OBJ_DIR)/charge_gpu_memory.o \
+OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
+       $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
+       $(OBJ_DIR)/pair_gpu_device.o \
+       $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
+       $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
        $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
+       $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
        $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
        $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
+       $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
        $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
        $(CUDPP)
-PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
+PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \
+       $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
        $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \
        $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \
+       $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h \
+       $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h \
        $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \
        $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \
        $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \
        $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
+       $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h \
        $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \
        $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \
-       $(OBJ_DIR)/crml_cut_gpu_kernel.ptx $(OBJ_DIR)/crml_cut_gpu_ptx.h \
+       $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h \
+       $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h \
        $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
        $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h
 
@@ -93,6 +104,9 @@ $(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx
 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h
 	$(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(NVD_H)
+	$(CUDR) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu
 
@@ -105,11 +119,20 @@ $(OBJ_DIR)/pair_gpu_build_kernel.ptx: pair_gpu_build_kernel.cu
 $(OBJ_DIR)/pair_gpu_build_ptx.h: $(OBJ_DIR)/pair_gpu_build_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h
 
-$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
+$(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
+	$(CUDR) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H)
-	$(CUDR) -o $@ -c pair_gpu_device.cpp
+$(OBJ_DIR)/pair_gpu_dev_kernel.ptx: pair_gpu_dev_kernel.cu
+	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_dev_kernel.cu
+
+$(OBJ_DIR)/pair_gpu_dev_ptx.h: $(OBJ_DIR)/pair_gpu_dev_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_dev_kernel.ptx $(OBJ_DIR)/pair_gpu_dev_ptx.h
+
+$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_ptx.h
+	$(CUDR) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
 	$(CUDR) -o $@ -c atomic_gpu_memory.cpp
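The two-step rules above (nvcc --ptx followed by geryon/file_to_cstr.sh) embed each kernel in the library as text: the .cu file is compiled to PTX, and the shell script wraps that PTX in a C string header (*_ptx.h) that the host code includes and hands to the CUDA driver for just-in-time compilation at run time. A hypothetical C++ sketch of what such a generated header could look like (the symbol name and PTX text are illustrative; file_to_cstr.sh defines the real output format):

// pair_gpu_dev_ptx.h (illustrative reconstruction, not the generated file)
static const char *pair_gpu_dev_kernel =
  "  .version 2.2\n"
  "  .target sm_13\n"
  "  // ... one quoted, escaped line per line of the real PTX ...\n";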
@@ -117,6 +140,24 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.c
 $(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(CUDR) -o $@ -c charge_gpu_memory.cpp
 
+$(OBJ_DIR)/pppm_f_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ pppm_gpu_kernel.cu
+
+$(OBJ_DIR)/pppm_f_gpu_ptx.h: $(OBJ_DIR)/pppm_f_gpu_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h
+
+$(OBJ_DIR)/pppm_d_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ pppm_gpu_kernel.cu
+
+$(OBJ_DIR)/pppm_d_gpu_ptx.h: $(OBJ_DIR)/pppm_d_gpu_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h
+
+$(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_f_gpu_ptx.h $(OBJ_DIR)/pppm_d_gpu_ptx.h
+	$(CUDR) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
+	$(CUDR) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h gb_gpu_extra.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu
 
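The PPPM rules above compile the same pppm_gpu_kernel.cu twice, once with -Dgrdtyp=float -Dgrdtyp4=float4 and once with the double variants, so a single kernel source yields both a single- and a double-precision grid kernel. A plain C++ stand-in for the idea (the function and variable names are illustrative, not from the kernel source):

#ifndef grdtyp
#define grdtyp float   // the Makefile overrides this with -Dgrdtyp=double
#endif

// The same source text compiles to two functions of different precision,
// one per build, without duplicating the kernel.
void scale_grid_sketch(grdtyp *grid, int n, grdtyp factor) {
  for (int i = 0; i < n; ++i)
    grid[i] *= factor;
}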
@@ -144,7 +185,7 @@ $(OBJ_DIR)/lj_cut_gpu_ptx.h: $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_
 $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
+$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp atomic_gpu_memory.h
 	$(CUDR) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx: ljc_cut_gpu_kernel.cu pair_gpu_precision.h
@@ -156,7 +197,7 @@ $(OBJ_DIR)/ljc_cut_gpu_ptx.h: $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_c
 $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
+$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h
@@ -168,9 +209,21 @@ $(OBJ_DIR)/ljcl_cut_gpu_ptx.h: $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc
 $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
+$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/morse_gpu_kernel.ptx: morse_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ morse_gpu_kernel.cu
+
+$(OBJ_DIR)/morse_gpu_ptx.h: $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h
+
+$(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
+	$(CUDR) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp atomic_gpu_memory.h
+	$(CUDR) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/crml_gpu_kernel.ptx: crml_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel.cu
 
@@ -180,7 +233,7 @@ $(OBJ_DIR)/crml_gpu_ptx.h: $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ke
 $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
+$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h
@@ -192,9 +245,21 @@ $(OBJ_DIR)/lj96_cut_gpu_ptx.h: $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj9
 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
+$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp atomic_gpu_memory.h
 	$(CUDR) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/lj_expand_gpu_kernel.ptx: lj_expand_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ lj_expand_gpu_kernel.cu
+
+$(OBJ_DIR)/lj_expand_gpu_ptx.h: $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h
+
+$(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
+	$(CUDR) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp atomic_gpu_memory.h
+	$(CUDR) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx: cmm_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ cmm_cut_gpu_kernel.cu
 
@@ -204,7 +269,7 @@ $(OBJ_DIR)/cmm_cut_gpu_ptx.h: $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_c
 $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
+$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp atomic_gpu_memory.h
 	$(CUDR) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx: cmmc_long_gpu_kernel.cu pair_gpu_precision.h
@@ -216,7 +281,7 @@ $(OBJ_DIR)/cmmc_long_gpu_ptx.h: $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/c
 $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
+$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
 
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H)
@@ -14,6 +14,7 @@
 # /* ----------------------------------------------------------------------
 # Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 #                       Peng Wang (Nvidia), penwang@nvidia.com
+#                       Inderaj Bains (NVIDIA), ibains@nvidia.com
 #                       Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
@@ -23,29 +24,36 @@ OCL_LIB = $(LIB_DIR)/libgpu.a
 UCL_H = $(wildcard ./geryon/ucl*.h)
 OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H)
 # Headers for Pair Stuff
-PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
-         pair_gpu_device.h pair_gpu_balance.h
+PAIR_H = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
+         pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
+         pair_gpu_balance.h pppm_gpu_memory.h
 
 ALL_H = $(OCL_H) $(PAIR_H)
 
 EXECS = $(BIN_DIR)/ocl_get_devices
-OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
-       $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
-       $(OBJ_DIR)/charge_gpu_memory.o \
+OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
+       $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_nbor.o \
+       $(OBJ_DIR)/pair_gpu_device.o \
+       $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
+       $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
        $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
+       $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
        $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
        $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
+       $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
        $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o
-KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \
+KERS = $(OBJ_DIR)/pair_gpu_dev_cl.h $(OBJ_DIR)/pair_gpu_atom_cl.h \
+       $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/pppm_gpu_cl.h \
        $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \
        $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \
-       $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \
-       $(OBJ_DIR)/crml_gpu_cl.h \
-       $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h
+       $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h \
+       $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/morse_gpu_cl.h \
+       $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h \
+       $(OBJ_DIR)/cmmc_long_gpu_cl.h
 
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
 
@@ -57,14 +65,23 @@ $(OBJ_DIR)/pair_gpu_atom_cl.h: pair_gpu_atom_kernel.cu
 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(OCL_H) $(OBJ_DIR)/pair_gpu_atom_cl.h
 	$(OCL) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(OCL_H)
+	$(OCL) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/pair_gpu_nbor_cl.h: pair_gpu_nbor_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_nbor_kernel.cu $(OBJ_DIR)/pair_gpu_nbor_cl.h
 
-$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
+$(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
+	$(OCL) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h
 	$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H)
-	$(OCL) -o $@ -c pair_gpu_device.cpp
+$(OBJ_DIR)/pair_gpu_dev_cl.h: pair_gpu_dev_kernel.cu
+	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_dev_kernel.cu $(OBJ_DIR)/pair_gpu_dev_cl.h
+
+$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_cl.h
+	$(OCL) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
 	$(OCL) -o $@ -c atomic_gpu_memory.cpp
@@ -72,6 +89,15 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.c
 $(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(OCL) -o $@ -c charge_gpu_memory.cpp
 
+$(OBJ_DIR)/pppm_gpu_cl.h: pppm_gpu_kernel.cu
+	$(BSH) ./geryon/file_to_cstr.sh pppm_gpu_kernel.cu $(OBJ_DIR)/pppm_gpu_cl.h;
+
+$(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_gpu_cl.h $(OBJ_DIR)/pppm_gpu_cl.h
+	$(OCL) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
+	$(OCL) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu
 	$(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h
 
@@ -93,7 +119,7 @@ $(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu
 $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
+$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp atomic_gpu_memory.h
 	$(OCL) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
@@ -102,7 +128,7 @@ $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
 $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
+$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
@@ -111,16 +137,25 @@ $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
 $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
+$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/morse_gpu_cl.h: morse_gpu_kernel.cu
+	$(BSH) ./geryon/file_to_cstr.sh morse_gpu_kernel.cu $(OBJ_DIR)/morse_gpu_cl.h;
+
+$(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
+	$(OCL) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp atomic_gpu_memory.h
+	$(OCL) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/crml_gpu_cl.h: crml_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh crml_gpu_kernel.cu $(OBJ_DIR)/crml_gpu_cl.h;
 
 $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
+$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
@@ -129,16 +164,25 @@ $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
+$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp atomic_gpu_memory.h
 	$(OCL) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/lj_expand_gpu_cl.h: lj_expand_gpu_kernel.cu
+	$(BSH) ./geryon/file_to_cstr.sh lj_expand_gpu_kernel.cu $(OBJ_DIR)/lj_expand_gpu_cl.h;
+
+$(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
+	$(OCL) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp atomic_gpu_memory.h
+	$(OCL) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/cmm_cut_gpu_cl.h: cmm_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh cmm_cut_gpu_kernel.cu $(OBJ_DIR)/cmm_cut_gpu_cl.h;
 
 $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
+$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp atomic_gpu_memory.h
 	$(OCL) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
@@ -147,7 +191,7 @@ $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
 $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
+$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
 
 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
@@ -14,6 +14,7 @@
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
                          Peng Wang (Nvidia), penwang@nvidia.com
+                         Inderaj Bains (NVIDIA), ibains@nvidia.com
                          Paul Crozier (SNL), pscrozi@sandia.gov
 ------------------------------------------------------------------------- */
 
@@ -23,19 +23,24 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 template <class numtyp, class acctyp>
 AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) {
   device=&pair_gpu_device;
+  ans=new PairGPUAns<numtyp,acctyp>();
+  nbor=new PairGPUNbor();
 }
 
 template <class numtyp, class acctyp>
 AtomicGPUMemoryT::~AtomicGPUMemory() {
+  delete ans;
+  delete nbor;
 }
 
 template <class numtyp, class acctyp>
 int AtomicGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
-  return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
 }
 
 template <class numtyp, class acctyp>
-bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
+int AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
                                    const int max_nbors, const int maxspecial,
                                    const double cell_size,
                                    const double gpu_split, FILE *_screen,
@@ -48,24 +53,30 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
     gpu_nbor=true;
 
   int _gpu_host=0;
-  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
   if (host_nlocal>0)
     _gpu_host=1;
 
-  if (!device->init(false,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
-                    _gpu_host,max_nbors,cell_size,false))
-    return false;
+  _threads_per_atom=device->threads_per_atom();
+  if (_threads_per_atom>1 && gpu_nbor==false) {
+    nbor->packing(true);
+    _nbor_data=&(nbor->dev_packed);
+  } else
+    _nbor_data=&(nbor->dev_nbor);
+
+  int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
+                           maxspecial,_gpu_host,max_nbors,cell_size,false);
+  if (success!=0)
+    return success;
+
   ucl_device=device->gpu;
   atom=&device->atom;
-  nbor=&device->nbor;
 
-  _block_size=BLOCK_1D;
-  if (static_cast<size_t>(_block_size)>ucl_device->group_size())
-    _block_size=ucl_device->group_size();
+  _block_size=device->pair_block_size();
   compile_kernels(*ucl_device,pair_program);
 
   // Initialize host-device load balancer
-  hd_balancer.init(device,gpu_split);
+  hd_balancer.init(device,gpu_nbor,gpu_split);
 
   // Initialize timers for the selected GPU
   time_pair.init(*ucl_device);
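The reworked initialization above makes a choice the rest of the class reads through members: when more than one thread will cooperate on an atom and neighboring is done on the host, the neighbor list is repacked, and _nbor_data is pointed at whichever device buffer the kernels should use. A standalone C++ sketch of that select-a-view idiom, with types simplified (std::vector stands in for the Geryon device buffers in the diff):

#include <vector>

struct NborSketch {
  std::vector<int> dev_nbor;    // stand-in for the raw device list
  std::vector<int> dev_packed;  // stand-in for the repacked list
  bool pack = false;
  void packing(bool p) { pack = p; }
};

// Mirrors the branch in init_atomic(): pick the packed layout only when
// several threads share one atom and the list was built on the host.
std::vector<int> *select_nbor_view(NborSketch &nbor, int threads_per_atom,
                                   bool gpu_nbor) {
  if (threads_per_atom > 1 && !gpu_nbor) {
    nbor.packing(true);
    return &nbor.dev_packed;
  }
  return &nbor.dev_nbor;
}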
@@ -73,9 +84,14 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
 
   pos_tex.bind_float(atom->dev_x,4);
 
-  _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
 
-  return true;
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void AtomicGPUMemoryT::estimate_gpu_overhead() {
+  device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
 }
 
 template <class numtyp, class acctyp>
@@ -83,7 +99,10 @@ void AtomicGPUMemoryT::clear_atomic() {
   // Output any timing information
   acc_timers();
   double avg_split=hd_balancer.all_avg_split();
-  device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);
+  _gpu_overhead*=hd_balancer.timestep();
+  _driver_overhead*=hd_balancer.timestep();
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
+                       _gpu_overhead,_driver_overhead,_threads_per_atom,screen);
 
   if (_compiled) {
     k_pair_fast.clear();
@@ -107,8 +126,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
   success=true;
 
   nbor_time_avail=true;
-
-  int mn=nbor->max_nbor_loop(inum,numj);
+  int mn=nbor->max_nbor_loop(inum,numj,ilist);
   resize_atom(inum,nall,success);
   resize_local(inum,mn,success);
   if (!success)
@@ -116,7 +134,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
 
   nbor->get_host(inum,ilist,numj,firstneigh,block_size());
 
-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
 
@@ -130,8 +148,8 @@ template <class numtyp, class acctyp>
 inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
                                               const int host_inum,
                                               const int nall, double **host_x,
-                                              int *host_type, double *boxlo,
-                                              double *boxhi, int *tag,
+                                              int *host_type, double *sublo,
+                                              double *subhi, int *tag,
                                               int **nspecial, int **special,
                                               bool &success) {
   nbor_time_avail=true;
@@ -144,10 +162,10 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
   atom->cast_copy_x(host_x,host_type);
 
   int mn;
-  nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
+  nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
                         nspecial, special, success, mn);
 
-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
 }
@@ -156,9 +174,8 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
-                               const int inum_full, const int nall,
-                               double **host_x, int *host_type,
+void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full,
+                               const int nall, double **host_x, int *host_type,
                                int *ilist, int *numj, int **firstneigh,
                                const bool eflag, const bool vflag,
                                const bool eatom, const bool vatom,
@@ -166,14 +183,16 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
                                bool &success) {
   acc_timers();
   if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
     zero_timers();
     return;
   }
 
   int ago=hd_balancer.ago_first(f_ago);
-  int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
-                               nbor->gpu_nbor());
-  atom->inum(inum);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
+  ans->inum(inum);
   host_start=inum;
 
   if (ago==0) {
@@ -187,7 +206,8 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
     atom->add_x_data(host_x,host_type);
 
   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();
 }
 
@@ -195,29 +215,32 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
 // Reneighbor on GPU if necessary and then compute forces, virials, energies
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-int * AtomicGPUMemoryT::compute(const int timestep, const int ago,
-                                const int inum_full, const int nall,
-                                double **host_x, int *host_type, double *boxlo,
-                                double *boxhi, int *tag, int **nspecial,
-                                int **special, const bool eflag,
+int ** AtomicGPUMemoryT::compute(const int ago, const int inum_full,
+                                 const int nall, double **host_x, int *host_type,
+                                 double *sublo, double *subhi, int *tag,
+                                 int **nspecial, int **special, const bool eflag,
                                 const bool vflag, const bool eatom,
                                 const bool vatom, int &host_start,
+                                int **ilist, int **jnum,
                                 const double cpu_time, bool &success) {
   acc_timers();
   if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
     zero_timers();
     return NULL;
   }
 
-  hd_balancer.balance(cpu_time,nbor->gpu_nbor());
-  int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
-  atom->inum(inum);
+  hd_balancer.balance(cpu_time);
+  int inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
   host_start=inum;
 
   // Build neighbor list on GPU if necessary
   if (ago==0) {
     build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
-                    boxlo, boxhi, tag, nspecial, special, success);
+                    sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return NULL;
     hd_balancer.start_timer();
|
|||||||
hd_balancer.start_timer();
|
hd_balancer.start_timer();
|
||||||
atom->add_x_data(host_x,host_type);
|
atom->add_x_data(host_x,host_type);
|
||||||
}
|
}
|
||||||
|
*ilist=nbor->host_ilist.begin();
|
||||||
|
*jnum=nbor->host_acc.begin();
|
||||||
|
|
||||||
loop(eflag,vflag);
|
loop(eflag,vflag);
|
||||||
atom->copy_answers(eflag,vflag,eatom,vatom);
|
ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||||
|
device->add_ans_object(ans);
|
||||||
hd_balancer.stop_timer();
|
hd_balancer.stop_timer();
|
||||||
|
|
||||||
return device->nbor.host_nbor.begin();
|
return nbor->host_jlist.begin()-host_start;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
double AtomicGPUMemoryT::host_memory_usage_atomic() const {
|
double AtomicGPUMemoryT::host_memory_usage_atomic() const {
|
||||||
return device->atom.host_memory_usage()+
|
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
|
||||||
device->nbor.host_memory_usage()+4*sizeof(numtyp)+
|
4*sizeof(numtyp)+sizeof(AtomicGPUMemory<numtyp,acctyp>);
|
||||||
sizeof(AtomicGPUMemory<numtyp,acctyp>);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
|
|||||||
@ -18,8 +18,6 @@
|
|||||||
#ifndef ATOMIC_GPU_MEMORY_H
|
#ifndef ATOMIC_GPU_MEMORY_H
|
||||||
#define ATOMIC_GPU_MEMORY_H
|
#define ATOMIC_GPU_MEMORY_H
|
||||||
|
|
||||||
#define BLOCK_1D 64
|
|
||||||
|
|
||||||
#include "pair_gpu_device.h"
|
#include "pair_gpu_device.h"
|
||||||
#include "pair_gpu_balance.h"
|
#include "pair_gpu_balance.h"
|
||||||
#include "mpi.h"
|
#include "mpi.h"
|
||||||
@@ -39,17 +37,28 @@ class AtomicGPUMemory {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init_atomic(const int nlocal, const int nall, const int max_nbors,
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init_atomic(const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size,
                   const double gpu_split, FILE *screen,
                   const char *pair_program);
 
+  /// Estimate the overhead for GPU context changes and CPU driver
+  void estimate_gpu_overhead();
+
   /// Check if there is enough storage for atom arrays and realloc if not
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int inum, const int nall, bool &success) {
-    if (atom->resize(inum, nall, success))
+    if (atom->resize(nall, success))
       pos_tex.bind_float(atom->dev_x,4);
+    ans->resize(inum,success);
   }
 
   /// Check if there is enough storage for neighbors and realloc if not
@ -85,6 +94,7 @@ class AtomicGPUMemory {
|
|||||||
|
|
||||||
/// Accumulate timers
|
/// Accumulate timers
|
||||||
inline void acc_timers() {
|
inline void acc_timers() {
|
||||||
|
if (device->time_device()) {
|
||||||
if (nbor_time_avail) {
|
if (nbor_time_avail) {
|
||||||
nbor->time_nbor.add_to_total();
|
nbor->time_nbor.add_to_total();
|
||||||
nbor->time_kernel.add_to_total();
|
nbor->time_kernel.add_to_total();
|
||||||
@ -92,6 +102,8 @@ class AtomicGPUMemory {
|
|||||||
}
|
}
|
||||||
time_pair.add_to_total();
|
time_pair.add_to_total();
|
||||||
atom->acc_timers();
|
atom->acc_timers();
|
||||||
|
ans->acc_timers();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Zero timers
|
/// Zero timers
|
||||||
@ -99,6 +111,7 @@ class AtomicGPUMemory {
|
|||||||
nbor_time_avail=false;
|
nbor_time_avail=false;
|
||||||
time_pair.zero();
|
time_pair.zero();
|
||||||
atom->zero_timers();
|
atom->zero_timers();
|
||||||
|
ans->zero_timers();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Copy neighbor list from host
|
/// Copy neighbor list from host
|
||||||
@ -108,24 +121,32 @@ class AtomicGPUMemory {
|
|||||||
/// Build neighbor list on device
|
/// Build neighbor list on device
|
||||||
void build_nbor_list(const int inum, const int host_inum,
|
void build_nbor_list(const int inum, const int host_inum,
|
||||||
const int nall, double **host_x, int *host_type,
|
const int nall, double **host_x, int *host_type,
|
||||||
double *boxlo, double *boxhi, int *tag, int **nspecial,
|
double *sublo, double *subhi, int *tag, int **nspecial,
|
||||||
int **special, bool &success);
|
int **special, bool &success);
|
||||||
|
|
||||||
/// Pair loop with host neighboring
|
/// Pair loop with host neighboring
|
||||||
void compute(const int timestep, const int f_ago, const int inum_full,
|
void compute(const int f_ago, const int inum_full,
|
||||||
const int nall, double **host_x, int *host_type,
|
const int nall, double **host_x, int *host_type,
|
||||||
int *ilist, int *numj, int **firstneigh, const bool eflag,
|
int *ilist, int *numj, int **firstneigh, const bool eflag,
|
||||||
const bool vflag, const bool eatom, const bool vatom,
|
const bool vflag, const bool eatom, const bool vatom,
|
||||||
int &host_start, const double cpu_time, bool &success);
|
int &host_start, const double cpu_time, bool &success);
|
||||||
|
|
||||||
/// Pair loop with device neighboring
|
/// Pair loop with device neighboring
|
||||||
int * compute(const int timestep, const int ago, const int inum_full,
|
int * compute(const int ago, const int inum_full,
|
||||||
const int nall, double **host_x, int *host_type, double *boxlo,
|
const int nall, double **host_x, int *host_type, double *sublo,
|
||||||
double *boxhi, int *tag, int **nspecial,
|
double *subhi, int *tag, int **nspecial,
|
||||||
int **special, const bool eflag, const bool vflag,
|
int **special, const bool eflag, const bool vflag,
|
||||||
const bool eatom, const bool vatom, int &host_start,
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
const double cpu_time, bool &success);
|
const double cpu_time, bool &success);
|
||||||
|
|
||||||
|
/// Pair loop with device neighboring
|
||||||
|
int ** compute(const int ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type, double *sublo,
|
||||||
|
double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **numj, const double cpu_time, bool &success);
|
||||||
|
|
||||||
// -------------------------- DEVICE DATA -------------------------
|
// -------------------------- DEVICE DATA -------------------------
|
||||||
|
|
||||||
/// Device Properties and Atom and Neighbor storage
|
/// Device Properties and Atom and Neighbor storage
|
||||||
@ -148,6 +169,9 @@ class AtomicGPUMemory {
|
|||||||
/// Atom Data
|
/// Atom Data
|
||||||
PairGPUAtom<numtyp,acctyp> *atom;
|
PairGPUAtom<numtyp,acctyp> *atom;
|
||||||
|
|
||||||
|
// ------------------------ FORCE/ENERGY DATA -----------------------
|
||||||
|
|
||||||
|
PairGPUAns<numtyp,acctyp> *ans;
|
||||||
|
|
||||||
// --------------------------- NBOR DATA ----------------------------
|
// --------------------------- NBOR DATA ----------------------------
|
||||||
|
|
||||||
@ -167,8 +191,10 @@ class AtomicGPUMemory {
|
|||||||
|
|
||||||
protected:
|
protected:
|
||||||
bool _compiled;
|
bool _compiled;
|
||||||
int _block_size;
|
int _block_size, _threads_per_atom;
|
||||||
double _max_bytes, _max_an_bytes;
|
double _max_bytes, _max_an_bytes;
|
||||||
|
double _gpu_overhead, _driver_overhead;
|
||||||
|
UCL_D_Vec<int> *_nbor_data;
|
||||||
|
|
||||||
void compile_kernels(UCL_Device &dev, const char *pair_string);
|
void compile_kernels(UCL_Device &dev, const char *pair_string);
|
||||||
|
|
||||||
|
|||||||
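init_atomic() switches from a bool to an int status code, with the meanings enumerated in the doc comment above. A hedged sketch of how a caller could translate those codes into messages; the helper itself is illustrative and not part of this library:

// Illustrative only: map the documented init_atomic() return codes to text.
#include <cstdio>

const char *init_error_string(int code) {
  switch (code) {
    case  0: return "success";
    case -1: return "fix gpu not found";
    case -3: return "out of memory on device";
    case -4: return "GPU library not compiled for this accelerator";
    case -5: return "double precision not supported on this card";
    default: return "unknown error";
  }
}

int main() {
  std::printf("%s\n", init_error_string(-3));  // "out of memory on device"
}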
@@ -23,19 +23,24 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 template <class numtyp, class acctyp>
 ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) {
   device=&pair_gpu_device;
+  ans=new PairGPUAns<numtyp,acctyp>();
+  nbor=new PairGPUNbor();
 }

 template <class numtyp, class acctyp>
 ChargeGPUMemoryT::~ChargeGPUMemory() {
+  delete ans;
+  delete nbor;
 }

 template <class numtyp, class acctyp>
 int ChargeGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
-  return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
 }

 template <class numtyp, class acctyp>
-bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
+int ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
                                    const int max_nbors, const int maxspecial,
                                    const double cell_size,
                                    const double gpu_split, FILE *_screen,
@@ -48,24 +53,31 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
     gpu_nbor=true;

   int _gpu_host=0;
-  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
   if (host_nlocal>0)
     _gpu_host=1;

-  if (!device->init(true,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
-                    _gpu_host,max_nbors,cell_size,false))
-    return false;
+  _threads_per_atom=device->threads_per_charge();
+  if (_threads_per_atom>1 && gpu_nbor==false) {
+    nbor->packing(true);
+    _nbor_data=&(nbor->dev_packed);
+  } else
+    _nbor_data=&(nbor->dev_nbor);
+
+  int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
+                           maxspecial,_gpu_host,max_nbors,cell_size,false);
+  if (success!=0)
+    return success;

   ucl_device=device->gpu;
   atom=&device->atom;
-  nbor=&device->nbor;

-  _block_size=BLOCK_1D;
-  if (static_cast<size_t>(_block_size)>ucl_device->group_size())
-    _block_size=ucl_device->group_size();
+  _block_size=device->pair_block_size();
+  _block_bio_size=device->block_bio_pair();
   compile_kernels(*ucl_device,pair_program);

   // Initialize host-device load balancer
-  hd_balancer.init(device,gpu_split);
+  hd_balancer.init(device,gpu_nbor,gpu_split);

   // Initialize timers for the selected GPU
   time_pair.init(*ucl_device);
@@ -74,9 +86,14 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
   pos_tex.bind_float(atom->dev_x,4);
   q_tex.bind_float(atom->dev_q,1);

-  _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();

-  return true;
+  return success;
+}
+
+template <class numtyp, class acctyp>
+void ChargeGPUMemoryT::estimate_gpu_overhead() {
+  device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
 }

 template <class numtyp, class acctyp>
@@ -84,7 +101,10 @@ void ChargeGPUMemoryT::clear_atomic() {
   // Output any timing information
   acc_timers();
   double avg_split=hd_balancer.all_avg_split();
-  device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);
+  _gpu_overhead*=hd_balancer.timestep();
+  _driver_overhead*=hd_balancer.timestep();
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
+                       _gpu_overhead,_driver_overhead,_threads_per_atom,screen);

   if (_compiled) {
     k_pair_fast.clear();
@@ -109,7 +129,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,

   nbor_time_avail=true;

-  int mn=nbor->max_nbor_loop(inum,numj);
+  int mn=nbor->max_nbor_loop(inum,numj,ilist);
   resize_atom(inum,nall,success);
   resize_local(inum,mn,success);
   if (!success)
@@ -117,7 +137,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,

   nbor->get_host(inum,ilist,numj,firstneigh,block_size());

-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;

@@ -131,8 +151,8 @@ template <class numtyp, class acctyp>
 inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
                                               const int host_inum,
                                               const int nall, double **host_x,
-                                              int *host_type, double *boxlo,
-                                              double *boxhi, int *tag,
+                                              int *host_type, double *sublo,
+                                              double *subhi, int *tag,
                                               int **nspecial, int **special,
                                               bool &success) {
   nbor_time_avail=true;
@@ -145,10 +165,10 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
   atom->cast_copy_x(host_x,host_type);

   int mn;
-  nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
+  nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
                         nspecial, special, success, mn);

-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
 }
@@ -157,24 +177,26 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
-                               const int inum_full, const int nall,
-                               double **host_x, int *host_type,
+void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full,
+                               const int nall, double **host_x, int *host_type,
                                int *ilist, int *numj, int **firstneigh,
                                const bool eflag, const bool vflag,
                                const bool eatom, const bool vatom,
                                int &host_start, const double cpu_time,
-                               bool &success, double *host_q) {
+                               bool &success, double *host_q,
+                               const int nlocal, double *boxlo, double *prd) {
   acc_timers();
   if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
     zero_timers();
     return;
   }

   int ago=hd_balancer.ago_first(f_ago);
-  int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
-                               nbor->gpu_nbor());
-  atom->inum(inum);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
+  ans->inum(inum);
   host_start=inum;

   if (ago==0) {
@@ -187,10 +209,14 @@ void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
   atom->cast_q_data(host_q);
   hd_balancer.start_timer();
   atom->add_x_data(host_x,host_type);
-  atom->add_other_data();
+  atom->add_q_data();
+
+  device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
+                     boxlo, prd);

   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();
 }

@@ -198,30 +224,33 @@ void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
 // Reneighbor on GPU if necessary and then compute forces, virials, energies
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-int * ChargeGPUMemoryT::compute(const int timestep, const int ago,
-                                const int inum_full, const int nall,
-                                double **host_x, int *host_type, double *boxlo,
-                                double *boxhi, int *tag, int **nspecial,
-                                int **special, const bool eflag,
+int** ChargeGPUMemoryT::compute(const int ago, const int inum_full,
+                                const int nall, double **host_x, int *host_type,
+                                double *sublo, double *subhi, int *tag,
+                                int **nspecial, int **special, const bool eflag,
                                 const bool vflag, const bool eatom,
                                 const bool vatom, int &host_start,
+                                int **ilist, int **jnum,
                                 const double cpu_time, bool &success,
-                                double *host_q) {
+                                double *host_q, double *boxlo, double *prd) {
   acc_timers();
   if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
     zero_timers();
     return NULL;
   }

-  hd_balancer.balance(cpu_time,nbor->gpu_nbor());
-  int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
-  atom->inum(inum);
+  hd_balancer.balance(cpu_time);
+  int inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
   host_start=inum;

   // Build neighbor list on GPU if necessary
   if (ago==0) {
     build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
-                    boxlo, boxhi, tag, nspecial, special, success);
+                    sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return NULL;
     atom->cast_q_data(host_q);
@@ -232,20 +261,25 @@ int * ChargeGPUMemoryT::compute(const int timestep, const int ago,
     hd_balancer.start_timer();
     atom->add_x_data(host_x,host_type);
   }
-  atom->add_other_data();
+  atom->add_q_data();
+  *ilist=nbor->host_ilist.begin();
+  *jnum=nbor->host_acc.begin();
+
+  device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
+                     boxlo, prd);

   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom);
+  ans->copy_answers(eflag,vflag,eatom,vatom);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();

-  return device->nbor.host_nbor.begin();
+  return nbor->host_jlist.begin()-host_start;
 }

 template <class numtyp, class acctyp>
 double ChargeGPUMemoryT::host_memory_usage_atomic() const {
-  return device->atom.host_memory_usage()+
-         device->nbor.host_memory_usage()+4*sizeof(numtyp)+
-         sizeof(ChargeGPUMemory<numtyp,acctyp>);
+  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
+         4*sizeof(numtyp)+sizeof(ChargeGPUMemory<numtyp,acctyp>);
 }

 template <class numtyp, class acctyp>
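Throughout these hunks the balancer calls lose their explicit timestep argument and the class sizes its work from ans->inum(). The split itself is governed by gpu_split, documented above as the fraction of particles handled by the device. A simplified sketch of a fixed-fraction split under that documented meaning; the real balancer can apparently also adapt the split from measured CPU and GPU times, which this sketch does not attempt:

// Sketch only: fixed-fraction host/device split.
#include <cmath>
#include <cstdio>

int gpu_count(int inum_full, double gpu_split) {
  // atoms 0..inum-1 go to the GPU, inum..inum_full-1 stay on the host
  return static_cast<int>(std::floor(gpu_split * inum_full));
}

int main() {
  std::printf("%d of 1000 atoms on the GPU\n", gpu_count(1000, 0.75));  // 750
}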
@@ -18,8 +18,6 @@
 #ifndef CHARGE_GPU_MEMORY_H
 #define CHARGE_GPU_MEMORY_H

-#define BLOCK_1D 64
-
 #include "pair_gpu_device.h"
 #include "pair_gpu_balance.h"
 #include "mpi.h"
@@ -39,19 +37,30 @@ class ChargeGPUMemory {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
-   * \param gpu_split fraction of particles handled by device **/
-  bool init_atomic(const int nlocal, const int nall, const int max_nbors,
+   * \param gpu_split fraction of particles handled by device
+   *
+   * Returns:
+   * - 0 if successful
+   * - -1 if fix gpu not found
+   * - -3 if there is an out of memory error
+   * - -4 if the GPU library was not compiled for GPU
+   * - -5 if double precision is not supported on the card **/
+  int init_atomic(const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size,
                   const double gpu_split, FILE *screen,
                   const char *pair_program);

+  /// Estimate the overhead for GPU context changes and CPU driver
+  void estimate_gpu_overhead();
+
   /// Check if there is enough storage for atom arrays and realloc if not
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int inum, const int nall, bool &success) {
-    if (atom->resize(inum, nall, success)) {
+    if (atom->resize(nall, success)) {
       pos_tex.bind_float(atom->dev_x,4);
       q_tex.bind_float(atom->dev_q,1);
     }
+    ans->resize(inum,success);
   }

   /// Check if there is enough storage for neighbors and realloc if not
@@ -87,6 +96,7 @@ class ChargeGPUMemory {

   /// Accumulate timers
   inline void acc_timers() {
+    if (device->time_device()) {
     if (nbor_time_avail) {
       nbor->time_nbor.add_to_total();
       nbor->time_kernel.add_to_total();
@@ -94,6 +104,8 @@ class ChargeGPUMemory {
     }
     time_pair.add_to_total();
     atom->acc_timers();
+    ans->acc_timers();
+    }
   }

   /// Zero timers
@@ -101,6 +113,7 @@ class ChargeGPUMemory {
     nbor_time_avail=false;
     time_pair.zero();
     atom->zero_timers();
+    ans->zero_timers();
   }

   /// Copy neighbor list from host
@@ -110,24 +123,25 @@ class ChargeGPUMemory {
   /// Build neighbor list on device
   void build_nbor_list(const int inum, const int host_inum,
                        const int nall, double **host_x, int *host_type,
-                       double *boxlo, double *boxhi, int *tag, int **nspecial,
+                       double *sublo, double *subhi, int *tag, int **nspecial,
                        int **special, bool &success);

   /// Pair loop with host neighboring
-  void compute(const int timestep, const int f_ago, const int inum_full,
-               const int nall, double **host_x, int *host_type,
-               int *ilist, int *numj, int **firstneigh, const bool eflag,
-               const bool vflag, const bool eatom, const bool vatom,
-               int &host_start, const double cpu_time, bool &success,
-               double *charge);
+  void compute(const int f_ago, const int inum_full, const int nall,
+               double **host_x, int *host_type, int *ilist, int *numj,
+               int **firstneigh, const bool eflag, const bool vflag,
+               const bool eatom, const bool vatom, int &host_start,
+               const double cpu_time, bool &success, double *charge,
+               const int nlocal, double *boxlo, double *prd);

   /// Pair loop with device neighboring
-  int * compute(const int timestep, const int ago, const int inum_full,
-                const int nall, double **host_x, int *host_type, double *boxlo,
-                double *boxhi, int *tag, int **nspecial,
+  int** compute(const int ago, const int inum_full, const int nall,
+                double **host_x, int *host_type, double *sublo,
+                double *subhi, int *tag, int **nspecial,
                 int **special, const bool eflag, const bool vflag,
                 const bool eatom, const bool vatom, int &host_start,
-                const double cpu_time, bool &success, double *charge);
+                int **ilist, int **numj, const double cpu_time, bool &success,
+                double *charge, double *boxlo, double *prd);

   // -------------------------- DEVICE DATA -------------------------

@@ -152,6 +166,10 @@ class ChargeGPUMemory {
   PairGPUAtom<numtyp,acctyp> *atom;

+  // ------------------------ FORCE/ENERGY DATA -----------------------
+
+  PairGPUAns<numtyp,acctyp> *ans;
+
   // --------------------------- NBOR DATA ----------------------------

   /// Neighbor data
@@ -171,8 +189,10 @@ class ChargeGPUMemory {

  protected:
   bool _compiled;
-  int _block_size;
+  int _block_size, _block_bio_size, _threads_per_atom;
   double _max_bytes, _max_an_bytes;
+  double _gpu_overhead, _driver_overhead;
+  UCL_D_Vec<int> *_nbor_data;

   void compile_kernels(UCL_Device &dev, const char *pair_string);
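resize_atom() now grows position/charge storage by nall only and lets the separate answer object grow by inum; the textures are rebound only when atom->resize() reports that the allocation actually moved. A small sketch of that resize-then-rebind pattern, with hypothetical Buffer/Texture types standing in for the UCL vectors and pos_tex:

// Sketch of the resize-then-rebind pattern (hypothetical stand-in types).
struct Buffer {
  float *ptr = nullptr; int cap = 0;
  bool resize(int n) {                 // returns true if storage moved
    if (n <= cap) return false;
    delete [] ptr; ptr = new float[n]; cap = n; return true;
  }
};
struct Texture {
  const float *bound = nullptr;
  void bind(const Buffer &b) { bound = b.ptr; }
};

void resize_atom(Buffer &x, Texture &pos_tex, int nall) {
  if (x.resize(4*nall))   // only rebind when the allocation actually moved
    pos_tex.bind(x);
}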
@@ -28,7 +28,7 @@ static CMM_GPU_Memory<PRECISION,ACC_PRECISION> CMMMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
+int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
                   double **host_lj1, double **host_lj2, double **host_lj3,
                   double **host_lj4, double **offset, double *special_lj,
                   const int inum, const int nall, const int max_nbors,
@@ -54,13 +54,11 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
     fflush(screen);
   }

-  if (world_me==0) {
-    bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
-                            host_lj4, offset, special_lj, inum, nall, 300,
-                            maxspecial, cell_size, gpu_split, screen);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
+                       host_lj4, offset, special_lj, inum, nall, 300,
+                       maxspecial, cell_size, gpu_split, screen);

   CMMMF.device->world_barrier();
   if (message)
@@ -75,45 +73,45 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
-                              host_lj4, offset, special_lj, inum, nall, 300,
-                              maxspecial, cell_size, gpu_split,
-                              screen);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
+                         host_lj4, offset, special_lj, inum, nall, 300,
+                         maxspecial, cell_size, gpu_split, screen);
     CMMMF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+
+  if (init_ok==0)
+    CMMMF.estimate_gpu_overhead();
+  return init_ok;
 }

 void cmm_gpu_clear() {
   CMMMF.clear();
 }

-int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum_full,
-                        const int nall, double **host_x, int *host_type,
-                        double *boxlo, double *boxhi, int *tag, int **nspecial,
-                        int **special, const bool eflag, const bool vflag,
-                        const bool eatom, const bool vatom, int &host_start,
-                        const double cpu_time, bool &success) {
-  return CMMMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
-                       boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                       vatom, host_start, cpu_time, success);
+int** cmm_gpu_compute_n(const int ago, const int inum_full,
+                        const int nall, double **host_x, int *host_type,
+                        double *sublo, double *subhi, int *tag, int **nspecial,
+                        int **special, const bool eflag, const bool vflag,
+                        const bool eatom, const bool vatom, int &host_start,
+                        int **ilist, int **jnum, const double cpu_time,
+                        bool &success) {
+  return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                       subhi, tag, nspecial, special, eflag, vflag, eatom,
+                       vatom, host_start, ilist, jnum, cpu_time, success);
 }

-void cmm_gpu_compute(const int timestep, const int ago, const int inum_full,
-                     const int nall, double **host_x, int *host_type,
-                     int *ilist, int *numj, int **firstneigh,
-                     const bool eflag, const bool vflag, const bool eatom,
-                     const bool vatom, int &host_start, const double cpu_time,
-                     bool &success) {
-  CMMMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
-                firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
+void cmm_gpu_compute(const int ago, const int inum_full, const int nall,
+                     double **host_x, int *host_type, int *ilist, int *numj,
+                     int **firstneigh, const bool eflag, const bool vflag,
+                     const bool eatom, const bool vatom, int &host_start,
+                     const double cpu_time, bool &success) {
+  CMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
+                firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
 }
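The init path above lets world rank 0 initialize first, then the remaining ranks sharing each GPU take their turn between barriers, presumably so that kernel compilation and device setup are not attempted by every process at once. A sketch of the same staggering with plain MPI, using a hypothetical do_init callback in place of CMMMF.init():

// Sketch of the staggered-initialization pattern (illustrative only).
#include <mpi.h>

void staggered_init(MPI_Comm world, MPI_Comm gpu_comm, int last_gpu,
                    int (*do_init)(void)) {
  int world_me, gpu_rank, init_ok = 0;
  MPI_Comm_rank(world, &world_me);
  MPI_Comm_rank(gpu_comm, &gpu_rank);

  if (world_me == 0)               // one rank goes first
    init_ok = do_init();
  MPI_Barrier(world);              // corresponds to world_barrier()

  for (int i = 0; i < last_gpu; i++) {
    if (gpu_rank == i && world_me != 0)
      init_ok = do_init();
    MPI_Barrier(gpu_comm);         // corresponds to gpu_barrier()
  }
  (void)init_ok;                   // the real code propagates this code
}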
@@ -18,8 +18,6 @@
 #ifndef CMM_GPU_KERNEL
 #define CMM_GPU_KERNEL

-#define MAX_SHARED_TYPES 8
-
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
@@ -46,7 +44,7 @@

 #ifdef NV_KERNEL

-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"
 texture<float4> pos_tex;

 #ifdef _DOUBLE_DOUBLE
@@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 #define __inline inline

 #define fetch_pos(i,y) x_[i]
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8

 #endif

@@ -82,19 +82,21 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types,
                           __global numtyp *sp_lj_in, __global int *dev_nbor,
-                          __global acctyp4 *ans, __global acctyp *engv,
-                          const int eflag, const int vflag, const int inum,
-                          const int nall, const int nbor_pitch) {
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag,
+                          const int vflag, const int inum, const int nall,
+                          const int nbor_pitch, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;

   __local numtyp sp_lj[4];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];

-  if (ii<inum) {
-
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0;
@@ -104,18 +106,32 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  if (ii<inum) {
+
   __global int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=nbor_pitch;
   int numj=*nbor;
   nbor+=nbor_pitch;
-  __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+  int n_stride;
+  __global int *list_end;
+  if (dev_nbor==dev_packed) {
+    list_end=nbor+mul24(numj,nbor_pitch);
+    nbor+=mul24(offset,nbor_pitch);
+    n_stride=mul24(t_per_atom,nbor_pitch);
+  } else {
+    nbor=dev_packed+*nbor;
+    list_end=nbor+numj;
+    n_stride=t_per_atom;
+    nbor+=offset;
+  }

   numtyp4 ix=fetch_pos(i,x_); //x_[i];
   int itype=ix.w;

   numtyp factor_lj;
-  for ( ; nbor<list_end; nbor+=nbor_pitch) {
+  for ( ; nbor<list_end; nbor+=n_stride) {

     int j=*nbor;
     factor_lj = sp_lj[sbmask(j)];
@@ -164,8 +180,47 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     }

   } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1=energy;
@@ -184,25 +239,25 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in,
                                __global numtyp* sp_lj_in,__global int *dev_nbor,
-                               __global acctyp4 *ans, __global acctyp *engv,
-                               const int eflag, const int vflag, const int inum,
-                               const int nall, const int nbor_pitch) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                               __global int *dev_packed, __global acctyp4 *ans,
+                               __global acctyp *engv, const int eflag,
+                               const int vflag, const int inum, const int nall,
+                               const int nbor_pitch, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;

   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
-  if (ii<4)
-    sp_lj[ii]=sp_lj_in[ii];
-  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    lj1[ii]=lj1_in[ii];
+  if (tid<4)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
     if (eflag>0)
-      lj3[ii]=lj3_in[ii];
+      lj3[tid]=lj3_in[tid];
   }
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
-  __syncthreads();
-
-  if (ii<inum) {

   acctyp energy=(acctyp)0;
   acctyp4 f;
@@ -213,19 +268,34 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  __syncthreads();
+
+  if (ii<inum) {
   __global int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=nbor_pitch;
   int numj=*nbor;
   nbor+=nbor_pitch;
-  __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+  int n_stride;
+  __global int *list_end;
+  if (dev_nbor==dev_packed) {
+    list_end=nbor+mul24(numj,nbor_pitch);
+    nbor+=mul24(offset,nbor_pitch);
+    n_stride=mul24(t_per_atom,nbor_pitch);
+  } else {
+    nbor=dev_packed+*nbor;
+    list_end=nbor+numj;
+    n_stride=t_per_atom;
+    nbor+=offset;
+  }

   numtyp4 ix=fetch_pos(i,x_); //x_[i];
   int iw=ix.w;
   int itype=mul24((int)MAX_SHARED_TYPES,iw);

   numtyp factor_lj;
-  for ( ; nbor<list_end; nbor+=nbor_pitch) {
+  for ( ; nbor<list_end; nbor+=n_stride) {

     int j=*nbor;
     factor_lj = sp_lj[sbmask(j)];
@@ -273,8 +343,47 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     }

   } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1=energy;
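The kernel changes in this file introduce the t_per_atom scheme: several threads cooperate on one atom, each walking a strided slice of its neighbor list, and the partial forces are combined with a shared-memory tree reduction before lane 0 stores the result. A compact CUDA sketch of the same pattern; BLOCK_PAIR, the row pitch, and the per-pair term are placeholders, and like the kernels above it assumes t_per_atom does not exceed the warp width so the lanes run in lockstep and the reduction loop needs no barrier:

// CUDA sketch of the t_per_atom reduction (launch with blockDim.x == 64).
#define BLOCK_PAIR 64

__global__ void pair_sketch(const int *packed_nbors, const int *numj,
                            float *fx, int inum, int npitch, int t_per_atom) {
  int tid = threadIdx.x;
  int ii = blockIdx.x * (blockDim.x / t_per_atom) + tid / t_per_atom;
  int offset = tid % t_per_atom;       // this thread's lane within the atom

  float f = 0.0f;
  if (ii < inum) {
    const int *nbor = packed_nbors + ii * npitch + offset;
    const int *end  = packed_nbors + ii * npitch + numj[ii];
    for (; nbor < end; nbor += t_per_atom)   // lanes stride through the list
      f += 0.001f * (*nbor);           // placeholder per-pair contribution
  }

  // Tree reduction across the t_per_atom lanes of each atom.
  __shared__ float red_acc[BLOCK_PAIR];
  red_acc[tid] = f;
  for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1)
    if (offset < s) red_acc[tid] += red_acc[tid + s];

  if (ii < inum && offset == 0)        // lane 0 stores the combined force
    fx[ii] = red_acc[tid];
}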
@@ -42,7 +42,7 @@ int CMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }

 template <class numtyp, class acctyp>
-bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
+int CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
                            int **host_cg_type, double **host_lj1,
                            double **host_lj2, double **host_lj3,
                            double **host_lj4, double **host_offset,
@@ -50,14 +50,18 @@ bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
                            const int nall, const int max_nbors,
                            const int maxspecial, const double cell_size,
                            const double gpu_split, FILE *_screen) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,cmm_cut_gpu_kernel);
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
+                            gpu_split,_screen,cmm_cut_gpu_kernel);
+  if (success!=0)
+    return success;

   // If atom type constants fit in shared memory use fast kernel
   int cmm_types=ntypes;
   shared_types=false;
-  if (cmm_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
-    cmm_types=MAX_SHARED_TYPES;
+  int max_shared_types=this->device->max_shared_types();
+  if (cmm_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    cmm_types=max_shared_types;
     shared_types=true;
   }
   _cmm_types=cmm_types;
@@ -84,7 +88,7 @@ bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,

   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }

 template <class numtyp, class acctyp>
@@ -122,9 +126,10 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;

-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));

-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -133,16 +138,18 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch);
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
+                          &ainum, &anall, &nbor_pitch,
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch);
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &anall, &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
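Because each block of BX threads now covers only BX/t_per_atom atoms, loop() sizes the grid as ceil(inum/(BX/t_per_atom)) instead of ceil(inum/BX). A worked example of the arithmetic:

// Worked example of the grid-size change in loop().
#include <cmath>
#include <cstdio>

int main() {
  const int BX = 64, t_per_atom = 4, inum = 1000;
  int atoms_per_block = BX / t_per_atom;                 // 16 atoms per block
  int GX = (int)std::ceil((double)inum / atoms_per_block);  // 63 blocks
  std::printf("GX=%d\n", GX);
}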
@@ -29,8 +29,15 @@ class CMM_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
-   * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq, int **host_cg_type,
+   * \param gpu_split fraction of particles handled by device
+   *
+   * Returns:
+   * - 0 if successful
+   * - -1 if fix gpu not found
+   * - -3 if there is an out of memory error
+   * - -4 if the GPU library was not compiled for GPU
+   * - -5 if double precision is not supported on the card **/
+  int init(const int ntypes, double **host_cutsq, int **host_cg_type,
             double **host_lj1, double **host_lj2, double **host_lj3,
             double **host_lj4, double **host_offset, double *host_special_lj,
             const int nlocal, const int nall, const int max_nbors,
@@ -28,7 +28,7 @@ static CMML_GPU_Memory<PRECISION,ACC_PRECISION> CMMLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
+int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
                    double **host_lj1, double **host_lj2, double **host_lj3,
                    double **host_lj4, double **offset, double *special_lj,
                    const int inum, const int nall, const int max_nbors,
@@ -56,15 +56,12 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
     fflush(screen);
   }

-  if (world_me==0) {
-    bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
-                             host_lj3, host_lj4, offset, special_lj, inum,
-                             nall, 300, maxspecial, cell_size, gpu_split,
-                             screen, host_cut_ljsq, host_cut_coulsq,
-                             host_special_coul, qqrd2e,g_ewald);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
+                        host_lj4, offset, special_lj, inum, nall, 300,
+                        maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
+                        host_cut_coulsq, host_special_coul, qqrd2e,g_ewald);

   CMMLMF.device->world_barrier();
   if (message)
@@ -79,48 +76,51 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
-                               host_lj3, host_lj4, offset, special_lj, inum,
-                               nall, 300, maxspecial, cell_size, gpu_split,
-                               screen, host_cut_ljsq, host_cut_coulsq,
-                               host_special_coul, qqrd2e, g_ewald);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
+                          host_lj4, offset, special_lj, inum, nall, 300,
+                          maxspecial, cell_size, gpu_split, screen,
+                          host_cut_ljsq, host_cut_coulsq, host_special_coul,
+                          qqrd2e, g_ewald);
     CMMLMF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+
+  if (init_ok==0)
+    CMMLMF.estimate_gpu_overhead();
+  return init_ok;
 }

 void cmml_gpu_clear() {
   CMMLMF.clear();
 }

-int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum_full,
-                         const int nall, double **host_x, int *host_type,
-                         double *boxlo, double *boxhi, int *tag, int **nspecial,
-                         int **special, const bool eflag, const bool vflag,
-                         const bool eatom, const bool vatom, int &host_start,
-                         const double cpu_time, bool &success, double *host_q) {
-  return CMMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
-                        boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                        vatom, host_start, cpu_time, success, host_q);
+int** cmml_gpu_compute_n(const int ago, const int inum_full,
+                         const int nall, double **host_x, int *host_type,
+                         double *sublo, double *subhi, int *tag, int **nspecial,
+                         int **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         int **ilist, int **jnum, const double cpu_time,
+                         bool &success, double *host_q, double *boxlo,
+                         double *prd) {
+  return CMMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                        subhi, tag, nspecial, special, eflag, vflag, eatom,
+                        vatom, host_start, ilist, jnum, cpu_time, success,
+                        host_q,boxlo,prd);
 }

-void cmml_gpu_compute(const int timestep, const int ago, const int inum_full,
-                      const int nall, double **host_x, int *host_type,
-                      int *ilist, int *numj, int **firstneigh,
-                      const bool eflag, const bool vflag, const bool eatom,
-                      const bool vatom, int &host_start, const double cpu_time,
-                      bool &success, double *host_q) {
-  CMMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
-                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
-                 host_q);
+void cmml_gpu_compute(const int ago, const int inum_full, const int nall,
+                      double **host_x, int *host_type, int *ilist, int *numj,
+                      int **firstneigh, const bool eflag, const bool vflag,
+                      const bool eatom, const bool vatom, int &host_start,
+                      const double cpu_time, bool &success, double *host_q,
+                      const int nlocal, double *boxlo, double *prd) {
+  CMMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
+                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
+                 host_q,nlocal,boxlo,prd);
 }

 double cmml_gpu_bytes() {
@ -18,8 +18,6 @@
|
|||||||
#ifndef CMML_GPU_KERNEL
|
#ifndef CMML_GPU_KERNEL
|
||||||
#define CMML_GPU_KERNEL
|
#define CMML_GPU_KERNEL
|
||||||
|
|
||||||
#define MAX_SHARED_TYPES 8
|
|
||||||
|
|
||||||
#ifdef _DOUBLE_DOUBLE
|
#ifdef _DOUBLE_DOUBLE
|
||||||
#define numtyp double
|
#define numtyp double
|
||||||
#define numtyp2 double2
|
#define numtyp2 double2
|
||||||
@ -54,7 +52,7 @@
|
|||||||
|
|
||||||
#ifdef NV_KERNEL
|
#ifdef NV_KERNEL
|
||||||
|
|
||||||
#include "geryon/ucl_nv_kernel.h"
|
#include "nv_kernel_def.h"
|
||||||
texture<float4> pos_tex;
|
texture<float4> pos_tex;
|
||||||
texture<float> q_tex;
|
texture<float> q_tex;
|
||||||
|
|
||||||
@ -90,6 +88,8 @@ __inline float fetch_q(const int& i, const float *q)
|
|||||||
|
|
||||||
#define fetch_pos(i,y) x_[i]
|
#define fetch_pos(i,y) x_[i]
|
||||||
#define fetch_q(i,y) q_[i]
|
#define fetch_q(i,y) q_[i]
|
||||||
|
#define BLOCK_PAIR 64
|
||||||
|
#define MAX_SHARED_TYPES 8
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -100,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types,
                           __global numtyp *sp_lj_in, __global int *dev_nbor,
-                          __global acctyp4 *ans, __global acctyp *engv,
-                          const int eflag, const int vflag, const int inum,
-                          const int nall, const int nbor_pitch,
-                          __global numtyp *q_ , const numtyp cut_coulsq,
-                          const numtyp qqrd2e, const numtyp g_ewald) {
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag,
+                          const int vflag, const int inum, const int nall,
+                          const int nbor_pitch, __global numtyp *q_ ,
+                          const numtyp cut_coulsq, const numtyp qqrd2e,
+                          const numtyp g_ewald, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;

   __local numtyp sp_lj[8];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
@@ -117,7 +121,6 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];

-  if (ii<inum) {
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -128,18 +131,31 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  if (ii<inum) {
   __global int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=nbor_pitch;
   int numj=*nbor;
   nbor+=nbor_pitch;
-  __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+  int n_stride;
+  __global int *list_end;
+  if (dev_nbor==dev_packed) {
+    list_end=nbor+mul24(numj,nbor_pitch);
+    nbor+=mul24(offset,nbor_pitch);
+    n_stride=mul24(t_per_atom,nbor_pitch);
+  } else {
+    nbor=dev_packed+*nbor;
+    list_end=nbor+numj;
+    n_stride=t_per_atom;
+    nbor+=offset;
+  }

   numtyp4 ix=fetch_pos(i,x_); //x_[i];
   numtyp qtmp=fetch_q(i,q_);
   int itype=ix.w;

-  for ( ; nbor<list_end; nbor+=nbor_pitch) {
+  for ( ; nbor<list_end; nbor+=n_stride) {
     int j=*nbor;

     numtyp factor_lj, factor_coul;
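The branch introduced above distinguishes two neighbor-list layouts. With the dense list (dev_nbor==dev_packed), each atom's neighbors sit one per pitched row, so a lane starts offset rows in and strides by t_per_atom*nbor_pitch; with a separately packed list, the neighbors are contiguous and the stride is simply t_per_atom. A minimal CUDA sketch of the same cooperative striding, using hypothetical names (TPA, neigh, nn, start) that are not part of this patch:

  // Sketch only: TPA lanes walk one atom's packed neighbor run cooperatively.
  #define TPA 4   // threads per atom; assumed to divide the block size

  __global__ void walk_neighbors(const int *neigh, const int *nn,
                                 const int *start, float *acc, int inum) {
    int tid = threadIdx.x;
    int ii = blockIdx.x * (blockDim.x / TPA) + tid / TPA;  // atom index
    int offset = tid % TPA;                                // lane within atom
    if (ii >= inum) return;
    float sum = 0.0f;
    // Lane k visits neighbors k, k+TPA, k+2*TPA, ... (n_stride = t_per_atom).
    for (int jj = offset; jj < nn[ii]; jj += TPA)
      sum += (float)neigh[start[ii] + jj];  // stand-in for the pair term
    atomicAdd(&acc[ii], sum);  // needs sm_20+; the real kernel reduces in shared memory
  }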
@@ -213,8 +229,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       }

     } // for nbor
+  } // if ii

+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1=energy;
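The new block combines the per-lane partial forces and energies with a power-of-two tree in shared memory; afterwards only lane 0 (offset==0) of each atom's group writes results, which is why the store is now guarded by ii<inum && offset==0. Note the tree loop carries no barrier: it relies on t_per_atom never exceeding the warp/wavefront width, so the lanes of one group advance in lockstep. A standalone CUDA sketch of the same tree, under those assumptions (TPA a power of two; buffer names hypothetical):

  #define TPA 4
  #define BLOCK 64

  __global__ void lane_reduce(const float *in, float *out, int n) {
    __shared__ float red[BLOCK];
    int tid = threadIdx.x;
    int offset = tid % TPA;            // lane id within one atom's group
    int i = blockIdx.x * BLOCK + tid;
    red[tid] = (i < n) ? in[i] : 0.0f;
    // Halve the active lanes each step; lane 0 accumulates the group sum.
    // Safe without __syncthreads() only while TPA <= warp size.
    for (unsigned int s = TPA / 2; s > 0; s >>= 1)
      if (offset < s)
        red[tid] += red[tid + s];
    if (offset == 0 && i < n)
      out[i / TPA] = red[tid];
  }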
@@ -234,27 +291,28 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,

 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in,
-                               __global numtyp* sp_lj_in, __global int *dev_nbor,
+                               __global numtyp* sp_lj_in,
+                               __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv,
                                const int eflag, const int vflag, const int inum,
                                const int nall, const int nbor_pitch,
                                __global numtyp *q_ , const numtyp cut_coulsq,
-                               const numtyp qqrd2e, const numtyp g_ewald) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                               const numtyp qqrd2e, const numtyp g_ewald,
+                               const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;

   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[8];
-  if (ii<8)
-    sp_lj[ii]=sp_lj_in[ii];
-  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    lj1[ii]=lj1_in[ii];
-    lj3[ii]=lj3_in[ii];
+  if (tid<8)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
+    lj3[tid]=lj3_in[tid];
   }
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
-  __syncthreads();

-  if (ii<inum) {

   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
@@ -266,19 +324,34 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  __syncthreads();
+
+  if (ii<inum) {
   __global int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=nbor_pitch;
   int numj=*nbor;
   nbor+=nbor_pitch;
-  __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+  int n_stride;
+  __global int *list_end;
+  if (dev_nbor==dev_packed) {
+    list_end=nbor+mul24(numj,nbor_pitch);
+    nbor+=mul24(offset,nbor_pitch);
+    n_stride=mul24(t_per_atom,nbor_pitch);
+  } else {
+    nbor=dev_packed+*nbor;
+    list_end=nbor+numj;
+    n_stride=t_per_atom;
+    nbor+=offset;
+  }

   numtyp4 ix=fetch_pos(i,x_); //x_[i];
   numtyp qtmp=fetch_q(i,q_);
   int iw=ix.w;
   int itype=mul24((int)MAX_SHARED_TYPES,iw);

-  for ( ; nbor<list_end; nbor+=nbor_pitch) {
+  for ( ; nbor<list_end; nbor+=n_stride) {
     int j=*nbor;

     numtyp factor_lj, factor_coul;
@@ -351,8 +424,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       }

     } // for nbor
+  } // if ii

+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1=energy;
@@ -43,7 +43,7 @@ int CMML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }

 template <class numtyp, class acctyp>
-bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
+int CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
                             int **host_cg_type, double **host_lj1,
                             double **host_lj2, double **host_lj3,
                             double **host_lj4, double **host_offset,
@@ -55,14 +55,18 @@ bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
                             const double host_cut_coulsq,
                             double *host_special_coul, const double qqrd2e,
                             const double g_ewald) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                     _screen,cmmc_long_gpu_kernel);
+  if (success!=0)
+    return success;

   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
-    lj_types=MAX_SHARED_TYPES;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
     shared_types=true;
   }
   _lj_types=lj_types;
@@ -95,7 +99,7 @@ bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,

   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }

 template <class numtyp, class acctyp>
@@ -133,9 +137,10 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;

-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));

-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
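Since each atom now occupies t_per_atom threads, a block of BX threads only covers BX/t_per_atom atoms, so the grid must grow by the same factor; the ans object (rather than atom) also becomes the owner of the per-step inum. A one-function sketch of the sizing rule (hypothetical helper, not from the patch):

  #include <cmath>

  // Blocks needed so that every atom gets its own group of t_per_atom lanes.
  static int grid_size(int inum, int block_size, int t_per_atom) {
    int atoms_per_block = block_size / t_per_atom;  // e.g. 64/4 = 16
    return (int)std::ceil((double)inum / atoms_per_block);
  }
  // grid_size(1000, 64, 1) == 16 blocks, but grid_size(1000, 64, 4) == 63.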
@@ -144,19 +149,21 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch,
                           &this->atom->dev_q.begin(), &_cut_coulsq,
-                          &_qqrd2e, &_g_ewald);
+                          &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
-                     &_cut_coulsq, &_qqrd2e, &_g_ewald);
+                     &_cut_coulsq, &_qqrd2e, &_g_ewald,
+                     &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
@@ -29,8 +29,15 @@ class CMML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq, int ** cg_type,
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, int ** cg_type,
             double **host_lj1, double **host_lj2, double **host_lj3,
             double **host_lj4, double **host_offset, double *host_special_lj,
             const int nlocal, const int nall, const int max_nbors,
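With init() returning an int instead of a bool, the caller can distinguish the failure modes listed in the comment block. A hedged sketch of how a host-side caller might map those codes to messages (the wording is illustrative; only the codes come from the documentation above):

  #include <cstdio>

  static void report_init(int code) {
    switch (code) {
      case  0: break;  // success
      case -1: fprintf(stderr, "fix gpu not found\n"); break;
      case -3: fprintf(stderr, "out of memory on the device\n"); break;
      case -4: fprintf(stderr, "GPU library not compiled for this GPU\n"); break;
      case -5: fprintf(stderr, "double precision not supported on card\n"); break;
      default: fprintf(stderr, "unknown init error %d\n", code); break;
    }
  }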
@@ -28,7 +28,7 @@ static CRML_GPU_Memory<PRECISION,ACC_PRECISION> CRMLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
+int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
                   double **host_lj2, double **host_lj3, double **host_lj4,
                   double **offset, double *special_lj, const int inum,
                   const int nall, const int max_nbors, const int maxspecial,
@@ -58,16 +58,13 @@ bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
     fflush(screen);
   }

-  if (world_me==0) {
-    bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
-                             host_lj4, offset, special_lj, inum, nall, 300,
-                             maxspecial, cell_size, gpu_split, screen,
-                             host_cut_ljsq, host_cut_coulsq, host_special_coul,
-                             qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
-                             epsilon,sigma,mix_arithmetic);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                        offset, special_lj, inum, nall, 300, maxspecial, cell_size,
+                        gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
+                        host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
+                        epsilon,sigma,mix_arithmetic);

   CRMLMF.device->world_barrier();
   if (message)
@@ -82,50 +79,54 @@ bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
               last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
-                               host_lj4, offset, special_lj, inum, nall, 300,
-                               maxspecial, cell_size, gpu_split,
-                               screen, host_cut_ljsq, host_cut_coulsq,
-                               host_special_coul, qqrd2e, g_ewald,
-                               cut_lj_innersq, denom_lj, epsilon, sigma,
-                               mix_arithmetic);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
+                          host_lj4, offset, special_lj, inum, nall, 300,
+                          maxspecial, cell_size, gpu_split, screen,
+                          host_cut_ljsq, host_cut_coulsq, host_special_coul,
+                          qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,
+                          sigma, mix_arithmetic);
     CRMLMF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+  if (init_ok==0)
+    CRMLMF.estimate_gpu_overhead();
+  return init_ok;
 }

 void crml_gpu_clear() {
   CRMLMF.clear();
 }

-int * crml_gpu_compute_n(const int timestep, const int ago, const int inum_full,
-                         const int nall, double **host_x, int *host_type,
-                         double *boxlo, double *boxhi, int *tag, int **nspecial,
-                         int **special, const bool eflag, const bool vflag,
-                         const bool eatom, const bool vatom, int &host_start,
-                         const double cpu_time, bool &success, double *host_q) {
-  return CRMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
-                        boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                        vatom, host_start, cpu_time, success, host_q);
+int** crml_gpu_compute_n(const int ago, const int inum_full,
+                         const int nall, double **host_x, int *host_type,
+                         double *sublo, double *subhi, int *tag, int **nspecial,
+                         int **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         int **ilist, int **jnum, const double cpu_time,
+                         bool &success, double *host_q, double *boxlo,
+                         double *prd) {
+  return CRMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                        subhi, tag, nspecial, special, eflag, vflag, eatom,
+                        vatom, host_start, ilist, jnum, cpu_time, success,
+                        host_q, boxlo, prd);
 }

-void crml_gpu_compute(const int timestep, const int ago, const int inum_full,
-                      const int nall, double **host_x, int *host_type,
-                      int *ilist, int *numj, int **firstneigh,
-                      const bool eflag, const bool vflag, const bool eatom,
-                      const bool vatom, int &host_start, const double cpu_time,
-                      bool &success, double *host_q) {
-  CRMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
-                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
-                 host_q);
+void crml_gpu_compute(const int ago, const int inum_full,
+                      const int nall, double **host_x, int *host_type,
+                      int *ilist, int *numj, int **firstneigh,
+                      const bool eflag, const bool vflag, const bool eatom,
+                      const bool vatom, int &host_start, const double cpu_time,
+                      bool &success, double *host_q, const int nlocal,
+                      double *boxlo, double *prd) {
+  CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
+                 eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q,
+                 nlocal,boxlo,prd);
 }

 double crml_gpu_bytes() {
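The restructured crml_gpu_init keeps the staged start-up: world rank 0 initializes first (so kernels are compiled and cached once), a world barrier follows, then one rank per GPU repeats the initialization, and only at the end is the status returned, with estimate_gpu_overhead() run on success. A simplified MPI sketch of that pattern (do_device_init is a hypothetical stand-in for CRMLMF.init, and the loop over GPU ranks is collapsed to one rank per device):

  #include <mpi.h>

  static int do_device_init() { return 0; }  // stand-in for CRMLMF.init(...)

  static int staged_init(int world_me, int gpu_rank,
                         MPI_Comm world, MPI_Comm per_gpu) {
    int init_ok = 0;
    if (world_me == 0)
      init_ok = do_device_init();  // world rank 0 goes first
    MPI_Barrier(world);            // plays the role of device->world_barrier()
    if (world_me != 0 && gpu_rank == 0)
      init_ok = do_device_init();  // then one rank per GPU
    MPI_Barrier(per_gpu);          // plays the role of device->gpu_barrier()
    return init_ok;                // 0 on success, a negative code otherwise
  }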
@@ -54,7 +54,7 @@

 #ifdef NV_KERNEL

-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"
 texture<float4> pos_tex;
 texture<float> q_tex;

@@ -90,6 +90,7 @@ __inline float fetch_q(const int& i, const float *q)

 #define fetch_pos(i,y) x_[i]
 #define fetch_q(i,y) q_[i]
+#define BLOCK_BIO_PAIR 64

 #endif

@@ -98,18 +99,22 @@ __inline float fetch_q(const int& i, const float *q)
 __inline int sbmask(int j) { return j >> SBBITS & 3; }

 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          const int lj_types,
-                          __global numtyp *sp_lj_in, __global int *dev_nbor,
+                          const int lj_types, __global numtyp *sp_lj_in,
+                          __global int *dev_nbor, __global int *dev_packed,
                           __global acctyp4 *ans, __global acctyp *engv,
                           const int eflag, const int vflag, const int inum,
                           const int nall, const int nbor_pitch,
                           __global numtyp *q_, const numtyp cut_coulsq,
                           const numtyp qqrd2e, const numtyp g_ewald,
                           const numtyp denom_lj, const numtyp cut_bothsq,
-                          const numtyp cut_ljsq, const numtyp cut_lj_innersq) {
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          const numtyp cut_ljsq, const numtyp cut_lj_innersq,
+                          const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;

   __local numtyp sp_lj[8];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
@@ -120,7 +125,6 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];

-  if (ii<inum) {
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -131,18 +135,31 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  if (ii<inum) {
   __global int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=nbor_pitch;
   int numj=*nbor;
   nbor+=nbor_pitch;
-  __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+  int n_stride;
+  __global int *list_end;
+  if (dev_nbor==dev_packed) {
+    list_end=nbor+mul24(numj,nbor_pitch);
+    nbor+=mul24(offset,nbor_pitch);
+    n_stride=mul24(t_per_atom,nbor_pitch);
+  } else {
+    nbor=dev_packed+*nbor;
+    list_end=nbor+numj;
+    n_stride=t_per_atom;
+    nbor+=offset;
+  }

   numtyp4 ix=fetch_pos(i,x_); //x_[i];
   numtyp qtmp=fetch_q(i,q_);
   int itype=ix.w;

-  for ( ; nbor<list_end; nbor+=nbor_pitch) {
+  for ( ; nbor<list_end; nbor+=n_stride) {
     int j=*nbor;

     numtyp factor_lj, factor_coul;
@@ -219,8 +236,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       }

     } // for nbor
+  } // if ii

+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1=energy;
@@ -240,27 +298,27 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,

 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
                                __global numtyp* sp_lj_in, __global int *dev_nbor,
-                               __global acctyp4 *ans, __global acctyp *engv,
-                               const int eflag, const int vflag, const int inum,
-                               const int nall, const int nbor_pitch,
-                               __global numtyp *q_, const numtyp cut_coulsq,
-                               const numtyp qqrd2e, const numtyp g_ewald,
-                               const numtyp denom_lj, const numtyp cut_bothsq,
-                               const numtyp cut_ljsq,
-                               const numtyp cut_lj_innersq) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                               __global int *dev_packed, __global acctyp4 *ans,
+                               __global acctyp *engv, const int eflag,
+                               const int vflag, const int inum, const int nall,
+                               const int nbor_pitch, __global numtyp *q_,
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp g_ewald, const numtyp denom_lj,
+                               const numtyp cut_bothsq, const numtyp cut_ljsq,
+                               const numtyp cut_lj_innersq,
+                               const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;

   __local numtyp2 ljd[MAX_BIO_SHARED_TYPES];
   __local numtyp sp_lj[8];
-  if (ii<8)
-    sp_lj[ii]=sp_lj_in[ii];
-  ljd[ii]=ljd_in[ii];
-  ljd[ii+64]=ljd_in[ii+64];
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
-  __syncthreads();
+  if (tid<8)
+    sp_lj[tid]=sp_lj_in[tid];
+  ljd[tid]=ljd_in[tid];
+  if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
+    ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];

-  if (ii<inum) {

   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
@@ -272,18 +330,33 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  __syncthreads();
+
+  if (ii<inum) {
   __global int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=nbor_pitch;
   int numj=*nbor;
   nbor+=nbor_pitch;
-  __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+  int n_stride;
+  __global int *list_end;
+  if (dev_nbor==dev_packed) {
+    list_end=nbor+mul24(numj,nbor_pitch);
+    nbor+=mul24(offset,nbor_pitch);
+    n_stride=mul24(t_per_atom,nbor_pitch);
+  } else {
+    nbor=dev_packed+*nbor;
+    list_end=nbor+numj;
+    n_stride=t_per_atom;
+    nbor+=offset;
+  }

   numtyp4 ix=fetch_pos(i,x_); //x_[i];
   numtyp qtmp=fetch_q(i,q_);
   int itype=ix.w;

-  for ( ; nbor<list_end; nbor+=nbor_pitch) {
+  for ( ; nbor<list_end; nbor+=n_stride) {
     int j=*nbor;

     numtyp factor_lj, factor_coul;
@@ -366,8 +439,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
       }

     } // for nbor
+  } // if ii

+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1=energy;
@@ -43,7 +43,7 @@ int CRML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }

 template <class numtyp, class acctyp>
-bool CRML_GPU_MemoryT::init(const int ntypes,
+int CRML_GPU_MemoryT::init(const int ntypes,
                             double host_cut_bothsq, double **host_lj1,
                             double **host_lj2, double **host_lj3,
                             double **host_lj4, double **host_offset,
@@ -56,20 +56,24 @@ bool CRML_GPU_MemoryT::init(const int ntypes,
                            const double g_ewald, const double cut_lj_innersq,
                            const double denom_lj, double **epsilon,
                            double **sigma, const bool mix_arithmetic) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                     _screen,crml_gpu_kernel);
+  if (success!=0)
+    return success;

   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (this->_block_size>=64 && mix_arithmetic)
+  if (this->_block_bio_size>=64 && mix_arithmetic)
     shared_types=true;
   _lj_types=lj_types;

   // Allocate a host write buffer for data initialization
   int h_size=lj_types*lj_types;
-  if (h_size<MAX_BIO_SHARED_TYPES)
-    h_size=MAX_BIO_SHARED_TYPES;
+  int max_bio_shared_types=this->device->max_bio_shared_types();
+  if (h_size<max_bio_shared_types)
+    h_size=max_bio_shared_types;
   UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
   for (int i=0; i<h_size*32; i++)
@@ -79,7 +83,7 @@ bool CRML_GPU_MemoryT::init(const int ntypes,
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
                          host_lj3,host_lj4);

-  ljd.alloc(MAX_BIO_SHARED_TYPES,*(this->ucl_device),UCL_READ_ONLY);
+  ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);

   sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
@@ -99,7 +103,7 @@ bool CRML_GPU_MemoryT::init(const int ntypes,

   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }

 template <class numtyp, class acctyp>
@@ -125,7 +129,7 @@ double CRML_GPU_MemoryT::host_memory_usage() const {
 template <class numtyp, class acctyp>
 void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   // Compute the block size and grid size to keep all cores busy
-  const int BX=this->block_size();
+  const int BX=this->_block_bio_size;
   int eflag, vflag;
   if (_eflag)
     eflag=1;
@@ -137,9 +141,10 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;

-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));

-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -147,21 +152,24 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
                           &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch,
                           &this->atom->dev_q.begin(), &_cut_coulsq,
                           &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq,
-                          &_cut_ljsq, &_cut_lj_innersq);
+                          &_cut_ljsq, &_cut_lj_innersq,
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
                      &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
-                     &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq);
+                     &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
+                     &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
@@ -29,8 +29,15 @@ class CRML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double host_cut_bothsq,
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double host_cut_bothsq,
             double **host_lj1, double **host_lj2, double **host_lj3,
             double **host_lj4, double **host_offset, double *host_special_lj,
             const int nlocal, const int nall, const int max_nbors,
@@ -49,7 +49,7 @@ void gb_gpu_pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool gb_gpu_init(const int ntypes, const double gamma,
+int gb_gpu_init(const int ntypes, const double gamma,
                 const double upsilon, const double mu, double **shape,
                 double **well, double **cutsq, double **sigma,
                 double **epsilon, double *host_lshape, int **form,
@@ -77,14 +77,12 @@ bool gb_gpu_init(const int ntypes, const double gamma,
     fflush(screen);
   }

-  if (world_me==0) {
-    bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
-                           sigma, epsilon, host_lshape, form, host_lj1,
-                           host_lj2, host_lj3, host_lj4, offset, special_lj,
-                           inum, nall, max_nbors, cell_size, gpu_split, screen);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
+                      sigma, epsilon, host_lshape, form, host_lj1,
+                      host_lj2, host_lj3, host_lj4, offset, special_lj,
+                      inum, nall, max_nbors, cell_size, gpu_split, screen);

   GBMF.device->world_barrier();
   if (message)
@@ -99,22 +97,22 @@ bool gb_gpu_init(const int ntypes, const double gamma,
               last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
-                             sigma, epsilon, host_lshape, form, host_lj1,
-                             host_lj2, host_lj3, host_lj4, offset, special_lj,
-                             inum, nall, max_nbors, cell_size, gpu_split,
-                             screen);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma,
+                        epsilon, host_lshape, form, host_lj1, host_lj2,
+                        host_lj3, host_lj4, offset, special_lj, inum, nall,
+                        max_nbors, cell_size, gpu_split, screen);
     GBMF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+  if (init_ok==0)
+    GBMF.estimate_gpu_overhead();
+  return init_ok;
 }

 // ---------------------------------------------------------------------------
@@ -131,8 +129,8 @@ template <class gbmtyp>
 inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum,
                                     const int host_inum, const int nall,
                                     double **host_x, double **host_quat,
-                                    int *host_type, double *boxlo,
-                                    double *boxhi, bool &success) {
+                                    int *host_type, double *sublo,
+                                    double *subhi, bool &success) {
   gbm.nbor_time_avail=true;

   success=true;
@@ -144,7 +142,7 @@ inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum,
   gbm.atom->cast_copy_x(host_x,host_type);
   int mn;
   gbm.nbor->build_nbor_list(inum, host_inum, nall, *gbm.atom,
-                            boxlo, boxhi, NULL, NULL, NULL, success, mn);
+                            sublo, subhi, NULL, NULL, NULL, success, mn);
   gbm.nbor->copy_unpacked(inum,mn);
   gbm.last_ellipse=inum;
   gbm.max_last_ellipse=inum;
@@ -163,7 +161,7 @@ void _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall,

   gbm.nbor_time_avail=true;

-  int mn=gbm.nbor->max_nbor_loop(inum,numj);
+  int mn=gbm.nbor->max_nbor_loop(inum,numj,ilist);
   gbm.resize_atom(inum,nall,success);
   gbm.resize_local(inum,0,mn,osize,success);
   if (!success)
@@ -216,9 +214,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
   else
     vflag=0;

-  int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum())/
+                               (BX/gbm._threads_per_atom)));
   int stride=gbm.nbor->nbor_pitch();
-  int ainum=gbm.atom->inum();
+  int ainum=gbm.ans->inum();
   int anall=gbm.atom->nall();

   if (gbm.multiple_forms) {
@@ -226,7 +225,7 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
     if (gbm.last_ellipse>0) {
       // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
       GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
-                               static_cast<double>(BX)));
+                               (BX/gbm._threads_per_atom)));
       gb_gpu_pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,ELLIPSE_SPHERE,
                         ELLIPSE_ELLIPSE);
       gbm.time_kernel.stop();
@@ -237,11 +236,12 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
         &gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(),
         &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
         &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
-        &stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(),
-        &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall);
+        &stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(),
+        &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall,
+        &gbm._threads_per_atom);
       gbm.time_gayberne.stop();

-      if (gbm.last_ellipse==gbm.atom->inum()) {
+      if (gbm.last_ellipse==gbm.ans->inum()) {
         gbm.time_kernel2.start();
         gbm.time_kernel2.stop();
         gbm.time_gayberne2.start();
@@ -254,9 +254,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
      // ------------ SPHERE_ELLIPSE ---------------

      gbm.time_kernel2.start();
-     GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum()-
-                                                  gbm.last_ellipse)/BX));
-     gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(),
+     GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum()-
+                                                  gbm.last_ellipse)/
+                              (BX/gbm._threads_per_atom)));
+     gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(),
                        SPHERE_ELLIPSE,SPHERE_ELLIPSE);
      gbm.time_kernel2.stop();
@@ -266,13 +267,14 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
        &gbm.shape.begin(), &gbm.well.begin(),
        &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
        &gbm._lj_types, &gbm.lshape.begin(),
-       &gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(),
-       &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
-       &vflag, &gbm.last_ellipse, &ainum, &anall);
+       &gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(),
+       &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
+       &vflag, &gbm.last_ellipse, &ainum, &anall,
+       &gbm._threads_per_atom);
      gbm.time_gayberne2.stop();
    } else {
-     gbm.atom->dev_ans.zero();
-     gbm.atom->dev_engv.zero();
+     gbm.ans->dev_ans.zero();
+     gbm.ans->dev_engv.zero();
      gbm.time_kernel.stop();
      gbm.time_gayberne.start();
      gbm.time_gayberne.stop();
@@ -284,29 +286,31 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {

     // ------------ LJ ---------------
     gbm.time_pair.start();
-    if (gbm.last_ellipse<gbm.atom->inum()) {
+    if (gbm.last_ellipse<gbm.ans->inum()) {
       if (gbm.shared_types) {
         GBMF.k_lj_fast.set_size(GX,BX);
         GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
                            &gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(),
                            &stride, &gbm.nbor->dev_packed.begin(),
-                           &gbm.atom->dev_ans.begin(),
-                           &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
-                           &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
+                           &gbm.ans->dev_ans.begin(),
+                           &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
+                           &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall,
+                           &gbm._threads_per_atom);
       } else {
         GBMF.k_lj.set_size(GX,BX);
         GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
                       &gbm.lj3.begin(), &gbm._lj_types,
                       &gbm.gamma_upsilon_mu.begin(), &stride,
-                      &gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(),
-                      &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
-                      &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
+                      &gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(),
+                      &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
+                      &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall,
+                      &gbm._threads_per_atom);
       }
     }
     gbm.time_pair.stop();
   } else {
     gbm.time_kernel.start();
-    gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE,
+    gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.ans->inum(),SPHERE_SPHERE,
                       ELLIPSE_ELLIPSE);
     gbm.time_kernel.stop();
     gbm.time_gayberne.start();
@@ -315,9 +319,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
       &gbm.shape.begin(), &gbm.well.begin(),
       &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
       &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
-      &stride, &gbm.atom->dev_ans.begin(), &ainum,
-      &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
-      &eflag, &vflag, &ainum, &anall);
+      &stride, &gbm.ans->dev_ans.begin(), &ainum,
+      &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
+      &eflag, &vflag, &ainum, &anall, &gbm._threads_per_atom);
     gbm.time_gayberne.stop();
   }
 }
@@ -326,30 +330,31 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
 // Reneighbor on GPU if necessary and then compute forces, torques, energies
 // ---------------------------------------------------------------------------
 template <class gbmtyp>
-inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago,
+inline int** _gb_gpu_compute_n(gbmtyp &gbm, const int ago,
                                const int inum_full, const int nall,
                                double **host_x, int *host_type,
-                               double *boxlo, double *boxhi, const bool eflag,
+                               double *sublo, double *subhi, const bool eflag,
                                const bool vflag, const bool eatom,
                                const bool vatom, int &host_start,
-                               const double cpu_time, bool &success,
-                               double **host_quat) {
+                               int **ilist, int **jnum, const double cpu_time,
+                               bool &success, double **host_quat) {
   gbm.acc_timers();
   if (inum_full==0) {
+    host_start=0;
     gbm.zero_timers();
     return NULL;
   }

-  gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor());
-  int inum=gbm.hd_balancer.get_gpu_count(timestep,ago,inum_full);
-  gbm.atom->inum(inum);
+  gbm.hd_balancer.balance(cpu_time);
+  int inum=gbm.hd_balancer.get_gpu_count(ago,inum_full);
+  gbm.ans->inum(inum);
   gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
   host_start=inum;

   // Build neighbor list on GPU if necessary
   if (ago==0) {
     _gb_gpu_build_nbor_list(gbm, inum, inum_full-inum, nall, host_x,
-                            host_quat, host_type, boxlo, boxhi, success);
+                            host_quat, host_type, sublo, subhi, success);
     if (!success)
       return NULL;
     gbm.atom->cast_quat_data(host_quat[0]);
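Here the balancer decides how many of the inum_full particles the device takes this step; host_start marks where the host's share begins, and the early return for an empty list now also zeroes host_start. For the fixed-split case the arithmetic reduces to the gpu_split fraction documented with init(); a sketch with a hypothetical helper name:

  // Device takes the first inum particles, the host handles the rest.
  static int gpu_count(int inum_full, double gpu_split, int &host_start) {
    int inum = (int)(gpu_split * inum_full);
    host_start = inum;
    return inum;
  }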
@ -361,47 +366,49 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago,
|
|||||||
gbm.atom->add_x_data(host_x,host_type);
|
gbm.atom->add_x_data(host_x,host_type);
|
||||||
}
|
}
|
||||||
|
|
||||||
gbm.atom->add_other_data();
|
gbm.atom->add_quat_data();
|
||||||
|
*ilist=gbm.nbor->host_ilist.begin();
|
||||||
|
*jnum=gbm.nbor->host_acc.begin();
|
||||||
|
|
||||||
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
|
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
|
||||||
gbm.atom->copy_answers(eflag,vflag,eatom,vatom);
|
gbm.ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||||
|
gbm.device->add_ans_object(gbm.ans);
|
||||||
gbm.hd_balancer.stop_timer();
|
gbm.hd_balancer.stop_timer();
|
||||||
return gbm.device->nbor.host_nbor.begin();
|
return gbm.nbor->host_jlist.begin()-host_start;
|
||||||
}
|
}
|
||||||
|
|
||||||
int * gb_gpu_compute_n(const int timestep, const int ago, const int inum_full,
|
int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
|
||||||
const int nall, double **host_x, int *host_type,
|
double **host_x, int *host_type, double *sublo,
|
||||||
double *boxlo, double *boxhi, const bool eflag,
|
double *subhi, const bool eflag, const bool vflag,
|
||||||
const bool vflag, const bool eatom, const bool vatom,
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
int &host_start, const double cpu_time, bool &success,
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
double **host_quat) {
|
bool &success, double **host_quat) {
|
||||||
return _gb_gpu_compute_n(GBMF, timestep, ago, inum_full, nall, host_x,
|
return _gb_gpu_compute_n(GBMF, ago, inum_full, nall, host_x, host_type, sublo,
|
||||||
host_type, boxlo, boxhi, eflag, vflag, eatom, vatom,
|
subhi, eflag, vflag, eatom, vatom, host_start, ilist,
|
||||||
host_start, cpu_time, success, host_quat);
|
jnum, cpu_time, success, host_quat);
|
||||||
}
|
}
|
||||||
|
|
||||||
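The interface change above drops the timestep argument and widens the return type: the library now hands host copies of the device-built neighbor list back through the new ilist/jnum output parameters, and the returned int** points at the per-atom neighbor pages, offset by host_start. A minimal sketch of a host caller against the new signature follows; everything except the gb_gpu_compute_n() prototype itself is an assumed name for illustration.

    // Sketch only: driving the revised gb_gpu_compute_n() interface.
    int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
                           double **host_x, int *host_type, double *sublo,
                           double *subhi, const bool eflag, const bool vflag,
                           const bool eatom, const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
                           bool &success, double **host_quat);

    void example_call(double **x, int *type, double *sublo, double *subhi,
                      double **quat, int inum_full, int nall, double cpu_time) {
      int *ilist, *jnum;     // host copies of the device-built neighbor list
      int host_start;        // first index of the atoms left to the CPU
      bool success;
      int **firstneigh = gb_gpu_compute_n(0, inum_full, nall, x, type,
                                          sublo, subhi, true, true, false,
                                          false, host_start, &ilist, &jnum,
                                          cpu_time, success, quat);
      if (!success) return;  // e.g. ran out of device memory
      // Atoms [host_start, inum_full) are then computed on the host using
      // ilist/jnum/firstneigh exactly like a CPU-built neighbor list.
      (void)firstneigh;
    }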
 // ---------------------------------------------------------------------------
 // Copy nbor list from host if necessary and then calculate forces, torques,..
 // ---------------------------------------------------------------------------
 template <class gbmtyp>
-inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago,
-                             const int inum_full,const int nall,double **host_x,
-                             int *host_type, int *ilist, int *numj,
-                             int **firstneigh, const bool eflag,
-                             const bool vflag, const bool eatom,
-                             const bool vatom, int &host_start,
-                             const double cpu_time, bool &success,
-                             double **host_quat) {
+inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full,
+                             const int nall,double **host_x, int *host_type,
+                             int *ilist, int *numj, int **firstneigh,
+                             const bool eflag, const bool vflag,
+                             const bool eatom, const bool vatom,
+                             int &host_start, const double cpu_time,
+                             bool &success, double **host_quat) {
   gbm.acc_timers();
   if (inum_full==0) {
+    host_start=0;
    gbm.zero_timers();
     return NULL;
   }

   int ago=gbm.hd_balancer.ago_first(f_ago);
-  int inum=gbm.hd_balancer.balance(timestep,ago,inum_full,cpu_time,
-                                   gbm.nbor->gpu_nbor());
-  gbm.atom->inum(inum);
+  int inum=gbm.hd_balancer.balance(ago,inum_full,cpu_time);
+  gbm.ans->inum(inum);
   gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
   host_start=inum;

@@ -421,21 +428,21 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago,
   gbm.atom->cast_quat_data(host_quat[0]);
   gbm.hd_balancer.start_timer();
   gbm.atom->add_x_data(host_x,host_type);
-  gbm.atom->add_other_data();
+  gbm.atom->add_quat_data();

   _gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
-  gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list);
+  gbm.ans->copy_answers(eflag,vflag,eatom,vatom,list);
+  gbm.device->add_ans_object(gbm.ans);
   gbm.hd_balancer.stop_timer();
   return list;
 }

-int * gb_gpu_compute(const int timestep, const int ago, const int inum_full,
-                     const int nall, double **host_x, int *host_type,
-                     int *ilist, int *numj, int **firstneigh,
-                     const bool eflag, const bool vflag, const bool eatom,
-                     const bool vatom, int &host_start, const double cpu_time,
-                     bool &success, double **host_quat) {
-  return _gb_gpu_compute(GBMF, timestep, ago, inum_full, nall, host_x,
+int * gb_gpu_compute(const int ago, const int inum_full, const int nall,
+                     double **host_x, int *host_type, int *ilist, int *numj,
+                     int **firstneigh, const bool eflag, const bool vflag,
+                     const bool eatom, const bool vatom, int &host_start,
+                     const double cpu_time, bool &success, double **host_quat) {
+  return _gb_gpu_compute(GBMF, ago, inum_full, nall, host_x,
                          host_type, ilist, numj, firstneigh, eflag, vflag,
                          eatom, vatom, host_start, cpu_time, success,
                          host_quat);
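Both entry points now defer the split decision entirely to hd_balancer, which no longer needs the timestep or the gpu_nbor flag at call time. The core idea behind such a dynamic balancer can be sketched as below; the rate formula and the damping constant are assumptions for illustration, not the actual PairGPUBalance policy.

    // Illustrative only: one way a host-device balancer can update the
    // fraction of atoms it offloads from the times both sides just spent.
    inline int balanced_gpu_count(double cpu_time, double gpu_time,
                                  double &split, int inum_full) {
      if (cpu_time > 0.0 && gpu_time > 0.0) {
        const double gpu_rate = split * inum_full / gpu_time;        // atoms/s
        const double cpu_rate = (1.0 - split) * inum_full / cpu_time;
        const double target = gpu_rate / (gpu_rate + cpu_rate);
        split = 0.75 * split + 0.25 * target;   // damp oscillations
      }
      return static_cast<int>(split * inum_full);  // atoms sent to the GPU
    }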
@@ -18,7 +18,6 @@
 #ifndef GB_GPU_EXTRA_H
 #define GB_GPU_EXTRA_H

-#define MAX_SHARED_TYPES 8
 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};

 #ifdef _DOUBLE_DOUBLE
@@ -47,7 +46,7 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};

 #ifdef NV_KERNEL

-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"

 #else

@@ -58,6 +57,8 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
 #define BLOCK_SIZE_X get_local_size(0)
 #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
 #define __inline inline
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8

 #endif

@@ -97,17 +97,17 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
                               __global acctyp4 *ans, const int astride,
                               __global acctyp *engv, __global int *err_flag,
                               const int eflag, const int vflag, const int inum,
-                              const int nall) {
+                              const int nall, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;

   __local numtyp sp_lj[4];
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
-  if (ii<4)
-    sp_lj[ii]=gum[ii+3];
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
-  __syncthreads();
-
-  if (ii<inum) {
+  sp_lj[0]=gum[3];
+  sp_lj[1]=gum[4];
+  sp_lj[2]=gum[5];
+  sp_lj[3]=gum[6];

   acctyp energy=(acctyp)0;
   acctyp4 f;
@@ -122,12 +122,15 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  if (ii<inum) {
   __global int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=stride;
   int numj=*nbor;
   nbor+=stride;
   __global int *nbor_end=nbor+mul24(stride,numj);
+  nbor+=mul24(offset,stride);
+  int n_stride=mul24(t_per_atom,stride);

   numtyp4 ix=x_[i];
   int itype=ix.w;
@@ -143,8 +146,7 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
   }

   numtyp factor_lj;
-  for ( ; nbor<nbor_end; nbor+=stride) {
+  for ( ; nbor<nbor_end; nbor+=n_stride) {

     int j=*nbor;
     factor_lj = sp_lj[sbmask(j)];
     j &= NEIGHMASK;
@@ -362,8 +364,53 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
     tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];

   } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[7][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=tor.x;
+    red_acc[4][tid]=tor.y;
+    red_acc[5][tid]=tor.z;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<6; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    tor.x=red_acc[3][tid];
+    tor.y=red_acc[4][tid];
+    tor.z=red_acc[5][tid];
+
+    if (eflag>0 || vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+      red_acc[6][tid]=energy;
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<7; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+      energy=red_acc[6][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1=energy;
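All four kernels in this commit gain a t_per_atom parameter: a block of BLOCK_SIZE_X threads now covers BLOCK_SIZE_X/t_per_atom atoms, each atom's neighbor loop strides by n_stride across its t_per_atom threads, and the partial results are combined with the shared-memory tree reduction shown above. A distilled CUDA sketch of just that reduction, with the force loop replaced by an assumed precomputed partial array, is:

    #define BLOCK_PAIR 64   // mirrors the constant added in this commit

    // Each group of t_per_atom consecutive threads owns one atom ii; the
    // tree reduction folds the group's partial forces into its first slot.
    __global__ void reduce_demo(const float *partial, float *force,
                                int inum, int t_per_atom) {
      __shared__ float red_acc[BLOCK_PAIR];   // assumes blockDim.x <= BLOCK_PAIR
      int tid = threadIdx.x;
      int ii = blockIdx.x * (blockDim.x / t_per_atom) + tid / t_per_atom;
      int offset = tid % t_per_atom;          // this thread's slot for atom ii

      red_acc[tid] = (ii < inum) ? partial[ii * t_per_atom + offset] : 0.0f;

      // No barrier inside the loop, exactly like the commit's kernels: this
      // relies on t_per_atom being a power of two no larger than the warp
      // size, so the cooperating threads execute in lockstep.
      for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1)
        if (offset < s) red_acc[tid] += red_acc[tid + s];

      if (ii < inum && offset == 0) force[ii] = red_acc[tid];
    }

As in the commit's kernels, the store at the end is guarded by offset==0 so only one thread per atom writes the accumulated answer.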
@@ -34,17 +34,17 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
                                __global acctyp4 *ans, __global acctyp *engv,
                                __global int *err_flag, const int eflag,
                                const int vflag,const int start, const int inum,
-                               const int nall) {
+                               const int nall, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom+start;
+  int offset=tid%t_per_atom;

   __local numtyp sp_lj[4];
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
-  if (ii<4)
-    sp_lj[ii]=gum[ii+3];
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
-  __syncthreads();
-
-  if (ii<inum) {
+  sp_lj[0]=gum[3];
+  sp_lj[1]=gum[4];
+  sp_lj[2]=gum[5];
+  sp_lj[3]=gum[6];

   acctyp energy=(acctyp)0;
   acctyp4 f;
@@ -55,12 +55,15 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  if (ii<inum) {
   __global int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=stride;
   int numj=*nbor;
   nbor+=stride;
   __global int *nbor_end=nbor+stride*numj;
+  nbor+=mul24(offset,stride);
+  int n_stride=mul24(t_per_atom,stride);

   numtyp4 ix=x_[i];
   int itype=ix.w;
@@ -69,7 +72,7 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
   numtyp one_well=well[itype].x;

   numtyp factor_lj;
-  for ( ; nbor<nbor_end; nbor+=stride) {
+  for ( ; nbor<nbor_end; nbor+=n_stride) {

     int j=*nbor;
     factor_lj = sp_lj[sbmask(j)];
@@ -241,8 +244,47 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
       f.z+=temp1*dchi[2]-temp2*dUr[2];
     }
   } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1=energy;
@@ -265,17 +307,17 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
                         __global acctyp4 *ans, __global acctyp *engv,
                         __global int *err_flag, const int eflag,
                         const int vflag, const int start, const int inum,
-                        const int nall) {
+                        const int nall, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom+start;
+  int offset=tid%t_per_atom;

   __local numtyp sp_lj[4];
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
-  if (ii<4)
-    sp_lj[ii]=gum[ii+3];
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
-  __syncthreads();
-
-  if (ii<inum) {
+  sp_lj[0]=gum[3];
+  sp_lj[1]=gum[4];
+  sp_lj[2]=gum[5];
+  sp_lj[3]=gum[6];

   acctyp energy=(acctyp)0;
   acctyp4 f;
@@ -286,18 +328,21 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  if (ii<inum) {
   __global int *nbor=dev_ij+ii;
   int i=*nbor;
   nbor+=stride;
   int numj=*nbor;
   nbor+=stride;
   __global int *list_end=nbor+mul24(stride,numj);
+  nbor+=mul24(offset,stride);
+  int n_stride=mul24(t_per_atom,stride);

   numtyp4 ix=x_[i];
   int itype=ix.w;

   numtyp factor_lj;
-  for ( ; nbor<list_end; nbor+=stride) {
+  for ( ; nbor<list_end; nbor+=n_stride) {

     int j=*nbor;
     factor_lj = sp_lj[sbmask(j)];
@@ -338,8 +383,47 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
     }

   } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1+=energy;
@@ -361,27 +445,26 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,

 __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                              __global numtyp4* lj3_in, __global numtyp *gum,
-                             const int stride,
-                             __global int *dev_ij, __global acctyp4 *ans,
-                             __global acctyp *engv, __global int *err_flag,
-                             const int eflag,const int vflag, const int start,
-                             const int inum, const int nall) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                             const int stride, __global int *dev_ij,
+                             __global acctyp4 *ans, __global acctyp *engv,
+                             __global int *err_flag, const int eflag,
+                             const int vflag, const int start, const int inum,
+                             const int nall, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom+start;
+  int offset=tid%t_per_atom;

   __local numtyp sp_lj[4];
   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  if (ii<4)
-    sp_lj[ii]=gum[ii+3];
-  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    lj1[ii]=lj1_in[ii];
+  if (tid<4)
+    sp_lj[tid]=gum[tid+3];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
     if (eflag>0)
-      lj3[ii]=lj3_in[ii];
+      lj3[tid]=lj3_in[tid];
   }
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
-  __syncthreads();
-
-  if (ii<inum) {

   acctyp energy=(acctyp)0;
   acctyp4 f;
@@ -392,19 +475,24 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  __syncthreads();
+
+  if (ii<inum) {
   __global int *nbor=dev_ij+ii;
   int i=*nbor;
   nbor+=stride;
   int numj=*nbor;
   nbor+=stride;
   __global int *list_end=nbor+mul24(stride,numj);
+  nbor+=mul24(offset,stride);
+  int n_stride=mul24(t_per_atom,stride);

   numtyp4 ix=x_[i];
   int iw=ix.w;
   int itype=mul24((int)MAX_SHARED_TYPES,iw);

   numtyp factor_lj;
-  for ( ; nbor<list_end; nbor+=stride) {
+  for ( ; nbor<list_end; nbor+=n_stride) {

     int j=*nbor;
     factor_lj = sp_lj[sbmask(j)];
@@ -443,8 +531,47 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     }

   } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1+=energy;
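kernel_lj_fast now indexes its shared-memory coefficient caches with tid instead of the pre-offset ii, and the __syncthreads() moved below the cooperative loads so every thread crosses the barrier before reading the tables. A stand-alone CUDA sketch of that caching pattern; the float4 table and the demo output are assumptions, while the real tables hold numtyp4 Lennard-Jones coefficients:

    #define MAX_SHARED_TYPES 8

    // One cooperative copy of the per-type-pair coefficient table into
    // shared memory per block, then a barrier before anyone reads it.
    __global__ void cache_coeffs_demo(const float4 *lj1_in, float4 *out,
                                      int n) {
      __shared__ float4 lj1[MAX_SHARED_TYPES * MAX_SHARED_TYPES];
      int tid = threadIdx.x;
      if (tid < MAX_SHARED_TYPES * MAX_SHARED_TYPES)
        lj1[tid] = lj1_in[tid];    // loads must use the local thread index
      __syncthreads();             // all threads wait for the cache
      int i = blockIdx.x * blockDim.x + tid;
      if (i < n)
        out[i] = lj1[i % (MAX_SHARED_TYPES * MAX_SHARED_TYPES)];
    }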
@@ -18,8 +18,6 @@
 #ifndef PAIR_GPU_KERNEL_H
 #define PAIR_GPU_KERNEL_H

-#define MAX_SHARED_TYPES 8
-
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
@@ -32,7 +30,7 @@

 #ifdef NV_KERNEL

-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"

 #else

@@ -42,6 +40,7 @@
 #define BLOCK_ID_X get_group_id(0)
 #define BLOCK_SIZE_X get_local_size(0)
 #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
+#define MAX_SHARED_TYPES 8

 #endif

@@ -32,20 +32,25 @@ template <class numtyp, class acctyp>
 GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false),
                                   _max_bytes(0.0) {
   device=&pair_gpu_device;
+  ans=new PairGPUAns<numtyp,acctyp>();
+  nbor=new PairGPUNbor;
 }

 template <class numtyp, class acctyp>
 GB_GPU_MemoryT::~GB_GPU_Memory() {
   clear();
+  delete ans;
+  delete nbor;
 }

 template <class numtyp, class acctyp>
 int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
-  return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
 }

 template <class numtyp, class acctyp>
-bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
+int GB_GPU_MemoryT::init(const int ntypes, const double gamma,
                           const double upsilon, const double mu,
                           double **host_shape, double **host_well,
                           double **host_cutsq, double **host_sigma,
@@ -64,24 +69,24 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
     gpu_nbor=true;

   int _gpu_host=0;
-  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
   if (host_nlocal>0)
     _gpu_host=1;

-  if (!device->init(false,true,nlocal,host_nlocal,nall,0,gpu_nbor,_gpu_host,
-                    max_nbors,cell_size,true))
-    return false;
+  _threads_per_atom=device->threads_per_atom();
+  int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0,
+                           _gpu_host,max_nbors,cell_size,true);
+  if (success!=0)
+    return success;

   ucl_device=device->gpu;
   atom=&device->atom;
-  nbor=&device->nbor;

-  _block_size=BLOCK_1D;
-  if (static_cast<size_t>(_block_size)>ucl_device->group_size())
-    _block_size=ucl_device->group_size();
+  _block_size=device->pair_block_size();
   compile_kernels(*ucl_device);

   // Initialize host-device load balancer
-  hd_balancer.init(device,gpu_split);
+  hd_balancer.init(device,gpu_nbor,gpu_split);

   // Initialize timers for the selected GPU
   time_pair.init(*ucl_device);
@@ -90,8 +95,9 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (lj_types<=MAX_SHARED_TYPES && _block_size>=MAX_SHARED_TYPES) {
-    lj_types=MAX_SHARED_TYPES;
+  int max_shared_types=device->max_shared_types();
+  if (lj_types<=max_shared_types && _block_size>=max_shared_types) {
+    lj_types=max_shared_types;
     shared_types=true;
   }
   _lj_types=lj_types;
@@ -186,12 +192,19 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
   }

   if (multiple_forms)
-    atom->dev_ans.zero();
+    ans->dev_ans.zero();

-  _max_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  _max_bytes=ans->gpu_bytes()+nbor->gpu_bytes();

   // Memory for ilist ordered by particle type
-  return (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS);
+  if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
+    return 0;
+  else return -3;
+}
+
+template <class numtyp, class acctyp>
+void GB_GPU_MemoryT::estimate_gpu_overhead() {
+  device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead);
 }

 template <class numtyp, class acctyp>
@@ -209,9 +222,9 @@ void GB_GPU_MemoryT::clear() {

   // Output any timing information
   acc_timers();
-  double single[6], times[6];
+  double single[9], times[9];

-  single[0]=atom->transfer_time();
+  single[0]=atom->transfer_time()+ans->transfer_time();
   single[1]=nbor->time_nbor.total_seconds();
   single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+
             nbor->time_kernel.total_seconds();
@@ -220,15 +233,18 @@ void GB_GPU_MemoryT::clear() {
     single[4]=time_pair.total_seconds();
   else
     single[4]=0;
-  single[5]=atom->cast_time();
+  single[5]=atom->cast_time()+ans->cast_time();
+  single[6]=_gpu_overhead;
+  single[7]=_driver_overhead;
+  single[8]=ans->cpu_idle_time();

-  MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,device->replica());
+  MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,device->replica());
   double avg_split=hd_balancer.all_avg_split();

   _max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
               sigma_epsilon.row_bytes()+cut_form.row_bytes()+
               shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
-              gamma_upsilon_mu.row_bytes();
+              gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes();
   double mpi_max_bytes;
   MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
              device->replica());
@@ -255,10 +271,19 @@ void GB_GPU_MemoryT::clear() {
       fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
       fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
     }
+    fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size);
     fprintf(screen,"Average split: %.4f.\n",avg_split);
     fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
+    fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
+    fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
     fprintf(screen,"-------------------------------------");
     fprintf(screen,"--------------------------------\n\n");
+
+    fprintf(screen,"Average split: %.4f.\n",avg_split);
+    fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
+
   }
   _max_bytes=0.0;
@@ -299,10 +324,9 @@ void GB_GPU_MemoryT::clear() {

 template <class numtyp, class acctyp>
 double GB_GPU_MemoryT::host_memory_usage() const {
-  return device->atom.host_memory_usage()+
-         device->nbor.host_memory_usage()+4*sizeof(numtyp)+
-         sizeof(GB_GPU_Memory<numtyp,acctyp>)+
-         device->nbor.max_atoms()*sizeof(int);
+  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
+         4*sizeof(numtyp)+sizeof(GB_GPU_Memory<numtyp,acctyp>)+
+         nbor->max_atoms()*sizeof(int);
 }

 template <class numtyp, class acctyp>
@@ -18,8 +18,6 @@
 #ifndef GB_GPU_MEMORY_H
 #define GB_GPU_MEMORY_H

-#define BLOCK_1D 64
-
 #include "pair_gpu_device.h"
 #include "pair_gpu_balance.h"
 #include "mpi.h"
@@ -35,8 +33,15 @@ class GB_GPU_Memory {
   * \param max_nbors initial number of rows in the neighbor matrix
   * \param cell_size cutoff + skin
   * \param gpu_split fraction of particles handled by device
-  * \return false if there is not sufficient memory or device init prob **/
-  bool init(const int ntypes, const double gamma,
+  * \return false if there is not sufficient memory or device init prob
+  *
+  * Returns:
+  * - 0 if successfull
+  * - -1 if fix gpu not found
+  * - -3 if there is an out of memory error
+  * - -4 if the GPU library was not compiled for GPU
+  * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, const double gamma,
            const double upsilon, const double mu, double **host_shape,
            double **host_well, double **host_cutsq, double **host_sigma,
            double **host_epsilon, double *host_lshape, int **h_form,
@@ -46,12 +51,16 @@ class GB_GPU_Memory {
            const int max_nbors, const double cell_size,
            const double gpu_split, FILE *screen);

+  /// Estimate the overhead for GPU context changes and CPU driver
+  void estimate_gpu_overhead();
+
   /// Check if there is enough storage for atom arrays and realloc if not
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int inum, const int nall, bool &success) {
-    atom->resize(inum, nall, success);
-    if (multiple_forms) atom->dev_ans.zero();
-    double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+    atom->resize(nall, success);
+    ans->resize(inum, success);
+    if (multiple_forms) ans->dev_ans.zero();
+    double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
     if (bytes>_max_bytes)
       _max_bytes=bytes;
   }
@@ -74,7 +83,7 @@ class GB_GPU_Memory {
       success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
     }
     nbor->resize(nlocal,host_inum,max_nbors,success);
-    double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+    double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
     if (bytes>_max_bytes)
       _max_bytes=bytes;
   }
@@ -91,6 +100,7 @@ class GB_GPU_Memory {

   /// Accumulate timers
   inline void acc_timers() {
+    if (device->time_device()) {
     if (nbor_time_avail) {
       nbor->time_nbor.add_to_total();
       nbor->time_kernel.add_to_total();
@@ -104,6 +114,8 @@ class GB_GPU_Memory {
       time_pair.add_to_total();
     }
     atom->acc_timers();
+    ans->acc_timers();
+    }
   }

   /// Accumulate timers
@@ -117,6 +129,7 @@ class GB_GPU_Memory {
       time_pair.zero();
     }
     atom->zero_timers();
+    ans->zero_timers();
   }

   // -------------------------- DEVICE DATA -------------------------
@@ -168,6 +181,10 @@ class GB_GPU_Memory {

   int last_ellipse, max_last_ellipse;

+  // ------------------------ FORCE/ENERGY DATA -----------------------
+
+  PairGPUAns<numtyp,acctyp> *ans;
+
   // --------------------------- NBOR DATA ----------------------------

   /// Neighbor data
@@ -183,10 +200,12 @@ class GB_GPU_Memory {
   UCL_Kernel k_gayberne, k_sphere_gb, k_lj_fast, k_lj;
   inline int block_size() { return _block_size; }

+  int _threads_per_atom;
  private:
   bool _allocated, _compiled;
   int _block_size;
   double _max_bytes;
+  double _gpu_overhead, _driver_overhead;

   void compile_kernels(UCL_Device &dev);
 };
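With init() returning an int status instead of a bool, callers can distinguish the failure modes listed in the header comment above. A hypothetical reporting helper; the message strings are assumptions, only the numeric codes come from the commit:

    #include <cstdio>

    // Map the documented init() status codes to messages.
    void report_init(int status) {
      switch (status) {
        case 0:  break;                                              // success
        case -1: std::printf("fix gpu not found\n"); break;
        case -3: std::printf("out of memory on device\n"); break;
        case -4: std::printf("library not compiled for GPU\n"); break;
        case -5: std::printf("double precision unsupported on card\n"); break;
        default: std::printf("unknown GPU init error %d\n", status);
      }
    }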
@@ -1,2 +1,2 @@
-Geryon Version 10.280
+Geryon Version 11.094

@@ -167,6 +167,7 @@ class UCL_Device {
   int _device, _num_devices;
   std::vector<cudaDeviceProp> _properties;
   std::vector<cudaStream_t> _cq;
+  std::vector<int> _device_ids;
 };

 // Grabs the properties for all devices
@@ -178,6 +179,7 @@ inline UCL_Device::UCL_Device() {
     if (deviceProp.major == 9999 && deviceProp.minor == 9999)
       break;
     _properties.push_back(deviceProp);
+    _device_ids.push_back(dev);
   }
   _device=-1;
   _cq.push_back(cudaStream_t());
@@ -194,7 +196,7 @@ inline void UCL_Device::set(int num) {
     return;
   for (int i=1; i<num_queues(); i++) pop_command_queue();
   cudaThreadExit();
-  CUDA_SAFE_CALL_NS(cudaSetDevice(num));
+  CUDA_SAFE_CALL_NS(cudaSetDevice(_device_ids[num]));
   _device=num;
 }

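The new _device_ids table keeps the user-visible device numbering dense while entries such as the 9999.9999 emulation device are skipped, so set(num) can map the user's index back to the real CUDA ordinal. The same idea in a minimal runtime-API sketch; usable_device_ids() is an assumed helper name:

    #include <vector>
    #include <cuda_runtime.h>

    // Position in the returned vector == user-visible device number;
    // the stored value == the real CUDA ordinal to pass to cudaSetDevice().
    std::vector<int> usable_device_ids() {
      std::vector<int> ids;
      int n = 0;
      cudaGetDeviceCount(&n);
      for (int dev = 0; dev < n; ++dev) {
        cudaDeviceProp p;
        cudaGetDeviceProperties(&p, dev);
        if (p.major == 9999 && p.minor == 9999) continue;  // emulation entry
        ids.push_back(dev);
      }
      return ids;
    }
    // set(num) then calls cudaSetDevice(ids[num]) rather than
    // cudaSetDevice(num).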
@@ -42,6 +42,7 @@ inline void ucl_sync(CUstream &stream) {
 }

 struct NVDProperties {
+  int device_id;
   std::string name;
   int major;
   int minor;
@@ -208,15 +209,20 @@ inline UCL_Device::UCL_Device() {
   for (int dev=0; dev<_num_devices; ++dev) {
     CUdevice m;
     CU_SAFE_CALL_NS(cuDeviceGet(&m,dev));
+    int major, minor;
+    CU_SAFE_CALL_NS(cuDeviceComputeCapability(&major,&minor,m));
+    if (major==9999)
+      continue;
+
     _properties.push_back(NVDProperties());
+    _properties.back().device_id=dev;
+    _properties.back().major=major;
+    _properties.back().minor=minor;

     char namecstr[1024];
     CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m));
     _properties.back().name=namecstr;

-    CU_SAFE_CALL_NS(cuDeviceComputeCapability(&_properties.back().major,
-                                              &_properties.back().minor,m));
-
     CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount,
                                          CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
@@ -262,9 +268,9 @@ inline void UCL_Device::set(int num) {
     CU_SAFE_CALL_NS(cuCtxDestroy(_context));
     for (int i=1; i<num_queues(); i++) pop_command_queue();
   }
-  CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,num));
+  _device=_properties[num].device_id;
+  CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
   CU_SAFE_CALL_NS(cuCtxCreate(&_context,0,_cu_device));
-  _device=num;
 }

 // List all devices along with all properties
@@ -25,6 +25,7 @@
 #define NVD_TIMER_H

 #include "nvd_macros.h"
+#include "nvd_device.h"

 namespace ucl_cudadr {

@@ -66,12 +67,23 @@ class UCL_Timer {
   /// Stop timing on command queue
   inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }

+  /// Block until the start event has been reached on device
+  inline void sync_start()
+    { CU_SAFE_CALL(cuEventSynchronize(start_event)); }
+
+  /// Block until the stop event has been reached on device
+  inline void sync_stop()
+    { CU_SAFE_CALL(cuEventSynchronize(stop_event)); }
+
   /// Set the time elapsed to zero (not the total_time)
   inline void zero() {
     CU_SAFE_CALL(cuEventRecord(start_event,_cq));
     CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
   }

+  /// Set the total time to zero
+  inline void zero_total() { _total_time=0.0; }
+
   /// Add time from previous start and stop to total
   /** Forces synchronization **/
   inline double add_to_total()
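The new sync_start()/sync_stop() members block the host on a recorded event rather than on the whole command queue, which is what lets the load balancer time device work without draining unrelated kernels. An equivalent CUDA runtime-API sketch of timing one region and blocking only on the stop event; the function name is assumed:

    #include <cuda_runtime.h>

    // Time whatever is enqueued between the two event records on stream cq,
    // blocking the host only on the stop event (what sync_stop() does).
    float time_region_ms(cudaStream_t cq) {
      cudaEvent_t start_event, stop_event;
      cudaEventCreate(&start_event);
      cudaEventCreate(&stop_event);
      cudaEventRecord(start_event, cq);
      // ... enqueue kernels on cq here ...
      cudaEventRecord(stop_event, cq);
      cudaEventSynchronize(stop_event);   // == UCL_Timer::sync_stop()
      float ms = 0.0f;
      cudaEventElapsedTime(&ms, start_event, stop_event);
      cudaEventDestroy(start_event);
      cudaEventDestroy(stop_event);
      return ms;
    }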
@@ -25,6 +25,7 @@
 #define OCL_TIMER_H

 #include "ocl_macros.h"
+#include "ocl_device.h"

 namespace ucl_opencl {

@@ -67,10 +68,21 @@ class UCL_Timer {
   /// Stop timing on default command queue
   inline void stop() { clEnqueueMarker(_cq,&stop_event); }

+  /// Block until the start event has been reached on device
+  inline void sync_start()
+    { CL_SAFE_CALL(clWaitForEvents(1,&start_event)); }
+
+  /// Block until the stop event has been reached on device
+  inline void sync_stop()
+    { CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); }
+
   /// Set the time elapsed to zero (not the total_time)
   inline void zero()
     { clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); }

+  /// Set the total time to zero
+  inline void zero_total() { _total_time=0.0; }
+
   /// Add time from previous start and stop to total
   /** Forces synchronization **/
   inline double add_to_total()
@@ -206,6 +206,191 @@
     add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
   }

+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22, t23 *a23) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                       t26 *a26) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                       t26 *a26, t27 *a27) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                       t26 *a26, t27 *a27, t28 *a28) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28, class t29>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                       t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28, class t29, class t30>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                       t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
+  }

 // ---------------------------------------------------------------------------

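The ladder of add_args() overloads above is the pre-C++11 idiom for accepting a variable number of kernel arguments; with variadic templates the whole family collapses to one definition per function. A sketch under that assumption (C++17 fold syntax; UCL_KernelDemo and its trivial bodies are stand-ins, not the real UCL_Kernel):

    #include <cstdio>

    struct UCL_KernelDemo {
      int nargs = 0;
      template <class T> void add_arg(T *arg) { ++nargs; (void)arg; }
      void clear_args() { nargs = 0; }
      void run() { std::printf("launch with %d args\n", nargs); }

      // One definition replaces every add_args() overload above.
      template <class... Ts>
      void add_args(Ts*... args) { (add_arg(args), ...); }

      // Likewise for the run() overloads that follow below.
      template <class... Ts>
      void run(Ts*... args) { clear_args(); (add_arg(args), ...); run(); }
    };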
@ -439,6 +624,211 @@
     run();
   }
 
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22, t23 *a23) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                  t26 *a26) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                  t26 *a26, t27 *a27) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                  t26 *a26, t27 *a27, t28 *a28) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28, class t29>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                  t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28, class t29, class t30>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                  t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
+    run();
+  }
+
 // ---------------------------------------------------------------------------
 
   template <class t1>
@ -671,3 +1061,208 @@
     run(cq);
   }
 
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22, t23 *a23) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                     t26 *a26) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                     t26 *a26, t27 *a27) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                     t26 *a26, t27 *a27, t28 *a28) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28, class t29>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                     t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28, class t29, class t30>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                     t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
+    run(cq);
+  }
 
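Each run()/run_cq() overload above composes the same three steps: clear the pending argument list, push each pointer, then launch, optionally on a caller-supplied command queue. A hedged, stand-alone sketch of that shape, with FakeQueue standing in for a real command queue and all names illustrative:

#include <cstdio>

struct FakeQueue { int id; };           // stand-in for a real command queue

class Kernel {
 public:
  template <class t1, class t2>
  inline void run(t1 *a1, t2 *a2)
    { clear_args(); add_arg(a1); add_arg(a2); run(); }
  template <class t1, class t2>
  inline void run_cq(FakeQueue &cq, t1 *a1, t2 *a2)
    { clear_args(); add_arg(a1); add_arg(a2); run(cq); }
 private:
  inline void clear_args() { _n=0; }
  template <class t> inline void add_arg(t *) { ++_n; }
  inline void run() { std::printf("launch, %d args\n",_n); }
  inline void run(FakeQueue &cq) { std::printf("launch on queue %d\n",cq.id); }
  int _n;
};

int main() {
  Kernel k; FakeQueue q={1};
  int eflag=1; double cut=2.5;
  k.run(&eflag,&cut);
  k.run_cq(q,&eflag,&cut);
  return 0;
}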
@ -61,20 +61,23 @@ class UCL_D_Mat : public UCL_BaseMat {
   inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
                    const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
     clear();
-    _kind=kind;
-    _rows=rows;
-    _cols=cols;
     int err=_device_alloc(*this,cq,rows,cols,_pitch,kind);
-    _row_size=_pitch/sizeof(numtyp);
-    #ifndef _UCL_DEVICE_PTR_MAT
-    _end=_array+_row_size*cols;
-    #endif
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate "
                 << rows*cols*sizeof(numtyp) << " bytes on device.\n";
       exit(1);
+      #endif
+      return err;
     }
+
+    _kind=kind;
+    _rows=rows;
+    _cols=cols;
+    _row_size=_pitch/sizeof(numtyp);
+    #ifndef _UCL_DEVICE_PTR_MAT
+    _end=_array+_row_size*cols;
     #endif
     #ifdef _OCL_MAT
     _offset=0;
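The pattern in this and the following alloc() hunks is uniform: with UCL_NO_EXIT defined the error branch now returns the allocation status instead of terminating the process, and member state (_kind, _rows, _cols, ...) is committed only after a successful device allocation. A minimal sketch of the convention, assuming a UCL_SUCCESS-style zero status (names here are stand-ins, not the library's):

#include <cstdlib>
#include <iostream>

#define UCL_SUCCESS_SKETCH 0            // stand-in for UCL_SUCCESS

inline int alloc_bytes(void **ptr, const std::size_t bytes) {
  *ptr=std::malloc(bytes);
  if (*ptr==NULL) {
    #ifndef UCL_NO_EXIT
    std::cerr << "Error: could not allocate " << bytes << " bytes.\n";
    std::exit(1);
    #endif
    return -3;                          // caller handles it when UCL_NO_EXIT is set
  }
  return UCL_SUCCESS_SKETCH;            // commit object state only after this
}

int main() {
  void *p;
  int err=alloc_bytes(&p,1024);
  if (err==UCL_SUCCESS_SKETCH) std::free(p);
  return err;
}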
@ -94,20 +97,23 @@ class UCL_D_Mat : public UCL_BaseMat {
   inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
                    const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
     clear();
-    _kind=kind;
-    _rows=rows;
-    _cols=cols;
     int err=_device_alloc(*this,device,rows,cols,_pitch,kind);
-    _row_size=_pitch/sizeof(numtyp);
-    #ifndef _UCL_DEVICE_PTR_MAT
-    _end=_array+_row_size*cols;
-    #endif
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate "
                 << rows*cols*sizeof(numtyp) << " bytes on device.\n";
       exit(1);
+      #endif
+      return err;
     }
+
+    _kind=kind;
+    _rows=rows;
+    _cols=cols;
+    _row_size=_pitch/sizeof(numtyp);
+    #ifndef _UCL_DEVICE_PTR_MAT
+    _end=_array+_row_size*cols;
     #endif
     #ifdef _OCL_MAT
     _offset=0;
@ -60,19 +60,24 @@ class UCL_D_Vec : public UCL_BaseMat {
                    const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
 
     clear();
-    _kind=kind;
-    _cols=cols;
     _row_bytes=cols*sizeof(numtyp);
     int err=_device_alloc(*this,cq,_row_bytes,kind);
-    #ifndef _UCL_DEVICE_PTR_MAT
-    _end=_array+cols;
-    #endif
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes
                 << " bytes on device.\n";
+      _row_bytes=0;
       exit(1);
+      #endif
+      _row_bytes=0;
+      return err;
     }
+
+    _kind=kind;
+    _cols=cols;
+    #ifndef _UCL_DEVICE_PTR_MAT
+    _end=_array+cols;
     #endif
     #ifdef _OCL_MAT
     _offset=0;
@ -90,19 +95,23 @@ class UCL_D_Vec : public UCL_BaseMat {
   inline int alloc(const size_t cols, UCL_Device &device,
                    const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
     clear();
-    _kind=kind;
-    _cols=cols;
     _row_bytes=cols*sizeof(numtyp);
     int err=_device_alloc(*this,device,_row_bytes,kind);
-    #ifndef _UCL_DEVICE_PTR_MAT
-    _end=_array+cols;
-    #endif
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes
                 << " bytes on device.\n";
+      _row_bytes=0;
       exit(1);
+      #endif
+      _row_bytes=0;
+      return err;
     }
+
+    _kind=kind;
+    _cols=cols;
+    #ifndef _UCL_DEVICE_PTR_MAT
+    _end=_array+cols;
     #endif
     #ifdef _OCL_MAT
     _offset=0;
@ -39,7 +39,11 @@ class UCL_H_Mat : public UCL_BaseMat {
   };
   typedef numtyp data_type;
 
-  UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { }
+  UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) {
+    #ifdef _OCL_MAT
+    _carray=(cl_mem)(0);
+    #endif
+  }
   ~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
 
   /// Construct with specied number of rows and columns
@ -59,18 +63,23 @@ class UCL_H_Mat : public UCL_BaseMat {
   inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
                    const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
     clear();
-    _cols=cols;
-    _rows=rows;
     _row_bytes=cols*sizeof(numtyp);
-    _kind=kind;
-    int err=_host_alloc(*this,cq,_row_bytes*_rows,kind);
+    int err=_host_alloc(*this,cq,_row_bytes*rows,kind);
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
                 << " bytes on host.\n";
+      _row_bytes=0;
       exit(1);
-    }
     #endif
+      _row_bytes=0;
+      return err;
+    }
+
+    _cols=cols;
+    _rows=rows;
+    _kind=kind;
     _end=_array+rows*cols;
     return err;
   }
@ -85,19 +94,24 @@ class UCL_H_Mat : public UCL_BaseMat {
   inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
                    const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
     clear();
-    _cols=cols;
-    _rows=rows;
     _row_bytes=cols*sizeof(numtyp);
-    _kind=kind;
-    int err=_host_alloc(*this,device,_row_bytes*_rows,kind);
-    _end=_array+rows*cols;
-    #ifndef UCL_NO_EXIT
+    int err=_host_alloc(*this,device,_row_bytes*rows,kind);
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
                 << " bytes on host.\n";
+      _row_bytes=0;
       exit(1);
-    }
     #endif
+      _row_bytes=0;
+      return err;
+    }
+
+    _cols=cols;
+    _rows=rows;
+    _kind=kind;
+    _end=_array+rows*cols;
     return err;
   }
 
@ -39,7 +39,11 @@ class UCL_H_Vec : public UCL_BaseMat {
   };
   typedef numtyp data_type;
 
-  UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { }
+  UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) {
+    #ifdef _OCL_MAT
+    _carray=(cl_mem)(0);
+    #endif
+  }
   ~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
 
   /// Construct with n columns
@ -59,18 +63,24 @@ class UCL_H_Vec : public UCL_BaseMat {
   inline int alloc(const size_t cols, mat_type &cq,
                    const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
     clear();
-    _cols=cols;
     _row_bytes=cols*sizeof(numtyp);
-    _kind=kind;
     int err=_host_alloc(*this,cq,_row_bytes,kind);
-    _end=_array+cols;
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes
                 << " bytes on host.\n";
+      _row_bytes=0;
       exit(1);
-    }
     #endif
+      _row_bytes=0;
+      return err;
+    }
+
+    _cols=cols;
+    _kind=kind;
+    _end=_array+cols;
     return err;
   }
 
@ -84,18 +94,24 @@ class UCL_H_Vec : public UCL_BaseMat {
   inline int alloc(const size_t cols, UCL_Device &device,
                    const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
     clear();
-    _cols=cols;
     _row_bytes=cols*sizeof(numtyp);
-    _kind=kind;
     int err=_host_alloc(*this,device,_row_bytes,kind);
-    _end=_array+cols;
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes
                 << " bytes on host.\n";
+      _row_bytes=0;
       exit(1);
-    }
     #endif
+      _row_bytes=0;
+      return err;
+    }
+
+    _cols=cols;
+    _kind=kind;
+    _end=_array+cols;
     return err;
   }
 
@ -25,8 +25,18 @@
 #ifndef UCL_NV_KERNEL_H
 #define UCL_NV_KERNEL_H
 
-#define GLOBAL_ID_X threadIdx.x+__mul24(blockIdx.x,blockDim.x)
-#define GLOBAL_ID_Y threadIdx.y+__mul24(blockIdx.y,blockDim.y)
+#if (__CUDA_ARCH__ < 200)
+#define mul24 __mul24
+#define MEM_THREADS 16
+#else
+#define mul24(X,Y) (X)*(Y)
+#define MEM_THREADS 32
+#endif
+
+#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
+#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
+#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
+#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y);
 #define THREAD_ID_X threadIdx.x
 #define THREAD_ID_Y threadIdx.y
 #define BLOCK_ID_X blockIdx.x
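The new macros make the 24-bit multiply a compile-time choice: __mul24 is the fast path on sm_1x parts, while Fermi-class (sm_20 and later) hardware multiplies 32-bit integers at full rate, so mul24 collapses to a plain product. A small sketch of how kernel code stays architecture-neutral under this scheme (the guard is replicated here so the example is self-contained; kernel name is illustrative):

#if (__CUDA_ARCH__ < 200)
#define mul24 __mul24                   // 24-bit multiply: fast on sm_1x
#else
#define mul24(X,Y) (X)*(Y)              // full-rate 32-bit multiply on sm_20+
#endif

__global__ void index_demo(int *out, const int n) {
  // Identical source for both architectures; the macro picks the fast path.
  int gid=threadIdx.x+mul24((int)blockIdx.x,(int)blockDim.x);
  if (gid<n)
    out[gid]=gid;
}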
@ -35,8 +45,9 @@
 #define BLOCK_SIZE_Y blockDim.y
 #define __kernel extern "C" __global__
 #define __local __shared__
-#define mul24 __mul24
 #define __global
 #define __inline static __inline__ __device__
+#define atom_add atomicAdd
 
 #endif
 
@ -28,7 +28,7 @@ static LJ96_GPU_Memory<PRECISION,ACC_PRECISION> LJ96MF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                    double **host_lj2, double **host_lj3, double **host_lj4,
                    double **offset, double *special_lj, const int inum,
                    const int nall, const int max_nbors, const int maxspecial,
@ -53,13 +53,11 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                         host_lj4, offset, special_lj, inum, nall, 300,
                         maxspecial, cell_size, gpu_split, screen);
-    if (!init_ok)
-      return false;
-  }
 
   LJ96MF.device->world_barrier();
   if (message)
@ -74,46 +72,46 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                               last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                               host_lj4, offset, special_lj, inum,
-                               nall, 300, maxspecial, cell_size, gpu_split,
-                               screen);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                          offset, special_lj, inum, nall, 300, maxspecial,
+                          cell_size, gpu_split, screen);
     LJ96MF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+
+  if (init_ok==0)
+    LJ96MF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void lj96_gpu_clear() {
   LJ96MF.clear();
 }
 
-int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum_full,
+int** lj96_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
-                         double *boxlo, double *boxhi, int *tag, int **nspecial,
+                         double *sublo, double *subhi, int *tag, int **nspecial,
                          int **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
-                         const double cpu_time, bool &success) {
-  return LJ96MF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
-                        boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                        vatom, host_start, cpu_time, success);
+                         int **ilist, int **jnum, const double cpu_time,
+                         bool &success) {
+  return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                        subhi, tag, nspecial, special, eflag, vflag, eatom,
+                        vatom, host_start, ilist, jnum, cpu_time, success);
 }
 
-void lj96_gpu_compute(const int timestep, const int ago, const int inum_full,
-                      const int nall, double **host_x, int *host_type,
-                      int *ilist, int *numj, int **firstneigh,
-                      const bool eflag, const bool vflag, const bool eatom,
-                      const bool vatom, int &host_start, const double cpu_time,
-                      bool &success) {
-  LJ96MF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
-                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
+void lj96_gpu_compute(const int ago, const int inum_full, const int nall,
+                      double **host_x, int *host_type, int *ilist, int *numj,
+                      int **firstneigh, const bool eflag, const bool vflag,
+                      const bool eatom, const bool vatom, int &host_start,
+                      const double cpu_time, bool &success) {
+  LJ96MF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
+                 eflag,vflag,eatom,vatom,host_start,cpu_time,success);
 }
 
 double lj96_gpu_bytes() {
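The init path now records an int status per rank instead of returning early, so the staggered bring-up (rank 0 first, then each rank sharing a GPU in turn) runs to completion on every rank before the status is acted on. Sketched below with hypothetical stand-ins for the init and barrier calls; only the control flow mirrors the hunk above:

static int do_init() { return 0; }      // hypothetical per-rank init, 0 on success
static void world_barrier() { }         // hypothetical: all MPI ranks sync
static void gpu_barrier() { }           // hypothetical: ranks sharing one GPU sync

int staggered_init(int world_me, int gpu_rank, int procs_per_gpu) {
  int init_ok=0;
  if (world_me==0)
    init_ok=do_init();                  // rank 0 initializes first
  world_barrier();
  for (int i=0; i<procs_per_gpu; i++) {
    if (gpu_rank==i && world_me!=0)
      init_ok=do_init();                // remaining ranks take turns
    gpu_barrier();
  }
  return init_ok;                       // 0 on success, nonzero error otherwise
}

int main() { return staggered_init(0,0,1); }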
@ -18,8 +18,6 @@
 #ifndef LJ96_GPU_KERNEL
 #define LJ96_GPU_KERNEL
 
-#define MAX_SHARED_TYPES 8
-
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
@ -46,7 +44,7 @@
 
 #ifdef NV_KERNEL
 
-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"
 texture<float4> pos_tex;
 
 #ifdef _DOUBLE_DOUBLE
@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 #define __inline inline
 
 #define fetch_pos(i,y) x_[i]
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8
 
 #endif
 
@ -82,19 +82,21 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types,
                           __global numtyp *sp_lj_in, __global int *dev_nbor,
-                          __global acctyp4 *ans, __global acctyp *engv,
-                          const int eflag, const int vflag, const int inum,
-                          const int nall, const int nbor_pitch) {
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag,
+                          const int vflag, const int inum, const int nall,
+                          const int nbor_pitch, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
 
   __local numtyp sp_lj[4];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  if (ii<inum) {
-
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0;
@ -104,18 +106,31 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
 
+  if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+    for ( ; nbor<list_end; nbor+=n_stride) {
 
       int j=*nbor;
       factor_lj = sp_lj[sbmask(j)];
@ -157,8 +172,47 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       }
 
     } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
   // Store answers
+  if (ii<inum && offset==0) {
     __global acctyp *ap1=engv+ii;
     if (eflag>0) {
       *ap1=energy;
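The reduction block added above is a classic strided shared-memory tree: t_per_atom threads cooperate on one atom and combine partial forces in log2(t_per_atom) halving steps; the original kernels issue no synchronization inside the loop, relying on warp-synchronous execution for t_per_atom up to the warp size. A standalone sketch under those same assumptions (names and the surrounding setup are illustrative):

#define BLOCK_PAIR 64

__global__ void reduce_demo(const float *partial, float *total,
                            const int t_per_atom) {
  __shared__ float red_acc[BLOCK_PAIR];
  int tid=threadIdx.x;
  int offset=tid%t_per_atom;            // lane within this atom's thread group
  red_acc[tid]=partial[blockIdx.x*blockDim.x+tid];
  for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
    // warp-synchronous: valid only for t_per_atom <= warp size, as assumed
    // by the era's hardware model in the kernels above
    if (offset<s)
      red_acc[tid]+=red_acc[tid+s];
  }
  if (offset==0)                        // one writer per atom group
    total[blockIdx.x*blockDim.x+tid]=red_acc[tid];
}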
@ -176,26 +230,27 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
 
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in,
-                               __global numtyp* sp_lj_in, __global int *dev_nbor,
+                               __global numtyp* sp_lj_in,
+                               __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv,
                                const int eflag, const int vflag, const int inum,
-                               const int nall, const int nbor_pitch) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                               const int nall, const int nbor_pitch,
+                               const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
 
   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
-  if (ii<4)
-    sp_lj[ii]=sp_lj_in[ii];
-  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    lj1[ii]=lj1_in[ii];
+  if (tid<4)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
     if (eflag>0)
-      lj3[ii]=lj3_in[ii];
+      lj3[tid]=lj3_in[tid];
   }
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
-  __syncthreads();
-
-  if (ii<inum) {
-
   acctyp energy=(acctyp)0;
   acctyp4 f;
@ -206,19 +261,34 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
 
+  __syncthreads();
+
+  if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     int iw=ix.w;
     int itype=mul24((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+    for ( ; nbor<list_end; nbor+=n_stride) {
 
       int j=*nbor;
       factor_lj = sp_lj[sbmask(j)];
@ -258,8 +328,47 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       }
 
     } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
   // Store answers
+  if (ii<inum && offset==0) {
     __global acctyp *ap1=engv+ii;
     if (eflag>0) {
       *ap1=energy;
@ -42,7 +42,7 @@ int LJ96_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool LJ96_GPU_MemoryT::init(const int ntypes,
+int LJ96_GPU_MemoryT::init(const int ntypes,
                             double **host_cutsq, double **host_lj1,
                             double **host_lj2, double **host_lj3,
                             double **host_lj4, double **host_offset,
@ -50,14 +50,18 @@ bool LJ96_GPU_MemoryT::init(const int ntypes,
                             const int nall, const int max_nbors,
                             const int maxspecial, const double cell_size,
                             const double gpu_split, FILE *_screen) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,lj96_cut_gpu_kernel);
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,lj96_cut_gpu_kernel);
+  if (success!=0)
+    return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
-    lj_types=MAX_SHARED_TYPES;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
     shared_types=true;
   }
   _lj_types=lj_types;
@ -84,7 +88,7 @@ bool LJ96_GPU_MemoryT::init(const int ntypes,
 
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
@ -122,9 +126,10 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
 
-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
 
-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
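The grid-size change accounts for thread splitting: with _threads_per_atom threads assigned to each atom, a block of BX threads now covers only BX/_threads_per_atom atoms, so the grid must grow by the same factor. A tiny host-side check of the arithmetic (values illustrative):

#include <cmath>
#include <cstdio>

static int grid_size(const int inum, const int BX, const int t_per_atom) {
  return static_cast<int>(std::ceil(static_cast<double>(inum)/
                                    (BX/t_per_atom)));
}

int main() {
  // 64 threads, 4 per atom -> 16 atoms per block; 1000 atoms need 63 blocks.
  std::printf("%d\n",grid_size(1000,64,4));   // prints 63
  return 0;
}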
@ -133,16 +138,18 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch);
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
+                          &ainum, &anall, &nbor_pitch,
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch);
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &anall, &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
@@ -29,8 +29,15 @@ class LJ96_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq, double **host_lj1,
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, double **host_lj1,
             double **host_lj2, double **host_lj3, double **host_lj4,
             double **host_offset, double *host_special_lj,
             const int nlocal, const int nall, const int max_nbors,
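Editor's note: the init() entry points in this commit now report status as an int code instead of a bool. A minimal sketch of how a caller might map the codes above to messages; the helper itself is hypothetical and not part of the library, only the code values come from the doc comment:

  #include <cstdio>

  // Hypothetical caller-side helper; illustrative only.
  static const char *init_error(int code) {
    switch (code) {
      case 0:  return "success";
      case -1: return "fix gpu not found";
      case -3: return "out of memory on device";
      case -4: return "GPU library not compiled for this accelerator";
      case -5: return "double precision not supported on card";
      default: return "unknown initialization error";
    }
  }

  // Usage sketch:
  //   int init_ok = LJ96MF.init(...);
  //   if (init_ok != 0)
  //     fprintf(stderr, "GPU init failed: %s\n", init_error(init_ok));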
@@ -28,12 +28,11 @@ static LJL_GPU_Memory<PRECISION,ACC_PRECISION> LJLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool ljl_gpu_init(const int ntypes, double **cutsq,
-                  double **host_lj1, double **host_lj2, double **host_lj3,
-                  double **host_lj4, double **offset, double *special_lj,
-                  const int inum, const int nall, const int max_nbors,
-                  const int maxspecial, const double cell_size, int &gpu_mode,
-                  FILE *screen) {
+int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                 double **host_lj2, double **host_lj3, double **host_lj4,
+                 double **offset, double *special_lj, const int inum,
+                 const int nall, const int max_nbors, const int maxspecial,
+                 const double cell_size, int &gpu_mode, FILE *screen) {
   LJLMF.clear();
   gpu_mode=LJLMF.device->gpu_mode();
   double gpu_split=LJLMF.device->particle_split();
@@ -54,13 +53,11 @@ bool ljl_gpu_init(const int ntypes, double **cutsq,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                        host_lj4, offset, special_lj, inum, nall, 300,
                        maxspecial, cell_size, gpu_split, screen);
-    if (!init_ok)
-      return false;
-  }
 
   LJLMF.device->world_barrier();
   if (message)
@@ -75,45 +72,45 @@ bool ljl_gpu_init(const int ntypes, double **cutsq,
               last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                              host_lj4, offset, special_lj, inum, nall, 300,
-                              maxspecial, cell_size, gpu_split,
-                              screen);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                         offset, special_lj, inum, nall, 300, maxspecial,
+                         cell_size, gpu_split, screen);
     LJLMF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+
+  if (init_ok==0)
+    LJLMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void ljl_gpu_clear() {
   LJLMF.clear();
 }
 
-int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum_full,
-                        const int nall, double **host_x, int *host_type,
-                        double *boxlo, double *boxhi, int *tag, int **nspecial,
-                        int **special, const bool eflag, const bool vflag,
-                        const bool eatom, const bool vatom, int &host_start,
-                        const double cpu_time, bool &success) {
-  return LJLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
-                       boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                       vatom, host_start, cpu_time, success);
+int ** ljl_gpu_compute_n(const int ago, const int inum_full,
+                         const int nall, double **host_x, int *host_type,
+                         double *sublo, double *subhi, int *tag, int **nspecial,
+                         int **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         int **ilist, int **jnum, const double cpu_time,
+                         bool &success) {
+  return LJLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                       subhi, tag, nspecial, special, eflag, vflag, eatom,
+                       vatom, host_start, ilist, jnum, cpu_time, success);
 }
 
-void ljl_gpu_compute(const int timestep, const int ago, const int inum_full,
-                     const int nall, double **host_x, int *host_type,
-                     int *ilist, int *numj, int **firstneigh,
-                     const bool eflag, const bool vflag, const bool eatom,
-                     const bool vatom, int &host_start, const double cpu_time,
-                     bool &success) {
-  LJLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
+void ljl_gpu_compute(const int ago, const int inum_full, const int nall,
+                     double **host_x, int *host_type, int *ilist, int *numj,
+                     int **firstneigh, const bool eflag, const bool vflag,
+                     const bool eatom, const bool vatom, int &host_start,
+                     const double cpu_time, bool &success) {
+  LJLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
 }
 
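Editor's note: the neighbor-build entry point now returns int** (host pointers to the device-built neighbor data) and exposes the i-list and per-atom neighbor counts through the new ilist/jnum output arguments. A hedged host-side usage sketch; every variable name here is an illustrative placeholder, only the argument order comes from the signature above:

  // Illustrative call site for the new ljl_gpu_compute_n() signature.
  int host_start;
  bool success = true;
  int *ilist = nullptr, *numj = nullptr;   // filled in by the library
  int **firstneigh =
      ljl_gpu_compute_n(neighbor_age, nlocal, nall, x, type,
                        sublo, subhi, tag, nspecial, special,
                        eflag, vflag, eatom, vatom, host_start,
                        &ilist, &numj, cpu_time, success);
  if (!success) {
    // device ran out of memory; fall back to the CPU path
  }
  // Atoms [host_start, nlocal) remain for the host to compute when the
  // run is split between CPU and GPU.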
@@ -18,8 +18,6 @@
 #ifndef LJ_GPU_KERNEL
 #define LJ_GPU_KERNEL
 
-#define MAX_SHARED_TYPES 8
-
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
@@ -46,7 +44,7 @@
 
 #ifdef NV_KERNEL
 
-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"
 texture<float4> pos_tex;
 
@@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 #define __inline inline
 
 #define fetch_pos(i,y) x_[i]
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8
 
 #endif
 
@@ -82,19 +82,21 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types,
                           __global numtyp *sp_lj_in, __global int *dev_nbor,
-                          __global acctyp4 *ans, __global acctyp *engv,
-                          const int eflag, const int vflag, const int inum,
-                          const int nall, const int nbor_pitch) {
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag,
+                          const int vflag, const int inum, const int nall,
+                          const int nbor_pitch, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
 
   __local numtyp sp_lj[4];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  if (ii<inum) {
 
     acctyp energy=(acctyp)0;
     acctyp4 f;
     f.x=(acctyp)0;
@@ -104,18 +106,31 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     for (int i=0; i<6; i++)
       virial[i]=(acctyp)0;
 
+  if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+    for ( ; nbor<list_end; nbor+=n_stride) {
 
       int j=*nbor;
       factor_lj = sp_lj[sbmask(j)];
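Editor's note: the kernels can now walk the neighbor list in two layouts. When dev_nbor==dev_packed the indices live in the strided (pitched) matrix built on the device; otherwise *nbor is an offset into a densely packed list. The t_per_atom lanes that share one atom each take every t_per_atom-th neighbor starting at their own offset. A self-contained host-side model of the address arithmetic (pure illustration; names are placeholders):

  /* Model of the two neighbor layouts walked by the kernels above. */
  typedef struct { int start, end, stride; } nbor_walk;

  /* dense matrix: one column per atom, rows nbor_pitch apart */
  nbor_walk walk_strided(int base, int numj, int nbor_pitch,
                         int offset, int t_per_atom) {
    nbor_walk w = { base + offset*nbor_pitch,
                    base + numj*nbor_pitch,
                    t_per_atom*nbor_pitch };
    return w;
  }

  /* packed list: contiguous run beginning at the stored start offset */
  nbor_walk walk_packed(int start_offset, int numj,
                        int offset, int t_per_atom) {
    nbor_walk w = { start_offset + offset,
                    start_offset + numj,
                    t_per_atom };
    return w;
  }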
@@ -156,8 +171,47 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       }
 
     } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
 
   // Store answers
+  if (ii<inum && offset==0) {
     __global acctyp *ap1=engv+ii;
     if (eflag>0) {
       *ap1=energy;
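Editor's note: when more than one thread is assigned per atom, partial forces and energies are combined with a power-of-two tree reduction in shared/local memory before lane 0 of each group stores the result. A standalone model of the same pattern (names hypothetical; it assumes t_per_atom is a power of two, and the kernel additionally relies on the group fitting in one warp, since there is no barrier inside the loop):

  /* Combine the partial sums of the t_per_atom lanes sharing one atom.
     After the loop, the lane with offset==0 holds the group total. */
  void reduce_group(float red_acc[4][64], int tid, int offset,
                    int t_per_atom) {
    for (unsigned int s = t_per_atom/2; s > 0; s >>= 1)
      if (offset < (int)s)                 /* lower half accumulates upper */
        for (int r = 0; r < 4; r++)
          red_acc[r][tid] += red_acc[r][tid + s];
  }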
@@ -175,26 +229,27 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
 
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in,
-                               __global numtyp* sp_lj_in, __global int *dev_nbor,
+                               __global numtyp* sp_lj_in,
+                               __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv,
                                const int eflag, const int vflag, const int inum,
-                               const int nall, const int nbor_pitch) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                               const int nall, const int nbor_pitch,
+                               const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
 
   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
-  if (ii<4)
-    sp_lj[ii]=sp_lj_in[ii];
-  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    lj1[ii]=lj1_in[ii];
+  if (tid<4)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
     if (eflag>0)
-      lj3[ii]=lj3_in[ii];
+      lj3[tid]=lj3_in[tid];
   }
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
-  __syncthreads();
 
-  if (ii<inum) {
 
     acctyp energy=(acctyp)0;
     acctyp4 f;
@@ -205,19 +260,34 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     for (int i=0; i<6; i++)
       virial[i]=(acctyp)0;
 
+  __syncthreads();
+
+  if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     int iw=ix.w;
     int itype=mul24((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+    for ( ; nbor<list_end; nbor+=n_stride) {
 
       int j=*nbor;
       factor_lj = sp_lj[sbmask(j)];
@@ -256,8 +326,47 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       }
 
     } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
 
   // Store answers
+  if (ii<inum && offset==0) {
     __global acctyp *ap1=engv+ii;
     if (eflag>0) {
       *ap1=energy;
@@ -42,7 +42,7 @@ int LJL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool LJL_GPU_MemoryT::init(const int ntypes,
+int LJL_GPU_MemoryT::init(const int ntypes,
                            double **host_cutsq, double **host_lj1,
                            double **host_lj2, double **host_lj3,
                            double **host_lj4, double **host_offset,
@@ -50,14 +50,18 @@ bool LJL_GPU_MemoryT::init(const int ntypes,
                            const int nall, const int max_nbors,
                            const int maxspecial, const double cell_size,
                            const double gpu_split, FILE *_screen) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,lj_cut_gpu_kernel);
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,lj_cut_gpu_kernel);
+  if (success!=0)
+    return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
-    lj_types=MAX_SHARED_TYPES;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
     shared_types=true;
   }
   _lj_types=lj_types;
@@ -84,7 +88,7 @@ bool LJL_GPU_MemoryT::init(const int ntypes,
 
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
@@ -122,9 +126,10 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
 
-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
 
-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -133,16 +138,18 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch);
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
+                          &ainum, &anall, &nbor_pitch,
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch);
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &anall, &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
@@ -29,8 +29,15 @@ class LJL_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq,
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq,
             double **host_lj1, double **host_lj2, double **host_lj3,
             double **host_lj4, double **host_offset, double *host_special_lj,
             const int nlocal, const int nall, const int max_nbors,
@@ -28,7 +28,7 @@ static LJC_GPU_Memory<PRECISION,ACC_PRECISION> LJCMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                   double **host_lj2, double **host_lj3, double **host_lj4,
                   double **offset, double *special_lj, const int inum,
                   const int nall, const int max_nbors, const int maxspecial,
@@ -55,15 +55,12 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                        host_lj4, offset, special_lj, inum, nall, 300,
-                       maxspecial, cell_size, gpu_split, screen,
-                       host_cut_ljsq, host_cut_coulsq, host_special_coul,
-                       qqrd2e);
-    if (!init_ok)
-      return false;
-  }
+                       maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
+                       host_cut_coulsq, host_special_coul, qqrd2e);
 
   LJCMF.device->world_barrier();
   if (message)
@@ -78,48 +75,51 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
               last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                              host_lj4, offset, special_lj, inum, nall, 300,
-                              maxspecial, cell_size, gpu_split,
-                              screen, host_cut_ljsq, host_cut_coulsq,
-                              host_special_coul, qqrd2e);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                         offset, special_lj, inum, nall, 300, maxspecial,
+                         cell_size, gpu_split, screen, host_cut_ljsq,
+                         host_cut_coulsq, host_special_coul, qqrd2e);
    LJCMF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+
+  if (init_ok==0)
+    LJCMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void ljc_gpu_clear() {
  LJCMF.clear();
 }
 
-int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum_full,
-                        const int nall, double **host_x, int *host_type,
-                        double *boxlo, double *boxhi, int *tag, int **nspecial,
-                        int **special, const bool eflag, const bool vflag,
-                        const bool eatom, const bool vatom, int &host_start,
-                        const double cpu_time, bool &success, double *host_q) {
-  return LJCMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
-                       boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                       vatom, host_start, cpu_time, success, host_q);
+int** ljc_gpu_compute_n(const int ago, const int inum_full,
+                        const int nall, double **host_x, int *host_type,
+                        double *sublo, double *subhi, int *tag, int **nspecial,
+                        int **special, const bool eflag, const bool vflag,
+                        const bool eatom, const bool vatom, int &host_start,
+                        int **ilist, int **jnum, const double cpu_time,
+                        bool &success, double *host_q, double *boxlo,
+                        double *prd) {
+  return LJCMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                       subhi, tag, nspecial, special, eflag, vflag, eatom,
+                       vatom, host_start, ilist, jnum, cpu_time, success,
+                       host_q, boxlo, prd);
 }
 
-void ljc_gpu_compute(const int timestep, const int ago, const int inum_full,
-                     const int nall, double **host_x, int *host_type,
-                     int *ilist, int *numj, int **firstneigh,
-                     const bool eflag, const bool vflag, const bool eatom,
-                     const bool vatom, int &host_start, const double cpu_time,
-                     bool &success, double *host_q) {
-  LJCMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
-                firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
-                host_q);
+void ljc_gpu_compute(const int ago, const int inum_full, const int nall,
+                     double **host_x, int *host_type, int *ilist, int *numj,
+                     int **firstneigh, const bool eflag, const bool vflag,
+                     const bool eatom, const bool vatom, int &host_start,
+                     const double cpu_time, bool &success, double *host_q,
+                     const int nlocal, double *boxlo, double *prd) {
  LJCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
+                vflag,eatom,vatom,host_start,cpu_time,success,host_q,
+                nlocal,boxlo,prd);
 }
 
 double ljc_gpu_bytes() {
@@ -18,8 +18,6 @@
 #ifndef LJC_GPU_KERNEL
 #define LJC_GPU_KERNEL
 
-#define MAX_SHARED_TYPES 8
-
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
@@ -46,7 +44,7 @@
 
 #ifdef NV_KERNEL
 
-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"
 texture<float4> pos_tex;
 texture<float> q_tex;
 
@@ -82,6 +80,8 @@ __inline float fetch_q(const int& i, const float *q)
 
 #define fetch_pos(i,y) x_[i]
 #define fetch_q(i,y) q_[i]
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8
 
 #endif
 
@@ -92,13 +92,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types,
                           __global numtyp *sp_lj_in, __global int *dev_nbor,
-                          __global acctyp4 *ans, __global acctyp *engv,
-                          const int eflag, const int vflag, const int inum,
-                          const int nall, const int nbor_pitch,
-                          __global numtyp *q_ , __global numtyp *cutsq,
-                          const numtyp qqrd2e) {
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag,
+                          const int vflag, const int inum, const int nall,
+                          const int nbor_pitch, __global numtyp *q_ ,
+                          __global numtyp *cutsq, const numtyp qqrd2e,
+                          const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
 
   __local numtyp sp_lj[8];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
@@ -109,7 +113,6 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  if (ii<inum) {
     acctyp energy=(acctyp)0;
     acctyp e_coul=(acctyp)0;
     acctyp4 f;
@@ -120,18 +123,31 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     for (int i=0; i<6; i++)
       virial[i]=(acctyp)0;
 
+  if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     numtyp qtmp=fetch_q(i,q_);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+    for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
 
       numtyp factor_lj, factor_coul;
@@ -188,8 +204,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       }
 
     } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
 
   // Store answers
+  if (ii<inum && offset==0) {
     __global acctyp *ap1=engv+ii;
     if (eflag>0) {
       *ap1=energy;
@@ -209,30 +266,30 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
 
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in,
-                               __global numtyp* sp_lj_in, __global int *dev_nbor,
+                               __global numtyp* sp_lj_in,
+                               __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv,
                                const int eflag, const int vflag, const int inum,
                                const int nall, const int nbor_pitch,
                                __global numtyp *q_ , __global numtyp *_cutsq,
-                               const numtyp qqrd2e) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                               const numtyp qqrd2e, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
 
   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[8];
-  if (ii<8)
-    sp_lj[ii]=sp_lj_in[ii];
-  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    lj1[ii]=lj1_in[ii];
-    cutsq[ii]=_cutsq[ii];
+  if (tid<8)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
+    cutsq[tid]=_cutsq[tid];
     if (eflag>0)
-      lj3[ii]=lj3_in[ii];
+      lj3[tid]=lj3_in[tid];
   }
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
-  __syncthreads();
 
-  if (ii<inum) {
 
     acctyp energy=(acctyp)0;
     acctyp e_coul=(acctyp)0;
@@ -244,19 +301,34 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     for (int i=0; i<6; i++)
       virial[i]=(acctyp)0;
 
+  __syncthreads();
+
+  if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     numtyp qtmp=fetch_q(i,q_);
     int iw=ix.w;
     int itype=mul24((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;
 
       numtyp factor_lj, factor_coul;
@@ -312,8 +384,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       }
 
     } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
 
   // Store answers
+  if (ii<inum && offset==0) {
     __global acctyp *ap1=engv+ii;
     if (eflag>0) {
       *ap1=energy;
@@ -43,7 +43,7 @@ int LJC_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool LJC_GPU_MemoryT::init(const int ntypes,
+int LJC_GPU_MemoryT::init(const int ntypes,
                            double **host_cutsq, double **host_lj1,
                            double **host_lj2, double **host_lj3,
                            double **host_lj4, double **host_offset,
@@ -53,14 +53,18 @@ bool LJC_GPU_MemoryT::init(const int ntypes,
                            const double gpu_split, FILE *_screen,
                            double **host_cut_ljsq, double **host_cut_coulsq,
                            double *host_special_coul, const double qqrd2e) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,ljc_cut_gpu_kernel);
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,ljc_cut_gpu_kernel);
+  if (success!=0)
+    return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
-    lj_types=MAX_SHARED_TYPES;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
    shared_types=true;
   }
   _lj_types=lj_types;
@@ -95,7 +99,7 @@ bool LJC_GPU_MemoryT::init(const int ntypes,
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
                    sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
@@ -134,9 +138,10 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
 
-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
 
-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -145,19 +150,20 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch,
                           &this->atom->dev_q.begin(), &cutsq.begin(),
-                          &_qqrd2e);
+                          &_qqrd2e, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
-                     &cutsq.begin(), &_qqrd2e);
+                     &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
@@ -29,8 +29,15 @@ class LJC_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq, double **host_lj1,
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, double **host_lj1,
             double **host_lj2, double **host_lj3, double **host_lj4,
             double **host_offset, double *host_special_lj,
             const int nlocal, const int nall, const int max_nbors,
@@ -28,7 +28,7 @@ static LJCL_GPU_Memory<PRECISION,ACC_PRECISION> LJCLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                    double **host_lj2, double **host_lj3, double **host_lj4,
                    double **offset, double *special_lj, const int inum,
                    const int nall, const int max_nbors, const int maxspecial,
@@ -56,15 +56,12 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                             host_lj4, offset, special_lj, inum, nall, 300,
-                             maxspecial, cell_size, gpu_split, screen,
-                             host_cut_ljsq, host_cut_coulsq, host_special_coul,
-                             qqrd2e,g_ewald);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                        offset, special_lj, inum, nall, 300, maxspecial,
+                        cell_size, gpu_split, screen, host_cut_ljsq,
+                        host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
 
   LJCLMF.device->world_barrier();
   if (message)
@@ -79,48 +76,51 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
               last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                               host_lj4, offset, special_lj, inum, nall, 300,
-                               maxspecial, cell_size, gpu_split,
-                               screen, host_cut_ljsq, host_cut_coulsq,
-                               host_special_coul, qqrd2e, g_ewald);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                          offset, special_lj, inum, nall, 300, maxspecial,
+                          cell_size, gpu_split, screen, host_cut_ljsq,
+                          host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
    LJCLMF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+
+  if (init_ok==0)
+    LJCLMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void ljcl_gpu_clear() {
   LJCLMF.clear();
 }
 
-int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum_full,
-                         const int nall, double **host_x, int *host_type,
-                         double *boxlo, double *boxhi, int *tag, int **nspecial,
-                         int **special, const bool eflag, const bool vflag,
-                         const bool eatom, const bool vatom, int &host_start,
-                         const double cpu_time, bool &success, double *host_q) {
-  return LJCLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
-                        boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                        vatom, host_start, cpu_time, success, host_q);
+int** ljcl_gpu_compute_n(const int ago, const int inum_full,
+                         const int nall, double **host_x, int *host_type,
+                         double *sublo, double *subhi, int *tag, int **nspecial,
+                         int **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         int **ilist, int **jnum, const double cpu_time,
+                         bool &success, double *host_q, double *boxlo,
+                         double *prd) {
+  return LJCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                        subhi, tag, nspecial, special, eflag, vflag, eatom,
+                        vatom, host_start, ilist, jnum, cpu_time, success,
+                        host_q, boxlo, prd);
 }
 
-void ljcl_gpu_compute(const int timestep, const int ago, const int inum_full,
-                      const int nall, double **host_x, int *host_type,
-                      int *ilist, int *numj, int **firstneigh,
-                      const bool eflag, const bool vflag, const bool eatom,
-                      const bool vatom, int &host_start, const double cpu_time,
-                      bool &success, double *host_q) {
-  LJCLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
-                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
-                 host_q);
+void ljcl_gpu_compute(const int ago, const int inum_full, const int nall,
+                      double **host_x, int *host_type, int *ilist, int *numj,
+                      int **firstneigh, const bool eflag, const bool vflag,
+                      const bool eatom, const bool vatom, int &host_start,
+                      const double cpu_time, bool &success, double *host_q,
+                      const int nlocal, double *boxlo, double *prd) {
  LJCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
                 host_q,nlocal,boxlo,prd);
 }
 
 double ljcl_gpu_bytes() {
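Editor's note: the lj/cut/coul/long driver forwards g_ewald because the kernels below evaluate the damped real-space Coulomb term used with Ewald/PPPM splitting. For reference, a scalar sketch of that energy term (the standard real-space expression, not copied from the kernel; the helper name is hypothetical):

  #include <math.h>

  /* Real-space Coulomb energy with Ewald/PPPM splitting, for r < cut_coul:
     E = qqrd2e * qi * qj * erfc(g_ewald * r) / r                         */
  double coul_long_energy(double qqrd2e, double qi, double qj,
                          double r, double g_ewald) {
    return qqrd2e * qi * qj * erfc(g_ewald * r) / r;
  }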
@@ -18,8 +18,6 @@
 #ifndef LJCL_GPU_KERNEL
 #define LJCL_GPU_KERNEL
 
-#define MAX_SHARED_TYPES 8
-
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
@@ -54,7 +52,7 @@
 
 #ifdef NV_KERNEL
 
-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"
 texture<float4> pos_tex;
 texture<float> q_tex;
 
@@ -90,6 +88,8 @@ __inline float fetch_q(const int& i, const float *q)
 
 #define fetch_pos(i,y) x_[i]
 #define fetch_q(i,y) q_[i]
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8
 
 #endif
 
@@ -100,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types,
                           __global numtyp *sp_lj_in, __global int *dev_nbor,
-                          __global acctyp4 *ans, __global acctyp *engv,
-                          const int eflag, const int vflag, const int inum,
-                          const int nall, const int nbor_pitch,
-                          __global numtyp *q_ , const numtyp cut_coulsq,
-                          const numtyp qqrd2e, const numtyp g_ewald) {
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag,
+                          const int vflag, const int inum, const int nall,
+                          const int nbor_pitch, __global numtyp *q_,
+                          const numtyp cut_coulsq, const numtyp qqrd2e,
+                          const numtyp g_ewald, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
 
   __local numtyp sp_lj[8];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
@@ -117,7 +121,6 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  if (ii<inum) {
     acctyp energy=(acctyp)0;
     acctyp e_coul=(acctyp)0;
     acctyp4 f;
@@ -128,18 +131,31 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     for (int i=0; i<6; i++)
       virial[i]=(acctyp)0;
 
+  if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     numtyp qtmp=fetch_q(i,q_);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+    for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
 
       numtyp factor_lj, factor_coul;
@@ -204,8 +220,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       }
 
     } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
 
   // Store answers
+  if (ii<inum && offset==0) {
     __global acctyp *ap1=engv+ii;
     if (eflag>0) {
       *ap1=energy;
@@ -225,28 +282,29 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
 
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in,
-                               __global numtyp* sp_lj_in, __global int *dev_nbor,
+                               __global numtyp* sp_lj_in,
+                               __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv,
                                const int eflag, const int vflag, const int inum,
                                const int nall, const int nbor_pitch,
                                __global numtyp *q_ , const numtyp cut_coulsq,
-                               const numtyp qqrd2e, const numtyp g_ewald) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                               const numtyp qqrd2e, const numtyp g_ewald,
+                               const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
 
   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[8];
-  if (ii<8)
-    sp_lj[ii]=sp_lj_in[ii];
-  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    lj1[ii]=lj1_in[ii];
+  if (tid<8)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
     if (eflag>0)
-      lj3[ii]=lj3_in[ii];
+      lj3[tid]=lj3_in[tid];
   }
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
-  __syncthreads();
 
-  if (ii<inum) {
 
     acctyp energy=(acctyp)0;
     acctyp e_coul=(acctyp)0;
@@ -258,19 +316,34 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     for (int i=0; i<6; i++)
       virial[i]=(acctyp)0;
 
+  __syncthreads();
+
+  if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||||
numtyp qtmp=fetch_q(i,q_);
|
numtyp qtmp=fetch_q(i,q_);
|
||||||
int iw=ix.w;
|
int iw=ix.w;
|
||||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||||
|
|
||||||
for ( ; nbor<list_end; nbor+=nbor_pitch) {
|
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||||
int j=*nbor;
|
int j=*nbor;
|
||||||
|
|
||||||
numtyp factor_lj, factor_coul;
|
numtyp factor_lj, factor_coul;
|
||||||
@ -334,8 +407,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
|||||||
}
|
}
|
||||||
|
|
||||||
} // for nbor
|
} // for nbor
|
||||||
|
} // if ii
|
||||||
|
|
||||||
|
// Reduce answers
|
||||||
|
if (t_per_atom>1) {
|
||||||
|
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||||
|
|
||||||
|
red_acc[0][tid]=f.x;
|
||||||
|
red_acc[1][tid]=f.y;
|
||||||
|
red_acc[2][tid]=f.z;
|
||||||
|
red_acc[3][tid]=energy;
|
||||||
|
red_acc[4][tid]=e_coul;
|
||||||
|
|
||||||
|
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||||
|
if (offset < s) {
|
||||||
|
for (int r=0; r<5; r++)
|
||||||
|
red_acc[r][tid] += red_acc[r][tid+s];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
f.x=red_acc[0][tid];
|
||||||
|
f.y=red_acc[1][tid];
|
||||||
|
f.z=red_acc[2][tid];
|
||||||
|
energy=red_acc[3][tid];
|
||||||
|
e_coul=red_acc[4][tid];
|
||||||
|
|
||||||
|
if (vflag>0) {
|
||||||
|
for (int r=0; r<6; r++)
|
||||||
|
red_acc[r][tid]=virial[r];
|
||||||
|
|
||||||
|
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||||
|
if (offset < s) {
|
||||||
|
for (int r=0; r<6; r++)
|
||||||
|
red_acc[r][tid] += red_acc[r][tid+s];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int r=0; r<6; r++)
|
||||||
|
virial[r]=red_acc[r][tid];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Store answers
|
// Store answers
|
||||||
|
if (ii<inum && offset==0) {
|
||||||
__global acctyp *ap1=engv+ii;
|
__global acctyp *ap1=engv+ii;
|
||||||
if (eflag>0) {
|
if (eflag>0) {
|
||||||
*ap1=energy;
|
*ap1=energy;
|
||||||
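
Note on the change above: both kernels now give every atom t_per_atom consecutive threads, each walking a strided slice of the neighbor list, and the partial results are merged in shared memory before one thread of the group stores them. A minimal stand-alone CUDA sketch of that reduction follows; reduce_sketch, f_out, and the fixed 256-slot array (standing in for BLOCK_PAIR) are hypothetical names for illustration, not taken from the diff.

__global__ void reduce_sketch(float *f_out, int inum, int t_per_atom) {
  int tid = threadIdx.x;
  // Group assignment mirrors the kernels above: t_per_atom consecutive
  // threads own atom ii; "offset" is a thread's rank within its group.
  int ii = blockIdx.x * (blockDim.x / t_per_atom) + tid / t_per_atom;
  int offset = tid % t_per_atom;

  // Each thread would accumulate a partial force over a strided slice of
  // atom ii's neighbor list; a dummy partial stands in here.
  float f = (ii < inum) ? 1.0f : 0.0f;

  __shared__ float red_acc[256];         // assumes blockDim.x <= 256
  red_acc[tid] = f;
  // Tree reduction within each group; after log2(t_per_atom) rounds the
  // group's first thread holds the full sum.
  for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1)
    if (offset < (int)s)
      red_acc[tid] += red_acc[tid + s];

  if (ii < inum && offset == 0)
    f_out[ii] = red_acc[tid];
}

As in the kernels above, no barrier is issued between reduction rounds; that is only safe in the warp-synchronous style of this hardware generation, assuming t_per_atom is a power of two no larger than the warp size so a group never spans warps. That caveat is an assumption of this sketch, not something the diff states.
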
@@ -43,7 +43,7 @@ int LJCL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }

 template <class numtyp, class acctyp>
-bool LJCL_GPU_MemoryT::init(const int ntypes,
+int LJCL_GPU_MemoryT::init(const int ntypes,
                             double **host_cutsq, double **host_lj1,
                             double **host_lj2, double **host_lj3,
                             double **host_lj4, double **host_offset,
@@ -54,14 +54,18 @@ bool LJCL_GPU_MemoryT::init(const int ntypes,
                             double **host_cut_ljsq, const double host_cut_coulsq,
                             double *host_special_coul, const double qqrd2e,
                             const double g_ewald) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,ljcl_cut_gpu_kernel);
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,ljcl_cut_gpu_kernel);
+  if (success!=0)
+    return success;

   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
-    lj_types=MAX_SHARED_TYPES;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
     shared_types=true;
   }
   _lj_types=lj_types;
@@ -94,7 +98,7 @@ bool LJCL_GPU_MemoryT::init(const int ntypes,

   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }

 template <class numtyp, class acctyp>
@@ -132,9 +136,10 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;

-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));

-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -143,19 +148,21 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch,
                           &this->atom->dev_q.begin(), &_cut_coulsq,
-                          &_qqrd2e, &_g_ewald);
+                          &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
-                     &_cut_coulsq, &_qqrd2e, &_g_ewald);
+                     &_cut_coulsq, &_qqrd2e, &_g_ewald,
+                     &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
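
The new GX expression follows from the kernel change: a block of BX threads now covers only BX/t_per_atom atoms, so the grid must grow by the same factor. A small helper restating the arithmetic (grid_size is a hypothetical name for illustration):

#include <cmath>

inline int grid_size(int inum, int block_size, int t_per_atom) {
  int atoms_per_block = block_size / t_per_atom;   // e.g. 128/4 = 32 atoms
  return static_cast<int>(std::ceil(static_cast<double>(inum) /
                                    atoms_per_block));
}
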
@@ -29,8 +29,15 @@ class LJCL_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq,
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq,
             double **host_lj1, double **host_lj2, double **host_lj3,
             double **host_lj4, double **host_offset, double *host_special_lj,
             const int nlocal, const int nall, const int max_nbors,
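
A hedged sketch of consuming the new integer status: only the code-to-meaning mapping comes from the doc comment above; describe_status() and the standalone main are illustrative assumptions, not part of the library.

#include <cstdio>

static const char *describe_status(int code) {  // hypothetical helper
  switch (code) {
    case  0: return "success";
    case -1: return "fix gpu not found";
    case -3: return "out of memory on device";
    case -4: return "GPU library not compiled for this accelerator";
    case -5: return "double precision not supported on card";
    default: return "unknown error";
  }
}

int main() {
  int status = -4;             // stand-in for the value returned by init()
  std::printf("init: %s\n", describe_status(status));
  return 0;
}
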
@@ -29,9 +29,8 @@ __win_sort _win_sort;
 #endif

 template <class numtyp, class acctyp>
-PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
-                              _vflag(false),_inum(0),_ilist(NULL),
-                              _newton(false) {
+PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),
+                              _max_gpu_bytes(0) {
 #ifndef USE_OPENCL
   sort_config.op = CUDPP_ADD;
   sort_config.datatype = CUDPP_UINT;
@@ -56,28 +55,20 @@ int PairGPUAtomT::bytes_per_atom() const {
   int id_space=0;
   if (_gpu_nbor)
     id_space=2;
-  int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space;
+  int bytes=4*sizeof(numtyp)+id_space;
   if (_rot)
-    bytes+=4*sizeof(numtyp)+4*sizeof(acctyp);
+    bytes+=4*sizeof(numtyp);
   if (_charge)
     bytes+=sizeof(numtyp);
   return bytes;
 }

 template <class numtyp, class acctyp>
-bool PairGPUAtomT::alloc(const int inum, const int nall) {
+bool PairGPUAtomT::alloc(const int nall) {
   _max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
-  if (_newton)
-    _max_local=_max_atoms;
-  else
-    _max_local=static_cast<int>(static_cast<double>(inum)*1.10);

   bool success=true;

-  int ans_elements=4;
-  if (_rot)
-    ans_elements+=4;
-
   // Ignore host/device transfers?
   bool cpuview=false;
   if (dev->device_type()==UCL_CPU)
@@ -107,8 +98,6 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
   success=success && (host_x.alloc(_max_atoms*4,*dev,
                                    UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
 #endif
-  success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
-  success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
   // Buffer for casting only if different precisions
   if (_charge)
     success=success && (host_q.alloc(_max_atoms,*dev,
@@ -120,15 +109,13 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {

   // --------------------------- Device allocations
-  _gpu_bytes=0;
+  int gpu_bytes=0;
   if (cpuview) {
 #ifdef GPU_CAST
     assert(0==1);
 #else
     dev_x.view(host_x);
 #endif
-    dev_engv.view(host_engv);
-    dev_ans.view(host_ans);
     if (_rot)
       dev_quat.view(host_quat);
     if (_charge)
@@ -140,49 +127,80 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
                         dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
     success=success && (UCL_SUCCESS==
                         dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
-    _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
+    gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
 #else
     success=success && (UCL_SUCCESS==
                         dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
 #endif
-    success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
-                                       UCL_WRITE_ONLY)==UCL_SUCCESS);
-    success=success && (dev_ans.alloc(ans_elements*_max_local,
-                                      *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
     if (_charge) {
       success=success && (dev_q.alloc(_max_atoms,*dev,
                                       UCL_READ_ONLY)==UCL_SUCCESS);
-      _gpu_bytes+=dev_q.row_bytes();
+      gpu_bytes+=dev_q.row_bytes();
     }
     if (_rot) {
       success=success && (dev_quat.alloc(_max_atoms*4,*dev,
                                          UCL_READ_ONLY)==UCL_SUCCESS);
-      _gpu_bytes+=dev_quat.row_bytes();
+      gpu_bytes+=dev_quat.row_bytes();
     }
   }
   if (_gpu_nbor) {
     success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
     success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
-    _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
+    gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
     if (_bonds) {
       success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
-      _gpu_bytes+=dev_tag.row_bytes();
+      gpu_bytes+=dev_tag.row_bytes();
     }
   }

-  _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();
+  gpu_bytes+=dev_x.row_bytes();
+  if (gpu_bytes>_max_gpu_bytes)
+    _max_gpu_bytes=gpu_bytes;

   _allocated=true;
   return success;
 }

 template <class numtyp, class acctyp>
-bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
-                        const bool rot, UCL_Device &devi, const bool gpu_nbor,
+bool PairGPUAtomT::add_fields(const bool charge, const bool rot,
+                              const bool gpu_nbor, const bool bonds) {
+  bool realloc=false;
+  if (charge && _charge==false) {
+    _charge=true;
+    realloc=true;
+  }
+  if (rot && _rot==false) {
+    _rot=true;
+    realloc=true;
+  }
+  if (gpu_nbor && _gpu_nbor==false) {
+    _gpu_nbor=true;
+    realloc=true;
+  }
+  if (bonds && _bonds==false) {
+    _bonds=true;
+    realloc=true;
+  }
+  if (realloc) {
+    _other=_charge || _rot;
+    int max_atoms=_max_atoms;
+    clear_resize();
+    return alloc(max_atoms);
+  }
+  return true;
+}
+
+template <class numtyp, class acctyp>
+bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
+                        UCL_Device &devi, const bool gpu_nbor,
                         const bool bonds) {
   clear();

   bool success=true;
+  _x_avail=false;
+  _q_avail=false;
+  _quat_avail=false;
+  _resized=false;
   _gpu_nbor=gpu_nbor;
   _bonds=bonds;
   _charge=charge;
@@ -190,33 +208,25 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
   _other=_charge || _rot;
   dev=&devi;

-  _e_fields=1;
-  if (_charge)
-    _e_fields++;
-  _ev_fields=6+_e_fields;
-
   // Initialize atom and nbor data
-  int ef_inum=inum;
-  if (ef_inum==0)
-    ef_inum=1000;
   int ef_nall=nall;
-  if (ef_nall<=ef_inum)
-    ef_nall=ef_inum*2;
+  if (ef_nall==0)
+    ef_nall=2000;

   // Initialize timers for the selected device
   time_pos.init(*dev);
-  time_other.init(*dev);
-  time_answer.init(*dev);
+  time_q.init(*dev);
+  time_quat.init(*dev);
   time_pos.zero();
-  time_other.zero();
-  time_answer.zero();
+  time_q.zero();
+  time_quat.zero();
   _time_cast=0.0;

 #ifdef GPU_CAST
   compile_kernels(*dev);
 #endif

-  return success && alloc(ef_inum,ef_nall);
+  return success && alloc(ef_nall);
 }

 template <class numtyp, class acctyp>
@@ -234,16 +244,12 @@ void PairGPUAtomT::clear_resize() {
     dev_quat.clear();
     host_quat.clear();
   }
-  dev_ans.clear();
-  dev_engv.clear();
 #ifndef GPU_CAST
   host_x.clear();
 #else
   host_x_cast.clear();
   host_type_cast.clear();
 #endif
-  host_ans.clear();
-  host_engv.clear();
   dev_cell_id.clear();
   dev_particle_id.clear();
   dev_tag.clear();
@@ -261,17 +267,14 @@ void PairGPUAtomT::clear_resize() {

 template <class numtyp, class acctyp>
 void PairGPUAtomT::clear() {
-  _gpu_bytes=0;
+  _max_gpu_bytes=0;
   if (!_allocated)
     return;

   time_pos.clear();
-  time_other.clear();
-  time_answer.clear();
+  time_q.clear();
+  time_quat.clear();
   clear_resize();
-  _inum=0;
-  _eflag=false;
-  _vflag=false;

 #ifdef GPU_CAST
   if (_compiled) {
@@ -289,255 +292,10 @@ double PairGPUAtomT::host_memory_usage() const {
     atom_bytes+=1;
   if (_rot)
     atom_bytes+=4;
-  int ans_bytes=atom_bytes+_ev_fields;
   return _max_atoms*atom_bytes*sizeof(numtyp)+
-         ans_bytes*(_max_local)*sizeof(acctyp)+
          sizeof(PairGPUAtom<numtyp,acctyp>);
 }

-template <class numtyp, class acctyp>
-void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
-                                const bool ef_atom, const bool vf_atom) {
-  time_answer.start();
-  _eflag=eflag;
-  _vflag=vflag;
-  _ef_atom=ef_atom;
-  _vf_atom=vf_atom;
-
-  int csize=_ev_fields;
-  if (!eflag)
-    csize-=_e_fields;
-  if (!vflag)
-    csize-=6;
-
-  if (csize>0)
-    ucl_copy(host_engv,dev_engv,_inum*csize,true);
-  if (_rot)
-    ucl_copy(host_ans,dev_ans,_inum*4*2,true);
-  else
-    ucl_copy(host_ans,dev_ans,_inum*4,true);
-  time_answer.stop();
-}
-
-template <class numtyp, class acctyp>
-void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
-                                const bool ef_atom, const bool vf_atom,
-                                int *ilist) {
-  _ilist=ilist;
-  copy_answers(eflag,vflag,ef_atom,vf_atom);
-}
-
-template <class numtyp, class acctyp>
-double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
-                                   double *virial) {
-  if (_eflag==false && _vflag==false)
-    return 0.0;
-
-  double evdwl=0.0;
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[i][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  } else {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      int ii=_ilist[i];
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[ii][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  }
-
-  evdwl*=0.5;
-  return evdwl;
-}
-
-template <class numtyp, class acctyp>
-double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
-                                   double *virial, double &ecoul) {
-  if (_eflag==false && _vflag==false) {
-    ecoul=0.0;
-    return 0.0;
-  }
-
-  if (_charge==false)
-    return energy_virial(eatom,vatom,virial);
-
-  double evdwl=0.0;
-  double _ecoul=0.0;
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-          _ecoul+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-          _ecoul+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[i][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  } else {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      int ii=_ilist[i];
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-          _ecoul+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-          _ecoul+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[ii][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  }
-
-  evdwl*=0.5;
-  ecoul+=_ecoul*0.5;
-  return evdwl;
-}
-
-template <class numtyp, class acctyp>
-void PairGPUAtomT::get_answers(double **f, double **tor) {
-  acctyp *ap=host_ans.begin();
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      f[i][0]+=*ap;
-      ap++;
-      f[i][1]+=*ap;
-      ap++;
-      f[i][2]+=*ap;
-      ap+=2;
-    }
-    if (_rot) {
-      for (int i=0; i<_inum; i++) {
-        tor[i][0]+=*ap;
-        ap++;
-        tor[i][1]+=*ap;
-        ap++;
-        tor[i][2]+=*ap;
-        ap+=2;
-      }
-    }
-  } else {
-    for (int i=0; i<_inum; i++) {
-      int ii=_ilist[i];
-      f[ii][0]+=*ap;
-      ap++;
-      f[ii][1]+=*ap;
-      ap++;
-      f[ii][2]+=*ap;
-      ap+=2;
-    }
-    if (_rot) {
-      for (int i=0; i<_inum; i++) {
-        int ii=_ilist[i];
-        tor[ii][0]+=*ap;
-        ap++;
-        tor[ii][1]+=*ap;
-        ap++;
-        tor[ii][2]+=*ap;
-        ap+=2;
-      }
-    }
-  }
-}
-
 // Sort arrays for neighbor list calculation
 template <class numtyp, class acctyp>
 void PairGPUAtomT::sort_neighbor(const int num_atoms) {
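
The new add_fields() path above lets a second style reuse atom storage that another style already initialized, reallocating only when a per-atom field actually switches on. A self-contained miniature of the idea (AtomStoreSketch is a hypothetical stand-in, not the real PairGPUAtom):

#include <iostream>

struct AtomStoreSketch {
  bool charge = false, quat = false;
  int max_atoms = 0;
  bool alloc(int nall) { max_atoms = nall; return true; }  // pretend realloc
  bool add_fields(bool need_charge, bool need_quat) {
    bool realloc = false;
    if (need_charge && !charge) { charge = true; realloc = true; }
    if (need_quat && !quat)     { quat   = true; realloc = true; }
    // Keep the existing capacity; only the set of per-atom fields grows.
    return realloc ? alloc(max_atoms) : true;
  }
};

int main() {
  AtomStoreSketch a;
  a.alloc(1000);
  a.add_fields(true, false);   // e.g. a coul/long style joins an lj/cut run
  std::cout << a.charge << " " << a.max_atoms << "\n";
  return 0;
}
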
@@ -23,7 +23,6 @@

 #ifdef USE_OPENCL

-#include "geryon/ocl_device.h"
 #include "geryon/ocl_timer.h"
 #include "geryon/ocl_mat.h"
 #include "geryon/ocl_kernel.h"
@@ -32,7 +31,6 @@ using namespace ucl_opencl;
 #else

 #include "cudpp.h"
-#include "geryon/nvd_device.h"
 #include "geryon/nvd_timer.h"
 #include "geryon/nvd_mat.h"
 #include "geryon/nvd_kernel.h"
@@ -40,10 +38,6 @@ using namespace ucl_cudadr;

 #endif

-#ifndef int2
-struct int2 { int x; int y; };
-#endif
-
 #include "pair_gpu_precision.h"

 template <class numtyp, class acctyp>
@@ -56,13 +50,9 @@ class PairGPUAtom {
   inline int max_atoms() const { return _max_atoms; }
   /// Current number of local+ghost atoms stored
   inline int nall() const { return _nall; }
-  /// Current number of local atoms stored
-  inline int inum() const { return _inum; }

   /// Set number of local+ghost atoms for future copy operations
   inline void nall(const int n) { _nall=n; }
-  /// Set number of local atoms for future copy operations
-  inline void inum(const int n) { _inum=n; }

   /// Memory usage per atom in this class
   int bytes_per_atom() const;
@@ -70,21 +60,33 @@ class PairGPUAtom {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param rot True if atom storage needs quaternions
     * \param gpu_nbor True if neighboring will be performed on device **/
-  bool init(const int inum, const int nall, const bool charge, const bool rot,
+  bool init(const int nall, const bool charge, const bool rot,
             UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);

   /// Check if we have enough device storage and realloc if not
-  inline bool resize(const int inum, const int nall, bool &success) {
-    _inum=inum;
+  /** Returns true if resized with any call during this timestep **/
+  inline bool resize(const int nall, bool &success) {
     _nall=nall;
-    if (inum>_max_local || nall>_max_atoms) {
+    if (nall>_max_atoms) {
       clear_resize();
-      success = success && alloc(inum,nall);
-      return true;
+      success = success && alloc(nall);
+      _resized=true;
     }
-    return false;
+    return _resized;
   }

+  /// If already initialized by another LAMMPS style, add fields as necessary
+  /** \param rot True if atom storage needs quaternions
+    * \param gpu_nbor True if neighboring will be performed on device **/
+  bool add_fields(const bool charge, const bool rot, const bool gpu_nbor,
+                  const bool bonds);
+
+  /// Returns true if GPU is using charges
+  bool charge() { return _charge; }
+
+  /// Returns true if GPU is using quaternions
+  bool quat() { return _rot; }
+
   /// Only free matrices of length inum or nall for resizing
   void clear_resize();

@@ -100,28 +102,42 @@ class PairGPUAtom {
   /// Add copy times to timers
   inline void acc_timers() {
     time_pos.add_to_total();
-    time_answer.add_to_total();
-    if (_other)
-      time_other.add_to_total();
+    if (_charge)
+      time_q.add_to_total();
+    if (_rot)
+      time_quat.add_to_total();
   }

   /// Add copy times to timers
   inline void zero_timers() {
     time_pos.zero();
-    time_answer.zero();
-    if (_other)
-      time_other.zero();
+    if (_charge)
+      time_q.zero();
+    if (_rot)
+      time_quat.zero();
   }

   /// Return the total time for host/device data transfer
+  /** Zeros the total so that the atom times are only included once **/
   inline double transfer_time() {
-    double total=time_pos.total_seconds()+time_answer.total_seconds();
-    if (_other) total+=time_other.total_seconds();
+    double total=time_pos.total_seconds();
+    time_pos.zero_total();
+    if (_charge) {
+      total+=time_q.total_seconds();
+      time_q.zero_total();
+    }
+    if (_rot) {
+      total+=time_quat.total_seconds();
+      time_quat.zero_total();
+    }

     return total;
   }

   /// Return the total time for data cast/pack
-  inline double cast_time() { return _time_cast; }
+  /** Zeros the time so that atom times are only included once **/
+  inline double cast_time()
+    { double t=_time_cast; _time_cast=0.0; return t; }

   /// Pack LAMMPS atom type constants into matrix and copy to device
   template <class dev_typ, class t1>
@@ -216,8 +232,13 @@ class PairGPUAtom {

   // -------------------------COPY TO GPU ----------------------------------

+  /// Signal that we need to transfer atom data for next timestep
+  inline void data_unavail()
+    { _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }
+
   /// Cast positions and types to write buffer
   inline void cast_x_data(double **host_ptr, const int *host_type) {
+    if (_x_avail==false) {
       double t=MPI_Wtime();
 #ifdef GPU_CAST
       memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
@@ -237,11 +258,13 @@ class PairGPUAtom {
 #endif
       _time_cast+=MPI_Wtime()-t;
     }
+  }

   /// Copy positions and types to device asynchronously
   /** Copies nall() elements **/
   inline void add_x_data(double **host_ptr, int *host_type) {
     time_pos.start();
+    if (_x_avail==false) {
 #ifdef GPU_CAST
       ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
       ucl_copy(dev_type_cast,host_type_cast,_nall,true);
@@ -253,6 +276,8 @@ class PairGPUAtom {
 #else
       ucl_copy(dev_x,host_x,_nall*4,true);
 #endif
+      _x_avail=true;
+    }
     time_pos.stop();
   }

@@ -262,9 +287,10 @@ class PairGPUAtom {
     add_x_data(host_ptr,host_type);
   }

-  /// Cast charges to write buffer
+  // Cast charges to write buffer
   template<class cpytyp>
   inline void cast_q_data(cpytyp *host_ptr) {
+    if (_q_avail==false) {
       double t=MPI_Wtime();
       if (dev->device_type()==UCL_CPU) {
         if (sizeof(numtyp)==sizeof(double)) {
@@ -280,15 +306,20 @@ class PairGPUAtom {
       }
       _time_cast+=MPI_Wtime()-t;
     }
-
-  /// Copy charges to device asynchronously
-  inline void add_q_data() {
-    ucl_copy(dev_q,host_q,_nall,true);
   }

-  /// Cast quaternions to write buffer
+  // Copy charges to device asynchronously
+  inline void add_q_data() {
+    if (_q_avail==false) {
+      ucl_copy(dev_q,host_q,_nall,true);
+      _q_avail=true;
+    }
+  }
+
+  // Cast quaternions to write buffer
   template<class cpytyp>
   inline void cast_quat_data(cpytyp *host_ptr) {
+    if (_quat_avail==false) {
       double t=MPI_Wtime();
       if (dev->device_type()==UCL_CPU) {
         if (sizeof(numtyp)==sizeof(double)) {
@@ -304,45 +335,20 @@ class PairGPUAtom {
       }
       _time_cast+=MPI_Wtime()-t;
     }
-
-  /// Copy quaternions to device
-  /** Copies nall()*4 elements **/
-  inline void add_quat_data() {
-    ucl_copy(dev_quat,host_quat,_nall*4,true);
   }

-  /// Copy data other than pos and data to device
-  inline void add_other_data() {
-    time_other.start();
-    if (_charge)
-      add_q_data();
-    if (_rot)
-      add_quat_data();
-    time_other.stop();
+  // Copy quaternions to device
+  /** Copies nall()*4 elements **/
+  inline void add_quat_data() {
+    if (_quat_avail==false) {
+      ucl_copy(dev_quat,host_quat,_nall*4,true);
+      _quat_avail=true;
+    }
   }

   /// Return number of bytes used on device
-  inline double gpu_bytes() { return _gpu_bytes; }
-  // -------------------------COPY FROM GPU -------------------------------
-
-  /// Copy answers from device into read buffer asynchronously
-  void copy_answers(const bool eflag, const bool vflag,
-                    const bool ef_atom, const bool vf_atom);
-
-  /// Copy answers from device into read buffer asynchronously
-  void copy_answers(const bool eflag, const bool vflag,
-                    const bool ef_atom, const bool vf_atom, int *ilist);
-
-  /// Copy energy and virial data into LAMMPS memory
-  double energy_virial(double *eatom, double **vatom, double *virial);
-
-  /// Copy energy and virial data into LAMMPS memory
-  double energy_virial(double *eatom, double **vatom, double *virial,
-                       double &ecoul);
-
-  /// Add forces and torques from the GPU into a LAMMPS pointer
-  void get_answers(double **f, double **tor);
+  inline double max_gpu_bytes()
+    { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }

   // ------------------------------ DATA ----------------------------------

@@ -352,10 +358,6 @@ class PairGPUAtom {
   UCL_D_Vec<numtyp> dev_q;
   /// Quaternions
   UCL_D_Vec<numtyp> dev_quat;
-  /// Force and possibly torque
-  UCL_D_Vec<acctyp> dev_ans;
-  /// Energy and virial per-atom storage
-  UCL_D_Vec<acctyp> dev_engv;

 #ifdef GPU_CAST
   UCL_D_Vec<double> dev_x_cast;
@@ -370,10 +372,6 @@ class PairGPUAtom {
   UCL_H_Vec<numtyp> host_q;
   /// Buffer for moving quat data to GPU
   UCL_H_Vec<numtyp> host_quat;
-  /// Force and possibly torque data on host
-  UCL_H_Vec<acctyp> host_ans;
-  /// Energy/virial data on host
-  UCL_H_Vec<acctyp> host_engv;

   /// Cell list identifiers for device nbor builds
   UCL_D_Vec<unsigned> dev_cell_id;
@@ -383,7 +381,7 @@ class PairGPUAtom {
   UCL_D_Vec<int> dev_tag;

   /// Device timers
-  UCL_Timer time_pos, time_other, time_answer;
+  UCL_Timer time_pos, time_q, time_quat;

   /// Geryon device
   UCL_Device *dev;
@@ -397,17 +395,17 @@ class PairGPUAtom {

   bool _compiled;

-  bool alloc(const int inum, const int nall);
-
-  bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
-  int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields;
+  // True if data has been copied to device already
+  bool _x_avail, _q_avail, _quat_avail, _resized;
+
+  bool alloc(const int nall);
+
+  bool _allocated, _rot, _charge, _other;
+  int _max_atoms, _nall;
   bool _gpu_nbor, _bonds;
-  int *_ilist;
   double _time_cast;

-  double _gpu_bytes;
-
-  bool _newton;
+  double _max_gpu_bytes;

 #ifndef USE_OPENCL
   CUDPPConfiguration sort_config;
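
The *_avail flags threaded through the header above form a copy-once-per-timestep guard: any number of styles may request the position, charge, or quaternion transfer, but only the first request after data_unavail() pays for the cast and the host-to-device copy. A compilable miniature (XBufferSketch is a hypothetical illustration, not the real class):

#include <iostream>

struct XBufferSketch {
  bool x_avail = false;
  int copies = 0;
  void data_unavail() { x_avail = false; }  // atoms moved; next copy is real
  void add_x_data() {
    if (!x_avail) {      // only the first caller per step pays the transfer
      ++copies;          // the real code would cast and ucl_copy here
      x_avail = true;
    }
  }
};

int main() {
  XBufferSketch x;
  x.add_x_data(); x.add_x_data();   // second call is free
  x.data_unavail();                 // new timestep
  x.add_x_data();
  std::cout << x.copies << "\n";    // prints 2
  return 0;
}
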
@@ -23,7 +23,7 @@

 #define _HD_BALANCE_EVERY 25
 #define _HD_BALANCE_WEIGHT 0.5
-#define _HD_BALANCE_GAP 1.05
+#define _HD_BALANCE_GAP 1.10

 /// Host/device load balancer
 template<class numtyp, class acctyp>
@@ -33,7 +33,8 @@ class PairGPUBalance {
   inline ~PairGPUBalance() { clear(); }

   /// Clear any old data and setup for new LAMMPS run
-  inline void init(PairGPUDevice<numtyp, acctyp> *gpu, const double split);
+  inline void init(PairGPUDevice<numtyp, acctyp> *gpu, const bool gpu_nbor,
+                   const double split);

   /// Clear all host and device data
   inline void clear() {
@@ -44,22 +45,24 @@ class PairGPUBalance {
     }
   }

+  /// Return the timestep since initialization
+  inline int timestep() { return _timestep; }
+
   /// Get a count of the number of particles host will handle for initial alloc
-  inline int first_host_count(const int nlocal,const bool gpu_nbor,
-                              const double gpu_split) const {
+  inline int first_host_count(const int nlocal, const double gpu_split,
+                              const bool gpu_nbor) const {
     int host_nlocal=0;
     if (gpu_nbor && gpu_split!=1.0) {
       if (gpu_split>0)
         host_nlocal=static_cast<int>(ceil((1.0-gpu_split)*nlocal));
       else
-        host_nlocal=static_cast<int>(ceil(0.1*nlocal));
+        host_nlocal=static_cast<int>(ceil(0.05*nlocal));
     }
     return host_nlocal;
   }

   /// Return the number of particles the device will handle this timestep
-  inline int get_gpu_count(const int timestep, const int ago,
-                           const int inum_full);
+  inline int get_gpu_count(const int ago, const int inum_full);

   /// Return the average fraction of particles handled by device on all procs
   inline double all_avg_split() {
@@ -82,10 +85,10 @@ class PairGPUBalance {
     if (_measure_this_step) {
       _device->gpu->sync();
       _device->gpu_barrier();
+      _device->start_host_timer();
       _device_time.start();
       _device->gpu->sync();
       _device->gpu_barrier();
-      _device->start_host_timer();
     }
   }

@@ -95,34 +98,34 @@ class PairGPUBalance {
   /// Calculate the new host/device split based on the cpu and device times
   /** \note Only does calculation every _HD_BALANCE_EVERY timesteps
       (and first 10) **/
-  inline void balance(const double cpu_time, const bool gpu_nbor);
+  inline void balance(const double cpu_time);

   /// Calls balance() and then get_gpu_count()
-  inline int balance(const int timestep, const int ago, const int inum_full,
-                     const double cpu_time, const bool gpu_nbor) {
-    balance(cpu_time,gpu_nbor);
-    return get_gpu_count(timestep,ago,inum_full);
+  inline int balance(const int ago,const int inum_full,const double cpu_time) {
+    balance(cpu_time);
+    return get_gpu_count(ago,inum_full);
   }

 private:
   PairGPUDevice<numtyp,acctyp> *_device;
   UCL_Timer _device_time;
-  bool _init_done;
+  bool _init_done, _gpu_nbor;

   bool _load_balance;
   double _actual_split, _avg_split, _desired_split, _max_split;
   int _avg_count;

   bool _measure_this_step;
-  int _inum, _inum_full;
+  int _inum, _inum_full, _timestep;
 };

 #define PairGPUBalanceT PairGPUBalance<numtyp,acctyp>

 template <class numtyp, class acctyp>
 void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
-                           const double split) {
+                           const bool gpu_nbor, const double split) {
   clear();
+  _gpu_nbor=gpu_nbor;
   _init_done=true;

   _device=gpu;
@@ -130,7 +133,7 @@ void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,

   if (split<0.0) {
     _load_balance=true;
-    _desired_split=0.9;
+    _desired_split=0.90;
   } else {
     _load_balance=false;
     _desired_split=split;
@@ -138,14 +141,14 @@ void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
   _actual_split=_desired_split;
   _avg_split=0.0;
   _avg_count=0;
+  _timestep=0;
 }

 template <class numtyp, class acctyp>
-int PairGPUBalanceT::get_gpu_count(const int timestep, const int ago,
-                                   const int inum_full) {
+int PairGPUBalanceT::get_gpu_count(const int ago, const int inum_full) {
   _measure_this_step=false;
   if (_load_balance) {
-    if (_avg_count<11 || timestep%_HD_BALANCE_EVERY==0) {
+    if (_avg_count<11 || _timestep%_HD_BALANCE_EVERY==0) {
       _measure_this_step=true;
       _inum_full=inum_full;
     }
@@ -156,44 +159,44 @@ int PairGPUBalanceT::get_gpu_count(const int timestep, const int ago,
   }
   _inum=static_cast<int>(floor(_actual_split*inum_full));
   if (_inum==0) _inum++;
+  _timestep++;
   return _inum;
 }

 template <class numtyp, class acctyp>
-void PairGPUBalanceT::balance(const double cpu_time, const bool gpu_nbor) {
+void PairGPUBalanceT::balance(const double cpu_time) {
   if (_measure_this_step) {
+    _measure_this_step=false;
+    double gpu_time=_device_time.seconds();
+
+    double max_gpu_time;
+    MPI_Allreduce(&gpu_time,&max_gpu_time,1,MPI_DOUBLE,MPI_MAX,
+                  _device->gpu_comm());
+
     if (_inum_full==_inum) {
       _desired_split=1.0;
       return;
     }

-    _measure_this_step=false;
-    double gpu_time=_device_time.seconds();
-
-    double cpu_gpu_time[3], max_times[3];
-    cpu_gpu_time[0]=cpu_time/(_inum_full-_inum);
-    cpu_gpu_time[1]=gpu_time/_inum;
-    cpu_gpu_time[2]=(_device->host_time()-cpu_time)/_inum_full;
-
-    MPI_Allreduce(cpu_gpu_time,max_times,3,MPI_DOUBLE,MPI_MAX,
-                  _device->gpu_comm());
-    double split=(max_times[0]+max_times[2])/(max_times[0]+max_times[1]);
-    split*=_HD_BALANCE_GAP;
-
-    if (split>1.0)
-      split=1.0;
-    if (_avg_count<10)
-      _desired_split=(_desired_split*_avg_count+split)/(_avg_count+1);
-    else
-      _desired_split=_desired_split*(1.0-_HD_BALANCE_WEIGHT)+
-                     _HD_BALANCE_WEIGHT*split;
-
-    if (!gpu_nbor) {
+    double cpu_time_per_atom=cpu_time/(_inum_full-_inum);
+    double cpu_other_time=_device->host_time()-cpu_time;
+    int host_inum=static_cast<int>((max_gpu_time-cpu_other_time)/
+                                   cpu_time_per_atom);
+
+    double split=static_cast<double>(_inum_full-host_inum)/_inum_full;
+    _desired_split=split*_HD_BALANCE_GAP;
+    if (_desired_split>1.0)
+      _desired_split=1.0;
+    if (_desired_split<0.0)
+      _desired_split=0.0;
+
+    if (!_gpu_nbor) {
       if (_desired_split<_max_split)
         _actual_split=_desired_split;
       else
         _actual_split=_max_split;
     }
+    //std::cout << gpu_time << " " << max_gpu_time << " " << cpu_other_time << " " << cpu_time_per_atom << " " << cpu_time << " " << _desired_split << " " << host_inum << std::endl;
   }
   _avg_split+=_desired_split;
   _avg_count++;
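
The rewritten balance() replaces the old weighted-average heuristic with a direct estimate: take the slowest device time across procs, subtract the host's non-pair work, and hand the host exactly as many atoms as it can finish in that window, then pad the device share by _HD_BALANCE_GAP. The arithmetic, extracted into a free function for clarity (new_split is a hypothetical name used only for this sketch):

#include <cmath>

double new_split(double max_gpu_time, double cpu_time, double host_time,
                 int inum_full, int inum_gpu, double gap = 1.10) {
  // Per-atom cost of the pair computation on the host side
  double cpu_time_per_atom = cpu_time / (inum_full - inum_gpu);
  // Host work that overlaps the device regardless of the split
  double cpu_other_time = host_time - cpu_time;
  // Atoms the host can finish while the device is busy
  int host_inum = static_cast<int>((max_gpu_time - cpu_other_time) /
                                   cpu_time_per_atom);
  double split = static_cast<double>(inum_full - host_inum) / inum_full;
  split *= gap;                      // bias toward the device, as above
  return std::fmin(1.0, std::fmax(0.0, split));
}
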
|
|||||||
@ -18,7 +18,7 @@
|
|||||||
|
|
||||||
#ifdef NV_KERNEL
|
#ifdef NV_KERNEL
|
||||||
|
|
||||||
#include "geryon/ucl_nv_kernel.h"
|
#include "nv_kernel_def.h"
|
||||||
texture<float4> neigh_tex;
|
texture<float4> neigh_tex;
|
||||||
|
|
||||||
#ifdef _DOUBLE_DOUBLE
|
#ifdef _DOUBLE_DOUBLE
|
||||||
@ -36,6 +36,7 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
|
|||||||
#else
|
#else
|
||||||
|
|
||||||
#define fetch_pos(i,y) x_[i]
|
#define fetch_pos(i,y) x_[i]
|
||||||
|
#define BLOCK_NBOR_BUILD 64
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -54,29 +55,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
|
|||||||
#define numtyp4 float4
|
#define numtyp4 float4
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define CELL_BLOCK_SIZE 64
|
#define BLOCK_CELL_2D 8
|
||||||
#define BLOCK_2D 8
|
|
||||||
|
#define SBBITS 30
|
||||||
|
|
||||||
#define SBBITS 30
|
#define SBBITS 30
|
||||||
|
|
||||||
__kernel void transpose(int *out, int *in, int columns_in, int rows_in)
|
__kernel void transpose(int *out, int *in, int columns_in, int rows_in)
|
||||||
{
|
{
|
||||||
__local float block[BLOCK_2D][BLOCK_2D+1];
|
__local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
|
||||||
|
|
||||||
unsigned ti=THREAD_ID_X;
|
unsigned ti=THREAD_ID_X;
|
||||||
unsigned tj=THREAD_ID_Y;
|
unsigned tj=THREAD_ID_Y;
|
||||||
unsigned bi=BLOCK_ID_X;
|
unsigned bi=BLOCK_ID_X;
|
||||||
unsigned bj=BLOCK_ID_Y;
|
unsigned bj=BLOCK_ID_Y;
|
||||||
|
|
||||||
unsigned i=bi*BLOCK_2D+ti;
|
unsigned i=bi*BLOCK_CELL_2D+ti;
|
||||||
unsigned j=bj*BLOCK_2D+tj;
|
unsigned j=bj*BLOCK_CELL_2D+tj;
|
||||||
if ((i<columns_in) && (j<rows_in))
|
if ((i<columns_in) && (j<rows_in))
|
||||||
block[tj][ti]=in[j*columns_in+i];
|
block[tj][ti]=in[j*columns_in+i];
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
i=bj*BLOCK_2D+ti;
|
i=bj*BLOCK_CELL_2D+ti;
|
||||||
j=bi*BLOCK_2D+tj;
|
j=bi*BLOCK_CELL_2D+tj;
|
||||||
if ((i<rows_in) && (j<columns_in))
|
if ((i<rows_in) && (j<columns_in))
|
||||||
out[j*rows_in+i] = block[ti][tj];
|
out[j*rows_in+i] = block[ti][tj];
|
||||||
}
|
}
|
||||||
@@ -142,6 +144,7 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
                                    int *cell_counts,
                                    int *nbor_list,
                                    int *host_nbor_list,
+                                   int *host_numj,
                                    int neigh_bin_size,
                                    numtyp cell_size,
                                    int ncellx, int ncelly, int ncellz,
@@ -154,8 +157,8 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
 
   int icell = ix + iy*ncellx + iz*ncellx*ncelly;
 
-  __shared__ int cell_list_sh[CELL_BLOCK_SIZE];
-  __shared__ numtyp4 pos_sh[CELL_BLOCK_SIZE];
+  __shared__ int cell_list_sh[BLOCK_NBOR_BUILD];
+  __shared__ numtyp4 pos_sh[BLOCK_NBOR_BUILD];
 
   int icell_begin = cell_counts[icell];
   int icell_end = cell_counts[icell+1];
@@ -185,9 +188,9 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
     neigh_list=neigh_counts+stride;
     nbor_list[pid_i]=pid_i;
   } else {
-    stride=nt-inum;
-    neigh_counts=host_nbor_list+pid_i-inum;
-    neigh_list=neigh_counts+stride;
+    stride=1;
+    neigh_counts=host_numj+pid_i-inum;
+    neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
   }
 
   // loop through neighbors
@@ -203,13 +206,13 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
         int num_atom_cell = jcell_end - jcell_begin;
 
         // load jcell to shared memory
-        int num_iter = (int)ceil((numtyp)num_atom_cell/CELL_BLOCK_SIZE);
+        int num_iter = (int)ceil((numtyp)num_atom_cell/BLOCK_NBOR_BUILD);
 
         for (int k = 0; k < num_iter; k++) {
-          int end_idx = min(CELL_BLOCK_SIZE, num_atom_cell-k*CELL_BLOCK_SIZE);
+          int end_idx = min(BLOCK_NBOR_BUILD, num_atom_cell-k*BLOCK_NBOR_BUILD);
 
           if (tid < end_idx) {
-            pid_j = cell_particle_id[tid+k*CELL_BLOCK_SIZE+jcell_begin];
+            pid_j = cell_particle_id[tid+k*BLOCK_NBOR_BUILD+jcell_begin];
             cell_list_sh[tid] = pid_j;
             atom_j = fetch_pos(pid_j,pos); //[pid_j];
             pos_sh[tid].x = atom_j.x;
@@ -222,7 +225,6 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
 
           for (int j = 0; j < end_idx; j++) {
             int pid_j = cell_list_sh[j]; // gather from shared memory
-            if (pid_i<inum || pid_j<inum || pid_j>pid_i) {
             diff.x = atom_i.x - pos_sh[j].x;
             diff.y = atom_i.y - pos_sh[j].y;
             diff.z = atom_i.z - pos_sh[j].z;
@@ -236,7 +238,6 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
                 cnt++;
             }
           }
-          }
         }
         __syncthreads();
       } // for (k)
@@ -249,9 +250,10 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
 }
 
 __kernel void kernel_special(__global int *dev_nbor,
-                             __global int *host_nbor_list, __global int *tag,
+                             __global int *host_nbor_list,
+                             __global int *host_numj, __global int *tag,
                              __global int *nspecial, __global int *special,
-                             int inum, int nt, int nall) {
+                             int inum, int nt, int nall, int max_nbors) {
   // ii indexes the two interacting particles in gi
   int ii=GLOBAL_ID_X;
 
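The calc_neigh_list_cell changes above split the output layout in two: neighbors of device-owned atoms stay column-major (stride inum, so consecutive threads write consecutive words), while neighbors of host-owned atoms now go into one dense row per atom, with the counts in the separate host_numj array. The index arithmetic, as two hedged helpers with illustrative names:

    // Where neighbor k of a given atom lives under each layout.
    inline int dev_nbor_idx(int atom, int k, int inum) {
      return atom + k*inum;                // column-major, coalesced on the GPU
    }
    inline int host_nbor_idx(int atom, int k, int inum, int max_nbors) {
      return (atom - inum)*max_nbors + k;  // dense row, easy to copy home
    }

The dense rows are what make the host-side jlist pointers added later in this commit possible.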
@@ -263,15 +265,17 @@ __kernel void kernel_special(__global int *dev_nbor,
     int n2=nspecial[ii*3+1];
     int n3=nspecial[ii*3+2];
 
+    int numj;
     if (ii < inum) {
       stride=inum;
       list=dev_nbor+stride+ii;
-    } else {
-      stride=nt-inum;
-      list=host_nbor_list+ii-inum;
-    }
-    int numj=*list;
+      numj=*list;
       list+=stride;
+    } else {
+      stride=1;
+      list=host_nbor_list+(ii-inum)*max_nbors;
+      numj=host_numj[ii-inum];
+    }
     list_end=list+numj*stride;
 
     for ( ; list<list_end; list+=stride) {
@@ -294,4 +298,3 @@ __kernel void kernel_special(__global int *dev_nbor,
   }
   } // if ii
 }
-
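kernel_special above now walks either layout with one loop by carrying a per-atom (start, stride, count) triple: stride inum through dev_nbor, stride 1 through the dense host rows. A host-side sketch of that traversal, with assumed array shapes and a counting body standing in for the real tag masking:

    #include <vector>

    int count_neighbors(const std::vector<int> &dev_nbor,
                        const std::vector<int> &host_nbor,
                        const std::vector<int> &host_numj,
                        int ii, int inum, int max_nbors) {
      int stride, numj;
      const int *list;
      if (ii < inum) {                  // device-built, column-major
        stride = inum;
        list = &dev_nbor[inum + ii];    // counts row precedes neighbor rows
        numj = *list;
        list += stride;
      } else {                          // host-bound, one dense row per atom
        stride = 1;
        list = &host_nbor[(ii - inum)*max_nbors];
        numj = host_numj[ii - inum];
      }
      int cnt = 0;
      for (int k = 0; k < numj; ++k, list += stride)
        ++cnt;                          // stand-in for the special masking
      return cnt;
    }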
@@ -19,13 +19,22 @@
 #include "pair_gpu_precision.h"
 #include <map>
 #include <math.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef USE_OPENCL
+#include "pair_gpu_dev_cl.h"
+#else
+#include "pair_gpu_dev_ptx.h"
+#endif
 
 #define PairGPUDeviceT PairGPUDevice<numtyp, acctyp>
 
 template <class numtyp, class acctyp>
 PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false),
                                   _gpu_mode(GPU_FORCE), _first_device(0),
-                                  _last_device(0) {
+                                  _last_device(0), _compiled(false) {
 }
 
 template <class numtyp, class acctyp>
@@ -34,14 +43,19 @@ PairGPUDeviceT::~PairGPUDevice() {
 }
 
 template <class numtyp, class acctyp>
-bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
+int PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
                                  const int first_gpu, const int last_gpu,
                                  const int gpu_mode, const double p_split,
-                                 const int nthreads) {
+                                 const int nthreads, const int t_per_atom) {
   _nthreads=nthreads;
+#ifdef _OPENMP
+  omp_set_num_threads(nthreads);
+#endif
+  _threads_per_atom=t_per_atom;
+  _threads_per_charge=t_per_atom;
 
   if (_device_init)
-    return true;
+    return 0;
   _device_init=true;
   _comm_world=world;
   _comm_replica=replica;
@@ -96,7 +110,12 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
   // set the device ID
   _procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/
                                        (last_gpu-first_gpu+1)));
-  int my_gpu=node_rank/_procs_per_gpu;
+  int my_gpu=node_rank/_procs_per_gpu+first_gpu;
+
+  // Time on the device only if 1 proc per gpu
+  _time_device=true;
+  if (_procs_per_gpu>1)
+    _time_device=false;
 
   // Set up a per device communicator
   MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
@@ -104,39 +123,109 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
 
   gpu=new UCL_Device();
   if (my_gpu>=gpu->num_devices())
-    return false;
+    return -2;
 
   gpu->set(my_gpu);
-  return true;
+
+  _long_range_precompute=0;
+
+  int flag=compile_kernels();
+
+  return flag;
 }
 
 template <class numtyp, class acctyp>
-bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal,
+int PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
+                         const bool rot, const int nlocal,
                          const int host_nlocal, const int nall,
-                         const int maxspecial, const bool gpu_nbor,
+                         PairGPUNbor *nbor, const int maxspecial,
                          const int gpu_host, const int max_nbors,
                          const double cell_size, const bool pre_cut) {
   if (!_device_init)
-    return false;
-  if (_init_count==0) {
-    // Initialize atom and nbor data
+    return -1;
+  if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
+    return -5;
 
+  // Counts of data transfers for timing overhead estimates
+  _data_in_estimate=0;
+  _data_out_estimate=1;
+
+  // Initial number of local particles
   int ef_nlocal=nlocal;
   if (_particle_split<1.0 && _particle_split>0.0)
     ef_nlocal=static_cast<int>(_particle_split*nlocal);
-  if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor,
-                 gpu_nbor && maxspecial>0))
-    return false;
-  if (!nbor.init(ef_nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor,
-                 gpu_host,pre_cut))
-    return false;
-  nbor.cell_size(cell_size);
+
+  bool gpu_nbor=false;
+  if (_gpu_mode==GPU_NEIGH)
+    gpu_nbor=true;
+
+  if (_init_count==0) {
+    // Initialize atom and nbor data
+    if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0))
+      return -3;
+
+    _data_in_estimate++;
+    if (charge)
+      _data_in_estimate++;
+    if (rot)
+      _data_in_estimate++;
   } else {
-    if (cell_size>nbor.cell_size())
-      nbor.cell_size(cell_size);
+    if (atom.charge()==false && charge)
+      _data_in_estimate++;
+    if (atom.quat()==false && rot)
+      _data_in_estimate++;
+    if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor && maxspecial))
+      return -3;
   }
 
+  if (!ans.init(ef_nlocal,charge,rot,*gpu))
+    return -3;
+
+  if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
+                  *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
+                  _block_cell_id, _block_nbor_build))
+    return -3;
+  nbor->cell_size(cell_size);
+
   _init_count++;
-  return true;
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+int PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal,
+                         const int nall) {
+  if (!_device_init)
+    return -1;
+  if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
+    return -5;
+
+  if (_init_count==0) {
+    // Initialize atom and nbor data
+    if (!atom.init(nall,true,false,*gpu,false,false))
+      return -3;
+  } else
+    if (!atom.add_fields(true,false,false,false))
+      return -3;
+
+  if (!ans.init(nlocal,true,false,*gpu))
+    return -3;
+
+  _init_count++;
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void PairGPUDeviceT::set_single_precompute
+    (PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm) {
+  _long_range_precompute=1;
+  pppm_single=pppm;
+}
+
+template <class numtyp, class acctyp>
+void PairGPUDeviceT::set_double_precompute
+    (PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm) {
+  _long_range_precompute=2;
+  pppm_double=pppm;
 }
 
 template <class numtyp, class acctyp>
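init_device and the init overloads now return integer status codes instead of a bool; the doc comments added to pair_gpu_device.h further down spell out the meanings. Collected in one place as a hypothetical caller-side helper (not part of the library):

    const char *gpu_init_error(int flag) {
      switch (flag) {
        case  0: return "success";
        case -1: return "fix gpu not found";
        case -2: return "GPU not found";
        case -3: return "out of memory on device";
        case -4: return "GPU library not compiled for this GPU";
        case -5: return "double precision not supported on this card";
        default: return "unknown error";
      }
    }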
@@ -152,11 +241,17 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name,
   fprintf(screen,"\n-------------------------------------");
   fprintf(screen,"-------------------------------------\n");
   fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
-  fprintf(screen,"- with %d procs per device.\n",_procs_per_gpu);
+  fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu);
+#ifdef _OPENMP
+  fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
+#endif
   fprintf(screen,"-------------------------------------");
   fprintf(screen,"-------------------------------------\n");
 
-  for (int i=first_gpu; i<=last_gpu; i++) {
+  int last=last_gpu+1;
+  if (last>gpu->num_devices())
+    last=gpu->num_devices();
+  for (int i=first_gpu; i<last; i++) {
     std::string sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
                       toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+
                       " GHZ (";
@@ -177,32 +272,152 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name,
 }
 
 template <class numtyp, class acctyp>
-void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
-                                  const double max_bytes, FILE *screen) {
-  double single[5], times[5];
+void PairGPUDeviceT::estimate_gpu_overhead(const int kernel_calls,
+                                           double &gpu_overhead,
+                                           double &gpu_driver_overhead) {
+  UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL;
+  UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL;
+  UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL;
+  UCL_Timer over_timer(*gpu);
 
-  single[0]=atom.transfer_time();
+  if (_data_in_estimate>0) {
+    host_data_in=new UCL_H_Vec<int>[_data_in_estimate];
+    dev_data_in=new UCL_D_Vec<int>[_data_in_estimate];
+    timers_in=new UCL_Timer[_data_in_estimate];
+  }
+
+  if (_data_out_estimate>0) {
+    host_data_out=new UCL_H_Vec<int>[_data_out_estimate];
+    dev_data_out=new UCL_D_Vec<int>[_data_out_estimate];
+    timers_out=new UCL_Timer[_data_out_estimate];
+  }
+
+  if (kernel_calls>0) {
+    kernel_data=new UCL_D_Vec<int>[kernel_calls];
+    timers_kernel=new UCL_Timer[kernel_calls];
+  }
+
+  for (int i=0; i<_data_in_estimate; i++) {
+    host_data_in[i].alloc(1,*gpu);
+    dev_data_in[i].alloc(1,*gpu);
+    timers_in[i].init(*gpu);
+  }
+
+  for (int i=0; i<_data_out_estimate; i++) {
+    host_data_out[i].alloc(1,*gpu);
+    dev_data_out[i].alloc(1,*gpu);
+    timers_out[i].init(*gpu);
+  }
+
+  for (int i=0; i<kernel_calls; i++) {
+    kernel_data[i].alloc(1,*gpu);
+    timers_kernel[i].init(*gpu);
+  }
+
+  gpu_overhead=0.0;
+  gpu_driver_overhead=0.0;
+
+  for (int i=0; i<10; i++) {
+    gpu->sync();
+    gpu_barrier();
+    over_timer.start();
+    gpu->sync();
+    gpu_barrier();
+
+    double driver_time=MPI_Wtime();
+    for (int i=0; i<_data_in_estimate; i++) {
+      timers_in[i].start();
+      ucl_copy(dev_data_in[i],host_data_in[i],true);
+      timers_in[i].stop();
+    }
+
+    for (int i=0; i<kernel_calls; i++) {
+      timers_kernel[i].start();
+      zero(kernel_data[i],1);
+      timers_kernel[i].stop();
+    }
+
+    for (int i=0; i<_data_out_estimate; i++) {
+      timers_out[i].start();
+      ucl_copy(host_data_out[i],dev_data_out[i],true);
+      timers_out[i].stop();
+    }
+    over_timer.stop();
+
+    double time=over_timer.seconds();
+    driver_time=MPI_Wtime()-driver_time;
+
+    if (time_device()) {
+      for (int i=0; i<_data_in_estimate; i++)
+        timers_in[i].add_to_total();
+      for (int i=0; i<kernel_calls; i++)
+        timers_kernel[i].add_to_total();
+      for (int i=0; i<_data_out_estimate; i++)
+        timers_out[i].add_to_total();
+    }
+
+    double mpi_time, mpi_driver_time;
+    MPI_Allreduce(&time,&mpi_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
+    MPI_Allreduce(&driver_time,&mpi_driver_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
+    gpu_overhead+=mpi_time;
+    gpu_driver_overhead+=mpi_driver_time;
+  }
+  gpu_overhead/=10.0;
+  gpu_driver_overhead/=10.0;
+
+  if (_data_in_estimate>0) {
+    delete [] host_data_in;
+    delete [] dev_data_in;
+    delete [] timers_in;
+  }
+
+  if (_data_out_estimate>0) {
+    delete [] host_data_out;
+    delete [] dev_data_out;
+    delete [] timers_out;
+  }
+
+  if (kernel_calls>0) {
+    delete [] kernel_data;
+    delete [] timers_kernel;
+  }
+}
+
+template <class numtyp, class acctyp>
+void PairGPUDeviceT::output_times(UCL_Timer &time_pair,
+                                  PairGPUAns<numtyp,acctyp> &ans,
+                                  PairGPUNbor &nbor, const double avg_split,
+                                  const double max_bytes,
+                                  const double gpu_overhead,
+                                  const double driver_overhead,
+                                  const int threads_per_atom, FILE *screen) {
+  double single[8], times[8];
+
+  single[0]=atom.transfer_time()+ans.transfer_time();
   single[1]=nbor.time_nbor.total_seconds();
   single[2]=nbor.time_kernel.total_seconds();
   single[3]=time_pair.total_seconds();
-  single[4]=atom.cast_time();
+  single[4]=atom.cast_time()+ans.cast_time();
+  single[5]=gpu_overhead;
+  single[6]=driver_overhead;
+  single[7]=ans.cpu_idle_time();
 
-  MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
+  MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
 
-  double my_max_bytes=max_bytes;
+  double my_max_bytes=max_bytes+atom.max_gpu_bytes();
   double mpi_max_bytes;
   MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
   double max_mb=mpi_max_bytes/(1024.0*1024.0);
 
   if (replica_me()==0)
-    if (screen && times[3]>0.0) {
+    if (screen && times[5]>0.0) {
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
       fprintf(screen," GPU Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
 
-      if (procs_per_gpu()==1) {
+      if (time_device()) {
        fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size);
        fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size);
        fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size);
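estimate_gpu_overhead above times an essentially empty round trip: one-element host-to-device copies for each expected input, a zeroing kernel per expected kernel call, and one-element copies back, repeated ten times, with the per-trial maximum taken across the procs sharing the device and the result averaged. The measurement skeleton, with the actual transfers elided:

    #include <mpi.h>

    // Sketch of the overhead-estimation loop; the dummy work is elided.
    double estimate_overhead(MPI_Comm gpu_comm, int trials) {
      double total = 0.0;
      for (int t = 0; t < trials; ++t) {
        MPI_Barrier(gpu_comm);             // start everyone together
        double t0 = MPI_Wtime();
        // ... enqueue the tiny in-copies, kernels, and out-copies here ...
        double dt = MPI_Wtime() - t0, dt_max;
        MPI_Allreduce(&dt, &dt_max, 1, MPI_DOUBLE, MPI_MAX, gpu_comm);
        total += dt_max;                   // slowest proc sets the overhead
      }
      return total/trials;
    }

Taking the maximum rather than the mean is the point: with several procs per device, the slowest caller determines the per-step cost.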
@@ -212,7 +427,71 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
       fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
       fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size);
      }
+     fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size);
      fprintf(screen,"Average split: %.4f.\n",avg_split);
+     fprintf(screen,"Threads / atom: %d.\n",threads_per_atom);
+     fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
+     fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size);
+     fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size);
+
+     fprintf(screen,"-------------------------------------");
+     fprintf(screen,"--------------------------------\n\n");
+   }
+}
+
+template <class numtyp, class acctyp>
+void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
+                                         UCL_Timer &time_out,
+                                         UCL_Timer &time_map,
+                                         UCL_Timer &time_rho,
+                                         UCL_Timer &time_interp,
+                                         PairGPUAns<numtyp,acctyp> &ans,
+                                         const double max_bytes,
+                                         const double cpu_time,
+                                         const double idle_time, FILE *screen) {
+  double single[8], times[8];
+
+  single[0]=time_out.total_seconds();
+  single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time();
+  single[2]=time_map.total_seconds();
+  single[3]=time_rho.total_seconds();
+  single[4]=time_interp.total_seconds();
+  single[5]=ans.transfer_time()+ans.cast_time();
+  single[6]=cpu_time;
+  single[7]=idle_time;
+
+  MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
+
+  double my_max_bytes=max_bytes+atom.max_gpu_bytes();
+  double mpi_max_bytes;
+  MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
+  double max_mb=mpi_max_bytes/(1024.0*1024.0);
+
+  if (replica_me()==0)
+    if (screen && times[6]>0.0) {
+      fprintf(screen,"\n\n-------------------------------------");
+      fprintf(screen,"--------------------------------\n");
+      fprintf(screen," GPU Time Info (average): ");
+      fprintf(screen,"\n-------------------------------------");
+      fprintf(screen,"--------------------------------\n");
+
+      if (time_device()) {
+        fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size);
+        fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size);
+        fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size);
+        fprintf(screen,"Kernel (rho): %.4f s.\n",times[3]/_replica_size);
+        fprintf(screen,"Force interp: %.4f s.\n",times[4]/_replica_size);
+        fprintf(screen,"Total rho: %.4f s.\n",
+                (times[0]+times[2]+times[3])/_replica_size);
+        fprintf(screen,"Total interp: %.4f s.\n",
+                (times[1]+times[4])/_replica_size);
+        fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size);
+        fprintf(screen,"Total: %.4f s.\n",
+                (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/
+                _replica_size);
+      }
+      fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size);
+      fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size);
       fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
 
       fprintf(screen,"-------------------------------------");
@@ -223,10 +502,17 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::clear() {
   if (_init_count>0) {
+    _long_range_precompute=0;
     _init_count--;
     if (_init_count==0) {
       atom.clear();
-      nbor.clear();
+      _nbor_shared.clear();
+      if (_compiled) {
+        k_zero.clear();
+        k_info.clear();
+        delete dev_program;
+        _compiled=false;
+      }
     }
   }
 }
@@ -241,21 +527,80 @@ void PairGPUDeviceT::clear_device() {
   }
 }
 
+template <class numtyp, class acctyp>
+int PairGPUDeviceT::compile_kernels() {
+  int flag=0;
+
+  if (_compiled)
+    return flag;
+
+  std::string flags="-cl-mad-enable";
+  dev_program=new UCL_Program(*gpu);
+  int success=dev_program->load_string(pair_gpu_dev_kernel,flags.c_str());
+  if (success!=UCL_SUCCESS)
+    return -4;
+  k_zero.set_function(*dev_program,"kernel_zero");
+  k_info.set_function(*dev_program,"kernel_info");
+  _compiled=true;
+
+  UCL_H_Vec<int> h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
+  UCL_D_Vec<int> d_gpu_lib_data(14,*gpu);
+  k_info.set_size(1,1);
+  k_info.run(&d_gpu_lib_data.begin());
+  ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false);
+
+#ifndef USE_OPENCL
+  if (static_cast<double>(h_gpu_lib_data[0])/100.0>gpu->arch())
+    return -4;
+#endif
+
+  _num_mem_threads=h_gpu_lib_data[1];
+  _warp_size=h_gpu_lib_data[2];
+  if (_threads_per_atom<1)
+    _threads_per_atom=h_gpu_lib_data[3];
+  if (_threads_per_charge<1)
+    _threads_per_charge=h_gpu_lib_data[13];
+  _pppm_max_spline=h_gpu_lib_data[4];
+  _pppm_block=h_gpu_lib_data[5];
+  _block_pair=h_gpu_lib_data[6];
+  _max_shared_types=h_gpu_lib_data[7];
+  _block_cell_2d=h_gpu_lib_data[8];
+  _block_cell_id=h_gpu_lib_data[9];
+  _block_nbor_build=h_gpu_lib_data[10];
+  _block_bio_pair=h_gpu_lib_data[11];
+  _max_bio_shared_types=h_gpu_lib_data[12];
+
+  if (static_cast<size_t>(_block_pair)>gpu->group_size())
+    _block_pair=gpu->group_size();
+  if (static_cast<size_t>(_block_bio_pair)>gpu->group_size())
+    _block_bio_pair=gpu->group_size();
+  if (_threads_per_atom>_warp_size)
+    _threads_per_atom=_warp_size;
+  if (_warp_size%_threads_per_atom!=0)
+    _threads_per_atom=1;
+  if (_threads_per_charge>_warp_size)
+    _threads_per_charge=_warp_size;
+  if (_warp_size%_threads_per_charge!=0)
+    _threads_per_charge=1;
+
+  return flag;
+}
+
 template <class numtyp, class acctyp>
 double PairGPUDeviceT::host_memory_usage() const {
-  return atom.host_memory_usage()+
-         nbor.host_memory_usage()+4*sizeof(numtyp)+
+  return atom.host_memory_usage()+4*sizeof(numtyp)+
         sizeof(PairGPUDevice<numtyp,acctyp>);
 }
 
 template class PairGPUDevice<PRECISION,ACC_PRECISION>;
 PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 
-bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
+int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                     const int last_gpu, const int gpu_mode,
-                    const double particle_split, const int nthreads) {
+                    const double particle_split, const int nthreads,
+                    const int t_per_atom) {
  return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
-                                     particle_split,nthreads);
+                                     particle_split,nthreads,t_per_atom);
 }
 
 void lmp_clear_device() {
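compile_kernels above queries the library's build-time tuning constants by running kernel_info once and copying fourteen integers back, then clamps them against the actual hardware. The clamping rules in isolation, as a sketch:

    #include <cstddef>

    // Block sizes may not exceed the device work-group limit, and
    // threads-per-atom must evenly divide the warp size or fall back to 1.
    void clamp_tuning(int &threads_per_atom, int &block_pair,
                      int warp_size, std::size_t group_size) {
      if (static_cast<std::size_t>(block_pair) > group_size)
        block_pair = static_cast<int>(group_size);
      if (threads_per_atom > warp_size)
        threads_per_atom = warp_size;
      if (warp_size % threads_per_atom != 0)
        threads_per_atom = 1;
    }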
@@ -264,14 +609,5 @@ void lmp_clear_device() {
 
 double lmp_gpu_forces(double **f, double **tor, double *eatom,
                       double **vatom, double *virial, double &ecoul) {
-  if (pair_gpu_device.init_count()) {
-    pair_gpu_device.stop_host_timer();
-    pair_gpu_device.gpu->sync();
-    double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul);
-    pair_gpu_device.atom.get_answers(f,tor);
-
-    return evdw;
-  }
-  return 0.0;
+  return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
 }
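lmp_gpu_forces above collapses to a single fix_gpu call; as the header hunks below show, fix_gpu drains a queue of per-style answer objects, summing their energies while each scatters its forces, which the old single-answer path could not do. The queue pattern reduced to a sketch, with a stand-in answer type:

    #include <queue>

    struct Ans { double get_answers() { return 0.0; } };  // stand-in type

    double drain(std::queue<Ans*> &q) {
      double evdw = 0.0;
      while (!q.empty()) {
        evdw += q.front()->get_answers();  // each style adds its piece
        q.pop();
      }
      return evdw;
    }

The queue is what lets several GPU pair styles coexist in one run on a single shared device object.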
@@ -19,11 +19,17 @@
 #define PAIR_GPU_DEVICE_H
 
 #include "pair_gpu_atom.h"
+#include "pair_gpu_ans.h"
 #include "pair_gpu_nbor.h"
+#include "pppm_gpu_memory.h"
 #include "mpi.h"
 #include <sstream>
 #include "stdio.h"
 #include <string>
+#include <queue>
+
+template <class numtyp, class acctyp,
+          class grdtyp, class grdtyp4> class PPPMGPUMemory;
 
 template <class numtyp, class acctyp>
 class PairGPUDevice {
@@ -33,10 +39,15 @@ class PairGPUDevice {
 
   /// Initialize the device for use by this process
   /** Sets up a per-device MPI communicator for load balancing and initializes
-    * the device (>=first_gpu and <=last_gpu) that this proc will be using **/
-  bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
+    * the device (>=first_gpu and <=last_gpu) that this proc will be using
+    * Returns:
+    * -  0 if successful
+    * - -2 if GPU not found
+    * - -4 if GPU library not compiled for GPU **/
+  int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                   const int last_gpu, const int gpu_mode,
-                  const double particle_split, const int nthreads);
+                  const double particle_split, const int nthreads,
+                  const int t_per_atom);
 
   /// Initialize the device for Atom and Neighbor storage
   /** \param rot True if quaternions need to be stored
@@ -50,19 +61,67 @@ class PairGPUDevice {
    * \param max_nbors Initial number of rows in the neighbor matrix
    * \param cell_size cutoff+skin
    * \param pre_cut True if cutoff test will be performed in separate kernel
-    *               than the force kernel **/
-  bool init(const bool charge, const bool rot, const int nlocal,
-            const int host_nlocal, const int nall, const int maxspecial,
-            const bool gpu_nbor, const int gpu_host, const int max_nbors,
-            const double cell_size, const bool pre_cut);
+    *               than the force kernel
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 if double precision is not supported on card **/
+  int init(PairGPUAns<numtyp,acctyp> &a, const bool charge, const bool rot,
+           const int nlocal, const int host_nlocal, const int nall,
+           PairGPUNbor *nbor, const int maxspecial, const int gpu_host,
+           const int max_nbors, const double cell_size, const bool pre_cut);
+
+  /// Initialize the device for Atom storage only
+  /** \param nlocal Total number of local particles to allocate memory for
+    * \param nall Total number of local+ghost particles
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 if double precision is not supported on card **/
+  int init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal, const int nall);
 
   /// Output a message for pair_style acceleration with device stats
   void init_message(FILE *screen, const char *name,
                     const int first_gpu, const int last_gpu);
 
+  /// Perform charge assignment asynchronously for PPPM
+  void set_single_precompute(PPPMGPUMemory<numtyp,acctyp,
+                             float,_lgpu_float4> *pppm);
+
+  /// Perform charge assignment asynchronously for PPPM
+  void set_double_precompute(PPPMGPUMemory<numtyp,acctyp,
+                             double,_lgpu_double4> *pppm);
+
+  /// Estimate the overhead from GPU calls from multiple procs
+  /** \param kernel_calls Number of kernel calls/timestep for timing estimated
+    *                     overhead
+    * \param gpu_overhead Estimated gpu overhead per timestep (sec)
+    * \param driver_overhead Estimated overhead from driver per timestep (s) **/
+  void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead,
+                             double &gpu_driver_overhead);
+
+  /// Returns true if double precision is supported on card
+  inline bool double_precision() { return gpu->double_precision(); }
+
   /// Output a message with timing information
-  void output_times(UCL_Timer &time_pair, const double avg_split,
-                    const double max_bytes, FILE *screen);
+  void output_times(UCL_Timer &time_pair, PairGPUAns<numtyp,acctyp> &ans,
+                    PairGPUNbor &nbor, const double avg_split,
+                    const double max_bytes, const double gpu_overhead,
+                    const double driver_overhead,
+                    const int threads_per_atom, FILE *screen);
+
+  /// Output a message with timing information
+  void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
+                           UCL_Timer &time_map, UCL_Timer &time_rho,
+                           UCL_Timer &time_interp,
+                           PairGPUAns<numtyp,acctyp> &ans,
+                           const double max_bytes, const double cpu_time,
+                           const double cpu_idle_time, FILE *screen);
 
   /// Clear all memory on host and device associated with atom and nbor data
   void clear();
@@ -70,11 +129,37 @@ class PairGPUDevice {
   /// Clear all memory on host and device
   void clear_device();
 
+  /// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
+  inline void add_ans_object(PairGPUAns<numtyp,acctyp> *ans)
+    { ans_queue.push(ans); }
+
+  /// Add "answers" (force,energies,etc.) into LAMMPS structures
+  inline double fix_gpu(double **f, double **tor, double *eatom,
+                        double **vatom, double *virial, double &ecoul) {
+    atom.data_unavail();
+    if (ans_queue.empty()==false) {
+      stop_host_timer();
+      double evdw=0.0;
+      while (ans_queue.empty()==false) {
+        evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
+        ans_queue.pop();
+      }
+      return evdw;
+    }
+    return 0.0;
+  }
+
   /// Start timer on host
-  inline void start_host_timer() { _cpu_full=MPI_Wtime(); }
+  inline void start_host_timer()
+    { _cpu_full=MPI_Wtime(); _host_timer_started=true; }
 
   /// Stop timer on host
-  inline void stop_host_timer() { _cpu_full=MPI_Wtime()-_cpu_full; }
+  inline void stop_host_timer() {
+    if (_host_timer_started) {
+      _cpu_full=MPI_Wtime()-_cpu_full;
+      _host_timer_started=false;
+    }
+  }
 
   /// Return host time
   inline double host_time() { return _cpu_full; }
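The guarded host timer above makes stop_host_timer a no-op unless a measurement is actually open, which matters once several styles share one device object and may each try to stop the clock. The guard as a free-standing sketch:

    #include <mpi.h>

    struct HostTimer {
      double cpu_full = 0.0;
      bool started = false;
      void start() { cpu_full = MPI_Wtime(); started = true; }
      void stop() {
        if (started) {                     // ignore stray stops
          cpu_full = MPI_Wtime() - cpu_full;
          started = false;
        }
      }
    };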
@@ -114,6 +199,42 @@ class PairGPUDevice {
   inline double particle_split() const { return _particle_split; }
   /// Return the initialization count for the device
   inline int init_count() const { return _init_count; }
+  /// True if device is being timed
+  inline bool time_device() const { return _time_device; }
+
+  /// Return the number of threads accessing memory simultaneously
+  inline int num_mem_threads() const { return _num_mem_threads; }
+  /// Return the number of threads per atom for pair styles
+  inline int threads_per_atom() const { return _threads_per_atom; }
+  /// Return the number of threads per atom for pair styles using charge
+  inline int threads_per_charge() const { return _threads_per_charge; }
+  /// Return the min of the pair block size or the device max block size
+  inline int pair_block_size() const { return _block_pair; }
+  /// Return the maximum number of atom types that can be used with shared mem
+  inline int max_shared_types() const { return _max_shared_types; }
+  /// Return the maximum order for PPPM splines
+  inline int pppm_max_spline() const { return _pppm_max_spline; }
+  /// Return the block size for PPPM kernels
+  inline int pppm_block() const { return _pppm_block; }
+  /// Return the block size for neighbor binning
+  inline int block_cell_2d() const { return _block_cell_2d; }
+  /// Return the block size for atom mapping for neighbor builds
+  inline int block_cell_id() const { return _block_cell_id; }
+  /// Return the block size for neighbor build kernel
+  inline int block_nbor_build() const { return _block_nbor_build; }
+  /// Return the block size for "bio" pair styles
+  inline int block_bio_pair() const { return _block_bio_pair; }
+  /// Return the maximum number of atom types for shared mem with "bio" styles
+  inline int max_bio_shared_types() const { return _max_bio_shared_types; }
+
+  // -------------------- SHARED DEVICE ROUTINES --------------------
+  // Perform asynchronous zero of integer array
+  void zero(UCL_D_Vec<int> &mem, const int numel) {
+    int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
+                                         _block_pair));
+    k_zero.set_size(num_blocks,_block_pair);
+    k_zero.run(&mem.begin(),&numel);
+  }
 
   // -------------------------- DEVICE DATA -------------------------
 
@@ -130,11 +251,30 @@ class PairGPUDevice {
   // --------------------------- NBOR DATA ----------------------------
 
   /// Neighbor Data
-  PairGPUNbor nbor;
+  PairGPUNborShared _nbor_shared;
+
+  // ------------------------ LONG RANGE DATA -------------------------
+
+  // Long Range Data
+  int _long_range_precompute;
+  PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm_single;
+  PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm_double;
+  /// Precomputations for long range charge assignment (asynchronously)
+  inline void precompute(const int ago, const int nlocal, const int nall,
+                         double **host_x, int *host_type, bool &success,
+                         double *charge, double *boxlo, double *prd) {
+    if (_long_range_precompute==1)
+      pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
+                              boxlo,prd);
+    else if (_long_range_precompute==2)
+      pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
+                              boxlo,prd);
+  }
 
  private:
+  std::queue<PairGPUAns<numtyp,acctyp> *> ans_queue;
   int _init_count;
-  bool _device_init;
+  bool _device_init, _host_timer_started, _time_device;
   MPI_Comm _comm_world, _comm_replica, _comm_gpu;
   int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
       _replica_size;
@@ -142,6 +282,19 @@ class PairGPUDevice {
   double _particle_split;
   double _cpu_full;
 
+  int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
+  int _pppm_max_spline, _pppm_block;
+  int _block_pair, _max_shared_types;
+  int _block_cell_2d, _block_cell_id, _block_nbor_build;
+  int _block_bio_pair, _max_bio_shared_types;
+
+  UCL_Program *dev_program;
+  UCL_Kernel k_zero, k_info;
+  bool _compiled;
+  int compile_kernels();
+
+  int _data_in_estimate, _data_out_estimate;
+
   template <class t>
   inline std::string toa(const t& in) {
     std::ostringstream o;
@@ -18,15 +18,9 @@
 
 #include "pair_gpu_precision.h"
 #include "pair_gpu_nbor.h"
+#include "pair_gpu_device.h"
 #include "math.h"
-
-#ifdef USE_OPENCL
-#include "pair_gpu_nbor_cl.h"
-#else
-#include "pair_gpu_nbor_ptx.h"
-#include "pair_gpu_build_ptx.h"
-#endif
 
 int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
   if (_gpu_nbor)
     return (max_nbors+2)*sizeof(int);
@@ -36,12 +30,18 @@ int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
     return (max_nbors+3)*sizeof(int);
 }
 
-bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors,
+bool PairGPUNbor::init(PairGPUNborShared *shared, const int inum,
+                       const int host_inum, const int max_nbors,
                        const int maxspecial, UCL_Device &devi,
                        const bool gpu_nbor, const int gpu_host,
-                       const bool pre_cut) {
+                       const bool pre_cut, const int block_cell_2d,
+                       const int block_cell_id, const int block_nbor_build) {
   clear();
 
+  _block_cell_2d=block_cell_2d;
+  _block_cell_id=block_cell_id;
+  _block_nbor_build=block_nbor_build;
+  _shared=shared;
   dev=&devi;
   _gpu_nbor=gpu_nbor;
   if (gpu_host==0)
@@ -80,8 +80,11 @@ bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors,
   success=success && (host_packed.alloc(2*IJ_SIZE,*dev,
                                         UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   alloc(success);
+  if (!success)
+    return false;
+
   if (_use_packing==false)
-    compile_kernels(devi);
+    _shared->compile_kernels(devi,gpu_nbor);
 
   return success;
 }
@@ -89,13 +92,14 @@ bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors,
 void PairGPUNbor::alloc(bool &success) {
   dev_nbor.clear();
   host_acc.clear();
+  int nt=_max_atoms+_max_host;
   if (_use_packing==false || _gpu_nbor)
     success=success && (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev,
                                        UCL_READ_ONLY)==UCL_SUCCESS);
   else
     success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
                                        UCL_READ_ONLY)==UCL_SUCCESS);
-  success=success && (host_acc.alloc((_max_atoms+_max_host)*2,*dev,
+  success=success && (host_acc.alloc(nt*2,*dev,
                                      UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
 
   _c_bytes=dev_nbor.row_bytes();
@@ -108,11 +112,31 @@ void PairGPUNbor::alloc(bool &success) {
   if (_max_host>0) {
     host_nbor.clear();
     dev_host_nbor.clear();
-    success=success && (host_nbor.alloc((_max_nbors+1)*_max_host,*dev,
+    dev_host_numj.clear();
+    host_ilist.clear();
+    host_jlist.clear();
+
+    success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev,
                                         UCL_RW_OPTIMIZED)==UCL_SUCCESS);
-    success=success && (dev_host_nbor.alloc((_max_nbors+1)*_max_host,
+    success=success && (dev_host_nbor.alloc(_max_nbors*_max_host,
                                             *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
-    _c_bytes+=dev_host_nbor.row_bytes();
+    success=success && (dev_host_numj.alloc(_max_host,*dev,
+                                            UCL_WRITE_ONLY)==UCL_SUCCESS);
+    success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
+    if (!success)
+      return;
+    for (int i=0; i<nt; i++)
+      host_ilist[i]=i;
+    success=success && (host_jlist.alloc(_max_host,*dev,
+                                         UCL_NOT_PINNED)==UCL_SUCCESS);
+    if (!success)
+      return;
+    int *ptr=host_nbor.begin();
+    for (int i=0; i<_max_host; i++) {
+      host_jlist[i]=ptr;
+      ptr+=_max_nbors;
+    }
+    _c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes();
   }
   if (_maxspecial>0) {
     dev_nspecial.clear();
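alloc() above carves the flat host_nbor buffer into _max_host fixed-length rows and records each row's starting address in host_jlist, so host code can hand LAMMPS a conventional jagged list without extra copies. The row-pointer setup with standard containers standing in for the UCL types:

    #include <cstddef>
    #include <vector>

    void build_jlist(std::vector<int> &host_nbor,
                     std::vector<int*> &host_jlist,
                     int max_host, int max_nbors) {
      host_nbor.resize(static_cast<std::size_t>(max_host)*max_nbors);
      host_jlist.resize(max_host);
      int *ptr = host_nbor.data();
      for (int i = 0; i < max_host; i++) {
        host_jlist[i] = ptr;   // row i starts here
        ptr += max_nbors;      // rows are packed back to back
      }
    }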
@@ -145,6 +169,9 @@ void PairGPUNbor::clear() {
   dev_host_nbor.clear();
   dev_packed.clear();
   host_nbor.clear();
+  dev_host_numj.clear();
+  host_ilist.clear();
+  host_jlist.clear();
   dev_nspecial.clear();
   dev_special.clear();
   dev_special_t.clear();
@@ -152,27 +179,13 @@ void PairGPUNbor::clear() {
     time_kernel.clear();
     time_nbor.clear();
   }
-
-  if (_compiled) {
-    if (_gpu_nbor) {
-      k_cell_id.clear();
-      k_cell_counts.clear();
-      k_build_nbor.clear();
-      k_transpose.clear();
-      k_special.clear();
-      delete build_program;
-    } else {
-      k_nbor.clear();
-      delete nbor_program;
-    }
-    _compiled=false;
-  }
 }
 
 double PairGPUNbor::host_memory_usage() const {
   if (_gpu_nbor) {
     if (_gpu_host)
-      return host_nbor.row_bytes()*host_nbor.rows();
+      return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+
+             host_jlist.row_bytes();
     else
       return 0;
   } else
@@ -186,7 +199,7 @@ void PairGPUNbor::get_host(const int inum, int *ilist, int *numj,
 
   UCL_H_Vec<int> ilist_view;
   ilist_view.view(ilist,inum,*dev);
-  ucl_copy(dev_nbor,ilist_view,true);
+  ucl_copy(dev_nbor,ilist_view,false);
 
   UCL_D_Vec<int> nbor_offset;
   UCL_H_Vec<int> host_offset;
@@ -238,46 +251,20 @@ void PairGPUNbor::get_host(const int inum, int *ilist, int *numj,
   if (_use_packing==false) {
     time_kernel.start();
     int GX=static_cast<int>(ceil(static_cast<double>(inum)/block_size));
-    k_nbor.set_size(GX,block_size);
-    k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum);
+    _shared->k_nbor.set_size(GX,block_size);
+    _shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum);
     time_kernel.stop();
   }
 }
 
-void PairGPUNbor::compile_kernels(UCL_Device &dev) {
-  std::string flags="-cl-fast-relaxed-math -cl-mad-enable";
-
-  if (_gpu_nbor==false) {
-    nbor_program=new UCL_Program(dev);
-    nbor_program->load_string(pair_gpu_nbor_kernel,flags.c_str());
-    k_nbor.set_function(*nbor_program,"kernel_unpack");
-  } else {
-    build_program=new UCL_Program(dev);
-    #ifdef USE_OPENCL
-    std::cerr << "CANNOT CURRENTLY USE GPU NEIGHBORING WITH OPENCL\n";
-    exit(1);
-    #else
-    build_program->load_string(pair_gpu_build_kernel,flags.c_str());
-    #endif
-    k_cell_id.set_function(*build_program,"calc_cell_id");
-    k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts");
-    k_build_nbor.set_function(*build_program,"calc_neigh_list_cell");
-    k_transpose.set_function(*build_program,"transpose");
-    k_special.set_function(*build_program,"kernel_special");
-    neigh_tex.get_texture(*build_program,"neigh_tex");
-  }
-  _compiled=true;
-}
-
 template <class numtyp, class acctyp>
 void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
                                   const int nall,
                                   PairGPUAtom<numtyp,acctyp> &atom,
-                                  double *boxlo, double *boxhi, int *tag,
+                                  double *sublo, double *subhi, int *tag,
                                   int **nspecial, int **special, bool &success,
                                   int &mn) {
   const int nt=inum+host_inum;
 
   if (_maxspecial>0) {
     time_nbor.start();
     UCL_H_Vec<int> view_nspecial, view_special, view_tag;
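build_nbor_list now bins atoms against this proc's subdomain (sublo to subhi) rather than the whole simulation box, padding the grid with one cell_size halo on each side for ghosts. The per-dimension cell count as a checkable helper; a sketch of the arithmetic in the hunk below, not library API:

    #include <cmath>

    inline int ncells(double sublo, double subhi, double cell_size) {
      return static_cast<int>(
        std::ceil(((subhi - sublo) + 2.0*cell_size)/cell_size));
    }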
@ -290,25 +277,25 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
|
|||||||
time_nbor.stop();
|
time_nbor.stop();
|
||||||
time_nbor.add_to_total();
|
time_nbor.add_to_total();
|
||||||
time_kernel.start();
|
time_kernel.start();
|
||||||
const int b2x=8;
|
const int b2x=_block_cell_2d;
|
||||||
const int b2y=8;
|
const int b2y=_block_cell_2d;
|
||||||
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
|
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
|
||||||
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
|
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
|
||||||
k_transpose.set_size(g2x,g2y,b2x,b2y);
|
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
|
||||||
k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),&_maxspecial,
|
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
|
||||||
&nt);
|
&_maxspecial,&nt);
|
||||||
} else
|
} else
|
||||||
time_kernel.start();
|
time_kernel.start();
|
||||||
|
|
||||||
_nbor_pitch=inum;
|
_nbor_pitch=inum;
|
||||||
neigh_tex.bind_float(atom.dev_x,4);
|
_shared->neigh_tex.bind_float(atom.dev_x,4);
|
||||||
|
|
||||||
int ncellx, ncelly, ncellz, ncell_3d;
|
int ncellx, ncelly, ncellz, ncell_3d;
|
||||||
ncellx = static_cast<int>(ceil(((boxhi[0] - boxlo[0]) +
|
ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) +
|
||||||
2.0*_cell_size)/_cell_size));
|
2.0*_cell_size)/_cell_size));
|
||||||
ncelly = static_cast<int>(ceil(((boxhi[1] - boxlo[1]) +
|
ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) +
|
||||||
2.0*_cell_size)/_cell_size));
|
2.0*_cell_size)/_cell_size));
|
||||||
ncellz = static_cast<int>(ceil(((boxhi[2] - boxlo[2]) +
|
ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) +
|
||||||
2.0*_cell_size)/_cell_size));
|
2.0*_cell_size)/_cell_size));
|
||||||
ncell_3d = ncellx * ncelly * ncellz;
|
ncell_3d = ncellx * ncelly * ncellz;
|
||||||
UCL_D_Vec<int> cell_counts;
|
UCL_D_Vec<int> cell_counts;
|
||||||
@@ -316,34 +303,35 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
   _cell_bytes=cell_counts.row_bytes();

   /* build cell list on GPU */
-  const int neigh_block=128;
+  const int neigh_block=_block_cell_id;
   const int GX=(int)ceil((float)nall/neigh_block);
-  const numtyp boxlo0=static_cast<numtyp>(boxlo[0]);
-  const numtyp boxlo1=static_cast<numtyp>(boxlo[1]);
-  const numtyp boxlo2=static_cast<numtyp>(boxlo[2]);
-  const numtyp boxhi0=static_cast<numtyp>(boxhi[0]);
-  const numtyp boxhi1=static_cast<numtyp>(boxhi[1]);
-  const numtyp boxhi2=static_cast<numtyp>(boxhi[2]);
+  const numtyp sublo0=static_cast<numtyp>(sublo[0]);
+  const numtyp sublo1=static_cast<numtyp>(sublo[1]);
+  const numtyp sublo2=static_cast<numtyp>(sublo[2]);
+  const numtyp subhi0=static_cast<numtyp>(subhi[0]);
+  const numtyp subhi1=static_cast<numtyp>(subhi[1]);
+  const numtyp subhi2=static_cast<numtyp>(subhi[2]);
   const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
-  k_cell_id.set_size(GX,neigh_block);
-  k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
+  _shared->k_cell_id.set_size(GX,neigh_block);
+  _shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
                 &atom.dev_particle_id.begin(),
-                &boxlo0, &boxlo1, &boxlo2, &boxhi0, &boxhi1,
-                &boxhi2, &cell_size_cast, &ncellx, &ncelly, &nall);
+                &sublo0, &sublo1, &sublo2, &subhi0, &subhi1,
+                &subhi2, &cell_size_cast, &ncellx, &ncelly, &nall);

   atom.sort_neighbor(nall);

   /* calculate cell count */
-  k_cell_counts.set_size(GX,neigh_block);
-  k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(), &nall,
-                    &ncell_3d);
+  _shared->k_cell_counts.set_size(GX,neigh_block);
+  _shared->k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(),
+                             &nall, &ncell_3d);

   /* build the neighbor list */
-  const int cell_block=64;
-  k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
-  k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
+  const int cell_block=_block_nbor_build;
+  _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
+  _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
                    &cell_counts.begin(), &dev_nbor.begin(),
-                   &dev_host_nbor.begin(), &_max_nbors, &cell_size_cast,
+                   &dev_host_nbor.begin(), &dev_host_numj.begin(),
+                   &_max_nbors,&cell_size_cast,
                    &ncellx, &ncelly, &ncellz, &inum, &nt, &nall);

   /* Get the maximum number of nbors and realloc if necessary */
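All three launches in this hunk size their 1-D grids the same way: enough blocks of a fixed thread count to cover nall items, with the last block partially idle. For positive integers, the float ceil in the source is equivalent to this sketch (function name assumed for illustration):

    #include <cstdio>

    // Integer form of ceil(n/block): enough blocks to cover n work items.
    static int grid_1d(int n, int block) {
      return (n + block - 1)/block;
    }

    int main() {
      std::printf("%d\n", grid_1d(1000, 128));   // 8 blocks = 1024 threads
      return 0;
    }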
@@ -353,7 +341,7 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
   if (nt>inum) {
     UCL_H_Vec<int> host_offset;
     host_offset.view_offset(inum,host_acc,nt-inum);
-    ucl_copy(host_offset,dev_host_nbor,nt-inum,false);
+    ucl_copy(host_offset,dev_host_numj,nt-inum,false);
   }
   mn=host_acc[0];
   for (int i=1; i<nt; i++)
@@ -368,10 +356,15 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
     if (_max_host>0) {
       host_nbor.clear();
       dev_host_nbor.clear();
-      success=success && (host_nbor.alloc((mn+1)*_max_host,dev_nbor,
+      success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
                                           UCL_RW_OPTIMIZED)==UCL_SUCCESS);
-      success=success && (dev_host_nbor.alloc((mn+1)*_max_host,
+      success=success && (dev_host_nbor.alloc(mn*_max_host,
                                               dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
+      int *ptr=host_nbor.begin();
+      for (int i=0; i<_max_host; i++) {
+        host_jlist[i]=ptr;
+        ptr+=mn;
+      }
       _gpu_bytes+=dev_host_nbor.row_bytes();
     }
     if (_alloc_packed) {
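The added loop hands out non-overlapping, stride-mn slices of the single flat host_nbor allocation, so each host_jlist[i] acts as that atom's neighbor array without per-atom allocations. A self-contained sketch of the same carving pattern (sizes invented for the example):

    #include <cstdio>
    #include <vector>

    int main() {
      const int max_host = 4, mn = 3;            // illustrative sizes
      std::vector<int> flat(max_host*mn, -1);    // stands in for host_nbor
      std::vector<int*> jlist(max_host);         // stands in for host_jlist

      int *ptr = flat.data();
      for (int i = 0; i < max_host; i++) {
        jlist[i] = ptr;                          // row i starts here
        ptr += mn;                               // fixed stride to next row
      }

      jlist[2][1] = 42;                          // write through the view...
      std::printf("%d\n", flat[2*mn + 1]);       // ...prints 42
      return 0;
    }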
@@ -385,28 +378,29 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
     _max_nbors=mn;
     time_kernel.stop();
     time_kernel.add_to_total();
-    build_nbor_list(inum, host_inum, nall, atom, boxlo, boxhi, tag, nspecial,
+    build_nbor_list(inum, host_inum, nall, atom, sublo, subhi, tag, nspecial,
                     special, success, mn);
     return;
   }

   if (_maxspecial>0) {
     const int GX2=static_cast<int>(ceil(static_cast<double>(nt)/cell_block));
-    k_special.set_size(GX2,cell_block);
-    k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
-                  &atom.dev_tag.begin(), &dev_nspecial.begin(),
-                  &dev_special.begin(), &inum, &nt, &nall);
+    _shared->k_special.set_size(GX2,cell_block);
+    _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
+                           &dev_host_numj.begin(), &atom.dev_tag.begin(),
+                           &dev_nspecial.begin(), &dev_special.begin(),
+                           &inum, &nt, &nall, &_max_nbors);
   }
   time_kernel.stop();

   time_nbor.start();
   if (_gpu_host)
-    ucl_copy(host_nbor,dev_host_nbor,host_inum*(mn+1),false);
+    ucl_copy(host_nbor,dev_host_nbor,false);
   time_nbor.stop();
 }

 template void PairGPUNbor::build_nbor_list<PRECISION,ACC_PRECISION>
   (const int inum, const int host_inum, const int nall,
-   PairGPUAtom<PRECISION,ACC_PRECISION> &atom, double *boxlo, double *boxhi,
+   PairGPUAtom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,
    int *, int **, int **, bool &success, int &mn);
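The top of this hunk is a grow-and-retry path: when the measured maximum neighbor count mn exceeds the current capacity, _max_nbors is raised to mn and build_nbor_list re-enters itself once, so the second pass is guaranteed to fit. The control flow, reduced to a runnable sketch (the fixed count stands in for the device-side measurement):

    #include <cstdio>

    static const int measured_max = 75;   // stand-in for the kernel's result

    static void build(int &capacity) {
      int mn = measured_max;              // first pass measures the real max
      if (mn > capacity) {
        capacity = mn;                    // _max_nbors = mn
        build(capacity);                  // rebuild into large-enough buffers
        return;
      }
      std::printf("built with capacity %d\n", capacity);
    }

    int main() { int cap = 50; build(cap); return 0; }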
@@ -19,32 +19,27 @@
 #define PAIR_GPU_NBOR_H

 #include "pair_gpu_atom.h"
+#include "pair_gpu_nbor_shared.h"

 #define IJ_SIZE 131072

 #ifdef USE_OPENCL

-#include "geryon/ocl_device.h"
 #include "geryon/ocl_timer.h"
 #include "geryon/ocl_mat.h"
-#include "geryon/ocl_kernel.h"
-#include "geryon/ocl_texture.h"
 using namespace ucl_opencl;

 #else

-#include "geryon/nvd_device.h"
 #include "geryon/nvd_timer.h"
 #include "geryon/nvd_mat.h"
-#include "geryon/nvd_kernel.h"
-#include "geryon/nvd_texture.h"
 using namespace ucl_cudadr;

 #endif

 class PairGPUNbor {
  public:
-  PairGPUNbor() : _allocated(false), _use_packing(false), _compiled(false) {}
+  PairGPUNbor() : _allocated(false), _use_packing(false) {}
   ~PairGPUNbor() { clear(); }

   /// Determine whether neighbor unpacking should be used
@@ -62,9 +57,11 @@ class PairGPUNbor {
    * 2 if gpu_nbor is true, and host needs a full nbor list
    * \param pre_cut True if cutoff test will be performed in separate kernel
    * than the force kernel **/
-  bool init(const int inum, const int host_inum, const int max_nbors,
-            const int maxspecial, UCL_Device &dev, const bool gpu_nbor,
-            const int gpu_host, const bool pre_cut);
+  bool init(PairGPUNborShared *shared, const int inum, const int host_inum,
+            const int max_nbors, const int maxspecial, UCL_Device &dev,
+            const bool gpu_nbor, const int gpu_host, const bool pre_cut,
+            const int block_cell_2d, const int block_cell_id,
+            const int block_nbor_build);

   /// Set the size of the cutoff+skin
   inline void cell_size(const double size) { _cell_size=size; }
@@ -131,18 +128,18 @@ class PairGPUNbor {
   inline int max_nbors() const { return _max_nbors; }

   /// Loop through neighbor count array and return maximum nbors for a particle
-  inline int max_nbor_loop(const int inum, int *numj) const {
+  inline int max_nbor_loop(const int inum, int *numj, int *ilist) const {
     int mn=0;
     for (int i=0; i<inum; i++)
-      mn=std::max(mn,numj[i]);
+      mn=std::max(mn,numj[ilist[i]]);
     return mn;
   }

   /// Build nbor list on the device
   template <class numtyp, class acctyp>
   void build_nbor_list(const int inum, const int host_inum, const int nall,
-                       PairGPUAtom<numtyp,acctyp> &atom, double *boxlo,
-                       double *boxhi, int *tag, int **nspecial, int **special,
+                       PairGPUAtom<numtyp,acctyp> &atom, double *sublo,
+                       double *subhi, int *tag, int **nspecial, int **special,
                        bool &success, int &max_nbors);

   /// Return the number of bytes used on device
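The new ilist parameter matters because numj is indexed by atom id while the loop runs over list positions; reading numj[i] directly would inspect the wrong atoms whenever the list is not the identity permutation. A small worked example (arrays invented for illustration):

    #include <algorithm>
    #include <cstdio>

    int main() {
      int numj[5]  = {3, 2, 1, 7, 9};   // neighbor counts, indexed by atom id
      int ilist[3] = {4, 1, 3};         // atoms actually in the list
      int mn = 0;
      for (int i = 0; i < 3; i++)
        mn = std::max(mn, numj[ilist[i]]);   // looks up atoms 4, 1, 3
      std::printf("%d\n", mn);               // 9; numj[i] would give 3
      return 0;
    }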
@@ -176,31 +173,31 @@ class PairGPUNbor {
   UCL_H_Vec<int> host_nbor;
   /// Device storage for neighbor list matrix that will be copied to host
   /** - 1st row is numj
-    * - Remaining rows are nbors **/
+    * - Remaining rows are by atom, columns are nbors **/
   UCL_D_Vec<int> dev_host_nbor;
+  UCL_D_Vec<int> dev_host_numj;
+  UCL_H_Vec<int> host_ilist;
+  UCL_H_Vec<int*> host_jlist;
   /// Device storage for special neighbor counts
   UCL_D_Vec<int> dev_nspecial;
   /// Device storage for special neighbors
   UCL_D_Vec<int> dev_special, dev_special_t;
-  /// Texture for cached position/type access with CUDA
-  UCL_Texture neigh_tex;

   /// Device timers
   UCL_Timer time_nbor, time_kernel;

  private:
+  PairGPUNborShared *_shared;
   UCL_Device *dev;
-  UCL_Program *nbor_program, *build_program;
-  UCL_Kernel k_nbor, k_cell_id, k_cell_counts, k_build_nbor;
-  UCL_Kernel k_transpose, k_special;
-  bool _allocated, _use_packing, _compiled;
-  void compile_kernels(UCL_Device &dev);
+  bool _allocated, _use_packing;
   int _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
   bool _gpu_nbor, _gpu_host, _alloc_packed;
   double _cell_size;

   double _gpu_bytes, _c_bytes, _cell_bytes;
   void alloc(bool &success);

+  int _block_cell_2d, _block_cell_id, _block_nbor_build;
 };

 #endif
@@ -84,8 +84,6 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
 #define acctyp4 _lgpu_float4
 #endif

-#define MAX_SHARED_TYPES 8
-#define MAX_BIO_SHARED_TYPES 128
 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};

 #endif