From 5f799182b3822786373f4e10b43a405711bb27d2 Mon Sep 17 00:00:00 2001
From: sjplimp
Date: Mon, 2 May 2011 15:02:52 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6053 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 lib/gpu/Makefile.fermi           |   2 +-
 lib/gpu/Makefile.lens            |   6 +-
 lib/gpu/Makefile.lincoln         |   2 +-
 lib/gpu/Makefile.linux           |   2 +-
 lib/gpu/Makefile.linux_opencl    |   2 +-
 lib/gpu/Makefile.longhorn        |   2 +-
 lib/gpu/Makefile.mac             |   2 +-
 lib/gpu/Makefile.mac_opencl      |   2 +-
 lib/gpu/Nvidia.makefile          | 103 +++++-
 lib/gpu/Opencl.makefile          |  84 +++--
 lib/gpu/README                   |   1 +
 lib/gpu/atomic_gpu_memory.cpp    | 133 ++++---
 lib/gpu/atomic_gpu_memory.h      |  66 ++--
 lib/gpu/charge_gpu_memory.cpp    | 140 +++++---
 lib/gpu/charge_gpu_memory.h      |  72 ++--
 lib/gpu/cmm_cut_gpu.cpp          |  68 ++--
 lib/gpu/cmm_cut_gpu_kernel.cu    | 199 ++++++++---
 lib/gpu/cmm_cut_gpu_memory.cpp   |  49 +--
 lib/gpu/cmm_cut_gpu_memory.h     |  21 +-
 lib/gpu/cmmc_long_gpu.cpp        |  82 ++---
 lib/gpu/cmmc_long_gpu_kernel.cu  | 208 ++++++++---
 lib/gpu/cmmc_long_gpu_memory.cpp |  57 +--
 lib/gpu/cmmc_long_gpu_memory.h   |  25 +-
 lib/gpu/crml_gpu.cpp             |  95 ++---
 lib/gpu/crml_gpu_kernel.cu       | 212 ++++++++---
 lib/gpu/crml_gpu_memory.cpp      |  42 ++-
 lib/gpu/crml_gpu_memory.h        |  29 +-
 lib/gpu/gb_gpu.cpp               | 203 ++++++-----
 lib/gpu/gb_gpu_extra.h           |   5 +-
 lib/gpu/gb_gpu_kernel.cu         | 535 ++++++++++++++-------------
 lib/gpu/gb_gpu_kernel_lj.cu      | 261 ++++++++++----
 lib/gpu/gb_gpu_kernel_nbor.cu    |   5 +-
 lib/gpu/gb_gpu_memory.cpp        |  94 +++--
 lib/gpu/gb_gpu_memory.h          |  75 ++--
 lib/gpu/geryon/VERSION.txt       |   4 +-
 lib/gpu/geryon/nvc_device.h      |   4 +-
 lib/gpu/geryon/nvd_device.h      |  16 +-
 lib/gpu/geryon/nvd_timer.h       |  12 +
 lib/gpu/geryon/ocl_timer.h       |  12 +
 lib/gpu/geryon/ucl_arg_kludge.h  | 597 ++++++++++++++++++++++++++++++-
 lib/gpu/geryon/ucl_d_mat.h       |  40 ++-
 lib/gpu/geryon/ucl_d_vec.h       |  35 +-
 lib/gpu/geryon/ucl_h_mat.h       |  44 ++-
 lib/gpu/geryon/ucl_h_vec.h       |  40 ++-
 lib/gpu/geryon/ucl_nv_kernel.h   |  19 +-
 lib/gpu/lj96_cut_gpu.cpp         |  68 ++--
 lib/gpu/lj96_cut_gpu_kernel.cu   | 197 +++++++---
 lib/gpu/lj96_cut_gpu_memory.cpp  |  35 +-
 lib/gpu/lj96_cut_gpu_memory.h    |  21 +-
 lib/gpu/lj_cut_gpu.cpp           |  67 ++--
 lib/gpu/lj_cut_gpu_kernel.cu     | 197 +++++++---
 lib/gpu/lj_cut_gpu_memory.cpp    |  49 +--
 lib/gpu/lj_cut_gpu_memory.h      |  21 +-
 lib/gpu/ljc_cut_gpu.cpp          |  82 ++---
 lib/gpu/ljc_cut_gpu_kernel.cu    | 209 ++++++++---
 lib/gpu/ljc_cut_gpu_memory.cpp   |  52 +--
 lib/gpu/ljc_cut_gpu_memory.h     |  25 +-
 lib/gpu/ljcl_cut_gpu.cpp         |  82 ++---
 lib/gpu/ljcl_cut_gpu_kernel.cu   | 208 ++++++++---
 lib/gpu/ljcl_cut_gpu_memory.cpp  |  35 +-
 lib/gpu/ljcl_cut_gpu_memory.h    |  25 +-
 lib/gpu/pair_gpu_atom.cpp        | 360 +++----------------
 lib/gpu/pair_gpu_atom.h          | 270 +++++++-------
 lib/gpu/pair_gpu_balance.h       |  87 ++---
 lib/gpu/pair_gpu_build_kernel.cu |  77 ++--
 lib/gpu/pair_gpu_device.cpp      | 448 ++++++++++++++++++++---
 lib/gpu/pair_gpu_device.h        | 181 +++++++++-
 lib/gpu/pair_gpu_nbor.cpp        | 196 +++++-----
 lib/gpu/pair_gpu_nbor.h          |  41 +--
 lib/gpu/pair_gpu_precision.h     |   2 -
 70 files changed, 4489 insertions(+), 2253 deletions(-)

diff --git a/lib/gpu/Makefile.fermi b/lib/gpu/Makefile.fermi
index d830c8924c..98c823cf40 100644
--- a/lib/gpu/Makefile.fermi
+++ b/lib/gpu/Makefile.fermi
@@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include
 CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON
 
 BIN_DIR = ./
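The -DUCL_NO_EXIT define added to Makefile.fermi above (and to every platform makefile below) compiles Geryon so that device errors are reported instead of terminating the process, which is what allows the init routines later in this patch to hand an error code back to LAMMPS. A minimal sketch of the pattern under that assumption; UCL_CHECK, UCL_SUCCESS, and UCL_ERROR are hypothetical names, not Geryon's actual API:

#include <cstdio>
#include <cstdlib>

enum { UCL_SUCCESS = 0, UCL_ERROR = -3 };   // illustrative codes only

#ifdef UCL_NO_EXIT
// Report the failure and let the caller propagate an error code.
#define UCL_CHECK(err)                                                     \
  do { if ((err) != UCL_SUCCESS) {                                         \
    fprintf(stderr, "UCL error %d at %s:%d\n", (err), __FILE__, __LINE__); \
    return UCL_ERROR; } } while (0)
#else
// Default behavior: print the failure and terminate immediately.
#define UCL_CHECK(err)                                                     \
  do { if ((err) != UCL_SUCCESS) {                                         \
    fprintf(stderr, "UCL error %d at %s:%d\n", (err), __FILE__, __LINE__); \
    exit(EXIT_FAILURE); } } while (0)
#endif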
diff --git a/lib/gpu/Makefile.lens b/lib/gpu/Makefile.lens
index 3b6301277f..d049967c5f 100644
--- a/lib/gpu/Makefile.lens
+++ b/lib/gpu/Makefile.lens
@@ -17,16 +17,16 @@
 #  Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
-CUDA_HOME = /sw/analysis-x64/cuda/3.0/sl5.0_binary/
+CUDA_HOME = /sw/analysis-x64/cuda/3.2/centos5.5_binary/
 NVCC = nvcc
 
 CUDA_ARCH = -arch=sm_13
-CUDA_PRECISION = -D_SINGLE_SINGLE
+CUDA_PRECISION = -D_SINGLE_DOUBLE
 CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON -openmp
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -openmp
 CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias
 
 BIN_DIR = ./

diff --git a/lib/gpu/Makefile.lincoln b/lib/gpu/Makefile.lincoln
index 97a7901811..bbaca61ef1 100644
--- a/lib/gpu/Makefile.lincoln
+++ b/lib/gpu/Makefile.lincoln
@@ -24,7 +24,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT
 CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops
 
 BIN_DIR = ./

diff --git a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux
index c0001a54ab..d69a00a817 100644
--- a/lib/gpu/Makefile.linux
+++ b/lib/gpu/Makefile.linux
@@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
 
 BIN_DIR = ./

diff --git a/lib/gpu/Makefile.linux_opencl b/lib/gpu/Makefile.linux_opencl
index 69522298c5..3d65c9dc48 100644
--- a/lib/gpu/Makefile.linux_opencl
+++ b/lib/gpu/Makefile.linux_opencl
@@ -17,7 +17,7 @@
 #  Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
-OCL_CPP = mpic++ -I./geryon/opencl -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 OCL_LINK = -lOpenCL
 OCL_PREC = -D_SINGLE_SINGLE
 
diff --git a/lib/gpu/Makefile.longhorn b/lib/gpu/Makefile.longhorn
index ba921f0f68..cc41174332 100644
--- a/lib/gpu/Makefile.longhorn
+++ b/lib/gpu/Makefile.longhorn
@@ -23,7 +23,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(TACC_CUDA_LIB) -Wl,-rpath,$(TACC_CUDA_LIB)
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
 
 BIN_DIR = ./

diff --git a/lib/gpu/Makefile.mac b/lib/gpu/Makefile.mac
index f061a1a68a..5276ac10b2 100644
--- a/lib/gpu/Makefile.mac
+++ b/lib/gpu/Makefile.mac
@@ -24,7 +24,7 @@ CUDA_ARCH = -arch=sm_11
 CUDA_PRECISION = -D_SINGLE_SINGLE
 CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib
-CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -m32
+CUDA_OPTS = -DUNIX -DUCL_NO_EXIT -O3 -Xptxas -v --use_fast_math -m32
 
 CUDR_CPP = mpic++
 CUDR_OPTS = -O2 -m32 -g
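Makefile.lens also moves from -D_SINGLE_SINGLE to -D_SINGLE_DOUBLE. These precision macros select the floating-point types used throughout the library; the kernel sources later in this patch choose them roughly as follows (numtyp carries positions and distances, acctyp carries force/energy accumulation; a sketch of the convention, not the verbatim header):

// Rough shape of the precision selection seen in the kernels:
#ifdef _DOUBLE_DOUBLE
#define numtyp double   // positions/distances in double
#define acctyp double   // accumulation in double
#elif defined(_SINGLE_DOUBLE)
#define numtyp float    // positions/distances in single
#define acctyp double   // accumulation still in double (mixed precision)
#else  // _SINGLE_SINGLE
#define numtyp float
#define acctyp float
#endif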
diff --git a/lib/gpu/Makefile.mac_opencl b/lib/gpu/Makefile.mac_opencl
index 53d6d466e2..50ed67e9c3 100644
--- a/lib/gpu/Makefile.mac_opencl
+++ b/lib/gpu/Makefile.mac_opencl
@@ -17,7 +17,7 @@
 #  Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
-OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON
+OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON -DUCL_NO_EXIT
 OCL_LINK = -framework OpenCL
 OCL_PREC = -D_SINGLE_SINGLE
 
diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index adf281e156..17f616ab37 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -13,7 +13,8 @@
 #
 # /* ----------------------------------------------------------------------
 #    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
-#                          Peng Wang (Nvidia), penwang@nvidia.com
+#                          Peng Wang (Nvidia), penwang@nvidia.com
+#                          Inderaj Bains (NVIDIA), ibains@nvidia.com
 #                          Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
@@ -28,10 +29,11 @@ GPU_LIB = $(LIB_DIR)/libgpu.a
 # Headers for Geryon
 UCL_H  = $(wildcard ./geryon/ucl*.h)
 NVC_H  = $(wildcard ./geryon/nvc*.h) $(UCL_H)
-NVD_H  = $(wildcard ./geryon/nvd*.h) $(UCL_H)
+NVD_H  = $(wildcard ./geryon/nvd*.h) $(UCL_H) nv_kernel_def.h
 # Headers for Pair Stuff
-PAIR_H  = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
-          pair_gpu_device.h pair_gpu_balance.h
+PAIR_H  = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
+          pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
+          pair_gpu_balance.h pppm_gpu_memory.h
 
 ALL_H = $(NVD_H) $(PAIR_H)
 
@@ -39,28 +41,37 @@ EXECS = $(BIN_DIR)/nvc_get_devices
 CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
        $(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \
        $(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
-OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
-       $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
-       $(OBJ_DIR)/charge_gpu_memory.o \
+OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
+       $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
+       $(OBJ_DIR)/pair_gpu_device.o \
+       $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
+       $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
        $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
+       $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
        $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
        $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
+       $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
        $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
        $(CUDPP)
-PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
+PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \
+       $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
        $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \
        $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \
+       $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h \
+       $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h \
        $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \
        $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \
        $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \
        $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
+       $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h \
        $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \
        $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \
-       $(OBJ_DIR)/crml_cut_gpu_kernel.ptx $(OBJ_DIR)/crml_cut_gpu_ptx.h \
+       $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h \
+       $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h \
        $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
        $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h
 
@@ -93,6 +104,9 @@ $(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx
 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h
 	$(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(NVD_H)
+	$(CUDR) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu
 
@@ -105,11 +119,20 @@ $(OBJ_DIR)/pair_gpu_build_kernel.ptx: pair_gpu_build_kernel.cu
 $(OBJ_DIR)/pair_gpu_build_ptx.h: $(OBJ_DIR)/pair_gpu_build_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h
 
-$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
+$(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
+	$(CUDR) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H)
-	$(CUDR) -o $@ -c pair_gpu_device.cpp
+$(OBJ_DIR)/pair_gpu_dev_kernel.ptx: pair_gpu_dev_kernel.cu
+	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_dev_kernel.cu
+
+$(OBJ_DIR)/pair_gpu_dev_ptx.h: $(OBJ_DIR)/pair_gpu_dev_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_dev_kernel.ptx $(OBJ_DIR)/pair_gpu_dev_ptx.h
+
+$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_ptx.h
+	$(CUDR) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
 	$(CUDR) -o $@ -c atomic_gpu_memory.cpp
 
@@ -117,6 +140,24 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.c
 $(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(CUDR) -o $@ -c charge_gpu_memory.cpp
 
+$(OBJ_DIR)/pppm_f_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ pppm_gpu_kernel.cu
+
+$(OBJ_DIR)/pppm_f_gpu_ptx.h: $(OBJ_DIR)/pppm_f_gpu_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h
+
+$(OBJ_DIR)/pppm_d_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ pppm_gpu_kernel.cu
+
+$(OBJ_DIR)/pppm_d_gpu_ptx.h: $(OBJ_DIR)/pppm_d_gpu_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h
+
+$(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_f_gpu_ptx.h $(OBJ_DIR)/pppm_d_gpu_ptx.h
+	$(CUDR) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
+	$(CUDR) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)
+
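The *_ptx.h rules above compile each kernel to PTX with nvcc --ptx and then run geryon/file_to_cstr.sh to wrap that PTX in a header as one C string, which the host library JIT-loads at run time. A self-contained sketch of the load step using the CUDA driver API; the embedded string body is elided here, and Geryon performs the equivalent calls internally:

#include <cuda.h>
#include <cstdio>

// Shape of a generated *_ptx.h: the whole PTX file as a C string.
static const char pair_gpu_dev_kernel[] =
  ".version 1.4\n.target sm_13\n/* ...PTX body elided... */\n";

int main() {
  cuInit(0);
  CUdevice dev;   cuDeviceGet(&dev, 0);
  CUcontext ctx;  cuCtxCreate(&ctx, 0, dev);
  CUmodule mod;
  // JIT-load the embedded PTX into the current context.
  if (cuModuleLoadData(&mod, pair_gpu_dev_kernel) != CUDA_SUCCESS)
    fprintf(stderr, "PTX load failed (expected here: the body is elided)\n");
  cuCtxDestroy(ctx);
  return 0;
}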
 $(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h gb_gpu_extra.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu
 
@@ -144,7 +185,7 @@ $(OBJ_DIR)/lj_cut_gpu_ptx.h: $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_
 $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
+$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp atomic_gpu_memory.h
 	$(CUDR) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx: ljc_cut_gpu_kernel.cu pair_gpu_precision.h
@@ -156,7 +197,7 @@ $(OBJ_DIR)/ljc_cut_gpu_ptx.h: $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_c
 $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
+$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h
@@ -168,9 +209,21 @@ $(OBJ_DIR)/ljcl_cut_gpu_ptx.h: $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc
 $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
+$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/morse_gpu_kernel.ptx: morse_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ morse_gpu_kernel.cu
+
+$(OBJ_DIR)/morse_gpu_ptx.h: $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h
+
+$(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
+	$(CUDR) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp atomic_gpu_memory.h
+	$(CUDR) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/crml_gpu_kernel.ptx: crml_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel.cu
 
@@ -180,7 +233,7 @@ $(OBJ_DIR)/crml_gpu_ptx.h: $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ke
 $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
+$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h
@@ -192,9 +245,21 @@ $(OBJ_DIR)/lj96_cut_gpu_ptx.h: $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj9
 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
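Note how the pppm_f_gpu_kernel.ptx and pppm_d_gpu_kernel.ptx rules earlier in this makefile build the single pppm_gpu_kernel.cu twice, binding grdtyp/grdtyp4 to float/float4 or double/double4 on the nvcc command line, so one source yields both grid precisions. The mechanism in miniature; the kernel body below is invented purely for illustration:

// Built twice by the makefile:
//   nvcc --ptx -DNV_KERNEL -Dgrdtyp=float  -Dgrdtyp4=float4  pppm_gpu_kernel.cu
//   nvcc --ptx -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 pppm_gpu_kernel.cu
#ifndef grdtyp
#define grdtyp float            // default so the file compiles stand-alone
#endif

// Illustrative only; the real PPPM kernels do charge spreading/interpolation.
extern "C" __global__ void scale_grid(grdtyp *grid, grdtyp factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    grid[i] *= factor;
}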
-$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
+$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp atomic_gpu_memory.h
 	$(CUDR) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/lj_expand_gpu_kernel.ptx: lj_expand_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ lj_expand_gpu_kernel.cu
+
+$(OBJ_DIR)/lj_expand_gpu_ptx.h: $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h
+
+$(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
+	$(CUDR) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp atomic_gpu_memory.h
+	$(CUDR) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx: cmm_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ cmm_cut_gpu_kernel.cu
 
@@ -204,7 +269,7 @@ $(OBJ_DIR)/cmm_cut_gpu_ptx.h: $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_c
 $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
+$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp atomic_gpu_memory.h
 	$(CUDR) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx: cmmc_long_gpu_kernel.cu pair_gpu_precision.h
@@ -216,7 +281,7 @@ $(OBJ_DIR)/cmmc_long_gpu_ptx.h: $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/c
 $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
+$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
 
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H)

diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile
index ac7aecc2ee..45e21736a3 100644
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@@ -14,6 +14,7 @@
 # /* ----------------------------------------------------------------------
 #    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 #                          Peng Wang (Nvidia), penwang@nvidia.com
+#                          Inderaj Bains (NVIDIA), ibains@nvidia.com
 #                          Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
@@ -23,30 +24,37 @@ OCL_LIB = $(LIB_DIR)/libgpu.a
 UCL_H  = $(wildcard ./geryon/ucl*.h)
 OCL_H  = $(wildcard ./geryon/ocl*.h) $(UCL_H)
 # Headers for Pair Stuff
-PAIR_H  = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
-          pair_gpu_device.h pair_gpu_balance.h
+PAIR_H  = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
+          pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
+          pair_gpu_balance.h pppm_gpu_memory.h
 
 ALL_H = $(OCL_H) $(PAIR_H)
 
 EXECS = $(BIN_DIR)/ocl_get_devices
-OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
-       $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
-       $(OBJ_DIR)/charge_gpu_memory.o \
+OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
+       $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_nbor.o \
+       $(OBJ_DIR)/pair_gpu_device.o \
+       $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
+       $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
        $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
+       $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
        $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
        $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
+       $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
        $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o
-KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \
+KERS = $(OBJ_DIR)/pair_gpu_dev_cl.h $(OBJ_DIR)/pair_gpu_atom_cl.h \
+       $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/pppm_gpu_cl.h \
        $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \
        $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \
-       $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \
-       $(OBJ_DIR)/crml_gpu_cl.h \
-       $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h
-
+       $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h \
+       $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/morse_gpu_cl.h \
+       $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h \
+       $(OBJ_DIR)/cmmc_long_gpu_cl.h
+
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
 
 all: $(OCL_LIB) $(EXECS)
 
@@ -57,14 +65,23 @@ $(OBJ_DIR)/pair_gpu_atom_cl.h: pair_gpu_atom_kernel.cu
 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(OCL_H) $(OBJ_DIR)/pair_gpu_atom_cl.h
 	$(OCL) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(OCL_H)
+	$(OCL) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/pair_gpu_nbor_cl.h: pair_gpu_nbor_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_nbor_kernel.cu $(OBJ_DIR)/pair_gpu_nbor_cl.h
 
-$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
+$(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
+	$(OCL) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h
 	$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H)
-	$(OCL) -o $@ -c pair_gpu_device.cpp
+$(OBJ_DIR)/pair_gpu_dev_cl.h: pair_gpu_dev_kernel.cu
+	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_dev_kernel.cu $(OBJ_DIR)/pair_gpu_dev_cl.h
+
+$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_cl.h
+	$(OCL) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
 	$(OCL) -o $@ -c atomic_gpu_memory.cpp
 
@@ -72,6 +89,15 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.c
 $(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(OCL) -o $@ -c charge_gpu_memory.cpp
 
+$(OBJ_DIR)/pppm_gpu_cl.h: pppm_gpu_kernel.cu
+	$(BSH) ./geryon/file_to_cstr.sh pppm_gpu_kernel.cu $(OBJ_DIR)/pppm_gpu_cl.h;
+
+$(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_gpu_cl.h $(OBJ_DIR)/pppm_gpu_cl.h
+	$(OCL) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
+	$(OCL) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu
 	$(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h
 
@@ -93,7 +119,7 @@ $(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu
 $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
+$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp atomic_gpu_memory.h
 	$(OCL) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
@@ -102,7 +128,7 @@ $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
 $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
+$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
@@ -111,16 +137,25 @@ $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
 $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
+$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/morse_gpu_cl.h: morse_gpu_kernel.cu
+	$(BSH) ./geryon/file_to_cstr.sh morse_gpu_kernel.cu $(OBJ_DIR)/morse_gpu_cl.h;
+
+$(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
+	$(OCL) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp atomic_gpu_memory.h
+	$(OCL) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/crml_gpu_cl.h: crml_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh crml_gpu_kernel.cu $(OBJ_DIR)/crml_gpu_cl.h;
 
 $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
+$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
@@ -129,16 +164,25 @@ $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
+$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp atomic_gpu_memory.h
 	$(OCL) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
 
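Unlike the CUDA build, the OpenCL *_cl.h headers embed the kernel source text itself (file_to_cstr.sh is run on the .cu file directly), and the program is compiled on the target device at run time. A sketch of that step with the standard OpenCL host API; the context/device setup and the source string are assumed, and the build options are just an example:

#include <CL/cl.h>

// Shape of a generated *_cl.h: the kernel source as one C string.
static const char *lj_cut_gpu_kernel = "/* ...OpenCL C source elided... */";

// Compile the embedded source for one device at run time.
cl_program build_embedded(cl_context ctx, cl_device_id dev, cl_int *err) {
  cl_program prog =
      clCreateProgramWithSource(ctx, 1, &lj_cut_gpu_kernel, NULL, err);
  if (*err == CL_SUCCESS)
    *err = clBuildProgram(prog, 1, &dev, "-D_SINGLE_SINGLE", NULL, NULL);
  return prog;
}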
+$(OBJ_DIR)/lj_expand_gpu_cl.h: lj_expand_gpu_kernel.cu
+	$(BSH) ./geryon/file_to_cstr.sh lj_expand_gpu_kernel.cu $(OBJ_DIR)/lj_expand_gpu_cl.h;
+
+$(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
+	$(OCL) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp atomic_gpu_memory.h
+	$(OCL) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/cmm_cut_gpu_cl.h: cmm_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh cmm_cut_gpu_kernel.cu $(OBJ_DIR)/cmm_cut_gpu_cl.h;
 
 $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
+$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp atomic_gpu_memory.h
 	$(OCL) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
@@ -147,7 +191,7 @@ $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
 $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
+$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
 
 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp

diff --git a/lib/gpu/README b/lib/gpu/README
index 567d81886b..a60d43064a 100644
--- a/lib/gpu/README
+++ b/lib/gpu/README
@@ -14,6 +14,7 @@
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
                          Peng Wang (Nvidia), penwang@nvidia.com
+                         Inderaj Bains (NVIDIA), ibains@nvidia.com
                          Paul Crozier (SNL), pscrozi@sandia.gov
 ------------------------------------------------------------------------- */

diff --git a/lib/gpu/atomic_gpu_memory.cpp b/lib/gpu/atomic_gpu_memory.cpp
index e1cc48048b..531ea4000d 100644
--- a/lib/gpu/atomic_gpu_memory.cpp
+++ b/lib/gpu/atomic_gpu_memory.cpp
@@ -23,23 +23,28 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 
 template <class numtyp, class acctyp>
 AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) {
   device=&pair_gpu_device;
+  ans=new PairGPUAns<numtyp,acctyp>();
+  nbor=new PairGPUNbor();
 }
 
 template <class numtyp, class acctyp>
 AtomicGPUMemoryT::~AtomicGPUMemory() {
+  delete ans;
+  delete nbor;
 }
 
 template <class numtyp, class acctyp>
 int AtomicGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
-  return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
 }
 
 template <class numtyp, class acctyp>
-bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
-                                   const int max_nbors, const int maxspecial,
-                                   const double cell_size,
-                                   const double gpu_split, FILE *_screen,
-                                   const char *pair_program) {
+int AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
+                                  const int max_nbors, const int maxspecial,
+                                  const double cell_size,
+                                  const double gpu_split, FILE *_screen,
+                                  const char *pair_program) {
   nbor_time_avail=false;
   screen=_screen;
 
@@ -48,24 +53,30 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
     gpu_nbor=true;
 
   int _gpu_host=0;
-  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
   if (host_nlocal>0)
     _gpu_host=1;
 
-  if (!device->init(false,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
-                    _gpu_host,max_nbors,cell_size,false))
-    return false;
+  _threads_per_atom=device->threads_per_atom();
+  if (_threads_per_atom>1 && gpu_nbor==false) {
+    nbor->packing(true);
+    _nbor_data=&(nbor->dev_packed);
+  } else
+    _nbor_data=&(nbor->dev_nbor);
+
+  int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
+                           maxspecial,_gpu_host,max_nbors,cell_size,false);
+  if (success!=0)
+    return success;
+
   ucl_device=device->gpu;
   atom=&device->atom;
-  nbor=&device->nbor;
 
-  _block_size=BLOCK_1D;
-  if (static_cast<int>(_block_size)>ucl_device->group_size())
-    _block_size=ucl_device->group_size();
+  _block_size=device->pair_block_size();
   compile_kernels(*ucl_device,pair_program);
 
   // Initialize host-device load balancer
-  hd_balancer.init(device,gpu_split);
+  hd_balancer.init(device,gpu_nbor,gpu_split);
 
   // Initialize timers for the selected GPU
   time_pair.init(*ucl_device);
@@ -73,9 +84,14 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
 
   pos_tex.bind_float(atom->dev_x,4);
 
-  _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
 
-  return true;
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void AtomicGPUMemoryT::estimate_gpu_overhead() {
+  device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
 }
 
 template <class numtyp, class acctyp>
@@ -83,7 +99,10 @@ void AtomicGPUMemoryT::clear_atomic() {
   // Output any timing information
   acc_timers();
   double avg_split=hd_balancer.all_avg_split();
-  device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);
+  _gpu_overhead*=hd_balancer.timestep();
+  _driver_overhead*=hd_balancer.timestep();
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
+                       _gpu_overhead,_driver_overhead,_threads_per_atom,screen);
 
   if (_compiled) {
     k_pair_fast.clear();
@@ -107,8 +126,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
   success=true;
 
   nbor_time_avail=true;
-
-  int mn=nbor->max_nbor_loop(inum,numj);
+  int mn=nbor->max_nbor_loop(inum,numj,ilist);
   resize_atom(inum,nall,success);
   resize_local(inum,mn,success);
   if (!success)
@@ -116,7 +134,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
 
   nbor->get_host(inum,ilist,numj,firstneigh,block_size());
 
-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
 
@@ -130,8 +148,8 @@ template <class numtyp, class acctyp>
 inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
                                               const int host_inum,
                                               const int nall, double **host_x,
-                                              int *host_type, double *boxlo,
-                                              double *boxhi, int *tag,
+                                              int *host_type, double *sublo,
+                                              double *subhi, int *tag,
                                               int **nspecial, int **special,
                                               bool &success) {
   nbor_time_avail=true;
@@ -144,10 +162,10 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
   atom->cast_copy_x(host_x,host_type);
 
   int mn;
-  nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
+  nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
                         nspecial, special, success, mn);
 
-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
 }
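In init_atomic() above, when more than one thread is assigned per atom and neighboring is done on the host, the style switches from dev_nbor to a packed list (nbor->packing(true) / dev_packed) so the cooperating threads read adjacent memory. A host-side illustration of one such interleaved layout, with invented data structures; this is my reading of the intent, and the real packing lives in the device-side neighbor code:

#include <algorithm>
#include <vector>

// Pad each atom's neighbor list to a common stride that is a multiple of t.
// Thread k of the t threads for atom i then reads slots k, k+t, k+2t, ...;
// threads k and k+1 touch adjacent words, so the loads coalesce.
std::vector<int> pack_nbors(const std::vector<std::vector<int> > &nbor,
                            int t) {
  size_t max_n = 0;
  for (size_t i = 0; i < nbor.size(); ++i)
    max_n = std::max(max_n, nbor[i].size());
  size_t stride = ((max_n + t - 1) / t) * t;          // round up to t
  std::vector<int> packed(nbor.size() * stride, -1);  // -1 marks empty slots
  for (size_t i = 0; i < nbor.size(); ++i)
    for (size_t j = 0; j < nbor[i].size(); ++j)
      packed[i * stride + j] = nbor[i][j];
  return packed;
}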
 
@@ -156,24 +174,25 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
-                               const int inum_full, const int nall,
-                               double **host_x, int *host_type,
-                               int *ilist, int *numj, int **firstneigh,
-                               const bool eflag, const bool vflag,
-                               const bool eatom, const bool vatom,
-                               int &host_start, const double cpu_time,
-                               bool &success) {
+void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full,
+                               const int nall, double **host_x, int *host_type,
+                               int *ilist, int *numj, int **firstneigh,
+                               const bool eflag, const bool vflag,
+                               const bool eatom, const bool vatom,
+                               int &host_start, const double cpu_time,
+                               bool &success) {
   acc_timers();
   if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
     zero_timers();
     return;
   }
 
   int ago=hd_balancer.ago_first(f_ago);
-  int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
-                               nbor->gpu_nbor());
-  atom->inum(inum);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
+  ans->inum(inum);
   host_start=inum;
 
   if (ago==0) {
@@ -187,7 +206,8 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
   atom->add_x_data(host_x,host_type);
 
   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();
 }
 
 // ---------------------------------------------------------------------------
 // Reneighbor on GPU if necessary and then compute forces, virials, energies
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-int * AtomicGPUMemoryT::compute(const int timestep, const int ago,
-                                const int inum_full, const int nall,
-                                double **host_x, int *host_type, double *boxlo,
-                                double *boxhi, int *tag, int **nspecial,
-                                int **special, const bool eflag,
-                                const bool vflag, const bool eatom,
-                                const bool vatom, int &host_start,
-                                const double cpu_time, bool &success) {
+int ** AtomicGPUMemoryT::compute(const int ago, const int inum_full,
+                                 const int nall, double **host_x, int *host_type,
+                                 double *sublo, double *subhi, int *tag,
+                                 int **nspecial, int **special, const bool eflag,
+                                 const bool vflag, const bool eatom,
+                                 const bool vatom, int &host_start,
+                                 int **ilist, int **jnum,
+                                 const double cpu_time, bool &success) {
   acc_timers();
   if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
     zero_timers();
     return NULL;
   }
 
-  hd_balancer.balance(cpu_time,nbor->gpu_nbor());
-  int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
-  atom->inum(inum);
+  hd_balancer.balance(cpu_time);
+  int inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
   host_start=inum;
 
   // Build neighbor list on GPU if necessary
   if (ago==0) {
     build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
-                    boxlo, boxhi, tag, nspecial, special, success);
+                    sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return NULL;
     hd_balancer.start_timer();
@@ -226,19 +249,21 @@ int * AtomicGPUMemoryT::compute(const int timestep, const int ago,
     hd_balancer.start_timer();
     atom->add_x_data(host_x,host_type);
   }
+  *ilist=nbor->host_ilist.begin();
+  *jnum=nbor->host_acc.begin();
 
   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom);
+  ans->copy_answers(eflag,vflag,eatom,vatom);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();
 
-  return device->nbor.host_nbor.begin();
+  return nbor->host_jlist.begin()-host_start;
 }
 
 template <class numtyp, class acctyp>
 double AtomicGPUMemoryT::host_memory_usage_atomic() const {
-  return device->atom.host_memory_usage()+
-         device->nbor.host_memory_usage()+4*sizeof(numtyp)+
-         sizeof(AtomicGPUMemory);
+  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
+         4*sizeof(numtyp)+sizeof(AtomicGPUMemory);
 }
 
 template <class numtyp, class acctyp>

diff --git a/lib/gpu/atomic_gpu_memory.h b/lib/gpu/atomic_gpu_memory.h
index 81de41f3b7..238a4d9c1e 100644
--- a/lib/gpu/atomic_gpu_memory.h
+++ b/lib/gpu/atomic_gpu_memory.h
@@ -18,8 +18,6 @@
 #ifndef ATOMIC_GPU_MEMORY_H
 #define ATOMIC_GPU_MEMORY_H
 
-#define BLOCK_1D 64
-
 #include "pair_gpu_device.h"
 #include "pair_gpu_balance.h"
 #include "mpi.h"
@@ -39,17 +37,28 @@ class AtomicGPUMemory {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init_atomic(const int nlocal, const int nall, const int max_nbors,
-                   const int maxspecial, const double cell_size,
-                   const double gpu_split, FILE *screen,
-                   const char *pair_program);
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init_atomic(const int nlocal, const int nall, const int max_nbors,
+                  const int maxspecial, const double cell_size,
+                  const double gpu_split, FILE *screen,
+                  const char *pair_program);
+
+  /// Estimate the overhead for GPU context changes and CPU driver
+  void estimate_gpu_overhead();
 
   /// Check if there is enough storage for atom arrays and realloc if not
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int inum, const int nall, bool &success) {
-    if (atom->resize(inum, nall, success))
+    if (atom->resize(nall, success))
       pos_tex.bind_float(atom->dev_x,4);
+    ans->resize(inum,success);
   }
 
   /// Check if there is enough storage for neighbors and realloc if not
@@ -85,13 +94,16 @@ class AtomicGPUMemory {
 
   /// Accumulate timers
   inline void acc_timers() {
-    if (nbor_time_avail) {
-      nbor->time_nbor.add_to_total();
-      nbor->time_kernel.add_to_total();
-      nbor_time_avail=false;
+    if (device->time_device()) {
+      if (nbor_time_avail) {
+        nbor->time_nbor.add_to_total();
+        nbor->time_kernel.add_to_total();
+        nbor_time_avail=false;
+      }
+      time_pair.add_to_total();
+      atom->acc_timers();
+      ans->acc_timers();
     }
-    time_pair.add_to_total();
-    atom->acc_timers();
   }
 
   /// Zero timers
@@ -99,6 +111,7 @@ class AtomicGPUMemory {
     nbor_time_avail=false;
    time_pair.zero();
     atom->zero_timers();
+    ans->zero_timers();
   }
 
   /// Copy neighbor list from host
@@ -108,24 +121,32 @@ class AtomicGPUMemory {
   /// Build neighbor list on device
   void build_nbor_list(const int inum, const int host_inum,
                        const int nall, double **host_x, int *host_type,
-                       double *boxlo, double *boxhi, int *tag, int **nspecial,
+                       double *sublo, double *subhi, int *tag, int **nspecial,
                        int **special, bool &success);
 
   /// Pair loop with host neighboring
-  void compute(const int timestep, const int f_ago, const int inum_full,
+  void compute(const int f_ago, const int inum_full,
               const int nall, double **host_x, int *host_type,
               int *ilist, int *numj, int **firstneigh, const bool eflag,
               const bool vflag, const bool eatom, const bool vatom,
               int &host_start, const double cpu_time, bool &success);
 
   /// Pair loop with device neighboring
-  int * compute(const int timestep, const int ago, const int inum_full,
-                const int nall, double **host_x, int *host_type, double *boxlo,
-                double *boxhi, int *tag, int **nspecial,
+  int * compute(const int ago, const int inum_full,
+                const int nall, double **host_x, int *host_type, double *sublo,
+                double *subhi, int *tag, int **nspecial,
                 int **special, const bool eflag, const bool vflag,
                 const bool eatom, const bool vatom, int &host_start,
                 const double cpu_time, bool &success);
 
+  /// Pair loop with device neighboring
+  int ** compute(const int ago, const int inum_full,
+                 const int nall, double **host_x, int *host_type, double *sublo,
+                 double *subhi, int *tag, int **nspecial,
+                 int **special, const bool eflag, const bool vflag,
+                 const bool eatom, const bool vatom, int &host_start,
+                 int **ilist, int **numj, const double cpu_time, bool &success);
+
   // -------------------------- DEVICE DATA -------------------------
 
   /// Device Properties and Atom and Neighbor storage
@@ -148,6 +169,9 @@ class AtomicGPUMemory {
   /// Atom Data
   PairGPUAtom<numtyp,acctyp> *atom;
 
+  // ------------------------ FORCE/ENERGY DATA -----------------------
+
+  PairGPUAns<numtyp,acctyp> *ans;
 
   // --------------------------- NBOR DATA ----------------------------
 
@@ -167,8 +191,10 @@ class AtomicGPUMemory {
 
  protected:
   bool _compiled;
-  int _block_size;
+  int _block_size, _threads_per_atom;
   double _max_bytes, _max_an_bytes;
+  double _gpu_overhead, _driver_overhead;
+  UCL_D_Vec<int> *_nbor_data;
 
   void compile_kernels(UCL_Device &dev, const char *pair_string);
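With init_atomic() now returning the integer codes listed in the header above (0, -1, -3, -4, -5) rather than a bool, a calling pair style can report a specific failure. A hypothetical caller-side translation of those codes; only the codes themselves come from the header, the messages and the helper are invented:

#include <cstdio>

void report_gpu_init(int err, FILE *screen) {
  switch (err) {
    case  0: break;                                                  // success
    case -1: fprintf(screen, "fix gpu not found\n"); break;
    case -3: fprintf(screen, "out of memory on the accelerator\n"); break;
    case -4: fprintf(screen, "GPU library not compiled for this GPU\n"); break;
    case -5: fprintf(screen, "double precision unsupported on this card\n"); break;
    default: fprintf(screen, "GPU init failed with code %d\n", err);
  }
}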
diff --git a/lib/gpu/charge_gpu_memory.cpp b/lib/gpu/charge_gpu_memory.cpp
index ce43fdfda1..412596f5f2 100644
--- a/lib/gpu/charge_gpu_memory.cpp
+++ b/lib/gpu/charge_gpu_memory.cpp
@@ -23,23 +23,28 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 
 template <class numtyp, class acctyp>
 ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) {
   device=&pair_gpu_device;
+  ans=new PairGPUAns<numtyp,acctyp>();
+  nbor=new PairGPUNbor();
 }
 
 template <class numtyp, class acctyp>
 ChargeGPUMemoryT::~ChargeGPUMemory() {
+  delete ans;
+  delete nbor;
 }
 
 template <class numtyp, class acctyp>
 int ChargeGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
-  return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
 }
 
 template <class numtyp, class acctyp>
-bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
-                                   const int max_nbors, const int maxspecial,
-                                   const double cell_size,
-                                   const double gpu_split, FILE *_screen,
-                                   const char *pair_program) {
+int ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
+                                  const int max_nbors, const int maxspecial,
+                                  const double cell_size,
+                                  const double gpu_split, FILE *_screen,
+                                  const char *pair_program) {
   nbor_time_avail=false;
   screen=_screen;
 
@@ -48,24 +53,31 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
     gpu_nbor=true;
 
   int _gpu_host=0;
-  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
   if (host_nlocal>0)
     _gpu_host=1;
 
-  if (!device->init(true,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
-                    _gpu_host,max_nbors,cell_size,false))
-    return false;
+  _threads_per_atom=device->threads_per_charge();
+  if (_threads_per_atom>1 && gpu_nbor==false) {
+    nbor->packing(true);
+    _nbor_data=&(nbor->dev_packed);
+  } else
+    _nbor_data=&(nbor->dev_nbor);
+
+  int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
+                           maxspecial,_gpu_host,max_nbors,cell_size,false);
+  if (success!=0)
+    return success;
+
   ucl_device=device->gpu;
   atom=&device->atom;
-  nbor=&device->nbor;
 
-  _block_size=BLOCK_1D;
-  if (static_cast<int>(_block_size)>ucl_device->group_size())
-    _block_size=ucl_device->group_size();
+  _block_size=device->pair_block_size();
+  _block_bio_size=device->block_bio_pair();
   compile_kernels(*ucl_device,pair_program);
 
   // Initialize host-device load balancer
-  hd_balancer.init(device,gpu_split);
+  hd_balancer.init(device,gpu_nbor,gpu_split);
 
   // Initialize timers for the selected GPU
   time_pair.init(*ucl_device);
@@ -74,9 +86,14 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
 
   pos_tex.bind_float(atom->dev_x,4);
   q_tex.bind_float(atom->dev_q,1);
 
-  _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
 
-  return true;
+  return success;
+}
+
+template <class numtyp, class acctyp>
+void ChargeGPUMemoryT::estimate_gpu_overhead() {
+  device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
 }
 
 template <class numtyp, class acctyp>
@@ -84,7 +101,10 @@ void ChargeGPUMemoryT::clear_atomic() {
   // Output any timing information
   acc_timers();
   double avg_split=hd_balancer.all_avg_split();
-  device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);
+  _gpu_overhead*=hd_balancer.timestep();
+  _driver_overhead*=hd_balancer.timestep();
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
+                       _gpu_overhead,_driver_overhead,_threads_per_atom,screen);
 
   if (_compiled) {
     k_pair_fast.clear();
@@ -109,7 +129,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
 
   nbor_time_avail=true;
 
-  int mn=nbor->max_nbor_loop(inum,numj);
+  int mn=nbor->max_nbor_loop(inum,numj,ilist);
   resize_atom(inum,nall,success);
   resize_local(inum,mn,success);
   if (!success)
@@ -117,7 +137,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
 
   nbor->get_host(inum,ilist,numj,firstneigh,block_size());
 
-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
 
@@ -131,8 +151,8 @@ template <class numtyp, class acctyp>
 inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
                                               const int host_inum,
                                               const int nall, double **host_x,
-                                              int *host_type, double *boxlo,
-                                              double *boxhi, int *tag,
+                                              int *host_type, double *sublo,
+                                              double *subhi, int *tag,
                                               int **nspecial, int **special,
                                               bool &success) {
   nbor_time_avail=true;
@@ -145,10 +165,10 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
   atom->cast_copy_x(host_x,host_type);
 
   int mn;
-  nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
+  nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
                         nspecial, special, success, mn);
 
-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
 }
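The compute() entry points below hand the first inum atoms to the device and the rest (from host_start up) back to the CPU, with hd_balancer adjusting that split from the measured times when gpu_split is left dynamic. A deliberately simplified sketch of such an update rule; the real PairGPUBalance logic differs in detail:

// Move work toward whichever side finished early (simplified).
class SimpleBalancer {
 public:
  explicit SimpleBalancer(double split = 0.9) : _split(split) {}

  // gpu_time/cpu_time: seconds each side spent on its share last step.
  void update(double gpu_time, double cpu_time) {
    if (gpu_time <= 0.0 || cpu_time <= 0.0) return;
    // Split that would make both sides finish together at current rates.
    double target =
        _split * cpu_time / (_split * cpu_time + (1.0 - _split) * gpu_time);
    _split = 0.75 * _split + 0.25 * target;   // damp oscillations
  }

  int gpu_count(int inum_full) const {
    return static_cast<int>(_split * inum_full);  // atoms sent to the device
  }

 private:
  double _split;   // fraction of atoms handled on the device
};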
 
@@ -157,24 +177,26 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
-                               const int inum_full, const int nall,
-                               double **host_x, int *host_type,
-                               int *ilist, int *numj, int **firstneigh,
-                               const bool eflag, const bool vflag,
-                               const bool eatom, const bool vatom,
-                               int &host_start, const double cpu_time,
-                               bool &success, double *host_q) {
+void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full,
+                               const int nall, double **host_x, int *host_type,
+                               int *ilist, int *numj, int **firstneigh,
+                               const bool eflag, const bool vflag,
+                               const bool eatom, const bool vatom,
+                               int &host_start, const double cpu_time,
+                               bool &success, double *host_q,
+                               const int nlocal, double *boxlo, double *prd) {
   acc_timers();
   if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
     zero_timers();
     return;
   }
 
   int ago=hd_balancer.ago_first(f_ago);
-  int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
-                               nbor->gpu_nbor());
-  atom->inum(inum);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
+  ans->inum(inum);
   host_start=inum;
 
   if (ago==0) {
@@ -187,10 +209,14 @@ void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
   atom->cast_q_data(host_q);
   hd_balancer.start_timer();
   atom->add_x_data(host_x,host_type);
-  atom->add_other_data();
+  atom->add_q_data();
+
+  device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
+                     boxlo, prd);
 
   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();
 }
 
@@ -198,30 +224,33 @@
 // Reneighbor on GPU if necessary and then compute forces, virials, energies
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-int * ChargeGPUMemoryT::compute(const int timestep, const int ago,
-                                const int inum_full, const int nall,
-                                double **host_x, int *host_type, double *boxlo,
-                                double *boxhi, int *tag, int **nspecial,
-                                int **special, const bool eflag,
+int** ChargeGPUMemoryT::compute(const int ago, const int inum_full,
+                                const int nall, double **host_x, int *host_type,
+                                double *sublo, double *subhi, int *tag,
+                                int **nspecial, int **special, const bool eflag,
                                 const bool vflag, const bool eatom,
                                 const bool vatom, int &host_start,
+                                int **ilist, int **jnum,
                                 const double cpu_time, bool &success,
-                                double *host_q) {
+                                double *host_q, double *boxlo, double *prd) {
   acc_timers();
   if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
     zero_timers();
     return NULL;
   }
 
-  hd_balancer.balance(cpu_time,nbor->gpu_nbor());
-  int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
-  atom->inum(inum);
+  hd_balancer.balance(cpu_time);
+  int inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
   host_start=inum;
 
   // Build neighbor list on GPU if necessary
   if (ago==0) {
     build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
-                    boxlo, boxhi, tag, nspecial, special, success);
+                    sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return NULL;
     atom->cast_q_data(host_q);
@@ -232,20 +261,25 @@ int * ChargeGPUMemoryT::compute(const int timestep, const int ago,
     hd_balancer.start_timer();
     atom->add_x_data(host_x,host_type);
   }
-  atom->add_other_data();
+  atom->add_q_data();
+
+  *ilist=nbor->host_ilist.begin();
+  *jnum=nbor->host_acc.begin();
+
+  device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
+                     boxlo, prd);
 
   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom);
+  ans->copy_answers(eflag,vflag,eatom,vatom);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();
 
-  return device->nbor.host_nbor.begin();
+  return nbor->host_jlist.begin()-host_start;
 }
 
 template <class numtyp, class acctyp>
 double ChargeGPUMemoryT::host_memory_usage_atomic() const {
-  return device->atom.host_memory_usage()+
-         device->nbor.host_memory_usage()+4*sizeof(numtyp)+
-         sizeof(ChargeGPUMemory);
+  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
+         4*sizeof(numtyp)+sizeof(ChargeGPUMemory);
 }
 
 template <class numtyp, class acctyp>

diff --git a/lib/gpu/charge_gpu_memory.h b/lib/gpu/charge_gpu_memory.h
index d18857e4d6..768f0e0c08 100644
--- a/lib/gpu/charge_gpu_memory.h
+++ b/lib/gpu/charge_gpu_memory.h
@@ -18,8 +18,6 @@
 #ifndef CHARGE_GPU_MEMORY_H
 #define CHARGE_GPU_MEMORY_H
 
-#define BLOCK_1D 64
-
 #include "pair_gpu_device.h"
 #include "pair_gpu_balance.h"
 #include "mpi.h"
@@ -39,19 +37,30 @@ class ChargeGPUMemory {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init_atomic(const int nlocal, const int nall, const int max_nbors,
-                   const int maxspecial, const double cell_size,
-                   const double gpu_split, FILE *screen,
-                   const char *pair_program);
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init_atomic(const int nlocal, const int nall, const int max_nbors,
+                  const int maxspecial, const double cell_size,
+                  const double gpu_split, FILE *screen,
+                  const char *pair_program);
+
+  /// Estimate the overhead for GPU context changes and CPU driver
+  void estimate_gpu_overhead();
 
   /// Check if there is enough storage for atom arrays and realloc if not
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int inum, const int nall, bool &success) {
-    if (atom->resize(inum, nall, success)) {
+    if (atom->resize(nall, success)) {
       pos_tex.bind_float(atom->dev_x,4);
       q_tex.bind_float(atom->dev_q,1);
     }
+    ans->resize(inum,success);
   }
 
   /// Check if there is enough storage for neighbors and realloc if not
@@ -87,13 +96,16 @@ class ChargeGPUMemory {
 
   /// Accumulate timers
   inline void acc_timers() {
-    if (nbor_time_avail) {
-      nbor->time_nbor.add_to_total();
-      nbor->time_kernel.add_to_total();
-      nbor_time_avail=false;
+    if (device->time_device()) {
+      if (nbor_time_avail) {
+        nbor->time_nbor.add_to_total();
+        nbor->time_kernel.add_to_total();
+        nbor_time_avail=false;
+      }
+      time_pair.add_to_total();
+      atom->acc_timers();
+      ans->acc_timers();
     }
-    time_pair.add_to_total();
-    atom->acc_timers();
   }
 
   /// Zero timers
@@ -101,6 +113,7 @@ class ChargeGPUMemory {
     nbor_time_avail=false;
     time_pair.zero();
     atom->zero_timers();
+    ans->zero_timers();
   }
 
   /// Copy neighbor list from host
@@ -110,24 +123,25 @@ class ChargeGPUMemory {
   /// Build neighbor list on device
   void build_nbor_list(const int inum, const int host_inum,
                        const int nall, double **host_x, int *host_type,
-                       double *boxlo, double *boxhi, int *tag, int **nspecial,
+                       double *sublo, double *subhi, int *tag, int **nspecial,
                        int **special, bool &success);
 
   /// Pair loop with host neighboring
-  void compute(const int timestep, const int f_ago, const int inum_full,
-               const int nall, double **host_x, int *host_type,
-               int *ilist, int *numj, int **firstneigh, const bool eflag,
-               const bool vflag, const bool eatom, const bool vatom,
-               int &host_start, const double cpu_time, bool &success,
-               double *charge);
+  void compute(const int f_ago, const int inum_full, const int nall,
+               double **host_x, int *host_type, int *ilist, int *numj,
+               int **firstneigh, const bool eflag, const bool vflag,
+               const bool eatom, const bool vatom, int &host_start,
+               const double cpu_time, bool &success, double *charge,
+               const int nlocal, double *boxlo, double *prd);
 
   /// Pair loop with device neighboring
-  int * compute(const int timestep, const int ago, const int inum_full,
-                const int nall, double **host_x, int *host_type, double *boxlo,
-                double *boxhi, int *tag, int **nspecial,
+  int** compute(const int ago, const int inum_full, const int nall,
+                double **host_x, int *host_type, double *sublo,
+                double *subhi, int *tag, int **nspecial,
                 int **special, const bool eflag, const bool vflag,
                 const bool eatom, const bool vatom, int &host_start,
-                const double cpu_time, bool &success, double *charge);
+                int **ilist, int **numj, const double cpu_time, bool &success,
+                double *charge, double *boxlo, double *prd);
 
   // -------------------------- DEVICE DATA -------------------------
 
@@ -152,6 +166,10 @@ class ChargeGPUMemory {
 
   PairGPUAtom<numtyp,acctyp> *atom;
 
+  // ------------------------ FORCE/ENERGY DATA -----------------------
+
+  PairGPUAns<numtyp,acctyp> *ans;
+
   // --------------------------- NBOR DATA ----------------------------
 
   /// Neighbor data
@@ -171,8 +189,10 @@ class ChargeGPUMemory {
 
  protected:
   bool _compiled;
-  int _block_size;
+  int _block_size, _block_bio_size, _threads_per_atom;
   double _max_bytes, _max_an_bytes;
+  double _gpu_overhead, _driver_overhead;
+  UCL_D_Vec<int> *_nbor_data;
 
   void compile_kernels(UCL_Device &dev, const char *pair_string);

diff --git a/lib/gpu/cmm_cut_gpu.cpp b/lib/gpu/cmm_cut_gpu.cpp
index 53976ff7e8..7be958615a 100644
--- a/lib/gpu/cmm_cut_gpu.cpp
+++ b/lib/gpu/cmm_cut_gpu.cpp
@@ -28,12 +28,12 @@ static CMM_GPU_Memory<PRECISION,ACC_PRECISION> CMMMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
-                  double **host_lj1, double **host_lj2, double **host_lj3,
-                  double **host_lj4, double **offset, double *special_lj,
-                  const int inum, const int nall, const int max_nbors,
-                  const int maxspecial, const double cell_size, int &gpu_mode,
-                  FILE *screen) {
+int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
+                 double **host_lj1, double **host_lj2, double **host_lj3,
+                 double **host_lj4, double **offset, double *special_lj,
+                 const int inum, const int nall, const int max_nbors,
+                 const int maxspecial, const double cell_size, int &gpu_mode,
+                 FILE *screen) {
   CMMMF.clear();
   gpu_mode=CMMMF.device->gpu_mode();
   double gpu_split=CMMMF.device->particle_split();
@@ -54,13 +54,11 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
-                            host_lj4, offset, special_lj, inum, nall, 300,
-                            maxspecial, cell_size, gpu_split, screen);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
+                       host_lj4, offset, special_lj, inum, nall, 300,
+                       maxspecial, cell_size, gpu_split, screen);
 
   CMMMF.device->world_barrier();
   if (message)
@@ -75,45 +73,45 @@
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
-                              host_lj4, offset, special_lj, inum, nall, 300,
-                              maxspecial, cell_size, gpu_split,
-                              screen);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
+                         host_lj4, offset, special_lj, inum, nall, 300,
+                         maxspecial, cell_size, gpu_split, screen);
+
     CMMMF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+
+  if (init_ok==0)
+    CMMMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void cmm_gpu_clear() {
   CMMMF.clear();
 }
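cmm_gpu_init() above funnels every rank through the same initialization, but staggered: world rank 0 goes first (it compiles the kernels and prints the messages), then, after a world barrier, one rank per device at a time with gpu_barrier() between rounds, so ranks sharing a GPU do not race. The same choreography reduced to plain MPI; init_device_side() is a stand-in for CMMMF.init(...), and the loop bound is an assumption:

#include <mpi.h>

static int init_device_side() { return 0; }  // stands in for CMMMF.init(...)

int staggered_init(MPI_Comm world, MPI_Comm per_gpu, int procs_per_gpu) {
  int world_me, gpu_rank;
  MPI_Comm_rank(world, &world_me);
  MPI_Comm_rank(per_gpu, &gpu_rank);

  int init_ok = 0;
  if (world_me == 0)               // rank 0 initializes (and compiles) first
    init_ok = init_device_side();
  MPI_Barrier(world);

  for (int i = 0; i < procs_per_gpu; i++) {  // one rank per device per round
    if (gpu_rank == i && world_me != 0)
      init_ok = init_device_side();
    MPI_Barrier(per_gpu);
  }
  return init_ok;
}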
init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); CMMMF.device->world_barrier(); if (message) @@ -75,45 +73,45 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, - screen); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + CMMMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + CMMMF.estimate_gpu_overhead(); + return init_ok; } void cmm_gpu_clear() { CMMMF.clear(); } -int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** cmm_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success) { - return CMMMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success); + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); } -void cmm_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success) { - CMMMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, +void cmm_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + CMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } diff --git a/lib/gpu/cmm_cut_gpu_kernel.cu b/lib/gpu/cmm_cut_gpu_kernel.cu index 47504f621e..08cc31ed7f 100644 --- a/lib/gpu/cmm_cut_gpu_kernel.cu +++ b/lib/gpu/cmm_cut_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef CMM_GPU_KERNEL #define CMM_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -46,7 +44,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; #ifdef _DOUBLE_DOUBLE @@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define __inline inline #define fetch_pos(i,y) x_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -82,40 +82,56 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp 
*sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -183,49 +238,64 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in,__global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + __global numtyp* sp_lj_in,__global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; - if (ii<4) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/cmm_cut_gpu_memory.cpp 
b/lib/gpu/cmm_cut_gpu_memory.cpp index e5a83e5872..8a5949c9e7 100644 --- a/lib/gpu/cmm_cut_gpu_memory.cpp +++ b/lib/gpu/cmm_cut_gpu_memory.cpp @@ -42,22 +42,26 @@ int CMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq, - int **host_cg_type, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,cmm_cut_gpu_kernel); +int CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,cmm_cut_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int cmm_types=ntypes; shared_types=false; - if (cmm_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - cmm_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (cmm_types<=max_shared_types && this->_block_size>=max_shared_types) { + cmm_types=max_shared_types; shared_types=true; } _cmm_types=cmm_types; @@ -84,7 +88,7 @@ bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq, _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -122,9 +126,10 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -133,16 +138,18 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, - &ainum, &anall, &nbor_pitch); + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &anall, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, - &anall, &nbor_pitch); + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &anall, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/cmm_cut_gpu_memory.h b/lib/gpu/cmm_cut_gpu_memory.h index 8099d5b9c4..fff90e477d 100644 --- a/lib/gpu/cmm_cut_gpu_memory.h +++ b/lib/gpu/cmm_cut_gpu_memory.h @@ -29,13 +29,20 @@ class CMM_GPU_Memory : public AtomicGPUMemory { /// Clear any previous data and set up for a new 
LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq, int **host_cg_type,
-            double **host_lj1, double **host_lj2, double **host_lj3,
-            double **host_lj4, double **host_offset, double *host_special_lj,
-            const int nlocal, const int nall, const int max_nbors,
-            const int maxspecial, const double cell_size,
-            const double gpu_split, FILE *screen);
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, int **host_cg_type,
+           double **host_lj1, double **host_lj2, double **host_lj3,
+           double **host_lj4, double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
diff --git a/lib/gpu/cmmc_long_gpu.cpp b/lib/gpu/cmmc_long_gpu.cpp
index a3fcf336c6..a6f3d090af 100644
--- a/lib/gpu/cmmc_long_gpu.cpp
+++ b/lib/gpu/cmmc_long_gpu.cpp
@@ -28,14 +28,14 @@ static CMML_GPU_Memory<PRECISION,ACC_PRECISION> CMMLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
-                   double **host_lj1, double **host_lj2, double **host_lj3,
-                   double **host_lj4, double **offset, double *special_lj,
-                   const int inum, const int nall, const int max_nbors,
-                   const int maxspecial, const double cell_size, int &gpu_mode,
-                   FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
-                   double *host_special_coul, const double qqrd2e,
-                   const double g_ewald) {
+int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
+                  double **host_lj1, double **host_lj2, double **host_lj3,
+                  double **host_lj4, double **offset, double *special_lj,
+                  const int inum, const int nall, const int max_nbors,
+                  const int maxspecial, const double cell_size, int &gpu_mode,
+                  FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
+                  double *host_special_coul, const double qqrd2e,
+                  const double g_ewald) {
   CMMLMF.clear();
   gpu_mode=CMMLMF.device->gpu_mode();
   double gpu_split=CMMLMF.device->particle_split();
@@ -56,15 +56,12 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
-                             host_lj3, host_lj4, offset, special_lj, inum,
-                             nall, 300, maxspecial, cell_size, gpu_split,
-                             screen, host_cut_ljsq, host_cut_coulsq,
-                             host_special_coul, qqrd2e,g_ewald);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
+                        host_lj4, offset, special_lj, inum, nall, 300,
+                        maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
+                        host_cut_coulsq, host_special_coul, qqrd2e,g_ewald);
 
   CMMLMF.device->world_barrier();
   if (message)
@@ -79,48 +76,51 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
               last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i &&
world_me!=0) { - bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, - host_lj3, host_lj4, offset, special_lj, inum, - nall, 300, maxspecial, cell_size, gpu_split, - screen, host_cut_ljsq, host_cut_coulsq, - host_special_coul, qqrd2e, g_ewald); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, + host_cut_ljsq, host_cut_coulsq, host_special_coul, + qqrd2e, g_ewald); CMMLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + CMMLMF.estimate_gpu_overhead(); + return init_ok; } void cmml_gpu_clear() { CMMLMF.clear(); } -int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** cmml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q) { - return CMMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success, host_q); + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return CMMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q,boxlo,prd); } -void cmml_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q) { - CMMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, +void cmml_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + CMMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, - host_q); + host_q,nlocal,boxlo,prd); } double cmml_gpu_bytes() { diff --git a/lib/gpu/cmmc_long_gpu_kernel.cu b/lib/gpu/cmmc_long_gpu_kernel.cu index 4a19b5fe03..5153cb5016 100644 --- a/lib/gpu/cmmc_long_gpu_kernel.cu +++ b/lib/gpu/cmmc_long_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef CMML_GPU_KERNEL #define CMML_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -54,7 +52,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; texture q_tex; @@ -90,6 +88,8 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -100,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, 
__global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, __global numtyp *q_ , + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -117,29 +121,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -234,51 +291,67 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const numtyp qqrd2e, const numtyp g_ewald, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + 
red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/cmmc_long_gpu_memory.cpp b/lib/gpu/cmmc_long_gpu_memory.cpp index 9a63bc5628..e2f99fceca 100644 --- a/lib/gpu/cmmc_long_gpu_memory.cpp +++ b/lib/gpu/cmmc_long_gpu_memory.cpp @@ -43,26 +43,30 @@ int CMML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq, - int **host_cg_type, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, - double **host_cut_ljsq, - const double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,cmmc_long_gpu_kernel); +int CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, + const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,cmmc_long_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -95,7 +99,7 @@ bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq, _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -133,9 +137,10 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -144,19 +149,21 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, - &_qqrd2e, &_g_ewald); + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), 
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
-                     &_cut_coulsq, &_qqrd2e, &_g_ewald);
+                     &_cut_coulsq, &_qqrd2e, &_g_ewald,
+                     &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/cmmc_long_gpu_memory.h b/lib/gpu/cmmc_long_gpu_memory.h
index 8192c78249..45090368a5 100644
--- a/lib/gpu/cmmc_long_gpu_memory.h
+++ b/lib/gpu/cmmc_long_gpu_memory.h
@@ -29,15 +29,22 @@ class CMML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq, int ** cg_type,
-            double **host_lj1, double **host_lj2, double **host_lj3,
-            double **host_lj4, double **host_offset, double *host_special_lj,
-            const int nlocal, const int nall, const int max_nbors,
-            const int maxspecial, const double cell_size,
-            const double gpu_split, FILE *screen, double **host_cut_ljsq,
-            const double host_cut_coulsq, double *host_special_coul,
-            const double qqrd2e, const double g_ewald);
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, int ** cg_type,
+           double **host_lj1, double **host_lj2, double **host_lj3,
+           double **host_lj4, double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen, double **host_cut_ljsq,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double g_ewald);
 
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
diff --git a/lib/gpu/crml_gpu.cpp b/lib/gpu/crml_gpu.cpp
index 7458300907..1e59562ed5 100644
--- a/lib/gpu/crml_gpu.cpp
+++ b/lib/gpu/crml_gpu.cpp
@@ -28,16 +28,16 @@ static CRML_GPU_Memory<PRECISION,ACC_PRECISION> CRMLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
-                   double **host_lj2, double **host_lj3, double **host_lj4,
-                   double **offset, double *special_lj, const int inum,
-                   const int nall, const int max_nbors, const int maxspecial,
-                   const double cell_size, int &gpu_mode, FILE *screen,
-                   double host_cut_ljsq, double host_cut_coulsq,
-                   double *host_special_coul, const double qqrd2e,
-                   const double g_ewald, const double cut_lj_innersq,
-                   const double denom_lj, double **epsilon,
-                   double **sigma, const bool mix_arithmetic) {
+int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
+                  double **host_lj2, double **host_lj3, double **host_lj4,
+                  double **offset, double *special_lj, const int inum,
+                  const int nall, const int max_nbors, const int maxspecial,
+                  const double cell_size, int &gpu_mode, FILE *screen,
+                  double host_cut_ljsq, double
host_cut_coulsq,
+                  double *host_special_coul, const double qqrd2e,
+                  const double g_ewald, const double cut_lj_innersq,
+                  const double denom_lj, double **epsilon,
+                  double **sigma, const bool mix_arithmetic) {
   CRMLMF.clear();
   gpu_mode=CRMLMF.device->gpu_mode();
   double gpu_split=CRMLMF.device->particle_split();
@@ -58,16 +58,13 @@ bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
-                             host_lj4, offset, special_lj, inum, nall, 300,
-                             maxspecial, cell_size, gpu_split, screen,
-                             host_cut_ljsq, host_cut_coulsq, host_special_coul,
-                             qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
-                             epsilon,sigma,mix_arithmetic);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                offset, special_lj, inum, nall, 300, maxspecial, cell_size,
+                gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
+                host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
+                epsilon,sigma,mix_arithmetic);
 
   CRMLMF.device->world_barrier();
   if (message)
@@ -82,50 +79,54 @@ bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
               last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
-                               host_lj4, offset, special_lj, inum, nall, 300,
-                               maxspecial, cell_size, gpu_split,
-                               screen, host_cut_ljsq, host_cut_coulsq,
-                               host_special_coul, qqrd2e, g_ewald,
-                               cut_lj_innersq, denom_lj, epsilon, sigma,
-                               mix_arithmetic);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
+                          host_lj4, offset, special_lj, inum, nall, 300,
+                          maxspecial, cell_size, gpu_split, screen,
+                          host_cut_ljsq, host_cut_coulsq, host_special_coul,
+                          qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,
+                          sigma, mix_arithmetic);
+
     CRMLMF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+
+  if (init_ok==0)
+    CRMLMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void crml_gpu_clear() {
   CRMLMF.clear();
 }
 
-int * crml_gpu_compute_n(const int timestep, const int ago, const int inum_full,
+int** crml_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
-                         double *boxlo, double *boxhi, int *tag, int **nspecial,
+                         double *sublo, double *subhi, int *tag, int **nspecial,
                          int **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
-                         const double cpu_time, bool &success, double *host_q) {
-  return CRMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
-                        boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                        vatom, host_start, cpu_time, success, host_q);
+                         int **ilist, int **jnum, const double cpu_time,
+                         bool &success, double *host_q, double *boxlo,
+                         double *prd) {
+  return CRMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                        subhi, tag, nspecial, special, eflag, vflag, eatom,
+                        vatom, host_start, ilist, jnum, cpu_time, success,
+                        host_q, boxlo, prd);
 }
 
-void crml_gpu_compute(const int timestep, const int ago, const int inum_full,
-                      const int nall, double **host_x, int *host_type,
-                      int *ilist, int *numj, int **firstneigh,
-                      const bool eflag, const bool vflag, const bool eatom,
-                      const bool vatom, int &host_start, const double cpu_time,
-                      bool &success, double *host_q) {
-
CRMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, - firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, - host_q); +void crml_gpu_compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q, const int nlocal, + double *boxlo, double *prd) { + CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, + eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); } double crml_gpu_bytes() { diff --git a/lib/gpu/crml_gpu_kernel.cu b/lib/gpu/crml_gpu_kernel.cu index 6ba6eaedca..63ce924581 100644 --- a/lib/gpu/crml_gpu_kernel.cu +++ b/lib/gpu/crml_gpu_kernel.cu @@ -54,7 +54,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; texture q_tex; @@ -90,6 +90,7 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_BIO_PAIR 64 #endif @@ -98,18 +99,22 @@ __inline float fetch_q(const int& i, const float *q) __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, - const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, + const int lj_types, __global numtyp *sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const numtyp denom_lj, const numtyp cut_bothsq, - const numtyp cut_ljsq, const numtyp cut_lj_innersq) { + const numtyp cut_ljsq, const numtyp cut_lj_innersq, + const int t_per_atom) { + + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -120,29 +125,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -240,50 +298,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in, __global numtyp* sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_, const numtyp 
cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald, - const numtyp denom_lj, const numtyp cut_bothsq, - const numtyp cut_ljsq, - const numtyp cut_lj_innersq) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const numtyp denom_lj, + const numtyp cut_bothsq, const numtyp cut_ljsq, + const numtyp cut_lj_innersq, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp2 ljd[MAX_BIO_SHARED_TYPES]; __local numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - ljd[ii]=ljd_in[ii]; - ljd[ii+64]=ljd_in[ii+64]; - - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + ljd[tid]=ljd_in[tid]; + if (tid+BLOCK_BIO_PAIR1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/crml_gpu_memory.cpp b/lib/gpu/crml_gpu_memory.cpp index e877503e87..6661f67585 100644 --- a/lib/gpu/crml_gpu_memory.cpp +++ b/lib/gpu/crml_gpu_memory.cpp @@ -43,7 +43,7 @@ int CRML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool CRML_GPU_MemoryT::init(const int ntypes, +int CRML_GPU_MemoryT::init(const int ntypes, double host_cut_bothsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, @@ -56,20 +56,24 @@ bool CRML_GPU_MemoryT::init(const int ntypes, const double g_ewald, const double cut_lj_innersq, const double denom_lj, double **epsilon, double **sigma, const bool mix_arithmetic) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,crml_gpu_kernel); + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,crml_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (this->_block_size>=64 && mix_arithmetic) + if (this->_block_bio_size>=64 && mix_arithmetic) shared_types=true; _lj_types=lj_types; // Allocate a host write buffer for data initialization int h_size=lj_types*lj_types; - if (h_sizedevice->max_bio_shared_types(); + if (h_size host_write(h_size*32,*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int i=0; iatom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, host_lj3,host_lj4); - ljd.alloc(MAX_BIO_SHARED_TYPES,*(this->ucl_device),UCL_READ_ONLY); + ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma); 
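// For reference, the reduction that this patch threads through every pair
// kernel (the red_acc loops above) is a standard shared-memory tree
// reduction over the t_per_atom threads assigned to each atom. The sketch
// below is illustrative only, not library code: the names (t_per_atom,
// red_acc, BLOCK_PAIR) mirror the kernels, and it assumes t_per_atom is a
// power of two and blockDim.x equals BLOCK_PAIR. The kernels above omit
// barriers in the reduction loop, presumably relying on warp-synchronous
// execution with t_per_atom no larger than a warp; this sketch uses
// explicit __syncthreads() so it is correct for any group size.

#define BLOCK_PAIR 64

__global__ void reduce_force_x(const float *partial, float *force_x,
                               const int inum, const int t_per_atom) {
  int tid=threadIdx.x;
  // Each atom ii is handled by t_per_atom consecutive threads; offset is
  // this thread's slot within its atom's group.
  int ii=blockIdx.x*(blockDim.x/t_per_atom)+tid/t_per_atom;
  int offset=tid%t_per_atom;

  float f=0.0f;
  if (ii<inum)
    f=partial[ii*t_per_atom+offset];   // this thread's partial sum

  __shared__ float red_acc[BLOCK_PAIR];
  red_acc[tid]=f;
  __syncthreads();

  // Tree reduction: halve the stride until each group's partial sums have
  // been combined into the group's first slot (offset 0).
  for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
    if (offset<s)
      red_acc[tid]+=red_acc[tid+s];
    __syncthreads();
  }

  if (ii<inum && offset==0)
    force_x[ii]=red_acc[tid];
}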
   sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
@@ -99,7 +103,7 @@ bool CRML_GPU_MemoryT::init(const int ntypes,
 
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
@@ -125,7 +129,7 @@ double CRML_GPU_MemoryT::host_memory_usage() const {
 template <class numtyp, class acctyp>
 void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   // Compute the block size and grid size to keep all cores busy
-  const int BX=this->block_size();
+  const int BX=this->_block_bio_size;
   int eflag, vflag;
   if (_eflag)
     eflag=1;
@@ -137,9 +141,10 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
 
-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
 
-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -147,21 +152,24 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
                           &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch,
                           &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e,
                           &_g_ewald, &_denom_lj, &_cut_bothsq,
-                          &_cut_ljsq, &_cut_lj_innersq);
+                          &_cut_ljsq, &_cut_lj_innersq,
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &_lj_types,
                      &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
                      &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
-                     &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq);
+                     &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
+                     &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/crml_gpu_memory.h b/lib/gpu/crml_gpu_memory.h
index 5520cd3a17..a474d5982d 100644
--- a/lib/gpu/crml_gpu_memory.h
+++ b/lib/gpu/crml_gpu_memory.h
@@ -29,17 +29,24 @@ class CRML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double host_cut_bothsq,
-            double **host_lj1, double **host_lj2, double **host_lj3,
-            double **host_lj4, double **host_offset, double *host_special_lj,
-            const int nlocal, const int nall, const int max_nbors,
-            const int maxspecial, const double cell_size,
-            const double gpu_split, FILE *screen, double host_cut_ljsq,
-            const double host_cut_coulsq, double *host_special_coul,
-            const double qqrd2e, const double g_ewald,
-            const double cut_lj_innersq, const double denom_lj,
-            double **epsilon, double **sigma, const bool mix_arithmetic);
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int
init(const int ntypes, double host_cut_bothsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald, + const double cut_lj_innersq, const double denom_lj, + double **epsilon, double **sigma, const bool mix_arithmetic); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/gb_gpu.cpp b/lib/gpu/gb_gpu.cpp index 5ca88fd70f..70eb4d9344 100644 --- a/lib/gpu/gb_gpu.cpp +++ b/lib/gpu/gb_gpu.cpp @@ -49,14 +49,14 @@ void gb_gpu_pack_nbors(GBMT &gbm, const int GX, const int BX, const int start, // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool gb_gpu_init(const int ntypes, const double gamma, - const double upsilon, const double mu, double **shape, - double **well, double **cutsq, double **sigma, - double **epsilon, double *host_lshape, int **form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const double cell_size, int &gpu_mode, FILE *screen) { +int gb_gpu_init(const int ntypes, const double gamma, + const double upsilon, const double mu, double **shape, + double **well, double **cutsq, double **sigma, + double **epsilon, double *host_lshape, int **form, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const double cell_size, int &gpu_mode, FILE *screen) { GBMF.clear(); gpu_mode=GBMF.device->gpu_mode(); double gpu_split=GBMF.device->particle_split(); @@ -77,14 +77,12 @@ bool gb_gpu_init(const int ntypes, const double gamma, fflush(screen); } - if (world_me==0) { - bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, - sigma, epsilon, host_lshape, form, host_lj1, - host_lj2, host_lj3, host_lj4, offset, special_lj, - inum, nall, max_nbors, cell_size, gpu_split, screen); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, + sigma, epsilon, host_lshape, form, host_lj1, + host_lj2, host_lj3, host_lj4, offset, special_lj, + inum, nall, max_nbors, cell_size, gpu_split, screen); GBMF.device->world_barrier(); if (message) @@ -99,22 +97,22 @@ bool gb_gpu_init(const int ntypes, const double gamma, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, - sigma, epsilon, host_lshape, form, host_lj1, - host_lj2, host_lj3, host_lj4, offset, special_lj, - inum, nall, max_nbors, cell_size, gpu_split, - screen); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma, + epsilon, host_lshape, form, host_lj1, host_lj2, + host_lj3, host_lj4, offset, special_lj, inum, nall, + max_nbors, cell_size, gpu_split, screen); + GBMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) 
fprintf(screen,"\n"); - return true; + + if (init_ok==0) + GBMF.estimate_gpu_overhead(); + return init_ok; } // --------------------------------------------------------------------------- @@ -131,8 +129,8 @@ template inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum, const int host_inum, const int nall, double **host_x, double **host_quat, - int *host_type, double *boxlo, - double *boxhi, bool &success) { + int *host_type, double *sublo, + double *subhi, bool &success) { gbm.nbor_time_avail=true; success=true; @@ -144,7 +142,7 @@ inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum, gbm.atom->cast_copy_x(host_x,host_type); int mn; gbm.nbor->build_nbor_list(inum, host_inum, nall, *gbm.atom, - boxlo, boxhi, NULL, NULL, NULL, success, mn); + sublo, subhi, NULL, NULL, NULL, success, mn); gbm.nbor->copy_unpacked(inum,mn); gbm.last_ellipse=inum; gbm.max_last_ellipse=inum; @@ -163,7 +161,7 @@ void _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, gbm.nbor_time_avail=true; - int mn=gbm.nbor->max_nbor_loop(inum,numj); + int mn=gbm.nbor->max_nbor_loop(inum,numj,ilist); gbm.resize_atom(inum,nall,success); gbm.resize_local(inum,0,mn,osize,success); if (!success) @@ -216,9 +214,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(gbm.atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(gbm.ans->inum())/ + (BX/gbm._threads_per_atom))); int stride=gbm.nbor->nbor_pitch(); - int ainum=gbm.atom->inum(); + int ainum=gbm.ans->inum(); int anall=gbm.atom->nall(); if (gbm.multiple_forms) { @@ -226,7 +225,7 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { if (gbm.last_ellipse>0) { // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE --------------- GX=static_cast(ceil(static_cast(gbm.last_ellipse)/ - static_cast(BX))); + (BX/gbm._threads_per_atom))); gb_gpu_pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,ELLIPSE_SPHERE, ELLIPSE_ELLIPSE); gbm.time_kernel.stop(); @@ -237,11 +236,12 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(), &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(), - &stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(), - &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall); + &stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(), + &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall, + &gbm._threads_per_atom); gbm.time_gayberne.stop(); - if (gbm.last_ellipse==gbm.atom->inum()) { + if (gbm.last_ellipse==gbm.ans->inum()) { gbm.time_kernel2.start(); gbm.time_kernel2.stop(); gbm.time_gayberne2.start(); @@ -254,9 +254,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { // ------------ SPHERE_ELLIPSE --------------- gbm.time_kernel2.start(); - GX=static_cast(ceil(static_cast(gbm.atom->inum()- - gbm.last_ellipse)/BX)); - gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(), + GX=static_cast(ceil(static_cast(gbm.ans->inum()- + gbm.last_ellipse)/ + (BX/gbm._threads_per_atom))); + gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(), SPHERE_ELLIPSE,SPHERE_ELLIPSE); gbm.time_kernel2.stop(); @@ -266,13 +267,14 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &gbm.shape.begin(), &gbm.well.begin(), &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), &gbm._lj_types, &gbm.lshape.begin(), - 
&gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(), - &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, - &vflag, &gbm.last_ellipse, &ainum, &anall); + &gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(), + &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, + &vflag, &gbm.last_ellipse, &ainum, &anall, + &gbm._threads_per_atom); gbm.time_gayberne2.stop(); } else { - gbm.atom->dev_ans.zero(); - gbm.atom->dev_engv.zero(); + gbm.ans->dev_ans.zero(); + gbm.ans->dev_engv.zero(); gbm.time_kernel.stop(); gbm.time_gayberne.start(); gbm.time_gayberne.stop(); @@ -284,29 +286,31 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { // ------------ LJ --------------- gbm.time_pair.start(); - if (gbm.last_ellipseinum()) { + if (gbm.last_ellipseinum()) { if (gbm.shared_types) { GBMF.k_lj_fast.set_size(GX,BX); GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(), &gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(), &stride, &gbm.nbor->dev_packed.begin(), - &gbm.atom->dev_ans.begin(), - &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), - &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall); + &gbm.ans->dev_ans.begin(), + &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), + &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall, + &gbm._threads_per_atom); } else { GBMF.k_lj.set_size(GX,BX); GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(), &gbm.lj3.begin(), &gbm._lj_types, &gbm.gamma_upsilon_mu.begin(), &stride, - &gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(), - &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), - &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall); + &gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(), + &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), + &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall, + &gbm._threads_per_atom); } } gbm.time_pair.stop(); } else { gbm.time_kernel.start(); - gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE, + gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.ans->inum(),SPHERE_SPHERE, ELLIPSE_ELLIPSE); gbm.time_kernel.stop(); gbm.time_gayberne.start(); @@ -315,9 +319,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &gbm.shape.begin(), &gbm.well.begin(), &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(), - &stride, &gbm.atom->dev_ans.begin(), &ainum, - &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), - &eflag, &vflag, &ainum, &anall); + &stride, &gbm.ans->dev_ans.begin(), &ainum, + &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), + &eflag, &vflag, &ainum, &anall, &gbm._threads_per_atom); gbm.time_gayberne.stop(); } } @@ -326,30 +330,31 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { // Reneighbor on GPU if necessary and then compute forces, torques, energies // --------------------------------------------------------------------------- template -inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago, - const int inum_full, const int nall, - double **host_x, int *host_type, - double *boxlo, double *boxhi, const bool eflag, - const bool vflag, const bool eatom, +inline int** _gb_gpu_compute_n(gbmtyp &gbm, const int ago, + const int inum_full, const int nall, + double **host_x, int *host_type, + double *sublo, double *subhi, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, - double **host_quat) { + int **ilist, int 
**jnum, const double cpu_time, + bool &success, double **host_quat) { gbm.acc_timers(); if (inum_full==0) { + host_start=0; gbm.zero_timers(); return NULL; } - gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor()); - int inum=gbm.hd_balancer.get_gpu_count(timestep,ago,inum_full); - gbm.atom->inum(inum); + gbm.hd_balancer.balance(cpu_time); + int inum=gbm.hd_balancer.get_gpu_count(ago,inum_full); + gbm.ans->inum(inum); gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse); host_start=inum; // Build neighbor list on GPU if necessary if (ago==0) { _gb_gpu_build_nbor_list(gbm, inum, inum_full-inum, nall, host_x, - host_quat, host_type, boxlo, boxhi, success); + host_quat, host_type, sublo, subhi, success); if (!success) return NULL; gbm.atom->cast_quat_data(host_quat[0]); @@ -361,47 +366,49 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago, gbm.atom->add_x_data(host_x,host_type); } - gbm.atom->add_other_data(); + gbm.atom->add_quat_data(); + *ilist=gbm.nbor->host_ilist.begin(); + *jnum=gbm.nbor->host_acc.begin(); _gb_gpu_gayberne(gbm,eflag,vflag); - gbm.atom->copy_answers(eflag,vflag,eatom,vatom); + gbm.ans->copy_answers(eflag,vflag,eatom,vatom); + gbm.device->add_ans_object(gbm.ans); gbm.hd_balancer.stop_timer(); - return gbm.device->nbor.host_nbor.begin(); + return gbm.nbor->host_jlist.begin()-host_start; } -int * gb_gpu_compute_n(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success, - double **host_quat) { - return _gb_gpu_compute_n(GBMF, timestep, ago, inum_full, nall, host_x, - host_type, boxlo, boxhi, eflag, vflag, eatom, vatom, - host_start, cpu_time, success, host_quat); +int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double **host_quat) { + return _gb_gpu_compute_n(GBMF, ago, inum_full, nall, host_x, host_type, sublo, + subhi, eflag, vflag, eatom, vatom, host_start, ilist, + jnum, cpu_time, success, host_quat); } // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then calculate forces, torques,.. 
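// The pointer arithmetic in the return statement above encodes the
// host/device work split used by all of these compute entry points: the
// device handles atoms [0,host_start), the caller handles
// [host_start,inum_full) and indexes the returned neighbor list with its
// own atom index. A minimal sketch of that convention follows, with
// hypothetical names; it is an illustration under stated assumptions,
// not library code.

#include <algorithm>

// gpu_split is the fraction of particles handled by the device, as in the
// init() documentation earlier in this patch.
inline int split_atoms(const int inum_full, const double gpu_split,
                       int &host_start) {
  int inum=static_cast<int>(inum_full*gpu_split);
  inum=std::min(std::max(inum,0),inum_full);
  host_start=inum;               // host computes atoms [host_start,inum_full)
  return inum;                   // device computes atoms [0,host_start)
}

// Shift the host-side neighbor list so jlist[i] is valid for i in
// [host_start,inum_full), mirroring host_jlist.begin()-host_start above.
inline int **shift_jlist(int **host_jlist, const int host_start) {
  return host_jlist-host_start;
}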
// --------------------------------------------------------------------------- template -inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago, - const int inum_full,const int nall,double **host_x, - int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - const double cpu_time, bool &success, - double **host_quat) { +inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full, + const int nall,double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double **host_quat) { gbm.acc_timers(); if (inum_full==0) { + host_start=0; gbm.zero_timers(); return NULL; } int ago=gbm.hd_balancer.ago_first(f_ago); - int inum=gbm.hd_balancer.balance(timestep,ago,inum_full,cpu_time, - gbm.nbor->gpu_nbor()); - gbm.atom->inum(inum); + int inum=gbm.hd_balancer.balance(ago,inum_full,cpu_time); + gbm.ans->inum(inum); gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse); host_start=inum; @@ -421,21 +428,21 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago, gbm.atom->cast_quat_data(host_quat[0]); gbm.hd_balancer.start_timer(); gbm.atom->add_x_data(host_x,host_type); - gbm.atom->add_other_data(); + gbm.atom->add_quat_data(); _gb_gpu_gayberne(gbm,eflag,vflag); - gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list); + gbm.ans->copy_answers(eflag,vflag,eatom,vatom,list); + gbm.device->add_ans_object(gbm.ans); gbm.hd_balancer.stop_timer(); return list; } -int * gb_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double **host_quat) { - return _gb_gpu_compute(GBMF, timestep, ago, inum_full, nall, host_x, +int * gb_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double **host_quat) { + return _gb_gpu_compute(GBMF, ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, host_quat); diff --git a/lib/gpu/gb_gpu_extra.h b/lib/gpu/gb_gpu_extra.h index 6ac390437a..a341940c0a 100644 --- a/lib/gpu/gb_gpu_extra.h +++ b/lib/gpu/gb_gpu_extra.h @@ -18,7 +18,6 @@ #ifndef GB_GPU_EXTRA_H #define GB_GPU_EXTRA_H -#define MAX_SHARED_TYPES 8 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #ifdef _DOUBLE_DOUBLE @@ -47,7 +46,7 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" #else @@ -58,6 +57,8 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #define BLOCK_SIZE_X get_local_size(0) #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) #define __inline inline +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif diff --git a/lib/gpu/gb_gpu_kernel.cu b/lib/gpu/gb_gpu_kernel.cu index b8d06ec6da..7bb320f5d0 100644 --- a/lib/gpu/gb_gpu_kernel.cu +++ b/lib/gpu/gb_gpu_kernel.cu @@ -97,17 +97,17 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global 
numtyp4 *q, __global acctyp4 *ans, const int astride, __global acctyp *engv, __global int *err_flag, const int eflag, const int vflag, const int inum, - const int nall) { + const int nall, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; - if (ii<4) - sp_lj[ii]=gum[ii+3]; - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); - __syncthreads(); - - if (ii0) + energy+=u_r*temp2; + numtyp temp1 = -eta*u_r*factor_lj; + if (vflag>0) { + r12[0]*=-r; + r12[1]*=-r; + r12[2]*=-r; + numtyp ft=temp1*dchi[0]-temp2*dUr[0]; + f.x+=ft; + virial[0]+=r12[0]*ft; + ft=temp1*dchi[1]-temp2*dUr[1]; + f.y+=ft; + virial[1]+=r12[1]*ft; + virial[3]+=r12[0]*ft; + ft=temp1*dchi[2]-temp2*dUr[2]; + f.z+=ft; + virial[2]+=r12[2]*ft; + virial[4]+=r12[0]*ft; + virial[5]+=r12[1]*ft; + } else { + f.x+=temp1*dchi[0]-temp2*dUr[0]; + f.y+=temp1*dchi[1]-temp2*dUr[1]; + f.z+=temp1*dchi[2]-temp2*dUr[2]; + } + + // Torque on 1 + temp1 = -u_r*eta*factor_lj; + temp2 = -u_r*chi*factor_lj; + numtyp temp3 = -chi*eta*factor_lj; + tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; + tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; + tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; + + } // for nbor + } // if ii - // energy - - // compute u_r and dUr - numtyp uslj_rsq; - { - // Compute distance of closest approach - numtyp h12, sigma12; - sigma12 = gpu_dot3(r12,kappa); - sigma12 = rsqrt((numtyp)0.5*sigma12); - h12 = r-sigma12; + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[7][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=tor.x; + red_acc[4][tid]=tor.y; + red_acc[5][tid]=tor.z; - // -- kappa is now ok - kappa[0]*=r; - kappa[1]*=r; - kappa[2]*=r; - - int mtype=mul24(ntypes,itype)+jtype; - numtyp sigma = sig_eps[mtype].x; - numtyp epsilon = sig_eps[mtype].y; - numtyp varrho = sigma/(h12+gum[0]*sigma); - numtyp varrho6 = varrho*varrho*varrho; - varrho6*=varrho6; - numtyp varrho12 = varrho6*varrho6; - u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); - - numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; - temp1 = temp1*(numtyp)24.0*epsilon; - uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; - numtyp temp2 = gpu_dot3(kappa,r12); - uslj_rsq = uslj_rsq*ir*ir; - - dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]); - dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]); - dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]); - } - - // torque for particle 1 - { - numtyp tempv[3], tempv2[3]; - tempv[0] = -uslj_rsq*kappa[0]; - tempv[1] = -uslj_rsq*kappa[1]; - tempv[2] = -uslj_rsq*kappa[2]; - gpu_row_times3(kappa,g1,tempv2); - gpu_cross3(tempv,tempv2,tUr); - } + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; } } - - // Compute eta - { - eta = (numtyp)2.0*lshape[itype]*lshape[jtype]; - numtyp det_g12 = gpu_det3(g12); - eta = pow(eta/det_g12,gum[1]); - } - // Compute teta - numtyp temp[9], tempv[3], tempv2[3]; - compute_eta_torque(g12,a1,ishape,temp); - numtyp temp1 = -eta*gum[1]; + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + tor.x=red_acc[3][tid]; + tor.y=red_acc[4][tid]; + tor.z=red_acc[5][tid]; - tempv[0] = temp1*temp[0]; - tempv[1] = temp1*temp[1]; - tempv[2] = temp1*temp[2]; - gpu_cross3(a1,tempv,tempv2); - teta[0] = tempv2[0]; - 
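// Note (annotation, not part of the patch): the rewrite above replaces the
// one-thread-per-atom scheme with t_per_atom cooperating threads per atom.
// In simplified form the indexing is:
//
//   int tid    = THREAD_ID_X;
//   int ii     = BLOCK_ID_X*(BLOCK_SIZE_X/t_per_atom) + tid/t_per_atom;
//   int offset = tid%t_per_atom;
//   for (int nbor = offset; nbor < numj; nbor += t_per_atom) { ... }
//
// Each of the t_per_atom threads walks a strided slice of atom ii's neighbor
// list and accumulates private force/torque/virial partials, which the
// red_acc shared-memory tree reduction combines afterwards.  The reduction
// can omit __syncthreads() only under the assumption that t_per_atom is a
// power of two no larger than the warp width, so the cooperating threads
// execute in lock step.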
teta[1] = tempv2[1]; - teta[2] = tempv2[2]; - - tempv[0] = temp1*temp[3]; - tempv[1] = temp1*temp[4]; - tempv[2] = temp1*temp[5]; - gpu_cross3(a1+3,tempv,tempv2); - teta[0] += tempv2[0]; - teta[1] += tempv2[1]; - teta[2] += tempv2[2]; + if (eflag>0 || vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + red_acc[6][tid]=energy; - tempv[0] = temp1*temp[6]; - tempv[1] = temp1*temp[7]; - tempv[2] = temp1*temp[8]; - gpu_cross3(a1+6,tempv,tempv2); - teta[0] += tempv2[0]; - teta[1] += tempv2[1]; - teta[2] += tempv2[2]; - } - - numtyp chi, dchi[3], tchi[3]; - { // Compute chi and dchi - - // Compute b12 - numtyp b2[9], b12[9]; - { - gpu_times3(well[jtype],a2,b12); - gpu_transpose_times3(a2,b12,b2); - gpu_plus3(b1,b2,b12); + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<7; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + energy=red_acc[6][tid]; } - - // compute chi_12 - r12[0]*=r; - r12[1]*=r; - r12[2]*=r; - numtyp iota[3]; - gpu_mldivide3(b12,r12,iota,err_flag); - // -- iota is now iota/r - iota[0]*=ir; - iota[1]*=ir; - iota[2]*=ir; - r12[0]*=ir; - r12[1]*=ir; - r12[2]*=ir; - chi = gpu_dot3(r12,iota); - chi = pow(chi*(numtyp)2.0,gum[2]); - - // -- iota is now ok - iota[0]*=r; - iota[1]*=r; - iota[2]*=r; - - numtyp temp1 = gpu_dot3(iota,r12); - numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/ - gum[2]); - dchi[0] = temp2*(iota[0]-temp1*r12[0]); - dchi[1] = temp2*(iota[1]-temp1*r12[1]); - dchi[2] = temp2*(iota[2]-temp1*r12[2]); - - // compute t_chi - numtyp tempv[3]; - gpu_row_times3(iota,b1,tempv); - gpu_cross3(tempv,iota,tchi); - temp1 = (numtyp)-4.0*ir*ir; - tchi[0] *= temp1; - tchi[1] *= temp1; - tchi[2] *= temp1; } - numtyp temp2 = factor_lj*eta*chi; - if (eflag>0) - energy+=u_r*temp2; - numtyp temp1 = -eta*u_r*factor_lj; - if (vflag>0) { - r12[0]*=-r; - r12[1]*=-r; - r12[2]*=-r; - numtyp ft=temp1*dchi[0]-temp2*dUr[0]; - f.x+=ft; - virial[0]+=r12[0]*ft; - ft=temp1*dchi[1]-temp2*dUr[1]; - f.y+=ft; - virial[1]+=r12[1]*ft; - virial[3]+=r12[0]*ft; - ft=temp1*dchi[2]-temp2*dUr[2]; - f.z+=ft; - virial[2]+=r12[2]*ft; - virial[4]+=r12[0]*ft; - virial[5]+=r12[1]*ft; - } else { - f.x+=temp1*dchi[0]-temp2*dUr[0]; - f.y+=temp1*dchi[1]-temp2*dUr[1]; - f.z+=temp1*dchi[2]-temp2*dUr[2]; - } - - // Torque on 1 - temp1 = -u_r*eta*factor_lj; - temp2 = -u_r*chi*factor_lj; - numtyp temp3 = -chi*eta*factor_lj; - tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; - tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; - tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; - - } // for nbor - // Store answers - __global acctyp *ap1=engv+ii; - if (eflag>0) { - *ap1=energy; - ap1+=astride; - } - if (vflag>0) { - for (int i=0; i<6; i++) { - *ap1=virial[i]; + if (ii0) { + *ap1=energy; ap1+=astride; } - } - ans[ii]=f; - ans[ii+astride]=tor; + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=astride; + } + } + ans[ii]=f; + ans[ii+astride]=tor; } // if ii } diff --git a/lib/gpu/gb_gpu_kernel_lj.cu b/lib/gpu/gb_gpu_kernel_lj.cu index 3e42cbcbbc..657fc20cd5 100644 --- a/lib/gpu/gb_gpu_kernel_lj.cu +++ b/lib/gpu/gb_gpu_kernel_lj.cu @@ -34,33 +34,36 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q, __global acctyp4 *ans, __global acctyp *engv, __global int *err_flag, const int eflag, const int vflag,const int start, const int inum, - const int nall) { - __local numtyp sp_lj[4]; + const int nall, const int t_per_atom) { + int tid=THREAD_ID_X; + int 
ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; - if (ii<4) - sp_lj[ii]=gum[ii+3]; - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start; - __syncthreads(); + __local numtyp sp_lj[4]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -265,39 +307,42 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, __global acctyp4 *ans, __global acctyp *engv, __global int *err_flag, const int eflag, const int vflag, const int start, const int inum, - const int nall) { - __local numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; - if (ii<4) - sp_lj[ii]=gum[ii+3]; - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start; - __syncthreads(); + const int nall, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1+=energy; @@ -361,50 +445,54 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, __global numtyp *gum, - const int stride, - __global int *dev_ij, __global acctyp4 *ans, - __global acctyp *engv, __global int *err_flag, - const int eflag,const int vflag, const int start, - const int inum, const int nall) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const int stride, __global int *dev_ij, + __global acctyp4 *ans, __global acctyp *engv, + __global 
int *err_flag, const int eflag, + const int vflag, const int start, const int inum, + const int nall, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - if (ii<4) - sp_lj[ii]=gum[ii+3]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1+=energy; diff --git a/lib/gpu/gb_gpu_kernel_nbor.cu b/lib/gpu/gb_gpu_kernel_nbor.cu index 80da8b8d9d..1b1d81fa42 100644 --- a/lib/gpu/gb_gpu_kernel_nbor.cu +++ b/lib/gpu/gb_gpu_kernel_nbor.cu @@ -18,8 +18,6 @@ #ifndef PAIR_GPU_KERNEL_H #define PAIR_GPU_KERNEL_H -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -32,7 +30,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" #else @@ -42,6 +40,7 @@ #define BLOCK_ID_X get_group_id(0) #define BLOCK_SIZE_X get_local_size(0) #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define MAX_SHARED_TYPES 8 #endif diff --git a/lib/gpu/gb_gpu_memory.cpp b/lib/gpu/gb_gpu_memory.cpp index 1d78204031..971649c6e8 100644 --- a/lib/gpu/gb_gpu_memory.cpp +++ b/lib/gpu/gb_gpu_memory.cpp @@ -32,30 +32,35 @@ template GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false), _max_bytes(0.0) { device=&pair_gpu_device; + ans=new PairGPUAns(); + nbor=new PairGPUNbor; } template GB_GPU_MemoryT::~GB_GPU_Memory() { clear(); + delete ans; + delete nbor; } template int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { - return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors); + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); } template -bool GB_GPU_MemoryT::init(const int ntypes, const double gamma, - const double upsilon, const double mu, - double **host_shape, double **host_well, - double **host_cutsq, double **host_sigma, - double **host_epsilon, double *host_lshape, - int **h_form, double **host_lj1, double **host_lj2, - double **host_lj3, double **host_lj4, - double **host_offset, const double *host_special_lj, - const int nlocal, const int nall, - const int max_nbors, const double cell_size, - const double gpu_split, FILE *_screen) { +int GB_GPU_MemoryT::init(const int ntypes, const double gamma, + const double upsilon, const double mu, + double **host_shape, double **host_well, + double **host_cutsq, double **host_sigma, + double **host_epsilon, 
double *host_lshape, + int **h_form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, + double **host_offset, const double *host_special_lj, + const int nlocal, const int nall, + const int max_nbors, const double cell_size, + const double gpu_split, FILE *_screen) { nbor_time_avail=false; screen=_screen; @@ -64,24 +69,24 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma, gpu_nbor=true; int _gpu_host=0; - int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split); + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); if (host_nlocal>0) _gpu_host=1; - if (!device->init(false,true,nlocal,host_nlocal,nall,0,gpu_nbor,_gpu_host, - max_nbors,cell_size,true)) - return false; + _threads_per_atom=device->threads_per_atom(); + int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0, + _gpu_host,max_nbors,cell_size,true); + if (success!=0) + return success; + ucl_device=device->gpu; atom=&device->atom; - nbor=&device->nbor; - _block_size=BLOCK_1D; - if (static_cast(_block_size)>ucl_device->group_size()) - _block_size=ucl_device->group_size(); + _block_size=device->pair_block_size(); compile_kernels(*ucl_device); // Initialize host-device load balancer - hd_balancer.init(device,gpu_split); + hd_balancer.init(device,gpu_nbor,gpu_split); // Initialize timers for the selected GPU time_pair.init(*ucl_device); @@ -90,8 +95,9 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma, // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && _block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=device->max_shared_types(); + if (lj_types<=max_shared_types && _block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -186,12 +192,19 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma, } if (multiple_forms) - atom->dev_ans.zero(); + ans->dev_ans.zero(); - _max_bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + _max_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); // Memory for ilist ordered by particle type - return (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS); + if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS) + return 0; + else return -3; +} + +template +void GB_GPU_MemoryT::estimate_gpu_overhead() { + device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead); } template @@ -209,9 +222,9 @@ void GB_GPU_MemoryT::clear() { // Output any timing information acc_timers(); - double single[6], times[6]; + double single[9], times[9]; - single[0]=atom->transfer_time(); + single[0]=atom->transfer_time()+ans->transfer_time(); single[1]=nbor->time_nbor.total_seconds(); single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+ nbor->time_kernel.total_seconds(); @@ -220,15 +233,18 @@ void GB_GPU_MemoryT::clear() { single[4]=time_pair.total_seconds(); else single[4]=0; - single[5]=atom->cast_time(); + single[5]=atom->cast_time()+ans->cast_time(); + single[6]=_gpu_overhead; + single[7]=_driver_overhead; + single[8]=ans->cpu_idle_time(); - MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,device->replica()); + MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,device->replica()); double avg_split=hd_balancer.all_avg_split(); _max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+ sigma_epsilon.row_bytes()+cut_form.row_bytes()+ shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+ - gamma_upsilon_mu.row_bytes(); + 
gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0, device->replica()); @@ -255,10 +271,19 @@ void GB_GPU_MemoryT::clear() { fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size); fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size); } + fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size); + fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size); fprintf(screen,"-------------------------------------"); fprintf(screen,"--------------------------------\n\n"); + + + fprintf(screen,"Average split: %.4f.\n",avg_split); + fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + + } _max_bytes=0.0; @@ -299,10 +324,9 @@ void GB_GPU_MemoryT::clear() { template double GB_GPU_MemoryT::host_memory_usage() const { - return device->atom.host_memory_usage()+ - device->nbor.host_memory_usage()+4*sizeof(numtyp)+ - sizeof(GB_GPU_Memory)+ - device->nbor.max_atoms()*sizeof(int); + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(GB_GPU_Memory)+ + nbor->max_atoms()*sizeof(int); } template diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h index 2cfc805cd8..40ed8bec51 100644 --- a/lib/gpu/gb_gpu_memory.h +++ b/lib/gpu/gb_gpu_memory.h @@ -18,8 +18,6 @@ #ifndef GB_GPU_MEMORY_H #define GB_GPU_MEMORY_H -#define BLOCK_1D 64 - #include "pair_gpu_device.h" #include "pair_gpu_balance.h" #include "mpi.h" @@ -35,23 +33,34 @@ class GB_GPU_Memory { * \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * \return false if there is not sufficient memory or device init prob **/ - bool init(const int ntypes, const double gamma, - const double upsilon, const double mu, double **host_shape, - double **host_well, double **host_cutsq, double **host_sigma, - double **host_epsilon, double *host_lshape, int **h_form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - const double *host_special_lj, const int nlocal, const int nall, - const int max_nbors, const double cell_size, - const double gpu_split, FILE *screen); + * \return false if there is not sufficient memory or device init prob + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, const double gamma, + const double upsilon, const double mu, double **host_shape, + double **host_well, double **host_cutsq, double **host_sigma, + double **host_epsilon, double *host_lshape, int **h_form, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + const double *host_special_lj, const int nlocal, const int nall, + const int max_nbors, const double cell_size, + const double gpu_split, FILE *screen); + + /// Estimate the overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ inline void resize_atom(const int inum, const int nall, bool &success) { - atom->resize(inum, nall, success); - if 
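// Note (annotation, not part of the patch): init() and the geryon alloc()
// methods now return an int status instead of calling exit(), so a host code
// built with -DUCL_NO_EXIT can trap failures and fall back to the CPU pair
// style.  A sketch of caller-side handling; the locals and the error hook
// are assumed, not part of the library:
//
//   int err = GBMF.init(/* ...arguments elided... */);
//   if (err == -3)
//     error->one("Out of memory on GPU");    // hypothetical error hook
//   else if (err != 0)
//     error->one("GPU library initialization failed");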
(multiple_forms) atom->dev_ans.zero(); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + atom->resize(nall, success); + ans->resize(inum, success); + if (multiple_forms) ans->dev_ans.zero(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_bytes) _max_bytes=bytes; } @@ -74,7 +83,7 @@ class GB_GPU_Memory { success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS); } nbor->resize(nlocal,host_inum,max_nbors,success); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_bytes) _max_bytes=bytes; } @@ -91,19 +100,22 @@ class GB_GPU_Memory { /// Accumulate timers inline void acc_timers() { - if (nbor_time_avail) { - nbor->time_nbor.add_to_total(); - nbor->time_kernel.add_to_total(); - nbor_time_avail=false; + if (device->time_device()) { + if (nbor_time_avail) { + nbor->time_nbor.add_to_total(); + nbor->time_kernel.add_to_total(); + nbor_time_avail=false; + } + time_kernel.add_to_total(); + time_gayberne.add_to_total(); + if (multiple_forms) { + time_kernel2.add_to_total(); + time_gayberne2.add_to_total(); + time_pair.add_to_total(); + } + atom->acc_timers(); + ans->acc_timers(); } - time_kernel.add_to_total(); - time_gayberne.add_to_total(); - if (multiple_forms) { - time_kernel2.add_to_total(); - time_gayberne2.add_to_total(); - time_pair.add_to_total(); - } - atom->acc_timers(); } /// Accumulate timers @@ -117,6 +129,7 @@ class GB_GPU_Memory { time_pair.zero(); } atom->zero_timers(); + ans->zero_timers(); } // -------------------------- DEVICE DATA ------------------------- @@ -168,6 +181,10 @@ class GB_GPU_Memory { int last_ellipse, max_last_ellipse; + // ------------------------ FORCE/ENERGY DATA ----------------------- + + PairGPUAns *ans; + // --------------------------- NBOR DATA ---------------------------- /// Neighbor data @@ -183,10 +200,12 @@ class GB_GPU_Memory { UCL_Kernel k_gayberne, k_sphere_gb, k_lj_fast, k_lj; inline int block_size() { return _block_size; } + int _threads_per_atom; private: bool _allocated, _compiled; int _block_size; double _max_bytes; + double _gpu_overhead, _driver_overhead; void compile_kernels(UCL_Device &dev); }; diff --git a/lib/gpu/geryon/VERSION.txt b/lib/gpu/geryon/VERSION.txt index 77e0a073c7..d260cab24e 100644 --- a/lib/gpu/geryon/VERSION.txt +++ b/lib/gpu/geryon/VERSION.txt @@ -1,2 +1,2 @@ -Geryon Version 10.280 - \ No newline at end of file +Geryon Version 11.094 + diff --git a/lib/gpu/geryon/nvc_device.h b/lib/gpu/geryon/nvc_device.h index ed445716f6..6a232986ff 100644 --- a/lib/gpu/geryon/nvc_device.h +++ b/lib/gpu/geryon/nvc_device.h @@ -167,6 +167,7 @@ class UCL_Device { int _device, _num_devices; std::vector _properties; std::vector _cq; + std::vector _device_ids; }; // Grabs the properties for all devices @@ -178,6 +179,7 @@ inline UCL_Device::UCL_Device() { if (deviceProp.major == 9999 && deviceProp.minor == 9999) break; _properties.push_back(deviceProp); + _device_ids.push_back(dev); } _device=-1; _cq.push_back(cudaStream_t()); @@ -194,7 +196,7 @@ inline void UCL_Device::set(int num) { return; for (int i=1; i + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + 
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); 
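// Note (annotation, not part of the patch): each of these overloads exists
// only to cover one argument count, because the library targets pre-C++11
// compilers.  Under C++11 the whole family would collapse to one variadic
// template; a sketch, not the library's actual API:
//
//   template <class... Args>
//   inline void add_args(Args*... args) {
//     int expand[] = { (add_arg(args), 0)... };   // fold over add_arg()
//     (void)expand;
//   }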
add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); + } + // --------------------------------------------------------------------------- @@ -439,6 +624,211 @@ run(); } + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); 
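// Note (annotation, not part of the patch): the run(...) overloads launch on
// the kernel's default command queue, while the run_cq(...) family defined
// further below takes an explicit queue so that independent launches can
// overlap.  Hypothetical usage, assuming a configured UCL_Kernel k_pair and
// a UCL_Device named device:
//
//   k_pair.set_size(num_blocks, block_size);
//   k_pair.run_cq(device.cq(1), &x, &lj1, &lj3);   // arguments abbreviated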
add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 
*a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); + run(); + } + // --------------------------------------------------------------------------- template @@ -671,3 +1061,208 @@ run(cq); } + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + 
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 
*a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); + run(cq); + } + diff --git a/lib/gpu/geryon/ucl_d_mat.h b/lib/gpu/geryon/ucl_d_mat.h index c0531b2f29..11ec58629a 100644 --- a/lib/gpu/geryon/ucl_d_mat.h +++ b/lib/gpu/geryon/ucl_d_mat.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains @@ -61,20 +61,23 @@ class UCL_D_Mat : public UCL_BaseMat { inline int alloc(const size_t rows, const size_t cols, mat_type &cq, const enum UCL_MEMOPT kind=UCL_READ_WRITE) { clear(); - _kind=kind; - _rows=rows; - _cols=cols; + int err=_device_alloc(*this,cq,rows,cols,_pitch,kind); - _row_size=_pitch/sizeof(numtyp); - #ifndef _UCL_DEVICE_PTR_MAT - _end=_array+_row_size*cols; - #endif - #ifndef UCL_NO_EXIT if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << rows*cols*sizeof(numtyp) << " bytes on device.\n"; exit(1); + #endif + return err; } + + _kind=kind; + _rows=rows; + _cols=cols; + _row_size=_pitch/sizeof(numtyp); + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_row_size*cols; #endif #ifdef _OCL_MAT _offset=0; @@ -94,20 +97,23 @@ class UCL_D_Mat : public UCL_BaseMat { inline int alloc(const size_t rows, const size_t cols, UCL_Device &device, const enum UCL_MEMOPT kind=UCL_READ_WRITE) { clear(); - _kind=kind; - _rows=rows; - _cols=cols; + int err=_device_alloc(*this,device,rows,cols,_pitch,kind); - _row_size=_pitch/sizeof(numtyp); - #ifndef _UCL_DEVICE_PTR_MAT - _end=_array+_row_size*cols; - #endif - #ifndef UCL_NO_EXIT if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << rows*cols*sizeof(numtyp) << " bytes on device.\n"; exit(1); + #endif + return err; } + + _kind=kind; + _rows=rows; + _cols=cols; + _row_size=_pitch/sizeof(numtyp); + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_row_size*cols; #endif #ifdef _OCL_MAT _offset=0; diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index 45c94bee82..0be063c940 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains @@ -60,19 +60,24 @@ class UCL_D_Vec : public UCL_BaseMat { const enum UCL_MEMOPT kind=UCL_READ_WRITE) { clear(); - _kind=kind; - _cols=cols; + _row_bytes=cols*sizeof(numtyp); int err=_device_alloc(*this,cq,_row_bytes,kind); - #ifndef _UCL_DEVICE_PTR_MAT - _end=_array+cols; - #endif - #ifndef UCL_NO_EXIT if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes << " bytes on device.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } + + _kind=kind; + _cols=cols; + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+cols; #endif #ifdef _OCL_MAT _offset=0; @@ -90,19 +95,23 @@ class UCL_D_Vec : public UCL_BaseMat { inline int alloc(const size_t cols, UCL_Device &device, const enum UCL_MEMOPT kind=UCL_READ_WRITE) { clear(); - _kind=kind; - _cols=cols; _row_bytes=cols*sizeof(numtyp); int err=_device_alloc(*this,device,_row_bytes,kind); - #ifndef _UCL_DEVICE_PTR_MAT - _end=_array+cols; - #endif - #ifndef UCL_NO_EXIT if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes << " bytes on device.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } + + _kind=kind; + _cols=cols; + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+cols; #endif #ifdef _OCL_MAT _offset=0; diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h index 51593cfa23..762bb03131 100644 --- a/lib/gpu/geryon/ucl_h_mat.h +++ b/lib/gpu/geryon/ucl_h_mat.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains @@ -39,7 +39,11 @@ class UCL_H_Mat : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { } + UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { + #ifdef _OCL_MAT + _carray=(cl_mem)(0); + #endif + } ~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); } /// Construct with specied number of rows and columns @@ -59,18 +63,23 @@ class UCL_H_Mat : public UCL_BaseMat { inline int alloc(const size_t rows, const size_t cols, mat_type &cq, const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { clear(); - _cols=cols; - _rows=rows; + _row_bytes=cols*sizeof(numtyp); - _kind=kind; - int err=_host_alloc(*this,cq,_row_bytes*_rows,kind); - #ifndef UCL_NO_EXIT + int err=_host_alloc(*this,cq,_row_bytes*rows,kind); if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows << " bytes on host.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } - #endif + + _cols=cols; + _rows=rows; + _kind=kind; _end=_array+rows*cols; return err; } @@ -85,19 +94,24 @@ class UCL_H_Mat : public UCL_BaseMat { inline int alloc(const size_t rows, const size_t cols, UCL_Device &device, const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { clear(); - _cols=cols; - _rows=rows; + _row_bytes=cols*sizeof(numtyp); - _kind=kind; - int err=_host_alloc(*this,device,_row_bytes*_rows,kind); - _end=_array+rows*cols; - #ifndef UCL_NO_EXIT + int err=_host_alloc(*this,device,_row_bytes*rows,kind); if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows << " bytes on host.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } - #endif + + _cols=cols; + _rows=rows; + _kind=kind; + _end=_array+rows*cols; return err; } diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index ca1dd12a47..4af1e2179f 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains @@ -39,7 +39,11 @@ class UCL_H_Vec : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { } + UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { + #ifdef _OCL_MAT + _carray=(cl_mem)(0); + #endif + } ~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); } /// Construct with n columns @@ -59,18 +63,24 @@ class UCL_H_Vec : public UCL_BaseMat { inline int alloc(const size_t cols, mat_type &cq, const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { clear(); - _cols=cols; + _row_bytes=cols*sizeof(numtyp); - _kind=kind; int err=_host_alloc(*this,cq,_row_bytes,kind); - _end=_array+cols; - #ifndef UCL_NO_EXIT + if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes << " bytes on host.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } - #endif + + _cols=cols; + _kind=kind; + _end=_array+cols; return err; } @@ -84,18 +94,24 @@ class UCL_H_Vec : public UCL_BaseMat { inline int alloc(const size_t cols, UCL_Device &device, const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { clear(); - _cols=cols; + _row_bytes=cols*sizeof(numtyp); - _kind=kind; int err=_host_alloc(*this,device,_row_bytes,kind); - _end=_array+cols; - #ifndef UCL_NO_EXIT + if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes << " bytes on host.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } - #endif + + _cols=cols; + _kind=kind; + _end=_array+cols; return err; } diff --git a/lib/gpu/geryon/ucl_nv_kernel.h b/lib/gpu/geryon/ucl_nv_kernel.h index 1ea9175e3a..5c45dc3a87 100644 --- a/lib/gpu/geryon/ucl_nv_kernel.h +++ b/lib/gpu/geryon/ucl_nv_kernel.h @@ -13,7 +13,7 @@ copyright : (C) 2010 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains @@ -25,8 +25,18 @@ #ifndef UCL_NV_KERNEL_H #define UCL_NV_KERNEL_H -#define GLOBAL_ID_X threadIdx.x+__mul24(blockIdx.x,blockDim.x) -#define GLOBAL_ID_Y threadIdx.y+__mul24(blockIdx.y,blockDim.y) +#if (__CUDA_ARCH__ < 200) +#define mul24 __mul24 +#define MEM_THREADS 16 +#else +#define mul24(X,Y) (X)*(Y) +#define MEM_THREADS 32 +#endif + +#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) +#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) +#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); +#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y); #define THREAD_ID_X threadIdx.x #define THREAD_ID_Y threadIdx.y #define BLOCK_ID_X blockIdx.x @@ -35,8 +45,9 @@ #define BLOCK_SIZE_Y blockDim.y #define __kernel extern "C" __global__ #define __local __shared__ -#define mul24 __mul24 #define __global #define __inline static __inline__ __device__ +#define atom_add atomicAdd #endif + diff --git a/lib/gpu/lj96_cut_gpu.cpp b/lib/gpu/lj96_cut_gpu.cpp index 24fb5d8570..df83afd521 100644 --- a/lib/gpu/lj96_cut_gpu.cpp +++ b/lib/gpu/lj96_cut_gpu.cpp @@ -28,11 +28,11 @@ static LJ96_GPU_Memory LJ96MF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen) { +int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { LJ96MF.clear(); gpu_mode=LJ96MF.device->gpu_mode(); double gpu_split=LJ96MF.device->particle_split(); @@ -53,13 +53,11 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, fflush(screen); } - if (world_me==0) { - bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); LJ96MF.device->world_barrier(); if (message) @@ -74,46 +72,46 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, - nall, 300, maxspecial, cell_size, gpu_split, - screen); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen); + LJ96MF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + LJ96MF.estimate_gpu_overhead(); + return init_ok; } void lj96_gpu_clear() { LJ96MF.clear(); } -int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** lj96_gpu_compute_n(const 
int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success) { - return LJ96MF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success); + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); } -void lj96_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success) { - LJ96MF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, - firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +void lj96_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + LJ96MF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, + eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double lj96_gpu_bytes() { diff --git a/lib/gpu/lj96_cut_gpu_kernel.cu b/lib/gpu/lj96_cut_gpu_kernel.cu index 0d3a01fbac..3fc6a2f308 100644 --- a/lib/gpu/lj96_cut_gpu_kernel.cu +++ b/lib/gpu/lj96_cut_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef LJ96_GPU_KERNEL #define LJ96_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -46,7 +44,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; #ifdef _DOUBLE_DOUBLE @@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define __inline inline #define fetch_pos(i,y) x_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -82,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - 
// Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -176,49 +230,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const int nall, const int nbor_pitch, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; - if (ii<4) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/lj96_cut_gpu_memory.cpp b/lib/gpu/lj96_cut_gpu_memory.cpp index d365d71044..0b066c0973 100644 --- a/lib/gpu/lj96_cut_gpu_memory.cpp +++ b/lib/gpu/lj96_cut_gpu_memory.cpp @@ -42,7 +42,7 @@ int LJ96_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool LJ96_GPU_MemoryT::init(const int ntypes, +int LJ96_GPU_MemoryT::init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, @@ -50,14 +50,18 @@ bool LJ96_GPU_MemoryT::init(const int ntypes, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj96_cut_gpu_kernel); + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj96_cut_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use 
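// ---------------------------------------------------------------------------
// The red_acc code above is a shared-memory tree reduction over the
// t_per_atom threads that cooperate on one atom: each thread parks its
// partial force/energy in shared memory, then strides s=t_per_atom/2,
// t_per_atom/4, ... fold the partials onto the group's first thread. A
// standalone sketch for a single accumulator follows (names are stand-ins,
// not the patch's kernels); like the code above it omits __syncthreads(),
// which is safe only while t_per_atom does not exceed the warp width on the
// lock-step hardware of this era (newer architectures would need
// __syncwarp()).
// ---------------------------------------------------------------------------
#define BLOCK_PAIR 64

__global__ void k_group_reduce(const float *partial, float *per_atom,
                               int t_per_atom) {
  int tid = threadIdx.x;
  int offset = tid % t_per_atom;            // lane within this atom's group
  __shared__ float red_acc[BLOCK_PAIR];     // assumes blockDim.x==BLOCK_PAIR

  red_acc[tid] = partial[blockIdx.x*blockDim.x + tid];
  for (unsigned int s = t_per_atom/2; s > 0; s >>= 1)
    if (offset < s)
      red_acc[tid] += red_acc[tid + s];     // fold upper half onto lower

  if (offset == 0)                          // group leader owns the result
    per_atom[(blockIdx.x*blockDim.x + tid)/t_per_atom] = red_acc[tid];
}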
fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -84,7 +88,7 @@ bool LJ96_GPU_MemoryT::init(const int ntypes, _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -122,9 +126,10 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -133,16 +138,18 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, - &ainum, &anall, &nbor_pitch); + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &anall, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, - &anall, &nbor_pitch); + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &anall, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lj96_cut_gpu_memory.h b/lib/gpu/lj96_cut_gpu_memory.h index 483ef05570..fe0a0b1665 100644 --- a/lib/gpu/lj96_cut_gpu_memory.h +++ b/lib/gpu/lj96_cut_gpu_memory.h @@ -29,13 +29,20 @@ class LJ96_GPU_Memory : public AtomicGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for the GPU + * - -5 if double precision is not supported on the card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/lj_cut_gpu.cpp b/lib/gpu/lj_cut_gpu.cpp index 12fab2f9f1..aef085f7c9 100644 ---
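// ---------------------------------------------------------------------------
// With t_per_atom threads per atom, a block of BX threads now covers only
// BX/t_per_atom atoms, so loop() above scales the grid up by the same
// factor. A host-side sketch of the arithmetic (grid_size is a hypothetical
// helper, not the library's API):
// ---------------------------------------------------------------------------
#include <cmath>
#include <cassert>

static int grid_size(int inum, int block_size, int t_per_atom) {
  assert(block_size % t_per_atom == 0);   // a group never straddles blocks
  int atoms_per_block = block_size/t_per_atom;
  return static_cast<int>(ceil(static_cast<double>(inum)/atoms_per_block));
}
// e.g. inum=1000, block_size=64, t_per_atom=4 -> 16 atoms/block -> GX=63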
a/lib/gpu/lj_cut_gpu.cpp +++ b/lib/gpu/lj_cut_gpu.cpp @@ -28,12 +28,11 @@ static LJL_GPU_Memory LJLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool ljl_gpu_init(const int ntypes, double **cutsq, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen) { +int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { LJLMF.clear(); gpu_mode=LJLMF.device->gpu_mode(); double gpu_split=LJLMF.device->particle_split(); @@ -54,13 +53,11 @@ bool ljl_gpu_init(const int ntypes, double **cutsq, fflush(screen); } - if (world_me==0) { - bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); LJLMF.device->world_barrier(); if (message) @@ -75,45 +72,45 @@ bool ljl_gpu_init(const int ntypes, double **cutsq, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, - screen); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen); + LJLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + LJLMF.estimate_gpu_overhead(); + return init_ok; } void ljl_gpu_clear() { LJLMF.clear(); } -int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int ** ljl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success) { - return LJLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success); + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return LJLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); } -void ljl_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success) { - 
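// ---------------------------------------------------------------------------
// ljl_gpu_init() above initializes world rank 0 first (so screen messages
// and kernel compilation happen once), barriers, then lets the ranks that
// share each GPU take turns. A sketch of that control flow; staggered_init,
// init_resource and procs_per_gpu are hypothetical stand-ins for the
// per-style init() call and the size of the device-sharing communicator.
// ---------------------------------------------------------------------------
#include <mpi.h>

int staggered_init(MPI_Comm world, MPI_Comm gpu_comm, int procs_per_gpu,
                   int (*init_resource)()) {
  int world_me, gpu_rank;
  MPI_Comm_rank(world, &world_me);
  MPI_Comm_rank(gpu_comm, &gpu_rank);

  int init_ok = 0;
  if (world_me == 0)                  // world rank 0 goes first
    init_ok = init_resource();
  MPI_Barrier(world);

  for (int i = 0; i < procs_per_gpu; i++) {
    if (gpu_rank == i && world_me != 0)
      init_ok = init_resource();      // ranks sharing a GPU take turns
    MPI_Barrier(gpu_comm);
  }
  return init_ok;                     // 0 on success, negative code on error
}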
LJLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, +void ljl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + LJLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } diff --git a/lib/gpu/lj_cut_gpu_kernel.cu b/lib/gpu/lj_cut_gpu_kernel.cu index 0e72e41f36..75f36446f7 100644 --- a/lib/gpu/lj_cut_gpu_kernel.cu +++ b/lib/gpu/lj_cut_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef LJ_GPU_KERNEL #define LJ_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -46,7 +44,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; #ifdef _DOUBLE_DOUBLE @@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define __inline inline #define fetch_pos(i,y) x_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -82,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -175,49 +229,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const int nall, const int nbor_pitch, + const int 
t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; - if (ii<4) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/lj_cut_gpu_memory.cpp b/lib/gpu/lj_cut_gpu_memory.cpp index 23b2fcf6d0..a294eb647f 100644 --- a/lib/gpu/lj_cut_gpu_memory.cpp +++ b/lib/gpu/lj_cut_gpu_memory.cpp @@ -42,22 +42,26 @@ int LJL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool LJL_GPU_MemoryT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj_cut_gpu_kernel); +int LJL_GPU_MemoryT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_cut_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -84,7 +88,7 @@ bool LJL_GPU_MemoryT::init(const int ntypes, _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -122,9 +126,10 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int 
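// ---------------------------------------------------------------------------
// The tid/ii/offset arithmetic above fans t_per_atom threads out over each
// atom; every lane then walks the neighbor list with stride t_per_atom so
// the group covers it jointly. A sketch of the mapping with a hypothetical
// neighbor-count array (the real kernels walk a packed pitched list):
// ---------------------------------------------------------------------------
__global__ void k_map(int inum, int t_per_atom, const int *nbor_count) {
  int tid = threadIdx.x;
  int ii = blockIdx.x*(blockDim.x/t_per_atom) + tid/t_per_atom; // atom index
  int offset = tid % t_per_atom;       // this thread's lane within the atom

  if (ii < inum) {
    int n = nbor_count[ii];
    for (int jj = offset; jj < n; jj += t_per_atom) {
      // accumulate a partial force for neighbor jj of atom ii here;
      // the partials are merged by the shared-memory reduction shown above
    }
  }
}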
nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -133,16 +138,18 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, - &ainum, &anall, &nbor_pitch); + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &anall, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, - &anall, &nbor_pitch); + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &anall, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lj_cut_gpu_memory.h b/lib/gpu/lj_cut_gpu_memory.h index 123b739649..4b86b133a1 100644 --- a/lib/gpu/lj_cut_gpu_memory.h +++ b/lib/gpu/lj_cut_gpu_memory.h @@ -29,13 +29,20 @@ class LJL_GPU_Memory : public AtomicGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double **host_cutsq, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for the GPU + * - -5 if double precision is not supported on the card **/ + int init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/ljc_cut_gpu.cpp b/lib/gpu/ljc_cut_gpu.cpp index 955a25adce..de6f4f3e62 100644 --- a/lib/gpu/ljc_cut_gpu.cpp +++ b/lib/gpu/ljc_cut_gpu.cpp @@ -28,13 +28,13 @@ static LJC_GPU_Memory LJCMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double **host_cut_coulsq, - double *host_special_coul, const double qqrd2e) { +int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int
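// ---------------------------------------------------------------------------
// Since init() now reports an int instead of a bool, a caller can map the
// documented codes to user-facing errors. A hedged host-side sketch; the
// message strings are illustrative, not taken from the library.
// ---------------------------------------------------------------------------
const char *init_error(int code) {
  switch (code) {
    case  0: return "success";
    case -1: return "fix gpu not found";
    case -3: return "out of memory on device";
    case -4: return "GPU library not compiled for this accelerator";
    case -5: return "double precision not supported on this card";
    default: return "unknown GPU library error";
  }
}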
inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { LJCMF.clear(); gpu_mode=LJCMF.device->gpu_mode(); double gpu_split=LJCMF.device->particle_split(); @@ -55,15 +55,12 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, fflush(screen); } - if (world_me==0) { - bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen, - host_cut_ljsq, host_cut_coulsq, host_special_coul, - qqrd2e); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e); LJCMF.device->world_barrier(); if (message) @@ -78,48 +75,51 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, - screen, host_cut_ljsq, host_cut_coulsq, - host_special_coul, qqrd2e); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e); + LJCMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + LJCMF.estimate_gpu_overhead(); + return init_ok; } void ljc_gpu_clear() { LJCMF.clear(); } -int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** ljc_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q) { - return LJCMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success, host_q); + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return LJCMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); } -void ljc_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q) { - LJCMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, - firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, - host_q); +void ljc_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const 
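// ---------------------------------------------------------------------------
// The charged kernels scale each interaction by special-bond factors that
// ride in the top bits of the packed neighbor entry: sbmask() above pulls
// out a 2-bit slot into the sp_lj table, and masking the low bits recovers
// the real neighbor index. A sketch assuming SBBITS=30 (the define lives
// outside this hunk), which leaves two high bits over a 30-bit index.
// ---------------------------------------------------------------------------
#include <cstdio>

#define SBBITS 30
static inline int sbmask(int j) { return j >> SBBITS & 3; }

int main() {
  int packed = (1 << SBBITS) | 12345;     // special slot 1, neighbor 12345
  int j = packed & ((1 << SBBITS) - 1);   // recover the real neighbor index
  printf("neighbor %d uses special factor slot %d\n", j, sbmask(packed));
  return 0;
}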
bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + LJCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, + vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); } double ljc_gpu_bytes() { diff --git a/lib/gpu/ljc_cut_gpu_kernel.cu b/lib/gpu/ljc_cut_gpu_kernel.cu index 2751e20702..44a607588a 100644 --- a/lib/gpu/ljc_cut_gpu_kernel.cu +++ b/lib/gpu/ljc_cut_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef LJC_GPU_KERNEL #define LJC_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -46,7 +44,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; texture q_tex; @@ -82,6 +80,8 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -92,13 +92,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_ , __global numtyp *cutsq, - const numtyp qqrd2e) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, __global numtyp *q_ , + __global numtyp *cutsq, const numtyp qqrd2e, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -109,29 +113,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -209,54 +266,69 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_ , __global numtyp *_cutsq, - const numtyp qqrd2e) { - // ii indexes the two interacting particles in gi - int 
ii=THREAD_ID_X; + const numtyp qqrd2e, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/ljc_cut_gpu_memory.cpp b/lib/gpu/ljc_cut_gpu_memory.cpp index d63ed6e5d9..642ff6ecc7 100644 --- a/lib/gpu/ljc_cut_gpu_memory.cpp +++ b/lib/gpu/ljc_cut_gpu_memory.cpp @@ -43,24 +43,28 @@ int LJC_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool LJC_GPU_MemoryT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, - double **host_cut_ljsq, double **host_cut_coulsq, - double *host_special_coul, const double qqrd2e) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,ljc_cut_gpu_kernel); +int LJC_GPU_MemoryT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,ljc_cut_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -95,7 +99,7 @@ bool LJC_GPU_MemoryT::init(const int ntypes, _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+ sp_lj.row_bytes(); - 
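// ---------------------------------------------------------------------------
// kernel_pair_fast above stages the per-type-pair coefficient tables in
// shared memory once per block, one entry per thread, and only reads them
// after __syncthreads(). A sketch of the staging step (coef_in stands in
// for lj1_in/lj3_in; it assumes blockDim.x >= MAX_SHARED_TYPES^2, which the
// BLOCK_PAIR=64 blocks above satisfy exactly):
// ---------------------------------------------------------------------------
#define MAX_SHARED_TYPES 8

__global__ void k_stage(const float4 *coef_in, float4 *out, int n) {
  int tid = threadIdx.x;
  __shared__ float4 coef[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  if (tid < MAX_SHARED_TYPES*MAX_SHARED_TYPES)
    coef[tid] = coef_in[tid];       // cooperative load, one entry per thread
  __syncthreads();                  // table is now visible block-wide
  if (tid < n)
    out[tid] = coef[tid];           // stand-in for the force inner loop
}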
return true; + return 0; } template @@ -134,9 +138,10 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -145,19 +150,20 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &cutsq.begin(), - &_qqrd2e); + &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), - &cutsq.begin(), &_qqrd2e); + &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/ljc_cut_gpu_memory.h b/lib/gpu/ljc_cut_gpu_memory.h index 4dedce957a..552f9d9881 100644 --- a/lib/gpu/ljc_cut_gpu_memory.h +++ b/lib/gpu/ljc_cut_gpu_memory.h @@ -29,15 +29,22 @@ class LJC_GPU_Memory : public ChargeGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, double **host_cut_ljsq, - double **host_cut_coulsq, double *host_special_coul, - const double qqrd2e); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for the GPU + * - -5 if double precision is not supported on the card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + double **host_cut_coulsq, double *host_special_coul, + const double qqrd2e); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/ljcl_cut_gpu.cpp b/lib/gpu/ljcl_cut_gpu.cpp index 8fa15998bf..167f41b374 100644 --- a/lib/gpu/ljcl_cut_gpu.cpp +++ b/lib/gpu/ljcl_cut_gpu.cpp @@ -28,14 +28,14 @@ static LJCL_GPU_Memory LJCLMF; // --------------------------------------------------------------------------- // Allocate memory on host and
device and copy constants to device // --------------------------------------------------------------------------- -bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald) { +int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { LJCLMF.clear(); gpu_mode=LJCLMF.device->gpu_mode(); double gpu_split=LJCLMF.device->particle_split(); @@ -56,15 +56,12 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, fflush(screen); } - if (world_me==0) { - bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen, - host_cut_ljsq, host_cut_coulsq, host_special_coul, - qqrd2e,g_ewald); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); LJCLMF.device->world_barrier(); if (message) @@ -79,48 +76,51 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, - screen, host_cut_ljsq, host_cut_coulsq, - host_special_coul, qqrd2e, g_ewald); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); + LJCLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + LJCLMF.estimate_gpu_overhead(); + return init_ok; } void ljcl_gpu_clear() { LJCLMF.clear(); } -int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** ljcl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q) { - return LJCLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success, host_q); + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return LJCLMF.compute(ago, inum_full, nall, host_x, 
host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); } -void ljcl_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q) { - LJCLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, +void ljcl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + LJCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, - host_q); + host_q,nlocal,boxlo,prd); } double ljcl_gpu_bytes() { diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu index a0b27f0259..7be7a86114 100644 --- a/lib/gpu/ljcl_cut_gpu_kernel.cu +++ b/lib/gpu/ljcl_cut_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef LJCL_GPU_KERNEL #define LJCL_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -54,7 +52,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; texture q_tex; @@ -90,6 +88,8 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -100,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -117,29 +121,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + 
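// ---------------------------------------------------------------------------
// The ljcl kernels evaluate the real-space part of the Ewald sum, with
// g_ewald as the splitting parameter. A plain-math reference sketch of the
// pairwise term (the device code uses a fast erfc approximation and folds
// in the special-bond factors; this host version is for checking only):
// ---------------------------------------------------------------------------
#include <cmath>

struct EwaldPair { double energy, fpair; };   // fpair = |F|/r

EwaldPair ewald_real(double qqrd2e, double qi, double qj,
                     double r, double g_ewald) {
  const double two_over_sqrt_pi = 1.1283791670955126;
  double erfc_gr = erfc(g_ewald*r);
  double prefactor = qqrd2e*qi*qj;
  EwaldPair p;
  p.energy = prefactor*erfc_gr/r;
  p.fpair  = prefactor*(erfc_gr + two_over_sqrt_pi*g_ewald*r*
                        exp(-g_ewald*g_ewald*r*r))/(r*r*r);
  return p;
}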
for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -225,52 +282,68 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const numtyp qqrd2e, const numtyp g_ewald, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/ljcl_cut_gpu_memory.cpp b/lib/gpu/ljcl_cut_gpu_memory.cpp index a126309a92..f37e6b1857 100644 --- a/lib/gpu/ljcl_cut_gpu_memory.cpp +++ b/lib/gpu/ljcl_cut_gpu_memory.cpp @@ -43,7 +43,7 @@ int LJCL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool LJCL_GPU_MemoryT::init(const int ntypes, +int LJCL_GPU_MemoryT::init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, @@ -54,14 +54,18 @@ bool LJCL_GPU_MemoryT::init(const int ntypes, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,ljcl_cut_gpu_kernel); + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,ljcl_cut_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types 
&& this->_block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -94,7 +98,7 @@ bool LJCL_GPU_MemoryT::init(const int ntypes, _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -132,9 +136,10 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -143,19 +148,21 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, - &_qqrd2e, &_g_ewald); + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), - &_cut_coulsq, &_qqrd2e, &_g_ewald); + &_cut_coulsq, &_qqrd2e, &_g_ewald, + &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/ljcl_cut_gpu_memory.h b/lib/gpu/ljcl_cut_gpu_memory.h index 056ba0e41f..fae4c07040 100644 --- a/lib/gpu/ljcl_cut_gpu_memory.h +++ b/lib/gpu/ljcl_cut_gpu_memory.h @@ -29,15 +29,22 @@ class LJCL_GPU_Memory : public ChargeGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double **host_cutsq, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, double **host_cut_ljsq, - const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for the GPU + * - -5 if double precision is not supported on the card **/ + int init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); /// Clear all host and device data /** \note This is called at the
beginning of the init() routine **/ diff --git a/lib/gpu/pair_gpu_atom.cpp b/lib/gpu/pair_gpu_atom.cpp index 0ca2345087..e34a15c0b9 100644 --- a/lib/gpu/pair_gpu_atom.cpp +++ b/lib/gpu/pair_gpu_atom.cpp @@ -29,9 +29,8 @@ __win_sort _win_sort; #endif template -PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false), - _vflag(false),_inum(0),_ilist(NULL), - _newton(false) { +PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false), + _max_gpu_bytes(0) { #ifndef USE_OPENCL sort_config.op = CUDPP_ADD; sort_config.datatype = CUDPP_UINT; @@ -56,28 +55,20 @@ int PairGPUAtomT::bytes_per_atom() const { int id_space=0; if (_gpu_nbor) id_space=2; - int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space; + int bytes=4*sizeof(numtyp)+id_space; if (_rot) - bytes+=4*sizeof(numtyp)+4*sizeof(acctyp); + bytes+=4*sizeof(numtyp); if (_charge) bytes+=sizeof(numtyp); return bytes; } template -bool PairGPUAtomT::alloc(const int inum, const int nall) { +bool PairGPUAtomT::alloc(const int nall) { _max_atoms=static_cast(static_cast(nall)*1.10); - if (_newton) - _max_local=_max_atoms; - else - _max_local=static_cast(static_cast(inum)*1.10); bool success=true; - int ans_elements=4; - if (_rot) - ans_elements+=4; - // Ignore host/device transfers? bool cpuview=false; if (dev->device_type()==UCL_CPU) @@ -107,8 +98,6 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) { success=success && (host_x.alloc(_max_atoms*4,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); #endif - success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS); - success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS); // Buffer for casting only if different precisions if (_charge) success=success && (host_q.alloc(_max_atoms,*dev, @@ -120,15 +109,13 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) { // --------------------------- Device allocations - _gpu_bytes=0; + int gpu_bytes=0; if (cpuview) { #ifdef GPU_CAST assert(0==1); #else dev_x.view(host_x); #endif - dev_engv.view(host_engv); - dev_ans.view(host_ans); if (_rot) dev_quat.view(host_quat); if (_charge) @@ -140,49 +127,80 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) { dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)); success=success && (UCL_SUCCESS== dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)); - _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes(); + gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes(); #else success=success && (UCL_SUCCESS== dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY)); #endif - success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev, - UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && (dev_ans.alloc(ans_elements*_max_local, - *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); if (_charge) { success=success && (dev_q.alloc(_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); - _gpu_bytes+=dev_q.row_bytes(); + gpu_bytes+=dev_q.row_bytes(); } if (_rot) { success=success && (dev_quat.alloc(_max_atoms*4,*dev, UCL_READ_ONLY)==UCL_SUCCESS); - _gpu_bytes+=dev_quat.row_bytes(); + gpu_bytes+=dev_quat.row_bytes(); } } if (_gpu_nbor) { success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); - _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes(); + gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes(); if (_bonds) { success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS); - _gpu_bytes+=dev_tag.row_bytes(); + gpu_bytes+=dev_tag.row_bytes(); } } - 
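// ---------------------------------------------------------------------------
// alloc() above sizes device storage for 10% more atoms than requested
// (_max_atoms = nall*1.10) so small fluctuations in ghost counts between
// neighbor rebuilds do not trigger a reallocation every step. A sketch of
// the policy with a hypothetical buffer type (contents are not preserved
// across growth, matching the clear_resize()/alloc() path above):
// ---------------------------------------------------------------------------
#include <cstdlib>

struct GrowBuf {
  double *data = nullptr;
  int capacity = 0;
  bool reserve(int n) {                     // returns false on OOM
    if (n <= capacity) return true;         // still fits: nothing to do
    capacity = static_cast<int>(n*1.10);    // 10% slack, as in alloc()
    free(data);
    data = static_cast<double*>(malloc(sizeof(double)*capacity));
    return data != nullptr;
  }
};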
_gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes(); + gpu_bytes+=dev_x.row_bytes(); + if (gpu_bytes>_max_gpu_bytes) + _max_gpu_bytes=gpu_bytes; _allocated=true; return success; } template -bool PairGPUAtomT::init(const int inum, const int nall, const bool charge, - const bool rot, UCL_Device &devi, const bool gpu_nbor, +bool PairGPUAtomT::add_fields(const bool charge, const bool rot, + const bool gpu_nbor, const bool bonds) { + bool realloc=false; + if (charge && _charge==false) { + _charge=true; + realloc=true; + } + if (rot && _rot==false) { + _rot=true; + realloc=true; + } + if (gpu_nbor && _gpu_nbor==false) { + _gpu_nbor=true; + realloc=true; + } + if (bonds && _bonds==false) { + _bonds=true; + realloc=true; + } + if (realloc) { + _other=_charge || _rot; + int max_atoms=_max_atoms; + clear_resize(); + return alloc(max_atoms); + } + return true; +} + +template +bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot, + UCL_Device &devi, const bool gpu_nbor, const bool bonds) { clear(); bool success=true; + _x_avail=false; + _q_avail=false; + _quat_avail=false; + _resized=false; _gpu_nbor=gpu_nbor; _bonds=bonds; _charge=charge; @@ -190,33 +208,25 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge, _other=_charge || _rot; dev=&devi; - _e_fields=1; - if (_charge) - _e_fields++; - _ev_fields=6+_e_fields; - // Initialize atom and nbor data - int ef_inum=inum; - if (ef_inum==0) - ef_inum=1000; int ef_nall=nall; - if (ef_nall<=ef_inum) - ef_nall=ef_inum*2; + if (ef_nall==0) + ef_nall=2000; // Initialize timers for the selected device time_pos.init(*dev); - time_other.init(*dev); - time_answer.init(*dev); + time_q.init(*dev); + time_quat.init(*dev); time_pos.zero(); - time_other.zero(); - time_answer.zero(); + time_q.zero(); + time_quat.zero(); _time_cast=0.0; #ifdef GPU_CAST compile_kernels(*dev); #endif - return success && alloc(ef_inum,ef_nall); + return success && alloc(ef_nall); } template @@ -234,16 +244,12 @@ void PairGPUAtomT::clear_resize() { dev_quat.clear(); host_quat.clear(); } - dev_ans.clear(); - dev_engv.clear(); #ifndef GPU_CAST host_x.clear(); #else host_x_cast.clear(); host_type_cast.clear(); #endif - host_ans.clear(); - host_engv.clear(); dev_cell_id.clear(); dev_particle_id.clear(); dev_tag.clear(); @@ -261,17 +267,14 @@ void PairGPUAtomT::clear_resize() { template void PairGPUAtomT::clear() { - _gpu_bytes=0; + _max_gpu_bytes=0; if (!_allocated) return; time_pos.clear(); - time_other.clear(); - time_answer.clear(); + time_q.clear(); + time_quat.clear(); clear_resize(); - _inum=0; - _eflag=false; - _vflag=false; #ifdef GPU_CAST if (_compiled) { @@ -289,255 +292,10 @@ double PairGPUAtomT::host_memory_usage() const { atom_bytes+=1; if (_rot) atom_bytes+=4; - int ans_bytes=atom_bytes+_ev_fields; return _max_atoms*atom_bytes*sizeof(numtyp)+ - ans_bytes*(_max_local)*sizeof(acctyp)+ sizeof(PairGPUAtom); } -template -void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool vf_atom) { - time_answer.start(); - _eflag=eflag; - _vflag=vflag; - _ef_atom=ef_atom; - _vf_atom=vf_atom; - - int csize=_ev_fields; - if (!eflag) - csize-=_e_fields; - if (!vflag) - csize-=6; - - if (csize>0) - ucl_copy(host_engv,dev_engv,_inum*csize,true); - if (_rot) - ucl_copy(host_ans,dev_ans,_inum*4*2,true); - else - ucl_copy(host_ans,dev_ans,_inum*4,true); - time_answer.stop(); -} - -template -void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool vf_atom, 
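// ---------------------------------------------------------------------------
// add_fields() above lets a second pair style reuse shared atom storage:
// it merges the requested charge/quaternion/neighbor flags into the current
// ones and reallocates only when a flag actually flipped. A condensed
// sketch of the idiom (member names are abbreviated stand-ins, not the
// class above):
// ---------------------------------------------------------------------------
struct AtomStore {
  bool charge = false, rot = false;
  bool realloc_storage() { /* free and re-alloc with current flags */
                           return true; }
  bool add_fields(bool want_charge, bool want_rot) {
    bool changed = false;
    if (want_charge && !charge) { charge = true; changed = true; }
    if (want_rot && !rot)       { rot = true;    changed = true; }
    return changed ? realloc_storage() : true;  // realloc only if needed
  }
};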
- int *ilist) { - _ilist=ilist; - copy_answers(eflag,vflag,ef_atom,vf_atom); -} - -template -double PairGPUAtomT::energy_virial(double *eatom, double **vatom, - double *virial) { - if (_eflag==false && _vflag==false) - return 0.0; - - double evdwl=0.0; - if (_gpu_nbor) { - for (int i=0; i<_inum; i++) { - acctyp *ap=host_engv.begin()+i; - if (_eflag) { - if (_ef_atom) { - evdwl+=*ap; - eatom[i]+=*ap*0.5; - ap+=_inum; - } else { - evdwl+=*ap; - ap+=_inum; - } - } - if (_vflag) { - if (_vf_atom) { - for (int j=0; j<6; j++) { - vatom[i][j]+=*ap*0.5; - virial[j]+=*ap; - ap+=_inum; - } - } else { - for (int j=0; j<6; j++) { - virial[j]+=*ap; - ap+=_inum; - } - } - } - } - for (int j=0; j<6; j++) - virial[j]*=0.5; - } else { - for (int i=0; i<_inum; i++) { - acctyp *ap=host_engv.begin()+i; - int ii=_ilist[i]; - if (_eflag) { - if (_ef_atom) { - evdwl+=*ap; - eatom[ii]+=*ap*0.5; - ap+=_inum; - } else { - evdwl+=*ap; - ap+=_inum; - } - } - if (_vflag) { - if (_vf_atom) { - for (int j=0; j<6; j++) { - vatom[ii][j]+=*ap*0.5; - virial[j]+=*ap; - ap+=_inum; - } - } else { - for (int j=0; j<6; j++) { - virial[j]+=*ap; - ap+=_inum; - } - } - } - } - for (int j=0; j<6; j++) - virial[j]*=0.5; - } - - evdwl*=0.5; - return evdwl; -} - -template -double PairGPUAtomT::energy_virial(double *eatom, double **vatom, - double *virial, double &ecoul) { - if (_eflag==false && _vflag==false) { - ecoul=0.0; - return 0.0; - } - - if (_charge==false) - return energy_virial(eatom,vatom,virial); - - double evdwl=0.0; - double _ecoul=0.0; - if (_gpu_nbor) { - for (int i=0; i<_inum; i++) { - acctyp *ap=host_engv.begin()+i; - if (_eflag) { - if (_ef_atom) { - evdwl+=*ap; - eatom[i]+=*ap*0.5; - ap+=_inum; - _ecoul+=*ap; - eatom[i]+=*ap*0.5; - ap+=_inum; - } else { - evdwl+=*ap; - ap+=_inum; - _ecoul+=*ap; - ap+=_inum; - } - } - if (_vflag) { - if (_vf_atom) { - for (int j=0; j<6; j++) { - vatom[i][j]+=*ap*0.5; - virial[j]+=*ap; - ap+=_inum; - } - } else { - for (int j=0; j<6; j++) { - virial[j]+=*ap; - ap+=_inum; - } - } - } - } - for (int j=0; j<6; j++) - virial[j]*=0.5; - } else { - for (int i=0; i<_inum; i++) { - acctyp *ap=host_engv.begin()+i; - int ii=_ilist[i]; - if (_eflag) { - if (_ef_atom) { - evdwl+=*ap; - eatom[ii]+=*ap*0.5; - ap+=_inum; - _ecoul+=*ap; - eatom[ii]+=*ap*0.5; - ap+=_inum; - } else { - evdwl+=*ap; - ap+=_inum; - _ecoul+=*ap; - ap+=_inum; - } - } - if (_vflag) { - if (_vf_atom) { - for (int j=0; j<6; j++) { - vatom[ii][j]+=*ap*0.5; - virial[j]+=*ap; - ap+=_inum; - } - } else { - for (int j=0; j<6; j++) { - virial[j]+=*ap; - ap+=_inum; - } - } - } - } - for (int j=0; j<6; j++) - virial[j]*=0.5; - } - - evdwl*=0.5; - ecoul+=_ecoul*0.5; - return evdwl; -} - -template -void PairGPUAtomT::get_answers(double **f, double **tor) { - acctyp *ap=host_ans.begin(); - if (_gpu_nbor) { - for (int i=0; i<_inum; i++) { - f[i][0]+=*ap; - ap++; - f[i][1]+=*ap; - ap++; - f[i][2]+=*ap; - ap+=2; - } - if (_rot) { - for (int i=0; i<_inum; i++) { - tor[i][0]+=*ap; - ap++; - tor[i][1]+=*ap; - ap++; - tor[i][2]+=*ap; - ap+=2; - } - } - } else { - for (int i=0; i<_inum; i++) { - int ii=_ilist[i]; - f[ii][0]+=*ap; - ap++; - f[ii][1]+=*ap; - ap++; - f[ii][2]+=*ap; - ap+=2; - } - if (_rot) { - for (int i=0; i<_inum; i++) { - int ii=_ilist[i]; - tor[ii][0]+=*ap; - ap++; - tor[ii][1]+=*ap; - ap++; - tor[ii][2]+=*ap; - ap+=2; - } - } - } -} - // Sort arrays for neighbor list calculation template void PairGPUAtomT::sort_neighbor(const int num_atoms) { diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h index 
e0a1fd9fb1..526c146f37 100644 --- a/lib/gpu/pair_gpu_atom.h +++ b/lib/gpu/pair_gpu_atom.h @@ -23,7 +23,6 @@ #ifdef USE_OPENCL -#include "geryon/ocl_device.h" #include "geryon/ocl_timer.h" #include "geryon/ocl_mat.h" #include "geryon/ocl_kernel.h" @@ -32,7 +31,6 @@ using namespace ucl_opencl; #else #include "cudpp.h" -#include "geryon/nvd_device.h" #include "geryon/nvd_timer.h" #include "geryon/nvd_mat.h" #include "geryon/nvd_kernel.h" @@ -40,10 +38,6 @@ using namespace ucl_cudadr; #endif -#ifndef int2 -struct int2 { int x; int y; }; -#endif - #include "pair_gpu_precision.h" template @@ -56,13 +50,9 @@ class PairGPUAtom { inline int max_atoms() const { return _max_atoms; } /// Current number of local+ghost atoms stored inline int nall() const { return _nall; } - /// Current number of local atoms stored - inline int inum() const { return _inum; } /// Set number of local+ghost atoms for future copy operations inline void nall(const int n) { _nall=n; } - /// Set number of local atoms for future copy operations - inline void inum(const int n) { _inum=n; } /// Memory usage per atom in this class int bytes_per_atom() const; @@ -70,21 +60,33 @@ class PairGPUAtom { /// Clear any previous data and set up for a new LAMMPS run /** \param rot True if atom storage needs quaternions * \param gpu_nbor True if neighboring will be performed on device **/ - bool init(const int inum, const int nall, const bool charge, const bool rot, + bool init(const int nall, const bool charge, const bool rot, UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false); /// Check if we have enough device storage and realloc if not - inline bool resize(const int inum, const int nall, bool &success) { - _inum=inum; + /** Returns true if resized with any call during this timestep **/ + inline bool resize(const int nall, bool &success) { _nall=nall; - if (inum>_max_local || nall>_max_atoms) { + if (nall>_max_atoms) { clear_resize(); - success = success && alloc(inum,nall); - return true; + success = success && alloc(nall); + _resized=true; } - return false; + return _resized; } - + + /// If already initialized by another LAMMPS style, add fields as necessary + /** \param rot True if atom storage needs quaternions + * \param gpu_nbor True if neighboring will be performed on device **/ + bool add_fields(const bool charge, const bool rot, const bool gpu_nbor, + const bool bonds); + + /// Returns true if GPU is using charges + bool charge() { return _charge; } + + /// Returns true if GPU is using quaternions + bool quat() { return _rot; } + /// Only free matrices of length inum or nall for resizing void clear_resize(); @@ -100,28 +102,42 @@ class PairGPUAtom { /// Add copy times to timers inline void acc_timers() { time_pos.add_to_total(); - time_answer.add_to_total(); - if (_other) - time_other.add_to_total(); + if (_charge) + time_q.add_to_total(); + if (_rot) + time_quat.add_to_total(); } /// Add copy times to timers inline void zero_timers() { time_pos.zero(); - time_answer.zero(); - if (_other) - time_other.zero(); + if (_charge) + time_q.zero(); + if (_rot) + time_quat.zero(); } /// Return the total time for host/device data transfer + /** Zeros the total so that the atom times are only included once **/ inline double transfer_time() { - double total=time_pos.total_seconds()+time_answer.total_seconds(); - if (_other) total+=time_other.total_seconds(); + double total=time_pos.total_seconds(); + time_pos.zero_total(); + if (_charge) { + total+=time_q.total_seconds(); + time_q.zero_total(); + } + if (_rot) { + 
total+=time_quat.total_seconds(); + time_quat.zero_total(); + } + return total; } /// Return the total time for data cast/pack - inline double cast_time() { return _time_cast; } + /** Zeros the time so that atom times are only included once **/ + inline double cast_time() + { double t=_time_cast; _time_cast=0.0; return t; } /// Pack LAMMPS atom type constants into matrix and copy to device template @@ -216,43 +232,52 @@ class PairGPUAtom { // -------------------------COPY TO GPU ---------------------------------- + /// Signal that we need to transfer atom data for next timestep + inline void data_unavail() + { _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; } + /// Cast positions and types to write buffer inline void cast_x_data(double **host_ptr, const int *host_type) { - double t=MPI_Wtime(); - #ifdef GPU_CAST - memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); - memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int)); - #else - numtyp *_write_loc=host_x.begin(); - for (int i=0; i<_nall; i++) { - *_write_loc=host_ptr[i][0]; - _write_loc++; - *_write_loc=host_ptr[i][1]; - _write_loc++; - *_write_loc=host_ptr[i][2]; - _write_loc++; - *_write_loc=host_type[i]; - _write_loc++; + if (_x_avail==false) { + double t=MPI_Wtime(); + #ifdef GPU_CAST + memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); + memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int)); + #else + numtyp *_write_loc=host_x.begin(); + for (int i=0; i<_nall; i++) { + *_write_loc=host_ptr[i][0]; + _write_loc++; + *_write_loc=host_ptr[i][1]; + _write_loc++; + *_write_loc=host_ptr[i][2]; + _write_loc++; + *_write_loc=host_type[i]; + _write_loc++; + } + #endif + _time_cast+=MPI_Wtime()-t; } - #endif - _time_cast+=MPI_Wtime()-t; - } + } /// Copy positions and types to device asynchronously /** Copies nall() elements **/ inline void add_x_data(double **host_ptr, int *host_type) { time_pos.start(); - #ifdef GPU_CAST - ucl_copy(dev_x_cast,host_x_cast,_nall*3,true); - ucl_copy(dev_type_cast,host_type_cast,_nall,true); - int block_size=64; - int GX=static_cast(ceil(static_cast(_nall)/block_size)); - k_cast_x.set_size(GX,block_size); - k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), - &_nall); - #else - ucl_copy(dev_x,host_x,_nall*4,true); - #endif + if (_x_avail==false) { + #ifdef GPU_CAST + ucl_copy(dev_x_cast,host_x_cast,_nall*3,true); + ucl_copy(dev_type_cast,host_type_cast,_nall,true); + int block_size=64; + int GX=static_cast(ceil(static_cast(_nall)/block_size)); + k_cast_x.set_size(GX,block_size); + k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), + &_nall); + #else + ucl_copy(dev_x,host_x,_nall*4,true); + #endif + _x_avail=true; + } time_pos.stop(); } @@ -262,87 +287,68 @@ class PairGPUAtom { add_x_data(host_ptr,host_type); } - /// Cast charges to write buffer + // Cast charges to write buffer template inline void cast_q_data(cpytyp *host_ptr) { - double t=MPI_Wtime(); - if (dev->device_type()==UCL_CPU) { - if (sizeof(numtyp)==sizeof(double)) { - host_q.view((numtyp*)host_ptr,_nall,*dev); - dev_q.view(host_q); - } else - for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; - } else { - if (sizeof(numtyp)==sizeof(double)) - memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp)); - else - for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; + if (_q_avail==false) { + double t=MPI_Wtime(); + if (dev->device_type()==UCL_CPU) { + if (sizeof(numtyp)==sizeof(double)) { + host_q.view((numtyp*)host_ptr,_nall,*dev); + dev_q.view(host_q); + } else +
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; + } else { + if (sizeof(numtyp)==sizeof(double)) + memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp)); + else + for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; + } + _time_cast+=MPI_Wtime()-t; } - _time_cast+=MPI_Wtime()-t; } - /// Copy charges to device asynchronously + // Copy charges to device asynchronously inline void add_q_data() { - ucl_copy(dev_q,host_q,_nall,true); + if (_q_avail==false) { + ucl_copy(dev_q,host_q,_nall,true); + _q_avail=true; + } } - /// Cast quaternions to write buffer + // Cast quaternions to write buffer template inline void cast_quat_data(cpytyp *host_ptr) { - double t=MPI_Wtime(); - if (dev->device_type()==UCL_CPU) { - if (sizeof(numtyp)==sizeof(double)) { - host_quat.view((numtyp*)host_ptr,_nall*4,*dev); - dev_quat.view(host_quat); - } else - for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; - } else { - if (sizeof(numtyp)==sizeof(double)) - memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp)); - else - for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; + if (_quat_avail==false) { + double t=MPI_Wtime(); + if (dev->device_type()==UCL_CPU) { + if (sizeof(numtyp)==sizeof(double)) { + host_quat.view((numtyp*)host_ptr,_nall*4,*dev); + dev_quat.view(host_quat); + } else + for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; + } else { + if (sizeof(numtyp)==sizeof(double)) + memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp)); + else + for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; + } + _time_cast+=MPI_Wtime()-t; } - _time_cast+=MPI_Wtime()-t; } - /// Copy quaternions to device + // Copy quaternions to device /** Copies nall()*4 elements **/ inline void add_quat_data() { - ucl_copy(dev_quat,host_quat,_nall*4,true); + if (_quat_avail==false) { + ucl_copy(dev_quat,host_quat,_nall*4,true); + _quat_avail=true; + } } - /// Copy data other than pos and data to device - inline void add_other_data() { - time_other.start(); - if (_charge) - add_q_data(); - if (_rot) - add_quat_data(); - time_other.stop(); - } - /// Return number of bytes used on device - inline double gpu_bytes() { return _gpu_bytes; } - - // -------------------------COPY FROM GPU ------------------------------- - - /// Copy answers from device into read buffer asynchronously - void copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool vf_atom); - - /// Copy answers from device into read buffer asynchronously - void copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool vf_atom, int *ilist); - - /// Copy energy and virial data into LAMMPS memory - double energy_virial(double *eatom, double **vatom, double *virial); - - /// Copy energy and virial data into LAMMPS memory - double energy_virial(double *eatom, double **vatom, double *virial, - double &ecoul); - - /// Add forces and torques from the GPU into a LAMMPS pointer - void get_answers(double **f, double **tor); + inline double max_gpu_bytes() + { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } // ------------------------------ DATA ---------------------------------- @@ -352,10 +358,6 @@ class PairGPUAtom { UCL_D_Vec dev_q; /// Quaterions UCL_D_Vec dev_quat; - /// Force and possibly torque - UCL_D_Vec dev_ans; - /// Energy and virial per-atom storage - UCL_D_Vec dev_engv; #ifdef GPU_CAST UCL_D_Vec dev_x_cast; @@ -370,10 +372,6 @@ class PairGPUAtom { UCL_H_Vec host_q; /// Buffer for moving quat data to GPU UCL_H_Vec host_quat; - /// Force and possibly torque data on host - UCL_H_Vec host_ans; - /// 
Energy/virial data on host - UCL_H_Vec host_engv; /// Cell list identifiers for device nbor builds UCL_D_Vec dev_cell_id; @@ -383,7 +381,7 @@ class PairGPUAtom { UCL_D_Vec dev_tag; /// Device timers - UCL_Timer time_pos, time_other, time_answer; + UCL_Timer time_pos, time_q, time_quat; /// Geryon device UCL_Device *dev; @@ -396,19 +394,19 @@ class PairGPUAtom { #endif bool _compiled; - - bool alloc(const int inum, const int nall); - bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other; - int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields; + // True if data has been copied to device already + bool _x_avail, _q_avail, _quat_avail, _resized; + + bool alloc(const int nall); + + bool _allocated, _rot, _charge, _other; + int _max_atoms, _nall; bool _gpu_nbor, _bonds; - int *_ilist; double _time_cast; - double _gpu_bytes; + double _max_gpu_bytes; - bool _newton; - #ifndef USE_OPENCL CUDPPConfiguration sort_config; CUDPPHandle sort_plan; diff --git a/lib/gpu/pair_gpu_balance.h b/lib/gpu/pair_gpu_balance.h index a3a0f61a62..9e14ad60d8 100644 --- a/lib/gpu/pair_gpu_balance.h +++ b/lib/gpu/pair_gpu_balance.h @@ -23,7 +23,7 @@ #define _HD_BALANCE_EVERY 25 #define _HD_BALANCE_WEIGHT 0.5 -#define _HD_BALANCE_GAP 1.05 +#define _HD_BALANCE_GAP 1.10 /// Host/device load balancer template @@ -33,7 +33,8 @@ class PairGPUBalance { inline ~PairGPUBalance() { clear(); } /// Clear any old data and setup for new LAMMPS run - inline void init(PairGPUDevice *gpu, const double split); + inline void init(PairGPUDevice *gpu, const bool gpu_nbor, + const double split); /// Clear all host and device data inline void clear() { @@ -43,23 +44,25 @@ class PairGPUBalance { _init_done=false; } } + + /// Return the timestep since initialization + inline int timestep() { return _timestep; } /// Get a count of the number of particles host will handle for initial alloc - inline int first_host_count(const int nlocal,const bool gpu_nbor, - const double gpu_split) const { + inline int first_host_count(const int nlocal, const double gpu_split, + const bool gpu_nbor) const { int host_nlocal=0; if (gpu_nbor && gpu_split!=1.0) { if (gpu_split>0) host_nlocal=static_cast(ceil((1.0-gpu_split)*nlocal)); else - host_nlocal=static_cast(ceil(0.1*nlocal)); + host_nlocal=static_cast(ceil(0.05*nlocal)); } return host_nlocal; } /// Return the number of particles the device will handle this timestep - inline int get_gpu_count(const int timestep, const int ago, - const int inum_full); + inline int get_gpu_count(const int ago, const int inum_full); /// Return the average fraction of particles handled by device on all procs inline double all_avg_split() { @@ -82,10 +85,10 @@ class PairGPUBalance { if (_measure_this_step) { _device->gpu->sync(); _device->gpu_barrier(); + _device->start_host_timer(); _device_time.start(); _device->gpu->sync(); _device->gpu_barrier(); - _device->start_host_timer(); } } @@ -95,34 +98,34 @@ class PairGPUBalance { /// Calculate the new host/device split based on the cpu and device times /** \note Only does calculation every _HD_BALANCE_EVERY timesteps (and first 10) **/ - inline void balance(const double cpu_time, const bool gpu_nbor); + inline void balance(const double cpu_time); /// Calls balance() and then get_gpu_count() - inline int balance(const int timestep, const int ago, const int inum_full, - const double cpu_time, const bool gpu_nbor) { - balance(cpu_time,gpu_nbor); - return get_gpu_count(timestep,ago,inum_full); + inline int balance(const int ago,const int inum_full,const 
double cpu_time) { + balance(cpu_time); + return get_gpu_count(ago,inum_full); } private: PairGPUDevice *_device; UCL_Timer _device_time; - bool _init_done; + bool _init_done, _gpu_nbor; bool _load_balance; double _actual_split, _avg_split, _desired_split, _max_split; int _avg_count; bool _measure_this_step; - int _inum, _inum_full; + int _inum, _inum_full, _timestep; }; #define PairGPUBalanceT PairGPUBalance template -void PairGPUBalanceT::init(PairGPUDevice *gpu, - const double split) { +void PairGPUBalanceT::init(PairGPUDevice *gpu, + const bool gpu_nbor, const double split) { clear(); + _gpu_nbor=gpu_nbor; _init_done=true; _device=gpu; @@ -130,7 +133,7 @@ void PairGPUBalanceT::init(PairGPUDevice *gpu, if (split<0.0) { _load_balance=true; - _desired_split=0.9; + _desired_split=0.90; } else { _load_balance=false; _desired_split=split; @@ -138,14 +141,14 @@ _actual_split=_desired_split; _avg_split=0.0; _avg_count=0; + _timestep=0; } template -int PairGPUBalanceT::get_gpu_count(const int timestep, const int ago, - const int inum_full) { +int PairGPUBalanceT::get_gpu_count(const int ago, const int inum_full) { _measure_this_step=false; if (_load_balance) { - if (_avg_count<11 || timestep%_HD_BALANCE_EVERY==0) { + if (_avg_count<11 || _timestep%_HD_BALANCE_EVERY==0) { _measure_this_step=true; _inum_full=inum_full; } @@ -156,44 +159,44 @@ int PairGPUBalanceT::get_gpu_count(const int ago, const int inum_full) { } _inum=static_cast(floor(_actual_split*inum_full)); if (_inum==0) _inum++; + _timestep++; return _inum; } template -void PairGPUBalanceT::balance(const double cpu_time, const bool gpu_nbor) { +void PairGPUBalanceT::balance(const double cpu_time) { if (_measure_this_step) { + _measure_this_step=false; + double gpu_time=_device_time.seconds(); + + double max_gpu_time; + MPI_Allreduce(&gpu_time,&max_gpu_time,1,MPI_DOUBLE,MPI_MAX, + _device->gpu_comm()); + if (_inum_full==_inum) { _desired_split=1.0; return; } - _measure_this_step=false; - double gpu_time=_device_time.seconds(); + double cpu_time_per_atom=cpu_time/(_inum_full-_inum); + double cpu_other_time=_device->host_time()-cpu_time; + int host_inum=static_cast((max_gpu_time-cpu_other_time)/ + cpu_time_per_atom); - double cpu_gpu_time[3], max_times[3]; - cpu_gpu_time[0]=cpu_time/(_inum_full-_inum); - cpu_gpu_time[1]=gpu_time/_inum; - cpu_gpu_time[2]=(_device->host_time()-cpu_time)/_inum_full; + double split=static_cast(_inum_full-host_inum)/_inum_full; + _desired_split=split*_HD_BALANCE_GAP; + if (_desired_split>1.0) + _desired_split=1.0; + if (_desired_split<0.0) + _desired_split=0.0; - MPI_Allreduce(cpu_gpu_time,max_times,3,MPI_DOUBLE,MPI_MAX, - _device->gpu_comm()); - double split=(max_times[0]+max_times[2])/(max_times[0]+max_times[1]); - split*=_HD_BALANCE_GAP; - - if (split>1.0) - split=1.0; - if (_avg_count<10) - _desired_split=(_desired_split*_avg_count+split)/(_avg_count+1); - else - _desired_split=_desired_split*(1.0-_HD_BALANCE_WEIGHT)+ - _HD_BALANCE_WEIGHT*split; - - if (!gpu_nbor) { + if (!_gpu_nbor) { if (_desired_split<_max_split) _actual_split=_desired_split; else _actual_split=_max_split; } } _avg_split+=_desired_split; _avg_count++; diff --git a/lib/gpu/pair_gpu_build_kernel.cu b/lib/gpu/pair_gpu_build_kernel.cu index bcf41c0050..33742a4cba 100644 --- a/lib/gpu/pair_gpu_build_kernel.cu +++
b/lib/gpu/pair_gpu_build_kernel.cu @@ -18,7 +18,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture neigh_tex; #ifdef _DOUBLE_DOUBLE @@ -36,6 +36,7 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #else #define fetch_pos(i,y) x_[i] +#define BLOCK_NBOR_BUILD 64 #endif @@ -54,29 +55,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define numtyp4 float4 #endif -#define CELL_BLOCK_SIZE 64 -#define BLOCK_2D 8 +#define BLOCK_CELL_2D 8 + +#define SBBITS 30 #define SBBITS 30 __kernel void transpose(int *out, int *in, int columns_in, int rows_in) { - __local float block[BLOCK_2D][BLOCK_2D+1]; + __local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1]; unsigned ti=THREAD_ID_X; unsigned tj=THREAD_ID_Y; unsigned bi=BLOCK_ID_X; unsigned bj=BLOCK_ID_Y; - unsigned i=bi*BLOCK_2D+ti; - unsigned j=bj*BLOCK_2D+tj; + unsigned i=bi*BLOCK_CELL_2D+ti; + unsigned j=bj*BLOCK_CELL_2D+tj; if ((ipid_i) { - diff.x = atom_i.x - pos_sh[j].x; - diff.y = atom_i.y - pos_sh[j].y; - diff.z = atom_i.z - pos_sh[j].z; + diff.x = atom_i.x - pos_sh[j].x; + diff.y = atom_i.y - pos_sh[j].y; + diff.z = atom_i.z - pos_sh[j].z; - r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z; - if (r2 < cell_size*cell_size && r2 > 1e-5) { - if (cnt < neigh_bin_size) { - *neigh_list = pid_j; - neigh_list+=stride; - } - cnt++; - } - } + r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z; + if (r2 < cell_size*cell_size && r2 > 1e-5) { + if (cnt < neigh_bin_size) { + *neigh_list = pid_j; + neigh_list+=stride; + } + cnt++; + } } } __syncthreads(); @@ -249,9 +250,10 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos, } __kernel void kernel_special(__global int *dev_nbor, - __global int *host_nbor_list, __global int *tag, + __global int *host_nbor_list, + __global int *host_numj, __global int *tag, __global int *nspecial, __global int *special, - int inum, int nt, int nall) { + int inum, int nt, int nall, int max_nbors) { // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X; @@ -263,15 +265,17 @@ __kernel void kernel_special(__global int *dev_nbor, int n2=nspecial[ii*3+1]; int n3=nspecial[ii*3+2]; + int numj; if (ii < inum) { stride=inum; list=dev_nbor+stride+ii; + numj=*list; + list+=stride; } else { - stride=nt-inum; - list=host_nbor_list+ii-inum; + stride=1; + list=host_nbor_list+(ii-inum)*max_nbors; + numj=host_numj[ii-inum]; } - int numj=*list; - list+=stride; list_end=list+numj*stride; for ( ; list #include +#ifdef _OPENMP +#include +#endif + +#ifdef USE_OPENCL +#include "pair_gpu_dev_cl.h" +#else +#include "pair_gpu_dev_ptx.h" +#endif #define PairGPUDeviceT PairGPUDevice template PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false), _gpu_mode(GPU_FORCE), _first_device(0), - _last_device(0) { + _last_device(0), _compiled(false) { } template @@ -34,14 +43,19 @@ PairGPUDeviceT::~PairGPUDevice() { } template -bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, - const int first_gpu, const int last_gpu, - const int gpu_mode, const double p_split, - const int nthreads) { +int PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, + const int first_gpu, const int last_gpu, + const int gpu_mode, const double p_split, + const int nthreads, const int t_per_atom) { _nthreads=nthreads; + #ifdef _OPENMP + omp_set_num_threads(nthreads); + #endif + _threads_per_atom=t_per_atom; + _threads_per_charge=t_per_atom; if (_device_init) - return true; + return 0; _device_init=true; _comm_world=world; _comm_replica=replica; @@ -96,7 +110,12 @@ bool 
PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, // set the device ID _procs_per_gpu=static_cast(ceil(static_cast(procs_per_node)/ (last_gpu-first_gpu+1))); - int my_gpu=node_rank/_procs_per_gpu; + int my_gpu=node_rank/_procs_per_gpu+first_gpu; + + // Time on the device only if 1 proc per gpu + _time_device=true; + if (_procs_per_gpu>1) + _time_device=false; // Set up a per device communicator MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); @@ -104,39 +123,109 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, gpu=new UCL_Device(); if (my_gpu>=gpu->num_devices()) - return false; + return -2; gpu->set(my_gpu); - return true; + + _long_range_precompute=0; + + int flag=compile_kernels(); + + return flag; } template -bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal, - const int host_nlocal, const int nall, - const int maxspecial, const bool gpu_nbor, - const int gpu_host, const int max_nbors, - const double cell_size, const bool pre_cut) { +int PairGPUDeviceT::init(PairGPUAns &ans, const bool charge, + const bool rot, const int nlocal, + const int host_nlocal, const int nall, + PairGPUNbor *nbor, const int maxspecial, + const int gpu_host, const int max_nbors, + const double cell_size, const bool pre_cut) { if (!_device_init) - return false; + return -1; + if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) + return -5; + + // Counts of data transfers for timing overhead estimates + _data_in_estimate=0; + _data_out_estimate=1; + + // Initial number of local particles + int ef_nlocal=nlocal; + if (_particle_split<1.0 && _particle_split>0.0) + ef_nlocal=static_cast(_particle_split*nlocal); + + bool gpu_nbor=false; + if (_gpu_mode==GPU_NEIGH) + gpu_nbor=true; + if (_init_count==0) { // Initialize atom and nbor data - int ef_nlocal=nlocal; - if (_particle_split<1.0 && _particle_split>0.0) - ef_nlocal=static_cast(_particle_split*nlocal); - if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor, - gpu_nbor && maxspecial>0)) - return false; - if (!nbor.init(ef_nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor, - gpu_host,pre_cut)) - return false; - nbor.cell_size(cell_size); + if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0)) + return -3; + + _data_in_estimate++; + if (charge) + _data_in_estimate++; + if (rot) + _data_in_estimate++; } else { - if (cell_size>nbor.cell_size()) - nbor.cell_size(cell_size); + if (atom.charge()==false && charge) + _data_in_estimate++; + if (atom.quat()==false && rot) + _data_in_estimate++; + if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor && maxspecial)) + return -3; } + + if (!ans.init(ef_nlocal,charge,rot,*gpu)) + return -3; + + if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, + *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, + _block_cell_id, _block_nbor_build)) + return -3; + nbor->cell_size(cell_size); _init_count++; - return true; + return 0; +} + +template +int PairGPUDeviceT::init(PairGPUAns &ans, const int nlocal, + const int nall) { + if (!_device_init) + return -1; + if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) + return -5; + + if (_init_count==0) { + // Initialize atom and nbor data + if (!atom.init(nall,true,false,*gpu,false,false)) + return -3; + } else + if (!atom.add_fields(true,false,false,false)) + return -3; + + if (!ans.init(nlocal,true,false,*gpu)) + return -3; + + _init_count++; + return 0; +} + +template +void PairGPUDeviceT::set_single_precompute + (PPPMGPUMemory *pppm) { + 
_long_range_precompute=1; + pppm_single=pppm; +} + +template +void PairGPUDeviceT::set_double_precompute + (PPPMGPUMemory *pppm) { + _long_range_precompute=2; + pppm_double=pppm; } template @@ -152,11 +241,17 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name, fprintf(screen,"\n-------------------------------------"); fprintf(screen,"-------------------------------------\n"); fprintf(screen,"- Using GPGPU acceleration for %s:\n",name); - fprintf(screen,"- with %d procs per device.\n",_procs_per_gpu); + fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu); + #ifdef _OPENMP + fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads); + #endif fprintf(screen,"-------------------------------------"); fprintf(screen,"-------------------------------------\n"); - for (int i=first_gpu; i<=last_gpu; i++) { + int last=last_gpu+1; + if (last>gpu->num_devices()) + last=gpu->num_devices(); + for (int i=first_gpu; iname(i)+", "+toa(gpu->cores(i))+" cores, "+fs+ toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+ " GHZ ("; @@ -177,32 +272,152 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name, } template -void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split, - const double max_bytes, FILE *screen) { - double single[5], times[5]; +void PairGPUDeviceT::estimate_gpu_overhead(const int kernel_calls, + double &gpu_overhead, + double &gpu_driver_overhead) { + UCL_H_Vec *host_data_in=NULL, *host_data_out=NULL; + UCL_D_Vec *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL; + UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL; + UCL_Timer over_timer(*gpu); - single[0]=atom.transfer_time(); + if (_data_in_estimate>0) { + host_data_in=new UCL_H_Vec[_data_in_estimate]; + dev_data_in=new UCL_D_Vec[_data_in_estimate]; + timers_in=new UCL_Timer[_data_in_estimate]; + } + + if (_data_out_estimate>0) { + host_data_out=new UCL_H_Vec[_data_out_estimate]; + dev_data_out=new UCL_D_Vec[_data_out_estimate]; + timers_out=new UCL_Timer[_data_out_estimate]; + } + + if (kernel_calls>0) { + kernel_data=new UCL_D_Vec[kernel_calls]; + timers_kernel=new UCL_Timer[kernel_calls]; + } + + for (int i=0; i<_data_in_estimate; i++) { + host_data_in[i].alloc(1,*gpu); + dev_data_in[i].alloc(1,*gpu); + timers_in[i].init(*gpu); + } + + for (int i=0; i<_data_out_estimate; i++) { + host_data_out[i].alloc(1,*gpu); + dev_data_out[i].alloc(1,*gpu); + timers_out[i].init(*gpu); + } + + for (int i=0; isync(); + gpu_barrier(); + over_timer.start(); + gpu->sync(); + gpu_barrier(); + + double driver_time=MPI_Wtime(); + for (int i=0; i<_data_in_estimate; i++) { + timers_in[i].start(); + ucl_copy(dev_data_in[i],host_data_in[i],true); + timers_in[i].stop(); + } + + for (int i=0; i0) { + delete [] host_data_in; + delete [] dev_data_in; + delete [] timers_in; + } + + if (_data_out_estimate>0) { + delete [] host_data_out; + delete [] dev_data_out; + delete [] timers_out; + } + + if (kernel_calls>0) { + delete [] kernel_data; + delete [] timers_kernel; + } +} + +template +void PairGPUDeviceT::output_times(UCL_Timer &time_pair, + PairGPUAns &ans, + PairGPUNbor &nbor, const double avg_split, + const double max_bytes, + const double gpu_overhead, + const double driver_overhead, + const int threads_per_atom, FILE *screen) { + double single[8], times[8]; + + single[0]=atom.transfer_time()+ans.transfer_time(); single[1]=nbor.time_nbor.total_seconds(); single[2]=nbor.time_kernel.total_seconds(); single[3]=time_pair.total_seconds(); - single[4]=atom.cast_time(); + 
single[4]=atom.cast_time()+ans.cast_time(); + single[5]=gpu_overhead; + single[6]=driver_overhead; + single[7]=ans.cpu_idle_time(); - MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica); + MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica); - double my_max_bytes=max_bytes; + double my_max_bytes=max_bytes+atom.max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); if (replica_me()==0) - if (screen && times[3]>0.0) { + if (screen && times[5]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); fprintf(screen," GPU Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (procs_per_gpu()==1) { + if (time_device()) { fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size); @@ -212,7 +427,71 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split, fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size); fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size); } + fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); + fprintf(screen,"Threads / atom: %d.\n",threads_per_atom); + fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size); + fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size); + + fprintf(screen,"-------------------------------------"); + fprintf(screen,"--------------------------------\n\n"); + } +} + +template +void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in, + UCL_Timer &time_out, + UCL_Timer &time_map, + UCL_Timer &time_rho, + UCL_Timer &time_interp, + PairGPUAns &ans, + const double max_bytes, + const double cpu_time, + const double idle_time, FILE *screen) { + double single[8], times[8]; + + single[0]=time_out.total_seconds(); + single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time(); + single[2]=time_map.total_seconds(); + single[3]=time_rho.total_seconds(); + single[4]=time_interp.total_seconds(); + single[5]=ans.transfer_time()+ans.cast_time(); + single[6]=cpu_time; + single[7]=idle_time; + + MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica); + + double my_max_bytes=max_bytes+atom.max_gpu_bytes(); + double mpi_max_bytes; + MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); + double max_mb=mpi_max_bytes/(1024.0*1024.0); + + if (replica_me()==0) + if (screen && times[6]>0.0) { + fprintf(screen,"\n\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + fprintf(screen," GPU Time Info (average): "); + fprintf(screen,"\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + + if (time_device()) { + fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size); + fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size); + fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size); + fprintf(screen,"Kernel (rho): %.4f s.\n",times[3]/_replica_size); + fprintf(screen,"Force interp: %.4f s.\n",times[4]/_replica_size); + fprintf(screen,"Total rho: %.4f s.\n", + (times[0]+times[2]+times[3])/_replica_size); + 
fprintf(screen,"Total interp: %.4f s.\n", + (times[1]+times[4])/_replica_size); + fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size); + fprintf(screen,"Total: %.4f s.\n", + (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/ + _replica_size); + } + fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size); + fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"-------------------------------------"); @@ -223,10 +502,17 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split, template void PairGPUDeviceT::clear() { if (_init_count>0) { + _long_range_precompute=0; _init_count--; if (_init_count==0) { atom.clear(); - nbor.clear(); + _nbor_shared.clear(); + if (_compiled) { + k_zero.clear(); + k_info.clear(); + delete dev_program; + _compiled=false; + } } } } @@ -241,21 +527,80 @@ void PairGPUDeviceT::clear_device() { } } +template +int PairGPUDeviceT::compile_kernels() { + int flag=0; + + if (_compiled) + return flag; + + std::string flags="-cl-mad-enable"; + dev_program=new UCL_Program(*gpu); + int success=dev_program->load_string(pair_gpu_dev_kernel,flags.c_str()); + if (success!=UCL_SUCCESS) + return -4; + k_zero.set_function(*dev_program,"kernel_zero"); + k_info.set_function(*dev_program,"kernel_info"); + _compiled=true; + + UCL_H_Vec h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED); + UCL_D_Vec d_gpu_lib_data(14,*gpu); + k_info.set_size(1,1); + k_info.run(&d_gpu_lib_data.begin()); + ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false); + + #ifndef USE_OPENCL + if (static_cast(h_gpu_lib_data[0])/100.0>gpu->arch()) + return -4; + #endif + + _num_mem_threads=h_gpu_lib_data[1]; + _warp_size=h_gpu_lib_data[2]; + if (_threads_per_atom<1) + _threads_per_atom=h_gpu_lib_data[3]; + if (_threads_per_charge<1) + _threads_per_charge=h_gpu_lib_data[13]; + _pppm_max_spline=h_gpu_lib_data[4]; + _pppm_block=h_gpu_lib_data[5]; + _block_pair=h_gpu_lib_data[6]; + _max_shared_types=h_gpu_lib_data[7]; + _block_cell_2d=h_gpu_lib_data[8]; + _block_cell_id=h_gpu_lib_data[9]; + _block_nbor_build=h_gpu_lib_data[10]; + _block_bio_pair=h_gpu_lib_data[11]; + _max_bio_shared_types=h_gpu_lib_data[12]; + + if (static_cast(_block_pair)>gpu->group_size()) + _block_pair=gpu->group_size(); + if (static_cast(_block_bio_pair)>gpu->group_size()) + _block_bio_pair=gpu->group_size(); + if (_threads_per_atom>_warp_size) + _threads_per_atom=_warp_size; + if (_warp_size%_threads_per_atom!=0) + _threads_per_atom=1; + if (_threads_per_charge>_warp_size) + _threads_per_charge=_warp_size; + if (_warp_size%_threads_per_charge!=0) + _threads_per_charge=1; + + return flag; +} + template double PairGPUDeviceT::host_memory_usage() const { - return atom.host_memory_usage()+ - nbor.host_memory_usage()+4*sizeof(numtyp)+ + return atom.host_memory_usage()+4*sizeof(numtyp)+ sizeof(PairGPUDevice); } template class PairGPUDevice; PairGPUDevice pair_gpu_device; -bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, - const double particle_split, const int nthreads) { +int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, + const int last_gpu, const int gpu_mode, + const double particle_split, const int nthreads, + const int t_per_atom) { return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, - particle_split,nthreads); + particle_split,nthreads,t_per_atom); } void lmp_clear_device() { @@ -264,14 +609,5 @@ void 
lmp_clear_device() { pair_gpu_device.clear_device(); } double lmp_gpu_forces(double **f, double **tor, double *eatom, double **vatom, double *virial, double &ecoul) { - if (pair_gpu_device.init_count()) { - pair_gpu_device.stop_host_timer(); - pair_gpu_device.gpu->sync(); - double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul); - pair_gpu_device.atom.get_answers(f,tor); - - return evdw; - } - return 0.0; + return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul); } - diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h index 33aa54959b..1e7e15e6a8 100644 --- a/lib/gpu/pair_gpu_device.h +++ b/lib/gpu/pair_gpu_device.h @@ -19,11 +19,17 @@ #define PAIR_GPU_DEVICE_H #include "pair_gpu_atom.h" +#include "pair_gpu_ans.h" #include "pair_gpu_nbor.h" +#include "pppm_gpu_memory.h" #include "mpi.h" #include #include "stdio.h" #include +#include + +template class PPPMGPUMemory; template class PairGPUDevice { @@ -33,10 +39,15 @@ class PairGPUDevice { /// Initialize the device for use by this process /** Sets up a per-device MPI communicator for load balancing and initializes - * the device (>=first_gpu and <=last_gpu) that this proc will be using **/ - bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, + * the device (>=first_gpu and <=last_gpu) that this proc will be using + * Returns: + * - 0 if successful + * - -2 if GPU not found + * - -4 if GPU library not compiled for GPU **/ + int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, const int last_gpu, const int gpu_mode, - const double particle_split, const int nthreads); + const double particle_split, const int nthreads, + const int t_per_atom); /// Initialize the device for Atom and Neighbor storage /** \param rot True if quaternions need to be stored * \param gpu_nbor True if neighboring will be performed on device @@ -50,19 +61,67 @@ class PairGPUDevice { * \param max_nbors Initial number of rows in the neighbor matrix * \param cell_size cutoff+skin * \param pre_cut True if cutoff test will be performed in separate kernel - * than the force kernel **/ - bool init(const bool charge, const bool rot, const int nlocal, - const int host_nlocal, const int nall, const int maxspecial, - const bool gpu_nbor, const int gpu_host, const int max_nbors, - const double cell_size, const bool pre_cut); + * than the force kernel + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 if double precision is not supported on the card **/ + int init(PairGPUAns &a, const bool charge, const bool rot, + const int nlocal, const int host_nlocal, const int nall, + PairGPUNbor *nbor, const int maxspecial, const int gpu_host, + const int max_nbors, const double cell_size, const bool pre_cut); + + /// Initialize the device for Atom storage only + /** \param nlocal Total number of local particles to allocate memory for + * \param nall Total number of local+ghost particles + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 if double precision is not supported on the card **/ + int init(PairGPUAns &ans, const int nlocal, const int nall); /// Output a message for pair_style acceleration with device stats void init_message(FILE *screen, const char *name, const int first_gpu, const int last_gpu); + /// Perform charge assignment asynchronously for PPPM + void set_single_precompute(PPPMGPUMemory *pppm); +
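The two precompute hooks here let a PPPM style overlap its charge assignment with the pair style's data transfers. A minimal usage sketch, assuming an already-initialized single-precision PPPMGPUMemory object named my_pppm (the name and the calling context are illustrative, not part of this patch):

  // Register once during setup; the device then dispatches to this object
  // whenever precompute() is called (_long_range_precompute==1):
  pair_gpu_device.set_single_precompute(my_pppm);

  // Each timestep, before the pair force kernels launch, the device
  // forwards to the registered PPPM object asynchronously:
  pair_gpu_device.precompute(ago,nlocal,nall,host_x,host_type,success,
                             charge,boxlo,prd);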
+ /// Perform charge assignment asynchronously for PPPM + void set_double_precompute(PPPMGPUMemory *pppm); + + /// Estimate the overhead from GPU calls from multiple procs + /** \param kernel_calls Number of kernel calls/timestep for timing estimated + * overhead + * \param gpu_overhead Estimated gpu overhead per timestep (sec) + * \param driver_overhead Estimated overhead from driver per timestep (s) **/ + void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead, + double &gpu_driver_overhead); + + /// Returns true if double precision is supported on card + inline bool double_precision() { return gpu->double_precision(); } + /// Output a message with timing information - void output_times(UCL_Timer &time_pair, const double avg_split, - const double max_bytes, FILE *screen); + void output_times(UCL_Timer &time_pair, PairGPUAns &ans, + PairGPUNbor &nbor, const double avg_split, + const double max_bytes, const double gpu_overhead, + const double driver_overhead, + const int threads_per_atom, FILE *screen); + + /// Output a message with timing information + void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out, + UCL_Timer & time_map, UCL_Timer & time_rho, + UCL_Timer &time_interp, + PairGPUAns &ans, + const double max_bytes, const double cpu_time, + const double cpu_idle_time, FILE *screen); /// Clear all memory on host and device associated with atom and nbor data void clear(); @@ -70,11 +129,37 @@ class PairGPUDevice { /// Clear all memory on host and device void clear_device(); + /// Add an answer object for putting forces, energies, etc. from GPU to LAMMPS + inline void add_ans_object(PairGPUAns *ans) + { ans_queue.push(ans); } + + /// Add "answers" (forces, energies, etc.) into LAMMPS structures + inline double fix_gpu(double **f, double **tor, double *eatom, + double **vatom, double *virial, double &ecoul) { + atom.data_unavail(); + if (ans_queue.empty()==false) { + stop_host_timer(); + double evdw=0.0; + while (ans_queue.empty()==false) { + evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul); + ans_queue.pop(); + } + return evdw; + } + return 0.0; + } + /// Start timer on host - inline void start_host_timer() { _cpu_full=MPI_Wtime(); } + inline void start_host_timer() + { _cpu_full=MPI_Wtime(); _host_timer_started=true; } /// Stop timer on host - inline void stop_host_timer() { _cpu_full=MPI_Wtime()-_cpu_full; } + inline void stop_host_timer() { + if (_host_timer_started) { + _cpu_full=MPI_Wtime()-_cpu_full; + _host_timer_started=false; + } + } /// Return host time inline double host_time() { return _cpu_full; } @@ -114,6 +199,42 @@ class PairGPUDevice { inline double particle_split() const { return _particle_split; } /// Return the initialization count for the device inline int init_count() const { return _init_count; } + /// True if device is being timed + inline bool time_device() const { return _time_device; } + + /// Return the number of threads accessing memory simultaneously + inline int num_mem_threads() const { return _num_mem_threads; } + /// Return the number of threads per atom for pair styles + inline int threads_per_atom() const { return _threads_per_atom; } + /// Return the number of threads per atom for pair styles using charge + inline int threads_per_charge() const { return _threads_per_charge; } + /// Return the min of the pair block size or the device max block size + inline int pair_block_size() const { return _block_pair; } + /// Return the maximum number of atom types that can be used with shared mem + inline int max_shared_types() const { return _max_shared_types; } + /// Return the
maximum order for PPPM splines + inline int pppm_max_spline() const { return _pppm_max_spline; } + /// Return the block size for PPPM kernels + inline int pppm_block() const { return _pppm_block; } + /// Return the block size for neighbor binning + inline int block_cell_2d() const { return _block_cell_2d; } + /// Return the block size for atom mapping for neighbor builds + inline int block_cell_id() const { return _block_cell_id; } + /// Return the block size for neighbor build kernel + inline int block_nbor_build() const { return _block_nbor_build; } + /// Return the block size for "bio" pair styles + inline int block_bio_pair() const { return _block_bio_pair; } + /// Return the maximum number of atom types for shared mem with "bio" styles + inline int max_bio_shared_types() const { return _max_bio_shared_types; } + + // -------------------- SHARED DEVICE ROUTINES -------------------- + // Perform asynchronous zero of integer array + void zero(UCL_D_Vec &mem, const int numel) { + int num_blocks=static_cast(ceil(static_cast(numel)/ + _block_pair)); + k_zero.set_size(num_blocks,_block_pair); + k_zero.run(&mem.begin(),&numel); + } // -------------------------- DEVICE DATA ------------------------- @@ -130,11 +251,30 @@ class PairGPUDevice { // --------------------------- NBOR DATA ---------------------------- /// Neighbor Data - PairGPUNbor nbor; + PairGPUNborShared _nbor_shared; + + // ------------------------ LONG RANGE DATA ------------------------- + + // Long Range Data + int _long_range_precompute; + PPPMGPUMemory *pppm_single; + PPPMGPUMemory *pppm_double; + /// Precomputations for long range charge assignment (asynchronously) + inline void precompute(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *charge, double *boxlo, double *prd) { + if (_long_range_precompute==1) + pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge, + boxlo,prd); + else if (_long_range_precompute==2) + pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge, + boxlo,prd); + } private: + std::queue *> ans_queue; int _init_count; - bool _device_init; + bool _device_init, _host_timer_started, _time_device; MPI_Comm _comm_world, _comm_replica, _comm_gpu; int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, _replica_size; @@ -142,6 +282,19 @@ class PairGPUDevice { double _particle_split; double _cpu_full; + int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge; + int _pppm_max_spline, _pppm_block; + int _block_pair, _max_shared_types; + int _block_cell_2d, _block_cell_id, _block_nbor_build; + int _block_bio_pair, _max_bio_shared_types; + + UCL_Program *dev_program; + UCL_Kernel k_zero, k_info; + bool _compiled; + int compile_kernels(); + + int _data_in_estimate, _data_out_estimate; + template inline std::string toa(const t& in) { std::ostringstream o; diff --git a/lib/gpu/pair_gpu_nbor.cpp b/lib/gpu/pair_gpu_nbor.cpp index 123fbe54aa..df138a7eff 100644 --- a/lib/gpu/pair_gpu_nbor.cpp +++ b/lib/gpu/pair_gpu_nbor.cpp @@ -18,15 +18,9 @@ #include "pair_gpu_precision.h" #include "pair_gpu_nbor.h" +#include "pair_gpu_device.h" #include "math.h" -#ifdef USE_OPENCL -#include "pair_gpu_nbor_cl.h" -#else -#include "pair_gpu_nbor_ptx.h" -#include "pair_gpu_build_ptx.h" -#endif - int PairGPUNbor::bytes_per_atom(const int max_nbors) const { if (_gpu_nbor) return (max_nbors+2)*sizeof(int); @@ -36,12 +30,18 @@ int PairGPUNbor::bytes_per_atom(const int max_nbors) const { return (max_nbors+3)*sizeof(int); 
} -bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors, +bool PairGPUNbor::init(PairGPUNborShared *shared, const int inum, + const int host_inum, const int max_nbors, const int maxspecial, UCL_Device &devi, const bool gpu_nbor, const int gpu_host, - const bool pre_cut) { + const bool pre_cut, const int block_cell_2d, + const int block_cell_id, const int block_nbor_build) { clear(); + _block_cell_2d=block_cell_2d; + _block_cell_id=block_cell_id; + _block_nbor_build=block_nbor_build; + _shared=shared; dev=&devi; _gpu_nbor=gpu_nbor; if (gpu_host==0) @@ -80,8 +80,11 @@ bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors, success=success && (host_packed.alloc(2*IJ_SIZE,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); alloc(success); + if (!success) + return false; + if (_use_packing==false) - compile_kernels(devi); + _shared->compile_kernels(devi,gpu_nbor); return success; } @@ -89,13 +92,14 @@ bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors, void PairGPUNbor::alloc(bool &success) { dev_nbor.clear(); host_acc.clear(); + int nt=_max_atoms+_max_host; if (_use_packing==false || _gpu_nbor) success=success && (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); else success=success && (dev_nbor.alloc(3*_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); - success=success && (host_acc.alloc((_max_atoms+_max_host)*2,*dev, + success=success && (host_acc.alloc(nt*2,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); _c_bytes=dev_nbor.row_bytes(); @@ -108,11 +112,31 @@ void PairGPUNbor::alloc(bool &success) { if (_max_host>0) { host_nbor.clear(); dev_host_nbor.clear(); - success=success && (host_nbor.alloc((_max_nbors+1)*_max_host,*dev, + dev_host_numj.clear(); + host_ilist.clear(); + host_jlist.clear(); + + success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev, UCL_RW_OPTIMIZED)==UCL_SUCCESS); - success=success && (dev_host_nbor.alloc((_max_nbors+1)*_max_host, + success=success && (dev_host_nbor.alloc(_max_nbors*_max_host, *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); - _c_bytes+=dev_host_nbor.row_bytes(); + success=success && (dev_host_numj.alloc(_max_host,*dev, + UCL_WRITE_ONLY)==UCL_SUCCESS); + success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); + if (!success) + return; + for (int i=0; i0) { dev_nspecial.clear(); @@ -145,6 +169,9 @@ void PairGPUNbor::clear() { dev_host_nbor.clear(); dev_packed.clear(); host_nbor.clear(); + dev_host_numj.clear(); + host_ilist.clear(); + host_jlist.clear(); dev_nspecial.clear(); dev_special.clear(); dev_special_t.clear(); @@ -152,27 +179,13 @@ void PairGPUNbor::clear() { time_kernel.clear(); time_nbor.clear(); } - - if (_compiled) { - if (_gpu_nbor) { - k_cell_id.clear(); - k_cell_counts.clear(); - k_build_nbor.clear(); - k_transpose.clear(); - k_special.clear(); - delete build_program; - } else { - k_nbor.clear(); - delete nbor_program; - } - _compiled=false; - } } double PairGPUNbor::host_memory_usage() const { if (_gpu_nbor) { if (_gpu_host) - return host_nbor.row_bytes()*host_nbor.rows(); + return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+ + host_jlist.row_bytes(); else return 0; } else @@ -186,7 +199,7 @@ void PairGPUNbor::get_host(const int inum, int *ilist, int *numj, UCL_H_Vec ilist_view; ilist_view.view(ilist,inum,*dev); - ucl_copy(dev_nbor,ilist_view,true); + ucl_copy(dev_nbor,ilist_view,false); UCL_D_Vec nbor_offset; UCL_H_Vec host_offset; @@ -238,46 +251,20 @@ void PairGPUNbor::get_host(const int inum, int *ilist, int *numj, 
if (_use_packing==false) { time_kernel.start(); int GX=static_cast(ceil(static_cast(inum)/block_size)); - k_nbor.set_size(GX,block_size); - k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum); + _shared->k_nbor.set_size(GX,block_size); + _shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum); time_kernel.stop(); } } -void PairGPUNbor::compile_kernels(UCL_Device &dev) { - std::string flags="-cl-fast-relaxed-math -cl-mad-enable"; - - if (_gpu_nbor==false) { - nbor_program=new UCL_Program(dev); - nbor_program->load_string(pair_gpu_nbor_kernel,flags.c_str()); - k_nbor.set_function(*nbor_program,"kernel_unpack"); - } else { - build_program=new UCL_Program(dev); - #ifdef USE_OPENCL - std::cerr << "CANNOT CURRENTLY USE GPU NEIGHBORING WITH OPENCL\n"; - exit(1); - #else - build_program->load_string(pair_gpu_build_kernel,flags.c_str()); - #endif - k_cell_id.set_function(*build_program,"calc_cell_id"); - k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts"); - k_build_nbor.set_function(*build_program,"calc_neigh_list_cell"); - k_transpose.set_function(*build_program,"transpose"); - k_special.set_function(*build_program,"kernel_special"); - neigh_tex.get_texture(*build_program,"neigh_tex"); - } - _compiled=true; -} - template void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, const int nall, PairGPUAtom &atom, - double *boxlo, double *boxhi, int *tag, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, bool &success, int &mn) { const int nt=inum+host_inum; - if (_maxspecial>0) { time_nbor.start(); UCL_H_Vec view_nspecial, view_special, view_tag; @@ -290,25 +277,25 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, time_nbor.stop(); time_nbor.add_to_total(); time_kernel.start(); - const int b2x=8; - const int b2y=8; + const int b2x=_block_cell_2d; + const int b2y=_block_cell_2d; const int g2x=static_cast(ceil(static_cast(_maxspecial)/b2x)); const int g2y=static_cast(ceil(static_cast(nt)/b2y)); - k_transpose.set_size(g2x,g2y,b2x,b2y); - k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),&_maxspecial, - &nt); + _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); + _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(), + &_maxspecial,&nt); } else time_kernel.start(); _nbor_pitch=inum; - neigh_tex.bind_float(atom.dev_x,4); + _shared->neigh_tex.bind_float(atom.dev_x,4); int ncellx, ncelly, ncellz, ncell_3d; - ncellx = static_cast(ceil(((boxhi[0] - boxlo[0]) + + ncellx = static_cast(ceil(((subhi[0] - sublo[0]) + 2.0*_cell_size)/_cell_size)); - ncelly = static_cast(ceil(((boxhi[1] - boxlo[1]) + + ncelly = static_cast(ceil(((subhi[1] - sublo[1]) + 2.0*_cell_size)/_cell_size)); - ncellz = static_cast(ceil(((boxhi[2] - boxlo[2]) + + ncellz = static_cast(ceil(((subhi[2] - sublo[2]) + 2.0*_cell_size)/_cell_size)); ncell_3d = ncellx * ncelly * ncellz; UCL_D_Vec cell_counts; @@ -316,35 +303,36 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, _cell_bytes=cell_counts.row_bytes(); /* build cell list on GPU */ - const int neigh_block=128; + const int neigh_block=_block_cell_id; const int GX=(int)ceil((float)nall/neigh_block); - const numtyp boxlo0=static_cast(boxlo[0]); - const numtyp boxlo1=static_cast(boxlo[1]); - const numtyp boxlo2=static_cast(boxlo[2]); - const numtyp boxhi0=static_cast(boxhi[0]); - const numtyp boxhi1=static_cast(boxhi[1]); - const numtyp boxhi2=static_cast(boxhi[2]); + const numtyp sublo0=static_cast(sublo[0]); + const numtyp sublo1=static_cast(sublo[1]); 
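/* Worked example of the cell-grid sizing above (illustrative numbers, not
   from this patch): a sub-domain 24.0 units wide in x with cell_size =
   cutoff+skin = 3.0 gives ncellx = ceil((24.0 + 2.0*3.0)/3.0) = 10, i.e.
   eight interior cells plus one layer of ghost cells on each side. */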
+ const numtyp sublo2=static_cast(sublo[2]); + const numtyp subhi0=static_cast(subhi[0]); + const numtyp subhi1=static_cast(subhi[1]); + const numtyp subhi2=static_cast(subhi[2]); const numtyp cell_size_cast=static_cast(_cell_size); - k_cell_id.set_size(GX,neigh_block); - k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(), - &atom.dev_particle_id.begin(), - &boxlo0, &boxlo1, &boxlo2, &boxhi0, &boxhi1, - &boxhi2, &cell_size_cast, &ncellx, &ncelly, &nall); + _shared->k_cell_id.set_size(GX,neigh_block); + _shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(), + &atom.dev_particle_id.begin(), + &sublo0, &sublo1, &sublo2, &subhi0, &subhi1, + &subhi2, &cell_size_cast, &ncellx, &ncelly, &nall); atom.sort_neighbor(nall); /* calculate cell count */ - k_cell_counts.set_size(GX,neigh_block); - k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(), &nall, - &ncell_3d); + _shared->k_cell_counts.set_size(GX,neigh_block); + _shared->k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(), + &nall, &ncell_3d); /* build the neighbor list */ - const int cell_block=64; - k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1); - k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(), - &cell_counts.begin(), &dev_nbor.begin(), - &dev_host_nbor.begin(), &_max_nbors, &cell_size_cast, - &ncellx, &ncelly, &ncellz, &inum, &nt, &nall); + const int cell_block=_block_nbor_build; + _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1); + _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(), + &cell_counts.begin(), &dev_nbor.begin(), + &dev_host_nbor.begin(), &dev_host_numj.begin(), + &_max_nbors,&cell_size_cast, + &ncellx, &ncelly, &ncellz, &inum, &nt, &nall); /* Get the maximum number of nbors and realloc if necessary */ UCL_D_Vec numj; @@ -353,7 +341,7 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, if (nt>inum) { UCL_H_Vec host_offset; host_offset.view_offset(inum,host_acc,nt-inum); - ucl_copy(host_offset,dev_host_nbor,nt-inum,false); + ucl_copy(host_offset,dev_host_numj,nt-inum,false); } mn=host_acc[0]; for (int i=1; i0) { host_nbor.clear(); dev_host_nbor.clear(); - success=success && (host_nbor.alloc((mn+1)*_max_host,dev_nbor, + success=success && (host_nbor.alloc(mn*_max_host,dev_nbor, UCL_RW_OPTIMIZED)==UCL_SUCCESS); - success=success && (dev_host_nbor.alloc((mn+1)*_max_host, + success=success && (dev_host_nbor.alloc(mn*_max_host, dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS); + int *ptr=host_nbor.begin(); + for (int i=0; i<_max_host; i++) { + host_jlist[i]=ptr; + ptr+=mn; + } _gpu_bytes+=dev_host_nbor.row_bytes(); } if (_alloc_packed) { @@ -385,28 +378,29 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, _max_nbors=mn; time_kernel.stop(); time_kernel.add_to_total(); - build_nbor_list(inum, host_inum, nall, atom, boxlo, boxhi, tag, nspecial, + build_nbor_list(inum, host_inum, nall, atom, sublo, subhi, tag, nspecial, special, success, mn); return; } if (_maxspecial>0) { const int GX2=static_cast(ceil(static_cast(nt)/cell_block)); - k_special.set_size(GX2,cell_block); - k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), - &atom.dev_tag.begin(), &dev_nspecial.begin(), - &dev_special.begin(), &inum, &nt, &nall); + _shared->k_special.set_size(GX2,cell_block); + _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), + &dev_host_numj.begin(), &atom.dev_tag.begin(), + &dev_nspecial.begin(), &dev_special.begin(), + &inum, &nt, &nall, &_max_nbors); } 
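// Illustrative host-side walk of the carved rows (hypothetical consumer,
// not part of this patch): once dev_host_nbor is copied into host_nbor
// below, the neighbors of the ii-th host-handled atom can be read as
//   int *jlist=host_jlist[ii];    // row carved out of host_nbor
//   int jnum=host_acc[inum+ii];   // count copied back from dev_host_numj
//   for (int jj=0; jj<jnum; jj++) { int j=jlist[jj]; /* ... */ }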
diff --git a/lib/gpu/pair_gpu_nbor.h b/lib/gpu/pair_gpu_nbor.h
index 403bd7aed4..02ad4b201b 100644
--- a/lib/gpu/pair_gpu_nbor.h
+++ b/lib/gpu/pair_gpu_nbor.h
@@ -19,32 +19,27 @@
 #define PAIR_GPU_NBOR_H
 
 #include "pair_gpu_atom.h"
+#include "pair_gpu_nbor_shared.h"
 
 #define IJ_SIZE 131072
 
 #ifdef USE_OPENCL
 
-#include "geryon/ocl_device.h"
 #include "geryon/ocl_timer.h"
 #include "geryon/ocl_mat.h"
-#include "geryon/ocl_kernel.h"
-#include "geryon/ocl_texture.h"
 using namespace ucl_opencl;
 
 #else
 
-#include "geryon/nvd_device.h"
 #include "geryon/nvd_timer.h"
 #include "geryon/nvd_mat.h"
-#include "geryon/nvd_kernel.h"
-#include "geryon/nvd_texture.h"
 using namespace ucl_cudadr;
 
 #endif
 
 class PairGPUNbor {
  public:
-  PairGPUNbor() : _allocated(false), _use_packing(false), _compiled(false) {}
+  PairGPUNbor() : _allocated(false), _use_packing(false) {}
   ~PairGPUNbor() { clear(); }
 
   /// Determine whether neighbor unpacking should be used
@@ -62,9 +57,11 @@ class PairGPUNbor {
    *                2 if gpu_nbor is true, and host needs a full nbor list
    * \param pre_cut True if cutoff test will be performed in separate kernel
    *                than the force kernel **/
-  bool init(const int inum, const int host_inum, const int max_nbors,
-            const int maxspecial, UCL_Device &dev, const bool gpu_nbor,
-            const int gpu_host, const bool pre_cut);
+  bool init(PairGPUNborShared *shared, const int inum, const int host_inum,
+            const int max_nbors, const int maxspecial, UCL_Device &dev,
+            const bool gpu_nbor, const int gpu_host, const bool pre_cut,
+            const int block_cell_2d, const int block_cell_id,
+            const int block_nbor_build);
 
   /// Set the size of the cutoff+skin
   inline void cell_size(const double size) { _cell_size=size; }
@@ -131,18 +128,18 @@ class PairGPUNbor {
   inline int max_nbors() const { return _max_nbors; }
 
   /// Loop through neighbor count array and return maximum nbors for a particle
-  inline int max_nbor_loop(const int inum, int *numj) const {
+  inline int max_nbor_loop(const int inum, int *numj, int *ilist) const {
     int mn=0;
     for (int i=0; i<inum; i++)
-      mn=std::max(mn,numj[i]);
+      mn=std::max(mn,numj[ilist[i]]);
     return mn;
   }
 
   /// Build nbor list on the GPU
   template <class numtyp, class acctyp>
   void build_nbor_list(const int inum, const int host_inum, const int nall,
-                       PairGPUAtom<numtyp,acctyp> &atom, double *boxlo,
-                       double *boxhi, int *tag, int **nspecial, int **special,
+                       PairGPUAtom<numtyp,acctyp> &atom, double *sublo,
+                       double *subhi, int *tag, int **nspecial, int **special,
                        bool &success, int &max_nbors);
 
   /// Return the number of bytes used on device
@@ -176,31 +173,31 @@ class PairGPUNbor {
   UCL_H_Vec<int> host_nbor;
   /// Device storage for neighbor list matrix that will be copied to host
   /** - 1st row is numj
-    * - Remaining rows are nbors **/
+    * - Remaining rows are by atom, columns are nbors **/
   UCL_D_Vec<int> dev_host_nbor;
+  UCL_D_Vec<int> dev_host_numj;
+  UCL_H_Vec<int> host_ilist;
+  UCL_H_Vec<int *> host_jlist;
   /// Device storage for special neighbor counts
   UCL_D_Vec<int> dev_nspecial;
   /// Device storage for special neighbors
   UCL_D_Vec<int> dev_special, dev_special_t;
-  /// Texture for cached position/type access with CUDA
-  UCL_Texture neigh_tex;
   /// Device timers
   UCL_Timer time_nbor, time_kernel;
 
  private:
+  PairGPUNborShared *_shared;
   UCL_Device *dev;
-  UCL_Program *nbor_program, *build_program;
-  UCL_Kernel k_nbor, k_cell_id, k_cell_counts, k_build_nbor;
-  UCL_Kernel k_transpose, k_special;
-  bool _allocated, _use_packing, _compiled;
-  void compile_kernels(UCL_Device &dev);
+  bool _allocated, _use_packing;
 
   int _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
   bool _gpu_nbor, _gpu_host, _alloc_packed;
   double _cell_size;
 
   double _gpu_bytes, _c_bytes, _cell_bytes;
   void alloc(bool &success);
+
+  int _block_cell_2d, _block_cell_id, _block_nbor_build;
 };
 
 #endif
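The header diff above drops the per-instance UCL_Program/UCL_Kernel members and compile_kernels() in favor of a PairGPUNborShared object passed into init(), so the kernels are compiled once per device and shared by every neighbor-list instance. The shared container itself is declared in pair_gpu_nbor_shared.h, which is not part of this excerpt; a hypothetical sketch of its shape, inferred only from the members referenced through _shared-> in pair_gpu_nbor.cpp:

    // Hypothetical sketch only; see pair_gpu_nbor_shared.h for the
    // actual declaration.
    struct PairGPUNborSharedSketch {
      UCL_Program *nbor_program, *build_program;          // compiled once per device
      UCL_Kernel k_nbor;                                  // neighbor unpack
      UCL_Kernel k_cell_id, k_cell_counts, k_build_nbor;  // cell-list build
      UCL_Kernel k_transpose, k_special;                  // special-neighbor handling
      UCL_Texture neigh_tex;                              // cached position fetches
      bool compiled;                                      // guard against recompiling
    };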
diff --git a/lib/gpu/pair_gpu_precision.h b/lib/gpu/pair_gpu_precision.h
index a5f57c1f95..902975be0b 100644
--- a/lib/gpu/pair_gpu_precision.h
+++ b/lib/gpu/pair_gpu_precision.h
@@ -84,8 +84,6 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
 #define acctyp4 _lgpu_float4
 #endif
 
-#define MAX_SHARED_TYPES 8
-#define MAX_BIO_SHARED_TYPES 128
 
 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
 
 #endif
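One consequence of the pair_gpu_nbor.h hunks above: max_nbor_loop now reads the per-atom counts through LAMMPS's ilist indirection rather than assuming numj is ordered 0..inum-1. A standalone equivalent of the revised loop:

    // Maximum neighbor count over the inum local atoms, with numj
    // indexed through ilist as in the revised max_nbor_loop.
    inline int max_nbor_count(const int inum, const int *numj,
                              const int *ilist) {
      int mn=0;
      for (int i=0; i<inum; i++)
        if (numj[ilist[i]]>mn)
          mn=numj[ilist[i]];
      return mn;
    }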