diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 0824d048b8..31c687369a 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -3,6 +3,7 @@ CUDA = $(NVCC) $(CUDA_INCLUDE) $(CUDA_OPTS) -Icudpp_mini $(CUDA_ARCH) \ CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \ $(CUDPP_OPT) CUDA_LINK = $(CUDA_LIB) -lcudart +BIN2C = $(CUDA_HOME)/bin/bin2c GPU_LIB = $(LIB_DIR)/libgpu.a @@ -27,6 +28,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \ $(OBJ_DIR)/lal_neighbor.o $(OBJ_DIR)/lal_neighbor_shared.o \ $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \ $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \ + $(OBJ_DIR)/lal_base_dipole.o \ $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \ $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \ $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \ @@ -35,6 +37,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \ $(OBJ_DIR)/lal_lj_expand.o $(OBJ_DIR)/lal_lj_expand_ext.o \ $(OBJ_DIR)/lal_lj_coul.o $(OBJ_DIR)/lal_lj_coul_ext.o \ $(OBJ_DIR)/lal_lj_coul_long.o $(OBJ_DIR)/lal_lj_coul_long_ext.o \ + $(OBJ_DIR)/lal_lj_dsf.o $(OBJ_DIR)/lal_lj_dsf_ext.o \ $(OBJ_DIR)/lal_lj_class2_long.o $(OBJ_DIR)/lal_lj_class2_long_ext.o \ $(OBJ_DIR)/lal_coul_long.o $(OBJ_DIR)/lal_coul_long_ext.o \ $(OBJ_DIR)/lal_morse.o $(OBJ_DIR)/lal_morse_ext.o \ @@ -46,35 +49,57 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \ $(OBJ_DIR)/lal_buck_coul.o $(OBJ_DIR)/lal_buck_coul_ext.o \ $(OBJ_DIR)/lal_buck_coul_long.o $(OBJ_DIR)/lal_buck_coul_long_ext.o \ $(OBJ_DIR)/lal_table.o $(OBJ_DIR)/lal_table_ext.o \ - $(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o -PTXS = $(OBJ_DIR)/device.ptx $(OBJ_DIR)/device_ptx.h \ - $(OBJ_DIR)/atom.ptx $(OBJ_DIR)/atom_ptx.h \ - $(OBJ_DIR)/neighbor_cpu.ptx $(OBJ_DIR)/neighbor_cpu_ptx.h \ - $(OBJ_DIR)/neighbor_gpu.ptx $(OBJ_DIR)/neighbor_gpu_ptx.h \ - $(OBJ_DIR)/pppm_f.ptx $(OBJ_DIR)/pppm_f_ptx.h \ - $(OBJ_DIR)/pppm_d.ptx $(OBJ_DIR)/pppm_d_ptx.h \ - $(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/ellipsoid_nbor_ptx.h \ - $(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_lj.ptx \ - $(OBJ_DIR)/gayberne_ptx.h $(OBJ_DIR)/gayberne_lj_ptx.h \ - $(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_lj.ptx \ - $(OBJ_DIR)/re_squared_ptx.h $(OBJ_DIR)/re_squared_lj_ptx.h \ - $(OBJ_DIR)/lj.ptx $(OBJ_DIR)/lj_ptx.h \ - $(OBJ_DIR)/lj96.ptx $(OBJ_DIR)/lj96_ptx.h \ - $(OBJ_DIR)/lj_expand.ptx $(OBJ_DIR)/lj_expand_ptx.h \ - $(OBJ_DIR)/lj_coul.ptx $(OBJ_DIR)/lj_coul_ptx.h \ - $(OBJ_DIR)/lj_coul_long.ptx $(OBJ_DIR)/lj_coul_long_ptx.h \ - $(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long_ptx.h \ - $(OBJ_DIR)/coul_long.ptx $(OBJ_DIR)/coul_long_ptx.h \ - $(OBJ_DIR)/morse.ptx $(OBJ_DIR)/morse_ptx.h \ - $(OBJ_DIR)/charmm_long.ptx $(OBJ_DIR)/charmm_long_ptx.h \ - $(OBJ_DIR)/cg_cmm.ptx $(OBJ_DIR)/cg_cmm_ptx.h \ - $(OBJ_DIR)/cg_cmm_long.ptx $(OBJ_DIR)/cg_cmm_long_ptx.h \ - $(OBJ_DIR)/eam.ptx $(OBJ_DIR)/eam_ptx.h \ - $(OBJ_DIR)/buck.ptx $(OBJ_DIR)/buck_ptx.h \ - $(OBJ_DIR)/buck_coul.ptx $(OBJ_DIR)/buck_coul_ptx.h \ - $(OBJ_DIR)/buck_coul_long.ptx $(OBJ_DIR)/buck_coul_long_ptx.h \ - $(OBJ_DIR)/table.ptx $(OBJ_DIR)/table_ptx.h \ - $(OBJ_DIR)/yukawa.ptx $(OBJ_DIR)/yukawa_ptx.h + $(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o \ + $(OBJ_DIR)/lal_born.o $(OBJ_DIR)/lal_born_ext.o \ + $(OBJ_DIR)/lal_born_coul_wolf.o $(OBJ_DIR)/lal_born_coul_wolf_ext.o \ + $(OBJ_DIR)/lal_born_coul_long.o $(OBJ_DIR)/lal_born_coul_long_ext.o \ + $(OBJ_DIR)/lal_dipole_lj.o $(OBJ_DIR)/lal_dipole_lj_ext.o \ + $(OBJ_DIR)/lal_dipole_lj_sf.o $(OBJ_DIR)/lal_dipole_lj_sf_ext.o \ + $(OBJ_DIR)/lal_colloid.o $(OBJ_DIR)/lal_colloid_ext.o \ + $(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \ + $(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \ + $(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \ + $(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o + +CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \ + $(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \ + $(OBJ_DIR)/neighbor_cpu.cubin $(OBJ_DIR)/neighbor_cpu_cubin.h \ + $(OBJ_DIR)/neighbor_gpu.cubin $(OBJ_DIR)/neighbor_gpu_cubin.h \ + $(OBJ_DIR)/pppm_f.cubin $(OBJ_DIR)/pppm_f_cubin.h \ + $(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/pppm_d_cubin.h \ + $(OBJ_DIR)/ellipsoid_nbor.cubin $(OBJ_DIR)/ellipsoid_nbor_cubin.h \ + $(OBJ_DIR)/gayberne.cubin $(OBJ_DIR)/gayberne_lj.cubin \ + $(OBJ_DIR)/gayberne_cubin.h $(OBJ_DIR)/gayberne_lj_cubin.h \ + $(OBJ_DIR)/re_squared.cubin $(OBJ_DIR)/re_squared_lj.cubin \ + $(OBJ_DIR)/re_squared_cubin.h $(OBJ_DIR)/re_squared_lj_cubin.h \ + $(OBJ_DIR)/lj.cubin $(OBJ_DIR)/lj_cubin.h \ + $(OBJ_DIR)/lj96.cubin $(OBJ_DIR)/lj96_cubin.h \ + $(OBJ_DIR)/lj_expand.cubin $(OBJ_DIR)/lj_expand_cubin.h \ + $(OBJ_DIR)/lj_coul.cubin $(OBJ_DIR)/lj_coul_cubin.h \ + $(OBJ_DIR)/lj_coul_long.cubin $(OBJ_DIR)/lj_coul_long_cubin.h \ + $(OBJ_DIR)/lj_dsf.cubin $(OBJ_DIR)/lj_dsf_cubin.h \ + $(OBJ_DIR)/lj_class2_long.cubin $(OBJ_DIR)/lj_class2_long_cubin.h \ + $(OBJ_DIR)/coul_long.cubin $(OBJ_DIR)/coul_long_cubin.h \ + $(OBJ_DIR)/morse.cubin $(OBJ_DIR)/morse_cubin.h \ + $(OBJ_DIR)/charmm_long.cubin $(OBJ_DIR)/charmm_long_cubin.h \ + $(OBJ_DIR)/cg_cmm.cubin $(OBJ_DIR)/cg_cmm_cubin.h \ + $(OBJ_DIR)/cg_cmm_long.cubin $(OBJ_DIR)/cg_cmm_long_cubin.h \ + $(OBJ_DIR)/eam.cubin $(OBJ_DIR)/eam_cubin.h \ + $(OBJ_DIR)/buck.cubin $(OBJ_DIR)/buck_cubin.h \ + $(OBJ_DIR)/buck_coul_long.cubin $(OBJ_DIR)/buck_coul_long_cubin.h \ + $(OBJ_DIR)/buck_coul_wolf.cubin $(OBJ_DIR)/buck_coul_wolf_cubin.h \ + $(OBJ_DIR)/table.cubin $(OBJ_DIR)/table_cubin.h \ + $(OBJ_DIR)/yukawa.cubin $(OBJ_DIR)/yukawa_cubin.h \ + $(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \ + $(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf_cubin.h \ + $(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long_cubin.h \ + $(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj_cubin.h \ + $(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf_cubin.h \ + $(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid_cubin.h \ + $(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \ + $(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \ + $(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \ + $(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h all: $(GPU_LIB) $(EXECS) @@ -96,43 +121,43 @@ $(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu $(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu $(CUDA) -o $@ -c cudpp_mini/scan_app.cu -$(OBJ_DIR)/atom.ptx: lal_atom.cu lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_atom.cu +$(OBJ_DIR)/atom.cubin: lal_atom.cu lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_atom.cu -$(OBJ_DIR)/atom_ptx.h: $(OBJ_DIR)/atom.ptx - $(BSH) ./geryon/file_to_cstr.sh atom $(OBJ_DIR)/atom.ptx $(OBJ_DIR)/atom_ptx.h +$(OBJ_DIR)/atom_cubin.h: $(OBJ_DIR)/atom.cubin + $(BIN2C) -c -n atom $(OBJ_DIR)/atom.cubin > $(OBJ_DIR)/atom_cubin.h -$(OBJ_DIR)/lal_atom.o: lal_atom.cpp lal_atom.h $(NVD_H) $(OBJ_DIR)/atom_ptx.h +$(OBJ_DIR)/lal_atom.o: lal_atom.cpp lal_atom.h $(NVD_H) $(OBJ_DIR)/atom_cubin.h $(CUDR) -o $@ -c lal_atom.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_ans.o: lal_answer.cpp lal_answer.h $(NVD_H) $(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/neighbor_cpu.ptx: lal_neighbor_cpu.cu lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_neighbor_cpu.cu +$(OBJ_DIR)/neighbor_cpu.cubin: lal_neighbor_cpu.cu lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_neighbor_cpu.cu -$(OBJ_DIR)/neighbor_cpu_ptx.h: $(OBJ_DIR)/neighbor_cpu.ptx - $(BSH) ./geryon/file_to_cstr.sh neighbor_cpu $(OBJ_DIR)/neighbor_cpu.ptx $(OBJ_DIR)/neighbor_cpu_ptx.h +$(OBJ_DIR)/neighbor_cpu_cubin.h: $(OBJ_DIR)/neighbor_cpu.cubin + $(BIN2C) -c -n neighbor_cpu $(OBJ_DIR)/neighbor_cpu.cubin > $(OBJ_DIR)/neighbor_cpu_cubin.h -$(OBJ_DIR)/neighbor_gpu.ptx: lal_neighbor_gpu.cu lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_neighbor_gpu.cu +$(OBJ_DIR)/neighbor_gpu.cubin: lal_neighbor_gpu.cu lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_neighbor_gpu.cu -$(OBJ_DIR)/neighbor_gpu_ptx.h: $(OBJ_DIR)/neighbor_gpu.ptx - $(BSH) ./geryon/file_to_cstr.sh neighbor_gpu $(OBJ_DIR)/neighbor_gpu.ptx $(OBJ_DIR)/neighbor_gpu_ptx.h +$(OBJ_DIR)/neighbor_gpu_cubin.h: $(OBJ_DIR)/neighbor_gpu.cubin + $(BIN2C) -c -n neighbor_gpu $(OBJ_DIR)/neighbor_gpu.cubin > $(OBJ_DIR)/neighbor_gpu_cubin.h -$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp lal_neighbor_shared.h $(OBJ_DIR)/neighbor_cpu_ptx.h $(OBJ_DIR)/neighbor_gpu_ptx.h $(NVD_H) +$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp lal_neighbor_shared.h $(OBJ_DIR)/neighbor_cpu_cubin.h $(OBJ_DIR)/neighbor_gpu_cubin.h $(NVD_H) $(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp lal_neighbor.h lal_neighbor_shared.h $(NVD_H) $(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/device.ptx: lal_device.cu lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_device.cu +$(OBJ_DIR)/device.cubin: lal_device.cu lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_device.cu -$(OBJ_DIR)/device_ptx.h: $(OBJ_DIR)/device.ptx - $(BSH) ./geryon/file_to_cstr.sh device $(OBJ_DIR)/device.ptx $(OBJ_DIR)/device_ptx.h +$(OBJ_DIR)/device_cubin.h: $(OBJ_DIR)/device.cubin + $(BIN2C) -c -n device $(OBJ_DIR)/device.cubin > $(OBJ_DIR)/device_cubin.h -$(OBJ_DIR)/lal_device.o: lal_device.cpp lal_device.h $(ALL_H) $(OBJ_DIR)/device_ptx.h +$(OBJ_DIR)/lal_device.o: lal_device.cpp lal_device.h $(ALL_H) $(OBJ_DIR)/device_cubin.h $(CUDR) -o $@ -c lal_device.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_base_atomic.o: $(ALL_H) lal_base_atomic.h lal_base_atomic.cpp @@ -141,273 +166,408 @@ $(OBJ_DIR)/lal_base_atomic.o: $(ALL_H) lal_base_atomic.h lal_base_atomic.cpp $(OBJ_DIR)/lal_base_charge.o: $(ALL_H) lal_base_charge.h lal_base_charge.cpp $(CUDR) -o $@ -c lal_base_charge.cpp -$(OBJ_DIR)/lal_base_ellipsoid.o: $(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_ptx.h +$(OBJ_DIR)/lal_base_ellipsoid.o: $(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_cubin.h $(CUDR) -o $@ -c lal_base_ellipsoid.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/pppm_f.ptx: lal_pppm.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu +$(OBJ_DIR)/lal_base_dipole.o: $(ALL_H) lal_base_dipole.h lal_base_dipole.cpp + $(CUDR) -o $@ -c lal_base_dipole.cpp -$(OBJ_DIR)/pppm_f_ptx.h: $(OBJ_DIR)/pppm_f.ptx - $(BSH) ./geryon/file_to_cstr.sh pppm_f $(OBJ_DIR)/pppm_f.ptx $(OBJ_DIR)/pppm_f_ptx.h +$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu -$(OBJ_DIR)/pppm_d.ptx: lal_pppm.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu +$(OBJ_DIR)/pppm_f_cubin.h: $(OBJ_DIR)/pppm_f.cubin + $(BIN2C) -c -n pppm_f $(OBJ_DIR)/pppm_f.cubin > $(OBJ_DIR)/pppm_f_cubin.h -$(OBJ_DIR)/pppm_d_ptx.h: $(OBJ_DIR)/pppm_d.ptx - $(BSH) ./geryon/file_to_cstr.sh pppm_d $(OBJ_DIR)/pppm_d.ptx $(OBJ_DIR)/pppm_d_ptx.h +$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu -$(OBJ_DIR)/lal_pppm.o: $(ALL_H) lal_pppm.h lal_pppm.cpp $(OBJ_DIR)/pppm_f_ptx.h $(OBJ_DIR)/pppm_d_ptx.h +$(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin + $(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h + +$(OBJ_DIR)/lal_pppm.o: $(ALL_H) lal_pppm.h lal_pppm.cpp $(OBJ_DIR)/pppm_f_cubin.h $(OBJ_DIR)/pppm_d_cubin.h $(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_pppm_ext.o: $(ALL_H) lal_pppm.h lal_pppm_ext.cpp $(CUDR) -o $@ -c lal_pppm_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/ellipsoid_nbor.ptx: lal_ellipsoid_nbor.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_ellipsoid_nbor.cu +$(OBJ_DIR)/ellipsoid_nbor.cubin: lal_ellipsoid_nbor.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_ellipsoid_nbor.cu -$(OBJ_DIR)/ellipsoid_nbor_ptx.h: $(OBJ_DIR)/ellipsoid_nbor.ptx - $(BSH) ./geryon/file_to_cstr.sh ellipsoid_nbor $(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/ellipsoid_nbor_ptx.h +$(OBJ_DIR)/ellipsoid_nbor_cubin.h: $(OBJ_DIR)/ellipsoid_nbor.cubin + $(BIN2C) -c -n ellipsoid_nbor $(OBJ_DIR)/ellipsoid_nbor.cubin > $(OBJ_DIR)/ellipsoid_nbor_cubin.h -$(OBJ_DIR)/gayberne.ptx: lal_gayberne.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_gayberne.cu +$(OBJ_DIR)/gayberne.cubin: lal_gayberne.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_gayberne.cu -$(OBJ_DIR)/gayberne_lj.ptx: lal_gayberne_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_gayberne_lj.cu +$(OBJ_DIR)/gayberne_lj.cubin: lal_gayberne_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_gayberne_lj.cu -$(OBJ_DIR)/gayberne_ptx.h: $(OBJ_DIR)/gayberne.ptx - $(BSH) ./geryon/file_to_cstr.sh gayberne $(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_ptx.h +$(OBJ_DIR)/gayberne_cubin.h: $(OBJ_DIR)/gayberne.cubin + $(BIN2C) -c -n gayberne $(OBJ_DIR)/gayberne.cubin > $(OBJ_DIR)/gayberne_cubin.h -$(OBJ_DIR)/gayberne_lj_ptx.h: $(OBJ_DIR)/gayberne_lj.ptx - $(BSH) ./geryon/file_to_cstr.sh gayberne_lj $(OBJ_DIR)/gayberne_lj.ptx $(OBJ_DIR)/gayberne_lj_ptx.h +$(OBJ_DIR)/gayberne_lj_cubin.h: $(OBJ_DIR)/gayberne_lj.cubin + $(BIN2C) -c -n gayberne_lj $(OBJ_DIR)/gayberne_lj.cubin > $(OBJ_DIR)/gayberne_lj_cubin.h -$(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_ptx.h $(OBJ_DIR)/gayberne_lj_ptx.h $(OBJ_DIR)/lal_base_ellipsoid.o +$(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_cubin.h $(OBJ_DIR)/gayberne_lj_cubin.h $(OBJ_DIR)/lal_base_ellipsoid.o $(CUDR) -o $@ -c lal_gayberne.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_gayberne_ext.o: $(ALL_H) $(OBJ_DIR)/lal_gayberne.o lal_gayberne_ext.cpp $(CUDR) -o $@ -c lal_gayberne_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/re_squared.ptx: lal_re_squared.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_re_squared.cu +$(OBJ_DIR)/re_squared.cubin: lal_re_squared.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_re_squared.cu -$(OBJ_DIR)/re_squared_lj.ptx: lal_re_squared_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_re_squared_lj.cu +$(OBJ_DIR)/re_squared_lj.cubin: lal_re_squared_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_re_squared_lj.cu -$(OBJ_DIR)/re_squared_ptx.h: $(OBJ_DIR)/re_squared.ptx - $(BSH) ./geryon/file_to_cstr.sh re_squared $(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_ptx.h +$(OBJ_DIR)/re_squared_cubin.h: $(OBJ_DIR)/re_squared.cubin + $(BIN2C) -c -n re_squared $(OBJ_DIR)/re_squared.cubin > $(OBJ_DIR)/re_squared_cubin.h -$(OBJ_DIR)/re_squared_lj_ptx.h: $(OBJ_DIR)/re_squared_lj.ptx - $(BSH) ./geryon/file_to_cstr.sh re_squared_lj $(OBJ_DIR)/re_squared_lj.ptx $(OBJ_DIR)/re_squared_lj_ptx.h +$(OBJ_DIR)/re_squared_lj_cubin.h: $(OBJ_DIR)/re_squared_lj.cubin + $(BIN2C) -c -n re_squared_lj $(OBJ_DIR)/re_squared_lj.cubin > $(OBJ_DIR)/re_squared_lj_cubin.h -$(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_ptx.h $(OBJ_DIR)/re_squared_lj_ptx.h $(OBJ_DIR)/lal_base_ellipsoid.o +$(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_cubin.h $(OBJ_DIR)/re_squared_lj_cubin.h $(OBJ_DIR)/lal_base_ellipsoid.o $(CUDR) -o $@ -c lal_re_squared.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_re_squared_ext.o: $(ALL_H) $(OBJ_DIR)/lal_re_squared.o lal_re_squared_ext.cpp $(CUDR) -o $@ -c lal_re_squared_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/lj.ptx: lal_lj.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj.cu +$(OBJ_DIR)/lj.cubin: lal_lj.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj.cu -$(OBJ_DIR)/lj_ptx.h: $(OBJ_DIR)/lj.ptx $(OBJ_DIR)/lj.ptx - $(BSH) ./geryon/file_to_cstr.sh lj $(OBJ_DIR)/lj.ptx $(OBJ_DIR)/lj_ptx.h +$(OBJ_DIR)/lj_cubin.h: $(OBJ_DIR)/lj.cubin $(OBJ_DIR)/lj.cubin + $(BIN2C) -c -n lj $(OBJ_DIR)/lj.cubin > $(OBJ_DIR)/lj_cubin.h -$(OBJ_DIR)/lal_lj.o: $(ALL_H) lal_lj.h lal_lj.cpp $(OBJ_DIR)/lj_ptx.h $(OBJ_DIR)/lal_base_atomic.o +$(OBJ_DIR)/lal_lj.o: $(ALL_H) lal_lj.h lal_lj.cpp $(OBJ_DIR)/lj_cubin.h $(OBJ_DIR)/lal_base_atomic.o $(CUDR) -o $@ -c lal_lj.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_lj_ext.o: $(ALL_H) lal_lj.h lal_lj_ext.cpp lal_base_atomic.h $(CUDR) -o $@ -c lal_lj_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/lj_coul.ptx: lal_lj_coul.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_coul.cu +$(OBJ_DIR)/lj_coul.cubin: lal_lj_coul.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul.cu -$(OBJ_DIR)/lj_coul_ptx.h: $(OBJ_DIR)/lj_coul.ptx $(OBJ_DIR)/lj_coul.ptx - $(BSH) ./geryon/file_to_cstr.sh lj_coul $(OBJ_DIR)/lj_coul.ptx $(OBJ_DIR)/lj_coul_ptx.h +$(OBJ_DIR)/lj_coul_cubin.h: $(OBJ_DIR)/lj_coul.cubin $(OBJ_DIR)/lj_coul.cubin + $(BIN2C) -c -n lj_coul $(OBJ_DIR)/lj_coul.cubin > $(OBJ_DIR)/lj_coul_cubin.h -$(OBJ_DIR)/lal_lj_coul.o: $(ALL_H) lal_lj_coul.h lal_lj_coul.cpp $(OBJ_DIR)/lj_coul_ptx.h $(OBJ_DIR)/lal_base_charge.o +$(OBJ_DIR)/lal_lj_coul.o: $(ALL_H) lal_lj_coul.h lal_lj_coul.cpp $(OBJ_DIR)/lj_coul_cubin.h $(OBJ_DIR)/lal_base_charge.o $(CUDR) -o $@ -c lal_lj_coul.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_lj_coul_ext.o: $(ALL_H) lal_lj_coul.h lal_lj_coul_ext.cpp lal_base_charge.h $(CUDR) -o $@ -c lal_lj_coul_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/lj_class2_long.ptx: lal_lj_class2_long.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_class2_long.cu +$(OBJ_DIR)/lj_class2_long.cubin: lal_lj_class2_long.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_class2_long.cu -$(OBJ_DIR)/lj_class2_long_ptx.h: $(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long.ptx - $(BSH) ./geryon/file_to_cstr.sh lj_class2_long $(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long_ptx.h +$(OBJ_DIR)/lj_class2_long_cubin.h: $(OBJ_DIR)/lj_class2_long.cubin $(OBJ_DIR)/lj_class2_long.cubin + $(BIN2C) -c -n lj_class2_long $(OBJ_DIR)/lj_class2_long.cubin > $(OBJ_DIR)/lj_class2_long_cubin.h -$(OBJ_DIR)/lal_lj_class2_long.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long.cpp $(OBJ_DIR)/lj_class2_long_ptx.h $(OBJ_DIR)/lal_base_charge.o +$(OBJ_DIR)/lal_lj_class2_long.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long.cpp $(OBJ_DIR)/lj_class2_long_cubin.h $(OBJ_DIR)/lal_base_charge.o $(CUDR) -o $@ -c lal_lj_class2_long.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_lj_class2_long_ext.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long_ext.cpp lal_base_charge.h $(CUDR) -o $@ -c lal_lj_class2_long_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/coul_long.ptx: lal_coul_long.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_coul_long.cu +$(OBJ_DIR)/coul_long.cubin: lal_coul_long.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul_long.cu -$(OBJ_DIR)/coul_long_ptx.h: $(OBJ_DIR)/coul_long.ptx $(OBJ_DIR)/coul_long.ptx - $(BSH) ./geryon/file_to_cstr.sh coul_long $(OBJ_DIR)/coul_long.ptx $(OBJ_DIR)/coul_long_ptx.h +$(OBJ_DIR)/coul_long_cubin.h: $(OBJ_DIR)/coul_long.cubin $(OBJ_DIR)/coul_long.cubin + $(BIN2C) -c -n coul_long $(OBJ_DIR)/coul_long.cubin > $(OBJ_DIR)/coul_long_cubin.h -$(OBJ_DIR)/lal_coul_long.o: $(ALL_H) lal_coul_long.h lal_coul_long.cpp $(OBJ_DIR)/coul_long_ptx.h $(OBJ_DIR)/lal_base_charge.o +$(OBJ_DIR)/lal_coul_long.o: $(ALL_H) lal_coul_long.h lal_coul_long.cpp $(OBJ_DIR)/coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o $(CUDR) -o $@ -c lal_coul_long.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_coul_long_ext.o: $(ALL_H) lal_coul_long.h lal_coul_long_ext.cpp lal_base_charge.h $(CUDR) -o $@ -c lal_coul_long_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/lj_coul_long.ptx: lal_lj_coul_long.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_coul_long.cu +$(OBJ_DIR)/lj_coul_long.cubin: lal_lj_coul_long.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul_long.cu -$(OBJ_DIR)/lj_coul_long_ptx.h: $(OBJ_DIR)/lj_coul_long.ptx $(OBJ_DIR)/lj_coul_long.ptx - $(BSH) ./geryon/file_to_cstr.sh lj_coul_long $(OBJ_DIR)/lj_coul_long.ptx $(OBJ_DIR)/lj_coul_long_ptx.h +$(OBJ_DIR)/lj_coul_long_cubin.h: $(OBJ_DIR)/lj_coul_long.cubin $(OBJ_DIR)/lj_coul_long.cubin + $(BIN2C) -c -n lj_coul_long $(OBJ_DIR)/lj_coul_long.cubin > $(OBJ_DIR)/lj_coul_long_cubin.h -$(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp $(OBJ_DIR)/lj_coul_long_ptx.h $(OBJ_DIR)/lal_base_charge.o +$(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp $(OBJ_DIR)/lj_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o $(CUDR) -o $@ -c lal_lj_coul_long.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_lj_coul_long_ext.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long_ext.cpp lal_base_charge.h $(CUDR) -o $@ -c lal_lj_coul_long_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/morse.ptx: lal_morse.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_morse.cu +$(OBJ_DIR)/lj_dsf.cubin: lal_lj_dsf.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_dsf.cu -$(OBJ_DIR)/morse_ptx.h: $(OBJ_DIR)/morse.ptx $(OBJ_DIR)/morse.ptx - $(BSH) ./geryon/file_to_cstr.sh morse $(OBJ_DIR)/morse.ptx $(OBJ_DIR)/morse_ptx.h +$(OBJ_DIR)/lj_dsf_cubin.h: $(OBJ_DIR)/lj_dsf.cubin $(OBJ_DIR)/lj_dsf.cubin + $(BIN2C) -c -n lj_dsf $(OBJ_DIR)/lj_dsf.cubin > $(OBJ_DIR)/lj_dsf_cubin.h -$(OBJ_DIR)/lal_morse.o: $(ALL_H) lal_morse.h lal_morse.cpp $(OBJ_DIR)/morse_ptx.h $(OBJ_DIR)/lal_base_atomic.o +$(OBJ_DIR)/lal_lj_dsf.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf.cpp $(OBJ_DIR)/lj_dsf_cubin.h $(OBJ_DIR)/lal_base_charge.o + $(CUDR) -o $@ -c lal_lj_dsf.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_lj_dsf_ext.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf_ext.cpp lal_base_charge.h + $(CUDR) -o $@ -c lal_lj_dsf_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/morse.cubin: lal_morse.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_morse.cu + +$(OBJ_DIR)/morse_cubin.h: $(OBJ_DIR)/morse.cubin $(OBJ_DIR)/morse.cubin + $(BIN2C) -c -n morse $(OBJ_DIR)/morse.cubin > $(OBJ_DIR)/morse_cubin.h + +$(OBJ_DIR)/lal_morse.o: $(ALL_H) lal_morse.h lal_morse.cpp $(OBJ_DIR)/morse_cubin.h $(OBJ_DIR)/lal_base_atomic.o $(CUDR) -o $@ -c lal_morse.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_morse_ext.o: $(ALL_H) lal_morse.h lal_morse_ext.cpp lal_base_atomic.h $(CUDR) -o $@ -c lal_morse_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/charmm_long.ptx: lal_charmm_long.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_charmm_long.cu +$(OBJ_DIR)/charmm_long.cubin: lal_charmm_long.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_charmm_long.cu -$(OBJ_DIR)/charmm_long_ptx.h: $(OBJ_DIR)/charmm_long.ptx $(OBJ_DIR)/charmm_long.ptx - $(BSH) ./geryon/file_to_cstr.sh charmm_long $(OBJ_DIR)/charmm_long.ptx $(OBJ_DIR)/charmm_long_ptx.h +$(OBJ_DIR)/charmm_long_cubin.h: $(OBJ_DIR)/charmm_long.cubin $(OBJ_DIR)/charmm_long.cubin + $(BIN2C) -c -n charmm_long $(OBJ_DIR)/charmm_long.cubin > $(OBJ_DIR)/charmm_long_cubin.h -$(OBJ_DIR)/lal_charmm_long.o: $(ALL_H) lal_charmm_long.h lal_charmm_long.cpp $(OBJ_DIR)/charmm_long_ptx.h $(OBJ_DIR)/lal_base_charge.o +$(OBJ_DIR)/lal_charmm_long.o: $(ALL_H) lal_charmm_long.h lal_charmm_long.cpp $(OBJ_DIR)/charmm_long_cubin.h $(OBJ_DIR)/lal_base_charge.o $(CUDR) -o $@ -c lal_charmm_long.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_charmm_long_ext.o: $(ALL_H) lal_charmm_long.h lal_charmm_long_ext.cpp lal_base_charge.h $(CUDR) -o $@ -c lal_charmm_long_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/lj96.ptx: lal_lj96.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj96.cu +$(OBJ_DIR)/lj96.cubin: lal_lj96.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj96.cu -$(OBJ_DIR)/lj96_ptx.h: $(OBJ_DIR)/lj96.ptx $(OBJ_DIR)/lj96.ptx - $(BSH) ./geryon/file_to_cstr.sh lj96 $(OBJ_DIR)/lj96.ptx $(OBJ_DIR)/lj96_ptx.h +$(OBJ_DIR)/lj96_cubin.h: $(OBJ_DIR)/lj96.cubin $(OBJ_DIR)/lj96.cubin + $(BIN2C) -c -n lj96 $(OBJ_DIR)/lj96.cubin > $(OBJ_DIR)/lj96_cubin.h -$(OBJ_DIR)/lal_lj96.o: $(ALL_H) lal_lj96.h lal_lj96.cpp $(OBJ_DIR)/lj96_ptx.h $(OBJ_DIR)/lal_base_atomic.o +$(OBJ_DIR)/lal_lj96.o: $(ALL_H) lal_lj96.h lal_lj96.cpp $(OBJ_DIR)/lj96_cubin.h $(OBJ_DIR)/lal_base_atomic.o $(CUDR) -o $@ -c lal_lj96.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_lj96_ext.o: $(ALL_H) lal_lj96.h lal_lj96_ext.cpp lal_base_atomic.h $(CUDR) -o $@ -c lal_lj96_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/lj_expand.ptx: lal_lj_expand.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_expand.cu +$(OBJ_DIR)/lj_expand.cubin: lal_lj_expand.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_expand.cu -$(OBJ_DIR)/lj_expand_ptx.h: $(OBJ_DIR)/lj_expand.ptx $(OBJ_DIR)/lj_expand.ptx - $(BSH) ./geryon/file_to_cstr.sh lj_expand $(OBJ_DIR)/lj_expand.ptx $(OBJ_DIR)/lj_expand_ptx.h +$(OBJ_DIR)/lj_expand_cubin.h: $(OBJ_DIR)/lj_expand.cubin $(OBJ_DIR)/lj_expand.cubin + $(BIN2C) -c -n lj_expand $(OBJ_DIR)/lj_expand.cubin > $(OBJ_DIR)/lj_expand_cubin.h -$(OBJ_DIR)/lal_lj_expand.o: $(ALL_H) lal_lj_expand.h lal_lj_expand.cpp $(OBJ_DIR)/lj_expand_ptx.h $(OBJ_DIR)/lal_base_atomic.o +$(OBJ_DIR)/lal_lj_expand.o: $(ALL_H) lal_lj_expand.h lal_lj_expand.cpp $(OBJ_DIR)/lj_expand_cubin.h $(OBJ_DIR)/lal_base_atomic.o $(CUDR) -o $@ -c lal_lj_expand.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_lj_expand_ext.o: $(ALL_H) lal_lj_expand.h lal_lj_expand_ext.cpp lal_base_atomic.h $(CUDR) -o $@ -c lal_lj_expand_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/cg_cmm.ptx: lal_cg_cmm.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_cg_cmm.cu +$(OBJ_DIR)/cg_cmm.cubin: lal_cg_cmm.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_cg_cmm.cu -$(OBJ_DIR)/cg_cmm_ptx.h: $(OBJ_DIR)/cg_cmm.ptx $(OBJ_DIR)/cg_cmm.ptx - $(BSH) ./geryon/file_to_cstr.sh cg_cmm $(OBJ_DIR)/cg_cmm.ptx $(OBJ_DIR)/cg_cmm_ptx.h +$(OBJ_DIR)/cg_cmm_cubin.h: $(OBJ_DIR)/cg_cmm.cubin $(OBJ_DIR)/cg_cmm.cubin + $(BIN2C) -c -n cg_cmm $(OBJ_DIR)/cg_cmm.cubin > $(OBJ_DIR)/cg_cmm_cubin.h -$(OBJ_DIR)/lal_cg_cmm.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm.cpp $(OBJ_DIR)/cg_cmm_ptx.h $(OBJ_DIR)/lal_base_atomic.o +$(OBJ_DIR)/lal_cg_cmm.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm.cpp $(OBJ_DIR)/cg_cmm_cubin.h $(OBJ_DIR)/lal_base_atomic.o $(CUDR) -o $@ -c lal_cg_cmm.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_cg_cmm_ext.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm_ext.cpp lal_base_atomic.h $(CUDR) -o $@ -c lal_cg_cmm_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/cg_cmm_long.ptx: lal_cg_cmm_long.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_cg_cmm_long.cu +$(OBJ_DIR)/cg_cmm_long.cubin: lal_cg_cmm_long.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_cg_cmm_long.cu -$(OBJ_DIR)/cg_cmm_long_ptx.h: $(OBJ_DIR)/cg_cmm_long.ptx $(OBJ_DIR)/cg_cmm_long.ptx - $(BSH) ./geryon/file_to_cstr.sh cg_cmm_long $(OBJ_DIR)/cg_cmm_long.ptx $(OBJ_DIR)/cg_cmm_long_ptx.h +$(OBJ_DIR)/cg_cmm_long_cubin.h: $(OBJ_DIR)/cg_cmm_long.cubin $(OBJ_DIR)/cg_cmm_long.cubin + $(BIN2C) -c -n cg_cmm_long $(OBJ_DIR)/cg_cmm_long.cubin > $(OBJ_DIR)/cg_cmm_long_cubin.h -$(OBJ_DIR)/lal_cg_cmm_long.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long.cpp $(OBJ_DIR)/cg_cmm_long_ptx.h $(OBJ_DIR)/lal_base_atomic.o +$(OBJ_DIR)/lal_cg_cmm_long.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long.cpp $(OBJ_DIR)/cg_cmm_long_cubin.h $(OBJ_DIR)/lal_base_atomic.o $(CUDR) -o $@ -c lal_cg_cmm_long.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_cg_cmm_long_ext.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long_ext.cpp lal_base_charge.h $(CUDR) -o $@ -c lal_cg_cmm_long_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/eam.ptx: lal_eam.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_eam.cu - -$(OBJ_DIR)/eam_ptx.h: $(OBJ_DIR)/eam.ptx $(OBJ_DIR)/eam.ptx - $(BSH) ./geryon/file_to_cstr.sh eam $(OBJ_DIR)/eam.ptx $(OBJ_DIR)/eam_ptx.h - -$(OBJ_DIR)/lal_eam.o: $(ALL_H) lal_eam.h lal_eam.cpp $(OBJ_DIR)/eam_ptx.h $(OBJ_DIR)/lal_base_atomic.o +$(OBJ_DIR)/eam.cubin: lal_eam.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_eam.cu + +$(OBJ_DIR)/eam_cubin.h: $(OBJ_DIR)/eam.cubin $(OBJ_DIR)/eam.cubin + $(BIN2C) -c -n eam $(OBJ_DIR)/eam.cubin > $(OBJ_DIR)/eam_cubin.h + +$(OBJ_DIR)/lal_eam.o: $(ALL_H) lal_eam.h lal_eam.cpp $(OBJ_DIR)/eam_cubin.h $(OBJ_DIR)/lal_base_atomic.o $(CUDR) -o $@ -c lal_eam.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_eam_ext.o: $(ALL_H) lal_eam.h lal_eam_ext.cpp lal_base_atomic.h $(CUDR) -o $@ -c lal_eam_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/buck.ptx: lal_buck.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_buck.cu - -$(OBJ_DIR)/buck_ptx.h: $(OBJ_DIR)/buck.ptx $(OBJ_DIR)/buck.ptx - $(BSH) ./geryon/file_to_cstr.sh buck $(OBJ_DIR)/buck.ptx $(OBJ_DIR)/buck_ptx.h - -$(OBJ_DIR)/lal_buck.o: $(ALL_H) lal_buck.h lal_buck.cpp $(OBJ_DIR)/buck_ptx.h $(OBJ_DIR)/lal_base_atomic.o +$(OBJ_DIR)/buck.cubin: lal_buck.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_buck.cu + +$(OBJ_DIR)/buck_cubin.h: $(OBJ_DIR)/buck.cubin $(OBJ_DIR)/buck.cubin + $(BIN2C) -c -n buck $(OBJ_DIR)/buck.cubin > $(OBJ_DIR)/buck_cubin.h + +$(OBJ_DIR)/lal_buck.o: $(ALL_H) lal_buck.h lal_buck.cpp $(OBJ_DIR)/buck_cubin.h $(OBJ_DIR)/lal_base_atomic.o $(CUDR) -o $@ -c lal_buck.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_buck_ext.o: $(ALL_H) lal_buck.h lal_buck_ext.cpp lal_base_atomic.h $(CUDR) -o $@ -c lal_buck_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/buck_coul.ptx: lal_buck_coul.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_buck_coul.cu - -$(OBJ_DIR)/buck_coul_ptx.h: $(OBJ_DIR)/buck_coul.ptx $(OBJ_DIR)/buck_coul.ptx - $(BSH) ./geryon/file_to_cstr.sh buck_coul $(OBJ_DIR)/buck_coul.ptx $(OBJ_DIR)/buck_coul_ptx.h - -$(OBJ_DIR)/lal_buck_coul.o: $(ALL_H) lal_buck_coul.h lal_buck_coul.cpp $(OBJ_DIR)/buck_coul_ptx.h $(OBJ_DIR)/lal_base_charge.o +$(OBJ_DIR)/buck_coul.cubin: lal_buck_coul.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_buck_coul.cu + +$(OBJ_DIR)/buck_coul_cubin.h: $(OBJ_DIR)/buck_coul.cubin $(OBJ_DIR)/buck_coul.cubin + $(BIN2C) -c -n buck_coul $(OBJ_DIR)/buck_coul.cubin > $(OBJ_DIR)/buck_coul_cubin.h + +$(OBJ_DIR)/lal_buck_coul.o: $(ALL_H) lal_buck_coul.h lal_buck_coul.cpp $(OBJ_DIR)/buck_coul_cubin.h $(OBJ_DIR)/lal_base_charge.o $(CUDR) -o $@ -c lal_buck_coul.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_buck_coul_ext.o: $(ALL_H) lal_buck_coul.h lal_buck_coul_ext.cpp lal_base_charge.h $(CUDR) -o $@ -c lal_buck_coul_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/buck_coul_long.ptx: lal_buck_coul_long.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_buck_coul_long.cu - -$(OBJ_DIR)/buck_coul_long_ptx.h: $(OBJ_DIR)/buck_coul_long.ptx $(OBJ_DIR)/buck_coul_long.ptx - $(BSH) ./geryon/file_to_cstr.sh buck_coul_long $(OBJ_DIR)/buck_coul_long.ptx $(OBJ_DIR)/buck_coul_long_ptx.h - -$(OBJ_DIR)/lal_buck_coul_long.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long.cpp $(OBJ_DIR)/buck_coul_long_ptx.h $(OBJ_DIR)/lal_base_charge.o +$(OBJ_DIR)/buck_coul_long.cubin: lal_buck_coul_long.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_buck_coul_long.cu + +$(OBJ_DIR)/buck_coul_long_cubin.h: $(OBJ_DIR)/buck_coul_long.cubin $(OBJ_DIR)/buck_coul_long.cubin + $(BIN2C) -c -n buck_coul_long $(OBJ_DIR)/buck_coul_long.cubin > $(OBJ_DIR)/buck_coul_long_cubin.h + +$(OBJ_DIR)/lal_buck_coul_long.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long.cpp $(OBJ_DIR)/buck_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o $(CUDR) -o $@ -c lal_buck_coul_long.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_buck_coul_long_ext.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long_ext.cpp lal_base_charge.h $(CUDR) -o $@ -c lal_buck_coul_long_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/table.ptx: lal_table.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_table.cu - -$(OBJ_DIR)/table_ptx.h: $(OBJ_DIR)/table.ptx $(OBJ_DIR)/table.ptx - $(BSH) ./geryon/file_to_cstr.sh table $(OBJ_DIR)/table.ptx $(OBJ_DIR)/table_ptx.h - -$(OBJ_DIR)/lal_table.o: $(ALL_H) lal_table.h lal_table.cpp $(OBJ_DIR)/table_ptx.h $(OBJ_DIR)/lal_base_atomic.o +$(OBJ_DIR)/table.cubin: lal_table.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_table.cu + +$(OBJ_DIR)/table_cubin.h: $(OBJ_DIR)/table.cubin $(OBJ_DIR)/table.cubin + $(BIN2C) -c -n table $(OBJ_DIR)/table.cubin > $(OBJ_DIR)/table_cubin.h + +$(OBJ_DIR)/lal_table.o: $(ALL_H) lal_table.h lal_table.cpp $(OBJ_DIR)/table_cubin.h $(OBJ_DIR)/lal_base_atomic.o $(CUDR) -o $@ -c lal_table.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_table_ext.o: $(ALL_H) lal_table.h lal_table_ext.cpp lal_base_atomic.h $(CUDR) -o $@ -c lal_table_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/yukawa.ptx: lal_yukawa.cu lal_precision.h lal_preprocessor.h - $(CUDA) --ptx -DNV_KERNEL -o $@ lal_yukawa.cu - -$(OBJ_DIR)/yukawa_ptx.h: $(OBJ_DIR)/yukawa.ptx $(OBJ_DIR)/yukawa.ptx - $(BSH) ./geryon/file_to_cstr.sh yukawa $(OBJ_DIR)/yukawa.ptx $(OBJ_DIR)/yukawa_ptx.h - -$(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp $(OBJ_DIR)/yukawa_ptx.h $(OBJ_DIR)/lal_base_atomic.o +$(OBJ_DIR)/yukawa.cubin: lal_yukawa.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_yukawa.cu + +$(OBJ_DIR)/yukawa_cubin.h: $(OBJ_DIR)/yukawa.cubin $(OBJ_DIR)/yukawa.cubin + $(BIN2C) -c -n yukawa $(OBJ_DIR)/yukawa.cubin > $(OBJ_DIR)/yukawa_cubin.h + +$(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp $(OBJ_DIR)/yukawa_cubin.h $(OBJ_DIR)/lal_base_atomic.o $(CUDR) -o $@ -c lal_yukawa.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lal_yukawa_ext.o: $(ALL_H) lal_yukawa.h lal_yukawa_ext.cpp lal_base_atomic.h $(CUDR) -o $@ -c lal_yukawa_ext.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/born.cubin: lal_born.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_born.cu + +$(OBJ_DIR)/born_cubin.h: $(OBJ_DIR)/born.cubin $(OBJ_DIR)/born.cubin + $(BIN2C) -c -n born $(OBJ_DIR)/born.cubin > $(OBJ_DIR)/born_cubin.h + +$(OBJ_DIR)/lal_born.o: $(ALL_H) lal_born.h lal_born.cpp $(OBJ_DIR)/born_cubin.h $(OBJ_DIR)/lal_base_atomic.o + $(CUDR) -o $@ -c lal_born.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_born_ext.o: $(ALL_H) lal_born.h lal_born_ext.cpp lal_base_atomic.h + $(CUDR) -o $@ -c lal_born_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/born_coul_wolf.cubin: lal_born_coul_wolf.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_born_coul_wolf.cu + +$(OBJ_DIR)/born_coul_wolf_cubin.h: $(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf.cubin + $(BIN2C) -c -n born_coul_wolf $(OBJ_DIR)/born_coul_wolf.cubin > $(OBJ_DIR)/born_coul_wolf_cubin.h + +$(OBJ_DIR)/lal_born_coul_wolf.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf.cpp $(OBJ_DIR)/born_coul_wolf_cubin.h $(OBJ_DIR)/lal_base_charge.o + $(CUDR) -o $@ -c lal_born_coul_wolf.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_born_coul_wolf_ext.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf_ext.cpp lal_base_charge.h + $(CUDR) -o $@ -c lal_born_coul_wolf_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/born_coul_long.cubin: lal_born_coul_long.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_born_coul_long.cu + +$(OBJ_DIR)/born_coul_long_cubin.h: $(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long.cubin + $(BIN2C) -c -n born_coul_long $(OBJ_DIR)/born_coul_long.cubin > $(OBJ_DIR)/born_coul_long_cubin.h + +$(OBJ_DIR)/lal_born_coul_long.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long.cpp $(OBJ_DIR)/born_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o + $(CUDR) -o $@ -c lal_born_coul_long.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_born_coul_long_ext.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long_ext.cpp lal_base_charge.h + $(CUDR) -o $@ -c lal_born_coul_long_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/dipole_lj.cubin: lal_dipole_lj.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_dipole_lj.cu + +$(OBJ_DIR)/dipole_lj_cubin.h: $(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj.cubin + $(BIN2C) -c -n dipole_lj $(OBJ_DIR)/dipole_lj.cubin > $(OBJ_DIR)/dipole_lj_cubin.h + +$(OBJ_DIR)/lal_dipole_lj.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj.cpp $(OBJ_DIR)/dipole_lj_cubin.h $(OBJ_DIR)/lal_base_dipole.o + $(CUDR) -o $@ -c lal_dipole_lj.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dipole_lj_ext.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj_ext.cpp lal_base_dipole.h + $(CUDR) -o $@ -c lal_dipole_lj_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/dipole_lj_sf.cubin: lal_dipole_lj_sf.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_dipole_lj_sf.cu + +$(OBJ_DIR)/dipole_lj_sf_cubin.h: $(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf.cubin + $(BIN2C) -c -n dipole_lj_sf $(OBJ_DIR)/dipole_lj_sf.cubin > $(OBJ_DIR)/dipole_lj_sf_cubin.h + +$(OBJ_DIR)/lal_dipole_lj_sf.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf.cpp $(OBJ_DIR)/dipole_lj_sf_cubin.h $(OBJ_DIR)/lal_base_dipole.o + $(CUDR) -o $@ -c lal_dipole_lj_sf.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dipole_lj_sf_ext.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf_ext.cpp lal_base_dipole.h + $(CUDR) -o $@ -c lal_dipole_lj_sf_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/colloid.cubin: lal_colloid.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_colloid.cu + +$(OBJ_DIR)/colloid_cubin.h: $(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid.cubin + $(BIN2C) -c -n colloid $(OBJ_DIR)/colloid.cubin > $(OBJ_DIR)/colloid_cubin.h + +$(OBJ_DIR)/lal_colloid.o: $(ALL_H) lal_colloid.h lal_colloid.cpp $(OBJ_DIR)/colloid_cubin.h $(OBJ_DIR)/lal_base_atomic.o + $(CUDR) -o $@ -c lal_colloid.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_colloid_ext.o: $(ALL_H) lal_colloid.h lal_colloid_ext.cpp lal_base_atomic.h + $(CUDR) -o $@ -c lal_colloid_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/gauss.cubin: lal_gauss.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_gauss.cu + +$(OBJ_DIR)/gauss_cubin.h: $(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss.cubin + $(BIN2C) -c -n gauss $(OBJ_DIR)/gauss.cubin > $(OBJ_DIR)/gauss_cubin.h + +$(OBJ_DIR)/lal_gauss.o: $(ALL_H) lal_gauss.h lal_gauss.cpp $(OBJ_DIR)/gauss_cubin.h $(OBJ_DIR)/lal_base_atomic.o + $(CUDR) -o $@ -c lal_gauss.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_gauss_ext.o: $(ALL_H) lal_gauss.h lal_gauss_ext.cpp lal_base_atomic.h + $(CUDR) -o $@ -c lal_gauss_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/yukawa_colloid.cubin: lal_yukawa_colloid.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_yukawa_colloid.cu + +$(OBJ_DIR)/yukawa_colloid_cubin.h: $(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid.cubin + $(BIN2C) -c -n yukawa_colloid $(OBJ_DIR)/yukawa_colloid.cubin > $(OBJ_DIR)/yukawa_colloid_cubin.h + +$(OBJ_DIR)/lal_yukawa_colloid.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid.cpp $(OBJ_DIR)/yukawa_colloid_cubin.h $(OBJ_DIR)/lal_base_atomic.o + $(CUDR) -o $@ -c lal_yukawa_colloid.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_yukawa_colloid_ext.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid_ext.cpp lal_base_atomic.h + $(CUDR) -o $@ -c lal_yukawa_colloid_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj_coul_debye.cubin: lal_lj_coul_debye.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul_debye.cu + +$(OBJ_DIR)/lj_coul_debye_cubin.h: $(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye.cubin + $(BIN2C) -c -n lj_coul_debye $(OBJ_DIR)/lj_coul_debye.cubin > $(OBJ_DIR)/lj_coul_debye_cubin.h + +$(OBJ_DIR)/lal_lj_coul_debye.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye.cpp $(OBJ_DIR)/lj_coul_debye_cubin.h $(OBJ_DIR)/lal_base_charge.o + $(CUDR) -o $@ -c lal_lj_coul_debye.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_lj_coul_debye_ext.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye_ext.cpp lal_base_charge.h + $(CUDR) -o $@ -c lal_lj_coul_debye_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/coul_dsf.cubin: lal_coul_dsf.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul_dsf.cu + +$(OBJ_DIR)/coul_dsf_cubin.h: $(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf.cubin + $(BIN2C) -c -n coul_dsf $(OBJ_DIR)/coul_dsf.cubin > $(OBJ_DIR)/coul_dsf_cubin.h + +$(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/coul_dsf_cubin.h $(OBJ_DIR)/lal_base_charge.o + $(CUDR) -o $@ -c lal_coul_dsf.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h + $(CUDR) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR) + $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H) $(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda @@ -415,10 +575,10 @@ $(GPU_LIB): $(OBJS) $(CUDPP) $(AR) -crusv $(GPU_LIB) $(OBJS) $(CUDPP) clean: - rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(PTXS) *.linkinfo + rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CBNS) *.linkinfo veryclean: clean rm -rf *~ *.linkinfo cleanlib: - rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(PTXS) *.linkinfo + rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CBNS) *.linkinfo diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 8435cddb72..51bd78fbd7 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -17,6 +17,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \ $(OBJ_DIR)/lal_neighbor_shared.o $(OBJ_DIR)/lal_neighbor.o \ $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \ $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \ + $(OBJ_DIR)/lal_base_dipole.o \ $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \ $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \ $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \ @@ -25,6 +26,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \ $(OBJ_DIR)/lal_lj_expand.o $(OBJ_DIR)/lal_lj_expand_ext.o \ $(OBJ_DIR)/lal_lj_coul.o $(OBJ_DIR)/lal_lj_coul_ext.o \ $(OBJ_DIR)/lal_lj_coul_long.o $(OBJ_DIR)/lal_lj_coul_long_ext.o \ + $(OBJ_DIR)/lal_lj_dsf.o $(OBJ_DIR)/lal_lj_dsf_ext.o \ $(OBJ_DIR)/lal_lj_class2_long.o $(OBJ_DIR)/lal_lj_class2_long_ext.o \ $(OBJ_DIR)/lal_coul_long.o $(OBJ_DIR)/lal_coul_long_ext.o \ $(OBJ_DIR)/lal_morse.o $(OBJ_DIR)/lal_morse_ext.o \ @@ -36,20 +38,43 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \ $(OBJ_DIR)/lal_buck_coul.o $(OBJ_DIR)/lal_buck_coul_ext.o \ $(OBJ_DIR)/lal_buck_coul_long.o $(OBJ_DIR)/lal_buck_coul_long_ext.o \ $(OBJ_DIR)/lal_table.o $(OBJ_DIR)/lal_table_ext.o \ - $(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o + $(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o \ + $(OBJ_DIR)/lal_born.o $(OBJ_DIR)/lal_born_ext.o \ + $(OBJ_DIR)/lal_born_coul_wolf.o $(OBJ_DIR)/lal_born_coul_wolf_ext.o \ + $(OBJ_DIR)/lal_born_coul_long.o $(OBJ_DIR)/lal_born_coul_long_ext.o \ + $(OBJ_DIR)/lal_dipole_lj.o $(OBJ_DIR)/lal_dipole_lj_ext.o \ + $(OBJ_DIR)/lal_dipole_lj_sf.o $(OBJ_DIR)/lal_dipole_lj_sf_ext.o \ + $(OBJ_DIR)/lal_colloid.o $(OBJ_DIR)/lal_colloid_ext.o \ + $(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \ + $(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \ + $(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \ + $(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o + KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \ $(OBJ_DIR)/ellipsoid_nbor_cl.h $(OBJ_DIR)/gayberne_cl.h \ $(OBJ_DIR)/gayberne_lj_cl.h $(OBJ_DIR)/re_squared_cl.h \ $(OBJ_DIR)/re_squared_lj_cl.h $(OBJ_DIR)/lj_cl.h $(OBJ_DIR)/lj96_cl.h \ $(OBJ_DIR)/lj_expand_cl.h $(OBJ_DIR)/lj_coul_cl.h \ - $(OBJ_DIR)/lj_coul_long_cl.h $(OBJ_DIR)/lj_class2_long_cl.h \ + $(OBJ_DIR)/lj_coul_long_cl.h $(OBJ_DIR)/lj_dsf_cl.h \ + $(OBJ_DIR)/lj_class2_long_cl.h \ $(OBJ_DIR)/coul_long_cl.h $(OBJ_DIR)/morse_cl.h \ $(OBJ_DIR)/charmm_long_cl.h $(OBJ_DIR)/cg_cmm_cl.h \ $(OBJ_DIR)/cg_cmm_long_cl.h $(OBJ_DIR)/neighbor_gpu_cl.h \ $(OBJ_DIR)/eam_cl.h $(OBJ_DIR)/buck_cl.h \ $(OBJ_DIR)/buck_coul_cl.h $(OBJ_DIR)/buck_coul_long_cl.h \ - $(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h + $(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h \ + $(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \ + $(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf_cubin.h \ + $(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long_cubin.h \ + $(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj_cubin.h \ + $(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf_cubin.h \ + $(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid_cubin.h \ + $(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \ + $(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \ + $(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \ + $(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h + OCL_EXECS = $(BIN_DIR)/ocl_get_devices @@ -91,6 +116,9 @@ $(OBJ_DIR)/lal_base_charge.o: $(OCL_H) lal_base_charge.h lal_base_charge.cpp $(OBJ_DIR)/lal_base_ellipsoid.o: $(OCL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_cl.h $(OCL) -o $@ -c lal_base_ellipsoid.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/lal_base_dipole.o: $(OCL_H) lal_base_dipole.h lal_base_dipole.cpp + $(OCL) -o $@ -c lal_base_dipole.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h $(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h; @@ -154,6 +182,15 @@ $(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp $(OBJ_DIR)/lal_lj_coul_long_ext.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long_ext.cpp lal_base_charge.h $(OCL) -o $@ -c lal_lj_coul_long_ext.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/lj_dsf_cl.h: lal_lj_dsf.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh lj_dsf $(PRE1_H) lal_lj_dsf.cu $(OBJ_DIR)/lj_dsf_cl.h; + +$(OBJ_DIR)/lal_lj_dsf.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf.cpp $(OBJ_DIR)/lj_dsf_cl.h $(OBJ_DIR)/lj_dsf_cl.h $(OBJ_DIR)/lal_base_charge.o + $(OCL) -o $@ -c lal_lj_dsf.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_lj_dsf_ext.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf_ext.cpp lal_base_charge.h + $(OCL) -o $@ -c lal_lj_dsf_ext.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/lj_class2_long_cl.h: lal_lj_class2_long.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh lj_class2_long $(PRE1_H) lal_lj_class2_long.cu $(OBJ_DIR)/lj_class2_long_cl.h; @@ -280,6 +317,96 @@ $(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp $(OBJ_DIR)/yukawa $(OBJ_DIR)/lal_yukawa_ext.o: $(ALL_H) lal_yukawa.h lal_yukawa_ext.cpp lal_base_atomic.h $(OCL) -o $@ -c lal_yukawa_ext.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/born_cl.h: lal_born.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh born $(PRE1_H) lal_born.cu $(OBJ_DIR)/born_cl.h; + +$(OBJ_DIR)/lal_born.o: $(ALL_H) lal_born.h lal_born.cpp $(OBJ_DIR)/born_cl.h $(OBJ_DIR)/born_cl.h $(OBJ_DIR)/lal_base_atomic.o + $(OCL) -o $@ -c lal_born.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_born_ext.o: $(ALL_H) lal_born.h lal_born_ext.cpp lal_base_atomic.h + $(OCL) -o $@ -c lal_born_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/born_coul_wolf_cl.h: lal_born_coul_wolf.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh born_coul_wolf $(PRE1_H) lal_born_coul_wolf.cu $(OBJ_DIR)/born_coul_wolf_cl.h; + +$(OBJ_DIR)/lal_born_coul_wolf.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf.cpp $(OBJ_DIR)/born_coul_wolf_cl.h $(OBJ_DIR)/born_coul_wolf_cl.h $(OBJ_DIR)/lal_base_charge.o + $(OCL) -o $@ -c lal_born_coul_wolf.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_born_coul_wolf_ext.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf_ext.cpp lal_base_charge.h + $(OCL) -o $@ -c lal_born_coul_wolf_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/born_coul_long_cl.h: lal_born_coul_long.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh born_coul_long $(PRE1_H) lal_born_coul_long.cu $(OBJ_DIR)/born_coul_long_cl.h; + +$(OBJ_DIR)/lal_born_coul_long.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long.cpp $(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/lal_base_charge.o + $(OCL) -o $@ -c lal_born_coul_long.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_born_coul_long_ext.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long_ext.cpp lal_base_charge.h + $(OCL) -o $@ -c lal_born_coul_long_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/dipole_lj_cl.h: lal_dipole_lj.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh dipole_lj $(PRE1_H) lal_dipole_lj.cu $(OBJ_DIR)/dipole_lj_cl.h; + +$(OBJ_DIR)/lal_dipole_lj.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj.cpp $(OBJ_DIR)/dipole_lj_cl.h $(OBJ_DIR)/dipole_lj_cl.h $(OBJ_DIR)/lal_base_dipole.o + $(OCL) -o $@ -c lal_dipole_lj.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dipole_lj_ext.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj_ext.cpp lal_base_dipole.h + $(OCL) -o $@ -c lal_dipole_lj_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/dipole_lj_sf_cl.h: lal_dipole_lj_sf.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh dipole_lj_sf $(PRE1_H) lal_dipole_lj_sf.cu $(OBJ_DIR)/dipole_lj_sf_cl.h; + +$(OBJ_DIR)/lal_dipole_lj_sf.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf.cpp $(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/lal_base_dipole.o + $(OCL) -o $@ -c lal_dipole_lj_sf.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dipole_lj_sf_ext.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf_ext.cpp lal_base_dipole.h + $(OCL) -o $@ -c lal_dipole_lj_sf_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/colloid_cl.h: lal_colloid.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh colloid $(PRE1_H) lal_colloid.cu $(OBJ_DIR)/colloid_cl.h; + +$(OBJ_DIR)/lal_colloid.o: $(ALL_H) lal_colloid.h lal_colloid.cpp $(OBJ_DIR)/colloid_cl.h $(OBJ_DIR)/colloid_cl.h $(OBJ_DIR)/lal_base_atomic.o + $(OCL) -o $@ -c lal_colloid.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_colloid_ext.o: $(ALL_H) lal_colloid.h lal_colloid_ext.cpp lal_base_atomic.h + $(OCL) -o $@ -c lal_colloid_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/gauss_cl.h: lal_gauss.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh gauss $(PRE1_H) lal_gauss.cu $(OBJ_DIR)/gauss_cl.h; + +$(OBJ_DIR)/lal_gauss.o: $(ALL_H) lal_gauss.h lal_gauss.cpp $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/lal_base_atomic.o + $(OCL) -o $@ -c lal_gauss.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_gauss_ext.o: $(ALL_H) lal_gauss.h lal_gauss_ext.cpp lal_base_atomic.h + $(OCL) -o $@ -c lal_gauss_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/yukawa_colloid_cl.h: lal_yukawa_colloid.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh yukawa_colloid $(PRE1_H) lal_yukawa_colloid.cu $(OBJ_DIR)/yukawa_colloid_cl.h; + +$(OBJ_DIR)/lal_yukawa_colloid.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid.cpp $(OBJ_DIR)/yukawa_colloid_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h $(OBJ_DIR)/lal_base_atomic.o + $(OCL) -o $@ -c lal_yukawa_colloid.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_yukawa_colloid_ext.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid_ext.cpp lal_base_atomic.h + $(OCL) -o $@ -c lal_yukawa_colloid_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj_coul_debye_cl.h: lal_lj_coul_debye.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh lj_coul_debye $(PRE1_H) lal_lj_coul_debye.cu $(OBJ_DIR)/lj_coul_debye_cl.h; + +$(OBJ_DIR)/lal_lj_coul_debye.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye.cpp $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/lal_base_charge.o + $(OCL) -o $@ -c lal_lj_coul_debye.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_lj_coul_debye_ext.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye_ext.cpp lal_base_charge.h + $(OCL) -o $@ -c lal_lj_coul_debye_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/coul_dsf_cl.h: lal_coul_dsf.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh coul_dsf $(PRE1_H) lal_coul_dsf.cu $(OBJ_DIR)/coul_dsf_cl.h; + +$(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/coul_dsf_cl.h $(OBJ_DIR)/coul_dsf_cl.h $(OBJ_DIR)/lal_base_charge.o + $(OCL) -o $@ -c lal_coul_dsf.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h + $(OCL) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR) + $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) diff --git a/lib/gpu/geryon/README b/lib/gpu/geryon/README index 601c19dc3c..018e9cff7f 100644 --- a/lib/gpu/geryon/README +++ b/lib/gpu/geryon/README @@ -1,3 +1,7 @@ +NOTE: This Geryon distribution has been modified to remove files not + necessary for the LAMMPS implementation. The full distribution + is available at http://users.nccs.gov/~wb8/geryon/index.htm + Geryon Copyright (2010) Sandia Corporation. Under the terms of Contract diff --git a/lib/gpu/geryon/VERSION.txt b/lib/gpu/geryon/VERSION.txt index 313907d611..47cefed44d 100644 --- a/lib/gpu/geryon/VERSION.txt +++ b/lib/gpu/geryon/VERSION.txt @@ -1 +1 @@ -Geryon Version 12.034 +Geryon Version 12.033 diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h index c17c1943c3..938e1d3bd6 100644 --- a/lib/gpu/geryon/nvd_device.h +++ b/lib/gpu/geryon/nvd_device.h @@ -141,6 +141,11 @@ class UCL_Device { /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) inline int device_type(const int i) { return UCL_GPU; } + /// Returns true if host memory is efficiently addressable from device + inline bool shared_memory() { return shared_memory(_device); } + /// Returns true if host memory is efficiently addressable from device + inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } + /// Returns true if double precision is support for the current device bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device diff --git a/lib/gpu/geryon/nvd_kernel.h b/lib/gpu/geryon/nvd_kernel.h index b72c89c51f..fecd85eeb8 100644 --- a/lib/gpu/geryon/nvd_kernel.h +++ b/lib/gpu/geryon/nvd_kernel.h @@ -30,11 +30,23 @@ namespace ucl_cudadr { class UCL_Texture; +template class UCL_D_Vec; +template class UCL_D_Mat; +template class UCL_Vector; +template class UCL_Matrix; +#define UCL_MAX_KERNEL_ARGS 256 /// Class storing 1 or more kernel functions from a single string or file class UCL_Program { public: inline UCL_Program(UCL_Device &device) { _cq=device.cq(); } + inline UCL_Program(UCL_Device &device, const void *program, + const char *flags="", std::string *log=NULL) { + _cq=device.cq(); + init(device); + load_string(program,flags,log); + } + inline ~UCL_Program() {} /// Initialize the program with a device @@ -64,10 +76,10 @@ class UCL_Program { } /// Load a program from a string and compile with flags - inline int load_string(const char *program, const char *flags="", + inline int load_string(const void *program, const char *flags="", std::string *log=NULL) { if (std::string(flags)=="BINARY") - return load_binary(program); + return load_binary((const char *)program); const unsigned int num_opts=2; CUjit_option options[num_opts]; void *values[num_opts]; @@ -134,15 +146,25 @@ class UCL_Program { friend class UCL_Texture; }; -/// Class for dealing with OpenCL kernels +/// Class for dealing with CUDA Driver kernels class UCL_Kernel { public: - UCL_Kernel() : _dimensions(1), _num_args(0), _param_size(0) - { _num_blocks[0]=0; } + UCL_Kernel() : _dimensions(1), _num_args(0) { + #if CUDA_VERSION < 4000 + _param_size=0; + #endif + _num_blocks[0]=0; + } UCL_Kernel(UCL_Program &program, const char *function) : - _dimensions(1), _num_args(0), _param_size(0) - { _num_blocks[0]=0; set_function(program,function); _cq=program._cq; } + _dimensions(1), _num_args(0) { + #if CUDA_VERSION < 4000 + _param_size=0; + #endif + _num_blocks[0]=0; + set_function(program,function); + _cq=program._cq; + } ~UCL_Kernel() {} @@ -170,78 +192,190 @@ class UCL_Kernel { * changes * \note To set kernel parameter i (i>0), parameter i-1 must be set **/ template - inline void set_arg(const unsigned index, dtype *arg) { + inline void set_arg(const unsigned index, const dtype * const arg) { if (index==_num_args) add_arg(arg); else if (index<_num_args) + #if CUDA_VERSION >= 4000 + _kernel_args[index]=arg; + #else CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype))); + #endif else assert(0==1); // Must add kernel parameters in sequential order } + /// Set a geryon container as a kernel argument. + template + inline void set_arg(const UCL_D_Vec * const arg) + { set_arg(&arg->begin()); } + + /// Set a geryon container as a kernel argument. + template + inline void set_arg(const UCL_D_Mat * const arg) + { set_arg(&arg->begin()); } + + /// Set a geryon container as a kernel argument. + template + inline void set_arg(const UCL_Vector * const arg) + { set_arg(&arg->device.begin()); } + + /// Set a geryon container as a kernel argument. + template + inline void set_arg(const UCL_Matrix * const arg) + { set_arg(&arg->device.begin()); } + /// Add a kernel argument. inline void add_arg(const CUdeviceptr* const arg) { + #if CUDA_VERSION >= 4000 + _kernel_args[_num_args]=(void *)arg; + #else void* ptr = (void*)(size_t)(*arg); _param_size = (_param_size + __alignof(ptr) - 1) & ~(__alignof(ptr) - 1); CU_SAFE_CALL(cuParamSetv(_kernel, _param_size, &ptr, sizeof(ptr))); _offsets.push_back(_param_size); _param_size+=sizeof(ptr); + #endif _num_args++; + if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1); } /// Add a kernel argument. template inline void add_arg(const dtype* const arg) { + #if CUDA_VERSION >= 4000 + _kernel_args[_num_args]=const_cast(arg); + #else _param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1); CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype))); _offsets.push_back(_param_size); _param_size+=sizeof(dtype); + #endif _num_args++; + if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1); } + /// Add a geryon container as a kernel argument. + template + inline void add_arg(const UCL_D_Vec * const arg) + { add_arg(&arg->begin()); } + + /// Add a geryon container as a kernel argument. + template + inline void add_arg(const UCL_D_Mat * const arg) + { add_arg(&arg->begin()); } + + /// Add a geryon container as a kernel argument. + template + inline void add_arg(const UCL_Vector * const arg) + { add_arg(&arg->device.begin()); } + + /// Add a geryon container as a kernel argument. + template + inline void add_arg(const UCL_Matrix * const arg) + { add_arg(&arg->device.begin()); } + /// Set the number of thread blocks and the number of threads in each block - /** \note This should be called after all arguments have been added **/ + /** \note This should be called before any arguments have been added + \note The default command queue is used for the kernel execution **/ inline void set_size(const size_t num_blocks, const size_t block_size) { _dimensions=1; _num_blocks[0]=num_blocks; - _num_blocks[1]=1; + _num_blocks[1]=1; + _num_blocks[2]=1; + #if CUDA_VERSION >= 4000 + _block_size[0]=block_size; + _block_size[1]=1; + _block_size[2]=1; + #else CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1)); + #endif } /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue for the kernel is changed to cq **/ + inline void set_size(const size_t num_blocks, const size_t block_size, + command_queue &cq) + { _cq=cq; set_size(num_blocks,block_size); } + + /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue is used for the kernel execution **/ inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, const size_t block_size_x, const size_t block_size_y) { _dimensions=2; _num_blocks[0]=num_blocks_x; _num_blocks[1]=num_blocks_y; + _num_blocks[2]=1; + #if CUDA_VERSION >= 4000 + _block_size[0]=block_size_x; + _block_size[1]=block_size_y; + _block_size[2]=1; + #else CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1)); + #endif } /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue for the kernel is changed to cq **/ + inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, + const size_t block_size_x, const size_t block_size_y, + command_queue &cq) + {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);} + + /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue is used for the kernel execution **/ inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, const size_t block_size_x, const size_t block_size_y, const size_t block_size_z) { _dimensions=2; _num_blocks[0]=num_blocks_x; _num_blocks[1]=num_blocks_y; + _num_blocks[2]=1; + #if CUDA_VERSION >= 4000 + _block_size[0]=block_size_x; + _block_size[1]=block_size_y; + _block_size[2]=block_size_z; + #else CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y, - block_size_z)); + block_size_z)); + #endif } - /// Run the kernel in the default command queue - inline void run() { - CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size)); - CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq)); + /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue is used for the kernel execution **/ + inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, + const size_t block_size_x, const size_t block_size_y, + const size_t block_size_z, command_queue &cq) { + _cq=cq; + set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y, + block_size_z); } - /// Run the kernel in the specified command queue - inline void run(command_queue &cq) { + /// Run the kernel in the default command queue + inline void run() { + #if CUDA_VERSION >= 4000 + CU_SAFE_CALL(cuLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1], + _num_blocks[2],_block_size[0],_block_size[1], + _block_size[2],0,_cq,_kernel_args,NULL)); + #else CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size)); - CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],cq)); + CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq)); + #endif } /// Clear any arguments associated with the kernel - inline void clear_args() { _num_args=0; _offsets.clear(); _param_size=0; } + inline void clear_args() { + _num_args=0; + #if CUDA_VERSION < 4000 + _offsets.clear(); + _param_size=0; + #endif + } #include "ucl_arg_kludge.h" @@ -249,11 +383,17 @@ class UCL_Kernel { CUfunction _kernel; CUstream _cq; unsigned _dimensions; - unsigned _num_blocks[2]; + unsigned _num_blocks[3]; unsigned _num_args; + friend class UCL_Texture; + + #if CUDA_VERSION >= 4000 + unsigned _block_size[3]; + void * _kernel_args[UCL_MAX_KERNEL_ARGS]; + #else std::vector _offsets; unsigned _param_size; - friend class UCL_Texture; + #endif }; } // namespace diff --git a/lib/gpu/geryon/nvd_mat.h b/lib/gpu/geryon/nvd_mat.h index ed42305a70..51cfe1d56f 100644 --- a/lib/gpu/geryon/nvd_mat.h +++ b/lib/gpu/geryon/nvd_mat.h @@ -38,6 +38,9 @@ namespace ucl_cudadr { #include "ucl_h_mat.h" #include "ucl_d_vec.h" #include "ucl_d_mat.h" +#include "ucl_s_obj_help.h" +#include "ucl_vector.h" +#include "ucl_matrix.h" #undef _UCL_DEVICE_PTR_MAT #undef _UCL_MAT_ALLOW diff --git a/lib/gpu/geryon/nvd_memory.h b/lib/gpu/geryon/nvd_memory.h index dc70aa1b3c..335418fe5f 100644 --- a/lib/gpu/geryon/nvd_memory.h +++ b/lib/gpu/geryon/nvd_memory.h @@ -85,6 +85,21 @@ inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) { free(mat.begin()); } +template +inline int _host_resize(mat_type &mat, const size_t n) { + _host_free(mat,mat.kind()); + CUresult err=CUDA_SUCCESS; + if (mat.kind()==UCL_RW_OPTIMIZED) + err=cuMemAllocHost((void **)mat.host_ptr(),n); + else if (mat.kind()==UCL_WRITE_OPTIMIZED) + err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED); + else + *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); + if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL) + return UCL_MEMORY_ERROR; + return UCL_SUCCESS; +} + // -------------------------------------------------------------------------- // - DEVICE MEMORY ALLOCATION ROUTINES // -------------------------------------------------------------------------- @@ -143,6 +158,29 @@ inline void _device_free(mat_type &mat) { CU_DESTRUCT_CALL(cuMemFree(mat.cbegin())); } +template +inline int _device_resize(mat_type &mat, const size_t n) { + _device_free(mat); + CUresult err=cuMemAlloc(&mat.cbegin(),n); + if (err!=CUDA_SUCCESS) + return UCL_MEMORY_ERROR; + return UCL_SUCCESS; +} + +template +inline int _device_resize(mat_type &mat, const size_t rows, + const size_t cols, size_t &pitch) { + _device_free(mat); + CUresult err; + CUDA_INT_TYPE upitch; + err=cuMemAllocPitch(&mat.cbegin(),&upitch, + cols*sizeof(typename mat_type::data_type),rows,16); + pitch=static_cast(upitch); + if (err!=CUDA_SUCCESS) + return UCL_MEMORY_ERROR; + return UCL_SUCCESS; +} + inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) { *ptr=in; } diff --git a/lib/gpu/geryon/nvd_texture.h b/lib/gpu/geryon/nvd_texture.h index 3fbf80180b..07650263a5 100644 --- a/lib/gpu/geryon/nvd_texture.h +++ b/lib/gpu/geryon/nvd_texture.h @@ -42,27 +42,56 @@ class UCL_Texture { { CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); } /// Bind a float array where each fetch grabs a vector of length numel - template - inline void bind_float(mat_typ &vec, const unsigned numel) { - #ifdef UCL_DEBUG - assert(numel!=0 && numel<5); - #endif - CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(), - vec.numel()*vec.element_size())); - CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel)); - } + template + inline void bind_float(UCL_D_Vec &vec, const unsigned numel) + { _bind_float(vec,numel); } + + /// Bind a float array where each fetch grabs a vector of length numel + template + inline void bind_float(UCL_D_Mat &vec, const unsigned numel) + { _bind_float(vec,numel); } + + /// Bind a float array where each fetch grabs a vector of length numel + template + inline void bind_float(UCL_Vector &vec, const unsigned numel) + { _bind_float(vec.device,numel); } + + /// Bind a float array where each fetch grabs a vector of length numel + template + inline void bind_float(UCL_Matrix &vec, const unsigned numel) + { _bind_float(vec.device,numel); } /// Unbind the texture reference from the memory allocation inline void unbind() { } /// Make a texture reference available to kernel inline void allow(UCL_Kernel &kernel) { + #if CUDA_VERSION < 4000 CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex)); + #endif } private: CUtexref _tex; friend class UCL_Kernel; + + template + inline void _bind_float(mat_typ &vec, const unsigned numel) { + #ifdef UCL_DEBUG + assert(numel!=0 && numel<5); + #endif + CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(), + vec.numel()*vec.element_size())); + if (vec.element_size()==sizeof(float)) + CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel)); + else { + if (numel>2) + CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_SIGNED_INT32, numel)); + else + CU_SAFE_CALL(cuTexRefSetFormat(_tex,CU_AD_FORMAT_SIGNED_INT32,numel*2)); + } + } + }; } // namespace diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index 0fde8c2acf..391eeb9d95 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -158,6 +158,11 @@ class UCL_Device { /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) inline int device_type(const int i); + /// Returns true if host memory is efficiently addressable from device + inline bool shared_memory() { return shared_memory(_device); } + /// Returns true if host memory is efficiently addressable from device + inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } + /// Returns true if double precision is support for the current device bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device diff --git a/lib/gpu/geryon/ocl_kernel.h b/lib/gpu/geryon/ocl_kernel.h index 4d77c85021..74bcea8f5e 100644 --- a/lib/gpu/geryon/ocl_kernel.h +++ b/lib/gpu/geryon/ocl_kernel.h @@ -29,11 +29,25 @@ namespace ucl_opencl { +class UCL_Texture; +template class UCL_D_Vec; +template class UCL_D_Mat; +template class UCL_Vector; +template class UCL_Matrix; +#define UCL_MAX_KERNEL_ARGS 256 + /// Class storing 1 or more kernel functions from a single string or file class UCL_Program { public: inline UCL_Program() : _init_done(false) {} inline UCL_Program(UCL_Device &device) : _init_done(false) { init(device); } + inline UCL_Program(UCL_Device &device, const void *program, + const char *flags="", std::string *log=NULL) : + _init_done(false) { + init(device); + load_string(program,flags,log); + } + inline ~UCL_Program() { clear(); } /// Initialize the program with a device @@ -78,10 +92,10 @@ class UCL_Program { } /// Load a program from a string and compile with flags - inline int load_string(const char *program, const char *flags="", + inline int load_string(const void *program, const char *flags="", std::string *log=NULL) { cl_int error_flag; - const char *prog=program; + const char *prog=(const char *)program; _program=clCreateProgramWithSource(_context,1,&prog,NULL,&error_flag); CL_CHECK_ERR(error_flag); error_flag = clBuildProgram(_program,1,&_device,flags,NULL,NULL); @@ -159,19 +173,61 @@ class UCL_Kernel { /** If not a device pointer, this must be repeated each time the argument * changes **/ template - inline void set_arg(const cl_uint index, dtype *arg) { + inline void set_arg(const cl_uint index, const dtype * const arg) { CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg)); if (index>_num_args) _num_args=index; } + /// Set a geryon container as a kernel argument. + template + inline void set_arg(const UCL_D_Vec * const arg) + { set_arg(&arg->begin()); } + + /// Set a geryon container as a kernel argument. + template + inline void set_arg(const UCL_D_Mat * const arg) + { set_arg(&arg->begin()); } + + /// Set a geryon container as a kernel argument. + template + inline void set_arg(const UCL_Vector * const arg) + { set_arg(&arg->device.begin()); } + + /// Set a geryon container as a kernel argument. + template + inline void set_arg(const UCL_Matrix * const arg) + { set_arg(&arg->device.begin()); } + /// Add a kernel argument. template - inline void add_arg(dtype *arg) { + inline void add_arg(const dtype * const arg) { CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg)); _num_args++; } + /// Add a geryon container as a kernel argument. + template + inline void add_arg(const UCL_D_Vec * const arg) + { add_arg(&arg->begin()); } + + /// Add a geryon container as a kernel argument. + template + inline void add_arg(const UCL_D_Mat * const arg) + { add_arg(&arg->begin()); } + + /// Add a geryon container as a kernel argument. + template + inline void add_arg(const UCL_Vector * const arg) + { add_arg(&arg->device.begin()); } + + /// Add a geryon container as a kernel argument. + template + inline void add_arg(const UCL_Matrix * const arg) + { add_arg(&arg->device.begin()); } + /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue is used for the kernel execution **/ inline void set_size(const size_t num_blocks, const size_t block_size) { _dimensions=1; _num_blocks[0]=num_blocks*block_size; @@ -179,6 +235,15 @@ class UCL_Kernel { } /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue for the kernel is changed to cq **/ + inline void set_size(const size_t num_blocks, const size_t block_size, + command_queue &cq) + { _cq=cq; set_size(num_blocks,block_size); } + + /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue is used for the kernel execution **/ inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, const size_t block_size_x, const size_t block_size_y) { _dimensions=2; @@ -189,6 +254,16 @@ class UCL_Kernel { } /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue for the kernel is changed to cq **/ + inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, + const size_t block_size_x, const size_t block_size_y, + command_queue &cq) + {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);} + + /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue is used for the kernel execution **/ inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, const size_t block_size_x, const size_t block_size_y, const size_t block_size_z) { @@ -202,14 +277,20 @@ class UCL_Kernel { _block_size[2]=block_size_z; } - /// Run the kernel in the default command queue - inline void run() { - run(_cq); + /// Set the number of thread blocks and the number of threads in each block + /** \note This should be called before any arguments have been added + \note The default command queue is used for the kernel execution **/ + inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, + const size_t block_size_x, const size_t block_size_y, + const size_t block_size_z, command_queue &cq) { + _cq=cq; + set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y, + block_size_z); } - /// Run the kernel in the specified command queue - inline void run(command_queue &cq) { - CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,_kernel,_dimensions,NULL, + /// Run the kernel in the default command queue + inline void run() { + CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL, _num_blocks,_block_size,0,NULL,NULL)); } diff --git a/lib/gpu/geryon/ocl_mat.h b/lib/gpu/geryon/ocl_mat.h index 180b292d3b..2909d72a72 100644 --- a/lib/gpu/geryon/ocl_mat.h +++ b/lib/gpu/geryon/ocl_mat.h @@ -39,6 +39,9 @@ namespace ucl_opencl { #include "ucl_h_mat.h" #include "ucl_d_vec.h" #include "ucl_d_mat.h" +#include "ucl_s_obj_help.h" +#include "ucl_vector.h" +#include "ucl_matrix.h" #undef _UCL_DEVICE_PTR_MAT #undef _OCL_MAT #undef _UCL_MAT_ALLOW diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h index a049f1ea14..6051ee7b3e 100644 --- a/lib/gpu/geryon/ocl_memory.h +++ b/lib/gpu/geryon/ocl_memory.h @@ -132,6 +132,37 @@ inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) { CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq())); } +template +inline int _host_resize(mat_type &mat, const size_t n) { + cl_int error_flag; + cl_context context; + CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context), + &context,NULL)); + + CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin())); + if (mat.kind()==UCL_WRITE_OPTIMIZED) { + mat.cbegin()=clCreateBuffer(context, + CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, + n,NULL,&error_flag); + if (error_flag != CL_SUCCESS) + return UCL_MEMORY_ERROR; + *mat.host_ptr() = (typename mat_type::data_type*) + clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE, + CL_MAP_WRITE,0,n,0,NULL,NULL,NULL); + } else { + mat.cbegin()=clCreateBuffer(context, + CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, + n,NULL,&error_flag); + if (error_flag != CL_SUCCESS) + return UCL_MEMORY_ERROR; + *mat.host_ptr() = (typename mat_type::data_type*) + clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE, + CL_MAP_READ | CL_MAP_WRITE, + 0,n,0,NULL,NULL,NULL); + } + return UCL_SUCCESS; +} + // -------------------------------------------------------------------------- // - DEVICE MEMORY ALLOCATION ROUTINES // -------------------------------------------------------------------------- @@ -211,6 +242,61 @@ inline void _device_free(mat_type &mat) { CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq())); } +template +inline int _device_resize(mat_type &mat, const size_t n) { + cl_int error_flag; + + cl_context context; + CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context), + &context,NULL)); + CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin())); + + cl_mem_flags flag; + if (mat.kind()==UCL_READ_WRITE) + flag=CL_MEM_READ_WRITE; + else if (mat.kind()==UCL_READ_ONLY) + flag=CL_MEM_READ_ONLY; + else if (mat.kind()==UCL_WRITE_ONLY) + flag=CL_MEM_WRITE_ONLY; + else + assert(0==1); + mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag); + if (error_flag != CL_SUCCESS) + return UCL_MEMORY_ERROR; + return UCL_SUCCESS; +} + +template +inline int _device_resize(mat_type &mat, const size_t rows, + const size_t cols, size_t &pitch) { + size_t padded_cols=cols; + if (cols%256!=0) + padded_cols+=256-cols%256; + pitch=padded_cols*sizeof(typename mat_type::data_type); + + cl_int error_flag; + + cl_context context; + CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context), + &context,NULL)); + CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin())); + + cl_mem_flags flag; + if (mat.kind()==UCL_READ_WRITE) + flag=CL_MEM_READ_WRITE; + else if (mat.kind()==UCL_READ_ONLY) + flag=CL_MEM_READ_ONLY; + else if (mat.kind()==UCL_WRITE_ONLY) + flag=CL_MEM_WRITE_ONLY; + else + assert(0==1); + mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag); + if (error_flag != CL_SUCCESS) + return UCL_MEMORY_ERROR; + return UCL_SUCCESS; +} + + // -------------------------------------------------------------------------- // - ZERO ROUTINES // -------------------------------------------------------------------------- diff --git a/lib/gpu/geryon/ucl_arg_kludge.h b/lib/gpu/geryon/ucl_arg_kludge.h index f039a2ff42..646aa4d68f 100644 --- a/lib/gpu/geryon/ucl_arg_kludge.h +++ b/lib/gpu/geryon/ucl_arg_kludge.h @@ -828,441 +828,3 @@ add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); run(); } - -// --------------------------------------------------------------------------- - - template - inline void run_cq(command_queue &cq, t1 *a1) { - clear_args(); - add_arg(a1); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2) { - clear_args(); - add_arg(a1); add_arg(a2); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, - t16 *a16) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, - t16 *a16, t17 *a17) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, - t16 *a16, t17 *a17, t18 *a18) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, - t16 *a16, t17 *a17, t18 *a18, t19 *a19) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, - t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, - t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, - t21 *a21) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, - t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, - t21 *a21, t22 *a22) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, - t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, - t21 *a21, t22 *a22, t23 *a23) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, - t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, - t21 *a21, t22 *a22, t23 *a23, t24 *a24) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, - t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, - t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, - t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, - t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, - t26 *a26) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); - add_arg(a26); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, - t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, - t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, - t26 *a26, t27 *a27) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); - add_arg(a26); add_arg(a27); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, - t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, - t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, - t26 *a26, t27 *a27, t28 *a28) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); - add_arg(a26); add_arg(a27); add_arg(a28); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, - t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, - t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, - t26 *a26, t27 *a27, t28 *a28, t29 *a29) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); - add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); - run(cq); - } - - template - inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, - t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, - t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, - t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, - t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, - t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) { - clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); - add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); - run(cq); - } - diff --git a/lib/gpu/geryon/ucl_d_mat.h b/lib/gpu/geryon/ucl_d_mat.h index 9777de4586..b065a8b644 100644 --- a/lib/gpu/geryon/ucl_d_mat.h +++ b/lib/gpu/geryon/ucl_d_mat.h @@ -344,6 +344,39 @@ class UCL_D_Mat : public UCL_BaseMat { inline void clear() { _rows=0; if (_kind!=UCL_VIEW) { _kind=UCL_VIEW; _device_free(*this); } } + /// Resize the allocation to contain cols elements + /** \note Cannot be used on views **/ + inline int resize(const int rows, const int cols) { + assert(_kind!=UCL_VIEW); + + int err=_device_resize(*this,rows,cols,_pitch); + if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT + std::cerr << "UCL Error: Could not allocate " + << rows*cols*sizeof(numtyp) << " bytes on device.\n"; + UCL_GERYON_EXIT; + #endif + return err; + } + + _rows=rows; + _cols=cols; + _row_size=_pitch/sizeof(numtyp); + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_row_size*cols; + #endif + #ifdef _OCL_MAT + _offset=0; + #endif + return err; + } + + /// Resize (only if bigger) the allocation to contain rows x cols elements + /** \note Cannot be used on views **/ + inline int resize_ib(const int rows, const int cols) + { if (cols>_cols || rows>_rows) return resize(rows,cols); + else return UCL_SUCCESS; } + /// Set each element to zero inline void zero() { _device_zero(*this,row_bytes()*_rows); } @@ -357,9 +390,9 @@ class UCL_D_Mat : public UCL_BaseMat { inline const device_ptr & begin() const { return _array; } #else /// For CUDA-RT, get device pointer to first element - inline numtyp * begin() { return _array; } + inline numtyp * & begin() { return _array; } /// For CUDA-RT, get device pointer to first element - inline const numtyp * begin() const { return _array; } + inline numtyp * const & begin() const { return _array; } /// For CUDA-RT, get device pointer to one past last element inline numtyp * end() { return _end; } /// For CUDA-RT, get device pointer to one past last element diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index 83063ba070..11107437ea 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -340,6 +340,39 @@ class UCL_D_Vec : public UCL_BaseMat { inline void clear() { if (_kind!=UCL_VIEW) { _cols=0; _kind=UCL_VIEW; _device_free(*this); } } + /// Resize the allocation to contain cols elements + /** \note Cannot be used on views **/ + inline int resize(const int cols) { + assert(_kind!=UCL_VIEW); + + _row_bytes=cols*sizeof(numtyp); + int err=_device_resize(*this,_row_bytes); + if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT + std::cerr << "UCL Error: Could not allocate " << _row_bytes + << " bytes on device.\n"; + _row_bytes=0; + UCL_GERYON_EXIT; + #endif + _row_bytes=0; + return err; + } + + _cols=cols; + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+cols; + #endif + #ifdef _OCL_MAT + _offset=0; + #endif + return err; + } + + /// Resize (only if bigger) the allocation to contain cols elements + /** \note Cannot be used on views **/ + inline int resize_ib(const int cols) + { if (cols>_cols) return resize(cols); else return UCL_SUCCESS; } + /// Set each element to zero inline void zero() { _device_zero(*this,row_bytes()); } @@ -353,13 +386,13 @@ class UCL_D_Vec : public UCL_BaseMat { inline const device_ptr & begin() const { return _array; } #else /// For CUDA-RT, get device pointer to first element - inline numtyp * begin() { return _array; } + inline numtyp * & begin() { return _array; } /// For CUDA-RT, get device pointer to first element - inline const numtyp * begin() const { return _array; } + inline numtyp * const & begin() const { return _array; } /// For CUDA-RT, get device pointer to one past last element inline numtyp * end() { return _end; } /// For CUDA-RT, get device pointer to one past last element - inline const numtyp * end() const { return _end; } + inline numtyp * end() const { return _end; } #endif #ifdef _UCL_DEVICE_PTR_MAT diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h index 5c13a003aa..806b930630 100644 --- a/lib/gpu/geryon/ucl_h_mat.h +++ b/lib/gpu/geryon/ucl_h_mat.h @@ -318,6 +318,36 @@ class UCL_H_Mat : public UCL_BaseMat { inline void clear() { if (_kind!=UCL_VIEW) {_rows=0; _kind=UCL_VIEW; _host_free(*this,_kind); }} + /// Resize the allocation to rows x cols elements + /** \note Cannot be used on views **/ + inline int resize(const int rows, const int cols) { + assert(_kind!=UCL_VIEW); + + _row_bytes=cols*sizeof(numtyp); + int err=_host_resize(*this,_row_bytes*rows); + if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT + std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows + << " bytes on host.\n"; + _row_bytes=0; + UCL_GERYON_EXIT; + #endif + _row_bytes=0; + return err; + } + + _cols=cols; + _rows=rows; + _end=_array+rows*cols; + return err; + } + + /// Resize (only if bigger) the allocation to contain rows x cols elements + /** \note Cannot be used on views **/ + inline int resize_ib(const int rows, const int cols) + { if (cols>_cols || rows>_rows) return resize(rows,cols); + else return UCL_SUCCESS; } + /// Set each element to zero inline void zero() { _host_zero(_array,_rows*row_bytes()); } /// Set first n elements to zero diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index 2de68b487c..3a53113153 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -316,6 +316,34 @@ class UCL_H_Vec : public UCL_BaseMat { inline void clear() { if (_kind!=UCL_VIEW) {_kind=UCL_VIEW; _cols=0; _host_free(*this,_kind);}} + /// Resize the allocation to contain cols elements + /** \note Cannot be used on views **/ + inline int resize(const int cols) { + assert(_kind!=UCL_VIEW); + _row_bytes=cols*sizeof(numtyp); + int err=_host_resize(*this,_row_bytes); + + if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT + std::cerr << "UCL Error: Could not allocate " << _row_bytes + << " bytes on host.\n"; + _row_bytes=0; + UCL_GERYON_EXIT; + #endif + _row_bytes=0; + return err; + } + + _cols=cols; + _end=_array+cols; + return err; + } + + /// Resize (only if bigger) the allocation to contain cols elements + /** \note Cannot be used on views **/ + inline int resize_ib(const int cols) + { if (cols>_cols) return resize(cols); else return UCL_SUCCESS; } + /// Set each element to zero inline void zero() { _host_zero(_array,row_bytes()); } diff --git a/lib/gpu/geryon/ucl_print.h b/lib/gpu/geryon/ucl_print.h index a8ab19a05d..87b3d3d7ff 100644 --- a/lib/gpu/geryon/ucl_print.h +++ b/lib/gpu/geryon/ucl_print.h @@ -270,4 +270,13 @@ template inline std::ostream & operator << (std::ostream &out, UCL_D_Mat &mat) { ucl_print(mat,out); return out; } + +template +inline std::ostream & operator << (std::ostream &out, UCL_Vector &mat) + { ucl_print(mat.host,out); return out; } + +template +inline std::ostream & operator << (std::ostream &out, UCL_Matrix &mat) + { ucl_print(mat.host,out); return out; } + #endif diff --git a/lib/gpu/geryon/ucl_types.h b/lib/gpu/geryon/ucl_types.h index 9dabf16687..615bffea95 100644 --- a/lib/gpu/geryon/ucl_types.h +++ b/lib/gpu/geryon/ucl_types.h @@ -117,5 +117,61 @@ enum UCL_ERROR_FLAG { template const char * ucl_template_name() { return _UCL_DATA_ID::name(); } +template struct ucl_same_type; + +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; + +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; + +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; + +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; +template <> struct ucl_same_type { enum { ans=1 }; }; + +template struct ucl_same_type { enum { ans=0 }; }; + #endif diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp index 4ead777609..6f42790ca3 100644 --- a/lib/gpu/lal_answer.cpp +++ b/lib/gpu/lal_answer.cpp @@ -39,30 +39,16 @@ bool AnswerT::alloc(const int inum) { bool success=true; - int ans_elements=4; + _ans_fields=4; if (_rot) - ans_elements+=4; + _ans_fields+=4; - // Ignore host/device transfers? - bool cpuview=false; - if (dev->device_type()==UCL_CPU) - cpuview=true; - - // -------------------------- Host allocations - success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS); - success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS); - // --------------------------- Device allocations - if (cpuview) { - dev_engv.view(host_engv); - dev_ans.view(host_ans); - } else { - success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev, - UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && (dev_ans.alloc(ans_elements*_max_local, - *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); - } - _gpu_bytes=dev_engv.row_bytes()+dev_ans.row_bytes(); + success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_RW_OPTIMIZED, + UCL_WRITE_ONLY)==UCL_SUCCESS); + success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_RW_OPTIMIZED, + UCL_WRITE_ONLY)==UCL_SUCCESS); + _gpu_bytes=engv.device.row_bytes()+force.device.row_bytes(); _allocated=true; return success; @@ -114,32 +100,24 @@ bool AnswerT::add_fields(const bool charge, const bool rot) { if (realloc) { _other=_charge || _rot; int inum=_max_local; - clear_resize(); + force.clear(); + engv.clear(); + _allocated=false; return alloc(inum); } return true; } -template -void AnswerT::clear_resize() { - if (!_allocated) - return; - _allocated=false; - - dev_ans.clear(); - dev_engv.clear(); - host_ans.clear(); - host_engv.clear(); -} - template void AnswerT::clear() { _gpu_bytes=0; if (!_allocated) return; + _allocated=false; + force.clear(); + engv.clear(); time_answer.clear(); - clear_resize(); _inum=0; _ilist=NULL; _eflag=false; @@ -174,11 +152,11 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag, csize-=6; if (csize>0) - ucl_copy(host_engv,dev_engv,_inum*csize,true); + engv.update_host(_inum*csize,true); if (_rot) - ucl_copy(host_ans,dev_ans,_inum*4*2,true); + force.update_host(_inum*4*2,true); else - ucl_copy(host_ans,dev_ans,_inum*4,true); + force.update_host(_inum*4,true); time_answer.stop(); } @@ -201,28 +179,28 @@ double AnswerT::energy_virial(double *eatom, double **vatom, for (int i=0; i<6; i++) virial_acc[i]=0.0; if (_ilist==NULL) { for (int i=0; i<_inum; i++) { - acctyp *ap=host_engv.begin()+i; + int al=i; if (_eflag) { if (_ef_atom) { - evdwl+=*ap; - eatom[i]+=*ap*0.5; - ap+=_inum; + evdwl+=engv[al]; + eatom[i]+=engv[al]*0.5; + al+=_inum; } else { - evdwl+=*ap; - ap+=_inum; + evdwl+=engv[al]; + al+=_inum; } } if (_vflag) { if (_vf_atom) { for (int j=0; j<6; j++) { - vatom[i][j]+=*ap*0.5; - virial_acc[j]+=*ap; - ap+=_inum; + vatom[i][j]+=engv[al]*0.5; + virial_acc[j]+=engv[al]; + al+=_inum; } } else { for (int j=0; j<6; j++) { - virial_acc[j]+=*ap; - ap+=_inum; + virial_acc[j]+=engv[al]; + al+=_inum; } } } @@ -231,29 +209,29 @@ double AnswerT::energy_virial(double *eatom, double **vatom, virial[j]+=virial_acc[j]*0.5; } else { for (int i=0; i<_inum; i++) { - acctyp *ap=host_engv.begin()+i; + int al=i; int ii=_ilist[i]; if (_eflag) { if (_ef_atom) { - evdwl+=*ap; - eatom[ii]+=*ap*0.5; - ap+=_inum; + evdwl+=engv[al]; + eatom[ii]+=engv[al]*0.5; + al+=_inum; } else { - evdwl+=*ap; - ap+=_inum; + evdwl+=engv[al]; + al+=_inum; } } if (_vflag) { if (_vf_atom) { for (int j=0; j<6; j++) { - vatom[ii][j]+=*ap*0.5; - virial_acc[j]+=*ap; - ap+=_inum; + vatom[ii][j]+=engv[al]*0.5; + virial_acc[j]+=engv[al]; + al+=_inum; } } else { for (int j=0; j<6; j++) { - virial_acc[j]+=*ap; - ap+=_inum; + virial_acc[j]+=engv[al]; + al+=_inum; } } } @@ -281,33 +259,33 @@ double AnswerT::energy_virial(double *eatom, double **vatom, for (int i=0; i<6; i++) virial_acc[i]=0.0; if (_ilist==NULL) { for (int i=0; i<_inum; i++) { - acctyp *ap=host_engv.begin()+i; + int al=i; if (_eflag) { if (_ef_atom) { - evdwl+=*ap; - eatom[i]+=*ap*0.5; - ap+=_inum; - _ecoul+=*ap; - eatom[i]+=*ap*0.5; - ap+=_inum; + evdwl+=engv[al]; + eatom[i]+=engv[al]*0.5; + al+=_inum; + _ecoul+=engv[al]; + eatom[i]+=engv[al]*0.5; + al+=_inum; } else { - evdwl+=*ap; - ap+=_inum; - _ecoul+=*ap; - ap+=_inum; + evdwl+=engv[al]; + al+=_inum; + _ecoul+=engv[al]; + al+=_inum; } } if (_vflag) { if (_vf_atom) { for (int j=0; j<6; j++) { - vatom[i][j]+=*ap*0.5; - virial_acc[j]+=*ap; - ap+=_inum; + vatom[i][j]+=engv[al]*0.5; + virial_acc[j]+=engv[al]; + al+=_inum; } } else { for (int j=0; j<6; j++) { - virial_acc[j]+=*ap; - ap+=_inum; + virial_acc[j]+=engv[al]; + al+=_inum; } } } @@ -316,34 +294,34 @@ double AnswerT::energy_virial(double *eatom, double **vatom, virial[j]+=virial_acc[j]*0.5; } else { for (int i=0; i<_inum; i++) { - acctyp *ap=host_engv.begin()+i; + int al=i; int ii=_ilist[i]; if (_eflag) { if (_ef_atom) { - evdwl+=*ap; - eatom[ii]+=*ap*0.5; - ap+=_inum; - _ecoul+=*ap; - eatom[ii]+=*ap*0.5; - ap+=_inum; + evdwl+=engv[al]; + eatom[ii]+=engv[al]*0.5; + al+=_inum; + _ecoul+=engv[al]; + eatom[ii]+=engv[al]*0.5; + al+=_inum; } else { - evdwl+=*ap; - ap+=_inum; - _ecoul+=*ap; - ap+=_inum; + evdwl+=engv[al]; + al+=_inum; + _ecoul+=engv[al]; + al+=_inum; } } if (_vflag) { if (_vf_atom) { for (int j=0; j<6; j++) { - vatom[ii][j]+=*ap*0.5; - virial_acc[j]+=*ap; - ap+=_inum; + vatom[ii][j]+=engv[al]*0.5; + virial_acc[j]+=engv[al]; + al+=_inum; } } else { for (int j=0; j<6; j++) { - virial_acc[j]+=*ap; - ap+=_inum; + virial_acc[j]+=engv[al]; + al+=_inum; } } } @@ -359,45 +337,37 @@ double AnswerT::energy_virial(double *eatom, double **vatom, template void AnswerT::get_answers(double **f, double **tor) { - acctyp *ap=host_ans.begin(); + int fl=0; if (_ilist==NULL) { for (int i=0; i<_inum; i++) { - f[i][0]+=*ap; - ap++; - f[i][1]+=*ap; - ap++; - f[i][2]+=*ap; - ap+=2; + f[i][0]+=force[fl]; + f[i][1]+=force[fl+1]; + f[i][2]+=force[fl+2]; + fl+=4; } if (_rot) { for (int i=0; i<_inum; i++) { - tor[i][0]+=*ap; - ap++; - tor[i][1]+=*ap; - ap++; - tor[i][2]+=*ap; - ap+=2; + tor[i][0]+=force[fl]; + tor[i][1]+=force[fl+1]; + tor[i][2]+=force[fl+2]; + fl+=4; } } } else { for (int i=0; i<_inum; i++) { int ii=_ilist[i]; - f[ii][0]+=*ap; - ap++; - f[ii][1]+=*ap; - ap++; - f[ii][2]+=*ap; - ap+=2; + f[ii][0]+=force[fl]; + f[ii][1]+=force[fl+1]; + f[ii][2]+=force[fl+2]; + fl+=4; } if (_rot) { for (int i=0; i<_inum; i++) { int ii=_ilist[i]; - tor[ii][0]+=*ap; - ap++; - tor[ii][1]+=*ap; - ap++; - tor[ii][2]+=*ap; - ap+=2; + tor[ii][0]+=force[fl]; + tor[ii][1]+=force[fl+1]; + tor[ii][2]+=force[fl+2]; + fl+=4; } } } diff --git a/lib/gpu/lal_answer.h b/lib/gpu/lal_answer.h index 721e16cdd5..c642781c07 100644 --- a/lib/gpu/lal_answer.h +++ b/lib/gpu/lal_answer.h @@ -19,18 +19,18 @@ #include #include "mpi.h" -#ifdef USE_OPENCL - +#if defined(USE_OPENCL) #include "geryon/ocl_timer.h" #include "geryon/ocl_mat.h" using namespace ucl_opencl; - +#elif defined(USE_CUDART) +#include "geryon/nvc_timer.h" +#include "geryon/nvc_mat.h" +using namespace ucl_cudart; #else - #include "geryon/nvd_timer.h" #include "geryon/nvd_mat.h" using namespace ucl_cudadr; - #endif #include "lal_precision.h" @@ -59,8 +59,10 @@ class Answer { inline void resize(const int inum, bool &success) { _inum=inum; if (inum>_max_local) { - clear_resize(); - success = success && alloc(inum); + _max_local=static_cast(static_cast(inum)*1.10); + success=success && (force.resize(_max_local*_ans_fields)==UCL_SUCCESS); + success=success && (engv.resize(_max_local*_ev_fields)==UCL_SUCCESS); + _gpu_bytes=engv.device.row_bytes()+force.device.row_bytes(); } } @@ -68,9 +70,6 @@ class Answer { /** \param rot True if atom storage needs quaternions **/ bool add_fields(const bool charge, const bool rot); - /// Free all memory on host and device needed to realloc for more atoms - void clear_resize(); - /// Free all memory on host and device void clear(); @@ -136,14 +135,9 @@ class Answer { // ------------------------------ DATA ---------------------------------- /// Force and possibly torque - UCL_D_Vec dev_ans; + UCL_Vector force; /// Energy and virial per-atom storage - UCL_D_Vec dev_engv; - - /// Force and possibly torque data on host - UCL_H_Vec host_ans; - /// Energy/virial data on host - UCL_H_Vec host_engv; + UCL_Vector engv; /// Device timers UCL_Timer time_answer; @@ -155,7 +149,7 @@ class Answer { bool alloc(const int inum); bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other; - int _max_local, _inum, _e_fields, _ev_fields; + int _max_local, _inum, _e_fields, _ev_fields, _ans_fields; int *_ilist; double _time_cast, _time_cpu_idle; diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index 357316c5a3..5cf46c8751 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -51,10 +51,14 @@ bool AtomT::alloc(const int nall) { bool success=true; // Ignore host/device transfers? - bool cpuview=false; - if (dev->device_type()==UCL_CPU) - cpuview=true; - + _host_view=false; + if (dev->shared_memory()) { + _host_view=true; + #ifdef GPU_CAST + assert(0==1); + #endif + } + // Allocate storage for CUDPP sort #ifdef USE_CUDPP if (_gpu_nbor==1) { @@ -64,63 +68,101 @@ bool AtomT::alloc(const int nall) { } #endif - // -------------------------- Host allocations - // Get a host write only buffer - #ifdef GPU_CAST - success=success && (host_x_cast.alloc(_max_atoms*3,*dev, - UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); - success=success && (host_type_cast.alloc(_max_atoms,*dev, - UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); - #else - success=success && (host_x.alloc(_max_atoms*4,*dev, - UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); - #endif - // Buffer for casting only if different precisions - if (_charge) - success=success && (host_q.alloc(_max_atoms,*dev, - UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); - // Buffer for casting only if different precisions - if (_rot) - success=success && (host_quat.alloc(_max_atoms*4,*dev, - UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); - - // --------------------------- Device allocations int gpu_bytes=0; - if (cpuview) { - #ifdef GPU_CAST - assert(0==1); - #else - dev_x.view(host_x); - #endif - if (_rot) - dev_quat.view(host_quat); - if (_charge) - dev_q.view(host_q); - } else { - #ifdef GPU_CAST - success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev)); - success=success && (UCL_SUCCESS== - dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)); - success=success && (UCL_SUCCESS== - dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)); - gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes(); - #else - success=success && (UCL_SUCCESS== - dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY)); - #endif - if (_charge) { - success=success && (dev_q.alloc(_max_atoms,*dev, - UCL_READ_ONLY)==UCL_SUCCESS); - gpu_bytes+=dev_q.row_bytes(); + success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED, + UCL_READ_ONLY)==UCL_SUCCESS); + #ifdef GPU_CAST + success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)== + UCL_SUCCESS); + success=success && (type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)== + UCL_SUCCESS); + gpu_bytes+=x_cast.device.row_bytes()+type_cast.device.row_bytes(); + #endif + + if (_charge && _host_view==false) { + success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=q.device.row_bytes(); + } + if (_rot && _host_view==false) { + success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=quat.device.row_bytes(); + } + + if (_gpu_nbor>0) { + if (_bonds) { + success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS); + gpu_bytes+=dev_tag.row_bytes(); } - if (_rot) { - success=success && (dev_quat.alloc(_max_atoms*4,*dev, - UCL_READ_ONLY)==UCL_SUCCESS); - gpu_bytes+=dev_quat.row_bytes(); + if (_gpu_nbor==1) { + success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); + gpu_bytes+=dev_cell_id.row_bytes(); + } else { + success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && + (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); + } + if (_gpu_nbor==2 && _host_view) + dev_particle_id.view(host_particle_id); + else + success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); + gpu_bytes+=dev_particle_id.row_bytes(); + } + + gpu_bytes+=x.device.row_bytes(); + if (gpu_bytes>_max_gpu_bytes) + _max_gpu_bytes=gpu_bytes; + + _allocated=true; + return success; +} + +template +bool AtomT::add_fields(const bool charge, const bool rot, + const int gpu_nbor, const bool bonds) { + bool success=true; + // Ignore host/device transfers? + int gpu_bytes=0; + + if (charge && _charge==false) { + _charge=true; + _other=true; + if (_host_view==false) { + success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=q.device.row_bytes(); } } - if (_gpu_nbor>0) { + + if (rot && _rot==false) { + _rot=true; + _other=true; + if (_host_view==false) { + success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=quat.device.row_bytes(); + } + } + + if (bonds && _bonds==false) { + _bonds=true; + if (_bonds && _gpu_nbor>0) { + success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS); + gpu_bytes+=dev_tag.row_bytes(); + } + } + + if (gpu_nbor>0 && _gpu_nbor==0) { + _gpu_nbor=gpu_nbor; + #ifdef USE_CUDPP + if (_gpu_nbor==1) { + CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); + if (CUDPP_SUCCESS != result) + return false; + } + #endif success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); gpu_bytes+=dev_particle_id.row_bytes(); if (_bonds) { @@ -137,43 +179,9 @@ bool AtomT::alloc(const int nall) { } } - gpu_bytes+=dev_x.row_bytes(); - if (gpu_bytes>_max_gpu_bytes) - _max_gpu_bytes=gpu_bytes; - - _allocated=true; return success; } -template -bool AtomT::add_fields(const bool charge, const bool rot, - const int gpu_nbor, const bool bonds) { - bool realloc=false; - if (charge && _charge==false) { - _charge=true; - realloc=true; - } - if (rot && _rot==false) { - _rot=true; - realloc=true; - } - if (gpu_nbor>0 && _gpu_nbor==0) { - _gpu_nbor=gpu_nbor; - realloc=true; - } - if (bonds && _bonds==false) { - _bonds=true; - realloc=true; - } - if (realloc) { - _other=_charge || _rot; - int max_atoms=_max_atoms; - clear_resize(); - return alloc(max_atoms); - } - return true; -} - template bool AtomT::init(const int nall, const bool charge, const bool rot, UCL_Device &devi, const int gpu_nbor, const bool bonds) { @@ -219,27 +227,18 @@ void AtomT::clear_resize() { return; _allocated=false; - dev_x.clear(); - if (_charge) { - dev_q.clear(); - host_q.clear(); - } - if (_rot) { - dev_quat.clear(); - host_quat.clear(); - } - #ifndef GPU_CAST - host_x.clear(); - #else - host_x_cast.clear(); - host_type_cast.clear(); - #endif + x.clear(); + if (_charge) + q.clear(); + if (_rot) + quat.clear(); + dev_cell_id.clear(); dev_particle_id.clear(); dev_tag.clear(); #ifdef GPU_CAST - dev_x_cast.clear(); - dev_type_cast.clear(); + x_cast.clear(); + type_cast.clear(); #endif #ifdef USE_CUDPP @@ -279,8 +278,7 @@ double AtomT::host_memory_usage() const { atom_bytes+=1; if (_rot) atom_bytes+=4; - return _max_atoms*atom_bytes*sizeof(numtyp)+ - sizeof(Atom); + return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom); } // Sort arrays for neighbor list calculation @@ -292,16 +290,18 @@ void AtomT::sort_neighbor(const int num_atoms) { 8*sizeof(unsigned), num_atoms); if (CUDPP_SUCCESS != result) { printf("Error in cudppSort\n"); - NVD_GERYON_EXIT; + UCL_GERYON_EXIT; } #endif } #ifdef GPU_CAST -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "atom_cl.h" +#elif defined(USE_CUDART) +const char *atom=0; #else -#include "atom_ptx.h" +#include "atom_cubin.h" #endif template @@ -316,3 +316,4 @@ void AtomT::compile_kernels(UCL_Device &dev) { #endif template class Atom; + diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 642fce07ad..171141f7ea 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -19,20 +19,21 @@ #include #include "mpi.h" -#ifdef USE_OPENCL - +#if defined(USE_OPENCL) #include "geryon/ocl_timer.h" #include "geryon/ocl_mat.h" #include "geryon/ocl_kernel.h" using namespace ucl_opencl; - +#elif defined(USE_CUDART) +#include "geryon/nvc_timer.h" +#include "geryon/nvc_mat.h" +#include "geryon/nvc_kernel.h" +using namespace ucl_cudart; #else - #include "geryon/nvd_timer.h" #include "geryon/nvd_mat.h" #include "geryon/nvd_kernel.h" using namespace ucl_cudadr; - #endif #ifdef USE_CUDPP @@ -92,7 +93,7 @@ class Atom { bool charge() { return _charge; } /// Returns true if GPU is using quaternions - bool quat() { return _rot; } + bool quaternion() { return _rot; } /// Only free matrices of length inum or nall for resizing void clear_resize(); @@ -148,9 +149,9 @@ class Atom { /// Pack LAMMPS atom type constants into matrix and copy to device template - inline void type_pack1(const int n, const int m_size, - UCL_D_Vec &dev_v, UCL_H_Vec &buffer, - t1 **one) { + inline void type_pack1(const int n, const int m_size, + UCL_D_Vec &dev_v, UCL_H_Vec &buffer, + t1 **one) { int ii=0; for (int i=0; i inline void type_pack2(const int n, const int m_size, - UCL_D_Vec &dev_v, UCL_H_Vec &buffer, - t1 **one, t2 **two) { + UCL_D_Vec &dev_v, UCL_H_Vec &buffer, + t1 **one, t2 **two) { int ii=0; for (int i=0; i inline void type_pack4(const int n, const int m_size, - UCL_D_Vec &dev_v, UCL_H_Vec &buffer, - t1 **one, t2 **two, t3 **three) { + UCL_D_Vec &dev_v, UCL_H_Vec &buffer, + t1 **one, t2 **two, t3 **three) { int ii=0; for (int i=0; i inline void type_pack4(const int n, const int m_size, - UCL_D_Vec &dev_v, UCL_H_Vec &buffer, - t1 **one, t2 **two, t3 **three, t4 **four) { + UCL_D_Vec &dev_v, UCL_H_Vec &buffer, + t1 **one, t2 **two, t3 **three, t4 **four) { int ii=0; for (int i=0; i(ceil(static_cast(_nall)/block_size)); k_cast_x.set_size(GX,block_size); - k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), - &_nall); + k_cast_x.run(&x, &x_cast, &type_cast, &_nall); #else - ucl_copy(dev_x,host_x,_nall*4,true); + x.update_device(_nall*4,true); #endif _x_avail=true; } @@ -299,18 +296,14 @@ class Atom { inline void cast_q_data(cpytyp *host_ptr) { if (_q_avail==false) { double t=MPI_Wtime(); - if (dev->device_type()==UCL_CPU) { - if (sizeof(numtyp)==sizeof(double)) { - host_q.view((numtyp*)host_ptr,_nall,*dev); - dev_q.view(host_q); - } else - for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; - } else { - if (sizeof(numtyp)==sizeof(double)) - memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp)); - else - for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; - } + // If double precision, still memcpy for async transfers + if (_host_view) { + q.host.view((numtyp*)host_ptr,_nall,*dev); + q.device.view(q.host); + } else if (sizeof(numtyp)==sizeof(double)) + memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp)); + else + for (int i=0; i<_nall; i++) q[i]=host_ptr[i]; _time_cast+=MPI_Wtime()-t; } } @@ -318,7 +311,7 @@ class Atom { // Copy charges to device asynchronously inline void add_q_data() { if (_q_avail==false) { - ucl_copy(dev_q,host_q,_nall,true); + q.update_device(_nall,true); _q_avail=true; } } @@ -328,18 +321,13 @@ class Atom { inline void cast_quat_data(cpytyp *host_ptr) { if (_quat_avail==false) { double t=MPI_Wtime(); - if (dev->device_type()==UCL_CPU) { - if (sizeof(numtyp)==sizeof(double)) { - host_quat.view((numtyp*)host_ptr,_nall*4,*dev); - dev_quat.view(host_quat); - } else - for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; - } else { - if (sizeof(numtyp)==sizeof(double)) - memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp)); - else - for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; - } + if (_host_view) { + quat.host.view((numtyp*)host_ptr,_nall*4,*dev); + quat.device.view(quat.host); + } else if (sizeof(numtyp)==sizeof(double)) + memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp)); + else + for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i]; _time_cast+=MPI_Wtime()-t; } } @@ -348,7 +336,7 @@ class Atom { /** Copies nall()*4 elements **/ inline void add_quat_data() { if (_quat_avail==false) { - ucl_copy(dev_quat,host_quat,_nall*4,true); + quat.update_device(_nall*4,true); _quat_avail=true; } } @@ -363,29 +351,23 @@ class Atom { inline double max_gpu_bytes() { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } + /// Returns true if the device is addressing memory on the host + inline bool host_view() { return _host_view; } + // ------------------------------ DATA ---------------------------------- /// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type - UCL_D_Vec dev_x; + UCL_Vector x; /// Charges - UCL_D_Vec dev_q; + UCL_Vector q; /// Quaterions - UCL_D_Vec dev_quat; + UCL_Vector quat; #ifdef GPU_CAST - UCL_D_Vec dev_x_cast; - UCL_D_Vec dev_type_cast; - UCL_H_Vec host_x_cast; - UCL_H_Vec host_type_cast; + UCL_Vector x_cast; + UCL_Vector type_cast; #endif - /// Buffer for moving positions to device - UCL_H_Vec host_x; - /// Buffer for moving charge data to GPU - UCL_H_Vec host_q; - /// Buffer for moving quat data to GPU - UCL_H_Vec host_quat; - /// Cell list identifiers for device nbor builds UCL_D_Vec dev_cell_id; /// Cell list identifiers for device nbor builds @@ -418,9 +400,9 @@ class Atom { bool alloc(const int nall); - bool _allocated, _rot, _charge, _other; + bool _allocated, _rot, _charge, _bonds, _other; int _max_atoms, _nall, _gpu_nbor; - bool _bonds; + bool _host_view; double _time_cast, _time_transfer; double _max_gpu_bytes; @@ -434,3 +416,4 @@ class Atom { } #endif + diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp index 553352e84b..f88c4417af 100644 --- a/lib/gpu/lal_base_atomic.cpp +++ b/lib/gpu/lal_base_atomic.cpp @@ -40,10 +40,10 @@ int BaseAtomicT::bytes_per_atom_atomic(const int max_nbors) const { template int BaseAtomicT::init_atomic(const int nlocal, const int nall, - const int max_nbors, const int maxspecial, - const double cell_size, - const double gpu_split, FILE *_screen, - const char *pair_program) { + const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, + FILE *_screen, const void *pair_program, + const char *k_name) { screen=_screen; int gpu_nbor=0; @@ -74,7 +74,7 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, atom=&device->atom; _block_size=device->pair_block_size(); - compile_kernels(*ucl_device,pair_program); + compile_kernels(*ucl_device,pair_program,k_name); // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -83,7 +83,7 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, time_pair.init(*ucl_device); time_pair.zero(); - pos_tex.bind_float(atom->dev_x,4); + pos_tex.bind_float(atom->x,4); _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); @@ -266,18 +266,20 @@ double BaseAtomicT::host_memory_usage_atomic() const { } template -void BaseAtomicT::compile_kernels(UCL_Device &dev, const char *pair_str) { +void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str, + const char *kname) { if (_compiled) return; + std::string s_fast=std::string(kname)+"_fast"; std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ std::string(OCL_PRECISION_COMPILE)+" -D"+ std::string(OCL_VENDOR); pair_program=new UCL_Program(dev); pair_program->load_string(pair_str,flags.c_str()); - k_pair_fast.set_function(*pair_program,"kernel_pair_fast"); - k_pair.set_function(*pair_program,"kernel_pair"); + k_pair_fast.set_function(*pair_program,s_fast.c_str()); + k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); _compiled=true; diff --git a/lib/gpu/lal_base_atomic.h b/lib/gpu/lal_base_atomic.h index 7e9a911385..74c8530f7f 100644 --- a/lib/gpu/lal_base_atomic.h +++ b/lib/gpu/lal_base_atomic.h @@ -20,8 +20,10 @@ #include "lal_balance.h" #include "mpi.h" -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "geryon/ocl_texture.h" +#elif defined(USE_CUDART) +#include "geryon/nvc_texture.h" #else #include "geryon/nvd_texture.h" #endif @@ -38,6 +40,7 @@ class BaseAtomic { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device + * \param k_name name for the kernel for force calculation * * Returns: * - 0 if successfull @@ -48,7 +51,7 @@ class BaseAtomic { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, - const char *pair_program); + const void *pair_program, const char *k_name); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(); @@ -57,7 +60,7 @@ class BaseAtomic { /** \param success set to false if insufficient memory **/ inline void resize_atom(const int inum, const int nall, bool &success) { if (atom->resize(nall, success)) - pos_tex.bind_float(atom->dev_x,4); + pos_tex.bind_float(atom->x,4); ans->resize(inum,success); } @@ -188,7 +191,7 @@ class BaseAtomic { double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - void compile_kernels(UCL_Device &dev, const char *pair_string); + void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); virtual void loop(const bool _eflag, const bool _vflag) = 0; }; diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index 3ac63666b0..f9bb2a52f3 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -41,10 +41,10 @@ int BaseChargeT::bytes_per_atom_atomic(const int max_nbors) const { template int BaseChargeT::init_atomic(const int nlocal, const int nall, - const int max_nbors, const int maxspecial, - const double cell_size, - const double gpu_split, FILE *_screen, - const char *pair_program) { + const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, + FILE *_screen, const void *pair_program, + const char *k_name) { screen=_screen; int gpu_nbor=0; @@ -76,7 +76,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program); + compile_kernels(*ucl_device,pair_program,k_name); // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -85,8 +85,8 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, time_pair.init(*ucl_device); time_pair.zero(); - pos_tex.bind_float(atom->dev_x,4); - q_tex.bind_float(atom->dev_q,1); + pos_tex.bind_float(atom->x,4); + q_tex.bind_float(atom->q,1); _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); @@ -282,18 +282,20 @@ double BaseChargeT::host_memory_usage_atomic() const { } template -void BaseChargeT::compile_kernels(UCL_Device &dev, const char *pair_str) { +void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str, + const char *kname) { if (_compiled) return; + std::string s_fast=std::string(kname)+"_fast"; std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ std::string(OCL_PRECISION_COMPILE)+" -D"+ std::string(OCL_VENDOR); pair_program=new UCL_Program(dev); pair_program->load_string(pair_str,flags.c_str()); - k_pair_fast.set_function(*pair_program,"kernel_pair_fast"); - k_pair.set_function(*pair_program,"kernel_pair"); + k_pair_fast.set_function(*pair_program,s_fast.c_str()); + k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); diff --git a/lib/gpu/lal_base_charge.h b/lib/gpu/lal_base_charge.h index a0a42be671..3ca4705177 100644 --- a/lib/gpu/lal_base_charge.h +++ b/lib/gpu/lal_base_charge.h @@ -21,8 +21,10 @@ #include "lal_balance.h" #include "mpi.h" -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "geryon/ocl_texture.h" +#elif defined(USE_CUDART) +#include "geryon/nvc_texture.h" #else #include "geryon/nvd_texture.h" #endif @@ -39,6 +41,7 @@ class BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device + * \param k_name name for the kernel for force calculation * * Returns: * - 0 if successfull @@ -49,7 +52,7 @@ class BaseCharge { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, - const char *pair_program); + const void *pair_program, const char *k_name); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(); @@ -58,8 +61,8 @@ class BaseCharge { /** \param success set to false if insufficient memory **/ inline void resize_atom(const int inum, const int nall, bool &success) { if (atom->resize(nall, success)) { - pos_tex.bind_float(atom->dev_x,4); - q_tex.bind_float(atom->dev_q,1); + pos_tex.bind_float(atom->x,4); + q_tex.bind_float(atom->q,1); } ans->resize(inum,success); } @@ -187,7 +190,7 @@ class BaseCharge { double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - void compile_kernels(UCL_Device &dev, const char *pair_string); + void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); virtual void loop(const bool _eflag, const bool _vflag) = 0; }; diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index ee43cd8f75..7e86d03e50 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -17,10 +17,12 @@ #include using namespace LAMMPS_AL; -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "ellipsoid_nbor_cl.h" +#elif defined(USE_CUDART) +const char *ellipsoid_nbor=0; #else -#include "ellipsoid_nbor_ptx.h" +#include "ellipsoid_nbor_cubin.h" #endif #define BaseEllipsoidT BaseEllipsoid @@ -50,8 +52,9 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const int ntypes, int **h_form, - const char *ellipsoid_program, - const char *lj_program, const bool ellip_sphere) { + const void *ellipsoid_program, + const void *lj_program, const char *k_name, + const bool ellip_sphere) { screen=_screen; _ellipsoid_sphere=ellip_sphere; @@ -78,7 +81,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, atom=&device->atom; _block_size=device->pair_block_size(); - compile_kernels(*ucl_device,ellipsoid_program,lj_program,ellip_sphere); + compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere); // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -112,7 +115,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, } if (_multiple_forms) - ans->dev_ans.zero(); + ans->force.zero(); // Memory for ilist ordered by particle type if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS) @@ -121,6 +124,12 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + neigh_tex.bind_float(atom->x,4); + pos_tex.bind_float(atom->x,4); + quat_tex.bind_float(atom->quat,4); + lj_pos_tex.bind_float(atom->x,4); + lj_quat_tex.bind_float(atom->quat,4); + return 0; } @@ -241,14 +250,12 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, int stride=nbor->nbor_pitch(); if (shared_types) { k_nbor_fast.set_size(GX,BX); - k_nbor_fast.run(&atom->dev_x.begin(), &cut_form.begin(), - &nbor->dev_nbor.begin(), &stride, &start, &inum, - &nbor->dev_packed.begin(), &form_low, &form_high); + k_nbor_fast.run(&atom->x, &cut_form, &nbor->dev_nbor, &stride, &start, + &inum, &nbor->dev_packed, &form_low, &form_high); } else { k_nbor.set_size(GX,BX); - k_nbor.run(&atom->dev_x.begin(), &cut_form.begin(), &ntypes, - &nbor->dev_nbor.begin(), &stride, &start, &inum, - &nbor->dev_packed.begin(), &form_low, &form_high); + k_nbor.run(&atom->x, &cut_form, &ntypes, &nbor->dev_nbor, &stride, + &start, &inum, &nbor->dev_packed, &form_low, &form_high); } } @@ -437,11 +444,18 @@ double BaseEllipsoidT::host_memory_usage_base() const { template void BaseEllipsoidT::compile_kernels(UCL_Device &dev, - const char *ellipsoid_string, - const char *lj_string, const bool e_s) { + const void *ellipsoid_string, + const void *lj_string, + const char *kname, const bool e_s) { if (_compiled) return; + std::string kns=kname; + std::string s_sphere_ellipsoid=kns+"_sphere_ellipsoid"; + std::string s_ellipsoid_sphere=kns+"_ellipsoid_sphere"; + std::string s_lj=kns+"_lj"; + std::string s_lj_fast=kns+"_lj_fast"; + std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ std::string(OCL_PRECISION_COMPILE)+" -D"+ std::string(OCL_VENDOR); @@ -450,18 +464,23 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev, nbor_program->load_string(ellipsoid_nbor,flags.c_str()); k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast"); k_nbor.set_function(*nbor_program,"kernel_nbor"); + neigh_tex.get_texture(*nbor_program,"pos_tex"); ellipsoid_program=new UCL_Program(dev); ellipsoid_program->load_string(ellipsoid_string,flags.c_str()); - k_ellipsoid.set_function(*ellipsoid_program,"kernel_ellipsoid"); + k_ellipsoid.set_function(*ellipsoid_program,kname); + pos_tex.get_texture(*ellipsoid_program,"pos_tex"); + quat_tex.get_texture(*ellipsoid_program,"quat_tex"); lj_program=new UCL_Program(dev); lj_program->load_string(lj_string,flags.c_str()); - k_sphere_ellipsoid.set_function(*lj_program,"kernel_sphere_ellipsoid"); - k_lj_fast.set_function(*lj_program,"kernel_lj_fast"); - k_lj.set_function(*lj_program,"kernel_lj"); + k_sphere_ellipsoid.set_function(*lj_program,s_sphere_ellipsoid.c_str()); + k_lj_fast.set_function(*lj_program,s_lj_fast.c_str()); + k_lj.set_function(*lj_program,s_lj.c_str()); if (e_s) - k_ellipsoid_sphere.set_function(*lj_program,"kernel_ellipsoid_sphere"); + k_ellipsoid_sphere.set_function(*lj_program,s_ellipsoid_sphere.c_str()); + lj_pos_tex.get_texture(*lj_program,"pos_tex"); + lj_quat_tex.get_texture(*lj_program,"quat_tex"); _compiled=true; } diff --git a/lib/gpu/lal_base_ellipsoid.h b/lib/gpu/lal_base_ellipsoid.h index 7ccf5691d0..96e2e3ee50 100644 --- a/lib/gpu/lal_base_ellipsoid.h +++ b/lib/gpu/lal_base_ellipsoid.h @@ -20,8 +20,10 @@ #include "lal_balance.h" #include "mpi.h" -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "geryon/ocl_texture.h" +#elif defined(USE_CUDART) +#include "geryon/nvc_texture.h" #else #include "geryon/nvd_texture.h" #endif @@ -39,6 +41,7 @@ class BaseEllipsoid { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param ellipsoid_sphere true if ellipsoid-sphere case handled separately + * \param k_name name for the kernel for force calculation * * Returns: * - 0 if successfull @@ -49,8 +52,9 @@ class BaseEllipsoid { int init_base(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const int ntypes, - int **h_form, const char *ellipsoid_program, - const char *lj_program, const bool ellipsoid_sphere=false); + int **h_form, const void *ellipsoid_program, + const void *lj_program, const char *k_name, + const bool ellipsoid_sphere=false); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(); @@ -58,7 +62,13 @@ class BaseEllipsoid { /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ inline void resize_atom(const int nall, bool &success) { - atom->resize(nall, success); + if (atom->resize(nall, success)) { + neigh_tex.bind_float(atom->x,4); + pos_tex.bind_float(atom->x,4); + quat_tex.bind_float(atom->quat,4); + lj_pos_tex.bind_float(atom->x,4); + lj_quat_tex.bind_float(atom->quat,4); + } } /// Check if there is enough storage for neighbors and realloc if not @@ -74,7 +84,7 @@ class BaseEllipsoid { const int max_nbors, const int olist_size, bool &success) { ans->resize(nlocal, success); - if (_multiple_forms) ans->dev_ans.zero(); + if (_multiple_forms) ans->force.zero(); if (olist_size>static_cast(host_olist.numel())) { host_olist.clear(); @@ -221,8 +231,7 @@ class BaseEllipsoid { inline int block_size() { return _block_size; } // --------------------------- TEXTURES ----------------------------- - UCL_Texture pos_tex; - UCL_Texture q_tex; + UCL_Texture pos_tex, quat_tex, lj_pos_tex, lj_quat_tex, neigh_tex; protected: bool _compiled, _ellipsoid_sphere; @@ -236,8 +245,8 @@ class BaseEllipsoid { int **_host_form; int _last_ellipse, _max_last_ellipse; - void compile_kernels(UCL_Device &dev, const char *ellipsoid_string, - const char *lj_string, const bool e_s); + void compile_kernels(UCL_Device &dev, const void *ellipsoid_string, + const void *lj_string, const char *kname,const bool e_s); virtual void loop(const bool _eflag, const bool _vflag) = 0; }; diff --git a/lib/gpu/lal_buck.cpp b/lib/gpu/lal_buck.cpp index 33b73568be..091ae0f62a 100644 --- a/lib/gpu/lal_buck.cpp +++ b/lib/gpu/lal_buck.cpp @@ -13,10 +13,12 @@ email : nguyentd@ornl.gov ***************************************************************************/ -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "buck_cl.h" +#elif defined(USE_CUDART) +const char *buck=0; #else -#include "buck_ptx.h" +#include "buck_cubin.h" #endif #include "lal_buck.h" @@ -50,7 +52,7 @@ int BuckT::init(const int ntypes, double **host_cutsq, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,buck); + _screen,buck,"k_buck"); if (success!=0) return success; @@ -132,20 +134,17 @@ void BuckT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(), - &coeff2.begin(), &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, &this->_threads_per_atom); + this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(), - &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lal_buck.cu b/lib/gpu/lal_buck.cu index 1281fef645..b0c817ad35 100644 --- a/lib/gpu/lal_buck.cu +++ b/lib/gpu/lal_buck.cu @@ -15,14 +15,16 @@ #ifdef NV_KERNEL #include "lal_aux_fun1.h" -texture pos_tex; #ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } +texture pos_tex; +#else +texture pos_tex; #endif +#else +#define pos_tex x_ #endif -__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1, +__kernel void k_buck(__global numtyp4 *x_, __global numtyp4 *coeff1, __global numtyp4* coeff2, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, @@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp factor_lj; @@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int jtype=jx.w; // Compute r12 @@ -104,7 +106,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1, } // if ii } -__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in, +__kernel void k_buck_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in, __global numtyp4* coeff2_in, __global numtyp* sp_lj_in, __global int *dev_nbor, __global int *dev_packed, @@ -140,7 +142,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); @@ -151,7 +153,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int mtype=itype+jx.w; // Compute r12 diff --git a/lib/gpu/lal_buck_coul.cpp b/lib/gpu/lal_buck_coul.cpp index 42dbfb3e76..75e7231027 100644 --- a/lib/gpu/lal_buck_coul.cpp +++ b/lib/gpu/lal_buck_coul.cpp @@ -13,10 +13,12 @@ email : nguyentd@ornl.gov ***************************************************************************/ -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "buck_coul_cl.h" +#elif defined(USE_CUDART) +const char *buck_coul=0; #else -#include "buck_coul_ptx.h" +#include "buck_coul_cubin.h" #endif #include "lal_buck_coul.h" @@ -52,7 +54,7 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq, const double qqrd2e) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,buck_coul); + _screen,buck_coul,"k_buck_coul"); if (success!=0) return success; @@ -142,23 +144,18 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(), - &coeff2.begin(), &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, - &this->atom->dev_q.begin(), &cutsq.begin(), - &_qqrd2e, &this->_threads_per_atom); + this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(), - &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->atom->dev_q.begin(), - &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom); + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, + &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lal_buck_coul.cu b/lib/gpu/lal_buck_coul.cu index df4a824b40..45cc36b0ce 100644 --- a/lib/gpu/lal_buck_coul.cu +++ b/lib/gpu/lal_buck_coul.cu @@ -14,18 +14,22 @@ // ***************************************************************************/ #ifdef NV_KERNEL + #include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE texture pos_tex; texture q_tex; -#ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } -ucl_inline float fetch_q(const int& i, const float *q) - { return tex1Dfetch(q_tex, i); } -#endif +#else +texture pos_tex; +texture q_tex; #endif -__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1, +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_buck_coul(__global numtyp4 *x_, __global numtyp4 *coeff1, __global numtyp4* coeff2, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, @@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; - numtyp qtmp=fetch_q(i,q_); + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + numtyp qtmp; fetch(qtmp,i,q_tex); int itype=ix.w; for ( ; nborinit_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,buck_coul_long); + _screen,buck_coul_long,"k_buck_coul_long"); if (success!=0) return success; @@ -145,25 +147,19 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(), - &coeff2.begin(), &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->dev_q.begin(), - &cutsq.begin(), &_cut_coulsq, &_qqrd2e, + this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &cutsq, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), - &coeff2.begin(), &_lj_types, &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->atom->dev_q.begin(), - &cutsq.begin(), &_cut_coulsq, - &_qqrd2e, &_g_ewald, &this->_threads_per_atom); + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, &cutsq, + &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lal_buck_coul_long.cu b/lib/gpu/lal_buck_coul_long.cu index 95c13dc263..9ab3ed32c5 100644 --- a/lib/gpu/lal_buck_coul_long.cu +++ b/lib/gpu/lal_buck_coul_long.cu @@ -14,18 +14,22 @@ // ***************************************************************************/ #ifdef NV_KERNEL + #include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE texture pos_tex; texture q_tex; -#ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } -ucl_inline float fetch_q(const int& i, const float *q) - { return tex1Dfetch(q_tex, i); } -#endif +#else +texture pos_tex; +texture q_tex; #endif -__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1, +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_buck_coul_long(__global numtyp4 *x_, __global numtyp4 *coeff1, __global numtyp4* coeff2, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, @@ -62,8 +66,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; - numtyp qtmp=fetch_q(i,q_); + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + numtyp qtmp; fetch(qtmp,i,q_tex); int itype=ix.w; for ( ; nborinit_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,cg_cmm); + _screen,cg_cmm,"k_cg_cmm"); if (success!=0) return success; @@ -133,19 +135,17 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), - &lj3.begin(), &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, &this->_threads_per_atom); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), - &_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + this->k_pair.run(&this->atom->x, &lj1, &lj3, + &_cmm_types, &sp_lj, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_cg_cmm.cu b/lib/gpu/lal_cg_cmm.cu index 46d1cbab50..4543320cc7 100644 --- a/lib/gpu/lal_cg_cmm.cu +++ b/lib/gpu/lal_cg_cmm.cu @@ -15,14 +15,16 @@ #ifdef NV_KERNEL #include "lal_aux_fun1.h" -texture pos_tex; #ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } +texture pos_tex; +#else +texture pos_tex; #endif +#else +#define pos_tex x_ #endif -__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, +__kernel void k_cg_cmm(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, @@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp factor_lj; @@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int jtype=jx.w; // Compute r12 @@ -109,7 +111,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, } // if ii } -__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, +__kernel void k_cg_cmm_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, __global numtyp* sp_lj_in,__global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, @@ -145,7 +147,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); @@ -156,7 +158,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int mtype=itype+jx.w; // Compute r12 diff --git a/lib/gpu/lal_cg_cmm_long.cpp b/lib/gpu/lal_cg_cmm_long.cpp index 2c12125b52..50e2977c70 100644 --- a/lib/gpu/lal_cg_cmm_long.cpp +++ b/lib/gpu/lal_cg_cmm_long.cpp @@ -13,10 +13,12 @@ email : brownw@ornl.gov ***************************************************************************/ -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "cg_cmm_long_cl.h" +#elif defined(USE_CUDART) +const char *cg_cmm_long=0; #else -#include "cg_cmm_long_ptx.h" +#include "cg_cmm_long_cubin.h" #endif #include "lal_cg_cmm_long.h" @@ -56,7 +58,7 @@ int CGCMMLongT::init(const int ntypes, double **host_cutsq, const double g_ewald) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,cg_cmm_long); + _screen,cg_cmm_long,"k_cg_cmm_long"); if (success!=0) return success; @@ -144,24 +146,19 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), - &lj3.begin(), &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, - &this->atom->dev_q.begin(), &_cut_coulsq, - &_qqrd2e, &_g_ewald, &this->_threads_per_atom); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_g_ewald, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), - &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->atom->dev_q.begin(), - &_cut_coulsq, &_qqrd2e, &_g_ewald, - &this->_threads_per_atom); + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lal_cg_cmm_long.cu b/lib/gpu/lal_cg_cmm_long.cu index b0db9d2aa3..2f1d9f2d21 100644 --- a/lib/gpu/lal_cg_cmm_long.cu +++ b/lib/gpu/lal_cg_cmm_long.cu @@ -14,18 +14,22 @@ // ***************************************************************************/ #ifdef NV_KERNEL + #include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE texture pos_tex; texture q_tex; -#ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } -ucl_inline float fetch_q(const int& i, const float *q) - { return tex1Dfetch(q_tex, i); } -#endif +#else +texture pos_tex; +texture q_tex; #endif -__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_cg_cmm_long(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, @@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; - numtyp qtmp=fetch_q(i,q_); + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + numtyp qtmp; fetch(qtmp,i,q_tex); int itype=ix.w; for ( ; nborinit_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,charmm_long); + _screen,charmm_long,"k_charmm_long"); if (success!=0) return success; @@ -148,22 +150,19 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(), - &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->dev_q.begin(), + this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), - &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->atom->dev_q.begin(), + this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, &this->_threads_per_atom); diff --git a/lib/gpu/lal_charmm_long.cu b/lib/gpu/lal_charmm_long.cu index aa88967d66..9b884e473a 100644 --- a/lib/gpu/lal_charmm_long.cu +++ b/lib/gpu/lal_charmm_long.cu @@ -14,18 +14,22 @@ // ***************************************************************************/ #ifdef NV_KERNEL + #include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE texture pos_tex; texture q_tex; -#ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } -ucl_inline float fetch_q(const int& i, const float *q) - { return tex1Dfetch(q_tex, i); } -#endif +#else +texture pos_tex; +texture q_tex; #endif -__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_charmm_long(__global numtyp4 *x_, __global numtyp4 *lj1, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, @@ -62,8 +66,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; - numtyp qtmp=fetch_q(i,q_); + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + numtyp qtmp; fetch(qtmp,i,q_tex); int itype=ix.w; for ( ; nborinit_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, - gpu_split,_screen,coul_long); + gpu_split,_screen,coul_long,"k_coul_long"); if (success!=0) return success; @@ -132,22 +134,18 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), - &lj3.begin(), &sp_cl.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->dev_q.begin(), - &_cut_coulsq, &_qqrd2e, &_g_ewald, + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_cl, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, + &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), - &_lj_types, &sp_cl.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_cl, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_coul_long.cu b/lib/gpu/lal_coul_long.cu index 88d7406206..b93010c959 100644 --- a/lib/gpu/lal_coul_long.cu +++ b/lib/gpu/lal_coul_long.cu @@ -14,18 +14,22 @@ // ***************************************************************************/ #ifdef NV_KERNEL + #include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE texture pos_tex; texture q_tex; -#ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } -ucl_inline float fetch_q(const int& i, const float *q) - { return tex1Dfetch(q_tex, i); } -#endif +#else +texture pos_tex; +texture q_tex; #endif -__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_cl_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, @@ -56,8 +60,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; - numtyp qtmp=fetch_q(i,q_); + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + numtyp qtmp; fetch(qtmp,i,q_tex); for ( ; nbor #endif -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "device_cl.h" +#elif defined(USE_CUDART) +const char *device=0; #else -#include "device_ptx.h" +#include "device_cubin.h" #endif using namespace LAMMPS_AL; @@ -42,10 +44,10 @@ DeviceT::~Device() { } template -int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, - const int first_gpu, const int last_gpu, - const int gpu_mode, const double p_split, - const int nthreads, const int t_per_atom) { +int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, + const int last_gpu, const int gpu_mode, + const double p_split, const int nthreads, + const int t_per_atom, const double cell_size) { _nthreads=nthreads; #ifdef _OPENMP omp_set_num_threads(nthreads); @@ -62,6 +64,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, _last_device=last_gpu; _gpu_mode=gpu_mode; _particle_split=p_split; + _cell_size=cell_size; // Get the rank/size within the world MPI_Comm_rank(_comm_world,&_world_me); @@ -191,7 +194,7 @@ int DeviceT::init(Answer &ans, const bool charge, } else { if (atom.charge()==false && charge) _data_in_estimate++; - if (atom.quat()==false && rot) + if (atom.quaternion()==false && rot) _data_in_estimate++; if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial)) return -3; @@ -205,7 +208,10 @@ int DeviceT::init(Answer &ans, const bool charge, _block_cell_id, _block_nbor_build, threads_per_atom, _warp_size, _time_device)) return -3; - nbor->cell_size(cell_size); + if (_cell_size<0.0) + nbor->cell_size(cell_size,cell_size); + else + nbor->cell_size(_cell_size,cell_size); _init_count++; return 0; @@ -251,7 +257,9 @@ void DeviceT::set_double_precompute template void DeviceT::init_message(FILE *screen, const char *name, const int first_gpu, const int last_gpu) { - #ifdef USE_OPENCL + #if defined(USE_OPENCL) + std::string fs=""; + #elif defined(USE_CUDART) std::string fs=""; #else std::string fs=toa(gpu->free_gigabytes())+"/"; @@ -411,13 +419,11 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, } template -void DeviceT::output_times(UCL_Timer &time_pair, - Answer &ans, - Neighbor &nbor, const double avg_split, - const double max_bytes, - const double gpu_overhead, - const double driver_overhead, - const int threads_per_atom, FILE *screen) { +void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, + Neighbor &nbor, const double avg_split, + const double max_bytes, const double gpu_overhead, + const double driver_overhead, + const int threads_per_atom, FILE *screen) { double single[9], times[9]; single[0]=atom.transfer_time()+ans.transfer_time(); @@ -574,33 +580,32 @@ int DeviceT::compile_kernels() { k_info.set_function(*dev_program,"kernel_info"); _compiled=true; - UCL_H_Vec h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED); - UCL_D_Vec d_gpu_lib_data(14,*gpu); + UCL_Vector gpu_lib_data(14,*gpu,UCL_NOT_PINNED); k_info.set_size(1,1); - k_info.run(&d_gpu_lib_data.begin()); - ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false); + k_info.run(&gpu_lib_data); + gpu_lib_data.update_host(false); - _ptx_arch=static_cast(h_gpu_lib_data[0])/100.0; + _ptx_arch=static_cast(gpu_lib_data[0])/100.0; #ifndef USE_OPENCL if (_ptx_arch>gpu->arch()) return -4; #endif - _num_mem_threads=h_gpu_lib_data[1]; - _warp_size=h_gpu_lib_data[2]; + _num_mem_threads=gpu_lib_data[1]; + _warp_size=gpu_lib_data[2]; if (_threads_per_atom<1) - _threads_per_atom=h_gpu_lib_data[3]; + _threads_per_atom=gpu_lib_data[3]; if (_threads_per_charge<1) - _threads_per_charge=h_gpu_lib_data[13]; - _pppm_max_spline=h_gpu_lib_data[4]; - _pppm_block=h_gpu_lib_data[5]; - _block_pair=h_gpu_lib_data[6]; - _max_shared_types=h_gpu_lib_data[7]; - _block_cell_2d=h_gpu_lib_data[8]; - _block_cell_id=h_gpu_lib_data[9]; - _block_nbor_build=h_gpu_lib_data[10]; - _block_bio_pair=h_gpu_lib_data[11]; - _max_bio_shared_types=h_gpu_lib_data[12]; + _threads_per_charge=gpu_lib_data[13]; + _pppm_max_spline=gpu_lib_data[4]; + _pppm_block=gpu_lib_data[5]; + _block_pair=gpu_lib_data[6]; + _max_shared_types=gpu_lib_data[7]; + _block_cell_2d=gpu_lib_data[8]; + _block_cell_id=gpu_lib_data[9]; + _block_nbor_build=gpu_lib_data[10]; + _block_bio_pair=gpu_lib_data[11]; + _max_bio_shared_types=gpu_lib_data[12]; if (static_cast(_block_pair)>gpu->group_size()) _block_pair=gpu->group_size(); @@ -634,9 +639,10 @@ Device global_device; int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads, - const int t_per_atom) { + const int t_per_atom, const double cell_size) { return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, - particle_split,nthreads,t_per_atom); + particle_split,nthreads,t_per_atom, + cell_size); } void lmp_clear_device() { diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index e71c22de8b..6cfad82054 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -49,7 +49,7 @@ class Device { int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads, - const int t_per_atom); + const int t_per_atom, const double cell_size); /// Initialize the device for Atom and Neighbor storage /** \param rot True if quaternions need to be stored @@ -239,7 +239,7 @@ class Device { int num_blocks=static_cast(ceil(static_cast(numel)/ _block_pair)); k_zero.set_size(num_blocks,_block_pair); - k_zero.run(&mem.begin(),&numel); + k_zero.run(&mem,&numel); } // -------------------------- DEVICE DATA ------------------------- @@ -288,6 +288,7 @@ class Device { double _particle_split; double _cpu_full; double _ptx_arch; + double _cell_size; // -1 if the cutoff is used int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge; int _pppm_max_spline, _pppm_block; diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index 5642e5bbfe..5182f0b11f 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -13,10 +13,12 @@ email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "eam_cl.h" +#elif defined(USE_CUDART) +const char *eam=0; #else -#include "eam_ptx.h" +#include "eam_cubin.h" #endif #include "lal_eam.h" @@ -51,32 +53,24 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, - gpu_split,_screen,eam); + gpu_split,_screen,eam,"k_eam"); if (success!=0) return success; // allocate fp - bool cpuview=false; - if (this->ucl_device->device_type()==UCL_CPU) - cpuview=true; - int ef_nall=nall; if (ef_nall==0) ef_nall=2000; _max_fp_size=static_cast(static_cast(ef_nall)*1.10); - host_fp.alloc(_max_fp_size,*(this->ucl_device)); - if (cpuview) - dev_fp.view(host_fp); - else - dev_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_WRITE_ONLY); + _fp.alloc(_max_fp_size,*(this->ucl_device),UCL_RW_OPTIMIZED,UCL_WRITE_ONLY); - k_energy.set_function(*(this->pair_program),"kernel_energy"); - k_energy_fast.set_function(*(this->pair_program),"kernel_energy_fast"); + k_energy.set_function(*(this->pair_program),"k_energy"); + k_energy_fast.set_function(*(this->pair_program),"k_energy_fast"); fp_tex.get_texture(*(this->pair_program),"fp_tex"); - fp_tex.bind_float(dev_fp,1); + fp_tex.bind_float(_fp,1); _compiled_energy = true; // Initialize timers for selected GPU @@ -236,7 +230,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, + frho_spline2.row_bytes() + z2r_spline1.row_bytes() + z2r_spline2.row_bytes() - + dev_fp.row_bytes(); + + _fp.device.row_bytes(); return 0; } @@ -255,8 +249,7 @@ void EAMT::clear() { z2r_spline1.clear(); z2r_spline2.clear(); - host_fp.clear(); - dev_fp.clear(); + _fp.clear(); time_pair2.clear(); time_fp1.clear(); @@ -303,19 +296,11 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, // ------------------- Resize FP Array for EAM -------------------- if (nall>_max_fp_size) { - dev_fp.clear(); - host_fp.clear(); - _max_fp_size=static_cast(static_cast(nall)*1.10); - host_fp.alloc(_max_fp_size,*(this->ucl_device)); - if (this->ucl_device->device_type()==UCL_CPU) - dev_fp.view(host_fp); - else - dev_fp.alloc(_max_fp_size,*(this->ucl_device)); - - fp_tex.bind_float(dev_fp,1); + _fp.resize(_max_fp_size); + fp_tex.bind_float(_fp,1); } - *fp_ptr=host_fp.begin(); + *fp_ptr=_fp.host.begin(); // ---------------------------------------------------------------- @@ -348,7 +333,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, // copy fp from device to host for comm _nlocal=nlocal; time_fp1.start(); - ucl_copy(host_fp,dev_fp,nlocal,true); + _fp.update_host(nlocal,true); time_fp1.stop(); time_fp1.sync_stop(); } @@ -380,19 +365,11 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, // ------------------- Resize FP Array for EAM -------------------- if (nall>_max_fp_size) { - dev_fp.clear(); - host_fp.clear(); - _max_fp_size=static_cast(static_cast(nall)*1.10); - host_fp.alloc(_max_fp_size,*(this->ucl_device)); - if (this->ucl_device->device_type()==UCL_CPU) - dev_fp.view(host_fp); - else - dev_fp.alloc(_max_fp_size,*(this->ucl_device)); - - fp_tex.bind_float(dev_fp,1); + _fp.resize(_max_fp_size); + fp_tex.bind_float(_fp,1); } - *fp_ptr=host_fp.begin(); + *fp_ptr=_fp.host.begin(); // ----------------------------------------------------------------- @@ -428,7 +405,7 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, // copy fp from device to host for comm _nlocal=inum_full; time_fp1.start(); - ucl_copy(host_fp,dev_fp,inum_full,true); + _fp.update_host(inum_full,true); time_fp1.stop(); time_fp1.sync_stop(); @@ -486,22 +463,18 @@ void EAMT::loop(const bool _eflag, const bool _vflag) { if (shared_types) { this->k_energy_fast.set_size(GX,BX); - this->k_energy_fast.run(&this->atom->dev_x.begin(), &type2rhor_z2r.begin(), - &type2frho.begin(), &rhor_spline2.begin(), - &frho_spline1.begin(),&frho_spline2.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &dev_fp.begin(), - &this->ans->dev_engv.begin(), &eflag, &ainum, + this->k_energy_fast.run(&this->atom->x, &type2rhor_z2r, &type2frho, + &rhor_spline2, &frho_spline1,&frho_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &_fp, &this->ans->engv, &eflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_nrho, &_nr, &this->_threads_per_atom); } else { this->k_energy.set_size(GX,BX); - this->k_energy.run(&this->atom->dev_x.begin(), &type2rhor_z2r.begin(), - &type2frho.begin(), &rhor_spline2.begin(), - &frho_spline1.begin(),&frho_spline2.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &dev_fp.begin(), - &this->ans->dev_engv.begin(),&eflag, &ainum, &nbor_pitch, + this->k_energy.run(&this->atom->x, &type2rhor_z2r, &type2frho, + &rhor_spline2, &frho_spline1, &frho_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, + &this->ans->engv,&eflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_nrho, &_nr, &this->_threads_per_atom); } @@ -536,28 +509,20 @@ void EAMT::loop2(const bool _eflag, const bool _vflag) { if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &dev_fp.begin(), - &type2rhor_z2r.begin(), - &rhor_spline1.begin(), - &z2r_spline1.begin(), - &z2r_spline2.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &_cutforcesq, &_rdr, &_nr, - &this->_threads_per_atom); + this->k_pair_fast.run(&this->atom->x, &_fp, &type2rhor_z2r, + &rhor_spline1, &z2r_spline1, &z2r_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &_cutforcesq, &_rdr, + &_nr, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &dev_fp.begin(), - &type2rhor_z2r.begin(), - &rhor_spline1.begin(), - &z2r_spline1.begin(), - &z2r_spline2.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_nr, - &this->_threads_per_atom); + this->k_pair.run(&this->atom->x, &_fp, &type2rhor_z2r, &rhor_spline1, + &z2r_spline1, &z2r_spline2, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, + &_ntypes, &_cutforcesq, &_rdr, &_nr, + &this->_threads_per_atom); } this->time_pair2.stop(); diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu index 5909adaacd..ec20bd672f 100644 --- a/lib/gpu/lal_eam.cu +++ b/lib/gpu/lal_eam.cu @@ -15,66 +15,37 @@ #ifdef NV_KERNEL #include "lal_aux_fun1.h" + +#ifndef _DOUBLE_DOUBLE texture pos_tex; texture fp_tex; - texture rhor_sp1_tex; texture rhor_sp2_tex; texture frho_sp1_tex; texture frho_sp2_tex; texture z2r_sp1_tex; texture z2r_sp2_tex; - -#ifdef _DOUBLE_DOUBLE -ucl_inline double4 fetch_rhor_sp1(const int& i, const double4 *rhor_spline1) { - return rhor_spline1[i]; -} -ucl_inline double4 fetch_rhor_sp2(const int& i, const double4 *rhor_spline2) { - return rhor_spline2[i]; -} -ucl_inline double4 fetch_frho_sp1(const int& i, const double4 *frho_spline1) { - return frho_spline1[i]; -} -ucl_inline double4 fetch_frho_sp2(const int& i, const double4 *frho_spline2) { - return frho_spline2[i]; -} -ucl_inline double4 fetch_z2r_sp1(const int& i, const double4 *z2r_spline1) { - return z2r_spline1[i]; -} -ucl_inline double4 fetch_z2r_sp2(const int& i, const double4 *z2r_spline2) { - return z2r_spline2[i]; -} +#else +texture pos_tex; +texture fp_tex; +texture rhor_sp1_tex; +texture rhor_sp2_tex; +texture frho_sp1_tex; +texture frho_sp2_tex; +texture z2r_sp1_tex; +texture z2r_sp2_tex; #endif -#ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } -ucl_inline float fetch_q(const int& i, const float *fp) - { return tex1Dfetch(fp_tex, i); } +#else -ucl_inline float4 fetch_rhor_sp1(const int& i, const float4 *rhor_spline1) - { return tex1Dfetch(rhor_sp1_tex, i); } -ucl_inline float4 fetch_rhor_sp2(const int& i, const float4 *rhor_spline2) - { return tex1Dfetch(rhor_sp2_tex, i); } -ucl_inline float4 fetch_frho_sp1(const int& i, const float4 *frho_spline1) - { return tex1Dfetch(frho_sp1_tex, i); } -ucl_inline float4 fetch_frho_sp2(const int& i, const float4 *frho_spline2) - { return tex1Dfetch(frho_sp2_tex, i); } -ucl_inline float4 fetch_z2r_sp1(const int& i, const float4 *z2r_spline1) - { return tex1Dfetch(z2r_sp1_tex, i); } -ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2) - { return tex1Dfetch(z2r_sp2_tex, i); } -#endif - -#else // OPENCL - -#define fetch_q(i,y) fp_[i] -#define fetch_rhor_sp1(i,y) rhor_spline1[i] -#define fetch_rhor_sp2(i,y) rhor_spline2[i] -#define fetch_frho_sp1(i,y) frho_spline1[i] -#define fetch_frho_sp2(i,y) frho_spline2[i] -#define fetch_z2r_sp1(i,y) z2r_spline1[i] -#define fetch_z2r_sp2(i,y) z2r_spline2[i] +#define pos_tex x_ +#define fp_tex fp_ +#define rhor_sp1_tex rhor_spline1 +#define rhor_sp2_tex rhor_spline2 +#define frho_sp1_tex frho_spline1 +#define frho_sp2_tex frho_spline2 +#define z2r_sp1_tex z2r_spline1 +#define z2r_sp2_tex z2r_spline2 #endif @@ -99,11 +70,11 @@ ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2) p -= m; \ p = MIN(p,(numtyp)1.0); \ int index = type2frho[itype]*(nrho+1)+m; \ - numtyp4 coeff = fetch_frho_sp1(index, frho_spline1); \ + numtyp4 coeff; fetch4(coeff,index,frho_sp1_tex); \ numtyp fp = (coeff.x*p + coeff.y)*p + coeff.z; \ fp_[i]=fp; \ if (eflag>0) { \ - coeff = fetch_frho_sp2(index, frho_spline2); \ + fetch4(coeff,index,frho_sp2_tex); \ energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \ engv[ii]=(acctyp)2.0*energy; \ } \ @@ -154,7 +125,7 @@ ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2) ans[ii]=f; \ } -__kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r, +__kernel void k_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r, __global int *type2frho, __global numtyp4 *rhor_spline2, __global numtyp4 *frho_spline1, @@ -178,14 +149,14 @@ __kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; for ( ; nbor { if (nghost>0) { UCL_H_Vec host_view; UCL_D_Vec dev_view; - host_view.view_offset(_nlocal,host_fp); - dev_view.view_offset(_nlocal,dev_fp); + host_view.view_offset(_nlocal,_fp.host); + dev_view.view_offset(_nlocal,_fp.device); ucl_copy(dev_view,host_view,nghost,true); } } @@ -128,8 +128,7 @@ class EAM : public BaseAtomic { bool _compiled_energy; /// Per-atom arrays - UCL_H_Vec host_fp; - UCL_D_Vec dev_fp; + UCL_Vector _fp; protected: bool _allocated; diff --git a/lib/gpu/lal_ellipsoid_extra.h b/lib/gpu/lal_ellipsoid_extra.h index cd963ffe68..e2287c0af2 100644 --- a/lib/gpu/lal_ellipsoid_extra.h +++ b/lib/gpu/lal_ellipsoid_extra.h @@ -20,6 +20,14 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #ifdef NV_KERNEL #include "lal_preprocessor.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex, quat_tex; +#else +texture pos_tex, quat_tex; +#endif +#else +#define pos_tex x_ +#define quat_tex qif #endif #define atom_info(t_per_atom, ii, tid, offset) \ @@ -411,7 +419,7 @@ ucl_inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans, ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, numtyp mat[9]) { - numtyp4 q=qif[qi]; + numtyp4 q; fetch4(q,qi,quat_tex); numtyp w2 = q.x*q.x; numtyp i2 = q.y*q.y; diff --git a/lib/gpu/lal_ellipsoid_nbor.cu b/lib/gpu/lal_ellipsoid_nbor.cu index 669973a7e5..0be6c0922d 100644 --- a/lib/gpu/lal_ellipsoid_nbor.cu +++ b/lib/gpu/lal_ellipsoid_nbor.cu @@ -15,6 +15,13 @@ #ifdef NV_KERNEL #include "lal_preprocessor.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +#else +texture pos_tex; +#endif +#else +#define pos_tex x_ #endif // --------------------------------------------------------------------------- @@ -40,14 +47,14 @@ __kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form, __global int *list_end=nbor+fast_mul(numj,nbor_pitch); __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch; - numtyp4 ix=x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul(iw,ntypes); int newj=0; for ( ; nborinit_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,ntypes,h_form,gayberne,gayberne_lj); + _screen,ntypes,h_form,gayberne,gayberne_lj, + "k_gayberne"); if (success!=0) return success; @@ -210,13 +214,13 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_ellipsoid.start(); this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->dev_x.begin(), - &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), - &this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(), - &this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(), - &stride, &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(), - &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse, - &this->_threads_per_atom); + this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + &this->shape, &this->well, &this->gamma_upsilon_mu, + &this->sigma_epsilon, &this->_lj_types, + &this->lshape, &this->nbor->dev_nbor, &stride, + &this->ans->force, &ainum, &this->ans->engv, + &this->dev_error, &eflag, &vflag, + &this->_last_ellipse, &this->_threads_per_atom); this->time_ellipsoid.stop(); if (this->_last_ellipse==this->ans->inum()) { @@ -243,17 +247,19 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_ellipsoid2.start(); this->k_sphere_ellipsoid.set_size(GX,BX); - this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(), - &this->atom->dev_quat.begin(), &this->shape.begin(), - &this->well.begin(), &this->gamma_upsilon_mu.begin(), - &this->sigma_epsilon.begin(), &this->_lj_types, &this->lshape.begin(), - &this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag, - &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); + this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat, + &this->shape, &this->well, + &this->gamma_upsilon_mu, + &this->sigma_epsilon, &this->_lj_types, + &this->lshape, &this->nbor->dev_nbor, + &stride, &this->ans->force, + &this->ans->engv, &this->dev_error, + &eflag, &vflag, &this->_last_ellipse, + &ainum, &this->_threads_per_atom); this->time_ellipsoid2.stop(); } else { - this->ans->dev_ans.zero(); - this->ans->dev_engv.zero(); + this->ans->force.zero(); + this->ans->engv.zero(); this->time_nbor1.stop(); this->time_ellipsoid.start(); this->time_ellipsoid.stop(); @@ -268,19 +274,20 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { if (this->_last_ellipseans->inum()) { if (this->_shared_types) { this->k_lj_fast.set_size(GX,BX); - this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(), - &this->lj3.begin(), &this->gamma_upsilon_mu.begin(), &stride, - &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &this->dev_error.begin(), - &eflag, &vflag, &this->_last_ellipse, &ainum, - &this->_threads_per_atom); + this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3, + &this->gamma_upsilon_mu, &stride, + &this->nbor->dev_packed, &this->ans->force, + &this->ans->engv, &this->dev_error, &eflag, + &vflag, &this->_last_ellipse, &ainum, + &this->_threads_per_atom); } else { this->k_lj.set_size(GX,BX); - this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(), - &this->lj3.begin(), &this->_lj_types, &this->gamma_upsilon_mu.begin(), - &stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag, - &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); + this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3, + &this->_lj_types, &this->gamma_upsilon_mu, &stride, + &this->nbor->dev_packed, &this->ans->force, + &this->ans->engv, &this->dev_error, &eflag, + &vflag, &this->_last_ellipse, &ainum, + &this->_threads_per_atom); } } this->time_lj.stop(); @@ -294,13 +301,12 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_nbor1.stop(); this->time_ellipsoid.start(); this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->dev_x.begin(), - &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), - &this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(), - &this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(), - &stride, &this->ans->dev_ans.begin(), &ainum, - &this->ans->dev_engv.begin(), &this->dev_error.begin(), - &eflag, &vflag, &ainum, &this->_threads_per_atom); + this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + &this->shape, &this->well, &this->gamma_upsilon_mu, + &this->sigma_epsilon, &this->_lj_types, &this->lshape, + &this->nbor->dev_nbor, &stride, &this->ans->force, + &ainum, &this->ans->engv, &this->dev_error, + &eflag, &vflag, &ainum, &this->_threads_per_atom); this->time_ellipsoid.stop(); } } diff --git a/lib/gpu/lal_gayberne.cu b/lib/gpu/lal_gayberne.cu index e2bfe4b1b5..dbff1178ef 100644 --- a/lib/gpu/lal_gayberne.cu +++ b/lib/gpu/lal_gayberne.cu @@ -18,7 +18,7 @@ #endif ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, - numtyp ans[9]) + numtyp ans[9]) { numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]- m[2]*m[6]*m[4]+m[1]*m[6]*m[5]- @@ -80,15 +80,15 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den; } -__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q, - __global numtyp4* shape, __global numtyp4* well, - __global numtyp *gum, __global numtyp2* sig_eps, - const int ntypes, __global numtyp *lshape, - __global int *dev_nbor, const int stride, - __global acctyp4 *ans, const int astride, - __global acctyp *engv, __global int *err_flag, - const int eflag, const int vflag, const int inum, - const int t_per_atom) { +__kernel void k_gayberne(__global numtyp4* x_,__global numtyp4 *q, + __global numtyp4* shape, __global numtyp4* well, + __global numtyp *gum, __global numtyp2* sig_eps, + const int ntypes, __global numtyp *lshape, + __global int *dev_nbor, const int stride, + __global acctyp4 *ans, const int astride, + __global acctyp *engv, __global int *err_flag, + const int eflag, const int vflag, const int inum, + const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -117,7 +117,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q, nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - numtyp4 ix=x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; numtyp a1[9], b1[9], g1[9]; numtyp4 ishape=shape[itype]; @@ -136,7 +136,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); int jtype=jx.w; // Compute r12 diff --git a/lib/gpu/lal_gayberne_lj.cu b/lib/gpu/lal_gayberne_lj.cu index bf294e1bb4..05a9b1008a 100644 --- a/lib/gpu/lal_gayberne_lj.cu +++ b/lib/gpu/lal_gayberne_lj.cu @@ -17,15 +17,15 @@ #include "lal_ellipsoid_extra.h" #endif -__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q, - __global numtyp4* shape,__global numtyp4* well, - __global numtyp *gum, __global numtyp2* sig_eps, - const int ntypes, __global numtyp *lshape, - __global int *dev_nbor, const int stride, - __global acctyp4 *ans, __global acctyp *engv, - __global int *err_flag, const int eflag, - const int vflag,const int start, const int inum, - const int t_per_atom) { +__kernel void k_gayberne_sphere_ellipsoid(__global numtyp4 *x_, + __global numtyp4 *q, __global numtyp4* shape, + __global numtyp4* well, __global numtyp *gum, + __global numtyp2* sig_eps, const int ntypes, + __global numtyp *lshape, __global int *dev_nbor, + const int stride, __global acctyp4 *ans, + __global acctyp *engv, __global int *err_flag, + const int eflag, const int vflag,const int start, + const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; @@ -51,7 +51,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q, nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - numtyp4 ix=x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; numtyp oner=shape[itype].x; @@ -64,7 +64,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); int jtype=jx.w; // Compute r12 @@ -236,14 +236,13 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q, } // if ii } -__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, - __global numtyp4* lj3, const int lj_types, - __global numtyp *gum, - const int stride, __global int *dev_ij, - __global acctyp4 *ans, __global acctyp *engv, - __global int *err_flag, const int eflag, - const int vflag, const int start, const int inum, - const int t_per_atom) { +__kernel void k_gayberne_lj(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *gum, const int stride, + __global int *dev_ij, __global acctyp4 *ans, + __global acctyp *engv, __global int *err_flag, + const int eflag, const int vflag, const int start, + const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; @@ -269,7 +268,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; numtyp factor_lj; @@ -279,7 +278,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); int jtype=jx.w; // Compute r12 @@ -319,13 +318,13 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, } // if ii } -__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, - __global numtyp4* lj3_in, __global numtyp *gum, - const int stride, __global int *dev_ij, - __global acctyp4 *ans, __global acctyp *engv, - __global int *err_flag, const int eflag, - const int vflag, const int start, const int inum, - const int t_per_atom) { +__kernel void k_gayberne_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, __global numtyp *gum, + const int stride, __global int *dev_ij, + __global acctyp4 *ans, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag, const int start, + const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; @@ -358,7 +357,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); @@ -369,7 +368,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); int mtype=itype+jx.w; // Compute r12 @@ -406,3 +405,4 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, ans,engv); } // if ii } + diff --git a/lib/gpu/lal_lj.cpp b/lib/gpu/lal_lj.cpp index a90e96f174..6c0609a17b 100644 --- a/lib/gpu/lal_lj.cpp +++ b/lib/gpu/lal_lj.cpp @@ -13,10 +13,12 @@ email : brownw@ornl.gov ***************************************************************************/ -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "lj_cl.h" +#elif defined(USE_CUDART) +const char *lj=0; #else -#include "lj_ptx.h" +#include "lj_cubin.h" #endif #include "lal_lj.h" @@ -51,7 +53,7 @@ int LJT::init(const int ntypes, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj); + _screen,lj,"k_lj"); if (success!=0) return success; @@ -133,20 +135,17 @@ void LJT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), - &lj3.begin(), &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, &this->_threads_per_atom); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), - &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lal_lj.cu b/lib/gpu/lal_lj.cu index 12e2a487ec..0a049d187e 100644 --- a/lib/gpu/lal_lj.cu +++ b/lib/gpu/lal_lj.cu @@ -15,14 +15,16 @@ #ifdef NV_KERNEL #include "lal_aux_fun1.h" -texture pos_tex; #ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } +texture pos_tex; +#else +texture pos_tex; #endif +#else +#define pos_tex x_ #endif -__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, +__kernel void k_lj(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, @@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp factor_lj; @@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int jtype=jx.w; // Compute r12 @@ -101,7 +103,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, } // if ii } -__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, +__kernel void k_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, __global numtyp* sp_lj_in, __global int *dev_nbor, __global int *dev_packed, @@ -137,7 +139,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); @@ -148,7 +150,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int mtype=itype+jx.w; // Compute r12 diff --git a/lib/gpu/lal_lj96.cpp b/lib/gpu/lal_lj96.cpp index 6331574b85..05feadc5e2 100644 --- a/lib/gpu/lal_lj96.cpp +++ b/lib/gpu/lal_lj96.cpp @@ -13,10 +13,12 @@ email : brownw@ornl.gov ***************************************************************************/ -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "lj96_cl.h" +#elif defined(USE_CUDART) +const char *lj96=0; #else -#include "lj96_ptx.h" +#include "lj96_cubin.h" #endif #include "lal_lj96.h" @@ -51,7 +53,7 @@ int LJ96T::init(const int ntypes, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj96); + _screen,lj96,"k_lj96"); if (success!=0) return success; @@ -133,19 +135,17 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), - &lj3.begin(), &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, &this->_threads_per_atom); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), - &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + this->k_pair.run(&this->atom->x, &lj1, &lj3, + &_lj_types, &sp_lj, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_lj96.cu b/lib/gpu/lal_lj96.cu index c5ea89a74b..a5fc6c89df 100644 --- a/lib/gpu/lal_lj96.cu +++ b/lib/gpu/lal_lj96.cu @@ -15,14 +15,16 @@ #ifdef NV_KERNEL #include "lal_aux_fun1.h" -texture pos_tex; #ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } +texture pos_tex; +#else +texture pos_tex; #endif +#else +#define pos_tex x_ #endif -__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, +__kernel void k_lj96(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, @@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp factor_lj; @@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int jtype=jx.w; // Compute r12 @@ -102,7 +104,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, } // if ii } -__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, +__kernel void k_lj96_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, __global numtyp* sp_lj_in, __global int *dev_nbor, __global int *dev_packed, @@ -138,7 +140,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); @@ -149,7 +151,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int mtype=itype+jx.w; // Compute r12 diff --git a/lib/gpu/lal_lj_class2_long.cpp b/lib/gpu/lal_lj_class2_long.cpp index d5d67e8d39..cdb040815f 100644 --- a/lib/gpu/lal_lj_class2_long.cpp +++ b/lib/gpu/lal_lj_class2_long.cpp @@ -13,10 +13,12 @@ email : brownw@ornl.gov ***************************************************************************/ -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "lj_class2_long_cl.h" +#elif defined(USE_CUDART) +const char *lj_class2_long=0; #else -#include "lj_class2_long_ptx.h" +#include "lj_class2_long_cubin.h" #endif #include "lal_lj_class2_long.h" @@ -55,7 +57,7 @@ int LJClass2LongT::init(const int ntypes, double **host_cutsq, const double g_ewald) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj_class2_long); + _screen,lj_class2_long,"k_lj_class2_long"); if (success!=0) return success; @@ -143,22 +145,19 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), - &lj3.begin(), &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->dev_q.begin(), + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), - &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, + this->k_pair.run(&this->atom->x, &lj1, &lj3, + &_lj_types, &sp_lj, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_lj_class2_long.cu b/lib/gpu/lal_lj_class2_long.cu index aabdbb9c2d..e08baf5a5f 100644 --- a/lib/gpu/lal_lj_class2_long.cu +++ b/lib/gpu/lal_lj_class2_long.cu @@ -14,18 +14,22 @@ // ***************************************************************************/ #ifdef NV_KERNEL + #include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE texture pos_tex; texture q_tex; -#ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } -ucl_inline float fetch_q(const int& i, const float *q) - { return tex1Dfetch(q_tex, i); } -#endif +#else +texture pos_tex; +texture q_tex; #endif -__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_lj_class2_long(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, @@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; - numtyp qtmp=fetch_q(i,q_); + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + numtyp qtmp; fetch(qtmp,i,q_tex); int itype=ix.w; for ( ; nborinit_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj_coul); + _screen,lj_coul,"k_lj_coul"); if (success!=0) return success; @@ -145,23 +147,18 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), - &lj3.begin(), &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, - &this->atom->dev_q.begin(), &cutsq.begin(), - &_qqrd2e, &this->_threads_per_atom); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), - &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->atom->dev_q.begin(), - &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom); + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, + &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lal_lj_coul.cu b/lib/gpu/lal_lj_coul.cu index 221e5cdc8f..feccf7fc4c 100644 --- a/lib/gpu/lal_lj_coul.cu +++ b/lib/gpu/lal_lj_coul.cu @@ -14,18 +14,22 @@ // ***************************************************************************/ #ifdef NV_KERNEL + #include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE texture pos_tex; texture q_tex; -#ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } -ucl_inline float fetch_q(const int& i, const float *q) - { return tex1Dfetch(q_tex, i); } -#endif +#else +texture pos_tex; +texture q_tex; #endif -__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_lj_coul(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, @@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; - numtyp qtmp=fetch_q(i,q_); + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + numtyp qtmp; fetch(qtmp,i,q_tex); int itype=ix.w; for ( ; nborinit_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj_coul_long); + _screen,lj_coul_long,"k_lj_coul_long"); if (success!=0) return success; @@ -143,22 +145,19 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), - &lj3.begin(), &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->dev_q.begin(), + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), - &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, + this->k_pair.run(&this->atom->x, &lj1, &lj3, + &_lj_types, &sp_lj, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_lj_coul_long.cu b/lib/gpu/lal_lj_coul_long.cu index 686186a4e4..9b655ab304 100644 --- a/lib/gpu/lal_lj_coul_long.cu +++ b/lib/gpu/lal_lj_coul_long.cu @@ -14,18 +14,22 @@ // ***************************************************************************/ #ifdef NV_KERNEL + #include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE texture pos_tex; texture q_tex; -#ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } -ucl_inline float fetch_q(const int& i, const float *q) - { return tex1Dfetch(q_tex, i); } -#endif +#else +texture pos_tex; +texture q_tex; #endif -__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_lj_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, @@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; - numtyp qtmp=fetch_q(i,q_); + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + numtyp qtmp; fetch(qtmp,i,q_tex); int itype=ix.w; for ( ; nborinit_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj_expand); + _screen,lj_expand,"k_lj_expand"); if (success!=0) return success; @@ -133,20 +135,17 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), - &lj3.begin(), &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, &this->_threads_per_atom); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), - &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lal_lj_expand.cu b/lib/gpu/lal_lj_expand.cu index c4d59ab189..3e98ed7d9c 100644 --- a/lib/gpu/lal_lj_expand.cu +++ b/lib/gpu/lal_lj_expand.cu @@ -14,15 +14,19 @@ // ***************************************************************************/ #ifdef NV_KERNEL + #include "lal_aux_fun1.h" -texture pos_tex; #ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } -#endif +texture pos_tex; +#else +texture pos_tex; #endif -__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, +#else +#define pos_tex x_ +#endif + +__kernel void k_lj_expand(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, @@ -51,7 +55,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp factor_lj; @@ -61,7 +65,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int jtype=jx.w; // Compute r12 @@ -104,7 +108,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, } // if ii } -__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, +__kernel void k_lj_expand_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, __global numtyp* sp_lj_in, __global int *dev_nbor, __global int *dev_packed, @@ -140,7 +144,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); @@ -151,7 +155,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int mtype=itype+jx.w; // Compute r12 diff --git a/lib/gpu/lal_morse.cpp b/lib/gpu/lal_morse.cpp index 9666517750..5ec2afa3b5 100644 --- a/lib/gpu/lal_morse.cpp +++ b/lib/gpu/lal_morse.cpp @@ -13,10 +13,12 @@ email : brownw@ornl.gov ***************************************************************************/ -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "morse_cl.h" +#elif defined(USE_CUDART) +const char *morse=0; #else -#include "morse_ptx.h" +#include "morse_cubin.h" #endif #include "lal_morse.h" @@ -51,7 +53,7 @@ int MorseT::init(const int ntypes, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,morse); + _screen,morse,"k_morse"); if (success!=0) return success; @@ -132,20 +134,17 @@ void MorseT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &mor1.begin(), - &mor2.begin(), &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, &this->_threads_per_atom); + this->k_pair_fast.run(&this->atom->x, &mor1, &mor2, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &mor1.begin(), &mor2.begin(), - &_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + this->k_pair.run(&this->atom->x, &mor1, &mor2, &_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lal_morse.cu b/lib/gpu/lal_morse.cu index bd9ae49c8c..4d89180390 100644 --- a/lib/gpu/lal_morse.cu +++ b/lib/gpu/lal_morse.cu @@ -14,15 +14,19 @@ // ***************************************************************************/ #ifdef NV_KERNEL + #include "lal_aux_fun1.h" -texture pos_tex; #ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } -#endif +texture pos_tex; +#else +texture pos_tex; #endif -__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1, +#else +#define pos_tex x_ +#endif + +__kernel void k_morse(__global numtyp4 *x_, __global numtyp4 *mor1, __global numtyp2* mor2, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, @@ -51,7 +55,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp factor_lj; @@ -61,7 +65,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int jtype=jx.w; // Compute r12 @@ -102,7 +106,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1, } // if ii } -__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in, +__kernel void k_morse_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in, __global numtyp2* mor2_in, __global numtyp* sp_lj_in, __global int *dev_nbor, __global int *dev_packed, @@ -138,7 +142,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); @@ -149,7 +153,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int mtype=itype+jx.w; // Compute r12 diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index 6a086745c5..a033b507a4 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -84,7 +84,7 @@ bool Neighbor::init(NeighborShared *shared, const int inum, _max_atoms=1000; _max_host=static_cast(static_cast(host_inum)*1.10); - _max_nbors=max_nbors; + _max_nbors=(max_nbors/threads_per_atom+1)*threads_per_atom; _maxspecial=maxspecial; if (gpu_nbor==0) @@ -124,17 +124,14 @@ void Neighbor::alloc(bool &success) { _c_bytes+=dev_packed.row_bytes(); } if (_max_host>0) { - host_nbor.clear(); - dev_host_nbor.clear(); - dev_host_numj.clear(); + nbor_host.clear(); + dev_numj_host.clear(); host_ilist.clear(); host_jlist.clear(); - success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev, - UCL_RW_OPTIMIZED)==UCL_SUCCESS); - success=success && (dev_host_nbor.alloc(_max_nbors*_max_host, - *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && (dev_host_numj.alloc(_max_host,*dev, + success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_RW_OPTIMIZED, + UCL_WRITE_ONLY)==UCL_SUCCESS) && success; + success=success && (dev_numj_host.alloc(_max_host,*dev, UCL_WRITE_ONLY)==UCL_SUCCESS); success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); if (!success) @@ -145,16 +142,16 @@ void Neighbor::alloc(bool &success) { UCL_NOT_PINNED)==UCL_SUCCESS); if (!success) return; - int *ptr=host_nbor.begin(); + int *ptr=nbor_host.host.begin(); for (int i=0; i<_max_host; i++) { host_jlist[i]=ptr; ptr+=_max_nbors; } - _c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes(); + _c_bytes+=nbor_host.device.row_bytes()+dev_numj_host.row_bytes(); } else { // Some OpenCL implementations return errors for NULL pointers as args - dev_host_nbor.view(dev_nbor); - dev_host_numj.view(dev_nbor); + nbor_host.device.view(dev_nbor); + dev_numj_host.view(dev_nbor); } if (_maxspecial>0) { dev_nspecial.clear(); @@ -194,10 +191,9 @@ void Neighbor::clear() { host_packed.clear(); host_acc.clear(); dev_nbor.clear(); - dev_host_nbor.clear(); + nbor_host.clear(); dev_packed.clear(); - host_nbor.clear(); - dev_host_numj.clear(); + dev_numj_host.clear(); host_ilist.clear(); host_jlist.clear(); dev_nspecial.clear(); @@ -215,8 +211,8 @@ void Neighbor::clear() { double Neighbor::host_memory_usage() const { if (_gpu_nbor>0) { if (_gpu_host) - return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+ - host_jlist.row_bytes(); + return nbor_host.device.row_bytes()*nbor_host.rows()+ + host_ilist.row_bytes()+host_jlist.row_bytes(); else return 0; } else @@ -285,8 +281,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj, int GX=static_cast(ceil(static_cast(inum)*_threads_per_atom/ block_size)); _shared->k_nbor.set_size(GX,block_size); - _shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum, - &_threads_per_atom); + _shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom); time_kernel.stop(); } } @@ -295,31 +290,23 @@ template void Neighbor::resize_max_neighbors(const int maxn, bool &success) { if (maxn>_max_nbors) { int mn=static_cast(static_cast(maxn)*1.10); - dev_nbor.clear(); - success=success && - (dev_nbor.alloc((mn+1)*_max_atoms,*dev)==UCL_SUCCESS); + mn=(mn/_threads_per_atom+1)*_threads_per_atom; + success=success && (dev_nbor.resize((mn+1)*_max_atoms)==UCL_SUCCESS); _gpu_bytes=dev_nbor.row_bytes(); if (_max_host>0) { - host_nbor.clear(); - dev_host_nbor.clear(); - success=success && (host_nbor.alloc(mn*_max_host,*dev, - UCL_RW_OPTIMIZED)==UCL_SUCCESS); - success=success && (dev_host_nbor.alloc(mn*_max_host, - *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); - int *ptr=host_nbor.begin(); + success=success && (nbor_host.resize(mn*_max_host)==UCL_SUCCESS); + int *ptr=nbor_host.host.begin(); for (int i=0; i<_max_host; i++) { host_jlist[i]=ptr; ptr+=mn; } - _gpu_bytes+=dev_host_nbor.row_bytes(); + _gpu_bytes+=nbor_host.row_bytes(); } else { - dev_host_nbor.view(dev_nbor); - dev_host_numj.view(dev_nbor); + nbor_host.device.view(dev_nbor); + dev_numj_host.view(dev_nbor); } if (_alloc_packed) { - dev_packed.clear(); - success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev, - UCL_READ_ONLY)==UCL_SUCCESS); + success=success && (dev_packed.resize((mn+2)*_max_atoms)==UCL_SUCCESS); _gpu_bytes+=dev_packed.row_bytes(); } _max_nbors=mn; @@ -337,16 +324,12 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, // Calculate number of cells and allocate storage for binning as necessary int ncellx, ncelly, ncellz, ncell_3d; - ncellx = static_cast(ceil(((subhi[0] - sublo[0]) + - 2.0*_cell_size)/_cell_size)); - ncelly = static_cast(ceil(((subhi[1] - sublo[1]) + - 2.0*_cell_size)/_cell_size)); - ncellz = static_cast(ceil(((subhi[2] - sublo[2]) + - 2.0*_cell_size)/_cell_size)); + int ghost_cells=2*_cells_in_cutoff; + ncellx = static_cast(ceil((subhi[0]-sublo[0])/_cell_size))+ghost_cells; + ncelly = static_cast(ceil((subhi[1]-sublo[1])/_cell_size))+ghost_cells; + ncellz = static_cast(ceil((subhi[2]-sublo[2])/_cell_size))+ghost_cells; ncell_3d = ncellx * ncelly * ncellz; if (ncell_3d+1>_ncells) { - dev_cell_counts.clear(); - dev_cell_counts.alloc(ncell_3d+1,dev_nbor); if (_gpu_nbor==2) { if (_ncells>0) { host_cell_counts.clear(); @@ -355,11 +338,19 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, cell_iter = new int[ncell_3d+1]; host_cell_counts.alloc(ncell_3d+1,dev_nbor); } + + if (_gpu_nbor==2 && atom.host_view()) + dev_cell_counts.view(host_cell_counts); + else { + dev_cell_counts.clear(); + dev_cell_counts.alloc(ncell_3d+1,dev_nbor); + } + _ncells=ncell_3d+1; _cell_bytes=dev_cell_counts.row_bytes(); } - const numtyp cell_size_cast=static_cast(_cell_size); + const numtyp cutoff_cast=static_cast(_cutoff); if (_maxspecial>0) { time_nbor.start(); @@ -379,8 +370,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const int g2x=static_cast(ceil(static_cast(_maxspecial)/b2x)); const int g2y=static_cast(ceil(static_cast(nt)/b2y)); _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); - _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(), - &_maxspecial,&nt); + _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt); time_transpose.stop(); } @@ -392,28 +382,48 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, // Build cell list on CPU host_cell_counts.zero(); - double m_cell_size=-_cell_size; - double dx=subhi[0]-sublo[0]+_cell_size; - double dy=subhi[1]-sublo[1]+_cell_size; - double dz=subhi[2]-sublo[2]+_cell_size; + double i_cell_size=1.0/_cell_size; - for (int i=0; idx) px=dx; - if (py>dy) py=dy; - if (pz>dz) pz=dz; + + int ix = static_cast(px*i_cell_size+1); + ix = std::max(ix,_cells_in_cutoff); + ix = std::min(ix,ncellx-offset_hi); + int iy = static_cast(py*i_cell_size+1); + iy = std::max(iy,_cells_in_cutoff); + iy = std::min(iy,ncelly-offset_hi); + int iz = static_cast(pz*i_cell_size+1); + iz = std::max(iz,_cells_in_cutoff); + iz = std::min(iz,ncellz-offset_hi); - int id=static_cast(px/_cell_size + 1.0) + - static_cast(py/_cell_size + 1.0) * ncellx + - static_cast(pz/_cell_size + 1.0) * ncellx * ncelly; + int id = ix+iy*ncellx+iz*ncellx*ncelly; + cell_id[i] = id; + host_cell_counts[id+1]++; + } - cell_id[i]=id; + for (int i=nt; i(px*i_cell_size+1); + ix = std::max(ix,0); + ix = std::min(ix,ncellx-1); + int iy = static_cast(py*i_cell_size+1); + iy = std::max(iy,0); + iy = std::min(iy,ncelly-1); + int iz = static_cast(pz*i_cell_size+1); + iz = std::max(iz,0); + iz = std::min(iz,ncellz-1); + + int id = ix+iy*ncellx+iz*ncellx*ncelly; + cell_id[i] = id; host_cell_counts[id+1]++; } @@ -451,41 +461,39 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, time_kernel.start(); _nbor_pitch=inum; - _shared->neigh_tex.bind_float(atom.dev_x,4); + _shared->neigh_tex.bind_float(atom.x,4); // If binning on GPU, do this now if (_gpu_nbor==1) { + const numtyp i_cell_size=static_cast(1.0/_cell_size); const int neigh_block=_block_cell_id; const int GX=(int)ceil((float)nall/neigh_block); const numtyp sublo0=static_cast(sublo[0]); const numtyp sublo1=static_cast(sublo[1]); const numtyp sublo2=static_cast(sublo[2]); - const numtyp subhi0=static_cast(subhi[0]); - const numtyp subhi1=static_cast(subhi[1]); - const numtyp subhi2=static_cast(subhi[2]); _shared->k_cell_id.set_size(GX,neigh_block); - _shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(), - &atom.dev_particle_id.begin(), - &sublo0, &sublo1, &sublo2, &subhi0, &subhi1, - &subhi2, &cell_size_cast, &ncellx, &ncelly, &nall); + _shared->k_cell_id.run(&atom.x, &atom.dev_cell_id, + &atom.dev_particle_id, &sublo0, &sublo1, + &sublo2, &i_cell_size, &ncellx, &ncelly, &ncellz, + &nt, &nall, &_cells_in_cutoff); atom.sort_neighbor(nall); /* calculate cell count */ _shared->k_cell_counts.set_size(GX,neigh_block); - _shared->k_cell_counts.run(&atom.dev_cell_id.begin(), - &dev_cell_counts.begin(), &nall, &ncell_3d); + _shared->k_cell_counts.run(&atom.dev_cell_id, &dev_cell_counts, &nall, + &ncell_3d); } /* build the neighbor list */ const int cell_block=_block_nbor_build; - _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1); - _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(), - &dev_cell_counts.begin(), &dev_nbor.begin(), - &dev_host_nbor.begin(), &dev_host_numj.begin(), - &_max_nbors,&cell_size_cast, - &ncellx, &ncelly, &ncellz, &inum, &nt, &nall, - &_threads_per_atom); + _shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)* + (ncellz-ghost_cells),cell_block,1); + _shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id, + &dev_cell_counts, &dev_nbor, &nbor_host, + &dev_numj_host, &_max_nbors, &cutoff_cast, &ncellx, + &ncelly, &ncellz, &inum, &nt, &nall, + &_threads_per_atom, &_cells_in_cutoff); /* Get the maximum number of nbors and realloc if necessary */ UCL_D_Vec numj; @@ -494,7 +502,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, if (nt>inum) { UCL_H_Vec host_offset; host_offset.view_offset(inum,host_acc,nt-inum); - ucl_copy(host_offset,dev_host_numj,nt-inum,true); + ucl_copy(host_offset,dev_numj_host,nt-inum,true); } if (_gpu_nbor!=2) { @@ -521,17 +529,16 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const int GX2=static_cast(ceil(static_cast (nt*_threads_per_atom)/cell_block)); _shared->k_special.set_size(GX2,cell_block); - _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), - &dev_host_numj.begin(), &atom.dev_tag.begin(), - &dev_nspecial.begin(), &dev_special.begin(), + _shared->k_special.run(&dev_nbor, &nbor_host, &dev_numj_host, + &atom.dev_tag, &dev_nspecial, &dev_special, &inum, &nt, &_max_nbors, &_threads_per_atom); } time_kernel.stop(); time_nbor.start(); if (inumsize) + _cells_in_cutoff=static_cast(ceil(cutoff/size)); + else + _cells_in_cutoff=1; + } /// Get the size of the cutoff+skin inline double cell_size() const { return _cell_size; } @@ -203,14 +196,11 @@ class Neighbor { // ----------------- Data for GPU Neighbor Calculation --------------- - /// Host storage for device calculated neighbor lists - /** Same storage format as device matrix **/ - UCL_H_Vec host_nbor; - /// Device storage for neighbor list matrix that will be copied to host + /// Host/Device storage for device calculated neighbor lists /** - 1st row is numj * - Remaining rows are by atom, columns are nbors **/ - UCL_D_Vec dev_host_nbor; - UCL_D_Vec dev_host_numj; + UCL_Vector nbor_host; + UCL_D_Vec dev_numj_host; UCL_H_Vec host_ilist; UCL_H_Vec host_jlist; /// Device storage for special neighbor counts @@ -232,13 +222,14 @@ class Neighbor { bool _allocated, _use_packing, _nbor_time_avail, _time_device; int _gpu_nbor, _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial; bool _gpu_host, _alloc_packed; - double _cell_size, _bin_time; + double _cutoff, _cell_size, _bin_time; double _gpu_bytes, _c_bytes, _cell_bytes; void alloc(bool &success); int _block_cell_2d, _block_cell_id, _max_block_nbor_build, _block_nbor_build; int _ncells, _threads_per_atom, _total_atoms; + int _cells_in_cutoff; template inline void resize_max_neighbors(const int maxn, bool &success); diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu index 99990ece67..ebd18e2b2b 100644 --- a/lib/gpu/lal_neighbor_gpu.cu +++ b/lib/gpu/lal_neighbor_gpu.cu @@ -16,38 +16,48 @@ #ifdef NV_KERNEL #include "lal_preprocessor.h" -texture neigh_tex; #ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(neigh_tex, i); } +texture pos_tex; +#else +texture pos_tex; #endif __kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id, - numtyp boxlo0, - numtyp boxlo1, numtyp boxlo2, numtyp boxhi0, - numtyp boxhi1, numtyp boxhi2, numtyp cell_size, - int ncellx, int ncelly, int nall) { + numtyp boxlo0, numtyp boxlo1, numtyp boxlo2, + numtyp i_cell_size, int ncellx, int ncelly, + int ncellz, int inum, int nall, + int cells_in_cutoff) { int i = threadIdx.x + blockIdx.x*blockDim.x; if (i < nall) { - numtyp4 p = fetch_pos(i,pos); //pos[i]; + numtyp4 p; + fetch4(p,i,pos_tex); //pos[i]; p.x -= boxlo0; p.y -= boxlo1; p.z -= boxlo2; - p.x = fmaxf(p.x, -cell_size); - p.x = fminf(p.x, boxhi0-boxlo0+cell_size); - p.y = fmaxf(p.y, -cell_size); - p.y = fminf(p.y, boxhi1-boxlo1+cell_size); - p.z = fmaxf(p.z, -cell_size); - p.z = fminf(p.z, boxhi2-boxlo2+cell_size); + int ix = int(p.x*i_cell_size+cells_in_cutoff); + int iy = int(p.y*i_cell_size+cells_in_cutoff); + int iz = int(p.z*i_cell_size+cells_in_cutoff); - unsigned int id = (unsigned int)(p.x/cell_size + 1.0) - + (unsigned int)(p.y/cell_size + 1.0) * ncellx - + (unsigned int)(p.z/cell_size + 1.0) * ncellx * ncelly; + int offset_lo, offset_hi; + if (i @@ -51,7 +54,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, grdtyp **rho_coeff, - grdtyp **vd_brick, const double slab_volfactor, + grdtyp **vd_brick_p, const double slab_volfactor, const int nx_pppm, const int ny_pppm, const int nz_pppm, const bool split, int &flag) { _max_bytes=10; @@ -92,8 +95,8 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, time_interp.init(*ucl_device); time_interp.zero(); - pos_tex.bind_float(atom->dev_x,4); - q_tex.bind_float(atom->dev_q,1); + pos_tex.bind_float(atom->x,4); + q_tex.bind_float(atom->q,1); _allocated=true; _max_bytes=0; @@ -133,14 +136,12 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, _npts_y=nyhi_out-nylo_out+1; _npts_z=nzhi_out-nzlo_out+1; _npts_yx=_npts_x*_npts_y; - success=success && (d_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)== + success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)== UCL_SUCCESS); - success=success && (h_brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)== + success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)== UCL_SUCCESS); - success=success && (h_vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)== - UCL_SUCCESS); - *vd_brick=h_vd_brick.begin(); - _max_bytes+=d_brick.row_bytes(); + *vd_brick_p=vd_brick.host.begin(); + _max_bytes+=brick.device.row_bytes()+vd_brick.device.row_bytes(); // Allocate vector with count of atoms assigned to each grid point _nlocal_x=_npts_x+_nlower-_nupper; @@ -158,20 +159,19 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, _max_bytes+=d_brick_atoms.row_bytes(); // Allocate error flags for checking out of bounds atoms - success=success && (h_error_flag.alloc(1,*ucl_device)==UCL_SUCCESS); - success=success && (d_error_flag.alloc(1,*ucl_device,UCL_WRITE_ONLY)== - UCL_SUCCESS); + success=success && (error_flag.alloc(1,*ucl_device,UCL_RW_OPTIMIZED, + UCL_WRITE_ONLY)==UCL_SUCCESS); if (!success) { flag=-3; return 0; } - d_error_flag.zero(); + error_flag.device.zero(); _max_bytes+=1; _cpu_idle_time=0.0; - return h_brick.begin(); + return brick.host.begin(); } template @@ -181,12 +181,10 @@ void PPPMT::clear(const double cpu_time) { _allocated=false; _precompute_done=false; - d_brick.clear(); - h_brick.clear(); - h_vd_brick.clear(); + brick.clear(); + vd_brick.clear(); d_brick_counts.clear(); - h_error_flag.clear(); - d_error_flag.clear(); + error_flag.clear(); d_brick_atoms.clear(); acc_timers(); @@ -269,11 +267,11 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, device->zero(d_brick_counts,d_brick_counts.numel()); k_particle_map.set_size(GX,BX); - k_particle_map.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &f_delvolinv, - &ainum, &d_brick_counts.begin(), &d_brick_atoms.begin(), - &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, - &_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z, - &_atom_stride, &_max_brick_atoms, &d_error_flag.begin()); + k_particle_map.run(&atom->x, &atom->q, &f_delvolinv, &ainum, + &d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y, + &_brick_z, &_delxinv, &_delyinv, &_delzinv, &_nlocal_x, + &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms, + &error_flag); time_map.stop(); time_rho.start(); @@ -282,15 +280,14 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, GX=static_cast(ceil(static_cast(_npts_y*_npts_z)/ _block_pencils)); k_make_rho.set_size(GX,BX); - k_make_rho.run(&d_brick_counts.begin(), &d_brick_atoms.begin(), - &d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride, - &_npts_x, &_npts_y, &_npts_z, &_nlocal_x, &_nlocal_y, - &_nlocal_z, &_order_m_1, &_order, &_order2); + k_make_rho.run(&d_brick_counts, &d_brick_atoms, &brick, &d_rho_coeff, + &_atom_stride, &_npts_x, &_npts_y, &_npts_z, &_nlocal_x, + &_nlocal_y, &_nlocal_z, &_order_m_1, &_order, &_order2); time_rho.stop(); time_out.start(); - ucl_copy(h_brick,d_brick,_npts_yx*_npts_z,true); - ucl_copy(h_error_flag,d_error_flag,true); + brick.update_host(_npts_yx*_npts_z,true); + error_flag.update_host(true); time_out.stop(); _precompute_done=true; @@ -322,18 +319,17 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall, _precompute_done=false; - if (h_error_flag[0]==2) { + if (error_flag[0]==2) { // Not enough storage for atoms on the brick _max_brick_atoms*=2; - d_error_flag.zero(); - d_brick_atoms.clear(); - d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,*ucl_device); + error_flag.device.zero(); + d_brick_atoms.resize(_atom_stride*_max_brick_atoms); _max_bytes+=d_brick_atoms.row_bytes(); return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, delxinv,delyinv,delzinv); } - return h_error_flag[0]; + return error_flag[0]; } // --------------------------------------------------------------------------- @@ -342,7 +338,7 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall, template void PPPMT::interp(const grdtyp qqrd2e_scale) { time_in.start(); - ucl_copy(d_brick,h_vd_brick,true); + vd_brick.update_device(true); time_in.stop(); time_interp.start(); @@ -353,10 +349,10 @@ void PPPMT::interp(const grdtyp qqrd2e_scale) { int ainum=this->ans->inum(); k_interp.set_size(GX,BX); - k_interp.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum, - &d_brick.begin(), &d_rho_coeff.begin(), &_npts_x, &_npts_yx, - &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv, - &_order, &_order2, &qqrd2e_scale, &ans->dev_ans.begin()); + k_interp.run(&atom->x, &atom->q, &ainum, &vd_brick, &d_rho_coeff, + &_npts_x, &_npts_yx, &_brick_x, &_brick_y, &_brick_z, &_delxinv, + &_delyinv, &_delzinv, &_order, &_order2, &qqrd2e_scale, + &ans->force); time_interp.stop(); ans->copy_answers(false,false,false,false); @@ -408,4 +404,3 @@ void PPPMT::compile_kernels(UCL_Device &dev) { template class PPPM; template class PPPM; - diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu index 5204180e83..646afa5900 100644 --- a/lib/gpu/lal_pppm.cu +++ b/lib/gpu/lal_pppm.cu @@ -14,14 +14,14 @@ // ***************************************************************************/ #ifdef NV_KERNEL + #include "lal_preprocessor.h" +#ifndef _DOUBLE_DOUBLE texture pos_tex; texture q_tex; -#ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } -ucl_inline float fetch_q(const int& i, const float *q) - { return tex1Dfetch(q_tex, i); } +#else +texture pos_tex; +texture q_tex; #endif // Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error @@ -31,6 +31,8 @@ ucl_inline float fetch_q(const int& i, const float *q) #endif #else +#define pos_tex x_ +#define q_tex q_ #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable #endif @@ -59,9 +61,11 @@ __kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_, int nx,ny,nz; if (iiresize(nall, success)) { - pos_tex.bind_float(atom->dev_x,4); - q_tex.bind_float(atom->dev_q,1); + pos_tex.bind_float(atom->x,4); + q_tex.bind_float(atom->q,1); } ans->resize(inum,success); } @@ -138,8 +140,8 @@ class PPPM { // --------------------------- GRID DATA -------------------------- - UCL_H_Vec h_brick, h_vd_brick; - UCL_D_Vec d_brick; + UCL_Vector brick; + UCL_Vector vd_brick; // Count of number of atoms assigned to each grid point UCL_D_Vec d_brick_counts; @@ -147,8 +149,7 @@ class PPPM { UCL_D_Vec d_brick_atoms; // Error checking for out of bounds atoms - UCL_D_Vec d_error_flag; - UCL_H_Vec h_error_flag; + UCL_Vector error_flag; // Number of grid points in brick (including ghost) int _npts_x, _npts_y, _npts_z, _npts_yx; diff --git a/lib/gpu/lal_precision.h b/lib/gpu/lal_precision.h index 03c41a0df3..e31b10037e 100644 --- a/lib/gpu/lal_precision.h +++ b/lib/gpu/lal_precision.h @@ -16,6 +16,10 @@ #ifndef LAL_PRECISION_H #define LAL_PRECISION_H +#if defined(USE_CUDART) +#include +#endif + struct _lgpu_int2 { int x; int y; }; @@ -108,3 +112,4 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #endif #endif + diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 0ea3a1ca6d..b817bbe551 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -107,7 +107,7 @@ #define BLOCK_NBOR_BUILD 128 #define BLOCK_PAIR 128 #define BLOCK_BIO_PAIR 128 -#define MAX_SHARED_TYPES 11 +#define MAX_SHARED_TYPES 8 #else @@ -129,8 +129,21 @@ #define MAX_BIO_SHARED_TYPES 128 #ifdef _DOUBLE_DOUBLE -ucl_inline double4 fetch_pos(const int& i, const double4 *pos) { return pos[i]; }; -ucl_inline double fetch_q(const int& i, const double *q) { return q[i]; }; +#define fetch4(ans,i,pos_tex) { \ + int4 xy = tex1Dfetch(pos_tex,i*2); \ + int4 zt = tex1Dfetch(pos_tex,i*2+1); \ + ans.x=__hiloint2double(xy.y, xy.x); \ + ans.y=__hiloint2double(xy.w, xy.z); \ + ans.z=__hiloint2double(zt.y, zt.x); \ + ans.w=__hiloint2double(zt.w, zt.z); \ +} +#define fetch(ans,i,q_tex) { \ + int2 qt = tex1Dfetch(q_tex,i); \ + ans=__hiloint2double(qt.y, qt.x); \ +} +#else +#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i); +#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i); #endif #if (__CUDA_ARCH__ < 200) @@ -293,8 +306,8 @@ typedef struct _double4 double4; #define BLOCK_ID_Y get_group_id(1) #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) #define ucl_inline inline -#define fetch_pos(i,y) x_[i] -#define fetch_q(i,y) q_[i] +#define fetch4(ans,i,x) ans=x[i] +#define fetch(ans,i,q) ans=q[i] #define ucl_atan atan #define ucl_cbrt cbrt diff --git a/lib/gpu/lal_re_squared.cpp b/lib/gpu/lal_re_squared.cpp index 50f0503b3d..8f7ef24a11 100644 --- a/lib/gpu/lal_re_squared.cpp +++ b/lib/gpu/lal_re_squared.cpp @@ -13,12 +13,15 @@ email : brownw@ornl.gov ***************************************************************************/ -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "re_squared_cl.h" #include "re_squared_lj_cl.h" +#elif defined(USE_CUDART) +const char *re_squared=0; +const char *re_squared_lj=0; #else -#include "re_squared_ptx.h" -#include "re_squared_lj_ptx.h" +#include "re_squared_cubin.h" +#include "re_squared_lj_cubin.h" #endif #include "lal_re_squared.h" @@ -54,7 +57,8 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, const double gpu_split, FILE *_screen) { int success; success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,ntypes,h_form,re_squared,re_squared_lj,true); + _screen,ntypes,h_form,re_squared,re_squared_lj, + "k_resquared",true); if (success!=0) return success; @@ -198,13 +202,13 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->time_ellipsoid.start(); this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->dev_x.begin(), - &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), - &this->special_lj.begin(), &this->sigma_epsilon.begin(), - &this->_lj_types, &this->nbor->dev_nbor.begin(), &stride, - &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(), - &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse, - &this->_threads_per_atom); + this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + &this->shape, &this->well, &this->special_lj, + &this->sigma_epsilon, &this->_lj_types, + &this->nbor->dev_nbor, &stride, + &this->ans->force,&ainum, &this->ans->engv, + &this->dev_error, &eflag, &vflag, + &this->_last_ellipse, &this->_threads_per_atom); this->time_ellipsoid.stop(); // ------------ ELLIPSE_SPHERE --------------- @@ -215,13 +219,14 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->time_ellipsoid2.start(); this->k_ellipsoid_sphere.set_size(GX,BX); - this->k_ellipsoid_sphere.run(&this->atom->dev_x.begin(), - &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), - &this->special_lj.begin(), &this->sigma_epsilon.begin(), - &this->_lj_types, &this->nbor->dev_nbor.begin(), &stride, - &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(), - &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse, - &this->_threads_per_atom); + this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat, + &this->shape, &this->well, &this->special_lj, + &this->sigma_epsilon, &this->_lj_types, + &this->nbor->dev_nbor, &stride, + &this->ans->force,&ainum, + &this->ans->engv, &this->dev_error, + &eflag, &vflag, &this->_last_ellipse, + &this->_threads_per_atom); this->time_ellipsoid2.stop(); if (this->_last_ellipse==this->ans->inum()) { @@ -245,17 +250,18 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->time_ellipsoid3.start(); this->k_sphere_ellipsoid.set_size(GX,BX); - this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(), - &this->atom->dev_quat.begin(), &this->shape.begin(), - &this->well.begin(), &this->special_lj.begin(), - &this->sigma_epsilon.begin(), &this->_lj_types, - &this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag, - &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); + this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat, + &this->shape, &this->well, &this->special_lj, + &this->sigma_epsilon, &this->_lj_types, + &this->nbor->dev_nbor, &stride, + &this->ans->force, &this->ans->engv, + &this->dev_error, &eflag, &vflag, + &this->_last_ellipse, &ainum, + &this->_threads_per_atom); this->time_ellipsoid3.stop(); } else { - this->ans->dev_ans.zero(); - this->ans->dev_engv.zero(); + this->ans->force.zero(); + this->ans->engv.zero(); this->time_nbor1.zero(); this->time_ellipsoid.zero(); this->time_nbor2.zero(); @@ -269,19 +275,19 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { if (this->_last_ellipseans->inum()) { if (this->_shared_types) { this->k_lj_fast.set_size(GX,BX); - this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(), - &this->lj3.begin(), &this->special_lj.begin(), &stride, - &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &this->dev_error.begin(), - &eflag, &vflag, &this->_last_ellipse, &ainum, - &this->_threads_per_atom); + this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3, + &this->special_lj, &stride, + &this->nbor->dev_packed, &this->ans->force, + &this->ans->engv, &this->dev_error, + &eflag, &vflag, &this->_last_ellipse, &ainum, + &this->_threads_per_atom); } else { this->k_lj.set_size(GX,BX); - this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(), - &this->lj3.begin(), &this->_lj_types, &this->special_lj.begin(), - &stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag, - &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); + this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3, + &this->_lj_types, &this->special_lj, &stride, + &this->nbor->dev_packed, &this->ans->force, + &this->ans->engv, &this->dev_error, &eflag, &vflag, + &this->_last_ellipse, &ainum, &this->_threads_per_atom); } } this->time_lj.stop(); @@ -295,13 +301,12 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->time_nbor1.stop(); this->time_ellipsoid.start(); this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->dev_x.begin(), - &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), - &this->special_lj.begin(), &this->sigma_epsilon.begin(), - &this->_lj_types, &this->nbor->dev_nbor.begin(), &stride, - &this->ans->dev_ans.begin(), &ainum, &this->ans->dev_engv.begin(), - &this->dev_error.begin(), &eflag, &vflag, &ainum, - &this->_threads_per_atom); + this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + &this->shape, &this->well, &this->special_lj, + &this->sigma_epsilon, &this->_lj_types, + &this->nbor->dev_nbor, &stride, &this->ans->force, + &ainum, &this->ans->engv, &this->dev_error, + &eflag, &vflag, &ainum, &this->_threads_per_atom); this->time_ellipsoid.stop(); } } diff --git a/lib/gpu/lal_re_squared.cu b/lib/gpu/lal_re_squared.cu index 63057a30d9..c858b09801 100644 --- a/lib/gpu/lal_re_squared.cu +++ b/lib/gpu/lal_re_squared.cu @@ -32,15 +32,15 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9]) return ans; } -__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q, - __global numtyp4* shape, __global numtyp4* well, - __global numtyp *splj, __global numtyp2* sig_eps, - const int ntypes, __global int *dev_nbor, - const int stride, __global acctyp4 *ans, - const int astride, __global acctyp *engv, - __global int *err_flag, const int eflag, - const int vflag, const int inum, - const int t_per_atom) { +__kernel void k_resquared(__global numtyp4* x_,__global numtyp4 *q, + __global numtyp4* shape, __global numtyp4* well, + __global numtyp *splj, __global numtyp2* sig_eps, + const int ntypes, __global int *dev_nbor, + const int stride, __global acctyp4 *ans, + const int astride, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag, const int inum, + const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -73,7 +73,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q, nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - numtyp4 ix=x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; numtyp a1[9]; // Rotation matrix (lab->body) @@ -122,7 +122,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); int jtype=jx.w; // Compute r12 diff --git a/lib/gpu/lal_re_squared_lj.cu b/lib/gpu/lal_re_squared_lj.cu index a0c82ea294..5c46c21e45 100644 --- a/lib/gpu/lal_re_squared_lj.cu +++ b/lib/gpu/lal_re_squared_lj.cu @@ -17,12 +17,13 @@ #include "lal_ellipsoid_extra.h" #endif -__kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q, - __global numtyp4* shape, __global numtyp4* well, - __global numtyp *splj, __global numtyp2* sig_eps, - const int ntypes, __global int *dev_nbor, const int stride, - __global acctyp4 *ans, const int astride, - __global acctyp *engv, __global int *err_flag, +__kernel void k_resquared_ellipsoid_sphere(__global numtyp4* x_, + __global numtyp4 *q, __global numtyp4* shape, + __global numtyp4* well, __global numtyp *splj, + __global numtyp2* sig_eps, const int ntypes, + __global int *dev_nbor, const int stride, + __global acctyp4 *ans, const int astride, + __global acctyp *engv, __global int *err_flag, const int eflag, const int vflag, const int inum, const int t_per_atom) { int tid, ii, offset; @@ -59,7 +60,7 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q, nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - numtyp4 ix=x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; numtyp a[9]; // Rotation matrix (lab->body) @@ -84,7 +85,7 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); int jtype=jx.w; // Compute r12 @@ -331,14 +332,14 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q, } // if ii } -__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q, - __global numtyp4* shape,__global numtyp4* well, - __global numtyp *splj, __global numtyp2* sig_eps, - const int ntypes, __global int *dev_nbor, - const int stride, __global acctyp4 *ans, - __global acctyp *engv, __global int *err_flag, - const int eflag, const int vflag,const int start, - const int inum, const int t_per_atom) { +__kernel void k_resquared_sphere_ellipsoid(__global numtyp4 *x_, + __global numtyp4 *q, __global numtyp4* shape, + __global numtyp4* well, __global numtyp *splj, + __global numtyp2* sig_eps, const int ntypes, + __global int *dev_nbor, const int stride, + __global acctyp4 *ans, __global acctyp *engv, + __global int *err_flag, const int eflag, const int vflag, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; @@ -370,7 +371,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q, nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj, n_stride,nbor_end,nbor); - numtyp4 jx=x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); int jtype=jx.w; numtyp factor_lj; @@ -379,7 +380,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q, factor_lj = sp_lj[sbmask(i)]; i &= NEIGHMASK; - numtyp4 ix=x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; numtyp a[9]; // Rotation matrix (lab->body) @@ -524,14 +525,13 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q, } // if ii } -__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, - __global numtyp4* lj3, const int lj_types, - __global numtyp *gum, - const int stride, __global int *dev_ij, - __global acctyp4 *ans, __global acctyp *engv, - __global int *err_flag, const int eflag, - const int vflag, const int start, const int inum, - const int t_per_atom) { +__kernel void k_resquared_lj(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *gum, const int stride, + __global int *dev_ij, __global acctyp4 *ans, + __global acctyp *engv, __global int *err_flag, + const int eflag, const int vflag, const int start, + const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; @@ -557,7 +557,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; numtyp factor_lj; @@ -567,7 +567,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); int jtype=jx.w; // Compute r12 @@ -606,13 +606,12 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, } // if ii } -__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, - __global numtyp4* lj3_in, __global numtyp *gum, - const int stride, __global int *dev_ij, - __global acctyp4 *ans, __global acctyp *engv, - __global int *err_flag, const int eflag, - const int vflag, const int start, const int inum, - const int t_per_atom) { +__kernel void k_resquared_lj_fast(__global numtyp4 *x_, + __global numtyp4 *lj1_in, __global numtyp4* lj3_in, + __global numtyp *gum, const int stride, __global int *dev_ij, + __global acctyp4 *ans, __global acctyp *engv, + __global int *err_flag, const int eflag, const int vflag, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; @@ -645,7 +644,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); @@ -656,7 +655,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); int mtype=itype+jx.w; // Compute r12 diff --git a/lib/gpu/lal_table.cpp b/lib/gpu/lal_table.cpp index 5b3d934e53..d7c84d65ca 100644 --- a/lib/gpu/lal_table.cpp +++ b/lib/gpu/lal_table.cpp @@ -13,10 +13,12 @@ email : nguyentd@ornl.gov ***************************************************************************/ -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "table_cl.h" +#elif defined(USE_CUDART) +const char *table=0; #else -#include "table_ptx.h" +#include "table_cubin.h" #endif #include "lal_table.h" @@ -56,17 +58,17 @@ int TableT::init(const int ntypes, const double gpu_split, FILE *_screen, int tabstyle, int ntables, int tablength) { int success; - success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,table); + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,table,"k_table"); if (success!=0) return success; - k_pair_linear.set_function(*(this->pair_program),"kernel_pair_linear"); - k_pair_linear_fast.set_function(*(this->pair_program),"kernel_pair_linear_fast"); - k_pair_spline.set_function(*(this->pair_program),"kernel_pair_spline"); - k_pair_spline_fast.set_function(*(this->pair_program),"kernel_pair_spline_fast"); - k_pair_bitmap.set_function(*(this->pair_program),"kernel_pair_bitmap"); - k_pair_bitmap_fast.set_function(*(this->pair_program),"kernel_pair_bitmap_fast"); + k_pair_linear.set_function(*(this->pair_program),"k_table_linear"); + k_pair_linear_fast.set_function(*(this->pair_program),"k_table_linear_fast"); + k_pair_spline.set_function(*(this->pair_program),"k_table_spline"); + k_pair_spline_fast.set_function(*(this->pair_program),"k_table_spline_fast"); + k_pair_bitmap.set_function(*(this->pair_program),"k_table_bitmap"); + k_pair_bitmap_fast.set_function(*(this->pair_program),"k_table_bitmap_fast"); _compiled_styles = true; // If atom type constants fit in shared memory use fast kernel @@ -264,84 +266,71 @@ void TableT::loop(const bool _eflag, const bool _vflag) { if (shared_types) { if (_tabstyle == LOOKUP) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(), - &coeff2.begin(), &coeff3.begin(), - &coeff4.begin(), &cutsq.begin(), &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, &this->_threads_per_atom, - &_tablength); + this->k_pair_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3, + &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, + &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == LINEAR) { this->k_pair_linear_fast.set_size(GX,BX); - this->k_pair_linear_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(), - &coeff2.begin(), &coeff3.begin(), - &coeff4.begin(), &cutsq.begin(), &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, &this->_threads_per_atom, - &_tablength); + this->k_pair_linear_fast.run(&this->atom->x, &tabindex, &coeff2, + &coeff3, &coeff4, &cutsq, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &_tablength); } else if (_tabstyle == SPLINE) { this->k_pair_spline_fast.set_size(GX,BX); - this->k_pair_spline_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(), - &coeff2.begin(), &coeff3.begin(), - &coeff4.begin(), &cutsq.begin(), &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, &this->_threads_per_atom, - &_tablength); + this->k_pair_spline_fast.run(&this->atom->x, &tabindex, &coeff2, + &coeff3, &coeff4, &cutsq, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &_tablength); } else if (_tabstyle == BITMAP) { this->k_pair_bitmap_fast.set_size(GX,BX); - this->k_pair_bitmap_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(), - &nshiftbits.begin(), &nmask.begin(), - &coeff2.begin(), &coeff3.begin(), - &coeff4.begin(), &cutsq.begin(), &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, &this->_threads_per_atom, - &_tablength); + this->k_pair_bitmap_fast.run(&this->atom->x, &tabindex, &nshiftbits, + &nmask, &coeff2, &coeff3, &coeff4, &cutsq, + &sp_lj, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->ans->force, + &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, + &this->_threads_per_atom, &_tablength); } } else { if (_tabstyle == LOOKUP) { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &tabindex.begin(), - &coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types, - &cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &_tablength); + this->k_pair.run(&this->atom->x, &tabindex, &coeff2, &coeff3, + &coeff4, &_lj_types, &cutsq, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, + &_tablength); } else if (_tabstyle == LINEAR) { this->k_pair_linear.set_size(GX,BX); - this->k_pair_linear.run(&this->atom->dev_x.begin(), &tabindex.begin(), - &coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types, - &cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &_tablength); + this->k_pair_linear.run(&this->atom->x, &tabindex, &coeff2, &coeff3, + &coeff4, &_lj_types, &cutsq, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &_tablength); } else if (_tabstyle == SPLINE) { this->k_pair_spline.set_size(GX,BX); - this->k_pair_spline.run(&this->atom->dev_x.begin(), &tabindex.begin(), - &coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types, - &cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &_tablength); + this->k_pair_spline.run(&this->atom->x, &tabindex, &coeff2, &coeff3, + &coeff4, &_lj_types, &cutsq, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &_tablength); } else if (_tabstyle == BITMAP) { this->k_pair_bitmap.set_size(GX,BX); - this->k_pair_bitmap.run(&this->atom->dev_x.begin(), &tabindex.begin(), - &nshiftbits.begin(), &nmask.begin(), - &coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types, - &cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &_tablength); + this->k_pair_bitmap.run(&this->atom->x, &tabindex, &nshiftbits, + &nmask, &coeff2, &coeff3, &coeff4, &_lj_types, + &cutsq, &sp_lj, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, + &nbor_pitch, &this->_threads_per_atom, + &_tablength); } } this->time_pair.stop(); diff --git a/lib/gpu/lal_table.cu b/lib/gpu/lal_table.cu index 65db9b9b15..4730c52cd1 100644 --- a/lib/gpu/lal_table.cu +++ b/lib/gpu/lal_table.cu @@ -15,11 +15,13 @@ #ifdef NV_KERNEL #include "lal_aux_fun1.h" -texture pos_tex; #ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } +texture pos_tex; +#else +texture pos_tex; #endif +#else +#define pos_tex x_ #endif #define LOOKUP 0 @@ -37,7 +39,7 @@ typedef union { /// ---------------- LOOKUP ------------------------------------------------- -__kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex, +__kernel void k_table(__global numtyp4 *x_, __global int *tabindex, __global numtyp4* coeff2, __global numtyp4 *coeff3, __global numtyp4 *coeff4, @@ -73,7 +75,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp factor_lj; @@ -83,7 +85,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int mtype=itype*lj_types+jx.w; int tbindex = tabindex[mtype]; @@ -128,7 +130,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex, } // if ii } -__kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex, +__kernel void k_table_fast(__global numtyp4 *x_, __global int *tabindex, __global numtyp4* coeff2, __global numtyp4 *coeff3, __global numtyp4 *coeff4, @@ -167,7 +169,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); @@ -178,7 +180,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int mtype=itype+jx.w; int tbindex = tabindex[mtype]; @@ -225,7 +227,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex, /// ---------------- LINEAR ------------------------------------------------- -__kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex, +__kernel void k_table_linear(__global numtyp4 *x_, __global int *tabindex, __global numtyp4* coeff2, __global numtyp4 *coeff3, __global numtyp4 *coeff4, @@ -261,7 +263,7 @@ __kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp factor_lj; @@ -271,7 +273,7 @@ __kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int mtype=itype*lj_types+jx.w; int tbindex = tabindex[mtype]; @@ -320,7 +322,7 @@ __kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex, } // if ii } -__kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabindex, +__kernel void k_table_linear_fast(__global numtyp4 *x_, __global int *tabindex, __global numtyp4* coeff2, __global numtyp4 *coeff3, __global numtyp4 *coeff4, @@ -359,7 +361,7 @@ __kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabind nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); @@ -370,7 +372,7 @@ __kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabind factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int mtype=itype+jx.w; int tbindex = tabindex[mtype]; @@ -421,7 +423,7 @@ __kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabind /// ---------------- SPLINE ------------------------------------------------- -__kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex, +__kernel void k_table_spline(__global numtyp4 *x_, __global int *tabindex, __global numtyp4* coeff2, __global numtyp4 *coeff3, __global numtyp4 *coeff4, @@ -457,7 +459,7 @@ __kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp factor_lj; @@ -467,7 +469,7 @@ __kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int mtype=itype*lj_types+jx.w; int tbindex = tabindex[mtype]; @@ -523,7 +525,7 @@ __kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex, } // if ii } -__kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabindex, +__kernel void k_table_spline_fast(__global numtyp4 *x_, __global int *tabindex, __global numtyp4* coeff2, __global numtyp4 *coeff3, __global numtyp4 *coeff4, @@ -562,7 +564,7 @@ __kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabind nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); @@ -573,7 +575,7 @@ __kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabind factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int mtype=itype+jx.w; int tbindex = tabindex[mtype]; @@ -631,7 +633,7 @@ __kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabind /// ---------------- BITMAP ------------------------------------------------- -__kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex, +__kernel void k_table_bitmap(__global numtyp4 *x_, __global int *tabindex, __global int *nshiftbits, __global int *nmask, __global numtyp4* coeff2, __global numtyp4 *coeff3, @@ -668,7 +670,7 @@ __kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp factor_lj; @@ -678,7 +680,7 @@ __kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int mtype=itype*lj_types+jx.w; int tbindex = tabindex[mtype]; @@ -730,7 +732,7 @@ __kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex, } // if ii } -__kernel void kernel_pair_bitmap_fast(__global numtyp4 *x_, __global int *tabindex, +__kernel void k_table_bitmap_fast(__global numtyp4 *x_, __global int *tabindex, __global int *nshiftbits, __global int *nmask, __global numtyp4* coeff2, __global numtyp4 *coeff3, @@ -770,7 +772,7 @@ __kernel void kernel_pair_bitmap_fast(__global numtyp4 *x_, __global int *tabind nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); @@ -781,7 +783,7 @@ __kernel void kernel_pair_bitmap_fast(__global numtyp4 *x_, __global int *tabind factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int mtype=itype+jx.w; int tbindex = tabindex[mtype]; diff --git a/lib/gpu/lal_yukawa.cpp b/lib/gpu/lal_yukawa.cpp index 5ab94ae817..acde5e9890 100644 --- a/lib/gpu/lal_yukawa.cpp +++ b/lib/gpu/lal_yukawa.cpp @@ -13,10 +13,12 @@ email : nguyentd@ornl.gov ***************************************************************************/ -#ifdef USE_OPENCL +#if defined(USE_OPENCL) #include "yukawa_cl.h" +#elif defined(USE_CUDART) +const char *yukawa=0; #else -#include "yukawa_ptx.h" +#include "yukawa_cubin.h" #endif #include "lal_yukawa.h" @@ -50,7 +52,7 @@ int YukawaT::init(const int ntypes, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,yukawa); + _screen,yukawa,"k_yukawa"); if (success!=0) return success; @@ -129,20 +131,17 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff.begin(), &_kappa, - &sp_lj.begin(), - &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), - &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, - &ainum, &nbor_pitch, &this->_threads_per_atom); + this->k_pair_fast.run(&this->atom->x, &coeff, &_kappa, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->dev_x.begin(), &coeff.begin(), &_kappa, - &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), - &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + this->k_pair.run(&this->atom->x, &coeff, &_kappa, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lal_yukawa.cu b/lib/gpu/lal_yukawa.cu index e2fa11aa36..593d123776 100644 --- a/lib/gpu/lal_yukawa.cu +++ b/lib/gpu/lal_yukawa.cu @@ -15,14 +15,16 @@ #ifdef NV_KERNEL #include "lal_aux_fun1.h" -texture pos_tex; #ifndef _DOUBLE_DOUBLE -ucl_inline float4 fetch_pos(const int& i, const float4 *pos) - { return tex1Dfetch(pos_tex, i); } +texture pos_tex; +#else +texture pos_tex; #endif +#else +#define pos_tex x_ #endif -__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff, +__kernel void k_yukawa(__global numtyp4 *x_, __global numtyp4 *coeff, const numtyp kappa, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, @@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp factor_lj; @@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int jtype=jx.w; // Compute r12 @@ -103,7 +105,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff, } // if ii } -__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in, +__kernel void k_yukawa_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in, const numtyp kappa, __global numtyp* sp_lj_in, __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, @@ -135,7 +137,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,list_end,nbor); - numtyp4 ix=fetch_pos(i,x_); //x_[i]; + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul((int)MAX_SHARED_TYPES,iw); @@ -146,7 +148,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in, factor_lj = sp_lj[sbmask(j)]; j &= NEIGHMASK; - numtyp4 jx=fetch_pos(j,x_); //x_[j]; + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int mtype=itype+jx.w; // Compute r12