git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8693 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp
2012-08-21 13:57:32 +00:00
parent 9a99e27552
commit 31551d81fd
85 changed files with 2630 additions and 2172 deletions

View File

@@ -3,6 +3,7 @@ CUDA = $(NVCC) $(CUDA_INCLUDE) $(CUDA_OPTS) -Icudpp_mini $(CUDA_ARCH) \
CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
$(CUDPP_OPT)
CUDA_LINK = $(CUDA_LIB) -lcudart
BIN2C = $(CUDA_HOME)/bin/bin2c
GPU_LIB = $(LIB_DIR)/libgpu.a
@@ -27,6 +28,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_neighbor.o $(OBJ_DIR)/lal_neighbor_shared.o \
$(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
$(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
$(OBJ_DIR)/lal_base_dipole.o \
$(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
$(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
$(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@@ -35,6 +37,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_lj_expand.o $(OBJ_DIR)/lal_lj_expand_ext.o \
$(OBJ_DIR)/lal_lj_coul.o $(OBJ_DIR)/lal_lj_coul_ext.o \
$(OBJ_DIR)/lal_lj_coul_long.o $(OBJ_DIR)/lal_lj_coul_long_ext.o \
$(OBJ_DIR)/lal_lj_dsf.o $(OBJ_DIR)/lal_lj_dsf_ext.o \
$(OBJ_DIR)/lal_lj_class2_long.o $(OBJ_DIR)/lal_lj_class2_long_ext.o \
$(OBJ_DIR)/lal_coul_long.o $(OBJ_DIR)/lal_coul_long_ext.o \
$(OBJ_DIR)/lal_morse.o $(OBJ_DIR)/lal_morse_ext.o \
@@ -46,35 +49,57 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_buck_coul.o $(OBJ_DIR)/lal_buck_coul_ext.o \
$(OBJ_DIR)/lal_buck_coul_long.o $(OBJ_DIR)/lal_buck_coul_long_ext.o \
$(OBJ_DIR)/lal_table.o $(OBJ_DIR)/lal_table_ext.o \
$(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o
PTXS = $(OBJ_DIR)/device.ptx $(OBJ_DIR)/device_ptx.h \
$(OBJ_DIR)/atom.ptx $(OBJ_DIR)/atom_ptx.h \
$(OBJ_DIR)/neighbor_cpu.ptx $(OBJ_DIR)/neighbor_cpu_ptx.h \
$(OBJ_DIR)/neighbor_gpu.ptx $(OBJ_DIR)/neighbor_gpu_ptx.h \
$(OBJ_DIR)/pppm_f.ptx $(OBJ_DIR)/pppm_f_ptx.h \
$(OBJ_DIR)/pppm_d.ptx $(OBJ_DIR)/pppm_d_ptx.h \
$(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/ellipsoid_nbor_ptx.h \
$(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_lj.ptx \
$(OBJ_DIR)/gayberne_ptx.h $(OBJ_DIR)/gayberne_lj_ptx.h \
$(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_lj.ptx \
$(OBJ_DIR)/re_squared_ptx.h $(OBJ_DIR)/re_squared_lj_ptx.h \
$(OBJ_DIR)/lj.ptx $(OBJ_DIR)/lj_ptx.h \
$(OBJ_DIR)/lj96.ptx $(OBJ_DIR)/lj96_ptx.h \
$(OBJ_DIR)/lj_expand.ptx $(OBJ_DIR)/lj_expand_ptx.h \
$(OBJ_DIR)/lj_coul.ptx $(OBJ_DIR)/lj_coul_ptx.h \
$(OBJ_DIR)/lj_coul_long.ptx $(OBJ_DIR)/lj_coul_long_ptx.h \
$(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long_ptx.h \
$(OBJ_DIR)/coul_long.ptx $(OBJ_DIR)/coul_long_ptx.h \
$(OBJ_DIR)/morse.ptx $(OBJ_DIR)/morse_ptx.h \
$(OBJ_DIR)/charmm_long.ptx $(OBJ_DIR)/charmm_long_ptx.h \
$(OBJ_DIR)/cg_cmm.ptx $(OBJ_DIR)/cg_cmm_ptx.h \
$(OBJ_DIR)/cg_cmm_long.ptx $(OBJ_DIR)/cg_cmm_long_ptx.h \
$(OBJ_DIR)/eam.ptx $(OBJ_DIR)/eam_ptx.h \
$(OBJ_DIR)/buck.ptx $(OBJ_DIR)/buck_ptx.h \
$(OBJ_DIR)/buck_coul.ptx $(OBJ_DIR)/buck_coul_ptx.h \
$(OBJ_DIR)/buck_coul_long.ptx $(OBJ_DIR)/buck_coul_long_ptx.h \
$(OBJ_DIR)/table.ptx $(OBJ_DIR)/table_ptx.h \
$(OBJ_DIR)/yukawa.ptx $(OBJ_DIR)/yukawa_ptx.h
$(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o \
$(OBJ_DIR)/lal_born.o $(OBJ_DIR)/lal_born_ext.o \
$(OBJ_DIR)/lal_born_coul_wolf.o $(OBJ_DIR)/lal_born_coul_wolf_ext.o \
$(OBJ_DIR)/lal_born_coul_long.o $(OBJ_DIR)/lal_born_coul_long_ext.o \
$(OBJ_DIR)/lal_dipole_lj.o $(OBJ_DIR)/lal_dipole_lj_ext.o \
$(OBJ_DIR)/lal_dipole_lj_sf.o $(OBJ_DIR)/lal_dipole_lj_sf_ext.o \
$(OBJ_DIR)/lal_colloid.o $(OBJ_DIR)/lal_colloid_ext.o \
$(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
$(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
$(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o
CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
$(OBJ_DIR)/neighbor_cpu.cubin $(OBJ_DIR)/neighbor_cpu_cubin.h \
$(OBJ_DIR)/neighbor_gpu.cubin $(OBJ_DIR)/neighbor_gpu_cubin.h \
$(OBJ_DIR)/pppm_f.cubin $(OBJ_DIR)/pppm_f_cubin.h \
$(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/pppm_d_cubin.h \
$(OBJ_DIR)/ellipsoid_nbor.cubin $(OBJ_DIR)/ellipsoid_nbor_cubin.h \
$(OBJ_DIR)/gayberne.cubin $(OBJ_DIR)/gayberne_lj.cubin \
$(OBJ_DIR)/gayberne_cubin.h $(OBJ_DIR)/gayberne_lj_cubin.h \
$(OBJ_DIR)/re_squared.cubin $(OBJ_DIR)/re_squared_lj.cubin \
$(OBJ_DIR)/re_squared_cubin.h $(OBJ_DIR)/re_squared_lj_cubin.h \
$(OBJ_DIR)/lj.cubin $(OBJ_DIR)/lj_cubin.h \
$(OBJ_DIR)/lj96.cubin $(OBJ_DIR)/lj96_cubin.h \
$(OBJ_DIR)/lj_expand.cubin $(OBJ_DIR)/lj_expand_cubin.h \
$(OBJ_DIR)/lj_coul.cubin $(OBJ_DIR)/lj_coul_cubin.h \
$(OBJ_DIR)/lj_coul_long.cubin $(OBJ_DIR)/lj_coul_long_cubin.h \
$(OBJ_DIR)/lj_dsf.cubin $(OBJ_DIR)/lj_dsf_cubin.h \
$(OBJ_DIR)/lj_class2_long.cubin $(OBJ_DIR)/lj_class2_long_cubin.h \
$(OBJ_DIR)/coul_long.cubin $(OBJ_DIR)/coul_long_cubin.h \
$(OBJ_DIR)/morse.cubin $(OBJ_DIR)/morse_cubin.h \
$(OBJ_DIR)/charmm_long.cubin $(OBJ_DIR)/charmm_long_cubin.h \
$(OBJ_DIR)/cg_cmm.cubin $(OBJ_DIR)/cg_cmm_cubin.h \
$(OBJ_DIR)/cg_cmm_long.cubin $(OBJ_DIR)/cg_cmm_long_cubin.h \
$(OBJ_DIR)/eam.cubin $(OBJ_DIR)/eam_cubin.h \
$(OBJ_DIR)/buck.cubin $(OBJ_DIR)/buck_cubin.h \
$(OBJ_DIR)/buck_coul_long.cubin $(OBJ_DIR)/buck_coul_long_cubin.h \
$(OBJ_DIR)/buck_coul_wolf.cubin $(OBJ_DIR)/buck_coul_wolf_cubin.h \
$(OBJ_DIR)/table.cubin $(OBJ_DIR)/table_cubin.h \
$(OBJ_DIR)/yukawa.cubin $(OBJ_DIR)/yukawa_cubin.h \
$(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \
$(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf_cubin.h \
$(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long_cubin.h \
$(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj_cubin.h \
$(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf_cubin.h \
$(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid_cubin.h \
$(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \
$(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \
$(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \
$(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h
all: $(GPU_LIB) $(EXECS)
@@ -96,43 +121,43 @@ $(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
$(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
$(CUDA) -o $@ -c cudpp_mini/scan_app.cu
$(OBJ_DIR)/atom.ptx: lal_atom.cu lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_atom.cu
$(OBJ_DIR)/atom.cubin: lal_atom.cu lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_atom.cu
$(OBJ_DIR)/atom_ptx.h: $(OBJ_DIR)/atom.ptx
$(BSH) ./geryon/file_to_cstr.sh atom $(OBJ_DIR)/atom.ptx $(OBJ_DIR)/atom_ptx.h
$(OBJ_DIR)/atom_cubin.h: $(OBJ_DIR)/atom.cubin
$(BIN2C) -c -n atom $(OBJ_DIR)/atom.cubin > $(OBJ_DIR)/atom_cubin.h
$(OBJ_DIR)/lal_atom.o: lal_atom.cpp lal_atom.h $(NVD_H) $(OBJ_DIR)/atom_ptx.h
$(OBJ_DIR)/lal_atom.o: lal_atom.cpp lal_atom.h $(NVD_H) $(OBJ_DIR)/atom_cubin.h
$(CUDR) -o $@ -c lal_atom.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_ans.o: lal_answer.cpp lal_answer.h $(NVD_H)
$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/neighbor_cpu.ptx: lal_neighbor_cpu.cu lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_neighbor_cpu.cu
$(OBJ_DIR)/neighbor_cpu.cubin: lal_neighbor_cpu.cu lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_neighbor_cpu.cu
$(OBJ_DIR)/neighbor_cpu_ptx.h: $(OBJ_DIR)/neighbor_cpu.ptx
$(BSH) ./geryon/file_to_cstr.sh neighbor_cpu $(OBJ_DIR)/neighbor_cpu.ptx $(OBJ_DIR)/neighbor_cpu_ptx.h
$(OBJ_DIR)/neighbor_cpu_cubin.h: $(OBJ_DIR)/neighbor_cpu.cubin
$(BIN2C) -c -n neighbor_cpu $(OBJ_DIR)/neighbor_cpu.cubin > $(OBJ_DIR)/neighbor_cpu_cubin.h
$(OBJ_DIR)/neighbor_gpu.ptx: lal_neighbor_gpu.cu lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_neighbor_gpu.cu
$(OBJ_DIR)/neighbor_gpu.cubin: lal_neighbor_gpu.cu lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_neighbor_gpu.cu
$(OBJ_DIR)/neighbor_gpu_ptx.h: $(OBJ_DIR)/neighbor_gpu.ptx
$(BSH) ./geryon/file_to_cstr.sh neighbor_gpu $(OBJ_DIR)/neighbor_gpu.ptx $(OBJ_DIR)/neighbor_gpu_ptx.h
$(OBJ_DIR)/neighbor_gpu_cubin.h: $(OBJ_DIR)/neighbor_gpu.cubin
$(BIN2C) -c -n neighbor_gpu $(OBJ_DIR)/neighbor_gpu.cubin > $(OBJ_DIR)/neighbor_gpu_cubin.h
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp lal_neighbor_shared.h $(OBJ_DIR)/neighbor_cpu_ptx.h $(OBJ_DIR)/neighbor_gpu_ptx.h $(NVD_H)
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp lal_neighbor_shared.h $(OBJ_DIR)/neighbor_cpu_cubin.h $(OBJ_DIR)/neighbor_gpu_cubin.h $(NVD_H)
$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp lal_neighbor.h lal_neighbor_shared.h $(NVD_H)
$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/device.ptx: lal_device.cu lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_device.cu
$(OBJ_DIR)/device.cubin: lal_device.cu lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_device.cu
$(OBJ_DIR)/device_ptx.h: $(OBJ_DIR)/device.ptx
$(BSH) ./geryon/file_to_cstr.sh device $(OBJ_DIR)/device.ptx $(OBJ_DIR)/device_ptx.h
$(OBJ_DIR)/device_cubin.h: $(OBJ_DIR)/device.cubin
$(BIN2C) -c -n device $(OBJ_DIR)/device.cubin > $(OBJ_DIR)/device_cubin.h
$(OBJ_DIR)/lal_device.o: lal_device.cpp lal_device.h $(ALL_H) $(OBJ_DIR)/device_ptx.h
$(OBJ_DIR)/lal_device.o: lal_device.cpp lal_device.h $(ALL_H) $(OBJ_DIR)/device_cubin.h
$(CUDR) -o $@ -c lal_device.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_base_atomic.o: $(ALL_H) lal_base_atomic.h lal_base_atomic.cpp
@@ -141,273 +166,408 @@ $(OBJ_DIR)/lal_base_atomic.o: $(ALL_H) lal_base_atomic.h lal_base_atomic.cpp
$(OBJ_DIR)/lal_base_charge.o: $(ALL_H) lal_base_charge.h lal_base_charge.cpp
$(CUDR) -o $@ -c lal_base_charge.cpp
$(OBJ_DIR)/lal_base_ellipsoid.o: $(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_ptx.h
$(OBJ_DIR)/lal_base_ellipsoid.o: $(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_cubin.h
$(CUDR) -o $@ -c lal_base_ellipsoid.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pppm_f.ptx: lal_pppm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu
$(OBJ_DIR)/lal_base_dipole.o: $(ALL_H) lal_base_dipole.h lal_base_dipole.cpp
$(CUDR) -o $@ -c lal_base_dipole.cpp
$(OBJ_DIR)/pppm_f_ptx.h: $(OBJ_DIR)/pppm_f.ptx
$(BSH) ./geryon/file_to_cstr.sh pppm_f $(OBJ_DIR)/pppm_f.ptx $(OBJ_DIR)/pppm_f_ptx.h
$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu
$(OBJ_DIR)/pppm_d.ptx: lal_pppm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu
$(OBJ_DIR)/pppm_f_cubin.h: $(OBJ_DIR)/pppm_f.cubin
$(BIN2C) -c -n pppm_f $(OBJ_DIR)/pppm_f.cubin > $(OBJ_DIR)/pppm_f_cubin.h
$(OBJ_DIR)/pppm_d_ptx.h: $(OBJ_DIR)/pppm_d.ptx
$(BSH) ./geryon/file_to_cstr.sh pppm_d $(OBJ_DIR)/pppm_d.ptx $(OBJ_DIR)/pppm_d_ptx.h
$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu
$(OBJ_DIR)/lal_pppm.o: $(ALL_H) lal_pppm.h lal_pppm.cpp $(OBJ_DIR)/pppm_f_ptx.h $(OBJ_DIR)/pppm_d_ptx.h
$(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin
$(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h
$(OBJ_DIR)/lal_pppm.o: $(ALL_H) lal_pppm.h lal_pppm.cpp $(OBJ_DIR)/pppm_f_cubin.h $(OBJ_DIR)/pppm_d_cubin.h
$(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_pppm_ext.o: $(ALL_H) lal_pppm.h lal_pppm_ext.cpp
$(CUDR) -o $@ -c lal_pppm_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ellipsoid_nbor.ptx: lal_ellipsoid_nbor.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_ellipsoid_nbor.cu
$(OBJ_DIR)/ellipsoid_nbor.cubin: lal_ellipsoid_nbor.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_ellipsoid_nbor.cu
$(OBJ_DIR)/ellipsoid_nbor_ptx.h: $(OBJ_DIR)/ellipsoid_nbor.ptx
$(BSH) ./geryon/file_to_cstr.sh ellipsoid_nbor $(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/ellipsoid_nbor_ptx.h
$(OBJ_DIR)/ellipsoid_nbor_cubin.h: $(OBJ_DIR)/ellipsoid_nbor.cubin
$(BIN2C) -c -n ellipsoid_nbor $(OBJ_DIR)/ellipsoid_nbor.cubin > $(OBJ_DIR)/ellipsoid_nbor_cubin.h
$(OBJ_DIR)/gayberne.ptx: lal_gayberne.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_gayberne.cu
$(OBJ_DIR)/gayberne.cubin: lal_gayberne.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_gayberne.cu
$(OBJ_DIR)/gayberne_lj.ptx: lal_gayberne_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_gayberne_lj.cu
$(OBJ_DIR)/gayberne_lj.cubin: lal_gayberne_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_gayberne_lj.cu
$(OBJ_DIR)/gayberne_ptx.h: $(OBJ_DIR)/gayberne.ptx
$(BSH) ./geryon/file_to_cstr.sh gayberne $(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_ptx.h
$(OBJ_DIR)/gayberne_cubin.h: $(OBJ_DIR)/gayberne.cubin
$(BIN2C) -c -n gayberne $(OBJ_DIR)/gayberne.cubin > $(OBJ_DIR)/gayberne_cubin.h
$(OBJ_DIR)/gayberne_lj_ptx.h: $(OBJ_DIR)/gayberne_lj.ptx
$(BSH) ./geryon/file_to_cstr.sh gayberne_lj $(OBJ_DIR)/gayberne_lj.ptx $(OBJ_DIR)/gayberne_lj_ptx.h
$(OBJ_DIR)/gayberne_lj_cubin.h: $(OBJ_DIR)/gayberne_lj.cubin
$(BIN2C) -c -n gayberne_lj $(OBJ_DIR)/gayberne_lj.cubin > $(OBJ_DIR)/gayberne_lj_cubin.h
$(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_ptx.h $(OBJ_DIR)/gayberne_lj_ptx.h $(OBJ_DIR)/lal_base_ellipsoid.o
$(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_cubin.h $(OBJ_DIR)/gayberne_lj_cubin.h $(OBJ_DIR)/lal_base_ellipsoid.o
$(CUDR) -o $@ -c lal_gayberne.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_gayberne_ext.o: $(ALL_H) $(OBJ_DIR)/lal_gayberne.o lal_gayberne_ext.cpp
$(CUDR) -o $@ -c lal_gayberne_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/re_squared.ptx: lal_re_squared.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_re_squared.cu
$(OBJ_DIR)/re_squared.cubin: lal_re_squared.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_re_squared.cu
$(OBJ_DIR)/re_squared_lj.ptx: lal_re_squared_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_re_squared_lj.cu
$(OBJ_DIR)/re_squared_lj.cubin: lal_re_squared_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_re_squared_lj.cu
$(OBJ_DIR)/re_squared_ptx.h: $(OBJ_DIR)/re_squared.ptx
$(BSH) ./geryon/file_to_cstr.sh re_squared $(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_ptx.h
$(OBJ_DIR)/re_squared_cubin.h: $(OBJ_DIR)/re_squared.cubin
$(BIN2C) -c -n re_squared $(OBJ_DIR)/re_squared.cubin > $(OBJ_DIR)/re_squared_cubin.h
$(OBJ_DIR)/re_squared_lj_ptx.h: $(OBJ_DIR)/re_squared_lj.ptx
$(BSH) ./geryon/file_to_cstr.sh re_squared_lj $(OBJ_DIR)/re_squared_lj.ptx $(OBJ_DIR)/re_squared_lj_ptx.h
$(OBJ_DIR)/re_squared_lj_cubin.h: $(OBJ_DIR)/re_squared_lj.cubin
$(BIN2C) -c -n re_squared_lj $(OBJ_DIR)/re_squared_lj.cubin > $(OBJ_DIR)/re_squared_lj_cubin.h
$(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_ptx.h $(OBJ_DIR)/re_squared_lj_ptx.h $(OBJ_DIR)/lal_base_ellipsoid.o
$(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_cubin.h $(OBJ_DIR)/re_squared_lj_cubin.h $(OBJ_DIR)/lal_base_ellipsoid.o
$(CUDR) -o $@ -c lal_re_squared.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_re_squared_ext.o: $(ALL_H) $(OBJ_DIR)/lal_re_squared.o lal_re_squared_ext.cpp
$(CUDR) -o $@ -c lal_re_squared_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj.ptx: lal_lj.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj.cu
$(OBJ_DIR)/lj.cubin: lal_lj.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj.cu
$(OBJ_DIR)/lj_ptx.h: $(OBJ_DIR)/lj.ptx $(OBJ_DIR)/lj.ptx
$(BSH) ./geryon/file_to_cstr.sh lj $(OBJ_DIR)/lj.ptx $(OBJ_DIR)/lj_ptx.h
$(OBJ_DIR)/lj_cubin.h: $(OBJ_DIR)/lj.cubin $(OBJ_DIR)/lj.cubin
$(BIN2C) -c -n lj $(OBJ_DIR)/lj.cubin > $(OBJ_DIR)/lj_cubin.h
$(OBJ_DIR)/lal_lj.o: $(ALL_H) lal_lj.h lal_lj.cpp $(OBJ_DIR)/lj_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_lj.o: $(ALL_H) lal_lj.h lal_lj.cpp $(OBJ_DIR)/lj_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_lj.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_ext.o: $(ALL_H) lal_lj.h lal_lj_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_lj_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_coul.ptx: lal_lj_coul.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_coul.cu
$(OBJ_DIR)/lj_coul.cubin: lal_lj_coul.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul.cu
$(OBJ_DIR)/lj_coul_ptx.h: $(OBJ_DIR)/lj_coul.ptx $(OBJ_DIR)/lj_coul.ptx
$(BSH) ./geryon/file_to_cstr.sh lj_coul $(OBJ_DIR)/lj_coul.ptx $(OBJ_DIR)/lj_coul_ptx.h
$(OBJ_DIR)/lj_coul_cubin.h: $(OBJ_DIR)/lj_coul.cubin $(OBJ_DIR)/lj_coul.cubin
$(BIN2C) -c -n lj_coul $(OBJ_DIR)/lj_coul.cubin > $(OBJ_DIR)/lj_coul_cubin.h
$(OBJ_DIR)/lal_lj_coul.o: $(ALL_H) lal_lj_coul.h lal_lj_coul.cpp $(OBJ_DIR)/lj_coul_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_lj_coul.o: $(ALL_H) lal_lj_coul.h lal_lj_coul.cpp $(OBJ_DIR)/lj_coul_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_coul.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_coul_ext.o: $(ALL_H) lal_lj_coul.h lal_lj_coul_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_coul_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_class2_long.ptx: lal_lj_class2_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_class2_long.cu
$(OBJ_DIR)/lj_class2_long.cubin: lal_lj_class2_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_class2_long.cu
$(OBJ_DIR)/lj_class2_long_ptx.h: $(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long.ptx
$(BSH) ./geryon/file_to_cstr.sh lj_class2_long $(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long_ptx.h
$(OBJ_DIR)/lj_class2_long_cubin.h: $(OBJ_DIR)/lj_class2_long.cubin $(OBJ_DIR)/lj_class2_long.cubin
$(BIN2C) -c -n lj_class2_long $(OBJ_DIR)/lj_class2_long.cubin > $(OBJ_DIR)/lj_class2_long_cubin.h
$(OBJ_DIR)/lal_lj_class2_long.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long.cpp $(OBJ_DIR)/lj_class2_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_lj_class2_long.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long.cpp $(OBJ_DIR)/lj_class2_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_class2_long.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_class2_long_ext.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_class2_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/coul_long.ptx: lal_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_coul_long.cu
$(OBJ_DIR)/coul_long.cubin: lal_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul_long.cu
$(OBJ_DIR)/coul_long_ptx.h: $(OBJ_DIR)/coul_long.ptx $(OBJ_DIR)/coul_long.ptx
$(BSH) ./geryon/file_to_cstr.sh coul_long $(OBJ_DIR)/coul_long.ptx $(OBJ_DIR)/coul_long_ptx.h
$(OBJ_DIR)/coul_long_cubin.h: $(OBJ_DIR)/coul_long.cubin $(OBJ_DIR)/coul_long.cubin
$(BIN2C) -c -n coul_long $(OBJ_DIR)/coul_long.cubin > $(OBJ_DIR)/coul_long_cubin.h
$(OBJ_DIR)/lal_coul_long.o: $(ALL_H) lal_coul_long.h lal_coul_long.cpp $(OBJ_DIR)/coul_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_coul_long.o: $(ALL_H) lal_coul_long.h lal_coul_long.cpp $(OBJ_DIR)/coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_coul_long.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_coul_long_ext.o: $(ALL_H) lal_coul_long.h lal_coul_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_coul_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_coul_long.ptx: lal_lj_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_coul_long.cu
$(OBJ_DIR)/lj_coul_long.cubin: lal_lj_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul_long.cu
$(OBJ_DIR)/lj_coul_long_ptx.h: $(OBJ_DIR)/lj_coul_long.ptx $(OBJ_DIR)/lj_coul_long.ptx
$(BSH) ./geryon/file_to_cstr.sh lj_coul_long $(OBJ_DIR)/lj_coul_long.ptx $(OBJ_DIR)/lj_coul_long_ptx.h
$(OBJ_DIR)/lj_coul_long_cubin.h: $(OBJ_DIR)/lj_coul_long.cubin $(OBJ_DIR)/lj_coul_long.cubin
$(BIN2C) -c -n lj_coul_long $(OBJ_DIR)/lj_coul_long.cubin > $(OBJ_DIR)/lj_coul_long_cubin.h
$(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp $(OBJ_DIR)/lj_coul_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp $(OBJ_DIR)/lj_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_coul_long.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_coul_long_ext.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_coul_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/morse.ptx: lal_morse.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_morse.cu
$(OBJ_DIR)/lj_dsf.cubin: lal_lj_dsf.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_dsf.cu
$(OBJ_DIR)/morse_ptx.h: $(OBJ_DIR)/morse.ptx $(OBJ_DIR)/morse.ptx
$(BSH) ./geryon/file_to_cstr.sh morse $(OBJ_DIR)/morse.ptx $(OBJ_DIR)/morse_ptx.h
$(OBJ_DIR)/lj_dsf_cubin.h: $(OBJ_DIR)/lj_dsf.cubin $(OBJ_DIR)/lj_dsf.cubin
$(BIN2C) -c -n lj_dsf $(OBJ_DIR)/lj_dsf.cubin > $(OBJ_DIR)/lj_dsf_cubin.h
$(OBJ_DIR)/lal_morse.o: $(ALL_H) lal_morse.h lal_morse.cpp $(OBJ_DIR)/morse_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_lj_dsf.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf.cpp $(OBJ_DIR)/lj_dsf_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_dsf.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_dsf_ext.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_dsf_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/morse.cubin: lal_morse.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_morse.cu
$(OBJ_DIR)/morse_cubin.h: $(OBJ_DIR)/morse.cubin $(OBJ_DIR)/morse.cubin
$(BIN2C) -c -n morse $(OBJ_DIR)/morse.cubin > $(OBJ_DIR)/morse_cubin.h
$(OBJ_DIR)/lal_morse.o: $(ALL_H) lal_morse.h lal_morse.cpp $(OBJ_DIR)/morse_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_morse.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_morse_ext.o: $(ALL_H) lal_morse.h lal_morse_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_morse_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/charmm_long.ptx: lal_charmm_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_charmm_long.cu
$(OBJ_DIR)/charmm_long.cubin: lal_charmm_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_charmm_long.cu
$(OBJ_DIR)/charmm_long_ptx.h: $(OBJ_DIR)/charmm_long.ptx $(OBJ_DIR)/charmm_long.ptx
$(BSH) ./geryon/file_to_cstr.sh charmm_long $(OBJ_DIR)/charmm_long.ptx $(OBJ_DIR)/charmm_long_ptx.h
$(OBJ_DIR)/charmm_long_cubin.h: $(OBJ_DIR)/charmm_long.cubin $(OBJ_DIR)/charmm_long.cubin
$(BIN2C) -c -n charmm_long $(OBJ_DIR)/charmm_long.cubin > $(OBJ_DIR)/charmm_long_cubin.h
$(OBJ_DIR)/lal_charmm_long.o: $(ALL_H) lal_charmm_long.h lal_charmm_long.cpp $(OBJ_DIR)/charmm_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_charmm_long.o: $(ALL_H) lal_charmm_long.h lal_charmm_long.cpp $(OBJ_DIR)/charmm_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_charmm_long.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_charmm_long_ext.o: $(ALL_H) lal_charmm_long.h lal_charmm_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_charmm_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96.ptx: lal_lj96.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj96.cu
$(OBJ_DIR)/lj96.cubin: lal_lj96.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj96.cu
$(OBJ_DIR)/lj96_ptx.h: $(OBJ_DIR)/lj96.ptx $(OBJ_DIR)/lj96.ptx
$(BSH) ./geryon/file_to_cstr.sh lj96 $(OBJ_DIR)/lj96.ptx $(OBJ_DIR)/lj96_ptx.h
$(OBJ_DIR)/lj96_cubin.h: $(OBJ_DIR)/lj96.cubin $(OBJ_DIR)/lj96.cubin
$(BIN2C) -c -n lj96 $(OBJ_DIR)/lj96.cubin > $(OBJ_DIR)/lj96_cubin.h
$(OBJ_DIR)/lal_lj96.o: $(ALL_H) lal_lj96.h lal_lj96.cpp $(OBJ_DIR)/lj96_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_lj96.o: $(ALL_H) lal_lj96.h lal_lj96.cpp $(OBJ_DIR)/lj96_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_lj96.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj96_ext.o: $(ALL_H) lal_lj96.h lal_lj96_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_lj96_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_expand.ptx: lal_lj_expand.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_expand.cu
$(OBJ_DIR)/lj_expand.cubin: lal_lj_expand.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_expand.cu
$(OBJ_DIR)/lj_expand_ptx.h: $(OBJ_DIR)/lj_expand.ptx $(OBJ_DIR)/lj_expand.ptx
$(BSH) ./geryon/file_to_cstr.sh lj_expand $(OBJ_DIR)/lj_expand.ptx $(OBJ_DIR)/lj_expand_ptx.h
$(OBJ_DIR)/lj_expand_cubin.h: $(OBJ_DIR)/lj_expand.cubin $(OBJ_DIR)/lj_expand.cubin
$(BIN2C) -c -n lj_expand $(OBJ_DIR)/lj_expand.cubin > $(OBJ_DIR)/lj_expand_cubin.h
$(OBJ_DIR)/lal_lj_expand.o: $(ALL_H) lal_lj_expand.h lal_lj_expand.cpp $(OBJ_DIR)/lj_expand_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_lj_expand.o: $(ALL_H) lal_lj_expand.h lal_lj_expand.cpp $(OBJ_DIR)/lj_expand_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_lj_expand.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_expand_ext.o: $(ALL_H) lal_lj_expand.h lal_lj_expand_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_lj_expand_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cg_cmm.ptx: lal_cg_cmm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_cg_cmm.cu
$(OBJ_DIR)/cg_cmm.cubin: lal_cg_cmm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_cg_cmm.cu
$(OBJ_DIR)/cg_cmm_ptx.h: $(OBJ_DIR)/cg_cmm.ptx $(OBJ_DIR)/cg_cmm.ptx
$(BSH) ./geryon/file_to_cstr.sh cg_cmm $(OBJ_DIR)/cg_cmm.ptx $(OBJ_DIR)/cg_cmm_ptx.h
$(OBJ_DIR)/cg_cmm_cubin.h: $(OBJ_DIR)/cg_cmm.cubin $(OBJ_DIR)/cg_cmm.cubin
$(BIN2C) -c -n cg_cmm $(OBJ_DIR)/cg_cmm.cubin > $(OBJ_DIR)/cg_cmm_cubin.h
$(OBJ_DIR)/lal_cg_cmm.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm.cpp $(OBJ_DIR)/cg_cmm_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_cg_cmm.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm.cpp $(OBJ_DIR)/cg_cmm_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_cg_cmm.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_cg_cmm_ext.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_cg_cmm_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cg_cmm_long.ptx: lal_cg_cmm_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_cg_cmm_long.cu
$(OBJ_DIR)/cg_cmm_long.cubin: lal_cg_cmm_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_cg_cmm_long.cu
$(OBJ_DIR)/cg_cmm_long_ptx.h: $(OBJ_DIR)/cg_cmm_long.ptx $(OBJ_DIR)/cg_cmm_long.ptx
$(BSH) ./geryon/file_to_cstr.sh cg_cmm_long $(OBJ_DIR)/cg_cmm_long.ptx $(OBJ_DIR)/cg_cmm_long_ptx.h
$(OBJ_DIR)/cg_cmm_long_cubin.h: $(OBJ_DIR)/cg_cmm_long.cubin $(OBJ_DIR)/cg_cmm_long.cubin
$(BIN2C) -c -n cg_cmm_long $(OBJ_DIR)/cg_cmm_long.cubin > $(OBJ_DIR)/cg_cmm_long_cubin.h
$(OBJ_DIR)/lal_cg_cmm_long.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long.cpp $(OBJ_DIR)/cg_cmm_long_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_cg_cmm_long.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long.cpp $(OBJ_DIR)/cg_cmm_long_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_cg_cmm_long.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_cg_cmm_long_ext.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_cg_cmm_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/eam.ptx: lal_eam.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_eam.cu
$(OBJ_DIR)/eam.cubin: lal_eam.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_eam.cu
$(OBJ_DIR)/eam_ptx.h: $(OBJ_DIR)/eam.ptx $(OBJ_DIR)/eam.ptx
$(BSH) ./geryon/file_to_cstr.sh eam $(OBJ_DIR)/eam.ptx $(OBJ_DIR)/eam_ptx.h
$(OBJ_DIR)/eam_cubin.h: $(OBJ_DIR)/eam.cubin $(OBJ_DIR)/eam.cubin
$(BIN2C) -c -n eam $(OBJ_DIR)/eam.cubin > $(OBJ_DIR)/eam_cubin.h
$(OBJ_DIR)/lal_eam.o: $(ALL_H) lal_eam.h lal_eam.cpp $(OBJ_DIR)/eam_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_eam.o: $(ALL_H) lal_eam.h lal_eam.cpp $(OBJ_DIR)/eam_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_eam.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_ext.o: $(ALL_H) lal_eam.h lal_eam_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_eam_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/buck.ptx: lal_buck.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_buck.cu
$(OBJ_DIR)/buck.cubin: lal_buck.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_buck.cu
$(OBJ_DIR)/buck_ptx.h: $(OBJ_DIR)/buck.ptx $(OBJ_DIR)/buck.ptx
$(BSH) ./geryon/file_to_cstr.sh buck $(OBJ_DIR)/buck.ptx $(OBJ_DIR)/buck_ptx.h
$(OBJ_DIR)/buck_cubin.h: $(OBJ_DIR)/buck.cubin $(OBJ_DIR)/buck.cubin
$(BIN2C) -c -n buck $(OBJ_DIR)/buck.cubin > $(OBJ_DIR)/buck_cubin.h
$(OBJ_DIR)/lal_buck.o: $(ALL_H) lal_buck.h lal_buck.cpp $(OBJ_DIR)/buck_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_buck.o: $(ALL_H) lal_buck.h lal_buck.cpp $(OBJ_DIR)/buck_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_buck.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_buck_ext.o: $(ALL_H) lal_buck.h lal_buck_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_buck_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/buck_coul.ptx: lal_buck_coul.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_buck_coul.cu
$(OBJ_DIR)/buck_coul.cubin: lal_buck_coul.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_buck_coul.cu
$(OBJ_DIR)/buck_coul_ptx.h: $(OBJ_DIR)/buck_coul.ptx $(OBJ_DIR)/buck_coul.ptx
$(BSH) ./geryon/file_to_cstr.sh buck_coul $(OBJ_DIR)/buck_coul.ptx $(OBJ_DIR)/buck_coul_ptx.h
$(OBJ_DIR)/buck_coul_cubin.h: $(OBJ_DIR)/buck_coul.cubin $(OBJ_DIR)/buck_coul.cubin
$(BIN2C) -c -n buck_coul $(OBJ_DIR)/buck_coul.cubin > $(OBJ_DIR)/buck_coul_cubin.h
$(OBJ_DIR)/lal_buck_coul.o: $(ALL_H) lal_buck_coul.h lal_buck_coul.cpp $(OBJ_DIR)/buck_coul_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_buck_coul.o: $(ALL_H) lal_buck_coul.h lal_buck_coul.cpp $(OBJ_DIR)/buck_coul_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_buck_coul.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_buck_coul_ext.o: $(ALL_H) lal_buck_coul.h lal_buck_coul_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_buck_coul_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/buck_coul_long.ptx: lal_buck_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_buck_coul_long.cu
$(OBJ_DIR)/buck_coul_long.cubin: lal_buck_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_buck_coul_long.cu
$(OBJ_DIR)/buck_coul_long_ptx.h: $(OBJ_DIR)/buck_coul_long.ptx $(OBJ_DIR)/buck_coul_long.ptx
$(BSH) ./geryon/file_to_cstr.sh buck_coul_long $(OBJ_DIR)/buck_coul_long.ptx $(OBJ_DIR)/buck_coul_long_ptx.h
$(OBJ_DIR)/buck_coul_long_cubin.h: $(OBJ_DIR)/buck_coul_long.cubin $(OBJ_DIR)/buck_coul_long.cubin
$(BIN2C) -c -n buck_coul_long $(OBJ_DIR)/buck_coul_long.cubin > $(OBJ_DIR)/buck_coul_long_cubin.h
$(OBJ_DIR)/lal_buck_coul_long.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long.cpp $(OBJ_DIR)/buck_coul_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_buck_coul_long.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long.cpp $(OBJ_DIR)/buck_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_buck_coul_long.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_buck_coul_long_ext.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_buck_coul_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/table.ptx: lal_table.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_table.cu
$(OBJ_DIR)/table.cubin: lal_table.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_table.cu
$(OBJ_DIR)/table_ptx.h: $(OBJ_DIR)/table.ptx $(OBJ_DIR)/table.ptx
$(BSH) ./geryon/file_to_cstr.sh table $(OBJ_DIR)/table.ptx $(OBJ_DIR)/table_ptx.h
$(OBJ_DIR)/table_cubin.h: $(OBJ_DIR)/table.cubin $(OBJ_DIR)/table.cubin
$(BIN2C) -c -n table $(OBJ_DIR)/table.cubin > $(OBJ_DIR)/table_cubin.h
$(OBJ_DIR)/lal_table.o: $(ALL_H) lal_table.h lal_table.cpp $(OBJ_DIR)/table_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_table.o: $(ALL_H) lal_table.h lal_table.cpp $(OBJ_DIR)/table_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_table.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_table_ext.o: $(ALL_H) lal_table.h lal_table_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_table_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/yukawa.ptx: lal_yukawa.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_yukawa.cu
$(OBJ_DIR)/yukawa.cubin: lal_yukawa.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_yukawa.cu
$(OBJ_DIR)/yukawa_ptx.h: $(OBJ_DIR)/yukawa.ptx $(OBJ_DIR)/yukawa.ptx
$(BSH) ./geryon/file_to_cstr.sh yukawa $(OBJ_DIR)/yukawa.ptx $(OBJ_DIR)/yukawa_ptx.h
$(OBJ_DIR)/yukawa_cubin.h: $(OBJ_DIR)/yukawa.cubin $(OBJ_DIR)/yukawa.cubin
$(BIN2C) -c -n yukawa $(OBJ_DIR)/yukawa.cubin > $(OBJ_DIR)/yukawa_cubin.h
$(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp $(OBJ_DIR)/yukawa_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp $(OBJ_DIR)/yukawa_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_yukawa.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_yukawa_ext.o: $(ALL_H) lal_yukawa.h lal_yukawa_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_yukawa_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/born.cubin: lal_born.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_born.cu
$(OBJ_DIR)/born_cubin.h: $(OBJ_DIR)/born.cubin $(OBJ_DIR)/born.cubin
$(BIN2C) -c -n born $(OBJ_DIR)/born.cubin > $(OBJ_DIR)/born_cubin.h
$(OBJ_DIR)/lal_born.o: $(ALL_H) lal_born.h lal_born.cpp $(OBJ_DIR)/born_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_born.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_born_ext.o: $(ALL_H) lal_born.h lal_born_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_born_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/born_coul_wolf.cubin: lal_born_coul_wolf.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_born_coul_wolf.cu
$(OBJ_DIR)/born_coul_wolf_cubin.h: $(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf.cubin
$(BIN2C) -c -n born_coul_wolf $(OBJ_DIR)/born_coul_wolf.cubin > $(OBJ_DIR)/born_coul_wolf_cubin.h
$(OBJ_DIR)/lal_born_coul_wolf.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf.cpp $(OBJ_DIR)/born_coul_wolf_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_born_coul_wolf.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_born_coul_wolf_ext.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_born_coul_wolf_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/born_coul_long.cubin: lal_born_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_born_coul_long.cu
$(OBJ_DIR)/born_coul_long_cubin.h: $(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long.cubin
$(BIN2C) -c -n born_coul_long $(OBJ_DIR)/born_coul_long.cubin > $(OBJ_DIR)/born_coul_long_cubin.h
$(OBJ_DIR)/lal_born_coul_long.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long.cpp $(OBJ_DIR)/born_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_born_coul_long.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_born_coul_long_ext.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_born_coul_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/dipole_lj.cubin: lal_dipole_lj.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_dipole_lj.cu
$(OBJ_DIR)/dipole_lj_cubin.h: $(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj.cubin
$(BIN2C) -c -n dipole_lj $(OBJ_DIR)/dipole_lj.cubin > $(OBJ_DIR)/dipole_lj_cubin.h
$(OBJ_DIR)/lal_dipole_lj.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj.cpp $(OBJ_DIR)/dipole_lj_cubin.h $(OBJ_DIR)/lal_base_dipole.o
$(CUDR) -o $@ -c lal_dipole_lj.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dipole_lj_ext.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj_ext.cpp lal_base_dipole.h
$(CUDR) -o $@ -c lal_dipole_lj_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/dipole_lj_sf.cubin: lal_dipole_lj_sf.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_dipole_lj_sf.cu
$(OBJ_DIR)/dipole_lj_sf_cubin.h: $(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf.cubin
$(BIN2C) -c -n dipole_lj_sf $(OBJ_DIR)/dipole_lj_sf.cubin > $(OBJ_DIR)/dipole_lj_sf_cubin.h
$(OBJ_DIR)/lal_dipole_lj_sf.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf.cpp $(OBJ_DIR)/dipole_lj_sf_cubin.h $(OBJ_DIR)/lal_base_dipole.o
$(CUDR) -o $@ -c lal_dipole_lj_sf.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dipole_lj_sf_ext.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf_ext.cpp lal_base_dipole.h
$(CUDR) -o $@ -c lal_dipole_lj_sf_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/colloid.cubin: lal_colloid.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_colloid.cu
$(OBJ_DIR)/colloid_cubin.h: $(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid.cubin
$(BIN2C) -c -n colloid $(OBJ_DIR)/colloid.cubin > $(OBJ_DIR)/colloid_cubin.h
$(OBJ_DIR)/lal_colloid.o: $(ALL_H) lal_colloid.h lal_colloid.cpp $(OBJ_DIR)/colloid_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_colloid.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_colloid_ext.o: $(ALL_H) lal_colloid.h lal_colloid_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_colloid_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/gauss.cubin: lal_gauss.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_gauss.cu
$(OBJ_DIR)/gauss_cubin.h: $(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss.cubin
$(BIN2C) -c -n gauss $(OBJ_DIR)/gauss.cubin > $(OBJ_DIR)/gauss_cubin.h
$(OBJ_DIR)/lal_gauss.o: $(ALL_H) lal_gauss.h lal_gauss.cpp $(OBJ_DIR)/gauss_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_gauss.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_gauss_ext.o: $(ALL_H) lal_gauss.h lal_gauss_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_gauss_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/yukawa_colloid.cubin: lal_yukawa_colloid.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_yukawa_colloid.cu
$(OBJ_DIR)/yukawa_colloid_cubin.h: $(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid.cubin
$(BIN2C) -c -n yukawa_colloid $(OBJ_DIR)/yukawa_colloid.cubin > $(OBJ_DIR)/yukawa_colloid_cubin.h
$(OBJ_DIR)/lal_yukawa_colloid.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid.cpp $(OBJ_DIR)/yukawa_colloid_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_yukawa_colloid.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_yukawa_colloid_ext.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_yukawa_colloid_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_coul_debye.cubin: lal_lj_coul_debye.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul_debye.cu
$(OBJ_DIR)/lj_coul_debye_cubin.h: $(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye.cubin
$(BIN2C) -c -n lj_coul_debye $(OBJ_DIR)/lj_coul_debye.cubin > $(OBJ_DIR)/lj_coul_debye_cubin.h
$(OBJ_DIR)/lal_lj_coul_debye.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye.cpp $(OBJ_DIR)/lj_coul_debye_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_coul_debye.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_coul_debye_ext.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_coul_debye_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/coul_dsf.cubin: lal_coul_dsf.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul_dsf.cu
$(OBJ_DIR)/coul_dsf_cubin.h: $(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf.cubin
$(BIN2C) -c -n coul_dsf $(OBJ_DIR)/coul_dsf.cubin > $(OBJ_DIR)/coul_dsf_cubin.h
$(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/coul_dsf_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_coul_dsf.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda
@@ -415,10 +575,10 @@ $(GPU_LIB): $(OBJS) $(CUDPP)
$(AR) -crusv $(GPU_LIB) $(OBJS) $(CUDPP)
clean:
rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(PTXS) *.linkinfo
rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CBNS) *.linkinfo
veryclean: clean
rm -rf *~ *.linkinfo
cleanlib:
rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(PTXS) *.linkinfo
rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CBNS) *.linkinfo
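
Each pair style added above follows the same three-rule pattern in this Makefile: nvcc compiles lal_<style>.cu to a device binary with --cubin -DNV_KERNEL, bin2c -c -n <style> wraps that binary in a C header (<style>_cubin.h), and the matching lal_<style>.cpp host object is built with -I$(OBJ_DIR) so it can include the generated header. A minimal sketch of the idea, with illustrative names and bytes (the exact array layout emitted by bin2c may differ):

/* Sketch of a generated obj/coul_dsf_cubin.h; the array name and contents
   shown here are illustrative, not the actual bin2c output. */
static const unsigned char coul_dsf[] = {
  0x7f, 0x45, 0x4c, 0x46   /* ... remaining bytes of the compiled cubin image ... */
};
/* lal_coul_dsf.cpp, compiled with -I$(OBJ_DIR), includes this header and hands
   the byte array to the Geryon program loader in place of PTX source text. */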

View File

@@ -17,6 +17,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_neighbor_shared.o $(OBJ_DIR)/lal_neighbor.o \
$(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
$(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
$(OBJ_DIR)/lal_base_dipole.o \
$(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
$(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
$(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@@ -25,6 +26,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_lj_expand.o $(OBJ_DIR)/lal_lj_expand_ext.o \
$(OBJ_DIR)/lal_lj_coul.o $(OBJ_DIR)/lal_lj_coul_ext.o \
$(OBJ_DIR)/lal_lj_coul_long.o $(OBJ_DIR)/lal_lj_coul_long_ext.o \
$(OBJ_DIR)/lal_lj_dsf.o $(OBJ_DIR)/lal_lj_dsf_ext.o \
$(OBJ_DIR)/lal_lj_class2_long.o $(OBJ_DIR)/lal_lj_class2_long_ext.o \
$(OBJ_DIR)/lal_coul_long.o $(OBJ_DIR)/lal_coul_long_ext.o \
$(OBJ_DIR)/lal_morse.o $(OBJ_DIR)/lal_morse_ext.o \
@@ -36,20 +38,43 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_buck_coul.o $(OBJ_DIR)/lal_buck_coul_ext.o \
$(OBJ_DIR)/lal_buck_coul_long.o $(OBJ_DIR)/lal_buck_coul_long_ext.o \
$(OBJ_DIR)/lal_table.o $(OBJ_DIR)/lal_table_ext.o \
$(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o
$(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o \
$(OBJ_DIR)/lal_born.o $(OBJ_DIR)/lal_born_ext.o \
$(OBJ_DIR)/lal_born_coul_wolf.o $(OBJ_DIR)/lal_born_coul_wolf_ext.o \
$(OBJ_DIR)/lal_born_coul_long.o $(OBJ_DIR)/lal_born_coul_long_ext.o \
$(OBJ_DIR)/lal_dipole_lj.o $(OBJ_DIR)/lal_dipole_lj_ext.o \
$(OBJ_DIR)/lal_dipole_lj_sf.o $(OBJ_DIR)/lal_dipole_lj_sf_ext.o \
$(OBJ_DIR)/lal_colloid.o $(OBJ_DIR)/lal_colloid_ext.o \
$(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
$(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
$(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o
KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
$(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
$(OBJ_DIR)/ellipsoid_nbor_cl.h $(OBJ_DIR)/gayberne_cl.h \
$(OBJ_DIR)/gayberne_lj_cl.h $(OBJ_DIR)/re_squared_cl.h \
$(OBJ_DIR)/re_squared_lj_cl.h $(OBJ_DIR)/lj_cl.h $(OBJ_DIR)/lj96_cl.h \
$(OBJ_DIR)/lj_expand_cl.h $(OBJ_DIR)/lj_coul_cl.h \
$(OBJ_DIR)/lj_coul_long_cl.h $(OBJ_DIR)/lj_class2_long_cl.h \
$(OBJ_DIR)/lj_coul_long_cl.h $(OBJ_DIR)/lj_dsf_cl.h \
$(OBJ_DIR)/lj_class2_long_cl.h \
$(OBJ_DIR)/coul_long_cl.h $(OBJ_DIR)/morse_cl.h \
$(OBJ_DIR)/charmm_long_cl.h $(OBJ_DIR)/cg_cmm_cl.h \
$(OBJ_DIR)/cg_cmm_long_cl.h $(OBJ_DIR)/neighbor_gpu_cl.h \
$(OBJ_DIR)/eam_cl.h $(OBJ_DIR)/buck_cl.h \
$(OBJ_DIR)/buck_coul_cl.h $(OBJ_DIR)/buck_coul_long_cl.h \
$(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h
$(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h \
$(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \
$(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf_cubin.h \
$(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long_cubin.h \
$(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj_cubin.h \
$(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf_cubin.h \
$(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid_cubin.h \
$(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \
$(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \
$(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \
$(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h
OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@@ -91,6 +116,9 @@ $(OBJ_DIR)/lal_base_charge.o: $(OCL_H) lal_base_charge.h lal_base_charge.cpp
$(OBJ_DIR)/lal_base_ellipsoid.o: $(OCL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_cl.h
$(OCL) -o $@ -c lal_base_ellipsoid.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_base_dipole.o: $(OCL_H) lal_base_dipole.h lal_base_dipole.cpp
$(OCL) -o $@ -c lal_base_dipole.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h;
@@ -154,6 +182,15 @@ $(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp
$(OBJ_DIR)/lal_lj_coul_long_ext.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_lj_coul_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_dsf_cl.h: lal_lj_dsf.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh lj_dsf $(PRE1_H) lal_lj_dsf.cu $(OBJ_DIR)/lj_dsf_cl.h;
$(OBJ_DIR)/lal_lj_dsf.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf.cpp $(OBJ_DIR)/lj_dsf_cl.h $(OBJ_DIR)/lj_dsf_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_lj_dsf.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_dsf_ext.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_lj_dsf_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_class2_long_cl.h: lal_lj_class2_long.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh lj_class2_long $(PRE1_H) lal_lj_class2_long.cu $(OBJ_DIR)/lj_class2_long_cl.h;
@@ -280,6 +317,96 @@ $(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp $(OBJ_DIR)/yukawa
$(OBJ_DIR)/lal_yukawa_ext.o: $(ALL_H) lal_yukawa.h lal_yukawa_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_yukawa_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/born_cl.h: lal_born.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh born $(PRE1_H) lal_born.cu $(OBJ_DIR)/born_cl.h;
$(OBJ_DIR)/lal_born.o: $(ALL_H) lal_born.h lal_born.cpp $(OBJ_DIR)/born_cl.h $(OBJ_DIR)/born_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_born.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_born_ext.o: $(ALL_H) lal_born.h lal_born_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_born_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/born_coul_wolf_cl.h: lal_born_coul_wolf.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh born_coul_wolf $(PRE1_H) lal_born_coul_wolf.cu $(OBJ_DIR)/born_coul_wolf_cl.h;
$(OBJ_DIR)/lal_born_coul_wolf.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf.cpp $(OBJ_DIR)/born_coul_wolf_cl.h $(OBJ_DIR)/born_coul_wolf_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_born_coul_wolf.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_born_coul_wolf_ext.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_born_coul_wolf_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/born_coul_long_cl.h: lal_born_coul_long.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh born_coul_long $(PRE1_H) lal_born_coul_long.cu $(OBJ_DIR)/born_coul_long_cl.h;
$(OBJ_DIR)/lal_born_coul_long.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long.cpp $(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_born_coul_long.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_born_coul_long_ext.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_born_coul_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/dipole_lj_cl.h: lal_dipole_lj.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh dipole_lj $(PRE1_H) lal_dipole_lj.cu $(OBJ_DIR)/dipole_lj_cl.h;
$(OBJ_DIR)/lal_dipole_lj.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj.cpp $(OBJ_DIR)/dipole_lj_cl.h $(OBJ_DIR)/dipole_lj_cl.h $(OBJ_DIR)/lal_base_dipole.o
$(OCL) -o $@ -c lal_dipole_lj.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dipole_lj_ext.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj_ext.cpp lal_base_dipole.h
$(OCL) -o $@ -c lal_dipole_lj_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/dipole_lj_sf_cl.h: lal_dipole_lj_sf.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh dipole_lj_sf $(PRE1_H) lal_dipole_lj_sf.cu $(OBJ_DIR)/dipole_lj_sf_cl.h;
$(OBJ_DIR)/lal_dipole_lj_sf.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf.cpp $(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/lal_base_dipole.o
$(OCL) -o $@ -c lal_dipole_lj_sf.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dipole_lj_sf_ext.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf_ext.cpp lal_base_dipole.h
$(OCL) -o $@ -c lal_dipole_lj_sf_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/colloid_cl.h: lal_colloid.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh colloid $(PRE1_H) lal_colloid.cu $(OBJ_DIR)/colloid_cl.h;
$(OBJ_DIR)/lal_colloid.o: $(ALL_H) lal_colloid.h lal_colloid.cpp $(OBJ_DIR)/colloid_cl.h $(OBJ_DIR)/colloid_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_colloid.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_colloid_ext.o: $(ALL_H) lal_colloid.h lal_colloid_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_colloid_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/gauss_cl.h: lal_gauss.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh gauss $(PRE1_H) lal_gauss.cu $(OBJ_DIR)/gauss_cl.h;
$(OBJ_DIR)/lal_gauss.o: $(ALL_H) lal_gauss.h lal_gauss.cpp $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_gauss.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_gauss_ext.o: $(ALL_H) lal_gauss.h lal_gauss_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_gauss_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/yukawa_colloid_cl.h: lal_yukawa_colloid.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh yukawa_colloid $(PRE1_H) lal_yukawa_colloid.cu $(OBJ_DIR)/yukawa_colloid_cl.h;
$(OBJ_DIR)/lal_yukawa_colloid.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid.cpp $(OBJ_DIR)/yukawa_colloid_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_yukawa_colloid.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_yukawa_colloid_ext.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_yukawa_colloid_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_coul_debye_cl.h: lal_lj_coul_debye.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh lj_coul_debye $(PRE1_H) lal_lj_coul_debye.cu $(OBJ_DIR)/lj_coul_debye_cl.h;
$(OBJ_DIR)/lal_lj_coul_debye.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye.cpp $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_lj_coul_debye.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_coul_debye_ext.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_lj_coul_debye_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/coul_dsf_cl.h: lal_coul_dsf.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh coul_dsf $(PRE1_H) lal_coul_dsf.cu $(OBJ_DIR)/coul_dsf_cl.h;
$(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/coul_dsf_cl.h $(OBJ_DIR)/coul_dsf_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_coul_dsf.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)
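
By contrast, the OpenCL rules above keep the kernels as source: file_to_cstr.sh concatenates lal_preprocessor.h and lal_<style>.cu into a <style>_cl.h header that stores the text as a C string, and the OpenCL runtime compiles it when the program is loaded. A rough sketch of such a generated header, with an assumed variable name and placeholder contents:

/* Sketch of a generated obj/coul_dsf_cl.h; the string name and formatting are
   assumptions, only the idea of embedding the kernel source as text is fixed. */
const char * coul_dsf =
  "/* text of lal_preprocessor.h */ \n"
  "/* text of lal_coul_dsf.cu */    \n";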

View File

@@ -1,3 +1,7 @@
NOTE: This Geryon distribution has been modified to remove files not
necessary for the LAMMPS implementation. The full distribution
is available at http://users.nccs.gov/~wb8/geryon/index.htm
Geryon
Copyright (2010) Sandia Corporation. Under the terms of Contract

View File

@@ -1 +1 @@
Geryon Version 12.034
Geryon Version 12.033

View File

@@ -141,6 +141,11 @@ class UCL_Device {
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i) { return UCL_GPU; }
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
/// Returns true if double precision is supported for the current device
bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device

View File

@@ -30,11 +30,23 @@
namespace ucl_cudadr {
class UCL_Texture;
template <class numtyp> class UCL_D_Vec;
template <class numtyp> class UCL_D_Mat;
template <class hosttype, class devtype> class UCL_Vector;
template <class hosttype, class devtype> class UCL_Matrix;
#define UCL_MAX_KERNEL_ARGS 256
/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
public:
inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }
inline UCL_Program(UCL_Device &device, const void *program,
const char *flags="", std::string *log=NULL) {
_cq=device.cq();
init(device);
load_string(program,flags,log);
}
inline ~UCL_Program() {}
/// Initialize the program with a device
@@ -64,10 +76,10 @@ class UCL_Program {
}
/// Load a program from a string and compile with flags
inline int load_string(const char *program, const char *flags="",
inline int load_string(const void *program, const char *flags="",
std::string *log=NULL) {
if (std::string(flags)=="BINARY")
return load_binary(program);
return load_binary((const char *)program);
const unsigned int num_opts=2;
CUjit_option options[num_opts];
void *values[num_opts];
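
With load_string() now taking const void* and treating the flag string "BINARY" as a request to call load_binary(), plus the new convenience constructor above, the same call site can hand a UCL_Program either PTX text or an embedded cubin image. A minimal sketch under those assumptions (the include path and symbol names are illustrative; the generated headers from the Makefiles above would supply lj_ptx and lj_cubin):

#include <string>
#include "geryon/nvd_kernel.h"   /* assumed include path */
using namespace ucl_cudadr;

/* lj_ptx: PTX text from a *_ptx.h header; lj_cubin: byte array from a
   bin2c-generated *_cubin.h header. Error handling omitted for brevity. */
void load_pair_program(UCL_Device &dev, const char *lj_ptx,
                       const unsigned char *lj_cubin, bool have_cubin) {
  std::string log;
  /* New convenience constructor: initializes against the device and loads in
     one step. Passing "BINARY" routes the pointer to load_binary(); any other
     flag string treats it as PTX text to be JIT-compiled. */
  UCL_Program program(dev,
                      have_cubin ? (const void *)lj_cubin : (const void *)lj_ptx,
                      have_cubin ? "BINARY" : "", &log);
  /* UCL_Kernel objects would be built from `program` here; see the sketch
     after the UCL_Kernel changes below. */
}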
@@ -134,15 +146,25 @@ class UCL_Program {
friend class UCL_Texture;
};
/// Class for dealing with OpenCL kernels
/// Class for dealing with CUDA Driver kernels
class UCL_Kernel {
public:
UCL_Kernel() : _dimensions(1), _num_args(0), _param_size(0)
{ _num_blocks[0]=0; }
UCL_Kernel() : _dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000
_param_size=0;
#endif
_num_blocks[0]=0;
}
UCL_Kernel(UCL_Program &program, const char *function) :
_dimensions(1), _num_args(0), _param_size(0)
{ _num_blocks[0]=0; set_function(program,function); _cq=program._cq; }
_dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000
_param_size=0;
#endif
_num_blocks[0]=0;
set_function(program,function);
_cq=program._cq;
}
~UCL_Kernel() {}
@@ -170,78 +192,190 @@ class UCL_Kernel {
* changes
* \note To set kernel parameter i (i>0), parameter i-1 must be set **/
template <class dtype>
inline void set_arg(const unsigned index, dtype *arg) {
inline void set_arg(const unsigned index, const dtype * const arg) {
if (index==_num_args)
add_arg(arg);
else if (index<_num_args)
#if CUDA_VERSION >= 4000
_kernel_args[index]=arg;
#else
CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
#endif
else
assert(0==1); // Must add kernel parameters in sequential order
}
/// Set a geryon container as a kernel argument.
template <class numtyp>
inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template <class numtyp>
inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); }
/// Set a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); }
/// Add a kernel argument.
inline void add_arg(const CUdeviceptr* const arg) {
#if CUDA_VERSION >= 4000
_kernel_args[_num_args]=(void *)arg;
#else
void* ptr = (void*)(size_t)(*arg);
_param_size = (_param_size + __alignof(ptr) - 1) & ~(__alignof(ptr) - 1);
CU_SAFE_CALL(cuParamSetv(_kernel, _param_size, &ptr, sizeof(ptr)));
_offsets.push_back(_param_size);
_param_size+=sizeof(ptr);
#endif
_num_args++;
if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
}
/// Add a kernel argument.
template <class dtype>
inline void add_arg(const dtype* const arg) {
#if CUDA_VERSION >= 4000
_kernel_args[_num_args]=const_cast<dtype * const>(arg);
#else
_param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));
_offsets.push_back(_param_size);
_param_size+=sizeof(dtype);
#endif
_num_args++;
if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
}
/// Add a geryon container as a kernel argument.
template <class numtyp>
inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template <class numtyp>
inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); }
/// Add a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); }
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called after all arguments have been added **/
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks, const size_t block_size) {
_dimensions=1;
_num_blocks[0]=num_blocks;
_num_blocks[1]=1;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size;
_block_size[1]=1;
_block_size[2]=1;
#else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
#endif
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks, const size_t block_size,
command_queue &cq)
{ _cq=cq; set_size(num_blocks,block_size); }
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y) {
_dimensions=2;
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size_x;
_block_size[1]=block_size_y;
_block_size[2]=1;
#else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
#endif
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y,
command_queue &cq)
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x,
const size_t block_size_y, const size_t block_size_z) {
_dimensions=2;
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size_x;
_block_size[1]=block_size_y;
_block_size[2]=block_size_z;
#else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
block_size_z));
#endif
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y,
const size_t block_size_z, command_queue &cq) {
_cq=cq;
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
block_size_z);
}
/// Run the kernel in the default command queue
inline void run() {
#if CUDA_VERSION >= 4000
CU_SAFE_CALL(cuLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
_num_blocks[2],_block_size[0],_block_size[1],
_block_size[2],0,_cq,_kernel_args,NULL));
#else
CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
}
/// Run the kernel in the specified command queue
inline void run(command_queue &cq) {
CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],cq));
#endif
}
/// Clear any arguments associated with the kernel
inline void clear_args() { _num_args=0; _offsets.clear(); _param_size=0; }
inline void clear_args() {
_num_args=0;
#if CUDA_VERSION < 4000
_offsets.clear();
_param_size=0;
#endif
}
#include "ucl_arg_kludge.h"
@ -249,11 +383,17 @@ class UCL_Kernel {
CUfunction _kernel;
CUstream _cq;
unsigned _dimensions;
unsigned _num_blocks[2];
unsigned _num_blocks[3];
unsigned _num_args;
friend class UCL_Texture;
#if CUDA_VERSION >= 4000
unsigned _block_size[3];
void * _kernel_args[UCL_MAX_KERNEL_ARGS];
#else
std::vector<unsigned> _offsets;
unsigned _param_size;
friend class UCL_Texture;
#endif
};
} // namespace
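Taken together, the changes above (sizes set before arguments, container overloads for set_arg/add_arg, and cuLaunchKernel on CUDA 4.0+) give the CUDA-driver kernel class the usage pattern sketched below. This is a minimal sketch, not code from the commit: the header names, the UCL_Device/UCL_Program setup calls, the kernel name "k_scale", and the PTX string k_ptx are assumptions made for illustration only.

#include "nvd_device.h"
#include "nvd_kernel.h"
#include "nvd_mat.h"

using namespace ucl_cudadr;

// Scale n floats on the device by alpha. Sketch only: header names, "k_scale",
// and k_ptx are illustrative assumptions; error handling is omitted.
void scale_on_device(const char *k_ptx, int n) {
  UCL_Device dev;                     // assumed to set up the driver context
  UCL_Program prog(dev);
  prog.load_string(k_ptx);            // load the compiled module (PTX/cubin image)

  UCL_Kernel k_scale(prog,"k_scale");

  UCL_D_Vec<float> data;
  data.alloc(n,dev);                  // device buffer of n floats

  float alpha=2.0f;
  const int block=128;
  k_scale.set_size((n+block-1)/block,block);  // sizes before any arguments
  k_scale.add_arg(&data);             // container overload added in this patch
  k_scale.add_arg(&alpha);
  k_scale.add_arg(&n);
  k_scale.run();                      // cuLaunchKernel on the default queue (CUDA 4.0+)
}

For CUDA builds older than 4.0, the same calls fall back to the cuParamSetv / cuFuncSetBlockShape / cuLaunchGridAsync path guarded above.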

View File

@ -38,6 +38,9 @@ namespace ucl_cudadr {
#include "ucl_h_mat.h"
#include "ucl_d_vec.h"
#include "ucl_d_mat.h"
#include "ucl_s_obj_help.h"
#include "ucl_vector.h"
#include "ucl_matrix.h"
#undef _UCL_DEVICE_PTR_MAT
#undef _UCL_MAT_ALLOW

View File

@ -85,6 +85,21 @@ inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
free(mat.begin());
}
template <class mat_type>
inline int _host_resize(mat_type &mat, const size_t n) {
_host_free(mat,mat.kind());
CUresult err=CUDA_SUCCESS;
if (mat.kind()==UCL_RW_OPTIMIZED)
err=cuMemAllocHost((void **)mat.host_ptr(),n);
else if (mat.kind()==UCL_WRITE_OPTIMIZED)
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
else
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
// --------------------------------------------------------------------------
// - DEVICE MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
@ -143,6 +158,29 @@ inline void _device_free(mat_type &mat) {
CU_DESTRUCT_CALL(cuMemFree(mat.cbegin()));
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t n) {
_device_free(mat);
CUresult err=cuMemAlloc(&mat.cbegin(),n);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t rows,
const size_t cols, size_t &pitch) {
_device_free(mat);
CUresult err;
CUDA_INT_TYPE upitch;
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows,16);
pitch=static_cast<size_t>(upitch);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
*ptr=in;
}

View File

@ -42,27 +42,56 @@ class UCL_Texture {
{ CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class mat_typ>
inline void bind_float(mat_typ &vec, const unsigned numel) {
#ifdef UCL_DEBUG
assert(numel!=0 && numel<5);
#endif
CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
vec.numel()*vec.element_size()));
CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel));
}
template<class numtyp>
inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel)
{ _bind_float(vec,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp>
inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel)
{ _bind_float(vec,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp, class devtyp>
inline void bind_float(UCL_Vector<numtyp, devtyp> &vec, const unsigned numel)
{ _bind_float(vec.device,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp, class devtyp>
inline void bind_float(UCL_Matrix<numtyp, devtyp> &vec, const unsigned numel)
{ _bind_float(vec.device,numel); }
/// Unbind the texture reference from the memory allocation
inline void unbind() { }
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) {
#if CUDA_VERSION < 4000
CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
#endif
}
private:
CUtexref _tex;
friend class UCL_Kernel;
template<class mat_typ>
inline void _bind_float(mat_typ &vec, const unsigned numel) {
#ifdef UCL_DEBUG
assert(numel!=0 && numel<5);
#endif
CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
vec.numel()*vec.element_size()));
if (vec.element_size()==sizeof(float))
CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel));
else {
if (numel>2)
CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_SIGNED_INT32, numel));
else
CU_SAFE_CALL(cuTexRefSetFormat(_tex,CU_AD_FORMAT_SIGNED_INT32,numel*2));
}
}
};
} // namespace
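The private _bind_float helper above now chooses the texture format from the element size, so the same bind_float call works for single- and double-precision containers. A short sketch of the intended use, assuming a texture reference named "pos_tex" in the loaded program, a packed x,y,z,type layout, and that numtyp stands for the library's precision typedef; these names are illustrative:

// Sketch: bind a per-atom position buffer (4 values per atom) to a texture.
// "pos_tex", prog, dev, and nall are illustrative assumptions.
UCL_Texture pos_tex;
pos_tex.get_texture(prog,"pos_tex");  // look up the texture ref in the module
UCL_D_Vec<numtyp> x;
x.alloc(nall*4,dev);
pos_tex.bind_float(x,4);              // float format for float elements;
                                      // int32 formats are chosen for doubles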

View File

@ -158,6 +158,11 @@ class UCL_Device {
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i);
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
/// Returns true if double precision is supported for the current device
bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device


View File

@ -29,11 +29,25 @@
namespace ucl_opencl {
class UCL_Texture;
template <class numtyp> class UCL_D_Vec;
template <class numtyp> class UCL_D_Mat;
template <class hosttype, class devtype> class UCL_Vector;
template <class hosttype, class devtype> class UCL_Matrix;
#define UCL_MAX_KERNEL_ARGS 256
/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
public:
inline UCL_Program() : _init_done(false) {}
inline UCL_Program(UCL_Device &device) : _init_done(false) { init(device); }
inline UCL_Program(UCL_Device &device, const void *program,
const char *flags="", std::string *log=NULL) :
_init_done(false) {
init(device);
load_string(program,flags,log);
}
inline ~UCL_Program() { clear(); }
/// Initialize the program with a device
@ -78,10 +92,10 @@ class UCL_Program {
}
/// Load a program from a string and compile with flags
inline int load_string(const char *program, const char *flags="",
inline int load_string(const void *program, const char *flags="",
std::string *log=NULL) {
cl_int error_flag;
const char *prog=program;
const char *prog=(const char *)program;
_program=clCreateProgramWithSource(_context,1,&prog,NULL,&error_flag);
CL_CHECK_ERR(error_flag);
error_flag = clBuildProgram(_program,1,&_device,flags,NULL,NULL);
@ -159,19 +173,61 @@ class UCL_Kernel {
/** If not a device pointer, this must be repeated each time the argument
* changes **/
template <class dtype>
inline void set_arg(const cl_uint index, dtype *arg) {
inline void set_arg(const cl_uint index, const dtype * const arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
if (index>_num_args) _num_args=index;
}
/// Set a geryon container as a kernel argument.
template <class numtyp>
inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template <class numtyp>
inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); }
/// Set a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); }
/// Add a kernel argument.
template <class dtype>
inline void add_arg(dtype *arg) {
inline void add_arg(const dtype * const arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
_num_args++;
}
/// Add a geryon container as a kernel argument.
template <class numtyp>
inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template <class numtyp>
inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); }
/// Add a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); }
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks, const size_t block_size) {
_dimensions=1;
_num_blocks[0]=num_blocks*block_size;
@ -179,6 +235,15 @@ class UCL_Kernel {
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks, const size_t block_size,
command_queue &cq)
{ _cq=cq; set_size(num_blocks,block_size); }
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y) {
_dimensions=2;
@ -189,6 +254,16 @@ class UCL_Kernel {
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y,
command_queue &cq)
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x,
const size_t block_size_y, const size_t block_size_z) {
@ -202,14 +277,20 @@ class UCL_Kernel {
_block_size[2]=block_size_z;
}
/// Run the kernel in the default command queue
inline void run() {
run(_cq);
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y,
const size_t block_size_z, command_queue &cq) {
_cq=cq;
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
block_size_z);
}
/// Run the kernel in the specified command queue
inline void run(command_queue &cq) {
CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,_kernel,_dimensions,NULL,
/// Run the kernel in the default command queue
inline void run() {
CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL,
_num_blocks,_block_size,0,NULL,NULL));
}
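On the OpenCL side the same argument and launch interface is mirrored, and the new UCL_Program constructor compiles a source string in one step. A sketch under the assumption of headers ocl_device.h/ocl_kernel.h/ocl_mat.h, a kernel named "k_zero" in the source string src, and a dev.cq() accessor for the default queue; all of these names are illustrative, not taken from the commit:

#include <string>
#include "ocl_device.h"
#include "ocl_kernel.h"
#include "ocl_mat.h"

using namespace ucl_opencl;

// Zero an n-element device buffer. Sketch only; see the assumptions above.
void zero_buffer(UCL_Device &dev, const char *src, int n) {
  std::string log;
  UCL_Program prog(dev,src,"-cl-fast-relaxed-math",&log); // compile in one step

  UCL_Kernel k_zero(prog,"k_zero");
  UCL_D_Vec<float> buf;
  buf.alloc(n,dev);

  const int block=64;
  k_zero.set_size((n+block-1)/block,block,dev.cq()); // sizes + queue, then args
  k_zero.add_arg(&buf);
  k_zero.add_arg(&n);
  k_zero.run();   // clEnqueueNDRangeKernel on the stored queue
}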

View File

@ -39,6 +39,9 @@ namespace ucl_opencl {
#include "ucl_h_mat.h"
#include "ucl_d_vec.h"
#include "ucl_d_mat.h"
#include "ucl_s_obj_help.h"
#include "ucl_vector.h"
#include "ucl_matrix.h"
#undef _UCL_DEVICE_PTR_MAT
#undef _OCL_MAT
#undef _UCL_MAT_ALLOW

View File

@ -132,6 +132,37 @@ inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
}
template <class mat_type>
inline int _host_resize(mat_type &mat, const size_t n) {
cl_int error_flag;
cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL));
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
if (mat.kind()==UCL_WRITE_OPTIMIZED) {
mat.cbegin()=clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
CL_MAP_WRITE,0,n,0,NULL,NULL,NULL);
} else {
mat.cbegin()=clCreateBuffer(context,
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
CL_MAP_READ | CL_MAP_WRITE,
0,n,0,NULL,NULL,NULL);
}
return UCL_SUCCESS;
}
// --------------------------------------------------------------------------
// - DEVICE MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
@ -211,6 +242,61 @@ inline void _device_free(mat_type &mat) {
CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t n) {
cl_int error_flag;
cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL));
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
cl_mem_flags flag;
if (mat.kind()==UCL_READ_WRITE)
flag=CL_MEM_READ_WRITE;
else if (mat.kind()==UCL_READ_ONLY)
flag=CL_MEM_READ_ONLY;
else if (mat.kind()==UCL_WRITE_ONLY)
flag=CL_MEM_WRITE_ONLY;
else
assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t rows,
const size_t cols, size_t &pitch) {
size_t padded_cols=cols;
if (cols%256!=0)
padded_cols+=256-cols%256;
pitch=padded_cols*sizeof(typename mat_type::data_type);
cl_int error_flag;
cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL));
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
cl_mem_flags flag;
if (mat.kind()==UCL_READ_WRITE)
flag=CL_MEM_READ_WRITE;
else if (mat.kind()==UCL_READ_ONLY)
flag=CL_MEM_READ_ONLY;
else if (mat.kind()==UCL_WRITE_ONLY)
flag=CL_MEM_WRITE_ONLY;
else
assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
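As a concrete example of the padding above: resizing a double-precision matrix to 1000 columns pads the row to 1024 columns, so pitch = 1024 * sizeof(double) = 8192 bytes, while a request that is already a multiple of 256 columns keeps its natural pitch.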
// --------------------------------------------------------------------------
// - ZERO ROUTINES
// --------------------------------------------------------------------------

View File

@ -828,441 +828,3 @@
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
run();
}
// ---------------------------------------------------------------------------
template <class t1>
inline void run_cq(command_queue &cq, t1 *a1) {
clear_args();
add_arg(a1);
run(cq);
}
template <class t1, class t2>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2) {
clear_args();
add_arg(a1); add_arg(a2);
run(cq);
}
template <class t1, class t2, class t3>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3);
run(cq);
}
template <class t1, class t2, class t3, class t4>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28, class t29>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28, class t29, class t30>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
run(cq);
}

View File

@ -344,6 +344,39 @@ class UCL_D_Mat : public UCL_BaseMat {
inline void clear()
{ _rows=0; if (_kind!=UCL_VIEW) { _kind=UCL_VIEW; _device_free(*this); } }
/// Resize the allocation to contain rows x cols elements
/** \note Cannot be used on views **/
inline int resize(const int rows, const int cols) {
assert(_kind!=UCL_VIEW);
int err=_device_resize(*this,rows,cols,_pitch);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate "
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
UCL_GERYON_EXIT;
#endif
return err;
}
_rows=rows;
_cols=cols;
_row_size=_pitch/sizeof(numtyp);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_row_size*cols;
#endif
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
/// Resize (only if bigger) the allocation to contain rows x cols elements
/** \note Cannot be used on views **/
inline int resize_ib(const int rows, const int cols)
{ if (cols>_cols || rows>_rows) return resize(rows,cols);
else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { _device_zero(*this,row_bytes()*_rows); }
@ -357,9 +390,9 @@ class UCL_D_Mat : public UCL_BaseMat {
inline const device_ptr & begin() const { return _array; }
#else
/// For CUDA-RT, get device pointer to first element
inline numtyp * begin() { return _array; }
inline numtyp * & begin() { return _array; }
/// For CUDA-RT, get device pointer to first element
inline const numtyp * begin() const { return _array; }
inline numtyp * const & begin() const { return _array; }
/// For CUDA-RT, get device pointer to one past last element
inline numtyp * end() { return _end; }
/// For CUDA-RT, get device pointer to one past last element

View File

@ -340,6 +340,39 @@ class UCL_D_Vec : public UCL_BaseMat {
inline void clear()
{ if (_kind!=UCL_VIEW) { _cols=0; _kind=UCL_VIEW; _device_free(*this); } }
/// Resize the allocation to contain cols elements
/** \note Cannot be used on views **/
inline int resize(const int cols) {
assert(_kind!=UCL_VIEW);
_row_bytes=cols*sizeof(numtyp);
int err=_device_resize(*this,_row_bytes);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on device.\n";
_row_bytes=0;
UCL_GERYON_EXIT;
#endif
_row_bytes=0;
return err;
}
_cols=cols;
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+cols;
#endif
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
/// Resize (only if bigger) the allocation to contain cols elements
/** \note Cannot be used on views **/
inline int resize_ib(const int cols)
{ if (cols>_cols) return resize(cols); else return UCL_SUCCESS; }
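resize and resize_ib give the containers in-place growth without the clear-and-realloc dance used previously; note that the underlying _device_resize frees the old buffer first, so existing contents are not preserved across an actual reallocation. A sketch of the intended per-timestep pattern, with illustrative names (dev, nsteps, and the assumed helper list_size):

// Sketch: grow a device buffer only when the needed size exceeds capacity.
// list_size(step) is an assumed helper returning the current element count.
UCL_D_Vec<int> nbor;
nbor.alloc(1024,dev);                   // initial capacity
for (int step=0; step<nsteps; step++) {
  const int n=list_size(step);
  if (nbor.resize_ib(n)!=UCL_SUCCESS)   // reallocates only when n exceeds
    break;                              //   the current column count
  // ... fill and use the first n elements of nbor ...
}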
/// Set each element to zero
inline void zero() { _device_zero(*this,row_bytes()); }
@ -353,13 +386,13 @@ class UCL_D_Vec : public UCL_BaseMat {
inline const device_ptr & begin() const { return _array; }
#else
/// For CUDA-RT, get device pointer to first element
inline numtyp * begin() { return _array; }
inline numtyp * & begin() { return _array; }
/// For CUDA-RT, get device pointer to first element
inline const numtyp * begin() const { return _array; }
inline numtyp * const & begin() const { return _array; }
/// For CUDA-RT, get device pointer to one past last element
inline numtyp * end() { return _end; }
/// For CUDA-RT, get device pointer to one past last element
inline const numtyp * end() const { return _end; }
inline numtyp * end() const { return _end; }
#endif
#ifdef _UCL_DEVICE_PTR_MAT

View File

@ -318,6 +318,36 @@ class UCL_H_Mat : public UCL_BaseMat {
inline void clear()
{ if (_kind!=UCL_VIEW) {_rows=0; _kind=UCL_VIEW; _host_free(*this,_kind); }}
/// Resize the allocation to rows x cols elements
/** \note Cannot be used on views **/
inline int resize(const int rows, const int cols) {
assert(_kind!=UCL_VIEW);
_row_bytes=cols*sizeof(numtyp);
int err=_host_resize(*this,_row_bytes*rows);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
<< " bytes on host.\n";
_row_bytes=0;
UCL_GERYON_EXIT;
#endif
_row_bytes=0;
return err;
}
_cols=cols;
_rows=rows;
_end=_array+rows*cols;
return err;
}
/// Resize (only if bigger) the allocation to contain rows x cols elements
/** \note Cannot be used on views **/
inline int resize_ib(const int rows, const int cols)
{ if (cols>_cols || rows>_rows) return resize(rows,cols);
else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { _host_zero(_array,_rows*row_bytes()); }
/// Set first n elements to zero

View File

@ -316,6 +316,34 @@ class UCL_H_Vec : public UCL_BaseMat {
inline void clear()
{ if (_kind!=UCL_VIEW) {_kind=UCL_VIEW; _cols=0; _host_free(*this,_kind);}}
/// Resize the allocation to contain cols elements
/** \note Cannot be used on views **/
inline int resize(const int cols) {
assert(_kind!=UCL_VIEW);
_row_bytes=cols*sizeof(numtyp);
int err=_host_resize(*this,_row_bytes);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on host.\n";
_row_bytes=0;
UCL_GERYON_EXIT;
#endif
_row_bytes=0;
return err;
}
_cols=cols;
_end=_array+cols;
return err;
}
/// Resize (only if bigger) the allocation to contain cols elements
/** \note Cannot be used on views **/
inline int resize_ib(const int cols)
{ if (cols>_cols) return resize(cols); else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { _host_zero(_array,row_bytes()); }

View File

@ -270,4 +270,13 @@ template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_D_Mat<numtyp> &mat)
{ ucl_print(mat,out); return out; }
template <class t1, class t2>
inline std::ostream & operator << (std::ostream &out, UCL_Vector<t1,t2> &mat)
{ ucl_print(mat.host,out); return out; }
template <class t1, class t2>
inline std::ostream & operator << (std::ostream &out, UCL_Matrix<t1,t2> &mat)
{ ucl_print(mat.host,out); return out; }
#endif
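The added stream operators print the host half of the paired containers, which is convenient for spot checks after a device-to-host update. A tiny sketch (the alloc/update_host signatures follow their use elsewhere in this patch; q, dev, and nall are illustrative):

UCL_Vector<double,double> q;
q.alloc(nall,dev,UCL_RW_OPTIMIZED,UCL_READ_WRITE);
// ... a kernel fills q.device ...
q.update_host(nall,false);     // blocking copy of nall elements to the host
std::cout << q << "\n";        // streams the host copy through ucl_print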

View File

@ -117,5 +117,61 @@ enum UCL_ERROR_FLAG {
template <class numtyp>
const char * ucl_template_name() { return _UCL_DATA_ID<numtyp>::name(); }
template <class t1, class t2> struct ucl_same_type;
template <> struct ucl_same_type<bool,bool> { enum { ans=1 }; };
template <> struct ucl_same_type<char,char> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned char,unsigned char> { enum { ans=1 }; };
template <> struct ucl_same_type<int,int> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned,unsigned> { enum { ans=1 }; };
template <> struct ucl_same_type<short,short> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned short,unsigned short> { enum { ans=1 }; };
template <> struct ucl_same_type<long,long> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned long,unsigned long> { enum { ans=1 }; };
template <> struct ucl_same_type<float,float> { enum { ans=1 }; };
template <> struct ucl_same_type<double,double> { enum { ans=1 }; };
template <> struct ucl_same_type<long double,long double> { enum { ans=1 }; };
template <> struct ucl_same_type<const bool,bool> { enum { ans=1 }; };
template <> struct ucl_same_type<const char,char> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned char,unsigned char> { enum { ans=1 }; };
template <> struct ucl_same_type<const int,int> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned,unsigned> { enum { ans=1 }; };
template <> struct ucl_same_type<const short,short> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned short,unsigned short> { enum { ans=1 }; };
template <> struct ucl_same_type<const long,long> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned long,unsigned long> { enum { ans=1 }; };
template <> struct ucl_same_type<const float,float> { enum { ans=1 }; };
template <> struct ucl_same_type<const double,double> { enum { ans=1 }; };
template <> struct ucl_same_type<const long double,long double> { enum { ans=1 }; };
template <> struct ucl_same_type<bool,const bool> { enum { ans=1 }; };
template <> struct ucl_same_type<char,const char> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned char,const unsigned char> { enum { ans=1 }; };
template <> struct ucl_same_type<int,const int> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned,const unsigned> { enum { ans=1 }; };
template <> struct ucl_same_type<short,const short> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned short,const unsigned short> { enum { ans=1 }; };
template <> struct ucl_same_type<long,const long> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned long,const unsigned long> { enum { ans=1 }; };
template <> struct ucl_same_type<float,const float> { enum { ans=1 }; };
template <> struct ucl_same_type<double,const double> { enum { ans=1 }; };
template <> struct ucl_same_type<long double,const long double> { enum { ans=1 }; };
template <> struct ucl_same_type<const bool,const bool> { enum { ans=1 }; };
template <> struct ucl_same_type<const char,const char> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned char,const unsigned char> { enum { ans=1 }; };
template <> struct ucl_same_type<const int,const int> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned,const unsigned> { enum { ans=1 }; };
template <> struct ucl_same_type<const short,const short> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned short,const unsigned short> { enum { ans=1 }; };
template <> struct ucl_same_type<const long,const long> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned long,const unsigned long> { enum { ans=1 }; };
template <> struct ucl_same_type<const float,const float> { enum { ans=1 }; };
template <> struct ucl_same_type<const double,const double> { enum { ans=1 }; };
template <> struct ucl_same_type<const long double,const long double> { enum { ans=1 }; };
template <class t1, class t2> struct ucl_same_type { enum { ans=0 }; };
#endif
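ucl_same_type is a compile-time trait: ans is 1 when the two types match (with or without const qualifiers) and 0 otherwise, which lets precision-dependent branches be written inline instead of specializing whole classes. A minimal sketch of the usual pattern:

// Sketch: branch on the element type at compile time.
template <class numtyp>
inline const char * precision_tag() {
  return ucl_same_type<numtyp,double>::ans ? "double" : "single/other";
}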

View File

@ -39,30 +39,16 @@ bool AnswerT::alloc(const int inum) {
bool success=true;
int ans_elements=4;
_ans_fields=4;
if (_rot)
ans_elements+=4;
// Ignore host/device transfers?
bool cpuview=false;
if (dev->device_type()==UCL_CPU)
cpuview=true;
// -------------------------- Host allocations
success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
_ans_fields+=4;
// --------------------------- Device allocations
if (cpuview) {
dev_engv.view(host_engv);
dev_ans.view(host_ans);
} else {
success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_RW_OPTIMIZED,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (dev_ans.alloc(ans_elements*_max_local,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
}
_gpu_bytes=dev_engv.row_bytes()+dev_ans.row_bytes();
success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_RW_OPTIMIZED,
UCL_WRITE_ONLY)==UCL_SUCCESS);
_gpu_bytes=engv.device.row_bytes()+force.device.row_bytes();
_allocated=true;
return success;
@ -114,32 +100,24 @@ bool AnswerT::add_fields(const bool charge, const bool rot) {
if (realloc) {
_other=_charge || _rot;
int inum=_max_local;
clear_resize();
force.clear();
engv.clear();
_allocated=false;
return alloc(inum);
}
return true;
}
template <class numtyp, class acctyp>
void AnswerT::clear_resize() {
if (!_allocated)
return;
_allocated=false;
dev_ans.clear();
dev_engv.clear();
host_ans.clear();
host_engv.clear();
}
template <class numtyp, class acctyp>
void AnswerT::clear() {
_gpu_bytes=0;
if (!_allocated)
return;
_allocated=false;
force.clear();
engv.clear();
time_answer.clear();
clear_resize();
_inum=0;
_ilist=NULL;
_eflag=false;
@ -174,11 +152,11 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
csize-=6;
if (csize>0)
ucl_copy(host_engv,dev_engv,_inum*csize,true);
engv.update_host(_inum*csize,true);
if (_rot)
ucl_copy(host_ans,dev_ans,_inum*4*2,true);
force.update_host(_inum*4*2,true);
else
ucl_copy(host_ans,dev_ans,_inum*4,true);
force.update_host(_inum*4,true);
time_answer.stop();
}
@ -201,28 +179,28 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=0; i<6; i++) virial_acc[i]=0.0;
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int al=i;
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
evdwl+=engv[al];
eatom[i]+=engv[al]*0.5;
al+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
evdwl+=engv[al];
al+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
vatom[i][j]+=engv[al]*0.5;
virial_acc[j]+=engv[al];
al+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
virial_acc[j]+=engv[al];
al+=_inum;
}
}
}
@ -231,29 +209,29 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
virial[j]+=virial_acc[j]*0.5;
} else {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int al=i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
evdwl+=engv[al];
eatom[ii]+=engv[al]*0.5;
al+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
evdwl+=engv[al];
al+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
vatom[ii][j]+=engv[al]*0.5;
virial_acc[j]+=engv[al];
al+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
virial_acc[j]+=engv[al];
al+=_inum;
}
}
}
@ -281,33 +259,33 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=0; i<6; i++) virial_acc[i]=0.0;
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int al=i;
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
_ecoul+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
evdwl+=engv[al];
eatom[i]+=engv[al]*0.5;
al+=_inum;
_ecoul+=engv[al];
eatom[i]+=engv[al]*0.5;
al+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
_ecoul+=*ap;
ap+=_inum;
evdwl+=engv[al];
al+=_inum;
_ecoul+=engv[al];
al+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
vatom[i][j]+=engv[al]*0.5;
virial_acc[j]+=engv[al];
al+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
virial_acc[j]+=engv[al];
al+=_inum;
}
}
}
@ -316,34 +294,34 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
virial[j]+=virial_acc[j]*0.5;
} else {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int al=i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
_ecoul+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
evdwl+=engv[al];
eatom[ii]+=engv[al]*0.5;
al+=_inum;
_ecoul+=engv[al];
eatom[ii]+=engv[al]*0.5;
al+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
_ecoul+=*ap;
ap+=_inum;
evdwl+=engv[al];
al+=_inum;
_ecoul+=engv[al];
al+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
vatom[ii][j]+=engv[al]*0.5;
virial_acc[j]+=engv[al];
al+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
virial_acc[j]+=engv[al];
al+=_inum;
}
}
}
@ -359,45 +337,37 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
template <class numtyp, class acctyp>
void AnswerT::get_answers(double **f, double **tor) {
acctyp *ap=host_ans.begin();
int fl=0;
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
f[i][0]+=*ap;
ap++;
f[i][1]+=*ap;
ap++;
f[i][2]+=*ap;
ap+=2;
f[i][0]+=force[fl];
f[i][1]+=force[fl+1];
f[i][2]+=force[fl+2];
fl+=4;
}
if (_rot) {
for (int i=0; i<_inum; i++) {
tor[i][0]+=*ap;
ap++;
tor[i][1]+=*ap;
ap++;
tor[i][2]+=*ap;
ap+=2;
tor[i][0]+=force[fl];
tor[i][1]+=force[fl+1];
tor[i][2]+=force[fl+2];
fl+=4;
}
}
} else {
for (int i=0; i<_inum; i++) {
int ii=_ilist[i];
f[ii][0]+=*ap;
ap++;
f[ii][1]+=*ap;
ap++;
f[ii][2]+=*ap;
ap+=2;
f[ii][0]+=force[fl];
f[ii][1]+=force[fl+1];
f[ii][2]+=force[fl+2];
fl+=4;
}
if (_rot) {
for (int i=0; i<_inum; i++) {
int ii=_ilist[i];
tor[ii][0]+=*ap;
ap++;
tor[ii][1]+=*ap;
ap++;
tor[ii][2]+=*ap;
ap+=2;
tor[ii][0]+=force[fl];
tor[ii][1]+=force[fl+1];
tor[ii][2]+=force[fl+2];
fl+=4;
}
}
}
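The Answer refactoring above replaces each dev_*/host_* pair with one UCL_Vector whose .host and .device halves are kept in sync with update_host, and whose operator[] indexes the host copy, as the engv[al] loops show. The pattern, reduced to a sketch with illustrative names (dev, fields, nlocal):

// Sketch of the paired host/device container pattern used above.
// Allocation kinds mirror the calls in Answer::alloc; names are illustrative.
UCL_Vector<double,double> engv;
engv.alloc(fields*nlocal,dev,UCL_RW_OPTIMIZED,UCL_WRITE_ONLY);

// ... a kernel accumulates per-atom energy/virial into engv.device ...

engv.update_host(fields*nlocal,false);  // blocking copy device -> host
double evdwl=0.0;
for (int i=0; i<nlocal; i++)
  evdwl+=engv[i];                       // operator[] reads the host buffer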

View File

@ -19,18 +19,18 @@
#include <math.h>
#include "mpi.h"
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
using namespace ucl_opencl;
#elif defined(USE_CUDART)
#include "geryon/nvc_timer.h"
#include "geryon/nvc_mat.h"
using namespace ucl_cudart;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
using namespace ucl_cudadr;
#endif
#include "lal_precision.h"
@ -59,8 +59,10 @@ class Answer {
inline void resize(const int inum, bool &success) {
_inum=inum;
if (inum>_max_local) {
clear_resize();
success = success && alloc(inum);
_max_local=static_cast<int>(static_cast<double>(inum)*1.10);
success=success && (force.resize(_max_local*_ans_fields)==UCL_SUCCESS);
success=success && (engv.resize(_max_local*_ev_fields)==UCL_SUCCESS);
_gpu_bytes=engv.device.row_bytes()+force.device.row_bytes();
}
}
@ -68,9 +70,6 @@ class Answer {
/** \param rot True if atom storage needs quaternions **/
bool add_fields(const bool charge, const bool rot);
/// Free all memory on host and device needed to realloc for more atoms
void clear_resize();
/// Free all memory on host and device
void clear();
@ -136,14 +135,9 @@ class Answer {
// ------------------------------ DATA ----------------------------------
/// Force and possibly torque
UCL_D_Vec<acctyp> dev_ans;
UCL_Vector<acctyp,acctyp> force;
/// Energy and virial per-atom storage
UCL_D_Vec<acctyp> dev_engv;
/// Force and possibly torque data on host
UCL_H_Vec<acctyp> host_ans;
/// Energy/virial data on host
UCL_H_Vec<acctyp> host_engv;
UCL_Vector<acctyp,acctyp> engv;
/// Device timers
UCL_Timer time_answer;
@ -155,7 +149,7 @@ class Answer {
bool alloc(const int inum);
bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
int _max_local, _inum, _e_fields, _ev_fields;
int _max_local, _inum, _e_fields, _ev_fields, _ans_fields;
int *_ilist;
double _time_cast, _time_cpu_idle;

View File

@ -51,9 +51,13 @@ bool AtomT::alloc(const int nall) {
bool success=true;
// Ignore host/device transfers?
bool cpuview=false;
if (dev->device_type()==UCL_CPU)
cpuview=true;
_host_view=false;
if (dev->shared_memory()) {
_host_view=true;
#ifdef GPU_CAST
assert(0==1);
#endif
}
// Allocate storage for CUDPP sort
#ifdef USE_CUDPP
@ -64,63 +68,101 @@ bool AtomT::alloc(const int nall) {
}
#endif
// -------------------------- Host allocations
// Get a host write only buffer
#ifdef GPU_CAST
success=success && (host_x_cast.alloc(_max_atoms*3,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
success=success && (host_type_cast.alloc(_max_atoms,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
#else
success=success && (host_x.alloc(_max_atoms*4,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
#endif
// Buffer for casting only if different precisions
if (_charge)
success=success && (host_q.alloc(_max_atoms,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
// Buffer for casting only if different precisions
if (_rot)
success=success && (host_quat.alloc(_max_atoms*4,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
// --------------------------- Device allocations
int gpu_bytes=0;
if (cpuview) {
#ifdef GPU_CAST
assert(0==1);
#else
dev_x.view(host_x);
#endif
if (_rot)
dev_quat.view(host_quat);
if (_charge)
dev_q.view(host_q);
} else {
#ifdef GPU_CAST
success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev));
success=success && (UCL_SUCCESS==
dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
success=success && (UCL_SUCCESS==
dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
#else
success=success && (UCL_SUCCESS==
dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
#endif
if (_charge) {
success=success && (dev_q.alloc(_max_atoms,*dev,
success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_q.row_bytes();
}
if (_rot) {
success=success && (dev_quat.alloc(_max_atoms*4,*dev,
#ifdef GPU_CAST
success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)==
UCL_SUCCESS);
success=success && (type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)==
UCL_SUCCESS);
gpu_bytes+=x_cast.device.row_bytes()+type_cast.device.row_bytes();
#endif
if (_charge && _host_view==false) {
success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_quat.row_bytes();
gpu_bytes+=q.device.row_bytes();
}
if (_rot && _host_view==false) {
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=quat.device.row_bytes();
}
if (_gpu_nbor>0) {
if (_bonds) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_tag.row_bytes();
}
if (_gpu_nbor==1) {
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_cell_id.row_bytes();
} else {
success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success &&
(host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
}
if (_gpu_nbor==2 && _host_view)
dev_particle_id.view(host_particle_id);
else
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_particle_id.row_bytes();
}
gpu_bytes+=x.device.row_bytes();
if (gpu_bytes>_max_gpu_bytes)
_max_gpu_bytes=gpu_bytes;
_allocated=true;
return success;
}
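The reworked alloc() above replaces the separate host_*/dev_* buffer pairs with single containers (x, q, quat) that carry both a host and a device allocation, and adds a _host_view flag so that devices sharing host memory can view the host buffer instead of allocating a device copy. A minimal standalone sketch of that pattern, using illustrative names (HostDevVec, update_device) rather than the actual UCL_Vector interface:

```cpp
// Minimal sketch of a paired host/device buffer with an optional zero-copy
// "host view" mode; HostDevVec and its members are illustrative stand-ins,
// not the Geryon UCL_Vector API.
#include <cstddef>
#include <cstring>
#include <vector>

template <class T>
struct HostDevVec {
  std::vector<T> host;      // write-optimized host staging buffer
  std::vector<T> device;    // stands in for the device-side allocation
  bool host_view = false;   // true when the device addresses host memory

  bool alloc(std::size_t n, bool shared_memory) {
    host.resize(n);
    host_view = shared_memory;
    if (!host_view) device.resize(n);  // only spend device memory when needed
    return true;
  }

  // Copy host -> device unless the device already views the host buffer.
  void update_device(std::size_t n) {
    if (!host_view) std::memcpy(device.data(), host.data(), n * sizeof(T));
  }
};

int main() {
  HostDevVec<float> x;
  x.alloc(4 * 16, /*shared_memory=*/false);  // 16 atoms, x/y/z/type interleaved
  x.update_device(4 * 16);
  return 0;
}
```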
template <class numtyp, class acctyp>
bool AtomT::add_fields(const bool charge, const bool rot,
const int gpu_nbor, const bool bonds) {
bool success=true;
// Ignore host/device transfers?
int gpu_bytes=0;
if (charge && _charge==false) {
_charge=true;
_other=true;
if (_host_view==false) {
success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=q.device.row_bytes();
}
}
if (rot && _rot==false) {
_rot=true;
_other=true;
if (_host_view==false) {
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=quat.device.row_bytes();
}
}
if (bonds && _bonds==false) {
_bonds=true;
if (_bonds && _gpu_nbor>0) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_tag.row_bytes();
}
}
if (gpu_nbor>0 && _gpu_nbor==0) {
_gpu_nbor=gpu_nbor;
#ifdef USE_CUDPP
if (_gpu_nbor==1) {
CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
if (CUDPP_SUCCESS != result)
return false;
}
#endif
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_particle_id.row_bytes();
if (_bonds) {
@ -137,43 +179,9 @@ bool AtomT::alloc(const int nall) {
}
}
gpu_bytes+=dev_x.row_bytes();
if (gpu_bytes>_max_gpu_bytes)
_max_gpu_bytes=gpu_bytes;
_allocated=true;
return success;
}
template <class numtyp, class acctyp>
bool AtomT::add_fields(const bool charge, const bool rot,
const int gpu_nbor, const bool bonds) {
bool realloc=false;
if (charge && _charge==false) {
_charge=true;
realloc=true;
}
if (rot && _rot==false) {
_rot=true;
realloc=true;
}
if (gpu_nbor>0 && _gpu_nbor==0) {
_gpu_nbor=gpu_nbor;
realloc=true;
}
if (bonds && _bonds==false) {
_bonds=true;
realloc=true;
}
if (realloc) {
_other=_charge || _rot;
int max_atoms=_max_atoms;
clear_resize();
return alloc(max_atoms);
}
return true;
}
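For comparison with the removed block above, the new add_fields() earlier in this hunk grows storage incrementally: only the arrays belonging to newly requested fields are allocated, instead of clearing and reallocating everything through clear_resize()/alloc(). A compact sketch of that pattern, with illustrative names:

```cpp
// Sketch of incremental field allocation: grow only what was newly requested
// and leave existing per-atom arrays untouched. Names are illustrative.
#include <vector>

struct AtomBuffers {
  int max_atoms = 0;
  bool has_charge = false, has_quat = false;
  std::vector<float> q, quat;

  bool add_fields(bool charge, bool rot) {
    if (charge && !has_charge) {   // charge array is the only new request
      has_charge = true;
      q.resize(max_atoms);
    }
    if (rot && !has_quat) {        // quaternion array is the only new request
      has_quat = true;
      quat.resize(4 * max_atoms);
    }
    return true;
  }
};

int main() {
  AtomBuffers a;
  a.max_atoms = 1000;
  a.add_fields(/*charge=*/true, /*rot=*/false);
  a.add_fields(/*charge=*/true, /*rot=*/true);  // second call only adds quat
  return 0;
}
```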
template <class numtyp, class acctyp>
bool AtomT::init(const int nall, const bool charge, const bool rot,
UCL_Device &devi, const int gpu_nbor, const bool bonds) {
@ -219,27 +227,18 @@ void AtomT::clear_resize() {
return;
_allocated=false;
dev_x.clear();
if (_charge) {
dev_q.clear();
host_q.clear();
}
if (_rot) {
dev_quat.clear();
host_quat.clear();
}
#ifndef GPU_CAST
host_x.clear();
#else
host_x_cast.clear();
host_type_cast.clear();
#endif
x.clear();
if (_charge)
q.clear();
if (_rot)
quat.clear();
dev_cell_id.clear();
dev_particle_id.clear();
dev_tag.clear();
#ifdef GPU_CAST
dev_x_cast.clear();
dev_type_cast.clear();
x_cast.clear();
type_cast.clear();
#endif
#ifdef USE_CUDPP
@ -279,8 +278,7 @@ double AtomT::host_memory_usage() const {
atom_bytes+=1;
if (_rot)
atom_bytes+=4;
return _max_atoms*atom_bytes*sizeof(numtyp)+
sizeof(Atom<numtyp,acctyp>);
return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
}
// Sort arrays for neighbor list calculation
@ -292,16 +290,18 @@ void AtomT::sort_neighbor(const int num_atoms) {
8*sizeof(unsigned), num_atoms);
if (CUDPP_SUCCESS != result) {
printf("Error in cudppSort\n");
NVD_GERYON_EXIT;
UCL_GERYON_EXIT;
}
#endif
}
#ifdef GPU_CAST
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "atom_cl.h"
#elif defined(USE_CUDART)
const char *atom=0;
#else
#include "atom_ptx.h"
#include "atom_cubin.h"
#endif
template <class numtyp, class acctyp>
@ -316,3 +316,4 @@ void AtomT::compile_kernels(UCL_Device &dev) {
#endif
template class Atom<PRECISION,ACC_PRECISION>;

View File

@ -19,20 +19,21 @@
#include <math.h>
#include "mpi.h"
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
using namespace ucl_opencl;
#elif defined(USE_CUDART)
#include "geryon/nvc_timer.h"
#include "geryon/nvc_mat.h"
#include "geryon/nvc_kernel.h"
using namespace ucl_cudart;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
using namespace ucl_cudadr;
#endif
#ifdef USE_CUDPP
@ -92,7 +93,7 @@ class Atom {
bool charge() { return _charge; }
/// Returns true if GPU is using quaternions
bool quat() { return _rot; }
bool quaternion() { return _rot; }
/// Only free matrices of length inum or nall for resizing
void clear_resize();
@ -251,16 +252,13 @@ class Atom {
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
#else
numtyp *_write_loc=host_x.begin();
int wl=0;
for (int i=0; i<_nall; i++) {
*_write_loc=host_ptr[i][0];
_write_loc++;
*_write_loc=host_ptr[i][1];
_write_loc++;
*_write_loc=host_ptr[i][2];
_write_loc++;
*_write_loc=host_type[i];
_write_loc++;
x[wl]=host_ptr[i][0];
x[wl+1]=host_ptr[i][1];
x[wl+2]=host_ptr[i][2];
x[wl+3]=host_type[i];
wl+=4;
}
#endif
_time_cast+=MPI_Wtime()-t;
@ -273,15 +271,14 @@ class Atom {
time_pos.start();
if (_x_avail==false) {
#ifdef GPU_CAST
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
ucl_copy(dev_type_cast,host_type_cast,_nall,true);
x_cast.update_device(_nall*3,true);
type_cast.update_device(_nall,true);
int block_size=64;
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
k_cast_x.set_size(GX,block_size);
k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
&_nall);
k_cast_x.run(&x, &x_cast, &type_cast, &_nall);
#else
ucl_copy(dev_x,host_x,_nall*4,true);
x.update_device(_nall*4,true);
#endif
_x_avail=true;
}
@ -299,18 +296,14 @@ class Atom {
inline void cast_q_data(cpytyp *host_ptr) {
if (_q_avail==false) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_q.view((numtyp*)host_ptr,_nall,*dev);
dev_q.view(host_q);
} else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
// If double precision, still memcpy for async transfers
if (_host_view) {
q.host.view((numtyp*)host_ptr,_nall,*dev);
q.device.view(q.host);
} else if (sizeof(numtyp)==sizeof(double))
memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp));
else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
}
for (int i=0; i<_nall; i++) q[i]=host_ptr[i];
_time_cast+=MPI_Wtime()-t;
}
}
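The rewritten cast_q_data() above chooses between three upload paths: a zero-copy view when the device addresses host memory, a memcpy into the staging buffer when host and device precision match (so the later transfer can run asynchronously), and an element-wise cast otherwise. A standalone sketch of that selection, with plain containers standing in for the UCL buffers:

```cpp
// Sketch of the three charge-upload paths; std::vector stands in for the
// pinned host buffer, and the host_view branch is reduced to a no-op since
// viewing a host pointer from the device is library-specific.
#include <cstring>
#include <vector>

template <class numtyp, class cpytyp>
void cast_charges(std::vector<numtyp> &q_host, const cpytyp *host_ptr,
                  int nall, bool host_view) {
  if (host_view)
    return;                         // device reads host memory directly
  if (sizeof(numtyp) == sizeof(cpytyp))
    // Same precision: still copy into the staging buffer so the later
    // host->device transfer can be issued asynchronously.
    std::memcpy(q_host.data(), host_ptr, nall * sizeof(numtyp));
  else
    for (int i = 0; i < nall; i++)  // mixed precision: cast element-wise
      q_host[i] = static_cast<numtyp>(host_ptr[i]);
}

int main() {
  std::vector<float> q(3);
  const double raw[3] = {1.0, -1.0, 0.5};
  cast_charges(q, raw, 3, /*host_view=*/false);  // takes the casting branch
  return 0;
}
```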
@ -318,7 +311,7 @@ class Atom {
// Copy charges to device asynchronously
inline void add_q_data() {
if (_q_avail==false) {
ucl_copy(dev_q,host_q,_nall,true);
q.update_device(_nall,true);
_q_avail=true;
}
}
@ -328,18 +321,13 @@ class Atom {
inline void cast_quat_data(cpytyp *host_ptr) {
if (_quat_avail==false) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
dev_quat.view(host_quat);
} else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
if (_host_view) {
quat.host.view((numtyp*)host_ptr,_nall*4,*dev);
quat.device.view(quat.host);
} else if (sizeof(numtyp)==sizeof(double))
memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
}
for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
_time_cast+=MPI_Wtime()-t;
}
}
@ -348,7 +336,7 @@ class Atom {
/** Copies nall()*4 elements **/
inline void add_quat_data() {
if (_quat_avail==false) {
ucl_copy(dev_quat,host_quat,_nall*4,true);
quat.update_device(_nall*4,true);
_quat_avail=true;
}
}
@ -363,29 +351,23 @@ class Atom {
inline double max_gpu_bytes()
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }
/// Returns true if the device is addressing memory on the host
inline bool host_view() { return _host_view; }
// ------------------------------ DATA ----------------------------------
/// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type)
UCL_D_Vec<numtyp> dev_x;
UCL_Vector<numtyp,numtyp> x;
/// Charges
UCL_D_Vec<numtyp> dev_q;
UCL_Vector<numtyp,numtyp> q;
/// Quaternions
UCL_D_Vec<numtyp> dev_quat;
UCL_Vector<numtyp,numtyp> quat;
#ifdef GPU_CAST
UCL_D_Vec<double> dev_x_cast;
UCL_D_Vec<int> dev_type_cast;
UCL_H_Vec<double> host_x_cast;
UCL_H_Vec<int> host_type_cast;
UCL_Vector<double,double> x_cast;
UCL_Vector<int,int> type_cast;
#endif
/// Buffer for moving positions to device
UCL_H_Vec<numtyp> host_x;
/// Buffer for moving charge data to GPU
UCL_H_Vec<numtyp> host_q;
/// Buffer for moving quat data to GPU
UCL_H_Vec<numtyp> host_quat;
/// Cell list identifiers for device nbor builds
UCL_D_Vec<unsigned> dev_cell_id;
/// Cell list identifiers for device nbor builds
@ -418,9 +400,9 @@ class Atom {
bool alloc(const int nall);
bool _allocated, _rot, _charge, _other;
bool _allocated, _rot, _charge, _bonds, _other;
int _max_atoms, _nall, _gpu_nbor;
bool _bonds;
bool _host_view;
double _time_cast, _time_transfer;
double _max_gpu_bytes;
@ -434,3 +416,4 @@ class Atom {
}
#endif

View File

@ -41,9 +41,9 @@ int BaseAtomicT::bytes_per_atom_atomic(const int max_nbors) const {
template <class numtyp, class acctyp>
int BaseAtomicT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_name) {
screen=_screen;
int gpu_nbor=0;
@ -74,7 +74,7 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
atom=&device->atom;
_block_size=device->pair_block_size();
compile_kernels(*ucl_device,pair_program);
compile_kernels(*ucl_device,pair_program,k_name);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
@ -83,7 +83,7 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->dev_x,4);
pos_tex.bind_float(atom->x,4);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
@ -266,18 +266,20 @@ double BaseAtomicT::host_memory_usage_atomic() const {
}
template <class numtyp, class acctyp>
void BaseAtomicT::compile_kernels(UCL_Device &dev, const char *pair_str) {
void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *kname) {
if (_compiled)
return;
std::string s_fast=std::string(kname)+"_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
k_pair.set_function(*pair_program,"kernel_pair");
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");
_compiled=true;
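compile_kernels() now receives the kernel name from the pair style and derives the name of the "_fast" variant from it, so every pair style can carry uniquely named kernels inside a single program object. A trivial standalone check of the naming scheme used above:

```cpp
// The "_fast" kernel name is derived from the base name passed by the
// pair style (e.g. "k_buck" elsewhere in this commit).
#include <cassert>
#include <string>

int main() {
  const char *kname = "k_buck";                 // supplied by the pair style
  std::string s_fast = std::string(kname) + "_fast";
  assert(s_fast == "k_buck_fast");              // looked up next to "k_buck"
  return 0;
}
```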

View File

@ -20,8 +20,10 @@
#include "lal_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -38,6 +40,7 @@ class BaseAtomic {
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param k_name name of the kernel used for the force calculation
*
* Returns:
* - 0 if successful
@ -48,7 +51,7 @@ class BaseAtomic {
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
const void *pair_program, const char *k_name);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
@ -57,7 +60,7 @@ class BaseAtomic {
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success))
pos_tex.bind_float(atom->dev_x,4);
pos_tex.bind_float(atom->x,4);
ans->resize(inum,success);
}
@ -188,7 +191,7 @@ class BaseAtomic {
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const char *pair_string);
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};

View File

@ -42,9 +42,9 @@ int BaseChargeT::bytes_per_atom_atomic(const int max_nbors) const {
template <class numtyp, class acctyp>
int BaseChargeT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_name) {
screen=_screen;
int gpu_nbor=0;
@ -76,7 +76,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program);
compile_kernels(*ucl_device,pair_program,k_name);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
@ -85,8 +85,8 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
@ -282,18 +282,20 @@ double BaseChargeT::host_memory_usage_atomic() const {
}
template <class numtyp, class acctyp>
void BaseChargeT::compile_kernels(UCL_Device &dev, const char *pair_str) {
void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *kname) {
if (_compiled)
return;
std::string s_fast=std::string(kname)+"_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
k_pair.set_function(*pair_program,"kernel_pair");
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex");

View File

@ -21,8 +21,10 @@
#include "lal_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -39,6 +41,7 @@ class BaseCharge {
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param k_name name of the kernel used for the force calculation
*
* Returns:
* - 0 if successful
@ -49,7 +52,7 @@ class BaseCharge {
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
const void *pair_program, const char *k_name);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
@ -58,8 +61,8 @@ class BaseCharge {
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
}
ans->resize(inum,success);
}
@ -187,7 +190,7 @@ class BaseCharge {
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const char *pair_string);
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};

View File

@ -17,10 +17,12 @@
#include <cstdlib>
using namespace LAMMPS_AL;
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "ellipsoid_nbor_cl.h"
#elif defined(USE_CUDART)
const char *ellipsoid_nbor=0;
#else
#include "ellipsoid_nbor_ptx.h"
#include "ellipsoid_nbor_cubin.h"
#endif
#define BaseEllipsoidT BaseEllipsoid<numtyp, acctyp>
@ -50,8 +52,9 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split,
FILE *_screen, const int ntypes, int **h_form,
const char *ellipsoid_program,
const char *lj_program, const bool ellip_sphere) {
const void *ellipsoid_program,
const void *lj_program, const char *k_name,
const bool ellip_sphere) {
screen=_screen;
_ellipsoid_sphere=ellip_sphere;
@ -78,7 +81,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
atom=&device->atom;
_block_size=device->pair_block_size();
compile_kernels(*ucl_device,ellipsoid_program,lj_program,ellip_sphere);
compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
@ -112,7 +115,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
}
if (_multiple_forms)
ans->dev_ans.zero();
ans->force.zero();
// Memory for ilist ordered by particle type
if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
@ -121,6 +124,12 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
neigh_tex.bind_float(atom->x,4);
pos_tex.bind_float(atom->x,4);
quat_tex.bind_float(atom->quat,4);
lj_pos_tex.bind_float(atom->x,4);
lj_quat_tex.bind_float(atom->quat,4);
return 0;
}
@ -241,14 +250,12 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
int stride=nbor->nbor_pitch();
if (shared_types) {
k_nbor_fast.set_size(GX,BX);
k_nbor_fast.run(&atom->dev_x.begin(), &cut_form.begin(),
&nbor->dev_nbor.begin(), &stride, &start, &inum,
&nbor->dev_packed.begin(), &form_low, &form_high);
k_nbor_fast.run(&atom->x, &cut_form, &nbor->dev_nbor, &stride, &start,
&inum, &nbor->dev_packed, &form_low, &form_high);
} else {
k_nbor.set_size(GX,BX);
k_nbor.run(&atom->dev_x.begin(), &cut_form.begin(), &ntypes,
&nbor->dev_nbor.begin(), &stride, &start, &inum,
&nbor->dev_packed.begin(), &form_low, &form_high);
k_nbor.run(&atom->x, &cut_form, &ntypes, &nbor->dev_nbor, &stride,
&start, &inum, &nbor->dev_packed, &form_low, &form_high);
}
}
@ -437,11 +444,18 @@ double BaseEllipsoidT::host_memory_usage_base() const {
template <class numtyp, class acctyp>
void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
const char *ellipsoid_string,
const char *lj_string, const bool e_s) {
const void *ellipsoid_string,
const void *lj_string,
const char *kname, const bool e_s) {
if (_compiled)
return;
std::string kns=kname;
std::string s_sphere_ellipsoid=kns+"_sphere_ellipsoid";
std::string s_ellipsoid_sphere=kns+"_ellipsoid_sphere";
std::string s_lj=kns+"_lj";
std::string s_lj_fast=kns+"_lj_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
@ -450,18 +464,23 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
nbor_program->load_string(ellipsoid_nbor,flags.c_str());
k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast");
k_nbor.set_function(*nbor_program,"kernel_nbor");
neigh_tex.get_texture(*nbor_program,"pos_tex");
ellipsoid_program=new UCL_Program(dev);
ellipsoid_program->load_string(ellipsoid_string,flags.c_str());
k_ellipsoid.set_function(*ellipsoid_program,"kernel_ellipsoid");
k_ellipsoid.set_function(*ellipsoid_program,kname);
pos_tex.get_texture(*ellipsoid_program,"pos_tex");
quat_tex.get_texture(*ellipsoid_program,"quat_tex");
lj_program=new UCL_Program(dev);
lj_program->load_string(lj_string,flags.c_str());
k_sphere_ellipsoid.set_function(*lj_program,"kernel_sphere_ellipsoid");
k_lj_fast.set_function(*lj_program,"kernel_lj_fast");
k_lj.set_function(*lj_program,"kernel_lj");
k_sphere_ellipsoid.set_function(*lj_program,s_sphere_ellipsoid.c_str());
k_lj_fast.set_function(*lj_program,s_lj_fast.c_str());
k_lj.set_function(*lj_program,s_lj.c_str());
if (e_s)
k_ellipsoid_sphere.set_function(*lj_program,"kernel_ellipsoid_sphere");
k_ellipsoid_sphere.set_function(*lj_program,s_ellipsoid_sphere.c_str());
lj_pos_tex.get_texture(*lj_program,"pos_tex");
lj_quat_tex.get_texture(*lj_program,"quat_tex");
_compiled=true;
}

View File

@ -20,8 +20,10 @@
#include "lal_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -39,6 +41,7 @@ class BaseEllipsoid {
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param ellipsoid_sphere true if ellipsoid-sphere case handled separately
* \param k_name name of the kernel used for the force calculation
*
* Returns:
* - 0 if successful
@ -49,8 +52,9 @@ class BaseEllipsoid {
int init_base(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, const int ntypes,
int **h_form, const char *ellipsoid_program,
const char *lj_program, const bool ellipsoid_sphere=false);
int **h_form, const void *ellipsoid_program,
const void *lj_program, const char *k_name,
const bool ellipsoid_sphere=false);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
@ -58,7 +62,13 @@ class BaseEllipsoid {
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int nall, bool &success) {
atom->resize(nall, success);
if (atom->resize(nall, success)) {
neigh_tex.bind_float(atom->x,4);
pos_tex.bind_float(atom->x,4);
quat_tex.bind_float(atom->quat,4);
lj_pos_tex.bind_float(atom->x,4);
lj_quat_tex.bind_float(atom->quat,4);
}
}
/// Check if there is enough storage for neighbors and realloc if not
@ -74,7 +84,7 @@ class BaseEllipsoid {
const int max_nbors, const int olist_size,
bool &success) {
ans->resize(nlocal, success);
if (_multiple_forms) ans->dev_ans.zero();
if (_multiple_forms) ans->force.zero();
if (olist_size>static_cast<int>(host_olist.numel())) {
host_olist.clear();
@ -221,8 +231,7 @@ class BaseEllipsoid {
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
UCL_Texture q_tex;
UCL_Texture pos_tex, quat_tex, lj_pos_tex, lj_quat_tex, neigh_tex;
protected:
bool _compiled, _ellipsoid_sphere;
@ -236,8 +245,8 @@ class BaseEllipsoid {
int **_host_form;
int _last_ellipse, _max_last_ellipse;
void compile_kernels(UCL_Device &dev, const char *ellipsoid_string,
const char *lj_string, const bool e_s);
void compile_kernels(UCL_Device &dev, const void *ellipsoid_string,
const void *lj_string, const char *kname,const bool e_s);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};

View File

@ -13,10 +13,12 @@
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "buck_cl.h"
#elif defined(USE_CUDART)
const char *buck=0;
#else
#include "buck_ptx.h"
#include "buck_cubin.h"
#endif
#include "lal_buck.h"
@ -50,7 +52,7 @@ int BuckT::init(const int ntypes, double **host_cutsq,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,buck);
_screen,buck,"k_buck");
if (success!=0)
return success;
@ -132,20 +134,17 @@ void BuckT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(),
&coeff2.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -15,14 +15,16 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
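The preamble above replaces the old fetch_pos() helper with a fetch4() read that goes through the pos_tex texture on single-precision CUDA builds and falls back to a plain array access otherwise (the `#define pos_tex x_` case). A plain-C++ sketch of the fallback path only, since the texture path needs the CUDA toolchain; the macro bodies here are illustrative:

```cpp
// Plain-C++ sketch of the non-texture fallback used by fetch4 above; the
// texture path is CUDA-only and omitted. Macro bodies are illustrative.
#include <cstdio>

struct numtyp4 { float x, y, z, w; };

#define pos_tex x_                    // fallback: the "texture" is the array
#define fetch4(ans, i, tex) ans = tex[i]

int main() {
  numtyp4 x_[2] = {{0.f, 0.f, 0.f, 1.f}, {1.f, 2.f, 3.f, 2.f}};
  numtyp4 ix; fetch4(ix, 1, pos_tex); // same call shape as in the kernels
  std::printf("%g %g %g type %g\n", ix.x, ix.y, ix.z, ix.w);
  return 0;
}
```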
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
__kernel void k_buck(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -104,7 +106,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__kernel void k_buck_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -140,7 +142,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -151,7 +153,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12

View File

@ -13,10 +13,12 @@
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "buck_coul_cl.h"
#elif defined(USE_CUDART)
const char *buck_coul=0;
#else
#include "buck_coul_ptx.h"
#include "buck_coul_cubin.h"
#endif
#include "lal_buck_coul.h"
@ -52,7 +54,7 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq,
const double qqrd2e) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,buck_coul);
_screen,buck_coul,"k_buck_coul");
if (success!=0)
return success;
@ -142,23 +144,18 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(),
&coeff2.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch,
&this->atom->dev_q.begin(), &cutsq.begin(),
&_qqrd2e, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_qqrd2e, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
&cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_qqrd2e, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -14,18 +14,22 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_buck_coul(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -97,9 +101,10 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
} else
forcebuck = (numtyp)0.0;
if (rsq < coeff2[mtype].z) // coul
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
else
if (rsq < coeff2[mtype].z) {
fetch(forcecoul,j,q_tex);
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
} else
forcecoul = (numtyp)0.0;
force = (forcebuck + forcecoul) * r2inv;
@ -131,7 +136,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__kernel void k_buck_coul_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -172,8 +177,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -185,7 +190,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
@ -208,9 +213,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
} else
forcebuck = (numtyp)0.0;
if (rsq < cutsq[mtype].z) // coul
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
else
if (rsq < cutsq[mtype].z) {
fetch(forcecoul,j,q_tex);
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
} else
forcecoul = (numtyp)0.0;
force = (forcebuck + forcecoul) * r2inv;

View File

@ -13,10 +13,12 @@
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "buck_coul_long_cl.h"
#elif defined(USE_CUDART)
const char *buck_coul_long=0;
#else
#include "buck_coul_long_ptx.h"
#include "buck_coul_long_cubin.h"
#endif
#include "lal_buck_coul_long.h"
@ -54,7 +56,7 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,buck_coul_long);
_screen,buck_coul_long,"k_buck_coul_long");
if (success!=0)
return success;
@ -145,25 +147,19 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(),
&coeff2.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
&cutsq.begin(), &_cut_coulsq, &_qqrd2e,
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_cut_coulsq, &_qqrd2e,
&_g_ewald, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(),
&coeff2.begin(), &_lj_types, &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
&cutsq.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &cutsq,
&_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -14,18 +14,22 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_buck_coul_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -62,8 +66,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
@ -74,7 +78,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -104,7 +108,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
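All of the long-range Coulomb kernels touched by this commit evaluate the real-space Ewald term with the same polynomial erfc() approximation shown above (`t*(A1+t*(A2+...))*expm2` with `t = 1/(1+EWALD_P*grij)`). The coefficients are defined elsewhere in the library; the standalone check below assumes they are the usual Abramowitz-Stegun 7.1.26 constants:

```cpp
// Hedged sketch: compares the polynomial erfc approximation used in the
// kernels with the C library erfc(). The coefficient values are the standard
// Abramowitz & Stegun 7.1.26 constants and are an assumption here, not
// copied from this commit.
#include <cmath>
#include <cstdio>

static double erfc_approx(double x) {
  const double EWALD_P = 0.3275911;
  const double A1 =  0.254829592, A2 = -0.284496736, A3 = 1.421413741,
               A4 = -1.453152027, A5 =  1.061405429;
  const double t = 1.0 / (1.0 + EWALD_P * x);
  const double expm2 = std::exp(-x * x);
  return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
}

int main() {
  for (double grij : {0.1, 0.5, 1.0, 2.0})
    std::printf("grij=%.1f  approx=%.7f  libm=%.7f\n",
                grij, erfc_approx(grij), std::erfc(grij));
  return 0;
}
```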
@ -139,7 +144,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__kernel void k_buck_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -179,8 +184,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -192,7 +197,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
@ -221,7 +226,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "cg_cmm_cl.h"
#elif defined(USE_CUDART)
const char *cg_cmm=0;
#else
#include "cg_cmm_ptx.h"
#include "cg_cmm_cubin.h"
#endif
#include "lal_cg_cmm.h"
@ -51,7 +53,7 @@ int CGCMMT::init(const int ntypes, double **host_cutsq,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cg_cmm);
_screen,cg_cmm,"k_cg_cmm");
if (success!=0)
return success;
@ -133,19 +135,17 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
this->k_pair.run(&this->atom->x, &lj1, &lj3,
&_cmm_types, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();

View File

@ -15,14 +15,16 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void k_cg_cmm(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -109,7 +111,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_cg_cmm_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,__global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -145,7 +147,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -156,7 +158,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "cg_cmm_long_cl.h"
#elif defined(USE_CUDART)
const char *cg_cmm_long=0;
#else
#include "cg_cmm_long_ptx.h"
#include "cg_cmm_long_cubin.h"
#endif
#include "lal_cg_cmm_long.h"
@ -56,7 +58,7 @@ int CGCMMLongT::init(const int ntypes, double **host_cutsq,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cg_cmm_long);
_screen,cg_cmm_long,"k_cg_cmm_long");
if (success!=0)
return success;
@ -144,24 +146,19 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -14,18 +14,22 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_cg_cmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -108,7 +112,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
@ -143,7 +148,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_cg_cmm_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -181,8 +186,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -194,7 +199,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
@ -228,7 +233,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "charmm_long_cl.h"
#elif defined(USE_CUDART)
const char *charmm_long=0;
#else
#include "charmm_long_ptx.h"
#include "charmm_long_cubin.h"
#endif
#include "lal_charmm_long.h"
@ -57,7 +59,7 @@ int CHARMMLongT::init(const int ntypes,
double **sigma, const bool mix_arithmetic) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,charmm_long);
_screen,charmm_long,"k_charmm_long");
if (success!=0)
return success;
@ -148,22 +150,19 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
&sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&this->_threads_per_atom);

View File

@ -14,18 +14,22 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_charmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
const int lj_types, __global numtyp *sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
@ -62,8 +66,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
@ -74,7 +78,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -110,7 +114,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
@ -147,7 +152,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
__kernel void k_charmm_long_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
@ -185,8 +190,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
@ -197,7 +202,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -239,7 +244,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;

View File

@ -13,10 +13,12 @@
email : a.kohlmeyer@temple.edu
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "coul_long_cl.h"
#elif defined(USE_CUDART)
const char *coul_long=0;
#else
#include "coul_long_ptx.h"
#include "coul_long_cubin.h"
#endif
#include "lal_coul_long.h"
@ -48,7 +50,7 @@ int CoulLongT::init(const int nlocal, const int nall, const int max_nbors,
const double qqrd2e, const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
gpu_split,_screen,coul_long);
gpu_split,_screen,coul_long,"k_coul_long");
if (success!=0)
return success;
@ -132,22 +134,18 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_cl.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald,
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_cl,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_cl.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_cl,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();

View File

@ -14,18 +14,22 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_cl_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -56,8 +60,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
@ -66,7 +70,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
@ -83,7 +87,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
f.x+=delx*force;
@ -162,7 +167,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_cl_in,
__global int *dev_nbor, __global int *dev_packed,
@ -193,8 +198,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
@ -203,7 +208,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
@ -220,7 +225,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
f.x+=delx*force;
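
The kernel hunks above swap the old fetch_pos/fetch_q helpers for fetch4/fetch macros and split the Coulomb prefactor into a fetch followed by a multiply, so the same source reads positions and charges either through textures (single-precision CUDA) or straight from the arrays (OpenCL or double precision). The sketch below imitates that indirection with the legacy CUDA texture-reference API used by these kernels (deprecated in recent CUDA releases); the macro bodies are simplified stand-ins for the ones supplied by the package headers, and the demo "neighbor" and conversion factor are made up.

    #include <cuda_runtime.h>

    // Legacy texture references, in the style of the kernels above.
    texture<float4> pos_tex;
    texture<float>  q_tex;

    // Simplified stand-ins for the package's fetch4/fetch macros; the real ones
    // also provide non-texture fallbacks.
    #define fetch4(ans,i,tex) ans = tex1Dfetch(tex, i)
    #define fetch(ans,i,tex)  ans = tex1Dfetch(tex, i)

    __global__ void k_demo(float *out, int n, float qqrd2e) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        float4 ix;   fetch4(ix, i, pos_tex);     // position of atom i
        float  qtmp; fetch(qtmp, i, q_tex);      // charge of atom i
        int j = (i + 1) % n;                     // fake neighbor, for illustration only
        float4 jx;   fetch4(jx, j, pos_tex);
        float dx = ix.x - jx.x, dy = ix.y - jx.y, dz = ix.z - jx.z;
        float r = sqrtf(dx * dx + dy * dy + dz * dz) + 1e-6f;
        // Fetch-then-scale, as in the rewritten prefactor computation above.
        float prefactor; fetch(prefactor, j, q_tex);
        prefactor *= qqrd2e * qtmp / r;
        out[i] = prefactor;
      }
    }

    int main() {
      const int n = 256;
      float4 *d_x; float *d_q, *d_out;
      cudaMalloc(&d_x, n * sizeof(float4));
      cudaMalloc(&d_q, n * sizeof(float));
      cudaMalloc(&d_out, n * sizeof(float));
      cudaMemset(d_x, 0, n * sizeof(float4));
      cudaMemset(d_q, 0, n * sizeof(float));
      cudaBindTexture(NULL, pos_tex, d_x, n * sizeof(float4));  // bind linear buffers
      cudaBindTexture(NULL, q_tex,  d_q, n * sizeof(float));
      k_demo<<<(n + 127) / 128, 128>>>(d_out, n, 1.0f);          // conversion factor is illustrative
      cudaDeviceSynchronize();
      cudaUnbindTexture(pos_tex);
      cudaUnbindTexture(q_tex);
      cudaFree(d_x); cudaFree(d_q); cudaFree(d_out);
      return 0;
    }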

View File

@ -21,10 +21,12 @@
#include <omp.h>
#endif
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "device_cl.h"
#elif defined(USE_CUDART)
const char *device=0;
#else
#include "device_ptx.h"
#include "device_cubin.h"
#endif
using namespace LAMMPS_AL;
@ -42,10 +44,10 @@ DeviceT::~Device() {
}
template <class numtyp, class acctyp>
int DeviceT::init_device(MPI_Comm world, MPI_Comm replica,
const int first_gpu, const int last_gpu,
const int gpu_mode, const double p_split,
const int nthreads, const int t_per_atom) {
int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double p_split, const int nthreads,
const int t_per_atom, const double cell_size) {
_nthreads=nthreads;
#ifdef _OPENMP
omp_set_num_threads(nthreads);
@ -62,6 +64,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica,
_last_device=last_gpu;
_gpu_mode=gpu_mode;
_particle_split=p_split;
_cell_size=cell_size;
// Get the rank/size within the world
MPI_Comm_rank(_comm_world,&_world_me);
@ -191,7 +194,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
} else {
if (atom.charge()==false && charge)
_data_in_estimate++;
if (atom.quat()==false && rot)
if (atom.quaternion()==false && rot)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial))
return -3;
@ -205,7 +208,10 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
_block_cell_id, _block_nbor_build, threads_per_atom,
_warp_size, _time_device))
return -3;
nbor->cell_size(cell_size);
if (_cell_size<0.0)
nbor->cell_size(cell_size,cell_size);
else
nbor->cell_size(_cell_size,cell_size);
_init_count++;
return 0;
@ -251,7 +257,9 @@ void DeviceT::set_double_precompute
template <class numtyp, class acctyp>
void DeviceT::init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu) {
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
std::string fs="";
#elif defined(USE_CUDART)
std::string fs="";
#else
std::string fs=toa(gpu->free_gigabytes())+"/";
@ -411,11 +419,9 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls,
}
template <class numtyp, class acctyp>
void DeviceT::output_times(UCL_Timer &time_pair,
Answer<numtyp,acctyp> &ans,
void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
Neighbor &nbor, const double avg_split,
const double max_bytes,
const double gpu_overhead,
const double max_bytes, const double gpu_overhead,
const double driver_overhead,
const int threads_per_atom, FILE *screen) {
double single[9], times[9];
@ -574,33 +580,32 @@ int DeviceT::compile_kernels() {
k_info.set_function(*dev_program,"kernel_info");
_compiled=true;
UCL_H_Vec<int> h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
UCL_D_Vec<int> d_gpu_lib_data(14,*gpu);
UCL_Vector<int,int> gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
k_info.set_size(1,1);
k_info.run(&d_gpu_lib_data.begin());
ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false);
k_info.run(&gpu_lib_data);
gpu_lib_data.update_host(false);
_ptx_arch=static_cast<double>(h_gpu_lib_data[0])/100.0;
_ptx_arch=static_cast<double>(gpu_lib_data[0])/100.0;
#ifndef USE_OPENCL
if (_ptx_arch>gpu->arch())
return -4;
#endif
_num_mem_threads=h_gpu_lib_data[1];
_warp_size=h_gpu_lib_data[2];
_num_mem_threads=gpu_lib_data[1];
_warp_size=gpu_lib_data[2];
if (_threads_per_atom<1)
_threads_per_atom=h_gpu_lib_data[3];
_threads_per_atom=gpu_lib_data[3];
if (_threads_per_charge<1)
_threads_per_charge=h_gpu_lib_data[13];
_pppm_max_spline=h_gpu_lib_data[4];
_pppm_block=h_gpu_lib_data[5];
_block_pair=h_gpu_lib_data[6];
_max_shared_types=h_gpu_lib_data[7];
_block_cell_2d=h_gpu_lib_data[8];
_block_cell_id=h_gpu_lib_data[9];
_block_nbor_build=h_gpu_lib_data[10];
_block_bio_pair=h_gpu_lib_data[11];
_max_bio_shared_types=h_gpu_lib_data[12];
_threads_per_charge=gpu_lib_data[13];
_pppm_max_spline=gpu_lib_data[4];
_pppm_block=gpu_lib_data[5];
_block_pair=gpu_lib_data[6];
_max_shared_types=gpu_lib_data[7];
_block_cell_2d=gpu_lib_data[8];
_block_cell_id=gpu_lib_data[9];
_block_nbor_build=gpu_lib_data[10];
_block_bio_pair=gpu_lib_data[11];
_max_bio_shared_types=gpu_lib_data[12];
if (static_cast<size_t>(_block_pair)>gpu->group_size())
_block_pair=gpu->group_size();
@ -634,9 +639,10 @@ Device<PRECISION,ACC_PRECISION> global_device;
int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom) {
const int t_per_atom, const double cell_size) {
return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
particle_split,nthreads,t_per_atom);
particle_split,nthreads,t_per_atom,
cell_size);
}
void lmp_clear_device() {
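
The device changes above thread a new cell_size argument from lmp_init_device through init_device into the neighbor build, storing it in _cell_size with a negative value meaning "fall back to the pair cutoff when sizing neighbor bins". The sketch below shows only that sentinel pattern; NeighborStub and init_neighbor are hypothetical names, not the package's Neighbor interface.

    #include <cstdio>

    struct NeighborStub {
      double cell, cutoff;
      void cell_size(double c, double cut) { cell = c; cutoff = cut; }
    };

    static double _cell_size = -1.0;   // set at device init; -1 means "use the cutoff"

    void init_neighbor(NeighborStub &nbor, double cutoff_cell) {
      if (_cell_size < 0.0)
        nbor.cell_size(cutoff_cell, cutoff_cell);   // default: bin by the cutoff
      else
        nbor.cell_size(_cell_size, cutoff_cell);    // explicit bin size supplied at init
    }

    int main() {
      NeighborStub nbor;
      init_neighbor(nbor, 2.5);                     // no override: 2.5 bins
      printf("cell=%g cutoff=%g\n", nbor.cell, nbor.cutoff);
      _cell_size = 1.0;                             // override requested at init
      init_neighbor(nbor, 2.5);
      printf("cell=%g cutoff=%g\n", nbor.cell, nbor.cutoff);
      return 0;
    }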

View File

@ -49,7 +49,7 @@ class Device {
int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom);
const int t_per_atom, const double cell_size);
/// Initialize the device for Atom and Neighbor storage
/** \param rot True if quaternions need to be stored
@ -239,7 +239,7 @@ class Device {
int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
_block_pair));
k_zero.set_size(num_blocks,_block_pair);
k_zero.run(&mem.begin(),&numel);
k_zero.run(&mem,&numel);
}
// -------------------------- DEVICE DATA -------------------------
@ -288,6 +288,7 @@ class Device {
double _particle_split;
double _cpu_full;
double _ptx_arch;
double _cell_size; // -1 if the cutoff is used
int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
int _pppm_max_spline, _pppm_block;

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "eam_cl.h"
#elif defined(USE_CUDART)
const char *eam=0;
#else
#include "eam_ptx.h"
#include "eam_cubin.h"
#endif
#include "lal_eam.h"
@ -51,32 +53,24 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
{
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
gpu_split,_screen,eam);
gpu_split,_screen,eam,"k_eam");
if (success!=0)
return success;
// allocate fp
bool cpuview=false;
if (this->ucl_device->device_type()==UCL_CPU)
cpuview=true;
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
_max_fp_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
host_fp.alloc(_max_fp_size,*(this->ucl_device));
if (cpuview)
dev_fp.view(host_fp);
else
dev_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_WRITE_ONLY);
_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_RW_OPTIMIZED,UCL_WRITE_ONLY);
k_energy.set_function(*(this->pair_program),"kernel_energy");
k_energy_fast.set_function(*(this->pair_program),"kernel_energy_fast");
k_energy.set_function(*(this->pair_program),"k_energy");
k_energy_fast.set_function(*(this->pair_program),"k_energy_fast");
fp_tex.get_texture(*(this->pair_program),"fp_tex");
fp_tex.bind_float(dev_fp,1);
fp_tex.bind_float(_fp,1);
_compiled_energy = true;
// Initialize timers for selected GPU
@ -236,7 +230,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
+ frho_spline2.row_bytes()
+ z2r_spline1.row_bytes()
+ z2r_spline2.row_bytes()
+ dev_fp.row_bytes();
+ _fp.device.row_bytes();
return 0;
}
@ -255,8 +249,7 @@ void EAMT::clear() {
z2r_spline1.clear();
z2r_spline2.clear();
host_fp.clear();
dev_fp.clear();
_fp.clear();
time_pair2.clear();
time_fp1.clear();
@ -303,19 +296,11 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
// ------------------- Resize FP Array for EAM --------------------
if (nall>_max_fp_size) {
dev_fp.clear();
host_fp.clear();
_max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10);
host_fp.alloc(_max_fp_size,*(this->ucl_device));
if (this->ucl_device->device_type()==UCL_CPU)
dev_fp.view(host_fp);
else
dev_fp.alloc(_max_fp_size,*(this->ucl_device));
fp_tex.bind_float(dev_fp,1);
_fp.resize(_max_fp_size);
fp_tex.bind_float(_fp,1);
}
*fp_ptr=host_fp.begin();
*fp_ptr=_fp.host.begin();
// ----------------------------------------------------------------
@ -348,7 +333,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
// copy fp from device to host for comm
_nlocal=nlocal;
time_fp1.start();
ucl_copy(host_fp,dev_fp,nlocal,true);
_fp.update_host(nlocal,true);
time_fp1.stop();
time_fp1.sync_stop();
}
@ -380,19 +365,11 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
// ------------------- Resize FP Array for EAM --------------------
if (nall>_max_fp_size) {
dev_fp.clear();
host_fp.clear();
_max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10);
host_fp.alloc(_max_fp_size,*(this->ucl_device));
if (this->ucl_device->device_type()==UCL_CPU)
dev_fp.view(host_fp);
else
dev_fp.alloc(_max_fp_size,*(this->ucl_device));
fp_tex.bind_float(dev_fp,1);
_fp.resize(_max_fp_size);
fp_tex.bind_float(_fp,1);
}
*fp_ptr=host_fp.begin();
*fp_ptr=_fp.host.begin();
// -----------------------------------------------------------------
@ -428,7 +405,7 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
// copy fp from device to host for comm
_nlocal=inum_full;
time_fp1.start();
ucl_copy(host_fp,dev_fp,inum_full,true);
_fp.update_host(inum_full,true);
time_fp1.stop();
time_fp1.sync_stop();
@ -486,22 +463,18 @@ void EAMT::loop(const bool _eflag, const bool _vflag) {
if (shared_types) {
this->k_energy_fast.set_size(GX,BX);
this->k_energy_fast.run(&this->atom->dev_x.begin(), &type2rhor_z2r.begin(),
&type2frho.begin(), &rhor_spline2.begin(),
&frho_spline1.begin(),&frho_spline2.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &dev_fp.begin(),
&this->ans->dev_engv.begin(), &eflag, &ainum,
this->k_energy_fast.run(&this->atom->x, &type2rhor_z2r, &type2frho,
&rhor_spline2, &frho_spline1,&frho_spline2,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_fp, &this->ans->engv, &eflag, &ainum,
&nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho,
&_nrho, &_nr, &this->_threads_per_atom);
} else {
this->k_energy.set_size(GX,BX);
this->k_energy.run(&this->atom->dev_x.begin(), &type2rhor_z2r.begin(),
&type2frho.begin(), &rhor_spline2.begin(),
&frho_spline1.begin(),&frho_spline2.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &dev_fp.begin(),
&this->ans->dev_engv.begin(),&eflag, &ainum, &nbor_pitch,
this->k_energy.run(&this->atom->x, &type2rhor_z2r, &type2frho,
&rhor_spline2, &frho_spline1, &frho_spline2,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp,
&this->ans->engv,&eflag, &ainum, &nbor_pitch,
&_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_nrho, &_nr,
&this->_threads_per_atom);
}
@ -536,27 +509,19 @@ void EAMT::loop2(const bool _eflag, const bool _vflag) {
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &dev_fp.begin(),
&type2rhor_z2r.begin(),
&rhor_spline1.begin(),
&z2r_spline1.begin(),
&z2r_spline2.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &_cutforcesq, &_rdr, &_nr,
&this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &_fp, &type2rhor_z2r,
&rhor_spline1, &z2r_spline1, &z2r_spline2,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &_cutforcesq, &_rdr,
&_nr, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &dev_fp.begin(),
&type2rhor_z2r.begin(),
&rhor_spline1.begin(),
&z2r_spline1.begin(),
&z2r_spline2.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_nr,
this->k_pair.run(&this->atom->x, &_fp, &type2rhor_z2r, &rhor_spline1,
&z2r_spline1, &z2r_spline2, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch,
&_ntypes, &_cutforcesq, &_rdr, &_nr,
&this->_threads_per_atom);
}
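
The EAM refactor above replaces the separate host_fp / dev_fp arrays with a single _fp container that keeps a host copy and a device copy together (alloc, resize, update_host, and .host / .device views), which removes the CPU-device special casing and the manual ucl_copy calls. The sketch below is a minimal mirrored-buffer class with the same shape, written against plain CUDA rather than the UCL_Vector type the real code uses; it is only an illustration of the pattern.

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // Minimal host/device mirrored buffer, loosely modeled on the _fp usage above.
    template <class T>
    struct Mirrored {
      T *host = nullptr, *device = nullptr;
      int n = 0;
      void alloc(int count) {
        n = count;
        host = (T *)malloc(n * sizeof(T));
        cudaMalloc(&device, n * sizeof(T));
      }
      void clear() {
        free(host); cudaFree(device);
        host = device = nullptr; n = 0;
      }
      void resize(int count) { clear(); alloc(count); }        // grow when nall exceeds capacity
      void update_host(int count, bool async) {                // copy device values back for comm
        (void)async;                                           // sketch: always synchronous
        cudaMemcpy(host, device, count * sizeof(T), cudaMemcpyDeviceToHost);
      }
    };

    int main() {
      Mirrored<float> fp;
      fp.alloc(2000);
      const int nall = 3000;
      if (nall > fp.n) fp.resize((int)(nall * 1.10));          // 10% headroom, as above
      cudaMemset(fp.device, 0, fp.n * sizeof(float));
      fp.update_host(nall, true);
      printf("fp[0]=%g capacity=%d\n", fp.host[0], fp.n);
      fp.clear();
      return 0;
    }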

View File

@ -15,66 +15,37 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> fp_tex;
texture<float4> rhor_sp1_tex;
texture<float4> rhor_sp2_tex;
texture<float4> frho_sp1_tex;
texture<float4> frho_sp2_tex;
texture<float4> z2r_sp1_tex;
texture<float4> z2r_sp2_tex;
#ifdef _DOUBLE_DOUBLE
ucl_inline double4 fetch_rhor_sp1(const int& i, const double4 *rhor_spline1) {
return rhor_spline1[i];
}
ucl_inline double4 fetch_rhor_sp2(const int& i, const double4 *rhor_spline2) {
return rhor_spline2[i];
}
ucl_inline double4 fetch_frho_sp1(const int& i, const double4 *frho_spline1) {
return frho_spline1[i];
}
ucl_inline double4 fetch_frho_sp2(const int& i, const double4 *frho_spline2) {
return frho_spline2[i];
}
ucl_inline double4 fetch_z2r_sp1(const int& i, const double4 *z2r_spline1) {
return z2r_spline1[i];
}
ucl_inline double4 fetch_z2r_sp2(const int& i, const double4 *z2r_spline2) {
return z2r_spline2[i];
}
#else
texture<int4> pos_tex;
texture<int2> fp_tex;
texture<int4> rhor_sp1_tex;
texture<int4> rhor_sp2_tex;
texture<int4> frho_sp1_tex;
texture<int4> frho_sp2_tex;
texture<int4> z2r_sp1_tex;
texture<int4> z2r_sp2_tex;
#endif
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *fp)
{ return tex1Dfetch(fp_tex, i); }
#else
ucl_inline float4 fetch_rhor_sp1(const int& i, const float4 *rhor_spline1)
{ return tex1Dfetch(rhor_sp1_tex, i); }
ucl_inline float4 fetch_rhor_sp2(const int& i, const float4 *rhor_spline2)
{ return tex1Dfetch(rhor_sp2_tex, i); }
ucl_inline float4 fetch_frho_sp1(const int& i, const float4 *frho_spline1)
{ return tex1Dfetch(frho_sp1_tex, i); }
ucl_inline float4 fetch_frho_sp2(const int& i, const float4 *frho_spline2)
{ return tex1Dfetch(frho_sp2_tex, i); }
ucl_inline float4 fetch_z2r_sp1(const int& i, const float4 *z2r_spline1)
{ return tex1Dfetch(z2r_sp1_tex, i); }
ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2)
{ return tex1Dfetch(z2r_sp2_tex, i); }
#endif
#else // OPENCL
#define fetch_q(i,y) fp_[i]
#define fetch_rhor_sp1(i,y) rhor_spline1[i]
#define fetch_rhor_sp2(i,y) rhor_spline2[i]
#define fetch_frho_sp1(i,y) frho_spline1[i]
#define fetch_frho_sp2(i,y) frho_spline2[i]
#define fetch_z2r_sp1(i,y) z2r_spline1[i]
#define fetch_z2r_sp2(i,y) z2r_spline2[i]
#define pos_tex x_
#define fp_tex fp_
#define rhor_sp1_tex rhor_spline1
#define rhor_sp2_tex rhor_spline2
#define frho_sp1_tex frho_spline1
#define frho_sp2_tex frho_spline2
#define z2r_sp1_tex z2r_spline1
#define z2r_sp2_tex z2r_spline2
#endif
@ -99,11 +70,11 @@ ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2)
p -= m; \
p = MIN(p,(numtyp)1.0); \
int index = type2frho[itype]*(nrho+1)+m; \
numtyp4 coeff = fetch_frho_sp1(index, frho_spline1); \
numtyp4 coeff; fetch4(coeff,index,frho_sp1_tex); \
numtyp fp = (coeff.x*p + coeff.y)*p + coeff.z; \
fp_[i]=fp; \
if (eflag>0) { \
coeff = fetch_frho_sp2(index, frho_spline2); \
fetch4(coeff,index,frho_sp2_tex); \
energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \
engv[ii]=(acctyp)2.0*energy; \
} \
@ -154,7 +125,7 @@ ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2)
ans[ii]=f; \
}
__kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
__kernel void k_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
__global int *type2frho,
__global numtyp4 *rhor_spline2,
__global numtyp4 *frho_spline1,
@ -178,14 +149,14 @@ __kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -203,7 +174,7 @@ __kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
int mtype = jtype*ntypes+itype;
int index = type2rhor_z2r[mtype].x*(nr+1)+m;
numtyp4 coeff = fetch_rhor_sp2(index, rhor_spline2);
numtyp4 coeff; fetch4(coeff,index,rhor_sp2_tex);
rho += ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
}
} // for nbor
@ -213,7 +184,7 @@ __kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
} // if ii
}
__kernel void kernel_energy_fast(__global numtyp4 *x_,
__kernel void k_energy_fast(__global numtyp4 *x_,
__global int2 *type2rhor_z2r_in,
__global int *type2frho_in,
__global numtyp4 *rhor_spline2,
@ -252,14 +223,14 @@ __kernel void kernel_energy_fast(__global numtyp4 *x_,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
@ -277,7 +248,7 @@ __kernel void kernel_energy_fast(__global numtyp4 *x_,
int jtype=fast_mul((int)MAX_SHARED_TYPES,jx.w);
int mtype = jtype+itype;
int index = type2rhor_z2r[mtype].x*(nr+1)+m;
numtyp4 coeff = fetch_rhor_sp2(index, rhor_spline2);
numtyp4 coeff; fetch4(coeff,index,rhor_sp2_tex);
rho += ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
}
} // for nbor
@ -287,7 +258,7 @@ __kernel void kernel_energy_fast(__global numtyp4 *x_,
} // if ii
}
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,
__kernel void k_eam(__global numtyp4 *x_, __global numtyp *fp_,
__global int2 *type2rhor_z2r,
__global numtyp4 *rhor_spline1,
__global numtyp4 *z2r_spline1,
@ -317,15 +288,15 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp ifp=fetch_q(i,fp_); //fp_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp ifp; fetch(ifp,i,fp_tex); //fp_[i];
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -347,25 +318,27 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,
mtype = itype*ntypes+jtype;
index = type2rhor_z2r[mtype].x*(nr+1)+m;
coeff = fetch_rhor_sp1(index, rhor_spline1);
fetch4(coeff,index,rhor_sp1_tex);
numtyp rhoip = (coeff.x*p + coeff.y)*p + coeff.z;
mtype = jtype*ntypes+itype;
index = type2rhor_z2r[mtype].x*(nr+1)+m;
coeff = fetch_rhor_sp1(index, rhor_spline1);
fetch4(coeff,index,rhor_sp1_tex);
numtyp rhojp = (coeff.x*p + coeff.y)*p + coeff.z;
mtype = itype*ntypes+jtype;
index = type2rhor_z2r[mtype].y*(nr+1)+m;
coeff = fetch_z2r_sp1(index, z2r_spline1);
fetch4(coeff,index,z2r_sp1_tex);
numtyp z2p = (coeff.x*p + coeff.y)*p + coeff.z;
coeff = fetch_z2r_sp2(index, z2r_spline2);
fetch4(coeff,index,z2r_sp2_tex);
numtyp z2 = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
numtyp recip = ucl_recip(r);
numtyp phi = z2*recip;
numtyp phip = z2p*recip - phi*recip;
numtyp psip = ifp*rhojp + fetch_q(j,fp_)*rhoip + phip;
numtyp psip;
fetch(psip,j,fp_tex);
psip = ifp*rhojp + psip*rhoip + phip;
numtyp force = -psip*recip;
f.x+=delx*force;
@ -391,7 +364,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
__kernel void k_eam_fast(__global numtyp4 *x_, __global numtyp *fp_,
__global int2 *type2rhor_z2r_in,
__global numtyp4 *rhor_spline1,
__global numtyp4 *z2r_spline1,
@ -427,8 +400,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp ifp=fetch_q(i,fp_); //fp_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp ifp; fetch(ifp,i,fp_tex); //fp_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -436,7 +409,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jw=jx.w;
int jtype=fast_mul((int)MAX_SHARED_TYPES,jw);
@ -459,25 +432,27 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
mtype = itype+jw;
index = type2rhor_z2r[mtype].x*(nr+1)+m;
coeff = fetch_rhor_sp1(index, rhor_spline1);
fetch4(coeff,index,rhor_sp1_tex);
numtyp rhoip = (coeff.x*p + coeff.y)*p + coeff.z;
mtype = jtype+iw;
index = type2rhor_z2r[mtype].x*(nr+1)+m;
coeff = fetch_rhor_sp1(index, rhor_spline1);
fetch4(coeff,index,rhor_sp1_tex);
numtyp rhojp = (coeff.x*p + coeff.y)*p + coeff.z;
mtype = itype+jw;
index = type2rhor_z2r[mtype].y*(nr+1)+m;
coeff = fetch_z2r_sp1(index, z2r_spline1);
fetch4(coeff,index,z2r_sp1_tex);
numtyp z2p = (coeff.x*p + coeff.y)*p + coeff.z;
coeff = fetch_z2r_sp2(index, z2r_spline2);
fetch4(coeff,index,z2r_sp2_tex);
numtyp z2 = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
numtyp recip = ucl_recip(r);
numtyp phi = z2*recip;
numtyp phip = z2p*recip - phi*recip;
numtyp psip = ifp*rhojp + fetch_q(j,fp_)*rhoip + phip;
numtyp psip;
fetch(psip,j,fp_tex);
psip = ifp*rhojp + psip*rhoip + phip;
numtyp force = -psip*recip;
f.x+=delx*force;
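
One detail of the EAM kernels above: every spline segment is stored as a packed 4-vector of coefficients, fetched through a texture and evaluated in Horner form, with the value tables (the *_spline2 arrays) read as cubics and the pre-differentiated tables (the *_spline1 arrays) read as quadratics. The host-side sketch below reproduces just that index math and Horner evaluation; the coefficient values and spacing are made up.

    #include <cstdio>

    // One packed spline segment, as stored in the numtyp4 tables above.
    struct coeff4 { double x, y, z, w; };

    double eval_cubic(const coeff4 &c, double p) {       // value table (..._spline2)
      return ((c.x * p + c.y) * p + c.z) * p + c.w;      // Horner form, as in the kernels
    }

    double eval_quadratic(const coeff4 &c, double p) {   // derivative table (..._spline1)
      return (c.x * p + c.y) * p + c.z;
    }

    int main() {
      // Mirrors the kernel's lookup: p = r*rdr; m = int(p); p -= m.
      const double rdr = 10.0;                  // inverse table spacing (illustrative)
      coeff4 value_seg = {0.5, -0.2, 1.0, 0.3}; // made-up segment coefficients
      coeff4 deriv_seg = {1.5, -0.4, 1.0, 0.0};
      double r = 0.237;
      double p = r * rdr;
      int m = (int)p;                           // segment index (clamped in the real code)
      p -= m;
      printf("value ~ %.4f  derivative ~ %.4f\n",
             eval_cubic(value_seg, p), eval_quadratic(deriv_seg, p));
      return 0;
    }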

View File

@ -52,8 +52,8 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
if (nghost>0) {
UCL_H_Vec<numtyp> host_view;
UCL_D_Vec<numtyp> dev_view;
host_view.view_offset(_nlocal,host_fp);
dev_view.view_offset(_nlocal,dev_fp);
host_view.view_offset(_nlocal,_fp.host);
dev_view.view_offset(_nlocal,_fp.device);
ucl_copy(dev_view,host_view,nghost,true);
}
}
@ -128,8 +128,7 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
bool _compiled_energy;
/// Per-atom arrays
UCL_H_Vec<numtyp> host_fp;
UCL_D_Vec<numtyp> dev_fp;
UCL_Vector<numtyp,numtyp> _fp;
protected:
bool _allocated;

View File

@ -20,6 +20,14 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex, quat_tex;
#else
texture<int4,1> pos_tex, quat_tex;
#endif
#else
#define pos_tex x_
#define quat_tex qif
#endif
#define atom_info(t_per_atom, ii, tid, offset) \
@ -411,7 +419,7 @@ ucl_inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans,
ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi,
numtyp mat[9])
{
numtyp4 q=qif[qi];
numtyp4 q; fetch4(q,qi,quat_tex);
numtyp w2 = q.x*q.x;
numtyp i2 = q.y*q.y;
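
The header change above adds a quat_tex texture so gpu_quat_to_mat_trans fetches the quaternion through fetch4 before building the rotation matrix. For reference, the sketch below is the standard unit-quaternion (w, i, j, k) to rotation-matrix formula together with an explicit transpose, since the kernel's name indicates it stores the transposed matrix; it does not claim to reproduce the kernel's exact storage layout.

    #include <cstdio>
    #include <cmath>

    // Standard rotation matrix from a unit quaternion q = (w, i, j, k),
    // stored row-major in m[9].
    void quat_to_mat(const double q[4], double m[9]) {
      double w = q[0], i = q[1], j = q[2], k = q[3];
      double w2 = w * w, i2 = i * i, j2 = j * j, k2 = k * k;
      double twoij = 2.0 * i * j, twoik = 2.0 * i * k, twojk = 2.0 * j * k;
      double twoiw = 2.0 * i * w, twojw = 2.0 * j * w, twokw = 2.0 * k * w;
      m[0] = w2 + i2 - j2 - k2;  m[1] = twoij - twokw;      m[2] = twoik + twojw;
      m[3] = twoij + twokw;      m[4] = w2 - i2 + j2 - k2;  m[5] = twojk - twoiw;
      m[6] = twoik - twojw;      m[7] = twojk + twoiw;      m[8] = w2 - i2 - j2 + k2;
    }

    void transpose3(const double m[9], double t[9]) {
      for (int r = 0; r < 3; r++)
        for (int c = 0; c < 3; c++)
          t[3 * c + r] = m[3 * r + c];
    }

    int main() {
      double s = sqrt(0.5);
      double q[4] = {s, 0.0, 0.0, s};           // 90-degree rotation about z
      double m[9], mt[9];
      quat_to_mat(q, m);
      transpose3(m, mt);
      printf("R  row 0: %6.3f %6.3f %6.3f\n", m[0], m[1], m[2]);
      printf("Rt row 0: %6.3f %6.3f %6.3f\n", mt[0], mt[1], mt[2]);
      return 0;
    }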

View File

@ -15,6 +15,13 @@
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
// ---------------------------------------------------------------------------
@ -40,14 +47,14 @@ __kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
__global int *list_end=nbor+fast_mul(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul(iw,ntypes);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
numtyp2 cf=cut_form[mtype];
@ -102,7 +109,7 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
__global int *list_end=nbor+fast_mul(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -110,7 +117,7 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
int mtype=itype+jtype;

View File

@ -13,12 +13,15 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "gayberne_cl.h"
#include "gayberne_lj_cl.h"
#elif defined(USE_CUDART)
const char *gayberne=0;
const char *gayberne_lj=0;
#else
#include "gayberne_ptx.h"
#include "gayberne_lj_ptx.h"
#include "gayberne_cubin.h"
#include "gayberne_lj_cubin.h"
#endif
#include "lal_gayberne.h"
@ -57,7 +60,8 @@ int GayBerneT::init(const int ntypes, const double gamma,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ntypes,h_form,gayberne,gayberne_lj);
_screen,ntypes,h_form,gayberne,gayberne_lj,
"k_gayberne");
if (success!=0)
return success;
@ -210,13 +214,13 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
&stride, &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
&this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
&this->_threads_per_atom);
this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
&this->shape, &this->well, &this->gamma_upsilon_mu,
&this->sigma_epsilon, &this->_lj_types,
&this->lshape, &this->nbor->dev_nbor, &stride,
&this->ans->force, &ainum, &this->ans->engv,
&this->dev_error, &eflag, &vflag,
&this->_last_ellipse, &this->_threads_per_atom);
this->time_ellipsoid.stop();
if (this->_last_ellipse==this->ans->inum()) {
@ -243,17 +247,19 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
this->time_ellipsoid2.start();
this->k_sphere_ellipsoid.set_size(GX,BX);
this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(),
&this->well.begin(), &this->gamma_upsilon_mu.begin(),
&this->sigma_epsilon.begin(), &this->_lj_types, &this->lshape.begin(),
&this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat,
&this->shape, &this->well,
&this->gamma_upsilon_mu,
&this->sigma_epsilon, &this->_lj_types,
&this->lshape, &this->nbor->dev_nbor,
&stride, &this->ans->force,
&this->ans->engv, &this->dev_error,
&eflag, &vflag, &this->_last_ellipse,
&ainum, &this->_threads_per_atom);
this->time_ellipsoid2.stop();
} else {
this->ans->dev_ans.zero();
this->ans->dev_engv.zero();
this->ans->force.zero();
this->ans->engv.zero();
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->time_ellipsoid.stop();
@ -268,19 +274,20 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
if (this->_last_ellipse<this->ans->inum()) {
if (this->_shared_types) {
this->k_lj_fast.set_size(GX,BX);
this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
&this->lj3.begin(), &this->gamma_upsilon_mu.begin(), &stride,
&this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(),
&eflag, &vflag, &this->_last_ellipse, &ainum,
this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3,
&this->gamma_upsilon_mu, &stride,
&this->nbor->dev_packed, &this->ans->force,
&this->ans->engv, &this->dev_error, &eflag,
&vflag, &this->_last_ellipse, &ainum,
&this->_threads_per_atom);
} else {
this->k_lj.set_size(GX,BX);
this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
&this->lj3.begin(), &this->_lj_types, &this->gamma_upsilon_mu.begin(),
&stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3,
&this->_lj_types, &this->gamma_upsilon_mu, &stride,
&this->nbor->dev_packed, &this->ans->force,
&this->ans->engv, &this->dev_error, &eflag,
&vflag, &this->_last_ellipse, &ainum,
&this->_threads_per_atom);
}
}
this->time_lj.stop();
@ -294,12 +301,11 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
&stride, &this->ans->dev_ans.begin(), &ainum,
&this->ans->dev_engv.begin(), &this->dev_error.begin(),
this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
&this->shape, &this->well, &this->gamma_upsilon_mu,
&this->sigma_epsilon, &this->_lj_types, &this->lshape,
&this->nbor->dev_nbor, &stride, &this->ans->force,
&ainum, &this->ans->engv, &this->dev_error,
&eflag, &vflag, &ainum, &this->_threads_per_atom);
this->time_ellipsoid.stop();
}

View File

@ -80,7 +80,7 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
__kernel void k_gayberne(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
@ -117,7 +117,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
numtyp4 ishape=shape[itype];
@ -136,7 +136,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int jtype=jx.w;
// Compute r12

View File

@ -17,15 +17,15 @@
#include "lal_ellipsoid_extra.h"
#endif
__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
__global numtyp4* shape,__global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag,const int start, const int inum,
const int t_per_atom) {
__kernel void k_gayberne_sphere_ellipsoid(__global numtyp4 *x_,
__global numtyp4 *q, __global numtyp4* shape,
__global numtyp4* well, __global numtyp *gum,
__global numtyp2* sig_eps, const int ntypes,
__global numtyp *lshape, __global int *dev_nbor,
const int stride, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag,const int start,
const int inum, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
@ -51,7 +51,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int itype=ix.w;
numtyp oner=shape[itype].x;
@ -64,7 +64,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int jtype=jx.w;
// Compute r12
@ -236,14 +236,13 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
} // if ii
}
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void k_gayberne_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
__global numtyp *gum, const int stride,
__global int *dev_ij, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int start,
const int inum, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
@ -269,7 +268,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int itype=ix.w;
numtyp factor_lj;
@ -279,7 +278,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int jtype=jx.w;
// Compute r12
@ -319,13 +318,13 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_gayberne_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
const int vflag, const int start,
const int inum, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
@ -358,7 +357,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -369,7 +368,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int mtype=itype+jx.w;
// Compute r12
@ -406,3 +405,4 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
ans,engv);
} // if ii
}

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "lj_cl.h"
#elif defined(USE_CUDART)
const char *lj=0;
#else
#include "lj_ptx.h"
#include "lj_cubin.h"
#endif
#include "lal_lj.h"
@ -51,7 +53,7 @@ int LJT::init(const int ntypes,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj);
_screen,lj,"k_lj");
if (success!=0)
return success;
@ -133,20 +135,17 @@ void LJT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
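
Each pair style's init call above now passes a base kernel name ("k_lj" here, "k_eam", "k_gayberne", and so on elsewhere in this commit) along with the program source, matching the renaming of the kernels from the generic kernel_pair / kernel_pair_fast to per-style names. The sketch below shows the kind of name-based lookup this enables; that the base class derives the _fast variant by appending a suffix is an assumption, and the Program type is a toy stand-in for a compiled GPU program object.

    #include <cstdio>
    #include <map>
    #include <string>

    using Kernel = void (*)();
    static void demo_k_lj()      { printf("k_lj\n"); }
    static void demo_k_lj_fast() { printf("k_lj_fast\n"); }

    // Toy "compiled program": a table of kernel entry points addressable by name.
    struct Program {
      std::map<std::string, Kernel> entry;
      Kernel get(const std::string &name) const {
        auto it = entry.find(name);
        return it == entry.end() ? nullptr : it->second;
      }
    };

    // Bind both variants of a pair kernel from a single base name.
    bool bind_pair_kernels(const Program &p, const std::string &base,
                           Kernel &k_pair, Kernel &k_pair_fast) {
      k_pair = p.get(base);
      k_pair_fast = p.get(base + "_fast");
      return k_pair != nullptr && k_pair_fast != nullptr;
    }

    int main() {
      Program prog;
      prog.entry["k_lj"] = demo_k_lj;
      prog.entry["k_lj_fast"] = demo_k_lj_fast;
      Kernel pair = nullptr, pair_fast = nullptr;
      if (bind_pair_kernels(prog, "k_lj", pair, pair_fast)) {
        pair();
        pair_fast();
      }
      return 0;
    }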

View File

@ -15,14 +15,16 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void k_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -101,7 +103,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -137,7 +139,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -148,7 +150,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
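
A small point about the "fast" kernels above: they are used when the number of atom types is small enough that the per-pair coefficients fit in fast on-chip storage, and the (itype, jtype) lookup is then flattened to a single index via itype = MAX_SHARED_TYPES*iw and mtype = itype + jx.w. That is ordinary row-major indexing; the host-side sketch below just checks the arithmetic, and the MAX_SHARED_TYPES value here is illustrative only.

    #include <cstdio>

    const int MAX_SHARED_TYPES = 8;   // illustrative; the real value comes from the package headers

    int flat_index(int iw, int jw) { return MAX_SHARED_TYPES * iw + jw; }   // row-major (i,j) -> 1D

    int main() {
      double lj_table[MAX_SHARED_TYPES * MAX_SHARED_TYPES];
      for (int i = 0; i < MAX_SHARED_TYPES; i++)
        for (int j = 0; j < MAX_SHARED_TYPES; j++)
          lj_table[flat_index(i, j)] = 100.0 * i + j;    // tag each (i,j) pair
      printf("entry for types (3,5): %g\n", lj_table[flat_index(3, 5)]);
      return 0;
    }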

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "lj96_cl.h"
#elif defined(USE_CUDART)
const char *lj96=0;
#else
#include "lj96_ptx.h"
#include "lj96_cubin.h"
#endif
#include "lal_lj96.h"
@ -51,7 +53,7 @@ int LJ96T::init(const int ntypes,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj96);
_screen,lj96,"k_lj96");
if (success!=0)
return success;
@ -133,19 +135,17 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
this->k_pair.run(&this->atom->x, &lj1, &lj3,
&_lj_types, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();

View File

@ -15,14 +15,16 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void k_lj96(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -102,7 +104,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_lj96_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -138,7 +140,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -149,7 +151,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "lj_class2_long_cl.h"
#elif defined(USE_CUDART)
const char *lj_class2_long=0;
#else
#include "lj_class2_long_ptx.h"
#include "lj_class2_long_cubin.h"
#endif
#include "lal_lj_class2_long.h"
@ -55,7 +57,7 @@ int LJClass2LongT::init(const int ntypes, double **host_cutsq,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_class2_long);
_screen,lj_class2_long,"k_lj_class2_long");
if (success!=0)
return success;
@ -143,22 +145,19 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
this->k_pair.run(&this->atom->x, &lj1, &lj3,
&_lj_types, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->q, &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();

View File

@ -14,18 +14,22 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_lj_class2_long(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -101,7 +105,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
@ -136,7 +141,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_lj_class2_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -175,8 +180,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -188,7 +193,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
@ -215,7 +220,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "lj_coul_cl.h"
#elif defined(USE_CUDART)
const char *lj_coul=0;
#else
#include "lj_coul_ptx.h"
#include "lj_coul_cubin.h"
#endif
#include "lal_lj_coul.h"
@ -54,7 +56,7 @@ int LJCoulT::init(const int ntypes,
double *host_special_coul, const double qqrd2e) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_coul);
_screen,lj_coul,"k_lj_coul");
if (success!=0)
return success;
@ -145,23 +147,18 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch,
&this->atom->dev_q.begin(), &cutsq.begin(),
&_qqrd2e, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_qqrd2e, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
&cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_qqrd2e, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -14,18 +14,22 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_lj_coul(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -93,9 +97,10 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} else
force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w)
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
else
if (rsq < lj1[mtype].w) {
fetch(forcecoul,j,q_tex);
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
@ -127,7 +132,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_lj_coul_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -168,8 +173,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -181,7 +186,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
@ -200,9 +205,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
} else
force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w)
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
else
if (rsq < lj1[mtype].w) {
fetch(forcecoul,j,q_tex);
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "lj_coul_long_cl.h"
#elif defined(USE_CUDART)
const char *lj_coul_long=0;
#else
#include "lj_coul_long_ptx.h"
#include "lj_coul_long_cubin.h"
#endif
#include "lal_lj_coul_long.h"
@ -55,7 +57,7 @@ int LJCoulLongT::init(const int ntypes,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_coul_long);
_screen,lj_coul_long,"k_lj_coul_long");
if (success!=0)
return success;
@ -143,22 +145,19 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
this->k_pair.run(&this->atom->x, &lj1, &lj3,
&_lj_types, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->q, &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();

View File

@ -14,18 +14,22 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_lj_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -99,7 +103,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
@ -134,7 +139,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_lj_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -173,8 +178,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -186,7 +191,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
@ -211,7 +216,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
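A worked form of the real-space term evaluated in the two kernels above, assuming EWALD_P and A1-A5 are the coefficients of the standard Abramowitz-Stegun 7.1.26 erfc fit and EWALD_F = 2/sqrt(pi): with $x = g_{\mathrm{ewald}}\,r$ and $t = 1/(1 + \mathrm{EWALD\_P}\,x)$,

\[
\operatorname{erfc}(x) \approx t\bigl(A_1 + t(A_2 + t(A_3 + t(A_4 + t A_5)))\bigr)e^{-x^2},
\]
\[
\mathrm{forcecoul} = \frac{\mathrm{qqrd2e}\;q_i q_j}{r}
\Bigl[\operatorname{erfc}(x) + \mathrm{EWALD\_F}\,x\,e^{-x^2} - \mathrm{factor\_coul}\Bigr],
\]

where factor_coul = 1 - special_coul removes the excluded-bond fraction of the pair; the kernel then multiplies (force_lj + forcecoul) by $r^{-2}$ and by the displacement components to accumulate the force.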

View File

@ -13,10 +13,12 @@
email : ibains@nvidia.com
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "lj_expand_cl.h"
#elif defined(USE_CUDART)
const char *lj_expand=0;
#else
#include "lj_expand_ptx.h"
#include "lj_expand_cubin.h"
#endif
#include "lal_lj_expand.h"
@ -51,7 +53,7 @@ int LJExpandT::init(const int ntypes, double **host_cutsq,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_expand);
_screen,lj_expand,"k_lj_expand");
if (success!=0)
return success;
@ -133,20 +135,17 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -14,15 +14,19 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
#endif
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
#else
#define pos_tex x_
#endif
__kernel void k_lj_expand(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -51,7 +55,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -61,7 +65,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -104,7 +108,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_lj_expand_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -140,7 +144,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -151,7 +155,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "morse_cl.h"
#elif defined(USE_CUDART)
const char *morse=0;
#else
#include "morse_ptx.h"
#include "morse_cubin.h"
#endif
#include "lal_morse.h"
@ -51,7 +53,7 @@ int MorseT::init(const int ntypes,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,morse);
_screen,morse,"k_morse");
if (success!=0)
return success;
@ -132,20 +134,17 @@ void MorseT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &mor1.begin(),
&mor2.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &mor1, &mor2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &mor1.begin(), &mor2.begin(),
&_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &mor1, &mor2, &_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -14,15 +14,19 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
#endif
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
#else
#define pos_tex x_
#endif
__kernel void k_morse(__global numtyp4 *x_, __global numtyp4 *mor1,
__global numtyp2* mor2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -51,7 +55,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -61,7 +65,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -102,7 +106,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
__kernel void k_morse_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
__global numtyp2* mor2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -138,7 +142,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -149,7 +153,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12

View File

@ -84,7 +84,7 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
_max_atoms=1000;
_max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
_max_nbors=max_nbors;
_max_nbors=(max_nbors/threads_per_atom+1)*threads_per_atom;
_maxspecial=maxspecial;
if (gpu_nbor==0)
@ -124,17 +124,14 @@ void Neighbor::alloc(bool &success) {
_c_bytes+=dev_packed.row_bytes();
}
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
dev_host_numj.clear();
nbor_host.clear();
dev_numj_host.clear();
host_ilist.clear();
host_jlist.clear();
success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc(_max_nbors*_max_host,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (dev_host_numj.alloc(_max_host,*dev,
success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_RW_OPTIMIZED,
UCL_WRITE_ONLY)==UCL_SUCCESS) && success;
success=success && (dev_numj_host.alloc(_max_host,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
@ -145,16 +142,16 @@ void Neighbor::alloc(bool &success) {
UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
int *ptr=host_nbor.begin();
int *ptr=nbor_host.host.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=_max_nbors;
}
_c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes();
_c_bytes+=nbor_host.device.row_bytes()+dev_numj_host.row_bytes();
} else {
// Some OpenCL implementations return errors for NULL pointers as args
dev_host_nbor.view(dev_nbor);
dev_host_numj.view(dev_nbor);
nbor_host.device.view(dev_nbor);
dev_numj_host.view(dev_nbor);
}
if (_maxspecial>0) {
dev_nspecial.clear();
@ -194,10 +191,9 @@ void Neighbor::clear() {
host_packed.clear();
host_acc.clear();
dev_nbor.clear();
dev_host_nbor.clear();
nbor_host.clear();
dev_packed.clear();
host_nbor.clear();
dev_host_numj.clear();
dev_numj_host.clear();
host_ilist.clear();
host_jlist.clear();
dev_nspecial.clear();
@ -215,8 +211,8 @@ void Neighbor::clear() {
double Neighbor::host_memory_usage() const {
if (_gpu_nbor>0) {
if (_gpu_host)
return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+
host_jlist.row_bytes();
return nbor_host.device.row_bytes()*nbor_host.rows()+
host_ilist.row_bytes()+host_jlist.row_bytes();
else
return 0;
} else
@ -285,8 +281,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
block_size));
_shared->k_nbor.set_size(GX,block_size);
_shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum,
&_threads_per_atom);
_shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom);
time_kernel.stop();
}
}
@ -295,31 +290,23 @@ template <class numtyp, class acctyp>
void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
if (maxn>_max_nbors) {
int mn=static_cast<int>(static_cast<double>(maxn)*1.10);
dev_nbor.clear();
success=success &&
(dev_nbor.alloc((mn+1)*_max_atoms,*dev)==UCL_SUCCESS);
mn=(mn/_threads_per_atom+1)*_threads_per_atom;
success=success && (dev_nbor.resize((mn+1)*_max_atoms)==UCL_SUCCESS);
_gpu_bytes=dev_nbor.row_bytes();
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
success=success && (host_nbor.alloc(mn*_max_host,*dev,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc(mn*_max_host,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
int *ptr=host_nbor.begin();
success=success && (nbor_host.resize(mn*_max_host)==UCL_SUCCESS);
int *ptr=nbor_host.host.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=mn;
}
_gpu_bytes+=dev_host_nbor.row_bytes();
_gpu_bytes+=nbor_host.row_bytes();
} else {
dev_host_nbor.view(dev_nbor);
dev_host_numj.view(dev_nbor);
nbor_host.device.view(dev_nbor);
dev_numj_host.view(dev_nbor);
}
if (_alloc_packed) {
dev_packed.clear();
success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (dev_packed.resize((mn+2)*_max_atoms)==UCL_SUCCESS);
_gpu_bytes+=dev_packed.row_bytes();
}
_max_nbors=mn;
@ -337,16 +324,12 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
// Calculate number of cells and allocate storage for binning as necessary
int ncellx, ncelly, ncellz, ncell_3d;
ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) +
2.0*_cell_size)/_cell_size));
ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) +
2.0*_cell_size)/_cell_size));
ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) +
2.0*_cell_size)/_cell_size));
int ghost_cells=2*_cells_in_cutoff;
ncellx = static_cast<int>(ceil((subhi[0]-sublo[0])/_cell_size))+ghost_cells;
ncelly = static_cast<int>(ceil((subhi[1]-sublo[1])/_cell_size))+ghost_cells;
ncellz = static_cast<int>(ceil((subhi[2]-sublo[2])/_cell_size))+ghost_cells;
ncell_3d = ncellx * ncelly * ncellz;
if (ncell_3d+1>_ncells) {
dev_cell_counts.clear();
dev_cell_counts.alloc(ncell_3d+1,dev_nbor);
if (_gpu_nbor==2) {
if (_ncells>0) {
host_cell_counts.clear();
@ -355,11 +338,19 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
cell_iter = new int[ncell_3d+1];
host_cell_counts.alloc(ncell_3d+1,dev_nbor);
}
if (_gpu_nbor==2 && atom.host_view())
dev_cell_counts.view(host_cell_counts);
else {
dev_cell_counts.clear();
dev_cell_counts.alloc(ncell_3d+1,dev_nbor);
}
_ncells=ncell_3d+1;
_cell_bytes=dev_cell_counts.row_bytes();
}
const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
const numtyp cutoff_cast=static_cast<numtyp>(_cutoff);
if (_maxspecial>0) {
time_nbor.start();
@ -379,8 +370,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
&_maxspecial,&nt);
_shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt);
time_transpose.stop();
}
@ -392,28 +382,48 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
// Build cell list on CPU
host_cell_counts.zero();
double m_cell_size=-_cell_size;
double dx=subhi[0]-sublo[0]+_cell_size;
double dy=subhi[1]-sublo[1]+_cell_size;
double dz=subhi[2]-sublo[2]+_cell_size;
double i_cell_size=1.0/_cell_size;
for (int i=0; i<nall; i++) {
int offset_hi=_cells_in_cutoff+1;
for (int i=0; i<nt; i++) {
double px, py, pz;
px=x[i][0]-sublo[0];
py=x[i][1]-sublo[1];
pz=x[i][2]-sublo[2];
if (px<m_cell_size) px=m_cell_size;
if (py<m_cell_size) py=m_cell_size;
if (pz<m_cell_size) pz=m_cell_size;
if (px>dx) px=dx;
if (py>dy) py=dy;
if (pz>dz) pz=dz;
int id=static_cast<int>(px/_cell_size + 1.0) +
static_cast<int>(py/_cell_size + 1.0) * ncellx +
static_cast<int>(pz/_cell_size + 1.0) * ncellx * ncelly;
int ix = static_cast<int>(px*i_cell_size+1);
ix = std::max(ix,_cells_in_cutoff);
ix = std::min(ix,ncellx-offset_hi);
int iy = static_cast<int>(py*i_cell_size+1);
iy = std::max(iy,_cells_in_cutoff);
iy = std::min(iy,ncelly-offset_hi);
int iz = static_cast<int>(pz*i_cell_size+1);
iz = std::max(iz,_cells_in_cutoff);
iz = std::min(iz,ncellz-offset_hi);
cell_id[i]=id;
int id = ix+iy*ncellx+iz*ncellx*ncelly;
cell_id[i] = id;
host_cell_counts[id+1]++;
}
for (int i=nt; i<nall; i++) {
double px, py, pz;
px=x[i][0]-sublo[0];
py=x[i][1]-sublo[1];
pz=x[i][2]-sublo[2];
int ix = static_cast<int>(px*i_cell_size+1);
ix = std::max(ix,0);
ix = std::min(ix,ncellx-1);
int iy = static_cast<int>(py*i_cell_size+1);
iy = std::max(iy,0);
iy = std::min(iy,ncelly-1);
int iz = static_cast<int>(pz*i_cell_size+1);
iz = std::max(iz,0);
iz = std::min(iz,ncellz-1);
int id = ix+iy*ncellx+iz*ncellx*ncelly;
cell_id[i] = id;
host_cell_counts[id+1]++;
}
@ -451,41 +461,39 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
time_kernel.start();
_nbor_pitch=inum;
_shared->neigh_tex.bind_float(atom.dev_x,4);
_shared->neigh_tex.bind_float(atom.x,4);
// If binning on GPU, do this now
if (_gpu_nbor==1) {
const numtyp i_cell_size=static_cast<numtyp>(1.0/_cell_size);
const int neigh_block=_block_cell_id;
const int GX=(int)ceil((float)nall/neigh_block);
const numtyp sublo0=static_cast<numtyp>(sublo[0]);
const numtyp sublo1=static_cast<numtyp>(sublo[1]);
const numtyp sublo2=static_cast<numtyp>(sublo[2]);
const numtyp subhi0=static_cast<numtyp>(subhi[0]);
const numtyp subhi1=static_cast<numtyp>(subhi[1]);
const numtyp subhi2=static_cast<numtyp>(subhi[2]);
_shared->k_cell_id.set_size(GX,neigh_block);
_shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
&atom.dev_particle_id.begin(),
&sublo0, &sublo1, &sublo2, &subhi0, &subhi1,
&subhi2, &cell_size_cast, &ncellx, &ncelly, &nall);
_shared->k_cell_id.run(&atom.x, &atom.dev_cell_id,
&atom.dev_particle_id, &sublo0, &sublo1,
&sublo2, &i_cell_size, &ncellx, &ncelly, &ncellz,
&nt, &nall, &_cells_in_cutoff);
atom.sort_neighbor(nall);
/* calculate cell count */
_shared->k_cell_counts.set_size(GX,neigh_block);
_shared->k_cell_counts.run(&atom.dev_cell_id.begin(),
&dev_cell_counts.begin(), &nall, &ncell_3d);
_shared->k_cell_counts.run(&atom.dev_cell_id, &dev_cell_counts, &nall,
&ncell_3d);
}
/* build the neighbor list */
const int cell_block=_block_nbor_build;
_shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
_shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
&dev_cell_counts.begin(), &dev_nbor.begin(),
&dev_host_nbor.begin(), &dev_host_numj.begin(),
&_max_nbors,&cell_size_cast,
&ncellx, &ncelly, &ncellz, &inum, &nt, &nall,
&_threads_per_atom);
_shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)*
(ncellz-ghost_cells),cell_block,1);
_shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id,
&dev_cell_counts, &dev_nbor, &nbor_host,
&dev_numj_host, &_max_nbors, &cutoff_cast, &ncellx,
&ncelly, &ncellz, &inum, &nt, &nall,
&_threads_per_atom, &_cells_in_cutoff);
/* Get the maximum number of nbors and realloc if necessary */
UCL_D_Vec<int> numj;
@ -494,7 +502,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
if (nt>inum) {
UCL_H_Vec<int> host_offset;
host_offset.view_offset(inum,host_acc,nt-inum);
ucl_copy(host_offset,dev_host_numj,nt-inum,true);
ucl_copy(host_offset,dev_numj_host,nt-inum,true);
}
if (_gpu_nbor!=2) {
@ -521,17 +529,16 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
const int GX2=static_cast<int>(ceil(static_cast<double>
(nt*_threads_per_atom)/cell_block));
_shared->k_special.set_size(GX2,cell_block);
_shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
&dev_host_numj.begin(), &atom.dev_tag.begin(),
&dev_nspecial.begin(), &dev_special.begin(),
_shared->k_special.run(&dev_nbor, &nbor_host, &dev_numj_host,
&atom.dev_tag, &dev_nspecial, &dev_special,
&inum, &nt, &_max_nbors, &_threads_per_atom);
}
time_kernel.stop();
time_nbor.start();
if (inum<nt) {
ucl_copy(host_nbor,dev_host_nbor,true);
host_nbor.sync();
nbor_host.update_host(true);
nbor_host.sync();
}
time_nbor.stop();
}
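A minimal standalone sketch of the binning arithmetic in the two host loops above (the box, cell counts, and coordinates in main are illustrative values only): a sublo-relative coordinate is scaled by 1/cell_size, shifted by the same +1 offset, clamped so that owned atoms stay in interior cells while ghosts may land in the ghost layers, and the (ix,iy,iz) triple is flattened into a single cell id.

#include <algorithm>
#include <cstdio>

int flat_cell_id(double px, double py, double pz,      // position minus sublo
                 double i_cell_size, int cells_in_cutoff,
                 int ncellx, int ncelly, int ncellz, bool owned) {
  const int lo  = owned ? cells_in_cutoff : 0;
  const int hix = owned ? ncellx - cells_in_cutoff - 1 : ncellx - 1;
  const int hiy = owned ? ncelly - cells_in_cutoff - 1 : ncelly - 1;
  const int hiz = owned ? ncellz - cells_in_cutoff - 1 : ncellz - 1;
  int ix = std::min(std::max(static_cast<int>(px * i_cell_size + 1), lo), hix);
  int iy = std::min(std::max(static_cast<int>(py * i_cell_size + 1), lo), hiy);
  int iz = std::min(std::max(static_cast<int>(pz * i_cell_size + 1), lo), hiz);
  return ix + iy * ncellx + iz * ncellx * ncelly;       // flattened cell id
}

int main() {
  // hypothetical 10x10x10 subdomain, cell_size 2.5, one ghost layer:
  // ncell = ceil(10/2.5) + 2*1 = 6 per dimension
  printf("%d\n", flat_cell_id(3.1, 0.2, 9.9, 1.0/2.5, 1, 6, 6, 6, true));  // 152
  return 0;
}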

View File

@ -22,20 +22,6 @@
#define IJ_SIZE 131072
#ifdef USE_OPENCL
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
using namespace ucl_opencl;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
using namespace ucl_cudadr;
#endif
namespace LAMMPS_AL {
class Neighbor {
@ -70,7 +56,14 @@ class Neighbor {
const int warp_size, const bool time_device);
/// Set the size of the cutoff+skin
inline void cell_size(const double size) { _cell_size=size; }
inline void cell_size(const double size, const double cutoff) {
_cell_size=size;
_cutoff=cutoff;
if (cutoff>size)
_cells_in_cutoff=static_cast<int>(ceil(cutoff/size));
else
_cells_in_cutoff=1;
}
/// Get the size of the cutoff+skin
inline double cell_size() const { return _cell_size; }
@ -203,14 +196,11 @@ class Neighbor {
// ----------------- Data for GPU Neighbor Calculation ---------------
/// Host storage for device calculated neighbor lists
/** Same storage format as device matrix **/
UCL_H_Vec<int> host_nbor;
/// Device storage for neighbor list matrix that will be copied to host
/// Host/Device storage for device calculated neighbor lists
/** - 1st row is numj
* - Remaining rows are by atom, columns are nbors **/
UCL_D_Vec<int> dev_host_nbor;
UCL_D_Vec<int> dev_host_numj;
UCL_Vector<int,int> nbor_host;
UCL_D_Vec<int> dev_numj_host;
UCL_H_Vec<int> host_ilist;
UCL_H_Vec<int*> host_jlist;
/// Device storage for special neighbor counts
@ -232,13 +222,14 @@ class Neighbor {
bool _allocated, _use_packing, _nbor_time_avail, _time_device;
int _gpu_nbor, _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
bool _gpu_host, _alloc_packed;
double _cell_size, _bin_time;
double _cutoff, _cell_size, _bin_time;
double _gpu_bytes, _c_bytes, _cell_bytes;
void alloc(bool &success);
int _block_cell_2d, _block_cell_id, _max_block_nbor_build, _block_nbor_build;
int _ncells, _threads_per_atom, _total_atoms;
int _cells_in_cutoff;
template <class numtyp, class acctyp>
inline void resize_max_neighbors(const int maxn, bool &success);
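A small sketch of the cell_size()/cutoff bookkeeping introduced above, using only what the inline setter shows (the numbers in main are illustrative): when the interaction cutoff exceeds the bin size, ceil(cutoff/cell_size) layers of cells must be searched in each direction instead of one.

#include <cmath>
#include <cstdio>

int cells_in_cutoff(double cell_size, double cutoff) {
  // mirrors the inline cell_size(size, cutoff) setter above
  if (cutoff > cell_size)
    return static_cast<int>(std::ceil(cutoff / cell_size));
  return 1;
}

int main() {
  printf("%d\n", cells_in_cutoff(4.0, 3.5));   // 1: cutoff fits in a single layer
  printf("%d\n", cells_in_cutoff(4.0, 10.0));  // 3: three layers per direction
  return 0;
}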

View File

@ -16,38 +16,48 @@
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
texture<float4> neigh_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(neigh_tex, i); }
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
__kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id,
numtyp boxlo0,
numtyp boxlo1, numtyp boxlo2, numtyp boxhi0,
numtyp boxhi1, numtyp boxhi2, numtyp cell_size,
int ncellx, int ncelly, int nall) {
numtyp boxlo0, numtyp boxlo1, numtyp boxlo2,
numtyp i_cell_size, int ncellx, int ncelly,
int ncellz, int inum, int nall,
int cells_in_cutoff) {
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < nall) {
numtyp4 p = fetch_pos(i,pos); //pos[i];
numtyp4 p;
fetch4(p,i,pos_tex); //pos[i];
p.x -= boxlo0;
p.y -= boxlo1;
p.z -= boxlo2;
p.x = fmaxf(p.x, -cell_size);
p.x = fminf(p.x, boxhi0-boxlo0+cell_size);
p.y = fmaxf(p.y, -cell_size);
p.y = fminf(p.y, boxhi1-boxlo1+cell_size);
p.z = fmaxf(p.z, -cell_size);
p.z = fminf(p.z, boxhi2-boxlo2+cell_size);
int ix = int(p.x*i_cell_size+cells_in_cutoff);
int iy = int(p.y*i_cell_size+cells_in_cutoff);
int iz = int(p.z*i_cell_size+cells_in_cutoff);
unsigned int id = (unsigned int)(p.x/cell_size + 1.0)
+ (unsigned int)(p.y/cell_size + 1.0) * ncellx
+ (unsigned int)(p.z/cell_size + 1.0) * ncellx * ncelly;
int offset_lo, offset_hi;
if (i<inum) {
offset_lo=cells_in_cutoff;
offset_hi=cells_in_cutoff+1;
} else {
offset_lo=0;
offset_hi=1;
}
cell_id[i] = id;
ix = max(ix,offset_lo);
ix = min(ix,ncellx-offset_hi);
iy = max(iy,offset_lo);
iy = min(iy,ncelly-offset_hi);
iz = max(iz,offset_lo);
iz = min(iz,ncellz-offset_hi);
cell_id[i] = ix+iy*ncellx+iz*ncellx*ncelly;
particle_id[i] = i;
}
}
@ -78,6 +88,8 @@ __kernel void kernel_calc_cell_counts(unsigned *cell_id,
}
}
#else
#define pos_tex x_
#endif
@ -113,12 +125,13 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
__global int *host_numj,
int neigh_bin_size, numtyp cell_size,
int ncellx, int ncelly, int ncellz,
int inum, int nt, int nall, int t_per_atom)
int inum, int nt, int nall, int t_per_atom,
int cells_in_cutoff)
{
int tid = THREAD_ID_X;
int ix = BLOCK_ID_X;
int iy = BLOCK_ID_Y % ncelly;
int iz = BLOCK_ID_Y / ncelly;
int ix = BLOCK_ID_X + cells_in_cutoff;
int iy = BLOCK_ID_Y % (ncelly - cells_in_cutoff*2) + cells_in_cutoff;
int iz = BLOCK_ID_Y / (ncelly - cells_in_cutoff*2) + cells_in_cutoff;
int bsx = BLOCK_SIZE_X;
int icell = ix + iy*ncellx + iz*ncellx*ncelly;
@ -129,9 +142,9 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
int icell_begin = cell_counts[icell];
int icell_end = cell_counts[icell+1];
int nborz0 = max(iz-1,0), nborz1 = min(iz+1, ncellz-1),
nbory0 = max(iy-1,0), nbory1 = min(iy+1, ncelly-1),
nborx0 = max(ix-1,0), nborx1 = min(ix+1, ncellx-1);
int nborz0 = iz-cells_in_cutoff, nborz1 = iz+cells_in_cutoff,
nbory0 = iy-cells_in_cutoff, nbory1 = iy+cells_in_cutoff,
nborx0 = ix-cells_in_cutoff, nborx1 = ix+cells_in_cutoff;
numtyp4 diff;
numtyp r2;
@ -147,7 +160,7 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
pid_i = cell_particle_id[i];
if (pid_i < nt) {
atom_i = fetch_pos(pid_i,x_); //pos[pid_i];
fetch4(atom_i,pid_i,pos_tex); //pos[i];
}
if (pid_i < inum) {
stride=inum;
@ -182,7 +195,7 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
if (tid < end_idx) {
pid_j = cell_particle_id[tid+k*bsx+jcell_begin];
cell_list_sh[tid] = pid_j;
atom_j = fetch_pos(pid_j,x_); //[pid_j];
fetch4(atom_j,pid_j,pos_tex); //[pid_j];
pos_sh[tid].x = atom_j.x;
pos_sh[tid].y = atom_j.y;
pos_sh[tid].z = atom_j.z;
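One consequence of the fixed nborx/nbory/nborz bounds above: each cell now scans a cubic stencil of

\[
(2\,c_{\mathrm{cut}} + 1)^3
\]

neighbor cells, where $c_{\mathrm{cut}}$ is cells_in_cutoff, i.e. the familiar 27-cell stencil when the cutoff fits in one bin and 125 cells when it spans two.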

View File

@ -16,12 +16,15 @@
#include "lal_precision.h"
#include "lal_neighbor_shared.h"
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "neighbor_cpu_cl.h"
#include "neighbor_gpu_cl.h"
#elif defined(USE_CUDART)
const char *neighbor_cpu=0;
const char *neighbor_gpu=0;
#else
#include "neighbor_cpu_ptx.h"
#include "neighbor_gpu_ptx.h"
#include "neighbor_cpu_cubin.h"
#include "neighbor_gpu_cubin.h"
#endif
using namespace LAMMPS_AL;
@ -69,7 +72,7 @@ void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor) {
k_build_nbor.set_function(*build_program,"calc_neigh_list_cell");
k_transpose.set_function(*build_program,"transpose");
k_special.set_function(*build_program,"kernel_special");
neigh_tex.get_texture(*build_program,"neigh_tex");
neigh_tex.get_texture(*build_program,"pos_tex");
}
_compiled=true;
}

View File

@ -16,18 +16,18 @@
#ifndef LAL_NEIGHBOR_SHARED_H
#define LAL_NEIGHBOR_SHARED_H
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "geryon/ocl_kernel.h"
#include "geryon/ocl_texture.h"
using namespace ucl_opencl;
#elif defined(USE_CUDART)
#include "geryon/nvc_kernel.h"
#include "geryon/nvc_texture.h"
using namespace ucl_cudart;
#else
#include "geryon/nvd_kernel.h"
#include "geryon/nvd_texture.h"
using namespace ucl_cudadr;
#endif
namespace LAMMPS_AL {

View File

@ -13,11 +13,14 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "pppm_cl.h"
#elif defined(USE_CUDART)
const char *pppm_f=0;
const char *pppm_d=0;
#else
#include "pppm_f_ptx.h"
#include "pppm_d_ptx.h"
#include "pppm_f_cubin.h"
#include "pppm_d_cubin.h"
#endif
#include "lal_pppm.h"
#include <cassert>
@ -51,7 +54,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
const int nylo_out, const int nzlo_out,
const int nxhi_out, const int nyhi_out,
const int nzhi_out, grdtyp **rho_coeff,
grdtyp **vd_brick, const double slab_volfactor,
grdtyp **vd_brick_p, const double slab_volfactor,
const int nx_pppm, const int ny_pppm,
const int nz_pppm, const bool split, int &flag) {
_max_bytes=10;
@ -92,8 +95,8 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
time_interp.init(*ucl_device);
time_interp.zero();
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
_allocated=true;
_max_bytes=0;
@ -133,14 +136,12 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
_npts_y=nyhi_out-nylo_out+1;
_npts_z=nzhi_out-nzlo_out+1;
_npts_yx=_npts_x*_npts_y;
success=success && (d_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
UCL_SUCCESS);
success=success && (h_brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
UCL_SUCCESS);
success=success && (h_vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
UCL_SUCCESS);
*vd_brick=h_vd_brick.begin();
_max_bytes+=d_brick.row_bytes();
*vd_brick_p=vd_brick.host.begin();
_max_bytes+=brick.device.row_bytes()+vd_brick.device.row_bytes();
// Allocate vector with count of atoms assigned to each grid point
_nlocal_x=_npts_x+_nlower-_nupper;
@ -158,20 +159,19 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
_max_bytes+=d_brick_atoms.row_bytes();
// Allocate error flags for checking out of bounds atoms
success=success && (h_error_flag.alloc(1,*ucl_device)==UCL_SUCCESS);
success=success && (d_error_flag.alloc(1,*ucl_device,UCL_WRITE_ONLY)==
UCL_SUCCESS);
success=success && (error_flag.alloc(1,*ucl_device,UCL_RW_OPTIMIZED,
UCL_WRITE_ONLY)==UCL_SUCCESS);
if (!success) {
flag=-3;
return 0;
}
d_error_flag.zero();
error_flag.device.zero();
_max_bytes+=1;
_cpu_idle_time=0.0;
return h_brick.begin();
return brick.host.begin();
}
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
@ -181,12 +181,10 @@ void PPPMT::clear(const double cpu_time) {
_allocated=false;
_precompute_done=false;
d_brick.clear();
h_brick.clear();
h_vd_brick.clear();
brick.clear();
vd_brick.clear();
d_brick_counts.clear();
h_error_flag.clear();
d_error_flag.clear();
error_flag.clear();
d_brick_atoms.clear();
acc_timers();
@ -269,11 +267,11 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
device->zero(d_brick_counts,d_brick_counts.numel());
k_particle_map.set_size(GX,BX);
k_particle_map.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &f_delvolinv,
&ainum, &d_brick_counts.begin(), &d_brick_atoms.begin(),
&_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv,
&_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z,
&_atom_stride, &_max_brick_atoms, &d_error_flag.begin());
k_particle_map.run(&atom->x, &atom->q, &f_delvolinv, &ainum,
&d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y,
&_brick_z, &_delxinv, &_delyinv, &_delzinv, &_nlocal_x,
&_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms,
&error_flag);
time_map.stop();
time_rho.start();
@ -282,15 +280,14 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
GX=static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/
_block_pencils));
k_make_rho.set_size(GX,BX);
k_make_rho.run(&d_brick_counts.begin(), &d_brick_atoms.begin(),
&d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride,
&_npts_x, &_npts_y, &_npts_z, &_nlocal_x, &_nlocal_y,
&_nlocal_z, &_order_m_1, &_order, &_order2);
k_make_rho.run(&d_brick_counts, &d_brick_atoms, &brick, &d_rho_coeff,
&_atom_stride, &_npts_x, &_npts_y, &_npts_z, &_nlocal_x,
&_nlocal_y, &_nlocal_z, &_order_m_1, &_order, &_order2);
time_rho.stop();
time_out.start();
ucl_copy(h_brick,d_brick,_npts_yx*_npts_z,true);
ucl_copy(h_error_flag,d_error_flag,true);
brick.update_host(_npts_yx*_npts_z,true);
error_flag.update_host(true);
time_out.stop();
_precompute_done=true;
@ -322,18 +319,17 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall,
_precompute_done=false;
if (h_error_flag[0]==2) {
if (error_flag[0]==2) {
// Not enough storage for atoms on the brick
_max_brick_atoms*=2;
d_error_flag.zero();
d_brick_atoms.clear();
d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,*ucl_device);
error_flag.device.zero();
d_brick_atoms.resize(_atom_stride*_max_brick_atoms);
_max_bytes+=d_brick_atoms.row_bytes();
return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,
delxinv,delyinv,delzinv);
}
return h_error_flag[0];
return error_flag[0];
}
// ---------------------------------------------------------------------------
@ -342,7 +338,7 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall,
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
void PPPMT::interp(const grdtyp qqrd2e_scale) {
time_in.start();
ucl_copy(d_brick,h_vd_brick,true);
vd_brick.update_device(true);
time_in.stop();
time_interp.start();
@ -353,10 +349,10 @@ void PPPMT::interp(const grdtyp qqrd2e_scale) {
int ainum=this->ans->inum();
k_interp.set_size(GX,BX);
k_interp.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum,
&d_brick.begin(), &d_rho_coeff.begin(), &_npts_x, &_npts_yx,
&_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv,
&_order, &_order2, &qqrd2e_scale, &ans->dev_ans.begin());
k_interp.run(&atom->x, &atom->q, &ainum, &vd_brick, &d_rho_coeff,
&_npts_x, &_npts_yx, &_brick_x, &_brick_y, &_brick_z, &_delxinv,
&_delyinv, &_delzinv, &_order, &_order2, &qqrd2e_scale,
&ans->force);
time_interp.stop();
ans->copy_answers(false,false,false,false);
@ -408,4 +404,3 @@ void PPPMT::compile_kernels(UCL_Device &dev) {
template class PPPM<PRECISION,ACC_PRECISION,float,_lgpu_float4>;
template class PPPM<PRECISION,ACC_PRECISION,double,_lgpu_double4>;

View File

@ -14,14 +14,14 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
// Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error
@ -31,6 +31,8 @@ ucl_inline float fetch_q(const int& i, const float *q)
#endif
#else
#define pos_tex x_
#define q_tex q_
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
#endif
@ -59,9 +61,11 @@ __kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_,
int nx,ny,nz;
if (ii<nlocal) {
numtyp4 p=fetch_pos(ii,x_);
numtyp4 p;
fetch4(p,ii,pos_tex);
grdtyp4 delta;
delta.w=delvolinv*fetch_q(ii,q_);
fetch(delta.w,ii,q_tex);
delta.w*=delvolinv;
if (delta.w!=(grdtyp)0.0) {
delta.x=(p.x-b_lo_x)*delxinv;
@ -212,8 +216,11 @@ __kernel void interp(__global numtyp4 *x_, __global numtyp *q_,
grdtyp tx,ty,tz;
if (ii<nlocal) {
numtyp4 p=fetch_pos(ii,x_);
grdtyp qs=qqrd2e_scale*fetch_q(ii,q_);
numtyp4 p;
fetch4(p,ii,pos_tex);
grdtyp qs;
fetch(qs,ii,q_tex);
qs*=qqrd2e_scale;
acctyp4 ek;
ek.x=(acctyp)0.0;

View File

@ -19,8 +19,10 @@
#include "mpi.h"
#include "lal_device.h"
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -55,8 +57,8 @@ class PPPM {
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
}
ans->resize(inum,success);
}
@ -138,8 +140,8 @@ class PPPM {
// --------------------------- GRID DATA --------------------------
UCL_H_Vec<grdtyp> h_brick, h_vd_brick;
UCL_D_Vec<grdtyp> d_brick;
UCL_Vector<grdtyp,grdtyp> brick;
UCL_Vector<grdtyp,grdtyp> vd_brick;
// Count of number of atoms assigned to each grid point
UCL_D_Vec<int> d_brick_counts;
@ -147,8 +149,7 @@ class PPPM {
UCL_D_Vec<grdtyp4> d_brick_atoms;
// Error checking for out of bounds atoms
UCL_D_Vec<int> d_error_flag;
UCL_H_Vec<int> h_error_flag;
UCL_Vector<int,int> error_flag;
// Number of grid points in brick (including ghost)
int _npts_x, _npts_y, _npts_z, _npts_yx;

View File

@ -16,6 +16,10 @@
#ifndef LAL_PRECISION_H
#define LAL_PRECISION_H
#if defined(USE_CUDART)
#include <cuda_runtime.h>
#endif
struct _lgpu_int2 {
int x; int y;
};
@ -108,3 +112,4 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#endif
#endif

View File

@ -107,7 +107,7 @@
#define BLOCK_NBOR_BUILD 128
#define BLOCK_PAIR 128
#define BLOCK_BIO_PAIR 128
#define MAX_SHARED_TYPES 11
#define MAX_SHARED_TYPES 8
#else
@ -129,8 +129,21 @@
#define MAX_BIO_SHARED_TYPES 128
#ifdef _DOUBLE_DOUBLE
ucl_inline double4 fetch_pos(const int& i, const double4 *pos) { return pos[i]; };
ucl_inline double fetch_q(const int& i, const double *q) { return q[i]; };
#define fetch4(ans,i,pos_tex) { \
int4 xy = tex1Dfetch(pos_tex,i*2); \
int4 zt = tex1Dfetch(pos_tex,i*2+1); \
ans.x=__hiloint2double(xy.y, xy.x); \
ans.y=__hiloint2double(xy.w, xy.z); \
ans.z=__hiloint2double(zt.y, zt.x); \
ans.w=__hiloint2double(zt.w, zt.z); \
}
#define fetch(ans,i,q_tex) { \
int2 qt = tex1Dfetch(q_tex,i); \
ans=__hiloint2double(qt.y, qt.x); \
}
#else
#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
#endif
#if (__CUDA_ARCH__ < 200)
@ -293,8 +306,8 @@ typedef struct _double4 double4;
#define BLOCK_ID_Y get_group_id(1)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define ucl_inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define fetch4(ans,i,x) ans=x[i]
#define fetch(ans,i,q) ans=q[i]
#define ucl_atan atan
#define ucl_cbrt cbrt
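The fetch4/fetch macros above are the double-precision texture path: a double4 is read as two int4 fetches at indices 2*i and 2*i+1, a single double as one int2 fetch, and the 32-bit words are reassembled with __hiloint2double. A minimal standalone sketch of the same trick for a charge array, using the legacy texture-reference API this code targets (array contents and names are illustrative; error checking omitted):

#include <cstdio>
#include <cuda_runtime.h>

texture<int2> q_tex;   // doubles exposed to the kernel as pairs of 32-bit ints

__global__ void read_q(double *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    int2 qt = tex1Dfetch(q_tex, i);          // low/high words of the i-th double
    out[i] = __hiloint2double(qt.y, qt.x);   // same reassembly as fetch() above
  }
}

int main() {
  const int n = 4;
  double h_q[n] = {-1.0, 0.5, 2.25, 3.0}, h_out[n];
  double *d_q, *d_out;
  cudaMalloc(&d_q, n * sizeof(double));
  cudaMalloc(&d_out, n * sizeof(double));
  cudaMemcpy(d_q, h_q, n * sizeof(double), cudaMemcpyHostToDevice);
  cudaBindTexture(NULL, q_tex, d_q, n * sizeof(double));  // bind linear memory
  read_q<<<1, 32>>>(d_out, n);
  cudaMemcpy(h_out, d_out, n * sizeof(double), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; i++) printf("%g\n", h_out[i]);   // -1 0.5 2.25 3
  cudaUnbindTexture(q_tex);
  cudaFree(d_q);
  cudaFree(d_out);
  return 0;
}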

View File

@ -13,12 +13,15 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "re_squared_cl.h"
#include "re_squared_lj_cl.h"
#elif defined(USE_CUDART)
const char *re_squared=0;
const char *re_squared_lj=0;
#else
#include "re_squared_ptx.h"
#include "re_squared_lj_ptx.h"
#include "re_squared_cubin.h"
#include "re_squared_lj_cubin.h"
#endif
#include "lal_re_squared.h"
@ -54,7 +57,8 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ntypes,h_form,re_squared,re_squared_lj,true);
_screen,ntypes,h_form,re_squared,re_squared_lj,
"k_resquared",true);
if (success!=0)
return success;
@ -198,13 +202,13 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->special_lj.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
&this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
&this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
&this->_threads_per_atom);
this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
&this->shape, &this->well, &this->special_lj,
&this->sigma_epsilon, &this->_lj_types,
&this->nbor->dev_nbor, &stride,
&this->ans->force,&ainum, &this->ans->engv,
&this->dev_error, &eflag, &vflag,
&this->_last_ellipse, &this->_threads_per_atom);
this->time_ellipsoid.stop();
// ------------ ELLIPSE_SPHERE ---------------
@ -215,12 +219,13 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
this->time_ellipsoid2.start();
this->k_ellipsoid_sphere.set_size(GX,BX);
this->k_ellipsoid_sphere.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->special_lj.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
&this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
&this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat,
&this->shape, &this->well, &this->special_lj,
&this->sigma_epsilon, &this->_lj_types,
&this->nbor->dev_nbor, &stride,
&this->ans->force,&ainum,
&this->ans->engv, &this->dev_error,
&eflag, &vflag, &this->_last_ellipse,
&this->_threads_per_atom);
this->time_ellipsoid2.stop();
@ -245,17 +250,18 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
this->time_ellipsoid3.start();
this->k_sphere_ellipsoid.set_size(GX,BX);
this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(),
&this->well.begin(), &this->special_lj.begin(),
&this->sigma_epsilon.begin(), &this->_lj_types,
&this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat,
&this->shape, &this->well, &this->special_lj,
&this->sigma_epsilon, &this->_lj_types,
&this->nbor->dev_nbor, &stride,
&this->ans->force, &this->ans->engv,
&this->dev_error, &eflag, &vflag,
&this->_last_ellipse, &ainum,
&this->_threads_per_atom);
this->time_ellipsoid3.stop();
} else {
this->ans->dev_ans.zero();
this->ans->dev_engv.zero();
this->ans->force.zero();
this->ans->engv.zero();
this->time_nbor1.zero();
this->time_ellipsoid.zero();
this->time_nbor2.zero();
@ -269,19 +275,19 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
if (this->_last_ellipse<this->ans->inum()) {
if (this->_shared_types) {
this->k_lj_fast.set_size(GX,BX);
this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
&this->lj3.begin(), &this->special_lj.begin(), &stride,
&this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(),
this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3,
&this->special_lj, &stride,
&this->nbor->dev_packed, &this->ans->force,
&this->ans->engv, &this->dev_error,
&eflag, &vflag, &this->_last_ellipse, &ainum,
&this->_threads_per_atom);
} else {
this->k_lj.set_size(GX,BX);
this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
&this->lj3.begin(), &this->_lj_types, &this->special_lj.begin(),
&stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3,
&this->_lj_types, &this->special_lj, &stride,
&this->nbor->dev_packed, &this->ans->force,
&this->ans->engv, &this->dev_error, &eflag, &vflag,
&this->_last_ellipse, &ainum, &this->_threads_per_atom);
}
}
this->time_lj.stop();
@ -295,13 +301,12 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->special_lj.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
&this->ans->dev_ans.begin(), &ainum, &this->ans->dev_engv.begin(),
&this->dev_error.begin(), &eflag, &vflag, &ainum,
&this->_threads_per_atom);
this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
&this->shape, &this->well, &this->special_lj,
&this->sigma_epsilon, &this->_lj_types,
&this->nbor->dev_nbor, &stride, &this->ans->force,
&ainum, &this->ans->engv, &this->dev_error,
&eflag, &vflag, &ainum, &this->_threads_per_atom);
this->time_ellipsoid.stop();
}
}

View File

@ -32,7 +32,7 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
return ans;
}
__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
__kernel void k_resquared(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *splj, __global numtyp2* sig_eps,
const int ntypes, __global int *dev_nbor,
@ -73,7 +73,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int itype=ix.w;
numtyp a1[9]; // Rotation matrix (lab->body)
@ -122,7 +122,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int jtype=jx.w;
// Compute r12

View File

@ -17,10 +17,11 @@
#include "lal_ellipsoid_extra.h"
#endif
__kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *splj, __global numtyp2* sig_eps,
const int ntypes, __global int *dev_nbor, const int stride,
__kernel void k_resquared_ellipsoid_sphere(__global numtyp4* x_,
__global numtyp4 *q, __global numtyp4* shape,
__global numtyp4* well, __global numtyp *splj,
__global numtyp2* sig_eps, const int ntypes,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, const int astride,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int inum,
@ -59,7 +60,7 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int itype=ix.w;
numtyp a[9]; // Rotation matrix (lab->body)
@ -84,7 +85,7 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int jtype=jx.w;
// Compute r12
@ -331,14 +332,14 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
} // if ii
}
__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
__global numtyp4* shape,__global numtyp4* well,
__global numtyp *splj, __global numtyp2* sig_eps,
const int ntypes, __global int *dev_nbor,
const int stride, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag,const int start,
const int inum, const int t_per_atom) {
__kernel void k_resquared_sphere_ellipsoid(__global numtyp4 *x_,
__global numtyp4 *q, __global numtyp4* shape,
__global numtyp4* well, __global numtyp *splj,
__global numtyp2* sig_eps, const int ntypes,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag, const int vflag,
const int start, const int inum, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
@ -370,7 +371,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj,
n_stride,nbor_end,nbor);
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int jtype=jx.w;
numtyp factor_lj;
@ -379,7 +380,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
factor_lj = sp_lj[sbmask(i)];
i &= NEIGHMASK;
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int itype=ix.w;
numtyp a[9]; // Rotation matrix (lab->body)
@ -524,14 +525,13 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
} // if ii
}
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void k_resquared_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
__global numtyp *gum, const int stride,
__global int *dev_ij, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int start,
const int inum, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
@ -557,7 +557,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int itype=ix.w;
numtyp factor_lj;
@ -567,7 +567,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int jtype=jx.w;
// Compute r12
@ -606,13 +606,12 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride, __global int *dev_ij,
__kernel void k_resquared_lj_fast(__global numtyp4 *x_,
__global numtyp4 *lj1_in, __global numtyp4* lj3_in,
__global numtyp *gum, const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
__global int *err_flag, const int eflag, const int vflag,
const int start, const int inum, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
@ -645,7 +644,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -656,7 +655,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int mtype=itype+jx.w;
// Compute r12

View File

@ -13,10 +13,12 @@
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "table_cl.h"
#elif defined(USE_CUDART)
const char *table=0;
#else
#include "table_ptx.h"
#include "table_cubin.h"
#endif
#include "lal_table.h"
@ -56,17 +58,17 @@ int TableT::init(const int ntypes,
const double gpu_split, FILE *_screen,
int tabstyle, int ntables, int tablength) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,table);
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
gpu_split,_screen,table,"k_table");
if (success!=0)
return success;
k_pair_linear.set_function(*(this->pair_program),"kernel_pair_linear");
k_pair_linear_fast.set_function(*(this->pair_program),"kernel_pair_linear_fast");
k_pair_spline.set_function(*(this->pair_program),"kernel_pair_spline");
k_pair_spline_fast.set_function(*(this->pair_program),"kernel_pair_spline_fast");
k_pair_bitmap.set_function(*(this->pair_program),"kernel_pair_bitmap");
k_pair_bitmap_fast.set_function(*(this->pair_program),"kernel_pair_bitmap_fast");
k_pair_linear.set_function(*(this->pair_program),"k_table_linear");
k_pair_linear_fast.set_function(*(this->pair_program),"k_table_linear_fast");
k_pair_spline.set_function(*(this->pair_program),"k_table_spline");
k_pair_spline_fast.set_function(*(this->pair_program),"k_table_spline_fast");
k_pair_bitmap.set_function(*(this->pair_program),"k_table_bitmap");
k_pair_bitmap_fast.set_function(*(this->pair_program),"k_table_bitmap_fast");
_compiled_styles = true;
// If atom type constants fit in shared memory use fast kernel
@ -264,84 +266,71 @@ void TableT::loop(const bool _eflag, const bool _vflag) {
if (shared_types) {
if (_tabstyle == LOOKUP) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&coeff2.begin(), &coeff3.begin(),
&coeff4.begin(), &cutsq.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom,
&_tablength);
this->k_pair_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3,
&coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &_tablength);
} else if (_tabstyle == LINEAR) {
this->k_pair_linear_fast.set_size(GX,BX);
this->k_pair_linear_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&coeff2.begin(), &coeff3.begin(),
&coeff4.begin(), &cutsq.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom,
&_tablength);
this->k_pair_linear_fast.run(&this->atom->x, &tabindex, &coeff2,
&coeff3, &coeff4, &cutsq, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom, &_tablength);
} else if (_tabstyle == SPLINE) {
this->k_pair_spline_fast.set_size(GX,BX);
this->k_pair_spline_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&coeff2.begin(), &coeff3.begin(),
&coeff4.begin(), &cutsq.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom,
&_tablength);
this->k_pair_spline_fast.run(&this->atom->x, &tabindex, &coeff2,
&coeff3, &coeff4, &cutsq, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom, &_tablength);
} else if (_tabstyle == BITMAP) {
this->k_pair_bitmap_fast.set_size(GX,BX);
this->k_pair_bitmap_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&nshiftbits.begin(), &nmask.begin(),
&coeff2.begin(), &coeff3.begin(),
&coeff4.begin(), &cutsq.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom,
&_tablength);
this->k_pair_bitmap_fast.run(&this->atom->x, &tabindex, &nshiftbits,
&nmask, &coeff2, &coeff3, &coeff4, &cutsq,
&sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch,
&this->_threads_per_atom, &_tablength);
}
} else {
if (_tabstyle == LOOKUP) {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types,
&cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &_tablength);
this->k_pair.run(&this->atom->x, &tabindex, &coeff2, &coeff3,
&coeff4, &_lj_types, &cutsq, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->_threads_per_atom,
&_tablength);
} else if (_tabstyle == LINEAR) {
this->k_pair_linear.set_size(GX,BX);
this->k_pair_linear.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types,
&cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &_tablength);
this->k_pair_linear.run(&this->atom->x, &tabindex, &coeff2, &coeff3,
&coeff4, &_lj_types, &cutsq, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom, &_tablength);
} else if (_tabstyle == SPLINE) {
this->k_pair_spline.set_size(GX,BX);
this->k_pair_spline.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types,
&cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &_tablength);
this->k_pair_spline.run(&this->atom->x, &tabindex, &coeff2, &coeff3,
&coeff4, &_lj_types, &cutsq, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom, &_tablength);
} else if (_tabstyle == BITMAP) {
this->k_pair_bitmap.set_size(GX,BX);
this->k_pair_bitmap.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&nshiftbits.begin(), &nmask.begin(),
&coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types,
&cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &_tablength);
this->k_pair_bitmap.run(&this->atom->x, &tabindex, &nshiftbits,
&nmask, &coeff2, &coeff3, &coeff4, &_lj_types,
&cutsq, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom,
&_tablength);
}
}
this->time_pair.stop();
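
One pattern worth noting in the launch code above: the old calls handed the kernels raw device pointers via .begin() on each container (dev_x, dev_ans, dev_engv), while the new calls pass the containers themselves (x, force, engv) and let the kernel wrapper decide how to hand them to the device. The following is a stand-alone sketch of that convention change only; DevContainer, run_old and run_new are illustrative stand-ins, not the actual Geryon container/kernel API.

// Illustrative sketch only: DevContainer, run_old and run_new are stand-ins,
// not the container/kernel API used in the launch code above.
#include <cstdio>

struct DevContainer {
  double *dev_ptr = nullptr;
  double *&begin() { return dev_ptr; }   // old code passed &container.begin()
};

// Old convention: kernel arguments are addresses of raw device pointers.
void run_old(double **x, double **ans) {
  std::printf("launch with raw pointers %p %p\n", (void *)*x, (void *)*ans);
}

// New convention: kernel arguments are addresses of the containers themselves,
// so the wrapper can choose how to pass them (plain pointer, texture, ...).
void run_new(DevContainer *x, DevContainer *force) {
  std::printf("launch with containers %p %p\n",
              (void *)x->dev_ptr, (void *)force->dev_ptr);
}

int main() {
  DevContainer x, force;
  run_old(&x.begin(), &force.begin());  // mirrors k_pair.run(&this->atom->dev_x.begin(), ...)
  run_new(&x, &force);                  // mirrors k_pair.run(&this->atom->x, &this->ans->force, ...)
}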

@ -15,11 +15,13 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
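
The fetch_pos helper above is retired in favor of a fetch4 macro that the kernels call below; its definition lives in a shared kernel header that is not part of this hunk. As a rough illustration only (not the project's actual definition): with the CUDA texture API of this era, the single/mixed-precision path is a plain float4 tex1Dfetch, while the double-double path reads two int4 texels per atom and rebuilds each double with __hiloint2double. The texel layout below is an assumption.

// Rough illustration of what a fetch4 macro has to do; the real definition is
// in the shared kernel header (not shown) and may differ, in particular in the
// int4 texel layout assumed here.
#ifdef _DOUBLE_DOUBLE
// Double-double positions: two int4 texels per atom, each double rebuilt from
// its high/low 32-bit words.
#define fetch4(ans, i, pos_tex)                              \
  {                                                          \
    int4 xy = tex1Dfetch(pos_tex, (i)*2);                    \
    int4 zw = tex1Dfetch(pos_tex, (i)*2 + 1);                \
    ans.x = __hiloint2double(xy.y, xy.x);                    \
    ans.y = __hiloint2double(xy.w, xy.z);                    \
    ans.z = __hiloint2double(zw.y, zw.x);                    \
    ans.w = __hiloint2double(zw.w, zw.z);                    \
  }
#else
// Single/mixed precision: one float4 texel per atom.
#define fetch4(ans, i, pos_tex) { ans = tex1Dfetch(pos_tex, i); }
#endif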
#define LOOKUP 0
@ -37,7 +39,7 @@ typedef union {
/// ---------------- LOOKUP -------------------------------------------------
__kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -73,7 +75,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -83,7 +85,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype*lj_types+jx.w;
int tbindex = tabindex[mtype];
@ -128,7 +130,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_fast(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -167,7 +169,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -178,7 +180,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
int tbindex = tabindex[mtype];
@ -225,7 +227,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex,
/// ---------------- LINEAR -------------------------------------------------
__kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_linear(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -261,7 +263,7 @@ __kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -271,7 +273,7 @@ __kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype*lj_types+jx.w;
int tbindex = tabindex[mtype];
@ -320,7 +322,7 @@ __kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex,
} // if ii
}
__kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_linear_fast(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -359,7 +361,7 @@ __kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabind
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -370,7 +372,7 @@ __kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabind
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
int tbindex = tabindex[mtype];
@ -421,7 +423,7 @@ __kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabind
/// ---------------- SPLINE -------------------------------------------------
__kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_spline(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -457,7 +459,7 @@ __kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -467,7 +469,7 @@ __kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype*lj_types+jx.w;
int tbindex = tabindex[mtype];
@ -523,7 +525,7 @@ __kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex,
} // if ii
}
__kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_spline_fast(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -562,7 +564,7 @@ __kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabind
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -573,7 +575,7 @@ __kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabind
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
int tbindex = tabindex[mtype];
@ -631,7 +633,7 @@ __kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabind
/// ---------------- BITMAP -------------------------------------------------
__kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_bitmap(__global numtyp4 *x_, __global int *tabindex,
__global int *nshiftbits, __global int *nmask,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
@ -668,7 +670,7 @@ __kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -678,7 +680,7 @@ __kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype*lj_types+jx.w;
int tbindex = tabindex[mtype];
@ -730,7 +732,7 @@ __kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex,
} // if ii
}
__kernel void kernel_pair_bitmap_fast(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_bitmap_fast(__global numtyp4 *x_, __global int *tabindex,
__global int *nshiftbits, __global int *nmask,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
@ -770,7 +772,7 @@ __kernel void kernel_pair_bitmap_fast(__global numtyp4 *x_, __global int *tabind
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -781,7 +783,7 @@ __kernel void kernel_pair_bitmap_fast(__global numtyp4 *x_, __global int *tabind
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
int tbindex = tabindex[mtype];

@ -13,10 +13,12 @@
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "yukawa_cl.h"
#elif defined(USE_CUDART)
const char *yukawa=0;
#else
#include "yukawa_ptx.h"
#include "yukawa_cubin.h"
#endif
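
The three-way include above selects how the yukawa kernel program reaches the host class: an OpenCL build embeds the kernel source string from the generated yukawa_cl.h, a CUDA-runtime build leaves the string null (the kernels are presumably compiled straight into the library), and the default CUDA driver build includes the generated yukawa_cubin.h with a precompiled image. A minimal sketch of the same selection for a hypothetical style named foo, with the generated-header contents summarized in comments:

// Minimal sketch of the three build paths for a hypothetical style "foo";
// the generated headers are produced by the build system, not written by hand.
#if defined(USE_OPENCL)
#include "foo_cl.h"        // defines: const char *foo = "<OpenCL kernel source>";
#elif defined(USE_CUDART)
const char *foo = 0;       // CUDA runtime build: no program string needed here
#else
#include "foo_cubin.h"     // defines foo as a precompiled CUDA binary image
#endif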
#include "lal_yukawa.h"
@ -50,7 +52,7 @@ int YukawaT::init(const int ntypes,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,yukawa);
_screen,yukawa,"k_yukawa");
if (success!=0)
return success;
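
init() now forwards a kernel base name ("k_yukawa") to init_atomic alongside the program string, matching the rename of the kernels in this commit from the generic kernel_pair/kernel_pair_fast to per-style names (k_yukawa, k_yukawa_fast, k_table_linear_fast, and so on), presumably so kernels from different styles can coexist in one compiled module. The snippet below is a hypothetical sketch of how a base class could use that name; the types, members, and method names are invented for illustration.

// Hypothetical sketch of how a base class might use the kernel-name string
// passed to init_atomic above; UCLKernelStub and compile_kernels are invented.
#include <string>

struct UCLKernelStub {
  void compile(const char *program, const std::string &name) { /* look up 'name' in 'program' */ }
};

struct BaseAtomicSketch {
  UCLKernelStub k_pair, k_pair_fast;
  void compile_kernels(const char *program, const char *k_name) {
    std::string base(k_name);                     // e.g. "k_yukawa"
    k_pair.compile(program, base);                // general-type kernel
    k_pair_fast.compile(program, base + "_fast"); // shared-type kernel: "k_yukawa_fast"
  }
};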
@ -129,20 +131,17 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff.begin(), &_kappa,
&sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &coeff, &_kappa, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &coeff.begin(), &_kappa,
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &coeff, &_kappa, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
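
As with the table style earlier, the host picks k_pair_fast when shared_types is set and k_pair otherwise. In the kernels this shows up as two ways of flattening the (itype, jtype) pair into a flat index: the general kernel uses the runtime lj_types stride, while the fast kernel uses the compile-time MAX_SHARED_TYPES stride (itype is pre-scaled once per atom with fast_mul, then jx.w is added per neighbor). A small sketch of just that indexing, with an example value for MAX_SHARED_TYPES:

// Sketch of the two (itype, jtype) -> flat index schemes used by the general
// vs. "fast" kernels above; the MAX_SHARED_TYPES value here is only an example.
#define MAX_SHARED_TYPES 8

// General kernel: stride is the runtime number of types
// (mtype = itype*lj_types + jx.w).
__host__ __device__ inline int mtype_general(int itype, int jtype, int lj_types) {
  return itype * lj_types + jtype;
}

// Fast kernel: stride is the compile-time MAX_SHARED_TYPES; the kernels
// pre-scale once per atom with itype = fast_mul(MAX_SHARED_TYPES, iw) and then
// add jx.w per neighbor.
__host__ __device__ inline int mtype_fast(int iw, int jtype) {
  return iw * MAX_SHARED_TYPES + jtype;
}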

@ -15,14 +15,16 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
__kernel void k_yukawa(__global numtyp4 *x_, __global numtyp4 *coeff,
const numtyp kappa, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -103,7 +105,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
__kernel void k_yukawa_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
const numtyp kappa, __global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
@ -135,7 +137,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -146,7 +148,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12