git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8693 f3b2605a-c512-4ea7-a41b-209d697bcdaa
@@ -3,6 +3,7 @@ CUDA = $(NVCC) $(CUDA_INCLUDE) $(CUDA_OPTS) -Icudpp_mini $(CUDA_ARCH) \
CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
$(CUDPP_OPT)
CUDA_LINK = $(CUDA_LIB) -lcudart
BIN2C = $(CUDA_HOME)/bin/bin2c

GPU_LIB = $(LIB_DIR)/libgpu.a

@@ -27,6 +28,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_neighbor.o $(OBJ_DIR)/lal_neighbor_shared.o \
$(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
$(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
$(OBJ_DIR)/lal_base_dipole.o \
$(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
$(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
$(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@@ -35,6 +37,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_lj_expand.o $(OBJ_DIR)/lal_lj_expand_ext.o \
$(OBJ_DIR)/lal_lj_coul.o $(OBJ_DIR)/lal_lj_coul_ext.o \
$(OBJ_DIR)/lal_lj_coul_long.o $(OBJ_DIR)/lal_lj_coul_long_ext.o \
$(OBJ_DIR)/lal_lj_dsf.o $(OBJ_DIR)/lal_lj_dsf_ext.o \
$(OBJ_DIR)/lal_lj_class2_long.o $(OBJ_DIR)/lal_lj_class2_long_ext.o \
$(OBJ_DIR)/lal_coul_long.o $(OBJ_DIR)/lal_coul_long_ext.o \
$(OBJ_DIR)/lal_morse.o $(OBJ_DIR)/lal_morse_ext.o \
@@ -46,35 +49,57 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_buck_coul.o $(OBJ_DIR)/lal_buck_coul_ext.o \
$(OBJ_DIR)/lal_buck_coul_long.o $(OBJ_DIR)/lal_buck_coul_long_ext.o \
$(OBJ_DIR)/lal_table.o $(OBJ_DIR)/lal_table_ext.o \
$(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o
PTXS = $(OBJ_DIR)/device.ptx $(OBJ_DIR)/device_ptx.h \
$(OBJ_DIR)/atom.ptx $(OBJ_DIR)/atom_ptx.h \
$(OBJ_DIR)/neighbor_cpu.ptx $(OBJ_DIR)/neighbor_cpu_ptx.h \
$(OBJ_DIR)/neighbor_gpu.ptx $(OBJ_DIR)/neighbor_gpu_ptx.h \
$(OBJ_DIR)/pppm_f.ptx $(OBJ_DIR)/pppm_f_ptx.h \
$(OBJ_DIR)/pppm_d.ptx $(OBJ_DIR)/pppm_d_ptx.h \
$(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/ellipsoid_nbor_ptx.h \
$(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_lj.ptx \
$(OBJ_DIR)/gayberne_ptx.h $(OBJ_DIR)/gayberne_lj_ptx.h \
$(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_lj.ptx \
$(OBJ_DIR)/re_squared_ptx.h $(OBJ_DIR)/re_squared_lj_ptx.h \
$(OBJ_DIR)/lj.ptx $(OBJ_DIR)/lj_ptx.h \
$(OBJ_DIR)/lj96.ptx $(OBJ_DIR)/lj96_ptx.h \
$(OBJ_DIR)/lj_expand.ptx $(OBJ_DIR)/lj_expand_ptx.h \
$(OBJ_DIR)/lj_coul.ptx $(OBJ_DIR)/lj_coul_ptx.h \
$(OBJ_DIR)/lj_coul_long.ptx $(OBJ_DIR)/lj_coul_long_ptx.h \
$(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long_ptx.h \
$(OBJ_DIR)/coul_long.ptx $(OBJ_DIR)/coul_long_ptx.h \
$(OBJ_DIR)/morse.ptx $(OBJ_DIR)/morse_ptx.h \
$(OBJ_DIR)/charmm_long.ptx $(OBJ_DIR)/charmm_long_ptx.h \
$(OBJ_DIR)/cg_cmm.ptx $(OBJ_DIR)/cg_cmm_ptx.h \
$(OBJ_DIR)/cg_cmm_long.ptx $(OBJ_DIR)/cg_cmm_long_ptx.h \
$(OBJ_DIR)/eam.ptx $(OBJ_DIR)/eam_ptx.h \
$(OBJ_DIR)/buck.ptx $(OBJ_DIR)/buck_ptx.h \
$(OBJ_DIR)/buck_coul.ptx $(OBJ_DIR)/buck_coul_ptx.h \
$(OBJ_DIR)/buck_coul_long.ptx $(OBJ_DIR)/buck_coul_long_ptx.h \
$(OBJ_DIR)/table.ptx $(OBJ_DIR)/table_ptx.h \
$(OBJ_DIR)/yukawa.ptx $(OBJ_DIR)/yukawa_ptx.h
$(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o \
$(OBJ_DIR)/lal_born.o $(OBJ_DIR)/lal_born_ext.o \
$(OBJ_DIR)/lal_born_coul_wolf.o $(OBJ_DIR)/lal_born_coul_wolf_ext.o \
$(OBJ_DIR)/lal_born_coul_long.o $(OBJ_DIR)/lal_born_coul_long_ext.o \
$(OBJ_DIR)/lal_dipole_lj.o $(OBJ_DIR)/lal_dipole_lj_ext.o \
$(OBJ_DIR)/lal_dipole_lj_sf.o $(OBJ_DIR)/lal_dipole_lj_sf_ext.o \
$(OBJ_DIR)/lal_colloid.o $(OBJ_DIR)/lal_colloid_ext.o \
$(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
$(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
$(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o

CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
$(OBJ_DIR)/neighbor_cpu.cubin $(OBJ_DIR)/neighbor_cpu_cubin.h \
$(OBJ_DIR)/neighbor_gpu.cubin $(OBJ_DIR)/neighbor_gpu_cubin.h \
$(OBJ_DIR)/pppm_f.cubin $(OBJ_DIR)/pppm_f_cubin.h \
$(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/pppm_d_cubin.h \
$(OBJ_DIR)/ellipsoid_nbor.cubin $(OBJ_DIR)/ellipsoid_nbor_cubin.h \
$(OBJ_DIR)/gayberne.cubin $(OBJ_DIR)/gayberne_lj.cubin \
$(OBJ_DIR)/gayberne_cubin.h $(OBJ_DIR)/gayberne_lj_cubin.h \
$(OBJ_DIR)/re_squared.cubin $(OBJ_DIR)/re_squared_lj.cubin \
$(OBJ_DIR)/re_squared_cubin.h $(OBJ_DIR)/re_squared_lj_cubin.h \
$(OBJ_DIR)/lj.cubin $(OBJ_DIR)/lj_cubin.h \
$(OBJ_DIR)/lj96.cubin $(OBJ_DIR)/lj96_cubin.h \
$(OBJ_DIR)/lj_expand.cubin $(OBJ_DIR)/lj_expand_cubin.h \
$(OBJ_DIR)/lj_coul.cubin $(OBJ_DIR)/lj_coul_cubin.h \
$(OBJ_DIR)/lj_coul_long.cubin $(OBJ_DIR)/lj_coul_long_cubin.h \
$(OBJ_DIR)/lj_dsf.cubin $(OBJ_DIR)/lj_dsf_cubin.h \
$(OBJ_DIR)/lj_class2_long.cubin $(OBJ_DIR)/lj_class2_long_cubin.h \
$(OBJ_DIR)/coul_long.cubin $(OBJ_DIR)/coul_long_cubin.h \
$(OBJ_DIR)/morse.cubin $(OBJ_DIR)/morse_cubin.h \
$(OBJ_DIR)/charmm_long.cubin $(OBJ_DIR)/charmm_long_cubin.h \
$(OBJ_DIR)/cg_cmm.cubin $(OBJ_DIR)/cg_cmm_cubin.h \
$(OBJ_DIR)/cg_cmm_long.cubin $(OBJ_DIR)/cg_cmm_long_cubin.h \
$(OBJ_DIR)/eam.cubin $(OBJ_DIR)/eam_cubin.h \
$(OBJ_DIR)/buck.cubin $(OBJ_DIR)/buck_cubin.h \
$(OBJ_DIR)/buck_coul_long.cubin $(OBJ_DIR)/buck_coul_long_cubin.h \
$(OBJ_DIR)/buck_coul_wolf.cubin $(OBJ_DIR)/buck_coul_wolf_cubin.h \
$(OBJ_DIR)/table.cubin $(OBJ_DIR)/table_cubin.h \
$(OBJ_DIR)/yukawa.cubin $(OBJ_DIR)/yukawa_cubin.h \
$(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \
$(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf_cubin.h \
$(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long_cubin.h \
$(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj_cubin.h \
$(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf_cubin.h \
$(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid_cubin.h \
$(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \
$(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \
$(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \
$(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h

all: $(GPU_LIB) $(EXECS)

@@ -96,43 +121,43 @@ $(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
$(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
$(CUDA) -o $@ -c cudpp_mini/scan_app.cu

$(OBJ_DIR)/atom.ptx: lal_atom.cu lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_atom.cu
$(OBJ_DIR)/atom.cubin: lal_atom.cu lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_atom.cu

$(OBJ_DIR)/atom_ptx.h: $(OBJ_DIR)/atom.ptx
$(BSH) ./geryon/file_to_cstr.sh atom $(OBJ_DIR)/atom.ptx $(OBJ_DIR)/atom_ptx.h
$(OBJ_DIR)/atom_cubin.h: $(OBJ_DIR)/atom.cubin
$(BIN2C) -c -n atom $(OBJ_DIR)/atom.cubin > $(OBJ_DIR)/atom_cubin.h

$(OBJ_DIR)/lal_atom.o: lal_atom.cpp lal_atom.h $(NVD_H) $(OBJ_DIR)/atom_ptx.h
$(OBJ_DIR)/lal_atom.o: lal_atom.cpp lal_atom.h $(NVD_H) $(OBJ_DIR)/atom_cubin.h
$(CUDR) -o $@ -c lal_atom.cpp -I$(OBJ_DIR)
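For orientation: every kernel in this Makefile follows the same three-step chain that the atom rules above show. nvcc compiles the .cu source to a device binary (.cubin), bin2c wraps that binary in a header as a named byte array, and the host .cpp is compiled with -I$(OBJ_DIR) so it can pick up the generated header. The following is only a sketch of that chain as generic pattern rules; it is not part of the actual Makefile, which writes every rule out explicitly because the per-style dependencies differ.

# Hypothetical pattern rules illustrating the per-kernel build chain
# (assumes the CUDA, BIN2C and CUDR variables defined at the top of this file).
$(OBJ_DIR)/%.cubin: lal_%.cu lal_preprocessor.h
	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_$*.cu

$(OBJ_DIR)/%_cubin.h: $(OBJ_DIR)/%.cubin
	$(BIN2C) -c -n $* $< > $@

$(OBJ_DIR)/lal_%.o: lal_%.cpp $(OBJ_DIR)/%_cubin.h
	$(CUDR) -o $@ -c lal_$*.cpp -I$(OBJ_DIR)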

$(OBJ_DIR)/lal_ans.o: lal_answer.cpp lal_answer.h $(NVD_H)
$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/neighbor_cpu.ptx: lal_neighbor_cpu.cu lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_neighbor_cpu.cu
$(OBJ_DIR)/neighbor_cpu.cubin: lal_neighbor_cpu.cu lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_neighbor_cpu.cu

$(OBJ_DIR)/neighbor_cpu_ptx.h: $(OBJ_DIR)/neighbor_cpu.ptx
$(BSH) ./geryon/file_to_cstr.sh neighbor_cpu $(OBJ_DIR)/neighbor_cpu.ptx $(OBJ_DIR)/neighbor_cpu_ptx.h
$(OBJ_DIR)/neighbor_cpu_cubin.h: $(OBJ_DIR)/neighbor_cpu.cubin
$(BIN2C) -c -n neighbor_cpu $(OBJ_DIR)/neighbor_cpu.cubin > $(OBJ_DIR)/neighbor_cpu_cubin.h

$(OBJ_DIR)/neighbor_gpu.ptx: lal_neighbor_gpu.cu lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_neighbor_gpu.cu
$(OBJ_DIR)/neighbor_gpu.cubin: lal_neighbor_gpu.cu lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_neighbor_gpu.cu

$(OBJ_DIR)/neighbor_gpu_ptx.h: $(OBJ_DIR)/neighbor_gpu.ptx
$(BSH) ./geryon/file_to_cstr.sh neighbor_gpu $(OBJ_DIR)/neighbor_gpu.ptx $(OBJ_DIR)/neighbor_gpu_ptx.h
$(OBJ_DIR)/neighbor_gpu_cubin.h: $(OBJ_DIR)/neighbor_gpu.cubin
$(BIN2C) -c -n neighbor_gpu $(OBJ_DIR)/neighbor_gpu.cubin > $(OBJ_DIR)/neighbor_gpu_cubin.h

$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp lal_neighbor_shared.h $(OBJ_DIR)/neighbor_cpu_ptx.h $(OBJ_DIR)/neighbor_gpu_ptx.h $(NVD_H)
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp lal_neighbor_shared.h $(OBJ_DIR)/neighbor_cpu_cubin.h $(OBJ_DIR)/neighbor_gpu_cubin.h $(NVD_H)
$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp lal_neighbor.h lal_neighbor_shared.h $(NVD_H)
$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/device.ptx: lal_device.cu lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_device.cu
$(OBJ_DIR)/device.cubin: lal_device.cu lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_device.cu

$(OBJ_DIR)/device_ptx.h: $(OBJ_DIR)/device.ptx
$(BSH) ./geryon/file_to_cstr.sh device $(OBJ_DIR)/device.ptx $(OBJ_DIR)/device_ptx.h
$(OBJ_DIR)/device_cubin.h: $(OBJ_DIR)/device.cubin
$(BIN2C) -c -n device $(OBJ_DIR)/device.cubin > $(OBJ_DIR)/device_cubin.h

$(OBJ_DIR)/lal_device.o: lal_device.cpp lal_device.h $(ALL_H) $(OBJ_DIR)/device_ptx.h
$(OBJ_DIR)/lal_device.o: lal_device.cpp lal_device.h $(ALL_H) $(OBJ_DIR)/device_cubin.h
$(CUDR) -o $@ -c lal_device.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_base_atomic.o: $(ALL_H) lal_base_atomic.h lal_base_atomic.cpp
@@ -141,273 +166,408 @@ $(OBJ_DIR)/lal_base_atomic.o: $(ALL_H) lal_base_atomic.h lal_base_atomic.cpp
$(OBJ_DIR)/lal_base_charge.o: $(ALL_H) lal_base_charge.h lal_base_charge.cpp
$(CUDR) -o $@ -c lal_base_charge.cpp

$(OBJ_DIR)/lal_base_ellipsoid.o: $(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_ptx.h
$(OBJ_DIR)/lal_base_ellipsoid.o: $(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_cubin.h
$(CUDR) -o $@ -c lal_base_ellipsoid.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/pppm_f.ptx: lal_pppm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu
$(OBJ_DIR)/lal_base_dipole.o: $(ALL_H) lal_base_dipole.h lal_base_dipole.cpp
$(CUDR) -o $@ -c lal_base_dipole.cpp

$(OBJ_DIR)/pppm_f_ptx.h: $(OBJ_DIR)/pppm_f.ptx
$(BSH) ./geryon/file_to_cstr.sh pppm_f $(OBJ_DIR)/pppm_f.ptx $(OBJ_DIR)/pppm_f_ptx.h
$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu

$(OBJ_DIR)/pppm_d.ptx: lal_pppm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu
$(OBJ_DIR)/pppm_f_cubin.h: $(OBJ_DIR)/pppm_f.cubin
$(BIN2C) -c -n pppm_f $(OBJ_DIR)/pppm_f.cubin > $(OBJ_DIR)/pppm_f_cubin.h

$(OBJ_DIR)/pppm_d_ptx.h: $(OBJ_DIR)/pppm_d.ptx
$(BSH) ./geryon/file_to_cstr.sh pppm_d $(OBJ_DIR)/pppm_d.ptx $(OBJ_DIR)/pppm_d_ptx.h
$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu

$(OBJ_DIR)/lal_pppm.o: $(ALL_H) lal_pppm.h lal_pppm.cpp $(OBJ_DIR)/pppm_f_ptx.h $(OBJ_DIR)/pppm_d_ptx.h
$(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin
$(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h

$(OBJ_DIR)/lal_pppm.o: $(ALL_H) lal_pppm.h lal_pppm.cpp $(OBJ_DIR)/pppm_f_cubin.h $(OBJ_DIR)/pppm_d_cubin.h
$(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_pppm_ext.o: $(ALL_H) lal_pppm.h lal_pppm_ext.cpp
$(CUDR) -o $@ -c lal_pppm_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/ellipsoid_nbor.ptx: lal_ellipsoid_nbor.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_ellipsoid_nbor.cu
$(OBJ_DIR)/ellipsoid_nbor.cubin: lal_ellipsoid_nbor.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_ellipsoid_nbor.cu

$(OBJ_DIR)/ellipsoid_nbor_ptx.h: $(OBJ_DIR)/ellipsoid_nbor.ptx
$(BSH) ./geryon/file_to_cstr.sh ellipsoid_nbor $(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/ellipsoid_nbor_ptx.h
$(OBJ_DIR)/ellipsoid_nbor_cubin.h: $(OBJ_DIR)/ellipsoid_nbor.cubin
$(BIN2C) -c -n ellipsoid_nbor $(OBJ_DIR)/ellipsoid_nbor.cubin > $(OBJ_DIR)/ellipsoid_nbor_cubin.h

$(OBJ_DIR)/gayberne.ptx: lal_gayberne.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_gayberne.cu
$(OBJ_DIR)/gayberne.cubin: lal_gayberne.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_gayberne.cu

$(OBJ_DIR)/gayberne_lj.ptx: lal_gayberne_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_gayberne_lj.cu
$(OBJ_DIR)/gayberne_lj.cubin: lal_gayberne_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_gayberne_lj.cu

$(OBJ_DIR)/gayberne_ptx.h: $(OBJ_DIR)/gayberne.ptx
$(BSH) ./geryon/file_to_cstr.sh gayberne $(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_ptx.h
$(OBJ_DIR)/gayberne_cubin.h: $(OBJ_DIR)/gayberne.cubin
$(BIN2C) -c -n gayberne $(OBJ_DIR)/gayberne.cubin > $(OBJ_DIR)/gayberne_cubin.h

$(OBJ_DIR)/gayberne_lj_ptx.h: $(OBJ_DIR)/gayberne_lj.ptx
$(BSH) ./geryon/file_to_cstr.sh gayberne_lj $(OBJ_DIR)/gayberne_lj.ptx $(OBJ_DIR)/gayberne_lj_ptx.h
$(OBJ_DIR)/gayberne_lj_cubin.h: $(OBJ_DIR)/gayberne_lj.cubin
$(BIN2C) -c -n gayberne_lj $(OBJ_DIR)/gayberne_lj.cubin > $(OBJ_DIR)/gayberne_lj_cubin.h

$(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_ptx.h $(OBJ_DIR)/gayberne_lj_ptx.h $(OBJ_DIR)/lal_base_ellipsoid.o
$(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_cubin.h $(OBJ_DIR)/gayberne_lj_cubin.h $(OBJ_DIR)/lal_base_ellipsoid.o
$(CUDR) -o $@ -c lal_gayberne.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_gayberne_ext.o: $(ALL_H) $(OBJ_DIR)/lal_gayberne.o lal_gayberne_ext.cpp
$(CUDR) -o $@ -c lal_gayberne_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/re_squared.ptx: lal_re_squared.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_re_squared.cu
$(OBJ_DIR)/re_squared.cubin: lal_re_squared.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_re_squared.cu

$(OBJ_DIR)/re_squared_lj.ptx: lal_re_squared_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_re_squared_lj.cu
$(OBJ_DIR)/re_squared_lj.cubin: lal_re_squared_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_re_squared_lj.cu

$(OBJ_DIR)/re_squared_ptx.h: $(OBJ_DIR)/re_squared.ptx
$(BSH) ./geryon/file_to_cstr.sh re_squared $(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_ptx.h
$(OBJ_DIR)/re_squared_cubin.h: $(OBJ_DIR)/re_squared.cubin
$(BIN2C) -c -n re_squared $(OBJ_DIR)/re_squared.cubin > $(OBJ_DIR)/re_squared_cubin.h

$(OBJ_DIR)/re_squared_lj_ptx.h: $(OBJ_DIR)/re_squared_lj.ptx
$(BSH) ./geryon/file_to_cstr.sh re_squared_lj $(OBJ_DIR)/re_squared_lj.ptx $(OBJ_DIR)/re_squared_lj_ptx.h
$(OBJ_DIR)/re_squared_lj_cubin.h: $(OBJ_DIR)/re_squared_lj.cubin
$(BIN2C) -c -n re_squared_lj $(OBJ_DIR)/re_squared_lj.cubin > $(OBJ_DIR)/re_squared_lj_cubin.h

$(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_ptx.h $(OBJ_DIR)/re_squared_lj_ptx.h $(OBJ_DIR)/lal_base_ellipsoid.o
$(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_cubin.h $(OBJ_DIR)/re_squared_lj_cubin.h $(OBJ_DIR)/lal_base_ellipsoid.o
$(CUDR) -o $@ -c lal_re_squared.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_re_squared_ext.o: $(ALL_H) $(OBJ_DIR)/lal_re_squared.o lal_re_squared_ext.cpp
$(CUDR) -o $@ -c lal_re_squared_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj.ptx: lal_lj.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj.cu
$(OBJ_DIR)/lj.cubin: lal_lj.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj.cu

$(OBJ_DIR)/lj_ptx.h: $(OBJ_DIR)/lj.ptx $(OBJ_DIR)/lj.ptx
$(BSH) ./geryon/file_to_cstr.sh lj $(OBJ_DIR)/lj.ptx $(OBJ_DIR)/lj_ptx.h
$(OBJ_DIR)/lj_cubin.h: $(OBJ_DIR)/lj.cubin $(OBJ_DIR)/lj.cubin
$(BIN2C) -c -n lj $(OBJ_DIR)/lj.cubin > $(OBJ_DIR)/lj_cubin.h

$(OBJ_DIR)/lal_lj.o: $(ALL_H) lal_lj.h lal_lj.cpp $(OBJ_DIR)/lj_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_lj.o: $(ALL_H) lal_lj.h lal_lj.cpp $(OBJ_DIR)/lj_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_lj.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_lj_ext.o: $(ALL_H) lal_lj.h lal_lj_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_lj_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj_coul.ptx: lal_lj_coul.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_coul.cu
$(OBJ_DIR)/lj_coul.cubin: lal_lj_coul.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul.cu

$(OBJ_DIR)/lj_coul_ptx.h: $(OBJ_DIR)/lj_coul.ptx $(OBJ_DIR)/lj_coul.ptx
$(BSH) ./geryon/file_to_cstr.sh lj_coul $(OBJ_DIR)/lj_coul.ptx $(OBJ_DIR)/lj_coul_ptx.h
$(OBJ_DIR)/lj_coul_cubin.h: $(OBJ_DIR)/lj_coul.cubin $(OBJ_DIR)/lj_coul.cubin
$(BIN2C) -c -n lj_coul $(OBJ_DIR)/lj_coul.cubin > $(OBJ_DIR)/lj_coul_cubin.h

$(OBJ_DIR)/lal_lj_coul.o: $(ALL_H) lal_lj_coul.h lal_lj_coul.cpp $(OBJ_DIR)/lj_coul_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_lj_coul.o: $(ALL_H) lal_lj_coul.h lal_lj_coul.cpp $(OBJ_DIR)/lj_coul_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_coul.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_lj_coul_ext.o: $(ALL_H) lal_lj_coul.h lal_lj_coul_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_coul_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj_class2_long.ptx: lal_lj_class2_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_class2_long.cu
$(OBJ_DIR)/lj_class2_long.cubin: lal_lj_class2_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_class2_long.cu

$(OBJ_DIR)/lj_class2_long_ptx.h: $(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long.ptx
$(BSH) ./geryon/file_to_cstr.sh lj_class2_long $(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long_ptx.h
$(OBJ_DIR)/lj_class2_long_cubin.h: $(OBJ_DIR)/lj_class2_long.cubin $(OBJ_DIR)/lj_class2_long.cubin
$(BIN2C) -c -n lj_class2_long $(OBJ_DIR)/lj_class2_long.cubin > $(OBJ_DIR)/lj_class2_long_cubin.h

$(OBJ_DIR)/lal_lj_class2_long.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long.cpp $(OBJ_DIR)/lj_class2_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_lj_class2_long.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long.cpp $(OBJ_DIR)/lj_class2_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_class2_long.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_lj_class2_long_ext.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_class2_long_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/coul_long.ptx: lal_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_coul_long.cu
$(OBJ_DIR)/coul_long.cubin: lal_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul_long.cu

$(OBJ_DIR)/coul_long_ptx.h: $(OBJ_DIR)/coul_long.ptx $(OBJ_DIR)/coul_long.ptx
$(BSH) ./geryon/file_to_cstr.sh coul_long $(OBJ_DIR)/coul_long.ptx $(OBJ_DIR)/coul_long_ptx.h
$(OBJ_DIR)/coul_long_cubin.h: $(OBJ_DIR)/coul_long.cubin $(OBJ_DIR)/coul_long.cubin
$(BIN2C) -c -n coul_long $(OBJ_DIR)/coul_long.cubin > $(OBJ_DIR)/coul_long_cubin.h

$(OBJ_DIR)/lal_coul_long.o: $(ALL_H) lal_coul_long.h lal_coul_long.cpp $(OBJ_DIR)/coul_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_coul_long.o: $(ALL_H) lal_coul_long.h lal_coul_long.cpp $(OBJ_DIR)/coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_coul_long.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_coul_long_ext.o: $(ALL_H) lal_coul_long.h lal_coul_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_coul_long_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj_coul_long.ptx: lal_lj_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_coul_long.cu
$(OBJ_DIR)/lj_coul_long.cubin: lal_lj_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul_long.cu

$(OBJ_DIR)/lj_coul_long_ptx.h: $(OBJ_DIR)/lj_coul_long.ptx $(OBJ_DIR)/lj_coul_long.ptx
$(BSH) ./geryon/file_to_cstr.sh lj_coul_long $(OBJ_DIR)/lj_coul_long.ptx $(OBJ_DIR)/lj_coul_long_ptx.h
$(OBJ_DIR)/lj_coul_long_cubin.h: $(OBJ_DIR)/lj_coul_long.cubin $(OBJ_DIR)/lj_coul_long.cubin
$(BIN2C) -c -n lj_coul_long $(OBJ_DIR)/lj_coul_long.cubin > $(OBJ_DIR)/lj_coul_long_cubin.h

$(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp $(OBJ_DIR)/lj_coul_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp $(OBJ_DIR)/lj_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_coul_long.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_lj_coul_long_ext.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_coul_long_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/morse.ptx: lal_morse.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_morse.cu
$(OBJ_DIR)/lj_dsf.cubin: lal_lj_dsf.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_dsf.cu

$(OBJ_DIR)/morse_ptx.h: $(OBJ_DIR)/morse.ptx $(OBJ_DIR)/morse.ptx
$(BSH) ./geryon/file_to_cstr.sh morse $(OBJ_DIR)/morse.ptx $(OBJ_DIR)/morse_ptx.h
$(OBJ_DIR)/lj_dsf_cubin.h: $(OBJ_DIR)/lj_dsf.cubin $(OBJ_DIR)/lj_dsf.cubin
$(BIN2C) -c -n lj_dsf $(OBJ_DIR)/lj_dsf.cubin > $(OBJ_DIR)/lj_dsf_cubin.h

$(OBJ_DIR)/lal_morse.o: $(ALL_H) lal_morse.h lal_morse.cpp $(OBJ_DIR)/morse_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_lj_dsf.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf.cpp $(OBJ_DIR)/lj_dsf_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_dsf.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_lj_dsf_ext.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_dsf_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/morse.cubin: lal_morse.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_morse.cu

$(OBJ_DIR)/morse_cubin.h: $(OBJ_DIR)/morse.cubin $(OBJ_DIR)/morse.cubin
$(BIN2C) -c -n morse $(OBJ_DIR)/morse.cubin > $(OBJ_DIR)/morse_cubin.h

$(OBJ_DIR)/lal_morse.o: $(ALL_H) lal_morse.h lal_morse.cpp $(OBJ_DIR)/morse_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_morse.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_morse_ext.o: $(ALL_H) lal_morse.h lal_morse_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_morse_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/charmm_long.ptx: lal_charmm_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_charmm_long.cu
$(OBJ_DIR)/charmm_long.cubin: lal_charmm_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_charmm_long.cu

$(OBJ_DIR)/charmm_long_ptx.h: $(OBJ_DIR)/charmm_long.ptx $(OBJ_DIR)/charmm_long.ptx
$(BSH) ./geryon/file_to_cstr.sh charmm_long $(OBJ_DIR)/charmm_long.ptx $(OBJ_DIR)/charmm_long_ptx.h
$(OBJ_DIR)/charmm_long_cubin.h: $(OBJ_DIR)/charmm_long.cubin $(OBJ_DIR)/charmm_long.cubin
$(BIN2C) -c -n charmm_long $(OBJ_DIR)/charmm_long.cubin > $(OBJ_DIR)/charmm_long_cubin.h

$(OBJ_DIR)/lal_charmm_long.o: $(ALL_H) lal_charmm_long.h lal_charmm_long.cpp $(OBJ_DIR)/charmm_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_charmm_long.o: $(ALL_H) lal_charmm_long.h lal_charmm_long.cpp $(OBJ_DIR)/charmm_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_charmm_long.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_charmm_long_ext.o: $(ALL_H) lal_charmm_long.h lal_charmm_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_charmm_long_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj96.ptx: lal_lj96.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj96.cu
$(OBJ_DIR)/lj96.cubin: lal_lj96.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj96.cu

$(OBJ_DIR)/lj96_ptx.h: $(OBJ_DIR)/lj96.ptx $(OBJ_DIR)/lj96.ptx
$(BSH) ./geryon/file_to_cstr.sh lj96 $(OBJ_DIR)/lj96.ptx $(OBJ_DIR)/lj96_ptx.h
$(OBJ_DIR)/lj96_cubin.h: $(OBJ_DIR)/lj96.cubin $(OBJ_DIR)/lj96.cubin
$(BIN2C) -c -n lj96 $(OBJ_DIR)/lj96.cubin > $(OBJ_DIR)/lj96_cubin.h

$(OBJ_DIR)/lal_lj96.o: $(ALL_H) lal_lj96.h lal_lj96.cpp $(OBJ_DIR)/lj96_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_lj96.o: $(ALL_H) lal_lj96.h lal_lj96.cpp $(OBJ_DIR)/lj96_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_lj96.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_lj96_ext.o: $(ALL_H) lal_lj96.h lal_lj96_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_lj96_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj_expand.ptx: lal_lj_expand.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_expand.cu
$(OBJ_DIR)/lj_expand.cubin: lal_lj_expand.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_expand.cu

$(OBJ_DIR)/lj_expand_ptx.h: $(OBJ_DIR)/lj_expand.ptx $(OBJ_DIR)/lj_expand.ptx
$(BSH) ./geryon/file_to_cstr.sh lj_expand $(OBJ_DIR)/lj_expand.ptx $(OBJ_DIR)/lj_expand_ptx.h
$(OBJ_DIR)/lj_expand_cubin.h: $(OBJ_DIR)/lj_expand.cubin $(OBJ_DIR)/lj_expand.cubin
$(BIN2C) -c -n lj_expand $(OBJ_DIR)/lj_expand.cubin > $(OBJ_DIR)/lj_expand_cubin.h

$(OBJ_DIR)/lal_lj_expand.o: $(ALL_H) lal_lj_expand.h lal_lj_expand.cpp $(OBJ_DIR)/lj_expand_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_lj_expand.o: $(ALL_H) lal_lj_expand.h lal_lj_expand.cpp $(OBJ_DIR)/lj_expand_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_lj_expand.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_lj_expand_ext.o: $(ALL_H) lal_lj_expand.h lal_lj_expand_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_lj_expand_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/cg_cmm.ptx: lal_cg_cmm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_cg_cmm.cu
$(OBJ_DIR)/cg_cmm.cubin: lal_cg_cmm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_cg_cmm.cu

$(OBJ_DIR)/cg_cmm_ptx.h: $(OBJ_DIR)/cg_cmm.ptx $(OBJ_DIR)/cg_cmm.ptx
$(BSH) ./geryon/file_to_cstr.sh cg_cmm $(OBJ_DIR)/cg_cmm.ptx $(OBJ_DIR)/cg_cmm_ptx.h
$(OBJ_DIR)/cg_cmm_cubin.h: $(OBJ_DIR)/cg_cmm.cubin $(OBJ_DIR)/cg_cmm.cubin
$(BIN2C) -c -n cg_cmm $(OBJ_DIR)/cg_cmm.cubin > $(OBJ_DIR)/cg_cmm_cubin.h

$(OBJ_DIR)/lal_cg_cmm.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm.cpp $(OBJ_DIR)/cg_cmm_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_cg_cmm.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm.cpp $(OBJ_DIR)/cg_cmm_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_cg_cmm.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_cg_cmm_ext.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_cg_cmm_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/cg_cmm_long.ptx: lal_cg_cmm_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_cg_cmm_long.cu
$(OBJ_DIR)/cg_cmm_long.cubin: lal_cg_cmm_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_cg_cmm_long.cu

$(OBJ_DIR)/cg_cmm_long_ptx.h: $(OBJ_DIR)/cg_cmm_long.ptx $(OBJ_DIR)/cg_cmm_long.ptx
$(BSH) ./geryon/file_to_cstr.sh cg_cmm_long $(OBJ_DIR)/cg_cmm_long.ptx $(OBJ_DIR)/cg_cmm_long_ptx.h
$(OBJ_DIR)/cg_cmm_long_cubin.h: $(OBJ_DIR)/cg_cmm_long.cubin $(OBJ_DIR)/cg_cmm_long.cubin
$(BIN2C) -c -n cg_cmm_long $(OBJ_DIR)/cg_cmm_long.cubin > $(OBJ_DIR)/cg_cmm_long_cubin.h

$(OBJ_DIR)/lal_cg_cmm_long.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long.cpp $(OBJ_DIR)/cg_cmm_long_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_cg_cmm_long.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long.cpp $(OBJ_DIR)/cg_cmm_long_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_cg_cmm_long.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_cg_cmm_long_ext.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_cg_cmm_long_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/eam.ptx: lal_eam.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_eam.cu
$(OBJ_DIR)/eam.cubin: lal_eam.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_eam.cu

$(OBJ_DIR)/eam_ptx.h: $(OBJ_DIR)/eam.ptx $(OBJ_DIR)/eam.ptx
$(BSH) ./geryon/file_to_cstr.sh eam $(OBJ_DIR)/eam.ptx $(OBJ_DIR)/eam_ptx.h
$(OBJ_DIR)/eam_cubin.h: $(OBJ_DIR)/eam.cubin $(OBJ_DIR)/eam.cubin
$(BIN2C) -c -n eam $(OBJ_DIR)/eam.cubin > $(OBJ_DIR)/eam_cubin.h

$(OBJ_DIR)/lal_eam.o: $(ALL_H) lal_eam.h lal_eam.cpp $(OBJ_DIR)/eam_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_eam.o: $(ALL_H) lal_eam.h lal_eam.cpp $(OBJ_DIR)/eam_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_eam.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_eam_ext.o: $(ALL_H) lal_eam.h lal_eam_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_eam_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/buck.ptx: lal_buck.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_buck.cu
$(OBJ_DIR)/buck.cubin: lal_buck.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_buck.cu

$(OBJ_DIR)/buck_ptx.h: $(OBJ_DIR)/buck.ptx $(OBJ_DIR)/buck.ptx
$(BSH) ./geryon/file_to_cstr.sh buck $(OBJ_DIR)/buck.ptx $(OBJ_DIR)/buck_ptx.h
$(OBJ_DIR)/buck_cubin.h: $(OBJ_DIR)/buck.cubin $(OBJ_DIR)/buck.cubin
$(BIN2C) -c -n buck $(OBJ_DIR)/buck.cubin > $(OBJ_DIR)/buck_cubin.h

$(OBJ_DIR)/lal_buck.o: $(ALL_H) lal_buck.h lal_buck.cpp $(OBJ_DIR)/buck_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_buck.o: $(ALL_H) lal_buck.h lal_buck.cpp $(OBJ_DIR)/buck_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_buck.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_buck_ext.o: $(ALL_H) lal_buck.h lal_buck_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_buck_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/buck_coul.ptx: lal_buck_coul.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_buck_coul.cu
$(OBJ_DIR)/buck_coul.cubin: lal_buck_coul.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_buck_coul.cu

$(OBJ_DIR)/buck_coul_ptx.h: $(OBJ_DIR)/buck_coul.ptx $(OBJ_DIR)/buck_coul.ptx
$(BSH) ./geryon/file_to_cstr.sh buck_coul $(OBJ_DIR)/buck_coul.ptx $(OBJ_DIR)/buck_coul_ptx.h
$(OBJ_DIR)/buck_coul_cubin.h: $(OBJ_DIR)/buck_coul.cubin $(OBJ_DIR)/buck_coul.cubin
$(BIN2C) -c -n buck_coul $(OBJ_DIR)/buck_coul.cubin > $(OBJ_DIR)/buck_coul_cubin.h

$(OBJ_DIR)/lal_buck_coul.o: $(ALL_H) lal_buck_coul.h lal_buck_coul.cpp $(OBJ_DIR)/buck_coul_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_buck_coul.o: $(ALL_H) lal_buck_coul.h lal_buck_coul.cpp $(OBJ_DIR)/buck_coul_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_buck_coul.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_buck_coul_ext.o: $(ALL_H) lal_buck_coul.h lal_buck_coul_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_buck_coul_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/buck_coul_long.ptx: lal_buck_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_buck_coul_long.cu
$(OBJ_DIR)/buck_coul_long.cubin: lal_buck_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_buck_coul_long.cu

$(OBJ_DIR)/buck_coul_long_ptx.h: $(OBJ_DIR)/buck_coul_long.ptx $(OBJ_DIR)/buck_coul_long.ptx
$(BSH) ./geryon/file_to_cstr.sh buck_coul_long $(OBJ_DIR)/buck_coul_long.ptx $(OBJ_DIR)/buck_coul_long_ptx.h
$(OBJ_DIR)/buck_coul_long_cubin.h: $(OBJ_DIR)/buck_coul_long.cubin $(OBJ_DIR)/buck_coul_long.cubin
$(BIN2C) -c -n buck_coul_long $(OBJ_DIR)/buck_coul_long.cubin > $(OBJ_DIR)/buck_coul_long_cubin.h

$(OBJ_DIR)/lal_buck_coul_long.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long.cpp $(OBJ_DIR)/buck_coul_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_buck_coul_long.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long.cpp $(OBJ_DIR)/buck_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_buck_coul_long.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_buck_coul_long_ext.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_buck_coul_long_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/table.ptx: lal_table.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_table.cu
$(OBJ_DIR)/table.cubin: lal_table.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_table.cu

$(OBJ_DIR)/table_ptx.h: $(OBJ_DIR)/table.ptx $(OBJ_DIR)/table.ptx
$(BSH) ./geryon/file_to_cstr.sh table $(OBJ_DIR)/table.ptx $(OBJ_DIR)/table_ptx.h
$(OBJ_DIR)/table_cubin.h: $(OBJ_DIR)/table.cubin $(OBJ_DIR)/table.cubin
$(BIN2C) -c -n table $(OBJ_DIR)/table.cubin > $(OBJ_DIR)/table_cubin.h

$(OBJ_DIR)/lal_table.o: $(ALL_H) lal_table.h lal_table.cpp $(OBJ_DIR)/table_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_table.o: $(ALL_H) lal_table.h lal_table.cpp $(OBJ_DIR)/table_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_table.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_table_ext.o: $(ALL_H) lal_table.h lal_table_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_table_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/yukawa.ptx: lal_yukawa.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_yukawa.cu
$(OBJ_DIR)/yukawa.cubin: lal_yukawa.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_yukawa.cu

$(OBJ_DIR)/yukawa_ptx.h: $(OBJ_DIR)/yukawa.ptx $(OBJ_DIR)/yukawa.ptx
$(BSH) ./geryon/file_to_cstr.sh yukawa $(OBJ_DIR)/yukawa.ptx $(OBJ_DIR)/yukawa_ptx.h
$(OBJ_DIR)/yukawa_cubin.h: $(OBJ_DIR)/yukawa.cubin $(OBJ_DIR)/yukawa.cubin
$(BIN2C) -c -n yukawa $(OBJ_DIR)/yukawa.cubin > $(OBJ_DIR)/yukawa_cubin.h

$(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp $(OBJ_DIR)/yukawa_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp $(OBJ_DIR)/yukawa_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_yukawa.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_yukawa_ext.o: $(ALL_H) lal_yukawa.h lal_yukawa_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_yukawa_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/born.cubin: lal_born.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_born.cu

$(OBJ_DIR)/born_cubin.h: $(OBJ_DIR)/born.cubin $(OBJ_DIR)/born.cubin
$(BIN2C) -c -n born $(OBJ_DIR)/born.cubin > $(OBJ_DIR)/born_cubin.h

$(OBJ_DIR)/lal_born.o: $(ALL_H) lal_born.h lal_born.cpp $(OBJ_DIR)/born_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_born.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_born_ext.o: $(ALL_H) lal_born.h lal_born_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_born_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/born_coul_wolf.cubin: lal_born_coul_wolf.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_born_coul_wolf.cu

$(OBJ_DIR)/born_coul_wolf_cubin.h: $(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf.cubin
$(BIN2C) -c -n born_coul_wolf $(OBJ_DIR)/born_coul_wolf.cubin > $(OBJ_DIR)/born_coul_wolf_cubin.h

$(OBJ_DIR)/lal_born_coul_wolf.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf.cpp $(OBJ_DIR)/born_coul_wolf_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_born_coul_wolf.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_born_coul_wolf_ext.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_born_coul_wolf_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/born_coul_long.cubin: lal_born_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_born_coul_long.cu

$(OBJ_DIR)/born_coul_long_cubin.h: $(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long.cubin
$(BIN2C) -c -n born_coul_long $(OBJ_DIR)/born_coul_long.cubin > $(OBJ_DIR)/born_coul_long_cubin.h

$(OBJ_DIR)/lal_born_coul_long.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long.cpp $(OBJ_DIR)/born_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_born_coul_long.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_born_coul_long_ext.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_born_coul_long_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/dipole_lj.cubin: lal_dipole_lj.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_dipole_lj.cu

$(OBJ_DIR)/dipole_lj_cubin.h: $(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj.cubin
$(BIN2C) -c -n dipole_lj $(OBJ_DIR)/dipole_lj.cubin > $(OBJ_DIR)/dipole_lj_cubin.h

$(OBJ_DIR)/lal_dipole_lj.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj.cpp $(OBJ_DIR)/dipole_lj_cubin.h $(OBJ_DIR)/lal_base_dipole.o
$(CUDR) -o $@ -c lal_dipole_lj.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_dipole_lj_ext.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj_ext.cpp lal_base_dipole.h
$(CUDR) -o $@ -c lal_dipole_lj_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/dipole_lj_sf.cubin: lal_dipole_lj_sf.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_dipole_lj_sf.cu

$(OBJ_DIR)/dipole_lj_sf_cubin.h: $(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf.cubin
$(BIN2C) -c -n dipole_lj_sf $(OBJ_DIR)/dipole_lj_sf.cubin > $(OBJ_DIR)/dipole_lj_sf_cubin.h

$(OBJ_DIR)/lal_dipole_lj_sf.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf.cpp $(OBJ_DIR)/dipole_lj_sf_cubin.h $(OBJ_DIR)/lal_base_dipole.o
$(CUDR) -o $@ -c lal_dipole_lj_sf.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_dipole_lj_sf_ext.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf_ext.cpp lal_base_dipole.h
$(CUDR) -o $@ -c lal_dipole_lj_sf_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/colloid.cubin: lal_colloid.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_colloid.cu

$(OBJ_DIR)/colloid_cubin.h: $(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid.cubin
$(BIN2C) -c -n colloid $(OBJ_DIR)/colloid.cubin > $(OBJ_DIR)/colloid_cubin.h

$(OBJ_DIR)/lal_colloid.o: $(ALL_H) lal_colloid.h lal_colloid.cpp $(OBJ_DIR)/colloid_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_colloid.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_colloid_ext.o: $(ALL_H) lal_colloid.h lal_colloid_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_colloid_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/gauss.cubin: lal_gauss.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_gauss.cu

$(OBJ_DIR)/gauss_cubin.h: $(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss.cubin
$(BIN2C) -c -n gauss $(OBJ_DIR)/gauss.cubin > $(OBJ_DIR)/gauss_cubin.h

$(OBJ_DIR)/lal_gauss.o: $(ALL_H) lal_gauss.h lal_gauss.cpp $(OBJ_DIR)/gauss_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_gauss.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_gauss_ext.o: $(ALL_H) lal_gauss.h lal_gauss_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_gauss_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/yukawa_colloid.cubin: lal_yukawa_colloid.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_yukawa_colloid.cu

$(OBJ_DIR)/yukawa_colloid_cubin.h: $(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid.cubin
$(BIN2C) -c -n yukawa_colloid $(OBJ_DIR)/yukawa_colloid.cubin > $(OBJ_DIR)/yukawa_colloid_cubin.h

$(OBJ_DIR)/lal_yukawa_colloid.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid.cpp $(OBJ_DIR)/yukawa_colloid_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_yukawa_colloid.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_yukawa_colloid_ext.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_yukawa_colloid_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj_coul_debye.cubin: lal_lj_coul_debye.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul_debye.cu

$(OBJ_DIR)/lj_coul_debye_cubin.h: $(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye.cubin
$(BIN2C) -c -n lj_coul_debye $(OBJ_DIR)/lj_coul_debye.cubin > $(OBJ_DIR)/lj_coul_debye_cubin.h

$(OBJ_DIR)/lal_lj_coul_debye.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye.cpp $(OBJ_DIR)/lj_coul_debye_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_coul_debye.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_lj_coul_debye_ext.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_coul_debye_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/coul_dsf.cubin: lal_coul_dsf.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul_dsf.cu

$(OBJ_DIR)/coul_dsf_cubin.h: $(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf.cubin
$(BIN2C) -c -n coul_dsf $(OBJ_DIR)/coul_dsf.cubin > $(OBJ_DIR)/coul_dsf_cubin.h

$(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/coul_dsf_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_coul_dsf.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR)

$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda

@@ -415,10 +575,10 @@ $(GPU_LIB): $(OBJS) $(CUDPP)
$(AR) -crusv $(GPU_LIB) $(OBJS) $(CUDPP)

clean:
rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(PTXS) *.linkinfo
rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CBNS) *.linkinfo

veryclean: clean
rm -rf *~ *.linkinfo

cleanlib:
rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(PTXS) *.linkinfo
rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CBNS) *.linkinfo

@@ -17,6 +17,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_neighbor_shared.o $(OBJ_DIR)/lal_neighbor.o \
$(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
$(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
$(OBJ_DIR)/lal_base_dipole.o \
$(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
$(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
$(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@@ -25,6 +26,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_lj_expand.o $(OBJ_DIR)/lal_lj_expand_ext.o \
$(OBJ_DIR)/lal_lj_coul.o $(OBJ_DIR)/lal_lj_coul_ext.o \
$(OBJ_DIR)/lal_lj_coul_long.o $(OBJ_DIR)/lal_lj_coul_long_ext.o \
$(OBJ_DIR)/lal_lj_dsf.o $(OBJ_DIR)/lal_lj_dsf_ext.o \
$(OBJ_DIR)/lal_lj_class2_long.o $(OBJ_DIR)/lal_lj_class2_long_ext.o \
$(OBJ_DIR)/lal_coul_long.o $(OBJ_DIR)/lal_coul_long_ext.o \
$(OBJ_DIR)/lal_morse.o $(OBJ_DIR)/lal_morse_ext.o \
@@ -36,20 +38,43 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_buck_coul.o $(OBJ_DIR)/lal_buck_coul_ext.o \
$(OBJ_DIR)/lal_buck_coul_long.o $(OBJ_DIR)/lal_buck_coul_long_ext.o \
$(OBJ_DIR)/lal_table.o $(OBJ_DIR)/lal_table_ext.o \
$(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o
$(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o \
$(OBJ_DIR)/lal_born.o $(OBJ_DIR)/lal_born_ext.o \
$(OBJ_DIR)/lal_born_coul_wolf.o $(OBJ_DIR)/lal_born_coul_wolf_ext.o \
$(OBJ_DIR)/lal_born_coul_long.o $(OBJ_DIR)/lal_born_coul_long_ext.o \
$(OBJ_DIR)/lal_dipole_lj.o $(OBJ_DIR)/lal_dipole_lj_ext.o \
$(OBJ_DIR)/lal_dipole_lj_sf.o $(OBJ_DIR)/lal_dipole_lj_sf_ext.o \
$(OBJ_DIR)/lal_colloid.o $(OBJ_DIR)/lal_colloid_ext.o \
$(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
$(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
$(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o

KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
$(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
$(OBJ_DIR)/ellipsoid_nbor_cl.h $(OBJ_DIR)/gayberne_cl.h \
$(OBJ_DIR)/gayberne_lj_cl.h $(OBJ_DIR)/re_squared_cl.h \
$(OBJ_DIR)/re_squared_lj_cl.h $(OBJ_DIR)/lj_cl.h $(OBJ_DIR)/lj96_cl.h \
$(OBJ_DIR)/lj_expand_cl.h $(OBJ_DIR)/lj_coul_cl.h \
$(OBJ_DIR)/lj_coul_long_cl.h $(OBJ_DIR)/lj_class2_long_cl.h \
$(OBJ_DIR)/lj_coul_long_cl.h $(OBJ_DIR)/lj_dsf_cl.h \
$(OBJ_DIR)/lj_class2_long_cl.h \
$(OBJ_DIR)/coul_long_cl.h $(OBJ_DIR)/morse_cl.h \
$(OBJ_DIR)/charmm_long_cl.h $(OBJ_DIR)/cg_cmm_cl.h \
$(OBJ_DIR)/cg_cmm_long_cl.h $(OBJ_DIR)/neighbor_gpu_cl.h \
$(OBJ_DIR)/eam_cl.h $(OBJ_DIR)/buck_cl.h \
$(OBJ_DIR)/buck_coul_cl.h $(OBJ_DIR)/buck_coul_long_cl.h \
$(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h
$(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h \
$(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \
$(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf_cubin.h \
$(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long_cubin.h \
$(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj_cubin.h \
$(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf_cubin.h \
$(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid_cubin.h \
$(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \
$(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \
$(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \
$(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h

OCL_EXECS = $(BIN_DIR)/ocl_get_devices

@@ -91,6 +116,9 @@ $(OBJ_DIR)/lal_base_charge.o: $(OCL_H) lal_base_charge.h lal_base_charge.cpp
$(OBJ_DIR)/lal_base_ellipsoid.o: $(OCL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_cl.h
$(OCL) -o $@ -c lal_base_ellipsoid.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_base_dipole.o: $(OCL_H) lal_base_dipole.h lal_base_dipole.cpp
$(OCL) -o $@ -c lal_base_dipole.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h;

@@ -154,6 +182,15 @@ $(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp
$(OBJ_DIR)/lal_lj_coul_long_ext.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_lj_coul_long_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj_dsf_cl.h: lal_lj_dsf.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh lj_dsf $(PRE1_H) lal_lj_dsf.cu $(OBJ_DIR)/lj_dsf_cl.h;

$(OBJ_DIR)/lal_lj_dsf.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf.cpp $(OBJ_DIR)/lj_dsf_cl.h $(OBJ_DIR)/lj_dsf_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_lj_dsf.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_lj_dsf_ext.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_lj_dsf_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj_class2_long_cl.h: lal_lj_class2_long.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh lj_class2_long $(PRE1_H) lal_lj_class2_long.cu $(OBJ_DIR)/lj_class2_long_cl.h;

@@ -280,6 +317,96 @@ $(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp $(OBJ_DIR)/yukawa
$(OBJ_DIR)/lal_yukawa_ext.o: $(ALL_H) lal_yukawa.h lal_yukawa_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_yukawa_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/born_cl.h: lal_born.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh born $(PRE1_H) lal_born.cu $(OBJ_DIR)/born_cl.h;

$(OBJ_DIR)/lal_born.o: $(ALL_H) lal_born.h lal_born.cpp $(OBJ_DIR)/born_cl.h $(OBJ_DIR)/born_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_born.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_born_ext.o: $(ALL_H) lal_born.h lal_born_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_born_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/born_coul_wolf_cl.h: lal_born_coul_wolf.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh born_coul_wolf $(PRE1_H) lal_born_coul_wolf.cu $(OBJ_DIR)/born_coul_wolf_cl.h;

$(OBJ_DIR)/lal_born_coul_wolf.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf.cpp $(OBJ_DIR)/born_coul_wolf_cl.h $(OBJ_DIR)/born_coul_wolf_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_born_coul_wolf.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_born_coul_wolf_ext.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_born_coul_wolf_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/born_coul_long_cl.h: lal_born_coul_long.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh born_coul_long $(PRE1_H) lal_born_coul_long.cu $(OBJ_DIR)/born_coul_long_cl.h;

$(OBJ_DIR)/lal_born_coul_long.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long.cpp $(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_born_coul_long.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_born_coul_long_ext.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_born_coul_long_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/dipole_lj_cl.h: lal_dipole_lj.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh dipole_lj $(PRE1_H) lal_dipole_lj.cu $(OBJ_DIR)/dipole_lj_cl.h;

$(OBJ_DIR)/lal_dipole_lj.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj.cpp $(OBJ_DIR)/dipole_lj_cl.h $(OBJ_DIR)/dipole_lj_cl.h $(OBJ_DIR)/lal_base_dipole.o
$(OCL) -o $@ -c lal_dipole_lj.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_dipole_lj_ext.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj_ext.cpp lal_base_dipole.h
$(OCL) -o $@ -c lal_dipole_lj_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/dipole_lj_sf_cl.h: lal_dipole_lj_sf.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh dipole_lj_sf $(PRE1_H) lal_dipole_lj_sf.cu $(OBJ_DIR)/dipole_lj_sf_cl.h;

$(OBJ_DIR)/lal_dipole_lj_sf.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf.cpp $(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/lal_base_dipole.o
$(OCL) -o $@ -c lal_dipole_lj_sf.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_dipole_lj_sf_ext.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf_ext.cpp lal_base_dipole.h
$(OCL) -o $@ -c lal_dipole_lj_sf_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/colloid_cl.h: lal_colloid.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh colloid $(PRE1_H) lal_colloid.cu $(OBJ_DIR)/colloid_cl.h;

$(OBJ_DIR)/lal_colloid.o: $(ALL_H) lal_colloid.h lal_colloid.cpp $(OBJ_DIR)/colloid_cl.h $(OBJ_DIR)/colloid_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_colloid.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_colloid_ext.o: $(ALL_H) lal_colloid.h lal_colloid_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_colloid_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/gauss_cl.h: lal_gauss.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh gauss $(PRE1_H) lal_gauss.cu $(OBJ_DIR)/gauss_cl.h;

$(OBJ_DIR)/lal_gauss.o: $(ALL_H) lal_gauss.h lal_gauss.cpp $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_gauss.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_gauss_ext.o: $(ALL_H) lal_gauss.h lal_gauss_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_gauss_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/yukawa_colloid_cl.h: lal_yukawa_colloid.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh yukawa_colloid $(PRE1_H) lal_yukawa_colloid.cu $(OBJ_DIR)/yukawa_colloid_cl.h;

$(OBJ_DIR)/lal_yukawa_colloid.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid.cpp $(OBJ_DIR)/yukawa_colloid_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_yukawa_colloid.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_yukawa_colloid_ext.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_yukawa_colloid_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lj_coul_debye_cl.h: lal_lj_coul_debye.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh lj_coul_debye $(PRE1_H) lal_lj_coul_debye.cu $(OBJ_DIR)/lj_coul_debye_cl.h;

$(OBJ_DIR)/lal_lj_coul_debye.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye.cpp $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_lj_coul_debye.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_lj_coul_debye_ext.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_lj_coul_debye_ext.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/coul_dsf_cl.h: lal_coul_dsf.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh coul_dsf $(PRE1_H) lal_coul_dsf.cu $(OBJ_DIR)/coul_dsf_cl.h;

$(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/coul_dsf_cl.h $(OBJ_DIR)/coul_dsf_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_coul_dsf.cpp -I$(OBJ_DIR)

$(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR)

$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)
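The OpenCL rules in this file mirror the CUDA chain above, except that no device binary is produced: geryon/file_to_cstr.sh concatenates the preprocessor header and the kernel .cu source into a *_cl.h header holding the kernel source as a C string, and the host object is then compiled against it so the kernel can be built at run time. The sketch below is only an illustration of that pattern under the assumption that $(PRE1_H), $(BSH) and $(OCL) keep the meanings they have in this Makefile; the real file writes each rule out explicitly.

# Hypothetical pattern rules for the OpenCL string-embedding chain.
$(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H)
	$(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) lal_$*.cu $@;

$(OBJ_DIR)/lal_%.o: lal_%.cpp $(OBJ_DIR)/%_cl.h
	$(OCL) -o $@ -c lal_$*.cpp -I$(OBJ_DIR)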
|
||||
|
||||

@ -1,3 +1,7 @@
NOTE: This Geryon distribution has been modified to remove files not
necessary for the LAMMPS implementation. The full distribution
is available at http://users.nccs.gov/~wb8/geryon/index.htm

Geryon

Copyright (2010) Sandia Corporation. Under the terms of Contract
@ -1 +1 @@
Geryon Version 12.034
Geryon Version 12.033
@ -141,6 +141,11 @@ class UCL_Device {
  /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type(const int i) { return UCL_GPU; }

  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory() { return shared_memory(_device); }
  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }

  /// Returns true if double precision is support for the current device
  bool double_precision() { return double_precision(_device); }
  /// Returns true if double precision is support for the device
@ -30,11 +30,23 @@
namespace ucl_cudadr {

class UCL_Texture;
template <class numtyp> class UCL_D_Vec;
template <class numtyp> class UCL_D_Mat;
template <class hosttype, class devtype> class UCL_Vector;
template <class hosttype, class devtype> class UCL_Matrix;
#define UCL_MAX_KERNEL_ARGS 256

/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
 public:
  inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }
  inline UCL_Program(UCL_Device &device, const void *program,
                     const char *flags="", std::string *log=NULL) {
    _cq=device.cq();
    init(device);
    load_string(program,flags,log);
  }

  inline ~UCL_Program() {}

  /// Initialize the program with a device
@ -64,10 +76,10 @@ class UCL_Program {
  }

  /// Load a program from a string and compile with flags
  inline int load_string(const char *program, const char *flags="",
  inline int load_string(const void *program, const char *flags="",
                         std::string *log=NULL) {
    if (std::string(flags)=="BINARY")
      return load_binary(program);
      return load_binary((const char *)program);
    const unsigned int num_opts=2;
    CUjit_option options[num_opts];
    void *values[num_opts];
@ -134,15 +146,25 @@ class UCL_Program {
  friend class UCL_Texture;
};

/// Class for dealing with OpenCL kernels
/// Class for dealing with CUDA Driver kernels
class UCL_Kernel {
 public:
  UCL_Kernel() : _dimensions(1), _num_args(0), _param_size(0)
    { _num_blocks[0]=0; }
  UCL_Kernel() : _dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000
    _param_size=0;
#endif
    _num_blocks[0]=0;
  }

  UCL_Kernel(UCL_Program &program, const char *function) :
    _dimensions(1), _num_args(0), _param_size(0)
    { _num_blocks[0]=0; set_function(program,function); _cq=program._cq; }
    _dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000
    _param_size=0;
#endif
    _num_blocks[0]=0;
    set_function(program,function);
    _cq=program._cq;
  }

  ~UCL_Kernel() {}

@ -170,78 +192,190 @@ class UCL_Kernel {
|
||||
* changes
|
||||
* \note To set kernel parameter i (i>0), parameter i-1 must be set **/
|
||||
template <class dtype>
|
||||
inline void set_arg(const unsigned index, dtype *arg) {
|
||||
inline void set_arg(const unsigned index, const dtype * const arg) {
|
||||
if (index==_num_args)
|
||||
add_arg(arg);
|
||||
else if (index<_num_args)
|
||||
#if CUDA_VERSION >= 4000
|
||||
_kernel_args[index]=arg;
|
||||
#else
|
||||
CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
|
||||
#endif
|
||||
else
|
||||
assert(0==1); // Must add kernel parameters in sequential order
|
||||
}
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
|
||||
{ set_arg(&arg->begin()); }
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
|
||||
{ set_arg(&arg->begin()); }
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
|
||||
{ set_arg(&arg->device.begin()); }
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
|
||||
{ set_arg(&arg->device.begin()); }
|
||||
|
||||
/// Add a kernel argument.
|
||||
inline void add_arg(const CUdeviceptr* const arg) {
|
||||
#if CUDA_VERSION >= 4000
|
||||
_kernel_args[_num_args]=(void *)arg;
|
||||
#else
|
||||
void* ptr = (void*)(size_t)(*arg);
|
||||
_param_size = (_param_size + __alignof(ptr) - 1) & ~(__alignof(ptr) - 1);
|
||||
CU_SAFE_CALL(cuParamSetv(_kernel, _param_size, &ptr, sizeof(ptr)));
|
||||
_offsets.push_back(_param_size);
|
||||
_param_size+=sizeof(ptr);
|
||||
#endif
|
||||
_num_args++;
|
||||
if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
|
||||
}
|
||||
|
||||
/// Add a kernel argument.
|
||||
template <class dtype>
|
||||
inline void add_arg(const dtype* const arg) {
|
||||
#if CUDA_VERSION >= 4000
|
||||
_kernel_args[_num_args]=const_cast<dtype * const>(arg);
|
||||
#else
|
||||
_param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
|
||||
CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));
|
||||
_offsets.push_back(_param_size);
|
||||
_param_size+=sizeof(dtype);
|
||||
#endif
|
||||
_num_args++;
|
||||
if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
|
||||
}
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
|
||||
{ add_arg(&arg->begin()); }
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
|
||||
{ add_arg(&arg->begin()); }
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
|
||||
{ add_arg(&arg->device.begin()); }
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
|
||||
{ add_arg(&arg->device.begin()); }
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called after all arguments have been added **/
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks, const size_t block_size) {
|
||||
_dimensions=1;
|
||||
_num_blocks[0]=num_blocks;
|
||||
_num_blocks[1]=1;
|
||||
_num_blocks[2]=1;
|
||||
#if CUDA_VERSION >= 4000
|
||||
_block_size[0]=block_size;
|
||||
_block_size[1]=1;
|
||||
_block_size[2]=1;
|
||||
#else
|
||||
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue for the kernel is changed to cq **/
|
||||
inline void set_size(const size_t num_blocks, const size_t block_size,
|
||||
command_queue &cq)
|
||||
{ _cq=cq; set_size(num_blocks,block_size); }
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x, const size_t block_size_y) {
|
||||
_dimensions=2;
|
||||
_num_blocks[0]=num_blocks_x;
|
||||
_num_blocks[1]=num_blocks_y;
|
||||
_num_blocks[2]=1;
|
||||
#if CUDA_VERSION >= 4000
|
||||
_block_size[0]=block_size_x;
|
||||
_block_size[1]=block_size_y;
|
||||
_block_size[2]=1;
|
||||
#else
|
||||
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue for the kernel is changed to cq **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x, const size_t block_size_y,
|
||||
command_queue &cq)
|
||||
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x,
|
||||
const size_t block_size_y, const size_t block_size_z) {
|
||||
_dimensions=2;
|
||||
_num_blocks[0]=num_blocks_x;
|
||||
_num_blocks[1]=num_blocks_y;
|
||||
_num_blocks[2]=1;
|
||||
#if CUDA_VERSION >= 4000
|
||||
_block_size[0]=block_size_x;
|
||||
_block_size[1]=block_size_y;
|
||||
_block_size[2]=block_size_z;
|
||||
#else
|
||||
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
|
||||
block_size_z));
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x, const size_t block_size_y,
|
||||
const size_t block_size_z, command_queue &cq) {
|
||||
_cq=cq;
|
||||
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
|
||||
block_size_z);
|
||||
}
|
||||
|
||||
/// Run the kernel in the default command queue
|
||||
inline void run() {
|
||||
#if CUDA_VERSION >= 4000
|
||||
CU_SAFE_CALL(cuLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
|
||||
_num_blocks[2],_block_size[0],_block_size[1],
|
||||
_block_size[2],0,_cq,_kernel_args,NULL));
|
||||
#else
|
||||
CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
|
||||
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
|
||||
}
|
||||
|
||||
/// Run the kernel in the specified command queue
|
||||
inline void run(command_queue &cq) {
|
||||
CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
|
||||
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],cq));
|
||||
#endif
|
||||
}
|
||||
|
||||
  /// Clear any arguments associated with the kernel
  inline void clear_args() { _num_args=0; _offsets.clear(); _param_size=0; }
  inline void clear_args() {
    _num_args=0;
#if CUDA_VERSION < 4000
    _offsets.clear();
    _param_size=0;
#endif
  }

  #include "ucl_arg_kludge.h"

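Taken together, the notes in this hunk document a fixed call order for UCL_Kernel: set_size() first, then one add_arg() per parameter in sequence, then run(). A minimal usage sketch under those assumptions follows; the header choice, kernel name "k_example", and the alloc() signature are illustrative only and not taken from this commit:

  // Sketch of launching a 1D Geryon kernel (CUDA driver back end assumed).
  #include "nvd_device.h"
  #include "nvd_kernel.h"
  #include "nvd_mat.h"

  using namespace ucl_cudadr;

  void launch_example(UCL_Device &dev, const char *kernel_src, int n) {
    UCL_Program program(dev, (const void *)kernel_src);  // constructor from this hunk
    UCL_Kernel k(program, "k_example");                  // hypothetical kernel name

    UCL_D_Vec<float> x;
    x.alloc(n, dev);                   // assumed container allocation call

    k.set_size(n / 128, 128);          // blocks, threads per block; before any args
    k.add_arg(&x);                     // container overload added in this hunk
    k.add_arg(&n);                     // scalar overload
    k.run();                           // default command queue
  }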
@ -249,11 +383,17 @@ class UCL_Kernel {
|
||||
CUfunction _kernel;
|
||||
CUstream _cq;
|
||||
unsigned _dimensions;
|
||||
unsigned _num_blocks[2];
|
||||
unsigned _num_blocks[3];
|
||||
unsigned _num_args;
|
||||
friend class UCL_Texture;
|
||||
|
||||
#if CUDA_VERSION >= 4000
|
||||
unsigned _block_size[3];
|
||||
void * _kernel_args[UCL_MAX_KERNEL_ARGS];
|
||||
#else
|
||||
std::vector<unsigned> _offsets;
|
||||
unsigned _param_size;
|
||||
friend class UCL_Texture;
|
||||
#endif
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
@ -38,6 +38,9 @@ namespace ucl_cudadr {
|
||||
#include "ucl_h_mat.h"
|
||||
#include "ucl_d_vec.h"
|
||||
#include "ucl_d_mat.h"
|
||||
#include "ucl_s_obj_help.h"
|
||||
#include "ucl_vector.h"
|
||||
#include "ucl_matrix.h"
|
||||
#undef _UCL_DEVICE_PTR_MAT
|
||||
#undef _UCL_MAT_ALLOW
|
||||
|
||||
|
||||
@ -85,6 +85,21 @@ inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
|
||||
free(mat.begin());
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline int _host_resize(mat_type &mat, const size_t n) {
|
||||
_host_free(mat,mat.kind());
|
||||
CUresult err=CUDA_SUCCESS;
|
||||
if (mat.kind()==UCL_RW_OPTIMIZED)
|
||||
err=cuMemAllocHost((void **)mat.host_ptr(),n);
|
||||
else if (mat.kind()==UCL_WRITE_OPTIMIZED)
|
||||
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
|
||||
else
|
||||
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
|
||||
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
|
||||
return UCL_MEMORY_ERROR;
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// - DEVICE MEMORY ALLOCATION ROUTINES
|
||||
// --------------------------------------------------------------------------
|
||||
@ -143,6 +158,29 @@ inline void _device_free(mat_type &mat) {
|
||||
CU_DESTRUCT_CALL(cuMemFree(mat.cbegin()));
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline int _device_resize(mat_type &mat, const size_t n) {
|
||||
_device_free(mat);
|
||||
CUresult err=cuMemAlloc(&mat.cbegin(),n);
|
||||
if (err!=CUDA_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline int _device_resize(mat_type &mat, const size_t rows,
|
||||
const size_t cols, size_t &pitch) {
|
||||
_device_free(mat);
|
||||
CUresult err;
|
||||
CUDA_INT_TYPE upitch;
|
||||
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
|
||||
cols*sizeof(typename mat_type::data_type),rows,16);
|
||||
pitch=static_cast<size_t>(upitch);
|
||||
if (err!=CUDA_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
|
||||
*ptr=in;
|
||||
}
|
||||
|
||||
@ -42,27 +42,56 @@ class UCL_Texture {
|
||||
{ CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); }
|
||||
|
||||
/// Bind a float array where each fetch grabs a vector of length numel
|
||||
template<class mat_typ>
|
||||
inline void bind_float(mat_typ &vec, const unsigned numel) {
|
||||
#ifdef UCL_DEBUG
|
||||
assert(numel!=0 && numel<5);
|
||||
#endif
|
||||
CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
|
||||
vec.numel()*vec.element_size()));
|
||||
CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel));
|
||||
}
|
||||
template<class numtyp>
|
||||
inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel)
|
||||
{ _bind_float(vec,numel); }
|
||||
|
||||
/// Bind a float array where each fetch grabs a vector of length numel
|
||||
template<class numtyp>
|
||||
inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel)
|
||||
{ _bind_float(vec,numel); }
|
||||
|
||||
/// Bind a float array where each fetch grabs a vector of length numel
|
||||
template<class numtyp, class devtyp>
|
||||
inline void bind_float(UCL_Vector<numtyp, devtyp> &vec, const unsigned numel)
|
||||
{ _bind_float(vec.device,numel); }
|
||||
|
||||
/// Bind a float array where each fetch grabs a vector of length numel
|
||||
template<class numtyp, class devtyp>
|
||||
inline void bind_float(UCL_Matrix<numtyp, devtyp> &vec, const unsigned numel)
|
||||
{ _bind_float(vec.device,numel); }
|
||||
|
||||
/// Unbind the texture reference from the memory allocation
|
||||
inline void unbind() { }
|
||||
|
||||
/// Make a texture reference available to kernel
|
||||
inline void allow(UCL_Kernel &kernel) {
|
||||
#if CUDA_VERSION < 4000
|
||||
CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
|
||||
#endif
|
||||
}
|
||||
|
||||
private:
|
||||
CUtexref _tex;
|
||||
friend class UCL_Kernel;
|
||||
|
||||
template<class mat_typ>
|
||||
inline void _bind_float(mat_typ &vec, const unsigned numel) {
|
||||
#ifdef UCL_DEBUG
|
||||
assert(numel!=0 && numel<5);
|
||||
#endif
|
||||
CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
|
||||
vec.numel()*vec.element_size()));
|
||||
if (vec.element_size()==sizeof(float))
|
||||
CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel));
|
||||
else {
|
||||
if (numel>2)
|
||||
CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_SIGNED_INT32, numel));
|
||||
else
|
||||
CU_SAFE_CALL(cuTexRefSetFormat(_tex,CU_AD_FORMAT_SIGNED_INT32,numel*2));
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
@ -158,6 +158,11 @@ class UCL_Device {
|
||||
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
|
||||
inline int device_type(const int i);
|
||||
|
||||
/// Returns true if host memory is efficiently addressable from device
|
||||
inline bool shared_memory() { return shared_memory(_device); }
|
||||
/// Returns true if host memory is efficiently addressable from device
|
||||
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
|
||||
|
||||
/// Returns true if double precision is support for the current device
|
||||
bool double_precision() { return double_precision(_device); }
|
||||
/// Returns true if double precision is support for the device
|
||||
|
||||
@ -29,11 +29,25 @@
|
||||
|
||||
namespace ucl_opencl {
|
||||
|
||||
class UCL_Texture;
|
||||
template <class numtyp> class UCL_D_Vec;
|
||||
template <class numtyp> class UCL_D_Mat;
|
||||
template <class hosttype, class devtype> class UCL_Vector;
|
||||
template <class hosttype, class devtype> class UCL_Matrix;
|
||||
#define UCL_MAX_KERNEL_ARGS 256
|
||||
|
||||
/// Class storing 1 or more kernel functions from a single string or file
|
||||
class UCL_Program {
|
||||
public:
|
||||
inline UCL_Program() : _init_done(false) {}
|
||||
inline UCL_Program(UCL_Device &device) : _init_done(false) { init(device); }
|
||||
inline UCL_Program(UCL_Device &device, const void *program,
|
||||
const char *flags="", std::string *log=NULL) :
|
||||
_init_done(false) {
|
||||
init(device);
|
||||
load_string(program,flags,log);
|
||||
}
|
||||
|
||||
inline ~UCL_Program() { clear(); }
|
||||
|
||||
/// Initialize the program with a device
|
||||
@ -78,10 +92,10 @@ class UCL_Program {
|
||||
}
|
||||
|
||||
/// Load a program from a string and compile with flags
|
||||
inline int load_string(const char *program, const char *flags="",
|
||||
inline int load_string(const void *program, const char *flags="",
|
||||
std::string *log=NULL) {
|
||||
cl_int error_flag;
|
||||
const char *prog=program;
|
||||
const char *prog=(const char *)program;
|
||||
_program=clCreateProgramWithSource(_context,1,&prog,NULL,&error_flag);
|
||||
CL_CHECK_ERR(error_flag);
|
||||
error_flag = clBuildProgram(_program,1,&_device,flags,NULL,NULL);
|
||||
@ -159,19 +173,61 @@ class UCL_Kernel {
|
||||
/** If not a device pointer, this must be repeated each time the argument
|
||||
* changes **/
|
||||
template <class dtype>
|
||||
inline void set_arg(const cl_uint index, dtype *arg) {
|
||||
inline void set_arg(const cl_uint index, const dtype * const arg) {
|
||||
CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
|
||||
if (index>_num_args) _num_args=index;
|
||||
}
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
|
||||
{ set_arg(&arg->begin()); }
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
|
||||
{ set_arg(&arg->begin()); }
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
|
||||
{ set_arg(&arg->device.begin()); }
|
||||
|
||||
/// Set a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
|
||||
{ set_arg(&arg->device.begin()); }
|
||||
|
||||
/// Add a kernel argument.
|
||||
template <class dtype>
|
||||
inline void add_arg(dtype *arg) {
|
||||
inline void add_arg(const dtype * const arg) {
|
||||
CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
|
||||
_num_args++;
|
||||
}
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
|
||||
{ add_arg(&arg->begin()); }
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class numtyp>
|
||||
inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
|
||||
{ add_arg(&arg->begin()); }
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
|
||||
{ add_arg(&arg->device.begin()); }
|
||||
|
||||
/// Add a geryon container as a kernel argument.
|
||||
template <class hosttype, class devtype>
|
||||
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
|
||||
{ add_arg(&arg->device.begin()); }
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks, const size_t block_size) {
|
||||
_dimensions=1;
|
||||
_num_blocks[0]=num_blocks*block_size;
|
||||
@ -179,6 +235,15 @@ class UCL_Kernel {
|
||||
}
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue for the kernel is changed to cq **/
|
||||
inline void set_size(const size_t num_blocks, const size_t block_size,
|
||||
command_queue &cq)
|
||||
{ _cq=cq; set_size(num_blocks,block_size); }
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x, const size_t block_size_y) {
|
||||
_dimensions=2;
|
||||
@ -189,6 +254,16 @@ class UCL_Kernel {
|
||||
}
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue for the kernel is changed to cq **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x, const size_t block_size_y,
|
||||
command_queue &cq)
|
||||
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
|
||||
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x,
|
||||
const size_t block_size_y, const size_t block_size_z) {
|
||||
@ -202,14 +277,20 @@ class UCL_Kernel {
|
||||
_block_size[2]=block_size_z;
|
||||
}
|
||||
|
||||
/// Run the kernel in the default command queue
|
||||
inline void run() {
|
||||
run(_cq);
|
||||
/// Set the number of thread blocks and the number of threads in each block
|
||||
/** \note This should be called before any arguments have been added
|
||||
\note The default command queue is used for the kernel execution **/
|
||||
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
|
||||
const size_t block_size_x, const size_t block_size_y,
|
||||
const size_t block_size_z, command_queue &cq) {
|
||||
_cq=cq;
|
||||
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
|
||||
block_size_z);
|
||||
}
|
||||
|
||||
/// Run the kernel in the specified command queue
|
||||
inline void run(command_queue &cq) {
|
||||
CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,_kernel,_dimensions,NULL,
|
||||
/// Run the kernel in the default command queue
|
||||
inline void run() {
|
||||
CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL,
|
||||
_num_blocks,_block_size,0,NULL,NULL));
|
||||
}
|
||||
|
||||
|
||||
@ -39,6 +39,9 @@ namespace ucl_opencl {
|
||||
#include "ucl_h_mat.h"
|
||||
#include "ucl_d_vec.h"
|
||||
#include "ucl_d_mat.h"
|
||||
#include "ucl_s_obj_help.h"
|
||||
#include "ucl_vector.h"
|
||||
#include "ucl_matrix.h"
|
||||
#undef _UCL_DEVICE_PTR_MAT
|
||||
#undef _OCL_MAT
|
||||
#undef _UCL_MAT_ALLOW
|
||||
|
||||
@ -132,6 +132,37 @@ inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
|
||||
CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline int _host_resize(mat_type &mat, const size_t n) {
|
||||
cl_int error_flag;
|
||||
cl_context context;
|
||||
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
|
||||
&context,NULL));
|
||||
|
||||
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
|
||||
if (mat.kind()==UCL_WRITE_OPTIMIZED) {
|
||||
mat.cbegin()=clCreateBuffer(context,
|
||||
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
|
||||
n,NULL,&error_flag);
|
||||
if (error_flag != CL_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
*mat.host_ptr() = (typename mat_type::data_type*)
|
||||
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
|
||||
CL_MAP_WRITE,0,n,0,NULL,NULL,NULL);
|
||||
} else {
|
||||
mat.cbegin()=clCreateBuffer(context,
|
||||
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
|
||||
n,NULL,&error_flag);
|
||||
if (error_flag != CL_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
*mat.host_ptr() = (typename mat_type::data_type*)
|
||||
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
|
||||
CL_MAP_READ | CL_MAP_WRITE,
|
||||
0,n,0,NULL,NULL,NULL);
|
||||
}
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// - DEVICE MEMORY ALLOCATION ROUTINES
|
||||
// --------------------------------------------------------------------------
|
||||
@ -211,6 +242,61 @@ inline void _device_free(mat_type &mat) {
|
||||
CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
|
||||
}
|
||||
|
||||
template <class mat_type>
|
||||
inline int _device_resize(mat_type &mat, const size_t n) {
|
||||
cl_int error_flag;
|
||||
|
||||
cl_context context;
|
||||
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
|
||||
&context,NULL));
|
||||
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
|
||||
|
||||
cl_mem_flags flag;
|
||||
if (mat.kind()==UCL_READ_WRITE)
|
||||
flag=CL_MEM_READ_WRITE;
|
||||
else if (mat.kind()==UCL_READ_ONLY)
|
||||
flag=CL_MEM_READ_ONLY;
|
||||
else if (mat.kind()==UCL_WRITE_ONLY)
|
||||
flag=CL_MEM_WRITE_ONLY;
|
||||
else
|
||||
assert(0==1);
|
||||
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
|
||||
if (error_flag != CL_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t rows,
                          const size_t cols, size_t &pitch) {
  size_t padded_cols=cols;
  if (cols%256!=0)
    padded_cols+=256-cols%256;
  pitch=padded_cols*sizeof(typename mat_type::data_type);

  cl_int error_flag;

  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,NULL));
  CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));

  cl_mem_flags flag;
  if (mat.kind()==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (mat.kind()==UCL_READ_ONLY)
    flag=CL_MEM_READ_ONLY;
  else if (mat.kind()==UCL_WRITE_ONLY)
    flag=CL_MEM_WRITE_ONLY;
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag);
  if (error_flag != CL_SUCCESS)
    return UCL_MEMORY_ERROR;
  return UCL_SUCCESS;
}

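The OpenCL pitched resize above rounds each row up to a multiple of 256 elements before computing the pitch: with cols = 1000 single-precision elements, 1000 % 256 = 232, so padded_cols = 1000 + (256 - 232) = 1024 and pitch = 1024 * sizeof(float) = 4096 bytes; a 100-row matrix therefore allocates pitch*rows = 409600 bytes. A stand-alone check of that arithmetic (not code from this commit):

  // Verifies the row-padding rule used by _device_resize above.
  #include <cassert>
  #include <cstddef>

  static size_t padded_pitch(size_t cols, size_t elem_size) {
    size_t padded_cols = cols;
    if (cols % 256 != 0)
      padded_cols += 256 - cols % 256;   // round up to a multiple of 256 elements
    return padded_cols * elem_size;
  }

  int main() {
    assert(padded_pitch(1000, sizeof(float))  == 4096);
    assert(padded_pitch(1024, sizeof(float))  == 4096);
    assert(padded_pitch(1000, sizeof(double)) == 8192);
    return 0;
  }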
// --------------------------------------------------------------------------
|
||||
// - ZERO ROUTINES
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
@ -828,441 +828,3 @@
|
||||
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
|
||||
run();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class t1>
|
||||
inline void run_cq(command_queue &cq, t1 *a1) {
|
||||
clear_args();
|
||||
add_arg(a1);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15,
|
||||
class t16>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15,
|
||||
class t16, class t17>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15,
|
||||
class t16, class t17, class t18>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15,
|
||||
class t16, class t17, class t18, class t19>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15,
|
||||
class t16, class t17, class t18, class t19, class t20>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15,
|
||||
class t16, class t17, class t18, class t19, class t20,
|
||||
class t21>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15,
|
||||
class t16, class t17, class t18, class t19, class t20,
|
||||
class t21, class t22>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15,
|
||||
class t16, class t17, class t18, class t19, class t20,
|
||||
class t21, class t22, class t23>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15,
|
||||
class t16, class t17, class t18, class t19, class t20,
|
||||
class t21, class t22, class t23, class t24>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15,
|
||||
class t16, class t17, class t18, class t19, class t20,
|
||||
class t21, class t22, class t23, class t24, class t25>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15,
|
||||
class t16, class t17, class t18, class t19, class t20,
|
||||
class t21, class t22, class t23, class t24, class t25,
|
||||
class t26>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
|
||||
t26 *a26) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15,
|
||||
class t16, class t17, class t18, class t19, class t20,
|
||||
class t21, class t22, class t23, class t24, class t25,
|
||||
class t26, class t27>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
|
||||
t26 *a26, t27 *a27) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26); add_arg(a27);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15,
|
||||
class t16, class t17, class t18, class t19, class t20,
|
||||
class t21, class t22, class t23, class t24, class t25,
|
||||
class t26, class t27, class t28>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
|
||||
t26 *a26, t27 *a27, t28 *a28) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26); add_arg(a27); add_arg(a28);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15,
|
||||
class t16, class t17, class t18, class t19, class t20,
|
||||
class t21, class t22, class t23, class t24, class t25,
|
||||
class t26, class t27, class t28, class t29>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
|
||||
t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3, class t4, class t5,
|
||||
class t6, class t7, class t8, class t9, class t10,
|
||||
class t11, class t12, class t13, class t14, class t15,
|
||||
class t16, class t17, class t18, class t19, class t20,
|
||||
class t21, class t22, class t23, class t24, class t25,
|
||||
class t26, class t27, class t28, class t29, class t30>
|
||||
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
|
||||
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
|
||||
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
|
||||
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
|
||||
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
|
||||
t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
|
||||
clear_args();
|
||||
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
|
||||
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
|
||||
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
|
||||
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
|
||||
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
|
||||
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
|
||||
run(cq);
|
||||
}
|
||||
|
||||
|
||||
@ -344,6 +344,39 @@ class UCL_D_Mat : public UCL_BaseMat {
|
||||
inline void clear()
|
||||
{ _rows=0; if (_kind!=UCL_VIEW) { _kind=UCL_VIEW; _device_free(*this); } }
|
||||
|
||||
/// Resize the allocation to contain cols elements
|
||||
/** \note Cannot be used on views **/
|
||||
inline int resize(const int rows, const int cols) {
|
||||
assert(_kind!=UCL_VIEW);
|
||||
|
||||
int err=_device_resize(*this,rows,cols,_pitch);
|
||||
if (err!=UCL_SUCCESS) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not allocate "
|
||||
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
return err;
|
||||
}
|
||||
|
||||
_rows=rows;
|
||||
_cols=cols;
|
||||
_row_size=_pitch/sizeof(numtyp);
|
||||
#ifndef _UCL_DEVICE_PTR_MAT
|
||||
_end=_array+_row_size*cols;
|
||||
#endif
|
||||
#ifdef _OCL_MAT
|
||||
_offset=0;
|
||||
#endif
|
||||
return err;
|
||||
}
|
||||
|
||||
/// Resize (only if bigger) the allocation to contain rows x cols elements
|
||||
/** \note Cannot be used on views **/
|
||||
inline int resize_ib(const int rows, const int cols)
|
||||
{ if (cols>_cols || rows>_rows) return resize(rows,cols);
|
||||
else return UCL_SUCCESS; }
|
||||
|
||||
/// Set each element to zero
|
||||
inline void zero() { _device_zero(*this,row_bytes()*_rows); }
|
||||
|
||||
@ -357,9 +390,9 @@ class UCL_D_Mat : public UCL_BaseMat {
|
||||
inline const device_ptr & begin() const { return _array; }
|
||||
#else
|
||||
/// For CUDA-RT, get device pointer to first element
|
||||
inline numtyp * begin() { return _array; }
|
||||
inline numtyp * & begin() { return _array; }
|
||||
/// For CUDA-RT, get device pointer to first element
|
||||
inline const numtyp * begin() const { return _array; }
|
||||
inline numtyp * const & begin() const { return _array; }
|
||||
/// For CUDA-RT, get device pointer to one past last element
|
||||
inline numtyp * end() { return _end; }
|
||||
/// For CUDA-RT, get device pointer to one past last element
|
||||
|
||||
@ -340,6 +340,39 @@ class UCL_D_Vec : public UCL_BaseMat {
|
||||
inline void clear()
|
||||
{ if (_kind!=UCL_VIEW) { _cols=0; _kind=UCL_VIEW; _device_free(*this); } }
|
||||
|
||||
/// Resize the allocation to contain cols elements
|
||||
/** \note Cannot be used on views **/
|
||||
inline int resize(const int cols) {
|
||||
assert(_kind!=UCL_VIEW);
|
||||
|
||||
_row_bytes=cols*sizeof(numtyp);
|
||||
int err=_device_resize(*this,_row_bytes);
|
||||
if (err!=UCL_SUCCESS) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not allocate " << _row_bytes
|
||||
<< " bytes on device.\n";
|
||||
_row_bytes=0;
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
_row_bytes=0;
|
||||
return err;
|
||||
}
|
||||
|
||||
_cols=cols;
|
||||
#ifndef _UCL_DEVICE_PTR_MAT
|
||||
_end=_array+cols;
|
||||
#endif
|
||||
#ifdef _OCL_MAT
|
||||
_offset=0;
|
||||
#endif
|
||||
return err;
|
||||
}
|
||||
|
||||
  /// Resize (only if bigger) the allocation to contain cols elements
  /** \note Cannot be used on views **/
  inline int resize_ib(const int cols)
    { if (cols>_cols) return resize(cols); else return UCL_SUCCESS; }
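Because resize_ib() only reallocates when the requested length exceeds the current one, repeated calls with fluctuating sizes reallocate only at the running maximum. A small sketch of that grow-only pattern; the per-step sizes are made-up illustrative values:

  // Grow-only reallocation via resize_ib(); VecT stands in for a Geryon
  // container such as UCL_D_Vec<float> that already holds an allocation.
  #include <vector>

  template <class VecT>
  void rebuild(VecT &buf, const std::vector<int> &sizes) {
    for (int n : sizes)
      buf.resize_ib(n);   // reallocates only when n exceeds the current length
  }
  // e.g. sizes = {100, 80, 120, 90} reallocates at 100 and at 120 only.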
|
||||
|
||||
/// Set each element to zero
|
||||
inline void zero() { _device_zero(*this,row_bytes()); }
|
||||
|
||||
@ -353,13 +386,13 @@ class UCL_D_Vec : public UCL_BaseMat {
|
||||
inline const device_ptr & begin() const { return _array; }
|
||||
#else
|
||||
/// For CUDA-RT, get device pointer to first element
|
||||
inline numtyp * begin() { return _array; }
|
||||
inline numtyp * & begin() { return _array; }
|
||||
/// For CUDA-RT, get device pointer to first element
|
||||
inline const numtyp * begin() const { return _array; }
|
||||
inline numtyp * const & begin() const { return _array; }
|
||||
/// For CUDA-RT, get device pointer to one past last element
|
||||
inline numtyp * end() { return _end; }
|
||||
/// For CUDA-RT, get device pointer to one past last element
|
||||
inline const numtyp * end() const { return _end; }
|
||||
inline numtyp * end() const { return _end; }
|
||||
#endif
|
||||
|
||||
#ifdef _UCL_DEVICE_PTR_MAT
|
||||
|
||||
@ -318,6 +318,36 @@ class UCL_H_Mat : public UCL_BaseMat {
|
||||
inline void clear()
|
||||
{ if (_kind!=UCL_VIEW) {_rows=0; _kind=UCL_VIEW; _host_free(*this,_kind); }}
|
||||
|
||||
/// Resize the allocation to rows x cols elements
|
||||
/** \note Cannot be used on views **/
|
||||
inline int resize(const int rows, const int cols) {
|
||||
assert(_kind!=UCL_VIEW);
|
||||
|
||||
_row_bytes=cols*sizeof(numtyp);
|
||||
int err=_host_resize(*this,_row_bytes*rows);
|
||||
if (err!=UCL_SUCCESS) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
|
||||
<< " bytes on host.\n";
|
||||
_row_bytes=0;
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
_row_bytes=0;
|
||||
return err;
|
||||
}
|
||||
|
||||
_cols=cols;
|
||||
_rows=rows;
|
||||
_end=_array+rows*cols;
|
||||
return err;
|
||||
}
|
||||
|
||||
/// Resize (only if bigger) the allocation to contain rows x cols elements
|
||||
/** \note Cannot be used on views **/
|
||||
inline int resize_ib(const int rows, const int cols)
|
||||
{ if (cols>_cols || rows>_rows) return resize(rows,cols);
|
||||
else return UCL_SUCCESS; }
|
||||
|
||||
/// Set each element to zero
|
||||
inline void zero() { _host_zero(_array,_rows*row_bytes()); }
|
||||
/// Set first n elements to zero
|
||||
|
||||
@ -316,6 +316,34 @@ class UCL_H_Vec : public UCL_BaseMat {
|
||||
inline void clear()
|
||||
{ if (_kind!=UCL_VIEW) {_kind=UCL_VIEW; _cols=0; _host_free(*this,_kind);}}
|
||||
|
||||
/// Resize the allocation to contain cols elements
|
||||
/** \note Cannot be used on views **/
|
||||
inline int resize(const int cols) {
|
||||
assert(_kind!=UCL_VIEW);
|
||||
_row_bytes=cols*sizeof(numtyp);
|
||||
int err=_host_resize(*this,_row_bytes);
|
||||
|
||||
if (err!=UCL_SUCCESS) {
|
||||
#ifndef UCL_NO_EXIT
|
||||
std::cerr << "UCL Error: Could not allocate " << _row_bytes
|
||||
<< " bytes on host.\n";
|
||||
_row_bytes=0;
|
||||
UCL_GERYON_EXIT;
|
||||
#endif
|
||||
_row_bytes=0;
|
||||
return err;
|
||||
}
|
||||
|
||||
_cols=cols;
|
||||
_end=_array+cols;
|
||||
return err;
|
||||
}
|
||||
|
||||
/// Resize (only if bigger) the allocation to contain cols elements
|
||||
/** \note Cannot be used on views **/
|
||||
inline int resize_ib(const int cols)
|
||||
{ if (cols>_cols) return resize(cols); else return UCL_SUCCESS; }
|
||||
|
||||
/// Set each element to zero
|
||||
inline void zero() { _host_zero(_array,row_bytes()); }
|
||||
|
||||
|
||||
@ -270,4 +270,13 @@ template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_D_Mat<numtyp> &mat)
{ ucl_print(mat,out); return out; }


template <class t1, class t2>
inline std::ostream & operator << (std::ostream &out, UCL_Vector<t1,t2> &mat)
{ ucl_print(mat.host,out); return out; }

template <class t1, class t2>
inline std::ostream & operator << (std::ostream &out, UCL_Matrix<t1,t2> &mat)
{ ucl_print(mat.host,out); return out; }

#endif

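The new stream operators print the host-side copy of the paired containers through ucl_print. A short usage sketch (assumes <iostream> and an initialized UCL_Vector<float,float> named v):

#include <iostream>
// ...
std::cout << v << std::endl;   // prints v.host via ucl_print
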
@ -117,5 +117,61 @@ enum UCL_ERROR_FLAG {
template <class numtyp>
const char * ucl_template_name() { return _UCL_DATA_ID<numtyp>::name(); }

template <class t1, class t2> struct ucl_same_type;

template <> struct ucl_same_type<bool,bool> { enum { ans=1 }; };
template <> struct ucl_same_type<char,char> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned char,unsigned char> { enum { ans=1 }; };
template <> struct ucl_same_type<int,int> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned,unsigned> { enum { ans=1 }; };
template <> struct ucl_same_type<short,short> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned short,unsigned short> { enum { ans=1 }; };
template <> struct ucl_same_type<long,long> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned long,unsigned long> { enum { ans=1 }; };
template <> struct ucl_same_type<float,float> { enum { ans=1 }; };
template <> struct ucl_same_type<double,double> { enum { ans=1 }; };
template <> struct ucl_same_type<long double,long double> { enum { ans=1 }; };

template <> struct ucl_same_type<const bool,bool> { enum { ans=1 }; };
template <> struct ucl_same_type<const char,char> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned char,unsigned char> { enum { ans=1 }; };
template <> struct ucl_same_type<const int,int> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned,unsigned> { enum { ans=1 }; };
template <> struct ucl_same_type<const short,short> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned short,unsigned short> { enum { ans=1 }; };
template <> struct ucl_same_type<const long,long> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned long,unsigned long> { enum { ans=1 }; };
template <> struct ucl_same_type<const float,float> { enum { ans=1 }; };
template <> struct ucl_same_type<const double,double> { enum { ans=1 }; };
template <> struct ucl_same_type<const long double,long double> { enum { ans=1 }; };

template <> struct ucl_same_type<bool,const bool> { enum { ans=1 }; };
template <> struct ucl_same_type<char,const char> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned char,const unsigned char> { enum { ans=1 }; };
template <> struct ucl_same_type<int,const int> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned,const unsigned> { enum { ans=1 }; };
template <> struct ucl_same_type<short,const short> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned short,const unsigned short> { enum { ans=1 }; };
template <> struct ucl_same_type<long,const long> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned long,const unsigned long> { enum { ans=1 }; };
template <> struct ucl_same_type<float,const float> { enum { ans=1 }; };
template <> struct ucl_same_type<double,const double> { enum { ans=1 }; };
template <> struct ucl_same_type<long double,const long double> { enum { ans=1 }; };

template <> struct ucl_same_type<const bool,const bool> { enum { ans=1 }; };
template <> struct ucl_same_type<const char,const char> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned char,const unsigned char> { enum { ans=1 }; };
template <> struct ucl_same_type<const int,const int> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned,const unsigned> { enum { ans=1 }; };
template <> struct ucl_same_type<const short,const short> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned short,const unsigned short> { enum { ans=1 }; };
template <> struct ucl_same_type<const long,const long> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned long,const unsigned long> { enum { ans=1 }; };
template <> struct ucl_same_type<const float,const float> { enum { ans=1 }; };
template <> struct ucl_same_type<const double,const double> { enum { ans=1 }; };
template <> struct ucl_same_type<const long double,const long double> { enum { ans=1 }; };

template <class t1, class t2> struct ucl_same_type { enum { ans=0 }; };

#endif

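ucl_same_type is a const-insensitive type-equality trait: ans is 1 when the two parameters name the same type up to a top-level const, and 0 otherwise. A hypothetical C++98-style compile-time guard built on it (illustrative, not part of the library):

template <class t1, class t2>
inline void ucl_check_same_type() {
  // A negative array size forces a compile error when the types differ.
  typedef char types_must_match[ucl_same_type<t1,t2>::ans ? 1 : -1];
  (void)sizeof(types_must_match);
}
// ucl_check_same_type<float,const float>();  // compiles (ans=1)
// ucl_check_same_type<float,double>();       // fails to compile (ans=0)
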
@ -39,30 +39,16 @@ bool AnswerT::alloc(const int inum) {

bool success=true;

int ans_elements=4;
_ans_fields=4;
if (_rot)
ans_elements+=4;

// Ignore host/device transfers?
bool cpuview=false;
if (dev->device_type()==UCL_CPU)
cpuview=true;

// -------------------------- Host allocations
success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
_ans_fields+=4;

// --------------------------- Device allocations
if (cpuview) {
dev_engv.view(host_engv);
dev_ans.view(host_ans);
} else {
success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_RW_OPTIMIZED,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (dev_ans.alloc(ans_elements*_max_local,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
}
_gpu_bytes=dev_engv.row_bytes()+dev_ans.row_bytes();
success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_RW_OPTIMIZED,
UCL_WRITE_ONLY)==UCL_SUCCESS);
_gpu_bytes=engv.device.row_bytes()+force.device.row_bytes();

_allocated=true;
return success;
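This commit replaces the separate host_*/dev_* buffers with paired UCL_Vector containers that keep a host copy and a device copy together. A minimal sketch of the pattern used in alloc() above (assumes an initialized UCL_Device named dev; the size is hypothetical and the allocation flags mirror the calls above):

UCL_Vector<float,float> engv_like;
engv_like.alloc(6*1000,dev,UCL_RW_OPTIMIZED,UCL_WRITE_ONLY);
// ... a kernel fills the device side ...
engv_like.update_host(6*1000,true);   // asynchronous device-to-host copy
float e0=engv_like[0];                // reads the host-side data, as the
                                      // accumulation loops further down do
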
@ -114,32 +100,24 @@ bool AnswerT::add_fields(const bool charge, const bool rot) {
|
||||
if (realloc) {
|
||||
_other=_charge || _rot;
|
||||
int inum=_max_local;
|
||||
clear_resize();
|
||||
force.clear();
|
||||
engv.clear();
|
||||
_allocated=false;
|
||||
return alloc(inum);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void AnswerT::clear_resize() {
|
||||
if (!_allocated)
|
||||
return;
|
||||
_allocated=false;
|
||||
|
||||
dev_ans.clear();
|
||||
dev_engv.clear();
|
||||
host_ans.clear();
|
||||
host_engv.clear();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void AnswerT::clear() {
|
||||
_gpu_bytes=0;
|
||||
if (!_allocated)
|
||||
return;
|
||||
_allocated=false;
|
||||
|
||||
force.clear();
|
||||
engv.clear();
|
||||
time_answer.clear();
|
||||
clear_resize();
|
||||
_inum=0;
|
||||
_ilist=NULL;
|
||||
_eflag=false;
|
||||
@ -174,11 +152,11 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
csize-=6;

if (csize>0)
ucl_copy(host_engv,dev_engv,_inum*csize,true);
engv.update_host(_inum*csize,true);
if (_rot)
ucl_copy(host_ans,dev_ans,_inum*4*2,true);
force.update_host(_inum*4*2,true);
else
ucl_copy(host_ans,dev_ans,_inum*4,true);
force.update_host(_inum*4,true);
time_answer.stop();
}

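In the energy_virial() loops that follow, per-atom energies and virials are read from engv with an index that starts at the atom index and advances by _inum for each field. That indexing implies a field-major layout; a small helper makes the assumption explicit (illustrative only):

// engv[f*inum + i] holds field f (energy, then the six virial terms) of atom i
inline int engv_index(const int f, const int i, const int inum)
  { return f*inum+i; }
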
@ -201,28 +179,28 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
for (int i=0; i<6; i++) virial_acc[i]=0.0;
|
||||
if (_ilist==NULL) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
acctyp *ap=host_engv.begin()+i;
|
||||
int al=i;
|
||||
if (_eflag) {
|
||||
if (_ef_atom) {
|
||||
evdwl+=*ap;
|
||||
eatom[i]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
evdwl+=engv[al];
|
||||
eatom[i]+=engv[al]*0.5;
|
||||
al+=_inum;
|
||||
} else {
|
||||
evdwl+=*ap;
|
||||
ap+=_inum;
|
||||
evdwl+=engv[al];
|
||||
al+=_inum;
|
||||
}
|
||||
}
|
||||
if (_vflag) {
|
||||
if (_vf_atom) {
|
||||
for (int j=0; j<6; j++) {
|
||||
vatom[i][j]+=*ap*0.5;
|
||||
virial_acc[j]+=*ap;
|
||||
ap+=_inum;
|
||||
vatom[i][j]+=engv[al]*0.5;
|
||||
virial_acc[j]+=engv[al];
|
||||
al+=_inum;
|
||||
}
|
||||
} else {
|
||||
for (int j=0; j<6; j++) {
|
||||
virial_acc[j]+=*ap;
|
||||
ap+=_inum;
|
||||
virial_acc[j]+=engv[al];
|
||||
al+=_inum;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -231,29 +209,29 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
virial[j]+=virial_acc[j]*0.5;
|
||||
} else {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
acctyp *ap=host_engv.begin()+i;
|
||||
int al=i;
|
||||
int ii=_ilist[i];
|
||||
if (_eflag) {
|
||||
if (_ef_atom) {
|
||||
evdwl+=*ap;
|
||||
eatom[ii]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
evdwl+=engv[al];
|
||||
eatom[ii]+=engv[al]*0.5;
|
||||
al+=_inum;
|
||||
} else {
|
||||
evdwl+=*ap;
|
||||
ap+=_inum;
|
||||
evdwl+=engv[al];
|
||||
al+=_inum;
|
||||
}
|
||||
}
|
||||
if (_vflag) {
|
||||
if (_vf_atom) {
|
||||
for (int j=0; j<6; j++) {
|
||||
vatom[ii][j]+=*ap*0.5;
|
||||
virial_acc[j]+=*ap;
|
||||
ap+=_inum;
|
||||
vatom[ii][j]+=engv[al]*0.5;
|
||||
virial_acc[j]+=engv[al];
|
||||
al+=_inum;
|
||||
}
|
||||
} else {
|
||||
for (int j=0; j<6; j++) {
|
||||
virial_acc[j]+=*ap;
|
||||
ap+=_inum;
|
||||
virial_acc[j]+=engv[al];
|
||||
al+=_inum;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -281,33 +259,33 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
for (int i=0; i<6; i++) virial_acc[i]=0.0;
|
||||
if (_ilist==NULL) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
acctyp *ap=host_engv.begin()+i;
|
||||
int al=i;
|
||||
if (_eflag) {
|
||||
if (_ef_atom) {
|
||||
evdwl+=*ap;
|
||||
eatom[i]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
_ecoul+=*ap;
|
||||
eatom[i]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
evdwl+=engv[al];
|
||||
eatom[i]+=engv[al]*0.5;
|
||||
al+=_inum;
|
||||
_ecoul+=engv[al];
|
||||
eatom[i]+=engv[al]*0.5;
|
||||
al+=_inum;
|
||||
} else {
|
||||
evdwl+=*ap;
|
||||
ap+=_inum;
|
||||
_ecoul+=*ap;
|
||||
ap+=_inum;
|
||||
evdwl+=engv[al];
|
||||
al+=_inum;
|
||||
_ecoul+=engv[al];
|
||||
al+=_inum;
|
||||
}
|
||||
}
|
||||
if (_vflag) {
|
||||
if (_vf_atom) {
|
||||
for (int j=0; j<6; j++) {
|
||||
vatom[i][j]+=*ap*0.5;
|
||||
virial_acc[j]+=*ap;
|
||||
ap+=_inum;
|
||||
vatom[i][j]+=engv[al]*0.5;
|
||||
virial_acc[j]+=engv[al];
|
||||
al+=_inum;
|
||||
}
|
||||
} else {
|
||||
for (int j=0; j<6; j++) {
|
||||
virial_acc[j]+=*ap;
|
||||
ap+=_inum;
|
||||
virial_acc[j]+=engv[al];
|
||||
al+=_inum;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -316,34 +294,34 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
virial[j]+=virial_acc[j]*0.5;
|
||||
} else {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
acctyp *ap=host_engv.begin()+i;
|
||||
int al=i;
|
||||
int ii=_ilist[i];
|
||||
if (_eflag) {
|
||||
if (_ef_atom) {
|
||||
evdwl+=*ap;
|
||||
eatom[ii]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
_ecoul+=*ap;
|
||||
eatom[ii]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
evdwl+=engv[al];
|
||||
eatom[ii]+=engv[al]*0.5;
|
||||
al+=_inum;
|
||||
_ecoul+=engv[al];
|
||||
eatom[ii]+=engv[al]*0.5;
|
||||
al+=_inum;
|
||||
} else {
|
||||
evdwl+=*ap;
|
||||
ap+=_inum;
|
||||
_ecoul+=*ap;
|
||||
ap+=_inum;
|
||||
evdwl+=engv[al];
|
||||
al+=_inum;
|
||||
_ecoul+=engv[al];
|
||||
al+=_inum;
|
||||
}
|
||||
}
|
||||
if (_vflag) {
|
||||
if (_vf_atom) {
|
||||
for (int j=0; j<6; j++) {
|
||||
vatom[ii][j]+=*ap*0.5;
|
||||
virial_acc[j]+=*ap;
|
||||
ap+=_inum;
|
||||
vatom[ii][j]+=engv[al]*0.5;
|
||||
virial_acc[j]+=engv[al];
|
||||
al+=_inum;
|
||||
}
|
||||
} else {
|
||||
for (int j=0; j<6; j++) {
|
||||
virial_acc[j]+=*ap;
|
||||
ap+=_inum;
|
||||
virial_acc[j]+=engv[al];
|
||||
al+=_inum;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -359,45 +337,37 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void AnswerT::get_answers(double **f, double **tor) {
|
||||
acctyp *ap=host_ans.begin();
|
||||
int fl=0;
|
||||
if (_ilist==NULL) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
f[i][0]+=*ap;
|
||||
ap++;
|
||||
f[i][1]+=*ap;
|
||||
ap++;
|
||||
f[i][2]+=*ap;
|
||||
ap+=2;
|
||||
f[i][0]+=force[fl];
|
||||
f[i][1]+=force[fl+1];
|
||||
f[i][2]+=force[fl+2];
|
||||
fl+=4;
|
||||
}
|
||||
if (_rot) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
tor[i][0]+=*ap;
|
||||
ap++;
|
||||
tor[i][1]+=*ap;
|
||||
ap++;
|
||||
tor[i][2]+=*ap;
|
||||
ap+=2;
|
||||
tor[i][0]+=force[fl];
|
||||
tor[i][1]+=force[fl+1];
|
||||
tor[i][2]+=force[fl+2];
|
||||
fl+=4;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
int ii=_ilist[i];
|
||||
f[ii][0]+=*ap;
|
||||
ap++;
|
||||
f[ii][1]+=*ap;
|
||||
ap++;
|
||||
f[ii][2]+=*ap;
|
||||
ap+=2;
|
||||
f[ii][0]+=force[fl];
|
||||
f[ii][1]+=force[fl+1];
|
||||
f[ii][2]+=force[fl+2];
|
||||
fl+=4;
|
||||
}
|
||||
if (_rot) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
int ii=_ilist[i];
|
||||
tor[ii][0]+=*ap;
|
||||
ap++;
|
||||
tor[ii][1]+=*ap;
|
||||
ap++;
|
||||
tor[ii][2]+=*ap;
|
||||
ap+=2;
|
||||
tor[ii][0]+=force[fl];
|
||||
tor[ii][1]+=force[fl+1];
|
||||
tor[ii][2]+=force[fl+2];
|
||||
fl+=4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
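get_answers() above unpacks the force buffer four values at a time: slots 0-2 of each group are fx, fy, fz and slot 3 is padding, and when torques are present they follow the force block in the same buffer. Hypothetical helpers naming that layout (not part of the commit):

template <class acctyp>
inline const acctyp * force_of(const acctyp *force, const int i)
  { return force+4*i; }
template <class acctyp>
inline const acctyp * torque_of(const acctyp *force, const int i, const int inum)
  { return force+4*(inum+i); }
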
@ -19,18 +19,18 @@
#include <math.h>
#include "mpi.h"

#ifdef USE_OPENCL

#if defined(USE_OPENCL)
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
using namespace ucl_opencl;

#elif defined(USE_CUDART)
#include "geryon/nvc_timer.h"
#include "geryon/nvc_mat.h"
using namespace ucl_cudart;
#else

#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
using namespace ucl_cudadr;

#endif

#include "lal_precision.h"
@ -59,8 +59,10 @@ class Answer {
inline void resize(const int inum, bool &success) {
_inum=inum;
if (inum>_max_local) {
clear_resize();
success = success && alloc(inum);
_max_local=static_cast<int>(static_cast<double>(inum)*1.10);
success=success && (force.resize(_max_local*_ans_fields)==UCL_SUCCESS);
success=success && (engv.resize(_max_local*_ev_fields)==UCL_SUCCESS);
_gpu_bytes=engv.device.row_bytes()+force.device.row_bytes();
}
}

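resize() above only reallocates when the requested count exceeds _max_local, and then grows the capacity to 110% of the request so that small fluctuations in the local atom count do not trigger a reallocation every step. A sketch of the policy (illustrative only):

inline int grown_capacity(const int request, const int current) {
  if (request<=current) return current;   // fits: keep the existing allocation
  return static_cast<int>(static_cast<double>(request)*1.10);  // grow with headroom
}
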
@ -68,9 +70,6 @@ class Answer {
|
||||
/** \param rot True if atom storage needs quaternions **/
|
||||
bool add_fields(const bool charge, const bool rot);
|
||||
|
||||
/// Free all memory on host and device needed to realloc for more atoms
|
||||
void clear_resize();
|
||||
|
||||
/// Free all memory on host and device
|
||||
void clear();
|
||||
|
||||
@ -136,14 +135,9 @@ class Answer {
|
||||
// ------------------------------ DATA ----------------------------------
|
||||
|
||||
/// Force and possibly torque
|
||||
UCL_D_Vec<acctyp> dev_ans;
|
||||
UCL_Vector<acctyp,acctyp> force;
|
||||
/// Energy and virial per-atom storage
|
||||
UCL_D_Vec<acctyp> dev_engv;
|
||||
|
||||
/// Force and possibly torque data on host
|
||||
UCL_H_Vec<acctyp> host_ans;
|
||||
/// Energy/virial data on host
|
||||
UCL_H_Vec<acctyp> host_engv;
|
||||
UCL_Vector<acctyp,acctyp> engv;
|
||||
|
||||
/// Device timers
|
||||
UCL_Timer time_answer;
|
||||
@ -155,7 +149,7 @@ class Answer {
|
||||
bool alloc(const int inum);
|
||||
|
||||
bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
|
||||
int _max_local, _inum, _e_fields, _ev_fields;
|
||||
int _max_local, _inum, _e_fields, _ev_fields, _ans_fields;
|
||||
int *_ilist;
|
||||
double _time_cast, _time_cpu_idle;
|
||||
|
||||
|
||||
@ -51,9 +51,13 @@ bool AtomT::alloc(const int nall) {
|
||||
bool success=true;
|
||||
|
||||
// Ignore host/device transfers?
|
||||
bool cpuview=false;
|
||||
if (dev->device_type()==UCL_CPU)
|
||||
cpuview=true;
|
||||
_host_view=false;
|
||||
if (dev->shared_memory()) {
|
||||
_host_view=true;
|
||||
#ifdef GPU_CAST
|
||||
assert(0==1);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Allocate storage for CUDPP sort
|
||||
#ifdef USE_CUDPP
|
||||
@ -64,63 +68,101 @@ bool AtomT::alloc(const int nall) {
|
||||
}
|
||||
#endif
|
||||
|
||||
// -------------------------- Host allocations
|
||||
// Get a host write only buffer
|
||||
#ifdef GPU_CAST
|
||||
success=success && (host_x_cast.alloc(_max_atoms*3,*dev,
|
||||
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
|
||||
success=success && (host_type_cast.alloc(_max_atoms,*dev,
|
||||
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
|
||||
#else
|
||||
success=success && (host_x.alloc(_max_atoms*4,*dev,
|
||||
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
|
||||
#endif
|
||||
// Buffer for casting only if different precisions
|
||||
if (_charge)
|
||||
success=success && (host_q.alloc(_max_atoms,*dev,
|
||||
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
|
||||
// Buffer for casting only if different precisions
|
||||
if (_rot)
|
||||
success=success && (host_quat.alloc(_max_atoms*4,*dev,
|
||||
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
|
||||
|
||||
|
||||
// --------------------------- Device allocations
|
||||
int gpu_bytes=0;
|
||||
if (cpuview) {
|
||||
#ifdef GPU_CAST
|
||||
assert(0==1);
|
||||
#else
|
||||
dev_x.view(host_x);
|
||||
#endif
|
||||
if (_rot)
|
||||
dev_quat.view(host_quat);
|
||||
if (_charge)
|
||||
dev_q.view(host_q);
|
||||
} else {
|
||||
#ifdef GPU_CAST
|
||||
success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev));
|
||||
success=success && (UCL_SUCCESS==
|
||||
dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
|
||||
success=success && (UCL_SUCCESS==
|
||||
dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
|
||||
gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
|
||||
#else
|
||||
success=success && (UCL_SUCCESS==
|
||||
dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
|
||||
#endif
|
||||
if (_charge) {
|
||||
success=success && (dev_q.alloc(_max_atoms,*dev,
|
||||
success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
gpu_bytes+=dev_q.row_bytes();
|
||||
}
|
||||
if (_rot) {
|
||||
success=success && (dev_quat.alloc(_max_atoms*4,*dev,
|
||||
#ifdef GPU_CAST
|
||||
success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)==
|
||||
UCL_SUCCESS);
|
||||
success=success && (type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)==
|
||||
UCL_SUCCESS);
|
||||
gpu_bytes+=x_cast.device.row_bytes()+type_cast.device.row_bytes();
|
||||
#endif
|
||||
|
||||
if (_charge && _host_view==false) {
|
||||
success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
gpu_bytes+=dev_quat.row_bytes();
|
||||
gpu_bytes+=q.device.row_bytes();
|
||||
}
|
||||
if (_rot && _host_view==false) {
|
||||
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
gpu_bytes+=quat.device.row_bytes();
|
||||
}
|
||||
|
||||
if (_gpu_nbor>0) {
|
||||
if (_bonds) {
|
||||
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
|
||||
gpu_bytes+=dev_tag.row_bytes();
|
||||
}
|
||||
if (_gpu_nbor==1) {
|
||||
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
|
||||
gpu_bytes+=dev_cell_id.row_bytes();
|
||||
} else {
|
||||
success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
|
||||
success=success &&
|
||||
(host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
|
||||
}
|
||||
if (_gpu_nbor==2 && _host_view)
|
||||
dev_particle_id.view(host_particle_id);
|
||||
else
|
||||
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
|
||||
gpu_bytes+=dev_particle_id.row_bytes();
|
||||
}
|
||||
|
||||
gpu_bytes+=x.device.row_bytes();
|
||||
if (gpu_bytes>_max_gpu_bytes)
|
||||
_max_gpu_bytes=gpu_bytes;
|
||||
|
||||
_allocated=true;
|
||||
return success;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
bool AtomT::add_fields(const bool charge, const bool rot,
|
||||
const int gpu_nbor, const bool bonds) {
|
||||
bool success=true;
|
||||
// Ignore host/device transfers?
|
||||
int gpu_bytes=0;
|
||||
|
||||
if (charge && _charge==false) {
|
||||
_charge=true;
|
||||
_other=true;
|
||||
if (_host_view==false) {
|
||||
success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
gpu_bytes+=q.device.row_bytes();
|
||||
}
|
||||
}
|
||||
|
||||
if (rot && _rot==false) {
|
||||
_rot=true;
|
||||
_other=true;
|
||||
if (_host_view==false) {
|
||||
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
gpu_bytes+=quat.device.row_bytes();
|
||||
}
|
||||
}
|
||||
|
||||
if (bonds && _bonds==false) {
|
||||
_bonds=true;
|
||||
if (_bonds && _gpu_nbor>0) {
|
||||
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
|
||||
gpu_bytes+=dev_tag.row_bytes();
|
||||
}
|
||||
}
|
||||
|
||||
if (gpu_nbor>0 && _gpu_nbor==0) {
|
||||
_gpu_nbor=gpu_nbor;
|
||||
#ifdef USE_CUDPP
|
||||
if (_gpu_nbor==1) {
|
||||
CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
|
||||
if (CUDPP_SUCCESS != result)
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
|
||||
gpu_bytes+=dev_particle_id.row_bytes();
|
||||
if (_bonds) {
|
||||
@ -137,43 +179,9 @@ bool AtomT::alloc(const int nall) {
|
||||
}
|
||||
}
|
||||
|
||||
gpu_bytes+=dev_x.row_bytes();
|
||||
if (gpu_bytes>_max_gpu_bytes)
|
||||
_max_gpu_bytes=gpu_bytes;
|
||||
|
||||
_allocated=true;
|
||||
return success;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
bool AtomT::add_fields(const bool charge, const bool rot,
|
||||
const int gpu_nbor, const bool bonds) {
|
||||
bool realloc=false;
|
||||
if (charge && _charge==false) {
|
||||
_charge=true;
|
||||
realloc=true;
|
||||
}
|
||||
if (rot && _rot==false) {
|
||||
_rot=true;
|
||||
realloc=true;
|
||||
}
|
||||
if (gpu_nbor>0 && _gpu_nbor==0) {
|
||||
_gpu_nbor=gpu_nbor;
|
||||
realloc=true;
|
||||
}
|
||||
if (bonds && _bonds==false) {
|
||||
_bonds=true;
|
||||
realloc=true;
|
||||
}
|
||||
if (realloc) {
|
||||
_other=_charge || _rot;
|
||||
int max_atoms=_max_atoms;
|
||||
clear_resize();
|
||||
return alloc(max_atoms);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
bool AtomT::init(const int nall, const bool charge, const bool rot,
|
||||
UCL_Device &devi, const int gpu_nbor, const bool bonds) {
|
||||
@ -219,27 +227,18 @@ void AtomT::clear_resize() {
|
||||
return;
|
||||
_allocated=false;
|
||||
|
||||
dev_x.clear();
|
||||
if (_charge) {
|
||||
dev_q.clear();
|
||||
host_q.clear();
|
||||
}
|
||||
if (_rot) {
|
||||
dev_quat.clear();
|
||||
host_quat.clear();
|
||||
}
|
||||
#ifndef GPU_CAST
|
||||
host_x.clear();
|
||||
#else
|
||||
host_x_cast.clear();
|
||||
host_type_cast.clear();
|
||||
#endif
|
||||
x.clear();
|
||||
if (_charge)
|
||||
q.clear();
|
||||
if (_rot)
|
||||
quat.clear();
|
||||
|
||||
dev_cell_id.clear();
|
||||
dev_particle_id.clear();
|
||||
dev_tag.clear();
|
||||
#ifdef GPU_CAST
|
||||
dev_x_cast.clear();
|
||||
dev_type_cast.clear();
|
||||
x_cast.clear();
|
||||
type_cast.clear();
|
||||
#endif
|
||||
|
||||
#ifdef USE_CUDPP
|
||||
@ -279,8 +278,7 @@ double AtomT::host_memory_usage() const {
|
||||
atom_bytes+=1;
|
||||
if (_rot)
|
||||
atom_bytes+=4;
|
||||
return _max_atoms*atom_bytes*sizeof(numtyp)+
|
||||
sizeof(Atom<numtyp,acctyp>);
|
||||
return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
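A worked example of the estimate above, assuming the 4-value-per-atom position storage shown earlier plus charges (+1) and quaternions (+4) in single precision: atom_bytes = 4 + 1 + 4 = 9, so 100000 atoms need roughly 9 * 4 bytes * 100000, about 3.6 MB of host buffers, plus the fixed sizeof(Atom<numtyp,acctyp>).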
// Sort arrays for neighbor list calculation
|
||||
@ -292,16 +290,18 @@ void AtomT::sort_neighbor(const int num_atoms) {
|
||||
8*sizeof(unsigned), num_atoms);
|
||||
if (CUDPP_SUCCESS != result) {
|
||||
printf("Error in cudppSort\n");
|
||||
NVD_GERYON_EXIT;
|
||||
UCL_GERYON_EXIT;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef GPU_CAST
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "atom_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *atom=0;
|
||||
#else
|
||||
#include "atom_ptx.h"
|
||||
#include "atom_cubin.h"
|
||||
#endif
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -316,3 +316,4 @@ void AtomT::compile_kernels(UCL_Device &dev) {
|
||||
#endif
|
||||
|
||||
template class Atom<PRECISION,ACC_PRECISION>;
|
||||
|
||||
|
||||
@ -19,20 +19,21 @@
|
||||
#include <math.h>
|
||||
#include "mpi.h"
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
|
||||
#if defined(USE_OPENCL)
|
||||
#include "geryon/ocl_timer.h"
|
||||
#include "geryon/ocl_mat.h"
|
||||
#include "geryon/ocl_kernel.h"
|
||||
using namespace ucl_opencl;
|
||||
|
||||
#elif defined(USE_CUDART)
|
||||
#include "geryon/nvc_timer.h"
|
||||
#include "geryon/nvc_mat.h"
|
||||
#include "geryon/nvc_kernel.h"
|
||||
using namespace ucl_cudart;
|
||||
#else
|
||||
|
||||
#include "geryon/nvd_timer.h"
|
||||
#include "geryon/nvd_mat.h"
|
||||
#include "geryon/nvd_kernel.h"
|
||||
using namespace ucl_cudadr;
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef USE_CUDPP
|
||||
@ -92,7 +93,7 @@ class Atom {
|
||||
bool charge() { return _charge; }
|
||||
|
||||
/// Returns true if GPU is using quaternions
|
||||
bool quat() { return _rot; }
|
||||
bool quaternion() { return _rot; }
|
||||
|
||||
/// Only free matrices of length inum or nall for resizing
|
||||
void clear_resize();
|
||||
@ -251,16 +252,13 @@ class Atom {
|
||||
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
|
||||
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
|
||||
#else
|
||||
numtyp *_write_loc=host_x.begin();
|
||||
int wl=0;
|
||||
for (int i=0; i<_nall; i++) {
|
||||
*_write_loc=host_ptr[i][0];
|
||||
_write_loc++;
|
||||
*_write_loc=host_ptr[i][1];
|
||||
_write_loc++;
|
||||
*_write_loc=host_ptr[i][2];
|
||||
_write_loc++;
|
||||
*_write_loc=host_type[i];
|
||||
_write_loc++;
|
||||
x[wl]=host_ptr[i][0];
|
||||
x[wl+1]=host_ptr[i][1];
|
||||
x[wl+2]=host_ptr[i][2];
|
||||
x[wl+3]=host_type[i];
|
||||
wl+=4;
|
||||
}
|
||||
#endif
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
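cast_x_data() above packs each atom as four consecutive values: x, y, z, and the atom type stored in the fourth slot. A hypothetical reader for that layout (illustrative, not part of the commit):

template <class numtyp>
inline void unpack_x(const numtyp *x, const int i,
                     double &px, double &py, double &pz, int &type) {
  px=x[4*i]; py=x[4*i+1]; pz=x[4*i+2];
  type=static_cast<int>(x[4*i+3]);
}
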
@ -273,15 +271,14 @@ class Atom {
|
||||
time_pos.start();
|
||||
if (_x_avail==false) {
|
||||
#ifdef GPU_CAST
|
||||
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
|
||||
ucl_copy(dev_type_cast,host_type_cast,_nall,true);
|
||||
x_cast.update_device(_nall*3,true);
|
||||
type_cast.update_device(_nall,true);
|
||||
int block_size=64;
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
|
||||
k_cast_x.set_size(GX,block_size);
|
||||
k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
|
||||
&_nall);
|
||||
k_cast_x.run(&x, &x_cast, &type_cast, &_nall);
|
||||
#else
|
||||
ucl_copy(dev_x,host_x,_nall*4,true);
|
||||
x.update_device(_nall*4,true);
|
||||
#endif
|
||||
_x_avail=true;
|
||||
}
|
||||
@ -299,18 +296,14 @@ class Atom {
|
||||
inline void cast_q_data(cpytyp *host_ptr) {
|
||||
if (_q_avail==false) {
|
||||
double t=MPI_Wtime();
|
||||
if (dev->device_type()==UCL_CPU) {
|
||||
if (sizeof(numtyp)==sizeof(double)) {
|
||||
host_q.view((numtyp*)host_ptr,_nall,*dev);
|
||||
dev_q.view(host_q);
|
||||
} else
|
||||
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
|
||||
} else {
|
||||
if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
|
||||
// If double precision, still memcpy for async transfers
|
||||
if (_host_view) {
|
||||
q.host.view((numtyp*)host_ptr,_nall,*dev);
|
||||
q.device.view(q.host);
|
||||
} else if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp));
|
||||
else
|
||||
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
|
||||
}
|
||||
for (int i=0; i<_nall; i++) q[i]=host_ptr[i];
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
}
|
||||
@ -318,7 +311,7 @@ class Atom {
|
||||
// Copy charges to device asynchronously
|
||||
inline void add_q_data() {
|
||||
if (_q_avail==false) {
|
||||
ucl_copy(dev_q,host_q,_nall,true);
|
||||
q.update_device(_nall,true);
|
||||
_q_avail=true;
|
||||
}
|
||||
}
|
||||
@ -328,18 +321,13 @@ class Atom {
|
||||
inline void cast_quat_data(cpytyp *host_ptr) {
|
||||
if (_quat_avail==false) {
|
||||
double t=MPI_Wtime();
|
||||
if (dev->device_type()==UCL_CPU) {
|
||||
if (sizeof(numtyp)==sizeof(double)) {
|
||||
host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
|
||||
dev_quat.view(host_quat);
|
||||
} else
|
||||
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
|
||||
} else {
|
||||
if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
|
||||
if (_host_view) {
|
||||
quat.host.view((numtyp*)host_ptr,_nall*4,*dev);
|
||||
quat.device.view(quat.host);
|
||||
} else if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
|
||||
else
|
||||
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
|
||||
}
|
||||
for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
}
|
||||
@ -348,7 +336,7 @@ class Atom {
|
||||
/** Copies nall()*4 elements **/
|
||||
inline void add_quat_data() {
|
||||
if (_quat_avail==false) {
|
||||
ucl_copy(dev_quat,host_quat,_nall*4,true);
|
||||
quat.update_device(_nall*4,true);
|
||||
_quat_avail=true;
|
||||
}
|
||||
}
|
||||
@ -363,29 +351,23 @@ class Atom {
|
||||
inline double max_gpu_bytes()
|
||||
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }
|
||||
|
||||
/// Returns true if the device is addressing memory on the host
|
||||
inline bool host_view() { return _host_view; }
|
||||
|
||||
// ------------------------------ DATA ----------------------------------
|
||||
|
||||
/// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type)
|
||||
UCL_D_Vec<numtyp> dev_x;
|
||||
UCL_Vector<numtyp,numtyp> x;
|
||||
/// Charges
|
||||
UCL_D_Vec<numtyp> dev_q;
|
||||
UCL_Vector<numtyp,numtyp> q;
|
||||
/// Quaternions
|
||||
UCL_D_Vec<numtyp> dev_quat;
|
||||
UCL_Vector<numtyp,numtyp> quat;
|
||||
|
||||
#ifdef GPU_CAST
|
||||
UCL_D_Vec<double> dev_x_cast;
|
||||
UCL_D_Vec<int> dev_type_cast;
|
||||
UCL_H_Vec<double> host_x_cast;
|
||||
UCL_H_Vec<int> host_type_cast;
|
||||
UCL_Vector<double,double> x_cast;
|
||||
UCL_Vector<int,int> type_cast;
|
||||
#endif
|
||||
|
||||
/// Buffer for moving positions to device
|
||||
UCL_H_Vec<numtyp> host_x;
|
||||
/// Buffer for moving charge data to GPU
|
||||
UCL_H_Vec<numtyp> host_q;
|
||||
/// Buffer for moving quat data to GPU
|
||||
UCL_H_Vec<numtyp> host_quat;
|
||||
|
||||
/// Cell list identifiers for device nbor builds
|
||||
UCL_D_Vec<unsigned> dev_cell_id;
|
||||
/// Cell list identifiers for device nbor builds
|
||||
@ -418,9 +400,9 @@ class Atom {
|
||||
|
||||
bool alloc(const int nall);
|
||||
|
||||
bool _allocated, _rot, _charge, _other;
|
||||
bool _allocated, _rot, _charge, _bonds, _other;
|
||||
int _max_atoms, _nall, _gpu_nbor;
|
||||
bool _bonds;
|
||||
bool _host_view;
|
||||
double _time_cast, _time_transfer;
|
||||
|
||||
double _max_gpu_bytes;
|
||||
@ -434,3 +416,4 @@ class Atom {
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -41,9 +41,9 @@ int BaseAtomicT::bytes_per_atom_atomic(const int max_nbors) const {
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseAtomicT::init_atomic(const int nlocal, const int nall,
|
||||
const int max_nbors, const int maxspecial,
|
||||
const double cell_size,
|
||||
const double gpu_split, FILE *_screen,
|
||||
const char *pair_program) {
|
||||
const double cell_size, const double gpu_split,
|
||||
FILE *_screen, const void *pair_program,
|
||||
const char *k_name) {
|
||||
screen=_screen;
|
||||
|
||||
int gpu_nbor=0;
|
||||
@ -74,7 +74,7 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
|
||||
atom=&device->atom;
|
||||
|
||||
_block_size=device->pair_block_size();
|
||||
compile_kernels(*ucl_device,pair_program);
|
||||
compile_kernels(*ucl_device,pair_program,k_name);
|
||||
|
||||
// Initialize host-device load balancer
|
||||
hd_balancer.init(device,gpu_nbor,gpu_split);
|
||||
@ -83,7 +83,7 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
|
||||
time_pair.init(*ucl_device);
|
||||
time_pair.zero();
|
||||
|
||||
pos_tex.bind_float(atom->dev_x,4);
|
||||
pos_tex.bind_float(atom->x,4);
|
||||
|
||||
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
|
||||
@ -266,18 +266,20 @@ double BaseAtomicT::host_memory_usage_atomic() const {
}

template <class numtyp, class acctyp>
void BaseAtomicT::compile_kernels(UCL_Device &dev, const char *pair_str) {
void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *kname) {
if (_compiled)
return;

std::string s_fast=std::string(kname)+"_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);

pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
k_pair.set_function(*pair_program,"kernel_pair");
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");

_compiled=true;

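compile_kernels() now receives the kernel name from the pair style instead of hard-coding "kernel_pair": k_pair is bound to the supplied name and k_pair_fast to the same name with an "_fast" suffix. For example, the buck style further down passes "k_buck", so the resolved names are:

std::string s_fast=std::string("k_buck")+"_fast";   // k_pair_fast -> "k_buck_fast"
// k_pair -> "k_buck"
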
@ -20,8 +20,10 @@
|
||||
#include "lal_balance.h"
|
||||
#include "mpi.h"
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "geryon/ocl_texture.h"
|
||||
#elif defined(USE_CUDART)
|
||||
#include "geryon/nvc_texture.h"
|
||||
#else
|
||||
#include "geryon/nvd_texture.h"
|
||||
#endif
|
||||
@ -38,6 +40,7 @@ class BaseAtomic {
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
@ -48,7 +51,7 @@ class BaseAtomic {
|
||||
int init_atomic(const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen,
|
||||
const char *pair_program);
|
||||
const void *pair_program, const char *k_name);
|
||||
|
||||
/// Estimate the overhead for GPU context changes and CPU driver
|
||||
void estimate_gpu_overhead();
|
||||
@ -57,7 +60,7 @@ class BaseAtomic {
|
||||
/** \param success set to false if insufficient memory **/
|
||||
inline void resize_atom(const int inum, const int nall, bool &success) {
|
||||
if (atom->resize(nall, success))
|
||||
pos_tex.bind_float(atom->dev_x,4);
|
||||
pos_tex.bind_float(atom->x,4);
|
||||
ans->resize(inum,success);
|
||||
}
|
||||
|
||||
@ -188,7 +191,7 @@ class BaseAtomic {
|
||||
double _gpu_overhead, _driver_overhead;
|
||||
UCL_D_Vec<int> *_nbor_data;
|
||||
|
||||
void compile_kernels(UCL_Device &dev, const char *pair_string);
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
|
||||
|
||||
virtual void loop(const bool _eflag, const bool _vflag) = 0;
|
||||
};
|
||||
|
||||
@ -42,9 +42,9 @@ int BaseChargeT::bytes_per_atom_atomic(const int max_nbors) const {
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseChargeT::init_atomic(const int nlocal, const int nall,
|
||||
const int max_nbors, const int maxspecial,
|
||||
const double cell_size,
|
||||
const double gpu_split, FILE *_screen,
|
||||
const char *pair_program) {
|
||||
const double cell_size, const double gpu_split,
|
||||
FILE *_screen, const void *pair_program,
|
||||
const char *k_name) {
|
||||
screen=_screen;
|
||||
|
||||
int gpu_nbor=0;
|
||||
@ -76,7 +76,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
|
||||
|
||||
_block_size=device->pair_block_size();
|
||||
_block_bio_size=device->block_bio_pair();
|
||||
compile_kernels(*ucl_device,pair_program);
|
||||
compile_kernels(*ucl_device,pair_program,k_name);
|
||||
|
||||
// Initialize host-device load balancer
|
||||
hd_balancer.init(device,gpu_nbor,gpu_split);
|
||||
@ -85,8 +85,8 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
|
||||
time_pair.init(*ucl_device);
|
||||
time_pair.zero();
|
||||
|
||||
pos_tex.bind_float(atom->dev_x,4);
|
||||
q_tex.bind_float(atom->dev_q,1);
|
||||
pos_tex.bind_float(atom->x,4);
|
||||
q_tex.bind_float(atom->q,1);
|
||||
|
||||
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
|
||||
@ -282,18 +282,20 @@ double BaseChargeT::host_memory_usage_atomic() const {
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseChargeT::compile_kernels(UCL_Device &dev, const char *pair_str) {
|
||||
void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
const char *kname) {
|
||||
if (_compiled)
|
||||
return;
|
||||
|
||||
std::string s_fast=std::string(kname)+"_fast";
|
||||
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
|
||||
std::string(OCL_PRECISION_COMPILE)+" -D"+
|
||||
std::string(OCL_VENDOR);
|
||||
|
||||
pair_program=new UCL_Program(dev);
|
||||
pair_program->load_string(pair_str,flags.c_str());
|
||||
k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
|
||||
k_pair.set_function(*pair_program,"kernel_pair");
|
||||
k_pair_fast.set_function(*pair_program,s_fast.c_str());
|
||||
k_pair.set_function(*pair_program,kname);
|
||||
pos_tex.get_texture(*pair_program,"pos_tex");
|
||||
q_tex.get_texture(*pair_program,"q_tex");
|
||||
|
||||
|
||||
@ -21,8 +21,10 @@
|
||||
#include "lal_balance.h"
|
||||
#include "mpi.h"
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "geryon/ocl_texture.h"
|
||||
#elif defined(USE_CUDART)
|
||||
#include "geryon/nvc_texture.h"
|
||||
#else
|
||||
#include "geryon/nvd_texture.h"
|
||||
#endif
|
||||
@ -39,6 +41,7 @@ class BaseCharge {
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
@ -49,7 +52,7 @@ class BaseCharge {
|
||||
int init_atomic(const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen,
|
||||
const char *pair_program);
|
||||
const void *pair_program, const char *k_name);
|
||||
|
||||
/// Estimate the overhead for GPU context changes and CPU driver
|
||||
void estimate_gpu_overhead();
|
||||
@ -58,8 +61,8 @@ class BaseCharge {
|
||||
/** \param success set to false if insufficient memory **/
|
||||
inline void resize_atom(const int inum, const int nall, bool &success) {
|
||||
if (atom->resize(nall, success)) {
|
||||
pos_tex.bind_float(atom->dev_x,4);
|
||||
q_tex.bind_float(atom->dev_q,1);
|
||||
pos_tex.bind_float(atom->x,4);
|
||||
q_tex.bind_float(atom->q,1);
|
||||
}
|
||||
ans->resize(inum,success);
|
||||
}
|
||||
@ -187,7 +190,7 @@ class BaseCharge {
|
||||
double _gpu_overhead, _driver_overhead;
|
||||
UCL_D_Vec<int> *_nbor_data;
|
||||
|
||||
void compile_kernels(UCL_Device &dev, const char *pair_string);
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
|
||||
|
||||
virtual void loop(const bool _eflag, const bool _vflag) = 0;
|
||||
};
|
||||
|
||||
@ -17,10 +17,12 @@
|
||||
#include <cstdlib>
|
||||
using namespace LAMMPS_AL;
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "ellipsoid_nbor_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *ellipsoid_nbor=0;
|
||||
#else
|
||||
#include "ellipsoid_nbor_ptx.h"
|
||||
#include "ellipsoid_nbor_cubin.h"
|
||||
#endif
|
||||
|
||||
#define BaseEllipsoidT BaseEllipsoid<numtyp, acctyp>
|
||||
@ -50,8 +52,9 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
|
||||
const int max_nbors, const int maxspecial,
|
||||
const double cell_size, const double gpu_split,
|
||||
FILE *_screen, const int ntypes, int **h_form,
|
||||
const char *ellipsoid_program,
|
||||
const char *lj_program, const bool ellip_sphere) {
|
||||
const void *ellipsoid_program,
|
||||
const void *lj_program, const char *k_name,
|
||||
const bool ellip_sphere) {
|
||||
screen=_screen;
|
||||
_ellipsoid_sphere=ellip_sphere;
|
||||
|
||||
@ -78,7 +81,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
|
||||
atom=&device->atom;
|
||||
|
||||
_block_size=device->pair_block_size();
|
||||
compile_kernels(*ucl_device,ellipsoid_program,lj_program,ellip_sphere);
|
||||
compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere);
|
||||
|
||||
// Initialize host-device load balancer
|
||||
hd_balancer.init(device,gpu_nbor,gpu_split);
|
||||
@ -112,7 +115,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
|
||||
}
|
||||
|
||||
if (_multiple_forms)
|
||||
ans->dev_ans.zero();
|
||||
ans->force.zero();
|
||||
|
||||
// Memory for ilist ordered by particle type
|
||||
if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
|
||||
@ -121,6 +124,12 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
|
||||
|
||||
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
|
||||
neigh_tex.bind_float(atom->x,4);
|
||||
pos_tex.bind_float(atom->x,4);
|
||||
quat_tex.bind_float(atom->quat,4);
|
||||
lj_pos_tex.bind_float(atom->x,4);
|
||||
lj_quat_tex.bind_float(atom->quat,4);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -241,14 +250,12 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
|
||||
int stride=nbor->nbor_pitch();
|
||||
if (shared_types) {
|
||||
k_nbor_fast.set_size(GX,BX);
|
||||
k_nbor_fast.run(&atom->dev_x.begin(), &cut_form.begin(),
|
||||
&nbor->dev_nbor.begin(), &stride, &start, &inum,
|
||||
&nbor->dev_packed.begin(), &form_low, &form_high);
|
||||
k_nbor_fast.run(&atom->x, &cut_form, &nbor->dev_nbor, &stride, &start,
|
||||
&inum, &nbor->dev_packed, &form_low, &form_high);
|
||||
} else {
|
||||
k_nbor.set_size(GX,BX);
|
||||
k_nbor.run(&atom->dev_x.begin(), &cut_form.begin(), &ntypes,
|
||||
&nbor->dev_nbor.begin(), &stride, &start, &inum,
|
||||
&nbor->dev_packed.begin(), &form_low, &form_high);
|
||||
k_nbor.run(&atom->x, &cut_form, &ntypes, &nbor->dev_nbor, &stride,
|
||||
&start, &inum, &nbor->dev_packed, &form_low, &form_high);
|
||||
}
|
||||
}
|
||||
|
||||
@ -437,11 +444,18 @@ double BaseEllipsoidT::host_memory_usage_base() const {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
|
||||
const char *ellipsoid_string,
|
||||
const char *lj_string, const bool e_s) {
|
||||
const void *ellipsoid_string,
|
||||
const void *lj_string,
|
||||
const char *kname, const bool e_s) {
|
||||
if (_compiled)
|
||||
return;
|
||||
|
||||
std::string kns=kname;
|
||||
std::string s_sphere_ellipsoid=kns+"_sphere_ellipsoid";
|
||||
std::string s_ellipsoid_sphere=kns+"_ellipsoid_sphere";
|
||||
std::string s_lj=kns+"_lj";
|
||||
std::string s_lj_fast=kns+"_lj_fast";
|
||||
|
||||
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
|
||||
std::string(OCL_PRECISION_COMPILE)+" -D"+
|
||||
std::string(OCL_VENDOR);
|
||||
@ -450,18 +464,23 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
|
||||
nbor_program->load_string(ellipsoid_nbor,flags.c_str());
|
||||
k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast");
|
||||
k_nbor.set_function(*nbor_program,"kernel_nbor");
|
||||
neigh_tex.get_texture(*nbor_program,"pos_tex");
|
||||
|
||||
ellipsoid_program=new UCL_Program(dev);
|
||||
ellipsoid_program->load_string(ellipsoid_string,flags.c_str());
|
||||
k_ellipsoid.set_function(*ellipsoid_program,"kernel_ellipsoid");
|
||||
k_ellipsoid.set_function(*ellipsoid_program,kname);
|
||||
pos_tex.get_texture(*ellipsoid_program,"pos_tex");
|
||||
quat_tex.get_texture(*ellipsoid_program,"quat_tex");
|
||||
|
||||
lj_program=new UCL_Program(dev);
|
||||
lj_program->load_string(lj_string,flags.c_str());
|
||||
k_sphere_ellipsoid.set_function(*lj_program,"kernel_sphere_ellipsoid");
|
||||
k_lj_fast.set_function(*lj_program,"kernel_lj_fast");
|
||||
k_lj.set_function(*lj_program,"kernel_lj");
|
||||
k_sphere_ellipsoid.set_function(*lj_program,s_sphere_ellipsoid.c_str());
|
||||
k_lj_fast.set_function(*lj_program,s_lj_fast.c_str());
|
||||
k_lj.set_function(*lj_program,s_lj.c_str());
|
||||
if (e_s)
|
||||
k_ellipsoid_sphere.set_function(*lj_program,"kernel_ellipsoid_sphere");
|
||||
k_ellipsoid_sphere.set_function(*lj_program,s_ellipsoid_sphere.c_str());
|
||||
lj_pos_tex.get_texture(*lj_program,"pos_tex");
|
||||
lj_quat_tex.get_texture(*lj_program,"quat_tex");
|
||||
|
||||
_compiled=true;
|
||||
}
|
||||
|
||||
@ -20,8 +20,10 @@
|
||||
#include "lal_balance.h"
|
||||
#include "mpi.h"
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "geryon/ocl_texture.h"
|
||||
#elif defined(USE_CUDART)
|
||||
#include "geryon/nvc_texture.h"
|
||||
#else
|
||||
#include "geryon/nvd_texture.h"
|
||||
#endif
|
||||
@ -39,6 +41,7 @@ class BaseEllipsoid {
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
* \param ellipsoid_sphere true if ellipsoid-sphere case handled separately
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
@ -49,8 +52,9 @@ class BaseEllipsoid {
|
||||
int init_base(const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen, const int ntypes,
|
||||
int **h_form, const char *ellipsoid_program,
|
||||
const char *lj_program, const bool ellipsoid_sphere=false);
|
||||
int **h_form, const void *ellipsoid_program,
|
||||
const void *lj_program, const char *k_name,
|
||||
const bool ellipsoid_sphere=false);
|
||||
|
||||
/// Estimate the overhead for GPU context changes and CPU driver
|
||||
void estimate_gpu_overhead();
|
||||
@ -58,7 +62,13 @@ class BaseEllipsoid {
|
||||
/// Check if there is enough storage for atom arrays and realloc if not
|
||||
/** \param success set to false if insufficient memory **/
|
||||
inline void resize_atom(const int nall, bool &success) {
|
||||
atom->resize(nall, success);
|
||||
if (atom->resize(nall, success)) {
|
||||
neigh_tex.bind_float(atom->x,4);
|
||||
pos_tex.bind_float(atom->x,4);
|
||||
quat_tex.bind_float(atom->quat,4);
|
||||
lj_pos_tex.bind_float(atom->x,4);
|
||||
lj_quat_tex.bind_float(atom->quat,4);
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if there is enough storage for neighbors and realloc if not
|
||||
@ -74,7 +84,7 @@ class BaseEllipsoid {
|
||||
const int max_nbors, const int olist_size,
|
||||
bool &success) {
|
||||
ans->resize(nlocal, success);
|
||||
if (_multiple_forms) ans->dev_ans.zero();
|
||||
if (_multiple_forms) ans->force.zero();
|
||||
|
||||
if (olist_size>static_cast<int>(host_olist.numel())) {
|
||||
host_olist.clear();
|
||||
@ -221,8 +231,7 @@ class BaseEllipsoid {
|
||||
inline int block_size() { return _block_size; }
|
||||
|
||||
// --------------------------- TEXTURES -----------------------------
|
||||
UCL_Texture pos_tex;
|
||||
UCL_Texture q_tex;
|
||||
UCL_Texture pos_tex, quat_tex, lj_pos_tex, lj_quat_tex, neigh_tex;
|
||||
|
||||
protected:
|
||||
bool _compiled, _ellipsoid_sphere;
|
||||
@ -236,8 +245,8 @@ class BaseEllipsoid {
|
||||
int **_host_form;
|
||||
int _last_ellipse, _max_last_ellipse;
|
||||
|
||||
void compile_kernels(UCL_Device &dev, const char *ellipsoid_string,
|
||||
const char *lj_string, const bool e_s);
|
||||
void compile_kernels(UCL_Device &dev, const void *ellipsoid_string,
|
||||
const void *lj_string, const char *kname,const bool e_s);
|
||||
|
||||
virtual void loop(const bool _eflag, const bool _vflag) = 0;
|
||||
};
|
||||
|
||||
@ -13,10 +13,12 @@
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "buck_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *buck=0;
|
||||
#else
|
||||
#include "buck_ptx.h"
|
||||
#include "buck_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_buck.h"
|
||||
@ -50,7 +52,7 @@ int BuckT::init(const int ntypes, double **host_cutsq,
|
||||
const double gpu_split, FILE *_screen) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,buck);
|
||||
_screen,buck,"k_buck");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -132,20 +134,17 @@ void BuckT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(),
|
||||
&coeff2.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
}
|
||||
|
||||
@ -15,14 +15,16 @@
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#include "lal_aux_fun1.h"
|
||||
texture<float4> pos_tex;
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{ return tex1Dfetch(pos_tex, i); }
|
||||
texture<float4> pos_tex;
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
#endif
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#endif
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
|
||||
__kernel void k_buck(__global numtyp4 *x_, __global numtyp4 *coeff1,
|
||||
__global numtyp4* coeff2, const int lj_types,
|
||||
__global numtyp *sp_lj_in, __global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -104,7 +106,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
|
||||
__kernel void k_buck_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
|
||||
__global numtyp4* coeff2_in,
|
||||
__global numtyp* sp_lj_in,
|
||||
__global int *dev_nbor, __global int *dev_packed,
|
||||
@ -140,7 +142,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
@ -151,7 +153,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
|
||||
@ -13,10 +13,12 @@
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "buck_coul_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *buck_coul=0;
|
||||
#else
|
||||
#include "buck_coul_ptx.h"
|
||||
#include "buck_coul_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_buck_coul.h"
|
||||
@ -52,7 +54,7 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq,
|
||||
const double qqrd2e) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,buck_coul);
|
||||
_screen,buck_coul,"k_buck_coul");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -142,23 +144,18 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(),
|
||||
&coeff2.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch,
|
||||
&this->atom->dev_q.begin(), &cutsq.begin(),
|
||||
&_qqrd2e, &this->_threads_per_atom);
|
||||
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&cutsq, &_qqrd2e, &this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->atom->dev_q.begin(),
|
||||
&cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
|
||||
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->q,
|
||||
&cutsq, &_qqrd2e, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
}
|
||||
|
||||
@ -14,18 +14,22 @@
|
||||
// ***************************************************************************/
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{ return tex1Dfetch(pos_tex, i); }
|
||||
ucl_inline float fetch_q(const int& i, const float *q)
|
||||
{ return tex1Dfetch(q_tex, i); }
|
||||
#endif
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
#endif
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#define q_tex q_
|
||||
#endif
|
||||
|
||||
__kernel void k_buck_coul(__global numtyp4 *x_, __global numtyp4 *coeff1,
|
||||
__global numtyp4* coeff2, const int lj_types,
|
||||
__global numtyp *sp_lj_in, __global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
@@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;

for ( ; nbor<list_end; nbor+=n_stride) {
@@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;

numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;

// Compute r12
@@ -97,9 +101,10 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
} else
forcebuck = (numtyp)0.0;

if (rsq < coeff2[mtype].z) // coul
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
else
if (rsq < coeff2[mtype].z) {
fetch(forcecoul,j,q_tex);
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
} else
forcecoul = (numtyp)0.0;

force = (forcebuck + forcecoul) * r2inv;
@@ -131,7 +136,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
} // if ii
}

__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__kernel void k_buck_coul_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@@ -172,8 +177,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

@@ -185,7 +190,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;

numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;

// Compute r12
@@ -208,9 +213,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
} else
forcebuck = (numtyp)0.0;

if (rsq < cutsq[mtype].z) // coul
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
else
if (rsq < cutsq[mtype].z) {
fetch(forcecoul,j,q_tex);
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
} else
forcecoul = (numtyp)0.0;

force = (forcebuck + forcecoul) * r2inv;

@@ -13,10 +13,12 @@
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "buck_coul_long_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *buck_coul_long=0;
|
||||
#else
|
||||
#include "buck_coul_long_ptx.h"
|
||||
#include "buck_coul_long_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_buck_coul_long.h"
|
||||
@@ -54,7 +56,7 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
const double g_ewald) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,buck_coul_long);
|
||||
_screen,buck_coul_long,"k_buck_coul_long");
|
||||
if (success!=0)
|
||||
return success;
@@ -145,25 +147,19 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(),
|
||||
&coeff2.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
|
||||
&cutsq.begin(), &_cut_coulsq, &_qqrd2e,
|
||||
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&cutsq, &_cut_coulsq, &_qqrd2e,
|
||||
&_g_ewald, &this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(),
|
||||
&coeff2.begin(), &_lj_types, &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->atom->dev_q.begin(),
|
||||
&cutsq.begin(), &_cut_coulsq,
|
||||
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
|
||||
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->q, &cutsq,
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
}
@@ -14,18 +14,22 @@
// ***************************************************************************/
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{ return tex1Dfetch(pos_tex, i); }
|
||||
ucl_inline float fetch_q(const int& i, const float *q)
|
||||
{ return tex1Dfetch(q_tex, i); }
|
||||
#endif
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
#endif
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#define q_tex q_
|
||||
#endif
|
||||
|
||||
__kernel void k_buck_coul_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
|
||||
__global numtyp4* coeff2, const int lj_types,
|
||||
__global numtyp *sp_lj_in, __global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
@ -62,8 +66,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
@ -74,7 +78,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -104,7 +108,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
|
||||
numtyp expm2 = ucl_exp(-grij*grij);
|
||||
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
|
||||
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
|
||||
fetch(prefactor,j,q_tex);
|
||||
prefactor *= qqrd2e * qtmp/r;
|
||||
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
|
||||
} else
|
||||
forcecoul = (numtyp)0.0;
|
||||
@ -139,7 +144,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
|
||||
__kernel void k_buck_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
|
||||
__global numtyp4* coeff2_in,
|
||||
__global numtyp* sp_lj_in,
|
||||
__global int *dev_nbor, __global int *dev_packed,
|
||||
@ -179,8 +184,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
@ -192,7 +197,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -221,7 +226,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
|
||||
numtyp expm2 = ucl_exp(-grij*grij);
|
||||
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
|
||||
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
|
||||
fetch(prefactor,j,q_tex);
|
||||
prefactor *= qqrd2e * qtmp/r;
|
||||
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
|
||||
} else
|
||||
forcecoul = (numtyp)0.0;
|
||||
|
||||
@@ -13,10 +13,12 @@
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "cg_cmm_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *cg_cmm=0;
|
||||
#else
|
||||
#include "cg_cmm_ptx.h"
|
||||
#include "cg_cmm_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_cg_cmm.h"
|
||||
@@ -51,7 +53,7 @@ int CGCMMT::init(const int ntypes, double **host_cutsq,
const double gpu_split, FILE *_screen) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,cg_cmm);
|
||||
_screen,cg_cmm,"k_cg_cmm");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@@ -133,19 +135,17 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
this->k_pair.run(&this->atom->x, &lj1, &lj3,
|
||||
&_cmm_types, &sp_lj, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(), &this->ans->force,
|
||||
&this->ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
@@ -15,14 +15,16 @@
|
||||
#ifdef NV_KERNEL
|
||||
#include "lal_aux_fun1.h"
|
||||
texture<float4> pos_tex;
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{ return tex1Dfetch(pos_tex, i); }
|
||||
texture<float4> pos_tex;
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
#endif
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#endif
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__kernel void k_cg_cmm(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__global numtyp4* lj3, const int lj_types,
|
||||
__global numtyp *sp_lj_in, __global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -109,7 +111,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__kernel void k_cg_cmm_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__global numtyp4* lj3_in,
|
||||
__global numtyp* sp_lj_in,__global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
@ -145,7 +147,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
@ -156,7 +158,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
|
||||
@@ -13,10 +13,12 @@
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "cg_cmm_long_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *cg_cmm_long=0;
|
||||
#else
|
||||
#include "cg_cmm_long_ptx.h"
|
||||
#include "cg_cmm_long_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_cg_cmm_long.h"
|
||||
@@ -56,7 +58,7 @@ int CGCMMLongT::init(const int ntypes, double **host_cutsq,
const double g_ewald) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,cg_cmm_long);
|
||||
_screen,cg_cmm_long,"k_cg_cmm_long");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@@ -144,24 +146,19 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch,
|
||||
&this->atom->dev_q.begin(), &_cut_coulsq,
|
||||
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->atom->dev_q.begin(),
|
||||
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq,
|
||||
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
}
|
||||
|
||||
@@ -14,18 +14,22 @@
// ***************************************************************************/
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{ return tex1Dfetch(pos_tex, i); }
|
||||
ucl_inline float fetch_q(const int& i, const float *q)
|
||||
{ return tex1Dfetch(q_tex, i); }
|
||||
#endif
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
#endif
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#define q_tex q_
|
||||
#endif
|
||||
|
||||
__kernel void k_cg_cmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__global numtyp4* lj3, const int lj_types,
|
||||
__global numtyp *sp_lj_in, __global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -108,7 +112,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
numtyp expm2 = ucl_exp(-grij*grij);
|
||||
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
|
||||
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
|
||||
fetch(prefactor,j,q_tex);
|
||||
prefactor *= qqrd2e * qtmp/r;
|
||||
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
|
||||
} else
|
||||
forcecoul = (numtyp)0.0;
|
||||
@ -143,7 +148,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__kernel void k_cg_cmm_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__global numtyp4* lj3_in,
|
||||
__global numtyp* sp_lj_in,
|
||||
__global int *dev_nbor, __global int *dev_packed,
|
||||
@ -181,8 +186,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
@ -194,7 +199,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -228,7 +233,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
numtyp expm2 = ucl_exp(-grij*grij);
|
||||
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
|
||||
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
|
||||
fetch(prefactor,j,q_tex);
|
||||
prefactor *= qqrd2e * qtmp/r;
|
||||
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
|
||||
} else
|
||||
forcecoul = (numtyp)0.0;
|
||||
|
||||
@@ -13,10 +13,12 @@
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "charmm_long_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *charmm_long=0;
|
||||
#else
|
||||
#include "charmm_long_ptx.h"
|
||||
#include "charmm_long_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_charmm_long.h"
|
||||
@@ -57,7 +59,7 @@ int CHARMMLongT::init(const int ntypes,
double **sigma, const bool mix_arithmetic) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,charmm_long);
|
||||
_screen,charmm_long,"k_charmm_long");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@@ -148,22 +150,19 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
|
||||
&sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
|
||||
this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
|
||||
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->atom->dev_q.begin(),
|
||||
this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->q,
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
|
||||
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
|
||||
&this->_threads_per_atom);
|
||||
|
||||
@@ -14,18 +14,22 @@
// ***************************************************************************/
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{ return tex1Dfetch(pos_tex, i); }
|
||||
ucl_inline float fetch_q(const int& i, const float *q)
|
||||
{ return tex1Dfetch(q_tex, i); }
|
||||
#endif
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
#endif
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#define q_tex q_
|
||||
#endif
|
||||
|
||||
__kernel void k_charmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
const int lj_types, __global numtyp *sp_lj_in,
|
||||
__global int *dev_nbor, __global int *dev_packed,
|
||||
__global acctyp4 *ans, __global acctyp *engv,
|
||||
@ -62,8 +66,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
@ -74,7 +78,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -110,7 +114,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
numtyp expm2 = ucl_exp(-grij*grij);
|
||||
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
|
||||
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
|
||||
fetch(prefactor,j,q_tex);
|
||||
prefactor *= qqrd2e * qtmp/r;
|
||||
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
|
||||
} else
|
||||
forcecoul = (numtyp)0.0;
|
||||
@ -147,7 +152,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
|
||||
__kernel void k_charmm_long_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
|
||||
__global numtyp* sp_lj_in, __global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
__global acctyp *engv, const int eflag,
|
||||
@ -185,8 +190,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
@ -197,7 +202,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -239,7 +244,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
|
||||
numtyp expm2 = ucl_exp(-grij*grij);
|
||||
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
|
||||
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
|
||||
fetch(prefactor,j,q_tex);
|
||||
prefactor *= qqrd2e * qtmp/r;
|
||||
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
|
||||
} else
|
||||
forcecoul = (numtyp)0.0;
|
||||
|
||||
@@ -13,10 +13,12 @@
email : a.kohlmeyer@temple.edu
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "coul_long_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *coul_long=0;
|
||||
#else
|
||||
#include "coul_long_ptx.h"
|
||||
#include "coul_long_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_coul_long.h"
|
||||
@@ -48,7 +50,7 @@ int CoulLongT::init(const int nlocal, const int nall, const int max_nbors,
const double qqrd2e, const double g_ewald) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
|
||||
gpu_split,_screen,coul_long);
|
||||
gpu_split,_screen,coul_long,"k_coul_long");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@@ -132,22 +134,18 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_cl.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald,
|
||||
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_cl,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&eflag, &vflag, &ainum, &nbor_pitch,
|
||||
&this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_lj_types, &sp_cl.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
|
||||
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_cl,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq,
|
||||
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
|
||||
@@ -14,18 +14,22 @@
// ***************************************************************************/
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{ return tex1Dfetch(pos_tex, i); }
|
||||
ucl_inline float fetch_q(const int& i, const float *q)
|
||||
{ return tex1Dfetch(q_tex, i); }
|
||||
#endif
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
#endif
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#define q_tex q_
|
||||
#endif
|
||||
|
||||
__kernel void k_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__global numtyp4* lj3, const int lj_types,
|
||||
__global numtyp *sp_cl_in, __global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
@ -56,8 +60,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
int j=*nbor;
|
||||
@ -66,7 +70,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
@ -83,7 +87,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
numtyp expm2 = ucl_exp(-grij*grij);
|
||||
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
|
||||
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
|
||||
fetch(prefactor,j,q_tex);
|
||||
prefactor *= qqrd2e * qtmp/r;
|
||||
force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
|
||||
|
||||
f.x+=delx*force;
|
||||
@ -162,7 +167,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__kernel void k_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__global numtyp4* lj3_in,
|
||||
__global numtyp* sp_cl_in,
|
||||
__global int *dev_nbor, __global int *dev_packed,
|
||||
@ -193,8 +198,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
int j=*nbor;
|
||||
@ -203,7 +208,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
@ -220,7 +225,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
numtyp expm2 = ucl_exp(-grij*grij);
|
||||
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
|
||||
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
|
||||
fetch(prefactor,j,q_tex);
|
||||
prefactor *= qqrd2e * qtmp/r;
|
||||
force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
|
||||
|
||||
f.x+=delx*force;
|
||||
|
||||
@@ -21,10 +21,12 @@
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "device_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *device=0;
|
||||
#else
|
||||
#include "device_ptx.h"
|
||||
#include "device_cubin.h"
|
||||
#endif
|
||||
|
||||
using namespace LAMMPS_AL;
|
||||
@@ -42,10 +44,10 @@ DeviceT::~Device() {
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int DeviceT::init_device(MPI_Comm world, MPI_Comm replica,
|
||||
const int first_gpu, const int last_gpu,
|
||||
const int gpu_mode, const double p_split,
|
||||
const int nthreads, const int t_per_atom) {
|
||||
int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
|
||||
const int last_gpu, const int gpu_mode,
|
||||
const double p_split, const int nthreads,
|
||||
const int t_per_atom, const double cell_size) {
|
||||
_nthreads=nthreads;
|
||||
#ifdef _OPENMP
|
||||
omp_set_num_threads(nthreads);
|
||||
@@ -62,6 +64,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica,
_last_device=last_gpu;
|
||||
_gpu_mode=gpu_mode;
|
||||
_particle_split=p_split;
|
||||
_cell_size=cell_size;
|
||||
|
||||
// Get the rank/size within the world
|
||||
MPI_Comm_rank(_comm_world,&_world_me);
|
||||
@@ -191,7 +194,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
} else {
|
||||
if (atom.charge()==false && charge)
|
||||
_data_in_estimate++;
|
||||
if (atom.quat()==false && rot)
|
||||
if (atom.quaternion()==false && rot)
|
||||
_data_in_estimate++;
|
||||
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial))
|
||||
return -3;
|
||||
@@ -205,7 +208,10 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
_block_cell_id, _block_nbor_build, threads_per_atom,
|
||||
_warp_size, _time_device))
|
||||
return -3;
|
||||
nbor->cell_size(cell_size);
|
||||
if (_cell_size<0.0)
|
||||
nbor->cell_size(cell_size,cell_size);
|
||||
else
|
||||
nbor->cell_size(_cell_size,cell_size);
|
||||
|
||||
_init_count++;
|
||||
return 0;
|
||||
@@ -251,7 +257,9 @@ void DeviceT::set_double_precompute
template <class numtyp, class acctyp>
|
||||
void DeviceT::init_message(FILE *screen, const char *name,
|
||||
const int first_gpu, const int last_gpu) {
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
std::string fs="";
|
||||
#elif defined(USE_CUDART)
|
||||
std::string fs="";
|
||||
#else
|
||||
std::string fs=toa(gpu->free_gigabytes())+"/";
|
||||
@@ -411,11 +419,9 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls,
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void DeviceT::output_times(UCL_Timer &time_pair,
|
||||
Answer<numtyp,acctyp> &ans,
|
||||
void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
|
||||
Neighbor &nbor, const double avg_split,
|
||||
const double max_bytes,
|
||||
const double gpu_overhead,
|
||||
const double max_bytes, const double gpu_overhead,
|
||||
const double driver_overhead,
|
||||
const int threads_per_atom, FILE *screen) {
|
||||
double single[9], times[9];
|
||||
@@ -574,33 +580,32 @@ int DeviceT::compile_kernels() {
k_info.set_function(*dev_program,"kernel_info");
|
||||
_compiled=true;
|
||||
|
||||
UCL_H_Vec<int> h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
|
||||
UCL_D_Vec<int> d_gpu_lib_data(14,*gpu);
|
||||
UCL_Vector<int,int> gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
|
||||
k_info.set_size(1,1);
|
||||
k_info.run(&d_gpu_lib_data.begin());
|
||||
ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false);
|
||||
k_info.run(&gpu_lib_data);
|
||||
gpu_lib_data.update_host(false);
|
||||
|
||||
_ptx_arch=static_cast<double>(h_gpu_lib_data[0])/100.0;
|
||||
_ptx_arch=static_cast<double>(gpu_lib_data[0])/100.0;
|
||||
#ifndef USE_OPENCL
|
||||
if (_ptx_arch>gpu->arch())
|
||||
return -4;
|
||||
#endif
|
||||
|
||||
_num_mem_threads=h_gpu_lib_data[1];
|
||||
_warp_size=h_gpu_lib_data[2];
|
||||
_num_mem_threads=gpu_lib_data[1];
|
||||
_warp_size=gpu_lib_data[2];
|
||||
if (_threads_per_atom<1)
|
||||
_threads_per_atom=h_gpu_lib_data[3];
|
||||
_threads_per_atom=gpu_lib_data[3];
|
||||
if (_threads_per_charge<1)
|
||||
_threads_per_charge=h_gpu_lib_data[13];
|
||||
_pppm_max_spline=h_gpu_lib_data[4];
|
||||
_pppm_block=h_gpu_lib_data[5];
|
||||
_block_pair=h_gpu_lib_data[6];
|
||||
_max_shared_types=h_gpu_lib_data[7];
|
||||
_block_cell_2d=h_gpu_lib_data[8];
|
||||
_block_cell_id=h_gpu_lib_data[9];
|
||||
_block_nbor_build=h_gpu_lib_data[10];
|
||||
_block_bio_pair=h_gpu_lib_data[11];
|
||||
_max_bio_shared_types=h_gpu_lib_data[12];
|
||||
_threads_per_charge=gpu_lib_data[13];
|
||||
_pppm_max_spline=gpu_lib_data[4];
|
||||
_pppm_block=gpu_lib_data[5];
|
||||
_block_pair=gpu_lib_data[6];
|
||||
_max_shared_types=gpu_lib_data[7];
|
||||
_block_cell_2d=gpu_lib_data[8];
|
||||
_block_cell_id=gpu_lib_data[9];
|
||||
_block_nbor_build=gpu_lib_data[10];
|
||||
_block_bio_pair=gpu_lib_data[11];
|
||||
_max_bio_shared_types=gpu_lib_data[12];
|
||||
|
||||
if (static_cast<size_t>(_block_pair)>gpu->group_size())
|
||||
_block_pair=gpu->group_size();
|
||||
@@ -634,9 +639,10 @@ Device<PRECISION,ACC_PRECISION> global_device;
int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
|
||||
const int last_gpu, const int gpu_mode,
|
||||
const double particle_split, const int nthreads,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const double cell_size) {
|
||||
return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
|
||||
particle_split,nthreads,t_per_atom);
|
||||
particle_split,nthreads,t_per_atom,
|
||||
cell_size);
|
||||
}
|
||||
|
||||
void lmp_clear_device() {
|
||||
|
||||
@@ -49,7 +49,7 @@ class Device {
int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
|
||||
const int last_gpu, const int gpu_mode,
|
||||
const double particle_split, const int nthreads,
|
||||
const int t_per_atom);
|
||||
const int t_per_atom, const double cell_size);
|
||||
|
||||
/// Initialize the device for Atom and Neighbor storage
|
||||
/** \param rot True if quaternions need to be stored
|
||||
@@ -239,7 +239,7 @@ class Device {
int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
|
||||
_block_pair));
|
||||
k_zero.set_size(num_blocks,_block_pair);
|
||||
k_zero.run(&mem.begin(),&numel);
|
||||
k_zero.run(&mem,&numel);
|
||||
}
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
@@ -288,6 +288,7 @@ class Device {
double _particle_split;
|
||||
double _cpu_full;
|
||||
double _ptx_arch;
|
||||
double _cell_size; // -1 if the cutoff is used
|
||||
|
||||
int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
|
||||
int _pppm_max_spline, _pppm_block;
|
||||
|
||||
@@ -13,10 +13,12 @@
email : brownw@ornl.gov nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "eam_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *eam=0;
|
||||
#else
|
||||
#include "eam_ptx.h"
|
||||
#include "eam_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_eam.h"
|
||||
@@ -51,32 +53,24 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
{
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
|
||||
gpu_split,_screen,eam);
|
||||
gpu_split,_screen,eam,"k_eam");
|
||||
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
// allocate fp
|
||||
|
||||
bool cpuview=false;
|
||||
if (this->ucl_device->device_type()==UCL_CPU)
|
||||
cpuview=true;
|
||||
|
||||
int ef_nall=nall;
|
||||
if (ef_nall==0)
|
||||
ef_nall=2000;
|
||||
|
||||
_max_fp_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
|
||||
host_fp.alloc(_max_fp_size,*(this->ucl_device));
|
||||
if (cpuview)
|
||||
dev_fp.view(host_fp);
|
||||
else
|
||||
dev_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_WRITE_ONLY);
|
||||
_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_RW_OPTIMIZED,UCL_WRITE_ONLY);
|
||||
|
||||
k_energy.set_function(*(this->pair_program),"kernel_energy");
|
||||
k_energy_fast.set_function(*(this->pair_program),"kernel_energy_fast");
|
||||
k_energy.set_function(*(this->pair_program),"k_energy");
|
||||
k_energy_fast.set_function(*(this->pair_program),"k_energy_fast");
|
||||
fp_tex.get_texture(*(this->pair_program),"fp_tex");
|
||||
fp_tex.bind_float(dev_fp,1);
|
||||
fp_tex.bind_float(_fp,1);
|
||||
_compiled_energy = true;
|
||||
|
||||
// Initialize timers for selected GPU
|
||||
@@ -236,7 +230,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
+ frho_spline2.row_bytes()
|
||||
+ z2r_spline1.row_bytes()
|
||||
+ z2r_spline2.row_bytes()
|
||||
+ dev_fp.row_bytes();
|
||||
+ _fp.device.row_bytes();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -255,8 +249,7 @@ void EAMT::clear() {
z2r_spline1.clear();
|
||||
z2r_spline2.clear();
|
||||
|
||||
host_fp.clear();
|
||||
dev_fp.clear();
|
||||
_fp.clear();
|
||||
|
||||
time_pair2.clear();
|
||||
time_fp1.clear();
|
||||
@@ -303,19 +296,11 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
// ------------------- Resize FP Array for EAM --------------------
|
||||
|
||||
if (nall>_max_fp_size) {
|
||||
dev_fp.clear();
|
||||
host_fp.clear();
|
||||
|
||||
_max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
host_fp.alloc(_max_fp_size,*(this->ucl_device));
|
||||
if (this->ucl_device->device_type()==UCL_CPU)
|
||||
dev_fp.view(host_fp);
|
||||
else
|
||||
dev_fp.alloc(_max_fp_size,*(this->ucl_device));
|
||||
|
||||
fp_tex.bind_float(dev_fp,1);
|
||||
_fp.resize(_max_fp_size);
|
||||
fp_tex.bind_float(_fp,1);
|
||||
}
|
||||
*fp_ptr=host_fp.begin();
|
||||
*fp_ptr=_fp.host.begin();
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
@@ -348,7 +333,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
// copy fp from device to host for comm
|
||||
_nlocal=nlocal;
|
||||
time_fp1.start();
|
||||
ucl_copy(host_fp,dev_fp,nlocal,true);
|
||||
_fp.update_host(nlocal,true);
|
||||
time_fp1.stop();
|
||||
time_fp1.sync_stop();
|
||||
}
|
||||
@@ -380,19 +365,11 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
// ------------------- Resize FP Array for EAM --------------------
|
||||
|
||||
if (nall>_max_fp_size) {
|
||||
dev_fp.clear();
|
||||
host_fp.clear();
|
||||
|
||||
_max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
host_fp.alloc(_max_fp_size,*(this->ucl_device));
|
||||
if (this->ucl_device->device_type()==UCL_CPU)
|
||||
dev_fp.view(host_fp);
|
||||
else
|
||||
dev_fp.alloc(_max_fp_size,*(this->ucl_device));
|
||||
|
||||
fp_tex.bind_float(dev_fp,1);
|
||||
_fp.resize(_max_fp_size);
|
||||
fp_tex.bind_float(_fp,1);
|
||||
}
|
||||
*fp_ptr=host_fp.begin();
|
||||
*fp_ptr=_fp.host.begin();
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
|
||||
@@ -428,7 +405,7 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
// copy fp from device to host for comm
|
||||
_nlocal=inum_full;
|
||||
time_fp1.start();
|
||||
ucl_copy(host_fp,dev_fp,inum_full,true);
|
||||
_fp.update_host(inum_full,true);
|
||||
time_fp1.stop();
|
||||
time_fp1.sync_stop();
|
||||
|
||||
@@ -486,22 +463,18 @@ void EAMT::loop(const bool _eflag, const bool _vflag) {
|
||||
if (shared_types) {
|
||||
this->k_energy_fast.set_size(GX,BX);
|
||||
this->k_energy_fast.run(&this->atom->dev_x.begin(), &type2rhor_z2r.begin(),
|
||||
&type2frho.begin(), &rhor_spline2.begin(),
|
||||
&frho_spline1.begin(),&frho_spline2.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &dev_fp.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &ainum,
|
||||
this->k_energy_fast.run(&this->atom->x, &type2rhor_z2r, &type2frho,
|
||||
&rhor_spline2, &frho_spline1,&frho_spline2,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_fp, &this->ans->engv, &eflag, &ainum,
|
||||
&nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho,
|
||||
&_nrho, &_nr, &this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_energy.set_size(GX,BX);
|
||||
this->k_energy.run(&this->atom->dev_x.begin(), &type2rhor_z2r.begin(),
|
||||
&type2frho.begin(), &rhor_spline2.begin(),
|
||||
&frho_spline1.begin(),&frho_spline2.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &dev_fp.begin(),
|
||||
&this->ans->dev_engv.begin(),&eflag, &ainum, &nbor_pitch,
|
||||
this->k_energy.run(&this->atom->x, &type2rhor_z2r, &type2frho,
|
||||
&rhor_spline2, &frho_spline1, &frho_spline2,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp,
|
||||
&this->ans->engv,&eflag, &ainum, &nbor_pitch,
|
||||
&_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_nrho, &_nr,
|
||||
&this->_threads_per_atom);
|
||||
}
|
||||
@@ -536,27 +509,19 @@ void EAMT::loop2(const bool _eflag, const bool _vflag) {
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &dev_fp.begin(),
|
||||
&type2rhor_z2r.begin(),
|
||||
&rhor_spline1.begin(),
|
||||
&z2r_spline1.begin(),
|
||||
&z2r_spline2.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &_cutforcesq, &_rdr, &_nr,
|
||||
&this->_threads_per_atom);
|
||||
this->k_pair_fast.run(&this->atom->x, &_fp, &type2rhor_z2r,
|
||||
&rhor_spline1, &z2r_spline1, &z2r_spline2,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &_cutforcesq, &_rdr,
|
||||
&_nr, &this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &dev_fp.begin(),
|
||||
&type2rhor_z2r.begin(),
|
||||
&rhor_spline1.begin(),
|
||||
&z2r_spline1.begin(),
|
||||
&z2r_spline2.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_nr,
|
||||
this->k_pair.run(&this->atom->x, &_fp, &type2rhor_z2r, &rhor_spline1,
|
||||
&z2r_spline1, &z2r_spline2, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(), &this->ans->force,
|
||||
&this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch,
|
||||
&_ntypes, &_cutforcesq, &_rdr, &_nr,
|
||||
&this->_threads_per_atom);
|
||||
}
|
||||
|
||||
|
||||
@@ -15,66 +15,37 @@
|
||||
#ifdef NV_KERNEL
|
||||
#include "lal_aux_fun1.h"
|
||||
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> fp_tex;
|
||||
|
||||
texture<float4> rhor_sp1_tex;
|
||||
texture<float4> rhor_sp2_tex;
|
||||
texture<float4> frho_sp1_tex;
|
||||
texture<float4> frho_sp2_tex;
|
||||
texture<float4> z2r_sp1_tex;
|
||||
texture<float4> z2r_sp2_tex;
|
||||
|
||||
#ifdef _DOUBLE_DOUBLE
|
||||
ucl_inline double4 fetch_rhor_sp1(const int& i, const double4 *rhor_spline1) {
|
||||
return rhor_spline1[i];
|
||||
}
|
||||
ucl_inline double4 fetch_rhor_sp2(const int& i, const double4 *rhor_spline2) {
|
||||
return rhor_spline2[i];
|
||||
}
|
||||
ucl_inline double4 fetch_frho_sp1(const int& i, const double4 *frho_spline1) {
|
||||
return frho_spline1[i];
|
||||
}
|
||||
ucl_inline double4 fetch_frho_sp2(const int& i, const double4 *frho_spline2) {
|
||||
return frho_spline2[i];
|
||||
}
|
||||
ucl_inline double4 fetch_z2r_sp1(const int& i, const double4 *z2r_spline1) {
|
||||
return z2r_spline1[i];
|
||||
}
|
||||
ucl_inline double4 fetch_z2r_sp2(const int& i, const double4 *z2r_spline2) {
|
||||
return z2r_spline2[i];
|
||||
}
|
||||
#else
|
||||
texture<int4> pos_tex;
|
||||
texture<int2> fp_tex;
|
||||
texture<int4> rhor_sp1_tex;
|
||||
texture<int4> rhor_sp2_tex;
|
||||
texture<int4> frho_sp1_tex;
|
||||
texture<int4> frho_sp2_tex;
|
||||
texture<int4> z2r_sp1_tex;
|
||||
texture<int4> z2r_sp2_tex;
|
||||
#endif
|
||||
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{ return tex1Dfetch(pos_tex, i); }
|
||||
ucl_inline float fetch_q(const int& i, const float *fp)
|
||||
{ return tex1Dfetch(fp_tex, i); }
|
||||
#else
|
||||
|
||||
ucl_inline float4 fetch_rhor_sp1(const int& i, const float4 *rhor_spline1)
|
||||
{ return tex1Dfetch(rhor_sp1_tex, i); }
|
||||
ucl_inline float4 fetch_rhor_sp2(const int& i, const float4 *rhor_spline2)
|
||||
{ return tex1Dfetch(rhor_sp2_tex, i); }
|
||||
ucl_inline float4 fetch_frho_sp1(const int& i, const float4 *frho_spline1)
|
||||
{ return tex1Dfetch(frho_sp1_tex, i); }
|
||||
ucl_inline float4 fetch_frho_sp2(const int& i, const float4 *frho_spline2)
|
||||
{ return tex1Dfetch(frho_sp2_tex, i); }
|
||||
ucl_inline float4 fetch_z2r_sp1(const int& i, const float4 *z2r_spline1)
|
||||
{ return tex1Dfetch(z2r_sp1_tex, i); }
|
||||
ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2)
|
||||
{ return tex1Dfetch(z2r_sp2_tex, i); }
|
||||
#endif
|
||||
|
||||
#else // OPENCL
|
||||
|
||||
#define fetch_q(i,y) fp_[i]
|
||||
#define fetch_rhor_sp1(i,y) rhor_spline1[i]
|
||||
#define fetch_rhor_sp2(i,y) rhor_spline2[i]
|
||||
#define fetch_frho_sp1(i,y) frho_spline1[i]
|
||||
#define fetch_frho_sp2(i,y) frho_spline2[i]
|
||||
#define fetch_z2r_sp1(i,y) z2r_spline1[i]
|
||||
#define fetch_z2r_sp2(i,y) z2r_spline2[i]
|
||||
#define pos_tex x_
|
||||
#define fp_tex fp_
|
||||
#define rhor_sp1_tex rhor_spline1
|
||||
#define rhor_sp2_tex rhor_spline2
|
||||
#define frho_sp1_tex frho_spline1
|
||||
#define frho_sp2_tex frho_spline2
|
||||
#define z2r_sp1_tex z2r_spline1
|
||||
#define z2r_sp2_tex z2r_spline2
|
||||
|
||||
#endif
|
||||
|
||||
@ -99,11 +70,11 @@ ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2)
|
||||
p -= m; \
|
||||
p = MIN(p,(numtyp)1.0); \
|
||||
int index = type2frho[itype]*(nrho+1)+m; \
|
||||
numtyp4 coeff = fetch_frho_sp1(index, frho_spline1); \
|
||||
numtyp4 coeff; fetch4(coeff,index,frho_sp1_tex); \
|
||||
numtyp fp = (coeff.x*p + coeff.y)*p + coeff.z; \
|
||||
fp_[i]=fp; \
|
||||
if (eflag>0) { \
|
||||
coeff = fetch_frho_sp2(index, frho_spline2); \
|
||||
fetch4(coeff,index,frho_sp2_tex); \
|
||||
energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \
|
||||
engv[ii]=(acctyp)2.0*energy; \
|
||||
} \
|
||||
@ -154,7 +125,7 @@ ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2)
|
||||
ans[ii]=f; \
|
||||
}
|
||||
|
||||
__kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
|
||||
__kernel void k_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
|
||||
__global int *type2frho,
|
||||
__global numtyp4 *rhor_spline2,
|
||||
__global numtyp4 *frho_spline1,
|
||||
@ -178,14 +149,14 @@ __kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
int j=*nbor;
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -203,7 +174,7 @@ __kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
|
||||
|
||||
int mtype = jtype*ntypes+itype;
|
||||
int index = type2rhor_z2r[mtype].x*(nr+1)+m;
|
||||
numtyp4 coeff = fetch_rhor_sp2(index, rhor_spline2);
|
||||
numtyp4 coeff; fetch4(coeff,index,rhor_sp2_tex);
|
||||
rho += ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
|
||||
}
|
||||
} // for nbor
|
||||
@ -213,7 +184,7 @@ __kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_energy_fast(__global numtyp4 *x_,
|
||||
__kernel void k_energy_fast(__global numtyp4 *x_,
|
||||
__global int2 *type2rhor_z2r_in,
|
||||
__global int *type2frho_in,
|
||||
__global numtyp4 *rhor_spline2,
|
||||
@ -252,14 +223,14 @@ __kernel void kernel_energy_fast(__global numtyp4 *x_,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
int j=*nbor;
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
@ -277,7 +248,7 @@ __kernel void kernel_energy_fast(__global numtyp4 *x_,
|
||||
int jtype=fast_mul((int)MAX_SHARED_TYPES,jx.w);
|
||||
int mtype = jtype+itype;
|
||||
int index = type2rhor_z2r[mtype].x*(nr+1)+m;
|
||||
numtyp4 coeff = fetch_rhor_sp2(index, rhor_spline2);
|
||||
numtyp4 coeff; fetch4(coeff,index,rhor_sp2_tex);
|
||||
rho += ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
|
||||
}
|
||||
} // for nbor
|
||||
@ -287,7 +258,7 @@ __kernel void kernel_energy_fast(__global numtyp4 *x_,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,
__kernel void k_eam(__global numtyp4 *x_, __global numtyp *fp_,
__global int2 *type2rhor_z2r,
__global numtyp4 *rhor_spline1,
__global numtyp4 *z2r_spline1,
@ -317,15 +288,15 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp ifp=fetch_q(i,fp_); //fp_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp ifp; fetch(ifp,i,fp_tex); //fp_[i];
int itype=ix.w;

for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
j &= NEIGHMASK;

numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;

// Compute r12
@ -347,25 +318,27 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,

mtype = itype*ntypes+jtype;
index = type2rhor_z2r[mtype].x*(nr+1)+m;
coeff = fetch_rhor_sp1(index, rhor_spline1);
fetch4(coeff,index,rhor_sp1_tex);
numtyp rhoip = (coeff.x*p + coeff.y)*p + coeff.z;

mtype = jtype*ntypes+itype;
index = type2rhor_z2r[mtype].x*(nr+1)+m;
coeff = fetch_rhor_sp1(index, rhor_spline1);
fetch4(coeff,index,rhor_sp1_tex);
numtyp rhojp = (coeff.x*p + coeff.y)*p + coeff.z;

mtype = itype*ntypes+jtype;
index = type2rhor_z2r[mtype].y*(nr+1)+m;
coeff = fetch_z2r_sp1(index, z2r_spline1);
fetch4(coeff,index,z2r_sp1_tex);
numtyp z2p = (coeff.x*p + coeff.y)*p + coeff.z;
coeff = fetch_z2r_sp2(index, z2r_spline2);
fetch4(coeff,index,z2r_sp2_tex);
numtyp z2 = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;

numtyp recip = ucl_recip(r);
numtyp phi = z2*recip;
numtyp phip = z2p*recip - phi*recip;
numtyp psip = ifp*rhojp + fetch_q(j,fp_)*rhoip + phip;
numtyp psip;
fetch(psip,j,fp_tex);
psip = ifp*rhojp + psip*rhoip + phip;
numtyp force = -psip*recip;

f.x+=delx*force;
@ -391,7 +364,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,

}
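For reference, the coeff quadruples fetched above are segments of cubic interpolation tables: rho and z2 are evaluated in Horner form from (x,y,z,w) = (a,b,c,d), while rhoip/rhojp/z2p come from a second coefficient table that already carries the derivative factors. A standalone sketch of the two evaluations (plain C++; the struct and function names are illustrative, not from this commit):

struct coeff4 { double x, y, z, w; };            // one table segment: a, b, c, d

// Value of the segment at fractional position p, matching
// ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w in the kernels above.
double spline_value(const coeff4 &c, double p) {
  return ((c.x*p + c.y)*p + c.z)*p + c.w;
}

// Derivative with respect to p for the same segment.  The kernels avoid the
// explicit 3/2 factors by reading a pre-scaled derivative table
// (rhor_spline1 / z2r_spline1) and reusing the shorter Horner form.
double spline_slope(const coeff4 &c, double p) {
  return (3.0*c.x*p + 2.0*c.y)*p + c.z;
}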
|
||||
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
|
||||
__kernel void k_eam_fast(__global numtyp4 *x_, __global numtyp *fp_,
|
||||
__global int2 *type2rhor_z2r_in,
|
||||
__global numtyp4 *rhor_spline1,
|
||||
__global numtyp4 *z2r_spline1,
|
||||
@ -427,8 +400,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp ifp=fetch_q(i,fp_); //fp_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp ifp; fetch(ifp,i,fp_tex); //fp_[i];
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
@ -436,7 +409,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
|
||||
int j=*nbor;
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jw=jx.w;
|
||||
int jtype=fast_mul((int)MAX_SHARED_TYPES,jw);
|
||||
|
||||
@ -459,25 +432,27 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
|
||||
|
||||
mtype = itype+jw;
|
||||
index = type2rhor_z2r[mtype].x*(nr+1)+m;
|
||||
coeff = fetch_rhor_sp1(index, rhor_spline1);
|
||||
fetch4(coeff,index,rhor_sp1_tex);
|
||||
numtyp rhoip = (coeff.x*p + coeff.y)*p + coeff.z;
|
||||
|
||||
mtype = jtype+iw;
|
||||
index = type2rhor_z2r[mtype].x*(nr+1)+m;
|
||||
coeff = fetch_rhor_sp1(index, rhor_spline1);
|
||||
fetch4(coeff,index,rhor_sp1_tex);
|
||||
numtyp rhojp = (coeff.x*p + coeff.y)*p + coeff.z;
|
||||
|
||||
mtype = itype+jw;
|
||||
index = type2rhor_z2r[mtype].y*(nr+1)+m;
|
||||
coeff = fetch_z2r_sp1(index, z2r_spline1);
|
||||
fetch4(coeff,index,z2r_sp1_tex);
|
||||
numtyp z2p = (coeff.x*p + coeff.y)*p + coeff.z;
|
||||
coeff = fetch_z2r_sp2(index, z2r_spline2);
|
||||
fetch4(coeff,index,z2r_sp2_tex);
|
||||
numtyp z2 = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
|
||||
|
||||
numtyp recip = ucl_recip(r);
|
||||
numtyp phi = z2*recip;
|
||||
numtyp phip = z2p*recip - phi*recip;
|
||||
numtyp psip = ifp*rhojp + fetch_q(j,fp_)*rhoip + phip;
|
||||
numtyp psip;
|
||||
fetch(psip,j,fp_tex);
|
||||
psip = ifp*rhojp + psip*rhoip + phip;
|
||||
numtyp force = -psip*recip;
|
||||
|
||||
f.x+=delx*force;
|
||||
|
||||
@ -52,8 +52,8 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
if (nghost>0) {
UCL_H_Vec<numtyp> host_view;
UCL_D_Vec<numtyp> dev_view;
host_view.view_offset(_nlocal,host_fp);
dev_view.view_offset(_nlocal,dev_fp);
host_view.view_offset(_nlocal,_fp.host);
dev_view.view_offset(_nlocal,_fp.device);
ucl_copy(dev_view,host_view,nghost,true);
}
}
@ -128,8 +128,7 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
bool _compiled_energy;

/// Per-atom arrays
UCL_H_Vec<numtyp> host_fp;
UCL_D_Vec<numtyp> dev_fp;
UCL_Vector<numtyp,numtyp> _fp;

protected:
bool _allocated;

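The change above replaces the separate host_fp/dev_fp buffers with a single UCL_Vector _fp and copies only the ghost tail [_nlocal, _nlocal+nghost) through offset views. The same slice in plain CUDA host code (a hedged sketch; the helper name and the double element type are assumptions, not part of the class):

#include <cuda_runtime.h>

// Copy just the ghost-atom portion of a per-atom array to the device,
// i.e. the slice the two view_offset(_nlocal, ...) calls select above.
cudaError_t copy_ghost_fp(double *dev_fp, const double *host_fp,
                          int nlocal, int nghost) {
  if (nghost <= 0) return cudaSuccess;
  return cudaMemcpyAsync(dev_fp + nlocal, host_fp + nlocal,
                         sizeof(double) * nghost,
                         cudaMemcpyHostToDevice, 0);
}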
@ -20,6 +20,14 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};

#ifdef NV_KERNEL
#include "lal_preprocessor.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex, quat_tex;
#else
texture<int4,1> pos_tex, quat_tex;
#endif
#else
#define pos_tex x_
#define quat_tex qif
#endif

#define atom_info(t_per_atom, ii, tid, offset) \
@ -411,7 +419,7 @@ ucl_inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans,
ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi,
numtyp mat[9])
{
numtyp4 q=qif[qi];
numtyp4 q; fetch4(q,qi,quat_tex);

numtyp w2 = q.x*q.x;
numtyp i2 = q.y*q.y;

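gpu_quat_to_mat_trans expands a unit quaternion into the transpose of its rotation matrix; the w2 = q.x*q.x, i2 = q.y*q.y lines above are the start of that expansion, with q stored as {w, i, j, k}. A host-side sketch of the standard formula, transposed on output (illustrative only, not the kernel's exact statement order):

// Standard unit-quaternion (w, i, j, k) -> 3x3 rotation matrix, then store its
// transpose in row-major mat[9].
void quat_to_mat_trans(const double q[4], double mat[9]) {
  const double w = q[0], i = q[1], j = q[2], k = q[3];
  const double r[9] = {
    w*w + i*i - j*j - k*k,  2.0*(i*j - w*k),        2.0*(i*k + w*j),
    2.0*(i*j + w*k),        w*w - i*i + j*j - k*k,  2.0*(j*k - w*i),
    2.0*(i*k - w*j),        2.0*(j*k + w*i),        w*w - i*i - j*j + k*k };
  for (int row = 0; row < 3; ++row)        // transpose while copying out
    for (int col = 0; col < 3; ++col)
      mat[3*row + col] = r[3*col + row];
}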
@ -15,6 +15,13 @@
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#include "lal_preprocessor.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
#endif
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#endif
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -40,14 +47,14 @@ __kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
|
||||
__global int *list_end=nbor+fast_mul(numj,nbor_pitch);
|
||||
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul(iw,ntypes);
|
||||
int newj=0;
|
||||
for ( ; nbor<list_end; nbor+=nbor_pitch) {
|
||||
int j=*nbor;
|
||||
j &= NEIGHMASK;
|
||||
numtyp4 jx=x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
int mtype=itype+jtype;
|
||||
numtyp2 cf=cut_form[mtype];
|
||||
@ -102,7 +109,7 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
|
||||
__global int *list_end=nbor+fast_mul(numj,nbor_pitch);
|
||||
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
@ -110,7 +117,7 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
|
||||
for ( ; nbor<list_end; nbor+=nbor_pitch) {
|
||||
int j=*nbor;
|
||||
j &= NEIGHMASK;
|
||||
numtyp4 jx=x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
int mtype=itype+jtype;
|
||||
|
||||
|
||||
@ -13,12 +13,15 @@
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "gayberne_cl.h"
|
||||
#include "gayberne_lj_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *gayberne=0;
|
||||
const char *gayberne_lj=0;
|
||||
#else
|
||||
#include "gayberne_ptx.h"
|
||||
#include "gayberne_lj_ptx.h"
|
||||
#include "gayberne_cubin.h"
|
||||
#include "gayberne_lj_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_gayberne.h"
|
||||
@ -57,7 +60,8 @@ int GayBerneT::init(const int ntypes, const double gamma,
|
||||
const double gpu_split, FILE *_screen) {
|
||||
int success;
|
||||
success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,ntypes,h_form,gayberne,gayberne_lj);
|
||||
_screen,ntypes,h_form,gayberne,gayberne_lj,
|
||||
"k_gayberne");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -210,13 +214,13 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
|
||||
|
||||
this->time_ellipsoid.start();
|
||||
this->k_ellipsoid.set_size(GX,BX);
|
||||
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
|
||||
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
|
||||
&this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(),
|
||||
&this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&stride, &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
|
||||
&this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
|
||||
&this->_threads_per_atom);
|
||||
this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
|
||||
&this->shape, &this->well, &this->gamma_upsilon_mu,
|
||||
&this->sigma_epsilon, &this->_lj_types,
|
||||
&this->lshape, &this->nbor->dev_nbor, &stride,
|
||||
&this->ans->force, &ainum, &this->ans->engv,
|
||||
&this->dev_error, &eflag, &vflag,
|
||||
&this->_last_ellipse, &this->_threads_per_atom);
|
||||
this->time_ellipsoid.stop();
|
||||
|
||||
if (this->_last_ellipse==this->ans->inum()) {
|
||||
@ -243,17 +247,19 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
|
||||
|
||||
this->time_ellipsoid2.start();
|
||||
this->k_sphere_ellipsoid.set_size(GX,BX);
|
||||
this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(),
|
||||
&this->atom->dev_quat.begin(), &this->shape.begin(),
|
||||
&this->well.begin(), &this->gamma_upsilon_mu.begin(),
|
||||
&this->sigma_epsilon.begin(), &this->_lj_types, &this->lshape.begin(),
|
||||
&this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
|
||||
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
|
||||
this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat,
|
||||
&this->shape, &this->well,
|
||||
&this->gamma_upsilon_mu,
|
||||
&this->sigma_epsilon, &this->_lj_types,
|
||||
&this->lshape, &this->nbor->dev_nbor,
|
||||
&stride, &this->ans->force,
|
||||
&this->ans->engv, &this->dev_error,
|
||||
&eflag, &vflag, &this->_last_ellipse,
|
||||
&ainum, &this->_threads_per_atom);
|
||||
this->time_ellipsoid2.stop();
|
||||
} else {
|
||||
this->ans->dev_ans.zero();
|
||||
this->ans->dev_engv.zero();
|
||||
this->ans->force.zero();
|
||||
this->ans->engv.zero();
|
||||
this->time_nbor1.stop();
|
||||
this->time_ellipsoid.start();
|
||||
this->time_ellipsoid.stop();
|
||||
@ -268,19 +274,20 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
|
||||
if (this->_last_ellipse<this->ans->inum()) {
|
||||
if (this->_shared_types) {
|
||||
this->k_lj_fast.set_size(GX,BX);
|
||||
this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
|
||||
&this->lj3.begin(), &this->gamma_upsilon_mu.begin(), &stride,
|
||||
&this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &this->dev_error.begin(),
|
||||
&eflag, &vflag, &this->_last_ellipse, &ainum,
|
||||
this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3,
|
||||
&this->gamma_upsilon_mu, &stride,
|
||||
&this->nbor->dev_packed, &this->ans->force,
|
||||
&this->ans->engv, &this->dev_error, &eflag,
|
||||
&vflag, &this->_last_ellipse, &ainum,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_lj.set_size(GX,BX);
|
||||
this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
|
||||
&this->lj3.begin(), &this->_lj_types, &this->gamma_upsilon_mu.begin(),
|
||||
&stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
|
||||
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
|
||||
this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3,
|
||||
&this->_lj_types, &this->gamma_upsilon_mu, &stride,
|
||||
&this->nbor->dev_packed, &this->ans->force,
|
||||
&this->ans->engv, &this->dev_error, &eflag,
|
||||
&vflag, &this->_last_ellipse, &ainum,
|
||||
&this->_threads_per_atom);
|
||||
}
|
||||
}
|
||||
this->time_lj.stop();
|
||||
@ -294,12 +301,11 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->time_nbor1.stop();
|
||||
this->time_ellipsoid.start();
|
||||
this->k_ellipsoid.set_size(GX,BX);
|
||||
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
|
||||
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
|
||||
&this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(),
|
||||
&this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&stride, &this->ans->dev_ans.begin(), &ainum,
|
||||
&this->ans->dev_engv.begin(), &this->dev_error.begin(),
|
||||
this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
|
||||
&this->shape, &this->well, &this->gamma_upsilon_mu,
|
||||
&this->sigma_epsilon, &this->_lj_types, &this->lshape,
|
||||
&this->nbor->dev_nbor, &stride, &this->ans->force,
|
||||
&ainum, &this->ans->engv, &this->dev_error,
|
||||
&eflag, &vflag, &ainum, &this->_threads_per_atom);
|
||||
this->time_ellipsoid.stop();
|
||||
}
|
||||
|
||||
@ -80,7 +80,7 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape
|
||||
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
|
||||
}
|
||||
|
||||
__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
|
||||
__kernel void k_gayberne(__global numtyp4* x_,__global numtyp4 *q,
|
||||
__global numtyp4* shape, __global numtyp4* well,
|
||||
__global numtyp *gum, __global numtyp2* sig_eps,
|
||||
const int ntypes, __global numtyp *lshape,
|
||||
@ -117,7 +117,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
|
||||
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex);
|
||||
int itype=ix.w;
|
||||
numtyp a1[9], b1[9], g1[9];
|
||||
numtyp4 ishape=shape[itype];
|
||||
@ -136,7 +136,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex);
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
|
||||
@ -17,15 +17,15 @@
|
||||
#include "lal_ellipsoid_extra.h"
|
||||
#endif
|
||||
|
||||
__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
|
||||
__global numtyp4* shape,__global numtyp4* well,
|
||||
__global numtyp *gum, __global numtyp2* sig_eps,
|
||||
const int ntypes, __global numtyp *lshape,
|
||||
__global int *dev_nbor, const int stride,
|
||||
__global acctyp4 *ans, __global acctyp *engv,
|
||||
__global int *err_flag, const int eflag,
|
||||
const int vflag,const int start, const int inum,
|
||||
const int t_per_atom) {
|
||||
__kernel void k_gayberne_sphere_ellipsoid(__global numtyp4 *x_,
|
||||
__global numtyp4 *q, __global numtyp4* shape,
|
||||
__global numtyp4* well, __global numtyp *gum,
|
||||
__global numtyp2* sig_eps, const int ntypes,
|
||||
__global numtyp *lshape, __global int *dev_nbor,
|
||||
const int stride, __global acctyp4 *ans,
|
||||
__global acctyp *engv, __global int *err_flag,
|
||||
const int eflag, const int vflag,const int start,
|
||||
const int inum, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
ii+=start;
|
||||
@ -51,7 +51,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
|
||||
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex);
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp oner=shape[itype].x;
|
||||
@ -64,7 +64,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex);
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -236,14 +236,13 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__kernel void k_gayberne_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__global numtyp4* lj3, const int lj_types,
|
||||
__global numtyp *gum,
|
||||
const int stride, __global int *dev_ij,
|
||||
__global acctyp4 *ans, __global acctyp *engv,
|
||||
__global int *err_flag, const int eflag,
|
||||
const int vflag, const int start, const int inum,
|
||||
const int t_per_atom) {
|
||||
__global numtyp *gum, const int stride,
|
||||
__global int *dev_ij, __global acctyp4 *ans,
|
||||
__global acctyp *engv, __global int *err_flag,
|
||||
const int eflag, const int vflag, const int start,
|
||||
const int inum, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
ii+=start;
|
||||
@ -269,7 +268,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex);
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
@ -279,7 +278,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex);
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -319,13 +318,13 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__kernel void k_gayberne_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__global numtyp4* lj3_in, __global numtyp *gum,
|
||||
const int stride, __global int *dev_ij,
|
||||
__global acctyp4 *ans, __global acctyp *engv,
|
||||
__global int *err_flag, const int eflag,
|
||||
const int vflag, const int start, const int inum,
|
||||
const int t_per_atom) {
|
||||
const int vflag, const int start,
|
||||
const int inum, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
ii+=start;
|
||||
@ -358,7 +357,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex);
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
@ -369,7 +368,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex);
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -406,3 +405,4 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
ans,engv);
|
||||
} // if ii
|
||||
}
|
||||
|
||||
|
||||
@ -13,10 +13,12 @@
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "lj_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *lj=0;
|
||||
#else
|
||||
#include "lj_ptx.h"
|
||||
#include "lj_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_lj.h"
|
||||
@ -51,7 +53,7 @@ int LJT::init(const int ntypes,
|
||||
const double gpu_split, FILE *_screen) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,lj);
|
||||
_screen,lj,"k_lj");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -133,20 +135,17 @@ void LJT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
}
|
||||
|
||||
@ -15,14 +15,16 @@
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#include "lal_aux_fun1.h"
|
||||
texture<float4> pos_tex;
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{ return tex1Dfetch(pos_tex, i); }
|
||||
texture<float4> pos_tex;
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
#endif
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#endif
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__kernel void k_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__global numtyp4* lj3, const int lj_types,
|
||||
__global numtyp *sp_lj_in, __global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -101,7 +103,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__kernel void k_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__global numtyp4* lj3_in,
|
||||
__global numtyp* sp_lj_in,
|
||||
__global int *dev_nbor, __global int *dev_packed,
|
||||
@ -137,7 +139,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
@ -148,7 +150,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
|
||||
@ -13,10 +13,12 @@
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "lj96_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *lj96=0;
|
||||
#else
|
||||
#include "lj96_ptx.h"
|
||||
#include "lj96_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_lj96.h"
|
||||
@ -51,7 +53,7 @@ int LJ96T::init(const int ntypes,
|
||||
const double gpu_split, FILE *_screen) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,lj96);
|
||||
_screen,lj96,"k_lj96");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -133,19 +135,17 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) {
|
||||
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
this->k_pair.run(&this->atom->x, &lj1, &lj3,
|
||||
&_lj_types, &sp_lj, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(), &this->ans->force,
|
||||
&this->ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
|
||||
@ -15,14 +15,16 @@
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#include "lal_aux_fun1.h"
|
||||
texture<float4> pos_tex;
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{ return tex1Dfetch(pos_tex, i); }
|
||||
texture<float4> pos_tex;
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
#endif
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#endif
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__kernel void k_lj96(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__global numtyp4* lj3, const int lj_types,
|
||||
__global numtyp *sp_lj_in, __global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -102,7 +104,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__kernel void k_lj96_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__global numtyp4* lj3_in,
|
||||
__global numtyp* sp_lj_in,
|
||||
__global int *dev_nbor, __global int *dev_packed,
|
||||
@ -138,7 +140,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
@ -149,7 +151,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
|
||||
@ -13,10 +13,12 @@
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "lj_class2_long_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *lj_class2_long=0;
|
||||
#else
|
||||
#include "lj_class2_long_ptx.h"
|
||||
#include "lj_class2_long_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_lj_class2_long.h"
|
||||
@ -55,7 +57,7 @@ int LJClass2LongT::init(const int ntypes, double **host_cutsq,
|
||||
const double g_ewald) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,lj_class2_long);
|
||||
_screen,lj_class2_long,"k_lj_class2_long");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -143,22 +145,19 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
|
||||
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
|
||||
this->k_pair.run(&this->atom->x, &lj1, &lj3,
|
||||
&_lj_types, &sp_lj, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(), &this->ans->force,
|
||||
&this->ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->atom->q, &_cut_coulsq,
|
||||
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
|
||||
@ -14,18 +14,22 @@
// ***************************************************************************/

#ifdef NV_KERNEL

#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif

__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
#else
#define pos_tex x_
#define q_tex q_
#endif

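This hunk drops the per-file fetch_pos/fetch_q helpers in favour of the shared fetch4/fetch macros used with pos_tex/q_tex: on the CUDA path they resolve to texture reads, and on the host/OpenCL path the "#define pos_tex x_" / "#define q_tex q_" branch makes them plain array loads. A hypothetical fall-back definition in that spirit (the real macros live in lal_preprocessor.h and may differ):

// Hypothetical non-texture fall-back: with pos_tex defined as x_ and q_tex as
// q_, a "fetch" is just an ordinary global-memory load.
#define fetch4(ans, i, tex_or_array) ans = tex_or_array[i]
#define fetch(ans, i, tex_or_array)  ans = tex_or_array[i]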
__kernel void k_lj_class2_long(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__global numtyp4* lj3, const int lj_types,
|
||||
__global numtyp *sp_lj_in, __global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -101,7 +105,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
numtyp expm2 = ucl_exp(-grij*grij);
|
||||
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
|
||||
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
|
||||
fetch(prefactor,j,q_tex);
|
||||
prefactor *= qqrd2e * qtmp/r;
|
||||
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
|
||||
} else
|
||||
forcecoul = (numtyp)0.0;
|
||||
@ -136,7 +141,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__kernel void k_lj_class2_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__global numtyp4* lj3_in,
|
||||
__global numtyp* sp_lj_in,
|
||||
__global int *dev_nbor, __global int *dev_packed,
|
||||
@ -175,8 +180,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
@ -188,7 +193,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -215,7 +220,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
numtyp expm2 = ucl_exp(-grij*grij);
|
||||
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
|
||||
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
|
||||
fetch(prefactor,j,q_tex);
|
||||
prefactor *= qqrd2e * qtmp/r;
|
||||
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
|
||||
} else
|
||||
forcecoul = (numtyp)0.0;
|
||||
|
||||
@ -13,10 +13,12 @@
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "lj_coul_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *lj_coul=0;
|
||||
#else
|
||||
#include "lj_coul_ptx.h"
|
||||
#include "lj_coul_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_lj_coul.h"
|
||||
@ -54,7 +56,7 @@ int LJCoulT::init(const int ntypes,
|
||||
double *host_special_coul, const double qqrd2e) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,lj_coul);
|
||||
_screen,lj_coul,"k_lj_coul");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -145,23 +147,18 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch,
|
||||
&this->atom->dev_q.begin(), &cutsq.begin(),
|
||||
&_qqrd2e, &this->_threads_per_atom);
|
||||
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&cutsq, &_qqrd2e, &this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->atom->dev_q.begin(),
|
||||
&cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
|
||||
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&cutsq, &_qqrd2e, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
}
|
||||
|
||||
@ -14,18 +14,22 @@
|
||||
// ***************************************************************************/
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{ return tex1Dfetch(pos_tex, i); }
|
||||
ucl_inline float fetch_q(const int& i, const float *q)
|
||||
{ return tex1Dfetch(q_tex, i); }
|
||||
#endif
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
#endif
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#define q_tex q_
|
||||
#endif
|
||||
|
||||
__kernel void k_lj_coul(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__global numtyp4* lj3, const int lj_types,
|
||||
__global numtyp *sp_lj_in, __global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
factor_coul = sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -93,9 +97,10 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} else
force_lj = (numtyp)0.0;

if (rsq < lj1[mtype].w)
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
else
if (rsq < lj1[mtype].w) {
fetch(forcecoul,j,q_tex);
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
} else
forcecoul = (numtyp)0.0;

force = (force_lj + forcecoul) * r2inv;
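In these kernels the packed neighbor index carries the special-bond class in its high bits: sbmask(j) selects the 1-2/1-3/1-4 scaling factors (sp_lj[0..3] for LJ, sp_lj[4..7] for Coulomb) and "j &= NEIGHMASK" strips those bits before the position fetch. A sketch assuming the usual LAMMPS packing of two flag bits above a 30-bit index (the constants below are assumptions, not taken from this commit):

// Assumed packing: two special-bond bits sit above a 30-bit neighbor index;
// the actual SBBITS/NEIGHMASK values come from the library headers.
const int SBBITS    = 30;
const int NEIGHMASK = 0x3FFFFFFF;

inline int sbmask(int j) { return (j >> SBBITS) & 3; }

// Mirror of the kernel pattern: pick the LJ and Coulomb scaling factors,
// then strip the flag bits to get the real atom index.
void unpack_neighbor(int packed_j, const double sp_lj[8],
                     double &factor_lj, double &factor_coul, int &j) {
  factor_lj   = sp_lj[sbmask(packed_j)];
  factor_coul = sp_lj[sbmask(packed_j) + 4];
  j = packed_j & NEIGHMASK;
}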
@ -127,7 +132,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__kernel void k_lj_coul_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__global numtyp4* lj3_in,
|
||||
__global numtyp* sp_lj_in,
|
||||
__global int *dev_nbor, __global int *dev_packed,
|
||||
@ -168,8 +173,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
@ -181,7 +186,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
factor_coul = sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -200,9 +205,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
} else
|
||||
force_lj = (numtyp)0.0;
|
||||
|
||||
if (rsq < lj1[mtype].w)
|
||||
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
|
||||
else
|
||||
if (rsq < lj1[mtype].w) {
|
||||
fetch(forcecoul,j,q_tex);
|
||||
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
|
||||
} else
|
||||
forcecoul = (numtyp)0.0;
|
||||
|
||||
force = (force_lj + forcecoul) * r2inv;
|
||||
|
||||
@ -13,10 +13,12 @@
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "lj_coul_long_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *lj_coul_long=0;
|
||||
#else
|
||||
#include "lj_coul_long_ptx.h"
|
||||
#include "lj_coul_long_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_lj_coul_long.h"
|
||||
@ -55,7 +57,7 @@ int LJCoulLongT::init(const int ntypes,
|
||||
const double g_ewald) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,lj_coul_long);
|
||||
_screen,lj_coul_long,"k_lj_coul_long");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -143,22 +145,19 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
|
||||
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
|
||||
this->k_pair.run(&this->atom->x, &lj1, &lj3,
|
||||
&_lj_types, &sp_lj, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(), &this->ans->force,
|
||||
&this->ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->atom->q, &_cut_coulsq,
|
||||
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
|
||||
@ -14,18 +14,22 @@
|
||||
// ***************************************************************************/
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{ return tex1Dfetch(pos_tex, i); }
|
||||
ucl_inline float fetch_q(const int& i, const float *q)
|
||||
{ return tex1Dfetch(q_tex, i); }
|
||||
#endif
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
#endif
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#define q_tex q_
|
||||
#endif
|
||||
|
||||
__kernel void k_lj_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__global numtyp4* lj3, const int lj_types,
|
||||
__global numtyp *sp_lj_in, __global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int itype=ix.w;
|
||||
|
||||
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||
@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -99,7 +103,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
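The _erfc expression above is the usual rational-polynomial approximation of erfc(grij) times exp(-grij*grij) used for the real-space Ewald term. A host-side check of that form against the library erfc (the constants below are the standard Abramowitz and Stegun 7.1.26 values and are an assumption here, since the kernels take EWALD_P and A1..A5 from their own definitions):

#include <cmath>
#include <cstdio>

// Assumed constants; the kernels define EWALD_P/A1..A5 elsewhere.
const double EWALD_P = 0.3275911;
const double A1 = 0.254829592, A2 = -0.284496736, A3 = 1.421413741,
             A4 = -1.453152027, A5 = 1.061405429;

double erfc_approx(double x) {
  const double expm2 = std::exp(-x*x);
  const double t = 1.0 / (1.0 + EWALD_P*x);
  return t * (A1 + t*(A2 + t*(A3 + t*(A4 + t*A5)))) * expm2;
}

int main() {
  for (double x = 0.1; x < 3.0; x += 0.5)
    std::printf("x=%.1f approx=%.8f erfc=%.8f\n", x, erfc_approx(x), std::erfc(x));
  return 0;
}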
@ -134,7 +139,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__kernel void k_lj_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__global numtyp4* lj3_in,
|
||||
__global numtyp* sp_lj_in,
|
||||
__global int *dev_nbor, __global int *dev_packed,
|
||||
@ -173,8 +178,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp qtmp=fetch_q(i,q_);
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
@ -186,7 +191,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -211,7 +216,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
numtyp expm2 = ucl_exp(-grij*grij);
|
||||
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
|
||||
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
|
||||
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
|
||||
fetch(prefactor,j,q_tex);
|
||||
prefactor *= qqrd2e * qtmp/r;
|
||||
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
|
||||
} else
|
||||
forcecoul = (numtyp)0.0;
|
||||
|
||||
@ -13,10 +13,12 @@
|
||||
email : ibains@nvidia.com
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "lj_expand_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *lj_expand=0;
|
||||
#else
|
||||
#include "lj_expand_ptx.h"
|
||||
#include "lj_expand_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_lj_expand.h"
|
||||
@ -51,7 +53,7 @@ int LJExpandT::init(const int ntypes, double **host_cutsq,
|
||||
const double gpu_split, FILE *_screen) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,lj_expand);
|
||||
_screen,lj_expand,"k_lj_expand");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -133,20 +135,17 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
}
|
||||
|
||||
@ -14,15 +14,19 @@
|
||||
// ***************************************************************************/
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
texture<float4> pos_tex;
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{ return tex1Dfetch(pos_tex, i); }
|
||||
#endif
|
||||
texture<float4> pos_tex;
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
#endif
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#endif
|
||||
|
||||
__kernel void k_lj_expand(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__global numtyp4* lj3, const int lj_types,
|
||||
__global numtyp *sp_lj_in, __global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
@ -51,7 +55,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
@ -61,7 +65,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -104,7 +108,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__kernel void k_lj_expand_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__global numtyp4* lj3_in,
|
||||
__global numtyp* sp_lj_in,
|
||||
__global int *dev_nbor, __global int *dev_packed,
|
||||
@ -140,7 +144,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
@ -151,7 +155,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
|
||||
@ -13,10 +13,12 @@
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "morse_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *morse=0;
|
||||
#else
|
||||
#include "morse_ptx.h"
|
||||
#include "morse_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_morse.h"
|
||||
@ -51,7 +53,7 @@ int MorseT::init(const int ntypes,
|
||||
const double gpu_split, FILE *_screen) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,morse);
|
||||
_screen,morse,"k_morse");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -132,20 +134,17 @@ void MorseT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &mor1.begin(),
|
||||
&mor2.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
this->k_pair_fast.run(&this->atom->x, &mor1, &mor2, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &mor1.begin(), &mor2.begin(),
|
||||
&_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->k_pair.run(&this->atom->x, &mor1, &mor2, &_types, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
}
|
||||
|
||||
@ -14,15 +14,19 @@
|
||||
// ***************************************************************************/
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "lal_aux_fun1.h"
|
||||
texture<float4> pos_tex;
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{ return tex1Dfetch(pos_tex, i); }
|
||||
#endif
|
||||
texture<float4> pos_tex;
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
#endif
|
||||
|
||||
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#endif
|
||||
|
||||
__kernel void k_morse(__global numtyp4 *x_, __global numtyp4 *mor1,
|
||||
__global numtyp2* mor2, const int lj_types,
|
||||
__global numtyp *sp_lj_in, __global int *dev_nbor,
|
||||
__global int *dev_packed, __global acctyp4 *ans,
|
||||
@ -51,7 +55,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
@ -61,7 +65,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -102,7 +106,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
|
||||
__kernel void k_morse_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
|
||||
__global numtyp2* mor2_in,
|
||||
__global numtyp* sp_lj_in,
|
||||
__global int *dev_nbor, __global int *dev_packed,
|
||||
@ -138,7 +142,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
@ -149,7 +153,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=fetch_pos(j,x_); //x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
|
||||
@ -84,7 +84,7 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
_max_atoms=1000;

_max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
_max_nbors=max_nbors;
_max_nbors=(max_nbors/threads_per_atom+1)*threads_per_atom;

_maxspecial=maxspecial;
if (gpu_nbor==0)
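The replaced line pads the per-atom neighbor capacity up to a multiple of threads_per_atom, so the strided kernel loops (nbor += n_stride) divide each atom's neighbor block evenly across the threads that share it. The rounding as a standalone helper (note that, like the source expression, it always adds at least one extra stride):

// Mirrors _max_nbors=(max_nbors/threads_per_atom+1)*threads_per_atom above.
int pad_nbors(int max_nbors, int threads_per_atom) {
  return (max_nbors / threads_per_atom + 1) * threads_per_atom;
}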
@ -124,17 +124,14 @@ void Neighbor::alloc(bool &success) {
|
||||
_c_bytes+=dev_packed.row_bytes();
|
||||
}
|
||||
if (_max_host>0) {
|
||||
host_nbor.clear();
|
||||
dev_host_nbor.clear();
|
||||
dev_host_numj.clear();
|
||||
nbor_host.clear();
|
||||
dev_numj_host.clear();
|
||||
host_ilist.clear();
|
||||
host_jlist.clear();
|
||||
|
||||
success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev,
|
||||
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
|
||||
success=success && (dev_host_nbor.alloc(_max_nbors*_max_host,
|
||||
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||
success=success && (dev_host_numj.alloc(_max_host,*dev,
|
||||
success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_RW_OPTIMIZED,
|
||||
UCL_WRITE_ONLY)==UCL_SUCCESS) && success;
|
||||
success=success && (dev_numj_host.alloc(_max_host,*dev,
|
||||
UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||
success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
|
||||
if (!success)
|
||||
@ -145,16 +142,16 @@ void Neighbor::alloc(bool &success) {
|
||||
UCL_NOT_PINNED)==UCL_SUCCESS);
|
||||
if (!success)
|
||||
return;
|
||||
int *ptr=host_nbor.begin();
|
||||
int *ptr=nbor_host.host.begin();
|
||||
for (int i=0; i<_max_host; i++) {
|
||||
host_jlist[i]=ptr;
|
||||
ptr+=_max_nbors;
|
||||
}
|
||||
_c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes();
|
||||
_c_bytes+=nbor_host.device.row_bytes()+dev_numj_host.row_bytes();
|
||||
} else {
|
||||
// Some OpenCL implementations return errors for NULL pointers as args
|
||||
dev_host_nbor.view(dev_nbor);
|
||||
dev_host_numj.view(dev_nbor);
|
||||
nbor_host.device.view(dev_nbor);
|
||||
dev_numj_host.view(dev_nbor);
|
||||
}
|
||||
if (_maxspecial>0) {
|
||||
dev_nspecial.clear();
|
||||
@ -194,10 +191,9 @@ void Neighbor::clear() {
|
||||
host_packed.clear();
|
||||
host_acc.clear();
|
||||
dev_nbor.clear();
|
||||
dev_host_nbor.clear();
|
||||
nbor_host.clear();
|
||||
dev_packed.clear();
|
||||
host_nbor.clear();
|
||||
dev_host_numj.clear();
|
||||
dev_numj_host.clear();
|
||||
host_ilist.clear();
|
||||
host_jlist.clear();
|
||||
dev_nspecial.clear();
|
||||
@ -215,8 +211,8 @@ void Neighbor::clear() {
|
||||
double Neighbor::host_memory_usage() const {
|
||||
if (_gpu_nbor>0) {
|
||||
if (_gpu_host)
|
||||
return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+
|
||||
host_jlist.row_bytes();
|
||||
return nbor_host.device.row_bytes()*nbor_host.rows()+
|
||||
host_ilist.row_bytes()+host_jlist.row_bytes();
|
||||
else
|
||||
return 0;
|
||||
} else
|
||||
@ -285,8 +281,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
|
||||
block_size));
|
||||
_shared->k_nbor.set_size(GX,block_size);
|
||||
_shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum,
|
||||
&_threads_per_atom);
|
||||
_shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom);
|
||||
time_kernel.stop();
|
||||
}
|
||||
}
|
||||
@ -295,31 +290,23 @@ template <class numtyp, class acctyp>
|
||||
void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
|
||||
if (maxn>_max_nbors) {
|
||||
int mn=static_cast<int>(static_cast<double>(maxn)*1.10);
|
||||
dev_nbor.clear();
|
||||
success=success &&
|
||||
(dev_nbor.alloc((mn+1)*_max_atoms,*dev)==UCL_SUCCESS);
|
||||
mn=(mn/_threads_per_atom+1)*_threads_per_atom;
|
||||
success=success && (dev_nbor.resize((mn+1)*_max_atoms)==UCL_SUCCESS);
|
||||
_gpu_bytes=dev_nbor.row_bytes();
|
||||
if (_max_host>0) {
|
||||
host_nbor.clear();
|
||||
dev_host_nbor.clear();
|
||||
success=success && (host_nbor.alloc(mn*_max_host,*dev,
|
||||
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
|
||||
success=success && (dev_host_nbor.alloc(mn*_max_host,
|
||||
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||
int *ptr=host_nbor.begin();
|
||||
success=success && (nbor_host.resize(mn*_max_host)==UCL_SUCCESS);
|
||||
int *ptr=nbor_host.host.begin();
|
||||
for (int i=0; i<_max_host; i++) {
|
||||
host_jlist[i]=ptr;
|
||||
ptr+=mn;
|
||||
}
|
||||
_gpu_bytes+=dev_host_nbor.row_bytes();
|
||||
_gpu_bytes+=nbor_host.row_bytes();
|
||||
} else {
|
||||
dev_host_nbor.view(dev_nbor);
|
||||
dev_host_numj.view(dev_nbor);
|
||||
nbor_host.device.view(dev_nbor);
|
||||
dev_numj_host.view(dev_nbor);
|
||||
}
|
||||
if (_alloc_packed) {
|
||||
dev_packed.clear();
|
||||
success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
success=success && (dev_packed.resize((mn+2)*_max_atoms)==UCL_SUCCESS);
|
||||
_gpu_bytes+=dev_packed.row_bytes();
|
||||
}
|
||||
_max_nbors=mn;
|
||||
@ -337,16 +324,12 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,

// Calculate number of cells and allocate storage for binning as necessary
int ncellx, ncelly, ncellz, ncell_3d;
ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) +
2.0*_cell_size)/_cell_size));
ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) +
2.0*_cell_size)/_cell_size));
ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) +
2.0*_cell_size)/_cell_size));
int ghost_cells=2*_cells_in_cutoff;
ncellx = static_cast<int>(ceil((subhi[0]-sublo[0])/_cell_size))+ghost_cells;
ncelly = static_cast<int>(ceil((subhi[1]-sublo[1])/_cell_size))+ghost_cells;
ncellz = static_cast<int>(ceil((subhi[2]-sublo[2])/_cell_size))+ghost_cells;
ncell_3d = ncellx * ncelly * ncellz;
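The replacement sizing adds cells_in_cutoff ghost-cell layers on each side of the subdomain instead of the fixed one-cell padding implied by the old 2.0*_cell_size term. A small self-contained sketch with illustrative numbers:

#include <cmath>
// Padded cell count along one dimension: interior cells plus ghost layers on
// both sides. Example: extent 24.0, cell size 4.0, cutoff 10.0.
inline int padded_cells(double extent, double cell_size, double cutoff) {
  int cells_in_cutoff = static_cast<int>(std::ceil(cutoff / cell_size)); // 3
  int ghost_cells = 2 * cells_in_cutoff;                                 // 6
  return static_cast<int>(std::ceil(extent / cell_size)) + ghost_cells;  // 12
}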
|
||||
if (ncell_3d+1>_ncells) {
|
||||
dev_cell_counts.clear();
|
||||
dev_cell_counts.alloc(ncell_3d+1,dev_nbor);
|
||||
if (_gpu_nbor==2) {
|
||||
if (_ncells>0) {
|
||||
host_cell_counts.clear();
|
||||
@ -355,11 +338,19 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||
cell_iter = new int[ncell_3d+1];
|
||||
host_cell_counts.alloc(ncell_3d+1,dev_nbor);
|
||||
}
|
||||
|
||||
if (_gpu_nbor==2 && atom.host_view())
|
||||
dev_cell_counts.view(host_cell_counts);
|
||||
else {
|
||||
dev_cell_counts.clear();
|
||||
dev_cell_counts.alloc(ncell_3d+1,dev_nbor);
|
||||
}
|
||||
|
||||
_ncells=ncell_3d+1;
|
||||
_cell_bytes=dev_cell_counts.row_bytes();
|
||||
}
|
||||
|
||||
const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
|
||||
const numtyp cutoff_cast=static_cast<numtyp>(_cutoff);
|
||||
|
||||
if (_maxspecial>0) {
|
||||
time_nbor.start();
|
||||
@ -379,8 +370,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
|
||||
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
|
||||
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
|
||||
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
|
||||
&_maxspecial,&nt);
|
||||
_shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt);
|
||||
time_transpose.stop();
|
||||
}
|
||||
|
||||
@ -392,28 +382,48 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||
|
||||
// Build cell list on CPU
|
||||
host_cell_counts.zero();
|
||||
double m_cell_size=-_cell_size;
|
||||
double dx=subhi[0]-sublo[0]+_cell_size;
|
||||
double dy=subhi[1]-sublo[1]+_cell_size;
|
||||
double dz=subhi[2]-sublo[2]+_cell_size;
|
||||
double i_cell_size=1.0/_cell_size;
|
||||
|
||||
for (int i=0; i<nall; i++) {
|
||||
int offset_hi=_cells_in_cutoff+1;
|
||||
for (int i=0; i<nt; i++) {
|
||||
double px, py, pz;
|
||||
px=x[i][0]-sublo[0];
|
||||
py=x[i][1]-sublo[1];
|
||||
pz=x[i][2]-sublo[2];
|
||||
if (px<m_cell_size) px=m_cell_size;
|
||||
if (py<m_cell_size) py=m_cell_size;
|
||||
if (pz<m_cell_size) pz=m_cell_size;
|
||||
if (px>dx) px=dx;
|
||||
if (py>dy) py=dy;
|
||||
if (pz>dz) pz=dz;
|
||||
|
||||
int id=static_cast<int>(px/_cell_size + 1.0) +
|
||||
static_cast<int>(py/_cell_size + 1.0) * ncellx +
|
||||
static_cast<int>(pz/_cell_size + 1.0) * ncellx * ncelly;
|
||||
int ix = static_cast<int>(px*i_cell_size+1);
|
||||
ix = std::max(ix,_cells_in_cutoff);
|
||||
ix = std::min(ix,ncellx-offset_hi);
|
||||
int iy = static_cast<int>(py*i_cell_size+1);
|
||||
iy = std::max(iy,_cells_in_cutoff);
|
||||
iy = std::min(iy,ncelly-offset_hi);
|
||||
int iz = static_cast<int>(pz*i_cell_size+1);
|
||||
iz = std::max(iz,_cells_in_cutoff);
|
||||
iz = std::min(iz,ncellz-offset_hi);
|
||||
|
||||
cell_id[i]=id;
|
||||
int id = ix+iy*ncellx+iz*ncellx*ncelly;
|
||||
cell_id[i] = id;
|
||||
host_cell_counts[id+1]++;
|
||||
}
|
||||
|
||||
for (int i=nt; i<nall; i++) {
|
||||
double px, py, pz;
|
||||
px=x[i][0]-sublo[0];
|
||||
py=x[i][1]-sublo[1];
|
||||
pz=x[i][2]-sublo[2];
|
||||
|
||||
int ix = static_cast<int>(px*i_cell_size+1);
|
||||
ix = std::max(ix,0);
|
||||
ix = std::min(ix,ncellx-1);
|
||||
int iy = static_cast<int>(py*i_cell_size+1);
|
||||
iy = std::max(iy,0);
|
||||
iy = std::min(iy,ncelly-1);
|
||||
int iz = static_cast<int>(pz*i_cell_size+1);
|
||||
iz = std::max(iz,0);
|
||||
iz = std::min(iz,ncellz-1);
|
||||
|
||||
int id = ix+iy*ncellx+iz*ncellx*ncelly;
|
||||
cell_id[i] = id;
|
||||
host_cell_counts[id+1]++;
|
||||
}
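Both loops above bin atoms with the same flattened index; owned atoms are clamped into the interior cells while ghost atoms may land in the outer layers. A hedged, self-contained sketch of that mapping (function and parameter names are ours):

#include <algorithm>
// Shifted position -> clamped cell coordinates -> flattened cell id.
// Owned atoms use lo=cells_in_cutoff, hi_margin=cells_in_cutoff+1;
// ghost atoms use lo=0, hi_margin=1, matching the two loops above.
inline int flat_cell_id(double px, double py, double pz, double i_cell_size,
                        int ncellx, int ncelly, int ncellz,
                        int lo, int hi_margin) {
  int ix = static_cast<int>(px * i_cell_size + 1);
  ix = std::min(std::max(ix, lo), ncellx - hi_margin);
  int iy = static_cast<int>(py * i_cell_size + 1);
  iy = std::min(std::max(iy, lo), ncelly - hi_margin);
  int iz = static_cast<int>(pz * i_cell_size + 1);
  iz = std::min(std::max(iz, lo), ncellz - hi_margin);
  return ix + iy * ncellx + iz * ncellx * ncelly;
}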
|
||||
|
||||
@ -451,41 +461,39 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||
time_kernel.start();
|
||||
|
||||
_nbor_pitch=inum;
|
||||
_shared->neigh_tex.bind_float(atom.dev_x,4);
|
||||
_shared->neigh_tex.bind_float(atom.x,4);
|
||||
|
||||
// If binning on GPU, do this now
|
||||
if (_gpu_nbor==1) {
|
||||
const numtyp i_cell_size=static_cast<numtyp>(1.0/_cell_size);
|
||||
const int neigh_block=_block_cell_id;
|
||||
const int GX=(int)ceil((float)nall/neigh_block);
|
||||
const numtyp sublo0=static_cast<numtyp>(sublo[0]);
|
||||
const numtyp sublo1=static_cast<numtyp>(sublo[1]);
|
||||
const numtyp sublo2=static_cast<numtyp>(sublo[2]);
|
||||
const numtyp subhi0=static_cast<numtyp>(subhi[0]);
|
||||
const numtyp subhi1=static_cast<numtyp>(subhi[1]);
|
||||
const numtyp subhi2=static_cast<numtyp>(subhi[2]);
|
||||
_shared->k_cell_id.set_size(GX,neigh_block);
|
||||
_shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
|
||||
&atom.dev_particle_id.begin(),
|
||||
&sublo0, &sublo1, &sublo2, &subhi0, &subhi1,
|
||||
&subhi2, &cell_size_cast, &ncellx, &ncelly, &nall);
|
||||
_shared->k_cell_id.run(&atom.x, &atom.dev_cell_id,
|
||||
&atom.dev_particle_id, &sublo0, &sublo1,
|
||||
&sublo2, &i_cell_size, &ncellx, &ncelly, &ncellz,
|
||||
&nt, &nall, &_cells_in_cutoff);
|
||||
|
||||
atom.sort_neighbor(nall);
|
||||
|
||||
/* calculate cell count */
|
||||
_shared->k_cell_counts.set_size(GX,neigh_block);
|
||||
_shared->k_cell_counts.run(&atom.dev_cell_id.begin(),
|
||||
&dev_cell_counts.begin(), &nall, &ncell_3d);
|
||||
_shared->k_cell_counts.run(&atom.dev_cell_id, &dev_cell_counts, &nall,
|
||||
&ncell_3d);
|
||||
}
|
||||
|
||||
/* build the neighbor list */
|
||||
const int cell_block=_block_nbor_build;
|
||||
_shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
|
||||
_shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
|
||||
&dev_cell_counts.begin(), &dev_nbor.begin(),
|
||||
&dev_host_nbor.begin(), &dev_host_numj.begin(),
|
||||
&_max_nbors,&cell_size_cast,
|
||||
&ncellx, &ncelly, &ncellz, &inum, &nt, &nall,
|
||||
&_threads_per_atom);
|
||||
_shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)*
|
||||
(ncellz-ghost_cells),cell_block,1);
|
||||
_shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id,
|
||||
&dev_cell_counts, &dev_nbor, &nbor_host,
|
||||
&dev_numj_host, &_max_nbors, &cutoff_cast, &ncellx,
|
||||
&ncelly, &ncellz, &inum, &nt, &nall,
|
||||
&_threads_per_atom, &_cells_in_cutoff);
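With the guaranteed ghost layers, the build kernel is now launched over interior cells only: the grid above is (ncellx-ghost_cells) by (ncelly-ghost_cells)*(ncellz-ghost_cells), and the kernel (calc_neigh_list_cell, later in this change) adds cells_in_cutoff back to recover the real cell coordinates. A sketch of that launch geometry (helper name is ours):

// One block per interior (non-ghost) cell; y and z are folded into the second
// grid dimension, matching the set_size call above.
inline void interior_grid(int ncellx, int ncelly, int ncellz,
                          int cells_in_cutoff, int &gx, int &gy) {
  int ghost_cells = 2 * cells_in_cutoff;
  gx = ncellx - ghost_cells;
  gy = (ncelly - ghost_cells) * (ncellz - ghost_cells);
}
// Inside the kernel: ix = BLOCK_ID_X + cells_in_cutoff,
//   iy = BLOCK_ID_Y % (ncelly - 2*cells_in_cutoff) + cells_in_cutoff, etc.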
|
||||
|
||||
/* Get the maximum number of nbors and realloc if necessary */
|
||||
UCL_D_Vec<int> numj;
|
||||
@ -494,7 +502,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||
if (nt>inum) {
|
||||
UCL_H_Vec<int> host_offset;
|
||||
host_offset.view_offset(inum,host_acc,nt-inum);
|
||||
ucl_copy(host_offset,dev_host_numj,nt-inum,true);
|
||||
ucl_copy(host_offset,dev_numj_host,nt-inum,true);
|
||||
}
|
||||
|
||||
if (_gpu_nbor!=2) {
|
||||
@ -521,17 +529,16 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||
const int GX2=static_cast<int>(ceil(static_cast<double>
|
||||
(nt*_threads_per_atom)/cell_block));
|
||||
_shared->k_special.set_size(GX2,cell_block);
|
||||
_shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
|
||||
&dev_host_numj.begin(), &atom.dev_tag.begin(),
|
||||
&dev_nspecial.begin(), &dev_special.begin(),
|
||||
_shared->k_special.run(&dev_nbor, &nbor_host, &dev_numj_host,
|
||||
&atom.dev_tag, &dev_nspecial, &dev_special,
|
||||
&inum, &nt, &_max_nbors, &_threads_per_atom);
|
||||
}
|
||||
time_kernel.stop();
|
||||
|
||||
time_nbor.start();
|
||||
if (inum<nt) {
|
||||
ucl_copy(host_nbor,dev_host_nbor,true);
|
||||
host_nbor.sync();
|
||||
nbor_host.update_host(true);
|
||||
nbor_host.sync();
|
||||
}
|
||||
time_nbor.stop();
|
||||
}
|
||||
|
||||
@ -22,20 +22,6 @@
|
||||
|
||||
#define IJ_SIZE 131072
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
|
||||
#include "geryon/ocl_timer.h"
|
||||
#include "geryon/ocl_mat.h"
|
||||
using namespace ucl_opencl;
|
||||
|
||||
#else
|
||||
|
||||
#include "geryon/nvd_timer.h"
|
||||
#include "geryon/nvd_mat.h"
|
||||
using namespace ucl_cudadr;
|
||||
|
||||
#endif
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
|
||||
class Neighbor {
|
||||
@ -70,7 +56,14 @@ class Neighbor {
|
||||
const int warp_size, const bool time_device);

/// Set the size of the cutoff+skin
inline void cell_size(const double size) { _cell_size=size; }
inline void cell_size(const double size, const double cutoff) {
_cell_size=size;
_cutoff=cutoff;
if (cutoff>size)
_cells_in_cutoff=static_cast<int>(ceil(cutoff/size));
else
_cells_in_cutoff=1;
}

/// Get the size of the cutoff+skin
inline double cell_size() const { return _cell_size; }
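A worked example for the two-argument setter above (values illustrative): with cutoff 10.5 and cell size 4.0, ceil(10.5/4.0) = ceil(2.625) = 3, so neighbor builds must search three cell layers in each direction; when the cutoff fits inside one cell, _cells_in_cutoff stays at 1.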
|
||||
@ -203,14 +196,11 @@ class Neighbor {
|
||||
|
||||
// ----------------- Data for GPU Neighbor Calculation ---------------
|
||||
|
||||
/// Host storage for device calculated neighbor lists
|
||||
/** Same storage format as device matrix **/
|
||||
UCL_H_Vec<int> host_nbor;
|
||||
/// Device storage for neighbor list matrix that will be copied to host
|
||||
/// Host/Device storage for device calculated neighbor lists
|
||||
/** - 1st row is numj
|
||||
* - Remaining rows are by atom, columns are nbors **/
|
||||
UCL_D_Vec<int> dev_host_nbor;
|
||||
UCL_D_Vec<int> dev_host_numj;
|
||||
UCL_Vector<int,int> nbor_host;
|
||||
UCL_D_Vec<int> dev_numj_host;
|
||||
UCL_H_Vec<int> host_ilist;
|
||||
UCL_H_Vec<int*> host_jlist;
|
||||
/// Device storage for special neighbor counts
|
||||
@ -232,13 +222,14 @@ class Neighbor {
|
||||
bool _allocated, _use_packing, _nbor_time_avail, _time_device;
|
||||
int _gpu_nbor, _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
|
||||
bool _gpu_host, _alloc_packed;
|
||||
double _cell_size, _bin_time;
|
||||
double _cutoff, _cell_size, _bin_time;
|
||||
|
||||
double _gpu_bytes, _c_bytes, _cell_bytes;
|
||||
void alloc(bool &success);
|
||||
|
||||
int _block_cell_2d, _block_cell_id, _max_block_nbor_build, _block_nbor_build;
|
||||
int _ncells, _threads_per_atom, _total_atoms;
|
||||
int _cells_in_cutoff;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
inline void resize_max_neighbors(const int maxn, bool &success);
|
||||
|
||||
@ -16,38 +16,48 @@
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
#include "lal_preprocessor.h"
|
||||
texture<float4> neigh_tex;
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{ return tex1Dfetch(neigh_tex, i); }
|
||||
texture<float4> pos_tex;
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
#endif
|
||||
|
||||
__kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id,
|
||||
numtyp boxlo0,
|
||||
numtyp boxlo1, numtyp boxlo2, numtyp boxhi0,
|
||||
numtyp boxhi1, numtyp boxhi2, numtyp cell_size,
|
||||
int ncellx, int ncelly, int nall) {
|
||||
numtyp boxlo0, numtyp boxlo1, numtyp boxlo2,
|
||||
numtyp i_cell_size, int ncellx, int ncelly,
|
||||
int ncellz, int inum, int nall,
|
||||
int cells_in_cutoff) {
|
||||
int i = threadIdx.x + blockIdx.x*blockDim.x;
|
||||
|
||||
if (i < nall) {
|
||||
numtyp4 p = fetch_pos(i,pos); //pos[i];
|
||||
numtyp4 p;
|
||||
fetch4(p,i,pos_tex); //pos[i];
|
||||
|
||||
p.x -= boxlo0;
|
||||
p.y -= boxlo1;
|
||||
p.z -= boxlo2;
|
||||
|
||||
p.x = fmaxf(p.x, -cell_size);
|
||||
p.x = fminf(p.x, boxhi0-boxlo0+cell_size);
|
||||
p.y = fmaxf(p.y, -cell_size);
|
||||
p.y = fminf(p.y, boxhi1-boxlo1+cell_size);
|
||||
p.z = fmaxf(p.z, -cell_size);
|
||||
p.z = fminf(p.z, boxhi2-boxlo2+cell_size);
|
||||
int ix = int(p.x*i_cell_size+cells_in_cutoff);
|
||||
int iy = int(p.y*i_cell_size+cells_in_cutoff);
|
||||
int iz = int(p.z*i_cell_size+cells_in_cutoff);
|
||||
|
||||
unsigned int id = (unsigned int)(p.x/cell_size + 1.0)
|
||||
+ (unsigned int)(p.y/cell_size + 1.0) * ncellx
|
||||
+ (unsigned int)(p.z/cell_size + 1.0) * ncellx * ncelly;
|
||||
int offset_lo, offset_hi;
|
||||
if (i<inum) {
|
||||
offset_lo=cells_in_cutoff;
|
||||
offset_hi=cells_in_cutoff+1;
|
||||
} else {
|
||||
offset_lo=0;
|
||||
offset_hi=1;
|
||||
}
|
||||
|
||||
cell_id[i] = id;
|
||||
ix = max(ix,offset_lo);
|
||||
ix = min(ix,ncellx-offset_hi);
|
||||
iy = max(iy,offset_lo);
|
||||
iy = min(iy,ncelly-offset_hi);
|
||||
iz = max(iz,offset_lo);
|
||||
iz = min(iz,ncellz-offset_hi);
|
||||
|
||||
cell_id[i] = ix+iy*ncellx+iz*ncellx*ncelly;
|
||||
particle_id[i] = i;
|
||||
}
|
||||
}
|
||||
@ -78,6 +88,8 @@ __kernel void kernel_calc_cell_counts(unsigned *cell_id,
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#endif
|
||||
|
||||
|
||||
@ -113,12 +125,13 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
|
||||
__global int *host_numj,
|
||||
int neigh_bin_size, numtyp cell_size,
|
||||
int ncellx, int ncelly, int ncellz,
|
||||
int inum, int nt, int nall, int t_per_atom)
|
||||
int inum, int nt, int nall, int t_per_atom,
|
||||
int cells_in_cutoff)
|
||||
{
|
||||
int tid = THREAD_ID_X;
|
||||
int ix = BLOCK_ID_X;
|
||||
int iy = BLOCK_ID_Y % ncelly;
|
||||
int iz = BLOCK_ID_Y / ncelly;
|
||||
int ix = BLOCK_ID_X + cells_in_cutoff;
|
||||
int iy = BLOCK_ID_Y % (ncelly - cells_in_cutoff*2) + cells_in_cutoff;
|
||||
int iz = BLOCK_ID_Y / (ncelly - cells_in_cutoff*2) + cells_in_cutoff;
|
||||
int bsx = BLOCK_SIZE_X;
|
||||
|
||||
int icell = ix + iy*ncellx + iz*ncellx*ncelly;
|
||||
@ -129,9 +142,9 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
|
||||
int icell_begin = cell_counts[icell];
|
||||
int icell_end = cell_counts[icell+1];
|
||||
|
||||
int nborz0 = max(iz-1,0), nborz1 = min(iz+1, ncellz-1),
|
||||
nbory0 = max(iy-1,0), nbory1 = min(iy+1, ncelly-1),
|
||||
nborx0 = max(ix-1,0), nborx1 = min(ix+1, ncellx-1);
|
||||
int nborz0 = iz-cells_in_cutoff, nborz1 = iz+cells_in_cutoff,
|
||||
nbory0 = iy-cells_in_cutoff, nbory1 = iy+cells_in_cutoff,
|
||||
nborx0 = ix-cells_in_cutoff, nborx1 = ix+cells_in_cutoff;
|
||||
|
||||
numtyp4 diff;
|
||||
numtyp r2;
|
||||
@ -147,7 +160,7 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
|
||||
pid_i = cell_particle_id[i];
|
||||
|
||||
if (pid_i < nt) {
|
||||
atom_i = fetch_pos(pid_i,x_); //pos[pid_i];
|
||||
fetch4(atom_i,pid_i,pos_tex); //pos[i];
|
||||
}
|
||||
if (pid_i < inum) {
|
||||
stride=inum;
|
||||
@ -182,7 +195,7 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
|
||||
if (tid < end_idx) {
|
||||
pid_j = cell_particle_id[tid+k*bsx+jcell_begin];
|
||||
cell_list_sh[tid] = pid_j;
|
||||
atom_j = fetch_pos(pid_j,x_); //[pid_j];
|
||||
fetch4(atom_j,pid_j,pos_tex); //[pid_j];
|
||||
pos_sh[tid].x = atom_j.x;
|
||||
pos_sh[tid].y = atom_j.y;
|
||||
pos_sh[tid].z = atom_j.z;
|
||||
|
||||
@ -16,12 +16,15 @@
|
||||
#include "lal_precision.h"
|
||||
#include "lal_neighbor_shared.h"
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "neighbor_cpu_cl.h"
|
||||
#include "neighbor_gpu_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *neighbor_cpu=0;
|
||||
const char *neighbor_gpu=0;
|
||||
#else
|
||||
#include "neighbor_cpu_ptx.h"
|
||||
#include "neighbor_gpu_ptx.h"
|
||||
#include "neighbor_cpu_cubin.h"
|
||||
#include "neighbor_gpu_cubin.h"
|
||||
#endif
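This include block is the backend-selection pattern repeated for every style in this change: OpenCL builds embed the kernel source from a *_cl.h header, CUDA-runtime builds set the source strings to 0 (those kernels are presumably compiled and linked separately, e.g. via the bin2c step added to the Makefile), and the default CUDA-driver build embeds PTX and cubin images. A condensed sketch of the pattern with a placeholder style name:

#if defined(USE_OPENCL)
#include "style_cl.h"          /* kernel source embedded as a string */
#elif defined(USE_CUDART)
const char *style=0;           /* no embedded source under the runtime API */
#else
#include "style_ptx.h"         /* embedded PTX for the CUDA driver API */
#include "style_cubin.h"       /* embedded cubin image */
#endif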
|
||||
|
||||
using namespace LAMMPS_AL;
|
||||
@ -69,7 +72,7 @@ void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor) {
|
||||
k_build_nbor.set_function(*build_program,"calc_neigh_list_cell");
|
||||
k_transpose.set_function(*build_program,"transpose");
|
||||
k_special.set_function(*build_program,"kernel_special");
|
||||
neigh_tex.get_texture(*build_program,"neigh_tex");
|
||||
neigh_tex.get_texture(*build_program,"pos_tex");
|
||||
}
|
||||
_compiled=true;
|
||||
}
|
||||
|
||||
@ -16,18 +16,18 @@
|
||||
#ifndef LAL_NEIGHBOR_SHARED_H
|
||||
#define LAL_NEIGHBOR_SHARED_H
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
|
||||
#if defined(USE_OPENCL)
|
||||
#include "geryon/ocl_kernel.h"
|
||||
#include "geryon/ocl_texture.h"
|
||||
using namespace ucl_opencl;
|
||||
|
||||
#elif defined(USE_CUDART)
|
||||
#include "geryon/nvc_kernel.h"
|
||||
#include "geryon/nvc_texture.h"
|
||||
using namespace ucl_cudart;
|
||||
#else
|
||||
|
||||
#include "geryon/nvd_kernel.h"
|
||||
#include "geryon/nvd_texture.h"
|
||||
using namespace ucl_cudadr;
|
||||
|
||||
#endif
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
|
||||
@ -13,11 +13,14 @@
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "pppm_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *pppm_f=0;
|
||||
const char *pppm_d=0;
|
||||
#else
|
||||
#include "pppm_f_ptx.h"
|
||||
#include "pppm_d_ptx.h"
|
||||
#include "pppm_f_cubin.h"
|
||||
#include "pppm_d_cubin.h"
|
||||
#endif
|
||||
#include "lal_pppm.h"
|
||||
#include <cassert>
|
||||
@ -51,7 +54,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
|
||||
const int nylo_out, const int nzlo_out,
|
||||
const int nxhi_out, const int nyhi_out,
|
||||
const int nzhi_out, grdtyp **rho_coeff,
|
||||
grdtyp **vd_brick, const double slab_volfactor,
|
||||
grdtyp **vd_brick_p, const double slab_volfactor,
|
||||
const int nx_pppm, const int ny_pppm,
|
||||
const int nz_pppm, const bool split, int &flag) {
|
||||
_max_bytes=10;
|
||||
@ -92,8 +95,8 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
|
||||
time_interp.init(*ucl_device);
|
||||
time_interp.zero();
|
||||
|
||||
pos_tex.bind_float(atom->dev_x,4);
|
||||
q_tex.bind_float(atom->dev_q,1);
|
||||
pos_tex.bind_float(atom->x,4);
|
||||
q_tex.bind_float(atom->q,1);
|
||||
|
||||
_allocated=true;
|
||||
_max_bytes=0;
|
||||
@ -133,14 +136,12 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
|
||||
_npts_y=nyhi_out-nylo_out+1;
|
||||
_npts_z=nzhi_out-nzlo_out+1;
|
||||
_npts_yx=_npts_x*_npts_y;
|
||||
success=success && (d_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
|
||||
success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
|
||||
UCL_SUCCESS);
|
||||
success=success && (h_brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
|
||||
success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
|
||||
UCL_SUCCESS);
|
||||
success=success && (h_vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
|
||||
UCL_SUCCESS);
|
||||
*vd_brick=h_vd_brick.begin();
|
||||
_max_bytes+=d_brick.row_bytes();
|
||||
*vd_brick_p=vd_brick.host.begin();
|
||||
_max_bytes+=brick.device.row_bytes()+vd_brick.device.row_bytes();
|
||||
|
||||
// Allocate vector with count of atoms assigned to each grid point
|
||||
_nlocal_x=_npts_x+_nlower-_nupper;
|
||||
@ -158,20 +159,19 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
|
||||
_max_bytes+=d_brick_atoms.row_bytes();
|
||||
|
||||
// Allocate error flags for checking out of bounds atoms
|
||||
success=success && (h_error_flag.alloc(1,*ucl_device)==UCL_SUCCESS);
|
||||
success=success && (d_error_flag.alloc(1,*ucl_device,UCL_WRITE_ONLY)==
|
||||
UCL_SUCCESS);
|
||||
success=success && (error_flag.alloc(1,*ucl_device,UCL_RW_OPTIMIZED,
|
||||
UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||
if (!success) {
|
||||
flag=-3;
|
||||
return 0;
|
||||
}

d_error_flag.zero();
error_flag.device.zero();
_max_bytes+=1;

_cpu_idle_time=0.0;

return h_brick.begin();
return brick.host.begin();
}
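From here on, paired UCL_H_Vec/UCL_D_Vec members and their ucl_copy calls are replaced by single UCL_Vector containers that keep the host and device sides together. A sketch of the usage pattern as it appears in this change (the variable name is ours; only calls visible in this diff are used):

// One allocation sets up both sides; data moves with update_host/update_device.
UCL_Vector<int,int> flags;
flags.alloc(1,*ucl_device,UCL_RW_OPTIMIZED,UCL_WRITE_ONLY);
flags.device.zero();        // operate on either view explicitly
flags.update_host(true);    // asynchronous copy back to the host view
// after a sync, host-side indexing reads the mirrored value: flags[0]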
|
||||
|
||||
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
|
||||
@ -181,12 +181,10 @@ void PPPMT::clear(const double cpu_time) {
|
||||
_allocated=false;
|
||||
_precompute_done=false;
|
||||
|
||||
d_brick.clear();
|
||||
h_brick.clear();
|
||||
h_vd_brick.clear();
|
||||
brick.clear();
|
||||
vd_brick.clear();
|
||||
d_brick_counts.clear();
|
||||
h_error_flag.clear();
|
||||
d_error_flag.clear();
|
||||
error_flag.clear();
|
||||
d_brick_atoms.clear();
|
||||
|
||||
acc_timers();
|
||||
@ -269,11 +267,11 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
|
||||
|
||||
device->zero(d_brick_counts,d_brick_counts.numel());
|
||||
k_particle_map.set_size(GX,BX);
|
||||
k_particle_map.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &f_delvolinv,
|
||||
&ainum, &d_brick_counts.begin(), &d_brick_atoms.begin(),
|
||||
&_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv,
|
||||
&_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z,
|
||||
&_atom_stride, &_max_brick_atoms, &d_error_flag.begin());
|
||||
k_particle_map.run(&atom->x, &atom->q, &f_delvolinv, &ainum,
|
||||
&d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y,
|
||||
&_brick_z, &_delxinv, &_delyinv, &_delzinv, &_nlocal_x,
|
||||
&_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms,
|
||||
&error_flag);
|
||||
time_map.stop();
|
||||
|
||||
time_rho.start();
|
||||
@ -282,15 +280,14 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
|
||||
GX=static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/
|
||||
_block_pencils));
|
||||
k_make_rho.set_size(GX,BX);
|
||||
k_make_rho.run(&d_brick_counts.begin(), &d_brick_atoms.begin(),
|
||||
&d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride,
|
||||
&_npts_x, &_npts_y, &_npts_z, &_nlocal_x, &_nlocal_y,
|
||||
&_nlocal_z, &_order_m_1, &_order, &_order2);
|
||||
k_make_rho.run(&d_brick_counts, &d_brick_atoms, &brick, &d_rho_coeff,
|
||||
&_atom_stride, &_npts_x, &_npts_y, &_npts_z, &_nlocal_x,
|
||||
&_nlocal_y, &_nlocal_z, &_order_m_1, &_order, &_order2);
|
||||
time_rho.stop();
|
||||
|
||||
time_out.start();
|
||||
ucl_copy(h_brick,d_brick,_npts_yx*_npts_z,true);
|
||||
ucl_copy(h_error_flag,d_error_flag,true);
|
||||
brick.update_host(_npts_yx*_npts_z,true);
|
||||
error_flag.update_host(true);
|
||||
time_out.stop();
|
||||
|
||||
_precompute_done=true;
|
||||
@ -322,18 +319,17 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall,
|
||||
|
||||
_precompute_done=false;
|
||||
|
||||
if (h_error_flag[0]==2) {
|
||||
if (error_flag[0]==2) {
|
||||
// Not enough storage for atoms on the brick
|
||||
_max_brick_atoms*=2;
|
||||
d_error_flag.zero();
|
||||
d_brick_atoms.clear();
|
||||
d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,*ucl_device);
|
||||
error_flag.device.zero();
|
||||
d_brick_atoms.resize(_atom_stride*_max_brick_atoms);
|
||||
_max_bytes+=d_brick_atoms.row_bytes();
|
||||
return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,
|
||||
delxinv,delyinv,delzinv);
|
||||
}
|
||||
|
||||
return h_error_flag[0];
|
||||
return error_flag[0];
|
||||
}
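The flag==2 branch above is a grow-and-retry: double the per-grid-point atom capacity, clear the flag, and call spread() again. A generic, self-contained sketch of that control flow (all names here are illustrative):

#include <vector>
// Retries a worker until it no longer reports "storage too small" (flag 2).
template <class Work>
int run_with_retry(Work run_once, std::vector<int> &storage) {
  for (;;) {
    int flag = run_once(storage);        // 2 means the buffer was too small
    if (flag != 2) return flag;          // 0 = success, others = real errors
    storage.resize(storage.size() * 2);  // double capacity and try again
  }
}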
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -342,7 +338,7 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall,
|
||||
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
|
||||
void PPPMT::interp(const grdtyp qqrd2e_scale) {
|
||||
time_in.start();
|
||||
ucl_copy(d_brick,h_vd_brick,true);
|
||||
vd_brick.update_device(true);
|
||||
time_in.stop();
|
||||
|
||||
time_interp.start();
|
||||
@ -353,10 +349,10 @@ void PPPMT::interp(const grdtyp qqrd2e_scale) {
|
||||
int ainum=this->ans->inum();
|
||||
|
||||
k_interp.set_size(GX,BX);
|
||||
k_interp.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum,
|
||||
&d_brick.begin(), &d_rho_coeff.begin(), &_npts_x, &_npts_yx,
|
||||
&_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv,
|
||||
&_order, &_order2, &qqrd2e_scale, &ans->dev_ans.begin());
|
||||
k_interp.run(&atom->x, &atom->q, &ainum, &vd_brick, &d_rho_coeff,
|
||||
&_npts_x, &_npts_yx, &_brick_x, &_brick_y, &_brick_z, &_delxinv,
|
||||
&_delyinv, &_delzinv, &_order, &_order2, &qqrd2e_scale,
|
||||
&ans->force);
|
||||
time_interp.stop();
|
||||
|
||||
ans->copy_answers(false,false,false,false);
|
||||
@ -408,4 +404,3 @@ void PPPMT::compile_kernels(UCL_Device &dev) {
|
||||
|
||||
template class PPPM<PRECISION,ACC_PRECISION,float,_lgpu_float4>;
|
||||
template class PPPM<PRECISION,ACC_PRECISION,double,_lgpu_double4>;
|
||||
|
||||
|
||||
@ -14,14 +14,14 @@
|
||||
// ***************************************************************************/
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "lal_preprocessor.h"
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
#ifndef _DOUBLE_DOUBLE
|
||||
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
|
||||
{ return tex1Dfetch(pos_tex, i); }
|
||||
ucl_inline float fetch_q(const int& i, const float *q)
|
||||
{ return tex1Dfetch(q_tex, i); }
|
||||
#else
|
||||
texture<int4,1> pos_tex;
|
||||
texture<int2> q_tex;
|
||||
#endif
|
||||
|
||||
// Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error
|
||||
@ -31,6 +31,8 @@ ucl_inline float fetch_q(const int& i, const float *q)
|
||||
#endif
|
||||
|
||||
#else
|
||||
#define pos_tex x_
|
||||
#define q_tex q_
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
|
||||
#endif
|
||||
|
||||
@ -59,9 +61,11 @@ __kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_,
|
||||
int nx,ny,nz;
|
||||
|
||||
if (ii<nlocal) {
|
||||
numtyp4 p=fetch_pos(ii,x_);
|
||||
numtyp4 p;
|
||||
fetch4(p,ii,pos_tex);
|
||||
grdtyp4 delta;
|
||||
delta.w=delvolinv*fetch_q(ii,q_);
|
||||
fetch(delta.w,ii,q_tex);
|
||||
delta.w*=delvolinv;
|
||||
|
||||
if (delta.w!=(grdtyp)0.0) {
|
||||
delta.x=(p.x-b_lo_x)*delxinv;
|
||||
@ -212,8 +216,11 @@ __kernel void interp(__global numtyp4 *x_, __global numtyp *q_,
|
||||
grdtyp tx,ty,tz;
|
||||
|
||||
if (ii<nlocal) {
|
||||
numtyp4 p=fetch_pos(ii,x_);
|
||||
grdtyp qs=qqrd2e_scale*fetch_q(ii,q_);
|
||||
numtyp4 p;
|
||||
fetch4(p,ii,pos_tex);
|
||||
grdtyp qs;
|
||||
fetch(qs,ii,q_tex);
|
||||
qs*=qqrd2e_scale;
|
||||
|
||||
acctyp4 ek;
|
||||
ek.x=(acctyp)0.0;
|
||||
|
||||
@ -19,8 +19,10 @@
|
||||
#include "mpi.h"
|
||||
#include "lal_device.h"
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "geryon/ocl_texture.h"
|
||||
#elif defined(USE_CUDART)
|
||||
#include "geryon/nvc_texture.h"
|
||||
#else
|
||||
#include "geryon/nvd_texture.h"
|
||||
#endif
|
||||
@ -55,8 +57,8 @@ class PPPM {
|
||||
/** \param success set to false if insufficient memory **/
|
||||
inline void resize_atom(const int inum, const int nall, bool &success) {
|
||||
if (atom->resize(nall, success)) {
|
||||
pos_tex.bind_float(atom->dev_x,4);
|
||||
q_tex.bind_float(atom->dev_q,1);
|
||||
pos_tex.bind_float(atom->x,4);
|
||||
q_tex.bind_float(atom->q,1);
|
||||
}
|
||||
ans->resize(inum,success);
|
||||
}
|
||||
@ -138,8 +140,8 @@ class PPPM {
|
||||
|
||||
// --------------------------- GRID DATA --------------------------
|
||||
|
||||
UCL_H_Vec<grdtyp> h_brick, h_vd_brick;
|
||||
UCL_D_Vec<grdtyp> d_brick;
|
||||
UCL_Vector<grdtyp,grdtyp> brick;
|
||||
UCL_Vector<grdtyp,grdtyp> vd_brick;
|
||||
|
||||
// Count of number of atoms assigned to each grid point
|
||||
UCL_D_Vec<int> d_brick_counts;
|
||||
@ -147,8 +149,7 @@ class PPPM {
|
||||
UCL_D_Vec<grdtyp4> d_brick_atoms;
|
||||
|
||||
// Error checking for out of bounds atoms
|
||||
UCL_D_Vec<int> d_error_flag;
|
||||
UCL_H_Vec<int> h_error_flag;
|
||||
UCL_Vector<int,int> error_flag;
|
||||
|
||||
// Number of grid points in brick (including ghost)
|
||||
int _npts_x, _npts_y, _npts_z, _npts_yx;
|
||||
|
||||
@ -16,6 +16,10 @@
|
||||
#ifndef LAL_PRECISION_H
|
||||
#define LAL_PRECISION_H
|
||||
|
||||
#if defined(USE_CUDART)
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
struct _lgpu_int2 {
|
||||
int x; int y;
|
||||
};
|
||||
@ -108,3 +112,4 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -107,7 +107,7 @@
#define BLOCK_NBOR_BUILD 128
#define BLOCK_PAIR 128
#define BLOCK_BIO_PAIR 128
#define MAX_SHARED_TYPES 11
#define MAX_SHARED_TYPES 8
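MAX_SHARED_TYPES bounds how many atom types the *_fast pair kernels can cache: per-pair parameters are stored as a MAX_SHARED_TYPES x MAX_SHARED_TYPES block, and the kernels index it as itype*MAX_SHARED_TYPES + jtype (fast_mul((int)MAX_SHARED_TYPES,iw) followed by mtype=itype+jx.w). With the value lowered to 8, that is a 64-entry table. A small sketch of the flattened lookup (names are ours):

// The row offset is computed once per atom i, so each neighbor only adds jtype.
const int kMaxSharedTypes = 8;                 // mirrors MAX_SHARED_TYPES above
inline int shared_pair_index(int itype, int jtype) {
  return itype * kMaxSharedTypes + jtype;      // 8 x 8 = 64 cached pairs
}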
|
||||
|
||||
#else
|
||||
|
||||
@ -129,8 +129,21 @@
|
||||
#define MAX_BIO_SHARED_TYPES 128

#ifdef _DOUBLE_DOUBLE
ucl_inline double4 fetch_pos(const int& i, const double4 *pos) { return pos[i]; };
ucl_inline double fetch_q(const int& i, const double *q) { return q[i]; };
#define fetch4(ans,i,pos_tex) { \
int4 xy = tex1Dfetch(pos_tex,i*2); \
int4 zt = tex1Dfetch(pos_tex,i*2+1); \
ans.x=__hiloint2double(xy.y, xy.x); \
ans.y=__hiloint2double(xy.w, xy.z); \
ans.z=__hiloint2double(zt.y, zt.x); \
ans.w=__hiloint2double(zt.w, zt.z); \
}
#define fetch(ans,i,q_tex) { \
int2 qt = tex1Dfetch(q_tex,i); \
ans=__hiloint2double(qt.y, qt.x); \
}
#else
#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
#endif
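When _DOUBLE_DOUBLE is defined, positions and charges are read through int4/int2 textures and each double is rebuilt from its two 32-bit halves with __hiloint2double, which is exactly what the fetch4/fetch macros above expand to. A minimal sketch of the same trick for a plain array of doubles, using the legacy texture-reference API this code targets (texture and kernel names are ours):

#include <cuda_runtime.h>

texture<int2,1> val_tex;                       // doubles viewed as int2 pairs

__global__ void read_doubles(double *out, int n) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) {
    int2 v = tex1Dfetch(val_tex, i);           // fetch the two 32-bit halves
    out[i] = __hiloint2double(v.y, v.x);       // reassemble the double
  }
}

// Host side (error checks omitted): bind a device buffer of doubles as int2.
//   cudaBindTexture(0, val_tex, d_vals, n*sizeof(double));
//   read_doubles<<<grid,block>>>(d_out, n);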
|
||||
|
||||
#if (__CUDA_ARCH__ < 200)
|
||||
@ -293,8 +306,8 @@ typedef struct _double4 double4;
|
||||
#define BLOCK_ID_Y get_group_id(1)
|
||||
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
|
||||
#define ucl_inline inline
|
||||
#define fetch_pos(i,y) x_[i]
|
||||
#define fetch_q(i,y) q_[i]
|
||||
#define fetch4(ans,i,x) ans=x[i]
|
||||
#define fetch(ans,i,q) ans=q[i]
|
||||
|
||||
#define ucl_atan atan
|
||||
#define ucl_cbrt cbrt
|
||||
|
||||
@ -13,12 +13,15 @@
|
||||
email : brownw@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "re_squared_cl.h"
|
||||
#include "re_squared_lj_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *re_squared=0;
|
||||
const char *re_squared_lj=0;
|
||||
#else
|
||||
#include "re_squared_ptx.h"
|
||||
#include "re_squared_lj_ptx.h"
|
||||
#include "re_squared_cubin.h"
|
||||
#include "re_squared_lj_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_re_squared.h"
|
||||
@ -54,7 +57,8 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
|
||||
const double gpu_split, FILE *_screen) {
|
||||
int success;
|
||||
success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,ntypes,h_form,re_squared,re_squared_lj,true);
|
||||
_screen,ntypes,h_form,re_squared,re_squared_lj,
|
||||
"k_resquared",true);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -198,13 +202,13 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
|
||||
|
||||
this->time_ellipsoid.start();
|
||||
this->k_ellipsoid.set_size(GX,BX);
|
||||
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
|
||||
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
|
||||
&this->special_lj.begin(), &this->sigma_epsilon.begin(),
|
||||
&this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
|
||||
&this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
|
||||
&this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
|
||||
&this->_threads_per_atom);
|
||||
this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
|
||||
&this->shape, &this->well, &this->special_lj,
|
||||
&this->sigma_epsilon, &this->_lj_types,
|
||||
&this->nbor->dev_nbor, &stride,
|
||||
&this->ans->force,&ainum, &this->ans->engv,
|
||||
&this->dev_error, &eflag, &vflag,
|
||||
&this->_last_ellipse, &this->_threads_per_atom);
|
||||
this->time_ellipsoid.stop();
|
||||
|
||||
// ------------ ELLIPSE_SPHERE ---------------
|
||||
@ -215,12 +219,13 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
|
||||
|
||||
this->time_ellipsoid2.start();
|
||||
this->k_ellipsoid_sphere.set_size(GX,BX);
|
||||
this->k_ellipsoid_sphere.run(&this->atom->dev_x.begin(),
|
||||
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
|
||||
&this->special_lj.begin(), &this->sigma_epsilon.begin(),
|
||||
&this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
|
||||
&this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
|
||||
&this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
|
||||
this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat,
|
||||
&this->shape, &this->well, &this->special_lj,
|
||||
&this->sigma_epsilon, &this->_lj_types,
|
||||
&this->nbor->dev_nbor, &stride,
|
||||
&this->ans->force,&ainum,
|
||||
&this->ans->engv, &this->dev_error,
|
||||
&eflag, &vflag, &this->_last_ellipse,
|
||||
&this->_threads_per_atom);
|
||||
this->time_ellipsoid2.stop();
|
||||
|
||||
@ -245,17 +250,18 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
|
||||
|
||||
this->time_ellipsoid3.start();
|
||||
this->k_sphere_ellipsoid.set_size(GX,BX);
|
||||
this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(),
|
||||
&this->atom->dev_quat.begin(), &this->shape.begin(),
|
||||
&this->well.begin(), &this->special_lj.begin(),
|
||||
&this->sigma_epsilon.begin(), &this->_lj_types,
|
||||
&this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
|
||||
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
|
||||
this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat,
|
||||
&this->shape, &this->well, &this->special_lj,
|
||||
&this->sigma_epsilon, &this->_lj_types,
|
||||
&this->nbor->dev_nbor, &stride,
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&this->dev_error, &eflag, &vflag,
|
||||
&this->_last_ellipse, &ainum,
|
||||
&this->_threads_per_atom);
|
||||
this->time_ellipsoid3.stop();
|
||||
} else {
|
||||
this->ans->dev_ans.zero();
|
||||
this->ans->dev_engv.zero();
|
||||
this->ans->force.zero();
|
||||
this->ans->engv.zero();
|
||||
this->time_nbor1.zero();
|
||||
this->time_ellipsoid.zero();
|
||||
this->time_nbor2.zero();
|
||||
@ -269,19 +275,19 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
|
||||
if (this->_last_ellipse<this->ans->inum()) {
|
||||
if (this->_shared_types) {
|
||||
this->k_lj_fast.set_size(GX,BX);
|
||||
this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
|
||||
&this->lj3.begin(), &this->special_lj.begin(), &stride,
|
||||
&this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &this->dev_error.begin(),
|
||||
this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3,
|
||||
&this->special_lj, &stride,
|
||||
&this->nbor->dev_packed, &this->ans->force,
|
||||
&this->ans->engv, &this->dev_error,
|
||||
&eflag, &vflag, &this->_last_ellipse, &ainum,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_lj.set_size(GX,BX);
|
||||
this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
|
||||
&this->lj3.begin(), &this->_lj_types, &this->special_lj.begin(),
|
||||
&stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
|
||||
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
|
||||
this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3,
|
||||
&this->_lj_types, &this->special_lj, &stride,
|
||||
&this->nbor->dev_packed, &this->ans->force,
|
||||
&this->ans->engv, &this->dev_error, &eflag, &vflag,
|
||||
&this->_last_ellipse, &ainum, &this->_threads_per_atom);
|
||||
}
|
||||
}
|
||||
this->time_lj.stop();
|
||||
@ -295,13 +301,12 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
|
||||
this->time_nbor1.stop();
|
||||
this->time_ellipsoid.start();
|
||||
this->k_ellipsoid.set_size(GX,BX);
|
||||
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
|
||||
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
|
||||
&this->special_lj.begin(), &this->sigma_epsilon.begin(),
|
||||
&this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
|
||||
&this->ans->dev_ans.begin(), &ainum, &this->ans->dev_engv.begin(),
|
||||
&this->dev_error.begin(), &eflag, &vflag, &ainum,
|
||||
&this->_threads_per_atom);
|
||||
this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
|
||||
&this->shape, &this->well, &this->special_lj,
|
||||
&this->sigma_epsilon, &this->_lj_types,
|
||||
&this->nbor->dev_nbor, &stride, &this->ans->force,
|
||||
&ainum, &this->ans->engv, &this->dev_error,
|
||||
&eflag, &vflag, &ainum, &this->_threads_per_atom);
|
||||
this->time_ellipsoid.stop();
|
||||
}
|
||||
}
|
||||
|
||||
@ -32,7 +32,7 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
|
||||
return ans;
|
||||
}
|
||||
|
||||
__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
|
||||
__kernel void k_resquared(__global numtyp4* x_,__global numtyp4 *q,
|
||||
__global numtyp4* shape, __global numtyp4* well,
|
||||
__global numtyp *splj, __global numtyp2* sig_eps,
|
||||
const int ntypes, __global int *dev_nbor,
|
||||
@ -73,7 +73,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
|
||||
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex);
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp a1[9]; // Rotation matrix (lab->body)
|
||||
@ -122,7 +122,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex);
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
|
||||
@ -17,10 +17,11 @@
|
||||
#include "lal_ellipsoid_extra.h"
|
||||
#endif
|
||||
|
||||
__kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
|
||||
__global numtyp4* shape, __global numtyp4* well,
|
||||
__global numtyp *splj, __global numtyp2* sig_eps,
|
||||
const int ntypes, __global int *dev_nbor, const int stride,
|
||||
__kernel void k_resquared_ellipsoid_sphere(__global numtyp4* x_,
|
||||
__global numtyp4 *q, __global numtyp4* shape,
|
||||
__global numtyp4* well, __global numtyp *splj,
|
||||
__global numtyp2* sig_eps, const int ntypes,
|
||||
__global int *dev_nbor, const int stride,
|
||||
__global acctyp4 *ans, const int astride,
|
||||
__global acctyp *engv, __global int *err_flag,
|
||||
const int eflag, const int vflag, const int inum,
|
||||
@ -59,7 +60,7 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
|
||||
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex);
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp a[9]; // Rotation matrix (lab->body)
|
||||
@ -84,7 +85,7 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex);
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -331,14 +332,14 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
|
||||
__global numtyp4* shape,__global numtyp4* well,
|
||||
__global numtyp *splj, __global numtyp2* sig_eps,
|
||||
const int ntypes, __global int *dev_nbor,
|
||||
const int stride, __global acctyp4 *ans,
|
||||
__global acctyp *engv, __global int *err_flag,
|
||||
const int eflag, const int vflag,const int start,
|
||||
const int inum, const int t_per_atom) {
|
||||
__kernel void k_resquared_sphere_ellipsoid(__global numtyp4 *x_,
|
||||
__global numtyp4 *q, __global numtyp4* shape,
|
||||
__global numtyp4* well, __global numtyp *splj,
|
||||
__global numtyp2* sig_eps, const int ntypes,
|
||||
__global int *dev_nbor, const int stride,
|
||||
__global acctyp4 *ans, __global acctyp *engv,
|
||||
__global int *err_flag, const int eflag, const int vflag,
|
||||
const int start, const int inum, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
ii+=start;
|
||||
@ -370,7 +371,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
|
||||
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex);
|
||||
int jtype=jx.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
@ -379,7 +380,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
|
||||
factor_lj = sp_lj[sbmask(i)];
|
||||
i &= NEIGHMASK;
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex);
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp a[9]; // Rotation matrix (lab->body)
|
||||
@ -524,14 +525,13 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__kernel void k_resquared_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
__global numtyp4* lj3, const int lj_types,
|
||||
__global numtyp *gum,
|
||||
const int stride, __global int *dev_ij,
|
||||
__global acctyp4 *ans, __global acctyp *engv,
|
||||
__global int *err_flag, const int eflag,
|
||||
const int vflag, const int start, const int inum,
|
||||
const int t_per_atom) {
|
||||
__global numtyp *gum, const int stride,
|
||||
__global int *dev_ij, __global acctyp4 *ans,
|
||||
__global acctyp *engv, __global int *err_flag,
|
||||
const int eflag, const int vflag, const int start,
|
||||
const int inum, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
ii+=start;
|
||||
@ -557,7 +557,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex);
|
||||
int itype=ix.w;
|
||||
|
||||
numtyp factor_lj;
|
||||
@ -567,7 +567,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex);
|
||||
int jtype=jx.w;
|
||||
|
||||
// Compute r12
|
||||
@ -606,13 +606,12 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
|
||||
} // if ii
|
||||
}
|
||||
|
||||
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
__global numtyp4* lj3_in, __global numtyp *gum,
|
||||
const int stride, __global int *dev_ij,
|
||||
__kernel void k_resquared_lj_fast(__global numtyp4 *x_,
|
||||
__global numtyp4 *lj1_in, __global numtyp4* lj3_in,
|
||||
__global numtyp *gum, const int stride, __global int *dev_ij,
|
||||
__global acctyp4 *ans, __global acctyp *engv,
|
||||
__global int *err_flag, const int eflag,
|
||||
const int vflag, const int start, const int inum,
|
||||
const int t_per_atom) {
|
||||
__global int *err_flag, const int eflag, const int vflag,
|
||||
const int start, const int inum, const int t_per_atom) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
ii+=start;
|
||||
@ -645,7 +644,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,list_end,nbor);
|
||||
|
||||
numtyp4 ix=x_[i];
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex);
|
||||
int iw=ix.w;
|
||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||
|
||||
@ -656,7 +655,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx=x_[j];
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex);
|
||||
int mtype=itype+jx.w;
|
||||
|
||||
// Compute r12
|
||||
|
||||
@ -13,10 +13,12 @@
|
||||
email : nguyentd@ornl.gov
|
||||
***************************************************************************/
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#if defined(USE_OPENCL)
|
||||
#include "table_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *table=0;
|
||||
#else
|
||||
#include "table_ptx.h"
|
||||
#include "table_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_table.h"
|
||||
@ -56,17 +58,17 @@ int TableT::init(const int ntypes,
|
||||
const double gpu_split, FILE *_screen,
|
||||
int tabstyle, int ntables, int tablength) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,table);
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
|
||||
gpu_split,_screen,table,"k_table");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
k_pair_linear.set_function(*(this->pair_program),"kernel_pair_linear");
|
||||
k_pair_linear_fast.set_function(*(this->pair_program),"kernel_pair_linear_fast");
|
||||
k_pair_spline.set_function(*(this->pair_program),"kernel_pair_spline");
|
||||
k_pair_spline_fast.set_function(*(this->pair_program),"kernel_pair_spline_fast");
|
||||
k_pair_bitmap.set_function(*(this->pair_program),"kernel_pair_bitmap");
|
||||
k_pair_bitmap_fast.set_function(*(this->pair_program),"kernel_pair_bitmap_fast");
k_pair_linear.set_function(*(this->pair_program),"k_table_linear");
k_pair_linear_fast.set_function(*(this->pair_program),"k_table_linear_fast");
k_pair_spline.set_function(*(this->pair_program),"k_table_spline");
k_pair_spline_fast.set_function(*(this->pair_program),"k_table_spline_fast");
k_pair_bitmap.set_function(*(this->pair_program),"k_table_bitmap");
k_pair_bitmap_fast.set_function(*(this->pair_program),"k_table_bitmap_fast");
_compiled_styles = true;
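Alongside the rename of the generic kernel_pair* entry points to style-specific names (k_table, k_table_linear, ..., k_morse, k_resquared), init_atomic/init_base now receive the kernel-name prefix so the base classes can resolve the right functions; the extra table kernels are then looked up explicitly as above. A sketch of the lookup step only, with placeholder names, using just the calls shown in this diff:

// Resolve a style-specific kernel from the compiled pair program by name.
UCL_Kernel k_pair_linear;
k_pair_linear.set_function(*pair_program,"k_table_linear");
k_pair_linear.set_size(GX,BX);
// ...then k_pair_linear.run(&atom_x, &tabindex, /* remaining arguments */);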
|
||||
|
||||
// If atom type constants fit in shared memory use fast kernel
|
||||
@ -264,84 +266,71 @@ void TableT::loop(const bool _eflag, const bool _vflag) {
|
||||
if (shared_types) {
|
||||
if (_tabstyle == LOOKUP) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(),
|
||||
&coeff2.begin(), &coeff3.begin(),
|
||||
&coeff4.begin(), &cutsq.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom,
|
||||
&_tablength);
|
||||
this->k_pair_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3,
|
||||
&coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(), &this->ans->force,
|
||||
&this->ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom, &_tablength);
|
||||
} else if (_tabstyle == LINEAR) {
|
||||
this->k_pair_linear_fast.set_size(GX,BX);
|
||||
this->k_pair_linear_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(),
|
||||
&coeff2.begin(), &coeff3.begin(),
|
||||
&coeff4.begin(), &cutsq.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom,
|
||||
&_tablength);
|
||||
this->k_pair_linear_fast.run(&this->atom->x, &tabindex, &coeff2,
|
||||
&coeff3, &coeff4, &cutsq, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&eflag, &vflag, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom, &_tablength);
|
||||
} else if (_tabstyle == SPLINE) {
|
||||
this->k_pair_spline_fast.set_size(GX,BX);
|
||||
this->k_pair_spline_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(),
|
||||
&coeff2.begin(), &coeff3.begin(),
|
||||
&coeff4.begin(), &cutsq.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom,
|
||||
&_tablength);
|
||||
this->k_pair_spline_fast.run(&this->atom->x, &tabindex, &coeff2,
|
||||
&coeff3, &coeff4, &cutsq, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom, &_tablength);
} else if (_tabstyle == BITMAP) {
this->k_pair_bitmap_fast.set_size(GX,BX);
this->k_pair_bitmap_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&nshiftbits.begin(), &nmask.begin(),
&coeff2.begin(), &coeff3.begin(),
&coeff4.begin(), &cutsq.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom,
&_tablength);
this->k_pair_bitmap_fast.run(&this->atom->x, &tabindex, &nshiftbits,
&nmask, &coeff2, &coeff3, &coeff4, &cutsq,
&sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch,
&this->_threads_per_atom, &_tablength);
}
} else {
if (_tabstyle == LOOKUP) {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types,
&cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &_tablength);
this->k_pair.run(&this->atom->x, &tabindex, &coeff2, &coeff3,
&coeff4, &_lj_types, &cutsq, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->_threads_per_atom,
&_tablength);
} else if (_tabstyle == LINEAR) {
this->k_pair_linear.set_size(GX,BX);
this->k_pair_linear.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types,
&cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &_tablength);
this->k_pair_linear.run(&this->atom->x, &tabindex, &coeff2, &coeff3,
&coeff4, &_lj_types, &cutsq, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom, &_tablength);
} else if (_tabstyle == SPLINE) {
this->k_pair_spline.set_size(GX,BX);
this->k_pair_spline.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types,
&cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &_tablength);
this->k_pair_spline.run(&this->atom->x, &tabindex, &coeff2, &coeff3,
&coeff4, &_lj_types, &cutsq, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom, &_tablength);
} else if (_tabstyle == BITMAP) {
this->k_pair_bitmap.set_size(GX,BX);
this->k_pair_bitmap.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&nshiftbits.begin(), &nmask.begin(),
&coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types,
&cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &_tablength);
this->k_pair_bitmap.run(&this->atom->x, &tabindex, &nshiftbits,
&nmask, &coeff2, &coeff3, &coeff4, &_lj_types,
&cutsq, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom,
&_tablength);
}
}
this->time_pair.stop();
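Note on the host-side change above: every launch in loop() used to pass raw device pointers obtained with .begin() on dev_x, dev_ans and dev_engv, and now passes the container wrappers (atom->x, ans->force, ans->engv) directly, leaving the unwrapping to the launch machinery. A minimal, self-contained sketch of that call-site pattern follows; UCL_D_Vec and run_kernel are stand-ins invented for the illustration, not the library's actual API.

    // Sketch only: invented names, no real GPU launch.
    #include <cstdio>
    #include <vector>

    struct UCL_D_Vec {                         // stand-in for the device-vector wrapper
      std::vector<float> data;
      float *begin() { return data.data(); }   // old call sites extracted this raw pointer themselves
    };

    // stand-in launcher: taking the wrapper lets it unwrap the device pointer once, internally
    void run_kernel(UCL_D_Vec *x, int ainum) {
      float *dev_ptr = x->begin();
      std::printf("launch over %d atoms, first coord %g\n", ainum, dev_ptr[0]);
    }

    int main() {
      UCL_D_Vec x;
      x.data.assign(12, 1.0f);
      int ainum = 4;
      run_kernel(&x, ainum);                   // after the change: pass an &atom->x style argument
      return 0;
    }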
@ -15,11 +15,13 @@

#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif

#define LOOKUP 0
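The hunk above drops the per-file fetch_pos() helper in favor of a precision-dependent pos_tex declaration and a fetch4() macro used throughout the kernels below. A rough sketch of what such a macro could look like, assuming CUDA texture fetches for the single-precision case; the real definitions live in the library's shared preprocessor header, and the double-double path (which must rebuild doubles from int4 fetches) is omitted here.

    // Hypothetical sketch of a fetch4-style position fetch, not the library's actual definition.
    #ifdef NV_KERNEL
    #ifndef _DOUBLE_DOUBLE
    texture<float4> pos_tex;                            // float4 positions read through the texture cache
    #define fetch4(ans,i,map) { ans = tex1Dfetch(map, i); }
    #else
    texture<int4,1> pos_tex;                            // doubles would be reassembled from int4 words (omitted)
    #endif
    #else
    #define pos_tex x_                                  // OpenCL/CPU fallback: pos_tex is just the global array
    #define fetch4(ans,i,map) { ans = map[i]; }         // so plain indexing suffices
    #endif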
@ -37,7 +39,7 @@ typedef union {

/// ---------------- LOOKUP -------------------------------------------------

__kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -73,7 +75,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;

numtyp factor_lj;
@ -83,7 +85,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype*lj_types+jx.w;
int tbindex = tabindex[mtype];

@ -128,7 +130,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex,
} // if ii
}

__kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_fast(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -167,7 +169,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

@ -178,7 +180,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
int tbindex = tabindex[mtype];

@ -225,7 +227,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex,

/// ---------------- LINEAR -------------------------------------------------

__kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_linear(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -261,7 +263,7 @@ __kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;

numtyp factor_lj;
@ -271,7 +273,7 @@ __kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype*lj_types+jx.w;
int tbindex = tabindex[mtype];

@ -320,7 +322,7 @@ __kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex,
} // if ii
}

__kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_linear_fast(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -359,7 +361,7 @@ __kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabind
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

@ -370,7 +372,7 @@ __kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabind
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
int tbindex = tabindex[mtype];

@ -421,7 +423,7 @@ __kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabind

/// ---------------- SPLINE -------------------------------------------------

__kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_spline(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -457,7 +459,7 @@ __kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;

numtyp factor_lj;
@ -467,7 +469,7 @@ __kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype*lj_types+jx.w;
int tbindex = tabindex[mtype];

@ -523,7 +525,7 @@ __kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex,
} // if ii
}

__kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_spline_fast(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -562,7 +564,7 @@ __kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabind
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

@ -573,7 +575,7 @@ __kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabind
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
int tbindex = tabindex[mtype];

@ -631,7 +633,7 @@ __kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabind

/// ---------------- BITMAP -------------------------------------------------

__kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_bitmap(__global numtyp4 *x_, __global int *tabindex,
__global int *nshiftbits, __global int *nmask,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
@ -668,7 +670,7 @@ __kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;

numtyp factor_lj;
@ -678,7 +680,7 @@ __kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype*lj_types+jx.w;
int tbindex = tabindex[mtype];

@ -730,7 +732,7 @@ __kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex,
} // if ii
}

__kernel void kernel_pair_bitmap_fast(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_bitmap_fast(__global numtyp4 *x_, __global int *tabindex,
__global int *nshiftbits, __global int *nmask,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
@ -770,7 +772,7 @@ __kernel void kernel_pair_bitmap_fast(__global numtyp4 *x_, __global int *tabind
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

@ -781,7 +783,7 @@ __kernel void kernel_pair_bitmap_fast(__global numtyp4 *x_, __global int *tabind
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
int tbindex = tabindex[mtype];

@ -13,10 +13,12 @@
email : nguyentd@ornl.gov
***************************************************************************/

#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "yukawa_cl.h"
#elif defined(USE_CUDART)
const char *yukawa=0;
#else
#include "yukawa_ptx.h"
#include "yukawa_cubin.h"
#endif

#include "lal_yukawa.h"
@ -50,7 +52,7 @@ int YukawaT::init(const int ntypes,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,yukawa);
_screen,yukawa,"k_yukawa");
if (success!=0)
return success;
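The extra "k_yukawa" argument to init_atomic above goes hand in hand with renaming the kernels in the .cu file from the generic kernel_pair/kernel_pair_fast to per-style names. A small sketch of how a base class might use that string to resolve both kernel variants from the compiled program; Program, KernelHandle and compile_kernels are names invented for the illustration, not the library's real interface.

    // Sketch only: invented types standing in for the compiled-program machinery.
    #include <string>

    struct Program {};                          // stand-in for the compiled device program
    struct KernelHandle {
      std::string resolved;
      void set_function(Program &, const std::string &name) { resolved = name; }
    };

    struct BaseAtomic {
      Program program;
      KernelHandle k_pair, k_pair_fast;
      void compile_kernels(const char *k_name) {
        // one name string selects both variants of the style's kernel
        k_pair.set_function(program, k_name);
        k_pair_fast.set_function(program, std::string(k_name) + "_fast");
      }
    };

    int main() {
      BaseAtomic base;
      base.compile_kernels("k_yukawa");         // e.g. the name YukawaT::init passes above
      return 0;
    }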
@ -129,20 +131,17 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff.begin(), &_kappa,
&sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &coeff, &_kappa, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &coeff.begin(), &_kappa,
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &coeff, &_kappa, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
@ -15,14 +15,16 @@

#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif

__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
__kernel void k_yukawa(__global numtyp4 *x_, __global numtyp4 *coeff,
const numtyp kappa, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;

numtyp factor_lj;
@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;

// Compute r12
@ -103,7 +105,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
} // if ii
}
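For reference, the per-neighbor quantity k_yukawa accumulates is the screened-Coulomb (Yukawa) pair interaction. Below is a standalone scalar sketch of one pair evaluation, assuming the standard E = A exp(-kappa r)/r form used by the corresponding CPU pair style; the function name and calling convention are illustrative only.

    // Standalone sketch of a single Yukawa pair evaluation (illustrative names).
    #include <cmath>
    #include <cstdio>

    // Returns the pair energy and writes force/r (the factor multiplied by the
    // displacement components when accumulating forces) for one i-j pair.
    double yukawa_pair(double a, double kappa, double r, double *f_over_r) {
      double rinv = 1.0 / r;
      double screening = std::exp(-kappa * r);
      double energy = a * screening * rinv;                     // E = A e^{-kappa r} / r
      *f_over_r = a * screening * (kappa + rinv) * rinv * rinv; // (-dE/dr) / r
      return energy;
    }

    int main() {
      double f_over_r;
      double e = yukawa_pair(1.0, 2.0, 1.5, &f_over_r);
      std::printf("E = %g, F/r = %g\n", e, f_over_r);
      return 0;
    }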
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
__kernel void k_yukawa_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
const numtyp kappa, __global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
@ -135,7 +137,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

@ -146,7 +148,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;

numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;

// Compute r12