git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@10667 f3b2605a-c512-4ea7-a41b-209d697bcdaa
@@ -4,9 +4,14 @@
 
 # which file will be copied to Makefile.lammps
 
-EXTRAMAKE = Makefile.lammps.standard
+EXTRAMAKE = Makefile.lammps.opencl
 
-OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
+OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi
+# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler
+# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress
+# OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device
+
+OCL_CPP = mpic++ $(DEFAULT_DEVICE) -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 OCL_LINK = -lOpenCL
 OCL_PREC = -D_SINGLE_SINGLE
 
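
Note: the OCL_TUNE flags above select a per-architecture tuning profile at
compile time. As a rough sketch of the kind of gating such a define typically
drives (the macro names mirror the flags; the constants below are hypothetical,
not the actual lal_preprocessor.h values):

  // Hypothetical illustration only: block sizes chosen per tuning profile.
  #if defined(FERMI_OCL)
  #define BLOCK_PAIR 128        // assumed value, for illustration
  #elif defined(KEPLER_OCL)
  #define BLOCK_PAIR 256        // assumed value, for illustration
  #else                         // CYPRESS_OCL, GENERIC_OCL, ...
  #define BLOCK_PAIR 64         // conservative fallback
  #endif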

@@ -4,7 +4,12 @@
 
 # which file will be copied to Makefile.lammps
 
-EXTRAMAKE = Makefile.lammps.standard
+EXTRAMAKE = Makefile.lammps.mac_ocl
 
+OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi
+# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler
+# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress
+# OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device
+
 OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT
 OCL_LINK = -framework OpenCL

@@ -28,7 +28,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
 	$(OBJ_DIR)/lal_neighbor.o $(OBJ_DIR)/lal_neighbor_shared.o \
 	$(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
 	$(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
-	$(OBJ_DIR)/lal_base_dipole.o \
+	$(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \
 	$(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
 	$(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
 	$(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@@ -59,7 +59,12 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
 	$(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
 	$(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
 	$(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
-	$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o
+	$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o \
+	$(OBJ_DIR)/lal_sw.o $(OBJ_DIR)/lal_sw_ext.o \
+	$(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
+	$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
+	$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
+	$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o
 
 CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
 	$(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
@@ -99,7 +104,12 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
 	$(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \
 	$(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \
 	$(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \
-	$(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h
+	$(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h \
+	$(OBJ_DIR)/sw.cubin $(OBJ_DIR)/sw_cubin.h \
+	$(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck_cubin.h \
+	$(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \
+	$(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \
+	$(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h
 
 all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
 
@@ -175,6 +185,9 @@ $(OBJ_DIR)/lal_base_ellipsoid.o: $(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoi
 $(OBJ_DIR)/lal_base_dipole.o: $(ALL_H) lal_base_dipole.h lal_base_dipole.cpp
 	$(CUDR) -o $@ -c lal_base_dipole.cpp
 
+$(OBJ_DIR)/lal_base_three.o: $(ALL_H) lal_base_three.h lal_base_three.cpp
+	$(CUDR) -o $@ -c lal_base_three.cpp
+
 $(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
 	$(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu
 
@@ -571,6 +584,66 @@ $(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/c
 $(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
 	$(CUDR) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/sw.cubin: lal_sw.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_sw.cu
+
+$(OBJ_DIR)/sw_cubin.h: $(OBJ_DIR)/sw.cubin $(OBJ_DIR)/sw.cubin
+	$(BIN2C) -c -n sw $(OBJ_DIR)/sw.cubin > $(OBJ_DIR)/sw_cubin.h
+
+$(OBJ_DIR)/lal_sw.o: $(ALL_H) lal_sw.h lal_sw.cpp $(OBJ_DIR)/sw_cubin.h $(OBJ_DIR)/lal_base_three.o
+	$(CUDR) -o $@ -c lal_sw.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_sw_ext.o: $(ALL_H) lal_sw.h lal_sw_ext.cpp lal_base_three.h
+	$(CUDR) -o $@ -c lal_sw_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/beck.cubin: lal_beck.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_beck.cu
+
+$(OBJ_DIR)/beck_cubin.h: $(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck.cubin
+	$(BIN2C) -c -n beck $(OBJ_DIR)/beck.cubin > $(OBJ_DIR)/beck_cubin.h
+
+$(OBJ_DIR)/lal_beck.o: $(ALL_H) lal_beck.h lal_beck.cpp $(OBJ_DIR)/beck_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -o $@ -c lal_beck.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_beck_ext.o: $(ALL_H) lal_beck.h lal_beck_ext.cpp lal_base_atomic.h
+	$(CUDR) -o $@ -c lal_beck_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/mie.cubin: lal_mie.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_mie.cu
+
+$(OBJ_DIR)/mie_cubin.h: $(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie.cubin
+	$(BIN2C) -c -n mie $(OBJ_DIR)/mie.cubin > $(OBJ_DIR)/mie_cubin.h
+
+$(OBJ_DIR)/lal_mie.o: $(ALL_H) lal_mie.h lal_mie.cpp $(OBJ_DIR)/mie_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -o $@ -c lal_mie.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_mie_ext.o: $(ALL_H) lal_mie.h lal_mie_ext.cpp lal_base_atomic.h
+	$(CUDR) -o $@ -c lal_mie_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/soft.cubin: lal_soft.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_soft.cu
+
+$(OBJ_DIR)/soft_cubin.h: $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft.cubin
+	$(BIN2C) -c -n soft $(OBJ_DIR)/soft.cubin > $(OBJ_DIR)/soft_cubin.h
+
+$(OBJ_DIR)/lal_soft.o: $(ALL_H) lal_soft.h lal_soft.cpp $(OBJ_DIR)/soft_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -o $@ -c lal_soft.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_soft_ext.o: $(ALL_H) lal_soft.h lal_soft_ext.cpp lal_base_atomic.h
+	$(CUDR) -o $@ -c lal_soft_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_coul_msm.cubin: lal_lj_coul_msm.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul_msm.cu
+
+$(OBJ_DIR)/lj_coul_msm_cubin.h: $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm.cubin
+	$(BIN2C) -c -n lj_coul_msm $(OBJ_DIR)/lj_coul_msm.cubin > $(OBJ_DIR)/lj_coul_msm_cubin.h
+
+$(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(OBJ_DIR)/lj_coul_msm_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_lj_coul_msm.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
+
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda
 
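
Note: each bin2c rule above embeds a compiled .cubin image in a C header whose
symbol matches the -n argument, and the matching lal_*.cpp host file then
includes that header. A minimal sketch of the consuming side (the exact array
type emitted by bin2c may differ; this is an assumption, not the generated
header's literal contents):

  // Hypothetical consumer of a generated header.
  #include "sw_cubin.h"   // defines an embedded image named "sw" (from: bin2c -c -n sw)

  const char *sw_image() {
    return (const char *)sw;   // handed to the module loader at run time
  }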

@@ -17,7 +17,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
 	$(OBJ_DIR)/lal_neighbor_shared.o $(OBJ_DIR)/lal_neighbor.o \
 	$(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
 	$(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
-	$(OBJ_DIR)/lal_base_dipole.o \
+	$(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \
 	$(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
 	$(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
 	$(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@@ -48,7 +48,12 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
 	$(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
 	$(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
 	$(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
-	$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o
+	$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o \
+	$(OBJ_DIR)/lal_sw.o $(OBJ_DIR)/lal_sw_ext.o \
+	$(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
+	$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
+	$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
+	$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o
 
 KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
 	$(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
@@ -68,7 +73,9 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
 	$(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/dipole_lj_cl.h \
 	$(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/colloid_cl.h \
 	$(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \
-	$(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h
+	$(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \
+	$(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \
+	$(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h
 
 
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@@ -117,6 +124,9 @@ $(OBJ_DIR)/lal_base_ellipsoid.o: $(OCL_H) lal_base_ellipsoid.h lal_base_ellipsoi
 $(OBJ_DIR)/lal_base_dipole.o: $(OCL_H) lal_base_dipole.h lal_base_dipole.cpp
 	$(OCL) -o $@ -c lal_base_dipole.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/lal_base_three.o: $(OCL_H) lal_base_three.h lal_base_three.cpp
+	$(OCL) -o $@ -c lal_base_three.cpp
+
 $(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h;
 
@@ -405,6 +415,51 @@ $(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/
 $(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
 	$(OCL) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/sw_cl.h: lal_sw.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh sw $(PRE1_H) lal_sw.cu $(OBJ_DIR)/sw_cl.h;
+
+$(OBJ_DIR)/lal_sw.o: $(ALL_H) lal_sw.h lal_sw.cpp $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/lal_base_three.o
+	$(OCL) -o $@ -c lal_sw.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_sw_ext.o: $(ALL_H) lal_sw.h lal_sw_ext.cpp lal_base_three.h
+	$(OCL) -o $@ -c lal_sw_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/beck_cl.h: lal_beck.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh beck $(PRE1_H) lal_beck.cu $(OBJ_DIR)/beck_cl.h;
+
+$(OBJ_DIR)/lal_beck.o: $(ALL_H) lal_beck.h lal_beck.cpp $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/lal_base_atomic.o
+	$(OCL) -o $@ -c lal_beck.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_beck_ext.o: $(ALL_H) lal_beck.h lal_beck_ext.cpp lal_base_atomic.h
+	$(OCL) -o $@ -c lal_beck_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/mie_cl.h: lal_mie.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh mie $(PRE1_H) lal_mie.cu $(OBJ_DIR)/mie_cl.h;
+
+$(OBJ_DIR)/lal_mie.o: $(ALL_H) lal_mie.h lal_mie.cpp $(OBJ_DIR)/mie_cl.h $(OBJ_DIR)/mie_cl.h $(OBJ_DIR)/lal_base_atomic.o
+	$(OCL) -o $@ -c lal_mie.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_mie_ext.o: $(ALL_H) lal_mie.h lal_mie_ext.cpp lal_base_atomic.h
+	$(OCL) -o $@ -c lal_mie_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/soft_cl.h: lal_soft.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh soft $(PRE1_H) lal_soft.cu $(OBJ_DIR)/soft_cl.h;
+
+$(OBJ_DIR)/lal_soft.o: $(ALL_H) lal_soft.h lal_soft.cpp $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lal_base_atomic.o
+	$(OCL) -o $@ -c lal_soft.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_soft_ext.o: $(ALL_H) lal_soft.h lal_soft_ext.cpp lal_base_atomic.h
+	$(OCL) -o $@ -c lal_soft_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_coul_msm_cl.h: lal_lj_coul_msm.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh lj_coul_msm $(PRE1_H) lal_lj_coul_msm.cu $(OBJ_DIR)/lj_coul_msm_cl.h;
+
+$(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(OBJ_DIR)/lj_coul_msm_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h $(OBJ_DIR)/lal_base_charge.o
	$(OCL) -o $@ -c lal_lj_coul_msm.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
+	$(OCL) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
+
 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
 	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)
 
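
Note: the OpenCL build takes the mirror-image route to the cubin rules:
file_to_cstr.sh converts the .cu kernel source into a C string header (*_cl.h)
that the host code includes and hands to the OpenCL runtime for just-in-time
compilation. A sketch of the idea (the load_string entry point is assumed from
Geryon's conventions, not shown in this commit):

  // Hypothetical sketch: runtime compilation of an embedded kernel string.
  #include "sw_cl.h"               // defines a C string named "sw" with kernel source

  int load_sw(UCL_Program &prog) {
    return prog.load_string(sw,""); // assumed Geryon entry point for JIT builds
  }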

@@ -3,6 +3,7 @@
 --------------------------------
 
 W. Michael Brown (ORNL)
+Trung Dac Nguyen (ORNL)
 Peng Wang (NVIDIA)
 Axel Kohlmeyer (Temple)
 Steve Plimpton (SNL)
@@ -60,6 +61,8 @@ devices on your system. A Makefile for OpenCL compilation is provided,
 but support for OpenCL use is not currently provided by the developers.
 Details of the implementation are provided in:
 
+----
+
 Brown, W.M., Wang, P. Plimpton, S.J., Tharrington, A.N. Implementing
 Molecular Dynamics on Hybrid High Performance Computers - Short Range
 Forces. Computer Physics Communications. 2011. 182: p. 898-911.
@@ -68,28 +71,64 @@ and
 
 Brown, W.M., Kohlmeyer, A. Plimpton, S.J., Tharrington, A.N. Implementing
 Molecular Dynamics on Hybrid High Performance Computers - Particle-Particle
-Particle-Mesh. Computer Physics Communications. 2011. In press.
+Particle-Mesh. Computer Physics Communications. 2012. 183: p. 449-459.
 
+and
+
+Brown, W.M., Masako, Y. Implementing Molecular Dynamics on Hybrid High
+Performance Computers - Three-Body Potentials. Computer Physics Communications.
+2013. In press.
+
+----
+
 NOTE: Installation of the CUDA SDK is not required.
 
 Current styles supporting GPU acceleration:
 
-1.  lj/cut
-2.  lj96/cut
-3.  lj/expand
-4.  lj/cut/coul/cut
-5.  lj/cut/coul/long
-6.  lj/charmm/coul/long
-7.  lj/class2
-8.  lj/class2/coul/long
-9.  morse
-10. cg/cmm
-11. cg/cmm/coul/long
-12. coul/long
-13. gayberne
-14. resquared
-15. pppm
+1  beck
+2  born/coul/long
+3  born/coul/wolf
+4  born
+5  buck/coul/cut
+6  buck/coul/long
+7  buck
+8  colloid
+9  coul/dsf
+10 coul/long
+11 eam/alloy
+12 eam/fs
+13 eam
+14 eam/lj
+15 gauss
+16 gayberne
+17 lj96/cut
+18 lj/charmm/coul/long
+19 lj/class2/coul/long
+20 lj/class2
+21 lj/cut/coul/cut
+22 lj/cut/coul/debye
+23 lj/cut/coul/dsf
+24 lj/cut/coul/long
+25 lj/cut/coul/msm
+26 lj/cut/coul/wolf/fsw
+27 lj/cut/dipole/cut
+28 lj/cut
+29 lj/cut/tgpu
+30 lj/expand
+31 lj/sdk/coul/long
+32 cg/cmm/coul/long
+33 lj/sdk
+34 cg/cmm
+35 lj/sf/dipole/sf
+36 mie/cut
+37 morse
+38 resquared
+39 soft
+40 sw
+41 table
+42 yukawa/colloid
+43 yukawa
+44 pppm
 
 
 MULTIPLE LAMMPS PROCESSES
@@ -170,3 +209,4 @@ make yes-asphere
 make yes-kspace
 make yes-gpu
 make linux
+

@@ -1 +1 @@
-Geryon Version 12.033
+Geryon Version 13.209

@@ -65,15 +65,19 @@ class UCL_Device {
  public:
   /// Collect properties for every GPU on the node
   /** \note You must set the active GPU with set() before using the device **/
-  UCL_Device();
+  inline UCL_Device();
 
-  ~UCL_Device();
+  inline ~UCL_Device();
 
   /// Returns 1 (For compatibility with OpenCL)
   inline int num_platforms() { return 1; }
 
   /// Return a string with name and info of the current platform
-  std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA Driver"; }
+  inline std::string platform_name()
+    { return "NVIDIA Corporation NVIDIA CUDA Driver"; }
 
+  /// Delete any contexts/data and set the platform number to be used
+  inline int set_platform(const int pid);
+
   /// Return the number of devices that support CUDA
   inline int num_devices() { return _properties.size(); }
@@ -81,8 +85,12 @@ class UCL_Device {
   /// Set the CUDA device to the specified device number
   /** A context and default command queue will be created for the device
    * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
-   * be allocated for use **/
-  int set(int num);
+   * be allocated for use. clear() is called to delete any contexts and
+   * associated data from previous calls to set(). **/
+  inline int set(int num);
+
+  /// Delete any context and associated data stored from a call to set()
+  inline void clear();
 
   /// Get the current device number
   inline int device_num() { return _device; }
@@ -147,16 +155,17 @@ class UCL_Device {
   inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
 
   /// Returns true if double precision is support for the current device
-  bool double_precision() { return double_precision(_device); }
+  inline bool double_precision() { return double_precision(_device); }
   /// Returns true if double precision is support for the device
-  bool double_precision(const int i) {return arch(i)>=1.3;}
+  inline bool double_precision(const int i) {return arch(i)>=1.3;}
 
   /// Get the number of cores in the current device
   inline unsigned cores() { return cores(_device); }
   /// Get the number of cores
   inline unsigned cores(const int i)
     { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
-      else if (arch(i)<3.0) return _properties[i].multiProcessorCount*32;
+      else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32;
+      else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48;
       else return _properties[i].multiProcessorCount*192; }
 
   /// Get the gigabytes of global memory in the current device
@@ -216,8 +225,34 @@ class UCL_Device {
   inline bool sharing_supported(const int i)
     { return (_properties[i].computeMode == CU_COMPUTEMODE_DEFAULT); }
 
+  /// True if splitting device into equal subdevices supported
+  inline bool fission_equal()
+    { return fission_equal(_device); }
+  /// True if splitting device into equal subdevices supported
+  inline bool fission_equal(const int i)
+    { return false; }
+  /// True if splitting device into subdevices by specified counts supported
+  inline bool fission_by_counts()
+    { return fission_by_counts(_device); }
+  /// True if splitting device into subdevices by specified counts supported
+  inline bool fission_by_counts(const int i)
+    { return false; }
+  /// True if splitting device into subdevices by affinity domains supported
+  inline bool fission_by_affinity()
+    { return fission_by_affinity(_device); }
+  /// True if splitting device into subdevices by affinity domains supported
+  inline bool fission_by_affinity(const int i)
+    { return false; }
+
+  /// Maximum number of subdevices allowed from device fission
+  inline int max_sub_devices()
+    { return max_sub_devices(_device); }
+  /// Maximum number of subdevices allowed from device fission
+  inline int max_sub_devices(const int i)
+    { return 0; }
+
   /// List all devices along with all properties
-  void print_all(std::ostream &out);
+  inline void print_all(std::ostream &out);
 
  private:
   int _device, _num_devices;
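
Note: the new fission_*/max_sub_devices accessors give the CUDA driver backend
the same device-fission interface as the OpenCL backend, but as stubs: CUDA
exposes no device fission, so they report false/0. A minimal usage sketch
(device 0 and the namespace name are assumptions):

  #include <iostream>
  #include "nvd_device.h"            // this header

  int main() {
    ucl_cudadr::UCL_Device dev;      // namespace assumed from Geryon's nvd_* files
    if (dev.num_devices()==0) return 1;
    dev.set(0);
    // Always false/0 on the CUDA driver backend; real values on the OpenCL side.
    std::cout << "fission: " << dev.fission_equal()
              << "  max subdevices: " << dev.max_sub_devices() << std::endl;
    return 0;
  }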

@@ -228,7 +263,7 @@ class UCL_Device {
 };
 
 // Grabs the properties for all devices
-inline UCL_Device::UCL_Device() {
+UCL_Device::UCL_Device() {
   CU_SAFE_CALL_NS(cuInit(0));
   CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices));
   for (int dev=0; dev<_num_devices; ++dev) {
@@ -280,22 +315,21 @@ inline UCL_Device::UCL_Device() {
     _cq.back()=0;
   }
 }
 
-inline UCL_Device::~UCL_Device() {
-  if (_device>-1) {
-    for (int i=1; i<num_queues(); i++) pop_command_queue();
-    cuCtxDestroy(_context);
-  }
+UCL_Device::~UCL_Device() {
+  clear();
+}
+
+int UCL_Device::set_platform(const int pid) {
+  clear();
+#ifdef UCL_DEBUG
+  assert(pid<num_platforms());
+#endif
+  return UCL_SUCCESS;
 }
 
 // Set the CUDA device to the specified device number
-inline int UCL_Device::set(int num) {
-  if (_device==num)
-    return UCL_SUCCESS;
-  if (_device>-1) {
-    CU_SAFE_CALL_NS(cuCtxDestroy(_context));
-    for (int i=1; i<num_queues(); i++) pop_command_queue();
-    _cq[0]=0;
-  }
+int UCL_Device::set(int num) {
+  clear();
   _device=_properties[num].device_id;
   CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
   CUresult err=cuCtxCreate(&_context,0,_cu_device);
@@ -310,8 +344,16 @@ inline int UCL_Device::set(int num) {
   return UCL_SUCCESS;
 }
 
+void UCL_Device::clear() {
+  if (_device>-1) {
+    for (int i=1; i<num_queues(); i++) pop_command_queue();
+    cuCtxDestroy(_context);
+  }
+  _device=-1;
+}
+
 // List all devices along with all properties
-inline void UCL_Device::print_all(std::ostream &out) {
+void UCL_Device::print_all(std::ostream &out) {
 #if CUDA_VERSION >= 2020
   int driver_version;
   cuDriverGetVersion(&driver_version);
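
Note the behavioral change in this pair of hunks: set() no longer
short-circuits when the requested device is already active; it now always calls
the new clear() and rebuilds the context, and the destructor reduces to
clear(). A sketch of the resulting lifecycle (grounded in the code above):

  ucl_cudadr::UCL_Device dev;  // constructor: cuInit + property collection only
  dev.set(0);                  // clear() (a no-op here), then context + queue on device 0
  dev.set(1);                  // tears down device 0 state first, rebuilds on device 1
  dev.clear();                 // optional explicit teardown; the destructor also calls it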

@@ -377,6 +377,10 @@ class UCL_Kernel {
 #endif
   }
 
+  /// Return the default command queue/stream associated with this data
+  inline command_queue & cq() { return _cq; }
+  /// Change the default command queue associated with matrix
+  inline void cq(command_queue &cq_in) { _cq=cq_in; }
   #include "ucl_arg_kludge.h"
 
  private:
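
Note: these accessors let callers read or replace a kernel's default
stream/queue instead of always launching on the one captured earlier. A sketch
(the kernel name, program, and stream variable are hypothetical):

  ucl_cudadr::UCL_Kernel k;
  k.set_function(prog,"k_scale");   // prog: a previously loaded UCL_Program
  k.cq(stream2);                    // adopt stream2 (a command_queue) as the default
  k.run();                          // subsequent launches go to stream2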

@@ -47,14 +47,14 @@ typedef CUdeviceptr device_ptr;
 // --------------------------------------------------------------------------
 template <class mat_type, class copy_type>
 inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
-                       const enum UCL_MEMOPT kind) {
+                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
   CUresult err=CUDA_SUCCESS;
-  if (kind==UCL_RW_OPTIMIZED)
-    err=cuMemAllocHost((void **)mat.host_ptr(),n);
-  else if (kind==UCL_WRITE_OPTIMIZED)
+  if (kind==UCL_NOT_PINNED)
+    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
+  else if (kind==UCL_WRITE_ONLY)
     err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
   else
-    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
+    err=cuMemAllocHost((void **)mat.host_ptr(),n);
   if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
     return UCL_MEMORY_ERROR;
   mat.cq()=cm.cq();
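
Note the renamed allocation kinds here: UCL_RW_OPTIMIZED/UCL_WRITE_OPTIMIZED
give way to UCL_NOT_PINNED (plain malloc) and UCL_WRITE_ONLY (write-combined
pinned memory), with page-locked cuMemAllocHost as the default. A sketch of how
a host container might request each (the UCL_H_Vec::alloc(n, device, kind)
entry point is assumed from Geryon's container conventions):

  UCL_H_Vec<float> staging;
  staging.alloc(4096,dev,UCL_WRITE_ONLY);  // pinned + write-combined: fast host->device
  UCL_H_Vec<float> plain;
  plain.alloc(4096,dev,UCL_NOT_PINNED);    // ordinary malloc'd memory, no pinning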

@@ -63,14 +63,14 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
 
 template <class mat_type>
 inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
-                       const enum UCL_MEMOPT kind) {
+                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
   CUresult err=CUDA_SUCCESS;
-  if (kind==UCL_RW_OPTIMIZED)
-    err=cuMemAllocHost((void **)mat.host_ptr(),n);
-  else if (kind==UCL_WRITE_OPTIMIZED)
+  if (kind==UCL_NOT_PINNED)
+    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
+  else if (kind==UCL_WRITE_ONLY)
     err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
   else
-    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
+    err=cuMemAllocHost((void **)mat.host_ptr(),n);
   if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
     return UCL_MEMORY_ERROR;
   mat.cq()=dev.cq();
@@ -78,8 +78,10 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
 }
 
 template <class mat_type>
-inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
-  if (kind!=UCL_NOT_PINNED)
+inline void _host_free(mat_type &mat) {
+  if (mat.kind()==UCL_VIEW)
+    return;
+  else if (mat.kind()!=UCL_NOT_PINNED)
     CU_DESTRUCT_CALL(cuMemFreeHost(mat.begin()));
   else
     free(mat.begin());
@@ -87,14 +89,14 @@ inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
 
 template <class mat_type>
 inline int _host_resize(mat_type &mat, const size_t n) {
-  _host_free(mat,mat.kind());
+  _host_free(mat);
   CUresult err=CUDA_SUCCESS;
-  if (mat.kind()==UCL_RW_OPTIMIZED)
-    err=cuMemAllocHost((void **)mat.host_ptr(),n);
-  else if (mat.kind()==UCL_WRITE_OPTIMIZED)
+  if (mat.kind()==UCL_NOT_PINNED)
+    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
+  else if (mat.kind()==UCL_WRITE_ONLY)
     err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
   else
-    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
+    err=cuMemAllocHost((void **)mat.host_ptr(),n);
   if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
     return UCL_MEMORY_ERROR;
   return UCL_SUCCESS;
@@ -155,7 +157,8 @@ inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
 
 template <class mat_type>
 inline void _device_free(mat_type &mat) {
-  CU_DESTRUCT_CALL(cuMemFree(mat.cbegin()));
+  if (mat.kind()!=UCL_VIEW)
+    CU_DESTRUCT_CALL(cuMemFree(mat.cbegin()));
 }
 
 template <class mat_type>
@@ -229,13 +232,13 @@ inline void _host_zero(void *ptr, const size_t n) {
 }
 
 template <class mat_type>
-inline void _device_zero(mat_type &mat, const size_t n) {
+inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
   if (n%32==0)
-    CU_SAFE_CALL(cuMemsetD32(mat.cbegin(),0,n/4));
+    CU_SAFE_CALL(cuMemsetD32Async(mat.cbegin(),0,n/4,cq));
   else if (n%16==0)
-    CU_SAFE_CALL(cuMemsetD16(mat.cbegin(),0,n/2));
+    CU_SAFE_CALL(cuMemsetD16Async(mat.cbegin(),0,n/2,cq));
   else
-    CU_SAFE_CALL(cuMemsetD8(mat.cbegin(),0,n));
+    CU_SAFE_CALL(cuMemsetD8Async(mat.cbegin(),0,n,cq));
 }
 
 // --------------------------------------------------------------------------
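
Note: _device_zero is now asynchronous; it enqueues cuMemsetD{8,16,32}Async on
the caller-supplied queue rather than blocking. Callers that read the buffer
immediately afterwards must synchronize, e.g.:

  _device_zero(d_buf,n_bytes,dev.cq());  // enqueued, returns immediately
  ucl_sync(dev.cq());                    // wait before the host inspects the result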

@@ -51,6 +51,10 @@ inline void ucl_sync(cl_command_queue &cq) {
   CL_SAFE_CALL(clFinish(cq));
 }
 
+inline bool _shared_mem_device(cl_device_type &device_type) {
+  return (device_type==CL_DEVICE_TYPE_CPU);
+}
+
 struct OCLProperties {
   std::string name;
   cl_device_type device_type;
@@ -64,6 +68,10 @@ struct OCLProperties {
   bool double_precision;
   int alignment;
   size_t timer_resolution;
+  bool ecc_support;
+  std::string c_version;
+  bool partition_equal, partition_counts, partition_affinity;
+  cl_uint max_sub_devices;
 };
 
 /// Class for looking at data parallel device properties
@@ -74,15 +82,18 @@ class UCL_Device {
  public:
   /// Collect properties for every device on the node
   /** \note You must set the active GPU with set() before using the device **/
-  UCL_Device();
+  inline UCL_Device();
 
-  ~UCL_Device();
+  inline ~UCL_Device();
 
   /// Return the number of platforms (0 if error or no platforms)
   inline int num_platforms() { return _num_platforms; }
 
   /// Return a string with name and info of the current platform
-  std::string platform_name();
+  inline std::string platform_name();
 
+  /// Delete any contexts/data and set the platform number to be used
+  inline int set_platform(const int pid);
+
   /// Return the number of devices that support OpenCL
   inline int num_devices() { return _num_devices; }
@@ -90,8 +101,12 @@ class UCL_Device {
   /// Set the OpenCL device to the specified device number
   /** A context and default command queue will be created for the device *
    * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
-   * be allocated for use **/
-  int set(int num);
+   * be allocated for use. clear() is called to delete any contexts and
+   * associated data from previous calls to set(). **/
+  inline int set(int num);
+
+  /// Delete any context and associated data stored from a call to set()
+  inline void clear();
 
   /// Get the current device number
   inline int device_num() { return _device; }
@@ -161,12 +176,14 @@ class UCL_Device {
   /// Returns true if host memory is efficiently addressable from device
   inline bool shared_memory() { return shared_memory(_device); }
   /// Returns true if host memory is efficiently addressable from device
-  inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
+  inline bool shared_memory(const int i)
+    { return _shared_mem_device(_properties[i].device_type); }
 
   /// Returns true if double precision is support for the current device
-  bool double_precision() { return double_precision(_device); }
+  inline bool double_precision() { return double_precision(_device); }
   /// Returns true if double precision is support for the device
-  bool double_precision(const int i) {return _properties[i].double_precision;}
+  inline bool double_precision(const int i)
+    {return _properties[i].double_precision;}
 
   /// Get the number of cores in the current device
   inline unsigned cores() { return cores(_device); }
@@ -227,8 +244,34 @@ class UCL_Device {
   inline bool sharing_supported(const int i)
     { return true; }
 
+  /// True if splitting device into equal subdevices supported
+  inline bool fission_equal()
+    { return fission_equal(_device); }
+  /// True if splitting device into equal subdevices supported
+  inline bool fission_equal(const int i)
+    { return _properties[i].partition_equal; }
+  /// True if splitting device into subdevices by specified counts supported
+  inline bool fission_by_counts()
+    { return fission_by_counts(_device); }
+  /// True if splitting device into subdevices by specified counts supported
+  inline bool fission_by_counts(const int i)
+    { return _properties[i].partition_counts; }
+  /// True if splitting device into subdevices by affinity domains supported
+  inline bool fission_by_affinity()
+    { return fission_by_affinity(_device); }
+  /// True if splitting device into subdevices by affinity domains supported
+  inline bool fission_by_affinity(const int i)
+    { return _properties[i].partition_affinity; }
+
+  /// Maximum number of subdevices allowed from device fission
+  inline int max_sub_devices()
+    { return max_sub_devices(_device); }
+  /// Maximum number of subdevices allowed from device fission
+  inline int max_sub_devices(const int i)
+    { return _properties[i].max_sub_devices; }
+
   /// List all devices along with all properties
-  void print_all(std::ostream &out);
+  inline void print_all(std::ostream &out);
 
   /// Return the OpenCL type for the device
   inline cl_device_id & cl_device() { return _cl_device; }
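
Note: unlike the CUDA stubs, these OpenCL accessors report real per-device
partition capabilities gathered in add_properties() below. A short sketch of
scanning for partitionable devices (the name(i) accessor is assumed from
Geryon's conventions):

  ucl_opencl::UCL_Device dev;          // namespace assumed from Geryon's ocl_* files
  for (int i=0; i<dev.num_devices(); ++i)
    if (dev.fission_equal(i) || dev.fission_by_counts(i) || dev.fission_by_affinity(i))
      std::cout << dev.name(i) << ": up to " << dev.max_sub_devices(i)
                << " subdevices" << std::endl;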

@@ -237,7 +280,8 @@ class UCL_Device {
   int _num_platforms;          // Number of platforms
   int _platform;               // UCL_Device ID for current platform
   cl_platform_id _cl_platform; // OpenCL ID for current platform
-  cl_context _context;         // Context used for accessing the device
+  cl_platform_id _cl_platforms[20]; // OpenCL IDs for all platforms
+  cl_context _context;         // Context used for accessing the device
   std::vector<cl_command_queue> _cq;// The default command queue for this device
   int _device;                 // UCL_Device ID for current device
   cl_device_id _cl_device;     // OpenCL ID for current device
@@ -245,24 +289,18 @@ class UCL_Device {
   int _num_devices;            // Number of devices
   std::vector<OCLProperties> _properties; // Properties for each device
 
-  void add_properties(cl_device_id);
-  int create_context();
+  inline void add_properties(cl_device_id);
+  inline int create_context();
   int _default_cq;
 };
 
 // Grabs the properties for all devices
-inline UCL_Device::UCL_Device() {
-  cl_int errorv;
-  cl_uint nplatforms;
+UCL_Device::UCL_Device() {
 
-  _cl_device=0;
   _device=-1;
-  _num_devices=0;
-  _platform=0;
-  _default_cq=0;
 
   // --- Get Number of Platforms
-  errorv=clGetPlatformIDs(1,&_cl_platform,&nplatforms);
-
+  cl_uint nplatforms;
+  cl_int errorv=clGetPlatformIDs(20,_cl_platforms,&nplatforms);
   if (errorv!=CL_SUCCESS) {
     _num_platforms=0;
@@ -270,6 +308,38 @@ inline UCL_Device::UCL_Device() {
   } else
     _num_platforms=static_cast<int>(nplatforms);
 
+  set_platform(0);
+}
+
+UCL_Device::~UCL_Device() {
+  clear();
+}
+
+void UCL_Device::clear() {
+  if (_device>-1) {
+    for (size_t i=0; i<_cq.size(); i++) {
+      CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
+      _cq.pop_back();
+    }
+    CL_DESTRUCT_CALL(clReleaseContext(_context));
+  }
+  _device=-1;
+}
+
+int UCL_Device::set_platform(int pid) {
+  clear();
+  cl_int errorv;
+
+  _cl_device=0;
+  _device=-1;
+  _num_devices=0;
+  _default_cq=0;
+
+#ifdef UCL_DEBUG
+  assert(pid<num_platforms());
+#endif
+  _platform=pid;
+  _cl_platform=_cl_platforms[_platform];
 
   // --- Get Number of Devices
   cl_uint n;
@@ -277,7 +347,7 @@ inline UCL_Device::UCL_Device() {
   _num_devices=n;
   if (errorv!=CL_SUCCESS || _num_devices==0) {
     _num_devices=0;
-    return;
+    return UCL_ERROR;
   }
   cl_device_id device_list[_num_devices];
   CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list,
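
Note: with the platform list cached (up to 20 IDs) and set_platform() factored
out, switching platforms no longer requires constructing a new UCL_Device. A
sketch, grounded in the code above:

  ucl_opencl::UCL_Device dev;  // constructor enumerates platforms, selects platform 0
  if (dev.num_platforms()>1)
    dev.set_platform(1);       // clear() runs first, so any prior context is released
  dev.set(0);                  // then pick a device on the newly selected platform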
@@ -288,19 +358,11 @@ inline UCL_Device::UCL_Device() {
     _cl_devices.push_back(device_list[i]);
     add_properties(device_list[i]);
   }
+
+  return UCL_SUCCESS;
 }
 
-inline UCL_Device::~UCL_Device() {
-  if (_device>-1) {
-    for (size_t i=0; i<_cq.size(); i++) {
-      CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
-      _cq.pop_back();
-    }
-    CL_DESTRUCT_CALL(clReleaseContext(_context));
-  }
-}
-
-inline int UCL_Device::create_context() {
+int UCL_Device::create_context() {
   cl_int errorv;
   cl_context_properties props[3];
   props[0]=CL_CONTEXT_PLATFORM;
@@ -320,9 +382,10 @@ inline int UCL_Device::create_context() {
   return UCL_SUCCESS;
 }
 
-inline void UCL_Device::add_properties(cl_device_id device_list) {
+void UCL_Device::add_properties(cl_device_id device_list) {
   OCLProperties op;
   char buffer[1024];
+  cl_bool ans_bool;
 
   CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,NULL));
   op.name=buffer;
@@ -363,10 +426,49 @@ inline void UCL_Device::add_properties(cl_device_id device_list) {
                                CL_DEVICE_PROFILING_TIMER_RESOLUTION,
                                sizeof(size_t),&op.timer_resolution,NULL));
 
+
+  op.ecc_support=false;
+  CL_SAFE_CALL(clGetDeviceInfo(device_list,
+                               CL_DEVICE_ERROR_CORRECTION_SUPPORT,
+                               sizeof(ans_bool),&ans_bool,NULL));
+  if (ans_bool==CL_TRUE)
+    op.ecc_support=true;
+
+  op.c_version="";
+  op.partition_equal=false;
+  op.partition_counts=false;
+  op.partition_affinity=false;
+
+#ifdef CL_VERSION_1_2
+  size_t return_bytes;
+  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_OPENCL_C_VERSION,1024,
+                               buffer,NULL));
+  op.c_version=buffer;
+
+  cl_device_partition_property pinfo[4];
+  CL_SAFE_CALL(clGetDeviceInfo(device_list,
+                               CL_DEVICE_PARTITION_PROPERTIES,
+                               4*sizeof(cl_device_partition_property),
+                               pinfo,&return_bytes));
+  int nprops=return_bytes/sizeof(cl_device_partition_property);
+  for (int i=0; i<nprops; i++) {
+    if (pinfo[i]==CL_DEVICE_PARTITION_EQUALLY)
+      op.partition_equal=true;
+    else if (pinfo[i]==CL_DEVICE_PARTITION_BY_COUNTS)
+      op.partition_counts=true;
+    else if (pinfo[i]==CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN)
+      op.partition_affinity=true;
+  }
+
+  CL_SAFE_CALL(clGetDeviceInfo(device_list,
+                               CL_DEVICE_PARTITION_MAX_SUB_DEVICES,
+                               sizeof(cl_uint),&op.max_sub_devices,NULL));
+#endif
+
   _properties.push_back(op);
 }
 
-inline std::string UCL_Device::platform_name() {
+std::string UCL_Device::platform_name() {
   char info[1024];
 
   CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VENDOR,1024,info,
@@ -385,7 +487,7 @@ inline std::string UCL_Device::platform_name() {
 }
 
 // Get a string telling the type of the device
-inline std::string UCL_Device::device_type_name(const int i) {
+std::string UCL_Device::device_type_name(const int i) {
   if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
     return "CPU";
   else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
@@ -397,7 +499,7 @@ inline std::string UCL_Device::device_type_name(const int i) {
 }
 
 // Get a string telling the type of the device
-inline int UCL_Device::device_type(const int i) {
+int UCL_Device::device_type(const int i) {
   if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
     return UCL_CPU;
   else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
@@ -409,17 +511,8 @@ inline int UCL_Device::device_type(const int i) {
 }
 
 // Set the CUDA device to the specified device number
-inline int UCL_Device::set(int num) {
-  if (_device==num)
-    return UCL_SUCCESS;
+int UCL_Device::set(int num) {
+  clear();
 
-  if (_device>-1) {
-    for (size_t i=0; i<_cq.size(); i++) {
-      CL_SAFE_CALL(clReleaseCommandQueue(_cq.back()));
-      _cq.pop_back();
-    }
-    CL_SAFE_CALL(clReleaseContext(_context));
-  }
-
   cl_device_id device_list[_num_devices];
   cl_uint n;
@@ -432,7 +525,7 @@ inline int UCL_Device::set(int num) {
 }
 
 // List all devices along with all properties
-inline void UCL_Device::print_all(std::ostream &out) {
+void UCL_Device::print_all(std::ostream &out) {
   if (num_devices() == 0)
     out << "There is no device supporting OpenCL\n";
   for (int i=0; i<num_devices(); ++i) {
@@ -475,6 +568,28 @@ inline void UCL_Device::print_all(std::ostream &out) {
     out << "  Clock rate: "
         << clock_rate(i) << " GHz\n";
     //out << "  Concurrent copy and execution: ";
+    out << "  ECC support: ";
+    if (_properties[i].ecc_support)
+      out << "Yes\n";
+    else
+      out << "No\n";
+    out << "  Device fission into equal partitions: ";
+    if (fission_equal(i))
+      out << "Yes\n";
+    else
+      out << "No\n";
+    out << "  Device fission by counts: ";
+    if (fission_by_counts(i))
+      out << "Yes\n";
+    else
+      out << "No\n";
+    out << "  Device fission by affinity: ";
+    if (fission_by_affinity(i))
+      out << "Yes\n";
+    else
+      out << "No\n";
+    out << "  Maximum subdevices from fission: "
+        << max_sub_devices(i) << std::endl;
   }
 }
 

@@ -134,6 +134,11 @@ class UCL_Program {
     return UCL_SUCCESS;
   }
 
+  /// Return the default command queue/stream associated with this data
+  inline command_queue & cq() { return _cq; }
+  /// Change the default command queue associated with matrix
+  inline void cq(command_queue &cq_in) { _cq=cq_in; }
+
   friend class UCL_Kernel;
 private:
   bool _init_done;
@@ -175,7 +180,16 @@ class UCL_Kernel {
   template <class dtype>
   inline void set_arg(const cl_uint index, const dtype * const arg) {
     CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
-    if (index>_num_args) _num_args=index;
+    if (index>_num_args) {
+      _num_args=index;
+#ifdef UCL_DEBUG
+      if (_num_args>_kernel_info_nargs) {
+        std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
+                  << _kernel_info_name << std::endl;
+        assert(0==1);
+      }
+#endif
+    }
   }
 
   /// Set a geryon container as a kernel argument.
@@ -203,6 +217,13 @@ class UCL_Kernel {
   inline void add_arg(const dtype * const arg) {
     CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
     _num_args++;
+#ifdef UCL_DEBUG
+    if (_num_args>_kernel_info_nargs) {
+      std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
+                << _kernel_info_name << std::endl;
+      assert(0==1);
+    }
+#endif
   }
 
   /// Add a geryon container as a kernel argument.
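
Note: with UCL_DEBUG defined, set_arg() and add_arg() now validate the argument
count against the CL_KERNEL_NUM_ARGS value captured in set_function() (see the
hunk further below), turning a late enqueue failure into an immediate, named
assertion. A sketch of what the check catches (the kernel name is
hypothetical):

  // k_scale declared with 2 arguments in its OpenCL source.
  k.add_arg(&n);       // arg 0: fine
  k.add_arg(&alpha);   // arg 1: fine
  k.add_arg(&beta);    // arg 2: a UCL_DEBUG build prints
                       // "TOO MANY ARGUMENTS TO OPENCL FUNCTION: k_scale" and asserts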
@@ -289,10 +310,7 @@ class UCL_Kernel {
   }
 
   /// Run the kernel in the default command queue
-  inline void run() {
-    CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL,
-                                        _num_blocks,_block_size,0,NULL,NULL));
-  }
+  inline void run();
 
   /// Clear any arguments associated with the kernel
   inline void clear_args() { _num_args=0; }
@@ -309,6 +327,12 @@ class UCL_Kernel {
 
   cl_command_queue _cq; // The default command queue for this kernel
   unsigned _num_args;
+
+  #ifdef UCL_DEBUG
+  std::string _kernel_info_name;
+  unsigned _kernel_info_nargs;
+  //std::string _kernel_info_args[256];
+  #endif
 };
 
 inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) {
@@ -329,9 +353,32 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
     #endif
     return UCL_FUNCTION_NOT_FOUND;
   }
 
+  #ifdef UCL_DEBUG
+  _kernel_info_name=function;
+  cl_uint nargs;
+  CL_SAFE_CALL(clGetKernelInfo(_kernel,CL_KERNEL_NUM_ARGS,sizeof(cl_uint),
+                               &nargs,NULL));
+  _kernel_info_nargs=nargs;
+  #ifdef NOT_TEST_CL_VERSION_1_2
+  char tname[256];
+  size_t ret;
+  for (cl_uint i=0; i<nargs; i++) {
+    CL_SAFE_CALL(clGetKernelArgInfo(_kernel,i,CL_KERNEL_ARG_TYPE_NAME,256,
+                                    tname,&ret));
+    _kernel_info_args[i]=tname;
+  }
+  #endif
+  #endif
+
   return UCL_SUCCESS;
 }
+
+void UCL_Kernel::run() {
+  CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL,
+                                      _num_blocks,_block_size,0,NULL,NULL));
+}
+
 } // namespace
 
 #endif
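The debug bookkeeping above records the argument count a kernel declares so that set_arg()/add_arg() can catch overruns. A minimal standalone sketch of the same CL_KERNEL_NUM_ARGS query (the function name is illustrative; 'k' is assumed to be a valid cl_kernel):

  #include <CL/cl.h>
  #include <cassert>

  void check_arg_budget(cl_kernel k, unsigned args_set) {
    cl_uint nargs=0;
    clGetKernelInfo(k,CL_KERNEL_NUM_ARGS,sizeof(cl_uint),&nargs,NULL);
    // Fires if more arguments were set than the kernel signature declares.
    assert(args_set<=nargs);
  }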
@@ -54,82 +54,138 @@ typedef cl_mem device_ptr;
 
 template <class mat_type, class copy_type>
 inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
-                       const enum UCL_MEMOPT kind) {
+                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
   cl_int error_flag;
   cl_context context;
   CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                   &context,NULL));
 
-  if (kind==UCL_VIEW) {
-    mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR,n,mat.host_ptr(),
-                                &error_flag);
-    CL_CHECK_ERR(error_flag);
-    return UCL_SUCCESS;
-  }
-  if (kind==UCL_WRITE_OPTIMIZED) {
-    mat.cbegin()=clCreateBuffer(context,
-                                CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
-                                n,NULL,&error_flag);
-    if (error_flag != CL_SUCCESS)
-      return UCL_MEMORY_ERROR;
-    *mat.host_ptr() = (typename mat_type::data_type*)
-      clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
-                         CL_MAP_WRITE,0,n,0,NULL,NULL,NULL);
+  cl_mem_flags buffer_perm;
+  cl_map_flags map_perm;
+  if (kind2==UCL_NOT_SPECIFIED) {
+    if (kind==UCL_READ_ONLY) {
+      #ifdef CL_VERSION_1_2
+      buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR;
+      #else
+      buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
+      #endif
+      map_perm=CL_MAP_READ;
+    } else if (kind==UCL_WRITE_ONLY) {
+      #ifdef CL_VERSION_1_2
+      buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR;
+      #else
+      buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
+      #endif
+      map_perm=CL_MAP_WRITE;
+    } else {
+      buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
+      map_perm=CL_MAP_READ | CL_MAP_WRITE;
+    }
   } else {
-    mat.cbegin()=clCreateBuffer(context,
-                                CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
-                                n,NULL,&error_flag);
-    if (error_flag != CL_SUCCESS)
-      return UCL_MEMORY_ERROR;
+    if (kind2==UCL_READ_ONLY)
+      buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
+    else if (kind2==UCL_WRITE_ONLY)
+      buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
+    else
+      buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
+
+    if (kind==UCL_READ_ONLY) {
+      #ifdef CL_VERSION_1_2
+      buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY;
+      #endif
+      map_perm=CL_MAP_READ;
+    } else if (kind==UCL_WRITE_ONLY) {
+      #ifdef CL_VERSION_1_2
+      buffer_perm=buffer_perm | CL_MEM_HOST_WRITE_ONLY;
+      #endif
+      map_perm=CL_MAP_WRITE;
+    } else
+      map_perm=CL_MAP_READ | CL_MAP_WRITE;
+  }
+
+  mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
+  if (error_flag != CL_SUCCESS)
+    return UCL_MEMORY_ERROR;
   *mat.host_ptr() = (typename mat_type::data_type*)
     clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
-                       CL_MAP_READ | CL_MAP_WRITE,
-                       0,n,0,NULL,NULL,NULL);
-  }
+                       map_perm,0,n,0,NULL,NULL,NULL);
   mat.cq()=cm.cq();
   CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
   return UCL_SUCCESS;
 }
+
+template <class mat_type, class copy_type>
+inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
+  cl_int error_flag;
+  cl_context context;
+  CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
+                                  &context,NULL));
+  cl_mem_flags orig_flags;
+  CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags),
+                                  &orig_flags,NULL));
+  orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;
+
+  mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
+                              mat.host_ptr(), &error_flag);
+  CL_CHECK_ERR(error_flag);
+  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
+  return UCL_SUCCESS;
+}
+
 template <class mat_type>
 inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
-                       const enum UCL_MEMOPT kind) {
-  cl_int error_flag;
-  if (kind==UCL_VIEW) {
-    mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
-                                n,mat.host_ptr(),&error_flag);
-    CL_CHECK_ERR(error_flag);
-    return UCL_SUCCESS;
-  }
-  if (kind==UCL_WRITE_OPTIMIZED) {
-    mat.cbegin()=clCreateBuffer(dev.context(),
-                                CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
-                                n,NULL,&error_flag);
-    if (error_flag != CL_SUCCESS)
-      return UCL_MEMORY_ERROR;
-    *mat.host_ptr() = (typename mat_type::data_type*)
-      clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE,
-                         CL_MAP_WRITE,0,n,0,NULL,NULL,NULL);
+                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
+  cl_mem_flags buffer_perm;
+  cl_map_flags map_perm;
+  if (kind==UCL_READ_ONLY) {
+    #ifdef CL_VERSION_1_2
+    buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR;
+    #else
+    buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
+    #endif
+    map_perm=CL_MAP_READ;
+  } else if (kind==UCL_WRITE_ONLY) {
+    #ifdef CL_VERSION_1_2
+    buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR;
+    #else
+    buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
+    #endif
+    map_perm=CL_MAP_WRITE;
   } else {
-    mat.cbegin()=clCreateBuffer(dev.context(),
-                                CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
-                                n,NULL,&error_flag);
-    if (error_flag != CL_SUCCESS)
-      return UCL_MEMORY_ERROR;
-    *mat.host_ptr() = (typename mat_type::data_type*)
-      clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE,
-                         CL_MAP_READ & CL_MAP_WRITE,
-                         0,n,0,NULL,NULL,NULL);
+    buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
+    map_perm=CL_MAP_READ | CL_MAP_WRITE;
   }
+
+  cl_int error_flag;
+  mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,NULL,&error_flag);
+  if (error_flag != CL_SUCCESS)
+    return UCL_MEMORY_ERROR;
+
+  *mat.host_ptr() = (typename mat_type::data_type*)
+    clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE,
+                       map_perm,0,n,0,NULL,NULL,NULL);
   mat.cq()=dev.cq();
   CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
   return UCL_SUCCESS;
 }
 
 template <class mat_type>
-inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
-  CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
-  CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
+inline int _host_view(mat_type &mat, UCL_Device &dev, const size_t n) {
+  cl_int error_flag;
+  mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
+                              n,mat.host_ptr(),&error_flag);
+  CL_CHECK_ERR(error_flag);
+  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
+  return UCL_SUCCESS;
+}
+
+template <class mat_type>
+inline void _host_free(mat_type &mat) {
+  if (mat.cols()>0) {
+    CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
+    CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
+  }
 }
 
 template <class mat_type>
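The rewritten allocators all follow the same pinned-memory pattern: create the buffer with CL_MEM_ALLOC_HOST_PTR, then obtain a host pointer with a blocking map. A minimal sketch of that pattern in isolation (function name illustrative; 'context' and 'queue' are assumed valid):

  #include <CL/cl.h>

  void *alloc_pinned(cl_context context, cl_command_queue queue,
                     size_t n, cl_mem &buf) {
    cl_int err;
    buf=clCreateBuffer(context,CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR,
                       n,NULL,&err);
    if (err!=CL_SUCCESS) return NULL;
    // Blocking map; the pointer stays valid until clEnqueueUnmapMemObject.
    return clEnqueueMapBuffer(queue,buf,CL_TRUE,CL_MAP_READ|CL_MAP_WRITE,
                              0,n,0,NULL,NULL,&err);
  }

Note also the old code's `CL_MAP_READ & CL_MAP_WRITE` (a bitwise AND yielding 0) is gone; the new map_perm logic uses the intended OR combinations.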
@@ -138,28 +194,26 @@ inline int _host_resize(mat_type &mat, const size_t n) {
   cl_context context;
   CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                   &context,NULL));
+  cl_mem_flags buffer_perm;
+  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_FLAGS,sizeof(buffer_perm),
+                                  &buffer_perm,NULL));
+
   CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
-  if (mat.kind()==UCL_WRITE_OPTIMIZED) {
-    mat.cbegin()=clCreateBuffer(context,
-                                CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
-                                n,NULL,&error_flag);
-    if (error_flag != CL_SUCCESS)
-      return UCL_MEMORY_ERROR;
-    *mat.host_ptr() = (typename mat_type::data_type*)
-      clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
-                         CL_MAP_WRITE,0,n,0,NULL,NULL,NULL);
-  } else {
-    mat.cbegin()=clCreateBuffer(context,
-                                CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
-                                n,NULL,&error_flag);
-    if (error_flag != CL_SUCCESS)
-      return UCL_MEMORY_ERROR;
-    *mat.host_ptr() = (typename mat_type::data_type*)
-      clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
-                         CL_MAP_READ | CL_MAP_WRITE,
-                         0,n,0,NULL,NULL,NULL);
-  }
+
+  cl_map_flags map_perm;
+  if (mat.kind()==UCL_READ_ONLY)
+    map_perm=CL_MAP_READ;
+  else if (mat.kind()==UCL_WRITE_ONLY)
+    map_perm=CL_MAP_WRITE;
+  else
+    map_perm=CL_MAP_READ | CL_MAP_WRITE;
+
+  mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
+  if (error_flag != CL_SUCCESS)
+    return UCL_MEMORY_ERROR;
+  *mat.host_ptr() = (typename mat_type::data_type*)
+    clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
+                       map_perm,0,n,0,NULL,NULL,NULL);
   return UCL_SUCCESS;
 }
 
@@ -179,9 +233,17 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
   if (kind==UCL_READ_WRITE)
     flag=CL_MEM_READ_WRITE;
   else if (kind==UCL_READ_ONLY)
+    #ifdef CL_VERSION_1_2
+    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
+    #else
     flag=CL_MEM_READ_ONLY;
+    #endif
   else if (kind==UCL_WRITE_ONLY)
+    #ifdef CL_VERSION_1_2
+    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
+    #else
     flag=CL_MEM_WRITE_ONLY;
+    #endif
   else
     assert(0==1);
   mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
@@ -200,9 +262,17 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
   if (kind==UCL_READ_WRITE)
     flag=CL_MEM_READ_WRITE;
   else if (kind==UCL_READ_ONLY)
+    #ifdef CL_VERSION_1_2
+    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
+    #else
    flag=CL_MEM_READ_ONLY;
+    #endif
   else if (kind==UCL_WRITE_ONLY)
+    #ifdef CL_VERSION_1_2
+    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
+    #else
     flag=CL_MEM_WRITE_ONLY;
+    #endif
   else
     assert(0==1);
   mat.cbegin()=clCreateBuffer(dev.context(),flag,n,NULL,
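These hunks pair each device-access flag with the complementary host-access hint that OpenCL 1.2 added, which lets the runtime place the buffer more aggressively. A compact sketch of the same idea (helper name illustrative; 'context' assumed valid):

  #include <CL/cl.h>

  cl_mem make_device_read_only(cl_context context, size_t n, cl_int *err) {
    cl_mem_flags flag=CL_MEM_READ_ONLY;   // kernels may only read it
  #ifdef CL_VERSION_1_2
    flag|=CL_MEM_HOST_WRITE_ONLY;         // the host may only write it
  #endif
    return clCreateBuffer(context,flag,n,NULL,err);
  }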
@@ -238,8 +308,10 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t rows,
 
 template <class mat_type>
 inline void _device_free(mat_type &mat) {
-  CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
-  CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
+  if (mat.cols()>0) {
+    CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
+    CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
+  }
 }
 
 template <class mat_type>
@@ -255,9 +327,17 @@ inline int _device_resize(mat_type &mat, const size_t n) {
   if (mat.kind()==UCL_READ_WRITE)
     flag=CL_MEM_READ_WRITE;
   else if (mat.kind()==UCL_READ_ONLY)
+    #ifdef CL_VERSION_1_2
+    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
+    #else
     flag=CL_MEM_READ_ONLY;
+    #endif
   else if (mat.kind()==UCL_WRITE_ONLY)
+    #ifdef CL_VERSION_1_2
+    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
+    #else
     flag=CL_MEM_WRITE_ONLY;
+    #endif
   else
     assert(0==1);
   mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
@@ -285,9 +365,17 @@ inline int _device_resize(mat_type &mat, const size_t rows,
   if (mat.kind()==UCL_READ_WRITE)
     flag=CL_MEM_READ_WRITE;
   else if (mat.kind()==UCL_READ_ONLY)
+    #ifdef CL_VERSION_1_2
+    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
+    #else
     flag=CL_MEM_READ_ONLY;
+    #endif
   else if (mat.kind()==UCL_WRITE_ONLY)
+    #ifdef CL_VERSION_1_2
+    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
+    #else
     flag=CL_MEM_WRITE_ONLY;
+    #endif
   else
     assert(0==1);
   mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag);
@@ -344,7 +432,19 @@ inline void _ocl_kernel_from_source(cl_context &context, cl_device_id &device,
 }
 
 template <class mat_type>
-inline void _device_zero(mat_type &mat, const size_t n) {
+inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
+  #ifdef CL_VERSION_1_2
+  #ifndef __APPLE__
+  #define UCL_CL_ZERO
+  #endif
+  #endif
+
+  #ifdef UCL_CL_ZERO
+  cl_int zeroint=0;
+  CL_SAFE_CALL(clEnqueueFillBuffer(cq,mat.begin(),&zeroint,sizeof(cl_int),
+                                   mat.byteoff(),n,0,NULL,NULL));
+
+  #else
   cl_context context;
   CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                   &context,NULL));
@@ -354,17 +454,20 @@ inline void _device_zero(mat_type &mat, const size_t n) {
 
   const char * szero[3]={
     "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-    "__kernel void _device_zero(__global NUMTYP *a)",
-    "  { int gid=get_global_id(0); a[gid]=(NUMTYP)0; }"
+    "__kernel void _device_zero(__global NUMTYP *a, const int offset)",
+    "  { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }"
   };
 
   cl_kernel kzero;
   _ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero",
               _UCL_DATA_ID<typename mat_type::data_type>::numtyp_flag());
+
+  cl_int offset=mat.offset();
   CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin()));
+  CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset));
   size_t kn=n/sizeof(typename mat_type::data_type);
-  CL_SAFE_CALL(clEnqueueNDRangeKernel(mat.cq(),kzero,1,0,&kn,0,0,0,0));
+  CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,0,&kn,0,0,0,0));
+  #endif
 }
 
 // --------------------------------------------------------------------------
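When UCL_CL_ZERO is defined, the compiled zero-kernel is replaced entirely by the OpenCL 1.2 fill primitive. A self-contained sketch of that fast path (helper name illustrative; 'cq' and 'buf' assumed valid, 'bytes' a multiple of the pattern size):

  #include <CL/cl.h>

  void zero_buffer(cl_command_queue cq, cl_mem buf,
                   size_t offset, size_t bytes) {
    cl_int zero=0;
    // Replicates the 4-byte pattern across [offset, offset+bytes).
    clEnqueueFillBuffer(cq,buf,&zero,sizeof(cl_int),offset,bytes,0,NULL,NULL);
  }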
@@ -470,9 +573,15 @@ template <> struct _ucl_memcpy<1,0> {
                         cl_command_queue &cq, const cl_bool block,
                         const size_t dst_offset, const size_t src_offset) {
     if (src.cbegin()==dst.cbegin()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_COPY 1S\n";
+      #endif
       if (block) ucl_sync(cq);
       return;
     }
+    #ifdef UCL_DBG_MEM_TRACE
+    std::cerr << "UCL_COPY 1NS\n";
+    #endif
     CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n,
                                      dst.begin(),0,NULL,NULL));
   }
@@ -484,8 +593,14 @@ template <> struct _ucl_memcpy<1,0> {
                         size_t dst_offset, size_t src_offset) {
     if (src.cbegin()==dst.cbegin()) {
       if (block) ucl_sync(cq);
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_COPY 2S\n";
+      #endif
       return;
     }
+    #ifdef UCL_DBG_MEM_TRACE
+    std::cerr << "UCL_COPY 2NS\n";
+    #endif
     if (spitch==dpitch && dst.cols()==src.cols() &&
         src.cols()==cols/src.element_size())
       CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,
@@ -511,8 +626,14 @@ template <> struct _ucl_memcpy<0,1> {
                         const size_t dst_offset, const size_t src_offset) {
     if (src.cbegin()==dst.cbegin()) {
       if (block) ucl_sync(cq);
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_COPY 3S\n";
+      #endif
       return;
     }
+    #ifdef UCL_DBG_MEM_TRACE
+    std::cerr << "UCL_COPY 3NS\n";
+    #endif
     CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n,
                                       src.begin(),0,NULL,NULL));
   }
@@ -524,8 +645,14 @@ template <> struct _ucl_memcpy<0,1> {
                         size_t dst_offset, size_t src_offset) {
     if (src.cbegin()==dst.cbegin()) {
       if (block) ucl_sync(cq);
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_COPY 4S\n";
+      #endif
       return;
     }
+    #ifdef UCL_DBG_MEM_TRACE
+    std::cerr << "UCL_COPY 4NS\n";
+    #endif
     if (spitch==dpitch && dst.cols()==src.cols() &&
         src.cols()==cols/src.element_size())
       CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,
@@ -549,9 +676,17 @@ template <int mem1, int mem2> struct _ucl_memcpy {
   static inline void mc(p1 &dst, const p2 &src, const size_t n,
                         cl_command_queue &cq, const cl_bool block,
                         const size_t dst_offset, const size_t src_offset) {
-    if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset)
+    if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
       CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
                                        dst_offset,n,0,NULL,NULL));
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_COPY 6NS\n";
+      #endif
+    }
+    #ifdef UCL_DBG_MEM_TRACE
+    else std::cerr << "UCL_COPY 6S\n";
+    #endif
+
     if (block==CL_TRUE) ucl_sync(cq);
   }
   template <class p1, class p2>
@@ -561,6 +696,9 @@ template <int mem1, int mem2> struct _ucl_memcpy {
                         const cl_bool block,
                         size_t dst_offset, size_t src_offset) {
     if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_COPY 7NS\n";
+      #endif
       if (spitch==dpitch && dst.cols()==src.cols() &&
           src.cols()==cols/src.element_size())
         CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
@@ -575,6 +713,10 @@ template <int mem1, int mem2> struct _ucl_memcpy {
         dst_offset+=dpitch;
       }
     }
+    #ifdef UCL_DBG_MEM_TRACE
+    else std::cerr << "UCL_COPY 7S\n";
+    #endif
+
     if (block==CL_TRUE) ucl_sync(cq);
   }
 };
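The UCL_COPY trace lines added throughout these routines compile in only when UCL_DBG_MEM_TRACE is defined. A minimal way to enable them (an assumption about typical use; passing the equivalent -D flag to the compiler works the same way):

  // Define before any geryon header is included, or compile with
  // -DUCL_DBG_MEM_TRACE; each copy path then logs its branch on std::cerr.
  #define UCL_DBG_MEM_TRACE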
@@ -27,6 +27,12 @@
 #include "ocl_macros.h"
 #include "ocl_device.h"
 
+#ifdef CL_VERSION_1_2
+#define UCL_OCL_MARKER(cq,event) clEnqueueMarkerWithWaitList(cq,0,NULL,event)
+#else
+#define UCL_OCL_MARKER clEnqueueMarker
+#endif
+
 namespace ucl_opencl {
 
 /// Class for timing OpenCL events
@@ -63,10 +69,10 @@ class UCL_Timer {
   }
 
   /// Start timing on default command queue
-  inline void start() { clEnqueueMarker(_cq,&start_event); }
+  inline void start() { UCL_OCL_MARKER(_cq,&start_event); }
 
   /// Stop timing on default command queue
-  inline void stop() { clEnqueueMarker(_cq,&stop_event); }
+  inline void stop() { UCL_OCL_MARKER(_cq,&stop_event); }
 
   /// Block until the start event has been reached on device
   inline void sync_start()
@@ -78,7 +84,7 @@ class UCL_Timer {
 
   /// Set the time elapsed to zero (not the total_time)
   inline void zero()
-    { clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); }
+    { UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); }
 
   /// Set the total time to zero
   inline void zero_total() { _total_time=0.0; }
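The UCL_OCL_MARKER wrapper exists because clEnqueueMarker was deprecated in OpenCL 1.2 in favor of clEnqueueMarkerWithWaitList; both produce an event whose timestamps the timer reads. A hedged sketch of how such marker events translate to milliseconds (assumes the queue was created with CL_QUEUE_PROFILING_ENABLE and both events have completed):

  #include <CL/cl.h>

  double elapsed_ms(cl_event start_event, cl_event stop_event) {
    cl_ulong ts=0, te=0;
    clGetEventProfilingInfo(start_event,CL_PROFILING_COMMAND_END,
                            sizeof(cl_ulong),&ts,NULL);
    clGetEventProfilingInfo(stop_event,CL_PROFILING_COMMAND_END,
                            sizeof(cl_ulong),&te,NULL);
    return (te-ts)*1e-6;  // profiling timestamps are in nanoseconds
  }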
@@ -58,19 +58,36 @@
  * calls for reserving and copying memory **/
 class UCL_BaseMat {
  public:
-  UCL_BaseMat() : _cq(0) { }
+  UCL_BaseMat() : _cq(0), _kind(UCL_VIEW) { }
   virtual ~UCL_BaseMat() { }
   /// Return the default command queue/stream associated with this data
   inline command_queue & cq() { return _cq; }
+  /// Change the default command queue associated with matrix
+  inline void cq(command_queue &cq_in) { _cq=cq_in; }
   /// Block until command_queue associated with matrix is complete
   inline void sync() { ucl_sync(_cq); }
+  /// Return the type/permissions of memory allocation
+  /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED
+    * or UCL_VIEW **/
+  inline enum UCL_MEMOPT kind() const { return _kind; }
+
+  inline bool shared_mem_device() {
+    #ifdef _OCL_MAT
+    cl_device_id device;
+    CL_SAFE_CALL(clGetCommandQueueInfo(_cq,CL_QUEUE_DEVICE,
+                                       sizeof(cl_device_id),&device,NULL));
+    cl_device_type device_type;
+    CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE,
+                                 sizeof(device_type),&device_type,NULL));
+    return _shared_mem_device(device_type);
+    #else
+    return false;
+    #endif
+  }
 
-  #ifdef UCL_DEBUG
-  // Returns the type of host allocation
-  virtual inline enum UCL_MEMOPT kind() const { return UCL_NOT_PINNED; }
-  #endif
  protected:
   command_queue _cq;
+  enum UCL_MEMOPT _kind;
 };
 
 #endif
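shared_mem_device() lets callers skip explicit transfers when host and device address the same memory. The _shared_mem_device helper it calls is not shown in this hunk; a hedged sketch of the kind of predicate it presumably wraps:

  // Assumption: a CPU OpenCL device addresses host memory directly, so
  // host<->device copies can be elided for it.
  #include <CL/cl.h>

  inline bool _shared_mem_device_sketch(cl_device_type t) {
    return (t & CL_DEVICE_TYPE_CPU) != 0;
  }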
@@ -102,6 +102,30 @@
 // Only allow this file to be included by nvc_memory.h and ocl_memory.h
 #ifdef UCL_COPY_ALLOW
 
+// --------------------------------------------------------------------------
+// - CHECK PERMISSIONS FOR SOURCE AND DESTINATION IN COPY
+// --------------------------------------------------------------------------
+template <class mat1, class mat2>
+inline void _check_ucl_copy_perm(mat1 &dst, mat2 &src) {
+  if ((int)mat1::MEM_TYPE==(int)mat2::MEM_TYPE) {
+    if (dst.kind()==UCL_READ_ONLY) {
+      std::cerr << "Attempt to copy where destination is UCL_READ_ONLY\n";
+      assert(0==1);
+    } else if (src.kind()==UCL_WRITE_ONLY) {
+      std::cerr << "Attempt to copy where source is UCL_WRITE_ONLY\n";
+      assert(0==1);
+    }
+  } else {
+    if (dst.kind()==UCL_WRITE_ONLY) {
+      std::cerr << "Destination in host-device copy cannot be UCL_WRITE_ONLY\n";
+      assert(0==1);
+    } else if (src.kind()==UCL_READ_ONLY) {
+      std::cerr << "Source in host-device copy cannot be UCL_READ_ONLY\n";
+      assert(0==1);
+    }
+  }
+}
+
 // --------------------------------------------------------------------------
 // - HOST-HOST COPY ROUTINES
 // --------------------------------------------------------------------------
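With UCL_DEBUG defined, a copy whose permissions contradict the new access kinds now aborts with a clear message instead of silently violating a host-access hint. A hedged usage sketch (container and function names follow the Geryon API used in this file; the exact alloc overload is an assumption):

  // UCL_H_Vec<double> h_buf;
  // h_buf.alloc(n,dev,UCL_READ_ONLY);      // host may only read this buffer
  // ucl_copy(h_buf,other_host_buf,n,false); // asserts in _check_ucl_copy_perm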
@@ -117,9 +141,20 @@ template <> struct _host_host_copy<1,1> {
     assert(mat1::PADDED==0 && mat2::PADDED==0);
     assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
     #endif
-    if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0)
+    if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) {
+      #ifdef _OCL_MAT
+      if (dst.begin()==src.begin()) {
+        #ifdef UCL_DBG_MEM_TRACE
+        std::cerr << "UCL_COPY 7S\n";
+        #endif
+        return;
+      }
+      #endif
       memcpy(dst.begin(),src.begin(),numel*sizeof(typename mat1::data_type));
-    else
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_COPY 7NS\n";
+      #endif
+    } else
       for (size_t i=0; i<numel; i++)
         dst[i]=static_cast<typename mat1::data_type>(src[i]);
   }
@@ -138,15 +173,27 @@ template <> struct _host_host_copy<1,1> {
       src_row_size=cols;
     else
      src_row_size=src.row_size();
-    if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0)
+    if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) {
+      #ifdef _OCL_MAT
+      if (dst.begin()==src.begin()) {
+        #ifdef UCL_DBG_MEM_TRACE
+        std::cerr << "UCL_COPY 8S\n";
+        #endif
+        return;
+      }
+      #endif
+
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_COPY 8NS\n";
+      #endif
       for (size_t i=0; i<rows; i++)
         memcpy(dst.begin()+i*dst_row_size,src.begin()+i*src_row_size,
                cols*sizeof(typename mat1::data_type));
-    else
+    } else
       for (size_t j=0; j<rows; j++) {
-        int dst_i=j*dst_row_size;
-        int d_end=dst_i+cols;
-        int src_i=j*src_row_size;
+        size_t dst_i=j*dst_row_size;
+        size_t d_end=dst_i+cols;
+        size_t src_i=j*src_row_size;
         for (; dst_i<d_end; dst_i++) {
           dst[dst_i]=static_cast<typename mat1::data_type>(src[src_i]);
           src_i++;
@@ -216,15 +263,14 @@ template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
       ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                  src.row_bytes(),cols*sizeof(typename mat2::data_type),
                  rows);
-      int dst_i=0;
-      int buff_i=0;
+      size_t dst_i=0, buff_i=0, doff=dst.cols()-cols;
       for (size_t i=0; i<rows; i++) {
         for (size_t j=0; j<cols; j++) {
           dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
           buff_i++;
           dst_i++;
         }
-        dst_i+=dst.cols()-cols;
+        dst_i+=doff;
       }
     }
   }
@@ -255,15 +301,14 @@ template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
                  src.row_bytes(),cols*sizeof(typename mat2::data_type),
                  rows,cq);
       cast_buffer.sync();
-      int dst_i=0;
-      int buff_i=0;
+      size_t dst_i=0, buff_i=0, doff=dst.cols()-cols;
       for (size_t i=0; i<rows; i++) {
         for (size_t j=0; j<cols; j++) {
           dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
           buff_i++;
           dst_i++;
         }
-        dst_i+=dst.cols()-cols;
+        dst_i+=doff;
       }
     }
   }
@@ -293,38 +338,62 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
     assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
     if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
     if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
+    if (mat3::VECTOR==0) {
+      assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
+      assert(dst.rows()>=rows && dst.cols()>=cols);
+    }
     #endif
     if (mat2::VECTOR) {
-      for (size_t i=0; i<rows*cols; i++)
-        cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
-      ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
-                 cols*sizeof(typename mat1::data_type),
-                 cols*sizeof(typename mat1::data_type),rows);
+      if (mat3::VECTOR==0) {
+        size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols;
+        for (size_t i=0; i<rows; i++) {
+          for (size_t j=0; j<cols; j++) {
+            cast_buffer[ci]=static_cast<typename mat3::data_type>(src[si]);
+            ci++;
+            si++;
+          }
+          ci+=co;
+          si+=so;
+        }
+        ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(),
+                   cols*sizeof(typename mat1::data_type),rows);
+      } else {
+        for (size_t i=0; i<rows*cols; i++)
+          cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
+        ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
+                   cols*sizeof(typename mat1::data_type),
+                   cols*sizeof(typename mat1::data_type),rows);
+      }
     } else if (mat1::VECTOR) {
-      int src_i=0;
-      int buf_i=0;
+      size_t src_i=0, buf_i=0, soff=src.cols()-cols;
       for (size_t i=0; i<rows; i++) {
         for (size_t j=0; j<cols; j++) {
           cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
           buf_i++;
           src_i++;
         }
-        src_i+=src.cols()-cols;
+        src_i+=soff;
       }
       ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows);
     } else {
-      int src_i=0;
-      int buf_i=0;
+      size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch;
+      if (mat3::VECTOR==0) {
+        co=cast_buffer.cols()-cols;
+        spitch=cast_buffer.row_bytes();
+      } else {
+        co=0;
+        spitch=cols*sizeof(typename mat1::data_type);
+      }
       for (size_t i=0; i<rows; i++) {
         for (size_t j=0; j<cols; j++) {
           cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
           buf_i++;
           src_i++;
         }
-        src_i+=src.cols()-cols;
+        src_i+=so;
+        buf_i+=co;
       }
-      ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
-                 cols*sizeof(typename mat1::data_type),
+      ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch,
                  cols*sizeof(typename mat1::data_type),rows);
     }
   }
@@ -337,38 +406,62 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
     assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
     if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
     if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
+    if (mat3::VECTOR==0) {
+      assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
+      assert(dst.rows()>=rows && dst.cols()>=cols);
+    }
     #endif
     if (mat2::VECTOR) {
-      for (size_t i=0; i<rows*cols; i++)
-        cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
-      ucl_mv_cpy(dst,dst.row_bytes(),
-                 cast_buffer,cols*sizeof(typename mat1::data_type),
-                 cols*sizeof(typename mat1::data_type),rows,cq);
+      if (mat3::VECTOR==0) {
+        size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols;
+        for (size_t i=0; i<rows; i++) {
+          for (size_t j=0; j<cols; j++) {
+            cast_buffer[ci]=static_cast<typename mat3::data_type>(src[si]);
+            ci++;
+            si++;
+          }
+          ci+=co;
+          si+=so;
+        }
+        ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(),
+                   cols*sizeof(typename mat1::data_type),rows);
+      } else {
+        for (size_t i=0; i<rows*cols; i++)
+          cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
+        ucl_mv_cpy(dst,dst.row_bytes(),
+                   cast_buffer,cols*sizeof(typename mat1::data_type),
+                   cols*sizeof(typename mat1::data_type),rows,cq);
+      }
     } else if (mat1::VECTOR) {
-      int src_i=0;
-      int buf_i=0;
+      size_t src_i=0, buf_i=0, soff=src.cols()-cols;
       for (size_t i=0; i<rows; i++) {
         for (size_t j=0; j<cols; j++) {
           cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
           buf_i++;
           src_i++;
         }
-        src_i+=src.cols()-cols;
+        src_i+=soff;
       }
       ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows,cq);
     } else {
-      int src_i=0;
-      int buf_i=0;
+      size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch;
+      if (mat3::VECTOR==0) {
+        co=cast_buffer.cols()-cols;
+        spitch=cast_buffer.row_bytes();
+      } else {
+        co=0;
+        spitch=cols*sizeof(typename mat1::data_type);
+      }
       for (size_t i=0; i<rows; i++) {
         for (size_t j=0; j<cols; j++) {
           cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
           buf_i++;
           src_i++;
         }
-        src_i+=src.cols()-cols;
+        src_i+=so;
+        buf_i+=co;
       }
-      ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
-                 cols*sizeof(typename mat1::data_type),
+      ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch,
                  cols*sizeof(typename mat1::data_type),rows,cq);
     }
   }
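The rewritten loops walk padded matrices row by row with plain pitch arithmetic: advance through a row, then skip the padding. A small self-contained illustration of the same indexing pattern (names are illustrative):

  #include <cstddef>

  // Copy an rows x cols block out of a padded row-major matrix whose rows
  // are 'src_pitch' elements long into a tightly packed destination.
  template <class T>
  void pack_rows(T *dst, const T *src, size_t rows, size_t cols,
                 size_t src_pitch) {
    size_t si=0, di=0, soff=src_pitch-cols; // elements skipped per source row
    for (size_t i=0; i<rows; i++) {
      for (size_t j=0; j<cols; j++)
        dst[di++]=src[si++];
      si+=soff;                             // jump over the padding
    }
  }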
@@ -444,9 +537,13 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
   #endif
   if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
     ucl_copy(dst,src,numel,cq);
-  else
+  else {
+    #ifdef UCL_DEBUG
+    _check_ucl_copy_perm(dst,src);
+    #endif
     _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                       cast_buffer,cq);
+  }
 }
 
 /// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
@@ -463,6 +560,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
   assert(dst.numel()>=numel && src.numel()>=numel);
   assert(cast_buffer.numel()>=numel);
   assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
+  _check_ucl_copy_perm(dst,src);
   #endif
   if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
     ucl_copy(dst,src,numel,async);
@@ -491,6 +589,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
   assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
   assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
   assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
+  _check_ucl_copy_perm(dst,src);
   #endif
   if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
     _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
@@ -498,12 +597,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
            (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
     if (mat1::MEM_TYPE==1) {
       UCL_H_Vec<typename mat2::data_type> cast_buffer;
-      cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED);
+      cast_buffer.alloc(numel,dst,UCL_READ_ONLY);
       _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                         cast_buffer,cq);
     } else {
       UCL_H_Vec<typename mat1::data_type> cast_buffer;
-      cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED);
+      cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY);
       _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                         cast_buffer,cq);
     }
@@ -529,6 +628,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
   #ifdef UCL_DEBUG
   assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
   assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
+  _check_ucl_copy_perm(dst,src);
   #endif
   if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
     _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
@@ -538,12 +638,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
            (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
     if (mat1::MEM_TYPE==1) {
       UCL_H_Vec<typename mat2::data_type> cast_buffer;
-      cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED);
+      cast_buffer.alloc(numel,dst,UCL_READ_ONLY);
       _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                         cast_buffer);
     } else {
       UCL_H_Vec<typename mat1::data_type> cast_buffer;
-      cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED);
+      cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY);
       _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                         cast_buffer);
     }
@@ -574,9 +674,13 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
     ucl_copy(dst,src,rows,cols,async);
   else if (async)
     ucl_copy(dst,src,rows,cols,dst.cq());
-  else
+  else {
+    #ifdef UCL_DEBUG
+    _check_ucl_copy_perm(dst,src);
+    #endif
     _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                       cast_buffer);
+  }
 }
 
 /// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer)
@@ -595,9 +699,13 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
                           command_queue &cq) {
   if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
     ucl_copy(dst,src,rows,cols,cq);
-  else
+  else {
+    #ifdef UCL_DEBUG
+    _check_ucl_copy_perm(dst,src);
+    #endif
     _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                       cast_buffer,cq);
+  }
 }
 
 /// Asynchronous copy of subset matrix rows,cols (memory already allocated)
@@ -617,18 +725,21 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
 template <class mat1, class mat2>
 inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
                      const size_t cols, command_queue &cq) {
+  #ifdef UCL_DEBUG
+  _check_ucl_copy_perm(dst,src);
+  #endif
   if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
     _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
   else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
            (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
     if (mat1::MEM_TYPE==1) {
       UCL_H_Vec<typename mat2::data_type> cast_buffer;
-      cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED);
+      cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY);
       _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                         cast_buffer,cq);
     } else {
       UCL_H_Vec<typename mat1::data_type> cast_buffer;
-      cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED);
+      cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY);
       _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                         cast_buffer,cq);
     }
@@ -678,6 +789,9 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
 template <class mat1, class mat2>
 inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
                      const size_t cols, const bool async) {
+  #ifdef UCL_DEBUG
+  _check_ucl_copy_perm(dst,src);
+  #endif
   if (async)
     ucl_copy(dst,src,rows,cols,dst.cq());
   else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
@@ -686,12 +800,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
            (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
     if (mat1::MEM_TYPE==1) {
       UCL_H_Vec<typename mat2::data_type> cast_buffer;
-      cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED);
+      cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY);
       _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                         cast_buffer);
     } else {
       UCL_H_Vec<typename mat1::data_type> cast_buffer;
-      cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED);
+      cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY);
       _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                         cast_buffer);
     }
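Taken together, these overloads make a mixed-precision transfer transparent: when source and destination element types differ, ucl_copy stages through a host buffer allocated with the access kind matching its role (UCL_READ_ONLY when the host only reads the staged data back, UCL_WRITE_ONLY when it only fills it). A hedged usage sketch with the Geryon containers seen in this file (exact alloc overloads are assumptions):

  // UCL_D_Vec<float>  d_vals;  d_vals.alloc(n,dev);            // device, single
  // UCL_H_Vec<double> h_vals;  h_vals.alloc(n,dev,UCL_WRITE_ONLY);
  // ucl_copy(h_vals,d_vals,n,false);  // device->host with float->double cast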
@@ -39,14 +39,14 @@ class UCL_D_Mat : public UCL_BaseMat {
   };
   typedef numtyp data_type;
 
-  UCL_D_Mat() : _rows(0), _kind(UCL_VIEW) {}
-  ~UCL_D_Mat() { if (_kind!=UCL_VIEW) _device_free(*this); }
+  UCL_D_Mat() : _cols(0) {}
+  ~UCL_D_Mat() { _device_free(*this); }
 
   /// Construct with specified rows and cols
   /** \sa alloc() **/
   UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device,
             const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
-    _rows(0), _kind(UCL_VIEW) { alloc(rows,cols,device,kind); }
+    _cols(0) { alloc(rows,cols,device,kind); }
 
   /// Row major matrix on device
   /** The kind parameter controls memory optimizations as follows:
@@ -121,15 +121,11 @@ class UCL_D_Mat : public UCL_BaseMat {
     return err;
   }
 
-  /// Return the type of memory allocation
-  /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
-  inline enum UCL_MEMOPT kind() const { return _kind; }
-
   /// Do not allocate memory, instead use an existing allocation from Geryon
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * \param stride Number of _elements_ between the start of each row **/
   template <class ucl_type>
   inline void view(ucl_type &input, const size_t rows, const size_t cols,
@@ -142,8 +138,10 @@ class UCL_D_Mat : public UCL_BaseMat {
     _row_size=stride;
     this->_cq=input.cq();
     #ifdef _OCL_MAT
-    _offset=0;
+    _offset=input.offset();
     _array=input.cbegin();
+    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
+    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
     #else
     _device_view(&_array,input.begin());
     #endif
@@ -157,7 +155,7 @@ class UCL_D_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
-    *   allocating container **/
+    *   allocating container when using CUDA APIs **/
   template <class ucl_type>
   inline void view(ucl_type &input, const size_t rows, const size_t cols)
     { view(input,rows,cols,input.row_size()); }
@ -166,7 +164,7 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||||||
/** This function must be passed a Geryon vector or matrix container.
|
/** This function must be passed a Geryon vector or matrix container.
|
||||||
* No memory is freed when the object is destructed.
|
* No memory is freed when the object is destructed.
|
||||||
* - The view does not prevent the memory from being freed by the
|
* - The view does not prevent the memory from being freed by the
|
||||||
* allocating container
|
* allocating container when using CUDA APIs
|
||||||
* - If a matrix is used a input, all elements (including padding)
|
* - If a matrix is used a input, all elements (including padding)
|
||||||
* will be used for view **/
|
* will be used for view **/
|
||||||
template <class ucl_type>
|
template <class ucl_type>
|
||||||
@ -177,7 +175,7 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||||||
/** This function must be passed a Geryon vector or matrix container.
|
/** This function must be passed a Geryon vector or matrix container.
|
||||||
* No memory is freed when the object is destructed.
|
* No memory is freed when the object is destructed.
|
||||||
* - The view does not prevent the memory from being freed by the
|
* - The view does not prevent the memory from being freed by the
|
||||||
* allocating container
|
* allocating container when using CUDA APIs
|
||||||
* - If a matrix is used a input, all elements (including padding)
|
* - If a matrix is used a input, all elements (including padding)
|
||||||
* will be used for view **/
|
* will be used for view **/
|
||||||
template <class ucl_type>
|
template <class ucl_type>
|
||||||
@ -187,7 +185,7 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||||||
/// Do not allocate memory, instead use an existing allocation
|
/// Do not allocate memory, instead use an existing allocation
|
||||||
/** - No memory is freed when the object is destructed.
|
/** - No memory is freed when the object is destructed.
|
||||||
* - The view does not prevent the memory from being freed by the
|
* - The view does not prevent the memory from being freed by the
|
||||||
* allocating container
|
* allocating container when using CUDA APIs
|
||||||
* \param stride Number of _elements_ between the start of each row **/
|
* \param stride Number of _elements_ between the start of each row **/
|
||||||
template <class ptr_type>
|
template <class ptr_type>
|
||||||
inline void view(ptr_type input, const size_t rows, const size_t cols,
|
inline void view(ptr_type input, const size_t rows, const size_t cols,
|
||||||
@ -205,13 +203,15 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||||||
#endif
|
#endif
|
||||||
#ifdef _OCL_MAT
|
#ifdef _OCL_MAT
|
||||||
_offset=0;
|
_offset=0;
|
||||||
|
CL_SAFE_CALL(clRetainMemObject(input));
|
||||||
|
CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Do not allocate memory, instead use an existing allocation
|
/// Do not allocate memory, instead use an existing allocation
|
||||||
/** - No memory is freed when the object is destructed.
|
/** - No memory is freed when the object is destructed.
|
||||||
* - The view does not prevent the memory from being freed by the
|
* - The view does not prevent the memory from being freed by the
|
||||||
* allocating container **/
|
* allocating container when using CUDA APIs **/
|
||||||
template <class ptr_type>
|
template <class ptr_type>
|
||||||
inline void view(ptr_type input, const size_t rows, const size_t cols,
|
inline void view(ptr_type input, const size_t rows, const size_t cols,
|
||||||
UCL_Device &dev) { view(input,rows,cols,cols,dev); }
|
UCL_Device &dev) { view(input,rows,cols,cols,dev); }
|
||||||
@ -219,7 +219,7 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||||||
/// Do not allocate memory, instead use an existing allocation
|
/// Do not allocate memory, instead use an existing allocation
|
||||||
/** - No memory is freed when the object is destructed.
|
/** - No memory is freed when the object is destructed.
|
||||||
* - The view does not prevent the memory from being freed by the
|
* - The view does not prevent the memory from being freed by the
|
||||||
* allocating container **/
|
* allocating container when using CUDA APIs **/
|
||||||
template <class ptr_type>
|
template <class ptr_type>
|
||||||
inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
|
inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
|
||||||
{ view(input,1,cols,dev); }
|
{ view(input,1,cols,dev); }
|
||||||
@ -228,7 +228,7 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||||||
/** This function must be passed a Geryon vector or matrix container.
|
/** This function must be passed a Geryon vector or matrix container.
|
||||||
* No memory is freed when the object is destructed.
|
* No memory is freed when the object is destructed.
|
||||||
* - The view does not prevent the memory from being freed by the
|
* - The view does not prevent the memory from being freed by the
|
||||||
* allocating container
|
* allocating container when using CUDA APIs
|
||||||
* \param stride Number of _elements_ between the start of each row **/
|
* \param stride Number of _elements_ between the start of each row **/
|
||||||
template <class ucl_type>
|
template <class ucl_type>
|
||||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
|
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
|
||||||
@ -242,7 +242,9 @@ class UCL_D_Mat : public UCL_BaseMat {
|
|||||||
this->_cq=input.cq();
|
this->_cq=input.cq();
|
||||||
#ifdef _OCL_MAT
|
#ifdef _OCL_MAT
|
||||||
_array=input.begin();
|
_array=input.begin();
|
||||||
_offset=offset;
|
_offset=offset+input.offset();
|
||||||
|
CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
|
||||||
|
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
|
||||||
#else
|
#else
|
||||||
_device_view(&_array,input.begin(),offset,sizeof(numtyp));
|
_device_view(&_array,input.begin(),offset,sizeof(numtyp));
|
||||||
#endif
|
#endif
|
||||||
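
Note the `_offset=offset+input.offset()` change above: on the OpenCL backend,
an offset view taken from another view now composes offsets instead of
discarding the parent's. A sketch under assumed sizes (all names are
illustrative):

    UCL_D_Vec<float> base(1000,dev);
    UCL_D_Vec<float> mid;
    mid.view_offset(100,base,1,400);   // elements [100,500) of base
    UCL_D_Vec<float> sub;
    sub.view_offset(50,mid,1,100);     // begins at base element 150, because
                                       // _offset=offset+input.offset()
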
@@ -256,7 +258,7 @@ class UCL_D_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container **/
+    *   allocating container when using CUDA APIs **/
   template <class ucl_type>
   inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                           const size_t cols)
@@ -266,7 +268,7 @@ class UCL_D_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view **/
   template <class ucl_type>
@@ -277,7 +279,7 @@ class UCL_D_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view **/
   template <class ucl_type>
@@ -292,7 +294,7 @@ class UCL_D_Mat : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * \param stride Number of _elements_ between the start of each row **/
   template <class ptr_type>
   inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
@@ -309,6 +311,8 @@ class UCL_D_Mat : public UCL_BaseMat {
     #ifdef _OCL_MAT
     _array=input;
     _offset=offset;
+    CL_SAFE_CALL(clRetainMemObject(input));
+    CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
     #else
     #ifdef _UCL_DEVICE_PTR_MAT
     _array=input+offset*sizeof(numtyp);
@@ -325,7 +329,7 @@ class UCL_D_Mat : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container **/
+    *   allocating container when using CUDA APIs **/
   template <class ptr_type>
   inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
                           const size_t cols, UCL_Device &dev)
@@ -334,7 +338,7 @@ class UCL_D_Mat : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container **/
+    *   allocating container when using CUDA APIs **/
   template <class ptr_type>
   inline void view_offset(const size_t offset, ptr_type input,
                           const size_t cols, UCL_Device &dev)
@@ -342,7 +346,7 @@ class UCL_D_Mat : public UCL_BaseMat {
 
   /// Free memory and set size to 0
   inline void clear()
-    { _rows=0; if (_kind!=UCL_VIEW) { _kind=UCL_VIEW; _device_free(*this); } }
+    { _device_free(*this); _cols=0; _kind=UCL_VIEW; }
 
   /// Resize the allocation to contain cols elements
   /** \note Cannot be used on views **/
@@ -377,11 +381,17 @@ class UCL_D_Mat : public UCL_BaseMat {
     { if (cols>_cols || rows>_rows) return resize(rows,cols);
      else return UCL_SUCCESS; }
 
-  /// Set each element to zero
-  inline void zero() { _device_zero(*this,row_bytes()*_rows); }
+  /// Set each element to zero asynchronously in the default command_queue
+  inline void zero() { zero(_cq); }
+  /// Set first n elements to zero asynchronously in the default command_queue
+  inline void zero(const int n) { zero(n,_cq); }
+  /// Set each element to zero asynchronously
+  inline void zero(command_queue &cq)
+    { _device_zero(*this,row_bytes()*_rows,cq); }
+  /// Set first n elements to zero asynchronously
+  inline void zero(const int n, command_queue &cq)
+    { _device_zero(*this,n*sizeof(numtyp),cq); }
 
-  /// Set first n elements to zero
-  inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); }
 
   #ifdef _UCL_DEVICE_PTR_MAT
   /// For OpenCL, returns a (void *) device pointer to memory allocation
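
The zero() family above becomes queue-aware: the old two overloads are
replaced by four, all asynchronous, with the no-queue forms delegating to the
container's default command queue. A hedged usage sketch (the matrix name and
sizes are illustrative; ucl_sync is the usual Geryon synchronization call):

    UCL_D_Mat<float> m(8,128,dev);   // 8x128 device matrix
    m.zero();                        // all elements, default queue (async)
    m.zero(64);                      // first 64 elements, default queue (async)
    m.zero(m.cq());                  // explicit command_queue overload
    ucl_sync(m.cq());                // synchronize before the host reads m
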
@@ -452,7 +462,6 @@ class UCL_D_Mat : public UCL_BaseMat {
 
  private:
   size_t _pitch, _row_size, _rows, _cols;
-  enum UCL_MEMOPT _kind;
 
   #ifdef _UCL_DEVICE_PTR_MAT
   device_ptr _array;

@@ -39,14 +39,14 @@ class UCL_D_Vec : public UCL_BaseMat {
   };
   typedef numtyp data_type;
 
-  UCL_D_Vec() : _cols(0), _kind(UCL_VIEW) {}
-  ~UCL_D_Vec() { if (_kind!=UCL_VIEW) _device_free(*this); }
+  UCL_D_Vec() : _cols(0) {}
+  ~UCL_D_Vec() { _device_free(*this); }
 
   /// Construct with n columns
   /** \sa alloc() **/
   UCL_D_Vec(const size_t n, UCL_Device &device,
             const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
-    _cols(0), _kind(UCL_VIEW) { alloc(n,device,kind); }
+    _cols(0) { alloc(n,device,kind); }
 
   /// Set up host vector with 'cols' columns and reserve memory
   /** The kind parameter controls memory optimizations as follows:
@@ -119,15 +119,11 @@ class UCL_D_Vec : public UCL_BaseMat {
     return err;
   }
 
-  /// Return the type of memory allocation
-  /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
-  inline enum UCL_MEMOPT kind() const { return _kind; }
-
   /// Do not allocate memory, instead use an existing allocation from Geryon
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container **/
+    *   allocating container when using CUDA APIs **/
   template <class ucl_type>
   inline void view(ucl_type &input, const size_t rows, const size_t cols) {
     #ifdef UCL_DEBUG
@@ -139,8 +135,10 @@ class UCL_D_Vec : public UCL_BaseMat {
     _row_bytes=_cols*sizeof(numtyp);
     this->_cq=input.cq();
     #ifdef _OCL_MAT
-    _offset=0;
+    _offset=input.offset();
     _array=input.cbegin();
+    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
+    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
     #else
     _device_view(&_array,input.begin());
     #endif
@@ -154,7 +152,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * \param stride Number of _elements_ between the start of each row **/
   template <class ucl_type>
   inline void view(ucl_type &input, const size_t rows, const size_t cols,
@@ -164,7 +162,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view **/
   template <class ucl_type>
@@ -175,7 +173,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view **/
   template <class ucl_type>
@@ -185,7 +183,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container **/
+    *   allocating container when using CUDA APIs **/
   template <class ptr_type>
   inline void view(ptr_type input, const size_t rows, const size_t cols,
                    UCL_Device &dev) {
@@ -203,13 +201,15 @@ class UCL_D_Vec : public UCL_BaseMat {
     #endif
     #ifdef _OCL_MAT
     _offset=0;
+    CL_SAFE_CALL(clRetainMemObject(input));
+    CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
     #endif
   }
 
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * \param stride Number of _elements_ between the start of each row **/
   template <class ptr_type>
   inline void view(ptr_type input, const size_t rows, const size_t cols,
@@ -219,7 +219,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container **/
+    *   allocating container when using CUDA APIs **/
   template <class ptr_type>
   inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
     { view(input,1,cols,dev); }
@@ -228,7 +228,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container **/
+    *   allocating container when using CUDA APIs **/
   template <class ucl_type>
   inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                           const size_t cols) {
@@ -242,7 +242,9 @@ class UCL_D_Vec : public UCL_BaseMat {
     this->_cq=input.cq();
     #ifdef _OCL_MAT
     _array=input.begin();
-    _offset=offset;
+    _offset=offset+input.offset();
+    CL_SAFE_CALL(clRetainMemObject(input.begin()));
+    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
     #else
     _device_view(&_array,input.begin(),offset,sizeof(numtyp));
     #endif
@@ -256,7 +258,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * \param stride Number of _elements_ between the start of each row **/
   template <class ucl_type>
   inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
@@ -267,7 +269,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view **/
   template <class ucl_type>
@@ -278,7 +280,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view **/
   template <class ucl_type>
@@ -288,7 +290,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container **/
+    *   allocating container when using CUDA APIs **/
   template <class ptr_type>
   inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
                           const size_t cols, UCL_Device &dev) {
@@ -304,6 +306,8 @@ class UCL_D_Vec : public UCL_BaseMat {
     #ifdef _OCL_MAT
     _array=input;
     _offset=offset;
+    CL_SAFE_CALL(clRetainMemObject(input));
+    CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
     #else
     #ifdef _UCL_DEVICE_PTR_MAT
     _array=input+offset*sizeof(numtyp);
@@ -320,7 +324,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * \param stride Number of _elements_ between the start of each row **/
   template <class ptr_type>
   inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
@@ -330,7 +334,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container **/
+    *   allocating container when using CUDA APIs **/
   template <class ptr_type>
   inline void view_offset(const size_t offset, ptr_type input,
                           const size_t cols, UCL_Device &dev)
@@ -338,7 +342,7 @@ class UCL_D_Vec : public UCL_BaseMat {
 
   /// Free memory and set size to 0
   inline void clear()
-    { if (_kind!=UCL_VIEW) { _cols=0; _kind=UCL_VIEW; _device_free(*this); } }
+    { _device_free(*this); _cols=0; _kind=UCL_VIEW; }
 
   /// Resize the allocation to contain cols elements
   /** \note Cannot be used on views **/
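
With the new clear() above, ownership checks move out of the container: the
free routine is always called, and it is left to _device_free (backed by the
OpenCL release calls that pair with the retains added earlier) to do the right
thing for owned buffers and for views. A lifetime sketch (illustrative names):

    UCL_D_Vec<float> v(512,dev);
    v.clear();          // frees the buffer (or releases a view's reference),
                        // then leaves the container as an empty view
    v.alloc(1024,dev);  // the container can be reused after clear()
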
@@ -373,11 +377,15 @@ class UCL_D_Vec : public UCL_BaseMat {
   inline int resize_ib(const int cols)
     { if (cols>_cols) return resize(cols); else return UCL_SUCCESS; }
 
-  /// Set each element to zero
-  inline void zero() { _device_zero(*this,row_bytes()); }
-
-  /// Set first n elements to zero
-  inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); }
+  /// Set each element to zero asynchronously in the default command_queue
+  inline void zero() { zero(_cq); }
+  /// Set first n elements to zero asynchronously in the default command_queue
+  inline void zero(const int n) { zero(n,_cq); }
+  /// Set each element to zero asynchronously
+  inline void zero(command_queue &cq) { _device_zero(*this,row_bytes(),cq); }
+  /// Set first n elements to zero asynchronously
+  inline void zero(const int n, command_queue &cq)
+    { _device_zero(*this,n*sizeof(numtyp),cq); }
 
   #ifdef _UCL_DEVICE_PTR_MAT
   /// For OpenCL, returns a (void *) device pointer to memory allocation
@@ -465,7 +473,6 @@ class UCL_D_Vec : public UCL_BaseMat {
 
  private:
   size_t _row_bytes, _row_size, _rows, _cols;
-  enum UCL_MEMOPT _kind;
 
   #ifdef _UCL_DEVICE_PTR_MAT
   device_ptr _array;

@@ -39,33 +39,35 @@ class UCL_H_Mat : public UCL_BaseMat {
   };
   typedef numtyp data_type;
 
-  UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) {
+  UCL_H_Mat() : _cols(0) {
     #ifdef _OCL_MAT
     _carray=(cl_mem)(0);
     #endif
   }
-  ~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
+  ~UCL_H_Mat() { _host_free(*this); }
 
   /// Construct with specied number of rows and columns
   /** \sa alloc() **/
   UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device,
-            const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED)
-    { _rows=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); }
+            const enum UCL_MEMOPT kind=UCL_READ_WRITE)
+    { _cols=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); }
 
   /// Set up host matrix with specied # of rows/cols and reserve memory
   /** The kind parameter controls memory pinning as follows:
-    * - UCL_NOT_PINNED - Memory is not pinned
-    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
-    * - UCL_RW_OPTIMIZED - Memory can be pinned
+    * - UCL_READ_WRITE - Specify that you will read and write from host
+    * - UCL_WRITE_ONLY - Specify that you will only write from host
+    * - UCL_READ_ONLY - Specify that you will only read from host
+    * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
     * \param cq Default command queue for operations copied from another mat
     * \return UCL_SUCCESS if the memory allocation is successful **/
   template <class mat_type>
   inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
-                   const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
+                   const enum UCL_MEMOPT kind=UCL_READ_WRITE,
+                   const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) {
     clear();
 
     _row_bytes=cols*sizeof(numtyp);
-    int err=_host_alloc(*this,cq,_row_bytes*rows,kind);
+    int err=_host_alloc(*this,cq,_row_bytes*rows,kind,kind2);
     if (err!=UCL_SUCCESS) {
       #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
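
The host containers switch from pinning-oriented kinds (UCL_RW_OPTIMIZED,
UCL_WRITE_OPTIMIZED) to the access-intent kinds already used on the device
side, plus an optional second kind with a UCL_NOT_SPECIFIED default. A sketch
of the updated calls (the role of kind2 beyond its default is not shown in
these hunks, so the last line is only illustrative):

    UCL_H_Mat<double> h;
    h.alloc(16,256,dev,UCL_READ_WRITE);   // host reads and writes
                                          // (replaces UCL_RW_OPTIMIZED)
    h.alloc(16,256,dev,UCL_WRITE_ONLY);   // host only writes
                                          // (replaces UCL_WRITE_OPTIMIZED)
    h.alloc(16,256,dev,UCL_READ_WRITE,UCL_NOT_PINNED);  // assumed: kind2 as a
                                          // secondary hint, e.g. no pinning
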
@@ -86,17 +88,19 @@ class UCL_H_Mat : public UCL_BaseMat {
 
   /// Set up host matrix with specied # of rows/cols and reserve memory
   /** The kind parameter controls memory pinning as follows:
-    * - UCL_NOT_PINNED - Memory is not pinned
-    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
-    * - UCL_RW_OPTIMIZED - Memory can be pinned
+    * - UCL_READ_WRITE - Specify that you will read and write from host
+    * - UCL_WRITE_ONLY - Specify that you will only write from host
+    * - UCL_READ_ONLY - Specify that you will only read from host
+    * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
     * \param device Used to get the default command queue for operations
     * \return UCL_SUCCESS if the memory allocation is successful **/
   inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
-                   const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
+                   const enum UCL_MEMOPT kind=UCL_READ_WRITE,
+                   const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) {
     clear();
 
     _row_bytes=cols*sizeof(numtyp);
-    int err=_host_alloc(*this,device,_row_bytes*rows,kind);
+    int err=_host_alloc(*this,device,_row_bytes*rows,kind,kind2);
     if (err!=UCL_SUCCESS) {
       #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
@@ -115,15 +119,11 @@ class UCL_H_Mat : public UCL_BaseMat {
     return err;
   }
 
-  /// Return the type of memory allocation
-  /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
-  inline enum UCL_MEMOPT kind() const { return _kind; }
-
   /// Do not allocate memory, instead use an existing allocation from Geryon
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device container on the host is not supported
     * \param stride Number of _elements_ between the start of each row **/
   template <class ucl_type>
@@ -140,6 +140,8 @@ class UCL_H_Mat : public UCL_BaseMat {
     _end=_array+_cols;
     #ifdef _OCL_MAT
     _carray=input.cbegin();
+    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
+    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
     #endif
   }
 
@@ -147,7 +149,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device container on the host is not supported **/
   template <class ucl_type>
   inline void view(ucl_type &input, const size_t rows, const size_t cols)
@@ -157,7 +159,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view
     * - Viewing a device container on the host is not supported **/
@@ -169,9 +171,9 @@ class UCL_H_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
-    *   will be used for view
+    *   will be used for view when using CUDA APIs
     * - Viewing a device container on the host is not supported **/
   template <class ucl_type>
   inline void view(ucl_type &input)
@@ -180,7 +182,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported
     * \param stride Number of _elements_ between the start of each row **/
   template <class ptr_type>
@@ -197,14 +199,14 @@ class UCL_H_Mat : public UCL_BaseMat {
     _end=_array+_cols;
 
     #ifdef _OCL_MAT
-    _host_alloc(*this,dev,_row_bytes,UCL_VIEW);
+    _host_view(*this,dev,_row_bytes*rows);
     #endif
   }
 
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported **/
   template <class ptr_type>
   inline void view(ptr_type *input, const size_t rows, const size_t cols,
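
Host-pointer views also stop abusing the allocator: instead of calling
_host_alloc with a UCL_VIEW flag, they now go through a dedicated _host_view
routine, and the size passed covers the whole viewed region. A sketch of the
resulting ownership contract (the buffer name is hypothetical):

    double *buf=new double[1024];   // caller-owned host buffer
    UCL_H_Mat<double> hv;
    hv.view(buf,4,256,dev);         // registered via _host_view(), no allocation
    hv.clear();                     // drops the view bookkeeping; buf untouched
    delete [] buf;                  // the caller still owns and frees buf
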
@@ -213,7 +215,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported **/
   template <class ptr_type>
   inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
@@ -223,7 +225,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device container on the host is not supported
     * \param stride Number of _elements_ between the start of each row **/
   template <class ucl_type>
@@ -239,7 +241,7 @@ class UCL_H_Mat : public UCL_BaseMat {
     _array=input.begin()+offset;
     _end=_array+_cols;
     #ifdef _OCL_MAT
-    _host_alloc(*this,input,_row_bytes,UCL_VIEW);
+    _host_view(*this,input,_row_bytes*_rows);
     #endif
   }
 
@@ -247,7 +249,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device container on the host is not supported **/
   template <class ucl_type>
   inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
@@ -258,7 +260,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view
     * - Viewing a device container on the host is not supported **/
@@ -270,7 +272,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view
     * - Viewing a device container on the host is not supported **/
@@ -296,7 +298,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported
     * \param stride Number of _elements_ between the start of each row **/
   template <class ptr_type>
@@ -307,7 +309,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported **/
   template <class ptr_type>
   inline void view_offset(const size_t offset, ptr_type *input,
@@ -316,7 +318,7 @@ class UCL_H_Mat : public UCL_BaseMat {
 
   /// Free memory and set size to 0
   inline void clear()
-    { if (_kind!=UCL_VIEW) {_rows=0; _kind=UCL_VIEW; _host_free(*this,_kind); }}
+    { _host_free(*this); _cols=0; _kind=UCL_VIEW; }
 
   /// Resize the allocation to rows x cols elements
   /** \note Cannot be used on views **/
@@ -409,7 +411,6 @@ class UCL_H_Mat : public UCL_BaseMat {
   #endif
 
  private:
-  enum UCL_MEMOPT _kind;
   numtyp *_array, *_end;
   size_t _row_bytes, _rows, _cols;
 

@@ -39,33 +39,35 @@ class UCL_H_Vec : public UCL_BaseMat {
   };
   typedef numtyp data_type;
 
-  UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) {
+  UCL_H_Vec() : _cols(0) {
     #ifdef _OCL_MAT
     _carray=(cl_mem)(0);
     #endif
   }
-  ~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
+  ~UCL_H_Vec() { _host_free(*this); }
 
   /// Construct with n columns
   /** \sa alloc() **/
   UCL_H_Vec(const size_t n, UCL_Device &device,
-            const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED)
+            const enum UCL_MEMOPT kind=UCL_READ_WRITE)
     { _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); }
 
   /// Set up host vector with 'cols' columns and reserve memory
   /** The kind parameter controls memory pinning as follows:
-    * - UCL_NOT_PINNED - Memory is not pinned
-    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
-    * - UCL_RW_OPTIMIZED - Memory can be pinned
+    * - UCL_READ_WRITE - Specify that you will read and write from host
+    * - UCL_WRITE_ONLY - Specify that you will only write from host
+    * - UCL_READ_ONLY - Specify that you will only read from host
+    * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
     * \param cq Default command queue for operations copied from another mat
     * \return UCL_SUCCESS if the memory allocation is successful **/
   template <class mat_type>
   inline int alloc(const size_t cols, mat_type &cq,
-                   const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
+                   const enum UCL_MEMOPT kind=UCL_READ_WRITE,
+                   const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) {
     clear();
 
     _row_bytes=cols*sizeof(numtyp);
-    int err=_host_alloc(*this,cq,_row_bytes,kind);
+    int err=_host_alloc(*this,cq,_row_bytes,kind,kind2);
 
     if (err!=UCL_SUCCESS) {
       #ifndef UCL_NO_EXIT
@@ -86,17 +88,19 @@ class UCL_H_Vec : public UCL_BaseMat {
 
   /// Set up host vector with 'cols' columns and reserve memory
   /** The kind parameter controls memory pinning as follows:
-    * - UCL_NOT_PINNED - Memory is not pinned
-    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
-    * - UCL_RW_OPTIMIZED - Memory can be pinned
+    * - UCL_READ_WRITE - Specify that you will read and write from host
+    * - UCL_WRITE_ONLY - Specify that you will only write from host
+    * - UCL_READ_ONLY - Specify that you will only read from host
+    * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
     * \param device Used to get the default command queue for operations
     * \return UCL_SUCCESS if the memory allocation is successful **/
   inline int alloc(const size_t cols, UCL_Device &device,
-                   const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
+                   const enum UCL_MEMOPT kind=UCL_READ_WRITE,
+                   const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) {
     clear();
 
     _row_bytes=cols*sizeof(numtyp);
-    int err=_host_alloc(*this,device,_row_bytes,kind);
+    int err=_host_alloc(*this,device,_row_bytes,kind,kind2);
 
     if (err!=UCL_SUCCESS) {
       #ifndef UCL_NO_EXIT
@@ -115,15 +119,11 @@ class UCL_H_Vec : public UCL_BaseMat {
     return err;
   }
 
-  /// Return the type of memory allocation
-  /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
-  inline enum UCL_MEMOPT kind() const { return _kind; }
-
   /// Do not allocate memory, instead use an existing allocation from Geryon
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device container on the host is not supported **/
   template <class ucl_type>
   inline void view(ucl_type &input, const size_t rows, const size_t cols) {
@@ -139,6 +139,8 @@ class UCL_H_Vec : public UCL_BaseMat {
     _end=_array+_cols;
     #ifdef _OCL_MAT
     _carray=input.cbegin();
+    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
+    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
     #endif
   }
 
@@ -146,7 +148,7 @@ class UCL_H_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device container on the host is not supported
     * \param stride Number of _elements_ between the start of each row **/
   template <class ucl_type>
@@ -157,7 +159,7 @@ class UCL_H_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view
     * - Viewing a device container on the host is not supported **/
@@ -180,7 +182,7 @@ class UCL_H_Vec : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported **/
   template <class ptr_type>
   inline void view(ptr_type *input, const size_t rows, const size_t cols,
@@ -197,14 +199,14 @@ class UCL_H_Vec : public UCL_BaseMat {
     _end=_array+_cols;
 
     #ifdef _OCL_MAT
-    _host_alloc(*this,dev,_row_bytes,UCL_VIEW);
+    _host_view(*this,dev,_row_bytes);
     #endif
   }
 
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
|
||||||
* - Viewing a device pointer on the host is not supported
|
* - Viewing a device pointer on the host is not supported
|
||||||
* \param stride Number of _elements_ between the start of each row **/
|
* \param stride Number of _elements_ between the start of each row **/
|
||||||
template <class ptr_type>
|
template <class ptr_type>
|
||||||
@ -215,7 +217,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||||||
/// Do not allocate memory, instead use an existing allocation
|
/// Do not allocate memory, instead use an existing allocation
|
||||||
/** - No memory is freed when the object is destructed.
|
/** - No memory is freed when the object is destructed.
|
||||||
* - The view does not prevent the memory from being freed by the
|
* - The view does not prevent the memory from being freed by the
|
||||||
* allocating container
|
* allocating container when using CUDA APIs
|
||||||
* - Viewing a device pointer on the host is not supported **/
|
* - Viewing a device pointer on the host is not supported **/
|
||||||
template <class ptr_type>
|
template <class ptr_type>
|
||||||
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
|
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
|
||||||
@ -225,7 +227,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||||||
/** This function must be passed a Geryon vector or matrix container.
|
/** This function must be passed a Geryon vector or matrix container.
|
||||||
* No memory is freed when the object is destructed.
|
* No memory is freed when the object is destructed.
|
||||||
* - The view does not prevent the memory from being freed by the
|
* - The view does not prevent the memory from being freed by the
|
||||||
* allocating container
|
* allocating container when using CUDA APIs
|
||||||
* - Viewing a device container on the host is not supported **/
|
* - Viewing a device container on the host is not supported **/
|
||||||
template <class ucl_type>
|
template <class ucl_type>
|
||||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
|
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
|
||||||
@ -241,7 +243,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||||||
_array=input.begin()+offset;
|
_array=input.begin()+offset;
|
||||||
_end=_array+_cols;
|
_end=_array+_cols;
|
||||||
#ifdef _OCL_MAT
|
#ifdef _OCL_MAT
|
||||||
_host_alloc(*this,input,_row_bytes,UCL_VIEW);
|
_host_view(*this,input,_row_bytes);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -249,7 +251,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||||||
/** This function must be passed a Geryon vector or matrix container.
|
/** This function must be passed a Geryon vector or matrix container.
|
||||||
* No memory is freed when the object is destructed.
|
* No memory is freed when the object is destructed.
|
||||||
* - The view does not prevent the memory from being freed by the
|
* - The view does not prevent the memory from being freed by the
|
||||||
* allocating container
|
* allocating container when using CUDA APIs
|
||||||
* - Viewing a device container on the host is not supported
|
* - Viewing a device container on the host is not supported
|
||||||
* \param stride Number of _elements_ between the start of each row **/
|
* \param stride Number of _elements_ between the start of each row **/
|
||||||
template <class ucl_type>
|
template <class ucl_type>
|
||||||
@ -261,7 +263,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||||||
/** This function must be passed a Geryon vector or matrix container.
|
/** This function must be passed a Geryon vector or matrix container.
|
||||||
* No memory is freed when the object is destructed.
|
* No memory is freed when the object is destructed.
|
||||||
* - The view does not prevent the memory from being freed by the
|
* - The view does not prevent the memory from being freed by the
|
||||||
* allocating container
|
* allocating container when using CUDA APIs
|
||||||
* - If a matrix is used a input, all elements (including padding)
|
* - If a matrix is used a input, all elements (including padding)
|
||||||
* will be used for view
|
* will be used for view
|
||||||
* - Viewing a device container on the host is not supported **/
|
* - Viewing a device container on the host is not supported **/
|
||||||
@ -273,7 +275,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||||||
/** This function must be passed a Geryon vector or matrix container.
|
/** This function must be passed a Geryon vector or matrix container.
|
||||||
* No memory is freed when the object is destructed.
|
* No memory is freed when the object is destructed.
|
||||||
* - The view does not prevent the memory from being freed by the
|
* - The view does not prevent the memory from being freed by the
|
||||||
* allocating container
|
* allocating container when using CUDA APIs
|
||||||
* - If a matrix is used a input, all elements (including padding)
|
* - If a matrix is used a input, all elements (including padding)
|
||||||
* will be used for view
|
* will be used for view
|
||||||
* - Viewing a device container on the host is not supported **/
|
* - Viewing a device container on the host is not supported **/
|
||||||
@ -284,7 +286,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||||||
/// Do not allocate memory, instead use an existing allocation
|
/// Do not allocate memory, instead use an existing allocation
|
||||||
/** - No memory is freed when the object is destructed.
|
/** - No memory is freed when the object is destructed.
|
||||||
* - The view does not prevent the memory from being freed by the
|
* - The view does not prevent the memory from being freed by the
|
||||||
* allocating container
|
* allocating container when using CUDA APIs
|
||||||
* - Viewing a device pointer on the host is not supported **/
|
* - Viewing a device pointer on the host is not supported **/
|
||||||
template <class ptr_type>
|
template <class ptr_type>
|
||||||
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
|
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
|
||||||
@ -294,7 +296,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||||||
/// Do not allocate memory, instead use an existing allocation
|
/// Do not allocate memory, instead use an existing allocation
|
||||||
/** - No memory is freed when the object is destructed.
|
/** - No memory is freed when the object is destructed.
|
||||||
* - The view does not prevent the memory from being freed by the
|
* - The view does not prevent the memory from being freed by the
|
||||||
* allocating container
|
* allocating container when using CUDA APIs
|
||||||
* - Viewing a device pointer on the host is not supported
|
* - Viewing a device pointer on the host is not supported
|
||||||
* \param stride Number of _elements_ between the start of each row **/
|
* \param stride Number of _elements_ between the start of each row **/
|
||||||
template <class ptr_type>
|
template <class ptr_type>
|
||||||
@ -305,7 +307,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||||||
/// Do not allocate memory, instead use an existing allocation
|
/// Do not allocate memory, instead use an existing allocation
|
||||||
/** - No memory is freed when the object is destructed.
|
/** - No memory is freed when the object is destructed.
|
||||||
* - The view does not prevent the memory from being freed by the
|
* - The view does not prevent the memory from being freed by the
|
||||||
* allocating container
|
* allocating container when using CUDA APIs
|
||||||
* - Viewing a device pointer on the host is not supported **/
|
* - Viewing a device pointer on the host is not supported **/
|
||||||
template <class ptr_type>
|
template <class ptr_type>
|
||||||
inline void view_offset(const size_t offset, ptr_type *input,
|
inline void view_offset(const size_t offset, ptr_type *input,
|
||||||
@ -314,7 +316,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||||||
|
|
||||||
/// Free memory and set size to 0
|
/// Free memory and set size to 0
|
||||||
inline void clear()
|
inline void clear()
|
||||||
{ if (_kind!=UCL_VIEW) {_kind=UCL_VIEW; _cols=0; _host_free(*this,_kind);}}
|
{ _host_free(*this); _kind=UCL_VIEW; _cols=0; }
|
||||||
|
|
||||||
/// Resize the allocation to contain cols elements
|
/// Resize the allocation to contain cols elements
|
||||||
/** \note Cannot be used on views **/
|
/** \note Cannot be used on views **/
|
||||||
@ -401,7 +403,6 @@ class UCL_H_Vec : public UCL_BaseMat {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
private:
|
private:
|
||||||
enum UCL_MEMOPT _kind;
|
|
||||||
numtyp *_array, *_end;
|
numtyp *_array, *_end;
|
||||||
size_t _row_bytes, _cols;
|
size_t _row_bytes, _cols;
|
||||||
|
|
||||||
|
|||||||
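A minimal usage sketch of the two-hint alloc() introduced above, assuming a
UCL_Device named gpu (the device object, sizes, and variable names here are
illustrative, not part of this commit):

    UCL_Device gpu;                 // Geryon device wrapper
    UCL_H_Vec<float> positions;
    // Host writes, kernels read; on accelerators that share physical
    // memory with the host, the second hint lets the device side alias
    // this buffer instead of allocating a separate copy.
    int err=positions.alloc(1024,gpu,UCL_WRITE_ONLY,UCL_READ_ONLY);
    if (err!=UCL_SUCCESS) { /* handle allocation failure */ }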
@@ -48,17 +48,18 @@ class UCL_Matrix {
   /// Construct with specied number of rows and columns
   /** \sa alloc() **/
   UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc,
-             const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+             const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
              const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
     { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
         alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
 
   /// Set up host matrix with specied # of rows/cols and reserve memory
-  /** The kind1 parameter controls memory pinning as follows:
-    * - UCL_NOT_PINNED - Memory is not pinned
-    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
-    * - UCL_RW_OPTIMIZED - Memory can be pinned
-    * The kind2 parameter controls memory optimizations as follows:
+  /** The kind1 parameter controls memory access from the host
+    * - UCL_READ_WRITE - Specify that you will read and write from host
+    * - UCL_WRITE_ONLY - Specify that you will only write from host
+    * - UCL_READ_ONLY - Specify that you will only read from host
+    * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
+    * The kind2 parameter controls memory optimizations from the device:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY - Specify that you will only read in kernels
@@ -69,24 +70,25 @@ class UCL_Matrix {
    * \return UCL_SUCCESS if the memory allocation is successful **/
   template <class mat_type>
   inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
-                   const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+                   const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
                    const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
     { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
         alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); }
 
   /// Set up host matrix with specied # of rows/cols and reserve memory
-  /** The kind1 parameter controls memory pinning as follows:
-    * - UCL_NOT_PINNED - Memory is not pinned
-    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
-    * - UCL_RW_OPTIMIZED - Memory can be pinned
-    * The kind2 parameter controls memory optimizations as follows:
+  /** The kind1 parameter controls memory access from the host
+    * - UCL_READ_WRITE - Specify that you will read and write from host
+    * - UCL_WRITE_ONLY - Specify that you will only write from host
+    * - UCL_READ_ONLY - Specify that you will only read from host
+    * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
+    * The kind2 parameter controls memory optimizations from the device:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY - Specify that you will only read in kernels
    * \param device Used to get the default command queue for operations
    * \return UCL_SUCCESS if the memory allocation is successful **/
   inline int alloc(const size_t rows, const size_t cols, UCL_Device &acc,
-                   const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+                   const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
                    const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
     { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
         alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
@@ -110,11 +112,22 @@ class UCL_Matrix {
   { if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols);
     else return UCL_SUCCESS; }
 
-  /// Set each element to zero
-  inline void zero() { host.zero(); device.zero(); }
-  /// Set first n elements to zero
-  inline void zero(const int n) { host.zero(n); device.zero(n); }
+  /// Set each element to zero (asynchronously on device)
+  inline void zero() { zero(cq()); }
+  /// Set first n elements to zero (asynchronously on device)
+  inline void zero(const int n) { zero(n,cq()); }
+  /// Set each element to zero (asynchronously on device)
+  inline void zero(command_queue &cq) {
+    host.zero();
+    if (device.kind()!=UCL_VIEW) device.zero(cq);
+    else if (_buffer.numel()>0) _buffer.zero();
+  }
+  /// Set first n elements to zero (asynchronously on device)
+  inline void zero(const int n, command_queue &cq) {
+    host.zero(n);
+    if (device.kind()!=UCL_VIEW) device.zero(n,cq);
+    else if (_buffer.numel()>0) _buffer.zero();
+  }
 
   /// Get the number of elements
   inline size_t numel() const { return host.numel(); }
@@ -145,6 +158,8 @@ class UCL_Matrix {
 
   /// Return the default command queue/stream associated with this data
   inline command_queue & cq() { return host.cq(); }
+  /// Change the default command queue associated with this data
+  inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); }
   /// Block until command_queue associated with matrix is complete
   inline void sync() { host.sync(); }
 
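The zero() overloads above now route through a command queue and skip the
device-side clear when the device matrix is only a view of host memory. A
sketch of the intended call pattern (the matrix sizes and device object are
hypothetical):

    UCL_Matrix<double,double> m(nrows,ncols,gpu);
    m.zero();          // clears host; enqueues the device clear on m.cq()
    m.cq(gpu.cq());    // retarget the default queue with the new setter
    m.sync();          // block until the queue has drained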
@@ -32,14 +32,24 @@ template <> struct _ucl_s_obj_help<1> {
                           const enum UCL_MEMOPT kind1,
                           const enum UCL_MEMOPT kind2) {
     int e1;
-    e1=host.alloc(cols,acc,kind1);
-    if (e1!=UCL_SUCCESS)
-      return e1;
     if (acc.shared_memory()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 1S\n";
+      #endif
+      e1=host.alloc(cols,acc,kind1,kind2);
+      if (e1!=UCL_SUCCESS)
+        return e1;
       device.view(host);
       return UCL_SUCCESS;
-    } else
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 1NS\n";
+      #endif
+      e1=host.alloc(cols,acc,kind1);
+      if (e1!=UCL_SUCCESS)
+        return e1;
       return device.alloc(cols,acc,kind2);
+    }
   }
 
   template <class t1, class t2, class t3, class mat_type>
@@ -48,10 +58,24 @@ template <> struct _ucl_s_obj_help<1> {
                           const enum UCL_MEMOPT kind1,
                           const enum UCL_MEMOPT kind2) {
     int e1;
-    e1=host.alloc(cols,cq,kind1);
-    if (e1!=UCL_SUCCESS)
-      return e1;
-    return device.alloc(cols,cq,kind2);
+    if (cq.shared_mem_device()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 2S\n";
+      #endif
+      e1=host.alloc(cols,cq,kind1,kind2);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      device.view(host);
+      return UCL_SUCCESS;
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 2NS\n";
+      #endif
+      e1=host.alloc(cols,cq,kind1);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      return device.alloc(cols,cq,kind2);
+    }
   }
 
   template <class t1, class t2, class t3>
@@ -60,14 +84,24 @@ template <> struct _ucl_s_obj_help<1> {
                           const enum UCL_MEMOPT kind1,
                           const enum UCL_MEMOPT kind2) {
     int e1;
-    e1=host.alloc(rows,cols,acc,kind1);
-    if (e1!=UCL_SUCCESS)
-      return e1;
     if (acc.shared_memory()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 3S\n";
+      #endif
+      e1=host.alloc(rows,cols,acc,kind1,kind2);
+      if (e1!=UCL_SUCCESS)
+        return e1;
       device.view(host);
       return UCL_SUCCESS;
-    } else
+    } else {
+      e1=host.alloc(rows,cols,acc,kind1);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 3NS\n";
+      #endif
       return device.alloc(rows,cols,acc,kind2);
+    }
   }
 
   template <class t1, class t2, class t3, class mat_type>
@@ -76,10 +110,24 @@ template <> struct _ucl_s_obj_help<1> {
                           const enum UCL_MEMOPT kind1,
                           const enum UCL_MEMOPT kind2) {
     int e1;
-    e1=host.alloc(rows,cols,cq,kind1);
-    if (e1!=UCL_SUCCESS)
-      return e1;
-    return device.alloc(rows,cols,cq,kind2);
+    if (cq.shared_mem_device()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 4S\n";
+      #endif
+      e1=host.alloc(rows,cols,cq,kind1,kind2);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      device.view(host);
+      return UCL_SUCCESS;
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 4NS\n";
+      #endif
+      e1=host.alloc(rows,cols,cq,kind1);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      return device.alloc(rows,cols,cq,kind2);
+    }
   }
 
   template <class t1, class t2, class t3>
@@ -121,8 +169,15 @@ template <> struct _ucl_s_obj_help<1> {
     if (device.kind()==UCL_VIEW) {
       device.view(host);
       return UCL_SUCCESS;
-    } else
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 5S\n";
+      #endif
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 5NS\n";
+      #endif
       return device.resize(cols);
+    }
   }
 
   template <class t1, class t2, class t3>
@@ -130,9 +185,16 @@ template <> struct _ucl_s_obj_help<1> {
                                const int cols) {
     if (device.kind()==UCL_VIEW) {
       device.view(host);
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 6S\n";
+      #endif
       return UCL_SUCCESS;
-    } else
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 6NS\n";
+      #endif
       return device.resize(rows,cols);
+    }
   }
 };
 
@@ -145,17 +207,27 @@ template <int st> struct _ucl_s_obj_help {
                           const enum UCL_MEMOPT kind2) {
     int e1;
     e1=host.alloc(cols,acc,UCL_NOT_PINNED);
-    if (e1!=UCL_SUCCESS)
-      return e1;
-    e1=_buffer.alloc(cols,acc,kind1);
     if (e1!=UCL_SUCCESS)
       return e1;
 
     if (acc.shared_memory()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 7S\n";
+      #endif
+      e1=_buffer.alloc(cols,acc,kind1,kind2);
+      if (e1!=UCL_SUCCESS)
+        return e1;
       device.view(_buffer);
       return UCL_SUCCESS;
-    } else
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 7NS\n";
+      #endif
+      e1=_buffer.alloc(cols,acc,kind1);
+      if (e1!=UCL_SUCCESS)
+        return e1;
       return device.alloc(cols,acc,kind2);
+    }
   }
 
   template <class t1, class t2, class t3, class mat_type>
@@ -167,10 +239,24 @@ template <int st> struct _ucl_s_obj_help {
     e1=host.alloc(cols,cq,UCL_NOT_PINNED);
     if (e1!=UCL_SUCCESS)
       return e1;
-    e1=_buffer.alloc(cols,cq,kind1);
-    if (e1!=UCL_SUCCESS)
-      return e1;
-    return device.alloc(cols,cq,kind2);
+    if (cq.shared_mem_device()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 8S\n";
+      #endif
+      e1=_buffer.alloc(cols,cq,kind1,kind2);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      device.view(_buffer);
+      return UCL_SUCCESS;
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 8NS\n";
+      #endif
+      e1=_buffer.alloc(cols,cq,kind1);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      return device.alloc(cols,cq,kind2);
+    }
   }
 
   template <class t1, class t2, class t3>
@@ -180,17 +266,27 @@ template <int st> struct _ucl_s_obj_help {
                           const enum UCL_MEMOPT kind2) {
     int e1;
     e1=host.alloc(rows,cols,acc,UCL_NOT_PINNED);
-    if (e1!=UCL_SUCCESS)
-      return e1;
-    e1=_buffer.alloc(rows,cols,acc,kind1);
     if (e1!=UCL_SUCCESS)
       return e1;
 
     if (acc.shared_memory()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 9S\n";
+      #endif
+      e1=_buffer.alloc(rows,cols,acc,kind1,kind2);
+      if (e1!=UCL_SUCCESS)
+        return e1;
       device.view(_buffer);
       return UCL_SUCCESS;
-    } else
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 9NS\n";
+      #endif
+      e1=_buffer.alloc(rows,cols,acc,kind1);
+      if (e1!=UCL_SUCCESS)
+        return e1;
       return device.alloc(rows,cols,acc,kind2);
+    }
   }
 
   template <class t1, class t2, class t3, class mat_type>
@@ -202,10 +298,24 @@ template <int st> struct _ucl_s_obj_help {
     e1=host.alloc(rows,cols,cq,UCL_NOT_PINNED);
     if (e1!=UCL_SUCCESS)
       return e1;
-    e1=_buffer.alloc(rows,cols,cq,kind1);
-    if (e1!=UCL_SUCCESS)
-      return e1;
-    return device.alloc(rows,cols,cq,kind2);
+    if (cq.shared_mem_device()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 10S\n";
+      #endif
+      e1=_buffer.alloc(rows,cols,cq,kind1,kind2);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      device.view(_buffer);
+      return UCL_SUCCESS;
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 10NS\n";
+      #endif
+      e1=_buffer.alloc(rows,cols,cq,kind1);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      return device.alloc(rows,cols,cq,kind2);
+    }
   }
 
   template <class t1, class t2, class t3>
@@ -250,9 +360,16 @@ template <int st> struct _ucl_s_obj_help {
 
     if (device.kind()==UCL_VIEW) {
       device.view(buff);
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 11S\n";
+      #endif
      return UCL_SUCCESS;
-    } else
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 11NS\n";
+      #endif
       return device.resize(cols);
+    }
   }
 
   template <class t1, class t2, class t3>
@@ -264,9 +381,17 @@ template <int st> struct _ucl_s_obj_help {
 
     if (device.kind()==UCL_VIEW) {
       device.view(buff);
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 12S\n";
+      #endif
       return UCL_SUCCESS;
-    } else
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 12NS\n";
+      #endif
       return device.resize(rows,cols);
+    }
   }
 
 };
 
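All twelve numbered UCL_ALLOC trace points above follow the same shape: one
branch for shared-memory (zero-copy) devices, one for discrete devices. A
condensed paraphrase of that dispatch, not verbatim from any single overload:

    int e1;
    if (acc.shared_memory()) {              // host and device share RAM
      e1=host.alloc(cols,acc,kind1,kind2);  // one pinned allocation
      if (e1!=UCL_SUCCESS) return e1;
      device.view(host);                    // device aliases the host buffer
      return UCL_SUCCESS;
    } else {
      e1=host.alloc(cols,acc,kind1);        // separate host buffer
      if (e1!=UCL_SUCCESS) return e1;
      return device.alloc(cols,acc,kind2);  // plus a discrete device buffer
    }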
@@ -92,10 +92,9 @@ enum UCL_MEMOPT {
   UCL_WRITE_ONLY,     ///< Allow any optimizations for memory that is write only
   UCL_READ_ONLY,      ///< Allow any optimizations for memory that is read only
   UCL_READ_WRITE,     ///< Allow read and write
-  UCL_WRITE_OPTIMIZED,///< Allow host memory to be pinned (write combined)
-  UCL_RW_OPTIMIZED,   ///< Allow host memory to be pinned
   UCL_NOT_PINNED,     ///< Host memory is not to be pinned
-  UCL_VIEW            ///< View of another memory allocation
+  UCL_VIEW,           ///< View of another memory allocation
+  UCL_NOT_SPECIFIED
 };
 
 enum UCL_DEVICE_TYPE {
 
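For callers migrating across this enum change, the mapping implied by the
edits in this commit is:

    // UCL_RW_OPTIMIZED    -> UCL_READ_WRITE   (host reads and writes)
    // UCL_WRITE_OPTIMIZED -> UCL_WRITE_ONLY   (host only writes)
    // UCL_NOT_PINNED      -> UCL_NOT_PINNED   (unchanged)
    // UCL_NOT_SPECIFIED is a new sentinel meaning "no separate device hint"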
@@ -48,17 +48,18 @@ class UCL_Vector {
   /// Construct with n columns
   /** \sa alloc() **/
   UCL_Vector(const size_t cols, UCL_Device &acc,
-             const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+             const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
              const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
     { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
         alloc(host,device,_buffer,cols,acc,kind1,kind2); }
 
   /// Set up the vector with 'cols' columns and reserve memory
-  /** The kind1 parameter controls memory pinning as follows:
-    * - UCL_NOT_PINNED - Memory is not pinned
-    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
-    * - UCL_RW_OPTIMIZED - Memory can be pinned
-    * The kind2 parameter controls memory optimizations as follows:
+  /** The kind1 parameter controls memory access from the host
+    * - UCL_READ_WRITE - Specify that you will read and write from host
+    * - UCL_WRITE_ONLY - Specify that you will only write from host
+    * - UCL_READ_ONLY - Specify that you will only read from host
+    * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
+    * The kind2 parameter controls memory optimizations from the device:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY - Specify that you will only read in kernels
@@ -69,24 +70,25 @@ class UCL_Vector {
    * \return UCL_SUCCESS if the memory allocation is successful **/
   template <class mat_type>
   inline int alloc(const size_t cols, mat_type &cq,
-                   const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+                   const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
                    const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
     { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
         alloc(host,device,_buffer,cols,cq,kind1,kind2); }
 
   /// Set up host vector with 'cols' columns and reserve memory
-  /** The kind1 parameter controls memory pinning as follows:
-    * - UCL_NOT_PINNED - Memory is not pinned
-    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
-    * - UCL_RW_OPTIMIZED - Memory can be pinned
-    * The kind2 parameter controls memory optimizations as follows:
+  /** The kind1 parameter controls memory access from the host
+    * - UCL_READ_WRITE - Specify that you will read and write from host
+    * - UCL_WRITE_ONLY - Specify that you will only write from host
+    * - UCL_READ_ONLY - Specify that you will only read from host
+    * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
+    * The kind2 parameter controls memory optimizations from the device:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY - Specify that you will only read in kernels
    * \param device Used to get the default command queue for operations
    * \return UCL_SUCCESS if the memory allocation is successful **/
   inline int alloc(const size_t cols, UCL_Device &acc,
-                   const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+                   const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
                    const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
     { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
         alloc(host,device,_buffer,cols,acc,kind1,kind2); }
@@ -109,11 +111,22 @@ class UCL_Vector {
   inline int resize_ib(const int new_cols)
     { if (new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; }
 
-  /// Set each element to zero
-  inline void zero() { host.zero(); device.zero(); }
-  /// Set first n elements to zero
-  inline void zero(const int n) { host.zero(n); device.zero(n); }
+  /// Set each element to zero (asynchronously on device)
+  inline void zero() { zero(cq()); }
+  /// Set first n elements to zero (asynchronously on device)
+  inline void zero(const int n) { zero(n,cq()); }
+  /// Set each element to zero (asynchronously on device)
+  inline void zero(command_queue &cq) {
+    host.zero();
+    if (device.kind()!=UCL_VIEW) device.zero(cq);
+    else if (_buffer.numel()>0) _buffer.zero();
+  }
+  /// Set first n elements to zero (asynchronously on device)
+  inline void zero(const int n, command_queue &cq) {
+    host.zero(n);
+    if (device.kind()!=UCL_VIEW) device.zero(n,cq);
+    else if (_buffer.numel()>0) _buffer.zero();
+  }
 
   /// Get the number of elements
   inline size_t numel() const { return host.numel(); }
@@ -145,6 +158,8 @@ class UCL_Vector {
 
   /// Return the default command queue/stream associated with this data
   inline command_queue & cq() { return host.cq(); }
+  /// Change the default command queue associated with this data
+  inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); }
   /// Block until command_queue associated with matrix is complete
   inline void sync() { host.sync(); }
 
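UCL_Vector mirrors the UCL_Matrix changes above; a short usage sketch with
hypothetical names (the copy helper shown is assumed from the rest of the
Geryon container interface, not introduced by this commit):

    UCL_Vector<float,float> v(n,gpu,UCL_READ_WRITE,UCL_READ_ONLY);
    v.zero(n);           // async device clear on the default queue
    v.update_device();   // assuming the usual Geryon host->device copy helper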
@@ -44,10 +44,10 @@ bool AnswerT::alloc(const int inum) {
     _ans_fields+=4;
 
   // --------------------------- Device allocations
-  success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_RW_OPTIMIZED,
-                                 UCL_WRITE_ONLY)==UCL_SUCCESS);
-  success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_RW_OPTIMIZED,
-                                  UCL_WRITE_ONLY)==UCL_SUCCESS);
+  success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY,
+                                 UCL_READ_WRITE)==UCL_SUCCESS);
+  success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_READ_ONLY,
+                                  UCL_READ_WRITE)==UCL_SUCCESS);
   _gpu_bytes=engv.device.row_bytes()+force.device.row_bytes();
 
   _allocated=true;
@@ -175,78 +175,42 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
     return 0.0;
 
   double evdwl=0.0;
-  double virial_acc[6];
-  for (int i=0; i<6; i++) virial_acc[i]=0.0;
-  if (_ilist==NULL) {
-    for (int i=0; i<_inum; i++) {
-      int al=i;
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=engv[al];
-          eatom[i]+=engv[al]*0.5;
-          al+=_inum;
-        } else {
-          evdwl+=engv[al];
-          al+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[i][j]+=engv[al]*0.5;
-            virial_acc[j]+=engv[al];
-            al+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial_acc[j]+=engv[al];
-            al+=_inum;
-          }
-        }
-      }
+  int vstart=0;
+  if (_eflag) {
+    for (int i=0; i<_inum; i++)
+      evdwl+=engv[i];
+    if (_ef_atom)
+      if (_ilist==NULL)
+        for (int i=0; i<_inum; i++)
+          eatom[i]+=engv[i];
+      else
+        for (int i=0; i<_inum; i++)
+          eatom[_ilist[i]]+=engv[i];
+    vstart=_inum;
+  }
+  if (_vflag) {
+    int iend=vstart+_inum;
+    for (int j=0; j<6; j++) {
+      for (int i=vstart; i<iend; i++)
+        virial[j]+=engv[i];
+      if (_vf_atom)
+        if (_ilist==NULL)
+          for (int i=vstart; i<iend; i++)
+            vatom[i][j]+=engv[i];
+        else
+          for (int i=vstart; i<iend; i++)
+            vatom[_ilist[i]][j]+=engv[i];
+      vstart+=_inum;
+      iend+=_inum;
     }
-    for (int j=0; j<6; j++)
-      virial[j]+=virial_acc[j]*0.5;
-  } else {
-    for (int i=0; i<_inum; i++) {
-      int al=i;
-      int ii=_ilist[i];
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=engv[al];
-          eatom[ii]+=engv[al]*0.5;
-          al+=_inum;
-        } else {
-          evdwl+=engv[al];
-          al+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[ii][j]+=engv[al]*0.5;
-            virial_acc[j]+=engv[al];
-            al+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial_acc[j]+=engv[al];
-            al+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]+=virial_acc[j]*0.5;
   }
 
-  evdwl*=0.5;
   return evdwl;
 }
 
 template <class numtyp, class acctyp>
 double AnswerT::energy_virial(double *eatom, double **vatom,
                               double *virial, double &ecoul) {
   if (_eflag==false && _vflag==false)
     return 0.0;
 
@@ -254,84 +218,43 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
     return energy_virial(eatom,vatom,virial);
 
   double evdwl=0.0;
-  double _ecoul=0.0;
-  double virial_acc[6];
-  for (int i=0; i<6; i++) virial_acc[i]=0.0;
-  if (_ilist==NULL) {
-    for (int i=0; i<_inum; i++) {
-      int al=i;
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=engv[al];
-          eatom[i]+=engv[al]*0.5;
-          al+=_inum;
-          _ecoul+=engv[al];
-          eatom[i]+=engv[al]*0.5;
-          al+=_inum;
-        } else {
-          evdwl+=engv[al];
-          al+=_inum;
-          _ecoul+=engv[al];
-          al+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[i][j]+=engv[al]*0.5;
-            virial_acc[j]+=engv[al];
-            al+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial_acc[j]+=engv[al];
-            al+=_inum;
-          }
-        }
-      }
+  int vstart=0, iend=_inum*2;
+  if (_eflag) {
+    for (int i=0; i<_inum; i++)
+      evdwl+=engv[i];
+    for (int i=_inum; i<iend; i++)
+      ecoul+=engv[i];
+    if (_ef_atom)
+      if (_ilist==NULL) {
+        for (int i=0; i<_inum; i++)
+          eatom[i]+=engv[i];
+        for (int i=_inum; i<iend; i++)
+          eatom[i]+=engv[i];
+      } else {
+        for (int i=0; i<_inum; i++)
+          eatom[_ilist[i]]+=engv[i];
+        for (int i=_inum; i<iend; i++)
+          eatom[_ilist[i]]+=engv[i];
      }
+    vstart=iend;
+    iend+=_inum;
+  }
+  if (_vflag) {
+    for (int j=0; j<6; j++) {
+      for (int i=vstart; i<iend; i++)
+        virial[j]+=engv[i];
+      if (_vf_atom)
+        if (_ilist==NULL)
+          for (int i=vstart; i<iend; i++)
+            vatom[i][j]+=engv[i];
+        else
+          for (int i=vstart; i<iend; i++)
+            vatom[_ilist[i]][j]+=engv[i];
+      vstart+=_inum;
+      iend+=_inum;
    }
-    for (int j=0; j<6; j++)
-      virial[j]+=virial_acc[j]*0.5;
-  } else {
-    for (int i=0; i<_inum; i++) {
-      int al=i;
-      int ii=_ilist[i];
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=engv[al];
-          eatom[ii]+=engv[al]*0.5;
-          al+=_inum;
-          _ecoul+=engv[al];
-          eatom[ii]+=engv[al]*0.5;
-          al+=_inum;
-        } else {
-          evdwl+=engv[al];
-          al+=_inum;
-          _ecoul+=engv[al];
-          al+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[ii][j]+=engv[al]*0.5;
-            virial_acc[j]+=engv[al];
-            al+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial_acc[j]+=engv[al];
-            al+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]+=virial_acc[j]*0.5;
   }
 
-  evdwl*=0.5;
-  ecoul+=_ecoul*0.5;
   return evdwl;
 }
 
@@ -373,4 +296,14 @@ void AnswerT::get_answers(double **f, double **tor) {
   }
 }
 
+template <class numtyp, class acctyp>
+void AnswerT::cq(const int cq_index) {
+  engv.cq(dev->cq(cq_index));
+  force.cq(dev->cq(cq_index));
+  time_answer.clear();
+  time_answer.init(*dev,dev->cq(cq_index));
+  time_answer.zero();
+}
+
 template class Answer<PRECISION,ACC_PRECISION>;
 
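The rewritten accumulation loops above rely on the engv blocks already being
scaled by one half on the device (see the *(acctyp)0.5 stores in the kernel
macros later in this diff), so the trailing evdwl*=0.5 and virial*0.5 passes
are dropped. A sketch of the host-side layout they assume, for the charge
variant (names illustrative):

    // engv after copy-back, in consecutive blocks of inum values each:
    //   block 0:      per-atom energy   (already *0.5)
    //   block 1:      per-atom e_coul   (charge version only)
    //   blocks 2..7:  the six virial components (already *0.5)
    double evdwl=0.0;
    for (int i=0; i<inum; i++) evdwl+=engv[i];   // no final *0.5 needed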
@@ -47,6 +47,8 @@ class Answer {
   inline int inum() const { return _inum; }
   /// Set number of local atoms for future copy operations
   inline void inum(const int n) { _inum=n; }
+  /// Return the maximum number of atoms that can be stored currently
+  inline int max_inum() const { return _max_local; }
 
   /// Memory usage per atom in this class
   int bytes_per_atom() const;
@@ -132,6 +134,9 @@ class Answer {
   /// Return the time the CPU was idle waiting for GPU
   inline double cpu_idle_time() { return _time_cpu_idle; }
 
+  /// Change the command queue used for copies and timers
+  void cq(const int cq_index);
+
   // ------------------------------ DATA ----------------------------------
 
   /// Force and possibly torque
 
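A hypothetical call site for the new Answer::cq(int), switching the copy and
timing queue after device setup (the object name is illustrative):

    answers.cq(1);   // retarget engv/force copy-back and the answer timer
                     // onto command queue 1 of the device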
@@ -70,44 +70,47 @@ bool AtomT::alloc(const int nall) {
 
   // --------------------------- Device allocations
   int gpu_bytes=0;
-  success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
+  success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
                               UCL_READ_ONLY)==UCL_SUCCESS);
   #ifdef GPU_CAST
-  success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)==
-                      UCL_SUCCESS);
-  success=success && (type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)==
-                      UCL_SUCCESS);
+  success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_WRITE_ONLY,
+                                   UCL_READ_ONLY)==UCL_SUCCESS);
+  success=success && (type_cast.alloc(_max_atoms,*dev,UCL_WRITE_ONLY,
+                                      UCL_READ_ONLY)==UCL_SUCCESS);
   gpu_bytes+=x_cast.device.row_bytes()+type_cast.device.row_bytes();
   #endif
 
   if (_charge && _host_view==false) {
-    success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
+    success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_ONLY,
                                 UCL_READ_ONLY)==UCL_SUCCESS);
     gpu_bytes+=q.device.row_bytes();
   }
   if (_rot && _host_view==false) {
-    success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
+    success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
                                    UCL_READ_ONLY)==UCL_SUCCESS);
     gpu_bytes+=quat.device.row_bytes();
   }
 
   if (_gpu_nbor>0) {
     if (_bonds) {
-      success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      success=success && (dev_tag.alloc(_max_atoms,*dev,
+                                        UCL_READ_ONLY)==UCL_SUCCESS);
       gpu_bytes+=dev_tag.row_bytes();
     }
     if (_gpu_nbor==1) {
       success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
       gpu_bytes+=dev_cell_id.row_bytes();
     } else {
-      success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      success=success && (host_particle_id.alloc(_max_atoms,*dev,
+                                                 UCL_WRITE_ONLY)==UCL_SUCCESS);
      success=success &&
        (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
     }
     if (_gpu_nbor==2 && _host_view)
       dev_particle_id.view(host_particle_id);
     else
-      success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      success=success && (dev_particle_id.alloc(_max_atoms,*dev,
+                                                UCL_READ_ONLY)==UCL_SUCCESS);
     gpu_bytes+=dev_particle_id.row_bytes();
   }
 
@@ -130,7 +133,7 @@ bool AtomT::add_fields(const bool charge, const bool rot,
     _charge=true;
     _other=true;
     if (_host_view==false) {
-      success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
+      success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_ONLY,
                                   UCL_READ_ONLY)==UCL_SUCCESS);
       gpu_bytes+=q.device.row_bytes();
     }
@@ -140,7 +143,7 @@ bool AtomT::add_fields(const bool charge, const bool rot,
     _rot=true;
     _other=true;
     if (_host_view==false) {
-      success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
+      success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
                                      UCL_READ_ONLY)==UCL_SUCCESS);
       gpu_bytes+=quat.device.row_bytes();
     }
@@ -149,7 +152,8 @@ bool AtomT::add_fields(const bool charge, const bool rot,
   if (bonds && _bonds==false) {
     _bonds=true;
     if (_bonds && _gpu_nbor>0) {
-      success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      success=success && (dev_tag.alloc(_max_atoms,*dev,
+                                        UCL_READ_ONLY)==UCL_SUCCESS);
       gpu_bytes+=dev_tag.row_bytes();
     }
   }
@@ -163,17 +167,20 @@ bool AtomT::add_fields(const bool charge, const bool rot,
       return false;
     }
     #endif
-    success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+    success=success && (dev_particle_id.alloc(_max_atoms,*dev,
+                                              UCL_READ_ONLY)==UCL_SUCCESS);
     gpu_bytes+=dev_particle_id.row_bytes();
     if (_bonds) {
-      success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      success=success && (dev_tag.alloc(_max_atoms,*dev,
+                                        UCL_READ_ONLY)==UCL_SUCCESS);
       gpu_bytes+=dev_tag.row_bytes();
     }
     if (_gpu_nbor==1) {
       success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
       gpu_bytes+=dev_cell_id.row_bytes();
     } else {
-      success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      success=success && (host_particle_id.alloc(_max_atoms,*dev,
+                                                 UCL_WRITE_ONLY)==UCL_SUCCESS);
      success=success &&
        (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
     }
 
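The pattern used throughout AtomT above pairs a host-access hint with a
kernel-access hint, matching the direction the data actually flows. A reduced
sketch of the two common cases from the hunks above:

    // Host fills positions each timestep, kernels only read them:
    x.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,UCL_READ_ONLY);
    // Particle ids are produced on the device and only read by kernels:
    dev_particle_id.alloc(_max_atoms,*dev,UCL_READ_ONLY);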
@ -77,12 +77,12 @@
   if (offset==0) { \
     engv+=ii; \
     if (eflag>0) { \
-      *engv=energy; \
+      *engv=energy*(acctyp)0.5; \
       engv+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]; \
+        *engv=virial[i]*(acctyp)0.5; \
         engv+=inum; \
       } \
     } \

@ -125,14 +125,14 @@
   if (offset==0) { \
     engv+=ii; \
     if (eflag>0) { \
-      *engv=energy; \
+      *engv=energy*(acctyp)0.5; \
       engv+=inum; \
-      *engv=e_coul; \
+      *engv=e_coul*(acctyp)0.5; \
       engv+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]; \
+        *engv=virial[i]*(acctyp)0.5; \
         engv+=inum; \
       } \
     } \

@ -160,12 +160,12 @@
   if (offset==0) { \
     engv+=ii; \
     if (eflag>0) { \
-      *engv=energy; \
+      *engv=energy*(acctyp)0.5; \
       engv+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]; \
+        *engv=virial[i]*(acctyp)0.5; \
         engv+=inum; \
       } \
     } \

@ -192,14 +192,14 @@
   if (offset==0) { \
     engv+=ii; \
     if (eflag>0) { \
-      *engv=energy; \
+      *engv=energy*(acctyp)0.5; \
       engv+=inum; \
-      *engv=e_coul; \
+      *engv=e_coul*(acctyp)0.5; \
       engv+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]; \
+        *engv=virial[i]*(acctyp)0.5; \
         engv+=inum; \
       } \
     } \
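All four macro hunks above fold a factor of one half into the stored per-atom energy, Coulomb energy, and virial. A plausible reading (the commit does not state one) is that each pair term is accumulated from both of its atoms, so halving at store time makes the reduced per-atom tallies sum to the total exactly once; the (acctyp) cast keeps the constant in the accumulator's precision in single-precision builds:

    /* OpenCL C: the convention the new stores imply */
    *engv = energy*(acctyp)0.5;     /* pair energy counted from both ends */
    *engv = virial[i]*(acctyp)0.5;  /* likewise for each virial component */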
@ -272,12 +272,8 @@ void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str,
     return;
 
   std::string s_fast=std::string(kname)+"_fast";
-  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
-                    std::string(OCL_PRECISION_COMPILE)+" -D"+
-                    std::string(OCL_VENDOR);
-
   pair_program=new UCL_Program(dev);
-  pair_program->load_string(pair_str,flags.c_str());
+  pair_program->load_string(pair_str,device->compile_string().c_str());
   k_pair_fast.set_function(*pair_program,s_fast.c_str());
   k_pair.set_function(*pair_program,kname);
   pos_tex.get_texture(*pair_program,"pos_tex");

@ -288,12 +288,8 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
     return;
 
   std::string s_fast=std::string(kname)+"_fast";
-  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
-                    std::string(OCL_PRECISION_COMPILE)+" -D"+
-                    std::string(OCL_VENDOR);
-
   pair_program=new UCL_Program(dev);
-  pair_program->load_string(pair_str,flags.c_str());
+  pair_program->load_string(pair_str,device->compile_string().c_str());
   k_pair_fast.set_function(*pair_program,s_fast.c_str());
   k_pair.set_function(*pair_program,kname);
   pos_tex.get_texture(*pair_program,"pos_tex");

@ -296,12 +296,8 @@ void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str,
     return;
 
   std::string s_fast=std::string(kname)+"_fast";
-  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
-                    std::string(OCL_PRECISION_COMPILE)+" -D"+
-                    std::string(OCL_VENDOR);
-
   pair_program=new UCL_Program(dev);
-  pair_program->load_string(pair_str,flags.c_str());
+  pair_program->load_string(pair_str,device->compile_string().c_str());
   k_pair_fast.set_function(*pair_program,s_fast.c_str());
   k_pair.set_function(*pair_program,kname);
   pos_tex.get_texture(*pair_program,"pos_tex");

@ -455,9 +455,7 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
   std::string s_lj=kns+"_lj";
   std::string s_lj_fast=kns+"_lj_fast";
 
-  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
-                    std::string(OCL_PRECISION_COMPILE)+" -D"+
-                    std::string(OCL_VENDOR);
+  std::string flags=device->compile_string();
 
   nbor_program=new UCL_Program(dev);
   nbor_program->load_string(ellipsoid_nbor,flags.c_str());
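The four compile_kernels hunks above stop rebuilding the OpenCL option string from the compile-time OCL_PRECISION_COMPILE and OCL_VENDOR macros; the Device object now owns one compile string, assembled at init time by set_ocl_params further below. Every program build then reduces to:

    // Kernel builds reuse the flags the Device assembled once at init.
    pair_program=new UCL_Program(dev);
    pair_program->load_string(pair_str,device->compile_string().c_str());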
@ -69,7 +69,7 @@ int BornT::init(const int ntypes, double **host_cutsq,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
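The same one-line change, UCL_WRITE_OPTIMIZED to UCL_WRITE_ONLY, repeats in every pair-style init below (Born through LJ96): these buffers are filled on the host and only copied to the device, so write-only is the exact hint. The staging pattern as it appears in these files:

    // Host staging buffer: written on the host, then copied to the device.
    UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
                                 UCL_WRITE_ONLY);
    for (int i=0; i<lj_types*lj_types; i++)
      host_write[i]=0.0;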
@ -54,7 +54,8 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -148,7 +149,8 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
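The two-line kernel change above also repeats in every kernel of this commit: n_stride moves out of the private declaration into __local memory. A reasonable inference (no rationale is given in the commit) is that the stride nbor_info computes is uniform across the work-group, so a single local-memory copy can serve all work-items:

    /* OpenCL C: the changed declaration site */
    const __global int *nbor, *list_end;
    int i, numj;           /* private, per work-item     */
    __local int n_stride;  /* one shared copy per group  */
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);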
@ -73,7 +73,7 @@ int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rho
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

@ -67,7 +67,8 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -188,7 +189,8 @@ __kernel void k_born_long_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -73,7 +73,7 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

@ -70,7 +70,8 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -201,7 +202,8 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@ -68,7 +68,7 @@ int BuckT::init(const int ntypes, double **host_cutsq,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

@ -53,7 +53,8 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -145,7 +146,8 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -70,7 +70,7 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

@ -66,7 +66,8 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -182,7 +183,8 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -72,7 +72,7 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

@ -67,7 +67,8 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -190,7 +191,8 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@ -69,7 +69,7 @@ int CGCMMT::init(const int ntypes, double **host_cutsq,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(cmm_types*cmm_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<cmm_types*cmm_types; i++)
     host_write[i]=0.0;

@ -53,7 +53,8 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -150,7 +151,8 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -74,7 +74,7 @@ int CGCMMLongT::init(const int ntypes, double **host_cutsq,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

@ -66,7 +66,8 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -191,7 +192,8 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -76,7 +76,7 @@ int CHARMMLongT::init(const int ntypes,
   if (h_size<max_bio_shared_types)
     h_size=max_bio_shared_types;
   UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<h_size*32; i++)
     host_write[i]=0.0;
 
@ -67,7 +67,8 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -196,7 +197,8 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@ -73,7 +73,7 @@ int ColloidT::init(const int ntypes,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

@ -95,7 +95,7 @@ int ColloidT::init(const int ntypes,
                     host_sigma3,host_sigma6);
 
   UCL_H_Vec<int> dview_form(lj_types*lj_types,*(this->ucl_device),
-                            UCL_WRITE_OPTIMIZED);
+                            UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++) dview_form[i]=0;
 
   form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);

@ -56,7 +56,8 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -219,7 +220,8 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@ -73,7 +73,7 @@ int CoulDSFT::init(const int ntypes, const int nlocal, const int nall,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

@ -63,7 +63,8 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -163,7 +164,8 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@ -68,7 +68,7 @@ int CoulLongT::init(const int nlocal, const int nall, const int max_nbors,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

@ -74,12 +74,12 @@ texture<int2> q_tex;
     if (eflag>0) { \
       *ap1=(acctyp)0; \
       ap1+=inum; \
-      *ap1=e_coul; \
+      *ap1=e_coul*(acctyp)0.5; \
       ap1+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *ap1=virial[i]; \
+        *ap1=virial[i]*(acctyp)0.5; \
         ap1+=inum; \
       } \
     } \

@ -109,12 +109,12 @@ texture<int2> q_tex;
     if (eflag>0) { \
       *ap1=(acctyp)0; \
       ap1+=inum; \
-      *ap1=e_coul; \
+      *ap1=e_coul*(acctyp)0.5; \
       ap1+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *ap1=virial[i]; \
+        *ap1=virial[i]*(acctyp)0.5; \
         ap1+=inum; \
       } \
     } \

@ -155,7 +155,8 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -244,7 +245,8 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@ -47,7 +47,8 @@ template <class numtyp, class acctyp>
 int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                          const int last_gpu, const int gpu_mode,
                          const double p_split, const int nthreads,
-                         const int t_per_atom, const double cell_size) {
+                         const int t_per_atom, const double cell_size,
+                         char *ocl_vendor) {
   _nthreads=nthreads;
 #ifdef _OPENMP
   omp_set_num_threads(nthreads);

@ -140,6 +141,9 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
 
   _long_range_precompute=0;
 
+  if (set_ocl_params(ocl_vendor)!=0)
+    return -11;
+
   int flag=0;
   for (int i=0; i<_procs_per_gpu; i++) {
     if (_gpu_rank==i)

@ -149,6 +153,64 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
   return flag;
 }
 
+template <class numtyp, class acctyp>
+int DeviceT::set_ocl_params(char *ocl_vendor) {
+  #ifdef USE_OPENCL
+  std::string s_vendor=OCL_DEFAULT_VENDOR;
+  if (ocl_vendor!=NULL)
+    s_vendor=ocl_vendor;
+  if (s_vendor=="none")
+    s_vendor="generic";
+
+  if (s_vendor=="kepler") {
+    _ocl_vendor_name="NVIDIA Kepler";
+    #if defined (__APPLE__) || defined(MACOSX)
+    _ocl_vendor_string="-DKEPLER_OCL -DNO_OCL_PTX";
+    #else
+    _ocl_vendor_string="-DKEPLER_OCL";
+    #endif
+  } else if (s_vendor=="fermi") {
+    _ocl_vendor_name="NVIDIA Fermi";
+    _ocl_vendor_string="-DFERMI_OCL";
+  } else if (s_vendor=="cypress") {
+    _ocl_vendor_name="AMD Cypress";
+    _ocl_vendor_string="-DCYPRESS_OCL";
+  } else if (s_vendor=="generic") {
+    _ocl_vendor_name="GENERIC";
+    _ocl_vendor_string="-DGENERIC_OCL";
+  } else {
+    _ocl_vendor_name="CUSTOM";
+    _ocl_vendor_string="-DUSE_OPENCL";
+    int token_count=0;
+    std::string params[13];
+    char *pch = strtok(ocl_vendor,"\" ");
+    while (pch != NULL) {
+      if (token_count==13)
+        return -11;
+      params[token_count]=pch;
+      token_count++;
+      pch = strtok(NULL,"\" ");
+    }
+    _ocl_vendor_string+=" -DMEM_THREADS="+params[0]+
+                        " -DTHREADS_PER_ATOM="+params[1]+
+                        " -DTHREADS_PER_CHARGE="+params[2]+
+                        " -DBLOCK_PAIR="+params[3]+
+                        " -DMAX_SHARED_TYPES="+params[4]+
+                        " -DBLOCK_NBOR_BUILD="+params[5]+
+                        " -DBLOCK_BIO_PAIR="+params[6]+
+                        " -DBLOCK_ELLIPSE="+params[7]+
+                        " -DWARP_SIZE="+params[8]+
+                        " -DPPPM_BLOCK_1D="+params[9]+
+                        " -DBLOCK_CELL_2D="+params[10]+
+                        " -DBLOCK_CELL_ID="+params[11]+
+                        " -DMAX_BIO_SHARED_TYPES="+params[12];
+  }
+  _ocl_compile_string="-cl-fast-relaxed-math -cl-mad-enable "+
+                      std::string(OCL_PRECISION_COMPILE)+" "+_ocl_vendor_string;
+  #endif
+  return 0;
+}
+
 template <class numtyp, class acctyp>
 int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
                   const bool rot, const int nlocal,
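set_ocl_params maps a vendor keyword (kepler, fermi, cypress, generic, or none) to the matching -D tuning define, and treats any other non-empty string as a custom parameter list of exactly 13 whitespace-separated values; a 14th token makes init_device fail with -11. A hypothetical custom string, with values invented here, in the order the parser consumes them:

    // 13 tokens: MEM_THREADS, THREADS_PER_ATOM, THREADS_PER_CHARGE,
    // BLOCK_PAIR, MAX_SHARED_TYPES, BLOCK_NBOR_BUILD, BLOCK_BIO_PAIR,
    // BLOCK_ELLIPSE, WARP_SIZE, PPPM_BLOCK_1D, BLOCK_CELL_2D,
    // BLOCK_CELL_ID, MAX_BIO_SHARED_TYPES
    char custom[]="16 4 8 128 8 128 64 128 32 64 8 128 128";
    // -> "-cl-fast-relaxed-math -cl-mad-enable <precision> -DUSE_OPENCL
    //     -DMEM_THREADS=16 ... -DMAX_BIO_SHARED_TYPES=128"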
@ -206,7 +268,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
   if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
                   *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
                   _block_cell_id, _block_nbor_build, threads_per_atom,
-                  _warp_size, _time_device))
+                  _warp_size, _time_device, compile_string()))
     return -3;
   if (_cell_size<0.0)
     nbor->cell_size(cell_size,cell_size);

@ -274,7 +336,8 @@ void DeviceT::init_message(FILE *screen, const char *name,
   fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
 #endif
 #ifdef USE_OPENCL
-  fprintf(screen,"- with OpenCL Parameters for: %s\n",OCL_VENDOR);
+  fprintf(screen,"- with OpenCL Parameters for: %s\n",
+          _ocl_vendor_name.c_str());
 #endif
   fprintf(screen,"-------------------------------------");
   fprintf(screen,"-------------------------------------\n");

@ -571,9 +634,8 @@ int DeviceT::compile_kernels() {
   if (_compiled)
     return flag;
 
-  std::string flags="-cl-mad-enable -D"+std::string(OCL_VENDOR);
   dev_program=new UCL_Program(*gpu);
-  int success=dev_program->load_string(device,flags.c_str());
+  int success=dev_program->load_string(device,compile_string().c_str());
   if (success!=UCL_SUCCESS)
     return -4;
   k_zero.set_function(*dev_program,"kernel_zero");

@ -640,10 +702,11 @@ Device<PRECISION,ACC_PRECISION> global_device;
 int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                     const int last_gpu, const int gpu_mode,
                     const double particle_split, const int nthreads,
-                    const int t_per_atom, const double cell_size) {
+                    const int t_per_atom, const double cell_size,
+                    char *opencl_vendor) {
   return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
                                    particle_split,nthreads,t_per_atom,
-                                   cell_size);
+                                   cell_size,opencl_vendor);
 }
 
 void lmp_clear_device() {

@ -654,3 +717,4 @@ double lmp_gpu_forces(double **f, double **tor, double *eatom,
                       double **vatom, double *virial, double &ecoul) {
   return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
 }
+
@ -45,11 +45,13 @@ class Device {
    * - -2 if GPU not found
    * - -4 if GPU library not compiled for GPU
    * - -6 if GPU could not be initialized for use
-   * - -7 if accelerator sharing is not currently allowed on system **/
+   * - -7 if accelerator sharing is not currently allowed on system
+   * - -11 if vendor_string has the wrong number of parameters **/
   int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                   const int last_gpu, const int gpu_mode,
                   const double particle_split, const int nthreads,
-                  const int t_per_atom, const double cell_size);
+                  const int t_per_atom, const double cell_size,
+                  char *vendor_string);
 
   /// Initialize the device for Atom and Neighbor storage
   /** \param rot True if quaternions need to be stored

@ -234,6 +236,8 @@ class Device {
   inline int max_bio_shared_types() const { return _max_bio_shared_types; }
   /// Architecture gpu code compiled for (returns 0 for OpenCL)
   inline double ptx_arch() const { return _ptx_arch; }
+  /// Number of threads executing concurrently on same multiproc
+  inline int warp_size() const { return _warp_size; }
 
   // -------------------- SHARED DEVICE ROUTINES --------------------
   // Perform asynchronous zero of integer array

@ -279,6 +283,8 @@ class Device {
                              boxlo,prd);
   }
 
+  inline std::string compile_string() { return _ocl_compile_string; }
+
  private:
   std::queue<Answer<numtyp,acctyp> *> ans_queue;
   int _init_count;

@ -305,6 +311,9 @@ class Device {
 
   int _data_in_estimate, _data_out_estimate;
 
+  std::string _ocl_vendor_name, _ocl_vendor_string, _ocl_compile_string;
+  int set_ocl_params(char *);
+
   template <class t>
   inline std::string toa(const t& in) {
     std::ostringstream o;
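The header hunks above expose the two accessors the rest of the commit consumes: compile_string() feeds every kernel build, and warp_size() is forwarded to the neighbor-list build through the extended Neighbor::init argument list. Call-site sketch:

    // How the new accessors are used elsewhere in this commit:
    prog->load_string(src,device->compile_string().c_str());
    const int ws=device->warp_size();  // threads executing in lockstep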
@ -72,7 +72,7 @@ int DipoleLJT::init(const int ntypes,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

@ -75,14 +75,14 @@ texture<int4,1> mu_tex;
   if (offset==0) { \
     engv+=ii; \
     if (eflag>0) { \
-      *engv=energy; \
+      *engv=energy*(acctyp)0.5; \
       engv+=inum; \
-      *engv=e_coul; \
+      *engv=e_coul*(acctyp)0.5; \
       engv+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]; \
+        *engv=virial[i]*(acctyp)0.5; \
         engv+=inum; \
       } \
     } \

@ -115,14 +115,14 @@ texture<int4,1> mu_tex;
   if (offset==0) { \
     engv+=ii; \
     if (eflag>0) { \
-      *engv=energy; \
+      *engv=energy*(acctyp)0.5; \
       engv+=inum; \
-      *engv=e_coul; \
+      *engv=e_coul*(acctyp)0.5; \
       engv+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]; \
+        *engv=virial[i]*(acctyp)0.5; \
         engv+=inum; \
       } \
     } \

@ -174,7 +174,8 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -385,7 +386,8 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -72,7 +72,7 @@ int DipoleLJSFT::init(const int ntypes,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

@ -76,14 +76,14 @@ texture<int4,1> mu_tex;
   if (offset==0) { \
     engv+=ii; \
     if (eflag>0) { \
-      *engv=energy; \
+      *engv=energy*(acctyp)0.5; \
       engv+=inum; \
-      *engv=e_coul; \
+      *engv=e_coul*(acctyp)0.5; \
       engv+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]; \
+        *engv=virial[i]*(acctyp)0.5; \
         engv+=inum; \
       } \
     } \

@ -116,14 +116,14 @@ texture<int4,1> mu_tex;
   if (offset==0) { \
     engv+=ii; \
     if (eflag>0) { \
-      *engv=energy; \
+      *engv=energy*(acctyp)0.5; \
       engv+=inum; \
-      *engv=e_coul; \
+      *engv=e_coul*(acctyp)0.5; \
       engv+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]; \
+        *engv=virial[i]*(acctyp)0.5; \
         engv+=inum; \
       } \
     } \

@ -175,7 +175,8 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -418,7 +419,8 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@ -66,7 +66,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
     ef_nall=2000;
 
   _max_fp_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
-  _fp.alloc(_max_fp_size,*(this->ucl_device),UCL_RW_OPTIMIZED,UCL_WRITE_ONLY);
+  _fp.alloc(_max_fp_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
 
   k_energy.set_function(*(this->pair_program),"k_energy");
   k_energy_fast.set_function(*(this->pair_program),"k_energy_fast");

@ -106,7 +106,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
   _nr=nr;
 
   UCL_H_Vec<int2> dview_type(lj_types*lj_types,*(this->ucl_device),
-                             UCL_WRITE_OPTIMIZED);
+                             UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++) {
     dview_type[i].x=0; dview_type[i].y=0;

@ -126,7 +126,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
 
   // pack type2frho
   UCL_H_Vec<int> dview_type2frho(lj_types,*(this->ucl_device),
-                                 UCL_WRITE_OPTIMIZED);
+                                 UCL_WRITE_ONLY);
 
   type2frho.alloc(lj_types,*(this->ucl_device),UCL_READ_ONLY);
   for (int i=0; i<ntypes; i++)

@ -135,7 +135,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
 
   // pack frho_spline
   UCL_H_Vec<numtyp4> dview_frho_spline(nfrho*(nrho+1),*(this->ucl_device),
-                                       UCL_WRITE_OPTIMIZED);
+                                       UCL_WRITE_ONLY);
 
   for (int ix=0; ix<nfrho; ix++)
     for (int iy=0; iy<nrho+1; iy++) {

@ -165,7 +165,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
 
   // pack rhor_spline
   UCL_H_Vec<numtyp4> dview_rhor_spline(nrhor*(nr+1),*(this->ucl_device),
-                                       UCL_WRITE_OPTIMIZED);
+                                       UCL_WRITE_ONLY);
 
   for (int ix=0; ix<nrhor; ix++)
     for (int iy=0; iy<nr+1; iy++) {

@ -195,7 +195,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
 
   // pack z2r_spline
   UCL_H_Vec<numtyp4> dview_z2r_spline(nz2r*(nr+1),*(this->ucl_device),
-                                      UCL_WRITE_OPTIMIZED);
+                                      UCL_WRITE_ONLY);
 
   for (int ix=0; ix<nz2r; ix++)
     for (int iy=0; iy<nr+1; iy++) {
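Besides the usual staging-buffer change, EAM's _fp array switches from (UCL_RW_OPTIMIZED, UCL_WRITE_ONLY) to (UCL_READ_WRITE, UCL_READ_WRITE). This matches the style's data flow, where the embedding derivative fp is produced on the device by k_energy, read back on the host between kernels, and consumed again by k_eam, so both sides need full access; the two flags in this alloc overload appear to be the host-side and device-side hints (our reading, the commit states no rationale):

    // _fp: produced by k_energy, read back, then consumed by k_eam.
    _fp.alloc(_max_fp_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);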
@ -79,7 +79,7 @@ texture<int4> z2r_sp2_tex;
     fetch4(coeff,index,frho_sp2_tex); \
     energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \
     if (rho > rhomax) energy += fp*(rho-rhomax); \
-    engv[ii]=(acctyp)2.0*energy; \
+    engv[ii]=energy; \
   } \
 }
 

@ -116,12 +116,12 @@ texture<int4> z2r_sp2_tex;
   } \
   if (offset==0) { \
     if (eflag>0) { \
-      engv[ii]+=energy; \
+      engv[ii]+=energy*(acctyp)0.5; \
       engv+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        engv[ii]=virial[i]; \
+        engv[ii]=virial[i]*(acctyp)0.5; \
         engv+=inum; \
       } \
     } \

@ -150,7 +150,7 @@ texture<int4> z2r_sp2_tex;
     fetch4(coeff,index,frho_sp2_tex); \
     energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \
     if (rho > rhomax) energy += fp*(rho-rhomax); \
-    engv[ii]=(acctyp)2.0*energy; \
+    engv[ii]=energy; \
   } \
 }
 

@ -173,12 +173,12 @@ texture<int4> z2r_sp2_tex;
   if (offset==0) { \
     engv+=ii; \
     if (eflag>0) { \
-      *engv+=energy; \
+      *engv+=energy*(acctyp)0.5; \
       engv+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]; \
+        *engv=virial[i]*(acctyp)0.5; \
         engv+=inum; \
       } \
     } \

@ -210,7 +210,8 @@ __kernel void k_energy(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -286,7 +287,8 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -353,7 +355,8 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -466,7 +469,8 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@ -83,12 +83,12 @@ texture<int4,1> pos_tex, quat_tex;
   if (offset==0) { \
     __global acctyp *ap1=engv+ii; \
     if (eflag>0) { \
-      *ap1=energy; \
+      *ap1=energy*(acctyp)0.5; \
       ap1+=astride; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *ap1=virial[i]; \
+        *ap1=virial[i]*(acctyp)0.5; \
         ap1+=astride; \
       } \
     } \

@ -130,12 +130,12 @@ texture<int4,1> pos_tex, quat_tex;
   if (offset==0) { \
     engv+=ii; \
     if (eflag>0) { \
-      *engv+=energy; \
+      *engv+=energy*(acctyp)0.5; \
       engv+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv+=virial[i]; \
+        *engv+=virial[i]*(acctyp)0.5; \
         engv+=inum; \
       } \
     } \

@ -170,12 +170,12 @@ texture<int4,1> pos_tex, quat_tex;
   if (offset==0) { \
     __global acctyp *ap1=engv+ii; \
     if (eflag>0) { \
-      *ap1=energy; \
+      *ap1=energy*(acctyp)0.5; \
       ap1+=astride; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *ap1=virial[i]; \
+        *ap1=virial[i]*(acctyp)0.5; \
         ap1+=astride; \
       } \
     } \

@ -202,12 +202,12 @@ texture<int4,1> pos_tex, quat_tex;
   if (offset==0) { \
     engv+=ii; \
     if (eflag>0) { \
-      *engv+=energy; \
+      *engv+=energy*(acctyp)0.5; \
       engv+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv+=virial[i]; \
+        *engv+=virial[i]*(acctyp)0.5; \
         engv+=inum; \
       } \
     } \
@ -68,7 +68,7 @@ int GaussT::init(const int ntypes,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

@ -52,7 +52,8 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 

@ -138,7 +139,8 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@ -77,7 +77,7 @@ int GayBerneT::init(const int ntypes, const double gamma,
 
   // Allocate a host write buffer for copying type data
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

@ -98,7 +98,7 @@ int GayBerneT::init(const int ntypes, const double gamma,
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
                          host_offset);
 
-  dev_error.alloc(1,*(this->ucl_device));
+  dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY);
   dev_error.zero();
 
   // Allocate, cast and asynchronous memcpy of constant data

@ -258,6 +258,9 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
                           &ainum, &this->_threads_per_atom);
     this->time_ellipsoid2.stop();
   } else {
+    GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
+                                                 this->_last_ellipse)/
+                             (BX/this->_threads_per_atom)));
     this->ans->force.zero();
     this->ans->engv.zero();
     this->time_nbor1.stop();
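The three added lines in GayBerneT::loop recompute the launch grid for the sphere-sphere branch: only atoms past _last_ellipse are handled there, so the work-group count GX is sized for that remainder rather than for all inum atoms, with BX work-items per block and _threads_per_atom of them cooperating per atom (our reading of the arithmetic):

    GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
                                                 this->_last_ellipse)/
                             (BX/this->_threads_per_atom)));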
@@ -120,7 +120,8 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *nbor_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,nbor_end,nbor);
 
@@ -54,7 +54,8 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *nbor_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,nbor_end,nbor);
 
@@ -276,7 +277,8 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,list_end,nbor);
 
@@ -370,7 +372,8 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,list_end,nbor);
 
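Note: the swap above of "int i, numj, n_stride;" for a separate "__local int n_stride;" recurs in every pair kernel in this commit (lj, lj96, class2, coul, debye, dsf, expand, morse, resquared, table). The neighbor-row stride is identical for all work-items in a work-group, so a single copy in __local (work-group shared) memory is safe and presumably frees a register per thread. A minimal sketch of the idiom, with the nbor_info macro's bookkeeping stubbed out:

  // Hypothetical kernel, for illustration only; stride values are made up.
  __kernel void k_sketch(const __global int *dev_packed, const int inum,
                         const int nbor_pitch, const int t_per_atom) {
    int ii=get_global_id(0)/t_per_atom;
    if (ii<inum) {
      const __global int *nbor, *list_end;
      __local int n_stride;            // shared by the whole work-group
      n_stride=t_per_atom*nbor_pitch;  // every thread stores the same value
      nbor=dev_packed+ii;              // stand-in for nbor_info's setup
      list_end=nbor+4*n_stride;        // pretend each atom has 4 rows
      for (; nbor<list_end; nbor+=n_stride) {
        int j=*nbor;                   // neighbor index for this row
        (void)j;                       // pair force on (i,j) would go here
      }
    }
  }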
@@ -69,7 +69,7 @@ int LJT::init(const int ntypes,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
@@ -53,7 +53,8 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -142,7 +143,8 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -69,7 +69,7 @@ int LJ96T::init(const int ntypes,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
@@ -53,7 +53,8 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -143,7 +144,8 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -73,7 +73,7 @@ int LJClass2LongT::init(const int ntypes, double **host_cutsq,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
@@ -66,7 +66,8 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -187,7 +188,8 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -72,7 +72,7 @@ int LJCoulT::init(const int ntypes,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
@@ -66,7 +66,8 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -178,7 +179,8 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -73,7 +73,7 @@ int LJCoulDebyeT::init(const int ntypes,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
@@ -67,7 +67,8 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -186,7 +187,8 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -73,7 +73,7 @@ int LJCoulLongT::init(const int ntypes,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
@@ -66,7 +66,8 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -183,7 +184,8 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -77,7 +77,7 @@ int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
@@ -69,7 +69,8 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -195,7 +196,8 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -69,7 +69,7 @@ int LJExpandT::init(const int ntypes, double **host_cutsq,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
@@ -55,7 +55,8 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -147,7 +148,8 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -69,7 +69,7 @@ int MorseT::init(const int ntypes,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(types*types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<types*types; i++)
     host_write[i]=0.0;
@@ -55,7 +55,8 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -145,7 +146,8 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -38,7 +38,8 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
                     const bool pre_cut, const int block_cell_2d,
                     const int block_cell_id, const int block_nbor_build,
                     const int threads_per_atom, const int warp_size,
-                    const bool time_device) {
+                    const bool time_device,
+                    const std::string compile_flags) {
   clear();
 
   _threads_per_atom=threads_per_atom;
@@ -92,13 +93,13 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
 
   if (gpu_nbor==0)
     success=success && (host_packed.alloc(2*IJ_SIZE,*dev,
-                                          UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
+                                          UCL_WRITE_ONLY)==UCL_SUCCESS);
   alloc(success);
   if (!success)
     return false;
 
   if (_use_packing==false)
-    _shared->compile_kernels(devi,gpu_nbor);
+    _shared->compile_kernels(devi,gpu_nbor,compile_flags);
 
   return success;
 }
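Note: Neighbor::init now receives the OpenCL JIT compile flags from its caller and forwards them to NeighborShared::compile_kernels, rather than each compile site assembling its own string (see the removal in neighbor_shared.cpp below). A sketch of the assumed call, with device->compile_string() — introduced later in this diff for PPPM — taken to be the central builder; the exact caller is not shown in this commit:

  std::string flags=device->compile_string();  // assumed single source of flags
  nbor->init(&shared,inum,host_inum,max_nbors,maxspecial,dev,gpu_nbor,
             gpu_host,pre_cut,block_cell_2d,block_cell_id,block_nbor_build,
             threads_per_atom,warp_size,time_device,flags);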
@@ -114,7 +115,7 @@ void Neighbor::alloc(bool &success) {
   success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
                                      UCL_READ_ONLY)==UCL_SUCCESS);
   success=success && (host_acc.alloc(nt*2,*dev,
-                                     UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
+                                     UCL_READ_WRITE)==UCL_SUCCESS);
 
   _c_bytes=dev_nbor.row_bytes();
   if (_alloc_packed) {
@@ -129,10 +130,10 @@ void Neighbor::alloc(bool &success) {
     host_ilist.clear();
     host_jlist.clear();
 
-    success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_RW_OPTIMIZED,
-                             UCL_WRITE_ONLY)==UCL_SUCCESS) && success;
+    success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_READ_WRITE,
+                             UCL_READ_WRITE)==UCL_SUCCESS) && success;
     success=success && (dev_numj_host.alloc(_max_host,*dev,
-                                            UCL_WRITE_ONLY)==UCL_SUCCESS);
+                                            UCL_READ_WRITE)==UCL_SUCCESS);
     success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
     if (!success)
       return;
@@ -161,7 +162,7 @@ void Neighbor::alloc(bool &success) {
   success=success && (dev_nspecial.alloc(3*at,*dev,
                                          UCL_READ_ONLY)==UCL_SUCCESS);
   success=success && (dev_special.alloc(_maxspecial*at,*dev,
-                                        UCL_READ_ONLY)==UCL_SUCCESS);
+                                        UCL_READ_WRITE)==UCL_SUCCESS);
   success=success && (dev_special_t.alloc(_maxspecial*at,*dev,
                                           UCL_READ_ONLY)==UCL_SUCCESS);
   _gpu_bytes+=dev_nspecial.row_bytes()+dev_special.row_bytes()+
@@ -178,11 +179,9 @@ void Neighbor::clear() {
   _bin_time=0.0;
   if (_ncells>0) {
     _ncells=0;
-    dev_cell_counts.clear();
-    if (_gpu_nbor==2) {
-      host_cell_counts.clear();
+    cell_counts.clear();
+    if (_gpu_nbor==2)
       delete [] cell_iter;
-    }
   }
   if (_allocated) {
     _allocated=false;
@@ -286,6 +285,80 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
   }
 }
 
+// This is the same as get host, but the requirement that ilist[i]=i and
+// inum=nlocal is forced to be true to allow direct indexing of neighbors of
+// neighbors
+void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
+                         int **firstneigh, const int block_size) {
+  _nbor_time_avail=true;
+  time_nbor.start();
+
+  UCL_H_Vec<int> ilist_view;
+  ilist_view.view(ilist,inum,*dev);
+  ucl_copy(dev_nbor,ilist_view,false);
+
+  UCL_D_Vec<int> nbor_offset;
+  UCL_H_Vec<int> host_offset;
+
+  int copy_count=0;
+  int ij_count=0;
+  int acc_count=0;
+  int dev_count=0;
+  int *h_ptr=host_packed.begin();
+  _nbor_pitch=inum;
+
+  if (nlist!=inum)
+    host_acc.zero(inum);
+
+  for (int ii=0; ii<nlist; ii++) {
+    int i=ilist[ii];
+    int nj=numj[i];
+    host_acc[i]=nj;
+    host_acc[i+inum]=acc_count;
+    acc_count+=nj;
+  }
+
+  for (int i=0; i<inum; i++) {
+    int nj=host_acc[i];
+    int *jlist=firstneigh[i];
+    for (int jj=0; jj<nj; jj++) {
+      *h_ptr=jlist[jj];
+      h_ptr++;
+      ij_count++;
+
+      if (ij_count==IJ_SIZE) {
+        dev_nbor.sync();
+        host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
+        nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
+        ucl_copy(nbor_offset,host_offset,true);
+        copy_count++;
+        ij_count=0;
+        dev_count+=IJ_SIZE;
+        h_ptr=host_packed.begin()+(IJ_SIZE*(copy_count%2));
+      }
+    }
+  }
+  if (ij_count!=0) {
+    dev_nbor.sync();
+    host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
+    nbor_offset.view_offset(dev_count,dev_packed,ij_count);
+    ucl_copy(nbor_offset,host_offset,true);
+  }
+  UCL_D_Vec<int> acc_view;
+  acc_view.view_offset(inum,dev_nbor,inum*2);
+  ucl_copy(acc_view,host_acc,true);
+  time_nbor.stop();
+
+  if (_use_packing==false) {
+    time_kernel.start();
+    int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
+                                 block_size));
+    _shared->k_nbor.set_size(GX,block_size);
+    _shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom);
+    time_kernel.stop();
+  }
+}
+
 template <class numtyp, class acctyp>
 void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
   if (maxn>_max_nbors) {
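Note: get_host3 is the 3-body variant of get_host, added for the new Stillinger-Weber support in this commit. Forcing ilist[i]==i and inum==nlocal means a kernel can locate atom j's neighbor row directly from the index j, which is what "direct indexing of neighbors of neighbors" requires. The packing loop double-buffers through host_packed in IJ_SIZE chunks so the asynchronous ucl_copy of one half overlaps the CPU filling the other. A condensed sketch of that staging (hypothetical flat input; names otherwise as in the routine above):

  int copy_count=0, ij_count=0, dev_count=0;
  int *h_ptr=host_packed.begin();            // slot 0 of the two slots
  for (int idx=0; idx<total_pairs; idx++) {  // total_pairs: assumed count
    *h_ptr++=flat_jlist[idx];                // flat_jlist: assumed input
    if (++ij_count==IJ_SIZE) {               // slot full: ship it async
      host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
      nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
      ucl_copy(nbor_offset,host_offset,true);  // true = asynchronous
      copy_count++; ij_count=0; dev_count+=IJ_SIZE;
      h_ptr=host_packed.begin()+(IJ_SIZE*(copy_count%2));  // switch slots
    }
  }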
@@ -330,24 +403,20 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   ncellz = static_cast<int>(ceil((subhi[2]-sublo[2])/_cell_size))+ghost_cells;
   ncell_3d = ncellx * ncelly * ncellz;
   if (ncell_3d+1>_ncells) {
-    if (_gpu_nbor==2) {
-      if (_ncells>0) {
-        host_cell_counts.clear();
-        delete [] cell_iter;
-      }
-      cell_iter = new int[ncell_3d+1];
-      host_cell_counts.alloc(ncell_3d+1,dev_nbor);
-    }
+    cell_counts.clear();
 
-    if (_gpu_nbor==2 && atom.host_view())
-      dev_cell_counts.view(host_cell_counts);
-    else {
-      dev_cell_counts.clear();
-      dev_cell_counts.alloc(ncell_3d+1,dev_nbor);
+    if (_gpu_nbor==2) {
+      if (_ncells>0)
+        delete [] cell_iter;
+      cell_iter = new int[ncell_3d+1];
+      cell_counts.alloc(ncell_3d+1,dev_nbor,UCL_READ_WRITE,UCL_READ_ONLY);
+    } else {
+      cell_counts.device.clear();
+      cell_counts.device.alloc(ncell_3d+1,dev_nbor);
     }
 
     _ncells=ncell_3d+1;
-    _cell_bytes=dev_cell_counts.row_bytes();
+    _cell_bytes=cell_counts.device.row_bytes();
   }
 
   const numtyp cutoff_cast=static_cast<numtyp>(_cutoff);
@@ -381,7 +450,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   int *particle_id=atom.host_particle_id.begin();
 
   // Build cell list on CPU
-  host_cell_counts.zero();
+  cell_counts.host.zero();
   double i_cell_size=1.0/_cell_size;
 
   int offset_hi=_cells_in_cutoff+1;
@@ -403,7 +472,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
 
       int id = ix+iy*ncellx+iz*ncellx*ncelly;
       cell_id[i] = id;
-      host_cell_counts[id+1]++;
+      cell_counts[id+1]++;
     }
 
   for (int i=nt; i<nall; i++) {
@@ -424,12 +493,12 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
 
     int id = ix+iy*ncellx+iz*ncellx*ncelly;
     cell_id[i] = id;
-    host_cell_counts[id+1]++;
+    cell_counts[id+1]++;
   }
 
   mn=0;
   for (int i=0; i<_ncells; i++)
-    mn=std::max(mn,host_cell_counts[i]);
+    mn=std::max(mn,cell_counts[i]);
   mn*=8;
   set_nbor_block_size(mn/2);
 
@@ -440,11 +509,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
 
   cell_iter[0]=0;
   for (int i=1; i<_ncells; i++) {
-    host_cell_counts[i]+=host_cell_counts[i-1];
-    cell_iter[i]=host_cell_counts[i];
+    cell_counts[i]+=cell_counts[i-1];
+    cell_iter[i]=cell_counts[i];
   }
   time_hybrid1.start();
-  ucl_copy(dev_cell_counts,host_cell_counts,true);
+  cell_counts.update_device(true);
   time_hybrid1.stop();
   for (int i=0; i<nall; i++) {
     int celli=cell_id[i];
@@ -481,7 +550,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
 
     /* calculate cell count */
     _shared->k_cell_counts.set_size(GX,neigh_block);
-    _shared->k_cell_counts.run(&atom.dev_cell_id, &dev_cell_counts, &nall,
+    _shared->k_cell_counts.run(&atom.dev_cell_id, &cell_counts, &nall,
                                &ncell_3d);
   }
 
@@ -490,7 +559,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   _shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)*
                                  (ncellz-ghost_cells),cell_block,1);
   _shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id,
-                            &dev_cell_counts, &dev_nbor, &nbor_host,
+                            &cell_counts, &dev_nbor, &nbor_host,
                             &dev_numj_host, &_max_nbors, &cutoff_cast, &ncellx,
                             &ncelly, &ncellz, &inum, &nt, &nall,
                             &_threads_per_atom, &_cells_in_cutoff);
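Note: the host_cell_counts (UCL_H_Vec) / dev_cell_counts (UCL_D_Vec) pair becomes a single UCL_Vector<int,int> cell_counts, a host/device couple synchronized explicitly. As used in the hunks above: operator[] and zero() act on the host copy, .device names the device buffer, and update_device(true) issues the asynchronous host-to-device copy that replaces the old ucl_copy(dev_cell_counts,host_cell_counts,true). Sketch of the idiom:

  UCL_Vector<int,int> cell_counts;
  cell_counts.alloc(ncell_3d+1,dev_nbor,UCL_READ_WRITE,UCL_READ_ONLY);
  cell_counts.host.zero();          // clear host side before binning
  cell_counts[id+1]++;              // host-side histogram of atoms per cell
  cell_counts.update_device(true);  // async push to the device buffer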
@@ -47,13 +47,15 @@ class Neighbor {
    * \param pre_cut True if cutoff test will be performed in separate kernel
    *                than the force kernel
    * \param threads_per_atom Number of threads used per atom for force
-   *                         calculation **/
+   *                         calculation
+   * \param compile_flags Flags for JIT compiling **/
   bool init(NeighborShared *shared, const int inum, const int host_inum,
             const int max_nbors, const int maxspecial, UCL_Device &dev,
             const int gpu_nbor, const int gpu_host, const bool pre_cut,
             const int block_cell_2d, const int block_cell_id,
             const int block_nbor_build, const int threads_per_atom,
-            const int warp_size, const bool time_device);
+            const int warp_size, const bool time_device,
+            const std::string compile_flags);
 
   /// Set the size of the cutoff+skin
   inline void cell_size(const double size, const double cutoff) {
@@ -143,6 +145,10 @@ class Neighbor {
   void get_host(const int inum, int *ilist, int *numj,
                 int **firstneigh, const int block_size);
 
+  /// Copy neighbor list from host for 3-body (first time or from a rebuild)
+  void get_host3(const int inum, const int nlist, int *ilist, int *numj,
+                 int **firstneigh, const int block_size);
+
   /// Return the stride in elements for each nbor row
   inline int nbor_pitch() const { return _nbor_pitch; }
 
@@ -207,11 +213,9 @@ class Neighbor {
   UCL_D_Vec<int> dev_nspecial;
   /// Device storage for special neighbors
   UCL_D_Vec<int> dev_special, dev_special_t;
-  /// Host storage for number of particles per cell
-  UCL_H_Vec<int> host_cell_counts;
+  /// Host/Device storage for number of particles per cell
+  UCL_Vector<int,int> cell_counts;
   int *cell_iter;
-  /// Device storage for number of particles per cell
-  UCL_D_Vec<int> dev_cell_counts;
 
   /// Device timers
   UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose;
@@ -48,15 +48,12 @@ void NeighborShared::clear() {
   }
 }
 
-void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor) {
+void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor,
+                                     const std::string flags) {
   if (_compiled)
     return;
 
   _gpu_nbor=gpu_nbor;
-  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
-                    std::string(OCL_PRECISION_COMPILE)+" -D"+
-                    std::string(OCL_VENDOR);
 
   if (_gpu_nbor==0) {
     nbor_program=new UCL_Program(dev);
     nbor_program->load_string(neighbor_cpu,flags.c_str());
@@ -44,7 +44,8 @@ class NeighborShared {
   UCL_Texture neigh_tex;
 
   /// Compile kernels for neighbor lists
-  void compile_kernels(UCL_Device &dev, const int gpu_nbor);
+  void compile_kernels(UCL_Device &dev, const int gpu_nbor,
+                       const std::string flags);
 
   // ----------------------------- Kernels
   UCL_Program *nbor_program, *build_program;
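Note: compile_kernels no longer hard-codes "-cl-fast-relaxed-math -cl-mad-enable ..."+OCL_VENDOR; the flag string arrives as a parameter, and PPPMT::compile_kernels below switches to device->compile_string(). The builder itself is not part of this diff; a hypothetical sketch of what it centralizes:

  // Hypothetical: not the real Device::compile_string() implementation.
  std::string Device::compile_string() {
    return std::string("-cl-fast-relaxed-math -cl-mad-enable ")+
           std::string(OCL_PRECISION_COMPILE)+
           " -D"+_ocl_vendor;  // _ocl_vendor: assumed member holding the tune name
  }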
@@ -136,10 +136,10 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
   _npts_y=nyhi_out-nylo_out+1;
   _npts_z=nzhi_out-nzlo_out+1;
   _npts_yx=_npts_x*_npts_y;
-  success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
-                      UCL_SUCCESS);
-  success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
-                      UCL_SUCCESS);
+  success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device,
+                                  UCL_READ_ONLY,UCL_WRITE_ONLY)==UCL_SUCCESS);
+  success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device,
+                                     UCL_READ_WRITE,UCL_READ_ONLY)==UCL_SUCCESS);
   *vd_brick_p=vd_brick.host.begin();
   _max_bytes+=brick.device.row_bytes()+vd_brick.device.row_bytes();
 
@@ -159,7 +159,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
   _max_bytes+=d_brick_atoms.row_bytes();
 
   // Allocate error flags for checking out of bounds atoms
-  success=success && (error_flag.alloc(1,*ucl_device,UCL_RW_OPTIMIZED,
+  success=success && (error_flag.alloc(1,*ucl_device,UCL_READ_ONLY,
                                        UCL_WRITE_ONLY)==UCL_SUCCESS);
   if (!success) {
     flag=-3;
@@ -374,9 +374,7 @@ void PPPMT::compile_kernels(UCL_Device &dev) {
   if (sizeof(grdtyp)==sizeof(double) && ucl_device->double_precision()==false)
     return;
 
-  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
-                    std::string(OCL_PRECISION_COMPILE)+" -D"+
-                    std::string(OCL_VENDOR);
+  std::string flags=device->compile_string();
 #ifdef USE_OPENCL
   flags+=std::string(" -Dgrdtyp=")+ucl_template_name<grdtyp>()+" -Dgrdtyp4="+
          ucl_template_name<grdtyp>()+"4";
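Note: brick and vd_brick are host/device couples as well; their alloc calls now pass two explicit access hints instead of defaults, one per side (taken here as device hint first, host hint second — an assumption, since the diff shows only the call sites):

  // e.g. the charge brick above:
  brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device,
              UCL_READ_ONLY,    // one side is only read
              UCL_WRITE_ONLY);  // the other is only written (staging)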
@@ -97,11 +97,11 @@ float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen,
                         const int nzhi_out, float **rho_coeff,
                         float **vd_brick, const double slab_volfactor,
                         const int nx_pppm, const int ny_pppm, const int nz_pppm,
-                        const bool split, int &success) {
+                        const bool split, const bool respa, int &success) {
   float *b=pppm_gpu_init(PPPMF,nlocal,nall,screen,order,nxlo_out,nylo_out,
                          nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,
                          slab_volfactor,nx_pppm,ny_pppm,nz_pppm,split,success);
-  if (split==false)
+  if (split==false && respa==false)
     PPPMF.device->set_single_precompute(&PPPMF);
   return b;
 }
@@ -139,12 +139,13 @@ double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen,
                          const int nzhi_out, double **rho_coeff,
                          double **vd_brick, const double slab_volfactor,
                          const int nx_pppm, const int ny_pppm,
-                         const int nz_pppm, const bool split, int &success) {
+                         const int nz_pppm, const bool split,
+                         const bool respa, int &success) {
   double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out,
                           nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,
                           vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm,
                           split,success);
-  if (split==false)
+  if (split==false && respa==false)
     PPPMD.device->set_double_precompute(&PPPMD);
   return b;
 }
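Note: the new respa flag means the precision-specific precompute shortcut is registered only when neither split force computation nor rRESPA multi-timestep integration is active; presumably the rRESPA path needs the standard per-step compute path. (The rationale is inferred; the diff itself only shows the guard, applied identically in both wrappers above:)

  if (split==false && respa==false)
    PPPMF.device->set_single_precompute(&PPPMF);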
@@ -96,19 +96,27 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
 
 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
 
-// OCL_VENDOR: preprocessor define for hardware
+// OCL_DEFAULT_VENDOR: preprocessor define for hardware
 // specific sizes of OpenCL kernel related constants
 
 #ifdef FERMI_OCL
-#define OCL_VENDOR "FERMI_OCL"
+#define OCL_DEFAULT_VENDOR "fermi"
+#endif
+
+#ifdef KEPLER_OCL
+#define OCL_DEFAULT_VENDOR "kepler"
 #endif
 
 #ifdef CYPRESS_OCL
-#define OCL_VENDOR "CYPRESS_OCL"
+#define OCL_DEFAULT_VENDOR "cypress"
 #endif
 
-#ifndef OCL_VENDOR
-#define OCL_VENDOR "GENERIC_OCL"
+#ifdef GENERIC_OCL
+#define OCL_DEFAULT_VENDOR "generic"
+#endif
+
+#ifndef OCL_DEFAULT_VENDOR
+#define OCL_DEFAULT_VENDOR "none"
 #endif
 
 #endif
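Note: net effect of the macro chain above — each Makefile tune flag (see the OCL_TUNE lines at the top of this commit) now yields a lowercase vendor string instead of redefining OCL_VENDOR, so the vendor choice becomes data for the JIT compile step rather than a host-side define:

  //   -DFERMI_OCL    ->  OCL_DEFAULT_VENDOR == "fermi"
  //   -DKEPLER_OCL   ->  OCL_DEFAULT_VENDOR == "kepler"
  //   -DCYPRESS_OCL  ->  OCL_DEFAULT_VENDOR == "cypress"
  //   -DGENERIC_OCL  ->  OCL_DEFAULT_VENDOR == "generic"
  //   (none of them) ->  OCL_DEFAULT_VENDOR == "none"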
@@ -214,6 +214,30 @@ typedef struct _double4 double4;
 
 #endif
 
+// -------------------------------------------------------------------------
+// NVIDIA GENERIC OPENCL DEFINITIONS
+// -------------------------------------------------------------------------
+
+#ifdef NV_GENERIC_OCL
+
+#define USE_OPENCL
+#define fast_mul mul24
+#define MEM_THREADS 16
+#define THREADS_PER_ATOM 1
+#define THREADS_PER_CHARGE 1
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8
+#define BLOCK_NBOR_BUILD 64
+#define BLOCK_BIO_PAIR 64
+
+#define WARP_SIZE 32
+#define PPPM_BLOCK_1D 64
+#define BLOCK_CELL_2D 8
+#define BLOCK_CELL_ID 128
+#define MAX_BIO_SHARED_TYPES 128
+
+#endif
+
 // -------------------------------------------------------------------------
 // NVIDIA FERMI OPENCL DEFINITIONS
 // -------------------------------------------------------------------------
@@ -221,9 +245,6 @@ typedef struct _double4 double4;
 #ifdef FERMI_OCL
 
 #define USE_OPENCL
-#define fast_mul(X,Y) (X)*(Y)
-#define ARCH 0
-#define DRIVER 0
 #define MEM_THREADS 32
 #define THREADS_PER_ATOM 4
 #define THREADS_PER_CHARGE 8
@@ -238,7 +259,54 @@ typedef struct _double4 double4;
 #define BLOCK_CELL_ID 128
 #define MAX_BIO_SHARED_TYPES 128
 
-#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#endif
+
+// -------------------------------------------------------------------------
+// NVIDIA KEPLER OPENCL DEFINITIONS
+// -------------------------------------------------------------------------
+
+#ifdef KEPLER_OCL
+
+#define USE_OPENCL
+#define MEM_THREADS 32
+#define THREADS_PER_ATOM 4
+#define THREADS_PER_CHARGE 8
+#define BLOCK_PAIR 256
+#define MAX_SHARED_TYPES 11
+#define BLOCK_NBOR_BUILD 128
+#define BLOCK_BIO_PAIR 256
+#define BLOCK_ELLIPSE 128
+
+#define WARP_SIZE 32
+#define PPPM_BLOCK_1D 64
+#define BLOCK_CELL_2D 8
+#define BLOCK_CELL_ID 128
+#define MAX_BIO_SHARED_TYPES 128
+
+#ifndef NO_OCL_PTX
+#define ARCH 300
+#ifdef _SINGLE_SINGLE
+inline float shfl_xor(float var, int laneMask, int width) {
+  float ret;
+  int c;
+  c = ((WARP_SIZE-width) << 8) | 0x1f;
+  asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c));
+  return ret;
+}
+#else
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+inline double shfl_xor(double var, int laneMask, int width) {
+  int c = ((WARP_SIZE-width) << 8) | 0x1f;
+  int x,y,x2,y2;
+  double ans;
+  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
+  asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(x2) : "r"(x), "r"(laneMask), "r"(c));
+  asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(y2) : "r"(y), "r"(laneMask), "r"(c));
+  asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
+  return ans;
+}
+#endif
+#endif
 
 #endif
 
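Note: the Kepler block JIT-compiles inline PTX so OpenCL kernels get the warp butterfly shuffle: shfl.bfly.b32 exchanges a register between lanes whose IDs differ by laneMask, with no shared-memory round-trip; the double flavor moves the two 32-bit halves separately. A typical use is a warp-level reduction over the threads assigned to one atom, sketched here under the definitions above:

  // Butterfly sum over t_per_atom lanes; afterwards every lane holds the total.
  inline float warp_sum(float e, int t_per_atom) {
    for (int mask=t_per_atom/2; mask>0; mask>>=1)
      e+=shfl_xor(e,mask,t_per_atom);
    return e;
  }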
@@ -249,9 +317,6 @@ typedef struct _double4 double4;
 #ifdef CYPRESS_OCL
 
 #define USE_OPENCL
-#define fast_mul(X,Y) (X)*(Y)
-#define ARCH 0
-#define DRIVER 0
 #define MEM_THREADS 32
 #define THREADS_PER_ATOM 4
 #define THREADS_PER_CHARGE 8
@@ -266,12 +331,6 @@ typedef struct _double4 double4;
 #define BLOCK_CELL_ID 128
 #define MAX_BIO_SHARED_TYPES 128
 
-#if defined(cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-#elif defined(cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64 : enable
-#endif
-
 #endif
 
 // -------------------------------------------------------------------------
@@ -281,9 +340,6 @@ typedef struct _double4 double4;
 #ifdef GENERIC_OCL
 
 #define USE_OPENCL
-#define fast_mul mul24
-#define ARCH 0
-#define DRIVER 0
 #define MEM_THREADS 16
 #define THREADS_PER_ATOM 1
 #define THREADS_PER_CHARGE 1
@@ -298,6 +354,20 @@ typedef struct _double4 double4;
 #define BLOCK_CELL_ID 128
 #define MAX_BIO_SHARED_TYPES 128
 
+#endif
+
+// -------------------------------------------------------------------------
+// OPENCL Stuff for All Hardware
+// -------------------------------------------------------------------------
+#ifdef USE_OPENCL
+
+#ifndef _SINGLE_SINGLE
+
+#ifndef cl_khr_fp64
+#ifndef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+#endif
 #if defined(cl_khr_fp64)
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #elif defined(cl_amd_fp64)
@@ -306,10 +376,17 @@ typedef struct _double4 double4;
 
 #endif
 
-// -------------------------------------------------------------------------
-// OPENCL Stuff for All Hardware
-// -------------------------------------------------------------------------
-#ifdef USE_OPENCL
+#ifndef fast_mul
+#define fast_mul(X,Y) (X)*(Y)
+#endif
+
+#ifndef ARCH
+#define ARCH 0
+#endif
+
+#ifndef DRIVER
+#define DRIVER 0
+#endif
 
 #define GLOBAL_ID_X get_global_id(0)
 #define THREAD_ID_X get_local_id(0)
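Note: with the per-vendor fast_mul/ARCH/DRIVER lines deleted above, the shared section now supplies defaults only where a vendor block did not define its own, so NV_GENERIC_OCL keeps its mul24-based fast_mul and Kepler keeps ARCH 300, while every other target falls back to:

  #ifndef fast_mul
  #define fast_mul(X,Y) (X)*(Y)  // plain multiply when no fast path is defined
  #endif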
@@ -74,7 +74,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
 
   // Allocate a host write buffer for copying type data
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
@@ -95,7 +95,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
                          host_offset);
 
-  dev_error.alloc(1,*(this->ucl_device));
+  dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY);
   dev_error.zero();
 
   // Allocate, cast and asynchronous memcpy of constant data
@@ -260,6 +260,9 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
                            &this->_threads_per_atom);
     this->time_ellipsoid3.stop();
   } else {
+    GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
+                             this->_last_ellipse)/
+                             (BX/this->_threads_per_atom)));
     this->ans->force.zero();
     this->ans->engv.zero();
     this->time_nbor1.zero();
@@ -75,7 +75,8 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *nbor_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,nbor_end,nbor);
 
@@ -59,12 +59,12 @@
   if (offset==0) { \
     __global acctyp *ap1=engv+ii; \
     if (eflag>0) { \
-      *ap1+=energy; \
+      *ap1+=energy*(acctyp)0.5; \
       ap1+=astride; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *ap1+=virial[i]; \
+        *ap1+=virial[i]*(acctyp)0.5; \
        ap1+=astride; \
       } \
     } \
@@ -104,12 +104,12 @@
   if (offset==0) { \
     __global acctyp *ap1=engv+ii; \
     if (eflag>0) { \
-      *ap1+=energy; \
+      *ap1+=energy*(acctyp)0.5; \
       ap1+=astride; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *ap1+=virial[i]; \
+        *ap1+=virial[i]*(acctyp)0.5; \
        ap1+=astride; \
       } \
     } \
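Note: the (acctyp)0.5 factors halve the energy and virial at store time. The reading here (inferred, not stated in the diff) is that these ellipsoid answer macros are reached once from each atom of a pair, so every pair contribution would otherwise be counted twice; storing half per atom keeps the summed totals correct:

  // E_total = sum_i E_i with E_i = 0.5 * sum_{j in N(i)} e_ij
  *ap1+=energy*(acctyp)0.5;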
@@ -173,7 +173,8 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *nbor_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,nbor_end,nbor);
 
@@ -424,7 +425,8 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *nbor_end;
-    int j, numj, n_stride;
+    int j, numj;
+    __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj,
                 n_stride,nbor_end,nbor);
 
@@ -615,7 +617,8 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,list_end,nbor);
 
@@ -708,7 +711,8 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,list_end,nbor);
 
@@ -88,7 +88,7 @@ int TableT::init(const int ntypes,
 
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<int> host_write_int(lj_types*lj_types,*(this->ucl_device),
-                                UCL_WRITE_OPTIMIZED);
+                                UCL_WRITE_ONLY);
 
   for (int i=0; i<lj_types*lj_types; i++)
     host_write_int[i] = 0;
@@ -113,7 +113,7 @@ int TableT::init(const int ntypes,
   ucl_copy(nmask,host_write_int,false);
 
   UCL_H_Vec<numtyp4> host_write(lj_types*lj_types,*(this->ucl_device),
-                                UCL_WRITE_OPTIMIZED);
+                                UCL_WRITE_ONLY);
 
   coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   for (int ix=1; ix<ntypes; ix++)
@@ -127,7 +127,7 @@ int TableT::init(const int ntypes,
 
   // Allocate tablength arrays
   UCL_H_Vec<numtyp4> host_write2(_ntables*_tablength,*(this->ucl_device),
-                                 UCL_WRITE_OPTIMIZED);
+                                 UCL_WRITE_ONLY);
   for (int i=0; i<_ntables*_tablength; i++) {
     host_write2[i].x = 0.0;
     host_write2[i].y = 0.0;
@@ -190,7 +190,7 @@ int TableT::init(const int ntypes,
   ucl_copy(coeff4,host_write2,false);
 
   UCL_H_Vec<numtyp> host_rsq(lj_types*lj_types,*(this->ucl_device),
-                             UCL_WRITE_OPTIMIZED);
+                             UCL_WRITE_ONLY);
   cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq);
 
@@ -74,7 +74,8 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -171,7 +172,8 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -268,7 +270,8 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -369,7 +372,8 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -470,7 +474,8 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -578,7 +583,8 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -688,7 +694,8 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
@@ -794,7 +801,8 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
 
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 