git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@10667 f3b2605a-c512-4ea7-a41b-209d697bcdaa

commit 402d1a8605
parent 4047288238
Author: sjplimp
Date:   2013-08-23 14:41:20 +00:00

105 changed files with 2003 additions and 954 deletions

View File

@@ -4,9 +4,14 @@
 # which file will be copied to Makefile.lammps
-EXTRAMAKE = Makefile.lammps.standard
+EXTRAMAKE = Makefile.lammps.opencl
-OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
+OCL_TUNE = -DFERMI_OCL     # -- Uncomment for NVIDIA Fermi
+# OCL_TUNE = -DKEPLER_OCL  # -- Uncomment for NVIDIA Kepler
+# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress
+# OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device
+OCL_CPP = mpic++ $(DEFAULT_DEVICE) -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 OCL_LINK = -lOpenCL
 OCL_PREC = -D_SINGLE_SINGLE

View File

@@ -4,7 +4,12 @@
 # which file will be copied to Makefile.lammps
-EXTRAMAKE = Makefile.lammps.standard
+EXTRAMAKE = Makefile.lammps.mac_ocl
+OCL_TUNE = -DFERMI_OCL     # -- Uncomment for NVIDIA Fermi
+# OCL_TUNE = -DKEPLER_OCL  # -- Uncomment for NVIDIA Kepler
+# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress
+# OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device
 OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT
 OCL_LINK = -framework OpenCL

View File

@@ -28,7 +28,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
 	$(OBJ_DIR)/lal_neighbor.o $(OBJ_DIR)/lal_neighbor_shared.o \
 	$(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
 	$(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
-	$(OBJ_DIR)/lal_base_dipole.o \
+	$(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \
 	$(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
 	$(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
 	$(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@@ -59,7 +59,12 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
 	$(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
 	$(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
 	$(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
-	$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o
+	$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o \
+	$(OBJ_DIR)/lal_sw.o $(OBJ_DIR)/lal_sw_ext.o \
+	$(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
+	$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
+	$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
+	$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o
 CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
 	$(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
@@ -99,7 +104,12 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
 	$(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \
 	$(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \
 	$(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \
-	$(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h
+	$(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h \
+	$(OBJ_DIR)/sw.cubin $(OBJ_DIR)/sw_cubin.h \
+	$(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck_cubin.h \
+	$(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \
+	$(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \
+	$(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h
 all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
@@ -175,6 +185,9 @@ $(OBJ_DIR)/lal_base_ellipsoid.o: $(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoi
 $(OBJ_DIR)/lal_base_dipole.o: $(ALL_H) lal_base_dipole.h lal_base_dipole.cpp
 	$(CUDR) -o $@ -c lal_base_dipole.cpp
+$(OBJ_DIR)/lal_base_three.o: $(ALL_H) lal_base_three.h lal_base_three.cpp
+	$(CUDR) -o $@ -c lal_base_three.cpp
 $(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
 	$(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu
@@ -571,6 +584,66 @@ $(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/c
 $(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
 	$(CUDR) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/sw.cubin: lal_sw.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_sw.cu
+$(OBJ_DIR)/sw_cubin.h: $(OBJ_DIR)/sw.cubin $(OBJ_DIR)/sw.cubin
+	$(BIN2C) -c -n sw $(OBJ_DIR)/sw.cubin > $(OBJ_DIR)/sw_cubin.h
+$(OBJ_DIR)/lal_sw.o: $(ALL_H) lal_sw.h lal_sw.cpp $(OBJ_DIR)/sw_cubin.h $(OBJ_DIR)/lal_base_three.o
+	$(CUDR) -o $@ -c lal_sw.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_sw_ext.o: $(ALL_H) lal_sw.h lal_sw_ext.cpp lal_base_three.h
+	$(CUDR) -o $@ -c lal_sw_ext.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/beck.cubin: lal_beck.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_beck.cu
+$(OBJ_DIR)/beck_cubin.h: $(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck.cubin
+	$(BIN2C) -c -n beck $(OBJ_DIR)/beck.cubin > $(OBJ_DIR)/beck_cubin.h
+$(OBJ_DIR)/lal_beck.o: $(ALL_H) lal_beck.h lal_beck.cpp $(OBJ_DIR)/beck_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -o $@ -c lal_beck.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_beck_ext.o: $(ALL_H) lal_beck.h lal_beck_ext.cpp lal_base_atomic.h
+	$(CUDR) -o $@ -c lal_beck_ext.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/mie.cubin: lal_mie.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_mie.cu
+$(OBJ_DIR)/mie_cubin.h: $(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie.cubin
+	$(BIN2C) -c -n mie $(OBJ_DIR)/mie.cubin > $(OBJ_DIR)/mie_cubin.h
+$(OBJ_DIR)/lal_mie.o: $(ALL_H) lal_mie.h lal_mie.cpp $(OBJ_DIR)/mie_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -o $@ -c lal_mie.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_mie_ext.o: $(ALL_H) lal_mie.h lal_mie_ext.cpp lal_base_atomic.h
+	$(CUDR) -o $@ -c lal_mie_ext.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/soft.cubin: lal_soft.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_soft.cu
+$(OBJ_DIR)/soft_cubin.h: $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft.cubin
+	$(BIN2C) -c -n soft $(OBJ_DIR)/soft.cubin > $(OBJ_DIR)/soft_cubin.h
+$(OBJ_DIR)/lal_soft.o: $(ALL_H) lal_soft.h lal_soft.cpp $(OBJ_DIR)/soft_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -o $@ -c lal_soft.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_soft_ext.o: $(ALL_H) lal_soft.h lal_soft_ext.cpp lal_base_atomic.h
+	$(CUDR) -o $@ -c lal_soft_ext.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lj_coul_msm.cubin: lal_lj_coul_msm.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul_msm.cu
+$(OBJ_DIR)/lj_coul_msm_cubin.h: $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm.cubin
+	$(BIN2C) -c -n lj_coul_msm $(OBJ_DIR)/lj_coul_msm.cubin > $(OBJ_DIR)/lj_coul_msm_cubin.h
+$(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(OBJ_DIR)/lj_coul_msm_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_lj_coul_msm.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda

View File

@@ -17,7 +17,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
 	$(OBJ_DIR)/lal_neighbor_shared.o $(OBJ_DIR)/lal_neighbor.o \
 	$(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
 	$(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
-	$(OBJ_DIR)/lal_base_dipole.o \
+	$(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \
 	$(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
 	$(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
 	$(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@@ -48,7 +48,12 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
 	$(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
 	$(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
 	$(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
-	$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o
+	$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o \
+	$(OBJ_DIR)/lal_sw.o $(OBJ_DIR)/lal_sw_ext.o \
+	$(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
+	$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
+	$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
+	$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o
 KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
 	$(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
@@ -68,7 +73,9 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
 	$(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/dipole_lj_cl.h \
 	$(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/colloid_cl.h \
 	$(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \
-	$(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h
+	$(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \
+	$(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \
+	$(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@@ -117,6 +124,9 @@ $(OBJ_DIR)/lal_base_ellipsoid.o: $(OCL_H) lal_base_ellipsoid.h lal_base_ellipsoi
 $(OBJ_DIR)/lal_base_dipole.o: $(OCL_H) lal_base_dipole.h lal_base_dipole.cpp
 	$(OCL) -o $@ -c lal_base_dipole.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_base_three.o: $(OCL_H) lal_base_three.h lal_base_three.cpp
+	$(OCL) -o $@ -c lal_base_three.cpp
 $(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h;
@@ -405,6 +415,51 @@ $(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/
 $(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
 	$(OCL) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/sw_cl.h: lal_sw.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh sw $(PRE1_H) lal_sw.cu $(OBJ_DIR)/sw_cl.h;
+$(OBJ_DIR)/lal_sw.o: $(ALL_H) lal_sw.h lal_sw.cpp $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/lal_base_three.o
+	$(OCL) -o $@ -c lal_sw.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_sw_ext.o: $(ALL_H) lal_sw.h lal_sw_ext.cpp lal_base_three.h
+	$(OCL) -o $@ -c lal_sw_ext.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/beck_cl.h: lal_beck.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh beck $(PRE1_H) lal_beck.cu $(OBJ_DIR)/beck_cl.h;
+$(OBJ_DIR)/lal_beck.o: $(ALL_H) lal_beck.h lal_beck.cpp $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/lal_base_atomic.o
+	$(OCL) -o $@ -c lal_beck.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_beck_ext.o: $(ALL_H) lal_beck.h lal_beck_ext.cpp lal_base_atomic.h
+	$(OCL) -o $@ -c lal_beck_ext.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/mie_cl.h: lal_mie.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh mie $(PRE1_H) lal_mie.cu $(OBJ_DIR)/mie_cl.h;
+$(OBJ_DIR)/lal_mie.o: $(ALL_H) lal_mie.h lal_mie.cpp $(OBJ_DIR)/mie_cl.h $(OBJ_DIR)/mie_cl.h $(OBJ_DIR)/lal_base_atomic.o
+	$(OCL) -o $@ -c lal_mie.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_mie_ext.o: $(ALL_H) lal_mie.h lal_mie_ext.cpp lal_base_atomic.h
+	$(OCL) -o $@ -c lal_mie_ext.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/soft_cl.h: lal_soft.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh soft $(PRE1_H) lal_soft.cu $(OBJ_DIR)/soft_cl.h;
+$(OBJ_DIR)/lal_soft.o: $(ALL_H) lal_soft.h lal_soft.cpp $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lal_base_atomic.o
+	$(OCL) -o $@ -c lal_soft.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_soft_ext.o: $(ALL_H) lal_soft.h lal_soft_ext.cpp lal_base_atomic.h
+	$(OCL) -o $@ -c lal_soft_ext.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lj_coul_msm_cl.h: lal_lj_coul_msm.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh lj_coul_msm $(PRE1_H) lal_lj_coul_msm.cu $(OBJ_DIR)/lj_coul_msm_cl.h;
+$(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(OBJ_DIR)/lj_coul_msm_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h $(OBJ_DIR)/lal_base_charge.o
+	$(OCL) -o $@ -c lal_lj_coul_msm.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
+	$(OCL) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
 	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)

View File

@@ -3,6 +3,7 @@
 --------------------------------
 W. Michael Brown (ORNL)
+Trung Dac Nguyen (ORNL)
 Peng Wang (NVIDIA)
 Axel Kohlmeyer (Temple)
 Steve Plimpton (SNL)
@@ -60,6 +61,8 @@ devices on your system. A Makefile for OpenCL compilation is provided,
 but support for OpenCL use is not currently provided by the developers.
 Details of the implementation are provided in:
+
+----
 Brown, W.M., Wang, P. Plimpton, S.J., Tharrington, A.N. Implementing
 Molecular Dynamics on Hybrid High Performance Computers - Short Range
 Forces. Computer Physics Communications. 2011. 182: p. 898-911.
@@ -68,28 +71,64 @@ and
 Brown, W.M., Kohlmeyer, A. Plimpton, S.J., Tharrington, A.N. Implementing
 Molecular Dynamics on Hybrid High Performance Computers - Particle-Particle
-Particle-Mesh. Computer Physics Communications. 2011. In press.
+Particle-Mesh. Computer Physics Communications. 2012. 183: p. 449-459.
+
+and
+
+Brown, W.M., Masako, Y. Implementing Molecular Dynamics on Hybrid High
+Performance Computers - Three-Body Potentials. Computer Physics Communications.
+2013. In press.
+
+----
 
 NOTE: Installation of the CUDA SDK is not required.
 
 Current styles supporting GPU acceleration:
 
-1. lj/cut
-2. lj96/cut
-3. lj/expand
-4. lj/cut/coul/cut
-5. lj/cut/coul/long
-6. lj/charmm/coul/long
-7. lj/class2
-8. lj/class2/coul/long
-9. morse
-10. cg/cmm
-11. cg/cmm/coul/long
-12. coul/long
-13. gayberne
-14. resquared
-15. pppm
+1  beck
+2  born/coul/long
+3  born/coul/wolf
+4  born
+5  buck/coul/cut
+6  buck/coul/long
+7  buck
+8  colloid
+9  coul/dsf
+10 coul/long
+11 eam/alloy
+12 eam/fs
+13 eam
+14 eam/lj
+15 gauss
+16 gayberne
+17 lj96/cut
+18 lj/charmm/coul/long
+19 lj/class2/coul/long
+20 lj/class2
+21 lj/cut/coul/cut
+22 lj/cut/coul/debye
+23 lj/cut/coul/dsf
+24 lj/cut/coul/long
+25 lj/cut/coul/msm
+26 lj/cut/coul/wolf/fsw
+27 lj/cut/dipole/cut
+28 lj/cut
+29 lj/cut/tgpu
+30 lj/expand
+31 lj/sdk/coul/long
+32 cg/cmm/coul/long
+33 lj/sdk
+34 cg/cmm
+35 lj/sf/dipole/sf
+36 mie/cut
+37 morse
+38 resquared
+39 soft
+40 sw
+41 table
+42 yukawa/colloid
+43 yukawa
+44 pppm
 
 MULTIPLE LAMMPS PROCESSES
@@ -170,3 +209,4 @@ make yes-asphere
 make yes-kspace
 make yes-gpu
 make linux

View File

@@ -1 +1 @@
-Geryon Version 12.033
+Geryon Version 13.209

View File

@@ -65,15 +65,19 @@ class UCL_Device {
  public:
   /// Collect properties for every GPU on the node
   /** \note You must set the active GPU with set() before using the device **/
-  UCL_Device();
-  ~UCL_Device();
+  inline UCL_Device();
+  inline ~UCL_Device();
 
   /// Returns 1 (For compatibility with OpenCL)
   inline int num_platforms() { return 1; }
 
   /// Return a string with name and info of the current platform
-  std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA Driver"; }
+  inline std::string platform_name()
+    { return "NVIDIA Corporation NVIDIA CUDA Driver"; }
+
+  /// Delete any contexts/data and set the platform number to be used
+  inline int set_platform(const int pid);
 
   /// Return the number of devices that support CUDA
   inline int num_devices() { return _properties.size(); }
@@ -81,8 +85,12 @@ class UCL_Device {
   /// Set the CUDA device to the specified device number
   /** A context and default command queue will be created for the device
    * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
-   * be allocated for use **/
-  int set(int num);
+   * be allocated for use. clear() is called to delete any contexts and
+   * associated data from previous calls to set(). **/
+  inline int set(int num);
+
+  /// Delete any context and associated data stored from a call to set()
+  inline void clear();
 
   /// Get the current device number
   inline int device_num() { return _device; }
@@ -147,16 +155,17 @@ class UCL_Device {
   inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
 
   /// Returns true if double precision is support for the current device
-  bool double_precision() { return double_precision(_device); }
+  inline bool double_precision() { return double_precision(_device); }
   /// Returns true if double precision is support for the device
-  bool double_precision(const int i) {return arch(i)>=1.3;}
+  inline bool double_precision(const int i) {return arch(i)>=1.3;}
 
   /// Get the number of cores in the current device
   inline unsigned cores() { return cores(_device); }
   /// Get the number of cores
   inline unsigned cores(const int i)
     { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
-      else if (arch(i)<3.0) return _properties[i].multiProcessorCount*32;
+      else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32;
+      else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48;
       else return _properties[i].multiProcessorCount*192; }
 
   /// Get the gigabytes of global memory in the current device
@@ -216,8 +225,34 @@ class UCL_Device {
   inline bool sharing_supported(const int i)
     { return (_properties[i].computeMode == CU_COMPUTEMODE_DEFAULT); }
 
+  /// True if splitting device into equal subdevices supported
+  inline bool fission_equal()
+    { return fission_equal(_device); }
+  /// True if splitting device into equal subdevices supported
+  inline bool fission_equal(const int i)
+    { return false; }
+  /// True if splitting device into subdevices by specified counts supported
+  inline bool fission_by_counts()
+    { return fission_by_counts(_device); }
+  /// True if splitting device into subdevices by specified counts supported
+  inline bool fission_by_counts(const int i)
+    { return false; }
+  /// True if splitting device into subdevices by affinity domains supported
+  inline bool fission_by_affinity()
+    { return fission_by_affinity(_device); }
+  /// True if splitting device into subdevices by affinity domains supported
+  inline bool fission_by_affinity(const int i)
+    { return false; }
+  /// Maximum number of subdevices allowed from device fission
+  inline int max_sub_devices()
+    { return max_sub_devices(_device); }
+  /// Maximum number of subdevices allowed from device fission
+  inline int max_sub_devices(const int i)
+    { return 0; }
+
   /// List all devices along with all properties
-  void print_all(std::ostream &out);
+  inline void print_all(std::ostream &out);
 
  private:
   int _device, _num_devices;
@@ -228,7 +263,7 @@ class UCL_Device {
 };
 
 // Grabs the properties for all devices
-inline UCL_Device::UCL_Device() {
+UCL_Device::UCL_Device() {
   CU_SAFE_CALL_NS(cuInit(0));
   CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices));
   for (int dev=0; dev<_num_devices; ++dev) {
@@ -280,22 +315,21 @@ inline UCL_Device::UCL_Device() {
   _cq.back()=0;
 }
 
-inline UCL_Device::~UCL_Device() {
-  if (_device>-1) {
-    for (int i=1; i<num_queues(); i++) pop_command_queue();
-    cuCtxDestroy(_context);
-  }
+UCL_Device::~UCL_Device() {
+  clear();
+}
+
+int UCL_Device::set_platform(const int pid) {
+  clear();
+  #ifdef UCL_DEBUG
+  assert(pid<num_platforms());
+  #endif
+  return UCL_SUCCESS;
 }
 
 // Set the CUDA device to the specified device number
-inline int UCL_Device::set(int num) {
-  if (_device==num)
-    return UCL_SUCCESS;
-  if (_device>-1) {
-    CU_SAFE_CALL_NS(cuCtxDestroy(_context));
-    for (int i=1; i<num_queues(); i++) pop_command_queue();
-    _cq[0]=0;
-  }
+int UCL_Device::set(int num) {
+  clear();
   _device=_properties[num].device_id;
   CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
   CUresult err=cuCtxCreate(&_context,0,_cu_device);
@@ -310,8 +344,16 @@ inline int UCL_Device::set(int num) {
   return UCL_SUCCESS;
 }
 
+void UCL_Device::clear() {
+  if (_device>-1) {
+    for (int i=1; i<num_queues(); i++) pop_command_queue();
+    cuCtxDestroy(_context);
+  }
+  _device=-1;
+}
+
 // List all devices along with all properties
-inline void UCL_Device::print_all(std::ostream &out) {
+void UCL_Device::print_all(std::ostream &out) {
 #if CUDA_VERSION >= 2020
   int driver_version;
   cuDriverGetVersion(&driver_version);
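
Aside: the net effect of this rework is that a UCL_Device can be re-targeted at runtime instead of being bound to one context for its lifetime. A minimal usage sketch, not from this commit; it assumes the Geryon header path, the ucl_cudadr namespace name, and linking against the CUDA driver library:

  #include <iostream>
  #include "geryon/nvd_device.h"
  using namespace ucl_cudadr;       // assumed namespace for the CUDA-driver backend

  int main() {
    UCL_Device dev;                 // constructor only enumerates devices
    if (dev.num_devices()==0) return 1;
    dev.set_platform(0);            // clears prior state; CUDA has a single platform
    if (dev.set(0)!=UCL_SUCCESS)    // creates the context + default command queue
      return 1;
    dev.print_all(std::cout);       // listing now also reports fission capabilities
    dev.clear();                    // optional: the destructor also calls clear()
    return 0;
  }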

View File

@@ -377,6 +377,10 @@ class UCL_Kernel {
     #endif
   }
 
+  /// Return the default command queue/stream associated with this data
+  inline command_queue & cq() { return _cq; }
+  /// Change the default command queue associated with matrix
+  inline void cq(command_queue &cq_in) { _cq=cq_in; }
+
   #include "ucl_arg_kludge.h"
 
  private:

View File

@@ -47,14 +47,14 @@ typedef CUdeviceptr device_ptr;
 // --------------------------------------------------------------------------
 template <class mat_type, class copy_type>
 inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
-                       const enum UCL_MEMOPT kind) {
+                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
   CUresult err=CUDA_SUCCESS;
-  if (kind==UCL_RW_OPTIMIZED)
-    err=cuMemAllocHost((void **)mat.host_ptr(),n);
-  else if (kind==UCL_WRITE_OPTIMIZED)
+  if (kind==UCL_NOT_PINNED)
+    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
+  else if (kind==UCL_WRITE_ONLY)
     err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
   else
-    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
+    err=cuMemAllocHost((void **)mat.host_ptr(),n);
   if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
     return UCL_MEMORY_ERROR;
   mat.cq()=cm.cq();
@@ -63,14 +63,14 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
 template <class mat_type>
 inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
-                       const enum UCL_MEMOPT kind) {
+                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
   CUresult err=CUDA_SUCCESS;
-  if (kind==UCL_RW_OPTIMIZED)
-    err=cuMemAllocHost((void **)mat.host_ptr(),n);
-  else if (kind==UCL_WRITE_OPTIMIZED)
+  if (kind==UCL_NOT_PINNED)
+    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
+  else if (kind==UCL_WRITE_ONLY)
     err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
   else
-    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
+    err=cuMemAllocHost((void **)mat.host_ptr(),n);
   if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
     return UCL_MEMORY_ERROR;
   mat.cq()=dev.cq();
@@ -78,8 +78,10 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
 }
 
 template <class mat_type>
-inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
-  if (kind!=UCL_NOT_PINNED)
+inline void _host_free(mat_type &mat) {
+  if (mat.kind()==UCL_VIEW)
+    return;
+  else if (mat.kind()!=UCL_NOT_PINNED)
     CU_DESTRUCT_CALL(cuMemFreeHost(mat.begin()));
   else
     free(mat.begin());
@@ -87,14 +89,14 @@ inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
 template <class mat_type>
 inline int _host_resize(mat_type &mat, const size_t n) {
-  _host_free(mat,mat.kind());
+  _host_free(mat);
   CUresult err=CUDA_SUCCESS;
-  if (mat.kind()==UCL_RW_OPTIMIZED)
-    err=cuMemAllocHost((void **)mat.host_ptr(),n);
-  else if (mat.kind()==UCL_WRITE_OPTIMIZED)
+  if (mat.kind()==UCL_NOT_PINNED)
+    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
+  else if (mat.kind()==UCL_WRITE_ONLY)
     err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
   else
-    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
+    err=cuMemAllocHost((void **)mat.host_ptr(),n);
   if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
     return UCL_MEMORY_ERROR;
   return UCL_SUCCESS;
@@ -155,7 +157,8 @@ inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
 template <class mat_type>
 inline void _device_free(mat_type &mat) {
-  CU_DESTRUCT_CALL(cuMemFree(mat.cbegin()));
+  if (mat.kind()!=UCL_VIEW)
+    CU_DESTRUCT_CALL(cuMemFree(mat.cbegin()));
 }
 
 template <class mat_type>
@@ -229,13 +232,13 @@ inline void _host_zero(void *ptr, const size_t n) {
 }
 
 template <class mat_type>
-inline void _device_zero(mat_type &mat, const size_t n) {
+inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
   if (n%32==0)
-    CU_SAFE_CALL(cuMemsetD32(mat.cbegin(),0,n/4));
+    CU_SAFE_CALL(cuMemsetD32Async(mat.cbegin(),0,n/4,cq));
   else if (n%16==0)
-    CU_SAFE_CALL(cuMemsetD16(mat.cbegin(),0,n/2));
+    CU_SAFE_CALL(cuMemsetD16Async(mat.cbegin(),0,n/2,cq));
   else
-    CU_SAFE_CALL(cuMemsetD8(mat.cbegin(),0,n));
+    CU_SAFE_CALL(cuMemsetD8Async(mat.cbegin(),0,n,cq));
 }
 
 // --------------------------------------------------------------------------
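
Aside (not part of the diff): the renamed host-memory kinds map onto standard CUDA driver-API allocators. UCL_NOT_PINNED gives ordinary pageable memory, UCL_WRITE_ONLY gives write-combined pinned memory (fast host-to-device streaming, slow host reads), and anything else gives default page-locked memory that permits asynchronous copies; the switch from cuMemsetD*() to cuMemsetD*Async() likewise ties zeroing to a caller-supplied stream instead of blocking the device. A hypothetical stand-alone restatement of that mapping:

  #include <cstdlib>
  #include <cuda.h>

  // Illustration only; real code must check the CUresult and NULL returns.
  void *host_alloc_sketch(size_t n, int kind) {
    void *p = NULL;
    if (kind == 0)                                  // like UCL_NOT_PINNED
      p = std::malloc(n);
    else if (kind == 1)                             // like UCL_WRITE_ONLY
      cuMemHostAlloc(&p, n, CU_MEMHOSTALLOC_WRITECOMBINED);
    else                                            // default: page-locked (pinned)
      cuMemAllocHost(&p, n);
    return p;
  }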

View File

@@ -51,6 +51,10 @@ inline void ucl_sync(cl_command_queue &cq) {
   CL_SAFE_CALL(clFinish(cq));
 }
 
+inline bool _shared_mem_device(cl_device_type &device_type) {
+  return (device_type==CL_DEVICE_TYPE_CPU);
+}
+
 struct OCLProperties {
   std::string name;
   cl_device_type device_type;
@@ -64,6 +68,10 @@ struct OCLProperties {
   bool double_precision;
   int alignment;
   size_t timer_resolution;
+  bool ecc_support;
+  std::string c_version;
+  bool partition_equal, partition_counts, partition_affinity;
+  cl_uint max_sub_devices;
 };
 
 /// Class for looking at data parallel device properties
@@ -74,15 +82,18 @@ class UCL_Device {
  public:
   /// Collect properties for every device on the node
   /** \note You must set the active GPU with set() before using the device **/
-  UCL_Device();
-  ~UCL_Device();
+  inline UCL_Device();
+  inline ~UCL_Device();
 
   /// Return the number of platforms (0 if error or no platforms)
   inline int num_platforms() { return _num_platforms; }
 
   /// Return a string with name and info of the current platform
-  std::string platform_name();
+  inline std::string platform_name();
+
+  /// Delete any contexts/data and set the platform number to be used
+  inline int set_platform(const int pid);
 
   /// Return the number of devices that support OpenCL
   inline int num_devices() { return _num_devices; }
@@ -90,8 +101,12 @@ class UCL_Device {
   /// Set the OpenCL device to the specified device number
   /** A context and default command queue will be created for the device
    * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
-   * be allocated for use **/
-  int set(int num);
+   * be allocated for use. clear() is called to delete any contexts and
+   * associated data from previous calls to set(). **/
+  inline int set(int num);
+
+  /// Delete any context and associated data stored from a call to set()
+  inline void clear();
 
   /// Get the current device number
   inline int device_num() { return _device; }
@@ -161,12 +176,14 @@ class UCL_Device {
   /// Returns true if host memory is efficiently addressable from device
   inline bool shared_memory() { return shared_memory(_device); }
   /// Returns true if host memory is efficiently addressable from device
-  inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
+  inline bool shared_memory(const int i)
+    { return _shared_mem_device(_properties[i].device_type); }
 
   /// Returns true if double precision is support for the current device
-  bool double_precision() { return double_precision(_device); }
+  inline bool double_precision() { return double_precision(_device); }
   /// Returns true if double precision is support for the device
-  bool double_precision(const int i) {return _properties[i].double_precision;}
+  inline bool double_precision(const int i)
+    {return _properties[i].double_precision;}
 
   /// Get the number of cores in the current device
   inline unsigned cores() { return cores(_device); }
@@ -227,8 +244,34 @@ class UCL_Device {
   inline bool sharing_supported(const int i)
     { return true; }
 
+  /// True if splitting device into equal subdevices supported
+  inline bool fission_equal()
+    { return fission_equal(_device); }
+  /// True if splitting device into equal subdevices supported
+  inline bool fission_equal(const int i)
+    { return _properties[i].partition_equal; }
+  /// True if splitting device into subdevices by specified counts supported
+  inline bool fission_by_counts()
+    { return fission_by_counts(_device); }
+  /// True if splitting device into subdevices by specified counts supported
+  inline bool fission_by_counts(const int i)
+    { return _properties[i].partition_counts; }
+  /// True if splitting device into subdevices by affinity domains supported
+  inline bool fission_by_affinity()
+    { return fission_by_affinity(_device); }
+  /// True if splitting device into subdevices by affinity domains supported
+  inline bool fission_by_affinity(const int i)
+    { return _properties[i].partition_affinity; }
+  /// Maximum number of subdevices allowed from device fission
+  inline int max_sub_devices()
+    { return max_sub_devices(_device); }
+  /// Maximum number of subdevices allowed from device fission
+  inline int max_sub_devices(const int i)
+    { return _properties[i].max_sub_devices; }
+
   /// List all devices along with all properties
-  void print_all(std::ostream &out);
+  inline void print_all(std::ostream &out);
 
   /// Return the OpenCL type for the device
   inline cl_device_id & cl_device() { return _cl_device; }
@@ -237,7 +280,8 @@ class UCL_Device {
   int _num_platforms;          // Number of platforms
   int _platform;               // UCL_Device ID for current platform
   cl_platform_id _cl_platform; // OpenCL ID for current platform
-  cl_context _context;         // Context used for accessing the device
+  cl_platform_id _cl_platforms[20]; // OpenCL IDs for all platforms
+  cl_context _context;         // Context used for accessing the device
   std::vector<cl_command_queue> _cq;// The default command queue for this device
   int _device;                 // UCL_Device ID for current device
   cl_device_id _cl_device;     // OpenCL ID for current device
@@ -245,24 +289,18 @@ class UCL_Device {
   int _num_devices;            // Number of devices
   std::vector<OCLProperties> _properties; // Properties for each device
 
-  void add_properties(cl_device_id);
-  int create_context();
+  inline void add_properties(cl_device_id);
+  inline int create_context();
   int _default_cq;
 };
 
 // Grabs the properties for all devices
-inline UCL_Device::UCL_Device() {
-  cl_int errorv;
-  cl_uint nplatforms;
-  _cl_device=0;
+UCL_Device::UCL_Device() {
   _device=-1;
-  _num_devices=0;
-  _platform=0;
-  _default_cq=0;
 
   // --- Get Number of Platforms
-  errorv=clGetPlatformIDs(1,&_cl_platform,&nplatforms);
+  cl_uint nplatforms;
+  cl_int errorv=clGetPlatformIDs(20,_cl_platforms,&nplatforms);
   if (errorv!=CL_SUCCESS) {
     _num_platforms=0;
@@ -270,6 +308,38 @@ inline UCL_Device::UCL_Device() {
   } else
     _num_platforms=static_cast<int>(nplatforms);
 
+  set_platform(0);
+}
+
+UCL_Device::~UCL_Device() {
+  clear();
+}
+
+void UCL_Device::clear() {
+  if (_device>-1) {
+    for (size_t i=0; i<_cq.size(); i++) {
+      CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
+      _cq.pop_back();
+    }
+    CL_DESTRUCT_CALL(clReleaseContext(_context));
+  }
+  _device=-1;
+}
+
+int UCL_Device::set_platform(int pid) {
+  clear();
+
+  cl_int errorv;
+  _cl_device=0;
+  _device=-1;
+  _num_devices=0;
+  _default_cq=0;
+  #ifdef UCL_DEBUG
+  assert(pid<num_platforms());
+  #endif
+  _platform=pid;
+  _cl_platform=_cl_platforms[_platform];
+
   // --- Get Number of Devices
   cl_uint n;
@@ -277,7 +347,7 @@ inline UCL_Device::UCL_Device() {
   _num_devices=n;
   if (errorv!=CL_SUCCESS || _num_devices==0) {
     _num_devices=0;
-    return;
+    return UCL_ERROR;
   }
   cl_device_id device_list[_num_devices];
   CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list,
@@ -288,19 +358,11 @@ inline UCL_Device::UCL_Device() {
     _cl_devices.push_back(device_list[i]);
     add_properties(device_list[i]);
   }
+  return UCL_SUCCESS;
 }
 
-inline UCL_Device::~UCL_Device() {
-  if (_device>-1) {
-    for (size_t i=0; i<_cq.size(); i++) {
-      CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
-      _cq.pop_back();
-    }
-    CL_DESTRUCT_CALL(clReleaseContext(_context));
-  }
-}
-
-inline int UCL_Device::create_context() {
+int UCL_Device::create_context() {
   cl_int errorv;
   cl_context_properties props[3];
   props[0]=CL_CONTEXT_PLATFORM;
@@ -320,9 +382,10 @@ inline int UCL_Device::create_context() {
   return UCL_SUCCESS;
 }
 
-inline void UCL_Device::add_properties(cl_device_id device_list) {
+void UCL_Device::add_properties(cl_device_id device_list) {
   OCLProperties op;
   char buffer[1024];
+  cl_bool ans_bool;
 
   CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,NULL));
   op.name=buffer;
@@ -363,10 +426,49 @@ inline void UCL_Device::add_properties(cl_device_id device_list) {
                                CL_DEVICE_PROFILING_TIMER_RESOLUTION,
                                sizeof(size_t),&op.timer_resolution,NULL));
 
+  op.ecc_support=false;
+  CL_SAFE_CALL(clGetDeviceInfo(device_list,
+                               CL_DEVICE_ERROR_CORRECTION_SUPPORT,
+                               sizeof(ans_bool),&ans_bool,NULL));
+  if (ans_bool==CL_TRUE)
+    op.ecc_support=true;
+
+  op.c_version="";
+  op.partition_equal=false;
+  op.partition_counts=false;
+  op.partition_affinity=false;
+
+  #ifdef CL_VERSION_1_2
+  size_t return_bytes;
+  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_OPENCL_C_VERSION,1024,
+                               buffer,NULL));
+  op.c_version=buffer;
+
+  cl_device_partition_property pinfo[4];
+  CL_SAFE_CALL(clGetDeviceInfo(device_list,
+                               CL_DEVICE_PARTITION_PROPERTIES,
+                               4*sizeof(cl_device_partition_property),
+                               pinfo,&return_bytes));
+  int nprops=return_bytes/sizeof(cl_device_partition_property);
+  for (int i=0; i<nprops; i++) {
+    if (pinfo[i]==CL_DEVICE_PARTITION_EQUALLY)
+      op.partition_equal=true;
+    else if (pinfo[i]==CL_DEVICE_PARTITION_BY_COUNTS)
+      op.partition_counts=true;
+    else if (pinfo[i]==CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN)
+      op.partition_affinity=true;
+  }
+
+  CL_SAFE_CALL(clGetDeviceInfo(device_list,
+                               CL_DEVICE_PARTITION_MAX_SUB_DEVICES,
+                               sizeof(cl_uint),&op.max_sub_devices,NULL));
+  #endif
+
   _properties.push_back(op);
 }
 
-inline std::string UCL_Device::platform_name() {
+std::string UCL_Device::platform_name() {
   char info[1024];
 
   CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VENDOR,1024,info,
@@ -385,7 +487,7 @@ inline std::string UCL_Device::platform_name() {
 }
 
 // Get a string telling the type of the device
-inline std::string UCL_Device::device_type_name(const int i) {
+std::string UCL_Device::device_type_name(const int i) {
   if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
     return "CPU";
   else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
@@ -397,7 +499,7 @@ inline std::string UCL_Device::device_type_name(const int i) {
 }
 
 // Get a string telling the type of the device
-inline int UCL_Device::device_type(const int i) {
+int UCL_Device::device_type(const int i) {
   if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
     return UCL_CPU;
   else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
@@ -409,17 +511,8 @@ inline int UCL_Device::device_type(const int i) {
 }
 
 // Set the CUDA device to the specified device number
-inline int UCL_Device::set(int num) {
-  if (_device==num)
-    return UCL_SUCCESS;
-  if (_device>-1) {
-    for (size_t i=0; i<_cq.size(); i++) {
-      CL_SAFE_CALL(clReleaseCommandQueue(_cq.back()));
-      _cq.pop_back();
-    }
-    CL_SAFE_CALL(clReleaseContext(_context));
-  }
+int UCL_Device::set(int num) {
+  clear();
 
   cl_device_id device_list[_num_devices];
   cl_uint n;
@@ -432,7 +525,7 @@ inline int UCL_Device::set(int num) {
 }
 
 // List all devices along with all properties
-inline void UCL_Device::print_all(std::ostream &out) {
+void UCL_Device::print_all(std::ostream &out) {
   if (num_devices() == 0)
     out << "There is no device supporting OpenCL\n";
   for (int i=0; i<num_devices(); ++i) {
@@ -475,6 +568,28 @@ inline void UCL_Device::print_all(std::ostream &out) {
     out << "  Clock rate: "
         << clock_rate(i) << " GHz\n";
     //out << "  Concurrent copy and execution: ";
+    out << "  ECC support: ";
+    if (_properties[i].ecc_support)
+      out << "Yes\n";
+    else
+      out << "No\n";
+    out << "  Device fission into equal partitions: ";
+    if (fission_equal(i))
+      out << "Yes\n";
+    else
+      out << "No\n";
+    out << "  Device fission by counts: ";
+    if (fission_by_counts(i))
+      out << "Yes\n";
+    else
+      out << "No\n";
+    out << "  Device fission by affinity: ";
+    if (fission_by_affinity(i))
+      out << "Yes\n";
+    else
+      out << "No\n";
+    out << "  Maximum subdevices from fission: "
+        << max_sub_devices(i) << std::endl;
   }
 }
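
Aside (not in the commit): the partition_equal/partition_counts/partition_affinity flags cached above correspond to the OpenCL 1.2 device-fission API. A sketch of acting on them with the standard clCreateSubDevices call; the helper name is hypothetical and error handling is trimmed:

  #include <vector>
  #include <CL/cl.h>

  // Split a device that advertises CL_DEVICE_PARTITION_EQUALLY into
  // subdevices of two compute units each (OpenCL 1.2 only).
  std::vector<cl_device_id> split_equally(cl_device_id parent) {
    std::vector<cl_device_id> subs;
  #ifdef CL_VERSION_1_2
    const cl_device_partition_property props[] =
      {CL_DEVICE_PARTITION_EQUALLY, 2, 0};     // zero-terminated property list
    cl_uint nsub = 0;
    if (clCreateSubDevices(parent, props, 0, NULL, &nsub) == CL_SUCCESS) {
      subs.resize(nsub);
      clCreateSubDevices(parent, props, nsub, subs.data(), &nsub);
    }
  #endif
    return subs;
  }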

View File

@@ -134,6 +134,11 @@ class UCL_Program {
     return UCL_SUCCESS;
   }
 
+  /// Return the default command queue/stream associated with this data
+  inline command_queue & cq() { return _cq; }
+  /// Change the default command queue associated with matrix
+  inline void cq(command_queue &cq_in) { _cq=cq_in; }
+
   friend class UCL_Kernel;
 
  private:
   bool _init_done;
@@ -175,7 +180,16 @@ class UCL_Kernel {
   template <class dtype>
   inline void set_arg(const cl_uint index, const dtype * const arg) {
     CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
-    if (index>_num_args) _num_args=index;
+    if (index>_num_args) {
+      _num_args=index;
+      #ifdef UCL_DEBUG
+      if (_num_args>_kernel_info_nargs) {
+        std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
+                  << _kernel_info_name << std::endl;
+        assert(0==1);
+      }
+      #endif
+    }
   }
 
   /// Set a geryon container as a kernel argument.
@@ -203,6 +217,13 @@ class UCL_Kernel {
   inline void add_arg(const dtype * const arg) {
     CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
     _num_args++;
+    #ifdef UCL_DEBUG
+    if (_num_args>_kernel_info_nargs) {
+      std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
+                << _kernel_info_name << std::endl;
+      assert(0==1);
+    }
+    #endif
   }
 
   /// Add a geryon container as a kernel argument.
@@ -289,10 +310,7 @@ class UCL_Kernel {
   }
 
   /// Run the kernel in the default command queue
-  inline void run() {
-    CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL,
-                                        _num_blocks,_block_size,0,NULL,NULL));
-  }
+  inline void run();
 
   /// Clear any arguments associated with the kernel
   inline void clear_args() { _num_args=0; }
@@ -309,6 +327,12 @@ class UCL_Kernel {
   cl_command_queue _cq;  // The default command queue for this kernel
   unsigned _num_args;
+
+  #ifdef UCL_DEBUG
+  std::string _kernel_info_name;
+  unsigned _kernel_info_nargs;
+  //std::string _kernel_info_args[256];
+  #endif
 };
 
 inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) {
@@ -329,9 +353,32 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
     #endif
     return UCL_FUNCTION_NOT_FOUND;
   }
+
+  #ifdef UCL_DEBUG
+  _kernel_info_name=function;
+  cl_uint nargs;
+  CL_SAFE_CALL(clGetKernelInfo(_kernel,CL_KERNEL_NUM_ARGS,sizeof(cl_uint),
+                               &nargs,NULL));
+  _kernel_info_nargs=nargs;
+
+  #ifdef NOT_TEST_CL_VERSION_1_2
+  char tname[256];
+  size_t ret;
+  for (cl_uint i=0; i<nargs; i++) {
+    CL_SAFE_CALL(clGetKernelArgInfo(_kernel,i,CL_KERNEL_ARG_TYPE_NAME,256,
+                                    tname,&ret));
+    _kernel_info_args[i]=tname;
+  }
+  #endif
+  #endif
+
   return UCL_SUCCESS;
 }
 
+void UCL_Kernel::run() {
+  CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL,
+                                      _num_blocks,_block_size,0,NULL,NULL));
+}
+
 } // namespace
 
 #endif
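
Aside (not in the commit): the UCL_DEBUG guard relies on the standard CL_KERNEL_NUM_ARGS query. A stand-alone sketch of the same check, with a hypothetical helper name:

  #include <cassert>
  #include <CL/cl.h>

  // Fail fast if more arguments were set than the compiled kernel declares.
  void check_arg_count(cl_kernel k, unsigned num_set) {
    cl_uint declared = 0;
    clGetKernelInfo(k, CL_KERNEL_NUM_ARGS, sizeof(declared), &declared, NULL);
    assert(num_set <= declared && "too many arguments to OpenCL kernel");
  }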

View File

@@ -54,82 +54,138 @@ typedef cl_mem device_ptr;
 template <class mat_type, class copy_type>
 inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
-                       const enum UCL_MEMOPT kind) {
+                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
   cl_int error_flag;
   cl_context context;
   CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                   &context,NULL));
-  if (kind==UCL_VIEW) {
-    mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR,n,mat.host_ptr(),
-                                &error_flag);
-    CL_CHECK_ERR(error_flag);
-    return UCL_SUCCESS;
-  }
-  if (kind==UCL_WRITE_OPTIMIZED) {
-    mat.cbegin()=clCreateBuffer(context,
-                                CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
-                                n,NULL,&error_flag);
-    if (error_flag != CL_SUCCESS)
-      return UCL_MEMORY_ERROR;
-    *mat.host_ptr() = (typename mat_type::data_type*)
-                      clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
-                                         CL_MAP_WRITE,0,n,0,NULL,NULL,NULL);
-  } else {
-    mat.cbegin()=clCreateBuffer(context,
-                                CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
-                                n,NULL,&error_flag);
-    if (error_flag != CL_SUCCESS)
-      return UCL_MEMORY_ERROR;
-    *mat.host_ptr() = (typename mat_type::data_type*)
-                      clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
-                                         CL_MAP_READ | CL_MAP_WRITE,
-                                         0,n,0,NULL,NULL,NULL);
-  }
+  cl_mem_flags buffer_perm;
+  cl_map_flags map_perm;
+  if (kind2==UCL_NOT_SPECIFIED) {
+    if (kind==UCL_READ_ONLY) {
+      #ifdef CL_VERSION_1_2
+      buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR;
+      #else
+      buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
+      #endif
+      map_perm=CL_MAP_READ;
+    } else if (kind==UCL_WRITE_ONLY) {
+      #ifdef CL_VERSION_1_2
+      buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR;
+      #else
+      buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
+      #endif
+      map_perm=CL_MAP_WRITE;
+    } else {
+      buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
+      map_perm=CL_MAP_READ | CL_MAP_WRITE;
+    }
+  } else {
+    if (kind2==UCL_READ_ONLY)
+      buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
+    else if (kind2==UCL_WRITE_ONLY)
+      buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
+    else
+      buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
+
+    if (kind==UCL_READ_ONLY) {
+      #ifdef CL_VERSION_1_2
+      buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY;
+      #endif
+      map_perm=CL_MAP_READ;
+    } else if (kind==UCL_WRITE_ONLY) {
+      #ifdef CL_VERSION_1_2
+      buffer_perm=buffer_perm | CL_MEM_HOST_WRITE_ONLY;
+      #endif
+      map_perm=CL_MAP_WRITE;
+    } else
+      map_perm=CL_MAP_READ | CL_MAP_WRITE;
+  }
+
+  mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
+  if (error_flag != CL_SUCCESS)
+    return UCL_MEMORY_ERROR;
+  *mat.host_ptr() = (typename mat_type::data_type*)
+                    clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
+                                       map_perm,0,n,0,NULL,NULL,NULL);
   mat.cq()=cm.cq();
   CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
   return UCL_SUCCESS;
 }
 
+template <class mat_type, class copy_type>
+inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
+  cl_int error_flag;
+  cl_context context;
+  CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
+                                  &context,NULL));
+  cl_mem_flags orig_flags;
+  CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags),
+                                  &orig_flags,NULL));
+  orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;
+  mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
+                              mat.host_ptr(), &error_flag);
+  CL_CHECK_ERR(error_flag);
+  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
+  return UCL_SUCCESS;
+}
+
 template <class mat_type>
 inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
-                       const enum UCL_MEMOPT kind) {
-  cl_int error_flag;
-  if (kind==UCL_VIEW) {
-    mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
-                                n,mat.host_ptr(),&error_flag);
-    CL_CHECK_ERR(error_flag);
-    return UCL_SUCCESS;
-  }
-  if (kind==UCL_WRITE_OPTIMIZED) {
-    mat.cbegin()=clCreateBuffer(dev.context(),
-                                CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
-                                n,NULL,&error_flag);
-    if (error_flag != CL_SUCCESS)
-      return UCL_MEMORY_ERROR;
-    *mat.host_ptr() = (typename mat_type::data_type*)
-                      clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE,
-                                         CL_MAP_WRITE,0,n,0,NULL,NULL,NULL);
-  } else {
-    mat.cbegin()=clCreateBuffer(dev.context(),
-                                CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
-                                n,NULL,&error_flag);
-    if (error_flag != CL_SUCCESS)
-      return UCL_MEMORY_ERROR;
-    *mat.host_ptr() = (typename mat_type::data_type*)
-                      clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE,
-                                         CL_MAP_READ & CL_MAP_WRITE,
-                                         0,n,0,NULL,NULL,NULL);
-  }
+                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
+  cl_mem_flags buffer_perm;
+  cl_map_flags map_perm;
+  if (kind==UCL_READ_ONLY) {
+    #ifdef CL_VERSION_1_2
+    buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR;
+    #else
+    buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
+    #endif
+    map_perm=CL_MAP_READ;
+  } else if (kind==UCL_WRITE_ONLY) {
+    #ifdef CL_VERSION_1_2
+    buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR;
+    #else
+    buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
+    #endif
+    map_perm=CL_MAP_WRITE;
+  } else {
+    buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
+    map_perm=CL_MAP_READ | CL_MAP_WRITE;
+  }
+
+  cl_int error_flag;
+  mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,NULL,&error_flag);
+  if (error_flag != CL_SUCCESS)
+    return UCL_MEMORY_ERROR;
+  *mat.host_ptr() = (typename mat_type::data_type*)
+                    clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE,
+                                       map_perm,0,n,0,NULL,NULL,NULL);
   mat.cq()=dev.cq();
   CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
   return UCL_SUCCESS;
 }
 
 template <class mat_type>
-inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
-  CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
-  CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
+inline int _host_view(mat_type &mat, UCL_Device &dev, const size_t n) {
+  cl_int error_flag;
+  mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
+                              n,mat.host_ptr(),&error_flag);
+  CL_CHECK_ERR(error_flag);
+  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
+  return UCL_SUCCESS;
+}
+
+template <class mat_type>
+inline void _host_free(mat_type &mat) {
+  if (mat.cols()>0) {
+    CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
+    CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
+  }
 }
 
 template <class mat_type>
@@ -138,28 +194,26 @@ inline int _host_resize(mat_type &mat, const size_t n) {
   cl_context context;
   CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                   &context,NULL));
+  cl_mem_flags buffer_perm;
+  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_FLAGS,sizeof(buffer_perm),
+                                  &buffer_perm,NULL));
   CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
-  if (mat.kind()==UCL_WRITE_OPTIMIZED) {
-    mat.cbegin()=clCreateBuffer(context,
-                                CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
-                                n,NULL,&error_flag);
-    if (error_flag != CL_SUCCESS)
-      return UCL_MEMORY_ERROR;
-    *mat.host_ptr() = (typename mat_type::data_type*)
-                      clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
-                                         CL_MAP_WRITE,0,n,0,NULL,NULL,NULL);
-  } else {
-    mat.cbegin()=clCreateBuffer(context,
-                                CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
-                                n,NULL,&error_flag);
-    if (error_flag != CL_SUCCESS)
-      return UCL_MEMORY_ERROR;
-    *mat.host_ptr() = (typename mat_type::data_type*)
-                      clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
-                                         CL_MAP_READ | CL_MAP_WRITE,
-                                         0,n,0,NULL,NULL,NULL);
-  }
+
+  cl_map_flags map_perm;
+  if (mat.kind()==UCL_READ_ONLY)
+    map_perm=CL_MAP_READ;
+  else if (mat.kind()==UCL_WRITE_ONLY)
+    map_perm=CL_MAP_WRITE;
+  else
+    map_perm=CL_MAP_READ | CL_MAP_WRITE;
+
+  mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
+  if (error_flag != CL_SUCCESS)
+    return UCL_MEMORY_ERROR;
+  *mat.host_ptr() = (typename mat_type::data_type*)
+                    clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
+                                       map_perm,0,n,0,NULL,NULL,NULL);
   return UCL_SUCCESS;
 }
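
Aside (not in the commit): all of these host allocators follow the same OpenCL zero-copy idiom: create a buffer with CL_MEM_ALLOC_HOST_PTR, then map it so the host sees an ordinary pointer. A minimal stand-alone sketch with a hypothetical helper name, error handling trimmed:

  #include <CL/cl.h>

  // Allocate a host-visible buffer and blocking-map it for read/write.
  // The pointer stays valid until clEnqueueUnmapMemObject() is called.
  void *alloc_mapped(cl_context ctx, cl_command_queue cq, size_t n, cl_mem &buf) {
    cl_int err;
    buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                         n, NULL, &err);
    if (err != CL_SUCCESS) return NULL;
    return clEnqueueMapBuffer(cq, buf, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
                              0, n, 0, NULL, NULL, &err);
  }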
@ -179,9 +233,17 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
  if (kind==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (kind==UCL_READ_ONLY)
#ifdef CL_VERSION_1_2
    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
#else
    flag=CL_MEM_READ_ONLY;
#endif
  else if (kind==UCL_WRITE_ONLY)
#ifdef CL_VERSION_1_2
    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
#else
    flag=CL_MEM_WRITE_ONLY;
#endif
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
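A hedged sketch of what the new CL_VERSION_1_2 branch buys: the host-access hints tell the driver a buffer is upload-only, so it can live in memory the host never has to read back. upload_only_buffer is an illustrative name, not a Geryon API.

#include <CL/cl.h>

cl_mem upload_only_buffer(cl_context ctx, size_t bytes, cl_int *err) {
#ifdef CL_VERSION_1_2
  // Kernels only read; the host only writes (uploads), never reads.
  const cl_mem_flags flags = CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
#else
  const cl_mem_flags flags = CL_MEM_READ_ONLY;
#endif
  return clCreateBuffer(ctx, flags, bytes, NULL, err);
}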
@@ -200,9 +262,17 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,

  if (kind==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (kind==UCL_READ_ONLY)
#ifdef CL_VERSION_1_2
    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
#else
    flag=CL_MEM_READ_ONLY;
#endif
  else if (kind==UCL_WRITE_ONLY)
#ifdef CL_VERSION_1_2
    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
#else
    flag=CL_MEM_WRITE_ONLY;
#endif
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(dev.context(),flag,n,NULL,
@@ -238,8 +308,10 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t rows,

template <class mat_type>
inline void _device_free(mat_type &mat) {
  if (mat.cols()>0) {
    CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
    CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
  }
}

template <class mat_type>
@@ -255,9 +327,17 @@ inline int _device_resize(mat_type &mat, const size_t n) {

  if (mat.kind()==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (mat.kind()==UCL_READ_ONLY)
#ifdef CL_VERSION_1_2
    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
#else
    flag=CL_MEM_READ_ONLY;
#endif
  else if (mat.kind()==UCL_WRITE_ONLY)
#ifdef CL_VERSION_1_2
    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
#else
    flag=CL_MEM_WRITE_ONLY;
#endif
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
@@ -285,9 +365,17 @@ inline int _device_resize(mat_type &mat, const size_t rows,

  if (mat.kind()==UCL_READ_WRITE)
    flag=CL_MEM_READ_WRITE;
  else if (mat.kind()==UCL_READ_ONLY)
#ifdef CL_VERSION_1_2
    flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
#else
    flag=CL_MEM_READ_ONLY;
#endif
  else if (mat.kind()==UCL_WRITE_ONLY)
#ifdef CL_VERSION_1_2
    flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
#else
    flag=CL_MEM_WRITE_ONLY;
#endif
  else
    assert(0==1);
  mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag);
@@ -344,7 +432,19 @@ inline void _ocl_kernel_from_source(cl_context &context, cl_device_id &device,

}

template <class mat_type>
inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
#ifdef CL_VERSION_1_2
#ifndef __APPLE__
#define UCL_CL_ZERO
#endif
#endif
#ifdef UCL_CL_ZERO
  cl_int zeroint=0;
  CL_SAFE_CALL(clEnqueueFillBuffer(cq,mat.begin(),&zeroint,sizeof(cl_int),
                                   mat.byteoff(),n,0,NULL,NULL));
#else
  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,NULL));

@@ -354,17 +454,20 @@ inline void _device_zero(mat_type &mat, const size_t n) {

  const char * szero[3]={
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
    "__kernel void _device_zero(__global NUMTYP *a, const int offset)",
    "  { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }"
  };

  cl_kernel kzero;
  _ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero",
                 _UCL_DATA_ID<typename mat_type::data_type>::numtyp_flag());

  cl_int offset=mat.offset();
  CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin()));
  CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset));
  size_t kn=n/sizeof(typename mat_type::data_type);
  CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,0,&kn,0,0,0,0));
#endif
}
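A short sketch of the fast path taken above on OpenCL 1.2 (zero_buffer is our name): clEnqueueFillBuffer replicates a small pattern, here a single cl_int of 0, across the byte range; note the spec requires offset and size to be multiples of the pattern size.

#include <CL/cl.h>

cl_int zero_buffer(cl_command_queue q, cl_mem buf, size_t offset,
                   size_t bytes) {
  cl_int zero = 0;  // 4-byte pattern replicated over the range
  return clEnqueueFillBuffer(q, buf, &zero, sizeof(cl_int),
                             offset, bytes, 0, NULL, NULL);
}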
// --------------------------------------------------------------------------

@@ -470,9 +573,15 @@ template <> struct _ucl_memcpy<1,0> {

                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    if (src.cbegin()==dst.cbegin()) {
#ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 1S\n";
#endif
      if (block) ucl_sync(cq);
      return;
    }
#ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 1NS\n";
#endif
    CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n,
                                     dst.begin(),0,NULL,NULL));
  }

@@ -484,8 +593,14 @@ template <> struct _ucl_memcpy<1,0> {

                        size_t dst_offset, size_t src_offset) {
    if (src.cbegin()==dst.cbegin()) {
      if (block) ucl_sync(cq);
#ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 2S\n";
#endif
      return;
    }
#ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 2NS\n";
#endif
    if (spitch==dpitch && dst.cols()==src.cols() &&
        src.cols()==cols/src.element_size())
      CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,

@@ -511,8 +626,14 @@ template <> struct _ucl_memcpy<0,1> {

                        const size_t dst_offset, const size_t src_offset) {
    if (src.cbegin()==dst.cbegin()) {
      if (block) ucl_sync(cq);
#ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 3S\n";
#endif
      return;
    }
#ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 3NS\n";
#endif
    CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n,
                                      src.begin(),0,NULL,NULL));
  }

@@ -524,8 +645,14 @@ template <> struct _ucl_memcpy<0,1> {

                        size_t dst_offset, size_t src_offset) {
    if (src.cbegin()==dst.cbegin()) {
      if (block) ucl_sync(cq);
#ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 4S\n";
#endif
      return;
    }
#ifdef UCL_DBG_MEM_TRACE
    std::cerr << "UCL_COPY 4NS\n";
#endif
    if (spitch==dpitch && dst.cols()==src.cols() &&
        src.cols()==cols/src.element_size())
      CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,

@@ -549,9 +676,17 @@ template <int mem1, int mem2> struct _ucl_memcpy {

  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        cl_command_queue &cq, const cl_bool block,
                        const size_t dst_offset, const size_t src_offset) {
    if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
      CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
                                       dst_offset,n,0,NULL,NULL));
#ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 6NS\n";
#endif
    }
#ifdef UCL_DBG_MEM_TRACE
    else std::cerr << "UCL_COPY 6S\n";
#endif
    if (block==CL_TRUE) ucl_sync(cq);
  }

  template <class p1, class p2>
@@ -561,6 +696,9 @@ template <int mem1, int mem2> struct _ucl_memcpy {

                        const cl_bool block,
                        size_t dst_offset, size_t src_offset) {
    if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
#ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 7NS\n";
#endif
      if (spitch==dpitch && dst.cols()==src.cols() &&
          src.cols()==cols/src.element_size())
        CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,

@@ -575,6 +713,10 @@ template <int mem1, int mem2> struct _ucl_memcpy {

        dst_offset+=dpitch;
      }
    }
#ifdef UCL_DBG_MEM_TRACE
    else std::cerr << "UCL_COPY 7S\n";
#endif
    if (block==CL_TRUE) ucl_sync(cq);
  }
};
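The S/NS suffixes in the new trace messages separate self-copies that are skipped (S) from transfers actually enqueued (NS). A hypothetical build line enabling the tracing, for illustration only:

// mpic++ -DUCL_DBG_MEM_TRACE -D_OCL_MAT ... -lOpenCL
// Every ucl_copy then reports which path it took on stderr, e.g.
// "UCL_COPY 6NS" just before the corresponding clEnqueueCopyBuffer call.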

View File

@@ -27,6 +27,12 @@

#include "ocl_macros.h"
#include "ocl_device.h"
#ifdef CL_VERSION_1_2
#define UCL_OCL_MARKER(cq,event) clEnqueueMarkerWithWaitList(cq,0,NULL,event)
#else
#define UCL_OCL_MARKER clEnqueueMarker
#endif
namespace ucl_opencl {

/// Class for timing OpenCL events
@@ -63,10 +69,10 @@ class UCL_Timer {

  }

  /// Start timing on default command queue
  inline void start() { UCL_OCL_MARKER(_cq,&start_event); }

  /// Stop timing on default command queue
  inline void stop() { UCL_OCL_MARKER(_cq,&stop_event); }

  /// Block until the start event has been reached on device
  inline void sync_start()

@@ -78,7 +84,7 @@ class UCL_Timer {

  /// Set the time elapsed to zero (not the total_time)
  inline void zero()
    { UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); }

  /// Set the total time to zero
  inline void zero_total() { _total_time=0.0; }
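A usage sketch for the marker-based timing, assuming the queue was created with CL_QUEUE_PROFILING_ENABLE (elapsed_ms is our name). On OpenCL 1.2 the macro expands to clEnqueueMarkerWithWaitList, otherwise to the deprecated clEnqueueMarker:

#include <CL/cl.h>

double elapsed_ms(cl_command_queue cq) {
  cl_event start_event, stop_event;
  UCL_OCL_MARKER(cq, &start_event);
  // ... enqueue the kernels and copies being timed ...
  UCL_OCL_MARKER(cq, &stop_event);
  clWaitForEvents(1, &stop_event);
  cl_ulong t0, t1;  // profiling counters are in nanoseconds
  clGetEventProfilingInfo(start_event, CL_PROFILING_COMMAND_END,
                          sizeof(cl_ulong), &t0, NULL);
  clGetEventProfilingInfo(stop_event, CL_PROFILING_COMMAND_END,
                          sizeof(cl_ulong), &t1, NULL);
  return (t1 - t0) * 1e-6;
}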

View File

@@ -58,19 +58,36 @@

 * calls for reserving and copying memory **/
class UCL_BaseMat {
 public:
  UCL_BaseMat() : _cq(0), _kind(UCL_VIEW) { }
  virtual ~UCL_BaseMat() { }
  /// Return the default command queue/stream associated with this data
  inline command_queue & cq() { return _cq; }
  /// Change the default command queue associated with matrix
  inline void cq(command_queue &cq_in) { _cq=cq_in; }
  /// Block until command_queue associated with matrix is complete
  inline void sync() { ucl_sync(_cq); }
  /// Return the type/permissions of memory allocation
  /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED
    * or UCL_VIEW **/
  inline enum UCL_MEMOPT kind() const { return _kind; }

  inline bool shared_mem_device() {
#ifdef _OCL_MAT
    cl_device_id device;
    CL_SAFE_CALL(clGetCommandQueueInfo(_cq,CL_QUEUE_DEVICE,
                                       sizeof(cl_device_id),&device,NULL));
    cl_device_type device_type;
    CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE,
                                 sizeof(device_type),&device_type,NULL));
    return _shared_mem_device(device_type);
#else
    return false;
#endif
  }

 protected:
  command_queue _cq;
  enum UCL_MEMOPT _kind;
};

#endif
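shared_mem_device() gives callers a portable way to decide when staging copies can be skipped because the device addresses host memory directly. A hedged sketch of the underlying query; treating only CL_DEVICE_TYPE_CPU as shared is one plausible policy, not necessarily what _shared_mem_device() (defined elsewhere in Geryon) implements:

#include <CL/cl.h>

bool device_shares_host_mem(cl_device_id device) {
  cl_device_type dtype;
  clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(dtype), &dtype, NULL);
  return dtype == CL_DEVICE_TYPE_CPU;  // illustrative policy only
}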

View File

@@ -102,6 +102,30 @@

// Only allow this file to be included by nvc_memory.h and ocl_memory.h
#ifdef UCL_COPY_ALLOW
// --------------------------------------------------------------------------
// - CHECK PERMISSIONS FOR SOURCE AND DESTINATION IN COPY
// --------------------------------------------------------------------------
template <class mat1, class mat2>
inline void _check_ucl_copy_perm(mat1 &dst, mat2 &src) {
if ((int)mat1::MEM_TYPE==(int)mat2::MEM_TYPE) {
if (dst.kind()==UCL_READ_ONLY) {
std::cerr << "Attempt to copy where destination is UCL_READ_ONLY\n";
assert(0==1);
} else if (src.kind()==UCL_WRITE_ONLY) {
std::cerr << "Attempt to copy where source is UCL_WRITE_ONLY\n";
assert(0==1);
}
} else {
if (dst.kind()==UCL_WRITE_ONLY) {
std::cerr << "Destination in host-device copy cannot be UCL_WRITE_ONLY\n";
assert(0==1);
} else if (src.kind()==UCL_READ_ONLY) {
std::cerr << "Source in host-device copy cannot be UCL_READ_ONLY\n";
assert(0==1);
}
}
}
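With UCL_DEBUG defined, a copy that violates the declared access kinds now aborts with a clear message instead of silently misbehaving. A sketch of a call that would trip the new check; the types and names are illustrative:

// UCL_D_Vec<float> coeff(n, dev, UCL_READ_ONLY); // host uploads, kernels read
// UCL_H_Vec<float> out(n, dev, UCL_READ_WRITE);
// ucl_copy(out, coeff, false);
//   -> "Source in host-device copy cannot be UCL_READ_ONLY" followed by assert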
// --------------------------------------------------------------------------
// - HOST-HOST COPY ROUTINES
// --------------------------------------------------------------------------

@@ -117,9 +141,20 @@ template <> struct _host_host_copy<1,1> {

    assert(mat1::PADDED==0 && mat2::PADDED==0);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
    if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) {
#ifdef _OCL_MAT
if (dst.begin()==src.begin()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 7S\n";
#endif
return;
}
#endif
      memcpy(dst.begin(),src.begin(),numel*sizeof(typename mat1::data_type));
#ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 7NS\n";
#endif
    } else
      for (size_t i=0; i<numel; i++)
        dst[i]=static_cast<typename mat1::data_type>(src[i]);
  }
@@ -138,15 +173,27 @@ template <> struct _host_host_copy<1,1> {

      src_row_size=cols;
    else
      src_row_size=src.row_size();
    if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) {
#ifdef _OCL_MAT
if (dst.begin()==src.begin()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 8S\n";
#endif
return;
}
#endif
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 8NS\n";
#endif
      for (size_t i=0; i<rows; i++)
        memcpy(dst.begin()+i*dst_row_size,src.begin()+i*src_row_size,
               cols*sizeof(typename mat1::data_type));
    } else
      for (size_t j=0; j<rows; j++) {
        size_t dst_i=j*dst_row_size;
        size_t d_end=dst_i+cols;
        size_t src_i=j*src_row_size;
        for (; dst_i<d_end; dst_i++) {
          dst[dst_i]=static_cast<typename mat1::data_type>(src[src_i]);
          src_i++;
@@ -216,15 +263,14 @@ template <int host_type2> struct _ucl_cast_copy<1,host_type2> {

      ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                 src.row_bytes(),cols*sizeof(typename mat2::data_type),
                 rows);
      size_t dst_i=0, buff_i=0, doff=dst.cols()-cols;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
          buff_i++;
          dst_i++;
        }
        dst_i+=doff;
      }
    }
  }

@@ -255,15 +301,14 @@ template <int host_type2> struct _ucl_cast_copy<1,host_type2> {

                 src.row_bytes(),cols*sizeof(typename mat2::data_type),
                 rows,cq);
      cast_buffer.sync();
      size_t dst_i=0, buff_i=0, doff=dst.cols()-cols;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
          buff_i++;
          dst_i++;
        }
        dst_i+=doff;
      }
    }
  }

@@ -293,38 +338,62 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {

    assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
    if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
    if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
    if (mat3::VECTOR==0) {
      assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
      assert(dst.rows()>=rows && dst.cols()>=cols);
    }
#endif
    if (mat2::VECTOR) {
      if (mat3::VECTOR==0) {
        size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols;
        for (size_t i=0; i<rows; i++) {
          for (size_t j=0; j<cols; j++) {
            cast_buffer[ci]=static_cast<typename mat3::data_type>(src[si]);
ci++;
si++;
}
ci+=co;
si+=so;
}
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(),
cols*sizeof(typename mat1::data_type),rows);
} else {
for (size_t i=0; i<rows*cols; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows);
}
    } else if (mat1::VECTOR) {
      size_t src_i=0, buf_i=0, soff=src.cols()-cols;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
          buf_i++;
          src_i++;
        }
        src_i+=soff;
      }
      ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows);
    } else {
      size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch;
      if (mat3::VECTOR==0) {
        co=cast_buffer.cols()-cols;
        spitch=cast_buffer.row_bytes();
      } else {
        co=0;
        spitch=cols*sizeof(typename mat1::data_type);
      }
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
          buf_i++;
          src_i++;
        }
        src_i+=so;
        buf_i+=co;
      }
      ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch,
                 cols*sizeof(typename mat1::data_type),rows);
    }
  }
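A worked example of the pitch selection above, with illustrative numbers:

// rows=3, cols=5 (float), cast_buffer.cols()=8  (padded, illustrative)
// spitch = cast_buffer.row_bytes() = 8*4 = 32 bytes
// per row: copy cols*sizeof(float) = 20 bytes, then step 32 bytes in the
// buffer; buf_i+=co skips the same 3-element padding the copy skips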
@@ -337,38 +406,62 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {

    assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
    if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
    if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
    if (mat3::VECTOR==0) {
      assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
      assert(dst.rows()>=rows && dst.cols()>=cols);
    }
#endif
    if (mat2::VECTOR) {
      if (mat3::VECTOR==0) {
        size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols;
        for (size_t i=0; i<rows; i++) {
          for (size_t j=0; j<cols; j++) {
            cast_buffer[ci]=static_cast<typename mat3::data_type>(src[si]);
ci++;
si++;
}
ci+=co;
si+=so;
}
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(),
                   cols*sizeof(typename mat1::data_type),rows,cq);
} else {
for (size_t i=0; i<rows*cols; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,dst.row_bytes(),
cast_buffer,cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows,cq);
}
    } else if (mat1::VECTOR) {
      size_t src_i=0, buf_i=0, soff=src.cols()-cols;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
          buf_i++;
          src_i++;
        }
        src_i+=soff;
      }
      ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows,cq);
    } else {
      size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch;
      if (mat3::VECTOR==0) {
        co=cast_buffer.cols()-cols;
        spitch=cast_buffer.row_bytes();
      } else {
        co=0;
        spitch=cols*sizeof(typename mat1::data_type);
      }
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
          buf_i++;
          src_i++;
        }
        src_i+=so;
        buf_i+=co;
      }
      ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch,
                 cols*sizeof(typename mat1::data_type),rows,cq);
    }
  }
@@ -444,9 +537,13 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,

#endif
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,numel,cq);
  else {
#ifdef UCL_DEBUG
    _check_ucl_copy_perm(dst,src);
#endif
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                      cast_buffer,cq);
  }
}

/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
@@ -463,6 +560,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,

  assert(dst.numel()>=numel && src.numel()>=numel);
  assert(cast_buffer.numel()>=numel);
  assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
  _check_ucl_copy_perm(dst,src);
#endif
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,numel,async);

@@ -491,6 +589,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,

  assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
  assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
  _check_ucl_copy_perm(dst,src);
#endif
  if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
    _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);

@@ -498,12 +597,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,

           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(numel,dst,UCL_READ_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                        cast_buffer,cq);
    } else {
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                        cast_buffer,cq);
    }
@@ -529,6 +628,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,

#ifdef UCL_DEBUG
  assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
  assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
  _check_ucl_copy_perm(dst,src);
#endif
  if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
    _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
@@ -538,12 +638,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,

           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(numel,dst,UCL_READ_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                        cast_buffer);
    } else {
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                        cast_buffer);
    }
@@ -574,9 +674,13 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,

    ucl_copy(dst,src,rows,cols,async);
  else if (async)
    ucl_copy(dst,src,rows,cols,dst.cq());
  else {
#ifdef UCL_DEBUG
    _check_ucl_copy_perm(dst,src);
#endif
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                      cast_buffer);
  }
}
/// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer)
@@ -595,9 +699,13 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,

                          command_queue &cq) {
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,rows,cols,cq);
  else {
#ifdef UCL_DEBUG
    _check_ucl_copy_perm(dst,src);
#endif
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                      cast_buffer,cq);
  }
}
/// Asynchronous copy of subset matrix rows,cols (memory already allocated)
@@ -617,18 +725,21 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,

template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
                     const size_t cols, command_queue &cq) {
#ifdef UCL_DEBUG
  _check_ucl_copy_perm(dst,src);
#endif
  if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
    _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
  else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer,cq);
    } else {
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer,cq);
    }
  }
@@ -678,6 +789,9 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,

template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
                     const size_t cols, const bool async) {
#ifdef UCL_DEBUG
  _check_ucl_copy_perm(dst,src);
#endif
  if (async)
    ucl_copy(dst,src,rows,cols,dst.cq());
  else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)

@@ -686,12 +800,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,

           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer);
    } else {
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer);
    }

View File

@@ -39,14 +39,14 @@ class UCL_D_Mat : public UCL_BaseMat {

  };
  typedef numtyp data_type;

  UCL_D_Mat() : _cols(0) {}
  ~UCL_D_Mat() { _device_free(*this); }

  /// Construct with specified rows and cols
  /** \sa alloc() **/
  UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device,
            const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
    _cols(0) { alloc(rows,cols,device,kind); }

  /// Row major matrix on device
  /** The kind parameter controls memory optimizations as follows:
@@ -121,15 +121,11 @@ class UCL_D_Mat : public UCL_BaseMat {

    return err;
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols,

@@ -142,8 +138,10 @@ class UCL_D_Mat : public UCL_BaseMat {

    _row_size=stride;
    this->_cq=input.cq();
#ifdef _OCL_MAT
    _offset=input.offset();
    _array=input.cbegin();
    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#else
    _device_view(&_array,input.begin());
#endif
@@ -157,7 +155,7 @@ class UCL_D_Mat : public UCL_BaseMat {

  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols)
    { view(input,rows,cols,input.row_size()); }

@@ -166,7 +164,7 @@ class UCL_D_Mat : public UCL_BaseMat {

  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view **/
  template <class ucl_type>

@@ -177,7 +175,7 @@ class UCL_D_Mat : public UCL_BaseMat {

  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view **/
  template <class ucl_type>

@@ -187,7 +185,7 @@ class UCL_D_Mat : public UCL_BaseMat {

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t rows, const size_t cols,

@@ -205,13 +203,15 @@ class UCL_D_Mat : public UCL_BaseMat {

#endif
#ifdef _OCL_MAT
    _offset=0;
    CL_SAFE_CALL(clRetainMemObject(input));
    CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
#endif
  }
  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t rows, const size_t cols,
                   UCL_Device &dev) { view(input,rows,cols,cols,dev); }

@@ -219,7 +219,7 @@ class UCL_D_Mat : public UCL_BaseMat {

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
    { view(input,1,cols,dev); }

@@ -228,7 +228,7 @@ class UCL_D_Mat : public UCL_BaseMat {

  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,

@@ -242,7 +242,9 @@ class UCL_D_Mat : public UCL_BaseMat {

    this->_cq=input.cq();
#ifdef _OCL_MAT
    _array=input.begin();
    _offset=offset+input.offset();
    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#else
    _device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif

@@ -256,7 +258,7 @@ class UCL_D_Mat : public UCL_BaseMat {

  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                          const size_t cols)

@@ -266,7 +268,7 @@ class UCL_D_Mat : public UCL_BaseMat {

  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view **/
  template <class ucl_type>

@@ -277,7 +279,7 @@ class UCL_D_Mat : public UCL_BaseMat {

  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view **/
  template <class ucl_type>
@@ -292,7 +294,7 @@ class UCL_D_Mat : public UCL_BaseMat {

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row **/
  template <class ptr_type>
  inline void view_offset(const size_t offset,ptr_type input,const size_t rows,

@@ -309,6 +311,8 @@ class UCL_D_Mat : public UCL_BaseMat {

#ifdef _OCL_MAT
    _array=input;
    _offset=offset;
    CL_SAFE_CALL(clRetainMemObject(input));
    CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
#else
#ifdef _UCL_DEVICE_PTR_MAT
    _array=input+offset*sizeof(numtyp);

@@ -325,7 +329,7 @@ class UCL_D_Mat : public UCL_BaseMat {

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ptr_type>
  inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
                          const size_t cols, UCL_Device &dev)

@@ -334,7 +338,7 @@ class UCL_D_Mat : public UCL_BaseMat {

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ptr_type>
  inline void view_offset(const size_t offset, ptr_type input,
                          const size_t cols, UCL_Device &dev)

@@ -342,7 +346,7 @@ class UCL_D_Mat : public UCL_BaseMat {

  /// Free memory and set size to 0
  inline void clear()
    { _device_free(*this); _cols=0; _kind=UCL_VIEW; }

  /// Resize the allocation to contain cols elements
  /** \note Cannot be used on views **/
@@ -377,11 +381,17 @@ class UCL_D_Mat : public UCL_BaseMat {

    { if (cols>_cols || rows>_rows) return resize(rows,cols);
      else return UCL_SUCCESS; }

  /// Set each element to zero asynchronously in the default command_queue
  inline void zero() { zero(_cq); }
  /// Set first n elements to zero asynchronously in the default command_queue
  inline void zero(const int n) { zero(n,_cq); }
  /// Set each element to zero asynchronously
  inline void zero(command_queue &cq)
    { _device_zero(*this,row_bytes()*_rows,cq); }
  /// Set first n elements to zero asynchronously
  inline void zero(const int n, command_queue &cq)
    { _device_zero(*this,n*sizeof(numtyp),cq); }
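A usage sketch for the reworked zero() overloads (container and queue names are ours). All variants now enqueue asynchronously, so callers must sync before the host depends on the result:

// UCL_D_Vec<float> f(n, gpu);
// f.zero();              // all elements, on f's default command queue
// f.zero(n/2, other_cq); // first n/2 elements, on a different queue
// f.sync();              // block only when the zeros must be visible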
#ifdef _UCL_DEVICE_PTR_MAT
  /// For OpenCL, returns a (void *) device pointer to memory allocation

@@ -452,7 +462,6 @@ class UCL_D_Mat : public UCL_BaseMat {

 private:
  size_t _pitch, _row_size, _rows, _cols;
#ifdef _UCL_DEVICE_PTR_MAT
  device_ptr _array;

View File

@@ -39,14 +39,14 @@ class UCL_D_Vec : public UCL_BaseMat {

  };
  typedef numtyp data_type;

  UCL_D_Vec() : _cols(0) {}
  ~UCL_D_Vec() { _device_free(*this); }

  /// Construct with n columns
  /** \sa alloc() **/
  UCL_D_Vec(const size_t n, UCL_Device &device,
            const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
    _cols(0) { alloc(n,device,kind); }

  /// Set up host vector with 'cols' columns and reserve memory
  /** The kind parameter controls memory optimizations as follows:
@@ -119,15 +119,11 @@ class UCL_D_Vec : public UCL_BaseMat {

    return err;
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols) {
#ifdef UCL_DEBUG

@@ -139,8 +135,10 @@ class UCL_D_Vec : public UCL_BaseMat {

    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=input.cq();
#ifdef _OCL_MAT
    _offset=input.offset();
    _array=input.cbegin();
    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#else
    _device_view(&_array,input.begin());
#endif

@@ -154,7 +152,7 @@ class UCL_D_Vec : public UCL_BaseMat {

  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols,

@@ -164,7 +162,7 @@ class UCL_D_Vec : public UCL_BaseMat {

  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view **/
  template <class ucl_type>

@@ -175,7 +173,7 @@ class UCL_D_Vec : public UCL_BaseMat {

  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view **/
  template <class ucl_type>

@@ -185,7 +183,7 @@ class UCL_D_Vec : public UCL_BaseMat {

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t rows, const size_t cols,
                   UCL_Device &dev) {
@@ -203,13 +201,15 @@ class UCL_D_Vec : public UCL_BaseMat {

#endif
#ifdef _OCL_MAT
    _offset=0;
    CL_SAFE_CALL(clRetainMemObject(input));
    CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
#endif
  }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t rows, const size_t cols,

@@ -219,7 +219,7 @@ class UCL_D_Vec : public UCL_BaseMat {

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
    { view(input,1,cols,dev); }

@@ -228,7 +228,7 @@ class UCL_D_Vec : public UCL_BaseMat {

  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                          const size_t cols) {
@ -242,7 +242,9 @@ class UCL_D_Vec : public UCL_BaseMat {
this->_cq=input.cq(); this->_cq=input.cq();
#ifdef _OCL_MAT #ifdef _OCL_MAT
_array=input.begin(); _array=input.begin();
_offset=offset; _offset=offset+input.offset();
CL_SAFE_CALL(clRetainMemObject(input.begin()));
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#else #else
_device_view(&_array,input.begin(),offset,sizeof(numtyp)); _device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif #endif
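Note: the clRetainMemObject/clRetainCommandQueue calls added above are why the "view does not prevent the memory from being freed" caveat is now scoped to CUDA APIs: under OpenCL the view holds its own reference. A minimal standalone sketch of that reference counting (the function, context, and buffer names here are illustrative, not Geryon code):

    #include <CL/cl.h>

    // Hypothetical illustration: a retained cl_mem outlives its creator.
    void view_lifetime_sketch(cl_context ctx, cl_command_queue q) {
      cl_int err;
      cl_mem buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
                                  1024*sizeof(float), NULL, &err);
      clRetainMemObject(buf);     // taken when a view of buf is created
      clRetainCommandQueue(q);    // the view also keeps the queue alive
      clReleaseMemObject(buf);    // allocating container destroyed: buf survives
      clReleaseMemObject(buf);    // view destroyed: refcount hits 0, freed
      clReleaseCommandQueue(q);
    }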
@@ -256,7 +258,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * \param stride Number of _elements_ between the start of each row **/
   template <class ucl_type>
   inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
@@ -267,7 +269,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view **/
   template <class ucl_type>
@@ -278,7 +280,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view **/
   template <class ucl_type>
@@ -288,7 +290,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container **/
+    *   allocating container when using CUDA APIs **/
   template <class ptr_type>
   inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
                           const size_t cols, UCL_Device &dev) {
@@ -304,6 +306,8 @@ class UCL_D_Vec : public UCL_BaseMat {
     #ifdef _OCL_MAT
     _array=input;
     _offset=offset;
+    CL_SAFE_CALL(clRetainMemObject(input));
+    CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
     #else
     #ifdef _UCL_DEVICE_PTR_MAT
     _array=input+offset*sizeof(numtyp);
@@ -320,7 +324,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * \param stride Number of _elements_ between the start of each row **/
   template <class ptr_type>
   inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
@@ -330,7 +334,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container **/
+    *   allocating container when using CUDA APIs **/
   template <class ptr_type>
   inline void view_offset(const size_t offset, ptr_type input,
                           const size_t cols, UCL_Device &dev)
@@ -338,7 +342,7 @@ class UCL_D_Vec : public UCL_BaseMat {
   /// Free memory and set size to 0
   inline void clear()
-    { if (_kind!=UCL_VIEW) { _cols=0; _kind=UCL_VIEW; _device_free(*this); } }
+    { _device_free(*this); _cols=0; _kind=UCL_VIEW; }
 
   /// Resize the allocation to contain cols elements
   /** \note Cannot be used on views **/
@@ -373,11 +377,15 @@ class UCL_D_Vec : public UCL_BaseMat {
   inline int resize_ib(const int cols)
     { if (cols>_cols) return resize(cols); else return UCL_SUCCESS; }
 
-  /// Set each element to zero
-  inline void zero() { _device_zero(*this,row_bytes()); }
-
-  /// Set first n elements to zero
-  inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); }
+  /// Set each element to zero asynchronously in the default command_queue
+  inline void zero() { zero(_cq); }
+  /// Set first n elements to zero asynchronously in the default command_queue
+  inline void zero(const int n) { zero(n,_cq); }
+  /// Set each element to zero asynchronously
+  inline void zero(command_queue &cq) { _device_zero(*this,row_bytes(),cq); }
+  /// Set first n elements to zero asynchronously
+  inline void zero(const int n, command_queue &cq)
+    { _device_zero(*this,n*sizeof(numtyp),cq); }
 
   #ifdef _UCL_DEVICE_PTR_MAT
   /// For OpenCL, returns a (void *) device pointer to memory allocation
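Note: with the overloads above, fills can be issued on any stream rather than only the container's default queue. A hedged usage sketch (assumes a configured UCL_Device named dev and that UCL_Device::cq(i) returns its i-th command queue):

    UCL_D_Vec<float> f(1000, dev);  // device vector on dev's default queue
    f.zero();                        // asynchronous fill on the default queue
    f.zero(100);                     // zero only the first 100 elements
    f.zero(dev.cq(1));               // same fill, pushed to a second queue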
@@ -465,7 +473,6 @@ class UCL_D_Vec : public UCL_BaseMat {
  private:
   size_t _row_bytes, _row_size, _rows, _cols;
-  enum UCL_MEMOPT _kind;
   #ifdef _UCL_DEVICE_PTR_MAT
   device_ptr _array;
@@ -39,33 +39,35 @@ class UCL_H_Mat : public UCL_BaseMat {
   };
   typedef numtyp data_type;
 
-  UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) {
+  UCL_H_Mat() : _cols(0) {
     #ifdef _OCL_MAT
     _carray=(cl_mem)(0);
     #endif
   }
-  ~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
+  ~UCL_H_Mat() { _host_free(*this); }
 
   /// Construct with specied number of rows and columns
   /** \sa alloc() **/
   UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device,
-            const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED)
-    { _rows=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); }
+            const enum UCL_MEMOPT kind=UCL_READ_WRITE)
+    { _cols=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); }
 
   /// Set up host matrix with specied # of rows/cols and reserve memory
   /** The kind parameter controls memory pinning as follows:
-    * - UCL_NOT_PINNED      - Memory is not pinned
-    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
-    * - UCL_RW_OPTIMIZED    - Memory can be pinned
+    * - UCL_READ_WRITE - Specify that you will read and write from host
+    * - UCL_WRITE_ONLY - Specify that you will only write from host
+    * - UCL_READ_ONLY  - Specify that you will only read from host
+    * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
    * \param cq Default command queue for operations copied from another mat
    * \return UCL_SUCCESS if the memory allocation is successful **/
   template <class mat_type>
   inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
-                   const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
+                   const enum UCL_MEMOPT kind=UCL_READ_WRITE,
+                   const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) {
     clear();
     _row_bytes=cols*sizeof(numtyp);
-    int err=_host_alloc(*this,cq,_row_bytes*rows,kind);
+    int err=_host_alloc(*this,cq,_row_bytes*rows,kind,kind2);
     if (err!=UCL_SUCCESS) {
       #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
@@ -86,17 +88,19 @@ class UCL_H_Mat : public UCL_BaseMat {
   /// Set up host matrix with specied # of rows/cols and reserve memory
   /** The kind parameter controls memory pinning as follows:
-    * - UCL_NOT_PINNED      - Memory is not pinned
-    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
-    * - UCL_RW_OPTIMIZED    - Memory can be pinned
+    * - UCL_READ_WRITE - Specify that you will read and write from host
+    * - UCL_WRITE_ONLY - Specify that you will only write from host
+    * - UCL_READ_ONLY  - Specify that you will only read from host
+    * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
    * \param device Used to get the default command queue for operations
    * \return UCL_SUCCESS if the memory allocation is successful **/
   inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
-                   const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
+                   const enum UCL_MEMOPT kind=UCL_READ_WRITE,
+                   const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) {
     clear();
     _row_bytes=cols*sizeof(numtyp);
-    int err=_host_alloc(*this,device,_row_bytes*rows,kind);
+    int err=_host_alloc(*this,device,_row_bytes*rows,kind,kind2);
     if (err!=UCL_SUCCESS) {
       #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
@@ -115,15 +119,11 @@ class UCL_H_Mat : public UCL_BaseMat {
     return err;
   }
 
-  /// Return the type of memory allocation
-  /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
-  inline enum UCL_MEMOPT kind() const { return _kind; }
-
   /// Do not allocate memory, instead use an existing allocation from Geryon
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device container on the host is not supported
     * \param stride Number of _elements_ between the start of each row **/
   template <class ucl_type>
@@ -140,6 +140,8 @@ class UCL_H_Mat : public UCL_BaseMat {
     _end=_array+_cols;
     #ifdef _OCL_MAT
     _carray=input.cbegin();
+    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
+    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
     #endif
   }
@@ -147,7 +149,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device container on the host is not supported **/
   template <class ucl_type>
   inline void view(ucl_type &input, const size_t rows, const size_t cols)
@@ -157,7 +159,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view
     * - Viewing a device container on the host is not supported **/
@@ -169,9 +171,9 @@ class UCL_H_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
-    *   will be used for view
+    *   will be used for view when using CUDA APIs
     * - Viewing a device container on the host is not supported **/
   template <class ucl_type>
   inline void view(ucl_type &input)
@@ -180,7 +182,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported
     * \param stride Number of _elements_ between the start of each row **/
   template <class ptr_type>
@@ -197,14 +199,14 @@ class UCL_H_Mat : public UCL_BaseMat {
     _end=_array+_cols;
     #ifdef _OCL_MAT
-    _host_alloc(*this,dev,_row_bytes,UCL_VIEW);
+    _host_view(*this,dev,_row_bytes*rows);
     #endif
   }
 
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported **/
   template <class ptr_type>
   inline void view(ptr_type *input, const size_t rows, const size_t cols,
@@ -213,7 +215,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported **/
   template <class ptr_type>
   inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
@@ -223,7 +225,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device container on the host is not supported
     * \param stride Number of _elements_ between the start of each row **/
   template <class ucl_type>
@@ -239,7 +241,7 @@ class UCL_H_Mat : public UCL_BaseMat {
     _array=input.begin()+offset;
     _end=_array+_cols;
     #ifdef _OCL_MAT
-    _host_alloc(*this,input,_row_bytes,UCL_VIEW);
+    _host_view(*this,input,_row_bytes*_rows);
     #endif
   }
@@ -247,7 +249,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device container on the host is not supported **/
   template <class ucl_type>
   inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
@@ -258,7 +260,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view
     * - Viewing a device container on the host is not supported **/
@@ -270,7 +272,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view
     * - Viewing a device container on the host is not supported **/
@@ -296,7 +298,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported
     * \param stride Number of _elements_ between the start of each row **/
   template <class ptr_type>
@@ -307,7 +309,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported **/
   template <class ptr_type>
   inline void view_offset(const size_t offset, ptr_type *input,
@@ -316,7 +318,7 @@ class UCL_H_Mat : public UCL_BaseMat {
   /// Free memory and set size to 0
   inline void clear()
-    { if (_kind!=UCL_VIEW) {_rows=0; _kind=UCL_VIEW; _host_free(*this,_kind); }}
+    { _host_free(*this); _cols=0; _kind=UCL_VIEW; }
 
   /// Resize the allocation to rows x cols elements
   /** \note Cannot be used on views **/
@@ -409,7 +411,6 @@ class UCL_H_Mat : public UCL_BaseMat {
   #endif
  private:
-  enum UCL_MEMOPT _kind;
   numtyp *_array, *_end;
   size_t _row_bytes, _rows, _cols;
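Note: throughout the host containers the old pinning hints (UCL_RW_OPTIMIZED, UCL_WRITE_OPTIMIZED) are replaced by host-access kinds. A sketch of the new allocation calls, assuming a configured UCL_Device named dev:

    UCL_H_Vec<double> staging;
    staging.alloc(4096, dev, UCL_WRITE_ONLY);   // host only writes (upload buffer)
    UCL_H_Mat<double> results;
    results.alloc(64, 64, dev, UCL_READ_ONLY);  // host only reads (download buffer)
    UCL_H_Vec<int> scratch;
    scratch.alloc(128, dev, UCL_NOT_PINNED);    // plain pageable host memory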
@@ -39,33 +39,35 @@ class UCL_H_Vec : public UCL_BaseMat {
   };
   typedef numtyp data_type;
 
-  UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) {
+  UCL_H_Vec() : _cols(0) {
     #ifdef _OCL_MAT
     _carray=(cl_mem)(0);
     #endif
   }
-  ~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
+  ~UCL_H_Vec() { _host_free(*this); }
 
   /// Construct with n columns
   /** \sa alloc() **/
   UCL_H_Vec(const size_t n, UCL_Device &device,
-            const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED)
+            const enum UCL_MEMOPT kind=UCL_READ_WRITE)
     { _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); }
 
   /// Set up host vector with 'cols' columns and reserve memory
   /** The kind parameter controls memory pinning as follows:
-    * - UCL_NOT_PINNED      - Memory is not pinned
-    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
-    * - UCL_RW_OPTIMIZED    - Memory can be pinned
+    * - UCL_READ_WRITE - Specify that you will read and write from host
+    * - UCL_WRITE_ONLY - Specify that you will only write from host
+    * - UCL_READ_ONLY  - Specify that you will only read from host
+    * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
    * \param cq Default command queue for operations copied from another mat
    * \return UCL_SUCCESS if the memory allocation is successful **/
   template <class mat_type>
   inline int alloc(const size_t cols, mat_type &cq,
-                   const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
+                   const enum UCL_MEMOPT kind=UCL_READ_WRITE,
+                   const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) {
     clear();
     _row_bytes=cols*sizeof(numtyp);
-    int err=_host_alloc(*this,cq,_row_bytes,kind);
+    int err=_host_alloc(*this,cq,_row_bytes,kind,kind2);
     if (err!=UCL_SUCCESS) {
       #ifndef UCL_NO_EXIT
@@ -86,17 +88,19 @@ class UCL_H_Vec : public UCL_BaseMat {
   /// Set up host vector with 'cols' columns and reserve memory
   /** The kind parameter controls memory pinning as follows:
-    * - UCL_NOT_PINNED      - Memory is not pinned
-    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
-    * - UCL_RW_OPTIMIZED    - Memory can be pinned
+    * - UCL_READ_WRITE - Specify that you will read and write from host
+    * - UCL_WRITE_ONLY - Specify that you will only write from host
+    * - UCL_READ_ONLY  - Specify that you will only read from host
+    * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
    * \param device Used to get the default command queue for operations
    * \return UCL_SUCCESS if the memory allocation is successful **/
   inline int alloc(const size_t cols, UCL_Device &device,
-                   const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
+                   const enum UCL_MEMOPT kind=UCL_READ_WRITE,
+                   const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) {
     clear();
     _row_bytes=cols*sizeof(numtyp);
-    int err=_host_alloc(*this,device,_row_bytes,kind);
+    int err=_host_alloc(*this,device,_row_bytes,kind,kind2);
     if (err!=UCL_SUCCESS) {
       #ifndef UCL_NO_EXIT
@@ -115,15 +119,11 @@ class UCL_H_Vec : public UCL_BaseMat {
     return err;
   }
 
-  /// Return the type of memory allocation
-  /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
-  inline enum UCL_MEMOPT kind() const { return _kind; }
-
   /// Do not allocate memory, instead use an existing allocation from Geryon
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device container on the host is not supported **/
   template <class ucl_type>
   inline void view(ucl_type &input, const size_t rows, const size_t cols) {
@@ -139,6 +139,8 @@ class UCL_H_Vec : public UCL_BaseMat {
     _end=_array+_cols;
     #ifdef _OCL_MAT
     _carray=input.cbegin();
+    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
+    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
     #endif
   }
@@ -146,7 +148,7 @@ class UCL_H_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device container on the host is not supported
     * \param stride Number of _elements_ between the start of each row **/
   template <class ucl_type>
@@ -157,7 +159,7 @@ class UCL_H_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view
     * - Viewing a device container on the host is not supported **/
@@ -180,7 +182,7 @@ class UCL_H_Vec : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported **/
   template <class ptr_type>
   inline void view(ptr_type *input, const size_t rows, const size_t cols,
@@ -197,14 +199,14 @@ class UCL_H_Vec : public UCL_BaseMat {
     _end=_array+_cols;
     #ifdef _OCL_MAT
-    _host_alloc(*this,dev,_row_bytes,UCL_VIEW);
+    _host_view(*this,dev,_row_bytes);
     #endif
   }
 
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported
     * \param stride Number of _elements_ between the start of each row **/
   template <class ptr_type>
@@ -215,7 +217,7 @@ class UCL_H_Vec : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported **/
   template <class ptr_type>
   inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
@@ -225,7 +227,7 @@ class UCL_H_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device container on the host is not supported **/
   template <class ucl_type>
   inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
@@ -241,7 +243,7 @@ class UCL_H_Vec : public UCL_BaseMat {
     _array=input.begin()+offset;
     _end=_array+_cols;
     #ifdef _OCL_MAT
-    _host_alloc(*this,input,_row_bytes,UCL_VIEW);
+    _host_view(*this,input,_row_bytes);
     #endif
   }
@@ -249,7 +251,7 @@ class UCL_H_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device container on the host is not supported
     * \param stride Number of _elements_ between the start of each row **/
   template <class ucl_type>
@@ -261,7 +263,7 @@ class UCL_H_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view
     * - Viewing a device container on the host is not supported **/
@@ -273,7 +275,7 @@ class UCL_H_Vec : public UCL_BaseMat {
   /** This function must be passed a Geryon vector or matrix container.
     * No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - If a matrix is used a input, all elements (including padding)
     *   will be used for view
     * - Viewing a device container on the host is not supported **/
@@ -284,7 +286,7 @@ class UCL_H_Vec : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported **/
   template <class ptr_type>
   inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
@@ -294,7 +296,7 @@ class UCL_H_Vec : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported
     * \param stride Number of _elements_ between the start of each row **/
   template <class ptr_type>
@@ -305,7 +307,7 @@ class UCL_H_Vec : public UCL_BaseMat {
   /// Do not allocate memory, instead use an existing allocation
   /** - No memory is freed when the object is destructed.
     * - The view does not prevent the memory from being freed by the
-    *   allocating container
+    *   allocating container when using CUDA APIs
     * - Viewing a device pointer on the host is not supported **/
   template <class ptr_type>
   inline void view_offset(const size_t offset, ptr_type *input,
@@ -314,7 +316,7 @@ class UCL_H_Vec : public UCL_BaseMat {
   /// Free memory and set size to 0
   inline void clear()
-    { if (_kind!=UCL_VIEW) {_kind=UCL_VIEW; _cols=0; _host_free(*this,_kind);}}
+    { _host_free(*this); _kind=UCL_VIEW; _cols=0; }
 
   /// Resize the allocation to contain cols elements
   /** \note Cannot be used on views **/
@@ -401,7 +403,6 @@ class UCL_H_Vec : public UCL_BaseMat {
   #endif
  private:
-  enum UCL_MEMOPT _kind;
   numtyp *_array, *_end;
   size_t _row_bytes, _cols;
@@ -48,17 +48,18 @@ class UCL_Matrix {
   /// Construct with specied number of rows and columns
   /** \sa alloc() **/
   UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc,
-             const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+             const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
              const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
     { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
         alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
 
   /// Set up host matrix with specied # of rows/cols and reserve memory
-  /** The kind1 parameter controls memory pinning as follows:
-    * - UCL_NOT_PINNED      - Memory is not pinned
-    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
-    * - UCL_RW_OPTIMIZED    - Memory can be pinned
-    * The kind2 parameter controls memory optimizations as follows:
+  /** The kind1 parameter controls memory access from the host
+    * - UCL_READ_WRITE - Specify that you will read and write from host
+    * - UCL_WRITE_ONLY - Specify that you will only write from host
+    * - UCL_READ_ONLY  - Specify that you will only read from host
+    * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
+    * The kind2 parameter controls memory optimizations from the device:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY  - Specify that you will only read in kernels
@@ -69,24 +70,25 @@ class UCL_Matrix {
    * \return UCL_SUCCESS if the memory allocation is successful **/
   template <class mat_type>
   inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
-                   const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+                   const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
                    const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
     { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
         alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); }
 
   /// Set up host matrix with specied # of rows/cols and reserve memory
-  /** The kind1 parameter controls memory pinning as follows:
-    * - UCL_NOT_PINNED      - Memory is not pinned
-    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
-    * - UCL_RW_OPTIMIZED    - Memory can be pinned
-    * The kind2 parameter controls memory optimizations as follows:
+  /** The kind1 parameter controls memory access from the host
+    * - UCL_READ_WRITE - Specify that you will read and write from host
+    * - UCL_WRITE_ONLY - Specify that you will only write from host
+    * - UCL_READ_ONLY  - Specify that you will only read from host
+    * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
+    * The kind2 parameter controls memory optimizations from the device:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY  - Specify that you will only read in kernels
    * \param device Used to get the default command queue for operations
    * \return UCL_SUCCESS if the memory allocation is successful **/
   inline int alloc(const size_t rows, const size_t cols, UCL_Device &acc,
-                   const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+                   const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
                    const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
     { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
         alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
@@ -110,11 +112,22 @@ class UCL_Matrix {
     { if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols);
       else return UCL_SUCCESS; }
 
-  /// Set each element to zero
-  inline void zero() { host.zero(); device.zero(); }
-
-  /// Set first n elements to zero
-  inline void zero(const int n) { host.zero(n); device.zero(n); }
+  /// Set each element to zero (asynchronously on device)
+  inline void zero() { zero(cq()); }
+  /// Set first n elements to zero (asynchronously on device)
+  inline void zero(const int n) { zero(n,cq()); }
+  /// Set each element to zero (asynchronously on device)
+  inline void zero(command_queue &cq) {
+    host.zero();
+    if (device.kind()!=UCL_VIEW) device.zero(cq);
+    else if (_buffer.numel()>0) _buffer.zero();
+  }
+  /// Set first n elements to zero (asynchronously on device)
+  inline void zero(const int n, command_queue &cq) {
+    host.zero(n);
+    if (device.kind()!=UCL_VIEW) device.zero(n,cq);
+    else if (_buffer.numel()>0) _buffer.zero();
+  }
 
   /// Get the number of elements
   inline size_t numel() const { return host.numel(); }
@@ -145,6 +158,8 @@ class UCL_Matrix {
   /// Return the default command queue/stream associated with this data
   inline command_queue & cq() { return host.cq(); }
+  /// Change the default command queue associated with this data
+  inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); }
   /// Block until command_queue associated with matrix is complete
   inline void sync() { host.sync(); }
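Note: together with the asynchronous zero() overloads above, the new cq() setter lets a paired host/device container be moved onto another stream wholesale. A hedged usage sketch (assumes a configured UCL_Device named dev that exposes a second queue via cq(1)):

    UCL_Matrix<double,double> m(32, 32, dev);
    m.cq(dev.cq(1));   // host and device halves now default to queue 1
    m.zero();          // asynchronous zero on the new default queue
    m.sync();          // block until the queue has drained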
@@ -32,14 +32,24 @@ template <> struct _ucl_s_obj_help<1> {
                           const enum UCL_MEMOPT kind1,
                           const enum UCL_MEMOPT kind2) {
     int e1;
-    e1=host.alloc(cols,acc,kind1);
-    if (e1!=UCL_SUCCESS)
-      return e1;
     if (acc.shared_memory()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 1S\n";
+      #endif
+      e1=host.alloc(cols,acc,kind1,kind2);
+      if (e1!=UCL_SUCCESS)
+        return e1;
       device.view(host);
       return UCL_SUCCESS;
-    } else
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 1NS\n";
+      #endif
+      e1=host.alloc(cols,acc,kind1);
+      if (e1!=UCL_SUCCESS)
+        return e1;
       return device.alloc(cols,acc,kind2);
+    }
   }
 
   template <class t1, class t2, class t3, class mat_type>
@@ -48,10 +58,24 @@ template <> struct _ucl_s_obj_help<1> {
                           const enum UCL_MEMOPT kind1,
                           const enum UCL_MEMOPT kind2) {
     int e1;
-    e1=host.alloc(cols,cq,kind1);
-    if (e1!=UCL_SUCCESS)
-      return e1;
-    return device.alloc(cols,cq,kind2);
+    if (cq.shared_mem_device()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 2S\n";
+      #endif
+      e1=host.alloc(cols,cq,kind1,kind2);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      device.view(host);
+      return UCL_SUCCESS;
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 2NS\n";
+      #endif
+      e1=host.alloc(cols,cq,kind1);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      return device.alloc(cols,cq,kind2);
+    }
   }
 
   template <class t1, class t2, class t3>
@@ -60,14 +84,24 @@ template <> struct _ucl_s_obj_help<1> {
                           const enum UCL_MEMOPT kind1,
                           const enum UCL_MEMOPT kind2) {
     int e1;
-    e1=host.alloc(rows,cols,acc,kind1);
-    if (e1!=UCL_SUCCESS)
-      return e1;
     if (acc.shared_memory()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 3S\n";
+      #endif
+      e1=host.alloc(rows,cols,acc,kind1,kind2);
+      if (e1!=UCL_SUCCESS)
+        return e1;
       device.view(host);
       return UCL_SUCCESS;
-    } else
+    } else {
+      e1=host.alloc(rows,cols,acc,kind1);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 3NS\n";
+      #endif
       return device.alloc(rows,cols,acc,kind2);
+    }
   }
 
   template <class t1, class t2, class t3, class mat_type>
@@ -76,10 +110,24 @@ template <> struct _ucl_s_obj_help<1> {
                           const enum UCL_MEMOPT kind1,
                           const enum UCL_MEMOPT kind2) {
     int e1;
-    e1=host.alloc(rows,cols,cq,kind1);
-    if (e1!=UCL_SUCCESS)
-      return e1;
-    return device.alloc(rows,cols,cq,kind2);
+    if (cq.shared_mem_device()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 4S\n";
+      #endif
+      e1=host.alloc(rows,cols,cq,kind1,kind2);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      device.view(host);
+      return UCL_SUCCESS;
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 4NS\n";
+      #endif
+      e1=host.alloc(rows,cols,cq,kind1);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      return device.alloc(rows,cols,cq,kind2);
+    }
   }
 
   template <class t1, class t2, class t3>
@@ -121,8 +169,15 @@ template <> struct _ucl_s_obj_help<1> {
     if (device.kind()==UCL_VIEW) {
       device.view(host);
       return UCL_SUCCESS;
-    } else
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 5S\n";
+      #endif
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 5NS\n";
+      #endif
       return device.resize(cols);
+    }
   }
 
   template <class t1, class t2, class t3>
@@ -130,9 +185,16 @@ template <> struct _ucl_s_obj_help<1> {
                                const int cols) {
     if (device.kind()==UCL_VIEW) {
       device.view(host);
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 6S\n";
+      #endif
       return UCL_SUCCESS;
-    } else
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 6NS\n";
+      #endif
      return device.resize(rows,cols);
+    }
   }
 };
@@ -145,17 +207,27 @@ template <int st> struct _ucl_s_obj_help {
                           const enum UCL_MEMOPT kind2) {
     int e1;
     e1=host.alloc(cols,acc,UCL_NOT_PINNED);
-    if (e1!=UCL_SUCCESS)
-      return e1;
-    e1=_buffer.alloc(cols,acc,kind1);
     if (e1!=UCL_SUCCESS)
       return e1;
     if (acc.shared_memory()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 7S\n";
+      #endif
+      e1=_buffer.alloc(cols,acc,kind1,kind2);
+      if (e1!=UCL_SUCCESS)
+        return e1;
       device.view(_buffer);
       return UCL_SUCCESS;
-    } else
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 7NS\n";
+      #endif
+      e1=_buffer.alloc(cols,acc,kind1);
+      if (e1!=UCL_SUCCESS)
+        return e1;
       return device.alloc(cols,acc,kind2);
+    }
   }
 
   template <class t1, class t2, class t3, class mat_type>
@@ -167,10 +239,24 @@ template <int st> struct _ucl_s_obj_help {
     e1=host.alloc(cols,cq,UCL_NOT_PINNED);
     if (e1!=UCL_SUCCESS)
       return e1;
-    e1=_buffer.alloc(cols,cq,kind1);
-    if (e1!=UCL_SUCCESS)
-      return e1;
-    return device.alloc(cols,cq,kind2);
+    if (cq.shared_mem_device()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 8S\n";
+      #endif
+      e1=_buffer.alloc(cols,cq,kind1,kind2);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      device.view(_buffer);
+      return UCL_SUCCESS;
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 8NS\n";
+      #endif
+      e1=_buffer.alloc(cols,cq,kind1);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      return device.alloc(cols,cq,kind2);
+    }
   }
 
   template <class t1, class t2, class t3>
@@ -180,17 +266,27 @@ template <int st> struct _ucl_s_obj_help {
                           const enum UCL_MEMOPT kind2) {
     int e1;
     e1=host.alloc(rows,cols,acc,UCL_NOT_PINNED);
-    if (e1!=UCL_SUCCESS)
-      return e1;
-    e1=_buffer.alloc(rows,cols,acc,kind1);
     if (e1!=UCL_SUCCESS)
       return e1;
     if (acc.shared_memory()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 9S\n";
+      #endif
+      e1=_buffer.alloc(rows,cols,acc,kind1,kind2);
+      if (e1!=UCL_SUCCESS)
+        return e1;
       device.view(_buffer);
       return UCL_SUCCESS;
-    } else
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 9NS\n";
+      #endif
+      e1=_buffer.alloc(rows,cols,acc,kind1);
+      if (e1!=UCL_SUCCESS)
+        return e1;
      return device.alloc(rows,cols,acc,kind2);
+    }
   }
 
   template <class t1, class t2, class t3, class mat_type>
@@ -202,10 +298,24 @@ template <int st> struct _ucl_s_obj_help {
     e1=host.alloc(rows,cols,cq,UCL_NOT_PINNED);
     if (e1!=UCL_SUCCESS)
       return e1;
-    e1=_buffer.alloc(rows,cols,cq,kind1);
-    if (e1!=UCL_SUCCESS)
-      return e1;
-    return device.alloc(rows,cols,cq,kind2);
+    if (cq.shared_mem_device()) {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 10S\n";
+      #endif
+      e1=_buffer.alloc(rows,cols,cq,kind1,kind2);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      device.view(_buffer);
+      return UCL_SUCCESS;
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 10NS\n";
+      #endif
+      e1=_buffer.alloc(rows,cols,cq,kind1);
+      if (e1!=UCL_SUCCESS)
+        return e1;
+      return device.alloc(rows,cols,cq,kind2);
+    }
   }
 
   template <class t1, class t2, class t3>
@@ -250,9 +360,16 @@ template <int st> struct _ucl_s_obj_help {
     if (device.kind()==UCL_VIEW) {
       device.view(buff);
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 11S\n";
+      #endif
       return UCL_SUCCESS;
-    } else
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 11NS\n";
+      #endif
       return device.resize(cols);
+    }
   }
 
   template <class t1, class t2, class t3>
@@ -264,9 +381,17 @@ template <int st> struct _ucl_s_obj_help {
     if (device.kind()==UCL_VIEW) {
       device.view(buff);
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 12S\n";
+      #endif
      return UCL_SUCCESS;
-    } else
+    } else {
+      #ifdef UCL_DBG_MEM_TRACE
+      std::cerr << "UCL_ALLOC 12NS\n";
+      #endif
      return device.resize(rows,cols);
+    }
   }
 };
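Note: every branch above follows the same policy: on shared-memory devices (e.g. APUs or CPU OpenCL runtimes) a single host allocation backs both containers and the device side becomes a zero-copy view, while on discrete devices host and device are allocated separately. A condensed restatement of that decision for the one-dimensional case (a sketch, not the actual helper):

    template <class HostT, class DevT>
    int paired_alloc(HostT &host, DevT &device, const size_t cols,
                     UCL_Device &acc, const enum UCL_MEMOPT kind1,
                     const enum UCL_MEMOPT kind2) {
      if (acc.shared_memory()) {              // host and device share DRAM
        int e = host.alloc(cols, acc, kind1, kind2);
        if (e != UCL_SUCCESS) return e;
        device.view(host);                    // zero-copy: no second buffer
        return UCL_SUCCESS;
      }
      int e = host.alloc(cols, acc, kind1);   // discrete device: two buffers
      if (e != UCL_SUCCESS) return e;
      return device.alloc(cols, acc, kind2);
    }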
@@ -92,10 +92,9 @@ enum UCL_MEMOPT {
   UCL_WRITE_ONLY,     ///< Allow any optimizations for memory that is write only
   UCL_READ_ONLY,      ///< Allow any optimizations for memory that is read only
   UCL_READ_WRITE,     ///< Allow read and write
-  UCL_WRITE_OPTIMIZED,///< Allow host memory to be pinned (write combined)
-  UCL_RW_OPTIMIZED,   ///< Allow host memory to be pinned
   UCL_NOT_PINNED,     ///< Host memory is not to be pinned
-  UCL_VIEW            ///< View of another memory allocation
+  UCL_VIEW,           ///< View of another memory allocation
+  UCL_NOT_SPECIFIED
 };
 
 enum UCL_DEVICE_TYPE {
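Note: callers migrating off the removed enumerators can map them roughly as follows; this mapping is inferred from the default-argument changes elsewhere in this commit, and dev stands in for any configured UCL_Device:

    //   UCL_RW_OPTIMIZED    -> UCL_READ_WRITE  (pinned; host reads and writes)
    //   UCL_WRITE_OPTIMIZED -> UCL_WRITE_ONLY  (pinned; host only writes)
    //   UCL_NOT_PINNED      -> UCL_NOT_PINNED  (unchanged)
    UCL_H_Vec<float> h;
    h.alloc(256, dev, UCL_READ_WRITE);  // was: h.alloc(256, dev, UCL_RW_OPTIMIZED);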
@@ -48,17 +48,18 @@ class UCL_Vector {
   /// Construct with n columns
   /** \sa alloc() **/
   UCL_Vector(const size_t cols, UCL_Device &acc,
-             const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+             const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
              const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
     { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
         alloc(host,device,_buffer,cols,acc,kind1,kind2); }
   /// Set up the vector with 'cols' columns and reserve memory
-  /** The kind1 parameter controls memory pinning as follows:
-    * - UCL_NOT_PINNED - Memory is not pinned
-    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
-    * - UCL_RW_OPTIMIZED - Memory can be pinned
-    * The kind2 parameter controls memory optimizations as follows:
+  /** The kind1 parameter controls memory access from the host
+    * - UCL_READ_WRITE - Specify that you will read and write from host
+    * - UCL_WRITE_ONLY - Specify that you will only write from host
+    * - UCL_READ_ONLY - Specify that you will only read from host
+    * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
+    * The kind2 parameter controls memory optimizations from the device:
     * - UCL_READ_WRITE - Specify that you will read and write in kernels
     * - UCL_WRITE_ONLY - Specify that you will only write in kernels
     * - UCL_READ_ONLY - Specify that you will only read in kernels
@@ -69,24 +70,25 @@ class UCL_Vector {
     * \return UCL_SUCCESS if the memory allocation is successful **/
   template <class mat_type>
   inline int alloc(const size_t cols, mat_type &cq,
-                   const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+                   const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
                    const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
     { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
         alloc(host,device,_buffer,cols,cq,kind1,kind2); }
   /// Set up host vector with 'cols' columns and reserve memory
-  /** The kind1 parameter controls memory pinning as follows:
-    * - UCL_NOT_PINNED - Memory is not pinned
-    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
-    * - UCL_RW_OPTIMIZED - Memory can be pinned
-    * The kind2 parameter controls memory optimizations as follows:
+  /** The kind1 parameter controls memory access from the host
+    * - UCL_READ_WRITE - Specify that you will read and write from host
+    * - UCL_WRITE_ONLY - Specify that you will only write from host
+    * - UCL_READ_ONLY - Specify that you will only read from host
+    * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
+    * The kind2 parameter controls memory optimizations from the device:
     * - UCL_READ_WRITE - Specify that you will read and write in kernels
     * - UCL_WRITE_ONLY - Specify that you will only write in kernels
     * - UCL_READ_ONLY - Specify that you will only read in kernels
     * \param device Used to get the default command queue for operations
     * \return UCL_SUCCESS if the memory allocation is successful **/
   inline int alloc(const size_t cols, UCL_Device &acc,
-                   const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+                   const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
                    const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
     { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
         alloc(host,device,_buffer,cols,acc,kind1,kind2); }
@@ -109,11 +111,22 @@ class UCL_Vector {
   inline int resize_ib(const int new_cols)
     { if (new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; }
-  /// Set each element to zero
-  inline void zero() { host.zero(); device.zero(); }
-  /// Set first n elements to zero
-  inline void zero(const int n) { host.zero(n); device.zero(n); }
+  /// Set each element to zero (asynchronously on device)
+  inline void zero() { zero(cq()); }
+  /// Set first n elements to zero (asynchronously on device)
+  inline void zero(const int n) { zero(n,cq()); }
+  /// Set each element to zero (asynchronously on device)
+  inline void zero(command_queue &cq) {
+    host.zero();
+    if (device.kind()!=UCL_VIEW) device.zero(cq);
+    else if (_buffer.numel()>0) _buffer.zero();
+  }
+  /// Set first n elements to zero (asynchronously on device)
+  inline void zero(const int n, command_queue &cq) {
+    host.zero(n);
+    if (device.kind()!=UCL_VIEW) device.zero(n,cq);
+    else if (_buffer.numel()>0) _buffer.zero();
+  }
   /// Get the number of elements
   inline size_t numel() const { return host.numel(); }
@@ -145,6 +158,8 @@ class UCL_Vector {
   /// Return the default command queue/stream associated with this data
   inline command_queue & cq() { return host.cq(); }
+  /// Change the default command queue associated with this data
+  inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); }
   /// Block until command_queue associated with matrix is complete
   inline void sync() { host.sync(); }
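The new `zero(command_queue&)` overloads make the device-side clear asynchronous on a caller-chosen queue and handle the shared-memory view case, while `cq(command_queue&)` rebinds the whole container to another queue. A short sketch under those assumptions (hypothetical names, initialized `UCL_Device dev`):

    UCL_Vector<float,float> f;
    f.alloc(n,dev,UCL_READ_WRITE,UCL_READ_WRITE);
    f.zero(dev.cq(1));   // clear runs asynchronously on command queue 1
    f.cq(dev.cq(1));     // subsequent default operations also use queue 1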

View File

@@ -44,10 +44,10 @@ bool AnswerT::alloc(const int inum) {
     _ans_fields+=4;
   // --------------------------- Device allocations
-  success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_RW_OPTIMIZED,
-                                 UCL_WRITE_ONLY)==UCL_SUCCESS);
-  success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_RW_OPTIMIZED,
-                                  UCL_WRITE_ONLY)==UCL_SUCCESS);
+  success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY,
+                                 UCL_READ_WRITE)==UCL_SUCCESS);
+  success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_READ_ONLY,
+                                  UCL_READ_WRITE)==UCL_SUCCESS);
   _gpu_bytes=engv.device.row_bytes()+force.device.row_bytes();
   _allocated=true;
@@ -175,78 +175,42 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
     return 0.0;
   double evdwl=0.0;
-  double virial_acc[6];
-  for (int i=0; i<6; i++) virial_acc[i]=0.0;
-  if (_ilist==NULL) {
-    for (int i=0; i<_inum; i++) {
-      int al=i;
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=engv[al];
-          eatom[i]+=engv[al]*0.5;
-          al+=_inum;
-        } else {
-          evdwl+=engv[al];
-          al+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[i][j]+=engv[al]*0.5;
-            virial_acc[j]+=engv[al];
-            al+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial_acc[j]+=engv[al];
-            al+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]+=virial_acc[j]*0.5;
-  } else {
-    for (int i=0; i<_inum; i++) {
-      int al=i;
-      int ii=_ilist[i];
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=engv[al];
-          eatom[ii]+=engv[al]*0.5;
-          al+=_inum;
-        } else {
-          evdwl+=engv[al];
-          al+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[ii][j]+=engv[al]*0.5;
-            virial_acc[j]+=engv[al];
-            al+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial_acc[j]+=engv[al];
-            al+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]+=virial_acc[j]*0.5;
-  }
-  evdwl*=0.5;
+  int vstart=0;
+  if (_eflag) {
+    for (int i=0; i<_inum; i++)
+      evdwl+=engv[i];
+    if (_ef_atom)
+      if (_ilist==NULL)
+        for (int i=0; i<_inum; i++)
+          eatom[i]+=engv[i];
+      else
+        for (int i=0; i<_inum; i++)
+          eatom[_ilist[i]]+=engv[i];
+    vstart=_inum;
+  }
+  if (_vflag) {
+    int iend=vstart+_inum;
+    for (int j=0; j<6; j++) {
+      for (int i=vstart; i<iend; i++)
+        virial[j]+=engv[i];
+      if (_vf_atom)
+        if (_ilist==NULL)
+          for (int i=vstart; i<iend; i++)
+            vatom[i][j]+=engv[i];
+        else
+          for (int i=vstart; i<iend; i++)
+            vatom[_ilist[i]][j]+=engv[i];
+      vstart+=_inum;
+      iend+=_inum;
+    }
+  }
   return evdwl;
 }
 template <class numtyp, class acctyp>
 double AnswerT::energy_virial(double *eatom, double **vatom,
                               double *virial, double &ecoul) {
   if (_eflag==false && _vflag==false)
     return 0.0;
@@ -254,84 +218,43 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
     return energy_virial(eatom,vatom,virial);
   double evdwl=0.0;
-  double _ecoul=0.0;
-  double virial_acc[6];
-  for (int i=0; i<6; i++) virial_acc[i]=0.0;
-  if (_ilist==NULL) {
-    for (int i=0; i<_inum; i++) {
-      int al=i;
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=engv[al];
-          eatom[i]+=engv[al]*0.5;
-          al+=_inum;
-          _ecoul+=engv[al];
-          eatom[i]+=engv[al]*0.5;
-          al+=_inum;
-        } else {
-          evdwl+=engv[al];
-          al+=_inum;
-          _ecoul+=engv[al];
-          al+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[i][j]+=engv[al]*0.5;
-            virial_acc[j]+=engv[al];
-            al+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial_acc[j]+=engv[al];
-            al+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]+=virial_acc[j]*0.5;
-  } else {
-    for (int i=0; i<_inum; i++) {
-      int al=i;
-      int ii=_ilist[i];
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=engv[al];
-          eatom[ii]+=engv[al]*0.5;
-          al+=_inum;
-          _ecoul+=engv[al];
-          eatom[ii]+=engv[al]*0.5;
-          al+=_inum;
-        } else {
-          evdwl+=engv[al];
-          al+=_inum;
-          _ecoul+=engv[al];
-          al+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[ii][j]+=engv[al]*0.5;
-            virial_acc[j]+=engv[al];
-            al+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial_acc[j]+=engv[al];
-            al+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]+=virial_acc[j]*0.5;
-  }
-  evdwl*=0.5;
-  ecoul+=_ecoul*0.5;
+  int vstart=0, iend=_inum*2;
+  if (_eflag) {
+    for (int i=0; i<_inum; i++)
+      evdwl+=engv[i];
+    for (int i=_inum; i<iend; i++)
+      ecoul+=engv[i];
+    if (_ef_atom)
+      if (_ilist==NULL) {
+        for (int i=0; i<_inum; i++)
+          eatom[i]+=engv[i];
+        for (int i=_inum; i<iend; i++)
+          eatom[i]+=engv[i];
+      } else {
+        for (int i=0; i<_inum; i++)
+          eatom[_ilist[i]]+=engv[i];
+        for (int i=_inum; i<iend; i++)
+          eatom[_ilist[i]]+=engv[i];
+      }
+    vstart=iend;
+    iend+=_inum;
+  }
+  if (_vflag) {
+    for (int j=0; j<6; j++) {
+      for (int i=vstart; i<iend; i++)
+        virial[j]+=engv[i];
+      if (_vf_atom)
+        if (_ilist==NULL)
+          for (int i=vstart; i<iend; i++)
+            vatom[i][j]+=engv[i];
+        else
+          for (int i=vstart; i<iend; i++)
+            vatom[_ilist[i]][j]+=engv[i];
+      vstart+=_inum;
+      iend+=_inum;
+    }
+  }
   return evdwl;
 }
@@ -373,4 +296,14 @@ void AnswerT::get_answers(double **f, double **tor) {
   }
 }
+template <class numtyp, class acctyp>
+void AnswerT::cq(const int cq_index) {
+  engv.cq(dev->cq(cq_index));
+  force.cq(dev->cq(cq_index));
+  time_answer.clear();
+  time_answer.init(*dev,dev->cq(cq_index));
+  time_answer.zero();
+}
 template class Answer<PRECISION,ACC_PRECISION>;
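With the pair-halving factor moved into the kernels (see the `*(acctyp)0.5` stores later in this commit), the host loops above reduce to straight sums over `engv`. The layout they assume, reconstructed from the code (each block is `_inum` entries long):

    // engv: [ e_vdwl | e_coul (charge styles only) | v0 | v1 | v2 | v3 | v4 | v5 ]
    // so totals accumulate as, e.g.:
    //   evdwl     = sum of engv[i]          for i in [0,_inum)
    //   virial[j] += sum of engv[vstart+i]  over the j-th virial block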

View File

@@ -47,6 +47,8 @@ class Answer {
   inline int inum() const { return _inum; }
   /// Set number of local atoms for future copy operations
   inline void inum(const int n) { _inum=n; }
+  /// Return the maximum number of atoms that can be stored currently
+  inline int max_inum() const { return _max_local; }
   /// Memory usage per atom in this class
   int bytes_per_atom() const;
@@ -132,6 +134,9 @@ class Answer {
   /// Return the time the CPU was idle waiting for GPU
   inline double cpu_idle_time() { return _time_cpu_idle; }
+  /// Change the command queue used for copies and timers
+  void cq(const int cq_index);
   // ------------------------------ DATA ----------------------------------
   /// Force and possibly torque
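A sketch of the two new accessors (assumes an initialized `Answer` object; the name `ans` is hypothetical):

    ans.cq(1);                  // copies and timers now run on dev->cq(1)
    int cap=ans.max_inum();     // current per-atom storage capacity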

View File

@@ -70,44 +70,47 @@ bool AtomT::alloc(const int nall) {
   // --------------------------- Device allocations
   int gpu_bytes=0;
-  success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
+  success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
                               UCL_READ_ONLY)==UCL_SUCCESS);
 #ifdef GPU_CAST
-  success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)==
-                      UCL_SUCCESS);
-  success=success && (type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)==
-                      UCL_SUCCESS);
+  success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_WRITE_ONLY,
+                                   UCL_READ_ONLY)==UCL_SUCCESS);
+  success=success && (type_cast.alloc(_max_atoms,*dev,UCL_WRITE_ONLY,
+                                      UCL_READ_ONLY)==UCL_SUCCESS);
   gpu_bytes+=x_cast.device.row_bytes()+type_cast.device.row_bytes();
 #endif
   if (_charge && _host_view==false) {
-    success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
+    success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_ONLY,
                                 UCL_READ_ONLY)==UCL_SUCCESS);
     gpu_bytes+=q.device.row_bytes();
   }
   if (_rot && _host_view==false) {
-    success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
+    success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
                                    UCL_READ_ONLY)==UCL_SUCCESS);
     gpu_bytes+=quat.device.row_bytes();
   }
   if (_gpu_nbor>0) {
     if (_bonds) {
-      success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      success=success && (dev_tag.alloc(_max_atoms,*dev,
+                                        UCL_READ_ONLY)==UCL_SUCCESS);
       gpu_bytes+=dev_tag.row_bytes();
     }
     if (_gpu_nbor==1) {
       success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
       gpu_bytes+=dev_cell_id.row_bytes();
     } else {
-      success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      success=success && (host_particle_id.alloc(_max_atoms,*dev,
+                                                 UCL_WRITE_ONLY)==UCL_SUCCESS);
       success=success &&
         (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
     }
     if (_gpu_nbor==2 && _host_view)
       dev_particle_id.view(host_particle_id);
     else
-      success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      success=success && (dev_particle_id.alloc(_max_atoms,*dev,
+                                                UCL_READ_ONLY)==UCL_SUCCESS);
     gpu_bytes+=dev_particle_id.row_bytes();
   }
@@ -130,7 +133,7 @@ bool AtomT::add_fields(const bool charge, const bool rot,
     _charge=true;
     _other=true;
     if (_host_view==false) {
-      success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
+      success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_ONLY,
                                   UCL_READ_ONLY)==UCL_SUCCESS);
       gpu_bytes+=q.device.row_bytes();
     }
@@ -140,7 +143,7 @@ bool AtomT::add_fields(const bool charge, const bool rot,
     _rot=true;
     _other=true;
     if (_host_view==false) {
-      success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
+      success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
                                      UCL_READ_ONLY)==UCL_SUCCESS);
       gpu_bytes+=quat.device.row_bytes();
     }
@@ -149,7 +152,8 @@ bool AtomT::add_fields(const bool charge, const bool rot,
   if (bonds && _bonds==false) {
     _bonds=true;
     if (_bonds && _gpu_nbor>0) {
-      success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      success=success && (dev_tag.alloc(_max_atoms,*dev,
+                                        UCL_READ_ONLY)==UCL_SUCCESS);
       gpu_bytes+=dev_tag.row_bytes();
     }
   }
@@ -163,17 +167,20 @@ bool AtomT::add_fields(const bool charge, const bool rot,
       return false;
     }
     #endif
-    success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+    success=success && (dev_particle_id.alloc(_max_atoms,*dev,
+                                              UCL_READ_ONLY)==UCL_SUCCESS);
     gpu_bytes+=dev_particle_id.row_bytes();
     if (_bonds) {
-      success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      success=success && (dev_tag.alloc(_max_atoms,*dev,
+                                        UCL_READ_ONLY)==UCL_SUCCESS);
       gpu_bytes+=dev_tag.row_bytes();
     }
     if (_gpu_nbor==1) {
       success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
       gpu_bytes+=dev_cell_id.row_bytes();
     } else {
-      success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      success=success && (host_particle_id.alloc(_max_atoms,*dev,
+                                                 UCL_WRITE_ONLY)==UCL_SUCCESS);
       success=success &&
         (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
     }
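The recurring substitution in this file pairs a host kind with a device kind that matches the data flow: positions, charges, and quaternions are written on the host every step and only read in kernels, while result buffers flow the other way. Both patterns are taken directly from this commit:

    // host writes, kernels read:
    x.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,UCL_READ_ONLY);
    // kernels write, host reads back (cf. engv/force in lal_answer.cpp):
    engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY,UCL_READ_WRITE);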

View File

@@ -77,12 +77,12 @@
     if (offset==0) { \
       engv+=ii; \
       if (eflag>0) { \
-        *engv=energy; \
+        *engv=energy*(acctyp)0.5; \
         engv+=inum; \
       } \
       if (vflag>0) { \
         for (int i=0; i<6; i++) { \
-          *engv=virial[i]; \
+          *engv=virial[i]*(acctyp)0.5; \
           engv+=inum; \
         } \
       } \
@@ -125,14 +125,14 @@
     if (offset==0) { \
       engv+=ii; \
       if (eflag>0) { \
-        *engv=energy; \
+        *engv=energy*(acctyp)0.5; \
         engv+=inum; \
-        *engv=e_coul; \
+        *engv=e_coul*(acctyp)0.5; \
         engv+=inum; \
       } \
       if (vflag>0) { \
         for (int i=0; i<6; i++) { \
-          *engv=virial[i]; \
+          *engv=virial[i]*(acctyp)0.5; \
           engv+=inum; \
         } \
       } \
@@ -160,12 +160,12 @@
     if (offset==0) { \
       engv+=ii; \
       if (eflag>0) { \
-        *engv=energy; \
+        *engv=energy*(acctyp)0.5; \
         engv+=inum; \
       } \
       if (vflag>0) { \
         for (int i=0; i<6; i++) { \
-          *engv=virial[i]; \
+          *engv=virial[i]*(acctyp)0.5; \
           engv+=inum; \
         } \
       } \
@@ -192,14 +192,14 @@
     if (offset==0) { \
       engv+=ii; \
       if (eflag>0) { \
-        *engv=energy; \
+        *engv=energy*(acctyp)0.5; \
         engv+=inum; \
-        *engv=e_coul; \
+        *engv=e_coul*(acctyp)0.5; \
         engv+=inum; \
       } \
       if (vflag>0) { \
         for (int i=0; i<6; i++) { \
-          *engv=virial[i]; \
+          *engv=virial[i]*(acctyp)0.5; \
           engv+=inum; \
         } \
       } \
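These store macros now fold the pair-count factor of one half into the per-atom values on the device (each i-j pair contributes once from atom i's neighbor loop and once from atom j's), which is what lets the host accumulation in lal_answer.cpp drop its `*0.5` and `evdwl*=0.5` steps. In effect:

    E_pair = 0.5 * sum_i sum_{j in neighbors(i)} e(r_ij)

with the 0.5 applied at `engv` store time rather than on the CPU.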

View File

@@ -272,12 +272,8 @@ void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str,
     return;
   std::string s_fast=std::string(kname)+"_fast";
-  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
-                    std::string(OCL_PRECISION_COMPILE)+" -D"+
-                    std::string(OCL_VENDOR);
   pair_program=new UCL_Program(dev);
-  pair_program->load_string(pair_str,flags.c_str());
+  pair_program->load_string(pair_str,device->compile_string().c_str());
   k_pair_fast.set_function(*pair_program,s_fast.c_str());
   k_pair.set_function(*pair_program,kname);
   pos_tex.get_texture(*pair_program,"pos_tex");
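Each style previously assembled its own OpenCL build flags from `OCL_PRECISION_COMPILE` and the compile-time `OCL_VENDOR` define; after this commit they all take the string prepared once by `Device::set_ocl_params()` (shown later in this diff). For the defaults here, the resulting flags would look roughly like this (illustrative value, not a literal from the source):

    // device->compile_string() ==
    //   "-cl-fast-relaxed-math -cl-mad-enable -D_SINGLE_SINGLE -DFERMI_OCL"
    pair_program->load_string(pair_str,device->compile_string().c_str());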

View File

@@ -288,12 +288,8 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
     return;
   std::string s_fast=std::string(kname)+"_fast";
-  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
-                    std::string(OCL_PRECISION_COMPILE)+" -D"+
-                    std::string(OCL_VENDOR);
   pair_program=new UCL_Program(dev);
-  pair_program->load_string(pair_str,flags.c_str());
+  pair_program->load_string(pair_str,device->compile_string().c_str());
   k_pair_fast.set_function(*pair_program,s_fast.c_str());
   k_pair.set_function(*pair_program,kname);
   pos_tex.get_texture(*pair_program,"pos_tex");

View File

@@ -296,12 +296,8 @@ void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str,
     return;
   std::string s_fast=std::string(kname)+"_fast";
-  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
-                    std::string(OCL_PRECISION_COMPILE)+" -D"+
-                    std::string(OCL_VENDOR);
   pair_program=new UCL_Program(dev);
-  pair_program->load_string(pair_str,flags.c_str());
+  pair_program->load_string(pair_str,device->compile_string().c_str());
   k_pair_fast.set_function(*pair_program,s_fast.c_str());
   k_pair.set_function(*pair_program,kname);
   pos_tex.get_texture(*pair_program,"pos_tex");

View File

@@ -455,9 +455,7 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
   std::string s_lj=kns+"_lj";
   std::string s_lj_fast=kns+"_lj_fast";
-  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
-                    std::string(OCL_PRECISION_COMPILE)+" -D"+
-                    std::string(OCL_VENDOR);
+  std::string flags=device->compile_string();
   nbor_program=new UCL_Program(dev);
   nbor_program->load_string(ellipsoid_nbor,flags.c_str());

View File

@@ -69,7 +69,7 @@ int BornT::init(const int ntypes, double **host_cutsq,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -54,7 +54,8 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -148,7 +149,8 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
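`n_stride` is uniform across the work-group (it depends on the neighbor-list layout, not on the work-item), so declaring it `__local` keeps a single copy in shared memory instead of one register per thread, which plausibly reduces register pressure; the same two-line change is applied to every pair kernel below. A generic OpenCL illustration of the idiom (not LAMMPS code; `nbor_info` itself handles initialization in the real kernels):

    __kernel void demo(__global const int *in, __global int *out) {
      __local int stride;              // one copy per work-group
      if (get_local_id(0)==0)
        stride=in[0];                  // a single work-item initializes it
      barrier(CLK_LOCAL_MEM_FENCE);    // make it visible to the whole group
      out[get_global_id(0)]=stride;
    }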

View File

@@ -73,7 +73,7 @@ int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rho
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -67,7 +67,8 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -188,7 +189,8 @@ __kernel void k_born_long_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -73,7 +73,7 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -70,7 +70,8 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -201,7 +202,8 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -68,7 +68,7 @@ int BuckT::init(const int ntypes, double **host_cutsq,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -53,7 +53,8 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -145,7 +146,8 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -70,7 +70,7 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -66,7 +66,8 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -182,7 +183,8 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -72,7 +72,7 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -67,7 +67,8 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -190,7 +191,8 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -69,7 +69,7 @@ int CGCMMT::init(const int ntypes, double **host_cutsq,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(cmm_types*cmm_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<cmm_types*cmm_types; i++)
     host_write[i]=0.0;

View File

@@ -53,7 +53,8 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -150,7 +151,8 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -74,7 +74,7 @@ int CGCMMLongT::init(const int ntypes, double **host_cutsq,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -66,7 +66,8 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -191,7 +192,8 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -76,7 +76,7 @@ int CHARMMLongT::init(const int ntypes,
   if (h_size<max_bio_shared_types)
     h_size=max_bio_shared_types;
   UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<h_size*32; i++)
     host_write[i]=0.0;

View File

@@ -67,7 +67,8 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -196,7 +197,8 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -73,7 +73,7 @@ int ColloidT::init(const int ntypes,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
@@ -95,7 +95,7 @@ int ColloidT::init(const int ntypes,
                     host_sigma3,host_sigma6);
   UCL_H_Vec<int> dview_form(lj_types*lj_types,*(this->ucl_device),
-                            UCL_WRITE_OPTIMIZED);
+                            UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++) dview_form[i]=0;
   form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);

View File

@@ -56,7 +56,8 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -219,7 +220,8 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -73,7 +73,7 @@ int CoulDSFT::init(const int ntypes, const int nlocal, const int nall,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -63,7 +63,8 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -163,7 +164,8 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -68,7 +68,7 @@ int CoulLongT::init(const int nlocal, const int nall, const int max_nbors,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -74,12 +74,12 @@ texture<int2> q_tex;
     if (eflag>0) { \
       *ap1=(acctyp)0; \
       ap1+=inum; \
-      *ap1=e_coul; \
+      *ap1=e_coul*(acctyp)0.5; \
       ap1+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *ap1=virial[i]; \
+        *ap1=virial[i]*(acctyp)0.5; \
        ap1+=inum; \
      } \
    } \
@@ -109,12 +109,12 @@ texture<int2> q_tex;
     if (eflag>0) { \
       *ap1=(acctyp)0; \
       ap1+=inum; \
-      *ap1=e_coul; \
+      *ap1=e_coul*(acctyp)0.5; \
       ap1+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *ap1=virial[i]; \
+        *ap1=virial[i]*(acctyp)0.5; \
        ap1+=inum; \
      } \
    } \
@@ -155,7 +155,8 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -244,7 +245,8 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -47,7 +47,8 @@ template <class numtyp, class acctyp>
 int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                          const int last_gpu, const int gpu_mode,
                          const double p_split, const int nthreads,
-                         const int t_per_atom, const double cell_size) {
+                         const int t_per_atom, const double cell_size,
+                         char *ocl_vendor) {
   _nthreads=nthreads;
 #ifdef _OPENMP
   omp_set_num_threads(nthreads);
@@ -140,6 +141,9 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
   _long_range_precompute=0;
+  if (set_ocl_params(ocl_vendor)!=0)
+    return -11;
   int flag=0;
   for (int i=0; i<_procs_per_gpu; i++) {
     if (_gpu_rank==i)
@@ -149,6 +153,64 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
   return flag;
 }
+template <class numtyp, class acctyp>
+int DeviceT::set_ocl_params(char *ocl_vendor) {
+#ifdef USE_OPENCL
+  std::string s_vendor=OCL_DEFAULT_VENDOR;
+  if (ocl_vendor!=NULL)
+    s_vendor=ocl_vendor;
+  if (s_vendor=="none")
+    s_vendor="generic";
+  if (s_vendor=="kepler") {
+    _ocl_vendor_name="NVIDIA Kepler";
+    #if defined (__APPLE__) || defined(MACOSX)
+    _ocl_vendor_string="-DKEPLER_OCL -DNO_OCL_PTX";
+    #else
+    _ocl_vendor_string="-DKEPLER_OCL";
+    #endif
+  } else if (s_vendor=="fermi") {
+    _ocl_vendor_name="NVIDIA Fermi";
+    _ocl_vendor_string="-DFERMI_OCL";
+  } else if (s_vendor=="cypress") {
+    _ocl_vendor_name="AMD Cypress";
+    _ocl_vendor_string="-DCYPRESS_OCL";
+  } else if (s_vendor=="generic") {
+    _ocl_vendor_name="GENERIC";
+    _ocl_vendor_string="-DGENERIC_OCL";
+  } else {
+    _ocl_vendor_name="CUSTOM";
+    _ocl_vendor_string="-DUSE_OPENCL";
+    int token_count=0;
+    std::string params[13];
+    char *pch = strtok(ocl_vendor,"\" ");
+    while (pch != NULL) {
+      if (token_count==13)
+        return -11;
+      params[token_count]=pch;
+      token_count++;
+      pch = strtok(NULL,"\" ");
+    }
+    _ocl_vendor_string+=" -DMEM_THREADS="+params[0]+
+                        " -DTHREADS_PER_ATOM="+params[1]+
+                        " -DTHREADS_PER_CHARGE="+params[2]+
+                        " -DBLOCK_PAIR="+params[3]+
+                        " -DMAX_SHARED_TYPES="+params[4]+
+                        " -DBLOCK_NBOR_BUILD="+params[5]+
+                        " -DBLOCK_BIO_PAIR="+params[6]+
+                        " -DBLOCK_ELLIPSE="+params[7]+
+                        " -DWARP_SIZE="+params[8]+
+                        " -DPPPM_BLOCK_1D="+params[9]+
+                        " -DBLOCK_CELL_2D="+params[10]+
+                        " -DBLOCK_CELL_ID="+params[11]+
+                        " -DMAX_BIO_SHARED_TYPES="+params[12];
+  }
+  _ocl_compile_string="-cl-fast-relaxed-math -cl-mad-enable "+
+                      std::string(OCL_PRECISION_COMPILE)+" "+_ocl_vendor_string;
+#endif
+  return 0;
+}
 template <class numtyp, class acctyp>
 int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
                   const bool rot, const int nlocal,
@@ -206,7 +268,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
   if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
                   *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
                   _block_cell_id, _block_nbor_build, threads_per_atom,
-                  _warp_size, _time_device))
+                  _warp_size, _time_device, compile_string()))
     return -3;
   if (_cell_size<0.0)
     nbor->cell_size(cell_size,cell_size);
@@ -274,7 +336,8 @@ void DeviceT::init_message(FILE *screen, const char *name,
   fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
 #endif
 #ifdef USE_OPENCL
-  fprintf(screen,"- with OpenCL Parameters for: %s\n",OCL_VENDOR);
+  fprintf(screen,"- with OpenCL Parameters for: %s\n",
+          _ocl_vendor_name.c_str());
 #endif
   fprintf(screen,"-------------------------------------");
   fprintf(screen,"-------------------------------------\n");
@@ -571,9 +634,8 @@ int DeviceT::compile_kernels() {
   if (_compiled)
     return flag;
-  std::string flags="-cl-mad-enable -D"+std::string(OCL_VENDOR);
   dev_program=new UCL_Program(*gpu);
-  int success=dev_program->load_string(device,flags.c_str());
+  int success=dev_program->load_string(device,compile_string().c_str());
   if (success!=UCL_SUCCESS)
     return -4;
   k_zero.set_function(*dev_program,"kernel_zero");
@@ -640,10 +702,11 @@ Device<PRECISION,ACC_PRECISION> global_device;
 int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                     const int last_gpu, const int gpu_mode,
                     const double particle_split, const int nthreads,
-                    const int t_per_atom, const double cell_size) {
+                    const int t_per_atom, const double cell_size,
+                    char *opencl_vendor) {
   return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
                                    particle_split,nthreads,t_per_atom,
-                                   cell_size);
+                                   cell_size,opencl_vendor);
 }
 void lmp_clear_device() {
@@ -654,3 +717,4 @@ double lmp_gpu_forces(double **f, double **tor, double *eatom,
                       double **vatom, double *virial, double &ecoul) {
   return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
 }
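`set_ocl_params()` turns the old compile-time `OCL_VENDOR` switch into a runtime choice: the vendor string now arrives through `lmp_init_device()` from the caller (LAMMPS's GPU-package setup) and selects a preset (`fermi`, `kepler`, `cypress`, `generic`) or, failing that, is parsed as exactly 13 whitespace-separated tuning values. A sketch of a custom string under that assumption (the numbers are illustrative only):

    // 13 tokens: MEM_THREADS, THREADS_PER_ATOM, THREADS_PER_CHARGE,
    // BLOCK_PAIR, MAX_SHARED_TYPES, BLOCK_NBOR_BUILD, BLOCK_BIO_PAIR,
    // BLOCK_ELLIPSE, WARP_SIZE, PPPM_BLOCK_1D, BLOCK_CELL_2D,
    // BLOCK_CELL_ID, MAX_BIO_SHARED_TYPES
    char vendor[]="16 4 8 128 8 128 128 128 32 64 8 128 128";
    lmp_init_device(world,replica,first_gpu,last_gpu,gpu_mode,
                    particle_split,nthreads,t_per_atom,cell_size,vendor);

Note that `strtok` modifies its argument, so the caller must pass writable storage, as the mutable array above does.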

View File

@@ -45,11 +45,13 @@ class Device {
    * - -2 if GPU not found
    * - -4 if GPU library not compiled for GPU
    * - -6 if GPU could not be initialized for use
-   * - -7 if accelerator sharing is not currently allowed on system **/
+   * - -7 if accelerator sharing is not currently allowed on system
+   * - -11 if vendor_string has the wrong number of parameters **/
   int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                   const int last_gpu, const int gpu_mode,
                   const double particle_split, const int nthreads,
-                  const int t_per_atom, const double cell_size);
+                  const int t_per_atom, const double cell_size,
+                  char *vendor_string);
   /// Initialize the device for Atom and Neighbor storage
   /** \param rot True if quaternions need to be stored
@@ -234,6 +236,8 @@ class Device {
   inline int max_bio_shared_types() const { return _max_bio_shared_types; }
   /// Architecture gpu code compiled for (returns 0 for OpenCL)
   inline double ptx_arch() const { return _ptx_arch; }
+  /// Number of threads executing concurrently on same multiproc
+  inline int warp_size() const { return _warp_size; }
   // -------------------- SHARED DEVICE ROUTINES --------------------
   // Perform asynchronous zero of integer array
@@ -279,6 +283,8 @@ class Device {
                        boxlo,prd);
   }
+  inline std::string compile_string() { return _ocl_compile_string; }
 private:
   std::queue<Answer<numtyp,acctyp> *> ans_queue;
   int _init_count;
@@ -305,6 +311,9 @@ class Device {
   int _data_in_estimate, _data_out_estimate;
+  std::string _ocl_vendor_name, _ocl_vendor_string, _ocl_compile_string;
+  int set_ocl_params(char *);
   template <class t>
   inline std::string toa(const t& in) {
     std::ostringstream o;

View File

@@ -72,7 +72,7 @@ int DipoleLJT::init(const int ntypes,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -75,14 +75,14 @@ texture<int4,1> mu_tex;
     if (offset==0) { \
       engv+=ii; \
       if (eflag>0) { \
-        *engv=energy; \
+        *engv=energy*(acctyp)0.5; \
         engv+=inum; \
-        *engv=e_coul; \
+        *engv=e_coul*(acctyp)0.5; \
         engv+=inum; \
       } \
       if (vflag>0) { \
         for (int i=0; i<6; i++) { \
-          *engv=virial[i]; \
+          *engv=virial[i]*(acctyp)0.5; \
           engv+=inum; \
         } \
       } \
@@ -115,14 +115,14 @@ texture<int4,1> mu_tex;
     if (offset==0) { \
       engv+=ii; \
       if (eflag>0) { \
-        *engv=energy; \
+        *engv=energy*(acctyp)0.5; \
         engv+=inum; \
-        *engv=e_coul; \
+        *engv=e_coul*(acctyp)0.5; \
         engv+=inum; \
       } \
       if (vflag>0) { \
         for (int i=0; i<6; i++) { \
-          *engv=virial[i]; \
+          *engv=virial[i]*(acctyp)0.5; \
           engv+=inum; \
         } \
       } \
@@ -174,7 +174,8 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -385,7 +386,8 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -72,7 +72,7 @@ int DipoleLJSFT::init(const int ntypes,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -76,14 +76,14 @@ texture<int4,1> mu_tex;
     if (offset==0) { \
       engv+=ii; \
       if (eflag>0) { \
-        *engv=energy; \
+        *engv=energy*(acctyp)0.5; \
         engv+=inum; \
-        *engv=e_coul; \
+        *engv=e_coul*(acctyp)0.5; \
         engv+=inum; \
       } \
       if (vflag>0) { \
         for (int i=0; i<6; i++) { \
-          *engv=virial[i]; \
+          *engv=virial[i]*(acctyp)0.5; \
           engv+=inum; \
         } \
       } \
@@ -116,14 +116,14 @@ texture<int4,1> mu_tex;
     if (offset==0) { \
       engv+=ii; \
       if (eflag>0) { \
-        *engv=energy; \
+        *engv=energy*(acctyp)0.5; \
         engv+=inum; \
-        *engv=e_coul; \
+        *engv=e_coul*(acctyp)0.5; \
         engv+=inum; \
       } \
       if (vflag>0) { \
         for (int i=0; i<6; i++) { \
-          *engv=virial[i]; \
+          *engv=virial[i]*(acctyp)0.5; \
           engv+=inum; \
         } \
       } \
@@ -175,7 +175,8 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -418,7 +419,8 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -66,7 +66,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
     ef_nall=2000;
   _max_fp_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
-  _fp.alloc(_max_fp_size,*(this->ucl_device),UCL_RW_OPTIMIZED,UCL_WRITE_ONLY);
+  _fp.alloc(_max_fp_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
   k_energy.set_function(*(this->pair_program),"k_energy");
   k_energy_fast.set_function(*(this->pair_program),"k_energy_fast");
@@ -106,7 +106,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
   _nr=nr;
   UCL_H_Vec<int2> dview_type(lj_types*lj_types,*(this->ucl_device),
-                             UCL_WRITE_OPTIMIZED);
+                             UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++) {
     dview_type[i].x=0; dview_type[i].y=0;
@@ -126,7 +126,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
   // pack type2frho
   UCL_H_Vec<int> dview_type2frho(lj_types,*(this->ucl_device),
-                                 UCL_WRITE_OPTIMIZED);
+                                 UCL_WRITE_ONLY);
   type2frho.alloc(lj_types,*(this->ucl_device),UCL_READ_ONLY);
   for (int i=0; i<ntypes; i++)
@@ -135,7 +135,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
   // pack frho_spline
   UCL_H_Vec<numtyp4> dview_frho_spline(nfrho*(nrho+1),*(this->ucl_device),
-                                       UCL_WRITE_OPTIMIZED);
+                                       UCL_WRITE_ONLY);
   for (int ix=0; ix<nfrho; ix++)
     for (int iy=0; iy<nrho+1; iy++) {
@@ -165,7 +165,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
   // pack rhor_spline
   UCL_H_Vec<numtyp4> dview_rhor_spline(nrhor*(nr+1),*(this->ucl_device),
-                                       UCL_WRITE_OPTIMIZED);
+                                       UCL_WRITE_ONLY);
   for (int ix=0; ix<nrhor; ix++)
     for (int iy=0; iy<nr+1; iy++) {
@@ -195,7 +195,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
   // pack z2r_spline
   UCL_H_Vec<numtyp4> dview_z2r_spline(nz2r*(nr+1),*(this->ucl_device),
-                                      UCL_WRITE_OPTIMIZED);
+                                      UCL_WRITE_ONLY);
   for (int ix=0; ix<nz2r; ix++)
     for (int iy=0; iy<nr+1; iy++) {
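`_fp` switches to `(UCL_READ_WRITE, UCL_READ_WRITE)`, presumably because the embedding-energy derivatives are produced on the device but also read back and updated through the host for the communication step between the two EAM force stages; the various `dview_*` spline tables keep the plain one-shot staging role. Sketch of that staging pattern, as used throughout this init:

    // Hypothetical staging buffer: filled once on the host, then copied
    // into read-only device storage (the kinds match this commit).
    UCL_H_Vec<numtyp4> dview(n,*(this->ucl_device),UCL_WRITE_ONLY);
    // ...fill dview[i] on the host, then transfer to the device table...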

View File

@@ -79,7 +79,7 @@ texture<int4> z2r_sp2_tex;
     fetch4(coeff,index,frho_sp2_tex); \
     energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \
     if (rho > rhomax) energy += fp*(rho-rhomax); \
-    engv[ii]=(acctyp)2.0*energy; \
+    engv[ii]=energy; \
   } \
 }
@@ -116,12 +116,12 @@ texture<int4> z2r_sp2_tex;
   } \
   if (offset==0) { \
     if (eflag>0) { \
-      engv[ii]+=energy; \
+      engv[ii]+=energy*(acctyp)0.5; \
       engv+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        engv[ii]=virial[i]; \
+        engv[ii]=virial[i]*(acctyp)0.5; \
         engv+=inum; \
       } \
     } \
@@ -150,7 +150,7 @@ texture<int4> z2r_sp2_tex;
     fetch4(coeff,index,frho_sp2_tex); \
     energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \
     if (rho > rhomax) energy += fp*(rho-rhomax); \
-    engv[ii]=(acctyp)2.0*energy; \
+    engv[ii]=energy; \
   } \
 }
@@ -173,12 +173,12 @@ texture<int4> z2r_sp2_tex;
   if (offset==0) { \
     engv+=ii; \
     if (eflag>0) { \
-      *engv+=energy; \
+      *engv+=energy*(acctyp)0.5; \
      engv+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]; \
+        *engv=virial[i]*(acctyp)0.5; \
         engv+=inum; \
       } \
     } \
@@ -210,7 +210,8 @@ __kernel void k_energy(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -286,7 +287,8 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -353,7 +355,8 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -466,7 +469,8 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
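
Note on the (acctyp)0.5 factors above: they move the double-counting correction into the accumulation step. With a full neighbor list each i-j pair is visited once from atom i and once from atom j, so per-atom energy and virial tallies are halved to keep the global sums correct (and the (acctyp)2.0 pre-scaling of the embedding energy becomes unnecessary). A CPU sketch of that bookkeeping:

// Sketch of the half-neighbor bookkeeping the hunks above implement: with a
// full neighbor list every pair (i,j) is visited twice, once from each atom,
// so per-atom tallies are scaled by 0.5 to keep the total correct.
#include <cstdio>

int main() {
  const int n = 3;
  // Toy symmetric pair energies phi(i,j); values are arbitrary.
  double phi[3][3] = {{0, 1.0, 2.0}, {1.0, 0, 4.0}, {2.0, 4.0, 0}};

  double pair_sum = 0.0;            // each pair counted once
  for (int i = 0; i < n; i++)
    for (int j = i + 1; j < n; j++)
      pair_sum += phi[i][j];

  double per_atom_total = 0.0;      // full-list traversal, each pair twice
  for (int i = 0; i < n; i++) {
    double e_i = 0.0;
    for (int j = 0; j < n; j++)
      if (j != i) e_i += phi[i][j];
    per_atom_total += 0.5 * e_i;    // the (acctyp)0.5 factor in the kernels
  }

  printf("pair sum %.1f, halved per-atom sum %.1f\n", pair_sum, per_atom_total);
  return 0;  // both print 7.0
}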

View File

@@ -83,12 +83,12 @@ texture<int4,1> pos_tex, quat_tex;
   if (offset==0) { \
     __global acctyp *ap1=engv+ii; \
     if (eflag>0) { \
-      *ap1=energy; \
+      *ap1=energy*(acctyp)0.5; \
       ap1+=astride; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *ap1=virial[i]; \
+        *ap1=virial[i]*(acctyp)0.5; \
         ap1+=astride; \
       } \
     } \
@@ -130,12 +130,12 @@ texture<int4,1> pos_tex, quat_tex;
   if (offset==0) { \
     engv+=ii; \
     if (eflag>0) { \
-      *engv+=energy; \
+      *engv+=energy*(acctyp)0.5; \
       engv+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv+=virial[i]; \
+        *engv+=virial[i]*(acctyp)0.5; \
         engv+=inum; \
       } \
     } \
@@ -170,12 +170,12 @@ texture<int4,1> pos_tex, quat_tex;
   if (offset==0) { \
     __global acctyp *ap1=engv+ii; \
     if (eflag>0) { \
-      *ap1=energy; \
+      *ap1=energy*(acctyp)0.5; \
       ap1+=astride; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *ap1=virial[i]; \
+        *ap1=virial[i]*(acctyp)0.5; \
         ap1+=astride; \
       } \
     } \
@@ -202,12 +202,12 @@ texture<int4,1> pos_tex, quat_tex;
   if (offset==0) { \
     engv+=ii; \
     if (eflag>0) { \
-      *engv+=energy; \
+      *engv+=energy*(acctyp)0.5; \
       engv+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv+=virial[i]; \
+        *engv+=virial[i]*(acctyp)0.5; \
         engv+=inum; \
       } \
     } \

View File

@@ -68,7 +68,7 @@ int GaussT::init(const int ntypes,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -52,7 +52,8 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -138,7 +139,8 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -77,7 +77,7 @@ int GayBerneT::init(const int ntypes, const double gamma,
   // Allocate a host write buffer for copying type data
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
@@ -98,7 +98,7 @@ int GayBerneT::init(const int ntypes, const double gamma,
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
                          host_offset);
-  dev_error.alloc(1,*(this->ucl_device));
+  dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY);
   dev_error.zero();
   // Allocate, cast and asynchronous memcpy of constant data
@@ -258,6 +258,9 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
                      &ainum, &this->_threads_per_atom);
     this->time_ellipsoid2.stop();
   } else {
+    GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
+                                                 this->_last_ellipse)/
+                             (BX/this->_threads_per_atom)));
     this->ans->force.zero();
     this->ans->engv.zero();
     this->time_nbor1.stop();

View File

@@ -120,7 +120,8 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *nbor_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,nbor_end,nbor);

View File

@@ -54,7 +54,8 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *nbor_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,nbor_end,nbor);
@@ -276,7 +277,8 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,list_end,nbor);
@@ -370,7 +372,8 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,list_end,nbor);

View File

@@ -69,7 +69,7 @@ int LJT::init(const int ntypes,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -53,7 +53,8 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -142,7 +143,8 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -69,7 +69,7 @@ int LJ96T::init(const int ntypes,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -53,7 +53,8 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -143,7 +144,8 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -73,7 +73,7 @@ int LJClass2LongT::init(const int ntypes, double **host_cutsq,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -66,7 +66,8 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -187,7 +188,8 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -72,7 +72,7 @@ int LJCoulT::init(const int ntypes,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -66,7 +66,8 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -178,7 +179,8 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -73,7 +73,7 @@ int LJCoulDebyeT::init(const int ntypes,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -67,7 +67,8 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -186,7 +187,8 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -73,7 +73,7 @@ int LJCoulLongT::init(const int ntypes,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -66,7 +66,8 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -183,7 +184,8 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -77,7 +77,7 @@ int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -69,7 +69,8 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -195,7 +196,8 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -69,7 +69,7 @@ int LJExpandT::init(const int ntypes, double **host_cutsq,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;

View File

@@ -55,7 +55,8 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -147,7 +148,8 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -69,7 +69,7 @@ int MorseT::init(const int ntypes,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(types*types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<types*types; i++)
     host_write[i]=0.0;

View File

@@ -55,7 +55,8 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -145,7 +146,8 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

View File

@@ -38,7 +38,8 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
                     const bool pre_cut, const int block_cell_2d,
                     const int block_cell_id, const int block_nbor_build,
                     const int threads_per_atom, const int warp_size,
-                    const bool time_device) {
+                    const bool time_device,
+                    const std::string compile_flags) {
   clear();
   _threads_per_atom=threads_per_atom;
@@ -92,13 +93,13 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
   if (gpu_nbor==0)
     success=success && (host_packed.alloc(2*IJ_SIZE,*dev,
-                                          UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
+                                          UCL_WRITE_ONLY)==UCL_SUCCESS);
   alloc(success);
   if (!success)
     return false;

   if (_use_packing==false)
-    _shared->compile_kernels(devi,gpu_nbor);
+    _shared->compile_kernels(devi,gpu_nbor,compile_flags);

   return success;
 }
@@ -114,7 +115,7 @@ void Neighbor::alloc(bool &success) {
   success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
                                      UCL_READ_ONLY)==UCL_SUCCESS);
   success=success && (host_acc.alloc(nt*2,*dev,
-                                     UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
+                                     UCL_READ_WRITE)==UCL_SUCCESS);
   _c_bytes=dev_nbor.row_bytes();
   if (_alloc_packed) {
@@ -129,10 +130,10 @@ void Neighbor::alloc(bool &success) {
     host_ilist.clear();
     host_jlist.clear();
-    success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_RW_OPTIMIZED,
-                             UCL_WRITE_ONLY)==UCL_SUCCESS) && success;
+    success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_READ_WRITE,
+                             UCL_READ_WRITE)==UCL_SUCCESS) && success;
     success=success && (dev_numj_host.alloc(_max_host,*dev,
-                                            UCL_WRITE_ONLY)==UCL_SUCCESS);
+                                            UCL_READ_WRITE)==UCL_SUCCESS);
     success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
     if (!success)
       return;
@@ -161,7 +162,7 @@ void Neighbor::alloc(bool &success) {
     success=success && (dev_nspecial.alloc(3*at,*dev,
                                            UCL_READ_ONLY)==UCL_SUCCESS);
     success=success && (dev_special.alloc(_maxspecial*at,*dev,
-                                          UCL_READ_ONLY)==UCL_SUCCESS);
+                                          UCL_READ_WRITE)==UCL_SUCCESS);
     success=success && (dev_special_t.alloc(_maxspecial*at,*dev,
                                             UCL_READ_ONLY)==UCL_SUCCESS);
     _gpu_bytes+=dev_nspecial.row_bytes()+dev_special.row_bytes()+
@@ -178,11 +179,9 @@ void Neighbor::clear() {
   _bin_time=0.0;
   if (_ncells>0) {
     _ncells=0;
-    dev_cell_counts.clear();
-    if (_gpu_nbor==2) {
-      host_cell_counts.clear();
+    cell_counts.clear();
+    if (_gpu_nbor==2)
       delete [] cell_iter;
-    }
   }
   if (_allocated) {
     _allocated=false;
@@ -286,6 +285,80 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
   }
 }

+// This is the same as get host, but the requirement that ilist[i]=i and
+// inum=nlocal is forced to be true to allow direct indexing of neighbors of
+// neighbors
+void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
+                         int **firstneigh, const int block_size) {
+  _nbor_time_avail=true;
+  time_nbor.start();
+
+  UCL_H_Vec<int> ilist_view;
+  ilist_view.view(ilist,inum,*dev);
+  ucl_copy(dev_nbor,ilist_view,false);
+
+  UCL_D_Vec<int> nbor_offset;
+  UCL_H_Vec<int> host_offset;
+
+  int copy_count=0;
+  int ij_count=0;
+  int acc_count=0;
+  int dev_count=0;
+  int *h_ptr=host_packed.begin();
+  _nbor_pitch=inum;
+
+  if (nlist!=inum)
+    host_acc.zero(inum);
+  for (int ii=0; ii<nlist; ii++) {
+    int i=ilist[ii];
+    int nj=numj[i];
+    host_acc[i]=nj;
+    host_acc[i+inum]=acc_count;
+    acc_count+=nj;
+  }
+
+  for (int i=0; i<inum; i++) {
+    int nj=host_acc[i];
+    int *jlist=firstneigh[i];
+    for (int jj=0; jj<nj; jj++) {
+      *h_ptr=jlist[jj];
+      h_ptr++;
+      ij_count++;
+
+      if (ij_count==IJ_SIZE) {
+        dev_nbor.sync();
+        host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
+        nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
+        ucl_copy(nbor_offset,host_offset,true);
+        copy_count++;
+        ij_count=0;
+        dev_count+=IJ_SIZE;
+        h_ptr=host_packed.begin()+(IJ_SIZE*(copy_count%2));
+      }
+    }
+  }
+
+  if (ij_count!=0) {
+    dev_nbor.sync();
+    host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
+    nbor_offset.view_offset(dev_count,dev_packed,ij_count);
+    ucl_copy(nbor_offset,host_offset,true);
+  }
+
+  UCL_D_Vec<int> acc_view;
+  acc_view.view_offset(inum,dev_nbor,inum*2);
+  ucl_copy(acc_view,host_acc,true);
+  time_nbor.stop();
+
+  if (_use_packing==false) {
+    time_kernel.start();
+    int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
+                                 block_size));
+    _shared->k_nbor.set_size(GX,block_size);
+    _shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom);
+    time_kernel.stop();
+  }
+}
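
Note on the IJ_SIZE loop in the new get_host3 above: it ping-pongs between the two halves of the pinned staging buffer, so the host can pack the next chunk while the previous asynchronous copy is still in flight. A CPU-only sketch of that double-buffering pattern (the device is stood in by a plain array, and the copy is synchronous):

// CPU-only sketch of the double-buffered staging pattern used by get_host3:
// while chunk k is copied, chunk k+1 is packed into the other buffer half.
#include <cstdio>
#include <vector>

const int IJ_SIZE = 4;  // tiny chunk size for illustration

int main() {
  std::vector<int> src = {1,2,3,4,5,6,7,8,9,10};
  std::vector<int> staging(2 * IJ_SIZE);  // two halves, ping-pong
  std::vector<int> device(src.size());

  int copy_count = 0, ij_count = 0, dev_count = 0;
  int *h_ptr = staging.data();
  for (size_t k = 0; k < src.size(); k++) {
    *h_ptr++ = src[k];
    if (++ij_count == IJ_SIZE) {
      // stand-in for the asynchronous ucl_copy of one full chunk
      for (int m = 0; m < IJ_SIZE; m++)
        device[dev_count + m] = staging[IJ_SIZE * (copy_count % 2) + m];
      copy_count++;
      ij_count = 0;
      dev_count += IJ_SIZE;
      h_ptr = staging.data() + IJ_SIZE * (copy_count % 2);  // switch halves
    }
  }
  if (ij_count)  // flush the partial last chunk
    for (int m = 0; m < ij_count; m++)
      device[dev_count + m] = staging[IJ_SIZE * (copy_count % 2) + m];

  for (int v : device) printf("%d ", v);  // prints 1..10
  printf("\n");
  return 0;
}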
 template <class numtyp, class acctyp>
 void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
   if (maxn>_max_nbors) {
@@ -330,24 +403,20 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   ncellz = static_cast<int>(ceil((subhi[2]-sublo[2])/_cell_size))+ghost_cells;
   ncell_3d = ncellx * ncelly * ncellz;
   if (ncell_3d+1>_ncells) {
-    if (_gpu_nbor==2) {
-      if (_ncells>0) {
-        host_cell_counts.clear();
-        delete [] cell_iter;
-      }
-      cell_iter = new int[ncell_3d+1];
-      host_cell_counts.alloc(ncell_3d+1,dev_nbor);
-    }
-    if (_gpu_nbor==2 && atom.host_view())
-      dev_cell_counts.view(host_cell_counts);
-    else {
-      dev_cell_counts.clear();
-      dev_cell_counts.alloc(ncell_3d+1,dev_nbor);
+    cell_counts.clear();
+    if (_gpu_nbor==2) {
+      if (_ncells>0)
+        delete [] cell_iter;
+      cell_iter = new int[ncell_3d+1];
+      cell_counts.alloc(ncell_3d+1,dev_nbor,UCL_READ_WRITE,UCL_READ_ONLY);
+    } else {
+      cell_counts.device.clear();
+      cell_counts.device.alloc(ncell_3d+1,dev_nbor);
     }
     _ncells=ncell_3d+1;
-    _cell_bytes=dev_cell_counts.row_bytes();
+    _cell_bytes=cell_counts.device.row_bytes();
   }

   const numtyp cutoff_cast=static_cast<numtyp>(_cutoff);
@@ -381,7 +450,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
     int *particle_id=atom.host_particle_id.begin();

     // Build cell list on CPU
-    host_cell_counts.zero();
+    cell_counts.host.zero();
     double i_cell_size=1.0/_cell_size;
     int offset_hi=_cells_in_cutoff+1;
@@ -403,7 +472,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
       int id = ix+iy*ncellx+iz*ncellx*ncelly;
       cell_id[i] = id;
-      host_cell_counts[id+1]++;
+      cell_counts[id+1]++;
     }
     for (int i=nt; i<nall; i++) {
@@ -424,12 +493,12 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
       int id = ix+iy*ncellx+iz*ncellx*ncelly;
       cell_id[i] = id;
-      host_cell_counts[id+1]++;
+      cell_counts[id+1]++;
     }

     mn=0;
     for (int i=0; i<_ncells; i++)
-      mn=std::max(mn,host_cell_counts[i]);
+      mn=std::max(mn,cell_counts[i]);
     mn*=8;
     set_nbor_block_size(mn/2);
@@ -440,11 +509,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
     cell_iter[0]=0;
     for (int i=1; i<_ncells; i++) {
-      host_cell_counts[i]+=host_cell_counts[i-1];
-      cell_iter[i]=host_cell_counts[i];
+      cell_counts[i]+=cell_counts[i-1];
+      cell_iter[i]=cell_counts[i];
     }
     time_hybrid1.start();
-    ucl_copy(dev_cell_counts,host_cell_counts,true);
+    cell_counts.update_device(true);
     time_hybrid1.stop();
     for (int i=0; i<nall; i++) {
       int celli=cell_id[i];
@@ -481,7 +550,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
     /* calculate cell count */
     _shared->k_cell_counts.set_size(GX,neigh_block);
-    _shared->k_cell_counts.run(&atom.dev_cell_id, &dev_cell_counts, &nall,
+    _shared->k_cell_counts.run(&atom.dev_cell_id, &cell_counts, &nall,
                                &ncell_3d);
   }
@@ -490,7 +559,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   _shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)*
                                  (ncellz-ghost_cells),cell_block,1);
   _shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id,
-                            &dev_cell_counts, &dev_nbor, &nbor_host,
+                            &cell_counts, &dev_nbor, &nbor_host,
                             &dev_numj_host, &_max_nbors, &cutoff_cast, &ncellx,
                             &ncelly, &ncellz, &inum, &nt, &nall,
                             &_threads_per_atom, &_cells_in_cutoff);
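
Note on the host path above: it is a counting sort. Atoms are binned with id = ix + iy*ncellx + iz*ncellx*ncelly, occupancies are tallied into cell_counts[id+1], and the running sum turns counts into per-cell start offsets so atoms can be scattered into cell order. A one-dimensional sketch of the same construction:

// Sketch of the host cell-list construction above, reduced to 1-D: count
// atoms per cell into counts[id+1], prefix-sum to get cell start offsets,
// then scatter atoms into cell order (a counting sort).
#include <cstdio>
#include <vector>

int main() {
  const double cell_size = 1.0;
  const int ncells = 4;
  std::vector<double> x = {0.2, 2.7, 1.1, 0.9, 2.2, 3.5};

  std::vector<int> counts(ncells + 1, 0), cell_id(x.size());
  for (size_t i = 0; i < x.size(); i++) {
    int id = static_cast<int>(x[i] / cell_size);
    cell_id[i] = id;
    counts[id + 1]++;                    // counts[id+1], as in the code above
  }
  for (int i = 1; i <= ncells; i++)      // running sum -> start offsets
    counts[i] += counts[i - 1];

  std::vector<int> order(x.size());
  std::vector<int> iter(counts.begin(), counts.end());  // like cell_iter
  for (size_t i = 0; i < x.size(); i++)
    order[iter[cell_id[i]]++] = static_cast<int>(i);

  for (int i : order) printf("%d ", i);  // atoms grouped by cell: 0 3 2 4 1 5
  printf("\n");
  return 0;
}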

View File

@@ -47,13 +47,15 @@ class Neighbor {
     * \param pre_cut True if cutoff test will be performed in separate kernel
     *        than the force kernel
     * \param threads_per_atom Number of threads used per atom for force
-    *        calculation **/
+    *        calculation
+    * \param compile_flags Flags for JIT compiling **/
   bool init(NeighborShared *shared, const int inum, const int host_inum,
             const int max_nbors, const int maxspecial, UCL_Device &dev,
             const int gpu_nbor, const int gpu_host, const bool pre_cut,
             const int block_cell_2d, const int block_cell_id,
             const int block_nbor_build, const int threads_per_atom,
-            const int warp_size, const bool time_device);
+            const int warp_size, const bool time_device,
+            const std::string compile_flags);

   /// Set the size of the cutoff+skin
   inline void cell_size(const double size, const double cutoff) {
@@ -143,6 +145,10 @@ class Neighbor {
   void get_host(const int inum, int *ilist, int *numj,
                 int **firstneigh, const int block_size);

+  /// Copy neighbor list from host for 3-body (first time or from a rebuild)
+  void get_host3(const int inum, const int nlist, int *ilist, int *numj,
+                 int **firstneigh, const int block_size);
+
   /// Return the stride in elements for each nbor row
   inline int nbor_pitch() const { return _nbor_pitch; }
@@ -207,11 +213,9 @@ class Neighbor {
   UCL_D_Vec<int> dev_nspecial;
   /// Device storage for special neighbors
   UCL_D_Vec<int> dev_special, dev_special_t;
-  /// Host storage for number of particles per cell
-  UCL_H_Vec<int> host_cell_counts;
+  /// Host/Device storage for number of particles per cell
+  UCL_Vector<int,int> cell_counts;
   int *cell_iter;
-  /// Device storage for number of particles per cell
-  UCL_D_Vec<int> dev_cell_counts;

   /// Device timers
   UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose;

View File

@@ -48,15 +48,12 @@ void NeighborShared::clear() {
   }
 }

-void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor) {
+void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor,
+                                     const std::string flags) {
   if (_compiled)
     return;

   _gpu_nbor=gpu_nbor;
-  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
-                    std::string(OCL_PRECISION_COMPILE)+" -D"+
-                    std::string(OCL_VENDOR);
   if (_gpu_nbor==0) {
     nbor_program=new UCL_Program(dev);
     nbor_program->load_string(neighbor_cpu,flags.c_str());
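
Note: with the hard-coded string removed, the caller now supplies the JIT flags (cf. device->compile_string() in the PPPM hunk below), so precision and vendor-tuning defines are decided in one place. An illustrative assembly of such a string; vendor_macro below is a hypothetical helper, though the flag fragments and vendor names all appear in this commit:

// Sketch of centralized OpenCL JIT flag assembly, in the spirit of the
// compile_string() call that replaces the hard-coded flags here.
#include <cstdio>
#include <string>

// Map the OCL_DEFAULT_VENDOR strings introduced in this commit back to the
// tuning macros the kernels test for (assumed mapping, for illustration).
std::string vendor_macro(const std::string &v) {
  if (v == "fermi")   return "-DFERMI_OCL";
  if (v == "kepler")  return "-DKEPLER_OCL";
  if (v == "cypress") return "-DCYPRESS_OCL";
  return "-DGENERIC_OCL";
}

std::string compile_string(const std::string &precision, const std::string &v) {
  return "-cl-fast-relaxed-math -cl-mad-enable " + precision + " " +
         vendor_macro(v);
}

int main() {
  printf("%s\n", compile_string("-D_SINGLE_SINGLE", "kepler").c_str());
  return 0;
}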

View File

@@ -44,7 +44,8 @@ class NeighborShared {
   UCL_Texture neigh_tex;

   /// Compile kernels for neighbor lists
-  void compile_kernels(UCL_Device &dev, const int gpu_nbor);
+  void compile_kernels(UCL_Device &dev, const int gpu_nbor,
+                       const std::string flags);

   // ----------------------------- Kernels
   UCL_Program *nbor_program, *build_program;

View File

@@ -136,10 +136,10 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
   _npts_y=nyhi_out-nylo_out+1;
   _npts_z=nzhi_out-nzlo_out+1;
   _npts_yx=_npts_x*_npts_y;
-  success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
-                      UCL_SUCCESS);
-  success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
-                      UCL_SUCCESS);
+  success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device,
+                                  UCL_READ_ONLY,UCL_WRITE_ONLY)==UCL_SUCCESS);
+  success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device,
+                                     UCL_READ_WRITE,UCL_READ_ONLY)==UCL_SUCCESS);
   *vd_brick_p=vd_brick.host.begin();
   _max_bytes+=brick.device.row_bytes()+vd_brick.device.row_bytes();
@@ -159,7 +159,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
   _max_bytes+=d_brick_atoms.row_bytes();

   // Allocate error flags for checking out of bounds atoms
-  success=success && (error_flag.alloc(1,*ucl_device,UCL_RW_OPTIMIZED,
+  success=success && (error_flag.alloc(1,*ucl_device,UCL_READ_ONLY,
                       UCL_WRITE_ONLY)==UCL_SUCCESS);
   if (!success) {
     flag=-3;
@@ -374,9 +374,7 @@ void PPPMT::compile_kernels(UCL_Device &dev) {
   if (sizeof(grdtyp)==sizeof(double) && ucl_device->double_precision()==false)
     return;

-  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
-                    std::string(OCL_PRECISION_COMPILE)+" -D"+
-                    std::string(OCL_VENDOR);
+  std::string flags=device->compile_string();
 #ifdef USE_OPENCL
   flags+=std::string(" -Dgrdtyp=")+ucl_template_name<grdtyp>()+" -Dgrdtyp4="+
          ucl_template_name<grdtyp>()+"4";

View File

@@ -97,11 +97,11 @@ float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen,
                         const int nzhi_out, float **rho_coeff,
                         float **vd_brick, const double slab_volfactor,
                         const int nx_pppm, const int ny_pppm, const int nz_pppm,
-                        const bool split, int &success) {
+                        const bool split, const bool respa, int &success) {
   float *b=pppm_gpu_init(PPPMF,nlocal,nall,screen,order,nxlo_out,nylo_out,
                          nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,
                          slab_volfactor,nx_pppm,ny_pppm,nz_pppm,split,success);
-  if (split==false)
+  if (split==false && respa==false)
     PPPMF.device->set_single_precompute(&PPPMF);
   return b;
 }
@@ -139,12 +139,13 @@ double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen,
                          const int nzhi_out, double **rho_coeff,
                          double **vd_brick, const double slab_volfactor,
                          const int nx_pppm, const int ny_pppm,
-                         const int nz_pppm, const bool split, int &success) {
+                         const int nz_pppm, const bool split,
+                         const bool respa, int &success) {
   double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out,
                           nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,
                           vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm,
                           split,success);
-  if (split==false)
+  if (split==false && respa==false)
     PPPMD.device->set_double_precompute(&PPPMD);
   return b;
 }

View File

@@ -96,19 +96,27 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};

-// OCL_VENDOR: preprocessor define for hardware
+// OCL_DEFAULT_VENDOR: preprocessor define for hardware
 // specific sizes of OpenCL kernel related constants

 #ifdef FERMI_OCL
-#define OCL_VENDOR "FERMI_OCL"
+#define OCL_DEFAULT_VENDOR "fermi"
+#endif
+
+#ifdef KEPLER_OCL
+#define OCL_DEFAULT_VENDOR "kepler"
 #endif

 #ifdef CYPRESS_OCL
-#define OCL_VENDOR "CYPRESS_OCL"
+#define OCL_DEFAULT_VENDOR "cypress"
 #endif

-#ifndef OCL_VENDOR
-#define OCL_VENDOR "GENERIC_OCL"
+#ifdef GENERIC_OCL
+#define OCL_DEFAULT_VENDOR "generic"
+#endif
+
+#ifndef OCL_DEFAULT_VENDOR
+#define OCL_DEFAULT_VENDOR "none"
 #endif

 #endif

View File

@@ -214,6 +214,30 @@ typedef struct _double4 double4;
 #endif

+// -------------------------------------------------------------------------
+//                 NVIDIA GENERIC OPENCL DEFINITIONS
+// -------------------------------------------------------------------------
+#ifdef NV_GENERIC_OCL
+
+#define USE_OPENCL
+#define fast_mul mul24
+#define MEM_THREADS 16
+#define THREADS_PER_ATOM 1
+#define THREADS_PER_CHARGE 1
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8
+#define BLOCK_NBOR_BUILD 64
+#define BLOCK_BIO_PAIR 64
+
+#define WARP_SIZE 32
+#define PPPM_BLOCK_1D 64
+#define BLOCK_CELL_2D 8
+#define BLOCK_CELL_ID 128
+#define MAX_BIO_SHARED_TYPES 128
+
+#endif
+
 // -------------------------------------------------------------------------
 //                 NVIDIA FERMI OPENCL DEFINITIONS
 // -------------------------------------------------------------------------
@@ -221,9 +245,6 @@ typedef struct _double4 double4;
 #ifdef FERMI_OCL

 #define USE_OPENCL
-#define fast_mul(X,Y) (X)*(Y)
-#define ARCH 0
-#define DRIVER 0
 #define MEM_THREADS 32
 #define THREADS_PER_ATOM 4
 #define THREADS_PER_CHARGE 8
@@ -238,7 +259,54 @@ typedef struct _double4 double4;
 #define BLOCK_CELL_ID 128
 #define MAX_BIO_SHARED_TYPES 128

-#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#endif
+
+// -------------------------------------------------------------------------
+//                 NVIDIA KEPLER OPENCL DEFINITIONS
+// -------------------------------------------------------------------------
+#ifdef KEPLER_OCL
+
+#define USE_OPENCL
+#define MEM_THREADS 32
+#define THREADS_PER_ATOM 4
+#define THREADS_PER_CHARGE 8
+#define BLOCK_PAIR 256
+#define MAX_SHARED_TYPES 11
+#define BLOCK_NBOR_BUILD 128
+#define BLOCK_BIO_PAIR 256
+#define BLOCK_ELLIPSE 128
+#define WARP_SIZE 32
+#define PPPM_BLOCK_1D 64
+#define BLOCK_CELL_2D 8
+#define BLOCK_CELL_ID 128
+#define MAX_BIO_SHARED_TYPES 128
+
+#ifndef NO_OCL_PTX
+#define ARCH 300
+#ifdef _SINGLE_SINGLE
+inline float shfl_xor(float var, int laneMask, int width) {
+  float ret;
+  int c;
+  c = ((WARP_SIZE-width) << 8) | 0x1f;
+  asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c));
+  return ret;
+}
+#else
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+inline double shfl_xor(double var, int laneMask, int width) {
+  int c = ((WARP_SIZE-width) << 8) | 0x1f;
+  int x,y,x2,y2;
+  double ans;
+  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
+  asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(x2) : "r"(x), "r"(laneMask), "r"(c));
+  asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(y2) : "r"(y), "r"(laneMask), "r"(c));
+  asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
+  return ans;
+}
+#endif
+#endif

 #endif
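
Note on the inline PTX above: shfl.bfly.b32 exchanges a register with the lane whose id differs in the bits selected by laneMask, so XOR-ing through masks 16, 8, 4, 2, 1 reduces across a 32-lane warp without shared memory. A CPU emulation of the butterfly reduction these shfl_xor helpers enable:

// CPU emulation of a warp-level butterfly reduction: at each step every
// lane adds the value held by the lane whose id differs in one bit, so
// after log2(32) steps all lanes hold the full sum.
#include <cstdio>

const int WARP_SIZE = 32;

int main() {
  double val[WARP_SIZE];
  for (int lane = 0; lane < WARP_SIZE; lane++)
    val[lane] = lane + 1;  // sum should be 32*33/2 = 528

  for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
    double exchanged[WARP_SIZE];
    for (int lane = 0; lane < WARP_SIZE; lane++)
      exchanged[lane] = val[lane ^ mask];   // what shfl_xor(val,mask) returns
    for (int lane = 0; lane < WARP_SIZE; lane++)
      val[lane] += exchanged[lane];
  }

  printf("lane 0 holds %.0f\n", val[0]);    // 528, on every lane
  return 0;
}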
@@ -249,9 +317,6 @@ typedef struct _double4 double4;
 #ifdef CYPRESS_OCL

 #define USE_OPENCL
-#define fast_mul(X,Y) (X)*(Y)
-#define ARCH 0
-#define DRIVER 0
 #define MEM_THREADS 32
 #define THREADS_PER_ATOM 4
 #define THREADS_PER_CHARGE 8
@@ -266,12 +331,6 @@ typedef struct _double4 double4;
 #define BLOCK_CELL_ID 128
 #define MAX_BIO_SHARED_TYPES 128

-#if defined(cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-#elif defined(cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64 : enable
-#endif
-
 #endif

 // -------------------------------------------------------------------------
@@ -281,9 +340,6 @@ typedef struct _double4 double4;
 #ifdef GENERIC_OCL

 #define USE_OPENCL
-#define fast_mul mul24
-#define ARCH 0
-#define DRIVER 0
 #define MEM_THREADS 16
 #define THREADS_PER_ATOM 1
 #define THREADS_PER_CHARGE 1
@@ -298,6 +354,20 @@ typedef struct _double4 double4;
 #define BLOCK_CELL_ID 128
 #define MAX_BIO_SHARED_TYPES 128

+#endif
+
+// -------------------------------------------------------------------------
+//                 OPENCL Stuff for All Hardware
+// -------------------------------------------------------------------------
+#ifdef USE_OPENCL
+
+#ifndef _SINGLE_SINGLE
+
+#ifndef cl_khr_fp64
+#ifndef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+#endif
+
 #if defined(cl_khr_fp64)
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #elif defined(cl_amd_fp64)
@@ -306,10 +376,17 @@ typedef struct _double4 double4;
 #endif

-// -------------------------------------------------------------------------
-//                 OPENCL Stuff for All Hardware
-// -------------------------------------------------------------------------
-#ifdef USE_OPENCL
+#ifndef fast_mul
+#define fast_mul(X,Y) (X)*(Y)
+#endif
+
+#ifndef ARCH
+#define ARCH 0
+#endif
+
+#ifndef DRIVER
+#define DRIVER 0
+#endif

 #define GLOBAL_ID_X get_global_id(0)
 #define THREAD_ID_X get_local_id(0)
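
Note on the fast_mul fallback above: fast_mul now defaults to a plain multiply unless a tuning block mapped it to mul24, which multiplies only the low 24 bits of each operand; that is safe for thread/block index arithmetic but wrong once values exceed 2^24. A sketch of the trade-off (the emulation assumes OpenCL's stated operand range for mul24):

// Sketch of what the fast_mul/mul24 fallback trades away: mul24 multiplies
// only the low 24 bits of each operand.
#include <cstdio>
#include <cstdint>

int32_t emulated_mul24(int32_t x, int32_t y) {
  // keep the low 24 bits (sign-extended), as OpenCL's mul24 is specified
  // to do for operands in [-2^23, 2^23-1]
  x = (x << 8) >> 8;
  y = (y << 8) >> 8;
  return x * y;
}

int main() {
  printf("%d\n", (int)emulated_mul24(1000, 64));       // 64000, same as 1000*64
  printf("%d vs %d\n", (int)emulated_mul24(1 << 24, 3),
         (1 << 24) * 3);                               // 0 vs 50331648
  return 0;
}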

View File

@@ -74,7 +74,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
   // Allocate a host write buffer for copying type data
   UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
-                               UCL_WRITE_OPTIMIZED);
+                               UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write[i]=0.0;
@@ -95,7 +95,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
                          host_offset);
-  dev_error.alloc(1,*(this->ucl_device));
+  dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY);
   dev_error.zero();
   // Allocate, cast and asynchronous memcpy of constant data
@@ -260,6 +260,9 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
                        &this->_threads_per_atom);
     this->time_ellipsoid3.stop();
   } else {
+    GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
+                                                 this->_last_ellipse)/
+                             (BX/this->_threads_per_atom)));
     this->ans->force.zero();
     this->ans->engv.zero();
     this->time_nbor1.zero();

View File

@@ -75,7 +75,8 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *nbor_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,nbor_end,nbor);

View File

@@ -59,12 +59,12 @@
   if (offset==0) { \
     __global acctyp *ap1=engv+ii; \
     if (eflag>0) { \
-      *ap1+=energy; \
+      *ap1+=energy*(acctyp)0.5; \
      ap1+=astride; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *ap1+=virial[i]; \
+        *ap1+=virial[i]*(acctyp)0.5; \
         ap1+=astride; \
       } \
     } \
@@ -104,12 +104,12 @@
   if (offset==0) { \
     __global acctyp *ap1=engv+ii; \
     if (eflag>0) { \
-      *ap1+=energy; \
+      *ap1+=energy*(acctyp)0.5; \
       ap1+=astride; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *ap1+=virial[i]; \
+        *ap1+=virial[i]*(acctyp)0.5; \
         ap1+=astride; \
       } \
     } \
@@ -173,7 +173,8 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *nbor_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,nbor_end,nbor);
@@ -424,7 +425,8 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *nbor_end;
-    int j, numj, n_stride;
+    int j, numj;
+    __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj,
                 n_stride,nbor_end,nbor);
@@ -615,7 +617,8 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,list_end,nbor);
@@ -708,7 +711,8 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,list_end,nbor);

View File

@@ -88,7 +88,7 @@ int TableT::init(const int ntypes,
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<int> host_write_int(lj_types*lj_types,*(this->ucl_device),
-                                UCL_WRITE_OPTIMIZED);
+                                UCL_WRITE_ONLY);
   for (int i=0; i<lj_types*lj_types; i++)
     host_write_int[i] = 0;
@@ -113,7 +113,7 @@ int TableT::init(const int ntypes,
   ucl_copy(nmask,host_write_int,false);

   UCL_H_Vec<numtyp4> host_write(lj_types*lj_types,*(this->ucl_device),
-                                UCL_WRITE_OPTIMIZED);
+                                UCL_WRITE_ONLY);
   coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   for (int ix=1; ix<ntypes; ix++)
@@ -127,7 +127,7 @@ int TableT::init(const int ntypes,
   // Allocate tablength arrays
   UCL_H_Vec<numtyp4> host_write2(_ntables*_tablength,*(this->ucl_device),
-                                 UCL_WRITE_OPTIMIZED);
+                                 UCL_WRITE_ONLY);
   for (int i=0; i<_ntables*_tablength; i++) {
     host_write2[i].x = 0.0;
     host_write2[i].y = 0.0;
@@ -190,7 +190,7 @@ int TableT::init(const int ntypes,
   ucl_copy(coeff4,host_write2,false);

   UCL_H_Vec<numtyp> host_rsq(lj_types*lj_types,*(this->ucl_device),
-                             UCL_WRITE_OPTIMIZED);
+                             UCL_WRITE_ONLY);
   cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq);

View File

@@ -74,7 +74,8 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -171,7 +172,8 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -268,7 +270,8 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -369,7 +372,8 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -470,7 +474,8 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -578,7 +583,8 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -688,7 +694,8 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -794,7 +801,8 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
   if (ii<inum) {
     const __global int *nbor, *list_end;
-    int i, numj, n_stride;
+    int i, numj;
+    __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);

Some files were not shown because too many files have changed in this diff.