diff --git a/lib/gpu/Makefile.linux_opencl b/lib/gpu/Makefile.linux_opencl index 8610917d93..71c82d25f3 100644 --- a/lib/gpu/Makefile.linux_opencl +++ b/lib/gpu/Makefile.linux_opencl @@ -4,9 +4,14 @@ # which file will be copied to Makefile.lammps -EXTRAMAKE = Makefile.lammps.standard +EXTRAMAKE = Makefile.lammps.opencl -OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK +OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi +# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler +# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress +# OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device + +OCL_CPP = mpic++ $(DEFAULT_DEVICE) -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK OCL_LINK = -lOpenCL OCL_PREC = -D_SINGLE_SINGLE diff --git a/lib/gpu/Makefile.mac_opencl b/lib/gpu/Makefile.mac_opencl index fa227ecd3e..62b58c1cef 100644 --- a/lib/gpu/Makefile.mac_opencl +++ b/lib/gpu/Makefile.mac_opencl @@ -4,7 +4,12 @@ # which file will be copied to Makefile.lammps -EXTRAMAKE = Makefile.lammps.standard +EXTRAMAKE = Makefile.lammps.mac_ocl + +OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi +# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler +# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress +# OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT OCL_LINK = -framework OpenCL diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index ce6132dd56..a5df7bf23d 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -28,7 +28,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \ $(OBJ_DIR)/lal_neighbor.o $(OBJ_DIR)/lal_neighbor_shared.o \ $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \ $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \ - $(OBJ_DIR)/lal_base_dipole.o \ + $(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \ $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \ $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \ $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \ @@ -59,7 +59,12 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \ $(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \ $(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \ $(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \ - $(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o + $(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o \ + $(OBJ_DIR)/lal_sw.o $(OBJ_DIR)/lal_sw_ext.o \ + $(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \ + $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \ + $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \ + $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \ $(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \ @@ -99,7 +104,12 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \ $(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \ $(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \ $(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \ - $(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h + $(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h \ + $(OBJ_DIR)/sw.cubin $(OBJ_DIR)/sw_cubin.h \ + $(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck_cubin.h \ + $(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \ + $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \ + $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h all: $(OBJ_DIR) $(GPU_LIB) $(EXECS) @@ -175,6 +185,9 @@ $(OBJ_DIR)/lal_base_ellipsoid.o: 
$(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoi $(OBJ_DIR)/lal_base_dipole.o: $(ALL_H) lal_base_dipole.h lal_base_dipole.cpp $(CUDR) -o $@ -c lal_base_dipole.cpp +$(OBJ_DIR)/lal_base_three.o: $(ALL_H) lal_base_three.h lal_base_three.cpp + $(CUDR) -o $@ -c lal_base_three.cpp + $(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h $(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu @@ -571,6 +584,66 @@ $(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/c $(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h $(CUDR) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/sw.cubin: lal_sw.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_sw.cu + +$(OBJ_DIR)/sw_cubin.h: $(OBJ_DIR)/sw.cubin $(OBJ_DIR)/sw.cubin + $(BIN2C) -c -n sw $(OBJ_DIR)/sw.cubin > $(OBJ_DIR)/sw_cubin.h + +$(OBJ_DIR)/lal_sw.o: $(ALL_H) lal_sw.h lal_sw.cpp $(OBJ_DIR)/sw_cubin.h $(OBJ_DIR)/lal_base_three.o + $(CUDR) -o $@ -c lal_sw.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_sw_ext.o: $(ALL_H) lal_sw.h lal_sw_ext.cpp lal_base_three.h + $(CUDR) -o $@ -c lal_sw_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/beck.cubin: lal_beck.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_beck.cu + +$(OBJ_DIR)/beck_cubin.h: $(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck.cubin + $(BIN2C) -c -n beck $(OBJ_DIR)/beck.cubin > $(OBJ_DIR)/beck_cubin.h + +$(OBJ_DIR)/lal_beck.o: $(ALL_H) lal_beck.h lal_beck.cpp $(OBJ_DIR)/beck_cubin.h $(OBJ_DIR)/lal_base_atomic.o + $(CUDR) -o $@ -c lal_beck.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_beck_ext.o: $(ALL_H) lal_beck.h lal_beck_ext.cpp lal_base_atomic.h + $(CUDR) -o $@ -c lal_beck_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/mie.cubin: lal_mie.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_mie.cu + +$(OBJ_DIR)/mie_cubin.h: $(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie.cubin + $(BIN2C) -c -n mie $(OBJ_DIR)/mie.cubin > $(OBJ_DIR)/mie_cubin.h + +$(OBJ_DIR)/lal_mie.o: $(ALL_H) lal_mie.h lal_mie.cpp $(OBJ_DIR)/mie_cubin.h $(OBJ_DIR)/lal_base_atomic.o + $(CUDR) -o $@ -c lal_mie.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_mie_ext.o: $(ALL_H) lal_mie.h lal_mie_ext.cpp lal_base_atomic.h + $(CUDR) -o $@ -c lal_mie_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/soft.cubin: lal_soft.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_soft.cu + +$(OBJ_DIR)/soft_cubin.h: $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft.cubin + $(BIN2C) -c -n soft $(OBJ_DIR)/soft.cubin > $(OBJ_DIR)/soft_cubin.h + +$(OBJ_DIR)/lal_soft.o: $(ALL_H) lal_soft.h lal_soft.cpp $(OBJ_DIR)/soft_cubin.h $(OBJ_DIR)/lal_base_atomic.o + $(CUDR) -o $@ -c lal_soft.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_soft_ext.o: $(ALL_H) lal_soft.h lal_soft_ext.cpp lal_base_atomic.h + $(CUDR) -o $@ -c lal_soft_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj_coul_msm.cubin: lal_lj_coul_msm.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul_msm.cu + +$(OBJ_DIR)/lj_coul_msm_cubin.h: $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm.cubin + $(BIN2C) -c -n lj_coul_msm $(OBJ_DIR)/lj_coul_msm.cubin > $(OBJ_DIR)/lj_coul_msm_cubin.h + +$(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(OBJ_DIR)/lj_coul_msm_cubin.h $(OBJ_DIR)/lal_base_charge.o + $(CUDR) -o $@ -c lal_lj_coul_msm.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h + $(CUDR) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR) + $(BIN_DIR)/nvc_get_devices: 
./geryon/ucl_get_devices.cpp $(NVD_H) $(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index b476003ecd..81e80588cd 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -17,7 +17,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \ $(OBJ_DIR)/lal_neighbor_shared.o $(OBJ_DIR)/lal_neighbor.o \ $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \ $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \ - $(OBJ_DIR)/lal_base_dipole.o \ + $(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \ $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \ $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \ $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \ @@ -48,7 +48,12 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \ $(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \ $(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \ $(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \ - $(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o + $(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o \ + $(OBJ_DIR)/lal_sw.o $(OBJ_DIR)/lal_sw_ext.o \ + $(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \ + $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \ + $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \ + $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \ @@ -68,7 +73,9 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ $(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/dipole_lj_cl.h \ $(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/colloid_cl.h \ $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \ - $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h + $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \ + $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \ + $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h OCL_EXECS = $(BIN_DIR)/ocl_get_devices @@ -117,6 +124,9 @@ $(OBJ_DIR)/lal_base_ellipsoid.o: $(OCL_H) lal_base_ellipsoid.h lal_base_ellipsoi $(OBJ_DIR)/lal_base_dipole.o: $(OCL_H) lal_base_dipole.h lal_base_dipole.cpp $(OCL) -o $@ -c lal_base_dipole.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/lal_base_three.o: $(OCL_H) lal_base_three.h lal_base_three.cpp + $(OCL) -o $@ -c lal_base_three.cpp + $(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h $(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h; @@ -405,6 +415,51 @@ $(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/ $(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h $(OCL) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/sw_cl.h: lal_sw.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh sw $(PRE1_H) lal_sw.cu $(OBJ_DIR)/sw_cl.h; + +$(OBJ_DIR)/lal_sw.o: $(ALL_H) lal_sw.h lal_sw.cpp $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/lal_base_three.o + $(OCL) -o $@ -c lal_sw.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_sw_ext.o: $(ALL_H) lal_sw.h lal_sw_ext.cpp lal_base_three.h + $(OCL) -o $@ -c lal_sw_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/beck_cl.h: lal_beck.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh beck $(PRE1_H) lal_beck.cu $(OBJ_DIR)/beck_cl.h; + +$(OBJ_DIR)/lal_beck.o: $(ALL_H) lal_beck.h lal_beck.cpp $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/lal_base_atomic.o + $(OCL) -o $@ -c lal_beck.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_beck_ext.o: $(ALL_H) lal_beck.h lal_beck_ext.cpp 
lal_base_atomic.h + $(OCL) -o $@ -c lal_beck_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/mie_cl.h: lal_mie.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh mie $(PRE1_H) lal_mie.cu $(OBJ_DIR)/mie_cl.h; + +$(OBJ_DIR)/lal_mie.o: $(ALL_H) lal_mie.h lal_mie.cpp $(OBJ_DIR)/mie_cl.h $(OBJ_DIR)/mie_cl.h $(OBJ_DIR)/lal_base_atomic.o + $(OCL) -o $@ -c lal_mie.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_mie_ext.o: $(ALL_H) lal_mie.h lal_mie_ext.cpp lal_base_atomic.h + $(OCL) -o $@ -c lal_mie_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/soft_cl.h: lal_soft.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh soft $(PRE1_H) lal_soft.cu $(OBJ_DIR)/soft_cl.h; + +$(OBJ_DIR)/lal_soft.o: $(ALL_H) lal_soft.h lal_soft.cpp $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lal_base_atomic.o + $(OCL) -o $@ -c lal_soft.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_soft_ext.o: $(ALL_H) lal_soft.h lal_soft_ext.cpp lal_base_atomic.h + $(OCL) -o $@ -c lal_soft_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj_coul_msm_cl.h: lal_lj_coul_msm.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh lj_coul_msm $(PRE1_H) lal_lj_coul_msm.cu $(OBJ_DIR)/lj_coul_msm_cl.h; + +$(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(OBJ_DIR)/lj_coul_msm_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h $(OBJ_DIR)/lal_base_charge.o + $(OCL) -o $@ -c lal_lj_coul_msm.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h + $(OCL) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR) + $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) diff --git a/lib/gpu/README b/lib/gpu/README index f14496efc9..900cffd35c 100644 --- a/lib/gpu/README +++ b/lib/gpu/README @@ -3,6 +3,7 @@ -------------------------------- W. Michael Brown (ORNL) + Trung Dac Nguyen (ORNL) Peng Wang (NVIDIA) Axel Kohlmeyer (Temple) Steve Plimpton (SNL) @@ -60,6 +61,8 @@ devices on your system. A Makefile for OpenCL compilation is provided, but support for OpenCL use is not currently provided by the developers. Details of the implementation are provided in: +---- + Brown, W.M., Wang, P. Plimpton, S.J., Tharrington, A.N. Implementing Molecular Dynamics on Hybrid High Performance Computers - Short Range Forces. Computer Physics Communications. 2011. 182: p. 898-911. @@ -68,28 +71,64 @@ and Brown, W.M., Kohlmeyer, A. Plimpton, S.J., Tharrington, A.N. Implementing Molecular Dynamics on Hybrid High Performance Computers - Particle-Particle -Particle-Mesh. Computer Physics Communications. 2011. In press. +Particle-Mesh. Computer Physics Communications. 2012. 183: p. 449-459. +and + +Brown, W.M., Masako, Y. Implementing Molecular Dynamics on Hybrid High +Performance Computers - Three-Body Potentials. Computer Physics Communications. +2013. In press. + +---- NOTE: Installation of the CUDA SDK is not required. Current styles supporting GPU acceleration: - 1. lj/cut - 2. lj96/cut - 3. lj/expand - 4. lj/cut/coul/cut - 5. lj/cut/coul/long - 6. lj/charmm/coul/long - 7. lj/class2 - 8. lj/class2/coul/long - 9. morse - 10. cg/cmm - 11. cg/cmm/coul/long - 12. coul/long - 13. gayberne - 14. resquared - 15. 
pppm + 1 beck + 2 born/coul/long + 3 born/coul/wolf + 4 born + 5 buck/coul/cut + 6 buck/coul/long + 7 buck + 8 colloid + 9 coul/dsf + 10 coul/long + 11 eam/alloy + 12 eam/fs + 13 eam + 14 eam/lj + 15 gauss + 16 gayberne + 17 lj96/cut + 18 lj/charmm/coul/long + 19 lj/class2/coul/long + 20 lj/class2 + 21 lj/cut/coul/cut + 22 lj/cut/coul/debye + 23 lj/cut/coul/dsf + 24 lj/cut/coul/long + 25 lj/cut/coul/msm + 26 lj/cut/coul/wolf/fsw + 27 lj/cut/dipole/cut + 28 lj/cut + 29 lj/cut/tgpu + 30 lj/expand + 31 lj/sdk/coul/long + 32 cg/cmm/coul/long + 33 lj/sdk + 34 cg/cmm + 35 lj/sf/dipole/sf + 36 mie/cut + 37 morse + 38 resquared + 39 soft + 40 sw + 41 table + 42 yukawa/colloid + 43 yukawa + 44 pppm MULTIPLE LAMMPS PROCESSES @@ -170,3 +209,4 @@ make yes-asphere make yes-kspace make yes-gpu make linux + diff --git a/lib/gpu/geryon/VERSION.txt b/lib/gpu/geryon/VERSION.txt index 47cefed44d..733136e842 100644 --- a/lib/gpu/geryon/VERSION.txt +++ b/lib/gpu/geryon/VERSION.txt @@ -1 +1 @@ -Geryon Version 12.033 +Geryon Version 13.209 diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h index 938e1d3bd6..5fffe77c82 100644 --- a/lib/gpu/geryon/nvd_device.h +++ b/lib/gpu/geryon/nvd_device.h @@ -65,15 +65,19 @@ class UCL_Device { public: /// Collect properties for every GPU on the node /** \note You must set the active GPU with set() before using the device **/ - UCL_Device(); + inline UCL_Device(); - ~UCL_Device(); + inline ~UCL_Device(); /// Returns 1 (For compatibility with OpenCL) inline int num_platforms() { return 1; } /// Return a string with name and info of the current platform - std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA Driver"; } + inline std::string platform_name() + { return "NVIDIA Corporation NVIDIA CUDA Driver"; } + + /// Delete any contexts/data and set the platform number to be used + inline int set_platform(const int pid); /// Return the number of devices that support CUDA inline int num_devices() { return _properties.size(); } @@ -81,8 +85,12 @@ class UCL_Device { /// Set the CUDA device to the specified device number /** A context and default command queue will be created for the device * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not - * be allocated for use **/ - int set(int num); + * be allocated for use. clear() is called to delete any contexts and + * associated data from previous calls to set(). 
**/ + inline int set(int num); + + /// Delete any context and associated data stored from a call to set() + inline void clear(); /// Get the current device number inline int device_num() { return _device; } @@ -147,16 +155,17 @@ class UCL_Device { inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } /// Returns true if double precision is support for the current device - bool double_precision() { return double_precision(_device); } + inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device - bool double_precision(const int i) {return arch(i)>=1.3;} + inline bool double_precision(const int i) {return arch(i)>=1.3;} /// Get the number of cores in the current device inline unsigned cores() { return cores(_device); } /// Get the number of cores inline unsigned cores(const int i) { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8; - else if (arch(i)<3.0) return _properties[i].multiProcessorCount*32; + else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32; + else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48; else return _properties[i].multiProcessorCount*192; } /// Get the gigabytes of global memory in the current device @@ -216,8 +225,34 @@ class UCL_Device { inline bool sharing_supported(const int i) { return (_properties[i].computeMode == CU_COMPUTEMODE_DEFAULT); } + /// True if splitting device into equal subdevices supported + inline bool fission_equal() + { return fission_equal(_device); } + /// True if splitting device into equal subdevices supported + inline bool fission_equal(const int i) + { return false; } + /// True if splitting device into subdevices by specified counts supported + inline bool fission_by_counts() + { return fission_by_counts(_device); } + /// True if splitting device into subdevices by specified counts supported + inline bool fission_by_counts(const int i) + { return false; } + /// True if splitting device into subdevices by affinity domains supported + inline bool fission_by_affinity() + { return fission_by_affinity(_device); } + /// True if splitting device into subdevices by affinity domains supported + inline bool fission_by_affinity(const int i) + { return false; } + + /// Maximum number of subdevices allowed from device fission + inline int max_sub_devices() + { return max_sub_devices(_device); } + /// Maximum number of subdevices allowed from device fission + inline int max_sub_devices(const int i) + { return 0; } + /// List all devices along with all properties - void print_all(std::ostream &out); + inline void print_all(std::ostream &out); private: int _device, _num_devices; @@ -228,7 +263,7 @@ class UCL_Device { }; // Grabs the properties for all devices -inline UCL_Device::UCL_Device() { +UCL_Device::UCL_Device() { CU_SAFE_CALL_NS(cuInit(0)); CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices)); for (int dev=0; dev<_num_devices; ++dev) { @@ -280,22 +315,21 @@ inline UCL_Device::UCL_Device() { _cq.back()=0; } -inline UCL_Device::~UCL_Device() { - if (_device>-1) { - for (int i=1; i-1) { - CU_SAFE_CALL_NS(cuCtxDestroy(_context)); - for (int i=1; i-1) { + for (int i=1; i= 2020 int driver_version; cuDriverGetVersion(&driver_version); diff --git a/lib/gpu/geryon/nvd_kernel.h b/lib/gpu/geryon/nvd_kernel.h index fecd85eeb8..e0bfb1bb5e 100644 --- a/lib/gpu/geryon/nvd_kernel.h +++ b/lib/gpu/geryon/nvd_kernel.h @@ -377,6 +377,10 @@ class UCL_Kernel { #endif } + /// Return the default command queue/stream associated with this data + 
inline command_queue & cq() { return _cq; } + /// Change the default command queue associated with matrix + inline void cq(command_queue &cq_in) { _cq=cq_in; } #include "ucl_arg_kludge.h" private: diff --git a/lib/gpu/geryon/nvd_memory.h b/lib/gpu/geryon/nvd_memory.h index 335418fe5f..5f7b98ba5c 100644 --- a/lib/gpu/geryon/nvd_memory.h +++ b/lib/gpu/geryon/nvd_memory.h @@ -47,14 +47,14 @@ typedef CUdeviceptr device_ptr; // -------------------------------------------------------------------------- template inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, - const enum UCL_MEMOPT kind) { + const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ CUresult err=CUDA_SUCCESS; - if (kind==UCL_RW_OPTIMIZED) - err=cuMemAllocHost((void **)mat.host_ptr(),n); - else if (kind==UCL_WRITE_OPTIMIZED) + if (kind==UCL_NOT_PINNED) + *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); + else if (kind==UCL_WRITE_ONLY) err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED); else - *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); + err=cuMemAllocHost((void **)mat.host_ptr(),n); if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL) return UCL_MEMORY_ERROR; mat.cq()=cm.cq(); @@ -63,14 +63,14 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, template inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, - const enum UCL_MEMOPT kind) { + const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ CUresult err=CUDA_SUCCESS; - if (kind==UCL_RW_OPTIMIZED) - err=cuMemAllocHost((void **)mat.host_ptr(),n); - else if (kind==UCL_WRITE_OPTIMIZED) + if (kind==UCL_NOT_PINNED) + *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); + else if (kind==UCL_WRITE_ONLY) err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED); else - *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); + err=cuMemAllocHost((void **)mat.host_ptr(),n); if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL) return UCL_MEMORY_ERROR; mat.cq()=dev.cq(); @@ -78,8 +78,10 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, } template -inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) { - if (kind!=UCL_NOT_PINNED) +inline void _host_free(mat_type &mat) { + if (mat.kind()==UCL_VIEW) + return; + else if (mat.kind()!=UCL_NOT_PINNED) CU_DESTRUCT_CALL(cuMemFreeHost(mat.begin())); else free(mat.begin()); @@ -87,14 +89,14 @@ inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) { template inline int _host_resize(mat_type &mat, const size_t n) { - _host_free(mat,mat.kind()); + _host_free(mat); CUresult err=CUDA_SUCCESS; - if (mat.kind()==UCL_RW_OPTIMIZED) - err=cuMemAllocHost((void **)mat.host_ptr(),n); - else if (mat.kind()==UCL_WRITE_OPTIMIZED) - err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED); - else + if (mat.kind()==UCL_NOT_PINNED) *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); + else if (mat.kind()==UCL_WRITE_ONLY) + err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED); + else + err=cuMemAllocHost((void **)mat.host_ptr(),n); if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL) return UCL_MEMORY_ERROR; return UCL_SUCCESS; @@ -155,7 +157,8 @@ inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows, template inline void _device_free(mat_type &mat) { - CU_DESTRUCT_CALL(cuMemFree(mat.cbegin())); + if (mat.kind()!=UCL_VIEW) + CU_DESTRUCT_CALL(cuMemFree(mat.cbegin())); } template @@ -229,13 +232,13 @@ inline void 
_host_zero(void *ptr, const size_t n) { } template -inline void _device_zero(mat_type &mat, const size_t n) { +inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) { if (n%32==0) - CU_SAFE_CALL(cuMemsetD32(mat.cbegin(),0,n/4)); + CU_SAFE_CALL(cuMemsetD32Async(mat.cbegin(),0,n/4,cq)); else if (n%16==0) - CU_SAFE_CALL(cuMemsetD16(mat.cbegin(),0,n/2)); + CU_SAFE_CALL(cuMemsetD16Async(mat.cbegin(),0,n/2,cq)); else - CU_SAFE_CALL(cuMemsetD8(mat.cbegin(),0,n)); + CU_SAFE_CALL(cuMemsetD8Async(mat.cbegin(),0,n,cq)); } // -------------------------------------------------------------------------- diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index 391eeb9d95..79fa53d552 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -51,6 +51,10 @@ inline void ucl_sync(cl_command_queue &cq) { CL_SAFE_CALL(clFinish(cq)); } +inline bool _shared_mem_device(cl_device_type &device_type) { + return (device_type==CL_DEVICE_TYPE_CPU); +} + struct OCLProperties { std::string name; cl_device_type device_type; @@ -64,6 +68,10 @@ struct OCLProperties { bool double_precision; int alignment; size_t timer_resolution; + bool ecc_support; + std::string c_version; + bool partition_equal, partition_counts, partition_affinity; + cl_uint max_sub_devices; }; /// Class for looking at data parallel device properties @@ -74,15 +82,18 @@ class UCL_Device { public: /// Collect properties for every device on the node /** \note You must set the active GPU with set() before using the device **/ - UCL_Device(); + inline UCL_Device(); - ~UCL_Device(); + inline ~UCL_Device(); /// Return the number of platforms (0 if error or no platforms) inline int num_platforms() { return _num_platforms; } /// Return a string with name and info of the current platform - std::string platform_name(); + inline std::string platform_name(); + + /// Delete any contexts/data and set the platform number to be used + inline int set_platform(const int pid); /// Return the number of devices that support OpenCL inline int num_devices() { return _num_devices; } @@ -90,8 +101,12 @@ class UCL_Device { /// Set the OpenCL device to the specified device number /** A context and default command queue will be created for the device * * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not - * be allocated for use **/ - int set(int num); + * be allocated for use. clear() is called to delete any contexts and + * associated data from previous calls to set(). 
**/ + inline int set(int num); + + /// Delete any context and associated data stored from a call to set() + inline void clear(); /// Get the current device number inline int device_num() { return _device; } @@ -161,12 +176,14 @@ class UCL_Device { /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is efficiently addressable from device - inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } + inline bool shared_memory(const int i) + { return _shared_mem_device(_properties[i].device_type); } /// Returns true if double precision is support for the current device - bool double_precision() { return double_precision(_device); } + inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device - bool double_precision(const int i) {return _properties[i].double_precision;} + inline bool double_precision(const int i) + {return _properties[i].double_precision;} /// Get the number of cores in the current device inline unsigned cores() { return cores(_device); } @@ -227,8 +244,34 @@ class UCL_Device { inline bool sharing_supported(const int i) { return true; } + /// True if splitting device into equal subdevices supported + inline bool fission_equal() + { return fission_equal(_device); } + /// True if splitting device into equal subdevices supported + inline bool fission_equal(const int i) + { return _properties[i].partition_equal; } + /// True if splitting device into subdevices by specified counts supported + inline bool fission_by_counts() + { return fission_by_counts(_device); } + /// True if splitting device into subdevices by specified counts supported + inline bool fission_by_counts(const int i) + { return _properties[i].partition_counts; } + /// True if splitting device into subdevices by affinity domains supported + inline bool fission_by_affinity() + { return fission_by_affinity(_device); } + /// True if splitting device into subdevices by affinity domains supported + inline bool fission_by_affinity(const int i) + { return _properties[i].partition_affinity; } + + /// Maximum number of subdevices allowed from device fission + inline int max_sub_devices() + { return max_sub_devices(_device); } + /// Maximum number of subdevices allowed from device fission + inline int max_sub_devices(const int i) + { return _properties[i].max_sub_devices; } + /// List all devices along with all properties - void print_all(std::ostream &out); + inline void print_all(std::ostream &out); /// Return the OpenCL type for the device inline cl_device_id & cl_device() { return _cl_device; } @@ -237,7 +280,8 @@ class UCL_Device { int _num_platforms; // Number of platforms int _platform; // UCL_Device ID for current platform cl_platform_id _cl_platform; // OpenCL ID for current platform - cl_context _context; // Context used for accessing the device + cl_platform_id _cl_platforms[20]; // OpenCL IDs for all platforms + cl_context _context; // Context used for accessing the device std::vector _cq;// The default command queue for this device int _device; // UCL_Device ID for current device cl_device_id _cl_device; // OpenCL ID for current device @@ -245,31 +289,57 @@ class UCL_Device { int _num_devices; // Number of devices std::vector _properties; // Properties for each device - void add_properties(cl_device_id); - int create_context(); + inline void add_properties(cl_device_id); + inline int create_context(); int 
_default_cq; }; // Grabs the properties for all devices -inline UCL_Device::UCL_Device() { - cl_int errorv; - cl_uint nplatforms; - - _cl_device=0; +UCL_Device::UCL_Device() { _device=-1; - _num_devices=0; - _platform=0; - _default_cq=0; // --- Get Number of Platforms - errorv=clGetPlatformIDs(1,&_cl_platform,&nplatforms); + cl_uint nplatforms; + cl_int errorv=clGetPlatformIDs(20,_cl_platforms,&nplatforms); if (errorv!=CL_SUCCESS) { _num_platforms=0; return; } else _num_platforms=static_cast(nplatforms); + + set_platform(0); +} + +UCL_Device::~UCL_Device() { + clear(); +} + +void UCL_Device::clear() { + if (_device>-1) { + for (size_t i=0; i<_cq.size(); i++) { + CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back())); + _cq.pop_back(); + } + CL_DESTRUCT_CALL(clReleaseContext(_context)); + } + _device=-1; +} + +int UCL_Device::set_platform(int pid) { + clear(); + cl_int errorv; + + _cl_device=0; + _device=-1; + _num_devices=0; + _default_cq=0; + #ifdef UCL_DEBUG + assert(pid-1) { - for (size_t i=0; i<_cq.size(); i++) { - CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back())); - _cq.pop_back(); - } - CL_DESTRUCT_CALL(clReleaseContext(_context)); - } -} - -inline int UCL_Device::create_context() { +int UCL_Device::create_context() { cl_int errorv; cl_context_properties props[3]; props[0]=CL_CONTEXT_PLATFORM; @@ -320,9 +382,10 @@ inline int UCL_Device::create_context() { return UCL_SUCCESS; } -inline void UCL_Device::add_properties(cl_device_id device_list) { +void UCL_Device::add_properties(cl_device_id device_list) { OCLProperties op; char buffer[1024]; + cl_bool ans_bool; CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,NULL)); op.name=buffer; @@ -363,10 +426,49 @@ inline void UCL_Device::add_properties(cl_device_id device_list) { CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(size_t),&op.timer_resolution,NULL)); + + op.ecc_support=false; + CL_SAFE_CALL(clGetDeviceInfo(device_list, + CL_DEVICE_ERROR_CORRECTION_SUPPORT, + sizeof(ans_bool),&ans_bool,NULL)); + if (ans_bool==CL_TRUE) + op.ecc_support=true; + + op.c_version=""; + op.partition_equal=false; + op.partition_counts=false; + op.partition_affinity=false; + + #ifdef CL_VERSION_1_2 + size_t return_bytes; + CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_OPENCL_C_VERSION,1024, + buffer,NULL)); + op.c_version=buffer; + + cl_device_partition_property pinfo[4]; + CL_SAFE_CALL(clGetDeviceInfo(device_list, + CL_DEVICE_PARTITION_PROPERTIES, + 4*sizeof(cl_device_partition_property), + pinfo,&return_bytes)); + int nprops=return_bytes/sizeof(cl_device_partition_property); + for (int i=0; i-1) { - for (size_t i=0; i<_cq.size(); i++) { - CL_SAFE_CALL(clReleaseCommandQueue(_cq.back())); - _cq.pop_back(); - } - CL_SAFE_CALL(clReleaseContext(_context)); - } +int UCL_Device::set(int num) { + clear(); cl_device_id device_list[_num_devices]; cl_uint n; @@ -432,7 +525,7 @@ inline int UCL_Device::set(int num) { } // List all devices along with all properties -inline void UCL_Device::print_all(std::ostream &out) { +void UCL_Device::print_all(std::ostream &out) { if (num_devices() == 0) out << "There is no device supporting OpenCL\n"; for (int i=0; i inline void set_arg(const cl_uint index, const dtype * const arg) { CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg)); - if (index>_num_args) _num_args=index; + if (index>_num_args) { + _num_args=index; + #ifdef UCL_DEBUG + if (_num_args>_kernel_info_nargs) { + std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: " + << _kernel_info_name << std::endl; + assert(0==1); + } + #endif + } } 
/// Set a geryon container as a kernel argument. @@ -203,6 +217,13 @@ class UCL_Kernel { inline void add_arg(const dtype * const arg) { CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg)); _num_args++; + #ifdef UCL_DEBUG + if (_num_args>_kernel_info_nargs) { + std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: " + << _kernel_info_name << std::endl; + assert(0==1); + } + #endif } /// Add a geryon container as a kernel argument. @@ -289,10 +310,7 @@ class UCL_Kernel { } /// Run the kernel in the default command queue - inline void run() { - CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL, - _num_blocks,_block_size,0,NULL,NULL)); - } + inline void run(); /// Clear any arguments associated with the kernel inline void clear_args() { _num_args=0; } @@ -309,6 +327,12 @@ class UCL_Kernel { cl_command_queue _cq; // The default command queue for this kernel unsigned _num_args; + + #ifdef UCL_DEBUG + std::string _kernel_info_name; + unsigned _kernel_info_nargs; + //std::string _kernel_info_args[256]; + #endif }; inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) { @@ -329,9 +353,32 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) #endif return UCL_FUNCTION_NOT_FOUND; } + + #ifdef UCL_DEBUG + _kernel_info_name=function; + cl_uint nargs; + CL_SAFE_CALL(clGetKernelInfo(_kernel,CL_KERNEL_NUM_ARGS,sizeof(cl_uint), + &nargs,NULL)); + _kernel_info_nargs=nargs; + #ifdef NOT_TEST_CL_VERSION_1_2 + char tname[256]; + size_t ret; + for (cl_uint i=0; i inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, - const enum UCL_MEMOPT kind) { + const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ cl_int error_flag; cl_context context; CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context), &context,NULL)); - - if (kind==UCL_VIEW) { - mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR,n,mat.host_ptr(), - &error_flag); - CL_CHECK_ERR(error_flag); - return UCL_SUCCESS; - } - if (kind==UCL_WRITE_OPTIMIZED) { - mat.cbegin()=clCreateBuffer(context, - CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, - n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) - return UCL_MEMORY_ERROR; - *mat.host_ptr() = (typename mat_type::data_type*) - clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE, - CL_MAP_WRITE,0,n,0,NULL,NULL,NULL); + + cl_mem_flags buffer_perm; + cl_map_flags map_perm; + if (kind2==UCL_NOT_SPECIFIED) { + if (kind==UCL_READ_ONLY) { + #ifdef CL_VERSION_1_2 + buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR; + #else + buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR; + #endif + map_perm=CL_MAP_READ; + } else if (kind==UCL_WRITE_ONLY) { + #ifdef CL_VERSION_1_2 + buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR; + #else + buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR; + #endif + map_perm=CL_MAP_WRITE; + } else { + buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; + map_perm=CL_MAP_READ | CL_MAP_WRITE; + } } else { - mat.cbegin()=clCreateBuffer(context, - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, - n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) - return UCL_MEMORY_ERROR; + if (kind2==UCL_READ_ONLY) + buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR; + else if (kind2==UCL_WRITE_ONLY) + buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR; + else + buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; + + if (kind==UCL_READ_ONLY) { + #ifdef CL_VERSION_1_2 + buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY; + #endif + 
map_perm=CL_MAP_READ; + } else if (kind==UCL_WRITE_ONLY) { + #ifdef CL_VERSION_1_2 + buffer_perm=buffer_perm | CL_MEM_HOST_WRITE_ONLY; + #endif + map_perm=CL_MAP_WRITE; + } else + map_perm=CL_MAP_READ | CL_MAP_WRITE; + } + + mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag); + if (error_flag != CL_SUCCESS) + return UCL_MEMORY_ERROR; *mat.host_ptr() = (typename mat_type::data_type*) clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE, - CL_MAP_READ | CL_MAP_WRITE, - 0,n,0,NULL,NULL,NULL); - } + map_perm,0,n,0,NULL,NULL,NULL); + mat.cq()=cm.cq(); CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); return UCL_SUCCESS; } +template +inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) { + cl_int error_flag; + cl_context context; + CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context), + &context,NULL)); + cl_mem_flags orig_flags; + CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags), + &orig_flags,NULL)); + orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR; + + mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n, + mat.host_ptr(), &error_flag); + CL_CHECK_ERR(error_flag); + CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); + return UCL_SUCCESS; +} + template inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, - const enum UCL_MEMOPT kind) { - cl_int error_flag; - if (kind==UCL_VIEW) { - mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR, - n,mat.host_ptr(),&error_flag); - CL_CHECK_ERR(error_flag); - return UCL_SUCCESS; - } - if (kind==UCL_WRITE_OPTIMIZED) { - mat.cbegin()=clCreateBuffer(dev.context(), - CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, - n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) - return UCL_MEMORY_ERROR; - *mat.host_ptr() = (typename mat_type::data_type*) - clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE, - CL_MAP_WRITE,0,n,0,NULL,NULL,NULL); + const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ + cl_mem_flags buffer_perm; + cl_map_flags map_perm; + if (kind==UCL_READ_ONLY) { + #ifdef CL_VERSION_1_2 + buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR; + #else + buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR; + #endif + map_perm=CL_MAP_READ; + } else if (kind==UCL_WRITE_ONLY) { + #ifdef CL_VERSION_1_2 + buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR; + #else + buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR; + #endif + map_perm=CL_MAP_WRITE; } else { - mat.cbegin()=clCreateBuffer(dev.context(), - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, - n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) - return UCL_MEMORY_ERROR; - *mat.host_ptr() = (typename mat_type::data_type*) - clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE, - CL_MAP_READ & CL_MAP_WRITE, - 0,n,0,NULL,NULL,NULL); + buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; + map_perm=CL_MAP_READ | CL_MAP_WRITE; } + + cl_int error_flag; + mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,NULL,&error_flag); + if (error_flag != CL_SUCCESS) + return UCL_MEMORY_ERROR; + + *mat.host_ptr() = (typename mat_type::data_type*) + clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE, + map_perm,0,n,0,NULL,NULL,NULL); mat.cq()=dev.cq(); CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); return UCL_SUCCESS; } template -inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) { - CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin())); - CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq())); +inline int _host_view(mat_type &mat, UCL_Device &dev, const 
size_t n) { + cl_int error_flag; + mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR, + n,mat.host_ptr(),&error_flag); + CL_CHECK_ERR(error_flag); + CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); + return UCL_SUCCESS; +} + +template +inline void _host_free(mat_type &mat) { + if (mat.cols()>0) { + CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin())); + CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq())); + } } template @@ -138,28 +194,26 @@ inline int _host_resize(mat_type &mat, const size_t n) { cl_context context; CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context), &context,NULL)); + cl_mem_flags buffer_perm; + CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_FLAGS,sizeof(buffer_perm), + &buffer_perm,NULL)); CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin())); - if (mat.kind()==UCL_WRITE_OPTIMIZED) { - mat.cbegin()=clCreateBuffer(context, - CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, - n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) - return UCL_MEMORY_ERROR; - *mat.host_ptr() = (typename mat_type::data_type*) - clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE, - CL_MAP_WRITE,0,n,0,NULL,NULL,NULL); - } else { - mat.cbegin()=clCreateBuffer(context, - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, - n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) - return UCL_MEMORY_ERROR; - *mat.host_ptr() = (typename mat_type::data_type*) - clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE, - CL_MAP_READ | CL_MAP_WRITE, - 0,n,0,NULL,NULL,NULL); - } + + cl_map_flags map_perm; + if (mat.kind()==UCL_READ_ONLY) + map_perm=CL_MAP_READ; + else if (mat.kind()==UCL_WRITE_ONLY) + map_perm=CL_MAP_WRITE; + else + map_perm=CL_MAP_READ | CL_MAP_WRITE; + + mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag); + if (error_flag != CL_SUCCESS) + return UCL_MEMORY_ERROR; + *mat.host_ptr() = (typename mat_type::data_type*) + clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE, + map_perm,0,n,0,NULL,NULL,NULL); return UCL_SUCCESS; } @@ -179,9 +233,17 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n, if (kind==UCL_READ_WRITE) flag=CL_MEM_READ_WRITE; else if (kind==UCL_READ_ONLY) + #ifdef CL_VERSION_1_2 + flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY; + #else flag=CL_MEM_READ_ONLY; + #endif else if (kind==UCL_WRITE_ONLY) + #ifdef CL_VERSION_1_2 + flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY; + #else flag=CL_MEM_WRITE_ONLY; + #endif else assert(0==1); mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag); @@ -200,9 +262,17 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n, if (kind==UCL_READ_WRITE) flag=CL_MEM_READ_WRITE; else if (kind==UCL_READ_ONLY) + #ifdef CL_VERSION_1_2 + flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY; + #else flag=CL_MEM_READ_ONLY; + #endif else if (kind==UCL_WRITE_ONLY) + #ifdef CL_VERSION_1_2 + flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY; + #else flag=CL_MEM_WRITE_ONLY; + #endif else assert(0==1); mat.cbegin()=clCreateBuffer(dev.context(),flag,n,NULL, @@ -238,8 +308,10 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t rows, template inline void _device_free(mat_type &mat) { - CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin())); - CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq())); + if (mat.cols()>0) { + CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin())); + CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq())); + } } template @@ -255,9 +327,17 @@ inline int _device_resize(mat_type &mat, const size_t n) { if (mat.kind()==UCL_READ_WRITE) flag=CL_MEM_READ_WRITE; else if 
(mat.kind()==UCL_READ_ONLY) + #ifdef CL_VERSION_1_2 + flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY; + #else flag=CL_MEM_READ_ONLY; + #endif else if (mat.kind()==UCL_WRITE_ONLY) + #ifdef CL_VERSION_1_2 + flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY; + #else flag=CL_MEM_WRITE_ONLY; + #endif else assert(0==1); mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag); @@ -285,9 +365,17 @@ inline int _device_resize(mat_type &mat, const size_t rows, if (mat.kind()==UCL_READ_WRITE) flag=CL_MEM_READ_WRITE; else if (mat.kind()==UCL_READ_ONLY) + #ifdef CL_VERSION_1_2 + flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY; + #else flag=CL_MEM_READ_ONLY; + #endif else if (mat.kind()==UCL_WRITE_ONLY) + #ifdef CL_VERSION_1_2 + flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY; + #else flag=CL_MEM_WRITE_ONLY; + #endif else assert(0==1); mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag); @@ -344,7 +432,19 @@ inline void _ocl_kernel_from_source(cl_context &context, cl_device_id &device, } template -inline void _device_zero(mat_type &mat, const size_t n) { +inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) { + #ifdef CL_VERSION_1_2 + #ifndef __APPLE__ + #define UCL_CL_ZERO + #endif + #endif + + #ifdef UCL_CL_ZERO + cl_int zeroint=0; + CL_SAFE_CALL(clEnqueueFillBuffer(cq,mat.begin(),&zeroint,sizeof(cl_int), + mat.byteoff(),n,0,NULL,NULL)); + + #else cl_context context; CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context), &context,NULL)); @@ -354,17 +454,20 @@ inline void _device_zero(mat_type &mat, const size_t n) { const char * szero[3]={ "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void _device_zero(__global NUMTYP *a)", - " { int gid=get_global_id(0); a[gid]=(NUMTYP)0; }" + "__kernel void _device_zero(__global NUMTYP *a, const int offset)", + " { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }" }; cl_kernel kzero; _ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero", _UCL_DATA_ID::numtyp_flag()); + cl_int offset=mat.offset(); CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin())); + CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset)); size_t kn=n/sizeof(typename mat_type::data_type); - CL_SAFE_CALL(clEnqueueNDRangeKernel(mat.cq(),kzero,1,0,&kn,0,0,0,0)); + CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,0,&kn,0,0,0,0)); + #endif } // -------------------------------------------------------------------------- @@ -470,9 +573,15 @@ template <> struct _ucl_memcpy<1,0> { cl_command_queue &cq, const cl_bool block, const size_t dst_offset, const size_t src_offset) { if (src.cbegin()==dst.cbegin()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 1S\n"; + #endif if (block) ucl_sync(cq); return; } + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 1NS\n"; + #endif CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n, dst.begin(),0,NULL,NULL)); } @@ -484,8 +593,14 @@ template <> struct _ucl_memcpy<1,0> { size_t dst_offset, size_t src_offset) { if (src.cbegin()==dst.cbegin()) { if (block) ucl_sync(cq); + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 2S\n"; + #endif return; } + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 2NS\n"; + #endif if (spitch==dpitch && dst.cols()==src.cols() && src.cols()==cols/src.element_size()) CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset, @@ -511,8 +626,14 @@ template <> struct _ucl_memcpy<0,1> { const size_t dst_offset, const size_t src_offset) { if (src.cbegin()==dst.cbegin()) { if (block) 
ucl_sync(cq); + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 3S\n"; + #endif return; } + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 3NS\n"; + #endif CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n, src.begin(),0,NULL,NULL)); } @@ -524,8 +645,14 @@ template <> struct _ucl_memcpy<0,1> { size_t dst_offset, size_t src_offset) { if (src.cbegin()==dst.cbegin()) { if (block) ucl_sync(cq); + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 4S\n"; + #endif return; } + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 4NS\n"; + #endif if (spitch==dpitch && dst.cols()==src.cols() && src.cols()==cols/src.element_size()) CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset, @@ -549,9 +676,17 @@ template struct _ucl_memcpy { static inline void mc(p1 &dst, const p2 &src, const size_t n, cl_command_queue &cq, const cl_bool block, const size_t dst_offset, const size_t src_offset) { - if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) + if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) { CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset, dst_offset,n,0,NULL,NULL)); + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 6NS\n"; + #endif + } + #ifdef UCL_DBG_MEM_TRACE + else std::cerr << "UCL_COPY 6S\n"; + #endif + if (block==CL_TRUE) ucl_sync(cq); } template @@ -561,6 +696,9 @@ template struct _ucl_memcpy { const cl_bool block, size_t dst_offset, size_t src_offset) { if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 7NS\n"; + #endif if (spitch==dpitch && dst.cols()==src.cols() && src.cols()==cols/src.element_size()) CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset, @@ -575,6 +713,10 @@ template struct _ucl_memcpy { dst_offset+=dpitch; } } + #ifdef UCL_DBG_MEM_TRACE + else std::cerr << "UCL_COPY 7S\n"; + #endif + if (block==CL_TRUE) ucl_sync(cq); } }; diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index d3d6810259..627d19d66f 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -27,6 +27,12 @@ #include "ocl_macros.h" #include "ocl_device.h" +#ifdef CL_VERSION_1_2 +#define UCL_OCL_MARKER(cq,event) clEnqueueMarkerWithWaitList(cq,0,NULL,event) +#else +#define UCL_OCL_MARKER clEnqueueMarker +#endif + namespace ucl_opencl { /// Class for timing OpenCL events @@ -63,10 +69,10 @@ class UCL_Timer { } /// Start timing on default command queue - inline void start() { clEnqueueMarker(_cq,&start_event); } + inline void start() { UCL_OCL_MARKER(_cq,&start_event); } /// Stop timing on default command queue - inline void stop() { clEnqueueMarker(_cq,&stop_event); } + inline void stop() { UCL_OCL_MARKER(_cq,&stop_event); } /// Block until the start event has been reached on device inline void sync_start() @@ -78,7 +84,7 @@ class UCL_Timer { /// Set the time elapsed to zero (not the total_time) inline void zero() - { clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); } + { UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); } /// Set the total time to zero inline void zero_total() { _total_time=0.0; } diff --git a/lib/gpu/geryon/ucl_basemat.h b/lib/gpu/geryon/ucl_basemat.h index 844071c9b5..4edf83e057 100644 --- a/lib/gpu/geryon/ucl_basemat.h +++ b/lib/gpu/geryon/ucl_basemat.h @@ -58,19 +58,36 @@ * calls for reserving and copying memory **/ class UCL_BaseMat { public: - UCL_BaseMat() : _cq(0) { } + UCL_BaseMat() : _cq(0), _kind(UCL_VIEW) { } virtual ~UCL_BaseMat() { } /// Return the 
default command queue/stream associated with this data inline command_queue & cq() { return _cq; } + /// Change the default command queue associated with matrix + inline void cq(command_queue &cq_in) { _cq=cq_in; } /// Block until command_queue associated with matrix is complete inline void sync() { ucl_sync(_cq); } + /// Return the type/permissions of memory allocation + /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED + * or UCL_VIEW **/ + inline enum UCL_MEMOPT kind() const { return _kind; } + + inline bool shared_mem_device() { + #ifdef _OCL_MAT + cl_device_id device; + CL_SAFE_CALL(clGetCommandQueueInfo(_cq,CL_QUEUE_DEVICE, + sizeof(cl_device_id),&device,NULL)); + cl_device_type device_type; + CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE, + sizeof(device_type),&device_type,NULL)); + return _shared_mem_device(device_type); + #else + return false; + #endif + } - #ifdef UCL_DEBUG - // Returns the type of host allocation - virtual inline enum UCL_MEMOPT kind() const { return UCL_NOT_PINNED; } - #endif protected: command_queue _cq; + enum UCL_MEMOPT _kind; }; #endif diff --git a/lib/gpu/geryon/ucl_copy.h b/lib/gpu/geryon/ucl_copy.h index c201cc0b12..c6bff97a8c 100644 --- a/lib/gpu/geryon/ucl_copy.h +++ b/lib/gpu/geryon/ucl_copy.h @@ -102,6 +102,30 @@ // Only allow this file to be included by nvc_memory.h and ocl_memory.h #ifdef UCL_COPY_ALLOW +// -------------------------------------------------------------------------- +// - CHECK PERMISSIONS FOR SOURCE AND DESTINATION IN COPY +// -------------------------------------------------------------------------- +template +inline void _check_ucl_copy_perm(mat1 &dst, mat2 &src) { + if ((int)mat1::MEM_TYPE==(int)mat2::MEM_TYPE) { + if (dst.kind()==UCL_READ_ONLY) { + std::cerr << "Attempt to copy where destination is UCL_READ_ONLY\n"; + assert(0==1); + } else if (src.kind()==UCL_WRITE_ONLY) { + std::cerr << "Attempt to copy where source is UCL_WRITE_ONLY\n"; + assert(0==1); + } + } else { + if (dst.kind()==UCL_WRITE_ONLY) { + std::cerr << "Destination in host-device copy cannot be UCL_WRITE_ONLY\n"; + assert(0==1); + } else if (src.kind()==UCL_READ_ONLY) { + std::cerr << "Source in host-device copy cannot be UCL_READ_ONLY\n"; + assert(0==1); + } + } +} + // -------------------------------------------------------------------------- // - HOST-HOST COPY ROUTINES // -------------------------------------------------------------------------- @@ -117,9 +141,20 @@ template <> struct _host_host_copy<1,1> { assert(mat1::PADDED==0 && mat2::PADDED==0); assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); #endif - if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) + if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) { + #ifdef _OCL_MAT + if (dst.begin()==src.begin()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 7S\n"; + #endif + return; + } + #endif memcpy(dst.begin(),src.begin(),numel*sizeof(typename mat1::data_type)); - else + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 7NS\n"; + #endif + } else for (size_t i=0; i(src[i]); } @@ -138,15 +173,27 @@ template <> struct _host_host_copy<1,1> { src_row_size=cols; else src_row_size=src.row_size(); - if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) + if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) { + #ifdef _OCL_MAT + if (dst.begin()==src.begin()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 8S\n"; + #endif + return; + } + #endif + + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 
8NS\n"; + #endif for (size_t i=0; i(src[src_i]); src_i++; @@ -216,15 +263,14 @@ template struct _ucl_cast_copy<1,host_type2> { ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, src.row_bytes(),cols*sizeof(typename mat2::data_type), rows); - int dst_i=0; - int buff_i=0; + size_t dst_i=0, buff_i=0, doff=dst.cols()-cols; for (size_t i=0; i(cast_buffer[buff_i]); buff_i++; dst_i++; } - dst_i+=dst.cols()-cols; + dst_i+=doff; } } } @@ -255,15 +301,14 @@ template struct _ucl_cast_copy<1,host_type2> { src.row_bytes(),cols*sizeof(typename mat2::data_type), rows,cq); cast_buffer.sync(); - int dst_i=0; - int buff_i=0; + size_t dst_i=0, buff_i=0, doff=dst.cols()-cols; for (size_t i=0; i(cast_buffer[buff_i]); buff_i++; dst_i++; } - dst_i+=dst.cols()-cols; + dst_i+=doff; } } } @@ -293,38 +338,62 @@ template struct _ucl_cast_copy { assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); + if (mat3::VECTOR==0) { + assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols); + assert(dst.rows()>=rows && dst.cols()>=cols); + } #endif if (mat2::VECTOR) { - for (size_t i=0; i(src[i]); - ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer, - cols*sizeof(typename mat1::data_type), - cols*sizeof(typename mat1::data_type),rows); + if (mat3::VECTOR==0) { + size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols; + for (size_t i=0; i(src[si]); + ci++; + si++; + } + ci+=co; + si+=so; + } + ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(), + cols*sizeof(typename mat1::data_type),rows); + } else { + for (size_t i=0; i(src[i]); + ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer, + cols*sizeof(typename mat1::data_type), + cols*sizeof(typename mat1::data_type),rows); + } } else if (mat1::VECTOR) { - int src_i=0; - int buf_i=0; + size_t src_i=0, buf_i=0, soff=src.cols()-cols; for (size_t i=0; i(src[src_i]); buf_i++; src_i++; } - src_i+=src.cols()-cols; + src_i+=soff; } ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows); } else { - int src_i=0; - int buf_i=0; + size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch; + if (mat3::VECTOR==0) { + co=cast_buffer.cols()-cols; + spitch=cast_buffer.row_bytes(); + } else { + co=0; + spitch=cols*sizeof(typename mat1::data_type); + } for (size_t i=0; i(src[src_i]); buf_i++; src_i++; } - src_i+=src.cols()-cols; + src_i+=so; + buf_i+=co; } - ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer, - cols*sizeof(typename mat1::data_type), + ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch, cols*sizeof(typename mat1::data_type),rows); } } @@ -337,38 +406,62 @@ template struct _ucl_cast_copy { assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); + if (mat3::VECTOR==0) { + assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols); + assert(dst.rows()>=rows && dst.cols()>=cols); + } #endif if (mat2::VECTOR) { - for (size_t i=0; i(src[i]); - ucl_mv_cpy(dst,dst.row_bytes(), - cast_buffer,cols*sizeof(typename mat1::data_type), - cols*sizeof(typename mat1::data_type),rows,cq); + if (mat3::VECTOR==0) { + size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols; + for (size_t i=0; i(src[si]); + ci++; + si++; + } + ci+=co; + si+=so; + } + ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(), + cols*sizeof(typename mat1::data_type),rows); + } else { 
+ for (size_t i=0; i(src[i]); + ucl_mv_cpy(dst,dst.row_bytes(), + cast_buffer,cols*sizeof(typename mat1::data_type), + cols*sizeof(typename mat1::data_type),rows,cq); + } } else if (mat1::VECTOR) { - int src_i=0; - int buf_i=0; + size_t src_i=0, buf_i=0, soff=src.cols()-cols; for (size_t i=0; i(src[src_i]); buf_i++; src_i++; } - src_i+=src.cols()-cols; + src_i+=soff; } ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows,cq); } else { - int src_i=0; - int buf_i=0; + size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch; + if (mat3::VECTOR==0) { + co=cast_buffer.cols()-cols; + spitch=cast_buffer.row_bytes(); + } else { + co=0; + spitch=cols*sizeof(typename mat1::data_type); + } for (size_t i=0; i(src[src_i]); buf_i++; src_i++; } - src_i+=src.cols()-cols; + src_i+=so; + buf_i+=co; } - ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer, - cols*sizeof(typename mat1::data_type), + ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch, cols*sizeof(typename mat1::data_type),rows,cq); } } @@ -444,9 +537,13 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel, #endif if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) ucl_copy(dst,src,numel,cq); - else + else { + #ifdef UCL_DEBUG + _check_ucl_copy_perm(dst,src); + #endif _ucl_cast_copy::cc(dst,src,numel, cast_buffer,cq); + } } /// Asynchronous copy of matrix/vector with cast (Device/Host transfer) @@ -463,6 +560,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel, assert(dst.numel()>=numel && src.numel()>=numel); assert(cast_buffer.numel()>=numel); assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + _check_ucl_copy_perm(dst,src); #endif if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) ucl_copy(dst,src,numel,async); @@ -491,6 +589,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel); assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + _check_ucl_copy_perm(dst,src); #endif if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) _host_host_copy::hhc(dst,src,numel); @@ -498,12 +597,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { if (mat1::MEM_TYPE==1) { UCL_H_Vec cast_buffer; - cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED); + cast_buffer.alloc(numel,dst,UCL_READ_ONLY); _ucl_cast_copy::cc(dst,src,numel, cast_buffer,cq); } else { UCL_H_Vec cast_buffer; - cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED); + cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY); _ucl_cast_copy::cc(dst,src,numel, cast_buffer,cq); } @@ -529,6 +628,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, #ifdef UCL_DEBUG assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel); assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + _check_ucl_copy_perm(dst,src); #endif if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) _host_host_copy::hhc(dst,src,numel); @@ -538,12 +638,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { if (mat1::MEM_TYPE==1) { UCL_H_Vec cast_buffer; - cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED); + cast_buffer.alloc(numel,dst,UCL_READ_ONLY); _ucl_cast_copy::cc(dst,src,numel, cast_buffer); } else { UCL_H_Vec cast_buffer; - cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED); + cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY); _ucl_cast_copy::cc(dst,src,numel, cast_buffer); } @@ -574,9 +674,13 @@ inline void ucl_cast_copy(mat1 
&dst, const mat2 &src, const size_t rows, ucl_copy(dst,src,rows,cols,async); else if (async) ucl_copy(dst,src,rows,cols,dst.cq()); - else + else { + #ifdef UCL_DEBUG + _check_ucl_copy_perm(dst,src); + #endif _ucl_cast_copy::cc(dst,src,rows,cols, cast_buffer); + } } /// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer) @@ -595,9 +699,13 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows, command_queue &cq) { if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) ucl_copy(dst,src,rows,cols,cq); - else + else { + #ifdef UCL_DEBUG + _check_ucl_copy_perm(dst,src); + #endif _ucl_cast_copy::cc(dst,src,rows,cols, cast_buffer,cq); + } } /// Asynchronous copy of subset matrix rows,cols (memory already allocated) @@ -617,18 +725,21 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows, template inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, const size_t cols, command_queue &cq) { + #ifdef UCL_DEBUG + _check_ucl_copy_perm(dst,src); + #endif if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) _host_host_copy::hhc(dst,src,rows,cols); else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { if (mat1::MEM_TYPE==1) { UCL_H_Vec cast_buffer; - cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED); + cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY); _ucl_cast_copy::cc(dst,src,rows,cols, cast_buffer,cq); } else { UCL_H_Vec cast_buffer; - cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED); + cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY); _ucl_cast_copy::cc(dst,src,rows,cols, cast_buffer,cq); } @@ -678,6 +789,9 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, template inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, const size_t cols, const bool async) { + #ifdef UCL_DEBUG + _check_ucl_copy_perm(dst,src); + #endif if (async) ucl_copy(dst,src,rows,cols,dst.cq()); else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) @@ -686,12 +800,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { if (mat1::MEM_TYPE==1) { UCL_H_Vec cast_buffer; - cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED); + cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY); _ucl_cast_copy::cc(dst,src,rows,cols, cast_buffer); } else { UCL_H_Vec cast_buffer; - cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED); + cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY); _ucl_cast_copy::cc(dst,src,rows,cols, cast_buffer); } diff --git a/lib/gpu/geryon/ucl_d_mat.h b/lib/gpu/geryon/ucl_d_mat.h index b065a8b644..f1aaa27903 100644 --- a/lib/gpu/geryon/ucl_d_mat.h +++ b/lib/gpu/geryon/ucl_d_mat.h @@ -39,14 +39,14 @@ class UCL_D_Mat : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_D_Mat() : _rows(0), _kind(UCL_VIEW) {} - ~UCL_D_Mat() { if (_kind!=UCL_VIEW) _device_free(*this); } + UCL_D_Mat() : _cols(0) {} + ~UCL_D_Mat() { _device_free(*this); } /// Construct with specified rows and cols /** \sa alloc() **/ UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device, const enum UCL_MEMOPT kind=UCL_READ_WRITE) : - _rows(0), _kind(UCL_VIEW) { alloc(rows,cols,device,kind); } + _cols(0) { alloc(rows,cols,device,kind); } /// Row major matrix on device /** The kind parameter controls memory optimizations as follows: @@ -121,15 +121,11 @@ class UCL_D_Mat : public UCL_BaseMat { return err; } - /// Return the type of memory allocation - /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/ - inline enum UCL_MEMOPT kind() const { 
return _kind; } - /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * \param stride Number of _elements_ between the start of each row **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols, @@ -142,8 +138,10 @@ class UCL_D_Mat : public UCL_BaseMat { _row_size=stride; this->_cq=input.cq(); #ifdef _OCL_MAT - _offset=0; + _offset=input.offset(); _array=input.cbegin(); + CL_SAFE_CALL(clRetainMemObject(input.cbegin())); + CL_SAFE_CALL(clRetainCommandQueue(input.cq())); #else _device_view(&_array,input.begin()); #endif @@ -157,7 +155,7 @@ class UCL_D_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols) { view(input,rows,cols,input.row_size()); } @@ -166,7 +164,7 @@ class UCL_D_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template @@ -177,7 +175,7 @@ class UCL_D_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template @@ -187,7 +185,7 @@ class UCL_D_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * \param stride Number of _elements_ between the start of each row **/ template inline void view(ptr_type input, const size_t rows, const size_t cols, @@ -205,13 +203,15 @@ class UCL_D_Mat : public UCL_BaseMat { #endif #ifdef _OCL_MAT _offset=0; + CL_SAFE_CALL(clRetainMemObject(input)); + CL_SAFE_CALL(clRetainCommandQueue(dev.cq())); #endif } /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view(ptr_type input, const size_t rows, const size_t cols, UCL_Device &dev) { view(input,rows,cols,cols,dev); } @@ -219,7 +219,7 @@ class UCL_D_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view(ptr_type input, const size_t cols, UCL_Device &dev) { view(input,1,cols,dev); } @@ -228,7 +228,7 @@ class UCL_D_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, @@ -242,7 +242,9 @@ class UCL_D_Mat : public UCL_BaseMat { this->_cq=input.cq(); #ifdef _OCL_MAT _array=input.begin(); - _offset=offset; + _offset=offset+input.offset(); + CL_SAFE_CALL(clRetainMemObject(input.cbegin())); + CL_SAFE_CALL(clRetainCommandQueue(input.cq())); #else _device_view(&_array,input.begin(),offset,sizeof(numtyp)); #endif @@ -256,7 +258,7 @@ class UCL_D_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, const size_t cols) @@ -266,7 +268,7 @@ class UCL_D_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template @@ -277,7 +279,7 @@ class UCL_D_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template @@ -292,7 +294,7 @@ class UCL_D_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ptr_type input,const size_t rows, @@ -309,6 +311,8 @@ class UCL_D_Mat : public UCL_BaseMat { #ifdef _OCL_MAT _array=input; _offset=offset; + CL_SAFE_CALL(clRetainMemObject(input)); + CL_SAFE_CALL(clRetainCommandQueue(dev.cq())); #else #ifdef _UCL_DEVICE_PTR_MAT _array=input+offset*sizeof(numtyp); @@ -325,7 +329,7 @@ class UCL_D_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view_offset(const size_t offset,ptr_type input,const size_t rows, const size_t cols, UCL_Device &dev) @@ -334,7 +338,7 @@ class UCL_D_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view_offset(const size_t offset, ptr_type input, const size_t cols, UCL_Device &dev) @@ -342,7 +346,7 @@ class UCL_D_Mat : public UCL_BaseMat { /// Free memory and set size to 0 inline void clear() - { _rows=0; if (_kind!=UCL_VIEW) { _kind=UCL_VIEW; _device_free(*this); } } + { _device_free(*this); _cols=0; _kind=UCL_VIEW; } /// Resize the allocation to contain cols elements /** \note Cannot be used on views **/ @@ -377,11 +381,17 @@ class UCL_D_Mat : public UCL_BaseMat { { if (cols>_cols || rows>_rows) return resize(rows,cols); else return UCL_SUCCESS; } - /// Set each element to zero - inline void zero() { _device_zero(*this,row_bytes()*_rows); } - - /// Set first n elements to zero - inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); } + /// Set each element to zero asynchronously in the default command_queue + inline void zero() { zero(_cq); } + /// Set first n elements to zero asynchronously in the default command_queue + inline void zero(const int n) { zero(n,_cq); } + /// Set each element to zero asynchronously + inline void zero(command_queue &cq) + { _device_zero(*this,row_bytes()*_rows,cq); } + /// Set first n elements to zero asynchronously + inline void zero(const int n, command_queue &cq) + { _device_zero(*this,n*sizeof(numtyp),cq); } + #ifdef _UCL_DEVICE_PTR_MAT /// For OpenCL, returns a (void *) device pointer to memory allocation @@ -452,7 +462,6 @@ class UCL_D_Mat : public UCL_BaseMat { private: size_t _pitch, _row_size, _rows, _cols; - enum UCL_MEMOPT _kind; #ifdef _UCL_DEVICE_PTR_MAT device_ptr _array; diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index 11107437ea..fc1977f4b5 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -39,14 +39,14 @@ class UCL_D_Vec : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_D_Vec() : _cols(0), _kind(UCL_VIEW) {} - ~UCL_D_Vec() { if (_kind!=UCL_VIEW) _device_free(*this); } + UCL_D_Vec() : _cols(0) {} + ~UCL_D_Vec() { _device_free(*this); } /// Construct with n columns /** \sa alloc() **/ UCL_D_Vec(const size_t n, UCL_Device &device, const enum UCL_MEMOPT kind=UCL_READ_WRITE) : - _cols(0), _kind(UCL_VIEW) { alloc(n,device,kind); } + _cols(0) { alloc(n,device,kind); } /// Set up host vector with 'cols' columns and reserve memory /** The kind parameter controls memory optimizations as follows: @@ -119,15 +119,11 @@ class UCL_D_Vec : public UCL_BaseMat { return err; } - /// Return the type of memory allocation - /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/ - inline enum UCL_MEMOPT kind() const { return _kind; } - /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
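// ---------------------------------------------------------------------------
// Editorial illustration (not part of the patch): the zero() overloads added
// above now enqueue the clear asynchronously in a command queue.  Assumes a
// constructed UCL_Device named `gpu`; names are hypothetical.
// ---------------------------------------------------------------------------
inline void example_async_zero(UCL_Device &gpu) {
  UCL_D_Mat<float> m(64,64,gpu,UCL_READ_WRITE);
  m.zero();             // whole matrix, m's default command queue
  m.zero(64);           // only the first 64 elements
  m.zero(64,gpu.cq());  // first 64 elements, explicitly chosen queue
  // The calls return immediately; any host-side read of the data must first
  // wait for the queue used above to complete.
}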
* - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols) { #ifdef UCL_DEBUG @@ -139,8 +135,10 @@ class UCL_D_Vec : public UCL_BaseMat { _row_bytes=_cols*sizeof(numtyp); this->_cq=input.cq(); #ifdef _OCL_MAT - _offset=0; + _offset=input.offset(); _array=input.cbegin(); + CL_SAFE_CALL(clRetainMemObject(input.cbegin())); + CL_SAFE_CALL(clRetainCommandQueue(input.cq())); #else _device_view(&_array,input.begin()); #endif @@ -154,7 +152,7 @@ class UCL_D_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * \param stride Number of _elements_ between the start of each row **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols, @@ -164,7 +162,7 @@ class UCL_D_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template @@ -175,7 +173,7 @@ class UCL_D_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template @@ -185,7 +183,7 @@ class UCL_D_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view(ptr_type input, const size_t rows, const size_t cols, UCL_Device &dev) { @@ -203,13 +201,15 @@ class UCL_D_Vec : public UCL_BaseMat { #endif #ifdef _OCL_MAT _offset=0; + CL_SAFE_CALL(clRetainMemObject(input)); + CL_SAFE_CALL(clRetainCommandQueue(dev.cq())); #endif } /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * \param stride Number of _elements_ between the start of each row **/ template inline void view(ptr_type input, const size_t rows, const size_t cols, @@ -219,7 +219,7 @@ class UCL_D_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view(ptr_type input, const size_t cols, UCL_Device &dev) { view(input,1,cols,dev); } @@ -228,7 +228,7 @@ class UCL_D_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, const size_t cols) { @@ -242,7 +242,9 @@ class UCL_D_Vec : public UCL_BaseMat { this->_cq=input.cq(); #ifdef _OCL_MAT _array=input.begin(); - _offset=offset; + _offset=offset+input.offset(); + CL_SAFE_CALL(clRetainMemObject(input.begin())); + CL_SAFE_CALL(clRetainCommandQueue(input.cq())); #else _device_view(&_array,input.begin(),offset,sizeof(numtyp)); #endif @@ -256,7 +258,7 @@ class UCL_D_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, @@ -267,7 +269,7 @@ class UCL_D_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template @@ -278,7 +280,7 @@ class UCL_D_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template @@ -288,7 +290,7 @@ class UCL_D_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view_offset(const size_t offset,ptr_type input,const size_t rows, const size_t cols, UCL_Device &dev) { @@ -304,6 +306,8 @@ class UCL_D_Vec : public UCL_BaseMat { #ifdef _OCL_MAT _array=input; _offset=offset; + CL_SAFE_CALL(clRetainMemObject(input)); + CL_SAFE_CALL(clRetainCommandQueue(dev.cq())); #else #ifdef _UCL_DEVICE_PTR_MAT _array=input+offset*sizeof(numtyp); @@ -320,7 +324,7 @@ class UCL_D_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ptr_type input,const size_t rows, @@ -330,7 +334,7 @@ class UCL_D_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. 
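// ---------------------------------------------------------------------------
// Editorial illustration (not part of the patch): an offset view shares the
// parent allocation without copying.  Under OpenCL this patch now retains the
// cl_mem and command queue, so the view keeps the buffer alive; under CUDA
// the parent container must still outlive the view.  Assumes a constructed
// UCL_Device named `gpu`; names are hypothetical.
// ---------------------------------------------------------------------------
inline void example_offset_view(UCL_Device &gpu) {
  const size_t m=128;
  UCL_D_Vec<float> whole(2*m,gpu,UCL_READ_WRITE);
  UCL_D_Vec<float> upper;
  upper.view_offset(m,whole,1,m);  // elements [m,2m) of `whole`, 1 row x m cols
  upper.zero();                    // clears only the viewed half
}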
* - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view_offset(const size_t offset, ptr_type input, const size_t cols, UCL_Device &dev) @@ -338,7 +342,7 @@ class UCL_D_Vec : public UCL_BaseMat { /// Free memory and set size to 0 inline void clear() - { if (_kind!=UCL_VIEW) { _cols=0; _kind=UCL_VIEW; _device_free(*this); } } + { _device_free(*this); _cols=0; _kind=UCL_VIEW; } /// Resize the allocation to contain cols elements /** \note Cannot be used on views **/ @@ -373,11 +377,15 @@ class UCL_D_Vec : public UCL_BaseMat { inline int resize_ib(const int cols) { if (cols>_cols) return resize(cols); else return UCL_SUCCESS; } - /// Set each element to zero - inline void zero() { _device_zero(*this,row_bytes()); } - - /// Set first n elements to zero - inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); } + /// Set each element to zero asynchronously in the default command_queue + inline void zero() { zero(_cq); } + /// Set first n elements to zero asynchronously in the default command_queue + inline void zero(const int n) { zero(n,_cq); } + /// Set each element to zero asynchronously + inline void zero(command_queue &cq) { _device_zero(*this,row_bytes(),cq); } + /// Set first n elements to zero asynchronously + inline void zero(const int n, command_queue &cq) + { _device_zero(*this,n*sizeof(numtyp),cq); } #ifdef _UCL_DEVICE_PTR_MAT /// For OpenCL, returns a (void *) device pointer to memory allocation @@ -465,7 +473,6 @@ class UCL_D_Vec : public UCL_BaseMat { private: size_t _row_bytes, _row_size, _rows, _cols; - enum UCL_MEMOPT _kind; #ifdef _UCL_DEVICE_PTR_MAT device_ptr _array; diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h index 806b930630..dc6da3de0c 100644 --- a/lib/gpu/geryon/ucl_h_mat.h +++ b/lib/gpu/geryon/ucl_h_mat.h @@ -39,33 +39,35 @@ class UCL_H_Mat : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { + UCL_H_Mat() : _cols(0) { #ifdef _OCL_MAT _carray=(cl_mem)(0); #endif } - ~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); } + ~UCL_H_Mat() { _host_free(*this); } /// Construct with specied number of rows and columns /** \sa alloc() **/ UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device, - const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) - { _rows=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); } + const enum UCL_MEMOPT kind=UCL_READ_WRITE) + { _cols=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); } /// Set up host matrix with specied # of rows/cols and reserve memory /** The kind parameter controls memory pinning as follows: - * - UCL_NOT_PINNED - Memory is not pinned - * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) - * - UCL_RW_OPTIMIZED - Memory can be pinned + * - UCL_READ_WRITE - Specify that you will read and write from host + * - UCL_WRITE_ONLY - Specify that you will only write from host + * - UCL_READ_ONLY - Specify that you will only read from host + * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host * \param cq Default command queue for operations copied from another mat * \return UCL_SUCCESS if the memory allocation is successful **/ template inline int alloc(const size_t rows, const size_t cols, mat_type &cq, - const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { + const enum UCL_MEMOPT kind=UCL_READ_WRITE, + const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) { clear(); _row_bytes=cols*sizeof(numtyp); - int 
err=_host_alloc(*this,cq,_row_bytes*rows,kind); + int err=_host_alloc(*this,cq,_row_bytes*rows,kind,kind2); if (err!=UCL_SUCCESS) { #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows @@ -86,17 +88,19 @@ class UCL_H_Mat : public UCL_BaseMat { /// Set up host matrix with specied # of rows/cols and reserve memory /** The kind parameter controls memory pinning as follows: - * - UCL_NOT_PINNED - Memory is not pinned - * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) - * - UCL_RW_OPTIMIZED - Memory can be pinned + * - UCL_READ_WRITE - Specify that you will read and write from host + * - UCL_WRITE_ONLY - Specify that you will only write from host + * - UCL_READ_ONLY - Specify that you will only read from host + * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host * \param device Used to get the default command queue for operations * \return UCL_SUCCESS if the memory allocation is successful **/ inline int alloc(const size_t rows, const size_t cols, UCL_Device &device, - const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { + const enum UCL_MEMOPT kind=UCL_READ_WRITE, + const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) { clear(); _row_bytes=cols*sizeof(numtyp); - int err=_host_alloc(*this,device,_row_bytes*rows,kind); + int err=_host_alloc(*this,device,_row_bytes*rows,kind,kind2); if (err!=UCL_SUCCESS) { #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows @@ -115,15 +119,11 @@ class UCL_H_Mat : public UCL_BaseMat { return err; } - /// Return the type of memory allocation - /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/ - inline enum UCL_MEMOPT kind() const { return _kind; } - /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported * \param stride Number of _elements_ between the start of each row **/ template @@ -140,6 +140,8 @@ class UCL_H_Mat : public UCL_BaseMat { _end=_array+_cols; #ifdef _OCL_MAT _carray=input.cbegin(); + CL_SAFE_CALL(clRetainMemObject(input.cbegin())); + CL_SAFE_CALL(clRetainCommandQueue(input.cq())); #endif } @@ -147,7 +149,7 @@ class UCL_H_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols) @@ -157,7 +159,7 @@ class UCL_H_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view * - Viewing a device container on the host is not supported **/ @@ -169,9 +171,9 @@ class UCL_H_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
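// ---------------------------------------------------------------------------
// Editorial illustration (not part of the patch): the alloc() overloads above
// take the host-access hint directly (UCL_READ_WRITE / UCL_WRITE_ONLY /
// UCL_READ_ONLY / UCL_NOT_PINNED); the optional second hint is only consulted
// when the same allocation also backs the device view on shared-memory
// hardware.  Assumes a constructed UCL_Device named `gpu`.
// ---------------------------------------------------------------------------
inline int example_host_alloc(UCL_Device &gpu) {
  UCL_H_Mat<double> staging;
  // Host code only writes into this matrix before pushing it to the device,
  // so request write-optimized (possibly write-combined) pinned memory:
  int err=staging.alloc(32,32,gpu,UCL_WRITE_ONLY);
  if (err!=UCL_SUCCESS) return err;   // propagate the allocation error
  staging.zero();
  return UCL_SUCCESS;
}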
* - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) - * will be used for view + * will be used for view when using CUDA APIs * - Viewing a device container on the host is not supported **/ template inline void view(ucl_type &input) @@ -180,7 +182,7 @@ class UCL_H_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported * \param stride Number of _elements_ between the start of each row **/ template @@ -197,14 +199,14 @@ class UCL_H_Mat : public UCL_BaseMat { _end=_array+_cols; #ifdef _OCL_MAT - _host_alloc(*this,dev,_row_bytes,UCL_VIEW); + _host_view(*this,dev,_row_bytes*rows); #endif } /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template inline void view(ptr_type *input, const size_t rows, const size_t cols, @@ -213,7 +215,7 @@ class UCL_H_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template inline void view(ptr_type *input, const size_t cols, UCL_Device &dev) @@ -223,7 +225,7 @@ class UCL_H_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported * \param stride Number of _elements_ between the start of each row **/ template @@ -239,7 +241,7 @@ class UCL_H_Mat : public UCL_BaseMat { _array=input.begin()+offset; _end=_array+_cols; #ifdef _OCL_MAT - _host_alloc(*this,input,_row_bytes,UCL_VIEW); + _host_view(*this,input,_row_bytes*_rows); #endif } @@ -247,7 +249,7 @@ class UCL_H_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, @@ -258,7 +260,7 @@ class UCL_H_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view * - Viewing a device container on the host is not supported **/ @@ -270,7 +272,7 @@ class UCL_H_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view * - Viewing a device container on the host is not supported **/ @@ -296,7 +298,7 @@ class UCL_H_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported * \param stride Number of _elements_ between the start of each row **/ template @@ -307,7 +309,7 @@ class UCL_H_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template inline void view_offset(const size_t offset, ptr_type *input, @@ -316,7 +318,7 @@ class UCL_H_Mat : public UCL_BaseMat { /// Free memory and set size to 0 inline void clear() - { if (_kind!=UCL_VIEW) {_rows=0; _kind=UCL_VIEW; _host_free(*this,_kind); }} + { _host_free(*this); _cols=0; _kind=UCL_VIEW; } /// Resize the allocation to rows x cols elements /** \note Cannot be used on views **/ @@ -409,7 +411,6 @@ class UCL_H_Mat : public UCL_BaseMat { #endif private: - enum UCL_MEMOPT _kind; numtyp *_array, *_end; size_t _row_bytes, _rows, _cols; diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index 3a53113153..773facdea0 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -39,33 +39,35 @@ class UCL_H_Vec : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { + UCL_H_Vec() : _cols(0) { #ifdef _OCL_MAT _carray=(cl_mem)(0); #endif } - ~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); } + ~UCL_H_Vec() { _host_free(*this); } /// Construct with n columns /** \sa alloc() **/ UCL_H_Vec(const size_t n, UCL_Device &device, - const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) + const enum UCL_MEMOPT kind=UCL_READ_WRITE) { _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); } /// Set up host vector with 'cols' columns and reserve memory /** The kind parameter controls memory pinning as follows: - * - UCL_NOT_PINNED - Memory is not pinned - * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) - * - UCL_RW_OPTIMIZED - Memory can be pinned + * - UCL_READ_WRITE - Specify that you will read and write from host + * - UCL_WRITE_ONLY - Specify that you will only write from host + * - UCL_READ_ONLY - Specify that you will only read from host + * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host * \param cq Default command queue for operations copied from another mat * \return UCL_SUCCESS if the memory allocation is successful **/ template 
inline int alloc(const size_t cols, mat_type &cq, - const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { + const enum UCL_MEMOPT kind=UCL_READ_WRITE, + const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) { clear(); _row_bytes=cols*sizeof(numtyp); - int err=_host_alloc(*this,cq,_row_bytes,kind); + int err=_host_alloc(*this,cq,_row_bytes,kind,kind2); if (err!=UCL_SUCCESS) { #ifndef UCL_NO_EXIT @@ -86,17 +88,19 @@ class UCL_H_Vec : public UCL_BaseMat { /// Set up host vector with 'cols' columns and reserve memory /** The kind parameter controls memory pinning as follows: - * - UCL_NOT_PINNED - Memory is not pinned - * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) - * - UCL_RW_OPTIMIZED - Memory can be pinned + * - UCL_READ_WRITE - Specify that you will read and write from host + * - UCL_WRITE_ONLY - Specify that you will only write from host + * - UCL_READ_ONLY - Specify that you will only read from host + * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host * \param device Used to get the default command queue for operations * \return UCL_SUCCESS if the memory allocation is successful **/ inline int alloc(const size_t cols, UCL_Device &device, - const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { + const enum UCL_MEMOPT kind=UCL_READ_WRITE, + const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) { clear(); _row_bytes=cols*sizeof(numtyp); - int err=_host_alloc(*this,device,_row_bytes,kind); + int err=_host_alloc(*this,device,_row_bytes,kind,kind2); if (err!=UCL_SUCCESS) { #ifndef UCL_NO_EXIT @@ -115,15 +119,11 @@ class UCL_H_Vec : public UCL_BaseMat { return err; } - /// Return the type of memory allocation - /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/ - inline enum UCL_MEMOPT kind() const { return _kind; } - /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols) { @@ -139,6 +139,8 @@ class UCL_H_Vec : public UCL_BaseMat { _end=_array+_cols; #ifdef _OCL_MAT _carray=input.cbegin(); + CL_SAFE_CALL(clRetainMemObject(input.cbegin())); + CL_SAFE_CALL(clRetainCommandQueue(input.cq())); #endif } @@ -146,7 +148,7 @@ class UCL_H_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported * \param stride Number of _elements_ between the start of each row **/ template @@ -157,7 +159,7 @@ class UCL_H_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view * - Viewing a device container on the host is not supported **/ @@ -180,7 +182,7 @@ class UCL_H_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template inline void view(ptr_type *input, const size_t rows, const size_t cols, @@ -197,14 +199,14 @@ class UCL_H_Vec : public UCL_BaseMat { _end=_array+_cols; #ifdef _OCL_MAT - _host_alloc(*this,dev,_row_bytes,UCL_VIEW); + _host_view(*this,dev,_row_bytes); #endif } /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported * \param stride Number of _elements_ between the start of each row **/ template @@ -215,7 +217,7 @@ class UCL_H_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template inline void view(ptr_type *input, const size_t cols, UCL_Device &dev) @@ -225,7 +227,7 @@ class UCL_H_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, @@ -241,7 +243,7 @@ class UCL_H_Vec : public UCL_BaseMat { _array=input.begin()+offset; _end=_array+_cols; #ifdef _OCL_MAT - _host_alloc(*this,input,_row_bytes,UCL_VIEW); + _host_view(*this,input,_row_bytes); #endif } @@ -249,7 +251,7 @@ class UCL_H_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported * \param stride Number of _elements_ between the start of each row **/ template @@ -261,7 +263,7 @@ class UCL_H_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view * - Viewing a device container on the host is not supported **/ @@ -273,7 +275,7 @@ class UCL_H_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. 
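// ---------------------------------------------------------------------------
// Editorial illustration (not part of the patch): wrapping an existing host
// array in a UCL_H_Vec view so it can be passed to the Geryon copy routines.
// The view never owns the memory; `buf` must outlive `h_view`.  Assumes a
// constructed UCL_Device named `gpu`; names are hypothetical.
// ---------------------------------------------------------------------------
inline void example_host_view(UCL_Device &gpu) {
  double buf[256];
  for (int i=0; i<256; i++) buf[i]=0.0;
  UCL_H_Vec<double> h_view;
  h_view.view(buf,256,gpu);             // 1 row, 256 columns, no allocation
  UCL_D_Vec<double> d_dst(256,gpu,UCL_READ_WRITE);
  ucl_copy(d_dst,h_view,256,false);     // blocking host-to-device copy
}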
* No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view * - Viewing a device container on the host is not supported **/ @@ -284,7 +286,7 @@ class UCL_H_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, @@ -294,7 +296,7 @@ class UCL_H_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported * \param stride Number of _elements_ between the start of each row **/ template @@ -305,7 +307,7 @@ class UCL_H_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template inline void view_offset(const size_t offset, ptr_type *input, @@ -314,7 +316,7 @@ class UCL_H_Vec : public UCL_BaseMat { /// Free memory and set size to 0 inline void clear() - { if (_kind!=UCL_VIEW) {_kind=UCL_VIEW; _cols=0; _host_free(*this,_kind);}} + { _host_free(*this); _kind=UCL_VIEW; _cols=0; } /// Resize the allocation to contain cols elements /** \note Cannot be used on views **/ @@ -401,7 +403,6 @@ class UCL_H_Vec : public UCL_BaseMat { #endif private: - enum UCL_MEMOPT _kind; numtyp *_array, *_end; size_t _row_bytes, _cols; diff --git a/lib/gpu/geryon/ucl_matrix.h b/lib/gpu/geryon/ucl_matrix.h index 803cd78b4c..301325b454 100644 --- a/lib/gpu/geryon/ucl_matrix.h +++ b/lib/gpu/geryon/ucl_matrix.h @@ -48,17 +48,18 @@ class UCL_Matrix { /// Construct with specied number of rows and columns /** \sa alloc() **/ UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc, - const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED, + const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); } /// Set up host matrix with specied # of rows/cols and reserve memory - /** The kind1 parameter controls memory pinning as follows: - * - UCL_NOT_PINNED - Memory is not pinned - * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) - * - UCL_RW_OPTIMIZED - Memory can be pinned - * The kind2 parameter controls memory optimizations as follows: + /** The kind1 parameter controls memory access from the host + * - UCL_READ_WRITE - Specify that you will read and write from host + * - UCL_WRITE_ONLY - Specify that you will only write from host + * - UCL_READ_ONLY - Specify that you will only read from host + * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host + * The kind2 parameter controls memory optimizations from the device: * - UCL_READ_WRITE - Specify 
that you will read and write in kernels * - UCL_WRITE_ONLY - Specify that you will only write in kernels * - UCL_READ_ONLY - Specify that you will only read in kernels @@ -69,24 +70,25 @@ class UCL_Matrix { * \return UCL_SUCCESS if the memory allocation is successful **/ template inline int alloc(const size_t rows, const size_t cols, mat_type &cq, - const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED, + const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { return _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); } /// Set up host matrix with specied # of rows/cols and reserve memory - /** The kind1 parameter controls memory pinning as follows: - * - UCL_NOT_PINNED - Memory is not pinned - * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) - * - UCL_RW_OPTIMIZED - Memory can be pinned - * The kind2 parameter controls memory optimizations as follows: + /** The kind1 parameter controls memory access from the host + * - UCL_READ_WRITE - Specify that you will read and write from host + * - UCL_WRITE_ONLY - Specify that you will only write from host + * - UCL_READ_ONLY - Specify that you will only read from host + * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host + * The kind2 parameter controls memory optimizations from the device: * - UCL_READ_WRITE - Specify that you will read and write in kernels * - UCL_WRITE_ONLY - Specify that you will only write in kernels * - UCL_READ_ONLY - Specify that you will only read in kernels * \param device Used to get the default command queue for operations * \return UCL_SUCCESS if the memory allocation is successful **/ inline int alloc(const size_t rows, const size_t cols, UCL_Device &acc, - const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED, + const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { return _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); } @@ -110,11 +112,22 @@ class UCL_Matrix { { if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols); else return UCL_SUCCESS; } - /// Set each element to zero - inline void zero() { host.zero(); device.zero(); } - - /// Set first n elements to zero - inline void zero(const int n) { host.zero(n); device.zero(n); } + /// Set each element to zero (asynchronously on device) + inline void zero() { zero(cq()); } + /// Set first n elements to zero (asynchronously on device) + inline void zero(const int n) { zero(n,cq()); } + /// Set each element to zero (asynchronously on device) + inline void zero(command_queue &cq) { + host.zero(); + if (device.kind()!=UCL_VIEW) device.zero(cq); + else if (_buffer.numel()>0) _buffer.zero(); + } + /// Set first n elements to zero (asynchronously on device) + inline void zero(const int n, command_queue &cq) { + host.zero(n); + if (device.kind()!=UCL_VIEW) device.zero(n,cq); + else if (_buffer.numel()>0) _buffer.zero(); + } /// Get the number of elements inline size_t numel() const { return host.numel(); } @@ -145,6 +158,8 @@ class UCL_Matrix { /// Return the default command queue/stream associated with this data inline command_queue & cq() { return host.cq(); } + /// Change the default command queue associated with this data + inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); } /// Block until command_queue associated with matrix is complete inline void sync() { host.sync(); } diff --git a/lib/gpu/geryon/ucl_s_obj_help.h b/lib/gpu/geryon/ucl_s_obj_help.h index 
ea772d3608..0b8c0251c1 100644 --- a/lib/gpu/geryon/ucl_s_obj_help.h +++ b/lib/gpu/geryon/ucl_s_obj_help.h @@ -32,14 +32,24 @@ template <> struct _ucl_s_obj_help<1> { const enum UCL_MEMOPT kind1, const enum UCL_MEMOPT kind2) { int e1; - e1=host.alloc(cols,acc,kind1); - if (e1!=UCL_SUCCESS) - return e1; if (acc.shared_memory()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 1S\n"; + #endif + e1=host.alloc(cols,acc,kind1,kind2); + if (e1!=UCL_SUCCESS) + return e1; device.view(host); return UCL_SUCCESS; - } else + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 1NS\n"; + #endif + e1=host.alloc(cols,acc,kind1); + if (e1!=UCL_SUCCESS) + return e1; return device.alloc(cols,acc,kind2); + } } template @@ -48,10 +58,24 @@ template <> struct _ucl_s_obj_help<1> { const enum UCL_MEMOPT kind1, const enum UCL_MEMOPT kind2) { int e1; - e1=host.alloc(cols,cq,kind1); - if (e1!=UCL_SUCCESS) - return e1; - return device.alloc(cols,cq,kind2); + if (cq.shared_mem_device()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 2S\n"; + #endif + e1=host.alloc(cols,cq,kind1,kind2); + if (e1!=UCL_SUCCESS) + return e1; + device.view(host); + return UCL_SUCCESS; + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 2NS\n"; + #endif + e1=host.alloc(cols,cq,kind1); + if (e1!=UCL_SUCCESS) + return e1; + return device.alloc(cols,cq,kind2); + } } template @@ -60,14 +84,24 @@ template <> struct _ucl_s_obj_help<1> { const enum UCL_MEMOPT kind1, const enum UCL_MEMOPT kind2) { int e1; - e1=host.alloc(rows,cols,acc,kind1); - if (e1!=UCL_SUCCESS) - return e1; if (acc.shared_memory()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 3S\n"; + #endif + e1=host.alloc(rows,cols,acc,kind1,kind2); + if (e1!=UCL_SUCCESS) + return e1; device.view(host); return UCL_SUCCESS; - } else + } else { + e1=host.alloc(rows,cols,acc,kind1); + if (e1!=UCL_SUCCESS) + return e1; + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 3NS\n"; + #endif return device.alloc(rows,cols,acc,kind2); + } } template @@ -76,10 +110,24 @@ template <> struct _ucl_s_obj_help<1> { const enum UCL_MEMOPT kind1, const enum UCL_MEMOPT kind2) { int e1; - e1=host.alloc(rows,cols,cq,kind1); - if (e1!=UCL_SUCCESS) - return e1; - return device.alloc(rows,cols,cq,kind2); + if (cq.shared_mem_device()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 4S\n"; + #endif + e1=host.alloc(rows,cols,cq,kind1,kind2); + if (e1!=UCL_SUCCESS) + return e1; + device.view(host); + return UCL_SUCCESS; + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 4NS\n"; + #endif + e1=host.alloc(rows,cols,cq,kind1); + if (e1!=UCL_SUCCESS) + return e1; + return device.alloc(rows,cols,cq,kind2); + } } template @@ -121,8 +169,15 @@ template <> struct _ucl_s_obj_help<1> { if (device.kind()==UCL_VIEW) { device.view(host); return UCL_SUCCESS; - } else + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 5S\n"; + #endif + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 5NS\n"; + #endif return device.resize(cols); + } } template @@ -130,9 +185,16 @@ template <> struct _ucl_s_obj_help<1> { const int cols) { if (device.kind()==UCL_VIEW) { device.view(host); + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 6S\n"; + #endif return UCL_SUCCESS; - } else + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 6NS\n"; + #endif return device.resize(rows,cols); + } } }; @@ -145,17 +207,27 @@ template struct _ucl_s_obj_help { const enum UCL_MEMOPT kind2) { int e1; e1=host.alloc(cols,acc,UCL_NOT_PINNED); - if (e1!=UCL_SUCCESS) - return e1; - 
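// ---------------------------------------------------------------------------
// Editorial illustration (not part of the patch): what the shared-memory
// branches above mean for the paired host/device containers.  On an
// accelerator that shares memory with the host, the device member becomes a
// view of the host (or staging) allocation instead of a separate buffer, so
// no transfer is needed.  Assumes a constructed UCL_Device named `gpu` and
// that UCL_Vector is parameterized as <host_type,device_type>; names are
// hypothetical.
// ---------------------------------------------------------------------------
inline void example_paired_vector(UCL_Device &gpu) {
  UCL_Vector<double,double> v;
  // kind1 = host-access hint, kind2 = device-access hint
  v.alloc(256,gpu,UCL_READ_WRITE,UCL_READ_ONLY);
  // On a shared-memory device the branch above leaves v.device as a view of
  // the host allocation (v.device.kind()==UCL_VIEW).
  v.zero();        // zeros host; device side only if it is a separate buffer
  v.cq(gpu.cq());  // reset the default command queue for both members
}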
e1=_buffer.alloc(cols,acc,kind1); if (e1!=UCL_SUCCESS) return e1; if (acc.shared_memory()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 7S\n"; + #endif + e1=_buffer.alloc(cols,acc,kind1,kind2); + if (e1!=UCL_SUCCESS) + return e1; device.view(_buffer); return UCL_SUCCESS; - } else + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 7NS\n"; + #endif + e1=_buffer.alloc(cols,acc,kind1); + if (e1!=UCL_SUCCESS) + return e1; return device.alloc(cols,acc,kind2); + } } template @@ -167,10 +239,24 @@ template struct _ucl_s_obj_help { e1=host.alloc(cols,cq,UCL_NOT_PINNED); if (e1!=UCL_SUCCESS) return e1; - e1=_buffer.alloc(cols,cq,kind1); - if (e1!=UCL_SUCCESS) - return e1; - return device.alloc(cols,cq,kind2); + if (cq.shared_mem_device()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 8S\n"; + #endif + e1=_buffer.alloc(cols,cq,kind1,kind2); + if (e1!=UCL_SUCCESS) + return e1; + device.view(_buffer); + return UCL_SUCCESS; + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 8NS\n"; + #endif + e1=_buffer.alloc(cols,cq,kind1); + if (e1!=UCL_SUCCESS) + return e1; + return device.alloc(cols,cq,kind2); + } } template @@ -180,17 +266,27 @@ template struct _ucl_s_obj_help { const enum UCL_MEMOPT kind2) { int e1; e1=host.alloc(rows,cols,acc,UCL_NOT_PINNED); - if (e1!=UCL_SUCCESS) - return e1; - e1=_buffer.alloc(rows,cols,acc,kind1); if (e1!=UCL_SUCCESS) return e1; if (acc.shared_memory()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 9S\n"; + #endif + e1=_buffer.alloc(rows,cols,acc,kind1,kind2); + if (e1!=UCL_SUCCESS) + return e1; device.view(_buffer); return UCL_SUCCESS; - } else + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 9NS\n"; + #endif + e1=_buffer.alloc(rows,cols,acc,kind1); + if (e1!=UCL_SUCCESS) + return e1; return device.alloc(rows,cols,acc,kind2); + } } template @@ -202,10 +298,24 @@ template struct _ucl_s_obj_help { e1=host.alloc(rows,cols,cq,UCL_NOT_PINNED); if (e1!=UCL_SUCCESS) return e1; - e1=_buffer.alloc(rows,cols,cq,kind1); - if (e1!=UCL_SUCCESS) - return e1; - return device.alloc(rows,cols,cq,kind2); + if (cq.shared_mem_device()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 10S\n"; + #endif + e1=_buffer.alloc(rows,cols,cq,kind1,kind2); + if (e1!=UCL_SUCCESS) + return e1; + device.view(_buffer); + return UCL_SUCCESS; + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 10NS\n"; + #endif + e1=_buffer.alloc(rows,cols,cq,kind1); + if (e1!=UCL_SUCCESS) + return e1; + return device.alloc(rows,cols,cq,kind2); + } } template @@ -250,9 +360,16 @@ template struct _ucl_s_obj_help { if (device.kind()==UCL_VIEW) { device.view(buff); + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 11S\n"; + #endif return UCL_SUCCESS; - } else + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 11NS\n"; + #endif return device.resize(cols); + } } template @@ -264,9 +381,17 @@ template struct _ucl_s_obj_help { if (device.kind()==UCL_VIEW) { device.view(buff); + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 12S\n"; + #endif return UCL_SUCCESS; - } else + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 12NS\n"; + #endif return device.resize(rows,cols); + } } }; + diff --git a/lib/gpu/geryon/ucl_types.h b/lib/gpu/geryon/ucl_types.h index 615bffea95..46be4bcb06 100644 --- a/lib/gpu/geryon/ucl_types.h +++ b/lib/gpu/geryon/ucl_types.h @@ -92,10 +92,9 @@ enum UCL_MEMOPT { UCL_WRITE_ONLY, ///< Allow any optimizations for memory that is write only UCL_READ_ONLY, ///< Allow any optimizations for memory that 
is read only UCL_READ_WRITE, ///< Allow read and write - UCL_WRITE_OPTIMIZED,///< Allow host memory to be pinned (write combined) - UCL_RW_OPTIMIZED, ///< Allow host memory to be pinned UCL_NOT_PINNED, ///< Host memory is not to be pinned - UCL_VIEW ///< View of another memory allocation + UCL_VIEW, ///< View of another memory allocation + UCL_NOT_SPECIFIED }; enum UCL_DEVICE_TYPE { diff --git a/lib/gpu/geryon/ucl_vector.h b/lib/gpu/geryon/ucl_vector.h index 01346127c2..89f1528969 100644 --- a/lib/gpu/geryon/ucl_vector.h +++ b/lib/gpu/geryon/ucl_vector.h @@ -48,17 +48,18 @@ class UCL_Vector { /// Construct with n columns /** \sa alloc() **/ UCL_Vector(const size_t cols, UCL_Device &acc, - const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED, + const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,cols,acc,kind1,kind2); } /// Set up the vector with 'cols' columns and reserve memory - /** The kind1 parameter controls memory pinning as follows: - * - UCL_NOT_PINNED - Memory is not pinned - * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) - * - UCL_RW_OPTIMIZED - Memory can be pinned - * The kind2 parameter controls memory optimizations as follows: + /** The kind1 parameter controls memory access from the host + * - UCL_READ_WRITE - Specify that you will read and write from host + * - UCL_WRITE_ONLY - Specify that you will only write from host + * - UCL_READ_ONLY - Specify that you will only read from host + * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host + * The kind2 parameter controls memory optimizations from the device: * - UCL_READ_WRITE - Specify that you will read and write in kernels * - UCL_WRITE_ONLY - Specify that you will only write in kernels * - UCL_READ_ONLY - Specify that you will only read in kernels @@ -69,24 +70,25 @@ class UCL_Vector { * \return UCL_SUCCESS if the memory allocation is successful **/ template inline int alloc(const size_t cols, mat_type &cq, - const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED, + const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { return _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,cols,cq,kind1,kind2); } /// Set up host vector with 'cols' columns and reserve memory - /** The kind1 parameter controls memory pinning as follows: - * - UCL_NOT_PINNED - Memory is not pinned - * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) - * - UCL_RW_OPTIMIZED - Memory can be pinned - * The kind2 parameter controls memory optimizations as follows: + /** The kind1 parameter controls memory access from the host + * - UCL_READ_WRITE - Specify that you will read and write from host + * - UCL_WRITE_ONLY - Specify that you will only write from host + * - UCL_READ_ONLY - Specify that you will only read from host + * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host + * The kind2 parameter controls memory optimizations from the device: * - UCL_READ_WRITE - Specify that you will read and write in kernels * - UCL_WRITE_ONLY - Specify that you will only write in kernels * - UCL_READ_ONLY - Specify that you will only read in kernels * \param device Used to get the default command queue for operations * \return UCL_SUCCESS if the memory allocation is successful **/ inline int alloc(const size_t cols, UCL_Device &acc, - const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED, + const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { return 
_ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,cols,acc,kind1,kind2); } @@ -109,11 +111,22 @@ class UCL_Vector { inline int resize_ib(const int new_cols) { if (new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; } - /// Set each element to zero - inline void zero() { host.zero(); device.zero(); } - - /// Set first n elements to zero - inline void zero(const int n) { host.zero(n); device.zero(n); } + /// Set each element to zero (asynchronously on device) + inline void zero() { zero(cq()); } + /// Set first n elements to zero (asynchronously on device) + inline void zero(const int n) { zero(n,cq()); } + /// Set each element to zero (asynchronously on device) + inline void zero(command_queue &cq) { + host.zero(); + if (device.kind()!=UCL_VIEW) device.zero(cq); + else if (_buffer.numel()>0) _buffer.zero(); + } + /// Set first n elements to zero (asynchronously on device) + inline void zero(const int n, command_queue &cq) { + host.zero(n); + if (device.kind()!=UCL_VIEW) device.zero(n,cq); + else if (_buffer.numel()>0) _buffer.zero(); + } /// Get the number of elements inline size_t numel() const { return host.numel(); } @@ -145,6 +158,8 @@ class UCL_Vector { /// Return the default command queue/stream associated with this data inline command_queue & cq() { return host.cq(); } + /// Change the default command queue associated with this data + inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); } /// Block until command_queue associated with matrix is complete inline void sync() { host.sync(); } diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp index 6f42790ca3..7dbd875cd1 100644 --- a/lib/gpu/lal_answer.cpp +++ b/lib/gpu/lal_answer.cpp @@ -44,10 +44,10 @@ bool AnswerT::alloc(const int inum) { _ans_fields+=4; // --------------------------- Device allocations - success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_RW_OPTIMIZED, - UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_RW_OPTIMIZED, - UCL_WRITE_ONLY)==UCL_SUCCESS); + success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY, + UCL_READ_WRITE)==UCL_SUCCESS); + success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_READ_ONLY, + UCL_READ_WRITE)==UCL_SUCCESS); _gpu_bytes=engv.device.row_bytes()+force.device.row_bytes(); _allocated=true; @@ -175,78 +175,42 @@ double AnswerT::energy_virial(double *eatom, double **vatom, return 0.0; double evdwl=0.0; - double virial_acc[6]; - for (int i=0; i<6; i++) virial_acc[i]=0.0; - if (_ilist==NULL) { - for (int i=0; i<_inum; i++) { - int al=i; - if (_eflag) { - if (_ef_atom) { - evdwl+=engv[al]; - eatom[i]+=engv[al]*0.5; - al+=_inum; - } else { - evdwl+=engv[al]; - al+=_inum; - } - } - if (_vflag) { - if (_vf_atom) { - for (int j=0; j<6; j++) { - vatom[i][j]+=engv[al]*0.5; - virial_acc[j]+=engv[al]; - al+=_inum; - } - } else { - for (int j=0; j<6; j++) { - virial_acc[j]+=engv[al]; - al+=_inum; - } - } - } + int vstart=0; + if (_eflag) { + for (int i=0; i<_inum; i++) + evdwl+=engv[i]; + if (_ef_atom) + if (_ilist==NULL) + for (int i=0; i<_inum; i++) + eatom[i]+=engv[i]; + else + for (int i=0; i<_inum; i++) + eatom[_ilist[i]]+=engv[i]; + vstart=_inum; + } + if (_vflag) { + int iend=vstart+_inum; + for (int j=0; j<6; j++) { + for (int i=vstart; i double AnswerT::energy_virial(double *eatom, double **vatom, - double *virial, double &ecoul) { + double *virial, double &ecoul) { if (_eflag==false && _vflag==false) return 0.0; @@ -254,84 +218,43 @@ double 
AnswerT::energy_virial(double *eatom, double **vatom, return energy_virial(eatom,vatom,virial); double evdwl=0.0; - double _ecoul=0.0; - double virial_acc[6]; - for (int i=0; i<6; i++) virial_acc[i]=0.0; - if (_ilist==NULL) { - for (int i=0; i<_inum; i++) { - int al=i; - if (_eflag) { - if (_ef_atom) { - evdwl+=engv[al]; - eatom[i]+=engv[al]*0.5; - al+=_inum; - _ecoul+=engv[al]; - eatom[i]+=engv[al]*0.5; - al+=_inum; - } else { - evdwl+=engv[al]; - al+=_inum; - _ecoul+=engv[al]; - al+=_inum; - } - } - if (_vflag) { - if (_vf_atom) { - for (int j=0; j<6; j++) { - vatom[i][j]+=engv[al]*0.5; - virial_acc[j]+=engv[al]; - al+=_inum; - } - } else { - for (int j=0; j<6; j++) { - virial_acc[j]+=engv[al]; - al+=_inum; - } - } + int vstart=0, iend=_inum*2; + if (_eflag) { + for (int i=0; i<_inum; i++) + evdwl+=engv[i]; + for (int i=_inum; i +void AnswerT::cq(const int cq_index) { + engv.cq(dev->cq(cq_index)); + force.cq(dev->cq(cq_index)); + time_answer.clear(); + time_answer.init(*dev,dev->cq(cq_index)); + time_answer.zero(); +} + template class Answer; + diff --git a/lib/gpu/lal_answer.h b/lib/gpu/lal_answer.h index c642781c07..149e8e9705 100644 --- a/lib/gpu/lal_answer.h +++ b/lib/gpu/lal_answer.h @@ -47,6 +47,8 @@ class Answer { inline int inum() const { return _inum; } /// Set number of local atoms for future copy operations inline void inum(const int n) { _inum=n; } + /// Return the maximum number of atoms that can be stored currently + inline int max_inum() const { return _max_local; } /// Memory usage per atom in this class int bytes_per_atom() const; @@ -132,6 +134,9 @@ class Answer { /// Return the time the CPU was idle waiting for GPU inline double cpu_idle_time() { return _time_cpu_idle; } + /// Change the command queue used for copies and timers + void cq(const int cq_index); + // ------------------------------ DATA ---------------------------------- /// Force and possibly torque diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index 5cf46c8751..92cda1239e 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -70,44 +70,47 @@ bool AtomT::alloc(const int nall) { // --------------------------- Device allocations int gpu_bytes=0; - success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED, + success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY, UCL_READ_ONLY)==UCL_SUCCESS); #ifdef GPU_CAST - success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)== - UCL_SUCCESS); - success=success && (type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)== - UCL_SUCCESS); + success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); + success=success && (type_cast.alloc(_max_atoms,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=x_cast.device.row_bytes()+type_cast.device.row_bytes(); #endif if (_charge && _host_view==false) { - success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED, + success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_ONLY, UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=q.device.row_bytes(); } if (_rot && _host_view==false) { - success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED, + success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY, UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=quat.device.row_bytes(); } if (_gpu_nbor>0) { if (_bonds) { - success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_tag.alloc(_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=dev_tag.row_bytes(); } if (_gpu_nbor==1) { success=success && 
(dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); gpu_bytes+=dev_cell_id.row_bytes(); } else { - success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && (host_particle_id.alloc(_max_atoms,*dev, + UCL_WRITE_ONLY)==UCL_SUCCESS); success=success && (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); } if (_gpu_nbor==2 && _host_view) dev_particle_id.view(host_particle_id); else - success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_particle_id.alloc(_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=dev_particle_id.row_bytes(); } @@ -130,7 +133,7 @@ bool AtomT::add_fields(const bool charge, const bool rot, _charge=true; _other=true; if (_host_view==false) { - success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED, + success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_ONLY, UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=q.device.row_bytes(); } @@ -140,7 +143,7 @@ bool AtomT::add_fields(const bool charge, const bool rot, _rot=true; _other=true; if (_host_view==false) { - success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED, + success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY, UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=quat.device.row_bytes(); } @@ -149,7 +152,8 @@ bool AtomT::add_fields(const bool charge, const bool rot, if (bonds && _bonds==false) { _bonds=true; if (_bonds && _gpu_nbor>0) { - success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_tag.alloc(_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=dev_tag.row_bytes(); } } @@ -163,17 +167,20 @@ bool AtomT::add_fields(const bool charge, const bool rot, return false; } #endif - success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_particle_id.alloc(_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=dev_particle_id.row_bytes(); if (_bonds) { - success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_tag.alloc(_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=dev_tag.row_bytes(); } if (_gpu_nbor==1) { success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); gpu_bytes+=dev_cell_id.row_bytes(); } else { - success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && (host_particle_id.alloc(_max_atoms,*dev, + UCL_WRITE_ONLY)==UCL_SUCCESS); success=success && (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); } diff --git a/lib/gpu/lal_aux_fun1.h b/lib/gpu/lal_aux_fun1.h index 92a03eb7d4..2c57e1f6a5 100644 --- a/lib/gpu/lal_aux_fun1.h +++ b/lib/gpu/lal_aux_fun1.h @@ -77,12 +77,12 @@ if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv=energy; \ + *engv=energy*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -125,14 +125,14 @@ if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv=energy; \ + *engv=energy*(acctyp)0.5; \ engv+=inum; \ - *engv=e_coul; \ + *engv=e_coul*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -160,12 +160,12 @@ if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv=energy; \ + *engv=energy*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -192,14 +192,14 
@@ if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv=energy; \ + *engv=energy*(acctyp)0.5; \ engv+=inum; \ - *engv=e_coul; \ + *engv=e_coul*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp index 6597dbfe98..b24f7e9661 100644 --- a/lib/gpu/lal_base_atomic.cpp +++ b/lib/gpu/lal_base_atomic.cpp @@ -272,12 +272,8 @@ void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str, return; std::string s_fast=std::string(kname)+"_fast"; - std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ - std::string(OCL_PRECISION_COMPILE)+" -D"+ - std::string(OCL_VENDOR); - pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,flags.c_str()); + pair_program->load_string(pair_str,device->compile_string().c_str()); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index f61950cfee..8e06a4d18c 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -288,12 +288,8 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str, return; std::string s_fast=std::string(kname)+"_fast"; - std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ - std::string(OCL_PRECISION_COMPILE)+" -D"+ - std::string(OCL_VENDOR); - pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,flags.c_str()); + pair_program->load_string(pair_str,device->compile_string().c_str()); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 8c793f554e..ed3d720d2c 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -296,12 +296,8 @@ void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str, return; std::string s_fast=std::string(kname)+"_fast"; - std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ - std::string(OCL_PRECISION_COMPILE)+" -D"+ - std::string(OCL_VENDOR); - pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,flags.c_str()); + pair_program->load_string(pair_str,device->compile_string().c_str()); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index fca3151b82..641087a6c4 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -455,9 +455,7 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev, std::string s_lj=kns+"_lj"; std::string s_lj_fast=kns+"_lj_fast"; - std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ - std::string(OCL_PRECISION_COMPILE)+" -D"+ - std::string(OCL_VENDOR); + std::string flags=device->compile_string(); nbor_program=new UCL_Program(dev); nbor_program->load_string(ellipsoid_nbor,flags.c_str()); diff --git a/lib/gpu/lal_born.cpp b/lib/gpu/lal_born.cpp index 1665717c2d..719b1845d8 100644 --- a/lib/gpu/lal_born.cpp +++ b/lib/gpu/lal_born.cpp @@ -69,7 +69,7 @@ int BornT::init(const int ntypes, double **host_cutsq, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + 
UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(cmm_types*cmm_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(h_size*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i dview_form(lj_types*lj_types,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; iucl_device),UCL_READ_ONLY); diff --git a/lib/gpu/lal_colloid.cu b/lib/gpu/lal_colloid.cu index e918feca8b..a8f0cf63ca 100644 --- a/lib/gpu/lal_colloid.cu +++ b/lib/gpu/lal_colloid.cu @@ -56,7 +56,8 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, if (ii host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i q_tex; if (eflag>0) { \ *ap1=(acctyp)0; \ ap1+=inum; \ - *ap1=e_coul; \ + *ap1=e_coul*(acctyp)0.5; \ ap1+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *ap1=virial[i]; \ + *ap1=virial[i]*(acctyp)0.5; \ ap1+=inum; \ } \ } \ @@ -109,12 +109,12 @@ texture q_tex; if (eflag>0) { \ *ap1=(acctyp)0; \ ap1+=inum; \ - *ap1=e_coul; \ + *ap1=e_coul*(acctyp)0.5; \ ap1+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *ap1=virial[i]; \ + *ap1=virial[i]*(acctyp)0.5; \ ap1+=inum; \ } \ } \ @@ -155,7 +155,8 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_, if (ii int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, const int last_gpu, const int gpu_mode, const double p_split, const int nthreads, - const int t_per_atom, const double cell_size) { + const int t_per_atom, const double cell_size, + char *ocl_vendor) { _nthreads=nthreads; #ifdef _OPENMP omp_set_num_threads(nthreads); @@ -140,6 +141,9 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, _long_range_precompute=0; + if (set_ocl_params(ocl_vendor)!=0) + return -11; + int flag=0; for (int i=0; i<_procs_per_gpu; i++) { if (_gpu_rank==i) @@ -149,6 +153,64 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, return flag; } +template +int DeviceT::set_ocl_params(char *ocl_vendor) { + #ifdef USE_OPENCL + std::string s_vendor=OCL_DEFAULT_VENDOR; + if (ocl_vendor!=NULL) + s_vendor=ocl_vendor; + if (s_vendor=="none") + s_vendor="generic"; + + if (s_vendor=="kepler") { + _ocl_vendor_name="NVIDIA Kepler"; + #if defined (__APPLE__) || defined(MACOSX) + _ocl_vendor_string="-DKEPLER_OCL -DNO_OCL_PTX"; + #else + _ocl_vendor_string="-DKEPLER_OCL"; + #endif + } else if (s_vendor=="fermi") { + _ocl_vendor_name="NVIDIA Fermi"; + _ocl_vendor_string="-DFERMI_OCL"; + } else if (s_vendor=="cypress") { + _ocl_vendor_name="AMD Cypress"; + 
_ocl_vendor_string="-DCYPRESS_OCL"; + } else if (s_vendor=="generic") { + _ocl_vendor_name="GENERIC"; + _ocl_vendor_string="-DGENERIC_OCL"; + } else { + _ocl_vendor_name="CUSTOM"; + _ocl_vendor_string="-DUSE_OPENCL"; + int token_count=0; + std::string params[13]; + char *pch = strtok(ocl_vendor,"\" "); + while (pch != NULL) { + if (token_count==13) + return -11; + params[token_count]=pch; + token_count++; + pch = strtok(NULL,"\" "); + } + _ocl_vendor_string+=" -DMEM_THREADS="+params[0]+ + " -DTHREADS_PER_ATOM="+params[1]+ + " -DTHREADS_PER_CHARGE="+params[2]+ + " -DBLOCK_PAIR="+params[3]+ + " -DMAX_SHARED_TYPES="+params[4]+ + " -DBLOCK_NBOR_BUILD="+params[5]+ + " -DBLOCK_BIO_PAIR="+params[6]+ + " -DBLOCK_ELLIPSE="+params[7]+ + " -DWARP_SIZE="+params[8]+ + " -DPPPM_BLOCK_1D="+params[9]+ + " -DBLOCK_CELL_2D="+params[10]+ + " -DBLOCK_CELL_ID="+params[11]+ + " -DMAX_BIO_SHARED_TYPES="+params[12]; + } + _ocl_compile_string="-cl-fast-relaxed-math -cl-mad-enable "+ + std::string(OCL_PRECISION_COMPILE)+" "+_ocl_vendor_string; + #endif + return 0; +} + template int DeviceT::init(Answer &ans, const bool charge, const bool rot, const int nlocal, @@ -206,7 +268,7 @@ int DeviceT::init(Answer &ans, const bool charge, if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, _block_cell_id, _block_nbor_build, threads_per_atom, - _warp_size, _time_device)) + _warp_size, _time_device, compile_string())) return -3; if (_cell_size<0.0) nbor->cell_size(cell_size,cell_size); @@ -274,7 +336,8 @@ void DeviceT::init_message(FILE *screen, const char *name, fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads); #endif #ifdef USE_OPENCL - fprintf(screen,"- with OpenCL Parameters for: %s\n",OCL_VENDOR); + fprintf(screen,"- with OpenCL Parameters for: %s\n", + _ocl_vendor_name.c_str()); #endif fprintf(screen,"-------------------------------------"); fprintf(screen,"-------------------------------------\n"); @@ -571,9 +634,8 @@ int DeviceT::compile_kernels() { if (_compiled) return flag; - std::string flags="-cl-mad-enable -D"+std::string(OCL_VENDOR); dev_program=new UCL_Program(*gpu); - int success=dev_program->load_string(device,flags.c_str()); + int success=dev_program->load_string(device,compile_string().c_str()); if (success!=UCL_SUCCESS) return -4; k_zero.set_function(*dev_program,"kernel_zero"); @@ -640,10 +702,11 @@ Device global_device; int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads, - const int t_per_atom, const double cell_size) { + const int t_per_atom, const double cell_size, + char *opencl_vendor) { return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, particle_split,nthreads,t_per_atom, - cell_size); + cell_size,opencl_vendor); } void lmp_clear_device() { @@ -654,3 +717,4 @@ double lmp_gpu_forces(double **f, double **tor, double *eatom, double **vatom, double *virial, double &ecoul) { return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul); } + diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index 62b0d862bd..d7bc70edaf 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -45,11 +45,13 @@ class Device { * - -2 if GPU not found * - -4 if GPU library not compiled for GPU * - -6 if GPU could not be initialized for use - * - -7 if accelerator sharing is not currently allowed on system **/ + * - -7 if accelerator sharing is not currently allowed on system + * - -11 if 
vendor_string has the wrong number of parameters **/ int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, - const double particle_split, const int nthreads, - const int t_per_atom, const double cell_size); + const int last_gpu, const int gpu_mode, + const double particle_split, const int nthreads, + const int t_per_atom, const double cell_size, + char *vendor_string); /// Initialize the device for Atom and Neighbor storage /** \param rot True if quaternions need to be stored @@ -234,6 +236,8 @@ class Device { inline int max_bio_shared_types() const { return _max_bio_shared_types; } /// Architecture gpu code compiled for (returns 0 for OpenCL) inline double ptx_arch() const { return _ptx_arch; } + /// Number of threads executing concurrently on same multiproc + inline int warp_size() const { return _warp_size; } // -------------------- SHARED DEVICE ROUTINES -------------------- // Perform asynchronous zero of integer array @@ -278,6 +282,8 @@ class Device { pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge, boxlo,prd); } + + inline std::string compile_string() { return _ocl_compile_string; } private: std::queue *> ans_queue; @@ -305,6 +311,9 @@ class Device { int _data_in_estimate, _data_out_estimate; + std::string _ocl_vendor_name, _ocl_vendor_string, _ocl_compile_string; + int set_ocl_params(char *); + template inline std::string toa(const t& in) { std::ostringstream o; diff --git a/lib/gpu/lal_dipole_lj.cpp b/lib/gpu/lal_dipole_lj.cpp index 41ec84648d..e96e15eaf9 100644 --- a/lib/gpu/lal_dipole_lj.cpp +++ b/lib/gpu/lal_dipole_lj.cpp @@ -72,7 +72,7 @@ int DipoleLJT::init(const int ntypes, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i mu_tex; if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv=energy; \ + *engv=energy*(acctyp)0.5; \ engv+=inum; \ - *engv=e_coul; \ + *engv=e_coul*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -115,14 +115,14 @@ texture mu_tex; if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv=energy; \ + *engv=energy*(acctyp)0.5; \ engv+=inum; \ - *engv=e_coul; \ + *engv=e_coul*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -174,7 +174,8 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, if (ii host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i mu_tex; if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv=energy; \ + *engv=energy*(acctyp)0.5; \ engv+=inum; \ - *engv=e_coul; \ + *engv=e_coul*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -116,14 +116,14 @@ texture mu_tex; if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv=energy; \ + *engv=energy*(acctyp)0.5; \ engv+=inum; \ - *engv=e_coul; \ + *engv=e_coul*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -175,7 +175,8 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, if (ii(static_cast(ef_nall)*1.10); - 
_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_RW_OPTIMIZED,UCL_WRITE_ONLY); + _fp.alloc(_max_fp_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); k_energy.set_function(*(this->pair_program),"k_energy"); k_energy_fast.set_function(*(this->pair_program),"k_energy_fast"); @@ -106,7 +106,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, _nr=nr; UCL_H_Vec dview_type(lj_types*lj_types,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i dview_type2frho(lj_types,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); type2frho.alloc(lj_types,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i dview_frho_spline(nfrho*(nrho+1),*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int ix=0; ix dview_rhor_spline(nrhor*(nr+1),*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int ix=0; ix dview_z2r_spline(nz2r*(nr+1),*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int ix=0; ix z2r_sp2_tex; fetch4(coeff,index,frho_sp2_tex); \ energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \ if (rho > rhomax) energy += fp*(rho-rhomax); \ - engv[ii]=(acctyp)2.0*energy; \ + engv[ii]=energy; \ } \ } @@ -116,12 +116,12 @@ texture z2r_sp2_tex; } \ if (offset==0) { \ if (eflag>0) { \ - engv[ii]+=energy; \ + engv[ii]+=energy*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - engv[ii]=virial[i]; \ + engv[ii]=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -150,7 +150,7 @@ texture z2r_sp2_tex; fetch4(coeff,index,frho_sp2_tex); \ energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \ if (rho > rhomax) energy += fp*(rho-rhomax); \ - engv[ii]=(acctyp)2.0*energy; \ + engv[ii]=energy; \ } \ } @@ -173,12 +173,12 @@ texture z2r_sp2_tex; if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv+=energy; \ + *engv+=energy*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -210,7 +210,8 @@ __kernel void k_energy(const __global numtyp4 *restrict x_, if (ii pos_tex, quat_tex; if (offset==0) { \ __global acctyp *ap1=engv+ii; \ if (eflag>0) { \ - *ap1=energy; \ + *ap1=energy*(acctyp)0.5; \ ap1+=astride; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *ap1=virial[i]; \ + *ap1=virial[i]*(acctyp)0.5; \ ap1+=astride; \ } \ } \ @@ -130,12 +130,12 @@ texture pos_tex, quat_tex; if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv+=energy; \ + *engv+=energy*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv+=virial[i]; \ + *engv+=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -170,12 +170,12 @@ texture pos_tex, quat_tex; if (offset==0) { \ __global acctyp *ap1=engv+ii; \ if (eflag>0) { \ - *ap1=energy; \ + *ap1=energy*(acctyp)0.5; \ ap1+=astride; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *ap1=virial[i]; \ + *ap1=virial[i]*(acctyp)0.5; \ ap1+=astride; \ } \ } \ @@ -202,12 +202,12 @@ texture pos_tex, quat_tex; if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv+=energy; \ + *engv+=energy*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv+=virial[i]; \ + *engv+=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ diff --git a/lib/gpu/lal_gauss.cpp b/lib/gpu/lal_gauss.cpp index d05573f751..9e296e6872 100644 --- a/lib/gpu/lal_gauss.cpp +++ b/lib/gpu/lal_gauss.cpp @@ -68,7 +68,7 @@ int GaussT::init(const int ntypes, // Allocate a host write buffer for data initialization UCL_H_Vec 
host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; iatom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, host_offset); - dev_error.alloc(1,*(this->ucl_device)); + dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY); dev_error.zero(); // Allocate, cast and asynchronous memcpy of constant data @@ -258,6 +258,9 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { &ainum, &this->_threads_per_atom); this->time_ellipsoid2.stop(); } else { + GX=static_cast(ceil(static_cast(this->ans->inum()- + this->_last_ellipse)/ + (BX/this->_threads_per_atom))); this->ans->force.zero(); this->ans->engv.zero(); this->time_nbor1.stop(); diff --git a/lib/gpu/lal_gayberne.cu b/lib/gpu/lal_gayberne.cu index da236fa79a..74072fc673 100644 --- a/lib/gpu/lal_gayberne.cu +++ b/lib/gpu/lal_gayberne.cu @@ -120,7 +120,8 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, if (ii host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(types*types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; icompile_kernels(devi,gpu_nbor); + _shared->compile_kernels(devi,gpu_nbor,compile_flags); return success; } @@ -114,7 +115,7 @@ void Neighbor::alloc(bool &success) { success=success && (dev_nbor.alloc(3*_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); success=success && (host_acc.alloc(nt*2,*dev, - UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); + UCL_READ_WRITE)==UCL_SUCCESS); _c_bytes=dev_nbor.row_bytes(); if (_alloc_packed) { @@ -129,10 +130,10 @@ void Neighbor::alloc(bool &success) { host_ilist.clear(); host_jlist.clear(); - success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_RW_OPTIMIZED, - UCL_WRITE_ONLY)==UCL_SUCCESS) && success; + success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_READ_WRITE, + UCL_READ_WRITE)==UCL_SUCCESS) && success; success=success && (dev_numj_host.alloc(_max_host,*dev, - UCL_WRITE_ONLY)==UCL_SUCCESS); + UCL_READ_WRITE)==UCL_SUCCESS); success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); if (!success) return; @@ -161,7 +162,7 @@ void Neighbor::alloc(bool &success) { success=success && (dev_nspecial.alloc(3*at,*dev, UCL_READ_ONLY)==UCL_SUCCESS); success=success && (dev_special.alloc(_maxspecial*at,*dev, - UCL_READ_ONLY)==UCL_SUCCESS); + UCL_READ_WRITE)==UCL_SUCCESS); success=success && (dev_special_t.alloc(_maxspecial*at,*dev, UCL_READ_ONLY)==UCL_SUCCESS); _gpu_bytes+=dev_nspecial.row_bytes()+dev_special.row_bytes()+ @@ -178,11 +179,9 @@ void Neighbor::clear() { _bin_time=0.0; if (_ncells>0) { _ncells=0; - 
dev_cell_counts.clear(); - if (_gpu_nbor==2) { - host_cell_counts.clear(); + cell_counts.clear(); + if (_gpu_nbor==2) delete [] cell_iter; - } } if (_allocated) { _allocated=false; @@ -286,6 +285,80 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj, } } +// This is the same as get host, but the requirement that ilist[i]=i and +// inum=nlocal is forced to be true to allow direct indexing of neighbors of +// neighbors +void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj, + int **firstneigh, const int block_size) { + _nbor_time_avail=true; + time_nbor.start(); + + UCL_H_Vec ilist_view; + ilist_view.view(ilist,inum,*dev); + ucl_copy(dev_nbor,ilist_view,false); + + UCL_D_Vec nbor_offset; + UCL_H_Vec host_offset; + + int copy_count=0; + int ij_count=0; + int acc_count=0; + int dev_count=0; + int *h_ptr=host_packed.begin(); + _nbor_pitch=inum; + + if (nlist!=inum) + host_acc.zero(inum); + + for (int ii=0; ii acc_view; + acc_view.view_offset(inum,dev_nbor,inum*2); + ucl_copy(acc_view,host_acc,true); + time_nbor.stop(); + + if (_use_packing==false) { + time_kernel.start(); + int GX=static_cast(ceil(static_cast(inum)*_threads_per_atom/ + block_size)); + _shared->k_nbor.set_size(GX,block_size); + _shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom); + time_kernel.stop(); + } +} + template void Neighbor::resize_max_neighbors(const int maxn, bool &success) { if (maxn>_max_nbors) { @@ -330,24 +403,20 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, ncellz = static_cast(ceil((subhi[2]-sublo[2])/_cell_size))+ghost_cells; ncell_3d = ncellx * ncelly * ncellz; if (ncell_3d+1>_ncells) { + cell_counts.clear(); + if (_gpu_nbor==2) { - if (_ncells>0) { - host_cell_counts.clear(); + if (_ncells>0) delete [] cell_iter; - } cell_iter = new int[ncell_3d+1]; - host_cell_counts.alloc(ncell_3d+1,dev_nbor); - } - - if (_gpu_nbor==2 && atom.host_view()) - dev_cell_counts.view(host_cell_counts); - else { - dev_cell_counts.clear(); - dev_cell_counts.alloc(ncell_3d+1,dev_nbor); + cell_counts.alloc(ncell_3d+1,dev_nbor,UCL_READ_WRITE,UCL_READ_ONLY); + } else { + cell_counts.device.clear(); + cell_counts.device.alloc(ncell_3d+1,dev_nbor); } _ncells=ncell_3d+1; - _cell_bytes=dev_cell_counts.row_bytes(); + _cell_bytes=cell_counts.device.row_bytes(); } const numtyp cutoff_cast=static_cast(_cutoff); @@ -381,7 +450,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, int *particle_id=atom.host_particle_id.begin(); // Build cell list on CPU - host_cell_counts.zero(); + cell_counts.host.zero(); double i_cell_size=1.0/_cell_size; int offset_hi=_cells_in_cutoff+1; @@ -403,7 +472,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, int id = ix+iy*ncellx+iz*ncellx*ncelly; cell_id[i] = id; - host_cell_counts[id+1]++; + cell_counts[id+1]++; } for (int i=nt; ik_cell_counts.set_size(GX,neigh_block); - _shared->k_cell_counts.run(&atom.dev_cell_id, &dev_cell_counts, &nall, + _shared->k_cell_counts.run(&atom.dev_cell_id, &cell_counts, &nall, &ncell_3d); } @@ -490,7 +559,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, _shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)* (ncellz-ghost_cells),cell_block,1); _shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id, - &dev_cell_counts, &dev_nbor, &nbor_host, + &cell_counts, &dev_nbor, &nbor_host, &dev_numj_host, &_max_nbors, &cutoff_cast, &ncellx, &ncelly, &ncellz, &inum, &nt, &nall, 
&_threads_per_atom, &_cells_in_cutoff); diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index 5ac01bf331..7949221ea5 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -47,13 +47,15 @@ class Neighbor { * \param pre_cut True if cutoff test will be performed in separate kernel * than the force kernel * \param threads_per_atom Number of threads used per atom for force - * calculation **/ + * calculation + * \param compile_flags Flags for JIT compiling **/ bool init(NeighborShared *shared, const int inum, const int host_inum, const int max_nbors, const int maxspecial, UCL_Device &dev, const int gpu_nbor, const int gpu_host, const bool pre_cut, const int block_cell_2d, const int block_cell_id, const int block_nbor_build, const int threads_per_atom, - const int warp_size, const bool time_device); + const int warp_size, const bool time_device, + const std::string compile_flags); /// Set the size of the cutoff+skin inline void cell_size(const double size, const double cutoff) { @@ -143,6 +145,10 @@ class Neighbor { void get_host(const int inum, int *ilist, int *numj, int **firstneigh, const int block_size); + /// Copy neighbor list from host for 3-body (first time or from a rebuild) + void get_host3(const int inum, const int nlist, int *ilist, int *numj, + int **firstneigh, const int block_size); + /// Return the stride in elements for each nbor row inline int nbor_pitch() const { return _nbor_pitch; } @@ -207,11 +213,9 @@ class Neighbor { UCL_D_Vec dev_nspecial; /// Device storage for special neighbors UCL_D_Vec dev_special, dev_special_t; - /// Host storage for number of particles per cell - UCL_H_Vec host_cell_counts; + /// Host/Device storage for number of particles per cell + UCL_Vector cell_counts; int *cell_iter; - /// Device storage for number of particles per cell - UCL_D_Vec dev_cell_counts; /// Device timers UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose; diff --git a/lib/gpu/lal_neighbor_shared.cpp b/lib/gpu/lal_neighbor_shared.cpp index ba948b4287..d5d37883cb 100644 --- a/lib/gpu/lal_neighbor_shared.cpp +++ b/lib/gpu/lal_neighbor_shared.cpp @@ -48,15 +48,12 @@ void NeighborShared::clear() { } } -void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor) { +void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor, + const std::string flags) { if (_compiled) return; _gpu_nbor=gpu_nbor; - std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ - std::string(OCL_PRECISION_COMPILE)+" -D"+ - std::string(OCL_VENDOR); - if (_gpu_nbor==0) { nbor_program=new UCL_Program(dev); nbor_program->load_string(neighbor_cpu,flags.c_str()); diff --git a/lib/gpu/lal_neighbor_shared.h b/lib/gpu/lal_neighbor_shared.h index dcd776669c..31d74b0fa6 100644 --- a/lib/gpu/lal_neighbor_shared.h +++ b/lib/gpu/lal_neighbor_shared.h @@ -44,7 +44,8 @@ class NeighborShared { UCL_Texture neigh_tex; /// Compile kernels for neighbor lists - void compile_kernels(UCL_Device &dev, const int gpu_nbor); + void compile_kernels(UCL_Device &dev, const int gpu_nbor, + const std::string flags); // ----------------------------- Kernels UCL_Program *nbor_program, *build_program; diff --git a/lib/gpu/lal_pppm.cpp b/lib/gpu/lal_pppm.cpp index 5a929d9e69..188d23096c 100644 --- a/lib/gpu/lal_pppm.cpp +++ b/lib/gpu/lal_pppm.cpp @@ -136,10 +136,10 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, _npts_y=nyhi_out-nylo_out+1; _npts_z=nzhi_out-nzlo_out+1; _npts_yx=_npts_x*_npts_y; - success=success && 
(brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)== - UCL_SUCCESS); - success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)== - UCL_SUCCESS); + success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device, + UCL_READ_ONLY,UCL_WRITE_ONLY)==UCL_SUCCESS); + success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device, + UCL_READ_WRITE,UCL_READ_ONLY)==UCL_SUCCESS); *vd_brick_p=vd_brick.host.begin(); _max_bytes+=brick.device.row_bytes()+vd_brick.device.row_bytes(); @@ -159,7 +159,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, _max_bytes+=d_brick_atoms.row_bytes(); // Allocate error flags for checking out of bounds atoms - success=success && (error_flag.alloc(1,*ucl_device,UCL_RW_OPTIMIZED, + success=success && (error_flag.alloc(1,*ucl_device,UCL_READ_ONLY, UCL_WRITE_ONLY)==UCL_SUCCESS); if (!success) { flag=-3; @@ -374,9 +374,7 @@ void PPPMT::compile_kernels(UCL_Device &dev) { if (sizeof(grdtyp)==sizeof(double) && ucl_device->double_precision()==false) return; - std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ - std::string(OCL_PRECISION_COMPILE)+" -D"+ - std::string(OCL_VENDOR); + std::string flags=device->compile_string(); #ifdef USE_OPENCL flags+=std::string(" -Dgrdtyp=")+ucl_template_name()+" -Dgrdtyp4="+ ucl_template_name()+"4"; diff --git a/lib/gpu/lal_pppm_ext.cpp b/lib/gpu/lal_pppm_ext.cpp index c65508987d..08f2c94e90 100644 --- a/lib/gpu/lal_pppm_ext.cpp +++ b/lib/gpu/lal_pppm_ext.cpp @@ -97,11 +97,11 @@ float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, const int nzhi_out, float **rho_coeff, float **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, const int nz_pppm, - const bool split, int &success) { + const bool split, const bool respa, int &success) { float *b=pppm_gpu_init(PPPMF,nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, slab_volfactor,nx_pppm,ny_pppm,nz_pppm,split,success); - if (split==false) + if (split==false && respa==false) PPPMF.device->set_single_precompute(&PPPMF); return b; } @@ -139,12 +139,13 @@ double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen, const int nzhi_out, double **rho_coeff, double **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, - const int nz_pppm, const bool split, int &success) { + const int nz_pppm, const bool split, + const bool respa, int &success) { double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff, vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm, split,success); - if (split==false) + if (split==false && respa==false) PPPMD.device->set_double_precompute(&PPPMD); return b; } diff --git a/lib/gpu/lal_precision.h b/lib/gpu/lal_precision.h index e31b10037e..88890d67f3 100644 --- a/lib/gpu/lal_precision.h +++ b/lib/gpu/lal_precision.h @@ -96,19 +96,27 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; -// OCL_VENDOR: preprocessor define for hardware +// OCL_DEFAULT_VENDOR: preprocessor define for hardware // specific sizes of OpenCL kernel related constants #ifdef FERMI_OCL -#define OCL_VENDOR "FERMI_OCL" +#define OCL_DEFAULT_VENDOR "fermi" +#endif + +#ifdef KEPLER_OCL +#define OCL_DEFAULT_VENDOR "kepler" #endif #ifdef CYPRESS_OCL -#define OCL_VENDOR "CYPRESS_OCL" +#define OCL_DEFAULT_VENDOR "cypress" #endif -#ifndef OCL_VENDOR -#define OCL_VENDOR "GENERIC_OCL" 
+#ifdef GENERIC_OCL +#define OCL_DEFAULT_VENDOR "generic" +#endif + +#ifndef OCL_DEFAULT_VENDOR +#define OCL_DEFAULT_VENDOR "none" #endif #endif diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 16e4c29610..05e3e1d57b 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -214,6 +214,30 @@ typedef struct _double4 double4; #endif +// ------------------------------------------------------------------------- +// NVIDIA GENERIC OPENCL DEFINITIONS +// ------------------------------------------------------------------------- + +#ifdef NV_GENERIC_OCL + +#define USE_OPENCL +#define fast_mul mul24 +#define MEM_THREADS 16 +#define THREADS_PER_ATOM 1 +#define THREADS_PER_CHARGE 1 +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 +#define BLOCK_NBOR_BUILD 64 +#define BLOCK_BIO_PAIR 64 + +#define WARP_SIZE 32 +#define PPPM_BLOCK_1D 64 +#define BLOCK_CELL_2D 8 +#define BLOCK_CELL_ID 128 +#define MAX_BIO_SHARED_TYPES 128 + +#endif + // ------------------------------------------------------------------------- // NVIDIA FERMI OPENCL DEFINITIONS // ------------------------------------------------------------------------- @@ -221,9 +245,6 @@ typedef struct _double4 double4; #ifdef FERMI_OCL #define USE_OPENCL -#define fast_mul(X,Y) (X)*(Y) -#define ARCH 0 -#define DRIVER 0 #define MEM_THREADS 32 #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 @@ -238,7 +259,54 @@ typedef struct _double4 double4; #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 -#pragma OPENCL EXTENSION cl_khr_fp64: enable +#endif + +// ------------------------------------------------------------------------- +// NVIDIA KEPLER OPENCL DEFINITIONS +// ------------------------------------------------------------------------- + +#ifdef KEPLER_OCL + +#define USE_OPENCL +#define MEM_THREADS 32 +#define THREADS_PER_ATOM 4 +#define THREADS_PER_CHARGE 8 +#define BLOCK_PAIR 256 +#define MAX_SHARED_TYPES 11 +#define BLOCK_NBOR_BUILD 128 +#define BLOCK_BIO_PAIR 256 +#define BLOCK_ELLIPSE 128 + +#define WARP_SIZE 32 +#define PPPM_BLOCK_1D 64 +#define BLOCK_CELL_2D 8 +#define BLOCK_CELL_ID 128 +#define MAX_BIO_SHARED_TYPES 128 + +#ifndef NO_OCL_PTX +#define ARCH 300 +#ifdef _SINGLE_SINGLE +inline float shfl_xor(float var, int laneMask, int width) { + float ret; + int c; + c = ((WARP_SIZE-width) << 8) | 0x1f; + asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c)); + return ret; +} +#else +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +inline double shfl_xor(double var, int laneMask, int width) { + int c = ((WARP_SIZE-width) << 8) | 0x1f; + int x,y,x2,y2; + double ans; + asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var)); + asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(x2) : "r"(x), "r"(laneMask), "r"(c)); + asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(y2) : "r"(y), "r"(laneMask), "r"(c)); + asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2)); + return ans; +} +#endif +#endif #endif @@ -249,9 +317,6 @@ typedef struct _double4 double4; #ifdef CYPRESS_OCL #define USE_OPENCL -#define fast_mul(X,Y) (X)*(Y) -#define ARCH 0 -#define DRIVER 0 #define MEM_THREADS 32 #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 @@ -266,12 +331,6 @@ typedef struct _double4 double4; #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 -#if defined(cl_khr_fp64) -#pragma OPENCL EXTENSION cl_khr_fp64 : enable -#elif defined(cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64 : enable -#endif - #endif // 
------------------------------------------------------------------------- @@ -281,9 +340,6 @@ typedef struct _double4 double4; #ifdef GENERIC_OCL #define USE_OPENCL -#define fast_mul mul24 -#define ARCH 0 -#define DRIVER 0 #define MEM_THREADS 16 #define THREADS_PER_ATOM 1 #define THREADS_PER_CHARGE 1 @@ -298,6 +354,20 @@ typedef struct _double4 double4; #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 +#endif + +// ------------------------------------------------------------------------- +// OPENCL Stuff for All Hardware +// ------------------------------------------------------------------------- +#ifdef USE_OPENCL + +#ifndef _SINGLE_SINGLE + +#ifndef cl_khr_fp64 +#ifndef cl_amd_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif +#endif #if defined(cl_khr_fp64) #pragma OPENCL EXTENSION cl_khr_fp64 : enable #elif defined(cl_amd_fp64) @@ -306,10 +376,17 @@ typedef struct _double4 double4; #endif -// ------------------------------------------------------------------------- -// OPENCL Stuff for All Hardware -// ------------------------------------------------------------------------- -#ifdef USE_OPENCL +#ifndef fast_mul +#define fast_mul(X,Y) (X)*(Y) +#endif + +#ifndef ARCH +#define ARCH 0 +#endif + +#ifndef DRIVER +#define DRIVER 0 +#endif #define GLOBAL_ID_X get_global_id(0) #define THREAD_ID_X get_local_id(0) diff --git a/lib/gpu/lal_re_squared.cpp b/lib/gpu/lal_re_squared.cpp index 8f7ef24a11..cbf50fab7d 100644 --- a/lib/gpu/lal_re_squared.cpp +++ b/lib/gpu/lal_re_squared.cpp @@ -74,7 +74,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, // Allocate a host write buffer for copying type data UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; iatom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, host_offset); - dev_error.alloc(1,*(this->ucl_device)); + dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY); dev_error.zero(); // Allocate, cast and asynchronous memcpy of constant data @@ -260,6 +260,9 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); this->time_ellipsoid3.stop(); } else { + GX=static_cast(ceil(static_cast(this->ans->inum()- + this->_last_ellipse)/ + (BX/this->_threads_per_atom))); this->ans->force.zero(); this->ans->engv.zero(); this->time_nbor1.zero(); diff --git a/lib/gpu/lal_re_squared.cu b/lib/gpu/lal_re_squared.cu index 28e15b5cdf..1308a70a7f 100644 --- a/lib/gpu/lal_re_squared.cu +++ b/lib/gpu/lal_re_squared.cu @@ -75,7 +75,8 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, if (ii0) { \ - *ap1+=energy; \ + *ap1+=energy*(acctyp)0.5; \ ap1+=astride; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *ap1+=virial[i]; \ + *ap1+=virial[i]*(acctyp)0.5; \ ap1+=astride; \ } \ } \ @@ -104,12 +104,12 @@ if (offset==0) { \ __global acctyp *ap1=engv+ii; \ if (eflag>0) { \ - *ap1+=energy; \ + *ap1+=energy*(acctyp)0.5; \ ap1+=astride; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *ap1+=virial[i]; \ + *ap1+=virial[i]*(acctyp)0.5; \ ap1+=astride; \ } \ } \ @@ -173,7 +173,8 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, if (ii host_write_int(lj_types*lj_types,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); for (int ix=1; ix 
host_write2(_ntables*_tablength,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i<_ntables*_tablength; i++) { host_write2[i].x = 0.0; host_write2[i].y = 0.0; @@ -190,7 +190,7 @@ int TableT::init(const int ntypes, ucl_copy(coeff4,host_write2,false); UCL_H_Vec host_rsq(lj_types*lj_types,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq); diff --git a/lib/gpu/lal_table.cu b/lib/gpu/lal_table.cu index ed843b59ca..afc6267902 100644 --- a/lib/gpu/lal_table.cu +++ b/lib/gpu/lal_table.cu @@ -74,7 +74,8 @@ __kernel void k_table(const __global numtyp4 *restrict x_, if (ii host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; iucl_device->shared_memory() && sizeof(numtyp)==sizeof(double)) + _shared_view=true; + else + _shared_view=false; + // allocate rad - bool cpuview=false; - if (this->ucl_device->device_type()==UCL_CPU) - cpuview=true; - int ef_nall=nall; if (ef_nall==0) ef_nall=2000; _max_rad_size=static_cast(static_cast(ef_nall)*1.10); - host_rad.alloc(_max_rad_size,*(this->ucl_device)); - if (cpuview) - dev_rad.view(host_rad); - else - dev_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY); + + if (_shared_view==false) + c_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); rad_tex.get_texture(*(this->pair_program),"rad_tex"); - rad_tex.bind_float(dev_rad,1); + rad_tex.bind_float(c_rad,1); // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; @@ -90,7 +89,7 @@ int YukawaColloidT::init(const int ntypes, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; iclear_atomic(); } @@ -143,22 +141,11 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full, // ------------------- Resize rad array -------------------------- if (nall>_max_rad_size) { - dev_rad.clear(); - host_rad.clear(); - _max_rad_size=static_cast(static_cast(nall)*1.10); - host_rad.alloc(_max_rad_size,*(this->ucl_device)); - - if (this->ucl_device->device_type()==UCL_CPU) { - if (sizeof(numtyp)==sizeof(double)) { - host_rad.view((numtyp*)rad,nall,*(this->ucl_device)); - dev_rad.view(host_rad); - } - } else { - dev_rad.alloc(_max_rad_size,*(this->ucl_device)); + if (_shared_view==false) { + c_rad.resize(_max_rad_size); + rad_tex.bind_float(c_rad,1); } - - rad_tex.bind_float(dev_rad,1); } // ---------------------------------------------------------------- @@ -212,22 +199,11 @@ int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall // ------------------- Resize rad array ---------------------------- if (nall>_max_rad_size) { - dev_rad.clear(); - host_rad.clear(); - _max_rad_size=static_cast(static_cast(nall)*1.10); - host_rad.alloc(_max_rad_size,*(this->ucl_device)); - - if (this->ucl_device->device_type()==UCL_CPU) { - if (sizeof(numtyp)==sizeof(double)) { - host_rad.view((numtyp*)rad,nall,*(this->ucl_device)); - dev_rad.view(host_rad); - } - } else { - dev_rad.alloc(_max_rad_size,*(this->ucl_device)); + if (_shared_view==false) { + c_rad.resize(_max_rad_size); + rad_tex.bind_float(c_rad,1); } - - rad_tex.bind_float(dev_rad,1); } // ----------------------------------------------------------------- @@ -298,13 +274,13 @@ void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { 
this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &dev_rad, &coeff, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &c_rad, &coeff, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &dev_rad, &coeff, &_lj_types, &sp_lj, + this->k_pair.run(&this->atom->x, &c_rad, &coeff, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); diff --git a/lib/gpu/lal_yukawa_colloid.cu b/lib/gpu/lal_yukawa_colloid.cu index 55f52e70e7..65da63f8f7 100644 --- a/lib/gpu/lal_yukawa_colloid.cu +++ b/lib/gpu/lal_yukawa_colloid.cu @@ -59,7 +59,8 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, if (ii { inline void cast_rad_data(double* rad) { int nall = this->atom->nall(); - if (this->ucl_device->device_type()==UCL_CPU) { - if (sizeof(numtyp)==sizeof(double)) { - host_rad.view((numtyp*)rad,nall,*(this->ucl_device)); - dev_rad.view(host_rad); - } else { - for (int i=0; iucl_device)); + c_rad.device.view(c_rad.host); } else { if (sizeof(numtyp)==sizeof(double)) - memcpy(host_rad.begin(),rad,nall*sizeof(numtyp)); - else { - for (int i=0; iatom->nall(),true); + c_rad.update_device(this->atom->nall(),true); } /// Clear all host and device data @@ -114,10 +109,10 @@ class YukawaColloid : public BaseAtomic { numtyp _kappa; /// Per-atom arrays - UCL_H_Vec host_rad; - UCL_D_Vec dev_rad; + UCL_Vector c_rad; private: + bool _shared_view; bool _allocated; void loop(const bool _eflag, const bool _vflag); };
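
A note on the reworked AnswerT::energy_virial in lal_answer.cpp: the host loops now read engv as contiguous blocks of _inum values per field, with every entry already multiplied by 0.5 inside the kernels (see the lal_aux_fun1.h, lal_coul_long.cu, lal_dipole_*.cu, lal_eam.cu and lal_ellipsoid_extra.h hunks). The sketch below is only a standalone illustration of that reduction: the sizes and values are made up, the _ilist indirection and the _ef_atom/_vf_atom switches are omitted, and the inner virial loop is inferred from context because the hunk is truncated in this listing.

  // Minimal standalone sketch of the host-side engv reduction assumed by the
  // patch; everything here is illustrative, not the patched code itself.
  #include <cstdio>
  #include <vector>

  int main() {
    const int inum = 4;                    // local atom count (made up)
    const bool eflag = true, vflag = true;
    // Layout: [inum energies][6 x inum virial terms]; charged styles add a
    // second block of inum coulomb energies after the first block. Every
    // entry is already multiplied by 0.5 in the kernel.
    std::vector<double> engv(inum + 6 * inum, 0.25);   // dummy kernel output

    std::vector<double> eatom(inum, 0.0);
    std::vector<std::vector<double> > vatom(inum, std::vector<double>(6, 0.0));
    double evdwl = 0.0;
    double virial[6] = {0.0};

    int vstart = 0;
    if (eflag) {
      for (int i = 0; i < inum; i++) {
        evdwl += engv[i];
        eatom[i] += engv[i];               // no extra *0.5 on the host anymore
      }
      vstart = inum;                       // virial blocks start after the energies
    }
    if (vflag) {
      for (int j = 0; j < 6; j++) {        // six contiguous blocks of inum entries
        for (int i = 0; i < inum; i++) {
          virial[j]   += engv[vstart + i];
          vatom[i][j] += engv[vstart + i];
        }
        vstart += inum;
      }
    }
    std::printf("evdwl=%g virial[0]=%g\n", evdwl, virial[0]);
    return 0;
  }

Compared with the old per-atom walk (al advancing by _inum inside one loop), each quantity is now accumulated in a simple contiguous pass, and the *0.5 the host used to apply to eatom/vatom is gone because the kernels store half values directly.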
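
The new Device::set_ocl_params in lal_device.cpp accepts, besides the named targets (fermi, kepler, cypress, generic), a custom device string of exactly 13 whitespace-separated values that are translated into -D defines for JIT compilation; a wrong count makes init_device return -11. The standalone sketch below re-implements that translation in a simplified way (std::istringstream instead of the patch's strtok, and no quote stripping); the example numbers are simply the KEPLER_OCL defaults from lal_preprocessor.h.

  // Simplified, standalone version of the custom-parameter handling in
  // Device::set_ocl_params; not the patched code itself.
  #include <iostream>
  #include <sstream>
  #include <string>
  #include <vector>

  int main() {
    // 13 values in the order expected by the patch; numbers taken from the
    // KEPLER_OCL block purely as an example.
    const std::string custom = "32 4 8 256 11 128 256 128 32 64 8 128 128";
    const char *names[13] = {
      "MEM_THREADS", "THREADS_PER_ATOM", "THREADS_PER_CHARGE", "BLOCK_PAIR",
      "MAX_SHARED_TYPES", "BLOCK_NBOR_BUILD", "BLOCK_BIO_PAIR", "BLOCK_ELLIPSE",
      "WARP_SIZE", "PPPM_BLOCK_1D", "BLOCK_CELL_2D", "BLOCK_CELL_ID",
      "MAX_BIO_SHARED_TYPES"};

    std::istringstream in(custom);
    std::vector<std::string> params;
    for (std::string tok; in >> tok;) params.push_back(tok);
    if (params.size() != 13) {             // wrong count: init_device reports -11
      std::cerr << "expected 13 parameters, got " << params.size() << "\n";
      return 1;
    }

    std::string vendor_string = "-DUSE_OPENCL";
    for (int i = 0; i < 13; i++)
      vendor_string += " -D" + std::string(names[i]) + "=" + params[i];
    std::cout << vendor_string << "\n";
    return 0;
  }

In the patch the resulting vendor string is then prefixed with "-cl-fast-relaxed-math -cl-mad-enable" and OCL_PRECISION_COMPILE to form _ocl_compile_string, which is what compile_string() hands to every load_string() call in the lal_base_*.cpp, lal_neighbor_shared.cpp and lal_pppm.cpp hunks.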
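
The lal_preprocessor.h hunks move fast_mul, ARCH and DRIVER out of the individual target blocks and into the shared "OPENCL Stuff for All Hardware" section as #ifndef fallbacks, so a target block such as KEPLER_OCL only defines what it actually overrides (for example ARCH 300 when PTX intrinsics are enabled). The toy program below shows the same fallback pattern; DEMO_TARGET_OCL is an invented stand-in for one of the real target macros.

  // Toy illustration of the #ifndef fallback pattern now used in
  // lal_preprocessor.h; DEMO_TARGET_OCL stands in for FERMI_OCL, KEPLER_OCL,
  // CYPRESS_OCL or GENERIC_OCL.
  #include <cstdio>

  #define DEMO_TARGET_OCL

  #ifdef DEMO_TARGET_OCL
  #define ARCH 300               // a target block defines only what it overrides
  #endif

  // Shared section: defaults apply only when no target block defined them.
  #ifndef fast_mul
  #define fast_mul(X,Y) (X)*(Y)
  #endif
  #ifndef ARCH
  #define ARCH 0
  #endif
  #ifndef DRIVER
  #define DRIVER 0
  #endif

  int main() {
    std::printf("ARCH=%d DRIVER=%d fast_mul(3,4)=%d\n", ARCH, DRIVER,
                fast_mul(3,4));
    return 0;
  }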