git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@10667 f3b2605a-c512-4ea7-a41b-209d697bcdaa

commit 402d1a8605 (parent 4047288238)
Author: sjplimp
Date:   2013-08-23 14:41:20 +00:00
105 changed files with 2003 additions and 954 deletions

View File

@ -4,9 +4,14 @@
# which file will be copied to Makefile.lammps
EXTRAMAKE = Makefile.lammps.standard
EXTRAMAKE = Makefile.lammps.opencl
OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi
# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler
# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress
# OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device
OCL_CPP = mpic++ $(DEFAULT_DEVICE) -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
OCL_LINK = -lOpenCL
OCL_PREC = -D_SINGLE_SINGLE
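
The OCL_TUNE setting selects a device-specific define at compile time. As a hedged illustration only (the block-size values below are hypothetical, not taken from lal_preprocessor.h), such a define is typically consumed by the preprocessor to pick kernel launch parameters:

// Hypothetical sketch: map the Makefile's OCL_TUNE define to a launch parameter.
// The block sizes are illustrative placeholders, not the library's real values.
#if defined(KEPLER_OCL)
  #define BLOCK_PAIR 256
#elif defined(FERMI_OCL) || defined(CYPRESS_OCL)
  #define BLOCK_PAIR 128
#else  /* GENERIC_OCL or no tuning define */
  #define BLOCK_PAIR 64
#endif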

View File

@ -4,7 +4,12 @@
# which file will be copied to Makefile.lammps
EXTRAMAKE = Makefile.lammps.standard
EXTRAMAKE = Makefile.lammps.mac_ocl
OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi
# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler
# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress
# OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device
OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT
OCL_LINK = -framework OpenCL

View File

@ -28,7 +28,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_neighbor.o $(OBJ_DIR)/lal_neighbor_shared.o \
$(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
$(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
$(OBJ_DIR)/lal_base_dipole.o \
$(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \
$(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
$(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
$(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@ -59,7 +59,12 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
$(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
$(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o
$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o \
$(OBJ_DIR)/lal_sw.o $(OBJ_DIR)/lal_sw_ext.o \
$(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o
CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
@ -99,7 +104,12 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \
$(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \
$(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \
$(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h
$(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h \
$(OBJ_DIR)/sw.cubin $(OBJ_DIR)/sw_cubin.h \
$(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck_cubin.h \
$(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \
$(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \
$(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h
all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
@ -175,6 +185,9 @@ $(OBJ_DIR)/lal_base_ellipsoid.o: $(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoi
$(OBJ_DIR)/lal_base_dipole.o: $(ALL_H) lal_base_dipole.h lal_base_dipole.cpp
$(CUDR) -o $@ -c lal_base_dipole.cpp
$(OBJ_DIR)/lal_base_three.o: $(ALL_H) lal_base_three.h lal_base_three.cpp
$(CUDR) -o $@ -c lal_base_three.cpp
$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu
@ -571,6 +584,66 @@ $(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/c
$(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/sw.cubin: lal_sw.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_sw.cu
$(OBJ_DIR)/sw_cubin.h: $(OBJ_DIR)/sw.cubin
$(BIN2C) -c -n sw $(OBJ_DIR)/sw.cubin > $(OBJ_DIR)/sw_cubin.h
$(OBJ_DIR)/lal_sw.o: $(ALL_H) lal_sw.h lal_sw.cpp $(OBJ_DIR)/sw_cubin.h $(OBJ_DIR)/lal_base_three.o
$(CUDR) -o $@ -c lal_sw.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_sw_ext.o: $(ALL_H) lal_sw.h lal_sw_ext.cpp lal_base_three.h
$(CUDR) -o $@ -c lal_sw_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/beck.cubin: lal_beck.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_beck.cu
$(OBJ_DIR)/beck_cubin.h: $(OBJ_DIR)/beck.cubin
$(BIN2C) -c -n beck $(OBJ_DIR)/beck.cubin > $(OBJ_DIR)/beck_cubin.h
$(OBJ_DIR)/lal_beck.o: $(ALL_H) lal_beck.h lal_beck.cpp $(OBJ_DIR)/beck_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_beck.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_beck_ext.o: $(ALL_H) lal_beck.h lal_beck_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_beck_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/mie.cubin: lal_mie.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_mie.cu
$(OBJ_DIR)/mie_cubin.h: $(OBJ_DIR)/mie.cubin
$(BIN2C) -c -n mie $(OBJ_DIR)/mie.cubin > $(OBJ_DIR)/mie_cubin.h
$(OBJ_DIR)/lal_mie.o: $(ALL_H) lal_mie.h lal_mie.cpp $(OBJ_DIR)/mie_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_mie.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_mie_ext.o: $(ALL_H) lal_mie.h lal_mie_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_mie_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/soft.cubin: lal_soft.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_soft.cu
$(OBJ_DIR)/soft_cubin.h: $(OBJ_DIR)/soft.cubin
$(BIN2C) -c -n soft $(OBJ_DIR)/soft.cubin > $(OBJ_DIR)/soft_cubin.h
$(OBJ_DIR)/lal_soft.o: $(ALL_H) lal_soft.h lal_soft.cpp $(OBJ_DIR)/soft_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_soft.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_soft_ext.o: $(ALL_H) lal_soft.h lal_soft_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_soft_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_coul_msm.cubin: lal_lj_coul_msm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul_msm.cu
$(OBJ_DIR)/lj_coul_msm_cubin.h: $(OBJ_DIR)/lj_coul_msm.cubin
$(BIN2C) -c -n lj_coul_msm $(OBJ_DIR)/lj_coul_msm.cubin > $(OBJ_DIR)/lj_coul_msm_cubin.h
$(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(OBJ_DIR)/lj_coul_msm_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_coul_msm.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda

View File

@ -17,7 +17,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_neighbor_shared.o $(OBJ_DIR)/lal_neighbor.o \
$(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
$(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
$(OBJ_DIR)/lal_base_dipole.o \
$(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \
$(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
$(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
$(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@ -48,7 +48,12 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
$(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
$(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o
$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o \
$(OBJ_DIR)/lal_sw.o $(OBJ_DIR)/lal_sw_ext.o \
$(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o
KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
$(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
@ -68,7 +73,9 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
$(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/dipole_lj_cl.h \
$(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/colloid_cl.h \
$(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \
$(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h
$(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \
$(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \
$(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h
OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@ -117,6 +124,9 @@ $(OBJ_DIR)/lal_base_ellipsoid.o: $(OCL_H) lal_base_ellipsoid.h lal_base_ellipsoi
$(OBJ_DIR)/lal_base_dipole.o: $(OCL_H) lal_base_dipole.h lal_base_dipole.cpp
$(OCL) -o $@ -c lal_base_dipole.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_base_three.o: $(OCL_H) lal_base_three.h lal_base_three.cpp
$(OCL) -o $@ -c lal_base_three.cpp
$(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h;
@ -405,6 +415,51 @@ $(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/
$(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/sw_cl.h: lal_sw.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh sw $(PRE1_H) lal_sw.cu $(OBJ_DIR)/sw_cl.h;
$(OBJ_DIR)/lal_sw.o: $(ALL_H) lal_sw.h lal_sw.cpp $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/lal_base_three.o
$(OCL) -o $@ -c lal_sw.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_sw_ext.o: $(ALL_H) lal_sw.h lal_sw_ext.cpp lal_base_three.h
$(OCL) -o $@ -c lal_sw_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/beck_cl.h: lal_beck.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh beck $(PRE1_H) lal_beck.cu $(OBJ_DIR)/beck_cl.h;
$(OBJ_DIR)/lal_beck.o: $(ALL_H) lal_beck.h lal_beck.cpp $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_beck.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_beck_ext.o: $(ALL_H) lal_beck.h lal_beck_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_beck_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/mie_cl.h: lal_mie.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh mie $(PRE1_H) lal_mie.cu $(OBJ_DIR)/mie_cl.h;
$(OBJ_DIR)/lal_mie.o: $(ALL_H) lal_mie.h lal_mie.cpp $(OBJ_DIR)/mie_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_mie.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_mie_ext.o: $(ALL_H) lal_mie.h lal_mie_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_mie_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/soft_cl.h: lal_soft.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh soft $(PRE1_H) lal_soft.cu $(OBJ_DIR)/soft_cl.h;
$(OBJ_DIR)/lal_soft.o: $(ALL_H) lal_soft.h lal_soft.cpp $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_soft.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_soft_ext.o: $(ALL_H) lal_soft.h lal_soft_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_soft_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_coul_msm_cl.h: lal_lj_coul_msm.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh lj_coul_msm $(PRE1_H) lal_lj_coul_msm.cu $(OBJ_DIR)/lj_coul_msm_cl.h;
$(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(OBJ_DIR)/lj_coul_msm_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_lj_coul_msm.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)

View File

@ -3,6 +3,7 @@
--------------------------------
W. Michael Brown (ORNL)
Trung Dac Nguyen (ORNL)
Peng Wang (NVIDIA)
Axel Kohlmeyer (Temple)
Steve Plimpton (SNL)
@ -60,6 +61,8 @@ devices on your system. A Makefile for OpenCL compilation is provided,
but the developers do not currently support the use of OpenCL.
Details of the implementation are provided in:
----
Brown, W.M., Wang, P., Plimpton, S.J., Tharrington, A.N. Implementing
Molecular Dynamics on Hybrid High Performance Computers - Short Range
Forces. Computer Physics Communications. 2011. 182: p. 898-911.
@ -68,28 +71,64 @@ and
Brown, W.M., Kohlmeyer, A., Plimpton, S.J., Tharrington, A.N. Implementing
Molecular Dynamics on Hybrid High Performance Computers - Particle-Particle
Particle-Mesh. Computer Physics Communications. 2011. In press.
Particle-Mesh. Computer Physics Communications. 2012. 183: p. 449-459.
and
Brown, W.M., Masako, Y. Implementing Molecular Dynamics on Hybrid High
Performance Computers - Three-Body Potentials. Computer Physics Communications.
2013. In press.
----
NOTE: Installation of the CUDA SDK is not required.
Current styles supporting GPU acceleration:
1. lj/cut
2. lj96/cut
3. lj/expand
4. lj/cut/coul/cut
5. lj/cut/coul/long
6. lj/charmm/coul/long
7. lj/class2
8. lj/class2/coul/long
9. morse
10. cg/cmm
11. cg/cmm/coul/long
12. coul/long
13. gayberne
14. resquared
15. pppm
1 beck
2 born/coul/long
3 born/coul/wolf
4 born
5 buck/coul/cut
6 buck/coul/long
7 buck
8 colloid
9 coul/dsf
10 coul/long
11 eam/alloy
12 eam/fs
13 eam
14 eam/lj
15 gauss
16 gayberne
17 lj96/cut
18 lj/charmm/coul/long
19 lj/class2/coul/long
20 lj/class2
21 lj/cut/coul/cut
22 lj/cut/coul/debye
23 lj/cut/coul/dsf
24 lj/cut/coul/long
25 lj/cut/coul/msm
26 lj/cut/coul/wolf/fsw
27 lj/cut/dipole/cut
28 lj/cut
29 lj/cut/tgpu
30 lj/expand
31 lj/sdk/coul/long
32 cg/cmm/coul/long
33 lj/sdk
34 cg/cmm
35 lj/sf/dipole/sf
36 mie/cut
37 morse
38 resquared
39 soft
40 sw
41 table
42 yukawa/colloid
43 yukawa
44 pppm
MULTIPLE LAMMPS PROCESSES
@ -170,3 +209,4 @@ make yes-asphere
make yes-kspace
make yes-gpu
make linux

View File

@ -1 +1 @@
Geryon Version 12.033
Geryon Version 13.209

View File

@ -65,15 +65,19 @@ class UCL_Device {
public:
/// Collect properties for every GPU on the node
/** \note You must set the active GPU with set() before using the device **/
UCL_Device();
inline UCL_Device();
~UCL_Device();
inline ~UCL_Device();
/// Returns 1 (For compatibility with OpenCL)
inline int num_platforms() { return 1; }
/// Return a string with name and info of the current platform
std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA Driver"; }
inline std::string platform_name()
{ return "NVIDIA Corporation NVIDIA CUDA Driver"; }
/// Delete any contexts/data and set the platform number to be used
inline int set_platform(const int pid);
/// Return the number of devices that support CUDA
inline int num_devices() { return _properties.size(); }
@ -81,8 +85,12 @@ class UCL_Device {
/// Set the CUDA device to the specified device number
/** A context and default command queue will be created for the device
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
* be allocated for use **/
int set(int num);
* be allocated for use. clear() is called to delete any contexts and
* associated data from previous calls to set(). **/
inline int set(int num);
/// Delete any context and associated data stored from a call to set()
inline void clear();
/// Get the current device number
inline int device_num() { return _device; }
@ -147,16 +155,17 @@ class UCL_Device {
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
/// Returns true if double precision is supported for the current device
bool double_precision() { return double_precision(_device); }
inline bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device
bool double_precision(const int i) {return arch(i)>=1.3;}
inline bool double_precision(const int i) {return arch(i)>=1.3;}
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
/// Get the number of cores
inline unsigned cores(const int i)
{ if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
else if (arch(i)<3.0) return _properties[i].multiProcessorCount*32;
else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32;
else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48;
else return _properties[i].multiProcessorCount*192; }
/// Get the gigabytes of global memory in the current device
@ -216,8 +225,34 @@ class UCL_Device {
inline bool sharing_supported(const int i)
{ return (_properties[i].computeMode == CU_COMPUTEMODE_DEFAULT); }
/// True if splitting device into equal subdevices supported
inline bool fission_equal()
{ return fission_equal(_device); }
/// True if splitting device into equal subdevices supported
inline bool fission_equal(const int i)
{ return false; }
/// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts()
{ return fission_by_counts(_device); }
/// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts(const int i)
{ return false; }
/// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity()
{ return fission_by_affinity(_device); }
/// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity(const int i)
{ return false; }
/// Maximum number of subdevices allowed from device fission
inline int max_sub_devices()
{ return max_sub_devices(_device); }
/// Maximum number of subdevices allowed from device fission
inline int max_sub_devices(const int i)
{ return 0; }
/// List all devices along with all properties
void print_all(std::ostream &out);
inline void print_all(std::ostream &out);
private:
int _device, _num_devices;
@ -228,7 +263,7 @@ class UCL_Device {
};
// Grabs the properties for all devices
inline UCL_Device::UCL_Device() {
UCL_Device::UCL_Device() {
CU_SAFE_CALL_NS(cuInit(0));
CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices));
for (int dev=0; dev<_num_devices; ++dev) {
@ -280,22 +315,21 @@ inline UCL_Device::UCL_Device() {
_cq.back()=0;
}
inline UCL_Device::~UCL_Device() {
if (_device>-1) {
for (int i=1; i<num_queues(); i++) pop_command_queue();
cuCtxDestroy(_context);
}
UCL_Device::~UCL_Device() {
clear();
}
int UCL_Device::set_platform(const int pid) {
clear();
#ifdef UCL_DEBUG
assert(pid<num_platforms());
#endif
return UCL_SUCCESS;
}
// Set the CUDA device to the specified device number
inline int UCL_Device::set(int num) {
if (_device==num)
return UCL_SUCCESS;
if (_device>-1) {
CU_SAFE_CALL_NS(cuCtxDestroy(_context));
for (int i=1; i<num_queues(); i++) pop_command_queue();
_cq[0]=0;
}
int UCL_Device::set(int num) {
clear();
_device=_properties[num].device_id;
CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
CUresult err=cuCtxCreate(&_context,0,_cu_device);
@ -310,8 +344,16 @@ inline int UCL_Device::set(int num) {
return UCL_SUCCESS;
}
void UCL_Device::clear() {
if (_device>-1) {
for (int i=1; i<num_queues(); i++) pop_command_queue();
cuCtxDestroy(_context);
}
_device=-1;
}
// List all devices along with all properties
inline void UCL_Device::print_all(std::ostream &out) {
void UCL_Device::print_all(std::ostream &out) {
#if CUDA_VERSION >= 2020
int driver_version;
cuDriverGetVersion(&driver_version);
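
These nvd_device.h changes move context teardown into clear(), which set() and the destructor now call. A minimal usage sketch of the revised API (the include path and the ucl_cudadr namespace are assumptions based on the surrounding library layout):

// Sketch: select a CUDA device through the revised Geryon API, then release it.
#include "nvd_device.h"
using namespace ucl_cudadr;

int pick_first_device() {
  UCL_Device dev;                  // gathers properties for every CUDA device
  if (dev.num_devices() == 0) return -1;
  if (dev.set(0) != UCL_SUCCESS)   // creates context + default queue; calls clear() first
    return -1;
  // ... allocate matrices, launch kernels on dev.cq() ...
  dev.clear();                     // drops the context; also invoked by the destructor
  return 0;
}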

View File

@ -377,6 +377,10 @@ class UCL_Kernel {
#endif
}
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return _cq; }
/// Change the default command queue associated with matrix
inline void cq(command_queue &cq_in) { _cq=cq_in; }
#include "ucl_arg_kludge.h"
private:
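
The new cq() accessors on UCL_Kernel let a kernel be bound to a non-default command queue. A hedged sketch follows; push_command_queue() and the indexed dev.cq(1) accessor are assumed from the rest of Geryon, not shown in this hunk:

// Sketch: bind a kernel to a secondary stream instead of the device default.
UCL_Device dev;
dev.set(0);
dev.push_command_queue();      // assumed counterpart of pop_command_queue()
UCL_Kernel k;
// ... k.set_function(program, "my_kernel") on a compiled UCL_Program ...
k.cq(dev.cq(1));               // accessor added in this change
k.run();                       // now enqueues on queue 1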

View File

@ -47,14 +47,14 @@ typedef CUdeviceptr device_ptr;
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind) {
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
CUresult err=CUDA_SUCCESS;
if (kind==UCL_RW_OPTIMIZED)
err=cuMemAllocHost((void **)mat.host_ptr(),n);
else if (kind==UCL_WRITE_OPTIMIZED)
if (kind==UCL_NOT_PINNED)
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (kind==UCL_WRITE_ONLY)
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
else
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
err=cuMemAllocHost((void **)mat.host_ptr(),n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
@ -63,14 +63,14 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind) {
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
CUresult err=CUDA_SUCCESS;
if (kind==UCL_RW_OPTIMIZED)
err=cuMemAllocHost((void **)mat.host_ptr(),n);
else if (kind==UCL_WRITE_OPTIMIZED)
if (kind==UCL_NOT_PINNED)
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (kind==UCL_WRITE_ONLY)
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
else
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
err=cuMemAllocHost((void **)mat.host_ptr(),n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
mat.cq()=dev.cq();
@ -78,8 +78,10 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
}
template <class mat_type>
inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
if (kind!=UCL_NOT_PINNED)
inline void _host_free(mat_type &mat) {
if (mat.kind()==UCL_VIEW)
return;
else if (mat.kind()!=UCL_NOT_PINNED)
CU_DESTRUCT_CALL(cuMemFreeHost(mat.begin()));
else
free(mat.begin());
@ -87,14 +89,14 @@ inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
template <class mat_type>
inline int _host_resize(mat_type &mat, const size_t n) {
_host_free(mat,mat.kind());
_host_free(mat);
CUresult err=CUDA_SUCCESS;
if (mat.kind()==UCL_RW_OPTIMIZED)
err=cuMemAllocHost((void **)mat.host_ptr(),n);
else if (mat.kind()==UCL_WRITE_OPTIMIZED)
if (mat.kind()==UCL_NOT_PINNED)
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (mat.kind()==UCL_WRITE_ONLY)
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
else
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
err=cuMemAllocHost((void **)mat.host_ptr(),n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
@ -155,7 +157,8 @@ inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
template <class mat_type>
inline void _device_free(mat_type &mat) {
CU_DESTRUCT_CALL(cuMemFree(mat.cbegin()));
if (mat.kind()!=UCL_VIEW)
CU_DESTRUCT_CALL(cuMemFree(mat.cbegin()));
}
template <class mat_type>
@ -229,13 +232,13 @@ inline void _host_zero(void *ptr, const size_t n) {
}
template <class mat_type>
inline void _device_zero(mat_type &mat, const size_t n) {
inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
if (n%32==0)
CU_SAFE_CALL(cuMemsetD32(mat.cbegin(),0,n/4));
CU_SAFE_CALL(cuMemsetD32Async(mat.cbegin(),0,n/4,cq));
else if (n%16==0)
CU_SAFE_CALL(cuMemsetD16(mat.cbegin(),0,n/2));
CU_SAFE_CALL(cuMemsetD16Async(mat.cbegin(),0,n/2,cq));
else
CU_SAFE_CALL(cuMemsetD8(mat.cbegin(),0,n));
CU_SAFE_CALL(cuMemsetD8Async(mat.cbegin(),0,n,cq));
}
// --------------------------------------------------------------------------
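
_device_zero now enqueues the memset asynchronously on a caller-supplied stream instead of the default queue. A brief, hedged sketch of the calling pattern (UCL_D_Vec and its alloc() signature are assumptions; user code would normally reach this helper through the container's zero() wrapper):

// Sketch: zero a device buffer asynchronously, then wait before using it.
UCL_D_Vec<float> v;                              // device vector (assumed type)
v.alloc(1024, dev);                              // 1024 floats on the active device
_device_zero(v, 1024*sizeof(float), dev.cq());   // enqueued; returns immediately
// ... enqueue independent work here ...
ucl_sync(dev.cq());                              // block until the zero-fill completes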

View File

@ -51,6 +51,10 @@ inline void ucl_sync(cl_command_queue &cq) {
CL_SAFE_CALL(clFinish(cq));
}
inline bool _shared_mem_device(cl_device_type &device_type) {
return (device_type==CL_DEVICE_TYPE_CPU);
}
struct OCLProperties {
std::string name;
cl_device_type device_type;
@ -64,6 +68,10 @@ struct OCLProperties {
bool double_precision;
int alignment;
size_t timer_resolution;
bool ecc_support;
std::string c_version;
bool partition_equal, partition_counts, partition_affinity;
cl_uint max_sub_devices;
};
/// Class for looking at data parallel device properties
@ -74,15 +82,18 @@ class UCL_Device {
public:
/// Collect properties for every device on the node
/** \note You must set the active GPU with set() before using the device **/
UCL_Device();
inline UCL_Device();
~UCL_Device();
inline ~UCL_Device();
/// Return the number of platforms (0 if error or no platforms)
inline int num_platforms() { return _num_platforms; }
/// Return a string with name and info of the current platform
std::string platform_name();
inline std::string platform_name();
/// Delete any contexts/data and set the platform number to be used
inline int set_platform(const int pid);
/// Return the number of devices that support OpenCL
inline int num_devices() { return _num_devices; }
@ -90,8 +101,12 @@ class UCL_Device {
/// Set the OpenCL device to the specified device number
/** A context and default command queue will be created for the device *
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
* be allocated for use **/
int set(int num);
* be allocated for use. clear() is called to delete any contexts and
* associated data from previous calls to set(). **/
inline int set(int num);
/// Delete any context and associated data stored from a call to set()
inline void clear();
/// Get the current device number
inline int device_num() { return _device; }
@ -161,12 +176,14 @@ class UCL_Device {
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
inline bool shared_memory(const int i)
{ return _shared_mem_device(_properties[i].device_type); }
/// Returns true if double precision is supported for the current device
bool double_precision() { return double_precision(_device); }
inline bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device
bool double_precision(const int i) {return _properties[i].double_precision;}
inline bool double_precision(const int i)
{return _properties[i].double_precision;}
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
@ -227,8 +244,34 @@ class UCL_Device {
inline bool sharing_supported(const int i)
{ return true; }
/// True if splitting device into equal subdevices supported
inline bool fission_equal()
{ return fission_equal(_device); }
/// True if splitting device into equal subdevices supported
inline bool fission_equal(const int i)
{ return _properties[i].partition_equal; }
/// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts()
{ return fission_by_counts(_device); }
/// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts(const int i)
{ return _properties[i].partition_counts; }
/// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity()
{ return fission_by_affinity(_device); }
/// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity(const int i)
{ return _properties[i].partition_affinity; }
/// Maximum number of subdevices allowed from device fission
inline int max_sub_devices()
{ return max_sub_devices(_device); }
/// Maximum number of subdevices allowed from device fission
inline int max_sub_devices(const int i)
{ return _properties[i].max_sub_devices; }
/// List all devices along with all properties
void print_all(std::ostream &out);
inline void print_all(std::ostream &out);
/// Return the OpenCL type for the device
inline cl_device_id & cl_device() { return _cl_device; }
@ -237,7 +280,8 @@ class UCL_Device {
int _num_platforms; // Number of platforms
int _platform; // UCL_Device ID for current platform
cl_platform_id _cl_platform; // OpenCL ID for current platform
cl_context _context; // Context used for accessing the device
cl_platform_id _cl_platforms[20]; // OpenCL IDs for all platforms
cl_context _context; // Context used for accessing the device
std::vector<cl_command_queue> _cq;// The default command queue for this device
int _device; // UCL_Device ID for current device
cl_device_id _cl_device; // OpenCL ID for current device
@ -245,24 +289,18 @@ class UCL_Device {
int _num_devices; // Number of devices
std::vector<OCLProperties> _properties; // Properties for each device
void add_properties(cl_device_id);
int create_context();
inline void add_properties(cl_device_id);
inline int create_context();
int _default_cq;
};
// Grabs the properties for all devices
inline UCL_Device::UCL_Device() {
cl_int errorv;
cl_uint nplatforms;
_cl_device=0;
UCL_Device::UCL_Device() {
_device=-1;
_num_devices=0;
_platform=0;
_default_cq=0;
// --- Get Number of Platforms
errorv=clGetPlatformIDs(1,&_cl_platform,&nplatforms);
cl_uint nplatforms;
cl_int errorv=clGetPlatformIDs(20,_cl_platforms,&nplatforms);
if (errorv!=CL_SUCCESS) {
_num_platforms=0;
@ -270,6 +308,38 @@ inline UCL_Device::UCL_Device() {
} else
_num_platforms=static_cast<int>(nplatforms);
set_platform(0);
}
UCL_Device::~UCL_Device() {
clear();
}
void UCL_Device::clear() {
if (_device>-1) {
for (size_t i=0; i<_cq.size(); i++) {
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
_cq.pop_back();
}
CL_DESTRUCT_CALL(clReleaseContext(_context));
}
_device=-1;
}
int UCL_Device::set_platform(int pid) {
clear();
cl_int errorv;
_cl_device=0;
_device=-1;
_num_devices=0;
_default_cq=0;
#ifdef UCL_DEBUG
assert(pid<num_platforms());
#endif
_platform=pid;
_cl_platform=_cl_platforms[_platform];
// --- Get Number of Devices
cl_uint n;
@ -277,7 +347,7 @@ inline UCL_Device::UCL_Device() {
_num_devices=n;
if (errorv!=CL_SUCCESS || _num_devices==0) {
_num_devices=0;
return;
return UCL_ERROR;
}
cl_device_id device_list[_num_devices];
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list,
@ -288,19 +358,11 @@ inline UCL_Device::UCL_Device() {
_cl_devices.push_back(device_list[i]);
add_properties(device_list[i]);
}
return UCL_SUCCESS;
}
inline UCL_Device::~UCL_Device() {
if (_device>-1) {
for (size_t i=0; i<_cq.size(); i++) {
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
_cq.pop_back();
}
CL_DESTRUCT_CALL(clReleaseContext(_context));
}
}
inline int UCL_Device::create_context() {
int UCL_Device::create_context() {
cl_int errorv;
cl_context_properties props[3];
props[0]=CL_CONTEXT_PLATFORM;
@ -320,9 +382,10 @@ inline int UCL_Device::create_context() {
return UCL_SUCCESS;
}
inline void UCL_Device::add_properties(cl_device_id device_list) {
void UCL_Device::add_properties(cl_device_id device_list) {
OCLProperties op;
char buffer[1024];
cl_bool ans_bool;
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,NULL));
op.name=buffer;
@ -363,10 +426,49 @@ inline void UCL_Device::add_properties(cl_device_id device_list) {
CL_DEVICE_PROFILING_TIMER_RESOLUTION,
sizeof(size_t),&op.timer_resolution,NULL));
op.ecc_support=false;
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_ERROR_CORRECTION_SUPPORT,
sizeof(ans_bool),&ans_bool,NULL));
if (ans_bool==CL_TRUE)
op.ecc_support=true;
op.c_version="";
op.partition_equal=false;
op.partition_counts=false;
op.partition_affinity=false;
#ifdef CL_VERSION_1_2
size_t return_bytes;
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_OPENCL_C_VERSION,1024,
buffer,NULL));
op.c_version=buffer;
cl_device_partition_property pinfo[4];
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PARTITION_PROPERTIES,
4*sizeof(cl_device_partition_property),
pinfo,&return_bytes));
int nprops=return_bytes/sizeof(cl_device_partition_property);
for (int i=0; i<nprops; i++) {
if (pinfo[i]==CL_DEVICE_PARTITION_EQUALLY)
op.partition_equal=true;
else if (pinfo[i]==CL_DEVICE_PARTITION_BY_COUNTS)
op.partition_counts=true;
else if (pinfo[i]==CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN)
op.partition_affinity=true;
}
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PARTITION_MAX_SUB_DEVICES,
sizeof(cl_uint),&op.max_sub_devices,NULL));
#endif
_properties.push_back(op);
}
inline std::string UCL_Device::platform_name() {
std::string UCL_Device::platform_name() {
char info[1024];
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VENDOR,1024,info,
@ -385,7 +487,7 @@ inline std::string UCL_Device::platform_name() {
}
// Get a string telling the type of the device
inline std::string UCL_Device::device_type_name(const int i) {
std::string UCL_Device::device_type_name(const int i) {
if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
return "CPU";
else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
@ -397,7 +499,7 @@ inline std::string UCL_Device::device_type_name(const int i) {
}
// Get a string telling the type of the device
inline int UCL_Device::device_type(const int i) {
int UCL_Device::device_type(const int i) {
if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
return UCL_CPU;
else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
@ -409,17 +511,8 @@ inline int UCL_Device::device_type(const int i) {
}
// Set the CUDA device to the specified device number
inline int UCL_Device::set(int num) {
if (_device==num)
return UCL_SUCCESS;
if (_device>-1) {
for (size_t i=0; i<_cq.size(); i++) {
CL_SAFE_CALL(clReleaseCommandQueue(_cq.back()));
_cq.pop_back();
}
CL_SAFE_CALL(clReleaseContext(_context));
}
int UCL_Device::set(int num) {
clear();
cl_device_id device_list[_num_devices];
cl_uint n;
@ -432,7 +525,7 @@ inline int UCL_Device::set(int num) {
}
// List all devices along with all properties
inline void UCL_Device::print_all(std::ostream &out) {
void UCL_Device::print_all(std::ostream &out) {
if (num_devices() == 0)
out << "There is no device supporting OpenCL\n";
for (int i=0; i<num_devices(); ++i) {
@ -475,6 +568,28 @@ inline void UCL_Device::print_all(std::ostream &out) {
out << " Clock rate: "
<< clock_rate(i) << " GHz\n";
//out << " Concurrent copy and execution: ";
out << " ECC support: ";
if (_properties[i].ecc_support)
out << "Yes\n";
else
out << "No\n";
out << " Device fission into equal partitions: ";
if (fission_equal(i))
out << "Yes\n";
else
out << "No\n";
out << " Device fission by counts: ";
if (fission_by_counts(i))
out << "Yes\n";
else
out << "No\n";
out << " Device fission by affinity: ";
if (fission_by_affinity(i))
out << "Yes\n";
else
out << "No\n";
out << " Maximum subdevices from fission: "
<< max_sub_devices(i) << std::endl;
}
}
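
ocl_device.h now records ECC support, the OpenCL C version, and device-fission capabilities for every device. A hedged sketch of querying them through the new accessors (the name(i) accessor and the include path are assumptions based on the rest of the header):

// Sketch: report fission support for every OpenCL device on the node.
#include <iostream>
#include "ocl_device.h"
using namespace ucl_opencl;

void report_fission(std::ostream &out) {
  UCL_Device dev;                          // enumerates platforms and devices
  for (int i = 0; i < dev.num_devices(); ++i)
    out << dev.name(i)
        << " equal=" << dev.fission_equal(i)
        << " counts=" << dev.fission_by_counts(i)
        << " affinity=" << dev.fission_by_affinity(i)
        << " max_sub=" << dev.max_sub_devices(i) << "\n";
}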

View File

@ -134,6 +134,11 @@ class UCL_Program {
return UCL_SUCCESS;
}
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return _cq; }
/// Change the default command queue associated with matrix
inline void cq(command_queue &cq_in) { _cq=cq_in; }
friend class UCL_Kernel;
private:
bool _init_done;
@ -175,7 +180,16 @@ class UCL_Kernel {
template <class dtype>
inline void set_arg(const cl_uint index, const dtype * const arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
if (index>_num_args) _num_args=index;
if (index>_num_args) {
_num_args=index;
#ifdef UCL_DEBUG
if (_num_args>_kernel_info_nargs) {
std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
<< _kernel_info_name << std::endl;
assert(0==1);
}
#endif
}
}
/// Set a geryon container as a kernel argument.
@ -203,6 +217,13 @@ class UCL_Kernel {
inline void add_arg(const dtype * const arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
_num_args++;
#ifdef UCL_DEBUG
if (_num_args>_kernel_info_nargs) {
std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
<< _kernel_info_name << std::endl;
assert(0==1);
}
#endif
}
/// Add a geryon container as a kernel argument.
@ -289,10 +310,7 @@ class UCL_Kernel {
}
/// Run the kernel in the default command queue
inline void run() {
CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL,
_num_blocks,_block_size,0,NULL,NULL));
}
inline void run();
/// Clear any arguments associated with the kernel
inline void clear_args() { _num_args=0; }
@ -309,6 +327,12 @@ class UCL_Kernel {
cl_command_queue _cq; // The default command queue for this kernel
unsigned _num_args;
#ifdef UCL_DEBUG
std::string _kernel_info_name;
unsigned _kernel_info_nargs;
//std::string _kernel_info_args[256];
#endif
};
inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) {
@ -329,9 +353,32 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
#endif
return UCL_FUNCTION_NOT_FOUND;
}
#ifdef UCL_DEBUG
_kernel_info_name=function;
cl_uint nargs;
CL_SAFE_CALL(clGetKernelInfo(_kernel,CL_KERNEL_NUM_ARGS,sizeof(cl_uint),
&nargs,NULL));
_kernel_info_nargs=nargs;
#ifdef NOT_TEST_CL_VERSION_1_2
char tname[256];
size_t ret;
for (cl_uint i=0; i<nargs; i++) {
CL_SAFE_CALL(clGetKernelArgInfo(_kernel,i,CL_KERNEL_ARG_TYPE_NAME,256,
tname,&ret));
_kernel_info_args[i]=tname;
}
#endif
#endif
return UCL_SUCCESS;
}
void UCL_Kernel::run() {
CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL,
_num_blocks,_block_size,0,NULL,NULL));
}
} // namespace
#endif
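
With UCL_DEBUG defined, set_function() caches the kernel's declared argument count and the argument setters complain when it is exceeded. A hedged sketch of the mistake the check targets (kernel source and variable names are illustrative only):

// Sketch: the kernel below declares two parameters, so pushing a third
// argument is an error; with UCL_DEBUG the mismatch is reported as
// "TOO MANY ARGUMENTS TO OPENCL FUNCTION: scale".
// __kernel void scale(__global float *a, const float f) { ... }
UCL_Kernel k;
k.set_function(program, "scale");   // caches CL_KERNEL_NUM_ARGS == 2
k.add_arg(&a_buffer);               // argument 0
k.add_arg(&factor);                 // argument 1
k.add_arg(&extra);                  // one too many -> flagged in debug builds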

View File

@ -54,82 +54,138 @@ typedef cl_mem device_ptr;
template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind) {
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
cl_int error_flag;
cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL));
if (kind==UCL_VIEW) {
mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR,n,mat.host_ptr(),
&error_flag);
CL_CHECK_ERR(error_flag);
return UCL_SUCCESS;
}
if (kind==UCL_WRITE_OPTIMIZED) {
mat.cbegin()=clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
CL_MAP_WRITE,0,n,0,NULL,NULL,NULL);
cl_mem_flags buffer_perm;
cl_map_flags map_perm;
if (kind2==UCL_NOT_SPECIFIED) {
if (kind==UCL_READ_ONLY) {
#ifdef CL_VERSION_1_2
buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR;
#else
buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
#endif
map_perm=CL_MAP_READ;
} else if (kind==UCL_WRITE_ONLY) {
#ifdef CL_VERSION_1_2
buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR;
#else
buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
#endif
map_perm=CL_MAP_WRITE;
} else {
buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
map_perm=CL_MAP_READ | CL_MAP_WRITE;
}
} else {
mat.cbegin()=clCreateBuffer(context,
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
if (kind2==UCL_READ_ONLY)
buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
else if (kind2==UCL_WRITE_ONLY)
buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
else
buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
if (kind==UCL_READ_ONLY) {
#ifdef CL_VERSION_1_2
buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY;
#endif
map_perm=CL_MAP_READ;
} else if (kind==UCL_WRITE_ONLY) {
#ifdef CL_VERSION_1_2
buffer_perm=buffer_perm | CL_MEM_HOST_WRITE_ONLY;
#endif
map_perm=CL_MAP_WRITE;
} else
map_perm=CL_MAP_READ | CL_MAP_WRITE;
}
mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
CL_MAP_READ | CL_MAP_WRITE,
0,n,0,NULL,NULL,NULL);
}
map_perm,0,n,0,NULL,NULL,NULL);
mat.cq()=cm.cq();
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
return UCL_SUCCESS;
}
template <class mat_type, class copy_type>
inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
cl_int error_flag;
cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL));
cl_mem_flags orig_flags;
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags),
&orig_flags,NULL));
orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;
mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
mat.host_ptr(), &error_flag);
CL_CHECK_ERR(error_flag);
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
return UCL_SUCCESS;
}
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind) {
cl_int error_flag;
if (kind==UCL_VIEW) {
mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
n,mat.host_ptr(),&error_flag);
CL_CHECK_ERR(error_flag);
return UCL_SUCCESS;
}
if (kind==UCL_WRITE_OPTIMIZED) {
mat.cbegin()=clCreateBuffer(dev.context(),
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE,
CL_MAP_WRITE,0,n,0,NULL,NULL,NULL);
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
cl_mem_flags buffer_perm;
cl_map_flags map_perm;
if (kind==UCL_READ_ONLY) {
#ifdef CL_VERSION_1_2
buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR;
#else
buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
#endif
map_perm=CL_MAP_READ;
} else if (kind==UCL_WRITE_ONLY) {
#ifdef CL_VERSION_1_2
buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR;
#else
buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
#endif
map_perm=CL_MAP_WRITE;
} else {
mat.cbegin()=clCreateBuffer(dev.context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE,
CL_MAP_READ & CL_MAP_WRITE,
0,n,0,NULL,NULL,NULL);
buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
map_perm=CL_MAP_READ | CL_MAP_WRITE;
}
cl_int error_flag;
mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE,
map_perm,0,n,0,NULL,NULL,NULL);
mat.cq()=dev.cq();
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
return UCL_SUCCESS;
}
template <class mat_type>
inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
inline int _host_view(mat_type &mat, UCL_Device &dev, const size_t n) {
cl_int error_flag;
mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
n,mat.host_ptr(),&error_flag);
CL_CHECK_ERR(error_flag);
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
return UCL_SUCCESS;
}
template <class mat_type>
inline void _host_free(mat_type &mat) {
if (mat.cols()>0) {
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
}
}
template <class mat_type>
@ -138,28 +194,26 @@ inline int _host_resize(mat_type &mat, const size_t n) {
cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL));
cl_mem_flags buffer_perm;
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_FLAGS,sizeof(buffer_perm),
&buffer_perm,NULL));
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
if (mat.kind()==UCL_WRITE_OPTIMIZED) {
mat.cbegin()=clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
CL_MAP_WRITE,0,n,0,NULL,NULL,NULL);
} else {
mat.cbegin()=clCreateBuffer(context,
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
CL_MAP_READ | CL_MAP_WRITE,
0,n,0,NULL,NULL,NULL);
}
cl_map_flags map_perm;
if (mat.kind()==UCL_READ_ONLY)
map_perm=CL_MAP_READ;
else if (mat.kind()==UCL_WRITE_ONLY)
map_perm=CL_MAP_WRITE;
else
map_perm=CL_MAP_READ | CL_MAP_WRITE;
mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
map_perm,0,n,0,NULL,NULL,NULL);
return UCL_SUCCESS;
}
@ -179,9 +233,17 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
if (kind==UCL_READ_WRITE)
flag=CL_MEM_READ_WRITE;
else if (kind==UCL_READ_ONLY)
#ifdef CL_VERSION_1_2
flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
#else
flag=CL_MEM_READ_ONLY;
#endif
else if (kind==UCL_WRITE_ONLY)
#ifdef CL_VERSION_1_2
flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
#else
flag=CL_MEM_WRITE_ONLY;
#endif
else
assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
@ -200,9 +262,17 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
if (kind==UCL_READ_WRITE)
flag=CL_MEM_READ_WRITE;
else if (kind==UCL_READ_ONLY)
#ifdef CL_VERSION_1_2
flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
#else
flag=CL_MEM_READ_ONLY;
#endif
else if (kind==UCL_WRITE_ONLY)
#ifdef CL_VERSION_1_2
flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
#else
flag=CL_MEM_WRITE_ONLY;
#endif
else
assert(0==1);
mat.cbegin()=clCreateBuffer(dev.context(),flag,n,NULL,
@ -238,8 +308,10 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t rows,
template <class mat_type>
inline void _device_free(mat_type &mat) {
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
if (mat.cols()>0) {
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
}
}
template <class mat_type>
@ -255,9 +327,17 @@ inline int _device_resize(mat_type &mat, const size_t n) {
if (mat.kind()==UCL_READ_WRITE)
flag=CL_MEM_READ_WRITE;
else if (mat.kind()==UCL_READ_ONLY)
#ifdef CL_VERSION_1_2
flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
#else
flag=CL_MEM_READ_ONLY;
#endif
else if (mat.kind()==UCL_WRITE_ONLY)
#ifdef CL_VERSION_1_2
flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
#else
flag=CL_MEM_WRITE_ONLY;
#endif
else
assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
@ -285,9 +365,17 @@ inline int _device_resize(mat_type &mat, const size_t rows,
if (mat.kind()==UCL_READ_WRITE)
flag=CL_MEM_READ_WRITE;
else if (mat.kind()==UCL_READ_ONLY)
#ifdef CL_VERSION_1_2
flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
#else
flag=CL_MEM_READ_ONLY;
#endif
else if (mat.kind()==UCL_WRITE_ONLY)
#ifdef CL_VERSION_1_2
flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
#else
flag=CL_MEM_WRITE_ONLY;
#endif
else
assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag);
@ -344,7 +432,19 @@ inline void _ocl_kernel_from_source(cl_context &context, cl_device_id &device,
}
template <class mat_type>
inline void _device_zero(mat_type &mat, const size_t n) {
inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
#ifdef CL_VERSION_1_2
#ifndef __APPLE__
#define UCL_CL_ZERO
#endif
#endif
#ifdef UCL_CL_ZERO
cl_int zeroint=0;
CL_SAFE_CALL(clEnqueueFillBuffer(cq,mat.begin(),&zeroint,sizeof(cl_int),
mat.byteoff(),n,0,NULL,NULL));
#else
cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL));
@ -354,17 +454,20 @@ inline void _device_zero(mat_type &mat, const size_t n) {
const char * szero[3]={
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void _device_zero(__global NUMTYP *a)",
" { int gid=get_global_id(0); a[gid]=(NUMTYP)0; }"
"__kernel void _device_zero(__global NUMTYP *a, const int offset)",
" { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }"
};
cl_kernel kzero;
_ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero",
_UCL_DATA_ID<typename mat_type::data_type>::numtyp_flag());
cl_int offset=mat.offset();
CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin()));
CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset));
size_t kn=n/sizeof(typename mat_type::data_type);
CL_SAFE_CALL(clEnqueueNDRangeKernel(mat.cq(),kzero,1,0,&kn,0,0,0,0));
CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,0,&kn,0,0,0,0));
#endif
}
// --------------------------------------------------------------------------
@ -470,9 +573,15 @@ template <> struct _ucl_memcpy<1,0> {
cl_command_queue &cq, const cl_bool block,
const size_t dst_offset, const size_t src_offset) {
if (src.cbegin()==dst.cbegin()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 1S\n";
#endif
if (block) ucl_sync(cq);
return;
}
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 1NS\n";
#endif
CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n,
dst.begin(),0,NULL,NULL));
}
@ -484,8 +593,14 @@ template <> struct _ucl_memcpy<1,0> {
size_t dst_offset, size_t src_offset) {
if (src.cbegin()==dst.cbegin()) {
if (block) ucl_sync(cq);
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 2S\n";
#endif
return;
}
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 2NS\n";
#endif
if (spitch==dpitch && dst.cols()==src.cols() &&
src.cols()==cols/src.element_size())
CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,
@ -511,8 +626,14 @@ template <> struct _ucl_memcpy<0,1> {
const size_t dst_offset, const size_t src_offset) {
if (src.cbegin()==dst.cbegin()) {
if (block) ucl_sync(cq);
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 3S\n";
#endif
return;
}
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 3NS\n";
#endif
CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n,
src.begin(),0,NULL,NULL));
}
@ -524,8 +645,14 @@ template <> struct _ucl_memcpy<0,1> {
size_t dst_offset, size_t src_offset) {
if (src.cbegin()==dst.cbegin()) {
if (block) ucl_sync(cq);
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 4S\n";
#endif
return;
}
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 4NS\n";
#endif
if (spitch==dpitch && dst.cols()==src.cols() &&
src.cols()==cols/src.element_size())
CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,
@ -549,9 +676,17 @@ template <int mem1, int mem2> struct _ucl_memcpy {
static inline void mc(p1 &dst, const p2 &src, const size_t n,
cl_command_queue &cq, const cl_bool block,
const size_t dst_offset, const size_t src_offset) {
if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset)
if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
dst_offset,n,0,NULL,NULL));
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 6NS\n";
#endif
}
#ifdef UCL_DBG_MEM_TRACE
else std::cerr << "UCL_COPY 6S\n";
#endif
if (block==CL_TRUE) ucl_sync(cq);
}
template <class p1, class p2>
@ -561,6 +696,9 @@ template <int mem1, int mem2> struct _ucl_memcpy {
const cl_bool block,
size_t dst_offset, size_t src_offset) {
if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 7NS\n";
#endif
if (spitch==dpitch && dst.cols()==src.cols() &&
src.cols()==cols/src.element_size())
CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
@ -575,6 +713,10 @@ template <int mem1, int mem2> struct _ucl_memcpy {
dst_offset+=dpitch;
}
}
#ifdef UCL_DBG_MEM_TRACE
else std::cerr << "UCL_COPY 7S\n";
#endif
if (block==CL_TRUE) ucl_sync(cq);
}
};

View File

@ -27,6 +27,12 @@
#include "ocl_macros.h"
#include "ocl_device.h"
#ifdef CL_VERSION_1_2
#define UCL_OCL_MARKER(cq,event) clEnqueueMarkerWithWaitList(cq,0,NULL,event)
#else
#define UCL_OCL_MARKER clEnqueueMarker
#endif
namespace ucl_opencl {
/// Class for timing OpenCL events
@ -63,10 +69,10 @@ class UCL_Timer {
}
/// Start timing on default command queue
inline void start() { clEnqueueMarker(_cq,&start_event); }
inline void start() { UCL_OCL_MARKER(_cq,&start_event); }
/// Stop timing on default command queue
inline void stop() { clEnqueueMarker(_cq,&stop_event); }
inline void stop() { UCL_OCL_MARKER(_cq,&stop_event); }
/// Block until the start event has been reached on device
inline void sync_start()
@ -78,7 +84,7 @@ class UCL_Timer {
/// Set the time elapsed to zero (not the total_time)
inline void zero()
{ clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); }
{ UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); }
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
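
UCL_OCL_MARKER resolves to clEnqueueMarkerWithWaitList on OpenCL 1.2 headers and to clEnqueueMarker otherwise, so the timer compiles against either. Typical use is unchanged; a hedged sketch, where init() and time() are assumed to follow the usual Geryon timer interface:

// Sketch: time a kernel launch with markers on the kernel's command queue.
UCL_Timer t;
t.init(dev);            // assumed initializer, as in the CUDA timer backend
t.start();              // enqueues a marker via UCL_OCL_MARKER
k.run();
t.stop();               // second marker
t.sync_stop();          // block until the stop marker completes
double ms = t.time();   // elapsed time between the markers (assumed accessor)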

View File

@ -58,19 +58,36 @@
* calls for reserving and copying memory **/
class UCL_BaseMat {
public:
UCL_BaseMat() : _cq(0) { }
UCL_BaseMat() : _cq(0), _kind(UCL_VIEW) { }
virtual ~UCL_BaseMat() { }
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return _cq; }
/// Change the default command queue associated with matrix
inline void cq(command_queue &cq_in) { _cq=cq_in; }
/// Block until command_queue associated with matrix is complete
inline void sync() { ucl_sync(_cq); }
/// Return the type/permissions of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED
* or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
inline bool shared_mem_device() {
#ifdef _OCL_MAT
cl_device_id device;
CL_SAFE_CALL(clGetCommandQueueInfo(_cq,CL_QUEUE_DEVICE,
sizeof(cl_device_id),&device,NULL));
cl_device_type device_type;
CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE,
sizeof(device_type),&device_type,NULL));
return _shared_mem_device(device_type);
#else
return false;
#endif
}
#ifdef UCL_DEBUG
// Returns the type of host allocation
virtual inline enum UCL_MEMOPT kind() const { return UCL_NOT_PINNED; }
#endif
protected:
command_queue _cq;
enum UCL_MEMOPT _kind;
};
#endif
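
UCL_BaseMat now remembers its allocation kind and can report whether the device behind its command queue shares host memory. A hedged sketch of how caller code might use this to skip an explicit transfer (the helper and its policy are illustrative, not part of the library):

// Sketch: avoid a redundant host-to-device copy when the device can
// address host memory directly (e.g. an OpenCL CPU device).
template <class HostMat, class DevMat>
void push_to_device(HostMat &host, DevMat &dev_buf) {
  if (host.kind() == UCL_VIEW) return;      // nothing owned by this container
  if (host.shared_mem_device()) return;     // device reads host memory in place
  ucl_copy(dev_buf, host, false);           // otherwise enqueue a blocking copy
}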

View File

@ -102,6 +102,30 @@
// Only allow this file to be included by nvc_memory.h and ocl_memory.h
#ifdef UCL_COPY_ALLOW
// --------------------------------------------------------------------------
// - CHECK PERMISSIONS FOR SOURCE AND DESTINATION IN COPY
// --------------------------------------------------------------------------
template <class mat1, class mat2>
inline void _check_ucl_copy_perm(mat1 &dst, mat2 &src) {
if ((int)mat1::MEM_TYPE==(int)mat2::MEM_TYPE) {
if (dst.kind()==UCL_READ_ONLY) {
std::cerr << "Attempt to copy where destination is UCL_READ_ONLY\n";
assert(0==1);
} else if (src.kind()==UCL_WRITE_ONLY) {
std::cerr << "Attempt to copy where source is UCL_WRITE_ONLY\n";
assert(0==1);
}
} else {
if (dst.kind()==UCL_WRITE_ONLY) {
std::cerr << "Destination in host-device copy cannot be UCL_WRITE_ONLY\n";
assert(0==1);
} else if (src.kind()==UCL_READ_ONLY) {
std::cerr << "Source in host-device copy cannot be UCL_READ_ONLY\n";
assert(0==1);
}
}
}
// --------------------------------------------------------------------------
// - HOST-HOST COPY ROUTINES
// --------------------------------------------------------------------------
@ -117,9 +141,20 @@ template <> struct _host_host_copy<1,1> {
assert(mat1::PADDED==0 && mat2::PADDED==0);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0)
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) {
#ifdef _OCL_MAT
if (dst.begin()==src.begin()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 7S\n";
#endif
return;
}
#endif
memcpy(dst.begin(),src.begin(),numel*sizeof(typename mat1::data_type));
else
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 7NS\n";
#endif
} else
for (size_t i=0; i<numel; i++)
dst[i]=static_cast<typename mat1::data_type>(src[i]);
}
@ -138,15 +173,27 @@ template <> struct _host_host_copy<1,1> {
src_row_size=cols;
else
src_row_size=src.row_size();
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0)
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) {
#ifdef _OCL_MAT
if (dst.begin()==src.begin()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 8S\n";
#endif
return;
}
#endif
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 8NS\n";
#endif
for (size_t i=0; i<rows; i++)
memcpy(dst.begin()+i*dst_row_size,src.begin()+i*src_row_size,
cols*sizeof(typename mat1::data_type));
else
} else
for (size_t j=0; j<rows; j++) {
int dst_i=j*dst_row_size;
int d_end=dst_i+cols;
int src_i=j*src_row_size;
size_t dst_i=j*dst_row_size;
size_t d_end=dst_i+cols;
size_t src_i=j*src_row_size;
for (; dst_i<d_end; dst_i++) {
dst[dst_i]=static_cast<typename mat1::data_type>(src[src_i]);
src_i++;
@ -216,15 +263,14 @@ template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
src.row_bytes(),cols*sizeof(typename mat2::data_type),
rows);
int dst_i=0;
int buff_i=0;
size_t dst_i=0, buff_i=0, doff=dst.cols()-cols;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
buff_i++;
dst_i++;
}
dst_i+=dst.cols()-cols;
dst_i+=doff;
}
}
}
@ -255,15 +301,14 @@ template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
src.row_bytes(),cols*sizeof(typename mat2::data_type),
rows,cq);
cast_buffer.sync();
int dst_i=0;
int buff_i=0;
size_t dst_i=0, buff_i=0, doff=dst.cols()-cols;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
buff_i++;
dst_i++;
}
dst_i+=dst.cols()-cols;
dst_i+=doff;
}
}
}
@ -293,38 +338,62 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
if (mat3::VECTOR==0) {
assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
assert(dst.rows()>=rows && dst.cols()>=cols);
}
#endif
if (mat2::VECTOR) {
for (size_t i=0; i<rows*cols; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows);
if (mat3::VECTOR==0) {
size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[ci]=static_cast<typename mat3::data_type>(src[si]);
ci++;
si++;
}
ci+=co;
si+=so;
}
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(),
cols*sizeof(typename mat1::data_type),rows);
} else {
for (size_t i=0; i<rows*cols; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows);
}
} else if (mat1::VECTOR) {
int src_i=0;
int buf_i=0;
size_t src_i=0, buf_i=0, soff=src.cols()-cols;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
buf_i++;
src_i++;
}
src_i+=src.cols()-cols;
src_i+=soff;
}
ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows);
} else {
int src_i=0;
int buf_i=0;
size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch;
if (mat3::VECTOR==0) {
co=cast_buffer.cols()-cols;
spitch=cast_buffer.row_bytes();
} else {
co=0;
spitch=cols*sizeof(typename mat1::data_type);
}
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
buf_i++;
src_i++;
}
src_i+=src.cols()-cols;
src_i+=so;
buf_i+=co;
}
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
cols*sizeof(typename mat1::data_type),
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch,
cols*sizeof(typename mat1::data_type),rows);
}
}
@ -337,38 +406,62 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
if (mat3::VECTOR==0) {
assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
assert(dst.rows()>=rows && dst.cols()>=cols);
}
#endif
if (mat2::VECTOR) {
for (size_t i=0; i<rows*cols; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,dst.row_bytes(),
cast_buffer,cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows,cq);
if (mat3::VECTOR==0) {
size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[ci]=static_cast<typename mat3::data_type>(src[si]);
ci++;
si++;
}
ci+=co;
si+=so;
}
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(),
cols*sizeof(typename mat1::data_type),rows,cq);
} else {
for (size_t i=0; i<rows*cols; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,dst.row_bytes(),
cast_buffer,cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows,cq);
}
} else if (mat1::VECTOR) {
int src_i=0;
int buf_i=0;
size_t src_i=0, buf_i=0, soff=src.cols()-cols;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
buf_i++;
src_i++;
}
src_i+=src.cols()-cols;
src_i+=soff;
}
ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows,cq);
} else {
int src_i=0;
int buf_i=0;
size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch;
if (mat3::VECTOR==0) {
co=cast_buffer.cols()-cols;
spitch=cast_buffer.row_bytes();
} else {
co=0;
spitch=cols*sizeof(typename mat1::data_type);
}
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
buf_i++;
src_i++;
}
src_i+=src.cols()-cols;
src_i+=so;
buf_i+=co;
}
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
cols*sizeof(typename mat1::data_type),
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch,
cols*sizeof(typename mat1::data_type),rows,cq);
}
}
@ -444,9 +537,13 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
#endif
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,numel,cq);
else
else {
#ifdef UCL_DEBUG
_check_ucl_copy_perm(dst,src);
#endif
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer,cq);
}
}
/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
@ -463,6 +560,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
assert(dst.numel()>=numel && src.numel()>=numel);
assert(cast_buffer.numel()>=numel);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
_check_ucl_copy_perm(dst,src);
#endif
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,numel,async);
@ -491,6 +589,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
_check_ucl_copy_perm(dst,src);
#endif
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
@ -498,12 +597,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED);
cast_buffer.alloc(numel,dst,UCL_READ_ONLY);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer,cq);
} else {
UCL_H_Vec<typename mat1::data_type> cast_buffer;
cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED);
cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer,cq);
}
@ -529,6 +628,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
#ifdef UCL_DEBUG
assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
_check_ucl_copy_perm(dst,src);
#endif
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
@ -538,12 +638,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED);
cast_buffer.alloc(numel,dst,UCL_READ_ONLY);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer);
} else {
UCL_H_Vec<typename mat1::data_type> cast_buffer;
cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED);
cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer);
}
@ -574,9 +674,13 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
ucl_copy(dst,src,rows,cols,async);
else if (async)
ucl_copy(dst,src,rows,cols,dst.cq());
else
else {
#ifdef UCL_DEBUG
_check_ucl_copy_perm(dst,src);
#endif
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer);
}
}
/// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer)
@ -595,9 +699,13 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
command_queue &cq) {
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,rows,cols,cq);
else
else {
#ifdef UCL_DEBUG
_check_ucl_copy_perm(dst,src);
#endif
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer,cq);
}
}
/// Asynchronous copy of subset matrix rows,cols (memory already allocated)
@ -617,18 +725,21 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, command_queue &cq) {
#ifdef UCL_DEBUG
_check_ucl_copy_perm(dst,src);
#endif
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED);
cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer,cq);
} else {
UCL_H_Vec<typename mat1::data_type> cast_buffer;
cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED);
cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer,cq);
}
@ -678,6 +789,9 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, const bool async) {
#ifdef UCL_DEBUG
_check_ucl_copy_perm(dst,src);
#endif
if (async)
ucl_copy(dst,src,rows,cols,dst.cq());
else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
@ -686,12 +800,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED);
cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer);
} else {
UCL_H_Vec<typename mat1::data_type> cast_buffer;
cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED);
cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer);
}
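
A hedged usage sketch of the copy routines above, assuming a UCL_Device named gpu has already been set up and the Geryon headers for the active backend are included; the sizes and types are illustrative only.

  UCL_H_Vec<double> h_src(1000, gpu, UCL_READ_WRITE);   // host-side data
  UCL_D_Vec<float>  d_dst(1000, gpu, UCL_READ_WRITE);   // device data of a narrower type

  // Types differ, so the transfer casts through a host buffer.  Passing the
  // buffer explicitly avoids the temporary that ucl_copy allocates per call.
  UCL_H_Vec<float> cast_buffer(1000, gpu, UCL_WRITE_ONLY);
  ucl_cast_copy(d_dst, h_src, 1000, cast_buffer, d_dst.cq());  // async on d_dst's queue

  // When the data types match, ucl_copy alone is enough:
  UCL_D_Vec<double> d_same(1000, gpu, UCL_READ_WRITE);
  ucl_copy(d_same, h_src, 1000, d_same.cq());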

View File

@ -39,14 +39,14 @@ class UCL_D_Mat : public UCL_BaseMat {
};
typedef numtyp data_type;
UCL_D_Mat() : _rows(0), _kind(UCL_VIEW) {}
~UCL_D_Mat() { if (_kind!=UCL_VIEW) _device_free(*this); }
UCL_D_Mat() : _cols(0) {}
~UCL_D_Mat() { _device_free(*this); }
/// Construct with specified rows and cols
/** \sa alloc() **/
UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
_rows(0), _kind(UCL_VIEW) { alloc(rows,cols,device,kind); }
_cols(0) { alloc(rows,cols,device,kind); }
/// Row major matrix on device
/** The kind parameter controls memory optimizations as follows:
@ -121,15 +121,11 @@ class UCL_D_Mat : public UCL_BaseMat {
return err;
}
/// Return the type of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
@ -142,8 +138,10 @@ class UCL_D_Mat : public UCL_BaseMat {
_row_size=stride;
this->_cq=input.cq();
#ifdef _OCL_MAT
_offset=0;
_offset=input.offset();
_array=input.cbegin();
CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#else
_device_view(&_array,input.begin());
#endif
@ -157,7 +155,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols)
{ view(input,rows,cols,input.row_size()); }
@ -166,7 +164,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
@ -177,7 +175,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
@ -187,7 +185,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
@ -205,13 +203,15 @@ class UCL_D_Mat : public UCL_BaseMat {
#endif
#ifdef _OCL_MAT
_offset=0;
CL_SAFE_CALL(clRetainMemObject(input));
CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
UCL_Device &dev) { view(input,rows,cols,cols,dev); }
@ -219,7 +219,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
@ -228,7 +228,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
@ -242,7 +242,9 @@ class UCL_D_Mat : public UCL_BaseMat {
this->_cq=input.cq();
#ifdef _OCL_MAT
_array=input.begin();
_offset=offset;
_offset=offset+input.offset();
CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#else
_device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif
@ -256,7 +258,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols)
@ -266,7 +268,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
@ -277,7 +279,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
@ -292,7 +294,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
@ -309,6 +311,8 @@ class UCL_D_Mat : public UCL_BaseMat {
#ifdef _OCL_MAT
_array=input;
_offset=offset;
CL_SAFE_CALL(clRetainMemObject(input));
CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
#else
#ifdef _UCL_DEVICE_PTR_MAT
_array=input+offset*sizeof(numtyp);
@ -325,7 +329,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols, UCL_Device &dev)
@ -334,7 +338,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type input,
const size_t cols, UCL_Device &dev)
@ -342,7 +346,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Free memory and set size to 0
inline void clear()
{ _rows=0; if (_kind!=UCL_VIEW) { _kind=UCL_VIEW; _device_free(*this); } }
{ _device_free(*this); _cols=0; _kind=UCL_VIEW; }
/// Resize the allocation to contain cols elements
/** \note Cannot be used on views **/
@ -377,11 +381,17 @@ class UCL_D_Mat : public UCL_BaseMat {
{ if (cols>_cols || rows>_rows) return resize(rows,cols);
else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { _device_zero(*this,row_bytes()*_rows); }
/// Set each element to zero asynchronously in the default command_queue
inline void zero() { zero(_cq); }
/// Set first n elements to zero asynchronously in the default command_queue
inline void zero(const int n) { zero(n,_cq); }
/// Set each element to zero asynchronously
inline void zero(command_queue &cq)
{ _device_zero(*this,row_bytes()*_rows,cq); }
/// Set first n elements to zero asynchronously
inline void zero(const int n, command_queue &cq)
{ _device_zero(*this,n*sizeof(numtyp),cq); }
/// Set first n elements to zero
inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); }
#ifdef _UCL_DEVICE_PTR_MAT
/// For OpenCL, returns a (void *) device pointer to memory allocation
@ -452,7 +462,6 @@ class UCL_D_Mat : public UCL_BaseMat {
private:
size_t _pitch, _row_size, _rows, _cols;
enum UCL_MEMOPT _kind;
#ifdef _UCL_DEVICE_PTR_MAT
device_ptr _array;

View File

@ -39,14 +39,14 @@ class UCL_D_Vec : public UCL_BaseMat {
};
typedef numtyp data_type;
UCL_D_Vec() : _cols(0), _kind(UCL_VIEW) {}
~UCL_D_Vec() { if (_kind!=UCL_VIEW) _device_free(*this); }
UCL_D_Vec() : _cols(0) {}
~UCL_D_Vec() { _device_free(*this); }
/// Construct with n columns
/** \sa alloc() **/
UCL_D_Vec(const size_t n, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
_cols(0), _kind(UCL_VIEW) { alloc(n,device,kind); }
_cols(0) { alloc(n,device,kind); }
/// Set up device vector with 'cols' columns and reserve memory
/** The kind parameter controls memory optimizations as follows:
@ -119,15 +119,11 @@ class UCL_D_Vec : public UCL_BaseMat {
return err;
}
/// Return the type of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
#ifdef UCL_DEBUG
@ -139,8 +135,10 @@ class UCL_D_Vec : public UCL_BaseMat {
_row_bytes=_cols*sizeof(numtyp);
this->_cq=input.cq();
#ifdef _OCL_MAT
_offset=0;
_offset=input.offset();
_array=input.cbegin();
CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#else
_device_view(&_array,input.begin());
#endif
@ -154,7 +152,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
@ -164,7 +162,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
@ -175,7 +173,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
@ -185,7 +183,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
UCL_Device &dev) {
@ -203,13 +201,15 @@ class UCL_D_Vec : public UCL_BaseMat {
#endif
#ifdef _OCL_MAT
_offset=0;
CL_SAFE_CALL(clRetainMemObject(input));
CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
@ -219,7 +219,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
@ -228,7 +228,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols) {
@ -242,7 +242,9 @@ class UCL_D_Vec : public UCL_BaseMat {
this->_cq=input.cq();
#ifdef _OCL_MAT
_array=input.begin();
_offset=offset;
_offset=offset+input.offset();
CL_SAFE_CALL(clRetainMemObject(input.begin()));
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#else
_device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif
@ -256,7 +258,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
@ -267,7 +269,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
@ -278,7 +280,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
@ -288,7 +290,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols, UCL_Device &dev) {
@ -304,6 +306,8 @@ class UCL_D_Vec : public UCL_BaseMat {
#ifdef _OCL_MAT
_array=input;
_offset=offset;
CL_SAFE_CALL(clRetainMemObject(input));
CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
#else
#ifdef _UCL_DEVICE_PTR_MAT
_array=input+offset*sizeof(numtyp);
@ -320,7 +324,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
@ -330,7 +334,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type input,
const size_t cols, UCL_Device &dev)
@ -338,7 +342,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Free memory and set size to 0
inline void clear()
{ if (_kind!=UCL_VIEW) { _cols=0; _kind=UCL_VIEW; _device_free(*this); } }
{ _device_free(*this); _cols=0; _kind=UCL_VIEW; }
/// Resize the allocation to contain cols elements
/** \note Cannot be used on views **/
@ -373,11 +377,15 @@ class UCL_D_Vec : public UCL_BaseMat {
inline int resize_ib(const int cols)
{ if (cols>_cols) return resize(cols); else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { _device_zero(*this,row_bytes()); }
/// Set first n elements to zero
inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); }
/// Set each element to zero asynchronously in the default command_queue
inline void zero() { zero(_cq); }
/// Set first n elements to zero asynchronously in the default command_queue
inline void zero(const int n) { zero(n,_cq); }
/// Set each element to zero asynchronously
inline void zero(command_queue &cq) { _device_zero(*this,row_bytes(),cq); }
/// Set first n elements to zero asynchronously
inline void zero(const int n, command_queue &cq)
{ _device_zero(*this,n*sizeof(numtyp),cq); }
#ifdef _UCL_DEVICE_PTR_MAT
/// For OpenCL, returns a (void *) device pointer to memory allocation
@ -465,7 +473,6 @@ class UCL_D_Vec : public UCL_BaseMat {
private:
size_t _row_bytes, _row_size, _rows, _cols;
enum UCL_MEMOPT _kind;
#ifdef _UCL_DEVICE_PTR_MAT
device_ptr _array;
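
A hedged sketch of the view and asynchronous zero semantics documented above (again assuming an already-initialized UCL_Device named gpu; the offsets are illustrative):

  UCL_D_Vec<float> big(4096, gpu, UCL_READ_WRITE);
  UCL_D_Vec<float> window;
  window.view_offset(1024, big, 1, 512);   // alias elements [1024,1536) of big; no new allocation
  window.zero();                           // asynchronous memset on the default command queue
  big.zero(2048, big.cq());                // zero only the first 2048 elements on an explicit queue
  // With this commit the OpenCL path retains the underlying cl_mem and command
  // queue, so the view keeps the allocation alive; per the doc comments, the
  // CUDA path still does not.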

View File

@ -39,33 +39,35 @@ class UCL_H_Mat : public UCL_BaseMat {
};
typedef numtyp data_type;
UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) {
UCL_H_Mat() : _cols(0) {
#ifdef _OCL_MAT
_carray=(cl_mem)(0);
#endif
}
~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
~UCL_H_Mat() { _host_free(*this); }
/// Construct with specified number of rows and columns
/** \sa alloc() **/
UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED)
{ _rows=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); }
const enum UCL_MEMOPT kind=UCL_READ_WRITE)
{ _cols=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); }
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* - UCL_READ_WRITE - Specify that you will read and write from host
* - UCL_WRITE_ONLY - Specify that you will only write from host
* - UCL_READ_ONLY - Specify that you will only read from host
* - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
* \param cq Default command queue for operations copied from another mat
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
const enum UCL_MEMOPT kind=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) {
clear();
_row_bytes=cols*sizeof(numtyp);
int err=_host_alloc(*this,cq,_row_bytes*rows,kind);
int err=_host_alloc(*this,cq,_row_bytes*rows,kind,kind2);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
@ -86,17 +88,19 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* - UCL_READ_WRITE - Specify that you will read and write from host
* - UCL_WRITE_ONLY - Specify that you will only write from host
* - UCL_READ_ONLY - Specify that you will only read from host
* - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
* \param device Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
const enum UCL_MEMOPT kind=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) {
clear();
_row_bytes=cols*sizeof(numtyp);
int err=_host_alloc(*this,device,_row_bytes*rows,kind);
int err=_host_alloc(*this,device,_row_bytes*rows,kind,kind2);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
@ -115,15 +119,11 @@ class UCL_H_Mat : public UCL_BaseMat {
return err;
}
/// Return the type of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
@ -140,6 +140,8 @@ class UCL_H_Mat : public UCL_BaseMat {
_end=_array+_cols;
#ifdef _OCL_MAT
_carray=input.cbegin();
CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#endif
}
@ -147,7 +149,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols)
@ -157,7 +159,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
@ -169,9 +171,9 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* will be used for view when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input)
@ -180,7 +182,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
@ -197,14 +199,14 @@ class UCL_H_Mat : public UCL_BaseMat {
_end=_array+_cols;
#ifdef _OCL_MAT
_host_alloc(*this,dev,_row_bytes,UCL_VIEW);
_host_view(*this,dev,_row_bytes*rows);
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
@ -213,7 +215,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
@ -223,7 +225,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
@ -239,7 +241,7 @@ class UCL_H_Mat : public UCL_BaseMat {
_array=input.begin()+offset;
_end=_array+_cols;
#ifdef _OCL_MAT
_host_alloc(*this,input,_row_bytes,UCL_VIEW);
_host_view(*this,input,_row_bytes*_rows);
#endif
}
@ -247,7 +249,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
@ -258,7 +260,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
@ -270,7 +272,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
@ -296,7 +298,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
@ -307,7 +309,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type *input,
@ -316,7 +318,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Free memory and set size to 0
inline void clear()
{ if (_kind!=UCL_VIEW) {_rows=0; _kind=UCL_VIEW; _host_free(*this,_kind); }}
{ _host_free(*this); _cols=0; _kind=UCL_VIEW; }
/// Resize the allocation to rows x cols elements
/** \note Cannot be used on views **/
@ -409,7 +411,6 @@ class UCL_H_Mat : public UCL_BaseMat {
#endif
private:
enum UCL_MEMOPT _kind;
numtyp *_array, *_end;
size_t _row_bytes, _rows, _cols;

View File

@ -39,33 +39,35 @@ class UCL_H_Vec : public UCL_BaseMat {
};
typedef numtyp data_type;
UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) {
UCL_H_Vec() : _cols(0) {
#ifdef _OCL_MAT
_carray=(cl_mem)(0);
#endif
}
~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
~UCL_H_Vec() { _host_free(*this); }
/// Construct with n columns
/** \sa alloc() **/
UCL_H_Vec(const size_t n, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED)
const enum UCL_MEMOPT kind=UCL_READ_WRITE)
{ _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); }
/// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* - UCL_READ_WRITE - Specify that you will read and write from host
* - UCL_WRITE_ONLY - Specify that you will only write from host
* - UCL_READ_ONLY - Specify that you will only read from host
* - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
* \param cq Default command queue for operations copied from another mat
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
const enum UCL_MEMOPT kind=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) {
clear();
_row_bytes=cols*sizeof(numtyp);
int err=_host_alloc(*this,cq,_row_bytes,kind);
int err=_host_alloc(*this,cq,_row_bytes,kind,kind2);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
@ -86,17 +88,19 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* - UCL_READ_WRITE - Specify that you will read and write from host
* - UCL_WRITE_ONLY - Specify that you will only write from host
* - UCL_READ_ONLY - Specify that you will only read from host
* - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
* \param device Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
const enum UCL_MEMOPT kind=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) {
clear();
_row_bytes=cols*sizeof(numtyp);
int err=_host_alloc(*this,device,_row_bytes,kind);
int err=_host_alloc(*this,device,_row_bytes,kind,kind2);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
@ -115,15 +119,11 @@ class UCL_H_Vec : public UCL_BaseMat {
return err;
}
/// Return the type of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
@ -139,6 +139,8 @@ class UCL_H_Vec : public UCL_BaseMat {
_end=_array+_cols;
#ifdef _OCL_MAT
_carray=input.cbegin();
CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#endif
}
@ -146,7 +148,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
@ -157,7 +159,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
@ -180,7 +182,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
@ -197,14 +199,14 @@ class UCL_H_Vec : public UCL_BaseMat {
_end=_array+_cols;
#ifdef _OCL_MAT
_host_alloc(*this,dev,_row_bytes,UCL_VIEW);
_host_view(*this,dev,_row_bytes);
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
@ -215,7 +217,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
@ -225,7 +227,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
@ -241,7 +243,7 @@ class UCL_H_Vec : public UCL_BaseMat {
_array=input.begin()+offset;
_end=_array+_cols;
#ifdef _OCL_MAT
_host_alloc(*this,input,_row_bytes,UCL_VIEW);
_host_view(*this,input,_row_bytes);
#endif
}
@ -249,7 +251,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
@ -261,7 +263,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
@ -273,7 +275,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
@ -284,7 +286,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
@ -294,7 +296,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
@ -305,7 +307,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type *input,
@ -314,7 +316,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Free memory and set size to 0
inline void clear()
{ if (_kind!=UCL_VIEW) {_kind=UCL_VIEW; _cols=0; _host_free(*this,_kind);}}
{ _host_free(*this); _kind=UCL_VIEW; _cols=0; }
/// Resize the allocation to contain cols elements
/** \note Cannot be used on views **/
@ -401,7 +403,6 @@ class UCL_H_Vec : public UCL_BaseMat {
#endif
private:
enum UCL_MEMOPT _kind;
numtyp *_array, *_end;
size_t _row_bytes, _cols;
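
A hedged sketch of the renamed host allocation kinds (the former UCL_RW_OPTIMIZED / UCL_WRITE_OPTIMIZED values give way to the access-oriented names below); gpu is again an assumed, already-initialized UCL_Device:

  UCL_H_Vec<double> staging;
  staging.alloc(4096, gpu, UCL_WRITE_ONLY);   // host only fills it before upload
  UCL_H_Vec<double> results;
  results.alloc(4096, gpu, UCL_READ_WRITE);   // read back and modified on the host
  UCL_H_Vec<double> scratch;
  scratch.alloc(4096, gpu, UCL_NOT_PINNED);   // plain pageable memory, never pinned
  // The new optional second kind argument (UCL_NOT_SPECIFIED by default) is what
  // _ucl_s_obj_help forwards on shared-memory devices; its exact effect is
  // backend-specific and not spelled out in this diff.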

View File

@ -48,17 +48,18 @@ class UCL_Matrix {
/// Construct with specified number of rows and columns
/** \sa alloc() **/
UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind1 parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* The kind2 parameter controls memory optimizations as follows:
/** The kind1 parameter controls memory access from the host
* - UCL_READ_WRITE - Specify that you will read and write from host
* - UCL_WRITE_ONLY - Specify that you will only write from host
* - UCL_READ_ONLY - Specify that you will only read from host
* - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
* The kind2 parameter controls memory optimizations from the device:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
@ -69,24 +70,25 @@ class UCL_Matrix {
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); }
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind1 parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* The kind2 parameter controls memory optimizations as follows:
/** The kind1 parameter controls memory access from the host
* - UCL_READ_WRITE - Specify that you will read and write from host
* - UCL_WRITE_ONLY - Specify that you will only write from host
* - UCL_READ_ONLY - Specify that you will only read from host
* - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
* The kind2 parameter controls memory optimizations from the device:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param device Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t rows, const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
@ -110,11 +112,22 @@ class UCL_Matrix {
{ if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols);
else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { host.zero(); device.zero(); }
/// Set first n elements to zero
inline void zero(const int n) { host.zero(n); device.zero(n); }
/// Set each element to zero (asynchronously on device)
inline void zero() { zero(cq()); }
/// Set first n elements to zero (asynchronously on device)
inline void zero(const int n) { zero(n,cq()); }
/// Set each element to zero (asynchronously on device)
inline void zero(command_queue &cq) {
host.zero();
if (device.kind()!=UCL_VIEW) device.zero(cq);
else if (_buffer.numel()>0) _buffer.zero();
}
/// Set first n elements to zero (asynchronously on device)
inline void zero(const int n, command_queue &cq) {
host.zero(n);
if (device.kind()!=UCL_VIEW) device.zero(n,cq);
else if (_buffer.numel()>0) _buffer.zero();
}
/// Get the number of elements
inline size_t numel() const { return host.numel(); }
@ -145,6 +158,8 @@ class UCL_Matrix {
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return host.cq(); }
/// Change the default command queue associated with this data
inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); }
/// Block until command_queue associated with matrix is complete
inline void sync() { host.sync(); }
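
A hedged sketch of the paired host/device container with the updated zero()/cq() interface; the template arguments are an assumption about how UCL_Matrix is instantiated elsewhere, and other_queue is a hypothetical command_queue:

  UCL_Matrix<double,float> coeff(64, 64, gpu, UCL_READ_WRITE, UCL_READ_ONLY);
  coeff.zero();            // zeros the host storage and, asynchronously, the device side
  coeff.zero(128);         // same, but only the first 128 elements
  coeff.cq(other_queue);   // route future operations through a different queue
  coeff.sync();            // block until the queue associated with the matrix drains
  // On shared-memory accelerators the device member is only a view, so zero()
  // clears the host data (and the internal cast buffer, if one exists) instead
  // of issuing a device memset.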

View File

@ -32,14 +32,24 @@ template <> struct _ucl_s_obj_help<1> {
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
if (acc.shared_memory()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 1S\n";
#endif
e1=host.alloc(cols,acc,kind1,kind2);
if (e1!=UCL_SUCCESS)
return e1;
device.view(host);
return UCL_SUCCESS;
} else
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 1NS\n";
#endif
e1=host.alloc(cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,acc,kind2);
}
}
template <class t1, class t2, class t3, class mat_type>
@ -48,10 +58,24 @@ template <> struct _ucl_s_obj_help<1> {
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,cq,kind2);
if (cq.shared_mem_device()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 2S\n";
#endif
e1=host.alloc(cols,cq,kind1,kind2);
if (e1!=UCL_SUCCESS)
return e1;
device.view(host);
return UCL_SUCCESS;
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 2NS\n";
#endif
e1=host.alloc(cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,cq,kind2);
}
}
template <class t1, class t2, class t3>
@ -60,14 +84,24 @@ template <> struct _ucl_s_obj_help<1> {
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(rows,cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
if (acc.shared_memory()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 3S\n";
#endif
e1=host.alloc(rows,cols,acc,kind1,kind2);
if (e1!=UCL_SUCCESS)
return e1;
device.view(host);
return UCL_SUCCESS;
} else
} else {
e1=host.alloc(rows,cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 3NS\n";
#endif
return device.alloc(rows,cols,acc,kind2);
}
}
template <class t1, class t2, class t3, class mat_type>
@ -76,10 +110,24 @@ template <> struct _ucl_s_obj_help<1> {
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(rows,cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(rows,cols,cq,kind2);
if (cq.shared_mem_device()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 4S\n";
#endif
e1=host.alloc(rows,cols,cq,kind1,kind2);
if (e1!=UCL_SUCCESS)
return e1;
device.view(host);
return UCL_SUCCESS;
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 4NS\n";
#endif
e1=host.alloc(rows,cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(rows,cols,cq,kind2);
}
}
template <class t1, class t2, class t3>
@ -121,8 +169,15 @@ template <> struct _ucl_s_obj_help<1> {
if (device.kind()==UCL_VIEW) {
device.view(host);
return UCL_SUCCESS;
} else
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 5S\n";
#endif
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 5NS\n";
#endif
return device.resize(cols);
}
}
template <class t1, class t2, class t3>
@ -130,9 +185,16 @@ template <> struct _ucl_s_obj_help<1> {
const int cols) {
if (device.kind()==UCL_VIEW) {
device.view(host);
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 6S\n";
#endif
return UCL_SUCCESS;
} else
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 6NS\n";
#endif
return device.resize(rows,cols);
}
}
};
@ -145,17 +207,27 @@ template <int st> struct _ucl_s_obj_help {
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(cols,acc,UCL_NOT_PINNED);
if (e1!=UCL_SUCCESS)
return e1;
e1=_buffer.alloc(cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
if (acc.shared_memory()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 7S\n";
#endif
e1=_buffer.alloc(cols,acc,kind1,kind2);
if (e1!=UCL_SUCCESS)
return e1;
device.view(_buffer);
return UCL_SUCCESS;
} else
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 7NS\n";
#endif
e1=_buffer.alloc(cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,acc,kind2);
}
}
template <class t1, class t2, class t3, class mat_type>
@ -167,10 +239,24 @@ template <int st> struct _ucl_s_obj_help {
e1=host.alloc(cols,cq,UCL_NOT_PINNED);
if (e1!=UCL_SUCCESS)
return e1;
e1=_buffer.alloc(cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,cq,kind2);
if (cq.shared_mem_device()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 8S\n";
#endif
e1=_buffer.alloc(cols,cq,kind1,kind2);
if (e1!=UCL_SUCCESS)
return e1;
device.view(_buffer);
return UCL_SUCCESS;
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 8NS\n";
#endif
e1=_buffer.alloc(cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,cq,kind2);
}
}
template <class t1, class t2, class t3>
@ -180,17 +266,27 @@ template <int st> struct _ucl_s_obj_help {
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(rows,cols,acc,UCL_NOT_PINNED);
if (e1!=UCL_SUCCESS)
return e1;
e1=_buffer.alloc(rows,cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
if (acc.shared_memory()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 9S\n";
#endif
e1=_buffer.alloc(rows,cols,acc,kind1,kind2);
if (e1!=UCL_SUCCESS)
return e1;
device.view(_buffer);
return UCL_SUCCESS;
} else
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 9NS\n";
#endif
e1=_buffer.alloc(rows,cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(rows,cols,acc,kind2);
}
}
template <class t1, class t2, class t3, class mat_type>
@ -202,10 +298,24 @@ template <int st> struct _ucl_s_obj_help {
e1=host.alloc(rows,cols,cq,UCL_NOT_PINNED);
if (e1!=UCL_SUCCESS)
return e1;
e1=_buffer.alloc(rows,cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(rows,cols,cq,kind2);
if (cq.shared_mem_device()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 10S\n";
#endif
e1=_buffer.alloc(rows,cols,cq,kind1,kind2);
if (e1!=UCL_SUCCESS)
return e1;
device.view(_buffer);
return UCL_SUCCESS;
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 10NS\n";
#endif
e1=_buffer.alloc(rows,cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(rows,cols,cq,kind2);
}
}
template <class t1, class t2, class t3>
@ -250,9 +360,16 @@ template <int st> struct _ucl_s_obj_help {
if (device.kind()==UCL_VIEW) {
device.view(buff);
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 11S\n";
#endif
return UCL_SUCCESS;
} else
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 11NS\n";
#endif
return device.resize(cols);
}
}
template <class t1, class t2, class t3>
@ -264,9 +381,17 @@ template <int st> struct _ucl_s_obj_help {
if (device.kind()==UCL_VIEW) {
device.view(buff);
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 12S\n";
#endif
return UCL_SUCCESS;
} else
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 12NS\n";
#endif
return device.resize(rows,cols);
}
}
};
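All of the alloc specializations above follow one rule: when the accelerator shares physical memory with the host (acc.shared_memory(), or cq.shared_mem_device() for the command-queue overloads), a single host-side allocation is made with both access hints and the device container becomes a view of it; otherwise host and device get separate allocations. Callers can detect which path was taken the same way the zero/resize helpers do. A minimal caller-side sketch with placeholder names, assuming a UCL_Device named dev:

    UCL_Vector<float,float> v;              // hypothetical
    v.alloc(1024, dev);
    if (v.device.kind() == UCL_VIEW) {
      // shared-memory path: device storage is a view (of the host array, or of the
      // internal cast buffer when host and device types differ)
    } else {
      // discrete accelerator: host and device buffers are separate allocations
    }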

View File

@ -92,10 +92,9 @@ enum UCL_MEMOPT {
UCL_WRITE_ONLY, ///< Allow any optimizations for memory that is write only
UCL_READ_ONLY, ///< Allow any optimizations for memory that is read only
UCL_READ_WRITE, ///< Allow read and write
UCL_WRITE_OPTIMIZED,///< Allow host memory to be pinned (write combined)
UCL_RW_OPTIMIZED, ///< Allow host memory to be pinned
UCL_NOT_PINNED, ///< Host memory is not to be pinned
UCL_VIEW ///< View of another memory allocation
UCL_VIEW, ///< View of another memory allocation
UCL_NOT_SPECIFIED
};
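The options above now name the intended access pattern instead of a pinning strategy, and the call sites updated in this commit were re-annotated rather than renamed one-for-one: host staging buffers that are only filled and copied (the various host_write vectors) become UCL_WRITE_ONLY, result buffers the host only reads back (engv, force) become UCL_READ_ONLY on the host side, and the old UCL_RW_OPTIMIZED/UCL_WRITE_OPTIMIZED defaults give way to UCL_READ_WRITE/UCL_WRITE_ONLY. A one-line illustration, with n, numtyp, and dev as placeholders:

    // host staging buffer, filled once and copied to the device:
    UCL_H_Vec<numtyp> host_write(n, dev, UCL_WRITE_ONLY);   // previously UCL_WRITE_OPTIMIZED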
enum UCL_DEVICE_TYPE {

View File

@ -48,17 +48,18 @@ class UCL_Vector {
/// Construct with n columns
/** \sa alloc() **/
UCL_Vector(const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,cols,acc,kind1,kind2); }
/// Set up the vector with 'cols' columns and reserve memory
/** The kind1 parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* The kind2 parameter controls memory optimizations as follows:
/** The kind1 parameter controls memory access from the host:
* - UCL_READ_WRITE - Specify that you will read and write from host
* - UCL_WRITE_ONLY - Specify that you will only write from host
* - UCL_READ_ONLY - Specify that you will only read from host
* - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
* The kind2 parameter controls memory optimizations from the device:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
@ -69,24 +70,25 @@ class UCL_Vector {
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,cols,cq,kind1,kind2); }
/// Set up host vector with 'cols' columns and reserve memory
/** The kind1 parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* The kind2 parameter controls memory optimizations as follows:
/** The kind1 parameter controls memory access from the host:
* - UCL_READ_WRITE - Specify that you will read and write from host
* - UCL_WRITE_ONLY - Specify that you will only write from host
* - UCL_READ_ONLY - Specify that you will only read from host
* - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
* The kind2 parameter controls memory optimizations from the device:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param device Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,cols,acc,kind1,kind2); }
@ -109,11 +111,22 @@ class UCL_Vector {
inline int resize_ib(const int new_cols)
{ if (new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { host.zero(); device.zero(); }
/// Set first n elements to zero
inline void zero(const int n) { host.zero(n); device.zero(n); }
/// Set each element to zero (asynchronously on device)
inline void zero() { zero(cq()); }
/// Set first n elements to zero (asynchronously on device)
inline void zero(const int n) { zero(n,cq()); }
/// Set each element to zero (asynchronously on device)
inline void zero(command_queue &cq) {
host.zero();
if (device.kind()!=UCL_VIEW) device.zero(cq);
else if (_buffer.numel()>0) _buffer.zero();
}
/// Set first n elements to zero (asynchronously on device)
inline void zero(const int n, command_queue &cq) {
host.zero(n);
if (device.kind()!=UCL_VIEW) device.zero(n,cq);
else if (_buffer.numel()>0) _buffer.zero();
}
/// Get the number of elements
inline size_t numel() const { return host.numel(); }
@ -145,6 +158,8 @@ class UCL_Vector {
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return host.cq(); }
/// Change the default command queue associated with this data
inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); }
/// Block until command_queue associated with matrix is complete
inline void sync() { host.sync(); }
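Following the documentation above, the hints describe how the data will be used rather than whether it should be pinned. Two sketches that match how the rest of this commit uses them; dev is a placeholder UCL_Device and the names and sizes are illustrative:

    // Initialization data: written once on the host, only read by kernels
    UCL_Vector<float,float> coeff;
    coeff.alloc(ntypes*ntypes*32, dev, UCL_WRITE_ONLY, UCL_READ_ONLY);

    // Results: only read back on the host, read and written by kernels
    UCL_Vector<float,float> engv;
    engv.alloc(ev_fields*nmax, dev, UCL_READ_ONLY, UCL_READ_WRITE);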

View File

@ -44,10 +44,10 @@ bool AnswerT::alloc(const int inum) {
_ans_fields+=4;
// --------------------------- Device allocations
success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_RW_OPTIMIZED,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_RW_OPTIMIZED,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY,
UCL_READ_WRITE)==UCL_SUCCESS);
success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_READ_ONLY,
UCL_READ_WRITE)==UCL_SUCCESS);
_gpu_bytes=engv.device.row_bytes()+force.device.row_bytes();
_allocated=true;
@ -175,78 +175,42 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
return 0.0;
double evdwl=0.0;
double virial_acc[6];
for (int i=0; i<6; i++) virial_acc[i]=0.0;
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
int al=i;
if (_eflag) {
if (_ef_atom) {
evdwl+=engv[al];
eatom[i]+=engv[al]*0.5;
al+=_inum;
} else {
evdwl+=engv[al];
al+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=engv[al]*0.5;
virial_acc[j]+=engv[al];
al+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=engv[al];
al+=_inum;
}
}
}
int vstart=0;
if (_eflag) {
for (int i=0; i<_inum; i++)
evdwl+=engv[i];
if (_ef_atom)
if (_ilist==NULL)
for (int i=0; i<_inum; i++)
eatom[i]+=engv[i];
else
for (int i=0; i<_inum; i++)
eatom[_ilist[i]]+=engv[i];
vstart=_inum;
}
if (_vflag) {
int iend=vstart+_inum;
for (int j=0; j<6; j++) {
for (int i=vstart; i<iend; i++)
virial[j]+=engv[i];
if (_vf_atom)
if (_ilist==NULL)
for (int i=vstart; i<iend; i++)
vatom[i][j]+=engv[i];
else
for (int i=vstart; i<iend; i++)
vatom[_ilist[i]][j]+=engv[i];
vstart+=_inum;
iend+=_inum;
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
} else {
for (int i=0; i<_inum; i++) {
int al=i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=engv[al];
eatom[ii]+=engv[al]*0.5;
al+=_inum;
} else {
evdwl+=engv[al];
al+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=engv[al]*0.5;
virial_acc[j]+=engv[al];
al+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=engv[al];
al+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
}
evdwl*=0.5;
return evdwl;
}
template <class numtyp, class acctyp>
double AnswerT::energy_virial(double *eatom, double **vatom,
double *virial, double &ecoul) {
double *virial, double &ecoul) {
if (_eflag==false && _vflag==false)
return 0.0;
@ -254,84 +218,43 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
return energy_virial(eatom,vatom,virial);
double evdwl=0.0;
double _ecoul=0.0;
double virial_acc[6];
for (int i=0; i<6; i++) virial_acc[i]=0.0;
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
int al=i;
if (_eflag) {
if (_ef_atom) {
evdwl+=engv[al];
eatom[i]+=engv[al]*0.5;
al+=_inum;
_ecoul+=engv[al];
eatom[i]+=engv[al]*0.5;
al+=_inum;
} else {
evdwl+=engv[al];
al+=_inum;
_ecoul+=engv[al];
al+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=engv[al]*0.5;
virial_acc[j]+=engv[al];
al+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=engv[al];
al+=_inum;
}
}
int vstart=0, iend=_inum*2;
if (_eflag) {
for (int i=0; i<_inum; i++)
evdwl+=engv[i];
for (int i=_inum; i<iend; i++)
ecoul+=engv[i];
if (_ef_atom)
if (_ilist==NULL) {
for (int i=0; i<_inum; i++)
eatom[i]+=engv[i];
for (int i=_inum; i<iend; i++)
eatom[i]+=engv[i];
} else {
for (int i=0; i<_inum; i++)
eatom[_ilist[i]]+=engv[i];
for (int i=_inum; i<iend; i++)
eatom[_ilist[i]]+=engv[i];
}
vstart=iend;
iend+=_inum;
}
if (_vflag) {
for (int j=0; j<6; j++) {
for (int i=vstart; i<iend; i++)
virial[j]+=engv[i];
if (_vf_atom)
if (_ilist==NULL)
for (int i=vstart; i<iend; i++)
vatom[i][j]+=engv[i];
else
for (int i=vstart; i<iend; i++)
vatom[_ilist[i]][j]+=engv[i];
vstart+=_inum;
iend+=_inum;
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
} else {
for (int i=0; i<_inum; i++) {
int al=i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=engv[al];
eatom[ii]+=engv[al]*0.5;
al+=_inum;
_ecoul+=engv[al];
eatom[ii]+=engv[al]*0.5;
al+=_inum;
} else {
evdwl+=engv[al];
al+=_inum;
_ecoul+=engv[al];
al+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=engv[al]*0.5;
virial_acc[j]+=engv[al];
al+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=engv[al];
al+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
}
evdwl*=0.5;
ecoul+=_ecoul*0.5;
return evdwl;
}
@ -373,4 +296,14 @@ void AnswerT::get_answers(double **f, double **tor) {
}
}
template <class numtyp, class acctyp>
void AnswerT::cq(const int cq_index) {
engv.cq(dev->cq(cq_index));
force.cq(dev->cq(cq_index));
time_answer.clear();
time_answer.init(*dev,dev->cq(cq_index));
time_answer.zero();
}
template class Answer<PRECISION,ACC_PRECISION>;
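The rewritten accumulation loops no longer walk each atom's interleaved fields; they sweep whole blocks of engv, relying on the blocked layout that the store macros elsewhere in this commit write (energies first, then optional Coulomb energies, then the six virial components, each block inum entries long). For reference, the indexing the loops above assume, stated as a sketch rather than a specification:

    // engv layout per local atom i (0 <= i < inum), when energies are requested:
    //   engv[i]                     pair energy (already halved by the kernel)
    //   engv[inum + i]              Coulomb energy (charge-enabled styles only)
    //   engv[vstart + j*inum + i]   virial component j, j = 0..5
    // with vstart = inum for atomic styles and 2*inum for charge styles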

View File

@ -47,6 +47,8 @@ class Answer {
inline int inum() const { return _inum; }
/// Set number of local atoms for future copy operations
inline void inum(const int n) { _inum=n; }
/// Return the maximum number of atoms that can currently be stored
inline int max_inum() const { return _max_local; }
/// Memory usage per atom in this class
int bytes_per_atom() const;
@ -132,6 +134,9 @@ class Answer {
/// Return the time the CPU was idle waiting for GPU
inline double cpu_idle_time() { return _time_cpu_idle; }
/// Change the command queue used for copies and timers
void cq(const int cq_index);
// ------------------------------ DATA ----------------------------------
/// Force and possibly torque

View File

@ -70,44 +70,47 @@ bool AtomT::alloc(const int nall) {
// --------------------------- Device allocations
int gpu_bytes=0;
success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
#ifdef GPU_CAST
success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)==
UCL_SUCCESS);
success=success && (type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)==
UCL_SUCCESS);
success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (type_cast.alloc(_max_atoms,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=x_cast.device.row_bytes()+type_cast.device.row_bytes();
#endif
if (_charge && _host_view==false) {
success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=q.device.row_bytes();
}
if (_rot && _host_view==false) {
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=quat.device.row_bytes();
}
if (_gpu_nbor>0) {
if (_bonds) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_tag.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_tag.row_bytes();
}
if (_gpu_nbor==1) {
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_cell_id.row_bytes();
} else {
success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (host_particle_id.alloc(_max_atoms,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success &&
(host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
}
if (_gpu_nbor==2 && _host_view)
dev_particle_id.view(host_particle_id);
else
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_particle_id.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_particle_id.row_bytes();
}
@ -130,7 +133,7 @@ bool AtomT::add_fields(const bool charge, const bool rot,
_charge=true;
_other=true;
if (_host_view==false) {
success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=q.device.row_bytes();
}
@ -140,7 +143,7 @@ bool AtomT::add_fields(const bool charge, const bool rot,
_rot=true;
_other=true;
if (_host_view==false) {
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=quat.device.row_bytes();
}
@ -149,7 +152,8 @@ bool AtomT::add_fields(const bool charge, const bool rot,
if (bonds && _bonds==false) {
_bonds=true;
if (_bonds && _gpu_nbor>0) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_tag.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_tag.row_bytes();
}
}
@ -163,17 +167,20 @@ bool AtomT::add_fields(const bool charge, const bool rot,
return false;
}
#endif
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_particle_id.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_particle_id.row_bytes();
if (_bonds) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_tag.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_tag.row_bytes();
}
if (_gpu_nbor==1) {
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_cell_id.row_bytes();
} else {
success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (host_particle_id.alloc(_max_atoms,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success &&
(host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
}

View File

@ -77,12 +77,12 @@
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -125,14 +125,14 @@
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -160,12 +160,12 @@
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -192,14 +192,14 @@
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
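The macro edits above move the factor of one half for doubly counted pair contributions into the kernel-side store, so engv now holds already-halved per-atom energies and virials; the rewritten Answer::energy_virial accordingly adds engv values into eatom/vatom without a further 0.5. Schematically:

    // before: kernel stores engv = E_i;      host later does eatom[i] += engv[i]*0.5
    // after:  kernel stores engv = E_i*0.5;  host later does eatom[i] += engv[i]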

View File

@ -272,12 +272,8 @@ void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str,
return;
std::string s_fast=std::string(kname)+"_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
pair_program->load_string(pair_str,device->compile_string().c_str());
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");
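The same substitution is repeated in each compile_kernels() below: the flag string previously assembled from OCL_PRECISION_COMPILE and the compile-time OCL_VENDOR macro is replaced by Device::compile_string(), which this commit builds at run time in lal_device.cpp from the math flags, the precision define, and the vendor/tuning defines. Schematically:

    // before: per-class flags, vendor fixed at compile time
    //   std::string flags = "-cl-fast-relaxed-math -cl-mad-enable " +
    //                       std::string(OCL_PRECISION_COMPILE) + " -D" + std::string(OCL_VENDOR);
    //   pair_program->load_string(pair_str, flags.c_str());
    // after: one string owned by the Device object, vendor/tuning chosen at run time
    //   pair_program->load_string(pair_str, device->compile_string().c_str());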

View File

@ -288,12 +288,8 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
return;
std::string s_fast=std::string(kname)+"_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
pair_program->load_string(pair_str,device->compile_string().c_str());
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");

View File

@ -296,12 +296,8 @@ void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str,
return;
std::string s_fast=std::string(kname)+"_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
pair_program->load_string(pair_str,device->compile_string().c_str());
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");

View File

@ -455,9 +455,7 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
std::string s_lj=kns+"_lj";
std::string s_lj_fast=kns+"_lj_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
std::string flags=device->compile_string();
nbor_program=new UCL_Program(dev);
nbor_program->load_string(ellipsoid_nbor,flags.c_str());

View File

@ -69,7 +69,7 @@ int BornT::init(const int ntypes, double **host_cutsq,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -54,7 +54,8 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -148,7 +149,8 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -73,7 +73,7 @@ int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rho
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -67,7 +67,8 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -188,7 +189,8 @@ __kernel void k_born_long_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -73,7 +73,7 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -70,7 +70,8 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -201,7 +202,8 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -68,7 +68,7 @@ int BuckT::init(const int ntypes, double **host_cutsq,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -53,7 +53,8 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -145,7 +146,8 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -70,7 +70,7 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -66,7 +66,8 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -182,7 +183,8 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -72,7 +72,7 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -67,7 +67,8 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -190,7 +191,8 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -69,7 +69,7 @@ int CGCMMT::init(const int ntypes, double **host_cutsq,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(cmm_types*cmm_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<cmm_types*cmm_types; i++)
host_write[i]=0.0;

View File

@ -53,7 +53,8 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -150,7 +151,8 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -74,7 +74,7 @@ int CGCMMLongT::init(const int ntypes, double **host_cutsq,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -66,7 +66,8 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -191,7 +192,8 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -76,7 +76,7 @@ int CHARMMLongT::init(const int ntypes,
if (h_size<max_bio_shared_types)
h_size=max_bio_shared_types;
UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<h_size*32; i++)
host_write[i]=0.0;

View File

@ -67,7 +67,8 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -196,7 +197,8 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -73,7 +73,7 @@ int ColloidT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
@ -95,7 +95,7 @@ int ColloidT::init(const int ntypes,
host_sigma3,host_sigma6);
UCL_H_Vec<int> dview_form(lj_types*lj_types,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++) dview_form[i]=0;
form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);

View File

@ -56,7 +56,8 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -219,7 +220,8 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -73,7 +73,7 @@ int CoulDSFT::init(const int ntypes, const int nlocal, const int nall,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -63,7 +63,8 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -163,7 +164,8 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -68,7 +68,7 @@ int CoulLongT::init(const int nlocal, const int nall, const int max_nbors,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -74,12 +74,12 @@ texture<int2> q_tex;
if (eflag>0) { \
*ap1=(acctyp)0; \
ap1+=inum; \
*ap1=e_coul; \
*ap1=e_coul*(acctyp)0.5; \
ap1+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1=virial[i]; \
*ap1=virial[i]*(acctyp)0.5; \
ap1+=inum; \
} \
} \
@ -109,12 +109,12 @@ texture<int2> q_tex;
if (eflag>0) { \
*ap1=(acctyp)0; \
ap1+=inum; \
*ap1=e_coul; \
*ap1=e_coul*(acctyp)0.5; \
ap1+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1=virial[i]; \
*ap1=virial[i]*(acctyp)0.5; \
ap1+=inum; \
} \
} \
@ -155,7 +155,8 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -244,7 +245,8 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -47,7 +47,8 @@ template <class numtyp, class acctyp>
int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double p_split, const int nthreads,
const int t_per_atom, const double cell_size) {
const int t_per_atom, const double cell_size,
char *ocl_vendor) {
_nthreads=nthreads;
#ifdef _OPENMP
omp_set_num_threads(nthreads);
@ -140,6 +141,9 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
_long_range_precompute=0;
if (set_ocl_params(ocl_vendor)!=0)
return -11;
int flag=0;
for (int i=0; i<_procs_per_gpu; i++) {
if (_gpu_rank==i)
@ -149,6 +153,64 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
return flag;
}
template <class numtyp, class acctyp>
int DeviceT::set_ocl_params(char *ocl_vendor) {
#ifdef USE_OPENCL
std::string s_vendor=OCL_DEFAULT_VENDOR;
if (ocl_vendor!=NULL)
s_vendor=ocl_vendor;
if (s_vendor=="none")
s_vendor="generic";
if (s_vendor=="kepler") {
_ocl_vendor_name="NVIDIA Kepler";
#if defined (__APPLE__) || defined(MACOSX)
_ocl_vendor_string="-DKEPLER_OCL -DNO_OCL_PTX";
#else
_ocl_vendor_string="-DKEPLER_OCL";
#endif
} else if (s_vendor=="fermi") {
_ocl_vendor_name="NVIDIA Fermi";
_ocl_vendor_string="-DFERMI_OCL";
} else if (s_vendor=="cypress") {
_ocl_vendor_name="AMD Cypress";
_ocl_vendor_string="-DCYPRESS_OCL";
} else if (s_vendor=="generic") {
_ocl_vendor_name="GENERIC";
_ocl_vendor_string="-DGENERIC_OCL";
} else {
_ocl_vendor_name="CUSTOM";
_ocl_vendor_string="-DUSE_OPENCL";
int token_count=0;
std::string params[13];
char *pch = strtok(ocl_vendor,"\" ");
while (pch != NULL) {
if (token_count==13)
return -11;
params[token_count]=pch;
token_count++;
pch = strtok(NULL,"\" ");
}
_ocl_vendor_string+=" -DMEM_THREADS="+params[0]+
" -DTHREADS_PER_ATOM="+params[1]+
" -DTHREADS_PER_CHARGE="+params[2]+
" -DBLOCK_PAIR="+params[3]+
" -DMAX_SHARED_TYPES="+params[4]+
" -DBLOCK_NBOR_BUILD="+params[5]+
" -DBLOCK_BIO_PAIR="+params[6]+
" -DBLOCK_ELLIPSE="+params[7]+
" -DWARP_SIZE="+params[8]+
" -DPPPM_BLOCK_1D="+params[9]+
" -DBLOCK_CELL_2D="+params[10]+
" -DBLOCK_CELL_ID="+params[11]+
" -DMAX_BIO_SHARED_TYPES="+params[12];
}
_ocl_compile_string="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" "+_ocl_vendor_string;
#endif
return 0;
}
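For the CUSTOM branch, ocl_vendor is expected to carry 13 whitespace-separated numbers (surrounding quotes are stripped by the strtok call above), consumed in the order listed. A purely hypothetical example string and the defines it would produce; the values are placeholders, not tuned settings:

    "32 4 8 128 8 64 64 128 32 64 8 128 128"
      -> -DMEM_THREADS=32 -DTHREADS_PER_ATOM=4 -DTHREADS_PER_CHARGE=8 -DBLOCK_PAIR=128
         -DMAX_SHARED_TYPES=8 -DBLOCK_NBOR_BUILD=64 -DBLOCK_BIO_PAIR=64 -DBLOCK_ELLIPSE=128
         -DWARP_SIZE=32 -DPPPM_BLOCK_1D=64 -DBLOCK_CELL_2D=8 -DBLOCK_CELL_ID=128
         -DMAX_BIO_SHARED_TYPES=128

Supplying more than 13 tokens makes init_device() return -11, matching the new error code documented in lal_device.h.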
template <class numtyp, class acctyp>
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
@ -206,7 +268,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
*gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
_block_cell_id, _block_nbor_build, threads_per_atom,
_warp_size, _time_device))
_warp_size, _time_device, compile_string()))
return -3;
if (_cell_size<0.0)
nbor->cell_size(cell_size,cell_size);
@ -274,7 +336,8 @@ void DeviceT::init_message(FILE *screen, const char *name,
fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
#endif
#ifdef USE_OPENCL
fprintf(screen,"- with OpenCL Parameters for: %s\n",OCL_VENDOR);
fprintf(screen,"- with OpenCL Parameters for: %s\n",
_ocl_vendor_name.c_str());
#endif
fprintf(screen,"-------------------------------------");
fprintf(screen,"-------------------------------------\n");
@ -571,9 +634,8 @@ int DeviceT::compile_kernels() {
if (_compiled)
return flag;
std::string flags="-cl-mad-enable -D"+std::string(OCL_VENDOR);
dev_program=new UCL_Program(*gpu);
int success=dev_program->load_string(device,flags.c_str());
int success=dev_program->load_string(device,compile_string().c_str());
if (success!=UCL_SUCCESS)
return -4;
k_zero.set_function(*dev_program,"kernel_zero");
@ -640,10 +702,11 @@ Device<PRECISION,ACC_PRECISION> global_device;
int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom, const double cell_size) {
const int t_per_atom, const double cell_size,
char *opencl_vendor) {
return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
particle_split,nthreads,t_per_atom,
cell_size);
cell_size,opencl_vendor);
}
void lmp_clear_device() {
@ -654,3 +717,4 @@ double lmp_gpu_forces(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
}

View File

@ -45,11 +45,13 @@ class Device {
* - -2 if GPU not found
* - -4 if GPU library not compiled for GPU
* - -6 if GPU could not be initialized for use
* - -7 if accelerator sharing is not currently allowed on system **/
* - -7 if accelerator sharing is not currently allowed on system
* - -11 if vendor_string has the wrong number of parameters **/
int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom, const double cell_size);
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom, const double cell_size,
char *vendor_string);
/// Initialize the device for Atom and Neighbor storage
/** \param rot True if quaternions need to be stored
@ -234,6 +236,8 @@ class Device {
inline int max_bio_shared_types() const { return _max_bio_shared_types; }
/// Architecture gpu code compiled for (returns 0 for OpenCL)
inline double ptx_arch() const { return _ptx_arch; }
/// Number of threads executing concurrently on same multiproc
inline int warp_size() const { return _warp_size; }
// -------------------- SHARED DEVICE ROUTINES --------------------
// Perform asynchronous zero of integer array
@ -279,6 +283,8 @@ class Device {
boxlo,prd);
}
inline std::string compile_string() { return _ocl_compile_string; }
private:
std::queue<Answer<numtyp,acctyp> *> ans_queue;
int _init_count;
@ -305,6 +311,9 @@ class Device {
int _data_in_estimate, _data_out_estimate;
std::string _ocl_vendor_name, _ocl_vendor_string, _ocl_compile_string;
int set_ocl_params(char *);
template <class t>
inline std::string toa(const t& in) {
std::ostringstream o;

View File

@ -72,7 +72,7 @@ int DipoleLJT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -75,14 +75,14 @@ texture<int4,1> mu_tex;
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -115,14 +115,14 @@ texture<int4,1> mu_tex;
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -174,7 +174,8 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -385,7 +386,8 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -72,7 +72,7 @@ int DipoleLJSFT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -76,14 +76,14 @@ texture<int4,1> mu_tex;
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -116,14 +116,14 @@ texture<int4,1> mu_tex;
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -175,7 +175,8 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -418,7 +419,8 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -66,7 +66,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
ef_nall=2000;
_max_fp_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_RW_OPTIMIZED,UCL_WRITE_ONLY);
_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
k_energy.set_function(*(this->pair_program),"k_energy");
k_energy_fast.set_function(*(this->pair_program),"k_energy_fast");
@ -106,7 +106,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
_nr=nr;
UCL_H_Vec<int2> dview_type(lj_types*lj_types,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++) {
dview_type[i].x=0; dview_type[i].y=0;
@ -126,7 +126,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
// pack type2frho
UCL_H_Vec<int> dview_type2frho(lj_types,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
type2frho.alloc(lj_types,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<ntypes; i++)
@ -135,7 +135,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
// pack frho_spline
UCL_H_Vec<numtyp4> dview_frho_spline(nfrho*(nrho+1),*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int ix=0; ix<nfrho; ix++)
for (int iy=0; iy<nrho+1; iy++) {
@ -165,7 +165,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
// pack rhor_spline
UCL_H_Vec<numtyp4> dview_rhor_spline(nrhor*(nr+1),*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int ix=0; ix<nrhor; ix++)
for (int iy=0; iy<nr+1; iy++) {
@ -195,7 +195,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
// pack z2r_spline
UCL_H_Vec<numtyp4> dview_z2r_spline(nz2r*(nr+1),*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int ix=0; ix<nz2r; ix++)
for (int iy=0; iy<nr+1; iy++) {

View File

@ -79,7 +79,7 @@ texture<int4> z2r_sp2_tex;
fetch4(coeff,index,frho_sp2_tex); \
energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \
if (rho > rhomax) energy += fp*(rho-rhomax); \
engv[ii]=(acctyp)2.0*energy; \
engv[ii]=energy; \
} \
}
@ -116,12 +116,12 @@ texture<int4> z2r_sp2_tex;
} \
if (offset==0) { \
if (eflag>0) { \
engv[ii]+=energy; \
engv[ii]+=energy*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
engv[ii]=virial[i]; \
engv[ii]=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -150,7 +150,7 @@ texture<int4> z2r_sp2_tex;
fetch4(coeff,index,frho_sp2_tex); \
energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \
if (rho > rhomax) energy += fp*(rho-rhomax); \
engv[ii]=(acctyp)2.0*energy; \
engv[ii]=energy; \
} \
}
@ -173,12 +173,12 @@ texture<int4> z2r_sp2_tex;
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv+=energy; \
*engv+=energy*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -210,7 +210,8 @@ __kernel void k_energy(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -286,7 +287,8 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -353,7 +355,8 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -466,7 +469,8 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -83,12 +83,12 @@ texture<int4,1> pos_tex, quat_tex;
if (offset==0) { \
__global acctyp *ap1=engv+ii; \
if (eflag>0) { \
*ap1=energy; \
*ap1=energy*(acctyp)0.5; \
ap1+=astride; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1=virial[i]; \
*ap1=virial[i]*(acctyp)0.5; \
ap1+=astride; \
} \
} \
@ -130,12 +130,12 @@ texture<int4,1> pos_tex, quat_tex;
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv+=energy; \
*engv+=energy*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv+=virial[i]; \
*engv+=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -170,12 +170,12 @@ texture<int4,1> pos_tex, quat_tex;
if (offset==0) { \
__global acctyp *ap1=engv+ii; \
if (eflag>0) { \
*ap1=energy; \
*ap1=energy*(acctyp)0.5; \
ap1+=astride; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1=virial[i]; \
*ap1=virial[i]*(acctyp)0.5; \
ap1+=astride; \
} \
} \
@ -202,12 +202,12 @@ texture<int4,1> pos_tex, quat_tex;
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv+=energy; \
*engv+=energy*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv+=virial[i]; \
*engv+=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \

View File

@ -68,7 +68,7 @@ int GaussT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -52,7 +52,8 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -138,7 +139,8 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -77,7 +77,7 @@ int GayBerneT::init(const int ntypes, const double gamma,
// Allocate a host write buffer for copying type data
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
@ -98,7 +98,7 @@ int GayBerneT::init(const int ntypes, const double gamma,
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
dev_error.alloc(1,*(this->ucl_device));
dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY);
dev_error.zero();
// Allocate, cast and asynchronous memcpy of constant data
@ -258,6 +258,9 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
&ainum, &this->_threads_per_atom);
this->time_ellipsoid2.stop();
} else {
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
this->_last_ellipse)/
(BX/this->_threads_per_atom)));
this->ans->force.zero();
this->ans->engv.zero();
this->time_nbor1.stop();

View File

@ -120,7 +120,8 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *nbor_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

View File

@ -54,7 +54,8 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *nbor_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -276,7 +277,8 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -370,7 +372,8 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -69,7 +69,7 @@ int LJT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -53,7 +53,8 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -142,7 +143,8 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -69,7 +69,7 @@ int LJ96T::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -53,7 +53,8 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -143,7 +144,8 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -73,7 +73,7 @@ int LJClass2LongT::init(const int ntypes, double **host_cutsq,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -66,7 +66,8 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -187,7 +188,8 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -72,7 +72,7 @@ int LJCoulT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -66,7 +66,8 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -178,7 +179,8 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -73,7 +73,7 @@ int LJCoulDebyeT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -67,7 +67,8 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -186,7 +187,8 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -73,7 +73,7 @@ int LJCoulLongT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -66,7 +66,8 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -183,7 +184,8 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -77,7 +77,7 @@ int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -69,7 +69,8 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -195,7 +196,8 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -69,7 +69,7 @@ int LJExpandT::init(const int ntypes, double **host_cutsq,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -55,7 +55,8 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -147,7 +148,8 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -69,7 +69,7 @@ int MorseT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(types*types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<types*types; i++)
host_write[i]=0.0;

View File

@ -55,7 +55,8 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -145,7 +146,8 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -38,7 +38,8 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
const bool pre_cut, const int block_cell_2d,
const int block_cell_id, const int block_nbor_build,
const int threads_per_atom, const int warp_size,
const bool time_device) {
const bool time_device,
const std::string compile_flags) {
clear();
_threads_per_atom=threads_per_atom;
@ -92,13 +93,13 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
if (gpu_nbor==0)
success=success && (host_packed.alloc(2*IJ_SIZE,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
UCL_WRITE_ONLY)==UCL_SUCCESS);
alloc(success);
if (!success)
return false;
if (_use_packing==false)
_shared->compile_kernels(devi,gpu_nbor);
_shared->compile_kernels(devi,gpu_nbor,compile_flags);
return success;
}
@ -114,7 +115,7 @@ void Neighbor::alloc(bool &success) {
success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (host_acc.alloc(nt*2,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
UCL_READ_WRITE)==UCL_SUCCESS);
_c_bytes=dev_nbor.row_bytes();
if (_alloc_packed) {
@ -129,10 +130,10 @@ void Neighbor::alloc(bool &success) {
host_ilist.clear();
host_jlist.clear();
success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_RW_OPTIMIZED,
UCL_WRITE_ONLY)==UCL_SUCCESS) && success;
success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_READ_WRITE,
UCL_READ_WRITE)==UCL_SUCCESS) && success;
success=success && (dev_numj_host.alloc(_max_host,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
UCL_READ_WRITE)==UCL_SUCCESS);
success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
@ -161,7 +162,7 @@ void Neighbor::alloc(bool &success) {
success=success && (dev_nspecial.alloc(3*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (dev_special.alloc(_maxspecial*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
UCL_READ_WRITE)==UCL_SUCCESS);
success=success && (dev_special_t.alloc(_maxspecial*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_nspecial.row_bytes()+dev_special.row_bytes()+
@ -178,11 +179,9 @@ void Neighbor::clear() {
_bin_time=0.0;
if (_ncells>0) {
_ncells=0;
dev_cell_counts.clear();
if (_gpu_nbor==2) {
host_cell_counts.clear();
cell_counts.clear();
if (_gpu_nbor==2)
delete [] cell_iter;
}
}
if (_allocated) {
_allocated=false;
@ -286,6 +285,80 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
}
}
// This is the same as get_host, but the requirement that ilist[i]=i and
// inum=nlocal is forced to be true to allow direct indexing of neighbors of
// neighbors
void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
int **firstneigh, const int block_size) {
_nbor_time_avail=true;
time_nbor.start();
UCL_H_Vec<int> ilist_view;
ilist_view.view(ilist,inum,*dev);
ucl_copy(dev_nbor,ilist_view,false);
UCL_D_Vec<int> nbor_offset;
UCL_H_Vec<int> host_offset;
int copy_count=0;
int ij_count=0;
int acc_count=0;
int dev_count=0;
int *h_ptr=host_packed.begin();
_nbor_pitch=inum;
if (nlist!=inum)
host_acc.zero(inum);
for (int ii=0; ii<nlist; ii++) {
int i=ilist[ii];
int nj=numj[i];
host_acc[i]=nj;
host_acc[i+inum]=acc_count;
acc_count+=nj;
}
for (int i=0; i<inum; i++) {
int nj=host_acc[i];
int *jlist=firstneigh[i];
for (int jj=0; jj<nj; jj++) {
*h_ptr=jlist[jj];
h_ptr++;
ij_count++;
if (ij_count==IJ_SIZE) {
dev_nbor.sync();
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
ucl_copy(nbor_offset,host_offset,true);
copy_count++;
ij_count=0;
dev_count+=IJ_SIZE;
h_ptr=host_packed.begin()+(IJ_SIZE*(copy_count%2));
}
}
}
if (ij_count!=0) {
dev_nbor.sync();
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
nbor_offset.view_offset(dev_count,dev_packed,ij_count);
ucl_copy(nbor_offset,host_offset,true);
}
UCL_D_Vec<int> acc_view;
acc_view.view_offset(inum,dev_nbor,inum*2);
ucl_copy(acc_view,host_acc,true);
time_nbor.stop();
if (_use_packing==false) {
time_kernel.start();
int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
block_size));
_shared->k_nbor.set_size(GX,block_size);
_shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom);
time_kernel.stop();
}
}
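
Editor's note: get_host3 above stages the flattened neighbor list to the device in fixed-size chunks of IJ_SIZE entries, alternating between the two halves of the pinned host_packed buffer so that packing the next chunk can overlap the asynchronous copy of the previous one. A minimal standalone C++ sketch of that double-buffering pattern (async_copy is a hypothetical stand-in for ucl_copy; names and sizes are illustrative only):

#include <cstdio>
#include <cstddef>
#include <vector>

// Stand-in for an asynchronous host-to-device copy; here it only reports
// what would be transferred.
static void async_copy(const int *src, std::size_t count, std::size_t dev_off) {
  std::printf("copy %zu ints from %p to device offset %zu\n",
              count, static_cast<const void*>(src), dev_off);
}

// Pack 'data' through a staging buffer holding two halves of 'chunk' ints,
// issuing one copy per filled half and switching halves after each copy.
static void stage_in_chunks(const std::vector<int> &data, int *staging,
                            std::size_t chunk) {
  std::size_t filled = 0, copies = 0, dev_off = 0;
  int *p = staging;                          // half currently being packed
  for (int v : data) {
    *p++ = v;
    if (++filled == chunk) {
      async_copy(staging + chunk * (copies % 2), chunk, dev_off);
      ++copies;  dev_off += chunk;  filled = 0;
      p = staging + chunk * (copies % 2);    // switch to the other half
    }
  }
  if (filled)                                // remainder, if any
    async_copy(staging + chunk * (copies % 2), filled, dev_off);
}

int main() {
  std::vector<int> jlist = {3, 7, 1, 9, 4, 2, 8, 6, 5};
  std::vector<int> staging(2 * 4);           // two halves of 4 ints each
  stage_in_chunks(jlist, staging.data(), 4);
}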
template <class numtyp, class acctyp>
void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
if (maxn>_max_nbors) {
@ -330,24 +403,20 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
ncellz = static_cast<int>(ceil((subhi[2]-sublo[2])/_cell_size))+ghost_cells;
ncell_3d = ncellx * ncelly * ncellz;
if (ncell_3d+1>_ncells) {
if (_gpu_nbor==2) {
if (_ncells>0) {
host_cell_counts.clear();
delete [] cell_iter;
}
cell_iter = new int[ncell_3d+1];
host_cell_counts.alloc(ncell_3d+1,dev_nbor);
}
cell_counts.clear();
if (_gpu_nbor==2 && atom.host_view())
dev_cell_counts.view(host_cell_counts);
else {
dev_cell_counts.clear();
dev_cell_counts.alloc(ncell_3d+1,dev_nbor);
if (_gpu_nbor==2) {
if (_ncells>0)
delete [] cell_iter;
cell_iter = new int[ncell_3d+1];
cell_counts.alloc(ncell_3d+1,dev_nbor,UCL_READ_WRITE,UCL_READ_ONLY);
} else {
cell_counts.device.clear();
cell_counts.device.alloc(ncell_3d+1,dev_nbor);
}
_ncells=ncell_3d+1;
_cell_bytes=dev_cell_counts.row_bytes();
_cell_bytes=cell_counts.device.row_bytes();
}
const numtyp cutoff_cast=static_cast<numtyp>(_cutoff);
@ -381,7 +450,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
int *particle_id=atom.host_particle_id.begin();
// Build cell list on CPU
host_cell_counts.zero();
cell_counts.host.zero();
double i_cell_size=1.0/_cell_size;
int offset_hi=_cells_in_cutoff+1;
@ -403,7 +472,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
int id = ix+iy*ncellx+iz*ncellx*ncelly;
cell_id[i] = id;
host_cell_counts[id+1]++;
cell_counts[id+1]++;
}
for (int i=nt; i<nall; i++) {
@ -424,12 +493,12 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
int id = ix+iy*ncellx+iz*ncellx*ncelly;
cell_id[i] = id;
host_cell_counts[id+1]++;
cell_counts[id+1]++;
}
mn=0;
for (int i=0; i<_ncells; i++)
mn=std::max(mn,host_cell_counts[i]);
mn=std::max(mn,cell_counts[i]);
mn*=8;
set_nbor_block_size(mn/2);
@ -440,11 +509,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
cell_iter[0]=0;
for (int i=1; i<_ncells; i++) {
host_cell_counts[i]+=host_cell_counts[i-1];
cell_iter[i]=host_cell_counts[i];
cell_counts[i]+=cell_counts[i-1];
cell_iter[i]=cell_counts[i];
}
time_hybrid1.start();
ucl_copy(dev_cell_counts,host_cell_counts,true);
cell_counts.update_device(true);
time_hybrid1.stop();
for (int i=0; i<nall; i++) {
int celli=cell_id[i];
@ -481,7 +550,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
/* calculate cell count */
_shared->k_cell_counts.set_size(GX,neigh_block);
_shared->k_cell_counts.run(&atom.dev_cell_id, &dev_cell_counts, &nall,
_shared->k_cell_counts.run(&atom.dev_cell_id, &cell_counts, &nall,
&ncell_3d);
}
@ -490,7 +559,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
_shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)*
(ncellz-ghost_cells),cell_block,1);
_shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id,
&dev_cell_counts, &dev_nbor, &nbor_host,
&cell_counts, &dev_nbor, &nbor_host,
&dev_numj_host, &_max_nbors, &cutoff_cast, &ncellx,
&ncelly, &ncellz, &inum, &nt, &nall,
&_threads_per_atom, &_cells_in_cutoff);
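
Editor's note: the rewrite above folds the old host_cell_counts / dev_cell_counts pair into one cell_counts object that owns both sides and is pushed to the device with update_device(true). As a toy illustration of that mirror-and-update idea (a hypothetical container, not the Geryon UCL_Vector API):

#include <cstddef>
#include <vector>

// Toy host/device mirror: the host side is filled by plain indexing and the
// device side is refreshed explicitly, mimicking cell_counts.update_device().
template <class T>
struct MirroredVec {
  std::vector<T> host, device;               // 'device' stands in for GPU memory
  void alloc(std::size_t n) { host.assign(n, T()); device.assign(n, T()); }
  T &operator[](std::size_t i) { return host[i]; }
  void update_device() { device = host; }    // one explicit host-to-device copy
};

int main() {
  MirroredVec<int> cell_counts;
  cell_counts.alloc(8);
  cell_counts[3] = 5;                         // fill on the host
  cell_counts.update_device();                // then push once to the device
}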

View File

@ -47,13 +47,15 @@ class Neighbor {
* \param pre_cut True if cutoff test will be performed in separate kernel
* than the force kernel
* \param threads_per_atom Number of threads used per atom for force
* calculation **/
* calculation
* \param compile_flags Flags for JIT compiling **/
bool init(NeighborShared *shared, const int inum, const int host_inum,
const int max_nbors, const int maxspecial, UCL_Device &dev,
const int gpu_nbor, const int gpu_host, const bool pre_cut,
const int block_cell_2d, const int block_cell_id,
const int block_nbor_build, const int threads_per_atom,
const int warp_size, const bool time_device);
const int warp_size, const bool time_device,
const std::string compile_flags);
/// Set the size of the cutoff+skin
inline void cell_size(const double size, const double cutoff) {
@ -143,6 +145,10 @@ class Neighbor {
void get_host(const int inum, int *ilist, int *numj,
int **firstneigh, const int block_size);
/// Copy neighbor list from host for 3-body (first time or from a rebuild)
void get_host3(const int inum, const int nlist, int *ilist, int *numj,
int **firstneigh, const int block_size);
/// Return the stride in elements for each nbor row
inline int nbor_pitch() const { return _nbor_pitch; }
@ -207,11 +213,9 @@ class Neighbor {
UCL_D_Vec<int> dev_nspecial;
/// Device storage for special neighbors
UCL_D_Vec<int> dev_special, dev_special_t;
/// Host storage for number of particles per cell
UCL_H_Vec<int> host_cell_counts;
/// Host/Device storage for number of particles per cell
UCL_Vector<int,int> cell_counts;
int *cell_iter;
/// Device storage for number of particles per cell
UCL_D_Vec<int> dev_cell_counts;
/// Device timers
UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose;

View File

@ -48,15 +48,12 @@ void NeighborShared::clear() {
}
}
void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor) {
void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor,
const std::string flags) {
if (_compiled)
return;
_gpu_nbor=gpu_nbor;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
if (_gpu_nbor==0) {
nbor_program=new UCL_Program(dev);
nbor_program->load_string(neighbor_cpu,flags.c_str());

View File

@ -44,7 +44,8 @@ class NeighborShared {
UCL_Texture neigh_tex;
/// Compile kernels for neighbor lists
void compile_kernels(UCL_Device &dev, const int gpu_nbor);
void compile_kernels(UCL_Device &dev, const int gpu_nbor,
const std::string flags);
// ----------------------------- Kernels
UCL_Program *nbor_program, *build_program;

View File

@ -136,10 +136,10 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
_npts_y=nyhi_out-nylo_out+1;
_npts_z=nzhi_out-nzlo_out+1;
_npts_yx=_npts_x*_npts_y;
success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
UCL_SUCCESS);
success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
UCL_SUCCESS);
success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device,
UCL_READ_ONLY,UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device,
UCL_READ_WRITE,UCL_READ_ONLY)==UCL_SUCCESS);
*vd_brick_p=vd_brick.host.begin();
_max_bytes+=brick.device.row_bytes()+vd_brick.device.row_bytes();
@ -159,7 +159,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
_max_bytes+=d_brick_atoms.row_bytes();
// Allocate error flags for checking out of bounds atoms
success=success && (error_flag.alloc(1,*ucl_device,UCL_RW_OPTIMIZED,
success=success && (error_flag.alloc(1,*ucl_device,UCL_READ_ONLY,
UCL_WRITE_ONLY)==UCL_SUCCESS);
if (!success) {
flag=-3;
@ -374,9 +374,7 @@ void PPPMT::compile_kernels(UCL_Device &dev) {
if (sizeof(grdtyp)==sizeof(double) && ucl_device->double_precision()==false)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
std::string flags=device->compile_string();
#ifdef USE_OPENCL
flags+=std::string(" -Dgrdtyp=")+ucl_template_name<grdtyp>()+" -Dgrdtyp4="+
ucl_template_name<grdtyp>()+"4";
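
Editor's note: the hard-coded OpenCL build options removed here are replaced by a per-device compile string obtained from the device object. Judging from the removed lines, that string carries at least the fast-math options, the precision define, and a hardware define; a hedged sketch of assembling such a string (make_ocl_flags is a hypothetical helper, not the library's Device::compile_string()):

#include <string>

// Assemble OpenCL JIT options from a precision define and a hardware define,
// mirroring the string that was previously built inline.
std::string make_ocl_flags(const std::string &precision_define,
                           const std::string &hw_define) {
  return "-cl-fast-relaxed-math -cl-mad-enable " + precision_define +
         " -D" + hw_define;
}
// e.g. make_ocl_flags("-D_SINGLE_SINGLE", "KEPLER_OCL")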

View File

@ -97,11 +97,11 @@ float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen,
const int nzhi_out, float **rho_coeff,
float **vd_brick, const double slab_volfactor,
const int nx_pppm, const int ny_pppm, const int nz_pppm,
const bool split, int &success) {
const bool split, const bool respa, int &success) {
float *b=pppm_gpu_init(PPPMF,nlocal,nall,screen,order,nxlo_out,nylo_out,
nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,
slab_volfactor,nx_pppm,ny_pppm,nz_pppm,split,success);
if (split==false)
if (split==false && respa==false)
PPPMF.device->set_single_precompute(&PPPMF);
return b;
}
@ -139,12 +139,13 @@ double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen,
const int nzhi_out, double **rho_coeff,
double **vd_brick, const double slab_volfactor,
const int nx_pppm, const int ny_pppm,
const int nz_pppm, const bool split, int &success) {
const int nz_pppm, const bool split,
const bool respa, int &success) {
double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out,
nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,
vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm,
split,success);
if (split==false)
if (split==false && respa==false)
PPPMD.device->set_double_precompute(&PPPMD);
return b;
}

View File

@ -96,19 +96,27 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
// OCL_VENDOR: preprocessor define for hardware
// OCL_DEFAULT_VENDOR: preprocessor define for hardware
// specific sizes of OpenCL kernel related constants
#ifdef FERMI_OCL
#define OCL_VENDOR "FERMI_OCL"
#define OCL_DEFAULT_VENDOR "fermi"
#endif
#ifdef KEPLER_OCL
#define OCL_DEFAULT_VENDOR "kepler"
#endif
#ifdef CYPRESS_OCL
#define OCL_VENDOR "CYPRESS_OCL"
#define OCL_DEFAULT_VENDOR "cypress"
#endif
#ifndef OCL_VENDOR
#define OCL_VENDOR "GENERIC_OCL"
#ifdef GENERIC_OCL
#define OCL_DEFAULT_VENDOR "generic"
#endif
#ifndef OCL_DEFAULT_VENDOR
#define OCL_DEFAULT_VENDOR "none"
#endif
#endif

View File

@ -214,6 +214,30 @@ typedef struct _double4 double4;
#endif
// -------------------------------------------------------------------------
// NVIDIA GENERIC OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef NV_GENERIC_OCL
#define USE_OPENCL
#define fast_mul mul24
#define MEM_THREADS 16
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#define BLOCK_NBOR_BUILD 64
#define BLOCK_BIO_PAIR 64
#define WARP_SIZE 32
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#endif
// -------------------------------------------------------------------------
// NVIDIA FERMI OPENCL DEFINITIONS
// -------------------------------------------------------------------------
@ -221,9 +245,6 @@ typedef struct _double4 double4;
#ifdef FERMI_OCL
#define USE_OPENCL
#define fast_mul(X,Y) (X)*(Y)
#define ARCH 0
#define DRIVER 0
#define MEM_THREADS 32
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
@ -238,7 +259,54 @@ typedef struct _double4 double4;
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
// -------------------------------------------------------------------------
// NVIDIA KEPLER OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef KEPLER_OCL
#define USE_OPENCL
#define MEM_THREADS 32
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_PAIR 256
#define MAX_SHARED_TYPES 11
#define BLOCK_NBOR_BUILD 128
#define BLOCK_BIO_PAIR 256
#define BLOCK_ELLIPSE 128
#define WARP_SIZE 32
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#ifndef NO_OCL_PTX
#define ARCH 300
#ifdef _SINGLE_SINGLE
inline float shfl_xor(float var, int laneMask, int width) {
float ret;
int c;
c = ((WARP_SIZE-width) << 8) | 0x1f;
asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c));
return ret;
}
#else
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
inline double shfl_xor(double var, int laneMask, int width) {
int c = ((WARP_SIZE-width) << 8) | 0x1f;
int x,y,x2,y2;
double ans;
asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(x2) : "r"(x), "r"(laneMask), "r"(c));
asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(y2) : "r"(y), "r"(laneMask), "r"(c));
asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
return ans;
}
#endif
#endif
#endif
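
Editor's note: the shfl_xor helpers added for Kepler above wrap the PTX butterfly shuffle, which lets a thread exchange a register with the lane whose index differs by a given XOR mask. Their usual purpose is an in-warp reduction of per-thread partial sums; a host-side C++ emulation of that XOR-butterfly access pattern (illustration only, not device code) is:

#include <cstdio>
#include <vector>

// After log2(width) rounds every lane holds the sum over all lanes, matching
// a device loop of: for (s = width/2; s > 0; s >>= 1) x += shfl_xor(x, s, width);
static void butterfly_reduce(std::vector<float> &lanes) {
  const int width = static_cast<int>(lanes.size());   // assumed power of two
  for (int s = width / 2; s > 0; s >>= 1) {
    std::vector<float> next(lanes);
    for (int lane = 0; lane < width; ++lane)
      next[lane] = lanes[lane] + lanes[lane ^ s];      // exchange with lane^s
    lanes.swap(next);
  }
}

int main() {
  std::vector<float> partial = {1.f, 2.f, 3.f, 4.f};   // per-lane partial sums
  butterfly_reduce(partial);
  std::printf("%g\n", partial[0]);                     // prints 10
}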
@ -249,9 +317,6 @@ typedef struct _double4 double4;
#ifdef CYPRESS_OCL
#define USE_OPENCL
#define fast_mul(X,Y) (X)*(Y)
#define ARCH 0
#define DRIVER 0
#define MEM_THREADS 32
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
@ -266,12 +331,6 @@ typedef struct _double4 double4;
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#if defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#endif
#endif
// -------------------------------------------------------------------------
@ -281,9 +340,6 @@ typedef struct _double4 double4;
#ifdef GENERIC_OCL
#define USE_OPENCL
#define fast_mul mul24
#define ARCH 0
#define DRIVER 0
#define MEM_THREADS 16
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
@ -298,6 +354,20 @@ typedef struct _double4 double4;
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#endif
// -------------------------------------------------------------------------
// OPENCL Stuff for All Hardware
// -------------------------------------------------------------------------
#ifdef USE_OPENCL
#ifndef _SINGLE_SINGLE
#ifndef cl_khr_fp64
#ifndef cl_amd_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
#endif
#if defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
@ -306,10 +376,17 @@ typedef struct _double4 double4;
#endif
// -------------------------------------------------------------------------
// OPENCL Stuff for All Hardware
// -------------------------------------------------------------------------
#ifdef USE_OPENCL
#ifndef fast_mul
#define fast_mul(X,Y) (X)*(Y)
#endif
#ifndef ARCH
#define ARCH 0
#endif
#ifndef DRIVER
#define DRIVER 0
#endif
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)

View File

@ -74,7 +74,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
// Allocate a host write buffer for copying type data
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
@ -95,7 +95,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
dev_error.alloc(1,*(this->ucl_device));
dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY);
dev_error.zero();
// Allocate, cast and asynchronous memcpy of constant data
@ -260,6 +260,9 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
&this->_threads_per_atom);
this->time_ellipsoid3.stop();
} else {
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
this->_last_ellipse)/
(BX/this->_threads_per_atom)));
this->ans->force.zero();
this->ans->engv.zero();
this->time_nbor1.zero();
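
Editor's note: the added lines recompute the launch grid for the atoms handled by the LJ-only path. With BX threads per block and t_per_atom threads assigned to each atom, one block covers BX / t_per_atom atoms, so

GX = \left\lceil \frac{\mathrm{inum} - \mathrm{last\_ellipse}}{BX / t_{\mathrm{per\_atom}}} \right\rceil .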

View File

@ -75,7 +75,8 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *nbor_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

View File

@ -59,12 +59,12 @@
if (offset==0) { \
__global acctyp *ap1=engv+ii; \
if (eflag>0) { \
*ap1+=energy; \
*ap1+=energy*(acctyp)0.5; \
ap1+=astride; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1+=virial[i]; \
*ap1+=virial[i]*(acctyp)0.5; \
ap1+=astride; \
} \
} \
@ -104,12 +104,12 @@
if (offset==0) { \
__global acctyp *ap1=engv+ii; \
if (eflag>0) { \
*ap1+=energy; \
*ap1+=energy*(acctyp)0.5; \
ap1+=astride; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1+=virial[i]; \
*ap1+=virial[i]*(acctyp)0.5; \
ap1+=astride; \
} \
} \
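
Editor's note: the factor of one half introduced in the per-atom energy and virial accumulation above is what one would apply when each pair's contribution ends up accumulated on both atoms involved (here plausibly because the ellipsoid-sphere and sphere-ellipsoid kernels each visit the same pair once); under that assumption the total is preserved:

E_{\mathrm{tot}} = \sum_i E_i = \sum_i \frac{1}{2} \sum_{j \ne i} e_{ij} = \sum_{i<j} e_{ij} .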
@ -173,7 +173,8 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *nbor_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -424,7 +425,8 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *nbor_end;
int j, numj, n_stride;
int j, numj;
__local int n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj,
n_stride,nbor_end,nbor);
@ -615,7 +617,8 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -708,7 +711,8 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -88,7 +88,7 @@ int TableT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<int> host_write_int(lj_types*lj_types,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write_int[i] = 0;
@ -113,7 +113,7 @@ int TableT::init(const int ntypes,
ucl_copy(nmask,host_write_int,false);
UCL_H_Vec<numtyp4> host_write(lj_types*lj_types,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
for (int ix=1; ix<ntypes; ix++)
@ -127,7 +127,7 @@ int TableT::init(const int ntypes,
// Allocate tablength arrays
UCL_H_Vec<numtyp4> host_write2(_ntables*_tablength,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<_ntables*_tablength; i++) {
host_write2[i].x = 0.0;
host_write2[i].y = 0.0;
@ -190,7 +190,7 @@ int TableT::init(const int ntypes,
ucl_copy(coeff4,host_write2,false);
UCL_H_Vec<numtyp> host_rsq(lj_types*lj_types,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq);

View File

@ -74,7 +74,8 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -171,7 +172,8 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -268,7 +270,8 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -369,7 +372,8 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -470,7 +474,8 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -578,7 +583,8 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -688,7 +694,8 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -794,7 +801,8 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

Some files were not shown because too many files have changed in this diff.