git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@10667 f3b2605a-c512-4ea7-a41b-209d697bcdaa

commit 402d1a8605 (parent 4047288238)
Author: sjplimp
Date:   2013-08-23 14:41:20 +00:00
105 changed files with 2003 additions and 954 deletions

View File

@ -4,9 +4,14 @@
# which file will be copied to Makefile.lammps
EXTRAMAKE = Makefile.lammps.standard
EXTRAMAKE = Makefile.lammps.opencl
OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi
# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler
# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress
# OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device
OCL_CPP = mpic++ $(DEFAULT_DEVICE) -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
OCL_LINK = -lOpenCL
OCL_PREC = -D_SINGLE_SINGLE
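
The OCL_TUNE setting selects a device-specific define at compile time. As a hedged illustration only (the block-size values below are hypothetical, not taken from lal_preprocessor.h), such a define is typically consumed by the preprocessor to pick kernel launch parameters:

// Hypothetical sketch: map the Makefile's OCL_TUNE define to a launch parameter.
// The block sizes are illustrative placeholders, not the library's real values.
#if defined(KEPLER_OCL)
  #define BLOCK_PAIR 256
#elif defined(FERMI_OCL) || defined(CYPRESS_OCL)
  #define BLOCK_PAIR 128
#else  /* GENERIC_OCL or no tuning define */
  #define BLOCK_PAIR 64
#endif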

View File

@ -4,7 +4,12 @@
# which file will be copied to Makefile.lammps
EXTRAMAKE = Makefile.lammps.standard
EXTRAMAKE = Makefile.lammps.mac_ocl
OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi
# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler
# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress
# OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device
OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT
OCL_LINK = -framework OpenCL

View File

@ -28,7 +28,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_neighbor.o $(OBJ_DIR)/lal_neighbor_shared.o \
$(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
$(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
$(OBJ_DIR)/lal_base_dipole.o \
$(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \
$(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
$(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
$(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@ -59,7 +59,12 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
$(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
$(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o
$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o \
$(OBJ_DIR)/lal_sw.o $(OBJ_DIR)/lal_sw_ext.o \
$(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o
CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
@ -99,7 +104,12 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \
$(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \
$(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \
$(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h
$(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h \
$(OBJ_DIR)/sw.cubin $(OBJ_DIR)/sw_cubin.h \
$(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck_cubin.h \
$(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \
$(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \
$(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h
all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
@ -175,6 +185,9 @@ $(OBJ_DIR)/lal_base_ellipsoid.o: $(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoi
$(OBJ_DIR)/lal_base_dipole.o: $(ALL_H) lal_base_dipole.h lal_base_dipole.cpp
$(CUDR) -o $@ -c lal_base_dipole.cpp
$(OBJ_DIR)/lal_base_three.o: $(ALL_H) lal_base_three.h lal_base_three.cpp
$(CUDR) -o $@ -c lal_base_three.cpp
$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu
@ -571,6 +584,66 @@ $(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/c
$(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/sw.cubin: lal_sw.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_sw.cu
$(OBJ_DIR)/sw_cubin.h: $(OBJ_DIR)/sw.cubin
$(BIN2C) -c -n sw $(OBJ_DIR)/sw.cubin > $(OBJ_DIR)/sw_cubin.h
$(OBJ_DIR)/lal_sw.o: $(ALL_H) lal_sw.h lal_sw.cpp $(OBJ_DIR)/sw_cubin.h $(OBJ_DIR)/lal_base_three.o
$(CUDR) -o $@ -c lal_sw.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_sw_ext.o: $(ALL_H) lal_sw.h lal_sw_ext.cpp lal_base_three.h
$(CUDR) -o $@ -c lal_sw_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/beck.cubin: lal_beck.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_beck.cu
$(OBJ_DIR)/beck_cubin.h: $(OBJ_DIR)/beck.cubin
$(BIN2C) -c -n beck $(OBJ_DIR)/beck.cubin > $(OBJ_DIR)/beck_cubin.h
$(OBJ_DIR)/lal_beck.o: $(ALL_H) lal_beck.h lal_beck.cpp $(OBJ_DIR)/beck_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_beck.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_beck_ext.o: $(ALL_H) lal_beck.h lal_beck_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_beck_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/mie.cubin: lal_mie.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_mie.cu
$(OBJ_DIR)/mie_cubin.h: $(OBJ_DIR)/mie.cubin
$(BIN2C) -c -n mie $(OBJ_DIR)/mie.cubin > $(OBJ_DIR)/mie_cubin.h
$(OBJ_DIR)/lal_mie.o: $(ALL_H) lal_mie.h lal_mie.cpp $(OBJ_DIR)/mie_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_mie.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_mie_ext.o: $(ALL_H) lal_mie.h lal_mie_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_mie_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/soft.cubin: lal_soft.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_soft.cu
$(OBJ_DIR)/soft_cubin.h: $(OBJ_DIR)/soft.cubin
$(BIN2C) -c -n soft $(OBJ_DIR)/soft.cubin > $(OBJ_DIR)/soft_cubin.h
$(OBJ_DIR)/lal_soft.o: $(ALL_H) lal_soft.h lal_soft.cpp $(OBJ_DIR)/soft_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_soft.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_soft_ext.o: $(ALL_H) lal_soft.h lal_soft_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_soft_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_coul_msm.cubin: lal_lj_coul_msm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul_msm.cu
$(OBJ_DIR)/lj_coul_msm_cubin.h: $(OBJ_DIR)/lj_coul_msm.cubin
$(BIN2C) -c -n lj_coul_msm $(OBJ_DIR)/lj_coul_msm.cubin > $(OBJ_DIR)/lj_coul_msm_cubin.h
$(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(OBJ_DIR)/lj_coul_msm_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_coul_msm.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda

View File

@ -17,7 +17,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_neighbor_shared.o $(OBJ_DIR)/lal_neighbor.o \
$(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
$(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
$(OBJ_DIR)/lal_base_dipole.o \
$(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \
$(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
$(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
$(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@ -48,7 +48,12 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
$(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
$(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o
$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o \
$(OBJ_DIR)/lal_sw.o $(OBJ_DIR)/lal_sw_ext.o \
$(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
$(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
$(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
$(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o
KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
$(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
@ -68,7 +73,9 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
$(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/dipole_lj_cl.h \
$(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/colloid_cl.h \
$(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \
$(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h
$(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \
$(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \
$(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h
OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@ -117,6 +124,9 @@ $(OBJ_DIR)/lal_base_ellipsoid.o: $(OCL_H) lal_base_ellipsoid.h lal_base_ellipsoi
$(OBJ_DIR)/lal_base_dipole.o: $(OCL_H) lal_base_dipole.h lal_base_dipole.cpp
$(OCL) -o $@ -c lal_base_dipole.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_base_three.o: $(OCL_H) lal_base_three.h lal_base_three.cpp
$(OCL) -o $@ -c lal_base_three.cpp
$(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h;
@ -405,6 +415,51 @@ $(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/
$(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/sw_cl.h: lal_sw.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh sw $(PRE1_H) lal_sw.cu $(OBJ_DIR)/sw_cl.h;
$(OBJ_DIR)/lal_sw.o: $(ALL_H) lal_sw.h lal_sw.cpp $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/lal_base_three.o
$(OCL) -o $@ -c lal_sw.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_sw_ext.o: $(ALL_H) lal_sw.h lal_sw_ext.cpp lal_base_three.h
$(OCL) -o $@ -c lal_sw_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/beck_cl.h: lal_beck.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh beck $(PRE1_H) lal_beck.cu $(OBJ_DIR)/beck_cl.h;
$(OBJ_DIR)/lal_beck.o: $(ALL_H) lal_beck.h lal_beck.cpp $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_beck.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_beck_ext.o: $(ALL_H) lal_beck.h lal_beck_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_beck_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/mie_cl.h: lal_mie.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh mie $(PRE1_H) lal_mie.cu $(OBJ_DIR)/mie_cl.h;
$(OBJ_DIR)/lal_mie.o: $(ALL_H) lal_mie.h lal_mie.cpp $(OBJ_DIR)/mie_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_mie.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_mie_ext.o: $(ALL_H) lal_mie.h lal_mie_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_mie_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/soft_cl.h: lal_soft.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh soft $(PRE1_H) lal_soft.cu $(OBJ_DIR)/soft_cl.h;
$(OBJ_DIR)/lal_soft.o: $(ALL_H) lal_soft.h lal_soft.cpp $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_soft.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_soft_ext.o: $(ALL_H) lal_soft.h lal_soft_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_soft_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_coul_msm_cl.h: lal_lj_coul_msm.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh lj_coul_msm $(PRE1_H) lal_lj_coul_msm.cu $(OBJ_DIR)/lj_coul_msm_cl.h;
$(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(OBJ_DIR)/lj_coul_msm_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_lj_coul_msm.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)

View File

@ -3,6 +3,7 @@
--------------------------------
W. Michael Brown (ORNL)
Trung Dac Nguyen (ORNL)
Peng Wang (NVIDIA)
Axel Kohlmeyer (Temple)
Steve Plimpton (SNL)
@ -60,6 +61,8 @@ devices on your system. A Makefile for OpenCL compilation is provided,
but the developers do not currently support the use of OpenCL.
Details of the implementation are provided in:
----
Brown, W.M., Wang, P., Plimpton, S.J., Tharrington, A.N. Implementing
Molecular Dynamics on Hybrid High Performance Computers - Short Range
Forces. Computer Physics Communications. 2011. 182: p. 898-911.
@ -68,28 +71,64 @@ and
Brown, W.M., Kohlmeyer, A., Plimpton, S.J., Tharrington, A.N. Implementing
Molecular Dynamics on Hybrid High Performance Computers - Particle-Particle
Particle-Mesh. Computer Physics Communications. 2011. In press.
Particle-Mesh. Computer Physics Communications. 2012. 183: p. 449-459.
and
Brown, W.M., Masako, Y. Implementing Molecular Dynamics on Hybrid High
Performance Computers - Three-Body Potentials. Computer Physics Communications.
2013. In press.
----
NOTE: Installation of the CUDA SDK is not required.
Current styles supporting GPU acceleration:
1. lj/cut
2. lj96/cut
3. lj/expand
4. lj/cut/coul/cut
5. lj/cut/coul/long
6. lj/charmm/coul/long
7. lj/class2
8. lj/class2/coul/long
9. morse
10. cg/cmm
11. cg/cmm/coul/long
12. coul/long
13. gayberne
14. resquared
15. pppm
1 beck
2 born/coul/long
3 born/coul/wolf
4 born
5 buck/coul/cut
6 buck/coul/long
7 buck
8 colloid
9 coul/dsf
10 coul/long
11 eam/alloy
12 eam/fs
13 eam
14 eam/lj
15 gauss
16 gayberne
17 lj96/cut
18 lj/charmm/coul/long
19 lj/class2/coul/long
20 lj/class2
21 lj/cut/coul/cut
22 lj/cut/coul/debye
23 lj/cut/coul/dsf
24 lj/cut/coul/long
25 lj/cut/coul/msm
26 lj/cut/coul/wolf/fsw
27 lj/cut/dipole/cut
28 lj/cut
29 lj/cut/tgpu
30 lj/expand
31 lj/sdk/coul/long
32 cg/cmm/coul/long
33 lj/sdk
34 cg/cmm
35 lj/sf/dipole/sf
36 mie/cut
37 morse
38 resquared
39 soft
40 sw
41 table
42 yukawa/colloid
43 yukawa
44 pppm
MULTIPLE LAMMPS PROCESSES
@ -170,3 +209,4 @@ make yes-asphere
make yes-kspace
make yes-gpu
make linux

View File

@ -1 +1 @@
Geryon Version 12.033
Geryon Version 13.209

View File

@ -65,15 +65,19 @@ class UCL_Device {
public:
/// Collect properties for every GPU on the node
/** \note You must set the active GPU with set() before using the device **/
UCL_Device();
inline UCL_Device();
~UCL_Device();
inline ~UCL_Device();
/// Returns 1 (For compatibility with OpenCL)
inline int num_platforms() { return 1; }
/// Return a string with name and info of the current platform
std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA Driver"; }
inline std::string platform_name()
{ return "NVIDIA Corporation NVIDIA CUDA Driver"; }
/// Delete any contexts/data and set the platform number to be used
inline int set_platform(const int pid);
/// Return the number of devices that support CUDA
inline int num_devices() { return _properties.size(); }
@ -81,8 +85,12 @@ class UCL_Device {
/// Set the CUDA device to the specified device number
/** A context and default command queue will be created for the device
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
* be allocated for use **/
int set(int num);
* be allocated for use. clear() is called to delete any contexts and
* associated data from previous calls to set(). **/
inline int set(int num);
/// Delete any context and associated data stored from a call to set()
inline void clear();
/// Get the current device number
inline int device_num() { return _device; }
@ -147,16 +155,17 @@ class UCL_Device {
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
/// Returns true if double precision is supported for the current device
bool double_precision() { return double_precision(_device); }
inline bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device
bool double_precision(const int i) {return arch(i)>=1.3;}
inline bool double_precision(const int i) {return arch(i)>=1.3;}
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
/// Get the number of cores
inline unsigned cores(const int i)
{ if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
else if (arch(i)<3.0) return _properties[i].multiProcessorCount*32;
else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32;
else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48;
else return _properties[i].multiProcessorCount*192; }
/// Get the gigabytes of global memory in the current device
@ -216,8 +225,34 @@ class UCL_Device {
inline bool sharing_supported(const int i)
{ return (_properties[i].computeMode == CU_COMPUTEMODE_DEFAULT); }
/// True if splitting device into equal subdevices supported
inline bool fission_equal()
{ return fission_equal(_device); }
/// True if splitting device into equal subdevices supported
inline bool fission_equal(const int i)
{ return false; }
/// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts()
{ return fission_by_counts(_device); }
/// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts(const int i)
{ return false; }
/// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity()
{ return fission_by_affinity(_device); }
/// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity(const int i)
{ return false; }
/// Maximum number of subdevices allowed from device fission
inline int max_sub_devices()
{ return max_sub_devices(_device); }
/// Maximum number of subdevices allowed from device fission
inline int max_sub_devices(const int i)
{ return 0; }
/// List all devices along with all properties
void print_all(std::ostream &out);
inline void print_all(std::ostream &out);
private:
int _device, _num_devices;
@ -228,7 +263,7 @@ class UCL_Device {
};
// Grabs the properties for all devices
inline UCL_Device::UCL_Device() {
UCL_Device::UCL_Device() {
CU_SAFE_CALL_NS(cuInit(0));
CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices));
for (int dev=0; dev<_num_devices; ++dev) {
@ -280,22 +315,21 @@ inline UCL_Device::UCL_Device() {
_cq.back()=0;
}
inline UCL_Device::~UCL_Device() {
if (_device>-1) {
for (int i=1; i<num_queues(); i++) pop_command_queue();
cuCtxDestroy(_context);
}
UCL_Device::~UCL_Device() {
clear();
}
int UCL_Device::set_platform(const int pid) {
clear();
#ifdef UCL_DEBUG
assert(pid<num_platforms());
#endif
return UCL_SUCCESS;
}
// Set the CUDA device to the specified device number
inline int UCL_Device::set(int num) {
if (_device==num)
return UCL_SUCCESS;
if (_device>-1) {
CU_SAFE_CALL_NS(cuCtxDestroy(_context));
for (int i=1; i<num_queues(); i++) pop_command_queue();
_cq[0]=0;
}
int UCL_Device::set(int num) {
clear();
_device=_properties[num].device_id;
CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
CUresult err=cuCtxCreate(&_context,0,_cu_device);
@ -310,8 +344,16 @@ inline int UCL_Device::set(int num) {
return UCL_SUCCESS;
}
void UCL_Device::clear() {
if (_device>-1) {
for (int i=1; i<num_queues(); i++) pop_command_queue();
cuCtxDestroy(_context);
}
_device=-1;
}
// List all devices along with all properties
inline void UCL_Device::print_all(std::ostream &out) {
void UCL_Device::print_all(std::ostream &out) {
#if CUDA_VERSION >= 2020
int driver_version;
cuDriverGetVersion(&driver_version);
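
These nvd_device.h changes move context teardown into clear(), which set() and the destructor now call. A minimal usage sketch of the revised API (the include path and the ucl_cudadr namespace are assumptions based on the surrounding library layout):

// Sketch: select a CUDA device through the revised Geryon API, then release it.
#include "nvd_device.h"
using namespace ucl_cudadr;

int pick_first_device() {
  UCL_Device dev;                  // gathers properties for every CUDA device
  if (dev.num_devices() == 0) return -1;
  if (dev.set(0) != UCL_SUCCESS)   // creates context + default queue; calls clear() first
    return -1;
  // ... allocate matrices, launch kernels on dev.cq() ...
  dev.clear();                     // drops the context; also invoked by the destructor
  return 0;
}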

View File

@ -377,6 +377,10 @@ class UCL_Kernel {
#endif
}
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return _cq; }
/// Change the default command queue associated with matrix
inline void cq(command_queue &cq_in) { _cq=cq_in; }
#include "ucl_arg_kludge.h"
private:
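
The new cq() accessors on UCL_Kernel let a kernel be bound to a non-default command queue. A hedged sketch follows; push_command_queue() and the indexed dev.cq(1) accessor are assumed from the rest of Geryon, not shown in this hunk:

// Sketch: bind a kernel to a secondary stream instead of the device default.
UCL_Device dev;
dev.set(0);
dev.push_command_queue();      // assumed counterpart of pop_command_queue()
UCL_Kernel k;
// ... k.set_function(program, "my_kernel") on a compiled UCL_Program ...
k.cq(dev.cq(1));               // accessor added in this change
k.run();                       // now enqueues on queue 1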

View File

@ -47,14 +47,14 @@ typedef CUdeviceptr device_ptr;
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind) {
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
CUresult err=CUDA_SUCCESS;
if (kind==UCL_RW_OPTIMIZED)
err=cuMemAllocHost((void **)mat.host_ptr(),n);
else if (kind==UCL_WRITE_OPTIMIZED)
if (kind==UCL_NOT_PINNED)
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (kind==UCL_WRITE_ONLY)
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
else
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
err=cuMemAllocHost((void **)mat.host_ptr(),n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
@ -63,14 +63,14 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind) {
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
CUresult err=CUDA_SUCCESS;
if (kind==UCL_RW_OPTIMIZED)
err=cuMemAllocHost((void **)mat.host_ptr(),n);
else if (kind==UCL_WRITE_OPTIMIZED)
if (kind==UCL_NOT_PINNED)
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (kind==UCL_WRITE_ONLY)
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
else
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
err=cuMemAllocHost((void **)mat.host_ptr(),n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
mat.cq()=dev.cq();
@ -78,8 +78,10 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
}
template <class mat_type>
inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
if (kind!=UCL_NOT_PINNED)
inline void _host_free(mat_type &mat) {
if (mat.kind()==UCL_VIEW)
return;
else if (mat.kind()!=UCL_NOT_PINNED)
CU_DESTRUCT_CALL(cuMemFreeHost(mat.begin()));
else
free(mat.begin());
@ -87,14 +89,14 @@ inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
template <class mat_type>
inline int _host_resize(mat_type &mat, const size_t n) {
_host_free(mat,mat.kind());
_host_free(mat);
CUresult err=CUDA_SUCCESS;
if (mat.kind()==UCL_RW_OPTIMIZED)
err=cuMemAllocHost((void **)mat.host_ptr(),n);
else if (mat.kind()==UCL_WRITE_OPTIMIZED)
if (mat.kind()==UCL_NOT_PINNED)
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (mat.kind()==UCL_WRITE_ONLY)
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
else
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
err=cuMemAllocHost((void **)mat.host_ptr(),n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
@ -155,7 +157,8 @@ inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
template <class mat_type>
inline void _device_free(mat_type &mat) {
CU_DESTRUCT_CALL(cuMemFree(mat.cbegin()));
if (mat.kind()!=UCL_VIEW)
CU_DESTRUCT_CALL(cuMemFree(mat.cbegin()));
}
template <class mat_type>
@ -229,13 +232,13 @@ inline void _host_zero(void *ptr, const size_t n) {
}
template <class mat_type>
inline void _device_zero(mat_type &mat, const size_t n) {
inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
if (n%32==0)
CU_SAFE_CALL(cuMemsetD32(mat.cbegin(),0,n/4));
CU_SAFE_CALL(cuMemsetD32Async(mat.cbegin(),0,n/4,cq));
else if (n%16==0)
CU_SAFE_CALL(cuMemsetD16(mat.cbegin(),0,n/2));
CU_SAFE_CALL(cuMemsetD16Async(mat.cbegin(),0,n/2,cq));
else
CU_SAFE_CALL(cuMemsetD8(mat.cbegin(),0,n));
CU_SAFE_CALL(cuMemsetD8Async(mat.cbegin(),0,n,cq));
}
// --------------------------------------------------------------------------
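
_device_zero now enqueues the memset asynchronously on a caller-supplied stream instead of the default queue. A brief, hedged sketch of the calling pattern (UCL_D_Vec and its alloc() signature are assumptions; user code would normally reach this helper through the container's zero() wrapper):

// Sketch: zero a device buffer asynchronously, then wait before using it.
UCL_D_Vec<float> v;                              // device vector (assumed type)
v.alloc(1024, dev);                              // 1024 floats on the active device
_device_zero(v, 1024*sizeof(float), dev.cq());   // enqueued; returns immediately
// ... enqueue independent work here ...
ucl_sync(dev.cq());                              // block until the zero-fill completes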

View File

@ -51,6 +51,10 @@ inline void ucl_sync(cl_command_queue &cq) {
CL_SAFE_CALL(clFinish(cq));
}
inline bool _shared_mem_device(cl_device_type &device_type) {
return (device_type==CL_DEVICE_TYPE_CPU);
}
struct OCLProperties {
std::string name;
cl_device_type device_type;
@ -64,6 +68,10 @@ struct OCLProperties {
bool double_precision;
int alignment;
size_t timer_resolution;
bool ecc_support;
std::string c_version;
bool partition_equal, partition_counts, partition_affinity;
cl_uint max_sub_devices;
};
/// Class for looking at data parallel device properties
@ -74,15 +82,18 @@ class UCL_Device {
public:
/// Collect properties for every device on the node
/** \note You must set the active GPU with set() before using the device **/
UCL_Device();
inline UCL_Device();
~UCL_Device();
inline ~UCL_Device();
/// Return the number of platforms (0 if error or no platforms)
inline int num_platforms() { return _num_platforms; }
/// Return a string with name and info of the current platform
std::string platform_name();
inline std::string platform_name();
/// Delete any contexts/data and set the platform number to be used
inline int set_platform(const int pid);
/// Return the number of devices that support OpenCL
inline int num_devices() { return _num_devices; }
@ -90,8 +101,12 @@ class UCL_Device {
/// Set the OpenCL device to the specified device number
/** A context and default command queue will be created for the device *
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
* be allocated for use **/
int set(int num);
* be allocated for use. clear() is called to delete any contexts and
* associated data from previous calls to set(). **/
inline int set(int num);
/// Delete any context and associated data stored from a call to set()
inline void clear();
/// Get the current device number
inline int device_num() { return _device; }
@ -161,12 +176,14 @@ class UCL_Device {
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
inline bool shared_memory(const int i)
{ return _shared_mem_device(_properties[i].device_type); }
/// Returns true if double precision is supported for the current device
bool double_precision() { return double_precision(_device); }
inline bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device
bool double_precision(const int i) {return _properties[i].double_precision;}
inline bool double_precision(const int i)
{return _properties[i].double_precision;}
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
@ -227,8 +244,34 @@ class UCL_Device {
inline bool sharing_supported(const int i)
{ return true; }
/// True if splitting device into equal subdevices supported
inline bool fission_equal()
{ return fission_equal(_device); }
/// True if splitting device into equal subdevices supported
inline bool fission_equal(const int i)
{ return _properties[i].partition_equal; }
/// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts()
{ return fission_by_counts(_device); }
/// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts(const int i)
{ return _properties[i].partition_counts; }
/// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity()
{ return fission_by_affinity(_device); }
/// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity(const int i)
{ return _properties[i].partition_affinity; }
/// Maximum number of subdevices allowed from device fission
inline int max_sub_devices()
{ return max_sub_devices(_device); }
/// Maximum number of subdevices allowed from device fission
inline int max_sub_devices(const int i)
{ return _properties[i].max_sub_devices; }
/// List all devices along with all properties
void print_all(std::ostream &out);
inline void print_all(std::ostream &out);
/// Return the OpenCL type for the device
inline cl_device_id & cl_device() { return _cl_device; }
@ -237,7 +280,8 @@ class UCL_Device {
int _num_platforms; // Number of platforms
int _platform; // UCL_Device ID for current platform
cl_platform_id _cl_platform; // OpenCL ID for current platform
cl_context _context; // Context used for accessing the device
cl_platform_id _cl_platforms[20]; // OpenCL IDs for all platforms
cl_context _context; // Context used for accessing the device
std::vector<cl_command_queue> _cq;// The default command queue for this device
int _device; // UCL_Device ID for current device
cl_device_id _cl_device; // OpenCL ID for current device
@ -245,24 +289,18 @@ class UCL_Device {
int _num_devices; // Number of devices
std::vector<OCLProperties> _properties; // Properties for each device
void add_properties(cl_device_id);
int create_context();
inline void add_properties(cl_device_id);
inline int create_context();
int _default_cq;
};
// Grabs the properties for all devices
inline UCL_Device::UCL_Device() {
cl_int errorv;
cl_uint nplatforms;
_cl_device=0;
UCL_Device::UCL_Device() {
_device=-1;
_num_devices=0;
_platform=0;
_default_cq=0;
// --- Get Number of Platforms
errorv=clGetPlatformIDs(1,&_cl_platform,&nplatforms);
cl_uint nplatforms;
cl_int errorv=clGetPlatformIDs(20,_cl_platforms,&nplatforms);
if (errorv!=CL_SUCCESS) {
_num_platforms=0;
@ -270,6 +308,38 @@ inline UCL_Device::UCL_Device() {
} else
_num_platforms=static_cast<int>(nplatforms);
set_platform(0);
}
UCL_Device::~UCL_Device() {
clear();
}
void UCL_Device::clear() {
if (_device>-1) {
for (size_t i=0; i<_cq.size(); i++) {
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
_cq.pop_back();
}
CL_DESTRUCT_CALL(clReleaseContext(_context));
}
_device=-1;
}
int UCL_Device::set_platform(int pid) {
clear();
cl_int errorv;
_cl_device=0;
_device=-1;
_num_devices=0;
_default_cq=0;
#ifdef UCL_DEBUG
assert(pid<num_platforms());
#endif
_platform=pid;
_cl_platform=_cl_platforms[_platform];
// --- Get Number of Devices
cl_uint n;
@ -277,7 +347,7 @@ inline UCL_Device::UCL_Device() {
_num_devices=n;
if (errorv!=CL_SUCCESS || _num_devices==0) {
_num_devices=0;
return;
return UCL_ERROR;
}
cl_device_id device_list[_num_devices];
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list,
@ -288,19 +358,11 @@ inline UCL_Device::UCL_Device() {
_cl_devices.push_back(device_list[i]);
add_properties(device_list[i]);
}
return UCL_SUCCESS;
}
inline UCL_Device::~UCL_Device() {
if (_device>-1) {
for (size_t i=0; i<_cq.size(); i++) {
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
_cq.pop_back();
}
CL_DESTRUCT_CALL(clReleaseContext(_context));
}
}
inline int UCL_Device::create_context() {
int UCL_Device::create_context() {
cl_int errorv;
cl_context_properties props[3];
props[0]=CL_CONTEXT_PLATFORM;
@ -320,9 +382,10 @@ inline int UCL_Device::create_context() {
return UCL_SUCCESS;
}
inline void UCL_Device::add_properties(cl_device_id device_list) {
void UCL_Device::add_properties(cl_device_id device_list) {
OCLProperties op;
char buffer[1024];
cl_bool ans_bool;
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,NULL));
op.name=buffer;
@ -363,10 +426,49 @@ inline void UCL_Device::add_properties(cl_device_id device_list) {
CL_DEVICE_PROFILING_TIMER_RESOLUTION,
sizeof(size_t),&op.timer_resolution,NULL));
op.ecc_support=false;
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_ERROR_CORRECTION_SUPPORT,
sizeof(ans_bool),&ans_bool,NULL));
if (ans_bool==CL_TRUE)
op.ecc_support=true;
op.c_version="";
op.partition_equal=false;
op.partition_counts=false;
op.partition_affinity=false;
#ifdef CL_VERSION_1_2
size_t return_bytes;
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_OPENCL_C_VERSION,1024,
buffer,NULL));
op.c_version=buffer;
cl_device_partition_property pinfo[4];
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PARTITION_PROPERTIES,
4*sizeof(cl_device_partition_property),
pinfo,&return_bytes));
int nprops=return_bytes/sizeof(cl_device_partition_property);
for (int i=0; i<nprops; i++) {
if (pinfo[i]==CL_DEVICE_PARTITION_EQUALLY)
op.partition_equal=true;
else if (pinfo[i]==CL_DEVICE_PARTITION_BY_COUNTS)
op.partition_counts=true;
else if (pinfo[i]==CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN)
op.partition_affinity=true;
}
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PARTITION_MAX_SUB_DEVICES,
sizeof(cl_uint),&op.max_sub_devices,NULL));
#endif
_properties.push_back(op);
}
inline std::string UCL_Device::platform_name() {
std::string UCL_Device::platform_name() {
char info[1024];
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VENDOR,1024,info,
@ -385,7 +487,7 @@ inline std::string UCL_Device::platform_name() {
}
// Get a string telling the type of the device
inline std::string UCL_Device::device_type_name(const int i) {
std::string UCL_Device::device_type_name(const int i) {
if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
return "CPU";
else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
@ -397,7 +499,7 @@ inline std::string UCL_Device::device_type_name(const int i) {
}
// Get a string telling the type of the device
inline int UCL_Device::device_type(const int i) {
int UCL_Device::device_type(const int i) {
if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
return UCL_CPU;
else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
@ -409,17 +511,8 @@ inline int UCL_Device::device_type(const int i) {
}
// Set the CUDA device to the specified device number
inline int UCL_Device::set(int num) {
if (_device==num)
return UCL_SUCCESS;
if (_device>-1) {
for (size_t i=0; i<_cq.size(); i++) {
CL_SAFE_CALL(clReleaseCommandQueue(_cq.back()));
_cq.pop_back();
}
CL_SAFE_CALL(clReleaseContext(_context));
}
int UCL_Device::set(int num) {
clear();
cl_device_id device_list[_num_devices];
cl_uint n;
@ -432,7 +525,7 @@ inline int UCL_Device::set(int num) {
}
// List all devices along with all properties
inline void UCL_Device::print_all(std::ostream &out) {
void UCL_Device::print_all(std::ostream &out) {
if (num_devices() == 0)
out << "There is no device supporting OpenCL\n";
for (int i=0; i<num_devices(); ++i) {
@ -475,6 +568,28 @@ inline void UCL_Device::print_all(std::ostream &out) {
out << " Clock rate: "
<< clock_rate(i) << " GHz\n";
//out << " Concurrent copy and execution: ";
out << " ECC support: ";
if (_properties[i].ecc_support)
out << "Yes\n";
else
out << "No\n";
out << " Device fission into equal partitions: ";
if (fission_equal(i))
out << "Yes\n";
else
out << "No\n";
out << " Device fission by counts: ";
if (fission_by_counts(i))
out << "Yes\n";
else
out << "No\n";
out << " Device fission by affinity: ";
if (fission_by_affinity(i))
out << "Yes\n";
else
out << "No\n";
out << " Maximum subdevices from fission: "
<< max_sub_devices(i) << std::endl;
}
}
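
ocl_device.h now records ECC support, the OpenCL C version, and device-fission capabilities for every device. A hedged sketch of querying them through the new accessors (the name(i) accessor and the include path are assumptions based on the rest of the header):

// Sketch: report fission support for every OpenCL device on the node.
#include <iostream>
#include "ocl_device.h"
using namespace ucl_opencl;

void report_fission(std::ostream &out) {
  UCL_Device dev;                          // enumerates platforms and devices
  for (int i = 0; i < dev.num_devices(); ++i)
    out << dev.name(i)
        << " equal=" << dev.fission_equal(i)
        << " counts=" << dev.fission_by_counts(i)
        << " affinity=" << dev.fission_by_affinity(i)
        << " max_sub=" << dev.max_sub_devices(i) << "\n";
}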

View File

@ -134,6 +134,11 @@ class UCL_Program {
return UCL_SUCCESS;
}
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return _cq; }
/// Change the default command queue associated with matrix
inline void cq(command_queue &cq_in) { _cq=cq_in; }
friend class UCL_Kernel;
private:
bool _init_done;
@ -175,7 +180,16 @@ class UCL_Kernel {
template <class dtype>
inline void set_arg(const cl_uint index, const dtype * const arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
if (index>_num_args) _num_args=index;
if (index>_num_args) {
_num_args=index;
#ifdef UCL_DEBUG
if (_num_args>_kernel_info_nargs) {
std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
<< _kernel_info_name << std::endl;
assert(0==1);
}
#endif
}
}
/// Set a geryon container as a kernel argument.
@ -203,6 +217,13 @@ class UCL_Kernel {
inline void add_arg(const dtype * const arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
_num_args++;
#ifdef UCL_DEBUG
if (_num_args>_kernel_info_nargs) {
std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
<< _kernel_info_name << std::endl;
assert(0==1);
}
#endif
}
/// Add a geryon container as a kernel argument.
@ -289,10 +310,7 @@ class UCL_Kernel {
}
/// Run the kernel in the default command queue
inline void run() {
CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL,
_num_blocks,_block_size,0,NULL,NULL));
}
inline void run();
/// Clear any arguments associated with the kernel
inline void clear_args() { _num_args=0; }
@ -309,6 +327,12 @@ class UCL_Kernel {
cl_command_queue _cq; // The default command queue for this kernel
unsigned _num_args;
#ifdef UCL_DEBUG
std::string _kernel_info_name;
unsigned _kernel_info_nargs;
//std::string _kernel_info_args[256];
#endif
};
inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) {
@ -329,9 +353,32 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
#endif
return UCL_FUNCTION_NOT_FOUND;
}
#ifdef UCL_DEBUG
_kernel_info_name=function;
cl_uint nargs;
CL_SAFE_CALL(clGetKernelInfo(_kernel,CL_KERNEL_NUM_ARGS,sizeof(cl_uint),
&nargs,NULL));
_kernel_info_nargs=nargs;
#ifdef NOT_TEST_CL_VERSION_1_2
char tname[256];
size_t ret;
for (cl_uint i=0; i<nargs; i++) {
CL_SAFE_CALL(clGetKernelArgInfo(_kernel,i,CL_KERNEL_ARG_TYPE_NAME,256,
tname,&ret));
_kernel_info_args[i]=tname;
}
#endif
#endif
return UCL_SUCCESS;
}
void UCL_Kernel::run() {
CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL,
_num_blocks,_block_size,0,NULL,NULL));
}
} // namespace
#endif
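
With UCL_DEBUG defined, set_function() caches the kernel's declared argument count and the argument setters complain when it is exceeded. A hedged sketch of the mistake the check targets (kernel source and variable names are illustrative only):

// Sketch: the kernel below declares two parameters, so pushing a third
// argument is an error; with UCL_DEBUG the mismatch is reported as
// "TOO MANY ARGUMENTS TO OPENCL FUNCTION: scale".
// __kernel void scale(__global float *a, const float f) { ... }
UCL_Kernel k;
k.set_function(program, "scale");   // caches CL_KERNEL_NUM_ARGS == 2
k.add_arg(&a_buffer);               // argument 0
k.add_arg(&factor);                 // argument 1
k.add_arg(&extra);                  // one too many -> flagged in debug builds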

View File

@ -54,82 +54,138 @@ typedef cl_mem device_ptr;
template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind) {
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
cl_int error_flag;
cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL));
if (kind==UCL_VIEW) {
mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR,n,mat.host_ptr(),
&error_flag);
CL_CHECK_ERR(error_flag);
return UCL_SUCCESS;
}
if (kind==UCL_WRITE_OPTIMIZED) {
mat.cbegin()=clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
CL_MAP_WRITE,0,n,0,NULL,NULL,NULL);
cl_mem_flags buffer_perm;
cl_map_flags map_perm;
if (kind2==UCL_NOT_SPECIFIED) {
if (kind==UCL_READ_ONLY) {
#ifdef CL_VERSION_1_2
buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR;
#else
buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
#endif
map_perm=CL_MAP_READ;
} else if (kind==UCL_WRITE_ONLY) {
#ifdef CL_VERSION_1_2
buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR;
#else
buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
#endif
map_perm=CL_MAP_WRITE;
} else {
buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
map_perm=CL_MAP_READ | CL_MAP_WRITE;
}
} else {
mat.cbegin()=clCreateBuffer(context,
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
if (kind2==UCL_READ_ONLY)
buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
else if (kind2==UCL_WRITE_ONLY)
buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
else
buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
if (kind==UCL_READ_ONLY) {
#ifdef CL_VERSION_1_2
buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY;
#endif
map_perm=CL_MAP_READ;
} else if (kind==UCL_WRITE_ONLY) {
#ifdef CL_VERSION_1_2
buffer_perm=buffer_perm | CL_MEM_HOST_WRITE_ONLY;
#endif
map_perm=CL_MAP_WRITE;
} else
map_perm=CL_MAP_READ | CL_MAP_WRITE;
}
mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
CL_MAP_READ | CL_MAP_WRITE,
0,n,0,NULL,NULL,NULL);
}
map_perm,0,n,0,NULL,NULL,NULL);
mat.cq()=cm.cq();
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
return UCL_SUCCESS;
}
template <class mat_type, class copy_type>
inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
cl_int error_flag;
cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL));
cl_mem_flags orig_flags;
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags),
&orig_flags,NULL));
orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;
mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
mat.host_ptr(), &error_flag);
CL_CHECK_ERR(error_flag);
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
return UCL_SUCCESS;
}
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind) {
cl_int error_flag;
if (kind==UCL_VIEW) {
mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
n,mat.host_ptr(),&error_flag);
CL_CHECK_ERR(error_flag);
return UCL_SUCCESS;
}
if (kind==UCL_WRITE_OPTIMIZED) {
mat.cbegin()=clCreateBuffer(dev.context(),
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE,
CL_MAP_WRITE,0,n,0,NULL,NULL,NULL);
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
cl_mem_flags buffer_perm;
cl_map_flags map_perm;
if (kind==UCL_READ_ONLY) {
#ifdef CL_VERSION_1_2
buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR;
#else
buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
#endif
map_perm=CL_MAP_READ;
} else if (kind==UCL_WRITE_ONLY) {
#ifdef CL_VERSION_1_2
buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR;
#else
buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
#endif
map_perm=CL_MAP_WRITE;
} else {
mat.cbegin()=clCreateBuffer(dev.context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE,
CL_MAP_READ & CL_MAP_WRITE,
0,n,0,NULL,NULL,NULL);
buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
map_perm=CL_MAP_READ | CL_MAP_WRITE;
}
cl_int error_flag;
mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE,
map_perm,0,n,0,NULL,NULL,NULL);
mat.cq()=dev.cq();
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
return UCL_SUCCESS;
}
template <class mat_type>
inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
inline int _host_view(mat_type &mat, UCL_Device &dev, const size_t n) {
cl_int error_flag;
mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
n,mat.host_ptr(),&error_flag);
CL_CHECK_ERR(error_flag);
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
return UCL_SUCCESS;
}
template <class mat_type>
inline void _host_free(mat_type &mat) {
if (mat.cols()>0) {
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
}
}
template <class mat_type>
@ -138,28 +194,26 @@ inline int _host_resize(mat_type &mat, const size_t n) {
cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL));
cl_mem_flags buffer_perm;
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_FLAGS,sizeof(buffer_perm),
&buffer_perm,NULL));
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
if (mat.kind()==UCL_WRITE_OPTIMIZED) {
mat.cbegin()=clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
CL_MAP_WRITE,0,n,0,NULL,NULL,NULL);
} else {
mat.cbegin()=clCreateBuffer(context,
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
CL_MAP_READ | CL_MAP_WRITE,
0,n,0,NULL,NULL,NULL);
}
cl_map_flags map_perm;
if (mat.kind()==UCL_READ_ONLY)
map_perm=CL_MAP_READ;
else if (mat.kind()==UCL_WRITE_ONLY)
map_perm=CL_MAP_WRITE;
else
map_perm=CL_MAP_READ | CL_MAP_WRITE;
mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
map_perm,0,n,0,NULL,NULL,NULL);
return UCL_SUCCESS;
}
@ -179,9 +233,17 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
if (kind==UCL_READ_WRITE)
flag=CL_MEM_READ_WRITE;
else if (kind==UCL_READ_ONLY)
#ifdef CL_VERSION_1_2
flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
#else
flag=CL_MEM_READ_ONLY;
#endif
else if (kind==UCL_WRITE_ONLY)
#ifdef CL_VERSION_1_2
flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
#else
flag=CL_MEM_WRITE_ONLY;
#endif
else
assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
@ -200,9 +262,17 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
if (kind==UCL_READ_WRITE)
flag=CL_MEM_READ_WRITE;
else if (kind==UCL_READ_ONLY)
#ifdef CL_VERSION_1_2
flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
#else
flag=CL_MEM_READ_ONLY;
#endif
else if (kind==UCL_WRITE_ONLY)
#ifdef CL_VERSION_1_2
flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
#else
flag=CL_MEM_WRITE_ONLY;
#endif
else
assert(0==1);
mat.cbegin()=clCreateBuffer(dev.context(),flag,n,NULL,
@ -238,8 +308,10 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t rows,
template <class mat_type>
inline void _device_free(mat_type &mat) {
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
if (mat.cols()>0) {
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
}
}
template <class mat_type>
@ -255,9 +327,17 @@ inline int _device_resize(mat_type &mat, const size_t n) {
if (mat.kind()==UCL_READ_WRITE)
flag=CL_MEM_READ_WRITE;
else if (mat.kind()==UCL_READ_ONLY)
#ifdef CL_VERSION_1_2
flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
#else
flag=CL_MEM_READ_ONLY;
#endif
else if (mat.kind()==UCL_WRITE_ONLY)
#ifdef CL_VERSION_1_2
flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
#else
flag=CL_MEM_WRITE_ONLY;
#endif
else
assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
@ -285,9 +365,17 @@ inline int _device_resize(mat_type &mat, const size_t rows,
if (mat.kind()==UCL_READ_WRITE)
flag=CL_MEM_READ_WRITE;
else if (mat.kind()==UCL_READ_ONLY)
#ifdef CL_VERSION_1_2
flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY;
#else
flag=CL_MEM_READ_ONLY;
#endif
else if (mat.kind()==UCL_WRITE_ONLY)
#ifdef CL_VERSION_1_2
flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY;
#else
flag=CL_MEM_WRITE_ONLY;
#endif
else
assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag);
@ -344,7 +432,19 @@ inline void _ocl_kernel_from_source(cl_context &context, cl_device_id &device,
}
template <class mat_type>
inline void _device_zero(mat_type &mat, const size_t n) {
inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
#ifdef CL_VERSION_1_2
#ifndef __APPLE__
#define UCL_CL_ZERO
#endif
#endif
#ifdef UCL_CL_ZERO
cl_int zeroint=0;
CL_SAFE_CALL(clEnqueueFillBuffer(cq,mat.begin(),&zeroint,sizeof(cl_int),
mat.byteoff(),n,0,NULL,NULL));
#else
cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL));
@ -354,17 +454,20 @@ inline void _device_zero(mat_type &mat, const size_t n) {
const char * szero[3]={
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void _device_zero(__global NUMTYP *a)",
" { int gid=get_global_id(0); a[gid]=(NUMTYP)0; }"
"__kernel void _device_zero(__global NUMTYP *a, const int offset)",
" { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }"
};
cl_kernel kzero;
_ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero",
_UCL_DATA_ID<typename mat_type::data_type>::numtyp_flag());
cl_int offset=mat.offset();
CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin()));
CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset));
size_t kn=n/sizeof(typename mat_type::data_type);
CL_SAFE_CALL(clEnqueueNDRangeKernel(mat.cq(),kzero,1,0,&kn,0,0,0,0));
CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,0,&kn,0,0,0,0));
#endif
}
// --------------------------------------------------------------------------
@ -470,9 +573,15 @@ template <> struct _ucl_memcpy<1,0> {
cl_command_queue &cq, const cl_bool block,
const size_t dst_offset, const size_t src_offset) {
if (src.cbegin()==dst.cbegin()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 1S\n";
#endif
if (block) ucl_sync(cq);
return;
}
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 1NS\n";
#endif
CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n,
dst.begin(),0,NULL,NULL));
}
@ -484,8 +593,14 @@ template <> struct _ucl_memcpy<1,0> {
size_t dst_offset, size_t src_offset) {
if (src.cbegin()==dst.cbegin()) {
if (block) ucl_sync(cq);
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 2S\n";
#endif
return;
}
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 2NS\n";
#endif
if (spitch==dpitch && dst.cols()==src.cols() &&
src.cols()==cols/src.element_size())
CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,
@ -511,8 +626,14 @@ template <> struct _ucl_memcpy<0,1> {
const size_t dst_offset, const size_t src_offset) {
if (src.cbegin()==dst.cbegin()) {
if (block) ucl_sync(cq);
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 3S\n";
#endif
return;
}
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 3NS\n";
#endif
CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n,
src.begin(),0,NULL,NULL));
}
@ -524,8 +645,14 @@ template <> struct _ucl_memcpy<0,1> {
size_t dst_offset, size_t src_offset) {
if (src.cbegin()==dst.cbegin()) {
if (block) ucl_sync(cq);
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 4S\n";
#endif
return;
}
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 4NS\n";
#endif
if (spitch==dpitch && dst.cols()==src.cols() &&
src.cols()==cols/src.element_size())
CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,
@ -549,9 +676,17 @@ template <int mem1, int mem2> struct _ucl_memcpy {
static inline void mc(p1 &dst, const p2 &src, const size_t n,
cl_command_queue &cq, const cl_bool block,
const size_t dst_offset, const size_t src_offset) {
if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset)
if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
dst_offset,n,0,NULL,NULL));
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 6NS\n";
#endif
}
#ifdef UCL_DBG_MEM_TRACE
else std::cerr << "UCL_COPY 6S\n";
#endif
if (block==CL_TRUE) ucl_sync(cq);
}
template <class p1, class p2>
@ -561,6 +696,9 @@ template <int mem1, int mem2> struct _ucl_memcpy {
const cl_bool block,
size_t dst_offset, size_t src_offset) {
if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 7NS\n";
#endif
if (spitch==dpitch && dst.cols()==src.cols() &&
src.cols()==cols/src.element_size())
CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
@ -575,6 +713,10 @@ template <int mem1, int mem2> struct _ucl_memcpy {
dst_offset+=dpitch;
}
}
#ifdef UCL_DBG_MEM_TRACE
else std::cerr << "UCL_COPY 7S\n";
#endif
if (block==CL_TRUE) ucl_sync(cq);
}
};

View File

@ -27,6 +27,12 @@
#include "ocl_macros.h"
#include "ocl_device.h"
#ifdef CL_VERSION_1_2
#define UCL_OCL_MARKER(cq,event) clEnqueueMarkerWithWaitList(cq,0,NULL,event)
#else
#define UCL_OCL_MARKER clEnqueueMarker
#endif
namespace ucl_opencl {
/// Class for timing OpenCL events
@ -63,10 +69,10 @@ class UCL_Timer {
}
/// Start timing on default command queue
inline void start() { clEnqueueMarker(_cq,&start_event); }
inline void start() { UCL_OCL_MARKER(_cq,&start_event); }
/// Stop timing on default command queue
inline void stop() { clEnqueueMarker(_cq,&stop_event); }
inline void stop() { UCL_OCL_MARKER(_cq,&stop_event); }
/// Block until the start event has been reached on device
inline void sync_start()
@ -78,7 +84,7 @@ class UCL_Timer {
/// Set the time elapsed to zero (not the total_time)
inline void zero()
{ clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); }
{ UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); }
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
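
UCL_OCL_MARKER resolves to clEnqueueMarkerWithWaitList on OpenCL 1.2 headers and to clEnqueueMarker otherwise, so the timer compiles against either. Typical use is unchanged; a hedged sketch, where init() and time() are assumed to follow the usual Geryon timer interface:

// Sketch: time a kernel launch with markers on the kernel's command queue.
UCL_Timer t;
t.init(dev);            // assumed initializer, as in the CUDA timer backend
t.start();              // enqueues a marker via UCL_OCL_MARKER
k.run();
t.stop();               // second marker
t.sync_stop();          // block until the stop marker completes
double ms = t.time();   // elapsed time between the markers (assumed accessor)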

View File

@ -58,19 +58,36 @@
* calls for reserving and copying memory **/
class UCL_BaseMat {
public:
UCL_BaseMat() : _cq(0) { }
UCL_BaseMat() : _cq(0), _kind(UCL_VIEW) { }
virtual ~UCL_BaseMat() { }
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return _cq; }
/// Change the default command queue associated with matrix
inline void cq(command_queue &cq_in) { _cq=cq_in; }
/// Block until command_queue associated with matrix is complete
inline void sync() { ucl_sync(_cq); }
/// Return the type/permissions of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED
* or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
inline bool shared_mem_device() {
#ifdef _OCL_MAT
cl_device_id device;
CL_SAFE_CALL(clGetCommandQueueInfo(_cq,CL_QUEUE_DEVICE,
sizeof(cl_device_id),&device,NULL));
cl_device_type device_type;
CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE,
sizeof(device_type),&device_type,NULL));
return _shared_mem_device(device_type);
#else
return false;
#endif
}
#ifdef UCL_DEBUG
// Returns the type of host allocation
virtual inline enum UCL_MEMOPT kind() const { return UCL_NOT_PINNED; }
#endif
protected:
command_queue _cq;
enum UCL_MEMOPT _kind;
};
#endif
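
UCL_BaseMat now remembers its allocation kind and can report whether the device behind its command queue shares host memory. A hedged sketch of how caller code might use this to skip an explicit transfer (the helper and its policy are illustrative, not part of the library):

// Sketch: avoid a redundant host-to-device copy when the device can
// address host memory directly (e.g. an OpenCL CPU device).
template <class HostMat, class DevMat>
void push_to_device(HostMat &host, DevMat &dev_buf) {
  if (host.kind() == UCL_VIEW) return;      // nothing owned by this container
  if (host.shared_mem_device()) return;     // device reads host memory in place
  ucl_copy(dev_buf, host, false);           // otherwise enqueue a blocking copy
}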

View File

@ -102,6 +102,30 @@
// Only allow this file to be included by nvc_memory.h and ocl_memory.h
#ifdef UCL_COPY_ALLOW
// --------------------------------------------------------------------------
// - CHECK PERMISSIONS FOR SOURCE AND DESTINATION IN COPY
// --------------------------------------------------------------------------
template <class mat1, class mat2>
inline void _check_ucl_copy_perm(mat1 &dst, mat2 &src) {
if ((int)mat1::MEM_TYPE==(int)mat2::MEM_TYPE) {
if (dst.kind()==UCL_READ_ONLY) {
std::cerr << "Attempt to copy where destination is UCL_READ_ONLY\n";
assert(0==1);
} else if (src.kind()==UCL_WRITE_ONLY) {
std::cerr << "Attempt to copy where source is UCL_WRITE_ONLY\n";
assert(0==1);
}
} else {
if (dst.kind()==UCL_WRITE_ONLY) {
std::cerr << "Destination in host-device copy cannot be UCL_WRITE_ONLY\n";
assert(0==1);
} else if (src.kind()==UCL_READ_ONLY) {
std::cerr << "Source in host-device copy cannot be UCL_READ_ONLY\n";
assert(0==1);
}
}
}
// --------------------------------------------------------------------------
// - HOST-HOST COPY ROUTINES
// --------------------------------------------------------------------------
@ -117,9 +141,20 @@ template <> struct _host_host_copy<1,1> {
assert(mat1::PADDED==0 && mat2::PADDED==0);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0)
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) {
#ifdef _OCL_MAT
if (dst.begin()==src.begin()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 7S\n";
#endif
return;
}
#endif
memcpy(dst.begin(),src.begin(),numel*sizeof(typename mat1::data_type));
else
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 7NS\n";
#endif
} else
for (size_t i=0; i<numel; i++)
dst[i]=static_cast<typename mat1::data_type>(src[i]);
}
@ -138,15 +173,27 @@ template <> struct _host_host_copy<1,1> {
src_row_size=cols;
else
src_row_size=src.row_size();
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0)
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) {
#ifdef _OCL_MAT
if (dst.begin()==src.begin()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 8S\n";
#endif
return;
}
#endif
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 8NS\n";
#endif
for (size_t i=0; i<rows; i++)
memcpy(dst.begin()+i*dst_row_size,src.begin()+i*src_row_size,
cols*sizeof(typename mat1::data_type));
else
} else
for (size_t j=0; j<rows; j++) {
int dst_i=j*dst_row_size;
int d_end=dst_i+cols;
int src_i=j*src_row_size;
size_t dst_i=j*dst_row_size;
size_t d_end=dst_i+cols;
size_t src_i=j*src_row_size;
for (; dst_i<d_end; dst_i++) {
dst[dst_i]=static_cast<typename mat1::data_type>(src[src_i]);
src_i++;
@ -216,15 +263,14 @@ template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
src.row_bytes(),cols*sizeof(typename mat2::data_type),
rows);
int dst_i=0;
int buff_i=0;
size_t dst_i=0, buff_i=0, doff=dst.cols()-cols;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
buff_i++;
dst_i++;
}
dst_i+=dst.cols()-cols;
dst_i+=doff;
}
}
}
@ -255,15 +301,14 @@ template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
src.row_bytes(),cols*sizeof(typename mat2::data_type),
rows,cq);
cast_buffer.sync();
int dst_i=0;
int buff_i=0;
size_t dst_i=0, buff_i=0, doff=dst.cols()-cols;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
buff_i++;
dst_i++;
}
dst_i+=dst.cols()-cols;
dst_i+=doff;
}
}
}
@ -293,38 +338,62 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
if (mat3::VECTOR==0) {
assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
assert(dst.rows()>=rows && dst.cols()>=cols);
}
#endif
if (mat2::VECTOR) {
for (size_t i=0; i<rows*cols; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows);
if (mat3::VECTOR==0) {
size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[ci]=static_cast<typename mat3::data_type>(src[si]);
ci++;
si++;
}
ci+=co;
si+=so;
}
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(),
cols*sizeof(typename mat1::data_type),rows);
} else {
for (size_t i=0; i<rows*cols; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows);
}
} else if (mat1::VECTOR) {
int src_i=0;
int buf_i=0;
size_t src_i=0, buf_i=0, soff=src.cols()-cols;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
buf_i++;
src_i++;
}
src_i+=src.cols()-cols;
src_i+=soff;
}
ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows);
} else {
int src_i=0;
int buf_i=0;
size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch;
if (mat3::VECTOR==0) {
co=cast_buffer.cols()-cols;
spitch=cast_buffer.row_bytes();
} else {
co=0;
spitch=cols*sizeof(typename mat1::data_type);
}
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
buf_i++;
src_i++;
}
src_i+=src.cols()-cols;
src_i+=so;
buf_i+=co;
}
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
cols*sizeof(typename mat1::data_type),
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch,
cols*sizeof(typename mat1::data_type),rows);
}
}
@ -337,38 +406,62 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
if (mat3::VECTOR==0) {
assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
assert(dst.rows()>=rows && dst.cols()>=cols);
}
#endif
if (mat2::VECTOR) {
for (size_t i=0; i<rows*cols; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,dst.row_bytes(),
cast_buffer,cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows,cq);
if (mat3::VECTOR==0) {
size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[ci]=static_cast<typename mat3::data_type>(src[si]);
ci++;
si++;
}
ci+=co;
si+=so;
}
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(),
cols*sizeof(typename mat1::data_type),rows,cq);
} else {
for (size_t i=0; i<rows*cols; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,dst.row_bytes(),
cast_buffer,cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows,cq);
}
} else if (mat1::VECTOR) {
int src_i=0;
int buf_i=0;
size_t src_i=0, buf_i=0, soff=src.cols()-cols;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
buf_i++;
src_i++;
}
src_i+=src.cols()-cols;
src_i+=soff;
}
ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows,cq);
} else {
int src_i=0;
int buf_i=0;
size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch;
if (mat3::VECTOR==0) {
co=cast_buffer.cols()-cols;
spitch=cast_buffer.row_bytes();
} else {
co=0;
spitch=cols*sizeof(typename mat1::data_type);
}
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
buf_i++;
src_i++;
}
src_i+=src.cols()-cols;
src_i+=so;
buf_i+=co;
}
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
cols*sizeof(typename mat1::data_type),
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch,
cols*sizeof(typename mat1::data_type),rows,cq);
}
}
@ -444,9 +537,13 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
#endif
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,numel,cq);
else
else {
#ifdef UCL_DEBUG
_check_ucl_copy_perm(dst,src);
#endif
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer,cq);
}
}
/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
@ -463,6 +560,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
assert(dst.numel()>=numel && src.numel()>=numel);
assert(cast_buffer.numel()>=numel);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
_check_ucl_copy_perm(dst,src);
#endif
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,numel,async);
@ -491,6 +589,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
_check_ucl_copy_perm(dst,src);
#endif
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
@ -498,12 +597,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED);
cast_buffer.alloc(numel,dst,UCL_READ_ONLY);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer,cq);
} else {
UCL_H_Vec<typename mat1::data_type> cast_buffer;
cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED);
cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer,cq);
}
@ -529,6 +628,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
#ifdef UCL_DEBUG
assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
_check_ucl_copy_perm(dst,src);
#endif
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
@ -538,12 +638,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED);
cast_buffer.alloc(numel,dst,UCL_READ_ONLY);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer);
} else {
UCL_H_Vec<typename mat1::data_type> cast_buffer;
cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED);
cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer);
}
@ -574,9 +674,13 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
ucl_copy(dst,src,rows,cols,async);
else if (async)
ucl_copy(dst,src,rows,cols,dst.cq());
else
else {
#ifdef UCL_DEBUG
_check_ucl_copy_perm(dst,src);
#endif
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer);
}
}
/// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer)
@ -595,9 +699,13 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
command_queue &cq) {
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,rows,cols,cq);
else
else {
#ifdef UCL_DEBUG
_check_ucl_copy_perm(dst,src);
#endif
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer,cq);
}
}
/// Asynchronous copy of subset matrix rows,cols (memory already allocated)
@ -617,18 +725,21 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, command_queue &cq) {
#ifdef UCL_DEBUG
_check_ucl_copy_perm(dst,src);
#endif
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED);
cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer,cq);
} else {
UCL_H_Vec<typename mat1::data_type> cast_buffer;
cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED);
cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer,cq);
}
@ -678,6 +789,9 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, const bool async) {
#ifdef UCL_DEBUG
_check_ucl_copy_perm(dst,src);
#endif
if (async)
ucl_copy(dst,src,rows,cols,dst.cq());
else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
@ -686,12 +800,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED);
cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer);
} else {
UCL_H_Vec<typename mat1::data_type> cast_buffer;
cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED);
cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer);
}
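
A hedged usage sketch of the copy routines above, assuming a UCL_Device named gpu has already been set up and the Geryon headers for the active backend are included; the sizes and types are illustrative only.

  UCL_H_Vec<double> h_src(1000, gpu, UCL_READ_WRITE);   // host-side data
  UCL_D_Vec<float>  d_dst(1000, gpu, UCL_READ_WRITE);   // device data of a narrower type

  // Types differ, so the transfer casts through a host buffer.  Passing the
  // buffer explicitly avoids the temporary that ucl_copy allocates per call.
  UCL_H_Vec<float> cast_buffer(1000, gpu, UCL_WRITE_ONLY);
  ucl_cast_copy(d_dst, h_src, 1000, cast_buffer, d_dst.cq());  // async on d_dst's queue

  // When the data types match, ucl_copy alone is enough:
  UCL_D_Vec<double> d_same(1000, gpu, UCL_READ_WRITE);
  ucl_copy(d_same, h_src, 1000, d_same.cq());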

View File

@ -39,14 +39,14 @@ class UCL_D_Mat : public UCL_BaseMat {
};
typedef numtyp data_type;
UCL_D_Mat() : _rows(0), _kind(UCL_VIEW) {}
~UCL_D_Mat() { if (_kind!=UCL_VIEW) _device_free(*this); }
UCL_D_Mat() : _cols(0) {}
~UCL_D_Mat() { _device_free(*this); }
/// Construct with specified rows and cols
/** \sa alloc() **/
UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
_rows(0), _kind(UCL_VIEW) { alloc(rows,cols,device,kind); }
_cols(0) { alloc(rows,cols,device,kind); }
/// Row major matrix on device
/** The kind parameter controls memory optimizations as follows:
@ -121,15 +121,11 @@ class UCL_D_Mat : public UCL_BaseMat {
return err;
}
/// Return the type of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
@ -142,8 +138,10 @@ class UCL_D_Mat : public UCL_BaseMat {
_row_size=stride;
this->_cq=input.cq();
#ifdef _OCL_MAT
_offset=0;
_offset=input.offset();
_array=input.cbegin();
CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#else
_device_view(&_array,input.begin());
#endif
@ -157,7 +155,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols)
{ view(input,rows,cols,input.row_size()); }
@ -166,7 +164,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
@ -177,7 +175,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
@ -187,7 +185,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
@ -205,13 +203,15 @@ class UCL_D_Mat : public UCL_BaseMat {
#endif
#ifdef _OCL_MAT
_offset=0;
CL_SAFE_CALL(clRetainMemObject(input));
CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
UCL_Device &dev) { view(input,rows,cols,cols,dev); }
@ -219,7 +219,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
@ -228,7 +228,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
@ -242,7 +242,9 @@ class UCL_D_Mat : public UCL_BaseMat {
this->_cq=input.cq();
#ifdef _OCL_MAT
_array=input.begin();
_offset=offset;
_offset=offset+input.offset();
CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#else
_device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif
@ -256,7 +258,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols)
@ -266,7 +268,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
@ -277,7 +279,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
@ -292,7 +294,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
@ -309,6 +311,8 @@ class UCL_D_Mat : public UCL_BaseMat {
#ifdef _OCL_MAT
_array=input;
_offset=offset;
CL_SAFE_CALL(clRetainMemObject(input));
CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
#else
#ifdef _UCL_DEVICE_PTR_MAT
_array=input+offset*sizeof(numtyp);
@ -325,7 +329,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols, UCL_Device &dev)
@ -334,7 +338,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type input,
const size_t cols, UCL_Device &dev)
@ -342,7 +346,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Free memory and set size to 0
inline void clear()
{ _rows=0; if (_kind!=UCL_VIEW) { _kind=UCL_VIEW; _device_free(*this); } }
{ _device_free(*this); _cols=0; _kind=UCL_VIEW; }
/// Resize the allocation to contain cols elements
/** \note Cannot be used on views **/
@ -377,11 +381,17 @@ class UCL_D_Mat : public UCL_BaseMat {
{ if (cols>_cols || rows>_rows) return resize(rows,cols);
else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { _device_zero(*this,row_bytes()*_rows); }
/// Set each element to zero asynchronously in the default command_queue
inline void zero() { zero(_cq); }
/// Set first n elements to zero asynchronously in the default command_queue
inline void zero(const int n) { zero(n,_cq); }
/// Set each element to zero asynchronously
inline void zero(command_queue &cq)
{ _device_zero(*this,row_bytes()*_rows,cq); }
/// Set first n elements to zero asynchronously
inline void zero(const int n, command_queue &cq)
{ _device_zero(*this,n*sizeof(numtyp),cq); }
/// Set first n elements to zero
inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); }
#ifdef _UCL_DEVICE_PTR_MAT
/// For OpenCL, returns a (void *) device pointer to memory allocation
@ -452,7 +462,6 @@ class UCL_D_Mat : public UCL_BaseMat {
private:
size_t _pitch, _row_size, _rows, _cols;
enum UCL_MEMOPT _kind;
#ifdef _UCL_DEVICE_PTR_MAT
device_ptr _array;

View File

@ -39,14 +39,14 @@ class UCL_D_Vec : public UCL_BaseMat {
};
typedef numtyp data_type;
UCL_D_Vec() : _cols(0), _kind(UCL_VIEW) {}
~UCL_D_Vec() { if (_kind!=UCL_VIEW) _device_free(*this); }
UCL_D_Vec() : _cols(0) {}
~UCL_D_Vec() { _device_free(*this); }
/// Construct with n columns
/** \sa alloc() **/
UCL_D_Vec(const size_t n, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
_cols(0), _kind(UCL_VIEW) { alloc(n,device,kind); }
_cols(0) { alloc(n,device,kind); }
/// Set up device vector with 'cols' columns and reserve memory
/** The kind parameter controls memory optimizations as follows:
@ -119,15 +119,11 @@ class UCL_D_Vec : public UCL_BaseMat {
return err;
}
/// Return the type of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
#ifdef UCL_DEBUG
@ -139,8 +135,10 @@ class UCL_D_Vec : public UCL_BaseMat {
_row_bytes=_cols*sizeof(numtyp);
this->_cq=input.cq();
#ifdef _OCL_MAT
_offset=0;
_offset=input.offset();
_array=input.cbegin();
CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#else
_device_view(&_array,input.begin());
#endif
@ -154,7 +152,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
@ -164,7 +162,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
@ -175,7 +173,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
@ -185,7 +183,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
UCL_Device &dev) {
@ -203,13 +201,15 @@ class UCL_D_Vec : public UCL_BaseMat {
#endif
#ifdef _OCL_MAT
_offset=0;
CL_SAFE_CALL(clRetainMemObject(input));
CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
@ -219,7 +219,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
@ -228,7 +228,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols) {
@ -242,7 +242,9 @@ class UCL_D_Vec : public UCL_BaseMat {
this->_cq=input.cq();
#ifdef _OCL_MAT
_array=input.begin();
_offset=offset;
_offset=offset+input.offset();
CL_SAFE_CALL(clRetainMemObject(input.begin()));
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#else
_device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif
@ -256,7 +258,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
@ -267,7 +269,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
@ -278,7 +280,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
@ -288,7 +290,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols, UCL_Device &dev) {
@ -304,6 +306,8 @@ class UCL_D_Vec : public UCL_BaseMat {
#ifdef _OCL_MAT
_array=input;
_offset=offset;
CL_SAFE_CALL(clRetainMemObject(input));
CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
#else
#ifdef _UCL_DEVICE_PTR_MAT
_array=input+offset*sizeof(numtyp);
@ -320,7 +324,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
@ -330,7 +334,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type input,
const size_t cols, UCL_Device &dev)
@ -338,7 +342,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Free memory and set size to 0
inline void clear()
{ if (_kind!=UCL_VIEW) { _cols=0; _kind=UCL_VIEW; _device_free(*this); } }
{ _device_free(*this); _cols=0; _kind=UCL_VIEW; }
/// Resize the allocation to contain cols elements
/** \note Cannot be used on views **/
@ -373,11 +377,15 @@ class UCL_D_Vec : public UCL_BaseMat {
inline int resize_ib(const int cols)
{ if (cols>_cols) return resize(cols); else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { _device_zero(*this,row_bytes()); }
/// Set first n elements to zero
inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); }
/// Set each element to zero asynchronously in the default command_queue
inline void zero() { zero(_cq); }
/// Set first n elements to zero asynchronously in the default command_queue
inline void zero(const int n) { zero(n,_cq); }
/// Set each element to zero asynchronously
inline void zero(command_queue &cq) { _device_zero(*this,row_bytes(),cq); }
/// Set first n elements to zero asynchronously
inline void zero(const int n, command_queue &cq)
{ _device_zero(*this,n*sizeof(numtyp),cq); }
#ifdef _UCL_DEVICE_PTR_MAT
/// For OpenCL, returns a (void *) device pointer to memory allocation
@ -465,7 +473,6 @@ class UCL_D_Vec : public UCL_BaseMat {
private:
size_t _row_bytes, _row_size, _rows, _cols;
enum UCL_MEMOPT _kind;
#ifdef _UCL_DEVICE_PTR_MAT
device_ptr _array;
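
A hedged sketch of the view and asynchronous zero semantics documented above (again assuming an already-initialized UCL_Device named gpu; the offsets are illustrative):

  UCL_D_Vec<float> big(4096, gpu, UCL_READ_WRITE);
  UCL_D_Vec<float> window;
  window.view_offset(1024, big, 1, 512);   // alias elements [1024,1536) of big; no new allocation
  window.zero();                           // asynchronous memset on the default command queue
  big.zero(2048, big.cq());                // zero only the first 2048 elements on an explicit queue
  // With this commit the OpenCL path retains the underlying cl_mem and command
  // queue, so the view keeps the allocation alive; per the doc comments, the
  // CUDA path still does not.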

View File

@ -39,33 +39,35 @@ class UCL_H_Mat : public UCL_BaseMat {
};
typedef numtyp data_type;
UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) {
UCL_H_Mat() : _cols(0) {
#ifdef _OCL_MAT
_carray=(cl_mem)(0);
#endif
}
~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
~UCL_H_Mat() { _host_free(*this); }
/// Construct with specified number of rows and columns
/** \sa alloc() **/
UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED)
{ _rows=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); }
const enum UCL_MEMOPT kind=UCL_READ_WRITE)
{ _cols=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); }
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* - UCL_READ_WRITE - Specify that you will read and write from host
* - UCL_WRITE_ONLY - Specify that you will only write from host
* - UCL_READ_ONLY - Specify that you will only read from host
* - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
* \param cq Default command queue for operations copied from another mat
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
const enum UCL_MEMOPT kind=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) {
clear();
_row_bytes=cols*sizeof(numtyp);
int err=_host_alloc(*this,cq,_row_bytes*rows,kind);
int err=_host_alloc(*this,cq,_row_bytes*rows,kind,kind2);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
@ -86,17 +88,19 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* - UCL_READ_WRITE - Specify that you will read and write from host
* - UCL_WRITE_ONLY - Specify that you will only write from host
* - UCL_READ_ONLY - Specify that you will only read from host
* - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
* \param device Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
const enum UCL_MEMOPT kind=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) {
clear();
_row_bytes=cols*sizeof(numtyp);
int err=_host_alloc(*this,device,_row_bytes*rows,kind);
int err=_host_alloc(*this,device,_row_bytes*rows,kind,kind2);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
@ -115,15 +119,11 @@ class UCL_H_Mat : public UCL_BaseMat {
return err;
}
/// Return the type of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
@ -140,6 +140,8 @@ class UCL_H_Mat : public UCL_BaseMat {
_end=_array+_cols;
#ifdef _OCL_MAT
_carray=input.cbegin();
CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#endif
}
@ -147,7 +149,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols)
@ -157,7 +159,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
@ -169,9 +171,9 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* will be used for view when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input)
@ -180,7 +182,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
@ -197,14 +199,14 @@ class UCL_H_Mat : public UCL_BaseMat {
_end=_array+_cols;
#ifdef _OCL_MAT
_host_alloc(*this,dev,_row_bytes,UCL_VIEW);
_host_view(*this,dev,_row_bytes*rows);
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
@ -213,7 +215,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
@ -223,7 +225,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
@ -239,7 +241,7 @@ class UCL_H_Mat : public UCL_BaseMat {
_array=input.begin()+offset;
_end=_array+_cols;
#ifdef _OCL_MAT
_host_alloc(*this,input,_row_bytes,UCL_VIEW);
_host_view(*this,input,_row_bytes*_rows);
#endif
}
@ -247,7 +249,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
@ -258,7 +260,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
@ -270,7 +272,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
@ -296,7 +298,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
@ -307,7 +309,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type *input,
@ -316,7 +318,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Free memory and set size to 0
inline void clear()
{ if (_kind!=UCL_VIEW) {_rows=0; _kind=UCL_VIEW; _host_free(*this,_kind); }}
{ _host_free(*this); _cols=0; _kind=UCL_VIEW; }
/// Resize the allocation to rows x cols elements
/** \note Cannot be used on views **/
@ -409,7 +411,6 @@ class UCL_H_Mat : public UCL_BaseMat {
#endif
private:
enum UCL_MEMOPT _kind;
numtyp *_array, *_end;
size_t _row_bytes, _rows, _cols;

View File

@ -39,33 +39,35 @@ class UCL_H_Vec : public UCL_BaseMat {
};
typedef numtyp data_type;
UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) {
UCL_H_Vec() : _cols(0) {
#ifdef _OCL_MAT
_carray=(cl_mem)(0);
#endif
}
~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
~UCL_H_Vec() { _host_free(*this); }
/// Construct with n columns
/** \sa alloc() **/
UCL_H_Vec(const size_t n, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED)
const enum UCL_MEMOPT kind=UCL_READ_WRITE)
{ _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); }
/// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* - UCL_READ_WRITE - Specify that you will read and write from host
* - UCL_WRITE_ONLY - Specify that you will only write from host
* - UCL_READ_ONLY - Specify that you will only read from host
* - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
* \param cq Default command queue for operations copied from another mat
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
const enum UCL_MEMOPT kind=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) {
clear();
_row_bytes=cols*sizeof(numtyp);
int err=_host_alloc(*this,cq,_row_bytes,kind);
int err=_host_alloc(*this,cq,_row_bytes,kind,kind2);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
@ -86,17 +88,19 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* - UCL_READ_WRITE - Specify that you will read and write from host
* - UCL_WRITE_ONLY - Specify that you will only write from host
* - UCL_READ_ONLY - Specify that you will only read from host
* - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
* \param device Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
const enum UCL_MEMOPT kind=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) {
clear();
_row_bytes=cols*sizeof(numtyp);
int err=_host_alloc(*this,device,_row_bytes,kind);
int err=_host_alloc(*this,device,_row_bytes,kind,kind2);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
@ -115,15 +119,11 @@ class UCL_H_Vec : public UCL_BaseMat {
return err;
}
/// Return the type of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
@ -139,6 +139,8 @@ class UCL_H_Vec : public UCL_BaseMat {
_end=_array+_cols;
#ifdef _OCL_MAT
_carray=input.cbegin();
CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#endif
}
@ -146,7 +148,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
@ -157,7 +159,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
@ -180,7 +182,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
@ -197,14 +199,14 @@ class UCL_H_Vec : public UCL_BaseMat {
_end=_array+_cols;
#ifdef _OCL_MAT
_host_alloc(*this,dev,_row_bytes,UCL_VIEW);
_host_view(*this,dev,_row_bytes);
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
@ -215,7 +217,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
@ -225,7 +227,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
@ -241,7 +243,7 @@ class UCL_H_Vec : public UCL_BaseMat {
_array=input.begin()+offset;
_end=_array+_cols;
#ifdef _OCL_MAT
_host_alloc(*this,input,_row_bytes,UCL_VIEW);
_host_view(*this,input,_row_bytes);
#endif
}
@ -249,7 +251,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
@ -261,7 +263,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
@ -273,7 +275,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
@ -284,7 +286,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
@ -294,7 +296,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
@ -305,7 +307,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type *input,
@ -314,7 +316,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Free memory and set size to 0
inline void clear()
{ if (_kind!=UCL_VIEW) {_kind=UCL_VIEW; _cols=0; _host_free(*this,_kind);}}
{ _host_free(*this); _kind=UCL_VIEW; _cols=0; }
/// Resize the allocation to contain cols elements
/** \note Cannot be used on views **/
@ -401,7 +403,6 @@ class UCL_H_Vec : public UCL_BaseMat {
#endif
private:
enum UCL_MEMOPT _kind;
numtyp *_array, *_end;
size_t _row_bytes, _cols;
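
A hedged sketch of the renamed host allocation kinds (the former UCL_RW_OPTIMIZED / UCL_WRITE_OPTIMIZED values give way to the access-oriented names below); gpu is again an assumed, already-initialized UCL_Device:

  UCL_H_Vec<double> staging;
  staging.alloc(4096, gpu, UCL_WRITE_ONLY);   // host only fills it before upload
  UCL_H_Vec<double> results;
  results.alloc(4096, gpu, UCL_READ_WRITE);   // read back and modified on the host
  UCL_H_Vec<double> scratch;
  scratch.alloc(4096, gpu, UCL_NOT_PINNED);   // plain pageable memory, never pinned
  // The new optional second kind argument (UCL_NOT_SPECIFIED by default) is what
  // _ucl_s_obj_help forwards on shared-memory devices; its exact effect is
  // backend-specific and not spelled out in this diff.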

View File

@ -48,17 +48,18 @@ class UCL_Matrix {
/// Construct with specified number of rows and columns
/** \sa alloc() **/
UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind1 parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* The kind2 parameter controls memory optimizations as follows:
/** The kind1 parameter controls memory access from the host
* - UCL_READ_WRITE - Specify that you will read and write from host
* - UCL_WRITE_ONLY - Specify that you will only write from host
* - UCL_READ_ONLY - Specify that you will only read from host
* - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
* The kind2 parameter controls memory optimizations from the device:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
@ -69,24 +70,25 @@ class UCL_Matrix {
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); }
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind1 parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* The kind2 parameter controls memory optimizations as follows:
/** The kind1 parameter controls memory access from the host
* - UCL_READ_WRITE - Specify that you will read and write from host
* - UCL_WRITE_ONLY - Specify that you will only write from host
* - UCL_READ_ONLY - Specify that you will only read from host
* - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
* The kind2 parameter controls memory optimizations from the device:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param device Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t rows, const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
@ -110,11 +112,22 @@ class UCL_Matrix {
{ if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols);
else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { host.zero(); device.zero(); }
/// Set first n elements to zero
inline void zero(const int n) { host.zero(n); device.zero(n); }
/// Set each element to zero (asynchronously on device)
inline void zero() { zero(cq()); }
/// Set first n elements to zero (asynchronously on device)
inline void zero(const int n) { zero(n,cq()); }
/// Set each element to zero (asynchronously on device)
inline void zero(command_queue &cq) {
host.zero();
if (device.kind()!=UCL_VIEW) device.zero(cq);
else if (_buffer.numel()>0) _buffer.zero();
}
/// Set first n elements to zero (asynchronously on device)
inline void zero(const int n, command_queue &cq) {
host.zero(n);
if (device.kind()!=UCL_VIEW) device.zero(n,cq);
else if (_buffer.numel()>0) _buffer.zero();
}
/// Get the number of elements
inline size_t numel() const { return host.numel(); }
@ -145,6 +158,8 @@ class UCL_Matrix {
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return host.cq(); }
/// Change the default command queue associated with this data
inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); }
/// Block until command_queue associated with matrix is complete
inline void sync() { host.sync(); }
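
A hedged sketch of the paired host/device container with the updated zero()/cq() interface; the template arguments are an assumption about how UCL_Matrix is instantiated elsewhere, and other_queue is a hypothetical command_queue:

  UCL_Matrix<double,float> coeff(64, 64, gpu, UCL_READ_WRITE, UCL_READ_ONLY);
  coeff.zero();            // zeros the host storage and, asynchronously, the device side
  coeff.zero(128);         // same, but only the first 128 elements
  coeff.cq(other_queue);   // route future operations through a different queue
  coeff.sync();            // block until the queue associated with the matrix drains
  // On shared-memory accelerators the device member is only a view, so zero()
  // clears the host data (and the internal cast buffer, if one exists) instead
  // of issuing a device memset.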

View File

@ -32,14 +32,24 @@ template <> struct _ucl_s_obj_help<1> {
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
if (acc.shared_memory()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 1S\n";
#endif
e1=host.alloc(cols,acc,kind1,kind2);
if (e1!=UCL_SUCCESS)
return e1;
device.view(host);
return UCL_SUCCESS;
} else
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 1NS\n";
#endif
e1=host.alloc(cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,acc,kind2);
}
}
template <class t1, class t2, class t3, class mat_type>
@ -48,10 +58,24 @@ template <> struct _ucl_s_obj_help<1> {
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,cq,kind2);
if (cq.shared_mem_device()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 2S\n";
#endif
e1=host.alloc(cols,cq,kind1,kind2);
if (e1!=UCL_SUCCESS)
return e1;
device.view(host);
return UCL_SUCCESS;
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 2NS\n";
#endif
e1=host.alloc(cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,cq,kind2);
}
}
template <class t1, class t2, class t3>
@ -60,14 +84,24 @@ template <> struct _ucl_s_obj_help<1> {
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(rows,cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
if (acc.shared_memory()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 3S\n";
#endif
e1=host.alloc(rows,cols,acc,kind1,kind2);
if (e1!=UCL_SUCCESS)
return e1;
device.view(host);
return UCL_SUCCESS;
} else
} else {
e1=host.alloc(rows,cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 3NS\n";
#endif
return device.alloc(rows,cols,acc,kind2);
}
}
template <class t1, class t2, class t3, class mat_type>
@ -76,10 +110,24 @@ template <> struct _ucl_s_obj_help<1> {
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(rows,cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(rows,cols,cq,kind2);
if (cq.shared_mem_device()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 4S\n";
#endif
e1=host.alloc(rows,cols,cq,kind1,kind2);
if (e1!=UCL_SUCCESS)
return e1;
device.view(host);
return UCL_SUCCESS;
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 4NS\n";
#endif
e1=host.alloc(rows,cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(rows,cols,cq,kind2);
}
}
template <class t1, class t2, class t3>
@ -121,8 +169,15 @@ template <> struct _ucl_s_obj_help<1> {
if (device.kind()==UCL_VIEW) {
device.view(host);
return UCL_SUCCESS;
} else
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 5S\n";
#endif
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 5NS\n";
#endif
return device.resize(cols);
}
}
template <class t1, class t2, class t3>
@ -130,9 +185,16 @@ template <> struct _ucl_s_obj_help<1> {
const int cols) {
if (device.kind()==UCL_VIEW) {
device.view(host);
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 6S\n";
#endif
return UCL_SUCCESS;
} else
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 6NS\n";
#endif
return device.resize(rows,cols);
}
}
};
@ -145,17 +207,27 @@ template <int st> struct _ucl_s_obj_help {
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(cols,acc,UCL_NOT_PINNED);
if (e1!=UCL_SUCCESS)
return e1;
e1=_buffer.alloc(cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
if (acc.shared_memory()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 7S\n";
#endif
e1=_buffer.alloc(cols,acc,kind1,kind2);
if (e1!=UCL_SUCCESS)
return e1;
device.view(_buffer);
return UCL_SUCCESS;
} else
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 7NS\n";
#endif
e1=_buffer.alloc(cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,acc,kind2);
}
}
template <class t1, class t2, class t3, class mat_type>
@ -167,10 +239,24 @@ template <int st> struct _ucl_s_obj_help {
e1=host.alloc(cols,cq,UCL_NOT_PINNED);
if (e1!=UCL_SUCCESS)
return e1;
e1=_buffer.alloc(cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,cq,kind2);
if (cq.shared_mem_device()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 8S\n";
#endif
e1=_buffer.alloc(cols,cq,kind1,kind2);
if (e1!=UCL_SUCCESS)
return e1;
device.view(_buffer);
return UCL_SUCCESS;
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 8NS\n";
#endif
e1=_buffer.alloc(cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,cq,kind2);
}
}
template <class t1, class t2, class t3>
@ -180,17 +266,27 @@ template <int st> struct _ucl_s_obj_help {
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(rows,cols,acc,UCL_NOT_PINNED);
if (e1!=UCL_SUCCESS)
return e1;
e1=_buffer.alloc(rows,cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
if (acc.shared_memory()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 9S\n";
#endif
e1=_buffer.alloc(rows,cols,acc,kind1,kind2);
if (e1!=UCL_SUCCESS)
return e1;
device.view(_buffer);
return UCL_SUCCESS;
} else
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 9NS\n";
#endif
e1=_buffer.alloc(rows,cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(rows,cols,acc,kind2);
}
}
template <class t1, class t2, class t3, class mat_type>
@ -202,10 +298,24 @@ template <int st> struct _ucl_s_obj_help {
e1=host.alloc(rows,cols,cq,UCL_NOT_PINNED);
if (e1!=UCL_SUCCESS)
return e1;
e1=_buffer.alloc(rows,cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(rows,cols,cq,kind2);
if (cq.shared_mem_device()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 10S\n";
#endif
e1=_buffer.alloc(rows,cols,cq,kind1,kind2);
if (e1!=UCL_SUCCESS)
return e1;
device.view(_buffer);
return UCL_SUCCESS;
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 10NS\n";
#endif
e1=_buffer.alloc(rows,cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(rows,cols,cq,kind2);
}
}
template <class t1, class t2, class t3>
@ -250,9 +360,16 @@ template <int st> struct _ucl_s_obj_help {
if (device.kind()==UCL_VIEW) {
device.view(buff);
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 11S\n";
#endif
return UCL_SUCCESS;
} else
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 11NS\n";
#endif
return device.resize(cols);
}
}
template <class t1, class t2, class t3>
@ -264,9 +381,17 @@ template <int st> struct _ucl_s_obj_help {
if (device.kind()==UCL_VIEW) {
device.view(buff);
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 12S\n";
#endif
return UCL_SUCCESS;
} else
} else {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_ALLOC 12NS\n";
#endif
return device.resize(rows,cols);
}
}
};
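All of the alloc specializations above follow one rule: when the accelerator shares physical memory with the host (acc.shared_memory(), or cq.shared_mem_device() for the command-queue overloads), a single host-side allocation is made with both access hints and the device container becomes a view of it; otherwise host and device get separate allocations. Callers can detect which path was taken the same way the zero/resize helpers do. A minimal caller-side sketch with placeholder names, assuming a UCL_Device named dev:

    UCL_Vector<float,float> v;              // hypothetical
    v.alloc(1024, dev);
    if (v.device.kind() == UCL_VIEW) {
      // shared-memory path: device storage is a view (of the host array, or of the
      // internal cast buffer when host and device types differ)
    } else {
      // discrete accelerator: host and device buffers are separate allocations
    }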

View File

@ -92,10 +92,9 @@ enum UCL_MEMOPT {
UCL_WRITE_ONLY, ///< Allow any optimizations for memory that is write only
UCL_READ_ONLY, ///< Allow any optimizations for memory that is read only
UCL_READ_WRITE, ///< Allow read and write
UCL_WRITE_OPTIMIZED,///< Allow host memory to be pinned (write combined)
UCL_RW_OPTIMIZED, ///< Allow host memory to be pinned
UCL_NOT_PINNED, ///< Host memory is not to be pinned
UCL_VIEW ///< View of another memory allocation
UCL_VIEW, ///< View of another memory allocation
UCL_NOT_SPECIFIED
};
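The options above now name the intended access pattern instead of a pinning strategy, and the call sites updated in this commit were re-annotated rather than renamed one-for-one: host staging buffers that are only filled and copied (the various host_write vectors) become UCL_WRITE_ONLY, result buffers the host only reads back (engv, force) become UCL_READ_ONLY on the host side, and the old UCL_RW_OPTIMIZED/UCL_WRITE_OPTIMIZED defaults give way to UCL_READ_WRITE/UCL_WRITE_ONLY. A one-line illustration, with n, numtyp, and dev as placeholders:

    // host staging buffer, filled once and copied to the device:
    UCL_H_Vec<numtyp> host_write(n, dev, UCL_WRITE_ONLY);   // previously UCL_WRITE_OPTIMIZED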
enum UCL_DEVICE_TYPE {

View File

@ -48,17 +48,18 @@ class UCL_Vector {
/// Construct with n columns
/** \sa alloc() **/
UCL_Vector(const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,cols,acc,kind1,kind2); }
/// Set up the vector with 'cols' columns and reserve memory
/** The kind1 parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* The kind2 parameter controls memory optimizations as follows:
/** The kind1 parameter controls memory access from the host:
* - UCL_READ_WRITE - Specify that you will read and write from host
* - UCL_WRITE_ONLY - Specify that you will only write from host
* - UCL_READ_ONLY - Specify that you will only read from host
* - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
* The kind2 parameter controls memory optimizations from the device:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
@ -69,24 +70,25 @@ class UCL_Vector {
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,cols,cq,kind1,kind2); }
/// Set up host vector with 'cols' columns and reserve memory
/** The kind1 parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* The kind2 parameter controls memory optimizations as follows:
/** The kind1 parameter controls memory access from the host:
* - UCL_READ_WRITE - Specify that you will read and write from host
* - UCL_WRITE_ONLY - Specify that you will only write from host
* - UCL_READ_ONLY - Specify that you will only read from host
* - UCL_NOT_PINNED - Memory is not pinned/page-locked on host
* The kind2 parameter controls memory optimizations from the device:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param device Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,cols,acc,kind1,kind2); }
@ -109,11 +111,22 @@ class UCL_Vector {
inline int resize_ib(const int new_cols)
{ if (new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { host.zero(); device.zero(); }
/// Set first n elements to zero
inline void zero(const int n) { host.zero(n); device.zero(n); }
/// Set each element to zero (asynchronously on device)
inline void zero() { zero(cq()); }
/// Set first n elements to zero (asynchronously on device)
inline void zero(const int n) { zero(n,cq()); }
/// Set each element to zero (asynchronously on device)
inline void zero(command_queue &cq) {
host.zero();
if (device.kind()!=UCL_VIEW) device.zero(cq);
else if (_buffer.numel()>0) _buffer.zero();
}
/// Set first n elements to zero (asynchronously on device)
inline void zero(const int n, command_queue &cq) {
host.zero(n);
if (device.kind()!=UCL_VIEW) device.zero(n,cq);
else if (_buffer.numel()>0) _buffer.zero();
}
/// Get the number of elements
inline size_t numel() const { return host.numel(); }
@ -145,6 +158,8 @@ class UCL_Vector {
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return host.cq(); }
/// Change the default command queue associated with this data
inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); }
/// Block until command_queue associated with matrix is complete
inline void sync() { host.sync(); }
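Following the documentation above, the hints describe how the data will be used rather than whether it should be pinned. Two sketches that match how the rest of this commit uses them; dev is a placeholder UCL_Device and the names and sizes are illustrative:

    // Initialization data: written once on the host, only read by kernels
    UCL_Vector<float,float> coeff;
    coeff.alloc(ntypes*ntypes*32, dev, UCL_WRITE_ONLY, UCL_READ_ONLY);

    // Results: only read back on the host, read and written by kernels
    UCL_Vector<float,float> engv;
    engv.alloc(ev_fields*nmax, dev, UCL_READ_ONLY, UCL_READ_WRITE);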

View File

@ -44,10 +44,10 @@ bool AnswerT::alloc(const int inum) {
_ans_fields+=4;
// --------------------------- Device allocations
success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_RW_OPTIMIZED,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_RW_OPTIMIZED,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY,
UCL_READ_WRITE)==UCL_SUCCESS);
success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_READ_ONLY,
UCL_READ_WRITE)==UCL_SUCCESS);
_gpu_bytes=engv.device.row_bytes()+force.device.row_bytes();
_allocated=true;
@ -175,78 +175,42 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
return 0.0;
double evdwl=0.0;
double virial_acc[6];
for (int i=0; i<6; i++) virial_acc[i]=0.0;
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
int al=i;
if (_eflag) {
if (_ef_atom) {
evdwl+=engv[al];
eatom[i]+=engv[al]*0.5;
al+=_inum;
} else {
evdwl+=engv[al];
al+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=engv[al]*0.5;
virial_acc[j]+=engv[al];
al+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=engv[al];
al+=_inum;
}
}
}
int vstart=0;
if (_eflag) {
for (int i=0; i<_inum; i++)
evdwl+=engv[i];
if (_ef_atom)
if (_ilist==NULL)
for (int i=0; i<_inum; i++)
eatom[i]+=engv[i];
else
for (int i=0; i<_inum; i++)
eatom[_ilist[i]]+=engv[i];
vstart=_inum;
}
if (_vflag) {
int iend=vstart+_inum;
for (int j=0; j<6; j++) {
for (int i=vstart; i<iend; i++)
virial[j]+=engv[i];
if (_vf_atom)
if (_ilist==NULL)
for (int i=vstart; i<iend; i++)
vatom[i][j]+=engv[i];
else
for (int i=vstart; i<iend; i++)
vatom[_ilist[i]][j]+=engv[i];
vstart+=_inum;
iend+=_inum;
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
} else {
for (int i=0; i<_inum; i++) {
int al=i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=engv[al];
eatom[ii]+=engv[al]*0.5;
al+=_inum;
} else {
evdwl+=engv[al];
al+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=engv[al]*0.5;
virial_acc[j]+=engv[al];
al+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=engv[al];
al+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
}
evdwl*=0.5;
return evdwl;
}
template <class numtyp, class acctyp>
double AnswerT::energy_virial(double *eatom, double **vatom,
double *virial, double &ecoul) {
double *virial, double &ecoul) {
if (_eflag==false && _vflag==false)
return 0.0;
@ -254,84 +218,43 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
return energy_virial(eatom,vatom,virial);
double evdwl=0.0;
double _ecoul=0.0;
double virial_acc[6];
for (int i=0; i<6; i++) virial_acc[i]=0.0;
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
int al=i;
if (_eflag) {
if (_ef_atom) {
evdwl+=engv[al];
eatom[i]+=engv[al]*0.5;
al+=_inum;
_ecoul+=engv[al];
eatom[i]+=engv[al]*0.5;
al+=_inum;
} else {
evdwl+=engv[al];
al+=_inum;
_ecoul+=engv[al];
al+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=engv[al]*0.5;
virial_acc[j]+=engv[al];
al+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=engv[al];
al+=_inum;
}
}
int vstart=0, iend=_inum*2;
if (_eflag) {
for (int i=0; i<_inum; i++)
evdwl+=engv[i];
for (int i=_inum; i<iend; i++)
ecoul+=engv[i];
if (_ef_atom)
if (_ilist==NULL) {
for (int i=0; i<_inum; i++)
eatom[i]+=engv[i];
for (int i=_inum; i<iend; i++)
eatom[i]+=engv[i];
} else {
for (int i=0; i<_inum; i++)
eatom[_ilist[i]]+=engv[i];
for (int i=_inum; i<iend; i++)
eatom[_ilist[i]]+=engv[i];
}
vstart=iend;
iend+=_inum;
}
if (_vflag) {
for (int j=0; j<6; j++) {
for (int i=vstart; i<iend; i++)
virial[j]+=engv[i];
if (_vf_atom)
if (_ilist==NULL)
for (int i=vstart; i<iend; i++)
vatom[i][j]+=engv[i];
else
for (int i=vstart; i<iend; i++)
vatom[_ilist[i]][j]+=engv[i];
vstart+=_inum;
iend+=_inum;
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
} else {
for (int i=0; i<_inum; i++) {
int al=i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=engv[al];
eatom[ii]+=engv[al]*0.5;
al+=_inum;
_ecoul+=engv[al];
eatom[ii]+=engv[al]*0.5;
al+=_inum;
} else {
evdwl+=engv[al];
al+=_inum;
_ecoul+=engv[al];
al+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=engv[al]*0.5;
virial_acc[j]+=engv[al];
al+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=engv[al];
al+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
}
evdwl*=0.5;
ecoul+=_ecoul*0.5;
return evdwl;
}
@ -373,4 +296,14 @@ void AnswerT::get_answers(double **f, double **tor) {
}
}
template <class numtyp, class acctyp>
void AnswerT::cq(const int cq_index) {
engv.cq(dev->cq(cq_index));
force.cq(dev->cq(cq_index));
time_answer.clear();
time_answer.init(*dev,dev->cq(cq_index));
time_answer.zero();
}
template class Answer<PRECISION,ACC_PRECISION>;
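The rewritten accumulation loops no longer walk each atom's interleaved fields; they sweep whole blocks of engv, relying on the blocked layout that the store macros elsewhere in this commit write (energies first, then optional Coulomb energies, then the six virial components, each block inum entries long). For reference, the indexing the loops above assume, stated as a sketch rather than a specification:

    // engv layout per local atom i (0 <= i < inum), when energies are requested:
    //   engv[i]                     pair energy (already halved by the kernel)
    //   engv[inum + i]              Coulomb energy (charge-enabled styles only)
    //   engv[vstart + j*inum + i]   virial component j, j = 0..5
    // with vstart = inum for atomic styles and 2*inum for charge styles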

View File

@ -47,6 +47,8 @@ class Answer {
inline int inum() const { return _inum; }
/// Set number of local atoms for future copy operations
inline void inum(const int n) { _inum=n; }
/// Return the maximum number of atoms that can currently be stored
inline int max_inum() const { return _max_local; }
/// Memory usage per atom in this class
int bytes_per_atom() const;
@ -132,6 +134,9 @@ class Answer {
/// Return the time the CPU was idle waiting for GPU
inline double cpu_idle_time() { return _time_cpu_idle; }
/// Change the command queue used for copies and timers
void cq(const int cq_index);
// ------------------------------ DATA ----------------------------------
/// Force and possibly torque

View File

@ -70,44 +70,47 @@ bool AtomT::alloc(const int nall) {
// --------------------------- Device allocations
int gpu_bytes=0;
success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
#ifdef GPU_CAST
success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)==
UCL_SUCCESS);
success=success && (type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)==
UCL_SUCCESS);
success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (type_cast.alloc(_max_atoms,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=x_cast.device.row_bytes()+type_cast.device.row_bytes();
#endif
if (_charge && _host_view==false) {
success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=q.device.row_bytes();
}
if (_rot && _host_view==false) {
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=quat.device.row_bytes();
}
if (_gpu_nbor>0) {
if (_bonds) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_tag.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_tag.row_bytes();
}
if (_gpu_nbor==1) {
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_cell_id.row_bytes();
} else {
success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (host_particle_id.alloc(_max_atoms,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success &&
(host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
}
if (_gpu_nbor==2 && _host_view)
dev_particle_id.view(host_particle_id);
else
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_particle_id.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_particle_id.row_bytes();
}
@ -130,7 +133,7 @@ bool AtomT::add_fields(const bool charge, const bool rot,
_charge=true;
_other=true;
if (_host_view==false) {
success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=q.device.row_bytes();
}
@ -140,7 +143,7 @@ bool AtomT::add_fields(const bool charge, const bool rot,
_rot=true;
_other=true;
if (_host_view==false) {
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=quat.device.row_bytes();
}
@ -149,7 +152,8 @@ bool AtomT::add_fields(const bool charge, const bool rot,
if (bonds && _bonds==false) {
_bonds=true;
if (_bonds && _gpu_nbor>0) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_tag.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_tag.row_bytes();
}
}
@ -163,17 +167,20 @@ bool AtomT::add_fields(const bool charge, const bool rot,
return false;
}
#endif
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_particle_id.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_particle_id.row_bytes();
if (_bonds) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_tag.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_tag.row_bytes();
}
if (_gpu_nbor==1) {
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_cell_id.row_bytes();
} else {
success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (host_particle_id.alloc(_max_atoms,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success &&
(host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
}

View File

@ -77,12 +77,12 @@
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -125,14 +125,14 @@
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -160,12 +160,12 @@
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -192,14 +192,14 @@
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
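The macro edits above move the factor of one half for doubly counted pair contributions into the kernel-side store, so engv now holds already-halved per-atom energies and virials; the rewritten Answer::energy_virial accordingly adds engv values into eatom/vatom without a further 0.5. Schematically:

    // before: kernel stores engv = E_i;      host later does eatom[i] += engv[i]*0.5
    // after:  kernel stores engv = E_i*0.5;  host later does eatom[i] += engv[i]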

View File

@ -272,12 +272,8 @@ void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str,
return;
std::string s_fast=std::string(kname)+"_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
pair_program->load_string(pair_str,device->compile_string().c_str());
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");
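The same substitution is repeated in each compile_kernels() below: the flag string previously assembled from OCL_PRECISION_COMPILE and the compile-time OCL_VENDOR macro is replaced by Device::compile_string(), which this commit builds at run time in lal_device.cpp from the math flags, the precision define, and the vendor/tuning defines. Schematically:

    // before: per-class flags, vendor fixed at compile time
    //   std::string flags = "-cl-fast-relaxed-math -cl-mad-enable " +
    //                       std::string(OCL_PRECISION_COMPILE) + " -D" + std::string(OCL_VENDOR);
    //   pair_program->load_string(pair_str, flags.c_str());
    // after: one string owned by the Device object, vendor/tuning chosen at run time
    //   pair_program->load_string(pair_str, device->compile_string().c_str());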

View File

@ -288,12 +288,8 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
return;
std::string s_fast=std::string(kname)+"_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
pair_program->load_string(pair_str,device->compile_string().c_str());
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");

View File

@ -296,12 +296,8 @@ void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str,
return;
std::string s_fast=std::string(kname)+"_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
pair_program->load_string(pair_str,device->compile_string().c_str());
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");

View File

@ -455,9 +455,7 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
std::string s_lj=kns+"_lj";
std::string s_lj_fast=kns+"_lj_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
std::string flags=device->compile_string();
nbor_program=new UCL_Program(dev);
nbor_program->load_string(ellipsoid_nbor,flags.c_str());

View File

@ -69,7 +69,7 @@ int BornT::init(const int ntypes, double **host_cutsq,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -54,7 +54,8 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -148,7 +149,8 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -73,7 +73,7 @@ int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rho
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -67,7 +67,8 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -188,7 +189,8 @@ __kernel void k_born_long_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -73,7 +73,7 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -70,7 +70,8 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -201,7 +202,8 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -68,7 +68,7 @@ int BuckT::init(const int ntypes, double **host_cutsq,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -53,7 +53,8 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -145,7 +146,8 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -70,7 +70,7 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -66,7 +66,8 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -182,7 +183,8 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -72,7 +72,7 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -67,7 +67,8 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -190,7 +191,8 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -69,7 +69,7 @@ int CGCMMT::init(const int ntypes, double **host_cutsq,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(cmm_types*cmm_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<cmm_types*cmm_types; i++)
host_write[i]=0.0;

View File

@ -53,7 +53,8 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -150,7 +151,8 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -74,7 +74,7 @@ int CGCMMLongT::init(const int ntypes, double **host_cutsq,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -66,7 +66,8 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -191,7 +192,8 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -76,7 +76,7 @@ int CHARMMLongT::init(const int ntypes,
if (h_size<max_bio_shared_types)
h_size=max_bio_shared_types;
UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<h_size*32; i++)
host_write[i]=0.0;

View File

@ -67,7 +67,8 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -196,7 +197,8 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -73,7 +73,7 @@ int ColloidT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
@ -95,7 +95,7 @@ int ColloidT::init(const int ntypes,
host_sigma3,host_sigma6);
UCL_H_Vec<int> dview_form(lj_types*lj_types,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++) dview_form[i]=0;
form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);

View File

@ -56,7 +56,8 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -219,7 +220,8 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -73,7 +73,7 @@ int CoulDSFT::init(const int ntypes, const int nlocal, const int nall,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -63,7 +63,8 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -163,7 +164,8 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -68,7 +68,7 @@ int CoulLongT::init(const int nlocal, const int nall, const int max_nbors,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -74,12 +74,12 @@ texture<int2> q_tex;
if (eflag>0) { \
*ap1=(acctyp)0; \
ap1+=inum; \
*ap1=e_coul; \
*ap1=e_coul*(acctyp)0.5; \
ap1+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1=virial[i]; \
*ap1=virial[i]*(acctyp)0.5; \
ap1+=inum; \
} \
} \
@ -109,12 +109,12 @@ texture<int2> q_tex;
if (eflag>0) { \
*ap1=(acctyp)0; \
ap1+=inum; \
*ap1=e_coul; \
*ap1=e_coul*(acctyp)0.5; \
ap1+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1=virial[i]; \
*ap1=virial[i]*(acctyp)0.5; \
ap1+=inum; \
} \
} \
@ -155,7 +155,8 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -244,7 +245,8 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -47,7 +47,8 @@ template <class numtyp, class acctyp>
int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double p_split, const int nthreads,
const int t_per_atom, const double cell_size) {
const int t_per_atom, const double cell_size,
char *ocl_vendor) {
_nthreads=nthreads;
#ifdef _OPENMP
omp_set_num_threads(nthreads);
@ -140,6 +141,9 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
_long_range_precompute=0;
if (set_ocl_params(ocl_vendor)!=0)
return -11;
int flag=0;
for (int i=0; i<_procs_per_gpu; i++) {
if (_gpu_rank==i)
@ -149,6 +153,64 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
return flag;
}
template <class numtyp, class acctyp>
int DeviceT::set_ocl_params(char *ocl_vendor) {
#ifdef USE_OPENCL
std::string s_vendor=OCL_DEFAULT_VENDOR;
if (ocl_vendor!=NULL)
s_vendor=ocl_vendor;
if (s_vendor=="none")
s_vendor="generic";
if (s_vendor=="kepler") {
_ocl_vendor_name="NVIDIA Kepler";
#if defined (__APPLE__) || defined(MACOSX)
_ocl_vendor_string="-DKEPLER_OCL -DNO_OCL_PTX";
#else
_ocl_vendor_string="-DKEPLER_OCL";
#endif
} else if (s_vendor=="fermi") {
_ocl_vendor_name="NVIDIA Fermi";
_ocl_vendor_string="-DFERMI_OCL";
} else if (s_vendor=="cypress") {
_ocl_vendor_name="AMD Cypress";
_ocl_vendor_string="-DCYPRESS_OCL";
} else if (s_vendor=="generic") {
_ocl_vendor_name="GENERIC";
_ocl_vendor_string="-DGENERIC_OCL";
} else {
_ocl_vendor_name="CUSTOM";
_ocl_vendor_string="-DUSE_OPENCL";
int token_count=0;
std::string params[13];
char *pch = strtok(ocl_vendor,"\" ");
while (pch != NULL) {
if (token_count==13)
return -11;
params[token_count]=pch;
token_count++;
pch = strtok(NULL,"\" ");
}
_ocl_vendor_string+=" -DMEM_THREADS="+params[0]+
" -DTHREADS_PER_ATOM="+params[1]+
" -DTHREADS_PER_CHARGE="+params[2]+
" -DBLOCK_PAIR="+params[3]+
" -DMAX_SHARED_TYPES="+params[4]+
" -DBLOCK_NBOR_BUILD="+params[5]+
" -DBLOCK_BIO_PAIR="+params[6]+
" -DBLOCK_ELLIPSE="+params[7]+
" -DWARP_SIZE="+params[8]+
" -DPPPM_BLOCK_1D="+params[9]+
" -DBLOCK_CELL_2D="+params[10]+
" -DBLOCK_CELL_ID="+params[11]+
" -DMAX_BIO_SHARED_TYPES="+params[12];
}
_ocl_compile_string="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" "+_ocl_vendor_string;
#endif
return 0;
}
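For the CUSTOM branch, ocl_vendor is expected to carry 13 whitespace-separated numbers (surrounding quotes are stripped by the strtok call above), consumed in the order listed. A purely hypothetical example string and the defines it would produce; the values are placeholders, not tuned settings:

    "32 4 8 128 8 64 64 128 32 64 8 128 128"
      -> -DMEM_THREADS=32 -DTHREADS_PER_ATOM=4 -DTHREADS_PER_CHARGE=8 -DBLOCK_PAIR=128
         -DMAX_SHARED_TYPES=8 -DBLOCK_NBOR_BUILD=64 -DBLOCK_BIO_PAIR=64 -DBLOCK_ELLIPSE=128
         -DWARP_SIZE=32 -DPPPM_BLOCK_1D=64 -DBLOCK_CELL_2D=8 -DBLOCK_CELL_ID=128
         -DMAX_BIO_SHARED_TYPES=128

Supplying more than 13 tokens makes init_device() return -11, matching the new error code documented in lal_device.h.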
template <class numtyp, class acctyp>
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
@ -206,7 +268,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
*gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
_block_cell_id, _block_nbor_build, threads_per_atom,
_warp_size, _time_device))
_warp_size, _time_device, compile_string()))
return -3;
if (_cell_size<0.0)
nbor->cell_size(cell_size,cell_size);
@ -274,7 +336,8 @@ void DeviceT::init_message(FILE *screen, const char *name,
fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
#endif
#ifdef USE_OPENCL
fprintf(screen,"- with OpenCL Parameters for: %s\n",OCL_VENDOR);
fprintf(screen,"- with OpenCL Parameters for: %s\n",
_ocl_vendor_name.c_str());
#endif
fprintf(screen,"-------------------------------------");
fprintf(screen,"-------------------------------------\n");
@ -571,9 +634,8 @@ int DeviceT::compile_kernels() {
if (_compiled)
return flag;
std::string flags="-cl-mad-enable -D"+std::string(OCL_VENDOR);
dev_program=new UCL_Program(*gpu);
int success=dev_program->load_string(device,flags.c_str());
int success=dev_program->load_string(device,compile_string().c_str());
if (success!=UCL_SUCCESS)
return -4;
k_zero.set_function(*dev_program,"kernel_zero");
@ -640,10 +702,11 @@ Device<PRECISION,ACC_PRECISION> global_device;
int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom, const double cell_size) {
const int t_per_atom, const double cell_size,
char *opencl_vendor) {
return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
particle_split,nthreads,t_per_atom,
cell_size);
cell_size,opencl_vendor);
}
void lmp_clear_device() {
@ -654,3 +717,4 @@ double lmp_gpu_forces(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
}

View File

@ -45,11 +45,13 @@ class Device {
* - -2 if GPU not found
* - -4 if GPU library not compiled for GPU
* - -6 if GPU could not be initialized for use
* - -7 if accelerator sharing is not currently allowed on system **/
* - -7 if accelerator sharing is not currently allowed on system
* - -11 if vendor_string has the wrong number of parameters **/
int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom, const double cell_size);
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom, const double cell_size,
char *vendor_string);
/// Initialize the device for Atom and Neighbor storage
/** \param rot True if quaternions need to be stored
@ -234,6 +236,8 @@ class Device {
inline int max_bio_shared_types() const { return _max_bio_shared_types; }
/// Architecture gpu code compiled for (returns 0 for OpenCL)
inline double ptx_arch() const { return _ptx_arch; }
/// Number of threads executing concurrently on same multiproc
inline int warp_size() const { return _warp_size; }
// -------------------- SHARED DEVICE ROUTINES --------------------
// Perform asynchronous zero of integer array
@ -279,6 +283,8 @@ class Device {
boxlo,prd);
}
inline std::string compile_string() { return _ocl_compile_string; }
private:
std::queue<Answer<numtyp,acctyp> *> ans_queue;
int _init_count;
@ -305,6 +311,9 @@ class Device {
int _data_in_estimate, _data_out_estimate;
std::string _ocl_vendor_name, _ocl_vendor_string, _ocl_compile_string;
int set_ocl_params(char *);
template <class t>
inline std::string toa(const t& in) {
std::ostringstream o;

View File

@ -72,7 +72,7 @@ int DipoleLJT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -75,14 +75,14 @@ texture<int4,1> mu_tex;
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -115,14 +115,14 @@ texture<int4,1> mu_tex;
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -174,7 +174,8 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -385,7 +386,8 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -72,7 +72,7 @@ int DipoleLJSFT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -76,14 +76,14 @@ texture<int4,1> mu_tex;
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -116,14 +116,14 @@ texture<int4,1> mu_tex;
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
*engv=energy*(acctyp)0.5; \
engv+=inum; \
*engv=e_coul; \
*engv=e_coul*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -175,7 +175,8 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -418,7 +419,8 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -66,7 +66,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
ef_nall=2000;
_max_fp_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_RW_OPTIMIZED,UCL_WRITE_ONLY);
_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
k_energy.set_function(*(this->pair_program),"k_energy");
k_energy_fast.set_function(*(this->pair_program),"k_energy_fast");
@ -106,7 +106,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
_nr=nr;
UCL_H_Vec<int2> dview_type(lj_types*lj_types,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++) {
dview_type[i].x=0; dview_type[i].y=0;
@ -126,7 +126,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
// pack type2frho
UCL_H_Vec<int> dview_type2frho(lj_types,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
type2frho.alloc(lj_types,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<ntypes; i++)
@ -135,7 +135,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
// pack frho_spline
UCL_H_Vec<numtyp4> dview_frho_spline(nfrho*(nrho+1),*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int ix=0; ix<nfrho; ix++)
for (int iy=0; iy<nrho+1; iy++) {
@ -165,7 +165,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
// pack rhor_spline
UCL_H_Vec<numtyp4> dview_rhor_spline(nrhor*(nr+1),*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int ix=0; ix<nrhor; ix++)
for (int iy=0; iy<nr+1; iy++) {
@ -195,7 +195,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
// pack z2r_spline
UCL_H_Vec<numtyp4> dview_z2r_spline(nz2r*(nr+1),*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int ix=0; ix<nz2r; ix++)
for (int iy=0; iy<nr+1; iy++) {

View File

@ -79,7 +79,7 @@ texture<int4> z2r_sp2_tex;
fetch4(coeff,index,frho_sp2_tex); \
energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \
if (rho > rhomax) energy += fp*(rho-rhomax); \
engv[ii]=(acctyp)2.0*energy; \
engv[ii]=energy; \
} \
}
@ -116,12 +116,12 @@ texture<int4> z2r_sp2_tex;
} \
if (offset==0) { \
if (eflag>0) { \
engv[ii]+=energy; \
engv[ii]+=energy*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
engv[ii]=virial[i]; \
engv[ii]=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -150,7 +150,7 @@ texture<int4> z2r_sp2_tex;
fetch4(coeff,index,frho_sp2_tex); \
energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \
if (rho > rhomax) energy += fp*(rho-rhomax); \
engv[ii]=(acctyp)2.0*energy; \
engv[ii]=energy; \
} \
}
@ -173,12 +173,12 @@ texture<int4> z2r_sp2_tex;
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv+=energy; \
*engv+=energy*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
*engv=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -210,7 +210,8 @@ __kernel void k_energy(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -286,7 +287,8 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -353,7 +355,8 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -466,7 +469,8 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -83,12 +83,12 @@ texture<int4,1> pos_tex, quat_tex;
if (offset==0) { \
__global acctyp *ap1=engv+ii; \
if (eflag>0) { \
*ap1=energy; \
*ap1=energy*(acctyp)0.5; \
ap1+=astride; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1=virial[i]; \
*ap1=virial[i]*(acctyp)0.5; \
ap1+=astride; \
} \
} \
@ -130,12 +130,12 @@ texture<int4,1> pos_tex, quat_tex;
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv+=energy; \
*engv+=energy*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv+=virial[i]; \
*engv+=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \
@ -170,12 +170,12 @@ texture<int4,1> pos_tex, quat_tex;
if (offset==0) { \
__global acctyp *ap1=engv+ii; \
if (eflag>0) { \
*ap1=energy; \
*ap1=energy*(acctyp)0.5; \
ap1+=astride; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1=virial[i]; \
*ap1=virial[i]*(acctyp)0.5; \
ap1+=astride; \
} \
} \
@ -202,12 +202,12 @@ texture<int4,1> pos_tex, quat_tex;
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv+=energy; \
*engv+=energy*(acctyp)0.5; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv+=virial[i]; \
*engv+=virial[i]*(acctyp)0.5; \
engv+=inum; \
} \
} \

View File

@ -68,7 +68,7 @@ int GaussT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -52,7 +52,8 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -138,7 +139,8 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -77,7 +77,7 @@ int GayBerneT::init(const int ntypes, const double gamma,
// Allocate a host write buffer for copying type data
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
@ -98,7 +98,7 @@ int GayBerneT::init(const int ntypes, const double gamma,
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
dev_error.alloc(1,*(this->ucl_device));
dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY);
dev_error.zero();
// Allocate, cast and asynchronous memcpy of constant data
@ -258,6 +258,9 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
&ainum, &this->_threads_per_atom);
this->time_ellipsoid2.stop();
} else {
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
this->_last_ellipse)/
(BX/this->_threads_per_atom)));
this->ans->force.zero();
this->ans->engv.zero();
this->time_nbor1.stop();

View File

@ -120,7 +120,8 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *nbor_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

View File

@ -54,7 +54,8 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *nbor_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -276,7 +277,8 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -370,7 +372,8 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -69,7 +69,7 @@ int LJT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -53,7 +53,8 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -142,7 +143,8 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -69,7 +69,7 @@ int LJ96T::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -53,7 +53,8 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -143,7 +144,8 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -73,7 +73,7 @@ int LJClass2LongT::init(const int ntypes, double **host_cutsq,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -66,7 +66,8 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -187,7 +188,8 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -72,7 +72,7 @@ int LJCoulT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -66,7 +66,8 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -178,7 +179,8 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -73,7 +73,7 @@ int LJCoulDebyeT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -67,7 +67,8 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -186,7 +187,8 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -73,7 +73,7 @@ int LJCoulLongT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -66,7 +66,8 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -183,7 +184,8 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -77,7 +77,7 @@ int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -69,7 +69,8 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -195,7 +196,8 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -69,7 +69,7 @@ int LJExpandT::init(const int ntypes, double **host_cutsq,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;

View File

@ -55,7 +55,8 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -147,7 +148,8 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -69,7 +69,7 @@ int MorseT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(types*types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<types*types; i++)
host_write[i]=0.0;

View File

@ -55,7 +55,8 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -145,7 +146,8 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -38,7 +38,8 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
const bool pre_cut, const int block_cell_2d,
const int block_cell_id, const int block_nbor_build,
const int threads_per_atom, const int warp_size,
const bool time_device) {
const bool time_device,
const std::string compile_flags) {
clear();
_threads_per_atom=threads_per_atom;
@ -92,13 +93,13 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
if (gpu_nbor==0)
success=success && (host_packed.alloc(2*IJ_SIZE,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
UCL_WRITE_ONLY)==UCL_SUCCESS);
alloc(success);
if (!success)
return false;
if (_use_packing==false)
_shared->compile_kernels(devi,gpu_nbor);
_shared->compile_kernels(devi,gpu_nbor,compile_flags);
return success;
}
@ -114,7 +115,7 @@ void Neighbor::alloc(bool &success) {
success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (host_acc.alloc(nt*2,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
UCL_READ_WRITE)==UCL_SUCCESS);
_c_bytes=dev_nbor.row_bytes();
if (_alloc_packed) {
@ -129,10 +130,10 @@ void Neighbor::alloc(bool &success) {
host_ilist.clear();
host_jlist.clear();
success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_RW_OPTIMIZED,
UCL_WRITE_ONLY)==UCL_SUCCESS) && success;
success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_READ_WRITE,
UCL_READ_WRITE)==UCL_SUCCESS) && success;
success=success && (dev_numj_host.alloc(_max_host,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
UCL_READ_WRITE)==UCL_SUCCESS);
success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
@ -161,7 +162,7 @@ void Neighbor::alloc(bool &success) {
success=success && (dev_nspecial.alloc(3*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (dev_special.alloc(_maxspecial*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
UCL_READ_WRITE)==UCL_SUCCESS);
success=success && (dev_special_t.alloc(_maxspecial*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_nspecial.row_bytes()+dev_special.row_bytes()+
@ -178,11 +179,9 @@ void Neighbor::clear() {
_bin_time=0.0;
if (_ncells>0) {
_ncells=0;
dev_cell_counts.clear();
if (_gpu_nbor==2) {
host_cell_counts.clear();
cell_counts.clear();
if (_gpu_nbor==2)
delete [] cell_iter;
}
}
if (_allocated) {
_allocated=false;
@ -286,6 +285,80 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
}
}
// This is the same as get_host, but the requirement that ilist[i]=i and
// inum=nlocal is forced to be true to allow direct indexing of neighbors of
// neighbors
void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
int **firstneigh, const int block_size) {
_nbor_time_avail=true;
time_nbor.start();
UCL_H_Vec<int> ilist_view;
ilist_view.view(ilist,inum,*dev);
ucl_copy(dev_nbor,ilist_view,false);
UCL_D_Vec<int> nbor_offset;
UCL_H_Vec<int> host_offset;
int copy_count=0;
int ij_count=0;
int acc_count=0;
int dev_count=0;
int *h_ptr=host_packed.begin();
_nbor_pitch=inum;
if (nlist!=inum)
host_acc.zero(inum);
for (int ii=0; ii<nlist; ii++) {
int i=ilist[ii];
int nj=numj[i];
host_acc[i]=nj;
host_acc[i+inum]=acc_count;
acc_count+=nj;
}
for (int i=0; i<inum; i++) {
int nj=host_acc[i];
int *jlist=firstneigh[i];
for (int jj=0; jj<nj; jj++) {
*h_ptr=jlist[jj];
h_ptr++;
ij_count++;
if (ij_count==IJ_SIZE) {
dev_nbor.sync();
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
ucl_copy(nbor_offset,host_offset,true);
copy_count++;
ij_count=0;
dev_count+=IJ_SIZE;
h_ptr=host_packed.begin()+(IJ_SIZE*(copy_count%2));
}
}
}
if (ij_count!=0) {
dev_nbor.sync();
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
nbor_offset.view_offset(dev_count,dev_packed,ij_count);
ucl_copy(nbor_offset,host_offset,true);
}
UCL_D_Vec<int> acc_view;
acc_view.view_offset(inum,dev_nbor,inum*2);
ucl_copy(acc_view,host_acc,true);
time_nbor.stop();
if (_use_packing==false) {
time_kernel.start();
int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
block_size));
_shared->k_nbor.set_size(GX,block_size);
_shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom);
time_kernel.stop();
}
}
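
Editor's note: get_host3 above stages the flattened neighbor list to the device in fixed-size chunks of IJ_SIZE entries, alternating between the two halves of the pinned host_packed buffer so that packing the next chunk can overlap the asynchronous copy of the previous one. A minimal standalone C++ sketch of that double-buffering pattern (async_copy is a hypothetical stand-in for ucl_copy; names and sizes are illustrative only):

#include <cstdio>
#include <cstddef>
#include <vector>

// Stand-in for an asynchronous host-to-device copy; here it only reports
// what would be transferred.
static void async_copy(const int *src, std::size_t count, std::size_t dev_off) {
  std::printf("copy %zu ints from %p to device offset %zu\n",
              count, static_cast<const void*>(src), dev_off);
}

// Pack 'data' through a staging buffer holding two halves of 'chunk' ints,
// issuing one copy per filled half and switching halves after each copy.
static void stage_in_chunks(const std::vector<int> &data, int *staging,
                            std::size_t chunk) {
  std::size_t filled = 0, copies = 0, dev_off = 0;
  int *p = staging;                          // half currently being packed
  for (int v : data) {
    *p++ = v;
    if (++filled == chunk) {
      async_copy(staging + chunk * (copies % 2), chunk, dev_off);
      ++copies;  dev_off += chunk;  filled = 0;
      p = staging + chunk * (copies % 2);    // switch to the other half
    }
  }
  if (filled)                                // remainder, if any
    async_copy(staging + chunk * (copies % 2), filled, dev_off);
}

int main() {
  std::vector<int> jlist = {3, 7, 1, 9, 4, 2, 8, 6, 5};
  std::vector<int> staging(2 * 4);           // two halves of 4 ints each
  stage_in_chunks(jlist, staging.data(), 4);
}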
template <class numtyp, class acctyp>
void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
if (maxn>_max_nbors) {
@ -330,24 +403,20 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
ncellz = static_cast<int>(ceil((subhi[2]-sublo[2])/_cell_size))+ghost_cells;
ncell_3d = ncellx * ncelly * ncellz;
if (ncell_3d+1>_ncells) {
if (_gpu_nbor==2) {
if (_ncells>0) {
host_cell_counts.clear();
delete [] cell_iter;
}
cell_iter = new int[ncell_3d+1];
host_cell_counts.alloc(ncell_3d+1,dev_nbor);
}
cell_counts.clear();
if (_gpu_nbor==2 && atom.host_view())
dev_cell_counts.view(host_cell_counts);
else {
dev_cell_counts.clear();
dev_cell_counts.alloc(ncell_3d+1,dev_nbor);
if (_gpu_nbor==2) {
if (_ncells>0)
delete [] cell_iter;
cell_iter = new int[ncell_3d+1];
cell_counts.alloc(ncell_3d+1,dev_nbor,UCL_READ_WRITE,UCL_READ_ONLY);
} else {
cell_counts.device.clear();
cell_counts.device.alloc(ncell_3d+1,dev_nbor);
}
_ncells=ncell_3d+1;
_cell_bytes=dev_cell_counts.row_bytes();
_cell_bytes=cell_counts.device.row_bytes();
}
const numtyp cutoff_cast=static_cast<numtyp>(_cutoff);
@ -381,7 +450,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
int *particle_id=atom.host_particle_id.begin();
// Build cell list on CPU
host_cell_counts.zero();
cell_counts.host.zero();
double i_cell_size=1.0/_cell_size;
int offset_hi=_cells_in_cutoff+1;
@ -403,7 +472,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
int id = ix+iy*ncellx+iz*ncellx*ncelly;
cell_id[i] = id;
host_cell_counts[id+1]++;
cell_counts[id+1]++;
}
for (int i=nt; i<nall; i++) {
@ -424,12 +493,12 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
int id = ix+iy*ncellx+iz*ncellx*ncelly;
cell_id[i] = id;
host_cell_counts[id+1]++;
cell_counts[id+1]++;
}
mn=0;
for (int i=0; i<_ncells; i++)
mn=std::max(mn,host_cell_counts[i]);
mn=std::max(mn,cell_counts[i]);
mn*=8;
set_nbor_block_size(mn/2);
@ -440,11 +509,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
cell_iter[0]=0;
for (int i=1; i<_ncells; i++) {
host_cell_counts[i]+=host_cell_counts[i-1];
cell_iter[i]=host_cell_counts[i];
cell_counts[i]+=cell_counts[i-1];
cell_iter[i]=cell_counts[i];
}
time_hybrid1.start();
ucl_copy(dev_cell_counts,host_cell_counts,true);
cell_counts.update_device(true);
time_hybrid1.stop();
for (int i=0; i<nall; i++) {
int celli=cell_id[i];
@ -481,7 +550,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
/* calculate cell count */
_shared->k_cell_counts.set_size(GX,neigh_block);
_shared->k_cell_counts.run(&atom.dev_cell_id, &dev_cell_counts, &nall,
_shared->k_cell_counts.run(&atom.dev_cell_id, &cell_counts, &nall,
&ncell_3d);
}
@ -490,7 +559,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
_shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)*
(ncellz-ghost_cells),cell_block,1);
_shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id,
&dev_cell_counts, &dev_nbor, &nbor_host,
&cell_counts, &dev_nbor, &nbor_host,
&dev_numj_host, &_max_nbors, &cutoff_cast, &ncellx,
&ncelly, &ncellz, &inum, &nt, &nall,
&_threads_per_atom, &_cells_in_cutoff);
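
Editor's note: the rewrite above folds the old host_cell_counts / dev_cell_counts pair into one cell_counts object that owns both sides and is pushed to the device with update_device(true). As a toy illustration of that mirror-and-update idea (a hypothetical container, not the Geryon UCL_Vector API):

#include <cstddef>
#include <vector>

// Toy host/device mirror: the host side is filled by plain indexing and the
// device side is refreshed explicitly, mimicking cell_counts.update_device().
template <class T>
struct MirroredVec {
  std::vector<T> host, device;               // 'device' stands in for GPU memory
  void alloc(std::size_t n) { host.assign(n, T()); device.assign(n, T()); }
  T &operator[](std::size_t i) { return host[i]; }
  void update_device() { device = host; }    // one explicit host-to-device copy
};

int main() {
  MirroredVec<int> cell_counts;
  cell_counts.alloc(8);
  cell_counts[3] = 5;                         // fill on the host
  cell_counts.update_device();                // then push once to the device
}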

View File

@ -47,13 +47,15 @@ class Neighbor {
* \param pre_cut True if cutoff test will be performed in separate kernel
* than the force kernel
* \param threads_per_atom Number of threads used per atom for force
* calculation **/
* calculation
* \param compile_flags Flags for JIT compiling **/
bool init(NeighborShared *shared, const int inum, const int host_inum,
const int max_nbors, const int maxspecial, UCL_Device &dev,
const int gpu_nbor, const int gpu_host, const bool pre_cut,
const int block_cell_2d, const int block_cell_id,
const int block_nbor_build, const int threads_per_atom,
const int warp_size, const bool time_device);
const int warp_size, const bool time_device,
const std::string compile_flags);
/// Set the size of the cutoff+skin
inline void cell_size(const double size, const double cutoff) {
@ -143,6 +145,10 @@ class Neighbor {
void get_host(const int inum, int *ilist, int *numj,
int **firstneigh, const int block_size);
/// Copy neighbor list from host for 3-body (first time or from a rebuild)
void get_host3(const int inum, const int nlist, int *ilist, int *numj,
int **firstneigh, const int block_size);
/// Return the stride in elements for each nbor row
inline int nbor_pitch() const { return _nbor_pitch; }
@ -207,11 +213,9 @@ class Neighbor {
UCL_D_Vec<int> dev_nspecial;
/// Device storage for special neighbors
UCL_D_Vec<int> dev_special, dev_special_t;
/// Host storage for number of particles per cell
UCL_H_Vec<int> host_cell_counts;
/// Host/Device storage for number of particles per cell
UCL_Vector<int,int> cell_counts;
int *cell_iter;
/// Device storage for number of particles per cell
UCL_D_Vec<int> dev_cell_counts;
/// Device timers
UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose;

View File

@ -48,15 +48,12 @@ void NeighborShared::clear() {
}
}
void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor) {
void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor,
const std::string flags) {
if (_compiled)
return;
_gpu_nbor=gpu_nbor;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
if (_gpu_nbor==0) {
nbor_program=new UCL_Program(dev);
nbor_program->load_string(neighbor_cpu,flags.c_str());

View File

@ -44,7 +44,8 @@ class NeighborShared {
UCL_Texture neigh_tex;
/// Compile kernels for neighbor lists
void compile_kernels(UCL_Device &dev, const int gpu_nbor);
void compile_kernels(UCL_Device &dev, const int gpu_nbor,
const std::string flags);
// ----------------------------- Kernels
UCL_Program *nbor_program, *build_program;

View File

@ -136,10 +136,10 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
_npts_y=nyhi_out-nylo_out+1;
_npts_z=nzhi_out-nzlo_out+1;
_npts_yx=_npts_x*_npts_y;
success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
UCL_SUCCESS);
success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
UCL_SUCCESS);
success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device,
UCL_READ_ONLY,UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device,
UCL_READ_WRITE,UCL_READ_ONLY)==UCL_SUCCESS);
*vd_brick_p=vd_brick.host.begin();
_max_bytes+=brick.device.row_bytes()+vd_brick.device.row_bytes();
@ -159,7 +159,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
_max_bytes+=d_brick_atoms.row_bytes();
// Allocate error flags for checking out of bounds atoms
success=success && (error_flag.alloc(1,*ucl_device,UCL_RW_OPTIMIZED,
success=success && (error_flag.alloc(1,*ucl_device,UCL_READ_ONLY,
UCL_WRITE_ONLY)==UCL_SUCCESS);
if (!success) {
flag=-3;
@ -374,9 +374,7 @@ void PPPMT::compile_kernels(UCL_Device &dev) {
if (sizeof(grdtyp)==sizeof(double) && ucl_device->double_precision()==false)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
std::string flags=device->compile_string();
#ifdef USE_OPENCL
flags+=std::string(" -Dgrdtyp=")+ucl_template_name<grdtyp>()+" -Dgrdtyp4="+
ucl_template_name<grdtyp>()+"4";
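
Editor's note: the hard-coded OpenCL build options removed here are replaced by a per-device compile string obtained from the device object. Judging from the removed lines, that string carries at least the fast-math options, the precision define, and a hardware define; a hedged sketch of assembling such a string (make_ocl_flags is a hypothetical helper, not the library's Device::compile_string()):

#include <string>

// Assemble OpenCL JIT options from a precision define and a hardware define,
// mirroring the string that was previously built inline.
std::string make_ocl_flags(const std::string &precision_define,
                           const std::string &hw_define) {
  return "-cl-fast-relaxed-math -cl-mad-enable " + precision_define +
         " -D" + hw_define;
}
// e.g. make_ocl_flags("-D_SINGLE_SINGLE", "KEPLER_OCL")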

View File

@ -97,11 +97,11 @@ float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen,
const int nzhi_out, float **rho_coeff,
float **vd_brick, const double slab_volfactor,
const int nx_pppm, const int ny_pppm, const int nz_pppm,
const bool split, int &success) {
const bool split, const bool respa, int &success) {
float *b=pppm_gpu_init(PPPMF,nlocal,nall,screen,order,nxlo_out,nylo_out,
nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,
slab_volfactor,nx_pppm,ny_pppm,nz_pppm,split,success);
if (split==false)
if (split==false && respa==false)
PPPMF.device->set_single_precompute(&PPPMF);
return b;
}
@ -139,12 +139,13 @@ double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen,
const int nzhi_out, double **rho_coeff,
double **vd_brick, const double slab_volfactor,
const int nx_pppm, const int ny_pppm,
const int nz_pppm, const bool split, int &success) {
const int nz_pppm, const bool split,
const bool respa, int &success) {
double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out,
nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,
vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm,
split,success);
if (split==false)
if (split==false && respa==false)
PPPMD.device->set_double_precompute(&PPPMD);
return b;
}

View File

@ -96,19 +96,27 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
// OCL_VENDOR: preprocessor define for hardware
// OCL_DEFAULT_VENDOR: preprocessor define for hardware
// specific sizes of OpenCL kernel related constants
#ifdef FERMI_OCL
#define OCL_VENDOR "FERMI_OCL"
#define OCL_DEFAULT_VENDOR "fermi"
#endif
#ifdef KEPLER_OCL
#define OCL_DEFAULT_VENDOR "kepler"
#endif
#ifdef CYPRESS_OCL
#define OCL_VENDOR "CYPRESS_OCL"
#define OCL_DEFAULT_VENDOR "cypress"
#endif
#ifndef OCL_VENDOR
#define OCL_VENDOR "GENERIC_OCL"
#ifdef GENERIC_OCL
#define OCL_DEFAULT_VENDOR "generic"
#endif
#ifndef OCL_DEFAULT_VENDOR
#define OCL_DEFAULT_VENDOR "none"
#endif
#endif

View File

@ -214,6 +214,30 @@ typedef struct _double4 double4;
#endif
// -------------------------------------------------------------------------
// NVIDIA GENERIC OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef NV_GENERIC_OCL
#define USE_OPENCL
#define fast_mul mul24
#define MEM_THREADS 16
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#define BLOCK_NBOR_BUILD 64
#define BLOCK_BIO_PAIR 64
#define WARP_SIZE 32
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#endif
// -------------------------------------------------------------------------
// NVIDIA FERMI OPENCL DEFINITIONS
// -------------------------------------------------------------------------
@ -221,9 +245,6 @@ typedef struct _double4 double4;
#ifdef FERMI_OCL
#define USE_OPENCL
#define fast_mul(X,Y) (X)*(Y)
#define ARCH 0
#define DRIVER 0
#define MEM_THREADS 32
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
@ -238,7 +259,54 @@ typedef struct _double4 double4;
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
// -------------------------------------------------------------------------
// NVIDIA KEPLER OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef KEPLER_OCL
#define USE_OPENCL
#define MEM_THREADS 32
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_PAIR 256
#define MAX_SHARED_TYPES 11
#define BLOCK_NBOR_BUILD 128
#define BLOCK_BIO_PAIR 256
#define BLOCK_ELLIPSE 128
#define WARP_SIZE 32
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#ifndef NO_OCL_PTX
#define ARCH 300
#ifdef _SINGLE_SINGLE
inline float shfl_xor(float var, int laneMask, int width) {
float ret;
int c;
c = ((WARP_SIZE-width) << 8) | 0x1f;
asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c));
return ret;
}
#else
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
inline double shfl_xor(double var, int laneMask, int width) {
int c = ((WARP_SIZE-width) << 8) | 0x1f;
int x,y,x2,y2;
double ans;
asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(x2) : "r"(x), "r"(laneMask), "r"(c));
asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(y2) : "r"(y), "r"(laneMask), "r"(c));
asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
return ans;
}
#endif
#endif
#endif
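
Editor's note: the shfl_xor helpers added for Kepler above wrap the PTX butterfly shuffle, which lets a thread exchange a register with the lane whose index differs by a given XOR mask. Their usual purpose is an in-warp reduction of per-thread partial sums; a host-side C++ emulation of that XOR-butterfly access pattern (illustration only, not device code) is:

#include <cstdio>
#include <vector>

// After log2(width) rounds every lane holds the sum over all lanes, matching
// a device loop of: for (s = width/2; s > 0; s >>= 1) x += shfl_xor(x, s, width);
static void butterfly_reduce(std::vector<float> &lanes) {
  const int width = static_cast<int>(lanes.size());   // assumed power of two
  for (int s = width / 2; s > 0; s >>= 1) {
    std::vector<float> next(lanes);
    for (int lane = 0; lane < width; ++lane)
      next[lane] = lanes[lane] + lanes[lane ^ s];      // exchange with lane^s
    lanes.swap(next);
  }
}

int main() {
  std::vector<float> partial = {1.f, 2.f, 3.f, 4.f};   // per-lane partial sums
  butterfly_reduce(partial);
  std::printf("%g\n", partial[0]);                     // prints 10
}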
@ -249,9 +317,6 @@ typedef struct _double4 double4;
#ifdef CYPRESS_OCL
#define USE_OPENCL
#define fast_mul(X,Y) (X)*(Y)
#define ARCH 0
#define DRIVER 0
#define MEM_THREADS 32
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
@ -266,12 +331,6 @@ typedef struct _double4 double4;
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#if defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#endif
#endif
// -------------------------------------------------------------------------
@ -281,9 +340,6 @@ typedef struct _double4 double4;
#ifdef GENERIC_OCL
#define USE_OPENCL
#define fast_mul mul24
#define ARCH 0
#define DRIVER 0
#define MEM_THREADS 16
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
@ -298,6 +354,20 @@ typedef struct _double4 double4;
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#endif
// -------------------------------------------------------------------------
// OPENCL Stuff for All Hardware
// -------------------------------------------------------------------------
#ifdef USE_OPENCL
#ifndef _SINGLE_SINGLE
#ifndef cl_khr_fp64
#ifndef cl_amd_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
#endif
#if defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
@ -306,10 +376,17 @@ typedef struct _double4 double4;
#endif
// -------------------------------------------------------------------------
// OPENCL Stuff for All Hardware
// -------------------------------------------------------------------------
#ifdef USE_OPENCL
#ifndef fast_mul
#define fast_mul(X,Y) (X)*(Y)
#endif
#ifndef ARCH
#define ARCH 0
#endif
#ifndef DRIVER
#define DRIVER 0
#endif
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)

View File

@ -74,7 +74,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
// Allocate a host write buffer for copying type data
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
@ -95,7 +95,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
dev_error.alloc(1,*(this->ucl_device));
dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY);
dev_error.zero();
// Allocate, cast and asynchronous memcpy of constant data
@ -260,6 +260,9 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
&this->_threads_per_atom);
this->time_ellipsoid3.stop();
} else {
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
this->_last_ellipse)/
(BX/this->_threads_per_atom)));
this->ans->force.zero();
this->ans->engv.zero();
this->time_nbor1.zero();
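
Editor's note: the added lines recompute the launch grid for the atoms handled by the LJ-only path. With BX threads per block and t_per_atom threads assigned to each atom, one block covers BX / t_per_atom atoms, so

GX = \left\lceil \frac{\mathrm{inum} - \mathrm{last\_ellipse}}{BX / t_{\mathrm{per\_atom}}} \right\rceil .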

View File

@ -75,7 +75,8 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *nbor_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

View File

@ -59,12 +59,12 @@
if (offset==0) { \
__global acctyp *ap1=engv+ii; \
if (eflag>0) { \
*ap1+=energy; \
*ap1+=energy*(acctyp)0.5; \
ap1+=astride; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1+=virial[i]; \
*ap1+=virial[i]*(acctyp)0.5; \
ap1+=astride; \
} \
} \
@ -104,12 +104,12 @@
if (offset==0) { \
__global acctyp *ap1=engv+ii; \
if (eflag>0) { \
*ap1+=energy; \
*ap1+=energy*(acctyp)0.5; \
ap1+=astride; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1+=virial[i]; \
*ap1+=virial[i]*(acctyp)0.5; \
ap1+=astride; \
} \
} \
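
Editor's note: the factor of one half introduced in the per-atom energy and virial accumulation above is what one would apply when each pair's contribution ends up accumulated on both atoms involved (here plausibly because the ellipsoid-sphere and sphere-ellipsoid kernels each visit the same pair once); under that assumption the total is preserved:

E_{\mathrm{tot}} = \sum_i E_i = \sum_i \frac{1}{2} \sum_{j \ne i} e_{ij} = \sum_{i<j} e_{ij} .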
@ -173,7 +173,8 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *nbor_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -424,7 +425,8 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *nbor_end;
int j, numj, n_stride;
int j, numj;
__local int n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj,
n_stride,nbor_end,nbor);
@ -615,7 +617,8 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -708,7 +711,8 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

View File

@ -88,7 +88,7 @@ int TableT::init(const int ntypes,
// Allocate a host write buffer for data initialization
UCL_H_Vec<int> host_write_int(lj_types*lj_types,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++)
host_write_int[i] = 0;
@ -113,7 +113,7 @@ int TableT::init(const int ntypes,
ucl_copy(nmask,host_write_int,false);
UCL_H_Vec<numtyp4> host_write(lj_types*lj_types,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
for (int ix=1; ix<ntypes; ix++)
@ -127,7 +127,7 @@ int TableT::init(const int ntypes,
// Allocate tablength arrays
UCL_H_Vec<numtyp4> host_write2(_ntables*_tablength,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
for (int i=0; i<_ntables*_tablength; i++) {
host_write2[i].x = 0.0;
host_write2[i].y = 0.0;
@ -190,7 +190,7 @@ int TableT::init(const int ntypes,
ucl_copy(coeff4,host_write2,false);
UCL_H_Vec<numtyp> host_rsq(lj_types*lj_types,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
UCL_WRITE_ONLY);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq);

View File

@ -74,7 +74,8 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -171,7 +172,8 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -268,7 +270,8 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -369,7 +372,8 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -470,7 +474,8 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -578,7 +583,8 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -688,7 +694,8 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
@ -794,7 +801,8 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
if (ii<inum) {
const __global int *nbor, *list_end;
int i, numj, n_stride;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);

Some files were not shown because too many files have changed in this diff.