diff --git a/lib/gpu/Makefile.linux_opencl b/lib/gpu/Makefile.linux_opencl index 8610917d93..71c82d25f3 100644 --- a/lib/gpu/Makefile.linux_opencl +++ b/lib/gpu/Makefile.linux_opencl @@ -4,9 +4,14 @@ # which file will be copied to Makefile.lammps -EXTRAMAKE = Makefile.lammps.standard +EXTRAMAKE = Makefile.lammps.opencl -OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK +OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi +# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler +# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress +# OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device + +OCL_CPP = mpic++ $(DEFAULT_DEVICE) -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK OCL_LINK = -lOpenCL OCL_PREC = -D_SINGLE_SINGLE diff --git a/lib/gpu/Makefile.mac_opencl b/lib/gpu/Makefile.mac_opencl index fa227ecd3e..62b58c1cef 100644 --- a/lib/gpu/Makefile.mac_opencl +++ b/lib/gpu/Makefile.mac_opencl @@ -4,7 +4,12 @@ # which file will be copied to Makefile.lammps -EXTRAMAKE = Makefile.lammps.standard +EXTRAMAKE = Makefile.lammps.mac_ocl + +OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi +# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler +# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress +# OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT OCL_LINK = -framework OpenCL diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index ce6132dd56..a5df7bf23d 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -28,7 +28,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \ $(OBJ_DIR)/lal_neighbor.o $(OBJ_DIR)/lal_neighbor_shared.o \ $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \ $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \ - $(OBJ_DIR)/lal_base_dipole.o \ + $(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \ $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \ $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \ $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \ @@ -59,7 +59,12 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \ $(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \ $(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \ $(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \ - $(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o + $(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o \ + $(OBJ_DIR)/lal_sw.o $(OBJ_DIR)/lal_sw_ext.o \ + $(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \ + $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \ + $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \ + $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \ $(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \ @@ -99,7 +104,12 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \ $(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \ $(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \ $(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \ - $(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h + $(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h \ + $(OBJ_DIR)/sw.cubin $(OBJ_DIR)/sw_cubin.h \ + $(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck_cubin.h \ + $(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \ + $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \ + $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h all: $(OBJ_DIR) $(GPU_LIB) $(EXECS) @@ -175,6 +185,9 @@ $(OBJ_DIR)/lal_base_ellipsoid.o: 
$(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoi $(OBJ_DIR)/lal_base_dipole.o: $(ALL_H) lal_base_dipole.h lal_base_dipole.cpp $(CUDR) -o $@ -c lal_base_dipole.cpp +$(OBJ_DIR)/lal_base_three.o: $(ALL_H) lal_base_three.h lal_base_three.cpp + $(CUDR) -o $@ -c lal_base_three.cpp + $(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h $(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu @@ -571,6 +584,66 @@ $(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/c $(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h $(CUDR) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/sw.cubin: lal_sw.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_sw.cu + +$(OBJ_DIR)/sw_cubin.h: $(OBJ_DIR)/sw.cubin $(OBJ_DIR)/sw.cubin + $(BIN2C) -c -n sw $(OBJ_DIR)/sw.cubin > $(OBJ_DIR)/sw_cubin.h + +$(OBJ_DIR)/lal_sw.o: $(ALL_H) lal_sw.h lal_sw.cpp $(OBJ_DIR)/sw_cubin.h $(OBJ_DIR)/lal_base_three.o + $(CUDR) -o $@ -c lal_sw.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_sw_ext.o: $(ALL_H) lal_sw.h lal_sw_ext.cpp lal_base_three.h + $(CUDR) -o $@ -c lal_sw_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/beck.cubin: lal_beck.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_beck.cu + +$(OBJ_DIR)/beck_cubin.h: $(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck.cubin + $(BIN2C) -c -n beck $(OBJ_DIR)/beck.cubin > $(OBJ_DIR)/beck_cubin.h + +$(OBJ_DIR)/lal_beck.o: $(ALL_H) lal_beck.h lal_beck.cpp $(OBJ_DIR)/beck_cubin.h $(OBJ_DIR)/lal_base_atomic.o + $(CUDR) -o $@ -c lal_beck.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_beck_ext.o: $(ALL_H) lal_beck.h lal_beck_ext.cpp lal_base_atomic.h + $(CUDR) -o $@ -c lal_beck_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/mie.cubin: lal_mie.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_mie.cu + +$(OBJ_DIR)/mie_cubin.h: $(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie.cubin + $(BIN2C) -c -n mie $(OBJ_DIR)/mie.cubin > $(OBJ_DIR)/mie_cubin.h + +$(OBJ_DIR)/lal_mie.o: $(ALL_H) lal_mie.h lal_mie.cpp $(OBJ_DIR)/mie_cubin.h $(OBJ_DIR)/lal_base_atomic.o + $(CUDR) -o $@ -c lal_mie.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_mie_ext.o: $(ALL_H) lal_mie.h lal_mie_ext.cpp lal_base_atomic.h + $(CUDR) -o $@ -c lal_mie_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/soft.cubin: lal_soft.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_soft.cu + +$(OBJ_DIR)/soft_cubin.h: $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft.cubin + $(BIN2C) -c -n soft $(OBJ_DIR)/soft.cubin > $(OBJ_DIR)/soft_cubin.h + +$(OBJ_DIR)/lal_soft.o: $(ALL_H) lal_soft.h lal_soft.cpp $(OBJ_DIR)/soft_cubin.h $(OBJ_DIR)/lal_base_atomic.o + $(CUDR) -o $@ -c lal_soft.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_soft_ext.o: $(ALL_H) lal_soft.h lal_soft_ext.cpp lal_base_atomic.h + $(CUDR) -o $@ -c lal_soft_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj_coul_msm.cubin: lal_lj_coul_msm.cu lal_precision.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul_msm.cu + +$(OBJ_DIR)/lj_coul_msm_cubin.h: $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm.cubin + $(BIN2C) -c -n lj_coul_msm $(OBJ_DIR)/lj_coul_msm.cubin > $(OBJ_DIR)/lj_coul_msm_cubin.h + +$(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(OBJ_DIR)/lj_coul_msm_cubin.h $(OBJ_DIR)/lal_base_charge.o + $(CUDR) -o $@ -c lal_lj_coul_msm.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h + $(CUDR) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR) + $(BIN_DIR)/nvc_get_devices: 
./geryon/ucl_get_devices.cpp $(NVD_H) $(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index b476003ecd..81e80588cd 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -17,7 +17,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \ $(OBJ_DIR)/lal_neighbor_shared.o $(OBJ_DIR)/lal_neighbor.o \ $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \ $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \ - $(OBJ_DIR)/lal_base_dipole.o \ + $(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \ $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \ $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \ $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \ @@ -48,7 +48,12 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \ $(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \ $(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \ $(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \ - $(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o + $(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o \ + $(OBJ_DIR)/lal_sw.o $(OBJ_DIR)/lal_sw_ext.o \ + $(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \ + $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \ + $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \ + $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \ @@ -68,7 +73,9 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ $(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/dipole_lj_cl.h \ $(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/colloid_cl.h \ $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \ - $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h + $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h \ + $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/mie_cl.h \ + $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h OCL_EXECS = $(BIN_DIR)/ocl_get_devices @@ -117,6 +124,9 @@ $(OBJ_DIR)/lal_base_ellipsoid.o: $(OCL_H) lal_base_ellipsoid.h lal_base_ellipsoi $(OBJ_DIR)/lal_base_dipole.o: $(OCL_H) lal_base_dipole.h lal_base_dipole.cpp $(OCL) -o $@ -c lal_base_dipole.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/lal_base_three.o: $(OCL_H) lal_base_three.h lal_base_three.cpp + $(OCL) -o $@ -c lal_base_three.cpp + $(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h $(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h; @@ -405,6 +415,51 @@ $(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/ $(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h $(OCL) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/sw_cl.h: lal_sw.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh sw $(PRE1_H) lal_sw.cu $(OBJ_DIR)/sw_cl.h; + +$(OBJ_DIR)/lal_sw.o: $(ALL_H) lal_sw.h lal_sw.cpp $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/sw_cl.h $(OBJ_DIR)/lal_base_three.o + $(OCL) -o $@ -c lal_sw.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_sw_ext.o: $(ALL_H) lal_sw.h lal_sw_ext.cpp lal_base_three.h + $(OCL) -o $@ -c lal_sw_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/beck_cl.h: lal_beck.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh beck $(PRE1_H) lal_beck.cu $(OBJ_DIR)/beck_cl.h; + +$(OBJ_DIR)/lal_beck.o: $(ALL_H) lal_beck.h lal_beck.cpp $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/beck_cl.h $(OBJ_DIR)/lal_base_atomic.o + $(OCL) -o $@ -c lal_beck.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_beck_ext.o: $(ALL_H) lal_beck.h lal_beck_ext.cpp 
lal_base_atomic.h + $(OCL) -o $@ -c lal_beck_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/mie_cl.h: lal_mie.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh mie $(PRE1_H) lal_mie.cu $(OBJ_DIR)/mie_cl.h; + +$(OBJ_DIR)/lal_mie.o: $(ALL_H) lal_mie.h lal_mie.cpp $(OBJ_DIR)/mie_cl.h $(OBJ_DIR)/mie_cl.h $(OBJ_DIR)/lal_base_atomic.o + $(OCL) -o $@ -c lal_mie.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_mie_ext.o: $(ALL_H) lal_mie.h lal_mie_ext.cpp lal_base_atomic.h + $(OCL) -o $@ -c lal_mie_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/soft_cl.h: lal_soft.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh soft $(PRE1_H) lal_soft.cu $(OBJ_DIR)/soft_cl.h; + +$(OBJ_DIR)/lal_soft.o: $(ALL_H) lal_soft.h lal_soft.cpp $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lal_base_atomic.o + $(OCL) -o $@ -c lal_soft.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_soft_ext.o: $(ALL_H) lal_soft.h lal_soft_ext.cpp lal_base_atomic.h + $(OCL) -o $@ -c lal_soft_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj_coul_msm_cl.h: lal_lj_coul_msm.cu $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh lj_coul_msm $(PRE1_H) lal_lj_coul_msm.cu $(OBJ_DIR)/lj_coul_msm_cl.h; + +$(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(OBJ_DIR)/lj_coul_msm_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h $(OBJ_DIR)/lal_base_charge.o + $(OCL) -o $@ -c lal_lj_coul_msm.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h + $(OCL) -o $@ -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR) + $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) diff --git a/lib/gpu/README b/lib/gpu/README index f14496efc9..900cffd35c 100644 --- a/lib/gpu/README +++ b/lib/gpu/README @@ -3,6 +3,7 @@ -------------------------------- W. Michael Brown (ORNL) + Trung Dac Nguyen (ORNL) Peng Wang (NVIDIA) Axel Kohlmeyer (Temple) Steve Plimpton (SNL) @@ -60,6 +61,8 @@ devices on your system. A Makefile for OpenCL compilation is provided, but support for OpenCL use is not currently provided by the developers. Details of the implementation are provided in: +---- + Brown, W.M., Wang, P. Plimpton, S.J., Tharrington, A.N. Implementing Molecular Dynamics on Hybrid High Performance Computers - Short Range Forces. Computer Physics Communications. 2011. 182: p. 898-911. @@ -68,28 +71,64 @@ and Brown, W.M., Kohlmeyer, A. Plimpton, S.J., Tharrington, A.N. Implementing Molecular Dynamics on Hybrid High Performance Computers - Particle-Particle -Particle-Mesh. Computer Physics Communications. 2011. In press. +Particle-Mesh. Computer Physics Communications. 2012. 183: p. 449-459. +and + +Brown, W.M., Masako, Y. Implementing Molecular Dynamics on Hybrid High +Performance Computers - Three-Body Potentials. Computer Physics Communications. +2013. In press. + +---- NOTE: Installation of the CUDA SDK is not required. Current styles supporting GPU acceleration: - 1. lj/cut - 2. lj96/cut - 3. lj/expand - 4. lj/cut/coul/cut - 5. lj/cut/coul/long - 6. lj/charmm/coul/long - 7. lj/class2 - 8. lj/class2/coul/long - 9. morse - 10. cg/cmm - 11. cg/cmm/coul/long - 12. coul/long - 13. gayberne - 14. resquared - 15. 
pppm + 1 beck + 2 born/coul/long + 3 born/coul/wolf + 4 born + 5 buck/coul/cut + 6 buck/coul/long + 7 buck + 8 colloid + 9 coul/dsf + 10 coul/long + 11 eam/alloy + 12 eam/fs + 13 eam + 14 eam/lj + 15 gauss + 16 gayberne + 17 lj96/cut + 18 lj/charmm/coul/long + 19 lj/class2/coul/long + 20 lj/class2 + 21 lj/cut/coul/cut + 22 lj/cut/coul/debye + 23 lj/cut/coul/dsf + 24 lj/cut/coul/long + 25 lj/cut/coul/msm + 26 lj/cut/coul/wolf/fsw + 27 lj/cut/dipole/cut + 28 lj/cut + 29 lj/cut/tgpu + 30 lj/expand + 31 lj/sdk/coul/long + 32 cg/cmm/coul/long + 33 lj/sdk + 34 cg/cmm + 35 lj/sf/dipole/sf + 36 mie/cut + 37 morse + 38 resquared + 39 soft + 40 sw + 41 table + 42 yukawa/colloid + 43 yukawa + 44 pppm MULTIPLE LAMMPS PROCESSES @@ -170,3 +209,4 @@ make yes-asphere make yes-kspace make yes-gpu make linux + diff --git a/lib/gpu/geryon/VERSION.txt b/lib/gpu/geryon/VERSION.txt index 47cefed44d..733136e842 100644 --- a/lib/gpu/geryon/VERSION.txt +++ b/lib/gpu/geryon/VERSION.txt @@ -1 +1 @@ -Geryon Version 12.033 +Geryon Version 13.209 diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h index 938e1d3bd6..5fffe77c82 100644 --- a/lib/gpu/geryon/nvd_device.h +++ b/lib/gpu/geryon/nvd_device.h @@ -65,15 +65,19 @@ class UCL_Device { public: /// Collect properties for every GPU on the node /** \note You must set the active GPU with set() before using the device **/ - UCL_Device(); + inline UCL_Device(); - ~UCL_Device(); + inline ~UCL_Device(); /// Returns 1 (For compatibility with OpenCL) inline int num_platforms() { return 1; } /// Return a string with name and info of the current platform - std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA Driver"; } + inline std::string platform_name() + { return "NVIDIA Corporation NVIDIA CUDA Driver"; } + + /// Delete any contexts/data and set the platform number to be used + inline int set_platform(const int pid); /// Return the number of devices that support CUDA inline int num_devices() { return _properties.size(); } @@ -81,8 +85,12 @@ class UCL_Device { /// Set the CUDA device to the specified device number /** A context and default command queue will be created for the device * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not - * be allocated for use **/ - int set(int num); + * be allocated for use. clear() is called to delete any contexts and + * associated data from previous calls to set(). 
**/ + inline int set(int num); + + /// Delete any context and associated data stored from a call to set() + inline void clear(); /// Get the current device number inline int device_num() { return _device; } @@ -147,16 +155,17 @@ class UCL_Device { inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } /// Returns true if double precision is support for the current device - bool double_precision() { return double_precision(_device); } + inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device - bool double_precision(const int i) {return arch(i)>=1.3;} + inline bool double_precision(const int i) {return arch(i)>=1.3;} /// Get the number of cores in the current device inline unsigned cores() { return cores(_device); } /// Get the number of cores inline unsigned cores(const int i) { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8; - else if (arch(i)<3.0) return _properties[i].multiProcessorCount*32; + else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32; + else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48; else return _properties[i].multiProcessorCount*192; } /// Get the gigabytes of global memory in the current device @@ -216,8 +225,34 @@ class UCL_Device { inline bool sharing_supported(const int i) { return (_properties[i].computeMode == CU_COMPUTEMODE_DEFAULT); } + /// True if splitting device into equal subdevices supported + inline bool fission_equal() + { return fission_equal(_device); } + /// True if splitting device into equal subdevices supported + inline bool fission_equal(const int i) + { return false; } + /// True if splitting device into subdevices by specified counts supported + inline bool fission_by_counts() + { return fission_by_counts(_device); } + /// True if splitting device into subdevices by specified counts supported + inline bool fission_by_counts(const int i) + { return false; } + /// True if splitting device into subdevices by affinity domains supported + inline bool fission_by_affinity() + { return fission_by_affinity(_device); } + /// True if splitting device into subdevices by affinity domains supported + inline bool fission_by_affinity(const int i) + { return false; } + + /// Maximum number of subdevices allowed from device fission + inline int max_sub_devices() + { return max_sub_devices(_device); } + /// Maximum number of subdevices allowed from device fission + inline int max_sub_devices(const int i) + { return 0; } + /// List all devices along with all properties - void print_all(std::ostream &out); + inline void print_all(std::ostream &out); private: int _device, _num_devices; @@ -228,7 +263,7 @@ class UCL_Device { }; // Grabs the properties for all devices -inline UCL_Device::UCL_Device() { +UCL_Device::UCL_Device() { CU_SAFE_CALL_NS(cuInit(0)); CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices)); for (int dev=0; dev<_num_devices; ++dev) { @@ -280,22 +315,21 @@ inline UCL_Device::UCL_Device() { _cq.back()=0; } -inline UCL_Device::~UCL_Device() { - if (_device>-1) { - for (int i=1; i-1) { - CU_SAFE_CALL_NS(cuCtxDestroy(_context)); - for (int i=1; i-1) { + for (int i=1; i= 2020 int driver_version; cuDriverGetVersion(&driver_version); diff --git a/lib/gpu/geryon/nvd_kernel.h b/lib/gpu/geryon/nvd_kernel.h index fecd85eeb8..e0bfb1bb5e 100644 --- a/lib/gpu/geryon/nvd_kernel.h +++ b/lib/gpu/geryon/nvd_kernel.h @@ -377,6 +377,10 @@ class UCL_Kernel { #endif } + /// Return the default command queue/stream associated with this data + 
inline command_queue & cq() { return _cq; } + /// Change the default command queue associated with matrix + inline void cq(command_queue &cq_in) { _cq=cq_in; } #include "ucl_arg_kludge.h" private: diff --git a/lib/gpu/geryon/nvd_memory.h b/lib/gpu/geryon/nvd_memory.h index 335418fe5f..5f7b98ba5c 100644 --- a/lib/gpu/geryon/nvd_memory.h +++ b/lib/gpu/geryon/nvd_memory.h @@ -47,14 +47,14 @@ typedef CUdeviceptr device_ptr; // -------------------------------------------------------------------------- template inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, - const enum UCL_MEMOPT kind) { + const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ CUresult err=CUDA_SUCCESS; - if (kind==UCL_RW_OPTIMIZED) - err=cuMemAllocHost((void **)mat.host_ptr(),n); - else if (kind==UCL_WRITE_OPTIMIZED) + if (kind==UCL_NOT_PINNED) + *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); + else if (kind==UCL_WRITE_ONLY) err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED); else - *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); + err=cuMemAllocHost((void **)mat.host_ptr(),n); if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL) return UCL_MEMORY_ERROR; mat.cq()=cm.cq(); @@ -63,14 +63,14 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, template inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, - const enum UCL_MEMOPT kind) { + const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ CUresult err=CUDA_SUCCESS; - if (kind==UCL_RW_OPTIMIZED) - err=cuMemAllocHost((void **)mat.host_ptr(),n); - else if (kind==UCL_WRITE_OPTIMIZED) + if (kind==UCL_NOT_PINNED) + *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); + else if (kind==UCL_WRITE_ONLY) err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED); else - *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); + err=cuMemAllocHost((void **)mat.host_ptr(),n); if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL) return UCL_MEMORY_ERROR; mat.cq()=dev.cq(); @@ -78,8 +78,10 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, } template -inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) { - if (kind!=UCL_NOT_PINNED) +inline void _host_free(mat_type &mat) { + if (mat.kind()==UCL_VIEW) + return; + else if (mat.kind()!=UCL_NOT_PINNED) CU_DESTRUCT_CALL(cuMemFreeHost(mat.begin())); else free(mat.begin()); @@ -87,14 +89,14 @@ inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) { template inline int _host_resize(mat_type &mat, const size_t n) { - _host_free(mat,mat.kind()); + _host_free(mat); CUresult err=CUDA_SUCCESS; - if (mat.kind()==UCL_RW_OPTIMIZED) - err=cuMemAllocHost((void **)mat.host_ptr(),n); - else if (mat.kind()==UCL_WRITE_OPTIMIZED) - err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED); - else + if (mat.kind()==UCL_NOT_PINNED) *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n); + else if (mat.kind()==UCL_WRITE_ONLY) + err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED); + else + err=cuMemAllocHost((void **)mat.host_ptr(),n); if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL) return UCL_MEMORY_ERROR; return UCL_SUCCESS; @@ -155,7 +157,8 @@ inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows, template inline void _device_free(mat_type &mat) { - CU_DESTRUCT_CALL(cuMemFree(mat.cbegin())); + if (mat.kind()!=UCL_VIEW) + CU_DESTRUCT_CALL(cuMemFree(mat.cbegin())); } template @@ -229,13 +232,13 @@ inline void 
_host_zero(void *ptr, const size_t n) { } template -inline void _device_zero(mat_type &mat, const size_t n) { +inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) { if (n%32==0) - CU_SAFE_CALL(cuMemsetD32(mat.cbegin(),0,n/4)); + CU_SAFE_CALL(cuMemsetD32Async(mat.cbegin(),0,n/4,cq)); else if (n%16==0) - CU_SAFE_CALL(cuMemsetD16(mat.cbegin(),0,n/2)); + CU_SAFE_CALL(cuMemsetD16Async(mat.cbegin(),0,n/2,cq)); else - CU_SAFE_CALL(cuMemsetD8(mat.cbegin(),0,n)); + CU_SAFE_CALL(cuMemsetD8Async(mat.cbegin(),0,n,cq)); } // -------------------------------------------------------------------------- diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index 391eeb9d95..79fa53d552 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -51,6 +51,10 @@ inline void ucl_sync(cl_command_queue &cq) { CL_SAFE_CALL(clFinish(cq)); } +inline bool _shared_mem_device(cl_device_type &device_type) { + return (device_type==CL_DEVICE_TYPE_CPU); +} + struct OCLProperties { std::string name; cl_device_type device_type; @@ -64,6 +68,10 @@ struct OCLProperties { bool double_precision; int alignment; size_t timer_resolution; + bool ecc_support; + std::string c_version; + bool partition_equal, partition_counts, partition_affinity; + cl_uint max_sub_devices; }; /// Class for looking at data parallel device properties @@ -74,15 +82,18 @@ class UCL_Device { public: /// Collect properties for every device on the node /** \note You must set the active GPU with set() before using the device **/ - UCL_Device(); + inline UCL_Device(); - ~UCL_Device(); + inline ~UCL_Device(); /// Return the number of platforms (0 if error or no platforms) inline int num_platforms() { return _num_platforms; } /// Return a string with name and info of the current platform - std::string platform_name(); + inline std::string platform_name(); + + /// Delete any contexts/data and set the platform number to be used + inline int set_platform(const int pid); /// Return the number of devices that support OpenCL inline int num_devices() { return _num_devices; } @@ -90,8 +101,12 @@ class UCL_Device { /// Set the OpenCL device to the specified device number /** A context and default command queue will be created for the device * * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not - * be allocated for use **/ - int set(int num); + * be allocated for use. clear() is called to delete any contexts and + * associated data from previous calls to set(). 
**/ + inline int set(int num); + + /// Delete any context and associated data stored from a call to set() + inline void clear(); /// Get the current device number inline int device_num() { return _device; } @@ -161,12 +176,14 @@ class UCL_Device { /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is efficiently addressable from device - inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } + inline bool shared_memory(const int i) + { return _shared_mem_device(_properties[i].device_type); } /// Returns true if double precision is support for the current device - bool double_precision() { return double_precision(_device); } + inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device - bool double_precision(const int i) {return _properties[i].double_precision;} + inline bool double_precision(const int i) + {return _properties[i].double_precision;} /// Get the number of cores in the current device inline unsigned cores() { return cores(_device); } @@ -227,8 +244,34 @@ class UCL_Device { inline bool sharing_supported(const int i) { return true; } + /// True if splitting device into equal subdevices supported + inline bool fission_equal() + { return fission_equal(_device); } + /// True if splitting device into equal subdevices supported + inline bool fission_equal(const int i) + { return _properties[i].partition_equal; } + /// True if splitting device into subdevices by specified counts supported + inline bool fission_by_counts() + { return fission_by_counts(_device); } + /// True if splitting device into subdevices by specified counts supported + inline bool fission_by_counts(const int i) + { return _properties[i].partition_counts; } + /// True if splitting device into subdevices by affinity domains supported + inline bool fission_by_affinity() + { return fission_by_affinity(_device); } + /// True if splitting device into subdevices by affinity domains supported + inline bool fission_by_affinity(const int i) + { return _properties[i].partition_affinity; } + + /// Maximum number of subdevices allowed from device fission + inline int max_sub_devices() + { return max_sub_devices(_device); } + /// Maximum number of subdevices allowed from device fission + inline int max_sub_devices(const int i) + { return _properties[i].max_sub_devices; } + /// List all devices along with all properties - void print_all(std::ostream &out); + inline void print_all(std::ostream &out); /// Return the OpenCL type for the device inline cl_device_id & cl_device() { return _cl_device; } @@ -237,7 +280,8 @@ class UCL_Device { int _num_platforms; // Number of platforms int _platform; // UCL_Device ID for current platform cl_platform_id _cl_platform; // OpenCL ID for current platform - cl_context _context; // Context used for accessing the device + cl_platform_id _cl_platforms[20]; // OpenCL IDs for all platforms + cl_context _context; // Context used for accessing the device std::vector _cq;// The default command queue for this device int _device; // UCL_Device ID for current device cl_device_id _cl_device; // OpenCL ID for current device @@ -245,31 +289,57 @@ class UCL_Device { int _num_devices; // Number of devices std::vector _properties; // Properties for each device - void add_properties(cl_device_id); - int create_context(); + inline void add_properties(cl_device_id); + inline int create_context(); int 
_default_cq; }; // Grabs the properties for all devices -inline UCL_Device::UCL_Device() { - cl_int errorv; - cl_uint nplatforms; - - _cl_device=0; +UCL_Device::UCL_Device() { _device=-1; - _num_devices=0; - _platform=0; - _default_cq=0; // --- Get Number of Platforms - errorv=clGetPlatformIDs(1,&_cl_platform,&nplatforms); + cl_uint nplatforms; + cl_int errorv=clGetPlatformIDs(20,_cl_platforms,&nplatforms); if (errorv!=CL_SUCCESS) { _num_platforms=0; return; } else _num_platforms=static_cast(nplatforms); + + set_platform(0); +} + +UCL_Device::~UCL_Device() { + clear(); +} + +void UCL_Device::clear() { + if (_device>-1) { + for (size_t i=0; i<_cq.size(); i++) { + CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back())); + _cq.pop_back(); + } + CL_DESTRUCT_CALL(clReleaseContext(_context)); + } + _device=-1; +} + +int UCL_Device::set_platform(int pid) { + clear(); + cl_int errorv; + + _cl_device=0; + _device=-1; + _num_devices=0; + _default_cq=0; + #ifdef UCL_DEBUG + assert(pid-1) { - for (size_t i=0; i<_cq.size(); i++) { - CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back())); - _cq.pop_back(); - } - CL_DESTRUCT_CALL(clReleaseContext(_context)); - } -} - -inline int UCL_Device::create_context() { +int UCL_Device::create_context() { cl_int errorv; cl_context_properties props[3]; props[0]=CL_CONTEXT_PLATFORM; @@ -320,9 +382,10 @@ inline int UCL_Device::create_context() { return UCL_SUCCESS; } -inline void UCL_Device::add_properties(cl_device_id device_list) { +void UCL_Device::add_properties(cl_device_id device_list) { OCLProperties op; char buffer[1024]; + cl_bool ans_bool; CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,NULL)); op.name=buffer; @@ -363,10 +426,49 @@ inline void UCL_Device::add_properties(cl_device_id device_list) { CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(size_t),&op.timer_resolution,NULL)); + + op.ecc_support=false; + CL_SAFE_CALL(clGetDeviceInfo(device_list, + CL_DEVICE_ERROR_CORRECTION_SUPPORT, + sizeof(ans_bool),&ans_bool,NULL)); + if (ans_bool==CL_TRUE) + op.ecc_support=true; + + op.c_version=""; + op.partition_equal=false; + op.partition_counts=false; + op.partition_affinity=false; + + #ifdef CL_VERSION_1_2 + size_t return_bytes; + CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_OPENCL_C_VERSION,1024, + buffer,NULL)); + op.c_version=buffer; + + cl_device_partition_property pinfo[4]; + CL_SAFE_CALL(clGetDeviceInfo(device_list, + CL_DEVICE_PARTITION_PROPERTIES, + 4*sizeof(cl_device_partition_property), + pinfo,&return_bytes)); + int nprops=return_bytes/sizeof(cl_device_partition_property); + for (int i=0; i-1) { - for (size_t i=0; i<_cq.size(); i++) { - CL_SAFE_CALL(clReleaseCommandQueue(_cq.back())); - _cq.pop_back(); - } - CL_SAFE_CALL(clReleaseContext(_context)); - } +int UCL_Device::set(int num) { + clear(); cl_device_id device_list[_num_devices]; cl_uint n; @@ -432,7 +525,7 @@ inline int UCL_Device::set(int num) { } // List all devices along with all properties -inline void UCL_Device::print_all(std::ostream &out) { +void UCL_Device::print_all(std::ostream &out) { if (num_devices() == 0) out << "There is no device supporting OpenCL\n"; for (int i=0; i inline void set_arg(const cl_uint index, const dtype * const arg) { CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg)); - if (index>_num_args) _num_args=index; + if (index>_num_args) { + _num_args=index; + #ifdef UCL_DEBUG + if (_num_args>_kernel_info_nargs) { + std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: " + << _kernel_info_name << std::endl; + assert(0==1); + } + #endif + } } 
/// Set a geryon container as a kernel argument. @@ -203,6 +217,13 @@ class UCL_Kernel { inline void add_arg(const dtype * const arg) { CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg)); _num_args++; + #ifdef UCL_DEBUG + if (_num_args>_kernel_info_nargs) { + std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: " + << _kernel_info_name << std::endl; + assert(0==1); + } + #endif } /// Add a geryon container as a kernel argument. @@ -289,10 +310,7 @@ class UCL_Kernel { } /// Run the kernel in the default command queue - inline void run() { - CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL, - _num_blocks,_block_size,0,NULL,NULL)); - } + inline void run(); /// Clear any arguments associated with the kernel inline void clear_args() { _num_args=0; } @@ -309,6 +327,12 @@ class UCL_Kernel { cl_command_queue _cq; // The default command queue for this kernel unsigned _num_args; + + #ifdef UCL_DEBUG + std::string _kernel_info_name; + unsigned _kernel_info_nargs; + //std::string _kernel_info_args[256]; + #endif }; inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) { @@ -329,9 +353,32 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) #endif return UCL_FUNCTION_NOT_FOUND; } + + #ifdef UCL_DEBUG + _kernel_info_name=function; + cl_uint nargs; + CL_SAFE_CALL(clGetKernelInfo(_kernel,CL_KERNEL_NUM_ARGS,sizeof(cl_uint), + &nargs,NULL)); + _kernel_info_nargs=nargs; + #ifdef NOT_TEST_CL_VERSION_1_2 + char tname[256]; + size_t ret; + for (cl_uint i=0; i inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, - const enum UCL_MEMOPT kind) { + const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ cl_int error_flag; cl_context context; CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context), &context,NULL)); - - if (kind==UCL_VIEW) { - mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR,n,mat.host_ptr(), - &error_flag); - CL_CHECK_ERR(error_flag); - return UCL_SUCCESS; - } - if (kind==UCL_WRITE_OPTIMIZED) { - mat.cbegin()=clCreateBuffer(context, - CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, - n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) - return UCL_MEMORY_ERROR; - *mat.host_ptr() = (typename mat_type::data_type*) - clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE, - CL_MAP_WRITE,0,n,0,NULL,NULL,NULL); + + cl_mem_flags buffer_perm; + cl_map_flags map_perm; + if (kind2==UCL_NOT_SPECIFIED) { + if (kind==UCL_READ_ONLY) { + #ifdef CL_VERSION_1_2 + buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR; + #else + buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR; + #endif + map_perm=CL_MAP_READ; + } else if (kind==UCL_WRITE_ONLY) { + #ifdef CL_VERSION_1_2 + buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR; + #else + buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR; + #endif + map_perm=CL_MAP_WRITE; + } else { + buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; + map_perm=CL_MAP_READ | CL_MAP_WRITE; + } } else { - mat.cbegin()=clCreateBuffer(context, - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, - n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) - return UCL_MEMORY_ERROR; + if (kind2==UCL_READ_ONLY) + buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR; + else if (kind2==UCL_WRITE_ONLY) + buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR; + else + buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; + + if (kind==UCL_READ_ONLY) { + #ifdef CL_VERSION_1_2 + buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY; + #endif + 
map_perm=CL_MAP_READ; + } else if (kind==UCL_WRITE_ONLY) { + #ifdef CL_VERSION_1_2 + buffer_perm=buffer_perm | CL_MEM_HOST_WRITE_ONLY; + #endif + map_perm=CL_MAP_WRITE; + } else + map_perm=CL_MAP_READ | CL_MAP_WRITE; + } + + mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag); + if (error_flag != CL_SUCCESS) + return UCL_MEMORY_ERROR; *mat.host_ptr() = (typename mat_type::data_type*) clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE, - CL_MAP_READ | CL_MAP_WRITE, - 0,n,0,NULL,NULL,NULL); - } + map_perm,0,n,0,NULL,NULL,NULL); + mat.cq()=cm.cq(); CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); return UCL_SUCCESS; } +template +inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) { + cl_int error_flag; + cl_context context; + CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context), + &context,NULL)); + cl_mem_flags orig_flags; + CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags), + &orig_flags,NULL)); + orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR; + + mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n, + mat.host_ptr(), &error_flag); + CL_CHECK_ERR(error_flag); + CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); + return UCL_SUCCESS; +} + template inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, - const enum UCL_MEMOPT kind) { - cl_int error_flag; - if (kind==UCL_VIEW) { - mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR, - n,mat.host_ptr(),&error_flag); - CL_CHECK_ERR(error_flag); - return UCL_SUCCESS; - } - if (kind==UCL_WRITE_OPTIMIZED) { - mat.cbegin()=clCreateBuffer(dev.context(), - CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, - n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) - return UCL_MEMORY_ERROR; - *mat.host_ptr() = (typename mat_type::data_type*) - clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE, - CL_MAP_WRITE,0,n,0,NULL,NULL,NULL); + const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ + cl_mem_flags buffer_perm; + cl_map_flags map_perm; + if (kind==UCL_READ_ONLY) { + #ifdef CL_VERSION_1_2 + buffer_perm=CL_MEM_HOST_READ_ONLY|CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR; + #else + buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR; + #endif + map_perm=CL_MAP_READ; + } else if (kind==UCL_WRITE_ONLY) { + #ifdef CL_VERSION_1_2 + buffer_perm=CL_MEM_HOST_WRITE_ONLY|CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR; + #else + buffer_perm=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR; + #endif + map_perm=CL_MAP_WRITE; } else { - mat.cbegin()=clCreateBuffer(dev.context(), - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, - n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) - return UCL_MEMORY_ERROR; - *mat.host_ptr() = (typename mat_type::data_type*) - clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE, - CL_MAP_READ & CL_MAP_WRITE, - 0,n,0,NULL,NULL,NULL); + buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; + map_perm=CL_MAP_READ | CL_MAP_WRITE; } + + cl_int error_flag; + mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,NULL,&error_flag); + if (error_flag != CL_SUCCESS) + return UCL_MEMORY_ERROR; + + *mat.host_ptr() = (typename mat_type::data_type*) + clEnqueueMapBuffer(dev.cq(),mat.cbegin(),CL_TRUE, + map_perm,0,n,0,NULL,NULL,NULL); mat.cq()=dev.cq(); CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); return UCL_SUCCESS; } template -inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) { - CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin())); - CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq())); +inline int _host_view(mat_type &mat, UCL_Device &dev, const 
size_t n) { + cl_int error_flag; + mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR, + n,mat.host_ptr(),&error_flag); + CL_CHECK_ERR(error_flag); + CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); + return UCL_SUCCESS; +} + +template +inline void _host_free(mat_type &mat) { + if (mat.cols()>0) { + CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin())); + CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq())); + } } template @@ -138,28 +194,26 @@ inline int _host_resize(mat_type &mat, const size_t n) { cl_context context; CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context), &context,NULL)); + cl_mem_flags buffer_perm; + CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_FLAGS,sizeof(buffer_perm), + &buffer_perm,NULL)); CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin())); - if (mat.kind()==UCL_WRITE_OPTIMIZED) { - mat.cbegin()=clCreateBuffer(context, - CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, - n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) - return UCL_MEMORY_ERROR; - *mat.host_ptr() = (typename mat_type::data_type*) - clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE, - CL_MAP_WRITE,0,n,0,NULL,NULL,NULL); - } else { - mat.cbegin()=clCreateBuffer(context, - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, - n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) - return UCL_MEMORY_ERROR; - *mat.host_ptr() = (typename mat_type::data_type*) - clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE, - CL_MAP_READ | CL_MAP_WRITE, - 0,n,0,NULL,NULL,NULL); - } + + cl_map_flags map_perm; + if (mat.kind()==UCL_READ_ONLY) + map_perm=CL_MAP_READ; + else if (mat.kind()==UCL_WRITE_ONLY) + map_perm=CL_MAP_WRITE; + else + map_perm=CL_MAP_READ | CL_MAP_WRITE; + + mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag); + if (error_flag != CL_SUCCESS) + return UCL_MEMORY_ERROR; + *mat.host_ptr() = (typename mat_type::data_type*) + clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE, + map_perm,0,n,0,NULL,NULL,NULL); return UCL_SUCCESS; } @@ -179,9 +233,17 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n, if (kind==UCL_READ_WRITE) flag=CL_MEM_READ_WRITE; else if (kind==UCL_READ_ONLY) + #ifdef CL_VERSION_1_2 + flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY; + #else flag=CL_MEM_READ_ONLY; + #endif else if (kind==UCL_WRITE_ONLY) + #ifdef CL_VERSION_1_2 + flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY; + #else flag=CL_MEM_WRITE_ONLY; + #endif else assert(0==1); mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag); @@ -200,9 +262,17 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n, if (kind==UCL_READ_WRITE) flag=CL_MEM_READ_WRITE; else if (kind==UCL_READ_ONLY) + #ifdef CL_VERSION_1_2 + flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY; + #else flag=CL_MEM_READ_ONLY; + #endif else if (kind==UCL_WRITE_ONLY) + #ifdef CL_VERSION_1_2 + flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY; + #else flag=CL_MEM_WRITE_ONLY; + #endif else assert(0==1); mat.cbegin()=clCreateBuffer(dev.context(),flag,n,NULL, @@ -238,8 +308,10 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t rows, template inline void _device_free(mat_type &mat) { - CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin())); - CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq())); + if (mat.cols()>0) { + CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin())); + CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq())); + } } template @@ -255,9 +327,17 @@ inline int _device_resize(mat_type &mat, const size_t n) { if (mat.kind()==UCL_READ_WRITE) flag=CL_MEM_READ_WRITE; else if 
(mat.kind()==UCL_READ_ONLY) + #ifdef CL_VERSION_1_2 + flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY; + #else flag=CL_MEM_READ_ONLY; + #endif else if (mat.kind()==UCL_WRITE_ONLY) + #ifdef CL_VERSION_1_2 + flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY; + #else flag=CL_MEM_WRITE_ONLY; + #endif else assert(0==1); mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag); @@ -285,9 +365,17 @@ inline int _device_resize(mat_type &mat, const size_t rows, if (mat.kind()==UCL_READ_WRITE) flag=CL_MEM_READ_WRITE; else if (mat.kind()==UCL_READ_ONLY) + #ifdef CL_VERSION_1_2 + flag=CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY; + #else flag=CL_MEM_READ_ONLY; + #endif else if (mat.kind()==UCL_WRITE_ONLY) + #ifdef CL_VERSION_1_2 + flag=CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY; + #else flag=CL_MEM_WRITE_ONLY; + #endif else assert(0==1); mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag); @@ -344,7 +432,19 @@ inline void _ocl_kernel_from_source(cl_context &context, cl_device_id &device, } template -inline void _device_zero(mat_type &mat, const size_t n) { +inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) { + #ifdef CL_VERSION_1_2 + #ifndef __APPLE__ + #define UCL_CL_ZERO + #endif + #endif + + #ifdef UCL_CL_ZERO + cl_int zeroint=0; + CL_SAFE_CALL(clEnqueueFillBuffer(cq,mat.begin(),&zeroint,sizeof(cl_int), + mat.byteoff(),n,0,NULL,NULL)); + + #else cl_context context; CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context), &context,NULL)); @@ -354,17 +454,20 @@ inline void _device_zero(mat_type &mat, const size_t n) { const char * szero[3]={ "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void _device_zero(__global NUMTYP *a)", - " { int gid=get_global_id(0); a[gid]=(NUMTYP)0; }" + "__kernel void _device_zero(__global NUMTYP *a, const int offset)", + " { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }" }; cl_kernel kzero; _ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero", _UCL_DATA_ID::numtyp_flag()); + cl_int offset=mat.offset(); CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin())); + CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset)); size_t kn=n/sizeof(typename mat_type::data_type); - CL_SAFE_CALL(clEnqueueNDRangeKernel(mat.cq(),kzero,1,0,&kn,0,0,0,0)); + CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,0,&kn,0,0,0,0)); + #endif } // -------------------------------------------------------------------------- @@ -470,9 +573,15 @@ template <> struct _ucl_memcpy<1,0> { cl_command_queue &cq, const cl_bool block, const size_t dst_offset, const size_t src_offset) { if (src.cbegin()==dst.cbegin()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 1S\n"; + #endif if (block) ucl_sync(cq); return; } + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 1NS\n"; + #endif CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n, dst.begin(),0,NULL,NULL)); } @@ -484,8 +593,14 @@ template <> struct _ucl_memcpy<1,0> { size_t dst_offset, size_t src_offset) { if (src.cbegin()==dst.cbegin()) { if (block) ucl_sync(cq); + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 2S\n"; + #endif return; } + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 2NS\n"; + #endif if (spitch==dpitch && dst.cols()==src.cols() && src.cols()==cols/src.element_size()) CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset, @@ -511,8 +626,14 @@ template <> struct _ucl_memcpy<0,1> { const size_t dst_offset, const size_t src_offset) { if (src.cbegin()==dst.cbegin()) { if (block) 
ucl_sync(cq); + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 3S\n"; + #endif return; } + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 3NS\n"; + #endif CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n, src.begin(),0,NULL,NULL)); } @@ -524,8 +645,14 @@ template <> struct _ucl_memcpy<0,1> { size_t dst_offset, size_t src_offset) { if (src.cbegin()==dst.cbegin()) { if (block) ucl_sync(cq); + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 4S\n"; + #endif return; } + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 4NS\n"; + #endif if (spitch==dpitch && dst.cols()==src.cols() && src.cols()==cols/src.element_size()) CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset, @@ -549,9 +676,17 @@ template struct _ucl_memcpy { static inline void mc(p1 &dst, const p2 &src, const size_t n, cl_command_queue &cq, const cl_bool block, const size_t dst_offset, const size_t src_offset) { - if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) + if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) { CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset, dst_offset,n,0,NULL,NULL)); + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 6NS\n"; + #endif + } + #ifdef UCL_DBG_MEM_TRACE + else std::cerr << "UCL_COPY 6S\n"; + #endif + if (block==CL_TRUE) ucl_sync(cq); } template @@ -561,6 +696,9 @@ template struct _ucl_memcpy { const cl_bool block, size_t dst_offset, size_t src_offset) { if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 7NS\n"; + #endif if (spitch==dpitch && dst.cols()==src.cols() && src.cols()==cols/src.element_size()) CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset, @@ -575,6 +713,10 @@ template struct _ucl_memcpy { dst_offset+=dpitch; } } + #ifdef UCL_DBG_MEM_TRACE + else std::cerr << "UCL_COPY 7S\n"; + #endif + if (block==CL_TRUE) ucl_sync(cq); } }; diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index d3d6810259..627d19d66f 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -27,6 +27,12 @@ #include "ocl_macros.h" #include "ocl_device.h" +#ifdef CL_VERSION_1_2 +#define UCL_OCL_MARKER(cq,event) clEnqueueMarkerWithWaitList(cq,0,NULL,event) +#else +#define UCL_OCL_MARKER clEnqueueMarker +#endif + namespace ucl_opencl { /// Class for timing OpenCL events @@ -63,10 +69,10 @@ class UCL_Timer { } /// Start timing on default command queue - inline void start() { clEnqueueMarker(_cq,&start_event); } + inline void start() { UCL_OCL_MARKER(_cq,&start_event); } /// Stop timing on default command queue - inline void stop() { clEnqueueMarker(_cq,&stop_event); } + inline void stop() { UCL_OCL_MARKER(_cq,&stop_event); } /// Block until the start event has been reached on device inline void sync_start() @@ -78,7 +84,7 @@ class UCL_Timer { /// Set the time elapsed to zero (not the total_time) inline void zero() - { clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); } + { UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); } /// Set the total time to zero inline void zero_total() { _total_time=0.0; } diff --git a/lib/gpu/geryon/ucl_basemat.h b/lib/gpu/geryon/ucl_basemat.h index 844071c9b5..4edf83e057 100644 --- a/lib/gpu/geryon/ucl_basemat.h +++ b/lib/gpu/geryon/ucl_basemat.h @@ -58,19 +58,36 @@ * calls for reserving and copying memory **/ class UCL_BaseMat { public: - UCL_BaseMat() : _cq(0) { } + UCL_BaseMat() : _cq(0), _kind(UCL_VIEW) { } virtual ~UCL_BaseMat() { } /// Return the 
default command queue/stream associated with this data inline command_queue & cq() { return _cq; } + /// Change the default command queue associated with matrix + inline void cq(command_queue &cq_in) { _cq=cq_in; } /// Block until command_queue associated with matrix is complete inline void sync() { ucl_sync(_cq); } + /// Return the type/permissions of memory allocation + /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED + * or UCL_VIEW **/ + inline enum UCL_MEMOPT kind() const { return _kind; } + + inline bool shared_mem_device() { + #ifdef _OCL_MAT + cl_device_id device; + CL_SAFE_CALL(clGetCommandQueueInfo(_cq,CL_QUEUE_DEVICE, + sizeof(cl_device_id),&device,NULL)); + cl_device_type device_type; + CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE, + sizeof(device_type),&device_type,NULL)); + return _shared_mem_device(device_type); + #else + return false; + #endif + } - #ifdef UCL_DEBUG - // Returns the type of host allocation - virtual inline enum UCL_MEMOPT kind() const { return UCL_NOT_PINNED; } - #endif protected: command_queue _cq; + enum UCL_MEMOPT _kind; }; #endif diff --git a/lib/gpu/geryon/ucl_copy.h b/lib/gpu/geryon/ucl_copy.h index c201cc0b12..c6bff97a8c 100644 --- a/lib/gpu/geryon/ucl_copy.h +++ b/lib/gpu/geryon/ucl_copy.h @@ -102,6 +102,30 @@ // Only allow this file to be included by nvc_memory.h and ocl_memory.h #ifdef UCL_COPY_ALLOW +// -------------------------------------------------------------------------- +// - CHECK PERMISSIONS FOR SOURCE AND DESTINATION IN COPY +// -------------------------------------------------------------------------- +template +inline void _check_ucl_copy_perm(mat1 &dst, mat2 &src) { + if ((int)mat1::MEM_TYPE==(int)mat2::MEM_TYPE) { + if (dst.kind()==UCL_READ_ONLY) { + std::cerr << "Attempt to copy where destination is UCL_READ_ONLY\n"; + assert(0==1); + } else if (src.kind()==UCL_WRITE_ONLY) { + std::cerr << "Attempt to copy where source is UCL_WRITE_ONLY\n"; + assert(0==1); + } + } else { + if (dst.kind()==UCL_WRITE_ONLY) { + std::cerr << "Destination in host-device copy cannot be UCL_WRITE_ONLY\n"; + assert(0==1); + } else if (src.kind()==UCL_READ_ONLY) { + std::cerr << "Source in host-device copy cannot be UCL_READ_ONLY\n"; + assert(0==1); + } + } +} + // -------------------------------------------------------------------------- // - HOST-HOST COPY ROUTINES // -------------------------------------------------------------------------- @@ -117,9 +141,20 @@ template <> struct _host_host_copy<1,1> { assert(mat1::PADDED==0 && mat2::PADDED==0); assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); #endif - if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) + if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) { + #ifdef _OCL_MAT + if (dst.begin()==src.begin()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 7S\n"; + #endif + return; + } + #endif memcpy(dst.begin(),src.begin(),numel*sizeof(typename mat1::data_type)); - else + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 7NS\n"; + #endif + } else for (size_t i=0; i(src[i]); } @@ -138,15 +173,27 @@ template <> struct _host_host_copy<1,1> { src_row_size=cols; else src_row_size=src.row_size(); - if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) + if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) { + #ifdef _OCL_MAT + if (dst.begin()==src.begin()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 8S\n"; + #endif + return; + } + #endif + + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_COPY 
8NS\n"; + #endif for (size_t i=0; i(src[src_i]); src_i++; @@ -216,15 +263,14 @@ template struct _ucl_cast_copy<1,host_type2> { ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, src.row_bytes(),cols*sizeof(typename mat2::data_type), rows); - int dst_i=0; - int buff_i=0; + size_t dst_i=0, buff_i=0, doff=dst.cols()-cols; for (size_t i=0; i(cast_buffer[buff_i]); buff_i++; dst_i++; } - dst_i+=dst.cols()-cols; + dst_i+=doff; } } } @@ -255,15 +301,14 @@ template struct _ucl_cast_copy<1,host_type2> { src.row_bytes(),cols*sizeof(typename mat2::data_type), rows,cq); cast_buffer.sync(); - int dst_i=0; - int buff_i=0; + size_t dst_i=0, buff_i=0, doff=dst.cols()-cols; for (size_t i=0; i(cast_buffer[buff_i]); buff_i++; dst_i++; } - dst_i+=dst.cols()-cols; + dst_i+=doff; } } } @@ -293,38 +338,62 @@ template struct _ucl_cast_copy { assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); + if (mat3::VECTOR==0) { + assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols); + assert(dst.rows()>=rows && dst.cols()>=cols); + } #endif if (mat2::VECTOR) { - for (size_t i=0; i(src[i]); - ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer, - cols*sizeof(typename mat1::data_type), - cols*sizeof(typename mat1::data_type),rows); + if (mat3::VECTOR==0) { + size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols; + for (size_t i=0; i(src[si]); + ci++; + si++; + } + ci+=co; + si+=so; + } + ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(), + cols*sizeof(typename mat1::data_type),rows); + } else { + for (size_t i=0; i(src[i]); + ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer, + cols*sizeof(typename mat1::data_type), + cols*sizeof(typename mat1::data_type),rows); + } } else if (mat1::VECTOR) { - int src_i=0; - int buf_i=0; + size_t src_i=0, buf_i=0, soff=src.cols()-cols; for (size_t i=0; i(src[src_i]); buf_i++; src_i++; } - src_i+=src.cols()-cols; + src_i+=soff; } ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows); } else { - int src_i=0; - int buf_i=0; + size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch; + if (mat3::VECTOR==0) { + co=cast_buffer.cols()-cols; + spitch=cast_buffer.row_bytes(); + } else { + co=0; + spitch=cols*sizeof(typename mat1::data_type); + } for (size_t i=0; i(src[src_i]); buf_i++; src_i++; } - src_i+=src.cols()-cols; + src_i+=so; + buf_i+=co; } - ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer, - cols*sizeof(typename mat1::data_type), + ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch, cols*sizeof(typename mat1::data_type),rows); } } @@ -337,38 +406,62 @@ template struct _ucl_cast_copy { assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); + if (mat3::VECTOR==0) { + assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols); + assert(dst.rows()>=rows && dst.cols()>=cols); + } #endif if (mat2::VECTOR) { - for (size_t i=0; i(src[i]); - ucl_mv_cpy(dst,dst.row_bytes(), - cast_buffer,cols*sizeof(typename mat1::data_type), - cols*sizeof(typename mat1::data_type),rows,cq); + if (mat3::VECTOR==0) { + size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols; + for (size_t i=0; i(src[si]); + ci++; + si++; + } + ci+=co; + si+=so; + } + ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(), + cols*sizeof(typename mat1::data_type),rows); + } else { 
+ for (size_t i=0; i(src[i]); + ucl_mv_cpy(dst,dst.row_bytes(), + cast_buffer,cols*sizeof(typename mat1::data_type), + cols*sizeof(typename mat1::data_type),rows,cq); + } } else if (mat1::VECTOR) { - int src_i=0; - int buf_i=0; + size_t src_i=0, buf_i=0, soff=src.cols()-cols; for (size_t i=0; i(src[src_i]); buf_i++; src_i++; } - src_i+=src.cols()-cols; + src_i+=soff; } ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows,cq); } else { - int src_i=0; - int buf_i=0; + size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch; + if (mat3::VECTOR==0) { + co=cast_buffer.cols()-cols; + spitch=cast_buffer.row_bytes(); + } else { + co=0; + spitch=cols*sizeof(typename mat1::data_type); + } for (size_t i=0; i(src[src_i]); buf_i++; src_i++; } - src_i+=src.cols()-cols; + src_i+=so; + buf_i+=co; } - ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer, - cols*sizeof(typename mat1::data_type), + ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch, cols*sizeof(typename mat1::data_type),rows,cq); } } @@ -444,9 +537,13 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel, #endif if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) ucl_copy(dst,src,numel,cq); - else + else { + #ifdef UCL_DEBUG + _check_ucl_copy_perm(dst,src); + #endif _ucl_cast_copy::cc(dst,src,numel, cast_buffer,cq); + } } /// Asynchronous copy of matrix/vector with cast (Device/Host transfer) @@ -463,6 +560,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel, assert(dst.numel()>=numel && src.numel()>=numel); assert(cast_buffer.numel()>=numel); assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + _check_ucl_copy_perm(dst,src); #endif if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) ucl_copy(dst,src,numel,async); @@ -491,6 +589,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel); assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + _check_ucl_copy_perm(dst,src); #endif if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) _host_host_copy::hhc(dst,src,numel); @@ -498,12 +597,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { if (mat1::MEM_TYPE==1) { UCL_H_Vec cast_buffer; - cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED); + cast_buffer.alloc(numel,dst,UCL_READ_ONLY); _ucl_cast_copy::cc(dst,src,numel, cast_buffer,cq); } else { UCL_H_Vec cast_buffer; - cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED); + cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY); _ucl_cast_copy::cc(dst,src,numel, cast_buffer,cq); } @@ -529,6 +628,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, #ifdef UCL_DEBUG assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel); assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); + _check_ucl_copy_perm(dst,src); #endif if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) _host_host_copy::hhc(dst,src,numel); @@ -538,12 +638,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { if (mat1::MEM_TYPE==1) { UCL_H_Vec cast_buffer; - cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED); + cast_buffer.alloc(numel,dst,UCL_READ_ONLY); _ucl_cast_copy::cc(dst,src,numel, cast_buffer); } else { UCL_H_Vec cast_buffer; - cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED); + cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY); _ucl_cast_copy::cc(dst,src,numel, cast_buffer); } @@ -574,9 +674,13 @@ inline void ucl_cast_copy(mat1 
&dst, const mat2 &src, const size_t rows, ucl_copy(dst,src,rows,cols,async); else if (async) ucl_copy(dst,src,rows,cols,dst.cq()); - else + else { + #ifdef UCL_DEBUG + _check_ucl_copy_perm(dst,src); + #endif _ucl_cast_copy::cc(dst,src,rows,cols, cast_buffer); + } } /// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer) @@ -595,9 +699,13 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows, command_queue &cq) { if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) ucl_copy(dst,src,rows,cols,cq); - else + else { + #ifdef UCL_DEBUG + _check_ucl_copy_perm(dst,src); + #endif _ucl_cast_copy::cc(dst,src,rows,cols, cast_buffer,cq); + } } /// Asynchronous copy of subset matrix rows,cols (memory already allocated) @@ -617,18 +725,21 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows, template inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, const size_t cols, command_queue &cq) { + #ifdef UCL_DEBUG + _check_ucl_copy_perm(dst,src); + #endif if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) _host_host_copy::hhc(dst,src,rows,cols); else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { if (mat1::MEM_TYPE==1) { UCL_H_Vec cast_buffer; - cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED); + cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY); _ucl_cast_copy::cc(dst,src,rows,cols, cast_buffer,cq); } else { UCL_H_Vec cast_buffer; - cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED); + cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY); _ucl_cast_copy::cc(dst,src,rows,cols, cast_buffer,cq); } @@ -678,6 +789,9 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, template inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, const size_t cols, const bool async) { + #ifdef UCL_DEBUG + _check_ucl_copy_perm(dst,src); + #endif if (async) ucl_copy(dst,src,rows,cols,dst.cq()); else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) @@ -686,12 +800,12 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { if (mat1::MEM_TYPE==1) { UCL_H_Vec cast_buffer; - cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED); + cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY); _ucl_cast_copy::cc(dst,src,rows,cols, cast_buffer); } else { UCL_H_Vec cast_buffer; - cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED); + cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY); _ucl_cast_copy::cc(dst,src,rows,cols, cast_buffer); } diff --git a/lib/gpu/geryon/ucl_d_mat.h b/lib/gpu/geryon/ucl_d_mat.h index b065a8b644..f1aaa27903 100644 --- a/lib/gpu/geryon/ucl_d_mat.h +++ b/lib/gpu/geryon/ucl_d_mat.h @@ -39,14 +39,14 @@ class UCL_D_Mat : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_D_Mat() : _rows(0), _kind(UCL_VIEW) {} - ~UCL_D_Mat() { if (_kind!=UCL_VIEW) _device_free(*this); } + UCL_D_Mat() : _cols(0) {} + ~UCL_D_Mat() { _device_free(*this); } /// Construct with specified rows and cols /** \sa alloc() **/ UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device, const enum UCL_MEMOPT kind=UCL_READ_WRITE) : - _rows(0), _kind(UCL_VIEW) { alloc(rows,cols,device,kind); } + _cols(0) { alloc(rows,cols,device,kind); } /// Row major matrix on device /** The kind parameter controls memory optimizations as follows: @@ -121,15 +121,11 @@ class UCL_D_Mat : public UCL_BaseMat { return err; } - /// Return the type of memory allocation - /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/ - inline enum UCL_MEMOPT kind() const { 
return _kind; } - /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * \param stride Number of _elements_ between the start of each row **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols, @@ -142,8 +138,10 @@ class UCL_D_Mat : public UCL_BaseMat { _row_size=stride; this->_cq=input.cq(); #ifdef _OCL_MAT - _offset=0; + _offset=input.offset(); _array=input.cbegin(); + CL_SAFE_CALL(clRetainMemObject(input.cbegin())); + CL_SAFE_CALL(clRetainCommandQueue(input.cq())); #else _device_view(&_array,input.begin()); #endif @@ -157,7 +155,7 @@ class UCL_D_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols) { view(input,rows,cols,input.row_size()); } @@ -166,7 +164,7 @@ class UCL_D_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template @@ -177,7 +175,7 @@ class UCL_D_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template @@ -187,7 +185,7 @@ class UCL_D_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * \param stride Number of _elements_ between the start of each row **/ template inline void view(ptr_type input, const size_t rows, const size_t cols, @@ -205,13 +203,15 @@ class UCL_D_Mat : public UCL_BaseMat { #endif #ifdef _OCL_MAT _offset=0; + CL_SAFE_CALL(clRetainMemObject(input)); + CL_SAFE_CALL(clRetainCommandQueue(dev.cq())); #endif } /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view(ptr_type input, const size_t rows, const size_t cols, UCL_Device &dev) { view(input,rows,cols,cols,dev); } @@ -219,7 +219,7 @@ class UCL_D_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view(ptr_type input, const size_t cols, UCL_Device &dev) { view(input,1,cols,dev); } @@ -228,7 +228,7 @@ class UCL_D_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, @@ -242,7 +242,9 @@ class UCL_D_Mat : public UCL_BaseMat { this->_cq=input.cq(); #ifdef _OCL_MAT _array=input.begin(); - _offset=offset; + _offset=offset+input.offset(); + CL_SAFE_CALL(clRetainMemObject(input.cbegin())); + CL_SAFE_CALL(clRetainCommandQueue(input.cq())); #else _device_view(&_array,input.begin(),offset,sizeof(numtyp)); #endif @@ -256,7 +258,7 @@ class UCL_D_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, const size_t cols) @@ -266,7 +268,7 @@ class UCL_D_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template @@ -277,7 +279,7 @@ class UCL_D_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template @@ -292,7 +294,7 @@ class UCL_D_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ptr_type input,const size_t rows, @@ -309,6 +311,8 @@ class UCL_D_Mat : public UCL_BaseMat { #ifdef _OCL_MAT _array=input; _offset=offset; + CL_SAFE_CALL(clRetainMemObject(input)); + CL_SAFE_CALL(clRetainCommandQueue(dev.cq())); #else #ifdef _UCL_DEVICE_PTR_MAT _array=input+offset*sizeof(numtyp); @@ -325,7 +329,7 @@ class UCL_D_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view_offset(const size_t offset,ptr_type input,const size_t rows, const size_t cols, UCL_Device &dev) @@ -334,7 +338,7 @@ class UCL_D_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view_offset(const size_t offset, ptr_type input, const size_t cols, UCL_Device &dev) @@ -342,7 +346,7 @@ class UCL_D_Mat : public UCL_BaseMat { /// Free memory and set size to 0 inline void clear() - { _rows=0; if (_kind!=UCL_VIEW) { _kind=UCL_VIEW; _device_free(*this); } } + { _device_free(*this); _cols=0; _kind=UCL_VIEW; } /// Resize the allocation to contain cols elements /** \note Cannot be used on views **/ @@ -377,11 +381,17 @@ class UCL_D_Mat : public UCL_BaseMat { { if (cols>_cols || rows>_rows) return resize(rows,cols); else return UCL_SUCCESS; } - /// Set each element to zero - inline void zero() { _device_zero(*this,row_bytes()*_rows); } - - /// Set first n elements to zero - inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); } + /// Set each element to zero asynchronously in the default command_queue + inline void zero() { zero(_cq); } + /// Set first n elements to zero asynchronously in the default command_queue + inline void zero(const int n) { zero(n,_cq); } + /// Set each element to zero asynchronously + inline void zero(command_queue &cq) + { _device_zero(*this,row_bytes()*_rows,cq); } + /// Set first n elements to zero asynchronously + inline void zero(const int n, command_queue &cq) + { _device_zero(*this,n*sizeof(numtyp),cq); } + #ifdef _UCL_DEVICE_PTR_MAT /// For OpenCL, returns a (void *) device pointer to memory allocation @@ -452,7 +462,6 @@ class UCL_D_Mat : public UCL_BaseMat { private: size_t _pitch, _row_size, _rows, _cols; - enum UCL_MEMOPT _kind; #ifdef _UCL_DEVICE_PTR_MAT device_ptr _array; diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index 11107437ea..fc1977f4b5 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -39,14 +39,14 @@ class UCL_D_Vec : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_D_Vec() : _cols(0), _kind(UCL_VIEW) {} - ~UCL_D_Vec() { if (_kind!=UCL_VIEW) _device_free(*this); } + UCL_D_Vec() : _cols(0) {} + ~UCL_D_Vec() { _device_free(*this); } /// Construct with n columns /** \sa alloc() **/ UCL_D_Vec(const size_t n, UCL_Device &device, const enum UCL_MEMOPT kind=UCL_READ_WRITE) : - _cols(0), _kind(UCL_VIEW) { alloc(n,device,kind); } + _cols(0) { alloc(n,device,kind); } /// Set up host vector with 'cols' columns and reserve memory /** The kind parameter controls memory optimizations as follows: @@ -119,15 +119,11 @@ class UCL_D_Vec : public UCL_BaseMat { return err; } - /// Return the type of memory allocation - /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/ - inline enum UCL_MEMOPT kind() const { return _kind; } - /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
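// ---------------------------------------------------------------------------
// Editorial illustration (not part of the patch): the zero() overloads added
// above now enqueue the clear asynchronously in a command queue.  Assumes a
// constructed UCL_Device named `gpu`; names are hypothetical.
// ---------------------------------------------------------------------------
inline void example_async_zero(UCL_Device &gpu) {
  UCL_D_Mat<float> m(64,64,gpu,UCL_READ_WRITE);
  m.zero();             // whole matrix, m's default command queue
  m.zero(64);           // only the first 64 elements
  m.zero(64,gpu.cq());  // first 64 elements, explicitly chosen queue
  // The calls return immediately; any host-side read of the data must first
  // wait for the queue used above to complete.
}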
* - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols) { #ifdef UCL_DEBUG @@ -139,8 +135,10 @@ class UCL_D_Vec : public UCL_BaseMat { _row_bytes=_cols*sizeof(numtyp); this->_cq=input.cq(); #ifdef _OCL_MAT - _offset=0; + _offset=input.offset(); _array=input.cbegin(); + CL_SAFE_CALL(clRetainMemObject(input.cbegin())); + CL_SAFE_CALL(clRetainCommandQueue(input.cq())); #else _device_view(&_array,input.begin()); #endif @@ -154,7 +152,7 @@ class UCL_D_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * \param stride Number of _elements_ between the start of each row **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols, @@ -164,7 +162,7 @@ class UCL_D_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template @@ -175,7 +173,7 @@ class UCL_D_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template @@ -185,7 +183,7 @@ class UCL_D_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view(ptr_type input, const size_t rows, const size_t cols, UCL_Device &dev) { @@ -203,13 +201,15 @@ class UCL_D_Vec : public UCL_BaseMat { #endif #ifdef _OCL_MAT _offset=0; + CL_SAFE_CALL(clRetainMemObject(input)); + CL_SAFE_CALL(clRetainCommandQueue(dev.cq())); #endif } /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * \param stride Number of _elements_ between the start of each row **/ template inline void view(ptr_type input, const size_t rows, const size_t cols, @@ -219,7 +219,7 @@ class UCL_D_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view(ptr_type input, const size_t cols, UCL_Device &dev) { view(input,1,cols,dev); } @@ -228,7 +228,7 @@ class UCL_D_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, const size_t cols) { @@ -242,7 +242,9 @@ class UCL_D_Vec : public UCL_BaseMat { this->_cq=input.cq(); #ifdef _OCL_MAT _array=input.begin(); - _offset=offset; + _offset=offset+input.offset(); + CL_SAFE_CALL(clRetainMemObject(input.begin())); + CL_SAFE_CALL(clRetainCommandQueue(input.cq())); #else _device_view(&_array,input.begin(),offset,sizeof(numtyp)); #endif @@ -256,7 +258,7 @@ class UCL_D_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, @@ -267,7 +269,7 @@ class UCL_D_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template @@ -278,7 +280,7 @@ class UCL_D_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template @@ -288,7 +290,7 @@ class UCL_D_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view_offset(const size_t offset,ptr_type input,const size_t rows, const size_t cols, UCL_Device &dev) { @@ -304,6 +306,8 @@ class UCL_D_Vec : public UCL_BaseMat { #ifdef _OCL_MAT _array=input; _offset=offset; + CL_SAFE_CALL(clRetainMemObject(input)); + CL_SAFE_CALL(clRetainCommandQueue(dev.cq())); #else #ifdef _UCL_DEVICE_PTR_MAT _array=input+offset*sizeof(numtyp); @@ -320,7 +324,7 @@ class UCL_D_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ptr_type input,const size_t rows, @@ -330,7 +334,7 @@ class UCL_D_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. 
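// ---------------------------------------------------------------------------
// Editorial illustration (not part of the patch): an offset view shares the
// parent allocation without copying.  Under OpenCL this patch now retains the
// cl_mem and command queue, so the view keeps the buffer alive; under CUDA
// the parent container must still outlive the view.  Assumes a constructed
// UCL_Device named `gpu`; names are hypothetical.
// ---------------------------------------------------------------------------
inline void example_offset_view(UCL_Device &gpu) {
  const size_t m=128;
  UCL_D_Vec<float> whole(2*m,gpu,UCL_READ_WRITE);
  UCL_D_Vec<float> upper;
  upper.view_offset(m,whole,1,m);  // elements [m,2m) of `whole`, 1 row x m cols
  upper.zero();                    // clears only the viewed half
}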
* - The view does not prevent the memory from being freed by the - * allocating container **/ + * allocating container when using CUDA APIs **/ template inline void view_offset(const size_t offset, ptr_type input, const size_t cols, UCL_Device &dev) @@ -338,7 +342,7 @@ class UCL_D_Vec : public UCL_BaseMat { /// Free memory and set size to 0 inline void clear() - { if (_kind!=UCL_VIEW) { _cols=0; _kind=UCL_VIEW; _device_free(*this); } } + { _device_free(*this); _cols=0; _kind=UCL_VIEW; } /// Resize the allocation to contain cols elements /** \note Cannot be used on views **/ @@ -373,11 +377,15 @@ class UCL_D_Vec : public UCL_BaseMat { inline int resize_ib(const int cols) { if (cols>_cols) return resize(cols); else return UCL_SUCCESS; } - /// Set each element to zero - inline void zero() { _device_zero(*this,row_bytes()); } - - /// Set first n elements to zero - inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); } + /// Set each element to zero asynchronously in the default command_queue + inline void zero() { zero(_cq); } + /// Set first n elements to zero asynchronously in the default command_queue + inline void zero(const int n) { zero(n,_cq); } + /// Set each element to zero asynchronously + inline void zero(command_queue &cq) { _device_zero(*this,row_bytes(),cq); } + /// Set first n elements to zero asynchronously + inline void zero(const int n, command_queue &cq) + { _device_zero(*this,n*sizeof(numtyp),cq); } #ifdef _UCL_DEVICE_PTR_MAT /// For OpenCL, returns a (void *) device pointer to memory allocation @@ -465,7 +473,6 @@ class UCL_D_Vec : public UCL_BaseMat { private: size_t _row_bytes, _row_size, _rows, _cols; - enum UCL_MEMOPT _kind; #ifdef _UCL_DEVICE_PTR_MAT device_ptr _array; diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h index 806b930630..dc6da3de0c 100644 --- a/lib/gpu/geryon/ucl_h_mat.h +++ b/lib/gpu/geryon/ucl_h_mat.h @@ -39,33 +39,35 @@ class UCL_H_Mat : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { + UCL_H_Mat() : _cols(0) { #ifdef _OCL_MAT _carray=(cl_mem)(0); #endif } - ~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); } + ~UCL_H_Mat() { _host_free(*this); } /// Construct with specied number of rows and columns /** \sa alloc() **/ UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device, - const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) - { _rows=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); } + const enum UCL_MEMOPT kind=UCL_READ_WRITE) + { _cols=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); } /// Set up host matrix with specied # of rows/cols and reserve memory /** The kind parameter controls memory pinning as follows: - * - UCL_NOT_PINNED - Memory is not pinned - * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) - * - UCL_RW_OPTIMIZED - Memory can be pinned + * - UCL_READ_WRITE - Specify that you will read and write from host + * - UCL_WRITE_ONLY - Specify that you will only write from host + * - UCL_READ_ONLY - Specify that you will only read from host + * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host * \param cq Default command queue for operations copied from another mat * \return UCL_SUCCESS if the memory allocation is successful **/ template inline int alloc(const size_t rows, const size_t cols, mat_type &cq, - const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { + const enum UCL_MEMOPT kind=UCL_READ_WRITE, + const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) { clear(); _row_bytes=cols*sizeof(numtyp); - int 
err=_host_alloc(*this,cq,_row_bytes*rows,kind); + int err=_host_alloc(*this,cq,_row_bytes*rows,kind,kind2); if (err!=UCL_SUCCESS) { #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows @@ -86,17 +88,19 @@ class UCL_H_Mat : public UCL_BaseMat { /// Set up host matrix with specied # of rows/cols and reserve memory /** The kind parameter controls memory pinning as follows: - * - UCL_NOT_PINNED - Memory is not pinned - * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) - * - UCL_RW_OPTIMIZED - Memory can be pinned + * - UCL_READ_WRITE - Specify that you will read and write from host + * - UCL_WRITE_ONLY - Specify that you will only write from host + * - UCL_READ_ONLY - Specify that you will only read from host + * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host * \param device Used to get the default command queue for operations * \return UCL_SUCCESS if the memory allocation is successful **/ inline int alloc(const size_t rows, const size_t cols, UCL_Device &device, - const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { + const enum UCL_MEMOPT kind=UCL_READ_WRITE, + const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) { clear(); _row_bytes=cols*sizeof(numtyp); - int err=_host_alloc(*this,device,_row_bytes*rows,kind); + int err=_host_alloc(*this,device,_row_bytes*rows,kind,kind2); if (err!=UCL_SUCCESS) { #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows @@ -115,15 +119,11 @@ class UCL_H_Mat : public UCL_BaseMat { return err; } - /// Return the type of memory allocation - /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/ - inline enum UCL_MEMOPT kind() const { return _kind; } - /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported * \param stride Number of _elements_ between the start of each row **/ template @@ -140,6 +140,8 @@ class UCL_H_Mat : public UCL_BaseMat { _end=_array+_cols; #ifdef _OCL_MAT _carray=input.cbegin(); + CL_SAFE_CALL(clRetainMemObject(input.cbegin())); + CL_SAFE_CALL(clRetainCommandQueue(input.cq())); #endif } @@ -147,7 +149,7 @@ class UCL_H_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols) @@ -157,7 +159,7 @@ class UCL_H_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view * - Viewing a device container on the host is not supported **/ @@ -169,9 +171,9 @@ class UCL_H_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
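// ---------------------------------------------------------------------------
// Editorial illustration (not part of the patch): the alloc() overloads above
// take the host-access hint directly (UCL_READ_WRITE / UCL_WRITE_ONLY /
// UCL_READ_ONLY / UCL_NOT_PINNED); the optional second hint is only consulted
// when the same allocation also backs the device view on shared-memory
// hardware.  Assumes a constructed UCL_Device named `gpu`.
// ---------------------------------------------------------------------------
inline int example_host_alloc(UCL_Device &gpu) {
  UCL_H_Mat<double> staging;
  // Host code only writes into this matrix before pushing it to the device,
  // so request write-optimized (possibly write-combined) pinned memory:
  int err=staging.alloc(32,32,gpu,UCL_WRITE_ONLY);
  if (err!=UCL_SUCCESS) return err;   // propagate the allocation error
  staging.zero();
  return UCL_SUCCESS;
}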
* - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) - * will be used for view + * will be used for view when using CUDA APIs * - Viewing a device container on the host is not supported **/ template inline void view(ucl_type &input) @@ -180,7 +182,7 @@ class UCL_H_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported * \param stride Number of _elements_ between the start of each row **/ template @@ -197,14 +199,14 @@ class UCL_H_Mat : public UCL_BaseMat { _end=_array+_cols; #ifdef _OCL_MAT - _host_alloc(*this,dev,_row_bytes,UCL_VIEW); + _host_view(*this,dev,_row_bytes*rows); #endif } /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template inline void view(ptr_type *input, const size_t rows, const size_t cols, @@ -213,7 +215,7 @@ class UCL_H_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template inline void view(ptr_type *input, const size_t cols, UCL_Device &dev) @@ -223,7 +225,7 @@ class UCL_H_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported * \param stride Number of _elements_ between the start of each row **/ template @@ -239,7 +241,7 @@ class UCL_H_Mat : public UCL_BaseMat { _array=input.begin()+offset; _end=_array+_cols; #ifdef _OCL_MAT - _host_alloc(*this,input,_row_bytes,UCL_VIEW); + _host_view(*this,input,_row_bytes*_rows); #endif } @@ -247,7 +249,7 @@ class UCL_H_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, @@ -258,7 +260,7 @@ class UCL_H_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view * - Viewing a device container on the host is not supported **/ @@ -270,7 +272,7 @@ class UCL_H_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view * - Viewing a device container on the host is not supported **/ @@ -296,7 +298,7 @@ class UCL_H_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported * \param stride Number of _elements_ between the start of each row **/ template @@ -307,7 +309,7 @@ class UCL_H_Mat : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template inline void view_offset(const size_t offset, ptr_type *input, @@ -316,7 +318,7 @@ class UCL_H_Mat : public UCL_BaseMat { /// Free memory and set size to 0 inline void clear() - { if (_kind!=UCL_VIEW) {_rows=0; _kind=UCL_VIEW; _host_free(*this,_kind); }} + { _host_free(*this); _cols=0; _kind=UCL_VIEW; } /// Resize the allocation to rows x cols elements /** \note Cannot be used on views **/ @@ -409,7 +411,6 @@ class UCL_H_Mat : public UCL_BaseMat { #endif private: - enum UCL_MEMOPT _kind; numtyp *_array, *_end; size_t _row_bytes, _rows, _cols; diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index 3a53113153..773facdea0 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -39,33 +39,35 @@ class UCL_H_Vec : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { + UCL_H_Vec() : _cols(0) { #ifdef _OCL_MAT _carray=(cl_mem)(0); #endif } - ~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); } + ~UCL_H_Vec() { _host_free(*this); } /// Construct with n columns /** \sa alloc() **/ UCL_H_Vec(const size_t n, UCL_Device &device, - const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) + const enum UCL_MEMOPT kind=UCL_READ_WRITE) { _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); } /// Set up host vector with 'cols' columns and reserve memory /** The kind parameter controls memory pinning as follows: - * - UCL_NOT_PINNED - Memory is not pinned - * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) - * - UCL_RW_OPTIMIZED - Memory can be pinned + * - UCL_READ_WRITE - Specify that you will read and write from host + * - UCL_WRITE_ONLY - Specify that you will only write from host + * - UCL_READ_ONLY - Specify that you will only read from host + * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host * \param cq Default command queue for operations copied from another mat * \return UCL_SUCCESS if the memory allocation is successful **/ template 
inline int alloc(const size_t cols, mat_type &cq, - const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { + const enum UCL_MEMOPT kind=UCL_READ_WRITE, + const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) { clear(); _row_bytes=cols*sizeof(numtyp); - int err=_host_alloc(*this,cq,_row_bytes,kind); + int err=_host_alloc(*this,cq,_row_bytes,kind,kind2); if (err!=UCL_SUCCESS) { #ifndef UCL_NO_EXIT @@ -86,17 +88,19 @@ class UCL_H_Vec : public UCL_BaseMat { /// Set up host vector with 'cols' columns and reserve memory /** The kind parameter controls memory pinning as follows: - * - UCL_NOT_PINNED - Memory is not pinned - * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) - * - UCL_RW_OPTIMIZED - Memory can be pinned + * - UCL_READ_WRITE - Specify that you will read and write from host + * - UCL_WRITE_ONLY - Specify that you will only write from host + * - UCL_READ_ONLY - Specify that you will only read from host + * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host * \param device Used to get the default command queue for operations * \return UCL_SUCCESS if the memory allocation is successful **/ inline int alloc(const size_t cols, UCL_Device &device, - const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { + const enum UCL_MEMOPT kind=UCL_READ_WRITE, + const enum UCL_MEMOPT kind2=UCL_NOT_SPECIFIED) { clear(); _row_bytes=cols*sizeof(numtyp); - int err=_host_alloc(*this,device,_row_bytes,kind); + int err=_host_alloc(*this,device,_row_bytes,kind,kind2); if (err!=UCL_SUCCESS) { #ifndef UCL_NO_EXIT @@ -115,15 +119,11 @@ class UCL_H_Vec : public UCL_BaseMat { return err; } - /// Return the type of memory allocation - /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/ - inline enum UCL_MEMOPT kind() const { return _kind; } - /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols) { @@ -139,6 +139,8 @@ class UCL_H_Vec : public UCL_BaseMat { _end=_array+_cols; #ifdef _OCL_MAT _carray=input.cbegin(); + CL_SAFE_CALL(clRetainMemObject(input.cbegin())); + CL_SAFE_CALL(clRetainCommandQueue(input.cq())); #endif } @@ -146,7 +148,7 @@ class UCL_H_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported * \param stride Number of _elements_ between the start of each row **/ template @@ -157,7 +159,7 @@ class UCL_H_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view * - Viewing a device container on the host is not supported **/ @@ -180,7 +182,7 @@ class UCL_H_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template inline void view(ptr_type *input, const size_t rows, const size_t cols, @@ -197,14 +199,14 @@ class UCL_H_Vec : public UCL_BaseMat { _end=_array+_cols; #ifdef _OCL_MAT - _host_alloc(*this,dev,_row_bytes,UCL_VIEW); + _host_view(*this,dev,_row_bytes); #endif } /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported * \param stride Number of _elements_ between the start of each row **/ template @@ -215,7 +217,7 @@ class UCL_H_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template inline void view(ptr_type *input, const size_t cols, UCL_Device &dev) @@ -225,7 +227,7 @@ class UCL_H_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, @@ -241,7 +243,7 @@ class UCL_H_Vec : public UCL_BaseMat { _array=input.begin()+offset; _end=_array+_cols; #ifdef _OCL_MAT - _host_alloc(*this,input,_row_bytes,UCL_VIEW); + _host_view(*this,input,_row_bytes); #endif } @@ -249,7 +251,7 @@ class UCL_H_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported * \param stride Number of _elements_ between the start of each row **/ template @@ -261,7 +263,7 @@ class UCL_H_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view * - Viewing a device container on the host is not supported **/ @@ -273,7 +275,7 @@ class UCL_H_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. 
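// ---------------------------------------------------------------------------
// Editorial illustration (not part of the patch): wrapping an existing host
// array in a UCL_H_Vec view so it can be passed to the Geryon copy routines.
// The view never owns the memory; `buf` must outlive `h_view`.  Assumes a
// constructed UCL_Device named `gpu`; names are hypothetical.
// ---------------------------------------------------------------------------
inline void example_host_view(UCL_Device &gpu) {
  double buf[256];
  for (int i=0; i<256; i++) buf[i]=0.0;
  UCL_H_Vec<double> h_view;
  h_view.view(buf,256,gpu);             // 1 row, 256 columns, no allocation
  UCL_D_Vec<double> d_dst(256,gpu,UCL_READ_WRITE);
  ucl_copy(d_dst,h_view,256,false);     // blocking host-to-device copy
}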
* No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view * - Viewing a device container on the host is not supported **/ @@ -284,7 +286,7 @@ class UCL_H_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, @@ -294,7 +296,7 @@ class UCL_H_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported * \param stride Number of _elements_ between the start of each row **/ template @@ -305,7 +307,7 @@ class UCL_H_Vec : public UCL_BaseMat { /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template inline void view_offset(const size_t offset, ptr_type *input, @@ -314,7 +316,7 @@ class UCL_H_Vec : public UCL_BaseMat { /// Free memory and set size to 0 inline void clear() - { if (_kind!=UCL_VIEW) {_kind=UCL_VIEW; _cols=0; _host_free(*this,_kind);}} + { _host_free(*this); _kind=UCL_VIEW; _cols=0; } /// Resize the allocation to contain cols elements /** \note Cannot be used on views **/ @@ -401,7 +403,6 @@ class UCL_H_Vec : public UCL_BaseMat { #endif private: - enum UCL_MEMOPT _kind; numtyp *_array, *_end; size_t _row_bytes, _cols; diff --git a/lib/gpu/geryon/ucl_matrix.h b/lib/gpu/geryon/ucl_matrix.h index 803cd78b4c..301325b454 100644 --- a/lib/gpu/geryon/ucl_matrix.h +++ b/lib/gpu/geryon/ucl_matrix.h @@ -48,17 +48,18 @@ class UCL_Matrix { /// Construct with specied number of rows and columns /** \sa alloc() **/ UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc, - const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED, + const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); } /// Set up host matrix with specied # of rows/cols and reserve memory - /** The kind1 parameter controls memory pinning as follows: - * - UCL_NOT_PINNED - Memory is not pinned - * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) - * - UCL_RW_OPTIMIZED - Memory can be pinned - * The kind2 parameter controls memory optimizations as follows: + /** The kind1 parameter controls memory access from the host + * - UCL_READ_WRITE - Specify that you will read and write from host + * - UCL_WRITE_ONLY - Specify that you will only write from host + * - UCL_READ_ONLY - Specify that you will only read from host + * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host + * The kind2 parameter controls memory optimizations from the device: * - UCL_READ_WRITE - Specify 
that you will read and write in kernels * - UCL_WRITE_ONLY - Specify that you will only write in kernels * - UCL_READ_ONLY - Specify that you will only read in kernels @@ -69,24 +70,25 @@ class UCL_Matrix { * \return UCL_SUCCESS if the memory allocation is successful **/ template inline int alloc(const size_t rows, const size_t cols, mat_type &cq, - const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED, + const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { return _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); } /// Set up host matrix with specied # of rows/cols and reserve memory - /** The kind1 parameter controls memory pinning as follows: - * - UCL_NOT_PINNED - Memory is not pinned - * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) - * - UCL_RW_OPTIMIZED - Memory can be pinned - * The kind2 parameter controls memory optimizations as follows: + /** The kind1 parameter controls memory access from the host + * - UCL_READ_WRITE - Specify that you will read and write from host + * - UCL_WRITE_ONLY - Specify that you will only write from host + * - UCL_READ_ONLY - Specify that you will only read from host + * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host + * The kind2 parameter controls memory optimizations from the device: * - UCL_READ_WRITE - Specify that you will read and write in kernels * - UCL_WRITE_ONLY - Specify that you will only write in kernels * - UCL_READ_ONLY - Specify that you will only read in kernels * \param device Used to get the default command queue for operations * \return UCL_SUCCESS if the memory allocation is successful **/ inline int alloc(const size_t rows, const size_t cols, UCL_Device &acc, - const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED, + const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { return _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); } @@ -110,11 +112,22 @@ class UCL_Matrix { { if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols); else return UCL_SUCCESS; } - /// Set each element to zero - inline void zero() { host.zero(); device.zero(); } - - /// Set first n elements to zero - inline void zero(const int n) { host.zero(n); device.zero(n); } + /// Set each element to zero (asynchronously on device) + inline void zero() { zero(cq()); } + /// Set first n elements to zero (asynchronously on device) + inline void zero(const int n) { zero(n,cq()); } + /// Set each element to zero (asynchronously on device) + inline void zero(command_queue &cq) { + host.zero(); + if (device.kind()!=UCL_VIEW) device.zero(cq); + else if (_buffer.numel()>0) _buffer.zero(); + } + /// Set first n elements to zero (asynchronously on device) + inline void zero(const int n, command_queue &cq) { + host.zero(n); + if (device.kind()!=UCL_VIEW) device.zero(n,cq); + else if (_buffer.numel()>0) _buffer.zero(); + } /// Get the number of elements inline size_t numel() const { return host.numel(); } @@ -145,6 +158,8 @@ class UCL_Matrix { /// Return the default command queue/stream associated with this data inline command_queue & cq() { return host.cq(); } + /// Change the default command queue associated with this data + inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); } /// Block until command_queue associated with matrix is complete inline void sync() { host.sync(); } diff --git a/lib/gpu/geryon/ucl_s_obj_help.h b/lib/gpu/geryon/ucl_s_obj_help.h index 
ea772d3608..0b8c0251c1 100644 --- a/lib/gpu/geryon/ucl_s_obj_help.h +++ b/lib/gpu/geryon/ucl_s_obj_help.h @@ -32,14 +32,24 @@ template <> struct _ucl_s_obj_help<1> { const enum UCL_MEMOPT kind1, const enum UCL_MEMOPT kind2) { int e1; - e1=host.alloc(cols,acc,kind1); - if (e1!=UCL_SUCCESS) - return e1; if (acc.shared_memory()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 1S\n"; + #endif + e1=host.alloc(cols,acc,kind1,kind2); + if (e1!=UCL_SUCCESS) + return e1; device.view(host); return UCL_SUCCESS; - } else + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 1NS\n"; + #endif + e1=host.alloc(cols,acc,kind1); + if (e1!=UCL_SUCCESS) + return e1; return device.alloc(cols,acc,kind2); + } } template @@ -48,10 +58,24 @@ template <> struct _ucl_s_obj_help<1> { const enum UCL_MEMOPT kind1, const enum UCL_MEMOPT kind2) { int e1; - e1=host.alloc(cols,cq,kind1); - if (e1!=UCL_SUCCESS) - return e1; - return device.alloc(cols,cq,kind2); + if (cq.shared_mem_device()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 2S\n"; + #endif + e1=host.alloc(cols,cq,kind1,kind2); + if (e1!=UCL_SUCCESS) + return e1; + device.view(host); + return UCL_SUCCESS; + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 2NS\n"; + #endif + e1=host.alloc(cols,cq,kind1); + if (e1!=UCL_SUCCESS) + return e1; + return device.alloc(cols,cq,kind2); + } } template @@ -60,14 +84,24 @@ template <> struct _ucl_s_obj_help<1> { const enum UCL_MEMOPT kind1, const enum UCL_MEMOPT kind2) { int e1; - e1=host.alloc(rows,cols,acc,kind1); - if (e1!=UCL_SUCCESS) - return e1; if (acc.shared_memory()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 3S\n"; + #endif + e1=host.alloc(rows,cols,acc,kind1,kind2); + if (e1!=UCL_SUCCESS) + return e1; device.view(host); return UCL_SUCCESS; - } else + } else { + e1=host.alloc(rows,cols,acc,kind1); + if (e1!=UCL_SUCCESS) + return e1; + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 3NS\n"; + #endif return device.alloc(rows,cols,acc,kind2); + } } template @@ -76,10 +110,24 @@ template <> struct _ucl_s_obj_help<1> { const enum UCL_MEMOPT kind1, const enum UCL_MEMOPT kind2) { int e1; - e1=host.alloc(rows,cols,cq,kind1); - if (e1!=UCL_SUCCESS) - return e1; - return device.alloc(rows,cols,cq,kind2); + if (cq.shared_mem_device()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 4S\n"; + #endif + e1=host.alloc(rows,cols,cq,kind1,kind2); + if (e1!=UCL_SUCCESS) + return e1; + device.view(host); + return UCL_SUCCESS; + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 4NS\n"; + #endif + e1=host.alloc(rows,cols,cq,kind1); + if (e1!=UCL_SUCCESS) + return e1; + return device.alloc(rows,cols,cq,kind2); + } } template @@ -121,8 +169,15 @@ template <> struct _ucl_s_obj_help<1> { if (device.kind()==UCL_VIEW) { device.view(host); return UCL_SUCCESS; - } else + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 5S\n"; + #endif + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 5NS\n"; + #endif return device.resize(cols); + } } template @@ -130,9 +185,16 @@ template <> struct _ucl_s_obj_help<1> { const int cols) { if (device.kind()==UCL_VIEW) { device.view(host); + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 6S\n"; + #endif return UCL_SUCCESS; - } else + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 6NS\n"; + #endif return device.resize(rows,cols); + } } }; @@ -145,17 +207,27 @@ template struct _ucl_s_obj_help { const enum UCL_MEMOPT kind2) { int e1; e1=host.alloc(cols,acc,UCL_NOT_PINNED); - if (e1!=UCL_SUCCESS) - return e1; - 
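// ---------------------------------------------------------------------------
// Editorial illustration (not part of the patch): what the shared-memory
// branches above mean for the paired host/device containers.  On an
// accelerator that shares memory with the host, the device member becomes a
// view of the host (or staging) allocation instead of a separate buffer, so
// no transfer is needed.  Assumes a constructed UCL_Device named `gpu` and
// that UCL_Vector is parameterized as <host_type,device_type>; names are
// hypothetical.
// ---------------------------------------------------------------------------
inline void example_paired_vector(UCL_Device &gpu) {
  UCL_Vector<double,double> v;
  // kind1 = host-access hint, kind2 = device-access hint
  v.alloc(256,gpu,UCL_READ_WRITE,UCL_READ_ONLY);
  // On a shared-memory device the branch above leaves v.device as a view of
  // the host allocation (v.device.kind()==UCL_VIEW).
  v.zero();        // zeros host; device side only if it is a separate buffer
  v.cq(gpu.cq());  // reset the default command queue for both members
}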
e1=_buffer.alloc(cols,acc,kind1); if (e1!=UCL_SUCCESS) return e1; if (acc.shared_memory()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 7S\n"; + #endif + e1=_buffer.alloc(cols,acc,kind1,kind2); + if (e1!=UCL_SUCCESS) + return e1; device.view(_buffer); return UCL_SUCCESS; - } else + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 7NS\n"; + #endif + e1=_buffer.alloc(cols,acc,kind1); + if (e1!=UCL_SUCCESS) + return e1; return device.alloc(cols,acc,kind2); + } } template @@ -167,10 +239,24 @@ template struct _ucl_s_obj_help { e1=host.alloc(cols,cq,UCL_NOT_PINNED); if (e1!=UCL_SUCCESS) return e1; - e1=_buffer.alloc(cols,cq,kind1); - if (e1!=UCL_SUCCESS) - return e1; - return device.alloc(cols,cq,kind2); + if (cq.shared_mem_device()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 8S\n"; + #endif + e1=_buffer.alloc(cols,cq,kind1,kind2); + if (e1!=UCL_SUCCESS) + return e1; + device.view(_buffer); + return UCL_SUCCESS; + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 8NS\n"; + #endif + e1=_buffer.alloc(cols,cq,kind1); + if (e1!=UCL_SUCCESS) + return e1; + return device.alloc(cols,cq,kind2); + } } template @@ -180,17 +266,27 @@ template struct _ucl_s_obj_help { const enum UCL_MEMOPT kind2) { int e1; e1=host.alloc(rows,cols,acc,UCL_NOT_PINNED); - if (e1!=UCL_SUCCESS) - return e1; - e1=_buffer.alloc(rows,cols,acc,kind1); if (e1!=UCL_SUCCESS) return e1; if (acc.shared_memory()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 9S\n"; + #endif + e1=_buffer.alloc(rows,cols,acc,kind1,kind2); + if (e1!=UCL_SUCCESS) + return e1; device.view(_buffer); return UCL_SUCCESS; - } else + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 9NS\n"; + #endif + e1=_buffer.alloc(rows,cols,acc,kind1); + if (e1!=UCL_SUCCESS) + return e1; return device.alloc(rows,cols,acc,kind2); + } } template @@ -202,10 +298,24 @@ template struct _ucl_s_obj_help { e1=host.alloc(rows,cols,cq,UCL_NOT_PINNED); if (e1!=UCL_SUCCESS) return e1; - e1=_buffer.alloc(rows,cols,cq,kind1); - if (e1!=UCL_SUCCESS) - return e1; - return device.alloc(rows,cols,cq,kind2); + if (cq.shared_mem_device()) { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 10S\n"; + #endif + e1=_buffer.alloc(rows,cols,cq,kind1,kind2); + if (e1!=UCL_SUCCESS) + return e1; + device.view(_buffer); + return UCL_SUCCESS; + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 10NS\n"; + #endif + e1=_buffer.alloc(rows,cols,cq,kind1); + if (e1!=UCL_SUCCESS) + return e1; + return device.alloc(rows,cols,cq,kind2); + } } template @@ -250,9 +360,16 @@ template struct _ucl_s_obj_help { if (device.kind()==UCL_VIEW) { device.view(buff); + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 11S\n"; + #endif return UCL_SUCCESS; - } else + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 11NS\n"; + #endif return device.resize(cols); + } } template @@ -264,9 +381,17 @@ template struct _ucl_s_obj_help { if (device.kind()==UCL_VIEW) { device.view(buff); + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 12S\n"; + #endif return UCL_SUCCESS; - } else + } else { + #ifdef UCL_DBG_MEM_TRACE + std::cerr << "UCL_ALLOC 12NS\n"; + #endif return device.resize(rows,cols); + } } }; + diff --git a/lib/gpu/geryon/ucl_types.h b/lib/gpu/geryon/ucl_types.h index 615bffea95..46be4bcb06 100644 --- a/lib/gpu/geryon/ucl_types.h +++ b/lib/gpu/geryon/ucl_types.h @@ -92,10 +92,9 @@ enum UCL_MEMOPT { UCL_WRITE_ONLY, ///< Allow any optimizations for memory that is write only UCL_READ_ONLY, ///< Allow any optimizations for memory that 
is read only UCL_READ_WRITE, ///< Allow read and write - UCL_WRITE_OPTIMIZED,///< Allow host memory to be pinned (write combined) - UCL_RW_OPTIMIZED, ///< Allow host memory to be pinned UCL_NOT_PINNED, ///< Host memory is not to be pinned - UCL_VIEW ///< View of another memory allocation + UCL_VIEW, ///< View of another memory allocation + UCL_NOT_SPECIFIED }; enum UCL_DEVICE_TYPE { diff --git a/lib/gpu/geryon/ucl_vector.h b/lib/gpu/geryon/ucl_vector.h index 01346127c2..89f1528969 100644 --- a/lib/gpu/geryon/ucl_vector.h +++ b/lib/gpu/geryon/ucl_vector.h @@ -48,17 +48,18 @@ class UCL_Vector { /// Construct with n columns /** \sa alloc() **/ UCL_Vector(const size_t cols, UCL_Device &acc, - const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED, + const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,cols,acc,kind1,kind2); } /// Set up the vector with 'cols' columns and reserve memory - /** The kind1 parameter controls memory pinning as follows: - * - UCL_NOT_PINNED - Memory is not pinned - * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) - * - UCL_RW_OPTIMIZED - Memory can be pinned - * The kind2 parameter controls memory optimizations as follows: + /** The kind1 parameter controls memory access from the host + * - UCL_READ_WRITE - Specify that you will read and write from host + * - UCL_WRITE_ONLY - Specify that you will only write from host + * - UCL_READ_ONLY - Specify that you will only read from host + * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host + * The kind2 parameter controls memory optimizations from the device: * - UCL_READ_WRITE - Specify that you will read and write in kernels * - UCL_WRITE_ONLY - Specify that you will only write in kernels * - UCL_READ_ONLY - Specify that you will only read in kernels @@ -69,24 +70,25 @@ class UCL_Vector { * \return UCL_SUCCESS if the memory allocation is successful **/ template inline int alloc(const size_t cols, mat_type &cq, - const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED, + const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { return _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,cols,cq,kind1,kind2); } /// Set up host vector with 'cols' columns and reserve memory - /** The kind1 parameter controls memory pinning as follows: - * - UCL_NOT_PINNED - Memory is not pinned - * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined) - * - UCL_RW_OPTIMIZED - Memory can be pinned - * The kind2 parameter controls memory optimizations as follows: + /** The kind1 parameter controls memory access from the host + * - UCL_READ_WRITE - Specify that you will read and write from host + * - UCL_WRITE_ONLY - Specify that you will only write from host + * - UCL_READ_ONLY - Specify that you will only read from host + * - UCL_NOT_PINNED - Memory is not pinned/page-locked on host + * The kind2 parameter controls memory optimizations from the device: * - UCL_READ_WRITE - Specify that you will read and write in kernels * - UCL_WRITE_ONLY - Specify that you will only write in kernels * - UCL_READ_ONLY - Specify that you will only read in kernels * \param device Used to get the default command queue for operations * \return UCL_SUCCESS if the memory allocation is successful **/ inline int alloc(const size_t cols, UCL_Device &acc, - const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED, + const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { return 
_ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,cols,acc,kind1,kind2); } @@ -109,11 +111,22 @@ class UCL_Vector { inline int resize_ib(const int new_cols) { if (new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; } - /// Set each element to zero - inline void zero() { host.zero(); device.zero(); } - - /// Set first n elements to zero - inline void zero(const int n) { host.zero(n); device.zero(n); } + /// Set each element to zero (asynchronously on device) + inline void zero() { zero(cq()); } + /// Set first n elements to zero (asynchronously on device) + inline void zero(const int n) { zero(n,cq()); } + /// Set each element to zero (asynchronously on device) + inline void zero(command_queue &cq) { + host.zero(); + if (device.kind()!=UCL_VIEW) device.zero(cq); + else if (_buffer.numel()>0) _buffer.zero(); + } + /// Set first n elements to zero (asynchronously on device) + inline void zero(const int n, command_queue &cq) { + host.zero(n); + if (device.kind()!=UCL_VIEW) device.zero(n,cq); + else if (_buffer.numel()>0) _buffer.zero(); + } /// Get the number of elements inline size_t numel() const { return host.numel(); } @@ -145,6 +158,8 @@ class UCL_Vector { /// Return the default command queue/stream associated with this data inline command_queue & cq() { return host.cq(); } + /// Change the default command queue associated with this data + inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); } /// Block until command_queue associated with matrix is complete inline void sync() { host.sync(); } diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp index 6f42790ca3..7dbd875cd1 100644 --- a/lib/gpu/lal_answer.cpp +++ b/lib/gpu/lal_answer.cpp @@ -44,10 +44,10 @@ bool AnswerT::alloc(const int inum) { _ans_fields+=4; // --------------------------- Device allocations - success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_RW_OPTIMIZED, - UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_RW_OPTIMIZED, - UCL_WRITE_ONLY)==UCL_SUCCESS); + success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY, + UCL_READ_WRITE)==UCL_SUCCESS); + success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_READ_ONLY, + UCL_READ_WRITE)==UCL_SUCCESS); _gpu_bytes=engv.device.row_bytes()+force.device.row_bytes(); _allocated=true; @@ -175,78 +175,42 @@ double AnswerT::energy_virial(double *eatom, double **vatom, return 0.0; double evdwl=0.0; - double virial_acc[6]; - for (int i=0; i<6; i++) virial_acc[i]=0.0; - if (_ilist==NULL) { - for (int i=0; i<_inum; i++) { - int al=i; - if (_eflag) { - if (_ef_atom) { - evdwl+=engv[al]; - eatom[i]+=engv[al]*0.5; - al+=_inum; - } else { - evdwl+=engv[al]; - al+=_inum; - } - } - if (_vflag) { - if (_vf_atom) { - for (int j=0; j<6; j++) { - vatom[i][j]+=engv[al]*0.5; - virial_acc[j]+=engv[al]; - al+=_inum; - } - } else { - for (int j=0; j<6; j++) { - virial_acc[j]+=engv[al]; - al+=_inum; - } - } - } + int vstart=0; + if (_eflag) { + for (int i=0; i<_inum; i++) + evdwl+=engv[i]; + if (_ef_atom) + if (_ilist==NULL) + for (int i=0; i<_inum; i++) + eatom[i]+=engv[i]; + else + for (int i=0; i<_inum; i++) + eatom[_ilist[i]]+=engv[i]; + vstart=_inum; + } + if (_vflag) { + int iend=vstart+_inum; + for (int j=0; j<6; j++) { + for (int i=vstart; i double AnswerT::energy_virial(double *eatom, double **vatom, - double *virial, double &ecoul) { + double *virial, double &ecoul) { if (_eflag==false && _vflag==false) return 0.0; @@ -254,84 +218,43 @@ double 
AnswerT::energy_virial(double *eatom, double **vatom, return energy_virial(eatom,vatom,virial); double evdwl=0.0; - double _ecoul=0.0; - double virial_acc[6]; - for (int i=0; i<6; i++) virial_acc[i]=0.0; - if (_ilist==NULL) { - for (int i=0; i<_inum; i++) { - int al=i; - if (_eflag) { - if (_ef_atom) { - evdwl+=engv[al]; - eatom[i]+=engv[al]*0.5; - al+=_inum; - _ecoul+=engv[al]; - eatom[i]+=engv[al]*0.5; - al+=_inum; - } else { - evdwl+=engv[al]; - al+=_inum; - _ecoul+=engv[al]; - al+=_inum; - } - } - if (_vflag) { - if (_vf_atom) { - for (int j=0; j<6; j++) { - vatom[i][j]+=engv[al]*0.5; - virial_acc[j]+=engv[al]; - al+=_inum; - } - } else { - for (int j=0; j<6; j++) { - virial_acc[j]+=engv[al]; - al+=_inum; - } - } + int vstart=0, iend=_inum*2; + if (_eflag) { + for (int i=0; i<_inum; i++) + evdwl+=engv[i]; + for (int i=_inum; i +void AnswerT::cq(const int cq_index) { + engv.cq(dev->cq(cq_index)); + force.cq(dev->cq(cq_index)); + time_answer.clear(); + time_answer.init(*dev,dev->cq(cq_index)); + time_answer.zero(); +} + template class Answer; + diff --git a/lib/gpu/lal_answer.h b/lib/gpu/lal_answer.h index c642781c07..149e8e9705 100644 --- a/lib/gpu/lal_answer.h +++ b/lib/gpu/lal_answer.h @@ -47,6 +47,8 @@ class Answer { inline int inum() const { return _inum; } /// Set number of local atoms for future copy operations inline void inum(const int n) { _inum=n; } + /// Return the maximum number of atoms that can be stored currently + inline int max_inum() const { return _max_local; } /// Memory usage per atom in this class int bytes_per_atom() const; @@ -132,6 +134,9 @@ class Answer { /// Return the time the CPU was idle waiting for GPU inline double cpu_idle_time() { return _time_cpu_idle; } + /// Change the command queue used for copies and timers + void cq(const int cq_index); + // ------------------------------ DATA ---------------------------------- /// Force and possibly torque diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index 5cf46c8751..92cda1239e 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -70,44 +70,47 @@ bool AtomT::alloc(const int nall) { // --------------------------- Device allocations int gpu_bytes=0; - success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED, + success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY, UCL_READ_ONLY)==UCL_SUCCESS); #ifdef GPU_CAST - success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)== - UCL_SUCCESS); - success=success && (type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)== - UCL_SUCCESS); + success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); + success=success && (type_cast.alloc(_max_atoms,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=x_cast.device.row_bytes()+type_cast.device.row_bytes(); #endif if (_charge && _host_view==false) { - success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED, + success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_ONLY, UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=q.device.row_bytes(); } if (_rot && _host_view==false) { - success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED, + success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY, UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=quat.device.row_bytes(); } if (_gpu_nbor>0) { if (_bonds) { - success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_tag.alloc(_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=dev_tag.row_bytes(); } if (_gpu_nbor==1) { success=success && 
(dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); gpu_bytes+=dev_cell_id.row_bytes(); } else { - success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && (host_particle_id.alloc(_max_atoms,*dev, + UCL_WRITE_ONLY)==UCL_SUCCESS); success=success && (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); } if (_gpu_nbor==2 && _host_view) dev_particle_id.view(host_particle_id); else - success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_particle_id.alloc(_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=dev_particle_id.row_bytes(); } @@ -130,7 +133,7 @@ bool AtomT::add_fields(const bool charge, const bool rot, _charge=true; _other=true; if (_host_view==false) { - success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED, + success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_ONLY, UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=q.device.row_bytes(); } @@ -140,7 +143,7 @@ bool AtomT::add_fields(const bool charge, const bool rot, _rot=true; _other=true; if (_host_view==false) { - success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED, + success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY, UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=quat.device.row_bytes(); } @@ -149,7 +152,8 @@ bool AtomT::add_fields(const bool charge, const bool rot, if (bonds && _bonds==false) { _bonds=true; if (_bonds && _gpu_nbor>0) { - success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_tag.alloc(_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=dev_tag.row_bytes(); } } @@ -163,17 +167,20 @@ bool AtomT::add_fields(const bool charge, const bool rot, return false; } #endif - success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_particle_id.alloc(_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=dev_particle_id.row_bytes(); if (_bonds) { - success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && (dev_tag.alloc(_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=dev_tag.row_bytes(); } if (_gpu_nbor==1) { success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); gpu_bytes+=dev_cell_id.row_bytes(); } else { - success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); + success=success && (host_particle_id.alloc(_max_atoms,*dev, + UCL_WRITE_ONLY)==UCL_SUCCESS); success=success && (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); } diff --git a/lib/gpu/lal_aux_fun1.h b/lib/gpu/lal_aux_fun1.h index 92a03eb7d4..2c57e1f6a5 100644 --- a/lib/gpu/lal_aux_fun1.h +++ b/lib/gpu/lal_aux_fun1.h @@ -77,12 +77,12 @@ if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv=energy; \ + *engv=energy*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -125,14 +125,14 @@ if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv=energy; \ + *engv=energy*(acctyp)0.5; \ engv+=inum; \ - *engv=e_coul; \ + *engv=e_coul*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -160,12 +160,12 @@ if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv=energy; \ + *engv=energy*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -192,14 +192,14 
@@ if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv=energy; \ + *engv=energy*(acctyp)0.5; \ engv+=inum; \ - *engv=e_coul; \ + *engv=e_coul*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp index 6597dbfe98..b24f7e9661 100644 --- a/lib/gpu/lal_base_atomic.cpp +++ b/lib/gpu/lal_base_atomic.cpp @@ -272,12 +272,8 @@ void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str, return; std::string s_fast=std::string(kname)+"_fast"; - std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ - std::string(OCL_PRECISION_COMPILE)+" -D"+ - std::string(OCL_VENDOR); - pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,flags.c_str()); + pair_program->load_string(pair_str,device->compile_string().c_str()); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index f61950cfee..8e06a4d18c 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -288,12 +288,8 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str, return; std::string s_fast=std::string(kname)+"_fast"; - std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ - std::string(OCL_PRECISION_COMPILE)+" -D"+ - std::string(OCL_VENDOR); - pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,flags.c_str()); + pair_program->load_string(pair_str,device->compile_string().c_str()); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 8c793f554e..ed3d720d2c 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -296,12 +296,8 @@ void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str, return; std::string s_fast=std::string(kname)+"_fast"; - std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ - std::string(OCL_PRECISION_COMPILE)+" -D"+ - std::string(OCL_VENDOR); - pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,flags.c_str()); + pair_program->load_string(pair_str,device->compile_string().c_str()); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index fca3151b82..641087a6c4 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -455,9 +455,7 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev, std::string s_lj=kns+"_lj"; std::string s_lj_fast=kns+"_lj_fast"; - std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ - std::string(OCL_PRECISION_COMPILE)+" -D"+ - std::string(OCL_VENDOR); + std::string flags=device->compile_string(); nbor_program=new UCL_Program(dev); nbor_program->load_string(ellipsoid_nbor,flags.c_str()); diff --git a/lib/gpu/lal_born.cpp b/lib/gpu/lal_born.cpp index 1665717c2d..719b1845d8 100644 --- a/lib/gpu/lal_born.cpp +++ b/lib/gpu/lal_born.cpp @@ -69,7 +69,7 @@ int BornT::init(const int ntypes, double **host_cutsq, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + 
UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(cmm_types*cmm_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(h_size*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i dview_form(lj_types*lj_types,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; iucl_device),UCL_READ_ONLY); diff --git a/lib/gpu/lal_colloid.cu b/lib/gpu/lal_colloid.cu index e918feca8b..a8f0cf63ca 100644 --- a/lib/gpu/lal_colloid.cu +++ b/lib/gpu/lal_colloid.cu @@ -56,7 +56,8 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, if (ii host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i q_tex; if (eflag>0) { \ *ap1=(acctyp)0; \ ap1+=inum; \ - *ap1=e_coul; \ + *ap1=e_coul*(acctyp)0.5; \ ap1+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *ap1=virial[i]; \ + *ap1=virial[i]*(acctyp)0.5; \ ap1+=inum; \ } \ } \ @@ -109,12 +109,12 @@ texture q_tex; if (eflag>0) { \ *ap1=(acctyp)0; \ ap1+=inum; \ - *ap1=e_coul; \ + *ap1=e_coul*(acctyp)0.5; \ ap1+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *ap1=virial[i]; \ + *ap1=virial[i]*(acctyp)0.5; \ ap1+=inum; \ } \ } \ @@ -155,7 +155,8 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_, if (ii int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, const int last_gpu, const int gpu_mode, const double p_split, const int nthreads, - const int t_per_atom, const double cell_size) { + const int t_per_atom, const double cell_size, + char *ocl_vendor) { _nthreads=nthreads; #ifdef _OPENMP omp_set_num_threads(nthreads); @@ -140,6 +141,9 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, _long_range_precompute=0; + if (set_ocl_params(ocl_vendor)!=0) + return -11; + int flag=0; for (int i=0; i<_procs_per_gpu; i++) { if (_gpu_rank==i) @@ -149,6 +153,64 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, return flag; } +template +int DeviceT::set_ocl_params(char *ocl_vendor) { + #ifdef USE_OPENCL + std::string s_vendor=OCL_DEFAULT_VENDOR; + if (ocl_vendor!=NULL) + s_vendor=ocl_vendor; + if (s_vendor=="none") + s_vendor="generic"; + + if (s_vendor=="kepler") { + _ocl_vendor_name="NVIDIA Kepler"; + #if defined (__APPLE__) || defined(MACOSX) + _ocl_vendor_string="-DKEPLER_OCL -DNO_OCL_PTX"; + #else + _ocl_vendor_string="-DKEPLER_OCL"; + #endif + } else if (s_vendor=="fermi") { + _ocl_vendor_name="NVIDIA Fermi"; + _ocl_vendor_string="-DFERMI_OCL"; + } else if (s_vendor=="cypress") { + _ocl_vendor_name="AMD Cypress"; + 
_ocl_vendor_string="-DCYPRESS_OCL"; + } else if (s_vendor=="generic") { + _ocl_vendor_name="GENERIC"; + _ocl_vendor_string="-DGENERIC_OCL"; + } else { + _ocl_vendor_name="CUSTOM"; + _ocl_vendor_string="-DUSE_OPENCL"; + int token_count=0; + std::string params[13]; + char *pch = strtok(ocl_vendor,"\" "); + while (pch != NULL) { + if (token_count==13) + return -11; + params[token_count]=pch; + token_count++; + pch = strtok(NULL,"\" "); + } + _ocl_vendor_string+=" -DMEM_THREADS="+params[0]+ + " -DTHREADS_PER_ATOM="+params[1]+ + " -DTHREADS_PER_CHARGE="+params[2]+ + " -DBLOCK_PAIR="+params[3]+ + " -DMAX_SHARED_TYPES="+params[4]+ + " -DBLOCK_NBOR_BUILD="+params[5]+ + " -DBLOCK_BIO_PAIR="+params[6]+ + " -DBLOCK_ELLIPSE="+params[7]+ + " -DWARP_SIZE="+params[8]+ + " -DPPPM_BLOCK_1D="+params[9]+ + " -DBLOCK_CELL_2D="+params[10]+ + " -DBLOCK_CELL_ID="+params[11]+ + " -DMAX_BIO_SHARED_TYPES="+params[12]; + } + _ocl_compile_string="-cl-fast-relaxed-math -cl-mad-enable "+ + std::string(OCL_PRECISION_COMPILE)+" "+_ocl_vendor_string; + #endif + return 0; +} + template int DeviceT::init(Answer &ans, const bool charge, const bool rot, const int nlocal, @@ -206,7 +268,7 @@ int DeviceT::init(Answer &ans, const bool charge, if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, _block_cell_id, _block_nbor_build, threads_per_atom, - _warp_size, _time_device)) + _warp_size, _time_device, compile_string())) return -3; if (_cell_size<0.0) nbor->cell_size(cell_size,cell_size); @@ -274,7 +336,8 @@ void DeviceT::init_message(FILE *screen, const char *name, fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads); #endif #ifdef USE_OPENCL - fprintf(screen,"- with OpenCL Parameters for: %s\n",OCL_VENDOR); + fprintf(screen,"- with OpenCL Parameters for: %s\n", + _ocl_vendor_name.c_str()); #endif fprintf(screen,"-------------------------------------"); fprintf(screen,"-------------------------------------\n"); @@ -571,9 +634,8 @@ int DeviceT::compile_kernels() { if (_compiled) return flag; - std::string flags="-cl-mad-enable -D"+std::string(OCL_VENDOR); dev_program=new UCL_Program(*gpu); - int success=dev_program->load_string(device,flags.c_str()); + int success=dev_program->load_string(device,compile_string().c_str()); if (success!=UCL_SUCCESS) return -4; k_zero.set_function(*dev_program,"kernel_zero"); @@ -640,10 +702,11 @@ Device global_device; int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads, - const int t_per_atom, const double cell_size) { + const int t_per_atom, const double cell_size, + char *opencl_vendor) { return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, particle_split,nthreads,t_per_atom, - cell_size); + cell_size,opencl_vendor); } void lmp_clear_device() { @@ -654,3 +717,4 @@ double lmp_gpu_forces(double **f, double **tor, double *eatom, double **vatom, double *virial, double &ecoul) { return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul); } + diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index 62b0d862bd..d7bc70edaf 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -45,11 +45,13 @@ class Device { * - -2 if GPU not found * - -4 if GPU library not compiled for GPU * - -6 if GPU could not be initialized for use - * - -7 if accelerator sharing is not currently allowed on system **/ + * - -7 if accelerator sharing is not currently allowed on system + * - -11 if 
vendor_string has the wrong number of parameters **/ int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, - const double particle_split, const int nthreads, - const int t_per_atom, const double cell_size); + const int last_gpu, const int gpu_mode, + const double particle_split, const int nthreads, + const int t_per_atom, const double cell_size, + char *vendor_string); /// Initialize the device for Atom and Neighbor storage /** \param rot True if quaternions need to be stored @@ -234,6 +236,8 @@ class Device { inline int max_bio_shared_types() const { return _max_bio_shared_types; } /// Architecture gpu code compiled for (returns 0 for OpenCL) inline double ptx_arch() const { return _ptx_arch; } + /// Number of threads executing concurrently on same multiproc + inline int warp_size() const { return _warp_size; } // -------------------- SHARED DEVICE ROUTINES -------------------- // Perform asynchronous zero of integer array @@ -278,6 +282,8 @@ class Device { pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge, boxlo,prd); } + + inline std::string compile_string() { return _ocl_compile_string; } private: std::queue *> ans_queue; @@ -305,6 +311,9 @@ class Device { int _data_in_estimate, _data_out_estimate; + std::string _ocl_vendor_name, _ocl_vendor_string, _ocl_compile_string; + int set_ocl_params(char *); + template inline std::string toa(const t& in) { std::ostringstream o; diff --git a/lib/gpu/lal_dipole_lj.cpp b/lib/gpu/lal_dipole_lj.cpp index 41ec84648d..e96e15eaf9 100644 --- a/lib/gpu/lal_dipole_lj.cpp +++ b/lib/gpu/lal_dipole_lj.cpp @@ -72,7 +72,7 @@ int DipoleLJT::init(const int ntypes, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i mu_tex; if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv=energy; \ + *engv=energy*(acctyp)0.5; \ engv+=inum; \ - *engv=e_coul; \ + *engv=e_coul*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -115,14 +115,14 @@ texture mu_tex; if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv=energy; \ + *engv=energy*(acctyp)0.5; \ engv+=inum; \ - *engv=e_coul; \ + *engv=e_coul*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -174,7 +174,8 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, if (ii host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i mu_tex; if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv=energy; \ + *engv=energy*(acctyp)0.5; \ engv+=inum; \ - *engv=e_coul; \ + *engv=e_coul*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -116,14 +116,14 @@ texture mu_tex; if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv=energy; \ + *engv=energy*(acctyp)0.5; \ engv+=inum; \ - *engv=e_coul; \ + *engv=e_coul*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -175,7 +175,8 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, if (ii(static_cast(ef_nall)*1.10); - 
_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_RW_OPTIMIZED,UCL_WRITE_ONLY); + _fp.alloc(_max_fp_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); k_energy.set_function(*(this->pair_program),"k_energy"); k_energy_fast.set_function(*(this->pair_program),"k_energy_fast"); @@ -106,7 +106,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, _nr=nr; UCL_H_Vec dview_type(lj_types*lj_types,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i dview_type2frho(lj_types,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); type2frho.alloc(lj_types,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i dview_frho_spline(nfrho*(nrho+1),*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int ix=0; ix dview_rhor_spline(nrhor*(nr+1),*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int ix=0; ix dview_z2r_spline(nz2r*(nr+1),*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int ix=0; ix z2r_sp2_tex; fetch4(coeff,index,frho_sp2_tex); \ energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \ if (rho > rhomax) energy += fp*(rho-rhomax); \ - engv[ii]=(acctyp)2.0*energy; \ + engv[ii]=energy; \ } \ } @@ -116,12 +116,12 @@ texture z2r_sp2_tex; } \ if (offset==0) { \ if (eflag>0) { \ - engv[ii]+=energy; \ + engv[ii]+=energy*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - engv[ii]=virial[i]; \ + engv[ii]=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -150,7 +150,7 @@ texture z2r_sp2_tex; fetch4(coeff,index,frho_sp2_tex); \ energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \ if (rho > rhomax) energy += fp*(rho-rhomax); \ - engv[ii]=(acctyp)2.0*energy; \ + engv[ii]=energy; \ } \ } @@ -173,12 +173,12 @@ texture z2r_sp2_tex; if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv+=energy; \ + *engv+=energy*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ + *engv=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -210,7 +210,8 @@ __kernel void k_energy(const __global numtyp4 *restrict x_, if (ii pos_tex, quat_tex; if (offset==0) { \ __global acctyp *ap1=engv+ii; \ if (eflag>0) { \ - *ap1=energy; \ + *ap1=energy*(acctyp)0.5; \ ap1+=astride; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *ap1=virial[i]; \ + *ap1=virial[i]*(acctyp)0.5; \ ap1+=astride; \ } \ } \ @@ -130,12 +130,12 @@ texture pos_tex, quat_tex; if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv+=energy; \ + *engv+=energy*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv+=virial[i]; \ + *engv+=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ @@ -170,12 +170,12 @@ texture pos_tex, quat_tex; if (offset==0) { \ __global acctyp *ap1=engv+ii; \ if (eflag>0) { \ - *ap1=energy; \ + *ap1=energy*(acctyp)0.5; \ ap1+=astride; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *ap1=virial[i]; \ + *ap1=virial[i]*(acctyp)0.5; \ ap1+=astride; \ } \ } \ @@ -202,12 +202,12 @@ texture pos_tex, quat_tex; if (offset==0) { \ engv+=ii; \ if (eflag>0) { \ - *engv+=energy; \ + *engv+=energy*(acctyp)0.5; \ engv+=inum; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *engv+=virial[i]; \ + *engv+=virial[i]*(acctyp)0.5; \ engv+=inum; \ } \ } \ diff --git a/lib/gpu/lal_gauss.cpp b/lib/gpu/lal_gauss.cpp index d05573f751..9e296e6872 100644 --- a/lib/gpu/lal_gauss.cpp +++ b/lib/gpu/lal_gauss.cpp @@ -68,7 +68,7 @@ int GaussT::init(const int ntypes, // Allocate a host write buffer for data initialization UCL_H_Vec 
host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; iatom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, host_offset); - dev_error.alloc(1,*(this->ucl_device)); + dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY); dev_error.zero(); // Allocate, cast and asynchronous memcpy of constant data @@ -258,6 +258,9 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { &ainum, &this->_threads_per_atom); this->time_ellipsoid2.stop(); } else { + GX=static_cast(ceil(static_cast(this->ans->inum()- + this->_last_ellipse)/ + (BX/this->_threads_per_atom))); this->ans->force.zero(); this->ans->engv.zero(); this->time_nbor1.stop(); diff --git a/lib/gpu/lal_gayberne.cu b/lib/gpu/lal_gayberne.cu index da236fa79a..74072fc673 100644 --- a/lib/gpu/lal_gayberne.cu +++ b/lib/gpu/lal_gayberne.cu @@ -120,7 +120,8 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, if (ii host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(types*types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; icompile_kernels(devi,gpu_nbor); + _shared->compile_kernels(devi,gpu_nbor,compile_flags); return success; } @@ -114,7 +115,7 @@ void Neighbor::alloc(bool &success) { success=success && (dev_nbor.alloc(3*_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); success=success && (host_acc.alloc(nt*2,*dev, - UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); + UCL_READ_WRITE)==UCL_SUCCESS); _c_bytes=dev_nbor.row_bytes(); if (_alloc_packed) { @@ -129,10 +130,10 @@ void Neighbor::alloc(bool &success) { host_ilist.clear(); host_jlist.clear(); - success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_RW_OPTIMIZED, - UCL_WRITE_ONLY)==UCL_SUCCESS) && success; + success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_READ_WRITE, + UCL_READ_WRITE)==UCL_SUCCESS) && success; success=success && (dev_numj_host.alloc(_max_host,*dev, - UCL_WRITE_ONLY)==UCL_SUCCESS); + UCL_READ_WRITE)==UCL_SUCCESS); success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); if (!success) return; @@ -161,7 +162,7 @@ void Neighbor::alloc(bool &success) { success=success && (dev_nspecial.alloc(3*at,*dev, UCL_READ_ONLY)==UCL_SUCCESS); success=success && (dev_special.alloc(_maxspecial*at,*dev, - UCL_READ_ONLY)==UCL_SUCCESS); + UCL_READ_WRITE)==UCL_SUCCESS); success=success && (dev_special_t.alloc(_maxspecial*at,*dev, UCL_READ_ONLY)==UCL_SUCCESS); _gpu_bytes+=dev_nspecial.row_bytes()+dev_special.row_bytes()+ @@ -178,11 +179,9 @@ void Neighbor::clear() { _bin_time=0.0; if (_ncells>0) { _ncells=0; - 
dev_cell_counts.clear(); - if (_gpu_nbor==2) { - host_cell_counts.clear(); + cell_counts.clear(); + if (_gpu_nbor==2) delete [] cell_iter; - } } if (_allocated) { _allocated=false; @@ -286,6 +285,80 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj, } } +// This is the same as get host, but the requirement that ilist[i]=i and +// inum=nlocal is forced to be true to allow direct indexing of neighbors of +// neighbors +void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj, + int **firstneigh, const int block_size) { + _nbor_time_avail=true; + time_nbor.start(); + + UCL_H_Vec ilist_view; + ilist_view.view(ilist,inum,*dev); + ucl_copy(dev_nbor,ilist_view,false); + + UCL_D_Vec nbor_offset; + UCL_H_Vec host_offset; + + int copy_count=0; + int ij_count=0; + int acc_count=0; + int dev_count=0; + int *h_ptr=host_packed.begin(); + _nbor_pitch=inum; + + if (nlist!=inum) + host_acc.zero(inum); + + for (int ii=0; ii acc_view; + acc_view.view_offset(inum,dev_nbor,inum*2); + ucl_copy(acc_view,host_acc,true); + time_nbor.stop(); + + if (_use_packing==false) { + time_kernel.start(); + int GX=static_cast(ceil(static_cast(inum)*_threads_per_atom/ + block_size)); + _shared->k_nbor.set_size(GX,block_size); + _shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom); + time_kernel.stop(); + } +} + template void Neighbor::resize_max_neighbors(const int maxn, bool &success) { if (maxn>_max_nbors) { @@ -330,24 +403,20 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, ncellz = static_cast(ceil((subhi[2]-sublo[2])/_cell_size))+ghost_cells; ncell_3d = ncellx * ncelly * ncellz; if (ncell_3d+1>_ncells) { + cell_counts.clear(); + if (_gpu_nbor==2) { - if (_ncells>0) { - host_cell_counts.clear(); + if (_ncells>0) delete [] cell_iter; - } cell_iter = new int[ncell_3d+1]; - host_cell_counts.alloc(ncell_3d+1,dev_nbor); - } - - if (_gpu_nbor==2 && atom.host_view()) - dev_cell_counts.view(host_cell_counts); - else { - dev_cell_counts.clear(); - dev_cell_counts.alloc(ncell_3d+1,dev_nbor); + cell_counts.alloc(ncell_3d+1,dev_nbor,UCL_READ_WRITE,UCL_READ_ONLY); + } else { + cell_counts.device.clear(); + cell_counts.device.alloc(ncell_3d+1,dev_nbor); } _ncells=ncell_3d+1; - _cell_bytes=dev_cell_counts.row_bytes(); + _cell_bytes=cell_counts.device.row_bytes(); } const numtyp cutoff_cast=static_cast(_cutoff); @@ -381,7 +450,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, int *particle_id=atom.host_particle_id.begin(); // Build cell list on CPU - host_cell_counts.zero(); + cell_counts.host.zero(); double i_cell_size=1.0/_cell_size; int offset_hi=_cells_in_cutoff+1; @@ -403,7 +472,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, int id = ix+iy*ncellx+iz*ncellx*ncelly; cell_id[i] = id; - host_cell_counts[id+1]++; + cell_counts[id+1]++; } for (int i=nt; ik_cell_counts.set_size(GX,neigh_block); - _shared->k_cell_counts.run(&atom.dev_cell_id, &dev_cell_counts, &nall, + _shared->k_cell_counts.run(&atom.dev_cell_id, &cell_counts, &nall, &ncell_3d); } @@ -490,7 +559,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, _shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)* (ncellz-ghost_cells),cell_block,1); _shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id, - &dev_cell_counts, &dev_nbor, &nbor_host, + &cell_counts, &dev_nbor, &nbor_host, &dev_numj_host, &_max_nbors, &cutoff_cast, &ncellx, &ncelly, &ncellz, &inum, &nt, &nall, 
&_threads_per_atom, &_cells_in_cutoff); diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index 5ac01bf331..7949221ea5 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -47,13 +47,15 @@ class Neighbor { * \param pre_cut True if cutoff test will be performed in separate kernel * than the force kernel * \param threads_per_atom Number of threads used per atom for force - * calculation **/ + * calculation + * \param compile_flags Flags for JIT compiling **/ bool init(NeighborShared *shared, const int inum, const int host_inum, const int max_nbors, const int maxspecial, UCL_Device &dev, const int gpu_nbor, const int gpu_host, const bool pre_cut, const int block_cell_2d, const int block_cell_id, const int block_nbor_build, const int threads_per_atom, - const int warp_size, const bool time_device); + const int warp_size, const bool time_device, + const std::string compile_flags); /// Set the size of the cutoff+skin inline void cell_size(const double size, const double cutoff) { @@ -143,6 +145,10 @@ class Neighbor { void get_host(const int inum, int *ilist, int *numj, int **firstneigh, const int block_size); + /// Copy neighbor list from host for 3-body (first time or from a rebuild) + void get_host3(const int inum, const int nlist, int *ilist, int *numj, + int **firstneigh, const int block_size); + /// Return the stride in elements for each nbor row inline int nbor_pitch() const { return _nbor_pitch; } @@ -207,11 +213,9 @@ class Neighbor { UCL_D_Vec dev_nspecial; /// Device storage for special neighbors UCL_D_Vec dev_special, dev_special_t; - /// Host storage for number of particles per cell - UCL_H_Vec host_cell_counts; + /// Host/Device storage for number of particles per cell + UCL_Vector cell_counts; int *cell_iter; - /// Device storage for number of particles per cell - UCL_D_Vec dev_cell_counts; /// Device timers UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose; diff --git a/lib/gpu/lal_neighbor_shared.cpp b/lib/gpu/lal_neighbor_shared.cpp index ba948b4287..d5d37883cb 100644 --- a/lib/gpu/lal_neighbor_shared.cpp +++ b/lib/gpu/lal_neighbor_shared.cpp @@ -48,15 +48,12 @@ void NeighborShared::clear() { } } -void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor) { +void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor, + const std::string flags) { if (_compiled) return; _gpu_nbor=gpu_nbor; - std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ - std::string(OCL_PRECISION_COMPILE)+" -D"+ - std::string(OCL_VENDOR); - if (_gpu_nbor==0) { nbor_program=new UCL_Program(dev); nbor_program->load_string(neighbor_cpu,flags.c_str()); diff --git a/lib/gpu/lal_neighbor_shared.h b/lib/gpu/lal_neighbor_shared.h index dcd776669c..31d74b0fa6 100644 --- a/lib/gpu/lal_neighbor_shared.h +++ b/lib/gpu/lal_neighbor_shared.h @@ -44,7 +44,8 @@ class NeighborShared { UCL_Texture neigh_tex; /// Compile kernels for neighbor lists - void compile_kernels(UCL_Device &dev, const int gpu_nbor); + void compile_kernels(UCL_Device &dev, const int gpu_nbor, + const std::string flags); // ----------------------------- Kernels UCL_Program *nbor_program, *build_program; diff --git a/lib/gpu/lal_pppm.cpp b/lib/gpu/lal_pppm.cpp index 5a929d9e69..188d23096c 100644 --- a/lib/gpu/lal_pppm.cpp +++ b/lib/gpu/lal_pppm.cpp @@ -136,10 +136,10 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, _npts_y=nyhi_out-nylo_out+1; _npts_z=nzhi_out-nzlo_out+1; _npts_yx=_npts_x*_npts_y; - success=success && 
(brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)== - UCL_SUCCESS); - success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)== - UCL_SUCCESS); + success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device, + UCL_READ_ONLY,UCL_WRITE_ONLY)==UCL_SUCCESS); + success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device, + UCL_READ_WRITE,UCL_READ_ONLY)==UCL_SUCCESS); *vd_brick_p=vd_brick.host.begin(); _max_bytes+=brick.device.row_bytes()+vd_brick.device.row_bytes(); @@ -159,7 +159,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, _max_bytes+=d_brick_atoms.row_bytes(); // Allocate error flags for checking out of bounds atoms - success=success && (error_flag.alloc(1,*ucl_device,UCL_RW_OPTIMIZED, + success=success && (error_flag.alloc(1,*ucl_device,UCL_READ_ONLY, UCL_WRITE_ONLY)==UCL_SUCCESS); if (!success) { flag=-3; @@ -374,9 +374,7 @@ void PPPMT::compile_kernels(UCL_Device &dev) { if (sizeof(grdtyp)==sizeof(double) && ucl_device->double_precision()==false) return; - std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+ - std::string(OCL_PRECISION_COMPILE)+" -D"+ - std::string(OCL_VENDOR); + std::string flags=device->compile_string(); #ifdef USE_OPENCL flags+=std::string(" -Dgrdtyp=")+ucl_template_name()+" -Dgrdtyp4="+ ucl_template_name()+"4"; diff --git a/lib/gpu/lal_pppm_ext.cpp b/lib/gpu/lal_pppm_ext.cpp index c65508987d..08f2c94e90 100644 --- a/lib/gpu/lal_pppm_ext.cpp +++ b/lib/gpu/lal_pppm_ext.cpp @@ -97,11 +97,11 @@ float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, const int nzhi_out, float **rho_coeff, float **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, const int nz_pppm, - const bool split, int &success) { + const bool split, const bool respa, int &success) { float *b=pppm_gpu_init(PPPMF,nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, slab_volfactor,nx_pppm,ny_pppm,nz_pppm,split,success); - if (split==false) + if (split==false && respa==false) PPPMF.device->set_single_precompute(&PPPMF); return b; } @@ -139,12 +139,13 @@ double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen, const int nzhi_out, double **rho_coeff, double **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, - const int nz_pppm, const bool split, int &success) { + const int nz_pppm, const bool split, + const bool respa, int &success) { double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff, vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm, split,success); - if (split==false) + if (split==false && respa==false) PPPMD.device->set_double_precompute(&PPPMD); return b; } diff --git a/lib/gpu/lal_precision.h b/lib/gpu/lal_precision.h index e31b10037e..88890d67f3 100644 --- a/lib/gpu/lal_precision.h +++ b/lib/gpu/lal_precision.h @@ -96,19 +96,27 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; -// OCL_VENDOR: preprocessor define for hardware +// OCL_DEFAULT_VENDOR: preprocessor define for hardware // specific sizes of OpenCL kernel related constants #ifdef FERMI_OCL -#define OCL_VENDOR "FERMI_OCL" +#define OCL_DEFAULT_VENDOR "fermi" +#endif + +#ifdef KEPLER_OCL +#define OCL_DEFAULT_VENDOR "kepler" #endif #ifdef CYPRESS_OCL -#define OCL_VENDOR "CYPRESS_OCL" +#define OCL_DEFAULT_VENDOR "cypress" #endif -#ifndef OCL_VENDOR -#define OCL_VENDOR "GENERIC_OCL" 
+#ifdef GENERIC_OCL +#define OCL_DEFAULT_VENDOR "generic" +#endif + +#ifndef OCL_DEFAULT_VENDOR +#define OCL_DEFAULT_VENDOR "none" #endif #endif diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 16e4c29610..05e3e1d57b 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -214,6 +214,30 @@ typedef struct _double4 double4; #endif +// ------------------------------------------------------------------------- +// NVIDIA GENERIC OPENCL DEFINITIONS +// ------------------------------------------------------------------------- + +#ifdef NV_GENERIC_OCL + +#define USE_OPENCL +#define fast_mul mul24 +#define MEM_THREADS 16 +#define THREADS_PER_ATOM 1 +#define THREADS_PER_CHARGE 1 +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 +#define BLOCK_NBOR_BUILD 64 +#define BLOCK_BIO_PAIR 64 + +#define WARP_SIZE 32 +#define PPPM_BLOCK_1D 64 +#define BLOCK_CELL_2D 8 +#define BLOCK_CELL_ID 128 +#define MAX_BIO_SHARED_TYPES 128 + +#endif + // ------------------------------------------------------------------------- // NVIDIA FERMI OPENCL DEFINITIONS // ------------------------------------------------------------------------- @@ -221,9 +245,6 @@ typedef struct _double4 double4; #ifdef FERMI_OCL #define USE_OPENCL -#define fast_mul(X,Y) (X)*(Y) -#define ARCH 0 -#define DRIVER 0 #define MEM_THREADS 32 #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 @@ -238,7 +259,54 @@ typedef struct _double4 double4; #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 -#pragma OPENCL EXTENSION cl_khr_fp64: enable +#endif + +// ------------------------------------------------------------------------- +// NVIDIA KEPLER OPENCL DEFINITIONS +// ------------------------------------------------------------------------- + +#ifdef KEPLER_OCL + +#define USE_OPENCL +#define MEM_THREADS 32 +#define THREADS_PER_ATOM 4 +#define THREADS_PER_CHARGE 8 +#define BLOCK_PAIR 256 +#define MAX_SHARED_TYPES 11 +#define BLOCK_NBOR_BUILD 128 +#define BLOCK_BIO_PAIR 256 +#define BLOCK_ELLIPSE 128 + +#define WARP_SIZE 32 +#define PPPM_BLOCK_1D 64 +#define BLOCK_CELL_2D 8 +#define BLOCK_CELL_ID 128 +#define MAX_BIO_SHARED_TYPES 128 + +#ifndef NO_OCL_PTX +#define ARCH 300 +#ifdef _SINGLE_SINGLE +inline float shfl_xor(float var, int laneMask, int width) { + float ret; + int c; + c = ((WARP_SIZE-width) << 8) | 0x1f; + asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c)); + return ret; +} +#else +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +inline double shfl_xor(double var, int laneMask, int width) { + int c = ((WARP_SIZE-width) << 8) | 0x1f; + int x,y,x2,y2; + double ans; + asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var)); + asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(x2) : "r"(x), "r"(laneMask), "r"(c)); + asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(y2) : "r"(y), "r"(laneMask), "r"(c)); + asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2)); + return ans; +} +#endif +#endif #endif @@ -249,9 +317,6 @@ typedef struct _double4 double4; #ifdef CYPRESS_OCL #define USE_OPENCL -#define fast_mul(X,Y) (X)*(Y) -#define ARCH 0 -#define DRIVER 0 #define MEM_THREADS 32 #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 @@ -266,12 +331,6 @@ typedef struct _double4 double4; #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 -#if defined(cl_khr_fp64) -#pragma OPENCL EXTENSION cl_khr_fp64 : enable -#elif defined(cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64 : enable -#endif - #endif // 
------------------------------------------------------------------------- @@ -281,9 +340,6 @@ typedef struct _double4 double4; #ifdef GENERIC_OCL #define USE_OPENCL -#define fast_mul mul24 -#define ARCH 0 -#define DRIVER 0 #define MEM_THREADS 16 #define THREADS_PER_ATOM 1 #define THREADS_PER_CHARGE 1 @@ -298,6 +354,20 @@ typedef struct _double4 double4; #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 +#endif + +// ------------------------------------------------------------------------- +// OPENCL Stuff for All Hardware +// ------------------------------------------------------------------------- +#ifdef USE_OPENCL + +#ifndef _SINGLE_SINGLE + +#ifndef cl_khr_fp64 +#ifndef cl_amd_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif +#endif #if defined(cl_khr_fp64) #pragma OPENCL EXTENSION cl_khr_fp64 : enable #elif defined(cl_amd_fp64) @@ -306,10 +376,17 @@ typedef struct _double4 double4; #endif -// ------------------------------------------------------------------------- -// OPENCL Stuff for All Hardware -// ------------------------------------------------------------------------- -#ifdef USE_OPENCL +#ifndef fast_mul +#define fast_mul(X,Y) (X)*(Y) +#endif + +#ifndef ARCH +#define ARCH 0 +#endif + +#ifndef DRIVER +#define DRIVER 0 +#endif #define GLOBAL_ID_X get_global_id(0) #define THREAD_ID_X get_local_id(0) diff --git a/lib/gpu/lal_re_squared.cpp b/lib/gpu/lal_re_squared.cpp index 8f7ef24a11..cbf50fab7d 100644 --- a/lib/gpu/lal_re_squared.cpp +++ b/lib/gpu/lal_re_squared.cpp @@ -74,7 +74,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, // Allocate a host write buffer for copying type data UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; iatom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, host_offset); - dev_error.alloc(1,*(this->ucl_device)); + dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY); dev_error.zero(); // Allocate, cast and asynchronous memcpy of constant data @@ -260,6 +260,9 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); this->time_ellipsoid3.stop(); } else { + GX=static_cast(ceil(static_cast(this->ans->inum()- + this->_last_ellipse)/ + (BX/this->_threads_per_atom))); this->ans->force.zero(); this->ans->engv.zero(); this->time_nbor1.zero(); diff --git a/lib/gpu/lal_re_squared.cu b/lib/gpu/lal_re_squared.cu index 28e15b5cdf..1308a70a7f 100644 --- a/lib/gpu/lal_re_squared.cu +++ b/lib/gpu/lal_re_squared.cu @@ -75,7 +75,8 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, if (ii0) { \ - *ap1+=energy; \ + *ap1+=energy*(acctyp)0.5; \ ap1+=astride; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *ap1+=virial[i]; \ + *ap1+=virial[i]*(acctyp)0.5; \ ap1+=astride; \ } \ } \ @@ -104,12 +104,12 @@ if (offset==0) { \ __global acctyp *ap1=engv+ii; \ if (eflag>0) { \ - *ap1+=energy; \ + *ap1+=energy*(acctyp)0.5; \ ap1+=astride; \ } \ if (vflag>0) { \ for (int i=0; i<6; i++) { \ - *ap1+=virial[i]; \ + *ap1+=virial[i]*(acctyp)0.5; \ ap1+=astride; \ } \ } \ @@ -173,7 +173,8 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, if (ii host_write_int(lj_types*lj_types,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i host_write(lj_types*lj_types,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); for (int ix=1; ix 
host_write2(_ntables*_tablength,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; i<_ntables*_tablength; i++) { host_write2[i].x = 0.0; host_write2[i].y = 0.0; @@ -190,7 +190,7 @@ int TableT::init(const int ntypes, ucl_copy(coeff4,host_write2,false); UCL_H_Vec host_rsq(lj_types*lj_types,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq); diff --git a/lib/gpu/lal_table.cu b/lib/gpu/lal_table.cu index ed843b59ca..afc6267902 100644 --- a/lib/gpu/lal_table.cu +++ b/lib/gpu/lal_table.cu @@ -74,7 +74,8 @@ __kernel void k_table(const __global numtyp4 *restrict x_, if (ii host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; iucl_device->shared_memory() && sizeof(numtyp)==sizeof(double)) + _shared_view=true; + else + _shared_view=false; + // allocate rad - bool cpuview=false; - if (this->ucl_device->device_type()==UCL_CPU) - cpuview=true; - int ef_nall=nall; if (ef_nall==0) ef_nall=2000; _max_rad_size=static_cast(static_cast(ef_nall)*1.10); - host_rad.alloc(_max_rad_size,*(this->ucl_device)); - if (cpuview) - dev_rad.view(host_rad); - else - dev_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY); + + if (_shared_view==false) + c_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); rad_tex.get_texture(*(this->pair_program),"rad_tex"); - rad_tex.bind_float(dev_rad,1); + rad_tex.bind_float(c_rad,1); // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; @@ -90,7 +89,7 @@ int YukawaColloidT::init(const int ntypes, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), - UCL_WRITE_OPTIMIZED); + UCL_WRITE_ONLY); for (int i=0; iclear_atomic(); } @@ -143,22 +141,11 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full, // ------------------- Resize rad array -------------------------- if (nall>_max_rad_size) { - dev_rad.clear(); - host_rad.clear(); - _max_rad_size=static_cast(static_cast(nall)*1.10); - host_rad.alloc(_max_rad_size,*(this->ucl_device)); - - if (this->ucl_device->device_type()==UCL_CPU) { - if (sizeof(numtyp)==sizeof(double)) { - host_rad.view((numtyp*)rad,nall,*(this->ucl_device)); - dev_rad.view(host_rad); - } - } else { - dev_rad.alloc(_max_rad_size,*(this->ucl_device)); + if (_shared_view==false) { + c_rad.resize(_max_rad_size); + rad_tex.bind_float(c_rad,1); } - - rad_tex.bind_float(dev_rad,1); } // ---------------------------------------------------------------- @@ -212,22 +199,11 @@ int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall // ------------------- Resize rad array ---------------------------- if (nall>_max_rad_size) { - dev_rad.clear(); - host_rad.clear(); - _max_rad_size=static_cast(static_cast(nall)*1.10); - host_rad.alloc(_max_rad_size,*(this->ucl_device)); - - if (this->ucl_device->device_type()==UCL_CPU) { - if (sizeof(numtyp)==sizeof(double)) { - host_rad.view((numtyp*)rad,nall,*(this->ucl_device)); - dev_rad.view(host_rad); - } - } else { - dev_rad.alloc(_max_rad_size,*(this->ucl_device)); + if (_shared_view==false) { + c_rad.resize(_max_rad_size); + rad_tex.bind_float(c_rad,1); } - - rad_tex.bind_float(dev_rad,1); } // ----------------------------------------------------------------- @@ -298,13 +274,13 @@ void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { 
this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &dev_rad, &coeff, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &c_rad, &coeff, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &dev_rad, &coeff, &_lj_types, &sp_lj, + this->k_pair.run(&this->atom->x, &c_rad, &coeff, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); diff --git a/lib/gpu/lal_yukawa_colloid.cu b/lib/gpu/lal_yukawa_colloid.cu index 55f52e70e7..65da63f8f7 100644 --- a/lib/gpu/lal_yukawa_colloid.cu +++ b/lib/gpu/lal_yukawa_colloid.cu @@ -59,7 +59,8 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, if (ii { inline void cast_rad_data(double* rad) { int nall = this->atom->nall(); - if (this->ucl_device->device_type()==UCL_CPU) { - if (sizeof(numtyp)==sizeof(double)) { - host_rad.view((numtyp*)rad,nall,*(this->ucl_device)); - dev_rad.view(host_rad); - } else { - for (int i=0; iucl_device)); + c_rad.device.view(c_rad.host); } else { if (sizeof(numtyp)==sizeof(double)) - memcpy(host_rad.begin(),rad,nall*sizeof(numtyp)); - else { - for (int i=0; iatom->nall(),true); + c_rad.update_device(this->atom->nall(),true); } /// Clear all host and device data @@ -114,10 +109,10 @@ class YukawaColloid : public BaseAtomic { numtyp _kappa; /// Per-atom arrays - UCL_H_Vec host_rad; - UCL_D_Vec dev_rad; + UCL_Vector c_rad; private: + bool _shared_view; bool _allocated; void loop(const bool _eflag, const bool _vflag); };
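
A note on the reworked AnswerT::energy_virial in lal_answer.cpp: the host loops now read engv as contiguous blocks of _inum values per field, with every entry already multiplied by 0.5 inside the kernels (see the lal_aux_fun1.h, lal_coul_long.cu, lal_dipole_*.cu, lal_eam.cu and lal_ellipsoid_extra.h hunks). The sketch below is only a standalone illustration of that reduction: the sizes and values are made up, the _ilist indirection and the _ef_atom/_vf_atom switches are omitted, and the inner virial loop is inferred from context because the hunk is truncated in this listing.

  // Minimal standalone sketch of the host-side engv reduction assumed by the
  // patch; everything here is illustrative, not the patched code itself.
  #include <cstdio>
  #include <vector>

  int main() {
    const int inum = 4;                    // local atom count (made up)
    const bool eflag = true, vflag = true;
    // Layout: [inum energies][6 x inum virial terms]; charged styles add a
    // second block of inum coulomb energies after the first block. Every
    // entry is already multiplied by 0.5 in the kernel.
    std::vector<double> engv(inum + 6 * inum, 0.25);   // dummy kernel output

    std::vector<double> eatom(inum, 0.0);
    std::vector<std::vector<double> > vatom(inum, std::vector<double>(6, 0.0));
    double evdwl = 0.0;
    double virial[6] = {0.0};

    int vstart = 0;
    if (eflag) {
      for (int i = 0; i < inum; i++) {
        evdwl += engv[i];
        eatom[i] += engv[i];               // no extra *0.5 on the host anymore
      }
      vstart = inum;                       // virial blocks start after the energies
    }
    if (vflag) {
      for (int j = 0; j < 6; j++) {        // six contiguous blocks of inum entries
        for (int i = 0; i < inum; i++) {
          virial[j]   += engv[vstart + i];
          vatom[i][j] += engv[vstart + i];
        }
        vstart += inum;
      }
    }
    std::printf("evdwl=%g virial[0]=%g\n", evdwl, virial[0]);
    return 0;
  }

Compared with the old per-atom walk (al advancing by _inum inside one loop), each quantity is now accumulated in a simple contiguous pass, and the *0.5 the host used to apply to eatom/vatom is gone because the kernels store half values directly.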
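
The new Device::set_ocl_params in lal_device.cpp accepts, besides the named targets (fermi, kepler, cypress, generic), a custom device string of exactly 13 whitespace-separated values that are translated into -D defines for JIT compilation; a wrong count makes init_device return -11. The standalone sketch below re-implements that translation in a simplified way (std::istringstream instead of the patch's strtok, and no quote stripping); the example numbers are simply the KEPLER_OCL defaults from lal_preprocessor.h.

  // Simplified, standalone version of the custom-parameter handling in
  // Device::set_ocl_params; not the patched code itself.
  #include <iostream>
  #include <sstream>
  #include <string>
  #include <vector>

  int main() {
    // 13 values in the order expected by the patch; numbers taken from the
    // KEPLER_OCL block purely as an example.
    const std::string custom = "32 4 8 256 11 128 256 128 32 64 8 128 128";
    const char *names[13] = {
      "MEM_THREADS", "THREADS_PER_ATOM", "THREADS_PER_CHARGE", "BLOCK_PAIR",
      "MAX_SHARED_TYPES", "BLOCK_NBOR_BUILD", "BLOCK_BIO_PAIR", "BLOCK_ELLIPSE",
      "WARP_SIZE", "PPPM_BLOCK_1D", "BLOCK_CELL_2D", "BLOCK_CELL_ID",
      "MAX_BIO_SHARED_TYPES"};

    std::istringstream in(custom);
    std::vector<std::string> params;
    for (std::string tok; in >> tok;) params.push_back(tok);
    if (params.size() != 13) {             // wrong count: init_device reports -11
      std::cerr << "expected 13 parameters, got " << params.size() << "\n";
      return 1;
    }

    std::string vendor_string = "-DUSE_OPENCL";
    for (int i = 0; i < 13; i++)
      vendor_string += " -D" + std::string(names[i]) + "=" + params[i];
    std::cout << vendor_string << "\n";
    return 0;
  }

In the patch the resulting vendor string is then prefixed with "-cl-fast-relaxed-math -cl-mad-enable" and OCL_PRECISION_COMPILE to form _ocl_compile_string, which is what compile_string() hands to every load_string() call in the lal_base_*.cpp, lal_neighbor_shared.cpp and lal_pppm.cpp hunks.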
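
The lal_preprocessor.h hunks move fast_mul, ARCH and DRIVER out of the individual target blocks and into the shared "OPENCL Stuff for All Hardware" section as #ifndef fallbacks, so a target block such as KEPLER_OCL only defines what it actually overrides (for example ARCH 300 when PTX intrinsics are enabled). The toy program below shows the same fallback pattern; DEMO_TARGET_OCL is an invented stand-in for one of the real target macros.

  // Toy illustration of the #ifndef fallback pattern now used in
  // lal_preprocessor.h; DEMO_TARGET_OCL stands in for FERMI_OCL, KEPLER_OCL,
  // CYPRESS_OCL or GENERIC_OCL.
  #include <cstdio>

  #define DEMO_TARGET_OCL

  #ifdef DEMO_TARGET_OCL
  #define ARCH 300               // a target block defines only what it overrides
  #endif

  // Shared section: defaults apply only when no target block defined them.
  #ifndef fast_mul
  #define fast_mul(X,Y) (X)*(Y)
  #endif
  #ifndef ARCH
  #define ARCH 0
  #endif
  #ifndef DRIVER
  #define DRIVER 0
  #endif

  int main() {
    std::printf("ARCH=%d DRIVER=%d fast_mul(3,4)=%d\n", ARCH, DRIVER,
                fast_mul(3,4));
    return 0;
  }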