diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index 0824d048b8..31c687369a 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -3,6 +3,7 @@ CUDA  = $(NVCC) $(CUDA_INCLUDE) $(CUDA_OPTS) -Icudpp_mini $(CUDA_ARCH) \
 CUDR  = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
          $(CUDPP_OPT)
 CUDA_LINK = $(CUDA_LIB) -lcudart
+BIN2C = $(CUDA_HOME)/bin/bin2c
 
 GPU_LIB = $(LIB_DIR)/libgpu.a
 
@@ -27,6 +28,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
        $(OBJ_DIR)/lal_neighbor.o $(OBJ_DIR)/lal_neighbor_shared.o \
        $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
        $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
+       $(OBJ_DIR)/lal_base_dipole.o \
        $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
        $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
        $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@@ -35,6 +37,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
        $(OBJ_DIR)/lal_lj_expand.o $(OBJ_DIR)/lal_lj_expand_ext.o \
        $(OBJ_DIR)/lal_lj_coul.o $(OBJ_DIR)/lal_lj_coul_ext.o \
        $(OBJ_DIR)/lal_lj_coul_long.o $(OBJ_DIR)/lal_lj_coul_long_ext.o \
+       $(OBJ_DIR)/lal_lj_dsf.o $(OBJ_DIR)/lal_lj_dsf_ext.o \
        $(OBJ_DIR)/lal_lj_class2_long.o $(OBJ_DIR)/lal_lj_class2_long_ext.o \
        $(OBJ_DIR)/lal_coul_long.o $(OBJ_DIR)/lal_coul_long_ext.o \
        $(OBJ_DIR)/lal_morse.o $(OBJ_DIR)/lal_morse_ext.o \
@@ -46,35 +49,57 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
        $(OBJ_DIR)/lal_buck_coul.o $(OBJ_DIR)/lal_buck_coul_ext.o \
        $(OBJ_DIR)/lal_buck_coul_long.o $(OBJ_DIR)/lal_buck_coul_long_ext.o \
        $(OBJ_DIR)/lal_table.o $(OBJ_DIR)/lal_table_ext.o \
-       $(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o
-PTXS = $(OBJ_DIR)/device.ptx $(OBJ_DIR)/device_ptx.h \
-       $(OBJ_DIR)/atom.ptx $(OBJ_DIR)/atom_ptx.h \
-       $(OBJ_DIR)/neighbor_cpu.ptx $(OBJ_DIR)/neighbor_cpu_ptx.h \
-       $(OBJ_DIR)/neighbor_gpu.ptx $(OBJ_DIR)/neighbor_gpu_ptx.h \
-       $(OBJ_DIR)/pppm_f.ptx $(OBJ_DIR)/pppm_f_ptx.h \
-       $(OBJ_DIR)/pppm_d.ptx $(OBJ_DIR)/pppm_d_ptx.h \
-       $(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/ellipsoid_nbor_ptx.h \
-       $(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_lj.ptx \
-       $(OBJ_DIR)/gayberne_ptx.h $(OBJ_DIR)/gayberne_lj_ptx.h \
-       $(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_lj.ptx \
-       $(OBJ_DIR)/re_squared_ptx.h $(OBJ_DIR)/re_squared_lj_ptx.h \
-       $(OBJ_DIR)/lj.ptx $(OBJ_DIR)/lj_ptx.h \
-       $(OBJ_DIR)/lj96.ptx $(OBJ_DIR)/lj96_ptx.h \
-       $(OBJ_DIR)/lj_expand.ptx $(OBJ_DIR)/lj_expand_ptx.h \
-       $(OBJ_DIR)/lj_coul.ptx $(OBJ_DIR)/lj_coul_ptx.h \
-       $(OBJ_DIR)/lj_coul_long.ptx $(OBJ_DIR)/lj_coul_long_ptx.h \
-       $(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long_ptx.h \
-       $(OBJ_DIR)/coul_long.ptx $(OBJ_DIR)/coul_long_ptx.h \
-       $(OBJ_DIR)/morse.ptx $(OBJ_DIR)/morse_ptx.h \
-       $(OBJ_DIR)/charmm_long.ptx $(OBJ_DIR)/charmm_long_ptx.h \
-       $(OBJ_DIR)/cg_cmm.ptx $(OBJ_DIR)/cg_cmm_ptx.h \
-       $(OBJ_DIR)/cg_cmm_long.ptx $(OBJ_DIR)/cg_cmm_long_ptx.h \
-       $(OBJ_DIR)/eam.ptx $(OBJ_DIR)/eam_ptx.h \
-       $(OBJ_DIR)/buck.ptx $(OBJ_DIR)/buck_ptx.h \
-       $(OBJ_DIR)/buck_coul.ptx $(OBJ_DIR)/buck_coul_ptx.h \
-       $(OBJ_DIR)/buck_coul_long.ptx $(OBJ_DIR)/buck_coul_long_ptx.h \
-       $(OBJ_DIR)/table.ptx $(OBJ_DIR)/table_ptx.h \
-       $(OBJ_DIR)/yukawa.ptx $(OBJ_DIR)/yukawa_ptx.h
+       $(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o \
+       $(OBJ_DIR)/lal_born.o $(OBJ_DIR)/lal_born_ext.o \
+       $(OBJ_DIR)/lal_born_coul_wolf.o $(OBJ_DIR)/lal_born_coul_wolf_ext.o \
+       $(OBJ_DIR)/lal_born_coul_long.o $(OBJ_DIR)/lal_born_coul_long_ext.o \
+       $(OBJ_DIR)/lal_dipole_lj.o $(OBJ_DIR)/lal_dipole_lj_ext.o \
+       $(OBJ_DIR)/lal_dipole_lj_sf.o $(OBJ_DIR)/lal_dipole_lj_sf_ext.o \
+       $(OBJ_DIR)/lal_colloid.o $(OBJ_DIR)/lal_colloid_ext.o \
+       $(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
+       $(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
+       $(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
+       $(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o
+
+CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
+       $(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
+       $(OBJ_DIR)/neighbor_cpu.cubin $(OBJ_DIR)/neighbor_cpu_cubin.h \
+       $(OBJ_DIR)/neighbor_gpu.cubin $(OBJ_DIR)/neighbor_gpu_cubin.h \
+       $(OBJ_DIR)/pppm_f.cubin $(OBJ_DIR)/pppm_f_cubin.h \
+       $(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/pppm_d_cubin.h \
+       $(OBJ_DIR)/ellipsoid_nbor.cubin $(OBJ_DIR)/ellipsoid_nbor_cubin.h \
+       $(OBJ_DIR)/gayberne.cubin $(OBJ_DIR)/gayberne_lj.cubin \
+       $(OBJ_DIR)/gayberne_cubin.h $(OBJ_DIR)/gayberne_lj_cubin.h \
+       $(OBJ_DIR)/re_squared.cubin $(OBJ_DIR)/re_squared_lj.cubin \
+       $(OBJ_DIR)/re_squared_cubin.h $(OBJ_DIR)/re_squared_lj_cubin.h \
+       $(OBJ_DIR)/lj.cubin $(OBJ_DIR)/lj_cubin.h \
+       $(OBJ_DIR)/lj96.cubin $(OBJ_DIR)/lj96_cubin.h \
+       $(OBJ_DIR)/lj_expand.cubin $(OBJ_DIR)/lj_expand_cubin.h \
+       $(OBJ_DIR)/lj_coul.cubin $(OBJ_DIR)/lj_coul_cubin.h \
+       $(OBJ_DIR)/lj_coul_long.cubin $(OBJ_DIR)/lj_coul_long_cubin.h \
+       $(OBJ_DIR)/lj_dsf.cubin $(OBJ_DIR)/lj_dsf_cubin.h \
+       $(OBJ_DIR)/lj_class2_long.cubin $(OBJ_DIR)/lj_class2_long_cubin.h \
+       $(OBJ_DIR)/coul_long.cubin $(OBJ_DIR)/coul_long_cubin.h \
+       $(OBJ_DIR)/morse.cubin $(OBJ_DIR)/morse_cubin.h \
+       $(OBJ_DIR)/charmm_long.cubin $(OBJ_DIR)/charmm_long_cubin.h \
+       $(OBJ_DIR)/cg_cmm.cubin $(OBJ_DIR)/cg_cmm_cubin.h \
+       $(OBJ_DIR)/cg_cmm_long.cubin $(OBJ_DIR)/cg_cmm_long_cubin.h \
+       $(OBJ_DIR)/eam.cubin $(OBJ_DIR)/eam_cubin.h \
+       $(OBJ_DIR)/buck.cubin $(OBJ_DIR)/buck_cubin.h \
+       $(OBJ_DIR)/buck_coul_long.cubin $(OBJ_DIR)/buck_coul_long_cubin.h \
+       $(OBJ_DIR)/buck_coul.cubin $(OBJ_DIR)/buck_coul_cubin.h \
+       $(OBJ_DIR)/table.cubin $(OBJ_DIR)/table_cubin.h \
+       $(OBJ_DIR)/yukawa.cubin $(OBJ_DIR)/yukawa_cubin.h \
+       $(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \
+       $(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf_cubin.h \
+       $(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long_cubin.h \
+       $(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj_cubin.h \
+       $(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf_cubin.h \
+       $(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid_cubin.h \
+       $(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \
+       $(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \
+       $(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \
+       $(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h
 
 all: $(GPU_LIB) $(EXECS)
 
@@ -96,43 +121,43 @@ $(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
 $(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
 	$(CUDA) -o $@ -c cudpp_mini/scan_app.cu
 
-$(OBJ_DIR)/atom.ptx: lal_atom.cu lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_atom.cu
+$(OBJ_DIR)/atom.cubin: lal_atom.cu lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_atom.cu
 
-$(OBJ_DIR)/atom_ptx.h: $(OBJ_DIR)/atom.ptx
-	$(BSH) ./geryon/file_to_cstr.sh atom $(OBJ_DIR)/atom.ptx $(OBJ_DIR)/atom_ptx.h
+$(OBJ_DIR)/atom_cubin.h: $(OBJ_DIR)/atom.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n atom $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/lal_atom.o: lal_atom.cpp lal_atom.h $(NVD_H) $(OBJ_DIR)/atom_ptx.h
+$(OBJ_DIR)/lal_atom.o: lal_atom.cpp lal_atom.h $(NVD_H) $(OBJ_DIR)/atom_cubin.h
 	$(CUDR) -o $@ -c lal_atom.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_ans.o: lal_answer.cpp lal_answer.h $(NVD_H)
 	$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/neighbor_cpu.ptx: lal_neighbor_cpu.cu lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_neighbor_cpu.cu
+$(OBJ_DIR)/neighbor_cpu.cubin: lal_neighbor_cpu.cu lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_neighbor_cpu.cu
 
-$(OBJ_DIR)/neighbor_cpu_ptx.h: $(OBJ_DIR)/neighbor_cpu.ptx
-	$(BSH) ./geryon/file_to_cstr.sh neighbor_cpu $(OBJ_DIR)/neighbor_cpu.ptx $(OBJ_DIR)/neighbor_cpu_ptx.h
+$(OBJ_DIR)/neighbor_cpu_cubin.h: $(OBJ_DIR)/neighbor_cpu.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n neighbor_cpu $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/neighbor_gpu.ptx: lal_neighbor_gpu.cu lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_neighbor_gpu.cu
+$(OBJ_DIR)/neighbor_gpu.cubin: lal_neighbor_gpu.cu lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_neighbor_gpu.cu
 
-$(OBJ_DIR)/neighbor_gpu_ptx.h: $(OBJ_DIR)/neighbor_gpu.ptx
-	$(BSH) ./geryon/file_to_cstr.sh neighbor_gpu $(OBJ_DIR)/neighbor_gpu.ptx $(OBJ_DIR)/neighbor_gpu_ptx.h
+$(OBJ_DIR)/neighbor_gpu_cubin.h: $(OBJ_DIR)/neighbor_gpu.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n neighbor_gpu $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp lal_neighbor_shared.h $(OBJ_DIR)/neighbor_cpu_ptx.h $(OBJ_DIR)/neighbor_gpu_ptx.h $(NVD_H)
+$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp lal_neighbor_shared.h $(OBJ_DIR)/neighbor_cpu_cubin.h $(OBJ_DIR)/neighbor_gpu_cubin.h $(NVD_H)
 	$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp lal_neighbor.h lal_neighbor_shared.h $(NVD_H)
 	$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/device.ptx: lal_device.cu lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_device.cu
+$(OBJ_DIR)/device.cubin: lal_device.cu lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_device.cu
 
-$(OBJ_DIR)/device_ptx.h: $(OBJ_DIR)/device.ptx
-	$(BSH) ./geryon/file_to_cstr.sh device $(OBJ_DIR)/device.ptx $(OBJ_DIR)/device_ptx.h
+$(OBJ_DIR)/device_cubin.h: $(OBJ_DIR)/device.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n device $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/lal_device.o: lal_device.cpp lal_device.h $(ALL_H) $(OBJ_DIR)/device_ptx.h
+$(OBJ_DIR)/lal_device.o: lal_device.cpp lal_device.h $(ALL_H) $(OBJ_DIR)/device_cubin.h
 	$(CUDR) -o $@ -c lal_device.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_base_atomic.o: $(ALL_H) lal_base_atomic.h lal_base_atomic.cpp
@@ -141,273 +166,408 @@ $(OBJ_DIR)/lal_base_atomic.o: $(ALL_H) lal_base_atomic.h lal_base_atomic.cpp
 $(OBJ_DIR)/lal_base_charge.o: $(ALL_H) lal_base_charge.h lal_base_charge.cpp
 	$(CUDR) -o $@ -c lal_base_charge.cpp
 
-$(OBJ_DIR)/lal_base_ellipsoid.o: $(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_ptx.h
+$(OBJ_DIR)/lal_base_ellipsoid.o: $(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_cubin.h
 	$(CUDR) -o $@ -c lal_base_ellipsoid.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/pppm_f.ptx: lal_pppm.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu
+$(OBJ_DIR)/lal_base_dipole.o: $(ALL_H) lal_base_dipole.h lal_base_dipole.cpp
+	$(CUDR) -o $@ -c lal_base_dipole.cpp
 
-$(OBJ_DIR)/pppm_f_ptx.h: $(OBJ_DIR)/pppm_f.ptx
-	$(BSH) ./geryon/file_to_cstr.sh pppm_f $(OBJ_DIR)/pppm_f.ptx $(OBJ_DIR)/pppm_f_ptx.h
+$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu
 
-$(OBJ_DIR)/pppm_d.ptx: lal_pppm.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu
+$(OBJ_DIR)/pppm_f_cubin.h: $(OBJ_DIR)/pppm_f.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n pppm_f $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/pppm_d_ptx.h: $(OBJ_DIR)/pppm_d.ptx
-	$(BSH) ./geryon/file_to_cstr.sh pppm_d $(OBJ_DIR)/pppm_d.ptx $(OBJ_DIR)/pppm_d_ptx.h
+$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu
 
-$(OBJ_DIR)/lal_pppm.o: $(ALL_H) lal_pppm.h lal_pppm.cpp $(OBJ_DIR)/pppm_f_ptx.h $(OBJ_DIR)/pppm_d_ptx.h
+$(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n pppm_d $< > $@.tmp && mv -f $@.tmp $@
+
+$(OBJ_DIR)/lal_pppm.o: $(ALL_H) lal_pppm.h lal_pppm.cpp $(OBJ_DIR)/pppm_f_cubin.h $(OBJ_DIR)/pppm_d_cubin.h
 	$(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_pppm_ext.o: $(ALL_H) lal_pppm.h lal_pppm_ext.cpp
 	$(CUDR) -o $@ -c lal_pppm_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/ellipsoid_nbor.ptx: lal_ellipsoid_nbor.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_ellipsoid_nbor.cu
+$(OBJ_DIR)/ellipsoid_nbor.cubin: lal_ellipsoid_nbor.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_ellipsoid_nbor.cu
 
-$(OBJ_DIR)/ellipsoid_nbor_ptx.h: $(OBJ_DIR)/ellipsoid_nbor.ptx
-	$(BSH) ./geryon/file_to_cstr.sh ellipsoid_nbor $(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/ellipsoid_nbor_ptx.h
+$(OBJ_DIR)/ellipsoid_nbor_cubin.h: $(OBJ_DIR)/ellipsoid_nbor.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n ellipsoid_nbor $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/gayberne.ptx: lal_gayberne.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_gayberne.cu
+$(OBJ_DIR)/gayberne.cubin: lal_gayberne.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_gayberne.cu
 
-$(OBJ_DIR)/gayberne_lj.ptx: lal_gayberne_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_gayberne_lj.cu
+$(OBJ_DIR)/gayberne_lj.cubin: lal_gayberne_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_gayberne_lj.cu
 
-$(OBJ_DIR)/gayberne_ptx.h: $(OBJ_DIR)/gayberne.ptx
-	$(BSH) ./geryon/file_to_cstr.sh gayberne $(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_ptx.h
+$(OBJ_DIR)/gayberne_cubin.h: $(OBJ_DIR)/gayberne.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n gayberne $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/gayberne_lj_ptx.h: $(OBJ_DIR)/gayberne_lj.ptx
-	$(BSH) ./geryon/file_to_cstr.sh gayberne_lj $(OBJ_DIR)/gayberne_lj.ptx $(OBJ_DIR)/gayberne_lj_ptx.h
+$(OBJ_DIR)/gayberne_lj_cubin.h: $(OBJ_DIR)/gayberne_lj.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n gayberne_lj $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_ptx.h $(OBJ_DIR)/gayberne_lj_ptx.h $(OBJ_DIR)/lal_base_ellipsoid.o
+$(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_cubin.h $(OBJ_DIR)/gayberne_lj_cubin.h $(OBJ_DIR)/lal_base_ellipsoid.o
 	$(CUDR) -o $@ -c lal_gayberne.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_gayberne_ext.o: $(ALL_H) $(OBJ_DIR)/lal_gayberne.o lal_gayberne_ext.cpp
 	$(CUDR) -o $@ -c lal_gayberne_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/re_squared.ptx: lal_re_squared.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_re_squared.cu
+$(OBJ_DIR)/re_squared.cubin: lal_re_squared.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_re_squared.cu
 
-$(OBJ_DIR)/re_squared_lj.ptx: lal_re_squared_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_re_squared_lj.cu
+$(OBJ_DIR)/re_squared_lj.cubin: lal_re_squared_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_re_squared_lj.cu
 
-$(OBJ_DIR)/re_squared_ptx.h: $(OBJ_DIR)/re_squared.ptx
-	$(BSH) ./geryon/file_to_cstr.sh re_squared $(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_ptx.h
+$(OBJ_DIR)/re_squared_cubin.h: $(OBJ_DIR)/re_squared.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n re_squared $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/re_squared_lj_ptx.h: $(OBJ_DIR)/re_squared_lj.ptx
-	$(BSH) ./geryon/file_to_cstr.sh re_squared_lj $(OBJ_DIR)/re_squared_lj.ptx $(OBJ_DIR)/re_squared_lj_ptx.h
+$(OBJ_DIR)/re_squared_lj_cubin.h: $(OBJ_DIR)/re_squared_lj.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n re_squared_lj $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_ptx.h $(OBJ_DIR)/re_squared_lj_ptx.h $(OBJ_DIR)/lal_base_ellipsoid.o
+$(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_cubin.h $(OBJ_DIR)/re_squared_lj_cubin.h $(OBJ_DIR)/lal_base_ellipsoid.o
 	$(CUDR) -o $@ -c lal_re_squared.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_re_squared_ext.o: $(ALL_H) $(OBJ_DIR)/lal_re_squared.o lal_re_squared_ext.cpp
 	$(CUDR) -o $@ -c lal_re_squared_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj.ptx: lal_lj.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj.cu
+$(OBJ_DIR)/lj.cubin: lal_lj.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj.cu
 
-$(OBJ_DIR)/lj_ptx.h: $(OBJ_DIR)/lj.ptx $(OBJ_DIR)/lj.ptx
-	$(BSH) ./geryon/file_to_cstr.sh lj $(OBJ_DIR)/lj.ptx $(OBJ_DIR)/lj_ptx.h
+$(OBJ_DIR)/lj_cubin.h: $(OBJ_DIR)/lj.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n lj $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/lal_lj.o: $(ALL_H) lal_lj.h lal_lj.cpp $(OBJ_DIR)/lj_ptx.h $(OBJ_DIR)/lal_base_atomic.o
+$(OBJ_DIR)/lal_lj.o: $(ALL_H) lal_lj.h lal_lj.cpp $(OBJ_DIR)/lj_cubin.h $(OBJ_DIR)/lal_base_atomic.o
 	$(CUDR) -o $@ -c lal_lj.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_lj_ext.o: $(ALL_H) lal_lj.h lal_lj_ext.cpp lal_base_atomic.h
 	$(CUDR) -o $@ -c lal_lj_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj_coul.ptx: lal_lj_coul.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_coul.cu
+$(OBJ_DIR)/lj_coul.cubin: lal_lj_coul.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul.cu
 
-$(OBJ_DIR)/lj_coul_ptx.h: $(OBJ_DIR)/lj_coul.ptx $(OBJ_DIR)/lj_coul.ptx
-	$(BSH) ./geryon/file_to_cstr.sh lj_coul $(OBJ_DIR)/lj_coul.ptx $(OBJ_DIR)/lj_coul_ptx.h
+$(OBJ_DIR)/lj_coul_cubin.h: $(OBJ_DIR)/lj_coul.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n lj_coul $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/lal_lj_coul.o: $(ALL_H) lal_lj_coul.h lal_lj_coul.cpp $(OBJ_DIR)/lj_coul_ptx.h $(OBJ_DIR)/lal_base_charge.o
+$(OBJ_DIR)/lal_lj_coul.o: $(ALL_H) lal_lj_coul.h lal_lj_coul.cpp $(OBJ_DIR)/lj_coul_cubin.h $(OBJ_DIR)/lal_base_charge.o
 	$(CUDR) -o $@ -c lal_lj_coul.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_lj_coul_ext.o: $(ALL_H) lal_lj_coul.h lal_lj_coul_ext.cpp lal_base_charge.h
 	$(CUDR) -o $@ -c lal_lj_coul_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj_class2_long.ptx: lal_lj_class2_long.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_class2_long.cu
+$(OBJ_DIR)/lj_class2_long.cubin: lal_lj_class2_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_class2_long.cu
 
-$(OBJ_DIR)/lj_class2_long_ptx.h: $(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long.ptx
-	$(BSH) ./geryon/file_to_cstr.sh lj_class2_long $(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long_ptx.h
+$(OBJ_DIR)/lj_class2_long_cubin.h: $(OBJ_DIR)/lj_class2_long.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n lj_class2_long $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/lal_lj_class2_long.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long.cpp $(OBJ_DIR)/lj_class2_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
+$(OBJ_DIR)/lal_lj_class2_long.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long.cpp $(OBJ_DIR)/lj_class2_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
 	$(CUDR) -o $@ -c lal_lj_class2_long.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_lj_class2_long_ext.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long_ext.cpp lal_base_charge.h
 	$(CUDR) -o $@ -c lal_lj_class2_long_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/coul_long.ptx: lal_coul_long.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_coul_long.cu
+$(OBJ_DIR)/coul_long.cubin: lal_coul_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul_long.cu
 
-$(OBJ_DIR)/coul_long_ptx.h: $(OBJ_DIR)/coul_long.ptx $(OBJ_DIR)/coul_long.ptx
-	$(BSH) ./geryon/file_to_cstr.sh coul_long $(OBJ_DIR)/coul_long.ptx $(OBJ_DIR)/coul_long_ptx.h
+$(OBJ_DIR)/coul_long_cubin.h: $(OBJ_DIR)/coul_long.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n coul_long $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/lal_coul_long.o: $(ALL_H) lal_coul_long.h lal_coul_long.cpp $(OBJ_DIR)/coul_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
+$(OBJ_DIR)/lal_coul_long.o: $(ALL_H) lal_coul_long.h lal_coul_long.cpp $(OBJ_DIR)/coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
 	$(CUDR) -o $@ -c lal_coul_long.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_coul_long_ext.o: $(ALL_H) lal_coul_long.h lal_coul_long_ext.cpp lal_base_charge.h
 	$(CUDR) -o $@ -c lal_coul_long_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj_coul_long.ptx: lal_lj_coul_long.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_coul_long.cu
+$(OBJ_DIR)/lj_coul_long.cubin: lal_lj_coul_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul_long.cu
 
-$(OBJ_DIR)/lj_coul_long_ptx.h: $(OBJ_DIR)/lj_coul_long.ptx $(OBJ_DIR)/lj_coul_long.ptx
-	$(BSH) ./geryon/file_to_cstr.sh lj_coul_long $(OBJ_DIR)/lj_coul_long.ptx $(OBJ_DIR)/lj_coul_long_ptx.h
+$(OBJ_DIR)/lj_coul_long_cubin.h: $(OBJ_DIR)/lj_coul_long.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n lj_coul_long $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp $(OBJ_DIR)/lj_coul_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
+$(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp $(OBJ_DIR)/lj_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
 	$(CUDR) -o $@ -c lal_lj_coul_long.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_lj_coul_long_ext.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long_ext.cpp lal_base_charge.h
 	$(CUDR) -o $@ -c lal_lj_coul_long_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/morse.ptx: lal_morse.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_morse.cu
+$(OBJ_DIR)/lj_dsf.cubin: lal_lj_dsf.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_dsf.cu
 
-$(OBJ_DIR)/morse_ptx.h: $(OBJ_DIR)/morse.ptx $(OBJ_DIR)/morse.ptx
-	$(BSH) ./geryon/file_to_cstr.sh morse $(OBJ_DIR)/morse.ptx $(OBJ_DIR)/morse_ptx.h
+$(OBJ_DIR)/lj_dsf_cubin.h: $(OBJ_DIR)/lj_dsf.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n lj_dsf $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/lal_morse.o: $(ALL_H) lal_morse.h lal_morse.cpp $(OBJ_DIR)/morse_ptx.h $(OBJ_DIR)/lal_base_atomic.o
+$(OBJ_DIR)/lal_lj_dsf.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf.cpp $(OBJ_DIR)/lj_dsf_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_lj_dsf.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_lj_dsf_ext.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_lj_dsf_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/morse.cubin: lal_morse.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_morse.cu
+
+$(OBJ_DIR)/morse_cubin.h: $(OBJ_DIR)/morse.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n morse $< > $@.tmp && mv -f $@.tmp $@
+
+$(OBJ_DIR)/lal_morse.o: $(ALL_H) lal_morse.h lal_morse.cpp $(OBJ_DIR)/morse_cubin.h $(OBJ_DIR)/lal_base_atomic.o
 	$(CUDR) -o $@ -c lal_morse.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_morse_ext.o: $(ALL_H) lal_morse.h lal_morse_ext.cpp lal_base_atomic.h
 	$(CUDR) -o $@ -c lal_morse_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/charmm_long.ptx: lal_charmm_long.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_charmm_long.cu
+$(OBJ_DIR)/charmm_long.cubin: lal_charmm_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_charmm_long.cu
 
-$(OBJ_DIR)/charmm_long_ptx.h: $(OBJ_DIR)/charmm_long.ptx $(OBJ_DIR)/charmm_long.ptx
-	$(BSH) ./geryon/file_to_cstr.sh charmm_long $(OBJ_DIR)/charmm_long.ptx $(OBJ_DIR)/charmm_long_ptx.h
+$(OBJ_DIR)/charmm_long_cubin.h: $(OBJ_DIR)/charmm_long.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n charmm_long $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/lal_charmm_long.o: $(ALL_H) lal_charmm_long.h lal_charmm_long.cpp $(OBJ_DIR)/charmm_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
+$(OBJ_DIR)/lal_charmm_long.o: $(ALL_H) lal_charmm_long.h lal_charmm_long.cpp $(OBJ_DIR)/charmm_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
 	$(CUDR) -o $@ -c lal_charmm_long.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_charmm_long_ext.o: $(ALL_H) lal_charmm_long.h lal_charmm_long_ext.cpp lal_base_charge.h
 	$(CUDR) -o $@ -c lal_charmm_long_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj96.ptx: lal_lj96.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj96.cu
+$(OBJ_DIR)/lj96.cubin: lal_lj96.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj96.cu
 
-$(OBJ_DIR)/lj96_ptx.h: $(OBJ_DIR)/lj96.ptx $(OBJ_DIR)/lj96.ptx
-	$(BSH) ./geryon/file_to_cstr.sh lj96 $(OBJ_DIR)/lj96.ptx $(OBJ_DIR)/lj96_ptx.h
+$(OBJ_DIR)/lj96_cubin.h: $(OBJ_DIR)/lj96.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n lj96 $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/lal_lj96.o: $(ALL_H) lal_lj96.h lal_lj96.cpp $(OBJ_DIR)/lj96_ptx.h $(OBJ_DIR)/lal_base_atomic.o
+$(OBJ_DIR)/lal_lj96.o: $(ALL_H) lal_lj96.h lal_lj96.cpp $(OBJ_DIR)/lj96_cubin.h $(OBJ_DIR)/lal_base_atomic.o
 	$(CUDR) -o $@ -c lal_lj96.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_lj96_ext.o: $(ALL_H) lal_lj96.h lal_lj96_ext.cpp lal_base_atomic.h
 	$(CUDR) -o $@ -c lal_lj96_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj_expand.ptx: lal_lj_expand.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_expand.cu
+$(OBJ_DIR)/lj_expand.cubin: lal_lj_expand.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_expand.cu
 
-$(OBJ_DIR)/lj_expand_ptx.h: $(OBJ_DIR)/lj_expand.ptx $(OBJ_DIR)/lj_expand.ptx
-	$(BSH) ./geryon/file_to_cstr.sh lj_expand $(OBJ_DIR)/lj_expand.ptx $(OBJ_DIR)/lj_expand_ptx.h
+$(OBJ_DIR)/lj_expand_cubin.h: $(OBJ_DIR)/lj_expand.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n lj_expand $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/lal_lj_expand.o: $(ALL_H) lal_lj_expand.h lal_lj_expand.cpp $(OBJ_DIR)/lj_expand_ptx.h $(OBJ_DIR)/lal_base_atomic.o
+$(OBJ_DIR)/lal_lj_expand.o: $(ALL_H) lal_lj_expand.h lal_lj_expand.cpp $(OBJ_DIR)/lj_expand_cubin.h $(OBJ_DIR)/lal_base_atomic.o
 	$(CUDR) -o $@ -c lal_lj_expand.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_lj_expand_ext.o: $(ALL_H) lal_lj_expand.h lal_lj_expand_ext.cpp lal_base_atomic.h
 	$(CUDR) -o $@ -c lal_lj_expand_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cg_cmm.ptx: lal_cg_cmm.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_cg_cmm.cu
+$(OBJ_DIR)/cg_cmm.cubin: lal_cg_cmm.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_cg_cmm.cu
 
-$(OBJ_DIR)/cg_cmm_ptx.h: $(OBJ_DIR)/cg_cmm.ptx $(OBJ_DIR)/cg_cmm.ptx
-	$(BSH) ./geryon/file_to_cstr.sh cg_cmm $(OBJ_DIR)/cg_cmm.ptx $(OBJ_DIR)/cg_cmm_ptx.h
+$(OBJ_DIR)/cg_cmm_cubin.h: $(OBJ_DIR)/cg_cmm.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n cg_cmm $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/lal_cg_cmm.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm.cpp $(OBJ_DIR)/cg_cmm_ptx.h $(OBJ_DIR)/lal_base_atomic.o
+$(OBJ_DIR)/lal_cg_cmm.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm.cpp $(OBJ_DIR)/cg_cmm_cubin.h $(OBJ_DIR)/lal_base_atomic.o
 	$(CUDR) -o $@ -c lal_cg_cmm.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_cg_cmm_ext.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm_ext.cpp lal_base_atomic.h
 	$(CUDR) -o $@ -c lal_cg_cmm_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cg_cmm_long.ptx: lal_cg_cmm_long.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_cg_cmm_long.cu
+$(OBJ_DIR)/cg_cmm_long.cubin: lal_cg_cmm_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_cg_cmm_long.cu
 
-$(OBJ_DIR)/cg_cmm_long_ptx.h: $(OBJ_DIR)/cg_cmm_long.ptx $(OBJ_DIR)/cg_cmm_long.ptx
-	$(BSH) ./geryon/file_to_cstr.sh cg_cmm_long $(OBJ_DIR)/cg_cmm_long.ptx $(OBJ_DIR)/cg_cmm_long_ptx.h
+$(OBJ_DIR)/cg_cmm_long_cubin.h: $(OBJ_DIR)/cg_cmm_long.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n cg_cmm_long $< > $@.tmp && mv -f $@.tmp $@
 
-$(OBJ_DIR)/lal_cg_cmm_long.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long.cpp $(OBJ_DIR)/cg_cmm_long_ptx.h $(OBJ_DIR)/lal_base_atomic.o
+$(OBJ_DIR)/lal_cg_cmm_long.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long.cpp $(OBJ_DIR)/cg_cmm_long_cubin.h $(OBJ_DIR)/lal_base_charge.o  # charge base, matching lal_base_charge.h in the _ext rule
 	$(CUDR) -o $@ -c lal_cg_cmm_long.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_cg_cmm_long_ext.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long_ext.cpp lal_base_charge.h
 	$(CUDR) -o $@ -c lal_cg_cmm_long_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/eam.ptx: lal_eam.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_eam.cu
-  
-$(OBJ_DIR)/eam_ptx.h: $(OBJ_DIR)/eam.ptx $(OBJ_DIR)/eam.ptx
-	$(BSH) ./geryon/file_to_cstr.sh eam $(OBJ_DIR)/eam.ptx $(OBJ_DIR)/eam_ptx.h
-    
-$(OBJ_DIR)/lal_eam.o: $(ALL_H) lal_eam.h lal_eam.cpp $(OBJ_DIR)/eam_ptx.h $(OBJ_DIR)/lal_base_atomic.o
+$(OBJ_DIR)/eam.cubin: lal_eam.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_eam.cu
+
+$(OBJ_DIR)/eam_cubin.h: $(OBJ_DIR)/eam.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n eam $< > $@.tmp && mv -f $@.tmp $@
+
+$(OBJ_DIR)/lal_eam.o: $(ALL_H) lal_eam.h lal_eam.cpp $(OBJ_DIR)/eam_cubin.h $(OBJ_DIR)/lal_base_atomic.o
 	$(CUDR) -o $@ -c lal_eam.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_eam_ext.o: $(ALL_H) lal_eam.h lal_eam_ext.cpp lal_base_atomic.h
 	$(CUDR) -o $@ -c lal_eam_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/buck.ptx: lal_buck.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_buck.cu
-  
-$(OBJ_DIR)/buck_ptx.h: $(OBJ_DIR)/buck.ptx $(OBJ_DIR)/buck.ptx
-	$(BSH) ./geryon/file_to_cstr.sh buck $(OBJ_DIR)/buck.ptx $(OBJ_DIR)/buck_ptx.h
-    
-$(OBJ_DIR)/lal_buck.o: $(ALL_H) lal_buck.h lal_buck.cpp $(OBJ_DIR)/buck_ptx.h $(OBJ_DIR)/lal_base_atomic.o
+$(OBJ_DIR)/buck.cubin: lal_buck.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_buck.cu
+
+$(OBJ_DIR)/buck_cubin.h: $(OBJ_DIR)/buck.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n buck $< > $@.tmp && mv -f $@.tmp $@
+
+$(OBJ_DIR)/lal_buck.o: $(ALL_H) lal_buck.h lal_buck.cpp $(OBJ_DIR)/buck_cubin.h $(OBJ_DIR)/lal_base_atomic.o
 	$(CUDR) -o $@ -c lal_buck.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_buck_ext.o: $(ALL_H) lal_buck.h lal_buck_ext.cpp lal_base_atomic.h
 	$(CUDR) -o $@ -c lal_buck_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/buck_coul.ptx: lal_buck_coul.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_buck_coul.cu
-  
-$(OBJ_DIR)/buck_coul_ptx.h: $(OBJ_DIR)/buck_coul.ptx $(OBJ_DIR)/buck_coul.ptx
-	$(BSH) ./geryon/file_to_cstr.sh buck_coul $(OBJ_DIR)/buck_coul.ptx $(OBJ_DIR)/buck_coul_ptx.h
-    
-$(OBJ_DIR)/lal_buck_coul.o: $(ALL_H) lal_buck_coul.h lal_buck_coul.cpp $(OBJ_DIR)/buck_coul_ptx.h $(OBJ_DIR)/lal_base_charge.o
+$(OBJ_DIR)/buck_coul.cubin: lal_buck_coul.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_buck_coul.cu
+
+$(OBJ_DIR)/buck_coul_cubin.h: $(OBJ_DIR)/buck_coul.cubin  # staged via a temp file: a failed bin2c leaves no truncated header
+	$(BIN2C) -c -n buck_coul $< > $@.tmp && mv -f $@.tmp $@
+
+$(OBJ_DIR)/lal_buck_coul.o: $(ALL_H) lal_buck_coul.h lal_buck_coul.cpp $(OBJ_DIR)/buck_coul_cubin.h $(OBJ_DIR)/lal_base_charge.o
 	$(CUDR) -o $@ -c lal_buck_coul.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_buck_coul_ext.o: $(ALL_H) lal_buck_coul.h lal_buck_coul_ext.cpp lal_base_charge.h
 	$(CUDR) -o $@ -c lal_buck_coul_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/buck_coul_long.ptx: lal_buck_coul_long.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_buck_coul_long.cu
-  
-$(OBJ_DIR)/buck_coul_long_ptx.h: $(OBJ_DIR)/buck_coul_long.ptx $(OBJ_DIR)/buck_coul_long.ptx
-	$(BSH) ./geryon/file_to_cstr.sh buck_coul_long $(OBJ_DIR)/buck_coul_long.ptx $(OBJ_DIR)/buck_coul_long_ptx.h
-    
-$(OBJ_DIR)/lal_buck_coul_long.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long.cpp $(OBJ_DIR)/buck_coul_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
+$(OBJ_DIR)/buck_coul_long.cubin: lal_buck_coul_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ $<
+# Embed the cubin in a C header via bin2c; -n sets the generated symbol name.
+$(OBJ_DIR)/buck_coul_long_cubin.h: $(OBJ_DIR)/buck_coul_long.cubin
+	$(BIN2C) -c -n buck_coul_long $< > $@
+
+$(OBJ_DIR)/lal_buck_coul_long.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long.cpp $(OBJ_DIR)/buck_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
 	$(CUDR) -o $@ -c lal_buck_coul_long.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_buck_coul_long_ext.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long_ext.cpp lal_base_charge.h
 	$(CUDR) -o $@ -c lal_buck_coul_long_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/table.ptx: lal_table.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_table.cu
-  
-$(OBJ_DIR)/table_ptx.h: $(OBJ_DIR)/table.ptx $(OBJ_DIR)/table.ptx
-	$(BSH) ./geryon/file_to_cstr.sh table $(OBJ_DIR)/table.ptx $(OBJ_DIR)/table_ptx.h
-    
-$(OBJ_DIR)/lal_table.o: $(ALL_H) lal_table.h lal_table.cpp $(OBJ_DIR)/table_ptx.h $(OBJ_DIR)/lal_base_atomic.o
+$(OBJ_DIR)/table.cubin: lal_table.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ $<
+# Embed the cubin in a C header via bin2c; -n sets the generated symbol name.
+$(OBJ_DIR)/table_cubin.h: $(OBJ_DIR)/table.cubin
+	$(BIN2C) -c -n table $< > $@
+
+$(OBJ_DIR)/lal_table.o: $(ALL_H) lal_table.h lal_table.cpp $(OBJ_DIR)/table_cubin.h $(OBJ_DIR)/lal_base_atomic.o
 	$(CUDR) -o $@ -c lal_table.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_table_ext.o: $(ALL_H) lal_table.h lal_table_ext.cpp lal_base_atomic.h
 	$(CUDR) -o $@ -c lal_table_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/yukawa.ptx: lal_yukawa.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ lal_yukawa.cu
-  
-$(OBJ_DIR)/yukawa_ptx.h: $(OBJ_DIR)/yukawa.ptx $(OBJ_DIR)/yukawa.ptx
-	$(BSH) ./geryon/file_to_cstr.sh yukawa $(OBJ_DIR)/yukawa.ptx $(OBJ_DIR)/yukawa_ptx.h
-    
-$(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp $(OBJ_DIR)/yukawa_ptx.h $(OBJ_DIR)/lal_base_atomic.o
+$(OBJ_DIR)/yukawa.cubin: lal_yukawa.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ $<
+# Embed the cubin in a C header via bin2c; -n sets the generated symbol name.
+$(OBJ_DIR)/yukawa_cubin.h: $(OBJ_DIR)/yukawa.cubin
+	$(BIN2C) -c -n yukawa $< > $@
+
+$(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp $(OBJ_DIR)/yukawa_cubin.h $(OBJ_DIR)/lal_base_atomic.o
 	$(CUDR) -o $@ -c lal_yukawa.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lal_yukawa_ext.o: $(ALL_H) lal_yukawa.h lal_yukawa_ext.cpp lal_base_atomic.h
 	$(CUDR) -o $@ -c lal_yukawa_ext.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/born.cubin: lal_born.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ $<
+# Embed the cubin in a C header via bin2c; -n sets the generated symbol name.
+$(OBJ_DIR)/born_cubin.h: $(OBJ_DIR)/born.cubin
+	$(BIN2C) -c -n born $< > $@
+
+$(OBJ_DIR)/lal_born.o: $(ALL_H) lal_born.h lal_born.cpp $(OBJ_DIR)/born_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -o $@ -c lal_born.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_born_ext.o: $(ALL_H) lal_born.h lal_born_ext.cpp lal_base_atomic.h
+	$(CUDR) -o $@ -c lal_born_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/born_coul_wolf.cubin: lal_born_coul_wolf.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ $<
+# Embed the cubin in a C header via bin2c; -n sets the generated symbol name.
+$(OBJ_DIR)/born_coul_wolf_cubin.h: $(OBJ_DIR)/born_coul_wolf.cubin
+	$(BIN2C) -c -n born_coul_wolf $< > $@
+
+$(OBJ_DIR)/lal_born_coul_wolf.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf.cpp $(OBJ_DIR)/born_coul_wolf_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_born_coul_wolf.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_born_coul_wolf_ext.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_born_coul_wolf_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/born_coul_long.cubin: lal_born_coul_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ $<
+# Embed the cubin in a C header via bin2c; -n sets the generated symbol name.
+$(OBJ_DIR)/born_coul_long_cubin.h: $(OBJ_DIR)/born_coul_long.cubin
+	$(BIN2C) -c -n born_coul_long $< > $@
+
+$(OBJ_DIR)/lal_born_coul_long.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long.cpp $(OBJ_DIR)/born_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_born_coul_long.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_born_coul_long_ext.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_born_coul_long_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/dipole_lj.cubin: lal_dipole_lj.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ $<
+# Embed the cubin in a C header via bin2c; -n sets the generated symbol name.
+$(OBJ_DIR)/dipole_lj_cubin.h: $(OBJ_DIR)/dipole_lj.cubin
+	$(BIN2C) -c -n dipole_lj $< > $@
+
+$(OBJ_DIR)/lal_dipole_lj.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj.cpp $(OBJ_DIR)/dipole_lj_cubin.h $(OBJ_DIR)/lal_base_dipole.o
+	$(CUDR) -o $@ -c lal_dipole_lj.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_dipole_lj_ext.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj_ext.cpp lal_base_dipole.h
+	$(CUDR) -o $@ -c lal_dipole_lj_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/dipole_lj_sf.cubin: lal_dipole_lj_sf.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ $<
+# Embed the cubin in a C header via bin2c; -n sets the generated symbol name.
+$(OBJ_DIR)/dipole_lj_sf_cubin.h: $(OBJ_DIR)/dipole_lj_sf.cubin
+	$(BIN2C) -c -n dipole_lj_sf $< > $@
+
+$(OBJ_DIR)/lal_dipole_lj_sf.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf.cpp $(OBJ_DIR)/dipole_lj_sf_cubin.h $(OBJ_DIR)/lal_base_dipole.o
+	$(CUDR) -o $@ -c lal_dipole_lj_sf.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_dipole_lj_sf_ext.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf_ext.cpp lal_base_dipole.h
+	$(CUDR) -o $@ -c lal_dipole_lj_sf_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/colloid.cubin: lal_colloid.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ $<
+# Embed the cubin in a C header via bin2c; -n sets the generated symbol name.
+$(OBJ_DIR)/colloid_cubin.h: $(OBJ_DIR)/colloid.cubin
+	$(BIN2C) -c -n colloid $< > $@
+
+$(OBJ_DIR)/lal_colloid.o: $(ALL_H) lal_colloid.h lal_colloid.cpp $(OBJ_DIR)/colloid_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -o $@ -c lal_colloid.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_colloid_ext.o: $(ALL_H) lal_colloid.h lal_colloid_ext.cpp lal_base_atomic.h
+	$(CUDR) -o $@ -c lal_colloid_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/gauss.cubin: lal_gauss.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ $<
+# Embed the cubin in a C header via bin2c; -n sets the generated symbol name.
+$(OBJ_DIR)/gauss_cubin.h: $(OBJ_DIR)/gauss.cubin
+	$(BIN2C) -c -n gauss $< > $@
+
+$(OBJ_DIR)/lal_gauss.o: $(ALL_H) lal_gauss.h lal_gauss.cpp $(OBJ_DIR)/gauss_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -o $@ -c lal_gauss.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_gauss_ext.o: $(ALL_H) lal_gauss.h lal_gauss_ext.cpp lal_base_atomic.h
+	$(CUDR) -o $@ -c lal_gauss_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/yukawa_colloid.cubin: lal_yukawa_colloid.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ $<
+# Embed the cubin in a C header via bin2c; -n sets the generated symbol name.
+$(OBJ_DIR)/yukawa_colloid_cubin.h: $(OBJ_DIR)/yukawa_colloid.cubin
+	$(BIN2C) -c -n yukawa_colloid $< > $@
+
+$(OBJ_DIR)/lal_yukawa_colloid.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid.cpp $(OBJ_DIR)/yukawa_colloid_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -o $@ -c lal_yukawa_colloid.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_yukawa_colloid_ext.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid_ext.cpp lal_base_atomic.h
+	$(CUDR) -o $@ -c lal_yukawa_colloid_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_coul_debye.cubin: lal_lj_coul_debye.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ $<
+# Embed the cubin in a C header via bin2c; -n sets the generated symbol name.
+$(OBJ_DIR)/lj_coul_debye_cubin.h: $(OBJ_DIR)/lj_coul_debye.cubin
+	$(BIN2C) -c -n lj_coul_debye $< > $@
+
+$(OBJ_DIR)/lal_lj_coul_debye.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye.cpp $(OBJ_DIR)/lj_coul_debye_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_lj_coul_debye.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_lj_coul_debye_ext.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_lj_coul_debye_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/coul_dsf.cubin: lal_coul_dsf.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ $<
+# Embed the cubin in a C header via bin2c; -n sets the generated symbol name.
+$(OBJ_DIR)/coul_dsf_cubin.h: $(OBJ_DIR)/coul_dsf.cubin
+	$(BIN2C) -c -n coul_dsf $< > $@
+
+$(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/coul_dsf_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_coul_dsf.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR)
+
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda 
 
@@ -415,10 +575,10 @@ $(GPU_LIB): $(OBJS) $(CUDPP)
 	$(AR) -crusv $(GPU_LIB) $(OBJS) $(CUDPP)
 
 clean:
-	rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(PTXS) *.linkinfo
+	rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CBNS) *.linkinfo
 
 veryclean: clean
 	rm -rf *~ *.linkinfo
 
 cleanlib:
-	rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(PTXS) *.linkinfo
+	rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CBNS) *.linkinfo
diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile
index 8435cddb72..51bd78fbd7 100644
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@@ -17,6 +17,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
        $(OBJ_DIR)/lal_neighbor_shared.o $(OBJ_DIR)/lal_neighbor.o \
        $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
        $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
+       $(OBJ_DIR)/lal_base_dipole.o \
        $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
        $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
        $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@@ -25,6 +26,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
        $(OBJ_DIR)/lal_lj_expand.o $(OBJ_DIR)/lal_lj_expand_ext.o \
        $(OBJ_DIR)/lal_lj_coul.o $(OBJ_DIR)/lal_lj_coul_ext.o \
        $(OBJ_DIR)/lal_lj_coul_long.o $(OBJ_DIR)/lal_lj_coul_long_ext.o \
+       $(OBJ_DIR)/lal_lj_dsf.o $(OBJ_DIR)/lal_lj_dsf_ext.o \
        $(OBJ_DIR)/lal_lj_class2_long.o $(OBJ_DIR)/lal_lj_class2_long_ext.o \
        $(OBJ_DIR)/lal_coul_long.o $(OBJ_DIR)/lal_coul_long_ext.o \
        $(OBJ_DIR)/lal_morse.o $(OBJ_DIR)/lal_morse_ext.o \
@@ -36,20 +38,43 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
        $(OBJ_DIR)/lal_buck_coul.o $(OBJ_DIR)/lal_buck_coul_ext.o \
        $(OBJ_DIR)/lal_buck_coul_long.o $(OBJ_DIR)/lal_buck_coul_long_ext.o \
        $(OBJ_DIR)/lal_table.o $(OBJ_DIR)/lal_table_ext.o \
-       $(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o
+       $(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o \
+       $(OBJ_DIR)/lal_born.o $(OBJ_DIR)/lal_born_ext.o \
+       $(OBJ_DIR)/lal_born_coul_wolf.o $(OBJ_DIR)/lal_born_coul_wolf_ext.o \
+       $(OBJ_DIR)/lal_born_coul_long.o $(OBJ_DIR)/lal_born_coul_long_ext.o \
+       $(OBJ_DIR)/lal_dipole_lj.o $(OBJ_DIR)/lal_dipole_lj_ext.o \
+       $(OBJ_DIR)/lal_dipole_lj_sf.o $(OBJ_DIR)/lal_dipole_lj_sf_ext.o \
+       $(OBJ_DIR)/lal_colloid.o $(OBJ_DIR)/lal_colloid_ext.o \
+       $(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
+       $(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
+       $(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
+       $(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o
+
 KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
        $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
        $(OBJ_DIR)/ellipsoid_nbor_cl.h $(OBJ_DIR)/gayberne_cl.h \
        $(OBJ_DIR)/gayberne_lj_cl.h $(OBJ_DIR)/re_squared_cl.h \
        $(OBJ_DIR)/re_squared_lj_cl.h $(OBJ_DIR)/lj_cl.h $(OBJ_DIR)/lj96_cl.h \
        $(OBJ_DIR)/lj_expand_cl.h $(OBJ_DIR)/lj_coul_cl.h \
-       $(OBJ_DIR)/lj_coul_long_cl.h $(OBJ_DIR)/lj_class2_long_cl.h \
+       $(OBJ_DIR)/lj_coul_long_cl.h $(OBJ_DIR)/lj_dsf_cl.h \
+       $(OBJ_DIR)/lj_class2_long_cl.h \
        $(OBJ_DIR)/coul_long_cl.h $(OBJ_DIR)/morse_cl.h \
        $(OBJ_DIR)/charmm_long_cl.h $(OBJ_DIR)/cg_cmm_cl.h \
        $(OBJ_DIR)/cg_cmm_long_cl.h $(OBJ_DIR)/neighbor_gpu_cl.h \
        $(OBJ_DIR)/eam_cl.h $(OBJ_DIR)/buck_cl.h \
        $(OBJ_DIR)/buck_coul_cl.h $(OBJ_DIR)/buck_coul_long_cl.h \
-       $(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h
+       $(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h \
+       $(OBJ_DIR)/born_cl.h \
+       $(OBJ_DIR)/born_coul_wolf_cl.h \
+       $(OBJ_DIR)/born_coul_long_cl.h \
+       $(OBJ_DIR)/dipole_lj_cl.h \
+       $(OBJ_DIR)/dipole_lj_sf_cl.h \
+       $(OBJ_DIR)/colloid_cl.h \
+       $(OBJ_DIR)/gauss_cl.h \
+       $(OBJ_DIR)/yukawa_colloid_cl.h \
+       $(OBJ_DIR)/lj_coul_debye_cl.h \
+       $(OBJ_DIR)/coul_dsf_cl.h
+
 
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
 
@@ -91,6 +116,9 @@ $(OBJ_DIR)/lal_base_charge.o: $(OCL_H) lal_base_charge.h lal_base_charge.cpp
 $(OBJ_DIR)/lal_base_ellipsoid.o: $(OCL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_cl.h
 	$(OCL) -o $@ -c lal_base_ellipsoid.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/lal_base_dipole.o: $(OCL_H) lal_base_dipole.h lal_base_dipole.cpp
+	$(OCL) -o $@ -c lal_base_dipole.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h;
 
@@ -154,6 +182,15 @@ $(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp
 $(OBJ_DIR)/lal_lj_coul_long_ext.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long_ext.cpp lal_base_charge.h
 	$(OCL) -o $@ -c lal_lj_coul_long_ext.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/lj_dsf_cl.h: lal_lj_dsf.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh lj_dsf $(PRE1_H) lal_lj_dsf.cu $@;
+# lj_dsf_cl.h is generated into $(OBJ_DIR), hence -I$(OBJ_DIR) on the compile line.
+$(OBJ_DIR)/lal_lj_dsf.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf.cpp $(OBJ_DIR)/lj_dsf_cl.h $(OBJ_DIR)/lal_base_charge.o
+	$(OCL) -o $@ -c lal_lj_dsf.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_lj_dsf_ext.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf_ext.cpp lal_base_charge.h
+	$(OCL) -o $@ -c lal_lj_dsf_ext.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/lj_class2_long_cl.h: lal_lj_class2_long.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh lj_class2_long $(PRE1_H) lal_lj_class2_long.cu $(OBJ_DIR)/lj_class2_long_cl.h;
 
@@ -280,6 +317,96 @@ $(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp  $(OBJ_DIR)/yukawa
 $(OBJ_DIR)/lal_yukawa_ext.o: $(ALL_H) lal_yukawa.h lal_yukawa_ext.cpp lal_base_atomic.h
 	$(OCL) -o $@ -c lal_yukawa_ext.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/born_cl.h: lal_born.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh born $(PRE1_H) lal_born.cu $@;
+# born_cl.h is generated into $(OBJ_DIR), hence -I$(OBJ_DIR) on the compile line.
+$(OBJ_DIR)/lal_born.o: $(ALL_H) lal_born.h lal_born.cpp $(OBJ_DIR)/born_cl.h $(OBJ_DIR)/lal_base_atomic.o
+	$(OCL) -o $@ -c lal_born.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_born_ext.o: $(ALL_H) lal_born.h lal_born_ext.cpp lal_base_atomic.h
+	$(OCL) -o $@ -c lal_born_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/born_coul_wolf_cl.h: lal_born_coul_wolf.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh born_coul_wolf $(PRE1_H) lal_born_coul_wolf.cu $@;
+# born_coul_wolf_cl.h is generated into $(OBJ_DIR), hence -I$(OBJ_DIR) below.
+$(OBJ_DIR)/lal_born_coul_wolf.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf.cpp $(OBJ_DIR)/born_coul_wolf_cl.h $(OBJ_DIR)/lal_base_charge.o
+	$(OCL) -o $@ -c lal_born_coul_wolf.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_born_coul_wolf_ext.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf_ext.cpp lal_base_charge.h
+	$(OCL) -o $@ -c lal_born_coul_wolf_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/born_coul_long_cl.h: lal_born_coul_long.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh born_coul_long $(PRE1_H) lal_born_coul_long.cu $@;
+# born_coul_long_cl.h is generated into $(OBJ_DIR), hence -I$(OBJ_DIR) below.
+$(OBJ_DIR)/lal_born_coul_long.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long.cpp $(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/lal_base_charge.o
+	$(OCL) -o $@ -c lal_born_coul_long.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_born_coul_long_ext.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long_ext.cpp lal_base_charge.h
+	$(OCL) -o $@ -c lal_born_coul_long_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/dipole_lj_cl.h: lal_dipole_lj.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh dipole_lj $(PRE1_H) lal_dipole_lj.cu $@;
+# dipole_lj_cl.h is generated into $(OBJ_DIR), hence -I$(OBJ_DIR) below.
+$(OBJ_DIR)/lal_dipole_lj.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj.cpp $(OBJ_DIR)/dipole_lj_cl.h $(OBJ_DIR)/lal_base_dipole.o
+	$(OCL) -o $@ -c lal_dipole_lj.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_dipole_lj_ext.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj_ext.cpp lal_base_dipole.h
+	$(OCL) -o $@ -c lal_dipole_lj_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/dipole_lj_sf_cl.h: lal_dipole_lj_sf.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh dipole_lj_sf $(PRE1_H) lal_dipole_lj_sf.cu $@;
+# dipole_lj_sf_cl.h is generated into $(OBJ_DIR), hence -I$(OBJ_DIR) below.
+$(OBJ_DIR)/lal_dipole_lj_sf.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf.cpp $(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/lal_base_dipole.o
+	$(OCL) -o $@ -c lal_dipole_lj_sf.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_dipole_lj_sf_ext.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf_ext.cpp lal_base_dipole.h
+	$(OCL) -o $@ -c lal_dipole_lj_sf_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/colloid_cl.h: lal_colloid.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh colloid $(PRE1_H) lal_colloid.cu $@;
+# colloid_cl.h is generated into $(OBJ_DIR), hence -I$(OBJ_DIR) below.
+$(OBJ_DIR)/lal_colloid.o: $(ALL_H) lal_colloid.h lal_colloid.cpp $(OBJ_DIR)/colloid_cl.h $(OBJ_DIR)/lal_base_atomic.o
+	$(OCL) -o $@ -c lal_colloid.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_colloid_ext.o: $(ALL_H) lal_colloid.h lal_colloid_ext.cpp lal_base_atomic.h
+	$(OCL) -o $@ -c lal_colloid_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/gauss_cl.h: lal_gauss.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh gauss $(PRE1_H) lal_gauss.cu $@;
+# gauss_cl.h is generated into $(OBJ_DIR), hence -I$(OBJ_DIR) below.
+$(OBJ_DIR)/lal_gauss.o: $(ALL_H) lal_gauss.h lal_gauss.cpp $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/lal_base_atomic.o
+	$(OCL) -o $@ -c lal_gauss.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_gauss_ext.o: $(ALL_H) lal_gauss.h lal_gauss_ext.cpp lal_base_atomic.h
+	$(OCL) -o $@ -c lal_gauss_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/yukawa_colloid_cl.h: lal_yukawa_colloid.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh yukawa_colloid $(PRE1_H) lal_yukawa_colloid.cu $@;
+# yukawa_colloid_cl.h is generated into $(OBJ_DIR), hence -I$(OBJ_DIR) below.
+$(OBJ_DIR)/lal_yukawa_colloid.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid.cpp $(OBJ_DIR)/yukawa_colloid_cl.h $(OBJ_DIR)/lal_base_atomic.o
+	$(OCL) -o $@ -c lal_yukawa_colloid.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_yukawa_colloid_ext.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid_ext.cpp lal_base_atomic.h
+	$(OCL) -o $@ -c lal_yukawa_colloid_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_coul_debye_cl.h: lal_lj_coul_debye.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh lj_coul_debye $(PRE1_H) lal_lj_coul_debye.cu $@;
+# lj_coul_debye_cl.h is generated into $(OBJ_DIR), hence -I$(OBJ_DIR) below.
+$(OBJ_DIR)/lal_lj_coul_debye.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye.cpp $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/lal_base_charge.o
+	$(OCL) -o $@ -c lal_lj_coul_debye.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_lj_coul_debye_ext.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye_ext.cpp lal_base_charge.h
+	$(OCL) -o $@ -c lal_lj_coul_debye_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/coul_dsf_cl.h: lal_coul_dsf.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh coul_dsf $(PRE1_H) lal_coul_dsf.cu $@;
+# coul_dsf_cl.h is generated into $(OBJ_DIR), hence -I$(OBJ_DIR) below.
+$(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/coul_dsf_cl.h $(OBJ_DIR)/lal_base_charge.o
+	$(OCL) -o $@ -c lal_coul_dsf.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
+	$(OCL) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR)
+
 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
 	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) 
 
diff --git a/lib/gpu/geryon/README b/lib/gpu/geryon/README
index 601c19dc3c..018e9cff7f 100644
--- a/lib/gpu/geryon/README
+++ b/lib/gpu/geryon/README
@@ -1,3 +1,7 @@
+NOTE: This Geryon distribution has been modified to remove files not
+      necessary for the LAMMPS implementation. The full distribution
+      is available at http://users.nccs.gov/~wb8/geryon/index.htm
+
 Geryon
 
    Copyright (2010) Sandia Corporation.  Under the terms of Contract
diff --git a/lib/gpu/geryon/VERSION.txt b/lib/gpu/geryon/VERSION.txt
index 313907d611..47cefed44d 100644
--- a/lib/gpu/geryon/VERSION.txt
+++ b/lib/gpu/geryon/VERSION.txt
@@ -1 +1 @@
-Geryon Version 12.034
+Geryon Version 12.033
diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h
index c17c1943c3..938e1d3bd6 100644
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@@ -141,6 +141,11 @@ class UCL_Device {
   /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
   inline int device_type(const int i) { return UCL_GPU; }
   
+  /// Returns true if host memory is efficiently addressable from device
+  inline bool shared_memory() { return shared_memory(_device); }
+  /// Returns true if host memory is efficiently addressable from device
+  inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } // device_type() above always returns UCL_GPU
+  
   /// Returns true if double precision is support for the current device
   bool double_precision() { return double_precision(_device); }
   /// Returns true if double precision is support for the device
diff --git a/lib/gpu/geryon/nvd_kernel.h b/lib/gpu/geryon/nvd_kernel.h
index b72c89c51f..fecd85eeb8 100644
--- a/lib/gpu/geryon/nvd_kernel.h
+++ b/lib/gpu/geryon/nvd_kernel.h
@@ -30,11 +30,23 @@
 namespace ucl_cudadr {
 
 class UCL_Texture;
+template <class numtyp> class UCL_D_Vec;
+template <class numtyp> class UCL_D_Mat;
+template <class hosttype, class devtype> class UCL_Vector;
+template <class hosttype, class devtype> class UCL_Matrix;
+#define UCL_MAX_KERNEL_ARGS 256
     
 /// Class storing 1 or more kernel functions from a single string or file
 class UCL_Program {
  public:
   inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }
+  inline UCL_Program(UCL_Device &device, const void *program, 
+                     const char *flags="", std::string *log=NULL) { 
+    _cq=device.cq();
+    init(device); 
+    load_string(program,flags,log);
+  }
+
   inline ~UCL_Program() {}
 
   /// Initialize the program with a device
@@ -64,10 +76,10 @@ class UCL_Program {
   }
   
   /// Load a program from a string and compile with flags
-  inline int load_string(const char *program, const char *flags="",
+  inline int load_string(const void *program, const char *flags="",
                          std::string *log=NULL) {
     if (std::string(flags)=="BINARY")
-      return load_binary(program);
+      return load_binary((const char *)program);
     const unsigned int num_opts=2;
     CUjit_option options[num_opts];
     void *values[num_opts];
@@ -134,15 +146,25 @@ class UCL_Program {
   friend class UCL_Texture;
 };
 
-/// Class for dealing with OpenCL kernels
+/// Class for dealing with CUDA Driver kernels
 class UCL_Kernel {
  public:
-  UCL_Kernel() : _dimensions(1), _num_args(0), _param_size(0) 
-    { _num_blocks[0]=0; }
+  UCL_Kernel() : _dimensions(1), _num_args(0) { 
+    #if CUDA_VERSION < 4000
+    _param_size=0;
+    #endif
+    _num_blocks[0]=0; 
+  }
   
   UCL_Kernel(UCL_Program &program, const char *function) : 
-    _dimensions(1), _num_args(0), _param_size(0) 
-    { _num_blocks[0]=0; set_function(program,function); _cq=program._cq; }
+    _dimensions(1), _num_args(0) {
+    #if CUDA_VERSION < 4000
+    _param_size=0;
+    #endif
+    _num_blocks[0]=0; 
+    set_function(program,function); 
+    _cq=program._cq; 
+  }
   
   ~UCL_Kernel() {}
 
@@ -170,78 +192,190 @@ class UCL_Kernel {
     * changes 
     * \note To set kernel parameter i (i>0), parameter i-1 must be set **/
   template <class dtype>
-  inline void set_arg(const unsigned index, dtype *arg) {
+  inline void set_arg(const unsigned index, const dtype * const arg) {
     if (index==_num_args)
       add_arg(arg);
     else if (index<_num_args)
+      #if CUDA_VERSION >= 4000 // slot is void*: drop const explicitly, as add_arg does
+      _kernel_args[index]=const_cast<dtype *>(arg);
+      #else
       CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
+      #endif
     else
       assert(0==1); // Must add kernel parameters in sequential order 
   }
  
+  /// Set a geryon container as a kernel argument.
+  template <class numtyp>
+  inline void set_arg(const UCL_D_Vec<numtyp> * const arg) 
+    { set_arg(&arg->begin()); } // NOTE(review): no index is forwarded; no one-pointer set_arg overload is visible here - confirm
+
+  /// Set a geryon container as a kernel argument.
+  template <class numtyp>
+  inline void set_arg(const UCL_D_Mat<numtyp> * const arg) 
+    { set_arg(&arg->begin()); } // NOTE(review): no index is forwarded; no one-pointer set_arg overload is visible here - confirm
+
+  /// Set a geryon container as a kernel argument.
+  template <class hosttype, class devtype>
+  inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg) 
+    { set_arg(&arg->device.begin()); } // NOTE(review): no index is forwarded; no one-pointer set_arg overload is visible here - confirm
+
+  /// Set a geryon container as a kernel argument.
+  template <class hosttype, class devtype>
+  inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg) 
+    { set_arg(&arg->device.begin()); } // NOTE(review): no index is forwarded; no one-pointer set_arg overload is visible here - confirm
+
   /// Add a kernel argument.
   inline void add_arg(const CUdeviceptr* const arg) {
+    assert(_num_args<UCL_MAX_KERNEL_ARGS); // bounds-check before the slot is written
+    #if CUDA_VERSION >= 4000
+    _kernel_args[_num_args]=(void *)arg;
+    #else
     void* ptr = (void*)(size_t)(*arg);
     _param_size = (_param_size + __alignof(ptr) - 1) & ~(__alignof(ptr) - 1);
     CU_SAFE_CALL(cuParamSetv(_kernel, _param_size, &ptr, sizeof(ptr)));
     _offsets.push_back(_param_size);
     _param_size+=sizeof(ptr);
+    #endif
     _num_args++;
   }
 
   /// Add a kernel argument.
   template <class dtype>
   inline void add_arg(const dtype* const arg) {
+    assert(_num_args<UCL_MAX_KERNEL_ARGS); // bounds-check before the slot is written
+    #if CUDA_VERSION >= 4000
+    _kernel_args[_num_args]=const_cast<dtype * const>(arg);
+    #else
     _param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
     CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));
     _offsets.push_back(_param_size);
     _param_size+=sizeof(dtype);
+    #endif
     _num_args++;
   }
 
+  /// Add a geryon container as a kernel argument.
+  template <class numtyp>
+  inline void add_arg(const UCL_D_Vec<numtyp> * const arg) 
+    { add_arg(&arg->begin()); }
+
+  /// Add a geryon container as a kernel argument.
+  template <class numtyp>
+  inline void add_arg(const UCL_D_Mat<numtyp> * const arg) 
+    { add_arg(&arg->begin()); }
+
+  /// Add a geryon container as a kernel argument.
+  template <class hosttype, class devtype>
+  inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg) 
+    { add_arg(&arg->device.begin()); }
+
+  /// Add a geryon container as a kernel argument.
+  template <class hosttype, class devtype>
+  inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg) 
+    { add_arg(&arg->device.begin()); }
+
   /// Set the number of thread blocks and the number of threads in each block
-  /** \note This should be called after all arguments have been added **/
+  /** \note This should be called before any arguments have been added
+      \note The default command queue is used for the kernel execution **/
   inline void set_size(const size_t num_blocks, const size_t block_size) { 
     _dimensions=1; 
     _num_blocks[0]=num_blocks; 
-    _num_blocks[1]=1; 
+    _num_blocks[1]=1;
+    _num_blocks[2]=1;
+    #if CUDA_VERSION >= 4000
+    _block_size[0]=block_size;
+    _block_size[1]=1;
+    _block_size[2]=1;
+    #else    
     CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
+    #endif
   }
 
   /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue for the kernel is changed to cq **/
+  inline void set_size(const size_t num_blocks, const size_t block_size,
+                       command_queue &cq)
+    { _cq=cq; set_size(num_blocks,block_size); }
+
+  /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue is used for the kernel execution **/
   inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                        const size_t block_size_x, const size_t block_size_y) { 
     _dimensions=2; 
     _num_blocks[0]=num_blocks_x; 
     _num_blocks[1]=num_blocks_y; 
+    _num_blocks[2]=1;
+    #if CUDA_VERSION >= 4000
+    _block_size[0]=block_size_x;
+    _block_size[1]=block_size_y;
+    _block_size[2]=1;
+    #else    
     CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
+    #endif
   }
   
   /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue for the kernel is changed to cq **/
+  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
+                       const size_t block_size_x, const size_t block_size_y,
+                       command_queue &cq) 
+    {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
+
+  /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue is used for the kernel execution **/
   inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                        const size_t block_size_x, 
                        const size_t block_size_y, const size_t block_size_z) {
     _dimensions=2; 
     _num_blocks[0]=num_blocks_x; 
     _num_blocks[1]=num_blocks_y; 
+    _num_blocks[2]=1; 
+    #if CUDA_VERSION >= 4000
+    _block_size[0]=block_size_x;
+    _block_size[1]=block_size_y;
+    _block_size[2]=block_size_z;
+    #else    
     CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
-                                       block_size_z));
+                                     block_size_z));
+    #endif
   }
 
-  /// Run the kernel in the default command queue
-  inline void run() {
-    CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
-    CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
+  /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue is used for the kernel execution **/
+  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
+                       const size_t block_size_x, const size_t block_size_y,
+                       const size_t block_size_z, command_queue &cq) {
+    _cq=cq;
+    set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y, 
+             block_size_z);
   }
   
-  /// Run the kernel in the specified command queue
-  inline void run(command_queue &cq) {
+  /// Run the kernel in the default command queue
+  inline void run() {
+    #if CUDA_VERSION >= 4000 // launch with cached grid/block sizes; 0 = no dynamic shared memory, NULL = no extra options
+    CU_SAFE_CALL(cuLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
+                                _num_blocks[2],_block_size[0],_block_size[1],
+                                _block_size[2],0,_cq,_kernel_args,NULL));
+    #else // older path: total parameter size is set explicitly before the launch
     CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
-    CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],cq));
+    CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
+    #endif
   }
   
   /// Clear any arguments associated with the kernel
-  inline void clear_args() { _num_args=0; _offsets.clear(); _param_size=0; }
+  inline void clear_args() { 
+    _num_args=0; 
+    #if CUDA_VERSION < 4000
+    _offsets.clear(); 
+    _param_size=0;
+    #endif
+  }
 
   #include "ucl_arg_kludge.h"
 
@@ -249,11 +383,17 @@ class UCL_Kernel {
   CUfunction _kernel;
   CUstream _cq;
   unsigned _dimensions;
-  unsigned _num_blocks[2];
+  unsigned _num_blocks[3];
   unsigned _num_args;
+  friend class UCL_Texture;
+  
+  #if CUDA_VERSION >= 4000
+  unsigned _block_size[3];
+  void * _kernel_args[UCL_MAX_KERNEL_ARGS];
+  #else
   std::vector<unsigned> _offsets;
   unsigned _param_size;
-  friend class UCL_Texture;
+  #endif
 };
 
 } // namespace
diff --git a/lib/gpu/geryon/nvd_mat.h b/lib/gpu/geryon/nvd_mat.h
index ed42305a70..51cfe1d56f 100644
--- a/lib/gpu/geryon/nvd_mat.h
+++ b/lib/gpu/geryon/nvd_mat.h
@@ -38,6 +38,9 @@ namespace ucl_cudadr {
 #include "ucl_h_mat.h"
 #include "ucl_d_vec.h"
 #include "ucl_d_mat.h"
+#include "ucl_s_obj_help.h"
+#include "ucl_vector.h"
+#include "ucl_matrix.h"
 #undef _UCL_DEVICE_PTR_MAT
 #undef _UCL_MAT_ALLOW
 
diff --git a/lib/gpu/geryon/nvd_memory.h b/lib/gpu/geryon/nvd_memory.h
index dc70aa1b3c..335418fe5f 100644
--- a/lib/gpu/geryon/nvd_memory.h
+++ b/lib/gpu/geryon/nvd_memory.h
@@ -85,6 +85,21 @@ inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
     free(mat.begin());
 }
 
+/// Free mat's host buffer and reallocate n bytes (UCL_MEMORY_ERROR on failure)
+template <class mat_type>
+inline int _host_resize(mat_type &mat, const size_t n) {
+  _host_free(mat,mat.kind());
+  CUresult rc=CUDA_SUCCESS;
+  if (mat.kind()==UCL_RW_OPTIMIZED)
+    rc=cuMemAllocHost((void **)mat.host_ptr(),n);
+  else if (mat.kind()==UCL_WRITE_OPTIMIZED)
+    rc=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
+  else
+    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
+  const bool ok=(rc==CUDA_SUCCESS && *(mat.host_ptr())!=NULL);
+  return ok ? UCL_SUCCESS : UCL_MEMORY_ERROR;
+}
+
 // --------------------------------------------------------------------------
 // - DEVICE MEMORY ALLOCATION ROUTINES
 // --------------------------------------------------------------------------
@@ -143,6 +158,29 @@ inline void _device_free(mat_type &mat) {
   CU_DESTRUCT_CALL(cuMemFree(mat.cbegin()));
 }
 
+/// Free the device allocation held by mat and allocate n bytes in its place
+template <class mat_type>
+inline int _device_resize(mat_type &mat, const size_t n) {
+  _device_free(mat);
+  if (cuMemAlloc(&mat.cbegin(),n)!=CUDA_SUCCESS)
+    return UCL_MEMORY_ERROR;
+  return UCL_SUCCESS;
+}
+
+/// Free mat's device buffer and allocate a pitched rows x cols replacement
+/** \return UCL_MEMORY_ERROR (pitch untouched) on failure, else UCL_SUCCESS **/
+template <class mat_type>
+inline int _device_resize(mat_type &mat, const size_t rows,
+                          const size_t cols, size_t &pitch) {
+  _device_free(mat);
+  CUDA_INT_TYPE upitch;  // no initializer: read it only after success
+  CUresult err=cuMemAllocPitch(&mat.cbegin(),&upitch,
+                      cols*sizeof(typename mat_type::data_type),rows,16);
+  if (err!=CUDA_SUCCESS) return UCL_MEMORY_ERROR;
+  pitch=static_cast<size_t>(upitch);
+  return UCL_SUCCESS;
+}
+
 inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) { 
   *ptr=in;
 }
diff --git a/lib/gpu/geryon/nvd_texture.h b/lib/gpu/geryon/nvd_texture.h
index 3fbf80180b..07650263a5 100644
--- a/lib/gpu/geryon/nvd_texture.h
+++ b/lib/gpu/geryon/nvd_texture.h
@@ -42,27 +42,56 @@ class UCL_Texture {
     { CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); }
 
   /// Bind a float array where each fetch grabs a vector of length numel
-  template<class mat_typ>
-  inline void bind_float(mat_typ &vec, const unsigned numel) {
-    #ifdef UCL_DEBUG
-    assert(numel!=0 && numel<5);
-    #endif
-    CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(), 
-                 vec.numel()*vec.element_size()));
-    CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel));
-  }
+  template<class numtyp>
+  inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel) 
+    { _bind_float(vec,numel); }
+
+  /// Bind a UCL_D_Mat to the texture; each fetch grabs a vector of length numel
+  template<class numtyp>
+  inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel) 
+    { _bind_float(vec,numel); }
+
+  /// Bind vec.device of a UCL_Vector; each fetch grabs a vector of length numel
+  template<class numtyp, class devtyp>
+  inline void bind_float(UCL_Vector<numtyp, devtyp> &vec, const unsigned numel) 
+    { _bind_float(vec.device,numel); }
+
+  /// Bind vec.device of a UCL_Matrix; each fetch grabs a vector of length numel
+  template<class numtyp, class devtyp>
+  inline void bind_float(UCL_Matrix<numtyp, devtyp> &vec, const unsigned numel) 
+    { _bind_float(vec.device,numel); }
 
   /// Unbind the texture reference from the memory allocation
   inline void unbind() { }
 
   /// Make a texture reference available to kernel  
   inline void allow(UCL_Kernel &kernel) { 
+    #if CUDA_VERSION < 4000
     CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex)); 
+    #endif
   }
   
  private:
   CUtexref _tex;
   friend class UCL_Kernel;
+
+  /// Bind vec's memory to the texture and select the fetch format
+  /** Float-sized elements use numel float channels; any other element size
+    * uses 32-bit int channels: numel*2 when numel<=2, otherwise numel. **/
+  template<class mat_typ>
+  inline void _bind_float(mat_typ &vec, const unsigned numel) {
+    #ifdef UCL_DEBUG
+    assert(numel!=0 && numel<5);
+    #endif
+    CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
+                 vec.numel()*vec.element_size()));
+    // One format call covers all cases; only format and channel count vary
+    const bool fp32=(vec.element_size()==sizeof(float));
+    CUarray_format fmt=fp32 ? CU_AD_FORMAT_FLOAT : CU_AD_FORMAT_SIGNED_INT32;
+    unsigned channels=(fp32 || numel>2) ? numel : numel*2;
+    CU_SAFE_CALL(cuTexRefSetFormat(_tex,fmt,channels));
+  }
+
 };
 
 } // namespace
diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h
index 0fde8c2acf..391eeb9d95 100644
--- a/lib/gpu/geryon/ocl_device.h
+++ b/lib/gpu/geryon/ocl_device.h
@@ -158,6 +158,11 @@ class UCL_Device {
   /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
   inline int device_type(const int i);
   
+  /// True if host memory is efficiently addressable from the current device
+  inline bool shared_memory() { return shared_memory(_device); }
+  /// True if device i has type UCL_CPU (treated as sharing memory with host)
+  inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
+  
   /// Returns true if double precision is support for the current device
   bool double_precision() { return double_precision(_device); }
   /// Returns true if double precision is support for the device
diff --git a/lib/gpu/geryon/ocl_kernel.h b/lib/gpu/geryon/ocl_kernel.h
index 4d77c85021..74bcea8f5e 100644
--- a/lib/gpu/geryon/ocl_kernel.h
+++ b/lib/gpu/geryon/ocl_kernel.h
@@ -29,11 +29,25 @@
 
 namespace ucl_opencl {
     
+class UCL_Texture;
+template <class numtyp> class UCL_D_Vec;
+template <class numtyp> class UCL_D_Mat;
+template <class hosttype, class devtype> class UCL_Vector;
+template <class hosttype, class devtype> class UCL_Matrix;
+#define UCL_MAX_KERNEL_ARGS 256
+
 /// Class storing 1 or more kernel functions from a single string or file
 class UCL_Program {
  public:
   inline UCL_Program() : _init_done(false) {}
   inline UCL_Program(UCL_Device &device) : _init_done(false) { init(device); }
+  /// Initialize with a device, then compile program with flags
+  /** \note The status returned by load_string() is not checked **/
+  inline UCL_Program(UCL_Device &device, const void *program,
+                     const char *flags="", std::string *log=NULL) :
+      _init_done(false)
+    { init(device); load_string(program,flags,log); }
+
   inline ~UCL_Program() { clear(); }
 
   /// Initialize the program with a device
@@ -78,10 +92,10 @@ class UCL_Program {
   }
   
   /// Load a program from a string and compile with flags
-  inline int load_string(const char *program, const char *flags="",
+  inline int load_string(const void *program, const char *flags="",
                          std::string *log=NULL) {
     cl_int error_flag;
-    const char *prog=program;
+    const char *prog=(const char *)program;
     _program=clCreateProgramWithSource(_context,1,&prog,NULL,&error_flag);
     CL_CHECK_ERR(error_flag);
     error_flag = clBuildProgram(_program,1,&_device,flags,NULL,NULL);
@@ -159,19 +173,61 @@ class UCL_Kernel {
   /** If not a device pointer, this must be repeated each time the argument
     * changes **/
   template <class dtype>
-  inline void set_arg(const cl_uint index, dtype *arg) { 
+  inline void set_arg(const cl_uint index, const dtype * const arg) { 
     CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg)); 
     if (index>_num_args) _num_args=index;
   }
  
+  /// Set a UCL_D_Vec kernel argument. NOTE(review): forwards no index - confirm
+  template <class numtyp>
+  inline void set_arg(const UCL_D_Vec<numtyp> * const arg) 
+    { set_arg(&arg->begin()); }
+
+  /// Set a UCL_D_Mat kernel argument. NOTE(review): forwards no index - confirm
+  template <class numtyp>
+  inline void set_arg(const UCL_D_Mat<numtyp> * const arg) 
+    { set_arg(&arg->begin()); }
+
+  /// Set arg->device of a UCL_Vector. NOTE(review): forwards no index - confirm
+  template <class hosttype, class devtype>
+  inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg) 
+    { set_arg(&arg->device.begin()); }
+
+  /// Set arg->device of a UCL_Matrix. NOTE(review): forwards no index - confirm
+  template <class hosttype, class devtype>
+  inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg) 
+    { set_arg(&arg->device.begin()); }
+
   /// Add a kernel argument.
   template <class dtype>
-  inline void add_arg(dtype *arg) {
+  inline void add_arg(const dtype * const arg) {
     CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg)); 
     _num_args++; 
   }
 
+  /// Add a UCL_D_Vec as the next kernel argument (forwards &arg->begin())
+  template <class numtyp>
+  inline void add_arg(const UCL_D_Vec<numtyp> * const arg) 
+    { add_arg(&arg->begin()); }
+
+  /// Add a UCL_D_Mat as the next kernel argument (forwards &arg->begin())
+  template <class numtyp>
+  inline void add_arg(const UCL_D_Mat<numtyp> * const arg) 
+    { add_arg(&arg->begin()); }
+
+  /// Add arg->device of a UCL_Vector as the next kernel argument
+  template <class hosttype, class devtype>
+  inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg) 
+    { add_arg(&arg->device.begin()); }
+
+  /// Add arg->device of a UCL_Matrix as the next kernel argument
+  template <class hosttype, class devtype>
+  inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg) 
+    { add_arg(&arg->device.begin()); }
+
   /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue is used for the kernel execution **/
   inline void set_size(const size_t num_blocks, const size_t block_size) { 
     _dimensions=1; 
     _num_blocks[0]=num_blocks*block_size; 
@@ -179,6 +235,15 @@ class UCL_Kernel {
   }
 
   /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue for the kernel is changed to cq **/
+  inline void set_size(const size_t num_blocks, const size_t block_size,
+                       command_queue &cq)
+    { _cq=cq; set_size(num_blocks,block_size); }
+
+  /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue is used for the kernel execution **/
   inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                        const size_t block_size_x, const size_t block_size_y) { 
     _dimensions=2; 
@@ -189,6 +254,16 @@ class UCL_Kernel {
   }
   
   /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue for the kernel is changed to cq **/
+  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
+                       const size_t block_size_x, const size_t block_size_y,
+                       command_queue &cq) 
+    {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
+
+  /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue is used for the kernel execution **/
   inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                        const size_t block_size_x, 
                        const size_t block_size_y, const size_t block_size_z) {
@@ -202,14 +277,20 @@ class UCL_Kernel {
     _block_size[2]=block_size_z; 
   }
 
-  /// Run the kernel in the default command queue
-  inline void run() {
-    run(_cq);
+  /// Set the number of thread blocks and the number of threads in each block
+  /** \note This should be called before any arguments have been added
+      \note The default command queue for the kernel is changed to cq **/
+  inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
+                       const size_t block_size_x, const size_t block_size_y,
+                       const size_t block_size_z, command_queue &cq) {
+    _cq=cq;
+    set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y, 
+             block_size_z);
   }
   
-  /// Run the kernel in the specified command queue
-  inline void run(command_queue &cq) {
-    CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,_kernel,_dimensions,NULL,
+  /// Run the kernel in the default command queue
+  inline void run() {
+    CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL,
                                         _num_blocks,_block_size,0,NULL,NULL));
   }
   
diff --git a/lib/gpu/geryon/ocl_mat.h b/lib/gpu/geryon/ocl_mat.h
index 180b292d3b..2909d72a72 100644
--- a/lib/gpu/geryon/ocl_mat.h
+++ b/lib/gpu/geryon/ocl_mat.h
@@ -39,6 +39,9 @@ namespace ucl_opencl {
 #include "ucl_h_mat.h"
 #include "ucl_d_vec.h"
 #include "ucl_d_mat.h"
+#include "ucl_s_obj_help.h"
+#include "ucl_vector.h"
+#include "ucl_matrix.h"
 #undef _UCL_DEVICE_PTR_MAT
 #undef _OCL_MAT
 #undef _UCL_MAT_ALLOW
diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h
index a049f1ea14..6051ee7b3e 100644
--- a/lib/gpu/geryon/ocl_memory.h
+++ b/lib/gpu/geryon/ocl_memory.h
@@ -132,6 +132,37 @@ inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
   CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
 }
 
+/// Release a host buffer and allocate/map a replacement holding n bytes
+/** The new buffer is created in the context of the old one and mapped
+  * (blocking) so that *mat.host_ptr() addresses it.
+  * \return UCL_MEMORY_ERROR if buffer creation or mapping fails,
+  *         UCL_SUCCESS otherwise **/
+template <class mat_type>
+inline int _host_resize(mat_type &mat, const size_t n) {
+  cl_int error_flag;
+  cl_context context;
+  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
+                                  &context,NULL));
+  CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
+  // Only the access flags differ between the two supported kinds
+  cl_mem_flags buffer_flags=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
+  cl_map_flags map_flags=CL_MAP_READ | CL_MAP_WRITE;
+  if (mat.kind()==UCL_WRITE_OPTIMIZED) {
+    buffer_flags=CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
+    map_flags=CL_MAP_WRITE;
+  }
+  mat.cbegin()=clCreateBuffer(context,buffer_flags,n,NULL,&error_flag);
+  if (error_flag != CL_SUCCESS)
+    return UCL_MEMORY_ERROR;
+  // Request the map status as well; a failed map must not report success
+  *mat.host_ptr() = (typename mat_type::data_type*)
+                    clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,map_flags,
+                                       0,n,0,NULL,NULL,&error_flag);
+  if (error_flag != CL_SUCCESS)
+    return UCL_MEMORY_ERROR;
+  return UCL_SUCCESS;
+}
+
 // --------------------------------------------------------------------------
 // - DEVICE MEMORY ALLOCATION ROUTINES
 // --------------------------------------------------------------------------
@@ -211,6 +242,61 @@ inline void _device_free(mat_type &mat) {
   CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
 }
 
+/// Replace mat's device buffer with a new n-byte buffer of the same kind
+/** \return UCL_MEMORY_ERROR for an unsupported kind or failed allocation **/
+template <class mat_type>
+inline int _device_resize(mat_type &mat, const size_t n) {
+  // Choose the flag before releasing anything: with assert() compiled out an
+  // unknown kind would otherwise reach clCreateBuffer with flag uninitialized
+  cl_mem_flags flag;
+  if (mat.kind()==UCL_READ_WRITE)
+    flag=CL_MEM_READ_WRITE;
+  else if (mat.kind()==UCL_READ_ONLY)
+    flag=CL_MEM_READ_ONLY;
+  else if (mat.kind()==UCL_WRITE_ONLY)
+    flag=CL_MEM_WRITE_ONLY;
+  else { assert(0==1); return UCL_MEMORY_ERROR; }
+
+  cl_int error_flag;
+  cl_context context;
+  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
+               &context,NULL));
+  CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
+  mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
+  return (error_flag==CL_SUCCESS) ? UCL_SUCCESS : UCL_MEMORY_ERROR;
+}
+
+/// Replace mat's device buffer with a new pitched rows x cols buffer
+/** Each row is padded to a multiple of 256 elements; pitch receives the
+  * padded row size in bytes.
+  * \return UCL_MEMORY_ERROR for an unsupported kind or failed allocation **/
+template <class mat_type>
+inline int _device_resize(mat_type &mat, const size_t rows,
+                         const size_t cols, size_t &pitch) {
+  size_t padded_cols=cols;
+  if (cols%256!=0)
+    padded_cols+=256-cols%256;
+  pitch=padded_cols*sizeof(typename mat_type::data_type);
+  // Choose the flag before releasing anything: with assert() compiled out an
+  // unknown kind would otherwise reach clCreateBuffer with flag uninitialized
+  cl_mem_flags flag;
+  if (mat.kind()==UCL_READ_WRITE)
+    flag=CL_MEM_READ_WRITE;
+  else if (mat.kind()==UCL_READ_ONLY)
+    flag=CL_MEM_READ_ONLY;
+  else if (mat.kind()==UCL_WRITE_ONLY)
+    flag=CL_MEM_WRITE_ONLY;
+  else { assert(0==1); return UCL_MEMORY_ERROR; }
+  cl_int error_flag;
+  cl_context context;
+  CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
+               &context,NULL));
+  CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
+  mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag);
+  return (error_flag==CL_SUCCESS) ? UCL_SUCCESS : UCL_MEMORY_ERROR;
+}
+
+
 // --------------------------------------------------------------------------
 // - ZERO ROUTINES
 // --------------------------------------------------------------------------
diff --git a/lib/gpu/geryon/ucl_arg_kludge.h b/lib/gpu/geryon/ucl_arg_kludge.h
index f039a2ff42..646aa4d68f 100644
--- a/lib/gpu/geryon/ucl_arg_kludge.h
+++ b/lib/gpu/geryon/ucl_arg_kludge.h
@@ -828,441 +828,3 @@
     add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); 
     run();
   }
-
-// ---------------------------------------------------------------------------
-
-  template <class t1>
-  inline void run_cq(command_queue &cq, t1 *a1) {
-    clear_args();
-    add_arg(a1);
-    run(cq);
-  }
-
-  template <class t1, class t2>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2) {
-    clear_args();
-    add_arg(a1); add_arg(a2);
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3);
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4);
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6);  
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); 
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8);  
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); 
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); 
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12);
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13);  
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);  
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14, class t15>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); 
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14, class t15,
-            class t16>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
-                       t16 *a16) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); 
-    add_arg(a16); 
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14, class t15,
-            class t16, class t17>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
-                       t16 *a16, t17 *a17) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); 
-    add_arg(a16); add_arg(a17); 
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14, class t15,
-            class t16, class t17, class t18>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
-                       t16 *a16, t17 *a17, t18 *a18) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); 
-    add_arg(a16); add_arg(a17); add_arg(a18); 
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14, class t15,
-            class t16, class t17, class t18, class t19>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
-                       t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); 
-    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); 
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14, class t15,
-            class t16, class t17, class t18, class t19, class t20>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
-                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); 
-    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); 
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14, class t15,
-            class t16, class t17, class t18, class t19, class t20,
-            class t21>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
-                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
-                       t21 *a21) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); 
-    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); 
-    add_arg(a21);
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14, class t15,
-            class t16, class t17, class t18, class t19, class t20,
-            class t21, class t22>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
-                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
-                       t21 *a21, t22 *a22) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); 
-    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); 
-    add_arg(a21); add_arg(a22);
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14, class t15,
-            class t16, class t17, class t18, class t19, class t20,
-            class t21, class t22, class t23>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
-                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
-                       t21 *a21, t22 *a22, t23 *a23) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); 
-    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); 
-    add_arg(a21); add_arg(a22); add_arg(a23);
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14, class t15,
-            class t16, class t17, class t18, class t19, class t20,
-            class t21, class t22, class t23, class t24>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
-                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
-                       t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); 
-    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); 
-    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14, class t15,
-            class t16, class t17, class t18, class t19, class t20,
-            class t21, class t22, class t23, class t24, class t25>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
-                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
-                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); 
-    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); 
-    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); 
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14, class t15,
-            class t16, class t17, class t18, class t19, class t20,
-            class t21, class t22, class t23, class t24, class t25,
-            class t26>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
-                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
-                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
-                       t26 *a26) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); 
-    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); 
-    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); 
-    add_arg(a26);
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14, class t15,
-            class t16, class t17, class t18, class t19, class t20,
-            class t21, class t22, class t23, class t24, class t25,
-            class t26, class t27>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
-                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
-                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
-                       t26 *a26, t27 *a27) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); 
-    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); 
-    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); 
-    add_arg(a26); add_arg(a27);
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14, class t15,
-            class t16, class t17, class t18, class t19, class t20,
-            class t21, class t22, class t23, class t24, class t25,
-            class t26, class t27, class t28>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
-                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
-                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
-                       t26 *a26, t27 *a27, t28 *a28) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); 
-    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); 
-    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); 
-    add_arg(a26); add_arg(a27); add_arg(a28); 
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14, class t15,
-            class t16, class t17, class t18, class t19, class t20,
-            class t21, class t22, class t23, class t24, class t25,
-            class t26, class t27, class t28, class t29>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
-                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
-                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
-                       t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); 
-    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); 
-    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); 
-    add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
-    run(cq);
-  }
-
-  template <class t1, class t2, class t3, class t4, class t5,
-            class t6, class t7, class t8, class t9, class t10,
-            class t11, class t12, class t13, class t14, class t15,
-            class t16, class t17, class t18, class t19, class t20,
-            class t21, class t22, class t23, class t24, class t25,
-            class t26, class t27, class t28, class t29, class t30>
-  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
-                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
-                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
-                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
-                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
-                       t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
-    clear_args();
-    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); 
-    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); 
-    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); 
-    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); 
-    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); 
-    add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); 
-    run(cq);
-  }
-
diff --git a/lib/gpu/geryon/ucl_d_mat.h b/lib/gpu/geryon/ucl_d_mat.h
index 9777de4586..b065a8b644 100644
--- a/lib/gpu/geryon/ucl_d_mat.h
+++ b/lib/gpu/geryon/ucl_d_mat.h
@@ -344,6 +344,39 @@ class UCL_D_Mat : public UCL_BaseMat {
   inline void clear() 
     { _rows=0; if (_kind!=UCL_VIEW) { _kind=UCL_VIEW; _device_free(*this); } }
 
+  /// Resize the allocation to contain rows x cols elements
+  /** \note Cannot be used on views. Returns UCL_SUCCESS or the resize error **/
+  inline int resize(const int rows, const int cols) {
+    assert(_kind!=UCL_VIEW);
+    // _pitch is handed to _device_resize and read back below for _row_size
+    int err=_device_resize(*this,rows,cols,_pitch);
+    if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
+      std::cerr << "UCL Error: Could not allocate "
+                << sizeof(numtyp)*rows*cols << " bytes on device.\n";
+      UCL_GERYON_EXIT;
+      #endif
+      return err;
+    }
+    // Record the new shape; _row_size is the row pitch expressed in elements
+    _rows=rows;
+    _cols=cols;
+    _row_size=_pitch/sizeof(numtyp);
+    #ifndef _UCL_DEVICE_PTR_MAT
+    _end=_array+_row_size*rows;  // one past the last padded row (was *cols)
+    #endif
+    #ifdef _OCL_MAT
+    _offset=0;
+    #endif
+    return err;
+  }
+    
+  /// Grow the allocation to rows x cols when either dimension exceeds its current value
+  /** \note Cannot be used on views; returns UCL_SUCCESS untouched when both already fit **/
+  inline int resize_ib(const int rows, const int cols) {
+    if (rows<=_rows && cols<=_cols) return UCL_SUCCESS;
+    return resize(rows,cols); }
+
   /// Set each element to zero
   inline void zero() { _device_zero(*this,row_bytes()*_rows); }
   
@@ -357,9 +390,9 @@ class UCL_D_Mat : public UCL_BaseMat {
   inline const device_ptr & begin() const { return _array; }
   #else
   /// For CUDA-RT, get device pointer to first element
-  inline numtyp * begin() { return _array; }
+  inline numtyp * & begin() { return _array; }
   /// For CUDA-RT, get device pointer to first element
-  inline const numtyp * begin() const { return _array; }
+  inline numtyp * const & begin() const { return _array; }
   /// For CUDA-RT, get device pointer to one past last element
   inline numtyp * end() { return _end; }
   /// For CUDA-RT, get device pointer to one past last element
diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h
index 83063ba070..11107437ea 100644
--- a/lib/gpu/geryon/ucl_d_vec.h
+++ b/lib/gpu/geryon/ucl_d_vec.h
@@ -340,6 +340,39 @@ class UCL_D_Vec : public UCL_BaseMat {
   inline void clear() 
     { if (_kind!=UCL_VIEW) { _cols=0; _kind=UCL_VIEW; _device_free(*this); } }
 
+  /// Resize the device allocation to hold cols elements; returns UCL_SUCCESS or the _device_resize error
+  /** \note Cannot be used on views (asserted) **/
+  inline int resize(const int cols) {
+    assert(_kind!=UCL_VIEW);
+    // The byte size is stored before the resize call and zeroed again on failure
+    _row_bytes=cols*sizeof(numtyp);
+    int err=_device_resize(*this,_row_bytes);
+    if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
+      std::cerr << "UCL Error: Could not allocate " << _row_bytes
+                << " bytes on device.\n";
+      _row_bytes=0;
+      UCL_GERYON_EXIT;
+      #endif
+      _row_bytes=0;
+      return err;
+    }
+    // Success: record the element count, then the build-dependent end pointer / offset
+    _cols=cols;
+    #ifndef _UCL_DEVICE_PTR_MAT
+    _end=_array+cols;
+    #endif
+    #ifdef _OCL_MAT
+    _offset=0;
+    #endif
+    return err; 
+  }
+    
+  /// Grow the allocation to cols elements only when cols exceeds the current length
+  /** \note Cannot be used on views; returns UCL_SUCCESS untouched when cols already fits **/
+  inline int resize_ib(const int cols)
+    { return (cols<=_cols) ? UCL_SUCCESS : resize(cols); }
+
   /// Set each element to zero
   inline void zero() { _device_zero(*this,row_bytes()); }
 
@@ -353,13 +386,13 @@ class UCL_D_Vec : public UCL_BaseMat {
   inline const device_ptr & begin() const { return _array; }
   #else
   /// For CUDA-RT, get device pointer to first element
-  inline numtyp * begin() { return _array; }
+  inline numtyp * & begin() { return _array; }
   /// For CUDA-RT, get device pointer to first element
-  inline const numtyp * begin() const { return _array; }
+  inline numtyp * const & begin() const { return _array; }
   /// For CUDA-RT, get device pointer to one past last element
   inline numtyp * end() { return _end; }
   /// For CUDA-RT, get device pointer to one past last element
-  inline const numtyp * end() const { return _end; }
+  inline numtyp * end() const { return _end; }
   #endif
   
   #ifdef _UCL_DEVICE_PTR_MAT
diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h
index 5c13a003aa..806b930630 100644
--- a/lib/gpu/geryon/ucl_h_mat.h
+++ b/lib/gpu/geryon/ucl_h_mat.h
@@ -318,6 +318,36 @@ class UCL_H_Mat : public UCL_BaseMat {
   inline void clear() 
     { if (_kind!=UCL_VIEW) {_rows=0; _kind=UCL_VIEW; _host_free(*this,_kind); }} 
 
+  /// Resize the allocation to rows x cols elements
+  /** \note Cannot be used on views. Returns UCL_SUCCESS or the resize error **/
+  inline int resize(const int rows, const int cols) {
+    assert(_kind!=UCL_VIEW);
+    // Row size in bytes is stored first; it is zeroed again if the resize fails
+    _row_bytes=cols*sizeof(numtyp);
+    int err=_host_resize(*this,_row_bytes*rows);
+    if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
+      std::cerr << "UCL Error: Could not allocate " << _row_bytes*rows  // requested size, not stale _rows
+                << " bytes on host.\n";
+      _row_bytes=0;
+      UCL_GERYON_EXIT;
+      #endif
+      _row_bytes=0;
+      return err;
+    }
+    // Success: record the new shape and the one-past-the-end pointer
+    _cols=cols;
+    _rows=rows;
+    _end=_array+rows*cols;
+    return err;
+  }
+
+  /// Grow the allocation to rows x cols when either dimension exceeds its current value
+  /** \note Cannot be used on views; returns UCL_SUCCESS untouched when both already fit **/
+  inline int resize_ib(const int rows, const int cols) {
+    if (rows<=_rows && cols<=_cols) return UCL_SUCCESS;
+    return resize(rows,cols); }
+
   /// Set each element to zero
   inline void zero() { _host_zero(_array,_rows*row_bytes()); }
   /// Set first n elements to zero
diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h
index 2de68b487c..3a53113153 100644
--- a/lib/gpu/geryon/ucl_h_vec.h
+++ b/lib/gpu/geryon/ucl_h_vec.h
@@ -316,6 +316,34 @@ class UCL_H_Vec : public UCL_BaseMat {
   inline void clear() 
     { if (_kind!=UCL_VIEW) {_kind=UCL_VIEW; _cols=0; _host_free(*this,_kind);}}
 
+  /// Resize the host allocation to hold cols elements; returns UCL_SUCCESS or the _host_resize error
+  /** \note Cannot be used on views (asserted) **/
+  inline int resize(const int cols) {
+    assert(_kind!=UCL_VIEW);
+    _row_bytes=cols*sizeof(numtyp);
+    int err=_host_resize(*this,_row_bytes);
+    // On failure _row_bytes is reset to 0 and the error code is returned
+    if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
+      std::cerr << "UCL Error: Could not allocate " << _row_bytes
+                << " bytes on host.\n";
+      _row_bytes=0;
+      UCL_GERYON_EXIT;
+      #endif
+      _row_bytes=0;
+      return err;
+    }
+    // Success: record the element count and the one-past-the-end pointer
+    _cols=cols;
+    _end=_array+cols;
+    return err;
+  }
+    
+  /// Grow the allocation to cols elements only when cols exceeds the current length
+  /** \note Cannot be used on views; returns UCL_SUCCESS untouched when cols already fits **/
+  inline int resize_ib(const int cols)
+    { return (cols<=_cols) ? UCL_SUCCESS : resize(cols); }
+
   /// Set each element to zero
   inline void zero() { _host_zero(_array,row_bytes()); }
   
diff --git a/lib/gpu/geryon/ucl_print.h b/lib/gpu/geryon/ucl_print.h
index a8ab19a05d..87b3d3d7ff 100644
--- a/lib/gpu/geryon/ucl_print.h
+++ b/lib/gpu/geryon/ucl_print.h
@@ -270,4 +270,13 @@ template <class numtyp>
 inline std::ostream & operator << (std::ostream &out, UCL_D_Mat<numtyp> &mat)
   { ucl_print(mat,out); return out; } 
 
+/// Stream insertion for UCL_Vector: forwards the host member to ucl_print and returns the stream
+template <class t1, class t2>
+inline std::ostream & operator << (std::ostream &out, UCL_Vector<t1,t2> &mat)
+  { ucl_print(mat.host,out); return out; } 
+/// Stream insertion for UCL_Matrix: forwards the host member to ucl_print and returns the stream
+template <class t1, class t2>
+inline std::ostream & operator << (std::ostream &out, UCL_Matrix<t1,t2> &mat)
+  { ucl_print(mat.host,out); return out; } 
+
 #endif
diff --git a/lib/gpu/geryon/ucl_types.h b/lib/gpu/geryon/ucl_types.h
index 9dabf16687..615bffea95 100644
--- a/lib/gpu/geryon/ucl_types.h
+++ b/lib/gpu/geryon/ucl_types.h
@@ -117,5 +117,61 @@ enum UCL_ERROR_FLAG {
 template <class numtyp>
 const char * ucl_template_name() { return _UCL_DATA_ID<numtyp>::name(); }
 
+template <class t1, class t2> struct ucl_same_type;  // declared here, defined after the specializations
+// ans=1: both arguments are the same unqualified built-in type
+template <> struct ucl_same_type<bool,bool> { enum { ans=1 }; };
+template <> struct ucl_same_type<char,char> { enum { ans=1 }; };
+template <> struct ucl_same_type<unsigned char,unsigned char> { enum { ans=1 }; };
+template <> struct ucl_same_type<int,int> { enum { ans=1 }; };
+template <> struct ucl_same_type<unsigned,unsigned> { enum { ans=1 }; };
+template <> struct ucl_same_type<short,short> { enum { ans=1 }; };
+template <> struct ucl_same_type<unsigned short,unsigned short> { enum { ans=1 }; };
+template <> struct ucl_same_type<long,long> { enum { ans=1 }; };
+template <> struct ucl_same_type<unsigned long,unsigned long> { enum { ans=1 }; };
+template <> struct ucl_same_type<float,float> { enum { ans=1 }; };
+template <> struct ucl_same_type<double,double> { enum { ans=1 }; };
+template <> struct ucl_same_type<long double,long double> { enum { ans=1 }; };
+// ans=1: first argument const-qualified, second unqualified
+template <> struct ucl_same_type<const bool,bool> { enum { ans=1 }; };
+template <> struct ucl_same_type<const char,char> { enum { ans=1 }; };
+template <> struct ucl_same_type<const unsigned char,unsigned char> { enum { ans=1 }; };
+template <> struct ucl_same_type<const int,int> { enum { ans=1 }; };
+template <> struct ucl_same_type<const unsigned,unsigned> { enum { ans=1 }; };
+template <> struct ucl_same_type<const short,short> { enum { ans=1 }; };
+template <> struct ucl_same_type<const unsigned short,unsigned short> { enum { ans=1 }; };
+template <> struct ucl_same_type<const long,long> { enum { ans=1 }; };
+template <> struct ucl_same_type<const unsigned long,unsigned long> { enum { ans=1 }; };
+template <> struct ucl_same_type<const float,float> { enum { ans=1 }; };
+template <> struct ucl_same_type<const double,double> { enum { ans=1 }; };
+template <> struct ucl_same_type<const long double,long double> { enum { ans=1 }; };
+// ans=1: first argument unqualified, second const-qualified
+template <> struct ucl_same_type<bool,const bool> { enum { ans=1 }; };
+template <> struct ucl_same_type<char,const char> { enum { ans=1 }; };
+template <> struct ucl_same_type<unsigned char,const unsigned char> { enum { ans=1 }; };
+template <> struct ucl_same_type<int,const int> { enum { ans=1 }; };
+template <> struct ucl_same_type<unsigned,const unsigned> { enum { ans=1 }; };
+template <> struct ucl_same_type<short,const short> { enum { ans=1 }; };
+template <> struct ucl_same_type<unsigned short,const unsigned short> { enum { ans=1 }; };
+template <> struct ucl_same_type<long,const long> { enum { ans=1 }; };
+template <> struct ucl_same_type<unsigned long,const unsigned long> { enum { ans=1 }; };
+template <> struct ucl_same_type<float,const float> { enum { ans=1 }; };
+template <> struct ucl_same_type<double,const double> { enum { ans=1 }; };
+template <> struct ucl_same_type<long double,const long double> { enum { ans=1 }; };
+// ans=1: both arguments const-qualified
+template <> struct ucl_same_type<const bool,const bool> { enum { ans=1 }; };
+template <> struct ucl_same_type<const char,const char> { enum { ans=1 }; };
+template <> struct ucl_same_type<const unsigned char,const unsigned char> { enum { ans=1 }; };
+template <> struct ucl_same_type<const int,const int> { enum { ans=1 }; };
+template <> struct ucl_same_type<const unsigned,const unsigned> { enum { ans=1 }; };
+template <> struct ucl_same_type<const short,const short> { enum { ans=1 }; };
+template <> struct ucl_same_type<const unsigned short,const unsigned short> { enum { ans=1 }; };
+template <> struct ucl_same_type<const long,const long> { enum { ans=1 }; };
+template <> struct ucl_same_type<const unsigned long,const unsigned long> { enum { ans=1 }; };
+template <> struct ucl_same_type<const float,const float> { enum { ans=1 }; };
+template <> struct ucl_same_type<const double,const double> { enum { ans=1 }; };
+template <> struct ucl_same_type<const long double,const long double> { enum { ans=1 }; };
+// Any pair not listed above (e.g. long long, or user-defined types) falls through to ans=0
+template <class t1, class t2> struct ucl_same_type { enum { ans=0 }; };
+
 #endif
 
diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp
index 4ead777609..6f42790ca3 100644
--- a/lib/gpu/lal_answer.cpp
+++ b/lib/gpu/lal_answer.cpp
@@ -39,30 +39,16 @@ bool AnswerT::alloc(const int inum) {
 
   bool success=true;
   
-  int ans_elements=4;
+  _ans_fields=4;
   if (_rot)
-    ans_elements+=4;
+    _ans_fields+=4;
   
-  // Ignore host/device transfers?
-  bool cpuview=false;
-  if (dev->device_type()==UCL_CPU)
-    cpuview=true;
-    
-  // --------------------------   Host allocations
-  success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
-  success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
-    
   // ---------------------------  Device allocations
-  if (cpuview) {
-    dev_engv.view(host_engv);
-    dev_ans.view(host_ans);
-  } else {
-    success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
-                                       UCL_WRITE_ONLY)==UCL_SUCCESS);
-    success=success && (dev_ans.alloc(ans_elements*_max_local,
-                                      *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
-  }
-  _gpu_bytes=dev_engv.row_bytes()+dev_ans.row_bytes();
+  success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_RW_OPTIMIZED,
+                                 UCL_WRITE_ONLY)==UCL_SUCCESS);
+  success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_RW_OPTIMIZED,
+                                UCL_WRITE_ONLY)==UCL_SUCCESS);
+  _gpu_bytes=engv.device.row_bytes()+force.device.row_bytes();
   
   _allocated=true;  
   return success;
@@ -114,32 +100,24 @@ bool AnswerT::add_fields(const bool charge, const bool rot) {
   if (realloc) {
     _other=_charge || _rot;
     int inum=_max_local;
-    clear_resize();
+    force.clear();
+    engv.clear();
+    _allocated=false;
     return alloc(inum);
   }
   return true;
 }
 
-template <class numtyp, class acctyp>
-void AnswerT::clear_resize() {
-  if (!_allocated)
-    return;
-  _allocated=false;
-
-  dev_ans.clear();
-  dev_engv.clear();
-  host_ans.clear();
-  host_engv.clear();
-}
-
 template <class numtyp, class acctyp>
 void AnswerT::clear() {
   _gpu_bytes=0;
   if (!_allocated)
     return;
+  _allocated=false;
 
+  force.clear();
+  engv.clear();
   time_answer.clear();
-  clear_resize();
   _inum=0;
   _ilist=NULL;
   _eflag=false;
@@ -174,11 +152,11 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
     csize-=6;
       
   if (csize>0)
-    ucl_copy(host_engv,dev_engv,_inum*csize,true);
+    engv.update_host(_inum*csize,true);
   if (_rot)
-    ucl_copy(host_ans,dev_ans,_inum*4*2,true);
+    force.update_host(_inum*4*2,true);
   else
-    ucl_copy(host_ans,dev_ans,_inum*4,true);
+    force.update_host(_inum*4,true);
   time_answer.stop();
 }
 
@@ -201,28 +179,28 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
   for (int i=0; i<6; i++) virial_acc[i]=0.0;
   if (_ilist==NULL) {
     for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
+      int al=i;
       if (_eflag) {
         if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
+          evdwl+=engv[al];
+          eatom[i]+=engv[al]*0.5;
+          al+=_inum;
         } else {
-          evdwl+=*ap;
-          ap+=_inum;
+          evdwl+=engv[al];
+          al+=_inum;
         }
       }
       if (_vflag) {
         if (_vf_atom) {
           for (int j=0; j<6; j++) {
-            vatom[i][j]+=*ap*0.5;
-            virial_acc[j]+=*ap;
-            ap+=_inum;
+            vatom[i][j]+=engv[al]*0.5;
+            virial_acc[j]+=engv[al];
+            al+=_inum;
           }
         } else {
           for (int j=0; j<6; j++) {
-            virial_acc[j]+=*ap;
-            ap+=_inum;
+            virial_acc[j]+=engv[al];
+            al+=_inum;
           }
         }
       }
@@ -231,29 +209,29 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
       virial[j]+=virial_acc[j]*0.5;
   } else {
     for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
+      int al=i;
       int ii=_ilist[i];
       if (_eflag) {
         if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
+          evdwl+=engv[al];
+          eatom[ii]+=engv[al]*0.5;
+          al+=_inum;
         } else {
-          evdwl+=*ap;
-          ap+=_inum;
+          evdwl+=engv[al];
+          al+=_inum;
         }
       }
       if (_vflag) {
         if (_vf_atom) {
           for (int j=0; j<6; j++) {
-            vatom[ii][j]+=*ap*0.5;
-            virial_acc[j]+=*ap;
-            ap+=_inum;
+            vatom[ii][j]+=engv[al]*0.5;
+            virial_acc[j]+=engv[al];
+            al+=_inum;
           }
         } else {
           for (int j=0; j<6; j++) {
-            virial_acc[j]+=*ap;
-            ap+=_inum;
+            virial_acc[j]+=engv[al];
+            al+=_inum;
           }
         }
       }
@@ -281,33 +259,33 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
   for (int i=0; i<6; i++) virial_acc[i]=0.0;
   if (_ilist==NULL) {
     for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
+      int al=i;
       if (_eflag) {
         if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-          _ecoul+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
+          evdwl+=engv[al];
+          eatom[i]+=engv[al]*0.5;
+          al+=_inum;
+          _ecoul+=engv[al];
+          eatom[i]+=engv[al]*0.5;
+          al+=_inum;
         } else {
-          evdwl+=*ap;
-          ap+=_inum;
-          _ecoul+=*ap;
-          ap+=_inum;
+          evdwl+=engv[al];
+          al+=_inum;
+          _ecoul+=engv[al];
+          al+=_inum;
         }
       }
       if (_vflag) {
         if (_vf_atom) {
           for (int j=0; j<6; j++) {
-            vatom[i][j]+=*ap*0.5;
-            virial_acc[j]+=*ap;
-            ap+=_inum;
+            vatom[i][j]+=engv[al]*0.5;
+            virial_acc[j]+=engv[al];
+            al+=_inum;
           }
         } else {
           for (int j=0; j<6; j++) {
-            virial_acc[j]+=*ap;
-            ap+=_inum;
+            virial_acc[j]+=engv[al];
+            al+=_inum;
           }
         }
       }
@@ -316,34 +294,34 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
       virial[j]+=virial_acc[j]*0.5;
   } else {
     for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
+      int al=i;
       int ii=_ilist[i];
       if (_eflag) {
         if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-          _ecoul+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
+          evdwl+=engv[al];
+          eatom[ii]+=engv[al]*0.5;
+          al+=_inum;
+          _ecoul+=engv[al];
+          eatom[ii]+=engv[al]*0.5;
+          al+=_inum;
         } else {
-          evdwl+=*ap;
-          ap+=_inum;
-          _ecoul+=*ap;
-          ap+=_inum;
+          evdwl+=engv[al];
+          al+=_inum;
+          _ecoul+=engv[al];
+          al+=_inum;
         }
       }
       if (_vflag) {
         if (_vf_atom) {
           for (int j=0; j<6; j++) {
-            vatom[ii][j]+=*ap*0.5;
-            virial_acc[j]+=*ap;
-            ap+=_inum;
+            vatom[ii][j]+=engv[al]*0.5;
+            virial_acc[j]+=engv[al];
+            al+=_inum;
           }
         } else {
           for (int j=0; j<6; j++) {
-            virial_acc[j]+=*ap;
-            ap+=_inum;
+            virial_acc[j]+=engv[al];
+            al+=_inum;
           }
         }
       }
@@ -359,45 +337,37 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
 
 template <class numtyp, class acctyp>
 void AnswerT::get_answers(double **f, double **tor) {
-  acctyp *ap=host_ans.begin();
+  int fl=0;
   if (_ilist==NULL) {
     for (int i=0; i<_inum; i++) {
-      f[i][0]+=*ap;
-      ap++;
-      f[i][1]+=*ap;
-      ap++;
-      f[i][2]+=*ap;
-      ap+=2;
+      f[i][0]+=force[fl];
+      f[i][1]+=force[fl+1];
+      f[i][2]+=force[fl+2];
+      fl+=4;
     }
     if (_rot) {
       for (int i=0; i<_inum; i++) {
-        tor[i][0]+=*ap;
-        ap++;
-        tor[i][1]+=*ap;
-        ap++;
-        tor[i][2]+=*ap;
-        ap+=2;
+        tor[i][0]+=force[fl];
+        tor[i][1]+=force[fl+1];
+        tor[i][2]+=force[fl+2];
+        fl+=4;
       }
     }
   } else {
     for (int i=0; i<_inum; i++) {
       int ii=_ilist[i];
-      f[ii][0]+=*ap;
-      ap++;
-      f[ii][1]+=*ap;
-      ap++;
-      f[ii][2]+=*ap;
-      ap+=2;
+      f[ii][0]+=force[fl];
+      f[ii][1]+=force[fl+1];
+      f[ii][2]+=force[fl+2];
+      fl+=4;
     }
     if (_rot) {
       for (int i=0; i<_inum; i++) {
         int ii=_ilist[i];
-        tor[ii][0]+=*ap;
-        ap++;
-        tor[ii][1]+=*ap;
-        ap++;
-        tor[ii][2]+=*ap;
-        ap+=2;
+        tor[ii][0]+=force[fl];
+        tor[ii][1]+=force[fl+1];
+        tor[ii][2]+=force[fl+2];
+        fl+=4;
       }
     }
   }
diff --git a/lib/gpu/lal_answer.h b/lib/gpu/lal_answer.h
index 721e16cdd5..c642781c07 100644
--- a/lib/gpu/lal_answer.h
+++ b/lib/gpu/lal_answer.h
@@ -19,18 +19,18 @@
 #include <math.h>
 #include "mpi.h"
 
-#ifdef USE_OPENCL
-
+#if defined(USE_OPENCL)
 #include "geryon/ocl_timer.h"
 #include "geryon/ocl_mat.h"
 using namespace ucl_opencl;
-
+#elif defined(USE_CUDART)
+#include "geryon/nvc_timer.h"
+#include "geryon/nvc_mat.h"
+using namespace ucl_cudart;
 #else
-
 #include "geryon/nvd_timer.h"
 #include "geryon/nvd_mat.h"
 using namespace ucl_cudadr;
-
 #endif
 
 #include "lal_precision.h"
@@ -59,8 +59,10 @@ class Answer {
   inline void resize(const int inum, bool &success) {
     _inum=inum;
     if (inum>_max_local) {
-      clear_resize();
-      success = success && alloc(inum);
+      _max_local=static_cast<int>(static_cast<double>(inum)*1.10);
+      success=success && (force.resize(_max_local*_ans_fields)==UCL_SUCCESS);
+      success=success && (engv.resize(_max_local*_ev_fields)==UCL_SUCCESS);
+      _gpu_bytes=engv.device.row_bytes()+force.device.row_bytes();
     }
   }
   
@@ -68,9 +70,6 @@ class Answer {
   /** \param rot True if atom storage needs quaternions **/
   bool add_fields(const bool charge, const bool rot);
   
-  /// Free all memory on host and device needed to realloc for more atoms
-  void clear_resize();
-
   /// Free all memory on host and device
   void clear();
  
@@ -136,14 +135,9 @@ class Answer {
   // ------------------------------ DATA ----------------------------------
 
   /// Force and possibly torque
-  UCL_D_Vec<acctyp> dev_ans;
+  UCL_Vector<acctyp,acctyp> force;
   /// Energy and virial per-atom storage
-  UCL_D_Vec<acctyp> dev_engv;
-  
-  /// Force and possibly torque data on host
-  UCL_H_Vec<acctyp> host_ans;
-  /// Energy/virial data on host
-  UCL_H_Vec<acctyp> host_engv;
+  UCL_Vector<acctyp,acctyp> engv;
   
   /// Device timers
   UCL_Timer time_answer;
@@ -155,7 +149,7 @@ class Answer {
   bool alloc(const int inum);
   
   bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
-  int _max_local, _inum, _e_fields, _ev_fields;
+  int _max_local, _inum, _e_fields, _ev_fields, _ans_fields;
   int *_ilist;
   double _time_cast, _time_cpu_idle;
   
diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp
index 357316c5a3..5cf46c8751 100644
--- a/lib/gpu/lal_atom.cpp
+++ b/lib/gpu/lal_atom.cpp
@@ -51,10 +51,14 @@ bool AtomT::alloc(const int nall) {
   bool success=true;
   
   // Ignore host/device transfers?
-  bool cpuview=false;
-  if (dev->device_type()==UCL_CPU)
-    cpuview=true;
-    
+  _host_view=false;
+  if (dev->shared_memory()) {
+    _host_view=true;
+    #ifdef GPU_CAST
+    assert(0==1);
+    #endif
+  }
+      
   // Allocate storage for CUDPP sort
   #ifdef USE_CUDPP
   if (_gpu_nbor==1) {
@@ -64,63 +68,101 @@ bool AtomT::alloc(const int nall) {
   }
   #endif
 
-  // --------------------------   Host allocations
-  // Get a host write only buffer
-  #ifdef GPU_CAST
-  success=success && (host_x_cast.alloc(_max_atoms*3,*dev,
-                                        UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-  success=success && (host_type_cast.alloc(_max_atoms,*dev,
-                                           UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-  #else
-  success=success && (host_x.alloc(_max_atoms*4,*dev,
-                      UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-  #endif                      
-  // Buffer for casting only if different precisions
-  if (_charge)
-    success=success && (host_q.alloc(_max_atoms,*dev,
-                                     UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-  // Buffer for casting only if different precisions
-  if (_rot)
-    success=success && (host_quat.alloc(_max_atoms*4,*dev,
-                                        UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
-
-    
   // ---------------------------  Device allocations
   int gpu_bytes=0;
-  if (cpuview) {
-    #ifdef GPU_CAST
-    assert(0==1);
-    #else
-    dev_x.view(host_x);
-    #endif
-    if (_rot)
-      dev_quat.view(host_quat);
-    if (_charge)
-      dev_q.view(host_q);
-  } else {
-    #ifdef GPU_CAST
-    success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev));
-    success=success && (UCL_SUCCESS==
-                        dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
-    success=success && (UCL_SUCCESS==
-                        dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
-    gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
-    #else
-    success=success && (UCL_SUCCESS==
-                        dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
-    #endif
-    if (_charge) {
-      success=success && (dev_q.alloc(_max_atoms,*dev,
-                                      UCL_READ_ONLY)==UCL_SUCCESS);
-      gpu_bytes+=dev_q.row_bytes();
+  success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
+                              UCL_READ_ONLY)==UCL_SUCCESS);
+  #ifdef GPU_CAST
+  success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)==
+                      UCL_SUCCESS);
+  success=success && (type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)==
+                      UCL_SUCCESS);
+  gpu_bytes+=x_cast.device.row_bytes()+type_cast.device.row_bytes();
+  #endif
+
+  if (_charge && _host_view==false) {
+    success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
+                                UCL_READ_ONLY)==UCL_SUCCESS);
+    gpu_bytes+=q.device.row_bytes();
+  }
+  if (_rot && _host_view==false) {
+    success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
+                                   UCL_READ_ONLY)==UCL_SUCCESS);
+    gpu_bytes+=quat.device.row_bytes();
+  }
+
+  if (_gpu_nbor>0) {
+    if (_bonds) {
+      success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      gpu_bytes+=dev_tag.row_bytes();
     }
-    if (_rot) {
-      success=success && (dev_quat.alloc(_max_atoms*4,*dev,
-                                      UCL_READ_ONLY)==UCL_SUCCESS);
-      gpu_bytes+=dev_quat.row_bytes();
+    if (_gpu_nbor==1) {
+      success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      gpu_bytes+=dev_cell_id.row_bytes();
+    } else {
+      success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      success=success && 
+             (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
+    }
+    if (_gpu_nbor==2 && _host_view)
+      dev_particle_id.view(host_particle_id);
+    else
+      success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+    gpu_bytes+=dev_particle_id.row_bytes();
+  }
+
+  gpu_bytes+=x.device.row_bytes();
+  if (gpu_bytes>_max_gpu_bytes)
+    _max_gpu_bytes=gpu_bytes;
+  
+  _allocated=true;  
+  return success;
+}
+
+template <class numtyp, class acctyp>
+bool AtomT::add_fields(const bool charge, const bool rot,
+                       const int gpu_nbor, const bool bonds) {
+  bool success=true;
+  // Running total of device bytes allocated for the fields added here
+  int gpu_bytes=0;
+  
+  if (charge && _charge==false) {
+    _charge=true;
+    _other=true;
+    if (_host_view==false) {
+      success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
+                                  UCL_READ_ONLY)==UCL_SUCCESS);
+      gpu_bytes+=q.device.row_bytes();
     }
   }
-  if (_gpu_nbor>0) {
+
+  if (rot && _rot==false) {
+    _rot=true;
+    _other=true;
+    if (_host_view==false) {
+      success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
+                                     UCL_READ_ONLY)==UCL_SUCCESS);
+      gpu_bytes+=quat.device.row_bytes();
+    }
+  }
+
+  if (bonds && _bonds==false) {
+    _bonds=true;
+    if (_bonds && _gpu_nbor>0) {
+      success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
+      gpu_bytes+=dev_tag.row_bytes();
+    }
+  }
+
+  if (gpu_nbor>0 && _gpu_nbor==0) {
+    _gpu_nbor=gpu_nbor;
+    #ifdef USE_CUDPP
+    if (_gpu_nbor==1) {
+      CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);  
+      if (CUDPP_SUCCESS != result)
+        return false;
+    }
+    #endif
     success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
     gpu_bytes+=dev_particle_id.row_bytes();
     if (_bonds) {
@@ -137,43 +179,9 @@ bool AtomT::alloc(const int nall) {
     }             
   }
 
-  gpu_bytes+=dev_x.row_bytes();
-  if (gpu_bytes>_max_gpu_bytes)
-    _max_gpu_bytes=gpu_bytes;
-  
-  _allocated=true;  
   return success;
 }
 
-template <class numtyp, class acctyp>
-bool AtomT::add_fields(const bool charge, const bool rot,
-                       const int gpu_nbor, const bool bonds) {
-  bool realloc=false;
-  if (charge && _charge==false) {
-    _charge=true;
-    realloc=true;
-  }
-  if (rot && _rot==false) {
-    _rot=true;
-    realloc=true;
-  }
-  if (gpu_nbor>0 && _gpu_nbor==0) {
-    _gpu_nbor=gpu_nbor;
-    realloc=true;
-  }
-  if (bonds && _bonds==false) {
-    _bonds=true;
-    realloc=true;
-  }
-  if (realloc) {
-    _other=_charge || _rot;
-    int max_atoms=_max_atoms;
-    clear_resize();
-    return alloc(max_atoms);
-  }
-  return true;
-}
-
 template <class numtyp, class acctyp>
 bool AtomT::init(const int nall, const bool charge, const bool rot,
                  UCL_Device &devi, const int gpu_nbor, const bool bonds) {
@@ -219,27 +227,18 @@ void AtomT::clear_resize() {
     return;
   _allocated=false;
 
-  dev_x.clear();
-  if (_charge) { 
-    dev_q.clear();
-    host_q.clear();
-  }
-  if (_rot) {
-    dev_quat.clear();
-    host_quat.clear();
-  }
-  #ifndef GPU_CAST
-  host_x.clear();
-  #else
-  host_x_cast.clear();
-  host_type_cast.clear();
-  #endif
+  x.clear();
+  if (_charge)
+    q.clear();
+  if (_rot)
+    quat.clear();
+
   dev_cell_id.clear();
   dev_particle_id.clear();
   dev_tag.clear();
   #ifdef GPU_CAST
-  dev_x_cast.clear();
-  dev_type_cast.clear();
+  x_cast.clear();
+  type_cast.clear();
   #endif
 
   #ifdef USE_CUDPP
@@ -279,8 +278,7 @@ double AtomT::host_memory_usage() const {
     atom_bytes+=1;
   if (_rot) 
     atom_bytes+=4;
-  return _max_atoms*atom_bytes*sizeof(numtyp)+
-         sizeof(Atom<numtyp,acctyp>);
+  return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
 }
   
 // Sort arrays for neighbor list calculation
@@ -292,16 +290,18 @@ void AtomT::sort_neighbor(const int num_atoms) {
                                  8*sizeof(unsigned), num_atoms);
   if (CUDPP_SUCCESS != result) {
     printf("Error in cudppSort\n");
-    NVD_GERYON_EXIT;
+    UCL_GERYON_EXIT;
   }
   #endif
 }
 
 #ifdef GPU_CAST
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "atom_cl.h"
+#elif defined(USE_CUDART)
+const char *atom=0;
 #else
-#include "atom_ptx.h"
+#include "atom_cubin.h"
 #endif
 
 template <class numtyp, class acctyp>
@@ -316,3 +316,4 @@ void AtomT::compile_kernels(UCL_Device &dev) {
 #endif
 
 template class Atom<PRECISION,ACC_PRECISION>;
+
diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h
index 642fce07ad..171141f7ea 100644
--- a/lib/gpu/lal_atom.h
+++ b/lib/gpu/lal_atom.h
@@ -19,20 +19,21 @@
 #include <math.h>
 #include "mpi.h"
 
-#ifdef USE_OPENCL
-
+#if defined(USE_OPENCL)
 #include "geryon/ocl_timer.h"
 #include "geryon/ocl_mat.h"
 #include "geryon/ocl_kernel.h"
 using namespace ucl_opencl;
-
+#elif defined(USE_CUDART)
+#include "geryon/nvc_timer.h"
+#include "geryon/nvc_mat.h"
+#include "geryon/nvc_kernel.h"
+using namespace ucl_cudart;
 #else
-
 #include "geryon/nvd_timer.h"
 #include "geryon/nvd_mat.h"
 #include "geryon/nvd_kernel.h"
 using namespace ucl_cudadr;
-
 #endif
 
 #ifdef USE_CUDPP
@@ -92,7 +93,7 @@ class Atom {
   bool charge() { return _charge; }
   
   /// Returns true if GPU is using quaternions
-  bool quat() { return _rot; }
+  bool quaternion() { return _rot; }
   
   /// Only free matrices of length inum or nall for resizing
   void clear_resize();
@@ -148,9 +149,9 @@ class Atom {
 
   /// Pack LAMMPS atom type constants into matrix and copy to device
   template <class dev_typ, class t1>
-  inline void type_pack1(const int n, const int m_size,
-			 UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
-			 t1 **one) {
+  inline void type_pack1(const int n, const int m_size, 
+                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
+                         t1 **one) {
     int ii=0;
     for (int i=0; i<n; i++) {
       for (int j=0; j<n; j++) {
@@ -167,8 +168,8 @@ class Atom {
   /// Pack LAMMPS atom type constants into 2 vectors and copy to device
   template <class dev_typ, class t1, class t2>
   inline void type_pack2(const int n, const int m_size,
-			 UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
-			 t1 **one, t2 **two) {
+                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
+                         t1 **one, t2 **two) {
     int ii=0;
     for (int i=0; i<n; i++) {
       for (int j=0; j<n; j++) {
@@ -186,8 +187,8 @@ class Atom {
   /// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device
   template <class dev_typ, class t1, class t2, class t3>
   inline void type_pack4(const int n, const int m_size,
-			 UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
-			 t1 **one, t2 **two, t3 **three) {
+                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
+                         t1 **one, t2 **two, t3 **three) {
     int ii=0;
     for (int i=0; i<n; i++) {
       for (int j=0; j<n; j++) {
@@ -206,8 +207,8 @@ class Atom {
   /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
   template <class dev_typ, class t1, class t2, class t3, class t4>
   inline void type_pack4(const int n, const int m_size,
-			 UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
-			 t1 **one, t2 **two, t3 **three, t4 **four) {
+                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
+                         t1 **one, t2 **two, t3 **three, t4 **four) {
     int ii=0;
     for (int i=0; i<n; i++) {
       for (int j=0; j<n; j++) {
@@ -251,16 +252,13 @@ class Atom {
       memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
       memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
       #else
-      numtyp *_write_loc=host_x.begin();
+      int wl=0;
       for (int i=0; i<_nall; i++) {
-        *_write_loc=host_ptr[i][0];
-        _write_loc++;
-        *_write_loc=host_ptr[i][1];
-        _write_loc++;
-        *_write_loc=host_ptr[i][2];
-        _write_loc++;
-        *_write_loc=host_type[i];
-        _write_loc++;
+        x[wl]=host_ptr[i][0];
+        x[wl+1]=host_ptr[i][1];
+        x[wl+2]=host_ptr[i][2];
+        x[wl+3]=host_type[i];
+        wl+=4;
       }
       #endif
       _time_cast+=MPI_Wtime()-t;
@@ -273,15 +271,14 @@ class Atom {
     time_pos.start();
     if (_x_avail==false) {
       #ifdef GPU_CAST
-      ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
-      ucl_copy(dev_type_cast,host_type_cast,_nall,true);
+      x_cast.update_device(_nall*3,true);
+      type_cast.update_device(_nall,true);
       int block_size=64;
       int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
       k_cast_x.set_size(GX,block_size);
-      k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), 
-                   &_nall);
+      k_cast_x.run(&x, &x_cast, &type_cast, &_nall);
       #else
-      ucl_copy(dev_x,host_x,_nall*4,true);
+      x.update_device(_nall*4,true);
       #endif
       _x_avail=true;
     }
@@ -299,18 +296,14 @@ class Atom {
   inline void cast_q_data(cpytyp *host_ptr) {
     if (_q_avail==false) {
       double t=MPI_Wtime();
-      if (dev->device_type()==UCL_CPU) {
-        if (sizeof(numtyp)==sizeof(double)) {
-          host_q.view((numtyp*)host_ptr,_nall,*dev);
-          dev_q.view(host_q);
-        } else
-          for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
-      } else {
-        if (sizeof(numtyp)==sizeof(double))
-          memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
-        else
-          for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
-      }
+      // If double precision, still memcpy for async transfers
+      if (_host_view) {
+        q.host.view((numtyp*)host_ptr,_nall,*dev);
+        q.device.view(q.host);
+      } else if (sizeof(numtyp)==sizeof(double))
+        memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp));
+      else
+        for (int i=0; i<_nall; i++) q[i]=host_ptr[i];
       _time_cast+=MPI_Wtime()-t;
     }
   }
@@ -318,7 +311,7 @@ class Atom {
   // Copy charges to device asynchronously
   inline void add_q_data() {
     if (_q_avail==false) {
-      ucl_copy(dev_q,host_q,_nall,true);
+      q.update_device(_nall,true);
       _q_avail=true;
     }
   }
@@ -328,18 +321,13 @@ class Atom {
   inline void cast_quat_data(cpytyp *host_ptr) {
     if (_quat_avail==false) {
       double t=MPI_Wtime();
-      if (dev->device_type()==UCL_CPU) {
-        if (sizeof(numtyp)==sizeof(double)) {
-          host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
-          dev_quat.view(host_quat);
-        } else
-          for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
-      } else {
-        if (sizeof(numtyp)==sizeof(double))
-          memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
-        else
-          for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
-      }
+      if (_host_view) {
+        quat.host.view((numtyp*)host_ptr,_nall*4,*dev);
+        quat.device.view(quat.host);
+      } else if (sizeof(numtyp)==sizeof(double))
+        memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
+      else
+        for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
       _time_cast+=MPI_Wtime()-t;
     }
   }
@@ -348,7 +336,7 @@ class Atom {
   /** Copies nall()*4 elements **/
   inline void add_quat_data() {
     if (_quat_avail==false) {
-      ucl_copy(dev_quat,host_quat,_nall*4,true);
+      quat.update_device(_nall*4,true);
       _quat_avail=true;
     }
   }
@@ -363,29 +351,23 @@ class Atom {
   inline double max_gpu_bytes() 
     { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } 
 
+  /// Returns true if the device is addressing memory on the host
+  inline bool host_view() { return _host_view; }
+
   // ------------------------------ DATA ----------------------------------
 
   /// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type
-  UCL_D_Vec<numtyp> dev_x;
+  UCL_Vector<numtyp,numtyp> x;
   /// Charges
-  UCL_D_Vec<numtyp> dev_q;
+  UCL_Vector<numtyp,numtyp> q;
   /// Quaterions
-  UCL_D_Vec<numtyp> dev_quat;
+  UCL_Vector<numtyp,numtyp> quat;
   
   #ifdef GPU_CAST
-  UCL_D_Vec<double> dev_x_cast;
-  UCL_D_Vec<int> dev_type_cast;
-  UCL_H_Vec<double> host_x_cast;
-  UCL_H_Vec<int> host_type_cast;
+  UCL_Vector<double,double> x_cast;
+  UCL_Vector<int,int> type_cast;
   #endif
 
-  /// Buffer for moving positions to device
-  UCL_H_Vec<numtyp> host_x;
-  /// Buffer for moving charge data to GPU
-  UCL_H_Vec<numtyp> host_q;
-  /// Buffer for moving quat data to GPU
-  UCL_H_Vec<numtyp> host_quat;
-  
   /// Cell list identifiers for device nbor builds
   UCL_D_Vec<unsigned> dev_cell_id;
   /// Cell list identifiers for device nbor builds
@@ -418,9 +400,9 @@ class Atom {
 
   bool alloc(const int nall);
   
-  bool _allocated, _rot, _charge, _other;
+  bool _allocated, _rot, _charge, _bonds, _other;
   int _max_atoms, _nall, _gpu_nbor;
-  bool _bonds;
+  bool _host_view;
   double _time_cast, _time_transfer;
   
   double _max_gpu_bytes;
@@ -434,3 +416,4 @@ class Atom {
 }
 
 #endif
+
diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp
index 553352e84b..f88c4417af 100644
--- a/lib/gpu/lal_base_atomic.cpp
+++ b/lib/gpu/lal_base_atomic.cpp
@@ -40,10 +40,10 @@ int BaseAtomicT::bytes_per_atom_atomic(const int max_nbors) const {
 
 template <class numtyp, class acctyp>
 int BaseAtomicT::init_atomic(const int nlocal, const int nall,
-                                  const int max_nbors, const int maxspecial,
-                                  const double cell_size,
-                                  const double gpu_split, FILE *_screen,
-                                  const char *pair_program) {
+                             const int max_nbors, const int maxspecial,
+                             const double cell_size, const double gpu_split,
+                             FILE *_screen, const void *pair_program,
+                             const char *k_name) {
   screen=_screen;
 
   int gpu_nbor=0;
@@ -74,7 +74,7 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
   atom=&device->atom;
 
   _block_size=device->pair_block_size();
-  compile_kernels(*ucl_device,pair_program);
+  compile_kernels(*ucl_device,pair_program,k_name);
 
   // Initialize host-device load balancer
   hd_balancer.init(device,gpu_nbor,gpu_split);
@@ -83,7 +83,7 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
   time_pair.init(*ucl_device);
   time_pair.zero();
 
-  pos_tex.bind_float(atom->dev_x,4);
+  pos_tex.bind_float(atom->x,4);
 
   _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
 
@@ -266,18 +266,20 @@ double BaseAtomicT::host_memory_usage_atomic() const {
 }
 
 template <class numtyp, class acctyp>
-void BaseAtomicT::compile_kernels(UCL_Device &dev, const char *pair_str) {
+void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str,
+                                  const char *kname) {
   if (_compiled)
     return;
 
+  std::string s_fast=std::string(kname)+"_fast";
   std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
                     std::string(OCL_PRECISION_COMPILE)+" -D"+
                     std::string(OCL_VENDOR);
 
   pair_program=new UCL_Program(dev);
   pair_program->load_string(pair_str,flags.c_str());
-  k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
-  k_pair.set_function(*pair_program,"kernel_pair");
+  k_pair_fast.set_function(*pair_program,s_fast.c_str());
+  k_pair.set_function(*pair_program,kname);
   pos_tex.get_texture(*pair_program,"pos_tex");
 
   _compiled=true;
diff --git a/lib/gpu/lal_base_atomic.h b/lib/gpu/lal_base_atomic.h
index 7e9a911385..74c8530f7f 100644
--- a/lib/gpu/lal_base_atomic.h
+++ b/lib/gpu/lal_base_atomic.h
@@ -20,8 +20,10 @@
 #include "lal_balance.h"
 #include "mpi.h"
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "geryon/ocl_texture.h"
+#elif defined(USE_CUDART)
+#include "geryon/nvc_texture.h"
 #else
 #include "geryon/nvd_texture.h"
 #endif
@@ -38,6 +40,7 @@ class BaseAtomic {
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
+    * \param k_name name for the kernel for force calculation
     * 
     * Returns:
     * -  0 if successfull
@@ -48,7 +51,7 @@ class BaseAtomic {
   int init_atomic(const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size, 
                   const double gpu_split, FILE *screen, 
-                  const char *pair_program);
+                  const void *pair_program, const char *k_name);
 
   /// Estimate the overhead for GPU context changes and CPU driver
   void estimate_gpu_overhead();
@@ -57,7 +60,7 @@ class BaseAtomic {
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int inum, const int nall, bool &success) {
     if (atom->resize(nall, success))
-      pos_tex.bind_float(atom->dev_x,4);
+      pos_tex.bind_float(atom->x,4);
     ans->resize(inum,success);
   }
 
@@ -188,7 +191,7 @@ class BaseAtomic {
   double _gpu_overhead, _driver_overhead;
   UCL_D_Vec<int> *_nbor_data;
 
-  void compile_kernels(UCL_Device &dev, const char *pair_string);
+  void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
 
   virtual void loop(const bool _eflag, const bool _vflag) = 0;
 };
diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp
index 3ac63666b0..f9bb2a52f3 100644
--- a/lib/gpu/lal_base_charge.cpp
+++ b/lib/gpu/lal_base_charge.cpp
@@ -41,10 +41,10 @@ int BaseChargeT::bytes_per_atom_atomic(const int max_nbors) const {
 
 template <class numtyp, class acctyp>
 int BaseChargeT::init_atomic(const int nlocal, const int nall,
-                                  const int max_nbors, const int maxspecial,
-                                  const double cell_size,
-                                  const double gpu_split, FILE *_screen,
-                                  const char *pair_program) {
+                             const int max_nbors, const int maxspecial,
+                             const double cell_size, const double gpu_split,
+                             FILE *_screen, const void *pair_program,
+                             const char *k_name) {
   screen=_screen;
 
   int gpu_nbor=0;
@@ -76,7 +76,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
 
   _block_size=device->pair_block_size();
   _block_bio_size=device->block_bio_pair();
-  compile_kernels(*ucl_device,pair_program);
+  compile_kernels(*ucl_device,pair_program,k_name);
 
   // Initialize host-device load balancer
   hd_balancer.init(device,gpu_nbor,gpu_split);
@@ -85,8 +85,8 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
   time_pair.init(*ucl_device);
   time_pair.zero();
 
-  pos_tex.bind_float(atom->dev_x,4);
-  q_tex.bind_float(atom->dev_q,1);
+  pos_tex.bind_float(atom->x,4);
+  q_tex.bind_float(atom->q,1);
 
   _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
 
@@ -282,18 +282,20 @@ double BaseChargeT::host_memory_usage_atomic() const {
 }
 
 template <class numtyp, class acctyp>
-void BaseChargeT::compile_kernels(UCL_Device &dev, const char *pair_str) {
+void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
+                                  const char *kname) {
   if (_compiled)
     return;
 
+  std::string s_fast=std::string(kname)+"_fast";
   std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
                     std::string(OCL_PRECISION_COMPILE)+" -D"+
                     std::string(OCL_VENDOR);
 
   pair_program=new UCL_Program(dev);
   pair_program->load_string(pair_str,flags.c_str());
-  k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
-  k_pair.set_function(*pair_program,"kernel_pair");
+  k_pair_fast.set_function(*pair_program,s_fast.c_str());
+  k_pair.set_function(*pair_program,kname);
   pos_tex.get_texture(*pair_program,"pos_tex");
   q_tex.get_texture(*pair_program,"q_tex");
 
diff --git a/lib/gpu/lal_base_charge.h b/lib/gpu/lal_base_charge.h
index a0a42be671..3ca4705177 100644
--- a/lib/gpu/lal_base_charge.h
+++ b/lib/gpu/lal_base_charge.h
@@ -21,8 +21,10 @@
 #include "lal_balance.h"
 #include "mpi.h"
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "geryon/ocl_texture.h"
+#elif defined(USE_CUDART)
+#include "geryon/nvc_texture.h"
 #else
 #include "geryon/nvd_texture.h"
 #endif
@@ -39,6 +41,7 @@ class BaseCharge {
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
+    * \param k_name name for the kernel for force calculation
     * 
     * Returns:
     * -  0 if successfull
@@ -49,7 +52,7 @@ class BaseCharge {
   int init_atomic(const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size,
                   const double gpu_split, FILE *screen,
-                  const char *pair_program);
+                  const void *pair_program, const char *k_name);
 
   /// Estimate the overhead for GPU context changes and CPU driver
   void estimate_gpu_overhead();
@@ -58,8 +61,8 @@ class BaseCharge {
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int inum, const int nall, bool &success) {
     if (atom->resize(nall, success)) {
-      pos_tex.bind_float(atom->dev_x,4);
-      q_tex.bind_float(atom->dev_q,1);
+      pos_tex.bind_float(atom->x,4);
+      q_tex.bind_float(atom->q,1);
     }
     ans->resize(inum,success);
   }
@@ -187,7 +190,7 @@ class BaseCharge {
   double _gpu_overhead, _driver_overhead;
   UCL_D_Vec<int> *_nbor_data;
 
-  void compile_kernels(UCL_Device &dev, const char *pair_string);
+  void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
 
   virtual void loop(const bool _eflag, const bool _vflag) = 0;
 };
diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp
index ee43cd8f75..7e86d03e50 100644
--- a/lib/gpu/lal_base_ellipsoid.cpp
+++ b/lib/gpu/lal_base_ellipsoid.cpp
@@ -17,10 +17,12 @@
 #include <cstdlib>
 using namespace LAMMPS_AL;
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "ellipsoid_nbor_cl.h"
+#elif defined(USE_CUDART)
+const char *ellipsoid_nbor=0;
 #else
-#include "ellipsoid_nbor_ptx.h"
+#include "ellipsoid_nbor_cubin.h"
 #endif
 
 #define BaseEllipsoidT BaseEllipsoid<numtyp, acctyp>
@@ -50,8 +52,9 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
                               const int max_nbors, const int maxspecial,
                               const double cell_size, const double gpu_split,
                               FILE *_screen, const int ntypes, int **h_form,
-                              const char *ellipsoid_program,
-                              const char *lj_program, const bool ellip_sphere) {
+                              const void *ellipsoid_program,
+                              const void *lj_program, const char *k_name,
+                              const bool ellip_sphere) {
   screen=_screen;
   _ellipsoid_sphere=ellip_sphere;
 
@@ -78,7 +81,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
   atom=&device->atom;
 
   _block_size=device->pair_block_size();
-  compile_kernels(*ucl_device,ellipsoid_program,lj_program,ellip_sphere);
+  compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere);
 
   // Initialize host-device load balancer
   hd_balancer.init(device,gpu_nbor,gpu_split);
@@ -112,7 +115,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
   }
   
   if (_multiple_forms)
-    ans->dev_ans.zero();
+    ans->force.zero();
 
   // Memory for ilist ordered by particle type
   if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
@@ -121,6 +124,12 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
 
   _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
 
+  neigh_tex.bind_float(atom->x,4);
+  pos_tex.bind_float(atom->x,4);
+  quat_tex.bind_float(atom->quat,4);
+  lj_pos_tex.bind_float(atom->x,4);
+  lj_quat_tex.bind_float(atom->quat,4);
+
   return 0;
 }
 
@@ -241,14 +250,12 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
   int stride=nbor->nbor_pitch();
   if (shared_types) {
     k_nbor_fast.set_size(GX,BX);
-    k_nbor_fast.run(&atom->dev_x.begin(), &cut_form.begin(), 
-                    &nbor->dev_nbor.begin(), &stride, &start, &inum,
-                    &nbor->dev_packed.begin(), &form_low, &form_high);
+    k_nbor_fast.run(&atom->x, &cut_form, &nbor->dev_nbor, &stride, &start,
+                    &inum, &nbor->dev_packed, &form_low, &form_high);
   } else {
     k_nbor.set_size(GX,BX);
-    k_nbor.run(&atom->dev_x.begin(), &cut_form.begin(), &ntypes,
-               &nbor->dev_nbor.begin(), &stride, &start, &inum, 
-               &nbor->dev_packed.begin(), &form_low, &form_high);
+    k_nbor.run(&atom->x, &cut_form, &ntypes, &nbor->dev_nbor, &stride,
+               &start, &inum, &nbor->dev_packed, &form_low, &form_high);
   }
 }
 
@@ -437,11 +444,18 @@ double BaseEllipsoidT::host_memory_usage_base() const {
 
 template <class numtyp, class acctyp>
 void BaseEllipsoidT::compile_kernels(UCL_Device &dev, 
-                                     const char *ellipsoid_string,
-                                     const char *lj_string, const bool e_s) {
+                                     const void *ellipsoid_string,
+                                     const void *lj_string, 
+                                     const char *kname, const bool e_s) {
   if (_compiled)
     return;
 
+  std::string kns=kname;
+  std::string s_sphere_ellipsoid=kns+"_sphere_ellipsoid";
+  std::string s_ellipsoid_sphere=kns+"_ellipsoid_sphere";
+  std::string s_lj=kns+"_lj";
+  std::string s_lj_fast=kns+"_lj_fast";
+
   std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
                     std::string(OCL_PRECISION_COMPILE)+" -D"+
                     std::string(OCL_VENDOR);
@@ -450,18 +464,23 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
   nbor_program->load_string(ellipsoid_nbor,flags.c_str());
   k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast");
   k_nbor.set_function(*nbor_program,"kernel_nbor");
+  neigh_tex.get_texture(*nbor_program,"pos_tex");
 
   ellipsoid_program=new UCL_Program(dev);
   ellipsoid_program->load_string(ellipsoid_string,flags.c_str());
-  k_ellipsoid.set_function(*ellipsoid_program,"kernel_ellipsoid");
+  k_ellipsoid.set_function(*ellipsoid_program,kname);
+  pos_tex.get_texture(*ellipsoid_program,"pos_tex");
+  quat_tex.get_texture(*ellipsoid_program,"quat_tex");
 
   lj_program=new UCL_Program(dev);
   lj_program->load_string(lj_string,flags.c_str());
-  k_sphere_ellipsoid.set_function(*lj_program,"kernel_sphere_ellipsoid");
-  k_lj_fast.set_function(*lj_program,"kernel_lj_fast");
-  k_lj.set_function(*lj_program,"kernel_lj");
+  k_sphere_ellipsoid.set_function(*lj_program,s_sphere_ellipsoid.c_str());
+  k_lj_fast.set_function(*lj_program,s_lj_fast.c_str());
+  k_lj.set_function(*lj_program,s_lj.c_str());
   if (e_s)
-    k_ellipsoid_sphere.set_function(*lj_program,"kernel_ellipsoid_sphere");
+    k_ellipsoid_sphere.set_function(*lj_program,s_ellipsoid_sphere.c_str());
+  lj_pos_tex.get_texture(*lj_program,"pos_tex");
+  lj_quat_tex.get_texture(*lj_program,"quat_tex");
 
   _compiled=true;
 }
diff --git a/lib/gpu/lal_base_ellipsoid.h b/lib/gpu/lal_base_ellipsoid.h
index 7ccf5691d0..96e2e3ee50 100644
--- a/lib/gpu/lal_base_ellipsoid.h
+++ b/lib/gpu/lal_base_ellipsoid.h
@@ -20,8 +20,10 @@
 #include "lal_balance.h"
 #include "mpi.h"
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "geryon/ocl_texture.h"
+#elif defined(USE_CUDART)
+#include "geryon/nvc_texture.h"
 #else
 #include "geryon/nvd_texture.h"
 #endif
@@ -39,6 +41,7 @@ class BaseEllipsoid {
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
     * \param ellipsoid_sphere true if ellipsoid-sphere case handled separately
+    * \param k_name name for the kernel for force calculation
     * 
     * Returns:
     * -  0 if successfull
@@ -49,8 +52,9 @@ class BaseEllipsoid {
   int init_base(const int nlocal, const int nall, const int max_nbors,
                 const int maxspecial, const double cell_size,
                 const double gpu_split, FILE *screen, const int ntypes,
-                int **h_form, const char *ellipsoid_program,
-                const char *lj_program, const bool ellipsoid_sphere=false);
+                int **h_form, const void *ellipsoid_program,
+                const void *lj_program, const char *k_name,
+                const bool ellipsoid_sphere=false);
 
   /// Estimate the overhead for GPU context changes and CPU driver
   void estimate_gpu_overhead();
@@ -58,7 +62,13 @@ class BaseEllipsoid {
   /// Check if there is enough storage for atom arrays and realloc if not
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int nall, bool &success) {
-    atom->resize(nall, success);
+    if (atom->resize(nall, success)) {
+      neigh_tex.bind_float(atom->x,4);
+      pos_tex.bind_float(atom->x,4);
+      quat_tex.bind_float(atom->quat,4);
+      lj_pos_tex.bind_float(atom->x,4);
+      lj_quat_tex.bind_float(atom->quat,4);
+    }      
   }
 
   /// Check if there is enough storage for neighbors and realloc if not
@@ -74,7 +84,7 @@ class BaseEllipsoid {
                            const int max_nbors, const int olist_size,
                            bool &success) {
     ans->resize(nlocal, success);
-    if (_multiple_forms) ans->dev_ans.zero();
+    if (_multiple_forms) ans->force.zero();
 
     if (olist_size>static_cast<int>(host_olist.numel())) {
       host_olist.clear();
@@ -221,8 +231,7 @@ class BaseEllipsoid {
   inline int block_size() { return _block_size; }
 
   // --------------------------- TEXTURES -----------------------------
-  UCL_Texture pos_tex;
-  UCL_Texture q_tex;
+  UCL_Texture pos_tex, quat_tex, lj_pos_tex, lj_quat_tex, neigh_tex;
 
  protected:
   bool _compiled, _ellipsoid_sphere;
@@ -236,8 +245,8 @@ class BaseEllipsoid {
   int **_host_form;
   int _last_ellipse, _max_last_ellipse;
 
-  void compile_kernels(UCL_Device &dev, const char *ellipsoid_string,
-                       const char *lj_string, const bool e_s);
+  void compile_kernels(UCL_Device &dev, const void *ellipsoid_string,
+                       const void *lj_string, const char *kname,const bool e_s);
 
   virtual void loop(const bool _eflag, const bool _vflag) = 0;
 };
diff --git a/lib/gpu/lal_buck.cpp b/lib/gpu/lal_buck.cpp
index 33b73568be..091ae0f62a 100644
--- a/lib/gpu/lal_buck.cpp
+++ b/lib/gpu/lal_buck.cpp
@@ -13,10 +13,12 @@
     email                : nguyentd@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "buck_cl.h"
+#elif defined(USE_CUDART)
+const char *buck=0;
 #else
-#include "buck_ptx.h"
+#include "buck_cubin.h"
 #endif
 
 #include "lal_buck.h"
@@ -50,7 +52,7 @@ int BuckT::init(const int ntypes, double **host_cutsq,
            const double gpu_split, FILE *_screen) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,buck);
+                            _screen,buck,"k_buck");
   if (success!=0)
     return success;
 
@@ -132,20 +134,17 @@ void BuckT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(),
-                          &coeff2.begin(), &sp_lj.begin(),
-                          &this->nbor->dev_nbor.begin(),
-                          &this->_nbor_data->begin(),
-                          &this->ans->dev_ans.begin(),
-                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &nbor_pitch, &this->_threads_per_atom);
+    this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag, 
+                          &vflag, &ainum, &nbor_pitch, 
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(),
-                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &nbor_pitch, &this->_threads_per_atom);
+    this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                     &this->ans->force, &this->ans->engv, &eflag, &vflag,
+                     &ainum, &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/lal_buck.cu b/lib/gpu/lal_buck.cu
index 1281fef645..b0c817ad35 100644
--- a/lib/gpu/lal_buck.cu
+++ b/lib/gpu/lal_buck.cu
@@ -15,14 +15,16 @@
 
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
-texture<float4> pos_tex;
 #ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
+texture<float4> pos_tex;
+#else
+texture<int4,1> pos_tex;
 #endif
+#else
+#define pos_tex x_
 #endif
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
+__kernel void k_buck(__global numtyp4 *x_, __global numtyp4 *coeff1,
                           __global numtyp4* coeff2, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
@@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
@@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -104,7 +106,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
   } // if ii
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
+__kernel void k_buck_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
                                __global numtyp4* coeff2_in, 
                                __global numtyp* sp_lj_in, 
                                __global int *dev_nbor, __global int *dev_packed, 
@@ -140,7 +142,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -151,7 +153,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12
diff --git a/lib/gpu/lal_buck_coul.cpp b/lib/gpu/lal_buck_coul.cpp
index 42dbfb3e76..75e7231027 100644
--- a/lib/gpu/lal_buck_coul.cpp
+++ b/lib/gpu/lal_buck_coul.cpp
@@ -13,10 +13,12 @@
     email                : nguyentd@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "buck_coul_cl.h"
+#elif defined(USE_CUDART)
+const char *buck_coul=0;
 #else
-#include "buck_coul_ptx.h"
+#include "buck_coul_cubin.h"
 #endif
 
 #include "lal_buck_coul.h"
@@ -52,7 +54,7 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq,
                    const double qqrd2e) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,buck_coul);
+                            _screen,buck_coul,"k_buck_coul");
   if (success!=0)
     return success;
 
@@ -142,23 +144,18 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(),
-                          &coeff2.begin(), &sp_lj.begin(),
-                          &this->nbor->dev_nbor.begin(),
-                          &this->_nbor_data->begin(),
-                          &this->ans->dev_ans.begin(),
-                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &nbor_pitch,
-                          &this->atom->dev_q.begin(), &cutsq.begin(),
-                          &_qqrd2e, &this->_threads_per_atom);
+    this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag,
+                          &vflag, &ainum, &nbor_pitch, &this->atom->q, 
+                          &cutsq, &_qqrd2e, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(),
-                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &nbor_pitch, &this->atom->dev_q.begin(),
-                     &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
+    this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                     &this->ans->force, &this->ans->engv, &eflag, &vflag,
+                     &ainum, &nbor_pitch, &this->atom->q,
+                     &cutsq, &_qqrd2e, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/lal_buck_coul.cu b/lib/gpu/lal_buck_coul.cu
index df4a824b40..45cc36b0ce 100644
--- a/lib/gpu/lal_buck_coul.cu
+++ b/lib/gpu/lal_buck_coul.cu
@@ -14,18 +14,22 @@
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
+
 #include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
 texture<float4> pos_tex;
 texture<float> q_tex;
-#ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
-ucl_inline float fetch_q(const int& i, const float *q) 
-  { return tex1Dfetch(q_tex, i); }
-#endif
+#else
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
 #endif
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
+#else
+#define pos_tex x_
+#define q_tex q_
+#endif
+
+__kernel void k_buck_coul(__global numtyp4 *x_, __global numtyp4 *coeff1,
                           __global numtyp4* coeff2, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
@@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
     
     for ( ; nbor<list_end; nbor+=n_stride) {
@@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
       factor_coul = sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -97,9 +101,10 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
         } else
           forcebuck = (numtyp)0.0;
         
-        if (rsq < coeff2[mtype].z)  // coul
-          forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
-        else
+        if (rsq < coeff2[mtype].z) {
+          fetch(forcecoul,j,q_tex);
+          forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
+        } else
           forcecoul = (numtyp)0.0;
         
         force = (forcebuck + forcecoul) * r2inv;
@@ -131,7 +136,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
   } // if ii
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
+__kernel void k_buck_coul_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
                                __global numtyp4* coeff2_in, 
                                __global numtyp* sp_lj_in, 
                                __global int *dev_nbor, __global int *dev_packed, 
@@ -172,8 +177,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -185,7 +190,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
       factor_coul = sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12
@@ -208,9 +213,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
         } else
           forcebuck = (numtyp)0.0;
         
-        if (rsq < cutsq[mtype].z)  // coul
-          forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
-        else
+        if (rsq < cutsq[mtype].z) {
+          fetch(forcecoul,j,q_tex);
+          forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
+        } else
           forcecoul = (numtyp)0.0;
         
         force = (forcebuck + forcecoul) * r2inv;
diff --git a/lib/gpu/lal_buck_coul_long.cpp b/lib/gpu/lal_buck_coul_long.cpp
index c47f46d1ba..01f5cd1bea 100644
--- a/lib/gpu/lal_buck_coul_long.cpp
+++ b/lib/gpu/lal_buck_coul_long.cpp
@@ -13,10 +13,12 @@
     email                : nguyentd@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "buck_coul_long_cl.h"
+#elif defined(USE_CUDART)
+const char *buck_coul_long=0;
 #else
-#include "buck_coul_long_ptx.h"
+#include "buck_coul_long_cubin.h"
 #endif
 
 #include "lal_buck_coul_long.h"
@@ -54,7 +56,7 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
                        const double g_ewald) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,buck_coul_long);
+                            _screen,buck_coul_long,"k_buck_coul_long");
   if (success!=0)
     return success;
 
@@ -145,25 +147,19 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(),
-                          &coeff2.begin(), &sp_lj.begin(),
-                          &this->nbor->dev_nbor.begin(),
-                          &this->_nbor_data->begin(),
-                          &this->ans->dev_ans.begin(),
-                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &nbor_pitch, &this->atom->dev_q.begin(),
-                          &cutsq.begin(), &_cut_coulsq, &_qqrd2e, 
+    this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag, 
+                          &vflag, &ainum, &nbor_pitch, &this->atom->q,
+                          &cutsq, &_cut_coulsq, &_qqrd2e, 
                           &_g_ewald, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), 
-                   &coeff2.begin(), &_lj_types, &sp_lj.begin(), 
-                   &this->nbor->dev_nbor.begin(),
-                   &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                   &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                   &nbor_pitch, &this->atom->dev_q.begin(), 
-                   &cutsq.begin(), &_cut_coulsq,
-                   &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
+    this->k_pair.run(&this->atom->x, &coeff1,  &coeff2, &_lj_types, &sp_lj, 
+                   &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                   &this->ans->force, &this->ans->engv, &eflag, &vflag,
+                   &ainum, &nbor_pitch, &this->atom->q, &cutsq, 
+                   &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/lal_buck_coul_long.cu b/lib/gpu/lal_buck_coul_long.cu
index 95c13dc263..9ab3ed32c5 100644
--- a/lib/gpu/lal_buck_coul_long.cu
+++ b/lib/gpu/lal_buck_coul_long.cu
@@ -14,18 +14,22 @@
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
+
 #include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
 texture<float4> pos_tex;
 texture<float> q_tex;
-#ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
-ucl_inline float fetch_q(const int& i, const float *q) 
-  { return tex1Dfetch(q_tex, i); }
-#endif
+#else
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
 #endif
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
+#else
+#define pos_tex x_
+#define q_tex q_
+#endif
+
+__kernel void k_buck_coul_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
                           __global numtyp4* coeff2, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
@@ -62,8 +66,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
     for ( ; nbor<list_end; nbor+=n_stride) {
@@ -74,7 +78,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
       factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -104,7 +108,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
           numtyp expm2 = ucl_exp(-grij*grij);
           numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
+          fetch(prefactor,j,q_tex);
+          prefactor *= qqrd2e * qtmp/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
         } else
           forcecoul = (numtyp)0.0;
@@ -139,7 +144,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
   } // if ii
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
+__kernel void k_buck_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
                                __global numtyp4* coeff2_in, 
                                __global numtyp* sp_lj_in,
                                __global int *dev_nbor, __global int *dev_packed,
@@ -179,8 +184,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -192,7 +197,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
       factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12
@@ -221,7 +226,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
           numtyp expm2 = ucl_exp(-grij*grij);
           numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
+          fetch(prefactor,j,q_tex);
+          prefactor *= qqrd2e * qtmp/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
         } else
           forcecoul = (numtyp)0.0;
diff --git a/lib/gpu/lal_cg_cmm.cpp b/lib/gpu/lal_cg_cmm.cpp
index 7b798cc446..2d4a44a14c 100644
--- a/lib/gpu/lal_cg_cmm.cpp
+++ b/lib/gpu/lal_cg_cmm.cpp
@@ -13,10 +13,12 @@
     email                : brownw@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "cg_cmm_cl.h"
+#elif defined(USE_CUDART)
+const char *cg_cmm=0;
 #else
-#include "cg_cmm_ptx.h"
+#include "cg_cmm_cubin.h"
 #endif
 
 #include "lal_cg_cmm.h"
@@ -51,7 +53,7 @@ int CGCMMT::init(const int ntypes, double **host_cutsq,
                           const double gpu_split, FILE *_screen) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,cg_cmm);
+                            _screen,cg_cmm,"k_cg_cmm");
   if (success!=0)
     return success;
 
@@ -133,19 +135,17 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
-                          &lj3.begin(), &sp_lj.begin(),
-                          &this->nbor->dev_nbor.begin(),
-                          &this->_nbor_data->begin(),
-                          &this->ans->dev_ans.begin(),
-                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &nbor_pitch, &this->_threads_per_atom);
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag,
+                          &vflag, &ainum, &nbor_pitch,  
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
-                     &_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
+    this->k_pair.run(&this->atom->x, &lj1, &lj3,
+                     &_cmm_types, &sp_lj, &this->nbor->dev_nbor,
+                     &this->_nbor_data->begin(), &this->ans->force,
+                     &this->ans->engv, &eflag, &vflag, &ainum,
                      &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
diff --git a/lib/gpu/lal_cg_cmm.cu b/lib/gpu/lal_cg_cmm.cu
index 46d1cbab50..4543320cc7 100644
--- a/lib/gpu/lal_cg_cmm.cu
+++ b/lib/gpu/lal_cg_cmm.cu
@@ -15,14 +15,16 @@
 
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
-texture<float4> pos_tex;
 #ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
+texture<float4> pos_tex;
+#else
+texture<int4,1> pos_tex;
 #endif
+#else
+#define pos_tex x_
 #endif
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
+__kernel void k_cg_cmm(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
@@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
@@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -109,7 +111,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
+__kernel void k_cg_cmm_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in, 
                                __global numtyp* sp_lj_in,__global int *dev_nbor,
                                __global int *dev_packed, __global acctyp4 *ans,
@@ -145,7 +147,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -156,7 +158,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12
diff --git a/lib/gpu/lal_cg_cmm_long.cpp b/lib/gpu/lal_cg_cmm_long.cpp
index 2c12125b52..50e2977c70 100644
--- a/lib/gpu/lal_cg_cmm_long.cpp
+++ b/lib/gpu/lal_cg_cmm_long.cpp
@@ -13,10 +13,12 @@
     email                : brownw@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "cg_cmm_long_cl.h"
+#elif defined(USE_CUDART)
+const char *cg_cmm_long=0;
 #else
-#include "cg_cmm_long_ptx.h"
+#include "cg_cmm_long_cubin.h"
 #endif
 
 #include "lal_cg_cmm_long.h"
@@ -56,7 +58,7 @@ int CGCMMLongT::init(const int ntypes, double **host_cutsq,
                            const double g_ewald) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,cg_cmm_long);
+                            _screen,cg_cmm_long,"k_cg_cmm_long");
   if (success!=0)
     return success;
 
@@ -144,24 +146,19 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
-                          &lj3.begin(), &sp_lj.begin(),
-                          &this->nbor->dev_nbor.begin(),
-                          &this->_nbor_data->begin(),
-                          &this->ans->dev_ans.begin(),
-                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &nbor_pitch,
-                          &this->atom->dev_q.begin(), &_cut_coulsq,
-                          &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag,
+                          &vflag, &ainum, &nbor_pitch, &this->atom->q, 
+                          &_cut_coulsq, &_qqrd2e, &_g_ewald, 
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
-                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &nbor_pitch, &this->atom->dev_q.begin(),
-                     &_cut_coulsq, &_qqrd2e, &_g_ewald, 
-                     &this->_threads_per_atom);
+    this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, 
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                     &this->ans->force, &this->ans->engv, &eflag, &vflag,
+                     &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq,
+                     &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/lal_cg_cmm_long.cu b/lib/gpu/lal_cg_cmm_long.cu
index b0db9d2aa3..2f1d9f2d21 100644
--- a/lib/gpu/lal_cg_cmm_long.cu
+++ b/lib/gpu/lal_cg_cmm_long.cu
@@ -14,18 +14,22 @@
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
+
 #include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
 texture<float4> pos_tex;
 texture<float> q_tex;
-#ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
-ucl_inline float fetch_q(const int& i, const float *q) 
-  { return tex1Dfetch(q_tex, i); }
-#endif
+#else
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
 #endif
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
+#else
+#define pos_tex x_
+#define q_tex q_
+#endif
+
+__kernel void k_cg_cmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
@@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
     for ( ; nbor<list_end; nbor+=n_stride) {
@@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -108,7 +112,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
           numtyp expm2 = ucl_exp(-grij*grij);
           numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
+          fetch(prefactor,j,q_tex);
+          prefactor *= qqrd2e * qtmp/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
         } else
           forcecoul = (numtyp)0.0;
@@ -143,7 +148,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
+__kernel void k_cg_cmm_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in, 
                                __global numtyp* sp_lj_in, 
                                __global int *dev_nbor, __global int *dev_packed,
@@ -181,8 +186,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -194,7 +199,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12
@@ -228,7 +233,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
           numtyp expm2 = ucl_exp(-grij*grij);
           numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
+          fetch(prefactor,j,q_tex);
+          prefactor *= qqrd2e * qtmp/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
         } else
           forcecoul = (numtyp)0.0;
diff --git a/lib/gpu/lal_charmm_long.cpp b/lib/gpu/lal_charmm_long.cpp
index d260230815..55a7f64be0 100644
--- a/lib/gpu/lal_charmm_long.cpp
+++ b/lib/gpu/lal_charmm_long.cpp
@@ -13,10 +13,12 @@
     email                : brownw@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "charmm_long_cl.h"
+#elif defined(USE_CUDART)
+const char *charmm_long=0;
 #else
-#include "charmm_long_ptx.h"
+#include "charmm_long_cubin.h"
 #endif
 
 #include "lal_charmm_long.h"
@@ -57,7 +59,7 @@ int CHARMMLongT::init(const int ntypes,
                            double **sigma, const bool mix_arithmetic) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,charmm_long);
+                            _screen,charmm_long,"k_charmm_long");
   if (success!=0)
     return success;
 
@@ -148,22 +150,19 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
-                          &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                          &this->_nbor_data->begin(),
-                          &this->ans->dev_ans.begin(),
-                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &nbor_pitch, &this->atom->dev_q.begin(),
+    this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj, 
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag,
+                          &vflag, &ainum, &nbor_pitch, &this->atom->q,
                           &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
                           &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, 
                           &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
-                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &nbor_pitch, &this->atom->dev_q.begin(),
+    this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj, 
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                     &this->ans->force, &this->ans->engv, &eflag, &vflag,
+                     &ainum, &nbor_pitch, &this->atom->q,
                      &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
                      &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
                      &this->_threads_per_atom);
diff --git a/lib/gpu/lal_charmm_long.cu b/lib/gpu/lal_charmm_long.cu
index aa88967d66..9b884e473a 100644
--- a/lib/gpu/lal_charmm_long.cu
+++ b/lib/gpu/lal_charmm_long.cu
@@ -14,18 +14,22 @@
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
+
 #include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
 texture<float4> pos_tex;
 texture<float> q_tex;
-#ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
-ucl_inline float fetch_q(const int& i, const float *q) 
-  { return tex1Dfetch(q_tex, i); }
-#endif
+#else
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
 #endif
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
+#else
+#define pos_tex x_
+#define q_tex q_
+#endif
+
+__kernel void k_charmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
                           const int lj_types, __global numtyp *sp_lj_in,
                           __global int *dev_nbor, __global int *dev_packed,
                           __global acctyp4 *ans, __global acctyp *engv, 
@@ -62,8 +66,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
     for ( ; nbor<list_end; nbor+=n_stride) {
@@ -74,7 +78,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -110,7 +114,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
           numtyp expm2 = ucl_exp(-grij*grij);
           numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
+          fetch(prefactor,j,q_tex);
+          prefactor *= qqrd2e * qtmp/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
         } else
           forcecoul = (numtyp)0.0;
@@ -147,7 +152,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
+__kernel void k_charmm_long_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
                                __global numtyp* sp_lj_in, __global int *dev_nbor, 
                                __global int *dev_packed, __global acctyp4 *ans,
                                __global acctyp *engv, const int eflag,
@@ -185,8 +190,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
     for ( ; nbor<list_end; nbor+=n_stride) {
@@ -197,7 +202,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
       factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -239,7 +244,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
           numtyp expm2 = ucl_exp(-grij*grij);
           numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
+          fetch(prefactor,j,q_tex);
+          prefactor *= qqrd2e * qtmp/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
         } else
           forcecoul = (numtyp)0.0;
diff --git a/lib/gpu/lal_coul_long.cpp b/lib/gpu/lal_coul_long.cpp
index 90c07246fa..b5e6f670b3 100644
--- a/lib/gpu/lal_coul_long.cpp
+++ b/lib/gpu/lal_coul_long.cpp
@@ -13,10 +13,12 @@
     email                : a.kohlmeyer@temple.edu
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "coul_long_cl.h"
+#elif defined(USE_CUDART)
+const char *coul_long=0;
 #else
-#include "coul_long_ptx.h"
+#include "coul_long_cubin.h"
 #endif
 
 #include "lal_coul_long.h"
@@ -48,7 +50,7 @@ int CoulLongT::init(const int nlocal, const int nall, const int max_nbors,
 			 const double qqrd2e, const double g_ewald) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
-			    gpu_split,_screen,coul_long);
+			                      gpu_split,_screen,coul_long,"k_coul_long");
   if (success!=0)
     return success;
 
@@ -132,22 +134,18 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
-                          &lj3.begin(), &sp_cl.begin(),
-                          &this->nbor->dev_nbor.begin(),
-                          &this->_nbor_data->begin(),
-                          &this->ans->dev_ans.begin(),
-                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &nbor_pitch, &this->atom->dev_q.begin(),
-                          &_cut_coulsq, &_qqrd2e, &_g_ewald,
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_cl,
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv,
+                          &eflag, &vflag, &ainum, &nbor_pitch, 
+                          &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald,
                           &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
-                     &_lj_types, &sp_cl.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
+    this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_cl,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                     &this->ans->force, &this->ans->engv, &eflag, &vflag,
+                     &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq,
                      &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
   }
   this->time_pair.stop();
diff --git a/lib/gpu/lal_coul_long.cu b/lib/gpu/lal_coul_long.cu
index 88d7406206..b93010c959 100644
--- a/lib/gpu/lal_coul_long.cu
+++ b/lib/gpu/lal_coul_long.cu
@@ -14,18 +14,22 @@
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
+
 #include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
 texture<float4> pos_tex;
 texture<float> q_tex;
-#ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
-ucl_inline float fetch_q(const int& i, const float *q) 
-  { return tex1Dfetch(q_tex, i); }
-#endif
+#else
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
 #endif
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
+#else
+#define pos_tex x_
+#define q_tex q_
+#endif
+
+__kernel void k_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types,
                           __global numtyp *sp_cl_in, __global int *dev_nbor,
                           __global int *dev_packed, __global acctyp4 *ans,
@@ -56,8 +60,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
 
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
@@ -66,7 +70,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
@@ -75,16 +79,17 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
       if (rsq < cut_coulsq) {
-	numtyp r2inv=ucl_recip(rsq);
-	numtyp force, prefactor, _erfc;
+        numtyp r2inv=ucl_recip(rsq);
+        numtyp force, prefactor, _erfc;
 
-	numtyp r = ucl_rsqrt(r2inv);
-	numtyp grij = g_ewald * r;
-	numtyp expm2 = ucl_exp(-grij*grij);
-	numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
-	_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-	prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
-	force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
+        numtyp r = ucl_rsqrt(r2inv);
+        numtyp grij = g_ewald * r;
+        numtyp expm2 = ucl_exp(-grij*grij);
+        numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
+        _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+        fetch(prefactor,j,q_tex);
+        prefactor *= qqrd2e * qtmp/r;
+        force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
 
         f.x+=delx*force;
         f.y+=dely*force;
@@ -162,7 +167,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
+__kernel void k_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in,
                                __global numtyp* sp_cl_in,
                                __global int *dev_nbor, __global int *dev_packed,
@@ -193,8 +198,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
 
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
@@ -203,7 +208,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
@@ -212,16 +217,17 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
       if (rsq < cut_coulsq) {
-	numtyp r2inv=ucl_recip(rsq);
-	numtyp force, prefactor, _erfc;
+        numtyp r2inv=ucl_recip(rsq);
+        numtyp force, prefactor, _erfc;
 
-	numtyp r = ucl_rsqrt(r2inv);
-	numtyp grij = g_ewald * r;
-	numtyp expm2 = ucl_exp(-grij*grij);
-	numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
-	_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-	prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
-	force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
+        numtyp r = ucl_rsqrt(r2inv);
+        numtyp grij = g_ewald * r;
+        numtyp expm2 = ucl_exp(-grij*grij);
+        numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
+        _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+        fetch(prefactor,j,q_tex);
+        prefactor *= qqrd2e * qtmp/r;
+        force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
 
         f.x+=delx*force;
         f.y+=dely*force;
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index 24fdccd51c..3952495393 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -21,10 +21,12 @@
 #include <omp.h>
 #endif
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "device_cl.h"
+#elif defined(USE_CUDART)
+const char *device=0;
 #else
-#include "device_ptx.h"
+#include "device_cubin.h"
 #endif
 
 using namespace LAMMPS_AL;
@@ -42,10 +44,10 @@ DeviceT::~Device() {
 }
 
 template <class numtyp, class acctyp>
-int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, 
-                                const int first_gpu, const int last_gpu,
-                                const int gpu_mode, const double p_split,
-                                const int nthreads, const int t_per_atom) {
+int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
+                         const int last_gpu, const int gpu_mode, 
+                         const double p_split, const int nthreads, 
+                         const int t_per_atom, const double cell_size) {
   _nthreads=nthreads;
   #ifdef _OPENMP
   omp_set_num_threads(nthreads);
@@ -62,6 +64,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica,
   _last_device=last_gpu;
   _gpu_mode=gpu_mode;
   _particle_split=p_split;
+  _cell_size=cell_size;
 
   // Get the rank/size within the world
   MPI_Comm_rank(_comm_world,&_world_me);
@@ -191,7 +194,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
   } else {
     if (atom.charge()==false && charge)
       _data_in_estimate++;
-    if (atom.quat()==false && rot)
+    if (atom.quaternion()==false && rot)
       _data_in_estimate++;
     if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial))
       return -3;
@@ -205,7 +208,10 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
                   _block_cell_id, _block_nbor_build, threads_per_atom,
                   _warp_size, _time_device))
     return -3;
-  nbor->cell_size(cell_size);
+  if (_cell_size<0.0)
+    nbor->cell_size(cell_size,cell_size);
+  else
+    nbor->cell_size(_cell_size,cell_size);
 
   _init_count++;
   return 0;
@@ -251,7 +257,9 @@ void DeviceT::set_double_precompute
 template <class numtyp, class acctyp>
 void DeviceT::init_message(FILE *screen, const char *name,
                                   const int first_gpu, const int last_gpu) {
-  #ifdef USE_OPENCL
+  #if defined(USE_OPENCL)
+  std::string fs="";
+  #elif defined(USE_CUDART)
   std::string fs="";
   #else
   std::string fs=toa(gpu->free_gigabytes())+"/";
@@ -411,13 +419,11 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls,
 }              
 
 template <class numtyp, class acctyp>
-void DeviceT::output_times(UCL_Timer &time_pair, 
-                                  Answer<numtyp,acctyp> &ans, 
-                                  Neighbor &nbor, const double avg_split, 
-                                  const double max_bytes, 
-                                  const double gpu_overhead,
-                                  const double driver_overhead, 
-                                  const int threads_per_atom, FILE *screen) {
+void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans, 
+                           Neighbor &nbor, const double avg_split, 
+                           const double max_bytes, const double gpu_overhead,
+                           const double driver_overhead, 
+                           const int threads_per_atom, FILE *screen) {
   double single[9], times[9];
 
   single[0]=atom.transfer_time()+ans.transfer_time();
@@ -574,33 +580,32 @@ int DeviceT::compile_kernels() {
   k_info.set_function(*dev_program,"kernel_info");
   _compiled=true;
 
-  UCL_H_Vec<int> h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
-  UCL_D_Vec<int> d_gpu_lib_data(14,*gpu);
+  UCL_Vector<int,int> gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
   k_info.set_size(1,1);
-  k_info.run(&d_gpu_lib_data.begin());
-  ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false);
+  k_info.run(&gpu_lib_data);
+  gpu_lib_data.update_host(false);
   
-  _ptx_arch=static_cast<double>(h_gpu_lib_data[0])/100.0;
+  _ptx_arch=static_cast<double>(gpu_lib_data[0])/100.0;
   #ifndef USE_OPENCL
   if (_ptx_arch>gpu->arch())
     return -4;
   #endif
 
-  _num_mem_threads=h_gpu_lib_data[1];
-  _warp_size=h_gpu_lib_data[2];
+  _num_mem_threads=gpu_lib_data[1];
+  _warp_size=gpu_lib_data[2];
   if (_threads_per_atom<1)
-    _threads_per_atom=h_gpu_lib_data[3];
+    _threads_per_atom=gpu_lib_data[3];
   if (_threads_per_charge<1)
-    _threads_per_charge=h_gpu_lib_data[13];
-  _pppm_max_spline=h_gpu_lib_data[4];
-  _pppm_block=h_gpu_lib_data[5];
-  _block_pair=h_gpu_lib_data[6];
-  _max_shared_types=h_gpu_lib_data[7];
-  _block_cell_2d=h_gpu_lib_data[8];
-  _block_cell_id=h_gpu_lib_data[9];
-  _block_nbor_build=h_gpu_lib_data[10];
-  _block_bio_pair=h_gpu_lib_data[11];
-  _max_bio_shared_types=h_gpu_lib_data[12];
+    _threads_per_charge=gpu_lib_data[13];
+  _pppm_max_spline=gpu_lib_data[4];
+  _pppm_block=gpu_lib_data[5];
+  _block_pair=gpu_lib_data[6];
+  _max_shared_types=gpu_lib_data[7];
+  _block_cell_2d=gpu_lib_data[8];
+  _block_cell_id=gpu_lib_data[9];
+  _block_nbor_build=gpu_lib_data[10];
+  _block_bio_pair=gpu_lib_data[11];
+  _max_bio_shared_types=gpu_lib_data[12];
 
   if (static_cast<size_t>(_block_pair)>gpu->group_size())
     _block_pair=gpu->group_size();
@@ -634,9 +639,10 @@ Device<PRECISION,ACC_PRECISION> global_device;
 int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                     const int last_gpu, const int gpu_mode, 
                     const double particle_split, const int nthreads,
-                    const int t_per_atom) {
+                    const int t_per_atom, const double cell_size) {
   return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
-                                     particle_split,nthreads,t_per_atom);
+                                   particle_split,nthreads,t_per_atom, 
+                                   cell_size);
 }
 
 void lmp_clear_device() {
diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h
index e71c22de8b..6cfad82054 100644
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@@ -49,7 +49,7 @@ class Device {
   int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, 
                    const int last_gpu, const int gpu_mode, 
                    const double particle_split, const int nthreads,
-                   const int t_per_atom);
+                   const int t_per_atom, const double cell_size);
 
   /// Initialize the device for Atom and Neighbor storage
   /** \param rot True if quaternions need to be stored
@@ -239,7 +239,7 @@ class Device {
     int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
                                     _block_pair));
     k_zero.set_size(num_blocks,_block_pair);
-    k_zero.run(&mem.begin(),&numel);
+    k_zero.run(&mem,&numel);
   }
 
   // -------------------------- DEVICE DATA ------------------------- 
@@ -288,6 +288,7 @@ class Device {
   double _particle_split;
   double _cpu_full;
   double _ptx_arch;
+  double _cell_size; // set by init_device(); negative (-1) => cutoff is used
 
   int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
   int _pppm_max_spline, _pppm_block;
diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp
index 5642e5bbfe..5182f0b11f 100644
--- a/lib/gpu/lal_eam.cpp
+++ b/lib/gpu/lal_eam.cpp
@@ -13,10 +13,12 @@
     email                : brownw@ornl.gov nguyentd@ornl.gov
  ***************************************************************************/
  
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "eam_cl.h"
+#elif defined(USE_CUDART)
+const char *eam=0;
 #else
-#include "eam_ptx.h"
+#include "eam_cubin.h"
 #endif
 
 #include "lal_eam.h"
@@ -51,32 +53,24 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
 {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
-                            gpu_split,_screen,eam);
+                            gpu_split,_screen,eam,"k_eam");
   
   if (success!=0)
     return success;
   
   // allocate fp
   
-  bool cpuview=false;
-  if (this->ucl_device->device_type()==UCL_CPU)
-    cpuview=true;
-  
   int ef_nall=nall;
   if (ef_nall==0)
     ef_nall=2000;
 
   _max_fp_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
-  host_fp.alloc(_max_fp_size,*(this->ucl_device));
-  if (cpuview)
-    dev_fp.view(host_fp);
-  else 
-    dev_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_WRITE_ONLY);
+  _fp.alloc(_max_fp_size,*(this->ucl_device),UCL_RW_OPTIMIZED,UCL_WRITE_ONLY);
                                      
-  k_energy.set_function(*(this->pair_program),"kernel_energy");
-  k_energy_fast.set_function(*(this->pair_program),"kernel_energy_fast");
+  k_energy.set_function(*(this->pair_program),"k_energy");
+  k_energy_fast.set_function(*(this->pair_program),"k_energy_fast");
   fp_tex.get_texture(*(this->pair_program),"fp_tex");
-  fp_tex.bind_float(dev_fp,1);
+  fp_tex.bind_float(_fp,1);
   _compiled_energy = true;
   
   // Initialize timers for selected GPU
@@ -236,7 +230,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
         + frho_spline2.row_bytes()
         + z2r_spline1.row_bytes()
         + z2r_spline2.row_bytes()
-        + dev_fp.row_bytes();
+        + _fp.device.row_bytes();
   return 0;
 }
 
@@ -255,8 +249,7 @@ void EAMT::clear() {
   z2r_spline1.clear();
   z2r_spline2.clear();
   
-  host_fp.clear();
-  dev_fp.clear();
+  _fp.clear();
   
   time_pair2.clear();
   time_fp1.clear();
@@ -303,19 +296,11 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
   // ------------------- Resize FP Array for EAM --------------------
   
   if (nall>_max_fp_size) {
-    dev_fp.clear();
-    host_fp.clear();
-    
     _max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10);
-    host_fp.alloc(_max_fp_size,*(this->ucl_device));
-    if (this->ucl_device->device_type()==UCL_CPU)
-      dev_fp.view(host_fp);
-    else 
-      dev_fp.alloc(_max_fp_size,*(this->ucl_device));
-    
-    fp_tex.bind_float(dev_fp,1);
+    _fp.resize(_max_fp_size);
+    fp_tex.bind_float(_fp,1);
   }
-  *fp_ptr=host_fp.begin();
+  *fp_ptr=_fp.host.begin();
 
   // ----------------------------------------------------------------
 
@@ -348,7 +333,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
   // copy fp from device to host for comm
   _nlocal=nlocal;
   time_fp1.start();
-  ucl_copy(host_fp,dev_fp,nlocal,true);
+  _fp.update_host(nlocal,true);
   time_fp1.stop();
   time_fp1.sync_stop();
 }
@@ -380,19 +365,11 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
   // ------------------- Resize FP Array for EAM --------------------
   
   if (nall>_max_fp_size) {
-    dev_fp.clear();
-    host_fp.clear();
-    
     _max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10);
-    host_fp.alloc(_max_fp_size,*(this->ucl_device));
-    if (this->ucl_device->device_type()==UCL_CPU)
-      dev_fp.view(host_fp);
-    else 
-      dev_fp.alloc(_max_fp_size,*(this->ucl_device));
-    
-    fp_tex.bind_float(dev_fp,1);
+    _fp.resize(_max_fp_size);
+    fp_tex.bind_float(_fp,1);
   }      
-  *fp_ptr=host_fp.begin();  
+  *fp_ptr=_fp.host.begin();  
 
   // -----------------------------------------------------------------
   
@@ -428,7 +405,7 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
   // copy fp from device to host for comm
   _nlocal=inum_full;
   time_fp1.start();
-  ucl_copy(host_fp,dev_fp,inum_full,true);
+  _fp.update_host(inum_full,true);
   time_fp1.stop();
   time_fp1.sync_stop();
   
@@ -486,22 +463,18 @@ void EAMT::loop(const bool _eflag, const bool _vflag) {
   
   if (shared_types) {
     this->k_energy_fast.set_size(GX,BX);
-    this->k_energy_fast.run(&this->atom->dev_x.begin(), &type2rhor_z2r.begin(),
-                            &type2frho.begin(), &rhor_spline2.begin(),
-                            &frho_spline1.begin(),&frho_spline2.begin(), 
-                            &this->nbor->dev_nbor.begin(), 
-                            &this->_nbor_data->begin(), &dev_fp.begin(), 
-                            &this->ans->dev_engv.begin(), &eflag, &ainum,
+    this->k_energy_fast.run(&this->atom->x, &type2rhor_z2r, &type2frho,
+                            &rhor_spline2, &frho_spline1,&frho_spline2, 
+                            &this->nbor->dev_nbor,  &this->_nbor_data->begin(), 
+                            &_fp, &this->ans->engv, &eflag, &ainum,
                             &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho,
                             &_nrho, &_nr, &this->_threads_per_atom);
   } else {
     this->k_energy.set_size(GX,BX);
-    this->k_energy.run(&this->atom->dev_x.begin(), &type2rhor_z2r.begin(),
-                       &type2frho.begin(), &rhor_spline2.begin(),
-                       &frho_spline1.begin(),&frho_spline2.begin(), 
-                       &this->nbor->dev_nbor.begin(), 
-                       &this->_nbor_data->begin(), &dev_fp.begin(), 
-                       &this->ans->dev_engv.begin(),&eflag, &ainum, &nbor_pitch,
+    this->k_energy.run(&this->atom->x, &type2rhor_z2r, &type2frho,
+                       &rhor_spline2, &frho_spline1, &frho_spline2, 
+                       &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, 
+                       &this->ans->engv,&eflag, &ainum, &nbor_pitch,
                        &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_nrho, &_nr,
                        &this->_threads_per_atom);
   }
@@ -536,28 +509,20 @@ void EAMT::loop2(const bool _eflag, const bool _vflag) {
   
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &dev_fp.begin(), 
-                   &type2rhor_z2r.begin(),
-                   &rhor_spline1.begin(), 
-                   &z2r_spline1.begin(),
-                   &z2r_spline2.begin(), 
-                   &this->nbor->dev_nbor.begin(),
-                   &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                   &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                   &nbor_pitch, &_cutforcesq, &_rdr, &_nr,
-                   &this->_threads_per_atom);
+    this->k_pair_fast.run(&this->atom->x, &_fp, &type2rhor_z2r,
+                          &rhor_spline1, &z2r_spline1, &z2r_spline2, 
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                          &this->ans->force, &this->ans->engv, &eflag,
+                          &vflag, &ainum, &nbor_pitch, &_cutforcesq, &_rdr,
+                          &_nr, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &dev_fp.begin(), 
-                   &type2rhor_z2r.begin(),
-                   &rhor_spline1.begin(), 
-                   &z2r_spline1.begin(),
-                   &z2r_spline2.begin(),
-                   &this->nbor->dev_nbor.begin(),
-                   &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                   &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                   &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_nr,
-                   &this->_threads_per_atom);
+    this->k_pair.run(&this->atom->x, &_fp, &type2rhor_z2r, &rhor_spline1, 
+                     &z2r_spline1, &z2r_spline2, &this->nbor->dev_nbor,
+                     &this->_nbor_data->begin(), &this->ans->force,
+                     &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch,
+                     &_ntypes, &_cutforcesq, &_rdr, &_nr,
+                     &this->_threads_per_atom);
   }
 
   this->time_pair2.stop();
diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu
index 5909adaacd..ec20bd672f 100644
--- a/lib/gpu/lal_eam.cu
+++ b/lib/gpu/lal_eam.cu
@@ -15,66 +15,37 @@
 
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
+
+#ifndef _DOUBLE_DOUBLE
 texture<float4> pos_tex;
 texture<float> fp_tex;
-
 texture<float4> rhor_sp1_tex;
 texture<float4> rhor_sp2_tex;
 texture<float4> frho_sp1_tex;
 texture<float4> frho_sp2_tex;
 texture<float4> z2r_sp1_tex;
 texture<float4> z2r_sp2_tex;
-
-#ifdef _DOUBLE_DOUBLE
-ucl_inline double4 fetch_rhor_sp1(const int& i, const double4 *rhor_spline1) { 
-  return rhor_spline1[i]; 
-}
-ucl_inline double4 fetch_rhor_sp2(const int& i, const double4 *rhor_spline2) { 
-  return rhor_spline2[i]; 
-}
-ucl_inline double4 fetch_frho_sp1(const int& i, const double4 *frho_spline1) { 
-  return frho_spline1[i]; 
-}
-ucl_inline double4 fetch_frho_sp2(const int& i, const double4 *frho_spline2) { 
-  return frho_spline2[i]; 
-}
-ucl_inline double4 fetch_z2r_sp1(const int& i, const double4 *z2r_spline1) { 
-  return z2r_spline1[i]; 
-}
-ucl_inline double4 fetch_z2r_sp2(const int& i, const double4 *z2r_spline2) { 
-  return z2r_spline2[i]; 
-}
+#else // _DOUBLE_DOUBLE
+texture<int4> pos_tex;
+texture<int2> fp_tex;
+texture<int4> rhor_sp1_tex;
+texture<int4> rhor_sp2_tex;
+texture<int4> frho_sp1_tex;
+texture<int4> frho_sp2_tex;
+texture<int4> z2r_sp1_tex;
+texture<int4> z2r_sp2_tex;
 #endif
 
-#ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
-  { return tex1Dfetch(pos_tex, i); }
-ucl_inline float fetch_q(const int& i, const float *fp) 
-  { return tex1Dfetch(fp_tex, i); }
+#else // !NV_KERNEL (OPENCL): the *_tex names below alias kernel arguments
 
-ucl_inline float4 fetch_rhor_sp1(const int& i, const float4 *rhor_spline1) 
-  { return tex1Dfetch(rhor_sp1_tex, i); }
-ucl_inline float4 fetch_rhor_sp2(const int& i, const float4 *rhor_spline2) 
-  { return tex1Dfetch(rhor_sp2_tex, i); }
-ucl_inline float4 fetch_frho_sp1(const int& i, const float4 *frho_spline1) 
-  { return tex1Dfetch(frho_sp1_tex, i); }
-ucl_inline float4 fetch_frho_sp2(const int& i, const float4 *frho_spline2) 
-  { return tex1Dfetch(frho_sp2_tex, i); }
-ucl_inline float4 fetch_z2r_sp1(const int& i, const float4 *z2r_spline1) 
-  { return tex1Dfetch(z2r_sp1_tex, i); }
-ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2) 
-  { return tex1Dfetch(z2r_sp2_tex, i); }
-#endif
-
-#else // OPENCL
-
-#define fetch_q(i,y) fp_[i]
-#define fetch_rhor_sp1(i,y) rhor_spline1[i]
-#define fetch_rhor_sp2(i,y) rhor_spline2[i]
-#define fetch_frho_sp1(i,y) frho_spline1[i]
-#define fetch_frho_sp2(i,y) frho_spline2[i]
-#define fetch_z2r_sp1(i,y) z2r_spline1[i] 
-#define fetch_z2r_sp2(i,y) z2r_spline2[i]
+#define pos_tex x_
+#define fp_tex fp_
+#define rhor_sp1_tex rhor_spline1
+#define rhor_sp2_tex rhor_spline2
+#define frho_sp1_tex frho_spline1
+#define frho_sp2_tex frho_spline2
+#define z2r_sp1_tex z2r_spline1
+#define z2r_sp2_tex z2r_spline2
 
 #endif
 
@@ -99,11 +70,11 @@ ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2)
     p -= m;                                                                 \
     p = MIN(p,(numtyp)1.0);                                                 \
     int index = type2frho[itype]*(nrho+1)+m;                                \
-    numtyp4 coeff = fetch_frho_sp1(index, frho_spline1);                    \
+    numtyp4 coeff; fetch4(coeff,index,frho_sp1_tex);                        \
     numtyp fp = (coeff.x*p + coeff.y)*p + coeff.z;                          \
     fp_[i]=fp;                                                              \
     if (eflag>0) {                                                          \
-      coeff = fetch_frho_sp2(index, frho_spline2);                          \
+      fetch4(coeff,index,frho_sp2_tex);                                     \
       energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;             \
       engv[ii]=(acctyp)2.0*energy;                                          \
     }                                                                       \
@@ -154,7 +125,7 @@ ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2)
     ans[ii]=f;                                                              \
   }
 
-__kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
+__kernel void k_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
                             __global int *type2frho, 
                             __global numtyp4 *rhor_spline2, 
                             __global numtyp4 *frho_spline1,
@@ -178,14 +149,14 @@ __kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -203,7 +174,7 @@ __kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
         
         int mtype = jtype*ntypes+itype;
         int index = type2rhor_z2r[mtype].x*(nr+1)+m;
-        numtyp4 coeff = fetch_rhor_sp2(index, rhor_spline2);
+        numtyp4 coeff; fetch4(coeff,index,rhor_sp2_tex);
         rho += ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
       }
     } // for nbor
@@ -213,7 +184,7 @@ __kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
   } // if ii
 }
 
-__kernel void kernel_energy_fast(__global numtyp4 *x_, 
+__kernel void k_energy_fast(__global numtyp4 *x_, 
                                  __global int2 *type2rhor_z2r_in,
                                  __global int *type2frho_in, 
                                  __global numtyp4 *rhor_spline2, 
@@ -252,14 +223,14 @@ __kernel void kernel_energy_fast(__global numtyp4 *x_,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
@@ -277,7 +248,7 @@ __kernel void kernel_energy_fast(__global numtyp4 *x_,
         int jtype=fast_mul((int)MAX_SHARED_TYPES,jx.w);
         int mtype = jtype+itype;
         int index = type2rhor_z2r[mtype].x*(nr+1)+m;
-        numtyp4 coeff = fetch_rhor_sp2(index, rhor_spline2);
+        numtyp4 coeff; fetch4(coeff,index,rhor_sp2_tex);
         rho += ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
       }
     } // for nbor
@@ -287,7 +258,7 @@ __kernel void kernel_energy_fast(__global numtyp4 *x_,
   } // if ii
 }
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,
+__kernel void k_eam(__global numtyp4 *x_, __global numtyp *fp_,
                           __global int2 *type2rhor_z2r,
                           __global numtyp4 *rhor_spline1, 
                           __global numtyp4 *z2r_spline1,
@@ -317,15 +288,15 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp ifp=fetch_q(i,fp_);  //fp_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp ifp; fetch(ifp,i,fp_tex);  //fp_[i];
     int itype=ix.w;
 
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -347,25 +318,27 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,
 
         mtype = itype*ntypes+jtype;
         index = type2rhor_z2r[mtype].x*(nr+1)+m;
-        coeff = fetch_rhor_sp1(index, rhor_spline1); 
+        fetch4(coeff,index,rhor_sp1_tex);
         numtyp rhoip = (coeff.x*p + coeff.y)*p + coeff.z;
 
         mtype = jtype*ntypes+itype;
         index = type2rhor_z2r[mtype].x*(nr+1)+m;
-        coeff = fetch_rhor_sp1(index, rhor_spline1); 
+        fetch4(coeff,index,rhor_sp1_tex);
         numtyp rhojp = (coeff.x*p + coeff.y)*p + coeff.z;
               
         mtype = itype*ntypes+jtype;
         index = type2rhor_z2r[mtype].y*(nr+1)+m;
-        coeff = fetch_z2r_sp1(index, z2r_spline1);
+        fetch4(coeff,index,z2r_sp1_tex);
         numtyp z2p = (coeff.x*p + coeff.y)*p + coeff.z;
-        coeff = fetch_z2r_sp2(index, z2r_spline2);
+        fetch4(coeff,index,z2r_sp2_tex);
         numtyp z2 = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
         
         numtyp recip = ucl_recip(r);
         numtyp phi = z2*recip;
         numtyp phip = z2p*recip - phi*recip;
-        numtyp psip = ifp*rhojp + fetch_q(j,fp_)*rhoip + phip; 
+        numtyp psip;
+        fetch(psip,j,fp_tex);
+        psip = ifp*rhojp + psip*rhoip + phip; 
         numtyp force = -psip*recip;
         
         f.x+=delx*force;
@@ -391,7 +364,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,
 
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
+__kernel void k_eam_fast(__global numtyp4 *x_, __global numtyp *fp_,
                           __global int2 *type2rhor_z2r_in,
                           __global numtyp4 *rhor_spline1, 
                           __global numtyp4 *z2r_spline1,
@@ -427,8 +400,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp ifp=fetch_q(i,fp_); //fp_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp ifp; fetch(ifp,i,fp_tex); //fp_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -436,7 +409,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
       int j=*nbor;
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jw=jx.w;
       int jtype=fast_mul((int)MAX_SHARED_TYPES,jw);
       
@@ -459,25 +432,27 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
         
         mtype = itype+jw;
         index = type2rhor_z2r[mtype].x*(nr+1)+m;
-        coeff = fetch_rhor_sp1(index, rhor_spline1); 
+        fetch4(coeff,index,rhor_sp1_tex);
         numtyp rhoip = (coeff.x*p + coeff.y)*p + coeff.z;
         
         mtype = jtype+iw;
         index = type2rhor_z2r[mtype].x*(nr+1)+m;
-        coeff = fetch_rhor_sp1(index, rhor_spline1); 
+        fetch4(coeff,index,rhor_sp1_tex);
         numtyp rhojp = (coeff.x*p + coeff.y)*p + coeff.z;
         
         mtype = itype+jw;
         index = type2rhor_z2r[mtype].y*(nr+1)+m;
-        coeff = fetch_z2r_sp1(index, z2r_spline1);
+        fetch4(coeff,index,z2r_sp1_tex);
         numtyp z2p = (coeff.x*p + coeff.y)*p + coeff.z;
-        coeff = fetch_z2r_sp2(index, z2r_spline2);
+        fetch4(coeff,index,z2r_sp2_tex);
         numtyp z2 = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
       
         numtyp recip = ucl_recip(r);
         numtyp phi = z2*recip;
         numtyp phip = z2p*recip - phi*recip;
-        numtyp psip = ifp*rhojp + fetch_q(j,fp_)*rhoip + phip;
+        numtyp psip;
+        fetch(psip,j,fp_tex);
+        psip = ifp*rhojp + psip*rhoip + phip; 
         numtyp force = -psip*recip;
         
         f.x+=delx*force;
diff --git a/lib/gpu/lal_eam.h b/lib/gpu/lal_eam.h
index 6474b1fbdc..a22a4551d4 100644
--- a/lib/gpu/lal_eam.h
+++ b/lib/gpu/lal_eam.h
@@ -52,8 +52,8 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
     if (nghost>0) {
       UCL_H_Vec<numtyp> host_view;
       UCL_D_Vec<numtyp> dev_view;
-      host_view.view_offset(_nlocal,host_fp);
-      dev_view.view_offset(_nlocal,dev_fp);
+      host_view.view_offset(_nlocal,_fp.host);
+      dev_view.view_offset(_nlocal,_fp.device);
       ucl_copy(dev_view,host_view,nghost,true);
     }
   }
@@ -128,8 +128,7 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
   bool _compiled_energy;
   
   /// Per-atom arrays
-  UCL_H_Vec<numtyp> host_fp;
-  UCL_D_Vec<numtyp> dev_fp;
+  UCL_Vector<numtyp,numtyp> _fp; // accessed via _fp.host / _fp.device
   
 protected:
   bool _allocated;
diff --git a/lib/gpu/lal_ellipsoid_extra.h b/lib/gpu/lal_ellipsoid_extra.h
index cd963ffe68..e2287c0af2 100644
--- a/lib/gpu/lal_ellipsoid_extra.h
+++ b/lib/gpu/lal_ellipsoid_extra.h
@@ -20,6 +20,14 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
 
 #ifdef NV_KERNEL
 #include "lal_preprocessor.h"
+#ifndef _DOUBLE_DOUBLE
+texture<float4> pos_tex, quat_tex;
+#else // _DOUBLE_DOUBLE
+texture<int4,1> pos_tex, quat_tex;
+#endif // _DOUBLE_DOUBLE
+#else // !NV_KERNEL: alias the fetch targets to the pointer arguments x_ / qif
+#define pos_tex x_
+#define quat_tex qif
 #endif
 
 #define atom_info(t_per_atom, ii, tid, offset)                               \
@@ -411,7 +419,7 @@ ucl_inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans,
 ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, 
                                     numtyp mat[9])
 {
-  numtyp4 q=qif[qi];
+  numtyp4 q; fetch4(q,qi,quat_tex);
   
   numtyp w2 = q.x*q.x;
   numtyp i2 = q.y*q.y;
diff --git a/lib/gpu/lal_ellipsoid_nbor.cu b/lib/gpu/lal_ellipsoid_nbor.cu
index 669973a7e5..0be6c0922d 100644
--- a/lib/gpu/lal_ellipsoid_nbor.cu
+++ b/lib/gpu/lal_ellipsoid_nbor.cu
@@ -15,6 +15,13 @@
 
 #ifdef NV_KERNEL
 #include "lal_preprocessor.h"
+#ifndef _DOUBLE_DOUBLE
+texture<float4> pos_tex;
+#else // _DOUBLE_DOUBLE
+texture<int4,1> pos_tex;
+#endif // _DOUBLE_DOUBLE
+#else // !NV_KERNEL: pos_tex aliases the x_ kernel argument
+#define pos_tex x_
 #endif
 
 // ---------------------------------------------------------------------------
@@ -40,14 +47,14 @@ __kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
     __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
     __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
   
-    numtyp4 ix=x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul(iw,ntypes);
     int newj=0;  
     for ( ; nbor<list_end; nbor+=nbor_pitch) {
       int j=*nbor;
       j &= NEIGHMASK;
-      numtyp4 jx=x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
       int mtype=itype+jtype;
       numtyp2 cf=cut_form[mtype];
@@ -102,7 +109,7 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
     __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
     __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
   
-    numtyp4 ix=x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -110,7 +117,7 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
     for ( ; nbor<list_end; nbor+=nbor_pitch) {
       int j=*nbor;
       j &= NEIGHMASK;
-      numtyp4 jx=x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
       int mtype=itype+jtype;
       
diff --git a/lib/gpu/lal_gayberne.cpp b/lib/gpu/lal_gayberne.cpp
index 4e5c870228..48309b245f 100644
--- a/lib/gpu/lal_gayberne.cpp
+++ b/lib/gpu/lal_gayberne.cpp
@@ -13,12 +13,15 @@
     email                : brownw@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "gayberne_cl.h"
 #include "gayberne_lj_cl.h"
+#elif defined(USE_CUDART)
+const char *gayberne=0;    // null placeholder: no kernel header is included in this branch
+const char *gayberne_lj=0; // null placeholder; both symbols are still passed to init_base()
 #else
-#include "gayberne_ptx.h"
-#include "gayberne_lj_ptx.h"
+#include "gayberne_cubin.h"
+#include "gayberne_lj_cubin.h"
 #endif
 
 #include "lal_gayberne.h"
@@ -57,7 +60,8 @@ int GayBerneT::init(const int ntypes, const double gamma,
                          const double gpu_split, FILE *_screen) {
   int success;
   success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                          _screen,ntypes,h_form,gayberne,gayberne_lj);
+                          _screen,ntypes,h_form,gayberne,gayberne_lj,
+                          "k_gayberne");
   if (success!=0)
     return success;
 
@@ -210,13 +214,13 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
 
       this->time_ellipsoid.start();
       this->k_ellipsoid.set_size(GX,BX);
-      this->k_ellipsoid.run(&this->atom->dev_x.begin(),
-       &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
-       &this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(), 
-       &this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
-       &stride, &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
-       &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
-       &this->_threads_per_atom);
+      this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, 
+                            &this->shape, &this->well, &this->gamma_upsilon_mu,
+                            &this->sigma_epsilon, &this->_lj_types, 
+                            &this->lshape, &this->nbor->dev_nbor, &stride, 
+                            &this->ans->force, &ainum, &this->ans->engv,
+                            &this->dev_error, &eflag, &vflag, 
+                            &this->_last_ellipse, &this->_threads_per_atom);
       this->time_ellipsoid.stop();
 
       if (this->_last_ellipse==this->ans->inum()) {
@@ -243,17 +247,19 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
 
       this->time_ellipsoid2.start();
       this->k_sphere_ellipsoid.set_size(GX,BX);
-      this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(),
-        &this->atom->dev_quat.begin(), &this->shape.begin(), 
-        &this->well.begin(), &this->gamma_upsilon_mu.begin(), 
-        &this->sigma_epsilon.begin(), &this->_lj_types, &this->lshape.begin(), 
-        &this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(),
-        &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
-        &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
+      this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat,
+                                   &this->shape,  &this->well, 
+                                   &this->gamma_upsilon_mu, 
+                                   &this->sigma_epsilon, &this->_lj_types, 
+                                   &this->lshape,  &this->nbor->dev_nbor, 
+                                   &stride, &this->ans->force, 
+                                   &this->ans->engv, &this->dev_error, 
+                                   &eflag, &vflag, &this->_last_ellipse,
+                                   &ainum, &this->_threads_per_atom);
       this->time_ellipsoid2.stop();
    } else {
-      this->ans->dev_ans.zero();
-      this->ans->dev_engv.zero();
+      this->ans->force.zero();
+      this->ans->engv.zero();
       this->time_nbor1.stop();
       this->time_ellipsoid.start();                                 
       this->time_ellipsoid.stop();
@@ -268,19 +274,20 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
     if (this->_last_ellipse<this->ans->inum()) {
       if (this->_shared_types) {
         this->k_lj_fast.set_size(GX,BX);
-        this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
-          &this->lj3.begin(), &this->gamma_upsilon_mu.begin(), &stride,
-          &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
-          &this->ans->dev_engv.begin(), &this->dev_error.begin(),
-          &eflag, &vflag, &this->_last_ellipse, &ainum,
-          &this->_threads_per_atom);
+        this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3, 
+                            &this->gamma_upsilon_mu, &stride, 
+                            &this->nbor->dev_packed, &this->ans->force,
+                            &this->ans->engv, &this->dev_error, &eflag, 
+                            &vflag, &this->_last_ellipse, &ainum,
+                            &this->_threads_per_atom);
       } else {
         this->k_lj.set_size(GX,BX);
-        this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
-          &this->lj3.begin(), &this->_lj_types, &this->gamma_upsilon_mu.begin(),
-          &stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
-          &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
-          &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
+        this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3, 
+                       &this->_lj_types, &this->gamma_upsilon_mu, &stride,
+                       &this->nbor->dev_packed, &this->ans->force,
+                       &this->ans->engv, &this->dev_error, &eflag,
+                       &vflag, &this->_last_ellipse, &ainum,
+                       &this->_threads_per_atom);
       }
     }
     this->time_lj.stop();
@@ -294,13 +301,12 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
     this->time_nbor1.stop();
     this->time_ellipsoid.start(); 
     this->k_ellipsoid.set_size(GX,BX);
-    this->k_ellipsoid.run(&this->atom->dev_x.begin(), 
-      &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), 
-      &this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(), 
-      &this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
-      &stride, &this->ans->dev_ans.begin(), &ainum, 
-      &this->ans->dev_engv.begin(), &this->dev_error.begin(),
-      &eflag, &vflag, &ainum, &this->_threads_per_atom);
+    this->k_ellipsoid.run(&this->atom->x,  &this->atom->quat, 
+                          &this->shape, &this->well, &this->gamma_upsilon_mu, 
+                          &this->sigma_epsilon, &this->_lj_types, &this->lshape,
+                          &this->nbor->dev_nbor, &stride, &this->ans->force,
+                          &ainum,  &this->ans->engv, &this->dev_error,
+                          &eflag, &vflag, &ainum, &this->_threads_per_atom);
     this->time_ellipsoid.stop();
   }
 }
diff --git a/lib/gpu/lal_gayberne.cu b/lib/gpu/lal_gayberne.cu
index e2bfe4b1b5..dbff1178ef 100644
--- a/lib/gpu/lal_gayberne.cu
+++ b/lib/gpu/lal_gayberne.cu
@@ -18,7 +18,7 @@
 #endif
 
 ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, 
-                                 numtyp ans[9])
+                                   numtyp ans[9])
 {
   numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
     m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
@@ -80,15 +80,15 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape
 		    m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
 }
 
-__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
-                               __global numtyp4* shape, __global numtyp4* well, 
-                               __global numtyp *gum, __global numtyp2* sig_eps, 
-                               const int ntypes, __global numtyp *lshape, 
-                               __global int *dev_nbor, const int stride, 
-                               __global acctyp4 *ans, const int astride, 
-                               __global acctyp *engv, __global int *err_flag, 
-                               const int eflag, const int vflag, const int inum,
-                               const int t_per_atom) {
+__kernel void k_gayberne(__global numtyp4* x_,__global numtyp4 *q,
+                         __global numtyp4* shape, __global numtyp4* well, 
+                         __global numtyp *gum, __global numtyp2* sig_eps, 
+                         const int ntypes, __global numtyp *lshape, 
+                         __global int *dev_nbor, const int stride, 
+                         __global acctyp4 *ans, const int astride, 
+                         __global acctyp *engv, __global int *err_flag, 
+                         const int eflag, const int vflag, const int inum,
+                         const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -117,7 +117,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,nbor_end,nbor);
   
-    numtyp4 ix=x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex);
     int itype=ix.w;
     numtyp a1[9], b1[9], g1[9];
     numtyp4 ishape=shape[itype];
@@ -136,7 +136,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex);
       int jtype=jx.w;
 
       // Compute r12
diff --git a/lib/gpu/lal_gayberne_lj.cu b/lib/gpu/lal_gayberne_lj.cu
index bf294e1bb4..05a9b1008a 100644
--- a/lib/gpu/lal_gayberne_lj.cu
+++ b/lib/gpu/lal_gayberne_lj.cu
@@ -17,15 +17,15 @@
 #include "lal_ellipsoid_extra.h"
 #endif
 
-__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
-                               __global numtyp4* shape,__global numtyp4* well, 
-                               __global numtyp *gum, __global numtyp2* sig_eps, 
-                               const int ntypes, __global numtyp *lshape, 
-                               __global int *dev_nbor, const int stride, 
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               __global int *err_flag, const int eflag, 
-                               const int vflag,const int start, const int inum, 
-                               const int t_per_atom) {
+__kernel void k_gayberne_sphere_ellipsoid(__global numtyp4 *x_,
+                  __global numtyp4 *q, __global numtyp4* shape,
+                  __global numtyp4* well, __global numtyp *gum, 
+                  __global numtyp2* sig_eps, const int ntypes, 
+                  __global numtyp *lshape, __global int *dev_nbor, 
+                  const int stride, __global acctyp4 *ans, 
+                  __global acctyp *engv, __global int *err_flag, 
+                  const int eflag, const int vflag,const int start, 
+                  const int inum, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   ii+=start;
@@ -51,7 +51,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,nbor_end,nbor);
   
-    numtyp4 ix=x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex);
     int itype=ix.w;
       
     numtyp oner=shape[itype].x;
@@ -64,7 +64,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex);
       int jtype=jx.w;
 
       // Compute r12
@@ -236,14 +236,13 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
   } // if ii
 }
 
-__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, 
-                        __global numtyp4* lj3, const int lj_types, 
-                        __global numtyp *gum, 
-                        const int stride, __global int *dev_ij, 
-                        __global acctyp4 *ans, __global acctyp *engv, 
-                        __global int *err_flag, const int eflag, 
-                        const int vflag, const int start, const int inum, 
-                        const int t_per_atom) {
+__kernel void k_gayberne_lj(__global numtyp4 *x_, __global numtyp4 *lj1, 
+                            __global numtyp4* lj3, const int lj_types, 
+                            __global numtyp *gum, const int stride, 
+                            __global int *dev_ij, __global acctyp4 *ans, 
+                            __global acctyp *engv, __global int *err_flag, 
+                            const int eflag, const int vflag, const int start,
+                            const int inum, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   ii+=start;
@@ -269,7 +268,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,list_end,nbor);
   
-    numtyp4 ix=x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex);
     int itype=ix.w;
 
     numtyp factor_lj;
@@ -279,7 +278,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex);
       int jtype=jx.w;
 
       // Compute r12
@@ -319,13 +318,13 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, 
-                             __global numtyp4* lj3_in, __global numtyp *gum, 
-                             const int stride, __global int *dev_ij,
-                             __global acctyp4 *ans, __global acctyp *engv,
-                             __global int *err_flag, const int eflag,
-                             const int vflag, const int start, const int inum,
-                             const int t_per_atom) {
+__kernel void k_gayberne_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, 
+                                 __global numtyp4* lj3_in, __global numtyp *gum, 
+                                 const int stride, __global int *dev_ij,
+                                 __global acctyp4 *ans, __global acctyp *engv,
+                                 __global int *err_flag, const int eflag,
+                                 const int vflag, const int start, 
+                                 const int inum, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   ii+=start;
@@ -358,7 +357,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,list_end,nbor);
 
-    numtyp4 ix=x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -369,7 +368,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex);
       int mtype=itype+jx.w;
 
       // Compute r12
@@ -406,3 +405,4 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                 ans,engv);
   } // if ii
 }
+
diff --git a/lib/gpu/lal_lj.cpp b/lib/gpu/lal_lj.cpp
index a90e96f174..6c0609a17b 100644
--- a/lib/gpu/lal_lj.cpp
+++ b/lib/gpu/lal_lj.cpp
@@ -13,10 +13,12 @@
     email                : brownw@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "lj_cl.h"
+#elif defined(USE_CUDART)
+const char *lj=0;  // no kernel header in this branch; the null lj is still passed to init_atomic()
 #else
-#include "lj_ptx.h"
+#include "lj_cubin.h"
 #endif
 
 #include "lal_lj.h"
@@ -51,7 +53,7 @@ int LJT::init(const int ntypes,
                           const double gpu_split, FILE *_screen) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,lj);
+                            _screen,lj,"k_lj");
   if (success!=0)
     return success;
 
@@ -133,20 +135,17 @@ void LJT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
-                          &lj3.begin(), &sp_lj.begin(),
-                          &this->nbor->dev_nbor.begin(),
-                          &this->_nbor_data->begin(),
-                          &this->ans->dev_ans.begin(),
-                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &nbor_pitch, &this->_threads_per_atom);
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag,
+                          &vflag, &ainum, &nbor_pitch, 
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
-                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &nbor_pitch, &this->_threads_per_atom);
+    this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, 
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                     &this->ans->force, &this->ans->engv, &eflag, &vflag,
+                     &ainum, &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/lal_lj.cu b/lib/gpu/lal_lj.cu
index 12e2a487ec..0a049d187e 100644
--- a/lib/gpu/lal_lj.cu
+++ b/lib/gpu/lal_lj.cu
@@ -15,14 +15,16 @@
 
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
-texture<float4> pos_tex;
 #ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
+texture<float4> pos_tex;
+#else  // _DOUBLE_DOUBLE: the position texture uses int4 texels instead of float4
+texture<int4,1> pos_tex;
 #endif
+#else  // not NV_KERNEL: pos_tex is only an alias for the x_ kernel argument
+#define pos_tex x_
 #endif
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
+__kernel void k_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
@@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
@@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -101,7 +103,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
+__kernel void k_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in, 
                                __global numtyp* sp_lj_in, 
                                __global int *dev_nbor, __global int *dev_packed, 
@@ -137,7 +139,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -148,7 +150,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12
diff --git a/lib/gpu/lal_lj96.cpp b/lib/gpu/lal_lj96.cpp
index 6331574b85..05feadc5e2 100644
--- a/lib/gpu/lal_lj96.cpp
+++ b/lib/gpu/lal_lj96.cpp
@@ -13,10 +13,12 @@
     email                : brownw@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "lj96_cl.h"
+#elif defined(USE_CUDART)
+const char *lj96=0;  // no kernel header in this branch; the null lj96 is still passed to init_atomic()
 #else
-#include "lj96_ptx.h"
+#include "lj96_cubin.h"
 #endif
 
 #include "lal_lj96.h"
@@ -51,7 +53,7 @@ int LJ96T::init(const int ntypes,
                            const double gpu_split, FILE *_screen) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,lj96);
+                            _screen,lj96,"k_lj96");
   if (success!=0)
     return success;
 
@@ -133,19 +135,17 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
-                          &lj3.begin(), &sp_lj.begin(),
-                          &this->nbor->dev_nbor.begin(),
-                          &this->_nbor_data->begin(),
-                          &this->ans->dev_ans.begin(),
-                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &nbor_pitch, &this->_threads_per_atom);
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag,
+                          &vflag, &ainum, &nbor_pitch, 
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
-                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
+    this->k_pair.run(&this->atom->x, &lj1, &lj3,
+                     &_lj_types, &sp_lj, &this->nbor->dev_nbor,
+                     &this->_nbor_data->begin(), &this->ans->force,
+                     &this->ans->engv, &eflag, &vflag, &ainum,
                      &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
diff --git a/lib/gpu/lal_lj96.cu b/lib/gpu/lal_lj96.cu
index c5ea89a74b..a5fc6c89df 100644
--- a/lib/gpu/lal_lj96.cu
+++ b/lib/gpu/lal_lj96.cu
@@ -15,14 +15,16 @@
 
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
-texture<float4> pos_tex;
 #ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
+texture<float4> pos_tex;
+#else  // _DOUBLE_DOUBLE: the position texture uses int4 texels instead of float4
+texture<int4,1> pos_tex;
 #endif
+#else  // not NV_KERNEL: pos_tex is only an alias for the x_ kernel argument
+#define pos_tex x_
 #endif
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
+__kernel void k_lj96(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
@@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
@@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -102,7 +104,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
+__kernel void k_lj96_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in, 
                                __global numtyp* sp_lj_in,
                                __global int *dev_nbor, __global int *dev_packed,
@@ -138,7 +140,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -149,7 +151,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12
diff --git a/lib/gpu/lal_lj_class2_long.cpp b/lib/gpu/lal_lj_class2_long.cpp
index d5d67e8d39..cdb040815f 100644
--- a/lib/gpu/lal_lj_class2_long.cpp
+++ b/lib/gpu/lal_lj_class2_long.cpp
@@ -13,10 +13,12 @@
     email                : brownw@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "lj_class2_long_cl.h"
+#elif defined(USE_CUDART)
+const char *lj_class2_long=0;  // no kernel header in this branch; null pointer still goes to init_atomic()
 #else
-#include "lj_class2_long_ptx.h"
+#include "lj_class2_long_cubin.h"
 #endif
 
 #include "lal_lj_class2_long.h"
@@ -55,7 +57,7 @@ int LJClass2LongT::init(const int ntypes, double **host_cutsq,
                         const double g_ewald) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,lj_class2_long);
+                            _screen,lj_class2_long,"k_lj_class2_long");
   if (success!=0)
     return success;
 
@@ -143,22 +145,19 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
-                          &lj3.begin(), &sp_lj.begin(),
-                          &this->nbor->dev_nbor.begin(),
-                          &this->_nbor_data->begin(),
-                          &this->ans->dev_ans.begin(),
-                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &nbor_pitch, &this->atom->dev_q.begin(),
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, 
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag,
+                          &vflag, &ainum, &nbor_pitch, &this->atom->q,
                           &_cut_coulsq, &_qqrd2e, &_g_ewald, 
                           &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
-                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
+    this->k_pair.run(&this->atom->x, &lj1, &lj3,
+                     &_lj_types, &sp_lj, &this->nbor->dev_nbor,
+                     &this->_nbor_data->begin(), &this->ans->force,
+                     &this->ans->engv, &eflag, &vflag, &ainum,
+                     &nbor_pitch, &this->atom->q, &_cut_coulsq,
                      &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
   }
   this->time_pair.stop();
diff --git a/lib/gpu/lal_lj_class2_long.cu b/lib/gpu/lal_lj_class2_long.cu
index aabdbb9c2d..e08baf5a5f 100644
--- a/lib/gpu/lal_lj_class2_long.cu
+++ b/lib/gpu/lal_lj_class2_long.cu
@@ -14,18 +14,22 @@
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
+
 #include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
 texture<float4> pos_tex;
 texture<float> q_tex;
-#ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
-ucl_inline float fetch_q(const int& i, const float *q) 
-  { return tex1Dfetch(q_tex, i); }
-#endif
+#else  // _DOUBLE_DOUBLE: position/charge textures use int4/int2 texels
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
 #endif
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
+#else  // not NV_KERNEL: pos_tex/q_tex are only aliases for x_ and q_
+#define pos_tex x_
+#define q_tex q_
+#endif  // NV_KERNEL
+
+__kernel void k_lj_class2_long(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
@@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
     for ( ; nbor<list_end; nbor+=n_stride) {
@@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -101,7 +105,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
           numtyp expm2 = ucl_exp(-grij*grij);
           numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
+          fetch(prefactor,j,q_tex);
+          prefactor *= qqrd2e * qtmp/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
         } else
           forcecoul = (numtyp)0.0;
@@ -136,7 +141,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
+__kernel void k_lj_class2_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in, 
                                __global numtyp* sp_lj_in,
                                __global int *dev_nbor, __global int *dev_packed,
@@ -175,8 +180,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -188,7 +193,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12
@@ -215,7 +220,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
           numtyp expm2 = ucl_exp(-grij*grij);
           numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
+          fetch(prefactor,j,q_tex);
+          prefactor *= qqrd2e * qtmp/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
         } else
           forcecoul = (numtyp)0.0;
diff --git a/lib/gpu/lal_lj_coul.cpp b/lib/gpu/lal_lj_coul.cpp
index c649e89e1c..b82b4e3fad 100644
--- a/lib/gpu/lal_lj_coul.cpp
+++ b/lib/gpu/lal_lj_coul.cpp
@@ -13,10 +13,12 @@
     email                : brownw@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "lj_coul_cl.h"
+#elif defined(USE_CUDART)
+const char *lj_coul=0;  // no kernel header in this branch; the null lj_coul is still passed to init_atomic()
 #else
-#include "lj_coul_ptx.h"
+#include "lj_coul_cubin.h"
 #endif
 
 #include "lal_lj_coul.h"
@@ -54,7 +56,7 @@ int LJCoulT::init(const int ntypes,
                           double *host_special_coul, const double qqrd2e) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,lj_coul);
+                            _screen,lj_coul,"k_lj_coul");
   if (success!=0)
     return success;
 
@@ -145,23 +147,18 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
-                          &lj3.begin(), &sp_lj.begin(),
-                          &this->nbor->dev_nbor.begin(),
-                          &this->_nbor_data->begin(),
-                          &this->ans->dev_ans.begin(),
-                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &nbor_pitch,
-                          &this->atom->dev_q.begin(), &cutsq.begin(),
-                          &_qqrd2e, &this->_threads_per_atom);
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag, 
+                          &vflag, &ainum, &nbor_pitch, &this->atom->q,
+                          &cutsq, &_qqrd2e, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
-                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &nbor_pitch, &this->atom->dev_q.begin(),
-                     &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
+    this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, 
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                     &this->ans->force, &this->ans->engv, 
+                     &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
+                     &cutsq, &_qqrd2e, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/lal_lj_coul.cu b/lib/gpu/lal_lj_coul.cu
index 221e5cdc8f..feccf7fc4c 100644
--- a/lib/gpu/lal_lj_coul.cu
+++ b/lib/gpu/lal_lj_coul.cu
@@ -14,18 +14,22 @@
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
+
 #include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
 texture<float4> pos_tex;
 texture<float> q_tex;
-#ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
-ucl_inline float fetch_q(const int& i, const float *q) 
-  { return tex1Dfetch(q_tex, i); }
-#endif
+#else  // _DOUBLE_DOUBLE: position/charge textures use int4/int2 texels
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
 #endif
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
+#else  // not NV_KERNEL: pos_tex/q_tex are only aliases for x_ and q_
+#define pos_tex x_
+#define q_tex q_
+#endif  // NV_KERNEL
+
+__kernel void k_lj_coul(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
@@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
     for ( ; nbor<list_end; nbor+=n_stride) {
@@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       factor_coul = sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -93,9 +97,10 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
         } else
           force_lj = (numtyp)0.0;
 
-        if (rsq < lj1[mtype].w) 
-          forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
-        else
+        if (rsq < lj1[mtype].w) {
+          fetch(forcecoul,j,q_tex);
+          forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
+        } else
           forcecoul = (numtyp)0.0;
 
         force = (force_lj + forcecoul) * r2inv;
@@ -127,7 +132,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
+__kernel void k_lj_coul_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in, 
                                __global numtyp* sp_lj_in,
                                __global int *dev_nbor, __global int *dev_packed,
@@ -168,8 +173,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -181,7 +186,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       factor_coul = sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12
@@ -200,9 +205,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
         } else
           force_lj = (numtyp)0.0;
 
-        if (rsq < lj1[mtype].w)
-          forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
-        else
+        if (rsq < lj1[mtype].w) {
+          fetch(forcecoul,j,q_tex);
+          forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
+        } else
           forcecoul = (numtyp)0.0;
 
         force = (force_lj + forcecoul) * r2inv;
diff --git a/lib/gpu/lal_lj_coul_long.cpp b/lib/gpu/lal_lj_coul_long.cpp
index bee116a8d7..ae0cce38cb 100644
--- a/lib/gpu/lal_lj_coul_long.cpp
+++ b/lib/gpu/lal_lj_coul_long.cpp
@@ -13,10 +13,12 @@
     email                : brownw@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "lj_coul_long_cl.h"
+#elif defined(USE_CUDART)
+const char *lj_coul_long=0;  // no kernel header in this branch; null pointer still goes to init_atomic()
 #else
-#include "lj_coul_long_ptx.h"
+#include "lj_coul_long_cubin.h"
 #endif
 
 #include "lal_lj_coul_long.h"
@@ -55,7 +57,7 @@ int LJCoulLongT::init(const int ntypes,
                            const double g_ewald) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,lj_coul_long);
+                            _screen,lj_coul_long,"k_lj_coul_long");
   if (success!=0)
     return success;
 
@@ -143,22 +145,19 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
-                          &lj3.begin(), &sp_lj.begin(),
-                          &this->nbor->dev_nbor.begin(),
-                          &this->_nbor_data->begin(),
-                          &this->ans->dev_ans.begin(),
-                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &nbor_pitch, &this->atom->dev_q.begin(),
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, 
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag,
+                          &vflag, &ainum, &nbor_pitch, &this->atom->q,
                           &_cut_coulsq, &_qqrd2e, &_g_ewald,
                           &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
-                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
+    this->k_pair.run(&this->atom->x, &lj1, &lj3,
+                     &_lj_types, &sp_lj, &this->nbor->dev_nbor,
+                     &this->_nbor_data->begin(), &this->ans->force,
+                     &this->ans->engv, &eflag, &vflag, &ainum,
+                     &nbor_pitch, &this->atom->q, &_cut_coulsq,
                      &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
   }
   this->time_pair.stop();
diff --git a/lib/gpu/lal_lj_coul_long.cu b/lib/gpu/lal_lj_coul_long.cu
index 686186a4e4..9b655ab304 100644
--- a/lib/gpu/lal_lj_coul_long.cu
+++ b/lib/gpu/lal_lj_coul_long.cu
@@ -14,18 +14,22 @@
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
+
 #include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
 texture<float4> pos_tex;
 texture<float> q_tex;
-#ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
-ucl_inline float fetch_q(const int& i, const float *q) 
-  { return tex1Dfetch(q_tex, i); }
-#endif
+#else  // _DOUBLE_DOUBLE: position/charge textures use int4/int2 texels
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
 #endif
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
+#else  // not NV_KERNEL: pos_tex/q_tex are only aliases for x_ and q_
+#define pos_tex x_
+#define q_tex q_
+#endif  // NV_KERNEL
+
+__kernel void k_lj_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
@@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
     for ( ; nbor<list_end; nbor+=n_stride) {
@@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -99,7 +103,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
           numtyp expm2 = ucl_exp(-grij*grij);
           numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
+          fetch(prefactor,j,q_tex);
+          prefactor *= qqrd2e * qtmp/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
         } else
           forcecoul = (numtyp)0.0;
@@ -134,7 +139,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
+__kernel void k_lj_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in, 
                                __global numtyp* sp_lj_in,
                                __global int *dev_nbor, __global int *dev_packed,
@@ -173,8 +178,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
-    numtyp qtmp=fetch_q(i,q_);
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -186,7 +191,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12
@@ -211,7 +216,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
           numtyp expm2 = ucl_exp(-grij*grij);
           numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
+          fetch(prefactor,j,q_tex);
+          prefactor *= qqrd2e * qtmp/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
         } else
           forcecoul = (numtyp)0.0;
diff --git a/lib/gpu/lal_lj_expand.cpp b/lib/gpu/lal_lj_expand.cpp
index ed1bd9f51e..7e88ee9da8 100644
--- a/lib/gpu/lal_lj_expand.cpp
+++ b/lib/gpu/lal_lj_expand.cpp
@@ -13,10 +13,12 @@
     email                : ibains@nvidia.com
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "lj_expand_cl.h"
+#elif defined(USE_CUDART)
+const char *lj_expand=0;
 #else
-#include "lj_expand_ptx.h"
+#include "lj_expand_cubin.h"
 #endif
 
 #include "lal_lj_expand.h"
@@ -51,7 +53,7 @@ int LJExpandT::init(const int ntypes, double **host_cutsq,
                           const double gpu_split, FILE *_screen) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,lj_expand);
+                            _screen,lj_expand,"k_lj_expand");
   if (success!=0)
     return success;
 
@@ -133,20 +135,17 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
-                          &lj3.begin(), &sp_lj.begin(),
-                          &this->nbor->dev_nbor.begin(),
-                          &this->_nbor_data->begin(),
-                          &this->ans->dev_ans.begin(),
-                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &nbor_pitch, &this->_threads_per_atom);
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, 
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag, 
+                          &vflag, &ainum, &nbor_pitch, 
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
-                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &nbor_pitch, &this->_threads_per_atom);
+    this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, 
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                     &this->ans->force, &this->ans->engv, &eflag, &vflag,
+                     &ainum, &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/lal_lj_expand.cu b/lib/gpu/lal_lj_expand.cu
index c4d59ab189..3e98ed7d9c 100644
--- a/lib/gpu/lal_lj_expand.cu
+++ b/lib/gpu/lal_lj_expand.cu
@@ -14,15 +14,19 @@
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
+
 #include "lal_aux_fun1.h"
-texture<float4> pos_tex;
 #ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
-#endif
+texture<float4> pos_tex;
+#else
+texture<int4,1> pos_tex;
 #endif
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
+#else
+#define pos_tex x_
+#endif
+
+__kernel void k_lj_expand(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
@@ -51,7 +55,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
@@ -61,7 +65,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -104,7 +108,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
+__kernel void k_lj_expand_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in, 
                                __global numtyp* sp_lj_in, 
                                __global int *dev_nbor, __global int *dev_packed,
@@ -140,7 +144,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -151,7 +155,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12
diff --git a/lib/gpu/lal_morse.cpp b/lib/gpu/lal_morse.cpp
index 9666517750..5ec2afa3b5 100644
--- a/lib/gpu/lal_morse.cpp
+++ b/lib/gpu/lal_morse.cpp
@@ -13,10 +13,12 @@
     email                : brownw@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "morse_cl.h"
+#elif defined(USE_CUDART)
+const char *morse=0;
 #else
-#include "morse_ptx.h"
+#include "morse_cubin.h"
 #endif
 
 #include "lal_morse.h"
@@ -51,7 +53,7 @@ int MorseT::init(const int ntypes,
                           const double gpu_split, FILE *_screen) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,morse);
+                            _screen,morse,"k_morse");
   if (success!=0)
     return success;
 
@@ -132,20 +134,17 @@ void MorseT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &mor1.begin(),
-                          &mor2.begin(), &sp_lj.begin(),
-                          &this->nbor->dev_nbor.begin(),
-                          &this->_nbor_data->begin(),
-                          &this->ans->dev_ans.begin(),
-                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &nbor_pitch, &this->_threads_per_atom);
+    this->k_pair_fast.run(&this->atom->x, &mor1, &mor2, &sp_lj,
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                          &this->ans->force, &this->ans->engv, &eflag,
+                          &vflag, &ainum, &nbor_pitch, 
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &mor1.begin(), &mor2.begin(),
-                     &_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &nbor_pitch, &this->_threads_per_atom);
+    this->k_pair.run(&this->atom->x, &mor1, &mor2, &_types, &sp_lj, 
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                     &this->ans->force, &this->ans->engv, &eflag, &vflag,
+                     &ainum, &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/lal_morse.cu b/lib/gpu/lal_morse.cu
index bd9ae49c8c..4d89180390 100644
--- a/lib/gpu/lal_morse.cu
+++ b/lib/gpu/lal_morse.cu
@@ -14,15 +14,19 @@
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
+
 #include "lal_aux_fun1.h"
-texture<float4> pos_tex;
 #ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
-#endif
+texture<float4> pos_tex;
+#else
+texture<int4,1> pos_tex;
 #endif
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
+#else
+#define pos_tex x_
+#endif
+
+__kernel void k_morse(__global numtyp4 *x_, __global numtyp4 *mor1,
                           __global numtyp2* mor2, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
@@ -51,7 +55,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
@@ -61,7 +65,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -102,7 +106,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
   } // if ii
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
+__kernel void k_morse_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
                                __global numtyp2* mor2_in, 
                                __global numtyp* sp_lj_in,
                                __global int *dev_nbor, __global int *dev_packed,
@@ -138,7 +142,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -149,7 +153,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12
diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp
index 6a086745c5..a033b507a4 100644
--- a/lib/gpu/lal_neighbor.cpp
+++ b/lib/gpu/lal_neighbor.cpp
@@ -84,7 +84,7 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
     _max_atoms=1000;
     
   _max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
-  _max_nbors=max_nbors;
+  _max_nbors=(max_nbors/threads_per_atom+1)*threads_per_atom;
 
   _maxspecial=maxspecial;
   if (gpu_nbor==0)
@@ -124,17 +124,14 @@ void Neighbor::alloc(bool &success) {
     _c_bytes+=dev_packed.row_bytes();                                         
   } 
   if (_max_host>0) {
-    host_nbor.clear();
-    dev_host_nbor.clear();
-    dev_host_numj.clear();
+    nbor_host.clear();
+    dev_numj_host.clear();
     host_ilist.clear();
     host_jlist.clear();
     
-    success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev,
-                                        UCL_RW_OPTIMIZED)==UCL_SUCCESS);
-    success=success && (dev_host_nbor.alloc(_max_nbors*_max_host,
-                                            *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
-    success=success && (dev_host_numj.alloc(_max_host,*dev,
+    success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_RW_OPTIMIZED,
+                             UCL_WRITE_ONLY)==UCL_SUCCESS) && success;
+    success=success && (dev_numj_host.alloc(_max_host,*dev,
                                             UCL_WRITE_ONLY)==UCL_SUCCESS);
     success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
     if (!success)
@@ -145,16 +142,16 @@ void Neighbor::alloc(bool &success) {
                                          UCL_NOT_PINNED)==UCL_SUCCESS);
     if (!success)
       return;
-    int *ptr=host_nbor.begin();
+    int *ptr=nbor_host.host.begin();
     for (int i=0; i<_max_host; i++) {
       host_jlist[i]=ptr;
       ptr+=_max_nbors;
     }                                                 
-    _c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes();
+    _c_bytes+=nbor_host.device.row_bytes()+dev_numj_host.row_bytes();
   } else {
     // Some OpenCL implementations return errors for NULL pointers as args
-    dev_host_nbor.view(dev_nbor);
-    dev_host_numj.view(dev_nbor);
+    nbor_host.device.view(dev_nbor);
+    dev_numj_host.view(dev_nbor);
   }
   if (_maxspecial>0) {
     dev_nspecial.clear();
@@ -194,10 +191,9 @@ void Neighbor::clear() {
     host_packed.clear();
     host_acc.clear();
     dev_nbor.clear();
-    dev_host_nbor.clear();
+    nbor_host.clear();
     dev_packed.clear();
-    host_nbor.clear();
-    dev_host_numj.clear();
+    dev_numj_host.clear();
     host_ilist.clear();
     host_jlist.clear();
     dev_nspecial.clear();
@@ -215,8 +211,8 @@ void Neighbor::clear() {
 double Neighbor::host_memory_usage() const {
   if (_gpu_nbor>0) {
     if (_gpu_host)
-      return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+
-             host_jlist.row_bytes();
+      return nbor_host.device.row_bytes()*nbor_host.rows()+
+             host_ilist.row_bytes()+host_jlist.row_bytes();
     else
       return 0;
   } else 
@@ -285,8 +281,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
     int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
                                  block_size));
     _shared->k_nbor.set_size(GX,block_size);
-    _shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum,
-                        &_threads_per_atom);
+    _shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom);
     time_kernel.stop();
   }
 }
@@ -295,31 +290,23 @@ template <class numtyp, class acctyp>
 void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
   if (maxn>_max_nbors) {  
     int mn=static_cast<int>(static_cast<double>(maxn)*1.10);
-    dev_nbor.clear();
-    success=success && 
-            (dev_nbor.alloc((mn+1)*_max_atoms,*dev)==UCL_SUCCESS);
+    mn=(mn/_threads_per_atom+1)*_threads_per_atom;
+    success=success && (dev_nbor.resize((mn+1)*_max_atoms)==UCL_SUCCESS);
     _gpu_bytes=dev_nbor.row_bytes();
     if (_max_host>0) {
-      host_nbor.clear();
-      dev_host_nbor.clear();
-      success=success && (host_nbor.alloc(mn*_max_host,*dev,
-                                          UCL_RW_OPTIMIZED)==UCL_SUCCESS);
-      success=success && (dev_host_nbor.alloc(mn*_max_host,
-                                        *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
-      int *ptr=host_nbor.begin();
+      success=success && (nbor_host.resize(mn*_max_host)==UCL_SUCCESS);
+      int *ptr=nbor_host.host.begin();
       for (int i=0; i<_max_host; i++) {
         host_jlist[i]=ptr;
         ptr+=mn;
       }                                                 
-      _gpu_bytes+=dev_host_nbor.row_bytes();
+      _gpu_bytes+=nbor_host.row_bytes();
     } else {
-      dev_host_nbor.view(dev_nbor);
-      dev_host_numj.view(dev_nbor);
+      nbor_host.device.view(dev_nbor);
+      dev_numj_host.view(dev_nbor);
     }
     if (_alloc_packed) {
-      dev_packed.clear();
-      success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
-                                           UCL_READ_ONLY)==UCL_SUCCESS);
+      success=success && (dev_packed.resize((mn+2)*_max_atoms)==UCL_SUCCESS);
       _gpu_bytes+=dev_packed.row_bytes();
     }
     _max_nbors=mn;
@@ -337,16 +324,12 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
 
   // Calculate number of cells and allocate storage for binning as necessary
   int ncellx, ncelly, ncellz, ncell_3d;
-  ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) +
-                                  2.0*_cell_size)/_cell_size));
-  ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) +
-                                  2.0*_cell_size)/_cell_size));
-  ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) +
-                                  2.0*_cell_size)/_cell_size));
+  int ghost_cells=2*_cells_in_cutoff;
+  ncellx = static_cast<int>(ceil((subhi[0]-sublo[0])/_cell_size))+ghost_cells;
+  ncelly = static_cast<int>(ceil((subhi[1]-sublo[1])/_cell_size))+ghost_cells;
+  ncellz = static_cast<int>(ceil((subhi[2]-sublo[2])/_cell_size))+ghost_cells;
   ncell_3d = ncellx * ncelly * ncellz;
   if (ncell_3d+1>_ncells) {
-    dev_cell_counts.clear();
-    dev_cell_counts.alloc(ncell_3d+1,dev_nbor);
     if (_gpu_nbor==2) {
       if (_ncells>0) {
         host_cell_counts.clear();
@@ -355,11 +338,19 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
       cell_iter = new int[ncell_3d+1];
       host_cell_counts.alloc(ncell_3d+1,dev_nbor);
     }
+
+    if (_gpu_nbor==2 && atom.host_view())
+      dev_cell_counts.view(host_cell_counts);
+    else {
+      dev_cell_counts.clear();
+      dev_cell_counts.alloc(ncell_3d+1,dev_nbor);
+    }
+    
     _ncells=ncell_3d+1;
     _cell_bytes=dev_cell_counts.row_bytes();
   }
 
-  const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
+  const numtyp cutoff_cast=static_cast<numtyp>(_cutoff);
 
   if (_maxspecial>0) {
     time_nbor.start();
@@ -379,8 +370,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
     const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
     const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
     _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
-    _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
-                             &_maxspecial,&nt);        
+    _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt);        
     time_transpose.stop();
   }
   
@@ -392,28 +382,48 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
     
     // Build cell list on CPU                               
     host_cell_counts.zero();
-    double m_cell_size=-_cell_size;
-    double dx=subhi[0]-sublo[0]+_cell_size;
-    double dy=subhi[1]-sublo[1]+_cell_size;
-    double dz=subhi[2]-sublo[2]+_cell_size;
+    double i_cell_size=1.0/_cell_size;
 
-    for (int i=0; i<nall; i++) {
+    int offset_hi=_cells_in_cutoff+1;
+    for (int i=0; i<nt; i++) {
       double px, py, pz;
       px=x[i][0]-sublo[0];
       py=x[i][1]-sublo[1];
       pz=x[i][2]-sublo[2];
-      if (px<m_cell_size) px=m_cell_size;
-      if (py<m_cell_size) py=m_cell_size;
-      if (pz<m_cell_size) pz=m_cell_size;
-      if (px>dx) px=dx;            
-      if (py>dy) py=dy;            
-      if (pz>dz) pz=dz;            
+
+      int ix = static_cast<int>(px*i_cell_size+1);
+      ix = std::max(ix,_cells_in_cutoff);
+      ix = std::min(ix,ncellx-offset_hi);
+      int iy = static_cast<int>(py*i_cell_size+1);
+      iy = std::max(iy,_cells_in_cutoff);
+      iy = std::min(iy,ncelly-offset_hi);
+      int iz = static_cast<int>(pz*i_cell_size+1);
+      iz = std::max(iz,_cells_in_cutoff);
+      iz = std::min(iz,ncellz-offset_hi);
     
-      int id=static_cast<int>(px/_cell_size + 1.0) + 
-             static_cast<int>(py/_cell_size + 1.0) * ncellx +
-             static_cast<int>(pz/_cell_size + 1.0) * ncellx * ncelly;
+      int id = ix+iy*ncellx+iz*ncellx*ncelly;
+      cell_id[i] = id;
+      host_cell_counts[id+1]++;
+    }
     
-      cell_id[i]=id;
+    for (int i=nt; i<nall; i++) {
+      double px, py, pz;
+      px=x[i][0]-sublo[0];
+      py=x[i][1]-sublo[1];
+      pz=x[i][2]-sublo[2];
+
+      int ix = static_cast<int>(px*i_cell_size+1);
+      ix = std::max(ix,0);
+      ix = std::min(ix,ncellx-1);
+      int iy = static_cast<int>(py*i_cell_size+1);
+      iy = std::max(iy,0);
+      iy = std::min(iy,ncelly-1);
+      int iz = static_cast<int>(pz*i_cell_size+1);
+      iz = std::max(iz,0);
+      iz = std::min(iz,ncellz-1);
+    
+      int id = ix+iy*ncellx+iz*ncellx*ncelly;
+      cell_id[i] = id;
       host_cell_counts[id+1]++;
     }
     
@@ -451,41 +461,39 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   time_kernel.start();
 
   _nbor_pitch=inum;
-  _shared->neigh_tex.bind_float(atom.dev_x,4);
+  _shared->neigh_tex.bind_float(atom.x,4);
 
   // If binning on GPU, do this now
   if (_gpu_nbor==1) {
+    const numtyp i_cell_size=static_cast<numtyp>(1.0/_cell_size);
     const int neigh_block=_block_cell_id;
     const int GX=(int)ceil((float)nall/neigh_block);
     const numtyp sublo0=static_cast<numtyp>(sublo[0]);
     const numtyp sublo1=static_cast<numtyp>(sublo[1]);
     const numtyp sublo2=static_cast<numtyp>(sublo[2]);
-    const numtyp subhi0=static_cast<numtyp>(subhi[0]);
-    const numtyp subhi1=static_cast<numtyp>(subhi[1]);
-    const numtyp subhi2=static_cast<numtyp>(subhi[2]);
     _shared->k_cell_id.set_size(GX,neigh_block);
-    _shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(), 
-                           &atom.dev_particle_id.begin(),
-    				               &sublo0, &sublo1, &sublo2, &subhi0, &subhi1, 
-    				               &subhi2, &cell_size_cast, &ncellx, &ncelly, &nall);
+    _shared->k_cell_id.run(&atom.x, &atom.dev_cell_id, 
+                           &atom.dev_particle_id, &sublo0, &sublo1,
+                           &sublo2, &i_cell_size, &ncellx, &ncelly, &ncellz,
+                           &nt, &nall, &_cells_in_cutoff);
 
     atom.sort_neighbor(nall);
 
     /* calculate cell count */
     _shared->k_cell_counts.set_size(GX,neigh_block);
-    _shared->k_cell_counts.run(&atom.dev_cell_id.begin(), 
-                               &dev_cell_counts.begin(), &nall, &ncell_3d);
+    _shared->k_cell_counts.run(&atom.dev_cell_id, &dev_cell_counts, &nall, 
+                               &ncell_3d);
   } 
   
   /* build the neighbor list */
   const int cell_block=_block_nbor_build;
-  _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
-  _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
-                            &dev_cell_counts.begin(), &dev_nbor.begin(),
-                            &dev_host_nbor.begin(), &dev_host_numj.begin(),
-                            &_max_nbors,&cell_size_cast,
-                            &ncellx, &ncelly, &ncellz, &inum, &nt, &nall,
-                            &_threads_per_atom);
+  _shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)*
+                                 (ncellz-ghost_cells),cell_block,1);
+  _shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id,
+                            &dev_cell_counts, &dev_nbor, &nbor_host,
+                            &dev_numj_host, &_max_nbors, &cutoff_cast, &ncellx,
+                            &ncelly, &ncellz, &inum, &nt, &nall,
+                            &_threads_per_atom, &_cells_in_cutoff);
 
   /* Get the maximum number of nbors and realloc if necessary */
   UCL_D_Vec<int> numj;
@@ -494,7 +502,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   if (nt>inum) {
     UCL_H_Vec<int> host_offset;
     host_offset.view_offset(inum,host_acc,nt-inum);
-    ucl_copy(host_offset,dev_host_numj,nt-inum,true);
+    ucl_copy(host_offset,dev_numj_host,nt-inum,true);
   }
   
   if (_gpu_nbor!=2) {
@@ -521,17 +529,16 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
     const int GX2=static_cast<int>(ceil(static_cast<double>
                                           (nt*_threads_per_atom)/cell_block));
     _shared->k_special.set_size(GX2,cell_block);
-    _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), 
-                           &dev_host_numj.begin(), &atom.dev_tag.begin(), 
-                           &dev_nspecial.begin(), &dev_special.begin(), 
+    _shared->k_special.run(&dev_nbor, &nbor_host, &dev_numj_host,
+                           &atom.dev_tag, &dev_nspecial, &dev_special, 
                            &inum, &nt, &_max_nbors, &_threads_per_atom);
   }
   time_kernel.stop();
 
   time_nbor.start();
   if (inum<nt) {
-    ucl_copy(host_nbor,dev_host_nbor,true);
-    host_nbor.sync();
+    nbor_host.update_host(true);
+    nbor_host.sync();
   }
   time_nbor.stop();
 }
diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h
index a2d2eda560..5ac01bf331 100644
--- a/lib/gpu/lal_neighbor.h
+++ b/lib/gpu/lal_neighbor.h
@@ -22,20 +22,6 @@
 
 #define IJ_SIZE 131072
 
-#ifdef USE_OPENCL
-
-#include "geryon/ocl_timer.h"
-#include "geryon/ocl_mat.h"
-using namespace ucl_opencl;
-
-#else
-
-#include "geryon/nvd_timer.h"
-#include "geryon/nvd_mat.h"
-using namespace ucl_cudadr;
-
-#endif
-
 namespace LAMMPS_AL {
 
 class Neighbor {
@@ -70,7 +56,14 @@ class Neighbor {
             const int warp_size, const bool time_device);
 
   /// Set the size of the cutoff+skin
-  inline void cell_size(const double size) { _cell_size=size; }
+  inline void cell_size(const double size, const double cutoff) { 
+    _cell_size=size;
+    _cutoff=cutoff;
+    if (cutoff>size)
+      _cells_in_cutoff=static_cast<int>(ceil(cutoff/size));
+    else
+      _cells_in_cutoff=1;
+  }
   
   /// Get the size of the cutoff+skin
   inline double cell_size() const { return _cell_size; }
@@ -203,14 +196,11 @@ class Neighbor {
 
   // ----------------- Data for GPU Neighbor Calculation ---------------
 
-  /// Host storage for device calculated neighbor lists
-  /** Same storage format as device matrix **/
-  UCL_H_Vec<int> host_nbor;
-  /// Device storage for neighbor list matrix that will be copied to host
+  /// Host/Device storage for device calculated neighbor lists
   /** - 1st row is numj
     * - Remaining rows are by atom, columns are nbors **/
-  UCL_D_Vec<int> dev_host_nbor;
-  UCL_D_Vec<int> dev_host_numj;
+  UCL_Vector<int,int> nbor_host;
+  UCL_D_Vec<int> dev_numj_host;
   UCL_H_Vec<int> host_ilist;
   UCL_H_Vec<int*> host_jlist;
   /// Device storage for special neighbor counts
@@ -232,13 +222,14 @@ class Neighbor {
   bool _allocated, _use_packing, _nbor_time_avail, _time_device;
   int _gpu_nbor, _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
   bool _gpu_host, _alloc_packed;
-  double _cell_size, _bin_time;
+  double _cutoff, _cell_size, _bin_time;
 
   double _gpu_bytes, _c_bytes, _cell_bytes;
   void alloc(bool &success);
   
   int _block_cell_2d, _block_cell_id, _max_block_nbor_build, _block_nbor_build;
   int _ncells, _threads_per_atom, _total_atoms;
+  int _cells_in_cutoff;
 
   template <class numtyp, class acctyp>
   inline void resize_max_neighbors(const int maxn, bool &success);
diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu
index 99990ece67..ebd18e2b2b 100644
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@@ -16,38 +16,48 @@
 
 #ifdef NV_KERNEL
 #include "lal_preprocessor.h"
-texture<float4> neigh_tex;
 #ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(neigh_tex, i); }
+texture<float4> pos_tex;
+#else
+texture<int4,1> pos_tex;
 #endif
 
 __kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id,
-                           numtyp boxlo0, 
-                           numtyp boxlo1, numtyp boxlo2, numtyp boxhi0, 
-                           numtyp boxhi1, numtyp boxhi2, numtyp cell_size, 
-                           int ncellx, int ncelly, int nall) {
+                           numtyp boxlo0, numtyp boxlo1, numtyp boxlo2, 
+                           numtyp i_cell_size, int ncellx, int ncelly, 
+                           int ncellz, int inum, int nall, 
+                           int cells_in_cutoff) {
   int i = threadIdx.x + blockIdx.x*blockDim.x;
 
   if (i < nall) {
-    numtyp4 p = fetch_pos(i,pos); //pos[i];
+    numtyp4 p;
+    fetch4(p,i,pos_tex); //pos[i];
 
     p.x -= boxlo0;
     p.y -= boxlo1;
     p.z -= boxlo2;
     
-    p.x = fmaxf(p.x, -cell_size);
-    p.x = fminf(p.x, boxhi0-boxlo0+cell_size);
-    p.y = fmaxf(p.y, -cell_size);
-    p.y = fminf(p.y, boxhi1-boxlo1+cell_size);
-    p.z = fmaxf(p.z, -cell_size);
-    p.z = fminf(p.z, boxhi2-boxlo2+cell_size);
+    int ix = int(p.x*i_cell_size+cells_in_cutoff);
+    int iy = int(p.y*i_cell_size+cells_in_cutoff);
+    int iz = int(p.z*i_cell_size+cells_in_cutoff);
     
-    unsigned int id = (unsigned int)(p.x/cell_size + 1.0) 
-      + (unsigned int)(p.y/cell_size + 1.0) * ncellx
-      + (unsigned int)(p.z/cell_size + 1.0) * ncellx * ncelly;
+    int offset_lo, offset_hi;
+    if (i<inum) {
+      offset_lo=cells_in_cutoff;
+      offset_hi=cells_in_cutoff+1;
+    } else {
+      offset_lo=0;
+      offset_hi=1;
+    }
     
-    cell_id[i] = id;
+    ix = max(ix,offset_lo);
+    ix = min(ix,ncellx-offset_hi);
+    iy = max(iy,offset_lo);
+    iy = min(iy,ncelly-offset_hi);
+    iz = max(iz,offset_lo);
+    iz = min(iz,ncellz-offset_hi);
+    
+    cell_id[i] = ix+iy*ncellx+iz*ncellx*ncelly;
     particle_id[i] = i;
   }
 }
@@ -78,6 +88,8 @@ __kernel void kernel_calc_cell_counts(unsigned *cell_id,
   }
 }
 
+#else
+#define pos_tex x_
 #endif
 
 
@@ -113,12 +125,13 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
                                    __global int *host_numj, 
                                    int neigh_bin_size, numtyp cell_size,
                                    int ncellx, int ncelly, int ncellz,
-                                   int inum, int nt, int nall, int t_per_atom)
+                                   int inum, int nt, int nall, int t_per_atom,
+                                   int cells_in_cutoff)
 {
   int tid = THREAD_ID_X;
-  int ix = BLOCK_ID_X;
-  int iy = BLOCK_ID_Y % ncelly;
-  int iz = BLOCK_ID_Y / ncelly;
+  int ix = BLOCK_ID_X + cells_in_cutoff;
+  int iy = BLOCK_ID_Y % (ncelly - cells_in_cutoff*2) + cells_in_cutoff;
+  int iz = BLOCK_ID_Y / (ncelly - cells_in_cutoff*2) + cells_in_cutoff;
   int bsx = BLOCK_SIZE_X;
 	  
   int icell = ix + iy*ncellx + iz*ncellx*ncelly;
@@ -129,9 +142,9 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
   int icell_begin = cell_counts[icell];
   int icell_end = cell_counts[icell+1];
 
-  int nborz0 = max(iz-1,0), nborz1 = min(iz+1, ncellz-1),
-      nbory0 = max(iy-1,0), nbory1 = min(iy+1, ncelly-1),
-      nborx0 = max(ix-1,0), nborx1 = min(ix+1, ncellx-1);
+  int nborz0 = iz-cells_in_cutoff, nborz1 = iz+cells_in_cutoff, 
+      nbory0 = iy-cells_in_cutoff, nbory1 = iy+cells_in_cutoff,
+      nborx0 = ix-cells_in_cutoff, nborx1 = ix+cells_in_cutoff;
 
   numtyp4 diff;
   numtyp r2;
@@ -147,7 +160,7 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
       pid_i = cell_particle_id[i];
 
     if (pid_i < nt) {
-      atom_i = fetch_pos(pid_i,x_); //pos[pid_i];
+      fetch4(atom_i,pid_i,pos_tex); //pos[pid_i];
     }
     if (pid_i < inum) {
       stride=inum;
@@ -182,7 +195,7 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
             if (tid < end_idx) {
               pid_j =  cell_particle_id[tid+k*bsx+jcell_begin];
               cell_list_sh[tid] = pid_j;
-              atom_j = fetch_pos(pid_j,x_); //[pid_j];
+              fetch4(atom_j,pid_j,pos_tex); //[pid_j];
               pos_sh[tid].x = atom_j.x;
               pos_sh[tid].y = atom_j.y;
               pos_sh[tid].z = atom_j.z;
diff --git a/lib/gpu/lal_neighbor_shared.cpp b/lib/gpu/lal_neighbor_shared.cpp
index aeac76062a..ba948b4287 100644
--- a/lib/gpu/lal_neighbor_shared.cpp
+++ b/lib/gpu/lal_neighbor_shared.cpp
@@ -16,12 +16,15 @@
 #include "lal_precision.h"
 #include "lal_neighbor_shared.h"
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "neighbor_cpu_cl.h"
 #include "neighbor_gpu_cl.h"
+#elif defined(USE_CUDART) // runtime-API build: no kernel header is included
+const char *neighbor_cpu=0; // null stand-in; NOTE(review): confirm users accept 0
+const char *neighbor_gpu=0; // null stand-in; NOTE(review): confirm users accept 0
 #else
-#include "neighbor_cpu_ptx.h"
-#include "neighbor_gpu_ptx.h"
+#include "neighbor_cpu_cubin.h"
+#include "neighbor_gpu_cubin.h"
 #endif
 
 using namespace LAMMPS_AL;
@@ -69,7 +72,7 @@ void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor) {
     k_build_nbor.set_function(*build_program,"calc_neigh_list_cell");
     k_transpose.set_function(*build_program,"transpose");
     k_special.set_function(*build_program,"kernel_special");
-    neigh_tex.get_texture(*build_program,"neigh_tex");
+    neigh_tex.get_texture(*build_program,"pos_tex");
   }
   _compiled=true;
 }
diff --git a/lib/gpu/lal_neighbor_shared.h b/lib/gpu/lal_neighbor_shared.h
index b579e0d600..dcd776669c 100644
--- a/lib/gpu/lal_neighbor_shared.h
+++ b/lib/gpu/lal_neighbor_shared.h
@@ -16,18 +16,18 @@
 #ifndef LAL_NEIGHBOR_SHARED_H
 #define LAL_NEIGHBOR_SHARED_H
 
-#ifdef USE_OPENCL
-
+#if defined(USE_OPENCL)
 #include "geryon/ocl_kernel.h"
 #include "geryon/ocl_texture.h"
 using namespace ucl_opencl;
-
+#elif defined(USE_CUDART)
+#include "geryon/nvc_kernel.h"
+#include "geryon/nvc_texture.h"
+using namespace ucl_cudart;
 #else
-
 #include "geryon/nvd_kernel.h"
 #include "geryon/nvd_texture.h"
 using namespace ucl_cudadr;
-
 #endif
 
 namespace LAMMPS_AL {
diff --git a/lib/gpu/lal_pppm.cpp b/lib/gpu/lal_pppm.cpp
index 84eb95acd0..5a929d9e69 100644
--- a/lib/gpu/lal_pppm.cpp
+++ b/lib/gpu/lal_pppm.cpp
@@ -13,11 +13,14 @@
     email                : brownw@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "pppm_cl.h"
+#elif defined(USE_CUDART) // runtime-API build: no kernel header is included
+const char *pppm_f=0; // null stand-in; NOTE(review): confirm users accept 0
+const char *pppm_d=0; // null stand-in; NOTE(review): confirm users accept 0
 #else
-#include "pppm_f_ptx.h"
-#include "pppm_d_ptx.h"
+#include "pppm_f_cubin.h"
+#include "pppm_d_cubin.h"
 #endif
 #include "lal_pppm.h"
 #include <cassert>
@@ -51,7 +54,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
                               const int nylo_out, const int nzlo_out,
                               const int nxhi_out, const int nyhi_out,
                               const int nzhi_out, grdtyp **rho_coeff,
-                              grdtyp **vd_brick, const double slab_volfactor, 
+                              grdtyp **vd_brick_p, const double slab_volfactor, 
                               const int nx_pppm, const int ny_pppm,
                               const int nz_pppm, const bool split, int &flag) {
   _max_bytes=10;
@@ -92,8 +95,8 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
   time_interp.init(*ucl_device);
   time_interp.zero();
 
-  pos_tex.bind_float(atom->dev_x,4);
-  q_tex.bind_float(atom->dev_q,1);
+  pos_tex.bind_float(atom->x,4);
+  q_tex.bind_float(atom->q,1);
 
   _allocated=true;
   _max_bytes=0;
@@ -133,14 +136,12 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
   _npts_y=nyhi_out-nylo_out+1;
   _npts_z=nzhi_out-nzlo_out+1;
   _npts_yx=_npts_x*_npts_y;
-  success=success && (d_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
+  success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
                       UCL_SUCCESS);
-  success=success && (h_brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
+  success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
                       UCL_SUCCESS);
-  success=success && (h_vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
-                      UCL_SUCCESS);
-  *vd_brick=h_vd_brick.begin();
-  _max_bytes+=d_brick.row_bytes();
+  *vd_brick_p=vd_brick.host.begin();
+  _max_bytes+=brick.device.row_bytes()+vd_brick.device.row_bytes();
 
   // Allocate vector with count of atoms assigned to each grid point
   _nlocal_x=_npts_x+_nlower-_nupper;
@@ -158,20 +159,19 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
   _max_bytes+=d_brick_atoms.row_bytes();
 
   // Allocate error flags for checking out of bounds atoms
-  success=success && (h_error_flag.alloc(1,*ucl_device)==UCL_SUCCESS);
-  success=success && (d_error_flag.alloc(1,*ucl_device,UCL_WRITE_ONLY)==
-                                         UCL_SUCCESS);
+  success=success && (error_flag.alloc(1,*ucl_device,UCL_RW_OPTIMIZED,
+                                       UCL_WRITE_ONLY)==UCL_SUCCESS);
   if (!success) {
     flag=-3;
     return 0;
   }
   
-  d_error_flag.zero();
+  error_flag.device.zero();
   _max_bytes+=1;
   
   _cpu_idle_time=0.0;
 
-  return h_brick.begin();
+  return brick.host.begin();
 }
 
 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
@@ -181,12 +181,10 @@ void PPPMT::clear(const double cpu_time) {
   _allocated=false;
   _precompute_done=false;
   
-  d_brick.clear();
-  h_brick.clear();
-  h_vd_brick.clear();
+  brick.clear();
+  vd_brick.clear();
   d_brick_counts.clear();
-  h_error_flag.clear();
-  d_error_flag.clear();
+  error_flag.clear();
   d_brick_atoms.clear();
   
   acc_timers();
@@ -269,11 +267,11 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
 
   device->zero(d_brick_counts,d_brick_counts.numel());
   k_particle_map.set_size(GX,BX);
-  k_particle_map.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &f_delvolinv,
-                     &ainum, &d_brick_counts.begin(), &d_brick_atoms.begin(),
-                     &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, 
-                     &_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z, 
-                     &_atom_stride, &_max_brick_atoms, &d_error_flag.begin());
+  k_particle_map.run(&atom->x, &atom->q, &f_delvolinv, &ainum,
+                     &d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y, 
+                     &_brick_z, &_delxinv, &_delyinv, &_delzinv, &_nlocal_x,
+                     &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms,
+                     &error_flag);
   time_map.stop();
 
   time_rho.start();
@@ -282,15 +280,14 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
   GX=static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/
                       _block_pencils));
   k_make_rho.set_size(GX,BX);
-  k_make_rho.run(&d_brick_counts.begin(), &d_brick_atoms.begin(),
-                 &d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride, 
-                 &_npts_x, &_npts_y, &_npts_z, &_nlocal_x, &_nlocal_y,
-                 &_nlocal_z, &_order_m_1, &_order, &_order2);
+  k_make_rho.run(&d_brick_counts, &d_brick_atoms, &brick, &d_rho_coeff,
+                 &_atom_stride, &_npts_x, &_npts_y, &_npts_z, &_nlocal_x,
+                 &_nlocal_y, &_nlocal_z, &_order_m_1, &_order, &_order2);
   time_rho.stop();
 
   time_out.start();
-  ucl_copy(h_brick,d_brick,_npts_yx*_npts_z,true);
-  ucl_copy(h_error_flag,d_error_flag,true);
+  brick.update_host(_npts_yx*_npts_z,true);
+  error_flag.update_host(true);
   time_out.stop();
 
   _precompute_done=true;
@@ -322,18 +319,17 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall,
 
   _precompute_done=false;
 
-  if (h_error_flag[0]==2) {
+  if (error_flag[0]==2) {
     // Not enough storage for atoms on the brick
     _max_brick_atoms*=2;
-    d_error_flag.zero();
-    d_brick_atoms.clear();
-    d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,*ucl_device);
+    error_flag.device.zero();
+    d_brick_atoms.resize(_atom_stride*_max_brick_atoms);
     _max_bytes+=d_brick_atoms.row_bytes();
     return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, 
                   delxinv,delyinv,delzinv);
   }
   
-  return h_error_flag[0];
+  return error_flag[0];
 }
 
 // ---------------------------------------------------------------------------
@@ -342,7 +338,7 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall,
 template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
 void PPPMT::interp(const grdtyp qqrd2e_scale) {
   time_in.start();
-  ucl_copy(d_brick,h_vd_brick,true);
+  vd_brick.update_device(true);
   time_in.stop();
   
   time_interp.start();
@@ -353,10 +349,10 @@ void PPPMT::interp(const grdtyp qqrd2e_scale) {
   int ainum=this->ans->inum();
   
   k_interp.set_size(GX,BX);
-  k_interp.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum, 
-               &d_brick.begin(), &d_rho_coeff.begin(), &_npts_x, &_npts_yx,
-               &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv,
-               &_order, &_order2, &qqrd2e_scale, &ans->dev_ans.begin());
+  k_interp.run(&atom->x, &atom->q, &ainum, &vd_brick, &d_rho_coeff,
+               &_npts_x, &_npts_yx, &_brick_x, &_brick_y, &_brick_z, &_delxinv,
+               &_delyinv, &_delzinv, &_order, &_order2, &qqrd2e_scale, 
+               &ans->force);
   time_interp.stop();
 
   ans->copy_answers(false,false,false,false);
@@ -408,4 +404,3 @@ void PPPMT::compile_kernels(UCL_Device &dev) {
 
 template class PPPM<PRECISION,ACC_PRECISION,float,_lgpu_float4>;
 template class PPPM<PRECISION,ACC_PRECISION,double,_lgpu_double4>;
-
diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu
index 5204180e83..646afa5900 100644
--- a/lib/gpu/lal_pppm.cu
+++ b/lib/gpu/lal_pppm.cu
@@ -14,14 +14,14 @@
 // ***************************************************************************/
 
 #ifdef NV_KERNEL
+
 #include "lal_preprocessor.h"
+#ifndef _DOUBLE_DOUBLE
 texture<float4> pos_tex;
 texture<float> q_tex;
-#ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
-ucl_inline float fetch_q(const int& i, const float *q) 
-  { return tex1Dfetch(q_tex, i); }
+#else // _DOUBLE_DOUBLE: doubles are fetched as 32-bit integer words
+texture<int4,1> pos_tex; // fetch4 reads two int4 texels per position
+texture<int2> q_tex; // fetch reads one int2 texel per q value
 #endif
 
 // Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error
@@ -31,6 +31,8 @@ ucl_inline float fetch_q(const int& i, const float *q)
 #endif
 
 #else
+#define pos_tex x_ // non-NV_KERNEL build: fetch4(..,pos_tex) indexes the x_ argument
+#define q_tex q_ // non-NV_KERNEL build: fetch(..,q_tex) indexes the q_ argument
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
 #endif
 
@@ -59,9 +61,11 @@ __kernel void particle_map(__global numtyp4 *x_,  __global numtyp *q_,
   int nx,ny,nz;
 
   if (ii<nlocal) {
-    numtyp4 p=fetch_pos(ii,x_);
+    numtyp4 p;
+    fetch4(p,ii,pos_tex);
     grdtyp4 delta;
-    delta.w=delvolinv*fetch_q(ii,q_);
+    fetch(delta.w,ii,q_tex);
+    delta.w*=delvolinv;
     
     if (delta.w!=(grdtyp)0.0) {
       delta.x=(p.x-b_lo_x)*delxinv;
@@ -212,8 +216,11 @@ __kernel void interp(__global numtyp4 *x_, __global numtyp *q_,
   grdtyp tx,ty,tz;
 
   if (ii<nlocal) {
-    numtyp4 p=fetch_pos(ii,x_);
-    grdtyp qs=qqrd2e_scale*fetch_q(ii,q_);
+    numtyp4 p;
+    fetch4(p,ii,pos_tex);
+    grdtyp qs;
+    fetch(qs,ii,q_tex);
+    qs*=qqrd2e_scale;
 
     acctyp4 ek;
     ek.x=(acctyp)0.0;
diff --git a/lib/gpu/lal_pppm.h b/lib/gpu/lal_pppm.h
index fec5166e95..3b5809ea6c 100644
--- a/lib/gpu/lal_pppm.h
+++ b/lib/gpu/lal_pppm.h
@@ -19,8 +19,10 @@
 #include "mpi.h"
 #include "lal_device.h"
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "geryon/ocl_texture.h"
+#elif defined(USE_CUDART)
+#include "geryon/nvc_texture.h"
 #else
 #include "geryon/nvd_texture.h"
 #endif
@@ -55,8 +57,8 @@ class PPPM {
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int inum, const int nall, bool &success) {
     if (atom->resize(nall, success)) {
-      pos_tex.bind_float(atom->dev_x,4);
-      q_tex.bind_float(atom->dev_q,1);
+      pos_tex.bind_float(atom->x,4);
+      q_tex.bind_float(atom->q,1);
     }
     ans->resize(inum,success);
   }
@@ -138,8 +140,8 @@ class PPPM {
 
   // --------------------------- GRID DATA --------------------------
 
-  UCL_H_Vec<grdtyp> h_brick, h_vd_brick;
-  UCL_D_Vec<grdtyp> d_brick;
+  UCL_Vector<grdtyp,grdtyp> brick;
+  UCL_Vector<grdtyp,grdtyp> vd_brick;
   
   // Count of number of atoms assigned to each grid point
   UCL_D_Vec<int> d_brick_counts;
@@ -147,8 +149,7 @@ class PPPM {
   UCL_D_Vec<grdtyp4> d_brick_atoms;
   
   // Error checking for out of bounds atoms
-  UCL_D_Vec<int> d_error_flag;
-  UCL_H_Vec<int> h_error_flag;
+  UCL_Vector<int,int> error_flag;
   
   // Number of grid points in brick (including ghost)
   int _npts_x, _npts_y, _npts_z, _npts_yx;
diff --git a/lib/gpu/lal_precision.h b/lib/gpu/lal_precision.h
index 03c41a0df3..e31b10037e 100644
--- a/lib/gpu/lal_precision.h
+++ b/lib/gpu/lal_precision.h
@@ -16,6 +16,10 @@
 #ifndef LAL_PRECISION_H
 #define LAL_PRECISION_H
 
+#if defined(USE_CUDART)
+#include <cuda_runtime.h>
+#endif
+
 struct _lgpu_int2 {
   int x; int y;
 };
@@ -108,3 +112,4 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
 #endif
 
 #endif
+
diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h
index 0ea3a1ca6d..b817bbe551 100644
--- a/lib/gpu/lal_preprocessor.h
+++ b/lib/gpu/lal_preprocessor.h
@@ -107,7 +107,7 @@
 #define BLOCK_NBOR_BUILD 128
 #define BLOCK_PAIR 128
 #define BLOCK_BIO_PAIR 128
-#define MAX_SHARED_TYPES 11
+#define MAX_SHARED_TYPES 8
 
 #else
 
@@ -129,8 +129,21 @@
 #define MAX_BIO_SHARED_TYPES 128
 
 #ifdef _DOUBLE_DOUBLE
-ucl_inline double4 fetch_pos(const int& i, const double4 *pos) { return pos[i]; };
-ucl_inline double fetch_q(const int& i, const double *q) { return q[i]; };
+#define fetch4(ans,i,pos_tex) { /* double4 from two int4 texels */ \
+  int4 xy = tex1Dfetch(pos_tex,(i)*2);   /* (i): safe for expr args */ \
+  int4 zt = tex1Dfetch(pos_tex,(i)*2+1);               \
+  ans.x=__hiloint2double(xy.y, xy.x);    /* args are (hi, lo) words */ \
+  ans.y=__hiloint2double(xy.w, xy.z);                  \
+  ans.z=__hiloint2double(zt.y, zt.x);                  \
+  ans.w=__hiloint2double(zt.w, zt.z);                  \
+}
+#define fetch(ans,i,q_tex) { /* double from one int2 texel */ \
+  int2 qt = tex1Dfetch(q_tex,i);                       \
+  ans=__hiloint2double(qt.y, qt.x);                    \
+}
+#else /* not _DOUBLE_DOUBLE: the texel is assigned to ans directly */
+#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
+#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
 #endif
 
 #if (__CUDA_ARCH__ < 200)
@@ -293,8 +306,8 @@ typedef struct _double4 double4;
 #define BLOCK_ID_Y get_group_id(1)
 #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
 #define ucl_inline inline
-#define fetch_pos(i,y) x_[i]
-#define fetch_q(i,y) q_[i]
+#define fetch4(ans,i,x) ans=x[i]
+#define fetch(ans,i,q) ans=q[i]
 
 #define ucl_atan atan
 #define ucl_cbrt cbrt
diff --git a/lib/gpu/lal_re_squared.cpp b/lib/gpu/lal_re_squared.cpp
index 50f0503b3d..8f7ef24a11 100644
--- a/lib/gpu/lal_re_squared.cpp
+++ b/lib/gpu/lal_re_squared.cpp
@@ -13,12 +13,15 @@
     email                : brownw@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "re_squared_cl.h"
 #include "re_squared_lj_cl.h"
+#elif defined(USE_CUDART) // runtime-API build: no kernel header is included
+const char *re_squared=0; // null stand-in; passed to init_base()
+const char *re_squared_lj=0; // null stand-in; passed to init_base()
 #else
-#include "re_squared_ptx.h"
-#include "re_squared_lj_ptx.h"
+#include "re_squared_cubin.h"
+#include "re_squared_lj_cubin.h"
 #endif
 
 #include "lal_re_squared.h"
@@ -54,7 +57,8 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
                      const double gpu_split, FILE *_screen) {
   int success;
   success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                          _screen,ntypes,h_form,re_squared,re_squared_lj,true);
+                          _screen,ntypes,h_form,re_squared,re_squared_lj,
+                          "k_resquared",true);
   if (success!=0)
     return success;
 
@@ -198,13 +202,13 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
 
       this->time_ellipsoid.start();
       this->k_ellipsoid.set_size(GX,BX);
-      this->k_ellipsoid.run(&this->atom->dev_x.begin(),
-       &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
-       &this->special_lj.begin(), &this->sigma_epsilon.begin(), 
-       &this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
-       &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
-       &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
-       &this->_threads_per_atom);
+      this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
+                            &this->shape, &this->well, &this->special_lj,
+                            &this->sigma_epsilon, &this->_lj_types, 
+                            &this->nbor->dev_nbor, &stride, 
+                            &this->ans->force,&ainum, &this->ans->engv,
+                            &this->dev_error, &eflag, &vflag, 
+                            &this->_last_ellipse, &this->_threads_per_atom);
       this->time_ellipsoid.stop();
 
       // ------------ ELLIPSE_SPHERE ---------------
@@ -215,13 +219,14 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
 
       this->time_ellipsoid2.start();
       this->k_ellipsoid_sphere.set_size(GX,BX);
-      this->k_ellipsoid_sphere.run(&this->atom->dev_x.begin(),
-       &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
-       &this->special_lj.begin(), &this->sigma_epsilon.begin(), 
-       &this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
-       &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
-       &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
-       &this->_threads_per_atom);
+      this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat, 
+                                   &this->shape, &this->well, &this->special_lj,
+                                   &this->sigma_epsilon, &this->_lj_types, 
+                                   &this->nbor->dev_nbor, &stride,
+                                   &this->ans->force,&ainum,
+                                   &this->ans->engv, &this->dev_error, 
+                                   &eflag, &vflag, &this->_last_ellipse,
+                                   &this->_threads_per_atom);
       this->time_ellipsoid2.stop();
 
       if (this->_last_ellipse==this->ans->inum()) {
@@ -245,17 +250,18 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
 
       this->time_ellipsoid3.start();
       this->k_sphere_ellipsoid.set_size(GX,BX);
-      this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(),
-        &this->atom->dev_quat.begin(), &this->shape.begin(), 
-        &this->well.begin(), &this->special_lj.begin(), 
-        &this->sigma_epsilon.begin(), &this->_lj_types,
-        &this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(),
-        &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
-        &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
+      this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat,
+                                   &this->shape, &this->well, &this->special_lj, 
+                                   &this->sigma_epsilon, &this->_lj_types,
+                                   &this->nbor->dev_nbor, &stride, 
+                                   &this->ans->force, &this->ans->engv,
+                                   &this->dev_error, &eflag, &vflag,
+                                   &this->_last_ellipse, &ainum, 
+                                   &this->_threads_per_atom);
       this->time_ellipsoid3.stop();
    } else {
-      this->ans->dev_ans.zero();
-      this->ans->dev_engv.zero();
+      this->ans->force.zero();
+      this->ans->engv.zero();
       this->time_nbor1.zero();
       this->time_ellipsoid.zero();                                 
       this->time_nbor2.zero();
@@ -269,19 +275,19 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
     if (this->_last_ellipse<this->ans->inum()) {
       if (this->_shared_types) {
         this->k_lj_fast.set_size(GX,BX);
-        this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
-          &this->lj3.begin(), &this->special_lj.begin(), &stride,
-          &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
-          &this->ans->dev_engv.begin(), &this->dev_error.begin(),
-          &eflag, &vflag, &this->_last_ellipse, &ainum,
-          &this->_threads_per_atom);
+        this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3,
+                            &this->special_lj, &stride,
+                            &this->nbor->dev_packed, &this->ans->force,
+                            &this->ans->engv, &this->dev_error,
+                            &eflag, &vflag, &this->_last_ellipse, &ainum,
+                            &this->_threads_per_atom);
       } else {
         this->k_lj.set_size(GX,BX);
-        this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
-          &this->lj3.begin(), &this->_lj_types, &this->special_lj.begin(),
-          &stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
-          &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
-          &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
+        this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3,
+                       &this->_lj_types, &this->special_lj, &stride, 
+                       &this->nbor->dev_packed, &this->ans->force,
+                       &this->ans->engv, &this->dev_error, &eflag, &vflag,
+                       &this->_last_ellipse, &ainum, &this->_threads_per_atom);
       }
     }
     this->time_lj.stop();
@@ -295,13 +301,12 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
     this->time_nbor1.stop();
     this->time_ellipsoid.start(); 
     this->k_ellipsoid.set_size(GX,BX);
-    this->k_ellipsoid.run(&this->atom->dev_x.begin(), 
-      &this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(), 
-      &this->special_lj.begin(), &this->sigma_epsilon.begin(), 
-      &this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
-      &this->ans->dev_ans.begin(), &ainum,  &this->ans->dev_engv.begin(),
-      &this->dev_error.begin(), &eflag, &vflag, &ainum, 
-      &this->_threads_per_atom);
+    this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, 
+                          &this->shape, &this->well, &this->special_lj, 
+                          &this->sigma_epsilon, &this->_lj_types, 
+                          &this->nbor->dev_nbor, &stride, &this->ans->force,
+                          &ainum,  &this->ans->engv, &this->dev_error, 
+                          &eflag, &vflag, &ainum, &this->_threads_per_atom);
     this->time_ellipsoid.stop();
   }
 }
diff --git a/lib/gpu/lal_re_squared.cu b/lib/gpu/lal_re_squared.cu
index 63057a30d9..c858b09801 100644
--- a/lib/gpu/lal_re_squared.cu
+++ b/lib/gpu/lal_re_squared.cu
@@ -32,15 +32,15 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
   return ans;
 }
 
-__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
-                               __global numtyp4* shape, __global numtyp4* well, 
-                               __global numtyp *splj, __global numtyp2* sig_eps, 
-                               const int ntypes, __global int *dev_nbor,
-                               const int stride,  __global acctyp4 *ans,
-                               const int astride, __global acctyp *engv,
-                               __global int *err_flag, const int eflag,
-                               const int vflag, const int inum,
-                               const int t_per_atom) {
+__kernel void k_resquared(__global numtyp4* x_,__global numtyp4 *q,
+                          __global numtyp4* shape, __global numtyp4* well, 
+                          __global numtyp *splj, __global numtyp2* sig_eps, 
+                          const int ntypes, __global int *dev_nbor,
+                          const int stride,  __global acctyp4 *ans,
+                          const int astride, __global acctyp *engv,
+                          __global int *err_flag, const int eflag,
+                          const int vflag, const int inum,
+                          const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -73,7 +73,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,nbor_end,nbor);
   
-    numtyp4 ix=x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex);
     int itype=ix.w;
 
     numtyp a1[9];       // Rotation matrix (lab->body)
@@ -122,7 +122,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex);
       int jtype=jx.w;
 
       // Compute r12
diff --git a/lib/gpu/lal_re_squared_lj.cu b/lib/gpu/lal_re_squared_lj.cu
index a0c82ea294..5c46c21e45 100644
--- a/lib/gpu/lal_re_squared_lj.cu
+++ b/lib/gpu/lal_re_squared_lj.cu
@@ -17,12 +17,13 @@
 #include "lal_ellipsoid_extra.h"
 #endif
 
-__kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
-                   __global numtyp4* shape, __global numtyp4* well, 
-                   __global numtyp *splj, __global numtyp2* sig_eps,
-                   const int ntypes, __global int *dev_nbor, const int stride, 
-                   __global acctyp4 *ans, const int astride, 
-                   __global acctyp *engv, __global int *err_flag, 
+__kernel void k_resquared_ellipsoid_sphere(__global numtyp4* x_,
+                  __global numtyp4 *q, __global numtyp4* shape, 
+                  __global numtyp4* well, __global numtyp *splj, 
+                  __global numtyp2* sig_eps, const int ntypes, 
+                  __global int *dev_nbor, const int stride, 
+                  __global acctyp4 *ans, const int astride, 
+                  __global acctyp *engv, __global int *err_flag, 
                    const int eflag, const int vflag, const int inum,
                    const int t_per_atom) {
   int tid, ii, offset;
@@ -59,7 +60,7 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,nbor_end,nbor);
   
-    numtyp4 ix=x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex);
     int itype=ix.w;
 
     numtyp a[9];       // Rotation matrix (lab->body)
@@ -84,7 +85,7 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex);
       int jtype=jx.w;
 
       // Compute r12
@@ -331,14 +332,14 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
   } // if ii
 }
 
-__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
-                               __global numtyp4* shape,__global numtyp4* well, 
-                               __global numtyp *splj, __global numtyp2* sig_eps, 
-                               const int ntypes, __global int *dev_nbor,
-                               const int stride, __global acctyp4 *ans,
-                               __global acctyp *engv, __global int *err_flag,
-                               const int eflag, const int vflag,const int start,
-                               const int inum, const int t_per_atom) {
+__kernel void k_resquared_sphere_ellipsoid(__global numtyp4 *x_,
+                  __global numtyp4 *q, __global numtyp4* shape,
+                  __global numtyp4* well, __global numtyp *splj,
+                  __global numtyp2* sig_eps, const int ntypes, 
+                  __global int *dev_nbor, const int stride, 
+                  __global acctyp4 *ans, __global acctyp *engv, 
+                  __global int *err_flag, const int eflag, const int vflag,
+                  const int start, const int inum, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   ii+=start;
@@ -370,7 +371,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj,
                 n_stride,nbor_end,nbor);
   
-    numtyp4 jx=x_[j];
+    numtyp4 jx; fetch4(jx,j,pos_tex);
     int jtype=jx.w;
 
     numtyp factor_lj;
@@ -379,7 +380,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
       factor_lj = sp_lj[sbmask(i)];
       i &= NEIGHMASK;
 
-      numtyp4 ix=x_[i];
+      numtyp4 ix; fetch4(ix,i,pos_tex);
       int itype=ix.w;
 
       numtyp a[9];       // Rotation matrix (lab->body)
@@ -524,14 +525,13 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
   } // if ii
 }
 
-__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, 
-                        __global numtyp4* lj3, const int lj_types, 
-                        __global numtyp *gum, 
-                        const int stride, __global int *dev_ij, 
-                        __global acctyp4 *ans, __global acctyp *engv, 
-                        __global int *err_flag, const int eflag, 
-                        const int vflag, const int start, const int inum, 
-                        const int t_per_atom) {
+__kernel void k_resquared_lj(__global numtyp4 *x_, __global numtyp4 *lj1, 
+                             __global numtyp4* lj3, const int lj_types, 
+                             __global numtyp *gum, const int stride, 
+                             __global int *dev_ij, __global acctyp4 *ans,
+                             __global acctyp *engv, __global int *err_flag,
+                             const int eflag, const int vflag, const int start,
+                             const int inum, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   ii+=start;
@@ -557,7 +557,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,list_end,nbor);
   
-    numtyp4 ix=x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex);
     int itype=ix.w;
 
     numtyp factor_lj;
@@ -567,7 +567,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex);
       int jtype=jx.w;
 
       // Compute r12
@@ -606,13 +606,12 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, 
-                             __global numtyp4* lj3_in, __global numtyp *gum, 
-                             const int stride, __global int *dev_ij,
-                             __global acctyp4 *ans, __global acctyp *engv,
-                             __global int *err_flag, const int eflag,
-                             const int vflag, const int start, const int inum,
-                             const int t_per_atom) {
+__kernel void k_resquared_lj_fast(__global numtyp4 *x_, 
+                  __global numtyp4 *lj1_in, __global numtyp4* lj3_in, 
+                  __global numtyp *gum, const int stride, __global int *dev_ij,
+                  __global acctyp4 *ans, __global acctyp *engv, 
+                  __global int *err_flag, const int eflag, const int vflag,
+                  const int start, const int inum, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   ii+=start;
@@ -645,7 +644,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,list_end,nbor);
 
-    numtyp4 ix=x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -656,7 +655,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex);
       int mtype=itype+jx.w;
 
       // Compute r12
diff --git a/lib/gpu/lal_table.cpp b/lib/gpu/lal_table.cpp
index 5b3d934e53..d7c84d65ca 100644
--- a/lib/gpu/lal_table.cpp
+++ b/lib/gpu/lal_table.cpp
@@ -13,10 +13,12 @@
     email                : nguyentd@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "table_cl.h"
+#elif defined(USE_CUDART)
+const char *table=0;
 #else
-#include "table_ptx.h"
+#include "table_cubin.h"
 #endif
 
 #include "lal_table.h"
@@ -56,17 +58,17 @@ int TableT::init(const int ntypes,
                 const double gpu_split, FILE *_screen, 
                 int tabstyle, int ntables, int tablength) {
   int success;
-  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,table);
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
+                            gpu_split,_screen,table,"k_table");
   if (success!=0)
     return success;
   
-  k_pair_linear.set_function(*(this->pair_program),"kernel_pair_linear");
-  k_pair_linear_fast.set_function(*(this->pair_program),"kernel_pair_linear_fast");
-  k_pair_spline.set_function(*(this->pair_program),"kernel_pair_spline");
-  k_pair_spline_fast.set_function(*(this->pair_program),"kernel_pair_spline_fast");
-  k_pair_bitmap.set_function(*(this->pair_program),"kernel_pair_bitmap");
-  k_pair_bitmap_fast.set_function(*(this->pair_program),"kernel_pair_bitmap_fast");
+  k_pair_linear.set_function(*(this->pair_program),"k_table_linear");
+  k_pair_linear_fast.set_function(*(this->pair_program),"k_table_linear_fast");
+  k_pair_spline.set_function(*(this->pair_program),"k_table_spline");
+  k_pair_spline_fast.set_function(*(this->pair_program),"k_table_spline_fast");
+  k_pair_bitmap.set_function(*(this->pair_program),"k_table_bitmap");
+  k_pair_bitmap_fast.set_function(*(this->pair_program),"k_table_bitmap_fast");
   _compiled_styles = true;
 
   // If atom type constants fit in shared memory use fast kernel
@@ -264,84 +266,71 @@ void TableT::loop(const bool _eflag, const bool _vflag) {
   if (shared_types) {
     if (_tabstyle == LOOKUP) {
       this->k_pair_fast.set_size(GX,BX);
-      this->k_pair_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(),
-                            &coeff2.begin(), &coeff3.begin(),
-                            &coeff4.begin(), &cutsq.begin(), &sp_lj.begin(),
-                            &this->nbor->dev_nbor.begin(),
-                            &this->_nbor_data->begin(),
-                            &this->ans->dev_ans.begin(),
-                            &this->ans->dev_engv.begin(), &eflag, &vflag,
-                            &ainum, &nbor_pitch, &this->_threads_per_atom, 
-                            &_tablength);
+      this->k_pair_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3,
+                            &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor,
+                            &this->_nbor_data->begin(), &this->ans->force,
+                            &this->ans->engv, &eflag, &vflag, &ainum, 
+                            &nbor_pitch, &this->_threads_per_atom, &_tablength);
     } else if (_tabstyle == LINEAR) {
       this->k_pair_linear_fast.set_size(GX,BX);
-      this->k_pair_linear_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(),
-                            &coeff2.begin(), &coeff3.begin(),
-                            &coeff4.begin(), &cutsq.begin(), &sp_lj.begin(),
-                            &this->nbor->dev_nbor.begin(),
-                            &this->_nbor_data->begin(),
-                            &this->ans->dev_ans.begin(),
-                            &this->ans->dev_engv.begin(), &eflag, &vflag,
-                            &ainum, &nbor_pitch, &this->_threads_per_atom, 
-                            &_tablength);
+      this->k_pair_linear_fast.run(&this->atom->x, &tabindex, &coeff2, 
+                                   &coeff3, &coeff4, &cutsq, &sp_lj,
+                                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                                   &this->ans->force, &this->ans->engv,
+                                   &eflag, &vflag, &ainum, &nbor_pitch, 
+                                   &this->_threads_per_atom, &_tablength);
     } else if (_tabstyle == SPLINE) {
       this->k_pair_spline_fast.set_size(GX,BX);
-      this->k_pair_spline_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(),
-                            &coeff2.begin(), &coeff3.begin(),
-                            &coeff4.begin(), &cutsq.begin(), &sp_lj.begin(),
-                            &this->nbor->dev_nbor.begin(),
-                            &this->_nbor_data->begin(),
-                            &this->ans->dev_ans.begin(),
-                            &this->ans->dev_engv.begin(), &eflag, &vflag,
-                            &ainum, &nbor_pitch, &this->_threads_per_atom, 
-                            &_tablength);
+      this->k_pair_spline_fast.run(&this->atom->x, &tabindex, &coeff2, 
+                                   &coeff3, &coeff4, &cutsq, &sp_lj,
+                                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                                   &this->ans->force, &this->ans->engv,
+                                   &eflag, &vflag, &ainum, &nbor_pitch, 
+                                   &this->_threads_per_atom, &_tablength);
     } else if (_tabstyle == BITMAP) {
       this->k_pair_bitmap_fast.set_size(GX,BX);
-      this->k_pair_bitmap_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(),
-                            &nshiftbits.begin(), &nmask.begin(),   
-                            &coeff2.begin(), &coeff3.begin(),
-                            &coeff4.begin(), &cutsq.begin(), &sp_lj.begin(),
-                            &this->nbor->dev_nbor.begin(),
-                            &this->_nbor_data->begin(),
-                            &this->ans->dev_ans.begin(),
-                            &this->ans->dev_engv.begin(), &eflag, &vflag,
-                            &ainum, &nbor_pitch, &this->_threads_per_atom, 
-                            &_tablength);
+      this->k_pair_bitmap_fast.run(&this->atom->x, &tabindex, &nshiftbits,
+                                   &nmask, &coeff2, &coeff3, &coeff4, &cutsq,
+                                   &sp_lj, &this->nbor->dev_nbor, 
+                                   &this->_nbor_data->begin(), &this->ans->force,
+                                   &this->ans->engv, &eflag, &vflag,
+                                   &ainum, &nbor_pitch, 
+                                   &this->_threads_per_atom, &_tablength);
     } 
   } else {
     if (_tabstyle == LOOKUP) {
       this->k_pair.set_size(GX,BX);
-      this->k_pair.run(&this->atom->dev_x.begin(), &tabindex.begin(),
-                     &coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types, 
-                     &cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &nbor_pitch, &this->_threads_per_atom, &_tablength);
+      this->k_pair.run(&this->atom->x, &tabindex, &coeff2, &coeff3, 
+                       &coeff4, &_lj_types, &cutsq, &sp_lj, 
+                       &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                       &this->ans->force, &this->ans->engv, &eflag, 
+                       &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom,
+                       &_tablength);
     } else if (_tabstyle == LINEAR) {
       this->k_pair_linear.set_size(GX,BX);
-      this->k_pair_linear.run(&this->atom->dev_x.begin(), &tabindex.begin(),
-                     &coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types, 
-                     &cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &nbor_pitch, &this->_threads_per_atom, &_tablength);
+      this->k_pair_linear.run(&this->atom->x, &tabindex, &coeff2, &coeff3,
+                              &coeff4, &_lj_types, &cutsq, &sp_lj, 
+                              &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                              &this->ans->force, &this->ans->engv, &eflag,
+                              &vflag, &ainum, &nbor_pitch, 
+                              &this->_threads_per_atom, &_tablength);
     } else if (_tabstyle == SPLINE) {
       this->k_pair_spline.set_size(GX,BX);
-      this->k_pair_spline.run(&this->atom->dev_x.begin(), &tabindex.begin(),
-                     &coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types, 
-                     &cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &nbor_pitch, &this->_threads_per_atom, &_tablength);
+      this->k_pair_spline.run(&this->atom->x, &tabindex, &coeff2, &coeff3,
+                              &coeff4, &_lj_types, &cutsq, &sp_lj, 
+                              &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
+                              &this->ans->force, &this->ans->engv, &eflag,
+                              &vflag, &ainum, &nbor_pitch, 
+                              &this->_threads_per_atom, &_tablength);
     } else if (_tabstyle == BITMAP) {
       this->k_pair_bitmap.set_size(GX,BX);
-      this->k_pair_bitmap.run(&this->atom->dev_x.begin(), &tabindex.begin(),
-                     &nshiftbits.begin(), &nmask.begin(), 
-                     &coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types, 
-                     &cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &nbor_pitch, &this->_threads_per_atom, &_tablength);
+      this->k_pair_bitmap.run(&this->atom->x, &tabindex, &nshiftbits, 
+                              &nmask, &coeff2, &coeff3, &coeff4, &_lj_types,
+                              &cutsq, &sp_lj, &this->nbor->dev_nbor,
+                              &this->_nbor_data->begin(), &this->ans->force,
+                              &this->ans->engv, &eflag, &vflag, &ainum,
+                              &nbor_pitch, &this->_threads_per_atom, 
+                              &_tablength);
     }
   }
   this->time_pair.stop();
diff --git a/lib/gpu/lal_table.cu b/lib/gpu/lal_table.cu
index 65db9b9b15..4730c52cd1 100644
--- a/lib/gpu/lal_table.cu
+++ b/lib/gpu/lal_table.cu
@@ -15,11 +15,13 @@
 
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
-texture<float4> pos_tex;
 #ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
+texture<float4> pos_tex; // !_DOUBLE_DOUBLE case; kernels read it via fetch4()
+#else // _DOUBLE_DOUBLE is defined
+texture<int4,1> pos_tex; // int4 texels; NOTE(review): confirm how fetch4 decodes
 #endif
+#else // NV_KERNEL not defined: pos_tex is an alias for the x_ kernel argument
+#define pos_tex x_
 #endif
 
 #define LOOKUP 0
@@ -37,7 +39,7 @@ typedef union {
 
 /// ---------------- LOOKUP -------------------------------------------------
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex,
+__kernel void k_table(__global numtyp4 *x_, __global int *tabindex,
                        __global numtyp4* coeff2, 
                        __global numtyp4 *coeff3,
                        __global numtyp4 *coeff4,
@@ -73,7 +75,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     
     numtyp factor_lj;
@@ -83,7 +85,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype*lj_types+jx.w;
       int tbindex = tabindex[mtype];
       
@@ -128,7 +130,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex,
   } // if ii
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex,
+__kernel void k_table_fast(__global numtyp4 *x_, __global int *tabindex,
                             __global numtyp4* coeff2, 
                             __global numtyp4 *coeff3,
                             __global numtyp4 *coeff4,
@@ -167,7 +169,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
     
@@ -178,7 +180,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
       int tbindex = tabindex[mtype];
       
@@ -225,7 +227,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex,
 
 /// ---------------- LINEAR -------------------------------------------------
 
-__kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex,
+__kernel void k_table_linear(__global numtyp4 *x_, __global int *tabindex,
                        __global numtyp4* coeff2, 
                        __global numtyp4 *coeff3,
                        __global numtyp4 *coeff4,
@@ -261,7 +263,7 @@ __kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     
     numtyp factor_lj;
@@ -271,7 +273,7 @@ __kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype*lj_types+jx.w;
       int tbindex = tabindex[mtype];
       
@@ -320,7 +322,7 @@ __kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex,
   } // if ii
 }
 
-__kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabindex,
+__kernel void k_table_linear_fast(__global numtyp4 *x_, __global int *tabindex,
                             __global numtyp4* coeff2, 
                             __global numtyp4 *coeff3,
                             __global numtyp4 *coeff4,
@@ -359,7 +361,7 @@ __kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabind
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
     
@@ -370,7 +372,7 @@ __kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabind
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
       int tbindex = tabindex[mtype];
       
@@ -421,7 +423,7 @@ __kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabind
 
 /// ---------------- SPLINE -------------------------------------------------
 
-__kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex,
+__kernel void k_table_spline(__global numtyp4 *x_, __global int *tabindex,
                        __global numtyp4* coeff2, 
                        __global numtyp4 *coeff3,
                        __global numtyp4 *coeff4,
@@ -457,7 +459,7 @@ __kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     
     numtyp factor_lj;
@@ -467,7 +469,7 @@ __kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype*lj_types+jx.w;
       int tbindex = tabindex[mtype];
       
@@ -523,7 +525,7 @@ __kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex,
   } // if ii
 }
 
-__kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabindex,
+__kernel void k_table_spline_fast(__global numtyp4 *x_, __global int *tabindex,
                             __global numtyp4* coeff2, 
                             __global numtyp4 *coeff3,
                             __global numtyp4 *coeff4,
@@ -562,7 +564,7 @@ __kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabind
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
     
@@ -573,7 +575,7 @@ __kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabind
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
       int tbindex = tabindex[mtype];
       
@@ -631,7 +633,7 @@ __kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabind
 
 /// ---------------- BITMAP -------------------------------------------------
 
-__kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex,
+__kernel void k_table_bitmap(__global numtyp4 *x_, __global int *tabindex,
                        __global int *nshiftbits, __global int *nmask,
                        __global numtyp4* coeff2, 
                        __global numtyp4 *coeff3,
@@ -668,7 +670,7 @@ __kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     
     numtyp factor_lj;
@@ -678,7 +680,7 @@ __kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype*lj_types+jx.w;
       int tbindex = tabindex[mtype];
       
@@ -730,7 +732,7 @@ __kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex,
   } // if ii
 }
 
-__kernel void kernel_pair_bitmap_fast(__global numtyp4 *x_, __global int *tabindex,
+__kernel void k_table_bitmap_fast(__global numtyp4 *x_, __global int *tabindex,
                             __global int *nshiftbits, __global int *nmask,
                             __global numtyp4* coeff2, 
                             __global numtyp4 *coeff3,
@@ -770,7 +772,7 @@ __kernel void kernel_pair_bitmap_fast(__global numtyp4 *x_, __global int *tabind
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
     
@@ -781,7 +783,7 @@ __kernel void kernel_pair_bitmap_fast(__global numtyp4 *x_, __global int *tabind
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
       int tbindex = tabindex[mtype];
       
diff --git a/lib/gpu/lal_yukawa.cpp b/lib/gpu/lal_yukawa.cpp
index 5ab94ae817..acde5e9890 100644
--- a/lib/gpu/lal_yukawa.cpp
+++ b/lib/gpu/lal_yukawa.cpp
@@ -13,10 +13,12 @@
     email                : nguyentd@ornl.gov
  ***************************************************************************/
 
-#ifdef USE_OPENCL
+#if defined(USE_OPENCL)
 #include "yukawa_cl.h"
+#elif defined(USE_CUDART)
+const char *yukawa=0;
 #else
-#include "yukawa_ptx.h"
+#include "yukawa_cubin.h"
 #endif
 
 #include "lal_yukawa.h"
@@ -50,7 +52,7 @@ int YukawaT::init(const int ntypes,
                   const double gpu_split, FILE *_screen) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,yukawa);
+                            _screen,yukawa,"k_yukawa");
   if (success!=0)
     return success;
 
@@ -129,20 +131,17 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff.begin(), &_kappa, 
-                          &sp_lj.begin(),
-                          &this->nbor->dev_nbor.begin(),
-                          &this->_nbor_data->begin(),
-                          &this->ans->dev_ans.begin(),
-                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &nbor_pitch, &this->_threads_per_atom);
+    this->k_pair_fast.run(&this->atom->x, &coeff, &_kappa, &sp_lj,
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag,
+                          &vflag, &ainum, &nbor_pitch, 
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->dev_x.begin(), &coeff.begin(), &_kappa,
-                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
-                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &nbor_pitch, &this->_threads_per_atom);
+    this->k_pair.run(&this->atom->x, &coeff, &_kappa, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                     &this->ans->force, &this->ans->engv, &eflag, &vflag,
+                     &ainum, &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/lal_yukawa.cu b/lib/gpu/lal_yukawa.cu
index e2fa11aa36..593d123776 100644
--- a/lib/gpu/lal_yukawa.cu
+++ b/lib/gpu/lal_yukawa.cu
@@ -15,14 +15,16 @@
 
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
-texture<float4> pos_tex;
 #ifndef _DOUBLE_DOUBLE
-ucl_inline float4 fetch_pos(const int& i, const float4 *pos) 
-  { return tex1Dfetch(pos_tex, i); }
+texture<float4> pos_tex; // !_DOUBLE_DOUBLE case; kernels read it via fetch4()
+#else // _DOUBLE_DOUBLE is defined
+texture<int4,1> pos_tex; // int4 texels; NOTE(review): confirm how fetch4 decodes
 #endif
+#else // NV_KERNEL not defined: pos_tex is an alias for the x_ kernel argument
+#define pos_tex x_
 #endif
 
-__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
+__kernel void k_yukawa(__global numtyp4 *x_, __global numtyp4 *coeff,
                           const numtyp kappa, const int lj_types,
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
@@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
@@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       // Compute r12
@@ -103,7 +105,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
   } // if ii
 }
 
-__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
+__kernel void k_yukawa_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
                                const numtyp kappa, __global numtyp* sp_lj_in, 
                                __global int *dev_nbor, __global int *dev_packed, 
                                __global acctyp4 *ans, __global acctyp *engv, 
@@ -135,7 +137,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
 
-    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
@@ -146,7 +148,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
-      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
 
       // Compute r12