git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8693 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp
2012-08-21 13:57:32 +00:00
parent 9a99e27552
commit 31551d81fd
85 changed files with 2630 additions and 2172 deletions

View File

@@ -3,6 +3,7 @@ CUDA = $(NVCC) $(CUDA_INCLUDE) $(CUDA_OPTS) -Icudpp_mini $(CUDA_ARCH) \
CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
$(CUDPP_OPT)
CUDA_LINK = $(CUDA_LIB) -lcudart
BIN2C = $(CUDA_HOME)/bin/bin2c
GPU_LIB = $(LIB_DIR)/libgpu.a
@@ -27,6 +28,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_neighbor.o $(OBJ_DIR)/lal_neighbor_shared.o \
$(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
$(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
$(OBJ_DIR)/lal_base_dipole.o \
$(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
$(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
$(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@@ -35,6 +37,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_lj_expand.o $(OBJ_DIR)/lal_lj_expand_ext.o \
$(OBJ_DIR)/lal_lj_coul.o $(OBJ_DIR)/lal_lj_coul_ext.o \
$(OBJ_DIR)/lal_lj_coul_long.o $(OBJ_DIR)/lal_lj_coul_long_ext.o \
$(OBJ_DIR)/lal_lj_dsf.o $(OBJ_DIR)/lal_lj_dsf_ext.o \
$(OBJ_DIR)/lal_lj_class2_long.o $(OBJ_DIR)/lal_lj_class2_long_ext.o \
$(OBJ_DIR)/lal_coul_long.o $(OBJ_DIR)/lal_coul_long_ext.o \
$(OBJ_DIR)/lal_morse.o $(OBJ_DIR)/lal_morse_ext.o \
@@ -46,35 +49,57 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
$(OBJ_DIR)/lal_buck_coul.o $(OBJ_DIR)/lal_buck_coul_ext.o \
$(OBJ_DIR)/lal_buck_coul_long.o $(OBJ_DIR)/lal_buck_coul_long_ext.o \
$(OBJ_DIR)/lal_table.o $(OBJ_DIR)/lal_table_ext.o \
$(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o
PTXS = $(OBJ_DIR)/device.ptx $(OBJ_DIR)/device_ptx.h \
$(OBJ_DIR)/atom.ptx $(OBJ_DIR)/atom_ptx.h \
$(OBJ_DIR)/neighbor_cpu.ptx $(OBJ_DIR)/neighbor_cpu_ptx.h \
$(OBJ_DIR)/neighbor_gpu.ptx $(OBJ_DIR)/neighbor_gpu_ptx.h \
$(OBJ_DIR)/pppm_f.ptx $(OBJ_DIR)/pppm_f_ptx.h \
$(OBJ_DIR)/pppm_d.ptx $(OBJ_DIR)/pppm_d_ptx.h \
$(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/ellipsoid_nbor_ptx.h \
$(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_lj.ptx \
$(OBJ_DIR)/gayberne_ptx.h $(OBJ_DIR)/gayberne_lj_ptx.h \
$(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_lj.ptx \
$(OBJ_DIR)/re_squared_ptx.h $(OBJ_DIR)/re_squared_lj_ptx.h \
$(OBJ_DIR)/lj.ptx $(OBJ_DIR)/lj_ptx.h \
$(OBJ_DIR)/lj96.ptx $(OBJ_DIR)/lj96_ptx.h \
$(OBJ_DIR)/lj_expand.ptx $(OBJ_DIR)/lj_expand_ptx.h \
$(OBJ_DIR)/lj_coul.ptx $(OBJ_DIR)/lj_coul_ptx.h \
$(OBJ_DIR)/lj_coul_long.ptx $(OBJ_DIR)/lj_coul_long_ptx.h \
$(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long_ptx.h \
$(OBJ_DIR)/coul_long.ptx $(OBJ_DIR)/coul_long_ptx.h \
$(OBJ_DIR)/morse.ptx $(OBJ_DIR)/morse_ptx.h \
$(OBJ_DIR)/charmm_long.ptx $(OBJ_DIR)/charmm_long_ptx.h \
$(OBJ_DIR)/cg_cmm.ptx $(OBJ_DIR)/cg_cmm_ptx.h \
$(OBJ_DIR)/cg_cmm_long.ptx $(OBJ_DIR)/cg_cmm_long_ptx.h \
$(OBJ_DIR)/eam.ptx $(OBJ_DIR)/eam_ptx.h \
$(OBJ_DIR)/buck.ptx $(OBJ_DIR)/buck_ptx.h \
$(OBJ_DIR)/buck_coul.ptx $(OBJ_DIR)/buck_coul_ptx.h \
$(OBJ_DIR)/buck_coul_long.ptx $(OBJ_DIR)/buck_coul_long_ptx.h \
$(OBJ_DIR)/table.ptx $(OBJ_DIR)/table_ptx.h \
$(OBJ_DIR)/yukawa.ptx $(OBJ_DIR)/yukawa_ptx.h
$(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o \
$(OBJ_DIR)/lal_born.o $(OBJ_DIR)/lal_born_ext.o \
$(OBJ_DIR)/lal_born_coul_wolf.o $(OBJ_DIR)/lal_born_coul_wolf_ext.o \
$(OBJ_DIR)/lal_born_coul_long.o $(OBJ_DIR)/lal_born_coul_long_ext.o \
$(OBJ_DIR)/lal_dipole_lj.o $(OBJ_DIR)/lal_dipole_lj_ext.o \
$(OBJ_DIR)/lal_dipole_lj_sf.o $(OBJ_DIR)/lal_dipole_lj_sf_ext.o \
$(OBJ_DIR)/lal_colloid.o $(OBJ_DIR)/lal_colloid_ext.o \
$(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
$(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
$(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o
CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
$(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
$(OBJ_DIR)/neighbor_cpu.cubin $(OBJ_DIR)/neighbor_cpu_cubin.h \
$(OBJ_DIR)/neighbor_gpu.cubin $(OBJ_DIR)/neighbor_gpu_cubin.h \
$(OBJ_DIR)/pppm_f.cubin $(OBJ_DIR)/pppm_f_cubin.h \
$(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/pppm_d_cubin.h \
$(OBJ_DIR)/ellipsoid_nbor.cubin $(OBJ_DIR)/ellipsoid_nbor_cubin.h \
$(OBJ_DIR)/gayberne.cubin $(OBJ_DIR)/gayberne_lj.cubin \
$(OBJ_DIR)/gayberne_cubin.h $(OBJ_DIR)/gayberne_lj_cubin.h \
$(OBJ_DIR)/re_squared.cubin $(OBJ_DIR)/re_squared_lj.cubin \
$(OBJ_DIR)/re_squared_cubin.h $(OBJ_DIR)/re_squared_lj_cubin.h \
$(OBJ_DIR)/lj.cubin $(OBJ_DIR)/lj_cubin.h \
$(OBJ_DIR)/lj96.cubin $(OBJ_DIR)/lj96_cubin.h \
$(OBJ_DIR)/lj_expand.cubin $(OBJ_DIR)/lj_expand_cubin.h \
$(OBJ_DIR)/lj_coul.cubin $(OBJ_DIR)/lj_coul_cubin.h \
$(OBJ_DIR)/lj_coul_long.cubin $(OBJ_DIR)/lj_coul_long_cubin.h \
$(OBJ_DIR)/lj_dsf.cubin $(OBJ_DIR)/lj_dsf_cubin.h \
$(OBJ_DIR)/lj_class2_long.cubin $(OBJ_DIR)/lj_class2_long_cubin.h \
$(OBJ_DIR)/coul_long.cubin $(OBJ_DIR)/coul_long_cubin.h \
$(OBJ_DIR)/morse.cubin $(OBJ_DIR)/morse_cubin.h \
$(OBJ_DIR)/charmm_long.cubin $(OBJ_DIR)/charmm_long_cubin.h \
$(OBJ_DIR)/cg_cmm.cubin $(OBJ_DIR)/cg_cmm_cubin.h \
$(OBJ_DIR)/cg_cmm_long.cubin $(OBJ_DIR)/cg_cmm_long_cubin.h \
$(OBJ_DIR)/eam.cubin $(OBJ_DIR)/eam_cubin.h \
$(OBJ_DIR)/buck.cubin $(OBJ_DIR)/buck_cubin.h \
$(OBJ_DIR)/buck_coul_long.cubin $(OBJ_DIR)/buck_coul_long_cubin.h \
$(OBJ_DIR)/buck_coul_wolf.cubin $(OBJ_DIR)/buck_coul_wolf_cubin.h \
$(OBJ_DIR)/table.cubin $(OBJ_DIR)/table_cubin.h \
$(OBJ_DIR)/yukawa.cubin $(OBJ_DIR)/yukawa_cubin.h \
$(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \
$(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf_cubin.h \
$(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long_cubin.h \
$(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj_cubin.h \
$(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf_cubin.h \
$(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid_cubin.h \
$(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \
$(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \
$(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \
$(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h
all: $(GPU_LIB) $(EXECS)
@@ -96,43 +121,43 @@ $(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
$(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
$(CUDA) -o $@ -c cudpp_mini/scan_app.cu
$(OBJ_DIR)/atom.ptx: lal_atom.cu lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_atom.cu
$(OBJ_DIR)/atom.cubin: lal_atom.cu lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_atom.cu
$(OBJ_DIR)/atom_ptx.h: $(OBJ_DIR)/atom.ptx
$(BSH) ./geryon/file_to_cstr.sh atom $(OBJ_DIR)/atom.ptx $(OBJ_DIR)/atom_ptx.h
$(OBJ_DIR)/atom_cubin.h: $(OBJ_DIR)/atom.cubin
$(BIN2C) -c -n atom $(OBJ_DIR)/atom.cubin > $(OBJ_DIR)/atom_cubin.h
$(OBJ_DIR)/lal_atom.o: lal_atom.cpp lal_atom.h $(NVD_H) $(OBJ_DIR)/atom_ptx.h
$(OBJ_DIR)/lal_atom.o: lal_atom.cpp lal_atom.h $(NVD_H) $(OBJ_DIR)/atom_cubin.h
$(CUDR) -o $@ -c lal_atom.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_ans.o: lal_answer.cpp lal_answer.h $(NVD_H)
$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/neighbor_cpu.ptx: lal_neighbor_cpu.cu lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_neighbor_cpu.cu
$(OBJ_DIR)/neighbor_cpu.cubin: lal_neighbor_cpu.cu lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_neighbor_cpu.cu
$(OBJ_DIR)/neighbor_cpu_ptx.h: $(OBJ_DIR)/neighbor_cpu.ptx
$(BSH) ./geryon/file_to_cstr.sh neighbor_cpu $(OBJ_DIR)/neighbor_cpu.ptx $(OBJ_DIR)/neighbor_cpu_ptx.h
$(OBJ_DIR)/neighbor_cpu_cubin.h: $(OBJ_DIR)/neighbor_cpu.cubin
$(BIN2C) -c -n neighbor_cpu $(OBJ_DIR)/neighbor_cpu.cubin > $(OBJ_DIR)/neighbor_cpu_cubin.h
$(OBJ_DIR)/neighbor_gpu.ptx: lal_neighbor_gpu.cu lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_neighbor_gpu.cu
$(OBJ_DIR)/neighbor_gpu.cubin: lal_neighbor_gpu.cu lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_neighbor_gpu.cu
$(OBJ_DIR)/neighbor_gpu_ptx.h: $(OBJ_DIR)/neighbor_gpu.ptx
$(BSH) ./geryon/file_to_cstr.sh neighbor_gpu $(OBJ_DIR)/neighbor_gpu.ptx $(OBJ_DIR)/neighbor_gpu_ptx.h
$(OBJ_DIR)/neighbor_gpu_cubin.h: $(OBJ_DIR)/neighbor_gpu.cubin
$(BIN2C) -c -n neighbor_gpu $(OBJ_DIR)/neighbor_gpu.cubin > $(OBJ_DIR)/neighbor_gpu_cubin.h
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp lal_neighbor_shared.h $(OBJ_DIR)/neighbor_cpu_ptx.h $(OBJ_DIR)/neighbor_gpu_ptx.h $(NVD_H)
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp lal_neighbor_shared.h $(OBJ_DIR)/neighbor_cpu_cubin.h $(OBJ_DIR)/neighbor_gpu_cubin.h $(NVD_H)
$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp lal_neighbor.h lal_neighbor_shared.h $(NVD_H)
$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/device.ptx: lal_device.cu lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_device.cu
$(OBJ_DIR)/device.cubin: lal_device.cu lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_device.cu
$(OBJ_DIR)/device_ptx.h: $(OBJ_DIR)/device.ptx
$(BSH) ./geryon/file_to_cstr.sh device $(OBJ_DIR)/device.ptx $(OBJ_DIR)/device_ptx.h
$(OBJ_DIR)/device_cubin.h: $(OBJ_DIR)/device.cubin
$(BIN2C) -c -n device $(OBJ_DIR)/device.cubin > $(OBJ_DIR)/device_cubin.h
$(OBJ_DIR)/lal_device.o: lal_device.cpp lal_device.h $(ALL_H) $(OBJ_DIR)/device_ptx.h
$(OBJ_DIR)/lal_device.o: lal_device.cpp lal_device.h $(ALL_H) $(OBJ_DIR)/device_cubin.h
$(CUDR) -o $@ -c lal_device.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_base_atomic.o: $(ALL_H) lal_base_atomic.h lal_base_atomic.cpp
@@ -141,273 +166,408 @@ $(OBJ_DIR)/lal_base_atomic.o: $(ALL_H) lal_base_atomic.h lal_base_atomic.cpp
$(OBJ_DIR)/lal_base_charge.o: $(ALL_H) lal_base_charge.h lal_base_charge.cpp
$(CUDR) -o $@ -c lal_base_charge.cpp
$(OBJ_DIR)/lal_base_ellipsoid.o: $(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_ptx.h
$(OBJ_DIR)/lal_base_ellipsoid.o: $(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_cubin.h
$(CUDR) -o $@ -c lal_base_ellipsoid.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pppm_f.ptx: lal_pppm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu
$(OBJ_DIR)/lal_base_dipole.o: $(ALL_H) lal_base_dipole.h lal_base_dipole.cpp
$(CUDR) -o $@ -c lal_base_dipole.cpp
$(OBJ_DIR)/pppm_f_ptx.h: $(OBJ_DIR)/pppm_f.ptx
$(BSH) ./geryon/file_to_cstr.sh pppm_f $(OBJ_DIR)/pppm_f.ptx $(OBJ_DIR)/pppm_f_ptx.h
$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu
$(OBJ_DIR)/pppm_d.ptx: lal_pppm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu
$(OBJ_DIR)/pppm_f_cubin.h: $(OBJ_DIR)/pppm_f.cubin
$(BIN2C) -c -n pppm_f $(OBJ_DIR)/pppm_f.cubin > $(OBJ_DIR)/pppm_f_cubin.h
$(OBJ_DIR)/pppm_d_ptx.h: $(OBJ_DIR)/pppm_d.ptx
$(BSH) ./geryon/file_to_cstr.sh pppm_d $(OBJ_DIR)/pppm_d.ptx $(OBJ_DIR)/pppm_d_ptx.h
$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu
$(OBJ_DIR)/lal_pppm.o: $(ALL_H) lal_pppm.h lal_pppm.cpp $(OBJ_DIR)/pppm_f_ptx.h $(OBJ_DIR)/pppm_d_ptx.h
$(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin
$(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h
$(OBJ_DIR)/lal_pppm.o: $(ALL_H) lal_pppm.h lal_pppm.cpp $(OBJ_DIR)/pppm_f_cubin.h $(OBJ_DIR)/pppm_d_cubin.h
$(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_pppm_ext.o: $(ALL_H) lal_pppm.h lal_pppm_ext.cpp
$(CUDR) -o $@ -c lal_pppm_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ellipsoid_nbor.ptx: lal_ellipsoid_nbor.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_ellipsoid_nbor.cu
$(OBJ_DIR)/ellipsoid_nbor.cubin: lal_ellipsoid_nbor.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_ellipsoid_nbor.cu
$(OBJ_DIR)/ellipsoid_nbor_ptx.h: $(OBJ_DIR)/ellipsoid_nbor.ptx
$(BSH) ./geryon/file_to_cstr.sh ellipsoid_nbor $(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/ellipsoid_nbor_ptx.h
$(OBJ_DIR)/ellipsoid_nbor_cubin.h: $(OBJ_DIR)/ellipsoid_nbor.cubin
$(BIN2C) -c -n ellipsoid_nbor $(OBJ_DIR)/ellipsoid_nbor.cubin > $(OBJ_DIR)/ellipsoid_nbor_cubin.h
$(OBJ_DIR)/gayberne.ptx: lal_gayberne.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_gayberne.cu
$(OBJ_DIR)/gayberne.cubin: lal_gayberne.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_gayberne.cu
$(OBJ_DIR)/gayberne_lj.ptx: lal_gayberne_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_gayberne_lj.cu
$(OBJ_DIR)/gayberne_lj.cubin: lal_gayberne_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_gayberne_lj.cu
$(OBJ_DIR)/gayberne_ptx.h: $(OBJ_DIR)/gayberne.ptx
$(BSH) ./geryon/file_to_cstr.sh gayberne $(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_ptx.h
$(OBJ_DIR)/gayberne_cubin.h: $(OBJ_DIR)/gayberne.cubin
$(BIN2C) -c -n gayberne $(OBJ_DIR)/gayberne.cubin > $(OBJ_DIR)/gayberne_cubin.h
$(OBJ_DIR)/gayberne_lj_ptx.h: $(OBJ_DIR)/gayberne_lj.ptx
$(BSH) ./geryon/file_to_cstr.sh gayberne_lj $(OBJ_DIR)/gayberne_lj.ptx $(OBJ_DIR)/gayberne_lj_ptx.h
$(OBJ_DIR)/gayberne_lj_cubin.h: $(OBJ_DIR)/gayberne_lj.cubin
$(BIN2C) -c -n gayberne_lj $(OBJ_DIR)/gayberne_lj.cubin > $(OBJ_DIR)/gayberne_lj_cubin.h
$(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_ptx.h $(OBJ_DIR)/gayberne_lj_ptx.h $(OBJ_DIR)/lal_base_ellipsoid.o
$(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_cubin.h $(OBJ_DIR)/gayberne_lj_cubin.h $(OBJ_DIR)/lal_base_ellipsoid.o
$(CUDR) -o $@ -c lal_gayberne.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_gayberne_ext.o: $(ALL_H) $(OBJ_DIR)/lal_gayberne.o lal_gayberne_ext.cpp
$(CUDR) -o $@ -c lal_gayberne_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/re_squared.ptx: lal_re_squared.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_re_squared.cu
$(OBJ_DIR)/re_squared.cubin: lal_re_squared.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_re_squared.cu
$(OBJ_DIR)/re_squared_lj.ptx: lal_re_squared_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_re_squared_lj.cu
$(OBJ_DIR)/re_squared_lj.cubin: lal_re_squared_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_re_squared_lj.cu
$(OBJ_DIR)/re_squared_ptx.h: $(OBJ_DIR)/re_squared.ptx
$(BSH) ./geryon/file_to_cstr.sh re_squared $(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_ptx.h
$(OBJ_DIR)/re_squared_cubin.h: $(OBJ_DIR)/re_squared.cubin
$(BIN2C) -c -n re_squared $(OBJ_DIR)/re_squared.cubin > $(OBJ_DIR)/re_squared_cubin.h
$(OBJ_DIR)/re_squared_lj_ptx.h: $(OBJ_DIR)/re_squared_lj.ptx
$(BSH) ./geryon/file_to_cstr.sh re_squared_lj $(OBJ_DIR)/re_squared_lj.ptx $(OBJ_DIR)/re_squared_lj_ptx.h
$(OBJ_DIR)/re_squared_lj_cubin.h: $(OBJ_DIR)/re_squared_lj.cubin
$(BIN2C) -c -n re_squared_lj $(OBJ_DIR)/re_squared_lj.cubin > $(OBJ_DIR)/re_squared_lj_cubin.h
$(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_ptx.h $(OBJ_DIR)/re_squared_lj_ptx.h $(OBJ_DIR)/lal_base_ellipsoid.o
$(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_cubin.h $(OBJ_DIR)/re_squared_lj_cubin.h $(OBJ_DIR)/lal_base_ellipsoid.o
$(CUDR) -o $@ -c lal_re_squared.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_re_squared_ext.o: $(ALL_H) $(OBJ_DIR)/lal_re_squared.o lal_re_squared_ext.cpp
$(CUDR) -o $@ -c lal_re_squared_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj.ptx: lal_lj.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj.cu
$(OBJ_DIR)/lj.cubin: lal_lj.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj.cu
$(OBJ_DIR)/lj_ptx.h: $(OBJ_DIR)/lj.ptx $(OBJ_DIR)/lj.ptx
$(BSH) ./geryon/file_to_cstr.sh lj $(OBJ_DIR)/lj.ptx $(OBJ_DIR)/lj_ptx.h
$(OBJ_DIR)/lj_cubin.h: $(OBJ_DIR)/lj.cubin $(OBJ_DIR)/lj.cubin
$(BIN2C) -c -n lj $(OBJ_DIR)/lj.cubin > $(OBJ_DIR)/lj_cubin.h
$(OBJ_DIR)/lal_lj.o: $(ALL_H) lal_lj.h lal_lj.cpp $(OBJ_DIR)/lj_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_lj.o: $(ALL_H) lal_lj.h lal_lj.cpp $(OBJ_DIR)/lj_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_lj.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_ext.o: $(ALL_H) lal_lj.h lal_lj_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_lj_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_coul.ptx: lal_lj_coul.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_coul.cu
$(OBJ_DIR)/lj_coul.cubin: lal_lj_coul.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul.cu
$(OBJ_DIR)/lj_coul_ptx.h: $(OBJ_DIR)/lj_coul.ptx $(OBJ_DIR)/lj_coul.ptx
$(BSH) ./geryon/file_to_cstr.sh lj_coul $(OBJ_DIR)/lj_coul.ptx $(OBJ_DIR)/lj_coul_ptx.h
$(OBJ_DIR)/lj_coul_cubin.h: $(OBJ_DIR)/lj_coul.cubin $(OBJ_DIR)/lj_coul.cubin
$(BIN2C) -c -n lj_coul $(OBJ_DIR)/lj_coul.cubin > $(OBJ_DIR)/lj_coul_cubin.h
$(OBJ_DIR)/lal_lj_coul.o: $(ALL_H) lal_lj_coul.h lal_lj_coul.cpp $(OBJ_DIR)/lj_coul_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_lj_coul.o: $(ALL_H) lal_lj_coul.h lal_lj_coul.cpp $(OBJ_DIR)/lj_coul_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_coul.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_coul_ext.o: $(ALL_H) lal_lj_coul.h lal_lj_coul_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_coul_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_class2_long.ptx: lal_lj_class2_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_class2_long.cu
$(OBJ_DIR)/lj_class2_long.cubin: lal_lj_class2_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_class2_long.cu
$(OBJ_DIR)/lj_class2_long_ptx.h: $(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long.ptx
$(BSH) ./geryon/file_to_cstr.sh lj_class2_long $(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long_ptx.h
$(OBJ_DIR)/lj_class2_long_cubin.h: $(OBJ_DIR)/lj_class2_long.cubin $(OBJ_DIR)/lj_class2_long.cubin
$(BIN2C) -c -n lj_class2_long $(OBJ_DIR)/lj_class2_long.cubin > $(OBJ_DIR)/lj_class2_long_cubin.h
$(OBJ_DIR)/lal_lj_class2_long.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long.cpp $(OBJ_DIR)/lj_class2_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_lj_class2_long.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long.cpp $(OBJ_DIR)/lj_class2_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_class2_long.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_class2_long_ext.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_class2_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/coul_long.ptx: lal_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_coul_long.cu
$(OBJ_DIR)/coul_long.cubin: lal_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul_long.cu
$(OBJ_DIR)/coul_long_ptx.h: $(OBJ_DIR)/coul_long.ptx $(OBJ_DIR)/coul_long.ptx
$(BSH) ./geryon/file_to_cstr.sh coul_long $(OBJ_DIR)/coul_long.ptx $(OBJ_DIR)/coul_long_ptx.h
$(OBJ_DIR)/coul_long_cubin.h: $(OBJ_DIR)/coul_long.cubin $(OBJ_DIR)/coul_long.cubin
$(BIN2C) -c -n coul_long $(OBJ_DIR)/coul_long.cubin > $(OBJ_DIR)/coul_long_cubin.h
$(OBJ_DIR)/lal_coul_long.o: $(ALL_H) lal_coul_long.h lal_coul_long.cpp $(OBJ_DIR)/coul_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_coul_long.o: $(ALL_H) lal_coul_long.h lal_coul_long.cpp $(OBJ_DIR)/coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_coul_long.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_coul_long_ext.o: $(ALL_H) lal_coul_long.h lal_coul_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_coul_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_coul_long.ptx: lal_lj_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_coul_long.cu
$(OBJ_DIR)/lj_coul_long.cubin: lal_lj_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul_long.cu
$(OBJ_DIR)/lj_coul_long_ptx.h: $(OBJ_DIR)/lj_coul_long.ptx $(OBJ_DIR)/lj_coul_long.ptx
$(BSH) ./geryon/file_to_cstr.sh lj_coul_long $(OBJ_DIR)/lj_coul_long.ptx $(OBJ_DIR)/lj_coul_long_ptx.h
$(OBJ_DIR)/lj_coul_long_cubin.h: $(OBJ_DIR)/lj_coul_long.cubin $(OBJ_DIR)/lj_coul_long.cubin
$(BIN2C) -c -n lj_coul_long $(OBJ_DIR)/lj_coul_long.cubin > $(OBJ_DIR)/lj_coul_long_cubin.h
$(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp $(OBJ_DIR)/lj_coul_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp $(OBJ_DIR)/lj_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_coul_long.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_coul_long_ext.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_coul_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/morse.ptx: lal_morse.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_morse.cu
$(OBJ_DIR)/lj_dsf.cubin: lal_lj_dsf.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_dsf.cu
$(OBJ_DIR)/morse_ptx.h: $(OBJ_DIR)/morse.ptx $(OBJ_DIR)/morse.ptx
$(BSH) ./geryon/file_to_cstr.sh morse $(OBJ_DIR)/morse.ptx $(OBJ_DIR)/morse_ptx.h
$(OBJ_DIR)/lj_dsf_cubin.h: $(OBJ_DIR)/lj_dsf.cubin $(OBJ_DIR)/lj_dsf.cubin
$(BIN2C) -c -n lj_dsf $(OBJ_DIR)/lj_dsf.cubin > $(OBJ_DIR)/lj_dsf_cubin.h
$(OBJ_DIR)/lal_morse.o: $(ALL_H) lal_morse.h lal_morse.cpp $(OBJ_DIR)/morse_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_lj_dsf.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf.cpp $(OBJ_DIR)/lj_dsf_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_dsf.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_dsf_ext.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_dsf_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/morse.cubin: lal_morse.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_morse.cu
$(OBJ_DIR)/morse_cubin.h: $(OBJ_DIR)/morse.cubin $(OBJ_DIR)/morse.cubin
$(BIN2C) -c -n morse $(OBJ_DIR)/morse.cubin > $(OBJ_DIR)/morse_cubin.h
$(OBJ_DIR)/lal_morse.o: $(ALL_H) lal_morse.h lal_morse.cpp $(OBJ_DIR)/morse_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_morse.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_morse_ext.o: $(ALL_H) lal_morse.h lal_morse_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_morse_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/charmm_long.ptx: lal_charmm_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_charmm_long.cu
$(OBJ_DIR)/charmm_long.cubin: lal_charmm_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_charmm_long.cu
$(OBJ_DIR)/charmm_long_ptx.h: $(OBJ_DIR)/charmm_long.ptx $(OBJ_DIR)/charmm_long.ptx
$(BSH) ./geryon/file_to_cstr.sh charmm_long $(OBJ_DIR)/charmm_long.ptx $(OBJ_DIR)/charmm_long_ptx.h
$(OBJ_DIR)/charmm_long_cubin.h: $(OBJ_DIR)/charmm_long.cubin $(OBJ_DIR)/charmm_long.cubin
$(BIN2C) -c -n charmm_long $(OBJ_DIR)/charmm_long.cubin > $(OBJ_DIR)/charmm_long_cubin.h
$(OBJ_DIR)/lal_charmm_long.o: $(ALL_H) lal_charmm_long.h lal_charmm_long.cpp $(OBJ_DIR)/charmm_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_charmm_long.o: $(ALL_H) lal_charmm_long.h lal_charmm_long.cpp $(OBJ_DIR)/charmm_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_charmm_long.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_charmm_long_ext.o: $(ALL_H) lal_charmm_long.h lal_charmm_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_charmm_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96.ptx: lal_lj96.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj96.cu
$(OBJ_DIR)/lj96.cubin: lal_lj96.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj96.cu
$(OBJ_DIR)/lj96_ptx.h: $(OBJ_DIR)/lj96.ptx $(OBJ_DIR)/lj96.ptx
$(BSH) ./geryon/file_to_cstr.sh lj96 $(OBJ_DIR)/lj96.ptx $(OBJ_DIR)/lj96_ptx.h
$(OBJ_DIR)/lj96_cubin.h: $(OBJ_DIR)/lj96.cubin $(OBJ_DIR)/lj96.cubin
$(BIN2C) -c -n lj96 $(OBJ_DIR)/lj96.cubin > $(OBJ_DIR)/lj96_cubin.h
$(OBJ_DIR)/lal_lj96.o: $(ALL_H) lal_lj96.h lal_lj96.cpp $(OBJ_DIR)/lj96_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_lj96.o: $(ALL_H) lal_lj96.h lal_lj96.cpp $(OBJ_DIR)/lj96_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_lj96.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj96_ext.o: $(ALL_H) lal_lj96.h lal_lj96_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_lj96_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_expand.ptx: lal_lj_expand.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_lj_expand.cu
$(OBJ_DIR)/lj_expand.cubin: lal_lj_expand.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_expand.cu
$(OBJ_DIR)/lj_expand_ptx.h: $(OBJ_DIR)/lj_expand.ptx $(OBJ_DIR)/lj_expand.ptx
$(BSH) ./geryon/file_to_cstr.sh lj_expand $(OBJ_DIR)/lj_expand.ptx $(OBJ_DIR)/lj_expand_ptx.h
$(OBJ_DIR)/lj_expand_cubin.h: $(OBJ_DIR)/lj_expand.cubin $(OBJ_DIR)/lj_expand.cubin
$(BIN2C) -c -n lj_expand $(OBJ_DIR)/lj_expand.cubin > $(OBJ_DIR)/lj_expand_cubin.h
$(OBJ_DIR)/lal_lj_expand.o: $(ALL_H) lal_lj_expand.h lal_lj_expand.cpp $(OBJ_DIR)/lj_expand_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_lj_expand.o: $(ALL_H) lal_lj_expand.h lal_lj_expand.cpp $(OBJ_DIR)/lj_expand_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_lj_expand.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_expand_ext.o: $(ALL_H) lal_lj_expand.h lal_lj_expand_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_lj_expand_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cg_cmm.ptx: lal_cg_cmm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_cg_cmm.cu
$(OBJ_DIR)/cg_cmm.cubin: lal_cg_cmm.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_cg_cmm.cu
$(OBJ_DIR)/cg_cmm_ptx.h: $(OBJ_DIR)/cg_cmm.ptx $(OBJ_DIR)/cg_cmm.ptx
$(BSH) ./geryon/file_to_cstr.sh cg_cmm $(OBJ_DIR)/cg_cmm.ptx $(OBJ_DIR)/cg_cmm_ptx.h
$(OBJ_DIR)/cg_cmm_cubin.h: $(OBJ_DIR)/cg_cmm.cubin $(OBJ_DIR)/cg_cmm.cubin
$(BIN2C) -c -n cg_cmm $(OBJ_DIR)/cg_cmm.cubin > $(OBJ_DIR)/cg_cmm_cubin.h
$(OBJ_DIR)/lal_cg_cmm.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm.cpp $(OBJ_DIR)/cg_cmm_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_cg_cmm.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm.cpp $(OBJ_DIR)/cg_cmm_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_cg_cmm.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_cg_cmm_ext.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_cg_cmm_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cg_cmm_long.ptx: lal_cg_cmm_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_cg_cmm_long.cu
$(OBJ_DIR)/cg_cmm_long.cubin: lal_cg_cmm_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_cg_cmm_long.cu
$(OBJ_DIR)/cg_cmm_long_ptx.h: $(OBJ_DIR)/cg_cmm_long.ptx $(OBJ_DIR)/cg_cmm_long.ptx
$(BSH) ./geryon/file_to_cstr.sh cg_cmm_long $(OBJ_DIR)/cg_cmm_long.ptx $(OBJ_DIR)/cg_cmm_long_ptx.h
$(OBJ_DIR)/cg_cmm_long_cubin.h: $(OBJ_DIR)/cg_cmm_long.cubin $(OBJ_DIR)/cg_cmm_long.cubin
$(BIN2C) -c -n cg_cmm_long $(OBJ_DIR)/cg_cmm_long.cubin > $(OBJ_DIR)/cg_cmm_long_cubin.h
$(OBJ_DIR)/lal_cg_cmm_long.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long.cpp $(OBJ_DIR)/cg_cmm_long_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_cg_cmm_long.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long.cpp $(OBJ_DIR)/cg_cmm_long_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_cg_cmm_long.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_cg_cmm_long_ext.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_cg_cmm_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/eam.ptx: lal_eam.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_eam.cu
$(OBJ_DIR)/eam.cubin: lal_eam.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_eam.cu
$(OBJ_DIR)/eam_ptx.h: $(OBJ_DIR)/eam.ptx $(OBJ_DIR)/eam.ptx
$(BSH) ./geryon/file_to_cstr.sh eam $(OBJ_DIR)/eam.ptx $(OBJ_DIR)/eam_ptx.h
$(OBJ_DIR)/eam_cubin.h: $(OBJ_DIR)/eam.cubin $(OBJ_DIR)/eam.cubin
$(BIN2C) -c -n eam $(OBJ_DIR)/eam.cubin > $(OBJ_DIR)/eam_cubin.h
$(OBJ_DIR)/lal_eam.o: $(ALL_H) lal_eam.h lal_eam.cpp $(OBJ_DIR)/eam_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_eam.o: $(ALL_H) lal_eam.h lal_eam.cpp $(OBJ_DIR)/eam_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_eam.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_ext.o: $(ALL_H) lal_eam.h lal_eam_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_eam_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/buck.ptx: lal_buck.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_buck.cu
$(OBJ_DIR)/buck.cubin: lal_buck.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_buck.cu
$(OBJ_DIR)/buck_ptx.h: $(OBJ_DIR)/buck.ptx $(OBJ_DIR)/buck.ptx
$(BSH) ./geryon/file_to_cstr.sh buck $(OBJ_DIR)/buck.ptx $(OBJ_DIR)/buck_ptx.h
$(OBJ_DIR)/buck_cubin.h: $(OBJ_DIR)/buck.cubin $(OBJ_DIR)/buck.cubin
$(BIN2C) -c -n buck $(OBJ_DIR)/buck.cubin > $(OBJ_DIR)/buck_cubin.h
$(OBJ_DIR)/lal_buck.o: $(ALL_H) lal_buck.h lal_buck.cpp $(OBJ_DIR)/buck_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_buck.o: $(ALL_H) lal_buck.h lal_buck.cpp $(OBJ_DIR)/buck_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_buck.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_buck_ext.o: $(ALL_H) lal_buck.h lal_buck_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_buck_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/buck_coul.ptx: lal_buck_coul.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_buck_coul.cu
$(OBJ_DIR)/buck_coul.cubin: lal_buck_coul.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_buck_coul.cu
$(OBJ_DIR)/buck_coul_ptx.h: $(OBJ_DIR)/buck_coul.ptx $(OBJ_DIR)/buck_coul.ptx
$(BSH) ./geryon/file_to_cstr.sh buck_coul $(OBJ_DIR)/buck_coul.ptx $(OBJ_DIR)/buck_coul_ptx.h
$(OBJ_DIR)/buck_coul_cubin.h: $(OBJ_DIR)/buck_coul.cubin $(OBJ_DIR)/buck_coul.cubin
$(BIN2C) -c -n buck_coul $(OBJ_DIR)/buck_coul.cubin > $(OBJ_DIR)/buck_coul_cubin.h
$(OBJ_DIR)/lal_buck_coul.o: $(ALL_H) lal_buck_coul.h lal_buck_coul.cpp $(OBJ_DIR)/buck_coul_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_buck_coul.o: $(ALL_H) lal_buck_coul.h lal_buck_coul.cpp $(OBJ_DIR)/buck_coul_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_buck_coul.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_buck_coul_ext.o: $(ALL_H) lal_buck_coul.h lal_buck_coul_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_buck_coul_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/buck_coul_long.ptx: lal_buck_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_buck_coul_long.cu
$(OBJ_DIR)/buck_coul_long.cubin: lal_buck_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_buck_coul_long.cu
$(OBJ_DIR)/buck_coul_long_ptx.h: $(OBJ_DIR)/buck_coul_long.ptx $(OBJ_DIR)/buck_coul_long.ptx
$(BSH) ./geryon/file_to_cstr.sh buck_coul_long $(OBJ_DIR)/buck_coul_long.ptx $(OBJ_DIR)/buck_coul_long_ptx.h
$(OBJ_DIR)/buck_coul_long_cubin.h: $(OBJ_DIR)/buck_coul_long.cubin $(OBJ_DIR)/buck_coul_long.cubin
$(BIN2C) -c -n buck_coul_long $(OBJ_DIR)/buck_coul_long.cubin > $(OBJ_DIR)/buck_coul_long_cubin.h
$(OBJ_DIR)/lal_buck_coul_long.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long.cpp $(OBJ_DIR)/buck_coul_long_ptx.h $(OBJ_DIR)/lal_base_charge.o
$(OBJ_DIR)/lal_buck_coul_long.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long.cpp $(OBJ_DIR)/buck_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_buck_coul_long.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_buck_coul_long_ext.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_buck_coul_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/table.ptx: lal_table.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_table.cu
$(OBJ_DIR)/table.cubin: lal_table.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_table.cu
$(OBJ_DIR)/table_ptx.h: $(OBJ_DIR)/table.ptx $(OBJ_DIR)/table.ptx
$(BSH) ./geryon/file_to_cstr.sh table $(OBJ_DIR)/table.ptx $(OBJ_DIR)/table_ptx.h
$(OBJ_DIR)/table_cubin.h: $(OBJ_DIR)/table.cubin $(OBJ_DIR)/table.cubin
$(BIN2C) -c -n table $(OBJ_DIR)/table.cubin > $(OBJ_DIR)/table_cubin.h
$(OBJ_DIR)/lal_table.o: $(ALL_H) lal_table.h lal_table.cpp $(OBJ_DIR)/table_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_table.o: $(ALL_H) lal_table.h lal_table.cpp $(OBJ_DIR)/table_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_table.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_table_ext.o: $(ALL_H) lal_table.h lal_table_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_table_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/yukawa.ptx: lal_yukawa.cu lal_precision.h lal_preprocessor.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lal_yukawa.cu
$(OBJ_DIR)/yukawa.cubin: lal_yukawa.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_yukawa.cu
$(OBJ_DIR)/yukawa_ptx.h: $(OBJ_DIR)/yukawa.ptx $(OBJ_DIR)/yukawa.ptx
$(BSH) ./geryon/file_to_cstr.sh yukawa $(OBJ_DIR)/yukawa.ptx $(OBJ_DIR)/yukawa_ptx.h
$(OBJ_DIR)/yukawa_cubin.h: $(OBJ_DIR)/yukawa.cubin $(OBJ_DIR)/yukawa.cubin
$(BIN2C) -c -n yukawa $(OBJ_DIR)/yukawa.cubin > $(OBJ_DIR)/yukawa_cubin.h
$(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp $(OBJ_DIR)/yukawa_ptx.h $(OBJ_DIR)/lal_base_atomic.o
$(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp $(OBJ_DIR)/yukawa_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_yukawa.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_yukawa_ext.o: $(ALL_H) lal_yukawa.h lal_yukawa_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_yukawa_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/born.cubin: lal_born.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_born.cu
$(OBJ_DIR)/born_cubin.h: $(OBJ_DIR)/born.cubin $(OBJ_DIR)/born.cubin
$(BIN2C) -c -n born $(OBJ_DIR)/born.cubin > $(OBJ_DIR)/born_cubin.h
$(OBJ_DIR)/lal_born.o: $(ALL_H) lal_born.h lal_born.cpp $(OBJ_DIR)/born_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_born.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_born_ext.o: $(ALL_H) lal_born.h lal_born_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_born_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/born_coul_wolf.cubin: lal_born_coul_wolf.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_born_coul_wolf.cu
$(OBJ_DIR)/born_coul_wolf_cubin.h: $(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf.cubin
$(BIN2C) -c -n born_coul_wolf $(OBJ_DIR)/born_coul_wolf.cubin > $(OBJ_DIR)/born_coul_wolf_cubin.h
$(OBJ_DIR)/lal_born_coul_wolf.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf.cpp $(OBJ_DIR)/born_coul_wolf_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_born_coul_wolf.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_born_coul_wolf_ext.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_born_coul_wolf_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/born_coul_long.cubin: lal_born_coul_long.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_born_coul_long.cu
$(OBJ_DIR)/born_coul_long_cubin.h: $(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long.cubin
$(BIN2C) -c -n born_coul_long $(OBJ_DIR)/born_coul_long.cubin > $(OBJ_DIR)/born_coul_long_cubin.h
$(OBJ_DIR)/lal_born_coul_long.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long.cpp $(OBJ_DIR)/born_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_born_coul_long.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_born_coul_long_ext.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_born_coul_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/dipole_lj.cubin: lal_dipole_lj.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_dipole_lj.cu
$(OBJ_DIR)/dipole_lj_cubin.h: $(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj.cubin
$(BIN2C) -c -n dipole_lj $(OBJ_DIR)/dipole_lj.cubin > $(OBJ_DIR)/dipole_lj_cubin.h
$(OBJ_DIR)/lal_dipole_lj.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj.cpp $(OBJ_DIR)/dipole_lj_cubin.h $(OBJ_DIR)/lal_base_dipole.o
$(CUDR) -o $@ -c lal_dipole_lj.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dipole_lj_ext.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj_ext.cpp lal_base_dipole.h
$(CUDR) -o $@ -c lal_dipole_lj_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/dipole_lj_sf.cubin: lal_dipole_lj_sf.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_dipole_lj_sf.cu
$(OBJ_DIR)/dipole_lj_sf_cubin.h: $(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf.cubin
$(BIN2C) -c -n dipole_lj_sf $(OBJ_DIR)/dipole_lj_sf.cubin > $(OBJ_DIR)/dipole_lj_sf_cubin.h
$(OBJ_DIR)/lal_dipole_lj_sf.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf.cpp $(OBJ_DIR)/dipole_lj_sf_cubin.h $(OBJ_DIR)/lal_base_dipole.o
$(CUDR) -o $@ -c lal_dipole_lj_sf.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dipole_lj_sf_ext.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf_ext.cpp lal_base_dipole.h
$(CUDR) -o $@ -c lal_dipole_lj_sf_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/colloid.cubin: lal_colloid.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_colloid.cu
$(OBJ_DIR)/colloid_cubin.h: $(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid.cubin
$(BIN2C) -c -n colloid $(OBJ_DIR)/colloid.cubin > $(OBJ_DIR)/colloid_cubin.h
$(OBJ_DIR)/lal_colloid.o: $(ALL_H) lal_colloid.h lal_colloid.cpp $(OBJ_DIR)/colloid_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_colloid.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_colloid_ext.o: $(ALL_H) lal_colloid.h lal_colloid_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_colloid_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/gauss.cubin: lal_gauss.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_gauss.cu
$(OBJ_DIR)/gauss_cubin.h: $(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss.cubin
$(BIN2C) -c -n gauss $(OBJ_DIR)/gauss.cubin > $(OBJ_DIR)/gauss_cubin.h
$(OBJ_DIR)/lal_gauss.o: $(ALL_H) lal_gauss.h lal_gauss.cpp $(OBJ_DIR)/gauss_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_gauss.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_gauss_ext.o: $(ALL_H) lal_gauss.h lal_gauss_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_gauss_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/yukawa_colloid.cubin: lal_yukawa_colloid.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_yukawa_colloid.cu
$(OBJ_DIR)/yukawa_colloid_cubin.h: $(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid.cubin
$(BIN2C) -c -n yukawa_colloid $(OBJ_DIR)/yukawa_colloid.cubin > $(OBJ_DIR)/yukawa_colloid_cubin.h
$(OBJ_DIR)/lal_yukawa_colloid.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid.cpp $(OBJ_DIR)/yukawa_colloid_cubin.h $(OBJ_DIR)/lal_base_atomic.o
$(CUDR) -o $@ -c lal_yukawa_colloid.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_yukawa_colloid_ext.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid_ext.cpp lal_base_atomic.h
$(CUDR) -o $@ -c lal_yukawa_colloid_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_coul_debye.cubin: lal_lj_coul_debye.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_coul_debye.cu
$(OBJ_DIR)/lj_coul_debye_cubin.h: $(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye.cubin
$(BIN2C) -c -n lj_coul_debye $(OBJ_DIR)/lj_coul_debye.cubin > $(OBJ_DIR)/lj_coul_debye_cubin.h
$(OBJ_DIR)/lal_lj_coul_debye.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye.cpp $(OBJ_DIR)/lj_coul_debye_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_lj_coul_debye.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_coul_debye_ext.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_lj_coul_debye_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/coul_dsf.cubin: lal_coul_dsf.cu lal_precision.h lal_preprocessor.h
$(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul_dsf.cu
$(OBJ_DIR)/coul_dsf_cubin.h: $(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf.cubin
$(BIN2C) -c -n coul_dsf $(OBJ_DIR)/coul_dsf.cubin > $(OBJ_DIR)/coul_dsf_cubin.h
$(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/coul_dsf_cubin.h $(OBJ_DIR)/lal_base_charge.o
$(CUDR) -o $@ -c lal_coul_dsf.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
$(CUDR) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda
@@ -415,10 +575,10 @@ $(GPU_LIB): $(OBJS) $(CUDPP)
$(AR) -crusv $(GPU_LIB) $(OBJS) $(CUDPP)
clean:
rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(PTXS) *.linkinfo
rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CBNS) *.linkinfo
veryclean: clean
rm -rf *~ *.linkinfo
cleanlib:
rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(PTXS) *.linkinfo
rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CBNS) *.linkinfo
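
Each pair style added above follows the same three-rule pattern in this Makefile: nvcc compiles lal_<style>.cu to a device binary with --cubin -DNV_KERNEL, bin2c -c -n <style> wraps that binary in a C header (<style>_cubin.h), and the matching lal_<style>.cpp host object is built with -I$(OBJ_DIR) so it can include the generated header. A minimal sketch of the idea, with illustrative names and bytes (the exact array layout emitted by bin2c may differ):

/* Sketch of a generated obj/coul_dsf_cubin.h; the array name and contents
   shown here are illustrative, not the actual bin2c output. */
static const unsigned char coul_dsf[] = {
  0x7f, 0x45, 0x4c, 0x46   /* ... remaining bytes of the compiled cubin image ... */
};
/* lal_coul_dsf.cpp, compiled with -I$(OBJ_DIR), includes this header and hands
   the byte array to the Geryon program loader in place of PTX source text. */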

View File

@@ -17,6 +17,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_neighbor_shared.o $(OBJ_DIR)/lal_neighbor.o \
$(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
$(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
$(OBJ_DIR)/lal_base_dipole.o \
$(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
$(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
$(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
@@ -25,6 +26,7 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_lj_expand.o $(OBJ_DIR)/lal_lj_expand_ext.o \
$(OBJ_DIR)/lal_lj_coul.o $(OBJ_DIR)/lal_lj_coul_ext.o \
$(OBJ_DIR)/lal_lj_coul_long.o $(OBJ_DIR)/lal_lj_coul_long_ext.o \
$(OBJ_DIR)/lal_lj_dsf.o $(OBJ_DIR)/lal_lj_dsf_ext.o \
$(OBJ_DIR)/lal_lj_class2_long.o $(OBJ_DIR)/lal_lj_class2_long_ext.o \
$(OBJ_DIR)/lal_coul_long.o $(OBJ_DIR)/lal_coul_long_ext.o \
$(OBJ_DIR)/lal_morse.o $(OBJ_DIR)/lal_morse_ext.o \
@@ -36,20 +38,43 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
$(OBJ_DIR)/lal_buck_coul.o $(OBJ_DIR)/lal_buck_coul_ext.o \
$(OBJ_DIR)/lal_buck_coul_long.o $(OBJ_DIR)/lal_buck_coul_long_ext.o \
$(OBJ_DIR)/lal_table.o $(OBJ_DIR)/lal_table_ext.o \
$(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o
$(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o \
$(OBJ_DIR)/lal_born.o $(OBJ_DIR)/lal_born_ext.o \
$(OBJ_DIR)/lal_born_coul_wolf.o $(OBJ_DIR)/lal_born_coul_wolf_ext.o \
$(OBJ_DIR)/lal_born_coul_long.o $(OBJ_DIR)/lal_born_coul_long_ext.o \
$(OBJ_DIR)/lal_dipole_lj.o $(OBJ_DIR)/lal_dipole_lj_ext.o \
$(OBJ_DIR)/lal_dipole_lj_sf.o $(OBJ_DIR)/lal_dipole_lj_sf_ext.o \
$(OBJ_DIR)/lal_colloid.o $(OBJ_DIR)/lal_colloid_ext.o \
$(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
$(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
$(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
$(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o
KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
$(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
$(OBJ_DIR)/ellipsoid_nbor_cl.h $(OBJ_DIR)/gayberne_cl.h \
$(OBJ_DIR)/gayberne_lj_cl.h $(OBJ_DIR)/re_squared_cl.h \
$(OBJ_DIR)/re_squared_lj_cl.h $(OBJ_DIR)/lj_cl.h $(OBJ_DIR)/lj96_cl.h \
$(OBJ_DIR)/lj_expand_cl.h $(OBJ_DIR)/lj_coul_cl.h \
$(OBJ_DIR)/lj_coul_long_cl.h $(OBJ_DIR)/lj_class2_long_cl.h \
$(OBJ_DIR)/lj_coul_long_cl.h $(OBJ_DIR)/lj_dsf_cl.h \
$(OBJ_DIR)/lj_class2_long_cl.h \
$(OBJ_DIR)/coul_long_cl.h $(OBJ_DIR)/morse_cl.h \
$(OBJ_DIR)/charmm_long_cl.h $(OBJ_DIR)/cg_cmm_cl.h \
$(OBJ_DIR)/cg_cmm_long_cl.h $(OBJ_DIR)/neighbor_gpu_cl.h \
$(OBJ_DIR)/eam_cl.h $(OBJ_DIR)/buck_cl.h \
$(OBJ_DIR)/buck_coul_cl.h $(OBJ_DIR)/buck_coul_long_cl.h \
$(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h
$(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h \
$(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \
$(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf_cubin.h \
$(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long_cubin.h \
$(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj_cubin.h \
$(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf_cubin.h \
$(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid_cubin.h \
$(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \
$(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \
$(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \
$(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h
OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@@ -91,6 +116,9 @@ $(OBJ_DIR)/lal_base_charge.o: $(OCL_H) lal_base_charge.h lal_base_charge.cpp
$(OBJ_DIR)/lal_base_ellipsoid.o: $(OCL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_cl.h
$(OCL) -o $@ -c lal_base_ellipsoid.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_base_dipole.o: $(OCL_H) lal_base_dipole.h lal_base_dipole.cpp
$(OCL) -o $@ -c lal_base_dipole.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h
$(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h;
@@ -154,6 +182,15 @@ $(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp
$(OBJ_DIR)/lal_lj_coul_long_ext.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_lj_coul_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_dsf_cl.h: lal_lj_dsf.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh lj_dsf $(PRE1_H) lal_lj_dsf.cu $(OBJ_DIR)/lj_dsf_cl.h;
$(OBJ_DIR)/lal_lj_dsf.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf.cpp $(OBJ_DIR)/lj_dsf_cl.h $(OBJ_DIR)/lj_dsf_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_lj_dsf.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_dsf_ext.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_lj_dsf_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_class2_long_cl.h: lal_lj_class2_long.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh lj_class2_long $(PRE1_H) lal_lj_class2_long.cu $(OBJ_DIR)/lj_class2_long_cl.h;
@@ -280,6 +317,96 @@ $(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp $(OBJ_DIR)/yukawa
$(OBJ_DIR)/lal_yukawa_ext.o: $(ALL_H) lal_yukawa.h lal_yukawa_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_yukawa_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/born_cl.h: lal_born.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh born $(PRE1_H) lal_born.cu $(OBJ_DIR)/born_cl.h;
$(OBJ_DIR)/lal_born.o: $(ALL_H) lal_born.h lal_born.cpp $(OBJ_DIR)/born_cl.h $(OBJ_DIR)/born_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_born.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_born_ext.o: $(ALL_H) lal_born.h lal_born_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_born_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/born_coul_wolf_cl.h: lal_born_coul_wolf.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh born_coul_wolf $(PRE1_H) lal_born_coul_wolf.cu $(OBJ_DIR)/born_coul_wolf_cl.h;
$(OBJ_DIR)/lal_born_coul_wolf.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf.cpp $(OBJ_DIR)/born_coul_wolf_cl.h $(OBJ_DIR)/born_coul_wolf_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_born_coul_wolf.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_born_coul_wolf_ext.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_born_coul_wolf_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/born_coul_long_cl.h: lal_born_coul_long.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh born_coul_long $(PRE1_H) lal_born_coul_long.cu $(OBJ_DIR)/born_coul_long_cl.h;
$(OBJ_DIR)/lal_born_coul_long.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long.cpp $(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_born_coul_long.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_born_coul_long_ext.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_born_coul_long_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/dipole_lj_cl.h: lal_dipole_lj.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh dipole_lj $(PRE1_H) lal_dipole_lj.cu $(OBJ_DIR)/dipole_lj_cl.h;
$(OBJ_DIR)/lal_dipole_lj.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj.cpp $(OBJ_DIR)/dipole_lj_cl.h $(OBJ_DIR)/dipole_lj_cl.h $(OBJ_DIR)/lal_base_dipole.o
$(OCL) -o $@ -c lal_dipole_lj.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dipole_lj_ext.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj_ext.cpp lal_base_dipole.h
$(OCL) -o $@ -c lal_dipole_lj_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/dipole_lj_sf_cl.h: lal_dipole_lj_sf.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh dipole_lj_sf $(PRE1_H) lal_dipole_lj_sf.cu $(OBJ_DIR)/dipole_lj_sf_cl.h;
$(OBJ_DIR)/lal_dipole_lj_sf.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf.cpp $(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/lal_base_dipole.o
$(OCL) -o $@ -c lal_dipole_lj_sf.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dipole_lj_sf_ext.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf_ext.cpp lal_base_dipole.h
$(OCL) -o $@ -c lal_dipole_lj_sf_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/colloid_cl.h: lal_colloid.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh colloid $(PRE1_H) lal_colloid.cu $(OBJ_DIR)/colloid_cl.h;
$(OBJ_DIR)/lal_colloid.o: $(ALL_H) lal_colloid.h lal_colloid.cpp $(OBJ_DIR)/colloid_cl.h $(OBJ_DIR)/colloid_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_colloid.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_colloid_ext.o: $(ALL_H) lal_colloid.h lal_colloid_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_colloid_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/gauss_cl.h: lal_gauss.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh gauss $(PRE1_H) lal_gauss.cu $(OBJ_DIR)/gauss_cl.h;
$(OBJ_DIR)/lal_gauss.o: $(ALL_H) lal_gauss.h lal_gauss.cpp $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_gauss.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_gauss_ext.o: $(ALL_H) lal_gauss.h lal_gauss_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_gauss_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/yukawa_colloid_cl.h: lal_yukawa_colloid.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh yukawa_colloid $(PRE1_H) lal_yukawa_colloid.cu $(OBJ_DIR)/yukawa_colloid_cl.h;
$(OBJ_DIR)/lal_yukawa_colloid.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid.cpp $(OBJ_DIR)/yukawa_colloid_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h $(OBJ_DIR)/lal_base_atomic.o
$(OCL) -o $@ -c lal_yukawa_colloid.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_yukawa_colloid_ext.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid_ext.cpp lal_base_atomic.h
$(OCL) -o $@ -c lal_yukawa_colloid_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_coul_debye_cl.h: lal_lj_coul_debye.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh lj_coul_debye $(PRE1_H) lal_lj_coul_debye.cu $(OBJ_DIR)/lj_coul_debye_cl.h;
$(OBJ_DIR)/lal_lj_coul_debye.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye.cpp $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_lj_coul_debye.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_lj_coul_debye_ext.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_lj_coul_debye_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/coul_dsf_cl.h: lal_coul_dsf.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh coul_dsf $(PRE1_H) lal_coul_dsf.cu $(OBJ_DIR)/coul_dsf_cl.h;
$(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/coul_dsf_cl.h $(OBJ_DIR)/coul_dsf_cl.h $(OBJ_DIR)/lal_base_charge.o
$(OCL) -o $@ -c lal_coul_dsf.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
$(OCL) -o $@ -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR)
$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)
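
By contrast, the OpenCL rules above keep the kernels as source: file_to_cstr.sh concatenates lal_preprocessor.h and lal_<style>.cu into a <style>_cl.h header that stores the text as a C string, and the OpenCL runtime compiles it when the program is loaded. A rough sketch of such a generated header, with an assumed variable name and placeholder contents:

/* Sketch of a generated obj/coul_dsf_cl.h; the string name and formatting are
   assumptions, only the idea of embedding the kernel source as text is fixed. */
const char * coul_dsf =
  "/* text of lal_preprocessor.h */ \n"
  "/* text of lal_coul_dsf.cu */    \n";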

View File

@@ -1,3 +1,7 @@
NOTE: This Geryon distribution has been modified to remove files not
necessary for the LAMMPS implementation. The full distribution
is available at http://users.nccs.gov/~wb8/geryon/index.htm
Geryon
Copyright (2010) Sandia Corporation. Under the terms of Contract

View File

@@ -1 +1 @@
Geryon Version 12.034
Geryon Version 12.033

View File

@@ -141,6 +141,11 @@ class UCL_Device {
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i) { return UCL_GPU; }
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
/// Returns true if double precision is supported for the current device
bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device

View File

@@ -30,11 +30,23 @@
namespace ucl_cudadr {
class UCL_Texture;
template <class numtyp> class UCL_D_Vec;
template <class numtyp> class UCL_D_Mat;
template <class hosttype, class devtype> class UCL_Vector;
template <class hosttype, class devtype> class UCL_Matrix;
#define UCL_MAX_KERNEL_ARGS 256
/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
public:
inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }
inline UCL_Program(UCL_Device &device, const void *program,
const char *flags="", std::string *log=NULL) {
_cq=device.cq();
init(device);
load_string(program,flags,log);
}
inline ~UCL_Program() {}
/// Initialize the program with a device
@@ -64,10 +76,10 @@ class UCL_Program {
}
/// Load a program from a string and compile with flags
inline int load_string(const char *program, const char *flags="",
inline int load_string(const void *program, const char *flags="",
std::string *log=NULL) {
if (std::string(flags)=="BINARY")
return load_binary(program);
return load_binary((const char *)program);
const unsigned int num_opts=2;
CUjit_option options[num_opts];
void *values[num_opts];
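
With load_string() now taking const void* and treating the flag string "BINARY" as a request to call load_binary(), plus the new convenience constructor above, the same call site can hand a UCL_Program either PTX text or an embedded cubin image. A minimal sketch under those assumptions (the include path and symbol names are illustrative; the generated headers from the Makefiles above would supply lj_ptx and lj_cubin):

#include <string>
#include "geryon/nvd_kernel.h"   /* assumed include path */
using namespace ucl_cudadr;

/* lj_ptx: PTX text from a *_ptx.h header; lj_cubin: byte array from a
   bin2c-generated *_cubin.h header. Error handling omitted for brevity. */
void load_pair_program(UCL_Device &dev, const char *lj_ptx,
                       const unsigned char *lj_cubin, bool have_cubin) {
  std::string log;
  /* New convenience constructor: initializes against the device and loads in
     one step. Passing "BINARY" routes the pointer to load_binary(); any other
     flag string treats it as PTX text to be JIT-compiled. */
  UCL_Program program(dev,
                      have_cubin ? (const void *)lj_cubin : (const void *)lj_ptx,
                      have_cubin ? "BINARY" : "", &log);
  /* UCL_Kernel objects would be built from `program` here; see the sketch
     after the UCL_Kernel changes below. */
}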
@@ -134,15 +146,25 @@ class UCL_Program {
friend class UCL_Texture;
};
/// Class for dealing with OpenCL kernels
/// Class for dealing with CUDA Driver kernels
class UCL_Kernel {
public:
UCL_Kernel() : _dimensions(1), _num_args(0), _param_size(0)
{ _num_blocks[0]=0; }
UCL_Kernel() : _dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000
_param_size=0;
#endif
_num_blocks[0]=0;
}
UCL_Kernel(UCL_Program &program, const char *function) :
_dimensions(1), _num_args(0), _param_size(0)
{ _num_blocks[0]=0; set_function(program,function); _cq=program._cq; }
_dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000
_param_size=0;
#endif
_num_blocks[0]=0;
set_function(program,function);
_cq=program._cq;
}
~UCL_Kernel() {}
@@ -170,78 +192,190 @@ class UCL_Kernel {
* changes
* \note To set kernel parameter i (i>0), parameter i-1 must be set **/
template <class dtype>
inline void set_arg(const unsigned index, dtype *arg) {
inline void set_arg(const unsigned index, const dtype * const arg) {
if (index==_num_args)
add_arg(arg);
else if (index<_num_args)
#if CUDA_VERSION >= 4000
_kernel_args[index]=arg;
#else
CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
#endif
else
assert(0==1); // Must add kernel parameters in sequential order
}
/// Set a geryon container as a kernel argument.
template <class numtyp>
inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template <class numtyp>
inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); }
/// Set a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); }
/// Add a kernel argument.
inline void add_arg(const CUdeviceptr* const arg) {
#if CUDA_VERSION >= 4000
_kernel_args[_num_args]=(void *)arg;
#else
void* ptr = (void*)(size_t)(*arg);
_param_size = (_param_size + __alignof(ptr) - 1) & ~(__alignof(ptr) - 1);
CU_SAFE_CALL(cuParamSetv(_kernel, _param_size, &ptr, sizeof(ptr)));
_offsets.push_back(_param_size);
_param_size+=sizeof(ptr);
#endif
_num_args++;
if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
}
/// Add a kernel argument.
template <class dtype>
inline void add_arg(const dtype* const arg) {
#if CUDA_VERSION >= 4000
_kernel_args[_num_args]=const_cast<dtype * const>(arg);
#else
_param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));
_offsets.push_back(_param_size);
_param_size+=sizeof(dtype);
#endif
_num_args++;
if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
}
/// Add a geryon container as a kernel argument.
template <class numtyp>
inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template <class numtyp>
inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); }
/// Add a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); }
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called after all arguments have been added **/
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks, const size_t block_size) {
_dimensions=1;
_num_blocks[0]=num_blocks;
_num_blocks[1]=1;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size;
_block_size[1]=1;
_block_size[2]=1;
#else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
#endif
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks, const size_t block_size,
command_queue &cq)
{ _cq=cq; set_size(num_blocks,block_size); }
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y) {
_dimensions=2;
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size_x;
_block_size[1]=block_size_y;
_block_size[2]=1;
#else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
#endif
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y,
command_queue &cq)
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x,
const size_t block_size_y, const size_t block_size_z) {
_dimensions=2;
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size_x;
_block_size[1]=block_size_y;
_block_size[2]=block_size_z;
#else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
block_size_z));
#endif
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y,
const size_t block_size_z, command_queue &cq) {
_cq=cq;
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
block_size_z);
}
/// Run the kernel in the default command queue
inline void run() {
#if CUDA_VERSION >= 4000
CU_SAFE_CALL(cuLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
_num_blocks[2],_block_size[0],_block_size[1],
_block_size[2],0,_cq,_kernel_args,NULL));
#else
CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
}
/// Run the kernel in the specified command queue
inline void run(command_queue &cq) {
CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],cq));
#endif
}
/// Clear any arguments associated with the kernel
inline void clear_args() { _num_args=0; _offsets.clear(); _param_size=0; }
inline void clear_args() {
_num_args=0;
#if CUDA_VERSION < 4000
_offsets.clear();
_param_size=0;
#endif
}
#include "ucl_arg_kludge.h"
@ -249,11 +383,17 @@ class UCL_Kernel {
CUfunction _kernel;
CUstream _cq;
unsigned _dimensions;
unsigned _num_blocks[2];
unsigned _num_blocks[3];
unsigned _num_args;
friend class UCL_Texture;
#if CUDA_VERSION >= 4000
unsigned _block_size[3];
void * _kernel_args[UCL_MAX_KERNEL_ARGS];
#else
std::vector<unsigned> _offsets;
unsigned _param_size;
friend class UCL_Texture;
#endif
};
} // namespace
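Taken together, the changes above (sizes set before arguments, container overloads for set_arg/add_arg, and cuLaunchKernel on CUDA 4.0+) give the CUDA-driver kernel class the usage pattern sketched below. This is a minimal sketch, not code from the commit: the header names, the UCL_Device/UCL_Program setup calls, the kernel name "k_scale", and the PTX string k_ptx are assumptions made for illustration only.

#include "nvd_device.h"
#include "nvd_kernel.h"
#include "nvd_mat.h"

using namespace ucl_cudadr;

// Scale n floats on the device by alpha. Sketch only: header names, "k_scale",
// and k_ptx are illustrative assumptions; error handling is omitted.
void scale_on_device(const char *k_ptx, int n) {
  UCL_Device dev;                     // assumed to set up the driver context
  UCL_Program prog(dev);
  prog.load_string(k_ptx);            // load the compiled module (PTX/cubin image)

  UCL_Kernel k_scale(prog,"k_scale");

  UCL_D_Vec<float> data;
  data.alloc(n,dev);                  // device buffer of n floats

  float alpha=2.0f;
  const int block=128;
  k_scale.set_size((n+block-1)/block,block);  // sizes before any arguments
  k_scale.add_arg(&data);             // container overload added in this patch
  k_scale.add_arg(&alpha);
  k_scale.add_arg(&n);
  k_scale.run();                      // cuLaunchKernel on the default queue (CUDA 4.0+)
}

For CUDA builds older than 4.0, the same calls fall back to the cuParamSetv / cuFuncSetBlockShape / cuLaunchGridAsync path guarded above.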

View File

@ -38,6 +38,9 @@ namespace ucl_cudadr {
#include "ucl_h_mat.h"
#include "ucl_d_vec.h"
#include "ucl_d_mat.h"
#include "ucl_s_obj_help.h"
#include "ucl_vector.h"
#include "ucl_matrix.h"
#undef _UCL_DEVICE_PTR_MAT
#undef _UCL_MAT_ALLOW

View File

@ -85,6 +85,21 @@ inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
free(mat.begin());
}
template <class mat_type>
inline int _host_resize(mat_type &mat, const size_t n) {
_host_free(mat,mat.kind());
CUresult err=CUDA_SUCCESS;
if (mat.kind()==UCL_RW_OPTIMIZED)
err=cuMemAllocHost((void **)mat.host_ptr(),n);
else if (mat.kind()==UCL_WRITE_OPTIMIZED)
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
else
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
// --------------------------------------------------------------------------
// - DEVICE MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
@ -143,6 +158,29 @@ inline void _device_free(mat_type &mat) {
CU_DESTRUCT_CALL(cuMemFree(mat.cbegin()));
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t n) {
_device_free(mat);
CUresult err=cuMemAlloc(&mat.cbegin(),n);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t rows,
const size_t cols, size_t &pitch) {
_device_free(mat);
CUresult err;
CUDA_INT_TYPE upitch;
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows,16);
pitch=static_cast<size_t>(upitch);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
*ptr=in;
}

View File

@ -42,27 +42,56 @@ class UCL_Texture {
{ CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class mat_typ>
inline void bind_float(mat_typ &vec, const unsigned numel) {
#ifdef UCL_DEBUG
assert(numel!=0 && numel<5);
#endif
CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
vec.numel()*vec.element_size()));
CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel));
}
template<class numtyp>
inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel)
{ _bind_float(vec,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp>
inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel)
{ _bind_float(vec,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp, class devtyp>
inline void bind_float(UCL_Vector<numtyp, devtyp> &vec, const unsigned numel)
{ _bind_float(vec.device,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp, class devtyp>
inline void bind_float(UCL_Matrix<numtyp, devtyp> &vec, const unsigned numel)
{ _bind_float(vec.device,numel); }
/// Unbind the texture reference from the memory allocation
inline void unbind() { }
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) {
#if CUDA_VERSION < 4000
CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
#endif
}
private:
CUtexref _tex;
friend class UCL_Kernel;
template<class mat_typ>
inline void _bind_float(mat_typ &vec, const unsigned numel) {
#ifdef UCL_DEBUG
assert(numel!=0 && numel<5);
#endif
CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
vec.numel()*vec.element_size()));
if (vec.element_size()==sizeof(float))
CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel));
else {
if (numel>2)
CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_SIGNED_INT32, numel));
else
CU_SAFE_CALL(cuTexRefSetFormat(_tex,CU_AD_FORMAT_SIGNED_INT32,numel*2));
}
}
};
} // namespace
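The private _bind_float helper above now chooses the texture format from the element size, so the same bind_float call works for single- and double-precision containers. A short sketch of the intended use, assuming a texture reference named "pos_tex" in the loaded program, a packed x,y,z,type layout, and that numtyp stands for the library's precision typedef; these names are illustrative:

// Sketch: bind a per-atom position buffer (4 values per atom) to a texture.
// "pos_tex", prog, dev, and nall are illustrative assumptions.
UCL_Texture pos_tex;
pos_tex.get_texture(prog,"pos_tex");  // look up the texture ref in the module
UCL_D_Vec<numtyp> x;
x.alloc(nall*4,dev);
pos_tex.bind_float(x,4);              // float format for float elements;
                                      // int32 formats are chosen for doubles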

View File

@ -158,6 +158,11 @@ class UCL_Device {
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i);
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
/// Returns true if double precision is supported for the current device
bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device


View File

@ -29,11 +29,25 @@
namespace ucl_opencl {
class UCL_Texture;
template <class numtyp> class UCL_D_Vec;
template <class numtyp> class UCL_D_Mat;
template <class hosttype, class devtype> class UCL_Vector;
template <class hosttype, class devtype> class UCL_Matrix;
#define UCL_MAX_KERNEL_ARGS 256
/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
public:
inline UCL_Program() : _init_done(false) {}
inline UCL_Program(UCL_Device &device) : _init_done(false) { init(device); }
inline UCL_Program(UCL_Device &device, const void *program,
const char *flags="", std::string *log=NULL) :
_init_done(false) {
init(device);
load_string(program,flags,log);
}
inline ~UCL_Program() { clear(); }
/// Initialize the program with a device
@ -78,10 +92,10 @@ class UCL_Program {
}
/// Load a program from a string and compile with flags
inline int load_string(const char *program, const char *flags="",
inline int load_string(const void *program, const char *flags="",
std::string *log=NULL) {
cl_int error_flag;
const char *prog=program;
const char *prog=(const char *)program;
_program=clCreateProgramWithSource(_context,1,&prog,NULL,&error_flag);
CL_CHECK_ERR(error_flag);
error_flag = clBuildProgram(_program,1,&_device,flags,NULL,NULL);
@ -159,19 +173,61 @@ class UCL_Kernel {
/** If not a device pointer, this must be repeated each time the argument
* changes **/
template <class dtype>
inline void set_arg(const cl_uint index, dtype *arg) {
inline void set_arg(const cl_uint index, const dtype * const arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
if (index>_num_args) _num_args=index;
}
/// Set a geryon container as a kernel argument.
template <class numtyp>
inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template <class numtyp>
inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); }
/// Set a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); }
/// Add a kernel argument.
template <class dtype>
inline void add_arg(dtype *arg) {
inline void add_arg(const dtype * const arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
_num_args++;
}
/// Add a geryon container as a kernel argument.
template <class numtyp>
inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template <class numtyp>
inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); }
/// Add a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); }
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks, const size_t block_size) {
_dimensions=1;
_num_blocks[0]=num_blocks*block_size;
@ -179,6 +235,15 @@ class UCL_Kernel {
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks, const size_t block_size,
command_queue &cq)
{ _cq=cq; set_size(num_blocks,block_size); }
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y) {
_dimensions=2;
@ -189,6 +254,16 @@ class UCL_Kernel {
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y,
command_queue &cq)
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x,
const size_t block_size_y, const size_t block_size_z) {
@ -202,14 +277,20 @@ class UCL_Kernel {
_block_size[2]=block_size_z;
}
/// Run the kernel in the default command queue
inline void run() {
run(_cq);
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y,
const size_t block_size_z, command_queue &cq) {
_cq=cq;
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
block_size_z);
}
/// Run the kernel in the specified command queue
inline void run(command_queue &cq) {
CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,_kernel,_dimensions,NULL,
/// Run the kernel in the default command queue
inline void run() {
CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL,
_num_blocks,_block_size,0,NULL,NULL));
}
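On the OpenCL side the same argument and launch interface is mirrored, and the new UCL_Program constructor compiles a source string in one step. A sketch under the assumption of headers ocl_device.h/ocl_kernel.h/ocl_mat.h, a kernel named "k_zero" in the source string src, and a dev.cq() accessor for the default queue; all of these names are illustrative, not taken from the commit:

#include <string>
#include "ocl_device.h"
#include "ocl_kernel.h"
#include "ocl_mat.h"

using namespace ucl_opencl;

// Zero an n-element device buffer. Sketch only; see the assumptions above.
void zero_buffer(UCL_Device &dev, const char *src, int n) {
  std::string log;
  UCL_Program prog(dev,src,"-cl-fast-relaxed-math",&log); // compile in one step

  UCL_Kernel k_zero(prog,"k_zero");
  UCL_D_Vec<float> buf;
  buf.alloc(n,dev);

  const int block=64;
  k_zero.set_size((n+block-1)/block,block,dev.cq()); // sizes + queue, then args
  k_zero.add_arg(&buf);
  k_zero.add_arg(&n);
  k_zero.run();   // clEnqueueNDRangeKernel on the stored queue
}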

View File

@ -39,6 +39,9 @@ namespace ucl_opencl {
#include "ucl_h_mat.h"
#include "ucl_d_vec.h"
#include "ucl_d_mat.h"
#include "ucl_s_obj_help.h"
#include "ucl_vector.h"
#include "ucl_matrix.h"
#undef _UCL_DEVICE_PTR_MAT
#undef _OCL_MAT
#undef _UCL_MAT_ALLOW

View File

@ -132,6 +132,37 @@ inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
}
template <class mat_type>
inline int _host_resize(mat_type &mat, const size_t n) {
cl_int error_flag;
cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL));
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
if (mat.kind()==UCL_WRITE_OPTIMIZED) {
mat.cbegin()=clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
CL_MAP_WRITE,0,n,0,NULL,NULL,NULL);
} else {
mat.cbegin()=clCreateBuffer(context,
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
CL_MAP_READ | CL_MAP_WRITE,
0,n,0,NULL,NULL,NULL);
}
return UCL_SUCCESS;
}
// --------------------------------------------------------------------------
// - DEVICE MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
@ -211,6 +242,61 @@ inline void _device_free(mat_type &mat) {
CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t n) {
cl_int error_flag;
cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL));
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
cl_mem_flags flag;
if (mat.kind()==UCL_READ_WRITE)
flag=CL_MEM_READ_WRITE;
else if (mat.kind()==UCL_READ_ONLY)
flag=CL_MEM_READ_ONLY;
else if (mat.kind()==UCL_WRITE_ONLY)
flag=CL_MEM_WRITE_ONLY;
else
assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t rows,
const size_t cols, size_t &pitch) {
size_t padded_cols=cols;
if (cols%256!=0)
padded_cols+=256-cols%256;
pitch=padded_cols*sizeof(typename mat_type::data_type);
cl_int error_flag;
cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(mat.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL));
CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
cl_mem_flags flag;
if (mat.kind()==UCL_READ_WRITE)
flag=CL_MEM_READ_WRITE;
else if (mat.kind()==UCL_READ_ONLY)
flag=CL_MEM_READ_ONLY;
else if (mat.kind()==UCL_WRITE_ONLY)
flag=CL_MEM_WRITE_ONLY;
else
assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
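As a concrete example of the padding above: resizing a double-precision matrix to 1000 columns pads the row to 1024 columns, so pitch = 1024 * sizeof(double) = 8192 bytes, while a request that is already a multiple of 256 columns keeps its natural pitch.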
// --------------------------------------------------------------------------
// - ZERO ROUTINES
// --------------------------------------------------------------------------

View File

@ -828,441 +828,3 @@
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
run();
}
// ---------------------------------------------------------------------------
template <class t1>
inline void run_cq(command_queue &cq, t1 *a1) {
clear_args();
add_arg(a1);
run(cq);
}
template <class t1, class t2>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2) {
clear_args();
add_arg(a1); add_arg(a2);
run(cq);
}
template <class t1, class t2, class t3>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3);
run(cq);
}
template <class t1, class t2, class t3, class t4>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28, class t29>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28, class t29, class t30>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
run(cq);
}

View File

@ -344,6 +344,39 @@ class UCL_D_Mat : public UCL_BaseMat {
inline void clear()
{ _rows=0; if (_kind!=UCL_VIEW) { _kind=UCL_VIEW; _device_free(*this); } }
/// Resize the allocation to contain rows x cols elements
/** \note Cannot be used on views **/
inline int resize(const int rows, const int cols) {
assert(_kind!=UCL_VIEW);
int err=_device_resize(*this,rows,cols,_pitch);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate "
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
UCL_GERYON_EXIT;
#endif
return err;
}
_rows=rows;
_cols=cols;
_row_size=_pitch/sizeof(numtyp);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_row_size*cols;
#endif
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
/// Resize (only if bigger) the allocation to contain rows x cols elements
/** \note Cannot be used on views **/
inline int resize_ib(const int rows, const int cols)
{ if (cols>_cols || rows>_rows) return resize(rows,cols);
else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { _device_zero(*this,row_bytes()*_rows); }
@ -357,9 +390,9 @@ class UCL_D_Mat : public UCL_BaseMat {
inline const device_ptr & begin() const { return _array; }
#else
/// For CUDA-RT, get device pointer to first element
inline numtyp * begin() { return _array; }
inline numtyp * & begin() { return _array; }
/// For CUDA-RT, get device pointer to first element
inline const numtyp * begin() const { return _array; }
inline numtyp * const & begin() const { return _array; }
/// For CUDA-RT, get device pointer to one past last element
inline numtyp * end() { return _end; }
/// For CUDA-RT, get device pointer to one past last element

View File

@ -340,6 +340,39 @@ class UCL_D_Vec : public UCL_BaseMat {
inline void clear()
{ if (_kind!=UCL_VIEW) { _cols=0; _kind=UCL_VIEW; _device_free(*this); } }
/// Resize the allocation to contain cols elements
/** \note Cannot be used on views **/
inline int resize(const int cols) {
assert(_kind!=UCL_VIEW);
_row_bytes=cols*sizeof(numtyp);
int err=_device_resize(*this,_row_bytes);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on device.\n";
_row_bytes=0;
UCL_GERYON_EXIT;
#endif
_row_bytes=0;
return err;
}
_cols=cols;
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+cols;
#endif
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
/// Resize (only if bigger) the allocation to contain cols elements
/** \note Cannot be used on views **/
inline int resize_ib(const int cols)
{ if (cols>_cols) return resize(cols); else return UCL_SUCCESS; }
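resize and resize_ib give the containers in-place growth without the clear-and-realloc dance used previously; note that the underlying _device_resize frees the old buffer first, so existing contents are not preserved across an actual reallocation. A sketch of the intended per-timestep pattern, with illustrative names (dev, nsteps, and the assumed helper list_size):

// Sketch: grow a device buffer only when the needed size exceeds capacity.
// list_size(step) is an assumed helper returning the current element count.
UCL_D_Vec<int> nbor;
nbor.alloc(1024,dev);                   // initial capacity
for (int step=0; step<nsteps; step++) {
  const int n=list_size(step);
  if (nbor.resize_ib(n)!=UCL_SUCCESS)   // reallocates only when n exceeds
    break;                              //   the current column count
  // ... fill and use the first n elements of nbor ...
}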
/// Set each element to zero
inline void zero() { _device_zero(*this,row_bytes()); }
@ -353,13 +386,13 @@ class UCL_D_Vec : public UCL_BaseMat {
inline const device_ptr & begin() const { return _array; }
#else
/// For CUDA-RT, get device pointer to first element
inline numtyp * begin() { return _array; }
inline numtyp * & begin() { return _array; }
/// For CUDA-RT, get device pointer to first element
inline const numtyp * begin() const { return _array; }
inline numtyp * const & begin() const { return _array; }
/// For CUDA-RT, get device pointer to one past last element
inline numtyp * end() { return _end; }
/// For CUDA-RT, get device pointer to one past last element
inline const numtyp * end() const { return _end; }
inline numtyp * end() const { return _end; }
#endif
#ifdef _UCL_DEVICE_PTR_MAT

View File

@ -318,6 +318,36 @@ class UCL_H_Mat : public UCL_BaseMat {
inline void clear()
{ if (_kind!=UCL_VIEW) {_rows=0; _kind=UCL_VIEW; _host_free(*this,_kind); }}
/// Resize the allocation to rows x cols elements
/** \note Cannot be used on views **/
inline int resize(const int rows, const int cols) {
assert(_kind!=UCL_VIEW);
_row_bytes=cols*sizeof(numtyp);
int err=_host_resize(*this,_row_bytes*rows);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
<< " bytes on host.\n";
_row_bytes=0;
UCL_GERYON_EXIT;
#endif
_row_bytes=0;
return err;
}
_cols=cols;
_rows=rows;
_end=_array+rows*cols;
return err;
}
/// Resize (only if bigger) the allocation to contain rows x cols elements
/** \note Cannot be used on views **/
inline int resize_ib(const int rows, const int cols)
{ if (cols>_cols || rows>_rows) return resize(rows,cols);
else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { _host_zero(_array,_rows*row_bytes()); }
/// Set first n elements to zero

View File

@ -316,6 +316,34 @@ class UCL_H_Vec : public UCL_BaseMat {
inline void clear()
{ if (_kind!=UCL_VIEW) {_kind=UCL_VIEW; _cols=0; _host_free(*this,_kind);}}
/// Resize the allocation to contain cols elements
/** \note Cannot be used on views **/
inline int resize(const int cols) {
assert(_kind!=UCL_VIEW);
_row_bytes=cols*sizeof(numtyp);
int err=_host_resize(*this,_row_bytes);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on host.\n";
_row_bytes=0;
UCL_GERYON_EXIT;
#endif
_row_bytes=0;
return err;
}
_cols=cols;
_end=_array+cols;
return err;
}
/// Resize (only if bigger) the allocation to contain cols elements
/** \note Cannot be used on views **/
inline int resize_ib(const int cols)
{ if (cols>_cols) return resize(cols); else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { _host_zero(_array,row_bytes()); }

View File

@ -270,4 +270,13 @@ template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_D_Mat<numtyp> &mat)
{ ucl_print(mat,out); return out; }
template <class t1, class t2>
inline std::ostream & operator << (std::ostream &out, UCL_Vector<t1,t2> &mat)
{ ucl_print(mat.host,out); return out; }
template <class t1, class t2>
inline std::ostream & operator << (std::ostream &out, UCL_Matrix<t1,t2> &mat)
{ ucl_print(mat.host,out); return out; }
#endif
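The added stream operators print the host half of the paired containers, which is convenient for spot checks after a device-to-host update. A tiny sketch (the alloc/update_host signatures follow their use elsewhere in this patch; q, dev, and nall are illustrative):

UCL_Vector<double,double> q;
q.alloc(nall,dev,UCL_RW_OPTIMIZED,UCL_READ_WRITE);
// ... a kernel fills q.device ...
q.update_host(nall,false);     // blocking copy of nall elements to the host
std::cout << q << "\n";        // streams the host copy through ucl_print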

View File

@ -117,5 +117,61 @@ enum UCL_ERROR_FLAG {
template <class numtyp>
const char * ucl_template_name() { return _UCL_DATA_ID<numtyp>::name(); }
template <class t1, class t2> struct ucl_same_type;
template <> struct ucl_same_type<bool,bool> { enum { ans=1 }; };
template <> struct ucl_same_type<char,char> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned char,unsigned char> { enum { ans=1 }; };
template <> struct ucl_same_type<int,int> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned,unsigned> { enum { ans=1 }; };
template <> struct ucl_same_type<short,short> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned short,unsigned short> { enum { ans=1 }; };
template <> struct ucl_same_type<long,long> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned long,unsigned long> { enum { ans=1 }; };
template <> struct ucl_same_type<float,float> { enum { ans=1 }; };
template <> struct ucl_same_type<double,double> { enum { ans=1 }; };
template <> struct ucl_same_type<long double,long double> { enum { ans=1 }; };
template <> struct ucl_same_type<const bool,bool> { enum { ans=1 }; };
template <> struct ucl_same_type<const char,char> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned char,unsigned char> { enum { ans=1 }; };
template <> struct ucl_same_type<const int,int> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned,unsigned> { enum { ans=1 }; };
template <> struct ucl_same_type<const short,short> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned short,unsigned short> { enum { ans=1 }; };
template <> struct ucl_same_type<const long,long> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned long,unsigned long> { enum { ans=1 }; };
template <> struct ucl_same_type<const float,float> { enum { ans=1 }; };
template <> struct ucl_same_type<const double,double> { enum { ans=1 }; };
template <> struct ucl_same_type<const long double,long double> { enum { ans=1 }; };
template <> struct ucl_same_type<bool,const bool> { enum { ans=1 }; };
template <> struct ucl_same_type<char,const char> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned char,const unsigned char> { enum { ans=1 }; };
template <> struct ucl_same_type<int,const int> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned,const unsigned> { enum { ans=1 }; };
template <> struct ucl_same_type<short,const short> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned short,const unsigned short> { enum { ans=1 }; };
template <> struct ucl_same_type<long,const long> { enum { ans=1 }; };
template <> struct ucl_same_type<unsigned long,const unsigned long> { enum { ans=1 }; };
template <> struct ucl_same_type<float,const float> { enum { ans=1 }; };
template <> struct ucl_same_type<double,const double> { enum { ans=1 }; };
template <> struct ucl_same_type<long double,const long double> { enum { ans=1 }; };
template <> struct ucl_same_type<const bool,const bool> { enum { ans=1 }; };
template <> struct ucl_same_type<const char,const char> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned char,const unsigned char> { enum { ans=1 }; };
template <> struct ucl_same_type<const int,const int> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned,const unsigned> { enum { ans=1 }; };
template <> struct ucl_same_type<const short,const short> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned short,const unsigned short> { enum { ans=1 }; };
template <> struct ucl_same_type<const long,const long> { enum { ans=1 }; };
template <> struct ucl_same_type<const unsigned long,const unsigned long> { enum { ans=1 }; };
template <> struct ucl_same_type<const float,const float> { enum { ans=1 }; };
template <> struct ucl_same_type<const double,const double> { enum { ans=1 }; };
template <> struct ucl_same_type<const long double,const long double> { enum { ans=1 }; };
template <class t1, class t2> struct ucl_same_type { enum { ans=0 }; };
#endif
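ucl_same_type is a compile-time trait: ans is 1 when the two types match (with or without const qualifiers) and 0 otherwise, which lets precision-dependent branches be written inline instead of specializing whole classes. A minimal sketch of the usual pattern:

// Sketch: branch on the element type at compile time.
template <class numtyp>
inline const char * precision_tag() {
  return ucl_same_type<numtyp,double>::ans ? "double" : "single/other";
}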

View File

@ -39,30 +39,16 @@ bool AnswerT::alloc(const int inum) {
bool success=true;
int ans_elements=4;
_ans_fields=4;
if (_rot)
ans_elements+=4;
// Ignore host/device transfers?
bool cpuview=false;
if (dev->device_type()==UCL_CPU)
cpuview=true;
// -------------------------- Host allocations
success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
_ans_fields+=4;
// --------------------------- Device allocations
if (cpuview) {
dev_engv.view(host_engv);
dev_ans.view(host_ans);
} else {
success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_RW_OPTIMIZED,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (dev_ans.alloc(ans_elements*_max_local,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
}
_gpu_bytes=dev_engv.row_bytes()+dev_ans.row_bytes();
success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_RW_OPTIMIZED,
UCL_WRITE_ONLY)==UCL_SUCCESS);
_gpu_bytes=engv.device.row_bytes()+force.device.row_bytes();
_allocated=true;
return success;
@ -114,32 +100,24 @@ bool AnswerT::add_fields(const bool charge, const bool rot) {
if (realloc) {
_other=_charge || _rot;
int inum=_max_local;
clear_resize();
force.clear();
engv.clear();
_allocated=false;
return alloc(inum);
}
return true;
}
template <class numtyp, class acctyp>
void AnswerT::clear_resize() {
if (!_allocated)
return;
_allocated=false;
dev_ans.clear();
dev_engv.clear();
host_ans.clear();
host_engv.clear();
}
template <class numtyp, class acctyp>
void AnswerT::clear() {
_gpu_bytes=0;
if (!_allocated)
return;
_allocated=false;
force.clear();
engv.clear();
time_answer.clear();
clear_resize();
_inum=0;
_ilist=NULL;
_eflag=false;
@ -174,11 +152,11 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
csize-=6;
if (csize>0)
ucl_copy(host_engv,dev_engv,_inum*csize,true);
engv.update_host(_inum*csize,true);
if (_rot)
ucl_copy(host_ans,dev_ans,_inum*4*2,true);
force.update_host(_inum*4*2,true);
else
ucl_copy(host_ans,dev_ans,_inum*4,true);
force.update_host(_inum*4,true);
time_answer.stop();
}
@ -201,28 +179,28 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=0; i<6; i++) virial_acc[i]=0.0;
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int al=i;
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
evdwl+=engv[al];
eatom[i]+=engv[al]*0.5;
al+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
evdwl+=engv[al];
al+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
vatom[i][j]+=engv[al]*0.5;
virial_acc[j]+=engv[al];
al+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
virial_acc[j]+=engv[al];
al+=_inum;
}
}
}
@ -231,29 +209,29 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
virial[j]+=virial_acc[j]*0.5;
} else {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int al=i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
evdwl+=engv[al];
eatom[ii]+=engv[al]*0.5;
al+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
evdwl+=engv[al];
al+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
vatom[ii][j]+=engv[al]*0.5;
virial_acc[j]+=engv[al];
al+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
virial_acc[j]+=engv[al];
al+=_inum;
}
}
}
@ -281,33 +259,33 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=0; i<6; i++) virial_acc[i]=0.0;
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int al=i;
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
_ecoul+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
evdwl+=engv[al];
eatom[i]+=engv[al]*0.5;
al+=_inum;
_ecoul+=engv[al];
eatom[i]+=engv[al]*0.5;
al+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
_ecoul+=*ap;
ap+=_inum;
evdwl+=engv[al];
al+=_inum;
_ecoul+=engv[al];
al+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
vatom[i][j]+=engv[al]*0.5;
virial_acc[j]+=engv[al];
al+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
virial_acc[j]+=engv[al];
al+=_inum;
}
}
}
@ -316,34 +294,34 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
virial[j]+=virial_acc[j]*0.5;
} else {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int al=i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
_ecoul+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
evdwl+=engv[al];
eatom[ii]+=engv[al]*0.5;
al+=_inum;
_ecoul+=engv[al];
eatom[ii]+=engv[al]*0.5;
al+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
_ecoul+=*ap;
ap+=_inum;
evdwl+=engv[al];
al+=_inum;
_ecoul+=engv[al];
al+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
vatom[ii][j]+=engv[al]*0.5;
virial_acc[j]+=engv[al];
al+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
virial_acc[j]+=engv[al];
al+=_inum;
}
}
}
@ -359,45 +337,37 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
template <class numtyp, class acctyp>
void AnswerT::get_answers(double **f, double **tor) {
acctyp *ap=host_ans.begin();
int fl=0;
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
f[i][0]+=*ap;
ap++;
f[i][1]+=*ap;
ap++;
f[i][2]+=*ap;
ap+=2;
f[i][0]+=force[fl];
f[i][1]+=force[fl+1];
f[i][2]+=force[fl+2];
fl+=4;
}
if (_rot) {
for (int i=0; i<_inum; i++) {
tor[i][0]+=*ap;
ap++;
tor[i][1]+=*ap;
ap++;
tor[i][2]+=*ap;
ap+=2;
tor[i][0]+=force[fl];
tor[i][1]+=force[fl+1];
tor[i][2]+=force[fl+2];
fl+=4;
}
}
} else {
for (int i=0; i<_inum; i++) {
int ii=_ilist[i];
f[ii][0]+=*ap;
ap++;
f[ii][1]+=*ap;
ap++;
f[ii][2]+=*ap;
ap+=2;
f[ii][0]+=force[fl];
f[ii][1]+=force[fl+1];
f[ii][2]+=force[fl+2];
fl+=4;
}
if (_rot) {
for (int i=0; i<_inum; i++) {
int ii=_ilist[i];
tor[ii][0]+=*ap;
ap++;
tor[ii][1]+=*ap;
ap++;
tor[ii][2]+=*ap;
ap+=2;
tor[ii][0]+=force[fl];
tor[ii][1]+=force[fl+1];
tor[ii][2]+=force[fl+2];
fl+=4;
}
}
}
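The Answer refactoring above replaces each dev_*/host_* pair with one UCL_Vector whose .host and .device halves are kept in sync with update_host, and whose operator[] indexes the host copy, as the engv[al] loops show. The pattern, reduced to a sketch with illustrative names (dev, fields, nlocal):

// Sketch of the paired host/device container pattern used above.
// Allocation kinds mirror the calls in Answer::alloc; names are illustrative.
UCL_Vector<double,double> engv;
engv.alloc(fields*nlocal,dev,UCL_RW_OPTIMIZED,UCL_WRITE_ONLY);

// ... a kernel accumulates per-atom energy/virial into engv.device ...

engv.update_host(fields*nlocal,false);  // blocking copy device -> host
double evdwl=0.0;
for (int i=0; i<nlocal; i++)
  evdwl+=engv[i];                       // operator[] reads the host buffer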

View File

@ -19,18 +19,18 @@
#include <math.h>
#include "mpi.h"
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
using namespace ucl_opencl;
#elif defined(USE_CUDART)
#include "geryon/nvc_timer.h"
#include "geryon/nvc_mat.h"
using namespace ucl_cudart;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
using namespace ucl_cudadr;
#endif
#include "lal_precision.h"
@ -59,8 +59,10 @@ class Answer {
inline void resize(const int inum, bool &success) {
_inum=inum;
if (inum>_max_local) {
clear_resize();
success = success && alloc(inum);
_max_local=static_cast<int>(static_cast<double>(inum)*1.10);
success=success && (force.resize(_max_local*_ans_fields)==UCL_SUCCESS);
success=success && (engv.resize(_max_local*_ev_fields)==UCL_SUCCESS);
_gpu_bytes=engv.device.row_bytes()+force.device.row_bytes();
}
}
@ -68,9 +70,6 @@ class Answer {
/** \param rot True if atom storage needs quaternions **/
bool add_fields(const bool charge, const bool rot);
/// Free all memory on host and device needed to realloc for more atoms
void clear_resize();
/// Free all memory on host and device
void clear();
@ -136,14 +135,9 @@ class Answer {
// ------------------------------ DATA ----------------------------------
/// Force and possibly torque
UCL_D_Vec<acctyp> dev_ans;
UCL_Vector<acctyp,acctyp> force;
/// Energy and virial per-atom storage
UCL_D_Vec<acctyp> dev_engv;
/// Force and possibly torque data on host
UCL_H_Vec<acctyp> host_ans;
/// Energy/virial data on host
UCL_H_Vec<acctyp> host_engv;
UCL_Vector<acctyp,acctyp> engv;
/// Device timers
UCL_Timer time_answer;
@ -155,7 +149,7 @@ class Answer {
bool alloc(const int inum);
bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
int _max_local, _inum, _e_fields, _ev_fields;
int _max_local, _inum, _e_fields, _ev_fields, _ans_fields;
int *_ilist;
double _time_cast, _time_cpu_idle;

View File

@ -51,9 +51,13 @@ bool AtomT::alloc(const int nall) {
bool success=true;
// Ignore host/device transfers?
bool cpuview=false;
if (dev->device_type()==UCL_CPU)
cpuview=true;
_host_view=false;
if (dev->shared_memory()) {
_host_view=true;
#ifdef GPU_CAST
assert(0==1);
#endif
}
// Allocate storage for CUDPP sort
#ifdef USE_CUDPP
@ -64,63 +68,101 @@ bool AtomT::alloc(const int nall) {
}
#endif
// -------------------------- Host allocations
// Get a host write only buffer
#ifdef GPU_CAST
success=success && (host_x_cast.alloc(_max_atoms*3,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
success=success && (host_type_cast.alloc(_max_atoms,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
#else
success=success && (host_x.alloc(_max_atoms*4,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
#endif
// Buffer for casting only if different precisions
if (_charge)
success=success && (host_q.alloc(_max_atoms,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
// Buffer for casting only if different precisions
if (_rot)
success=success && (host_quat.alloc(_max_atoms*4,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
// --------------------------- Device allocations
int gpu_bytes=0;
if (cpuview) {
#ifdef GPU_CAST
assert(0==1);
#else
dev_x.view(host_x);
#endif
if (_rot)
dev_quat.view(host_quat);
if (_charge)
dev_q.view(host_q);
} else {
#ifdef GPU_CAST
success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev));
success=success && (UCL_SUCCESS==
dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
success=success && (UCL_SUCCESS==
dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
#else
success=success && (UCL_SUCCESS==
dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
#endif
if (_charge) {
success=success && (dev_q.alloc(_max_atoms,*dev,
success=success && (x.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_q.row_bytes();
}
if (_rot) {
success=success && (dev_quat.alloc(_max_atoms*4,*dev,
#ifdef GPU_CAST
success=success && (x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)==
UCL_SUCCESS);
success=success && (type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)==
UCL_SUCCESS);
gpu_bytes+=x_cast.device.row_bytes()+type_cast.device.row_bytes();
#endif
if (_charge && _host_view==false) {
success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_quat.row_bytes();
gpu_bytes+=q.device.row_bytes();
}
if (_rot && _host_view==false) {
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=quat.device.row_bytes();
}
if (_gpu_nbor>0) {
if (_bonds) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_tag.row_bytes();
}
if (_gpu_nbor==1) {
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_cell_id.row_bytes();
} else {
success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success &&
(host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
}
if (_gpu_nbor==2 && _host_view)
dev_particle_id.view(host_particle_id);
else
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_particle_id.row_bytes();
}
gpu_bytes+=x.device.row_bytes();
if (gpu_bytes>_max_gpu_bytes)
_max_gpu_bytes=gpu_bytes;
_allocated=true;
return success;
}
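The reworked alloc() above replaces the separate host_*/dev_* buffer pairs with single containers (x, q, quat) that carry both a host and a device allocation, and adds a _host_view flag so that devices sharing host memory can view the host buffer instead of allocating a device copy. A minimal standalone sketch of that pattern, using illustrative names (HostDevVec, update_device) rather than the actual UCL_Vector interface:

```cpp
// Minimal sketch of a paired host/device buffer with an optional zero-copy
// "host view" mode; HostDevVec and its members are illustrative stand-ins,
// not the Geryon UCL_Vector API.
#include <cstddef>
#include <cstring>
#include <vector>

template <class T>
struct HostDevVec {
  std::vector<T> host;      // write-optimized host staging buffer
  std::vector<T> device;    // stands in for the device-side allocation
  bool host_view = false;   // true when the device addresses host memory

  bool alloc(std::size_t n, bool shared_memory) {
    host.resize(n);
    host_view = shared_memory;
    if (!host_view) device.resize(n);  // only spend device memory when needed
    return true;
  }

  // Copy host -> device unless the device already views the host buffer.
  void update_device(std::size_t n) {
    if (!host_view) std::memcpy(device.data(), host.data(), n * sizeof(T));
  }
};

int main() {
  HostDevVec<float> x;
  x.alloc(4 * 16, /*shared_memory=*/false);  // 16 atoms, x/y/z/type interleaved
  x.update_device(4 * 16);
  return 0;
}
```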
template <class numtyp, class acctyp>
bool AtomT::add_fields(const bool charge, const bool rot,
const int gpu_nbor, const bool bonds) {
bool success=true;
// Ignore host/device transfers?
int gpu_bytes=0;
if (charge && _charge==false) {
_charge=true;
_other=true;
if (_host_view==false) {
success=success && (q.alloc(_max_atoms,*dev,UCL_WRITE_OPTIMIZED,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=q.device.row_bytes();
}
}
if (rot && _rot==false) {
_rot=true;
_other=true;
if (_host_view==false) {
success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_OPTIMIZED,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=quat.device.row_bytes();
}
}
if (bonds && _bonds==false) {
_bonds=true;
if (_bonds && _gpu_nbor>0) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_tag.row_bytes();
}
}
if (gpu_nbor>0 && _gpu_nbor==0) {
_gpu_nbor=gpu_nbor;
#ifdef USE_CUDPP
if (_gpu_nbor==1) {
CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
if (CUDPP_SUCCESS != result)
return false;
}
#endif
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_particle_id.row_bytes();
if (_bonds) {
@ -137,43 +179,9 @@ bool AtomT::alloc(const int nall) {
}
}
gpu_bytes+=dev_x.row_bytes();
if (gpu_bytes>_max_gpu_bytes)
_max_gpu_bytes=gpu_bytes;
_allocated=true;
return success;
}
template <class numtyp, class acctyp>
bool AtomT::add_fields(const bool charge, const bool rot,
const int gpu_nbor, const bool bonds) {
bool realloc=false;
if (charge && _charge==false) {
_charge=true;
realloc=true;
}
if (rot && _rot==false) {
_rot=true;
realloc=true;
}
if (gpu_nbor>0 && _gpu_nbor==0) {
_gpu_nbor=gpu_nbor;
realloc=true;
}
if (bonds && _bonds==false) {
_bonds=true;
realloc=true;
}
if (realloc) {
_other=_charge || _rot;
int max_atoms=_max_atoms;
clear_resize();
return alloc(max_atoms);
}
return true;
}
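For comparison with the removed block above, the new add_fields() earlier in this hunk grows storage incrementally: only the arrays belonging to newly requested fields are allocated, instead of clearing and reallocating everything through clear_resize()/alloc(). A compact sketch of that pattern, with illustrative names:

```cpp
// Sketch of incremental field allocation: grow only what was newly requested
// and leave existing per-atom arrays untouched. Names are illustrative.
#include <vector>

struct AtomBuffers {
  int max_atoms = 0;
  bool has_charge = false, has_quat = false;
  std::vector<float> q, quat;

  bool add_fields(bool charge, bool rot) {
    if (charge && !has_charge) {   // charge array is the only new request
      has_charge = true;
      q.resize(max_atoms);
    }
    if (rot && !has_quat) {        // quaternion array is the only new request
      has_quat = true;
      quat.resize(4 * max_atoms);
    }
    return true;
  }
};

int main() {
  AtomBuffers a;
  a.max_atoms = 1000;
  a.add_fields(/*charge=*/true, /*rot=*/false);
  a.add_fields(/*charge=*/true, /*rot=*/true);  // second call only adds quat
  return 0;
}
```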
template <class numtyp, class acctyp>
bool AtomT::init(const int nall, const bool charge, const bool rot,
UCL_Device &devi, const int gpu_nbor, const bool bonds) {
@ -219,27 +227,18 @@ void AtomT::clear_resize() {
return;
_allocated=false;
dev_x.clear();
if (_charge) {
dev_q.clear();
host_q.clear();
}
if (_rot) {
dev_quat.clear();
host_quat.clear();
}
#ifndef GPU_CAST
host_x.clear();
#else
host_x_cast.clear();
host_type_cast.clear();
#endif
x.clear();
if (_charge)
q.clear();
if (_rot)
quat.clear();
dev_cell_id.clear();
dev_particle_id.clear();
dev_tag.clear();
#ifdef GPU_CAST
dev_x_cast.clear();
dev_type_cast.clear();
x_cast.clear();
type_cast.clear();
#endif
#ifdef USE_CUDPP
@ -279,8 +278,7 @@ double AtomT::host_memory_usage() const {
atom_bytes+=1;
if (_rot)
atom_bytes+=4;
return _max_atoms*atom_bytes*sizeof(numtyp)+
sizeof(Atom<numtyp,acctyp>);
return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
}
// Sort arrays for neighbor list calculation
@ -292,16 +290,18 @@ void AtomT::sort_neighbor(const int num_atoms) {
8*sizeof(unsigned), num_atoms);
if (CUDPP_SUCCESS != result) {
printf("Error in cudppSort\n");
NVD_GERYON_EXIT;
UCL_GERYON_EXIT;
}
#endif
}
#ifdef GPU_CAST
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "atom_cl.h"
#elif defined(USE_CUDART)
const char *atom=0;
#else
#include "atom_ptx.h"
#include "atom_cubin.h"
#endif
template <class numtyp, class acctyp>
@ -316,3 +316,4 @@ void AtomT::compile_kernels(UCL_Device &dev) {
#endif
template class Atom<PRECISION,ACC_PRECISION>;

View File

@ -19,20 +19,21 @@
#include <math.h>
#include "mpi.h"
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
using namespace ucl_opencl;
#elif defined(USE_CUDART)
#include "geryon/nvc_timer.h"
#include "geryon/nvc_mat.h"
#include "geryon/nvc_kernel.h"
using namespace ucl_cudart;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
using namespace ucl_cudadr;
#endif
#ifdef USE_CUDPP
@ -92,7 +93,7 @@ class Atom {
bool charge() { return _charge; }
/// Returns true if GPU is using quaternions
bool quat() { return _rot; }
bool quaternion() { return _rot; }
/// Only free matrices of length inum or nall for resizing
void clear_resize();
@ -251,16 +252,13 @@ class Atom {
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
#else
numtyp *_write_loc=host_x.begin();
int wl=0;
for (int i=0; i<_nall; i++) {
*_write_loc=host_ptr[i][0];
_write_loc++;
*_write_loc=host_ptr[i][1];
_write_loc++;
*_write_loc=host_ptr[i][2];
_write_loc++;
*_write_loc=host_type[i];
_write_loc++;
x[wl]=host_ptr[i][0];
x[wl+1]=host_ptr[i][1];
x[wl+2]=host_ptr[i][2];
x[wl+3]=host_type[i];
wl+=4;
}
#endif
_time_cast+=MPI_Wtime()-t;
@ -273,15 +271,14 @@ class Atom {
time_pos.start();
if (_x_avail==false) {
#ifdef GPU_CAST
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
ucl_copy(dev_type_cast,host_type_cast,_nall,true);
x_cast.update_device(_nall*3,true);
type_cast.update_device(_nall,true);
int block_size=64;
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
k_cast_x.set_size(GX,block_size);
k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
&_nall);
k_cast_x.run(&x, &x_cast, &type_cast, &_nall);
#else
ucl_copy(dev_x,host_x,_nall*4,true);
x.update_device(_nall*4,true);
#endif
_x_avail=true;
}
@ -299,18 +296,14 @@ class Atom {
inline void cast_q_data(cpytyp *host_ptr) {
if (_q_avail==false) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_q.view((numtyp*)host_ptr,_nall,*dev);
dev_q.view(host_q);
} else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
// If double precision, still memcpy for async transfers
if (_host_view) {
q.host.view((numtyp*)host_ptr,_nall,*dev);
q.device.view(q.host);
} else if (sizeof(numtyp)==sizeof(double))
memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp));
else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
}
for (int i=0; i<_nall; i++) q[i]=host_ptr[i];
_time_cast+=MPI_Wtime()-t;
}
}
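The rewritten cast_q_data() above chooses between three upload paths: a zero-copy view when the device addresses host memory, a memcpy into the staging buffer when host and device precision match (so the later transfer can run asynchronously), and an element-wise cast otherwise. A standalone sketch of that selection, with plain containers standing in for the UCL buffers:

```cpp
// Sketch of the three charge-upload paths; std::vector stands in for the
// pinned host buffer, and the host_view branch is reduced to a no-op since
// viewing a host pointer from the device is library-specific.
#include <cstring>
#include <vector>

template <class numtyp, class cpytyp>
void cast_charges(std::vector<numtyp> &q_host, const cpytyp *host_ptr,
                  int nall, bool host_view) {
  if (host_view)
    return;                         // device reads host memory directly
  if (sizeof(numtyp) == sizeof(cpytyp))
    // Same precision: still copy into the staging buffer so the later
    // host->device transfer can be issued asynchronously.
    std::memcpy(q_host.data(), host_ptr, nall * sizeof(numtyp));
  else
    for (int i = 0; i < nall; i++)  // mixed precision: cast element-wise
      q_host[i] = static_cast<numtyp>(host_ptr[i]);
}

int main() {
  std::vector<float> q(3);
  const double raw[3] = {1.0, -1.0, 0.5};
  cast_charges(q, raw, 3, /*host_view=*/false);  // takes the casting branch
  return 0;
}
```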
@ -318,7 +311,7 @@ class Atom {
// Copy charges to device asynchronously
inline void add_q_data() {
if (_q_avail==false) {
ucl_copy(dev_q,host_q,_nall,true);
q.update_device(_nall,true);
_q_avail=true;
}
}
@ -328,18 +321,13 @@ class Atom {
inline void cast_quat_data(cpytyp *host_ptr) {
if (_quat_avail==false) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
dev_quat.view(host_quat);
} else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
if (_host_view) {
quat.host.view((numtyp*)host_ptr,_nall*4,*dev);
quat.device.view(quat.host);
} else if (sizeof(numtyp)==sizeof(double))
memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
}
for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
_time_cast+=MPI_Wtime()-t;
}
}
@ -348,7 +336,7 @@ class Atom {
/** Copies nall()*4 elements **/
inline void add_quat_data() {
if (_quat_avail==false) {
ucl_copy(dev_quat,host_quat,_nall*4,true);
quat.update_device(_nall*4,true);
_quat_avail=true;
}
}
@ -363,29 +351,23 @@ class Atom {
inline double max_gpu_bytes()
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }
/// Returns true if the device is addressing memory on the host
inline bool host_view() { return _host_view; }
// ------------------------------ DATA ----------------------------------
/// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type)
UCL_D_Vec<numtyp> dev_x;
UCL_Vector<numtyp,numtyp> x;
/// Charges
UCL_D_Vec<numtyp> dev_q;
UCL_Vector<numtyp,numtyp> q;
/// Quaternions
UCL_D_Vec<numtyp> dev_quat;
UCL_Vector<numtyp,numtyp> quat;
#ifdef GPU_CAST
UCL_D_Vec<double> dev_x_cast;
UCL_D_Vec<int> dev_type_cast;
UCL_H_Vec<double> host_x_cast;
UCL_H_Vec<int> host_type_cast;
UCL_Vector<double,double> x_cast;
UCL_Vector<int,int> type_cast;
#endif
/// Buffer for moving positions to device
UCL_H_Vec<numtyp> host_x;
/// Buffer for moving charge data to GPU
UCL_H_Vec<numtyp> host_q;
/// Buffer for moving quat data to GPU
UCL_H_Vec<numtyp> host_quat;
/// Cell list identifiers for device nbor builds
UCL_D_Vec<unsigned> dev_cell_id;
/// Cell list identifiers for device nbor builds
@ -418,9 +400,9 @@ class Atom {
bool alloc(const int nall);
bool _allocated, _rot, _charge, _other;
bool _allocated, _rot, _charge, _bonds, _other;
int _max_atoms, _nall, _gpu_nbor;
bool _bonds;
bool _host_view;
double _time_cast, _time_transfer;
double _max_gpu_bytes;
@ -434,3 +416,4 @@ class Atom {
}
#endif

View File

@ -41,9 +41,9 @@ int BaseAtomicT::bytes_per_atom_atomic(const int max_nbors) const {
template <class numtyp, class acctyp>
int BaseAtomicT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_name) {
screen=_screen;
int gpu_nbor=0;
@ -74,7 +74,7 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
atom=&device->atom;
_block_size=device->pair_block_size();
compile_kernels(*ucl_device,pair_program);
compile_kernels(*ucl_device,pair_program,k_name);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
@ -83,7 +83,7 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->dev_x,4);
pos_tex.bind_float(atom->x,4);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
@ -266,18 +266,20 @@ double BaseAtomicT::host_memory_usage_atomic() const {
}
template <class numtyp, class acctyp>
void BaseAtomicT::compile_kernels(UCL_Device &dev, const char *pair_str) {
void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *kname) {
if (_compiled)
return;
std::string s_fast=std::string(kname)+"_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
k_pair.set_function(*pair_program,"kernel_pair");
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");
_compiled=true;
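compile_kernels() now receives the kernel name from the pair style and derives the name of the "_fast" variant from it, so every pair style can carry uniquely named kernels inside a single program object. A trivial standalone check of the naming scheme used above:

```cpp
// The "_fast" kernel name is derived from the base name passed by the
// pair style (e.g. "k_buck" elsewhere in this commit).
#include <cassert>
#include <string>

int main() {
  const char *kname = "k_buck";                 // supplied by the pair style
  std::string s_fast = std::string(kname) + "_fast";
  assert(s_fast == "k_buck_fast");              // looked up next to "k_buck"
  return 0;
}
```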

View File

@ -20,8 +20,10 @@
#include "lal_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -38,6 +40,7 @@ class BaseAtomic {
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param k_name name of the kernel used for the force calculation
*
* Returns:
* - 0 if successful
@ -48,7 +51,7 @@ class BaseAtomic {
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
const void *pair_program, const char *k_name);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
@ -57,7 +60,7 @@ class BaseAtomic {
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success))
pos_tex.bind_float(atom->dev_x,4);
pos_tex.bind_float(atom->x,4);
ans->resize(inum,success);
}
@ -188,7 +191,7 @@ class BaseAtomic {
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const char *pair_string);
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};

View File

@ -42,9 +42,9 @@ int BaseChargeT::bytes_per_atom_atomic(const int max_nbors) const {
template <class numtyp, class acctyp>
int BaseChargeT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_name) {
screen=_screen;
int gpu_nbor=0;
@ -76,7 +76,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program);
compile_kernels(*ucl_device,pair_program,k_name);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
@ -85,8 +85,8 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
@ -282,18 +282,20 @@ double BaseChargeT::host_memory_usage_atomic() const {
}
template <class numtyp, class acctyp>
void BaseChargeT::compile_kernels(UCL_Device &dev, const char *pair_str) {
void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *kname) {
if (_compiled)
return;
std::string s_fast=std::string(kname)+"_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
k_pair.set_function(*pair_program,"kernel_pair");
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex");

View File

@ -21,8 +21,10 @@
#include "lal_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -39,6 +41,7 @@ class BaseCharge {
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param k_name name of the kernel used for the force calculation
*
* Returns:
* - 0 if successful
@ -49,7 +52,7 @@ class BaseCharge {
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
const void *pair_program, const char *k_name);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
@ -58,8 +61,8 @@ class BaseCharge {
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
}
ans->resize(inum,success);
}
@ -187,7 +190,7 @@ class BaseCharge {
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const char *pair_string);
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};

View File

@ -17,10 +17,12 @@
#include <cstdlib>
using namespace LAMMPS_AL;
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "ellipsoid_nbor_cl.h"
#elif defined(USE_CUDART)
const char *ellipsoid_nbor=0;
#else
#include "ellipsoid_nbor_ptx.h"
#include "ellipsoid_nbor_cubin.h"
#endif
#define BaseEllipsoidT BaseEllipsoid<numtyp, acctyp>
@ -50,8 +52,9 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split,
FILE *_screen, const int ntypes, int **h_form,
const char *ellipsoid_program,
const char *lj_program, const bool ellip_sphere) {
const void *ellipsoid_program,
const void *lj_program, const char *k_name,
const bool ellip_sphere) {
screen=_screen;
_ellipsoid_sphere=ellip_sphere;
@ -78,7 +81,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
atom=&device->atom;
_block_size=device->pair_block_size();
compile_kernels(*ucl_device,ellipsoid_program,lj_program,ellip_sphere);
compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
@ -112,7 +115,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
}
if (_multiple_forms)
ans->dev_ans.zero();
ans->force.zero();
// Memory for ilist ordered by particle type
if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
@ -121,6 +124,12 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
neigh_tex.bind_float(atom->x,4);
pos_tex.bind_float(atom->x,4);
quat_tex.bind_float(atom->quat,4);
lj_pos_tex.bind_float(atom->x,4);
lj_quat_tex.bind_float(atom->quat,4);
return 0;
}
@ -241,14 +250,12 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
int stride=nbor->nbor_pitch();
if (shared_types) {
k_nbor_fast.set_size(GX,BX);
k_nbor_fast.run(&atom->dev_x.begin(), &cut_form.begin(),
&nbor->dev_nbor.begin(), &stride, &start, &inum,
&nbor->dev_packed.begin(), &form_low, &form_high);
k_nbor_fast.run(&atom->x, &cut_form, &nbor->dev_nbor, &stride, &start,
&inum, &nbor->dev_packed, &form_low, &form_high);
} else {
k_nbor.set_size(GX,BX);
k_nbor.run(&atom->dev_x.begin(), &cut_form.begin(), &ntypes,
&nbor->dev_nbor.begin(), &stride, &start, &inum,
&nbor->dev_packed.begin(), &form_low, &form_high);
k_nbor.run(&atom->x, &cut_form, &ntypes, &nbor->dev_nbor, &stride,
&start, &inum, &nbor->dev_packed, &form_low, &form_high);
}
}
@ -437,11 +444,18 @@ double BaseEllipsoidT::host_memory_usage_base() const {
template <class numtyp, class acctyp>
void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
const char *ellipsoid_string,
const char *lj_string, const bool e_s) {
const void *ellipsoid_string,
const void *lj_string,
const char *kname, const bool e_s) {
if (_compiled)
return;
std::string kns=kname;
std::string s_sphere_ellipsoid=kns+"_sphere_ellipsoid";
std::string s_ellipsoid_sphere=kns+"_ellipsoid_sphere";
std::string s_lj=kns+"_lj";
std::string s_lj_fast=kns+"_lj_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
@ -450,18 +464,23 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
nbor_program->load_string(ellipsoid_nbor,flags.c_str());
k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast");
k_nbor.set_function(*nbor_program,"kernel_nbor");
neigh_tex.get_texture(*nbor_program,"pos_tex");
ellipsoid_program=new UCL_Program(dev);
ellipsoid_program->load_string(ellipsoid_string,flags.c_str());
k_ellipsoid.set_function(*ellipsoid_program,"kernel_ellipsoid");
k_ellipsoid.set_function(*ellipsoid_program,kname);
pos_tex.get_texture(*ellipsoid_program,"pos_tex");
quat_tex.get_texture(*ellipsoid_program,"quat_tex");
lj_program=new UCL_Program(dev);
lj_program->load_string(lj_string,flags.c_str());
k_sphere_ellipsoid.set_function(*lj_program,"kernel_sphere_ellipsoid");
k_lj_fast.set_function(*lj_program,"kernel_lj_fast");
k_lj.set_function(*lj_program,"kernel_lj");
k_sphere_ellipsoid.set_function(*lj_program,s_sphere_ellipsoid.c_str());
k_lj_fast.set_function(*lj_program,s_lj_fast.c_str());
k_lj.set_function(*lj_program,s_lj.c_str());
if (e_s)
k_ellipsoid_sphere.set_function(*lj_program,"kernel_ellipsoid_sphere");
k_ellipsoid_sphere.set_function(*lj_program,s_ellipsoid_sphere.c_str());
lj_pos_tex.get_texture(*lj_program,"pos_tex");
lj_quat_tex.get_texture(*lj_program,"quat_tex");
_compiled=true;
}

View File

@ -20,8 +20,10 @@
#include "lal_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -39,6 +41,7 @@ class BaseEllipsoid {
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param ellipsoid_sphere true if ellipsoid-sphere case handled separately
* \param k_name name of the kernel used for the force calculation
*
* Returns:
* - 0 if successful
@ -49,8 +52,9 @@ class BaseEllipsoid {
int init_base(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, const int ntypes,
int **h_form, const char *ellipsoid_program,
const char *lj_program, const bool ellipsoid_sphere=false);
int **h_form, const void *ellipsoid_program,
const void *lj_program, const char *k_name,
const bool ellipsoid_sphere=false);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
@ -58,7 +62,13 @@ class BaseEllipsoid {
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int nall, bool &success) {
atom->resize(nall, success);
if (atom->resize(nall, success)) {
neigh_tex.bind_float(atom->x,4);
pos_tex.bind_float(atom->x,4);
quat_tex.bind_float(atom->quat,4);
lj_pos_tex.bind_float(atom->x,4);
lj_quat_tex.bind_float(atom->quat,4);
}
}
/// Check if there is enough storage for neighbors and realloc if not
@ -74,7 +84,7 @@ class BaseEllipsoid {
const int max_nbors, const int olist_size,
bool &success) {
ans->resize(nlocal, success);
if (_multiple_forms) ans->dev_ans.zero();
if (_multiple_forms) ans->force.zero();
if (olist_size>static_cast<int>(host_olist.numel())) {
host_olist.clear();
@ -221,8 +231,7 @@ class BaseEllipsoid {
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
UCL_Texture q_tex;
UCL_Texture pos_tex, quat_tex, lj_pos_tex, lj_quat_tex, neigh_tex;
protected:
bool _compiled, _ellipsoid_sphere;
@ -236,8 +245,8 @@ class BaseEllipsoid {
int **_host_form;
int _last_ellipse, _max_last_ellipse;
void compile_kernels(UCL_Device &dev, const char *ellipsoid_string,
const char *lj_string, const bool e_s);
void compile_kernels(UCL_Device &dev, const void *ellipsoid_string,
const void *lj_string, const char *kname,const bool e_s);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};

View File

@ -13,10 +13,12 @@
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "buck_cl.h"
#elif defined(USE_CUDART)
const char *buck=0;
#else
#include "buck_ptx.h"
#include "buck_cubin.h"
#endif
#include "lal_buck.h"
@ -50,7 +52,7 @@ int BuckT::init(const int ntypes, double **host_cutsq,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,buck);
_screen,buck,"k_buck");
if (success!=0)
return success;
@ -132,20 +134,17 @@ void BuckT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(),
&coeff2.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -15,14 +15,16 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
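The preamble above replaces the old fetch_pos() helper with a fetch4() read that goes through the pos_tex texture on single-precision CUDA builds and falls back to a plain array access otherwise (the `#define pos_tex x_` case). A plain-C++ sketch of the fallback path only, since the texture path needs the CUDA toolchain; the macro bodies here are illustrative:

```cpp
// Plain-C++ sketch of the non-texture fallback used by fetch4 above; the
// texture path is CUDA-only and omitted. Macro bodies are illustrative.
#include <cstdio>

struct numtyp4 { float x, y, z, w; };

#define pos_tex x_                    // fallback: the "texture" is the array
#define fetch4(ans, i, tex) ans = tex[i]

int main() {
  numtyp4 x_[2] = {{0.f, 0.f, 0.f, 1.f}, {1.f, 2.f, 3.f, 2.f}};
  numtyp4 ix; fetch4(ix, 1, pos_tex); // same call shape as in the kernels
  std::printf("%g %g %g type %g\n", ix.x, ix.y, ix.z, ix.w);
  return 0;
}
```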
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
__kernel void k_buck(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -104,7 +106,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__kernel void k_buck_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -140,7 +142,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -151,7 +153,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12

View File

@ -13,10 +13,12 @@
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "buck_coul_cl.h"
#elif defined(USE_CUDART)
const char *buck_coul=0;
#else
#include "buck_coul_ptx.h"
#include "buck_coul_cubin.h"
#endif
#include "lal_buck_coul.h"
@ -52,7 +54,7 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq,
const double qqrd2e) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,buck_coul);
_screen,buck_coul,"k_buck_coul");
if (success!=0)
return success;
@ -142,23 +144,18 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(),
&coeff2.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch,
&this->atom->dev_q.begin(), &cutsq.begin(),
&_qqrd2e, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_qqrd2e, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(), &coeff2.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
&cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_qqrd2e, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -14,18 +14,22 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_buck_coul(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -97,9 +101,10 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
} else
forcebuck = (numtyp)0.0;
if (rsq < coeff2[mtype].z) // coul
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
else
if (rsq < coeff2[mtype].z) {
fetch(forcecoul,j,q_tex);
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
} else
forcecoul = (numtyp)0.0;
force = (forcebuck + forcecoul) * r2inv;
@ -131,7 +136,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__kernel void k_buck_coul_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -172,8 +177,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -185,7 +190,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
@ -208,9 +213,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
} else
forcebuck = (numtyp)0.0;
if (rsq < cutsq[mtype].z) // coul
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
else
if (rsq < cutsq[mtype].z) {
fetch(forcecoul,j,q_tex);
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
} else
forcecoul = (numtyp)0.0;
force = (forcebuck + forcecoul) * r2inv;

View File

@ -13,10 +13,12 @@
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "buck_coul_long_cl.h"
#elif defined(USE_CUDART)
const char *buck_coul_long=0;
#else
#include "buck_coul_long_ptx.h"
#include "buck_coul_long_cubin.h"
#endif
#include "lal_buck_coul_long.h"
@ -54,7 +56,7 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,buck_coul_long);
_screen,buck_coul_long,"k_buck_coul_long");
if (success!=0)
return success;
@ -145,25 +147,19 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff1.begin(),
&coeff2.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
&cutsq.begin(), &_cut_coulsq, &_qqrd2e,
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_cut_coulsq, &_qqrd2e,
&_g_ewald, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &coeff1.begin(),
&coeff2.begin(), &_lj_types, &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
&cutsq.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &cutsq,
&_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -14,18 +14,22 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_buck_coul_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -62,8 +66,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
@ -74,7 +78,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -104,7 +108,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
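All of the long-range Coulomb kernels touched by this commit evaluate the real-space Ewald term with the same polynomial erfc() approximation shown above (`t*(A1+t*(A2+...))*expm2` with `t = 1/(1+EWALD_P*grij)`). The coefficients are defined elsewhere in the library; the standalone check below assumes they are the usual Abramowitz-Stegun 7.1.26 constants:

```cpp
// Hedged sketch: compares the polynomial erfc approximation used in the
// kernels with the C library erfc(). The coefficient values are the standard
// Abramowitz & Stegun 7.1.26 constants and are an assumption here, not
// copied from this commit.
#include <cmath>
#include <cstdio>

static double erfc_approx(double x) {
  const double EWALD_P = 0.3275911;
  const double A1 =  0.254829592, A2 = -0.284496736, A3 = 1.421413741,
               A4 = -1.453152027, A5 =  1.061405429;
  const double t = 1.0 / (1.0 + EWALD_P * x);
  const double expm2 = std::exp(-x * x);
  return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
}

int main() {
  for (double grij : {0.1, 0.5, 1.0, 2.0})
    std::printf("grij=%.1f  approx=%.7f  libm=%.7f\n",
                grij, erfc_approx(grij), std::erfc(grij));
  return 0;
}
```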
@ -139,7 +144,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__kernel void k_buck_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -179,8 +184,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -192,7 +197,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
@ -221,7 +226,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "cg_cmm_cl.h"
#elif defined(USE_CUDART)
const char *cg_cmm=0;
#else
#include "cg_cmm_ptx.h"
#include "cg_cmm_cubin.h"
#endif
#include "lal_cg_cmm.h"
@ -51,7 +53,7 @@ int CGCMMT::init(const int ntypes, double **host_cutsq,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cg_cmm);
_screen,cg_cmm,"k_cg_cmm");
if (success!=0)
return success;
@ -133,19 +135,17 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
this->k_pair.run(&this->atom->x, &lj1, &lj3,
&_cmm_types, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();

View File

@ -15,14 +15,16 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void k_cg_cmm(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -109,7 +111,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_cg_cmm_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,__global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -145,7 +147,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -156,7 +158,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "cg_cmm_long_cl.h"
#elif defined(USE_CUDART)
const char *cg_cmm_long=0;
#else
#include "cg_cmm_long_ptx.h"
#include "cg_cmm_long_cubin.h"
#endif
#include "lal_cg_cmm_long.h"
@ -56,7 +58,7 @@ int CGCMMLongT::init(const int ntypes, double **host_cutsq,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cg_cmm_long);
_screen,cg_cmm_long,"k_cg_cmm_long");
if (success!=0)
return success;
@ -144,24 +146,19 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -14,18 +14,22 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_cg_cmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -108,7 +112,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
@ -143,7 +148,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_cg_cmm_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -181,8 +186,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -194,7 +199,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
@ -228,7 +233,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "charmm_long_cl.h"
#elif defined(USE_CUDART)
const char *charmm_long=0;
#else
#include "charmm_long_ptx.h"
#include "charmm_long_cubin.h"
#endif
#include "lal_charmm_long.h"
@ -57,7 +59,7 @@ int CHARMMLongT::init(const int ntypes,
double **sigma, const bool mix_arithmetic) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,charmm_long);
_screen,charmm_long,"k_charmm_long");
if (success!=0)
return success;
@ -148,22 +150,19 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
&sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&this->_threads_per_atom);

View File

@ -14,18 +14,22 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_charmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
const int lj_types, __global numtyp *sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
@ -62,8 +66,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
@ -74,7 +78,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -110,7 +114,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
@ -147,7 +152,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
__kernel void k_charmm_long_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
@ -185,8 +190,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
@ -197,7 +202,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -239,7 +244,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;

View File

@ -13,10 +13,12 @@
email : a.kohlmeyer@temple.edu
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "coul_long_cl.h"
#elif defined(USE_CUDART)
const char *coul_long=0;
#else
#include "coul_long_ptx.h"
#include "coul_long_cubin.h"
#endif
#include "lal_coul_long.h"
@ -48,7 +50,7 @@ int CoulLongT::init(const int nlocal, const int nall, const int max_nbors,
const double qqrd2e, const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
gpu_split,_screen,coul_long);
gpu_split,_screen,coul_long,"k_coul_long");
if (success!=0)
return success;
@ -132,22 +134,18 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_cl.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald,
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_cl,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_cl.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_cl,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();

View File

@ -14,18 +14,22 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_cl_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -56,8 +60,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
@ -66,7 +70,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
@ -83,7 +87,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
f.x+=delx*force;
@ -162,7 +167,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_cl_in,
__global int *dev_nbor, __global int *dev_packed,
@ -193,8 +198,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
@ -203,7 +208,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
@ -220,7 +225,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
f.x+=delx*force;
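
The kernel hunks above swap the old fetch_pos/fetch_q helpers for fetch4/fetch macros and split the Coulomb prefactor into a fetch followed by a multiply, so the same source reads positions and charges either through textures (single-precision CUDA) or straight from the arrays (OpenCL or double precision). The sketch below imitates that indirection with the legacy CUDA texture-reference API used by these kernels (deprecated in recent CUDA releases); the macro bodies are simplified stand-ins for the ones supplied by the package headers, and the demo "neighbor" and conversion factor are made up.

    #include <cuda_runtime.h>

    // Legacy texture references, in the style of the kernels above.
    texture<float4> pos_tex;
    texture<float>  q_tex;

    // Simplified stand-ins for the package's fetch4/fetch macros; the real ones
    // also provide non-texture fallbacks.
    #define fetch4(ans,i,tex) ans = tex1Dfetch(tex, i)
    #define fetch(ans,i,tex)  ans = tex1Dfetch(tex, i)

    __global__ void k_demo(float *out, int n, float qqrd2e) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        float4 ix;   fetch4(ix, i, pos_tex);     // position of atom i
        float  qtmp; fetch(qtmp, i, q_tex);      // charge of atom i
        int j = (i + 1) % n;                     // fake neighbor, for illustration only
        float4 jx;   fetch4(jx, j, pos_tex);
        float dx = ix.x - jx.x, dy = ix.y - jx.y, dz = ix.z - jx.z;
        float r = sqrtf(dx * dx + dy * dy + dz * dz) + 1e-6f;
        // Fetch-then-scale, as in the rewritten prefactor computation above.
        float prefactor; fetch(prefactor, j, q_tex);
        prefactor *= qqrd2e * qtmp / r;
        out[i] = prefactor;
      }
    }

    int main() {
      const int n = 256;
      float4 *d_x; float *d_q, *d_out;
      cudaMalloc(&d_x, n * sizeof(float4));
      cudaMalloc(&d_q, n * sizeof(float));
      cudaMalloc(&d_out, n * sizeof(float));
      cudaMemset(d_x, 0, n * sizeof(float4));
      cudaMemset(d_q, 0, n * sizeof(float));
      cudaBindTexture(NULL, pos_tex, d_x, n * sizeof(float4));  // bind linear buffers
      cudaBindTexture(NULL, q_tex,  d_q, n * sizeof(float));
      k_demo<<<(n + 127) / 128, 128>>>(d_out, n, 1.0f);          // conversion factor is illustrative
      cudaDeviceSynchronize();
      cudaUnbindTexture(pos_tex);
      cudaUnbindTexture(q_tex);
      cudaFree(d_x); cudaFree(d_q); cudaFree(d_out);
      return 0;
    }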

View File

@ -21,10 +21,12 @@
#include <omp.h>
#endif
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "device_cl.h"
#elif defined(USE_CUDART)
const char *device=0;
#else
#include "device_ptx.h"
#include "device_cubin.h"
#endif
using namespace LAMMPS_AL;
@ -42,10 +44,10 @@ DeviceT::~Device() {
}
template <class numtyp, class acctyp>
int DeviceT::init_device(MPI_Comm world, MPI_Comm replica,
const int first_gpu, const int last_gpu,
const int gpu_mode, const double p_split,
const int nthreads, const int t_per_atom) {
int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double p_split, const int nthreads,
const int t_per_atom, const double cell_size) {
_nthreads=nthreads;
#ifdef _OPENMP
omp_set_num_threads(nthreads);
@ -62,6 +64,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica,
_last_device=last_gpu;
_gpu_mode=gpu_mode;
_particle_split=p_split;
_cell_size=cell_size;
// Get the rank/size within the world
MPI_Comm_rank(_comm_world,&_world_me);
@ -191,7 +194,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
} else {
if (atom.charge()==false && charge)
_data_in_estimate++;
if (atom.quat()==false && rot)
if (atom.quaternion()==false && rot)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial))
return -3;
@ -205,7 +208,10 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
_block_cell_id, _block_nbor_build, threads_per_atom,
_warp_size, _time_device))
return -3;
nbor->cell_size(cell_size);
if (_cell_size<0.0)
nbor->cell_size(cell_size,cell_size);
else
nbor->cell_size(_cell_size,cell_size);
_init_count++;
return 0;
@ -251,7 +257,9 @@ void DeviceT::set_double_precompute
template <class numtyp, class acctyp>
void DeviceT::init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu) {
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
std::string fs="";
#elif defined(USE_CUDART)
std::string fs="";
#else
std::string fs=toa(gpu->free_gigabytes())+"/";
@ -411,11 +419,9 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls,
}
template <class numtyp, class acctyp>
void DeviceT::output_times(UCL_Timer &time_pair,
Answer<numtyp,acctyp> &ans,
void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
Neighbor &nbor, const double avg_split,
const double max_bytes,
const double gpu_overhead,
const double max_bytes, const double gpu_overhead,
const double driver_overhead,
const int threads_per_atom, FILE *screen) {
double single[9], times[9];
@ -574,33 +580,32 @@ int DeviceT::compile_kernels() {
k_info.set_function(*dev_program,"kernel_info");
_compiled=true;
UCL_H_Vec<int> h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
UCL_D_Vec<int> d_gpu_lib_data(14,*gpu);
UCL_Vector<int,int> gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
k_info.set_size(1,1);
k_info.run(&d_gpu_lib_data.begin());
ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false);
k_info.run(&gpu_lib_data);
gpu_lib_data.update_host(false);
_ptx_arch=static_cast<double>(h_gpu_lib_data[0])/100.0;
_ptx_arch=static_cast<double>(gpu_lib_data[0])/100.0;
#ifndef USE_OPENCL
if (_ptx_arch>gpu->arch())
return -4;
#endif
_num_mem_threads=h_gpu_lib_data[1];
_warp_size=h_gpu_lib_data[2];
_num_mem_threads=gpu_lib_data[1];
_warp_size=gpu_lib_data[2];
if (_threads_per_atom<1)
_threads_per_atom=h_gpu_lib_data[3];
_threads_per_atom=gpu_lib_data[3];
if (_threads_per_charge<1)
_threads_per_charge=h_gpu_lib_data[13];
_pppm_max_spline=h_gpu_lib_data[4];
_pppm_block=h_gpu_lib_data[5];
_block_pair=h_gpu_lib_data[6];
_max_shared_types=h_gpu_lib_data[7];
_block_cell_2d=h_gpu_lib_data[8];
_block_cell_id=h_gpu_lib_data[9];
_block_nbor_build=h_gpu_lib_data[10];
_block_bio_pair=h_gpu_lib_data[11];
_max_bio_shared_types=h_gpu_lib_data[12];
_threads_per_charge=gpu_lib_data[13];
_pppm_max_spline=gpu_lib_data[4];
_pppm_block=gpu_lib_data[5];
_block_pair=gpu_lib_data[6];
_max_shared_types=gpu_lib_data[7];
_block_cell_2d=gpu_lib_data[8];
_block_cell_id=gpu_lib_data[9];
_block_nbor_build=gpu_lib_data[10];
_block_bio_pair=gpu_lib_data[11];
_max_bio_shared_types=gpu_lib_data[12];
if (static_cast<size_t>(_block_pair)>gpu->group_size())
_block_pair=gpu->group_size();
@ -634,9 +639,10 @@ Device<PRECISION,ACC_PRECISION> global_device;
int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom) {
const int t_per_atom, const double cell_size) {
return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
particle_split,nthreads,t_per_atom);
particle_split,nthreads,t_per_atom,
cell_size);
}
void lmp_clear_device() {
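
The device changes above thread a new cell_size argument from lmp_init_device through init_device into the neighbor build, storing it in _cell_size with a negative value meaning "fall back to the pair cutoff when sizing neighbor bins". The sketch below shows only that sentinel pattern; NeighborStub and init_neighbor are hypothetical names, not the package's Neighbor interface.

    #include <cstdio>

    struct NeighborStub {
      double cell, cutoff;
      void cell_size(double c, double cut) { cell = c; cutoff = cut; }
    };

    static double _cell_size = -1.0;   // set at device init; -1 means "use the cutoff"

    void init_neighbor(NeighborStub &nbor, double cutoff_cell) {
      if (_cell_size < 0.0)
        nbor.cell_size(cutoff_cell, cutoff_cell);   // default: bin by the cutoff
      else
        nbor.cell_size(_cell_size, cutoff_cell);    // explicit bin size supplied at init
    }

    int main() {
      NeighborStub nbor;
      init_neighbor(nbor, 2.5);                     // no override: 2.5 bins
      printf("cell=%g cutoff=%g\n", nbor.cell, nbor.cutoff);
      _cell_size = 1.0;                             // override requested at init
      init_neighbor(nbor, 2.5);
      printf("cell=%g cutoff=%g\n", nbor.cell, nbor.cutoff);
      return 0;
    }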

View File

@ -49,7 +49,7 @@ class Device {
int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom);
const int t_per_atom, const double cell_size);
/// Initialize the device for Atom and Neighbor storage
/** \param rot True if quaternions need to be stored
@ -239,7 +239,7 @@ class Device {
int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
_block_pair));
k_zero.set_size(num_blocks,_block_pair);
k_zero.run(&mem.begin(),&numel);
k_zero.run(&mem,&numel);
}
// -------------------------- DEVICE DATA -------------------------
@ -288,6 +288,7 @@ class Device {
double _particle_split;
double _cpu_full;
double _ptx_arch;
double _cell_size; // -1 if the cutoff is used
int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
int _pppm_max_spline, _pppm_block;

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "eam_cl.h"
#elif defined(USE_CUDART)
const char *eam=0;
#else
#include "eam_ptx.h"
#include "eam_cubin.h"
#endif
#include "lal_eam.h"
@ -51,32 +53,24 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
{
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
gpu_split,_screen,eam);
gpu_split,_screen,eam,"k_eam");
if (success!=0)
return success;
// allocate fp
bool cpuview=false;
if (this->ucl_device->device_type()==UCL_CPU)
cpuview=true;
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
_max_fp_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
host_fp.alloc(_max_fp_size,*(this->ucl_device));
if (cpuview)
dev_fp.view(host_fp);
else
dev_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_WRITE_ONLY);
_fp.alloc(_max_fp_size,*(this->ucl_device),UCL_RW_OPTIMIZED,UCL_WRITE_ONLY);
k_energy.set_function(*(this->pair_program),"kernel_energy");
k_energy_fast.set_function(*(this->pair_program),"kernel_energy_fast");
k_energy.set_function(*(this->pair_program),"k_energy");
k_energy_fast.set_function(*(this->pair_program),"k_energy_fast");
fp_tex.get_texture(*(this->pair_program),"fp_tex");
fp_tex.bind_float(dev_fp,1);
fp_tex.bind_float(_fp,1);
_compiled_energy = true;
// Initialize timers for selected GPU
@ -236,7 +230,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
+ frho_spline2.row_bytes()
+ z2r_spline1.row_bytes()
+ z2r_spline2.row_bytes()
+ dev_fp.row_bytes();
+ _fp.device.row_bytes();
return 0;
}
@ -255,8 +249,7 @@ void EAMT::clear() {
z2r_spline1.clear();
z2r_spline2.clear();
host_fp.clear();
dev_fp.clear();
_fp.clear();
time_pair2.clear();
time_fp1.clear();
@ -303,19 +296,11 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
// ------------------- Resize FP Array for EAM --------------------
if (nall>_max_fp_size) {
dev_fp.clear();
host_fp.clear();
_max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10);
host_fp.alloc(_max_fp_size,*(this->ucl_device));
if (this->ucl_device->device_type()==UCL_CPU)
dev_fp.view(host_fp);
else
dev_fp.alloc(_max_fp_size,*(this->ucl_device));
fp_tex.bind_float(dev_fp,1);
_fp.resize(_max_fp_size);
fp_tex.bind_float(_fp,1);
}
*fp_ptr=host_fp.begin();
*fp_ptr=_fp.host.begin();
// ----------------------------------------------------------------
@ -348,7 +333,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
// copy fp from device to host for comm
_nlocal=nlocal;
time_fp1.start();
ucl_copy(host_fp,dev_fp,nlocal,true);
_fp.update_host(nlocal,true);
time_fp1.stop();
time_fp1.sync_stop();
}
@ -380,19 +365,11 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
// ------------------- Resize FP Array for EAM --------------------
if (nall>_max_fp_size) {
dev_fp.clear();
host_fp.clear();
_max_fp_size=static_cast<int>(static_cast<double>(nall)*1.10);
host_fp.alloc(_max_fp_size,*(this->ucl_device));
if (this->ucl_device->device_type()==UCL_CPU)
dev_fp.view(host_fp);
else
dev_fp.alloc(_max_fp_size,*(this->ucl_device));
fp_tex.bind_float(dev_fp,1);
_fp.resize(_max_fp_size);
fp_tex.bind_float(_fp,1);
}
*fp_ptr=host_fp.begin();
*fp_ptr=_fp.host.begin();
// -----------------------------------------------------------------
@ -428,7 +405,7 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
// copy fp from device to host for comm
_nlocal=inum_full;
time_fp1.start();
ucl_copy(host_fp,dev_fp,inum_full,true);
_fp.update_host(inum_full,true);
time_fp1.stop();
time_fp1.sync_stop();
@ -486,22 +463,18 @@ void EAMT::loop(const bool _eflag, const bool _vflag) {
if (shared_types) {
this->k_energy_fast.set_size(GX,BX);
this->k_energy_fast.run(&this->atom->dev_x.begin(), &type2rhor_z2r.begin(),
&type2frho.begin(), &rhor_spline2.begin(),
&frho_spline1.begin(),&frho_spline2.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &dev_fp.begin(),
&this->ans->dev_engv.begin(), &eflag, &ainum,
this->k_energy_fast.run(&this->atom->x, &type2rhor_z2r, &type2frho,
&rhor_spline2, &frho_spline1,&frho_spline2,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_fp, &this->ans->engv, &eflag, &ainum,
&nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho,
&_nrho, &_nr, &this->_threads_per_atom);
} else {
this->k_energy.set_size(GX,BX);
this->k_energy.run(&this->atom->dev_x.begin(), &type2rhor_z2r.begin(),
&type2frho.begin(), &rhor_spline2.begin(),
&frho_spline1.begin(),&frho_spline2.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &dev_fp.begin(),
&this->ans->dev_engv.begin(),&eflag, &ainum, &nbor_pitch,
this->k_energy.run(&this->atom->x, &type2rhor_z2r, &type2frho,
&rhor_spline2, &frho_spline1, &frho_spline2,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp,
&this->ans->engv,&eflag, &ainum, &nbor_pitch,
&_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_nrho, &_nr,
&this->_threads_per_atom);
}
@ -536,27 +509,19 @@ void EAMT::loop2(const bool _eflag, const bool _vflag) {
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &dev_fp.begin(),
&type2rhor_z2r.begin(),
&rhor_spline1.begin(),
&z2r_spline1.begin(),
&z2r_spline2.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &_cutforcesq, &_rdr, &_nr,
&this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &_fp, &type2rhor_z2r,
&rhor_spline1, &z2r_spline1, &z2r_spline2,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &_cutforcesq, &_rdr,
&_nr, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &dev_fp.begin(),
&type2rhor_z2r.begin(),
&rhor_spline1.begin(),
&z2r_spline1.begin(),
&z2r_spline2.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_nr,
this->k_pair.run(&this->atom->x, &_fp, &type2rhor_z2r, &rhor_spline1,
&z2r_spline1, &z2r_spline2, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch,
&_ntypes, &_cutforcesq, &_rdr, &_nr,
&this->_threads_per_atom);
}
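
The EAM refactor above replaces the separate host_fp / dev_fp arrays with a single _fp container that keeps a host copy and a device copy together (alloc, resize, update_host, and .host / .device views), which removes the CPU-device special casing and the manual ucl_copy calls. The sketch below is a minimal mirrored-buffer class with the same shape, written against plain CUDA rather than the UCL_Vector type the real code uses; it is only an illustration of the pattern.

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // Minimal host/device mirrored buffer, loosely modeled on the _fp usage above.
    template <class T>
    struct Mirrored {
      T *host = nullptr, *device = nullptr;
      int n = 0;
      void alloc(int count) {
        n = count;
        host = (T *)malloc(n * sizeof(T));
        cudaMalloc(&device, n * sizeof(T));
      }
      void clear() {
        free(host); cudaFree(device);
        host = device = nullptr; n = 0;
      }
      void resize(int count) { clear(); alloc(count); }        // grow when nall exceeds capacity
      void update_host(int count, bool async) {                // copy device values back for comm
        (void)async;                                           // sketch: always synchronous
        cudaMemcpy(host, device, count * sizeof(T), cudaMemcpyDeviceToHost);
      }
    };

    int main() {
      Mirrored<float> fp;
      fp.alloc(2000);
      const int nall = 3000;
      if (nall > fp.n) fp.resize((int)(nall * 1.10));          // 10% headroom, as above
      cudaMemset(fp.device, 0, fp.n * sizeof(float));
      fp.update_host(nall, true);
      printf("fp[0]=%g capacity=%d\n", fp.host[0], fp.n);
      fp.clear();
      return 0;
    }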

View File

@ -15,66 +15,37 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> fp_tex;
texture<float4> rhor_sp1_tex;
texture<float4> rhor_sp2_tex;
texture<float4> frho_sp1_tex;
texture<float4> frho_sp2_tex;
texture<float4> z2r_sp1_tex;
texture<float4> z2r_sp2_tex;
#ifdef _DOUBLE_DOUBLE
ucl_inline double4 fetch_rhor_sp1(const int& i, const double4 *rhor_spline1) {
return rhor_spline1[i];
}
ucl_inline double4 fetch_rhor_sp2(const int& i, const double4 *rhor_spline2) {
return rhor_spline2[i];
}
ucl_inline double4 fetch_frho_sp1(const int& i, const double4 *frho_spline1) {
return frho_spline1[i];
}
ucl_inline double4 fetch_frho_sp2(const int& i, const double4 *frho_spline2) {
return frho_spline2[i];
}
ucl_inline double4 fetch_z2r_sp1(const int& i, const double4 *z2r_spline1) {
return z2r_spline1[i];
}
ucl_inline double4 fetch_z2r_sp2(const int& i, const double4 *z2r_spline2) {
return z2r_spline2[i];
}
#else
texture<int4> pos_tex;
texture<int2> fp_tex;
texture<int4> rhor_sp1_tex;
texture<int4> rhor_sp2_tex;
texture<int4> frho_sp1_tex;
texture<int4> frho_sp2_tex;
texture<int4> z2r_sp1_tex;
texture<int4> z2r_sp2_tex;
#endif
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *fp)
{ return tex1Dfetch(fp_tex, i); }
#else
ucl_inline float4 fetch_rhor_sp1(const int& i, const float4 *rhor_spline1)
{ return tex1Dfetch(rhor_sp1_tex, i); }
ucl_inline float4 fetch_rhor_sp2(const int& i, const float4 *rhor_spline2)
{ return tex1Dfetch(rhor_sp2_tex, i); }
ucl_inline float4 fetch_frho_sp1(const int& i, const float4 *frho_spline1)
{ return tex1Dfetch(frho_sp1_tex, i); }
ucl_inline float4 fetch_frho_sp2(const int& i, const float4 *frho_spline2)
{ return tex1Dfetch(frho_sp2_tex, i); }
ucl_inline float4 fetch_z2r_sp1(const int& i, const float4 *z2r_spline1)
{ return tex1Dfetch(z2r_sp1_tex, i); }
ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2)
{ return tex1Dfetch(z2r_sp2_tex, i); }
#endif
#else // OPENCL
#define fetch_q(i,y) fp_[i]
#define fetch_rhor_sp1(i,y) rhor_spline1[i]
#define fetch_rhor_sp2(i,y) rhor_spline2[i]
#define fetch_frho_sp1(i,y) frho_spline1[i]
#define fetch_frho_sp2(i,y) frho_spline2[i]
#define fetch_z2r_sp1(i,y) z2r_spline1[i]
#define fetch_z2r_sp2(i,y) z2r_spline2[i]
#define pos_tex x_
#define fp_tex fp_
#define rhor_sp1_tex rhor_spline1
#define rhor_sp2_tex rhor_spline2
#define frho_sp1_tex frho_spline1
#define frho_sp2_tex frho_spline2
#define z2r_sp1_tex z2r_spline1
#define z2r_sp2_tex z2r_spline2
#endif
@ -99,11 +70,11 @@ ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2)
p -= m; \
p = MIN(p,(numtyp)1.0); \
int index = type2frho[itype]*(nrho+1)+m; \
numtyp4 coeff = fetch_frho_sp1(index, frho_spline1); \
numtyp4 coeff; fetch4(coeff,index,frho_sp1_tex); \
numtyp fp = (coeff.x*p + coeff.y)*p + coeff.z; \
fp_[i]=fp; \
if (eflag>0) { \
coeff = fetch_frho_sp2(index, frho_spline2); \
fetch4(coeff,index,frho_sp2_tex); \
energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \
engv[ii]=(acctyp)2.0*energy; \
} \
@ -154,7 +125,7 @@ ucl_inline float4 fetch_z2r_sp2(const int& i, const float4 *z2r_spline2)
ans[ii]=f; \
}
__kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
__kernel void k_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
__global int *type2frho,
__global numtyp4 *rhor_spline2,
__global numtyp4 *frho_spline1,
@ -178,14 +149,14 @@ __kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -203,7 +174,7 @@ __kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
int mtype = jtype*ntypes+itype;
int index = type2rhor_z2r[mtype].x*(nr+1)+m;
numtyp4 coeff = fetch_rhor_sp2(index, rhor_spline2);
numtyp4 coeff; fetch4(coeff,index,rhor_sp2_tex);
rho += ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
}
} // for nbor
@ -213,7 +184,7 @@ __kernel void kernel_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
} // if ii
}
__kernel void kernel_energy_fast(__global numtyp4 *x_,
__kernel void k_energy_fast(__global numtyp4 *x_,
__global int2 *type2rhor_z2r_in,
__global int *type2frho_in,
__global numtyp4 *rhor_spline2,
@ -252,14 +223,14 @@ __kernel void kernel_energy_fast(__global numtyp4 *x_,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
@ -277,7 +248,7 @@ __kernel void kernel_energy_fast(__global numtyp4 *x_,
int jtype=fast_mul((int)MAX_SHARED_TYPES,jx.w);
int mtype = jtype+itype;
int index = type2rhor_z2r[mtype].x*(nr+1)+m;
numtyp4 coeff = fetch_rhor_sp2(index, rhor_spline2);
numtyp4 coeff; fetch4(coeff,index,rhor_sp2_tex);
rho += ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
}
} // for nbor
@ -287,7 +258,7 @@ __kernel void kernel_energy_fast(__global numtyp4 *x_,
} // if ii
}
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,
__kernel void k_eam(__global numtyp4 *x_, __global numtyp *fp_,
__global int2 *type2rhor_z2r,
__global numtyp4 *rhor_spline1,
__global numtyp4 *z2r_spline1,
@ -317,15 +288,15 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp ifp=fetch_q(i,fp_); //fp_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp ifp; fetch(ifp,i,fp_tex); //fp_[i];
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -347,25 +318,27 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,
mtype = itype*ntypes+jtype;
index = type2rhor_z2r[mtype].x*(nr+1)+m;
coeff = fetch_rhor_sp1(index, rhor_spline1);
fetch4(coeff,index,rhor_sp1_tex);
numtyp rhoip = (coeff.x*p + coeff.y)*p + coeff.z;
mtype = jtype*ntypes+itype;
index = type2rhor_z2r[mtype].x*(nr+1)+m;
coeff = fetch_rhor_sp1(index, rhor_spline1);
fetch4(coeff,index,rhor_sp1_tex);
numtyp rhojp = (coeff.x*p + coeff.y)*p + coeff.z;
mtype = itype*ntypes+jtype;
index = type2rhor_z2r[mtype].y*(nr+1)+m;
coeff = fetch_z2r_sp1(index, z2r_spline1);
fetch4(coeff,index,z2r_sp1_tex);
numtyp z2p = (coeff.x*p + coeff.y)*p + coeff.z;
coeff = fetch_z2r_sp2(index, z2r_spline2);
fetch4(coeff,index,z2r_sp2_tex);
numtyp z2 = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
numtyp recip = ucl_recip(r);
numtyp phi = z2*recip;
numtyp phip = z2p*recip - phi*recip;
numtyp psip = ifp*rhojp + fetch_q(j,fp_)*rhoip + phip;
numtyp psip;
fetch(psip,j,fp_tex);
psip = ifp*rhojp + psip*rhoip + phip;
numtyp force = -psip*recip;
f.x+=delx*force;
@ -391,7 +364,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp *fp_,
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
__kernel void k_eam_fast(__global numtyp4 *x_, __global numtyp *fp_,
__global int2 *type2rhor_z2r_in,
__global numtyp4 *rhor_spline1,
__global numtyp4 *z2r_spline1,
@ -427,8 +400,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp ifp=fetch_q(i,fp_); //fp_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp ifp; fetch(ifp,i,fp_tex); //fp_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -436,7 +409,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jw=jx.w;
int jtype=fast_mul((int)MAX_SHARED_TYPES,jw);
@ -459,25 +432,27 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp *fp_,
mtype = itype+jw;
index = type2rhor_z2r[mtype].x*(nr+1)+m;
coeff = fetch_rhor_sp1(index, rhor_spline1);
fetch4(coeff,index,rhor_sp1_tex);
numtyp rhoip = (coeff.x*p + coeff.y)*p + coeff.z;
mtype = jtype+iw;
index = type2rhor_z2r[mtype].x*(nr+1)+m;
coeff = fetch_rhor_sp1(index, rhor_spline1);
fetch4(coeff,index,rhor_sp1_tex);
numtyp rhojp = (coeff.x*p + coeff.y)*p + coeff.z;
mtype = itype+jw;
index = type2rhor_z2r[mtype].y*(nr+1)+m;
coeff = fetch_z2r_sp1(index, z2r_spline1);
fetch4(coeff,index,z2r_sp1_tex);
numtyp z2p = (coeff.x*p + coeff.y)*p + coeff.z;
coeff = fetch_z2r_sp2(index, z2r_spline2);
fetch4(coeff,index,z2r_sp2_tex);
numtyp z2 = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;
numtyp recip = ucl_recip(r);
numtyp phi = z2*recip;
numtyp phip = z2p*recip - phi*recip;
numtyp psip = ifp*rhojp + fetch_q(j,fp_)*rhoip + phip;
numtyp psip;
fetch(psip,j,fp_tex);
psip = ifp*rhojp + psip*rhoip + phip;
numtyp force = -psip*recip;
f.x+=delx*force;
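
One detail of the EAM kernels above: every spline segment is stored as a packed 4-vector of coefficients, fetched through a texture and evaluated in Horner form, with the value tables (the *_spline2 arrays) read as cubics and the pre-differentiated tables (the *_spline1 arrays) read as quadratics. The host-side sketch below reproduces just that index math and Horner evaluation; the coefficient values and spacing are made up.

    #include <cstdio>

    // One packed spline segment, as stored in the numtyp4 tables above.
    struct coeff4 { double x, y, z, w; };

    double eval_cubic(const coeff4 &c, double p) {       // value table (..._spline2)
      return ((c.x * p + c.y) * p + c.z) * p + c.w;      // Horner form, as in the kernels
    }

    double eval_quadratic(const coeff4 &c, double p) {   // derivative table (..._spline1)
      return (c.x * p + c.y) * p + c.z;
    }

    int main() {
      // Mirrors the kernel's lookup: p = r*rdr; m = int(p); p -= m.
      const double rdr = 10.0;                  // inverse table spacing (illustrative)
      coeff4 value_seg = {0.5, -0.2, 1.0, 0.3}; // made-up segment coefficients
      coeff4 deriv_seg = {1.5, -0.4, 1.0, 0.0};
      double r = 0.237;
      double p = r * rdr;
      int m = (int)p;                           // segment index (clamped in the real code)
      p -= m;
      printf("value ~ %.4f  derivative ~ %.4f\n",
             eval_cubic(value_seg, p), eval_quadratic(deriv_seg, p));
      return 0;
    }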

View File

@ -52,8 +52,8 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
if (nghost>0) {
UCL_H_Vec<numtyp> host_view;
UCL_D_Vec<numtyp> dev_view;
host_view.view_offset(_nlocal,host_fp);
dev_view.view_offset(_nlocal,dev_fp);
host_view.view_offset(_nlocal,_fp.host);
dev_view.view_offset(_nlocal,_fp.device);
ucl_copy(dev_view,host_view,nghost,true);
}
}
@ -128,8 +128,7 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
bool _compiled_energy;
/// Per-atom arrays
UCL_H_Vec<numtyp> host_fp;
UCL_D_Vec<numtyp> dev_fp;
UCL_Vector<numtyp,numtyp> _fp;
protected:
bool _allocated;

View File

@ -20,6 +20,14 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex, quat_tex;
#else
texture<int4,1> pos_tex, quat_tex;
#endif
#else
#define pos_tex x_
#define quat_tex qif
#endif
#define atom_info(t_per_atom, ii, tid, offset) \
@ -411,7 +419,7 @@ ucl_inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans,
ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi,
numtyp mat[9])
{
numtyp4 q=qif[qi];
numtyp4 q; fetch4(q,qi,quat_tex);
numtyp w2 = q.x*q.x;
numtyp i2 = q.y*q.y;
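
The header change above adds a quat_tex texture so gpu_quat_to_mat_trans fetches the quaternion through fetch4 before building the rotation matrix. For reference, the sketch below is the standard unit-quaternion (w, i, j, k) to rotation-matrix formula together with an explicit transpose, since the kernel's name indicates it stores the transposed matrix; it does not claim to reproduce the kernel's exact storage layout.

    #include <cstdio>
    #include <cmath>

    // Standard rotation matrix from a unit quaternion q = (w, i, j, k),
    // stored row-major in m[9].
    void quat_to_mat(const double q[4], double m[9]) {
      double w = q[0], i = q[1], j = q[2], k = q[3];
      double w2 = w * w, i2 = i * i, j2 = j * j, k2 = k * k;
      double twoij = 2.0 * i * j, twoik = 2.0 * i * k, twojk = 2.0 * j * k;
      double twoiw = 2.0 * i * w, twojw = 2.0 * j * w, twokw = 2.0 * k * w;
      m[0] = w2 + i2 - j2 - k2;  m[1] = twoij - twokw;      m[2] = twoik + twojw;
      m[3] = twoij + twokw;      m[4] = w2 - i2 + j2 - k2;  m[5] = twojk - twoiw;
      m[6] = twoik - twojw;      m[7] = twojk + twoiw;      m[8] = w2 - i2 - j2 + k2;
    }

    void transpose3(const double m[9], double t[9]) {
      for (int r = 0; r < 3; r++)
        for (int c = 0; c < 3; c++)
          t[3 * c + r] = m[3 * r + c];
    }

    int main() {
      double s = sqrt(0.5);
      double q[4] = {s, 0.0, 0.0, s};           // 90-degree rotation about z
      double m[9], mt[9];
      quat_to_mat(q, m);
      transpose3(m, mt);
      printf("R  row 0: %6.3f %6.3f %6.3f\n", m[0], m[1], m[2]);
      printf("Rt row 0: %6.3f %6.3f %6.3f\n", mt[0], mt[1], mt[2]);
      return 0;
    }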

View File

@ -15,6 +15,13 @@
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
// ---------------------------------------------------------------------------
@ -40,14 +47,14 @@ __kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
__global int *list_end=nbor+fast_mul(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul(iw,ntypes);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
numtyp2 cf=cut_form[mtype];
@ -102,7 +109,7 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
__global int *list_end=nbor+fast_mul(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -110,7 +117,7 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
int mtype=itype+jtype;

View File

@ -13,12 +13,15 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "gayberne_cl.h"
#include "gayberne_lj_cl.h"
#elif defined(USE_CUDART)
const char *gayberne=0;
const char *gayberne_lj=0;
#else
#include "gayberne_ptx.h"
#include "gayberne_lj_ptx.h"
#include "gayberne_cubin.h"
#include "gayberne_lj_cubin.h"
#endif
#include "lal_gayberne.h"
@ -57,7 +60,8 @@ int GayBerneT::init(const int ntypes, const double gamma,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ntypes,h_form,gayberne,gayberne_lj);
_screen,ntypes,h_form,gayberne,gayberne_lj,
"k_gayberne");
if (success!=0)
return success;
@ -210,13 +214,13 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
&stride, &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
&this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
&this->_threads_per_atom);
this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
&this->shape, &this->well, &this->gamma_upsilon_mu,
&this->sigma_epsilon, &this->_lj_types,
&this->lshape, &this->nbor->dev_nbor, &stride,
&this->ans->force, &ainum, &this->ans->engv,
&this->dev_error, &eflag, &vflag,
&this->_last_ellipse, &this->_threads_per_atom);
this->time_ellipsoid.stop();
if (this->_last_ellipse==this->ans->inum()) {
@ -243,17 +247,19 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
this->time_ellipsoid2.start();
this->k_sphere_ellipsoid.set_size(GX,BX);
this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(),
&this->well.begin(), &this->gamma_upsilon_mu.begin(),
&this->sigma_epsilon.begin(), &this->_lj_types, &this->lshape.begin(),
&this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat,
&this->shape, &this->well,
&this->gamma_upsilon_mu,
&this->sigma_epsilon, &this->_lj_types,
&this->lshape, &this->nbor->dev_nbor,
&stride, &this->ans->force,
&this->ans->engv, &this->dev_error,
&eflag, &vflag, &this->_last_ellipse,
&ainum, &this->_threads_per_atom);
this->time_ellipsoid2.stop();
} else {
this->ans->dev_ans.zero();
this->ans->dev_engv.zero();
this->ans->force.zero();
this->ans->engv.zero();
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->time_ellipsoid.stop();
@ -268,19 +274,20 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
if (this->_last_ellipse<this->ans->inum()) {
if (this->_shared_types) {
this->k_lj_fast.set_size(GX,BX);
this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
&this->lj3.begin(), &this->gamma_upsilon_mu.begin(), &stride,
&this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(),
&eflag, &vflag, &this->_last_ellipse, &ainum,
this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3,
&this->gamma_upsilon_mu, &stride,
&this->nbor->dev_packed, &this->ans->force,
&this->ans->engv, &this->dev_error, &eflag,
&vflag, &this->_last_ellipse, &ainum,
&this->_threads_per_atom);
} else {
this->k_lj.set_size(GX,BX);
this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
&this->lj3.begin(), &this->_lj_types, &this->gamma_upsilon_mu.begin(),
&stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3,
&this->_lj_types, &this->gamma_upsilon_mu, &stride,
&this->nbor->dev_packed, &this->ans->force,
&this->ans->engv, &this->dev_error, &eflag,
&vflag, &this->_last_ellipse, &ainum,
&this->_threads_per_atom);
}
}
this->time_lj.stop();
@ -294,12 +301,11 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
&stride, &this->ans->dev_ans.begin(), &ainum,
&this->ans->dev_engv.begin(), &this->dev_error.begin(),
this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
&this->shape, &this->well, &this->gamma_upsilon_mu,
&this->sigma_epsilon, &this->_lj_types, &this->lshape,
&this->nbor->dev_nbor, &stride, &this->ans->force,
&ainum, &this->ans->engv, &this->dev_error,
&eflag, &vflag, &ainum, &this->_threads_per_atom);
this->time_ellipsoid.stop();
}

View File

@ -80,7 +80,7 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
__kernel void k_gayberne(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
@ -117,7 +117,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
numtyp4 ishape=shape[itype];
@ -136,7 +136,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int jtype=jx.w;
// Compute r12

View File

@ -17,15 +17,15 @@
#include "lal_ellipsoid_extra.h"
#endif
__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
__global numtyp4* shape,__global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag,const int start, const int inum,
const int t_per_atom) {
__kernel void k_gayberne_sphere_ellipsoid(__global numtyp4 *x_,
__global numtyp4 *q, __global numtyp4* shape,
__global numtyp4* well, __global numtyp *gum,
__global numtyp2* sig_eps, const int ntypes,
__global numtyp *lshape, __global int *dev_nbor,
const int stride, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag,const int start,
const int inum, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
@ -51,7 +51,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int itype=ix.w;
numtyp oner=shape[itype].x;
@ -64,7 +64,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int jtype=jx.w;
// Compute r12
@ -236,14 +236,13 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
} // if ii
}
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void k_gayberne_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
__global numtyp *gum, const int stride,
__global int *dev_ij, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int start,
const int inum, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
@ -269,7 +268,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int itype=ix.w;
numtyp factor_lj;
@ -279,7 +278,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int jtype=jx.w;
// Compute r12
@ -319,13 +318,13 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_gayberne_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
const int vflag, const int start,
const int inum, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
@ -358,7 +357,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -369,7 +368,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int mtype=itype+jx.w;
// Compute r12
@ -406,3 +405,4 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
ans,engv);
} // if ii
}

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "lj_cl.h"
#elif defined(USE_CUDART)
const char *lj=0;
#else
#include "lj_ptx.h"
#include "lj_cubin.h"
#endif
#include "lal_lj.h"
@ -51,7 +53,7 @@ int LJT::init(const int ntypes,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj);
_screen,lj,"k_lj");
if (success!=0)
return success;
@ -133,20 +135,17 @@ void LJT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
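
Each pair style's init call above now passes a base kernel name ("k_lj" here, "k_eam", "k_gayberne", and so on elsewhere in this commit) along with the program source, matching the renaming of the kernels from the generic kernel_pair / kernel_pair_fast to per-style names. The sketch below shows the kind of name-based lookup this enables; that the base class derives the _fast variant by appending a suffix is an assumption, and the Program type is a toy stand-in for a compiled GPU program object.

    #include <cstdio>
    #include <map>
    #include <string>

    using Kernel = void (*)();
    static void demo_k_lj()      { printf("k_lj\n"); }
    static void demo_k_lj_fast() { printf("k_lj_fast\n"); }

    // Toy "compiled program": a table of kernel entry points addressable by name.
    struct Program {
      std::map<std::string, Kernel> entry;
      Kernel get(const std::string &name) const {
        auto it = entry.find(name);
        return it == entry.end() ? nullptr : it->second;
      }
    };

    // Bind both variants of a pair kernel from a single base name.
    bool bind_pair_kernels(const Program &p, const std::string &base,
                           Kernel &k_pair, Kernel &k_pair_fast) {
      k_pair = p.get(base);
      k_pair_fast = p.get(base + "_fast");
      return k_pair != nullptr && k_pair_fast != nullptr;
    }

    int main() {
      Program prog;
      prog.entry["k_lj"] = demo_k_lj;
      prog.entry["k_lj_fast"] = demo_k_lj_fast;
      Kernel pair = nullptr, pair_fast = nullptr;
      if (bind_pair_kernels(prog, "k_lj", pair, pair_fast)) {
        pair();
        pair_fast();
      }
      return 0;
    }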

View File

@ -15,14 +15,16 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void k_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -101,7 +103,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -137,7 +139,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -148,7 +150,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
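
A small point about the "fast" kernels above: they are used when the number of atom types is small enough that the per-pair coefficients fit in fast on-chip storage, and the (itype, jtype) lookup is then flattened to a single index via itype = MAX_SHARED_TYPES*iw and mtype = itype + jx.w. That is ordinary row-major indexing; the host-side sketch below just checks the arithmetic, and the MAX_SHARED_TYPES value here is illustrative only.

    #include <cstdio>

    const int MAX_SHARED_TYPES = 8;   // illustrative; the real value comes from the package headers

    int flat_index(int iw, int jw) { return MAX_SHARED_TYPES * iw + jw; }   // row-major (i,j) -> 1D

    int main() {
      double lj_table[MAX_SHARED_TYPES * MAX_SHARED_TYPES];
      for (int i = 0; i < MAX_SHARED_TYPES; i++)
        for (int j = 0; j < MAX_SHARED_TYPES; j++)
          lj_table[flat_index(i, j)] = 100.0 * i + j;    // tag each (i,j) pair
      printf("entry for types (3,5): %g\n", lj_table[flat_index(3, 5)]);
      return 0;
    }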

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "lj96_cl.h"
#elif defined(USE_CUDART)
const char *lj96=0;
#else
#include "lj96_ptx.h"
#include "lj96_cubin.h"
#endif
#include "lal_lj96.h"
@ -51,7 +53,7 @@ int LJ96T::init(const int ntypes,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj96);
_screen,lj96,"k_lj96");
if (success!=0)
return success;
@ -133,19 +135,17 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
this->k_pair.run(&this->atom->x, &lj1, &lj3,
&_lj_types, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();

View File

@ -15,14 +15,16 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void k_lj96(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -102,7 +104,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_lj96_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -138,7 +140,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -149,7 +151,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "lj_class2_long_cl.h"
#elif defined(USE_CUDART)
const char *lj_class2_long=0;
#else
#include "lj_class2_long_ptx.h"
#include "lj_class2_long_cubin.h"
#endif
#include "lal_lj_class2_long.h"
@ -55,7 +57,7 @@ int LJClass2LongT::init(const int ntypes, double **host_cutsq,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_class2_long);
_screen,lj_class2_long,"k_lj_class2_long");
if (success!=0)
return success;
@ -143,22 +145,19 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
this->k_pair.run(&this->atom->x, &lj1, &lj3,
&_lj_types, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->q, &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();

View File

@ -14,18 +14,22 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_lj_class2_long(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -101,7 +105,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
@ -136,7 +141,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_lj_class2_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -175,8 +180,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -188,7 +193,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
@ -215,7 +220,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "lj_coul_cl.h"
#elif defined(USE_CUDART)
const char *lj_coul=0;
#else
#include "lj_coul_ptx.h"
#include "lj_coul_cubin.h"
#endif
#include "lal_lj_coul.h"
@ -54,7 +56,7 @@ int LJCoulT::init(const int ntypes,
double *host_special_coul, const double qqrd2e) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_coul);
_screen,lj_coul,"k_lj_coul");
if (success!=0)
return success;
@ -145,23 +147,18 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch,
&this->atom->dev_q.begin(), &cutsq.begin(),
&_qqrd2e, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_qqrd2e, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
&cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_qqrd2e, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -14,18 +14,22 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_lj_coul(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -93,9 +97,10 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} else
force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w)
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
else
if (rsq < lj1[mtype].w) {
fetch(forcecoul,j,q_tex);
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
@ -127,7 +132,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_lj_coul_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -168,8 +173,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -181,7 +186,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
@ -200,9 +205,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
} else
force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w)
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
else
if (rsq < lj1[mtype].w) {
fetch(forcecoul,j,q_tex);
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "lj_coul_long_cl.h"
#elif defined(USE_CUDART)
const char *lj_coul_long=0;
#else
#include "lj_coul_long_ptx.h"
#include "lj_coul_long_cubin.h"
#endif
#include "lal_lj_coul_long.h"
@ -55,7 +57,7 @@ int LJCoulLongT::init(const int ntypes,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_coul_long);
_screen,lj_coul_long,"k_lj_coul_long");
if (success!=0)
return success;
@ -143,22 +145,19 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
this->k_pair.run(&this->atom->x, &lj1, &lj3,
&_lj_types, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->q, &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();

View File

@ -14,18 +14,22 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_lj_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -61,8 +65,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
@ -73,7 +77,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -99,7 +103,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
@ -134,7 +139,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_lj_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -173,8 +178,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -186,7 +191,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
@ -211,7 +216,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
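A worked form of the real-space term evaluated in the two kernels above, assuming EWALD_P and A1-A5 are the coefficients of the standard Abramowitz-Stegun 7.1.26 erfc fit and EWALD_F = 2/sqrt(pi): with $x = g_{\mathrm{ewald}}\,r$ and $t = 1/(1 + \mathrm{EWALD\_P}\,x)$,

\[
\operatorname{erfc}(x) \approx t\bigl(A_1 + t(A_2 + t(A_3 + t(A_4 + t A_5)))\bigr)e^{-x^2},
\]
\[
\mathrm{forcecoul} = \frac{\mathrm{qqrd2e}\;q_i q_j}{r}
\Bigl[\operatorname{erfc}(x) + \mathrm{EWALD\_F}\,x\,e^{-x^2} - \mathrm{factor\_coul}\Bigr],
\]

where factor_coul = 1 - special_coul removes the excluded-bond fraction of the pair; the kernel then multiplies (force_lj + forcecoul) by $r^{-2}$ and by the displacement components to accumulate the force.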

View File

@ -13,10 +13,12 @@
email : ibains@nvidia.com
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "lj_expand_cl.h"
#elif defined(USE_CUDART)
const char *lj_expand=0;
#else
#include "lj_expand_ptx.h"
#include "lj_expand_cubin.h"
#endif
#include "lal_lj_expand.h"
@ -51,7 +53,7 @@ int LJExpandT::init(const int ntypes, double **host_cutsq,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_expand);
_screen,lj_expand,"k_lj_expand");
if (success!=0)
return success;
@ -133,20 +135,17 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -14,15 +14,19 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
#endif
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
#else
#define pos_tex x_
#endif
__kernel void k_lj_expand(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -51,7 +55,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -61,7 +65,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -104,7 +108,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__kernel void k_lj_expand_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -140,7 +144,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -151,7 +155,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12

View File

@ -13,10 +13,12 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "morse_cl.h"
#elif defined(USE_CUDART)
const char *morse=0;
#else
#include "morse_ptx.h"
#include "morse_cubin.h"
#endif
#include "lal_morse.h"
@ -51,7 +53,7 @@ int MorseT::init(const int ntypes,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,morse);
_screen,morse,"k_morse");
if (success!=0)
return success;
@ -132,20 +134,17 @@ void MorseT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &mor1.begin(),
&mor2.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &mor1, &mor2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &mor1.begin(), &mor2.begin(),
&_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &mor1, &mor2, &_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -14,15 +14,19 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
#endif
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
#else
#define pos_tex x_
#endif
__kernel void k_morse(__global numtyp4 *x_, __global numtyp4 *mor1,
__global numtyp2* mor2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -51,7 +55,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -61,7 +65,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -102,7 +106,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
__kernel void k_morse_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
__global numtyp2* mor2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
@ -138,7 +142,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -149,7 +153,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12

View File

@ -84,7 +84,7 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
_max_atoms=1000;
_max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
_max_nbors=max_nbors;
_max_nbors=(max_nbors/threads_per_atom+1)*threads_per_atom;
_maxspecial=maxspecial;
if (gpu_nbor==0)
@ -124,17 +124,14 @@ void Neighbor::alloc(bool &success) {
_c_bytes+=dev_packed.row_bytes();
}
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
dev_host_numj.clear();
nbor_host.clear();
dev_numj_host.clear();
host_ilist.clear();
host_jlist.clear();
success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc(_max_nbors*_max_host,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (dev_host_numj.alloc(_max_host,*dev,
success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_RW_OPTIMIZED,
UCL_WRITE_ONLY)==UCL_SUCCESS) && success;
success=success && (dev_numj_host.alloc(_max_host,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
@ -145,16 +142,16 @@ void Neighbor::alloc(bool &success) {
UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
int *ptr=host_nbor.begin();
int *ptr=nbor_host.host.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=_max_nbors;
}
_c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes();
_c_bytes+=nbor_host.device.row_bytes()+dev_numj_host.row_bytes();
} else {
// Some OpenCL implementations return errors for NULL pointers as args
dev_host_nbor.view(dev_nbor);
dev_host_numj.view(dev_nbor);
nbor_host.device.view(dev_nbor);
dev_numj_host.view(dev_nbor);
}
if (_maxspecial>0) {
dev_nspecial.clear();
@ -194,10 +191,9 @@ void Neighbor::clear() {
host_packed.clear();
host_acc.clear();
dev_nbor.clear();
dev_host_nbor.clear();
nbor_host.clear();
dev_packed.clear();
host_nbor.clear();
dev_host_numj.clear();
dev_numj_host.clear();
host_ilist.clear();
host_jlist.clear();
dev_nspecial.clear();
@ -215,8 +211,8 @@ void Neighbor::clear() {
double Neighbor::host_memory_usage() const {
if (_gpu_nbor>0) {
if (_gpu_host)
return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+
host_jlist.row_bytes();
return nbor_host.device.row_bytes()*nbor_host.rows()+
host_ilist.row_bytes()+host_jlist.row_bytes();
else
return 0;
} else
@ -285,8 +281,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
block_size));
_shared->k_nbor.set_size(GX,block_size);
_shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum,
&_threads_per_atom);
_shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom);
time_kernel.stop();
}
}
@ -295,31 +290,23 @@ template <class numtyp, class acctyp>
void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
if (maxn>_max_nbors) {
int mn=static_cast<int>(static_cast<double>(maxn)*1.10);
dev_nbor.clear();
success=success &&
(dev_nbor.alloc((mn+1)*_max_atoms,*dev)==UCL_SUCCESS);
mn=(mn/_threads_per_atom+1)*_threads_per_atom;
success=success && (dev_nbor.resize((mn+1)*_max_atoms)==UCL_SUCCESS);
_gpu_bytes=dev_nbor.row_bytes();
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
success=success && (host_nbor.alloc(mn*_max_host,*dev,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc(mn*_max_host,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
int *ptr=host_nbor.begin();
success=success && (nbor_host.resize(mn*_max_host)==UCL_SUCCESS);
int *ptr=nbor_host.host.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=mn;
}
_gpu_bytes+=dev_host_nbor.row_bytes();
_gpu_bytes+=nbor_host.row_bytes();
} else {
dev_host_nbor.view(dev_nbor);
dev_host_numj.view(dev_nbor);
nbor_host.device.view(dev_nbor);
dev_numj_host.view(dev_nbor);
}
if (_alloc_packed) {
dev_packed.clear();
success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (dev_packed.resize((mn+2)*_max_atoms)==UCL_SUCCESS);
_gpu_bytes+=dev_packed.row_bytes();
}
_max_nbors=mn;
@ -337,16 +324,12 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
// Calculate number of cells and allocate storage for binning as necessary
int ncellx, ncelly, ncellz, ncell_3d;
ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) +
2.0*_cell_size)/_cell_size));
ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) +
2.0*_cell_size)/_cell_size));
ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) +
2.0*_cell_size)/_cell_size));
int ghost_cells=2*_cells_in_cutoff;
ncellx = static_cast<int>(ceil((subhi[0]-sublo[0])/_cell_size))+ghost_cells;
ncelly = static_cast<int>(ceil((subhi[1]-sublo[1])/_cell_size))+ghost_cells;
ncellz = static_cast<int>(ceil((subhi[2]-sublo[2])/_cell_size))+ghost_cells;
ncell_3d = ncellx * ncelly * ncellz;
if (ncell_3d+1>_ncells) {
dev_cell_counts.clear();
dev_cell_counts.alloc(ncell_3d+1,dev_nbor);
if (_gpu_nbor==2) {
if (_ncells>0) {
host_cell_counts.clear();
@ -355,11 +338,19 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
cell_iter = new int[ncell_3d+1];
host_cell_counts.alloc(ncell_3d+1,dev_nbor);
}
if (_gpu_nbor==2 && atom.host_view())
dev_cell_counts.view(host_cell_counts);
else {
dev_cell_counts.clear();
dev_cell_counts.alloc(ncell_3d+1,dev_nbor);
}
_ncells=ncell_3d+1;
_cell_bytes=dev_cell_counts.row_bytes();
}
const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
const numtyp cutoff_cast=static_cast<numtyp>(_cutoff);
if (_maxspecial>0) {
time_nbor.start();
@ -379,8 +370,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
&_maxspecial,&nt);
_shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt);
time_transpose.stop();
}
@ -392,28 +382,48 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
// Build cell list on CPU
host_cell_counts.zero();
double m_cell_size=-_cell_size;
double dx=subhi[0]-sublo[0]+_cell_size;
double dy=subhi[1]-sublo[1]+_cell_size;
double dz=subhi[2]-sublo[2]+_cell_size;
double i_cell_size=1.0/_cell_size;
for (int i=0; i<nall; i++) {
int offset_hi=_cells_in_cutoff+1;
for (int i=0; i<nt; i++) {
double px, py, pz;
px=x[i][0]-sublo[0];
py=x[i][1]-sublo[1];
pz=x[i][2]-sublo[2];
if (px<m_cell_size) px=m_cell_size;
if (py<m_cell_size) py=m_cell_size;
if (pz<m_cell_size) pz=m_cell_size;
if (px>dx) px=dx;
if (py>dy) py=dy;
if (pz>dz) pz=dz;
int id=static_cast<int>(px/_cell_size + 1.0) +
static_cast<int>(py/_cell_size + 1.0) * ncellx +
static_cast<int>(pz/_cell_size + 1.0) * ncellx * ncelly;
int ix = static_cast<int>(px*i_cell_size+1);
ix = std::max(ix,_cells_in_cutoff);
ix = std::min(ix,ncellx-offset_hi);
int iy = static_cast<int>(py*i_cell_size+1);
iy = std::max(iy,_cells_in_cutoff);
iy = std::min(iy,ncelly-offset_hi);
int iz = static_cast<int>(pz*i_cell_size+1);
iz = std::max(iz,_cells_in_cutoff);
iz = std::min(iz,ncellz-offset_hi);
cell_id[i]=id;
int id = ix+iy*ncellx+iz*ncellx*ncelly;
cell_id[i] = id;
host_cell_counts[id+1]++;
}
for (int i=nt; i<nall; i++) {
double px, py, pz;
px=x[i][0]-sublo[0];
py=x[i][1]-sublo[1];
pz=x[i][2]-sublo[2];
int ix = static_cast<int>(px*i_cell_size+1);
ix = std::max(ix,0);
ix = std::min(ix,ncellx-1);
int iy = static_cast<int>(py*i_cell_size+1);
iy = std::max(iy,0);
iy = std::min(iy,ncelly-1);
int iz = static_cast<int>(pz*i_cell_size+1);
iz = std::max(iz,0);
iz = std::min(iz,ncellz-1);
int id = ix+iy*ncellx+iz*ncellx*ncelly;
cell_id[i] = id;
host_cell_counts[id+1]++;
}
@ -451,41 +461,39 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
time_kernel.start();
_nbor_pitch=inum;
_shared->neigh_tex.bind_float(atom.dev_x,4);
_shared->neigh_tex.bind_float(atom.x,4);
// If binning on GPU, do this now
if (_gpu_nbor==1) {
const numtyp i_cell_size=static_cast<numtyp>(1.0/_cell_size);
const int neigh_block=_block_cell_id;
const int GX=(int)ceil((float)nall/neigh_block);
const numtyp sublo0=static_cast<numtyp>(sublo[0]);
const numtyp sublo1=static_cast<numtyp>(sublo[1]);
const numtyp sublo2=static_cast<numtyp>(sublo[2]);
const numtyp subhi0=static_cast<numtyp>(subhi[0]);
const numtyp subhi1=static_cast<numtyp>(subhi[1]);
const numtyp subhi2=static_cast<numtyp>(subhi[2]);
_shared->k_cell_id.set_size(GX,neigh_block);
_shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
&atom.dev_particle_id.begin(),
&sublo0, &sublo1, &sublo2, &subhi0, &subhi1,
&subhi2, &cell_size_cast, &ncellx, &ncelly, &nall);
_shared->k_cell_id.run(&atom.x, &atom.dev_cell_id,
&atom.dev_particle_id, &sublo0, &sublo1,
&sublo2, &i_cell_size, &ncellx, &ncelly, &ncellz,
&nt, &nall, &_cells_in_cutoff);
atom.sort_neighbor(nall);
/* calculate cell count */
_shared->k_cell_counts.set_size(GX,neigh_block);
_shared->k_cell_counts.run(&atom.dev_cell_id.begin(),
&dev_cell_counts.begin(), &nall, &ncell_3d);
_shared->k_cell_counts.run(&atom.dev_cell_id, &dev_cell_counts, &nall,
&ncell_3d);
}
/* build the neighbor list */
const int cell_block=_block_nbor_build;
_shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
_shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
&dev_cell_counts.begin(), &dev_nbor.begin(),
&dev_host_nbor.begin(), &dev_host_numj.begin(),
&_max_nbors,&cell_size_cast,
&ncellx, &ncelly, &ncellz, &inum, &nt, &nall,
&_threads_per_atom);
_shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)*
(ncellz-ghost_cells),cell_block,1);
_shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id,
&dev_cell_counts, &dev_nbor, &nbor_host,
&dev_numj_host, &_max_nbors, &cutoff_cast, &ncellx,
&ncelly, &ncellz, &inum, &nt, &nall,
&_threads_per_atom, &_cells_in_cutoff);
/* Get the maximum number of nbors and realloc if necessary */
UCL_D_Vec<int> numj;
@ -494,7 +502,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
if (nt>inum) {
UCL_H_Vec<int> host_offset;
host_offset.view_offset(inum,host_acc,nt-inum);
ucl_copy(host_offset,dev_host_numj,nt-inum,true);
ucl_copy(host_offset,dev_numj_host,nt-inum,true);
}
if (_gpu_nbor!=2) {
@ -521,17 +529,16 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
const int GX2=static_cast<int>(ceil(static_cast<double>
(nt*_threads_per_atom)/cell_block));
_shared->k_special.set_size(GX2,cell_block);
_shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
&dev_host_numj.begin(), &atom.dev_tag.begin(),
&dev_nspecial.begin(), &dev_special.begin(),
_shared->k_special.run(&dev_nbor, &nbor_host, &dev_numj_host,
&atom.dev_tag, &dev_nspecial, &dev_special,
&inum, &nt, &_max_nbors, &_threads_per_atom);
}
time_kernel.stop();
time_nbor.start();
if (inum<nt) {
ucl_copy(host_nbor,dev_host_nbor,true);
host_nbor.sync();
nbor_host.update_host(true);
nbor_host.sync();
}
time_nbor.stop();
}
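A minimal standalone sketch of the binning arithmetic in the two host loops above (the box, cell counts, and coordinates in main are illustrative values only): a sublo-relative coordinate is scaled by 1/cell_size, shifted by the same +1 offset, clamped so that owned atoms stay in interior cells while ghosts may land in the ghost layers, and the (ix,iy,iz) triple is flattened into a single cell id.

#include <algorithm>
#include <cstdio>

int flat_cell_id(double px, double py, double pz,      // position minus sublo
                 double i_cell_size, int cells_in_cutoff,
                 int ncellx, int ncelly, int ncellz, bool owned) {
  const int lo  = owned ? cells_in_cutoff : 0;
  const int hix = owned ? ncellx - cells_in_cutoff - 1 : ncellx - 1;
  const int hiy = owned ? ncelly - cells_in_cutoff - 1 : ncelly - 1;
  const int hiz = owned ? ncellz - cells_in_cutoff - 1 : ncellz - 1;
  int ix = std::min(std::max(static_cast<int>(px * i_cell_size + 1), lo), hix);
  int iy = std::min(std::max(static_cast<int>(py * i_cell_size + 1), lo), hiy);
  int iz = std::min(std::max(static_cast<int>(pz * i_cell_size + 1), lo), hiz);
  return ix + iy * ncellx + iz * ncellx * ncelly;       // flattened cell id
}

int main() {
  // hypothetical 10x10x10 subdomain, cell_size 2.5, one ghost layer:
  // ncell = ceil(10/2.5) + 2*1 = 6 per dimension
  printf("%d\n", flat_cell_id(3.1, 0.2, 9.9, 1.0/2.5, 1, 6, 6, 6, true));  // 152
  return 0;
}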

View File

@ -22,20 +22,6 @@
#define IJ_SIZE 131072
#ifdef USE_OPENCL
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
using namespace ucl_opencl;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
using namespace ucl_cudadr;
#endif
namespace LAMMPS_AL {
class Neighbor {
@ -70,7 +56,14 @@ class Neighbor {
const int warp_size, const bool time_device);
/// Set the size of the cutoff+skin
inline void cell_size(const double size) { _cell_size=size; }
inline void cell_size(const double size, const double cutoff) {
_cell_size=size;
_cutoff=cutoff;
if (cutoff>size)
_cells_in_cutoff=static_cast<int>(ceil(cutoff/size));
else
_cells_in_cutoff=1;
}
/// Get the size of the cutoff+skin
inline double cell_size() const { return _cell_size; }
@ -203,14 +196,11 @@ class Neighbor {
// ----------------- Data for GPU Neighbor Calculation ---------------
/// Host storage for device calculated neighbor lists
/** Same storage format as device matrix **/
UCL_H_Vec<int> host_nbor;
/// Device storage for neighbor list matrix that will be copied to host
/// Host/Device storage for device calculated neighbor lists
/** - 1st row is numj
* - Remaining rows are by atom, columns are nbors **/
UCL_D_Vec<int> dev_host_nbor;
UCL_D_Vec<int> dev_host_numj;
UCL_Vector<int,int> nbor_host;
UCL_D_Vec<int> dev_numj_host;
UCL_H_Vec<int> host_ilist;
UCL_H_Vec<int*> host_jlist;
/// Device storage for special neighbor counts
@ -232,13 +222,14 @@ class Neighbor {
bool _allocated, _use_packing, _nbor_time_avail, _time_device;
int _gpu_nbor, _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
bool _gpu_host, _alloc_packed;
double _cell_size, _bin_time;
double _cutoff, _cell_size, _bin_time;
double _gpu_bytes, _c_bytes, _cell_bytes;
void alloc(bool &success);
int _block_cell_2d, _block_cell_id, _max_block_nbor_build, _block_nbor_build;
int _ncells, _threads_per_atom, _total_atoms;
int _cells_in_cutoff;
template <class numtyp, class acctyp>
inline void resize_max_neighbors(const int maxn, bool &success);
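A small sketch of the cell_size()/cutoff bookkeeping introduced above, using only what the inline setter shows (the numbers in main are illustrative): when the interaction cutoff exceeds the bin size, ceil(cutoff/cell_size) layers of cells must be searched in each direction instead of one.

#include <cmath>
#include <cstdio>

int cells_in_cutoff(double cell_size, double cutoff) {
  // mirrors the inline cell_size(size, cutoff) setter above
  if (cutoff > cell_size)
    return static_cast<int>(std::ceil(cutoff / cell_size));
  return 1;
}

int main() {
  printf("%d\n", cells_in_cutoff(4.0, 3.5));   // 1: cutoff fits in a single layer
  printf("%d\n", cells_in_cutoff(4.0, 10.0));  // 3: three layers per direction
  return 0;
}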

View File

@ -16,38 +16,48 @@
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
texture<float4> neigh_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(neigh_tex, i); }
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
__kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id,
numtyp boxlo0,
numtyp boxlo1, numtyp boxlo2, numtyp boxhi0,
numtyp boxhi1, numtyp boxhi2, numtyp cell_size,
int ncellx, int ncelly, int nall) {
numtyp boxlo0, numtyp boxlo1, numtyp boxlo2,
numtyp i_cell_size, int ncellx, int ncelly,
int ncellz, int inum, int nall,
int cells_in_cutoff) {
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < nall) {
numtyp4 p = fetch_pos(i,pos); //pos[i];
numtyp4 p;
fetch4(p,i,pos_tex); //pos[i];
p.x -= boxlo0;
p.y -= boxlo1;
p.z -= boxlo2;
p.x = fmaxf(p.x, -cell_size);
p.x = fminf(p.x, boxhi0-boxlo0+cell_size);
p.y = fmaxf(p.y, -cell_size);
p.y = fminf(p.y, boxhi1-boxlo1+cell_size);
p.z = fmaxf(p.z, -cell_size);
p.z = fminf(p.z, boxhi2-boxlo2+cell_size);
int ix = int(p.x*i_cell_size+cells_in_cutoff);
int iy = int(p.y*i_cell_size+cells_in_cutoff);
int iz = int(p.z*i_cell_size+cells_in_cutoff);
unsigned int id = (unsigned int)(p.x/cell_size + 1.0)
+ (unsigned int)(p.y/cell_size + 1.0) * ncellx
+ (unsigned int)(p.z/cell_size + 1.0) * ncellx * ncelly;
int offset_lo, offset_hi;
if (i<inum) {
offset_lo=cells_in_cutoff;
offset_hi=cells_in_cutoff+1;
} else {
offset_lo=0;
offset_hi=1;
}
cell_id[i] = id;
ix = max(ix,offset_lo);
ix = min(ix,ncellx-offset_hi);
iy = max(iy,offset_lo);
iy = min(iy,ncelly-offset_hi);
iz = max(iz,offset_lo);
iz = min(iz,ncellz-offset_hi);
cell_id[i] = ix+iy*ncellx+iz*ncellx*ncelly;
particle_id[i] = i;
}
}
@ -78,6 +88,8 @@ __kernel void kernel_calc_cell_counts(unsigned *cell_id,
}
}
#else
#define pos_tex x_
#endif
@ -113,12 +125,13 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
__global int *host_numj,
int neigh_bin_size, numtyp cell_size,
int ncellx, int ncelly, int ncellz,
int inum, int nt, int nall, int t_per_atom)
int inum, int nt, int nall, int t_per_atom,
int cells_in_cutoff)
{
int tid = THREAD_ID_X;
int ix = BLOCK_ID_X;
int iy = BLOCK_ID_Y % ncelly;
int iz = BLOCK_ID_Y / ncelly;
int ix = BLOCK_ID_X + cells_in_cutoff;
int iy = BLOCK_ID_Y % (ncelly - cells_in_cutoff*2) + cells_in_cutoff;
int iz = BLOCK_ID_Y / (ncelly - cells_in_cutoff*2) + cells_in_cutoff;
int bsx = BLOCK_SIZE_X;
int icell = ix + iy*ncellx + iz*ncellx*ncelly;
@ -129,9 +142,9 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
int icell_begin = cell_counts[icell];
int icell_end = cell_counts[icell+1];
int nborz0 = max(iz-1,0), nborz1 = min(iz+1, ncellz-1),
nbory0 = max(iy-1,0), nbory1 = min(iy+1, ncelly-1),
nborx0 = max(ix-1,0), nborx1 = min(ix+1, ncellx-1);
int nborz0 = iz-cells_in_cutoff, nborz1 = iz+cells_in_cutoff,
nbory0 = iy-cells_in_cutoff, nbory1 = iy+cells_in_cutoff,
nborx0 = ix-cells_in_cutoff, nborx1 = ix+cells_in_cutoff;
numtyp4 diff;
numtyp r2;
@ -147,7 +160,7 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
pid_i = cell_particle_id[i];
if (pid_i < nt) {
atom_i = fetch_pos(pid_i,x_); //pos[pid_i];
fetch4(atom_i,pid_i,pos_tex); //pos[i];
}
if (pid_i < inum) {
stride=inum;
@ -182,7 +195,7 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
if (tid < end_idx) {
pid_j = cell_particle_id[tid+k*bsx+jcell_begin];
cell_list_sh[tid] = pid_j;
atom_j = fetch_pos(pid_j,x_); //[pid_j];
fetch4(atom_j,pid_j,pos_tex); //[pid_j];
pos_sh[tid].x = atom_j.x;
pos_sh[tid].y = atom_j.y;
pos_sh[tid].z = atom_j.z;
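One consequence of the fixed nborx/nbory/nborz bounds above: each cell now scans a cubic stencil of

\[
(2\,c_{\mathrm{cut}} + 1)^3
\]

neighbor cells, where $c_{\mathrm{cut}}$ is cells_in_cutoff, i.e. the familiar 27-cell stencil when the cutoff fits in one bin and 125 cells when it spans two.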

View File

@ -16,12 +16,15 @@
#include "lal_precision.h"
#include "lal_neighbor_shared.h"
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "neighbor_cpu_cl.h"
#include "neighbor_gpu_cl.h"
#elif defined(USE_CUDART)
const char *neighbor_cpu=0;
const char *neighbor_gpu=0;
#else
#include "neighbor_cpu_ptx.h"
#include "neighbor_gpu_ptx.h"
#include "neighbor_cpu_cubin.h"
#include "neighbor_gpu_cubin.h"
#endif
using namespace LAMMPS_AL;
@ -69,7 +72,7 @@ void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor) {
k_build_nbor.set_function(*build_program,"calc_neigh_list_cell");
k_transpose.set_function(*build_program,"transpose");
k_special.set_function(*build_program,"kernel_special");
neigh_tex.get_texture(*build_program,"neigh_tex");
neigh_tex.get_texture(*build_program,"pos_tex");
}
_compiled=true;
}

View File

@ -16,18 +16,18 @@
#ifndef LAL_NEIGHBOR_SHARED_H
#define LAL_NEIGHBOR_SHARED_H
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "geryon/ocl_kernel.h"
#include "geryon/ocl_texture.h"
using namespace ucl_opencl;
#elif defined(USE_CUDART)
#include "geryon/nvc_kernel.h"
#include "geryon/nvc_texture.h"
using namespace ucl_cudart;
#else
#include "geryon/nvd_kernel.h"
#include "geryon/nvd_texture.h"
using namespace ucl_cudadr;
#endif
namespace LAMMPS_AL {

View File

@ -13,11 +13,14 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "pppm_cl.h"
#elif defined(USE_CUDART)
const char *pppm_f=0;
const char *pppm_d=0;
#else
#include "pppm_f_ptx.h"
#include "pppm_d_ptx.h"
#include "pppm_f_cubin.h"
#include "pppm_d_cubin.h"
#endif
#include "lal_pppm.h"
#include <cassert>
@ -51,7 +54,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
const int nylo_out, const int nzlo_out,
const int nxhi_out, const int nyhi_out,
const int nzhi_out, grdtyp **rho_coeff,
grdtyp **vd_brick, const double slab_volfactor,
grdtyp **vd_brick_p, const double slab_volfactor,
const int nx_pppm, const int ny_pppm,
const int nz_pppm, const bool split, int &flag) {
_max_bytes=10;
@ -92,8 +95,8 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
time_interp.init(*ucl_device);
time_interp.zero();
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
_allocated=true;
_max_bytes=0;
@ -133,14 +136,12 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
_npts_y=nyhi_out-nylo_out+1;
_npts_z=nzhi_out-nzlo_out+1;
_npts_yx=_npts_x*_npts_y;
success=success && (d_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
success=success && (brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
UCL_SUCCESS);
success=success && (h_brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
success=success && (vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
UCL_SUCCESS);
success=success && (h_vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
UCL_SUCCESS);
*vd_brick=h_vd_brick.begin();
_max_bytes+=d_brick.row_bytes();
*vd_brick_p=vd_brick.host.begin();
_max_bytes+=brick.device.row_bytes()+vd_brick.device.row_bytes();
// Allocate vector with count of atoms assigned to each grid point
_nlocal_x=_npts_x+_nlower-_nupper;
@ -158,20 +159,19 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
_max_bytes+=d_brick_atoms.row_bytes();
// Allocate error flags for checking out of bounds atoms
success=success && (h_error_flag.alloc(1,*ucl_device)==UCL_SUCCESS);
success=success && (d_error_flag.alloc(1,*ucl_device,UCL_WRITE_ONLY)==
UCL_SUCCESS);
success=success && (error_flag.alloc(1,*ucl_device,UCL_RW_OPTIMIZED,
UCL_WRITE_ONLY)==UCL_SUCCESS);
if (!success) {
flag=-3;
return 0;
}
d_error_flag.zero();
error_flag.device.zero();
_max_bytes+=1;
_cpu_idle_time=0.0;
return h_brick.begin();
return brick.host.begin();
}
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
@ -181,12 +181,10 @@ void PPPMT::clear(const double cpu_time) {
_allocated=false;
_precompute_done=false;
d_brick.clear();
h_brick.clear();
h_vd_brick.clear();
brick.clear();
vd_brick.clear();
d_brick_counts.clear();
h_error_flag.clear();
d_error_flag.clear();
error_flag.clear();
d_brick_atoms.clear();
acc_timers();
@ -269,11 +267,11 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
device->zero(d_brick_counts,d_brick_counts.numel());
k_particle_map.set_size(GX,BX);
k_particle_map.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &f_delvolinv,
&ainum, &d_brick_counts.begin(), &d_brick_atoms.begin(),
&_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv,
&_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z,
&_atom_stride, &_max_brick_atoms, &d_error_flag.begin());
k_particle_map.run(&atom->x, &atom->q, &f_delvolinv, &ainum,
&d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y,
&_brick_z, &_delxinv, &_delyinv, &_delzinv, &_nlocal_x,
&_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms,
&error_flag);
time_map.stop();
time_rho.start();
@ -282,15 +280,14 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
GX=static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/
_block_pencils));
k_make_rho.set_size(GX,BX);
k_make_rho.run(&d_brick_counts.begin(), &d_brick_atoms.begin(),
&d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride,
&_npts_x, &_npts_y, &_npts_z, &_nlocal_x, &_nlocal_y,
&_nlocal_z, &_order_m_1, &_order, &_order2);
k_make_rho.run(&d_brick_counts, &d_brick_atoms, &brick, &d_rho_coeff,
&_atom_stride, &_npts_x, &_npts_y, &_npts_z, &_nlocal_x,
&_nlocal_y, &_nlocal_z, &_order_m_1, &_order, &_order2);
time_rho.stop();
time_out.start();
ucl_copy(h_brick,d_brick,_npts_yx*_npts_z,true);
ucl_copy(h_error_flag,d_error_flag,true);
brick.update_host(_npts_yx*_npts_z,true);
error_flag.update_host(true);
time_out.stop();
_precompute_done=true;
@ -322,18 +319,17 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall,
_precompute_done=false;
if (h_error_flag[0]==2) {
if (error_flag[0]==2) {
// Not enough storage for atoms on the brick
_max_brick_atoms*=2;
d_error_flag.zero();
d_brick_atoms.clear();
d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,*ucl_device);
error_flag.device.zero();
d_brick_atoms.resize(_atom_stride*_max_brick_atoms);
_max_bytes+=d_brick_atoms.row_bytes();
return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,
delxinv,delyinv,delzinv);
}
return h_error_flag[0];
return error_flag[0];
}
// ---------------------------------------------------------------------------
@ -342,7 +338,7 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall,
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
void PPPMT::interp(const grdtyp qqrd2e_scale) {
time_in.start();
ucl_copy(d_brick,h_vd_brick,true);
vd_brick.update_device(true);
time_in.stop();
time_interp.start();
@ -353,10 +349,10 @@ void PPPMT::interp(const grdtyp qqrd2e_scale) {
int ainum=this->ans->inum();
k_interp.set_size(GX,BX);
k_interp.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum,
&d_brick.begin(), &d_rho_coeff.begin(), &_npts_x, &_npts_yx,
&_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv,
&_order, &_order2, &qqrd2e_scale, &ans->dev_ans.begin());
k_interp.run(&atom->x, &atom->q, &ainum, &vd_brick, &d_rho_coeff,
&_npts_x, &_npts_yx, &_brick_x, &_brick_y, &_brick_z, &_delxinv,
&_delyinv, &_delzinv, &_order, &_order2, &qqrd2e_scale,
&ans->force);
time_interp.stop();
ans->copy_answers(false,false,false,false);
@ -408,4 +404,3 @@ void PPPMT::compile_kernels(UCL_Device &dev) {
template class PPPM<PRECISION,ACC_PRECISION,float,_lgpu_float4>;
template class PPPM<PRECISION,ACC_PRECISION,double,_lgpu_double4>;

View File

@ -14,14 +14,14 @@
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
// Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error
@ -31,6 +31,8 @@ ucl_inline float fetch_q(const int& i, const float *q)
#endif
#else
#define pos_tex x_
#define q_tex q_
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
#endif
@ -59,9 +61,11 @@ __kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_,
int nx,ny,nz;
if (ii<nlocal) {
numtyp4 p=fetch_pos(ii,x_);
numtyp4 p;
fetch4(p,ii,pos_tex);
grdtyp4 delta;
delta.w=delvolinv*fetch_q(ii,q_);
fetch(delta.w,ii,q_tex);
delta.w*=delvolinv;
if (delta.w!=(grdtyp)0.0) {
delta.x=(p.x-b_lo_x)*delxinv;
@ -212,8 +216,11 @@ __kernel void interp(__global numtyp4 *x_, __global numtyp *q_,
grdtyp tx,ty,tz;
if (ii<nlocal) {
numtyp4 p=fetch_pos(ii,x_);
grdtyp qs=qqrd2e_scale*fetch_q(ii,q_);
numtyp4 p;
fetch4(p,ii,pos_tex);
grdtyp qs;
fetch(qs,ii,q_tex);
qs*=qqrd2e_scale;
acctyp4 ek;
ek.x=(acctyp)0.0;

View File

@ -19,8 +19,10 @@
#include "mpi.h"
#include "lal_device.h"
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
@ -55,8 +57,8 @@ class PPPM {
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
}
ans->resize(inum,success);
}
@ -138,8 +140,8 @@ class PPPM {
// --------------------------- GRID DATA --------------------------
UCL_H_Vec<grdtyp> h_brick, h_vd_brick;
UCL_D_Vec<grdtyp> d_brick;
UCL_Vector<grdtyp,grdtyp> brick;
UCL_Vector<grdtyp,grdtyp> vd_brick;
// Count of number of atoms assigned to each grid point
UCL_D_Vec<int> d_brick_counts;
@ -147,8 +149,7 @@ class PPPM {
UCL_D_Vec<grdtyp4> d_brick_atoms;
// Error checking for out of bounds atoms
UCL_D_Vec<int> d_error_flag;
UCL_H_Vec<int> h_error_flag;
UCL_Vector<int,int> error_flag;
// Number of grid points in brick (including ghost)
int _npts_x, _npts_y, _npts_z, _npts_yx;

View File

@ -16,6 +16,10 @@
#ifndef LAL_PRECISION_H
#define LAL_PRECISION_H
#if defined(USE_CUDART)
#include <cuda_runtime.h>
#endif
struct _lgpu_int2 {
int x; int y;
};
@ -108,3 +112,4 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#endif
#endif

View File

@ -107,7 +107,7 @@
#define BLOCK_NBOR_BUILD 128
#define BLOCK_PAIR 128
#define BLOCK_BIO_PAIR 128
#define MAX_SHARED_TYPES 11
#define MAX_SHARED_TYPES 8
#else
@ -129,8 +129,21 @@
#define MAX_BIO_SHARED_TYPES 128
#ifdef _DOUBLE_DOUBLE
ucl_inline double4 fetch_pos(const int& i, const double4 *pos) { return pos[i]; };
ucl_inline double fetch_q(const int& i, const double *q) { return q[i]; };
#define fetch4(ans,i,pos_tex) { \
int4 xy = tex1Dfetch(pos_tex,i*2); \
int4 zt = tex1Dfetch(pos_tex,i*2+1); \
ans.x=__hiloint2double(xy.y, xy.x); \
ans.y=__hiloint2double(xy.w, xy.z); \
ans.z=__hiloint2double(zt.y, zt.x); \
ans.w=__hiloint2double(zt.w, zt.z); \
}
#define fetch(ans,i,q_tex) { \
int2 qt = tex1Dfetch(q_tex,i); \
ans=__hiloint2double(qt.y, qt.x); \
}
#else
#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
#endif
#if (__CUDA_ARCH__ < 200)
@ -293,8 +306,8 @@ typedef struct _double4 double4;
#define BLOCK_ID_Y get_group_id(1)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define ucl_inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define fetch4(ans,i,x) ans=x[i]
#define fetch(ans,i,q) ans=q[i]
#define ucl_atan atan
#define ucl_cbrt cbrt
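The fetch4/fetch macros above are the double-precision texture path: a double4 is read as two int4 fetches at indices 2*i and 2*i+1, a single double as one int2 fetch, and the 32-bit words are reassembled with __hiloint2double. A minimal standalone sketch of the same trick for a charge array, using the legacy texture-reference API this code targets (array contents and names are illustrative; error checking omitted):

#include <cstdio>
#include <cuda_runtime.h>

texture<int2> q_tex;   // doubles exposed to the kernel as pairs of 32-bit ints

__global__ void read_q(double *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    int2 qt = tex1Dfetch(q_tex, i);          // low/high words of the i-th double
    out[i] = __hiloint2double(qt.y, qt.x);   // same reassembly as fetch() above
  }
}

int main() {
  const int n = 4;
  double h_q[n] = {-1.0, 0.5, 2.25, 3.0}, h_out[n];
  double *d_q, *d_out;
  cudaMalloc(&d_q, n * sizeof(double));
  cudaMalloc(&d_out, n * sizeof(double));
  cudaMemcpy(d_q, h_q, n * sizeof(double), cudaMemcpyHostToDevice);
  cudaBindTexture(NULL, q_tex, d_q, n * sizeof(double));  // bind linear memory
  read_q<<<1, 32>>>(d_out, n);
  cudaMemcpy(h_out, d_out, n * sizeof(double), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; i++) printf("%g\n", h_out[i]);   // -1 0.5 2.25 3
  cudaUnbindTexture(q_tex);
  cudaFree(d_q);
  cudaFree(d_out);
  return 0;
}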

View File

@ -13,12 +13,15 @@
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "re_squared_cl.h"
#include "re_squared_lj_cl.h"
#elif defined(USE_CUDART)
const char *re_squared=0;
const char *re_squared_lj=0;
#else
#include "re_squared_ptx.h"
#include "re_squared_lj_ptx.h"
#include "re_squared_cubin.h"
#include "re_squared_lj_cubin.h"
#endif
#include "lal_re_squared.h"
@ -54,7 +57,8 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ntypes,h_form,re_squared,re_squared_lj,true);
_screen,ntypes,h_form,re_squared,re_squared_lj,
"k_resquared",true);
if (success!=0)
return success;
@ -198,13 +202,13 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->special_lj.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
&this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
&this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
&this->_threads_per_atom);
this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
&this->shape, &this->well, &this->special_lj,
&this->sigma_epsilon, &this->_lj_types,
&this->nbor->dev_nbor, &stride,
&this->ans->force,&ainum, &this->ans->engv,
&this->dev_error, &eflag, &vflag,
&this->_last_ellipse, &this->_threads_per_atom);
this->time_ellipsoid.stop();
// ------------ ELLIPSE_SPHERE ---------------
@ -215,12 +219,13 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
this->time_ellipsoid2.start();
this->k_ellipsoid_sphere.set_size(GX,BX);
this->k_ellipsoid_sphere.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->special_lj.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
&this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
&this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat,
&this->shape, &this->well, &this->special_lj,
&this->sigma_epsilon, &this->_lj_types,
&this->nbor->dev_nbor, &stride,
&this->ans->force,&ainum,
&this->ans->engv, &this->dev_error,
&eflag, &vflag, &this->_last_ellipse,
&this->_threads_per_atom);
this->time_ellipsoid2.stop();
@ -245,17 +250,18 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
this->time_ellipsoid3.start();
this->k_sphere_ellipsoid.set_size(GX,BX);
this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(),
&this->well.begin(), &this->special_lj.begin(),
&this->sigma_epsilon.begin(), &this->_lj_types,
&this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat,
&this->shape, &this->well, &this->special_lj,
&this->sigma_epsilon, &this->_lj_types,
&this->nbor->dev_nbor, &stride,
&this->ans->force, &this->ans->engv,
&this->dev_error, &eflag, &vflag,
&this->_last_ellipse, &ainum,
&this->_threads_per_atom);
this->time_ellipsoid3.stop();
} else {
this->ans->dev_ans.zero();
this->ans->dev_engv.zero();
this->ans->force.zero();
this->ans->engv.zero();
this->time_nbor1.zero();
this->time_ellipsoid.zero();
this->time_nbor2.zero();
@ -269,19 +275,19 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
if (this->_last_ellipse<this->ans->inum()) {
if (this->_shared_types) {
this->k_lj_fast.set_size(GX,BX);
this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
&this->lj3.begin(), &this->special_lj.begin(), &stride,
&this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(),
this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3,
&this->special_lj, &stride,
&this->nbor->dev_packed, &this->ans->force,
&this->ans->engv, &this->dev_error,
&eflag, &vflag, &this->_last_ellipse, &ainum,
&this->_threads_per_atom);
} else {
this->k_lj.set_size(GX,BX);
this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
&this->lj3.begin(), &this->_lj_types, &this->special_lj.begin(),
&stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3,
&this->_lj_types, &this->special_lj, &stride,
&this->nbor->dev_packed, &this->ans->force,
&this->ans->engv, &this->dev_error, &eflag, &vflag,
&this->_last_ellipse, &ainum, &this->_threads_per_atom);
}
}
this->time_lj.stop();
@ -295,13 +301,12 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->special_lj.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
&this->ans->dev_ans.begin(), &ainum, &this->ans->dev_engv.begin(),
&this->dev_error.begin(), &eflag, &vflag, &ainum,
&this->_threads_per_atom);
this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
&this->shape, &this->well, &this->special_lj,
&this->sigma_epsilon, &this->_lj_types,
&this->nbor->dev_nbor, &stride, &this->ans->force,
&ainum, &this->ans->engv, &this->dev_error,
&eflag, &vflag, &ainum, &this->_threads_per_atom);
this->time_ellipsoid.stop();
}
}

View File

@ -32,7 +32,7 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
return ans;
}
__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
__kernel void k_resquared(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *splj, __global numtyp2* sig_eps,
const int ntypes, __global int *dev_nbor,
@ -73,7 +73,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int itype=ix.w;
numtyp a1[9]; // Rotation matrix (lab->body)
@ -122,7 +122,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int jtype=jx.w;
// Compute r12

View File

@ -17,10 +17,11 @@
#include "lal_ellipsoid_extra.h"
#endif
__kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *splj, __global numtyp2* sig_eps,
const int ntypes, __global int *dev_nbor, const int stride,
__kernel void k_resquared_ellipsoid_sphere(__global numtyp4* x_,
__global numtyp4 *q, __global numtyp4* shape,
__global numtyp4* well, __global numtyp *splj,
__global numtyp2* sig_eps, const int ntypes,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, const int astride,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int inum,
@ -59,7 +60,7 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int itype=ix.w;
numtyp a[9]; // Rotation matrix (lab->body)
@ -84,7 +85,7 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int jtype=jx.w;
// Compute r12
@ -331,14 +332,14 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
} // if ii
}
__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
__global numtyp4* shape,__global numtyp4* well,
__global numtyp *splj, __global numtyp2* sig_eps,
const int ntypes, __global int *dev_nbor,
const int stride, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag,const int start,
const int inum, const int t_per_atom) {
__kernel void k_resquared_sphere_ellipsoid(__global numtyp4 *x_,
__global numtyp4 *q, __global numtyp4* shape,
__global numtyp4* well, __global numtyp *splj,
__global numtyp2* sig_eps, const int ntypes,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag, const int vflag,
const int start, const int inum, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
@ -370,7 +371,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj,
n_stride,nbor_end,nbor);
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int jtype=jx.w;
numtyp factor_lj;
@ -379,7 +380,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
factor_lj = sp_lj[sbmask(i)];
i &= NEIGHMASK;
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int itype=ix.w;
numtyp a[9]; // Rotation matrix (lab->body)
@ -524,14 +525,13 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
} // if ii
}
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void k_resquared_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
__global numtyp *gum, const int stride,
__global int *dev_ij, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int start,
const int inum, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
@ -557,7 +557,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int itype=ix.w;
numtyp factor_lj;
@ -567,7 +567,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int jtype=jx.w;
// Compute r12
@ -606,13 +606,12 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
} // if ii
}
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride, __global int *dev_ij,
__kernel void k_resquared_lj_fast(__global numtyp4 *x_,
__global numtyp4 *lj1_in, __global numtyp4* lj3_in,
__global numtyp *gum, const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
__global int *err_flag, const int eflag, const int vflag,
const int start, const int inum, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
@ -645,7 +644,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -656,7 +655,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex);
int mtype=itype+jx.w;
// Compute r12

View File

@ -13,10 +13,12 @@
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "table_cl.h"
#elif defined(USE_CUDART)
const char *table=0;
#else
#include "table_ptx.h"
#include "table_cubin.h"
#endif
#include "lal_table.h"
@ -56,17 +58,17 @@ int TableT::init(const int ntypes,
const double gpu_split, FILE *_screen,
int tabstyle, int ntables, int tablength) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,table);
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
gpu_split,_screen,table,"k_table");
if (success!=0)
return success;
k_pair_linear.set_function(*(this->pair_program),"kernel_pair_linear");
k_pair_linear_fast.set_function(*(this->pair_program),"kernel_pair_linear_fast");
k_pair_spline.set_function(*(this->pair_program),"kernel_pair_spline");
k_pair_spline_fast.set_function(*(this->pair_program),"kernel_pair_spline_fast");
k_pair_bitmap.set_function(*(this->pair_program),"kernel_pair_bitmap");
k_pair_bitmap_fast.set_function(*(this->pair_program),"kernel_pair_bitmap_fast");
k_pair_linear.set_function(*(this->pair_program),"k_table_linear");
k_pair_linear_fast.set_function(*(this->pair_program),"k_table_linear_fast");
k_pair_spline.set_function(*(this->pair_program),"k_table_spline");
k_pair_spline_fast.set_function(*(this->pair_program),"k_table_spline_fast");
k_pair_bitmap.set_function(*(this->pair_program),"k_table_bitmap");
k_pair_bitmap_fast.set_function(*(this->pair_program),"k_table_bitmap_fast");
_compiled_styles = true;
// If atom type constants fit in shared memory use fast kernel
@ -264,84 +266,71 @@ void TableT::loop(const bool _eflag, const bool _vflag) {
if (shared_types) {
if (_tabstyle == LOOKUP) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&coeff2.begin(), &coeff3.begin(),
&coeff4.begin(), &cutsq.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom,
&_tablength);
this->k_pair_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3,
&coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &_tablength);
} else if (_tabstyle == LINEAR) {
this->k_pair_linear_fast.set_size(GX,BX);
this->k_pair_linear_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&coeff2.begin(), &coeff3.begin(),
&coeff4.begin(), &cutsq.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom,
&_tablength);
this->k_pair_linear_fast.run(&this->atom->x, &tabindex, &coeff2,
&coeff3, &coeff4, &cutsq, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom, &_tablength);
} else if (_tabstyle == SPLINE) {
this->k_pair_spline_fast.set_size(GX,BX);
this->k_pair_spline_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&coeff2.begin(), &coeff3.begin(),
&coeff4.begin(), &cutsq.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom,
&_tablength);
this->k_pair_spline_fast.run(&this->atom->x, &tabindex, &coeff2,
&coeff3, &coeff4, &cutsq, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom, &_tablength);
} else if (_tabstyle == BITMAP) {
this->k_pair_bitmap_fast.set_size(GX,BX);
this->k_pair_bitmap_fast.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&nshiftbits.begin(), &nmask.begin(),
&coeff2.begin(), &coeff3.begin(),
&coeff4.begin(), &cutsq.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom,
&_tablength);
this->k_pair_bitmap_fast.run(&this->atom->x, &tabindex, &nshiftbits,
&nmask, &coeff2, &coeff3, &coeff4, &cutsq,
&sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch,
&this->_threads_per_atom, &_tablength);
}
} else {
if (_tabstyle == LOOKUP) {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types,
&cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &_tablength);
this->k_pair.run(&this->atom->x, &tabindex, &coeff2, &coeff3,
&coeff4, &_lj_types, &cutsq, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->_threads_per_atom,
&_tablength);
} else if (_tabstyle == LINEAR) {
this->k_pair_linear.set_size(GX,BX);
this->k_pair_linear.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types,
&cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &_tablength);
this->k_pair_linear.run(&this->atom->x, &tabindex, &coeff2, &coeff3,
&coeff4, &_lj_types, &cutsq, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom, &_tablength);
} else if (_tabstyle == SPLINE) {
this->k_pair_spline.set_size(GX,BX);
this->k_pair_spline.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types,
&cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &_tablength);
this->k_pair_spline.run(&this->atom->x, &tabindex, &coeff2, &coeff3,
&coeff4, &_lj_types, &cutsq, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom, &_tablength);
} else if (_tabstyle == BITMAP) {
this->k_pair_bitmap.set_size(GX,BX);
this->k_pair_bitmap.run(&this->atom->dev_x.begin(), &tabindex.begin(),
&nshiftbits.begin(), &nmask.begin(),
&coeff2.begin(), &coeff3.begin(), &coeff4.begin(), &_lj_types,
&cutsq.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &_tablength);
this->k_pair_bitmap.run(&this->atom->x, &tabindex, &nshiftbits,
&nmask, &coeff2, &coeff3, &coeff4, &_lj_types,
&cutsq, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom,
&_tablength);
}
}
this->time_pair.stop();
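
One pattern worth noting in the launch code above: the old calls handed the kernels raw device pointers via .begin() on each container (dev_x, dev_ans, dev_engv), while the new calls pass the containers themselves (x, force, engv) and let the kernel wrapper decide how to hand them to the device. The following is a stand-alone sketch of that convention change only; DevContainer, run_old and run_new are illustrative stand-ins, not the actual Geryon container/kernel API.

// Illustrative sketch only: DevContainer, run_old and run_new are stand-ins,
// not the container/kernel API used in the launch code above.
#include <cstdio>

struct DevContainer {
  double *dev_ptr = nullptr;
  double *&begin() { return dev_ptr; }   // old code passed &container.begin()
};

// Old convention: kernel arguments are addresses of raw device pointers.
void run_old(double **x, double **ans) {
  std::printf("launch with raw pointers %p %p\n", (void *)*x, (void *)*ans);
}

// New convention: kernel arguments are addresses of the containers themselves,
// so the wrapper can choose how to pass them (plain pointer, texture, ...).
void run_new(DevContainer *x, DevContainer *force) {
  std::printf("launch with containers %p %p\n",
              (void *)x->dev_ptr, (void *)force->dev_ptr);
}

int main() {
  DevContainer x, force;
  run_old(&x.begin(), &force.begin());  // mirrors k_pair.run(&this->atom->dev_x.begin(), ...)
  run_new(&x, &force);                  // mirrors k_pair.run(&this->atom->x, &this->ans->force, ...)
}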

@ -15,11 +15,13 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
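
The fetch_pos helper above is retired in favor of a fetch4 macro that the kernels call below; its definition lives in a shared kernel header that is not part of this hunk. As a rough illustration only (not the project's actual definition): with the CUDA texture API of this era, the single/mixed-precision path is a plain float4 tex1Dfetch, while the double-double path reads two int4 texels per atom and rebuilds each double with __hiloint2double. The texel layout below is an assumption.

// Rough illustration of what a fetch4 macro has to do; the real definition is
// in the shared kernel header (not shown) and may differ, in particular in the
// int4 texel layout assumed here.
#ifdef _DOUBLE_DOUBLE
// Double-double positions: two int4 texels per atom, each double rebuilt from
// its high/low 32-bit words.
#define fetch4(ans, i, pos_tex)                              \
  {                                                          \
    int4 xy = tex1Dfetch(pos_tex, (i)*2);                    \
    int4 zw = tex1Dfetch(pos_tex, (i)*2 + 1);                \
    ans.x = __hiloint2double(xy.y, xy.x);                    \
    ans.y = __hiloint2double(xy.w, xy.z);                    \
    ans.z = __hiloint2double(zw.y, zw.x);                    \
    ans.w = __hiloint2double(zw.w, zw.z);                    \
  }
#else
// Single/mixed precision: one float4 texel per atom.
#define fetch4(ans, i, pos_tex) { ans = tex1Dfetch(pos_tex, i); }
#endif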
#define LOOKUP 0
@ -37,7 +39,7 @@ typedef union {
/// ---------------- LOOKUP -------------------------------------------------
__kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -73,7 +75,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -83,7 +85,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype*lj_types+jx.w;
int tbindex = tabindex[mtype];
@ -128,7 +130,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global int *tabindex,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_fast(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -167,7 +169,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -178,7 +180,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
int tbindex = tabindex[mtype];
@ -225,7 +227,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global int *tabindex,
/// ---------------- LINEAR -------------------------------------------------
__kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_linear(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -261,7 +263,7 @@ __kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -271,7 +273,7 @@ __kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype*lj_types+jx.w;
int tbindex = tabindex[mtype];
@ -320,7 +322,7 @@ __kernel void kernel_pair_linear(__global numtyp4 *x_, __global int *tabindex,
} // if ii
}
__kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_linear_fast(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -359,7 +361,7 @@ __kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabind
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -370,7 +372,7 @@ __kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabind
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
int tbindex = tabindex[mtype];
@ -421,7 +423,7 @@ __kernel void kernel_pair_linear_fast(__global numtyp4 *x_, __global int *tabind
/// ---------------- SPLINE -------------------------------------------------
__kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_spline(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -457,7 +459,7 @@ __kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -467,7 +469,7 @@ __kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype*lj_types+jx.w;
int tbindex = tabindex[mtype];
@ -523,7 +525,7 @@ __kernel void kernel_pair_spline(__global numtyp4 *x_, __global int *tabindex,
} // if ii
}
__kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_spline_fast(__global numtyp4 *x_, __global int *tabindex,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
__global numtyp4 *coeff4,
@ -562,7 +564,7 @@ __kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabind
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -573,7 +575,7 @@ __kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabind
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
int tbindex = tabindex[mtype];
@ -631,7 +633,7 @@ __kernel void kernel_pair_spline_fast(__global numtyp4 *x_, __global int *tabind
/// ---------------- BITMAP -------------------------------------------------
__kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_bitmap(__global numtyp4 *x_, __global int *tabindex,
__global int *nshiftbits, __global int *nmask,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
@ -668,7 +670,7 @@ __kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -678,7 +680,7 @@ __kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype*lj_types+jx.w;
int tbindex = tabindex[mtype];
@ -730,7 +732,7 @@ __kernel void kernel_pair_bitmap(__global numtyp4 *x_, __global int *tabindex,
} // if ii
}
__kernel void kernel_pair_bitmap_fast(__global numtyp4 *x_, __global int *tabindex,
__kernel void k_table_bitmap_fast(__global numtyp4 *x_, __global int *tabindex,
__global int *nshiftbits, __global int *nmask,
__global numtyp4* coeff2,
__global numtyp4 *coeff3,
@ -770,7 +772,7 @@ __kernel void kernel_pair_bitmap_fast(__global numtyp4 *x_, __global int *tabind
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -781,7 +783,7 @@ __kernel void kernel_pair_bitmap_fast(__global numtyp4 *x_, __global int *tabind
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
int tbindex = tabindex[mtype];

@ -13,10 +13,12 @@
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#if defined(USE_OPENCL)
#include "yukawa_cl.h"
#elif defined(USE_CUDART)
const char *yukawa=0;
#else
#include "yukawa_ptx.h"
#include "yukawa_cubin.h"
#endif
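
The three-way include above selects how the yukawa kernel program reaches the host class: an OpenCL build embeds the kernel source string from the generated yukawa_cl.h, a CUDA-runtime build leaves the string null (the kernels are presumably compiled straight into the library), and the default CUDA driver build includes the generated yukawa_cubin.h with a precompiled image. A minimal sketch of the same selection for a hypothetical style named foo, with the generated-header contents summarized in comments:

// Minimal sketch of the three build paths for a hypothetical style "foo";
// the generated headers are produced by the build system, not written by hand.
#if defined(USE_OPENCL)
#include "foo_cl.h"        // defines: const char *foo = "<OpenCL kernel source>";
#elif defined(USE_CUDART)
const char *foo = 0;       // CUDA runtime build: no program string needed here
#else
#include "foo_cubin.h"     // defines foo as a precompiled CUDA binary image
#endif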
#include "lal_yukawa.h"
@ -50,7 +52,7 @@ int YukawaT::init(const int ntypes,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,yukawa);
_screen,yukawa,"k_yukawa");
if (success!=0)
return success;
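
init() now forwards a kernel base name ("k_yukawa") to init_atomic alongside the program string, matching the rename of the kernels in this commit from the generic kernel_pair/kernel_pair_fast to per-style names (k_yukawa, k_yukawa_fast, k_table_linear_fast, and so on), presumably so kernels from different styles can coexist in one compiled module. The snippet below is a hypothetical sketch of how a base class could use that name; the types, members, and method names are invented for illustration.

// Hypothetical sketch of how a base class might use the kernel-name string
// passed to init_atomic above; UCLKernelStub and compile_kernels are invented.
#include <string>

struct UCLKernelStub {
  void compile(const char *program, const std::string &name) { /* look up 'name' in 'program' */ }
};

struct BaseAtomicSketch {
  UCLKernelStub k_pair, k_pair_fast;
  void compile_kernels(const char *program, const char *k_name) {
    std::string base(k_name);                     // e.g. "k_yukawa"
    k_pair.compile(program, base);                // general-type kernel
    k_pair_fast.compile(program, base + "_fast"); // shared-type kernel: "k_yukawa_fast"
  }
};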
@ -129,20 +131,17 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &coeff.begin(), &_kappa,
&sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
this->k_pair_fast.run(&this->atom->x, &coeff, &_kappa, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &coeff.begin(), &_kappa,
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_pair.run(&this->atom->x, &coeff, &_kappa, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
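
As with the table style earlier, the host picks k_pair_fast when shared_types is set and k_pair otherwise. In the kernels this shows up as two ways of flattening the (itype, jtype) pair into a flat index: the general kernel uses the runtime lj_types stride, while the fast kernel uses the compile-time MAX_SHARED_TYPES stride (itype is pre-scaled once per atom with fast_mul, then jx.w is added per neighbor). A small sketch of just that indexing, with an example value for MAX_SHARED_TYPES:

// Sketch of the two (itype, jtype) -> flat index schemes used by the general
// vs. "fast" kernels above; the MAX_SHARED_TYPES value here is only an example.
#define MAX_SHARED_TYPES 8

// General kernel: stride is the runtime number of types
// (mtype = itype*lj_types + jx.w).
__host__ __device__ inline int mtype_general(int itype, int jtype, int lj_types) {
  return itype * lj_types + jtype;
}

// Fast kernel: stride is the compile-time MAX_SHARED_TYPES; the kernels
// pre-scale once per atom with itype = fast_mul(MAX_SHARED_TYPES, iw) and then
// add jx.w per neighbor.
__host__ __device__ inline int mtype_fast(int iw, int jtype) {
  return iw * MAX_SHARED_TYPES + jtype;
}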

@ -15,14 +15,16 @@
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
__kernel void k_yukawa(__global numtyp4 *x_, __global numtyp4 *coeff,
const numtyp kappa, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
@ -51,7 +53,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
@ -61,7 +63,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
@ -103,7 +105,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *coeff,
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
__kernel void k_yukawa_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
const numtyp kappa, __global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
@ -135,7 +137,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
@ -146,7 +148,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12