git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6053 f3b2605a-c512-4ea7-a41b-209d697bcdaa
@@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include
 CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON
 
 BIN_DIR = ./
@@ -17,16 +17,16 @@
 # Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
-CUDA_HOME = /sw/analysis-x64/cuda/3.0/sl5.0_binary/
+CUDA_HOME = /sw/analysis-x64/cuda/3.2/centos5.5_binary/
 NVCC = nvcc
 
 CUDA_ARCH = -arch=sm_13
-CUDA_PRECISION = -D_SINGLE_SINGLE
+CUDA_PRECISION = -D_SINGLE_DOUBLE
 CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON -openmp
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -openmp
 CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias
 
 BIN_DIR = ./
@@ -24,7 +24,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT
 CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops
 
 BIN_DIR = ./
@@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
 
 BIN_DIR = ./
@@ -17,7 +17,7 @@
 # Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
-OCL_CPP = mpic++ -I./geryon/opencl -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 OCL_LINK = -lOpenCL
 OCL_PREC = -D_SINGLE_SINGLE
 
@@ -23,7 +23,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(TACC_CUDA_LIB) -Wl,-rpath,$(TACC_CUDA_LIB)
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
 
-CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
+CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
 CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
 
 BIN_DIR = ./
@@ -24,7 +24,7 @@ CUDA_ARCH = -arch=sm_11
 CUDA_PRECISION = -D_SINGLE_SINGLE
 CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib
-CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -m32
+CUDA_OPTS = -DUNIX -DUCL_NO_EXIT -O3 -Xptxas -v --use_fast_math -m32
 
 CUDR_CPP = mpic++
 CUDR_OPTS = -O2 -m32 -g
@@ -17,7 +17,7 @@
 # Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
-OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON
+OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON -DUCL_NO_EXIT
 OCL_LINK = -framework OpenCL
 OCL_PREC = -D_SINGLE_SINGLE
 
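Every machine Makefile in this commit adds -DUCL_NO_EXIT to the host compiler flags. The macro name comes straight from the diff; its apparent purpose, consistent with the bool-to-int init_atomic() change later in the commit, is to make the Geryon layer report failures through return codes instead of terminating the process. A minimal C++ sketch of that compile-time switch, with the macro body assumed for illustration only (this is not Geryon's actual source):

#include <cstdio>
#include <cstdlib>

// Assumed shape of the switch -DUCL_NO_EXIT selects: with the macro defined,
// a failed call becomes a status code the caller can propagate; without it,
// the failure aborts the whole process.
#ifdef UCL_NO_EXIT
#define UCL_CHECK(ok, code) do { if (!(ok)) return (code); } while (0)
#else
#define UCL_CHECK(ok, code) \
  do { if (!(ok)) { std::fprintf(stderr, "UCL error\n"); std::exit(1); } } while (0)
#endif

// Hypothetical caller: returns 0 on success, a negative code on failure.
int init_device_sketch(bool device_found) {
  UCL_CHECK(device_found, -1);
  return 0;
}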
@@ -14,6 +14,7 @@
 # /* ----------------------------------------------------------------------
 # Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 #                       Peng Wang (Nvidia), penwang@nvidia.com
+#                       Inderaj Bains (NVIDIA), ibains@nvidia.com
 #                       Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
@@ -28,10 +29,11 @@ GPU_LIB = $(LIB_DIR)/libgpu.a
 # Headers for Geryon
 UCL_H = $(wildcard ./geryon/ucl*.h)
 NVC_H = $(wildcard ./geryon/nvc*.h) $(UCL_H)
-NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H)
+NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) nv_kernel_def.h
 # Headers for Pair Stuff
-PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
-         pair_gpu_device.h pair_gpu_balance.h
+PAIR_H = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
+         pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
+         pair_gpu_balance.h pppm_gpu_memory.h
 
 ALL_H = $(NVD_H) $(PAIR_H)
 
@@ -39,28 +41,37 @@ EXECS = $(BIN_DIR)/nvc_get_devices
 CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
         $(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \
         $(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
-OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
-       $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
-       $(OBJ_DIR)/charge_gpu_memory.o \
+OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
+       $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
+       $(OBJ_DIR)/pair_gpu_device.o \
+       $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
+       $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
        $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
+       $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
        $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
        $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
+       $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
        $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
        $(CUDPP)
-PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
+PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \
+       $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
        $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \
        $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \
+       $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h \
+       $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h \
        $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \
        $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \
        $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \
        $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
+       $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h \
        $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \
        $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \
-       $(OBJ_DIR)/crml_cut_gpu_kernel.ptx $(OBJ_DIR)/crml_cut_gpu_ptx.h \
+       $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h \
+       $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h \
        $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
        $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h
 
@@ -93,6 +104,9 @@ $(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx
 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h
 	$(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(NVD_H)
+	$(CUDR) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu
 	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu
 
@@ -105,11 +119,20 @@ $(OBJ_DIR)/pair_gpu_build_kernel.ptx: pair_gpu_build_kernel.cu
 $(OBJ_DIR)/pair_gpu_build_ptx.h: $(OBJ_DIR)/pair_gpu_build_kernel.ptx
 	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h
 
-$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
+$(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
+	$(CUDR) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H)
 	$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H)
-	$(CUDR) -o $@ -c pair_gpu_device.cpp
+$(OBJ_DIR)/pair_gpu_dev_kernel.ptx: pair_gpu_dev_kernel.cu
+	$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_dev_kernel.cu
+
+$(OBJ_DIR)/pair_gpu_dev_ptx.h: $(OBJ_DIR)/pair_gpu_dev_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_dev_kernel.ptx $(OBJ_DIR)/pair_gpu_dev_ptx.h
+
+$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_ptx.h
+	$(CUDR) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
 	$(CUDR) -o $@ -c atomic_gpu_memory.cpp
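The two-step rules above (nvcc --ptx followed by geryon/file_to_cstr.sh) embed each kernel in the library as text: the .cu file is compiled to PTX, and the shell script wraps that PTX in a C string header (*_ptx.h) that the host code includes and hands to the CUDA driver for just-in-time compilation at run time. A hypothetical C++ sketch of what such a generated header could look like (the symbol name and PTX text are illustrative; file_to_cstr.sh defines the real output format):

// pair_gpu_dev_ptx.h (illustrative reconstruction, not the generated file)
static const char *pair_gpu_dev_kernel =
  "  .version 2.2\n"
  "  .target sm_13\n"
  "  // ... one quoted, escaped line per line of the real PTX ...\n";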
@@ -117,6 +140,24 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.c
 $(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(CUDR) -o $@ -c charge_gpu_memory.cpp
 
+$(OBJ_DIR)/pppm_f_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ pppm_gpu_kernel.cu
+
+$(OBJ_DIR)/pppm_f_gpu_ptx.h: $(OBJ_DIR)/pppm_f_gpu_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h
+
+$(OBJ_DIR)/pppm_d_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ pppm_gpu_kernel.cu
+
+$(OBJ_DIR)/pppm_d_gpu_ptx.h: $(OBJ_DIR)/pppm_d_gpu_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h
+
+$(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_f_gpu_ptx.h $(OBJ_DIR)/pppm_d_gpu_ptx.h
+	$(CUDR) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
+	$(CUDR) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h gb_gpu_extra.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu
 
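The PPPM rules above compile the same pppm_gpu_kernel.cu twice, once with -Dgrdtyp=float -Dgrdtyp4=float4 and once with the double variants, so a single kernel source yields both a single- and a double-precision grid kernel. A plain C++ stand-in for the idea (the function and variable names are illustrative, not from the kernel source):

#ifndef grdtyp
#define grdtyp float   // the Makefile overrides this with -Dgrdtyp=double
#endif

// The same source text compiles to two functions of different precision,
// one per build, without duplicating the kernel.
void scale_grid_sketch(grdtyp *grid, int n, grdtyp factor) {
  for (int i = 0; i < n; ++i)
    grid[i] *= factor;
}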
@@ -144,7 +185,7 @@ $(OBJ_DIR)/lj_cut_gpu_ptx.h: $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_
 $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
+$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp atomic_gpu_memory.h
 	$(CUDR) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx: ljc_cut_gpu_kernel.cu pair_gpu_precision.h
@@ -156,7 +197,7 @@ $(OBJ_DIR)/ljc_cut_gpu_ptx.h: $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_c
 $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
+$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h
@@ -168,9 +209,21 @@ $(OBJ_DIR)/ljcl_cut_gpu_ptx.h: $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc
 $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
+$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/morse_gpu_kernel.ptx: morse_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ morse_gpu_kernel.cu
+
+$(OBJ_DIR)/morse_gpu_ptx.h: $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h
+
+$(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
+	$(CUDR) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp atomic_gpu_memory.h
+	$(CUDR) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/crml_gpu_kernel.ptx: crml_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel.cu
 
@@ -180,7 +233,7 @@ $(OBJ_DIR)/crml_gpu_ptx.h: $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ke
 $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(CUDR) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
+$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h
@@ -192,9 +245,21 @@ $(OBJ_DIR)/lj96_cut_gpu_ptx.h: $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj9
 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
+$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp atomic_gpu_memory.h
 	$(CUDR) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/lj_expand_gpu_kernel.ptx: lj_expand_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ lj_expand_gpu_kernel.cu
+
+$(OBJ_DIR)/lj_expand_gpu_ptx.h: $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_kernel.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h
+
+$(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
+	$(CUDR) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp atomic_gpu_memory.h
+	$(CUDR) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx: cmm_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ cmm_cut_gpu_kernel.cu
 
@@ -204,7 +269,7 @@ $(OBJ_DIR)/cmm_cut_gpu_ptx.h: $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_c
 $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
+$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp atomic_gpu_memory.h
 	$(CUDR) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx: cmmc_long_gpu_kernel.cu pair_gpu_precision.h
@@ -216,7 +281,7 @@ $(OBJ_DIR)/cmmc_long_gpu_ptx.h: $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/c
 $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(CUDR) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
+$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
 
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H)
@@ -14,6 +14,7 @@
 # /* ----------------------------------------------------------------------
 # Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 #                       Peng Wang (Nvidia), penwang@nvidia.com
+#                       Inderaj Bains (NVIDIA), ibains@nvidia.com
 #                       Paul Crozier (SNL), pscrozi@sandia.gov
 # ------------------------------------------------------------------------- */
 
@@ -23,29 +24,36 @@ OCL_LIB = $(LIB_DIR)/libgpu.a
 UCL_H = $(wildcard ./geryon/ucl*.h)
 OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H)
 # Headers for Pair Stuff
-PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
-         pair_gpu_device.h pair_gpu_balance.h
+PAIR_H = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
+         pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
+         pair_gpu_balance.h pppm_gpu_memory.h
 
 ALL_H = $(OCL_H) $(PAIR_H)
 
 EXECS = $(BIN_DIR)/ocl_get_devices
-OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
-       $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
-       $(OBJ_DIR)/charge_gpu_memory.o \
+OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
+       $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_nbor.o \
+       $(OBJ_DIR)/pair_gpu_device.o \
+       $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
+       $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
        $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
        $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
        $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
+       $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
        $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
        $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
+       $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
        $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
        $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
        $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o
-KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \
+KERS = $(OBJ_DIR)/pair_gpu_dev_cl.h $(OBJ_DIR)/pair_gpu_atom_cl.h \
+       $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/pppm_gpu_cl.h \
        $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \
        $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \
-       $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \
-       $(OBJ_DIR)/crml_gpu_cl.h \
-       $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h
+       $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h \
+       $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/morse_gpu_cl.h \
+       $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h \
+       $(OBJ_DIR)/cmmc_long_gpu_cl.h
 
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
 
@@ -57,14 +65,23 @@ $(OBJ_DIR)/pair_gpu_atom_cl.h: pair_gpu_atom_kernel.cu
 $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(OCL_H) $(OBJ_DIR)/pair_gpu_atom_cl.h
 	$(OCL) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(OCL_H)
+	$(OCL) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/pair_gpu_nbor_cl.h: pair_gpu_nbor_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_nbor_kernel.cu $(OBJ_DIR)/pair_gpu_nbor_cl.h
 
-$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
+$(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
+	$(OCL) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h
 	$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H)
-	$(OCL) -o $@ -c pair_gpu_device.cpp
+$(OBJ_DIR)/pair_gpu_dev_cl.h: pair_gpu_dev_kernel.cu
+	$(BSH) ./geryon/file_to_cstr.sh pair_gpu_dev_kernel.cu $(OBJ_DIR)/pair_gpu_dev_cl.h
+
+$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_cl.h
+	$(OCL) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
 	$(OCL) -o $@ -c atomic_gpu_memory.cpp
@@ -72,6 +89,15 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.c
 $(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(OCL) -o $@ -c charge_gpu_memory.cpp
 
+$(OBJ_DIR)/pppm_gpu_cl.h: pppm_gpu_kernel.cu
+	$(BSH) ./geryon/file_to_cstr.sh pppm_gpu_kernel.cu $(OBJ_DIR)/pppm_gpu_cl.h;
+
+$(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_gpu_cl.h $(OBJ_DIR)/pppm_gpu_cl.h
+	$(OCL) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
+	$(OCL) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu
 	$(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h
 
@@ -93,7 +119,7 @@ $(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu
 $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
+$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp atomic_gpu_memory.h
 	$(OCL) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
@@ -102,7 +128,7 @@ $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
 $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
+$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
@@ -111,16 +137,25 @@ $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
 $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
+$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/morse_gpu_cl.h: morse_gpu_kernel.cu
+	$(BSH) ./geryon/file_to_cstr.sh morse_gpu_kernel.cu $(OBJ_DIR)/morse_gpu_cl.h;
+
+$(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
+	$(OCL) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp atomic_gpu_memory.h
+	$(OCL) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/crml_gpu_cl.h: crml_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh crml_gpu_kernel.cu $(OBJ_DIR)/crml_gpu_cl.h;
 
 $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
+$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
@@ -129,16 +164,25 @@ $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
+$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp atomic_gpu_memory.h
 	$(OCL) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
 
+$(OBJ_DIR)/lj_expand_gpu_cl.h: lj_expand_gpu_kernel.cu
+	$(BSH) ./geryon/file_to_cstr.sh lj_expand_gpu_kernel.cu $(OBJ_DIR)/lj_expand_gpu_cl.h;
+
+$(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
+	$(OCL) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp atomic_gpu_memory.h
+	$(OCL) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/cmm_cut_gpu_cl.h: cmm_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh cmm_cut_gpu_kernel.cu $(OBJ_DIR)/cmm_cut_gpu_cl.h;
 
 $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
+$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp atomic_gpu_memory.h
 	$(OCL) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
@@ -147,7 +191,7 @@ $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
 $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
 	$(OCL) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
+$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
 
 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
@@ -14,6 +14,7 @@
 /* ----------------------------------------------------------------------
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
                          Peng Wang (Nvidia), penwang@nvidia.com
+                         Inderaj Bains (NVIDIA), ibains@nvidia.com
                          Paul Crozier (SNL), pscrozi@sandia.gov
 ------------------------------------------------------------------------- */
 
@@ -23,19 +23,24 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 template <class numtyp, class acctyp>
 AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) {
   device=&pair_gpu_device;
+  ans=new PairGPUAns<numtyp,acctyp>();
+  nbor=new PairGPUNbor();
 }
 
 template <class numtyp, class acctyp>
 AtomicGPUMemoryT::~AtomicGPUMemory() {
+  delete ans;
+  delete nbor;
 }
 
 template <class numtyp, class acctyp>
 int AtomicGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
-  return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
 }
 
 template <class numtyp, class acctyp>
-bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
+int AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
                                    const int max_nbors, const int maxspecial,
                                    const double cell_size,
                                    const double gpu_split, FILE *_screen,
@@ -48,24 +53,30 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
     gpu_nbor=true;
 
   int _gpu_host=0;
-  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
   if (host_nlocal>0)
     _gpu_host=1;
 
-  if (!device->init(false,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
-                    _gpu_host,max_nbors,cell_size,false))
-    return false;
+  _threads_per_atom=device->threads_per_atom();
+  if (_threads_per_atom>1 && gpu_nbor==false) {
+    nbor->packing(true);
+    _nbor_data=&(nbor->dev_packed);
+  } else
+    _nbor_data=&(nbor->dev_nbor);
+
+  int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
+                           maxspecial,_gpu_host,max_nbors,cell_size,false);
+  if (success!=0)
+    return success;
+
   ucl_device=device->gpu;
   atom=&device->atom;
-  nbor=&device->nbor;
 
-  _block_size=BLOCK_1D;
-  if (static_cast<size_t>(_block_size)>ucl_device->group_size())
-    _block_size=ucl_device->group_size();
+  _block_size=device->pair_block_size();
   compile_kernels(*ucl_device,pair_program);
 
   // Initialize host-device load balancer
-  hd_balancer.init(device,gpu_split);
+  hd_balancer.init(device,gpu_nbor,gpu_split);
 
   // Initialize timers for the selected GPU
   time_pair.init(*ucl_device);
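The reworked initialization above makes a choice the rest of the class reads through members: when more than one thread will cooperate on an atom and neighboring is done on the host, the neighbor list is repacked, and _nbor_data is pointed at whichever device buffer the kernels should use. A standalone C++ sketch of that select-a-view idiom, with types simplified (std::vector stands in for the Geryon device buffers in the diff):

#include <vector>

struct NborSketch {
  std::vector<int> dev_nbor;    // stand-in for the raw device list
  std::vector<int> dev_packed;  // stand-in for the repacked list
  bool pack = false;
  void packing(bool p) { pack = p; }
};

// Mirrors the branch in init_atomic(): pick the packed layout only when
// several threads share one atom and the list was built on the host.
std::vector<int> *select_nbor_view(NborSketch &nbor, int threads_per_atom,
                                   bool gpu_nbor) {
  if (threads_per_atom > 1 && !gpu_nbor) {
    nbor.packing(true);
    return &nbor.dev_packed;
  }
  return &nbor.dev_nbor;
}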
@@ -73,9 +84,14 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
 
   pos_tex.bind_float(atom->dev_x,4);
 
-  _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
 
-  return true;
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void AtomicGPUMemoryT::estimate_gpu_overhead() {
+  device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
 }
 
 template <class numtyp, class acctyp>
@@ -83,7 +99,10 @@ void AtomicGPUMemoryT::clear_atomic() {
   // Output any timing information
   acc_timers();
   double avg_split=hd_balancer.all_avg_split();
-  device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);
+  _gpu_overhead*=hd_balancer.timestep();
+  _driver_overhead*=hd_balancer.timestep();
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
+                       _gpu_overhead,_driver_overhead,_threads_per_atom,screen);
 
   if (_compiled) {
     k_pair_fast.clear();
@@ -107,8 +126,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
   success=true;
 
   nbor_time_avail=true;
-
-  int mn=nbor->max_nbor_loop(inum,numj);
+  int mn=nbor->max_nbor_loop(inum,numj,ilist);
   resize_atom(inum,nall,success);
   resize_local(inum,mn,success);
   if (!success)
@@ -116,7 +134,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
 
   nbor->get_host(inum,ilist,numj,firstneigh,block_size());
 
-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
 
@@ -130,8 +148,8 @@ template <class numtyp, class acctyp>
 inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
                                               const int host_inum,
                                               const int nall, double **host_x,
-                                              int *host_type, double *boxlo,
-                                              double *boxhi, int *tag,
+                                              int *host_type, double *sublo,
+                                              double *subhi, int *tag,
                                               int **nspecial, int **special,
                                               bool &success) {
   nbor_time_avail=true;
@@ -144,10 +162,10 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
   atom->cast_copy_x(host_x,host_type);
 
   int mn;
-  nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
+  nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
                         nspecial, special, success, mn);
 
-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
 }
@@ -156,9 +174,8 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
-                               const int inum_full, const int nall,
-                               double **host_x, int *host_type,
+void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full,
+                               const int nall, double **host_x, int *host_type,
                                int *ilist, int *numj, int **firstneigh,
                                const bool eflag, const bool vflag,
                                const bool eatom, const bool vatom,
@@ -166,14 +183,16 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
                                bool &success) {
   acc_timers();
   if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
     zero_timers();
     return;
   }
 
   int ago=hd_balancer.ago_first(f_ago);
-  int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
-                               nbor->gpu_nbor());
-  atom->inum(inum);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
+  ans->inum(inum);
   host_start=inum;
 
   if (ago==0) {
@@ -187,7 +206,8 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
     atom->add_x_data(host_x,host_type);
 
   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();
 }
 
@@ -195,29 +215,32 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
 // Reneighbor on GPU if necessary and then compute forces, virials, energies
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-int * AtomicGPUMemoryT::compute(const int timestep, const int ago,
-                                const int inum_full, const int nall,
-                                double **host_x, int *host_type, double *boxlo,
-                                double *boxhi, int *tag, int **nspecial,
-                                int **special, const bool eflag,
+int ** AtomicGPUMemoryT::compute(const int ago, const int inum_full,
+                                 const int nall, double **host_x, int *host_type,
+                                 double *sublo, double *subhi, int *tag,
+                                 int **nspecial, int **special, const bool eflag,
                                 const bool vflag, const bool eatom,
                                 const bool vatom, int &host_start,
+                                int **ilist, int **jnum,
                                 const double cpu_time, bool &success) {
   acc_timers();
   if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
     zero_timers();
     return NULL;
   }
 
-  hd_balancer.balance(cpu_time,nbor->gpu_nbor());
-  int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
-  atom->inum(inum);
+  hd_balancer.balance(cpu_time);
+  int inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
   host_start=inum;
 
   // Build neighbor list on GPU if necessary
   if (ago==0) {
     build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
-                    boxlo, boxhi, tag, nspecial, special, success);
+                    sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return NULL;
     hd_balancer.start_timer();
|
|||||||
hd_balancer.start_timer();
|
hd_balancer.start_timer();
|
||||||
atom->add_x_data(host_x,host_type);
|
atom->add_x_data(host_x,host_type);
|
||||||
}
|
}
|
||||||
|
*ilist=nbor->host_ilist.begin();
|
||||||
|
*jnum=nbor->host_acc.begin();
|
||||||
|
|
||||||
loop(eflag,vflag);
|
loop(eflag,vflag);
|
||||||
atom->copy_answers(eflag,vflag,eatom,vatom);
|
ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||||
|
device->add_ans_object(ans);
|
||||||
hd_balancer.stop_timer();
|
hd_balancer.stop_timer();
|
||||||
|
|
||||||
return device->nbor.host_nbor.begin();
|
return nbor->host_jlist.begin()-host_start;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
double AtomicGPUMemoryT::host_memory_usage_atomic() const {
|
double AtomicGPUMemoryT::host_memory_usage_atomic() const {
|
||||||
return device->atom.host_memory_usage()+
|
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
|
||||||
device->nbor.host_memory_usage()+4*sizeof(numtyp)+
|
4*sizeof(numtyp)+sizeof(AtomicGPUMemory<numtyp,acctyp>);
|
||||||
sizeof(AtomicGPUMemory<numtyp,acctyp>);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
|
|||||||
@ -18,8 +18,6 @@
|
|||||||
#ifndef ATOMIC_GPU_MEMORY_H
|
#ifndef ATOMIC_GPU_MEMORY_H
|
||||||
#define ATOMIC_GPU_MEMORY_H
|
#define ATOMIC_GPU_MEMORY_H
|
||||||
|
|
||||||
#define BLOCK_1D 64
|
|
||||||
|
|
||||||
#include "pair_gpu_device.h"
|
#include "pair_gpu_device.h"
|
||||||
#include "pair_gpu_balance.h"
|
#include "pair_gpu_balance.h"
|
||||||
#include "mpi.h"
|
#include "mpi.h"
|
||||||
@@ -39,17 +37,28 @@ class AtomicGPUMemory {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init_atomic(const int nlocal, const int nall, const int max_nbors,
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init_atomic(const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size,
                   const double gpu_split, FILE *screen,
                   const char *pair_program);
 
+  /// Estimate the overhead for GPU context changes and CPU driver
+  void estimate_gpu_overhead();
+
   /// Check if there is enough storage for atom arrays and realloc if not
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int inum, const int nall, bool &success) {
-    if (atom->resize(inum, nall, success))
+    if (atom->resize(nall, success))
       pos_tex.bind_float(atom->dev_x,4);
+    ans->resize(inum,success);
   }
 
   /// Check if there is enough storage for neighbors and realloc if not
@ -85,6 +94,7 @@ class AtomicGPUMemory {
|
|||||||
|
|
||||||
/// Accumulate timers
|
/// Accumulate timers
|
||||||
inline void acc_timers() {
|
inline void acc_timers() {
|
||||||
|
if (device->time_device()) {
|
||||||
if (nbor_time_avail) {
|
if (nbor_time_avail) {
|
||||||
nbor->time_nbor.add_to_total();
|
nbor->time_nbor.add_to_total();
|
||||||
nbor->time_kernel.add_to_total();
|
nbor->time_kernel.add_to_total();
|
||||||
@ -92,6 +102,8 @@ class AtomicGPUMemory {
|
|||||||
}
|
}
|
||||||
time_pair.add_to_total();
|
time_pair.add_to_total();
|
||||||
atom->acc_timers();
|
atom->acc_timers();
|
||||||
|
ans->acc_timers();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Zero timers
|
/// Zero timers
|
||||||
@ -99,6 +111,7 @@ class AtomicGPUMemory {
|
|||||||
nbor_time_avail=false;
|
nbor_time_avail=false;
|
||||||
time_pair.zero();
|
time_pair.zero();
|
||||||
atom->zero_timers();
|
atom->zero_timers();
|
||||||
|
ans->zero_timers();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Copy neighbor list from host
|
/// Copy neighbor list from host
|
||||||
@ -108,24 +121,32 @@ class AtomicGPUMemory {
|
|||||||
/// Build neighbor list on device
|
/// Build neighbor list on device
|
||||||
void build_nbor_list(const int inum, const int host_inum,
|
void build_nbor_list(const int inum, const int host_inum,
|
||||||
const int nall, double **host_x, int *host_type,
|
const int nall, double **host_x, int *host_type,
|
||||||
double *boxlo, double *boxhi, int *tag, int **nspecial,
|
double *sublo, double *subhi, int *tag, int **nspecial,
|
||||||
int **special, bool &success);
|
int **special, bool &success);
|
||||||
|
|
||||||
/// Pair loop with host neighboring
|
/// Pair loop with host neighboring
|
||||||
void compute(const int timestep, const int f_ago, const int inum_full,
|
void compute(const int f_ago, const int inum_full,
|
||||||
const int nall, double **host_x, int *host_type,
|
const int nall, double **host_x, int *host_type,
|
||||||
int *ilist, int *numj, int **firstneigh, const bool eflag,
|
int *ilist, int *numj, int **firstneigh, const bool eflag,
|
||||||
const bool vflag, const bool eatom, const bool vatom,
|
const bool vflag, const bool eatom, const bool vatom,
|
||||||
int &host_start, const double cpu_time, bool &success);
|
int &host_start, const double cpu_time, bool &success);
|
||||||
|
|
||||||
/// Pair loop with device neighboring
|
/// Pair loop with device neighboring
|
||||||
int * compute(const int timestep, const int ago, const int inum_full,
|
int * compute(const int ago, const int inum_full,
|
||||||
const int nall, double **host_x, int *host_type, double *boxlo,
|
const int nall, double **host_x, int *host_type, double *sublo,
|
||||||
double *boxhi, int *tag, int **nspecial,
|
double *subhi, int *tag, int **nspecial,
|
||||||
int **special, const bool eflag, const bool vflag,
|
int **special, const bool eflag, const bool vflag,
|
||||||
const bool eatom, const bool vatom, int &host_start,
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
const double cpu_time, bool &success);
|
const double cpu_time, bool &success);
|
||||||
|
|
||||||
|
/// Pair loop with device neighboring
|
||||||
|
int ** compute(const int ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type, double *sublo,
|
||||||
|
double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **numj, const double cpu_time, bool &success);
|
||||||
|
|
||||||
// -------------------------- DEVICE DATA -------------------------
|
// -------------------------- DEVICE DATA -------------------------
|
||||||
|
|
||||||
/// Device Properties and Atom and Neighbor storage
|
/// Device Properties and Atom and Neighbor storage
|
||||||
@ -148,6 +169,9 @@ class AtomicGPUMemory {
|
|||||||
/// Atom Data
|
/// Atom Data
|
||||||
PairGPUAtom<numtyp,acctyp> *atom;
|
PairGPUAtom<numtyp,acctyp> *atom;
|
||||||
|
|
||||||
|
// ------------------------ FORCE/ENERGY DATA -----------------------
|
||||||
|
|
||||||
|
PairGPUAns<numtyp,acctyp> *ans;
|
||||||
|
|
||||||
// --------------------------- NBOR DATA ----------------------------
|
// --------------------------- NBOR DATA ----------------------------
|
||||||
|
|
||||||
@ -167,8 +191,10 @@ class AtomicGPUMemory {
|
|||||||
|
|
||||||
protected:
|
protected:
|
||||||
bool _compiled;
|
bool _compiled;
|
||||||
int _block_size;
|
int _block_size, _threads_per_atom;
|
||||||
double _max_bytes, _max_an_bytes;
|
double _max_bytes, _max_an_bytes;
|
||||||
|
double _gpu_overhead, _driver_overhead;
|
||||||
|
UCL_D_Vec<int> *_nbor_data;
|
||||||
|
|
||||||
void compile_kernels(UCL_Device &dev, const char *pair_string);
|
void compile_kernels(UCL_Device &dev, const char *pair_string);
|
||||||
|
|
||||||
|
|||||||
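init_atomic() switches from a bool to an int status code, with the meanings enumerated in the doc comment above. A hedged sketch of how a caller could translate those codes into messages; the helper itself is illustrative and not part of this library:

// Illustrative only: map the documented init_atomic() return codes to text.
#include <cstdio>

const char *init_error_string(int code) {
  switch (code) {
    case  0: return "success";
    case -1: return "fix gpu not found";
    case -3: return "out of memory on device";
    case -4: return "GPU library not compiled for this accelerator";
    case -5: return "double precision not supported on this card";
    default: return "unknown error";
  }
}

int main() {
  std::printf("%s\n", init_error_string(-3));  // "out of memory on device"
}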
@@ -23,19 +23,24 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 template <class numtyp, class acctyp>
 ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) {
   device=&pair_gpu_device;
+  ans=new PairGPUAns<numtyp,acctyp>();
+  nbor=new PairGPUNbor();
 }

 template <class numtyp, class acctyp>
 ChargeGPUMemoryT::~ChargeGPUMemory() {
+  delete ans;
+  delete nbor;
 }

 template <class numtyp, class acctyp>
 int ChargeGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
-  return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
 }

 template <class numtyp, class acctyp>
-bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
+int ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
                                    const int max_nbors, const int maxspecial,
                                    const double cell_size,
                                    const double gpu_split, FILE *_screen,
@@ -48,24 +53,31 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
     gpu_nbor=true;

   int _gpu_host=0;
-  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
   if (host_nlocal>0)
     _gpu_host=1;

-  if (!device->init(true,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
-                    _gpu_host,max_nbors,cell_size,false))
-    return false;
+  _threads_per_atom=device->threads_per_charge();
+  if (_threads_per_atom>1 && gpu_nbor==false) {
+    nbor->packing(true);
+    _nbor_data=&(nbor->dev_packed);
+  } else
+    _nbor_data=&(nbor->dev_nbor);
+
+  int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
+                           maxspecial,_gpu_host,max_nbors,cell_size,false);
+  if (success!=0)
+    return success;

   ucl_device=device->gpu;
   atom=&device->atom;
-  nbor=&device->nbor;

-  _block_size=BLOCK_1D;
-  if (static_cast<size_t>(_block_size)>ucl_device->group_size())
-    _block_size=ucl_device->group_size();
+  _block_size=device->pair_block_size();
+  _block_bio_size=device->block_bio_pair();
   compile_kernels(*ucl_device,pair_program);

   // Initialize host-device load balancer
-  hd_balancer.init(device,gpu_split);
+  hd_balancer.init(device,gpu_nbor,gpu_split);

   // Initialize timers for the selected GPU
   time_pair.init(*ucl_device);
@@ -74,9 +86,14 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
   pos_tex.bind_float(atom->dev_x,4);
   q_tex.bind_float(atom->dev_q,1);

-  _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();

-  return true;
+  return success;
+}
+
+template <class numtyp, class acctyp>
+void ChargeGPUMemoryT::estimate_gpu_overhead() {
+  device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
 }

 template <class numtyp, class acctyp>
@@ -84,7 +101,10 @@ void ChargeGPUMemoryT::clear_atomic() {
   // Output any timing information
   acc_timers();
   double avg_split=hd_balancer.all_avg_split();
-  device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);
+  _gpu_overhead*=hd_balancer.timestep();
+  _driver_overhead*=hd_balancer.timestep();
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
+                       _gpu_overhead,_driver_overhead,_threads_per_atom,screen);

   if (_compiled) {
     k_pair_fast.clear();
@@ -109,7 +129,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,

   nbor_time_avail=true;

-  int mn=nbor->max_nbor_loop(inum,numj);
+  int mn=nbor->max_nbor_loop(inum,numj,ilist);
   resize_atom(inum,nall,success);
   resize_local(inum,mn,success);
   if (!success)
@@ -117,7 +137,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,

   nbor->get_host(inum,ilist,numj,firstneigh,block_size());

-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;

@@ -131,8 +151,8 @@ template <class numtyp, class acctyp>
 inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
                                               const int host_inum,
                                               const int nall, double **host_x,
-                                              int *host_type, double *boxlo,
-                                              double *boxhi, int *tag,
+                                              int *host_type, double *sublo,
+                                              double *subhi, int *tag,
                                               int **nspecial, int **special,
                                               bool &success) {
   nbor_time_avail=true;
@@ -145,10 +165,10 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
   atom->cast_copy_x(host_x,host_type);

   int mn;
-  nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
+  nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
                         nspecial, special, success, mn);

-  double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
 }
@@ -157,24 +177,26 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
-                               const int inum_full, const int nall,
-                               double **host_x, int *host_type,
+void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full,
+                               const int nall, double **host_x, int *host_type,
                                int *ilist, int *numj, int **firstneigh,
                                const bool eflag, const bool vflag,
                                const bool eatom, const bool vatom,
                                int &host_start, const double cpu_time,
-                               bool &success, double *host_q) {
+                               bool &success, double *host_q,
+                               const int nlocal, double *boxlo, double *prd) {
   acc_timers();
   if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
     zero_timers();
     return;
   }

   int ago=hd_balancer.ago_first(f_ago);
-  int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
-                               nbor->gpu_nbor());
-  atom->inum(inum);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
+  ans->inum(inum);
   host_start=inum;

   if (ago==0) {
@@ -187,10 +209,14 @@ void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
   atom->cast_q_data(host_q);
   hd_balancer.start_timer();
   atom->add_x_data(host_x,host_type);
-  atom->add_other_data();
+  atom->add_q_data();
+
+  device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
+                     boxlo, prd);

   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();
 }

@@ -198,30 +224,33 @@ void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
 // Reneighbor on GPU if necessary and then compute forces, virials, energies
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-int * ChargeGPUMemoryT::compute(const int timestep, const int ago,
-                                const int inum_full, const int nall,
-                                double **host_x, int *host_type, double *boxlo,
-                                double *boxhi, int *tag, int **nspecial,
-                                int **special, const bool eflag,
+int** ChargeGPUMemoryT::compute(const int ago, const int inum_full,
+                                const int nall, double **host_x, int *host_type,
+                                double *sublo, double *subhi, int *tag,
+                                int **nspecial, int **special, const bool eflag,
                                 const bool vflag, const bool eatom,
                                 const bool vatom, int &host_start,
+                                int **ilist, int **jnum,
                                 const double cpu_time, bool &success,
-                                double *host_q) {
+                                double *host_q, double *boxlo, double *prd) {
   acc_timers();
   if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
     zero_timers();
     return NULL;
   }

-  hd_balancer.balance(cpu_time,nbor->gpu_nbor());
-  int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
-  atom->inum(inum);
+  hd_balancer.balance(cpu_time);
+  int inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
   host_start=inum;

   // Build neighbor list on GPU if necessary
   if (ago==0) {
     build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
-                    boxlo, boxhi, tag, nspecial, special, success);
+                    sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return NULL;
     atom->cast_q_data(host_q);
@@ -232,20 +261,25 @@ int * ChargeGPUMemoryT::compute(const int timestep, const int ago,
     hd_balancer.start_timer();
     atom->add_x_data(host_x,host_type);
   }
-  atom->add_other_data();
+  atom->add_q_data();
+  *ilist=nbor->host_ilist.begin();
+  *jnum=nbor->host_acc.begin();
+
+  device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
+                     boxlo, prd);

   loop(eflag,vflag);
-  atom->copy_answers(eflag,vflag,eatom,vatom);
+  ans->copy_answers(eflag,vflag,eatom,vatom);
+  device->add_ans_object(ans);
   hd_balancer.stop_timer();

-  return device->nbor.host_nbor.begin();
+  return nbor->host_jlist.begin()-host_start;
 }

 template <class numtyp, class acctyp>
 double ChargeGPUMemoryT::host_memory_usage_atomic() const {
-  return device->atom.host_memory_usage()+
-         device->nbor.host_memory_usage()+4*sizeof(numtyp)+
-         sizeof(ChargeGPUMemory<numtyp,acctyp>);
+  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
+         4*sizeof(numtyp)+sizeof(ChargeGPUMemory<numtyp,acctyp>);
 }

 template <class numtyp, class acctyp>
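Throughout these hunks the balancer calls lose their explicit timestep argument and the class sizes its work from ans->inum(). The split itself is governed by gpu_split, documented above as the fraction of particles handled by the device. A simplified sketch of a fixed-fraction split under that documented meaning; the real balancer can apparently also adapt the split from measured CPU and GPU times, which this sketch does not attempt:

// Sketch only: fixed-fraction host/device split.
#include <cmath>
#include <cstdio>

int gpu_count(int inum_full, double gpu_split) {
  // atoms 0..inum-1 go to the GPU, inum..inum_full-1 stay on the host
  return static_cast<int>(std::floor(gpu_split * inum_full));
}

int main() {
  std::printf("%d of 1000 atoms on the GPU\n", gpu_count(1000, 0.75));  // 750
}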
@@ -18,8 +18,6 @@
 #ifndef CHARGE_GPU_MEMORY_H
 #define CHARGE_GPU_MEMORY_H

-#define BLOCK_1D 64
-
 #include "pair_gpu_device.h"
 #include "pair_gpu_balance.h"
 #include "mpi.h"
@@ -39,19 +37,30 @@ class ChargeGPUMemory {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
-   * \param gpu_split fraction of particles handled by device **/
-  bool init_atomic(const int nlocal, const int nall, const int max_nbors,
+   * \param gpu_split fraction of particles handled by device
+   *
+   * Returns:
+   * - 0 if successful
+   * - -1 if fix gpu not found
+   * - -3 if there is an out of memory error
+   * - -4 if the GPU library was not compiled for GPU
+   * - -5 if double precision is not supported on the card **/
+  int init_atomic(const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size,
                   const double gpu_split, FILE *screen,
                   const char *pair_program);

+  /// Estimate the overhead for GPU context changes and CPU driver
+  void estimate_gpu_overhead();
+
   /// Check if there is enough storage for atom arrays and realloc if not
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int inum, const int nall, bool &success) {
-    if (atom->resize(inum, nall, success)) {
+    if (atom->resize(nall, success)) {
       pos_tex.bind_float(atom->dev_x,4);
       q_tex.bind_float(atom->dev_q,1);
     }
+    ans->resize(inum,success);
   }

   /// Check if there is enough storage for neighbors and realloc if not
@@ -87,6 +96,7 @@ class ChargeGPUMemory {

   /// Accumulate timers
   inline void acc_timers() {
+    if (device->time_device()) {
     if (nbor_time_avail) {
       nbor->time_nbor.add_to_total();
       nbor->time_kernel.add_to_total();
@@ -94,6 +104,8 @@ class ChargeGPUMemory {
     }
     time_pair.add_to_total();
     atom->acc_timers();
+    ans->acc_timers();
+    }
   }

   /// Zero timers
@@ -101,6 +113,7 @@ class ChargeGPUMemory {
     nbor_time_avail=false;
     time_pair.zero();
     atom->zero_timers();
+    ans->zero_timers();
   }

   /// Copy neighbor list from host
@@ -110,24 +123,25 @@ class ChargeGPUMemory {
   /// Build neighbor list on device
   void build_nbor_list(const int inum, const int host_inum,
                        const int nall, double **host_x, int *host_type,
-                       double *boxlo, double *boxhi, int *tag, int **nspecial,
+                       double *sublo, double *subhi, int *tag, int **nspecial,
                        int **special, bool &success);

   /// Pair loop with host neighboring
-  void compute(const int timestep, const int f_ago, const int inum_full,
-               const int nall, double **host_x, int *host_type,
-               int *ilist, int *numj, int **firstneigh, const bool eflag,
-               const bool vflag, const bool eatom, const bool vatom,
-               int &host_start, const double cpu_time, bool &success,
-               double *charge);
+  void compute(const int f_ago, const int inum_full, const int nall,
+               double **host_x, int *host_type, int *ilist, int *numj,
+               int **firstneigh, const bool eflag, const bool vflag,
+               const bool eatom, const bool vatom, int &host_start,
+               const double cpu_time, bool &success, double *charge,
+               const int nlocal, double *boxlo, double *prd);

   /// Pair loop with device neighboring
-  int * compute(const int timestep, const int ago, const int inum_full,
-                const int nall, double **host_x, int *host_type, double *boxlo,
-                double *boxhi, int *tag, int **nspecial,
+  int** compute(const int ago, const int inum_full, const int nall,
+                double **host_x, int *host_type, double *sublo,
+                double *subhi, int *tag, int **nspecial,
                 int **special, const bool eflag, const bool vflag,
                 const bool eatom, const bool vatom, int &host_start,
-                const double cpu_time, bool &success, double *charge);
+                int **ilist, int **numj, const double cpu_time, bool &success,
+                double *charge, double *boxlo, double *prd);

   // -------------------------- DEVICE DATA -------------------------

@@ -152,6 +166,10 @@ class ChargeGPUMemory {
   PairGPUAtom<numtyp,acctyp> *atom;

+  // ------------------------ FORCE/ENERGY DATA -----------------------
+
+  PairGPUAns<numtyp,acctyp> *ans;
+
   // --------------------------- NBOR DATA ----------------------------

   /// Neighbor data
@@ -171,8 +189,10 @@ class ChargeGPUMemory {

  protected:
   bool _compiled;
-  int _block_size;
+  int _block_size, _block_bio_size, _threads_per_atom;
   double _max_bytes, _max_an_bytes;
+  double _gpu_overhead, _driver_overhead;
+  UCL_D_Vec<int> *_nbor_data;

   void compile_kernels(UCL_Device &dev, const char *pair_string);
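resize_atom() now grows position/charge storage by nall only and lets the separate answer object grow by inum; the textures are rebound only when atom->resize() reports that the allocation actually moved. A small sketch of that resize-then-rebind pattern, with hypothetical Buffer/Texture types standing in for the UCL vectors and pos_tex:

// Sketch of the resize-then-rebind pattern (hypothetical stand-in types).
struct Buffer {
  float *ptr = nullptr; int cap = 0;
  bool resize(int n) {                 // returns true if storage moved
    if (n <= cap) return false;
    delete [] ptr; ptr = new float[n]; cap = n; return true;
  }
};
struct Texture {
  const float *bound = nullptr;
  void bind(const Buffer &b) { bound = b.ptr; }
};

void resize_atom(Buffer &x, Texture &pos_tex, int nall) {
  if (x.resize(4*nall))   // only rebind when the allocation actually moved
    pos_tex.bind(x);
}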
@@ -28,7 +28,7 @@ static CMM_GPU_Memory<PRECISION,ACC_PRECISION> CMMMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
+int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
                   double **host_lj1, double **host_lj2, double **host_lj3,
                   double **host_lj4, double **offset, double *special_lj,
                   const int inum, const int nall, const int max_nbors,
@@ -54,13 +54,11 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
     fflush(screen);
   }

-  if (world_me==0) {
-    bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
-                            host_lj4, offset, special_lj, inum, nall, 300,
-                            maxspecial, cell_size, gpu_split, screen);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
+                       host_lj4, offset, special_lj, inum, nall, 300,
+                       maxspecial, cell_size, gpu_split, screen);

   CMMMF.device->world_barrier();
   if (message)
@@ -75,45 +73,45 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
-                              host_lj4, offset, special_lj, inum, nall, 300,
-                              maxspecial, cell_size, gpu_split,
-                              screen);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
+                         host_lj4, offset, special_lj, inum, nall, 300,
+                         maxspecial, cell_size, gpu_split, screen);
     CMMMF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+
+  if (init_ok==0)
+    CMMMF.estimate_gpu_overhead();
+  return init_ok;
 }

 void cmm_gpu_clear() {
   CMMMF.clear();
 }

-int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum_full,
-                        const int nall, double **host_x, int *host_type,
-                        double *boxlo, double *boxhi, int *tag, int **nspecial,
-                        int **special, const bool eflag, const bool vflag,
-                        const bool eatom, const bool vatom, int &host_start,
-                        const double cpu_time, bool &success) {
-  return CMMMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
-                       boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                       vatom, host_start, cpu_time, success);
+int** cmm_gpu_compute_n(const int ago, const int inum_full,
+                        const int nall, double **host_x, int *host_type,
+                        double *sublo, double *subhi, int *tag, int **nspecial,
+                        int **special, const bool eflag, const bool vflag,
+                        const bool eatom, const bool vatom, int &host_start,
+                        int **ilist, int **jnum, const double cpu_time,
+                        bool &success) {
+  return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                       subhi, tag, nspecial, special, eflag, vflag, eatom,
+                       vatom, host_start, ilist, jnum, cpu_time, success);
 }

-void cmm_gpu_compute(const int timestep, const int ago, const int inum_full,
-                     const int nall, double **host_x, int *host_type,
-                     int *ilist, int *numj, int **firstneigh,
-                     const bool eflag, const bool vflag, const bool eatom,
-                     const bool vatom, int &host_start, const double cpu_time,
-                     bool &success) {
-  CMMMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
-                firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
+void cmm_gpu_compute(const int ago, const int inum_full, const int nall,
+                     double **host_x, int *host_type, int *ilist, int *numj,
+                     int **firstneigh, const bool eflag, const bool vflag,
+                     const bool eatom, const bool vatom, int &host_start,
+                     const double cpu_time, bool &success) {
+  CMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
+                firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
 }
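The init path above lets world rank 0 initialize first, then the remaining ranks sharing each GPU take their turn between barriers, presumably so that kernel compilation and device setup are not attempted by every process at once. A sketch of the same staggering with plain MPI, using a hypothetical do_init callback in place of CMMMF.init():

// Sketch of the staggered-initialization pattern (illustrative only).
#include <mpi.h>

void staggered_init(MPI_Comm world, MPI_Comm gpu_comm, int last_gpu,
                    int (*do_init)(void)) {
  int world_me, gpu_rank, init_ok = 0;
  MPI_Comm_rank(world, &world_me);
  MPI_Comm_rank(gpu_comm, &gpu_rank);

  if (world_me == 0)               // one rank goes first
    init_ok = do_init();
  MPI_Barrier(world);              // corresponds to world_barrier()

  for (int i = 0; i < last_gpu; i++) {
    if (gpu_rank == i && world_me != 0)
      init_ok = do_init();
    MPI_Barrier(gpu_comm);         // corresponds to gpu_barrier()
  }
  (void)init_ok;                   // the real code propagates this code
}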
@@ -18,8 +18,6 @@
 #ifndef CMM_GPU_KERNEL
 #define CMM_GPU_KERNEL

-#define MAX_SHARED_TYPES 8
-
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
@@ -46,7 +44,7 @@

 #ifdef NV_KERNEL

-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"
 texture<float4> pos_tex;

 #ifdef _DOUBLE_DOUBLE
@@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 #define __inline inline

 #define fetch_pos(i,y) x_[i]
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8

 #endif

@@ -82,19 +82,21 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types,
                           __global numtyp *sp_lj_in, __global int *dev_nbor,
-                          __global acctyp4 *ans, __global acctyp *engv,
-                          const int eflag, const int vflag, const int inum,
-                          const int nall, const int nbor_pitch) {
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag,
+                          const int vflag, const int inum, const int nall,
+                          const int nbor_pitch, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;

   __local numtyp sp_lj[4];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];

-  if (ii<inum) {
-
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0;
@@ -104,18 +106,32 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  if (ii<inum) {
+
   __global int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=nbor_pitch;
   int numj=*nbor;
   nbor+=nbor_pitch;
-  __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+  int n_stride;
+  __global int *list_end;
+  if (dev_nbor==dev_packed) {
+    list_end=nbor+mul24(numj,nbor_pitch);
+    nbor+=mul24(offset,nbor_pitch);
+    n_stride=mul24(t_per_atom,nbor_pitch);
+  } else {
+    nbor=dev_packed+*nbor;
+    list_end=nbor+numj;
+    n_stride=t_per_atom;
+    nbor+=offset;
+  }

   numtyp4 ix=fetch_pos(i,x_); //x_[i];
   int itype=ix.w;

   numtyp factor_lj;
-  for ( ; nbor<list_end; nbor+=nbor_pitch) {
+  for ( ; nbor<list_end; nbor+=n_stride) {

     int j=*nbor;
     factor_lj = sp_lj[sbmask(j)];
@@ -164,8 +180,47 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     }

   } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1=energy;
@@ -184,25 +239,25 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in,
                                __global numtyp* sp_lj_in,__global int *dev_nbor,
-                               __global acctyp4 *ans, __global acctyp *engv,
-                               const int eflag, const int vflag, const int inum,
-                               const int nall, const int nbor_pitch) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                               __global int *dev_packed, __global acctyp4 *ans,
+                               __global acctyp *engv, const int eflag,
+                               const int vflag, const int inum, const int nall,
+                               const int nbor_pitch, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;

   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
-  if (ii<4)
-    sp_lj[ii]=sp_lj_in[ii];
-  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    lj1[ii]=lj1_in[ii];
+  if (tid<4)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
     if (eflag>0)
-      lj3[ii]=lj3_in[ii];
+      lj3[tid]=lj3_in[tid];
   }
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
-  __syncthreads();
-
-  if (ii<inum) {

   acctyp energy=(acctyp)0;
   acctyp4 f;
@@ -213,19 +268,34 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  __syncthreads();
+
+  if (ii<inum) {
   __global int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=nbor_pitch;
   int numj=*nbor;
   nbor+=nbor_pitch;
-  __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+  int n_stride;
+  __global int *list_end;
+  if (dev_nbor==dev_packed) {
+    list_end=nbor+mul24(numj,nbor_pitch);
+    nbor+=mul24(offset,nbor_pitch);
+    n_stride=mul24(t_per_atom,nbor_pitch);
+  } else {
+    nbor=dev_packed+*nbor;
+    list_end=nbor+numj;
+    n_stride=t_per_atom;
+    nbor+=offset;
+  }

   numtyp4 ix=fetch_pos(i,x_); //x_[i];
   int iw=ix.w;
   int itype=mul24((int)MAX_SHARED_TYPES,iw);

   numtyp factor_lj;
-  for ( ; nbor<list_end; nbor+=nbor_pitch) {
+  for ( ; nbor<list_end; nbor+=n_stride) {

     int j=*nbor;
     factor_lj = sp_lj[sbmask(j)];
@@ -273,8 +343,47 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     }

   } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1=energy;
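The kernel changes in this file introduce the t_per_atom scheme: several threads cooperate on one atom, each walking a strided slice of its neighbor list, and the partial forces are combined with a shared-memory tree reduction before lane 0 stores the result. A compact CUDA sketch of the same pattern; BLOCK_PAIR, the row pitch, and the per-pair term are placeholders, and like the kernels above it assumes t_per_atom does not exceed the warp width so the lanes run in lockstep and the reduction loop needs no barrier:

// CUDA sketch of the t_per_atom reduction (launch with blockDim.x == 64).
#define BLOCK_PAIR 64

__global__ void pair_sketch(const int *packed_nbors, const int *numj,
                            float *fx, int inum, int npitch, int t_per_atom) {
  int tid = threadIdx.x;
  int ii = blockIdx.x * (blockDim.x / t_per_atom) + tid / t_per_atom;
  int offset = tid % t_per_atom;       // this thread's lane within the atom

  float f = 0.0f;
  if (ii < inum) {
    const int *nbor = packed_nbors + ii * npitch + offset;
    const int *end  = packed_nbors + ii * npitch + numj[ii];
    for (; nbor < end; nbor += t_per_atom)   // lanes stride through the list
      f += 0.001f * (*nbor);           // placeholder per-pair contribution
  }

  // Tree reduction across the t_per_atom lanes of each atom.
  __shared__ float red_acc[BLOCK_PAIR];
  red_acc[tid] = f;
  for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1)
    if (offset < s) red_acc[tid] += red_acc[tid + s];

  if (ii < inum && offset == 0)        // lane 0 stores the combined force
    fx[ii] = red_acc[tid];
}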
@@ -42,7 +42,7 @@ int CMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }

 template <class numtyp, class acctyp>
-bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
+int CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
                            int **host_cg_type, double **host_lj1,
                            double **host_lj2, double **host_lj3,
                            double **host_lj4, double **host_offset,
@@ -50,14 +50,18 @@ bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
                            const int nall, const int max_nbors,
                            const int maxspecial, const double cell_size,
                            const double gpu_split, FILE *_screen) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,cmm_cut_gpu_kernel);
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
+                            gpu_split,_screen,cmm_cut_gpu_kernel);
+  if (success!=0)
+    return success;

   // If atom type constants fit in shared memory use fast kernel
   int cmm_types=ntypes;
   shared_types=false;
-  if (cmm_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
-    cmm_types=MAX_SHARED_TYPES;
+  int max_shared_types=this->device->max_shared_types();
+  if (cmm_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    cmm_types=max_shared_types;
     shared_types=true;
   }
   _cmm_types=cmm_types;
@@ -84,7 +88,7 @@ bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,

   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }

 template <class numtyp, class acctyp>
@@ -122,9 +126,10 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;

-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));

-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -133,16 +138,18 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch);
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
+                          &ainum, &anall, &nbor_pitch,
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch);
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &anall, &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
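Because each block of BX threads now covers only BX/t_per_atom atoms, loop() sizes the grid as ceil(inum/(BX/t_per_atom)) instead of ceil(inum/BX). A worked example of the arithmetic:

// Worked example of the grid-size change in loop().
#include <cmath>
#include <cstdio>

int main() {
  const int BX = 64, t_per_atom = 4, inum = 1000;
  int atoms_per_block = BX / t_per_atom;                 // 16 atoms per block
  int GX = (int)std::ceil((double)inum / atoms_per_block);  // 63 blocks
  std::printf("GX=%d\n", GX);
}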
@@ -29,8 +29,15 @@ class CMM_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
-   * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq, int **host_cg_type,
+   * \param gpu_split fraction of particles handled by device
+   *
+   * Returns:
+   * - 0 if successful
+   * - -1 if fix gpu not found
+   * - -3 if there is an out of memory error
+   * - -4 if the GPU library was not compiled for GPU
+   * - -5 if double precision is not supported on the card **/
+  int init(const int ntypes, double **host_cutsq, int **host_cg_type,
             double **host_lj1, double **host_lj2, double **host_lj3,
             double **host_lj4, double **host_offset, double *host_special_lj,
             const int nlocal, const int nall, const int max_nbors,
@@ -28,7 +28,7 @@ static CMML_GPU_Memory<PRECISION,ACC_PRECISION> CMMLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
+int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
                    double **host_lj1, double **host_lj2, double **host_lj3,
                    double **host_lj4, double **offset, double *special_lj,
                    const int inum, const int nall, const int max_nbors,
@@ -56,15 +56,12 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
     fflush(screen);
   }

-  if (world_me==0) {
-    bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
-                             host_lj3, host_lj4, offset, special_lj, inum,
-                             nall, 300, maxspecial, cell_size, gpu_split,
-                             screen, host_cut_ljsq, host_cut_coulsq,
-                             host_special_coul, qqrd2e,g_ewald);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
+                        host_lj4, offset, special_lj, inum, nall, 300,
+                        maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
+                        host_cut_coulsq, host_special_coul, qqrd2e,g_ewald);

   CMMLMF.device->world_barrier();
   if (message)
@@ -79,48 +76,51 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
                 last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
-                               host_lj3, host_lj4, offset, special_lj, inum,
-                               nall, 300, maxspecial, cell_size, gpu_split,
-                               screen, host_cut_ljsq, host_cut_coulsq,
-                               host_special_coul, qqrd2e, g_ewald);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
+                          host_lj4, offset, special_lj, inum, nall, 300,
+                          maxspecial, cell_size, gpu_split, screen,
+                          host_cut_ljsq, host_cut_coulsq, host_special_coul,
+                          qqrd2e, g_ewald);
     CMMLMF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+
+  if (init_ok==0)
+    CMMLMF.estimate_gpu_overhead();
+  return init_ok;
 }

 void cmml_gpu_clear() {
   CMMLMF.clear();
 }

-int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum_full,
-                         const int nall, double **host_x, int *host_type,
-                         double *boxlo, double *boxhi, int *tag, int **nspecial,
-                         int **special, const bool eflag, const bool vflag,
-                         const bool eatom, const bool vatom, int &host_start,
-                         const double cpu_time, bool &success, double *host_q) {
-  return CMMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
-                        boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                        vatom, host_start, cpu_time, success, host_q);
+int** cmml_gpu_compute_n(const int ago, const int inum_full,
+                         const int nall, double **host_x, int *host_type,
+                         double *sublo, double *subhi, int *tag, int **nspecial,
+                         int **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         int **ilist, int **jnum, const double cpu_time,
+                         bool &success, double *host_q, double *boxlo,
+                         double *prd) {
+  return CMMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                        subhi, tag, nspecial, special, eflag, vflag, eatom,
+                        vatom, host_start, ilist, jnum, cpu_time, success,
+                        host_q,boxlo,prd);
 }

-void cmml_gpu_compute(const int timestep, const int ago, const int inum_full,
-                      const int nall, double **host_x, int *host_type,
-                      int *ilist, int *numj, int **firstneigh,
-                      const bool eflag, const bool vflag, const bool eatom,
-                      const bool vatom, int &host_start, const double cpu_time,
-                      bool &success, double *host_q) {
-  CMMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
-                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
-                 host_q);
+void cmml_gpu_compute(const int ago, const int inum_full, const int nall,
+                      double **host_x, int *host_type, int *ilist, int *numj,
+                      int **firstneigh, const bool eflag, const bool vflag,
+                      const bool eatom, const bool vatom, int &host_start,
+                      const double cpu_time, bool &success, double *host_q,
+                      const int nlocal, double *boxlo, double *prd) {
+  CMMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
+                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
+                 host_q,nlocal,boxlo,prd);
 }

 double cmml_gpu_bytes() {
@ -18,8 +18,6 @@
|
|||||||
#ifndef CMML_GPU_KERNEL
|
#ifndef CMML_GPU_KERNEL
|
||||||
#define CMML_GPU_KERNEL
|
#define CMML_GPU_KERNEL
|
||||||
|
|
||||||
#define MAX_SHARED_TYPES 8
|
|
||||||
|
|
||||||
#ifdef _DOUBLE_DOUBLE
|
#ifdef _DOUBLE_DOUBLE
|
||||||
#define numtyp double
|
#define numtyp double
|
||||||
#define numtyp2 double2
|
#define numtyp2 double2
|
||||||
@ -54,7 +52,7 @@
|
|||||||
|
|
||||||
#ifdef NV_KERNEL
|
#ifdef NV_KERNEL
|
||||||
|
|
||||||
#include "geryon/ucl_nv_kernel.h"
|
#include "nv_kernel_def.h"
|
||||||
texture<float4> pos_tex;
|
texture<float4> pos_tex;
|
||||||
texture<float> q_tex;
|
texture<float> q_tex;
|
||||||
|
|
||||||
@ -90,6 +88,8 @@ __inline float fetch_q(const int& i, const float *q)
|
|||||||
|
|
||||||
#define fetch_pos(i,y) x_[i]
|
#define fetch_pos(i,y) x_[i]
|
||||||
#define fetch_q(i,y) q_[i]
|
#define fetch_q(i,y) q_[i]
|
||||||
|
#define BLOCK_PAIR 64
|
||||||
|
#define MAX_SHARED_TYPES 8
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -100,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types,
                           __global numtyp *sp_lj_in, __global int *dev_nbor,
-                          __global acctyp4 *ans, __global acctyp *engv,
-                          const int eflag, const int vflag, const int inum,
-                          const int nall, const int nbor_pitch,
-                          __global numtyp *q_ , const numtyp cut_coulsq,
-                          const numtyp qqrd2e, const numtyp g_ewald) {
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag,
+                          const int vflag, const int inum, const int nall,
+                          const int nbor_pitch, __global numtyp *q_ ,
+                          const numtyp cut_coulsq, const numtyp qqrd2e,
+                          const numtyp g_ewald, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;

   __local numtyp sp_lj[8];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
@@ -117,7 +121,6 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];

-  if (ii<inum) {
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -128,18 +131,31 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  if (ii<inum) {
   __global int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=nbor_pitch;
   int numj=*nbor;
   nbor+=nbor_pitch;
-  __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+  int n_stride;
+  __global int *list_end;
+  if (dev_nbor==dev_packed) {
+    list_end=nbor+mul24(numj,nbor_pitch);
+    nbor+=mul24(offset,nbor_pitch);
+    n_stride=mul24(t_per_atom,nbor_pitch);
+  } else {
+    nbor=dev_packed+*nbor;
+    list_end=nbor+numj;
+    n_stride=t_per_atom;
+    nbor+=offset;
+  }

   numtyp4 ix=fetch_pos(i,x_); //x_[i];
   numtyp qtmp=fetch_q(i,q_);
   int itype=ix.w;

-  for ( ; nbor<list_end; nbor+=nbor_pitch) {
+  for ( ; nbor<list_end; nbor+=n_stride) {
     int j=*nbor;

     numtyp factor_lj, factor_coul;
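The branch introduced above distinguishes two neighbor-list layouts. With the dense list (dev_nbor==dev_packed), each atom's neighbors sit one per pitched row, so a lane starts offset rows in and strides by t_per_atom*nbor_pitch; with a separately packed list, the neighbors are contiguous and the stride is simply t_per_atom. A minimal CUDA sketch of the same cooperative striding, using hypothetical names (TPA, neigh, nn, start) that are not part of this patch:

  // Sketch only: TPA lanes walk one atom's packed neighbor run cooperatively.
  #define TPA 4   // threads per atom; assumed to divide the block size

  __global__ void walk_neighbors(const int *neigh, const int *nn,
                                 const int *start, float *acc, int inum) {
    int tid = threadIdx.x;
    int ii = blockIdx.x * (blockDim.x / TPA) + tid / TPA;  // atom index
    int offset = tid % TPA;                                // lane within atom
    if (ii >= inum) return;
    float sum = 0.0f;
    // Lane k visits neighbors k, k+TPA, k+2*TPA, ... (n_stride = t_per_atom).
    for (int jj = offset; jj < nn[ii]; jj += TPA)
      sum += (float)neigh[start[ii] + jj];  // stand-in for the pair term
    atomicAdd(&acc[ii], sum);  // needs sm_20+; the real kernel reduces in shared memory
  }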
@@ -213,8 +229,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       }

     } // for nbor
+  } // if ii

+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1=energy;
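The new block combines the per-lane partial forces and energies with a power-of-two tree in shared memory; afterwards only lane 0 (offset==0) of each atom's group writes results, which is why the store is now guarded by ii<inum && offset==0. Note the tree loop carries no barrier: it relies on t_per_atom never exceeding the warp/wavefront width, so the lanes of one group advance in lockstep. A standalone CUDA sketch of the same tree, under those assumptions (TPA a power of two; buffer names hypothetical):

  #define TPA 4
  #define BLOCK 64

  __global__ void lane_reduce(const float *in, float *out, int n) {
    __shared__ float red[BLOCK];
    int tid = threadIdx.x;
    int offset = tid % TPA;            // lane id within one atom's group
    int i = blockIdx.x * BLOCK + tid;
    red[tid] = (i < n) ? in[i] : 0.0f;
    // Halve the active lanes each step; lane 0 accumulates the group sum.
    // Safe without __syncthreads() only while TPA <= warp size.
    for (unsigned int s = TPA / 2; s > 0; s >>= 1)
      if (offset < s)
        red[tid] += red[tid + s];
    if (offset == 0 && i < n)
      out[i / TPA] = red[tid];
  }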
@@ -234,27 +291,28 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,

 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in,
-                               __global numtyp* sp_lj_in, __global int *dev_nbor,
+                               __global numtyp* sp_lj_in,
+                               __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv,
                                const int eflag, const int vflag, const int inum,
                                const int nall, const int nbor_pitch,
                                __global numtyp *q_ , const numtyp cut_coulsq,
-                               const numtyp qqrd2e, const numtyp g_ewald) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                               const numtyp qqrd2e, const numtyp g_ewald,
+                               const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;

   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[8];
-  if (ii<8)
-    sp_lj[ii]=sp_lj_in[ii];
-  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    lj1[ii]=lj1_in[ii];
-    lj3[ii]=lj3_in[ii];
+  if (tid<8)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
+    lj3[tid]=lj3_in[tid];
   }
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
-  __syncthreads();

-  if (ii<inum) {

   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
@@ -266,19 +324,34 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  __syncthreads();
+
+  if (ii<inum) {
   __global int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=nbor_pitch;
   int numj=*nbor;
   nbor+=nbor_pitch;
-  __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+  int n_stride;
+  __global int *list_end;
+  if (dev_nbor==dev_packed) {
+    list_end=nbor+mul24(numj,nbor_pitch);
+    nbor+=mul24(offset,nbor_pitch);
+    n_stride=mul24(t_per_atom,nbor_pitch);
+  } else {
+    nbor=dev_packed+*nbor;
+    list_end=nbor+numj;
+    n_stride=t_per_atom;
+    nbor+=offset;
+  }

   numtyp4 ix=fetch_pos(i,x_); //x_[i];
   numtyp qtmp=fetch_q(i,q_);
   int iw=ix.w;
   int itype=mul24((int)MAX_SHARED_TYPES,iw);

-  for ( ; nbor<list_end; nbor+=nbor_pitch) {
+  for ( ; nbor<list_end; nbor+=n_stride) {
     int j=*nbor;

     numtyp factor_lj, factor_coul;
@@ -351,8 +424,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       }

     } // for nbor
+  } // if ii

+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1=energy;
@@ -43,7 +43,7 @@ int CMML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }

 template <class numtyp, class acctyp>
-bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
+int CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
                             int **host_cg_type, double **host_lj1,
                             double **host_lj2, double **host_lj3,
                             double **host_lj4, double **host_offset,
@@ -55,14 +55,18 @@ bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
                             const double host_cut_coulsq,
                             double *host_special_coul, const double qqrd2e,
                             const double g_ewald) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                     _screen,cmmc_long_gpu_kernel);
+  if (success!=0)
+    return success;

   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
-    lj_types=MAX_SHARED_TYPES;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
     shared_types=true;
   }
   _lj_types=lj_types;
@@ -95,7 +99,7 @@ bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,

   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }

 template <class numtyp, class acctyp>
@@ -133,9 +137,10 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;

-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));

-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
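Since each atom now occupies t_per_atom threads, a block of BX threads only covers BX/t_per_atom atoms, so the grid must grow by the same factor; the ans object (rather than atom) also becomes the owner of the per-step inum. A one-function sketch of the sizing rule (hypothetical helper, not from the patch):

  #include <cmath>

  // Blocks needed so that every atom gets its own group of t_per_atom lanes.
  static int grid_size(int inum, int block_size, int t_per_atom) {
    int atoms_per_block = block_size / t_per_atom;  // e.g. 64/4 = 16
    return (int)std::ceil((double)inum / atoms_per_block);
  }
  // grid_size(1000, 64, 1) == 16 blocks, but grid_size(1000, 64, 4) == 63.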
@@ -144,19 +149,21 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch,
                           &this->atom->dev_q.begin(), &_cut_coulsq,
-                          &_qqrd2e, &_g_ewald);
+                          &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
-                     &_cut_coulsq, &_qqrd2e, &_g_ewald);
+                     &_cut_coulsq, &_qqrd2e, &_g_ewald,
+                     &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
@@ -29,8 +29,15 @@ class CMML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq, int ** cg_type,
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, int ** cg_type,
             double **host_lj1, double **host_lj2, double **host_lj3,
             double **host_lj4, double **host_offset, double *host_special_lj,
             const int nlocal, const int nall, const int max_nbors,
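With init() returning an int instead of a bool, the caller can distinguish the failure modes listed in the comment block. A hedged sketch of how a host-side caller might map those codes to messages (the wording is illustrative; only the codes come from the documentation above):

  #include <cstdio>

  static void report_init(int code) {
    switch (code) {
      case  0: break;  // success
      case -1: fprintf(stderr, "fix gpu not found\n"); break;
      case -3: fprintf(stderr, "out of memory on the device\n"); break;
      case -4: fprintf(stderr, "GPU library not compiled for this GPU\n"); break;
      case -5: fprintf(stderr, "double precision not supported on card\n"); break;
      default: fprintf(stderr, "unknown init error %d\n", code); break;
    }
  }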
@@ -28,7 +28,7 @@ static CRML_GPU_Memory<PRECISION,ACC_PRECISION> CRMLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
+int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
                   double **host_lj2, double **host_lj3, double **host_lj4,
                   double **offset, double *special_lj, const int inum,
                   const int nall, const int max_nbors, const int maxspecial,
@@ -58,16 +58,13 @@ bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
     fflush(screen);
   }

-  if (world_me==0) {
-    bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
-                             host_lj4, offset, special_lj, inum, nall, 300,
-                             maxspecial, cell_size, gpu_split, screen,
-                             host_cut_ljsq, host_cut_coulsq, host_special_coul,
-                             qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
-                             epsilon,sigma,mix_arithmetic);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                        offset, special_lj, inum, nall, 300, maxspecial, cell_size,
+                        gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
+                        host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
+                        epsilon,sigma,mix_arithmetic);

   CRMLMF.device->world_barrier();
   if (message)
@@ -82,50 +79,54 @@ bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
               last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
-                               host_lj4, offset, special_lj, inum, nall, 300,
-                               maxspecial, cell_size, gpu_split,
-                               screen, host_cut_ljsq, host_cut_coulsq,
-                               host_special_coul, qqrd2e, g_ewald,
-                               cut_lj_innersq, denom_lj, epsilon, sigma,
-                               mix_arithmetic);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
+                          host_lj4, offset, special_lj, inum, nall, 300,
+                          maxspecial, cell_size, gpu_split, screen,
+                          host_cut_ljsq, host_cut_coulsq, host_special_coul,
+                          qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,
+                          sigma, mix_arithmetic);
     CRMLMF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+  if (init_ok==0)
+    CRMLMF.estimate_gpu_overhead();
+  return init_ok;
 }

 void crml_gpu_clear() {
   CRMLMF.clear();
 }

-int * crml_gpu_compute_n(const int timestep, const int ago, const int inum_full,
-                         const int nall, double **host_x, int *host_type,
-                         double *boxlo, double *boxhi, int *tag, int **nspecial,
-                         int **special, const bool eflag, const bool vflag,
-                         const bool eatom, const bool vatom, int &host_start,
-                         const double cpu_time, bool &success, double *host_q) {
-  return CRMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
-                        boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                        vatom, host_start, cpu_time, success, host_q);
+int** crml_gpu_compute_n(const int ago, const int inum_full,
+                         const int nall, double **host_x, int *host_type,
+                         double *sublo, double *subhi, int *tag, int **nspecial,
+                         int **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         int **ilist, int **jnum, const double cpu_time,
+                         bool &success, double *host_q, double *boxlo,
+                         double *prd) {
+  return CRMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                        subhi, tag, nspecial, special, eflag, vflag, eatom,
+                        vatom, host_start, ilist, jnum, cpu_time, success,
+                        host_q, boxlo, prd);
 }

-void crml_gpu_compute(const int timestep, const int ago, const int inum_full,
-                      const int nall, double **host_x, int *host_type,
-                      int *ilist, int *numj, int **firstneigh,
-                      const bool eflag, const bool vflag, const bool eatom,
-                      const bool vatom, int &host_start, const double cpu_time,
-                      bool &success, double *host_q) {
-  CRMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
-                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
-                 host_q);
+void crml_gpu_compute(const int ago, const int inum_full,
+                      const int nall, double **host_x, int *host_type,
+                      int *ilist, int *numj, int **firstneigh,
+                      const bool eflag, const bool vflag, const bool eatom,
+                      const bool vatom, int &host_start, const double cpu_time,
+                      bool &success, double *host_q, const int nlocal,
+                      double *boxlo, double *prd) {
+  CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
+                 eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q,
+                 nlocal,boxlo,prd);
 }

 double crml_gpu_bytes() {
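The restructured crml_gpu_init keeps the staged start-up: world rank 0 initializes first (so kernels are compiled and cached once), a world barrier follows, then one rank per GPU repeats the initialization, and only at the end is the status returned, with estimate_gpu_overhead() run on success. A simplified MPI sketch of that pattern (do_device_init is a hypothetical stand-in for CRMLMF.init, and the loop over GPU ranks is collapsed to one rank per device):

  #include <mpi.h>

  static int do_device_init() { return 0; }  // stand-in for CRMLMF.init(...)

  static int staged_init(int world_me, int gpu_rank,
                         MPI_Comm world, MPI_Comm per_gpu) {
    int init_ok = 0;
    if (world_me == 0)
      init_ok = do_device_init();  // world rank 0 goes first
    MPI_Barrier(world);            // plays the role of device->world_barrier()
    if (world_me != 0 && gpu_rank == 0)
      init_ok = do_device_init();  // then one rank per GPU
    MPI_Barrier(per_gpu);          // plays the role of device->gpu_barrier()
    return init_ok;                // 0 on success, a negative code otherwise
  }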
@@ -54,7 +54,7 @@

 #ifdef NV_KERNEL

-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"
 texture<float4> pos_tex;
 texture<float> q_tex;

@@ -90,6 +90,7 @@ __inline float fetch_q(const int& i, const float *q)

 #define fetch_pos(i,y) x_[i]
 #define fetch_q(i,y) q_[i]
+#define BLOCK_BIO_PAIR 64

 #endif

@@ -98,18 +99,22 @@ __inline float fetch_q(const int& i, const float *q)
 __inline int sbmask(int j) { return j >> SBBITS & 3; }

 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          const int lj_types,
-                          __global numtyp *sp_lj_in, __global int *dev_nbor,
+                          const int lj_types, __global numtyp *sp_lj_in,
+                          __global int *dev_nbor, __global int *dev_packed,
                           __global acctyp4 *ans, __global acctyp *engv,
                           const int eflag, const int vflag, const int inum,
                           const int nall, const int nbor_pitch,
                           __global numtyp *q_, const numtyp cut_coulsq,
                           const numtyp qqrd2e, const numtyp g_ewald,
                           const numtyp denom_lj, const numtyp cut_bothsq,
-                          const numtyp cut_ljsq, const numtyp cut_lj_innersq) {
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          const numtyp cut_ljsq, const numtyp cut_lj_innersq,
+                          const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;

   __local numtyp sp_lj[8];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
@@ -120,7 +125,6 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];

-  if (ii<inum) {
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -131,18 +135,31 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  if (ii<inum) {
   __global int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=nbor_pitch;
   int numj=*nbor;
   nbor+=nbor_pitch;
-  __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+  int n_stride;
+  __global int *list_end;
+  if (dev_nbor==dev_packed) {
+    list_end=nbor+mul24(numj,nbor_pitch);
+    nbor+=mul24(offset,nbor_pitch);
+    n_stride=mul24(t_per_atom,nbor_pitch);
+  } else {
+    nbor=dev_packed+*nbor;
+    list_end=nbor+numj;
+    n_stride=t_per_atom;
+    nbor+=offset;
+  }

   numtyp4 ix=fetch_pos(i,x_); //x_[i];
   numtyp qtmp=fetch_q(i,q_);
   int itype=ix.w;

-  for ( ; nbor<list_end; nbor+=nbor_pitch) {
+  for ( ; nbor<list_end; nbor+=n_stride) {
     int j=*nbor;

     numtyp factor_lj, factor_coul;
@@ -219,8 +236,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       }

     } // for nbor
+  } // if ii

+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1=energy;
@@ -240,27 +298,27 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,

 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
                                __global numtyp* sp_lj_in, __global int *dev_nbor,
-                               __global acctyp4 *ans, __global acctyp *engv,
-                               const int eflag, const int vflag, const int inum,
-                               const int nall, const int nbor_pitch,
-                               __global numtyp *q_, const numtyp cut_coulsq,
-                               const numtyp qqrd2e, const numtyp g_ewald,
-                               const numtyp denom_lj, const numtyp cut_bothsq,
-                               const numtyp cut_ljsq,
-                               const numtyp cut_lj_innersq) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                               __global int *dev_packed, __global acctyp4 *ans,
+                               __global acctyp *engv, const int eflag,
+                               const int vflag, const int inum, const int nall,
+                               const int nbor_pitch, __global numtyp *q_,
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp g_ewald, const numtyp denom_lj,
+                               const numtyp cut_bothsq, const numtyp cut_ljsq,
+                               const numtyp cut_lj_innersq,
+                               const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;

   __local numtyp2 ljd[MAX_BIO_SHARED_TYPES];
   __local numtyp sp_lj[8];
-  if (ii<8)
-    sp_lj[ii]=sp_lj_in[ii];
-  ljd[ii]=ljd_in[ii];
-  ljd[ii+64]=ljd_in[ii+64];
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
-  __syncthreads();
+  if (tid<8)
+    sp_lj[tid]=sp_lj_in[tid];
+  ljd[tid]=ljd_in[tid];
+  if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
+    ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];

-  if (ii<inum) {

   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
@@ -272,18 +330,33 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  __syncthreads();
+
+  if (ii<inum) {
   __global int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=nbor_pitch;
   int numj=*nbor;
   nbor+=nbor_pitch;
-  __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+  int n_stride;
+  __global int *list_end;
+  if (dev_nbor==dev_packed) {
+    list_end=nbor+mul24(numj,nbor_pitch);
+    nbor+=mul24(offset,nbor_pitch);
+    n_stride=mul24(t_per_atom,nbor_pitch);
+  } else {
+    nbor=dev_packed+*nbor;
+    list_end=nbor+numj;
+    n_stride=t_per_atom;
+    nbor+=offset;
+  }

   numtyp4 ix=fetch_pos(i,x_); //x_[i];
   numtyp qtmp=fetch_q(i,q_);
   int itype=ix.w;

-  for ( ; nbor<list_end; nbor+=nbor_pitch) {
+  for ( ; nbor<list_end; nbor+=n_stride) {
     int j=*nbor;

     numtyp factor_lj, factor_coul;
@@ -366,8 +439,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
       }

     } // for nbor
+  } // if ii

+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_BIO_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1=energy;
@@ -43,7 +43,7 @@ int CRML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }

 template <class numtyp, class acctyp>
-bool CRML_GPU_MemoryT::init(const int ntypes,
+int CRML_GPU_MemoryT::init(const int ntypes,
                             double host_cut_bothsq, double **host_lj1,
                             double **host_lj2, double **host_lj3,
                             double **host_lj4, double **host_offset,
@@ -56,20 +56,24 @@ bool CRML_GPU_MemoryT::init(const int ntypes,
                            const double g_ewald, const double cut_lj_innersq,
                            const double denom_lj, double **epsilon,
                            double **sigma, const bool mix_arithmetic) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
                     _screen,crml_gpu_kernel);
+  if (success!=0)
+    return success;

   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (this->_block_size>=64 && mix_arithmetic)
+  if (this->_block_bio_size>=64 && mix_arithmetic)
     shared_types=true;
   _lj_types=lj_types;

   // Allocate a host write buffer for data initialization
   int h_size=lj_types*lj_types;
-  if (h_size<MAX_BIO_SHARED_TYPES)
-    h_size=MAX_BIO_SHARED_TYPES;
+  int max_bio_shared_types=this->device->max_bio_shared_types();
+  if (h_size<max_bio_shared_types)
+    h_size=max_bio_shared_types;
   UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
                                UCL_WRITE_OPTIMIZED);
   for (int i=0; i<h_size*32; i++)
@@ -79,7 +83,7 @@ bool CRML_GPU_MemoryT::init(const int ntypes,
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
                          host_lj3,host_lj4);

-  ljd.alloc(MAX_BIO_SHARED_TYPES,*(this->ucl_device),UCL_READ_ONLY);
+  ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);

   sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
@@ -99,7 +103,7 @@ bool CRML_GPU_MemoryT::init(const int ntypes,

   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }

 template <class numtyp, class acctyp>
@@ -125,7 +129,7 @@ double CRML_GPU_MemoryT::host_memory_usage() const {
 template <class numtyp, class acctyp>
 void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   // Compute the block size and grid size to keep all cores busy
-  const int BX=this->block_size();
+  const int BX=this->_block_bio_size;
   int eflag, vflag;
   if (_eflag)
     eflag=1;
@@ -137,9 +141,10 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;

-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));

-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -147,21 +152,24 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
                           &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch,
                           &this->atom->dev_q.begin(), &_cut_coulsq,
                           &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq,
-                          &_cut_ljsq, &_cut_lj_innersq);
+                          &_cut_ljsq, &_cut_lj_innersq,
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
                      &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
-                     &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq);
+                     &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
+                     &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
@@ -29,8 +29,15 @@ class CRML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double host_cut_bothsq,
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double host_cut_bothsq,
             double **host_lj1, double **host_lj2, double **host_lj3,
             double **host_lj4, double **host_offset, double *host_special_lj,
             const int nlocal, const int nall, const int max_nbors,
@@ -49,7 +49,7 @@ void gb_gpu_pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool gb_gpu_init(const int ntypes, const double gamma,
+int gb_gpu_init(const int ntypes, const double gamma,
                 const double upsilon, const double mu, double **shape,
                 double **well, double **cutsq, double **sigma,
                 double **epsilon, double *host_lshape, int **form,
@@ -77,14 +77,12 @@ bool gb_gpu_init(const int ntypes, const double gamma,
     fflush(screen);
   }

-  if (world_me==0) {
-    bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
-                           sigma, epsilon, host_lshape, form, host_lj1,
-                           host_lj2, host_lj3, host_lj4, offset, special_lj,
-                           inum, nall, max_nbors, cell_size, gpu_split, screen);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
+                      sigma, epsilon, host_lshape, form, host_lj1,
+                      host_lj2, host_lj3, host_lj4, offset, special_lj,
+                      inum, nall, max_nbors, cell_size, gpu_split, screen);

   GBMF.device->world_barrier();
   if (message)
@@ -99,22 +97,22 @@ bool gb_gpu_init(const int ntypes, const double gamma,
               last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
-                             sigma, epsilon, host_lshape, form, host_lj1,
-                             host_lj2, host_lj3, host_lj4, offset, special_lj,
-                             inum, nall, max_nbors, cell_size, gpu_split,
-                             screen);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma,
+                        epsilon, host_lshape, form, host_lj1, host_lj2,
+                        host_lj3, host_lj4, offset, special_lj, inum, nall,
+                        max_nbors, cell_size, gpu_split, screen);
     GBMF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+  if (init_ok==0)
+    GBMF.estimate_gpu_overhead();
+  return init_ok;
 }

 // ---------------------------------------------------------------------------
@@ -131,8 +129,8 @@ template <class gbmtyp>
 inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum,
                                     const int host_inum, const int nall,
                                     double **host_x, double **host_quat,
-                                    int *host_type, double *boxlo,
-                                    double *boxhi, bool &success) {
+                                    int *host_type, double *sublo,
+                                    double *subhi, bool &success) {
   gbm.nbor_time_avail=true;

   success=true;
@@ -144,7 +142,7 @@ inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum,
   gbm.atom->cast_copy_x(host_x,host_type);
   int mn;
   gbm.nbor->build_nbor_list(inum, host_inum, nall, *gbm.atom,
-                            boxlo, boxhi, NULL, NULL, NULL, success, mn);
+                            sublo, subhi, NULL, NULL, NULL, success, mn);
   gbm.nbor->copy_unpacked(inum,mn);
   gbm.last_ellipse=inum;
   gbm.max_last_ellipse=inum;
@@ -163,7 +161,7 @@ void _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall,

   gbm.nbor_time_avail=true;

-  int mn=gbm.nbor->max_nbor_loop(inum,numj);
+  int mn=gbm.nbor->max_nbor_loop(inum,numj,ilist);
   gbm.resize_atom(inum,nall,success);
   gbm.resize_local(inum,0,mn,osize,success);
   if (!success)
@@ -216,9 +214,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
   else
     vflag=0;

-  int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum())/
+                               (BX/gbm._threads_per_atom)));
   int stride=gbm.nbor->nbor_pitch();
-  int ainum=gbm.atom->inum();
+  int ainum=gbm.ans->inum();
   int anall=gbm.atom->nall();

   if (gbm.multiple_forms) {
@@ -226,7 +225,7 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
     if (gbm.last_ellipse>0) {
       // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
       GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
-                               static_cast<double>(BX)));
+                               (BX/gbm._threads_per_atom)));
       gb_gpu_pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,ELLIPSE_SPHERE,
                         ELLIPSE_ELLIPSE);
       gbm.time_kernel.stop();
@@ -237,11 +236,12 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
         &gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(),
         &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
         &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
-        &stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(),
-        &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall);
+        &stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(),
+        &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall,
+        &gbm._threads_per_atom);
       gbm.time_gayberne.stop();

-      if (gbm.last_ellipse==gbm.atom->inum()) {
+      if (gbm.last_ellipse==gbm.ans->inum()) {
         gbm.time_kernel2.start();
         gbm.time_kernel2.stop();
         gbm.time_gayberne2.start();
@@ -254,9 +254,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
      // ------------ SPHERE_ELLIPSE ---------------

      gbm.time_kernel2.start();
-     GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum()-
-                                                  gbm.last_ellipse)/BX));
-     gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(),
+     GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum()-
+                                                  gbm.last_ellipse)/
+                              (BX/gbm._threads_per_atom)));
+     gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(),
                        SPHERE_ELLIPSE,SPHERE_ELLIPSE);
      gbm.time_kernel2.stop();
@@ -266,13 +267,14 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
        &gbm.shape.begin(), &gbm.well.begin(),
        &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
        &gbm._lj_types, &gbm.lshape.begin(),
-       &gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(),
-       &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
-       &vflag, &gbm.last_ellipse, &ainum, &anall);
+       &gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(),
+       &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
+       &vflag, &gbm.last_ellipse, &ainum, &anall,
+       &gbm._threads_per_atom);
      gbm.time_gayberne2.stop();
    } else {
-     gbm.atom->dev_ans.zero();
-     gbm.atom->dev_engv.zero();
+     gbm.ans->dev_ans.zero();
+     gbm.ans->dev_engv.zero();
      gbm.time_kernel.stop();
      gbm.time_gayberne.start();
      gbm.time_gayberne.stop();
@@ -284,29 +286,31 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {

     // ------------ LJ ---------------
     gbm.time_pair.start();
-    if (gbm.last_ellipse<gbm.atom->inum()) {
+    if (gbm.last_ellipse<gbm.ans->inum()) {
       if (gbm.shared_types) {
         GBMF.k_lj_fast.set_size(GX,BX);
         GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
                            &gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(),
                            &stride, &gbm.nbor->dev_packed.begin(),
-                           &gbm.atom->dev_ans.begin(),
-                           &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
-                           &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
+                           &gbm.ans->dev_ans.begin(),
+                           &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
+                           &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall,
+                           &gbm._threads_per_atom);
       } else {
         GBMF.k_lj.set_size(GX,BX);
         GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
                       &gbm.lj3.begin(), &gbm._lj_types,
                       &gbm.gamma_upsilon_mu.begin(), &stride,
-                      &gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(),
-                      &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
-                      &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
+                      &gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(),
+                      &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
+                      &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall,
+                      &gbm._threads_per_atom);
       }
     }
     gbm.time_pair.stop();
   } else {
     gbm.time_kernel.start();
-    gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE,
+    gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.ans->inum(),SPHERE_SPHERE,
                       ELLIPSE_ELLIPSE);
     gbm.time_kernel.stop();
     gbm.time_gayberne.start();
@@ -315,9 +319,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
       &gbm.shape.begin(), &gbm.well.begin(),
       &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
       &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
-      &stride, &gbm.atom->dev_ans.begin(), &ainum,
-      &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
-      &eflag, &vflag, &ainum, &anall);
+      &stride, &gbm.ans->dev_ans.begin(), &ainum,
+      &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
+      &eflag, &vflag, &ainum, &anall, &gbm._threads_per_atom);
     gbm.time_gayberne.stop();
   }
 }
@@ -326,30 +330,31 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
 // Reneighbor on GPU if necessary and then compute forces, torques, energies
 // ---------------------------------------------------------------------------
 template <class gbmtyp>
-inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago,
+inline int** _gb_gpu_compute_n(gbmtyp &gbm, const int ago,
                                const int inum_full, const int nall,
                                double **host_x, int *host_type,
-                               double *boxlo, double *boxhi, const bool eflag,
+                               double *sublo, double *subhi, const bool eflag,
                                const bool vflag, const bool eatom,
                                const bool vatom, int &host_start,
-                               const double cpu_time, bool &success,
-                               double **host_quat) {
+                               int **ilist, int **jnum, const double cpu_time,
+                               bool &success, double **host_quat) {
   gbm.acc_timers();
   if (inum_full==0) {
+    host_start=0;
     gbm.zero_timers();
     return NULL;
   }

-  gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor());
-  int inum=gbm.hd_balancer.get_gpu_count(timestep,ago,inum_full);
-  gbm.atom->inum(inum);
+  gbm.hd_balancer.balance(cpu_time);
+  int inum=gbm.hd_balancer.get_gpu_count(ago,inum_full);
+  gbm.ans->inum(inum);
   gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
   host_start=inum;

   // Build neighbor list on GPU if necessary
   if (ago==0) {
     _gb_gpu_build_nbor_list(gbm, inum, inum_full-inum, nall, host_x,
-                            host_quat, host_type, boxlo, boxhi, success);
+                            host_quat, host_type, sublo, subhi, success);
     if (!success)
       return NULL;
     gbm.atom->cast_quat_data(host_quat[0]);
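Here the balancer decides how many of the inum_full particles the device takes this step; host_start marks where the host's share begins, and the early return for an empty list now also zeroes host_start. For the fixed-split case the arithmetic reduces to the gpu_split fraction documented with init(); a sketch with a hypothetical helper name:

  // Device takes the first inum particles, the host handles the rest.
  static int gpu_count(int inum_full, double gpu_split, int &host_start) {
    int inum = (int)(gpu_split * inum_full);
    host_start = inum;
    return inum;
  }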
@ -361,47 +366,49 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago,
|
|||||||
gbm.atom->add_x_data(host_x,host_type);
|
gbm.atom->add_x_data(host_x,host_type);
|
||||||
}
|
}
|
||||||
|
|
||||||
gbm.atom->add_other_data();
|
gbm.atom->add_quat_data();
|
||||||
|
*ilist=gbm.nbor->host_ilist.begin();
|
||||||
|
*jnum=gbm.nbor->host_acc.begin();
|
||||||
|
|
||||||
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
|
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
|
||||||
gbm.atom->copy_answers(eflag,vflag,eatom,vatom);
|
gbm.ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||||
|
gbm.device->add_ans_object(gbm.ans);
|
||||||
gbm.hd_balancer.stop_timer();
|
gbm.hd_balancer.stop_timer();
|
||||||
return gbm.device->nbor.host_nbor.begin();
|
return gbm.nbor->host_jlist.begin()-host_start;
|
||||||
}
|
}
|
||||||
|
|
||||||
int * gb_gpu_compute_n(const int timestep, const int ago, const int inum_full,
|
int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
|
||||||
const int nall, double **host_x, int *host_type,
|
double **host_x, int *host_type, double *sublo,
|
||||||
double *boxlo, double *boxhi, const bool eflag,
|
double *subhi, const bool eflag, const bool vflag,
|
||||||
const bool vflag, const bool eatom, const bool vatom,
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
int &host_start, const double cpu_time, bool &success,
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
double **host_quat) {
|
bool &success, double **host_quat) {
|
||||||
return _gb_gpu_compute_n(GBMF, timestep, ago, inum_full, nall, host_x,
|
return _gb_gpu_compute_n(GBMF, ago, inum_full, nall, host_x, host_type, sublo,
|
||||||
host_type, boxlo, boxhi, eflag, vflag, eatom, vatom,
|
subhi, eflag, vflag, eatom, vatom, host_start, ilist,
|
||||||
host_start, cpu_time, success, host_quat);
|
jnum, cpu_time, success, host_quat);
|
||||||
}
|
}
|
||||||
|
|
||||||
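The interface change above drops the timestep argument and widens the return type: the library now hands host copies of the device-built neighbor list back through the new ilist/jnum output parameters, and the returned int** points at the per-atom neighbor pages, offset by host_start. A minimal sketch of a host caller against the new signature follows; everything except the gb_gpu_compute_n() prototype itself is an assumed name for illustration.

    // Sketch only: driving the revised gb_gpu_compute_n() interface.
    int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
                           double **host_x, int *host_type, double *sublo,
                           double *subhi, const bool eflag, const bool vflag,
                           const bool eatom, const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
                           bool &success, double **host_quat);

    void example_call(double **x, int *type, double *sublo, double *subhi,
                      double **quat, int inum_full, int nall, double cpu_time) {
      int *ilist, *jnum;     // host copies of the device-built neighbor list
      int host_start;        // first index of the atoms left to the CPU
      bool success;
      int **firstneigh = gb_gpu_compute_n(0, inum_full, nall, x, type,
                                          sublo, subhi, true, true, false,
                                          false, host_start, &ilist, &jnum,
                                          cpu_time, success, quat);
      if (!success) return;  // e.g. ran out of device memory
      // Atoms [host_start, inum_full) are then computed on the host using
      // ilist/jnum/firstneigh exactly like a CPU-built neighbor list.
      (void)firstneigh;
    }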
 // ---------------------------------------------------------------------------
 // Copy nbor list from host if necessary and then calculate forces, torques,..
 // ---------------------------------------------------------------------------
 template <class gbmtyp>
-inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago,
-                             const int inum_full,const int nall,double **host_x,
-                             int *host_type, int *ilist, int *numj,
-                             int **firstneigh, const bool eflag,
-                             const bool vflag, const bool eatom,
-                             const bool vatom, int &host_start,
-                             const double cpu_time, bool &success,
-                             double **host_quat) {
+inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full,
+                             const int nall,double **host_x, int *host_type,
+                             int *ilist, int *numj, int **firstneigh,
+                             const bool eflag, const bool vflag,
+                             const bool eatom, const bool vatom,
+                             int &host_start, const double cpu_time,
+                             bool &success, double **host_quat) {
   gbm.acc_timers();
   if (inum_full==0) {
+    host_start=0;
    gbm.zero_timers();
     return NULL;
   }

   int ago=gbm.hd_balancer.ago_first(f_ago);
-  int inum=gbm.hd_balancer.balance(timestep,ago,inum_full,cpu_time,
-                                   gbm.nbor->gpu_nbor());
-  gbm.atom->inum(inum);
+  int inum=gbm.hd_balancer.balance(ago,inum_full,cpu_time);
+  gbm.ans->inum(inum);
   gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
   host_start=inum;

@@ -421,21 +428,21 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago,
   gbm.atom->cast_quat_data(host_quat[0]);
   gbm.hd_balancer.start_timer();
   gbm.atom->add_x_data(host_x,host_type);
-  gbm.atom->add_other_data();
+  gbm.atom->add_quat_data();

   _gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
-  gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list);
+  gbm.ans->copy_answers(eflag,vflag,eatom,vatom,list);
+  gbm.device->add_ans_object(gbm.ans);
   gbm.hd_balancer.stop_timer();
   return list;
 }

-int * gb_gpu_compute(const int timestep, const int ago, const int inum_full,
-                     const int nall, double **host_x, int *host_type,
-                     int *ilist, int *numj, int **firstneigh,
-                     const bool eflag, const bool vflag, const bool eatom,
-                     const bool vatom, int &host_start, const double cpu_time,
-                     bool &success, double **host_quat) {
-  return _gb_gpu_compute(GBMF, timestep, ago, inum_full, nall, host_x,
+int * gb_gpu_compute(const int ago, const int inum_full, const int nall,
+                     double **host_x, int *host_type, int *ilist, int *numj,
+                     int **firstneigh, const bool eflag, const bool vflag,
+                     const bool eatom, const bool vatom, int &host_start,
+                     const double cpu_time, bool &success, double **host_quat) {
+  return _gb_gpu_compute(GBMF, ago, inum_full, nall, host_x,
                          host_type, ilist, numj, firstneigh, eflag, vflag,
                          eatom, vatom, host_start, cpu_time, success,
                          host_quat);
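Both entry points now defer the split decision entirely to hd_balancer, which no longer needs the timestep or the gpu_nbor flag at call time. The core idea behind such a dynamic balancer can be sketched as below; the rate formula and the damping constant are assumptions for illustration, not the actual PairGPUBalance policy.

    // Illustrative only: one way a host-device balancer can update the
    // fraction of atoms it offloads from the times both sides just spent.
    inline int balanced_gpu_count(double cpu_time, double gpu_time,
                                  double &split, int inum_full) {
      if (cpu_time > 0.0 && gpu_time > 0.0) {
        const double gpu_rate = split * inum_full / gpu_time;        // atoms/s
        const double cpu_rate = (1.0 - split) * inum_full / cpu_time;
        const double target = gpu_rate / (gpu_rate + cpu_rate);
        split = 0.75 * split + 0.25 * target;   // damp oscillations
      }
      return static_cast<int>(split * inum_full);  // atoms sent to the GPU
    }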
@@ -18,7 +18,6 @@
 #ifndef GB_GPU_EXTRA_H
 #define GB_GPU_EXTRA_H

-#define MAX_SHARED_TYPES 8
 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};

 #ifdef _DOUBLE_DOUBLE
@@ -47,7 +46,7 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};

 #ifdef NV_KERNEL

-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"

 #else

@@ -58,6 +57,8 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
 #define BLOCK_SIZE_X get_local_size(0)
 #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
 #define __inline inline
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8

 #endif

@@ -97,17 +97,17 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
                               __global acctyp4 *ans, const int astride,
                               __global acctyp *engv, __global int *err_flag,
                               const int eflag, const int vflag, const int inum,
-                              const int nall) {
+                              const int nall, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;

   __local numtyp sp_lj[4];
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
-  if (ii<4)
-    sp_lj[ii]=gum[ii+3];
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
-  __syncthreads();
-
-  if (ii<inum) {
+  sp_lj[0]=gum[3];
+  sp_lj[1]=gum[4];
+  sp_lj[2]=gum[5];
+  sp_lj[3]=gum[6];

   acctyp energy=(acctyp)0;
   acctyp4 f;
@@ -122,12 +122,15 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  if (ii<inum) {
   __global int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=stride;
   int numj=*nbor;
   nbor+=stride;
   __global int *nbor_end=nbor+mul24(stride,numj);
+  nbor+=mul24(offset,stride);
+  int n_stride=mul24(t_per_atom,stride);

   numtyp4 ix=x_[i];
   int itype=ix.w;
@@ -143,8 +146,7 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
   }

   numtyp factor_lj;
-  for ( ; nbor<nbor_end; nbor+=stride) {
+  for ( ; nbor<nbor_end; nbor+=n_stride) {

     int j=*nbor;
     factor_lj = sp_lj[sbmask(j)];
     j &= NEIGHMASK;
@@ -362,8 +364,53 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
     tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];

   } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[7][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=tor.x;
+    red_acc[4][tid]=tor.y;
+    red_acc[5][tid]=tor.z;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<6; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    tor.x=red_acc[3][tid];
+    tor.y=red_acc[4][tid];
+    tor.z=red_acc[5][tid];
+
+    if (eflag>0 || vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+      red_acc[6][tid]=energy;
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<7; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+      energy=red_acc[6][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1=energy;
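All four kernels in this commit gain a t_per_atom parameter: a block of BLOCK_SIZE_X threads now covers BLOCK_SIZE_X/t_per_atom atoms, each atom's neighbor loop strides by n_stride across its t_per_atom threads, and the partial results are combined with the shared-memory tree reduction shown above. A distilled CUDA sketch of just that reduction, with the force loop replaced by an assumed precomputed partial array, is:

    #define BLOCK_PAIR 64   // mirrors the constant added in this commit

    // Each group of t_per_atom consecutive threads owns one atom ii; the
    // tree reduction folds the group's partial forces into its first slot.
    __global__ void reduce_demo(const float *partial, float *force,
                                int inum, int t_per_atom) {
      __shared__ float red_acc[BLOCK_PAIR];   // assumes blockDim.x <= BLOCK_PAIR
      int tid = threadIdx.x;
      int ii = blockIdx.x * (blockDim.x / t_per_atom) + tid / t_per_atom;
      int offset = tid % t_per_atom;          // this thread's slot for atom ii

      red_acc[tid] = (ii < inum) ? partial[ii * t_per_atom + offset] : 0.0f;

      // No barrier inside the loop, exactly like the commit's kernels: this
      // relies on t_per_atom being a power of two no larger than the warp
      // size, so the cooperating threads execute in lockstep.
      for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1)
        if (offset < s) red_acc[tid] += red_acc[tid + s];

      if (ii < inum && offset == 0) force[ii] = red_acc[tid];
    }

As in the commit's kernels, the store at the end is guarded by offset==0 so only one thread per atom writes the accumulated answer.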
@@ -34,17 +34,17 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
                                __global acctyp4 *ans, __global acctyp *engv,
                                __global int *err_flag, const int eflag,
                                const int vflag,const int start, const int inum,
-                               const int nall) {
+                               const int nall, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom+start;
+  int offset=tid%t_per_atom;

   __local numtyp sp_lj[4];
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
-  if (ii<4)
-    sp_lj[ii]=gum[ii+3];
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
-  __syncthreads();
-
-  if (ii<inum) {
+  sp_lj[0]=gum[3];
+  sp_lj[1]=gum[4];
+  sp_lj[2]=gum[5];
+  sp_lj[3]=gum[6];

   acctyp energy=(acctyp)0;
   acctyp4 f;
@@ -55,12 +55,15 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  if (ii<inum) {
   __global int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=stride;
   int numj=*nbor;
   nbor+=stride;
   __global int *nbor_end=nbor+stride*numj;
+  nbor+=mul24(offset,stride);
+  int n_stride=mul24(t_per_atom,stride);

   numtyp4 ix=x_[i];
   int itype=ix.w;
@@ -69,7 +72,7 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
   numtyp one_well=well[itype].x;

   numtyp factor_lj;
-  for ( ; nbor<nbor_end; nbor+=stride) {
+  for ( ; nbor<nbor_end; nbor+=n_stride) {

     int j=*nbor;
     factor_lj = sp_lj[sbmask(j)];
@@ -241,8 +244,47 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
       f.z+=temp1*dchi[2]-temp2*dUr[2];
     }
   } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1=energy;
@@ -265,17 +307,17 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
                         __global acctyp4 *ans, __global acctyp *engv,
                         __global int *err_flag, const int eflag,
                         const int vflag, const int start, const int inum,
-                        const int nall) {
+                        const int nall, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom+start;
+  int offset=tid%t_per_atom;

   __local numtyp sp_lj[4];
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
-  if (ii<4)
-    sp_lj[ii]=gum[ii+3];
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
-  __syncthreads();
-
-  if (ii<inum) {
+  sp_lj[0]=gum[3];
+  sp_lj[1]=gum[4];
+  sp_lj[2]=gum[5];
+  sp_lj[3]=gum[6];

   acctyp energy=(acctyp)0;
   acctyp4 f;
@@ -286,18 +328,21 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  if (ii<inum) {
   __global int *nbor=dev_ij+ii;
   int i=*nbor;
   nbor+=stride;
   int numj=*nbor;
   nbor+=stride;
   __global int *list_end=nbor+mul24(stride,numj);
+  nbor+=mul24(offset,stride);
+  int n_stride=mul24(t_per_atom,stride);

   numtyp4 ix=x_[i];
   int itype=ix.w;

   numtyp factor_lj;
-  for ( ; nbor<list_end; nbor+=stride) {
+  for ( ; nbor<list_end; nbor+=n_stride) {

     int j=*nbor;
     factor_lj = sp_lj[sbmask(j)];
@@ -338,8 +383,47 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
     }

   } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1+=energy;
@@ -361,27 +445,26 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,

 __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                              __global numtyp4* lj3_in, __global numtyp *gum,
-                             const int stride,
-                             __global int *dev_ij, __global acctyp4 *ans,
-                             __global acctyp *engv, __global int *err_flag,
-                             const int eflag,const int vflag, const int start,
-                             const int inum, const int nall) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                             const int stride, __global int *dev_ij,
+                             __global acctyp4 *ans, __global acctyp *engv,
+                             __global int *err_flag, const int eflag,
+                             const int vflag, const int start, const int inum,
+                             const int nall, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom+start;
+  int offset=tid%t_per_atom;

   __local numtyp sp_lj[4];
   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  if (ii<4)
-    sp_lj[ii]=gum[ii+3];
-  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    lj1[ii]=lj1_in[ii];
+  if (tid<4)
+    sp_lj[tid]=gum[tid+3];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
     if (eflag>0)
-      lj3[ii]=lj3_in[ii];
+      lj3[tid]=lj3_in[tid];
   }
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
-  __syncthreads();
-
-  if (ii<inum) {

   acctyp energy=(acctyp)0;
   acctyp4 f;
@@ -392,19 +475,24 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;

+  __syncthreads();
+
+  if (ii<inum) {
   __global int *nbor=dev_ij+ii;
   int i=*nbor;
   nbor+=stride;
   int numj=*nbor;
   nbor+=stride;
   __global int *list_end=nbor+mul24(stride,numj);
+  nbor+=mul24(offset,stride);
+  int n_stride=mul24(t_per_atom,stride);

   numtyp4 ix=x_[i];
   int iw=ix.w;
   int itype=mul24((int)MAX_SHARED_TYPES,iw);

   numtyp factor_lj;
-  for ( ; nbor<list_end; nbor+=stride) {
+  for ( ; nbor<list_end; nbor+=n_stride) {

     int j=*nbor;
     factor_lj = sp_lj[sbmask(j)];
@@ -443,8 +531,47 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     }

   } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }

   // Store answers
+  if (ii<inum && offset==0) {
   __global acctyp *ap1=engv+ii;
   if (eflag>0) {
     *ap1+=energy;
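kernel_lj_fast now indexes its shared-memory coefficient caches with tid instead of the pre-offset ii, and the __syncthreads() moved below the cooperative loads so every thread crosses the barrier before reading the tables. A stand-alone CUDA sketch of that caching pattern; the float4 table and the demo output are assumptions, while the real tables hold numtyp4 Lennard-Jones coefficients:

    #define MAX_SHARED_TYPES 8

    // One cooperative copy of the per-type-pair coefficient table into
    // shared memory per block, then a barrier before anyone reads it.
    __global__ void cache_coeffs_demo(const float4 *lj1_in, float4 *out,
                                      int n) {
      __shared__ float4 lj1[MAX_SHARED_TYPES * MAX_SHARED_TYPES];
      int tid = threadIdx.x;
      if (tid < MAX_SHARED_TYPES * MAX_SHARED_TYPES)
        lj1[tid] = lj1_in[tid];    // loads must use the local thread index
      __syncthreads();             // all threads wait for the cache
      int i = blockIdx.x * blockDim.x + tid;
      if (i < n)
        out[i] = lj1[i % (MAX_SHARED_TYPES * MAX_SHARED_TYPES)];
    }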
@@ -18,8 +18,6 @@
 #ifndef PAIR_GPU_KERNEL_H
 #define PAIR_GPU_KERNEL_H

-#define MAX_SHARED_TYPES 8
-
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
@@ -32,7 +30,7 @@

 #ifdef NV_KERNEL

-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"

 #else

@@ -42,6 +40,7 @@
 #define BLOCK_ID_X get_group_id(0)
 #define BLOCK_SIZE_X get_local_size(0)
 #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
+#define MAX_SHARED_TYPES 8

 #endif

@@ -32,20 +32,25 @@ template <class numtyp, class acctyp>
 GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false),
                                   _max_bytes(0.0) {
   device=&pair_gpu_device;
+  ans=new PairGPUAns<numtyp,acctyp>();
+  nbor=new PairGPUNbor;
 }

 template <class numtyp, class acctyp>
 GB_GPU_MemoryT::~GB_GPU_Memory() {
   clear();
+  delete ans;
+  delete nbor;
 }

 template <class numtyp, class acctyp>
 int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
-  return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
 }

 template <class numtyp, class acctyp>
-bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
+int GB_GPU_MemoryT::init(const int ntypes, const double gamma,
                           const double upsilon, const double mu,
                           double **host_shape, double **host_well,
                           double **host_cutsq, double **host_sigma,
@@ -64,24 +69,24 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
     gpu_nbor=true;

   int _gpu_host=0;
-  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
   if (host_nlocal>0)
     _gpu_host=1;

-  if (!device->init(false,true,nlocal,host_nlocal,nall,0,gpu_nbor,_gpu_host,
-                    max_nbors,cell_size,true))
-    return false;
+  _threads_per_atom=device->threads_per_atom();
+  int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0,
+                           _gpu_host,max_nbors,cell_size,true);
+  if (success!=0)
+    return success;

   ucl_device=device->gpu;
   atom=&device->atom;
-  nbor=&device->nbor;

-  _block_size=BLOCK_1D;
-  if (static_cast<size_t>(_block_size)>ucl_device->group_size())
-    _block_size=ucl_device->group_size();
+  _block_size=device->pair_block_size();
   compile_kernels(*ucl_device);

   // Initialize host-device load balancer
-  hd_balancer.init(device,gpu_split);
+  hd_balancer.init(device,gpu_nbor,gpu_split);

   // Initialize timers for the selected GPU
   time_pair.init(*ucl_device);
@@ -90,8 +95,9 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (lj_types<=MAX_SHARED_TYPES && _block_size>=MAX_SHARED_TYPES) {
-    lj_types=MAX_SHARED_TYPES;
+  int max_shared_types=device->max_shared_types();
+  if (lj_types<=max_shared_types && _block_size>=max_shared_types) {
+    lj_types=max_shared_types;
     shared_types=true;
   }
   _lj_types=lj_types;
@@ -186,12 +192,19 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
   }

   if (multiple_forms)
-    atom->dev_ans.zero();
+    ans->dev_ans.zero();

-  _max_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+  _max_bytes=ans->gpu_bytes()+nbor->gpu_bytes();

   // Memory for ilist ordered by particle type
-  return (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS);
+  if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
+    return 0;
+  else return -3;
+}
+
+template <class numtyp, class acctyp>
+void GB_GPU_MemoryT::estimate_gpu_overhead() {
+  device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead);
 }

 template <class numtyp, class acctyp>
@@ -209,9 +222,9 @@ void GB_GPU_MemoryT::clear() {

   // Output any timing information
   acc_timers();
-  double single[6], times[6];
+  double single[9], times[9];

-  single[0]=atom->transfer_time();
+  single[0]=atom->transfer_time()+ans->transfer_time();
   single[1]=nbor->time_nbor.total_seconds();
   single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+
             nbor->time_kernel.total_seconds();
@@ -220,15 +233,18 @@ void GB_GPU_MemoryT::clear() {
     single[4]=time_pair.total_seconds();
   else
     single[4]=0;
-  single[5]=atom->cast_time();
+  single[5]=atom->cast_time()+ans->cast_time();
+  single[6]=_gpu_overhead;
+  single[7]=_driver_overhead;
+  single[8]=ans->cpu_idle_time();

-  MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,device->replica());
+  MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,device->replica());
   double avg_split=hd_balancer.all_avg_split();

   _max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
               sigma_epsilon.row_bytes()+cut_form.row_bytes()+
               shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
-              gamma_upsilon_mu.row_bytes();
+              gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes();
   double mpi_max_bytes;
   MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
              device->replica());
@@ -255,10 +271,19 @@ void GB_GPU_MemoryT::clear() {
       fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
       fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
     }
+    fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size);
     fprintf(screen,"Average split: %.4f.\n",avg_split);
     fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
+    fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
+    fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
     fprintf(screen,"-------------------------------------");
     fprintf(screen,"--------------------------------\n\n");
+
+    fprintf(screen,"Average split: %.4f.\n",avg_split);
+    fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
+
   }
   _max_bytes=0.0;
@@ -299,10 +324,9 @@ void GB_GPU_MemoryT::clear() {

 template <class numtyp, class acctyp>
 double GB_GPU_MemoryT::host_memory_usage() const {
-  return device->atom.host_memory_usage()+
-         device->nbor.host_memory_usage()+4*sizeof(numtyp)+
-         sizeof(GB_GPU_Memory<numtyp,acctyp>)+
-         device->nbor.max_atoms()*sizeof(int);
+  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
+         4*sizeof(numtyp)+sizeof(GB_GPU_Memory<numtyp,acctyp>)+
+         nbor->max_atoms()*sizeof(int);
 }

 template <class numtyp, class acctyp>
@@ -18,8 +18,6 @@
 #ifndef GB_GPU_MEMORY_H
 #define GB_GPU_MEMORY_H

-#define BLOCK_1D 64
-
 #include "pair_gpu_device.h"
 #include "pair_gpu_balance.h"
 #include "mpi.h"
@@ -35,8 +33,15 @@ class GB_GPU_Memory {
   * \param max_nbors initial number of rows in the neighbor matrix
   * \param cell_size cutoff + skin
   * \param gpu_split fraction of particles handled by device
-  * \return false if there is not sufficient memory or device init prob **/
-  bool init(const int ntypes, const double gamma,
+  * \return false if there is not sufficient memory or device init prob
+  *
+  * Returns:
+  * - 0 if successfull
+  * - -1 if fix gpu not found
+  * - -3 if there is an out of memory error
+  * - -4 if the GPU library was not compiled for GPU
+  * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, const double gamma,
            const double upsilon, const double mu, double **host_shape,
            double **host_well, double **host_cutsq, double **host_sigma,
            double **host_epsilon, double *host_lshape, int **h_form,
@@ -46,12 +51,16 @@ class GB_GPU_Memory {
            const int max_nbors, const double cell_size,
            const double gpu_split, FILE *screen);

+  /// Estimate the overhead for GPU context changes and CPU driver
+  void estimate_gpu_overhead();
+
   /// Check if there is enough storage for atom arrays and realloc if not
   /** \param success set to false if insufficient memory **/
   inline void resize_atom(const int inum, const int nall, bool &success) {
-    atom->resize(inum, nall, success);
-    if (multiple_forms) atom->dev_ans.zero();
-    double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+    atom->resize(nall, success);
+    ans->resize(inum, success);
+    if (multiple_forms) ans->dev_ans.zero();
+    double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
     if (bytes>_max_bytes)
       _max_bytes=bytes;
   }
@@ -74,7 +83,7 @@ class GB_GPU_Memory {
       success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
     }
     nbor->resize(nlocal,host_inum,max_nbors,success);
-    double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
+    double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
     if (bytes>_max_bytes)
       _max_bytes=bytes;
   }
@@ -91,6 +100,7 @@ class GB_GPU_Memory {

   /// Accumulate timers
   inline void acc_timers() {
+    if (device->time_device()) {
     if (nbor_time_avail) {
       nbor->time_nbor.add_to_total();
       nbor->time_kernel.add_to_total();
@@ -104,6 +114,8 @@ class GB_GPU_Memory {
       time_pair.add_to_total();
     }
     atom->acc_timers();
+    ans->acc_timers();
+    }
   }

   /// Accumulate timers
@@ -117,6 +129,7 @@ class GB_GPU_Memory {
       time_pair.zero();
     }
     atom->zero_timers();
+    ans->zero_timers();
   }

   // -------------------------- DEVICE DATA -------------------------
@@ -168,6 +181,10 @@ class GB_GPU_Memory {

   int last_ellipse, max_last_ellipse;

+  // ------------------------ FORCE/ENERGY DATA -----------------------
+
+  PairGPUAns<numtyp,acctyp> *ans;
+
   // --------------------------- NBOR DATA ----------------------------

   /// Neighbor data
@@ -183,10 +200,12 @@ class GB_GPU_Memory {
   UCL_Kernel k_gayberne, k_sphere_gb, k_lj_fast, k_lj;
   inline int block_size() { return _block_size; }

+  int _threads_per_atom;
  private:
   bool _allocated, _compiled;
   int _block_size;
   double _max_bytes;
+  double _gpu_overhead, _driver_overhead;

   void compile_kernels(UCL_Device &dev);
 };
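With init() returning an int status instead of a bool, callers can distinguish the failure modes listed in the header comment above. A hypothetical reporting helper; the message strings are assumptions, only the numeric codes come from the commit:

    #include <cstdio>

    // Map the documented init() status codes to messages.
    void report_init(int status) {
      switch (status) {
        case 0:  break;                                              // success
        case -1: std::printf("fix gpu not found\n"); break;
        case -3: std::printf("out of memory on device\n"); break;
        case -4: std::printf("library not compiled for GPU\n"); break;
        case -5: std::printf("double precision unsupported on card\n"); break;
        default: std::printf("unknown GPU init error %d\n", status);
      }
    }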
@@ -1,2 +1,2 @@
-Geryon Version 10.280
+Geryon Version 11.094

@@ -167,6 +167,7 @@ class UCL_Device {
   int _device, _num_devices;
   std::vector<cudaDeviceProp> _properties;
   std::vector<cudaStream_t> _cq;
+  std::vector<int> _device_ids;
 };

 // Grabs the properties for all devices
@@ -178,6 +179,7 @@ inline UCL_Device::UCL_Device() {
     if (deviceProp.major == 9999 && deviceProp.minor == 9999)
       break;
     _properties.push_back(deviceProp);
+    _device_ids.push_back(dev);
   }
   _device=-1;
   _cq.push_back(cudaStream_t());
@@ -194,7 +196,7 @@ inline void UCL_Device::set(int num) {
     return;
   for (int i=1; i<num_queues(); i++) pop_command_queue();
   cudaThreadExit();
-  CUDA_SAFE_CALL_NS(cudaSetDevice(num));
+  CUDA_SAFE_CALL_NS(cudaSetDevice(_device_ids[num]));
   _device=num;
 }

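The new _device_ids table keeps the user-visible device numbering dense while entries such as the 9999.9999 emulation device are skipped, so set(num) can map the user's index back to the real CUDA ordinal. The same idea in a minimal runtime-API sketch; usable_device_ids() is an assumed helper name:

    #include <vector>
    #include <cuda_runtime.h>

    // Position in the returned vector == user-visible device number;
    // the stored value == the real CUDA ordinal to pass to cudaSetDevice().
    std::vector<int> usable_device_ids() {
      std::vector<int> ids;
      int n = 0;
      cudaGetDeviceCount(&n);
      for (int dev = 0; dev < n; ++dev) {
        cudaDeviceProp p;
        cudaGetDeviceProperties(&p, dev);
        if (p.major == 9999 && p.minor == 9999) continue;  // emulation entry
        ids.push_back(dev);
      }
      return ids;
    }
    // set(num) then calls cudaSetDevice(ids[num]) rather than
    // cudaSetDevice(num).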
@@ -42,6 +42,7 @@ inline void ucl_sync(CUstream &stream) {
 }

 struct NVDProperties {
+  int device_id;
   std::string name;
   int major;
   int minor;
@@ -208,15 +209,20 @@ inline UCL_Device::UCL_Device() {
   for (int dev=0; dev<_num_devices; ++dev) {
     CUdevice m;
     CU_SAFE_CALL_NS(cuDeviceGet(&m,dev));
+    int major, minor;
+    CU_SAFE_CALL_NS(cuDeviceComputeCapability(&major,&minor,m));
+    if (major==9999)
+      continue;
+
     _properties.push_back(NVDProperties());
+    _properties.back().device_id=dev;
+    _properties.back().major=major;
+    _properties.back().minor=minor;

     char namecstr[1024];
     CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m));
     _properties.back().name=namecstr;

-    CU_SAFE_CALL_NS(cuDeviceComputeCapability(&_properties.back().major,
-                                              &_properties.back().minor,m));
-
     CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount,
                                          CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
@@ -262,9 +268,9 @@ inline void UCL_Device::set(int num) {
     CU_SAFE_CALL_NS(cuCtxDestroy(_context));
     for (int i=1; i<num_queues(); i++) pop_command_queue();
   }
-  CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,num));
+  _device=_properties[num].device_id;
+  CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
   CU_SAFE_CALL_NS(cuCtxCreate(&_context,0,_cu_device));
-  _device=num;
 }

 // List all devices along with all properties
@@ -25,6 +25,7 @@
 #define NVD_TIMER_H

 #include "nvd_macros.h"
+#include "nvd_device.h"

 namespace ucl_cudadr {

@@ -66,12 +67,23 @@ class UCL_Timer {
   /// Stop timing on command queue
   inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }

+  /// Block until the start event has been reached on device
+  inline void sync_start()
+    { CU_SAFE_CALL(cuEventSynchronize(start_event)); }
+
+  /// Block until the stop event has been reached on device
+  inline void sync_stop()
+    { CU_SAFE_CALL(cuEventSynchronize(stop_event)); }
+
   /// Set the time elapsed to zero (not the total_time)
   inline void zero() {
     CU_SAFE_CALL(cuEventRecord(start_event,_cq));
     CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
   }

+  /// Set the total time to zero
+  inline void zero_total() { _total_time=0.0; }
+
   /// Add time from previous start and stop to total
   /** Forces synchronization **/
   inline double add_to_total()
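The new sync_start()/sync_stop() members block the host on a recorded event rather than on the whole command queue, which is what lets the load balancer time device work without draining unrelated kernels. An equivalent CUDA runtime-API sketch of timing one region and blocking only on the stop event; the function name is assumed:

    #include <cuda_runtime.h>

    // Time whatever is enqueued between the two event records on stream cq,
    // blocking the host only on the stop event (what sync_stop() does).
    float time_region_ms(cudaStream_t cq) {
      cudaEvent_t start_event, stop_event;
      cudaEventCreate(&start_event);
      cudaEventCreate(&stop_event);
      cudaEventRecord(start_event, cq);
      // ... enqueue kernels on cq here ...
      cudaEventRecord(stop_event, cq);
      cudaEventSynchronize(stop_event);   // == UCL_Timer::sync_stop()
      float ms = 0.0f;
      cudaEventElapsedTime(&ms, start_event, stop_event);
      cudaEventDestroy(start_event);
      cudaEventDestroy(stop_event);
      return ms;
    }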
@@ -25,6 +25,7 @@
 #define OCL_TIMER_H

 #include "ocl_macros.h"
+#include "ocl_device.h"

 namespace ucl_opencl {

@@ -67,10 +68,21 @@ class UCL_Timer {
   /// Stop timing on default command queue
   inline void stop() { clEnqueueMarker(_cq,&stop_event); }

+  /// Block until the start event has been reached on device
+  inline void sync_start()
+    { CL_SAFE_CALL(clWaitForEvents(1,&start_event)); }
+
+  /// Block until the stop event has been reached on device
+  inline void sync_stop()
+    { CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); }
+
   /// Set the time elapsed to zero (not the total_time)
   inline void zero()
     { clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); }

+  /// Set the total time to zero
+  inline void zero_total() { _total_time=0.0; }
+
   /// Add time from previous start and stop to total
   /** Forces synchronization **/
   inline double add_to_total()
@@ -206,6 +206,191 @@
     add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
   }

+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22, t23 *a23) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                       t26 *a26) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                       t26 *a26, t27 *a27) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                       t26 *a26, t27 *a27, t28 *a28) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28, class t29>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                       t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28, class t29, class t30>
+  inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                       t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                       t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                       t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                       t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                       t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
+  }

 // ---------------------------------------------------------------------------

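The ladder of add_args() overloads above is the pre-C++11 idiom for accepting a variable number of kernel arguments; with variadic templates the whole family collapses to one definition per function. A sketch under that assumption (C++17 fold syntax; UCL_KernelDemo and its trivial bodies are stand-ins, not the real UCL_Kernel):

    #include <cstdio>

    struct UCL_KernelDemo {
      int nargs = 0;
      template <class T> void add_arg(T *arg) { ++nargs; (void)arg; }
      void clear_args() { nargs = 0; }
      void run() { std::printf("launch with %d args\n", nargs); }

      // One definition replaces every add_args() overload above.
      template <class... Ts>
      void add_args(Ts*... args) { (add_arg(args), ...); }

      // Likewise for the run() overloads that follow below.
      template <class... Ts>
      void run(Ts*... args) { clear_args(); (add_arg(args), ...); run(); }
    };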
@ -439,6 +624,211 @@
     run();
   }
 
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22, t23 *a23) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                  t26 *a26) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                  t26 *a26, t27 *a27) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                  t26 *a26, t27 *a27, t28 *a28) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28, class t29>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                  t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
+    run();
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28, class t29, class t30>
+  inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                  t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                  t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                  t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                  t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                  t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
+    run();
+  }
+
 // ---------------------------------------------------------------------------
 
   template <class t1>
@ -671,3 +1061,208 @@
     run(cq);
   }
 
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22, t23 *a23) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                     t26 *a26) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                     t26 *a26, t27 *a27) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                     t26 *a26, t27 *a27, t28 *a28) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28, class t29>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                     t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
+    run(cq);
+  }
+
+  template <class t1, class t2, class t3, class t4, class t5,
+            class t6, class t7, class t8, class t9, class t10,
+            class t11, class t12, class t13, class t14, class t15,
+            class t16, class t17, class t18, class t19, class t20,
+            class t21, class t22, class t23, class t24, class t25,
+            class t26, class t27, class t28, class t29, class t30>
+  inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
+                     t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
+                     t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
+                     t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
+                     t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
+                     t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
+    clear_args();
+    add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
+    add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
+    add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
+    add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
+    add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
+    add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
+    run(cq);
+  }
 
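Each run()/run_cq() overload above composes the same three steps: clear the pending argument list, push each pointer, then launch, optionally on a caller-supplied command queue. A hedged, stand-alone sketch of that shape, with FakeQueue standing in for a real command queue and all names illustrative:

#include <cstdio>

struct FakeQueue { int id; };           // stand-in for a real command queue

class Kernel {
 public:
  template <class t1, class t2>
  inline void run(t1 *a1, t2 *a2)
    { clear_args(); add_arg(a1); add_arg(a2); run(); }
  template <class t1, class t2>
  inline void run_cq(FakeQueue &cq, t1 *a1, t2 *a2)
    { clear_args(); add_arg(a1); add_arg(a2); run(cq); }
 private:
  inline void clear_args() { _n=0; }
  template <class t> inline void add_arg(t *) { ++_n; }
  inline void run() { std::printf("launch, %d args\n",_n); }
  inline void run(FakeQueue &cq) { std::printf("launch on queue %d\n",cq.id); }
  int _n;
};

int main() {
  Kernel k; FakeQueue q={1};
  int eflag=1; double cut=2.5;
  k.run(&eflag,&cut);
  k.run_cq(q,&eflag,&cut);
  return 0;
}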
@ -61,20 +61,23 @@ class UCL_D_Mat : public UCL_BaseMat {
   inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
                    const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
     clear();
-    _kind=kind;
-    _rows=rows;
-    _cols=cols;
     int err=_device_alloc(*this,cq,rows,cols,_pitch,kind);
-    _row_size=_pitch/sizeof(numtyp);
-    #ifndef _UCL_DEVICE_PTR_MAT
-    _end=_array+_row_size*cols;
-    #endif
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate "
                 << rows*cols*sizeof(numtyp) << " bytes on device.\n";
       exit(1);
+      #endif
+      return err;
     }
+
+    _kind=kind;
+    _rows=rows;
+    _cols=cols;
+    _row_size=_pitch/sizeof(numtyp);
+    #ifndef _UCL_DEVICE_PTR_MAT
+    _end=_array+_row_size*cols;
     #endif
     #ifdef _OCL_MAT
     _offset=0;
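The pattern in this and the following alloc() hunks is uniform: with UCL_NO_EXIT defined the error branch now returns the allocation status instead of terminating the process, and member state (_kind, _rows, _cols, ...) is committed only after a successful device allocation. A minimal sketch of the convention, assuming a UCL_SUCCESS-style zero status (names here are stand-ins, not the library's):

#include <cstdlib>
#include <iostream>

#define UCL_SUCCESS_SKETCH 0            // stand-in for UCL_SUCCESS

inline int alloc_bytes(void **ptr, const std::size_t bytes) {
  *ptr=std::malloc(bytes);
  if (*ptr==NULL) {
    #ifndef UCL_NO_EXIT
    std::cerr << "Error: could not allocate " << bytes << " bytes.\n";
    std::exit(1);
    #endif
    return -3;                          // caller handles it when UCL_NO_EXIT is set
  }
  return UCL_SUCCESS_SKETCH;            // commit object state only after this
}

int main() {
  void *p;
  int err=alloc_bytes(&p,1024);
  if (err==UCL_SUCCESS_SKETCH) std::free(p);
  return err;
}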
@ -94,20 +97,23 @@ class UCL_D_Mat : public UCL_BaseMat {
   inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
                    const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
     clear();
-    _kind=kind;
-    _rows=rows;
-    _cols=cols;
     int err=_device_alloc(*this,device,rows,cols,_pitch,kind);
-    _row_size=_pitch/sizeof(numtyp);
-    #ifndef _UCL_DEVICE_PTR_MAT
-    _end=_array+_row_size*cols;
-    #endif
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate "
                 << rows*cols*sizeof(numtyp) << " bytes on device.\n";
       exit(1);
+      #endif
+      return err;
     }
+
+    _kind=kind;
+    _rows=rows;
+    _cols=cols;
+    _row_size=_pitch/sizeof(numtyp);
+    #ifndef _UCL_DEVICE_PTR_MAT
+    _end=_array+_row_size*cols;
     #endif
     #ifdef _OCL_MAT
     _offset=0;
@ -60,19 +60,24 @@ class UCL_D_Vec : public UCL_BaseMat {
                    const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
 
     clear();
-    _kind=kind;
-    _cols=cols;
     _row_bytes=cols*sizeof(numtyp);
     int err=_device_alloc(*this,cq,_row_bytes,kind);
-    #ifndef _UCL_DEVICE_PTR_MAT
-    _end=_array+cols;
-    #endif
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes
                 << " bytes on device.\n";
+      _row_bytes=0;
       exit(1);
+      #endif
+      _row_bytes=0;
+      return err;
     }
+
+    _kind=kind;
+    _cols=cols;
+    #ifndef _UCL_DEVICE_PTR_MAT
+    _end=_array+cols;
     #endif
     #ifdef _OCL_MAT
     _offset=0;
@ -90,19 +95,23 @@ class UCL_D_Vec : public UCL_BaseMat {
   inline int alloc(const size_t cols, UCL_Device &device,
                    const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
     clear();
-    _kind=kind;
-    _cols=cols;
     _row_bytes=cols*sizeof(numtyp);
     int err=_device_alloc(*this,device,_row_bytes,kind);
-    #ifndef _UCL_DEVICE_PTR_MAT
-    _end=_array+cols;
-    #endif
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes
                 << " bytes on device.\n";
+      _row_bytes=0;
       exit(1);
+      #endif
+      _row_bytes=0;
+      return err;
     }
+
+    _kind=kind;
+    _cols=cols;
+    #ifndef _UCL_DEVICE_PTR_MAT
+    _end=_array+cols;
     #endif
     #ifdef _OCL_MAT
     _offset=0;
@ -39,7 +39,11 @@ class UCL_H_Mat : public UCL_BaseMat {
   };
   typedef numtyp data_type;
 
-  UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { }
+  UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) {
+    #ifdef _OCL_MAT
+    _carray=(cl_mem)(0);
+    #endif
+  }
   ~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
 
   /// Construct with specied number of rows and columns
@ -59,18 +63,23 @@ class UCL_H_Mat : public UCL_BaseMat {
   inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
                    const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
     clear();
-    _cols=cols;
-    _rows=rows;
     _row_bytes=cols*sizeof(numtyp);
-    _kind=kind;
-    int err=_host_alloc(*this,cq,_row_bytes*_rows,kind);
+    int err=_host_alloc(*this,cq,_row_bytes*rows,kind);
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
                 << " bytes on host.\n";
+      _row_bytes=0;
       exit(1);
-    }
     #endif
+      _row_bytes=0;
+      return err;
+    }
+
+    _cols=cols;
+    _rows=rows;
+    _kind=kind;
     _end=_array+rows*cols;
     return err;
   }
@ -85,19 +94,24 @@ class UCL_H_Mat : public UCL_BaseMat {
   inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
                    const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
     clear();
-    _cols=cols;
-    _rows=rows;
     _row_bytes=cols*sizeof(numtyp);
-    _kind=kind;
-    int err=_host_alloc(*this,device,_row_bytes*_rows,kind);
-    _end=_array+rows*cols;
-    #ifndef UCL_NO_EXIT
+    int err=_host_alloc(*this,device,_row_bytes*rows,kind);
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
                 << " bytes on host.\n";
+      _row_bytes=0;
       exit(1);
-    }
     #endif
+      _row_bytes=0;
+      return err;
+    }
+
+    _cols=cols;
+    _rows=rows;
+    _kind=kind;
+    _end=_array+rows*cols;
     return err;
   }
 
@ -39,7 +39,11 @@ class UCL_H_Vec : public UCL_BaseMat {
   };
   typedef numtyp data_type;
 
-  UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { }
+  UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) {
+    #ifdef _OCL_MAT
+    _carray=(cl_mem)(0);
+    #endif
+  }
   ~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
 
   /// Construct with n columns
@ -59,18 +63,24 @@ class UCL_H_Vec : public UCL_BaseMat {
   inline int alloc(const size_t cols, mat_type &cq,
                    const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
     clear();
-    _cols=cols;
     _row_bytes=cols*sizeof(numtyp);
-    _kind=kind;
     int err=_host_alloc(*this,cq,_row_bytes,kind);
-    _end=_array+cols;
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes
                 << " bytes on host.\n";
+      _row_bytes=0;
       exit(1);
-    }
     #endif
+      _row_bytes=0;
+      return err;
+    }
+
+    _cols=cols;
+    _kind=kind;
+    _end=_array+cols;
     return err;
   }
 
@ -84,18 +94,24 @@ class UCL_H_Vec : public UCL_BaseMat {
   inline int alloc(const size_t cols, UCL_Device &device,
                    const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
     clear();
-    _cols=cols;
     _row_bytes=cols*sizeof(numtyp);
-    _kind=kind;
     int err=_host_alloc(*this,device,_row_bytes,kind);
-    _end=_array+cols;
-    #ifndef UCL_NO_EXIT
     if (err!=UCL_SUCCESS) {
+      #ifndef UCL_NO_EXIT
       std::cerr << "UCL Error: Could not allocate " << _row_bytes
                 << " bytes on host.\n";
+      _row_bytes=0;
       exit(1);
-    }
     #endif
+      _row_bytes=0;
+      return err;
+    }
+
+    _cols=cols;
+    _kind=kind;
+    _end=_array+cols;
     return err;
   }
 
@ -25,8 +25,18 @@
 #ifndef UCL_NV_KERNEL_H
 #define UCL_NV_KERNEL_H
 
-#define GLOBAL_ID_X threadIdx.x+__mul24(blockIdx.x,blockDim.x)
-#define GLOBAL_ID_Y threadIdx.y+__mul24(blockIdx.y,blockDim.y)
+#if (__CUDA_ARCH__ < 200)
+#define mul24 __mul24
+#define MEM_THREADS 16
+#else
+#define mul24(X,Y) (X)*(Y)
+#define MEM_THREADS 32
+#endif
+
+#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
+#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
+#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
+#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y);
 #define THREAD_ID_X threadIdx.x
 #define THREAD_ID_Y threadIdx.y
 #define BLOCK_ID_X blockIdx.x
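The new macros make the 24-bit multiply a compile-time choice: __mul24 is the fast path on sm_1x parts, while Fermi-class (sm_20 and later) hardware multiplies 32-bit integers at full rate, so mul24 collapses to a plain product. A small sketch of how kernel code stays architecture-neutral under this scheme (the guard is replicated here so the example is self-contained; kernel name is illustrative):

#if (__CUDA_ARCH__ < 200)
#define mul24 __mul24                   // 24-bit multiply: fast on sm_1x
#else
#define mul24(X,Y) (X)*(Y)              // full-rate 32-bit multiply on sm_20+
#endif

__global__ void index_demo(int *out, const int n) {
  // Identical source for both architectures; the macro picks the fast path.
  int gid=threadIdx.x+mul24((int)blockIdx.x,(int)blockDim.x);
  if (gid<n)
    out[gid]=gid;
}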
@ -35,8 +45,9 @@
 #define BLOCK_SIZE_Y blockDim.y
 #define __kernel extern "C" __global__
 #define __local __shared__
-#define mul24 __mul24
 #define __global
 #define __inline static __inline__ __device__
+#define atom_add atomicAdd
 
 #endif
 
@ -28,7 +28,7 @@ static LJ96_GPU_Memory<PRECISION,ACC_PRECISION> LJ96MF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                    double **host_lj2, double **host_lj3, double **host_lj4,
                    double **offset, double *special_lj, const int inum,
                    const int nall, const int max_nbors, const int maxspecial,
@ -53,13 +53,11 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                         host_lj4, offset, special_lj, inum, nall, 300,
                         maxspecial, cell_size, gpu_split, screen);
-    if (!init_ok)
-      return false;
-  }
 
   LJ96MF.device->world_barrier();
   if (message)
@ -74,46 +72,46 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                               last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                               host_lj4, offset, special_lj, inum,
-                               nall, 300, maxspecial, cell_size, gpu_split,
-                               screen);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                          offset, special_lj, inum, nall, 300, maxspecial,
+                          cell_size, gpu_split, screen);
     LJ96MF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+
+  if (init_ok==0)
+    LJ96MF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void lj96_gpu_clear() {
   LJ96MF.clear();
 }
 
-int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum_full,
+int** lj96_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
-                         double *boxlo, double *boxhi, int *tag, int **nspecial,
+                         double *sublo, double *subhi, int *tag, int **nspecial,
                          int **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
-                         const double cpu_time, bool &success) {
-  return LJ96MF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
-                        boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                        vatom, host_start, cpu_time, success);
+                         int **ilist, int **jnum, const double cpu_time,
+                         bool &success) {
+  return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                        subhi, tag, nspecial, special, eflag, vflag, eatom,
+                        vatom, host_start, ilist, jnum, cpu_time, success);
 }
 
-void lj96_gpu_compute(const int timestep, const int ago, const int inum_full,
-                      const int nall, double **host_x, int *host_type,
-                      int *ilist, int *numj, int **firstneigh,
-                      const bool eflag, const bool vflag, const bool eatom,
-                      const bool vatom, int &host_start, const double cpu_time,
-                      bool &success) {
-  LJ96MF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
-                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
+void lj96_gpu_compute(const int ago, const int inum_full, const int nall,
+                      double **host_x, int *host_type, int *ilist, int *numj,
+                      int **firstneigh, const bool eflag, const bool vflag,
+                      const bool eatom, const bool vatom, int &host_start,
+                      const double cpu_time, bool &success) {
+  LJ96MF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
+                 eflag,vflag,eatom,vatom,host_start,cpu_time,success);
 }
 
 double lj96_gpu_bytes() {
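The init path now records an int status per rank instead of returning early, so the staggered bring-up (rank 0 first, then each rank sharing a GPU in turn) runs to completion on every rank before the status is acted on. Sketched below with hypothetical stand-ins for the init and barrier calls; only the control flow mirrors the hunk above:

static int do_init() { return 0; }      // hypothetical per-rank init, 0 on success
static void world_barrier() { }         // hypothetical: all MPI ranks sync
static void gpu_barrier() { }           // hypothetical: ranks sharing one GPU sync

int staggered_init(int world_me, int gpu_rank, int procs_per_gpu) {
  int init_ok=0;
  if (world_me==0)
    init_ok=do_init();                  // rank 0 initializes first
  world_barrier();
  for (int i=0; i<procs_per_gpu; i++) {
    if (gpu_rank==i && world_me!=0)
      init_ok=do_init();                // remaining ranks take turns
    gpu_barrier();
  }
  return init_ok;                       // 0 on success, nonzero error otherwise
}

int main() { return staggered_init(0,0,1); }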
@ -18,8 +18,6 @@
 #ifndef LJ96_GPU_KERNEL
 #define LJ96_GPU_KERNEL
 
-#define MAX_SHARED_TYPES 8
-
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
@ -46,7 +44,7 @@
 
 #ifdef NV_KERNEL
 
-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"
 texture<float4> pos_tex;
 
 #ifdef _DOUBLE_DOUBLE
@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 #define __inline inline
 
 #define fetch_pos(i,y) x_[i]
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8
 
 #endif
 
@ -82,19 +82,21 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types,
                           __global numtyp *sp_lj_in, __global int *dev_nbor,
-                          __global acctyp4 *ans, __global acctyp *engv,
-                          const int eflag, const int vflag, const int inum,
-                          const int nall, const int nbor_pitch) {
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag,
+                          const int vflag, const int inum, const int nall,
+                          const int nbor_pitch, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
 
   __local numtyp sp_lj[4];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  if (ii<inum) {
-
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0;
@ -104,18 +106,31 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
 
+  if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+    for ( ; nbor<list_end; nbor+=n_stride) {
 
       int j=*nbor;
       factor_lj = sp_lj[sbmask(j)];
@ -157,8 +172,47 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       }
 
     } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
   // Store answers
+  if (ii<inum && offset==0) {
     __global acctyp *ap1=engv+ii;
     if (eflag>0) {
       *ap1=energy;
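The reduction block added above is a classic strided shared-memory tree: t_per_atom threads cooperate on one atom and combine partial forces in log2(t_per_atom) halving steps; the original kernels issue no synchronization inside the loop, relying on warp-synchronous execution for t_per_atom up to the warp size. A standalone sketch under those same assumptions (names and the surrounding setup are illustrative):

#define BLOCK_PAIR 64

__global__ void reduce_demo(const float *partial, float *total,
                            const int t_per_atom) {
  __shared__ float red_acc[BLOCK_PAIR];
  int tid=threadIdx.x;
  int offset=tid%t_per_atom;            // lane within this atom's thread group
  red_acc[tid]=partial[blockIdx.x*blockDim.x+tid];
  for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
    // warp-synchronous: valid only for t_per_atom <= warp size, as assumed
    // by the era's hardware model in the kernels above
    if (offset<s)
      red_acc[tid]+=red_acc[tid+s];
  }
  if (offset==0)                        // one writer per atom group
    total[blockIdx.x*blockDim.x+tid]=red_acc[tid];
}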
@ -176,26 +230,27 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
 
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in,
-                               __global numtyp* sp_lj_in, __global int *dev_nbor,
+                               __global numtyp* sp_lj_in,
+                               __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv,
                                const int eflag, const int vflag, const int inum,
-                               const int nall, const int nbor_pitch) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                               const int nall, const int nbor_pitch,
+                               const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
 
   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
-  if (ii<4)
-    sp_lj[ii]=sp_lj_in[ii];
-  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    lj1[ii]=lj1_in[ii];
+  if (tid<4)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
     if (eflag>0)
-      lj3[ii]=lj3_in[ii];
+      lj3[tid]=lj3_in[tid];
   }
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
-  __syncthreads();
-
-  if (ii<inum) {
-
   acctyp energy=(acctyp)0;
   acctyp4 f;
@ -206,19 +261,34 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
 
+  __syncthreads();
+
+  if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     int iw=ix.w;
     int itype=mul24((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+    for ( ; nbor<list_end; nbor+=n_stride) {
 
       int j=*nbor;
       factor_lj = sp_lj[sbmask(j)];
@ -258,8 +328,47 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       }
 
     } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
   // Store answers
+  if (ii<inum && offset==0) {
     __global acctyp *ap1=engv+ii;
     if (eflag>0) {
       *ap1=energy;
@ -42,7 +42,7 @@ int LJ96_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool LJ96_GPU_MemoryT::init(const int ntypes,
+int LJ96_GPU_MemoryT::init(const int ntypes,
                             double **host_cutsq, double **host_lj1,
                             double **host_lj2, double **host_lj3,
                             double **host_lj4, double **host_offset,
@ -50,14 +50,18 @@ bool LJ96_GPU_MemoryT::init(const int ntypes,
                             const int nall, const int max_nbors,
                             const int maxspecial, const double cell_size,
                             const double gpu_split, FILE *_screen) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,lj96_cut_gpu_kernel);
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,lj96_cut_gpu_kernel);
+  if (success!=0)
+    return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
-    lj_types=MAX_SHARED_TYPES;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
     shared_types=true;
   }
   _lj_types=lj_types;
@ -84,7 +88,7 @@ bool LJ96_GPU_MemoryT::init(const int ntypes,
 
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
@ -122,9 +126,10 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
 
-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
 
-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
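The grid-size change accounts for thread splitting: with _threads_per_atom threads assigned to each atom, a block of BX threads now covers only BX/_threads_per_atom atoms, so the grid must grow by the same factor. A tiny host-side check of the arithmetic (values illustrative):

#include <cmath>
#include <cstdio>

static int grid_size(const int inum, const int BX, const int t_per_atom) {
  return static_cast<int>(std::ceil(static_cast<double>(inum)/
                                    (BX/t_per_atom)));
}

int main() {
  // 64 threads, 4 per atom -> 16 atoms per block; 1000 atoms need 63 blocks.
  std::printf("%d\n",grid_size(1000,64,4));   // prints 63
  return 0;
}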
@ -133,16 +138,18 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch);
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
+                          &ainum, &anall, &nbor_pitch,
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch);
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &anall, &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
@@ -29,8 +29,15 @@ class LJ96_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq, double **host_lj1,
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, double **host_lj1,
             double **host_lj2, double **host_lj3, double **host_lj4,
             double **host_offset, double *host_special_lj,
             const int nlocal, const int nall, const int max_nbors,
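Editor's note: the init() entry points in this commit now report status as an int code instead of a bool. A minimal sketch of how a caller might map the codes above to messages; the helper itself is hypothetical and not part of the library, only the code values come from the doc comment:

  #include <cstdio>

  // Hypothetical caller-side helper; illustrative only.
  static const char *init_error(int code) {
    switch (code) {
      case 0:  return "success";
      case -1: return "fix gpu not found";
      case -3: return "out of memory on device";
      case -4: return "GPU library not compiled for this accelerator";
      case -5: return "double precision not supported on card";
      default: return "unknown initialization error";
    }
  }

  // Usage sketch:
  //   int init_ok = LJ96MF.init(...);
  //   if (init_ok != 0)
  //     fprintf(stderr, "GPU init failed: %s\n", init_error(init_ok));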
@@ -28,12 +28,11 @@ static LJL_GPU_Memory<PRECISION,ACC_PRECISION> LJLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool ljl_gpu_init(const int ntypes, double **cutsq,
-                  double **host_lj1, double **host_lj2, double **host_lj3,
-                  double **host_lj4, double **offset, double *special_lj,
-                  const int inum, const int nall, const int max_nbors,
-                  const int maxspecial, const double cell_size, int &gpu_mode,
-                  FILE *screen) {
+int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                 double **host_lj2, double **host_lj3, double **host_lj4,
+                 double **offset, double *special_lj, const int inum,
+                 const int nall, const int max_nbors, const int maxspecial,
+                 const double cell_size, int &gpu_mode, FILE *screen) {
   LJLMF.clear();
   gpu_mode=LJLMF.device->gpu_mode();
   double gpu_split=LJLMF.device->particle_split();
@@ -54,13 +53,11 @@ bool ljl_gpu_init(const int ntypes, double **cutsq,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                        host_lj4, offset, special_lj, inum, nall, 300,
                        maxspecial, cell_size, gpu_split, screen);
-    if (!init_ok)
-      return false;
-  }
 
   LJLMF.device->world_barrier();
   if (message)
@@ -75,45 +72,45 @@ bool ljl_gpu_init(const int ntypes, double **cutsq,
               last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                              host_lj4, offset, special_lj, inum, nall, 300,
-                              maxspecial, cell_size, gpu_split,
-                              screen);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                         offset, special_lj, inum, nall, 300, maxspecial,
+                         cell_size, gpu_split, screen);
     LJLMF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+
+  if (init_ok==0)
+    LJLMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void ljl_gpu_clear() {
   LJLMF.clear();
 }
 
-int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum_full,
-                        const int nall, double **host_x, int *host_type,
-                        double *boxlo, double *boxhi, int *tag, int **nspecial,
-                        int **special, const bool eflag, const bool vflag,
-                        const bool eatom, const bool vatom, int &host_start,
-                        const double cpu_time, bool &success) {
-  return LJLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
-                       boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                       vatom, host_start, cpu_time, success);
+int ** ljl_gpu_compute_n(const int ago, const int inum_full,
+                         const int nall, double **host_x, int *host_type,
+                         double *sublo, double *subhi, int *tag, int **nspecial,
+                         int **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         int **ilist, int **jnum, const double cpu_time,
+                         bool &success) {
+  return LJLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                       subhi, tag, nspecial, special, eflag, vflag, eatom,
+                       vatom, host_start, ilist, jnum, cpu_time, success);
 }
 
-void ljl_gpu_compute(const int timestep, const int ago, const int inum_full,
-                     const int nall, double **host_x, int *host_type,
-                     int *ilist, int *numj, int **firstneigh,
-                     const bool eflag, const bool vflag, const bool eatom,
-                     const bool vatom, int &host_start, const double cpu_time,
-                     bool &success) {
-  LJLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
+void ljl_gpu_compute(const int ago, const int inum_full, const int nall,
+                     double **host_x, int *host_type, int *ilist, int *numj,
+                     int **firstneigh, const bool eflag, const bool vflag,
+                     const bool eatom, const bool vatom, int &host_start,
+                     const double cpu_time, bool &success) {
+  LJLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
 }
 
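Editor's note: the neighbor-build entry point now returns int** (host pointers to the device-built neighbor data) and exposes the i-list and per-atom neighbor counts through the new ilist/jnum output arguments. A hedged host-side usage sketch; every variable name here is an illustrative placeholder, only the argument order comes from the signature above:

  // Illustrative call site for the new ljl_gpu_compute_n() signature.
  int host_start;
  bool success = true;
  int *ilist = nullptr, *numj = nullptr;   // filled in by the library
  int **firstneigh =
      ljl_gpu_compute_n(neighbor_age, nlocal, nall, x, type,
                        sublo, subhi, tag, nspecial, special,
                        eflag, vflag, eatom, vatom, host_start,
                        &ilist, &numj, cpu_time, success);
  if (!success) {
    // device ran out of memory; fall back to the CPU path
  }
  // Atoms [host_start, nlocal) remain for the host to compute when the
  // run is split between CPU and GPU.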
@@ -18,8 +18,6 @@
 #ifndef LJ_GPU_KERNEL
 #define LJ_GPU_KERNEL
 
-#define MAX_SHARED_TYPES 8
-
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
@@ -46,7 +44,7 @@
 
 #ifdef NV_KERNEL
 
-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"
 texture<float4> pos_tex;
 
@@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 #define __inline inline
 
 #define fetch_pos(i,y) x_[i]
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8
 
 #endif
 
@@ -82,19 +82,21 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types,
                           __global numtyp *sp_lj_in, __global int *dev_nbor,
-                          __global acctyp4 *ans, __global acctyp *engv,
-                          const int eflag, const int vflag, const int inum,
-                          const int nall, const int nbor_pitch) {
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag,
+                          const int vflag, const int inum, const int nall,
+                          const int nbor_pitch, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
 
   __local numtyp sp_lj[4];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  if (ii<inum) {
 
     acctyp energy=(acctyp)0;
     acctyp4 f;
     f.x=(acctyp)0;
@@ -104,18 +106,31 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     for (int i=0; i<6; i++)
       virial[i]=(acctyp)0;
 
+  if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+    for ( ; nbor<list_end; nbor+=n_stride) {
 
       int j=*nbor;
       factor_lj = sp_lj[sbmask(j)];
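Editor's note: the kernels can now walk the neighbor list in two layouts. When dev_nbor==dev_packed the indices live in the strided (pitched) matrix built on the device; otherwise *nbor is an offset into a densely packed list. The t_per_atom lanes that share one atom each take every t_per_atom-th neighbor starting at their own offset. A self-contained host-side model of the address arithmetic (pure illustration; names are placeholders):

  /* Model of the two neighbor layouts walked by the kernels above. */
  typedef struct { int start, end, stride; } nbor_walk;

  /* dense matrix: one column per atom, rows nbor_pitch apart */
  nbor_walk walk_strided(int base, int numj, int nbor_pitch,
                         int offset, int t_per_atom) {
    nbor_walk w = { base + offset*nbor_pitch,
                    base + numj*nbor_pitch,
                    t_per_atom*nbor_pitch };
    return w;
  }

  /* packed list: contiguous run beginning at the stored start offset */
  nbor_walk walk_packed(int start_offset, int numj,
                        int offset, int t_per_atom) {
    nbor_walk w = { start_offset + offset,
                    start_offset + numj,
                    t_per_atom };
    return w;
  }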
@@ -156,8 +171,47 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       }
 
     } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
 
   // Store answers
+  if (ii<inum && offset==0) {
     __global acctyp *ap1=engv+ii;
     if (eflag>0) {
       *ap1=energy;
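Editor's note: when more than one thread is assigned per atom, partial forces and energies are combined with a power-of-two tree reduction in shared/local memory before lane 0 of each group stores the result. A standalone model of the same pattern (names hypothetical; it assumes t_per_atom is a power of two, and the kernel additionally relies on the group fitting in one warp, since there is no barrier inside the loop):

  /* Combine the partial sums of the t_per_atom lanes sharing one atom.
     After the loop, the lane with offset==0 holds the group total. */
  void reduce_group(float red_acc[4][64], int tid, int offset,
                    int t_per_atom) {
    for (unsigned int s = t_per_atom/2; s > 0; s >>= 1)
      if (offset < (int)s)                 /* lower half accumulates upper */
        for (int r = 0; r < 4; r++)
          red_acc[r][tid] += red_acc[r][tid + s];
  }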
@@ -175,26 +229,27 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
 
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in,
-                               __global numtyp* sp_lj_in, __global int *dev_nbor,
+                               __global numtyp* sp_lj_in,
+                               __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv,
                                const int eflag, const int vflag, const int inum,
-                               const int nall, const int nbor_pitch) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                               const int nall, const int nbor_pitch,
+                               const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
 
   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
-  if (ii<4)
-    sp_lj[ii]=sp_lj_in[ii];
-  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    lj1[ii]=lj1_in[ii];
+  if (tid<4)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
     if (eflag>0)
-      lj3[ii]=lj3_in[ii];
+      lj3[tid]=lj3_in[tid];
   }
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
-  __syncthreads();
 
-  if (ii<inum) {
 
     acctyp energy=(acctyp)0;
     acctyp4 f;
@@ -205,19 +260,34 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     for (int i=0; i<6; i++)
       virial[i]=(acctyp)0;
 
+  __syncthreads();
+
+  if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     int iw=ix.w;
     int itype=mul24((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+    for ( ; nbor<list_end; nbor+=n_stride) {
 
       int j=*nbor;
       factor_lj = sp_lj[sbmask(j)];
@@ -256,8 +326,47 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       }
 
     } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
 
   // Store answers
+  if (ii<inum && offset==0) {
     __global acctyp *ap1=engv+ii;
     if (eflag>0) {
       *ap1=energy;
@@ -42,7 +42,7 @@ int LJL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool LJL_GPU_MemoryT::init(const int ntypes,
+int LJL_GPU_MemoryT::init(const int ntypes,
                            double **host_cutsq, double **host_lj1,
                            double **host_lj2, double **host_lj3,
                            double **host_lj4, double **host_offset,
@@ -50,14 +50,18 @@ bool LJL_GPU_MemoryT::init(const int ntypes,
                            const int nall, const int max_nbors,
                            const int maxspecial, const double cell_size,
                            const double gpu_split, FILE *_screen) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,lj_cut_gpu_kernel);
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,lj_cut_gpu_kernel);
+  if (success!=0)
+    return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
-    lj_types=MAX_SHARED_TYPES;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
     shared_types=true;
   }
   _lj_types=lj_types;
@@ -84,7 +88,7 @@ bool LJL_GPU_MemoryT::init(const int ntypes,
 
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
@@ -122,9 +126,10 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
 
-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
 
-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -133,16 +138,18 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch);
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
+                          &ainum, &anall, &nbor_pitch,
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch);
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &anall, &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
@@ -29,8 +29,15 @@ class LJL_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq,
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq,
             double **host_lj1, double **host_lj2, double **host_lj3,
             double **host_lj4, double **host_offset, double *host_special_lj,
             const int nlocal, const int nall, const int max_nbors,
@@ -28,7 +28,7 @@ static LJC_GPU_Memory<PRECISION,ACC_PRECISION> LJCMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                   double **host_lj2, double **host_lj3, double **host_lj4,
                   double **offset, double *special_lj, const int inum,
                   const int nall, const int max_nbors, const int maxspecial,
@@ -55,15 +55,12 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                        host_lj4, offset, special_lj, inum, nall, 300,
-                       maxspecial, cell_size, gpu_split, screen,
-                       host_cut_ljsq, host_cut_coulsq, host_special_coul,
-                       qqrd2e);
-    if (!init_ok)
-      return false;
-  }
+                       maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
+                       host_cut_coulsq, host_special_coul, qqrd2e);
 
   LJCMF.device->world_barrier();
   if (message)
@@ -78,48 +75,51 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
               last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                              host_lj4, offset, special_lj, inum, nall, 300,
-                              maxspecial, cell_size, gpu_split,
-                              screen, host_cut_ljsq, host_cut_coulsq,
-                              host_special_coul, qqrd2e);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                         offset, special_lj, inum, nall, 300, maxspecial,
+                         cell_size, gpu_split, screen, host_cut_ljsq,
+                         host_cut_coulsq, host_special_coul, qqrd2e);
    LJCMF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+
+  if (init_ok==0)
+    LJCMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void ljc_gpu_clear() {
  LJCMF.clear();
 }
 
-int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum_full,
-                        const int nall, double **host_x, int *host_type,
-                        double *boxlo, double *boxhi, int *tag, int **nspecial,
-                        int **special, const bool eflag, const bool vflag,
-                        const bool eatom, const bool vatom, int &host_start,
-                        const double cpu_time, bool &success, double *host_q) {
-  return LJCMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
-                       boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                       vatom, host_start, cpu_time, success, host_q);
+int** ljc_gpu_compute_n(const int ago, const int inum_full,
+                        const int nall, double **host_x, int *host_type,
+                        double *sublo, double *subhi, int *tag, int **nspecial,
+                        int **special, const bool eflag, const bool vflag,
+                        const bool eatom, const bool vatom, int &host_start,
+                        int **ilist, int **jnum, const double cpu_time,
+                        bool &success, double *host_q, double *boxlo,
+                        double *prd) {
+  return LJCMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                       subhi, tag, nspecial, special, eflag, vflag, eatom,
+                       vatom, host_start, ilist, jnum, cpu_time, success,
+                       host_q, boxlo, prd);
 }
 
-void ljc_gpu_compute(const int timestep, const int ago, const int inum_full,
-                     const int nall, double **host_x, int *host_type,
-                     int *ilist, int *numj, int **firstneigh,
-                     const bool eflag, const bool vflag, const bool eatom,
-                     const bool vatom, int &host_start, const double cpu_time,
-                     bool &success, double *host_q) {
-  LJCMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
-                firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
-                host_q);
+void ljc_gpu_compute(const int ago, const int inum_full, const int nall,
+                     double **host_x, int *host_type, int *ilist, int *numj,
+                     int **firstneigh, const bool eflag, const bool vflag,
+                     const bool eatom, const bool vatom, int &host_start,
+                     const double cpu_time, bool &success, double *host_q,
+                     const int nlocal, double *boxlo, double *prd) {
  LJCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
+                vflag,eatom,vatom,host_start,cpu_time,success,host_q,
+                nlocal,boxlo,prd);
 }
 
 double ljc_gpu_bytes() {
@@ -18,8 +18,6 @@
 #ifndef LJC_GPU_KERNEL
 #define LJC_GPU_KERNEL
 
-#define MAX_SHARED_TYPES 8
-
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
@@ -46,7 +44,7 @@
 
 #ifdef NV_KERNEL
 
-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"
 texture<float4> pos_tex;
 texture<float> q_tex;
 
@@ -82,6 +80,8 @@ __inline float fetch_q(const int& i, const float *q)
 
 #define fetch_pos(i,y) x_[i]
 #define fetch_q(i,y) q_[i]
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8
 
 #endif
 
@@ -92,13 +92,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types,
                           __global numtyp *sp_lj_in, __global int *dev_nbor,
-                          __global acctyp4 *ans, __global acctyp *engv,
-                          const int eflag, const int vflag, const int inum,
-                          const int nall, const int nbor_pitch,
-                          __global numtyp *q_ , __global numtyp *cutsq,
-                          const numtyp qqrd2e) {
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag,
+                          const int vflag, const int inum, const int nall,
+                          const int nbor_pitch, __global numtyp *q_ ,
+                          __global numtyp *cutsq, const numtyp qqrd2e,
+                          const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
 
   __local numtyp sp_lj[8];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
@@ -109,7 +113,6 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  if (ii<inum) {
     acctyp energy=(acctyp)0;
     acctyp e_coul=(acctyp)0;
     acctyp4 f;
@@ -120,18 +123,31 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     for (int i=0; i<6; i++)
       virial[i]=(acctyp)0;
 
+  if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     numtyp qtmp=fetch_q(i,q_);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+    for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
 
       numtyp factor_lj, factor_coul;
@@ -188,8 +204,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       }
 
     } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
 
   // Store answers
+  if (ii<inum && offset==0) {
     __global acctyp *ap1=engv+ii;
     if (eflag>0) {
       *ap1=energy;
@@ -209,30 +266,30 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
 
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in,
-                               __global numtyp* sp_lj_in, __global int *dev_nbor,
+                               __global numtyp* sp_lj_in,
+                               __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv,
                                const int eflag, const int vflag, const int inum,
                                const int nall, const int nbor_pitch,
                                __global numtyp *q_ , __global numtyp *_cutsq,
-                               const numtyp qqrd2e) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                               const numtyp qqrd2e, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
 
   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[8];
-  if (ii<8)
-    sp_lj[ii]=sp_lj_in[ii];
-  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    lj1[ii]=lj1_in[ii];
-    cutsq[ii]=_cutsq[ii];
+  if (tid<8)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
+    cutsq[tid]=_cutsq[tid];
     if (eflag>0)
-      lj3[ii]=lj3_in[ii];
+      lj3[tid]=lj3_in[tid];
   }
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
-  __syncthreads();
 
-  if (ii<inum) {
 
     acctyp energy=(acctyp)0;
     acctyp e_coul=(acctyp)0;
@@ -244,19 +301,34 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     for (int i=0; i<6; i++)
       virial[i]=(acctyp)0;
 
+  __syncthreads();
+
+  if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     numtyp qtmp=fetch_q(i,q_);
     int iw=ix.w;
     int itype=mul24((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;
 
       numtyp factor_lj, factor_coul;
@@ -312,8 +384,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       }
 
     } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
 
   // Store answers
+  if (ii<inum && offset==0) {
     __global acctyp *ap1=engv+ii;
     if (eflag>0) {
       *ap1=energy;
@@ -43,7 +43,7 @@ int LJC_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-bool LJC_GPU_MemoryT::init(const int ntypes,
+int LJC_GPU_MemoryT::init(const int ntypes,
                            double **host_cutsq, double **host_lj1,
                            double **host_lj2, double **host_lj3,
                            double **host_lj4, double **host_offset,
@@ -53,14 +53,18 @@ bool LJC_GPU_MemoryT::init(const int ntypes,
                            const double gpu_split, FILE *_screen,
                            double **host_cut_ljsq, double **host_cut_coulsq,
                            double *host_special_coul, const double qqrd2e) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,ljc_cut_gpu_kernel);
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,ljc_cut_gpu_kernel);
+  if (success!=0)
+    return success;
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
-    lj_types=MAX_SHARED_TYPES;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
    shared_types=true;
   }
   _lj_types=lj_types;
@@ -95,7 +99,7 @@ bool LJC_GPU_MemoryT::init(const int ntypes,
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
                    sp_lj.row_bytes();
-  return true;
+  return 0;
 }
 
 template <class numtyp, class acctyp>
@@ -134,9 +138,10 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
 
-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
 
-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -145,19 +150,20 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch,
                           &this->atom->dev_q.begin(), &cutsq.begin(),
-                          &_qqrd2e);
+                          &_qqrd2e, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
-                     &cutsq.begin(), &_qqrd2e);
+                     &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
@@ -29,8 +29,15 @@ class LJC_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq, double **host_lj1,
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, double **host_lj1,
             double **host_lj2, double **host_lj3, double **host_lj4,
             double **host_offset, double *host_special_lj,
             const int nlocal, const int nall, const int max_nbors,
@@ -28,7 +28,7 @@ static LJCL_GPU_Memory<PRECISION,ACC_PRECISION> LJCLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                    double **host_lj2, double **host_lj3, double **host_lj4,
                    double **offset, double *special_lj, const int inum,
                    const int nall, const int max_nbors, const int maxspecial,
@@ -56,15 +56,12 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     fflush(screen);
   }
 
-  if (world_me==0) {
-    bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                             host_lj4, offset, special_lj, inum, nall, 300,
-                             maxspecial, cell_size, gpu_split, screen,
-                             host_cut_ljsq, host_cut_coulsq, host_special_coul,
-                             qqrd2e,g_ewald);
-    if (!init_ok)
-      return false;
-  }
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                        offset, special_lj, inum, nall, 300, maxspecial,
+                        cell_size, gpu_split, screen, host_cut_ljsq,
+                        host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
 
   LJCLMF.device->world_barrier();
   if (message)
@@ -79,48 +76,51 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
               last_gpu,i);
       fflush(screen);
     }
-    if (gpu_rank==i && world_me!=0) {
-      bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                               host_lj4, offset, special_lj, inum, nall, 300,
-                               maxspecial, cell_size, gpu_split,
-                               screen, host_cut_ljsq, host_cut_coulsq,
-                               host_special_coul, qqrd2e, g_ewald);
-      if (!init_ok)
-        return false;
-    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                          offset, special_lj, inum, nall, 300, maxspecial,
+                          cell_size, gpu_split, screen, host_cut_ljsq,
+                          host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
    LJCLMF.device->gpu_barrier();
     if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
     fprintf(screen,"\n");
-  return true;
+
+  if (init_ok==0)
+    LJCLMF.estimate_gpu_overhead();
+  return init_ok;
 }
 
 void ljcl_gpu_clear() {
   LJCLMF.clear();
 }
 
-int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum_full,
-                         const int nall, double **host_x, int *host_type,
-                         double *boxlo, double *boxhi, int *tag, int **nspecial,
-                         int **special, const bool eflag, const bool vflag,
-                         const bool eatom, const bool vatom, int &host_start,
-                         const double cpu_time, bool &success, double *host_q) {
-  return LJCLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
-                        boxhi, tag, nspecial, special, eflag, vflag, eatom,
-                        vatom, host_start, cpu_time, success, host_q);
+int** ljcl_gpu_compute_n(const int ago, const int inum_full,
+                         const int nall, double **host_x, int *host_type,
+                         double *sublo, double *subhi, int *tag, int **nspecial,
+                         int **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         int **ilist, int **jnum, const double cpu_time,
+                         bool &success, double *host_q, double *boxlo,
+                         double *prd) {
+  return LJCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                        subhi, tag, nspecial, special, eflag, vflag, eatom,
+                        vatom, host_start, ilist, jnum, cpu_time, success,
+                        host_q, boxlo, prd);
 }
 
-void ljcl_gpu_compute(const int timestep, const int ago, const int inum_full,
-                      const int nall, double **host_x, int *host_type,
-                      int *ilist, int *numj, int **firstneigh,
-                      const bool eflag, const bool vflag, const bool eatom,
-                      const bool vatom, int &host_start, const double cpu_time,
-                      bool &success, double *host_q) {
-  LJCLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
-                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
-                 host_q);
+void ljcl_gpu_compute(const int ago, const int inum_full, const int nall,
+                      double **host_x, int *host_type, int *ilist, int *numj,
+                      int **firstneigh, const bool eflag, const bool vflag,
+                      const bool eatom, const bool vatom, int &host_start,
+                      const double cpu_time, bool &success, double *host_q,
+                      const int nlocal, double *boxlo, double *prd) {
  LJCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
                 host_q,nlocal,boxlo,prd);
 }
 
 double ljcl_gpu_bytes() {
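Editor's note: the lj/cut/coul/long driver forwards g_ewald because the kernels below evaluate the damped real-space Coulomb term used with Ewald/PPPM splitting. For reference, a scalar sketch of that energy term (the standard real-space expression, not copied from the kernel; the helper name is hypothetical):

  #include <math.h>

  /* Real-space Coulomb energy with Ewald/PPPM splitting, for r < cut_coul:
     E = qqrd2e * qi * qj * erfc(g_ewald * r) / r                         */
  double coul_long_energy(double qqrd2e, double qi, double qj,
                          double r, double g_ewald) {
    return qqrd2e * qi * qj * erfc(g_ewald * r) / r;
  }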
@@ -18,8 +18,6 @@
 #ifndef LJCL_GPU_KERNEL
 #define LJCL_GPU_KERNEL
 
-#define MAX_SHARED_TYPES 8
-
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp2 double2
@@ -54,7 +52,7 @@
 
 #ifdef NV_KERNEL
 
-#include "geryon/ucl_nv_kernel.h"
+#include "nv_kernel_def.h"
 texture<float4> pos_tex;
 texture<float> q_tex;
 
@@ -90,6 +88,8 @@ __inline float fetch_q(const int& i, const float *q)
 
 #define fetch_pos(i,y) x_[i]
 #define fetch_q(i,y) q_[i]
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8
 
 #endif
 
@@ -100,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types,
                           __global numtyp *sp_lj_in, __global int *dev_nbor,
-                          __global acctyp4 *ans, __global acctyp *engv,
-                          const int eflag, const int vflag, const int inum,
-                          const int nall, const int nbor_pitch,
-                          __global numtyp *q_ , const numtyp cut_coulsq,
-                          const numtyp qqrd2e, const numtyp g_ewald) {
-  // ii indexes the two interacting particles in gi
-  int ii=GLOBAL_ID_X;
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag,
+                          const int vflag, const int inum, const int nall,
+                          const int nbor_pitch, __global numtyp *q_,
+                          const numtyp cut_coulsq, const numtyp qqrd2e,
+                          const numtyp g_ewald, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
 
   __local numtyp sp_lj[8];
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
@@ -117,7 +121,6 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  if (ii<inum) {
     acctyp energy=(acctyp)0;
     acctyp e_coul=(acctyp)0;
     acctyp4 f;
@@ -128,18 +131,31 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     for (int i=0; i<6; i++)
       virial[i]=(acctyp)0;
 
+  if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
     numtyp qtmp=fetch_q(i,q_);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
+    for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
 
       numtyp factor_lj, factor_coul;
@@ -204,8 +220,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       }
 
     } // for nbor
+  } // if ii
+
+  // Reduce answers
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+    red_acc[4][tid]=e_coul;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<5; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+    e_coul=red_acc[4][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
 
   // Store answers
+  if (ii<inum && offset==0) {
     __global acctyp *ap1=engv+ii;
     if (eflag>0) {
       *ap1=energy;
@@ -225,28 +282,29 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
 
 __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp4* lj3_in,
-                               __global numtyp* sp_lj_in, __global int *dev_nbor,
+                               __global numtyp* sp_lj_in,
+                               __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv,
                                const int eflag, const int vflag, const int inum,
                                const int nall, const int nbor_pitch,
                                __global numtyp *q_ , const numtyp cut_coulsq,
-                               const numtyp qqrd2e, const numtyp g_ewald) {
-  // ii indexes the two interacting particles in gi
-  int ii=THREAD_ID_X;
+                               const numtyp qqrd2e, const numtyp g_ewald,
+                               const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
 
   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[8];
-  if (ii<8)
-    sp_lj[ii]=sp_lj_in[ii];
-  if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    lj1[ii]=lj1_in[ii];
+  if (tid<8)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
     if (eflag>0)
-      lj3[ii]=lj3_in[ii];
+      lj3[tid]=lj3_in[tid];
   }
-  ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
-  __syncthreads();
 
-  if (ii<inum) {
 
     acctyp energy=(acctyp)0;
     acctyp e_coul=(acctyp)0;
@@ -258,19 +316,34 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     for (int i=0; i<6; i++)
       virial[i]=(acctyp)0;
 
+  __syncthreads();
+
+  if (ii<inum) {
     __global int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+mul24(numj,nbor_pitch);
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
 
     numtyp4 ix=fetch_pos(i,x_); //x_[i];
|
||||||
numtyp qtmp=fetch_q(i,q_);
|
numtyp qtmp=fetch_q(i,q_);
|
||||||
int iw=ix.w;
|
int iw=ix.w;
|
||||||
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
int itype=mul24((int)MAX_SHARED_TYPES,iw);
|
||||||
|
|
||||||
for ( ; nbor<list_end; nbor+=nbor_pitch) {
|
for ( ; nbor<list_end; nbor+=n_stride) {
|
||||||
int j=*nbor;
|
int j=*nbor;
|
||||||
|
|
||||||
numtyp factor_lj, factor_coul;
|
numtyp factor_lj, factor_coul;
|
||||||
@ -334,8 +407,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
|
|||||||
}
|
}
|
||||||
|
|
||||||
} // for nbor
|
} // for nbor
|
||||||
|
} // if ii
|
||||||
|
|
||||||
|
// Reduce answers
|
||||||
|
if (t_per_atom>1) {
|
||||||
|
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||||
|
|
||||||
|
red_acc[0][tid]=f.x;
|
||||||
|
red_acc[1][tid]=f.y;
|
||||||
|
red_acc[2][tid]=f.z;
|
||||||
|
red_acc[3][tid]=energy;
|
||||||
|
red_acc[4][tid]=e_coul;
|
||||||
|
|
||||||
|
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||||
|
if (offset < s) {
|
||||||
|
for (int r=0; r<5; r++)
|
||||||
|
red_acc[r][tid] += red_acc[r][tid+s];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
f.x=red_acc[0][tid];
|
||||||
|
f.y=red_acc[1][tid];
|
||||||
|
f.z=red_acc[2][tid];
|
||||||
|
energy=red_acc[3][tid];
|
||||||
|
e_coul=red_acc[4][tid];
|
||||||
|
|
||||||
|
if (vflag>0) {
|
||||||
|
for (int r=0; r<6; r++)
|
||||||
|
red_acc[r][tid]=virial[r];
|
||||||
|
|
||||||
|
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||||
|
if (offset < s) {
|
||||||
|
for (int r=0; r<6; r++)
|
||||||
|
red_acc[r][tid] += red_acc[r][tid+s];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int r=0; r<6; r++)
|
||||||
|
virial[r]=red_acc[r][tid];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Store answers
|
// Store answers
|
||||||
|
if (ii<inum && offset==0) {
|
||||||
__global acctyp *ap1=engv+ii;
|
__global acctyp *ap1=engv+ii;
|
||||||
if (eflag>0) {
|
if (eflag>0) {
|
||||||
*ap1=energy;
|
*ap1=energy;
|
||||||
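
Note on the change above: both kernels now give every atom t_per_atom consecutive threads, each walking a strided slice of the neighbor list, and the partial results are merged in shared memory before one thread of the group stores them. A minimal stand-alone CUDA sketch of that reduction follows; reduce_sketch, f_out, and the fixed 256-slot array (standing in for BLOCK_PAIR) are hypothetical names for illustration, not taken from the diff.

__global__ void reduce_sketch(float *f_out, int inum, int t_per_atom) {
  int tid = threadIdx.x;
  // Group assignment mirrors the kernels above: t_per_atom consecutive
  // threads own atom ii; "offset" is a thread's rank within its group.
  int ii = blockIdx.x * (blockDim.x / t_per_atom) + tid / t_per_atom;
  int offset = tid % t_per_atom;

  // Each thread would accumulate a partial force over a strided slice of
  // atom ii's neighbor list; a dummy partial stands in here.
  float f = (ii < inum) ? 1.0f : 0.0f;

  __shared__ float red_acc[256];         // assumes blockDim.x <= 256
  red_acc[tid] = f;
  // Tree reduction within each group; after log2(t_per_atom) rounds the
  // group's first thread holds the full sum.
  for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1)
    if (offset < (int)s)
      red_acc[tid] += red_acc[tid + s];

  if (ii < inum && offset == 0)
    f_out[ii] = red_acc[tid];
}

As in the kernels above, no barrier is issued between reduction rounds; that is only safe in the warp-synchronous style of this hardware generation, assuming t_per_atom is a power of two no larger than the warp size so a group never spans warps. That caveat is an assumption of this sketch, not something the diff states.
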
@@ -43,7 +43,7 @@ int LJCL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
 }

 template <class numtyp, class acctyp>
-bool LJCL_GPU_MemoryT::init(const int ntypes,
+int LJCL_GPU_MemoryT::init(const int ntypes,
                             double **host_cutsq, double **host_lj1,
                             double **host_lj2, double **host_lj3,
                             double **host_lj4, double **host_offset,
@@ -54,14 +54,18 @@ bool LJCL_GPU_MemoryT::init(const int ntypes,
                             double **host_cut_ljsq, const double host_cut_coulsq,
                             double *host_special_coul, const double qqrd2e,
                             const double g_ewald) {
-  this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                    _screen,ljcl_cut_gpu_kernel);
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,ljcl_cut_gpu_kernel);
+  if (success!=0)
+    return success;

   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
-    lj_types=MAX_SHARED_TYPES;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
     shared_types=true;
   }
   _lj_types=lj_types;
@@ -94,7 +98,7 @@ bool LJCL_GPU_MemoryT::init(const int ntypes,

   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
-  return true;
+  return 0;
 }

 template <class numtyp, class acctyp>
@@ -132,9 +136,10 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;

-  int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));

-  int ainum=this->atom->inum();
+  int ainum=this->ans->inum();
   int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
@@ -143,19 +148,21 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
                           &lj3.begin(), &sp_lj.begin(),
                           &this->nbor->dev_nbor.begin(),
-                          &this->atom->dev_ans.begin(),
-                          &this->atom->dev_engv.begin(), &eflag, &vflag,
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
                           &ainum, &anall, &nbor_pitch,
                           &this->atom->dev_q.begin(), &_cut_coulsq,
-                          &_qqrd2e, &_g_ewald);
+                          &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
-                     &this->atom->dev_ans.begin(),
-                     &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
                      &anall, &nbor_pitch, &this->atom->dev_q.begin(),
-                     &_cut_coulsq, &_qqrd2e, &_g_ewald);
+                     &_cut_coulsq, &_qqrd2e, &_g_ewald,
+                     &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
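
The new GX expression follows from the kernel change: a block of BX threads now covers only BX/t_per_atom atoms, so the grid must grow by the same factor. A small helper restating the arithmetic (grid_size is a hypothetical name for illustration):

#include <cmath>

inline int grid_size(int inum, int block_size, int t_per_atom) {
  int atoms_per_block = block_size / t_per_atom;   // e.g. 128/4 = 32 atoms
  return static_cast<int>(std::ceil(static_cast<double>(inum) /
                                    atoms_per_block));
}
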
@@ -29,8 +29,15 @@ class LJCL_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
-    * \param gpu_split fraction of particles handled by device **/
-  bool init(const int ntypes, double **host_cutsq,
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq,
             double **host_lj1, double **host_lj2, double **host_lj3,
             double **host_lj4, double **host_offset, double *host_special_lj,
             const int nlocal, const int nall, const int max_nbors,
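
A hedged sketch of consuming the new integer status: only the code-to-meaning mapping comes from the doc comment above; describe_status() and the standalone main are illustrative assumptions, not part of the library.

#include <cstdio>

static const char *describe_status(int code) {  // hypothetical helper
  switch (code) {
    case  0: return "success";
    case -1: return "fix gpu not found";
    case -3: return "out of memory on device";
    case -4: return "GPU library not compiled for this accelerator";
    case -5: return "double precision not supported on card";
    default: return "unknown error";
  }
}

int main() {
  int status = -4;             // stand-in for the value returned by init()
  std::printf("init: %s\n", describe_status(status));
  return 0;
}
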
@@ -29,9 +29,8 @@ __win_sort _win_sort;
 #endif

 template <class numtyp, class acctyp>
-PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
-                              _vflag(false),_inum(0),_ilist(NULL),
-                              _newton(false) {
+PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),
+                              _max_gpu_bytes(0) {
 #ifndef USE_OPENCL
   sort_config.op = CUDPP_ADD;
   sort_config.datatype = CUDPP_UINT;
@@ -56,28 +55,20 @@ int PairGPUAtomT::bytes_per_atom() const {
   int id_space=0;
   if (_gpu_nbor)
     id_space=2;
-  int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space;
+  int bytes=4*sizeof(numtyp)+id_space;
   if (_rot)
-    bytes+=4*sizeof(numtyp)+4*sizeof(acctyp);
+    bytes+=4*sizeof(numtyp);
   if (_charge)
     bytes+=sizeof(numtyp);
   return bytes;
 }

 template <class numtyp, class acctyp>
-bool PairGPUAtomT::alloc(const int inum, const int nall) {
+bool PairGPUAtomT::alloc(const int nall) {
   _max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
-  if (_newton)
-    _max_local=_max_atoms;
-  else
-    _max_local=static_cast<int>(static_cast<double>(inum)*1.10);

   bool success=true;

-  int ans_elements=4;
-  if (_rot)
-    ans_elements+=4;
-
   // Ignore host/device transfers?
   bool cpuview=false;
   if (dev->device_type()==UCL_CPU)
@@ -107,8 +98,6 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
   success=success && (host_x.alloc(_max_atoms*4,*dev,
                                    UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
 #endif
-  success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
-  success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
   // Buffer for casting only if different precisions
   if (_charge)
     success=success && (host_q.alloc(_max_atoms,*dev,
@@ -120,15 +109,13 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {

   // --------------------------- Device allocations
-  _gpu_bytes=0;
+  int gpu_bytes=0;
   if (cpuview) {
 #ifdef GPU_CAST
     assert(0==1);
 #else
     dev_x.view(host_x);
 #endif
-    dev_engv.view(host_engv);
-    dev_ans.view(host_ans);
     if (_rot)
       dev_quat.view(host_quat);
     if (_charge)
@@ -140,49 +127,80 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
                         dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
     success=success && (UCL_SUCCESS==
                         dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
-    _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
+    gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
 #else
     success=success && (UCL_SUCCESS==
                         dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
 #endif
-    success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
-                                       UCL_WRITE_ONLY)==UCL_SUCCESS);
-    success=success && (dev_ans.alloc(ans_elements*_max_local,
-                                      *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
     if (_charge) {
       success=success && (dev_q.alloc(_max_atoms,*dev,
                                       UCL_READ_ONLY)==UCL_SUCCESS);
-      _gpu_bytes+=dev_q.row_bytes();
+      gpu_bytes+=dev_q.row_bytes();
     }
     if (_rot) {
       success=success && (dev_quat.alloc(_max_atoms*4,*dev,
                                          UCL_READ_ONLY)==UCL_SUCCESS);
-      _gpu_bytes+=dev_quat.row_bytes();
+      gpu_bytes+=dev_quat.row_bytes();
     }
   }
   if (_gpu_nbor) {
     success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
     success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
-    _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
+    gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
     if (_bonds) {
       success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
-      _gpu_bytes+=dev_tag.row_bytes();
+      gpu_bytes+=dev_tag.row_bytes();
     }
   }

-  _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();
+  gpu_bytes+=dev_x.row_bytes();
+  if (gpu_bytes>_max_gpu_bytes)
+    _max_gpu_bytes=gpu_bytes;

   _allocated=true;
   return success;
 }

 template <class numtyp, class acctyp>
-bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
-                        const bool rot, UCL_Device &devi, const bool gpu_nbor,
+bool PairGPUAtomT::add_fields(const bool charge, const bool rot,
+                              const bool gpu_nbor, const bool bonds) {
+  bool realloc=false;
+  if (charge && _charge==false) {
+    _charge=true;
+    realloc=true;
+  }
+  if (rot && _rot==false) {
+    _rot=true;
+    realloc=true;
+  }
+  if (gpu_nbor && _gpu_nbor==false) {
+    _gpu_nbor=true;
+    realloc=true;
+  }
+  if (bonds && _bonds==false) {
+    _bonds=true;
+    realloc=true;
+  }
+  if (realloc) {
+    _other=_charge || _rot;
+    int max_atoms=_max_atoms;
+    clear_resize();
+    return alloc(max_atoms);
+  }
+  return true;
+}
+
+template <class numtyp, class acctyp>
+bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
+                        UCL_Device &devi, const bool gpu_nbor,
                         const bool bonds) {
   clear();

   bool success=true;
+  _x_avail=false;
+  _q_avail=false;
+  _quat_avail=false;
+  _resized=false;
   _gpu_nbor=gpu_nbor;
   _bonds=bonds;
   _charge=charge;
@@ -190,33 +208,25 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
   _other=_charge || _rot;
   dev=&devi;

-  _e_fields=1;
-  if (_charge)
-    _e_fields++;
-  _ev_fields=6+_e_fields;
-
   // Initialize atom and nbor data
-  int ef_inum=inum;
-  if (ef_inum==0)
-    ef_inum=1000;
   int ef_nall=nall;
-  if (ef_nall<=ef_inum)
-    ef_nall=ef_inum*2;
+  if (ef_nall==0)
+    ef_nall=2000;

   // Initialize timers for the selected device
   time_pos.init(*dev);
-  time_other.init(*dev);
-  time_answer.init(*dev);
+  time_q.init(*dev);
+  time_quat.init(*dev);
   time_pos.zero();
-  time_other.zero();
-  time_answer.zero();
+  time_q.zero();
+  time_quat.zero();
   _time_cast=0.0;

 #ifdef GPU_CAST
   compile_kernels(*dev);
 #endif

-  return success && alloc(ef_inum,ef_nall);
+  return success && alloc(ef_nall);
 }

 template <class numtyp, class acctyp>
@@ -234,16 +244,12 @@ void PairGPUAtomT::clear_resize() {
     dev_quat.clear();
     host_quat.clear();
   }
-  dev_ans.clear();
-  dev_engv.clear();
 #ifndef GPU_CAST
   host_x.clear();
 #else
   host_x_cast.clear();
   host_type_cast.clear();
 #endif
-  host_ans.clear();
-  host_engv.clear();
   dev_cell_id.clear();
   dev_particle_id.clear();
   dev_tag.clear();
@@ -261,17 +267,14 @@ void PairGPUAtomT::clear_resize() {

 template <class numtyp, class acctyp>
 void PairGPUAtomT::clear() {
-  _gpu_bytes=0;
+  _max_gpu_bytes=0;
   if (!_allocated)
     return;

   time_pos.clear();
-  time_other.clear();
-  time_answer.clear();
+  time_q.clear();
+  time_quat.clear();
   clear_resize();
-  _inum=0;
-  _eflag=false;
-  _vflag=false;

 #ifdef GPU_CAST
   if (_compiled) {
@@ -289,255 +292,10 @@ double PairGPUAtomT::host_memory_usage() const {
     atom_bytes+=1;
   if (_rot)
     atom_bytes+=4;
-  int ans_bytes=atom_bytes+_ev_fields;
   return _max_atoms*atom_bytes*sizeof(numtyp)+
-         ans_bytes*(_max_local)*sizeof(acctyp)+
          sizeof(PairGPUAtom<numtyp,acctyp>);
 }

-template <class numtyp, class acctyp>
-void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
-                                const bool ef_atom, const bool vf_atom) {
-  time_answer.start();
-  _eflag=eflag;
-  _vflag=vflag;
-  _ef_atom=ef_atom;
-  _vf_atom=vf_atom;
-
-  int csize=_ev_fields;
-  if (!eflag)
-    csize-=_e_fields;
-  if (!vflag)
-    csize-=6;
-
-  if (csize>0)
-    ucl_copy(host_engv,dev_engv,_inum*csize,true);
-  if (_rot)
-    ucl_copy(host_ans,dev_ans,_inum*4*2,true);
-  else
-    ucl_copy(host_ans,dev_ans,_inum*4,true);
-  time_answer.stop();
-}
-
-template <class numtyp, class acctyp>
-void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
-                                const bool ef_atom, const bool vf_atom,
-                                int *ilist) {
-  _ilist=ilist;
-  copy_answers(eflag,vflag,ef_atom,vf_atom);
-}
-
-template <class numtyp, class acctyp>
-double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
-                                   double *virial) {
-  if (_eflag==false && _vflag==false)
-    return 0.0;
-
-  double evdwl=0.0;
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[i][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  } else {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      int ii=_ilist[i];
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[ii][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  }
-
-  evdwl*=0.5;
-  return evdwl;
-}
-
-template <class numtyp, class acctyp>
-double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
-                                   double *virial, double &ecoul) {
-  if (_eflag==false && _vflag==false) {
-    ecoul=0.0;
-    return 0.0;
-  }
-
-  if (_charge==false)
-    return energy_virial(eatom,vatom,virial);
-
-  double evdwl=0.0;
-  double _ecoul=0.0;
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-          _ecoul+=*ap;
-          eatom[i]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-          _ecoul+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[i][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  } else {
-    for (int i=0; i<_inum; i++) {
-      acctyp *ap=host_engv.begin()+i;
-      int ii=_ilist[i];
-      if (_eflag) {
-        if (_ef_atom) {
-          evdwl+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-          _ecoul+=*ap;
-          eatom[ii]+=*ap*0.5;
-          ap+=_inum;
-        } else {
-          evdwl+=*ap;
-          ap+=_inum;
-          _ecoul+=*ap;
-          ap+=_inum;
-        }
-      }
-      if (_vflag) {
-        if (_vf_atom) {
-          for (int j=0; j<6; j++) {
-            vatom[ii][j]+=*ap*0.5;
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        } else {
-          for (int j=0; j<6; j++) {
-            virial[j]+=*ap;
-            ap+=_inum;
-          }
-        }
-      }
-    }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-  }
-
-  evdwl*=0.5;
-  ecoul+=_ecoul*0.5;
-  return evdwl;
-}
-
-template <class numtyp, class acctyp>
-void PairGPUAtomT::get_answers(double **f, double **tor) {
-  acctyp *ap=host_ans.begin();
-  if (_gpu_nbor) {
-    for (int i=0; i<_inum; i++) {
-      f[i][0]+=*ap;
-      ap++;
-      f[i][1]+=*ap;
-      ap++;
-      f[i][2]+=*ap;
-      ap+=2;
-    }
-    if (_rot) {
-      for (int i=0; i<_inum; i++) {
-        tor[i][0]+=*ap;
-        ap++;
-        tor[i][1]+=*ap;
-        ap++;
-        tor[i][2]+=*ap;
-        ap+=2;
-      }
-    }
-  } else {
-    for (int i=0; i<_inum; i++) {
-      int ii=_ilist[i];
-      f[ii][0]+=*ap;
-      ap++;
-      f[ii][1]+=*ap;
-      ap++;
-      f[ii][2]+=*ap;
-      ap+=2;
-    }
-    if (_rot) {
-      for (int i=0; i<_inum; i++) {
-        int ii=_ilist[i];
-        tor[ii][0]+=*ap;
-        ap++;
-        tor[ii][1]+=*ap;
-        ap++;
-        tor[ii][2]+=*ap;
-        ap+=2;
-      }
-    }
-  }
-}
-
 // Sort arrays for neighbor list calculation
 template <class numtyp, class acctyp>
 void PairGPUAtomT::sort_neighbor(const int num_atoms) {
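
The new add_fields() path above lets a second style reuse atom storage that another style already initialized, reallocating only when a per-atom field actually switches on. A self-contained miniature of the idea (AtomStoreSketch is a hypothetical stand-in, not the real PairGPUAtom):

#include <iostream>

struct AtomStoreSketch {
  bool charge = false, quat = false;
  int max_atoms = 0;
  bool alloc(int nall) { max_atoms = nall; return true; }  // pretend realloc
  bool add_fields(bool need_charge, bool need_quat) {
    bool realloc = false;
    if (need_charge && !charge) { charge = true; realloc = true; }
    if (need_quat && !quat)     { quat   = true; realloc = true; }
    // Keep the existing capacity; only the set of per-atom fields grows.
    return realloc ? alloc(max_atoms) : true;
  }
};

int main() {
  AtomStoreSketch a;
  a.alloc(1000);
  a.add_fields(true, false);   // e.g. a coul/long style joins an lj/cut run
  std::cout << a.charge << " " << a.max_atoms << "\n";
  return 0;
}
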
@@ -23,7 +23,6 @@

 #ifdef USE_OPENCL

-#include "geryon/ocl_device.h"
 #include "geryon/ocl_timer.h"
 #include "geryon/ocl_mat.h"
 #include "geryon/ocl_kernel.h"
@@ -32,7 +31,6 @@ using namespace ucl_opencl;
 #else

 #include "cudpp.h"
-#include "geryon/nvd_device.h"
 #include "geryon/nvd_timer.h"
 #include "geryon/nvd_mat.h"
 #include "geryon/nvd_kernel.h"
@@ -40,10 +38,6 @@ using namespace ucl_cudadr;

 #endif

-#ifndef int2
-struct int2 { int x; int y; };
-#endif
-
 #include "pair_gpu_precision.h"

 template <class numtyp, class acctyp>
@@ -56,13 +50,9 @@ class PairGPUAtom {
   inline int max_atoms() const { return _max_atoms; }
   /// Current number of local+ghost atoms stored
   inline int nall() const { return _nall; }
-  /// Current number of local atoms stored
-  inline int inum() const { return _inum; }

   /// Set number of local+ghost atoms for future copy operations
   inline void nall(const int n) { _nall=n; }
-  /// Set number of local atoms for future copy operations
-  inline void inum(const int n) { _inum=n; }

   /// Memory usage per atom in this class
   int bytes_per_atom() const;
@@ -70,21 +60,33 @@ class PairGPUAtom {
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param rot True if atom storage needs quaternions
     * \param gpu_nbor True if neighboring will be performed on device **/
-  bool init(const int inum, const int nall, const bool charge, const bool rot,
+  bool init(const int nall, const bool charge, const bool rot,
             UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);

   /// Check if we have enough device storage and realloc if not
-  inline bool resize(const int inum, const int nall, bool &success) {
-    _inum=inum;
+  /** Returns true if resized with any call during this timestep **/
+  inline bool resize(const int nall, bool &success) {
     _nall=nall;
-    if (inum>_max_local || nall>_max_atoms) {
+    if (nall>_max_atoms) {
       clear_resize();
-      success = success && alloc(inum,nall);
-      return true;
+      success = success && alloc(nall);
+      _resized=true;
     }
-    return false;
+    return _resized;
   }

+  /// If already initialized by another LAMMPS style, add fields as necessary
+  /** \param rot True if atom storage needs quaternions
+    * \param gpu_nbor True if neighboring will be performed on device **/
+  bool add_fields(const bool charge, const bool rot, const bool gpu_nbor,
+                  const bool bonds);
+
+  /// Returns true if GPU is using charges
+  bool charge() { return _charge; }
+
+  /// Returns true if GPU is using quaternions
+  bool quat() { return _rot; }
+
   /// Only free matrices of length inum or nall for resizing
   void clear_resize();

@@ -100,28 +102,42 @@ class PairGPUAtom {
   /// Add copy times to timers
   inline void acc_timers() {
     time_pos.add_to_total();
-    time_answer.add_to_total();
-    if (_other)
-      time_other.add_to_total();
+    if (_charge)
+      time_q.add_to_total();
+    if (_rot)
+      time_quat.add_to_total();
   }

   /// Add copy times to timers
   inline void zero_timers() {
     time_pos.zero();
-    time_answer.zero();
-    if (_other)
-      time_other.zero();
+    if (_charge)
+      time_q.zero();
+    if (_rot)
+      time_quat.zero();
   }

   /// Return the total time for host/device data transfer
+  /** Zeros the total so that the atom times are only included once **/
   inline double transfer_time() {
-    double total=time_pos.total_seconds()+time_answer.total_seconds();
-    if (_other) total+=time_other.total_seconds();
+    double total=time_pos.total_seconds();
+    time_pos.zero_total();
+    if (_charge) {
+      total+=time_q.total_seconds();
+      time_q.zero_total();
+    }
+    if (_rot) {
+      total+=time_quat.total_seconds();
+      time_quat.zero_total();
+    }

     return total;
   }

   /// Return the total time for data cast/pack
-  inline double cast_time() { return _time_cast; }
+  /** Zeros the time so that atom times are only included once **/
+  inline double cast_time()
+    { double t=_time_cast; _time_cast=0.0; return t; }

   /// Pack LAMMPS atom type constants into matrix and copy to device
   template <class dev_typ, class t1>
@@ -216,8 +232,13 @@ class PairGPUAtom {

   // -------------------------COPY TO GPU ----------------------------------

+  /// Signal that we need to transfer atom data for next timestep
+  inline void data_unavail()
+    { _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }
+
   /// Cast positions and types to write buffer
   inline void cast_x_data(double **host_ptr, const int *host_type) {
+    if (_x_avail==false) {
       double t=MPI_Wtime();
 #ifdef GPU_CAST
       memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
@@ -237,11 +258,13 @@ class PairGPUAtom {
 #endif
       _time_cast+=MPI_Wtime()-t;
     }
+  }

   /// Copy positions and types to device asynchronously
   /** Copies nall() elements **/
   inline void add_x_data(double **host_ptr, int *host_type) {
     time_pos.start();
+    if (_x_avail==false) {
 #ifdef GPU_CAST
       ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
       ucl_copy(dev_type_cast,host_type_cast,_nall,true);
@@ -253,6 +276,8 @@ class PairGPUAtom {
 #else
       ucl_copy(dev_x,host_x,_nall*4,true);
 #endif
+      _x_avail=true;
+    }
     time_pos.stop();
   }

@@ -262,9 +287,10 @@ class PairGPUAtom {
     add_x_data(host_ptr,host_type);
   }

-  /// Cast charges to write buffer
+  // Cast charges to write buffer
   template<class cpytyp>
   inline void cast_q_data(cpytyp *host_ptr) {
+    if (_q_avail==false) {
       double t=MPI_Wtime();
       if (dev->device_type()==UCL_CPU) {
         if (sizeof(numtyp)==sizeof(double)) {
@@ -280,15 +306,20 @@ class PairGPUAtom {
       }
       _time_cast+=MPI_Wtime()-t;
     }
-
-  /// Copy charges to device asynchronously
-  inline void add_q_data() {
-    ucl_copy(dev_q,host_q,_nall,true);
   }

-  /// Cast quaternions to write buffer
+  // Copy charges to device asynchronously
+  inline void add_q_data() {
+    if (_q_avail==false) {
+      ucl_copy(dev_q,host_q,_nall,true);
+      _q_avail=true;
+    }
+  }
+
+  // Cast quaternions to write buffer
   template<class cpytyp>
   inline void cast_quat_data(cpytyp *host_ptr) {
+    if (_quat_avail==false) {
       double t=MPI_Wtime();
       if (dev->device_type()==UCL_CPU) {
         if (sizeof(numtyp)==sizeof(double)) {
@@ -304,45 +335,20 @@ class PairGPUAtom {
       }
       _time_cast+=MPI_Wtime()-t;
     }
-
-  /// Copy quaternions to device
-  /** Copies nall()*4 elements **/
-  inline void add_quat_data() {
-    ucl_copy(dev_quat,host_quat,_nall*4,true);
   }

-  /// Copy data other than pos and data to device
-  inline void add_other_data() {
-    time_other.start();
-    if (_charge)
-      add_q_data();
-    if (_rot)
-      add_quat_data();
-    time_other.stop();
+  // Copy quaternions to device
+  /** Copies nall()*4 elements **/
+  inline void add_quat_data() {
+    if (_quat_avail==false) {
+      ucl_copy(dev_quat,host_quat,_nall*4,true);
+      _quat_avail=true;
+    }
   }

   /// Return number of bytes used on device
-  inline double gpu_bytes() { return _gpu_bytes; }
-  // -------------------------COPY FROM GPU -------------------------------
-
-  /// Copy answers from device into read buffer asynchronously
-  void copy_answers(const bool eflag, const bool vflag,
-                    const bool ef_atom, const bool vf_atom);
-
-  /// Copy answers from device into read buffer asynchronously
-  void copy_answers(const bool eflag, const bool vflag,
-                    const bool ef_atom, const bool vf_atom, int *ilist);
-
-  /// Copy energy and virial data into LAMMPS memory
-  double energy_virial(double *eatom, double **vatom, double *virial);
-
-  /// Copy energy and virial data into LAMMPS memory
-  double energy_virial(double *eatom, double **vatom, double *virial,
-                       double &ecoul);
-
-  /// Add forces and torques from the GPU into a LAMMPS pointer
-  void get_answers(double **f, double **tor);
+  inline double max_gpu_bytes()
+    { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }

   // ------------------------------ DATA ----------------------------------

@@ -352,10 +358,6 @@ class PairGPUAtom {
   UCL_D_Vec<numtyp> dev_q;
   /// Quaternions
   UCL_D_Vec<numtyp> dev_quat;
-  /// Force and possibly torque
-  UCL_D_Vec<acctyp> dev_ans;
-  /// Energy and virial per-atom storage
-  UCL_D_Vec<acctyp> dev_engv;

 #ifdef GPU_CAST
   UCL_D_Vec<double> dev_x_cast;
@@ -370,10 +372,6 @@ class PairGPUAtom {
   UCL_H_Vec<numtyp> host_q;
   /// Buffer for moving quat data to GPU
   UCL_H_Vec<numtyp> host_quat;
-  /// Force and possibly torque data on host
-  UCL_H_Vec<acctyp> host_ans;
-  /// Energy/virial data on host
-  UCL_H_Vec<acctyp> host_engv;

   /// Cell list identifiers for device nbor builds
   UCL_D_Vec<unsigned> dev_cell_id;
@@ -383,7 +381,7 @@ class PairGPUAtom {
   UCL_D_Vec<int> dev_tag;

   /// Device timers
-  UCL_Timer time_pos, time_other, time_answer;
+  UCL_Timer time_pos, time_q, time_quat;

   /// Geryon device
   UCL_Device *dev;
@@ -397,17 +395,17 @@ class PairGPUAtom {

   bool _compiled;

-  bool alloc(const int inum, const int nall);
-
-  bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
-  int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields;
+  // True if data has been copied to device already
+  bool _x_avail, _q_avail, _quat_avail, _resized;
+
+  bool alloc(const int nall);
+
+  bool _allocated, _rot, _charge, _other;
+  int _max_atoms, _nall;
   bool _gpu_nbor, _bonds;
-  int *_ilist;
   double _time_cast;

-  double _gpu_bytes;
-
-  bool _newton;
+  double _max_gpu_bytes;

 #ifndef USE_OPENCL
   CUDPPConfiguration sort_config;
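
The *_avail flags threaded through the header above form a copy-once-per-timestep guard: any number of styles may request the position, charge, or quaternion transfer, but only the first request after data_unavail() pays for the cast and the host-to-device copy. A compilable miniature (XBufferSketch is a hypothetical illustration, not the real class):

#include <iostream>

struct XBufferSketch {
  bool x_avail = false;
  int copies = 0;
  void data_unavail() { x_avail = false; }  // atoms moved; next copy is real
  void add_x_data() {
    if (!x_avail) {      // only the first caller per step pays the transfer
      ++copies;          // the real code would cast and ucl_copy here
      x_avail = true;
    }
  }
};

int main() {
  XBufferSketch x;
  x.add_x_data(); x.add_x_data();   // second call is free
  x.data_unavail();                 // new timestep
  x.add_x_data();
  std::cout << x.copies << "\n";    // prints 2
  return 0;
}
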
@@ -23,7 +23,7 @@

 #define _HD_BALANCE_EVERY 25
 #define _HD_BALANCE_WEIGHT 0.5
-#define _HD_BALANCE_GAP 1.05
+#define _HD_BALANCE_GAP 1.10

 /// Host/device load balancer
 template<class numtyp, class acctyp>
@@ -33,7 +33,8 @@ class PairGPUBalance {
   inline ~PairGPUBalance() { clear(); }

   /// Clear any old data and setup for new LAMMPS run
-  inline void init(PairGPUDevice<numtyp, acctyp> *gpu, const double split);
+  inline void init(PairGPUDevice<numtyp, acctyp> *gpu, const bool gpu_nbor,
+                   const double split);

   /// Clear all host and device data
   inline void clear() {
@@ -44,22 +45,24 @@ class PairGPUBalance {
     }
   }

+  /// Return the timestep since initialization
+  inline int timestep() { return _timestep; }
+
   /// Get a count of the number of particles host will handle for initial alloc
-  inline int first_host_count(const int nlocal,const bool gpu_nbor,
-                              const double gpu_split) const {
+  inline int first_host_count(const int nlocal, const double gpu_split,
+                              const bool gpu_nbor) const {
     int host_nlocal=0;
     if (gpu_nbor && gpu_split!=1.0) {
       if (gpu_split>0)
         host_nlocal=static_cast<int>(ceil((1.0-gpu_split)*nlocal));
       else
-        host_nlocal=static_cast<int>(ceil(0.1*nlocal));
+        host_nlocal=static_cast<int>(ceil(0.05*nlocal));
     }
     return host_nlocal;
   }

   /// Return the number of particles the device will handle this timestep
-  inline int get_gpu_count(const int timestep, const int ago,
-                           const int inum_full);
+  inline int get_gpu_count(const int ago, const int inum_full);

   /// Return the average fraction of particles handled by device on all procs
   inline double all_avg_split() {
@@ -82,10 +85,10 @@ class PairGPUBalance {
     if (_measure_this_step) {
       _device->gpu->sync();
       _device->gpu_barrier();
+      _device->start_host_timer();
       _device_time.start();
       _device->gpu->sync();
       _device->gpu_barrier();
-      _device->start_host_timer();
     }
   }

@@ -95,34 +98,34 @@ class PairGPUBalance {
   /// Calculate the new host/device split based on the cpu and device times
   /** \note Only does calculation every _HD_BALANCE_EVERY timesteps
       (and first 10) **/
-  inline void balance(const double cpu_time, const bool gpu_nbor);
+  inline void balance(const double cpu_time);

   /// Calls balance() and then get_gpu_count()
-  inline int balance(const int timestep, const int ago, const int inum_full,
-                     const double cpu_time, const bool gpu_nbor) {
-    balance(cpu_time,gpu_nbor);
-    return get_gpu_count(timestep,ago,inum_full);
+  inline int balance(const int ago,const int inum_full,const double cpu_time) {
+    balance(cpu_time);
+    return get_gpu_count(ago,inum_full);
   }

 private:
   PairGPUDevice<numtyp,acctyp> *_device;
   UCL_Timer _device_time;
-  bool _init_done;
+  bool _init_done, _gpu_nbor;

   bool _load_balance;
   double _actual_split, _avg_split, _desired_split, _max_split;
   int _avg_count;

   bool _measure_this_step;
-  int _inum, _inum_full;
+  int _inum, _inum_full, _timestep;
 };

 #define PairGPUBalanceT PairGPUBalance<numtyp,acctyp>

 template <class numtyp, class acctyp>
 void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
-                           const double split) {
+                           const bool gpu_nbor, const double split) {
   clear();
+  _gpu_nbor=gpu_nbor;
   _init_done=true;

   _device=gpu;
@@ -130,7 +133,7 @@ void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,

   if (split<0.0) {
     _load_balance=true;
-    _desired_split=0.9;
+    _desired_split=0.90;
   } else {
     _load_balance=false;
     _desired_split=split;
@@ -138,14 +141,14 @@ void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
   _actual_split=_desired_split;
   _avg_split=0.0;
   _avg_count=0;
+  _timestep=0;
 }

 template <class numtyp, class acctyp>
-int PairGPUBalanceT::get_gpu_count(const int timestep, const int ago,
-                                   const int inum_full) {
+int PairGPUBalanceT::get_gpu_count(const int ago, const int inum_full) {
   _measure_this_step=false;
   if (_load_balance) {
-    if (_avg_count<11 || timestep%_HD_BALANCE_EVERY==0) {
+    if (_avg_count<11 || _timestep%_HD_BALANCE_EVERY==0) {
       _measure_this_step=true;
       _inum_full=inum_full;
     }
@@ -156,44 +159,44 @@ int PairGPUBalanceT::get_gpu_count(const int timestep, const int ago,
   }
   _inum=static_cast<int>(floor(_actual_split*inum_full));
   if (_inum==0) _inum++;
+  _timestep++;
   return _inum;
 }

 template <class numtyp, class acctyp>
-void PairGPUBalanceT::balance(const double cpu_time, const bool gpu_nbor) {
+void PairGPUBalanceT::balance(const double cpu_time) {
   if (_measure_this_step) {
+    _measure_this_step=false;
+    double gpu_time=_device_time.seconds();
+
+    double max_gpu_time;
+    MPI_Allreduce(&gpu_time,&max_gpu_time,1,MPI_DOUBLE,MPI_MAX,
+                  _device->gpu_comm());
+
     if (_inum_full==_inum) {
       _desired_split=1.0;
       return;
     }

-    _measure_this_step=false;
-    double gpu_time=_device_time.seconds();
-
-    double cpu_gpu_time[3], max_times[3];
-    cpu_gpu_time[0]=cpu_time/(_inum_full-_inum);
-    cpu_gpu_time[1]=gpu_time/_inum;
-    cpu_gpu_time[2]=(_device->host_time()-cpu_time)/_inum_full;
-
-    MPI_Allreduce(cpu_gpu_time,max_times,3,MPI_DOUBLE,MPI_MAX,
-                  _device->gpu_comm());
-    double split=(max_times[0]+max_times[2])/(max_times[0]+max_times[1]);
-    split*=_HD_BALANCE_GAP;
-
-    if (split>1.0)
-      split=1.0;
-    if (_avg_count<10)
-      _desired_split=(_desired_split*_avg_count+split)/(_avg_count+1);
-    else
-      _desired_split=_desired_split*(1.0-_HD_BALANCE_WEIGHT)+
-                     _HD_BALANCE_WEIGHT*split;
-
-    if (!gpu_nbor) {
+    double cpu_time_per_atom=cpu_time/(_inum_full-_inum);
+    double cpu_other_time=_device->host_time()-cpu_time;
+    int host_inum=static_cast<int>((max_gpu_time-cpu_other_time)/
+                                   cpu_time_per_atom);
+
+    double split=static_cast<double>(_inum_full-host_inum)/_inum_full;
+    _desired_split=split*_HD_BALANCE_GAP;
+    if (_desired_split>1.0)
+      _desired_split=1.0;
+    if (_desired_split<0.0)
+      _desired_split=0.0;
+
+    if (!_gpu_nbor) {
       if (_desired_split<_max_split)
         _actual_split=_desired_split;
       else
         _actual_split=_max_split;
     }
+    //std::cout << gpu_time << " " << max_gpu_time << " " << cpu_other_time << " " << cpu_time_per_atom << " " << cpu_time << " " << _desired_split << " " << host_inum << std::endl;
   }
   _avg_split+=_desired_split;
   _avg_count++;
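
The rewritten balance() replaces the old weighted-average heuristic with a direct estimate: take the slowest device time across procs, subtract the host's non-pair work, and hand the host exactly as many atoms as it can finish in that window, then pad the device share by _HD_BALANCE_GAP. The arithmetic, extracted into a free function for clarity (new_split is a hypothetical name used only for this sketch):

#include <cmath>

double new_split(double max_gpu_time, double cpu_time, double host_time,
                 int inum_full, int inum_gpu, double gap = 1.10) {
  // Per-atom cost of the pair computation on the host side
  double cpu_time_per_atom = cpu_time / (inum_full - inum_gpu);
  // Host work that overlaps the device regardless of the split
  double cpu_other_time = host_time - cpu_time;
  // Atoms the host can finish while the device is busy
  int host_inum = static_cast<int>((max_gpu_time - cpu_other_time) /
                                   cpu_time_per_atom);
  double split = static_cast<double>(inum_full - host_inum) / inum_full;
  split *= gap;                      // bias toward the device, as above
  return std::fmin(1.0, std::fmax(0.0, split));
}
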
|
|||||||
@ -18,7 +18,7 @@
|
|||||||
|
|
||||||
#ifdef NV_KERNEL
|
#ifdef NV_KERNEL
|
||||||
|
|
||||||
#include "geryon/ucl_nv_kernel.h"
|
#include "nv_kernel_def.h"
|
||||||
texture<float4> neigh_tex;
|
texture<float4> neigh_tex;
|
||||||
|
|
||||||
#ifdef _DOUBLE_DOUBLE
|
#ifdef _DOUBLE_DOUBLE
|
||||||
@ -36,6 +36,7 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
|
|||||||
#else
|
#else
|
||||||
|
|
||||||
#define fetch_pos(i,y) x_[i]
|
#define fetch_pos(i,y) x_[i]
|
||||||
|
#define BLOCK_NBOR_BUILD 64
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -54,29 +55,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
|
|||||||
#define numtyp4 float4
|
#define numtyp4 float4
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define CELL_BLOCK_SIZE 64
|
#define BLOCK_CELL_2D 8
|
||||||
#define BLOCK_2D 8
|
|
||||||
|
#define SBBITS 30
|
||||||
|
|
||||||
#define SBBITS 30
|
#define SBBITS 30
|
||||||
|
|
||||||
__kernel void transpose(int *out, int *in, int columns_in, int rows_in)
|
__kernel void transpose(int *out, int *in, int columns_in, int rows_in)
|
||||||
{
|
{
|
||||||
__local float block[BLOCK_2D][BLOCK_2D+1];
|
__local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
|
||||||
|
|
||||||
unsigned ti=THREAD_ID_X;
|
unsigned ti=THREAD_ID_X;
|
||||||
unsigned tj=THREAD_ID_Y;
|
unsigned tj=THREAD_ID_Y;
|
||||||
unsigned bi=BLOCK_ID_X;
|
unsigned bi=BLOCK_ID_X;
|
||||||
unsigned bj=BLOCK_ID_Y;
|
unsigned bj=BLOCK_ID_Y;
|
||||||
|
|
||||||
unsigned i=bi*BLOCK_2D+ti;
|
unsigned i=bi*BLOCK_CELL_2D+ti;
|
||||||
unsigned j=bj*BLOCK_2D+tj;
|
unsigned j=bj*BLOCK_CELL_2D+tj;
|
||||||
if ((i<columns_in) && (j<rows_in))
|
if ((i<columns_in) && (j<rows_in))
|
||||||
block[tj][ti]=in[j*columns_in+i];
|
block[tj][ti]=in[j*columns_in+i];
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
i=bj*BLOCK_2D+ti;
|
i=bj*BLOCK_CELL_2D+ti;
|
||||||
j=bi*BLOCK_2D+tj;
|
j=bi*BLOCK_CELL_2D+tj;
|
||||||
if ((i<rows_in) && (j<columns_in))
|
if ((i<rows_in) && (j<columns_in))
|
||||||
out[j*rows_in+i] = block[ti][tj];
|
out[j*rows_in+i] = block[ti][tj];
|
||||||
}
|
}
|
||||||
@@ -142,6 +144,7 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
                                    int *cell_counts,
                                    int *nbor_list,
                                    int *host_nbor_list,
+                                   int *host_numj,
                                    int neigh_bin_size,
                                    numtyp cell_size,
                                    int ncellx, int ncelly, int ncellz,
@@ -154,8 +157,8 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
 
   int icell = ix + iy*ncellx + iz*ncellx*ncelly;
 
-  __shared__ int cell_list_sh[CELL_BLOCK_SIZE];
-  __shared__ numtyp4 pos_sh[CELL_BLOCK_SIZE];
+  __shared__ int cell_list_sh[BLOCK_NBOR_BUILD];
+  __shared__ numtyp4 pos_sh[BLOCK_NBOR_BUILD];
 
   int icell_begin = cell_counts[icell];
   int icell_end = cell_counts[icell+1];
@@ -185,9 +188,9 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
     neigh_list=neigh_counts+stride;
     nbor_list[pid_i]=pid_i;
   } else {
-    stride=nt-inum;
-    neigh_counts=host_nbor_list+pid_i-inum;
-    neigh_list=neigh_counts+stride;
+    stride=1;
+    neigh_counts=host_numj+pid_i-inum;
+    neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
   }
 
   // loop through neighbors
@@ -203,13 +206,13 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
         int num_atom_cell = jcell_end - jcell_begin;
 
         // load jcell to shared memory
-        int num_iter = (int)ceil((numtyp)num_atom_cell/CELL_BLOCK_SIZE);
+        int num_iter = (int)ceil((numtyp)num_atom_cell/BLOCK_NBOR_BUILD);
 
         for (int k = 0; k < num_iter; k++) {
-          int end_idx = min(CELL_BLOCK_SIZE, num_atom_cell-k*CELL_BLOCK_SIZE);
+          int end_idx = min(BLOCK_NBOR_BUILD, num_atom_cell-k*BLOCK_NBOR_BUILD);
 
           if (tid < end_idx) {
-            pid_j = cell_particle_id[tid+k*CELL_BLOCK_SIZE+jcell_begin];
+            pid_j = cell_particle_id[tid+k*BLOCK_NBOR_BUILD+jcell_begin];
             cell_list_sh[tid] = pid_j;
             atom_j = fetch_pos(pid_j,pos); //[pid_j];
             pos_sh[tid].x = atom_j.x;
@@ -222,7 +225,6 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
 
           for (int j = 0; j < end_idx; j++) {
             int pid_j = cell_list_sh[j]; // gather from shared memory
-            if (pid_i<inum || pid_j<inum || pid_j>pid_i) {
             diff.x = atom_i.x - pos_sh[j].x;
             diff.y = atom_i.y - pos_sh[j].y;
             diff.z = atom_i.z - pos_sh[j].z;
@@ -236,7 +238,6 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
                 cnt++;
             }
           }
-          }
         }
         __syncthreads();
       } // for (k)
@@ -249,9 +250,10 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
 }
 
 __kernel void kernel_special(__global int *dev_nbor,
-                             __global int *host_nbor_list, __global int *tag,
+                             __global int *host_nbor_list,
+                             __global int *host_numj, __global int *tag,
                              __global int *nspecial, __global int *special,
-                             int inum, int nt, int nall) {
+                             int inum, int nt, int nall, int max_nbors) {
   // ii indexes the two interacting particles in gi
   int ii=GLOBAL_ID_X;
 
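The calc_neigh_list_cell changes above split the output layout in two: neighbors of device-owned atoms stay column-major (stride inum, so consecutive threads write consecutive words), while neighbors of host-owned atoms now go into one dense row per atom, with the counts in the separate host_numj array. The index arithmetic, as two hedged helpers with illustrative names:

    // Where neighbor k of a given atom lives under each layout.
    inline int dev_nbor_idx(int atom, int k, int inum) {
      return atom + k*inum;                // column-major, coalesced on the GPU
    }
    inline int host_nbor_idx(int atom, int k, int inum, int max_nbors) {
      return (atom - inum)*max_nbors + k;  // dense row, easy to copy home
    }

The dense rows are what make the host-side jlist pointers added later in this commit possible.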
@@ -263,15 +265,17 @@ __kernel void kernel_special(__global int *dev_nbor,
     int n2=nspecial[ii*3+1];
     int n3=nspecial[ii*3+2];
 
+    int numj;
     if (ii < inum) {
       stride=inum;
       list=dev_nbor+stride+ii;
-    } else {
-      stride=nt-inum;
-      list=host_nbor_list+ii-inum;
-    }
-    int numj=*list;
+      numj=*list;
       list+=stride;
+    } else {
+      stride=1;
+      list=host_nbor_list+(ii-inum)*max_nbors;
+      numj=host_numj[ii-inum];
+    }
     list_end=list+numj*stride;
 
     for ( ; list<list_end; list+=stride) {
@@ -294,4 +298,3 @@ __kernel void kernel_special(__global int *dev_nbor,
   }
   } // if ii
 }
-
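kernel_special above now walks either layout with one loop by carrying a per-atom (start, stride, count) triple: stride inum through dev_nbor, stride 1 through the dense host rows. A host-side sketch of that traversal, with assumed array shapes and a counting body standing in for the real tag masking:

    #include <vector>

    int count_neighbors(const std::vector<int> &dev_nbor,
                        const std::vector<int> &host_nbor,
                        const std::vector<int> &host_numj,
                        int ii, int inum, int max_nbors) {
      int stride, numj;
      const int *list;
      if (ii < inum) {                  // device-built, column-major
        stride = inum;
        list = &dev_nbor[inum + ii];    // counts row precedes neighbor rows
        numj = *list;
        list += stride;
      } else {                          // host-bound, one dense row per atom
        stride = 1;
        list = &host_nbor[(ii - inum)*max_nbors];
        numj = host_numj[ii - inum];
      }
      int cnt = 0;
      for (int k = 0; k < numj; ++k, list += stride)
        ++cnt;                          // stand-in for the special masking
      return cnt;
    }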
@@ -19,13 +19,22 @@
 #include "pair_gpu_precision.h"
 #include <map>
 #include <math.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef USE_OPENCL
+#include "pair_gpu_dev_cl.h"
+#else
+#include "pair_gpu_dev_ptx.h"
+#endif
 
 #define PairGPUDeviceT PairGPUDevice<numtyp, acctyp>
 
 template <class numtyp, class acctyp>
 PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false),
                                   _gpu_mode(GPU_FORCE), _first_device(0),
-                                  _last_device(0) {
+                                  _last_device(0), _compiled(false) {
 }
 
 template <class numtyp, class acctyp>
@@ -34,14 +43,19 @@ PairGPUDeviceT::~PairGPUDevice() {
 }
 
 template <class numtyp, class acctyp>
-bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
+int PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
                                  const int first_gpu, const int last_gpu,
                                  const int gpu_mode, const double p_split,
-                                 const int nthreads) {
+                                 const int nthreads, const int t_per_atom) {
   _nthreads=nthreads;
+#ifdef _OPENMP
+  omp_set_num_threads(nthreads);
+#endif
+  _threads_per_atom=t_per_atom;
+  _threads_per_charge=t_per_atom;
 
   if (_device_init)
-    return true;
+    return 0;
   _device_init=true;
   _comm_world=world;
   _comm_replica=replica;
@@ -96,7 +110,12 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
   // set the device ID
   _procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/
                                        (last_gpu-first_gpu+1)));
-  int my_gpu=node_rank/_procs_per_gpu;
+  int my_gpu=node_rank/_procs_per_gpu+first_gpu;
+
+  // Time on the device only if 1 proc per gpu
+  _time_device=true;
+  if (_procs_per_gpu>1)
+    _time_device=false;
 
   // Set up a per device communicator
   MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
@@ -104,39 +123,109 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
 
   gpu=new UCL_Device();
   if (my_gpu>=gpu->num_devices())
-    return false;
+    return -2;
 
   gpu->set(my_gpu);
-  return true;
+
+  _long_range_precompute=0;
+
+  int flag=compile_kernels();
+
+  return flag;
 }
 
 template <class numtyp, class acctyp>
-bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal,
+int PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
+                         const bool rot, const int nlocal,
                          const int host_nlocal, const int nall,
-                         const int maxspecial, const bool gpu_nbor,
+                         PairGPUNbor *nbor, const int maxspecial,
                          const int gpu_host, const int max_nbors,
                          const double cell_size, const bool pre_cut) {
   if (!_device_init)
-    return false;
-  if (_init_count==0) {
-    // Initialize atom and nbor data
+    return -1;
+  if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
+    return -5;
 
+  // Counts of data transfers for timing overhead estimates
+  _data_in_estimate=0;
+  _data_out_estimate=1;
+
+  // Initial number of local particles
   int ef_nlocal=nlocal;
   if (_particle_split<1.0 && _particle_split>0.0)
     ef_nlocal=static_cast<int>(_particle_split*nlocal);
-  if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor,
-                 gpu_nbor && maxspecial>0))
-    return false;
-  if (!nbor.init(ef_nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor,
-                 gpu_host,pre_cut))
-    return false;
-  nbor.cell_size(cell_size);
+
+  bool gpu_nbor=false;
+  if (_gpu_mode==GPU_NEIGH)
+    gpu_nbor=true;
+
+  if (_init_count==0) {
+    // Initialize atom and nbor data
+    if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0))
+      return -3;
+
+    _data_in_estimate++;
+    if (charge)
+      _data_in_estimate++;
+    if (rot)
+      _data_in_estimate++;
   } else {
-    if (cell_size>nbor.cell_size())
-      nbor.cell_size(cell_size);
+    if (atom.charge()==false && charge)
+      _data_in_estimate++;
+    if (atom.quat()==false && rot)
+      _data_in_estimate++;
+    if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor && maxspecial))
+      return -3;
   }
 
+  if (!ans.init(ef_nlocal,charge,rot,*gpu))
+    return -3;
+
+  if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
+                  *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
+                  _block_cell_id, _block_nbor_build))
+    return -3;
+  nbor->cell_size(cell_size);
+
   _init_count++;
-  return true;
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+int PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal,
+                         const int nall) {
+  if (!_device_init)
+    return -1;
+  if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
+    return -5;
+
+  if (_init_count==0) {
+    // Initialize atom and nbor data
+    if (!atom.init(nall,true,false,*gpu,false,false))
+      return -3;
+  } else
+    if (!atom.add_fields(true,false,false,false))
+      return -3;
+
+  if (!ans.init(nlocal,true,false,*gpu))
+    return -3;
+
+  _init_count++;
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void PairGPUDeviceT::set_single_precompute
+    (PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm) {
+  _long_range_precompute=1;
+  pppm_single=pppm;
+}
+
+template <class numtyp, class acctyp>
+void PairGPUDeviceT::set_double_precompute
+    (PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm) {
+  _long_range_precompute=2;
+  pppm_double=pppm;
 }
 
 template <class numtyp, class acctyp>
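init_device and the init overloads now return integer status codes instead of a bool; the doc comments added to pair_gpu_device.h further down spell out the meanings. Collected in one place as a hypothetical caller-side helper (not part of the library):

    const char *gpu_init_error(int flag) {
      switch (flag) {
        case  0: return "success";
        case -1: return "fix gpu not found";
        case -2: return "GPU not found";
        case -3: return "out of memory on device";
        case -4: return "GPU library not compiled for this GPU";
        case -5: return "double precision not supported on this card";
        default: return "unknown error";
      }
    }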
@@ -152,11 +241,17 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name,
   fprintf(screen,"\n-------------------------------------");
   fprintf(screen,"-------------------------------------\n");
   fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
-  fprintf(screen,"- with %d procs per device.\n",_procs_per_gpu);
+  fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu);
+#ifdef _OPENMP
+  fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
+#endif
   fprintf(screen,"-------------------------------------");
   fprintf(screen,"-------------------------------------\n");
 
-  for (int i=first_gpu; i<=last_gpu; i++) {
+  int last=last_gpu+1;
+  if (last>gpu->num_devices())
+    last=gpu->num_devices();
+  for (int i=first_gpu; i<last; i++) {
     std::string sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
                       toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+
                       " GHZ (";
@@ -177,32 +272,152 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name,
 }
 
 template <class numtyp, class acctyp>
-void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
-                                  const double max_bytes, FILE *screen) {
-  double single[5], times[5];
+void PairGPUDeviceT::estimate_gpu_overhead(const int kernel_calls,
+                                           double &gpu_overhead,
+                                           double &gpu_driver_overhead) {
+  UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL;
+  UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL;
+  UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL;
+  UCL_Timer over_timer(*gpu);
 
-  single[0]=atom.transfer_time();
+  if (_data_in_estimate>0) {
+    host_data_in=new UCL_H_Vec<int>[_data_in_estimate];
+    dev_data_in=new UCL_D_Vec<int>[_data_in_estimate];
+    timers_in=new UCL_Timer[_data_in_estimate];
+  }
+
+  if (_data_out_estimate>0) {
+    host_data_out=new UCL_H_Vec<int>[_data_out_estimate];
+    dev_data_out=new UCL_D_Vec<int>[_data_out_estimate];
+    timers_out=new UCL_Timer[_data_out_estimate];
+  }
+
+  if (kernel_calls>0) {
+    kernel_data=new UCL_D_Vec<int>[kernel_calls];
+    timers_kernel=new UCL_Timer[kernel_calls];
+  }
+
+  for (int i=0; i<_data_in_estimate; i++) {
+    host_data_in[i].alloc(1,*gpu);
+    dev_data_in[i].alloc(1,*gpu);
+    timers_in[i].init(*gpu);
+  }
+
+  for (int i=0; i<_data_out_estimate; i++) {
+    host_data_out[i].alloc(1,*gpu);
+    dev_data_out[i].alloc(1,*gpu);
+    timers_out[i].init(*gpu);
+  }
+
+  for (int i=0; i<kernel_calls; i++) {
+    kernel_data[i].alloc(1,*gpu);
+    timers_kernel[i].init(*gpu);
+  }
+
+  gpu_overhead=0.0;
+  gpu_driver_overhead=0.0;
+
+  for (int i=0; i<10; i++) {
+    gpu->sync();
+    gpu_barrier();
+    over_timer.start();
+    gpu->sync();
+    gpu_barrier();
+
+    double driver_time=MPI_Wtime();
+    for (int i=0; i<_data_in_estimate; i++) {
+      timers_in[i].start();
+      ucl_copy(dev_data_in[i],host_data_in[i],true);
+      timers_in[i].stop();
+    }
+
+    for (int i=0; i<kernel_calls; i++) {
+      timers_kernel[i].start();
+      zero(kernel_data[i],1);
+      timers_kernel[i].stop();
+    }
+
+    for (int i=0; i<_data_out_estimate; i++) {
+      timers_out[i].start();
+      ucl_copy(host_data_out[i],dev_data_out[i],true);
+      timers_out[i].stop();
+    }
+    over_timer.stop();
+
+    double time=over_timer.seconds();
+    driver_time=MPI_Wtime()-driver_time;
+
+    if (time_device()) {
+      for (int i=0; i<_data_in_estimate; i++)
+        timers_in[i].add_to_total();
+      for (int i=0; i<kernel_calls; i++)
+        timers_kernel[i].add_to_total();
+      for (int i=0; i<_data_out_estimate; i++)
+        timers_out[i].add_to_total();
+    }
+
+    double mpi_time, mpi_driver_time;
+    MPI_Allreduce(&time,&mpi_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
+    MPI_Allreduce(&driver_time,&mpi_driver_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
+    gpu_overhead+=mpi_time;
+    gpu_driver_overhead+=mpi_driver_time;
+  }
+  gpu_overhead/=10.0;
+  gpu_driver_overhead/=10.0;
+
+  if (_data_in_estimate>0) {
+    delete [] host_data_in;
+    delete [] dev_data_in;
+    delete [] timers_in;
+  }
+
+  if (_data_out_estimate>0) {
+    delete [] host_data_out;
+    delete [] dev_data_out;
+    delete [] timers_out;
+  }
+
+  if (kernel_calls>0) {
+    delete [] kernel_data;
+    delete [] timers_kernel;
+  }
+}
+
+template <class numtyp, class acctyp>
+void PairGPUDeviceT::output_times(UCL_Timer &time_pair,
+                                  PairGPUAns<numtyp,acctyp> &ans,
+                                  PairGPUNbor &nbor, const double avg_split,
+                                  const double max_bytes,
+                                  const double gpu_overhead,
+                                  const double driver_overhead,
+                                  const int threads_per_atom, FILE *screen) {
+  double single[8], times[8];
+
+  single[0]=atom.transfer_time()+ans.transfer_time();
   single[1]=nbor.time_nbor.total_seconds();
   single[2]=nbor.time_kernel.total_seconds();
   single[3]=time_pair.total_seconds();
-  single[4]=atom.cast_time();
+  single[4]=atom.cast_time()+ans.cast_time();
+  single[5]=gpu_overhead;
+  single[6]=driver_overhead;
+  single[7]=ans.cpu_idle_time();
 
-  MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
+  MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
 
-  double my_max_bytes=max_bytes;
+  double my_max_bytes=max_bytes+atom.max_gpu_bytes();
   double mpi_max_bytes;
   MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
   double max_mb=mpi_max_bytes/(1024.0*1024.0);
 
   if (replica_me()==0)
-    if (screen && times[3]>0.0) {
+    if (screen && times[5]>0.0) {
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
       fprintf(screen," GPU Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
 
-      if (procs_per_gpu()==1) {
+      if (time_device()) {
        fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size);
        fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size);
        fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size);
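estimate_gpu_overhead above times an essentially empty round trip: one-element host-to-device copies for each expected input, a zeroing kernel per expected kernel call, and one-element copies back, repeated ten times, with the per-trial maximum taken across the procs sharing the device and the result averaged. The measurement skeleton, with the actual transfers elided:

    #include <mpi.h>

    // Sketch of the overhead-estimation loop; the dummy work is elided.
    double estimate_overhead(MPI_Comm gpu_comm, int trials) {
      double total = 0.0;
      for (int t = 0; t < trials; ++t) {
        MPI_Barrier(gpu_comm);             // start everyone together
        double t0 = MPI_Wtime();
        // ... enqueue the tiny in-copies, kernels, and out-copies here ...
        double dt = MPI_Wtime() - t0, dt_max;
        MPI_Allreduce(&dt, &dt_max, 1, MPI_DOUBLE, MPI_MAX, gpu_comm);
        total += dt_max;                   // slowest proc sets the overhead
      }
      return total/trials;
    }

Taking the maximum rather than the mean is the point: with several procs per device, the slowest caller determines the per-step cost.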
@@ -212,7 +427,71 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
       fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
       fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size);
      }
+     fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size);
      fprintf(screen,"Average split: %.4f.\n",avg_split);
+     fprintf(screen,"Threads / atom: %d.\n",threads_per_atom);
+     fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
+     fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size);
+     fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size);
+
+     fprintf(screen,"-------------------------------------");
+     fprintf(screen,"--------------------------------\n\n");
+   }
+}
+
+template <class numtyp, class acctyp>
+void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
+                                         UCL_Timer &time_out,
+                                         UCL_Timer &time_map,
+                                         UCL_Timer &time_rho,
+                                         UCL_Timer &time_interp,
+                                         PairGPUAns<numtyp,acctyp> &ans,
+                                         const double max_bytes,
+                                         const double cpu_time,
+                                         const double idle_time, FILE *screen) {
+  double single[8], times[8];
+
+  single[0]=time_out.total_seconds();
+  single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time();
+  single[2]=time_map.total_seconds();
+  single[3]=time_rho.total_seconds();
+  single[4]=time_interp.total_seconds();
+  single[5]=ans.transfer_time()+ans.cast_time();
+  single[6]=cpu_time;
+  single[7]=idle_time;
+
+  MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
+
+  double my_max_bytes=max_bytes+atom.max_gpu_bytes();
+  double mpi_max_bytes;
+  MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
+  double max_mb=mpi_max_bytes/(1024.0*1024.0);
+
+  if (replica_me()==0)
+    if (screen && times[6]>0.0) {
+      fprintf(screen,"\n\n-------------------------------------");
+      fprintf(screen,"--------------------------------\n");
+      fprintf(screen," GPU Time Info (average): ");
+      fprintf(screen,"\n-------------------------------------");
+      fprintf(screen,"--------------------------------\n");
+
+      if (time_device()) {
+        fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size);
+        fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size);
+        fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size);
+        fprintf(screen,"Kernel (rho): %.4f s.\n",times[3]/_replica_size);
+        fprintf(screen,"Force interp: %.4f s.\n",times[4]/_replica_size);
+        fprintf(screen,"Total rho: %.4f s.\n",
+                (times[0]+times[2]+times[3])/_replica_size);
+        fprintf(screen,"Total interp: %.4f s.\n",
+                (times[1]+times[4])/_replica_size);
+        fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size);
+        fprintf(screen,"Total: %.4f s.\n",
+                (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/
+                _replica_size);
+      }
+      fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size);
+      fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size);
       fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
 
       fprintf(screen,"-------------------------------------");
@@ -223,10 +502,17 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
 template <class numtyp, class acctyp>
 void PairGPUDeviceT::clear() {
   if (_init_count>0) {
+    _long_range_precompute=0;
     _init_count--;
     if (_init_count==0) {
       atom.clear();
-      nbor.clear();
+      _nbor_shared.clear();
+      if (_compiled) {
+        k_zero.clear();
+        k_info.clear();
+        delete dev_program;
+        _compiled=false;
+      }
     }
   }
 }
@@ -241,21 +527,80 @@ void PairGPUDeviceT::clear_device() {
   }
 }
 
+template <class numtyp, class acctyp>
+int PairGPUDeviceT::compile_kernels() {
+  int flag=0;
+
+  if (_compiled)
+    return flag;
+
+  std::string flags="-cl-mad-enable";
+  dev_program=new UCL_Program(*gpu);
+  int success=dev_program->load_string(pair_gpu_dev_kernel,flags.c_str());
+  if (success!=UCL_SUCCESS)
+    return -4;
+  k_zero.set_function(*dev_program,"kernel_zero");
+  k_info.set_function(*dev_program,"kernel_info");
+  _compiled=true;
+
+  UCL_H_Vec<int> h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
+  UCL_D_Vec<int> d_gpu_lib_data(14,*gpu);
+  k_info.set_size(1,1);
+  k_info.run(&d_gpu_lib_data.begin());
+  ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false);
+
+#ifndef USE_OPENCL
+  if (static_cast<double>(h_gpu_lib_data[0])/100.0>gpu->arch())
+    return -4;
+#endif
+
+  _num_mem_threads=h_gpu_lib_data[1];
+  _warp_size=h_gpu_lib_data[2];
+  if (_threads_per_atom<1)
+    _threads_per_atom=h_gpu_lib_data[3];
+  if (_threads_per_charge<1)
+    _threads_per_charge=h_gpu_lib_data[13];
+  _pppm_max_spline=h_gpu_lib_data[4];
+  _pppm_block=h_gpu_lib_data[5];
+  _block_pair=h_gpu_lib_data[6];
+  _max_shared_types=h_gpu_lib_data[7];
+  _block_cell_2d=h_gpu_lib_data[8];
+  _block_cell_id=h_gpu_lib_data[9];
+  _block_nbor_build=h_gpu_lib_data[10];
+  _block_bio_pair=h_gpu_lib_data[11];
+  _max_bio_shared_types=h_gpu_lib_data[12];
+
+  if (static_cast<size_t>(_block_pair)>gpu->group_size())
+    _block_pair=gpu->group_size();
+  if (static_cast<size_t>(_block_bio_pair)>gpu->group_size())
+    _block_bio_pair=gpu->group_size();
+  if (_threads_per_atom>_warp_size)
+    _threads_per_atom=_warp_size;
+  if (_warp_size%_threads_per_atom!=0)
+    _threads_per_atom=1;
+  if (_threads_per_charge>_warp_size)
+    _threads_per_charge=_warp_size;
+  if (_warp_size%_threads_per_charge!=0)
+    _threads_per_charge=1;
+
+  return flag;
+}
+
 template <class numtyp, class acctyp>
 double PairGPUDeviceT::host_memory_usage() const {
-  return atom.host_memory_usage()+
-         nbor.host_memory_usage()+4*sizeof(numtyp)+
+  return atom.host_memory_usage()+4*sizeof(numtyp)+
         sizeof(PairGPUDevice<numtyp,acctyp>);
 }
 
 template class PairGPUDevice<PRECISION,ACC_PRECISION>;
 PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
 
-bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
+int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                     const int last_gpu, const int gpu_mode,
-                    const double particle_split, const int nthreads) {
+                    const double particle_split, const int nthreads,
+                    const int t_per_atom) {
  return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
-                                     particle_split,nthreads);
+                                     particle_split,nthreads,t_per_atom);
 }
 
 void lmp_clear_device() {
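compile_kernels above queries the library's build-time tuning constants by running kernel_info once and copying fourteen integers back, then clamps them against the actual hardware. The clamping rules in isolation, as a sketch:

    #include <cstddef>

    // Block sizes may not exceed the device work-group limit, and
    // threads-per-atom must evenly divide the warp size or fall back to 1.
    void clamp_tuning(int &threads_per_atom, int &block_pair,
                      int warp_size, std::size_t group_size) {
      if (static_cast<std::size_t>(block_pair) > group_size)
        block_pair = static_cast<int>(group_size);
      if (threads_per_atom > warp_size)
        threads_per_atom = warp_size;
      if (warp_size % threads_per_atom != 0)
        threads_per_atom = 1;
    }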
@@ -264,14 +609,5 @@ void lmp_clear_device() {
 
 double lmp_gpu_forces(double **f, double **tor, double *eatom,
                       double **vatom, double *virial, double &ecoul) {
-  if (pair_gpu_device.init_count()) {
-    pair_gpu_device.stop_host_timer();
-    pair_gpu_device.gpu->sync();
-    double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul);
-    pair_gpu_device.atom.get_answers(f,tor);
-
-    return evdw;
-  }
-  return 0.0;
+  return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
 }
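lmp_gpu_forces above collapses to a single fix_gpu call; as the header hunks below show, fix_gpu drains a queue of per-style answer objects, summing their energies while each scatters its forces, which the old single-answer path could not do. The queue pattern reduced to a sketch, with a stand-in answer type:

    #include <queue>

    struct Ans { double get_answers() { return 0.0; } };  // stand-in type

    double drain(std::queue<Ans*> &q) {
      double evdw = 0.0;
      while (!q.empty()) {
        evdw += q.front()->get_answers();  // each style adds its piece
        q.pop();
      }
      return evdw;
    }

The queue is what lets several GPU pair styles coexist in one run on a single shared device object.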
@@ -19,11 +19,17 @@
 #define PAIR_GPU_DEVICE_H
 
 #include "pair_gpu_atom.h"
+#include "pair_gpu_ans.h"
 #include "pair_gpu_nbor.h"
+#include "pppm_gpu_memory.h"
 #include "mpi.h"
 #include <sstream>
 #include "stdio.h"
 #include <string>
+#include <queue>
+
+template <class numtyp, class acctyp,
+          class grdtyp, class grdtyp4> class PPPMGPUMemory;
 
 template <class numtyp, class acctyp>
 class PairGPUDevice {
@@ -33,10 +39,15 @@ class PairGPUDevice {
 
   /// Initialize the device for use by this process
   /** Sets up a per-device MPI communicator for load balancing and initializes
-    * the device (>=first_gpu and <=last_gpu) that this proc will be using **/
-  bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
+    * the device (>=first_gpu and <=last_gpu) that this proc will be using
+    * Returns:
+    * -  0 if successful
+    * - -2 if GPU not found
+    * - -4 if GPU library not compiled for GPU **/
+  int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                   const int last_gpu, const int gpu_mode,
-                  const double particle_split, const int nthreads);
+                  const double particle_split, const int nthreads,
+                  const int t_per_atom);
 
   /// Initialize the device for Atom and Neighbor storage
   /** \param rot True if quaternions need to be stored
@@ -50,19 +61,67 @@ class PairGPUDevice {
    * \param max_nbors Initial number of rows in the neighbor matrix
    * \param cell_size cutoff+skin
    * \param pre_cut True if cutoff test will be performed in separate kernel
-    *               than the force kernel **/
-  bool init(const bool charge, const bool rot, const int nlocal,
-            const int host_nlocal, const int nall, const int maxspecial,
-            const bool gpu_nbor, const int gpu_host, const int max_nbors,
-            const double cell_size, const bool pre_cut);
+    *               than the force kernel
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 if double precision is not supported on card **/
+  int init(PairGPUAns<numtyp,acctyp> &a, const bool charge, const bool rot,
+           const int nlocal, const int host_nlocal, const int nall,
+           PairGPUNbor *nbor, const int maxspecial, const int gpu_host,
+           const int max_nbors, const double cell_size, const bool pre_cut);
+
+  /// Initialize the device for Atom storage only
+  /** \param nlocal Total number of local particles to allocate memory for
+    * \param nall Total number of local+ghost particles
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 if double precision is not supported on card **/
+  int init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal, const int nall);
 
   /// Output a message for pair_style acceleration with device stats
   void init_message(FILE *screen, const char *name,
                     const int first_gpu, const int last_gpu);
 
+  /// Perform charge assignment asynchronously for PPPM
+  void set_single_precompute(PPPMGPUMemory<numtyp,acctyp,
+                             float,_lgpu_float4> *pppm);
+
+  /// Perform charge assignment asynchronously for PPPM
+  void set_double_precompute(PPPMGPUMemory<numtyp,acctyp,
+                             double,_lgpu_double4> *pppm);
+
+  /// Estimate the overhead from GPU calls from multiple procs
+  /** \param kernel_calls Number of kernel calls/timestep for timing estimated
+    *                     overhead
+    * \param gpu_overhead Estimated gpu overhead per timestep (sec)
+    * \param driver_overhead Estimated overhead from driver per timestep (s) **/
+  void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead,
+                             double &gpu_driver_overhead);
+
+  /// Returns true if double precision is supported on card
+  inline bool double_precision() { return gpu->double_precision(); }
+
   /// Output a message with timing information
-  void output_times(UCL_Timer &time_pair, const double avg_split,
-                    const double max_bytes, FILE *screen);
+  void output_times(UCL_Timer &time_pair, PairGPUAns<numtyp,acctyp> &ans,
+                    PairGPUNbor &nbor, const double avg_split,
+                    const double max_bytes, const double gpu_overhead,
+                    const double driver_overhead,
+                    const int threads_per_atom, FILE *screen);
+
+  /// Output a message with timing information
+  void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
+                           UCL_Timer &time_map, UCL_Timer &time_rho,
+                           UCL_Timer &time_interp,
+                           PairGPUAns<numtyp,acctyp> &ans,
+                           const double max_bytes, const double cpu_time,
+                           const double cpu_idle_time, FILE *screen);
 
   /// Clear all memory on host and device associated with atom and nbor data
   void clear();
@@ -70,11 +129,37 @@ class PairGPUDevice {
   /// Clear all memory on host and device
   void clear_device();
 
+  /// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
+  inline void add_ans_object(PairGPUAns<numtyp,acctyp> *ans)
+    { ans_queue.push(ans); }
+
+  /// Add "answers" (force,energies,etc.) into LAMMPS structures
+  inline double fix_gpu(double **f, double **tor, double *eatom,
+                        double **vatom, double *virial, double &ecoul) {
+    atom.data_unavail();
+    if (ans_queue.empty()==false) {
+      stop_host_timer();
+      double evdw=0.0;
+      while (ans_queue.empty()==false) {
+        evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
+        ans_queue.pop();
+      }
+      return evdw;
+    }
+    return 0.0;
+  }
+
   /// Start timer on host
-  inline void start_host_timer() { _cpu_full=MPI_Wtime(); }
+  inline void start_host_timer()
+    { _cpu_full=MPI_Wtime(); _host_timer_started=true; }
 
   /// Stop timer on host
-  inline void stop_host_timer() { _cpu_full=MPI_Wtime()-_cpu_full; }
+  inline void stop_host_timer() {
+    if (_host_timer_started) {
+      _cpu_full=MPI_Wtime()-_cpu_full;
+      _host_timer_started=false;
+    }
+  }
 
   /// Return host time
   inline double host_time() { return _cpu_full; }
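The guarded host timer above makes stop_host_timer a no-op unless a measurement is actually open, which matters once several styles share one device object and may each try to stop the clock. The guard as a free-standing sketch:

    #include <mpi.h>

    struct HostTimer {
      double cpu_full = 0.0;
      bool started = false;
      void start() { cpu_full = MPI_Wtime(); started = true; }
      void stop() {
        if (started) {                     // ignore stray stops
          cpu_full = MPI_Wtime() - cpu_full;
          started = false;
        }
      }
    };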
@@ -114,6 +199,42 @@ class PairGPUDevice {
   inline double particle_split() const { return _particle_split; }
   /// Return the initialization count for the device
   inline int init_count() const { return _init_count; }
+  /// True if device is being timed
+  inline bool time_device() const { return _time_device; }
+
+  /// Return the number of threads accessing memory simultaneously
+  inline int num_mem_threads() const { return _num_mem_threads; }
+  /// Return the number of threads per atom for pair styles
+  inline int threads_per_atom() const { return _threads_per_atom; }
+  /// Return the number of threads per atom for pair styles using charge
+  inline int threads_per_charge() const { return _threads_per_charge; }
+  /// Return the min of the pair block size or the device max block size
+  inline int pair_block_size() const { return _block_pair; }
+  /// Return the maximum number of atom types that can be used with shared mem
+  inline int max_shared_types() const { return _max_shared_types; }
+  /// Return the maximum order for PPPM splines
+  inline int pppm_max_spline() const { return _pppm_max_spline; }
+  /// Return the block size for PPPM kernels
+  inline int pppm_block() const { return _pppm_block; }
+  /// Return the block size for neighbor binning
+  inline int block_cell_2d() const { return _block_cell_2d; }
+  /// Return the block size for atom mapping for neighbor builds
+  inline int block_cell_id() const { return _block_cell_id; }
+  /// Return the block size for neighbor build kernel
+  inline int block_nbor_build() const { return _block_nbor_build; }
+  /// Return the block size for "bio" pair styles
+  inline int block_bio_pair() const { return _block_bio_pair; }
+  /// Return the maximum number of atom types for shared mem with "bio" styles
+  inline int max_bio_shared_types() const { return _max_bio_shared_types; }
+
+  // -------------------- SHARED DEVICE ROUTINES --------------------
+  // Perform asynchronous zero of integer array
+  void zero(UCL_D_Vec<int> &mem, const int numel) {
+    int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
+                                         _block_pair));
+    k_zero.set_size(num_blocks,_block_pair);
+    k_zero.run(&mem.begin(),&numel);
+  }
 
   // -------------------------- DEVICE DATA -------------------------
 
@@ -130,11 +251,30 @@ class PairGPUDevice {
   // --------------------------- NBOR DATA ----------------------------
 
   /// Neighbor Data
-  PairGPUNbor nbor;
+  PairGPUNborShared _nbor_shared;
+
+  // ------------------------ LONG RANGE DATA -------------------------
+
+  // Long Range Data
+  int _long_range_precompute;
+  PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm_single;
+  PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm_double;
+  /// Precomputations for long range charge assignment (asynchronously)
+  inline void precompute(const int ago, const int nlocal, const int nall,
+                         double **host_x, int *host_type, bool &success,
+                         double *charge, double *boxlo, double *prd) {
+    if (_long_range_precompute==1)
+      pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
+                              boxlo,prd);
+    else if (_long_range_precompute==2)
+      pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
+                              boxlo,prd);
+  }
 
  private:
+  std::queue<PairGPUAns<numtyp,acctyp> *> ans_queue;
   int _init_count;
-  bool _device_init;
+  bool _device_init, _host_timer_started, _time_device;
   MPI_Comm _comm_world, _comm_replica, _comm_gpu;
   int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
       _replica_size;
@@ -142,6 +282,19 @@ class PairGPUDevice {
   double _particle_split;
   double _cpu_full;
 
+  int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
+  int _pppm_max_spline, _pppm_block;
+  int _block_pair, _max_shared_types;
+  int _block_cell_2d, _block_cell_id, _block_nbor_build;
+  int _block_bio_pair, _max_bio_shared_types;
+
+  UCL_Program *dev_program;
+  UCL_Kernel k_zero, k_info;
+  bool _compiled;
+  int compile_kernels();
+
+  int _data_in_estimate, _data_out_estimate;
+
   template <class t>
   inline std::string toa(const t& in) {
     std::ostringstream o;
@@ -18,15 +18,9 @@
 
 #include "pair_gpu_precision.h"
 #include "pair_gpu_nbor.h"
+#include "pair_gpu_device.h"
 #include "math.h"
-
-#ifdef USE_OPENCL
-#include "pair_gpu_nbor_cl.h"
-#else
-#include "pair_gpu_nbor_ptx.h"
-#include "pair_gpu_build_ptx.h"
-#endif
 
 int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
   if (_gpu_nbor)
     return (max_nbors+2)*sizeof(int);
@@ -36,12 +30,18 @@ int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
     return (max_nbors+3)*sizeof(int);
 }
 
-bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors,
+bool PairGPUNbor::init(PairGPUNborShared *shared, const int inum,
+                       const int host_inum, const int max_nbors,
                        const int maxspecial, UCL_Device &devi,
                        const bool gpu_nbor, const int gpu_host,
-                       const bool pre_cut) {
+                       const bool pre_cut, const int block_cell_2d,
+                       const int block_cell_id, const int block_nbor_build) {
   clear();
 
+  _block_cell_2d=block_cell_2d;
+  _block_cell_id=block_cell_id;
+  _block_nbor_build=block_nbor_build;
+  _shared=shared;
   dev=&devi;
   _gpu_nbor=gpu_nbor;
   if (gpu_host==0)
@@ -80,8 +80,11 @@ bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors,
   success=success && (host_packed.alloc(2*IJ_SIZE,*dev,
                                         UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
   alloc(success);
+  if (!success)
+    return false;
+
   if (_use_packing==false)
-    compile_kernels(devi);
+    _shared->compile_kernels(devi,gpu_nbor);
 
   return success;
 }
@@ -89,13 +92,14 @@ bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors,
 void PairGPUNbor::alloc(bool &success) {
   dev_nbor.clear();
   host_acc.clear();
+  int nt=_max_atoms+_max_host;
   if (_use_packing==false || _gpu_nbor)
     success=success && (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev,
                                        UCL_READ_ONLY)==UCL_SUCCESS);
   else
     success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
                                        UCL_READ_ONLY)==UCL_SUCCESS);
-  success=success && (host_acc.alloc((_max_atoms+_max_host)*2,*dev,
+  success=success && (host_acc.alloc(nt*2,*dev,
                                      UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
 
   _c_bytes=dev_nbor.row_bytes();
@@ -108,11 +112,31 @@ void PairGPUNbor::alloc(bool &success) {
   if (_max_host>0) {
     host_nbor.clear();
     dev_host_nbor.clear();
-    success=success && (host_nbor.alloc((_max_nbors+1)*_max_host,*dev,
+    dev_host_numj.clear();
+    host_ilist.clear();
+    host_jlist.clear();
+
+    success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev,
                                         UCL_RW_OPTIMIZED)==UCL_SUCCESS);
-    success=success && (dev_host_nbor.alloc((_max_nbors+1)*_max_host,
+    success=success && (dev_host_nbor.alloc(_max_nbors*_max_host,
                                             *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
-    _c_bytes+=dev_host_nbor.row_bytes();
+    success=success && (dev_host_numj.alloc(_max_host,*dev,
+                                            UCL_WRITE_ONLY)==UCL_SUCCESS);
+    success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
+    if (!success)
+      return;
+    for (int i=0; i<nt; i++)
+      host_ilist[i]=i;
+    success=success && (host_jlist.alloc(_max_host,*dev,
+                                         UCL_NOT_PINNED)==UCL_SUCCESS);
+    if (!success)
+      return;
+    int *ptr=host_nbor.begin();
+    for (int i=0; i<_max_host; i++) {
+      host_jlist[i]=ptr;
+      ptr+=_max_nbors;
+    }
+    _c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes();
   }
   if (_maxspecial>0) {
     dev_nspecial.clear();
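alloc() above carves the flat host_nbor buffer into _max_host fixed-length rows and records each row's starting address in host_jlist, so host code can hand LAMMPS a conventional jagged list without extra copies. The row-pointer setup with standard containers standing in for the UCL types:

    #include <cstddef>
    #include <vector>

    void build_jlist(std::vector<int> &host_nbor,
                     std::vector<int*> &host_jlist,
                     int max_host, int max_nbors) {
      host_nbor.resize(static_cast<std::size_t>(max_host)*max_nbors);
      host_jlist.resize(max_host);
      int *ptr = host_nbor.data();
      for (int i = 0; i < max_host; i++) {
        host_jlist[i] = ptr;   // row i starts here
        ptr += max_nbors;      // rows are packed back to back
      }
    }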
@@ -145,6 +169,9 @@ void PairGPUNbor::clear() {
   dev_host_nbor.clear();
   dev_packed.clear();
   host_nbor.clear();
+  dev_host_numj.clear();
+  host_ilist.clear();
+  host_jlist.clear();
   dev_nspecial.clear();
   dev_special.clear();
   dev_special_t.clear();
@@ -152,27 +179,13 @@ void PairGPUNbor::clear() {
     time_kernel.clear();
     time_nbor.clear();
   }
-
-  if (_compiled) {
-    if (_gpu_nbor) {
-      k_cell_id.clear();
-      k_cell_counts.clear();
-      k_build_nbor.clear();
-      k_transpose.clear();
-      k_special.clear();
-      delete build_program;
-    } else {
-      k_nbor.clear();
-      delete nbor_program;
-    }
-    _compiled=false;
-  }
 }
 
 double PairGPUNbor::host_memory_usage() const {
   if (_gpu_nbor) {
     if (_gpu_host)
-      return host_nbor.row_bytes()*host_nbor.rows();
+      return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+
+             host_jlist.row_bytes();
     else
       return 0;
   } else
@@ -186,7 +199,7 @@ void PairGPUNbor::get_host(const int inum, int *ilist, int *numj,
 
   UCL_H_Vec<int> ilist_view;
   ilist_view.view(ilist,inum,*dev);
-  ucl_copy(dev_nbor,ilist_view,true);
+  ucl_copy(dev_nbor,ilist_view,false);
 
   UCL_D_Vec<int> nbor_offset;
   UCL_H_Vec<int> host_offset;
@@ -238,46 +251,20 @@ void PairGPUNbor::get_host(const int inum, int *ilist, int *numj,
   if (_use_packing==false) {
     time_kernel.start();
     int GX=static_cast<int>(ceil(static_cast<double>(inum)/block_size));
-    k_nbor.set_size(GX,block_size);
-    k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum);
+    _shared->k_nbor.set_size(GX,block_size);
+    _shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum);
     time_kernel.stop();
   }
 }
 
-void PairGPUNbor::compile_kernels(UCL_Device &dev) {
-  std::string flags="-cl-fast-relaxed-math -cl-mad-enable";
-
-  if (_gpu_nbor==false) {
-    nbor_program=new UCL_Program(dev);
-    nbor_program->load_string(pair_gpu_nbor_kernel,flags.c_str());
-    k_nbor.set_function(*nbor_program,"kernel_unpack");
-  } else {
-    build_program=new UCL_Program(dev);
-    #ifdef USE_OPENCL
-    std::cerr << "CANNOT CURRENTLY USE GPU NEIGHBORING WITH OPENCL\n";
-    exit(1);
-    #else
-    build_program->load_string(pair_gpu_build_kernel,flags.c_str());
-    #endif
-    k_cell_id.set_function(*build_program,"calc_cell_id");
-    k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts");
-    k_build_nbor.set_function(*build_program,"calc_neigh_list_cell");
-    k_transpose.set_function(*build_program,"transpose");
-    k_special.set_function(*build_program,"kernel_special");
-    neigh_tex.get_texture(*build_program,"neigh_tex");
-  }
-  _compiled=true;
-}
-
 template <class numtyp, class acctyp>
 void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
                                   const int nall,
                                   PairGPUAtom<numtyp,acctyp> &atom,
-                                  double *boxlo, double *boxhi, int *tag,
+                                  double *sublo, double *subhi, int *tag,
                                   int **nspecial, int **special, bool &success,
                                   int &mn) {
   const int nt=inum+host_inum;
 
   if (_maxspecial>0) {
     time_nbor.start();
     UCL_H_Vec<int> view_nspecial, view_special, view_tag;
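build_nbor_list now bins atoms against this proc's subdomain (sublo to subhi) rather than the whole simulation box, padding the grid with one cell_size halo on each side for ghosts. The per-dimension cell count as a checkable helper; a sketch of the arithmetic in the hunk below, not library API:

    #include <cmath>

    inline int ncells(double sublo, double subhi, double cell_size) {
      return static_cast<int>(
        std::ceil(((subhi - sublo) + 2.0*cell_size)/cell_size));
    }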
@ -290,25 +277,25 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
|
|||||||
time_nbor.stop();
|
time_nbor.stop();
|
||||||
time_nbor.add_to_total();
|
time_nbor.add_to_total();
|
||||||
time_kernel.start();
|
time_kernel.start();
|
||||||
const int b2x=8;
|
const int b2x=_block_cell_2d;
|
||||||
const int b2y=8;
|
const int b2y=_block_cell_2d;
|
||||||
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
|
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
|
||||||
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
|
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
|
||||||
k_transpose.set_size(g2x,g2y,b2x,b2y);
|
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
|
||||||
k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),&_maxspecial,
|
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
|
||||||
&nt);
|
&_maxspecial,&nt);
|
||||||
} else
|
} else
|
||||||
time_kernel.start();
|
time_kernel.start();
|
||||||
|
|
||||||
_nbor_pitch=inum;
|
_nbor_pitch=inum;
|
||||||
neigh_tex.bind_float(atom.dev_x,4);
|
_shared->neigh_tex.bind_float(atom.dev_x,4);
|
||||||
|
|
||||||
int ncellx, ncelly, ncellz, ncell_3d;
|
int ncellx, ncelly, ncellz, ncell_3d;
|
||||||
ncellx = static_cast<int>(ceil(((boxhi[0] - boxlo[0]) +
|
ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) +
|
||||||
2.0*_cell_size)/_cell_size));
|
2.0*_cell_size)/_cell_size));
|
||||||
ncelly = static_cast<int>(ceil(((boxhi[1] - boxlo[1]) +
|
ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) +
|
||||||
2.0*_cell_size)/_cell_size));
|
2.0*_cell_size)/_cell_size));
|
||||||
ncellz = static_cast<int>(ceil(((boxhi[2] - boxlo[2]) +
|
ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) +
|
||||||
2.0*_cell_size)/_cell_size));
|
2.0*_cell_size)/_cell_size));
|
||||||
ncell_3d = ncellx * ncelly * ncellz;
|
ncell_3d = ncellx * ncelly * ncellz;
|
||||||
UCL_D_Vec<int> cell_counts;
|
UCL_D_Vec<int> cell_counts;
|
||||||
@@ -316,34 +303,35 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
   _cell_bytes=cell_counts.row_bytes();

   /* build cell list on GPU */
-  const int neigh_block=128;
+  const int neigh_block=_block_cell_id;
   const int GX=(int)ceil((float)nall/neigh_block);
-  const numtyp boxlo0=static_cast<numtyp>(boxlo[0]);
-  const numtyp boxlo1=static_cast<numtyp>(boxlo[1]);
-  const numtyp boxlo2=static_cast<numtyp>(boxlo[2]);
-  const numtyp boxhi0=static_cast<numtyp>(boxhi[0]);
-  const numtyp boxhi1=static_cast<numtyp>(boxhi[1]);
-  const numtyp boxhi2=static_cast<numtyp>(boxhi[2]);
+  const numtyp sublo0=static_cast<numtyp>(sublo[0]);
+  const numtyp sublo1=static_cast<numtyp>(sublo[1]);
+  const numtyp sublo2=static_cast<numtyp>(sublo[2]);
+  const numtyp subhi0=static_cast<numtyp>(subhi[0]);
+  const numtyp subhi1=static_cast<numtyp>(subhi[1]);
+  const numtyp subhi2=static_cast<numtyp>(subhi[2]);
   const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
-  k_cell_id.set_size(GX,neigh_block);
-  k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
+  _shared->k_cell_id.set_size(GX,neigh_block);
+  _shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
                 &atom.dev_particle_id.begin(),
-                &boxlo0, &boxlo1, &boxlo2, &boxhi0, &boxhi1,
-                &boxhi2, &cell_size_cast, &ncellx, &ncelly, &nall);
+                &sublo0, &sublo1, &sublo2, &subhi0, &subhi1,
+                &subhi2, &cell_size_cast, &ncellx, &ncelly, &nall);

   atom.sort_neighbor(nall);

   /* calculate cell count */
-  k_cell_counts.set_size(GX,neigh_block);
-  k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(), &nall,
-                    &ncell_3d);
+  _shared->k_cell_counts.set_size(GX,neigh_block);
+  _shared->k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(),
+                             &nall, &ncell_3d);

   /* build the neighbor list */
-  const int cell_block=64;
-  k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
-  k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
+  const int cell_block=_block_nbor_build;
+  _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
+  _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
                    &cell_counts.begin(), &dev_nbor.begin(),
-                   &dev_host_nbor.begin(), &_max_nbors, &cell_size_cast,
+                   &dev_host_nbor.begin(), &dev_host_numj.begin(),
+                   &_max_nbors,&cell_size_cast,
                    &ncellx, &ncelly, &ncellz, &inum, &nt, &nall);

   /* Get the maximum number of nbors and realloc if necessary */
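All three launches in this hunk size their 1-D grids the same way: enough blocks of a fixed thread count to cover nall items, with the last block partially idle. For positive integers, the float ceil in the source is equivalent to this sketch (function name assumed for illustration):

    #include <cstdio>

    // Integer form of ceil(n/block): enough blocks to cover n work items.
    static int grid_1d(int n, int block) {
      return (n + block - 1)/block;
    }

    int main() {
      std::printf("%d\n", grid_1d(1000, 128));   // 8 blocks = 1024 threads
      return 0;
    }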
@@ -353,7 +341,7 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
   if (nt>inum) {
     UCL_H_Vec<int> host_offset;
     host_offset.view_offset(inum,host_acc,nt-inum);
-    ucl_copy(host_offset,dev_host_nbor,nt-inum,false);
+    ucl_copy(host_offset,dev_host_numj,nt-inum,false);
   }
   mn=host_acc[0];
   for (int i=1; i<nt; i++)
@@ -368,10 +356,15 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
     if (_max_host>0) {
       host_nbor.clear();
       dev_host_nbor.clear();
-      success=success && (host_nbor.alloc((mn+1)*_max_host,dev_nbor,
+      success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
                                           UCL_RW_OPTIMIZED)==UCL_SUCCESS);
-      success=success && (dev_host_nbor.alloc((mn+1)*_max_host,
+      success=success && (dev_host_nbor.alloc(mn*_max_host,
                                               dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
+      int *ptr=host_nbor.begin();
+      for (int i=0; i<_max_host; i++) {
+        host_jlist[i]=ptr;
+        ptr+=mn;
+      }
       _gpu_bytes+=dev_host_nbor.row_bytes();
     }
     if (_alloc_packed) {
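The added loop hands out non-overlapping, stride-mn slices of the single flat host_nbor allocation, so each host_jlist[i] acts as that atom's neighbor array without per-atom allocations. A self-contained sketch of the same carving pattern (sizes invented for the example):

    #include <cstdio>
    #include <vector>

    int main() {
      const int max_host = 4, mn = 3;            // illustrative sizes
      std::vector<int> flat(max_host*mn, -1);    // stands in for host_nbor
      std::vector<int*> jlist(max_host);         // stands in for host_jlist

      int *ptr = flat.data();
      for (int i = 0; i < max_host; i++) {
        jlist[i] = ptr;                          // row i starts here
        ptr += mn;                               // fixed stride to next row
      }

      jlist[2][1] = 42;                          // write through the view...
      std::printf("%d\n", flat[2*mn + 1]);       // ...prints 42
      return 0;
    }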
@@ -385,28 +378,29 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
     _max_nbors=mn;
     time_kernel.stop();
     time_kernel.add_to_total();
-    build_nbor_list(inum, host_inum, nall, atom, boxlo, boxhi, tag, nspecial,
+    build_nbor_list(inum, host_inum, nall, atom, sublo, subhi, tag, nspecial,
                     special, success, mn);
     return;
   }

   if (_maxspecial>0) {
     const int GX2=static_cast<int>(ceil(static_cast<double>(nt)/cell_block));
-    k_special.set_size(GX2,cell_block);
-    k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
-                  &atom.dev_tag.begin(), &dev_nspecial.begin(),
-                  &dev_special.begin(), &inum, &nt, &nall);
+    _shared->k_special.set_size(GX2,cell_block);
+    _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
+                           &dev_host_numj.begin(), &atom.dev_tag.begin(),
+                           &dev_nspecial.begin(), &dev_special.begin(),
+                           &inum, &nt, &nall, &_max_nbors);
   }
   time_kernel.stop();

   time_nbor.start();
   if (_gpu_host)
-    ucl_copy(host_nbor,dev_host_nbor,host_inum*(mn+1),false);
+    ucl_copy(host_nbor,dev_host_nbor,false);
   time_nbor.stop();
 }

 template void PairGPUNbor::build_nbor_list<PRECISION,ACC_PRECISION>
   (const int inum, const int host_inum, const int nall,
-   PairGPUAtom<PRECISION,ACC_PRECISION> &atom, double *boxlo, double *boxhi,
+   PairGPUAtom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,
    int *, int **, int **, bool &success, int &mn);
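The top of this hunk is a grow-and-retry path: when the measured maximum neighbor count mn exceeds the current capacity, _max_nbors is raised to mn and build_nbor_list re-enters itself once, so the second pass is guaranteed to fit. The control flow, reduced to a runnable sketch (the fixed count stands in for the device-side measurement):

    #include <cstdio>

    static const int measured_max = 75;   // stand-in for the kernel's result

    static void build(int &capacity) {
      int mn = measured_max;              // first pass measures the real max
      if (mn > capacity) {
        capacity = mn;                    // _max_nbors = mn
        build(capacity);                  // rebuild into large-enough buffers
        return;
      }
      std::printf("built with capacity %d\n", capacity);
    }

    int main() { int cap = 50; build(cap); return 0; }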
@@ -19,32 +19,27 @@
 #define PAIR_GPU_NBOR_H

 #include "pair_gpu_atom.h"
+#include "pair_gpu_nbor_shared.h"

 #define IJ_SIZE 131072

 #ifdef USE_OPENCL

-#include "geryon/ocl_device.h"
 #include "geryon/ocl_timer.h"
 #include "geryon/ocl_mat.h"
-#include "geryon/ocl_kernel.h"
-#include "geryon/ocl_texture.h"
 using namespace ucl_opencl;

 #else

-#include "geryon/nvd_device.h"
 #include "geryon/nvd_timer.h"
 #include "geryon/nvd_mat.h"
-#include "geryon/nvd_kernel.h"
-#include "geryon/nvd_texture.h"
 using namespace ucl_cudadr;

 #endif

 class PairGPUNbor {
  public:
-  PairGPUNbor() : _allocated(false), _use_packing(false), _compiled(false) {}
+  PairGPUNbor() : _allocated(false), _use_packing(false) {}
   ~PairGPUNbor() { clear(); }

   /// Determine whether neighbor unpacking should be used
@@ -62,9 +57,11 @@ class PairGPUNbor {
    * 2 if gpu_nbor is true, and host needs a full nbor list
    * \param pre_cut True if cutoff test will be performed in separate kernel
    * than the force kernel **/
-  bool init(const int inum, const int host_inum, const int max_nbors,
-            const int maxspecial, UCL_Device &dev, const bool gpu_nbor,
-            const int gpu_host, const bool pre_cut);
+  bool init(PairGPUNborShared *shared, const int inum, const int host_inum,
+            const int max_nbors, const int maxspecial, UCL_Device &dev,
+            const bool gpu_nbor, const int gpu_host, const bool pre_cut,
+            const int block_cell_2d, const int block_cell_id,
+            const int block_nbor_build);

   /// Set the size of the cutoff+skin
   inline void cell_size(const double size) { _cell_size=size; }
@@ -131,18 +128,18 @@ class PairGPUNbor {
   inline int max_nbors() const { return _max_nbors; }

   /// Loop through neighbor count array and return maximum nbors for a particle
-  inline int max_nbor_loop(const int inum, int *numj) const {
+  inline int max_nbor_loop(const int inum, int *numj, int *ilist) const {
     int mn=0;
     for (int i=0; i<inum; i++)
-      mn=std::max(mn,numj[i]);
+      mn=std::max(mn,numj[ilist[i]]);
     return mn;
   }

   /// Build nbor list on the device
   template <class numtyp, class acctyp>
   void build_nbor_list(const int inum, const int host_inum, const int nall,
-                       PairGPUAtom<numtyp,acctyp> &atom, double *boxlo,
-                       double *boxhi, int *tag, int **nspecial, int **special,
+                       PairGPUAtom<numtyp,acctyp> &atom, double *sublo,
+                       double *subhi, int *tag, int **nspecial, int **special,
                        bool &success, int &max_nbors);

   /// Return the number of bytes used on device
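The new ilist parameter matters because numj is indexed by atom id while the loop runs over list positions; reading numj[i] directly would inspect the wrong atoms whenever the list is not the identity permutation. A small worked example (arrays invented for illustration):

    #include <algorithm>
    #include <cstdio>

    int main() {
      int numj[5]  = {3, 2, 1, 7, 9};   // neighbor counts, indexed by atom id
      int ilist[3] = {4, 1, 3};         // atoms actually in the list
      int mn = 0;
      for (int i = 0; i < 3; i++)
        mn = std::max(mn, numj[ilist[i]]);   // looks up atoms 4, 1, 3
      std::printf("%d\n", mn);               // 9; numj[i] would give 3
      return 0;
    }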
@@ -176,31 +173,31 @@ class PairGPUNbor {
   UCL_H_Vec<int> host_nbor;
   /// Device storage for neighbor list matrix that will be copied to host
   /** - 1st row is numj
-    * - Remaining rows are nbors **/
+    * - Remaining rows are by atom, columns are nbors **/
   UCL_D_Vec<int> dev_host_nbor;
+  UCL_D_Vec<int> dev_host_numj;
+  UCL_H_Vec<int> host_ilist;
+  UCL_H_Vec<int*> host_jlist;
   /// Device storage for special neighbor counts
   UCL_D_Vec<int> dev_nspecial;
   /// Device storage for special neighbors
   UCL_D_Vec<int> dev_special, dev_special_t;
-  /// Texture for cached position/type access with CUDA
-  UCL_Texture neigh_tex;

   /// Device timers
   UCL_Timer time_nbor, time_kernel;

  private:
+  PairGPUNborShared *_shared;
   UCL_Device *dev;
-  UCL_Program *nbor_program, *build_program;
-  UCL_Kernel k_nbor, k_cell_id, k_cell_counts, k_build_nbor;
-  UCL_Kernel k_transpose, k_special;
-  bool _allocated, _use_packing, _compiled;
-  void compile_kernels(UCL_Device &dev);
+  bool _allocated, _use_packing;
   int _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
   bool _gpu_nbor, _gpu_host, _alloc_packed;
   double _cell_size;

   double _gpu_bytes, _c_bytes, _cell_bytes;
   void alloc(bool &success);

+  int _block_cell_2d, _block_cell_id, _block_nbor_build;
 };

 #endif
@@ -84,8 +84,6 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
 #define acctyp4 _lgpu_float4
 #endif

-#define MAX_SHARED_TYPES 8
-#define MAX_BIO_SHARED_TYPES 128
 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};

 #endif