commit 7837edd51f (parent 00dc2b891f)
Author: sjplimp
Date:   2011-12-02 16:02:36 +00:00

    git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7281 f3b2605a-c512-4ea7-a41b-209d697bcdaa

118 changed files with 47017 additions and 0 deletions

File diff suppressed because it is too large.

lib/gpu/cmm_cut_gpu_ptx.h (new file, 984 lines)

@@ -0,0 +1,984 @@
const char * cmm_cut_gpu_kernel =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
" .global .texref pos_tex;\n"
" .entry kernel_pair (\n"
" .param .u64 __cudaparm_kernel_pair_x_,\n"
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_ans,\n"
" .param .u64 __cudaparm_kernel_pair_engv,\n"
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_inum,\n"
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
" {\n"
" .reg .u32 %r<72>;\n"
" .reg .u64 %rd<62>;\n"
" .reg .f32 %f<111>;\n"
" .reg .pred %p<21>;\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32590_35_non_const_red_acc108[3072];\n"
" .loc 16 88 0\n"
"$LDWbegin_kernel_pair:\n"
" .loc 16 95 0\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
" ldu.global.f32 %f1, [%rd1+0];\n"
" .loc 16 96 0\n"
" ld.global.f32 %f2, [%rd1+4];\n"
" .loc 16 97 0\n"
" ld.global.f32 %f3, [%rd1+8];\n"
" .loc 16 98 0\n"
" ld.global.f32 %f4, [%rd1+12];\n"
" st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n"
" .loc 16 107 0\n"
" mov.f32 %f5, 0f00000000; \n"
" mov.f32 %f6, %f5;\n"
" mov.f32 %f7, 0f00000000; \n"
" mov.f32 %f8, %f7;\n"
" mov.f32 %f9, 0f00000000; \n"
" mov.f32 %f10, %f9;\n"
" mov.f32 %f11, 0f00000000; \n"
" mov.f32 %f12, %f11;\n"
" mov.f32 %f13, 0f00000000; \n"
" mov.f32 %f14, %f13;\n"
" mov.f32 %f15, 0f00000000; \n"
" mov.f32 %f16, %f15;\n"
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
" cvt.s32.u32 %r2, %tid.x;\n"
" div.s32 %r3, %r2, %r1;\n"
" cvt.s32.u32 %r4, %ntid.x;\n"
" div.s32 %r5, %r4, %r1;\n"
" rem.s32 %r6, %r2, %r1;\n"
" cvt.s32.u32 %r7, %ctaid.x;\n"
" mul.lo.s32 %r8, %r7, %r5;\n"
" add.s32 %r9, %r3, %r8;\n"
" ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];\n"
" setp.lt.s32 %p1, %r9, %r10;\n"
" @!%p1 bra $Lt_0_20738;\n"
" .loc 16 114 0\n"
" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n"
" cvt.s64.s32 %rd2, %r11;\n"
" mul.wide.s32 %rd3, %r11, 4;\n"
" cvt.s64.s32 %rd4, %r9;\n"
" mul.wide.s32 %rd5, %r9, 4;\n"
" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n"
" add.u64 %rd7, %rd5, %rd6;\n"
" add.u64 %rd8, %rd3, %rd7;\n"
" ld.global.s32 %r12, [%rd8+0];\n"
" add.u64 %rd9, %rd3, %rd8;\n"
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];\n"
" setp.ne.u64 %p2, %rd10, %rd6;\n"
" @%p2 bra $Lt_0_21250;\n"
" .loc 16 120 0\n"
" cvt.s32.s64 %r13, %rd2;\n"
" mul.lo.s32 %r14, %r13, %r12;\n"
" cvt.s64.s32 %rd11, %r14;\n"
" mul.wide.s32 %rd12, %r14, 4;\n"
" add.u64 %rd13, %rd9, %rd12;\n"
" .loc 16 121 0\n"
" mul.lo.s32 %r15, %r6, %r13;\n"
" cvt.s64.s32 %rd14, %r15;\n"
" mul.wide.s32 %rd15, %r15, 4;\n"
" add.u64 %rd16, %rd9, %rd15;\n"
" .loc 16 122 0\n"
" mul.lo.s32 %r16, %r13, %r1;\n"
" bra.uni $Lt_0_20994;\n"
"$Lt_0_21250:\n"
" .loc 16 124 0\n"
" ld.global.s32 %r17, [%rd9+0];\n"
" cvt.s64.s32 %rd17, %r17;\n"
" mul.wide.s32 %rd18, %r17, 4;\n"
" add.u64 %rd19, %rd10, %rd18;\n"
" .loc 16 125 0\n"
" cvt.s64.s32 %rd20, %r12;\n"
" mul.wide.s32 %rd21, %r12, 4;\n"
" add.u64 %rd13, %rd19, %rd21;\n"
" .loc 16 126 0\n"
" mov.s32 %r16, %r1;\n"
" .loc 16 127 0\n"
" cvt.s64.s32 %rd22, %r6;\n"
" mul.wide.s32 %rd23, %r6, 4;\n"
" add.u64 %rd16, %rd19, %rd23;\n"
"$Lt_0_20994:\n"
" .loc 16 130 0\n"
" ld.global.s32 %r18, [%rd7+0];\n"
" mov.u32 %r19, %r18;\n"
" mov.s32 %r20, 0;\n"
" mov.u32 %r21, %r20;\n"
" mov.s32 %r22, 0;\n"
" mov.u32 %r23, %r22;\n"
" mov.s32 %r24, 0;\n"
" mov.u32 %r25, %r24;\n"
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];\n"
" mov.f32 %f21, %f17;\n"
" mov.f32 %f22, %f18;\n"
" mov.f32 %f23, %f19;\n"
" mov.f32 %f24, %f20;\n"
" setp.ge.u64 %p3, %rd16, %rd13;\n"
" @%p3 bra $Lt_0_30722;\n"
" cvt.rzi.ftz.s32.f32 %r26, %f24;\n"
" cvt.s64.s32 %rd24, %r16;\n"
" ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];\n"
" mul.lo.s32 %r28, %r27, %r26;\n"
" ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;\n"
"$Lt_0_22018:\n"
" .loc 16 136 0\n"
" ld.global.s32 %r29, [%rd16+0];\n"
" .loc 16 137 0\n"
" shr.s32 %r30, %r29, 30;\n"
" and.b32 %r31, %r30, 3;\n"
" cvt.s64.s32 %rd27, %r31;\n"
" mul.wide.s32 %rd28, %r31, 4;\n"
" add.u64 %rd29, %rd26, %rd28;\n"
" ld.shared.f32 %f29, [%rd29+0];\n"
" .loc 16 140 0\n"
" and.b32 %r32, %r29, 1073741823;\n"
" mov.u32 %r33, %r32;\n"
" mov.s32 %r34, 0;\n"
" mov.u32 %r35, %r34;\n"
" mov.s32 %r36, 0;\n"
" mov.u32 %r37, %r36;\n"
" mov.s32 %r38, 0;\n"
" mov.u32 %r39, %r38;\n"
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];\n"
" mov.f32 %f34, %f30;\n"
" mov.f32 %f35, %f31;\n"
" mov.f32 %f36, %f32;\n"
" mov.f32 %f37, %f33;\n"
" cvt.rzi.ftz.s32.f32 %r40, %f37;\n"
" sub.ftz.f32 %f38, %f22, %f35;\n"
" sub.ftz.f32 %f39, %f21, %f34;\n"
" sub.ftz.f32 %f40, %f23, %f36;\n"
" mul.ftz.f32 %f41, %f38, %f38;\n"
" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n"
" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n"
" add.s32 %r41, %r40, %r28;\n"
" cvt.s64.s32 %rd30, %r41;\n"
" mul.wide.s32 %rd31, %r41, 16;\n"
" add.u64 %rd32, %rd31, %rd25;\n"
" ld.global.f32 %f44, [%rd32+0];\n"
" setp.gt.ftz.f32 %p4, %f44, %f43;\n"
" @!%p4 bra $Lt_0_24322;\n"
" rcp.approx.ftz.f32 %f45, %f43;\n"
" ld.global.f32 %f46, [%rd32+4];\n"
" mov.f32 %f47, 0f40000000; \n"
" setp.eq.ftz.f32 %p5, %f46, %f47;\n"
" @!%p5 bra $Lt_0_23042;\n"
" .loc 16 155 0\n"
" mul.ftz.f32 %f48, %f45, %f45;\n"
" mov.f32 %f49, %f48;\n"
" .loc 16 156 0\n"
" mul.ftz.f32 %f50, %f48, %f48;\n"
" bra.uni $Lt_0_23298;\n"
"$Lt_0_23042:\n"
" mov.f32 %f51, 0f3f800000; \n"
" setp.eq.ftz.f32 %p6, %f46, %f51;\n"
" @!%p6 bra $Lt_0_23554;\n"
" .loc 16 158 0\n"
" sqrt.approx.ftz.f32 %f52, %f45;\n"
" mul.ftz.f32 %f53, %f45, %f52;\n"
" mov.f32 %f50, %f53;\n"
" .loc 16 159 0\n"
" mul.ftz.f32 %f49, %f53, %f53;\n"
" bra.uni $Lt_0_23298;\n"
"$Lt_0_23554:\n"
" .loc 16 161 0\n"
" mul.ftz.f32 %f54, %f45, %f45;\n"
" mul.ftz.f32 %f55, %f45, %f54;\n"
" mov.f32 %f49, %f55;\n"
" .loc 16 162 0\n"
" mov.f32 %f50, %f55;\n"
"$Lt_0_23298:\n"
"$Lt_0_22786:\n"
" .loc 16 164 0\n"
" mul.ftz.f32 %f56, %f45, %f29;\n"
" mul.ftz.f32 %f57, %f49, %f56;\n"
" ld.global.v2.f32 {%f58,%f59}, [%rd32+8];\n"
" mul.ftz.f32 %f60, %f58, %f50;\n"
" sub.ftz.f32 %f61, %f60, %f59;\n"
" mul.ftz.f32 %f62, %f57, %f61;\n"
" .loc 16 166 0\n"
" fma.rn.ftz.f32 %f27, %f39, %f62, %f27;\n"
" .loc 16 167 0\n"
" fma.rn.ftz.f32 %f26, %f38, %f62, %f26;\n"
" .loc 16 168 0\n"
" fma.rn.ftz.f32 %f25, %f40, %f62, %f25;\n"
" ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r43, 0;\n"
" setp.le.s32 %p7, %r42, %r43;\n"
" @%p7 bra $Lt_0_23810;\n"
" .loc 16 170 0\n"
" ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];\n"
" add.u64 %rd34, %rd33, %rd31;\n"
" ld.global.v4.f32 {%f63,%f64,%f65,_}, [%rd34+0];\n"
" mul.ftz.f32 %f66, %f29, %f49;\n"
" mul.ftz.f32 %f67, %f63, %f50;\n"
" sub.ftz.f32 %f68, %f67, %f64;\n"
" mul.ftz.f32 %f69, %f66, %f68;\n"
" sub.ftz.f32 %f70, %f69, %f65;\n"
" add.ftz.f32 %f28, %f28, %f70;\n"
"$Lt_0_23810:\n"
" ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r45, 0;\n"
" setp.le.s32 %p8, %r44, %r45;\n"
" @%p8 bra $Lt_0_24322;\n"
" .loc 16 173 0\n"
" mov.f32 %f71, %f6;\n"
" mul.ftz.f32 %f72, %f39, %f39;\n"
" fma.rn.ftz.f32 %f73, %f62, %f72, %f71;\n"
" mov.f32 %f6, %f73;\n"
" .loc 16 174 0\n"
" mov.f32 %f74, %f8;\n"
" fma.rn.ftz.f32 %f75, %f62, %f41, %f74;\n"
" mov.f32 %f8, %f75;\n"
" .loc 16 175 0\n"
" mov.f32 %f76, %f10;\n"
" mul.ftz.f32 %f77, %f40, %f40;\n"
" fma.rn.ftz.f32 %f78, %f62, %f77, %f76;\n"
" mov.f32 %f10, %f78;\n"
" .loc 16 176 0\n"
" mov.f32 %f79, %f12;\n"
" mul.ftz.f32 %f80, %f38, %f39;\n"
" fma.rn.ftz.f32 %f81, %f62, %f80, %f79;\n"
" mov.f32 %f12, %f81;\n"
" .loc 16 177 0\n"
" mov.f32 %f82, %f14;\n"
" mul.ftz.f32 %f83, %f39, %f40;\n"
" fma.rn.ftz.f32 %f84, %f62, %f83, %f82;\n"
" mov.f32 %f14, %f84;\n"
" .loc 16 178 0\n"
" mul.ftz.f32 %f85, %f38, %f40;\n"
" fma.rn.ftz.f32 %f15, %f62, %f85, %f15;\n"
" mov.f32 %f16, %f15;\n"
"$Lt_0_24322:\n"
"$Lt_0_22274:\n"
" .loc 16 134 0\n"
" mul.lo.u64 %rd35, %rd24, 4;\n"
" add.u64 %rd16, %rd16, %rd35;\n"
" setp.lt.u64 %p9, %rd16, %rd13;\n"
" @%p9 bra $Lt_0_22018;\n"
" bra.uni $Lt_0_20482;\n"
"$Lt_0_30722:\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" bra.uni $Lt_0_20482;\n"
"$Lt_0_20738:\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
"$Lt_0_20482:\n"
" mov.u32 %r46, 1;\n"
" setp.le.s32 %p10, %r1, %r46;\n"
" @%p10 bra $Lt_0_27138;\n"
" .loc 16 189 0\n"
" mov.u64 %rd36, __cuda___cuda_local_var_32590_35_non_const_red_acc108;\n"
" cvt.s64.s32 %rd37, %r2;\n"
" mul.wide.s32 %rd38, %r2, 4;\n"
" add.u64 %rd39, %rd36, %rd38;\n"
" mov.f32 %f86, %f27;\n"
" st.shared.f32 [%rd39+0], %f86;\n"
" .loc 16 190 0\n"
" mov.f32 %f87, %f26;\n"
" st.shared.f32 [%rd39+512], %f87;\n"
" .loc 16 191 0\n"
" mov.f32 %f88, %f25;\n"
" st.shared.f32 [%rd39+1024], %f88;\n"
" .loc 16 192 0\n"
" mov.f32 %f89, %f28;\n"
" st.shared.f32 [%rd39+1536], %f89;\n"
" .loc 16 194 0\n"
" shr.s32 %r47, %r1, 31;\n"
" mov.s32 %r48, 1;\n"
" and.b32 %r49, %r47, %r48;\n"
" add.s32 %r50, %r49, %r1;\n"
" shr.s32 %r51, %r50, 1;\n"
" mov.s32 %r52, %r51;\n"
" mov.u32 %r53, 0;\n"
" setp.ne.u32 %p11, %r51, %r53;\n"
" @!%p11 bra $Lt_0_25602;\n"
"$Lt_0_26114:\n"
" setp.ge.u32 %p12, %r6, %r52;\n"
" @%p12 bra $Lt_0_26370;\n"
" .loc 16 197 0\n"
" add.u32 %r54, %r2, %r52;\n"
" cvt.u64.u32 %rd40, %r54;\n"
" mul.wide.u32 %rd41, %r54, 4;\n"
" add.u64 %rd42, %rd36, %rd41;\n"
" ld.shared.f32 %f90, [%rd42+0];\n"
" add.ftz.f32 %f86, %f90, %f86;\n"
" st.shared.f32 [%rd39+0], %f86;\n"
" ld.shared.f32 %f91, [%rd42+512];\n"
" add.ftz.f32 %f87, %f91, %f87;\n"
" st.shared.f32 [%rd39+512], %f87;\n"
" ld.shared.f32 %f92, [%rd42+1024];\n"
" add.ftz.f32 %f88, %f92, %f88;\n"
" st.shared.f32 [%rd39+1024], %f88;\n"
" ld.shared.f32 %f93, [%rd42+1536];\n"
" add.ftz.f32 %f89, %f93, %f89;\n"
" st.shared.f32 [%rd39+1536], %f89;\n"
"$Lt_0_26370:\n"
" .loc 16 194 0\n"
" shr.u32 %r52, %r52, 1;\n"
" mov.u32 %r55, 0;\n"
" setp.ne.u32 %p13, %r52, %r55;\n"
" @%p13 bra $Lt_0_26114;\n"
"$Lt_0_25602:\n"
" .loc 16 201 0\n"
" mov.f32 %f27, %f86;\n"
" .loc 16 202 0\n"
" mov.f32 %f26, %f87;\n"
" .loc 16 203 0\n"
" mov.f32 %f25, %f88;\n"
" .loc 16 204 0\n"
" mov.f32 %f28, %f89;\n"
" ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r57, 0;\n"
" setp.le.s32 %p14, %r56, %r57;\n"
" @%p14 bra $Lt_0_27138;\n"
" .loc 16 208 0\n"
" mov.f32 %f86, %f6;\n"
" st.shared.f32 [%rd39+0], %f86;\n"
" mov.f32 %f87, %f8;\n"
" st.shared.f32 [%rd39+512], %f87;\n"
" mov.f32 %f88, %f10;\n"
" st.shared.f32 [%rd39+1024], %f88;\n"
" mov.f32 %f89, %f12;\n"
" st.shared.f32 [%rd39+1536], %f89;\n"
" mov.f32 %f94, %f14;\n"
" st.shared.f32 [%rd39+2048], %f94;\n"
" mov.f32 %f95, %f16;\n"
" st.shared.f32 [%rd39+2560], %f95;\n"
" .loc 16 210 0\n"
" mov.s32 %r58, %r51;\n"
" @!%p11 bra $Lt_0_27650;\n"
"$Lt_0_28162:\n"
" setp.ge.u32 %p15, %r6, %r58;\n"
" @%p15 bra $Lt_0_28418;\n"
" .loc 16 213 0\n"
" add.u32 %r59, %r2, %r58;\n"
" cvt.u64.u32 %rd43, %r59;\n"
" mul.wide.u32 %rd44, %r59, 4;\n"
" add.u64 %rd45, %rd36, %rd44;\n"
" ld.shared.f32 %f96, [%rd45+0];\n"
" add.ftz.f32 %f86, %f96, %f86;\n"
" st.shared.f32 [%rd39+0], %f86;\n"
" ld.shared.f32 %f97, [%rd45+512];\n"
" add.ftz.f32 %f87, %f97, %f87;\n"
" st.shared.f32 [%rd39+512], %f87;\n"
" ld.shared.f32 %f98, [%rd45+1024];\n"
" add.ftz.f32 %f88, %f98, %f88;\n"
" st.shared.f32 [%rd39+1024], %f88;\n"
" ld.shared.f32 %f99, [%rd45+1536];\n"
" add.ftz.f32 %f89, %f99, %f89;\n"
" st.shared.f32 [%rd39+1536], %f89;\n"
" ld.shared.f32 %f100, [%rd45+2048];\n"
" add.ftz.f32 %f94, %f100, %f94;\n"
" st.shared.f32 [%rd39+2048], %f94;\n"
" ld.shared.f32 %f101, [%rd45+2560];\n"
" add.ftz.f32 %f95, %f101, %f95;\n"
" st.shared.f32 [%rd39+2560], %f95;\n"
"$Lt_0_28418:\n"
" .loc 16 210 0\n"
" shr.u32 %r58, %r58, 1;\n"
" mov.u32 %r60, 0;\n"
" setp.ne.u32 %p16, %r58, %r60;\n"
" @%p16 bra $Lt_0_28162;\n"
"$Lt_0_27650:\n"
" .loc 16 218 0\n"
" mov.f32 %f6, %f86;\n"
" mov.f32 %f8, %f87;\n"
" mov.f32 %f10, %f88;\n"
" mov.f32 %f12, %f89;\n"
" mov.f32 %f14, %f94;\n"
" mov.f32 %f16, %f95;\n"
"$Lt_0_27138:\n"
"$Lt_0_25090:\n"
" selp.s32 %r61, 1, 0, %p1;\n"
" mov.s32 %r62, 0;\n"
" set.eq.u32.s32 %r63, %r6, %r62;\n"
" neg.s32 %r64, %r63;\n"
" and.b32 %r65, %r61, %r64;\n"
" mov.u32 %r66, 0;\n"
" setp.eq.s32 %p17, %r65, %r66;\n"
" @%p17 bra $Lt_0_29186;\n"
" .loc 16 224 0\n"
" cvt.s64.s32 %rd46, %r9;\n"
" ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];\n"
" mul.wide.s32 %rd48, %r9, 4;\n"
" add.u64 %rd49, %rd47, %rd48;\n"
" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r68, 0;\n"
" setp.le.s32 %p18, %r67, %r68;\n"
" @%p18 bra $Lt_0_29698;\n"
" .loc 16 226 0\n"
" st.global.f32 [%rd49+0], %f28;\n"
" .loc 16 227 0\n"
" cvt.s64.s32 %rd50, %r10;\n"
" mul.wide.s32 %rd51, %r10, 4;\n"
" add.u64 %rd49, %rd49, %rd51;\n"
"$Lt_0_29698:\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p19, %r69, %r70;\n"
" @%p19 bra $Lt_0_30210;\n"
" .loc 16 231 0\n"
" mov.f32 %f102, %f6;\n"
" st.global.f32 [%rd49+0], %f102;\n"
" .loc 16 232 0\n"
" cvt.s64.s32 %rd52, %r10;\n"
" mul.wide.s32 %rd53, %r10, 4;\n"
" add.u64 %rd54, %rd53, %rd49;\n"
" .loc 16 231 0\n"
" mov.f32 %f103, %f8;\n"
" st.global.f32 [%rd54+0], %f103;\n"
" .loc 16 232 0\n"
" add.u64 %rd55, %rd53, %rd54;\n"
" .loc 16 231 0\n"
" mov.f32 %f104, %f10;\n"
" st.global.f32 [%rd55+0], %f104;\n"
" .loc 16 232 0\n"
" add.u64 %rd56, %rd53, %rd55;\n"
" .loc 16 231 0\n"
" mov.f32 %f105, %f12;\n"
" st.global.f32 [%rd56+0], %f105;\n"
" .loc 16 232 0\n"
" add.u64 %rd49, %rd53, %rd56;\n"
" .loc 16 231 0\n"
" mov.f32 %f106, %f14;\n"
" st.global.f32 [%rd49+0], %f106;\n"
" mov.f32 %f107, %f16;\n"
" add.u64 %rd57, %rd53, %rd49;\n"
" st.global.f32 [%rd57+0], %f107;\n"
"$Lt_0_30210:\n"
" .loc 16 235 0\n"
" ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];\n"
" mul.lo.u64 %rd59, %rd46, 16;\n"
" add.u64 %rd60, %rd58, %rd59;\n"
" mov.f32 %f108, %f109;\n"
" st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f108};\n"
"$Lt_0_29186:\n"
" .loc 16 237 0\n"
" exit;\n"
"$LDWend_kernel_pair:\n"
" }\n"
" .entry kernel_pair_fast (\n"
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
" .param .u64 __cudaparm_kernel_pair_fast_engv,\n"
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
" {\n"
" .reg .u32 %r<74>;\n"
" .reg .u64 %rd<74>;\n"
" .reg .f32 %f<118>;\n"
" .reg .pred %p<24>;\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32656_33_non_const_sp_lj3268[16];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32654_34_non_const_lj13296[1936];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32655_34_non_const_lj35232[1936];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32753_35_non_const_red_acc7168[3072];\n"
" .loc 16 245 0\n"
"$LDWbegin_kernel_pair_fast:\n"
" cvt.s32.u32 %r1, %tid.x;\n"
" mov.u32 %r2, 3;\n"
" setp.gt.s32 %p1, %r1, %r2;\n"
" @%p1 bra $Lt_1_22786;\n"
" .loc 16 255 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32656_33_non_const_sp_lj3268;\n"
" cvt.s64.s32 %rd2, %r1;\n"
" mul.wide.s32 %rd3, %r1, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.f32 %f1, [%rd5+0];\n"
" add.u64 %rd6, %rd3, %rd1;\n"
" st.shared.f32 [%rd6+0], %f1;\n"
"$Lt_1_22786:\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32656_33_non_const_sp_lj3268;\n"
" mov.u32 %r3, 120;\n"
" setp.gt.s32 %p2, %r1, %r3;\n"
" @%p2 bra $Lt_1_23298;\n"
" .loc 16 257 0\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32654_34_non_const_lj13296;\n"
" cvt.s64.s32 %rd8, %r1;\n"
" mul.wide.s32 %rd9, %r1, 16;\n"
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
" add.u64 %rd11, %rd10, %rd9;\n"
" add.u64 %rd12, %rd9, %rd7;\n"
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n"
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r5, 0;\n"
" setp.le.s32 %p3, %r4, %r5;\n"
" @%p3 bra $Lt_1_23810;\n"
" .loc 16 259 0\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32655_34_non_const_lj35232;\n"
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
" add.u64 %rd15, %rd14, %rd9;\n"
" add.u64 %rd16, %rd9, %rd13;\n"
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n"
"$Lt_1_23810:\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32655_34_non_const_lj35232;\n"
"$Lt_1_23298:\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32654_34_non_const_lj13296;\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32655_34_non_const_lj35232;\n"
" .loc 16 269 0\n"
" mov.f32 %f10, 0f00000000; \n"
" mov.f32 %f11, %f10;\n"
" mov.f32 %f12, 0f00000000; \n"
" mov.f32 %f13, %f12;\n"
" mov.f32 %f14, 0f00000000; \n"
" mov.f32 %f15, %f14;\n"
" mov.f32 %f16, 0f00000000; \n"
" mov.f32 %f17, %f16;\n"
" mov.f32 %f18, 0f00000000; \n"
" mov.f32 %f19, %f18;\n"
" mov.f32 %f20, 0f00000000; \n"
" mov.f32 %f21, %f20;\n"
" .loc 16 271 0\n"
" bar.sync 0;\n"
" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
" div.s32 %r7, %r1, %r6;\n"
" cvt.s32.u32 %r8, %ntid.x;\n"
" div.s32 %r9, %r8, %r6;\n"
" rem.s32 %r10, %r1, %r6;\n"
" cvt.s32.u32 %r11, %ctaid.x;\n"
" mul.lo.s32 %r12, %r11, %r9;\n"
" add.s32 %r13, %r7, %r12;\n"
" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];\n"
" setp.lt.s32 %p4, %r13, %r14;\n"
" @!%p4 bra $Lt_1_24578;\n"
" .loc 16 277 0\n"
" ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
" cvt.s64.s32 %rd17, %r15;\n"
" mul.wide.s32 %rd18, %r15, 4;\n"
" cvt.s64.s32 %rd19, %r13;\n"
" mul.wide.s32 %rd20, %r13, 4;\n"
" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
" add.u64 %rd22, %rd20, %rd21;\n"
" add.u64 %rd23, %rd18, %rd22;\n"
" ld.global.s32 %r16, [%rd23+0];\n"
" add.u64 %rd24, %rd18, %rd23;\n"
" ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];\n"
" setp.ne.u64 %p5, %rd25, %rd21;\n"
" @%p5 bra $Lt_1_25090;\n"
" .loc 16 283 0\n"
" cvt.s32.s64 %r17, %rd17;\n"
" mul.lo.s32 %r18, %r17, %r16;\n"
" cvt.s64.s32 %rd26, %r18;\n"
" mul.wide.s32 %rd27, %r18, 4;\n"
" add.u64 %rd28, %rd24, %rd27;\n"
" .loc 16 284 0\n"
" mul.lo.s32 %r19, %r10, %r17;\n"
" cvt.s64.s32 %rd29, %r19;\n"
" mul.wide.s32 %rd30, %r19, 4;\n"
" add.u64 %rd31, %rd24, %rd30;\n"
" .loc 16 285 0\n"
" mul.lo.s32 %r20, %r17, %r6;\n"
" bra.uni $Lt_1_24834;\n"
"$Lt_1_25090:\n"
" .loc 16 287 0\n"
" ld.global.s32 %r21, [%rd24+0];\n"
" cvt.s64.s32 %rd32, %r21;\n"
" mul.wide.s32 %rd33, %r21, 4;\n"
" add.u64 %rd34, %rd25, %rd33;\n"
" .loc 16 288 0\n"
" cvt.s64.s32 %rd35, %r16;\n"
" mul.wide.s32 %rd36, %r16, 4;\n"
" add.u64 %rd28, %rd34, %rd36;\n"
" .loc 16 289 0\n"
" mov.s32 %r20, %r6;\n"
" .loc 16 290 0\n"
" cvt.s64.s32 %rd37, %r10;\n"
" mul.wide.s32 %rd38, %r10, 4;\n"
" add.u64 %rd31, %rd34, %rd38;\n"
"$Lt_1_24834:\n"
" .loc 16 293 0\n"
" ld.global.s32 %r22, [%rd22+0];\n"
" mov.u32 %r23, %r22;\n"
" mov.s32 %r24, 0;\n"
" mov.u32 %r25, %r24;\n"
" mov.s32 %r26, 0;\n"
" mov.u32 %r27, %r26;\n"
" mov.s32 %r28, 0;\n"
" mov.u32 %r29, %r28;\n"
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];\n"
" mov.f32 %f26, %f22;\n"
" mov.f32 %f27, %f23;\n"
" mov.f32 %f28, %f24;\n"
" mov.f32 %f29, %f25;\n"
" setp.ge.u64 %p6, %rd31, %rd28;\n"
" @%p6 bra $Lt_1_34562;\n"
" cvt.rzi.ftz.s32.f32 %r30, %f29;\n"
" cvt.s64.s32 %rd39, %r20;\n"
" mul.lo.s32 %r31, %r30, 11;\n"
" cvt.rn.f32.s32 %f30, %r31;\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
"$Lt_1_25858:\n"
" .loc 16 300 0\n"
" ld.global.s32 %r32, [%rd31+0];\n"
" .loc 16 301 0\n"
" shr.s32 %r33, %r32, 30;\n"
" and.b32 %r34, %r33, 3;\n"
" cvt.s64.s32 %rd40, %r34;\n"
" mul.wide.s32 %rd41, %r34, 4;\n"
" add.u64 %rd42, %rd1, %rd41;\n"
" ld.shared.f32 %f35, [%rd42+0];\n"
" .loc 16 304 0\n"
" and.b32 %r35, %r32, 1073741823;\n"
" mov.u32 %r36, %r35;\n"
" mov.s32 %r37, 0;\n"
" mov.u32 %r38, %r37;\n"
" mov.s32 %r39, 0;\n"
" mov.u32 %r40, %r39;\n"
" mov.s32 %r41, 0;\n"
" mov.u32 %r42, %r41;\n"
" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r36,%r38,%r40,%r42}];\n"
" mov.f32 %f40, %f36;\n"
" mov.f32 %f41, %f37;\n"
" mov.f32 %f42, %f38;\n"
" mov.f32 %f43, %f39;\n"
" sub.ftz.f32 %f44, %f27, %f41;\n"
" sub.ftz.f32 %f45, %f26, %f40;\n"
" sub.ftz.f32 %f46, %f28, %f42;\n"
" mul.ftz.f32 %f47, %f44, %f44;\n"
" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n"
" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n"
" add.ftz.f32 %f50, %f30, %f43;\n"
" cvt.rzi.ftz.s32.f32 %r43, %f50;\n"
" cvt.s64.s32 %rd43, %r43;\n"
" mul.wide.s32 %rd44, %r43, 16;\n"
" add.u64 %rd45, %rd44, %rd7;\n"
" ld.shared.f32 %f51, [%rd45+0];\n"
" setp.gt.ftz.f32 %p7, %f51, %f49;\n"
" @!%p7 bra $Lt_1_28162;\n"
" rcp.approx.ftz.f32 %f52, %f49;\n"
" ld.shared.f32 %f53, [%rd45+4];\n"
" mov.f32 %f54, 0f40000000; \n"
" setp.eq.ftz.f32 %p8, %f53, %f54;\n"
" @!%p8 bra $Lt_1_26882;\n"
" .loc 16 318 0\n"
" mul.ftz.f32 %f55, %f52, %f52;\n"
" mov.f32 %f56, %f55;\n"
" .loc 16 319 0\n"
" mul.ftz.f32 %f57, %f55, %f55;\n"
" bra.uni $Lt_1_27138;\n"
"$Lt_1_26882:\n"
" mov.f32 %f58, 0f3f800000; \n"
" setp.eq.ftz.f32 %p9, %f53, %f58;\n"
" @!%p9 bra $Lt_1_27394;\n"
" .loc 16 321 0\n"
" sqrt.approx.ftz.f32 %f59, %f52;\n"
" mul.ftz.f32 %f60, %f52, %f59;\n"
" mov.f32 %f57, %f60;\n"
" .loc 16 322 0\n"
" mul.ftz.f32 %f56, %f60, %f60;\n"
" bra.uni $Lt_1_27138;\n"
"$Lt_1_27394:\n"
" .loc 16 324 0\n"
" mul.ftz.f32 %f61, %f52, %f52;\n"
" mul.ftz.f32 %f62, %f52, %f61;\n"
" mov.f32 %f56, %f62;\n"
" .loc 16 325 0\n"
" mov.f32 %f57, %f62;\n"
"$Lt_1_27138:\n"
"$Lt_1_26626:\n"
" .loc 16 327 0\n"
" mul.ftz.f32 %f63, %f52, %f35;\n"
" mul.ftz.f32 %f64, %f56, %f63;\n"
" ld.shared.v2.f32 {%f65,%f66}, [%rd45+8];\n"
" mul.ftz.f32 %f67, %f65, %f57;\n"
" sub.ftz.f32 %f68, %f67, %f66;\n"
" mul.ftz.f32 %f69, %f64, %f68;\n"
" .loc 16 329 0\n"
" fma.rn.ftz.f32 %f33, %f45, %f69, %f33;\n"
" .loc 16 330 0\n"
" fma.rn.ftz.f32 %f32, %f44, %f69, %f32;\n"
" .loc 16 331 0\n"
" fma.rn.ftz.f32 %f31, %f46, %f69, %f31;\n"
" ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r45, 0;\n"
" setp.le.s32 %p10, %r44, %r45;\n"
" @%p10 bra $Lt_1_27650;\n"
" .loc 16 333 0\n"
" add.u64 %rd46, %rd44, %rd13;\n"
" ld.shared.v4.f32 {%f70,%f71,%f72,_}, [%rd46+0];\n"
" mul.ftz.f32 %f73, %f35, %f56;\n"
" mul.ftz.f32 %f74, %f70, %f57;\n"
" sub.ftz.f32 %f75, %f74, %f71;\n"
" mul.ftz.f32 %f76, %f73, %f75;\n"
" sub.ftz.f32 %f77, %f76, %f72;\n"
" add.ftz.f32 %f34, %f34, %f77;\n"
"$Lt_1_27650:\n"
" ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r47, 0;\n"
" setp.le.s32 %p11, %r46, %r47;\n"
" @%p11 bra $Lt_1_28162;\n"
" .loc 16 336 0\n"
" mov.f32 %f78, %f11;\n"
" mul.ftz.f32 %f79, %f45, %f45;\n"
" fma.rn.ftz.f32 %f80, %f69, %f79, %f78;\n"
" mov.f32 %f11, %f80;\n"
" .loc 16 337 0\n"
" mov.f32 %f81, %f13;\n"
" fma.rn.ftz.f32 %f82, %f69, %f47, %f81;\n"
" mov.f32 %f13, %f82;\n"
" .loc 16 338 0\n"
" mov.f32 %f83, %f15;\n"
" mul.ftz.f32 %f84, %f46, %f46;\n"
" fma.rn.ftz.f32 %f85, %f69, %f84, %f83;\n"
" mov.f32 %f15, %f85;\n"
" .loc 16 339 0\n"
" mov.f32 %f86, %f17;\n"
" mul.ftz.f32 %f87, %f44, %f45;\n"
" fma.rn.ftz.f32 %f88, %f69, %f87, %f86;\n"
" mov.f32 %f17, %f88;\n"
" .loc 16 340 0\n"
" mov.f32 %f89, %f19;\n"
" mul.ftz.f32 %f90, %f45, %f46;\n"
" fma.rn.ftz.f32 %f91, %f69, %f90, %f89;\n"
" mov.f32 %f19, %f91;\n"
" .loc 16 341 0\n"
" mul.ftz.f32 %f92, %f44, %f46;\n"
" fma.rn.ftz.f32 %f20, %f69, %f92, %f20;\n"
" mov.f32 %f21, %f20;\n"
"$Lt_1_28162:\n"
"$Lt_1_26114:\n"
" .loc 16 298 0\n"
" mul.lo.u64 %rd47, %rd39, 4;\n"
" add.u64 %rd31, %rd31, %rd47;\n"
" setp.lt.u64 %p12, %rd31, %rd28;\n"
" @%p12 bra $Lt_1_25858;\n"
" bra.uni $Lt_1_24322;\n"
"$Lt_1_34562:\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
" bra.uni $Lt_1_24322;\n"
"$Lt_1_24578:\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
"$Lt_1_24322:\n"
" mov.u32 %r48, 1;\n"
" setp.le.s32 %p13, %r6, %r48;\n"
" @%p13 bra $Lt_1_30978;\n"
" .loc 16 352 0\n"
" mov.u64 %rd48, __cuda___cuda_local_var_32753_35_non_const_red_acc7168;\n"
" cvt.s64.s32 %rd49, %r1;\n"
" mul.wide.s32 %rd50, %r1, 4;\n"
" add.u64 %rd51, %rd48, %rd50;\n"
" mov.f32 %f93, %f33;\n"
" st.shared.f32 [%rd51+0], %f93;\n"
" .loc 16 353 0\n"
" mov.f32 %f94, %f32;\n"
" st.shared.f32 [%rd51+512], %f94;\n"
" .loc 16 354 0\n"
" mov.f32 %f95, %f31;\n"
" st.shared.f32 [%rd51+1024], %f95;\n"
" .loc 16 355 0\n"
" mov.f32 %f96, %f34;\n"
" st.shared.f32 [%rd51+1536], %f96;\n"
" .loc 16 357 0\n"
" shr.s32 %r49, %r6, 31;\n"
" mov.s32 %r50, 1;\n"
" and.b32 %r51, %r49, %r50;\n"
" add.s32 %r52, %r51, %r6;\n"
" shr.s32 %r53, %r52, 1;\n"
" mov.s32 %r54, %r53;\n"
" mov.u32 %r55, 0;\n"
" setp.ne.u32 %p14, %r53, %r55;\n"
" @!%p14 bra $Lt_1_29442;\n"
"$Lt_1_29954:\n"
" setp.ge.u32 %p15, %r10, %r54;\n"
" @%p15 bra $Lt_1_30210;\n"
" .loc 16 360 0\n"
" add.u32 %r56, %r1, %r54;\n"
" cvt.u64.u32 %rd52, %r56;\n"
" mul.wide.u32 %rd53, %r56, 4;\n"
" add.u64 %rd54, %rd48, %rd53;\n"
" ld.shared.f32 %f97, [%rd54+0];\n"
" add.ftz.f32 %f93, %f97, %f93;\n"
" st.shared.f32 [%rd51+0], %f93;\n"
" ld.shared.f32 %f98, [%rd54+512];\n"
" add.ftz.f32 %f94, %f98, %f94;\n"
" st.shared.f32 [%rd51+512], %f94;\n"
" ld.shared.f32 %f99, [%rd54+1024];\n"
" add.ftz.f32 %f95, %f99, %f95;\n"
" st.shared.f32 [%rd51+1024], %f95;\n"
" ld.shared.f32 %f100, [%rd54+1536];\n"
" add.ftz.f32 %f96, %f100, %f96;\n"
" st.shared.f32 [%rd51+1536], %f96;\n"
"$Lt_1_30210:\n"
" .loc 16 357 0\n"
" shr.u32 %r54, %r54, 1;\n"
" mov.u32 %r57, 0;\n"
" setp.ne.u32 %p16, %r54, %r57;\n"
" @%p16 bra $Lt_1_29954;\n"
"$Lt_1_29442:\n"
" .loc 16 364 0\n"
" mov.f32 %f33, %f93;\n"
" .loc 16 365 0\n"
" mov.f32 %f32, %f94;\n"
" .loc 16 366 0\n"
" mov.f32 %f31, %f95;\n"
" .loc 16 367 0\n"
" mov.f32 %f34, %f96;\n"
" ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r59, 0;\n"
" setp.le.s32 %p17, %r58, %r59;\n"
" @%p17 bra $Lt_1_30978;\n"
" .loc 16 371 0\n"
" mov.f32 %f93, %f11;\n"
" st.shared.f32 [%rd51+0], %f93;\n"
" mov.f32 %f94, %f13;\n"
" st.shared.f32 [%rd51+512], %f94;\n"
" mov.f32 %f95, %f15;\n"
" st.shared.f32 [%rd51+1024], %f95;\n"
" mov.f32 %f96, %f17;\n"
" st.shared.f32 [%rd51+1536], %f96;\n"
" mov.f32 %f101, %f19;\n"
" st.shared.f32 [%rd51+2048], %f101;\n"
" mov.f32 %f102, %f21;\n"
" st.shared.f32 [%rd51+2560], %f102;\n"
" .loc 16 373 0\n"
" mov.s32 %r60, %r53;\n"
" @!%p14 bra $Lt_1_31490;\n"
"$Lt_1_32002:\n"
" setp.ge.u32 %p18, %r10, %r60;\n"
" @%p18 bra $Lt_1_32258;\n"
" .loc 16 376 0\n"
" add.u32 %r61, %r1, %r60;\n"
" cvt.u64.u32 %rd55, %r61;\n"
" mul.wide.u32 %rd56, %r61, 4;\n"
" add.u64 %rd57, %rd48, %rd56;\n"
" ld.shared.f32 %f103, [%rd57+0];\n"
" add.ftz.f32 %f93, %f103, %f93;\n"
" st.shared.f32 [%rd51+0], %f93;\n"
" ld.shared.f32 %f104, [%rd57+512];\n"
" add.ftz.f32 %f94, %f104, %f94;\n"
" st.shared.f32 [%rd51+512], %f94;\n"
" ld.shared.f32 %f105, [%rd57+1024];\n"
" add.ftz.f32 %f95, %f105, %f95;\n"
" st.shared.f32 [%rd51+1024], %f95;\n"
" ld.shared.f32 %f106, [%rd57+1536];\n"
" add.ftz.f32 %f96, %f106, %f96;\n"
" st.shared.f32 [%rd51+1536], %f96;\n"
" ld.shared.f32 %f107, [%rd57+2048];\n"
" add.ftz.f32 %f101, %f107, %f101;\n"
" st.shared.f32 [%rd51+2048], %f101;\n"
" ld.shared.f32 %f108, [%rd57+2560];\n"
" add.ftz.f32 %f102, %f108, %f102;\n"
" st.shared.f32 [%rd51+2560], %f102;\n"
"$Lt_1_32258:\n"
" .loc 16 373 0\n"
" shr.u32 %r60, %r60, 1;\n"
" mov.u32 %r62, 0;\n"
" setp.ne.u32 %p19, %r60, %r62;\n"
" @%p19 bra $Lt_1_32002;\n"
"$Lt_1_31490:\n"
" .loc 16 381 0\n"
" mov.f32 %f11, %f93;\n"
" mov.f32 %f13, %f94;\n"
" mov.f32 %f15, %f95;\n"
" mov.f32 %f17, %f96;\n"
" mov.f32 %f19, %f101;\n"
" mov.f32 %f21, %f102;\n"
"$Lt_1_30978:\n"
"$Lt_1_28930:\n"
" selp.s32 %r63, 1, 0, %p4;\n"
" mov.s32 %r64, 0;\n"
" set.eq.u32.s32 %r65, %r10, %r64;\n"
" neg.s32 %r66, %r65;\n"
" and.b32 %r67, %r63, %r66;\n"
" mov.u32 %r68, 0;\n"
" setp.eq.s32 %p20, %r67, %r68;\n"
" @%p20 bra $Lt_1_33026;\n"
" .loc 16 387 0\n"
" cvt.s64.s32 %rd58, %r13;\n"
" ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];\n"
" mul.wide.s32 %rd60, %r13, 4;\n"
" add.u64 %rd61, %rd59, %rd60;\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p21, %r69, %r70;\n"
" @%p21 bra $Lt_1_33538;\n"
" .loc 16 389 0\n"
" st.global.f32 [%rd61+0], %f34;\n"
" .loc 16 390 0\n"
" cvt.s64.s32 %rd62, %r14;\n"
" mul.wide.s32 %rd63, %r14, 4;\n"
" add.u64 %rd61, %rd61, %rd63;\n"
"$Lt_1_33538:\n"
" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r72, 0;\n"
" setp.le.s32 %p22, %r71, %r72;\n"
" @%p22 bra $Lt_1_34050;\n"
" .loc 16 394 0\n"
" mov.f32 %f109, %f11;\n"
" st.global.f32 [%rd61+0], %f109;\n"
" .loc 16 395 0\n"
" cvt.s64.s32 %rd64, %r14;\n"
" mul.wide.s32 %rd65, %r14, 4;\n"
" add.u64 %rd66, %rd65, %rd61;\n"
" .loc 16 394 0\n"
" mov.f32 %f110, %f13;\n"
" st.global.f32 [%rd66+0], %f110;\n"
" .loc 16 395 0\n"
" add.u64 %rd67, %rd65, %rd66;\n"
" .loc 16 394 0\n"
" mov.f32 %f111, %f15;\n"
" st.global.f32 [%rd67+0], %f111;\n"
" .loc 16 395 0\n"
" add.u64 %rd68, %rd65, %rd67;\n"
" .loc 16 394 0\n"
" mov.f32 %f112, %f17;\n"
" st.global.f32 [%rd68+0], %f112;\n"
" .loc 16 395 0\n"
" add.u64 %rd61, %rd65, %rd68;\n"
" .loc 16 394 0\n"
" mov.f32 %f113, %f19;\n"
" st.global.f32 [%rd61+0], %f113;\n"
" mov.f32 %f114, %f21;\n"
" add.u64 %rd69, %rd65, %rd61;\n"
" st.global.f32 [%rd69+0], %f114;\n"
"$Lt_1_34050:\n"
" .loc 16 398 0\n"
" ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];\n"
" mul.lo.u64 %rd71, %rd58, 16;\n"
" add.u64 %rd72, %rd70, %rd71;\n"
" mov.f32 %f115, %f116;\n"
" st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f115};\n"
"$Lt_1_33026:\n"
" .loc 16 400 0\n"
" exit;\n"
"$LDWend_kernel_pair_fast:\n"
" }\n"
;
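
The string above is the complete nvcc-generated PTX for the cmm/cut pair style, targeting the sm_20 virtual architecture and embedded as a C string so the driver can JIT-compile the kernels at run time. As a rough illustration of how such a header is consumed — the in-tree lib/gpu code goes through its Geryon (UCL) wrapper rather than calling the CUDA driver API directly, so this is only a minimal sketch of what that wrapper does underneath, using the variable and .entry names that appear in this diff:

    // Sketch: JIT-load the embedded PTX with the raw CUDA driver API and
    // look up the two .entry points defined in the string. Illustrative
    // only; error handling is abbreviated.
    #include <cuda.h>
    #include <cstdio>
    #include "cmm_cut_gpu_ptx.h"   // provides: const char *cmm_cut_gpu_kernel

    int main() {
      CUdevice dev;
      CUcontext ctx;
      CUmodule mod;
      CUfunction pair, pair_fast;

      cuInit(0);
      cuDeviceGet(&dev, 0);
      cuCtxCreate(&ctx, 0, dev);

      // The PTX declares ".target sm_20", so a Fermi-class or newer
      // device (whose driver can JIT that virtual architecture) is needed.
      if (cuModuleLoadData(&mod, cmm_cut_gpu_kernel) != CUDA_SUCCESS) {
        fprintf(stderr, "PTX JIT failed\n");
        return 1;
      }
      cuModuleGetFunction(&pair, mod, "kernel_pair");
      cuModuleGetFunction(&pair_fast, mod, "kernel_pair_fast");
      // ... set up the .param buffers listed in each .entry signature and
      // launch with cuLaunchKernel(...).

      cuModuleUnload(mod);
      cuCtxDestroy(ctx);
      return 0;
    }

cuModuleLoadData hands the PTX text to the driver's just-in-time compiler; the ".entry" names in the string (kernel_pair, kernel_pair_fast) are exactly what cuModuleGetFunction resolves.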

File diff suppressed because it is too large.

lib/gpu/cmmc_long_gpu_ptx.h (new file, 1217 lines)

File diff suppressed because it is too large.

File diff suppressed because it is too large.

lib/gpu/cmmc_msm_gpu_ptx.h (new file, 1353 lines)

File diff suppressed because it is too large.

File diff suppressed because it is too large.

lib/gpu/coul_long_gpu_ptx.h (new file, 979 lines)

@@ -0,0 +1,979 @@
const char * coul_long_gpu_kernel =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
" .global .texref pos_tex;\n"
" .global .texref q_tex;\n"
" .entry kernel_pair (\n"
" .param .u64 __cudaparm_kernel_pair_x_,\n"
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
" .param .u64 __cudaparm_kernel_pair_sp_cl_in,\n"
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_ans,\n"
" .param .u64 __cudaparm_kernel_pair_engv,\n"
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_inum,\n"
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
" .param .u64 __cudaparm_kernel_pair_q_,\n"
" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n"
" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n"
" .param .f32 __cudaparm_kernel_pair_g_ewald,\n"
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
" {\n"
" .reg .u32 %r<81>;\n"
" .reg .u64 %rd<57>;\n"
" .reg .f32 %f<132>;\n"
" .reg .pred %p<19>;\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32498_33_non_const_sp_cl112[16];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32585_35_non_const_red_acc128[3072];\n"
" .loc 16 108 0\n"
"$LDWbegin_kernel_pair:\n"
" .loc 16 115 0\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_cl_in];\n"
" ldu.global.f32 %f1, [%rd1+0];\n"
" .loc 16 116 0\n"
" ld.global.f32 %f2, [%rd1+4];\n"
" .loc 16 117 0\n"
" ld.global.f32 %f3, [%rd1+8];\n"
" .loc 16 118 0\n"
" ld.global.f32 %f4, [%rd1+12];\n"
" st.shared.v4.f32 [__cuda___cuda_local_var_32498_33_non_const_sp_cl112+0], {%f1,%f2,%f3,%f4};\n"
" .loc 16 127 0\n"
" mov.f32 %f5, 0f00000000; \n"
" mov.f32 %f6, %f5;\n"
" mov.f32 %f7, 0f00000000; \n"
" mov.f32 %f8, %f7;\n"
" mov.f32 %f9, 0f00000000; \n"
" mov.f32 %f10, %f9;\n"
" mov.f32 %f11, 0f00000000; \n"
" mov.f32 %f12, %f11;\n"
" mov.f32 %f13, 0f00000000; \n"
" mov.f32 %f14, %f13;\n"
" mov.f32 %f15, 0f00000000; \n"
" mov.f32 %f16, %f15;\n"
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
" cvt.s32.u32 %r2, %tid.x;\n"
" div.s32 %r3, %r2, %r1;\n"
" cvt.s32.u32 %r4, %ntid.x;\n"
" div.s32 %r5, %r4, %r1;\n"
" rem.s32 %r6, %r2, %r1;\n"
" cvt.s32.u32 %r7, %ctaid.x;\n"
" mul.lo.s32 %r8, %r7, %r5;\n"
" add.s32 %r9, %r3, %r8;\n"
" ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];\n"
" setp.lt.s32 %p1, %r9, %r10;\n"
" @!%p1 bra $Lt_0_19202;\n"
" .loc 16 131 0\n"
" cvt.s64.s32 %rd2, %r9;\n"
" mul.wide.s32 %rd3, %r9, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
" add.u64 %rd5, %rd3, %rd4;\n"
" ld.global.s32 %r11, [%rd5+0];\n"
" .loc 16 133 0\n"
" ld.param.s32 %r12, [__cudaparm_kernel_pair_nbor_pitch];\n"
" cvt.s64.s32 %rd6, %r12;\n"
" mul.wide.s32 %rd7, %r12, 4;\n"
" add.u64 %rd8, %rd7, %rd5;\n"
" ld.global.s32 %r13, [%rd8+0];\n"
" add.u64 %rd9, %rd7, %rd8;\n"
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];\n"
" setp.ne.u64 %p2, %rd10, %rd4;\n"
" @%p2 bra $Lt_0_19714;\n"
" .loc 16 139 0\n"
" cvt.s32.s64 %r14, %rd6;\n"
" mul.lo.s32 %r15, %r14, %r13;\n"
" cvt.s64.s32 %rd11, %r15;\n"
" mul.wide.s32 %rd12, %r15, 4;\n"
" add.u64 %rd13, %rd9, %rd12;\n"
" .loc 16 140 0\n"
" mul.lo.s32 %r16, %r6, %r14;\n"
" cvt.s64.s32 %rd14, %r16;\n"
" mul.wide.s32 %rd15, %r16, 4;\n"
" add.u64 %rd16, %rd9, %rd15;\n"
" .loc 16 141 0\n"
" mul.lo.s32 %r17, %r14, %r1;\n"
" bra.uni $Lt_0_19458;\n"
"$Lt_0_19714:\n"
" .loc 16 143 0\n"
" ld.global.s32 %r18, [%rd9+0];\n"
" cvt.s64.s32 %rd17, %r18;\n"
" mul.wide.s32 %rd18, %r18, 4;\n"
" add.u64 %rd19, %rd10, %rd18;\n"
" .loc 16 144 0\n"
" cvt.s64.s32 %rd20, %r13;\n"
" mul.wide.s32 %rd21, %r13, 4;\n"
" add.u64 %rd13, %rd19, %rd21;\n"
" .loc 16 145 0\n"
" mov.s32 %r17, %r1;\n"
" .loc 16 146 0\n"
" cvt.s64.s32 %rd22, %r6;\n"
" mul.wide.s32 %rd23, %r6, 4;\n"
" add.u64 %rd16, %rd19, %rd23;\n"
"$Lt_0_19458:\n"
" .loc 16 149 0\n"
" mov.u32 %r19, %r11;\n"
" mov.s32 %r20, 0;\n"
" mov.u32 %r21, %r20;\n"
" mov.s32 %r22, 0;\n"
" mov.u32 %r23, %r22;\n"
" mov.s32 %r24, 0;\n"
" mov.u32 %r25, %r24;\n"
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];\n"
" mov.f32 %f21, %f17;\n"
" mov.f32 %f22, %f18;\n"
" mov.f32 %f23, %f19;\n"
" .loc 16 150 0\n"
" mov.u32 %r26, %r11;\n"
" mov.s32 %r27, 0;\n"
" mov.u32 %r28, %r27;\n"
" mov.s32 %r29, 0;\n"
" mov.u32 %r30, %r29;\n"
" mov.s32 %r31, 0;\n"
" mov.u32 %r32, %r31;\n"
" tex.1d.v4.f32.s32 {%f24,%f25,%f26,%f27},[q_tex,{%r26,%r28,%r30,%r32}];\n"
" mov.f32 %f28, %f24;\n"
" setp.ge.u64 %p3, %rd16, %rd13;\n"
" @%p3 bra $Lt_0_27650;\n"
" cvt.s64.s32 %rd24, %r17;\n"
" ld.param.f32 %f29, [__cudaparm_kernel_pair_cut_coulsq];\n"
" mov.f32 %f30, 0f00000000; \n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.u64 %rd25, __cuda___cuda_local_var_32498_33_non_const_sp_cl112;\n"
"$Lt_0_20482:\n"
" .loc 16 153 0\n"
" ld.global.s32 %r33, [%rd16+0];\n"
" .loc 16 156 0\n"
" mov.f32 %f34, 0f3f800000; \n"
" shr.s32 %r34, %r33, 30;\n"
" and.b32 %r35, %r34, 3;\n"
" cvt.s64.s32 %rd26, %r35;\n"
" mul.wide.s32 %rd27, %r35, 4;\n"
" add.u64 %rd28, %rd25, %rd27;\n"
" ld.shared.f32 %f35, [%rd28+0];\n"
" sub.ftz.f32 %f36, %f34, %f35;\n"
" .loc 16 159 0\n"
" and.b32 %r36, %r33, 1073741823;\n"
" mov.u32 %r37, %r36;\n"
" mov.s32 %r38, 0;\n"
" mov.u32 %r39, %r38;\n"
" mov.s32 %r40, 0;\n"
" mov.u32 %r41, %r40;\n"
" mov.s32 %r42, 0;\n"
" mov.u32 %r43, %r42;\n"
" tex.1d.v4.f32.s32 {%f37,%f38,%f39,%f40},[pos_tex,{%r37,%r39,%r41,%r43}];\n"
" mov.f32 %f41, %f37;\n"
" mov.f32 %f42, %f38;\n"
" mov.f32 %f43, %f39;\n"
" sub.ftz.f32 %f44, %f22, %f42;\n"
" sub.ftz.f32 %f45, %f21, %f41;\n"
" sub.ftz.f32 %f46, %f23, %f43;\n"
" mul.ftz.f32 %f47, %f44, %f44;\n"
" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n"
" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n"
" setp.lt.ftz.f32 %p4, %f49, %f29;\n"
" @!%p4 bra $Lt_0_21250;\n"
" .loc 16 175 0\n"
" sqrt.approx.ftz.f32 %f50, %f49;\n"
" ld.param.f32 %f51, [__cudaparm_kernel_pair_g_ewald];\n"
" mul.ftz.f32 %f52, %f51, %f50;\n"
" mul.ftz.f32 %f53, %f52, %f52;\n"
" mov.f32 %f54, 0f3f800000; \n"
" mov.f32 %f55, 0f3ea7ba05; \n"
" fma.rn.ftz.f32 %f56, %f55, %f52, %f54;\n"
" neg.ftz.f32 %f57, %f53;\n"
" rcp.approx.ftz.f32 %f58, %f56;\n"
" mov.f32 %f59, 0f3fb8aa3b; \n"
" mul.ftz.f32 %f60, %f57, %f59;\n"
" ex2.approx.ftz.f32 %f61, %f60;\n"
" mov.f32 %f62, 0f3e827906; \n"
" mov.f32 %f63, 0fbe91a98e; \n"
" mov.f32 %f64, 0f3fb5f0e3; \n"
" mov.f32 %f65, 0fbfba00e3; \n"
" mov.f32 %f66, 0f3f87dc22; \n"
" fma.rn.ftz.f32 %f67, %f66, %f58, %f65;\n"
" fma.rn.ftz.f32 %f68, %f58, %f67, %f64;\n"
" fma.rn.ftz.f32 %f69, %f58, %f68, %f63;\n"
" fma.rn.ftz.f32 %f70, %f58, %f69, %f62;\n"
" mul.ftz.f32 %f71, %f58, %f70;\n"
" mul.ftz.f32 %f72, %f61, %f71;\n"
" .loc 16 176 0\n"
" mov.u32 %r44, %r36;\n"
" mov.s32 %r45, 0;\n"
" mov.u32 %r46, %r45;\n"
" mov.s32 %r47, 0;\n"
" mov.u32 %r48, %r47;\n"
" mov.s32 %r49, 0;\n"
" mov.u32 %r50, %r49;\n"
" tex.1d.v4.f32.s32 {%f73,%f74,%f75,%f76},[q_tex,{%r44,%r46,%r48,%r50}];\n"
" mov.f32 %f77, %f73;\n"
" .loc 16 177 0\n"
" ld.param.f32 %f78, [__cudaparm_kernel_pair_qqrd2e];\n"
" mul.ftz.f32 %f79, %f78, %f28;\n"
" mul.ftz.f32 %f80, %f79, %f77;\n"
" div.approx.ftz.f32 %f81, %f80, %f50;\n"
" mov.f32 %f82, 0f3f906ebb; \n"
" mul.ftz.f32 %f83, %f52, %f82;\n"
" fma.rn.ftz.f32 %f84, %f61, %f83, %f72;\n"
" sub.ftz.f32 %f85, %f84, %f36;\n"
" mul.ftz.f32 %f86, %f81, %f85;\n"
" rcp.approx.ftz.f32 %f87, %f49;\n"
" mul.ftz.f32 %f88, %f86, %f87;\n"
" .loc 16 179 0\n"
" fma.rn.ftz.f32 %f32, %f45, %f88, %f32;\n"
" .loc 16 180 0\n"
" fma.rn.ftz.f32 %f31, %f44, %f88, %f31;\n"
" .loc 16 181 0\n"
" fma.rn.ftz.f32 %f30, %f46, %f88, %f30;\n"
" .loc 16 168 0\n"
" sub.ftz.f32 %f89, %f72, %f36;\n"
" fma.rn.ftz.f32 %f90, %f81, %f89, %f33;\n"
" ld.param.s32 %r51, [__cudaparm_kernel_pair_eflag];\n"
" mov.s32 %r52, 0;\n"
" setp.gt.s32 %p5, %r51, %r52;\n"
" selp.f32 %f33, %f90, %f33, %p5;\n"
" ld.param.s32 %r53, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r54, 0;\n"
" setp.le.s32 %p6, %r53, %r54;\n"
" @%p6 bra $Lt_0_21250;\n"
" .loc 16 187 0\n"
" mov.f32 %f91, %f6;\n"
" mul.ftz.f32 %f92, %f45, %f45;\n"
" fma.rn.ftz.f32 %f93, %f88, %f92, %f91;\n"
" mov.f32 %f6, %f93;\n"
" .loc 16 188 0\n"
" mov.f32 %f94, %f8;\n"
" fma.rn.ftz.f32 %f95, %f88, %f47, %f94;\n"
" mov.f32 %f8, %f95;\n"
" .loc 16 189 0\n"
" mov.f32 %f96, %f10;\n"
" mul.ftz.f32 %f97, %f46, %f46;\n"
" fma.rn.ftz.f32 %f98, %f88, %f97, %f96;\n"
" mov.f32 %f10, %f98;\n"
" .loc 16 190 0\n"
" mov.f32 %f99, %f12;\n"
" mul.ftz.f32 %f100, %f44, %f45;\n"
" fma.rn.ftz.f32 %f101, %f88, %f100, %f99;\n"
" mov.f32 %f12, %f101;\n"
" .loc 16 191 0\n"
" mov.f32 %f102, %f14;\n"
" mul.ftz.f32 %f103, %f45, %f46;\n"
" fma.rn.ftz.f32 %f104, %f88, %f103, %f102;\n"
" mov.f32 %f14, %f104;\n"
" .loc 16 192 0\n"
" mul.ftz.f32 %f105, %f44, %f46;\n"
" fma.rn.ftz.f32 %f15, %f88, %f105, %f15;\n"
" mov.f32 %f16, %f15;\n"
"$Lt_0_21250:\n"
"$Lt_0_20738:\n"
" .loc 16 152 0\n"
" mul.lo.u64 %rd29, %rd24, 4;\n"
" add.u64 %rd16, %rd16, %rd29;\n"
" setp.lt.u64 %p7, %rd16, %rd13;\n"
" @%p7 bra $Lt_0_20482;\n"
" bra.uni $Lt_0_18946;\n"
"$Lt_0_27650:\n"
" mov.f32 %f30, 0f00000000; \n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" bra.uni $Lt_0_18946;\n"
"$Lt_0_19202:\n"
" mov.f32 %f30, 0f00000000; \n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
"$Lt_0_18946:\n"
" mov.u32 %r55, 1;\n"
" setp.le.s32 %p8, %r1, %r55;\n"
" @%p8 bra $Lt_0_24066;\n"
" .loc 16 203 0\n"
" mov.u64 %rd30, __cuda___cuda_local_var_32585_35_non_const_red_acc128;\n"
" cvt.s64.s32 %rd31, %r2;\n"
" mul.wide.s32 %rd32, %r2, 4;\n"
" add.u64 %rd33, %rd30, %rd32;\n"
" mov.f32 %f106, %f32;\n"
" st.shared.f32 [%rd33+0], %f106;\n"
" .loc 16 204 0\n"
" mov.f32 %f107, %f31;\n"
" st.shared.f32 [%rd33+512], %f107;\n"
" .loc 16 205 0\n"
" mov.f32 %f108, %f30;\n"
" st.shared.f32 [%rd33+1024], %f108;\n"
" .loc 16 206 0\n"
" mov.f32 %f109, %f33;\n"
" st.shared.f32 [%rd33+1536], %f109;\n"
" .loc 16 208 0\n"
" shr.s32 %r56, %r1, 31;\n"
" mov.s32 %r57, 1;\n"
" and.b32 %r58, %r56, %r57;\n"
" add.s32 %r59, %r58, %r1;\n"
" shr.s32 %r60, %r59, 1;\n"
" mov.s32 %r61, %r60;\n"
" mov.u32 %r62, 0;\n"
" setp.ne.u32 %p9, %r60, %r62;\n"
" @!%p9 bra $Lt_0_22530;\n"
"$Lt_0_23042:\n"
" setp.ge.u32 %p10, %r6, %r61;\n"
" @%p10 bra $Lt_0_23298;\n"
" .loc 16 211 0\n"
" add.u32 %r63, %r2, %r61;\n"
" cvt.u64.u32 %rd34, %r63;\n"
" mul.wide.u32 %rd35, %r63, 4;\n"
" add.u64 %rd36, %rd30, %rd35;\n"
" ld.shared.f32 %f110, [%rd36+0];\n"
" add.ftz.f32 %f106, %f110, %f106;\n"
" st.shared.f32 [%rd33+0], %f106;\n"
" ld.shared.f32 %f111, [%rd36+512];\n"
" add.ftz.f32 %f107, %f111, %f107;\n"
" st.shared.f32 [%rd33+512], %f107;\n"
" ld.shared.f32 %f112, [%rd36+1024];\n"
" add.ftz.f32 %f108, %f112, %f108;\n"
" st.shared.f32 [%rd33+1024], %f108;\n"
" ld.shared.f32 %f113, [%rd36+1536];\n"
" add.ftz.f32 %f109, %f113, %f109;\n"
" st.shared.f32 [%rd33+1536], %f109;\n"
"$Lt_0_23298:\n"
" .loc 16 208 0\n"
" shr.u32 %r61, %r61, 1;\n"
" mov.u32 %r64, 0;\n"
" setp.ne.u32 %p11, %r61, %r64;\n"
" @%p11 bra $Lt_0_23042;\n"
"$Lt_0_22530:\n"
" .loc 16 215 0\n"
" mov.f32 %f32, %f106;\n"
" .loc 16 216 0\n"
" mov.f32 %f31, %f107;\n"
" .loc 16 217 0\n"
" mov.f32 %f30, %f108;\n"
" .loc 16 218 0\n"
" mov.f32 %f33, %f109;\n"
" ld.param.s32 %r65, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r66, 0;\n"
" setp.le.s32 %p12, %r65, %r66;\n"
" @%p12 bra $Lt_0_24066;\n"
" .loc 16 222 0\n"
" mov.f32 %f106, %f6;\n"
" st.shared.f32 [%rd33+0], %f106;\n"
" mov.f32 %f107, %f8;\n"
" st.shared.f32 [%rd33+512], %f107;\n"
" mov.f32 %f108, %f10;\n"
" st.shared.f32 [%rd33+1024], %f108;\n"
" mov.f32 %f109, %f12;\n"
" st.shared.f32 [%rd33+1536], %f109;\n"
" mov.f32 %f114, %f14;\n"
" st.shared.f32 [%rd33+2048], %f114;\n"
" mov.f32 %f115, %f16;\n"
" st.shared.f32 [%rd33+2560], %f115;\n"
" .loc 16 224 0\n"
" mov.s32 %r67, %r60;\n"
" @!%p9 bra $Lt_0_24578;\n"
"$Lt_0_25090:\n"
" setp.ge.u32 %p13, %r6, %r67;\n"
" @%p13 bra $Lt_0_25346;\n"
" .loc 16 227 0\n"
" add.u32 %r68, %r2, %r67;\n"
" cvt.u64.u32 %rd37, %r68;\n"
" mul.wide.u32 %rd38, %r68, 4;\n"
" add.u64 %rd39, %rd30, %rd38;\n"
" ld.shared.f32 %f116, [%rd39+0];\n"
" add.ftz.f32 %f106, %f116, %f106;\n"
" st.shared.f32 [%rd33+0], %f106;\n"
" ld.shared.f32 %f117, [%rd39+512];\n"
" add.ftz.f32 %f107, %f117, %f107;\n"
" st.shared.f32 [%rd33+512], %f107;\n"
" ld.shared.f32 %f118, [%rd39+1024];\n"
" add.ftz.f32 %f108, %f118, %f108;\n"
" st.shared.f32 [%rd33+1024], %f108;\n"
" ld.shared.f32 %f119, [%rd39+1536];\n"
" add.ftz.f32 %f109, %f119, %f109;\n"
" st.shared.f32 [%rd33+1536], %f109;\n"
" ld.shared.f32 %f120, [%rd39+2048];\n"
" add.ftz.f32 %f114, %f120, %f114;\n"
" st.shared.f32 [%rd33+2048], %f114;\n"
" ld.shared.f32 %f121, [%rd39+2560];\n"
" add.ftz.f32 %f115, %f121, %f115;\n"
" st.shared.f32 [%rd33+2560], %f115;\n"
"$Lt_0_25346:\n"
" .loc 16 224 0\n"
" shr.u32 %r67, %r67, 1;\n"
" mov.u32 %r69, 0;\n"
" setp.ne.u32 %p14, %r67, %r69;\n"
" @%p14 bra $Lt_0_25090;\n"
"$Lt_0_24578:\n"
" .loc 16 232 0\n"
" mov.f32 %f6, %f106;\n"
" mov.f32 %f8, %f107;\n"
" mov.f32 %f10, %f108;\n"
" mov.f32 %f12, %f109;\n"
" mov.f32 %f14, %f114;\n"
" mov.f32 %f16, %f115;\n"
"$Lt_0_24066:\n"
"$Lt_0_22018:\n"
" selp.s32 %r70, 1, 0, %p1;\n"
" mov.s32 %r71, 0;\n"
" set.eq.u32.s32 %r72, %r6, %r71;\n"
" neg.s32 %r73, %r72;\n"
" and.b32 %r74, %r70, %r73;\n"
" mov.u32 %r75, 0;\n"
" setp.eq.s32 %p15, %r74, %r75;\n"
" @%p15 bra $Lt_0_26114;\n"
" .loc 16 238 0\n"
" cvt.s64.s32 %rd40, %r9;\n"
" ld.param.u64 %rd41, [__cudaparm_kernel_pair_engv];\n"
" mul.wide.s32 %rd42, %r9, 4;\n"
" add.u64 %rd43, %rd41, %rd42;\n"
" ld.param.s32 %r76, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r77, 0;\n"
" setp.le.s32 %p16, %r76, %r77;\n"
" @%p16 bra $Lt_0_26626;\n"
" .loc 16 240 0\n"
" mov.f32 %f122, 0f00000000; \n"
" st.global.f32 [%rd43+0], %f122;\n"
" .loc 16 241 0\n"
" cvt.s64.s32 %rd44, %r10;\n"
" mul.wide.s32 %rd45, %r10, 4;\n"
" add.u64 %rd46, %rd45, %rd43;\n"
" .loc 16 242 0\n"
" st.global.f32 [%rd46+0], %f33;\n"
" .loc 16 243 0\n"
" add.u64 %rd43, %rd45, %rd46;\n"
"$Lt_0_26626:\n"
" ld.param.s32 %r78, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r79, 0;\n"
" setp.le.s32 %p17, %r78, %r79;\n"
" @%p17 bra $Lt_0_27138;\n"
" .loc 16 247 0\n"
" mov.f32 %f123, %f6;\n"
" st.global.f32 [%rd43+0], %f123;\n"
" .loc 16 248 0\n"
" cvt.s64.s32 %rd47, %r10;\n"
" mul.wide.s32 %rd48, %r10, 4;\n"
" add.u64 %rd49, %rd48, %rd43;\n"
" .loc 16 247 0\n"
" mov.f32 %f124, %f8;\n"
" st.global.f32 [%rd49+0], %f124;\n"
" .loc 16 248 0\n"
" add.u64 %rd50, %rd48, %rd49;\n"
" .loc 16 247 0\n"
" mov.f32 %f125, %f10;\n"
" st.global.f32 [%rd50+0], %f125;\n"
" .loc 16 248 0\n"
" add.u64 %rd51, %rd48, %rd50;\n"
" .loc 16 247 0\n"
" mov.f32 %f126, %f12;\n"
" st.global.f32 [%rd51+0], %f126;\n"
" .loc 16 248 0\n"
" add.u64 %rd43, %rd48, %rd51;\n"
" .loc 16 247 0\n"
" mov.f32 %f127, %f14;\n"
" st.global.f32 [%rd43+0], %f127;\n"
" mov.f32 %f128, %f16;\n"
" add.u64 %rd52, %rd48, %rd43;\n"
" st.global.f32 [%rd52+0], %f128;\n"
"$Lt_0_27138:\n"
" .loc 16 251 0\n"
" ld.param.u64 %rd53, [__cudaparm_kernel_pair_ans];\n"
" mul.lo.u64 %rd54, %rd40, 16;\n"
" add.u64 %rd55, %rd53, %rd54;\n"
" mov.f32 %f129, %f130;\n"
" st.global.v4.f32 [%rd55+0], {%f32,%f31,%f30,%f129};\n"
"$Lt_0_26114:\n"
" .loc 16 253 0\n"
" exit;\n"
"$LDWend_kernel_pair:\n"
" }\n"
" .entry kernel_pair_fast (\n"
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_sp_cl_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
" .param .u64 __cudaparm_kernel_pair_fast_engv,\n"
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
" .param .u64 __cudaparm_kernel_pair_fast_q_,\n"
" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n"
" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n"
" .param .f32 __cudaparm_kernel_pair_fast_g_ewald,\n"
" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
" {\n"
" .reg .u32 %r<82>;\n"
" .reg .u64 %rd<61>;\n"
" .reg .f32 %f<129>;\n"
" .reg .pred %p<20>;\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32653_33_non_const_sp_cl3304[16];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32740_35_non_const_red_acc3320[3072];\n"
" .loc 16 263 0\n"
"$LDWbegin_kernel_pair_fast:\n"
" cvt.s32.u32 %r1, %tid.x;\n"
" mov.u32 %r2, 3;\n"
" setp.gt.s32 %p1, %r1, %r2;\n"
" @%p1 bra $Lt_1_19714;\n"
" .loc 16 271 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32653_33_non_const_sp_cl3304;\n"
" cvt.s64.s32 %rd2, %r1;\n"
" mul.wide.s32 %rd3, %r1, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_cl_in];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.f32 %f1, [%rd5+0];\n"
" add.u64 %rd6, %rd3, %rd1;\n"
" st.shared.f32 [%rd6+0], %f1;\n"
"$Lt_1_19714:\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32653_33_non_const_sp_cl3304;\n"
" .loc 16 280 0\n"
" mov.f32 %f2, 0f00000000; \n"
" mov.f32 %f3, %f2;\n"
" mov.f32 %f4, 0f00000000; \n"
" mov.f32 %f5, %f4;\n"
" mov.f32 %f6, 0f00000000; \n"
" mov.f32 %f7, %f6;\n"
" mov.f32 %f8, 0f00000000; \n"
" mov.f32 %f9, %f8;\n"
" mov.f32 %f10, 0f00000000; \n"
" mov.f32 %f11, %f10;\n"
" mov.f32 %f12, 0f00000000; \n"
" mov.f32 %f13, %f12;\n"
" .loc 16 282 0\n"
" bar.sync 0;\n"
" ld.param.s32 %r3, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
" div.s32 %r4, %r1, %r3;\n"
" cvt.s32.u32 %r5, %ntid.x;\n"
" div.s32 %r6, %r5, %r3;\n"
" rem.s32 %r7, %r1, %r3;\n"
" cvt.s32.u32 %r8, %ctaid.x;\n"
" mul.lo.s32 %r9, %r8, %r6;\n"
" add.s32 %r10, %r4, %r9;\n"
" ld.param.s32 %r11, [__cudaparm_kernel_pair_fast_inum];\n"
" setp.lt.s32 %p2, %r10, %r11;\n"
" @!%p2 bra $Lt_1_20482;\n"
" .loc 16 286 0\n"
" cvt.s64.s32 %rd7, %r10;\n"
" mul.wide.s32 %rd8, %r10, 4;\n"
" ld.param.u64 %rd9, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
" add.u64 %rd10, %rd8, %rd9;\n"
" ld.global.s32 %r12, [%rd10+0];\n"
" .loc 16 288 0\n"
" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
" cvt.s64.s32 %rd11, %r13;\n"
" mul.wide.s32 %rd12, %r13, 4;\n"
" add.u64 %rd13, %rd12, %rd10;\n"
" ld.global.s32 %r14, [%rd13+0];\n"
" add.u64 %rd14, %rd12, %rd13;\n"
" ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast_dev_packed];\n"
" setp.ne.u64 %p3, %rd15, %rd9;\n"
" @%p3 bra $Lt_1_20994;\n"
" .loc 16 294 0\n"
" cvt.s32.s64 %r15, %rd11;\n"
" mul.lo.s32 %r16, %r15, %r14;\n"
" cvt.s64.s32 %rd16, %r16;\n"
" mul.wide.s32 %rd17, %r16, 4;\n"
" add.u64 %rd18, %rd14, %rd17;\n"
" .loc 16 295 0\n"
" mul.lo.s32 %r17, %r7, %r15;\n"
" cvt.s64.s32 %rd19, %r17;\n"
" mul.wide.s32 %rd20, %r17, 4;\n"
" add.u64 %rd21, %rd14, %rd20;\n"
" .loc 16 296 0\n"
" mul.lo.s32 %r18, %r15, %r3;\n"
" bra.uni $Lt_1_20738;\n"
"$Lt_1_20994:\n"
" .loc 16 298 0\n"
" ld.global.s32 %r19, [%rd14+0];\n"
" cvt.s64.s32 %rd22, %r19;\n"
" mul.wide.s32 %rd23, %r19, 4;\n"
" add.u64 %rd24, %rd15, %rd23;\n"
" .loc 16 299 0\n"
" cvt.s64.s32 %rd25, %r14;\n"
" mul.wide.s32 %rd26, %r14, 4;\n"
" add.u64 %rd18, %rd24, %rd26;\n"
" .loc 16 300 0\n"
" mov.s32 %r18, %r3;\n"
" .loc 16 301 0\n"
" cvt.s64.s32 %rd27, %r7;\n"
" mul.wide.s32 %rd28, %r7, 4;\n"
" add.u64 %rd21, %rd24, %rd28;\n"
"$Lt_1_20738:\n"
" .loc 16 304 0\n"
" mov.u32 %r20, %r12;\n"
" mov.s32 %r21, 0;\n"
" mov.u32 %r22, %r21;\n"
" mov.s32 %r23, 0;\n"
" mov.u32 %r24, %r23;\n"
" mov.s32 %r25, 0;\n"
" mov.u32 %r26, %r25;\n"
" tex.1d.v4.f32.s32 {%f14,%f15,%f16,%f17},[pos_tex,{%r20,%r22,%r24,%r26}];\n"
" mov.f32 %f18, %f14;\n"
" mov.f32 %f19, %f15;\n"
" mov.f32 %f20, %f16;\n"
" .loc 16 305 0\n"
" mov.u32 %r27, %r12;\n"
" mov.s32 %r28, 0;\n"
" mov.u32 %r29, %r28;\n"
" mov.s32 %r30, 0;\n"
" mov.u32 %r31, %r30;\n"
" mov.s32 %r32, 0;\n"
" mov.u32 %r33, %r32;\n"
" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[q_tex,{%r27,%r29,%r31,%r33}];\n"
" mov.f32 %f25, %f21;\n"
" setp.ge.u64 %p4, %rd21, %rd18;\n"
" @%p4 bra $Lt_1_28930;\n"
" cvt.s64.s32 %rd29, %r18;\n"
" ld.param.f32 %f26, [__cudaparm_kernel_pair_fast_cut_coulsq];\n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" mov.f32 %f29, 0f00000000; \n"
" mov.f32 %f30, 0f00000000; \n"
"$Lt_1_21762:\n"
" .loc 16 308 0\n"
" ld.global.s32 %r34, [%rd21+0];\n"
" .loc 16 311 0\n"
" mov.f32 %f31, 0f3f800000; \n"
" shr.s32 %r35, %r34, 30;\n"
" and.b32 %r36, %r35, 3;\n"
" cvt.s64.s32 %rd30, %r36;\n"
" mul.wide.s32 %rd31, %r36, 4;\n"
" add.u64 %rd32, %rd1, %rd31;\n"
" ld.shared.f32 %f32, [%rd32+0];\n"
" sub.ftz.f32 %f33, %f31, %f32;\n"
" .loc 16 314 0\n"
" and.b32 %r37, %r34, 1073741823;\n"
" mov.u32 %r38, %r37;\n"
" mov.s32 %r39, 0;\n"
" mov.u32 %r40, %r39;\n"
" mov.s32 %r41, 0;\n"
" mov.u32 %r42, %r41;\n"
" mov.s32 %r43, 0;\n"
" mov.u32 %r44, %r43;\n"
" tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r38,%r40,%r42,%r44}];\n"
" mov.f32 %f38, %f34;\n"
" mov.f32 %f39, %f35;\n"
" mov.f32 %f40, %f36;\n"
" sub.ftz.f32 %f41, %f19, %f39;\n"
" sub.ftz.f32 %f42, %f18, %f38;\n"
" sub.ftz.f32 %f43, %f20, %f40;\n"
" mul.ftz.f32 %f44, %f41, %f41;\n"
" fma.rn.ftz.f32 %f45, %f42, %f42, %f44;\n"
" fma.rn.ftz.f32 %f46, %f43, %f43, %f45;\n"
" setp.lt.ftz.f32 %p5, %f46, %f26;\n"
" @!%p5 bra $Lt_1_22530;\n"
" .loc 16 330 0\n"
" sqrt.approx.ftz.f32 %f47, %f46;\n"
" ld.param.f32 %f48, [__cudaparm_kernel_pair_fast_g_ewald];\n"
" mul.ftz.f32 %f49, %f48, %f47;\n"
" mul.ftz.f32 %f50, %f49, %f49;\n"
" mov.f32 %f51, 0f3f800000; \n"
" mov.f32 %f52, 0f3ea7ba05; \n"
" fma.rn.ftz.f32 %f53, %f52, %f49, %f51;\n"
" neg.ftz.f32 %f54, %f50;\n"
" rcp.approx.ftz.f32 %f55, %f53;\n"
" mov.f32 %f56, 0f3fb8aa3b; \n"
" mul.ftz.f32 %f57, %f54, %f56;\n"
" ex2.approx.ftz.f32 %f58, %f57;\n"
" mov.f32 %f59, 0f3e827906; \n"
" mov.f32 %f60, 0fbe91a98e; \n"
" mov.f32 %f61, 0f3fb5f0e3; \n"
" mov.f32 %f62, 0fbfba00e3; \n"
" mov.f32 %f63, 0f3f87dc22; \n"
" fma.rn.ftz.f32 %f64, %f63, %f55, %f62;\n"
" fma.rn.ftz.f32 %f65, %f55, %f64, %f61;\n"
" fma.rn.ftz.f32 %f66, %f55, %f65, %f60;\n"
" fma.rn.ftz.f32 %f67, %f55, %f66, %f59;\n"
" mul.ftz.f32 %f68, %f55, %f67;\n"
" mul.ftz.f32 %f69, %f58, %f68;\n"
" .loc 16 331 0\n"
" mov.u32 %r45, %r37;\n"
" mov.s32 %r46, 0;\n"
" mov.u32 %r47, %r46;\n"
" mov.s32 %r48, 0;\n"
" mov.u32 %r49, %r48;\n"
" mov.s32 %r50, 0;\n"
" mov.u32 %r51, %r50;\n"
" tex.1d.v4.f32.s32 {%f70,%f71,%f72,%f73},[q_tex,{%r45,%r47,%r49,%r51}];\n"
" mov.f32 %f74, %f70;\n"
" .loc 16 332 0\n"
" ld.param.f32 %f75, [__cudaparm_kernel_pair_fast_qqrd2e];\n"
" mul.ftz.f32 %f76, %f75, %f25;\n"
" mul.ftz.f32 %f77, %f76, %f74;\n"
" div.approx.ftz.f32 %f78, %f77, %f47;\n"
" mov.f32 %f79, 0f3f906ebb; \n"
" mul.ftz.f32 %f80, %f49, %f79;\n"
" fma.rn.ftz.f32 %f81, %f58, %f80, %f69;\n"
" sub.ftz.f32 %f82, %f81, %f33;\n"
" mul.ftz.f32 %f83, %f78, %f82;\n"
" rcp.approx.ftz.f32 %f84, %f46;\n"
" mul.ftz.f32 %f85, %f83, %f84;\n"
" .loc 16 334 0\n"
" fma.rn.ftz.f32 %f29, %f42, %f85, %f29;\n"
" .loc 16 335 0\n"
" fma.rn.ftz.f32 %f28, %f41, %f85, %f28;\n"
" .loc 16 336 0\n"
" fma.rn.ftz.f32 %f27, %f43, %f85, %f27;\n"
" .loc 16 323 0\n"
" sub.ftz.f32 %f86, %f69, %f33;\n"
" fma.rn.ftz.f32 %f87, %f78, %f86, %f30;\n"
" ld.param.s32 %r52, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.s32 %r53, 0;\n"
" setp.gt.s32 %p6, %r52, %r53;\n"
" selp.f32 %f30, %f87, %f30, %p6;\n"
" ld.param.s32 %r54, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r55, 0;\n"
" setp.le.s32 %p7, %r54, %r55;\n"
" @%p7 bra $Lt_1_22530;\n"
" .loc 16 342 0\n"
" mov.f32 %f88, %f3;\n"
" mul.ftz.f32 %f89, %f42, %f42;\n"
" fma.rn.ftz.f32 %f90, %f85, %f89, %f88;\n"
" mov.f32 %f3, %f90;\n"
" .loc 16 343 0\n"
" mov.f32 %f91, %f5;\n"
" fma.rn.ftz.f32 %f92, %f85, %f44, %f91;\n"
" mov.f32 %f5, %f92;\n"
" .loc 16 344 0\n"
" mov.f32 %f93, %f7;\n"
" mul.ftz.f32 %f94, %f43, %f43;\n"
" fma.rn.ftz.f32 %f95, %f85, %f94, %f93;\n"
" mov.f32 %f7, %f95;\n"
" .loc 16 345 0\n"
" mov.f32 %f96, %f9;\n"
" mul.ftz.f32 %f97, %f41, %f42;\n"
" fma.rn.ftz.f32 %f98, %f85, %f97, %f96;\n"
" mov.f32 %f9, %f98;\n"
" .loc 16 346 0\n"
" mov.f32 %f99, %f11;\n"
" mul.ftz.f32 %f100, %f42, %f43;\n"
" fma.rn.ftz.f32 %f101, %f85, %f100, %f99;\n"
" mov.f32 %f11, %f101;\n"
" .loc 16 347 0\n"
" mul.ftz.f32 %f102, %f41, %f43;\n"
" fma.rn.ftz.f32 %f12, %f85, %f102, %f12;\n"
" mov.f32 %f13, %f12;\n"
"$Lt_1_22530:\n"
"$Lt_1_22018:\n"
" .loc 16 307 0\n"
" mul.lo.u64 %rd33, %rd29, 4;\n"
" add.u64 %rd21, %rd21, %rd33;\n"
" setp.lt.u64 %p8, %rd21, %rd18;\n"
" @%p8 bra $Lt_1_21762;\n"
" bra.uni $Lt_1_20226;\n"
"$Lt_1_28930:\n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" mov.f32 %f29, 0f00000000; \n"
" mov.f32 %f30, 0f00000000; \n"
" bra.uni $Lt_1_20226;\n"
"$Lt_1_20482:\n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" mov.f32 %f29, 0f00000000; \n"
" mov.f32 %f30, 0f00000000; \n"
"$Lt_1_20226:\n"
" mov.u32 %r56, 1;\n"
" setp.le.s32 %p9, %r3, %r56;\n"
" @%p9 bra $Lt_1_25346;\n"
" .loc 16 358 0\n"
" mov.u64 %rd34, __cuda___cuda_local_var_32740_35_non_const_red_acc3320;\n"
" cvt.s64.s32 %rd35, %r1;\n"
" mul.wide.s32 %rd36, %r1, 4;\n"
" add.u64 %rd37, %rd34, %rd36;\n"
" mov.f32 %f103, %f29;\n"
" st.shared.f32 [%rd37+0], %f103;\n"
" .loc 16 359 0\n"
" mov.f32 %f104, %f28;\n"
" st.shared.f32 [%rd37+512], %f104;\n"
" .loc 16 360 0\n"
" mov.f32 %f105, %f27;\n"
" st.shared.f32 [%rd37+1024], %f105;\n"
" .loc 16 361 0\n"
" mov.f32 %f106, %f30;\n"
" st.shared.f32 [%rd37+1536], %f106;\n"
" .loc 16 363 0\n"
" shr.s32 %r57, %r3, 31;\n"
" mov.s32 %r58, 1;\n"
" and.b32 %r59, %r57, %r58;\n"
" add.s32 %r60, %r59, %r3;\n"
" shr.s32 %r61, %r60, 1;\n"
" mov.s32 %r62, %r61;\n"
" mov.u32 %r63, 0;\n"
" setp.ne.u32 %p10, %r61, %r63;\n"
" @!%p10 bra $Lt_1_23810;\n"
"$Lt_1_24322:\n"
" setp.ge.u32 %p11, %r7, %r62;\n"
" @%p11 bra $Lt_1_24578;\n"
" .loc 16 366 0\n"
" add.u32 %r64, %r1, %r62;\n"
" cvt.u64.u32 %rd38, %r64;\n"
" mul.wide.u32 %rd39, %r64, 4;\n"
" add.u64 %rd40, %rd34, %rd39;\n"
" ld.shared.f32 %f107, [%rd40+0];\n"
" add.ftz.f32 %f103, %f107, %f103;\n"
" st.shared.f32 [%rd37+0], %f103;\n"
" ld.shared.f32 %f108, [%rd40+512];\n"
" add.ftz.f32 %f104, %f108, %f104;\n"
" st.shared.f32 [%rd37+512], %f104;\n"
" ld.shared.f32 %f109, [%rd40+1024];\n"
" add.ftz.f32 %f105, %f109, %f105;\n"
" st.shared.f32 [%rd37+1024], %f105;\n"
" ld.shared.f32 %f110, [%rd40+1536];\n"
" add.ftz.f32 %f106, %f110, %f106;\n"
" st.shared.f32 [%rd37+1536], %f106;\n"
"$Lt_1_24578:\n"
" .loc 16 363 0\n"
" shr.u32 %r62, %r62, 1;\n"
" mov.u32 %r65, 0;\n"
" setp.ne.u32 %p12, %r62, %r65;\n"
" @%p12 bra $Lt_1_24322;\n"
"$Lt_1_23810:\n"
" .loc 16 370 0\n"
" mov.f32 %f29, %f103;\n"
" .loc 16 371 0\n"
" mov.f32 %f28, %f104;\n"
" .loc 16 372 0\n"
" mov.f32 %f27, %f105;\n"
" .loc 16 373 0\n"
" mov.f32 %f30, %f106;\n"
" ld.param.s32 %r66, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r67, 0;\n"
" setp.le.s32 %p13, %r66, %r67;\n"
" @%p13 bra $Lt_1_25346;\n"
" .loc 16 377 0\n"
" mov.f32 %f103, %f3;\n"
" st.shared.f32 [%rd37+0], %f103;\n"
" mov.f32 %f104, %f5;\n"
" st.shared.f32 [%rd37+512], %f104;\n"
" mov.f32 %f105, %f7;\n"
" st.shared.f32 [%rd37+1024], %f105;\n"
" mov.f32 %f106, %f9;\n"
" st.shared.f32 [%rd37+1536], %f106;\n"
" mov.f32 %f111, %f11;\n"
" st.shared.f32 [%rd37+2048], %f111;\n"
" mov.f32 %f112, %f13;\n"
" st.shared.f32 [%rd37+2560], %f112;\n"
" .loc 16 379 0\n"
" mov.s32 %r68, %r61;\n"
" @!%p10 bra $Lt_1_25858;\n"
"$Lt_1_26370:\n"
" setp.ge.u32 %p14, %r7, %r68;\n"
" @%p14 bra $Lt_1_26626;\n"
" .loc 16 382 0\n"
" add.u32 %r69, %r1, %r68;\n"
" cvt.u64.u32 %rd41, %r69;\n"
" mul.wide.u32 %rd42, %r69, 4;\n"
" add.u64 %rd43, %rd34, %rd42;\n"
" ld.shared.f32 %f113, [%rd43+0];\n"
" add.ftz.f32 %f103, %f113, %f103;\n"
" st.shared.f32 [%rd37+0], %f103;\n"
" ld.shared.f32 %f114, [%rd43+512];\n"
" add.ftz.f32 %f104, %f114, %f104;\n"
" st.shared.f32 [%rd37+512], %f104;\n"
" ld.shared.f32 %f115, [%rd43+1024];\n"
" add.ftz.f32 %f105, %f115, %f105;\n"
" st.shared.f32 [%rd37+1024], %f105;\n"
" ld.shared.f32 %f116, [%rd43+1536];\n"
" add.ftz.f32 %f106, %f116, %f106;\n"
" st.shared.f32 [%rd37+1536], %f106;\n"
" ld.shared.f32 %f117, [%rd43+2048];\n"
" add.ftz.f32 %f111, %f117, %f111;\n"
" st.shared.f32 [%rd37+2048], %f111;\n"
" ld.shared.f32 %f118, [%rd43+2560];\n"
" add.ftz.f32 %f112, %f118, %f112;\n"
" st.shared.f32 [%rd37+2560], %f112;\n"
"$Lt_1_26626:\n"
" .loc 16 379 0\n"
" shr.u32 %r68, %r68, 1;\n"
" mov.u32 %r70, 0;\n"
" setp.ne.u32 %p15, %r68, %r70;\n"
" @%p15 bra $Lt_1_26370;\n"
"$Lt_1_25858:\n"
" .loc 16 387 0\n"
" mov.f32 %f3, %f103;\n"
" mov.f32 %f5, %f104;\n"
" mov.f32 %f7, %f105;\n"
" mov.f32 %f9, %f106;\n"
" mov.f32 %f11, %f111;\n"
" mov.f32 %f13, %f112;\n"
"$Lt_1_25346:\n"
"$Lt_1_23298:\n"
" selp.s32 %r71, 1, 0, %p2;\n"
" mov.s32 %r72, 0;\n"
" set.eq.u32.s32 %r73, %r7, %r72;\n"
" neg.s32 %r74, %r73;\n"
" and.b32 %r75, %r71, %r74;\n"
" mov.u32 %r76, 0;\n"
" setp.eq.s32 %p16, %r75, %r76;\n"
" @%p16 bra $Lt_1_27394;\n"
" .loc 16 393 0\n"
" cvt.s64.s32 %rd44, %r10;\n"
" ld.param.u64 %rd45, [__cudaparm_kernel_pair_fast_engv];\n"
" mul.wide.s32 %rd46, %r10, 4;\n"
" add.u64 %rd47, %rd45, %rd46;\n"
" ld.param.s32 %r77, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r78, 0;\n"
" setp.le.s32 %p17, %r77, %r78;\n"
" @%p17 bra $Lt_1_27906;\n"
" .loc 16 395 0\n"
" mov.f32 %f119, 0f00000000; \n"
" st.global.f32 [%rd47+0], %f119;\n"
" .loc 16 396 0\n"
" cvt.s64.s32 %rd48, %r11;\n"
" mul.wide.s32 %rd49, %r11, 4;\n"
" add.u64 %rd50, %rd49, %rd47;\n"
" .loc 16 397 0\n"
" st.global.f32 [%rd50+0], %f30;\n"
" .loc 16 398 0\n"
" add.u64 %rd47, %rd49, %rd50;\n"
"$Lt_1_27906:\n"
" ld.param.s32 %r79, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r80, 0;\n"
" setp.le.s32 %p18, %r79, %r80;\n"
" @%p18 bra $Lt_1_28418;\n"
" .loc 16 402 0\n"
" mov.f32 %f120, %f3;\n"
" st.global.f32 [%rd47+0], %f120;\n"
" .loc 16 403 0\n"
" cvt.s64.s32 %rd51, %r11;\n"
" mul.wide.s32 %rd52, %r11, 4;\n"
" add.u64 %rd53, %rd52, %rd47;\n"
" .loc 16 402 0\n"
" mov.f32 %f121, %f5;\n"
" st.global.f32 [%rd53+0], %f121;\n"
" .loc 16 403 0\n"
" add.u64 %rd54, %rd52, %rd53;\n"
" .loc 16 402 0\n"
" mov.f32 %f122, %f7;\n"
" st.global.f32 [%rd54+0], %f122;\n"
" .loc 16 403 0\n"
" add.u64 %rd55, %rd52, %rd54;\n"
" .loc 16 402 0\n"
" mov.f32 %f123, %f9;\n"
" st.global.f32 [%rd55+0], %f123;\n"
" .loc 16 403 0\n"
" add.u64 %rd47, %rd52, %rd55;\n"
" .loc 16 402 0\n"
" mov.f32 %f124, %f11;\n"
" st.global.f32 [%rd47+0], %f124;\n"
" mov.f32 %f125, %f13;\n"
" add.u64 %rd56, %rd52, %rd47;\n"
" st.global.f32 [%rd56+0], %f125;\n"
"$Lt_1_28418:\n"
" .loc 16 406 0\n"
" ld.param.u64 %rd57, [__cudaparm_kernel_pair_fast_ans];\n"
" mul.lo.u64 %rd58, %rd44, 16;\n"
" add.u64 %rd59, %rd57, %rd58;\n"
" mov.f32 %f126, %f127;\n"
" st.global.v4.f32 [%rd59+0], {%f29,%f28,%f27,%f126};\n"
"$Lt_1_27394:\n"
" .loc 16 408 0\n"
" exit;\n"
"$LDWend_kernel_pair_fast:\n"
" }\n"
;

1288
lib/gpu/crml_gpu_kernel.ptx Normal file

File diff suppressed because it is too large Load Diff

1227
lib/gpu/crml_gpu_ptx.h Normal file

File diff suppressed because it is too large Load Diff

406
lib/gpu/lal_answer.cpp Normal file
View File

@ -0,0 +1,406 @@
/***************************************************************************
answer.cpp
-------------------
W. Michael Brown (ORNL)
Class for data management of forces, torques, energies, and virials
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include "lal_answer.h"
using namespace LAMMPS_AL;
#define AnswerT Answer<numtyp,acctyp>
template <class numtyp, class acctyp>
AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false),
_inum(0),_ilist(NULL),_newton(false) {
}
template <class numtyp, class acctyp>
int AnswerT::bytes_per_atom() const {
int bytes=11*sizeof(acctyp);
if (_rot)
bytes+=4*sizeof(acctyp);
if (_charge)
bytes+=sizeof(acctyp);
return bytes;
}
template <class numtyp, class acctyp>
bool AnswerT::alloc(const int inum) {
_max_local=static_cast<int>(static_cast<double>(inum)*1.10);
bool success=true;
int ans_elements=4;
if (_rot)
ans_elements+=4;
// Use host-memory views (skipping host/device transfers) when the device is the CPU
bool cpuview=false;
if (dev->device_type()==UCL_CPU)
cpuview=true;
// -------------------------- Host allocations
success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
// --------------------------- Device allocations
if (cpuview) {
dev_engv.view(host_engv);
dev_ans.view(host_ans);
} else {
success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (dev_ans.alloc(ans_elements*_max_local,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
}
_gpu_bytes=dev_engv.row_bytes()+dev_ans.row_bytes();
_allocated=true;
return success;
}
template <class numtyp, class acctyp>
bool AnswerT::init(const int inum, const bool charge, const bool rot,
UCL_Device &devi) {
clear();
bool success=true;
_charge=charge;
_rot=rot;
_other=_charge || _rot;
dev=&devi;
_e_fields=1;
if (_charge)
_e_fields++;
_ev_fields=6+_e_fields;
// Size the per-atom answer storage (use a minimum allocation if inum is zero)
int ef_inum=inum;
if (ef_inum==0)
ef_inum=1000;
// Initialize timers for the selected device
time_answer.init(*dev);
time_answer.zero();
_time_cast=0.0;
_time_cpu_idle=0.0;
return success && alloc(ef_inum);
}
template <class numtyp, class acctyp>
bool AnswerT::add_fields(const bool charge, const bool rot) {
bool realloc=false;
if (charge && _charge==false) {
_charge=true;
_e_fields++;
_ev_fields++;
realloc=true;
}
if (rot && _rot==false) {
_rot=true;
realloc=true;
}
if (realloc) {
_other=_charge || _rot;
int inum=_max_local;
clear_resize();
return alloc(inum);
}
return true;
}
template <class numtyp, class acctyp>
void AnswerT::clear_resize() {
if (!_allocated)
return;
_allocated=false;
dev_ans.clear();
dev_engv.clear();
host_ans.clear();
host_engv.clear();
}
template <class numtyp, class acctyp>
void AnswerT::clear() {
_gpu_bytes=0;
if (!_allocated)
return;
time_answer.clear();
clear_resize();
_inum=0;
_ilist=NULL;
_eflag=false;
_vflag=false;
}
template <class numtyp, class acctyp>
double AnswerT::host_memory_usage() const {
int atom_bytes=4;
if (_charge)
atom_bytes+=1;
if (_rot)
atom_bytes+=4;
int ans_bytes=atom_bytes+_ev_fields;
return ans_bytes*(_max_local)*sizeof(acctyp)+
sizeof(Answer<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void AnswerT::copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom) {
time_answer.start();
_eflag=eflag;
_vflag=vflag;
_ef_atom=ef_atom;
_vf_atom=vf_atom;
int csize=_ev_fields;
if (!eflag)
csize-=_e_fields;
if (!vflag)
csize-=6;
if (csize>0)
ucl_copy(host_engv,dev_engv,_inum*csize,true);
if (_rot)
ucl_copy(host_ans,dev_ans,_inum*4*2,true);
else
ucl_copy(host_ans,dev_ans,_inum*4,true);
time_answer.stop();
}
template <class numtyp, class acctyp>
void AnswerT::copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom,
int *ilist) {
_ilist=ilist;
copy_answers(eflag,vflag,ef_atom,vf_atom);
}
template <class numtyp, class acctyp>
double AnswerT::energy_virial(double *eatom, double **vatom,
double *virial) {
if (_eflag==false && _vflag==false)
return 0.0;
double evdwl=0.0;
double virial_acc[6];
for (int i=0; i<6; i++) virial_acc[i]=0.0;
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
} else {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
}
evdwl*=0.5;
return evdwl;
}
template <class numtyp, class acctyp>
double AnswerT::energy_virial(double *eatom, double **vatom,
double *virial, double &ecoul) {
if (_eflag==false && _vflag==false)
return 0.0;
if (_charge==false)
return energy_virial(eatom,vatom,virial);
double evdwl=0.0;
double _ecoul=0.0;
double virial_acc[6];
for (int i=0; i<6; i++) virial_acc[i]=0.0;
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
_ecoul+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
_ecoul+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
} else {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
_ecoul+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
_ecoul+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=*ap*0.5;
virial_acc[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial_acc[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]+=virial_acc[j]*0.5;
}
evdwl*=0.5;
ecoul+=_ecoul*0.5;
return evdwl;
}
template <class numtyp, class acctyp>
void AnswerT::get_answers(double **f, double **tor) {
acctyp *ap=host_ans.begin();
if (_ilist==NULL) {
for (int i=0; i<_inum; i++) {
f[i][0]+=*ap;
ap++;
f[i][1]+=*ap;
ap++;
f[i][2]+=*ap;
ap+=2;
}
if (_rot) {
for (int i=0; i<_inum; i++) {
tor[i][0]+=*ap;
ap++;
tor[i][1]+=*ap;
ap++;
tor[i][2]+=*ap;
ap+=2;
}
}
} else {
for (int i=0; i<_inum; i++) {
int ii=_ilist[i];
f[ii][0]+=*ap;
ap++;
f[ii][1]+=*ap;
ap++;
f[ii][2]+=*ap;
ap+=2;
}
if (_rot) {
for (int i=0; i<_inum; i++) {
int ii=_ilist[i];
tor[ii][0]+=*ap;
ap++;
tor[ii][1]+=*ap;
ap++;
tor[ii][2]+=*ap;
ap+=2;
}
}
}
}
template class Answer<PRECISION,ACC_PRECISION>;
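A minimal usage sketch of the Answer lifecycle above, assuming a configured UCL_Device `dev`; `nlocal`, the eflag/vflag/eatom_flag/vatom_flag ints, `eatom`, `vatom`, `virial`, `f`, `tor`, and `ilist` are assumed caller-owned LAMMPS state, and the flow is illustrative only, not part of this file:
// Hypothetical driver: allocate once, then per step let the kernels fill
// dev_ans/dev_engv, copy back asynchronously, and unpack on the host.
Answer<PRECISION,ACC_PRECISION> ans;
bool ok=ans.init(nlocal,/*charge=*/true,/*rot=*/false,dev);
ans.inum(nlocal);                          // answers the kernels will write
// ... pair kernels fill ans.dev_ans and ans.dev_engv ...
ans.copy_answers(eflag,vflag,eatom_flag,vatom_flag,ilist); // async D->H copy
double ecoul=0.0;
double evdwl=ans.energy_virial(eatom,vatom,virial,ecoul); // halves pair double counts
ans.get_answers(f,tor);                    // accumulate forces (torques if rot)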

169
lib/gpu/lal_answer.h Normal file
View File

@ -0,0 +1,169 @@
/***************************************************************************
answer.h
-------------------
W. Michael Brown (ORNL)
Class for data management of forces, torques, energies, and virials
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_ANSWER_H
#define LAL_ANSWER_H
#include <math.h>
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
using namespace ucl_opencl;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
using namespace ucl_cudadr;
#endif
#include "lal_precision.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Answer {
public:
Answer();
~Answer() { clear(); }
/// Current number of local atoms stored
inline int inum() const { return _inum; }
/// Set number of local atoms for future copy operations
inline void inum(const int n) { _inum=n; }
/// Memory usage per atom in this class
int bytes_per_atom() const;
/// Clear any previous data and set up for a new LAMMPS run
/** \param rot True if atom storage needs quaternions **/
bool init(const int inum, const bool charge, const bool rot, UCL_Device &dev);
/// Check if we have enough device storage and realloc if not
inline void resize(const int inum, bool &success) {
_inum=inum;
if (inum>_max_local) {
clear_resize();
success = success && alloc(inum);
}
}
/// If already initialized by another LAMMPS style, add fields as necessary
/** \param rot True if atom storage needs quaternions **/
bool add_fields(const bool charge, const bool rot);
/// Free all memory on host and device needed to realloc for more atoms
void clear_resize();
/// Free all memory on host and device
void clear();
/// Return the total amount of host memory used by class in bytes
double host_memory_usage() const;
/// Add copy times to timers
inline void acc_timers() {
time_answer.add_to_total();
}
/// Zero the copy-time timers
inline void zero_timers() {
time_answer.zero();
}
/// Return the total time for host/device data transfer
inline double transfer_time() {
return time_answer.total_seconds();
}
/// Return the total time for data cast/pack
inline double cast_time() { return _time_cast; }
/// Return number of bytes used on device
inline double gpu_bytes() { return _gpu_bytes; }
// -------------------------COPY FROM GPU -------------------------------
/// Copy answers from device into read buffer asynchronously
void copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom);
/// Copy answers from device into read buffer asynchronously
void copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom, int *ilist);
/// Copy energy and virial data into LAMMPS memory
double energy_virial(double *eatom, double **vatom, double *virial);
/// Copy energy and virial data into LAMMPS memory
double energy_virial(double *eatom, double **vatom, double *virial,
double &ecoul);
/// Add forces and torques from the GPU into a LAMMPS pointer
void get_answers(double **f, double **tor);
inline double get_answers(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
double ta=MPI_Wtime();
time_answer.sync_stop();
_time_cpu_idle+=MPI_Wtime()-ta;
double ts=MPI_Wtime();
double evdw=energy_virial(eatom,vatom,virial,ecoul);
get_answers(f,tor);
_time_cast+=MPI_Wtime()-ts;
return evdw;
}
/// Return the time the CPU was idle waiting for GPU
inline double cpu_idle_time() { return _time_cpu_idle; }
// ------------------------------ DATA ----------------------------------
/// Force and possibly torque
UCL_D_Vec<acctyp> dev_ans;
/// Energy and virial per-atom storage
UCL_D_Vec<acctyp> dev_engv;
/// Force and possibly torque data on host
UCL_H_Vec<acctyp> host_ans;
/// Energy/virial data on host
UCL_H_Vec<acctyp> host_engv;
/// Device timers
UCL_Timer time_answer;
/// Geryon device
UCL_Device *dev;
private:
bool alloc(const int inum);
bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
int _max_local, _inum, _e_fields, _ev_fields;
int *_ilist;
double _time_cast, _time_cpu_idle;
double _gpu_bytes;
bool _newton;
};
}
#endif
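For reference, the per-atom byte count in bytes_per_atom() decomposes as follows (a worked check against alloc() and init() above, comment only):
// 11*sizeof(acctyp) = 4 (force stored as a 4-vector in dev_ans)
//                   + 6 (virial fields in dev_engv)
//                   + 1 (energy field; _e_fields==1 without charge)
// _charge adds 1 more energy field; _rot adds 4 torque components.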

317
lib/gpu/lal_atom.cpp Normal file
View File

@ -0,0 +1,317 @@
/***************************************************************************
atom.cpp
-------------------
W. Michael Brown (ORNL)
Class for particle data management
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include "lal_atom.h"
using namespace LAMMPS_AL;
#define AtomT Atom<numtyp,acctyp>
template <class numtyp, class acctyp>
AtomT::Atom() : _compiled(false),_allocated(false),
_max_gpu_bytes(0) {
#ifndef USE_OPENCL
sort_config.op = CUDPP_ADD;
sort_config.datatype = CUDPP_UINT;
sort_config.algorithm = CUDPP_SORT_RADIX;
sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS;
#endif
}
template <class numtyp, class acctyp>
int AtomT::bytes_per_atom() const {
int id_space=0;
if (_gpu_nbor==1)
id_space=2;
else if (_gpu_nbor==2)
id_space=4;
int bytes=4*sizeof(numtyp)+id_space*sizeof(int);
if (_rot)
bytes+=4*sizeof(numtyp);
if (_charge)
bytes+=sizeof(numtyp);
return bytes;
}
template <class numtyp, class acctyp>
bool AtomT::alloc(const int nall) {
_max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
bool success=true;
// Use host-memory views (skipping host/device transfers) when the device is the CPU
bool cpuview=false;
if (dev->device_type()==UCL_CPU)
cpuview=true;
// Allocate storage for CUDPP sort
#ifndef USE_OPENCL
if (_gpu_nbor==1) {
CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
if (CUDPP_SUCCESS != result)
return false;
}
#endif
// -------------------------- Host allocations
// Get a host write only buffer
#ifdef GPU_CAST
success=success && (host_x_cast.alloc(_max_atoms*3,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
success=success && (host_type_cast.alloc(_max_atoms,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
#else
success=success && (host_x.alloc(_max_atoms*4,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
#endif
// Buffer for casting only if different precisions
if (_charge)
success=success && (host_q.alloc(_max_atoms,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
// Buffer for casting only if different precisions
if (_rot)
success=success && (host_quat.alloc(_max_atoms*4,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
// --------------------------- Device allocations
int gpu_bytes=0;
if (cpuview) {
#ifdef GPU_CAST
assert(0==1); // GPU_CAST does not support CPU-view devices
#else
dev_x.view(host_x);
#endif
if (_rot)
dev_quat.view(host_quat);
if (_charge)
dev_q.view(host_q);
} else {
#ifdef GPU_CAST
success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev));
success=success && (UCL_SUCCESS==
dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
success=success && (UCL_SUCCESS==
dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
#else
success=success && (UCL_SUCCESS==
dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
#endif
if (_charge) {
success=success && (dev_q.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_q.row_bytes();
}
if (_rot) {
success=success && (dev_quat.alloc(_max_atoms*4,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=dev_quat.row_bytes();
}
}
if (_gpu_nbor>0) {
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_particle_id.row_bytes();
if (_bonds) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_tag.row_bytes();
}
if (_gpu_nbor==1) {
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
gpu_bytes+=dev_cell_id.row_bytes();
} else {
success=success && (host_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success &&
(host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
}
}
gpu_bytes+=dev_x.row_bytes();
if (gpu_bytes>_max_gpu_bytes)
_max_gpu_bytes=gpu_bytes;
_allocated=true;
return success;
}
template <class numtyp, class acctyp>
bool AtomT::add_fields(const bool charge, const bool rot,
const int gpu_nbor, const bool bonds) {
bool realloc=false;
if (charge && _charge==false) {
_charge=true;
realloc=true;
}
if (rot && _rot==false) {
_rot=true;
realloc=true;
}
if (gpu_nbor>0 && _gpu_nbor==0) {
_gpu_nbor=gpu_nbor;
realloc=true;
}
if (bonds && _bonds==false) {
_bonds=true;
realloc=true;
}
if (realloc) {
_other=_charge || _rot;
int max_atoms=_max_atoms;
clear_resize();
return alloc(max_atoms);
}
return true;
}
template <class numtyp, class acctyp>
bool AtomT::init(const int nall, const bool charge, const bool rot,
UCL_Device &devi, const int gpu_nbor, const bool bonds) {
clear();
bool success=true;
_x_avail=false;
_q_avail=false;
_quat_avail=false;
_resized=false;
_gpu_nbor=gpu_nbor;
_bonds=bonds;
_charge=charge;
_rot=rot;
_other=_charge || _rot;
dev=&devi;
// Size the atom storage (use a minimum allocation if nall is zero)
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
// Initialize timers for the selected device
time_pos.init(*dev);
time_q.init(*dev);
time_quat.init(*dev);
time_pos.zero();
time_q.zero();
time_quat.zero();
_time_cast=0.0;
#ifdef GPU_CAST
compile_kernels(*dev);
#endif
return success && alloc(ef_nall);
}
template <class numtyp, class acctyp>
void AtomT::clear_resize() {
if (!_allocated)
return;
_allocated=false;
dev_x.clear();
if (_charge) {
dev_q.clear();
host_q.clear();
}
if (_rot) {
dev_quat.clear();
host_quat.clear();
}
#ifndef GPU_CAST
host_x.clear();
#else
host_x_cast.clear();
host_type_cast.clear();
#endif
dev_cell_id.clear();
dev_particle_id.clear();
dev_tag.clear();
#ifdef GPU_CAST
dev_x_cast.clear();
dev_type_cast.clear();
#endif
#ifndef USE_OPENCL
if (_gpu_nbor==1) cudppDestroyPlan(sort_plan);
#endif
if (_gpu_nbor==2) {
host_particle_id.clear();
host_cell_id.clear();
}
}
template <class numtyp, class acctyp>
void AtomT::clear() {
_max_gpu_bytes=0;
if (!_allocated)
return;
time_pos.clear();
time_q.clear();
time_quat.clear();
clear_resize();
#ifdef GPU_CAST
if (_compiled) {
k_cast_x.clear();
delete atom_program;
_compiled=false;
}
#endif
}
template <class numtyp, class acctyp>
double AtomT::host_memory_usage() const {
int atom_bytes=4;
if (_charge)
atom_bytes+=1;
if (_rot)
atom_bytes+=4;
return _max_atoms*atom_bytes*sizeof(numtyp)+
sizeof(Atom<numtyp,acctyp>);
}
// Sort arrays for neighbor list calculation
template <class numtyp, class acctyp>
void AtomT::sort_neighbor(const int num_atoms) {
#ifndef USE_OPENCL
CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(),
(int *)dev_particle_id.begin(),
8*sizeof(unsigned), num_atoms);
if (CUDPP_SUCCESS != result) {
printf("Error in cudppSort\n");
NVD_GERYON_EXIT;
}
#endif
}
#ifdef GPU_CAST
#ifdef USE_OPENCL
#include "atom_cl.h"
#else
#include "atom_ptx.h"
#endif
template <class numtyp, class acctyp>
void AtomT::compile_kernels(UCL_Device &dev) {
std::string flags = "-D"+std::string(OCL_VENDOR);
atom_program=new UCL_Program(dev);
atom_program->load_string(atom,flags);
k_cast_x.set_function(*atom_program,"kernel_cast_x");
_compiled=true;
}
#endif
template class Atom<PRECISION,ACC_PRECISION>;
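Both Atom::alloc and Answer::alloc reserve 10% headroom above the request so small fluctuations in atom counts do not trigger a device realloc every step. A hypothetical helper (grow_capacity is not in the library) making the policy explicit:
// Matches the resize()/alloc() pair: keep the current allocation unless the
// request exceeds it, then grow to 1.10x the request.
inline int grow_capacity(const int requested, const int current_max) {
  if (requested<=current_max) return current_max;
  return static_cast<int>(static_cast<double>(requested)*1.10);
}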

33
lib/gpu/lal_atom.cu Normal file
View File

@ -0,0 +1,33 @@
// **************************************************************************
// atom.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for atom data casting
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
#endif
__kernel void kernel_cast_x(__global numtyp4 *x_type, __global double *x,
__global int *type, const int nall) {
int ii=GLOBAL_ID_X;
if (ii<nall) {
numtyp4 xt;
xt.w=type[ii];
int i=ii*3;
xt.x=x[i];
xt.y=x[i+1];
xt.z=x[i+2];
x_type[ii]=xt;
} // if ii
}
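A host-side C++ analog of the packing kernel_cast_x performs above (a sketch only; numtyp4_h stands in for the device numtyp4 with numtyp=float, and cast_x_host is hypothetical):
// Interleave x,y,z and type into one 4-component element per atom,
// matching the layout of dev_x.
struct numtyp4_h { float x,y,z,w; };          // stand-in for device numtyp4
void cast_x_host(numtyp4_h *x_type, const double *x,
                 const int *type, const int nall) {
  for (int ii=0; ii<nall; ii++) {
    const double *p=x+3*ii;
    x_type[ii].x=static_cast<float>(p[0]);
    x_type[ii].y=static_cast<float>(p[1]);
    x_type[ii].z=static_cast<float>(p[2]);
    x_type[ii].w=static_cast<float>(type[ii]); // type rides in the .w slot
  }
}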

427
lib/gpu/lal_atom.h Normal file
View File

@ -0,0 +1,427 @@
/***************************************************************************
atom.h
-------------------
W. Michael Brown (ORNL)
Class for particle data management
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef PAIR_GPU_ATOM_H
#define PAIR_GPU_ATOM_H
#include <math.h>
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
using namespace ucl_opencl;
#else
#include "cudpp.h"
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
using namespace ucl_cudadr;
#endif
#include "lal_precision.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Atom {
public:
Atom();
~Atom() { clear(); }
/// Maximum number of atoms that can be stored with current allocation
inline int max_atoms() const { return _max_atoms; }
/// Current number of local+ghost atoms stored
inline int nall() const { return _nall; }
/// Set number of local+ghost atoms for future copy operations
inline void nall(const int n) { _nall=n; }
/// Memory usage per atom in this class
int bytes_per_atom() const;
/// Clear any previous data and set up for a new LAMMPS run
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor 0 if neighboring will be performed on host
* gpu_nbor 1 if neighboring will be performed on device
* gpu_nbor 2 if binning on host and neighboring on device **/
bool init(const int nall, const bool charge, const bool rot,
UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false);
/// Check if we have enough device storage and realloc if not
/** Returns true if resized with any call during this timestep **/
inline bool resize(const int nall, bool &success) {
_nall=nall;
if (nall>_max_atoms) {
clear_resize();
success = success && alloc(nall);
_resized=true;
}
return _resized;
}
/// If already initialized by another LAMMPS style, add fields as necessary
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor 0 if neighboring will be performed on host
* gpu_nbor 1 if neighboring will be performed on device
* gpu_nbor 2 if binning on host and neighboring on device **/
bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
const bool bonds);
/// Returns true if GPU is using charges
bool charge() { return _charge; }
/// Returns true if GPU is using quaternions
bool quat() { return _rot; }
/// Only free matrices of length inum or nall for resizing
void clear_resize();
/// Free all memory on host and device
void clear();
/// Return the total amount of host memory used by class in bytes
double host_memory_usage() const;
/// Sort arrays for neighbor list calculation on device
void sort_neighbor(const int num_atoms);
/// Add copy times to timers
inline void acc_timers() {
time_pos.add_to_total();
if (_charge)
time_q.add_to_total();
if (_rot)
time_quat.add_to_total();
}
/// Zero the copy-time timers
inline void zero_timers() {
time_pos.zero();
if (_charge)
time_q.zero();
if (_rot)
time_quat.zero();
}
/// Return the total time for host/device data transfer
/** Zeros the total so that the atom times are only included once **/
inline double transfer_time() {
double total=time_pos.total_seconds();
time_pos.zero_total();
if (_charge) {
total+=time_q.total_seconds();
time_q.zero_total();
}
if (_rot) {
total+=time_quat.total_seconds();
time_quat.zero_total();
}
return total;
}
/// Return the total time for data cast/pack
/** Zeros the time so that atom times are only included once **/
inline double cast_time()
{ double t=_time_cast; _time_cast=0.0; return t; }
/// Pack LAMMPS atom type constants into matrix and copy to device
template <class dev_typ, class t1>
inline void type_pack1(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii]=static_cast<numtyp>(one[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
/// Pack LAMMPS atom type constants into 2 vectors and copy to device
template <class dev_typ, class t1, class t2>
inline void type_pack2(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one, t2 **two) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii*2]=static_cast<numtyp>(one[i][j]);
buffer[ii*2+1]=static_cast<numtyp>(two[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
/// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device
template <class dev_typ, class t1, class t2, class t3>
inline void type_pack4(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one, t2 **two, t3 **three) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii*4]=static_cast<numtyp>(one[i][j]);
buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
/// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
template <class dev_typ, class t1, class t2, class t3, class t4>
inline void type_pack4(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one, t2 **two, t3 **three, t4 **four) {
int ii=0;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
buffer[ii*4]=static_cast<numtyp>(one[i][j]);
buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
buffer[ii*4+3]=static_cast<numtyp>(four[i][j]);
ii++;
}
ii+=m_size-n;
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
/// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
template <class dev_typ, class t1, class t2>
inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) {
for (int i=0; i<n; i++) {
buffer[i*2]=static_cast<numtyp>(one[i][i]);
buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
}
UCL_H_Vec<dev_typ> view;
view.view((dev_typ*)buffer.begin(),n,*dev);
ucl_copy(dev_v,view,false);
}
// -------------------------COPY TO GPU ----------------------------------
/// Signal that we need to transfer atom data for next timestep
inline void data_unavail()
{ _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }
/// Cast positions and types to write buffer
inline void cast_x_data(double **host_ptr, const int *host_type) {
if (_x_avail==false) {
double t=MPI_Wtime();
#ifdef GPU_CAST
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
#else
numtyp *_write_loc=host_x.begin();
for (int i=0; i<_nall; i++) {
*_write_loc=host_ptr[i][0];
_write_loc++;
*_write_loc=host_ptr[i][1];
_write_loc++;
*_write_loc=host_ptr[i][2];
_write_loc++;
*_write_loc=host_type[i];
_write_loc++;
}
#endif
_time_cast+=MPI_Wtime()-t;
}
}
/// Copy positions and types to device asynchronously
/** Copies nall() elements **/
inline void add_x_data(double **host_ptr, int *host_type) {
time_pos.start();
if (_x_avail==false) {
#ifdef GPU_CAST
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
ucl_copy(dev_type_cast,host_type_cast,_nall,true);
int block_size=64;
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
k_cast_x.set_size(GX,block_size);
k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
&_nall);
#else
ucl_copy(dev_x,host_x,_nall*4,true);
#endif
_x_avail=true;
}
time_pos.stop();
}
/// Calls cast_x_data and add_x_data and times the routines
inline void cast_copy_x(double **host_ptr, int *host_type) {
cast_x_data(host_ptr,host_type);
add_x_data(host_ptr,host_type);
}
// Cast charges to write buffer
template<class cpytyp>
inline void cast_q_data(cpytyp *host_ptr) {
if (_q_avail==false) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_q.view((numtyp*)host_ptr,_nall,*dev);
dev_q.view(host_q);
} else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
}
_time_cast+=MPI_Wtime()-t;
}
}
// Copy charges to device asynchronously
inline void add_q_data() {
if (_q_avail==false) {
ucl_copy(dev_q,host_q,_nall,true);
_q_avail=true;
}
}
// Cast quaternions to write buffer
template<class cpytyp>
inline void cast_quat_data(cpytyp *host_ptr) {
if (_quat_avail==false) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
dev_quat.view(host_quat);
} else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
}
_time_cast+=MPI_Wtime()-t;
}
}
// Copy quaternions to device
/** Copies nall()*4 elements **/
inline void add_quat_data() {
if (_quat_avail==false) {
ucl_copy(dev_quat,host_quat,_nall*4,true);
_quat_avail=true;
}
}
/// Return number of bytes used on device
inline double max_gpu_bytes()
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }
// ------------------------------ DATA ----------------------------------
/// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type)
UCL_D_Vec<numtyp> dev_x;
/// Charges
UCL_D_Vec<numtyp> dev_q;
/// Quaternions
UCL_D_Vec<numtyp> dev_quat;
#ifdef GPU_CAST
UCL_D_Vec<double> dev_x_cast;
UCL_D_Vec<int> dev_type_cast;
UCL_H_Vec<double> host_x_cast;
UCL_H_Vec<int> host_type_cast;
#endif
/// Buffer for moving positions to device
UCL_H_Vec<numtyp> host_x;
/// Buffer for moving charge data to GPU
UCL_H_Vec<numtyp> host_q;
/// Buffer for moving quat data to GPU
UCL_H_Vec<numtyp> host_quat;
/// Cell list identifiers for device nbor builds
UCL_D_Vec<unsigned> dev_cell_id;
/// Cell list identifiers for device nbor builds
UCL_D_Vec<int> dev_particle_id;
/// Atom tag information for device nbor builds
UCL_D_Vec<int> dev_tag;
/// Cell list identifiers for hybrid nbor builds
UCL_H_Vec<int> host_cell_id;
/// Cell list identifiers for hybrid nbor builds
UCL_H_Vec<int> host_particle_id;
/// Device timers
UCL_Timer time_pos, time_q, time_quat;
/// Geryon device
UCL_Device *dev;
private:
#ifdef GPU_CAST
UCL_Program *atom_program;
UCL_Kernel k_cast_x;
void compile_kernels(UCL_Device &dev);
#endif
bool _compiled;
// True if data has been copied to device already
bool _x_avail, _q_avail, _quat_avail, _resized;
bool alloc(const int nall);
bool _allocated, _rot, _charge, _other;
int _max_atoms, _nall, _gpu_nbor;
bool _bonds;
double _time_cast;
double _max_gpu_bytes;
#ifndef USE_OPENCL
CUDPPConfiguration sort_config;
CUDPPHandle sort_plan;
#endif
};
}
#endif
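To make the type_pack* layout concrete: in type_pack2 the flattened index in the loop is ii=i*m_size+j, so the interleaved buffer satisfies the relations below (worked from the code above, comment only):
// buffer[(i*m_size+j)*2]   == one[i][j]
// buffer[(i*m_size+j)*2+1] == two[i][j]   for 0<=i,j<n; padding elsewhere.
// The host view then reinterprets the buffer as m_size*m_size elements of a
// two-component device type, so a kernel fetches both coefficients for a
// type pair with a single vector load at index itype*m_size+jtype.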

139
lib/gpu/lal_aux_fun1.h Normal file
View File

@ -0,0 +1,139 @@
// **************************************************************************
// aux_fun1.h
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for pair style auxiliary functions
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin : Sat Oct 22 2011
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
#endif
#define atom_info(t_per_atom, ii, tid, offset) \
tid=THREAD_ID_X; \
offset=tid & (t_per_atom-1); \
ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \
i, numj, stride, list_end, nbor) \
nbor=nbor_mem+ii; \
i=*nbor; \
nbor+=nbor_stride; \
numj=*nbor; \
if (nbor_mem==packed_mem) { \
nbor+=nbor_stride+fast_mul(ii,t_per_atom-1); \
stride=fast_mul(t_per_atom,nbor_stride); \
list_end=nbor+fast_mul(numj/t_per_atom,stride)+ (numj & (t_per_atom-1)); \
nbor+=offset; \
} else { \
nbor+=nbor_stride; \
nbor=packed_mem+*nbor; \
list_end=nbor+numj; \
stride=t_per_atom; \
nbor+=offset; \
}
#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
__local acctyp red_acc[6][BLOCK_PAIR]; \
red_acc[0][tid]=f.x; \
red_acc[1][tid]=f.y; \
red_acc[2][tid]=f.z; \
red_acc[3][tid]=energy; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<4; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
f.x=red_acc[0][tid]; \
f.y=red_acc[1][tid]; \
f.z=red_acc[2][tid]; \
energy=red_acc[3][tid]; \
if (vflag>0) { \
for (int r=0; r<6; r++) \
red_acc[r][tid]=virial[r]; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<6; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
for (int r=0; r<6; r++) \
virial[r]=red_acc[r][tid]; \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
engv+=inum; \
} \
} \
ans[ii]=f; \
}
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
__local acctyp red_acc[6][BLOCK_PAIR]; \
red_acc[0][tid]=f.x; \
red_acc[1][tid]=f.y; \
red_acc[2][tid]=f.z; \
red_acc[3][tid]=energy; \
red_acc[4][tid]=e_coul; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<5; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
f.x=red_acc[0][tid]; \
f.y=red_acc[1][tid]; \
f.z=red_acc[2][tid]; \
energy=red_acc[3][tid]; \
e_coul=red_acc[4][tid]; \
if (vflag>0) { \
for (int r=0; r<6; r++) \
red_acc[r][tid]=virial[r]; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<6; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
for (int r=0; r<6; r++) \
virial[r]=red_acc[r][tid]; \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
engv+=inum; \
*engv=e_coul; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
engv+=inum; \
} \
} \
ans[ii]=f; \
}
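The store_answers macros reduce the t_per_atom partial results per atom with a power-of-two tree. A serial C++ model of the same pattern (tree_reduce is hypothetical; on the device each thread runs the loop once, with no barrier, which assumes t_per_atom does not exceed the hardware SIMD width):
// After log2(t_per_atom) halving rounds, the group's offset==0 lane holds
// the sum of all partials, mirroring red_acc[tid]+=red_acc[tid+s].
void tree_reduce(float *red_acc, const int tid, const int offset,
                 const int t_per_atom) {
  for (unsigned int s=t_per_atom/2; s>0; s>>=1)
    if (offset<(int)s)
      red_acc[tid]+=red_acc[tid+s];
}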

207
lib/gpu/lal_balance.h Normal file
View File

@ -0,0 +1,207 @@
/***************************************************************************
balance.h
-------------------
W. Michael Brown (ORNL)
Class for host-device load balancing
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_BALANCE_H
#define LAL_BALANCE_H
#include "lal_device.h"
#include <math.h>
#define _HD_BALANCE_EVERY 25
#define _HD_BALANCE_WEIGHT 0.5
#define _HD_BALANCE_GAP 1.10
namespace LAMMPS_AL {
/// Host/device load balancer
template<class numtyp, class acctyp>
class Balance {
public:
inline Balance() : _init_done(false), _measure_this_step(false) {}
inline ~Balance() { clear(); }
/// Clear any old data and setup for new LAMMPS run
inline void init(Device<numtyp, acctyp> *gpu, const int gpu_nbor,
const double split);
/// Clear all host and device data
inline void clear() {
if (_init_done) {
_device_time.clear();
_measure_this_step=false;
_init_done=false;
}
}
/// Return the timestep since initialization
inline int timestep() { return _timestep; }
/// Get a count of the number of particles the host will handle for initial alloc
inline int first_host_count(const int nlocal, const double gpu_split,
const int gpu_nbor) const {
int host_nlocal=0;
if (gpu_nbor>0 && gpu_split!=1.0) {
if (gpu_split>0)
host_nlocal=static_cast<int>(ceil((1.0-gpu_split)*nlocal));
else
host_nlocal=static_cast<int>(ceil(0.05*nlocal));
}
return host_nlocal;
}
/// Return the number of particles the device will handle this timestep
inline int get_gpu_count(const int ago, const int inum_full);
/// Return the average fraction of particles handled by device on all procs
inline double all_avg_split() {
if (_load_balance) {
double _all_avg_split=0.0;
MPI_Reduce(&_avg_split,&_all_avg_split,1,MPI_DOUBLE,MPI_SUM,0,
_device->replica());
_all_avg_split/=_device->replica_size();
return _all_avg_split/_avg_count;
} else
return _actual_split;
}
/// If CPU neighboring, allow the device fraction to increase on 2nd timestep
inline int ago_first(int ago) const
{ if (_avg_count==1 && _actual_split<_desired_split) ago=0; return ago; }
/// Start the timer for asynchronous device execution
inline void start_timer() {
if (_measure_this_step) {
_device->gpu->sync();
_device->gpu_barrier();
_device->start_host_timer();
_device_time.start();
_device->gpu->sync();
_device->gpu_barrier();
}
}
/// Stop the timer for asynchronous device execution
inline void stop_timer() { if (_measure_this_step) { _device_time.stop(); } }
/// Calculate the new host/device split based on the cpu and device times
/** \note Only does calculation every _HD_BALANCE_EVERY timesteps
(and first 10) **/
inline void balance(const double cpu_time);
/// Calls balance() and then get_gpu_count()
inline int balance(const int ago,const int inum_full,const double cpu_time) {
balance(cpu_time);
return get_gpu_count(ago,inum_full);
}
private:
Device<numtyp,acctyp> *_device;
UCL_Timer _device_time;
bool _init_done;
int _gpu_nbor;
bool _load_balance;
double _actual_split, _avg_split, _desired_split, _max_split;
int _avg_count;
bool _measure_this_step;
int _inum, _inum_full, _timestep;
};
#define BalanceT Balance<numtyp,acctyp>
template <class numtyp, class acctyp>
void BalanceT::init(Device<numtyp, acctyp> *gpu,
const int gpu_nbor, const double split) {
clear();
_gpu_nbor=gpu_nbor;
_init_done=true;
_device=gpu;
_device_time.init(*gpu->gpu);
if (split<0.0) {
_load_balance=true;
_desired_split=0.90;
} else {
_load_balance=false;
_desired_split=split;
}
_actual_split=_desired_split;
_avg_split=0.0;
_avg_count=0;
_timestep=0;
}
template <class numtyp, class acctyp>
int BalanceT::get_gpu_count(const int ago, const int inum_full) {
_measure_this_step=false;
if (_load_balance) {
if (_avg_count<11 || _timestep%_HD_BALANCE_EVERY==0) {
_measure_this_step=true;
_inum_full=inum_full;
}
if (ago==0) {
_actual_split=_desired_split;
_max_split=_desired_split;
}
}
_inum=static_cast<int>(floor(_actual_split*inum_full));
if (_inum==0) _inum++;
_timestep++;
return _inum;
}
template <class numtyp, class acctyp>
void BalanceT::balance(const double cpu_time) {
if (_measure_this_step) {
_measure_this_step=false;
double gpu_time=_device_time.seconds();
double max_gpu_time;
MPI_Allreduce(&gpu_time,&max_gpu_time,1,MPI_DOUBLE,MPI_MAX,
_device->gpu_comm());
if (_inum_full==_inum) {
_desired_split=1.0;
return;
}
double cpu_time_per_atom=cpu_time/(_inum_full-_inum);
double cpu_other_time=_device->host_time()-cpu_time;
int host_inum=static_cast<int>((max_gpu_time-cpu_other_time)/
cpu_time_per_atom);
double split=static_cast<double>(_inum_full-host_inum)/_inum_full;
_desired_split=split*_HD_BALANCE_GAP;
if (_desired_split>1.0)
_desired_split=1.0;
if (_desired_split<0.0)
_desired_split=0.0;
if (_gpu_nbor==0) {
if (_desired_split<_max_split)
_actual_split=_desired_split;
else
_actual_split=_max_split;
}
}
_avg_split+=_desired_split;
_avg_count++;
}
}
#endif
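A worked pass through balance() with assumed measurements (all numbers illustrative only):
// Suppose inum_full=10000 with _inum=8000 on the device, and this step
// measured max_gpu_time=2.0 ms, cpu_time=1.0 ms for the 2000 host atoms,
// and cpu_other_time=0.5 ms of other host work.  Then:
//   cpu_time_per_atom = 1.0/2000                  = 0.0005 ms/atom
//   host_inum = (2.0-0.5)/0.0005                  = 3000 atoms
//   split     = (10000-3000)/10000                = 0.70
//   _desired_split = 0.70*_HD_BALANCE_GAP (1.10)  = 0.77 (clamped to [0,1])
// i.e. the host share grows until device time matches the CPU's spare time,
// with the 10% gap biasing work back toward the device.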

287
lib/gpu/lal_base_atomic.cpp Normal file
View File

@ -0,0 +1,287 @@
/***************************************************************************
base_atomic.cpp
-------------------
W. Michael Brown (ORNL)
Base class for pair styles with per-particle data for position and type
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include "lal_base_atomic.h"
using namespace LAMMPS_AL;
#define BaseAtomicT BaseAtomic<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseAtomicT::BaseAtomic() : _compiled(false), _max_bytes(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
}
template <class numtyp, class acctyp>
BaseAtomicT::~BaseAtomic() {
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int BaseAtomicT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int BaseAtomicT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
screen=_screen;
int gpu_nbor=0;
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=1;
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_atom();
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
compile_kernels(*ucl_device,pair_program);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->dev_x,4);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return 0;
}
template <class numtyp, class acctyp>
void BaseAtomicT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void BaseAtomicT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
if (_compiled) {
k_pair_fast.clear();
k_pair.clear();
delete pair_program;
_compiled=false;
}
time_pair.clear();
hd_balancer.clear();
nbor->clear();
ans->clear();
device->clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * BaseAtomicT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return NULL;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void BaseAtomicT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, int *tag,
int **nspecial, int **special,
bool &success) {
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAtomicT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** BaseAtomicT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double BaseAtomicT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(BaseAtomic<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void BaseAtomicT::compile_kernels(UCL_Device &dev, const char *pair_str) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
k_pair.set_function(*pair_program,"kernel_pair");
pos_tex.get_texture(*pair_program,"pos_tex");
_compiled=true;
}
template class BaseAtomic<PRECISION,ACC_PRECISION>;
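What a concrete pair style adds on top of this base class is essentially just loop(). A hedged sketch of such an override (PairDemo is hypothetical, the style's coefficient arguments are elided, and nbor_pitch() is assumed to be provided by the Neighbor class):
// Hypothetical derived style: loop() sizes the grid so each atom gets
// _threads_per_atom lanes, then launches the pair kernel.
template <class numtyp, class acctyp>
class PairDemo : public BaseAtomic<numtyp,acctyp> {
  void loop(const bool _eflag, const bool _vflag) {
    int eflag=_eflag ? 1 : 0, vflag=_vflag ? 1 : 0;
    int ainum=this->ans->inum();
    int nbor_pitch=this->nbor->nbor_pitch();
    int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
                            (this->_block_size/this->_threads_per_atom)));
    this->time_pair.start();
    this->k_pair.set_size(GX,this->_block_size);
    this->k_pair.run(&this->atom->dev_x.begin(), /* ...coeff args... */
                     &this->nbor->dev_nbor.begin(),&this->_nbor_data->begin(),
                     &this->ans->dev_ans.begin(),&this->ans->dev_engv.begin(),
                     &eflag,&vflag,&ainum,&nbor_pitch,
                     &this->_threads_per_atom);
    this->time_pair.stop();
  }
};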

198
lib/gpu/lal_base_atomic.h Normal file
View File

@ -0,0 +1,198 @@
/***************************************************************************
base_atomic.h
-------------------
W. Michael Brown (ORNL)
Base class for pair styles with per-particle data for position and type
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_BASE_ATOMIC_H
#define LAL_BASE_ATOMIC_H
#include "lal_device.h"
#include "lal_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BaseAtomic {
public:
BaseAtomic();
virtual ~BaseAtomic();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success))
pos_tex.bind_float(atom->dev_x,4);
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_atomic();
/// Returns memory usage on device per atom
int bytes_per_atom_atomic(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_atomic() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers();
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
}
/// Zero timers
inline void zero_timers() {
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int * compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_pair;
/// Host-device load balancer
Balance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
Atom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
Answer<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
Neighbor *nbor;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
protected:
bool _compiled;
int _block_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const char *pair_string);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};
}
#endif
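For orientation (a sketch, not code from this commit), a concrete pair style builds on this base class by forwarding its embedded kernel source to init_atomic() and implementing the pure-virtual loop(). MyPair and my_pair_source below are invented names:

// Hypothetical sketch only; MyPair and my_pair_source are invented names.
template <class numtyp, class acctyp>
class MyPair : public BaseAtomic<numtyp,acctyp> {
 public:
  int init(const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen) {
    // compile_kernels() runs inside init_atomic() with this source string
    return this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
                             gpu_split,screen,my_pair_source);
  }
 private:
  // Launch k_pair/k_pair_fast; invoked by the base-class compute() wrappers
  void loop(const bool _eflag, const bool _vflag) { /* kernel launch */ }
};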

lib/gpu/lal_base_charge.cpp Normal file
@ -0,0 +1,304 @@
/***************************************************************************
base_charge.cpp
-------------------
W. Michael Brown (ORNL)
Base class for pair styles needing per-particle data for position,
charge, and type.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include "lal_base_charge.h"
using namespace LAMMPS_AL;
#define BaseChargeT BaseCharge<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseChargeT::BaseCharge() : _compiled(false), _max_bytes(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
}
template <class numtyp, class acctyp>
BaseChargeT::~BaseCharge() {
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int BaseChargeT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int BaseChargeT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
screen=_screen;
int gpu_nbor=0;
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=1;
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_charge();
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return success;
}
template <class numtyp, class acctyp>
void BaseChargeT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void BaseChargeT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
if (_compiled) {
k_pair_fast.clear();
k_pair.clear();
delete pair_program;
_compiled=false;
}
time_pair.clear();
hd_balancer.clear();
nbor->clear();
ans->clear();
device->clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * BaseChargeT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return NULL;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, int *tag,
int **nspecial, int **special,
bool &success) {
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseChargeT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
atom->add_q_data();
device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
boxlo, prd);
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** BaseChargeT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success,
double *host_q, double *boxlo, double *prd) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
atom->cast_q_data(host_q);
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_q_data();
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd);
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
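One subtlety in the return statement above (an observation about this code, not a change to it): host_jlist holds neighbor pointers only for the CPU-handled particles, while callers index the returned array with the full local index. The offset reconciles the two:

// returned firstneigh = nbor->host_jlist.begin() - host_start, so a caller
// looping over the CPU-handled tail,
//   for (int i = host_start; i < inum_full; i++) { int *jl = firstneigh[i]; }
// reads host_jlist[i - host_start]; the host entries effectively start at 0.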
template <class numtyp, class acctyp>
double BaseChargeT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(BaseCharge<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void BaseChargeT::compile_kernels(UCL_Device &dev, const char *pair_str) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
k_pair.set_function(*pair_program,"kernel_pair");
pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex");
_compiled=true;
}
template class BaseCharge<PRECISION,ACC_PRECISION>;
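As a hedged caller-side sketch (placeholder variables throughout; not code from this commit), one timestep of the host-neighboring path drives BaseCharge like this:

// Sketch with placeholder variables; 'model' is an initialized subclass.
bool success;
int host_start;
model.compute(ago, inum_full, nall, host_x, host_type, ilist, numj,
              firstneigh, eflag, vflag, eatom, vatom, host_start,
              cpu_time, success, host_q, nlocal, boxlo, prd);
// On return, list positions [0, host_start) were computed on the device,
// [host_start, inum_full) remain for the CPU; success is false on OOM.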

lib/gpu/lal_base_charge.h Normal file
@ -0,0 +1,197 @@
/***************************************************************************
base_charge.h
-------------------
W. Michael Brown (ORNL)
Base class for pair styles needing per-particle data for position,
charge, and type.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_BASE_CHARGE_H
#define LAL_BASE_CHARGE_H
#include "lal_device.h"
#include "lal_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BaseCharge {
public:
BaseCharge();
virtual ~BaseCharge();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
}
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number of local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_atomic();
/// Returns memory usage on device per atom
int bytes_per_atom_atomic(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_atomic() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers();
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
}
/// Zero timers
inline void zero_timers() {
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *charge,
const int nlocal, double *boxlo, double *prd);
/// Pair loop with device neighboring
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double *charge, double *boxlo, double *prd);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_pair;
/// Host-device load balancer
Balance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
Atom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
Answer<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
Neighbor *nbor;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
UCL_Texture q_tex;
protected:
bool _compiled;
int _block_size, _block_bio_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const char *pair_string);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};
}
#endif

lib/gpu/lal_base_ellipsoid.cpp Normal file
@ -0,0 +1,469 @@
/***************************************************************************
base_ellipsoid.cpp
-------------------
W. Michael Brown (ORNL)
Base class for acceleration of ellipsoid potentials
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Thu May 5 2011
email : brownw@ornl.gov
***************************************************************************/
#include "lal_base_ellipsoid.h"
using namespace LAMMPS_AL;
#ifdef USE_OPENCL
#include "ellipsoid_nbor_cl.h"
#else
#include "ellipsoid_nbor_ptx.h"
#endif
#define BaseEllipsoidT BaseEllipsoid<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
}
template <class numtyp, class acctyp>
BaseEllipsoidT::~BaseEllipsoid() {
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int BaseEllipsoidT::bytes_per_atom(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int BaseEllipsoidT::init_base(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split,
FILE *_screen, const int ntypes, int **h_form,
const char *ellipsoid_program,
const char *lj_program, const bool ellip_sphere) {
screen=_screen;
_ellipsoid_sphere=ellip_sphere;
int gpu_nbor=0;
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=1;
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,true,
1);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
compile_kernels(*ucl_device,ellipsoid_program,lj_program,ellip_sphere);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_lj.init(*ucl_device);
time_nbor1.init(*ucl_device);
time_ellipsoid.init(*ucl_device);
time_nbor2.init(*ucl_device);
time_ellipsoid2.init(*ucl_device);
time_nbor3.init(*ucl_device);
time_ellipsoid3.init(*ucl_device);
time_lj.zero();
time_nbor1.zero();
time_ellipsoid.zero();
time_nbor2.zero();
time_ellipsoid2.zero();
time_nbor3.zero();
time_ellipsoid3.zero();
// See if we want fast GB-sphere or sphere-sphere calculations
_host_form=h_form;
_multiple_forms=false;
for (int i=1; i<ntypes; i++)
for (int j=i; j<ntypes; j++)
if (_host_form[i][j]!=ELLIPSE_ELLIPSE)
_multiple_forms=true;
if (_multiple_forms && host_nlocal>0) {
std::cerr << "Cannot use Gay-Berne with multiple forms and GPU neighboring.\n";
exit(1);
}
if (_multiple_forms)
ans->dev_ans.zero();
// Memory for ilist ordered by particle type
if (host_olist.alloc(nbor->max_atoms(),*ucl_device)!=UCL_SUCCESS)
return -3;
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return 0;
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::clear_base() {
// Output any timing information
output_times();
host_olist.clear();
if (_compiled) {
k_nbor_fast.clear();
k_nbor.clear();
k_ellipsoid.clear();
k_ellipsoid_sphere.clear();
k_sphere_ellipsoid.clear();
k_lj_fast.clear();
k_lj.clear();
delete nbor_program;
delete ellipsoid_program;
delete lj_program;
_compiled=false;
}
time_nbor1.clear();
time_ellipsoid.clear();
time_nbor2.clear();
time_ellipsoid2.clear();
time_nbor3.clear();
time_ellipsoid3.clear();
time_lj.clear();
hd_balancer.clear();
nbor->clear();
ans->clear();
device->clear();
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::output_times() {
// Output any timing information
acc_timers();
double single[10], times[10];
single[0]=atom->transfer_time()+ans->transfer_time();
single[1]=nbor->time_nbor.total_seconds()+nbor->time_hybrid1.total_seconds()+
nbor->time_hybrid2.total_seconds();
single[2]=time_nbor1.total_seconds()+time_nbor2.total_seconds()+
time_nbor3.total_seconds()+nbor->time_nbor.total_seconds();
single[3]=time_ellipsoid.total_seconds()+time_ellipsoid2.total_seconds()+
time_ellipsoid3.total_seconds();
if (_multiple_forms)
single[4]=time_lj.total_seconds();
else
single[4]=0;
single[5]=atom->cast_time()+ans->cast_time();
single[6]=_gpu_overhead;
single[7]=_driver_overhead;
single[8]=ans->cpu_idle_time();
single[9]=nbor->bin_time();
MPI_Reduce(single,times,10,MPI_DOUBLE,MPI_SUM,0,device->replica());
double avg_split=hd_balancer.all_avg_split();
_max_bytes+=atom->max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
device->replica());
double max_mb=mpi_max_bytes/(1024*1024);
if (device->replica_me()==0)
if (screen && times[5]>0.0) {
int replica_size=device->replica_size();
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (device->procs_per_gpu()==1) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size);
if (nbor->gpu_nbor()>0)
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size);
else
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size);
fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
}
if (nbor->gpu_nbor()==2)
fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[9]/replica_size);
fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
}
_max_bytes=0.0;
}
// ---------------------------------------------------------------------------
// Pack neighbors to limit thread divergence for lj-lj and ellipse
// ---------------------------------------------------------------------------
template<class numtyp, class acctyp>
void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
const int inum, const int form_low,
const int form_high, const bool shared_types,
int ntypes) {
int stride=nbor->nbor_pitch();
if (shared_types) {
k_nbor_fast.set_size(GX,BX);
k_nbor_fast.run(&atom->dev_x.begin(), &cut_form.begin(),
&nbor->dev_nbor.begin(), &stride, &start, &inum,
&nbor->dev_packed.begin(), &form_low, &form_high);
} else {
k_nbor.set_size(GX,BX);
k_nbor.run(&atom->dev_x.begin(), &cut_form.begin(), &ntypes,
&nbor->dev_nbor.begin(), &stride, &start, &inum,
&nbor->dev_packed.begin(), &form_low, &form_high);
}
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseEllipsoidT::reset_nbors(const int nall, const int inum,
const int osize, int *ilist,
int *numj, int *type, int **firstneigh,
bool &success) {
success=true;
int mn=nbor->max_nbor_loop(osize,numj,ilist);
resize_atom(nall,success);
resize_local(inum,0,mn,osize,success);
if (!success)
return;
if (_multiple_forms) {
int p=0;
for (int i=0; i<osize; i++) {
int itype=type[ilist[i]];
if (_host_form[itype][itype]==ELLIPSE_ELLIPSE) {
host_olist[p]=ilist[i];
p++;
}
}
_max_last_ellipse=p;
_last_ellipse=std::min(inum,_max_last_ellipse);
for (int i=0; i<osize; i++) {
int itype=type[ilist[i]];
if (_host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
host_olist[p]=ilist[i];
p++;
}
}
nbor->get_host(inum,host_olist.begin(),numj,firstneigh,block_size());
nbor->copy_unpacked(inum,mn);
return;
}
_last_ellipse=inum;
_max_last_ellipse=inum;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
nbor->copy_unpacked(inum,mn);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
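To make the two-pass reordering above concrete, a small worked example (ilist values and forms invented for illustration):

// osize = 4, ilist = {10, 11, 12, 13}
// per-type forms: 10 -> ELLIPSE_ELLIPSE, 11 -> SPHERE_SPHERE,
//                 12 -> ELLIPSE_ELLIPSE, 13 -> SPHERE_SPHERE
// pass 1 (ellipse-ellipse first): host_olist = {10, 12, _, _}, p = 2
//   -> _max_last_ellipse = 2
// pass 2 (everything else):       host_olist = {10, 12, 11, 13}
// Kernels then treat [0, _last_ellipse) as the ellipsoid range and the
// tail as sphere/LJ work, limiting thread divergence within warps.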
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, int *tag,
int **nspecial, int **special,
bool &success) {
success=true;
resize_atom(nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),0,success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
nbor->copy_unpacked(inum,mn);
_last_ellipse=inum;
_max_last_ellipse=inum;
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double **host_quat) {
acc_timers();
if (inum_full==0) {
host_start=0;
zero_timers();
return NULL;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
_last_ellipse=std::min(inum,_max_last_ellipse);
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, inum_full, ilist, numj, host_type, firstneigh,
success);
if (!success)
return NULL;
}
int *list;
if (_multiple_forms)
list=host_olist.begin();
else
list=ilist;
atom->cast_x_data(host_x,host_type);
atom->cast_quat_data(host_quat[0]);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
atom->add_quat_data();
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,list);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return list;
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
double **host_quat) {
acc_timers();
if (inum_full==0) {
host_start=0;
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
_last_ellipse=std::min(inum,_max_last_ellipse);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
atom->cast_quat_data(host_quat[0]);
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
atom->cast_quat_data(host_quat[0]);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_quat_data();
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double BaseEllipsoidT::host_memory_usage_base() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(BaseEllipsoid<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
const char *ellipsoid_string,
const char *lj_string, const bool e_s) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
nbor_program=new UCL_Program(dev);
nbor_program->load_string(ellipsoid_nbor,flags.c_str());
k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast");
k_nbor.set_function(*nbor_program,"kernel_nbor");
ellipsoid_program=new UCL_Program(dev);
ellipsoid_program->load_string(ellipsoid_string,flags.c_str());
k_ellipsoid.set_function(*ellipsoid_program,"kernel_ellipsoid");
lj_program=new UCL_Program(dev);
lj_program->load_string(lj_string,flags.c_str());
k_sphere_ellipsoid.set_function(*lj_program,"kernel_sphere_ellipsoid");
k_lj_fast.set_function(*lj_program,"kernel_lj_fast");
k_lj.set_function(*lj_program,"kernel_lj");
if (e_s)
k_ellipsoid_sphere.set_function(*lj_program,"kernel_ellipsoid_sphere");
_compiled=true;
}
template class BaseEllipsoid<PRECISION,ACC_PRECISION>;

lib/gpu/lal_base_ellipsoid.h Normal file
@ -0,0 +1,248 @@
/***************************************************************************
base_ellipsoid.h
-------------------
W. Michael Brown (ORNL)
Base class for acceleration of ellipsoid potentials
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Thu May 5 2011
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_BASE_ELLIPSOID_H
#define LAL_BASE_ELLIPSOID_H
#include "lal_device.h"
#include "lal_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BaseEllipsoid {
public:
BaseEllipsoid();
virtual ~BaseEllipsoid();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param ellipsoid_sphere true if ellipsoid-sphere case handled separately
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init_base(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, const int ntypes,
int **h_form, const char *ellipsoid_program,
const char *lj_program, const bool ellipsoid_sphere=false);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int nall, bool &success) {
atom->resize(nall, success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \param olist_size size of list of particles from CPU neighboring
* \note host_inum is 0 if the host is performing neighboring
* \note if GPU is neighboring, nlocal+host_inum=total number of local particles
* \note if CPU is neighboring olist_size=total number of local particles
* \note if GPU is neighboring olist_size=0 **/
inline void resize_local(const int nlocal, const int host_inum,
const int max_nbors, const int olist_size,
bool &success) {
ans->resize(nlocal, success);
if (_multiple_forms) ans->dev_ans.zero();
if (olist_size>static_cast<int>(host_olist.numel())) {
host_olist.clear();
int new_size=static_cast<int>(static_cast<double>(olist_size)*1.10);
success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
}
nbor->resize(nlocal,host_inum,max_nbors,success);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_base();
/// Output any timing information
void output_times();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_base() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers();
time_nbor1.add_to_total();
time_ellipsoid.add_to_total();
if (_multiple_forms) {
time_nbor2.add_to_total();
time_ellipsoid2.add_to_total();
if (_ellipsoid_sphere) {
time_nbor3.add_to_total();
time_ellipsoid3.add_to_total();
}
time_lj.add_to_total();
}
atom->acc_timers();
ans->acc_timers();
}
}
/// Zero timers
inline void zero_timers() {
time_nbor1.zero();
time_ellipsoid.zero();
if (_multiple_forms) {
time_nbor2.zero();
time_ellipsoid2.zero();
if (_ellipsoid_sphere) {
time_nbor3.zero();
time_ellipsoid3.zero();
}
time_lj.zero();
}
atom->zero_timers();
ans->zero_timers();
}
/// Pack neighbors to limit thread divergence for lj-lj and ellipse
void pack_nbors(const int GX, const int BX, const int start, const int inum,
const int form_low, const int form_high,
const bool shared_types, int ntypes);
/// Copy neighbor list from host
void reset_nbors(const int nall, const int inum, const int osize, int *ilist,
int *numj, int *type, int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
int* compute(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double **quat);
/// Pair loop with device neighboring
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double **host_quat);
/// Build neighbor list on accelerator
void build_nbor_list(const int inum, const int host_inum, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, bool &success);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_nbor1, time_ellipsoid, time_nbor2, time_ellipsoid2, time_lj;
UCL_Timer time_nbor3, time_ellipsoid3;
/// Host-device load balancer
Balance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
Atom<numtyp,acctyp> *atom;
// --------------------------- TYPE DATA --------------------------
/// cut_form.x = cutsq, cut_form.y = form
UCL_D_Vec<numtyp2> cut_form;
// ------------------------ FORCE/ENERGY DATA -----------------------
Answer<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
Neighbor *nbor;
/// ilist with particles sorted by type
UCL_H_Vec<int> host_olist;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *nbor_program, *ellipsoid_program, *lj_program;
UCL_Kernel k_nbor_fast, k_nbor;
UCL_Kernel k_ellipsoid, k_ellipsoid_sphere, k_sphere_ellipsoid;
UCL_Kernel k_lj_fast, k_lj;
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
UCL_Texture q_tex;
protected:
bool _compiled, _ellipsoid_sphere;
int _block_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
// True if we want to use fast GB-sphere or sphere-sphere calculations
bool _multiple_forms;
int **_host_form;
int _last_ellipse, _max_last_ellipse;
void compile_kernels(UCL_Device &dev, const char *ellipsoid_string,
const char *lj_string, const bool e_s);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};
}
#endif

lib/gpu/lal_cg_cmm.cpp Normal file
@ -0,0 +1,154 @@
/***************************************************************************
cg_cmm.cpp
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the cg/cmm/cut pair style
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "cg_cmm_cl.h"
#else
#include "cg_cmm_ptx.h"
#endif
#include "lal_cg_cmm.h"
#include <cassert>
using namespace LAMMPS_AL;
#define CGCMMT CGCMM<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
CGCMMT::CGCMM() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
CGCMMT::~CGCMM() {
clear();
}
template <class numtyp, class acctyp>
int CGCMMT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CGCMMT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cg_cmm);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int cmm_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (cmm_types<=max_shared_types && this->_block_size>=max_shared_types) {
cmm_types=max_shared_types;
shared_types=true;
}
_cmm_types=cmm_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(cmm_types*cmm_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<cmm_types*cmm_types; i++)
host_write[i]=0.0;
lj1.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq,
host_cg_type,host_lj1,host_lj2);
lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void CGCMMT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CGCMMT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CGCMM<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CGCMMT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class CGCMM<PRECISION,ACC_PRECISION>;
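A worked example of the grid-size arithmetic in loop() above (numbers invented for illustration):

// BX = 128 threads per block, _threads_per_atom = 4
//   -> each block processes BX / _threads_per_atom = 32 atoms
// ainum = 1000 atoms
//   -> GX = ceil(1000 / 32.0) = 32 blocks; the last block is partly idle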

lib/gpu/lal_cg_cmm.cu Normal file
@ -0,0 +1,205 @@
// **************************************************************************
// cg_cmm.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for acceleration of the cg/cmm/cut pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
#endif
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].x) {
r2inv=ucl_recip(r2inv);
numtyp inv1,inv2;
if (lj1[mtype].y == 2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj1[mtype].y == 1) {
inv2=r2inv*ucl_sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)-
lj3[mtype].z;
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,__global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].x) {
r2inv=ucl_recip(r2inv);
numtyp inv1,inv2;
if (lj1[mtype].y == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj1[mtype].y == (numtyp)1) {
inv2=r2inv*ucl_sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)-
lj3[mtype].z;
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
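Reading the branches on lj1[mtype].y in both kernels above, the flag selects among the three CMM Lennard-Jones variants (an interpretation of the code, with A and B standing for the packed prefactors):

// y == 2: inv1 = r^-4, inv2 = r^-8  ->  E = A r^-12 - B r^-4   (LJ 12-4)
// y == 1: inv2 = r^-3, inv1 = r^-6  ->  E = A r^-9  - B r^-6   (LJ 9-6)
// else:   inv1 = inv2 = r^-6        ->  E = A r^-12 - B r^-6   (LJ 12-6)
// where A = lj3[mtype].x and B = lj3[mtype].y for the energy, and the
// analogous lj1[mtype].z / lj1[mtype].w pair enters the force prefactor.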

lib/gpu/lal_cg_cmm.h Normal file
@ -0,0 +1,79 @@
/***************************************************************************
cg_cmm.h
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the cg/cmm/cut pair style
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_CG_CMM_H
#define LAL_CG_CMM_H
#include "lal_base_atomic.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class CGCMM : public BaseAtomic<numtyp, acctyp> {
public:
CGCMM();
~CGCMM();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, int **host_cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = cutsq, lj1.y=cg_type, lj1.z = lj1, lj1.w = lj2
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _cmm_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

lib/gpu/lal_cg_cmm_ext.cpp Normal file
@ -0,0 +1,121 @@
/***************************************************************************
cg_cmm_ext.cpp
-------------------
W. Michael Brown (ORNL)
Functions for LAMMPS access to cg/cmm/cut pair acceleration routines
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_cg_cmm.h"
using namespace std;
using namespace LAMMPS_AL;
static CGCMM<PRECISION,ACC_PRECISION> CMMMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
CMMMF.clear();
gpu_mode=CMMMF.device->gpu_mode();
double gpu_split=CMMMF.device->particle_split();
int first_gpu=CMMMF.device->first_device();
int last_gpu=CMMMF.device->last_device();
int world_me=CMMMF.device->world_me();
int gpu_rank=CMMMF.device->gpu_rank();
int procs_per_gpu=CMMMF.device->procs_per_gpu();
CMMMF.device->init_message(screen,"cg/cmm",first_gpu,last_gpu);
bool message=false;
if (CMMMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
CMMMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
CMMMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
CMMMF.estimate_gpu_overhead();
return init_ok;
}
void cmm_gpu_clear() {
CMMMF.clear();
}
int** cmm_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void cmm_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
CMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double cmm_gpu_bytes() {
return CMMMF.host_memory_usage();
}
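A hedged sketch of the call sequence a LAMMPS pair style would use against these bindings (placeholder variables; error handling elided; only the four functions above are real):

// Placeholder variables throughout.
int gpu_mode, host_start, *ilist, *jnum;
bool success;
if (cmm_gpu_init(ntypes, cutsq, cg_types, lj1, lj2, lj3, lj4, offset,
                 special_lj, inum, nall, max_nbors, maxspecial, cell_size,
                 gpu_mode, screen) != 0)
  return;                                    // device setup failed
// per timestep, device neighboring:
int **firstneigh = cmm_gpu_compute_n(ago, inum_full, nall, x, type, sublo,
                                     subhi, tag, nspecial, special, eflag,
                                     vflag, eatom, vatom, host_start,
                                     &ilist, &jnum, cpu_time, success);
// ... compute the remaining [host_start, inum_full) particles on the CPU ...
cmm_gpu_clear();                             // at teardown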

lib/gpu/lal_cg_cmm_long.cpp Normal file
@ -0,0 +1,169 @@
/***************************************************************************
cg_cmm_long.cpp
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the cg/cmm/coul/long pair style
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "cg_cmm_long_cl.h"
#else
#include "cg_cmm_long_ptx.h"
#endif
#include "lal_cg_cmm_long.h"
#include <cassert>
using namespace LAMMPS_AL;
#define CGCMMLongT CGCMMLong<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
CGCMMLongT::CGCMMLong() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
CGCMMLongT::~CGCMMLong() {
clear();
}
template <class numtyp, class acctyp>
int CGCMMLongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CGCMMLongT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq,
const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cg_cmm_long);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_cutsq,
host_cut_ljsq,host_lj1,host_lj2);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_cg_type,host_lj3,
host_lj4,host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void CGCMMLongT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CGCMMLongT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CGCMMLong<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CGCMMLongT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
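// Each atom is assigned t_per_atom threads, so one block of BX threads
// processes BX/t_per_atom atoms and GX blocks cover all inum atoms.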
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
}
this->time_pair.stop();
}
template class CGCMMLong<PRECISION,ACC_PRECISION>;

265
lib/gpu/lal_cg_cmm_long.cu Normal file
View File

@ -0,0 +1,265 @@
// **************************************************************************
// cg_cmm_long.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for acceleration of the cg/cmm/coul/long pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
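// The high bits of the packed neighbor index encode the pair's
// special-bond status; sbmask() extracts them to select the scaling
// factor and NEIGHMASK clears them to recover the atom index.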
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].x) {
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor, _erfc;
numtyp r2inv=ucl_recip(rsq);
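// lj3[mtype].x encodes the CMM coarse-grain form: 2 selects the LJ 12-4
// form (inv1=r^-4, inv2=r^-8), 1 selects LJ 9-6, and anything else falls
// through to the standard LJ 12-6, so one kernel serves all three.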
if (rsq < lj1[mtype].y) {
if (lj3[mtype].x == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj3[mtype].x == (numtyp)1) {
inv2=r2inv*ucl_rsqrt(rsq);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
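// Real-space Ewald term: erfc(g_ewald*r) is evaluated with the
// Abramowitz & Stegun polynomial fit (constants A1..A5, EWALD_P), and
// EWALD_F*grij*expm2 is the derivative correction (EWALD_F=2/sqrt(pi));
// subtracting factor_coul removes the excluded special-bond fraction.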
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
lj3[tid]=lj3_in[tid];
}
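// The __syncthreads() below guarantees the staged tables are complete
// before any thread reads them; it must be reached by every thread,
// which is why the ii<inum test comes only after the barrier.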
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].x) {
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor, _erfc;
numtyp r2inv=ucl_recip(rsq);
if (rsq < lj1[mtype].y) {
if (lj3[mtype].x == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj3[mtype].x == (numtyp)1) {
inv2=r2inv*ucl_rsqrt(rsq);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

83
lib/gpu/lal_cg_cmm_long.h Normal file
View File

@ -0,0 +1,83 @@
/***************************************************************************
cg_cmm_long.h
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the cg/cmm/coul/long pair style
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_CG_CMM_LONG_H
#define LAL_CG_CMM_LONG_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class CGCMMLong : public BaseCharge<numtyp, acctyp> {
public:
CGCMMLong();
~CGCMMLong();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, int ** cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

129
lib/gpu/lal_cg_cmm_long_ext.cpp Normal file
View File

@ -0,0 +1,129 @@
/***************************************************************************
cg_cmm_long_ext.cpp
-------------------
W. Michael Brown (ORNL)
Functions for LAMMPS access to cg/cmm/coul/long acceleration routines
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_cg_cmm_long.h"
using namespace std;
using namespace LAMMPS_AL;
static CGCMMLong<PRECISION,ACC_PRECISION> CMMLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
CMMLMF.clear();
gpu_mode=CMMLMF.device->gpu_mode();
double gpu_split=CMMLMF.device->particle_split();
int first_gpu=CMMLMF.device->first_device();
int last_gpu=CMMLMF.device->last_device();
int world_me=CMMLMF.device->world_me();
int gpu_rank=CMMLMF.device->gpu_rank();
int procs_per_gpu=CMMLMF.device->procs_per_gpu();
CMMLMF.device->init_message(screen,"cg/cmm/coul/long",first_gpu,last_gpu);
bool message=false;
if (CMMLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
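// Process 0 of the world initializes (and compiles the kernels) first;
// the remaining processes then initialize their assigned GPU one
// gpu_rank at a time behind the barriers below.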
int init_ok=0;
if (world_me==0)
init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,g_ewald);
CMMLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald);
CMMLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
CMMLMF.estimate_gpu_overhead();
return init_ok;
}
void cmml_gpu_clear() {
CMMLMF.clear();
}
int** cmml_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CMMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q,boxlo,prd);
}
void cmml_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
CMMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double cmml_gpu_bytes() {
return CMMLMF.host_memory_usage();
}

174
lib/gpu/lal_charmm_long.cpp Normal file
View File

@ -0,0 +1,174 @@
/***************************************************************************
charmm_long.cpp
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the charmm/coul/long pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "charmm_long_cl.h"
#else
#include "charmm_long_ptx.h"
#endif
#include "lal_charmm_long.h"
#include <cassert>
using namespace LAMMPS_AL;
#define CHARMMLongT CHARMMLong<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
CHARMMLongT::CHARMMLong() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
CHARMMLongT::~CHARMMLong() {
clear();
}
template <class numtyp, class acctyp>
int CHARMMLongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CHARMMLongT::init(const int ntypes,
double host_cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald, const double cut_lj_innersq,
const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,charmm_long);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (this->_block_bio_size>=64 && mix_arithmetic)
shared_types=true;
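// The fast kernel rebuilds the LJ coefficients from per-type
// epsilon/sigma, which is only valid with arithmetic (Lorentz-Berthelot)
// mixing, and it needs at least 64 threads per block to stage the
// per-type table into shared memory.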
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
int h_size=lj_types*lj_types;
int max_bio_shared_types=this->device->max_bio_shared_types();
if (h_size<max_bio_shared_types)
h_size=max_bio_shared_types;
UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<h_size*32; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_lj3,host_lj4);
ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_bothsq = host_cut_bothsq;
_cut_coulsq = host_cut_coulsq;
_cut_ljsq = host_cut_ljsq;
_cut_lj_innersq = cut_lj_innersq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_denom_lj=denom_lj;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void CHARMMLongT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
ljd.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CHARMMLongT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CHARMMLong<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->_block_bio_size;
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
&sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&this->_threads_per_atom);
}
this->time_pair.stop();
}
template class CHARMMLong<PRECISION,ACC_PRECISION>;

278
lib/gpu/lal_charmm_long.cu Normal file
View File

@ -0,0 +1,278 @@
// **************************************************************************
// charmm_long.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for acceleration of the charmm/coul/long pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
const int lj_types, __global numtyp *sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp denom_lj,
const numtyp cut_bothsq, const numtyp cut_ljsq,
const numtyp cut_lj_innersq, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cut_bothsq) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc, switch1;
if (rsq < cut_ljsq) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
if (rsq > cut_lj_innersq) {
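// CHARMM switching region: switch1 is built up into the switching
// function S = (cut_ljsq-rsq)^2*(cut_ljsq+2*rsq-3*cut_lj_innersq)/denom_lj
// and switch2 carries the dS/dr contribution applied to the LJ energy,
// tapering the force smoothly to zero at the outer cutoff.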
switch1 = (cut_ljsq-rsq);
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/
denom_lj;
switch1 *= switch1;
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/
denom_lj;
switch2 *= r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w);
force_lj = force_lj*switch1+switch2;
}
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cut_ljsq) {
numtyp e=r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w);
if (rsq > cut_lj_innersq)
e *= switch1;
energy+=factor_lj*e;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp denom_lj,
const numtyp cut_bothsq, const numtyp cut_ljsq,
const numtyp cut_lj_innersq,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp2 ljd[MAX_BIO_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
ljd[tid]=ljd_in[tid];
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cut_bothsq) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, prefactor, _erfc, switch1;
numtyp lj3, lj4;
if (rsq < cut_ljsq) {
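// On-the-fly Lorentz-Berthelot mixing: epsilon is the geometric mean
// and sigma the arithmetic mean of the per-type values staged in ljd;
// lj3 and lj4 then hold 4*eps*(sigma/r)^12 and 4*eps*(sigma/r)^6.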
numtyp eps = ucl_sqrt(ljd[itype].x*ljd[jtype].x);
numtyp sig6 = (numtyp)0.5 * (ljd[itype].y+ljd[jtype].y);
numtyp sig_r_6 = sig6*sig6*r2inv;
sig_r_6 = sig_r_6*sig_r_6*sig_r_6;
lj4 = (numtyp)4.0*eps*sig_r_6;
lj3 = lj4*sig_r_6;
force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4);
if (rsq > cut_lj_innersq) {
switch1 = (cut_ljsq-rsq);
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/
denom_lj;
switch1 *= switch1;
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/
denom_lj;
switch2 *= lj3-lj4;
force_lj = force_lj*switch1+switch2;
}
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cut_ljsq) {
numtyp e=lj3-lj4;
if (rsq > cut_lj_innersq)
e *= switch1;
energy+=factor_lj*e;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

87
lib/gpu/lal_charmm_long.h Normal file
View File

@ -0,0 +1,87 @@
/***************************************************************************
charmm_long.h
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the charmm/coul/long pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_CHARMM_LONG_H
#define LAL_CHARMM_LONG_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class CHARMMLong : public BaseCharge<numtyp, acctyp> {
public:
CHARMMLong();
~CHARMMLong();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double host_cut_bothsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald,
const double cut_lj_innersq, const double denom_lj,
double **epsilon, double **sigma, const bool mix_arithmetic);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// x = lj1, y = lj2, z = lj3, w = lj4
UCL_D_Vec<numtyp4> lj1;
/// x = epsilon, y = sigma
UCL_D_Vec<numtyp2> ljd;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e, _g_ewald, _denom_lj;
numtyp _cut_coulsq, _cut_bothsq, _cut_ljsq, _cut_lj_innersq;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

135
lib/gpu/lal_charmm_long_ext.cpp Normal file
View File

@ -0,0 +1,135 @@
/***************************************************************************
charmm_long_ext.cpp
-------------------
W. Michael Brown (ORNL)
Functions for LAMMPS access to charmm/coul/long acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_charmm_long.h"
using namespace std;
using namespace LAMMPS_AL;
static CHARMMLong<PRECISION,ACC_PRECISION> CRMLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald, const double cut_lj_innersq,
const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
CRMLMF.clear();
gpu_mode=CRMLMF.device->gpu_mode();
double gpu_split=CRMLMF.device->particle_split();
int first_gpu=CRMLMF.device->first_device();
int last_gpu=CRMLMF.device->last_device();
int world_me=CRMLMF.device->world_me();
int gpu_rank=CRMLMF.device->gpu_rank();
int procs_per_gpu=CRMLMF.device->procs_per_gpu();
CRMLMF.device->init_message(screen,"lj/charmm/coul/long",first_gpu,last_gpu);
bool message=false;
if (CRMLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
epsilon,sigma,mix_arithmetic);
CRMLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,
sigma, mix_arithmetic);
CRMLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
CRMLMF.estimate_gpu_overhead();
return init_ok;
}
void crml_gpu_clear() {
CRMLMF.clear();
}
int** crml_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CRMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void crml_gpu_compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q, const int nlocal,
double *boxlo, double *prd) {
CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}
double crml_gpu_bytes() {
return CRMLMF.host_memory_usage();
}

156
lib/gpu/lal_coul_long.cpp Normal file
View File

@ -0,0 +1,156 @@
/***************************************************************************
coul_long.cpp
-------------------
Axel Kohlmeyer (Temple)
Class for acceleration of the coul/long pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : July 2011
email : a.kohlmeyer@temple.edu
***************************************************************************/
#ifdef USE_OPENCL
#include "coul_long_cl.h"
#else
#include "coul_long_ptx.h"
#endif
#include "lal_coul_long.h"
#include <cassert>
using namespace LAMMPS_AL;
#define CoulLongT CoulLong<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
CoulLongT::CoulLong() : BaseCharge<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
CoulLongT::~CoulLong() {
clear();
}
template <class numtyp, class acctyp>
int CoulLongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CoulLongT::init(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
gpu_split,_screen,coul_long);
if (success!=0)
return success;
// We don't have atom types for coulomb-only interactions, but we use
// the minimum of one type so that we can reuse the same infrastructure
// as lj/cut/coul/long/gpu.
int lj_types=1;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_coul[i];
}
ucl_copy(sp_cl,host_write,4,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_cl.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void CoulLongT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_cl.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CoulLongT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CoulLong<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CoulLongT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_cl.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_cl.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class CoulLong<PRECISION,ACC_PRECISION>;

301
lib/gpu/lal_coul_long.cu Normal file
View File

@ -0,0 +1,301 @@
// **************************************************************************
// coul_long.cu
// -------------------
// Axel Kohlmeyer (Temple)
//
// Device code for acceleration of the coul/long pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin : July 2011
// email : a.kohlmeyer@temple.edu
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_cl_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_cl[4];
sp_cl[0]=sp_cl_in[0];
sp_cl[1]=sp_cl_in[1];
sp_cl[2]=sp_cl_in[2];
sp_cl[3]=sp_cl_in[3];
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_coul;
factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq < cut_coulsq) {
numtyp r2inv=ucl_recip(rsq);
numtyp force, prefactor, _erfc;
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += prefactor*(_erfc-factor_coul);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Reduce answers
if (t_per_atom>1) {
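// Tree reduction over the t_per_atom threads that share this atom, with
// the stride halving each step; this relies on t_per_atom being a power
// of two no larger than the warp width so the threads stay in lockstep
// without an explicit barrier.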
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
e_coul=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (offset==0) {
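// engv is laid out as consecutive blocks of length inum: the pair-energy
// block (zeroed here, since this style is coulomb-only), the coulomb
// energy block, then six virial blocks.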
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=(acctyp)0;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
}
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_cl_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_cl[4];
if (tid<4)
sp_cl[tid]=sp_cl_in[tid];
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_coul;
factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq < cut_coulsq) {
numtyp r2inv=ucl_recip(rsq);
numtyp force, prefactor, _erfc;
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += prefactor*(_erfc-factor_coul);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
e_coul=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=(acctyp)0;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
}
} // if ii
}

80
lib/gpu/lal_coul_long.h Normal file
View File

@ -0,0 +1,80 @@
/***************************************************************************
coul_long.h
-------------------
Axel Kohlmeyer (Temple)
Class for acceleration of the coul/long pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : July 2011
email : a.kohlmeyer@temple.edu
***************************************************************************/
#ifndef LAL_COUL_LONG_H
#define LAL_COUL_LONG_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class CoulLong : public BaseCharge<numtyp, acctyp> {
public:
CoulLong();
~CoulLong();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// Dummy lj1 vector (unused; kept so the kernels match the
/// lj/cut/coul/long signature)
UCL_D_Vec<numtyp4> lj1;
/// Dummy lj3 vector (unused)
UCL_D_Vec<numtyp4> lj3;
/// Special Coul values [0-3]
UCL_D_Vec<numtyp> sp_cl;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

123
lib/gpu/lal_coul_long_ext.cpp Normal file
View File

@ -0,0 +1,123 @@
/***************************************************************************
coul_long_ext.cpp
-------------------
Axel Kohlmeyer (Temple)
Functions for LAMMPS access to coul/long acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : July 2011
email : a.kohlmeyer@temple.edu
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_coul_long.h"
using namespace std;
using namespace LAMMPS_AL;
static CoulLong<PRECISION,ACC_PRECISION> CLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int cl_gpu_init(const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald) {
CLMF.clear();
gpu_mode=CLMF.device->gpu_mode();
double gpu_split=CLMF.device->particle_split();
int first_gpu=CLMF.device->first_device();
int last_gpu=CLMF.device->last_device();
int world_me=CLMF.device->world_me();
int gpu_rank=CLMF.device->gpu_rank();
int procs_per_gpu=CLMF.device->procs_per_gpu();
CLMF.device->init_message(screen,"coul/long",first_gpu,last_gpu);
bool message=false;
if (CLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split,
screen, host_cut_coulsq, host_special_coul, qqrd2e,
g_ewald);
CLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split,
screen, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald);
CLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
CLMF.estimate_gpu_overhead();
return init_ok;
}
void cl_gpu_clear() {
CLMF.clear();
}
int** cl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void cl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double cl_gpu_bytes() {
return CLMF.host_memory_usage();
}

640
lib/gpu/lal_device.cpp Normal file
View File

@ -0,0 +1,640 @@
/***************************************************************************
device.cpp
-------------------
W. Michael Brown (ORNL)
Class for management of the device where the computations are performed
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include "lal_device.h"
#include "lal_precision.h"
#include <map>
#include <math.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef USE_OPENCL
#include "device_cl.h"
#else
#include "device_ptx.h"
#endif
using namespace LAMMPS_AL;
#define DeviceT Device<numtyp, acctyp>
template <class numtyp, class acctyp>
DeviceT::Device() : _init_count(0), _device_init(false),
_gpu_mode(GPU_FORCE), _first_device(0),
_last_device(0), _compiled(false) {
}
template <class numtyp, class acctyp>
DeviceT::~Device() {
clear_device();
}
template <class numtyp, class acctyp>
int DeviceT::init_device(MPI_Comm world, MPI_Comm replica,
const int first_gpu, const int last_gpu,
const int gpu_mode, const double p_split,
const int nthreads, const int t_per_atom) {
_nthreads=nthreads;
#ifdef _OPENMP
omp_set_num_threads(nthreads);
#endif
_threads_per_atom=t_per_atom;
_threads_per_charge=t_per_atom;
if (_device_init)
return 0;
_device_init=true;
_comm_world=world;
_comm_replica=replica;
_first_device=first_gpu;
_last_device=last_gpu;
_gpu_mode=gpu_mode;
_particle_split=p_split;
// Get the rank/size within the world
MPI_Comm_rank(_comm_world,&_world_me);
MPI_Comm_size(_comm_world,&_world_size);
// Get the rank/size within the replica
MPI_Comm_rank(_comm_replica,&_replica_me);
MPI_Comm_size(_comm_replica,&_replica_size);
// Get the names of all nodes
int name_length;
char node_name[MPI_MAX_PROCESSOR_NAME];
char node_names[MPI_MAX_PROCESSOR_NAME*_world_size];
MPI_Get_processor_name(node_name,&name_length);
MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world);
std::string node_string=std::string(node_name);
// Get the number of procs per node
std::map<std::string,int> name_map;
std::map<std::string,int>::iterator np;
for (int i=0; i<_world_size; i++) {
std::string i_string=std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]);
np=name_map.find(i_string);
if (np==name_map.end())
name_map[i_string]=1;
else
np->second++;
}
int procs_per_node=name_map.begin()->second;
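// Note: this takes the rank count of the first node in the map and
// assumes every node hosts the same number of MPI ranks.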
// Assign a unique id to each node
int split_num=0, split_id=0;
for (np=name_map.begin(); np!=name_map.end(); ++np) {
if (np->first==node_string)
split_id=split_num;
split_num++;
}
// Set up a per node communicator and find rank within
MPI_Comm node_comm;
MPI_Comm_split(_comm_world, split_id, 0, &node_comm);
int node_rank;
MPI_Comm_rank(node_comm,&node_rank);
// set the device ID
_procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/
(last_gpu-first_gpu+1)));
int my_gpu=node_rank/_procs_per_gpu+first_gpu;
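// Example: 8 ranks per node with GPUs 0-1 requested gives
// _procs_per_gpu=4, so node ranks 0-3 map to GPU 0 and ranks 4-7 to GPU 1.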
// Time on the device only if 1 proc per gpu
_time_device=true;
if (_procs_per_gpu>1)
_time_device=false;
// Set up a per device communicator
MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
MPI_Comm_rank(_comm_gpu,&_gpu_rank);
gpu=new UCL_Device();
if (my_gpu>=gpu->num_devices())
return -2;
if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false)
return -7;
if (gpu->set(my_gpu)!=UCL_SUCCESS)
return -6;
_long_range_precompute=0;
int flag=compile_kernels();
return flag;
}
template <class numtyp, class acctyp>
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
const int host_nlocal, const int nall,
Neighbor *nbor, const int maxspecial,
const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut,
const int threads_per_atom) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
return -5;
// Counts of data transfers for timing overhead estimates
_data_in_estimate=0;
_data_out_estimate=1;
// Initial number of local particles
int ef_nlocal=nlocal;
if (_particle_split<1.0 && _particle_split>0.0)
ef_nlocal=static_cast<int>(_particle_split*nlocal);
int gpu_nbor=0;
if (_gpu_mode==Device<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=1;
else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
#ifdef USE_OPENCL
if (gpu_nbor==1)
gpu_nbor=2;
#endif
if (_init_count==0) {
// Initialize atom and nbor data
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0))
return -3;
_data_in_estimate++;
if (charge)
_data_in_estimate++;
if (rot)
_data_in_estimate++;
} else {
if (atom.charge()==false && charge)
_data_in_estimate++;
if (atom.quat()==false && rot)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial>0))
return -3;
}
if (!ans.init(ef_nlocal,charge,rot,*gpu))
return -3;
if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
*gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
_block_cell_id, _block_nbor_build, threads_per_atom,
_time_device))
return -3;
nbor->cell_size(cell_size);
_init_count++;
return 0;
}
template <class numtyp, class acctyp>
int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal,
const int nall) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
return -5;
if (_init_count==0) {
// Initialize atom and nbor data
if (!atom.init(nall,true,false,*gpu,false,false))
return -3;
} else
if (!atom.add_fields(true,false,false,false))
return -3;
if (!ans.init(nlocal,true,false,*gpu))
return -3;
_init_count++;
return 0;
}
template <class numtyp, class acctyp>
void DeviceT::set_single_precompute
(PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm) {
_long_range_precompute=1;
pppm_single=pppm;
}
template <class numtyp, class acctyp>
void DeviceT::set_double_precompute
(PPPM<numtyp,acctyp,double,_lgpu_double4> *pppm) {
_long_range_precompute=2;
pppm_double=pppm;
}
template <class numtyp, class acctyp>
void DeviceT::init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu) {
#ifdef USE_OPENCL
std::string fs="";
#else
std::string fs=toa(gpu->free_gigabytes())+"/";
#endif
if (_replica_me == 0 && screen) {
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"-------------------------------------\n");
fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu);
#ifdef _OPENMP
fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
#endif
#ifdef USE_OPENCL
fprintf(screen,"- with OpenCL Parameters for: %s\n",OCL_VENDOR);
#endif
fprintf(screen,"-------------------------------------");
fprintf(screen,"-------------------------------------\n");
int last=last_gpu+1;
if (last>gpu->num_devices())
last=gpu->num_devices();
for (int i=first_gpu; i<last; i++) {
std::string sname;
if (i==first_gpu)
sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+" GHz (";
else
sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
toa(gpu->clock_rate(i))+" GHz (";
if (sizeof(PRECISION)==4) {
if (sizeof(ACC_PRECISION)==4)
sname+="Single Precision)";
else
sname+="Mixed Precision)";
} else
sname+="Double Precision)";
fprintf(screen,"GPU %d: %s\n",i,sname.c_str());
}
fprintf(screen,"-------------------------------------");
fprintf(screen,"-------------------------------------\n\n");
}
}
template <class numtyp, class acctyp>
void DeviceT::estimate_gpu_overhead(const int kernel_calls,
double &gpu_overhead,
double &gpu_driver_overhead) {
UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL;
UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL;
UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL;
UCL_Timer over_timer(*gpu);
if (_data_in_estimate>0) {
host_data_in=new UCL_H_Vec<int>[_data_in_estimate];
dev_data_in=new UCL_D_Vec<int>[_data_in_estimate];
timers_in=new UCL_Timer[_data_in_estimate];
}
if (_data_out_estimate>0) {
host_data_out=new UCL_H_Vec<int>[_data_out_estimate];
dev_data_out=new UCL_D_Vec<int>[_data_out_estimate];
timers_out=new UCL_Timer[_data_out_estimate];
}
if (kernel_calls>0) {
kernel_data=new UCL_D_Vec<int>[kernel_calls];
timers_kernel=new UCL_Timer[kernel_calls];
}
for (int i=0; i<_data_in_estimate; i++) {
host_data_in[i].alloc(1,*gpu);
dev_data_in[i].alloc(1,*gpu);
timers_in[i].init(*gpu);
}
for (int i=0; i<_data_out_estimate; i++) {
host_data_out[i].alloc(1,*gpu);
dev_data_out[i].alloc(1,*gpu);
timers_out[i].init(*gpu);
}
for (int i=0; i<kernel_calls; i++) {
kernel_data[i].alloc(1,*gpu);
timers_kernel[i].init(*gpu);
}
gpu_overhead=0.0;
gpu_driver_overhead=0.0;
for (int itrial=0; itrial<10; itrial++) {
gpu->sync();
gpu_barrier();
over_timer.start();
gpu->sync();
gpu_barrier();
double driver_time=MPI_Wtime();
for (int i=0; i<_data_in_estimate; i++) {
timers_in[i].start();
ucl_copy(dev_data_in[i],host_data_in[i],true);
timers_in[i].stop();
}
for (int i=0; i<kernel_calls; i++) {
timers_kernel[i].start();
zero(kernel_data[i],1);
timers_kernel[i].stop();
}
for (int i=0; i<_data_out_estimate; i++) {
timers_out[i].start();
ucl_copy(host_data_out[i],dev_data_out[i],true);
timers_out[i].stop();
}
over_timer.stop();
double time=over_timer.seconds();
driver_time=MPI_Wtime()-driver_time;
if (time_device()) {
for (int i=0; i<_data_in_estimate; i++)
timers_in[i].add_to_total();
for (int i=0; i<kernel_calls; i++)
timers_kernel[i].add_to_total();
for (int i=0; i<_data_out_estimate; i++)
timers_out[i].add_to_total();
}
double mpi_time, mpi_driver_time;
MPI_Allreduce(&time,&mpi_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
MPI_Allreduce(&driver_time,&mpi_driver_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
gpu_overhead+=mpi_time;
gpu_driver_overhead+=mpi_driver_time;
}
gpu_overhead/=10.0;
gpu_driver_overhead/=10.0;
if (_data_in_estimate>0) {
delete [] host_data_in;
delete [] dev_data_in;
delete [] timers_in;
}
if (_data_out_estimate>0) {
delete [] host_data_out;
delete [] dev_data_out;
delete [] timers_out;
}
if (kernel_calls>0) {
delete [] kernel_data;
delete [] timers_kernel;
}
}
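// Note on the estimate above: each of the 10 trials times one round of
// host-to-device copies, kernel launches, and device-to-host copies, and
// the per-trial times are reduced with MPI_MAX across the procs sharing a
// device. The reported gpu_overhead and gpu_driver_overhead are therefore
// the 10-trial averages of the slowest proc's device time and driver wall
// time per timestep.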
template <class numtyp, class acctyp>
void DeviceT::output_times(UCL_Timer &time_pair,
Answer<numtyp,acctyp> &ans,
Neighbor &nbor, const double avg_split,
const double max_bytes,
const double gpu_overhead,
const double driver_overhead,
const int threads_per_atom, FILE *screen) {
double single[9], times[9];
single[0]=atom.transfer_time()+ans.transfer_time();
single[1]=nbor.time_nbor.total_seconds()+nbor.time_hybrid1.total_seconds()+
nbor.time_hybrid2.total_seconds();
single[2]=nbor.time_kernel.total_seconds();
single[3]=time_pair.total_seconds();
single[4]=atom.cast_time()+ans.cast_time();
single[5]=gpu_overhead;
single[6]=driver_overhead;
single[7]=ans.cpu_idle_time();
single[8]=nbor.bin_time();
MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
double my_max_bytes=max_bytes+atom.max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0);
if (replica_me()==0)
if (screen && times[5]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (time_device()) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size);
if (nbor.gpu_nbor()>0)
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_replica_size);
else
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size);
}
if (nbor.gpu_nbor()==2)
fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[8]/_replica_size);
fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",threads_per_atom);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
}
}
template <class numtyp, class acctyp>
void DeviceT::output_kspace_times(UCL_Timer &time_in,
UCL_Timer &time_out,
UCL_Timer &time_map,
UCL_Timer &time_rho,
UCL_Timer &time_interp,
Answer<numtyp,acctyp> &ans,
const double max_bytes,
const double cpu_time,
const double idle_time, FILE *screen) {
double single[8], times[8];
single[0]=time_out.total_seconds();
single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time();
single[2]=time_map.total_seconds();
single[3]=time_rho.total_seconds();
single[4]=time_interp.total_seconds();
single[5]=ans.transfer_time()+ans.cast_time();
single[6]=cpu_time;
single[7]=idle_time;
MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
double my_max_bytes=max_bytes+atom.max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0);
if (replica_me()==0)
if (screen && times[6]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (time_device()) {
fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size);
fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size);
fprintf(screen,"Kernel (rho): %.4f s.\n",times[3]/_replica_size);
fprintf(screen,"Force interp: %.4f s.\n",times[4]/_replica_size);
fprintf(screen,"Total rho: %.4f s.\n",
(times[0]+times[2]+times[3])/_replica_size);
fprintf(screen,"Total interp: %.4f s.\n",
(times[1]+times[4])/_replica_size);
fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size);
fprintf(screen,"Total: %.4f s.\n",
(times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/
_replica_size);
}
fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size);
fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
}
}
template <class numtyp, class acctyp>
void DeviceT::clear() {
if (_init_count>0) {
_long_range_precompute=0;
_init_count--;
if (_init_count==0) {
atom.clear();
_neighbor_shared.clear();
if (_compiled) {
k_zero.clear();
k_info.clear();
delete dev_program;
_compiled=false;
}
}
}
}
template <class numtyp, class acctyp>
void DeviceT::clear_device() {
while (_init_count>0)
clear();
if (_device_init) {
delete gpu;
_device_init=false;
}
}
template <class numtyp, class acctyp>
int DeviceT::compile_kernels() {
int flag=0;
if (_compiled)
return flag;
std::string flags="-cl-mad-enable -D"+std::string(OCL_VENDOR);
dev_program=new UCL_Program(*gpu);
int success=dev_program->load_string(device,flags.c_str());
if (success!=UCL_SUCCESS)
return -4;
k_zero.set_function(*dev_program,"kernel_zero");
k_info.set_function(*dev_program,"kernel_info");
_compiled=true;
UCL_H_Vec<int> h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
UCL_D_Vec<int> d_gpu_lib_data(14,*gpu);
k_info.set_size(1,1);
k_info.run(&d_gpu_lib_data.begin());
ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false);
_ptx_arch=static_cast<double>(h_gpu_lib_data[0])/100.0;
#ifndef USE_OPENCL
if (_ptx_arch>gpu->arch())
return -4;
#endif
_num_mem_threads=h_gpu_lib_data[1];
_warp_size=h_gpu_lib_data[2];
if (_threads_per_atom<1)
_threads_per_atom=h_gpu_lib_data[3];
if (_threads_per_charge<1)
_threads_per_charge=h_gpu_lib_data[13];
_pppm_max_spline=h_gpu_lib_data[4];
_pppm_block=h_gpu_lib_data[5];
_block_pair=h_gpu_lib_data[6];
_max_shared_types=h_gpu_lib_data[7];
_block_cell_2d=h_gpu_lib_data[8];
_block_cell_id=h_gpu_lib_data[9];
_block_nbor_build=h_gpu_lib_data[10];
_block_bio_pair=h_gpu_lib_data[11];
_max_bio_shared_types=h_gpu_lib_data[12];
if (static_cast<size_t>(_block_pair)>gpu->group_size())
_block_pair=gpu->group_size();
if (static_cast<size_t>(_block_bio_pair)>gpu->group_size())
_block_bio_pair=gpu->group_size();
if (_threads_per_atom>_warp_size)
_threads_per_atom=_warp_size;
if (_warp_size%_threads_per_atom!=0)
_threads_per_atom=1;
if (_threads_per_atom & (_threads_per_atom - 1))
_threads_per_atom=1;
if (_threads_per_charge>_warp_size)
_threads_per_charge=_warp_size;
if (_warp_size%_threads_per_charge!=0)
_threads_per_charge=1;
if (_threads_per_charge & (_threads_per_charge - 1))
_threads_per_charge=1;
return flag;
}
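// Illustrative sketch (not library code): the clamping at the end of
// compile_kernels() keeps threads-per-atom a power of two that divides
// the warp size; x&(x-1) is nonzero exactly when x is not a power of two.
// A standalone equivalent, with assumed names:
//   inline int clamp_t_per_atom(int t, int warp) {
//     if (t>warp) t=warp;
//     if (warp%t!=0 || (t&(t-1))!=0) t=1;   // e.g. t=3, warp=32 -> t=1
//     return t;
//   }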
template <class numtyp, class acctyp>
double DeviceT::host_memory_usage() const {
return atom.host_memory_usage()+4*sizeof(numtyp)+
sizeof(Device<numtyp,acctyp>);
}
template class Device<PRECISION,ACC_PRECISION>;
Device<PRECISION,ACC_PRECISION> global_device;
int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom) {
return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
particle_split,nthreads,t_per_atom);
}
void lmp_clear_device() {
global_device.clear_device();
}
double lmp_gpu_forces(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
}
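// Usage sketch for the C-style interface above (all values assumed):
// GPU_FORCE is 0 in the Device mode enum; one device, one host thread,
// and one thread per atom are requested here.
//   int err=lmp_init_device(MPI_COMM_WORLD,MPI_COMM_WORLD,0,0,
//                           /*gpu_mode=*/0,/*particle_split=*/1.0,
//                           /*nthreads=*/1,/*t_per_atom=*/1);
//   if (err!=0) { /* see the error codes documented in lal_device.h */ }
//   ...
//   lmp_clear_device();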

42
lib/gpu/lal_device.cu Normal file
@ -0,0 +1,42 @@
// **************************************************************************
// device.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for device information
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
#endif
__kernel void kernel_zero(__global int *mem, int numel) {
int ii=GLOBAL_ID_X;
if (ii<numel)
mem[ii]=0;
}
__kernel void kernel_info(__global int *info) {
info[0]=ARCH;
info[1]=MEM_THREADS;
info[2]=WARP_SIZE;
info[3]=THREADS_PER_ATOM;
info[4]=PPPM_MAX_SPLINE;
info[5]=PPPM_BLOCK_1D;
info[6]=BLOCK_PAIR;
info[7]=MAX_SHARED_TYPES;
info[8]=BLOCK_CELL_2D;
info[9]=BLOCK_CELL_ID;
info[10]=BLOCK_NBOR_BUILD;
info[11]=BLOCK_BIO_PAIR;
info[12]=MAX_BIO_SHARED_TYPES;
info[13]=THREADS_PER_CHARGE;
}

317
lib/gpu/lal_device.h Normal file
@ -0,0 +1,317 @@
/***************************************************************************
device.h
-------------------
W. Michael Brown (ORNL)
Class for management of the device where the computations are performed
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_DEVICE_H
#define LAL_DEVICE_H
#include "lal_atom.h"
#include "lal_answer.h"
#include "lal_neighbor.h"
#include "lal_pppm.h"
#include "mpi.h"
#include <sstream>
#include "stdio.h"
#include <string>
#include <queue>
namespace LAMMPS_AL {
template <class numtyp, class acctyp,
class grdtyp, class grdtyp4> class PPPM;
template <class numtyp, class acctyp>
class Device {
public:
Device();
~Device();
/// Initialize the device for use by this process
/** Sets up a per-device MPI communicator for load balancing and initializes
* the device (>=first_gpu and <=last_gpu) that this proc will be using
* Returns:
* - 0 if successful
* - -2 if GPU not found
* - -4 if GPU library not compiled for GPU
* - -6 if GPU could not be initialized for use
* - -7 if accelerator sharing is not currently allowed on system **/
int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom);
/// Initialize the device for Atom and Neighbor storage
/** \param charge True if charges need to be stored
* \param rot True if quaternions need to be stored
* \param nlocal Total number of local particles to allocate memory for
* \param host_nlocal Initial number of host particles to allocate memory for
* \param nall Total number of local+ghost particles
* \param gpu_host 0 if host will not perform force calculations,
* 1 if gpu_nbor is true, and host needs a half nbor list,
* 2 if gpu_nbor is true, and host needs a full nbor list
* \param max_nbors Initial number of rows in the neighbor matrix
* \param cell_size cutoff+skin
* \param pre_cut True if cutoff test will be performed in separate kernel
* than the force kernel
* \param threads_per_atom value to be used by the neighbor list only
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &a, const bool charge, const bool rot,
const int nlocal, const int host_nlocal, const int nall,
Neighbor *nbor, const int maxspecial, const int gpu_host,
const int max_nbors, const double cell_size, const bool pre_cut,
const int threads_per_atom);
/// Initialize the device for Atom storage only
/** \param nlocal Total number of local particles to allocate memory for
* \param nall Total number of local+ghost particles
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);
/// Output a message for pair_style acceleration with device stats
void init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu);
/// Perform charge assignment asynchronously for PPPM
void set_single_precompute(PPPM<numtyp,acctyp,
float,_lgpu_float4> *pppm);
/// Perform charge assignment asynchronously for PPPM
void set_double_precompute(PPPM<numtyp,acctyp,
double,_lgpu_double4> *pppm);
/// Estimate the overhead of GPU calls from multiple procs
/** \param kernel_calls Number of kernel calls/timestep for timing estimated
* overhead
* \param gpu_overhead Estimated gpu overhead per timestep (sec)
* \param gpu_driver_overhead Estimated overhead from the driver per timestep (s) **/
void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead,
double &gpu_driver_overhead);
/// Returns true if double precision is supported on card
inline bool double_precision() { return gpu->double_precision(); }
/// Output a message with timing information
void output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
Neighbor &nbor, const double avg_split,
const double max_bytes, const double gpu_overhead,
const double driver_overhead,
const int threads_per_atom, FILE *screen);
/// Output a message with timing information
void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
UCL_Timer & time_map, UCL_Timer & time_rho,
UCL_Timer &time_interp,
Answer<numtyp,acctyp> &ans,
const double max_bytes, const double cpu_time,
const double cpu_idle_time, FILE *screen);
/// Clear all memory on host and device associated with atom and nbor data
void clear();
/// Clear all memory on host and device
void clear_device();
/// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
inline void add_ans_object(Answer<numtyp,acctyp> *ans)
{ ans_queue.push(ans); }
/// Add "answers" (force,energies,etc.) into LAMMPS structures
inline double fix_gpu(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
atom.data_unavail();
if (ans_queue.empty()==false) {
stop_host_timer();
double evdw=0.0;
while (ans_queue.empty()==false) {
evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
ans_queue.pop();
}
return evdw;
}
return 0.0;
}
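// Usage note (illustrative): a pair style first queues its Answer object
// via add_ans_object(); a later fix_gpu() call then drains the queue,
// accumulates every queued answer into the LAMMPS arrays, and returns
// the summed van der Waals energy (0.0 if nothing was queued).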
/// Start timer on host
inline void start_host_timer()
{ _cpu_full=MPI_Wtime(); _host_timer_started=true; }
/// Stop timer on host
inline void stop_host_timer() {
if (_host_timer_started) {
_cpu_full=MPI_Wtime()-_cpu_full;
_host_timer_started=false;
}
}
/// Return host time
inline double host_time() { return _cpu_full; }
/// Return host memory usage in bytes
double host_memory_usage() const;
/// Return the number of procs sharing a device (size of device communicator)
inline int procs_per_gpu() const { return _procs_per_gpu; }
/// Return the number of threads per proc
inline int num_threads() const { return _nthreads; }
/// My rank within all processes
inline int world_me() const { return _world_me; }
/// Total number of processes
inline int world_size() const { return _world_size; }
/// MPI Barrier for world
inline void world_barrier() { MPI_Barrier(_comm_world); }
/// Return the replica MPI communicator
inline MPI_Comm & replica() { return _comm_replica; }
/// My rank within replica communicator
inline int replica_me() const { return _replica_me; }
/// Number of procs in replica communicator
inline int replica_size() const { return _replica_size; }
/// Return the per-GPU MPI communicator
inline MPI_Comm & gpu_comm() { return _comm_gpu; }
/// Return my rank in the device communicator
inline int gpu_rank() const { return _gpu_rank; }
/// MPI Barrier for gpu
inline void gpu_barrier() { MPI_Barrier(_comm_gpu); }
/// Return the 'mode' for acceleration: GPU_FORCE, GPU_NEIGH or GPU_HYB_NEIGH
inline int gpu_mode() const { return _gpu_mode; }
/// Index of first device used by a node
inline int first_device() const { return _first_device; }
/// Index of last device used by a node
inline int last_device() const { return _last_device; }
/// Particle split defined in fix
inline double particle_split() const { return _particle_split; }
/// Return the initialization count for the device
inline int init_count() const { return _init_count; }
/// True if device is being timed
inline bool time_device() const { return _time_device; }
/// Return the number of threads accessing memory simultaneously
inline int num_mem_threads() const { return _num_mem_threads; }
/// Return the number of threads per atom for pair styles
inline int threads_per_atom() const { return _threads_per_atom; }
/// Return the number of threads per atom for pair styles using charge
inline int threads_per_charge() const { return _threads_per_charge; }
/// Return the min of the pair block size or the device max block size
inline int pair_block_size() const { return _block_pair; }
/// Return the maximum number of atom types that can be used with shared mem
inline int max_shared_types() const { return _max_shared_types; }
/// Return the maximum order for PPPM splines
inline int pppm_max_spline() const { return _pppm_max_spline; }
/// Return the block size for PPPM kernels
inline int pppm_block() const { return _pppm_block; }
/// Return the block size for neighbor binning
inline int block_cell_2d() const { return _block_cell_2d; }
/// Return the block size for atom mapping for neighbor builds
inline int block_cell_id() const { return _block_cell_id; }
/// Return the block size for neighbor build kernel
inline int block_nbor_build() const { return _block_nbor_build; }
/// Return the block size for "bio" pair styles
inline int block_bio_pair() const { return _block_bio_pair; }
/// Return the maximum number of atom types for shared mem with "bio" styles
inline int max_bio_shared_types() const { return _max_bio_shared_types; }
/// GPU architecture the code was compiled for (returns 0 for OpenCL)
inline double ptx_arch() const { return _ptx_arch; }
// -------------------- SHARED DEVICE ROUTINES --------------------
// Perform asynchronous zero of integer array
void zero(UCL_D_Vec<int> &mem, const int numel) {
int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
_block_pair));
k_zero.set_size(num_blocks,_block_pair);
k_zero.run(&mem.begin(),&numel);
}
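// Example (assumed sizes): with _block_pair=128, zeroing numel=1000
// integers launches ceil(1000/128)=8 blocks of 128 threads; threads
// with ii>=numel fail the bounds test in kernel_zero and write nothing.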
// -------------------------- DEVICE DATA -------------------------
/// Geryon Device
UCL_Device *gpu;
enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH};
// --------------------------- ATOM DATA --------------------------
/// Atom Data
Atom<numtyp,acctyp> atom;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor Data
NeighborShared _neighbor_shared;
// ------------------------ LONG RANGE DATA -------------------------
// Long Range Data
int _long_range_precompute;
PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm_single;
PPPM<numtyp,acctyp,double,_lgpu_double4> *pppm_double;
/// Precomputations for long range charge assignment (asynchronously)
inline void precompute(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *charge, double *boxlo, double *prd) {
if (_long_range_precompute==1)
pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
boxlo,prd);
else if (_long_range_precompute==2)
pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
boxlo,prd);
}
private:
std::queue<Answer<numtyp,acctyp> *> ans_queue;
int _init_count;
bool _device_init, _host_timer_started, _time_device;
MPI_Comm _comm_world, _comm_replica, _comm_gpu;
int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
_replica_size;
int _gpu_mode, _first_device, _last_device, _nthreads;
double _particle_split;
double _cpu_full;
double _ptx_arch;
int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
int _pppm_max_spline, _pppm_block;
int _block_pair, _max_shared_types;
int _block_cell_2d, _block_cell_id, _block_nbor_build;
int _block_bio_pair, _max_bio_shared_types;
UCL_Program *dev_program;
UCL_Kernel k_zero, k_info;
bool _compiled;
int compile_kernels();
int _data_in_estimate, _data_out_estimate;
template <class t>
inline std::string toa(const t& in) {
std::ostringstream o;
o.precision(2);
o << in;
return o.str();
}
};
}
#endif

539
lib/gpu/lal_ellipsoid_extra.h Normal file
@ -0,0 +1,539 @@
// **************************************************************************
// ellipsoid_extra.h
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for Ellipsoid math routines
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifndef LAL_ELLIPSOID_EXTRA_H
#define LAL_ELLIPSOID_EXTRA_H
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
#endif
#define atom_info(t_per_atom, ii, tid, offset) \
tid=THREAD_ID_X; \
offset=tid & (t_per_atom-1); \
ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
#define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset, \
i, numj, stride, list_end, nbor) \
nbor=nbor_mem+ii; \
i=*nbor; \
nbor+=nbor_stride; \
numj=*nbor; \
nbor+=nbor_stride; \
list_end=nbor+fast_mul(nbor_stride,numj); \
nbor+=fast_mul(offset,nbor_stride); \
stride=fast_mul(t_per_atom,nbor_stride);
#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
__local acctyp red_acc[6][BLOCK_PAIR]; \
red_acc[0][tid]=f.x; \
red_acc[1][tid]=f.y; \
red_acc[2][tid]=f.z; \
red_acc[3][tid]=energy; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<4; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
f.x=red_acc[0][tid]; \
f.y=red_acc[1][tid]; \
f.z=red_acc[2][tid]; \
energy=red_acc[3][tid]; \
if (vflag>0) { \
for (int r=0; r<6; r++) \
red_acc[r][tid]=virial[r]; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<6; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
for (int r=0; r<6; r++) \
virial[r]=red_acc[r][tid]; \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
engv+=inum; \
} \
} \
ans[ii]=f; \
}
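// Illustrative sketch (not library code): the s-loops in these macros
// perform a strided tree reduction over the t_per_atom threads that
// share one atom. A scalar analogue, assuming v[] holds one partial
// sum per thread and n==t_per_atom is a power of two:
//   for (unsigned int s=n/2; s>0; s>>=1)
//     for (unsigned int t=0; t<s; t++)
//       v[t]+=v[t+s];            // v[0] ends up holding the total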
#define store_answers_t(f, tor, energy, virial, ii, astride, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
__local acctyp red_acc[7][BLOCK_PAIR]; \
red_acc[0][tid]=f.x; \
red_acc[1][tid]=f.y; \
red_acc[2][tid]=f.z; \
red_acc[3][tid]=tor.x; \
red_acc[4][tid]=tor.y; \
red_acc[5][tid]=tor.z; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<6; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
f.x=red_acc[0][tid]; \
f.y=red_acc[1][tid]; \
f.z=red_acc[2][tid]; \
tor.x=red_acc[3][tid]; \
tor.y=red_acc[4][tid]; \
tor.z=red_acc[5][tid]; \
if (eflag>0 || vflag>0) { \
for (int r=0; r<6; r++) \
red_acc[r][tid]=virial[r]; \
red_acc[6][tid]=energy; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<7; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
for (int r=0; r<6; r++) \
virial[r]=red_acc[r][tid]; \
energy=red_acc[6][tid]; \
} \
} \
if (offset==0) { \
__global acctyp *ap1=engv+ii; \
if (eflag>0) { \
*ap1=energy; \
ap1+=astride; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*ap1=virial[i]; \
ap1+=astride; \
} \
} \
ans[ii]=f; \
ans[ii+astride]=tor; \
}
#define acc_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
__local acctyp red_acc[6][BLOCK_PAIR]; \
red_acc[0][tid]=f.x; \
red_acc[1][tid]=f.y; \
red_acc[2][tid]=f.z; \
red_acc[3][tid]=energy; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<4; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
f.x=red_acc[0][tid]; \
f.y=red_acc[1][tid]; \
f.z=red_acc[2][tid]; \
energy=red_acc[3][tid]; \
if (vflag>0) { \
for (int r=0; r<6; r++) \
red_acc[r][tid]=virial[r]; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<6; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
for (int r=0; r<6; r++) \
virial[r]=red_acc[r][tid]; \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv+=energy; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv+=virial[i]; \
engv+=inum; \
} \
} \
acctyp4 old=ans[ii]; \
old.x+=f.x; \
old.y+=f.y; \
old.z+=f.z; \
ans[ii]=old; \
}
/* ----------------------------------------------------------------------
dot product of 2 vectors
------------------------------------------------------------------------- */
ucl_inline numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
{
return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2];
}
/* ----------------------------------------------------------------------
cross product of 2 vectors
------------------------------------------------------------------------- */
ucl_inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans)
{
ans[0] = v1[1]*v2[2]-v1[2]*v2[1];
ans[1] = v1[2]*v2[0]-v1[0]*v2[2];
ans[2] = v1[0]*v2[1]-v1[1]*v2[0];
}
/* ----------------------------------------------------------------------
determinant of a matrix
------------------------------------------------------------------------- */
ucl_inline numtyp gpu_det3(const numtyp m[9])
{
numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] -
m[3]*m[1]*m[8] + m[3]*m[2]*m[7] +
m[6]*m[1]*m[5] - m[6]*m[2]*m[4];
return ans;
}
/* ----------------------------------------------------------------------
diagonal matrix times a full matrix
------------------------------------------------------------------------- */
ucl_inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9],
numtyp ans[9])
{
ans[0] = shape.x*m[0];
ans[1] = shape.x*m[1];
ans[2] = shape.x*m[2];
ans[3] = shape.y*m[3];
ans[4] = shape.y*m[4];
ans[5] = shape.y*m[5];
ans[6] = shape.z*m[6];
ans[7] = shape.z*m[7];
ans[8] = shape.z*m[8];
}
/* ----------------------------------------------------------------------
add two matrices
------------------------------------------------------------------------- */
ucl_inline void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9])
{
ans[0] = m[0]+m2[0];
ans[1] = m[1]+m2[1];
ans[2] = m[2]+m2[2];
ans[3] = m[3]+m2[3];
ans[4] = m[4]+m2[4];
ans[5] = m[5]+m2[5];
ans[6] = m[6]+m2[6];
ans[7] = m[7]+m2[7];
ans[8] = m[8]+m2[8];
}
/* ----------------------------------------------------------------------
multiply the transpose of mat1 times mat2
------------------------------------------------------------------------- */
ucl_inline void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9],
numtyp ans[9])
{
ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6];
ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7];
ans[2] = m[0]*m2[2]+m[3]*m2[5]+m[6]*m2[8];
ans[3] = m[1]*m2[0]+m[4]*m2[3]+m[7]*m2[6];
ans[4] = m[1]*m2[1]+m[4]*m2[4]+m[7]*m2[7];
ans[5] = m[1]*m2[2]+m[4]*m2[5]+m[7]*m2[8];
ans[6] = m[2]*m2[0]+m[5]*m2[3]+m[8]*m2[6];
ans[7] = m[2]*m2[1]+m[5]*m2[4]+m[8]*m2[7];
ans[8] = m[2]*m2[2]+m[5]*m2[5]+m[8]*m2[8];
}
/* ----------------------------------------------------------------------
row vector times matrix
------------------------------------------------------------------------- */
ucl_inline void gpu_row_times3(const numtyp *v, const numtyp m[9], numtyp *ans)
{
ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6];
ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7];
ans[2] = v[0]*m[2]+v[1]*m[5]+m[8]*v[2];
}
/* ----------------------------------------------------------------------
solve Ax = b or M ans = v
use gaussian elimination & partial pivoting on matrix
error_flag set to 2 if bad matrix inversion attempted
------------------------------------------------------------------------- */
ucl_inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans,
__global int *error_flag)
{
// create augmented matrix for pivoting
numtyp aug[12], t;
aug[3] = v[0];
aug[0] = m[0];
aug[1] = m[1];
aug[2] = m[2];
aug[7] = v[1];
aug[4] = m[3];
aug[5] = m[4];
aug[6] = m[5];
aug[11] = v[2];
aug[8] = m[6];
aug[9] = m[7];
aug[10] = m[8];
if (ucl_abs(aug[4]) > ucl_abs(aug[0])) {
numtyp swapt;
swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
}
if (ucl_abs(aug[8]) > ucl_abs(aug[0])) {
numtyp swapt;
swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
}
if (aug[0] != (numtyp)0.0) {
// pivot is already in the first row
} else if (aug[4] != (numtyp)0.0) {
numtyp swapt;
swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
} else if (aug[8] != (numtyp)0.0) {
numtyp swapt;
swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
} else
*error_flag=2;
t = aug[4]/aug[0];
aug[5]-=t*aug[1];
aug[6]-=t*aug[2];
aug[7]-=t*aug[3];
t = aug[8]/aug[0];
aug[9]-=t*aug[1];
aug[10]-=t*aug[2];
aug[11]-=t*aug[3];
if (ucl_abs(aug[9]) > ucl_abs(aug[5])) {
numtyp swapt;
swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
}
if (aug[5] != (numtyp)0.0) {
// pivot is already in the second row
} else if (aug[9] != (numtyp)0.0) {
numtyp swapt;
swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
}
t = aug[9]/aug[5];
aug[10]-=t*aug[6];
aug[11]-=t*aug[7];
if (aug[10] == (numtyp)0.0)
*error_flag=2;
ans[2] = aug[11]/aug[10];
t = (numtyp)0.0;
t += aug[6]*ans[2];
ans[1] = (aug[7]-t) / aug[5];
t = (numtyp)0.0;
t += aug[1]*ans[1];
t += aug[2]*ans[2];
ans[0] = (aug[3]-t) / aug[0];
}
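/* Worked example for gpu_mldivide3 (numbers assumed): for
   m = [2 0 0; 0 4 0; 0 0 5] and v = (2,8,15), no row swap triggers,
   elimination leaves the system triangular, and back substitution
   returns ans = (1,2,3). */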
/* ----------------------------------------------------------------------
compute rotation matrix from quaternion conjugate
quat = [w i j k]
------------------------------------------------------------------------- */
ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi,
numtyp mat[9])
{
numtyp4 q=qif[qi];
numtyp w2 = q.x*q.x;
numtyp i2 = q.y*q.y;
numtyp j2 = q.z*q.z;
numtyp k2 = q.w*q.w;
numtyp twoij = (numtyp)2.0*q.y*q.z;
numtyp twoik = (numtyp)2.0*q.y*q.w;
numtyp twojk = (numtyp)2.0*q.z*q.w;
numtyp twoiw = (numtyp)2.0*q.y*q.x;
numtyp twojw = (numtyp)2.0*q.z*q.x;
numtyp twokw = (numtyp)2.0*q.w*q.x;
mat[0] = w2+i2-j2-k2;
mat[3] = twoij-twokw;
mat[6] = twojw+twoik;
mat[1] = twoij+twokw;
mat[4] = w2-i2+j2-k2;
mat[7] = twojk-twoiw;
mat[2] = twoik-twojw;
mat[5] = twojk+twoiw;
mat[8] = w2-i2-j2+k2;
}
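/* Sanity check (values assumed): the identity quaternion q=(1,0,0,0)
   gives w2=1 and zero cross terms, so mat reduces to the 3x3 identity;
   for a general unit quaternion the result is the transpose of the
   usual quaternion rotation matrix, as the name suggests. */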
/* ----------------------------------------------------------------------
transposed matrix times diagonal matrix
------------------------------------------------------------------------- */
ucl_inline void gpu_transpose_times_diag3(const numtyp m[9],
const numtyp4 d, numtyp ans[9])
{
ans[0] = m[0]*d.x;
ans[1] = m[3]*d.y;
ans[2] = m[6]*d.z;
ans[3] = m[1]*d.x;
ans[4] = m[4]*d.y;
ans[5] = m[7]*d.z;
ans[6] = m[2]*d.x;
ans[7] = m[5]*d.y;
ans[8] = m[8]*d.z;
}
/* ----------------------------------------------------------------------
multiply mat1 times mat2
------------------------------------------------------------------------- */
ucl_inline void gpu_times3(const numtyp m[9], const numtyp m2[9],
numtyp ans[9])
{
ans[0] = m[0]*m2[0] + m[1]*m2[3] + m[2]*m2[6];
ans[1] = m[0]*m2[1] + m[1]*m2[4] + m[2]*m2[7];
ans[2] = m[0]*m2[2] + m[1]*m2[5] + m[2]*m2[8];
ans[3] = m[3]*m2[0] + m[4]*m2[3] + m[5]*m2[6];
ans[4] = m[3]*m2[1] + m[4]*m2[4] + m[5]*m2[7];
ans[5] = m[3]*m2[2] + m[4]*m2[5] + m[5]*m2[8];
ans[6] = m[6]*m2[0] + m[7]*m2[3] + m[8]*m2[6];
ans[7] = m[6]*m2[1] + m[7]*m2[4] + m[8]*m2[7];
ans[8] = m[6]*m2[2] + m[7]*m2[5] + m[8]*m2[8];
}
/* ----------------------------------------------------------------------
Apply principal rotation generator about x to rotation matrix m
------------------------------------------------------------------------- */
ucl_inline void gpu_rotation_generator_x(const numtyp m[9], numtyp ans[9])
{
ans[0] = 0;
ans[1] = -m[2];
ans[2] = m[1];
ans[3] = 0;
ans[4] = -m[5];
ans[5] = m[4];
ans[6] = 0;
ans[7] = -m[8];
ans[8] = m[7];
}
/* ----------------------------------------------------------------------
Apply principal rotation generator about y to rotation matrix m
------------------------------------------------------------------------- */
ucl_inline void gpu_rotation_generator_y(const numtyp m[9], numtyp ans[9])
{
ans[0] = m[2];
ans[1] = 0;
ans[2] = -m[0];
ans[3] = m[5];
ans[4] = 0;
ans[5] = -m[3];
ans[6] = m[8];
ans[7] = 0;
ans[8] = -m[6];
}
/* ----------------------------------------------------------------------
Apply principal rotation generator about z to rotation matrix m
------------------------------------------------------------------------- */
ucl_inline void gpu_rotation_generator_z(const numtyp m[9], numtyp ans[9])
{
ans[0] = -m[1];
ans[1] = m[0];
ans[2] = 0;
ans[3] = -m[4];
ans[4] = m[3];
ans[5] = 0;
ans[6] = -m[7];
ans[7] = m[6];
ans[8] = 0;
}
/* ----------------------------------------------------------------------
matrix times vector
------------------------------------------------------------------------- */
ucl_inline void gpu_times_column3(const numtyp m[9], const numtyp v[3],
numtyp ans[3])
{
ans[0] = m[0]*v[0] + m[1]*v[1] + m[2]*v[2];
ans[1] = m[3]*v[0] + m[4]*v[1] + m[5]*v[2];
ans[2] = m[6]*v[0] + m[7]*v[1] + m[8]*v[2];
}
#endif

135
lib/gpu/lal_ellipsoid_nbor.cu Normal file
@ -0,0 +1,135 @@
// **************************************************************************
// ellipsoid_nbor.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for Ellipsoid neighbor routines
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
#endif
// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// ---------------------------------------------------------------------------
__kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
const int ntypes, __global int *dev_nbor,
const int nbor_pitch, const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high) {
// ii indexes the neighbor list entry (i particle) handled by this thread
int ii=GLOBAL_ID_X+start;
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+fast_mul(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=fast_mul(iw,ntypes);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
numtyp2 cf=cut_form[mtype];
if (cf.y>=form_low && cf.y<=form_high) {
// Compute r12
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cf.x) {
*packed=j;
packed+=nbor_pitch;
newj++;
}
}
}
dev_nbor[ii+nbor_pitch]=newj;
}
}
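// Layout note for the loop above: dev_nbor is stored column-wise with
// stride nbor_pitch, so for list entry ii the atom index is dev_nbor[ii],
// the re-packed neighbor count is dev_nbor[ii+nbor_pitch], and packed
// neighbor k sits at dev_nbor[ii+(2+k)*nbor_pitch]. Consecutive threads
// therefore touch consecutive addresses, giving coalesced access.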
// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
__kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
__global int *dev_nbor, const int nbor_pitch,
const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high) {
int ii=THREAD_ID_X;
__local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
cutsq[ii]=cut_form[ii].x;
form[ii]=cut_form[ii].y;
}
ii+=fast_mul((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+fast_mul(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
if (form[mtype]>=form_low && form[mtype]<=form_high) {
// Compute r12
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cutsq[mtype]) {
*packed=j;
packed+=nbor_pitch;
newj++;
}
}
}
dev_nbor[ii+nbor_pitch]=newj;
}
}

309
lib/gpu/lal_gayberne.cpp Normal file
@ -0,0 +1,309 @@
/***************************************************************************
gayberne.cpp
-------------------
W. Michael Brown (ORNL)
Host code for Gay-Berne potential acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "gayberne_cl.h"
#include "gayberne_lj_cl.h"
#else
#include "gayberne_ptx.h"
#include "gayberne_lj_ptx.h"
#endif
#include "lal_gayberne.h"
#include <cassert>
using namespace LAMMPS_AL;
#define GayBerneT GayBerne<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
GayBerneT::GayBerne() : BaseEllipsoid<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
GayBerneT::~GayBerne() {
clear();
}
template <class numtyp, class acctyp>
int GayBerneT::bytes_per_atom(const int max_nbors) const {
// Delegate to the base-class estimate (an unqualified call here would
// recurse onto this same function); assumes BaseEllipsoid provides it.
return BaseEllipsoid<numtyp,acctyp>::bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int GayBerneT::init(const int ntypes, const double gamma,
const double upsilon, const double mu,
double **host_shape, double **host_well,
double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape,
int **h_form, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4,
double **host_offset, const double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ntypes,h_form,gayberne,gayberne_lj);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
_shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->block_size()>=max_shared_types) {
lj_types=max_shared_types;
_shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for copying type data
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
host_sigma,host_epsilon);
this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write,
host_cutsq,h_form);
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq,h_form);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
dev_error.alloc(1,*(this->ucl_device));
dev_error.zero();
// Allocate, cast and asynchronous memcpy of constant data
// Copy data for bonded interactions
gamma_upsilon_mu.alloc(7,*(this->ucl_device),UCL_READ_ONLY);
host_write[0]=static_cast<numtyp>(gamma);
host_write[1]=static_cast<numtyp>(upsilon);
host_write[2]=static_cast<numtyp>(mu);
host_write[3]=static_cast<numtyp>(host_special_lj[0]);
host_write[4]=static_cast<numtyp>(host_special_lj[1]);
host_write[5]=static_cast<numtyp>(host_special_lj[2]);
host_write[6]=static_cast<numtyp>(host_special_lj[3]);
ucl_copy(gamma_upsilon_mu,host_write,7,false);
lshape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
UCL_H_Vec<double> d_view;
d_view.view(host_lshape,lshape.numel(),*(this->ucl_device));
ucl_copy(lshape,d_view,false);
// Copy shape, well, sigma, epsilon, and cutsq onto GPU
// - cast if necessary
shape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
host_write[i*4]=host_shape[i][0];
host_write[i*4+1]=host_shape[i][1];
host_write[i*4+2]=host_shape[i][2];
}
UCL_H_Vec<numtyp4> view4;
view4.view((numtyp4*)host_write.begin(),shape.numel(),*(this->ucl_device));
ucl_copy(shape,view4,false);
well.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
host_write[i*4]=host_well[i][0];
host_write[i*4+1]=host_well[i][1];
host_write[i*4+2]=host_well[i][2];
}
view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device));
ucl_copy(well,view4,false);
_allocated=true;
this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+
lj1.row_bytes()+lj3.row_bytes()+gamma_upsilon_mu.row_bytes()+
lshape.row_bytes()+shape.row_bytes()+well.row_bytes();
return 0;
}
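// Example of the type padding above (values assumed): with ntypes=3 and
// max_shared_types=8, lj_types becomes 8 and the per-pair tables are
// allocated as 8x8=64 entries, so the fast shared-memory kernels can
// index them with a fixed stride.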
template <class numtyp, class acctyp>
void GayBerneT::clear() {
if (!_allocated)
return;
UCL_H_Vec<int> err_flag(1,*(this->ucl_device));
ucl_copy(err_flag,dev_error,false);
if (err_flag[0] == 2)
std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
err_flag.clear();
_allocated=false;
dev_error.clear();
lj1.clear();
lj3.clear();
sigma_epsilon.clear();
this->cut_form.clear();
shape.clear();
well.clear();
lshape.clear();
gamma_upsilon_mu.clear();
this->clear_base();
}
template <class numtyp, class acctyp>
double GayBerneT::host_memory_usage() const {
return this->host_memory_usage_base()+sizeof(GayBerneT)+
4*sizeof(numtyp);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void GayBerneT::loop(const bool _eflag, const bool _vflag) {
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=0, NGX;
int stride=this->nbor->nbor_pitch();
int ainum=this->ans->inum();
if (this->_multiple_forms) {
this->time_nbor1.start();
if (this->_last_ellipse>0) {
// ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
GX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/
(BX/this->_threads_per_atom)));
NGX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/BX));
this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE,
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
&stride, &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
&this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
&this->_threads_per_atom);
this->time_ellipsoid.stop();
if (this->_last_ellipse==this->ans->inum()) {
this->time_nbor2.start();
this->time_nbor2.stop();
this->time_ellipsoid2.start();
this->time_ellipsoid2.stop();
this->time_lj.start();
this->time_lj.stop();
return;
}
// ------------ SPHERE_ELLIPSE ---------------
this->time_nbor2.start();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
this->_last_ellipse)/
(BX/this->_threads_per_atom)));
NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
this->_last_ellipse)/BX));
this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(),
SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor2.stop();
this->time_ellipsoid2.start();
this->k_sphere_ellipsoid.set_size(GX,BX);
this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(),
&this->well.begin(), &this->gamma_upsilon_mu.begin(),
&this->sigma_epsilon.begin(), &this->_lj_types, &this->lshape.begin(),
&this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
this->time_ellipsoid2.stop();
} else {
this->ans->dev_ans.zero();
this->ans->dev_engv.zero();
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->time_ellipsoid.stop();
this->time_nbor2.start();
this->time_nbor2.stop();
this->time_ellipsoid2.start();
this->time_ellipsoid2.stop();
}
// ------------ LJ ---------------
this->time_lj.start();
if (this->_last_ellipse<this->ans->inum()) {
if (this->_shared_types) {
this->k_lj_fast.set_size(GX,BX);
this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
&this->lj3.begin(), &this->gamma_upsilon_mu.begin(), &stride,
&this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(),
&eflag, &vflag, &this->_last_ellipse, &ainum,
&this->_threads_per_atom);
} else {
this->k_lj.set_size(GX,BX);
this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
&this->lj3.begin(), &this->_lj_types, &this->gamma_upsilon_mu.begin(),
&stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
}
}
this->time_lj.stop();
} else {
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
this->time_nbor1.start();
this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE,
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
&stride, &this->ans->dev_ans.begin(), &ainum,
&this->ans->dev_engv.begin(), &this->dev_error.begin(),
&eflag, &vflag, &ainum, &this->_threads_per_atom);
this->time_ellipsoid.stop();
}
}
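// Grid-size example for the launches above (values assumed): with block
// size BX=128, _threads_per_atom=4, and inum=1000 particles, each block
// covers BX/4=32 particles, so GX=ceil(1000/32)=32 blocks; the packing
// kernel uses one thread per particle, so NGX=ceil(1000/128)=8.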
template class GayBerne<PRECISION,ACC_PRECISION>;

356
lib/gpu/lal_gayberne.cu Normal file
@ -0,0 +1,356 @@
// **************************************************************************
// gayberne.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for Gay-Berne potential acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_ellipsoid_extra.h"
#endif
ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape,
numtyp ans[9])
{
numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
den = ucl_recip(den);
ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, const int astride,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int inum,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *nbor_end;
int i, numj, n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
numtyp4 ishape=shape[itype];
{
numtyp t[9];
gpu_quat_to_mat_trans(q,i,a1);
gpu_diag_times3(ishape,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_diag_times3(well[itype],a1,t);
gpu_transpose_times3(a1,t,b1);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = ucl_rsqrt(ir);
numtyp r = ucl_recip(ir);
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
numtyp g2[9];
{
gpu_diag_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
gpu_plus3(g1,g2,g12);
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- replace r12 with r12 hat
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
// -- kappa is now kappa/r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12,kappa);
sigma12 = ucl_rsqrt((numtyp)0.5*sigma12);
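// sigma12 = [0.5 * r12hat . G12^-1 . r12hat]^(-1/2) is the Gay-Berne
// distance of closest approach; h12 below is the surface separation r - sigma12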
h12 = r-sigma12;
// -- kappa restored to its full (unscaled) value
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=fast_mul(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
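// shifted-distance LJ core: varrho = sigma/(h12 + gamma*sigma), where gum[0] holds gamma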
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
}
// torque for particle 1
{
numtyp tempv[3], tempv2[3];
tempv[0] = -uslj_rsq*kappa[0];
tempv[1] = -uslj_rsq*kappa[1];
tempv[2] = -uslj_rsq*kappa[2];
gpu_row_times3(kappa,g1,tempv2);
gpu_cross3(tempv,tempv2,tUr);
}
}
}
// Compute eta
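// eta = [2*lshape_i*lshape_j / det(G12)]^upsilon; gum[1] holds upsilon and
// lshape is the precomputed per-type shape factor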
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = ucl_powr(eta/det_g12,gum[1]);
}
// Compute teta
numtyp temp[9], tempv[3], tempv2[3];
compute_eta_torque(g12,a1,ishape,temp);
numtyp temp1 = -eta*gum[1];
tempv[0] = temp1*temp[0];
tempv[1] = temp1*temp[1];
tempv[2] = temp1*temp[2];
gpu_cross3(a1,tempv,tempv2);
teta[0] = tempv2[0];
teta[1] = tempv2[1];
teta[2] = tempv2[2];
tempv[0] = temp1*temp[3];
tempv[1] = temp1*temp[4];
tempv[2] = temp1*temp[5];
gpu_cross3(a1+3,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
tempv[0] = temp1*temp[6];
tempv[1] = temp1*temp[7];
tempv[2] = temp1*temp[8];
gpu_cross3(a1+6,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
}
numtyp chi, dchi[3], tchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b2[9], b12[9];
{
gpu_diag_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
gpu_plus3(b1,b2,b12);
}
// compute chi_12
r12[0]*=r;
r12[1]*=r;
r12[2]*=r;
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
chi = gpu_dot3(r12,iota);
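// chi = [2 * r12hat . B12^-1 . r12hat]^mu; gum[2] holds mu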
chi = ucl_powr(chi*(numtyp)2.0,gum[2]);
// -- iota restored to its full (unscaled) value
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*ucl_powr(chi,(gum[2]-(numtyp)1.0)/
gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
// compute t_chi
numtyp tempv[3];
gpu_row_times3(iota,b1,tempv);
gpu_cross3(tempv,iota,tchi);
temp1 = (numtyp)-4.0*ir*ir;
tchi[0] *= temp1;
tchi[1] *= temp1;
tchi[2] *= temp1;
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-r;
r12[1]*=-r;
r12[2]*=-r;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
// Torque on 1
temp1 = -u_r*eta*factor_lj;
temp2 = -u_r*chi*factor_lj;
numtyp temp3 = -chi*eta*factor_lj;
tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
} // for nbor
store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

lib/gpu/lal_gayberne.h Normal file
@ -0,0 +1,94 @@
/***************************************************************************
gayberne.h
-------------------
W. Michael Brown (ORNL)
Host code for Gay-Berne potential acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_GAYBERNE_H
#define LAL_GAYBERNE_H
#include "lal_base_ellipsoid.h"
#include "mpi.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class GayBerne : public BaseEllipsoid<numtyp, acctyp> {
public:
GayBerne();
~GayBerne();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \return 0 if successful; a negative error code signals insufficient
*     memory or a device initialization problem
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **host_shape,
double **host_well, double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape, int **h_form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
const double *host_special_lj, const int nlocal, const int nall,
const int max_nbors, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
/// Device Error Flag - Set if a bad matrix inversion occurs
UCL_D_Vec<int> dev_error;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon
UCL_D_Vec<numtyp2> sigma_epsilon;
/// 0: gamma, 1: upsilon, 2: mu, 3-6: special_lj[0-3]
UCL_D_Vec<numtyp> gamma_upsilon_mu;
/// If atom type constants fit in shared memory, use fast kernels
bool _shared_types;
int _lj_types;
// --------------------------- ATOM DATA --------------------------
/// Aspherical Const Data for Atoms
UCL_D_Vec<numtyp4> shape, well;
/// Aspherical Const Data for Atoms
UCL_D_Vec<numtyp> lshape;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

lib/gpu/lal_gayberne_ext.cpp Normal file
@ -0,0 +1,141 @@
/***************************************************************************
gayberne_ext.cpp
-------------------
W. Michael Brown
LAMMPS Wrappers for Gay-Berne Acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_gayberne.h"
using namespace std;
using namespace LAMMPS_AL;
static GayBerne<PRECISION,ACC_PRECISION> GBMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int gb_gpu_init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
GBMF.clear();
gpu_mode=GBMF.device->gpu_mode();
double gpu_split=GBMF.device->particle_split();
int first_gpu=GBMF.device->first_device();
int last_gpu=GBMF.device->last_device();
int world_me=GBMF.device->world_me();
int gpu_rank=GBMF.device->gpu_rank();
int procs_per_gpu=GBMF.device->procs_per_gpu();
GBMF.device->init_message(screen,"gayberne",first_gpu,last_gpu);
bool message=false;
if (GBMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
sigma, epsilon, host_lshape, form, host_lj1,
host_lj2, host_lj3, host_lj4, offset, special_lj,
inum, nall, max_nbors, maxspecial, cell_size, gpu_split,
screen);
GBMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma,
epsilon, host_lshape, form, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, inum, nall,
max_nbors, maxspecial, cell_size, gpu_split, screen);
GBMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
GBMF.estimate_gpu_overhead();
return init_ok;
}
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
void gb_gpu_clear() {
GBMF.clear();
}
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double **host_quat);
int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial, int **special,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, int **ilist,
int **jnum, const double cpu_time, bool &success,
double **host_quat) {
return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
tag, nspecial, special, eflag, vflag, eatom, vatom,
host_start, ilist, jnum, cpu_time, success, host_quat);
}
int * gb_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double **host_quat) {
return GBMF.compute(ago, inum_full, nall, host_x, host_type, ilist,
numj, firstneigh, eflag, vflag, eatom, vatom, host_start,
cpu_time, success, host_quat);
}
// ---------------------------------------------------------------------------
// Return memory usage
// ---------------------------------------------------------------------------
double gb_gpu_bytes() {
return GBMF.host_memory_usage();
}

lib/gpu/lal_gayberne_lj.cu Normal file
@ -0,0 +1,408 @@
// **************************************************************************
// gayberne_lj.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for Gay-Berne - Lennard-Jones potential acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_ellipsoid_extra.h"
#endif
__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
__global numtyp4* shape,__global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag,const int start, const int inum,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
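// shift by 'start' so this kernel covers only its slice of the interaction list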
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *nbor_end;
int i, numj, n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp oner=shape[itype].x;
numtyp one_well=well[itype].x;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = ucl_rsqrt(ir);
numtyp r = ucl_recip(ir);
numtyp r12hat[3];
r12hat[0]=r12[0]*ir;
r12hat[1]=r12[1]*ir;
r12hat[2]=r12[2]*ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], eta;
{ // Compute U_r, dUr, and eta (no torques for the sphere)
// Compute g12
numtyp g12[9];
{
{
numtyp g2[9];
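// for a sphere, G1 reduces to oner*I, so G12 = A2^T*S2*A2 + oner*I
// and only the diagonal needs the shift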
gpu_diag_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
g12[0]=g2[0]+oner;
g12[4]=g2[4]+oner;
g12[8]=g2[8]+oner;
g12[1]=g2[1];
g12[2]=g2[2];
g12[3]=g2[3];
g12[5]=g2[5];
g12[6]=g2[6];
g12[7]=g2[7];
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- kappa is now kappa/r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12hat,kappa);
sigma12 = ucl_rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa restored to its full (unscaled) value
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=fast_mul(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12hat);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = ucl_powr(eta/det_g12,gum[1]);
}
}
numtyp chi, dchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b12[9];
{
numtyp b2[9];
gpu_diag_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
b12[0]=b2[0]+one_well;
b12[4]=b2[4]+one_well;
b12[8]=b2[8]+one_well;
b12[1]=b2[1];
b12[2]=b2[2];
b12[3]=b2[3];
b12[5]=b2[5];
b12[6]=b2[6];
b12[7]=b2[7];
}
// compute chi_12
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
chi = gpu_dot3(r12hat,iota);
chi = ucl_powr(chi*(numtyp)2.0,gum[2]);
// -- iota restored to its full (unscaled) value
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12hat);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*ucl_powr(chi,(gum[2]-(numtyp)1.0)/
gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-1;
r12[1]*=-1;
r12[2]*=-1;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
__local numtyp sp_lj[4];
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (tid<4)
sp_lj[tid]=gum[tid+3];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
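// barrier: the cooperative shared-memory staging of lj1/lj3/sp_lj above
// must complete before any thread reads the tables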
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

lib/gpu/lal_lj.cpp Normal file
@ -0,0 +1,154 @@
/***************************************************************************
lj.cpp
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the lj/cut pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "lj_cl.h"
#else
#include "lj_ptx.h"
#endif
#include "lal_lj.h"
#include <cassert>
using namespace LAMMPS_AL;
#define LJT LJ<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
LJT::LJ() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
LJT::~LJ() {
clear();
}
template <class numtyp, class acctyp>
int LJT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj);
if (success!=0)
return success;
// If atom type constants fit in shared memory, use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJ<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and virial terms
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJ<PRECISION,ACC_PRECISION>;

lib/gpu/lal_lj.cu Normal file
@ -0,0 +1,188 @@
// **************************************************************************
// lj.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for acceleration of the lj/cut pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
#endif
#endif
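// fetch_pos reads positions through the texture cache on single/mixed
// precision CUDA builds; for _DOUBLE_DOUBLE builds fetch_pos is presumably
// mapped to a direct x_[i] load in the shared headers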
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].z) {
r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z) {
r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

lib/gpu/lal_lj.h Normal file
@ -0,0 +1,79 @@
/***************************************************************************
lj.h
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the lj/cut pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_LJ_H
#define LAL_LJ_H
#include "lal_base_atomic.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class LJ : public BaseAtomic<numtyp, acctyp> {
public:
LJ();
~LJ();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif
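For reference, a minimal standalone sketch of how the lj1/lj3 slots documented above are typically filled, assuming the standard LAMMPS lj/cut coefficient definitions (lj1 = 48*eps*sigma^12, lj2 = 24*eps*sigma^6, lj3 = 4*eps*sigma^12, lj4 = 4*eps*sigma^6); illustrative code, not part of this commit:
#include <stdio.h>
#include <math.h>
/* Hypothetical host-side sketch: pack lj/cut coefficients the way the
 * kernels above consume them (lj1.x, lj1.y, lj1.z = cutsq; lj3.x, lj3.y,
 * lj3.z = energy offset).  The coefficient formulas are the standard
 * LAMMPS lj/cut definitions, assumed rather than taken from this commit. */
int main(void) {
  double eps = 1.0, sigma = 1.0, cut = 2.5;
  double lj1x = 48.0*eps*pow(sigma,12.0);  /* force coefficient, r^-12 part */
  double lj1y = 24.0*eps*pow(sigma,6.0);   /* force coefficient, r^-6 part */
  double lj3x = 4.0*eps*pow(sigma,12.0);   /* energy coefficient, r^-12 part */
  double lj3y = 4.0*eps*pow(sigma,6.0);    /* energy coefficient, r^-6 part */
  double cutsq = cut*cut;
  double rsq = 1.2*1.2;                    /* sample pair distance squared */
  if (rsq < cutsq) {                       /* same cutoff test as kernel_pair */
    double r2inv = 1.0/rsq;
    double r6inv = r2inv*r2inv*r2inv;
    double fpair = r2inv*r6inv*(lj1x*r6inv - lj1y);  /* F/r, as in the kernel */
    double e = r6inv*(lj3x*r6inv - lj3y);            /* pair energy */
    printf("F/r = %g  E = %g\n", fpair, e);
  }
  return 0;
}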

lib/gpu/lal_lj96.cpp Normal file
@ -0,0 +1,154 @@
/***************************************************************************
lj96.cpp
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the lj96/cut pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "lj96_cl.h"
#else
#include "lj96_ptx.h"
#endif
#include "lal_lj96.h"
#include <cassert>
using namespace LAMMPS_AL;
#define LJ96T LJ96<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
LJ96T::LJ96() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
LJ96T::~LJ96() {
clear();
}
template <class numtyp, class acctyp>
int LJ96T::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJ96T::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj96);
if (success!=0)
return success;
// If atom type constants fit in shared memory, use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJ96T::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJ96T::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJ96<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and virial terms
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJ96T::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJ96<PRECISION,ACC_PRECISION>;

lib/gpu/lal_lj96.cu Normal file
@ -0,0 +1,190 @@
// **************************************************************************
// lj96.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for acceleration of the lj96/cut pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
#endif
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].z) {
r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp r3inv = ucl_sqrt(r6inv);
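// lj96 9-6 form: the extra r3inv factor turns the 12-6 expression into
// F/r = r^-2*(lj1*r^-9 - lj2*r^-6)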
numtyp force = r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z) {
r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp r3inv = ucl_sqrt(r6inv);
numtyp force = r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

lib/gpu/lal_lj96.h Normal file
@ -0,0 +1,79 @@
/***************************************************************************
lj96.h
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the lj96/cut pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_LJ96_H
#define LAL_LJ96_H
#include "lal_base_atomic.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class LJ96 : public BaseAtomic<numtyp, acctyp> {
public:
LJ96();
~LJ96();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

lib/gpu/lal_lj96_ext.cpp Normal file
@ -0,0 +1,120 @@
/***************************************************************************
lj96_ext.cpp
-------------------
W. Michael Brown (ORNL)
Functions for LAMMPS access to lj96/cut acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_lj96.h"
using namespace std;
using namespace LAMMPS_AL;
static LJ96<PRECISION,ACC_PRECISION> LJ96MF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
LJ96MF.clear();
gpu_mode=LJ96MF.device->gpu_mode();
double gpu_split=LJ96MF.device->particle_split();
int first_gpu=LJ96MF.device->first_device();
int last_gpu=LJ96MF.device->last_device();
int world_me=LJ96MF.device->world_me();
int gpu_rank=LJ96MF.device->gpu_rank();
int procs_per_gpu=LJ96MF.device->procs_per_gpu();
LJ96MF.device->init_message(screen,"lj96/cut",first_gpu,last_gpu);
bool message=false;
if (LJ96MF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
LJ96MF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
LJ96MF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJ96MF.estimate_gpu_overhead();
return init_ok;
}
void lj96_gpu_clear() {
LJ96MF.clear();
}
int** lj96_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void lj96_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
LJ96MF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double lj96_gpu_bytes() {
return LJ96MF.host_memory_usage();
}

lib/gpu/lal_lj_class2_long.cpp Normal file
@ -0,0 +1,168 @@
/***************************************************************************
lj_class2_long.cpp
-------------------
W. Michael Brown
Host code for COMPASS LJ long potential acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Mon May 16 2011
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "lj_class2_long_cl.h"
#else
#include "lj_class2_long_ptx.h"
#endif
#include "lal_lj_class2_long.h"
#include <cassert>
using namespace LAMMPS_AL;
#define LJClass2LongT LJClass2Long<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
LJClass2LongT::LJClass2Long() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
LJClass2LongT::~LJClass2Long() {
clear();
}
template <class numtyp, class acctyp>
int LJClass2LongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJClass2LongT::init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_class2_long);
if (success!=0)
return success;
// If atom type constants fit in shared memory, use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq, host_cut_ljsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJClass2LongT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJClass2LongT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJClass2Long<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and virial terms
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJClass2LongT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJClass2Long<PRECISION,ACC_PRECISION>;

lib/gpu/lal_lj_class2_long.cu Normal file
@ -0,0 +1,252 @@
// **************************************************************************
// lj_class2_long.cu
// -------------------
// W. Michael Brown
//
// Device code for COMPASS LJ long acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin : Mon May 16 2011
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
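// special_coul is stored as 1 - factor so the single expression
// prefactor*(erfc - factor_coul) both scales and excludes bonded neighbors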
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].z) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc;
if (rsq < lj1[mtype].w) {
numtyp rinv=ucl_rsqrt(rsq);
r3inv=r2inv*rinv;
r6inv = r3inv*r3inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = ucl_rsqrt(r2inv);
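// real-space Ewald term: erfc(g_ewald*r) via the Abramowitz & Stegun
// 7.1.26 polynomial fit; EWALD_P and A1..A5 are assumed to be the
// standard constants defined in the shared headers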
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].w) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].z) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc;
if (rsq < lj1[mtype].w) {
numtyp rinv=ucl_rsqrt(rsq);
r3inv=r2inv*rinv;
r6inv = r3inv*r3inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].w) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
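Note: the class2/COMPASS van der Waals term above is a 9-6 potential rather than the usual 12-6, which is why these kernels carry an extra r3inv factor. Assuming the standard LAMMPS precomputation (lj1.x = 18*eps*sigma^9, lj1.y = 18*eps*sigma^6, lj3.x = 2*eps*sigma^9, lj3.y = 3*eps*sigma^6), the kernel evaluates

\[ E(r) = \epsilon\left[2\left(\frac{\sigma}{r}\right)^{9} - 3\left(\frac{\sigma}{r}\right)^{6}\right], \qquad -\frac{dE}{dr} = \frac{18\,\epsilon}{r}\left[\left(\frac{\sigma}{r}\right)^{9} - \left(\frac{\sigma}{r}\right)^{6}\right], \]

so force_lj*r2inv times the displacement (delx, dely, delz) reproduces -dE/dr along the unit bond vector, and the eflag branch accumulates E(r) minus the tabulated offset lj3.z.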

84
lib/gpu/lal_lj_class2_long.h Normal file

@ -0,0 +1,84 @@
/***************************************************************************
lj_class2_long.h
-------------------
W. Michael Brown
Host code for COMPASS LJ long potential acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Mon May 16 2011
email : brownw@ornl.gov
***************************************************************************/
#ifndef LJ_CLASS2_LONG_H
#define LJ_CLASS2_LONG_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class LJClass2Long : public BaseCharge<numtyp, acctyp> {
public:
LJClass2Long();
~LJClass2Long();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = cutsq_vdw
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

129
lib/gpu/lal_lj_class2_long_ext.cpp Normal file

@ -0,0 +1,129 @@
/***************************************************************************
lj_class2_long_ext.cpp
-------------------
W. Michael Brown
LAMMPS wrappers for COMPASS LJ long acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Mon May 16 2011
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_lj_class2_long.h"
using namespace std;
using namespace LAMMPS_AL;
static LJClass2Long<PRECISION,ACC_PRECISION> C2CLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
C2CLMF.clear();
gpu_mode=C2CLMF.device->gpu_mode();
double gpu_split=C2CLMF.device->particle_split();
int first_gpu=C2CLMF.device->first_device();
int last_gpu=C2CLMF.device->last_device();
int world_me=C2CLMF.device->world_me();
int gpu_rank=C2CLMF.device->gpu_rank();
int procs_per_gpu=C2CLMF.device->procs_per_gpu();
C2CLMF.device->init_message(screen,"lj/class2/coul/long",first_gpu,last_gpu);
bool message=false;
if (C2CLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
C2CLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
C2CLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
C2CLMF.estimate_gpu_overhead();
return init_ok;
}
void c2cl_gpu_clear() {
C2CLMF.clear();
}
int** c2cl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return C2CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void c2cl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
C2CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double c2cl_gpu_bytes() {
return C2CLMF.host_memory_usage();
}
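Note: these wrappers are the entire public surface LAMMPS uses for this style. Below is a minimal host-side driver sketch, not part of the commit: toy values throughout, it assumes the library is built and linked, and (as in the LAMMPS GPU pair styles) that ntypes is passed as atom->ntypes+1 so 1-based atom types index the matrices directly.

#include <cstdio>

// Prototypes as defined in lal_lj_class2_long_ext.cpp.
int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                  double **host_lj2, double **host_lj3, double **host_lj4,
                  double **offset, double *special_lj, const int inum,
                  const int nall, const int max_nbors, const int maxspecial,
                  const double cell_size, int &gpu_mode, FILE *screen,
                  double **host_cut_ljsq, double host_cut_coulsq,
                  double *host_special_coul, const double qqrd2e,
                  const double g_ewald);
void c2cl_gpu_clear();

// Allocate an n x n matrix filled with v (deliberately leaked; sketch only).
static double **matrix(int n, double v) {
  double **m = new double*[n];
  for (int i = 0; i < n; i++) {
    m[i] = new double[n];
    for (int j = 0; j < n; j++) m[i][j] = v;
  }
  return m;
}

int main() {
  const int ntypes = 2;                      // one real atom type, row 0 unused
  double **cutsq  = matrix(ntypes, 100.0);   // global cutoff^2 (toy)
  double **lj1    = matrix(ntypes, 18.0);    // 18*eps*sigma^9 (toy)
  double **lj2    = matrix(ntypes, 18.0);    // 18*eps*sigma^6 (toy)
  double **lj3    = matrix(ntypes, 2.0);     // 2*eps*sigma^9  (toy)
  double **lj4    = matrix(ntypes, 3.0);     // 3*eps*sigma^6  (toy)
  double **offset = matrix(ntypes, 0.0);
  double **cut_lj = matrix(ntypes, 100.0);   // vdW cutoff^2 (toy)
  double sp_lj[4]   = {1.0, 0.0, 0.0, 0.5};
  double sp_coul[4] = {1.0, 0.0, 0.0, 0.5};
  int gpu_mode;
  int ok = c2cl_gpu_init(ntypes, cutsq, lj1, lj2, lj3, lj4, offset, sp_lj,
                         /*inum=*/1000, /*nall=*/1500, /*max_nbors=*/300,
                         /*maxspecial=*/8, /*cell_size=*/12.0, gpu_mode,
                         stdout, cut_lj, /*cut_coulsq=*/100.0, sp_coul,
                         /*qqrd2e=*/332.06371, /*g_ewald=*/0.3);
  if (ok == 0) {
    // ... per-timestep c2cl_gpu_compute_n()/c2cl_gpu_compute() calls ...
    c2cl_gpu_clear();
  }
  return ok;
}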

169
lib/gpu/lal_lj_coul.cpp Normal file

@ -0,0 +1,169 @@
/***************************************************************************
lj_coul.cpp
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the lj/cut/coul/cut pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "lj_coul_cl.h"
#else
#include "lj_coul_ptx.h"
#endif
#include "lal_lj_coul.h"
#include <cassert>
using namespace LAMMPS_AL;
#define LJCoulT LJCoul<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
LJCoulT::LJCoul() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
LJCoulT::~LJCoul() {
clear();
}
template <class numtyp, class acctyp>
int LJCoulT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJCoulT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_coul);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cut_ljsq, host_cut_coulsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_qqrd2e=qqrd2e;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJCoulT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
cutsq.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJCoulT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJCoul<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJCoulT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch,
&this->atom->dev_q.begin(), &cutsq.begin(),
&_qqrd2e, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(),
&cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJCoul<PRECISION,ACC_PRECISION>;
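Note: the kernels recover a type pair's coefficients with mtype = itype*lj_types + jtype (or itype*MAX_SHARED_TYPES + jtype in the fast path). The self-contained sketch below shows our reading of the type_pack4 flattening used here, x = lj1, y = lj2, z = cut_ljsq, w = cut_coulsq per pair; the real routine also pads to lj_types and uploads to the device, and numtyp is replaced by double for simplicity.

#include <cstdio>
#include <vector>

struct numtyp4 { double x, y, z, w; };

// Flatten four per-type-pair host matrices into one row-major
// lj_types*lj_types array of 4-vectors, as the kernels expect.
static std::vector<numtyp4> pack4(int ntypes, int lj_types, double **a,
                                  double **b, double **c, double **d) {
  std::vector<numtyp4> out(lj_types*lj_types, numtyp4{0, 0, 0, 0});
  for (int i = 0; i < ntypes; i++)
    for (int j = 0; j < ntypes; j++)
      out[i*lj_types + j] = numtyp4{a[i][j], b[i][j], c[i][j], d[i][j]};
  return out;
}

int main() {
  const int ntypes = 3;          // atom->ntypes+1: types 1,2 real, row 0 unused
  const int lj_types = ntypes;   // non-shared path: lj_types == ntypes
  static double m1[3][3], m2[3][3], m3[3][3], m4[3][3];
  double *a[3] = {m1[0], m1[1], m1[2]}, *b[3] = {m2[0], m2[1], m2[2]},
         *c[3] = {m3[0], m3[1], m3[2]}, *d[3] = {m4[0], m4[1], m4[2]};
  m1[1][2] = 48.0; m2[1][2] = 24.0;   // lj1, lj2 for the (1,2) pair (toy)
  m3[1][2] = 9.0;  m4[1][2] = 100.0;  // cut_ljsq, cut_coulsq (toy)
  const int itype = 1, jtype = 2, mtype = itype*lj_types + jtype;
  std::vector<numtyp4> lj1 = pack4(ntypes, lj_types, a, b, c, d);
  std::printf("lj1[%d] = {%g, %g, %g, %g}\n", mtype,
              lj1[mtype].x, lj1[mtype].y, lj1[mtype].z, lj1[mtype].w);
  return 0;
}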

236
lib/gpu/lal_lj_coul.cu Normal file

@ -0,0 +1,236 @@
// **************************************************************************
// lj_coul.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for acceleration of the lj/cut/coul/cut pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp *cutsq, const numtyp qqrd2e,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq[mtype]) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w)
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += forcecoul;
if (rsq < lj1[mtype].z) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp *_cutsq, const numtyp qqrd2e,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
cutsq[tid]=_cutsq[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[mtype]) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w)
forcecoul = qqrd2e*qtmp*fetch_q(j,q_)*ucl_rsqrt(rsq)*factor_coul;
else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += forcecoul;
if (rsq < lj1[mtype].z) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
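Note: the special-bond Coulomb handling here differs deliberately from the coul/long kernels elsewhere in this commit. With a cut Coulomb interaction the pair term is simply scaled by the special factor f = sp_lj[sbmask(j)+4],

\[ E_{ij} = f\,\frac{\mathrm{qqrd2e}\;q_i q_j}{r}, \]

so factor_coul multiplies forcecoul directly. In the long-range kernels the k-space sum already contains the full 1/r for every pair, so they instead load factor_coul = 1 - f and subtract it inside the prefactor, removing only the excluded fraction from the real-space term.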

85
lib/gpu/lal_lj_coul.h Normal file

@ -0,0 +1,85 @@
/***************************************************************************
lj_coul.h
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the lj/cut/coul/cut pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_LJ_COUL_H
#define LAL_LJ_COUL_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class LJCoul : public BaseCharge<numtyp, acctyp> {
public:
LJCoul();
~LJCoul();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// cutsq
UCL_D_Vec<numtyp> cutsq;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

128
lib/gpu/lal_lj_coul_ext.cpp Normal file

@ -0,0 +1,128 @@
/***************************************************************************
lj_coul_ext.cpp
-------------------
W. Michael Brown (ORNL)
Functions for LAMMPS access to lj/cut/coul/cut acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_lj_coul.h"
using namespace std;
using namespace LAMMPS_AL;
static LJCoul<PRECISION,ACC_PRECISION> LJCMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
LJCMF.clear();
gpu_mode=LJCMF.device->gpu_mode();
double gpu_split=LJCMF.device->particle_split();
int first_gpu=LJCMF.device->first_device();
int last_gpu=LJCMF.device->last_device();
int world_me=LJCMF.device->world_me();
int gpu_rank=LJCMF.device->gpu_rank();
int procs_per_gpu=LJCMF.device->procs_per_gpu();
LJCMF.device->init_message(screen,"lj/cut/coul/cut",first_gpu,last_gpu);
bool message=false;
if (LJCMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
LJCMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
LJCMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJCMF.estimate_gpu_overhead();
return init_ok;
}
void ljc_gpu_clear() {
LJCMF.clear();
}
int** ljc_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJCMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void ljc_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
LJCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}
double ljc_gpu_bytes() {
return LJCMF.host_memory_usage();
}

167
lib/gpu/lal_lj_coul_long.cpp Normal file

@ -0,0 +1,167 @@
/***************************************************************************
lj_coul_long.cpp
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the lj/cut/coul/long pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "lj_coul_long_cl.h"
#else
#include "lj_coul_long_ptx.h"
#endif
#include "lal_lj_coul_long.h"
#include <cassert>
using namespace LAMMPS_AL;
#define LJCoulLongT LJCoulLong<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
LJCoulLongT::LJCoulLong() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
LJCoulLongT::~LJCoulLong() {
clear();
}
template <class numtyp, class acctyp>
int LJCoulLongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJCoulLongT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_coul_long);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq, host_cut_ljsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJCoulLongT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJCoulLongT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJCoulLong<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJCoulLongT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJCoulLong<PRECISION,ACC_PRECISION>;
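Note: a worked instance of the launch arithmetic in loop(), with illustrative numbers. Each block of BX threads covers BX/t_per_atom atoms, so with BX = 128, _threads_per_atom = 4, and inum = 10000,

\[ GX = \left\lceil \frac{10000}{128/4} \right\rceil = \lceil 312.5 \rceil = 313 \]

thread blocks are launched, the last one partially idle.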

248
lib/gpu/lal_lj_coul_long.cu Normal file

@ -0,0 +1,248 @@
// **************************************************************************
// lj_coul_long.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for acceleration of the lj/cut/coul/long pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].z) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
if (rsq < lj1[mtype].w) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].w) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].z) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
if (rsq < lj1[mtype].w) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].w) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
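Note: the polynomial in both kernels is the Abramowitz–Stegun 7.1.26 rational fit to the complementary error function. With t = 1/(1 + p x),

\[ \mathrm{erfc}(x) \approx t\left(a_1 + t\left(a_2 + t\left(a_3 + t\left(a_4 + t\,a_5\right)\right)\right)\right) e^{-x^{2}}, \]

where p and a1..a5 correspond to EWALD_P and A1..A5 (constants defined elsewhere in the library) and EWALD_F = 2/sqrt(pi). The real-space Ewald force then follows from differentiating erfc(g r)/r:

\[ -\frac{d}{dr}\left[\frac{\mathrm{erfc}(gr)}{r}\right] = \frac{1}{r^{2}}\left[\mathrm{erfc}(gr) + \frac{2gr}{\sqrt{\pi}}\,e^{-g^{2}r^{2}}\right], \]

which is exactly forcecoul*r2inv with forcecoul = prefactor*(_erfc + EWALD_F*grij*expm2) in the unscaled case.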

83
lib/gpu/lal_lj_coul_long.h Normal file

@ -0,0 +1,83 @@
/***************************************************************************
lj_coul_long.h
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the lj/cut/coul/long pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_LJ_COUL_LONG_H
#define LAL_LJ_COUL_LONG_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class LJCoulLong : public BaseCharge<numtyp, acctyp> {
public:
LJCoulLong();
~LJCoulLong();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = cutsq_vdw
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

129
lib/gpu/lal_lj_coul_long_ext.cpp Normal file

@ -0,0 +1,129 @@
/***************************************************************************
lj_coul_long_ext.cpp
-------------------
W. Michael Brown (ORNL)
Functions for LAMMPS access to lj/cut/coul/long acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_lj_coul_long.h"
using namespace std;
using namespace LAMMPS_AL;
static LJCoulLong<PRECISION,ACC_PRECISION> LJCLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
LJCLMF.clear();
gpu_mode=LJCLMF.device->gpu_mode();
double gpu_split=LJCLMF.device->particle_split();
int first_gpu=LJCLMF.device->first_device();
int last_gpu=LJCLMF.device->last_device();
int world_me=LJCLMF.device->world_me();
int gpu_rank=LJCLMF.device->gpu_rank();
int procs_per_gpu=LJCLMF.device->procs_per_gpu();
LJCLMF.device->init_message(screen,"lj/cut/coul/long",first_gpu,last_gpu);
bool message=false;
if (LJCLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
LJCLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
LJCLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJCLMF.estimate_gpu_overhead();
return init_ok;
}
void ljcl_gpu_clear() {
LJCLMF.clear();
}
int** ljcl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void ljcl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
LJCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double ljcl_gpu_bytes() {
return LJCLMF.host_memory_usage();
}

154
lib/gpu/lal_lj_expand.cpp Normal file

@ -0,0 +1,154 @@
/***************************************************************************
lj_expand.cpp
-------------------
Inderaj Bains (NVIDIA)
Class for acceleration of the lj/expand pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : ibains@nvidia.com
***************************************************************************/
#ifdef USE_OPENCL
#include "lj_expand_cl.h"
#else
#include "lj_expand_ptx.h"
#endif
#include "lal_lj_expand.h"
#include <cassert>
using namespace LAMMPS_AL;
#define LJExpandT LJExpand<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
LJExpandT::LJExpand() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
LJExpandT::~LJExpand() {
clear();
}
template <class numtyp, class acctyp>
int LJExpandT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJExpandT::init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4,
double **host_offset, double **host_shift,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_expand);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq, host_shift);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJExpandT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJExpandT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJExpand<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJExpandT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJExpand<PRECISION,ACC_PRECISION>;

195
lib/gpu/lal_lj_expand.cu Normal file

@ -0,0 +1,195 @@
// **************************************************************************
// lj_expand.cu
// -------------------
// Inderaj Bains (NVIDIA)
//
// Device code for acceleration of the lj/expand pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : ibains@nvidia.com
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
#endif
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].z) {
numtyp r = ucl_sqrt(r2inv);
numtyp rshift = r - lj1[mtype].w;
numtyp rshiftsq = rshift*rshift;
r2inv = ucl_recip(rshiftsq);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj/rshift/r;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z) {
numtyp r = ucl_sqrt(r2inv);
numtyp rshift = r - lj1[mtype].w;
numtyp rshiftsq = rshift*rshift;
r2inv = ucl_recip(rshiftsq);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj/rshift/r;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
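Note: lj/expand evaluates the 12-6 potential at the shifted distance r_s = r - delta, with delta stored in lj1.w. Assuming the standard LAMMPS coefficients (lj1.x = 48*eps*sigma^12, lj1.y = 24*eps*sigma^6),

\[ E(r) = 4\epsilon\left[\left(\frac{\sigma}{r_s}\right)^{12} - \left(\frac{\sigma}{r_s}\right)^{6}\right], \qquad -\frac{dE}{dr} = \frac{1}{r_s}\,r_s^{-6}\left(\frac{48\,\epsilon\,\sigma^{12}}{r_s^{6}} - 24\,\epsilon\,\sigma^{6}\right). \]

In the kernel, r6inv*(lj1.x*r6inv - lj1.y)/rshift is therefore -dE/dr, and the final division by r turns (delx, dely, delz) into the unit bond vector, hence force *= factor_lj/rshift/r. The variable named r2inv initially holds the squared center-to-center distance and is only reassigned to 1/rshift^2 after the cutoff test.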

79
lib/gpu/lal_lj_expand.h Normal file

@ -0,0 +1,79 @@
/***************************************************************************
lj_expand.h
-------------------
Inderaj Bains (NVIDIA)
Class for acceleration of the lj/expand pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : ibains@nvidia.com
***************************************************************************/
#ifndef LAL_LJ_EXPAND_H
#define LAL_LJ_EXPAND_H
#include "lal_base_atomic.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class LJExpand : public BaseAtomic<numtyp, acctyp> {
public:
LJExpand();
~LJExpand();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double **host_shift, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = shift
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

121
lib/gpu/lal_lj_expand_ext.cpp Normal file

@ -0,0 +1,121 @@
/***************************************************************************
lj_expand_ext.cpp
-------------------
Inderaj Bains (NVIDIA)
Functions for LAMMPS access to lj/expand acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : ibains@nvidia.com
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_lj_expand.h"
using namespace std;
using namespace LAMMPS_AL;
static LJExpand<PRECISION,ACC_PRECISION> LJEMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double **shift, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
LJEMF.clear();
gpu_mode=LJEMF.device->gpu_mode();
double gpu_split=LJEMF.device->particle_split();
int first_gpu=LJEMF.device->first_device();
int last_gpu=LJEMF.device->last_device();
int world_me=LJEMF.device->world_me();
int gpu_rank=LJEMF.device->gpu_rank();
int procs_per_gpu=LJEMF.device->procs_per_gpu();
LJEMF.device->init_message(screen,"lj/expand",first_gpu,last_gpu);
bool message=false;
if (LJEMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, shift, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
LJEMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, shift, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split,screen);
LJEMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJEMF.estimate_gpu_overhead();
return init_ok;
}
void lje_gpu_clear() {
LJEMF.clear();
}
int** lje_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return LJEMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void lje_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
LJEMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double lje_gpu_bytes() {
return LJEMF.host_memory_usage();
}
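The staged start-up in lje_gpu_init above (process 0 initializes and compiles first, then the processes sharing each GPU take turns between barriers) is the same in every *_ext.cpp file in this commit. Below is a minimal MPI-only sketch of that pattern, with the library calls replaced by prints and a world barrier standing in for the per-GPU gpu_barrier(); the procs_per_gpu layout is an assumed example, not queried from a device.

#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int world_me;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_me);
  const int procs_per_gpu = 2;                 // assumed layout
  int gpu_rank = world_me % procs_per_gpu;
  if (world_me == 0)
    printf("process 0 initializes first\n");   // stands in for LJEMF.init()
  MPI_Barrier(MPI_COMM_WORLD);                 // world_barrier()
  for (int i = 0; i < procs_per_gpu; i++) {
    if (gpu_rank == i && world_me != 0)
      printf("process %d initializes\n", world_me);
    MPI_Barrier(MPI_COMM_WORLD);               // analogue of gpu_barrier()
  }
  MPI_Finalize();
  return 0;
}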

120
lib/gpu/lal_lj_ext.cpp Normal file

@ -0,0 +1,120 @@
/***************************************************************************
lj_ext.cpp
-------------------
W. Michael Brown (ORNL)
Functions for LAMMPS access to lj/cut acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_lj.h"
using namespace std;
using namespace LAMMPS_AL;
static LJ<PRECISION,ACC_PRECISION> LJLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
LJLMF.clear();
gpu_mode=LJLMF.device->gpu_mode();
double gpu_split=LJLMF.device->particle_split();
int first_gpu=LJLMF.device->first_device();
int last_gpu=LJLMF.device->last_device();
int world_me=LJLMF.device->world_me();
int gpu_rank=LJLMF.device->gpu_rank();
int procs_per_gpu=LJLMF.device->procs_per_gpu();
LJLMF.device->init_message(screen,"lj/cut",first_gpu,last_gpu);
bool message=false;
if (LJLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
LJLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
LJLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJLMF.estimate_gpu_overhead();
return init_ok;
}
void ljl_gpu_clear() {
LJLMF.clear();
}
int ** ljl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return LJLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void ljl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
LJLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double ljl_gpu_bytes() {
return LJLMF.host_memory_usage();
}

154
lib/gpu/lal_morse.cpp Normal file

@ -0,0 +1,154 @@
/***************************************************************************
morse.cpp
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the morse pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "morse_cl.h"
#else
#include "morse_ptx.h"
#endif
#include "lal_morse.h"
#include <cassert>
using namespace LAMMPS_AL;
#define MorseT Morse<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
MorseT::Morse() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
MorseT::~Morse() {
clear();
}
template <class numtyp, class acctyp>
int MorseT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int MorseT::init(const int ntypes,
double **host_cutsq, double **host_morse1,
double **host_r0, double **host_alpha,
double **host_d0, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,morse);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (types<=max_shared_types && this->_block_size>=max_shared_types) {
types=max_shared_types;
shared_types=true;
}
_types=types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(types*types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<types*types; i++)
host_write[i]=0.0;
mor1.alloc(types*types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,types,mor1,host_write,host_cutsq,host_morse1,
host_r0,host_alpha);
mor2.alloc(types*types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,types,mor2,host_write,host_d0,host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=mor1.row_bytes()+mor2.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void MorseT::clear() {
if (!_allocated)
return;
_allocated=false;
mor1.clear();
mor2.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double MorseT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(Morse<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void MorseT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &mor1.begin(),
&mor2.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &mor1.begin(), &mor2.begin(),
&_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class Morse<PRECISION,ACC_PRECISION>;
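The grid sizing in MorseT::loop above divides each thread block among t_per_atom threads per atom. A hedged arithmetic check of that launch geometry, with assumed example values:

#include <cmath>
#include <cstdio>

int main() {
  int BX = 128, t_per_atom = 4, inum = 10000;  // assumed example values
  // Each block handles BX/t_per_atom = 32 atoms, as in MorseT::loop.
  int GX = static_cast<int>(ceil(static_cast<double>(inum)/
                                 (BX/t_per_atom)));
  printf("launch %d blocks of %d threads\n", GX, BX);  // 313 blocks
  return 0;
}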

191
lib/gpu/lal_morse.cu Normal file

@ -0,0 +1,191 @@
// **************************************************************************
// morse.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for acceleration of the morse pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
texture<float4> pos_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
#endif
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
__global numtyp2* mor2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r = delx*delx+dely*dely+delz*delz; // rsq until the ucl_sqrt below
int mtype=itype*lj_types+jtype;
if (r<mor1[mtype].x) {
r=ucl_sqrt(r);
numtyp dexp=r-mor1[mtype].z;
dexp=ucl_exp(-mor1[mtype].w*dexp);
numtyp dm=dexp*dexp-dexp;
numtyp force = mor1[mtype].y*dm/r*factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=mor2[mtype].x*(dexp*dexp - 2.0*dexp) - mor2[mtype].y;
energy+=e*factor_lj;
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
__global numtyp2* mor2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 mor1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp2 mor2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
mor1[tid]=mor1_in[tid];
if (eflag>0)
mor2[tid]=mor2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r = delx*delx+dely*dely+delz*delz; // rsq until the ucl_sqrt below
if (r<mor1[mtype].x) {
r=ucl_sqrt(r);
numtyp dexp=r-mor1[mtype].z;
dexp=ucl_exp(-mor1[mtype].w*dexp);
numtyp dm=dexp*dexp-dexp;
numtyp force = mor1[mtype].y*dm/r*factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=mor2[mtype].x*(dm-dexp)-mor2[mtype].y;
energy+=e*factor_lj;
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
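A hedged CPU reference for the Morse evaluation in both kernels above. Field names follow the mor1/mor2 packing documented in lal_morse.h below, and morse1 = 2*d0*alpha is assumed per the usual LAMMPS convention; the struct and function are illustrative, not library code.

#include <cmath>

// Illustrative bundle mirroring the packed type data:
// mor1: cutsq, morse1, r0, alpha; mor2: d0, offset.
struct MorseParams { double cutsq, morse1, r0, alpha, d0, offset; };

// Returns the scalar multiplier applied to (delx,dely,delz) and fills the
// pair energy, matching the dexp/dm algebra in the kernels above.
double morse_fpair(double rsq, double factor_lj, const MorseParams &p,
                   double &energy) {
  energy = 0.0;
  if (rsq >= p.cutsq) return 0.0;
  double r = std::sqrt(rsq);
  double dexp = std::exp(-p.alpha*(r - p.r0));
  double dm = dexp*dexp - dexp;            // dm - dexp == dexp^2 - 2*dexp
  energy = factor_lj*(p.d0*(dexp*dexp - 2.0*dexp) - p.offset);
  return p.morse1*dm/r*factor_lj;
}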

79
lib/gpu/lal_morse.h Normal file

@ -0,0 +1,79 @@
/***************************************************************************
morse.h
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the morse pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_MORSE_H
#define LAL_MORSE_H
#include "lal_base_atomic.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Morse : public BaseAtomic<numtyp, acctyp> {
public:
Morse();
~Morse();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_morse1, double **host_r0, double **host_alpha,
double **host_d0, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// mor1.x = cutsq, mor1.y = morse1, mor1.z = r0, mor1.w = alpha
UCL_D_Vec<numtyp4> mor1;
/// mor2.x = d0, mor2.y = offset
UCL_D_Vec<numtyp2> mor2;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif
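A hedged sketch of the flattening implied by the mor1 layout documented above: each per-pair host table contributes one component of a float4 per (i,j), stored row-major over the padded type count so the kernels can index with mtype = itype*padded + jtype. The actual type_pack4 copy mechanics and LAMMPS 1-based type indexing are simplified away here.

#include <vector>

struct numtyp4 { float x, y, z, w; };

// 'padded' is the padded type count (MAX_SHARED_TYPES when shared_types).
std::vector<numtyp4> pack4(int ntypes, int padded,
                           float **cutsq, float **morse1,
                           float **r0, float **alpha) {
  std::vector<numtyp4> out(padded*padded, numtyp4{0, 0, 0, 0});
  for (int i = 0; i < ntypes; i++)
    for (int j = 0; j < ntypes; j++)
      out[i*padded + j] = numtyp4{cutsq[i][j], morse1[i][j],
                                  r0[i][j], alpha[i][j]};
  return out;  // device kernels read mor1[itype*padded + jtype]
}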

121
lib/gpu/lal_morse_ext.cpp Normal file

@ -0,0 +1,121 @@
/***************************************************************************
morse_ext.cpp
-------------------
W. Michael Brown (ORNL)
Functions for LAMMPS access to morse acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_morse.h"
using namespace std;
using namespace LAMMPS_AL;
static Morse<PRECISION,ACC_PRECISION> MORMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int mor_gpu_init(const int ntypes, double **cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
MORMF.clear();
gpu_mode=MORMF.device->gpu_mode();
double gpu_split=MORMF.device->particle_split();
int first_gpu=MORMF.device->first_device();
int last_gpu=MORMF.device->last_device();
int world_me=MORMF.device->world_me();
int gpu_rank=MORMF.device->gpu_rank();
int procs_per_gpu=MORMF.device->procs_per_gpu();
MORMF.device->init_message(screen,"morse",first_gpu,last_gpu);
bool message=false;
if (MORMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
MORMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
MORMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
MORMF.estimate_gpu_overhead();
return init_ok;
}
void mor_gpu_clear() {
MORMF.clear();
}
int** mor_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return MORMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void mor_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
MORMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double mor_gpu_bytes() {
return MORMF.host_memory_usage();
}

502
lib/gpu/lal_neighbor.cpp Normal file

@ -0,0 +1,502 @@
/***************************************************************************
neighbor.cpp
-------------------
W. Michael Brown (ORNL)
Peng Wang (Nvidia)
Class for handling neighbor lists
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov, penwang@nvidia.com
***************************************************************************/
#include "lal_precision.h"
#include "lal_neighbor.h"
#include "lal_device.h"
#include "math.h"
using namespace LAMMPS_AL;
int Neighbor::bytes_per_atom(const int max_nbors) const {
if (_gpu_nbor==1)
return (max_nbors+2)*sizeof(int);
else if (_gpu_nbor==2)
return (max_nbors+3)*sizeof(int);
else if (_use_packing)
return ((max_nbors+2)*2)*sizeof(int);
else
return (max_nbors+3)*sizeof(int);
}
bool Neighbor::init(NeighborShared *shared, const int inum,
const int host_inum, const int max_nbors,
const int maxspecial, UCL_Device &devi,
const int gpu_nbor, const int gpu_host,
const bool pre_cut, const int block_cell_2d,
const int block_cell_id, const int block_nbor_build,
const int threads_per_atom, const bool time_device) {
clear();
_threads_per_atom=threads_per_atom;
_block_cell_2d=block_cell_2d;
_block_cell_id=block_cell_id;
_block_nbor_build=block_nbor_build;
_shared=shared;
dev=&devi;
_gpu_nbor=gpu_nbor;
_time_device=time_device;
if (gpu_host==0)
_gpu_host=false;
else if (gpu_host==1)
_gpu_host=true;
else
// Not yet implemented
assert(0==1);
if (pre_cut || gpu_nbor==0)
_alloc_packed=true;
else
_alloc_packed=false;
bool success=true;
// Initialize timers for the selected GPU
_nbor_time_avail=false;
time_nbor.init(*dev);
time_kernel.init(*dev);
time_hybrid1.init(*dev);
time_hybrid2.init(*dev);
time_nbor.zero();
time_kernel.zero();
time_hybrid1.zero();
time_hybrid2.zero();
_max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
if (_max_atoms==0)
_max_atoms=1000;
_max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
_max_nbors=max_nbors;
_maxspecial=maxspecial;
if (gpu_nbor==0)
_maxspecial=0;
if (gpu_nbor==0)
success=success && (host_packed.alloc(2*IJ_SIZE,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
alloc(success);
if (!success)
return false;
if (_use_packing==false)
_shared->compile_kernels(devi,gpu_nbor);
return success;
}
void Neighbor::alloc(bool &success) {
dev_nbor.clear();
host_acc.clear();
int nt=_max_atoms+_max_host;
if (_use_packing==false || _gpu_nbor>0)
success=success &&
(dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev)==UCL_SUCCESS);
else
success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (host_acc.alloc(nt*2,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
_c_bytes=dev_nbor.row_bytes();
if (_alloc_packed) {
dev_packed.clear();
success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_c_bytes+=dev_packed.row_bytes();
}
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
dev_host_numj.clear();
host_ilist.clear();
host_jlist.clear();
success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc(_max_nbors*_max_host,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (dev_host_numj.alloc(_max_host,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
for (int i=0; i<nt; i++)
host_ilist[i]=i;
success=success && (host_jlist.alloc(_max_host,*dev,
UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
int *ptr=host_nbor.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=_max_nbors;
}
_c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes();
}
if (_maxspecial>0) {
dev_nspecial.clear();
dev_special.clear();
dev_special_t.clear();
int at=_max_atoms+_max_host;
success=success && (dev_nspecial.alloc(3*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (dev_special.alloc(_maxspecial*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (dev_special_t.alloc(_maxspecial*at,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_nspecial.row_bytes()+dev_special.row_bytes()+
dev_special_t.row_bytes();
}
_allocated=true;
}
void Neighbor::clear() {
_gpu_bytes=0.0;
_cell_bytes=0.0;
_c_bytes=0.0;
_bin_time=0.0;
if (_ncells>0) {
_ncells=0;
dev_cell_counts.clear();
if (_gpu_nbor==2) {
host_cell_counts.clear();
delete [] cell_iter;
}
}
if (_allocated) {
_allocated=false;
_nbor_time_avail=false;
host_packed.clear();
host_acc.clear();
dev_nbor.clear();
dev_host_nbor.clear();
dev_packed.clear();
host_nbor.clear();
dev_host_numj.clear();
host_ilist.clear();
host_jlist.clear();
dev_nspecial.clear();
dev_special.clear();
dev_special_t.clear();
time_kernel.clear();
time_nbor.clear();
time_hybrid1.clear();
time_hybrid2.clear();
}
}
double Neighbor::host_memory_usage() const {
if (_gpu_nbor>0) {
if (_gpu_host)
return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+
host_jlist.row_bytes();
else
return 0;
} else
return host_packed.row_bytes()*host_packed.rows()+host_acc.row_bytes()+
sizeof(Neighbor);
}
void Neighbor::get_host(const int inum, int *ilist, int *numj,
int **firstneigh, const int block_size) {
_nbor_time_avail=true;
time_nbor.start();
UCL_H_Vec<int> ilist_view;
ilist_view.view(ilist,inum,*dev);
ucl_copy(dev_nbor,ilist_view,false);
UCL_D_Vec<int> nbor_offset;
UCL_H_Vec<int> host_offset;
int copy_count=0;
int ij_count=0;
int acc_count=0;
int dev_count=0;
int *h_ptr=host_packed.begin();
_nbor_pitch=inum;
for (int ii=0; ii<inum; ii++) {
int i=ilist[ii];
int nj=numj[i];
host_acc[ii]=nj;
host_acc[ii+inum]=acc_count;
acc_count+=nj;
int *jlist=firstneigh[i];
for (int jj=0; jj<nj; jj++) {
*h_ptr=jlist[jj];
h_ptr++;
ij_count++;
if (ij_count==IJ_SIZE) {
dev_nbor.sync();
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
ucl_copy(nbor_offset,host_offset,true);
copy_count++;
ij_count=0;
dev_count+=IJ_SIZE;
h_ptr=host_packed.begin()+(IJ_SIZE*(copy_count%2));
}
}
}
if (ij_count!=0) {
dev_nbor.sync();
host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
nbor_offset.view_offset(dev_count,dev_packed,ij_count);
ucl_copy(nbor_offset,host_offset,true);
}
UCL_D_Vec<int> acc_view;
acc_view.view_offset(inum,dev_nbor,inum*2);
ucl_copy(acc_view,host_acc,true);
time_nbor.stop();
if (_use_packing==false) {
time_kernel.start();
int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
block_size));
_shared->k_nbor.set_size(GX,block_size);
_shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum,
&_threads_per_atom);
time_kernel.stop();
}
}
template <class numtyp, class acctyp>
void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
const int nall, Atom<numtyp,acctyp> &atom,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, bool &success,
int &mn) {
_nbor_time_avail=true;
const int nt=inum+host_inum;
// Calculate number of cells and allocate storage for binning as necessary
int ncellx, ncelly, ncellz, ncell_3d;
ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) +
2.0*_cell_size)/_cell_size));
ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) +
2.0*_cell_size)/_cell_size));
ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) +
2.0*_cell_size)/_cell_size));
ncell_3d = ncellx * ncelly * ncellz;
if (ncell_3d+1>_ncells) {
dev_cell_counts.clear();
dev_cell_counts.alloc(ncell_3d+1,dev_nbor);
if (_gpu_nbor==2) {
if (_ncells>0) {
host_cell_counts.clear();
delete [] cell_iter;
}
cell_iter = new int[ncell_3d+1];
host_cell_counts.alloc(ncell_3d+1,dev_nbor);
}
_ncells=ncell_3d+1;
_cell_bytes=dev_cell_counts.row_bytes();
}
const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
// If binning on CPU, do this now
if (_gpu_nbor==2) {
double stime = MPI_Wtime();
int *cell_id=atom.host_cell_id.begin();
int *particle_id=atom.host_particle_id.begin();
// Build cell list on CPU
host_cell_counts.zero();
double m_cell_size=-_cell_size;
double dx=subhi[0]-sublo[0]+_cell_size;
double dy=subhi[1]-sublo[1]+_cell_size;
double dz=subhi[2]-sublo[2]+_cell_size;
for (int i=0; i<nall; i++) {
double px, py, pz;
px=x[i][0]-sublo[0];
py=x[i][1]-sublo[1];
pz=x[i][2]-sublo[2];
if (px<m_cell_size) px=m_cell_size;
if (py<m_cell_size) py=m_cell_size;
if (pz<m_cell_size) pz=m_cell_size;
if (px>dx) px=dx;
if (py>dy) py=dy;
if (pz>dz) pz=dz;
int id=static_cast<int>(px/_cell_size + 1.0) +
static_cast<int>(py/_cell_size + 1.0) * ncellx +
static_cast<int>(pz/_cell_size + 1.0) * ncellx * ncelly;
cell_id[i]=id;
host_cell_counts[id+1]++;
}
cell_iter[0]=0;
for (int i=1; i<_ncells; i++) {
host_cell_counts[i]+=host_cell_counts[i-1];
cell_iter[i]=host_cell_counts[i];
}
time_hybrid1.start();
ucl_copy(dev_cell_counts,host_cell_counts,true);
time_hybrid1.stop();
for (int i=0; i<nall; i++) {
int celli=cell_id[i];
int ploc=cell_iter[celli];
cell_iter[celli]++;
particle_id[ploc]=i;
}
time_hybrid2.start();
ucl_copy(atom.dev_particle_id,atom.host_particle_id,true);
time_hybrid2.stop();
_bin_time+=MPI_Wtime()-stime;
}
if (_maxspecial>0) {
time_nbor.start();
UCL_H_Vec<int> view_nspecial, view_special, view_tag;
view_nspecial.view(nspecial[0],nt*3,*dev);
view_special.view(special[0],nt*_maxspecial,*dev);
view_tag.view(tag,nall,*dev);
ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
ucl_copy(atom.dev_tag,view_tag,nall,false);
time_nbor.stop();
if (_time_device)
time_nbor.add_to_total();
time_kernel.start();
const int b2x=_block_cell_2d;
const int b2y=_block_cell_2d;
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
&_maxspecial,&nt);
} else
time_kernel.start();
_nbor_pitch=inum;
_shared->neigh_tex.bind_float(atom.dev_x,4);
// If binning on GPU, do this now
if (_gpu_nbor==1) {
const int neigh_block=_block_cell_id;
const int GX=(int)ceil((float)nall/neigh_block);
const numtyp sublo0=static_cast<numtyp>(sublo[0]);
const numtyp sublo1=static_cast<numtyp>(sublo[1]);
const numtyp sublo2=static_cast<numtyp>(sublo[2]);
const numtyp subhi0=static_cast<numtyp>(subhi[0]);
const numtyp subhi1=static_cast<numtyp>(subhi[1]);
const numtyp subhi2=static_cast<numtyp>(subhi[2]);
_shared->k_cell_id.set_size(GX,neigh_block);
_shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
&atom.dev_particle_id.begin(),
&sublo0, &sublo1, &sublo2, &subhi0, &subhi1,
&subhi2, &cell_size_cast, &ncellx, &ncelly, &nall);
atom.sort_neighbor(nall);
/* calculate cell count */
_shared->k_cell_counts.set_size(GX,neigh_block);
_shared->k_cell_counts.run(&atom.dev_cell_id.begin(),
&dev_cell_counts.begin(), &nall, &ncell_3d);
}
/* build the neighbor list */
const int cell_block=_block_nbor_build;
_shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
_shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
&dev_cell_counts.begin(), &dev_nbor.begin(),
&dev_host_nbor.begin(), &dev_host_numj.begin(),
&_max_nbors,&cell_size_cast,
&ncellx, &ncelly, &ncellz, &inum, &nt, &nall,
&_threads_per_atom);
/* Get the maximum number of nbors and realloc if necessary */
UCL_D_Vec<int> numj;
numj.view_offset(inum,dev_nbor,inum);
ucl_copy(host_acc,numj,inum,false);
if (nt>inum) {
UCL_H_Vec<int> host_offset;
host_offset.view_offset(inum,host_acc,nt-inum);
ucl_copy(host_offset,dev_host_numj,nt-inum,false);
}
mn=host_acc[0];
for (int i=1; i<nt; i++)
mn=std::max(mn,host_acc[i]);
if (mn>_max_nbors) {
mn=static_cast<int>(static_cast<double>(mn)*1.10);
dev_nbor.clear();
success=success &&
(dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_x)==UCL_SUCCESS);
_gpu_bytes=dev_nbor.row_bytes();
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc(mn*_max_host,
dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
int *ptr=host_nbor.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=mn;
}
_gpu_bytes+=dev_host_nbor.row_bytes();
}
if (_alloc_packed) {
dev_packed.clear();
success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_packed.row_bytes();
}
if (!success)
return;
_max_nbors=mn;
time_kernel.stop();
if (_time_device)
time_kernel.add_to_total();
build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag, nspecial,
special, success, mn);
return;
}
if (_maxspecial>0) {
const int GX2=static_cast<int>(ceil(static_cast<double>
(nt*_threads_per_atom)/cell_block));
_shared->k_special.set_size(GX2,cell_block);
_shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
&dev_host_numj.begin(), &atom.dev_tag.begin(),
&dev_nspecial.begin(), &dev_special.begin(),
&inum, &nt, &_max_nbors, &_threads_per_atom);
}
time_kernel.stop();
time_nbor.start();
if (_gpu_host)
ucl_copy(host_nbor,dev_host_nbor,false);
time_nbor.stop();
}
template void Neighbor::build_nbor_list<PRECISION,ACC_PRECISION>
(double **x, const int inum, const int host_inum, const int nall,
Atom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,
int *, int **, int **, bool &success, int &mn);
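The _gpu_nbor==2 path in build_nbor_list above is a counting sort: histogram atoms per cell, prefix-sum the counts so cell_counts[c] is the first slot of cell c, then scatter particle ids so each cell's atoms are contiguous. A standalone host sketch of those same three steps (names illustrative):

#include <vector>

void bin_atoms(const std::vector<int> &cell_id, int ncells,
               std::vector<int> &cell_counts, std::vector<int> &particle_id) {
  cell_counts.assign(ncells + 1, 0);
  for (int id : cell_id) cell_counts[id + 1]++;       // histogram
  for (int c = 1; c <= ncells; c++)                   // prefix sum
    cell_counts[c] += cell_counts[c - 1];
  std::vector<int> iter(cell_counts.begin(), cell_counts.end());
  particle_id.assign(cell_id.size(), -1);
  for (int i = 0; i < (int)cell_id.size(); i++)
    particle_id[iter[cell_id[i]]++] = i;              // scatter
}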

235
lib/gpu/lal_neighbor.h Normal file

@ -0,0 +1,235 @@
/***************************************************************************
neighbor.h
-------------------
W. Michael Brown (ORNL)
Peng Wang (Nvidia)
Class for handling neighbor lists
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov, penwang@nvidia.com
***************************************************************************/
#ifndef LAL_NEIGHBOR_H
#define LAL_NEIGHBOR_H
#include "lal_atom.h"
#include "lal_neighbor_shared.h"
#define IJ_SIZE 131072
#ifdef USE_OPENCL
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
using namespace ucl_opencl;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
using namespace ucl_cudadr;
#endif
namespace LAMMPS_AL {
class Neighbor {
public:
Neighbor() : _allocated(false), _use_packing(false), _ncells(0) {}
~Neighbor() { clear(); }
/// Determine whether neighbor unpacking should be used
/** If false, twice as much memory is reserved to allow unpacking neighbors by
* atom for coalesced access. **/
void packing(const bool use_packing) { _use_packing=use_packing; }
/// Clear any old data and setup for new LAMMPS run
/** \param inum Initial number of particles whose neighbors stored on device
* \param host_inum Initial number of particles whose nbors copied to host
* \param max_nbors Initial number of rows in the neighbor matrix
* \param gpu_nbor 0 if neighboring will be performed on host
* gpu_nbor 1 if neighboring will be performed on device
* gpu_nbor 2 if binning on host and neighboring on device
* \param gpu_host 0 if host will not perform force calculations,
* 1 if gpu_nbor is true, and host needs a half nbor list,
* 2 if gpu_nbor is true, and host needs a full nbor list
* \param pre_cut True if cutoff test will be performed in separate kernel
* than the force kernel
* \param threads_per_atom Number of threads used per atom for force
* calculation **/
bool init(NeighborShared *shared, const int inum, const int host_inum,
const int max_nbors, const int maxspecial, UCL_Device &dev,
const int gpu_nbor, const int gpu_host, const bool pre_cut,
const int block_cell_2d, const int block_cell_id,
const int block_nbor_build, const int threads_per_atom,
const bool time_device);
/// Set the size of the cutoff+skin
inline void cell_size(const double size) { _cell_size=size; }
/// Get the size of the cutoff+skin
inline double cell_size() const { return _cell_size; }
/// Check if there is enough memory for neighbor data and realloc if not
/** \param inum Number of particles whose nbors will be stored on device
* \param max_nbor Current max number of neighbors for a particle
* \param success False if insufficient memory **/
inline void resize(const int inum, const int max_nbor, bool &success) {
if (inum>_max_atoms || max_nbor>_max_nbors) {
_max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
if (max_nbor>_max_nbors)
_max_nbors=static_cast<int>(static_cast<double>(max_nbor)*1.10);
alloc(success);
}
}
/// Check if there is enough memory for neighbor data and realloc if not
/** \param inum Number of particles whose nbors will be stored on device
* \param host_inum Number of particles whose nbors will be copied to host
* \param max_nbor Current max number of neighbors for a particle
* \param success False if insufficient memory **/
inline void resize(const int inum, const int host_inum, const int max_nbor,
bool &success) {
if (inum>_max_atoms || max_nbor>_max_nbors || host_inum>_max_host) {
_max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
_max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
if (max_nbor>_max_nbors)
_max_nbors=static_cast<int>(static_cast<double>(max_nbor)*1.10);
alloc(success);
}
}
inline void acc_timers() {
if (_nbor_time_avail) {
time_nbor.add_to_total();
time_kernel.add_to_total();
if (_gpu_nbor==2) {
time_hybrid1.add_to_total();
time_hybrid2.add_to_total();
}
_nbor_time_avail=false;
}
}
/// Free all memory on host and device
void clear();
/// Bytes per atom used on device
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by class
double host_memory_usage() const;
/// Returns the type of neighboring:
/** - 0 if neighboring will be performed on host
* - 1 if neighboring will be performed on device
* - 2 if binning on host and neighboring on device **/
inline int gpu_nbor() const { return _gpu_nbor; }
/// Make a copy of unpacked nbor lists in the packed storage area (for gb)
inline void copy_unpacked(const int inum, const int maxj)
{ ucl_copy(dev_packed,dev_nbor,inum*(maxj+2),true); }
/// Copy neighbor list from host (first time or from a rebuild)
void get_host(const int inum, int *ilist, int *numj,
int **firstneigh, const int block_size);
/// Return the stride in elements for each nbor row
inline int nbor_pitch() const { return _nbor_pitch; }
/// Return the maximum number of atoms that can currently be stored
inline int max_atoms() const { return _max_atoms; }
/// Return the maximum number of nbors for a particle based on current alloc
inline int max_nbors() const { return _max_nbors; }
/// Return the time spent binning on the CPU for hybrid neighbor builds
inline double bin_time() const { return _bin_time; }
/// Loop through neighbor count array and return maximum nbors for a particle
inline int max_nbor_loop(const int inum, int *numj, int *ilist) const {
int mn=0;
for (int i=0; i<inum; i++)
mn=std::max(mn,numj[ilist[i]]);
return mn;
}
/// Build nbor list on the device
template <class numtyp, class acctyp>
void build_nbor_list(double **x, const int inum, const int host_inum,
const int nall, Atom<numtyp,acctyp> &atom, double *sublo,
double *subhi, int *tag, int **nspecial, int **special,
bool &success, int &max_nbors);
/// Return the number of bytes used on device
inline double gpu_bytes() {
double res = _gpu_bytes + _c_bytes + _cell_bytes;
if (_gpu_nbor==0)
res += 2*IJ_SIZE*sizeof(int);
return res;
}
// ------------------------------- Data -------------------------------
/// Device neighbor matrix
/** - 1st row is i (index into atom data)
* - 2nd row is numj (number of neighbors)
* - 3rd row is starting location in packed nbors
* - Remaining rows are the neighbors arranged for coalesced access **/
UCL_D_Vec<int> dev_nbor;
/// Packed storage for neighbor lists copied from host
UCL_D_Vec<int> dev_packed;
/// Host buffer for copying neighbor lists
UCL_H_Vec<int> host_packed;
/// Host storage for nbor counts (row 1) & accumulated neighbor counts (row2)
UCL_H_Vec<int> host_acc;
// ----------------- Data for GPU Neighbor Calculation ---------------
/// Host storage for device calculated neighbor lists
/** Same storage format as device matrix **/
UCL_H_Vec<int> host_nbor;
/// Device storage for neighbor list matrix that will be copied to host
/** - 1st row is numj
* - Remaining rows are by atom, columns are nbors **/
UCL_D_Vec<int> dev_host_nbor;
UCL_D_Vec<int> dev_host_numj;
UCL_H_Vec<int> host_ilist;
UCL_H_Vec<int*> host_jlist;
/// Device storage for special neighbor counts
UCL_D_Vec<int> dev_nspecial;
/// Device storage for special neighbors
UCL_D_Vec<int> dev_special, dev_special_t;
/// Host storage for number of particles per cell
UCL_H_Vec<int> host_cell_counts;
int *cell_iter;
/// Device storage for number of particles per cell
UCL_D_Vec<int> dev_cell_counts;
/// Device timers
UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2;
private:
NeighborShared *_shared;
UCL_Device *dev;
bool _allocated, _use_packing, _nbor_time_avail, _time_device;
int _gpu_nbor, _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
bool _gpu_host, _alloc_packed;
double _cell_size, _bin_time;
double _gpu_bytes, _c_bytes, _cell_bytes;
void alloc(bool &success);
int _block_cell_2d, _block_cell_id, _block_nbor_build, _ncells;
int _threads_per_atom;
};
}
#endif
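A hedged host-side walk of the dev_nbor layout documented above, restricted to the simple t_per_atom==1 case after the matrix has been copied back to the host. Note that the unpack kernel writes the first neighbor row over the packed-start row, so neighbors begin at row 2; the function name is illustrative.

#include <cstdio>

// 'nbor' is the (max_nbors+2) x inum matrix copied back from dev_nbor.
void print_nbors(const int *nbor, int inum) {
  for (int ii = 0; ii < inum; ii++) {
    int i = nbor[ii];              // row 0: index into atom data
    int numj = nbor[inum + ii];    // row 1: neighbor count
    for (int jj = 0; jj < numj; jj++)
      printf("%d -> %d\n", i, nbor[(2 + jj)*inum + ii]);
  }
}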

42
lib/gpu/lal_neighbor_cpu.cu Normal file

@ -0,0 +1,42 @@
// **************************************************************************
// neighbor_cpu.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for handling CPU generated neighbor lists
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
#endif
__kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij,
const int inum, const int t_per_atom) {
int tid=THREAD_ID_X;
int offset=tid & (t_per_atom-1);
int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
if (ii<inum) {
__global int *nbor=dev_nbor+ii+inum;
int numj=*nbor;
nbor+=inum;
__global int *list=dev_ij+*nbor;
__global int *list_end=list+numj;
list+=offset;
nbor+=fast_mul(ii,t_per_atom-1)+offset;
int stride=fast_mul(t_per_atom,inum);
for ( ; list<list_end; list++) {
*nbor=*list;
nbor+=stride;
}
} // if ii
}
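The pointer arithmetic in kernel_unpack above interleaves each atom's neighbors so the t_per_atom threads assigned to it read consecutive words on every pass. A hedged host model of the destination indexing (t = t_per_atom), assuming at least one atom:

#include <vector>
#include <algorithm>

// Neighbor jj of atom ii lands at 2*inum + ii*t + jj%t + (jj/t)*t*inum,
// mirroring nbor += fast_mul(ii,t-1)+offset with stride = t*inum above.
std::vector<int> unpack(const std::vector<int> &dev_ij,
                        const std::vector<int> &start,
                        const std::vector<int> &numj, int inum, int t) {
  int mn = *std::max_element(numj.begin(), numj.end());
  int rows = (mn + t - 1)/t;                 // strided passes per atom
  std::vector<int> nbor(2*inum + rows*t*inum, -1);
  for (int ii = 0; ii < inum; ii++)
    for (int jj = 0; jj < numj[ii]; jj++)
      nbor[2*inum + ii*t + jj%t + (jj/t)*t*inum] = dev_ij[start[ii] + jj];
  return nbor;
}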

277
lib/gpu/lal_neighbor_gpu.cu Normal file

@ -0,0 +1,277 @@
// **************************************************************************
// neighbor_gpu.cu
// -------------------
// Peng Wang (Nvidia)
// W. Michael Brown (ORNL)
//
// Device code for handling GPU generated neighbor lists
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : penwang@nvidia.com, brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
texture<float4> neigh_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(neigh_tex, i); }
#endif
__kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id,
numtyp boxlo0,
numtyp boxlo1, numtyp boxlo2, numtyp boxhi0,
numtyp boxhi1, numtyp boxhi2, numtyp cell_size,
int ncellx, int ncelly, int nall) {
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < nall) {
numtyp4 p = fetch_pos(i,pos); //pos[i];
p.x -= boxlo0;
p.y -= boxlo1;
p.z -= boxlo2;
p.x = fmaxf(p.x, -cell_size);
p.x = fminf(p.x, boxhi0-boxlo0+cell_size);
p.y = fmaxf(p.y, -cell_size);
p.y = fminf(p.y, boxhi1-boxlo1+cell_size);
p.z = fmaxf(p.z, -cell_size);
p.z = fminf(p.z, boxhi2-boxlo2+cell_size);
unsigned int id = (unsigned int)(p.x/cell_size + 1.0)
+ (unsigned int)(p.y/cell_size + 1.0) * ncellx
+ (unsigned int)(p.z/cell_size + 1.0) * ncellx * ncelly;
cell_id[i] = id;
particle_id[i] = i;
}
}
__kernel void kernel_calc_cell_counts(unsigned *cell_id,
int *cell_counts, int nall, int ncell) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < nall) {
int id = cell_id[idx];
// handle boundary cases
if (idx == 0) {
for (int i = 0; i < id + 1; i++)
cell_counts[i] = 0;
}
if (idx == nall - 1) {
for (int i = id+1; i <= ncell; i++)
cell_counts[i] = nall;
}
if (idx > 0 && idx < nall) {
int id_l = cell_id[idx-1];
if (id != id_l) {
for (int i = id_l+1; i <= id; i++)
cell_counts[i] = idx;
}
}
}
}
#endif
__kernel void transpose(__global int *out, __global int *in, int columns_in,
int rows_in)
{
__local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
unsigned ti=THREAD_ID_X;
unsigned tj=THREAD_ID_Y;
unsigned bi=BLOCK_ID_X;
unsigned bj=BLOCK_ID_Y;
unsigned i=bi*BLOCK_CELL_2D+ti;
unsigned j=bj*BLOCK_CELL_2D+tj;
if ((i<columns_in) && (j<rows_in))
block[tj][ti]=in[j*columns_in+i];
__syncthreads();
i=bj*BLOCK_CELL_2D+ti;
j=bi*BLOCK_CELL_2D+tj;
if ((i<rows_in) && (j<columns_in))
out[j*rows_in+i] = block[ti][tj];
}
__kernel void calc_neigh_list_cell(__global numtyp4 *x_,
__global int *cell_particle_id,
__global int *cell_counts,
__global int *nbor_list,
__global int *host_nbor_list,
__global int *host_numj,
int neigh_bin_size, numtyp cell_size,
int ncellx, int ncelly, int ncellz,
int inum, int nt, int nall, int t_per_atom)
{
int tid = THREAD_ID_X;
int ix = BLOCK_ID_X;
int iy = BLOCK_ID_Y % ncelly;
int iz = BLOCK_ID_Y / ncelly;
int icell = ix + iy*ncellx + iz*ncellx*ncelly;
__local int cell_list_sh[BLOCK_NBOR_BUILD];
__local numtyp4 pos_sh[BLOCK_NBOR_BUILD];
int icell_begin = cell_counts[icell];
int icell_end = cell_counts[icell+1];
int nborz0 = max(iz-1,0), nborz1 = min(iz+1, ncellz-1),
nbory0 = max(iy-1,0), nbory1 = min(iy+1, ncelly-1),
nborx0 = max(ix-1,0), nborx1 = min(ix+1, ncellx-1);
numtyp4 diff;
numtyp r2;
int cap=ucl_ceil((numtyp)(icell_end - icell_begin)/BLOCK_SIZE_X);
for (int ii = 0; ii < cap; ii++) {
int i = icell_begin + tid + ii*BLOCK_SIZE_X;
int pid_i = nall, pid_j, stride;
numtyp4 atom_i, atom_j;
int cnt = 0;
__global int *neigh_counts, *neigh_list;
if (i < icell_end)
pid_i = cell_particle_id[i];
if (pid_i < nt) {
atom_i = fetch_pos(pid_i,x_); //pos[pid_i];
}
if (pid_i < inum) {
stride=inum;
neigh_counts=nbor_list+stride+pid_i;
neigh_list=neigh_counts+stride+pid_i*(t_per_atom-1);
stride=stride*t_per_atom-t_per_atom;
nbor_list[pid_i]=pid_i;
} else {
stride=0;
neigh_counts=host_numj+pid_i-inum;
neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
}
// loop through neighbors
for (int nborz = nborz0; nborz <= nborz1; nborz++) {
for (int nbory = nbory0; nbory <= nbory1; nbory++) {
for (int nborx = nborx0; nborx <= nborx1; nborx++) {
int jcell = nborx + nbory*ncellx + nborz*ncellx*ncelly;
int jcell_begin = cell_counts[jcell];
int jcell_end = cell_counts[jcell+1];
int num_atom_cell = jcell_end - jcell_begin;
// load jcell to shared memory
int num_iter = ucl_ceil((numtyp)num_atom_cell/BLOCK_NBOR_BUILD);
for (int k = 0; k < num_iter; k++) {
int end_idx = min(BLOCK_NBOR_BUILD,
num_atom_cell-k*BLOCK_NBOR_BUILD);
if (tid < end_idx) {
pid_j = cell_particle_id[tid+k*BLOCK_NBOR_BUILD+jcell_begin];
cell_list_sh[tid] = pid_j;
atom_j = fetch_pos(pid_j,x_); //[pid_j];
pos_sh[tid].x = atom_j.x;
pos_sh[tid].y = atom_j.y;
pos_sh[tid].z = atom_j.z;
}
__syncthreads();
if (pid_i < nt) {
for (int j = 0; j < end_idx; j++) {
int pid_j = cell_list_sh[j]; // gather from shared memory
diff.x = atom_i.x - pos_sh[j].x;
diff.y = atom_i.y - pos_sh[j].y;
diff.z = atom_i.z - pos_sh[j].z;
r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
if (r2 < cell_size*cell_size && r2 > 1e-5) {
cnt++;
if (cnt < neigh_bin_size) {
*neigh_list = pid_j;
neigh_list++;
if ((cnt & (t_per_atom-1))==0)
neigh_list=neigh_list+stride;
}
}
}
}
__syncthreads();
} // for (k)
}
}
}
if (pid_i < nt)
*neigh_counts = cnt;
} // for (i)
}
__kernel void kernel_special(__global int *dev_nbor,
__global int *host_nbor_list,
__global int *host_numj, __global int *tag,
__global int *nspecial, __global int *special,
int inum, int nt, int max_nbors, int t_per_atom) {
int tid=THREAD_ID_X;
int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid & (t_per_atom-1);
if (ii<nt) {
int stride;
__global int *list, *list_end;
int n1=nspecial[ii*3];
int n2=nspecial[ii*3+1];
int n3=nspecial[ii*3+2];
int numj;
if (ii < inum) {
stride=inum;
list=dev_nbor+stride+ii;
numj=*list;
list+=stride+fast_mul(ii,t_per_atom-1);
stride=fast_mul(inum,t_per_atom);
int njt=numj/t_per_atom;
list_end=list+fast_mul(njt,stride)+(numj & (t_per_atom-1));
list+=offset;
} else {
stride=1;
list=host_nbor_list+(ii-inum)*max_nbors;
numj=host_numj[ii-inum];
list_end=list+fast_mul(numj,stride);
}
for ( ; list<list_end; list+=stride) {
int nbor=*list;
int jtag=tag[nbor];
int offset=ii;
for (int i=0; i<n3; i++) {
if (special[offset]==jtag) {
int which = 1;
if (i>=n1)
which++;
if (i>=n2)
which++;
nbor=nbor ^ (which << SBBITS);
*list=nbor;
}
offset+=nt;
}
}
} // if ii
}
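kernel_special above hides the special-bond class in the top bits of each neighbor index; the pair kernels recover it with sbmask() and strip it with NEIGHMASK before indexing atom data. A hedged standalone illustration, with the SBBITS/NEIGHMASK values assumed to follow the usual LAMMPS convention:

#include <cstdio>

#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
static inline int sbmask(int j) { return j >> SBBITS & 3; }

int main() {
  int j = 12345;                        // plain neighbor index
  int which = 2;                        // special class 1..3 (1-2/1-3/1-4)
  int tagged = j ^ (which << SBBITS);   // encode, as in kernel_special
  printf("class %d, index %d\n", sbmask(tagged), tagged & NEIGHMASK);
  return 0;
}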

74
lib/gpu/lal_neighbor_shared.cpp Normal file

@ -0,0 +1,74 @@
/***************************************************************************
neighbor_shared.cpp
-------------------
W. Michael Brown (ORNL)
Class for management of data shared by all neighbor lists
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include "lal_precision.h"
#include "lal_neighbor_shared.h"
#ifdef USE_OPENCL
#include "neighbor_cpu_cl.h"
#include "neighbor_gpu_cl.h"
#else
#include "neighbor_cpu_ptx.h"
#include "neighbor_gpu_ptx.h"
#endif
using namespace LAMMPS_AL;
void NeighborShared::clear() {
if (_compiled) {
if (_gpu_nbor>0) {
if (_gpu_nbor==1) {
k_cell_id.clear();
k_cell_counts.clear();
}
k_build_nbor.clear();
k_transpose.clear();
k_special.clear();
delete build_program;
} else {
k_nbor.clear();
delete nbor_program;
}
_compiled=false;
}
}
void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor) {
if (_compiled)
return;
_gpu_nbor=gpu_nbor;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable -D"+
std::string(OCL_VENDOR);
if (_gpu_nbor==0) {
nbor_program=new UCL_Program(dev);
nbor_program->load_string(neighbor_cpu,flags.c_str());
k_nbor.set_function(*nbor_program,"kernel_unpack");
} else {
build_program=new UCL_Program(dev);
build_program->load_string(neighbor_gpu,flags.c_str());
if (_gpu_nbor==1) {
k_cell_id.set_function(*build_program,"calc_cell_id");
k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts");
}
k_build_nbor.set_function(*build_program,"calc_neigh_list_cell");
k_transpose.set_function(*build_program,"transpose");
k_special.set_function(*build_program,"kernel_special");
neigh_tex.get_texture(*build_program,"neigh_tex");
}
_compiled=true;
}

61
lib/gpu/lal_neighbor_shared.h Normal file

@ -0,0 +1,61 @@
/***************************************************************************
neighbor_shared.h
-------------------
W. Michael Brown (ORNL)
Class for management of data shared by all neighbor lists
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_NEIGHBOR_SHARED_H
#define LAL_NEIGHBOR_SHARED_H
#ifdef USE_OPENCL
#include "geryon/ocl_kernel.h"
#include "geryon/ocl_texture.h"
using namespace ucl_opencl;
#else
#include "geryon/nvd_kernel.h"
#include "geryon/nvd_texture.h"
using namespace ucl_cudadr;
#endif
namespace LAMMPS_AL {
class NeighborShared {
public:
NeighborShared() : _compiled(false) {}
~NeighborShared() { clear(); }
/// Free all memory on host and device
void clear();
/// Texture for cached position/type access with CUDA
UCL_Texture neigh_tex;
/// Compile kernels for neighbor lists
void compile_kernels(UCL_Device &dev, const int gpu_nbor);
// ----------------------------- Kernels
UCL_Program *nbor_program, *build_program;
UCL_Kernel k_nbor, k_cell_id, k_cell_counts, k_build_nbor;
UCL_Kernel k_transpose, k_special;
private:
bool _compiled;
int _gpu_nbor;
};
}
#endif

410
lib/gpu/lal_pppm.cpp Normal file

@ -0,0 +1,410 @@
/***************************************************************************
pppm.cpp
-------------------
W. Michael Brown (ORNL)
Class for PPPM acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "pppm_cl.h"
#else
#include "pppm_f_ptx.h"
#include "pppm_d_ptx.h"
#endif
#include "lal_pppm.h"
#include <cassert>
using namespace LAMMPS_AL;
#define PPPMT PPPM<numtyp, acctyp, grdtyp, grdtyp4>
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
PPPMT::PPPM() : _allocated(false), _compiled(false),
_max_bytes(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
}
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
PPPMT::~PPPM() {
clear(0.0);
delete ans;
}
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
int PPPMT::bytes_per_atom() const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+1;
}
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
const int order, const int nxlo_out,
const int nylo_out, const int nzlo_out,
const int nxhi_out, const int nyhi_out,
const int nzhi_out, grdtyp **rho_coeff,
grdtyp **vd_brick, const double slab_volfactor,
const int nx_pppm, const int ny_pppm,
const int nz_pppm, int &flag) {
_max_bytes=10;
screen=_screen;
bool success=true;
flag=device->init(*ans,nlocal,nall);
if (flag!=0)
return 0;
if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) {
flag=-5;
return 0;
}
if (device->ptx_arch()>0.0 && device->ptx_arch()<1.1) {
flag=-4;
return 0;
}
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pppm_block();
_pencil_size=device->num_mem_threads();
_block_pencils=_block_size/_pencil_size;
compile_kernels(*ucl_device);
// Initialize timers for the selected GPU
time_in.init(*ucl_device);
time_in.zero();
time_out.init(*ucl_device);
time_out.zero();
time_map.init(*ucl_device);
time_map.zero();
time_rho.init(*ucl_device);
time_rho.zero();
time_interp.init(*ucl_device);
time_interp.zero();
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
_allocated=true;
_max_bytes=0;
_max_an_bytes=ans->gpu_bytes();
_order=order;
_order_m_1=order-1;
_order2=_order_m_1*_order;
_nlower=-(_order-1)/2;
_nupper=order/2;
_nxlo_out=nxlo_out;
_nylo_out=nylo_out;
_nzlo_out=nzlo_out;
_nxhi_out=nxhi_out;
_nyhi_out=nyhi_out;
_nzhi_out=nzhi_out;
_slab_volfactor=slab_volfactor;
_nx_pppm=nx_pppm;
_ny_pppm=ny_pppm;
_nz_pppm=nz_pppm;
_max_brick_atoms=10;
// Get rho_coeff on device
int n2lo=(1-order)/2;
int numel=order*( order/2 - n2lo + 1 );
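// e.g. order=5 gives n2lo=-2 and numel=5*(2+2+1)=25, i.e. an
// order x order coefficient table stored as a flat vector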
success=success && (d_rho_coeff.alloc(numel,*ucl_device,UCL_READ_ONLY)==
UCL_SUCCESS);
UCL_H_Vec<grdtyp> view;
view.view(rho_coeff[0]+n2lo,numel,*ucl_device);
ucl_copy(d_rho_coeff,view,true);
_max_bytes+=d_rho_coeff.row_bytes();
// Allocate storage for grid
_npts_x=nxhi_out-nxlo_out+1;
_npts_y=nyhi_out-nylo_out+1;
_npts_z=nzhi_out-nzlo_out+1;
_npts_yx=_npts_x*_npts_y;
success=success && (d_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
UCL_SUCCESS);
success=success && (h_brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
UCL_SUCCESS);
success=success && (h_vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
UCL_SUCCESS);
*vd_brick=h_vd_brick.begin();
_max_bytes+=d_brick.row_bytes();
// Allocate vector with count of atoms assigned to each grid point
_nlocal_x=_npts_x+_nlower-_nupper;
_nlocal_y=_npts_y+_nlower-_nupper;
_nlocal_z=_npts_z+_nlower-_nupper;
_nlocal_yx=_nlocal_x*_nlocal_y;
_atom_stride=_nlocal_x*_nlocal_y*_nlocal_z;
success=success && (d_brick_counts.alloc(_atom_stride,*ucl_device)==
UCL_SUCCESS);
_max_bytes+=d_brick_counts.row_bytes();
// Allocate storage for atoms assigned to each grid point
success=success && (d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,
*ucl_device)==UCL_SUCCESS);
_max_bytes+=d_brick_atoms.row_bytes();
// Allocate error flags for checking out of bounds atoms
success=success && (h_error_flag.alloc(1,*ucl_device)==UCL_SUCCESS);
success=success && (d_error_flag.alloc(1,*ucl_device,UCL_WRITE_ONLY)==
UCL_SUCCESS);
if (!success) {
flag=-3;
return 0;
}
d_error_flag.zero();
_max_bytes+=1;
_cpu_idle_time=0.0;
return h_brick.begin();
}
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
void PPPMT::clear(const double cpu_time) {
if (!_allocated)
return;
_allocated=false;
_precompute_done=false;
d_brick.clear();
h_brick.clear();
h_vd_brick.clear();
d_brick_counts.clear();
h_error_flag.clear();
d_error_flag.clear();
d_brick_atoms.clear();
acc_timers();
device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp,
*ans,_max_bytes+_max_an_bytes,cpu_time,
_cpu_idle_time,screen);
if (_compiled) {
k_particle_map.clear();
k_make_rho.clear();
k_interp.clear();
delete pppm_program;
_compiled=false;
}
time_in.clear();
time_out.clear();
time_map.clear();
time_rho.clear();
time_interp.clear();
ans->clear();
device->clear();
}
// ---------------------------------------------------------------------------
// Charge assignment that can be performed asynchronously
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *host_q, double *boxlo,
const double delxinv, const double delyinv,
const double delzinv) {
acc_timers();
if (nlocal==0) {
zero_timers();
return;
}
ans->inum(nlocal);
if (ago==0) {
resize_atom(nlocal,nall,success);
resize_local(nlocal,success);
if (!success)
return;
double bytes=ans->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
atom->add_x_data(host_x,host_type);
atom->add_q_data();
time_map.start();
// Compute the block size and grid size to keep all cores busy
int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->ans->inum();
// Boxlo adjusted to be the upper left brick corner; a half-cell shift
// is applied for odd spline orders
double shift=0.0;
if (_order % 2)
shift=0.5;
_brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv;
_brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv;
_brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv;
_delxinv=delxinv;
_delyinv=delyinv;
_delzinv=delzinv;
double delvolinv = delxinv*delyinv*delzinv;
grdtyp f_delvolinv = delvolinv;
device->zero(d_brick_counts,d_brick_counts.numel());
k_particle_map.set_size(GX,BX);
k_particle_map.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &f_delvolinv,
&ainum, &d_brick_counts.begin(), &d_brick_atoms.begin(),
&_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv,
&_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z,
&_atom_stride, &_max_brick_atoms, &d_error_flag.begin());
time_map.stop();
time_rho.start();
BX=block_size();
GX=static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/
_block_pencils));
k_make_rho.set_size(GX,BX);
k_make_rho.run(&d_brick_counts.begin(), &d_brick_atoms.begin(),
&d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride,
&_npts_x, &_npts_y, &_npts_z, &_nlocal_x, &_nlocal_y,
&_nlocal_z, &_order_m_1, &_order, &_order2);
time_rho.stop();
time_out.start();
ucl_copy(h_brick,d_brick,_npts_yx*_npts_z,true);
ucl_copy(h_error_flag,d_error_flag,true);
time_out.stop();
_precompute_done=true;
}
// ---------------------------------------------------------------------------
// Charge spreading stuff
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
int PPPMT::spread(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *host_q, double *boxlo,
const double delxinv, const double delyinv,
const double delzinv) {
if (_precompute_done==false) {
atom->acc_timers();
_precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv,
delyinv,delzinv);
}
device->stop_host_timer();
if (!success || nlocal==0)
return 0;
double t=MPI_Wtime();
time_out.sync_stop();
_cpu_idle_time+=MPI_Wtime()-t;
_precompute_done=false;
if (h_error_flag[0]==2) {
// Not enough storage for atoms on the brick
_max_brick_atoms*=2;
d_error_flag.zero();
d_brick_atoms.clear();
d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,*ucl_device);
_max_bytes+=d_brick_atoms.row_bytes();
return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,
delxinv,delyinv,delzinv);
}
return h_error_flag[0];
}
// ---------------------------------------------------------------------------
// Charge spreading stuff
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
void PPPMT::interp(const grdtyp qqrd2e_scale) {
time_in.start();
ucl_copy(d_brick,h_vd_brick,true);
time_in.stop();
time_interp.start();
// Compute the block size and grid size to keep all cores busy
int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
int ainum=this->ans->inum();
k_interp.set_size(GX,BX);
k_interp.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum,
&d_brick.begin(), &d_rho_coeff.begin(), &_npts_x, &_npts_yx,
&_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv,
&_order, &_order2, &qqrd2e_scale, &ans->dev_ans.begin());
time_interp.stop();
ans->copy_answers(false,false,false,false);
device->add_ans_object(ans);
}
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
double PPPMT::host_memory_usage() const {
return device->atom.host_memory_usage()+
sizeof(PPPM<numtyp,acctyp,grdtyp,grdtyp4>);
}
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
void PPPMT::compile_kernels(UCL_Device &dev) {
if (_compiled)
return;
if (sizeof(grdtyp)==sizeof(double) && ucl_device->double_precision()==false)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
#ifdef USE_OPENCL
flags+=std::string(" -Dgrdtyp=")+ucl_template_name<grdtyp>()+" -Dgrdtyp4="+
ucl_template_name<grdtyp>()+"4";
#endif
pppm_program=new UCL_Program(dev);
#ifdef USE_OPENCL
pppm_program->load_string(pppm,flags.c_str());
#else
if (sizeof(grdtyp)==sizeof(float))
pppm_program->load_string(pppm_f,flags.c_str());
else
pppm_program->load_string(pppm_d,flags.c_str());
#endif
k_particle_map.set_function(*pppm_program,"particle_map");
k_make_rho.set_function(*pppm_program,"make_rho");
k_interp.set_function(*pppm_program,"interp");
pos_tex.get_texture(*pppm_program,"pos_tex");
q_tex.get_texture(*pppm_program,"q_tex");
_compiled=true;
}
template class PPPM<PRECISION,ACC_PRECISION,float,_lgpu_float4>;
template class PPPM<PRECISION,ACC_PRECISION,double,_lgpu_double4>;

267
lib/gpu/lal_pppm.cu Normal file
View File

@ -0,0 +1,267 @@
// **************************************************************************
// pppm.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for PPPM acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_preprocessor.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifndef _DOUBLE_DOUBLE
ucl_inline float4 fetch_pos(const int& i, const float4 *pos)
{ return tex1Dfetch(pos_tex, i); }
ucl_inline float fetch_q(const int& i, const float *q)
{ return tex1Dfetch(q_tex, i); }
#endif
// Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error
// generated at runtime with use of pppm/gpu
#if (__CUDA_ARCH__ < 110)
#define atomicAdd(x,y) *(x)+=0
#endif
#else
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
#endif
// Number of threads per pencil for charge spread
#define PENCIL_SIZE MEM_THREADS
// Number of pencils per block for charge spread
#define BLOCK_PENCILS (PPPM_BLOCK_1D/PENCIL_SIZE)
__kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_,
const grdtyp delvolinv, const int nlocal,
__global int *counts, __global grdtyp4 *ans,
const grdtyp b_lo_x, const grdtyp b_lo_y,
const grdtyp b_lo_z, const grdtyp delxinv,
const grdtyp delyinv, const grdtyp delzinv,
const int nlocal_x, const int nlocal_y,
const int nlocal_z, const int atom_stride,
const int max_atoms, __global int *error) {
// ii indexes the atom handled by this thread
int ii=GLOBAL_ID_X;
// Resequence the atom indices to avoid collisions during atomic ops
int nthreads=GLOBAL_SIZE_X;
ii=fast_mul(ii,PPPM_BLOCK_1D);
ii-=(ii/nthreads)*(nthreads-1);
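// e.g. with nthreads=128 and PPPM_BLOCK_1D=64 this maps thread ids
// 0,1,2,3,... to atoms 0,64,1,65,..., so threads running together touch
// atoms far apart in the (roughly spatially sorted) atom ordering and
// rarely contend for the same grid-cell counter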
int nx,ny,nz;
if (ii<nlocal) {
numtyp4 p=fetch_pos(ii,x_);
grdtyp4 delta;
delta.w=delvolinv*fetch_q(ii,q_);
if (delta.w!=(grdtyp)0.0) {
delta.x=(p.x-b_lo_x)*delxinv;
nx=delta.x;
delta.y=(p.y-b_lo_y)*delyinv;
ny=delta.y;
delta.z=(p.z-b_lo_z)*delzinv;
nz=delta.z;
if (delta.x<(grdtyp)0 || delta.y<(grdtyp)0 || delta.z<(grdtyp)0 ||
nx>=nlocal_x || ny>=nlocal_y || nz>=nlocal_z)
*error=1;
else {
delta.x=nx+(grdtyp)0.5-delta.x;
delta.y=ny+(grdtyp)0.5-delta.y;
delta.z=nz+(grdtyp)0.5-delta.z;
int i=nz*nlocal_y*nlocal_x+ny*nlocal_x+nx;
int old=atom_add(counts+i, 1);
if (old>=max_atoms) {
*error=2;
atom_add(counts+i, -1);
} else
ans[atom_stride*old+i]=delta;
}
}
}
}
/* --------------------------- */
__kernel void make_rho(__global int *counts, __global grdtyp4 *atoms,
__global grdtyp *brick, __global grdtyp *_rho_coeff,
const int atom_stride, const int npts_x,
const int npts_y, const int npts_z, const int nlocal_x,
const int nlocal_y, const int nlocal_z,
const int order_m_1, const int order, const int order2) {
__local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE];
__local grdtyp front[BLOCK_PENCILS][PENCIL_SIZE+PPPM_MAX_SPLINE];
__local grdtyp ans[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
int tid=THREAD_ID_X;
if (tid<order2+order)
rho_coeff[tid]=_rho_coeff[tid];
int pid=tid/PENCIL_SIZE;
int fid=tid%PENCIL_SIZE;
int fid_halo=PENCIL_SIZE+fid;
if (fid<order)
front[pid][fid_halo]=(grdtyp)0.0;
__syncthreads();
int bt=BLOCK_ID_X*BLOCK_PENCILS+pid;
int ny=bt%npts_y;
int nz=bt/npts_y;
int y_start=0;
int z_start=0;
int y_stop=order;
int z_stop=order;
if (ny<order_m_1)
y_start=order_m_1-ny;
if (nz<order_m_1)
z_start=order_m_1-nz;
if (ny>=nlocal_y)
y_stop-=ny-nlocal_y+1;
if (nz>=nlocal_z)
z_stop-=nz-nlocal_z+1;
int z_stride=fast_mul(nlocal_x,nlocal_y);
int loop_count=npts_x/PENCIL_SIZE+1;
int nx=fid;
int pt=fast_mul(nz,fast_mul(npts_y,npts_x))+fast_mul(ny,npts_x)+nx;
for (int i=0 ; i<loop_count; i++) {
for (int n=0; n<order; n++)
ans[n][tid]=(grdtyp)0.0;
if (nx<nlocal_x && nz<npts_z) {
int z_pos=fast_mul(nz+z_start-order_m_1,z_stride);
for (int m=z_start; m<z_stop; m++) {
int y_pos=fast_mul(ny+y_start-order_m_1,nlocal_x);
for (int l=y_start; l<y_stop; l++) {
int pos=z_pos+y_pos+nx;
int natoms=fast_mul(counts[pos],atom_stride);
for (int row=pos; row<natoms; row+=atom_stride) {
grdtyp4 delta=atoms[row];
grdtyp rho1d_1=(grdtyp)0.0;
grdtyp rho1d_2=(grdtyp)0.0;
for (int k=order2+order-1; k > -1; k-=order) {
rho1d_1=rho_coeff[k-l]+rho1d_1*delta.y;
rho1d_2=rho_coeff[k-m]+rho1d_2*delta.z;
}
delta.w*=rho1d_1*rho1d_2;
for (int n=0; n<order; n++) {
grdtyp rho1d_0=(grdtyp)0.0;
for (int k=order2+n; k>=n; k-=order)
rho1d_0=rho_coeff[k]+rho1d_0*delta.x;
ans[n][tid]+=delta.w*rho1d_0;
}
}
y_pos+=nlocal_x;
}
z_pos+=z_stride;
}
}
__syncthreads();
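// Slide the pencil accumulation window: the spline tail that spilled
// past PENCIL_SIZE in the previous chunk seeds the start of this chunk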
if (fid<order) {
front[pid][fid]=front[pid][fid_halo];
front[pid][fid_halo]=(grdtyp)0.0;
} else
front[pid][fid]=(grdtyp)0.0;
for (int n=0; n<order; n++) {
front[pid][fid+n]+=ans[n][tid];
__syncthreads();
}
if (nx<npts_x && nz<npts_z)
brick[pt]=front[pid][fid];
pt+=PENCIL_SIZE;
nx+=PENCIL_SIZE;
}
}
__kernel void interp(__global numtyp4 *x_, __global numtyp *q_,
const int nlocal, __global grdtyp4 *brick,
__global grdtyp *_rho_coeff, const int npts_x,
const int npts_yx, const grdtyp b_lo_x,
const grdtyp b_lo_y, const grdtyp b_lo_z,
const grdtyp delxinv, const grdtyp delyinv,
const grdtyp delzinv, const int order,
const int order2, const grdtyp qqrd2e_scale,
__global acctyp4 *ans) {
__local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE];
__local grdtyp rho1d_0[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
__local grdtyp rho1d_1[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
int tid=THREAD_ID_X;
if (tid<order2+order)
rho_coeff[tid]=_rho_coeff[tid];
__syncthreads();
int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X;
int nx,ny,nz;
grdtyp tx,ty,tz;
if (ii<nlocal) {
numtyp4 p=fetch_pos(ii,x_);
grdtyp qs=qqrd2e_scale*fetch_q(ii,q_);
acctyp4 ek;
ek.x=(acctyp)0.0;
ek.y=(acctyp)0.0;
ek.z=(acctyp)0.0;
if (qs!=(grdtyp)0.0) {
tx=(p.x-b_lo_x)*delxinv;
nx=tx;
ty=(p.y-b_lo_y)*delyinv;
ny=ty;
tz=(p.z-b_lo_z)*delzinv;
nz=tz;
grdtyp dx=nx+(grdtyp)0.5-tx;
grdtyp dy=ny+(grdtyp)0.5-ty;
grdtyp dz=nz+(grdtyp)0.5-tz;
for (int k=0; k<order; k++) {
rho1d_0[k][tid]=(grdtyp)0.0;
rho1d_1[k][tid]=(grdtyp)0.0;
for (int l=order2+k; l>=k; l-=order) {
rho1d_0[k][tid]=rho_coeff[l]+rho1d_0[k][tid]*dx;
rho1d_1[k][tid]=rho_coeff[l]+rho1d_1[k][tid]*dy;
}
}
int mz=fast_mul(nz,npts_yx)+nx;
for (int n=0; n<order; n++) {
grdtyp rho1d_2=(grdtyp)0.0;
for (int k=order2+n; k>=n; k-=order)
rho1d_2=rho_coeff[k]+rho1d_2*dz;
grdtyp z0=qs*rho1d_2;
int my=mz+fast_mul(ny,npts_x);
for (int m=0; m<order; m++) {
grdtyp y0=z0*rho1d_1[m][tid];
for (int l=0; l<order; l++) {
grdtyp x0=y0*rho1d_0[l][tid];
grdtyp4 el=brick[my+l];
ek.x-=x0*el.x;
ek.y-=x0*el.y;
ek.z-=x0*el.z;
}
my+=npts_x;
}
mz+=npts_yx;
}
}
ans[ii]=ek;
}
}

196
lib/gpu/lal_pppm.h Normal file
View File

@ -0,0 +1,196 @@
/***************************************************************************
pppm.h
-------------------
W. Michael Brown (ORNL)
Class for PPPM acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_PPPM_H
#define LAL_PPPM_H
#include "mpi.h"
#include "lal_device.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
namespace LAMMPS_AL {
template <class numtyp, class acctyp> class Device;
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
class PPPM {
public:
PPPM();
virtual ~PPPM();
/// Clear any previous data and set up for a new LAMMPS run
/** Success will be:
* - 0 if successful
* - -1 if fix gpu not found
* - -2 if GPU could not be found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
grdtyp * init(const int nlocal, const int nall, FILE *screen, const int order,
const int nxlo_out, const int nylo_out, const int nzlo_out,
const int nxhi_out, const int nyhi_out, const int nzhi_out,
grdtyp **rho_coeff, grdtyp **vd_brick,
const double slab_volfactor, const int nx_pppm,
const int ny_pppm, const int nz_pppm, int &success);
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
}
ans->resize(inum,success);
}
/// Check if there is enough storage for local atoms and realloc if not
inline void resize_local(const int inum, bool &success) {
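// PPPM keeps no extra per-local-atom device arrays, so there is nothing
// to resize; the hook presumably exists for interface symmetry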
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear(const double cpu_time);
/// Returns memory usage on device per atom
int bytes_per_atom() const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
ans->acc_timers();
time_in.add_to_total();
time_out.add_to_total();
time_map.add_to_total();
time_rho.add_to_total();
time_interp.add_to_total();
}
}
/// Zero timers
inline void zero_timers() {
atom->zero_timers();
ans->zero_timers();
time_in.zero();
time_out.zero();
time_map.zero();
time_rho.zero();
time_interp.zero();
}
/// Precomputations for charge assignment that can be done asynchronously
inline void precompute(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *charge, double *boxlo, double *prd) {
double delxinv=_nx_pppm/prd[0];
double delyinv=_ny_pppm/prd[1];
double delzinv=_nz_pppm/(prd[2]*_slab_volfactor);
_precompute(ago,nlocal,nall,host_x,host_type,success,charge,boxlo,delxinv,
delyinv,delzinv);
}
/// Returns non-zero if out of bounds atoms
int spread(const int ago, const int nlocal, const int nall, double **host_x,
int *host_type, bool &success, double *charge, double *boxlo,
const double delxinv, const double delyinv, const double delzinv);
void interp(const grdtyp qqrd2e_scale);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_in, time_out, time_map, time_rho, time_interp;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
Atom<numtyp,acctyp> *atom;
// --------------------------- GRID DATA --------------------------
UCL_H_Vec<grdtyp> h_brick, h_vd_brick;
UCL_D_Vec<grdtyp> d_brick;
// Count of number of atoms assigned to each grid point
UCL_D_Vec<int> d_brick_counts;
// Atoms assigned to each grid point
UCL_D_Vec<grdtyp4> d_brick_atoms;
// Error checking for out of bounds atoms
UCL_D_Vec<int> d_error_flag;
UCL_H_Vec<int> h_error_flag;
// Number of grid points in brick (including ghost)
int _npts_x, _npts_y, _npts_z, _npts_yx;
// Number of local grid points in brick
int _nlocal_x, _nlocal_y, _nlocal_z, _nlocal_yx, _atom_stride;
// -------------------------- SPLINE DATA -------------------------
UCL_D_Vec<grdtyp> d_rho_coeff;
int _order, _nlower, _nupper, _order_m_1, _order2;
int _nxlo_out, _nylo_out, _nzlo_out, _nxhi_out, _nyhi_out, _nzhi_out;
// ------------------------ FORCE/ENERGY DATA -----------------------
Answer<numtyp,acctyp> *ans;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pppm_program;
UCL_Kernel k_particle_map, k_make_rho, k_interp;
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
UCL_Texture q_tex;
protected:
bool _allocated, _compiled, _precompute_done;
int _block_size, _block_pencils, _pencil_size, _max_brick_atoms, _max_atoms;
double _max_bytes, _max_an_bytes;
double _cpu_idle_time;
grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv;
double _slab_volfactor;
int _nx_pppm, _ny_pppm, _nz_pppm;
void compile_kernels(UCL_Device &dev);
void _precompute(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *charge, double *boxlo, const double delxinv,
const double delyinv, const double delzinv);
};
}
#endif

163
lib/gpu/lal_pppm_ext.cpp Normal file
View File

@ -0,0 +1,163 @@
/***************************************************************************
pppm_ext.cpp
-------------------
W. Michael Brown (ORNL)
Functions for LAMMPS access to PPPM acceleration routines
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_pppm.h"
using namespace std;
using namespace LAMMPS_AL;
static PPPM<PRECISION,ACC_PRECISION,float,_lgpu_float4> PPPMF;
static PPPM<PRECISION,ACC_PRECISION,double,_lgpu_double4> PPPMD;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
template <class grdtyp, class memtyp>
grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
FILE *screen, const int order, const int nxlo_out,
const int nylo_out, const int nzlo_out,
const int nxhi_out, const int nyhi_out,
const int nzhi_out, grdtyp **rho_coeff,
grdtyp **vd_brick, const double slab_volfactor,
const int nx_pppm, const int ny_pppm, const int nz_pppm,
int &success) {
pppm.clear(0.0);
int first_gpu=pppm.device->first_device();
int last_gpu=pppm.device->last_device();
int world_me=pppm.device->world_me();
int gpu_rank=pppm.device->gpu_rank();
int procs_per_gpu=pppm.device->procs_per_gpu();
pppm.device->init_message(screen,"pppm",first_gpu,last_gpu);
bool message=false;
if (pppm.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
success=0;
grdtyp * host_brick=NULL;
if (world_me==0)
host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out,nzlo_out,
nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,
slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success);
pppm.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out,
nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,
vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm,
success);
pppm.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return host_brick;
}
float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen,
const int order, const int nxlo_out,
const int nylo_out, const int nzlo_out,
const int nxhi_out, const int nyhi_out,
const int nzhi_out, float **rho_coeff,
float **vd_brick, const double slab_volfactor,
const int nx_pppm, const int ny_pppm, const int nz_pppm,
int &success) {
float *b=pppm_gpu_init(PPPMF,nlocal,nall,screen,order,nxlo_out,nylo_out,
nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,
slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success);
PPPMF.device->set_single_precompute(&PPPMF);
return b;
}
void pppm_gpu_clear_f(const double cpu_time) {
PPPMF.clear(cpu_time);
}
int pppm_gpu_spread_f(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *host_q, double *boxlo, const double delxinv,
const double delyinv, const double delzinv) {
return PPPMF.spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,
delxinv,delyinv,delzinv);
}
void pppm_gpu_interp_f(const float qqrd2e_scale) {
return PPPMF.interp(qqrd2e_scale);
}
double pppm_gpu_bytes_f() {
return PPPMF.host_memory_usage();
}
double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen,
const int order, const int nxlo_out,
const int nylo_out, const int nzlo_out,
const int nxhi_out, const int nyhi_out,
const int nzhi_out, double **rho_coeff,
double **vd_brick, const double slab_volfactor,
const int nx_pppm, const int ny_pppm,
const int nz_pppm, int &success) {
double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out,
nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,
vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm,
success);
PPPMD.device->set_double_precompute(&PPPMD);
return b;
}
void pppm_gpu_clear_d(const double cpu_time) {
PPPMD.clear(cpu_time);
}
int pppm_gpu_spread_d(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *host_q, double *boxlo, const double delxinv,
const double delyinv, const double delzinv) {
return PPPMD.spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,
delxinv,delyinv,delzinv);
}
void pppm_gpu_interp_d(const double qqrd2e_scale) {
return PPPMD.interp(qqrd2e_scale);
}
double pppm_gpu_bytes_d() {
return PPPMD.host_memory_usage();
}

95
lib/gpu/lal_precision.h Normal file
View File

@ -0,0 +1,95 @@
/***************************************************************************
precision.h
-------------------
W. Michael Brown (ORNL)
Data and preprocessor definitions for different precision modes
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_PRECISION_H
#define LAL_PRECISION_H
struct _lgpu_float2 {
float x; float y;
};
struct _lgpu_float4 {
float x; float y; float z; float w;
};
struct _lgpu_double2 {
double x; double y;
};
struct _lgpu_double4 {
double x; double y; double z; double w;
};
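// Host-side structs intended to mirror the device float2/float4/double2/
// double4 vector layouts so host buffers match device types byte for byte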
#include <iostream>
inline std::ostream & operator<<(std::ostream &out, const _lgpu_float2 &v) {
out << v.x << " " << v.y;
return out;
}
inline std::ostream & operator<<(std::ostream &out, const _lgpu_float4 &v) {
out << v.x << " " << v.y << " " << v.z;
return out;
}
inline std::ostream & operator<<(std::ostream &out, const _lgpu_double2 &v) {
out << v.x << " " << v.y;
return out;
}
inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
out << v.x << " " << v.y << " " << v.z;
return out;
}
// PRECISION - Precision for rsq, energy, force, and torque calculation
// ACC_PRECISION - Precision for accumulation of energies, forces, and torques
#ifdef _SINGLE_DOUBLE
#define OCL_PRECISION_COMPILE "-D_SINGLE_DOUBLE"
#define PRECISION float
#define ACC_PRECISION double
#define numtyp2 _lgpu_float2
#define numtyp4 _lgpu_float4
#define acctyp4 _lgpu_double4
#endif
#ifdef _DOUBLE_DOUBLE
#define OCL_PRECISION_COMPILE "-D_DOUBLE_DOUBLE"
#define PRECISION double
#define ACC_PRECISION double
#define numtyp2 _lgpu_double2
#define numtyp4 _lgpu_double4
#define acctyp4 _lgpu_double4
#endif
#ifndef PRECISION
#define OCL_PRECISION_COMPILE "-D_SINGLE_SINGLE"
#define PRECISION float
#define ACC_PRECISION float
#define numtyp2 _lgpu_float2
#define numtyp4 _lgpu_float4
#define acctyp4 _lgpu_float4
#endif
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef FERMI_OCL
#define OCL_VENDOR "FERMI_OCL"
#endif
#ifndef OCL_VENDOR
#define OCL_VENDOR "GENERIC_OCL"
#endif
#endif

319
lib/gpu/lal_preprocessor.h Normal file
View File

@ -0,0 +1,319 @@
// **************************************************************************
// preprocessor.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for CUDA-specific preprocessor definitions
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
//*************************************************************************
// Preprocessor Definitions
//
// Note: It is assumed that constants with the same names are defined with
// the same values in all files.
//
// ARCH
// Definition: Architecture number for accelerator
// MEM_THREADS
// Definition: Number of threads with sequential ids accessing memory
// simultaneously on multiprocessor
// WARP_SIZE:
// Definition: Number of threads guaranteed to be on the same instruction
// THREADS_PER_ATOM
// Definition: Default number of threads assigned per atom for pair styles
// Restrictions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE
// THREADS_PER_CHARGE
// Definition: Default number of threads assigned per atom for pair styles
// with charge
// Restrictions: Must be power of 2; THREADS_PER_CHARGE<=WARP_SIZE
// PPPM_MAX_SPLINE
// Definition: Maximum order for splines in PPPM
// PPPM_BLOCK_1D
// Definition: Thread block size for PPPM kernels
// Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE
// PPPM_BLOCK_1D%32==0
// BLOCK_PAIR
// Definition: Default thread block size for pair styles
// Restrictions:
// MAX_SHARED_TYPES 8
// Definition: Max # of atom type params that can be stored in shared memory
// Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR
// BLOCK_CELL_2D
// Definition: Default block size in each dimension for cell list builds
// and matrix transpose
// BLOCK_CELL_ID
// Definition: Default block size for binning atoms in cell list builds
// BLOCK_NBOR_BUILD
// Definition: Default block size for neighbor list builds
// BLOCK_BIO_PAIR
// Definition: Default thread block size for "bio" pair styles
// MAX_BIO_SHARED_TYPES
// Definition: Max # of atom type params that can be stored in shared memory
// Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 &&
// MAX_BIO_SHARED_TYPES>=BLOCK_BIO_PAIR
//
//*************************************************************************/
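// A minimal sketch (not part of the original) of how the restrictions
// above could be enforced at compile time with the macros defined below:
//
// #if (PPPM_BLOCK_1D < PPPM_MAX_SPLINE*PPPM_MAX_SPLINE)
// #error "PPPM_BLOCK_1D must be >= PPPM_MAX_SPLINE*PPPM_MAX_SPLINE"
// #endif
// #if (MAX_SHARED_TYPES*MAX_SHARED_TYPES > BLOCK_PAIR)
// #error "MAX_SHARED_TYPES*MAX_SHARED_TYPES must be <= BLOCK_PAIR"
// #endif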
// -------------------------------------------------------------------------
// CUDA DEFINITIONS
// -------------------------------------------------------------------------
#ifdef NV_KERNEL
#ifdef __CUDA_ARCH__
#define ARCH __CUDA_ARCH__
#else
#define ARCH 100
#endif
#if (ARCH < 200)
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 16
#define BLOCK_NBOR_BUILD 64
#define BLOCK_PAIR 64
#define BLOCK_BIO_PAIR 64
#define MAX_SHARED_TYPES 8
#else
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_NBOR_BUILD 128
#define BLOCK_PAIR 128
#define BLOCK_BIO_PAIR 128
#define MAX_SHARED_TYPES 11
#endif
#define WARP_SIZE 32
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#ifdef _DOUBLE_DOUBLE
ucl_inline double4 fetch_pos(const int& i, const double4 *pos) { return pos[i]; }
ucl_inline double fetch_q(const int& i, const double *q) { return q[i]; }
#endif
#if (ARCH < 200)
#define fast_mul __mul24
#define MEM_THREADS 16
#else
#define fast_mul(X,Y) (X)*(Y)
#define MEM_THREADS 32
#endif
#ifdef CUDA_PRE_THREE
struct __builtin_align__(16) _double4
{
double x, y, z, w;
};
typedef struct _double4 double4;
#endif
#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x)
#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y)
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_SIZE_X blockDim.x
#define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__
#define __local __shared__
#define __global
#define atom_add atomicAdd
#define ucl_inline static __inline__ __device__
#ifdef _DOUBLE_DOUBLE
#define ucl_exp exp
#define ucl_powr pow
#define ucl_atan atan
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#else
#define ucl_atan atanf
#define ucl_cbrt cbrtf
#define ucl_ceil ceilf
#define ucl_abs fabsf
#define ucl_recip(x) ((numtyp)1.0/(x))
#ifdef NO_HARDWARE_TRANSCENDENTALS
#define ucl_exp expf
#define ucl_powr powf
#define ucl_rsqrt rsqrtf
#define ucl_sqrt sqrtf
#else
#define ucl_exp __expf
#define ucl_powr __powf
#define ucl_rsqrt __rsqrtf
#define ucl_sqrt __sqrtf
#endif
#endif
#endif
// -------------------------------------------------------------------------
// FERMI OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef FERMI_OCL
#define USE_OPENCL
#define fast_mul(X,Y) (X)*(Y)
#define ARCH 0
#define DRIVER 0
#define MEM_THREADS 32
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define BLOCK_PAIR 128
#define MAX_SHARED_TYPES 11
#define BLOCK_NBOR_BUILD 128
#define BLOCK_BIO_PAIR 128
#define WARP_SIZE 32
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
// -------------------------------------------------------------------------
// GENERIC OPENCL DEFINITIONS
// -------------------------------------------------------------------------
#ifdef GENERIC_OCL
#define USE_OPENCL
#define fast_mul mul24
#define ARCH 0
#define DRIVER 0
#define MEM_THREADS 16
#define THREADS_PER_ATOM 1
#define THREADS_PER_CHARGE 1
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#define BLOCK_NBOR_BUILD 64
#define BLOCK_BIO_PAIR 64
#define WARP_SIZE 1
#define PPPM_BLOCK_1D 64
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_BIO_SHARED_TYPES 128
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
// -------------------------------------------------------------------------
// OPENCL Stuff for All Hardware
// -------------------------------------------------------------------------
#ifdef USE_OPENCL
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define GLOBAL_SIZE_X get_global_size(0)
#define THREAD_ID_Y get_local_id(1)
#define BLOCK_ID_Y get_group_id(1)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define ucl_inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define ucl_atan atan
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#ifdef NO_HARDWARE_TRANSCENDENTALS
#define ucl_exp exp
#define ucl_powr powr
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#else
#define ucl_exp native_exp
#define ucl_powr native_powr
#define ucl_rsqrt native_rsqrt
#define ucl_sqrt native_sqrt
#define ucl_recip native_recip
#endif
#endif
// -------------------------------------------------------------------------
// ARCHITECTURE INDEPENDENT DEFINITIONS
// -------------------------------------------------------------------------
#define PPPM_MAX_SPLINE 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define EWALD_F (numtyp)1.12837917
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
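// The top two bits of a packed neighbor index hold the special-bond type:
// j & NEIGHMASK recovers the atom index and sbmask(j) selects the
// 1-2/1-3/1-4 scaling factor (sp_lj[0] for plain neighbors)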
ucl_inline int sbmask(int j) { return j >> SBBITS & 3; }

310
lib/gpu/lal_re_squared.cpp Normal file
View File

@ -0,0 +1,310 @@
/***************************************************************************
re_squared.cpp
-------------------
W. Michael Brown
Host code for RE-Squared potential acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Fri May 06 2011
email : brownw@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "re_squared_cl.h"
#include "re_squared_lj_cl.h"
#else
#include "re_squared_ptx.h"
#include "re_squared_lj_ptx.h"
#endif
#include "lal_re_squared.h"
#include <cassert>
using namespace LAMMPS_AL;
#define RESquaredT RESquared<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
RESquaredT::RESquared() : BaseEllipsoid<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
RESquaredT::~RESquared() {
clear();
}
template <class numtyp, class acctyp>
int RESquaredT::bytes_per_atom(const int max_nbors) const {
// Delegate explicitly to the base class (assumed to provide this method);
// an unqualified this->bytes_per_atom() would recurse into this wrapper
return BaseEllipsoid<numtyp,acctyp>::bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
double **host_cutsq, double **host_sigma,
double **host_epsilon, int **h_form, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, const double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_base(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ntypes,h_form,re_squared,re_squared_lj,true);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
_shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->block_size()>=max_shared_types) {
lj_types=max_shared_types;
_shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for copying type data
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
host_sigma,host_epsilon);
this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write,
host_cutsq,h_form);
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq,h_form);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
dev_error.alloc(1,*(this->ucl_device));
dev_error.zero();
// Allocate, cast and asynchronous memcpy of constant data
// Copy data for bonded interactions
special_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
host_write[0]=static_cast<numtyp>(host_special_lj[0]);
host_write[1]=static_cast<numtyp>(host_special_lj[1]);
host_write[2]=static_cast<numtyp>(host_special_lj[2]);
host_write[3]=static_cast<numtyp>(host_special_lj[3]);
ucl_copy(special_lj,host_write,4,false);
// Copy shape, well, sigma, epsilon, and cutsq onto GPU
// - cast if necessary
shape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
host_write[i*4]=host_shape[i][0];
host_write[i*4+1]=host_shape[i][1];
host_write[i*4+2]=host_shape[i][2];
}
UCL_H_Vec<numtyp4> view4;
view4.view((numtyp4*)host_write.begin(),shape.numel(),*(this->ucl_device));
ucl_copy(shape,view4,false);
well.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
host_write[i*4]=host_well[i][0];
host_write[i*4+1]=host_well[i][1];
host_write[i*4+2]=host_well[i][2];
}
view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device));
ucl_copy(well,view4,false);
_allocated=true;
this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+
lj1.row_bytes()+lj3.row_bytes()+special_lj.row_bytes()+
shape.row_bytes()+well.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void RESquaredT::clear() {
if (!_allocated)
return;
UCL_H_Vec<int> err_flag(1,*(this->ucl_device));
ucl_copy(err_flag,dev_error,false);
if (err_flag[0] == 2)
std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
err_flag.clear();
_allocated=false;
dev_error.clear();
lj1.clear();
lj3.clear();
sigma_epsilon.clear();
this->cut_form.clear();
shape.clear();
well.clear();
special_lj.clear();
this->clear_base();
}
template <class numtyp, class acctyp>
double RESquaredT::host_memory_usage() const {
return this->host_memory_usage_base()+sizeof(RESquaredT)+
4*sizeof(numtyp);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void RESquaredT::loop(const bool _eflag, const bool _vflag) {
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=0, NGX;
int stride=this->nbor->nbor_pitch();
int ainum=this->ans->inum();
if (this->_multiple_forms) {
if (this->_last_ellipse>0) {
// ------------ ELLIPSE_ELLIPSE ---------------
this->time_nbor1.start();
GX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/
(BX/this->_threads_per_atom)));
NGX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/BX));
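// e.g. BX=128 with _threads_per_atom=4 assigns 32 atoms per block to the
// force kernel, while the pack kernel below (sized by NGX) uses one
// thread per atom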
this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_ELLIPSE,
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->special_lj.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
&this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
&this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
&this->_threads_per_atom);
this->time_ellipsoid.stop();
// ------------ ELLIPSE_SPHERE ---------------
this->time_nbor2.start();
this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE,
ELLIPSE_SPHERE,_shared_types,_lj_types);
this->time_nbor2.stop();
this->time_ellipsoid2.start();
this->k_ellipsoid_sphere.set_size(GX,BX);
this->k_ellipsoid_sphere.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->special_lj.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
&this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
&this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
&this->_threads_per_atom);
this->time_ellipsoid2.stop();
if (this->_last_ellipse==this->ans->inum()) {
this->time_nbor3.zero();
this->time_ellipsoid3.zero();
this->time_lj.zero();
return;
}
// ------------ SPHERE_ELLIPSE ---------------
this->time_nbor3.start();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
this->_last_ellipse)/
(BX/this->_threads_per_atom)));
NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
this->_last_ellipse)/BX));
this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(),
SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor3.stop();
this->time_ellipsoid3.start();
this->k_sphere_ellipsoid.set_size(GX,BX);
this->k_sphere_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(),
&this->well.begin(), &this->special_lj.begin(),
&this->sigma_epsilon.begin(), &this->_lj_types,
&this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
this->time_ellipsoid3.stop();
} else {
this->ans->dev_ans.zero();
this->ans->dev_engv.zero();
this->time_nbor1.zero();
this->time_ellipsoid.zero();
this->time_nbor2.zero();
this->time_ellipsoid2.zero();
this->time_nbor3.zero();
this->time_ellipsoid3.zero();
}
// ------------ LJ ---------------
this->time_lj.start();
if (this->_last_ellipse<this->ans->inum()) {
if (this->_shared_types) {
this->k_lj_fast.set_size(GX,BX);
this->k_lj_fast.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
&this->lj3.begin(), &this->special_lj.begin(), &stride,
&this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(),
&eflag, &vflag, &this->_last_ellipse, &ainum,
&this->_threads_per_atom);
} else {
this->k_lj.set_size(GX,BX);
this->k_lj.run(&this->atom->dev_x.begin(), &this->lj1.begin(),
&this->lj3.begin(), &this->_lj_types, &this->special_lj.begin(),
&stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
&vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
}
}
this->time_lj.stop();
} else {
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
this->time_nbor1.start();
this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE,
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);
this->k_ellipsoid.run(&this->atom->dev_x.begin(),
&this->atom->dev_quat.begin(), &this->shape.begin(), &this->well.begin(),
&this->special_lj.begin(), &this->sigma_epsilon.begin(),
&this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
&this->ans->dev_ans.begin(), &ainum, &this->ans->dev_engv.begin(),
&this->dev_error.begin(), &eflag, &vflag, &ainum,
&this->_threads_per_atom);
this->time_ellipsoid.stop();
}
}
template class RESquared<PRECISION,ACC_PRECISION>;

452
lib/gpu/lal_re_squared.cu Normal file
View File

@ -0,0 +1,452 @@
// **************************************************************************
// re_squared.cu
// -------------------
// W. Michael Brown
//
// Device code for RE-Squared potential acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin : Fri May 06 2011
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_ellipsoid_extra.h"
#endif
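// det_prime(m,m2) is the directional derivative of the 3x3 determinant:
// d/dt det(m + t*m2) at t=0, expanded term by term below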
ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
{
numtyp ans;
ans = m2[0]*m[4]*m[8] - m2[0]*m[5]*m[7] -
m[3]*m2[1]*m[8] + m[3]*m2[2]*m[7] +
m[6]*m2[1]*m[5] - m[6]*m2[2]*m[4] +
m[0]*m2[4]*m[8] - m[0]*m2[5]*m[7] -
m2[3]*m[1]*m[8] + m2[3]*m[2]*m[7] +
m[6]*m[1]*m2[5] - m[6]*m[2]*m2[4] +
m[0]*m[4]*m2[8] - m[0]*m[5]*m2[7] -
m[3]*m[1]*m2[8] + m[3]*m[2]*m2[7] +
m2[6]*m[1]*m[5] - m2[6]*m[2]*m[4];
return ans;
}
__kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *splj, __global numtyp2* sig_eps,
const int ntypes, __global int *dev_nbor,
const int stride, __global acctyp4 *ans,
const int astride, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int inum,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=splj[0];
sp_lj[1]=splj[1];
sp_lj[2]=splj[2];
sp_lj[3]=splj[3];
__local numtyp b_alpha, cr60;
b_alpha=(numtyp)45.0/(numtyp)56.0;
cr60=ucl_cbrt((numtyp)60.0);
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *nbor_end;
int i, numj, n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a1[9]; // Rotation matrix (lab->body)
numtyp aTe1[9]; // A'*E
numtyp gamma1[9]; // A'*S^2*A
numtyp sa1[9]; // S^2*A;
numtyp lA1_0[9], lA1_1[9], lA1_2[9]; // -A*rotation generator (x,y, or z)
numtyp lAtwo1_0[9], lAtwo1_1[9], lAtwo1_2[9]; // A'*S^2*lA
numtyp lAsa1_0[9], lAsa1_1[9], lAsa1_2[9]; // lAtwo+lA'*sa
numtyp4 ishape;
ishape=shape[itype];
numtyp4 ishape2;
ishape2.x=ishape.x*ishape.x;
ishape2.y=ishape.y*ishape.y;
ishape2.z=ishape.z*ishape.z;
numtyp ilshape = ishape.x*ishape.y*ishape.z;
{
numtyp aTs[9]; // A1'*S1^2
gpu_quat_to_mat_trans(q,i,a1);
gpu_transpose_times_diag3(a1,well[itype],aTe1);
gpu_transpose_times_diag3(a1,ishape2,aTs);
gpu_diag_times3(ishape2,a1,sa1);
gpu_times3(aTs,a1,gamma1);
gpu_rotation_generator_x(a1,lA1_0);
gpu_rotation_generator_y(a1,lA1_1);
gpu_rotation_generator_z(a1,lA1_2);
gpu_times3(aTs,lA1_0,lAtwo1_0);
gpu_transpose_times3(lA1_0,sa1,lAsa1_0);
gpu_plus3(lAsa1_0,lAtwo1_0,lAsa1_0);
gpu_times3(aTs,lA1_1,lAtwo1_1);
gpu_transpose_times3(lA1_1,sa1,lAsa1_1);
gpu_plus3(lAsa1_1,lAtwo1_1,lAsa1_1);
gpu_times3(aTs,lA1_2,lAtwo1_2);
gpu_transpose_times3(lA1_2,sa1,lAsa1_2);
gpu_plus3(lAsa1_2,lAtwo1_2,lAsa1_2);
}
ishape2.x=ucl_recip(ishape2.x);
ishape2.y=ucl_recip(ishape2.y);
ishape2.z=ucl_recip(ishape2.z);
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r[3], rhat[3];
numtyp rnorm;
r[0] = jx.x-ix.x;
r[1] = jx.y-ix.y;
r[2] = jx.z-ix.z;
rnorm = gpu_dot3(r,r);
rnorm = ucl_rsqrt(rnorm);
rhat[0] = r[0]*rnorm;
rhat[1] = r[1]*rnorm;
rhat[2] = r[2]*rnorm;
numtyp a2[9]; // Rotation matrix (lab->body)
numtyp gamma2[9]; // A'*S^2*A
numtyp4 jshape;
jshape=shape[jtype];
numtyp4 jshape2;
jshape2.x=jshape.x*jshape.x;
jshape2.y=jshape.y*jshape.y;
jshape2.z=jshape.z*jshape.z;
{
numtyp aTs[9]; // A1'*S1^2
gpu_quat_to_mat_trans(q,j,a2);
gpu_transpose_times_diag3(a2,jshape2,aTs);
gpu_times3(aTs,a2,gamma2);
}
numtyp temp[9], s[3], z1[3], z2[3], v1[3], v2[3];
numtyp sigma12, sigma1, sigma2;
gpu_plus3(gamma1,gamma2,temp);
gpu_mldivide3(temp,rhat,s,err_flag);
sigma12 = ucl_rsqrt((numtyp)0.5*gpu_dot3(s,rhat));
gpu_times_column3(a1,rhat,z1);
gpu_times_column3(a2,rhat,z2);
v1[0] = z1[0]*ishape2.x;
v1[1] = z1[1]*ishape2.y;
v1[2] = z1[2]*ishape2.z;
v2[0] = z2[0]/jshape2.x;
v2[1] = z2[1]/jshape2.y;
v2[2] = z2[2]/jshape2.z;
sigma1 = ucl_sqrt(gpu_dot3(z1,v1));
sigma2 = ucl_sqrt(gpu_dot3(z2,v2));
numtyp H12[9];
numtyp dH;
H12[0] = gamma1[0]*sigma1+gamma2[0]*sigma2;
H12[1] = gamma1[1]*sigma1+gamma2[1]*sigma2;
H12[2] = gamma1[2]*sigma1+gamma2[2]*sigma2;
H12[3] = gamma1[3]*sigma1+gamma2[3]*sigma2;
H12[4] = gamma1[4]*sigma1+gamma2[4]*sigma2;
H12[5] = gamma1[5]*sigma1+gamma2[5]*sigma2;
H12[6] = gamma1[6]*sigma1+gamma2[6]*sigma2;
H12[7] = gamma1[7]*sigma1+gamma2[7]*sigma2;
H12[8] = gamma1[8]*sigma1+gamma2[8]*sigma2;
dH=gpu_det3(H12);
numtyp sigma1p2, sigma2p2, lambda, nu;
sigma1p2 = sigma1*sigma1;
sigma2p2 = sigma2*sigma2;
numtyp jlshape = jshape.x*jshape.y*jshape.z;
lambda = ilshape*sigma1p2 + jlshape*sigma2p2;
sigma1=ucl_recip(sigma1);
sigma2=ucl_recip(sigma2);
nu = ucl_rsqrt((sigma1+sigma2)/dH);
gpu_times3(aTe1,a1,temp);
numtyp sigma, epsilon;
int mtype=fast_mul(ntypes,itype)+jtype;
sigma = sig_eps[mtype].x;
epsilon = sig_eps[mtype].y*factor_lj;
numtyp w[3], temp2[9];
numtyp h12,eta,chi,sprod,sigh,tprod;
numtyp aTe2[9]; // A'*E
gpu_transpose_times_diag3(a2,well[jtype],aTe2);
gpu_times3(aTe2,a2,temp2);
gpu_plus3(temp,temp2,temp);
gpu_mldivide3(temp,rhat,w,err_flag);
h12 = ucl_recip(rnorm)-sigma12;
eta = lambda/nu;
chi = (numtyp)2.0*gpu_dot3(rhat,w);
sprod = ilshape * jlshape;
sigh = sigma/h12;
tprod = eta*chi*sigh;
numtyp stemp, Ua;
stemp = h12*(numtyp)0.5;
Ua = (ishape.x+stemp)*(ishape.y+stemp)*
(ishape.z+stemp)*(jshape.x+stemp)*
(jshape.y+stemp)*(jshape.z+stemp);
Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*sprod/Ua;
Ua = epsilon*Ua/(numtyp)-36.0;
numtyp Ur;
stemp = h12/cr60;
Ur = (ishape.x+stemp)*(ishape.y+stemp)*
(ishape.z+stemp)*(jshape.x+stemp)*
(jshape.y+stemp)*(jshape.z+stemp);
Ur = ((numtyp)1.0+b_alpha*tprod)*sprod/Ur;
numtyp sigh6=sigh*sigh*sigh;
sigh6*=sigh6;
Ur = epsilon*Ur*sigh6/(numtyp)2025.0;
energy+=Ua+Ur;
// force
numtyp vsigma1[3], vsigma2[3], gsigma1[9], gsigma2[9];
numtyp sec, sigma12p3, sigma1p3, sigma2p3;
sec = sigma*eta*chi;
sigma12p3 = sigma12*sigma12*sigma12;
sigma1p3 = sigma1/sigma1p2;
sigma2p3 = sigma2/sigma2p2;
vsigma1[0] = -sigma1p3*v1[0];
vsigma1[1] = -sigma1p3*v1[1];
vsigma1[2] = -sigma1p3*v1[2];
vsigma2[0] = -sigma2p3*v2[0];
vsigma2[1] = -sigma2p3*v2[1];
vsigma2[2] = -sigma2p3*v2[2];
gsigma1[0] = -gamma1[0]*sigma1p2;
gsigma1[1] = -gamma1[1]*sigma1p2;
gsigma1[2] = -gamma1[2]*sigma1p2;
gsigma1[3] = -gamma1[3]*sigma1p2;
gsigma1[4] = -gamma1[4]*sigma1p2;
gsigma1[5] = -gamma1[5]*sigma1p2;
gsigma1[6] = -gamma1[6]*sigma1p2;
gsigma1[7] = -gamma1[7]*sigma1p2;
gsigma1[8] = -gamma1[8]*sigma1p2;
gsigma2[0] = -gamma2[0]*sigma2p2;
gsigma2[1] = -gamma2[1]*sigma2p2;
gsigma2[2] = -gamma2[2]*sigma2p2;
gsigma2[3] = -gamma2[3]*sigma2p2;
gsigma2[4] = -gamma2[4]*sigma2p2;
gsigma2[5] = -gamma2[5]*sigma2p2;
gsigma2[6] = -gamma2[6]*sigma2p2;
gsigma2[7] = -gamma2[7]*sigma2p2;
gsigma2[8] = -gamma2[8]*sigma2p2;
numtyp tsig1sig2, tdH, teta1, teta2;
numtyp fourw[3], spr[3];
tsig1sig2 = eta/((numtyp)2.0*(sigma1+sigma2));
tdH = eta/((numtyp)2.0*dH);
teta1 = (numtyp)2.0*eta/lambda;
teta2 = teta1*jlshape/sigma2p3;
teta1 = teta1*ilshape/sigma1p3;
fourw[0] = (numtyp)4.0*w[0];
fourw[1] = (numtyp)4.0*w[1];
fourw[2] = (numtyp)4.0*w[2];
spr[0] = (numtyp)0.5*sigma12p3*s[0];
spr[1] = (numtyp)0.5*sigma12p3*s[1];
spr[2] = (numtyp)0.5*sigma12p3*s[2];
numtyp hsec, dspu, pbsu;
stemp = ucl_recip(ishape.x*(numtyp)2.0+h12)+
ucl_recip(ishape.y*(numtyp)2.0+h12)+
ucl_recip(ishape.z*(numtyp)2.0+h12)+
ucl_recip(jshape.x*(numtyp)2.0+h12)+
ucl_recip(jshape.y*(numtyp)2.0+h12)+
ucl_recip(jshape.z*(numtyp)2.0+h12);
hsec = ucl_recip(h12+(numtyp)3.0*sec);
dspu = ucl_recip(h12)-hsec+stemp;
pbsu = (numtyp)3.0*sigma*hsec;
numtyp dspr, pbsr;
stemp = ucl_recip(ishape.x*cr60+h12)+
ucl_recip(ishape.y*cr60+h12)+
ucl_recip(ishape.z*cr60+h12)+
ucl_recip(jshape.x*cr60+h12)+
ucl_recip(jshape.y*cr60+h12)+
ucl_recip(jshape.z*cr60+h12);
hsec = ucl_recip(h12+b_alpha*sec);
dspr = (numtyp)7.0/h12-hsec+stemp;
pbsr = b_alpha*sigma*hsec;
numtyp dH12[9];
numtyp dUa, dUr, deta, dchi, ddH, dh12;
numtyp dsigma1, dsigma2;
#pragma unroll
for (int i=0; i<3; i++) {
numtyp u[3], u1[3], u2[3];
u[0] = -rhat[i]*rhat[0];
u[1] = -rhat[i]*rhat[1];
u[2] = -rhat[i]*rhat[2];
u[i] += (numtyp)1.0;
u[0] *= rnorm;
u[1] *= rnorm;
u[2] *= rnorm;
gpu_times_column3(a1,u,u1);
gpu_times_column3(a2,u,u2);
dsigma1=gpu_dot3(u1,vsigma1);
dsigma2=gpu_dot3(u2,vsigma2);
dH12[0] = dsigma1*gsigma1[0]+dsigma2*gsigma2[0];
dH12[1] = dsigma1*gsigma1[1]+dsigma2*gsigma2[1];
dH12[2] = dsigma1*gsigma1[2]+dsigma2*gsigma2[2];
dH12[3] = dsigma1*gsigma1[3]+dsigma2*gsigma2[3];
dH12[4] = dsigma1*gsigma1[4]+dsigma2*gsigma2[4];
dH12[5] = dsigma1*gsigma1[5]+dsigma2*gsigma2[5];
dH12[6] = dsigma1*gsigma1[6]+dsigma2*gsigma2[6];
dH12[7] = dsigma1*gsigma1[7]+dsigma2*gsigma2[7];
dH12[8] = dsigma1*gsigma1[8]+dsigma2*gsigma2[8];
ddH = det_prime(H12,dH12);
deta = (dsigma1+dsigma2)*tsig1sig2;
deta -= ddH*tdH;
deta -= dsigma1*teta1+dsigma2*teta2;
dchi = gpu_dot3(u,fourw);
dh12 = rhat[i]+gpu_dot3(u,spr);
dUa = pbsu*(eta*dchi+deta*chi)-dh12*dspu;
dUr = pbsr*(eta*dchi+deta*chi)-dh12*dspr;
numtyp force=dUr*Ur+dUa*Ua;
if (i==0) {
f.x+=force;
if (vflag>0)
virial[0]+=-r[0]*force;
} else if (i==1) {
f.y+=force;
if (vflag>0) {
virial[1]+=-r[1]*force;
virial[3]+=-r[0]*force;
}
} else {
f.z+=force;
if (vflag>0) {
virial[2]+=-r[2]*force;
virial[4]+=-r[0]*force;
virial[5]+=-r[1]*force;
}
}
}
// torque on i
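// The three brace-delimited blocks below evaluate the same chain rule once
// per body axis: lA1_0, lA1_1, lA1_2 hold -A1 times the rotation generator
// about x, y, and z respectively, so each block yields one torque component.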
sigma1=ucl_recip(sigma1);
numtyp fwae[3], p[3];
gpu_row_times3(fourw,aTe1,fwae);
{
gpu_times_column3(lA1_0,rhat,p);
dsigma1 = gpu_dot3(p,vsigma1);
dH12[0] = lAsa1_0[0]*sigma1+dsigma1*gsigma1[0];
dH12[1] = lAsa1_0[1]*sigma1+dsigma1*gsigma1[1];
dH12[2] = lAsa1_0[2]*sigma1+dsigma1*gsigma1[2];
dH12[3] = lAsa1_0[3]*sigma1+dsigma1*gsigma1[3];
dH12[4] = lAsa1_0[4]*sigma1+dsigma1*gsigma1[4];
dH12[5] = lAsa1_0[5]*sigma1+dsigma1*gsigma1[5];
dH12[6] = lAsa1_0[6]*sigma1+dsigma1*gsigma1[6];
dH12[7] = lAsa1_0[7]*sigma1+dsigma1*gsigma1[7];
dH12[8] = lAsa1_0[8]*sigma1+dsigma1*gsigma1[8];
ddH = det_prime(H12,dH12);
deta = tsig1sig2*dsigma1-tdH*ddH;
deta -= teta1*dsigma1;
numtyp tempv[3];
gpu_times_column3(lA1_0,w,tempv);
dchi = -gpu_dot3(fwae,tempv);
gpu_times_column3(lAtwo1_0,spr,tempv);
dh12 = -gpu_dot3(s,tempv);
dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu;
dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr;
tor.x -= (dUa*Ua+dUr*Ur);
}
{
gpu_times_column3(lA1_1,rhat,p);
dsigma1 = gpu_dot3(p,vsigma1);
dH12[0] = lAsa1_1[0]*sigma1+dsigma1*gsigma1[0];
dH12[1] = lAsa1_1[1]*sigma1+dsigma1*gsigma1[1];
dH12[2] = lAsa1_1[2]*sigma1+dsigma1*gsigma1[2];
dH12[3] = lAsa1_1[3]*sigma1+dsigma1*gsigma1[3];
dH12[4] = lAsa1_1[4]*sigma1+dsigma1*gsigma1[4];
dH12[5] = lAsa1_1[5]*sigma1+dsigma1*gsigma1[5];
dH12[6] = lAsa1_1[6]*sigma1+dsigma1*gsigma1[6];
dH12[7] = lAsa1_1[7]*sigma1+dsigma1*gsigma1[7];
dH12[8] = lAsa1_1[8]*sigma1+dsigma1*gsigma1[8];
ddH = det_prime(H12,dH12);
deta = tsig1sig2*dsigma1-tdH*ddH;
deta -= teta1*dsigma1;
numtyp tempv[3];
gpu_times_column3(lA1_1,w,tempv);
dchi = -gpu_dot3(fwae,tempv);
gpu_times_column3(lAtwo1_1,spr,tempv);
dh12 = -gpu_dot3(s,tempv);
dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu;
dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr;
tor.y -= (dUa*Ua+dUr*Ur);
}
{
gpu_times_column3(lA1_2,rhat,p);
dsigma1 = gpu_dot3(p,vsigma1);
dH12[0] = lAsa1_2[0]*sigma1+dsigma1*gsigma1[0];
dH12[1] = lAsa1_2[1]*sigma1+dsigma1*gsigma1[1];
dH12[2] = lAsa1_2[2]*sigma1+dsigma1*gsigma1[2];
dH12[3] = lAsa1_2[3]*sigma1+dsigma1*gsigma1[3];
dH12[4] = lAsa1_2[4]*sigma1+dsigma1*gsigma1[4];
dH12[5] = lAsa1_2[5]*sigma1+dsigma1*gsigma1[5];
dH12[6] = lAsa1_2[6]*sigma1+dsigma1*gsigma1[6];
dH12[7] = lAsa1_2[7]*sigma1+dsigma1*gsigma1[7];
dH12[8] = lAsa1_2[8]*sigma1+dsigma1*gsigma1[8];
ddH = det_prime(H12,dH12);
deta = tsig1sig2*dsigma1-tdH*ddH;
deta -= teta1*dsigma1;
numtyp tempv[3];
gpu_times_column3(lA1_2,w,tempv);
dchi = -gpu_dot3(fwae,tempv);
gpu_times_column3(lAtwo1_2,spr,tempv);
dh12 = -gpu_dot3(s,tempv);
dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu;
dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr;
tor.z -= (dUa*Ua+dUr*Ur);
}
} // for nbor
store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
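For reference, the virial accumulation in the unrolled force loop above follows the six-component convention used throughout these kernels. Written out as a sketch, with \(F_\beta\) the pair-force component computed in loop iteration \(i=\beta\):

\[
W_{\alpha\beta} \mathrel{+}= -\,r_\alpha F_\beta, \qquad
(\texttt{virial[0]},\ldots,\texttt{virial[5]}) = (W_{xx},\,W_{yy},\,W_{zz},\,W_{xy},\,W_{xz},\,W_{yz}).
\]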

90
lib/gpu/lal_re_squared.h Normal file
View File

@ -0,0 +1,90 @@
/***************************************************************************
re_squared.h
-------------------
W. Michael Brown (ORNL)
Host code for RE-Squared potential acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : Fri May 06 2011
email : brownw@ornl.gov
***************************************************************************/
#ifndef RE_SQUARED_H
#define RE_SQUARED_H
#include "lal_base_ellipsoid.h"
#include "mpi.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class RESquared : public BaseEllipsoid<numtyp, acctyp> {
public:
RESquared();
~RESquared();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \return 0 if successful, or a negative code on insufficient memory or a device initialization problem
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_shape, double **host_well,
double **host_cutsq, double **host_sigma, double **host_epsilon,
int **h_form, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4, double **host_offset,
const double *host_special_lj, const int nlocal, const int nall,
const int max_nbors, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
/// Device Error Flag - Set if a bad matrix inversion occurs
UCL_D_Vec<int> dev_error;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon
UCL_D_Vec<numtyp2> sigma_epsilon;
/// special lj 0-4
UCL_D_Vec<numtyp> special_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool _shared_types;
int _lj_types;
// --------------------------- ATOM DATA --------------------------
/// Aspherical Const Data for Atoms
UCL_D_Vec<numtyp4> shape, well;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif
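A minimal host-side sketch of the lifecycle documented in this header, assuming a single-precision build and caller-owned coefficient tables (every name below is a placeholder, not part of the library):

  LAMMPS_AL::RESquared<float,float> re2;
  int err = re2.init(ntypes, host_shape, host_well, host_cutsq, host_sigma,
                     host_epsilon, h_form, host_lj1, host_lj2, host_lj3,
                     host_lj4, host_offset, host_special_lj, nlocal, nall,
                     max_nbors, maxspecial, cell_size, gpu_split, screen);
  if (err != 0) {
    // handle one of the negative codes enumerated in the init() comment
  }
  // per-timestep work goes through the wrapper layer (see the _ext file below)
  re2.clear();   // release host and device data before shutdown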

138
lib/gpu/lal_re_squared_ext.cpp Normal file
View File

@ -0,0 +1,138 @@
/***************************************************************************
re_squared_ext.cpp
-------------------
W. Michael Brown
LAMMPS Wrappers for RE-Squared Acceleration
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_re_squared.h"
using namespace std;
using namespace LAMMPS_AL;
static RESquared<PRECISION,ACC_PRECISION> REMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq,
double **sigma, double **epsilon,
int **form, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4, double **offset,
double *special_lj, const int inum, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
REMF.clear();
gpu_mode=REMF.device->gpu_mode();
double gpu_split=REMF.device->particle_split();
int first_gpu=REMF.device->first_device();
int last_gpu=REMF.device->last_device();
int world_me=REMF.device->world_me();
int gpu_rank=REMF.device->gpu_rank();
int procs_per_gpu=REMF.device->procs_per_gpu();
REMF.device->init_message(screen,"resquared",first_gpu,last_gpu);
bool message=false;
if (REMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
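// World rank 0 initializes first so the kernels are compiled once; the
// remaining ranks initialize in the barrier-separated loop below, one
// pass per process sharing a GPU.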
if (world_me==0)
init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, epsilon,
form, host_lj1, host_lj2, host_lj3, host_lj4, offset,
special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen);
REMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, epsilon,
form, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall,
max_nbors, maxspecial, cell_size, gpu_split, screen);
REMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
REMF.estimate_gpu_overhead();
return init_ok;
}
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
void re_gpu_clear() {
REMF.clear();
}
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double **host_quat);
int** re_gpu_compute_n(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial, int **special,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, int **ilist,
int **jnum, const double cpu_time, bool &success,
double **host_quat) {
return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
tag, nspecial, special, eflag, vflag, eatom, vatom,
host_start, ilist, jnum, cpu_time, success, host_quat);
}
int * re_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double **host_quat) {
return REMF.compute(ago, inum_full, nall, host_x, host_type, ilist,
numj, firstneigh, eflag, vflag, eatom, vatom, host_start,
cpu_time, success, host_quat);
}
// ---------------------------------------------------------------------------
// Return memory usage
// ---------------------------------------------------------------------------
double re_gpu_bytes() {
return REMF.host_memory_usage();
}
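A note on the two compute entry points above: re_gpu_compute_n appears to be the path used when neighbor lists are built on the device (it receives positions, sub-domain bounds, tags, and special-bond tables), while re_gpu_compute reuses host-built lists (ilist/numj/firstneigh). A hedged sketch of the dispatch a pair style might perform; gpu_nbor and the local names are placeholders:

  if (gpu_nbor)   // device-built neighbor lists
    firstneigh = re_gpu_compute_n(ago, inum, nall, x, type, sublo, subhi,
                                  tag, nspecial, special, eflag, vflag,
                                  eatom, vatom, host_start, &ilist, &numneigh,
                                  cpu_time, success, quat);
  else            // host-built neighbor lists
    re_gpu_compute(ago, inum, nall, x, type, ilist, numj, firstneigh,
                   eflag, vflag, eatom, vatom, host_start, cpu_time,
                   success, quat);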

696
lib/gpu/lal_re_squared_lj.cu Normal file
View File

@ -0,0 +1,696 @@
// **************************************************************************
// re_squared_lj.cu
// -------------------
// W. Michael Brown
//
// Device code for RE-Squared - Lennard-Jones potential acceleration
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin : Fri May 06 2011
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_ellipsoid_extra.h"
#endif
__kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *splj, __global numtyp2* sig_eps,
const int ntypes, __global int *dev_nbor, const int stride,
__global acctyp4 *ans, const int astride,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int inum,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=splj[0];
sp_lj[1]=splj[1];
sp_lj[2]=splj[2];
sp_lj[3]=splj[3];
__local numtyp b_alpha, cr60, solv_f_a, solv_f_r;
b_alpha=(numtyp)45.0/(numtyp)56.0;
cr60=ucl_cbrt((numtyp)60.0);
solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0);
solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0);
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *nbor_end;
int i, numj, n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a[9]; // Rotation matrix (lab->body)
numtyp aTe[9]; // A'*E
numtyp lA_0[9], lA_1[9], lA_2[9]; // -A*rotation generator (x,y, or z)
numtyp4 ishape;
ishape=shape[itype];
numtyp ilshape=ishape.x*ishape.y*ishape.z;
{
gpu_quat_to_mat_trans(q,i,a);
gpu_transpose_times_diag3(a,well[itype],aTe);
gpu_rotation_generator_x(a,lA_0);
gpu_rotation_generator_y(a,lA_1);
gpu_rotation_generator_z(a,lA_2);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r[3], rhat[3];
numtyp rnorm;
r[0] = jx.x-ix.x;
r[1] = jx.y-ix.y;
r[2] = jx.z-ix.z;
rnorm = gpu_dot3(r,r);
rnorm = ucl_rsqrt(rnorm);
rhat[0] = r[0]*rnorm;
rhat[1] = r[1]*rnorm;
rhat[2] = r[2]*rnorm;
numtyp sigma, epsilon;
int mtype=fast_mul(ntypes,itype)+jtype;
sigma = sig_eps[mtype].x;
epsilon = sig_eps[mtype].y*factor_lj;
numtyp aTs[9];
numtyp4 scorrect;
numtyp half_sigma=sigma*(numtyp)0.5;
scorrect.x = ishape.x+half_sigma;
scorrect.y = ishape.y+half_sigma;
scorrect.z = ishape.z+half_sigma;
scorrect.x = scorrect.x * scorrect.x * (numtyp)0.5;
scorrect.y = scorrect.y * scorrect.y * (numtyp)0.5;
scorrect.z = scorrect.z * scorrect.z * (numtyp)0.5;
gpu_transpose_times_diag3(a,scorrect,aTs);
// energy
numtyp gamma[9], s[3];
gpu_times3(aTs,a,gamma);
gpu_mldivide3(gamma,rhat,s,err_flag);
numtyp sigma12 = ucl_rsqrt((numtyp)0.5*gpu_dot3(s,rhat));
numtyp temp[9], w[3];
gpu_times3(aTe,a,temp);
temp[0] += (numtyp)1.0;
temp[4] += (numtyp)1.0;
temp[8] += (numtyp)1.0;
gpu_mldivide3(temp,rhat,w,err_flag);
numtyp h12 = ucl_recip(rnorm)-sigma12;
numtyp chi = (numtyp)2.0*gpu_dot3(rhat,w);
numtyp sigh = sigma/h12;
numtyp tprod = chi*sigh;
numtyp Ua, Ur;
numtyp h12p3 = h12*h12*h12;
numtyp sigmap3 = sigma*sigma*sigma;
numtyp stemp = h12*(numtyp)0.5;
Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0;
Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua;
Ua = epsilon*Ua*sigmap3*solv_f_a;
stemp = h12/cr60;
Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/
(numtyp)60.0;
Ur = ((numtyp)1.0+b_alpha*tprod)*ilshape/Ur;
numtyp sigh6=sigh*sigh*sigh;
sigh6*=sigh6;
Ur = epsilon*Ur*sigmap3*sigh6*solv_f_r;
energy+=Ua+Ur;
// force
numtyp fourw[3], spr[3];
numtyp sec = sigma*chi;
numtyp sigma12p3 = sigma12*sigma12*sigma12;
fourw[0] = (numtyp)4.0*w[0];
fourw[1] = (numtyp)4.0*w[1];
fourw[2] = (numtyp)4.0*w[2];
spr[0] = (numtyp)0.5*sigma12p3*s[0];
spr[1] = (numtyp)0.5*sigma12p3*s[1];
spr[2] = (numtyp)0.5*sigma12p3*s[2];
stemp = ucl_recip(ishape.x*(numtyp)2.0+h12)+
ucl_recip(ishape.y*(numtyp)2.0+h12)+
ucl_recip(ishape.z*(numtyp)2.0+h12)+
(numtyp)3.0/h12;
numtyp hsec = ucl_recip(h12+(numtyp)3.0*sec);
numtyp dspu = ucl_recip(h12)-hsec+stemp;
numtyp pbsu = (numtyp)3.0*sigma*hsec;
stemp = ucl_recip(ishape.x*cr60+h12)+
ucl_recip(ishape.y*cr60+h12)+
ucl_recip(ishape.z*cr60+h12)+
(numtyp)3.0/h12;
hsec = ucl_recip(h12+b_alpha*sec);
numtyp dspr = (numtyp)7.0/h12-hsec+stemp;
numtyp pbsr = b_alpha*sigma*hsec;
#pragma unroll
for (int i=0; i<3; i++) {
numtyp u[3];
u[0] = -rhat[i]*rhat[0];
u[1] = -rhat[i]*rhat[1];
u[2] = -rhat[i]*rhat[2];
u[i] += (numtyp)1.0;
u[0] *= rnorm;
u[1] *= rnorm;
u[2] *= rnorm;
numtyp dchi = gpu_dot3(u,fourw);
numtyp dh12 = rhat[i]+gpu_dot3(u,spr);
numtyp dUa = pbsu*dchi-dh12*dspu;
numtyp dUr = pbsr*dchi-dh12*dspr;
numtyp force=dUr*Ur+dUa*Ua;
if (i==0) {
f.x+=force;
if (vflag>0)
virial[0]+=-r[0]*force;
} else if (i==1) {
f.y+=force;
if (vflag>0) {
virial[1]+=-r[1]*force;
virial[3]+=-r[0]*force;
}
} else {
f.z+=force;
if (vflag>0) {
virial[2]+=-r[2]*force;
virial[4]+=-r[0]*force;
virial[5]+=-r[1]*force;
}
}
}
// torque on i
numtyp fwae[3];
gpu_row_times3(fourw,aTe,fwae);
{
numtyp tempv[3], p[3], lAtwo[9];
gpu_times_column3(lA_0,rhat,p);
gpu_times_column3(lA_0,w,tempv);
numtyp dchi = -gpu_dot3(fwae,tempv);
gpu_times3(aTs,lA_0,lAtwo);
gpu_times_column3(lAtwo,spr,tempv);
numtyp dh12 = -gpu_dot3(s,tempv);
numtyp dUa = pbsu*dchi-dh12*dspu;
numtyp dUr = pbsr*dchi-dh12*dspr;
tor.x -= (dUa*Ua+dUr*Ur);
}
{
numtyp tempv[3], p[3], lAtwo[9];
gpu_times_column3(lA_1,rhat,p);
gpu_times_column3(lA_1,w,tempv);
numtyp dchi = -gpu_dot3(fwae,tempv);
gpu_times3(aTs,lA_1,lAtwo);
gpu_times_column3(lAtwo,spr,tempv);
numtyp dh12 = -gpu_dot3(s,tempv);
numtyp dUa = pbsu*dchi-dh12*dspu;
numtyp dUr = pbsr*dchi-dh12*dspr;
tor.y -= (dUa*Ua+dUr*Ur);
}
{
numtyp tempv[3], p[3], lAtwo[9];
gpu_times_column3(lA_2,rhat,p);
gpu_times_column3(lA_2,w,tempv);
numtyp dchi = -gpu_dot3(fwae,tempv);
gpu_times3(aTs,lA_2,lAtwo);
gpu_times_column3(lAtwo,spr,tempv);
numtyp dh12 = -gpu_dot3(s,tempv);
numtyp dUa = pbsu*dchi-dh12*dspu;
numtyp dUr = pbsr*dchi-dh12*dspr;
tor.z -= (dUa*Ua+dUr*Ur);
}
} // for nbor
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[7][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=tor.x;
red_acc[4][tid]=tor.y;
red_acc[5][tid]=tor.z;
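// Tree-reduce the six partials across the t_per_atom threads that share
// this atom: the active stride s halves each pass and threads with
// offset < s fold in their partner's values.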
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
tor.x=red_acc[3][tid];
tor.y=red_acc[4][tid];
tor.z=red_acc[5][tid];
if (eflag>0 || vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
red_acc[6][tid]=energy;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<7; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
energy=red_acc[6][tid];
}
}
// Store answers
if (offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=astride;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=astride;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
old=ans[ii+astride];
old.x+=tor.x;
old.y+=tor.y;
old.z+=tor.z;
ans[ii+astride]=old;
}
} // if ii
}
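/* The neighbor entries consumed by the kernels in this file pack a 2-bit
   special-bond code into the top bits of each index; a standalone sketch of
   the decode (mirroring the sbmask()/NEIGHMASK convention, with 0x3FFFFFFF
   keeping the low 30 index bits, as visible in the generated PTX):

     int j = *nbor;                       // packed neighbor entry
     factor_lj = sp_lj[(j >> 30) & 3];    // special-bond scale factor
     j &= 0x3FFFFFFF;                     // plain atom index remains     */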
__kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
__global numtyp4* shape,__global numtyp4* well,
__global numtyp *splj, __global numtyp2* sig_eps,
const int ntypes, __global int *dev_nbor,
const int stride, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag,const int start,
const int inum, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
__local numtyp sp_lj[4];
sp_lj[0]=splj[0];
sp_lj[1]=splj[1];
sp_lj[2]=splj[2];
sp_lj[3]=splj[3];
__local numtyp b_alpha, cr60, solv_f_a, solv_f_r;
b_alpha=(numtyp)45.0/(numtyp)56.0;
cr60=ucl_cbrt((numtyp)60.0);
solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0);
solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0);
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *nbor_end;
int j, numj, n_stride;
nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj,
n_stride,nbor_end,nbor);
numtyp4 jx=x_[j];
int jtype=jx.w;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int i=*nbor;
factor_lj = sp_lj[sbmask(i)];
i &= NEIGHMASK;
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a[9]; // Rotation matrix (lab->body)
numtyp aTe[9]; // A'*E
numtyp4 ishape;
ishape=shape[itype];
gpu_quat_to_mat_trans(q,i,a);
gpu_transpose_times_diag3(a,well[itype],aTe);
// Compute r12
numtyp r[3], rhat[3];
numtyp rnorm;
r[0] = ix.x-jx.x;
r[1] = ix.y-jx.y;
r[2] = ix.z-jx.z;
rnorm = gpu_dot3(r,r);
rnorm = ucl_rsqrt(rnorm);
rhat[0] = r[0]*rnorm;
rhat[1] = r[1]*rnorm;
rhat[2] = r[2]*rnorm;
numtyp sigma, epsilon;
int mtype=fast_mul(ntypes,itype)+jtype;
sigma = sig_eps[mtype].x;
epsilon = sig_eps[mtype].y*factor_lj;
numtyp aTs[9];
numtyp4 scorrect;
numtyp half_sigma=sigma * (numtyp)0.5;
scorrect.x = ishape.x+half_sigma;
scorrect.y = ishape.y+half_sigma;
scorrect.z = ishape.z+half_sigma;
scorrect.x = scorrect.x * scorrect.x * (numtyp)0.5;
scorrect.y = scorrect.y * scorrect.y * (numtyp)0.5;
scorrect.z = scorrect.z * scorrect.z * (numtyp)0.5;
gpu_transpose_times_diag3(a,scorrect,aTs);
// energy
numtyp gamma[9], s[3];
gpu_times3(aTs,a,gamma);
gpu_mldivide3(gamma,rhat,s,err_flag);
numtyp sigma12 = ucl_rsqrt((numtyp)0.5*gpu_dot3(s,rhat));
numtyp temp[9], w[3];
gpu_times3(aTe,a,temp);
temp[0] += (numtyp)1.0;
temp[4] += (numtyp)1.0;
temp[8] += (numtyp)1.0;
gpu_mldivide3(temp,rhat,w,err_flag);
numtyp h12 = ucl_recip(rnorm)-sigma12;
numtyp chi = (numtyp)2.0*gpu_dot3(rhat,w);
numtyp sigh = sigma/h12;
numtyp tprod = chi*sigh;
numtyp Ua, Ur;
numtyp h12p3 = h12*h12*h12;
numtyp sigmap3 = sigma*sigma*sigma;
numtyp stemp = h12/(numtyp)2.0;
Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0;
numtyp ilshape=ishape.x*ishape.y*ishape.z;
Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua;
Ua = epsilon*Ua*sigmap3*solv_f_a;
stemp = h12/cr60;
Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/
(numtyp)60.0;
Ur = ((numtyp)1.0+b_alpha*tprod)*ilshape/Ur;
numtyp sigh6=sigh*sigh*sigh;
sigh6*=sigh6;
Ur = epsilon*Ur*sigmap3*sigh6*solv_f_r;
energy+=Ua+Ur;
// force
numtyp fourw[3], spr[3];
numtyp sec = sigma*chi;
numtyp sigma12p3 = sigma12*sigma12*sigma12;
fourw[0] = (numtyp)4.0*w[0];
fourw[1] = (numtyp)4.0*w[1];
fourw[2] = (numtyp)4.0*w[2];
spr[0] = (numtyp)0.5*sigma12p3*s[0];
spr[1] = (numtyp)0.5*sigma12p3*s[1];
spr[2] = (numtyp)0.5*sigma12p3*s[2];
stemp = ucl_recip(ishape.x*(numtyp)2.0+h12)+
ucl_recip(ishape.y*(numtyp)2.0+h12)+
ucl_recip(ishape.z*(numtyp)2.0+h12)+
(numtyp)3.0/h12;
numtyp hsec = ucl_recip(h12+(numtyp)3.0*sec);
numtyp dspu = ucl_recip(h12)-hsec+stemp;
numtyp pbsu = (numtyp)3.0*sigma*hsec;
stemp = ucl_recip(ishape.x*cr60+h12)+
ucl_recip(ishape.y*cr60+h12)+
ucl_recip(ishape.z*cr60+h12)+
(numtyp)3.0/h12;
hsec = ucl_recip(h12+b_alpha*sec);
numtyp dspr = (numtyp)7.0/h12-hsec+stemp;
numtyp pbsr = b_alpha*sigma*hsec;
#pragma unroll
for (int i=0; i<3; i++) {
numtyp u[3];
u[0] = -rhat[i]*rhat[0];
u[1] = -rhat[i]*rhat[1];
u[2] = -rhat[i]*rhat[2];
u[i] += (numtyp)1.0;
u[0] *= rnorm;
u[1] *= rnorm;
u[2] *= rnorm;
numtyp dchi = gpu_dot3(u,fourw);
numtyp dh12 = rhat[i]+gpu_dot3(u,spr);
numtyp dUa = pbsu*dchi-dh12*dspu;
numtyp dUr = pbsr*dchi-dh12*dspr;
numtyp force=dUr*Ur+dUa*Ua;
if (i==0) {
f.x+=force;
if (vflag>0)
virial[0]+=-r[0]*force;
} else if (i==1) {
f.y+=force;
if (vflag>0) {
virial[1]+=-r[1]*force;
virial[3]+=-r[0]*force;
}
} else {
f.z+=force;
if (vflag>0) {
virial[2]+=-r[2]*force;
virial[4]+=-r[0]*force;
virial[5]+=-r[1]*force;
}
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
__local numtyp sp_lj[4];
sp_lj[0]=gum[0];
sp_lj[1]=gum[1];
sp_lj[2]=gum[2];
sp_lj[3]=gum[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int ii=itype*lj_types+jtype;
if (r2inv<lj1[ii].z && lj1[ii].w==SPHERE_SPHERE) {
r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*r6inv*(lj1[ii].x*r6inv-lj1[ii].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y);
energy+=factor_lj*(e-lj3[ii].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
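/* For reference, assuming the standard LAMMPS packing (lj1.x = 48*eps*sig^12,
   lj1.y = 24*eps*sig^6, lj3.x = 4*eps*sig^12, lj3.y = 4*eps*sig^6, and
   lj3.z = the energy shift at the cutoff), the loop above evaluates the
   12-6 Lennard-Jones force magnitude over r and the shifted energy:

     F/r = (1/r^2) * (lj1.x/r^12 - lj1.y/r^6)
     E   = lj3.x/r^12 - lj3.y/r^6 - lj3.z                                  */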
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
ii+=start;
__local numtyp sp_lj[4];
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
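// Stage the special-bond factors and the per-type-pair coefficient tables
// in shared memory once per block; the neighbor loop below then reads them
// with low latency.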
if (tid<4)
sp_lj[tid]=gum[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
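The kernels above rely on atom_info() for the thread decomposition, and the PTX that follows makes the mapping explicit (tid/t_per_atom selects the atom, tid % t_per_atom the slice of its neighbor list). A sketch of the equivalent CUDA-style arithmetic, offered as an illustration of the convention rather than the library's actual macro:

  /* t_per_atom consecutive threads cooperate on one atom; offset is the
     thread's rank in that group and its starting slot in the strided
     neighbor walk (nbor += n_stride). */
  tid    = threadIdx.x;
  ii     = blockIdx.x * (blockDim.x / t_per_atom) + tid / t_per_atom;
  offset = tid % t_per_atom;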

View File

@ -0,0 +1,979 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_0000bddd_00000000-9_lj96_cut_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.4Q2aYE)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_0000bddd_00000000-8_lj96_cut_gpu_kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lj96_cut_gpu_kernel.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
.global .texref pos_tex;
.entry kernel_pair (
.param .u64 __cudaparm_kernel_pair_x_,
.param .u64 __cudaparm_kernel_pair_lj1,
.param .u64 __cudaparm_kernel_pair_lj3,
.param .s32 __cudaparm_kernel_pair_lj_types,
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_dev_nbor,
.param .u64 __cudaparm_kernel_pair_dev_packed,
.param .u64 __cudaparm_kernel_pair_ans,
.param .u64 __cudaparm_kernel_pair_engv,
.param .s32 __cudaparm_kernel_pair_eflag,
.param .s32 __cudaparm_kernel_pair_vflag,
.param .s32 __cudaparm_kernel_pair_inum,
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_t_per_atom)
{
.reg .u32 %r<72>;
.reg .u64 %rd<62>;
.reg .f32 %f<103>;
.reg .pred %p<19>;
.shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];
.shared .align 4 .b8 __cuda___cuda_local_var_32582_35_non_const_red_acc108[3072];
// __cuda_local_var_32504_10_non_const_f = 48
// __cuda_local_var_32508_9_non_const_virial = 16
.loc 16 88 0
$LDWbegin_kernel_pair:
.loc 16 95 0
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
ldu.global.f32 %f1, [%rd1+0];
.loc 16 96 0
ld.global.f32 %f2, [%rd1+4];
.loc 16 97 0
ld.global.f32 %f3, [%rd1+8];
.loc 16 98 0
ld.global.f32 %f4, [%rd1+12];
st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
.loc 16 107 0
mov.f32 %f5, 0f00000000; // 0
mov.f32 %f6, %f5;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, %f7;
mov.f32 %f9, 0f00000000; // 0
mov.f32 %f10, %f9;
mov.f32 %f11, 0f00000000; // 0
mov.f32 %f12, %f11;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, %f13;
mov.f32 %f15, 0f00000000; // 0
mov.f32 %f16, %f15;
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
rem.s32 %r6, %r2, %r1;
cvt.s32.u32 %r7, %ctaid.x;
mul.lo.s32 %r8, %r7, %r5;
add.s32 %r9, %r3, %r8;
ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];
setp.lt.s32 %p1, %r9, %r10;
@!%p1 bra $Lt_0_19202;
.loc 16 113 0
ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];
cvt.s64.s32 %rd2, %r11;
mul.wide.s32 %rd3, %r11, 4;
cvt.s64.s32 %rd4, %r9;
mul.wide.s32 %rd5, %r9, 4;
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
add.u64 %rd7, %rd5, %rd6;
add.u64 %rd8, %rd3, %rd7;
ld.global.s32 %r12, [%rd8+0];
add.u64 %rd9, %rd3, %rd8;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];
setp.ne.u64 %p2, %rd10, %rd6;
@%p2 bra $Lt_0_19714;
.loc 16 119 0
cvt.s32.s64 %r13, %rd2;
mul.lo.s32 %r14, %r13, %r12;
cvt.s64.s32 %rd11, %r14;
mul.wide.s32 %rd12, %r14, 4;
add.u64 %rd13, %rd9, %rd12;
.loc 16 120 0
mul.lo.s32 %r15, %r6, %r13;
cvt.s64.s32 %rd14, %r15;
mul.wide.s32 %rd15, %r15, 4;
add.u64 %rd16, %rd9, %rd15;
.loc 16 121 0
mul.lo.s32 %r16, %r13, %r1;
bra.uni $Lt_0_19458;
$Lt_0_19714:
.loc 16 123 0
ld.global.s32 %r17, [%rd9+0];
cvt.s64.s32 %rd17, %r17;
mul.wide.s32 %rd18, %r17, 4;
add.u64 %rd19, %rd10, %rd18;
.loc 16 124 0
cvt.s64.s32 %rd20, %r12;
mul.wide.s32 %rd21, %r12, 4;
add.u64 %rd13, %rd19, %rd21;
.loc 16 125 0
mov.s32 %r16, %r1;
.loc 16 126 0
cvt.s64.s32 %rd22, %r6;
mul.wide.s32 %rd23, %r6, 4;
add.u64 %rd16, %rd19, %rd23;
$Lt_0_19458:
.loc 16 129 0
ld.global.s32 %r18, [%rd7+0];
mov.u32 %r19, %r18;
mov.s32 %r20, 0;
mov.u32 %r21, %r20;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];
mov.f32 %f21, %f17;
mov.f32 %f22, %f18;
mov.f32 %f23, %f19;
mov.f32 %f24, %f20;
setp.ge.u64 %p3, %rd16, %rd13;
@%p3 bra $Lt_0_28162;
cvt.rzi.ftz.s32.f32 %r26, %f24;
cvt.s64.s32 %rd24, %r16;
ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];
mul.lo.s32 %r28, %r27, %r26;
ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;
$Lt_0_20482:
//<loop> Loop body line 129, nesting depth: 1, estimated iterations: unknown
.loc 16 135 0
ld.global.s32 %r29, [%rd16+0];
.loc 16 136 0
shr.s32 %r30, %r29, 30;
and.b32 %r31, %r30, 3;
cvt.s64.s32 %rd27, %r31;
mul.wide.s32 %rd28, %r31, 4;
add.u64 %rd29, %rd26, %rd28;
ld.shared.f32 %f29, [%rd29+0];
.loc 16 139 0
and.b32 %r32, %r29, 1073741823;
mov.u32 %r33, %r32;
mov.s32 %r34, 0;
mov.u32 %r35, %r34;
mov.s32 %r36, 0;
mov.u32 %r37, %r36;
mov.s32 %r38, 0;
mov.u32 %r39, %r38;
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];
mov.f32 %f34, %f30;
mov.f32 %f35, %f31;
mov.f32 %f36, %f32;
mov.f32 %f37, %f33;
cvt.rzi.ftz.s32.f32 %r40, %f37;
sub.ftz.f32 %f38, %f22, %f35;
sub.ftz.f32 %f39, %f21, %f34;
sub.ftz.f32 %f40, %f23, %f36;
mul.ftz.f32 %f41, %f38, %f38;
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
add.s32 %r41, %r40, %r28;
cvt.s64.s32 %rd30, %r41;
mul.wide.s32 %rd31, %r41, 16;
add.u64 %rd32, %rd31, %rd25;
ld.global.f32 %f44, [%rd32+8];
setp.gt.ftz.f32 %p4, %f44, %f43;
@!%p4 bra $Lt_0_21762;
.loc 16 154 0
rcp.approx.ftz.f32 %f45, %f43;
mul.ftz.f32 %f46, %f45, %f45;
mul.ftz.f32 %f47, %f45, %f46;
sqrt.approx.ftz.f32 %f48, %f47;
mul.ftz.f32 %f49, %f45, %f47;
ld.global.v2.f32 {%f50,%f51}, [%rd32+0];
mul.ftz.f32 %f52, %f50, %f48;
sub.ftz.f32 %f53, %f52, %f51;
mul.ftz.f32 %f54, %f49, %f53;
mul.ftz.f32 %f55, %f29, %f54;
.loc 16 156 0
fma.rn.ftz.f32 %f27, %f39, %f55, %f27;
.loc 16 157 0
fma.rn.ftz.f32 %f26, %f38, %f55, %f26;
.loc 16 158 0
fma.rn.ftz.f32 %f25, %f40, %f55, %f25;
ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];
mov.u32 %r43, 0;
setp.le.s32 %p5, %r42, %r43;
@%p5 bra $Lt_0_21250;
.loc 16 162 0
ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];
add.u64 %rd34, %rd33, %rd31;
ld.global.v4.f32 {%f56,%f57,%f58,_}, [%rd34+0];
mul.ftz.f32 %f59, %f56, %f48;
sub.ftz.f32 %f60, %f59, %f57;
mul.ftz.f32 %f61, %f47, %f60;
sub.ftz.f32 %f62, %f61, %f58;
fma.rn.ftz.f32 %f28, %f29, %f62, %f28;
$Lt_0_21250:
ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];
mov.u32 %r45, 0;
setp.le.s32 %p6, %r44, %r45;
@%p6 bra $Lt_0_21762;
.loc 16 165 0
mov.f32 %f63, %f6;
mul.ftz.f32 %f64, %f39, %f39;
fma.rn.ftz.f32 %f65, %f55, %f64, %f63;
mov.f32 %f6, %f65;
.loc 16 166 0
mov.f32 %f66, %f8;
fma.rn.ftz.f32 %f67, %f55, %f41, %f66;
mov.f32 %f8, %f67;
.loc 16 167 0
mov.f32 %f68, %f10;
mul.ftz.f32 %f69, %f40, %f40;
fma.rn.ftz.f32 %f70, %f55, %f69, %f68;
mov.f32 %f10, %f70;
.loc 16 168 0
mov.f32 %f71, %f12;
mul.ftz.f32 %f72, %f38, %f39;
fma.rn.ftz.f32 %f73, %f55, %f72, %f71;
mov.f32 %f12, %f73;
.loc 16 169 0
mov.f32 %f74, %f14;
mul.ftz.f32 %f75, %f39, %f40;
fma.rn.ftz.f32 %f76, %f55, %f75, %f74;
mov.f32 %f14, %f76;
.loc 16 170 0
mul.ftz.f32 %f77, %f38, %f40;
fma.rn.ftz.f32 %f15, %f55, %f77, %f15;
mov.f32 %f16, %f15;
$Lt_0_21762:
$Lt_0_20738:
.loc 16 133 0
mul.lo.u64 %rd35, %rd24, 4;
add.u64 %rd16, %rd16, %rd35;
setp.lt.u64 %p7, %rd16, %rd13;
@%p7 bra $Lt_0_20482;
bra.uni $Lt_0_18946;
$Lt_0_28162:
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
bra.uni $Lt_0_18946;
$Lt_0_19202:
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
$Lt_0_18946:
mov.u32 %r46, 1;
setp.le.s32 %p8, %r1, %r46;
@%p8 bra $Lt_0_24578;
.loc 16 181 0
mov.u64 %rd36, __cuda___cuda_local_var_32582_35_non_const_red_acc108;
cvt.s64.s32 %rd37, %r2;
mul.wide.s32 %rd38, %r2, 4;
add.u64 %rd39, %rd36, %rd38;
mov.f32 %f78, %f27;
st.shared.f32 [%rd39+0], %f78;
.loc 16 182 0
mov.f32 %f79, %f26;
st.shared.f32 [%rd39+512], %f79;
.loc 16 183 0
mov.f32 %f80, %f25;
st.shared.f32 [%rd39+1024], %f80;
.loc 16 184 0
mov.f32 %f81, %f28;
st.shared.f32 [%rd39+1536], %f81;
.loc 16 186 0
shr.s32 %r47, %r1, 31;
mov.s32 %r48, 1;
and.b32 %r49, %r47, %r48;
add.s32 %r50, %r49, %r1;
shr.s32 %r51, %r50, 1;
mov.s32 %r52, %r51;
mov.u32 %r53, 0;
setp.ne.u32 %p9, %r51, %r53;
@!%p9 bra $Lt_0_23042;
$Lt_0_23554:
setp.ge.u32 %p10, %r6, %r52;
@%p10 bra $Lt_0_23810;
.loc 16 189 0
add.u32 %r54, %r2, %r52;
cvt.u64.u32 %rd40, %r54;
mul.wide.u32 %rd41, %r54, 4;
add.u64 %rd42, %rd36, %rd41;
ld.shared.f32 %f82, [%rd42+0];
add.ftz.f32 %f78, %f82, %f78;
st.shared.f32 [%rd39+0], %f78;
ld.shared.f32 %f83, [%rd42+512];
add.ftz.f32 %f79, %f83, %f79;
st.shared.f32 [%rd39+512], %f79;
ld.shared.f32 %f84, [%rd42+1024];
add.ftz.f32 %f80, %f84, %f80;
st.shared.f32 [%rd39+1024], %f80;
ld.shared.f32 %f85, [%rd42+1536];
add.ftz.f32 %f81, %f85, %f81;
st.shared.f32 [%rd39+1536], %f81;
$Lt_0_23810:
.loc 16 186 0
shr.u32 %r52, %r52, 1;
mov.u32 %r55, 0;
setp.ne.u32 %p11, %r52, %r55;
@%p11 bra $Lt_0_23554;
$Lt_0_23042:
.loc 16 193 0
mov.f32 %f27, %f78;
.loc 16 194 0
mov.f32 %f26, %f79;
.loc 16 195 0
mov.f32 %f25, %f80;
.loc 16 196 0
mov.f32 %f28, %f81;
ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];
mov.u32 %r57, 0;
setp.le.s32 %p12, %r56, %r57;
@%p12 bra $Lt_0_24578;
.loc 16 200 0
mov.f32 %f78, %f6;
st.shared.f32 [%rd39+0], %f78;
mov.f32 %f79, %f8;
st.shared.f32 [%rd39+512], %f79;
mov.f32 %f80, %f10;
st.shared.f32 [%rd39+1024], %f80;
mov.f32 %f81, %f12;
st.shared.f32 [%rd39+1536], %f81;
mov.f32 %f86, %f14;
st.shared.f32 [%rd39+2048], %f86;
mov.f32 %f87, %f16;
st.shared.f32 [%rd39+2560], %f87;
.loc 16 202 0
mov.s32 %r58, %r51;
@!%p9 bra $Lt_0_25090;
$Lt_0_25602:
setp.ge.u32 %p13, %r6, %r58;
@%p13 bra $Lt_0_25858;
.loc 16 205 0
add.u32 %r59, %r2, %r58;
cvt.u64.u32 %rd43, %r59;
mul.wide.u32 %rd44, %r59, 4;
add.u64 %rd45, %rd36, %rd44;
ld.shared.f32 %f88, [%rd45+0];
add.ftz.f32 %f78, %f88, %f78;
st.shared.f32 [%rd39+0], %f78;
ld.shared.f32 %f89, [%rd45+512];
add.ftz.f32 %f79, %f89, %f79;
st.shared.f32 [%rd39+512], %f79;
ld.shared.f32 %f90, [%rd45+1024];
add.ftz.f32 %f80, %f90, %f80;
st.shared.f32 [%rd39+1024], %f80;
ld.shared.f32 %f91, [%rd45+1536];
add.ftz.f32 %f81, %f91, %f81;
st.shared.f32 [%rd39+1536], %f81;
ld.shared.f32 %f92, [%rd45+2048];
add.ftz.f32 %f86, %f92, %f86;
st.shared.f32 [%rd39+2048], %f86;
ld.shared.f32 %f93, [%rd45+2560];
add.ftz.f32 %f87, %f93, %f87;
st.shared.f32 [%rd39+2560], %f87;
$Lt_0_25858:
.loc 16 202 0
shr.u32 %r58, %r58, 1;
mov.u32 %r60, 0;
setp.ne.u32 %p14, %r58, %r60;
@%p14 bra $Lt_0_25602;
$Lt_0_25090:
.loc 16 210 0
mov.f32 %f6, %f78;
mov.f32 %f8, %f79;
mov.f32 %f10, %f80;
mov.f32 %f12, %f81;
mov.f32 %f14, %f86;
mov.f32 %f16, %f87;
$Lt_0_24578:
$Lt_0_22530:
selp.s32 %r61, 1, 0, %p1;
mov.s32 %r62, 0;
set.eq.u32.s32 %r63, %r6, %r62;
neg.s32 %r64, %r63;
and.b32 %r65, %r61, %r64;
mov.u32 %r66, 0;
setp.eq.s32 %p15, %r65, %r66;
@%p15 bra $Lt_0_26626;
.loc 16 216 0
cvt.s64.s32 %rd46, %r9;
ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];
mul.wide.s32 %rd48, %r9, 4;
add.u64 %rd49, %rd47, %rd48;
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
mov.u32 %r68, 0;
setp.le.s32 %p16, %r67, %r68;
@%p16 bra $Lt_0_27138;
.loc 16 218 0
st.global.f32 [%rd49+0], %f28;
.loc 16 219 0
cvt.s64.s32 %rd50, %r10;
mul.wide.s32 %rd51, %r10, 4;
add.u64 %rd49, %rd49, %rd51;
$Lt_0_27138:
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
mov.u32 %r70, 0;
setp.le.s32 %p17, %r69, %r70;
@%p17 bra $Lt_0_27650;
.loc 16 223 0
mov.f32 %f94, %f6;
st.global.f32 [%rd49+0], %f94;
.loc 16 224 0
cvt.s64.s32 %rd52, %r10;
mul.wide.s32 %rd53, %r10, 4;
add.u64 %rd54, %rd53, %rd49;
.loc 16 223 0
mov.f32 %f95, %f8;
st.global.f32 [%rd54+0], %f95;
.loc 16 224 0
add.u64 %rd55, %rd53, %rd54;
.loc 16 223 0
mov.f32 %f96, %f10;
st.global.f32 [%rd55+0], %f96;
.loc 16 224 0
add.u64 %rd56, %rd53, %rd55;
.loc 16 223 0
mov.f32 %f97, %f12;
st.global.f32 [%rd56+0], %f97;
.loc 16 224 0
add.u64 %rd49, %rd53, %rd56;
.loc 16 223 0
mov.f32 %f98, %f14;
st.global.f32 [%rd49+0], %f98;
mov.f32 %f99, %f16;
add.u64 %rd57, %rd53, %rd49;
st.global.f32 [%rd57+0], %f99;
$Lt_0_27650:
.loc 16 227 0
ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];
mul.lo.u64 %rd59, %rd46, 16;
add.u64 %rd60, %rd58, %rd59;
mov.f32 %f100, %f101;
st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f100};
$Lt_0_26626:
.loc 16 229 0
exit;
$LDWend_kernel_pair:
} // kernel_pair
.entry kernel_pair_fast (
.param .u64 __cudaparm_kernel_pair_fast_x_,
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
.param .u64 __cudaparm_kernel_pair_fast_ans,
.param .u64 __cudaparm_kernel_pair_fast_engv,
.param .s32 __cudaparm_kernel_pair_fast_eflag,
.param .s32 __cudaparm_kernel_pair_fast_vflag,
.param .s32 __cudaparm_kernel_pair_fast_inum,
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
{
.reg .u32 %r<74>;
.reg .u64 %rd<74>;
.reg .f32 %f<109>;
.reg .pred %p<22>;
.shared .align 4 .b8 __cuda___cuda_local_var_32648_33_non_const_sp_lj3268[16];
.shared .align 16 .b8 __cuda___cuda_local_var_32646_34_non_const_lj13296[1936];
.shared .align 16 .b8 __cuda___cuda_local_var_32647_34_non_const_lj35232[1936];
.shared .align 4 .b8 __cuda___cuda_local_var_32737_35_non_const_red_acc7168[3072];
// __cuda_local_var_32658_10_non_const_f = 48
// __cuda_local_var_32662_9_non_const_virial = 16
.loc 16 237 0
$LDWbegin_kernel_pair_fast:
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 3;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_21250;
.loc 16 247 0
mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd3, %r1, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_21250:
mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268;
mov.u32 %r3, 120;
setp.gt.s32 %p2, %r1, %r3;
@%p2 bra $Lt_1_21762;
.loc 16 249 0
mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_lj13296;
cvt.s64.s32 %rd8, %r1;
mul.wide.s32 %rd9, %r1, 16;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
add.u64 %rd11, %rd10, %rd9;
add.u64 %rd12, %rd9, %rd7;
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r5, 0;
setp.le.s32 %p3, %r4, %r5;
@%p3 bra $Lt_1_22274;
.loc 16 251 0
mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232;
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
add.u64 %rd15, %rd14, %rd9;
add.u64 %rd16, %rd9, %rd13;
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
$Lt_1_22274:
mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232;
$Lt_1_21762:
mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_lj13296;
mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232;
.loc 16 261 0
mov.f32 %f10, 0f00000000; // 0
mov.f32 %f11, %f10;
mov.f32 %f12, 0f00000000; // 0
mov.f32 %f13, %f12;
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, %f14;
mov.f32 %f16, 0f00000000; // 0
mov.f32 %f17, %f16;
mov.f32 %f18, 0f00000000; // 0
mov.f32 %f19, %f18;
mov.f32 %f20, 0f00000000; // 0
mov.f32 %f21, %f20;
.loc 16 263 0
bar.sync 0;
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
div.s32 %r7, %r1, %r6;
cvt.s32.u32 %r8, %ntid.x;
div.s32 %r9, %r8, %r6;
rem.s32 %r10, %r1, %r6;
cvt.s32.u32 %r11, %ctaid.x;
mul.lo.s32 %r12, %r11, %r9;
add.s32 %r13, %r7, %r12;
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];
setp.lt.s32 %p4, %r13, %r14;
@!%p4 bra $Lt_1_23042;
.loc 16 269 0
ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];
cvt.s64.s32 %rd17, %r15;
mul.wide.s32 %rd18, %r15, 4;
cvt.s64.s32 %rd19, %r13;
mul.wide.s32 %rd20, %r13, 4;
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
add.u64 %rd22, %rd20, %rd21;
add.u64 %rd23, %rd18, %rd22;
ld.global.s32 %r16, [%rd23+0];
add.u64 %rd24, %rd18, %rd23;
ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];
setp.ne.u64 %p5, %rd25, %rd21;
@%p5 bra $Lt_1_23554;
.loc 16 275 0
cvt.s32.s64 %r17, %rd17;
mul.lo.s32 %r18, %r17, %r16;
cvt.s64.s32 %rd26, %r18;
mul.wide.s32 %rd27, %r18, 4;
add.u64 %rd28, %rd24, %rd27;
.loc 16 276 0
mul.lo.s32 %r19, %r10, %r17;
cvt.s64.s32 %rd29, %r19;
mul.wide.s32 %rd30, %r19, 4;
add.u64 %rd31, %rd24, %rd30;
.loc 16 277 0
mul.lo.s32 %r20, %r17, %r6;
bra.uni $Lt_1_23298;
$Lt_1_23554:
.loc 16 279 0
ld.global.s32 %r21, [%rd24+0];
cvt.s64.s32 %rd32, %r21;
mul.wide.s32 %rd33, %r21, 4;
add.u64 %rd34, %rd25, %rd33;
.loc 16 280 0
cvt.s64.s32 %rd35, %r16;
mul.wide.s32 %rd36, %r16, 4;
add.u64 %rd28, %rd34, %rd36;
.loc 16 281 0
mov.s32 %r20, %r6;
.loc 16 282 0
cvt.s64.s32 %rd37, %r10;
mul.wide.s32 %rd38, %r10, 4;
add.u64 %rd31, %rd34, %rd38;
$Lt_1_23298:
.loc 16 285 0
ld.global.s32 %r22, [%rd22+0];
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
mov.s32 %r26, 0;
mov.u32 %r27, %r26;
mov.s32 %r28, 0;
mov.u32 %r29, %r28;
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];
mov.f32 %f26, %f22;
mov.f32 %f27, %f23;
mov.f32 %f28, %f24;
mov.f32 %f29, %f25;
setp.ge.u64 %p6, %rd31, %rd28;
@%p6 bra $Lt_1_32002;
cvt.rzi.ftz.s32.f32 %r30, %f29;
cvt.s64.s32 %rd39, %r20;
mul.lo.s32 %r31, %r30, 11;
cvt.rn.f32.s32 %f30, %r31;
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_24322:
//<loop> Loop body line 285, nesting depth: 1, estimated iterations: unknown
.loc 16 292 0
ld.global.s32 %r32, [%rd31+0];
.loc 16 296 0
and.b32 %r33, %r32, 1073741823;
mov.u32 %r34, %r33;
mov.s32 %r35, 0;
mov.u32 %r36, %r35;
mov.s32 %r37, 0;
mov.u32 %r38, %r37;
mov.s32 %r39, 0;
mov.u32 %r40, %r39;
tex.1d.v4.f32.s32 {%f35,%f36,%f37,%f38},[pos_tex,{%r34,%r36,%r38,%r40}];
mov.f32 %f39, %f35;
mov.f32 %f40, %f36;
mov.f32 %f41, %f37;
mov.f32 %f42, %f38;
sub.ftz.f32 %f43, %f27, %f40;
sub.ftz.f32 %f44, %f26, %f39;
sub.ftz.f32 %f45, %f28, %f41;
mul.ftz.f32 %f46, %f43, %f43;
fma.rn.ftz.f32 %f47, %f44, %f44, %f46;
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
add.ftz.f32 %f49, %f30, %f42;
cvt.rzi.ftz.s32.f32 %r41, %f49;
cvt.s64.s32 %rd40, %r41;
mul.wide.s32 %rd41, %r41, 16;
add.u64 %rd42, %rd41, %rd7;
ld.shared.f32 %f50, [%rd42+8];
setp.gt.ftz.f32 %p7, %f50, %f48;
@!%p7 bra $Lt_1_25602;
.loc 16 309 0
rcp.approx.ftz.f32 %f51, %f48;
mul.ftz.f32 %f52, %f51, %f51;
mul.ftz.f32 %f53, %f51, %f52;
sqrt.approx.ftz.f32 %f54, %f53;
mul.ftz.f32 %f55, %f51, %f53;
ld.shared.v2.f32 {%f56,%f57}, [%rd42+0];
mul.ftz.f32 %f58, %f56, %f54;
sub.ftz.f32 %f59, %f58, %f57;
mul.ftz.f32 %f60, %f55, %f59;
.loc 16 311 0
fma.rn.ftz.f32 %f33, %f44, %f60, %f33;
.loc 16 312 0
fma.rn.ftz.f32 %f32, %f43, %f60, %f32;
.loc 16 313 0
fma.rn.ftz.f32 %f31, %f45, %f60, %f31;
ld.param.s32 %r42, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r43, 0;
setp.le.s32 %p8, %r42, %r43;
@%p8 bra $Lt_1_25090;
.loc 16 316 0
add.u64 %rd43, %rd41, %rd13;
ld.shared.v4.f32 {%f61,%f62,%f63,_}, [%rd43+0];
mul.ftz.f32 %f64, %f61, %f54;
sub.ftz.f32 %f65, %f64, %f62;
mul.ftz.f32 %f66, %f53, %f65;
.loc 16 317 0
shr.s32 %r44, %r32, 30;
and.b32 %r45, %r44, 3;
cvt.s64.s32 %rd44, %r45;
mul.wide.s32 %rd45, %r45, 4;
add.u64 %rd46, %rd1, %rd45;
ld.shared.f32 %f67, [%rd46+0];
sub.ftz.f32 %f68, %f66, %f63;
fma.rn.ftz.f32 %f34, %f67, %f68, %f34;
$Lt_1_25090:
ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r47, 0;
setp.le.s32 %p9, %r46, %r47;
@%p9 bra $Lt_1_25602;
.loc 16 320 0
mov.f32 %f69, %f11;
mul.ftz.f32 %f70, %f44, %f44;
fma.rn.ftz.f32 %f71, %f60, %f70, %f69;
mov.f32 %f11, %f71;
.loc 16 321 0
mov.f32 %f72, %f13;
fma.rn.ftz.f32 %f73, %f60, %f46, %f72;
mov.f32 %f13, %f73;
.loc 16 322 0
mov.f32 %f74, %f15;
mul.ftz.f32 %f75, %f45, %f45;
fma.rn.ftz.f32 %f76, %f60, %f75, %f74;
mov.f32 %f15, %f76;
.loc 16 323 0
mov.f32 %f77, %f17;
mul.ftz.f32 %f78, %f43, %f44;
fma.rn.ftz.f32 %f79, %f60, %f78, %f77;
mov.f32 %f17, %f79;
.loc 16 324 0
mov.f32 %f80, %f19;
mul.ftz.f32 %f81, %f44, %f45;
fma.rn.ftz.f32 %f82, %f60, %f81, %f80;
mov.f32 %f19, %f82;
.loc 16 325 0
mul.ftz.f32 %f83, %f43, %f45;
fma.rn.ftz.f32 %f20, %f60, %f83, %f20;
mov.f32 %f21, %f20;
$Lt_1_25602:
$Lt_1_24578:
.loc 16 290 0
mul.lo.u64 %rd47, %rd39, 4;
add.u64 %rd31, %rd31, %rd47;
setp.lt.u64 %p10, %rd31, %rd28;
@%p10 bra $Lt_1_24322;
bra.uni $Lt_1_22786;
$Lt_1_32002:
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
bra.uni $Lt_1_22786;
$Lt_1_23042:
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_22786:
mov.u32 %r48, 1;
setp.le.s32 %p11, %r6, %r48;
@%p11 bra $Lt_1_28418;
.loc 16 336 0
mov.u64 %rd48, __cuda___cuda_local_var_32737_35_non_const_red_acc7168;
cvt.s64.s32 %rd49, %r1;
mul.wide.s32 %rd50, %r1, 4;
add.u64 %rd51, %rd48, %rd50;
mov.f32 %f84, %f33;
st.shared.f32 [%rd51+0], %f84;
.loc 16 337 0
mov.f32 %f85, %f32;
st.shared.f32 [%rd51+512], %f85;
.loc 16 338 0
mov.f32 %f86, %f31;
st.shared.f32 [%rd51+1024], %f86;
.loc 16 339 0
mov.f32 %f87, %f34;
st.shared.f32 [%rd51+1536], %f87;
.loc 16 341 0
shr.s32 %r49, %r6, 31;
mov.s32 %r50, 1;
and.b32 %r51, %r49, %r50;
add.s32 %r52, %r51, %r6;
shr.s32 %r53, %r52, 1;
mov.s32 %r54, %r53;
mov.u32 %r55, 0;
setp.ne.u32 %p12, %r53, %r55;
@!%p12 bra $Lt_1_26882;
$Lt_1_27394:
setp.ge.u32 %p13, %r10, %r54;
@%p13 bra $Lt_1_27650;
.loc 16 344 0
add.u32 %r56, %r1, %r54;
cvt.u64.u32 %rd52, %r56;
mul.wide.u32 %rd53, %r56, 4;
add.u64 %rd54, %rd48, %rd53;
ld.shared.f32 %f88, [%rd54+0];
add.ftz.f32 %f84, %f88, %f84;
st.shared.f32 [%rd51+0], %f84;
ld.shared.f32 %f89, [%rd54+512];
add.ftz.f32 %f85, %f89, %f85;
st.shared.f32 [%rd51+512], %f85;
ld.shared.f32 %f90, [%rd54+1024];
add.ftz.f32 %f86, %f90, %f86;
st.shared.f32 [%rd51+1024], %f86;
ld.shared.f32 %f91, [%rd54+1536];
add.ftz.f32 %f87, %f91, %f87;
st.shared.f32 [%rd51+1536], %f87;
$Lt_1_27650:
.loc 16 341 0
shr.u32 %r54, %r54, 1;
mov.u32 %r57, 0;
setp.ne.u32 %p14, %r54, %r57;
@%p14 bra $Lt_1_27394;
$Lt_1_26882:
.loc 16 348 0
mov.f32 %f33, %f84;
.loc 16 349 0
mov.f32 %f32, %f85;
.loc 16 350 0
mov.f32 %f31, %f86;
.loc 16 351 0
mov.f32 %f34, %f87;
ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r59, 0;
setp.le.s32 %p15, %r58, %r59;
@%p15 bra $Lt_1_28418;
.loc 16 355 0
mov.f32 %f84, %f11;
st.shared.f32 [%rd51+0], %f84;
mov.f32 %f85, %f13;
st.shared.f32 [%rd51+512], %f85;
mov.f32 %f86, %f15;
st.shared.f32 [%rd51+1024], %f86;
mov.f32 %f87, %f17;
st.shared.f32 [%rd51+1536], %f87;
mov.f32 %f92, %f19;
st.shared.f32 [%rd51+2048], %f92;
mov.f32 %f93, %f21;
st.shared.f32 [%rd51+2560], %f93;
.loc 16 357 0
mov.s32 %r60, %r53;
@!%p12 bra $Lt_1_28930;
$Lt_1_29442:
setp.ge.u32 %p16, %r10, %r60;
@%p16 bra $Lt_1_29698;
.loc 16 360 0
add.u32 %r61, %r1, %r60;
cvt.u64.u32 %rd55, %r61;
mul.wide.u32 %rd56, %r61, 4;
add.u64 %rd57, %rd48, %rd56;
ld.shared.f32 %f94, [%rd57+0];
add.ftz.f32 %f84, %f94, %f84;
st.shared.f32 [%rd51+0], %f84;
ld.shared.f32 %f95, [%rd57+512];
add.ftz.f32 %f85, %f95, %f85;
st.shared.f32 [%rd51+512], %f85;
ld.shared.f32 %f96, [%rd57+1024];
add.ftz.f32 %f86, %f96, %f86;
st.shared.f32 [%rd51+1024], %f86;
ld.shared.f32 %f97, [%rd57+1536];
add.ftz.f32 %f87, %f97, %f87;
st.shared.f32 [%rd51+1536], %f87;
ld.shared.f32 %f98, [%rd57+2048];
add.ftz.f32 %f92, %f98, %f92;
st.shared.f32 [%rd51+2048], %f92;
ld.shared.f32 %f99, [%rd57+2560];
add.ftz.f32 %f93, %f99, %f93;
st.shared.f32 [%rd51+2560], %f93;
$Lt_1_29698:
.loc 16 357 0
shr.u32 %r60, %r60, 1;
mov.u32 %r62, 0;
setp.ne.u32 %p17, %r60, %r62;
@%p17 bra $Lt_1_29442;
$Lt_1_28930:
.loc 16 365 0
mov.f32 %f11, %f84;
mov.f32 %f13, %f85;
mov.f32 %f15, %f86;
mov.f32 %f17, %f87;
mov.f32 %f19, %f92;
mov.f32 %f21, %f93;
$Lt_1_28418:
$Lt_1_26370:
selp.s32 %r63, 1, 0, %p4;
mov.s32 %r64, 0;
set.eq.u32.s32 %r65, %r10, %r64;
neg.s32 %r66, %r65;
and.b32 %r67, %r63, %r66;
mov.u32 %r68, 0;
setp.eq.s32 %p18, %r67, %r68;
@%p18 bra $Lt_1_30466;
.loc 16 371 0
cvt.s64.s32 %rd58, %r13;
ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];
mul.wide.s32 %rd60, %r13, 4;
add.u64 %rd61, %rd59, %rd60;
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r70, 0;
setp.le.s32 %p19, %r69, %r70;
@%p19 bra $Lt_1_30978;
.loc 16 373 0
st.global.f32 [%rd61+0], %f34;
.loc 16 374 0
cvt.s64.s32 %rd62, %r14;
mul.wide.s32 %rd63, %r14, 4;
add.u64 %rd61, %rd61, %rd63;
$Lt_1_30978:
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r72, 0;
setp.le.s32 %p20, %r71, %r72;
@%p20 bra $Lt_1_31490;
.loc 16 378 0
mov.f32 %f100, %f11;
st.global.f32 [%rd61+0], %f100;
.loc 16 379 0
cvt.s64.s32 %rd64, %r14;
mul.wide.s32 %rd65, %r14, 4;
add.u64 %rd66, %rd65, %rd61;
.loc 16 378 0
mov.f32 %f101, %f13;
st.global.f32 [%rd66+0], %f101;
.loc 16 379 0
add.u64 %rd67, %rd65, %rd66;
.loc 16 378 0
mov.f32 %f102, %f15;
st.global.f32 [%rd67+0], %f102;
.loc 16 379 0
add.u64 %rd68, %rd65, %rd67;
.loc 16 378 0
mov.f32 %f103, %f17;
st.global.f32 [%rd68+0], %f103;
.loc 16 379 0
add.u64 %rd61, %rd65, %rd68;
.loc 16 378 0
mov.f32 %f104, %f19;
st.global.f32 [%rd61+0], %f104;
mov.f32 %f105, %f21;
add.u64 %rd69, %rd65, %rd61;
st.global.f32 [%rd69+0], %f105;
$Lt_1_31490:
.loc 16 382 0
ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];
mul.lo.u64 %rd71, %rd58, 16;
add.u64 %rd72, %rd70, %rd71;
mov.f32 %f106, %f107;
st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f106};
$Lt_1_30466:
.loc 16 384 0
exit;
$LDWend_kernel_pair_fast:
} // kernel_pair_fast

927
lib/gpu/lj96_cut_gpu_ptx.h Normal file
View File

@ -0,0 +1,927 @@
const char * lj96_cut_gpu_kernel =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
" .global .texref pos_tex;\n"
" .entry kernel_pair (\n"
" .param .u64 __cudaparm_kernel_pair_x_,\n"
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_ans,\n"
" .param .u64 __cudaparm_kernel_pair_engv,\n"
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_inum,\n"
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
" {\n"
" .reg .u32 %r<72>;\n"
" .reg .u64 %rd<62>;\n"
" .reg .f32 %f<103>;\n"
" .reg .pred %p<19>;\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32582_35_non_const_red_acc108[3072];\n"
" .loc 16 88 0\n"
"$LDWbegin_kernel_pair:\n"
" .loc 16 95 0\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
" ldu.global.f32 %f1, [%rd1+0];\n"
" .loc 16 96 0\n"
" ld.global.f32 %f2, [%rd1+4];\n"
" .loc 16 97 0\n"
" ld.global.f32 %f3, [%rd1+8];\n"
" .loc 16 98 0\n"
" ld.global.f32 %f4, [%rd1+12];\n"
" st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n"
" .loc 16 107 0\n"
" mov.f32 %f5, 0f00000000; \n"
" mov.f32 %f6, %f5;\n"
" mov.f32 %f7, 0f00000000; \n"
" mov.f32 %f8, %f7;\n"
" mov.f32 %f9, 0f00000000; \n"
" mov.f32 %f10, %f9;\n"
" mov.f32 %f11, 0f00000000; \n"
" mov.f32 %f12, %f11;\n"
" mov.f32 %f13, 0f00000000; \n"
" mov.f32 %f14, %f13;\n"
" mov.f32 %f15, 0f00000000; \n"
" mov.f32 %f16, %f15;\n"
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
" cvt.s32.u32 %r2, %tid.x;\n"
" div.s32 %r3, %r2, %r1;\n"
" cvt.s32.u32 %r4, %ntid.x;\n"
" div.s32 %r5, %r4, %r1;\n"
" rem.s32 %r6, %r2, %r1;\n"
" cvt.s32.u32 %r7, %ctaid.x;\n"
" mul.lo.s32 %r8, %r7, %r5;\n"
" add.s32 %r9, %r3, %r8;\n"
" ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];\n"
" setp.lt.s32 %p1, %r9, %r10;\n"
" @!%p1 bra $Lt_0_19202;\n"
" .loc 16 113 0\n"
" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n"
" cvt.s64.s32 %rd2, %r11;\n"
" mul.wide.s32 %rd3, %r11, 4;\n"
" cvt.s64.s32 %rd4, %r9;\n"
" mul.wide.s32 %rd5, %r9, 4;\n"
" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n"
" add.u64 %rd7, %rd5, %rd6;\n"
" add.u64 %rd8, %rd3, %rd7;\n"
" ld.global.s32 %r12, [%rd8+0];\n"
" add.u64 %rd9, %rd3, %rd8;\n"
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];\n"
" setp.ne.u64 %p2, %rd10, %rd6;\n"
" @%p2 bra $Lt_0_19714;\n"
" .loc 16 119 0\n"
" cvt.s32.s64 %r13, %rd2;\n"
" mul.lo.s32 %r14, %r13, %r12;\n"
" cvt.s64.s32 %rd11, %r14;\n"
" mul.wide.s32 %rd12, %r14, 4;\n"
" add.u64 %rd13, %rd9, %rd12;\n"
" .loc 16 120 0\n"
" mul.lo.s32 %r15, %r6, %r13;\n"
" cvt.s64.s32 %rd14, %r15;\n"
" mul.wide.s32 %rd15, %r15, 4;\n"
" add.u64 %rd16, %rd9, %rd15;\n"
" .loc 16 121 0\n"
" mul.lo.s32 %r16, %r13, %r1;\n"
" bra.uni $Lt_0_19458;\n"
"$Lt_0_19714:\n"
" .loc 16 123 0\n"
" ld.global.s32 %r17, [%rd9+0];\n"
" cvt.s64.s32 %rd17, %r17;\n"
" mul.wide.s32 %rd18, %r17, 4;\n"
" add.u64 %rd19, %rd10, %rd18;\n"
" .loc 16 124 0\n"
" cvt.s64.s32 %rd20, %r12;\n"
" mul.wide.s32 %rd21, %r12, 4;\n"
" add.u64 %rd13, %rd19, %rd21;\n"
" .loc 16 125 0\n"
" mov.s32 %r16, %r1;\n"
" .loc 16 126 0\n"
" cvt.s64.s32 %rd22, %r6;\n"
" mul.wide.s32 %rd23, %r6, 4;\n"
" add.u64 %rd16, %rd19, %rd23;\n"
"$Lt_0_19458:\n"
" .loc 16 129 0\n"
" ld.global.s32 %r18, [%rd7+0];\n"
" mov.u32 %r19, %r18;\n"
" mov.s32 %r20, 0;\n"
" mov.u32 %r21, %r20;\n"
" mov.s32 %r22, 0;\n"
" mov.u32 %r23, %r22;\n"
" mov.s32 %r24, 0;\n"
" mov.u32 %r25, %r24;\n"
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];\n"
" mov.f32 %f21, %f17;\n"
" mov.f32 %f22, %f18;\n"
" mov.f32 %f23, %f19;\n"
" mov.f32 %f24, %f20;\n"
" setp.ge.u64 %p3, %rd16, %rd13;\n"
" @%p3 bra $Lt_0_28162;\n"
" cvt.rzi.ftz.s32.f32 %r26, %f24;\n"
" cvt.s64.s32 %rd24, %r16;\n"
" ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];\n"
" mul.lo.s32 %r28, %r27, %r26;\n"
" ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;\n"
"$Lt_0_20482:\n"
" .loc 16 135 0\n"
" ld.global.s32 %r29, [%rd16+0];\n"
" .loc 16 136 0\n"
" shr.s32 %r30, %r29, 30;\n"
" and.b32 %r31, %r30, 3;\n"
" cvt.s64.s32 %rd27, %r31;\n"
" mul.wide.s32 %rd28, %r31, 4;\n"
" add.u64 %rd29, %rd26, %rd28;\n"
" ld.shared.f32 %f29, [%rd29+0];\n"
" .loc 16 139 0\n"
" and.b32 %r32, %r29, 1073741823;\n"
" mov.u32 %r33, %r32;\n"
" mov.s32 %r34, 0;\n"
" mov.u32 %r35, %r34;\n"
" mov.s32 %r36, 0;\n"
" mov.u32 %r37, %r36;\n"
" mov.s32 %r38, 0;\n"
" mov.u32 %r39, %r38;\n"
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];\n"
" mov.f32 %f34, %f30;\n"
" mov.f32 %f35, %f31;\n"
" mov.f32 %f36, %f32;\n"
" mov.f32 %f37, %f33;\n"
" cvt.rzi.ftz.s32.f32 %r40, %f37;\n"
" sub.ftz.f32 %f38, %f22, %f35;\n"
" sub.ftz.f32 %f39, %f21, %f34;\n"
" sub.ftz.f32 %f40, %f23, %f36;\n"
" mul.ftz.f32 %f41, %f38, %f38;\n"
" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n"
" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n"
" add.s32 %r41, %r40, %r28;\n"
" cvt.s64.s32 %rd30, %r41;\n"
" mul.wide.s32 %rd31, %r41, 16;\n"
" add.u64 %rd32, %rd31, %rd25;\n"
" ld.global.f32 %f44, [%rd32+8];\n"
" setp.gt.ftz.f32 %p4, %f44, %f43;\n"
" @!%p4 bra $Lt_0_21762;\n"
" .loc 16 154 0\n"
" rcp.approx.ftz.f32 %f45, %f43;\n"
" mul.ftz.f32 %f46, %f45, %f45;\n"
" mul.ftz.f32 %f47, %f45, %f46;\n"
" sqrt.approx.ftz.f32 %f48, %f47;\n"
" mul.ftz.f32 %f49, %f45, %f47;\n"
" ld.global.v2.f32 {%f50,%f51}, [%rd32+0];\n"
" mul.ftz.f32 %f52, %f50, %f48;\n"
" sub.ftz.f32 %f53, %f52, %f51;\n"
" mul.ftz.f32 %f54, %f49, %f53;\n"
" mul.ftz.f32 %f55, %f29, %f54;\n"
" .loc 16 156 0\n"
" fma.rn.ftz.f32 %f27, %f39, %f55, %f27;\n"
" .loc 16 157 0\n"
" fma.rn.ftz.f32 %f26, %f38, %f55, %f26;\n"
" .loc 16 158 0\n"
" fma.rn.ftz.f32 %f25, %f40, %f55, %f25;\n"
" ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r43, 0;\n"
" setp.le.s32 %p5, %r42, %r43;\n"
" @%p5 bra $Lt_0_21250;\n"
" .loc 16 162 0\n"
" ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];\n"
" add.u64 %rd34, %rd33, %rd31;\n"
" ld.global.v4.f32 {%f56,%f57,%f58,_}, [%rd34+0];\n"
" mul.ftz.f32 %f59, %f56, %f48;\n"
" sub.ftz.f32 %f60, %f59, %f57;\n"
" mul.ftz.f32 %f61, %f47, %f60;\n"
" sub.ftz.f32 %f62, %f61, %f58;\n"
" fma.rn.ftz.f32 %f28, %f29, %f62, %f28;\n"
"$Lt_0_21250:\n"
" ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r45, 0;\n"
" setp.le.s32 %p6, %r44, %r45;\n"
" @%p6 bra $Lt_0_21762;\n"
" .loc 16 165 0\n"
" mov.f32 %f63, %f6;\n"
" mul.ftz.f32 %f64, %f39, %f39;\n"
" fma.rn.ftz.f32 %f65, %f55, %f64, %f63;\n"
" mov.f32 %f6, %f65;\n"
" .loc 16 166 0\n"
" mov.f32 %f66, %f8;\n"
" fma.rn.ftz.f32 %f67, %f55, %f41, %f66;\n"
" mov.f32 %f8, %f67;\n"
" .loc 16 167 0\n"
" mov.f32 %f68, %f10;\n"
" mul.ftz.f32 %f69, %f40, %f40;\n"
" fma.rn.ftz.f32 %f70, %f55, %f69, %f68;\n"
" mov.f32 %f10, %f70;\n"
" .loc 16 168 0\n"
" mov.f32 %f71, %f12;\n"
" mul.ftz.f32 %f72, %f38, %f39;\n"
" fma.rn.ftz.f32 %f73, %f55, %f72, %f71;\n"
" mov.f32 %f12, %f73;\n"
" .loc 16 169 0\n"
" mov.f32 %f74, %f14;\n"
" mul.ftz.f32 %f75, %f39, %f40;\n"
" fma.rn.ftz.f32 %f76, %f55, %f75, %f74;\n"
" mov.f32 %f14, %f76;\n"
" .loc 16 170 0\n"
" mul.ftz.f32 %f77, %f38, %f40;\n"
" fma.rn.ftz.f32 %f15, %f55, %f77, %f15;\n"
" mov.f32 %f16, %f15;\n"
"$Lt_0_21762:\n"
"$Lt_0_20738:\n"
" .loc 16 133 0\n"
" mul.lo.u64 %rd35, %rd24, 4;\n"
" add.u64 %rd16, %rd16, %rd35;\n"
" setp.lt.u64 %p7, %rd16, %rd13;\n"
" @%p7 bra $Lt_0_20482;\n"
" bra.uni $Lt_0_18946;\n"
"$Lt_0_28162:\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" bra.uni $Lt_0_18946;\n"
"$Lt_0_19202:\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
"$Lt_0_18946:\n"
" mov.u32 %r46, 1;\n"
" setp.le.s32 %p8, %r1, %r46;\n"
" @%p8 bra $Lt_0_24578;\n"
" .loc 16 181 0\n"
" mov.u64 %rd36, __cuda___cuda_local_var_32582_35_non_const_red_acc108;\n"
" cvt.s64.s32 %rd37, %r2;\n"
" mul.wide.s32 %rd38, %r2, 4;\n"
" add.u64 %rd39, %rd36, %rd38;\n"
" mov.f32 %f78, %f27;\n"
" st.shared.f32 [%rd39+0], %f78;\n"
" .loc 16 182 0\n"
" mov.f32 %f79, %f26;\n"
" st.shared.f32 [%rd39+512], %f79;\n"
" .loc 16 183 0\n"
" mov.f32 %f80, %f25;\n"
" st.shared.f32 [%rd39+1024], %f80;\n"
" .loc 16 184 0\n"
" mov.f32 %f81, %f28;\n"
" st.shared.f32 [%rd39+1536], %f81;\n"
" .loc 16 186 0\n"
" shr.s32 %r47, %r1, 31;\n"
" mov.s32 %r48, 1;\n"
" and.b32 %r49, %r47, %r48;\n"
" add.s32 %r50, %r49, %r1;\n"
" shr.s32 %r51, %r50, 1;\n"
" mov.s32 %r52, %r51;\n"
" mov.u32 %r53, 0;\n"
" setp.ne.u32 %p9, %r51, %r53;\n"
" @!%p9 bra $Lt_0_23042;\n"
"$Lt_0_23554:\n"
" setp.ge.u32 %p10, %r6, %r52;\n"
" @%p10 bra $Lt_0_23810;\n"
" .loc 16 189 0\n"
" add.u32 %r54, %r2, %r52;\n"
" cvt.u64.u32 %rd40, %r54;\n"
" mul.wide.u32 %rd41, %r54, 4;\n"
" add.u64 %rd42, %rd36, %rd41;\n"
" ld.shared.f32 %f82, [%rd42+0];\n"
" add.ftz.f32 %f78, %f82, %f78;\n"
" st.shared.f32 [%rd39+0], %f78;\n"
" ld.shared.f32 %f83, [%rd42+512];\n"
" add.ftz.f32 %f79, %f83, %f79;\n"
" st.shared.f32 [%rd39+512], %f79;\n"
" ld.shared.f32 %f84, [%rd42+1024];\n"
" add.ftz.f32 %f80, %f84, %f80;\n"
" st.shared.f32 [%rd39+1024], %f80;\n"
" ld.shared.f32 %f85, [%rd42+1536];\n"
" add.ftz.f32 %f81, %f85, %f81;\n"
" st.shared.f32 [%rd39+1536], %f81;\n"
"$Lt_0_23810:\n"
" .loc 16 186 0\n"
" shr.u32 %r52, %r52, 1;\n"
" mov.u32 %r55, 0;\n"
" setp.ne.u32 %p11, %r52, %r55;\n"
" @%p11 bra $Lt_0_23554;\n"
"$Lt_0_23042:\n"
" .loc 16 193 0\n"
" mov.f32 %f27, %f78;\n"
" .loc 16 194 0\n"
" mov.f32 %f26, %f79;\n"
" .loc 16 195 0\n"
" mov.f32 %f25, %f80;\n"
" .loc 16 196 0\n"
" mov.f32 %f28, %f81;\n"
" ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r57, 0;\n"
" setp.le.s32 %p12, %r56, %r57;\n"
" @%p12 bra $Lt_0_24578;\n"
" .loc 16 200 0\n"
" mov.f32 %f78, %f6;\n"
" st.shared.f32 [%rd39+0], %f78;\n"
" mov.f32 %f79, %f8;\n"
" st.shared.f32 [%rd39+512], %f79;\n"
" mov.f32 %f80, %f10;\n"
" st.shared.f32 [%rd39+1024], %f80;\n"
" mov.f32 %f81, %f12;\n"
" st.shared.f32 [%rd39+1536], %f81;\n"
" mov.f32 %f86, %f14;\n"
" st.shared.f32 [%rd39+2048], %f86;\n"
" mov.f32 %f87, %f16;\n"
" st.shared.f32 [%rd39+2560], %f87;\n"
" .loc 16 202 0\n"
" mov.s32 %r58, %r51;\n"
" @!%p9 bra $Lt_0_25090;\n"
"$Lt_0_25602:\n"
" setp.ge.u32 %p13, %r6, %r58;\n"
" @%p13 bra $Lt_0_25858;\n"
" .loc 16 205 0\n"
" add.u32 %r59, %r2, %r58;\n"
" cvt.u64.u32 %rd43, %r59;\n"
" mul.wide.u32 %rd44, %r59, 4;\n"
" add.u64 %rd45, %rd36, %rd44;\n"
" ld.shared.f32 %f88, [%rd45+0];\n"
" add.ftz.f32 %f78, %f88, %f78;\n"
" st.shared.f32 [%rd39+0], %f78;\n"
" ld.shared.f32 %f89, [%rd45+512];\n"
" add.ftz.f32 %f79, %f89, %f79;\n"
" st.shared.f32 [%rd39+512], %f79;\n"
" ld.shared.f32 %f90, [%rd45+1024];\n"
" add.ftz.f32 %f80, %f90, %f80;\n"
" st.shared.f32 [%rd39+1024], %f80;\n"
" ld.shared.f32 %f91, [%rd45+1536];\n"
" add.ftz.f32 %f81, %f91, %f81;\n"
" st.shared.f32 [%rd39+1536], %f81;\n"
" ld.shared.f32 %f92, [%rd45+2048];\n"
" add.ftz.f32 %f86, %f92, %f86;\n"
" st.shared.f32 [%rd39+2048], %f86;\n"
" ld.shared.f32 %f93, [%rd45+2560];\n"
" add.ftz.f32 %f87, %f93, %f87;\n"
" st.shared.f32 [%rd39+2560], %f87;\n"
"$Lt_0_25858:\n"
" .loc 16 202 0\n"
" shr.u32 %r58, %r58, 1;\n"
" mov.u32 %r60, 0;\n"
" setp.ne.u32 %p14, %r58, %r60;\n"
" @%p14 bra $Lt_0_25602;\n"
"$Lt_0_25090:\n"
" .loc 16 210 0\n"
" mov.f32 %f6, %f78;\n"
" mov.f32 %f8, %f79;\n"
" mov.f32 %f10, %f80;\n"
" mov.f32 %f12, %f81;\n"
" mov.f32 %f14, %f86;\n"
" mov.f32 %f16, %f87;\n"
"$Lt_0_24578:\n"
"$Lt_0_22530:\n"
" selp.s32 %r61, 1, 0, %p1;\n"
" mov.s32 %r62, 0;\n"
" set.eq.u32.s32 %r63, %r6, %r62;\n"
" neg.s32 %r64, %r63;\n"
" and.b32 %r65, %r61, %r64;\n"
" mov.u32 %r66, 0;\n"
" setp.eq.s32 %p15, %r65, %r66;\n"
" @%p15 bra $Lt_0_26626;\n"
" .loc 16 216 0\n"
" cvt.s64.s32 %rd46, %r9;\n"
" ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];\n"
" mul.wide.s32 %rd48, %r9, 4;\n"
" add.u64 %rd49, %rd47, %rd48;\n"
" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r68, 0;\n"
" setp.le.s32 %p16, %r67, %r68;\n"
" @%p16 bra $Lt_0_27138;\n"
" .loc 16 218 0\n"
" st.global.f32 [%rd49+0], %f28;\n"
" .loc 16 219 0\n"
" cvt.s64.s32 %rd50, %r10;\n"
" mul.wide.s32 %rd51, %r10, 4;\n"
" add.u64 %rd49, %rd49, %rd51;\n"
"$Lt_0_27138:\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p17, %r69, %r70;\n"
" @%p17 bra $Lt_0_27650;\n"
" .loc 16 223 0\n"
" mov.f32 %f94, %f6;\n"
" st.global.f32 [%rd49+0], %f94;\n"
" .loc 16 224 0\n"
" cvt.s64.s32 %rd52, %r10;\n"
" mul.wide.s32 %rd53, %r10, 4;\n"
" add.u64 %rd54, %rd53, %rd49;\n"
" .loc 16 223 0\n"
" mov.f32 %f95, %f8;\n"
" st.global.f32 [%rd54+0], %f95;\n"
" .loc 16 224 0\n"
" add.u64 %rd55, %rd53, %rd54;\n"
" .loc 16 223 0\n"
" mov.f32 %f96, %f10;\n"
" st.global.f32 [%rd55+0], %f96;\n"
" .loc 16 224 0\n"
" add.u64 %rd56, %rd53, %rd55;\n"
" .loc 16 223 0\n"
" mov.f32 %f97, %f12;\n"
" st.global.f32 [%rd56+0], %f97;\n"
" .loc 16 224 0\n"
" add.u64 %rd49, %rd53, %rd56;\n"
" .loc 16 223 0\n"
" mov.f32 %f98, %f14;\n"
" st.global.f32 [%rd49+0], %f98;\n"
" mov.f32 %f99, %f16;\n"
" add.u64 %rd57, %rd53, %rd49;\n"
" st.global.f32 [%rd57+0], %f99;\n"
"$Lt_0_27650:\n"
" .loc 16 227 0\n"
" ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];\n"
" mul.lo.u64 %rd59, %rd46, 16;\n"
" add.u64 %rd60, %rd58, %rd59;\n"
" mov.f32 %f100, %f101;\n"
" st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f100};\n"
"$Lt_0_26626:\n"
" .loc 16 229 0\n"
" exit;\n"
"$LDWend_kernel_pair:\n"
" }\n"
" .entry kernel_pair_fast (\n"
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
" .param .u64 __cudaparm_kernel_pair_fast_engv,\n"
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
" {\n"
" .reg .u32 %r<74>;\n"
" .reg .u64 %rd<74>;\n"
" .reg .f32 %f<109>;\n"
" .reg .pred %p<22>;\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32648_33_non_const_sp_lj3268[16];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32646_34_non_const_lj13296[1936];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32647_34_non_const_lj35232[1936];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32737_35_non_const_red_acc7168[3072];\n"
" .loc 16 237 0\n"
"$LDWbegin_kernel_pair_fast:\n"
" cvt.s32.u32 %r1, %tid.x;\n"
" mov.u32 %r2, 3;\n"
" setp.gt.s32 %p1, %r1, %r2;\n"
" @%p1 bra $Lt_1_21250;\n"
" .loc 16 247 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268;\n"
" cvt.s64.s32 %rd2, %r1;\n"
" mul.wide.s32 %rd3, %r1, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.f32 %f1, [%rd5+0];\n"
" add.u64 %rd6, %rd3, %rd1;\n"
" st.shared.f32 [%rd6+0], %f1;\n"
"$Lt_1_21250:\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268;\n"
" mov.u32 %r3, 120;\n"
" setp.gt.s32 %p2, %r1, %r3;\n"
" @%p2 bra $Lt_1_21762;\n"
" .loc 16 249 0\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_lj13296;\n"
" cvt.s64.s32 %rd8, %r1;\n"
" mul.wide.s32 %rd9, %r1, 16;\n"
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
" add.u64 %rd11, %rd10, %rd9;\n"
" add.u64 %rd12, %rd9, %rd7;\n"
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n"
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r5, 0;\n"
" setp.le.s32 %p3, %r4, %r5;\n"
" @%p3 bra $Lt_1_22274;\n"
" .loc 16 251 0\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232;\n"
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
" add.u64 %rd15, %rd14, %rd9;\n"
" add.u64 %rd16, %rd9, %rd13;\n"
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n"
"$Lt_1_22274:\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232;\n"
"$Lt_1_21762:\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_lj13296;\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232;\n"
" .loc 16 261 0\n"
" mov.f32 %f10, 0f00000000; \n"
" mov.f32 %f11, %f10;\n"
" mov.f32 %f12, 0f00000000; \n"
" mov.f32 %f13, %f12;\n"
" mov.f32 %f14, 0f00000000; \n"
" mov.f32 %f15, %f14;\n"
" mov.f32 %f16, 0f00000000; \n"
" mov.f32 %f17, %f16;\n"
" mov.f32 %f18, 0f00000000; \n"
" mov.f32 %f19, %f18;\n"
" mov.f32 %f20, 0f00000000; \n"
" mov.f32 %f21, %f20;\n"
" .loc 16 263 0\n"
" bar.sync 0;\n"
" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
" div.s32 %r7, %r1, %r6;\n"
" cvt.s32.u32 %r8, %ntid.x;\n"
" div.s32 %r9, %r8, %r6;\n"
" rem.s32 %r10, %r1, %r6;\n"
" cvt.s32.u32 %r11, %ctaid.x;\n"
" mul.lo.s32 %r12, %r11, %r9;\n"
" add.s32 %r13, %r7, %r12;\n"
" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];\n"
" setp.lt.s32 %p4, %r13, %r14;\n"
" @!%p4 bra $Lt_1_23042;\n"
" .loc 16 269 0\n"
" ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
" cvt.s64.s32 %rd17, %r15;\n"
" mul.wide.s32 %rd18, %r15, 4;\n"
" cvt.s64.s32 %rd19, %r13;\n"
" mul.wide.s32 %rd20, %r13, 4;\n"
" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
" add.u64 %rd22, %rd20, %rd21;\n"
" add.u64 %rd23, %rd18, %rd22;\n"
" ld.global.s32 %r16, [%rd23+0];\n"
" add.u64 %rd24, %rd18, %rd23;\n"
" ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];\n"
" setp.ne.u64 %p5, %rd25, %rd21;\n"
" @%p5 bra $Lt_1_23554;\n"
" .loc 16 275 0\n"
" cvt.s32.s64 %r17, %rd17;\n"
" mul.lo.s32 %r18, %r17, %r16;\n"
" cvt.s64.s32 %rd26, %r18;\n"
" mul.wide.s32 %rd27, %r18, 4;\n"
" add.u64 %rd28, %rd24, %rd27;\n"
" .loc 16 276 0\n"
" mul.lo.s32 %r19, %r10, %r17;\n"
" cvt.s64.s32 %rd29, %r19;\n"
" mul.wide.s32 %rd30, %r19, 4;\n"
" add.u64 %rd31, %rd24, %rd30;\n"
" .loc 16 277 0\n"
" mul.lo.s32 %r20, %r17, %r6;\n"
" bra.uni $Lt_1_23298;\n"
"$Lt_1_23554:\n"
" .loc 16 279 0\n"
" ld.global.s32 %r21, [%rd24+0];\n"
" cvt.s64.s32 %rd32, %r21;\n"
" mul.wide.s32 %rd33, %r21, 4;\n"
" add.u64 %rd34, %rd25, %rd33;\n"
" .loc 16 280 0\n"
" cvt.s64.s32 %rd35, %r16;\n"
" mul.wide.s32 %rd36, %r16, 4;\n"
" add.u64 %rd28, %rd34, %rd36;\n"
" .loc 16 281 0\n"
" mov.s32 %r20, %r6;\n"
" .loc 16 282 0\n"
" cvt.s64.s32 %rd37, %r10;\n"
" mul.wide.s32 %rd38, %r10, 4;\n"
" add.u64 %rd31, %rd34, %rd38;\n"
"$Lt_1_23298:\n"
" .loc 16 285 0\n"
" ld.global.s32 %r22, [%rd22+0];\n"
" mov.u32 %r23, %r22;\n"
" mov.s32 %r24, 0;\n"
" mov.u32 %r25, %r24;\n"
" mov.s32 %r26, 0;\n"
" mov.u32 %r27, %r26;\n"
" mov.s32 %r28, 0;\n"
" mov.u32 %r29, %r28;\n"
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];\n"
" mov.f32 %f26, %f22;\n"
" mov.f32 %f27, %f23;\n"
" mov.f32 %f28, %f24;\n"
" mov.f32 %f29, %f25;\n"
" setp.ge.u64 %p6, %rd31, %rd28;\n"
" @%p6 bra $Lt_1_32002;\n"
" cvt.rzi.ftz.s32.f32 %r30, %f29;\n"
" cvt.s64.s32 %rd39, %r20;\n"
" mul.lo.s32 %r31, %r30, 11;\n"
" cvt.rn.f32.s32 %f30, %r31;\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
"$Lt_1_24322:\n"
" .loc 16 292 0\n"
" ld.global.s32 %r32, [%rd31+0];\n"
" .loc 16 296 0\n"
" and.b32 %r33, %r32, 1073741823;\n"
" mov.u32 %r34, %r33;\n"
" mov.s32 %r35, 0;\n"
" mov.u32 %r36, %r35;\n"
" mov.s32 %r37, 0;\n"
" mov.u32 %r38, %r37;\n"
" mov.s32 %r39, 0;\n"
" mov.u32 %r40, %r39;\n"
" tex.1d.v4.f32.s32 {%f35,%f36,%f37,%f38},[pos_tex,{%r34,%r36,%r38,%r40}];\n"
" mov.f32 %f39, %f35;\n"
" mov.f32 %f40, %f36;\n"
" mov.f32 %f41, %f37;\n"
" mov.f32 %f42, %f38;\n"
" sub.ftz.f32 %f43, %f27, %f40;\n"
" sub.ftz.f32 %f44, %f26, %f39;\n"
" sub.ftz.f32 %f45, %f28, %f41;\n"
" mul.ftz.f32 %f46, %f43, %f43;\n"
" fma.rn.ftz.f32 %f47, %f44, %f44, %f46;\n"
" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n"
" add.ftz.f32 %f49, %f30, %f42;\n"
" cvt.rzi.ftz.s32.f32 %r41, %f49;\n"
" cvt.s64.s32 %rd40, %r41;\n"
" mul.wide.s32 %rd41, %r41, 16;\n"
" add.u64 %rd42, %rd41, %rd7;\n"
" ld.shared.f32 %f50, [%rd42+8];\n"
" setp.gt.ftz.f32 %p7, %f50, %f48;\n"
" @!%p7 bra $Lt_1_25602;\n"
" .loc 16 309 0\n"
" rcp.approx.ftz.f32 %f51, %f48;\n"
" mul.ftz.f32 %f52, %f51, %f51;\n"
" mul.ftz.f32 %f53, %f51, %f52;\n"
" sqrt.approx.ftz.f32 %f54, %f53;\n"
" mul.ftz.f32 %f55, %f51, %f53;\n"
" ld.shared.v2.f32 {%f56,%f57}, [%rd42+0];\n"
" mul.ftz.f32 %f58, %f56, %f54;\n"
" sub.ftz.f32 %f59, %f58, %f57;\n"
" mul.ftz.f32 %f60, %f55, %f59;\n"
" .loc 16 311 0\n"
" fma.rn.ftz.f32 %f33, %f44, %f60, %f33;\n"
" .loc 16 312 0\n"
" fma.rn.ftz.f32 %f32, %f43, %f60, %f32;\n"
" .loc 16 313 0\n"
" fma.rn.ftz.f32 %f31, %f45, %f60, %f31;\n"
" ld.param.s32 %r42, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r43, 0;\n"
" setp.le.s32 %p8, %r42, %r43;\n"
" @%p8 bra $Lt_1_25090;\n"
" .loc 16 316 0\n"
" add.u64 %rd43, %rd41, %rd13;\n"
" ld.shared.v4.f32 {%f61,%f62,%f63,_}, [%rd43+0];\n"
" mul.ftz.f32 %f64, %f61, %f54;\n"
" sub.ftz.f32 %f65, %f64, %f62;\n"
" mul.ftz.f32 %f66, %f53, %f65;\n"
" .loc 16 317 0\n"
" shr.s32 %r44, %r32, 30;\n"
" and.b32 %r45, %r44, 3;\n"
" cvt.s64.s32 %rd44, %r45;\n"
" mul.wide.s32 %rd45, %r45, 4;\n"
" add.u64 %rd46, %rd1, %rd45;\n"
" ld.shared.f32 %f67, [%rd46+0];\n"
" sub.ftz.f32 %f68, %f66, %f63;\n"
" fma.rn.ftz.f32 %f34, %f67, %f68, %f34;\n"
"$Lt_1_25090:\n"
" ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r47, 0;\n"
" setp.le.s32 %p9, %r46, %r47;\n"
" @%p9 bra $Lt_1_25602;\n"
" .loc 16 320 0\n"
" mov.f32 %f69, %f11;\n"
" mul.ftz.f32 %f70, %f44, %f44;\n"
" fma.rn.ftz.f32 %f71, %f60, %f70, %f69;\n"
" mov.f32 %f11, %f71;\n"
" .loc 16 321 0\n"
" mov.f32 %f72, %f13;\n"
" fma.rn.ftz.f32 %f73, %f60, %f46, %f72;\n"
" mov.f32 %f13, %f73;\n"
" .loc 16 322 0\n"
" mov.f32 %f74, %f15;\n"
" mul.ftz.f32 %f75, %f45, %f45;\n"
" fma.rn.ftz.f32 %f76, %f60, %f75, %f74;\n"
" mov.f32 %f15, %f76;\n"
" .loc 16 323 0\n"
" mov.f32 %f77, %f17;\n"
" mul.ftz.f32 %f78, %f43, %f44;\n"
" fma.rn.ftz.f32 %f79, %f60, %f78, %f77;\n"
" mov.f32 %f17, %f79;\n"
" .loc 16 324 0\n"
" mov.f32 %f80, %f19;\n"
" mul.ftz.f32 %f81, %f44, %f45;\n"
" fma.rn.ftz.f32 %f82, %f60, %f81, %f80;\n"
" mov.f32 %f19, %f82;\n"
" .loc 16 325 0\n"
" mul.ftz.f32 %f83, %f43, %f45;\n"
" fma.rn.ftz.f32 %f20, %f60, %f83, %f20;\n"
" mov.f32 %f21, %f20;\n"
"$Lt_1_25602:\n"
"$Lt_1_24578:\n"
" .loc 16 290 0\n"
" mul.lo.u64 %rd47, %rd39, 4;\n"
" add.u64 %rd31, %rd31, %rd47;\n"
" setp.lt.u64 %p10, %rd31, %rd28;\n"
" @%p10 bra $Lt_1_24322;\n"
" bra.uni $Lt_1_22786;\n"
"$Lt_1_32002:\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
" bra.uni $Lt_1_22786;\n"
"$Lt_1_23042:\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
"$Lt_1_22786:\n"
" mov.u32 %r48, 1;\n"
" setp.le.s32 %p11, %r6, %r48;\n"
" @%p11 bra $Lt_1_28418;\n"
" .loc 16 336 0\n"
" mov.u64 %rd48, __cuda___cuda_local_var_32737_35_non_const_red_acc7168;\n"
" cvt.s64.s32 %rd49, %r1;\n"
" mul.wide.s32 %rd50, %r1, 4;\n"
" add.u64 %rd51, %rd48, %rd50;\n"
" mov.f32 %f84, %f33;\n"
" st.shared.f32 [%rd51+0], %f84;\n"
" .loc 16 337 0\n"
" mov.f32 %f85, %f32;\n"
" st.shared.f32 [%rd51+512], %f85;\n"
" .loc 16 338 0\n"
" mov.f32 %f86, %f31;\n"
" st.shared.f32 [%rd51+1024], %f86;\n"
" .loc 16 339 0\n"
" mov.f32 %f87, %f34;\n"
" st.shared.f32 [%rd51+1536], %f87;\n"
" .loc 16 341 0\n"
" shr.s32 %r49, %r6, 31;\n"
" mov.s32 %r50, 1;\n"
" and.b32 %r51, %r49, %r50;\n"
" add.s32 %r52, %r51, %r6;\n"
" shr.s32 %r53, %r52, 1;\n"
" mov.s32 %r54, %r53;\n"
" mov.u32 %r55, 0;\n"
" setp.ne.u32 %p12, %r53, %r55;\n"
" @!%p12 bra $Lt_1_26882;\n"
"$Lt_1_27394:\n"
" setp.ge.u32 %p13, %r10, %r54;\n"
" @%p13 bra $Lt_1_27650;\n"
" .loc 16 344 0\n"
" add.u32 %r56, %r1, %r54;\n"
" cvt.u64.u32 %rd52, %r56;\n"
" mul.wide.u32 %rd53, %r56, 4;\n"
" add.u64 %rd54, %rd48, %rd53;\n"
" ld.shared.f32 %f88, [%rd54+0];\n"
" add.ftz.f32 %f84, %f88, %f84;\n"
" st.shared.f32 [%rd51+0], %f84;\n"
" ld.shared.f32 %f89, [%rd54+512];\n"
" add.ftz.f32 %f85, %f89, %f85;\n"
" st.shared.f32 [%rd51+512], %f85;\n"
" ld.shared.f32 %f90, [%rd54+1024];\n"
" add.ftz.f32 %f86, %f90, %f86;\n"
" st.shared.f32 [%rd51+1024], %f86;\n"
" ld.shared.f32 %f91, [%rd54+1536];\n"
" add.ftz.f32 %f87, %f91, %f87;\n"
" st.shared.f32 [%rd51+1536], %f87;\n"
"$Lt_1_27650:\n"
" .loc 16 341 0\n"
" shr.u32 %r54, %r54, 1;\n"
" mov.u32 %r57, 0;\n"
" setp.ne.u32 %p14, %r54, %r57;\n"
" @%p14 bra $Lt_1_27394;\n"
"$Lt_1_26882:\n"
" .loc 16 348 0\n"
" mov.f32 %f33, %f84;\n"
" .loc 16 349 0\n"
" mov.f32 %f32, %f85;\n"
" .loc 16 350 0\n"
" mov.f32 %f31, %f86;\n"
" .loc 16 351 0\n"
" mov.f32 %f34, %f87;\n"
" ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r59, 0;\n"
" setp.le.s32 %p15, %r58, %r59;\n"
" @%p15 bra $Lt_1_28418;\n"
" .loc 16 355 0\n"
" mov.f32 %f84, %f11;\n"
" st.shared.f32 [%rd51+0], %f84;\n"
" mov.f32 %f85, %f13;\n"
" st.shared.f32 [%rd51+512], %f85;\n"
" mov.f32 %f86, %f15;\n"
" st.shared.f32 [%rd51+1024], %f86;\n"
" mov.f32 %f87, %f17;\n"
" st.shared.f32 [%rd51+1536], %f87;\n"
" mov.f32 %f92, %f19;\n"
" st.shared.f32 [%rd51+2048], %f92;\n"
" mov.f32 %f93, %f21;\n"
" st.shared.f32 [%rd51+2560], %f93;\n"
" .loc 16 357 0\n"
" mov.s32 %r60, %r53;\n"
" @!%p12 bra $Lt_1_28930;\n"
"$Lt_1_29442:\n"
" setp.ge.u32 %p16, %r10, %r60;\n"
" @%p16 bra $Lt_1_29698;\n"
" .loc 16 360 0\n"
" add.u32 %r61, %r1, %r60;\n"
" cvt.u64.u32 %rd55, %r61;\n"
" mul.wide.u32 %rd56, %r61, 4;\n"
" add.u64 %rd57, %rd48, %rd56;\n"
" ld.shared.f32 %f94, [%rd57+0];\n"
" add.ftz.f32 %f84, %f94, %f84;\n"
" st.shared.f32 [%rd51+0], %f84;\n"
" ld.shared.f32 %f95, [%rd57+512];\n"
" add.ftz.f32 %f85, %f95, %f85;\n"
" st.shared.f32 [%rd51+512], %f85;\n"
" ld.shared.f32 %f96, [%rd57+1024];\n"
" add.ftz.f32 %f86, %f96, %f86;\n"
" st.shared.f32 [%rd51+1024], %f86;\n"
" ld.shared.f32 %f97, [%rd57+1536];\n"
" add.ftz.f32 %f87, %f97, %f87;\n"
" st.shared.f32 [%rd51+1536], %f87;\n"
" ld.shared.f32 %f98, [%rd57+2048];\n"
" add.ftz.f32 %f92, %f98, %f92;\n"
" st.shared.f32 [%rd51+2048], %f92;\n"
" ld.shared.f32 %f99, [%rd57+2560];\n"
" add.ftz.f32 %f93, %f99, %f93;\n"
" st.shared.f32 [%rd51+2560], %f93;\n"
"$Lt_1_29698:\n"
" .loc 16 357 0\n"
" shr.u32 %r60, %r60, 1;\n"
" mov.u32 %r62, 0;\n"
" setp.ne.u32 %p17, %r60, %r62;\n"
" @%p17 bra $Lt_1_29442;\n"
"$Lt_1_28930:\n"
" .loc 16 365 0\n"
" mov.f32 %f11, %f84;\n"
" mov.f32 %f13, %f85;\n"
" mov.f32 %f15, %f86;\n"
" mov.f32 %f17, %f87;\n"
" mov.f32 %f19, %f92;\n"
" mov.f32 %f21, %f93;\n"
"$Lt_1_28418:\n"
"$Lt_1_26370:\n"
" selp.s32 %r63, 1, 0, %p4;\n"
" mov.s32 %r64, 0;\n"
" set.eq.u32.s32 %r65, %r10, %r64;\n"
" neg.s32 %r66, %r65;\n"
" and.b32 %r67, %r63, %r66;\n"
" mov.u32 %r68, 0;\n"
" setp.eq.s32 %p18, %r67, %r68;\n"
" @%p18 bra $Lt_1_30466;\n"
" .loc 16 371 0\n"
" cvt.s64.s32 %rd58, %r13;\n"
" ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];\n"
" mul.wide.s32 %rd60, %r13, 4;\n"
" add.u64 %rd61, %rd59, %rd60;\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p19, %r69, %r70;\n"
" @%p19 bra $Lt_1_30978;\n"
" .loc 16 373 0\n"
" st.global.f32 [%rd61+0], %f34;\n"
" .loc 16 374 0\n"
" cvt.s64.s32 %rd62, %r14;\n"
" mul.wide.s32 %rd63, %r14, 4;\n"
" add.u64 %rd61, %rd61, %rd63;\n"
"$Lt_1_30978:\n"
" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r72, 0;\n"
" setp.le.s32 %p20, %r71, %r72;\n"
" @%p20 bra $Lt_1_31490;\n"
" .loc 16 378 0\n"
" mov.f32 %f100, %f11;\n"
" st.global.f32 [%rd61+0], %f100;\n"
" .loc 16 379 0\n"
" cvt.s64.s32 %rd64, %r14;\n"
" mul.wide.s32 %rd65, %r14, 4;\n"
" add.u64 %rd66, %rd65, %rd61;\n"
" .loc 16 378 0\n"
" mov.f32 %f101, %f13;\n"
" st.global.f32 [%rd66+0], %f101;\n"
" .loc 16 379 0\n"
" add.u64 %rd67, %rd65, %rd66;\n"
" .loc 16 378 0\n"
" mov.f32 %f102, %f15;\n"
" st.global.f32 [%rd67+0], %f102;\n"
" .loc 16 379 0\n"
" add.u64 %rd68, %rd65, %rd67;\n"
" .loc 16 378 0\n"
" mov.f32 %f103, %f17;\n"
" st.global.f32 [%rd68+0], %f103;\n"
" .loc 16 379 0\n"
" add.u64 %rd61, %rd65, %rd68;\n"
" .loc 16 378 0\n"
" mov.f32 %f104, %f19;\n"
" st.global.f32 [%rd61+0], %f104;\n"
" mov.f32 %f105, %f21;\n"
" add.u64 %rd69, %rd65, %rd61;\n"
" st.global.f32 [%rd69+0], %f105;\n"
"$Lt_1_31490:\n"
" .loc 16 382 0\n"
" ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];\n"
" mul.lo.u64 %rd71, %rd58, 16;\n"
" add.u64 %rd72, %rd70, %rd71;\n"
" mov.f32 %f106, %f107;\n"
" st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f106};\n"
"$Lt_1_30466:\n"
" .loc 16 384 0\n"
" exit;\n"
"$LDWend_kernel_pair_fast:\n"
" }\n"
;


@ -0,0 +1,979 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_0000bd91_00000000-9_lj_cut_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.gvU1PY)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_0000bd91_00000000-8_lj_cut_gpu_kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lj_cut_gpu_kernel.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
.global .texref pos_tex;
.entry kernel_pair (
.param .u64 __cudaparm_kernel_pair_x_,
.param .u64 __cudaparm_kernel_pair_lj1,
.param .u64 __cudaparm_kernel_pair_lj3,
.param .s32 __cudaparm_kernel_pair_lj_types,
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_dev_nbor,
.param .u64 __cudaparm_kernel_pair_dev_packed,
.param .u64 __cudaparm_kernel_pair_ans,
.param .u64 __cudaparm_kernel_pair_engv,
.param .s32 __cudaparm_kernel_pair_eflag,
.param .s32 __cudaparm_kernel_pair_vflag,
.param .s32 __cudaparm_kernel_pair_inum,
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_t_per_atom)
{
.reg .u32 %r<72>;
.reg .u64 %rd<62>;
.reg .f32 %f<102>;
.reg .pred %p<19>;
.shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];
.shared .align 4 .b8 __cuda___cuda_local_var_32581_35_non_const_red_acc108[3072];
// __cuda_local_var_32504_10_non_const_f = 48
// __cuda_local_var_32508_9_non_const_virial = 16
.loc 16 88 0
$LDWbegin_kernel_pair:
.loc 16 95 0
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
ldu.global.f32 %f1, [%rd1+0];
.loc 16 96 0
ld.global.f32 %f2, [%rd1+4];
.loc 16 97 0
ld.global.f32 %f3, [%rd1+8];
.loc 16 98 0
ld.global.f32 %f4, [%rd1+12];
st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
.loc 16 107 0
mov.f32 %f5, 0f00000000; // 0
mov.f32 %f6, %f5;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, %f7;
mov.f32 %f9, 0f00000000; // 0
mov.f32 %f10, %f9;
mov.f32 %f11, 0f00000000; // 0
mov.f32 %f12, %f11;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, %f13;
mov.f32 %f15, 0f00000000; // 0
mov.f32 %f16, %f15;
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
rem.s32 %r6, %r2, %r1;
cvt.s32.u32 %r7, %ctaid.x;
mul.lo.s32 %r8, %r7, %r5;
add.s32 %r9, %r3, %r8;
ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];
setp.lt.s32 %p1, %r9, %r10;
@!%p1 bra $Lt_0_19202;
.loc 16 113 0
ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];
cvt.s64.s32 %rd2, %r11;
mul.wide.s32 %rd3, %r11, 4;
cvt.s64.s32 %rd4, %r9;
mul.wide.s32 %rd5, %r9, 4;
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
add.u64 %rd7, %rd5, %rd6;
add.u64 %rd8, %rd3, %rd7;
ld.global.s32 %r12, [%rd8+0];
add.u64 %rd9, %rd3, %rd8;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];
setp.ne.u64 %p2, %rd10, %rd6;
@%p2 bra $Lt_0_19714;
.loc 16 119 0
cvt.s32.s64 %r13, %rd2;
mul.lo.s32 %r14, %r13, %r12;
cvt.s64.s32 %rd11, %r14;
mul.wide.s32 %rd12, %r14, 4;
add.u64 %rd13, %rd9, %rd12;
.loc 16 120 0
mul.lo.s32 %r15, %r6, %r13;
cvt.s64.s32 %rd14, %r15;
mul.wide.s32 %rd15, %r15, 4;
add.u64 %rd16, %rd9, %rd15;
.loc 16 121 0
mul.lo.s32 %r16, %r13, %r1;
bra.uni $Lt_0_19458;
$Lt_0_19714:
.loc 16 123 0
ld.global.s32 %r17, [%rd9+0];
cvt.s64.s32 %rd17, %r17;
mul.wide.s32 %rd18, %r17, 4;
add.u64 %rd19, %rd10, %rd18;
.loc 16 124 0
cvt.s64.s32 %rd20, %r12;
mul.wide.s32 %rd21, %r12, 4;
add.u64 %rd13, %rd19, %rd21;
.loc 16 125 0
mov.s32 %r16, %r1;
.loc 16 126 0
cvt.s64.s32 %rd22, %r6;
mul.wide.s32 %rd23, %r6, 4;
add.u64 %rd16, %rd19, %rd23;
$Lt_0_19458:
.loc 16 129 0
ld.global.s32 %r18, [%rd7+0];
mov.u32 %r19, %r18;
mov.s32 %r20, 0;
mov.u32 %r21, %r20;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];
mov.f32 %f21, %f17;
mov.f32 %f22, %f18;
mov.f32 %f23, %f19;
mov.f32 %f24, %f20;
setp.ge.u64 %p3, %rd16, %rd13;
@%p3 bra $Lt_0_28162;
cvt.rzi.ftz.s32.f32 %r26, %f24;
cvt.s64.s32 %rd24, %r16;
ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];
mul.lo.s32 %r28, %r27, %r26;
ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;
$Lt_0_20482:
//<loop> Loop body line 129, nesting depth: 1, estimated iterations: unknown
.loc 16 135 0
ld.global.s32 %r29, [%rd16+0];
.loc 16 136 0
shr.s32 %r30, %r29, 30;
and.b32 %r31, %r30, 3;
cvt.s64.s32 %rd27, %r31;
mul.wide.s32 %rd28, %r31, 4;
add.u64 %rd29, %rd26, %rd28;
ld.shared.f32 %f29, [%rd29+0];
.loc 16 139 0
and.b32 %r32, %r29, 1073741823;
mov.u32 %r33, %r32;
mov.s32 %r34, 0;
mov.u32 %r35, %r34;
mov.s32 %r36, 0;
mov.u32 %r37, %r36;
mov.s32 %r38, 0;
mov.u32 %r39, %r38;
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];
mov.f32 %f34, %f30;
mov.f32 %f35, %f31;
mov.f32 %f36, %f32;
mov.f32 %f37, %f33;
cvt.rzi.ftz.s32.f32 %r40, %f37;
sub.ftz.f32 %f38, %f22, %f35;
sub.ftz.f32 %f39, %f21, %f34;
sub.ftz.f32 %f40, %f23, %f36;
mul.ftz.f32 %f41, %f38, %f38;
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
add.s32 %r41, %r40, %r28;
cvt.s64.s32 %rd30, %r41;
mul.wide.s32 %rd31, %r41, 16;
add.u64 %rd32, %rd31, %rd25;
ld.global.f32 %f44, [%rd32+8];
setp.gt.ftz.f32 %p4, %f44, %f43;
@!%p4 bra $Lt_0_21762;
.loc 16 153 0
rcp.approx.ftz.f32 %f45, %f43;
mul.ftz.f32 %f46, %f45, %f45;
mul.ftz.f32 %f47, %f45, %f46;
mul.ftz.f32 %f48, %f45, %f47;
ld.global.v2.f32 {%f49,%f50}, [%rd32+0];
mul.ftz.f32 %f51, %f49, %f47;
sub.ftz.f32 %f52, %f51, %f50;
mul.ftz.f32 %f53, %f48, %f52;
mul.ftz.f32 %f54, %f29, %f53;
.loc 16 155 0
fma.rn.ftz.f32 %f27, %f39, %f54, %f27;
.loc 16 156 0
fma.rn.ftz.f32 %f26, %f38, %f54, %f26;
.loc 16 157 0
fma.rn.ftz.f32 %f25, %f40, %f54, %f25;
ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];
mov.u32 %r43, 0;
setp.le.s32 %p5, %r42, %r43;
@%p5 bra $Lt_0_21250;
.loc 16 161 0
ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];
add.u64 %rd34, %rd33, %rd31;
ld.global.v4.f32 {%f55,%f56,%f57,_}, [%rd34+0];
mul.ftz.f32 %f58, %f55, %f47;
sub.ftz.f32 %f59, %f58, %f56;
mul.ftz.f32 %f60, %f47, %f59;
sub.ftz.f32 %f61, %f60, %f57;
fma.rn.ftz.f32 %f28, %f29, %f61, %f28;
$Lt_0_21250:
ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];
mov.u32 %r45, 0;
setp.le.s32 %p6, %r44, %r45;
@%p6 bra $Lt_0_21762;
.loc 16 164 0
mov.f32 %f62, %f6;
mul.ftz.f32 %f63, %f39, %f39;
fma.rn.ftz.f32 %f64, %f54, %f63, %f62;
mov.f32 %f6, %f64;
.loc 16 165 0
mov.f32 %f65, %f8;
fma.rn.ftz.f32 %f66, %f54, %f41, %f65;
mov.f32 %f8, %f66;
.loc 16 166 0
mov.f32 %f67, %f10;
mul.ftz.f32 %f68, %f40, %f40;
fma.rn.ftz.f32 %f69, %f54, %f68, %f67;
mov.f32 %f10, %f69;
.loc 16 167 0
mov.f32 %f70, %f12;
mul.ftz.f32 %f71, %f38, %f39;
fma.rn.ftz.f32 %f72, %f54, %f71, %f70;
mov.f32 %f12, %f72;
.loc 16 168 0
mov.f32 %f73, %f14;
mul.ftz.f32 %f74, %f39, %f40;
fma.rn.ftz.f32 %f75, %f54, %f74, %f73;
mov.f32 %f14, %f75;
.loc 16 169 0
mul.ftz.f32 %f76, %f38, %f40;
fma.rn.ftz.f32 %f15, %f54, %f76, %f15;
mov.f32 %f16, %f15;
$Lt_0_21762:
$Lt_0_20738:
.loc 16 133 0
mul.lo.u64 %rd35, %rd24, 4;
add.u64 %rd16, %rd16, %rd35;
setp.lt.u64 %p7, %rd16, %rd13;
@%p7 bra $Lt_0_20482;
bra.uni $Lt_0_18946;
$Lt_0_28162:
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
bra.uni $Lt_0_18946;
$Lt_0_19202:
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
$Lt_0_18946:
mov.u32 %r46, 1;
setp.le.s32 %p8, %r1, %r46;
@%p8 bra $Lt_0_24578;
.loc 16 180 0
mov.u64 %rd36, __cuda___cuda_local_var_32581_35_non_const_red_acc108;
cvt.s64.s32 %rd37, %r2;
mul.wide.s32 %rd38, %r2, 4;
add.u64 %rd39, %rd36, %rd38;
mov.f32 %f77, %f27;
st.shared.f32 [%rd39+0], %f77;
.loc 16 181 0
mov.f32 %f78, %f26;
st.shared.f32 [%rd39+512], %f78;
.loc 16 182 0
mov.f32 %f79, %f25;
st.shared.f32 [%rd39+1024], %f79;
.loc 16 183 0
mov.f32 %f80, %f28;
st.shared.f32 [%rd39+1536], %f80;
.loc 16 185 0
shr.s32 %r47, %r1, 31;
mov.s32 %r48, 1;
and.b32 %r49, %r47, %r48;
add.s32 %r50, %r49, %r1;
shr.s32 %r51, %r50, 1;
mov.s32 %r52, %r51;
mov.u32 %r53, 0;
setp.ne.u32 %p9, %r51, %r53;
@!%p9 bra $Lt_0_23042;
$Lt_0_23554:
setp.ge.u32 %p10, %r6, %r52;
@%p10 bra $Lt_0_23810;
.loc 16 188 0
add.u32 %r54, %r2, %r52;
cvt.u64.u32 %rd40, %r54;
mul.wide.u32 %rd41, %r54, 4;
add.u64 %rd42, %rd36, %rd41;
ld.shared.f32 %f81, [%rd42+0];
add.ftz.f32 %f77, %f81, %f77;
st.shared.f32 [%rd39+0], %f77;
ld.shared.f32 %f82, [%rd42+512];
add.ftz.f32 %f78, %f82, %f78;
st.shared.f32 [%rd39+512], %f78;
ld.shared.f32 %f83, [%rd42+1024];
add.ftz.f32 %f79, %f83, %f79;
st.shared.f32 [%rd39+1024], %f79;
ld.shared.f32 %f84, [%rd42+1536];
add.ftz.f32 %f80, %f84, %f80;
st.shared.f32 [%rd39+1536], %f80;
$Lt_0_23810:
.loc 16 185 0
shr.u32 %r52, %r52, 1;
mov.u32 %r55, 0;
setp.ne.u32 %p11, %r52, %r55;
@%p11 bra $Lt_0_23554;
$Lt_0_23042:
.loc 16 192 0
mov.f32 %f27, %f77;
.loc 16 193 0
mov.f32 %f26, %f78;
.loc 16 194 0
mov.f32 %f25, %f79;
.loc 16 195 0
mov.f32 %f28, %f80;
ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];
mov.u32 %r57, 0;
setp.le.s32 %p12, %r56, %r57;
@%p12 bra $Lt_0_24578;
.loc 16 199 0
mov.f32 %f77, %f6;
st.shared.f32 [%rd39+0], %f77;
mov.f32 %f78, %f8;
st.shared.f32 [%rd39+512], %f78;
mov.f32 %f79, %f10;
st.shared.f32 [%rd39+1024], %f79;
mov.f32 %f80, %f12;
st.shared.f32 [%rd39+1536], %f80;
mov.f32 %f85, %f14;
st.shared.f32 [%rd39+2048], %f85;
mov.f32 %f86, %f16;
st.shared.f32 [%rd39+2560], %f86;
.loc 16 201 0
mov.s32 %r58, %r51;
@!%p9 bra $Lt_0_25090;
$Lt_0_25602:
setp.ge.u32 %p13, %r6, %r58;
@%p13 bra $Lt_0_25858;
.loc 16 204 0
add.u32 %r59, %r2, %r58;
cvt.u64.u32 %rd43, %r59;
mul.wide.u32 %rd44, %r59, 4;
add.u64 %rd45, %rd36, %rd44;
ld.shared.f32 %f87, [%rd45+0];
add.ftz.f32 %f77, %f87, %f77;
st.shared.f32 [%rd39+0], %f77;
ld.shared.f32 %f88, [%rd45+512];
add.ftz.f32 %f78, %f88, %f78;
st.shared.f32 [%rd39+512], %f78;
ld.shared.f32 %f89, [%rd45+1024];
add.ftz.f32 %f79, %f89, %f79;
st.shared.f32 [%rd39+1024], %f79;
ld.shared.f32 %f90, [%rd45+1536];
add.ftz.f32 %f80, %f90, %f80;
st.shared.f32 [%rd39+1536], %f80;
ld.shared.f32 %f91, [%rd45+2048];
add.ftz.f32 %f85, %f91, %f85;
st.shared.f32 [%rd39+2048], %f85;
ld.shared.f32 %f92, [%rd45+2560];
add.ftz.f32 %f86, %f92, %f86;
st.shared.f32 [%rd39+2560], %f86;
$Lt_0_25858:
.loc 16 201 0
shr.u32 %r58, %r58, 1;
mov.u32 %r60, 0;
setp.ne.u32 %p14, %r58, %r60;
@%p14 bra $Lt_0_25602;
$Lt_0_25090:
.loc 16 209 0
mov.f32 %f6, %f77;
mov.f32 %f8, %f78;
mov.f32 %f10, %f79;
mov.f32 %f12, %f80;
mov.f32 %f14, %f85;
mov.f32 %f16, %f86;
$Lt_0_24578:
$Lt_0_22530:
selp.s32 %r61, 1, 0, %p1;
mov.s32 %r62, 0;
set.eq.u32.s32 %r63, %r6, %r62;
neg.s32 %r64, %r63;
and.b32 %r65, %r61, %r64;
mov.u32 %r66, 0;
setp.eq.s32 %p15, %r65, %r66;
@%p15 bra $Lt_0_26626;
.loc 16 215 0
cvt.s64.s32 %rd46, %r9;
ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];
mul.wide.s32 %rd48, %r9, 4;
add.u64 %rd49, %rd47, %rd48;
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
mov.u32 %r68, 0;
setp.le.s32 %p16, %r67, %r68;
@%p16 bra $Lt_0_27138;
.loc 16 217 0
st.global.f32 [%rd49+0], %f28;
.loc 16 218 0
cvt.s64.s32 %rd50, %r10;
mul.wide.s32 %rd51, %r10, 4;
add.u64 %rd49, %rd49, %rd51;
$Lt_0_27138:
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
mov.u32 %r70, 0;
setp.le.s32 %p17, %r69, %r70;
@%p17 bra $Lt_0_27650;
.loc 16 222 0
mov.f32 %f93, %f6;
st.global.f32 [%rd49+0], %f93;
.loc 16 223 0
cvt.s64.s32 %rd52, %r10;
mul.wide.s32 %rd53, %r10, 4;
add.u64 %rd54, %rd53, %rd49;
.loc 16 222 0
mov.f32 %f94, %f8;
st.global.f32 [%rd54+0], %f94;
.loc 16 223 0
add.u64 %rd55, %rd53, %rd54;
.loc 16 222 0
mov.f32 %f95, %f10;
st.global.f32 [%rd55+0], %f95;
.loc 16 223 0
add.u64 %rd56, %rd53, %rd55;
.loc 16 222 0
mov.f32 %f96, %f12;
st.global.f32 [%rd56+0], %f96;
.loc 16 223 0
add.u64 %rd49, %rd53, %rd56;
.loc 16 222 0
mov.f32 %f97, %f14;
st.global.f32 [%rd49+0], %f97;
mov.f32 %f98, %f16;
add.u64 %rd57, %rd53, %rd49;
st.global.f32 [%rd57+0], %f98;
$Lt_0_27650:
.loc 16 226 0
ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];
mul.lo.u64 %rd59, %rd46, 16;
add.u64 %rd60, %rd58, %rd59;
mov.f32 %f99, %f100;
st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f99};
$Lt_0_26626:
.loc 16 228 0
exit;
$LDWend_kernel_pair:
} // kernel_pair
.entry kernel_pair_fast (
.param .u64 __cudaparm_kernel_pair_fast_x_,
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
.param .u64 __cudaparm_kernel_pair_fast_ans,
.param .u64 __cudaparm_kernel_pair_fast_engv,
.param .s32 __cudaparm_kernel_pair_fast_eflag,
.param .s32 __cudaparm_kernel_pair_fast_vflag,
.param .s32 __cudaparm_kernel_pair_fast_inum,
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
{
.reg .u32 %r<74>;
.reg .u64 %rd<74>;
.reg .f32 %f<109>;
.reg .pred %p<22>;
.shared .align 4 .b8 __cuda___cuda_local_var_32647_33_non_const_sp_lj3268[16];
.shared .align 16 .b8 __cuda___cuda_local_var_32645_34_non_const_lj13296[1936];
.shared .align 16 .b8 __cuda___cuda_local_var_32646_34_non_const_lj35232[1936];
.shared .align 4 .b8 __cuda___cuda_local_var_32735_35_non_const_red_acc7168[3072];
// __cuda_local_var_32657_10_non_const_f = 48
// __cuda_local_var_32661_9_non_const_virial = 16
.loc 16 236 0
$LDWbegin_kernel_pair_fast:
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 3;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_21250;
.loc 16 246 0
mov.u64 %rd1, __cuda___cuda_local_var_32647_33_non_const_sp_lj3268;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd3, %r1, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_21250:
mov.u64 %rd1, __cuda___cuda_local_var_32647_33_non_const_sp_lj3268;
mov.u32 %r3, 120;
setp.gt.s32 %p2, %r1, %r3;
@%p2 bra $Lt_1_21762;
.loc 16 248 0
mov.u64 %rd7, __cuda___cuda_local_var_32645_34_non_const_lj13296;
cvt.s64.s32 %rd8, %r1;
mul.wide.s32 %rd9, %r1, 16;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
add.u64 %rd11, %rd10, %rd9;
add.u64 %rd12, %rd9, %rd7;
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r5, 0;
setp.le.s32 %p3, %r4, %r5;
@%p3 bra $Lt_1_22274;
.loc 16 250 0
mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232;
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
add.u64 %rd15, %rd14, %rd9;
add.u64 %rd16, %rd9, %rd13;
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
$Lt_1_22274:
mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232;
$Lt_1_21762:
mov.u64 %rd7, __cuda___cuda_local_var_32645_34_non_const_lj13296;
mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232;
.loc 16 260 0
mov.f32 %f10, 0f00000000; // 0
mov.f32 %f11, %f10;
mov.f32 %f12, 0f00000000; // 0
mov.f32 %f13, %f12;
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, %f14;
mov.f32 %f16, 0f00000000; // 0
mov.f32 %f17, %f16;
mov.f32 %f18, 0f00000000; // 0
mov.f32 %f19, %f18;
mov.f32 %f20, 0f00000000; // 0
mov.f32 %f21, %f20;
.loc 16 262 0
bar.sync 0;
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
div.s32 %r7, %r1, %r6;
cvt.s32.u32 %r8, %ntid.x;
div.s32 %r9, %r8, %r6;
rem.s32 %r10, %r1, %r6;
cvt.s32.u32 %r11, %ctaid.x;
mul.lo.s32 %r12, %r11, %r9;
add.s32 %r13, %r7, %r12;
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];
setp.lt.s32 %p4, %r13, %r14;
@!%p4 bra $Lt_1_23042;
.loc 16 268 0
ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];
cvt.s64.s32 %rd17, %r15;
mul.wide.s32 %rd18, %r15, 4;
cvt.s64.s32 %rd19, %r13;
mul.wide.s32 %rd20, %r13, 4;
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
add.u64 %rd22, %rd20, %rd21;
add.u64 %rd23, %rd18, %rd22;
ld.global.s32 %r16, [%rd23+0];
add.u64 %rd24, %rd18, %rd23;
ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];
setp.ne.u64 %p5, %rd25, %rd21;
@%p5 bra $Lt_1_23554;
.loc 16 274 0
cvt.s32.s64 %r17, %rd17;
mul.lo.s32 %r18, %r17, %r16;
cvt.s64.s32 %rd26, %r18;
mul.wide.s32 %rd27, %r18, 4;
add.u64 %rd28, %rd24, %rd27;
.loc 16 275 0
mul.lo.s32 %r19, %r10, %r17;
cvt.s64.s32 %rd29, %r19;
mul.wide.s32 %rd30, %r19, 4;
add.u64 %rd31, %rd24, %rd30;
.loc 16 276 0
mul.lo.s32 %r20, %r17, %r6;
bra.uni $Lt_1_23298;
$Lt_1_23554:
.loc 16 278 0
ld.global.s32 %r21, [%rd24+0];
cvt.s64.s32 %rd32, %r21;
mul.wide.s32 %rd33, %r21, 4;
add.u64 %rd34, %rd25, %rd33;
.loc 16 279 0
cvt.s64.s32 %rd35, %r16;
mul.wide.s32 %rd36, %r16, 4;
add.u64 %rd28, %rd34, %rd36;
.loc 16 280 0
mov.s32 %r20, %r6;
.loc 16 281 0
cvt.s64.s32 %rd37, %r10;
mul.wide.s32 %rd38, %r10, 4;
add.u64 %rd31, %rd34, %rd38;
$Lt_1_23298:
.loc 16 284 0
ld.global.s32 %r22, [%rd22+0];
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
mov.s32 %r26, 0;
mov.u32 %r27, %r26;
mov.s32 %r28, 0;
mov.u32 %r29, %r28;
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];
mov.f32 %f26, %f22;
mov.f32 %f27, %f23;
mov.f32 %f28, %f24;
mov.f32 %f29, %f25;
setp.ge.u64 %p6, %rd31, %rd28;
@%p6 bra $Lt_1_32002;
cvt.rzi.ftz.s32.f32 %r30, %f29;
cvt.s64.s32 %rd39, %r20;
mul.lo.s32 %r31, %r30, 11;
cvt.rn.f32.s32 %f30, %r31;
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_24322:
//<loop> Loop body line 284, nesting depth: 1, estimated iterations: unknown
.loc 16 291 0
ld.global.s32 %r32, [%rd31+0];
.loc 16 292 0
shr.s32 %r33, %r32, 30;
and.b32 %r34, %r33, 3;
cvt.s64.s32 %rd40, %r34;
mul.wide.s32 %rd41, %r34, 4;
add.u64 %rd42, %rd1, %rd41;
ld.shared.f32 %f35, [%rd42+0];
.loc 16 295 0
and.b32 %r35, %r32, 1073741823;
mov.u32 %r36, %r35;
mov.s32 %r37, 0;
mov.u32 %r38, %r37;
mov.s32 %r39, 0;
mov.u32 %r40, %r39;
mov.s32 %r41, 0;
mov.u32 %r42, %r41;
tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r36,%r38,%r40,%r42}];
mov.f32 %f40, %f36;
mov.f32 %f41, %f37;
mov.f32 %f42, %f38;
mov.f32 %f43, %f39;
sub.ftz.f32 %f44, %f27, %f41;
sub.ftz.f32 %f45, %f26, %f40;
sub.ftz.f32 %f46, %f28, %f42;
mul.ftz.f32 %f47, %f44, %f44;
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
fma.rn.ftz.f32 %f49, %f46, %f46, %f48;
add.ftz.f32 %f50, %f30, %f43;
cvt.rzi.ftz.s32.f32 %r43, %f50;
cvt.s64.s32 %rd43, %r43;
mul.wide.s32 %rd44, %r43, 16;
add.u64 %rd45, %rd44, %rd7;
ld.shared.f32 %f51, [%rd45+8];
setp.gt.ftz.f32 %p7, %f51, %f49;
@!%p7 bra $Lt_1_25602;
.loc 16 307 0
rcp.approx.ftz.f32 %f52, %f49;
mul.ftz.f32 %f53, %f52, %f52;
mul.ftz.f32 %f54, %f52, %f53;
mul.ftz.f32 %f55, %f52, %f35;
mul.ftz.f32 %f56, %f54, %f55;
ld.shared.v2.f32 {%f57,%f58}, [%rd45+0];
mul.ftz.f32 %f59, %f57, %f54;
sub.ftz.f32 %f60, %f59, %f58;
mul.ftz.f32 %f61, %f56, %f60;
.loc 16 309 0
fma.rn.ftz.f32 %f33, %f45, %f61, %f33;
.loc 16 310 0
fma.rn.ftz.f32 %f32, %f44, %f61, %f32;
.loc 16 311 0
fma.rn.ftz.f32 %f31, %f46, %f61, %f31;
ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r45, 0;
setp.le.s32 %p8, %r44, %r45;
@%p8 bra $Lt_1_25090;
.loc 16 314 0
add.u64 %rd46, %rd44, %rd13;
ld.shared.v4.f32 {%f62,%f63,%f64,_}, [%rd46+0];
mul.ftz.f32 %f65, %f62, %f54;
sub.ftz.f32 %f66, %f65, %f63;
mul.ftz.f32 %f67, %f54, %f66;
.loc 16 315 0
sub.ftz.f32 %f68, %f67, %f64;
fma.rn.ftz.f32 %f34, %f35, %f68, %f34;
$Lt_1_25090:
ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r47, 0;
setp.le.s32 %p9, %r46, %r47;
@%p9 bra $Lt_1_25602;
.loc 16 318 0
mov.f32 %f69, %f11;
mul.ftz.f32 %f70, %f45, %f45;
fma.rn.ftz.f32 %f71, %f61, %f70, %f69;
mov.f32 %f11, %f71;
.loc 16 319 0
mov.f32 %f72, %f13;
fma.rn.ftz.f32 %f73, %f61, %f47, %f72;
mov.f32 %f13, %f73;
.loc 16 320 0
mov.f32 %f74, %f15;
mul.ftz.f32 %f75, %f46, %f46;
fma.rn.ftz.f32 %f76, %f61, %f75, %f74;
mov.f32 %f15, %f76;
.loc 16 321 0
mov.f32 %f77, %f17;
mul.ftz.f32 %f78, %f44, %f45;
fma.rn.ftz.f32 %f79, %f61, %f78, %f77;
mov.f32 %f17, %f79;
.loc 16 322 0
mov.f32 %f80, %f19;
mul.ftz.f32 %f81, %f45, %f46;
fma.rn.ftz.f32 %f82, %f61, %f81, %f80;
mov.f32 %f19, %f82;
.loc 16 323 0
mul.ftz.f32 %f83, %f44, %f46;
fma.rn.ftz.f32 %f20, %f61, %f83, %f20;
mov.f32 %f21, %f20;
$Lt_1_25602:
$Lt_1_24578:
.loc 16 289 0
mul.lo.u64 %rd47, %rd39, 4;
add.u64 %rd31, %rd31, %rd47;
setp.lt.u64 %p10, %rd31, %rd28;
@%p10 bra $Lt_1_24322;
bra.uni $Lt_1_22786;
$Lt_1_32002:
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
bra.uni $Lt_1_22786;
$Lt_1_23042:
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_22786:
mov.u32 %r48, 1;
setp.le.s32 %p11, %r6, %r48;
@%p11 bra $Lt_1_28418;
.loc 16 334 0
mov.u64 %rd48, __cuda___cuda_local_var_32735_35_non_const_red_acc7168;
cvt.s64.s32 %rd49, %r1;
mul.wide.s32 %rd50, %r1, 4;
add.u64 %rd51, %rd48, %rd50;
mov.f32 %f84, %f33;
st.shared.f32 [%rd51+0], %f84;
.loc 16 335 0
mov.f32 %f85, %f32;
st.shared.f32 [%rd51+512], %f85;
.loc 16 336 0
mov.f32 %f86, %f31;
st.shared.f32 [%rd51+1024], %f86;
.loc 16 337 0
mov.f32 %f87, %f34;
st.shared.f32 [%rd51+1536], %f87;
.loc 16 339 0
shr.s32 %r49, %r6, 31;
mov.s32 %r50, 1;
and.b32 %r51, %r49, %r50;
add.s32 %r52, %r51, %r6;
shr.s32 %r53, %r52, 1;
mov.s32 %r54, %r53;
mov.u32 %r55, 0;
setp.ne.u32 %p12, %r53, %r55;
@!%p12 bra $Lt_1_26882;
$Lt_1_27394:
setp.ge.u32 %p13, %r10, %r54;
@%p13 bra $Lt_1_27650;
.loc 16 342 0
add.u32 %r56, %r1, %r54;
cvt.u64.u32 %rd52, %r56;
mul.wide.u32 %rd53, %r56, 4;
add.u64 %rd54, %rd48, %rd53;
ld.shared.f32 %f88, [%rd54+0];
add.ftz.f32 %f84, %f88, %f84;
st.shared.f32 [%rd51+0], %f84;
ld.shared.f32 %f89, [%rd54+512];
add.ftz.f32 %f85, %f89, %f85;
st.shared.f32 [%rd51+512], %f85;
ld.shared.f32 %f90, [%rd54+1024];
add.ftz.f32 %f86, %f90, %f86;
st.shared.f32 [%rd51+1024], %f86;
ld.shared.f32 %f91, [%rd54+1536];
add.ftz.f32 %f87, %f91, %f87;
st.shared.f32 [%rd51+1536], %f87;
$Lt_1_27650:
.loc 16 339 0
shr.u32 %r54, %r54, 1;
mov.u32 %r57, 0;
setp.ne.u32 %p14, %r54, %r57;
@%p14 bra $Lt_1_27394;
$Lt_1_26882:
.loc 16 346 0
mov.f32 %f33, %f84;
.loc 16 347 0
mov.f32 %f32, %f85;
.loc 16 348 0
mov.f32 %f31, %f86;
.loc 16 349 0
mov.f32 %f34, %f87;
ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r59, 0;
setp.le.s32 %p15, %r58, %r59;
@%p15 bra $Lt_1_28418;
.loc 16 353 0
mov.f32 %f84, %f11;
st.shared.f32 [%rd51+0], %f84;
mov.f32 %f85, %f13;
st.shared.f32 [%rd51+512], %f85;
mov.f32 %f86, %f15;
st.shared.f32 [%rd51+1024], %f86;
mov.f32 %f87, %f17;
st.shared.f32 [%rd51+1536], %f87;
mov.f32 %f92, %f19;
st.shared.f32 [%rd51+2048], %f92;
mov.f32 %f93, %f21;
st.shared.f32 [%rd51+2560], %f93;
.loc 16 355 0
mov.s32 %r60, %r53;
@!%p12 bra $Lt_1_28930;
$Lt_1_29442:
setp.ge.u32 %p16, %r10, %r60;
@%p16 bra $Lt_1_29698;
.loc 16 358 0
add.u32 %r61, %r1, %r60;
cvt.u64.u32 %rd55, %r61;
mul.wide.u32 %rd56, %r61, 4;
add.u64 %rd57, %rd48, %rd56;
ld.shared.f32 %f94, [%rd57+0];
add.ftz.f32 %f84, %f94, %f84;
st.shared.f32 [%rd51+0], %f84;
ld.shared.f32 %f95, [%rd57+512];
add.ftz.f32 %f85, %f95, %f85;
st.shared.f32 [%rd51+512], %f85;
ld.shared.f32 %f96, [%rd57+1024];
add.ftz.f32 %f86, %f96, %f86;
st.shared.f32 [%rd51+1024], %f86;
ld.shared.f32 %f97, [%rd57+1536];
add.ftz.f32 %f87, %f97, %f87;
st.shared.f32 [%rd51+1536], %f87;
ld.shared.f32 %f98, [%rd57+2048];
add.ftz.f32 %f92, %f98, %f92;
st.shared.f32 [%rd51+2048], %f92;
ld.shared.f32 %f99, [%rd57+2560];
add.ftz.f32 %f93, %f99, %f93;
st.shared.f32 [%rd51+2560], %f93;
$Lt_1_29698:
.loc 16 355 0
shr.u32 %r60, %r60, 1;
mov.u32 %r62, 0;
setp.ne.u32 %p17, %r60, %r62;
@%p17 bra $Lt_1_29442;
$Lt_1_28930:
.loc 16 363 0
mov.f32 %f11, %f84;
mov.f32 %f13, %f85;
mov.f32 %f15, %f86;
mov.f32 %f17, %f87;
mov.f32 %f19, %f92;
mov.f32 %f21, %f93;
$Lt_1_28418:
$Lt_1_26370:
selp.s32 %r63, 1, 0, %p4;
mov.s32 %r64, 0;
set.eq.u32.s32 %r65, %r10, %r64;
neg.s32 %r66, %r65;
and.b32 %r67, %r63, %r66;
mov.u32 %r68, 0;
setp.eq.s32 %p18, %r67, %r68;
@%p18 bra $Lt_1_30466;
.loc 16 369 0
cvt.s64.s32 %rd58, %r13;
ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];
mul.wide.s32 %rd60, %r13, 4;
add.u64 %rd61, %rd59, %rd60;
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r70, 0;
setp.le.s32 %p19, %r69, %r70;
@%p19 bra $Lt_1_30978;
.loc 16 371 0
st.global.f32 [%rd61+0], %f34;
.loc 16 372 0
cvt.s64.s32 %rd62, %r14;
mul.wide.s32 %rd63, %r14, 4;
add.u64 %rd61, %rd61, %rd63;
$Lt_1_30978:
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r72, 0;
setp.le.s32 %p20, %r71, %r72;
@%p20 bra $Lt_1_31490;
.loc 16 376 0
mov.f32 %f100, %f11;
st.global.f32 [%rd61+0], %f100;
.loc 16 377 0
cvt.s64.s32 %rd64, %r14;
mul.wide.s32 %rd65, %r14, 4;
add.u64 %rd66, %rd65, %rd61;
.loc 16 376 0
mov.f32 %f101, %f13;
st.global.f32 [%rd66+0], %f101;
.loc 16 377 0
add.u64 %rd67, %rd65, %rd66;
.loc 16 376 0
mov.f32 %f102, %f15;
st.global.f32 [%rd67+0], %f102;
.loc 16 377 0
add.u64 %rd68, %rd65, %rd67;
.loc 16 376 0
mov.f32 %f103, %f17;
st.global.f32 [%rd68+0], %f103;
.loc 16 377 0
add.u64 %rd61, %rd65, %rd68;
.loc 16 376 0
mov.f32 %f104, %f19;
st.global.f32 [%rd61+0], %f104;
mov.f32 %f105, %f21;
add.u64 %rd69, %rd65, %rd61;
st.global.f32 [%rd69+0], %f105;
$Lt_1_31490:
.loc 16 380 0
ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];
mul.lo.u64 %rd71, %rd58, 16;
add.u64 %rd72, %rd70, %rd71;
mov.f32 %f106, %f107;
st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f106};
$Lt_1_30466:
.loc 16 382 0
exit;
$LDWend_kernel_pair_fast:
} // kernel_pair_fast

927
lib/gpu/lj_cut_gpu_ptx.h Normal file

@ -0,0 +1,927 @@
const char * lj_cut_gpu_kernel =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
" .global .texref pos_tex;\n"
" .entry kernel_pair (\n"
" .param .u64 __cudaparm_kernel_pair_x_,\n"
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_ans,\n"
" .param .u64 __cudaparm_kernel_pair_engv,\n"
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_inum,\n"
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
" {\n"
" .reg .u32 %r<72>;\n"
" .reg .u64 %rd<62>;\n"
" .reg .f32 %f<102>;\n"
" .reg .pred %p<19>;\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32581_35_non_const_red_acc108[3072];\n"
" .loc 16 88 0\n"
"$LDWbegin_kernel_pair:\n"
" .loc 16 95 0\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
" ldu.global.f32 %f1, [%rd1+0];\n"
" .loc 16 96 0\n"
" ld.global.f32 %f2, [%rd1+4];\n"
" .loc 16 97 0\n"
" ld.global.f32 %f3, [%rd1+8];\n"
" .loc 16 98 0\n"
" ld.global.f32 %f4, [%rd1+12];\n"
" st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n"
" .loc 16 107 0\n"
" mov.f32 %f5, 0f00000000; \n"
" mov.f32 %f6, %f5;\n"
" mov.f32 %f7, 0f00000000; \n"
" mov.f32 %f8, %f7;\n"
" mov.f32 %f9, 0f00000000; \n"
" mov.f32 %f10, %f9;\n"
" mov.f32 %f11, 0f00000000; \n"
" mov.f32 %f12, %f11;\n"
" mov.f32 %f13, 0f00000000; \n"
" mov.f32 %f14, %f13;\n"
" mov.f32 %f15, 0f00000000; \n"
" mov.f32 %f16, %f15;\n"
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
" cvt.s32.u32 %r2, %tid.x;\n"
" div.s32 %r3, %r2, %r1;\n"
" cvt.s32.u32 %r4, %ntid.x;\n"
" div.s32 %r5, %r4, %r1;\n"
" rem.s32 %r6, %r2, %r1;\n"
" cvt.s32.u32 %r7, %ctaid.x;\n"
" mul.lo.s32 %r8, %r7, %r5;\n"
" add.s32 %r9, %r3, %r8;\n"
" ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];\n"
" setp.lt.s32 %p1, %r9, %r10;\n"
" @!%p1 bra $Lt_0_19202;\n"
" .loc 16 113 0\n"
" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n"
" cvt.s64.s32 %rd2, %r11;\n"
" mul.wide.s32 %rd3, %r11, 4;\n"
" cvt.s64.s32 %rd4, %r9;\n"
" mul.wide.s32 %rd5, %r9, 4;\n"
" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n"
" add.u64 %rd7, %rd5, %rd6;\n"
" add.u64 %rd8, %rd3, %rd7;\n"
" ld.global.s32 %r12, [%rd8+0];\n"
" add.u64 %rd9, %rd3, %rd8;\n"
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];\n"
" setp.ne.u64 %p2, %rd10, %rd6;\n"
" @%p2 bra $Lt_0_19714;\n"
" .loc 16 119 0\n"
" cvt.s32.s64 %r13, %rd2;\n"
" mul.lo.s32 %r14, %r13, %r12;\n"
" cvt.s64.s32 %rd11, %r14;\n"
" mul.wide.s32 %rd12, %r14, 4;\n"
" add.u64 %rd13, %rd9, %rd12;\n"
" .loc 16 120 0\n"
" mul.lo.s32 %r15, %r6, %r13;\n"
" cvt.s64.s32 %rd14, %r15;\n"
" mul.wide.s32 %rd15, %r15, 4;\n"
" add.u64 %rd16, %rd9, %rd15;\n"
" .loc 16 121 0\n"
" mul.lo.s32 %r16, %r13, %r1;\n"
" bra.uni $Lt_0_19458;\n"
"$Lt_0_19714:\n"
" .loc 16 123 0\n"
" ld.global.s32 %r17, [%rd9+0];\n"
" cvt.s64.s32 %rd17, %r17;\n"
" mul.wide.s32 %rd18, %r17, 4;\n"
" add.u64 %rd19, %rd10, %rd18;\n"
" .loc 16 124 0\n"
" cvt.s64.s32 %rd20, %r12;\n"
" mul.wide.s32 %rd21, %r12, 4;\n"
" add.u64 %rd13, %rd19, %rd21;\n"
" .loc 16 125 0\n"
" mov.s32 %r16, %r1;\n"
" .loc 16 126 0\n"
" cvt.s64.s32 %rd22, %r6;\n"
" mul.wide.s32 %rd23, %r6, 4;\n"
" add.u64 %rd16, %rd19, %rd23;\n"
"$Lt_0_19458:\n"
" .loc 16 129 0\n"
" ld.global.s32 %r18, [%rd7+0];\n"
" mov.u32 %r19, %r18;\n"
" mov.s32 %r20, 0;\n"
" mov.u32 %r21, %r20;\n"
" mov.s32 %r22, 0;\n"
" mov.u32 %r23, %r22;\n"
" mov.s32 %r24, 0;\n"
" mov.u32 %r25, %r24;\n"
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];\n"
" mov.f32 %f21, %f17;\n"
" mov.f32 %f22, %f18;\n"
" mov.f32 %f23, %f19;\n"
" mov.f32 %f24, %f20;\n"
" setp.ge.u64 %p3, %rd16, %rd13;\n"
" @%p3 bra $Lt_0_28162;\n"
" cvt.rzi.ftz.s32.f32 %r26, %f24;\n"
" cvt.s64.s32 %rd24, %r16;\n"
" ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];\n"
" mul.lo.s32 %r28, %r27, %r26;\n"
" ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;\n"
"$Lt_0_20482:\n"
" .loc 16 135 0\n"
" ld.global.s32 %r29, [%rd16+0];\n"
" .loc 16 136 0\n"
" shr.s32 %r30, %r29, 30;\n"
" and.b32 %r31, %r30, 3;\n"
" cvt.s64.s32 %rd27, %r31;\n"
" mul.wide.s32 %rd28, %r31, 4;\n"
" add.u64 %rd29, %rd26, %rd28;\n"
" ld.shared.f32 %f29, [%rd29+0];\n"
" .loc 16 139 0\n"
" and.b32 %r32, %r29, 1073741823;\n"
" mov.u32 %r33, %r32;\n"
" mov.s32 %r34, 0;\n"
" mov.u32 %r35, %r34;\n"
" mov.s32 %r36, 0;\n"
" mov.u32 %r37, %r36;\n"
" mov.s32 %r38, 0;\n"
" mov.u32 %r39, %r38;\n"
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];\n"
" mov.f32 %f34, %f30;\n"
" mov.f32 %f35, %f31;\n"
" mov.f32 %f36, %f32;\n"
" mov.f32 %f37, %f33;\n"
" cvt.rzi.ftz.s32.f32 %r40, %f37;\n"
" sub.ftz.f32 %f38, %f22, %f35;\n"
" sub.ftz.f32 %f39, %f21, %f34;\n"
" sub.ftz.f32 %f40, %f23, %f36;\n"
" mul.ftz.f32 %f41, %f38, %f38;\n"
" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n"
" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n"
" add.s32 %r41, %r40, %r28;\n"
" cvt.s64.s32 %rd30, %r41;\n"
" mul.wide.s32 %rd31, %r41, 16;\n"
" add.u64 %rd32, %rd31, %rd25;\n"
" ld.global.f32 %f44, [%rd32+8];\n"
" setp.gt.ftz.f32 %p4, %f44, %f43;\n"
" @!%p4 bra $Lt_0_21762;\n"
" .loc 16 153 0\n"
" rcp.approx.ftz.f32 %f45, %f43;\n"
" mul.ftz.f32 %f46, %f45, %f45;\n"
" mul.ftz.f32 %f47, %f45, %f46;\n"
" mul.ftz.f32 %f48, %f45, %f47;\n"
" ld.global.v2.f32 {%f49,%f50}, [%rd32+0];\n"
" mul.ftz.f32 %f51, %f49, %f47;\n"
" sub.ftz.f32 %f52, %f51, %f50;\n"
" mul.ftz.f32 %f53, %f48, %f52;\n"
" mul.ftz.f32 %f54, %f29, %f53;\n"
" .loc 16 155 0\n"
" fma.rn.ftz.f32 %f27, %f39, %f54, %f27;\n"
" .loc 16 156 0\n"
" fma.rn.ftz.f32 %f26, %f38, %f54, %f26;\n"
" .loc 16 157 0\n"
" fma.rn.ftz.f32 %f25, %f40, %f54, %f25;\n"
" ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r43, 0;\n"
" setp.le.s32 %p5, %r42, %r43;\n"
" @%p5 bra $Lt_0_21250;\n"
" .loc 16 161 0\n"
" ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];\n"
" add.u64 %rd34, %rd33, %rd31;\n"
" ld.global.v4.f32 {%f55,%f56,%f57,_}, [%rd34+0];\n"
" mul.ftz.f32 %f58, %f55, %f47;\n"
" sub.ftz.f32 %f59, %f58, %f56;\n"
" mul.ftz.f32 %f60, %f47, %f59;\n"
" sub.ftz.f32 %f61, %f60, %f57;\n"
" fma.rn.ftz.f32 %f28, %f29, %f61, %f28;\n"
"$Lt_0_21250:\n"
" ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r45, 0;\n"
" setp.le.s32 %p6, %r44, %r45;\n"
" @%p6 bra $Lt_0_21762;\n"
" .loc 16 164 0\n"
" mov.f32 %f62, %f6;\n"
" mul.ftz.f32 %f63, %f39, %f39;\n"
" fma.rn.ftz.f32 %f64, %f54, %f63, %f62;\n"
" mov.f32 %f6, %f64;\n"
" .loc 16 165 0\n"
" mov.f32 %f65, %f8;\n"
" fma.rn.ftz.f32 %f66, %f54, %f41, %f65;\n"
" mov.f32 %f8, %f66;\n"
" .loc 16 166 0\n"
" mov.f32 %f67, %f10;\n"
" mul.ftz.f32 %f68, %f40, %f40;\n"
" fma.rn.ftz.f32 %f69, %f54, %f68, %f67;\n"
" mov.f32 %f10, %f69;\n"
" .loc 16 167 0\n"
" mov.f32 %f70, %f12;\n"
" mul.ftz.f32 %f71, %f38, %f39;\n"
" fma.rn.ftz.f32 %f72, %f54, %f71, %f70;\n"
" mov.f32 %f12, %f72;\n"
" .loc 16 168 0\n"
" mov.f32 %f73, %f14;\n"
" mul.ftz.f32 %f74, %f39, %f40;\n"
" fma.rn.ftz.f32 %f75, %f54, %f74, %f73;\n"
" mov.f32 %f14, %f75;\n"
" .loc 16 169 0\n"
" mul.ftz.f32 %f76, %f38, %f40;\n"
" fma.rn.ftz.f32 %f15, %f54, %f76, %f15;\n"
" mov.f32 %f16, %f15;\n"
"$Lt_0_21762:\n"
"$Lt_0_20738:\n"
" .loc 16 133 0\n"
" mul.lo.u64 %rd35, %rd24, 4;\n"
" add.u64 %rd16, %rd16, %rd35;\n"
" setp.lt.u64 %p7, %rd16, %rd13;\n"
" @%p7 bra $Lt_0_20482;\n"
" bra.uni $Lt_0_18946;\n"
"$Lt_0_28162:\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" bra.uni $Lt_0_18946;\n"
"$Lt_0_19202:\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
"$Lt_0_18946:\n"
" mov.u32 %r46, 1;\n"
" setp.le.s32 %p8, %r1, %r46;\n"
" @%p8 bra $Lt_0_24578;\n"
" .loc 16 180 0\n"
" mov.u64 %rd36, __cuda___cuda_local_var_32581_35_non_const_red_acc108;\n"
" cvt.s64.s32 %rd37, %r2;\n"
" mul.wide.s32 %rd38, %r2, 4;\n"
" add.u64 %rd39, %rd36, %rd38;\n"
" mov.f32 %f77, %f27;\n"
" st.shared.f32 [%rd39+0], %f77;\n"
" .loc 16 181 0\n"
" mov.f32 %f78, %f26;\n"
" st.shared.f32 [%rd39+512], %f78;\n"
" .loc 16 182 0\n"
" mov.f32 %f79, %f25;\n"
" st.shared.f32 [%rd39+1024], %f79;\n"
" .loc 16 183 0\n"
" mov.f32 %f80, %f28;\n"
" st.shared.f32 [%rd39+1536], %f80;\n"
" .loc 16 185 0\n"
" shr.s32 %r47, %r1, 31;\n"
" mov.s32 %r48, 1;\n"
" and.b32 %r49, %r47, %r48;\n"
" add.s32 %r50, %r49, %r1;\n"
" shr.s32 %r51, %r50, 1;\n"
" mov.s32 %r52, %r51;\n"
" mov.u32 %r53, 0;\n"
" setp.ne.u32 %p9, %r51, %r53;\n"
" @!%p9 bra $Lt_0_23042;\n"
"$Lt_0_23554:\n"
" setp.ge.u32 %p10, %r6, %r52;\n"
" @%p10 bra $Lt_0_23810;\n"
" .loc 16 188 0\n"
" add.u32 %r54, %r2, %r52;\n"
" cvt.u64.u32 %rd40, %r54;\n"
" mul.wide.u32 %rd41, %r54, 4;\n"
" add.u64 %rd42, %rd36, %rd41;\n"
" ld.shared.f32 %f81, [%rd42+0];\n"
" add.ftz.f32 %f77, %f81, %f77;\n"
" st.shared.f32 [%rd39+0], %f77;\n"
" ld.shared.f32 %f82, [%rd42+512];\n"
" add.ftz.f32 %f78, %f82, %f78;\n"
" st.shared.f32 [%rd39+512], %f78;\n"
" ld.shared.f32 %f83, [%rd42+1024];\n"
" add.ftz.f32 %f79, %f83, %f79;\n"
" st.shared.f32 [%rd39+1024], %f79;\n"
" ld.shared.f32 %f84, [%rd42+1536];\n"
" add.ftz.f32 %f80, %f84, %f80;\n"
" st.shared.f32 [%rd39+1536], %f80;\n"
"$Lt_0_23810:\n"
" .loc 16 185 0\n"
" shr.u32 %r52, %r52, 1;\n"
" mov.u32 %r55, 0;\n"
" setp.ne.u32 %p11, %r52, %r55;\n"
" @%p11 bra $Lt_0_23554;\n"
"$Lt_0_23042:\n"
" .loc 16 192 0\n"
" mov.f32 %f27, %f77;\n"
" .loc 16 193 0\n"
" mov.f32 %f26, %f78;\n"
" .loc 16 194 0\n"
" mov.f32 %f25, %f79;\n"
" .loc 16 195 0\n"
" mov.f32 %f28, %f80;\n"
" ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r57, 0;\n"
" setp.le.s32 %p12, %r56, %r57;\n"
" @%p12 bra $Lt_0_24578;\n"
" .loc 16 199 0\n"
" mov.f32 %f77, %f6;\n"
" st.shared.f32 [%rd39+0], %f77;\n"
" mov.f32 %f78, %f8;\n"
" st.shared.f32 [%rd39+512], %f78;\n"
" mov.f32 %f79, %f10;\n"
" st.shared.f32 [%rd39+1024], %f79;\n"
" mov.f32 %f80, %f12;\n"
" st.shared.f32 [%rd39+1536], %f80;\n"
" mov.f32 %f85, %f14;\n"
" st.shared.f32 [%rd39+2048], %f85;\n"
" mov.f32 %f86, %f16;\n"
" st.shared.f32 [%rd39+2560], %f86;\n"
" .loc 16 201 0\n"
" mov.s32 %r58, %r51;\n"
" @!%p9 bra $Lt_0_25090;\n"
"$Lt_0_25602:\n"
" setp.ge.u32 %p13, %r6, %r58;\n"
" @%p13 bra $Lt_0_25858;\n"
" .loc 16 204 0\n"
" add.u32 %r59, %r2, %r58;\n"
" cvt.u64.u32 %rd43, %r59;\n"
" mul.wide.u32 %rd44, %r59, 4;\n"
" add.u64 %rd45, %rd36, %rd44;\n"
" ld.shared.f32 %f87, [%rd45+0];\n"
" add.ftz.f32 %f77, %f87, %f77;\n"
" st.shared.f32 [%rd39+0], %f77;\n"
" ld.shared.f32 %f88, [%rd45+512];\n"
" add.ftz.f32 %f78, %f88, %f78;\n"
" st.shared.f32 [%rd39+512], %f78;\n"
" ld.shared.f32 %f89, [%rd45+1024];\n"
" add.ftz.f32 %f79, %f89, %f79;\n"
" st.shared.f32 [%rd39+1024], %f79;\n"
" ld.shared.f32 %f90, [%rd45+1536];\n"
" add.ftz.f32 %f80, %f90, %f80;\n"
" st.shared.f32 [%rd39+1536], %f80;\n"
" ld.shared.f32 %f91, [%rd45+2048];\n"
" add.ftz.f32 %f85, %f91, %f85;\n"
" st.shared.f32 [%rd39+2048], %f85;\n"
" ld.shared.f32 %f92, [%rd45+2560];\n"
" add.ftz.f32 %f86, %f92, %f86;\n"
" st.shared.f32 [%rd39+2560], %f86;\n"
"$Lt_0_25858:\n"
" .loc 16 201 0\n"
" shr.u32 %r58, %r58, 1;\n"
" mov.u32 %r60, 0;\n"
" setp.ne.u32 %p14, %r58, %r60;\n"
" @%p14 bra $Lt_0_25602;\n"
"$Lt_0_25090:\n"
" .loc 16 209 0\n"
" mov.f32 %f6, %f77;\n"
" mov.f32 %f8, %f78;\n"
" mov.f32 %f10, %f79;\n"
" mov.f32 %f12, %f80;\n"
" mov.f32 %f14, %f85;\n"
" mov.f32 %f16, %f86;\n"
"$Lt_0_24578:\n"
"$Lt_0_22530:\n"
" selp.s32 %r61, 1, 0, %p1;\n"
" mov.s32 %r62, 0;\n"
" set.eq.u32.s32 %r63, %r6, %r62;\n"
" neg.s32 %r64, %r63;\n"
" and.b32 %r65, %r61, %r64;\n"
" mov.u32 %r66, 0;\n"
" setp.eq.s32 %p15, %r65, %r66;\n"
" @%p15 bra $Lt_0_26626;\n"
" .loc 16 215 0\n"
" cvt.s64.s32 %rd46, %r9;\n"
" ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];\n"
" mul.wide.s32 %rd48, %r9, 4;\n"
" add.u64 %rd49, %rd47, %rd48;\n"
" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r68, 0;\n"
" setp.le.s32 %p16, %r67, %r68;\n"
" @%p16 bra $Lt_0_27138;\n"
" .loc 16 217 0\n"
" st.global.f32 [%rd49+0], %f28;\n"
" .loc 16 218 0\n"
" cvt.s64.s32 %rd50, %r10;\n"
" mul.wide.s32 %rd51, %r10, 4;\n"
" add.u64 %rd49, %rd49, %rd51;\n"
"$Lt_0_27138:\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p17, %r69, %r70;\n"
" @%p17 bra $Lt_0_27650;\n"
" .loc 16 222 0\n"
" mov.f32 %f93, %f6;\n"
" st.global.f32 [%rd49+0], %f93;\n"
" .loc 16 223 0\n"
" cvt.s64.s32 %rd52, %r10;\n"
" mul.wide.s32 %rd53, %r10, 4;\n"
" add.u64 %rd54, %rd53, %rd49;\n"
" .loc 16 222 0\n"
" mov.f32 %f94, %f8;\n"
" st.global.f32 [%rd54+0], %f94;\n"
" .loc 16 223 0\n"
" add.u64 %rd55, %rd53, %rd54;\n"
" .loc 16 222 0\n"
" mov.f32 %f95, %f10;\n"
" st.global.f32 [%rd55+0], %f95;\n"
" .loc 16 223 0\n"
" add.u64 %rd56, %rd53, %rd55;\n"
" .loc 16 222 0\n"
" mov.f32 %f96, %f12;\n"
" st.global.f32 [%rd56+0], %f96;\n"
" .loc 16 223 0\n"
" add.u64 %rd49, %rd53, %rd56;\n"
" .loc 16 222 0\n"
" mov.f32 %f97, %f14;\n"
" st.global.f32 [%rd49+0], %f97;\n"
" mov.f32 %f98, %f16;\n"
" add.u64 %rd57, %rd53, %rd49;\n"
" st.global.f32 [%rd57+0], %f98;\n"
"$Lt_0_27650:\n"
" .loc 16 226 0\n"
" ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];\n"
" mul.lo.u64 %rd59, %rd46, 16;\n"
" add.u64 %rd60, %rd58, %rd59;\n"
" mov.f32 %f99, %f100;\n"
" st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f99};\n"
"$Lt_0_26626:\n"
" .loc 16 228 0\n"
" exit;\n"
"$LDWend_kernel_pair:\n"
" }\n"
" .entry kernel_pair_fast (\n"
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
" .param .u64 __cudaparm_kernel_pair_fast_engv,\n"
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
" {\n"
" .reg .u32 %r<74>;\n"
" .reg .u64 %rd<74>;\n"
" .reg .f32 %f<109>;\n"
" .reg .pred %p<22>;\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32647_33_non_const_sp_lj3268[16];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32645_34_non_const_lj13296[1936];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32646_34_non_const_lj35232[1936];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32735_35_non_const_red_acc7168[3072];\n"
" .loc 16 236 0\n"
"$LDWbegin_kernel_pair_fast:\n"
" cvt.s32.u32 %r1, %tid.x;\n"
" mov.u32 %r2, 3;\n"
" setp.gt.s32 %p1, %r1, %r2;\n"
" @%p1 bra $Lt_1_21250;\n"
" .loc 16 246 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32647_33_non_const_sp_lj3268;\n"
" cvt.s64.s32 %rd2, %r1;\n"
" mul.wide.s32 %rd3, %r1, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.f32 %f1, [%rd5+0];\n"
" add.u64 %rd6, %rd3, %rd1;\n"
" st.shared.f32 [%rd6+0], %f1;\n"
"$Lt_1_21250:\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32647_33_non_const_sp_lj3268;\n"
" mov.u32 %r3, 120;\n"
" setp.gt.s32 %p2, %r1, %r3;\n"
" @%p2 bra $Lt_1_21762;\n"
" .loc 16 248 0\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32645_34_non_const_lj13296;\n"
" cvt.s64.s32 %rd8, %r1;\n"
" mul.wide.s32 %rd9, %r1, 16;\n"
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
" add.u64 %rd11, %rd10, %rd9;\n"
" add.u64 %rd12, %rd9, %rd7;\n"
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n"
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r5, 0;\n"
" setp.le.s32 %p3, %r4, %r5;\n"
" @%p3 bra $Lt_1_22274;\n"
" .loc 16 250 0\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232;\n"
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
" add.u64 %rd15, %rd14, %rd9;\n"
" add.u64 %rd16, %rd9, %rd13;\n"
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n"
"$Lt_1_22274:\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232;\n"
"$Lt_1_21762:\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32645_34_non_const_lj13296;\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232;\n"
" .loc 16 260 0\n"
" mov.f32 %f10, 0f00000000; \n"
" mov.f32 %f11, %f10;\n"
" mov.f32 %f12, 0f00000000; \n"
" mov.f32 %f13, %f12;\n"
" mov.f32 %f14, 0f00000000; \n"
" mov.f32 %f15, %f14;\n"
" mov.f32 %f16, 0f00000000; \n"
" mov.f32 %f17, %f16;\n"
" mov.f32 %f18, 0f00000000; \n"
" mov.f32 %f19, %f18;\n"
" mov.f32 %f20, 0f00000000; \n"
" mov.f32 %f21, %f20;\n"
" .loc 16 262 0\n"
" bar.sync 0;\n"
" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
" div.s32 %r7, %r1, %r6;\n"
" cvt.s32.u32 %r8, %ntid.x;\n"
" div.s32 %r9, %r8, %r6;\n"
" rem.s32 %r10, %r1, %r6;\n"
" cvt.s32.u32 %r11, %ctaid.x;\n"
" mul.lo.s32 %r12, %r11, %r9;\n"
" add.s32 %r13, %r7, %r12;\n"
" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];\n"
" setp.lt.s32 %p4, %r13, %r14;\n"
" @!%p4 bra $Lt_1_23042;\n"
" .loc 16 268 0\n"
" ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
" cvt.s64.s32 %rd17, %r15;\n"
" mul.wide.s32 %rd18, %r15, 4;\n"
" cvt.s64.s32 %rd19, %r13;\n"
" mul.wide.s32 %rd20, %r13, 4;\n"
" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
" add.u64 %rd22, %rd20, %rd21;\n"
" add.u64 %rd23, %rd18, %rd22;\n"
" ld.global.s32 %r16, [%rd23+0];\n"
" add.u64 %rd24, %rd18, %rd23;\n"
" ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];\n"
" setp.ne.u64 %p5, %rd25, %rd21;\n"
" @%p5 bra $Lt_1_23554;\n"
" .loc 16 274 0\n"
" cvt.s32.s64 %r17, %rd17;\n"
" mul.lo.s32 %r18, %r17, %r16;\n"
" cvt.s64.s32 %rd26, %r18;\n"
" mul.wide.s32 %rd27, %r18, 4;\n"
" add.u64 %rd28, %rd24, %rd27;\n"
" .loc 16 275 0\n"
" mul.lo.s32 %r19, %r10, %r17;\n"
" cvt.s64.s32 %rd29, %r19;\n"
" mul.wide.s32 %rd30, %r19, 4;\n"
" add.u64 %rd31, %rd24, %rd30;\n"
" .loc 16 276 0\n"
" mul.lo.s32 %r20, %r17, %r6;\n"
" bra.uni $Lt_1_23298;\n"
"$Lt_1_23554:\n"
" .loc 16 278 0\n"
" ld.global.s32 %r21, [%rd24+0];\n"
" cvt.s64.s32 %rd32, %r21;\n"
" mul.wide.s32 %rd33, %r21, 4;\n"
" add.u64 %rd34, %rd25, %rd33;\n"
" .loc 16 279 0\n"
" cvt.s64.s32 %rd35, %r16;\n"
" mul.wide.s32 %rd36, %r16, 4;\n"
" add.u64 %rd28, %rd34, %rd36;\n"
" .loc 16 280 0\n"
" mov.s32 %r20, %r6;\n"
" .loc 16 281 0\n"
" cvt.s64.s32 %rd37, %r10;\n"
" mul.wide.s32 %rd38, %r10, 4;\n"
" add.u64 %rd31, %rd34, %rd38;\n"
"$Lt_1_23298:\n"
" .loc 16 284 0\n"
" ld.global.s32 %r22, [%rd22+0];\n"
" mov.u32 %r23, %r22;\n"
" mov.s32 %r24, 0;\n"
" mov.u32 %r25, %r24;\n"
" mov.s32 %r26, 0;\n"
" mov.u32 %r27, %r26;\n"
" mov.s32 %r28, 0;\n"
" mov.u32 %r29, %r28;\n"
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];\n"
" mov.f32 %f26, %f22;\n"
" mov.f32 %f27, %f23;\n"
" mov.f32 %f28, %f24;\n"
" mov.f32 %f29, %f25;\n"
" setp.ge.u64 %p6, %rd31, %rd28;\n"
" @%p6 bra $Lt_1_32002;\n"
" cvt.rzi.ftz.s32.f32 %r30, %f29;\n"
" cvt.s64.s32 %rd39, %r20;\n"
" mul.lo.s32 %r31, %r30, 11;\n"
" cvt.rn.f32.s32 %f30, %r31;\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
"$Lt_1_24322:\n"
" .loc 16 291 0\n"
" ld.global.s32 %r32, [%rd31+0];\n"
" .loc 16 292 0\n"
" shr.s32 %r33, %r32, 30;\n"
" and.b32 %r34, %r33, 3;\n"
" cvt.s64.s32 %rd40, %r34;\n"
" mul.wide.s32 %rd41, %r34, 4;\n"
" add.u64 %rd42, %rd1, %rd41;\n"
" ld.shared.f32 %f35, [%rd42+0];\n"
" .loc 16 295 0\n"
" and.b32 %r35, %r32, 1073741823;\n"
" mov.u32 %r36, %r35;\n"
" mov.s32 %r37, 0;\n"
" mov.u32 %r38, %r37;\n"
" mov.s32 %r39, 0;\n"
" mov.u32 %r40, %r39;\n"
" mov.s32 %r41, 0;\n"
" mov.u32 %r42, %r41;\n"
" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r36,%r38,%r40,%r42}];\n"
" mov.f32 %f40, %f36;\n"
" mov.f32 %f41, %f37;\n"
" mov.f32 %f42, %f38;\n"
" mov.f32 %f43, %f39;\n"
" sub.ftz.f32 %f44, %f27, %f41;\n"
" sub.ftz.f32 %f45, %f26, %f40;\n"
" sub.ftz.f32 %f46, %f28, %f42;\n"
" mul.ftz.f32 %f47, %f44, %f44;\n"
" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n"
" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n"
" add.ftz.f32 %f50, %f30, %f43;\n"
" cvt.rzi.ftz.s32.f32 %r43, %f50;\n"
" cvt.s64.s32 %rd43, %r43;\n"
" mul.wide.s32 %rd44, %r43, 16;\n"
" add.u64 %rd45, %rd44, %rd7;\n"
" ld.shared.f32 %f51, [%rd45+8];\n"
" setp.gt.ftz.f32 %p7, %f51, %f49;\n"
" @!%p7 bra $Lt_1_25602;\n"
" .loc 16 307 0\n"
" rcp.approx.ftz.f32 %f52, %f49;\n"
" mul.ftz.f32 %f53, %f52, %f52;\n"
" mul.ftz.f32 %f54, %f52, %f53;\n"
" mul.ftz.f32 %f55, %f52, %f35;\n"
" mul.ftz.f32 %f56, %f54, %f55;\n"
" ld.shared.v2.f32 {%f57,%f58}, [%rd45+0];\n"
" mul.ftz.f32 %f59, %f57, %f54;\n"
" sub.ftz.f32 %f60, %f59, %f58;\n"
" mul.ftz.f32 %f61, %f56, %f60;\n"
" .loc 16 309 0\n"
" fma.rn.ftz.f32 %f33, %f45, %f61, %f33;\n"
" .loc 16 310 0\n"
" fma.rn.ftz.f32 %f32, %f44, %f61, %f32;\n"
" .loc 16 311 0\n"
" fma.rn.ftz.f32 %f31, %f46, %f61, %f31;\n"
" ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r45, 0;\n"
" setp.le.s32 %p8, %r44, %r45;\n"
" @%p8 bra $Lt_1_25090;\n"
" .loc 16 314 0\n"
" add.u64 %rd46, %rd44, %rd13;\n"
" ld.shared.v4.f32 {%f62,%f63,%f64,_}, [%rd46+0];\n"
" mul.ftz.f32 %f65, %f62, %f54;\n"
" sub.ftz.f32 %f66, %f65, %f63;\n"
" mul.ftz.f32 %f67, %f54, %f66;\n"
" .loc 16 315 0\n"
" sub.ftz.f32 %f68, %f67, %f64;\n"
" fma.rn.ftz.f32 %f34, %f35, %f68, %f34;\n"
"$Lt_1_25090:\n"
" ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r47, 0;\n"
" setp.le.s32 %p9, %r46, %r47;\n"
" @%p9 bra $Lt_1_25602;\n"
" .loc 16 318 0\n"
" mov.f32 %f69, %f11;\n"
" mul.ftz.f32 %f70, %f45, %f45;\n"
" fma.rn.ftz.f32 %f71, %f61, %f70, %f69;\n"
" mov.f32 %f11, %f71;\n"
" .loc 16 319 0\n"
" mov.f32 %f72, %f13;\n"
" fma.rn.ftz.f32 %f73, %f61, %f47, %f72;\n"
" mov.f32 %f13, %f73;\n"
" .loc 16 320 0\n"
" mov.f32 %f74, %f15;\n"
" mul.ftz.f32 %f75, %f46, %f46;\n"
" fma.rn.ftz.f32 %f76, %f61, %f75, %f74;\n"
" mov.f32 %f15, %f76;\n"
" .loc 16 321 0\n"
" mov.f32 %f77, %f17;\n"
" mul.ftz.f32 %f78, %f44, %f45;\n"
" fma.rn.ftz.f32 %f79, %f61, %f78, %f77;\n"
" mov.f32 %f17, %f79;\n"
" .loc 16 322 0\n"
" mov.f32 %f80, %f19;\n"
" mul.ftz.f32 %f81, %f45, %f46;\n"
" fma.rn.ftz.f32 %f82, %f61, %f81, %f80;\n"
" mov.f32 %f19, %f82;\n"
" .loc 16 323 0\n"
" mul.ftz.f32 %f83, %f44, %f46;\n"
" fma.rn.ftz.f32 %f20, %f61, %f83, %f20;\n"
" mov.f32 %f21, %f20;\n"
"$Lt_1_25602:\n"
"$Lt_1_24578:\n"
" .loc 16 289 0\n"
" mul.lo.u64 %rd47, %rd39, 4;\n"
" add.u64 %rd31, %rd31, %rd47;\n"
" setp.lt.u64 %p10, %rd31, %rd28;\n"
" @%p10 bra $Lt_1_24322;\n"
" bra.uni $Lt_1_22786;\n"
"$Lt_1_32002:\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
" bra.uni $Lt_1_22786;\n"
"$Lt_1_23042:\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
"$Lt_1_22786:\n"
" mov.u32 %r48, 1;\n"
" setp.le.s32 %p11, %r6, %r48;\n"
" @%p11 bra $Lt_1_28418;\n"
" .loc 16 334 0\n"
" mov.u64 %rd48, __cuda___cuda_local_var_32735_35_non_const_red_acc7168;\n"
" cvt.s64.s32 %rd49, %r1;\n"
" mul.wide.s32 %rd50, %r1, 4;\n"
" add.u64 %rd51, %rd48, %rd50;\n"
" mov.f32 %f84, %f33;\n"
" st.shared.f32 [%rd51+0], %f84;\n"
" .loc 16 335 0\n"
" mov.f32 %f85, %f32;\n"
" st.shared.f32 [%rd51+512], %f85;\n"
" .loc 16 336 0\n"
" mov.f32 %f86, %f31;\n"
" st.shared.f32 [%rd51+1024], %f86;\n"
" .loc 16 337 0\n"
" mov.f32 %f87, %f34;\n"
" st.shared.f32 [%rd51+1536], %f87;\n"
" .loc 16 339 0\n"
" shr.s32 %r49, %r6, 31;\n"
" mov.s32 %r50, 1;\n"
" and.b32 %r51, %r49, %r50;\n"
" add.s32 %r52, %r51, %r6;\n"
" shr.s32 %r53, %r52, 1;\n"
" mov.s32 %r54, %r53;\n"
" mov.u32 %r55, 0;\n"
" setp.ne.u32 %p12, %r53, %r55;\n"
" @!%p12 bra $Lt_1_26882;\n"
"$Lt_1_27394:\n"
" setp.ge.u32 %p13, %r10, %r54;\n"
" @%p13 bra $Lt_1_27650;\n"
" .loc 16 342 0\n"
" add.u32 %r56, %r1, %r54;\n"
" cvt.u64.u32 %rd52, %r56;\n"
" mul.wide.u32 %rd53, %r56, 4;\n"
" add.u64 %rd54, %rd48, %rd53;\n"
" ld.shared.f32 %f88, [%rd54+0];\n"
" add.ftz.f32 %f84, %f88, %f84;\n"
" st.shared.f32 [%rd51+0], %f84;\n"
" ld.shared.f32 %f89, [%rd54+512];\n"
" add.ftz.f32 %f85, %f89, %f85;\n"
" st.shared.f32 [%rd51+512], %f85;\n"
" ld.shared.f32 %f90, [%rd54+1024];\n"
" add.ftz.f32 %f86, %f90, %f86;\n"
" st.shared.f32 [%rd51+1024], %f86;\n"
" ld.shared.f32 %f91, [%rd54+1536];\n"
" add.ftz.f32 %f87, %f91, %f87;\n"
" st.shared.f32 [%rd51+1536], %f87;\n"
"$Lt_1_27650:\n"
" .loc 16 339 0\n"
" shr.u32 %r54, %r54, 1;\n"
" mov.u32 %r57, 0;\n"
" setp.ne.u32 %p14, %r54, %r57;\n"
" @%p14 bra $Lt_1_27394;\n"
"$Lt_1_26882:\n"
" .loc 16 346 0\n"
" mov.f32 %f33, %f84;\n"
" .loc 16 347 0\n"
" mov.f32 %f32, %f85;\n"
" .loc 16 348 0\n"
" mov.f32 %f31, %f86;\n"
" .loc 16 349 0\n"
" mov.f32 %f34, %f87;\n"
" ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r59, 0;\n"
" setp.le.s32 %p15, %r58, %r59;\n"
" @%p15 bra $Lt_1_28418;\n"
" .loc 16 353 0\n"
" mov.f32 %f84, %f11;\n"
" st.shared.f32 [%rd51+0], %f84;\n"
" mov.f32 %f85, %f13;\n"
" st.shared.f32 [%rd51+512], %f85;\n"
" mov.f32 %f86, %f15;\n"
" st.shared.f32 [%rd51+1024], %f86;\n"
" mov.f32 %f87, %f17;\n"
" st.shared.f32 [%rd51+1536], %f87;\n"
" mov.f32 %f92, %f19;\n"
" st.shared.f32 [%rd51+2048], %f92;\n"
" mov.f32 %f93, %f21;\n"
" st.shared.f32 [%rd51+2560], %f93;\n"
" .loc 16 355 0\n"
" mov.s32 %r60, %r53;\n"
" @!%p12 bra $Lt_1_28930;\n"
"$Lt_1_29442:\n"
" setp.ge.u32 %p16, %r10, %r60;\n"
" @%p16 bra $Lt_1_29698;\n"
" .loc 16 358 0\n"
" add.u32 %r61, %r1, %r60;\n"
" cvt.u64.u32 %rd55, %r61;\n"
" mul.wide.u32 %rd56, %r61, 4;\n"
" add.u64 %rd57, %rd48, %rd56;\n"
" ld.shared.f32 %f94, [%rd57+0];\n"
" add.ftz.f32 %f84, %f94, %f84;\n"
" st.shared.f32 [%rd51+0], %f84;\n"
" ld.shared.f32 %f95, [%rd57+512];\n"
" add.ftz.f32 %f85, %f95, %f85;\n"
" st.shared.f32 [%rd51+512], %f85;\n"
" ld.shared.f32 %f96, [%rd57+1024];\n"
" add.ftz.f32 %f86, %f96, %f86;\n"
" st.shared.f32 [%rd51+1024], %f86;\n"
" ld.shared.f32 %f97, [%rd57+1536];\n"
" add.ftz.f32 %f87, %f97, %f87;\n"
" st.shared.f32 [%rd51+1536], %f87;\n"
" ld.shared.f32 %f98, [%rd57+2048];\n"
" add.ftz.f32 %f92, %f98, %f92;\n"
" st.shared.f32 [%rd51+2048], %f92;\n"
" ld.shared.f32 %f99, [%rd57+2560];\n"
" add.ftz.f32 %f93, %f99, %f93;\n"
" st.shared.f32 [%rd51+2560], %f93;\n"
"$Lt_1_29698:\n"
" .loc 16 355 0\n"
" shr.u32 %r60, %r60, 1;\n"
" mov.u32 %r62, 0;\n"
" setp.ne.u32 %p17, %r60, %r62;\n"
" @%p17 bra $Lt_1_29442;\n"
"$Lt_1_28930:\n"
" .loc 16 363 0\n"
" mov.f32 %f11, %f84;\n"
" mov.f32 %f13, %f85;\n"
" mov.f32 %f15, %f86;\n"
" mov.f32 %f17, %f87;\n"
" mov.f32 %f19, %f92;\n"
" mov.f32 %f21, %f93;\n"
"$Lt_1_28418:\n"
"$Lt_1_26370:\n"
" selp.s32 %r63, 1, 0, %p4;\n"
" mov.s32 %r64, 0;\n"
" set.eq.u32.s32 %r65, %r10, %r64;\n"
" neg.s32 %r66, %r65;\n"
" and.b32 %r67, %r63, %r66;\n"
" mov.u32 %r68, 0;\n"
" setp.eq.s32 %p18, %r67, %r68;\n"
" @%p18 bra $Lt_1_30466;\n"
" .loc 16 369 0\n"
" cvt.s64.s32 %rd58, %r13;\n"
" ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];\n"
" mul.wide.s32 %rd60, %r13, 4;\n"
" add.u64 %rd61, %rd59, %rd60;\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p19, %r69, %r70;\n"
" @%p19 bra $Lt_1_30978;\n"
" .loc 16 371 0\n"
" st.global.f32 [%rd61+0], %f34;\n"
" .loc 16 372 0\n"
" cvt.s64.s32 %rd62, %r14;\n"
" mul.wide.s32 %rd63, %r14, 4;\n"
" add.u64 %rd61, %rd61, %rd63;\n"
"$Lt_1_30978:\n"
" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r72, 0;\n"
" setp.le.s32 %p20, %r71, %r72;\n"
" @%p20 bra $Lt_1_31490;\n"
" .loc 16 376 0\n"
" mov.f32 %f100, %f11;\n"
" st.global.f32 [%rd61+0], %f100;\n"
" .loc 16 377 0\n"
" cvt.s64.s32 %rd64, %r14;\n"
" mul.wide.s32 %rd65, %r14, 4;\n"
" add.u64 %rd66, %rd65, %rd61;\n"
" .loc 16 376 0\n"
" mov.f32 %f101, %f13;\n"
" st.global.f32 [%rd66+0], %f101;\n"
" .loc 16 377 0\n"
" add.u64 %rd67, %rd65, %rd66;\n"
" .loc 16 376 0\n"
" mov.f32 %f102, %f15;\n"
" st.global.f32 [%rd67+0], %f102;\n"
" .loc 16 377 0\n"
" add.u64 %rd68, %rd65, %rd67;\n"
" .loc 16 376 0\n"
" mov.f32 %f103, %f17;\n"
" st.global.f32 [%rd68+0], %f103;\n"
" .loc 16 377 0\n"
" add.u64 %rd61, %rd65, %rd68;\n"
" .loc 16 376 0\n"
" mov.f32 %f104, %f19;\n"
" st.global.f32 [%rd61+0], %f104;\n"
" mov.f32 %f105, %f21;\n"
" add.u64 %rd69, %rd65, %rd61;\n"
" st.global.f32 [%rd69+0], %f105;\n"
"$Lt_1_31490:\n"
" .loc 16 380 0\n"
" ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];\n"
" mul.lo.u64 %rd71, %rd58, 16;\n"
" add.u64 %rd72, %rd70, %rd71;\n"
" mov.f32 %f106, %f107;\n"
" st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f106};\n"
"$Lt_1_30466:\n"
" .loc 16 382 0\n"
" exit;\n"
"$LDWend_kernel_pair_fast:\n"
" }\n"
;

993
lib/gpu/lj_expand_gpu_kernel.ptx Normal file

@ -0,0 +1,993 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_0000be22_00000000-9_lj_expand_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.LdVC9u)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_0000be22_00000000-8_lj_expand_gpu_kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lj_expand_gpu_kernel.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
.global .texref pos_tex;
.entry kernel_pair (
.param .u64 __cudaparm_kernel_pair_x_,
.param .u64 __cudaparm_kernel_pair_lj1,
.param .u64 __cudaparm_kernel_pair_lj3,
.param .s32 __cudaparm_kernel_pair_lj_types,
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_dev_nbor,
.param .u64 __cudaparm_kernel_pair_dev_packed,
.param .u64 __cudaparm_kernel_pair_ans,
.param .u64 __cudaparm_kernel_pair_engv,
.param .s32 __cudaparm_kernel_pair_eflag,
.param .s32 __cudaparm_kernel_pair_vflag,
.param .s32 __cudaparm_kernel_pair_inum,
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_t_per_atom)
{
.reg .u32 %r<72>;
.reg .u64 %rd<62>;
.reg .f32 %f<107>;
.reg .pred %p<19>;
.shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];
.shared .align 4 .b8 __cuda___cuda_local_var_32584_35_non_const_red_acc108[3072];
// __cuda_local_var_32504_10_non_const_f = 48
// __cuda_local_var_32508_9_non_const_virial = 16
.loc 16 88 0
$LDWbegin_kernel_pair:
.loc 16 95 0
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
ldu.global.f32 %f1, [%rd1+0];
.loc 16 96 0
ld.global.f32 %f2, [%rd1+4];
.loc 16 97 0
ld.global.f32 %f3, [%rd1+8];
.loc 16 98 0
ld.global.f32 %f4, [%rd1+12];
st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
.loc 16 107 0
mov.f32 %f5, 0f00000000; // 0
mov.f32 %f6, %f5;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, %f7;
mov.f32 %f9, 0f00000000; // 0
mov.f32 %f10, %f9;
mov.f32 %f11, 0f00000000; // 0
mov.f32 %f12, %f11;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, %f13;
mov.f32 %f15, 0f00000000; // 0
mov.f32 %f16, %f15;
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
rem.s32 %r6, %r2, %r1;
cvt.s32.u32 %r7, %ctaid.x;
mul.lo.s32 %r8, %r7, %r5;
add.s32 %r9, %r3, %r8;
ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];
setp.lt.s32 %p1, %r9, %r10;
@!%p1 bra $Lt_0_19202;
.loc 16 113 0
ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];
cvt.s64.s32 %rd2, %r11;
mul.wide.s32 %rd3, %r11, 4;
cvt.s64.s32 %rd4, %r9;
mul.wide.s32 %rd5, %r9, 4;
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
add.u64 %rd7, %rd5, %rd6;
add.u64 %rd8, %rd3, %rd7;
ld.global.s32 %r12, [%rd8+0];
add.u64 %rd9, %rd3, %rd8;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];
setp.ne.u64 %p2, %rd10, %rd6;
@%p2 bra $Lt_0_19714;
.loc 16 119 0
cvt.s32.s64 %r13, %rd2;
mul.lo.s32 %r14, %r13, %r12;
cvt.s64.s32 %rd11, %r14;
mul.wide.s32 %rd12, %r14, 4;
add.u64 %rd13, %rd9, %rd12;
.loc 16 120 0
mul.lo.s32 %r15, %r6, %r13;
cvt.s64.s32 %rd14, %r15;
mul.wide.s32 %rd15, %r15, 4;
add.u64 %rd16, %rd9, %rd15;
.loc 16 121 0
mul.lo.s32 %r16, %r13, %r1;
bra.uni $Lt_0_19458;
$Lt_0_19714:
.loc 16 123 0
ld.global.s32 %r17, [%rd9+0];
cvt.s64.s32 %rd17, %r17;
mul.wide.s32 %rd18, %r17, 4;
add.u64 %rd19, %rd10, %rd18;
.loc 16 124 0
cvt.s64.s32 %rd20, %r12;
mul.wide.s32 %rd21, %r12, 4;
add.u64 %rd13, %rd19, %rd21;
.loc 16 125 0
mov.s32 %r16, %r1;
.loc 16 126 0
cvt.s64.s32 %rd22, %r6;
mul.wide.s32 %rd23, %r6, 4;
add.u64 %rd16, %rd19, %rd23;
$Lt_0_19458:
.loc 16 129 0
ld.global.s32 %r18, [%rd7+0];
mov.u32 %r19, %r18;
mov.s32 %r20, 0;
mov.u32 %r21, %r20;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];
mov.f32 %f21, %f17;
mov.f32 %f22, %f18;
mov.f32 %f23, %f19;
mov.f32 %f24, %f20;
setp.ge.u64 %p3, %rd16, %rd13;
@%p3 bra $Lt_0_28162;
cvt.rzi.ftz.s32.f32 %r26, %f24;
cvt.s64.s32 %rd24, %r16;
ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];
mul.lo.s32 %r28, %r27, %r26;
ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;
$Lt_0_20482:
//<loop> Loop body line 129, nesting depth: 1, estimated iterations: unknown
.loc 16 135 0
ld.global.s32 %r29, [%rd16+0];
.loc 16 136 0
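	// top 2 bits of the packed neighbor entry index the special-bond factor in sp_lj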
shr.s32 %r30, %r29, 30;
and.b32 %r31, %r30, 3;
cvt.s64.s32 %rd27, %r31;
mul.wide.s32 %rd28, %r31, 4;
add.u64 %rd29, %rd26, %rd28;
ld.shared.f32 %f29, [%rd29+0];
.loc 16 139 0
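	// low 30 bits (mask 0x3FFFFFFF) give the neighbor atom index for the pos_tex fetch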
and.b32 %r32, %r29, 1073741823;
mov.u32 %r33, %r32;
mov.s32 %r34, 0;
mov.u32 %r35, %r34;
mov.s32 %r36, 0;
mov.u32 %r37, %r36;
mov.s32 %r38, 0;
mov.u32 %r39, %r38;
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];
mov.f32 %f34, %f30;
mov.f32 %f35, %f31;
mov.f32 %f36, %f32;
mov.f32 %f37, %f33;
cvt.rzi.ftz.s32.f32 %r40, %f37;
sub.ftz.f32 %f38, %f22, %f35;
sub.ftz.f32 %f39, %f21, %f34;
sub.ftz.f32 %f40, %f23, %f36;
mul.ftz.f32 %f41, %f38, %f38;
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
add.s32 %r41, %r40, %r28;
cvt.s64.s32 %rd30, %r41;
mul.wide.s32 %rd31, %r41, 16;
add.u64 %rd32, %rd31, %rd25;
ld.global.f32 %f44, [%rd32+8];
setp.gt.ftz.f32 %p4, %f44, %f43;
@!%p4 bra $Lt_0_21762;
.loc 16 151 0
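	// lj/expand: shifted distance r' = sqrt(rsq) - delta (lj1.w); the r^-6 terms below use r'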
sqrt.approx.ftz.f32 %f45, %f43;
ld.global.v4.f32 {%f46,%f47,_,%f48}, [%rd32+0];
sub.ftz.f32 %f49, %f45, %f48;
.loc 16 156 0
mul.ftz.f32 %f50, %f49, %f49;
rcp.approx.ftz.f32 %f51, %f50;
mul.ftz.f32 %f52, %f51, %f51;
mul.ftz.f32 %f53, %f51, %f52;
div.approx.ftz.f32 %f54, %f29, %f49;
div.approx.ftz.f32 %f55, %f54, %f45;
mul.ftz.f32 %f56, %f46, %f53;
sub.ftz.f32 %f57, %f56, %f47;
mul.ftz.f32 %f58, %f53, %f57;
mul.ftz.f32 %f59, %f55, %f58;
.loc 16 158 0
fma.rn.ftz.f32 %f27, %f39, %f59, %f27;
.loc 16 159 0
fma.rn.ftz.f32 %f26, %f38, %f59, %f26;
.loc 16 160 0
fma.rn.ftz.f32 %f25, %f40, %f59, %f25;
ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];
mov.u32 %r43, 0;
setp.le.s32 %p5, %r42, %r43;
@%p5 bra $Lt_0_21250;
.loc 16 164 0
ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];
add.u64 %rd34, %rd33, %rd31;
ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd34+0];
mul.ftz.f32 %f63, %f60, %f53;
sub.ftz.f32 %f64, %f63, %f61;
mul.ftz.f32 %f65, %f53, %f64;
sub.ftz.f32 %f66, %f65, %f62;
fma.rn.ftz.f32 %f28, %f29, %f66, %f28;
$Lt_0_21250:
ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];
mov.u32 %r45, 0;
setp.le.s32 %p6, %r44, %r45;
@%p6 bra $Lt_0_21762;
.loc 16 167 0
mov.f32 %f67, %f6;
mul.ftz.f32 %f68, %f39, %f39;
fma.rn.ftz.f32 %f69, %f59, %f68, %f67;
mov.f32 %f6, %f69;
.loc 16 168 0
mov.f32 %f70, %f8;
fma.rn.ftz.f32 %f71, %f59, %f41, %f70;
mov.f32 %f8, %f71;
.loc 16 169 0
mov.f32 %f72, %f10;
mul.ftz.f32 %f73, %f40, %f40;
fma.rn.ftz.f32 %f74, %f59, %f73, %f72;
mov.f32 %f10, %f74;
.loc 16 170 0
mov.f32 %f75, %f12;
mul.ftz.f32 %f76, %f38, %f39;
fma.rn.ftz.f32 %f77, %f59, %f76, %f75;
mov.f32 %f12, %f77;
.loc 16 171 0
mov.f32 %f78, %f14;
mul.ftz.f32 %f79, %f39, %f40;
fma.rn.ftz.f32 %f80, %f59, %f79, %f78;
mov.f32 %f14, %f80;
.loc 16 172 0
mul.ftz.f32 %f81, %f38, %f40;
fma.rn.ftz.f32 %f15, %f59, %f81, %f15;
mov.f32 %f16, %f15;
$Lt_0_21762:
$Lt_0_20738:
.loc 16 133 0
mul.lo.u64 %rd35, %rd24, 4;
add.u64 %rd16, %rd16, %rd35;
setp.lt.u64 %p7, %rd16, %rd13;
@%p7 bra $Lt_0_20482;
bra.uni $Lt_0_18946;
$Lt_0_28162:
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
bra.uni $Lt_0_18946;
$Lt_0_19202:
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
$Lt_0_18946:
mov.u32 %r46, 1;
setp.le.s32 %p8, %r1, %r46;
@%p8 bra $Lt_0_24578;
.loc 16 183 0
mov.u64 %rd36, __cuda___cuda_local_var_32584_35_non_const_red_acc108;
cvt.s64.s32 %rd37, %r2;
mul.wide.s32 %rd38, %r2, 4;
add.u64 %rd39, %rd36, %rd38;
mov.f32 %f82, %f27;
st.shared.f32 [%rd39+0], %f82;
.loc 16 184 0
mov.f32 %f83, %f26;
st.shared.f32 [%rd39+512], %f83;
.loc 16 185 0
mov.f32 %f84, %f25;
st.shared.f32 [%rd39+1024], %f84;
.loc 16 186 0
mov.f32 %f85, %f28;
st.shared.f32 [%rd39+1536], %f85;
.loc 16 188 0
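	// tree reduction: threads sharing an atom halve the active stride each pass to sum force/energy in shared memory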
shr.s32 %r47, %r1, 31;
mov.s32 %r48, 1;
and.b32 %r49, %r47, %r48;
add.s32 %r50, %r49, %r1;
shr.s32 %r51, %r50, 1;
mov.s32 %r52, %r51;
mov.u32 %r53, 0;
setp.ne.u32 %p9, %r51, %r53;
@!%p9 bra $Lt_0_23042;
$Lt_0_23554:
setp.ge.u32 %p10, %r6, %r52;
@%p10 bra $Lt_0_23810;
.loc 16 191 0
add.u32 %r54, %r2, %r52;
cvt.u64.u32 %rd40, %r54;
mul.wide.u32 %rd41, %r54, 4;
add.u64 %rd42, %rd36, %rd41;
ld.shared.f32 %f86, [%rd42+0];
add.ftz.f32 %f82, %f86, %f82;
st.shared.f32 [%rd39+0], %f82;
ld.shared.f32 %f87, [%rd42+512];
add.ftz.f32 %f83, %f87, %f83;
st.shared.f32 [%rd39+512], %f83;
ld.shared.f32 %f88, [%rd42+1024];
add.ftz.f32 %f84, %f88, %f84;
st.shared.f32 [%rd39+1024], %f84;
ld.shared.f32 %f89, [%rd42+1536];
add.ftz.f32 %f85, %f89, %f85;
st.shared.f32 [%rd39+1536], %f85;
$Lt_0_23810:
.loc 16 188 0
shr.u32 %r52, %r52, 1;
mov.u32 %r55, 0;
setp.ne.u32 %p11, %r52, %r55;
@%p11 bra $Lt_0_23554;
$Lt_0_23042:
.loc 16 195 0
mov.f32 %f27, %f82;
.loc 16 196 0
mov.f32 %f26, %f83;
.loc 16 197 0
mov.f32 %f25, %f84;
.loc 16 198 0
mov.f32 %f28, %f85;
ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];
mov.u32 %r57, 0;
setp.le.s32 %p12, %r56, %r57;
@%p12 bra $Lt_0_24578;
.loc 16 202 0
mov.f32 %f82, %f6;
st.shared.f32 [%rd39+0], %f82;
mov.f32 %f83, %f8;
st.shared.f32 [%rd39+512], %f83;
mov.f32 %f84, %f10;
st.shared.f32 [%rd39+1024], %f84;
mov.f32 %f85, %f12;
st.shared.f32 [%rd39+1536], %f85;
mov.f32 %f90, %f14;
st.shared.f32 [%rd39+2048], %f90;
mov.f32 %f91, %f16;
st.shared.f32 [%rd39+2560], %f91;
.loc 16 204 0
mov.s32 %r58, %r51;
@!%p9 bra $Lt_0_25090;
$Lt_0_25602:
setp.ge.u32 %p13, %r6, %r58;
@%p13 bra $Lt_0_25858;
.loc 16 207 0
add.u32 %r59, %r2, %r58;
cvt.u64.u32 %rd43, %r59;
mul.wide.u32 %rd44, %r59, 4;
add.u64 %rd45, %rd36, %rd44;
ld.shared.f32 %f92, [%rd45+0];
add.ftz.f32 %f82, %f92, %f82;
st.shared.f32 [%rd39+0], %f82;
ld.shared.f32 %f93, [%rd45+512];
add.ftz.f32 %f83, %f93, %f83;
st.shared.f32 [%rd39+512], %f83;
ld.shared.f32 %f94, [%rd45+1024];
add.ftz.f32 %f84, %f94, %f84;
st.shared.f32 [%rd39+1024], %f84;
ld.shared.f32 %f95, [%rd45+1536];
add.ftz.f32 %f85, %f95, %f85;
st.shared.f32 [%rd39+1536], %f85;
ld.shared.f32 %f96, [%rd45+2048];
add.ftz.f32 %f90, %f96, %f90;
st.shared.f32 [%rd39+2048], %f90;
ld.shared.f32 %f97, [%rd45+2560];
add.ftz.f32 %f91, %f97, %f91;
st.shared.f32 [%rd39+2560], %f91;
$Lt_0_25858:
.loc 16 204 0
shr.u32 %r58, %r58, 1;
mov.u32 %r60, 0;
setp.ne.u32 %p14, %r58, %r60;
@%p14 bra $Lt_0_25602;
$Lt_0_25090:
.loc 16 212 0
mov.f32 %f6, %f82;
mov.f32 %f8, %f83;
mov.f32 %f10, %f84;
mov.f32 %f12, %f85;
mov.f32 %f14, %f90;
mov.f32 %f16, %f91;
$Lt_0_24578:
$Lt_0_22530:
selp.s32 %r61, 1, 0, %p1;
mov.s32 %r62, 0;
set.eq.u32.s32 %r63, %r6, %r62;
neg.s32 %r64, %r63;
and.b32 %r65, %r61, %r64;
mov.u32 %r66, 0;
setp.eq.s32 %p15, %r65, %r66;
@%p15 bra $Lt_0_26626;
.loc 16 218 0
cvt.s64.s32 %rd46, %r9;
ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];
mul.wide.s32 %rd48, %r9, 4;
add.u64 %rd49, %rd47, %rd48;
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
mov.u32 %r68, 0;
setp.le.s32 %p16, %r67, %r68;
@%p16 bra $Lt_0_27138;
.loc 16 220 0
st.global.f32 [%rd49+0], %f28;
.loc 16 221 0
cvt.s64.s32 %rd50, %r10;
mul.wide.s32 %rd51, %r10, 4;
add.u64 %rd49, %rd49, %rd51;
$Lt_0_27138:
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
mov.u32 %r70, 0;
setp.le.s32 %p17, %r69, %r70;
@%p17 bra $Lt_0_27650;
.loc 16 225 0
mov.f32 %f98, %f6;
st.global.f32 [%rd49+0], %f98;
.loc 16 226 0
cvt.s64.s32 %rd52, %r10;
mul.wide.s32 %rd53, %r10, 4;
add.u64 %rd54, %rd53, %rd49;
.loc 16 225 0
mov.f32 %f99, %f8;
st.global.f32 [%rd54+0], %f99;
.loc 16 226 0
add.u64 %rd55, %rd53, %rd54;
.loc 16 225 0
mov.f32 %f100, %f10;
st.global.f32 [%rd55+0], %f100;
.loc 16 226 0
add.u64 %rd56, %rd53, %rd55;
.loc 16 225 0
mov.f32 %f101, %f12;
st.global.f32 [%rd56+0], %f101;
.loc 16 226 0
add.u64 %rd49, %rd53, %rd56;
.loc 16 225 0
mov.f32 %f102, %f14;
st.global.f32 [%rd49+0], %f102;
mov.f32 %f103, %f16;
add.u64 %rd57, %rd53, %rd49;
st.global.f32 [%rd57+0], %f103;
$Lt_0_27650:
.loc 16 229 0
ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];
mul.lo.u64 %rd59, %rd46, 16;
add.u64 %rd60, %rd58, %rd59;
mov.f32 %f104, %f105;
st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f104};
$Lt_0_26626:
.loc 16 231 0
exit;
$LDWend_kernel_pair:
} // kernel_pair
.entry kernel_pair_fast (
.param .u64 __cudaparm_kernel_pair_fast_x_,
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
.param .u64 __cudaparm_kernel_pair_fast_ans,
.param .u64 __cudaparm_kernel_pair_fast_engv,
.param .s32 __cudaparm_kernel_pair_fast_eflag,
.param .s32 __cudaparm_kernel_pair_fast_vflag,
.param .s32 __cudaparm_kernel_pair_fast_inum,
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
{
.reg .u32 %r<74>;
.reg .u64 %rd<74>;
.reg .f32 %f<114>;
.reg .f64 %fd<4>;
.reg .pred %p<22>;
.shared .align 4 .b8 __cuda___cuda_local_var_32650_33_non_const_sp_lj3268[16];
.shared .align 16 .b8 __cuda___cuda_local_var_32648_34_non_const_lj13296[1936];
.shared .align 16 .b8 __cuda___cuda_local_var_32649_34_non_const_lj35232[1936];
.shared .align 4 .b8 __cuda___cuda_local_var_32742_35_non_const_red_acc7168[3072];
// __cuda_local_var_32660_10_non_const_f = 48
// __cuda_local_var_32664_9_non_const_virial = 16
.loc 16 239 0
$LDWbegin_kernel_pair_fast:
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 3;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_21250;
.loc 16 249 0
mov.u64 %rd1, __cuda___cuda_local_var_32650_33_non_const_sp_lj3268;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd3, %r1, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_21250:
mov.u64 %rd1, __cuda___cuda_local_var_32650_33_non_const_sp_lj3268;
mov.u32 %r3, 120;
setp.gt.s32 %p2, %r1, %r3;
@%p2 bra $Lt_1_21762;
.loc 16 251 0
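	// threads 0..120 stage the 11x11 per-type lj1 (and, if eflag, lj3) coefficient rows into shared memory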
mov.u64 %rd7, __cuda___cuda_local_var_32648_34_non_const_lj13296;
cvt.s64.s32 %rd8, %r1;
mul.wide.s32 %rd9, %r1, 16;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
add.u64 %rd11, %rd10, %rd9;
add.u64 %rd12, %rd9, %rd7;
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r5, 0;
setp.le.s32 %p3, %r4, %r5;
@%p3 bra $Lt_1_22274;
.loc 16 253 0
mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232;
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
add.u64 %rd15, %rd14, %rd9;
add.u64 %rd16, %rd9, %rd13;
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
$Lt_1_22274:
mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232;
$Lt_1_21762:
mov.u64 %rd7, __cuda___cuda_local_var_32648_34_non_const_lj13296;
mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232;
.loc 16 263 0
mov.f32 %f10, 0f00000000; // 0
mov.f32 %f11, %f10;
mov.f32 %f12, 0f00000000; // 0
mov.f32 %f13, %f12;
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, %f14;
mov.f32 %f16, 0f00000000; // 0
mov.f32 %f17, %f16;
mov.f32 %f18, 0f00000000; // 0
mov.f32 %f19, %f18;
mov.f32 %f20, 0f00000000; // 0
mov.f32 %f21, %f20;
.loc 16 265 0
bar.sync 0;
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
div.s32 %r7, %r1, %r6;
cvt.s32.u32 %r8, %ntid.x;
div.s32 %r9, %r8, %r6;
rem.s32 %r10, %r1, %r6;
cvt.s32.u32 %r11, %ctaid.x;
mul.lo.s32 %r12, %r11, %r9;
add.s32 %r13, %r7, %r12;
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];
setp.lt.s32 %p4, %r13, %r14;
@!%p4 bra $Lt_1_23042;
.loc 16 271 0
ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];
cvt.s64.s32 %rd17, %r15;
mul.wide.s32 %rd18, %r15, 4;
cvt.s64.s32 %rd19, %r13;
mul.wide.s32 %rd20, %r13, 4;
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
add.u64 %rd22, %rd20, %rd21;
add.u64 %rd23, %rd18, %rd22;
ld.global.s32 %r16, [%rd23+0];
add.u64 %rd24, %rd18, %rd23;
ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];
setp.ne.u64 %p5, %rd25, %rd21;
@%p5 bra $Lt_1_23554;
.loc 16 277 0
cvt.s32.s64 %r17, %rd17;
mul.lo.s32 %r18, %r17, %r16;
cvt.s64.s32 %rd26, %r18;
mul.wide.s32 %rd27, %r18, 4;
add.u64 %rd28, %rd24, %rd27;
.loc 16 278 0
mul.lo.s32 %r19, %r10, %r17;
cvt.s64.s32 %rd29, %r19;
mul.wide.s32 %rd30, %r19, 4;
add.u64 %rd31, %rd24, %rd30;
.loc 16 279 0
mul.lo.s32 %r20, %r17, %r6;
bra.uni $Lt_1_23298;
$Lt_1_23554:
.loc 16 281 0
ld.global.s32 %r21, [%rd24+0];
cvt.s64.s32 %rd32, %r21;
mul.wide.s32 %rd33, %r21, 4;
add.u64 %rd34, %rd25, %rd33;
.loc 16 282 0
cvt.s64.s32 %rd35, %r16;
mul.wide.s32 %rd36, %r16, 4;
add.u64 %rd28, %rd34, %rd36;
.loc 16 283 0
mov.s32 %r20, %r6;
.loc 16 284 0
cvt.s64.s32 %rd37, %r10;
mul.wide.s32 %rd38, %r10, 4;
add.u64 %rd31, %rd34, %rd38;
$Lt_1_23298:
.loc 16 287 0
ld.global.s32 %r22, [%rd22+0];
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
mov.s32 %r26, 0;
mov.u32 %r27, %r26;
mov.s32 %r28, 0;
mov.u32 %r29, %r28;
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];
mov.f32 %f26, %f22;
mov.f32 %f27, %f23;
mov.f32 %f28, %f24;
mov.f32 %f29, %f25;
setp.ge.u64 %p6, %rd31, %rd28;
@%p6 bra $Lt_1_32002;
cvt.rzi.ftz.s32.f32 %r30, %f29;
cvt.s64.s32 %rd39, %r20;
mul.lo.s32 %r31, %r30, 11;
cvt.rn.f32.s32 %f30, %r31;
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_24322:
//<loop> Loop body line 287, nesting depth: 1, estimated iterations: unknown
.loc 16 294 0
ld.global.s32 %r32, [%rd31+0];
.loc 16 295 0
shr.s32 %r33, %r32, 30;
and.b32 %r34, %r33, 3;
cvt.s64.s32 %rd40, %r34;
mul.wide.s32 %rd41, %r34, 4;
add.u64 %rd42, %rd1, %rd41;
ld.shared.f32 %f35, [%rd42+0];
.loc 16 298 0
and.b32 %r35, %r32, 1073741823;
mov.u32 %r36, %r35;
mov.s32 %r37, 0;
mov.u32 %r38, %r37;
mov.s32 %r39, 0;
mov.u32 %r40, %r39;
mov.s32 %r41, 0;
mov.u32 %r42, %r41;
tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r36,%r38,%r40,%r42}];
mov.f32 %f40, %f36;
mov.f32 %f41, %f37;
mov.f32 %f42, %f38;
mov.f32 %f43, %f39;
sub.ftz.f32 %f44, %f27, %f41;
sub.ftz.f32 %f45, %f26, %f40;
sub.ftz.f32 %f46, %f28, %f42;
mul.ftz.f32 %f47, %f44, %f44;
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
fma.rn.ftz.f32 %f49, %f46, %f46, %f48;
add.ftz.f32 %f50, %f30, %f43;
cvt.rzi.ftz.s32.f32 %r43, %f50;
cvt.s64.s32 %rd43, %r43;
mul.wide.s32 %rd44, %r43, 16;
add.u64 %rd45, %rd44, %rd7;
ld.shared.f32 %f51, [%rd45+8];
setp.gt.ftz.f32 %p7, %f51, %f49;
@!%p7 bra $Lt_1_25602;
.loc 16 309 0
sqrt.approx.ftz.f32 %f52, %f49;
ld.shared.v4.f32 {%f53,%f54,_,%f55}, [%rd45+0];
sub.ftz.f32 %f56, %f52, %f55;
.loc 16 313 0
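	// reciprocal of the shifted r^2 is taken in double precision (rcp.rn.f64) and rounded back to f32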
mul.ftz.f32 %f57, %f56, %f56;
cvt.ftz.f64.f32 %fd1, %f57;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.ftz.f32.f64 %f58, %fd2;
mul.ftz.f32 %f59, %f58, %f58;
mul.ftz.f32 %f60, %f58, %f59;
mul.ftz.f32 %f61, %f53, %f60;
sub.ftz.f32 %f62, %f61, %f54;
mul.ftz.f32 %f63, %f60, %f62;
.loc 16 314 0
div.approx.ftz.f32 %f64, %f35, %f56;
div.approx.ftz.f32 %f65, %f64, %f52;
mul.ftz.f32 %f66, %f63, %f65;
.loc 16 316 0
fma.rn.ftz.f32 %f33, %f45, %f66, %f33;
.loc 16 317 0
fma.rn.ftz.f32 %f32, %f44, %f66, %f32;
.loc 16 318 0
fma.rn.ftz.f32 %f31, %f46, %f66, %f31;
ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r45, 0;
setp.le.s32 %p8, %r44, %r45;
@%p8 bra $Lt_1_25090;
.loc 16 321 0
add.u64 %rd46, %rd44, %rd13;
ld.shared.v4.f32 {%f67,%f68,%f69,_}, [%rd46+0];
mul.ftz.f32 %f70, %f67, %f60;
sub.ftz.f32 %f71, %f70, %f68;
mul.ftz.f32 %f72, %f60, %f71;
.loc 16 322 0
sub.ftz.f32 %f73, %f72, %f69;
fma.rn.ftz.f32 %f34, %f35, %f73, %f34;
$Lt_1_25090:
ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r47, 0;
setp.le.s32 %p9, %r46, %r47;
@%p9 bra $Lt_1_25602;
.loc 16 325 0
mov.f32 %f74, %f11;
mul.ftz.f32 %f75, %f45, %f45;
fma.rn.ftz.f32 %f76, %f66, %f75, %f74;
mov.f32 %f11, %f76;
.loc 16 326 0
mov.f32 %f77, %f13;
fma.rn.ftz.f32 %f78, %f66, %f47, %f77;
mov.f32 %f13, %f78;
.loc 16 327 0
mov.f32 %f79, %f15;
mul.ftz.f32 %f80, %f46, %f46;
fma.rn.ftz.f32 %f81, %f66, %f80, %f79;
mov.f32 %f15, %f81;
.loc 16 328 0
mov.f32 %f82, %f17;
mul.ftz.f32 %f83, %f44, %f45;
fma.rn.ftz.f32 %f84, %f66, %f83, %f82;
mov.f32 %f17, %f84;
.loc 16 329 0
mov.f32 %f85, %f19;
mul.ftz.f32 %f86, %f45, %f46;
fma.rn.ftz.f32 %f87, %f66, %f86, %f85;
mov.f32 %f19, %f87;
.loc 16 330 0
mul.ftz.f32 %f88, %f44, %f46;
fma.rn.ftz.f32 %f20, %f66, %f88, %f20;
mov.f32 %f21, %f20;
$Lt_1_25602:
$Lt_1_24578:
.loc 16 292 0
mul.lo.u64 %rd47, %rd39, 4;
add.u64 %rd31, %rd31, %rd47;
setp.lt.u64 %p10, %rd31, %rd28;
@%p10 bra $Lt_1_24322;
bra.uni $Lt_1_22786;
$Lt_1_32002:
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
bra.uni $Lt_1_22786;
$Lt_1_23042:
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_22786:
mov.u32 %r48, 1;
setp.le.s32 %p11, %r6, %r48;
@%p11 bra $Lt_1_28418;
.loc 16 341 0
mov.u64 %rd48, __cuda___cuda_local_var_32742_35_non_const_red_acc7168;
cvt.s64.s32 %rd49, %r1;
mul.wide.s32 %rd50, %r1, 4;
add.u64 %rd51, %rd48, %rd50;
mov.f32 %f89, %f33;
st.shared.f32 [%rd51+0], %f89;
.loc 16 342 0
mov.f32 %f90, %f32;
st.shared.f32 [%rd51+512], %f90;
.loc 16 343 0
mov.f32 %f91, %f31;
st.shared.f32 [%rd51+1024], %f91;
.loc 16 344 0
mov.f32 %f92, %f34;
st.shared.f32 [%rd51+1536], %f92;
.loc 16 346 0
shr.s32 %r49, %r6, 31;
mov.s32 %r50, 1;
and.b32 %r51, %r49, %r50;
add.s32 %r52, %r51, %r6;
shr.s32 %r53, %r52, 1;
mov.s32 %r54, %r53;
mov.u32 %r55, 0;
setp.ne.u32 %p12, %r53, %r55;
@!%p12 bra $Lt_1_26882;
$Lt_1_27394:
setp.ge.u32 %p13, %r10, %r54;
@%p13 bra $Lt_1_27650;
.loc 16 349 0
add.u32 %r56, %r1, %r54;
cvt.u64.u32 %rd52, %r56;
mul.wide.u32 %rd53, %r56, 4;
add.u64 %rd54, %rd48, %rd53;
ld.shared.f32 %f93, [%rd54+0];
add.ftz.f32 %f89, %f93, %f89;
st.shared.f32 [%rd51+0], %f89;
ld.shared.f32 %f94, [%rd54+512];
add.ftz.f32 %f90, %f94, %f90;
st.shared.f32 [%rd51+512], %f90;
ld.shared.f32 %f95, [%rd54+1024];
add.ftz.f32 %f91, %f95, %f91;
st.shared.f32 [%rd51+1024], %f91;
ld.shared.f32 %f96, [%rd54+1536];
add.ftz.f32 %f92, %f96, %f92;
st.shared.f32 [%rd51+1536], %f92;
$Lt_1_27650:
.loc 16 346 0
shr.u32 %r54, %r54, 1;
mov.u32 %r57, 0;
setp.ne.u32 %p14, %r54, %r57;
@%p14 bra $Lt_1_27394;
$Lt_1_26882:
.loc 16 353 0
mov.f32 %f33, %f89;
.loc 16 354 0
mov.f32 %f32, %f90;
.loc 16 355 0
mov.f32 %f31, %f91;
.loc 16 356 0
mov.f32 %f34, %f92;
ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r59, 0;
setp.le.s32 %p15, %r58, %r59;
@%p15 bra $Lt_1_28418;
.loc 16 360 0
mov.f32 %f89, %f11;
st.shared.f32 [%rd51+0], %f89;
mov.f32 %f90, %f13;
st.shared.f32 [%rd51+512], %f90;
mov.f32 %f91, %f15;
st.shared.f32 [%rd51+1024], %f91;
mov.f32 %f92, %f17;
st.shared.f32 [%rd51+1536], %f92;
mov.f32 %f97, %f19;
st.shared.f32 [%rd51+2048], %f97;
mov.f32 %f98, %f21;
st.shared.f32 [%rd51+2560], %f98;
.loc 16 362 0
mov.s32 %r60, %r53;
@!%p12 bra $Lt_1_28930;
$Lt_1_29442:
setp.ge.u32 %p16, %r10, %r60;
@%p16 bra $Lt_1_29698;
.loc 16 365 0
add.u32 %r61, %r1, %r60;
cvt.u64.u32 %rd55, %r61;
mul.wide.u32 %rd56, %r61, 4;
add.u64 %rd57, %rd48, %rd56;
ld.shared.f32 %f99, [%rd57+0];
add.ftz.f32 %f89, %f99, %f89;
st.shared.f32 [%rd51+0], %f89;
ld.shared.f32 %f100, [%rd57+512];
add.ftz.f32 %f90, %f100, %f90;
st.shared.f32 [%rd51+512], %f90;
ld.shared.f32 %f101, [%rd57+1024];
add.ftz.f32 %f91, %f101, %f91;
st.shared.f32 [%rd51+1024], %f91;
ld.shared.f32 %f102, [%rd57+1536];
add.ftz.f32 %f92, %f102, %f92;
st.shared.f32 [%rd51+1536], %f92;
ld.shared.f32 %f103, [%rd57+2048];
add.ftz.f32 %f97, %f103, %f97;
st.shared.f32 [%rd51+2048], %f97;
ld.shared.f32 %f104, [%rd57+2560];
add.ftz.f32 %f98, %f104, %f98;
st.shared.f32 [%rd51+2560], %f98;
$Lt_1_29698:
.loc 16 362 0
shr.u32 %r60, %r60, 1;
mov.u32 %r62, 0;
setp.ne.u32 %p17, %r60, %r62;
@%p17 bra $Lt_1_29442;
$Lt_1_28930:
.loc 16 370 0
mov.f32 %f11, %f89;
mov.f32 %f13, %f90;
mov.f32 %f15, %f91;
mov.f32 %f17, %f92;
mov.f32 %f19, %f97;
mov.f32 %f21, %f98;
$Lt_1_28418:
$Lt_1_26370:
selp.s32 %r63, 1, 0, %p4;
mov.s32 %r64, 0;
set.eq.u32.s32 %r65, %r10, %r64;
neg.s32 %r66, %r65;
and.b32 %r67, %r63, %r66;
mov.u32 %r68, 0;
setp.eq.s32 %p18, %r67, %r68;
@%p18 bra $Lt_1_30466;
.loc 16 376 0
cvt.s64.s32 %rd58, %r13;
ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];
mul.wide.s32 %rd60, %r13, 4;
add.u64 %rd61, %rd59, %rd60;
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r70, 0;
setp.le.s32 %p19, %r69, %r70;
@%p19 bra $Lt_1_30978;
.loc 16 378 0
st.global.f32 [%rd61+0], %f34;
.loc 16 379 0
cvt.s64.s32 %rd62, %r14;
mul.wide.s32 %rd63, %r14, 4;
add.u64 %rd61, %rd61, %rd63;
$Lt_1_30978:
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r72, 0;
setp.le.s32 %p20, %r71, %r72;
@%p20 bra $Lt_1_31490;
.loc 16 383 0
mov.f32 %f105, %f11;
st.global.f32 [%rd61+0], %f105;
.loc 16 384 0
cvt.s64.s32 %rd64, %r14;
mul.wide.s32 %rd65, %r14, 4;
add.u64 %rd66, %rd65, %rd61;
.loc 16 383 0
mov.f32 %f106, %f13;
st.global.f32 [%rd66+0], %f106;
.loc 16 384 0
add.u64 %rd67, %rd65, %rd66;
.loc 16 383 0
mov.f32 %f107, %f15;
st.global.f32 [%rd67+0], %f107;
.loc 16 384 0
add.u64 %rd68, %rd65, %rd67;
.loc 16 383 0
mov.f32 %f108, %f17;
st.global.f32 [%rd68+0], %f108;
.loc 16 384 0
add.u64 %rd61, %rd65, %rd68;
.loc 16 383 0
mov.f32 %f109, %f19;
st.global.f32 [%rd61+0], %f109;
mov.f32 %f110, %f21;
add.u64 %rd69, %rd65, %rd61;
st.global.f32 [%rd69+0], %f110;
$Lt_1_31490:
.loc 16 387 0
ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];
mul.lo.u64 %rd71, %rd58, 16;
add.u64 %rd72, %rd70, %rd71;
mov.f32 %f111, %f112;
st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f111};
$Lt_1_30466:
.loc 16 389 0
exit;
$LDWend_kernel_pair_fast:
} // kernel_pair_fast

941
lib/gpu/lj_expand_gpu_ptx.h Normal file

@ -0,0 +1,941 @@
const char * lj_expand_gpu_kernel =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
" .global .texref pos_tex;\n"
" .entry kernel_pair (\n"
" .param .u64 __cudaparm_kernel_pair_x_,\n"
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_ans,\n"
" .param .u64 __cudaparm_kernel_pair_engv,\n"
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_inum,\n"
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
" {\n"
" .reg .u32 %r<72>;\n"
" .reg .u64 %rd<62>;\n"
" .reg .f32 %f<107>;\n"
" .reg .pred %p<19>;\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32584_35_non_const_red_acc108[3072];\n"
" .loc 16 88 0\n"
"$LDWbegin_kernel_pair:\n"
" .loc 16 95 0\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
" ldu.global.f32 %f1, [%rd1+0];\n"
" .loc 16 96 0\n"
" ld.global.f32 %f2, [%rd1+4];\n"
" .loc 16 97 0\n"
" ld.global.f32 %f3, [%rd1+8];\n"
" .loc 16 98 0\n"
" ld.global.f32 %f4, [%rd1+12];\n"
" st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n"
" .loc 16 107 0\n"
" mov.f32 %f5, 0f00000000; \n"
" mov.f32 %f6, %f5;\n"
" mov.f32 %f7, 0f00000000; \n"
" mov.f32 %f8, %f7;\n"
" mov.f32 %f9, 0f00000000; \n"
" mov.f32 %f10, %f9;\n"
" mov.f32 %f11, 0f00000000; \n"
" mov.f32 %f12, %f11;\n"
" mov.f32 %f13, 0f00000000; \n"
" mov.f32 %f14, %f13;\n"
" mov.f32 %f15, 0f00000000; \n"
" mov.f32 %f16, %f15;\n"
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
" cvt.s32.u32 %r2, %tid.x;\n"
" div.s32 %r3, %r2, %r1;\n"
" cvt.s32.u32 %r4, %ntid.x;\n"
" div.s32 %r5, %r4, %r1;\n"
" rem.s32 %r6, %r2, %r1;\n"
" cvt.s32.u32 %r7, %ctaid.x;\n"
" mul.lo.s32 %r8, %r7, %r5;\n"
" add.s32 %r9, %r3, %r8;\n"
" ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];\n"
" setp.lt.s32 %p1, %r9, %r10;\n"
" @!%p1 bra $Lt_0_19202;\n"
" .loc 16 113 0\n"
" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n"
" cvt.s64.s32 %rd2, %r11;\n"
" mul.wide.s32 %rd3, %r11, 4;\n"
" cvt.s64.s32 %rd4, %r9;\n"
" mul.wide.s32 %rd5, %r9, 4;\n"
" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n"
" add.u64 %rd7, %rd5, %rd6;\n"
" add.u64 %rd8, %rd3, %rd7;\n"
" ld.global.s32 %r12, [%rd8+0];\n"
" add.u64 %rd9, %rd3, %rd8;\n"
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];\n"
" setp.ne.u64 %p2, %rd10, %rd6;\n"
" @%p2 bra $Lt_0_19714;\n"
" .loc 16 119 0\n"
" cvt.s32.s64 %r13, %rd2;\n"
" mul.lo.s32 %r14, %r13, %r12;\n"
" cvt.s64.s32 %rd11, %r14;\n"
" mul.wide.s32 %rd12, %r14, 4;\n"
" add.u64 %rd13, %rd9, %rd12;\n"
" .loc 16 120 0\n"
" mul.lo.s32 %r15, %r6, %r13;\n"
" cvt.s64.s32 %rd14, %r15;\n"
" mul.wide.s32 %rd15, %r15, 4;\n"
" add.u64 %rd16, %rd9, %rd15;\n"
" .loc 16 121 0\n"
" mul.lo.s32 %r16, %r13, %r1;\n"
" bra.uni $Lt_0_19458;\n"
"$Lt_0_19714:\n"
" .loc 16 123 0\n"
" ld.global.s32 %r17, [%rd9+0];\n"
" cvt.s64.s32 %rd17, %r17;\n"
" mul.wide.s32 %rd18, %r17, 4;\n"
" add.u64 %rd19, %rd10, %rd18;\n"
" .loc 16 124 0\n"
" cvt.s64.s32 %rd20, %r12;\n"
" mul.wide.s32 %rd21, %r12, 4;\n"
" add.u64 %rd13, %rd19, %rd21;\n"
" .loc 16 125 0\n"
" mov.s32 %r16, %r1;\n"
" .loc 16 126 0\n"
" cvt.s64.s32 %rd22, %r6;\n"
" mul.wide.s32 %rd23, %r6, 4;\n"
" add.u64 %rd16, %rd19, %rd23;\n"
"$Lt_0_19458:\n"
" .loc 16 129 0\n"
" ld.global.s32 %r18, [%rd7+0];\n"
" mov.u32 %r19, %r18;\n"
" mov.s32 %r20, 0;\n"
" mov.u32 %r21, %r20;\n"
" mov.s32 %r22, 0;\n"
" mov.u32 %r23, %r22;\n"
" mov.s32 %r24, 0;\n"
" mov.u32 %r25, %r24;\n"
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];\n"
" mov.f32 %f21, %f17;\n"
" mov.f32 %f22, %f18;\n"
" mov.f32 %f23, %f19;\n"
" mov.f32 %f24, %f20;\n"
" setp.ge.u64 %p3, %rd16, %rd13;\n"
" @%p3 bra $Lt_0_28162;\n"
" cvt.rzi.ftz.s32.f32 %r26, %f24;\n"
" cvt.s64.s32 %rd24, %r16;\n"
" ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];\n"
" mul.lo.s32 %r28, %r27, %r26;\n"
" ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;\n"
"$Lt_0_20482:\n"
" .loc 16 135 0\n"
" ld.global.s32 %r29, [%rd16+0];\n"
" .loc 16 136 0\n"
" shr.s32 %r30, %r29, 30;\n"
" and.b32 %r31, %r30, 3;\n"
" cvt.s64.s32 %rd27, %r31;\n"
" mul.wide.s32 %rd28, %r31, 4;\n"
" add.u64 %rd29, %rd26, %rd28;\n"
" ld.shared.f32 %f29, [%rd29+0];\n"
" .loc 16 139 0\n"
" and.b32 %r32, %r29, 1073741823;\n"
" mov.u32 %r33, %r32;\n"
" mov.s32 %r34, 0;\n"
" mov.u32 %r35, %r34;\n"
" mov.s32 %r36, 0;\n"
" mov.u32 %r37, %r36;\n"
" mov.s32 %r38, 0;\n"
" mov.u32 %r39, %r38;\n"
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];\n"
" mov.f32 %f34, %f30;\n"
" mov.f32 %f35, %f31;\n"
" mov.f32 %f36, %f32;\n"
" mov.f32 %f37, %f33;\n"
" cvt.rzi.ftz.s32.f32 %r40, %f37;\n"
" sub.ftz.f32 %f38, %f22, %f35;\n"
" sub.ftz.f32 %f39, %f21, %f34;\n"
" sub.ftz.f32 %f40, %f23, %f36;\n"
" mul.ftz.f32 %f41, %f38, %f38;\n"
" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n"
" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n"
" add.s32 %r41, %r40, %r28;\n"
" cvt.s64.s32 %rd30, %r41;\n"
" mul.wide.s32 %rd31, %r41, 16;\n"
" add.u64 %rd32, %rd31, %rd25;\n"
" ld.global.f32 %f44, [%rd32+8];\n"
" setp.gt.ftz.f32 %p4, %f44, %f43;\n"
" @!%p4 bra $Lt_0_21762;\n"
" .loc 16 151 0\n"
" sqrt.approx.ftz.f32 %f45, %f43;\n"
" ld.global.v4.f32 {%f46,%f47,_,%f48}, [%rd32+0];\n"
" sub.ftz.f32 %f49, %f45, %f48;\n"
" .loc 16 156 0\n"
" mul.ftz.f32 %f50, %f49, %f49;\n"
" rcp.approx.ftz.f32 %f51, %f50;\n"
" mul.ftz.f32 %f52, %f51, %f51;\n"
" mul.ftz.f32 %f53, %f51, %f52;\n"
" div.approx.ftz.f32 %f54, %f29, %f49;\n"
" div.approx.ftz.f32 %f55, %f54, %f45;\n"
" mul.ftz.f32 %f56, %f46, %f53;\n"
" sub.ftz.f32 %f57, %f56, %f47;\n"
" mul.ftz.f32 %f58, %f53, %f57;\n"
" mul.ftz.f32 %f59, %f55, %f58;\n"
" .loc 16 158 0\n"
" fma.rn.ftz.f32 %f27, %f39, %f59, %f27;\n"
" .loc 16 159 0\n"
" fma.rn.ftz.f32 %f26, %f38, %f59, %f26;\n"
" .loc 16 160 0\n"
" fma.rn.ftz.f32 %f25, %f40, %f59, %f25;\n"
" ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r43, 0;\n"
" setp.le.s32 %p5, %r42, %r43;\n"
" @%p5 bra $Lt_0_21250;\n"
" .loc 16 164 0\n"
" ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];\n"
" add.u64 %rd34, %rd33, %rd31;\n"
" ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd34+0];\n"
" mul.ftz.f32 %f63, %f60, %f53;\n"
" sub.ftz.f32 %f64, %f63, %f61;\n"
" mul.ftz.f32 %f65, %f53, %f64;\n"
" sub.ftz.f32 %f66, %f65, %f62;\n"
" fma.rn.ftz.f32 %f28, %f29, %f66, %f28;\n"
"$Lt_0_21250:\n"
" ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r45, 0;\n"
" setp.le.s32 %p6, %r44, %r45;\n"
" @%p6 bra $Lt_0_21762;\n"
" .loc 16 167 0\n"
" mov.f32 %f67, %f6;\n"
" mul.ftz.f32 %f68, %f39, %f39;\n"
" fma.rn.ftz.f32 %f69, %f59, %f68, %f67;\n"
" mov.f32 %f6, %f69;\n"
" .loc 16 168 0\n"
" mov.f32 %f70, %f8;\n"
" fma.rn.ftz.f32 %f71, %f59, %f41, %f70;\n"
" mov.f32 %f8, %f71;\n"
" .loc 16 169 0\n"
" mov.f32 %f72, %f10;\n"
" mul.ftz.f32 %f73, %f40, %f40;\n"
" fma.rn.ftz.f32 %f74, %f59, %f73, %f72;\n"
" mov.f32 %f10, %f74;\n"
" .loc 16 170 0\n"
" mov.f32 %f75, %f12;\n"
" mul.ftz.f32 %f76, %f38, %f39;\n"
" fma.rn.ftz.f32 %f77, %f59, %f76, %f75;\n"
" mov.f32 %f12, %f77;\n"
" .loc 16 171 0\n"
" mov.f32 %f78, %f14;\n"
" mul.ftz.f32 %f79, %f39, %f40;\n"
" fma.rn.ftz.f32 %f80, %f59, %f79, %f78;\n"
" mov.f32 %f14, %f80;\n"
" .loc 16 172 0\n"
" mul.ftz.f32 %f81, %f38, %f40;\n"
" fma.rn.ftz.f32 %f15, %f59, %f81, %f15;\n"
" mov.f32 %f16, %f15;\n"
"$Lt_0_21762:\n"
"$Lt_0_20738:\n"
" .loc 16 133 0\n"
" mul.lo.u64 %rd35, %rd24, 4;\n"
" add.u64 %rd16, %rd16, %rd35;\n"
" setp.lt.u64 %p7, %rd16, %rd13;\n"
" @%p7 bra $Lt_0_20482;\n"
" bra.uni $Lt_0_18946;\n"
"$Lt_0_28162:\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" bra.uni $Lt_0_18946;\n"
"$Lt_0_19202:\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
"$Lt_0_18946:\n"
" mov.u32 %r46, 1;\n"
" setp.le.s32 %p8, %r1, %r46;\n"
" @%p8 bra $Lt_0_24578;\n"
" .loc 16 183 0\n"
" mov.u64 %rd36, __cuda___cuda_local_var_32584_35_non_const_red_acc108;\n"
" cvt.s64.s32 %rd37, %r2;\n"
" mul.wide.s32 %rd38, %r2, 4;\n"
" add.u64 %rd39, %rd36, %rd38;\n"
" mov.f32 %f82, %f27;\n"
" st.shared.f32 [%rd39+0], %f82;\n"
" .loc 16 184 0\n"
" mov.f32 %f83, %f26;\n"
" st.shared.f32 [%rd39+512], %f83;\n"
" .loc 16 185 0\n"
" mov.f32 %f84, %f25;\n"
" st.shared.f32 [%rd39+1024], %f84;\n"
" .loc 16 186 0\n"
" mov.f32 %f85, %f28;\n"
" st.shared.f32 [%rd39+1536], %f85;\n"
" .loc 16 188 0\n"
" shr.s32 %r47, %r1, 31;\n"
" mov.s32 %r48, 1;\n"
" and.b32 %r49, %r47, %r48;\n"
" add.s32 %r50, %r49, %r1;\n"
" shr.s32 %r51, %r50, 1;\n"
" mov.s32 %r52, %r51;\n"
" mov.u32 %r53, 0;\n"
" setp.ne.u32 %p9, %r51, %r53;\n"
" @!%p9 bra $Lt_0_23042;\n"
"$Lt_0_23554:\n"
" setp.ge.u32 %p10, %r6, %r52;\n"
" @%p10 bra $Lt_0_23810;\n"
" .loc 16 191 0\n"
" add.u32 %r54, %r2, %r52;\n"
" cvt.u64.u32 %rd40, %r54;\n"
" mul.wide.u32 %rd41, %r54, 4;\n"
" add.u64 %rd42, %rd36, %rd41;\n"
" ld.shared.f32 %f86, [%rd42+0];\n"
" add.ftz.f32 %f82, %f86, %f82;\n"
" st.shared.f32 [%rd39+0], %f82;\n"
" ld.shared.f32 %f87, [%rd42+512];\n"
" add.ftz.f32 %f83, %f87, %f83;\n"
" st.shared.f32 [%rd39+512], %f83;\n"
" ld.shared.f32 %f88, [%rd42+1024];\n"
" add.ftz.f32 %f84, %f88, %f84;\n"
" st.shared.f32 [%rd39+1024], %f84;\n"
" ld.shared.f32 %f89, [%rd42+1536];\n"
" add.ftz.f32 %f85, %f89, %f85;\n"
" st.shared.f32 [%rd39+1536], %f85;\n"
"$Lt_0_23810:\n"
" .loc 16 188 0\n"
" shr.u32 %r52, %r52, 1;\n"
" mov.u32 %r55, 0;\n"
" setp.ne.u32 %p11, %r52, %r55;\n"
" @%p11 bra $Lt_0_23554;\n"
"$Lt_0_23042:\n"
" .loc 16 195 0\n"
" mov.f32 %f27, %f82;\n"
" .loc 16 196 0\n"
" mov.f32 %f26, %f83;\n"
" .loc 16 197 0\n"
" mov.f32 %f25, %f84;\n"
" .loc 16 198 0\n"
" mov.f32 %f28, %f85;\n"
" ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r57, 0;\n"
" setp.le.s32 %p12, %r56, %r57;\n"
" @%p12 bra $Lt_0_24578;\n"
" .loc 16 202 0\n"
" mov.f32 %f82, %f6;\n"
" st.shared.f32 [%rd39+0], %f82;\n"
" mov.f32 %f83, %f8;\n"
" st.shared.f32 [%rd39+512], %f83;\n"
" mov.f32 %f84, %f10;\n"
" st.shared.f32 [%rd39+1024], %f84;\n"
" mov.f32 %f85, %f12;\n"
" st.shared.f32 [%rd39+1536], %f85;\n"
" mov.f32 %f90, %f14;\n"
" st.shared.f32 [%rd39+2048], %f90;\n"
" mov.f32 %f91, %f16;\n"
" st.shared.f32 [%rd39+2560], %f91;\n"
" .loc 16 204 0\n"
" mov.s32 %r58, %r51;\n"
" @!%p9 bra $Lt_0_25090;\n"
"$Lt_0_25602:\n"
" setp.ge.u32 %p13, %r6, %r58;\n"
" @%p13 bra $Lt_0_25858;\n"
" .loc 16 207 0\n"
" add.u32 %r59, %r2, %r58;\n"
" cvt.u64.u32 %rd43, %r59;\n"
" mul.wide.u32 %rd44, %r59, 4;\n"
" add.u64 %rd45, %rd36, %rd44;\n"
" ld.shared.f32 %f92, [%rd45+0];\n"
" add.ftz.f32 %f82, %f92, %f82;\n"
" st.shared.f32 [%rd39+0], %f82;\n"
" ld.shared.f32 %f93, [%rd45+512];\n"
" add.ftz.f32 %f83, %f93, %f83;\n"
" st.shared.f32 [%rd39+512], %f83;\n"
" ld.shared.f32 %f94, [%rd45+1024];\n"
" add.ftz.f32 %f84, %f94, %f84;\n"
" st.shared.f32 [%rd39+1024], %f84;\n"
" ld.shared.f32 %f95, [%rd45+1536];\n"
" add.ftz.f32 %f85, %f95, %f85;\n"
" st.shared.f32 [%rd39+1536], %f85;\n"
" ld.shared.f32 %f96, [%rd45+2048];\n"
" add.ftz.f32 %f90, %f96, %f90;\n"
" st.shared.f32 [%rd39+2048], %f90;\n"
" ld.shared.f32 %f97, [%rd45+2560];\n"
" add.ftz.f32 %f91, %f97, %f91;\n"
" st.shared.f32 [%rd39+2560], %f91;\n"
"$Lt_0_25858:\n"
" .loc 16 204 0\n"
" shr.u32 %r58, %r58, 1;\n"
" mov.u32 %r60, 0;\n"
" setp.ne.u32 %p14, %r58, %r60;\n"
" @%p14 bra $Lt_0_25602;\n"
"$Lt_0_25090:\n"
" .loc 16 212 0\n"
" mov.f32 %f6, %f82;\n"
" mov.f32 %f8, %f83;\n"
" mov.f32 %f10, %f84;\n"
" mov.f32 %f12, %f85;\n"
" mov.f32 %f14, %f90;\n"
" mov.f32 %f16, %f91;\n"
"$Lt_0_24578:\n"
"$Lt_0_22530:\n"
" selp.s32 %r61, 1, 0, %p1;\n"
" mov.s32 %r62, 0;\n"
" set.eq.u32.s32 %r63, %r6, %r62;\n"
" neg.s32 %r64, %r63;\n"
" and.b32 %r65, %r61, %r64;\n"
" mov.u32 %r66, 0;\n"
" setp.eq.s32 %p15, %r65, %r66;\n"
" @%p15 bra $Lt_0_26626;\n"
" .loc 16 218 0\n"
" cvt.s64.s32 %rd46, %r9;\n"
" ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];\n"
" mul.wide.s32 %rd48, %r9, 4;\n"
" add.u64 %rd49, %rd47, %rd48;\n"
" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r68, 0;\n"
" setp.le.s32 %p16, %r67, %r68;\n"
" @%p16 bra $Lt_0_27138;\n"
" .loc 16 220 0\n"
" st.global.f32 [%rd49+0], %f28;\n"
" .loc 16 221 0\n"
" cvt.s64.s32 %rd50, %r10;\n"
" mul.wide.s32 %rd51, %r10, 4;\n"
" add.u64 %rd49, %rd49, %rd51;\n"
"$Lt_0_27138:\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p17, %r69, %r70;\n"
" @%p17 bra $Lt_0_27650;\n"
" .loc 16 225 0\n"
" mov.f32 %f98, %f6;\n"
" st.global.f32 [%rd49+0], %f98;\n"
" .loc 16 226 0\n"
" cvt.s64.s32 %rd52, %r10;\n"
" mul.wide.s32 %rd53, %r10, 4;\n"
" add.u64 %rd54, %rd53, %rd49;\n"
" .loc 16 225 0\n"
" mov.f32 %f99, %f8;\n"
" st.global.f32 [%rd54+0], %f99;\n"
" .loc 16 226 0\n"
" add.u64 %rd55, %rd53, %rd54;\n"
" .loc 16 225 0\n"
" mov.f32 %f100, %f10;\n"
" st.global.f32 [%rd55+0], %f100;\n"
" .loc 16 226 0\n"
" add.u64 %rd56, %rd53, %rd55;\n"
" .loc 16 225 0\n"
" mov.f32 %f101, %f12;\n"
" st.global.f32 [%rd56+0], %f101;\n"
" .loc 16 226 0\n"
" add.u64 %rd49, %rd53, %rd56;\n"
" .loc 16 225 0\n"
" mov.f32 %f102, %f14;\n"
" st.global.f32 [%rd49+0], %f102;\n"
" mov.f32 %f103, %f16;\n"
" add.u64 %rd57, %rd53, %rd49;\n"
" st.global.f32 [%rd57+0], %f103;\n"
"$Lt_0_27650:\n"
" .loc 16 229 0\n"
" ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];\n"
" mul.lo.u64 %rd59, %rd46, 16;\n"
" add.u64 %rd60, %rd58, %rd59;\n"
" mov.f32 %f104, %f105;\n"
" st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f104};\n"
"$Lt_0_26626:\n"
" .loc 16 231 0\n"
" exit;\n"
"$LDWend_kernel_pair:\n"
" }\n"
" .entry kernel_pair_fast (\n"
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
" .param .u64 __cudaparm_kernel_pair_fast_engv,\n"
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
" {\n"
" .reg .u32 %r<74>;\n"
" .reg .u64 %rd<74>;\n"
" .reg .f32 %f<114>;\n"
" .reg .f64 %fd<4>;\n"
" .reg .pred %p<22>;\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32650_33_non_const_sp_lj3268[16];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32648_34_non_const_lj13296[1936];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32649_34_non_const_lj35232[1936];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32742_35_non_const_red_acc7168[3072];\n"
" .loc 16 239 0\n"
"$LDWbegin_kernel_pair_fast:\n"
" cvt.s32.u32 %r1, %tid.x;\n"
" mov.u32 %r2, 3;\n"
" setp.gt.s32 %p1, %r1, %r2;\n"
" @%p1 bra $Lt_1_21250;\n"
" .loc 16 249 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32650_33_non_const_sp_lj3268;\n"
" cvt.s64.s32 %rd2, %r1;\n"
" mul.wide.s32 %rd3, %r1, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.f32 %f1, [%rd5+0];\n"
" add.u64 %rd6, %rd3, %rd1;\n"
" st.shared.f32 [%rd6+0], %f1;\n"
"$Lt_1_21250:\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32650_33_non_const_sp_lj3268;\n"
" mov.u32 %r3, 120;\n"
" setp.gt.s32 %p2, %r1, %r3;\n"
" @%p2 bra $Lt_1_21762;\n"
" .loc 16 251 0\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32648_34_non_const_lj13296;\n"
" cvt.s64.s32 %rd8, %r1;\n"
" mul.wide.s32 %rd9, %r1, 16;\n"
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
" add.u64 %rd11, %rd10, %rd9;\n"
" add.u64 %rd12, %rd9, %rd7;\n"
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n"
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r5, 0;\n"
" setp.le.s32 %p3, %r4, %r5;\n"
" @%p3 bra $Lt_1_22274;\n"
" .loc 16 253 0\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232;\n"
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
" add.u64 %rd15, %rd14, %rd9;\n"
" add.u64 %rd16, %rd9, %rd13;\n"
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n"
"$Lt_1_22274:\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232;\n"
"$Lt_1_21762:\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32648_34_non_const_lj13296;\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232;\n"
" .loc 16 263 0\n"
" mov.f32 %f10, 0f00000000; \n"
" mov.f32 %f11, %f10;\n"
" mov.f32 %f12, 0f00000000; \n"
" mov.f32 %f13, %f12;\n"
" mov.f32 %f14, 0f00000000; \n"
" mov.f32 %f15, %f14;\n"
" mov.f32 %f16, 0f00000000; \n"
" mov.f32 %f17, %f16;\n"
" mov.f32 %f18, 0f00000000; \n"
" mov.f32 %f19, %f18;\n"
" mov.f32 %f20, 0f00000000; \n"
" mov.f32 %f21, %f20;\n"
" .loc 16 265 0\n"
" bar.sync 0;\n"
" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
" div.s32 %r7, %r1, %r6;\n"
" cvt.s32.u32 %r8, %ntid.x;\n"
" div.s32 %r9, %r8, %r6;\n"
" rem.s32 %r10, %r1, %r6;\n"
" cvt.s32.u32 %r11, %ctaid.x;\n"
" mul.lo.s32 %r12, %r11, %r9;\n"
" add.s32 %r13, %r7, %r12;\n"
" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];\n"
" setp.lt.s32 %p4, %r13, %r14;\n"
" @!%p4 bra $Lt_1_23042;\n"
" .loc 16 271 0\n"
" ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
" cvt.s64.s32 %rd17, %r15;\n"
" mul.wide.s32 %rd18, %r15, 4;\n"
" cvt.s64.s32 %rd19, %r13;\n"
" mul.wide.s32 %rd20, %r13, 4;\n"
" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
" add.u64 %rd22, %rd20, %rd21;\n"
" add.u64 %rd23, %rd18, %rd22;\n"
" ld.global.s32 %r16, [%rd23+0];\n"
" add.u64 %rd24, %rd18, %rd23;\n"
" ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];\n"
" setp.ne.u64 %p5, %rd25, %rd21;\n"
" @%p5 bra $Lt_1_23554;\n"
" .loc 16 277 0\n"
" cvt.s32.s64 %r17, %rd17;\n"
" mul.lo.s32 %r18, %r17, %r16;\n"
" cvt.s64.s32 %rd26, %r18;\n"
" mul.wide.s32 %rd27, %r18, 4;\n"
" add.u64 %rd28, %rd24, %rd27;\n"
" .loc 16 278 0\n"
" mul.lo.s32 %r19, %r10, %r17;\n"
" cvt.s64.s32 %rd29, %r19;\n"
" mul.wide.s32 %rd30, %r19, 4;\n"
" add.u64 %rd31, %rd24, %rd30;\n"
" .loc 16 279 0\n"
" mul.lo.s32 %r20, %r17, %r6;\n"
" bra.uni $Lt_1_23298;\n"
"$Lt_1_23554:\n"
" .loc 16 281 0\n"
" ld.global.s32 %r21, [%rd24+0];\n"
" cvt.s64.s32 %rd32, %r21;\n"
" mul.wide.s32 %rd33, %r21, 4;\n"
" add.u64 %rd34, %rd25, %rd33;\n"
" .loc 16 282 0\n"
" cvt.s64.s32 %rd35, %r16;\n"
" mul.wide.s32 %rd36, %r16, 4;\n"
" add.u64 %rd28, %rd34, %rd36;\n"
" .loc 16 283 0\n"
" mov.s32 %r20, %r6;\n"
" .loc 16 284 0\n"
" cvt.s64.s32 %rd37, %r10;\n"
" mul.wide.s32 %rd38, %r10, 4;\n"
" add.u64 %rd31, %rd34, %rd38;\n"
"$Lt_1_23298:\n"
" .loc 16 287 0\n"
" ld.global.s32 %r22, [%rd22+0];\n"
" mov.u32 %r23, %r22;\n"
" mov.s32 %r24, 0;\n"
" mov.u32 %r25, %r24;\n"
" mov.s32 %r26, 0;\n"
" mov.u32 %r27, %r26;\n"
" mov.s32 %r28, 0;\n"
" mov.u32 %r29, %r28;\n"
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];\n"
" mov.f32 %f26, %f22;\n"
" mov.f32 %f27, %f23;\n"
" mov.f32 %f28, %f24;\n"
" mov.f32 %f29, %f25;\n"
" setp.ge.u64 %p6, %rd31, %rd28;\n"
" @%p6 bra $Lt_1_32002;\n"
" cvt.rzi.ftz.s32.f32 %r30, %f29;\n"
" cvt.s64.s32 %rd39, %r20;\n"
" mul.lo.s32 %r31, %r30, 11;\n"
" cvt.rn.f32.s32 %f30, %r31;\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
"$Lt_1_24322:\n"
" .loc 16 294 0\n"
" ld.global.s32 %r32, [%rd31+0];\n"
" .loc 16 295 0\n"
" shr.s32 %r33, %r32, 30;\n"
" and.b32 %r34, %r33, 3;\n"
" cvt.s64.s32 %rd40, %r34;\n"
" mul.wide.s32 %rd41, %r34, 4;\n"
" add.u64 %rd42, %rd1, %rd41;\n"
" ld.shared.f32 %f35, [%rd42+0];\n"
" .loc 16 298 0\n"
" and.b32 %r35, %r32, 1073741823;\n"
" mov.u32 %r36, %r35;\n"
" mov.s32 %r37, 0;\n"
" mov.u32 %r38, %r37;\n"
" mov.s32 %r39, 0;\n"
" mov.u32 %r40, %r39;\n"
" mov.s32 %r41, 0;\n"
" mov.u32 %r42, %r41;\n"
" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r36,%r38,%r40,%r42}];\n"
" mov.f32 %f40, %f36;\n"
" mov.f32 %f41, %f37;\n"
" mov.f32 %f42, %f38;\n"
" mov.f32 %f43, %f39;\n"
" sub.ftz.f32 %f44, %f27, %f41;\n"
" sub.ftz.f32 %f45, %f26, %f40;\n"
" sub.ftz.f32 %f46, %f28, %f42;\n"
" mul.ftz.f32 %f47, %f44, %f44;\n"
" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n"
" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n"
" add.ftz.f32 %f50, %f30, %f43;\n"
" cvt.rzi.ftz.s32.f32 %r43, %f50;\n"
" cvt.s64.s32 %rd43, %r43;\n"
" mul.wide.s32 %rd44, %r43, 16;\n"
" add.u64 %rd45, %rd44, %rd7;\n"
" ld.shared.f32 %f51, [%rd45+8];\n"
" setp.gt.ftz.f32 %p7, %f51, %f49;\n"
" @!%p7 bra $Lt_1_25602;\n"
" .loc 16 309 0\n"
" sqrt.approx.ftz.f32 %f52, %f49;\n"
" ld.shared.v4.f32 {%f53,%f54,_,%f55}, [%rd45+0];\n"
" sub.ftz.f32 %f56, %f52, %f55;\n"
" .loc 16 313 0\n"
" mul.ftz.f32 %f57, %f56, %f56;\n"
" cvt.ftz.f64.f32 %fd1, %f57;\n"
" rcp.rn.f64 %fd2, %fd1;\n"
" cvt.rn.ftz.f32.f64 %f58, %fd2;\n"
" mul.ftz.f32 %f59, %f58, %f58;\n"
" mul.ftz.f32 %f60, %f58, %f59;\n"
" mul.ftz.f32 %f61, %f53, %f60;\n"
" sub.ftz.f32 %f62, %f61, %f54;\n"
" mul.ftz.f32 %f63, %f60, %f62;\n"
" .loc 16 314 0\n"
" div.approx.ftz.f32 %f64, %f35, %f56;\n"
" div.approx.ftz.f32 %f65, %f64, %f52;\n"
" mul.ftz.f32 %f66, %f63, %f65;\n"
" .loc 16 316 0\n"
" fma.rn.ftz.f32 %f33, %f45, %f66, %f33;\n"
" .loc 16 317 0\n"
" fma.rn.ftz.f32 %f32, %f44, %f66, %f32;\n"
" .loc 16 318 0\n"
" fma.rn.ftz.f32 %f31, %f46, %f66, %f31;\n"
" ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r45, 0;\n"
" setp.le.s32 %p8, %r44, %r45;\n"
" @%p8 bra $Lt_1_25090;\n"
" .loc 16 321 0\n"
" add.u64 %rd46, %rd44, %rd13;\n"
" ld.shared.v4.f32 {%f67,%f68,%f69,_}, [%rd46+0];\n"
" mul.ftz.f32 %f70, %f67, %f60;\n"
" sub.ftz.f32 %f71, %f70, %f68;\n"
" mul.ftz.f32 %f72, %f60, %f71;\n"
" .loc 16 322 0\n"
" sub.ftz.f32 %f73, %f72, %f69;\n"
" fma.rn.ftz.f32 %f34, %f35, %f73, %f34;\n"
"$Lt_1_25090:\n"
" ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r47, 0;\n"
" setp.le.s32 %p9, %r46, %r47;\n"
" @%p9 bra $Lt_1_25602;\n"
" .loc 16 325 0\n"
" mov.f32 %f74, %f11;\n"
" mul.ftz.f32 %f75, %f45, %f45;\n"
" fma.rn.ftz.f32 %f76, %f66, %f75, %f74;\n"
" mov.f32 %f11, %f76;\n"
" .loc 16 326 0\n"
" mov.f32 %f77, %f13;\n"
" fma.rn.ftz.f32 %f78, %f66, %f47, %f77;\n"
" mov.f32 %f13, %f78;\n"
" .loc 16 327 0\n"
" mov.f32 %f79, %f15;\n"
" mul.ftz.f32 %f80, %f46, %f46;\n"
" fma.rn.ftz.f32 %f81, %f66, %f80, %f79;\n"
" mov.f32 %f15, %f81;\n"
" .loc 16 328 0\n"
" mov.f32 %f82, %f17;\n"
" mul.ftz.f32 %f83, %f44, %f45;\n"
" fma.rn.ftz.f32 %f84, %f66, %f83, %f82;\n"
" mov.f32 %f17, %f84;\n"
" .loc 16 329 0\n"
" mov.f32 %f85, %f19;\n"
" mul.ftz.f32 %f86, %f45, %f46;\n"
" fma.rn.ftz.f32 %f87, %f66, %f86, %f85;\n"
" mov.f32 %f19, %f87;\n"
" .loc 16 330 0\n"
" mul.ftz.f32 %f88, %f44, %f46;\n"
" fma.rn.ftz.f32 %f20, %f66, %f88, %f20;\n"
" mov.f32 %f21, %f20;\n"
"$Lt_1_25602:\n"
"$Lt_1_24578:\n"
" .loc 16 292 0\n"
" mul.lo.u64 %rd47, %rd39, 4;\n"
" add.u64 %rd31, %rd31, %rd47;\n"
" setp.lt.u64 %p10, %rd31, %rd28;\n"
" @%p10 bra $Lt_1_24322;\n"
" bra.uni $Lt_1_22786;\n"
"$Lt_1_32002:\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
" bra.uni $Lt_1_22786;\n"
"$Lt_1_23042:\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
"$Lt_1_22786:\n"
" mov.u32 %r48, 1;\n"
" setp.le.s32 %p11, %r6, %r48;\n"
" @%p11 bra $Lt_1_28418;\n"
" .loc 16 341 0\n"
" mov.u64 %rd48, __cuda___cuda_local_var_32742_35_non_const_red_acc7168;\n"
" cvt.s64.s32 %rd49, %r1;\n"
" mul.wide.s32 %rd50, %r1, 4;\n"
" add.u64 %rd51, %rd48, %rd50;\n"
" mov.f32 %f89, %f33;\n"
" st.shared.f32 [%rd51+0], %f89;\n"
" .loc 16 342 0\n"
" mov.f32 %f90, %f32;\n"
" st.shared.f32 [%rd51+512], %f90;\n"
" .loc 16 343 0\n"
" mov.f32 %f91, %f31;\n"
" st.shared.f32 [%rd51+1024], %f91;\n"
" .loc 16 344 0\n"
" mov.f32 %f92, %f34;\n"
" st.shared.f32 [%rd51+1536], %f92;\n"
" .loc 16 346 0\n"
" shr.s32 %r49, %r6, 31;\n"
" mov.s32 %r50, 1;\n"
" and.b32 %r51, %r49, %r50;\n"
" add.s32 %r52, %r51, %r6;\n"
" shr.s32 %r53, %r52, 1;\n"
" mov.s32 %r54, %r53;\n"
" mov.u32 %r55, 0;\n"
" setp.ne.u32 %p12, %r53, %r55;\n"
" @!%p12 bra $Lt_1_26882;\n"
"$Lt_1_27394:\n"
" setp.ge.u32 %p13, %r10, %r54;\n"
" @%p13 bra $Lt_1_27650;\n"
" .loc 16 349 0\n"
" add.u32 %r56, %r1, %r54;\n"
" cvt.u64.u32 %rd52, %r56;\n"
" mul.wide.u32 %rd53, %r56, 4;\n"
" add.u64 %rd54, %rd48, %rd53;\n"
" ld.shared.f32 %f93, [%rd54+0];\n"
" add.ftz.f32 %f89, %f93, %f89;\n"
" st.shared.f32 [%rd51+0], %f89;\n"
" ld.shared.f32 %f94, [%rd54+512];\n"
" add.ftz.f32 %f90, %f94, %f90;\n"
" st.shared.f32 [%rd51+512], %f90;\n"
" ld.shared.f32 %f95, [%rd54+1024];\n"
" add.ftz.f32 %f91, %f95, %f91;\n"
" st.shared.f32 [%rd51+1024], %f91;\n"
" ld.shared.f32 %f96, [%rd54+1536];\n"
" add.ftz.f32 %f92, %f96, %f92;\n"
" st.shared.f32 [%rd51+1536], %f92;\n"
"$Lt_1_27650:\n"
" .loc 16 346 0\n"
" shr.u32 %r54, %r54, 1;\n"
" mov.u32 %r57, 0;\n"
" setp.ne.u32 %p14, %r54, %r57;\n"
" @%p14 bra $Lt_1_27394;\n"
"$Lt_1_26882:\n"
" .loc 16 353 0\n"
" mov.f32 %f33, %f89;\n"
" .loc 16 354 0\n"
" mov.f32 %f32, %f90;\n"
" .loc 16 355 0\n"
" mov.f32 %f31, %f91;\n"
" .loc 16 356 0\n"
" mov.f32 %f34, %f92;\n"
" ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r59, 0;\n"
" setp.le.s32 %p15, %r58, %r59;\n"
" @%p15 bra $Lt_1_28418;\n"
" .loc 16 360 0\n"
" mov.f32 %f89, %f11;\n"
" st.shared.f32 [%rd51+0], %f89;\n"
" mov.f32 %f90, %f13;\n"
" st.shared.f32 [%rd51+512], %f90;\n"
" mov.f32 %f91, %f15;\n"
" st.shared.f32 [%rd51+1024], %f91;\n"
" mov.f32 %f92, %f17;\n"
" st.shared.f32 [%rd51+1536], %f92;\n"
" mov.f32 %f97, %f19;\n"
" st.shared.f32 [%rd51+2048], %f97;\n"
" mov.f32 %f98, %f21;\n"
" st.shared.f32 [%rd51+2560], %f98;\n"
" .loc 16 362 0\n"
" mov.s32 %r60, %r53;\n"
" @!%p12 bra $Lt_1_28930;\n"
"$Lt_1_29442:\n"
" setp.ge.u32 %p16, %r10, %r60;\n"
" @%p16 bra $Lt_1_29698;\n"
" .loc 16 365 0\n"
" add.u32 %r61, %r1, %r60;\n"
" cvt.u64.u32 %rd55, %r61;\n"
" mul.wide.u32 %rd56, %r61, 4;\n"
" add.u64 %rd57, %rd48, %rd56;\n"
" ld.shared.f32 %f99, [%rd57+0];\n"
" add.ftz.f32 %f89, %f99, %f89;\n"
" st.shared.f32 [%rd51+0], %f89;\n"
" ld.shared.f32 %f100, [%rd57+512];\n"
" add.ftz.f32 %f90, %f100, %f90;\n"
" st.shared.f32 [%rd51+512], %f90;\n"
" ld.shared.f32 %f101, [%rd57+1024];\n"
" add.ftz.f32 %f91, %f101, %f91;\n"
" st.shared.f32 [%rd51+1024], %f91;\n"
" ld.shared.f32 %f102, [%rd57+1536];\n"
" add.ftz.f32 %f92, %f102, %f92;\n"
" st.shared.f32 [%rd51+1536], %f92;\n"
" ld.shared.f32 %f103, [%rd57+2048];\n"
" add.ftz.f32 %f97, %f103, %f97;\n"
" st.shared.f32 [%rd51+2048], %f97;\n"
" ld.shared.f32 %f104, [%rd57+2560];\n"
" add.ftz.f32 %f98, %f104, %f98;\n"
" st.shared.f32 [%rd51+2560], %f98;\n"
"$Lt_1_29698:\n"
" .loc 16 362 0\n"
" shr.u32 %r60, %r60, 1;\n"
" mov.u32 %r62, 0;\n"
" setp.ne.u32 %p17, %r60, %r62;\n"
" @%p17 bra $Lt_1_29442;\n"
"$Lt_1_28930:\n"
" .loc 16 370 0\n"
" mov.f32 %f11, %f89;\n"
" mov.f32 %f13, %f90;\n"
" mov.f32 %f15, %f91;\n"
" mov.f32 %f17, %f92;\n"
" mov.f32 %f19, %f97;\n"
" mov.f32 %f21, %f98;\n"
"$Lt_1_28418:\n"
"$Lt_1_26370:\n"
" selp.s32 %r63, 1, 0, %p4;\n"
" mov.s32 %r64, 0;\n"
" set.eq.u32.s32 %r65, %r10, %r64;\n"
" neg.s32 %r66, %r65;\n"
" and.b32 %r67, %r63, %r66;\n"
" mov.u32 %r68, 0;\n"
" setp.eq.s32 %p18, %r67, %r68;\n"
" @%p18 bra $Lt_1_30466;\n"
" .loc 16 376 0\n"
" cvt.s64.s32 %rd58, %r13;\n"
" ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];\n"
" mul.wide.s32 %rd60, %r13, 4;\n"
" add.u64 %rd61, %rd59, %rd60;\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p19, %r69, %r70;\n"
" @%p19 bra $Lt_1_30978;\n"
" .loc 16 378 0\n"
" st.global.f32 [%rd61+0], %f34;\n"
" .loc 16 379 0\n"
" cvt.s64.s32 %rd62, %r14;\n"
" mul.wide.s32 %rd63, %r14, 4;\n"
" add.u64 %rd61, %rd61, %rd63;\n"
"$Lt_1_30978:\n"
" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r72, 0;\n"
" setp.le.s32 %p20, %r71, %r72;\n"
" @%p20 bra $Lt_1_31490;\n"
" .loc 16 383 0\n"
" mov.f32 %f105, %f11;\n"
" st.global.f32 [%rd61+0], %f105;\n"
" .loc 16 384 0\n"
" cvt.s64.s32 %rd64, %r14;\n"
" mul.wide.s32 %rd65, %r14, 4;\n"
" add.u64 %rd66, %rd65, %rd61;\n"
" .loc 16 383 0\n"
" mov.f32 %f106, %f13;\n"
" st.global.f32 [%rd66+0], %f106;\n"
" .loc 16 384 0\n"
" add.u64 %rd67, %rd65, %rd66;\n"
" .loc 16 383 0\n"
" mov.f32 %f107, %f15;\n"
" st.global.f32 [%rd67+0], %f107;\n"
" .loc 16 384 0\n"
" add.u64 %rd68, %rd65, %rd67;\n"
" .loc 16 383 0\n"
" mov.f32 %f108, %f17;\n"
" st.global.f32 [%rd68+0], %f108;\n"
" .loc 16 384 0\n"
" add.u64 %rd61, %rd65, %rd68;\n"
" .loc 16 383 0\n"
" mov.f32 %f109, %f19;\n"
" st.global.f32 [%rd61+0], %f109;\n"
" mov.f32 %f110, %f21;\n"
" add.u64 %rd69, %rd65, %rd61;\n"
" st.global.f32 [%rd69+0], %f110;\n"
"$Lt_1_31490:\n"
" .loc 16 387 0\n"
" ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];\n"
" mul.lo.u64 %rd71, %rd58, 16;\n"
" add.u64 %rd72, %rd70, %rd71;\n"
" mov.f32 %f111, %f112;\n"
" st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f111};\n"
"$Lt_1_30466:\n"
" .loc 16 389 0\n"
" exit;\n"
"$LDWend_kernel_pair_fast:\n"
" }\n"
;

Some files were not shown because too many files have changed in this diff.
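
Note on how these headers are consumed: each *_gpu_ptx.h file in lib/gpu embeds the nvcc-generated PTX for one pair style as a single C string (e.g. lj_expand_gpu_kernel above), so the kernels can be JIT-compiled for the running device instead of shipping prebuilt binaries. Below is a minimal sketch of loading such a string with the raw CUDA driver API; the library itself routes this through its geryon (UCL) wrapper, and the error handling here is illustrative only, not the library's actual code path:

    #include <cuda.h>
    #include <stdio.h>

    extern const char *lj_expand_gpu_kernel;   /* from lib/gpu/lj_expand_gpu_ptx.h */

    int main(void) {
      CUdevice dev; CUcontext ctx; CUmodule mod; CUfunction fn;
      cuInit(0);
      cuDeviceGet(&dev, 0);
      cuCtxCreate(&ctx, 0, dev);
      /* JIT-compile the embedded PTX for the current device */
      if (cuModuleLoadData(&mod, lj_expand_gpu_kernel) != CUDA_SUCCESS) {
        fprintf(stderr, "PTX load failed\n");
        return 1;
      }
      /* the PTX defines two entry points: kernel_pair and kernel_pair_fast */
      cuModuleGetFunction(&fn, mod, "kernel_pair");
      /* ... set up kernel arguments and launch with cuLaunchKernel ... */
      cuModuleUnload(mod);
      cuCtxDestroy(ctx);
      return 0;
    }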