diff --git a/lib/gpu/atom.ptx b/lib/gpu/atom.ptx deleted file mode 100644 index 7d73b0501f..0000000000 --- a/lib/gpu/atom.ptx +++ /dev/null @@ -1,101 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_000099dd_00000000-9_lal_atom.cpp3.i (/home/sjplimp/ccBI#.Q6OzuV) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_000099dd_00000000-8_lal_atom.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_atom.cu" - .file 17 "/usr/local/cuda/include/common_functions.h" - .file 18 "/usr/local/cuda/include/math_functions.h" - .file 19 "/usr/local/cuda/include/math_constants.h" - .file 20 "/usr/local/cuda/include/device_functions.h" - .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - - .entry kernel_cast_x ( - .param .u64 __cudaparm_kernel_cast_x_x_type, - .param .u64 __cudaparm_kernel_cast_x_x, - .param .u64 __cudaparm_kernel_cast_x_type, - .param .s32 __cudaparm_kernel_cast_x_nall) - { - .reg .u32 %r<10>; - .reg .u64 %rd<13>; - .reg .f32 %f<6>; - .reg .f64 %fd<5>; - .reg .pred %p<3>; - .loc 16 21 0 -$LDWbegin_kernel_cast_x: - cvt.s32.u32 %r1, %ctaid.x; - cvt.s32.u32 %r2, %ntid.x; - mul24.lo.s32 %r3, %r1, %r2; - mov.u32 %r4, %tid.x; - add.u32 %r5, %r3, %r4; - ld.param.s32 %r6, [__cudaparm_kernel_cast_x_nall]; - setp.le.s32 %p1, %r6, %r5; - @%p1 bra $Lt_0_1026; - .loc 16 26 0 - cvt.s64.s32 %rd1, %r5; - ld.param.u64 %rd2, [__cudaparm_kernel_cast_x_type]; - mul.wide.s32 %rd3, %r5, 4; - add.u64 %rd4, %rd2, %rd3; - ld.global.s32 %r7, [%rd4+0]; - cvt.rn.f32.s32 %f1, %r7; - .loc 16 29 0 - ld.param.u64 %rd5, [__cudaparm_kernel_cast_x_x]; - mul.lo.s32 %r8, %r5, 3; - cvt.s64.s32 %rd6, %r8; - mul.wide.s32 %rd7, %r8, 8; - add.u64 %rd8, %rd5, %rd7; - ld.global.f64 %fd1, [%rd8+8]; - cvt.rn.ftz.f32.f64 %f2, %fd1; - .loc 16 30 0 - ld.global.f64 %fd2, [%rd8+16]; - cvt.rn.ftz.f32.f64 %f3, %fd2; - .loc 16 31 0 - ld.param.u64 %rd9, [__cudaparm_kernel_cast_x_x_type]; - mul.wide.s32 %rd10, %r5, 16; - add.u64 %rd11, %rd9, %rd10; - ld.global.f64 %fd3, [%rd8+0]; - cvt.rn.ftz.f32.f64 %f4, %fd3; - st.global.v4.f32 [%rd11+0], {%f4,%f2,%f3,%f1}; -$Lt_0_1026: - .loc 16 33 0 - exit; -$LDWend_kernel_cast_x: - } // kernel_cast_x - diff --git a/lib/gpu/atom_ptx.h b/lib/gpu/atom_ptx.h deleted file mode 100644 index 1d69622a18..0000000000 --- a/lib/gpu/atom_ptx.h +++ /dev/null @@ -1,56 +0,0 @@ -const char * atom = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .entry kernel_cast_x (\n" -" .param .u64 __cudaparm_kernel_cast_x_x_type,\n" -" .param .u64 __cudaparm_kernel_cast_x_x,\n" -" .param .u64 __cudaparm_kernel_cast_x_type,\n" -" .param .s32 __cudaparm_kernel_cast_x_nall)\n" -" {\n" -" .reg .u32 %r<10>;\n" -" .reg .u64 %rd<13>;\n" -" .reg .f32 %f<6>;\n" -" .reg .f64 %fd<5>;\n" -" .reg .pred %p<3>;\n" -" .loc 16 21 0\n" -"$LDWbegin_kernel_cast_x:\n" -" cvt.s32.u32 %r1, %ctaid.x;\n" -" cvt.s32.u32 %r2, %ntid.x;\n" -" mul24.lo.s32 %r3, %r1, %r2;\n" -" mov.u32 %r4, %tid.x;\n" -" add.u32 %r5, %r3, %r4;\n" -" ld.param.s32 %r6, [__cudaparm_kernel_cast_x_nall];\n" -" setp.le.s32 %p1, %r6, %r5;\n" -" @%p1 bra $Lt_0_1026;\n" -" .loc 16 26 0\n" -" cvt.s64.s32 %rd1, %r5;\n" -" ld.param.u64 %rd2, [__cudaparm_kernel_cast_x_type];\n" -" mul.wide.s32 %rd3, %r5, 4;\n" -" add.u64 %rd4, %rd2, %rd3;\n" -" ld.global.s32 %r7, [%rd4+0];\n" -" cvt.rn.f32.s32 %f1, %r7;\n" -" .loc 16 29 0\n" -" ld.param.u64 %rd5, [__cudaparm_kernel_cast_x_x];\n" -" mul.lo.s32 %r8, %r5, 3;\n" -" cvt.s64.s32 %rd6, %r8;\n" -" mul.wide.s32 %rd7, %r8, 8;\n" -" add.u64 %rd8, %rd5, %rd7;\n" -" ld.global.f64 %fd1, [%rd8+8];\n" -" cvt.rn.ftz.f32.f64 %f2, %fd1;\n" -" .loc 16 30 0\n" -" ld.global.f64 %fd2, [%rd8+16];\n" -" cvt.rn.ftz.f32.f64 %f3, %fd2;\n" -" .loc 16 31 0\n" -" ld.param.u64 %rd9, [__cudaparm_kernel_cast_x_x_type];\n" -" mul.wide.s32 %rd10, %r5, 16;\n" -" add.u64 %rd11, %rd9, %rd10;\n" -" ld.global.f64 %fd3, [%rd8+0];\n" -" cvt.rn.ftz.f32.f64 %f4, %fd3;\n" -" st.global.v4.f32 [%rd11+0], {%f4,%f2,%f3,%f1};\n" -"$Lt_0_1026:\n" -" .loc 16 33 0\n" -" exit;\n" -"$LDWend_kernel_cast_x:\n" -" }\n" -; diff --git a/lib/gpu/cg_cmm.ptx b/lib/gpu/cg_cmm.ptx deleted file mode 100644 index cffb2c0e97..0000000000 --- a/lib/gpu/cg_cmm.ptx +++ /dev/null @@ -1,958 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009eb0_00000000-9_lal_cg_cmm.cpp3.i (/home/sjplimp/ccBI#.oK8Qzh) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009eb0_00000000-8_lal_cg_cmm.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_cg_cmm.cu" - .file 17 "/usr/local/cuda/include/common_functions.h" - .file 18 "/usr/local/cuda/include/math_functions.h" - .file 19 "/usr/local/cuda/include/math_constants.h" - .file 20 "/usr/local/cuda/include/device_functions.h" - .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - .global .texref pos_tex; - - .entry kernel_pair ( - .param .u64 __cudaparm_kernel_pair_x_, - .param .u64 __cudaparm_kernel_pair_lj1, - .param .u64 __cudaparm_kernel_pair_lj3, - .param .s32 __cudaparm_kernel_pair_lj_types, - .param .u64 __cudaparm_kernel_pair_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_dev_nbor, - .param .u64 __cudaparm_kernel_pair_dev_packed, - .param .u64 __cudaparm_kernel_pair_ans, - .param .u64 __cudaparm_kernel_pair___val_paramengv, - .param .s32 __cudaparm_kernel_pair_eflag, - .param .s32 __cudaparm_kernel_pair_vflag, - .param .s32 __cudaparm_kernel_pair_inum, - .param .s32 __cudaparm_kernel_pair_nbor_pitch, - .param .s32 __cudaparm_kernel_pair_t_per_atom) - { - .reg .u32 %r<72>; - .reg .u64 %rd<63>; - .reg .f32 %f<111>; - .reg .pred %p<21>; - .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16]; - .shared .align 4 .b8 __cuda___cuda_local_var_32608_55_non_const_red_acc108[3072]; - // __cuda_local_var_32543_10_non_const_f = 48 - // __cuda_local_var_32545_9_non_const_virial = 16 - .loc 16 31 0 -$LDWbegin_kernel_pair: - .loc 16 36 0 - ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; - ldu.global.f32 %f1, [%rd1+0]; - .loc 16 37 0 - ld.global.f32 %f2, [%rd1+4]; - .loc 16 38 0 - ld.global.f32 %f3, [%rd1+8]; - .loc 16 39 0 - ld.global.f32 %f4, [%rd1+12]; - st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4}; - .loc 16 46 0 - mov.f32 %f5, 0f00000000; // 0 - mov.f32 %f6, %f5; - mov.f32 %f7, 0f00000000; // 0 - mov.f32 %f8, %f7; - mov.f32 %f9, 0f00000000; // 0 - mov.f32 %f10, %f9; - mov.f32 %f11, 0f00000000; // 0 - mov.f32 %f12, %f11; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, %f13; - mov.f32 %f15, 0f00000000; // 0 - mov.f32 %f16, %f15; - ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_pair_inum]; - setp.ge.s32 %p1, %r8, %r9; - @%p1 bra $Lt_0_28930; - .loc 16 51 0 - ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch]; - cvt.s64.s32 %rd2, %r10; - mul.wide.s32 %rd3, %r10, 4; - cvt.s64.s32 %rd4, %r8; - mul.wide.s32 %rd5, %r8, 4; - ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor]; - add.u64 %rd7, %rd5, %rd6; - add.u64 %rd8, %rd3, %rd7; - ld.global.s32 %r11, [%rd8+0]; - sub.s32 %r12, %r1, 1; - and.b32 %r13, %r12, %r2; - cvt.s64.s32 %rd9, %r13; - mul.wide.s32 %rd10, %r13, 4; - ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed]; - setp.ne.u64 %p2, %rd11, %rd6; - @%p2 bra $Lt_0_20994; - cvt.s32.s64 %r14, %rd2; - mul.lo.s32 %r15, %r14, %r1; - mov.s32 %r16, %r15; - mul.lo.s32 %r17, %r12, %r8; - add.s32 %r18, %r14, %r17; - cvt.s64.s32 %rd12, %r18; - mul.wide.s32 %rd13, %r18, 4; - add.u64 %rd14, %rd8, %rd13; - and.b32 %r19, %r12, %r11; - cvt.s64.s32 %rd15, %r19; - div.s32 %r20, %r11, %r1; - mul.lo.s32 %r21, %r15, %r20; - cvt.s64.s32 %rd16, %r21; - add.u64 %rd17, %rd15, %rd16; - mul.lo.u64 %rd18, %rd17, 4; - add.u64 %rd19, %rd14, %rd18; - add.u64 %rd20, %rd10, %rd14; - bra.uni $Lt_0_20738; -$Lt_0_20994: - add.u64 %rd21, %rd3, %rd8; - ld.global.s32 %r22, [%rd21+0]; - cvt.s64.s32 %rd22, %r22; - mul.wide.s32 %rd23, %r22, 4; - add.u64 %rd24, %rd11, %rd23; - cvt.s64.s32 %rd25, %r11; - mul.wide.s32 %rd26, %r11, 4; - add.u64 %rd19, %rd24, %rd26; - mov.s32 %r16, %r1; - add.u64 %rd20, %rd10, %rd24; -$Lt_0_20738: - .loc 16 54 0 - ld.global.s32 %r23, [%rd7+0]; - mov.u32 %r24, %r23; - mov.s32 %r25, 0; - mov.u32 %r26, %r25; - mov.s32 %r27, 0; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}]; - mov.f32 %f21, %f17; - mov.f32 %f22, %f18; - mov.f32 %f23, %f19; - mov.f32 %f24, %f20; - setp.ge.u64 %p3, %rd20, %rd19; - @%p3 bra $Lt_0_30466; - cvt.rzi.ftz.s32.f32 %r31, %f24; - cvt.s64.s32 %rd27, %r16; - ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types]; - mul.lo.s32 %r33, %r32, %r31; - ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1]; - mov.f32 %f25, 0f00000000; // 0 - mov.f32 %f26, 0f00000000; // 0 - mov.f32 %f27, 0f00000000; // 0 - mov.f32 %f28, 0f00000000; // 0 - mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92; -$Lt_0_21762: - // Loop body line 54, nesting depth: 1, estimated iterations: unknown - .loc 16 60 0 - ld.global.s32 %r34, [%rd20+0]; - .loc 16 61 0 - shr.s32 %r35, %r34, 30; - and.b32 %r36, %r35, 3; - cvt.s64.s32 %rd30, %r36; - mul.wide.s32 %rd31, %r36, 4; - add.u64 %rd32, %rd29, %rd31; - ld.shared.f32 %f29, [%rd32+0]; - .loc 16 64 0 - and.b32 %r37, %r34, 1073741823; - mov.u32 %r38, %r37; - mov.s32 %r39, 0; - mov.u32 %r40, %r39; - mov.s32 %r41, 0; - mov.u32 %r42, %r41; - mov.s32 %r43, 0; - mov.u32 %r44, %r43; - tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}]; - mov.f32 %f34, %f30; - mov.f32 %f35, %f31; - mov.f32 %f36, %f32; - mov.f32 %f37, %f33; - cvt.rzi.ftz.s32.f32 %r45, %f37; - sub.ftz.f32 %f38, %f22, %f35; - sub.ftz.f32 %f39, %f21, %f34; - sub.ftz.f32 %f40, %f23, %f36; - mul.ftz.f32 %f41, %f38, %f38; - fma.rn.ftz.f32 %f42, %f39, %f39, %f41; - fma.rn.ftz.f32 %f43, %f40, %f40, %f42; - add.s32 %r46, %r45, %r33; - cvt.s64.s32 %rd33, %r46; - mul.wide.s32 %rd34, %r46, 16; - add.u64 %rd35, %rd34, %rd28; - ld.global.f32 %f44, [%rd35+0]; - setp.gt.ftz.f32 %p4, %f44, %f43; - @!%p4 bra $Lt_0_24066; - rcp.approx.ftz.f32 %f45, %f43; - ld.global.f32 %f46, [%rd35+4]; - mov.f32 %f47, 0f40000000; // 2 - setp.eq.ftz.f32 %p5, %f46, %f47; - @!%p5 bra $Lt_0_22786; - .loc 16 79 0 - mul.ftz.f32 %f48, %f45, %f45; - mov.f32 %f49, %f48; - .loc 16 80 0 - mul.ftz.f32 %f50, %f48, %f48; - bra.uni $Lt_0_23042; -$Lt_0_22786: - mov.f32 %f51, 0f3f800000; // 1 - setp.eq.ftz.f32 %p6, %f46, %f51; - @!%p6 bra $Lt_0_23298; - .loc 16 82 0 - sqrt.approx.ftz.f32 %f52, %f45; - mul.ftz.f32 %f53, %f45, %f52; - mov.f32 %f50, %f53; - .loc 16 83 0 - mul.ftz.f32 %f49, %f53, %f53; - bra.uni $Lt_0_23042; -$Lt_0_23298: - .loc 16 85 0 - mul.ftz.f32 %f54, %f45, %f45; - mul.ftz.f32 %f55, %f45, %f54; - mov.f32 %f49, %f55; - .loc 16 86 0 - mov.f32 %f50, %f55; -$Lt_0_23042: -$Lt_0_22530: - .loc 16 88 0 - mul.ftz.f32 %f56, %f45, %f29; - mul.ftz.f32 %f57, %f49, %f56; - ld.global.v2.f32 {%f58,%f59}, [%rd35+8]; - mul.ftz.f32 %f60, %f58, %f50; - sub.ftz.f32 %f61, %f60, %f59; - mul.ftz.f32 %f62, %f57, %f61; - .loc 16 90 0 - fma.rn.ftz.f32 %f27, %f39, %f62, %f27; - .loc 16 91 0 - fma.rn.ftz.f32 %f26, %f38, %f62, %f26; - .loc 16 92 0 - fma.rn.ftz.f32 %f25, %f40, %f62, %f25; - ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r48, 0; - setp.le.s32 %p7, %r47, %r48; - @%p7 bra $Lt_0_23554; - .loc 16 94 0 - ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3]; - add.u64 %rd37, %rd36, %rd34; - ld.global.v4.f32 {%f63,%f64,%f65,_}, [%rd37+0]; - mul.ftz.f32 %f66, %f29, %f49; - mul.ftz.f32 %f67, %f63, %f50; - sub.ftz.f32 %f68, %f67, %f64; - mul.ftz.f32 %f69, %f66, %f68; - sub.ftz.f32 %f70, %f69, %f65; - add.ftz.f32 %f28, %f28, %f70; -$Lt_0_23554: - ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r50, 0; - setp.le.s32 %p8, %r49, %r50; - @%p8 bra $Lt_0_24066; - .loc 16 97 0 - mov.f32 %f71, %f6; - mul.ftz.f32 %f72, %f39, %f39; - fma.rn.ftz.f32 %f73, %f62, %f72, %f71; - mov.f32 %f6, %f73; - .loc 16 98 0 - mov.f32 %f74, %f8; - fma.rn.ftz.f32 %f75, %f62, %f41, %f74; - mov.f32 %f8, %f75; - .loc 16 99 0 - mov.f32 %f76, %f10; - mul.ftz.f32 %f77, %f40, %f40; - fma.rn.ftz.f32 %f78, %f62, %f77, %f76; - mov.f32 %f10, %f78; - .loc 16 100 0 - mov.f32 %f79, %f12; - mul.ftz.f32 %f80, %f38, %f39; - fma.rn.ftz.f32 %f81, %f62, %f80, %f79; - mov.f32 %f12, %f81; - .loc 16 101 0 - mov.f32 %f82, %f14; - mul.ftz.f32 %f83, %f39, %f40; - fma.rn.ftz.f32 %f84, %f62, %f83, %f82; - mov.f32 %f14, %f84; - .loc 16 102 0 - mul.ftz.f32 %f85, %f38, %f40; - fma.rn.ftz.f32 %f15, %f62, %f85, %f15; - mov.f32 %f16, %f15; -$Lt_0_24066: -$Lt_0_22018: - .loc 16 58 0 - mul.lo.u64 %rd38, %rd27, 4; - add.u64 %rd20, %rd20, %rd38; - setp.lt.u64 %p9, %rd20, %rd19; - @%p9 bra $Lt_0_21762; - bra.uni $Lt_0_21250; -$Lt_0_30466: - mov.f32 %f25, 0f00000000; // 0 - mov.f32 %f26, 0f00000000; // 0 - mov.f32 %f27, 0f00000000; // 0 - mov.f32 %f28, 0f00000000; // 0 -$Lt_0_21250: - mov.u32 %r51, 1; - setp.le.s32 %p10, %r1, %r51; - @%p10 bra $Lt_0_26882; - .loc 16 107 0 - mov.u64 %rd39, __cuda___cuda_local_var_32608_55_non_const_red_acc108; - cvt.s64.s32 %rd40, %r2; - mul.wide.s32 %rd41, %r2, 4; - add.u64 %rd42, %rd39, %rd41; - mov.f32 %f86, %f27; - st.shared.f32 [%rd42+0], %f86; - mov.f32 %f87, %f26; - st.shared.f32 [%rd42+512], %f87; - mov.f32 %f88, %f25; - st.shared.f32 [%rd42+1024], %f88; - mov.f32 %f89, %f28; - st.shared.f32 [%rd42+1536], %f89; - shr.s32 %r52, %r1, 31; - mov.s32 %r53, 1; - and.b32 %r54, %r52, %r53; - add.s32 %r55, %r54, %r1; - shr.s32 %r56, %r55, 1; - mov.s32 %r57, %r56; - mov.u32 %r58, 0; - setp.ne.u32 %p11, %r56, %r58; - @!%p11 bra $Lt_0_25346; -$Lt_0_25858: - setp.ge.u32 %p12, %r13, %r57; - @%p12 bra $Lt_0_26114; - add.u32 %r59, %r2, %r57; - cvt.u64.u32 %rd43, %r59; - mul.wide.u32 %rd44, %r59, 4; - add.u64 %rd45, %rd39, %rd44; - ld.shared.f32 %f90, [%rd45+0]; - add.ftz.f32 %f86, %f90, %f86; - st.shared.f32 [%rd42+0], %f86; - ld.shared.f32 %f91, [%rd45+512]; - add.ftz.f32 %f87, %f91, %f87; - st.shared.f32 [%rd42+512], %f87; - ld.shared.f32 %f92, [%rd45+1024]; - add.ftz.f32 %f88, %f92, %f88; - st.shared.f32 [%rd42+1024], %f88; - ld.shared.f32 %f93, [%rd45+1536]; - add.ftz.f32 %f89, %f93, %f89; - st.shared.f32 [%rd42+1536], %f89; -$Lt_0_26114: - shr.u32 %r57, %r57, 1; - mov.u32 %r60, 0; - setp.ne.u32 %p13, %r57, %r60; - @%p13 bra $Lt_0_25858; -$Lt_0_25346: - mov.f32 %f27, %f86; - mov.f32 %f26, %f87; - mov.f32 %f25, %f88; - mov.f32 %f28, %f89; - ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r62, 0; - setp.le.s32 %p14, %r61, %r62; - @%p14 bra $Lt_0_26882; - mov.f32 %f86, %f6; - st.shared.f32 [%rd42+0], %f86; - mov.f32 %f87, %f8; - st.shared.f32 [%rd42+512], %f87; - mov.f32 %f88, %f10; - st.shared.f32 [%rd42+1024], %f88; - mov.f32 %f89, %f12; - st.shared.f32 [%rd42+1536], %f89; - mov.f32 %f94, %f14; - st.shared.f32 [%rd42+2048], %f94; - mov.f32 %f95, %f15; - st.shared.f32 [%rd42+2560], %f95; - mov.s32 %r63, %r56; - @!%p11 bra $Lt_0_27394; -$Lt_0_27906: - setp.ge.u32 %p15, %r13, %r63; - @%p15 bra $Lt_0_28162; - add.u32 %r64, %r2, %r63; - cvt.u64.u32 %rd46, %r64; - mul.wide.u32 %rd47, %r64, 4; - add.u64 %rd48, %rd39, %rd47; - ld.shared.f32 %f96, [%rd48+0]; - add.ftz.f32 %f86, %f96, %f86; - st.shared.f32 [%rd42+0], %f86; - ld.shared.f32 %f97, [%rd48+512]; - add.ftz.f32 %f87, %f97, %f87; - st.shared.f32 [%rd42+512], %f87; - ld.shared.f32 %f98, [%rd48+1024]; - add.ftz.f32 %f88, %f98, %f88; - st.shared.f32 [%rd42+1024], %f88; - ld.shared.f32 %f99, [%rd48+1536]; - add.ftz.f32 %f89, %f99, %f89; - st.shared.f32 [%rd42+1536], %f89; - ld.shared.f32 %f100, [%rd48+2048]; - add.ftz.f32 %f94, %f100, %f94; - st.shared.f32 [%rd42+2048], %f94; - ld.shared.f32 %f101, [%rd48+2560]; - add.ftz.f32 %f95, %f101, %f95; - st.shared.f32 [%rd42+2560], %f95; -$Lt_0_28162: - shr.u32 %r63, %r63, 1; - mov.u32 %r65, 0; - setp.ne.u32 %p16, %r63, %r65; - @%p16 bra $Lt_0_27906; -$Lt_0_27394: - mov.f32 %f6, %f86; - mov.f32 %f8, %f87; - mov.f32 %f10, %f88; - mov.f32 %f12, %f89; - mov.f32 %f14, %f94; - mov.f32 %f16, %f95; -$Lt_0_26882: -$Lt_0_24834: - mov.u32 %r66, 0; - setp.ne.s32 %p17, %r13, %r66; - @%p17 bra $Lt_0_28930; - ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv]; - add.u64 %rd50, %rd49, %rd5; - ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r68, 0; - setp.le.s32 %p18, %r67, %r68; - @%p18 bra $Lt_0_29442; - st.global.f32 [%rd50+0], %f28; - cvt.s64.s32 %rd51, %r9; - mul.wide.s32 %rd52, %r9, 4; - add.u64 %rd50, %rd50, %rd52; -$Lt_0_29442: - ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r70, 0; - setp.le.s32 %p19, %r69, %r70; - @%p19 bra $Lt_0_29954; - mov.f32 %f102, %f6; - st.global.f32 [%rd50+0], %f102; - cvt.s64.s32 %rd53, %r9; - mul.wide.s32 %rd54, %r9, 4; - add.u64 %rd55, %rd54, %rd50; - mov.f32 %f103, %f8; - st.global.f32 [%rd55+0], %f103; - add.u64 %rd56, %rd54, %rd55; - mov.f32 %f104, %f10; - st.global.f32 [%rd56+0], %f104; - add.u64 %rd57, %rd54, %rd56; - mov.f32 %f105, %f12; - st.global.f32 [%rd57+0], %f105; - add.u64 %rd50, %rd54, %rd57; - mov.f32 %f106, %f14; - st.global.f32 [%rd50+0], %f106; - mov.f32 %f107, %f16; - add.u64 %rd58, %rd54, %rd50; - st.global.f32 [%rd58+0], %f107; -$Lt_0_29954: - ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans]; - mul.lo.u64 %rd60, %rd4, 16; - add.u64 %rd61, %rd59, %rd60; - mov.f32 %f108, %f109; - st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f108}; -$Lt_0_28930: -$Lt_0_20226: - .loc 16 110 0 - exit; -$LDWend_kernel_pair: - } // kernel_pair - - .entry kernel_pair_fast ( - .param .u64 __cudaparm_kernel_pair_fast_x_, - .param .u64 __cudaparm_kernel_pair_fast_lj1_in, - .param .u64 __cudaparm_kernel_pair_fast_lj3_in, - .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, - .param .u64 __cudaparm_kernel_pair_fast_dev_packed, - .param .u64 __cudaparm_kernel_pair_fast_ans, - .param .u64 __cudaparm_kernel_pair_fast___val_paramengv, - .param .s32 __cudaparm_kernel_pair_fast_eflag, - .param .s32 __cudaparm_kernel_pair_fast_vflag, - .param .s32 __cudaparm_kernel_pair_fast_inum, - .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, - .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) - { - .reg .u32 %r<74>; - .reg .u64 %rd<75>; - .reg .f32 %f<118>; - .reg .pred %p<24>; - .shared .align 4 .b8 __cuda___cuda_local_var_32625_33_non_const_sp_lj3268[16]; - .shared .align 16 .b8 __cuda___cuda_local_var_32623_34_non_const_lj13296[1936]; - .shared .align 16 .b8 __cuda___cuda_local_var_32624_34_non_const_lj35232[1936]; - .shared .align 4 .b8 __cuda___cuda_local_var_32702_55_non_const_red_acc7168[3072]; - // __cuda_local_var_32635_10_non_const_f = 48 - // __cuda_local_var_32637_9_non_const_virial = 16 - .loc 16 118 0 -$LDWbegin_kernel_pair_fast: - cvt.s32.u32 %r1, %tid.x; - mov.u32 %r2, 3; - setp.gt.s32 %p1, %r1, %r2; - @%p1 bra $Lt_1_22530; - .loc 16 126 0 - mov.u64 %rd1, __cuda___cuda_local_var_32625_33_non_const_sp_lj3268; - cvt.s64.s32 %rd2, %r1; - mul.wide.s32 %rd3, %r1, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f32 %f1, [%rd5+0]; - add.u64 %rd6, %rd3, %rd1; - st.shared.f32 [%rd6+0], %f1; -$Lt_1_22530: - mov.u64 %rd1, __cuda___cuda_local_var_32625_33_non_const_sp_lj3268; - mov.u32 %r3, 120; - setp.gt.s32 %p2, %r1, %r3; - @%p2 bra $Lt_1_23042; - .loc 16 128 0 - mov.u64 %rd7, __cuda___cuda_local_var_32623_34_non_const_lj13296; - cvt.s64.s32 %rd8, %r1; - mul.wide.s32 %rd9, %r1, 16; - ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in]; - add.u64 %rd11, %rd10, %rd9; - add.u64 %rd12, %rd9, %rd7; - ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0]; - st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5}; - ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r5, 0; - setp.le.s32 %p3, %r4, %r5; - @%p3 bra $Lt_1_23554; - .loc 16 130 0 - mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232; - ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in]; - add.u64 %rd15, %rd14, %rd9; - add.u64 %rd16, %rd9, %rd13; - ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0]; - st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9}; -$Lt_1_23554: - mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232; -$Lt_1_23042: - mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232; - mov.u64 %rd7, __cuda___cuda_local_var_32623_34_non_const_lj13296; - .loc 16 138 0 - mov.f32 %f10, 0f00000000; // 0 - mov.f32 %f11, %f10; - mov.f32 %f12, 0f00000000; // 0 - mov.f32 %f13, %f12; - mov.f32 %f14, 0f00000000; // 0 - mov.f32 %f15, %f14; - mov.f32 %f16, 0f00000000; // 0 - mov.f32 %f17, %f16; - mov.f32 %f18, 0f00000000; // 0 - mov.f32 %f19, %f18; - mov.f32 %f20, 0f00000000; // 0 - mov.f32 %f21, %f20; - .loc 16 140 0 - bar.sync 0; - ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom]; - div.s32 %r7, %r1, %r6; - cvt.s32.u32 %r8, %ntid.x; - div.s32 %r9, %r8, %r6; - cvt.s32.u32 %r10, %ctaid.x; - mul.lo.s32 %r11, %r10, %r9; - add.s32 %r12, %r7, %r11; - ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum]; - setp.ge.s32 %p4, %r12, %r13; - @%p4 bra $Lt_1_32770; - .loc 16 145 0 - ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch]; - cvt.s64.s32 %rd17, %r14; - mul.wide.s32 %rd18, %r14, 4; - cvt.s64.s32 %rd19, %r12; - mul.wide.s32 %rd20, %r12, 4; - ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor]; - add.u64 %rd22, %rd20, %rd21; - add.u64 %rd23, %rd18, %rd22; - ld.global.s32 %r15, [%rd23+0]; - sub.s32 %r16, %r6, 1; - and.b32 %r17, %r16, %r1; - cvt.s64.s32 %rd24, %r17; - mul.wide.s32 %rd25, %r17, 4; - ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed]; - setp.ne.u64 %p5, %rd26, %rd21; - @%p5 bra $Lt_1_24834; - cvt.s32.s64 %r18, %rd17; - mul.lo.s32 %r19, %r18, %r6; - mov.s32 %r20, %r19; - mul.lo.s32 %r21, %r16, %r12; - add.s32 %r22, %r18, %r21; - cvt.s64.s32 %rd27, %r22; - mul.wide.s32 %rd28, %r22, 4; - add.u64 %rd29, %rd23, %rd28; - and.b32 %r23, %r16, %r15; - cvt.s64.s32 %rd30, %r23; - div.s32 %r24, %r15, %r6; - mul.lo.s32 %r25, %r19, %r24; - cvt.s64.s32 %rd31, %r25; - add.u64 %rd32, %rd30, %rd31; - mul.lo.u64 %rd33, %rd32, 4; - add.u64 %rd34, %rd29, %rd33; - add.u64 %rd35, %rd25, %rd29; - bra.uni $Lt_1_24578; -$Lt_1_24834: - add.u64 %rd36, %rd18, %rd23; - ld.global.s32 %r26, [%rd36+0]; - cvt.s64.s32 %rd37, %r26; - mul.wide.s32 %rd38, %r26, 4; - add.u64 %rd39, %rd26, %rd38; - cvt.s64.s32 %rd40, %r15; - mul.wide.s32 %rd41, %r15, 4; - add.u64 %rd34, %rd39, %rd41; - mov.s32 %r20, %r6; - add.u64 %rd35, %rd25, %rd39; -$Lt_1_24578: - .loc 16 148 0 - ld.global.s32 %r27, [%rd22+0]; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - mov.s32 %r31, 0; - mov.u32 %r32, %r31; - mov.s32 %r33, 0; - mov.u32 %r34, %r33; - tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}]; - mov.f32 %f26, %f22; - mov.f32 %f27, %f23; - mov.f32 %f28, %f24; - mov.f32 %f29, %f25; - setp.ge.u64 %p6, %rd35, %rd34; - @%p6 bra $Lt_1_34306; - cvt.rzi.ftz.s32.f32 %r35, %f29; - cvt.s64.s32 %rd42, %r20; - mul.lo.s32 %r36, %r35, 11; - cvt.rn.f32.s32 %f30, %r36; - mov.f32 %f31, 0f00000000; // 0 - mov.f32 %f32, 0f00000000; // 0 - mov.f32 %f33, 0f00000000; // 0 - mov.f32 %f34, 0f00000000; // 0 -$Lt_1_25602: - // Loop body line 148, nesting depth: 1, estimated iterations: unknown - .loc 16 155 0 - ld.global.s32 %r37, [%rd35+0]; - .loc 16 156 0 - shr.s32 %r38, %r37, 30; - and.b32 %r39, %r38, 3; - cvt.s64.s32 %rd43, %r39; - mul.wide.s32 %rd44, %r39, 4; - add.u64 %rd45, %rd1, %rd44; - ld.shared.f32 %f35, [%rd45+0]; - .loc 16 159 0 - and.b32 %r40, %r37, 1073741823; - mov.u32 %r41, %r40; - mov.s32 %r42, 0; - mov.u32 %r43, %r42; - mov.s32 %r44, 0; - mov.u32 %r45, %r44; - mov.s32 %r46, 0; - mov.u32 %r47, %r46; - tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}]; - mov.f32 %f40, %f36; - mov.f32 %f41, %f37; - mov.f32 %f42, %f38; - mov.f32 %f43, %f39; - sub.ftz.f32 %f44, %f27, %f41; - sub.ftz.f32 %f45, %f26, %f40; - sub.ftz.f32 %f46, %f28, %f42; - mul.ftz.f32 %f47, %f44, %f44; - fma.rn.ftz.f32 %f48, %f45, %f45, %f47; - fma.rn.ftz.f32 %f49, %f46, %f46, %f48; - add.ftz.f32 %f50, %f30, %f43; - cvt.rzi.ftz.s32.f32 %r48, %f50; - cvt.s64.s32 %rd46, %r48; - mul.wide.s32 %rd47, %r48, 16; - add.u64 %rd48, %rd47, %rd7; - ld.shared.f32 %f51, [%rd48+0]; - setp.gt.ftz.f32 %p7, %f51, %f49; - @!%p7 bra $Lt_1_27906; - rcp.approx.ftz.f32 %f52, %f49; - ld.shared.f32 %f53, [%rd48+4]; - mov.f32 %f54, 0f40000000; // 2 - setp.eq.ftz.f32 %p8, %f53, %f54; - @!%p8 bra $Lt_1_26626; - .loc 16 173 0 - mul.ftz.f32 %f55, %f52, %f52; - mov.f32 %f56, %f55; - .loc 16 174 0 - mul.ftz.f32 %f57, %f55, %f55; - bra.uni $Lt_1_26882; -$Lt_1_26626: - mov.f32 %f58, 0f3f800000; // 1 - setp.eq.ftz.f32 %p9, %f53, %f58; - @!%p9 bra $Lt_1_27138; - .loc 16 176 0 - sqrt.approx.ftz.f32 %f59, %f52; - mul.ftz.f32 %f60, %f52, %f59; - mov.f32 %f57, %f60; - .loc 16 177 0 - mul.ftz.f32 %f56, %f60, %f60; - bra.uni $Lt_1_26882; -$Lt_1_27138: - .loc 16 179 0 - mul.ftz.f32 %f61, %f52, %f52; - mul.ftz.f32 %f62, %f52, %f61; - mov.f32 %f56, %f62; - .loc 16 180 0 - mov.f32 %f57, %f62; -$Lt_1_26882: -$Lt_1_26370: - .loc 16 182 0 - mul.ftz.f32 %f63, %f52, %f35; - mul.ftz.f32 %f64, %f56, %f63; - ld.shared.v2.f32 {%f65,%f66}, [%rd48+8]; - mul.ftz.f32 %f67, %f65, %f57; - sub.ftz.f32 %f68, %f67, %f66; - mul.ftz.f32 %f69, %f64, %f68; - .loc 16 184 0 - fma.rn.ftz.f32 %f33, %f45, %f69, %f33; - .loc 16 185 0 - fma.rn.ftz.f32 %f32, %f44, %f69, %f32; - .loc 16 186 0 - fma.rn.ftz.f32 %f31, %f46, %f69, %f31; - ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r50, 0; - setp.le.s32 %p10, %r49, %r50; - @%p10 bra $Lt_1_27394; - .loc 16 188 0 - add.u64 %rd49, %rd47, %rd13; - ld.shared.v4.f32 {%f70,%f71,%f72,_}, [%rd49+0]; - mul.ftz.f32 %f73, %f35, %f56; - mul.ftz.f32 %f74, %f70, %f57; - sub.ftz.f32 %f75, %f74, %f71; - mul.ftz.f32 %f76, %f73, %f75; - sub.ftz.f32 %f77, %f76, %f72; - add.ftz.f32 %f34, %f34, %f77; -$Lt_1_27394: - ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r52, 0; - setp.le.s32 %p11, %r51, %r52; - @%p11 bra $Lt_1_27906; - .loc 16 191 0 - mov.f32 %f78, %f11; - mul.ftz.f32 %f79, %f45, %f45; - fma.rn.ftz.f32 %f80, %f69, %f79, %f78; - mov.f32 %f11, %f80; - .loc 16 192 0 - mov.f32 %f81, %f13; - fma.rn.ftz.f32 %f82, %f69, %f47, %f81; - mov.f32 %f13, %f82; - .loc 16 193 0 - mov.f32 %f83, %f15; - mul.ftz.f32 %f84, %f46, %f46; - fma.rn.ftz.f32 %f85, %f69, %f84, %f83; - mov.f32 %f15, %f85; - .loc 16 194 0 - mov.f32 %f86, %f17; - mul.ftz.f32 %f87, %f44, %f45; - fma.rn.ftz.f32 %f88, %f69, %f87, %f86; - mov.f32 %f17, %f88; - .loc 16 195 0 - mov.f32 %f89, %f19; - mul.ftz.f32 %f90, %f45, %f46; - fma.rn.ftz.f32 %f91, %f69, %f90, %f89; - mov.f32 %f19, %f91; - .loc 16 196 0 - mul.ftz.f32 %f92, %f44, %f46; - fma.rn.ftz.f32 %f20, %f69, %f92, %f20; - mov.f32 %f21, %f20; -$Lt_1_27906: -$Lt_1_25858: - .loc 16 153 0 - mul.lo.u64 %rd50, %rd42, 4; - add.u64 %rd35, %rd35, %rd50; - setp.lt.u64 %p12, %rd35, %rd34; - @%p12 bra $Lt_1_25602; - bra.uni $Lt_1_25090; -$Lt_1_34306: - mov.f32 %f31, 0f00000000; // 0 - mov.f32 %f32, 0f00000000; // 0 - mov.f32 %f33, 0f00000000; // 0 - mov.f32 %f34, 0f00000000; // 0 -$Lt_1_25090: - mov.u32 %r53, 1; - setp.le.s32 %p13, %r6, %r53; - @%p13 bra $Lt_1_30722; - .loc 16 201 0 - mov.u64 %rd51, __cuda___cuda_local_var_32702_55_non_const_red_acc7168; - cvt.s64.s32 %rd52, %r1; - mul.wide.s32 %rd53, %r1, 4; - add.u64 %rd54, %rd51, %rd53; - mov.f32 %f93, %f33; - st.shared.f32 [%rd54+0], %f93; - mov.f32 %f94, %f32; - st.shared.f32 [%rd54+512], %f94; - mov.f32 %f95, %f31; - st.shared.f32 [%rd54+1024], %f95; - mov.f32 %f96, %f34; - st.shared.f32 [%rd54+1536], %f96; - shr.s32 %r54, %r6, 31; - mov.s32 %r55, 1; - and.b32 %r56, %r54, %r55; - add.s32 %r57, %r56, %r6; - shr.s32 %r58, %r57, 1; - mov.s32 %r59, %r58; - mov.u32 %r60, 0; - setp.ne.u32 %p14, %r58, %r60; - @!%p14 bra $Lt_1_29186; -$Lt_1_29698: - setp.ge.u32 %p15, %r17, %r59; - @%p15 bra $Lt_1_29954; - add.u32 %r61, %r1, %r59; - cvt.u64.u32 %rd55, %r61; - mul.wide.u32 %rd56, %r61, 4; - add.u64 %rd57, %rd51, %rd56; - ld.shared.f32 %f97, [%rd57+0]; - add.ftz.f32 %f93, %f97, %f93; - st.shared.f32 [%rd54+0], %f93; - ld.shared.f32 %f98, [%rd57+512]; - add.ftz.f32 %f94, %f98, %f94; - st.shared.f32 [%rd54+512], %f94; - ld.shared.f32 %f99, [%rd57+1024]; - add.ftz.f32 %f95, %f99, %f95; - st.shared.f32 [%rd54+1024], %f95; - ld.shared.f32 %f100, [%rd57+1536]; - add.ftz.f32 %f96, %f100, %f96; - st.shared.f32 [%rd54+1536], %f96; -$Lt_1_29954: - shr.u32 %r59, %r59, 1; - mov.u32 %r62, 0; - setp.ne.u32 %p16, %r59, %r62; - @%p16 bra $Lt_1_29698; -$Lt_1_29186: - mov.f32 %f33, %f93; - mov.f32 %f32, %f94; - mov.f32 %f31, %f95; - mov.f32 %f34, %f96; - ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r64, 0; - setp.le.s32 %p17, %r63, %r64; - @%p17 bra $Lt_1_30722; - mov.f32 %f93, %f11; - st.shared.f32 [%rd54+0], %f93; - mov.f32 %f94, %f13; - st.shared.f32 [%rd54+512], %f94; - mov.f32 %f95, %f15; - st.shared.f32 [%rd54+1024], %f95; - mov.f32 %f96, %f17; - st.shared.f32 [%rd54+1536], %f96; - mov.f32 %f101, %f19; - st.shared.f32 [%rd54+2048], %f101; - mov.f32 %f102, %f20; - st.shared.f32 [%rd54+2560], %f102; - mov.s32 %r65, %r58; - @!%p14 bra $Lt_1_31234; -$Lt_1_31746: - setp.ge.u32 %p18, %r17, %r65; - @%p18 bra $Lt_1_32002; - add.u32 %r66, %r1, %r65; - cvt.u64.u32 %rd58, %r66; - mul.wide.u32 %rd59, %r66, 4; - add.u64 %rd60, %rd51, %rd59; - ld.shared.f32 %f103, [%rd60+0]; - add.ftz.f32 %f93, %f103, %f93; - st.shared.f32 [%rd54+0], %f93; - ld.shared.f32 %f104, [%rd60+512]; - add.ftz.f32 %f94, %f104, %f94; - st.shared.f32 [%rd54+512], %f94; - ld.shared.f32 %f105, [%rd60+1024]; - add.ftz.f32 %f95, %f105, %f95; - st.shared.f32 [%rd54+1024], %f95; - ld.shared.f32 %f106, [%rd60+1536]; - add.ftz.f32 %f96, %f106, %f96; - st.shared.f32 [%rd54+1536], %f96; - ld.shared.f32 %f107, [%rd60+2048]; - add.ftz.f32 %f101, %f107, %f101; - st.shared.f32 [%rd54+2048], %f101; - ld.shared.f32 %f108, [%rd60+2560]; - add.ftz.f32 %f102, %f108, %f102; - st.shared.f32 [%rd54+2560], %f102; -$Lt_1_32002: - shr.u32 %r65, %r65, 1; - mov.u32 %r67, 0; - setp.ne.u32 %p19, %r65, %r67; - @%p19 bra $Lt_1_31746; -$Lt_1_31234: - mov.f32 %f11, %f93; - mov.f32 %f13, %f94; - mov.f32 %f15, %f95; - mov.f32 %f17, %f96; - mov.f32 %f19, %f101; - mov.f32 %f21, %f102; -$Lt_1_30722: -$Lt_1_28674: - mov.u32 %r68, 0; - setp.ne.s32 %p20, %r17, %r68; - @%p20 bra $Lt_1_32770; - ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv]; - add.u64 %rd62, %rd61, %rd20; - ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r70, 0; - setp.le.s32 %p21, %r69, %r70; - @%p21 bra $Lt_1_33282; - st.global.f32 [%rd62+0], %f34; - cvt.s64.s32 %rd63, %r13; - mul.wide.s32 %rd64, %r13, 4; - add.u64 %rd62, %rd62, %rd64; -$Lt_1_33282: - ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r72, 0; - setp.le.s32 %p22, %r71, %r72; - @%p22 bra $Lt_1_33794; - mov.f32 %f109, %f11; - st.global.f32 [%rd62+0], %f109; - cvt.s64.s32 %rd65, %r13; - mul.wide.s32 %rd66, %r13, 4; - add.u64 %rd67, %rd66, %rd62; - mov.f32 %f110, %f13; - st.global.f32 [%rd67+0], %f110; - add.u64 %rd68, %rd66, %rd67; - mov.f32 %f111, %f15; - st.global.f32 [%rd68+0], %f111; - add.u64 %rd69, %rd66, %rd68; - mov.f32 %f112, %f17; - st.global.f32 [%rd69+0], %f112; - add.u64 %rd62, %rd66, %rd69; - mov.f32 %f113, %f19; - st.global.f32 [%rd62+0], %f113; - mov.f32 %f114, %f21; - add.u64 %rd70, %rd66, %rd62; - st.global.f32 [%rd70+0], %f114; -$Lt_1_33794: - ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans]; - mul.lo.u64 %rd72, %rd19, 16; - add.u64 %rd73, %rd71, %rd72; - mov.f32 %f115, %f116; - st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f115}; -$Lt_1_32770: -$Lt_1_24066: - .loc 16 204 0 - exit; -$LDWend_kernel_pair_fast: - } // kernel_pair_fast - diff --git a/lib/gpu/cg_cmm_long.ptx b/lib/gpu/cg_cmm_long.ptx deleted file mode 100644 index 00f627aca7..0000000000 --- a/lib/gpu/cg_cmm_long.ptx +++ /dev/null @@ -1,1189 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009ef5_00000000-9_lal_cg_cmm_long.cpp3.i (/home/sjplimp/ccBI#.OMEXPd) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009ef5_00000000-8_lal_cg_cmm_long.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_cg_cmm_long.cu" - .file 17 "/usr/local/cuda/include/common_functions.h" - .file 18 "/usr/local/cuda/include/math_functions.h" - .file 19 "/usr/local/cuda/include/math_constants.h" - .file 20 "/usr/local/cuda/include/device_functions.h" - .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - .global .texref pos_tex; - .global .texref q_tex; - - .entry kernel_pair ( - .param .u64 __cudaparm_kernel_pair_x_, - .param .u64 __cudaparm_kernel_pair_lj1, - .param .u64 __cudaparm_kernel_pair_lj3, - .param .s32 __cudaparm_kernel_pair_lj_types, - .param .u64 __cudaparm_kernel_pair_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_dev_nbor, - .param .u64 __cudaparm_kernel_pair_dev_packed, - .param .u64 __cudaparm_kernel_pair_ans, - .param .u64 __cudaparm_kernel_pair___val_paramengv, - .param .s32 __cudaparm_kernel_pair_eflag, - .param .s32 __cudaparm_kernel_pair_vflag, - .param .s32 __cudaparm_kernel_pair_inum, - .param .s32 __cudaparm_kernel_pair_nbor_pitch, - .param .u64 __cudaparm_kernel_pair_q_, - .param .f32 __cudaparm_kernel_pair_cut_coulsq, - .param .f32 __cudaparm_kernel_pair_qqrd2e, - .param .f32 __cudaparm_kernel_pair_g_ewald, - .param .s32 __cudaparm_kernel_pair_t_per_atom) - { - .reg .u32 %r<86>; - .reg .u64 %rd<66>; - .reg .f32 %f<175>; - .reg .pred %p<23>; - .shared .align 16 .b8 __cuda___cuda_local_var_32541_33_non_const_sp_lj112[32]; - .shared .align 4 .b8 __cuda___cuda_local_var_32642_55_non_const_red_acc144[3072]; - // __cuda_local_var_32553_10_non_const_f = 64 - // __cuda_local_var_32555_9_non_const_virial = 16 - // __cuda_local_var_32588_43_non_const_inv1 = 40 - // __cuda_local_var_32588_49_non_const_inv2 = 44 - // __cuda_local_var_32588_55_non_const_prefactor = 52 - // __cuda_local_var_32588_66_non_const__erfc = 48 - .loc 16 36 0 -$LDWbegin_kernel_pair: - .loc 16 41 0 - ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; - ldu.global.f32 %f1, [%rd1+0]; - .loc 16 42 0 - ld.global.f32 %f2, [%rd1+4]; - .loc 16 43 0 - ld.global.f32 %f3, [%rd1+8]; - .loc 16 44 0 - ld.global.f32 %f4, [%rd1+12]; - st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_lj112+0], {%f1,%f2,%f3,%f4}; - .loc 16 45 0 - ld.global.f32 %f5, [%rd1+16]; - .loc 16 46 0 - ld.global.f32 %f6, [%rd1+20]; - .loc 16 47 0 - ld.global.f32 %f7, [%rd1+24]; - .loc 16 48 0 - ld.global.f32 %f8, [%rd1+28]; - st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_lj112+16], {%f5,%f6,%f7,%f8}; - .loc 16 56 0 - mov.f32 %f9, 0f00000000; // 0 - mov.f32 %f10, %f9; - mov.f32 %f11, 0f00000000; // 0 - mov.f32 %f12, %f11; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, %f13; - mov.f32 %f15, 0f00000000; // 0 - mov.f32 %f16, %f15; - mov.f32 %f17, 0f00000000; // 0 - mov.f32 %f18, %f17; - mov.f32 %f19, 0f00000000; // 0 - mov.f32 %f20, %f19; - ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_pair_inum]; - setp.ge.s32 %p1, %r8, %r9; - @%p1 bra $Lt_0_33538; - .loc 16 61 0 - cvt.s64.s32 %rd2, %r8; - mul.wide.s32 %rd3, %r8, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor]; - add.u64 %rd5, %rd3, %rd4; - ld.global.s32 %r10, [%rd5+0]; - ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch]; - cvt.s64.s32 %rd6, %r11; - mul.wide.s32 %rd7, %r11, 4; - add.u64 %rd8, %rd7, %rd5; - ld.global.s32 %r12, [%rd8+0]; - sub.s32 %r13, %r1, 1; - and.b32 %r14, %r13, %r2; - cvt.s64.s32 %rd9, %r14; - mul.wide.s32 %rd10, %r14, 4; - ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed]; - setp.ne.u64 %p2, %rd11, %rd4; - @%p2 bra $Lt_0_24066; - cvt.s32.s64 %r15, %rd6; - mul.lo.s32 %r16, %r15, %r1; - mov.s32 %r17, %r16; - mul.lo.s32 %r18, %r13, %r8; - add.s32 %r19, %r15, %r18; - cvt.s64.s32 %rd12, %r19; - mul.wide.s32 %rd13, %r19, 4; - add.u64 %rd14, %rd8, %rd13; - and.b32 %r20, %r13, %r12; - cvt.s64.s32 %rd15, %r20; - div.s32 %r21, %r12, %r1; - mul.lo.s32 %r22, %r16, %r21; - cvt.s64.s32 %rd16, %r22; - add.u64 %rd17, %rd15, %rd16; - mul.lo.u64 %rd18, %rd17, 4; - add.u64 %rd19, %rd14, %rd18; - add.u64 %rd20, %rd10, %rd14; - bra.uni $Lt_0_23810; -$Lt_0_24066: - add.u64 %rd21, %rd7, %rd8; - ld.global.s32 %r23, [%rd21+0]; - cvt.s64.s32 %rd22, %r23; - mul.wide.s32 %rd23, %r23, 4; - add.u64 %rd24, %rd11, %rd23; - cvt.s64.s32 %rd25, %r12; - mul.wide.s32 %rd26, %r12, 4; - add.u64 %rd19, %rd24, %rd26; - mov.s32 %r17, %r1; - add.u64 %rd20, %rd10, %rd24; -$Lt_0_23810: - .loc 16 64 0 - mov.u32 %r24, %r10; - mov.s32 %r25, 0; - mov.u32 %r26, %r25; - mov.s32 %r27, 0; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r24,%r26,%r28,%r30}]; - mov.f32 %f25, %f21; - mov.f32 %f26, %f22; - mov.f32 %f27, %f23; - mov.f32 %f28, %f24; - .loc 16 65 0 - mov.u32 %r31, %r10; - mov.s32 %r32, 0; - mov.u32 %r33, %r32; - mov.s32 %r34, 0; - mov.u32 %r35, %r34; - mov.s32 %r36, 0; - mov.u32 %r37, %r36; - tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r31,%r33,%r35,%r37}]; - mov.f32 %f33, %f29; - setp.ge.u64 %p3, %rd20, %rd19; - @%p3 bra $Lt_0_35074; - cvt.rzi.ftz.s32.f32 %r38, %f28; - cvt.s64.s32 %rd27, %r17; - ld.param.s32 %r39, [__cudaparm_kernel_pair_lj_types]; - mul.lo.s32 %r40, %r39, %r38; - ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1]; - mov.f32 %f34, 0f00000000; // 0 - mov.f32 %f35, 0f00000000; // 0 - mov.f32 %f36, 0f00000000; // 0 - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 - mov.u64 %rd29, __cuda___cuda_local_var_32541_33_non_const_sp_lj112; -$Lt_0_24834: - // Loop body line 65, nesting depth: 1, estimated iterations: unknown - .loc 16 69 0 - ld.global.s32 %r41, [%rd20+0]; - .loc 16 72 0 - shr.s32 %r42, %r41, 30; - and.b32 %r43, %r42, 3; - cvt.s64.s32 %rd30, %r43; - mul.wide.s32 %rd31, %r43, 4; - add.u64 %rd32, %rd29, %rd31; - ld.shared.f32 %f39, [%rd32+0]; - .loc 16 73 0 - mov.f32 %f40, 0f3f800000; // 1 - ld.shared.f32 %f41, [%rd32+16]; - sub.ftz.f32 %f42, %f40, %f41; - .loc 16 76 0 - and.b32 %r44, %r41, 1073741823; - mov.u32 %r45, %r44; - mov.s32 %r46, 0; - mov.u32 %r47, %r46; - mov.s32 %r48, 0; - mov.u32 %r49, %r48; - mov.s32 %r50, 0; - mov.u32 %r51, %r50; - tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r45,%r47,%r49,%r51}]; - mov.f32 %f47, %f43; - mov.f32 %f48, %f44; - mov.f32 %f49, %f45; - mov.f32 %f50, %f46; - cvt.rzi.ftz.s32.f32 %r52, %f50; - sub.ftz.f32 %f51, %f26, %f48; - sub.ftz.f32 %f52, %f25, %f47; - sub.ftz.f32 %f53, %f27, %f49; - mul.ftz.f32 %f54, %f51, %f51; - fma.rn.ftz.f32 %f55, %f52, %f52, %f54; - fma.rn.ftz.f32 %f56, %f53, %f53, %f55; - add.s32 %r53, %r52, %r40; - cvt.s64.s32 %rd33, %r53; - mul.wide.s32 %rd34, %r53, 16; - add.u64 %rd35, %rd34, %rd28; - ld.global.f32 %f57, [%rd35+0]; - setp.gt.ftz.f32 %p4, %f57, %f56; - @!%p4 bra $Lt_0_28674; - rcp.approx.ftz.f32 %f58, %f56; - ld.global.f32 %f59, [%rd35+4]; - setp.lt.ftz.f32 %p5, %f56, %f59; - @!%p5 bra $Lt_0_25858; - ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3]; - add.u64 %rd37, %rd36, %rd34; - ld.global.f32 %f60, [%rd37+0]; - mov.f32 %f61, 0f40000000; // 2 - setp.eq.ftz.f32 %p6, %f60, %f61; - @!%p6 bra $Lt_0_26370; - .loc 16 92 0 - mul.ftz.f32 %f62, %f58, %f58; - mov.f32 %f63, %f62; - mov.f32 %f64, %f63; - .loc 16 93 0 - mul.ftz.f32 %f65, %f62, %f62; - mov.f32 %f66, %f65; - bra.uni $Lt_0_26626; -$Lt_0_26370: - mov.f32 %f67, 0f3f800000; // 1 - setp.eq.ftz.f32 %p7, %f60, %f67; - @!%p7 bra $Lt_0_26882; - .loc 16 95 0 - rsqrt.approx.ftz.f32 %f68, %f56; - mul.ftz.f32 %f69, %f58, %f68; - mov.f32 %f65, %f69; - mov.f32 %f66, %f65; - .loc 16 96 0 - mul.ftz.f32 %f63, %f69, %f69; - mov.f32 %f64, %f63; - bra.uni $Lt_0_26626; -$Lt_0_26882: - .loc 16 98 0 - mul.ftz.f32 %f70, %f58, %f58; - mul.ftz.f32 %f71, %f58, %f70; - mov.f32 %f63, %f71; - mov.f32 %f64, %f63; - .loc 16 99 0 - mov.f32 %f65, %f71; - mov.f32 %f66, %f65; -$Lt_0_26626: -$Lt_0_26114: - .loc 16 101 0 - mul.ftz.f32 %f72, %f39, %f63; - ld.global.v2.f32 {%f73,%f74}, [%rd35+8]; - mul.ftz.f32 %f75, %f73, %f65; - sub.ftz.f32 %f76, %f75, %f74; - mul.ftz.f32 %f77, %f72, %f76; - bra.uni $Lt_0_25602; -$Lt_0_25858: - .loc 16 103 0 - mov.f32 %f77, 0f00000000; // 0 -$Lt_0_25602: - ld.param.f32 %f78, [__cudaparm_kernel_pair_cut_coulsq]; - setp.gt.ftz.f32 %p8, %f78, %f56; - @!%p8 bra $Lt_0_27394; - .loc 20 518 0 - rsqrt.approx.ftz.f32 %f79, %f58; - ld.param.f32 %f80, [__cudaparm_kernel_pair_g_ewald]; - mul.ftz.f32 %f81, %f80, %f79; - mul.ftz.f32 %f82, %f81, %f81; - neg.ftz.f32 %f83, %f82; - mov.f32 %f84, 0f3fb8aa3b; // 1.4427 - mul.ftz.f32 %f85, %f83, %f84; - ex2.approx.ftz.f32 %f86, %f85; - .loc 16 110 0 - mov.f32 %f87, 0f3f800000; // 1 - mov.f32 %f88, 0f3ea7ba05; // 0.327591 - fma.rn.ftz.f32 %f89, %f88, %f81, %f87; - rcp.approx.ftz.f32 %f90, %f89; - mov.f32 %f91, 0f3e827906; // 0.25483 - mov.f32 %f92, 0fbe91a98e; // -0.284497 - mov.f32 %f93, 0f3fb5f0e3; // 1.42141 - mov.f32 %f94, 0fbfba00e3; // -1.45315 - mov.f32 %f95, 0f3f87dc22; // 1.06141 - fma.rn.ftz.f32 %f96, %f95, %f90, %f94; - fma.rn.ftz.f32 %f97, %f90, %f96, %f93; - fma.rn.ftz.f32 %f98, %f90, %f97, %f92; - fma.rn.ftz.f32 %f99, %f90, %f98, %f91; - mul.ftz.f32 %f100, %f90, %f99; - mul.ftz.f32 %f101, %f86, %f100; - mov.f32 %f102, %f101; - .loc 16 111 0 - mov.u32 %r54, %r44; - mov.s32 %r55, 0; - mov.u32 %r56, %r55; - mov.s32 %r57, 0; - mov.u32 %r58, %r57; - mov.s32 %r59, 0; - mov.u32 %r60, %r59; - tex.1d.v4.f32.s32 {%f103,%f104,%f105,%f106},[q_tex,{%r54,%r56,%r58,%r60}]; - mov.f32 %f107, %f103; - ld.param.f32 %f108, [__cudaparm_kernel_pair_qqrd2e]; - mul.ftz.f32 %f109, %f108, %f33; - mul.ftz.f32 %f110, %f109, %f107; - div.approx.ftz.f32 %f111, %f110, %f79; - mov.f32 %f112, %f111; - .loc 16 112 0 - mov.f32 %f113, 0f3f906ebb; // 1.12838 - mul.ftz.f32 %f114, %f81, %f113; - fma.rn.ftz.f32 %f115, %f86, %f114, %f101; - sub.ftz.f32 %f116, %f115, %f42; - mul.ftz.f32 %f117, %f111, %f116; - bra.uni $Lt_0_27138; -$Lt_0_27394: - .loc 16 114 0 - mov.f32 %f117, 0f00000000; // 0 -$Lt_0_27138: - .loc 16 118 0 - add.ftz.f32 %f118, %f117, %f77; - mul.ftz.f32 %f119, %f118, %f58; - fma.rn.ftz.f32 %f36, %f52, %f119, %f36; - .loc 16 119 0 - fma.rn.ftz.f32 %f35, %f51, %f119, %f35; - .loc 16 120 0 - fma.rn.ftz.f32 %f34, %f53, %f119, %f34; - ld.param.s32 %r61, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r62, 0; - setp.le.s32 %p9, %r61, %r62; - @%p9 bra $Lt_0_28162; - .loc 16 123 0 - mov.f32 %f120, %f112; - mov.f32 %f121, %f102; - sub.ftz.f32 %f122, %f121, %f42; - fma.rn.ftz.f32 %f123, %f120, %f122, %f37; - selp.f32 %f37, %f123, %f37, %p8; - @!%p5 bra $Lt_0_28162; - .loc 16 126 0 - ld.param.u64 %rd38, [__cudaparm_kernel_pair_lj3]; - add.u64 %rd39, %rd38, %rd34; - ld.global.v4.f32 {_,%f124,%f125,%f126}, [%rd39+0]; - mov.f32 %f127, %f64; - mul.ftz.f32 %f128, %f127, %f39; - mov.f32 %f129, %f66; - mul.ftz.f32 %f130, %f124, %f129; - sub.ftz.f32 %f131, %f130, %f125; - mul.ftz.f32 %f132, %f128, %f131; - sub.ftz.f32 %f133, %f132, %f126; - add.ftz.f32 %f38, %f38, %f133; -$Lt_0_28162: -$Lt_0_27650: - ld.param.s32 %r63, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r64, 0; - setp.le.s32 %p10, %r63, %r64; - @%p10 bra $Lt_0_28674; - .loc 16 131 0 - mov.f32 %f134, %f10; - mul.ftz.f32 %f135, %f52, %f52; - fma.rn.ftz.f32 %f136, %f119, %f135, %f134; - mov.f32 %f10, %f136; - .loc 16 132 0 - mov.f32 %f137, %f12; - fma.rn.ftz.f32 %f138, %f119, %f54, %f137; - mov.f32 %f12, %f138; - .loc 16 133 0 - mov.f32 %f139, %f14; - mul.ftz.f32 %f140, %f53, %f53; - fma.rn.ftz.f32 %f141, %f119, %f140, %f139; - mov.f32 %f14, %f141; - .loc 16 134 0 - mov.f32 %f142, %f16; - mul.ftz.f32 %f143, %f51, %f52; - fma.rn.ftz.f32 %f144, %f119, %f143, %f142; - mov.f32 %f16, %f144; - .loc 16 135 0 - mov.f32 %f145, %f18; - mul.ftz.f32 %f146, %f52, %f53; - fma.rn.ftz.f32 %f147, %f119, %f146, %f145; - mov.f32 %f18, %f147; - .loc 16 136 0 - mul.ftz.f32 %f148, %f51, %f53; - fma.rn.ftz.f32 %f19, %f119, %f148, %f19; - mov.f32 %f20, %f19; -$Lt_0_28674: -$Lt_0_25090: - .loc 16 68 0 - mul.lo.u64 %rd40, %rd27, 4; - add.u64 %rd20, %rd20, %rd40; - setp.lt.u64 %p11, %rd20, %rd19; - @%p11 bra $Lt_0_24834; - bra.uni $Lt_0_24322; -$Lt_0_35074: - mov.f32 %f34, 0f00000000; // 0 - mov.f32 %f35, 0f00000000; // 0 - mov.f32 %f36, 0f00000000; // 0 - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 -$Lt_0_24322: - mov.u32 %r65, 1; - setp.le.s32 %p12, %r1, %r65; - @%p12 bra $Lt_0_31490; - .loc 16 141 0 - mov.u64 %rd41, __cuda___cuda_local_var_32642_55_non_const_red_acc144; - cvt.s64.s32 %rd42, %r2; - mul.wide.s32 %rd43, %r2, 4; - add.u64 %rd44, %rd41, %rd43; - mov.f32 %f149, %f36; - st.shared.f32 [%rd44+0], %f149; - mov.f32 %f150, %f35; - st.shared.f32 [%rd44+512], %f150; - mov.f32 %f151, %f34; - st.shared.f32 [%rd44+1024], %f151; - mov.f32 %f152, %f38; - st.shared.f32 [%rd44+1536], %f152; - mov.f32 %f153, %f37; - st.shared.f32 [%rd44+2048], %f153; - shr.s32 %r66, %r1, 31; - mov.s32 %r67, 1; - and.b32 %r68, %r66, %r67; - add.s32 %r69, %r68, %r1; - shr.s32 %r70, %r69, 1; - mov.s32 %r71, %r70; - mov.u32 %r72, 0; - setp.ne.u32 %p13, %r70, %r72; - @!%p13 bra $Lt_0_29954; -$Lt_0_30466: - setp.ge.u32 %p14, %r14, %r71; - @%p14 bra $Lt_0_30722; - add.u32 %r73, %r2, %r71; - cvt.u64.u32 %rd45, %r73; - mul.wide.u32 %rd46, %r73, 4; - add.u64 %rd47, %rd41, %rd46; - ld.shared.f32 %f154, [%rd47+0]; - add.ftz.f32 %f149, %f154, %f149; - st.shared.f32 [%rd44+0], %f149; - ld.shared.f32 %f155, [%rd47+512]; - add.ftz.f32 %f150, %f155, %f150; - st.shared.f32 [%rd44+512], %f150; - ld.shared.f32 %f156, [%rd47+1024]; - add.ftz.f32 %f151, %f156, %f151; - st.shared.f32 [%rd44+1024], %f151; - ld.shared.f32 %f157, [%rd47+1536]; - add.ftz.f32 %f152, %f157, %f152; - st.shared.f32 [%rd44+1536], %f152; - ld.shared.f32 %f158, [%rd47+2048]; - add.ftz.f32 %f153, %f158, %f153; - st.shared.f32 [%rd44+2048], %f153; -$Lt_0_30722: - shr.u32 %r71, %r71, 1; - mov.u32 %r74, 0; - setp.ne.u32 %p15, %r71, %r74; - @%p15 bra $Lt_0_30466; -$Lt_0_29954: - mov.f32 %f36, %f149; - mov.f32 %f35, %f150; - mov.f32 %f34, %f151; - mov.f32 %f38, %f152; - mov.f32 %f37, %f153; - ld.param.s32 %r75, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r76, 0; - setp.le.s32 %p16, %r75, %r76; - @%p16 bra $Lt_0_31490; - mov.f32 %f149, %f10; - st.shared.f32 [%rd44+0], %f149; - mov.f32 %f150, %f12; - st.shared.f32 [%rd44+512], %f150; - mov.f32 %f151, %f14; - st.shared.f32 [%rd44+1024], %f151; - mov.f32 %f152, %f16; - st.shared.f32 [%rd44+1536], %f152; - mov.f32 %f153, %f18; - st.shared.f32 [%rd44+2048], %f153; - mov.f32 %f159, %f19; - st.shared.f32 [%rd44+2560], %f159; - mov.s32 %r77, %r70; - @!%p13 bra $Lt_0_32002; -$Lt_0_32514: - setp.ge.u32 %p17, %r14, %r77; - @%p17 bra $Lt_0_32770; - add.u32 %r78, %r2, %r77; - cvt.u64.u32 %rd48, %r78; - mul.wide.u32 %rd49, %r78, 4; - add.u64 %rd50, %rd41, %rd49; - ld.shared.f32 %f160, [%rd50+0]; - add.ftz.f32 %f149, %f160, %f149; - st.shared.f32 [%rd44+0], %f149; - ld.shared.f32 %f161, [%rd50+512]; - add.ftz.f32 %f150, %f161, %f150; - st.shared.f32 [%rd44+512], %f150; - ld.shared.f32 %f162, [%rd50+1024]; - add.ftz.f32 %f151, %f162, %f151; - st.shared.f32 [%rd44+1024], %f151; - ld.shared.f32 %f163, [%rd50+1536]; - add.ftz.f32 %f152, %f163, %f152; - st.shared.f32 [%rd44+1536], %f152; - ld.shared.f32 %f164, [%rd50+2048]; - add.ftz.f32 %f153, %f164, %f153; - st.shared.f32 [%rd44+2048], %f153; - ld.shared.f32 %f165, [%rd50+2560]; - add.ftz.f32 %f159, %f165, %f159; - st.shared.f32 [%rd44+2560], %f159; -$Lt_0_32770: - shr.u32 %r77, %r77, 1; - mov.u32 %r79, 0; - setp.ne.u32 %p18, %r77, %r79; - @%p18 bra $Lt_0_32514; -$Lt_0_32002: - mov.f32 %f10, %f149; - mov.f32 %f12, %f150; - mov.f32 %f14, %f151; - mov.f32 %f16, %f152; - mov.f32 %f18, %f153; - mov.f32 %f20, %f159; -$Lt_0_31490: -$Lt_0_29442: - mov.u32 %r80, 0; - setp.ne.s32 %p19, %r14, %r80; - @%p19 bra $Lt_0_33538; - ld.param.u64 %rd51, [__cudaparm_kernel_pair___val_paramengv]; - add.u64 %rd52, %rd51, %rd3; - ld.param.s32 %r81, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r82, 0; - setp.le.s32 %p20, %r81, %r82; - @%p20 bra $Lt_0_34050; - st.global.f32 [%rd52+0], %f38; - cvt.s64.s32 %rd53, %r9; - mul.wide.s32 %rd54, %r9, 4; - add.u64 %rd55, %rd54, %rd52; - st.global.f32 [%rd55+0], %f37; - add.u64 %rd52, %rd54, %rd55; -$Lt_0_34050: - ld.param.s32 %r83, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r84, 0; - setp.le.s32 %p21, %r83, %r84; - @%p21 bra $Lt_0_34562; - mov.f32 %f166, %f10; - st.global.f32 [%rd52+0], %f166; - cvt.s64.s32 %rd56, %r9; - mul.wide.s32 %rd57, %r9, 4; - add.u64 %rd58, %rd57, %rd52; - mov.f32 %f167, %f12; - st.global.f32 [%rd58+0], %f167; - add.u64 %rd59, %rd57, %rd58; - mov.f32 %f168, %f14; - st.global.f32 [%rd59+0], %f168; - add.u64 %rd60, %rd57, %rd59; - mov.f32 %f169, %f16; - st.global.f32 [%rd60+0], %f169; - add.u64 %rd52, %rd57, %rd60; - mov.f32 %f170, %f18; - st.global.f32 [%rd52+0], %f170; - mov.f32 %f171, %f20; - add.u64 %rd61, %rd57, %rd52; - st.global.f32 [%rd61+0], %f171; -$Lt_0_34562: - ld.param.u64 %rd62, [__cudaparm_kernel_pair_ans]; - mul.lo.u64 %rd63, %rd2, 16; - add.u64 %rd64, %rd62, %rd63; - mov.f32 %f172, %f173; - st.global.v4.f32 [%rd64+0], {%f36,%f35,%f34,%f172}; -$Lt_0_33538: -$Lt_0_23298: - .loc 16 144 0 - exit; -$LDWend_kernel_pair: - } // kernel_pair - - .entry kernel_pair_fast ( - .param .u64 __cudaparm_kernel_pair_fast_x_, - .param .u64 __cudaparm_kernel_pair_fast_lj1_in, - .param .u64 __cudaparm_kernel_pair_fast_lj3_in, - .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, - .param .u64 __cudaparm_kernel_pair_fast_dev_packed, - .param .u64 __cudaparm_kernel_pair_fast_ans, - .param .u64 __cudaparm_kernel_pair_fast___val_paramengv, - .param .s32 __cudaparm_kernel_pair_fast_eflag, - .param .s32 __cudaparm_kernel_pair_fast_vflag, - .param .s32 __cudaparm_kernel_pair_fast_inum, - .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, - .param .u64 __cudaparm_kernel_pair_fast_q_, - .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq, - .param .f32 __cudaparm_kernel_pair_fast_qqrd2e, - .param .f32 __cudaparm_kernel_pair_fast_g_ewald, - .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) - { - .reg .u32 %r<86>; - .reg .u64 %rd<77>; - .reg .f32 %f<178>; - .reg .pred %p<25>; - .shared .align 4 .b8 __cuda___cuda_local_var_32661_33_non_const_sp_lj3320[32]; - .shared .align 16 .b8 __cuda___cuda_local_var_32660_34_non_const_lj33360[1936]; - .shared .align 16 .b8 __cuda___cuda_local_var_32659_34_non_const_lj15296[1936]; - .shared .align 4 .b8 __cuda___cuda_local_var_32762_55_non_const_red_acc7232[3072]; - // __cuda_local_var_32671_10_non_const_f = 64 - // __cuda_local_var_32673_9_non_const_virial = 16 - // __cuda_local_var_32708_43_non_const_inv1 = 40 - // __cuda_local_var_32708_49_non_const_inv2 = 44 - // __cuda_local_var_32708_55_non_const_prefactor = 52 - // __cuda_local_var_32708_66_non_const__erfc = 48 - .loc 16 154 0 -$LDWbegin_kernel_pair_fast: - cvt.s32.u32 %r1, %tid.x; - mov.u32 %r2, 7; - setp.gt.s32 %p1, %r1, %r2; - @%p1 bra $Lt_1_24834; - .loc 16 162 0 - mov.u64 %rd1, __cuda___cuda_local_var_32661_33_non_const_sp_lj3320; - cvt.s64.s32 %rd2, %r1; - mul.wide.s32 %rd3, %r1, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f32 %f1, [%rd5+0]; - add.u64 %rd6, %rd3, %rd1; - st.shared.f32 [%rd6+0], %f1; -$Lt_1_24834: - mov.u64 %rd1, __cuda___cuda_local_var_32661_33_non_const_sp_lj3320; - mov.u32 %r3, 120; - setp.gt.s32 %p2, %r1, %r3; - @%p2 bra $Lt_1_25346; - .loc 16 164 0 - mov.u64 %rd7, __cuda___cuda_local_var_32660_34_non_const_lj33360; - mov.u64 %rd8, __cuda___cuda_local_var_32659_34_non_const_lj15296; - cvt.s64.s32 %rd9, %r1; - mul.wide.s32 %rd10, %r1, 16; - ld.param.u64 %rd11, [__cudaparm_kernel_pair_fast_lj1_in]; - add.u64 %rd12, %rd11, %rd10; - add.u64 %rd13, %rd10, %rd8; - ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd12+0]; - st.shared.v4.f32 [%rd13+0], {%f2,%f3,%f4,%f5}; - .loc 16 165 0 - ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in]; - add.u64 %rd15, %rd14, %rd10; - add.u64 %rd16, %rd10, %rd7; - ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0]; - st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9}; -$Lt_1_25346: - mov.u64 %rd7, __cuda___cuda_local_var_32660_34_non_const_lj33360; - mov.u64 %rd8, __cuda___cuda_local_var_32659_34_non_const_lj15296; - .loc 16 174 0 - mov.f32 %f10, 0f00000000; // 0 - mov.f32 %f11, %f10; - mov.f32 %f12, 0f00000000; // 0 - mov.f32 %f13, %f12; - mov.f32 %f14, 0f00000000; // 0 - mov.f32 %f15, %f14; - mov.f32 %f16, 0f00000000; // 0 - mov.f32 %f17, %f16; - mov.f32 %f18, 0f00000000; // 0 - mov.f32 %f19, %f18; - mov.f32 %f20, 0f00000000; // 0 - mov.f32 %f21, %f20; - .loc 16 176 0 - bar.sync 0; - ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_t_per_atom]; - div.s32 %r5, %r1, %r4; - cvt.s32.u32 %r6, %ntid.x; - div.s32 %r7, %r6, %r4; - cvt.s32.u32 %r8, %ctaid.x; - mul.lo.s32 %r9, %r8, %r7; - add.s32 %r10, %r5, %r9; - ld.param.s32 %r11, [__cudaparm_kernel_pair_fast_inum]; - setp.ge.s32 %p3, %r10, %r11; - @%p3 bra $Lt_1_36098; - .loc 16 181 0 - cvt.s64.s32 %rd17, %r10; - mul.wide.s32 %rd18, %r10, 4; - ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor]; - add.u64 %rd20, %rd18, %rd19; - ld.global.s32 %r12, [%rd20+0]; - ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_nbor_pitch]; - cvt.s64.s32 %rd21, %r13; - mul.wide.s32 %rd22, %r13, 4; - add.u64 %rd23, %rd22, %rd20; - ld.global.s32 %r14, [%rd23+0]; - sub.s32 %r15, %r4, 1; - and.b32 %r16, %r15, %r1; - cvt.s64.s32 %rd24, %r16; - mul.wide.s32 %rd25, %r16, 4; - ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed]; - setp.ne.u64 %p4, %rd26, %rd19; - @%p4 bra $Lt_1_26626; - cvt.s32.s64 %r17, %rd21; - mul.lo.s32 %r18, %r17, %r4; - mov.s32 %r19, %r18; - mul.lo.s32 %r20, %r15, %r10; - add.s32 %r21, %r17, %r20; - cvt.s64.s32 %rd27, %r21; - mul.wide.s32 %rd28, %r21, 4; - add.u64 %rd29, %rd23, %rd28; - and.b32 %r22, %r15, %r14; - cvt.s64.s32 %rd30, %r22; - div.s32 %r23, %r14, %r4; - mul.lo.s32 %r24, %r18, %r23; - cvt.s64.s32 %rd31, %r24; - add.u64 %rd32, %rd30, %rd31; - mul.lo.u64 %rd33, %rd32, 4; - add.u64 %rd34, %rd29, %rd33; - add.u64 %rd35, %rd25, %rd29; - bra.uni $Lt_1_26370; -$Lt_1_26626: - add.u64 %rd36, %rd22, %rd23; - ld.global.s32 %r25, [%rd36+0]; - cvt.s64.s32 %rd37, %r25; - mul.wide.s32 %rd38, %r25, 4; - add.u64 %rd39, %rd26, %rd38; - cvt.s64.s32 %rd40, %r14; - mul.wide.s32 %rd41, %r14, 4; - add.u64 %rd34, %rd39, %rd41; - mov.s32 %r19, %r4; - add.u64 %rd35, %rd25, %rd39; -$Lt_1_26370: - .loc 16 184 0 - mov.u32 %r26, %r12; - mov.s32 %r27, 0; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - mov.s32 %r31, 0; - mov.u32 %r32, %r31; - tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r26,%r28,%r30,%r32}]; - mov.f32 %f26, %f22; - mov.f32 %f27, %f23; - mov.f32 %f28, %f24; - mov.f32 %f29, %f25; - .loc 16 185 0 - mov.u32 %r33, %r12; - mov.s32 %r34, 0; - mov.u32 %r35, %r34; - mov.s32 %r36, 0; - mov.u32 %r37, %r36; - mov.s32 %r38, 0; - mov.u32 %r39, %r38; - tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[q_tex,{%r33,%r35,%r37,%r39}]; - mov.f32 %f34, %f30; - setp.ge.u64 %p5, %rd35, %rd34; - @%p5 bra $Lt_1_37634; - cvt.rzi.ftz.s32.f32 %r40, %f29; - cvt.s64.s32 %rd42, %r19; - mul.lo.s32 %r41, %r40, 11; - cvt.rn.f32.s32 %f35, %r41; - mov.f32 %f36, 0f00000000; // 0 - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 - mov.f32 %f39, 0f00000000; // 0 - mov.f32 %f40, 0f00000000; // 0 -$Lt_1_27394: - // Loop body line 185, nesting depth: 1, estimated iterations: unknown - .loc 16 190 0 - ld.global.s32 %r42, [%rd35+0]; - .loc 16 193 0 - shr.s32 %r43, %r42, 30; - and.b32 %r44, %r43, 3; - cvt.s64.s32 %rd43, %r44; - mul.wide.s32 %rd44, %r44, 4; - add.u64 %rd45, %rd1, %rd44; - ld.shared.f32 %f41, [%rd45+0]; - .loc 16 194 0 - mov.f32 %f42, 0f3f800000; // 1 - ld.shared.f32 %f43, [%rd45+16]; - sub.ftz.f32 %f44, %f42, %f43; - .loc 16 197 0 - and.b32 %r45, %r42, 1073741823; - mov.u32 %r46, %r45; - mov.s32 %r47, 0; - mov.u32 %r48, %r47; - mov.s32 %r49, 0; - mov.u32 %r50, %r49; - mov.s32 %r51, 0; - mov.u32 %r52, %r51; - tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[pos_tex,{%r46,%r48,%r50,%r52}]; - mov.f32 %f49, %f45; - mov.f32 %f50, %f46; - mov.f32 %f51, %f47; - mov.f32 %f52, %f48; - sub.ftz.f32 %f53, %f27, %f50; - sub.ftz.f32 %f54, %f26, %f49; - sub.ftz.f32 %f55, %f28, %f51; - mul.ftz.f32 %f56, %f53, %f53; - fma.rn.ftz.f32 %f57, %f54, %f54, %f56; - fma.rn.ftz.f32 %f58, %f55, %f55, %f57; - add.ftz.f32 %f59, %f35, %f52; - cvt.rzi.ftz.s32.f32 %r53, %f59; - cvt.s64.s32 %rd46, %r53; - mul.wide.s32 %rd47, %r53, 16; - add.u64 %rd48, %rd47, %rd8; - ld.shared.f32 %f60, [%rd48+0]; - setp.gt.ftz.f32 %p6, %f60, %f58; - @!%p6 bra $Lt_1_31234; - rcp.approx.ftz.f32 %f61, %f58; - ld.shared.f32 %f62, [%rd48+4]; - setp.lt.ftz.f32 %p7, %f58, %f62; - @!%p7 bra $Lt_1_28418; - add.u64 %rd49, %rd47, %rd7; - ld.shared.f32 %f63, [%rd49+0]; - mov.f32 %f64, 0f40000000; // 2 - setp.eq.ftz.f32 %p8, %f63, %f64; - @!%p8 bra $Lt_1_28930; - .loc 16 212 0 - mul.ftz.f32 %f65, %f61, %f61; - mov.f32 %f66, %f65; - mov.f32 %f67, %f66; - .loc 16 213 0 - mul.ftz.f32 %f68, %f65, %f65; - mov.f32 %f69, %f68; - bra.uni $Lt_1_29186; -$Lt_1_28930: - mov.f32 %f70, 0f3f800000; // 1 - setp.eq.ftz.f32 %p9, %f63, %f70; - @!%p9 bra $Lt_1_29442; - .loc 16 215 0 - rsqrt.approx.ftz.f32 %f71, %f58; - mul.ftz.f32 %f72, %f61, %f71; - mov.f32 %f68, %f72; - mov.f32 %f69, %f68; - .loc 16 216 0 - mul.ftz.f32 %f66, %f72, %f72; - mov.f32 %f67, %f66; - bra.uni $Lt_1_29186; -$Lt_1_29442: - .loc 16 218 0 - mul.ftz.f32 %f73, %f61, %f61; - mul.ftz.f32 %f74, %f61, %f73; - mov.f32 %f66, %f74; - mov.f32 %f67, %f66; - .loc 16 219 0 - mov.f32 %f68, %f74; - mov.f32 %f69, %f68; -$Lt_1_29186: -$Lt_1_28674: - .loc 16 221 0 - mul.ftz.f32 %f75, %f41, %f66; - ld.shared.v2.f32 {%f76,%f77}, [%rd48+8]; - mul.ftz.f32 %f78, %f76, %f68; - sub.ftz.f32 %f79, %f78, %f77; - mul.ftz.f32 %f80, %f75, %f79; - bra.uni $Lt_1_28162; -$Lt_1_28418: - .loc 16 223 0 - mov.f32 %f80, 0f00000000; // 0 -$Lt_1_28162: - ld.param.f32 %f81, [__cudaparm_kernel_pair_fast_cut_coulsq]; - setp.gt.ftz.f32 %p10, %f81, %f58; - @!%p10 bra $Lt_1_29954; - .loc 20 518 0 - rsqrt.approx.ftz.f32 %f82, %f61; - ld.param.f32 %f83, [__cudaparm_kernel_pair_fast_g_ewald]; - mul.ftz.f32 %f84, %f83, %f82; - mul.ftz.f32 %f85, %f84, %f84; - neg.ftz.f32 %f86, %f85; - mov.f32 %f87, 0f3fb8aa3b; // 1.4427 - mul.ftz.f32 %f88, %f86, %f87; - ex2.approx.ftz.f32 %f89, %f88; - .loc 16 230 0 - mov.f32 %f90, 0f3f800000; // 1 - mov.f32 %f91, 0f3ea7ba05; // 0.327591 - fma.rn.ftz.f32 %f92, %f91, %f84, %f90; - rcp.approx.ftz.f32 %f93, %f92; - mov.f32 %f94, 0f3e827906; // 0.25483 - mov.f32 %f95, 0fbe91a98e; // -0.284497 - mov.f32 %f96, 0f3fb5f0e3; // 1.42141 - mov.f32 %f97, 0fbfba00e3; // -1.45315 - mov.f32 %f98, 0f3f87dc22; // 1.06141 - fma.rn.ftz.f32 %f99, %f98, %f93, %f97; - fma.rn.ftz.f32 %f100, %f93, %f99, %f96; - fma.rn.ftz.f32 %f101, %f93, %f100, %f95; - fma.rn.ftz.f32 %f102, %f93, %f101, %f94; - mul.ftz.f32 %f103, %f93, %f102; - mul.ftz.f32 %f104, %f89, %f103; - mov.f32 %f105, %f104; - .loc 16 231 0 - mov.u32 %r54, %r45; - mov.s32 %r55, 0; - mov.u32 %r56, %r55; - mov.s32 %r57, 0; - mov.u32 %r58, %r57; - mov.s32 %r59, 0; - mov.u32 %r60, %r59; - tex.1d.v4.f32.s32 {%f106,%f107,%f108,%f109},[q_tex,{%r54,%r56,%r58,%r60}]; - mov.f32 %f110, %f106; - ld.param.f32 %f111, [__cudaparm_kernel_pair_fast_qqrd2e]; - mul.ftz.f32 %f112, %f111, %f34; - mul.ftz.f32 %f113, %f112, %f110; - div.approx.ftz.f32 %f114, %f113, %f82; - mov.f32 %f115, %f114; - .loc 16 232 0 - mov.f32 %f116, 0f3f906ebb; // 1.12838 - mul.ftz.f32 %f117, %f84, %f116; - fma.rn.ftz.f32 %f118, %f89, %f117, %f104; - sub.ftz.f32 %f119, %f118, %f44; - mul.ftz.f32 %f120, %f114, %f119; - bra.uni $Lt_1_29698; -$Lt_1_29954: - .loc 16 234 0 - mov.f32 %f120, 0f00000000; // 0 -$Lt_1_29698: - .loc 16 238 0 - add.ftz.f32 %f121, %f120, %f80; - mul.ftz.f32 %f122, %f121, %f61; - fma.rn.ftz.f32 %f38, %f54, %f122, %f38; - .loc 16 239 0 - fma.rn.ftz.f32 %f37, %f53, %f122, %f37; - .loc 16 240 0 - fma.rn.ftz.f32 %f36, %f55, %f122, %f36; - ld.param.s32 %r61, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r62, 0; - setp.le.s32 %p11, %r61, %r62; - @%p11 bra $Lt_1_30722; - .loc 16 243 0 - mov.f32 %f123, %f115; - mov.f32 %f124, %f105; - sub.ftz.f32 %f125, %f124, %f44; - fma.rn.ftz.f32 %f126, %f123, %f125, %f39; - selp.f32 %f39, %f126, %f39, %p10; - @!%p7 bra $Lt_1_30722; - .loc 16 246 0 - add.u64 %rd50, %rd47, %rd7; - ld.shared.v4.f32 {_,%f127,%f128,%f129}, [%rd50+0]; - mov.f32 %f130, %f67; - mul.ftz.f32 %f131, %f130, %f41; - mov.f32 %f132, %f69; - mul.ftz.f32 %f133, %f127, %f132; - sub.ftz.f32 %f134, %f133, %f128; - mul.ftz.f32 %f135, %f131, %f134; - sub.ftz.f32 %f136, %f135, %f129; - add.ftz.f32 %f40, %f40, %f136; -$Lt_1_30722: -$Lt_1_30210: - ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r64, 0; - setp.le.s32 %p12, %r63, %r64; - @%p12 bra $Lt_1_31234; - .loc 16 251 0 - mov.f32 %f137, %f11; - mul.ftz.f32 %f138, %f54, %f54; - fma.rn.ftz.f32 %f139, %f122, %f138, %f137; - mov.f32 %f11, %f139; - .loc 16 252 0 - mov.f32 %f140, %f13; - fma.rn.ftz.f32 %f141, %f122, %f56, %f140; - mov.f32 %f13, %f141; - .loc 16 253 0 - mov.f32 %f142, %f15; - mul.ftz.f32 %f143, %f55, %f55; - fma.rn.ftz.f32 %f144, %f122, %f143, %f142; - mov.f32 %f15, %f144; - .loc 16 254 0 - mov.f32 %f145, %f17; - mul.ftz.f32 %f146, %f53, %f54; - fma.rn.ftz.f32 %f147, %f122, %f146, %f145; - mov.f32 %f17, %f147; - .loc 16 255 0 - mov.f32 %f148, %f19; - mul.ftz.f32 %f149, %f54, %f55; - fma.rn.ftz.f32 %f150, %f122, %f149, %f148; - mov.f32 %f19, %f150; - .loc 16 256 0 - mul.ftz.f32 %f151, %f53, %f55; - fma.rn.ftz.f32 %f20, %f122, %f151, %f20; - mov.f32 %f21, %f20; -$Lt_1_31234: -$Lt_1_27650: - .loc 16 189 0 - mul.lo.u64 %rd51, %rd42, 4; - add.u64 %rd35, %rd35, %rd51; - setp.lt.u64 %p13, %rd35, %rd34; - @%p13 bra $Lt_1_27394; - bra.uni $Lt_1_26882; -$Lt_1_37634: - mov.f32 %f36, 0f00000000; // 0 - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 - mov.f32 %f39, 0f00000000; // 0 - mov.f32 %f40, 0f00000000; // 0 -$Lt_1_26882: - mov.u32 %r65, 1; - setp.le.s32 %p14, %r4, %r65; - @%p14 bra $Lt_1_34050; - .loc 16 261 0 - mov.u64 %rd52, __cuda___cuda_local_var_32762_55_non_const_red_acc7232; - cvt.s64.s32 %rd53, %r1; - mul.wide.s32 %rd54, %r1, 4; - add.u64 %rd55, %rd52, %rd54; - mov.f32 %f152, %f38; - st.shared.f32 [%rd55+0], %f152; - mov.f32 %f153, %f37; - st.shared.f32 [%rd55+512], %f153; - mov.f32 %f154, %f36; - st.shared.f32 [%rd55+1024], %f154; - mov.f32 %f155, %f40; - st.shared.f32 [%rd55+1536], %f155; - mov.f32 %f156, %f39; - st.shared.f32 [%rd55+2048], %f156; - shr.s32 %r66, %r4, 31; - mov.s32 %r67, 1; - and.b32 %r68, %r66, %r67; - add.s32 %r69, %r68, %r4; - shr.s32 %r70, %r69, 1; - mov.s32 %r71, %r70; - mov.u32 %r72, 0; - setp.ne.u32 %p15, %r70, %r72; - @!%p15 bra $Lt_1_32514; -$Lt_1_33026: - setp.ge.u32 %p16, %r16, %r71; - @%p16 bra $Lt_1_33282; - add.u32 %r73, %r1, %r71; - cvt.u64.u32 %rd56, %r73; - mul.wide.u32 %rd57, %r73, 4; - add.u64 %rd58, %rd52, %rd57; - ld.shared.f32 %f157, [%rd58+0]; - add.ftz.f32 %f152, %f157, %f152; - st.shared.f32 [%rd55+0], %f152; - ld.shared.f32 %f158, [%rd58+512]; - add.ftz.f32 %f153, %f158, %f153; - st.shared.f32 [%rd55+512], %f153; - ld.shared.f32 %f159, [%rd58+1024]; - add.ftz.f32 %f154, %f159, %f154; - st.shared.f32 [%rd55+1024], %f154; - ld.shared.f32 %f160, [%rd58+1536]; - add.ftz.f32 %f155, %f160, %f155; - st.shared.f32 [%rd55+1536], %f155; - ld.shared.f32 %f161, [%rd58+2048]; - add.ftz.f32 %f156, %f161, %f156; - st.shared.f32 [%rd55+2048], %f156; -$Lt_1_33282: - shr.u32 %r71, %r71, 1; - mov.u32 %r74, 0; - setp.ne.u32 %p17, %r71, %r74; - @%p17 bra $Lt_1_33026; -$Lt_1_32514: - mov.f32 %f38, %f152; - mov.f32 %f37, %f153; - mov.f32 %f36, %f154; - mov.f32 %f40, %f155; - mov.f32 %f39, %f156; - ld.param.s32 %r75, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r76, 0; - setp.le.s32 %p18, %r75, %r76; - @%p18 bra $Lt_1_34050; - mov.f32 %f152, %f11; - st.shared.f32 [%rd55+0], %f152; - mov.f32 %f153, %f13; - st.shared.f32 [%rd55+512], %f153; - mov.f32 %f154, %f15; - st.shared.f32 [%rd55+1024], %f154; - mov.f32 %f155, %f17; - st.shared.f32 [%rd55+1536], %f155; - mov.f32 %f156, %f19; - st.shared.f32 [%rd55+2048], %f156; - mov.f32 %f162, %f20; - st.shared.f32 [%rd55+2560], %f162; - mov.s32 %r77, %r70; - @!%p15 bra $Lt_1_34562; -$Lt_1_35074: - setp.ge.u32 %p19, %r16, %r77; - @%p19 bra $Lt_1_35330; - add.u32 %r78, %r1, %r77; - cvt.u64.u32 %rd59, %r78; - mul.wide.u32 %rd60, %r78, 4; - add.u64 %rd61, %rd52, %rd60; - ld.shared.f32 %f163, [%rd61+0]; - add.ftz.f32 %f152, %f163, %f152; - st.shared.f32 [%rd55+0], %f152; - ld.shared.f32 %f164, [%rd61+512]; - add.ftz.f32 %f153, %f164, %f153; - st.shared.f32 [%rd55+512], %f153; - ld.shared.f32 %f165, [%rd61+1024]; - add.ftz.f32 %f154, %f165, %f154; - st.shared.f32 [%rd55+1024], %f154; - ld.shared.f32 %f166, [%rd61+1536]; - add.ftz.f32 %f155, %f166, %f155; - st.shared.f32 [%rd55+1536], %f155; - ld.shared.f32 %f167, [%rd61+2048]; - add.ftz.f32 %f156, %f167, %f156; - st.shared.f32 [%rd55+2048], %f156; - ld.shared.f32 %f168, [%rd61+2560]; - add.ftz.f32 %f162, %f168, %f162; - st.shared.f32 [%rd55+2560], %f162; -$Lt_1_35330: - shr.u32 %r77, %r77, 1; - mov.u32 %r79, 0; - setp.ne.u32 %p20, %r77, %r79; - @%p20 bra $Lt_1_35074; -$Lt_1_34562: - mov.f32 %f11, %f152; - mov.f32 %f13, %f153; - mov.f32 %f15, %f154; - mov.f32 %f17, %f155; - mov.f32 %f19, %f156; - mov.f32 %f21, %f162; -$Lt_1_34050: -$Lt_1_32002: - mov.u32 %r80, 0; - setp.ne.s32 %p21, %r16, %r80; - @%p21 bra $Lt_1_36098; - ld.param.u64 %rd62, [__cudaparm_kernel_pair_fast___val_paramengv]; - add.u64 %rd63, %rd62, %rd18; - ld.param.s32 %r81, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r82, 0; - setp.le.s32 %p22, %r81, %r82; - @%p22 bra $Lt_1_36610; - st.global.f32 [%rd63+0], %f40; - cvt.s64.s32 %rd64, %r11; - mul.wide.s32 %rd65, %r11, 4; - add.u64 %rd66, %rd65, %rd63; - st.global.f32 [%rd66+0], %f39; - add.u64 %rd63, %rd65, %rd66; -$Lt_1_36610: - ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r84, 0; - setp.le.s32 %p23, %r83, %r84; - @%p23 bra $Lt_1_37122; - mov.f32 %f169, %f11; - st.global.f32 [%rd63+0], %f169; - cvt.s64.s32 %rd67, %r11; - mul.wide.s32 %rd68, %r11, 4; - add.u64 %rd69, %rd68, %rd63; - mov.f32 %f170, %f13; - st.global.f32 [%rd69+0], %f170; - add.u64 %rd70, %rd68, %rd69; - mov.f32 %f171, %f15; - st.global.f32 [%rd70+0], %f171; - add.u64 %rd71, %rd68, %rd70; - mov.f32 %f172, %f17; - st.global.f32 [%rd71+0], %f172; - add.u64 %rd63, %rd68, %rd71; - mov.f32 %f173, %f19; - st.global.f32 [%rd63+0], %f173; - mov.f32 %f174, %f21; - add.u64 %rd72, %rd68, %rd63; - st.global.f32 [%rd72+0], %f174; -$Lt_1_37122: - ld.param.u64 %rd73, [__cudaparm_kernel_pair_fast_ans]; - mul.lo.u64 %rd74, %rd17, 16; - add.u64 %rd75, %rd73, %rd74; - mov.f32 %f175, %f176; - st.global.v4.f32 [%rd75+0], {%f38,%f37,%f36,%f175}; -$Lt_1_36098: -$Lt_1_25858: - .loc 16 264 0 - exit; -$LDWend_kernel_pair_fast: - } // kernel_pair_fast - diff --git a/lib/gpu/cg_cmm_long_ptx.h b/lib/gpu/cg_cmm_long_ptx.h deleted file mode 100644 index 4098954171..0000000000 --- a/lib/gpu/cg_cmm_long_ptx.h +++ /dev/null @@ -1,1129 +0,0 @@ -const char * cg_cmm_long = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .global .texref pos_tex;\n" -" .global .texref q_tex;\n" -" .entry kernel_pair (\n" -" .param .u64 __cudaparm_kernel_pair_x_,\n" -" .param .u64 __cudaparm_kernel_pair_lj1,\n" -" .param .u64 __cudaparm_kernel_pair_lj3,\n" -" .param .s32 __cudaparm_kernel_pair_lj_types,\n" -" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_ans,\n" -" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_inum,\n" -" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" -" .param .u64 __cudaparm_kernel_pair_q_,\n" -" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n" -" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n" -" .param .f32 __cudaparm_kernel_pair_g_ewald,\n" -" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" -" {\n" -" .reg .u32 %r<86>;\n" -" .reg .u64 %rd<66>;\n" -" .reg .f32 %f<175>;\n" -" .reg .pred %p<23>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32541_33_non_const_sp_lj112[32];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32642_55_non_const_red_acc144[3072];\n" -" .loc 16 36 0\n" -"$LDWbegin_kernel_pair:\n" -" .loc 16 41 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" -" ldu.global.f32 %f1, [%rd1+0];\n" -" .loc 16 42 0\n" -" ld.global.f32 %f2, [%rd1+4];\n" -" .loc 16 43 0\n" -" ld.global.f32 %f3, [%rd1+8];\n" -" .loc 16 44 0\n" -" ld.global.f32 %f4, [%rd1+12];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_lj112+0], {%f1,%f2,%f3,%f4};\n" -" .loc 16 45 0\n" -" ld.global.f32 %f5, [%rd1+16];\n" -" .loc 16 46 0\n" -" ld.global.f32 %f6, [%rd1+20];\n" -" .loc 16 47 0\n" -" ld.global.f32 %f7, [%rd1+24];\n" -" .loc 16 48 0\n" -" ld.global.f32 %f8, [%rd1+28];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_lj112+16], {%f5,%f6,%f7,%f8};\n" -" .loc 16 56 0\n" -" mov.f32 %f9, 0f00000000; \n" -" mov.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f00000000; \n" -" mov.f32 %f12, %f11;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, %f13;\n" -" mov.f32 %f15, 0f00000000; \n" -" mov.f32 %f16, %f15;\n" -" mov.f32 %f17, 0f00000000; \n" -" mov.f32 %f18, %f17;\n" -" mov.f32 %f19, 0f00000000; \n" -" mov.f32 %f20, %f19;\n" -" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n" -" setp.ge.s32 %p1, %r8, %r9;\n" -" @%p1 bra $Lt_0_33538;\n" -" .loc 16 61 0\n" -" cvt.s64.s32 %rd2, %r8;\n" -" mul.wide.s32 %rd3, %r8, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n" -" add.u64 %rd5, %rd3, %rd4;\n" -" ld.global.s32 %r10, [%rd5+0];\n" -" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n" -" cvt.s64.s32 %rd6, %r11;\n" -" mul.wide.s32 %rd7, %r11, 4;\n" -" add.u64 %rd8, %rd7, %rd5;\n" -" ld.global.s32 %r12, [%rd8+0];\n" -" sub.s32 %r13, %r1, 1;\n" -" and.b32 %r14, %r13, %r2;\n" -" cvt.s64.s32 %rd9, %r14;\n" -" mul.wide.s32 %rd10, %r14, 4;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n" -" setp.ne.u64 %p2, %rd11, %rd4;\n" -" @%p2 bra $Lt_0_24066;\n" -" cvt.s32.s64 %r15, %rd6;\n" -" mul.lo.s32 %r16, %r15, %r1;\n" -" mov.s32 %r17, %r16;\n" -" mul.lo.s32 %r18, %r13, %r8;\n" -" add.s32 %r19, %r15, %r18;\n" -" cvt.s64.s32 %rd12, %r19;\n" -" mul.wide.s32 %rd13, %r19, 4;\n" -" add.u64 %rd14, %rd8, %rd13;\n" -" and.b32 %r20, %r13, %r12;\n" -" cvt.s64.s32 %rd15, %r20;\n" -" div.s32 %r21, %r12, %r1;\n" -" mul.lo.s32 %r22, %r16, %r21;\n" -" cvt.s64.s32 %rd16, %r22;\n" -" add.u64 %rd17, %rd15, %rd16;\n" -" mul.lo.u64 %rd18, %rd17, 4;\n" -" add.u64 %rd19, %rd14, %rd18;\n" -" add.u64 %rd20, %rd10, %rd14;\n" -" bra.uni $Lt_0_23810;\n" -"$Lt_0_24066:\n" -" add.u64 %rd21, %rd7, %rd8;\n" -" ld.global.s32 %r23, [%rd21+0];\n" -" cvt.s64.s32 %rd22, %r23;\n" -" mul.wide.s32 %rd23, %r23, 4;\n" -" add.u64 %rd24, %rd11, %rd23;\n" -" cvt.s64.s32 %rd25, %r12;\n" -" mul.wide.s32 %rd26, %r12, 4;\n" -" add.u64 %rd19, %rd24, %rd26;\n" -" mov.s32 %r17, %r1;\n" -" add.u64 %rd20, %rd10, %rd24;\n" -"$Lt_0_23810:\n" -" .loc 16 64 0\n" -" mov.u32 %r24, %r10;\n" -" mov.s32 %r25, 0;\n" -" mov.u32 %r26, %r25;\n" -" mov.s32 %r27, 0;\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r24,%r26,%r28,%r30}];\n" -" mov.f32 %f25, %f21;\n" -" mov.f32 %f26, %f22;\n" -" mov.f32 %f27, %f23;\n" -" mov.f32 %f28, %f24;\n" -" .loc 16 65 0\n" -" mov.u32 %r31, %r10;\n" -" mov.s32 %r32, 0;\n" -" mov.u32 %r33, %r32;\n" -" mov.s32 %r34, 0;\n" -" mov.u32 %r35, %r34;\n" -" mov.s32 %r36, 0;\n" -" mov.u32 %r37, %r36;\n" -" tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r31,%r33,%r35,%r37}];\n" -" mov.f32 %f33, %f29;\n" -" setp.ge.u64 %p3, %rd20, %rd19;\n" -" @%p3 bra $Lt_0_35074;\n" -" cvt.rzi.ftz.s32.f32 %r38, %f28;\n" -" cvt.s64.s32 %rd27, %r17;\n" -" ld.param.s32 %r39, [__cudaparm_kernel_pair_lj_types];\n" -" mul.lo.s32 %r40, %r39, %r38;\n" -" ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];\n" -" mov.f32 %f34, 0f00000000; \n" -" mov.f32 %f35, 0f00000000; \n" -" mov.f32 %f36, 0f00000000; \n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -" mov.u64 %rd29, __cuda___cuda_local_var_32541_33_non_const_sp_lj112;\n" -"$Lt_0_24834:\n" -" .loc 16 69 0\n" -" ld.global.s32 %r41, [%rd20+0];\n" -" .loc 16 72 0\n" -" shr.s32 %r42, %r41, 30;\n" -" and.b32 %r43, %r42, 3;\n" -" cvt.s64.s32 %rd30, %r43;\n" -" mul.wide.s32 %rd31, %r43, 4;\n" -" add.u64 %rd32, %rd29, %rd31;\n" -" ld.shared.f32 %f39, [%rd32+0];\n" -" .loc 16 73 0\n" -" mov.f32 %f40, 0f3f800000; \n" -" ld.shared.f32 %f41, [%rd32+16];\n" -" sub.ftz.f32 %f42, %f40, %f41;\n" -" .loc 16 76 0\n" -" and.b32 %r44, %r41, 1073741823;\n" -" mov.u32 %r45, %r44;\n" -" mov.s32 %r46, 0;\n" -" mov.u32 %r47, %r46;\n" -" mov.s32 %r48, 0;\n" -" mov.u32 %r49, %r48;\n" -" mov.s32 %r50, 0;\n" -" mov.u32 %r51, %r50;\n" -" tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r45,%r47,%r49,%r51}];\n" -" mov.f32 %f47, %f43;\n" -" mov.f32 %f48, %f44;\n" -" mov.f32 %f49, %f45;\n" -" mov.f32 %f50, %f46;\n" -" cvt.rzi.ftz.s32.f32 %r52, %f50;\n" -" sub.ftz.f32 %f51, %f26, %f48;\n" -" sub.ftz.f32 %f52, %f25, %f47;\n" -" sub.ftz.f32 %f53, %f27, %f49;\n" -" mul.ftz.f32 %f54, %f51, %f51;\n" -" fma.rn.ftz.f32 %f55, %f52, %f52, %f54;\n" -" fma.rn.ftz.f32 %f56, %f53, %f53, %f55;\n" -" add.s32 %r53, %r52, %r40;\n" -" cvt.s64.s32 %rd33, %r53;\n" -" mul.wide.s32 %rd34, %r53, 16;\n" -" add.u64 %rd35, %rd34, %rd28;\n" -" ld.global.f32 %f57, [%rd35+0];\n" -" setp.gt.ftz.f32 %p4, %f57, %f56;\n" -" @!%p4 bra $Lt_0_28674;\n" -" rcp.approx.ftz.f32 %f58, %f56;\n" -" ld.global.f32 %f59, [%rd35+4];\n" -" setp.lt.ftz.f32 %p5, %f56, %f59;\n" -" @!%p5 bra $Lt_0_25858;\n" -" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];\n" -" add.u64 %rd37, %rd36, %rd34;\n" -" ld.global.f32 %f60, [%rd37+0];\n" -" mov.f32 %f61, 0f40000000; \n" -" setp.eq.ftz.f32 %p6, %f60, %f61;\n" -" @!%p6 bra $Lt_0_26370;\n" -" .loc 16 92 0\n" -" mul.ftz.f32 %f62, %f58, %f58;\n" -" mov.f32 %f63, %f62;\n" -" mov.f32 %f64, %f63;\n" -" .loc 16 93 0\n" -" mul.ftz.f32 %f65, %f62, %f62;\n" -" mov.f32 %f66, %f65;\n" -" bra.uni $Lt_0_26626;\n" -"$Lt_0_26370:\n" -" mov.f32 %f67, 0f3f800000; \n" -" setp.eq.ftz.f32 %p7, %f60, %f67;\n" -" @!%p7 bra $Lt_0_26882;\n" -" .loc 16 95 0\n" -" rsqrt.approx.ftz.f32 %f68, %f56;\n" -" mul.ftz.f32 %f69, %f58, %f68;\n" -" mov.f32 %f65, %f69;\n" -" mov.f32 %f66, %f65;\n" -" .loc 16 96 0\n" -" mul.ftz.f32 %f63, %f69, %f69;\n" -" mov.f32 %f64, %f63;\n" -" bra.uni $Lt_0_26626;\n" -"$Lt_0_26882:\n" -" .loc 16 98 0\n" -" mul.ftz.f32 %f70, %f58, %f58;\n" -" mul.ftz.f32 %f71, %f58, %f70;\n" -" mov.f32 %f63, %f71;\n" -" mov.f32 %f64, %f63;\n" -" .loc 16 99 0\n" -" mov.f32 %f65, %f71;\n" -" mov.f32 %f66, %f65;\n" -"$Lt_0_26626:\n" -"$Lt_0_26114:\n" -" .loc 16 101 0\n" -" mul.ftz.f32 %f72, %f39, %f63;\n" -" ld.global.v2.f32 {%f73,%f74}, [%rd35+8];\n" -" mul.ftz.f32 %f75, %f73, %f65;\n" -" sub.ftz.f32 %f76, %f75, %f74;\n" -" mul.ftz.f32 %f77, %f72, %f76;\n" -" bra.uni $Lt_0_25602;\n" -"$Lt_0_25858:\n" -" .loc 16 103 0\n" -" mov.f32 %f77, 0f00000000; \n" -"$Lt_0_25602:\n" -" ld.param.f32 %f78, [__cudaparm_kernel_pair_cut_coulsq];\n" -" setp.gt.ftz.f32 %p8, %f78, %f56;\n" -" @!%p8 bra $Lt_0_27394;\n" -" .loc 20 518 0\n" -" rsqrt.approx.ftz.f32 %f79, %f58;\n" -" ld.param.f32 %f80, [__cudaparm_kernel_pair_g_ewald];\n" -" mul.ftz.f32 %f81, %f80, %f79;\n" -" mul.ftz.f32 %f82, %f81, %f81;\n" -" neg.ftz.f32 %f83, %f82;\n" -" mov.f32 %f84, 0f3fb8aa3b; \n" -" mul.ftz.f32 %f85, %f83, %f84;\n" -" ex2.approx.ftz.f32 %f86, %f85;\n" -" .loc 16 110 0\n" -" mov.f32 %f87, 0f3f800000; \n" -" mov.f32 %f88, 0f3ea7ba05; \n" -" fma.rn.ftz.f32 %f89, %f88, %f81, %f87;\n" -" rcp.approx.ftz.f32 %f90, %f89;\n" -" mov.f32 %f91, 0f3e827906; \n" -" mov.f32 %f92, 0fbe91a98e; \n" -" mov.f32 %f93, 0f3fb5f0e3; \n" -" mov.f32 %f94, 0fbfba00e3; \n" -" mov.f32 %f95, 0f3f87dc22; \n" -" fma.rn.ftz.f32 %f96, %f95, %f90, %f94;\n" -" fma.rn.ftz.f32 %f97, %f90, %f96, %f93;\n" -" fma.rn.ftz.f32 %f98, %f90, %f97, %f92;\n" -" fma.rn.ftz.f32 %f99, %f90, %f98, %f91;\n" -" mul.ftz.f32 %f100, %f90, %f99;\n" -" mul.ftz.f32 %f101, %f86, %f100;\n" -" mov.f32 %f102, %f101;\n" -" .loc 16 111 0\n" -" mov.u32 %r54, %r44;\n" -" mov.s32 %r55, 0;\n" -" mov.u32 %r56, %r55;\n" -" mov.s32 %r57, 0;\n" -" mov.u32 %r58, %r57;\n" -" mov.s32 %r59, 0;\n" -" mov.u32 %r60, %r59;\n" -" tex.1d.v4.f32.s32 {%f103,%f104,%f105,%f106},[q_tex,{%r54,%r56,%r58,%r60}];\n" -" mov.f32 %f107, %f103;\n" -" ld.param.f32 %f108, [__cudaparm_kernel_pair_qqrd2e];\n" -" mul.ftz.f32 %f109, %f108, %f33;\n" -" mul.ftz.f32 %f110, %f109, %f107;\n" -" div.approx.ftz.f32 %f111, %f110, %f79;\n" -" mov.f32 %f112, %f111;\n" -" .loc 16 112 0\n" -" mov.f32 %f113, 0f3f906ebb; \n" -" mul.ftz.f32 %f114, %f81, %f113;\n" -" fma.rn.ftz.f32 %f115, %f86, %f114, %f101;\n" -" sub.ftz.f32 %f116, %f115, %f42;\n" -" mul.ftz.f32 %f117, %f111, %f116;\n" -" bra.uni $Lt_0_27138;\n" -"$Lt_0_27394:\n" -" .loc 16 114 0\n" -" mov.f32 %f117, 0f00000000; \n" -"$Lt_0_27138:\n" -" .loc 16 118 0\n" -" add.ftz.f32 %f118, %f117, %f77;\n" -" mul.ftz.f32 %f119, %f118, %f58;\n" -" fma.rn.ftz.f32 %f36, %f52, %f119, %f36;\n" -" .loc 16 119 0\n" -" fma.rn.ftz.f32 %f35, %f51, %f119, %f35;\n" -" .loc 16 120 0\n" -" fma.rn.ftz.f32 %f34, %f53, %f119, %f34;\n" -" ld.param.s32 %r61, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r62, 0;\n" -" setp.le.s32 %p9, %r61, %r62;\n" -" @%p9 bra $Lt_0_28162;\n" -" .loc 16 123 0\n" -" mov.f32 %f120, %f112;\n" -" mov.f32 %f121, %f102;\n" -" sub.ftz.f32 %f122, %f121, %f42;\n" -" fma.rn.ftz.f32 %f123, %f120, %f122, %f37;\n" -" selp.f32 %f37, %f123, %f37, %p8;\n" -" @!%p5 bra $Lt_0_28162;\n" -" .loc 16 126 0\n" -" ld.param.u64 %rd38, [__cudaparm_kernel_pair_lj3];\n" -" add.u64 %rd39, %rd38, %rd34;\n" -" ld.global.v4.f32 {_,%f124,%f125,%f126}, [%rd39+0];\n" -" mov.f32 %f127, %f64;\n" -" mul.ftz.f32 %f128, %f127, %f39;\n" -" mov.f32 %f129, %f66;\n" -" mul.ftz.f32 %f130, %f124, %f129;\n" -" sub.ftz.f32 %f131, %f130, %f125;\n" -" mul.ftz.f32 %f132, %f128, %f131;\n" -" sub.ftz.f32 %f133, %f132, %f126;\n" -" add.ftz.f32 %f38, %f38, %f133;\n" -"$Lt_0_28162:\n" -"$Lt_0_27650:\n" -" ld.param.s32 %r63, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r64, 0;\n" -" setp.le.s32 %p10, %r63, %r64;\n" -" @%p10 bra $Lt_0_28674;\n" -" .loc 16 131 0\n" -" mov.f32 %f134, %f10;\n" -" mul.ftz.f32 %f135, %f52, %f52;\n" -" fma.rn.ftz.f32 %f136, %f119, %f135, %f134;\n" -" mov.f32 %f10, %f136;\n" -" .loc 16 132 0\n" -" mov.f32 %f137, %f12;\n" -" fma.rn.ftz.f32 %f138, %f119, %f54, %f137;\n" -" mov.f32 %f12, %f138;\n" -" .loc 16 133 0\n" -" mov.f32 %f139, %f14;\n" -" mul.ftz.f32 %f140, %f53, %f53;\n" -" fma.rn.ftz.f32 %f141, %f119, %f140, %f139;\n" -" mov.f32 %f14, %f141;\n" -" .loc 16 134 0\n" -" mov.f32 %f142, %f16;\n" -" mul.ftz.f32 %f143, %f51, %f52;\n" -" fma.rn.ftz.f32 %f144, %f119, %f143, %f142;\n" -" mov.f32 %f16, %f144;\n" -" .loc 16 135 0\n" -" mov.f32 %f145, %f18;\n" -" mul.ftz.f32 %f146, %f52, %f53;\n" -" fma.rn.ftz.f32 %f147, %f119, %f146, %f145;\n" -" mov.f32 %f18, %f147;\n" -" .loc 16 136 0\n" -" mul.ftz.f32 %f148, %f51, %f53;\n" -" fma.rn.ftz.f32 %f19, %f119, %f148, %f19;\n" -" mov.f32 %f20, %f19;\n" -"$Lt_0_28674:\n" -"$Lt_0_25090:\n" -" .loc 16 68 0\n" -" mul.lo.u64 %rd40, %rd27, 4;\n" -" add.u64 %rd20, %rd20, %rd40;\n" -" setp.lt.u64 %p11, %rd20, %rd19;\n" -" @%p11 bra $Lt_0_24834;\n" -" bra.uni $Lt_0_24322;\n" -"$Lt_0_35074:\n" -" mov.f32 %f34, 0f00000000; \n" -" mov.f32 %f35, 0f00000000; \n" -" mov.f32 %f36, 0f00000000; \n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -"$Lt_0_24322:\n" -" mov.u32 %r65, 1;\n" -" setp.le.s32 %p12, %r1, %r65;\n" -" @%p12 bra $Lt_0_31490;\n" -" .loc 16 141 0\n" -" mov.u64 %rd41, __cuda___cuda_local_var_32642_55_non_const_red_acc144;\n" -" cvt.s64.s32 %rd42, %r2;\n" -" mul.wide.s32 %rd43, %r2, 4;\n" -" add.u64 %rd44, %rd41, %rd43;\n" -" mov.f32 %f149, %f36;\n" -" st.shared.f32 [%rd44+0], %f149;\n" -" mov.f32 %f150, %f35;\n" -" st.shared.f32 [%rd44+512], %f150;\n" -" mov.f32 %f151, %f34;\n" -" st.shared.f32 [%rd44+1024], %f151;\n" -" mov.f32 %f152, %f38;\n" -" st.shared.f32 [%rd44+1536], %f152;\n" -" mov.f32 %f153, %f37;\n" -" st.shared.f32 [%rd44+2048], %f153;\n" -" shr.s32 %r66, %r1, 31;\n" -" mov.s32 %r67, 1;\n" -" and.b32 %r68, %r66, %r67;\n" -" add.s32 %r69, %r68, %r1;\n" -" shr.s32 %r70, %r69, 1;\n" -" mov.s32 %r71, %r70;\n" -" mov.u32 %r72, 0;\n" -" setp.ne.u32 %p13, %r70, %r72;\n" -" @!%p13 bra $Lt_0_29954;\n" -"$Lt_0_30466:\n" -" setp.ge.u32 %p14, %r14, %r71;\n" -" @%p14 bra $Lt_0_30722;\n" -" add.u32 %r73, %r2, %r71;\n" -" cvt.u64.u32 %rd45, %r73;\n" -" mul.wide.u32 %rd46, %r73, 4;\n" -" add.u64 %rd47, %rd41, %rd46;\n" -" ld.shared.f32 %f154, [%rd47+0];\n" -" add.ftz.f32 %f149, %f154, %f149;\n" -" st.shared.f32 [%rd44+0], %f149;\n" -" ld.shared.f32 %f155, [%rd47+512];\n" -" add.ftz.f32 %f150, %f155, %f150;\n" -" st.shared.f32 [%rd44+512], %f150;\n" -" ld.shared.f32 %f156, [%rd47+1024];\n" -" add.ftz.f32 %f151, %f156, %f151;\n" -" st.shared.f32 [%rd44+1024], %f151;\n" -" ld.shared.f32 %f157, [%rd47+1536];\n" -" add.ftz.f32 %f152, %f157, %f152;\n" -" st.shared.f32 [%rd44+1536], %f152;\n" -" ld.shared.f32 %f158, [%rd47+2048];\n" -" add.ftz.f32 %f153, %f158, %f153;\n" -" st.shared.f32 [%rd44+2048], %f153;\n" -"$Lt_0_30722:\n" -" shr.u32 %r71, %r71, 1;\n" -" mov.u32 %r74, 0;\n" -" setp.ne.u32 %p15, %r71, %r74;\n" -" @%p15 bra $Lt_0_30466;\n" -"$Lt_0_29954:\n" -" mov.f32 %f36, %f149;\n" -" mov.f32 %f35, %f150;\n" -" mov.f32 %f34, %f151;\n" -" mov.f32 %f38, %f152;\n" -" mov.f32 %f37, %f153;\n" -" ld.param.s32 %r75, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r76, 0;\n" -" setp.le.s32 %p16, %r75, %r76;\n" -" @%p16 bra $Lt_0_31490;\n" -" mov.f32 %f149, %f10;\n" -" st.shared.f32 [%rd44+0], %f149;\n" -" mov.f32 %f150, %f12;\n" -" st.shared.f32 [%rd44+512], %f150;\n" -" mov.f32 %f151, %f14;\n" -" st.shared.f32 [%rd44+1024], %f151;\n" -" mov.f32 %f152, %f16;\n" -" st.shared.f32 [%rd44+1536], %f152;\n" -" mov.f32 %f153, %f18;\n" -" st.shared.f32 [%rd44+2048], %f153;\n" -" mov.f32 %f159, %f19;\n" -" st.shared.f32 [%rd44+2560], %f159;\n" -" mov.s32 %r77, %r70;\n" -" @!%p13 bra $Lt_0_32002;\n" -"$Lt_0_32514:\n" -" setp.ge.u32 %p17, %r14, %r77;\n" -" @%p17 bra $Lt_0_32770;\n" -" add.u32 %r78, %r2, %r77;\n" -" cvt.u64.u32 %rd48, %r78;\n" -" mul.wide.u32 %rd49, %r78, 4;\n" -" add.u64 %rd50, %rd41, %rd49;\n" -" ld.shared.f32 %f160, [%rd50+0];\n" -" add.ftz.f32 %f149, %f160, %f149;\n" -" st.shared.f32 [%rd44+0], %f149;\n" -" ld.shared.f32 %f161, [%rd50+512];\n" -" add.ftz.f32 %f150, %f161, %f150;\n" -" st.shared.f32 [%rd44+512], %f150;\n" -" ld.shared.f32 %f162, [%rd50+1024];\n" -" add.ftz.f32 %f151, %f162, %f151;\n" -" st.shared.f32 [%rd44+1024], %f151;\n" -" ld.shared.f32 %f163, [%rd50+1536];\n" -" add.ftz.f32 %f152, %f163, %f152;\n" -" st.shared.f32 [%rd44+1536], %f152;\n" -" ld.shared.f32 %f164, [%rd50+2048];\n" -" add.ftz.f32 %f153, %f164, %f153;\n" -" st.shared.f32 [%rd44+2048], %f153;\n" -" ld.shared.f32 %f165, [%rd50+2560];\n" -" add.ftz.f32 %f159, %f165, %f159;\n" -" st.shared.f32 [%rd44+2560], %f159;\n" -"$Lt_0_32770:\n" -" shr.u32 %r77, %r77, 1;\n" -" mov.u32 %r79, 0;\n" -" setp.ne.u32 %p18, %r77, %r79;\n" -" @%p18 bra $Lt_0_32514;\n" -"$Lt_0_32002:\n" -" mov.f32 %f10, %f149;\n" -" mov.f32 %f12, %f150;\n" -" mov.f32 %f14, %f151;\n" -" mov.f32 %f16, %f152;\n" -" mov.f32 %f18, %f153;\n" -" mov.f32 %f20, %f159;\n" -"$Lt_0_31490:\n" -"$Lt_0_29442:\n" -" mov.u32 %r80, 0;\n" -" setp.ne.s32 %p19, %r14, %r80;\n" -" @%p19 bra $Lt_0_33538;\n" -" ld.param.u64 %rd51, [__cudaparm_kernel_pair___val_paramengv];\n" -" add.u64 %rd52, %rd51, %rd3;\n" -" ld.param.s32 %r81, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r82, 0;\n" -" setp.le.s32 %p20, %r81, %r82;\n" -" @%p20 bra $Lt_0_34050;\n" -" st.global.f32 [%rd52+0], %f38;\n" -" cvt.s64.s32 %rd53, %r9;\n" -" mul.wide.s32 %rd54, %r9, 4;\n" -" add.u64 %rd55, %rd54, %rd52;\n" -" st.global.f32 [%rd55+0], %f37;\n" -" add.u64 %rd52, %rd54, %rd55;\n" -"$Lt_0_34050:\n" -" ld.param.s32 %r83, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r84, 0;\n" -" setp.le.s32 %p21, %r83, %r84;\n" -" @%p21 bra $Lt_0_34562;\n" -" mov.f32 %f166, %f10;\n" -" st.global.f32 [%rd52+0], %f166;\n" -" cvt.s64.s32 %rd56, %r9;\n" -" mul.wide.s32 %rd57, %r9, 4;\n" -" add.u64 %rd58, %rd57, %rd52;\n" -" mov.f32 %f167, %f12;\n" -" st.global.f32 [%rd58+0], %f167;\n" -" add.u64 %rd59, %rd57, %rd58;\n" -" mov.f32 %f168, %f14;\n" -" st.global.f32 [%rd59+0], %f168;\n" -" add.u64 %rd60, %rd57, %rd59;\n" -" mov.f32 %f169, %f16;\n" -" st.global.f32 [%rd60+0], %f169;\n" -" add.u64 %rd52, %rd57, %rd60;\n" -" mov.f32 %f170, %f18;\n" -" st.global.f32 [%rd52+0], %f170;\n" -" mov.f32 %f171, %f20;\n" -" add.u64 %rd61, %rd57, %rd52;\n" -" st.global.f32 [%rd61+0], %f171;\n" -"$Lt_0_34562:\n" -" ld.param.u64 %rd62, [__cudaparm_kernel_pair_ans];\n" -" mul.lo.u64 %rd63, %rd2, 16;\n" -" add.u64 %rd64, %rd62, %rd63;\n" -" mov.f32 %f172, %f173;\n" -" st.global.v4.f32 [%rd64+0], {%f36,%f35,%f34,%f172};\n" -"$Lt_0_33538:\n" -"$Lt_0_23298:\n" -" .loc 16 144 0\n" -" exit;\n" -"$LDWend_kernel_pair:\n" -" }\n" -" .entry kernel_pair_fast (\n" -" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" -" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" -" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" -" .param .u64 __cudaparm_kernel_pair_fast_q_,\n" -" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n" -" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n" -" .param .f32 __cudaparm_kernel_pair_fast_g_ewald,\n" -" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" -" {\n" -" .reg .u32 %r<86>;\n" -" .reg .u64 %rd<77>;\n" -" .reg .f32 %f<178>;\n" -" .reg .pred %p<25>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32661_33_non_const_sp_lj3320[32];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32660_34_non_const_lj33360[1936];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32659_34_non_const_lj15296[1936];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32762_55_non_const_red_acc7232[3072];\n" -" .loc 16 154 0\n" -"$LDWbegin_kernel_pair_fast:\n" -" cvt.s32.u32 %r1, %tid.x;\n" -" mov.u32 %r2, 7;\n" -" setp.gt.s32 %p1, %r1, %r2;\n" -" @%p1 bra $Lt_1_24834;\n" -" .loc 16 162 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32661_33_non_const_sp_lj3320;\n" -" cvt.s64.s32 %rd2, %r1;\n" -" mul.wide.s32 %rd3, %r1, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f32 %f1, [%rd5+0];\n" -" add.u64 %rd6, %rd3, %rd1;\n" -" st.shared.f32 [%rd6+0], %f1;\n" -"$Lt_1_24834:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32661_33_non_const_sp_lj3320;\n" -" mov.u32 %r3, 120;\n" -" setp.gt.s32 %p2, %r1, %r3;\n" -" @%p2 bra $Lt_1_25346;\n" -" .loc 16 164 0\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32660_34_non_const_lj33360;\n" -" mov.u64 %rd8, __cuda___cuda_local_var_32659_34_non_const_lj15296;\n" -" cvt.s64.s32 %rd9, %r1;\n" -" mul.wide.s32 %rd10, %r1, 16;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_pair_fast_lj1_in];\n" -" add.u64 %rd12, %rd11, %rd10;\n" -" add.u64 %rd13, %rd10, %rd8;\n" -" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd12+0];\n" -" st.shared.v4.f32 [%rd13+0], {%f2,%f3,%f4,%f5};\n" -" .loc 16 165 0\n" -" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n" -" add.u64 %rd15, %rd14, %rd10;\n" -" add.u64 %rd16, %rd10, %rd7;\n" -" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n" -" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n" -"$Lt_1_25346:\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32660_34_non_const_lj33360;\n" -" mov.u64 %rd8, __cuda___cuda_local_var_32659_34_non_const_lj15296;\n" -" .loc 16 174 0\n" -" mov.f32 %f10, 0f00000000; \n" -" mov.f32 %f11, %f10;\n" -" mov.f32 %f12, 0f00000000; \n" -" mov.f32 %f13, %f12;\n" -" mov.f32 %f14, 0f00000000; \n" -" mov.f32 %f15, %f14;\n" -" mov.f32 %f16, 0f00000000; \n" -" mov.f32 %f17, %f16;\n" -" mov.f32 %f18, 0f00000000; \n" -" mov.f32 %f19, %f18;\n" -" mov.f32 %f20, 0f00000000; \n" -" mov.f32 %f21, %f20;\n" -" .loc 16 176 0\n" -" bar.sync 0;\n" -" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_t_per_atom];\n" -" div.s32 %r5, %r1, %r4;\n" -" cvt.s32.u32 %r6, %ntid.x;\n" -" div.s32 %r7, %r6, %r4;\n" -" cvt.s32.u32 %r8, %ctaid.x;\n" -" mul.lo.s32 %r9, %r8, %r7;\n" -" add.s32 %r10, %r5, %r9;\n" -" ld.param.s32 %r11, [__cudaparm_kernel_pair_fast_inum];\n" -" setp.ge.s32 %p3, %r10, %r11;\n" -" @%p3 bra $Lt_1_36098;\n" -" .loc 16 181 0\n" -" cvt.s64.s32 %rd17, %r10;\n" -" mul.wide.s32 %rd18, %r10, 4;\n" -" ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n" -" add.u64 %rd20, %rd18, %rd19;\n" -" ld.global.s32 %r12, [%rd20+0];\n" -" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" -" cvt.s64.s32 %rd21, %r13;\n" -" mul.wide.s32 %rd22, %r13, 4;\n" -" add.u64 %rd23, %rd22, %rd20;\n" -" ld.global.s32 %r14, [%rd23+0];\n" -" sub.s32 %r15, %r4, 1;\n" -" and.b32 %r16, %r15, %r1;\n" -" cvt.s64.s32 %rd24, %r16;\n" -" mul.wide.s32 %rd25, %r16, 4;\n" -" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n" -" setp.ne.u64 %p4, %rd26, %rd19;\n" -" @%p4 bra $Lt_1_26626;\n" -" cvt.s32.s64 %r17, %rd21;\n" -" mul.lo.s32 %r18, %r17, %r4;\n" -" mov.s32 %r19, %r18;\n" -" mul.lo.s32 %r20, %r15, %r10;\n" -" add.s32 %r21, %r17, %r20;\n" -" cvt.s64.s32 %rd27, %r21;\n" -" mul.wide.s32 %rd28, %r21, 4;\n" -" add.u64 %rd29, %rd23, %rd28;\n" -" and.b32 %r22, %r15, %r14;\n" -" cvt.s64.s32 %rd30, %r22;\n" -" div.s32 %r23, %r14, %r4;\n" -" mul.lo.s32 %r24, %r18, %r23;\n" -" cvt.s64.s32 %rd31, %r24;\n" -" add.u64 %rd32, %rd30, %rd31;\n" -" mul.lo.u64 %rd33, %rd32, 4;\n" -" add.u64 %rd34, %rd29, %rd33;\n" -" add.u64 %rd35, %rd25, %rd29;\n" -" bra.uni $Lt_1_26370;\n" -"$Lt_1_26626:\n" -" add.u64 %rd36, %rd22, %rd23;\n" -" ld.global.s32 %r25, [%rd36+0];\n" -" cvt.s64.s32 %rd37, %r25;\n" -" mul.wide.s32 %rd38, %r25, 4;\n" -" add.u64 %rd39, %rd26, %rd38;\n" -" cvt.s64.s32 %rd40, %r14;\n" -" mul.wide.s32 %rd41, %r14, 4;\n" -" add.u64 %rd34, %rd39, %rd41;\n" -" mov.s32 %r19, %r4;\n" -" add.u64 %rd35, %rd25, %rd39;\n" -"$Lt_1_26370:\n" -" .loc 16 184 0\n" -" mov.u32 %r26, %r12;\n" -" mov.s32 %r27, 0;\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" mov.s32 %r31, 0;\n" -" mov.u32 %r32, %r31;\n" -" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r26,%r28,%r30,%r32}];\n" -" mov.f32 %f26, %f22;\n" -" mov.f32 %f27, %f23;\n" -" mov.f32 %f28, %f24;\n" -" mov.f32 %f29, %f25;\n" -" .loc 16 185 0\n" -" mov.u32 %r33, %r12;\n" -" mov.s32 %r34, 0;\n" -" mov.u32 %r35, %r34;\n" -" mov.s32 %r36, 0;\n" -" mov.u32 %r37, %r36;\n" -" mov.s32 %r38, 0;\n" -" mov.u32 %r39, %r38;\n" -" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[q_tex,{%r33,%r35,%r37,%r39}];\n" -" mov.f32 %f34, %f30;\n" -" setp.ge.u64 %p5, %rd35, %rd34;\n" -" @%p5 bra $Lt_1_37634;\n" -" cvt.rzi.ftz.s32.f32 %r40, %f29;\n" -" cvt.s64.s32 %rd42, %r19;\n" -" mul.lo.s32 %r41, %r40, 11;\n" -" cvt.rn.f32.s32 %f35, %r41;\n" -" mov.f32 %f36, 0f00000000; \n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -" mov.f32 %f39, 0f00000000; \n" -" mov.f32 %f40, 0f00000000; \n" -"$Lt_1_27394:\n" -" .loc 16 190 0\n" -" ld.global.s32 %r42, [%rd35+0];\n" -" .loc 16 193 0\n" -" shr.s32 %r43, %r42, 30;\n" -" and.b32 %r44, %r43, 3;\n" -" cvt.s64.s32 %rd43, %r44;\n" -" mul.wide.s32 %rd44, %r44, 4;\n" -" add.u64 %rd45, %rd1, %rd44;\n" -" ld.shared.f32 %f41, [%rd45+0];\n" -" .loc 16 194 0\n" -" mov.f32 %f42, 0f3f800000; \n" -" ld.shared.f32 %f43, [%rd45+16];\n" -" sub.ftz.f32 %f44, %f42, %f43;\n" -" .loc 16 197 0\n" -" and.b32 %r45, %r42, 1073741823;\n" -" mov.u32 %r46, %r45;\n" -" mov.s32 %r47, 0;\n" -" mov.u32 %r48, %r47;\n" -" mov.s32 %r49, 0;\n" -" mov.u32 %r50, %r49;\n" -" mov.s32 %r51, 0;\n" -" mov.u32 %r52, %r51;\n" -" tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[pos_tex,{%r46,%r48,%r50,%r52}];\n" -" mov.f32 %f49, %f45;\n" -" mov.f32 %f50, %f46;\n" -" mov.f32 %f51, %f47;\n" -" mov.f32 %f52, %f48;\n" -" sub.ftz.f32 %f53, %f27, %f50;\n" -" sub.ftz.f32 %f54, %f26, %f49;\n" -" sub.ftz.f32 %f55, %f28, %f51;\n" -" mul.ftz.f32 %f56, %f53, %f53;\n" -" fma.rn.ftz.f32 %f57, %f54, %f54, %f56;\n" -" fma.rn.ftz.f32 %f58, %f55, %f55, %f57;\n" -" add.ftz.f32 %f59, %f35, %f52;\n" -" cvt.rzi.ftz.s32.f32 %r53, %f59;\n" -" cvt.s64.s32 %rd46, %r53;\n" -" mul.wide.s32 %rd47, %r53, 16;\n" -" add.u64 %rd48, %rd47, %rd8;\n" -" ld.shared.f32 %f60, [%rd48+0];\n" -" setp.gt.ftz.f32 %p6, %f60, %f58;\n" -" @!%p6 bra $Lt_1_31234;\n" -" rcp.approx.ftz.f32 %f61, %f58;\n" -" ld.shared.f32 %f62, [%rd48+4];\n" -" setp.lt.ftz.f32 %p7, %f58, %f62;\n" -" @!%p7 bra $Lt_1_28418;\n" -" add.u64 %rd49, %rd47, %rd7;\n" -" ld.shared.f32 %f63, [%rd49+0];\n" -" mov.f32 %f64, 0f40000000; \n" -" setp.eq.ftz.f32 %p8, %f63, %f64;\n" -" @!%p8 bra $Lt_1_28930;\n" -" .loc 16 212 0\n" -" mul.ftz.f32 %f65, %f61, %f61;\n" -" mov.f32 %f66, %f65;\n" -" mov.f32 %f67, %f66;\n" -" .loc 16 213 0\n" -" mul.ftz.f32 %f68, %f65, %f65;\n" -" mov.f32 %f69, %f68;\n" -" bra.uni $Lt_1_29186;\n" -"$Lt_1_28930:\n" -" mov.f32 %f70, 0f3f800000; \n" -" setp.eq.ftz.f32 %p9, %f63, %f70;\n" -" @!%p9 bra $Lt_1_29442;\n" -" .loc 16 215 0\n" -" rsqrt.approx.ftz.f32 %f71, %f58;\n" -" mul.ftz.f32 %f72, %f61, %f71;\n" -" mov.f32 %f68, %f72;\n" -" mov.f32 %f69, %f68;\n" -" .loc 16 216 0\n" -" mul.ftz.f32 %f66, %f72, %f72;\n" -" mov.f32 %f67, %f66;\n" -" bra.uni $Lt_1_29186;\n" -"$Lt_1_29442:\n" -" .loc 16 218 0\n" -" mul.ftz.f32 %f73, %f61, %f61;\n" -" mul.ftz.f32 %f74, %f61, %f73;\n" -" mov.f32 %f66, %f74;\n" -" mov.f32 %f67, %f66;\n" -" .loc 16 219 0\n" -" mov.f32 %f68, %f74;\n" -" mov.f32 %f69, %f68;\n" -"$Lt_1_29186:\n" -"$Lt_1_28674:\n" -" .loc 16 221 0\n" -" mul.ftz.f32 %f75, %f41, %f66;\n" -" ld.shared.v2.f32 {%f76,%f77}, [%rd48+8];\n" -" mul.ftz.f32 %f78, %f76, %f68;\n" -" sub.ftz.f32 %f79, %f78, %f77;\n" -" mul.ftz.f32 %f80, %f75, %f79;\n" -" bra.uni $Lt_1_28162;\n" -"$Lt_1_28418:\n" -" .loc 16 223 0\n" -" mov.f32 %f80, 0f00000000; \n" -"$Lt_1_28162:\n" -" ld.param.f32 %f81, [__cudaparm_kernel_pair_fast_cut_coulsq];\n" -" setp.gt.ftz.f32 %p10, %f81, %f58;\n" -" @!%p10 bra $Lt_1_29954;\n" -" .loc 20 518 0\n" -" rsqrt.approx.ftz.f32 %f82, %f61;\n" -" ld.param.f32 %f83, [__cudaparm_kernel_pair_fast_g_ewald];\n" -" mul.ftz.f32 %f84, %f83, %f82;\n" -" mul.ftz.f32 %f85, %f84, %f84;\n" -" neg.ftz.f32 %f86, %f85;\n" -" mov.f32 %f87, 0f3fb8aa3b; \n" -" mul.ftz.f32 %f88, %f86, %f87;\n" -" ex2.approx.ftz.f32 %f89, %f88;\n" -" .loc 16 230 0\n" -" mov.f32 %f90, 0f3f800000; \n" -" mov.f32 %f91, 0f3ea7ba05; \n" -" fma.rn.ftz.f32 %f92, %f91, %f84, %f90;\n" -" rcp.approx.ftz.f32 %f93, %f92;\n" -" mov.f32 %f94, 0f3e827906; \n" -" mov.f32 %f95, 0fbe91a98e; \n" -" mov.f32 %f96, 0f3fb5f0e3; \n" -" mov.f32 %f97, 0fbfba00e3; \n" -" mov.f32 %f98, 0f3f87dc22; \n" -" fma.rn.ftz.f32 %f99, %f98, %f93, %f97;\n" -" fma.rn.ftz.f32 %f100, %f93, %f99, %f96;\n" -" fma.rn.ftz.f32 %f101, %f93, %f100, %f95;\n" -" fma.rn.ftz.f32 %f102, %f93, %f101, %f94;\n" -" mul.ftz.f32 %f103, %f93, %f102;\n" -" mul.ftz.f32 %f104, %f89, %f103;\n" -" mov.f32 %f105, %f104;\n" -" .loc 16 231 0\n" -" mov.u32 %r54, %r45;\n" -" mov.s32 %r55, 0;\n" -" mov.u32 %r56, %r55;\n" -" mov.s32 %r57, 0;\n" -" mov.u32 %r58, %r57;\n" -" mov.s32 %r59, 0;\n" -" mov.u32 %r60, %r59;\n" -" tex.1d.v4.f32.s32 {%f106,%f107,%f108,%f109},[q_tex,{%r54,%r56,%r58,%r60}];\n" -" mov.f32 %f110, %f106;\n" -" ld.param.f32 %f111, [__cudaparm_kernel_pair_fast_qqrd2e];\n" -" mul.ftz.f32 %f112, %f111, %f34;\n" -" mul.ftz.f32 %f113, %f112, %f110;\n" -" div.approx.ftz.f32 %f114, %f113, %f82;\n" -" mov.f32 %f115, %f114;\n" -" .loc 16 232 0\n" -" mov.f32 %f116, 0f3f906ebb; \n" -" mul.ftz.f32 %f117, %f84, %f116;\n" -" fma.rn.ftz.f32 %f118, %f89, %f117, %f104;\n" -" sub.ftz.f32 %f119, %f118, %f44;\n" -" mul.ftz.f32 %f120, %f114, %f119;\n" -" bra.uni $Lt_1_29698;\n" -"$Lt_1_29954:\n" -" .loc 16 234 0\n" -" mov.f32 %f120, 0f00000000; \n" -"$Lt_1_29698:\n" -" .loc 16 238 0\n" -" add.ftz.f32 %f121, %f120, %f80;\n" -" mul.ftz.f32 %f122, %f121, %f61;\n" -" fma.rn.ftz.f32 %f38, %f54, %f122, %f38;\n" -" .loc 16 239 0\n" -" fma.rn.ftz.f32 %f37, %f53, %f122, %f37;\n" -" .loc 16 240 0\n" -" fma.rn.ftz.f32 %f36, %f55, %f122, %f36;\n" -" ld.param.s32 %r61, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r62, 0;\n" -" setp.le.s32 %p11, %r61, %r62;\n" -" @%p11 bra $Lt_1_30722;\n" -" .loc 16 243 0\n" -" mov.f32 %f123, %f115;\n" -" mov.f32 %f124, %f105;\n" -" sub.ftz.f32 %f125, %f124, %f44;\n" -" fma.rn.ftz.f32 %f126, %f123, %f125, %f39;\n" -" selp.f32 %f39, %f126, %f39, %p10;\n" -" @!%p7 bra $Lt_1_30722;\n" -" .loc 16 246 0\n" -" add.u64 %rd50, %rd47, %rd7;\n" -" ld.shared.v4.f32 {_,%f127,%f128,%f129}, [%rd50+0];\n" -" mov.f32 %f130, %f67;\n" -" mul.ftz.f32 %f131, %f130, %f41;\n" -" mov.f32 %f132, %f69;\n" -" mul.ftz.f32 %f133, %f127, %f132;\n" -" sub.ftz.f32 %f134, %f133, %f128;\n" -" mul.ftz.f32 %f135, %f131, %f134;\n" -" sub.ftz.f32 %f136, %f135, %f129;\n" -" add.ftz.f32 %f40, %f40, %f136;\n" -"$Lt_1_30722:\n" -"$Lt_1_30210:\n" -" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r64, 0;\n" -" setp.le.s32 %p12, %r63, %r64;\n" -" @%p12 bra $Lt_1_31234;\n" -" .loc 16 251 0\n" -" mov.f32 %f137, %f11;\n" -" mul.ftz.f32 %f138, %f54, %f54;\n" -" fma.rn.ftz.f32 %f139, %f122, %f138, %f137;\n" -" mov.f32 %f11, %f139;\n" -" .loc 16 252 0\n" -" mov.f32 %f140, %f13;\n" -" fma.rn.ftz.f32 %f141, %f122, %f56, %f140;\n" -" mov.f32 %f13, %f141;\n" -" .loc 16 253 0\n" -" mov.f32 %f142, %f15;\n" -" mul.ftz.f32 %f143, %f55, %f55;\n" -" fma.rn.ftz.f32 %f144, %f122, %f143, %f142;\n" -" mov.f32 %f15, %f144;\n" -" .loc 16 254 0\n" -" mov.f32 %f145, %f17;\n" -" mul.ftz.f32 %f146, %f53, %f54;\n" -" fma.rn.ftz.f32 %f147, %f122, %f146, %f145;\n" -" mov.f32 %f17, %f147;\n" -" .loc 16 255 0\n" -" mov.f32 %f148, %f19;\n" -" mul.ftz.f32 %f149, %f54, %f55;\n" -" fma.rn.ftz.f32 %f150, %f122, %f149, %f148;\n" -" mov.f32 %f19, %f150;\n" -" .loc 16 256 0\n" -" mul.ftz.f32 %f151, %f53, %f55;\n" -" fma.rn.ftz.f32 %f20, %f122, %f151, %f20;\n" -" mov.f32 %f21, %f20;\n" -"$Lt_1_31234:\n" -"$Lt_1_27650:\n" -" .loc 16 189 0\n" -" mul.lo.u64 %rd51, %rd42, 4;\n" -" add.u64 %rd35, %rd35, %rd51;\n" -" setp.lt.u64 %p13, %rd35, %rd34;\n" -" @%p13 bra $Lt_1_27394;\n" -" bra.uni $Lt_1_26882;\n" -"$Lt_1_37634:\n" -" mov.f32 %f36, 0f00000000; \n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -" mov.f32 %f39, 0f00000000; \n" -" mov.f32 %f40, 0f00000000; \n" -"$Lt_1_26882:\n" -" mov.u32 %r65, 1;\n" -" setp.le.s32 %p14, %r4, %r65;\n" -" @%p14 bra $Lt_1_34050;\n" -" .loc 16 261 0\n" -" mov.u64 %rd52, __cuda___cuda_local_var_32762_55_non_const_red_acc7232;\n" -" cvt.s64.s32 %rd53, %r1;\n" -" mul.wide.s32 %rd54, %r1, 4;\n" -" add.u64 %rd55, %rd52, %rd54;\n" -" mov.f32 %f152, %f38;\n" -" st.shared.f32 [%rd55+0], %f152;\n" -" mov.f32 %f153, %f37;\n" -" st.shared.f32 [%rd55+512], %f153;\n" -" mov.f32 %f154, %f36;\n" -" st.shared.f32 [%rd55+1024], %f154;\n" -" mov.f32 %f155, %f40;\n" -" st.shared.f32 [%rd55+1536], %f155;\n" -" mov.f32 %f156, %f39;\n" -" st.shared.f32 [%rd55+2048], %f156;\n" -" shr.s32 %r66, %r4, 31;\n" -" mov.s32 %r67, 1;\n" -" and.b32 %r68, %r66, %r67;\n" -" add.s32 %r69, %r68, %r4;\n" -" shr.s32 %r70, %r69, 1;\n" -" mov.s32 %r71, %r70;\n" -" mov.u32 %r72, 0;\n" -" setp.ne.u32 %p15, %r70, %r72;\n" -" @!%p15 bra $Lt_1_32514;\n" -"$Lt_1_33026:\n" -" setp.ge.u32 %p16, %r16, %r71;\n" -" @%p16 bra $Lt_1_33282;\n" -" add.u32 %r73, %r1, %r71;\n" -" cvt.u64.u32 %rd56, %r73;\n" -" mul.wide.u32 %rd57, %r73, 4;\n" -" add.u64 %rd58, %rd52, %rd57;\n" -" ld.shared.f32 %f157, [%rd58+0];\n" -" add.ftz.f32 %f152, %f157, %f152;\n" -" st.shared.f32 [%rd55+0], %f152;\n" -" ld.shared.f32 %f158, [%rd58+512];\n" -" add.ftz.f32 %f153, %f158, %f153;\n" -" st.shared.f32 [%rd55+512], %f153;\n" -" ld.shared.f32 %f159, [%rd58+1024];\n" -" add.ftz.f32 %f154, %f159, %f154;\n" -" st.shared.f32 [%rd55+1024], %f154;\n" -" ld.shared.f32 %f160, [%rd58+1536];\n" -" add.ftz.f32 %f155, %f160, %f155;\n" -" st.shared.f32 [%rd55+1536], %f155;\n" -" ld.shared.f32 %f161, [%rd58+2048];\n" -" add.ftz.f32 %f156, %f161, %f156;\n" -" st.shared.f32 [%rd55+2048], %f156;\n" -"$Lt_1_33282:\n" -" shr.u32 %r71, %r71, 1;\n" -" mov.u32 %r74, 0;\n" -" setp.ne.u32 %p17, %r71, %r74;\n" -" @%p17 bra $Lt_1_33026;\n" -"$Lt_1_32514:\n" -" mov.f32 %f38, %f152;\n" -" mov.f32 %f37, %f153;\n" -" mov.f32 %f36, %f154;\n" -" mov.f32 %f40, %f155;\n" -" mov.f32 %f39, %f156;\n" -" ld.param.s32 %r75, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r76, 0;\n" -" setp.le.s32 %p18, %r75, %r76;\n" -" @%p18 bra $Lt_1_34050;\n" -" mov.f32 %f152, %f11;\n" -" st.shared.f32 [%rd55+0], %f152;\n" -" mov.f32 %f153, %f13;\n" -" st.shared.f32 [%rd55+512], %f153;\n" -" mov.f32 %f154, %f15;\n" -" st.shared.f32 [%rd55+1024], %f154;\n" -" mov.f32 %f155, %f17;\n" -" st.shared.f32 [%rd55+1536], %f155;\n" -" mov.f32 %f156, %f19;\n" -" st.shared.f32 [%rd55+2048], %f156;\n" -" mov.f32 %f162, %f20;\n" -" st.shared.f32 [%rd55+2560], %f162;\n" -" mov.s32 %r77, %r70;\n" -" @!%p15 bra $Lt_1_34562;\n" -"$Lt_1_35074:\n" -" setp.ge.u32 %p19, %r16, %r77;\n" -" @%p19 bra $Lt_1_35330;\n" -" add.u32 %r78, %r1, %r77;\n" -" cvt.u64.u32 %rd59, %r78;\n" -" mul.wide.u32 %rd60, %r78, 4;\n" -" add.u64 %rd61, %rd52, %rd60;\n" -" ld.shared.f32 %f163, [%rd61+0];\n" -" add.ftz.f32 %f152, %f163, %f152;\n" -" st.shared.f32 [%rd55+0], %f152;\n" -" ld.shared.f32 %f164, [%rd61+512];\n" -" add.ftz.f32 %f153, %f164, %f153;\n" -" st.shared.f32 [%rd55+512], %f153;\n" -" ld.shared.f32 %f165, [%rd61+1024];\n" -" add.ftz.f32 %f154, %f165, %f154;\n" -" st.shared.f32 [%rd55+1024], %f154;\n" -" ld.shared.f32 %f166, [%rd61+1536];\n" -" add.ftz.f32 %f155, %f166, %f155;\n" -" st.shared.f32 [%rd55+1536], %f155;\n" -" ld.shared.f32 %f167, [%rd61+2048];\n" -" add.ftz.f32 %f156, %f167, %f156;\n" -" st.shared.f32 [%rd55+2048], %f156;\n" -" ld.shared.f32 %f168, [%rd61+2560];\n" -" add.ftz.f32 %f162, %f168, %f162;\n" -" st.shared.f32 [%rd55+2560], %f162;\n" -"$Lt_1_35330:\n" -" shr.u32 %r77, %r77, 1;\n" -" mov.u32 %r79, 0;\n" -" setp.ne.u32 %p20, %r77, %r79;\n" -" @%p20 bra $Lt_1_35074;\n" -"$Lt_1_34562:\n" -" mov.f32 %f11, %f152;\n" -" mov.f32 %f13, %f153;\n" -" mov.f32 %f15, %f154;\n" -" mov.f32 %f17, %f155;\n" -" mov.f32 %f19, %f156;\n" -" mov.f32 %f21, %f162;\n" -"$Lt_1_34050:\n" -"$Lt_1_32002:\n" -" mov.u32 %r80, 0;\n" -" setp.ne.s32 %p21, %r16, %r80;\n" -" @%p21 bra $Lt_1_36098;\n" -" ld.param.u64 %rd62, [__cudaparm_kernel_pair_fast___val_paramengv];\n" -" add.u64 %rd63, %rd62, %rd18;\n" -" ld.param.s32 %r81, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r82, 0;\n" -" setp.le.s32 %p22, %r81, %r82;\n" -" @%p22 bra $Lt_1_36610;\n" -" st.global.f32 [%rd63+0], %f40;\n" -" cvt.s64.s32 %rd64, %r11;\n" -" mul.wide.s32 %rd65, %r11, 4;\n" -" add.u64 %rd66, %rd65, %rd63;\n" -" st.global.f32 [%rd66+0], %f39;\n" -" add.u64 %rd63, %rd65, %rd66;\n" -"$Lt_1_36610:\n" -" ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r84, 0;\n" -" setp.le.s32 %p23, %r83, %r84;\n" -" @%p23 bra $Lt_1_37122;\n" -" mov.f32 %f169, %f11;\n" -" st.global.f32 [%rd63+0], %f169;\n" -" cvt.s64.s32 %rd67, %r11;\n" -" mul.wide.s32 %rd68, %r11, 4;\n" -" add.u64 %rd69, %rd68, %rd63;\n" -" mov.f32 %f170, %f13;\n" -" st.global.f32 [%rd69+0], %f170;\n" -" add.u64 %rd70, %rd68, %rd69;\n" -" mov.f32 %f171, %f15;\n" -" st.global.f32 [%rd70+0], %f171;\n" -" add.u64 %rd71, %rd68, %rd70;\n" -" mov.f32 %f172, %f17;\n" -" st.global.f32 [%rd71+0], %f172;\n" -" add.u64 %rd63, %rd68, %rd71;\n" -" mov.f32 %f173, %f19;\n" -" st.global.f32 [%rd63+0], %f173;\n" -" mov.f32 %f174, %f21;\n" -" add.u64 %rd72, %rd68, %rd63;\n" -" st.global.f32 [%rd72+0], %f174;\n" -"$Lt_1_37122:\n" -" ld.param.u64 %rd73, [__cudaparm_kernel_pair_fast_ans];\n" -" mul.lo.u64 %rd74, %rd17, 16;\n" -" add.u64 %rd75, %rd73, %rd74;\n" -" mov.f32 %f175, %f176;\n" -" st.global.v4.f32 [%rd75+0], {%f38,%f37,%f36,%f175};\n" -"$Lt_1_36098:\n" -"$Lt_1_25858:\n" -" .loc 16 264 0\n" -" exit;\n" -"$LDWend_kernel_pair_fast:\n" -" }\n" -; diff --git a/lib/gpu/cg_cmm_ptx.h b/lib/gpu/cg_cmm_ptx.h deleted file mode 100644 index 1409d07723..0000000000 --- a/lib/gpu/cg_cmm_ptx.h +++ /dev/null @@ -1,906 +0,0 @@ -const char * cg_cmm = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .global .texref pos_tex;\n" -" .entry kernel_pair (\n" -" .param .u64 __cudaparm_kernel_pair_x_,\n" -" .param .u64 __cudaparm_kernel_pair_lj1,\n" -" .param .u64 __cudaparm_kernel_pair_lj3,\n" -" .param .s32 __cudaparm_kernel_pair_lj_types,\n" -" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_ans,\n" -" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_inum,\n" -" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" -" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" -" {\n" -" .reg .u32 %r<72>;\n" -" .reg .u64 %rd<63>;\n" -" .reg .f32 %f<111>;\n" -" .reg .pred %p<21>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32608_55_non_const_red_acc108[3072];\n" -" .loc 16 31 0\n" -"$LDWbegin_kernel_pair:\n" -" .loc 16 36 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" -" ldu.global.f32 %f1, [%rd1+0];\n" -" .loc 16 37 0\n" -" ld.global.f32 %f2, [%rd1+4];\n" -" .loc 16 38 0\n" -" ld.global.f32 %f3, [%rd1+8];\n" -" .loc 16 39 0\n" -" ld.global.f32 %f4, [%rd1+12];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n" -" .loc 16 46 0\n" -" mov.f32 %f5, 0f00000000; \n" -" mov.f32 %f6, %f5;\n" -" mov.f32 %f7, 0f00000000; \n" -" mov.f32 %f8, %f7;\n" -" mov.f32 %f9, 0f00000000; \n" -" mov.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f00000000; \n" -" mov.f32 %f12, %f11;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, %f13;\n" -" mov.f32 %f15, 0f00000000; \n" -" mov.f32 %f16, %f15;\n" -" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n" -" setp.ge.s32 %p1, %r8, %r9;\n" -" @%p1 bra $Lt_0_28930;\n" -" .loc 16 51 0\n" -" ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];\n" -" cvt.s64.s32 %rd2, %r10;\n" -" mul.wide.s32 %rd3, %r10, 4;\n" -" cvt.s64.s32 %rd4, %r8;\n" -" mul.wide.s32 %rd5, %r8, 4;\n" -" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n" -" add.u64 %rd7, %rd5, %rd6;\n" -" add.u64 %rd8, %rd3, %rd7;\n" -" ld.global.s32 %r11, [%rd8+0];\n" -" sub.s32 %r12, %r1, 1;\n" -" and.b32 %r13, %r12, %r2;\n" -" cvt.s64.s32 %rd9, %r13;\n" -" mul.wide.s32 %rd10, %r13, 4;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n" -" setp.ne.u64 %p2, %rd11, %rd6;\n" -" @%p2 bra $Lt_0_20994;\n" -" cvt.s32.s64 %r14, %rd2;\n" -" mul.lo.s32 %r15, %r14, %r1;\n" -" mov.s32 %r16, %r15;\n" -" mul.lo.s32 %r17, %r12, %r8;\n" -" add.s32 %r18, %r14, %r17;\n" -" cvt.s64.s32 %rd12, %r18;\n" -" mul.wide.s32 %rd13, %r18, 4;\n" -" add.u64 %rd14, %rd8, %rd13;\n" -" and.b32 %r19, %r12, %r11;\n" -" cvt.s64.s32 %rd15, %r19;\n" -" div.s32 %r20, %r11, %r1;\n" -" mul.lo.s32 %r21, %r15, %r20;\n" -" cvt.s64.s32 %rd16, %r21;\n" -" add.u64 %rd17, %rd15, %rd16;\n" -" mul.lo.u64 %rd18, %rd17, 4;\n" -" add.u64 %rd19, %rd14, %rd18;\n" -" add.u64 %rd20, %rd10, %rd14;\n" -" bra.uni $Lt_0_20738;\n" -"$Lt_0_20994:\n" -" add.u64 %rd21, %rd3, %rd8;\n" -" ld.global.s32 %r22, [%rd21+0];\n" -" cvt.s64.s32 %rd22, %r22;\n" -" mul.wide.s32 %rd23, %r22, 4;\n" -" add.u64 %rd24, %rd11, %rd23;\n" -" cvt.s64.s32 %rd25, %r11;\n" -" mul.wide.s32 %rd26, %r11, 4;\n" -" add.u64 %rd19, %rd24, %rd26;\n" -" mov.s32 %r16, %r1;\n" -" add.u64 %rd20, %rd10, %rd24;\n" -"$Lt_0_20738:\n" -" .loc 16 54 0\n" -" ld.global.s32 %r23, [%rd7+0];\n" -" mov.u32 %r24, %r23;\n" -" mov.s32 %r25, 0;\n" -" mov.u32 %r26, %r25;\n" -" mov.s32 %r27, 0;\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n" -" mov.f32 %f21, %f17;\n" -" mov.f32 %f22, %f18;\n" -" mov.f32 %f23, %f19;\n" -" mov.f32 %f24, %f20;\n" -" setp.ge.u64 %p3, %rd20, %rd19;\n" -" @%p3 bra $Lt_0_30466;\n" -" cvt.rzi.ftz.s32.f32 %r31, %f24;\n" -" cvt.s64.s32 %rd27, %r16;\n" -" ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];\n" -" mul.lo.s32 %r33, %r32, %r31;\n" -" ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];\n" -" mov.f32 %f25, 0f00000000; \n" -" mov.f32 %f26, 0f00000000; \n" -" mov.f32 %f27, 0f00000000; \n" -" mov.f32 %f28, 0f00000000; \n" -" mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;\n" -"$Lt_0_21762:\n" -" .loc 16 60 0\n" -" ld.global.s32 %r34, [%rd20+0];\n" -" .loc 16 61 0\n" -" shr.s32 %r35, %r34, 30;\n" -" and.b32 %r36, %r35, 3;\n" -" cvt.s64.s32 %rd30, %r36;\n" -" mul.wide.s32 %rd31, %r36, 4;\n" -" add.u64 %rd32, %rd29, %rd31;\n" -" ld.shared.f32 %f29, [%rd32+0];\n" -" .loc 16 64 0\n" -" and.b32 %r37, %r34, 1073741823;\n" -" mov.u32 %r38, %r37;\n" -" mov.s32 %r39, 0;\n" -" mov.u32 %r40, %r39;\n" -" mov.s32 %r41, 0;\n" -" mov.u32 %r42, %r41;\n" -" mov.s32 %r43, 0;\n" -" mov.u32 %r44, %r43;\n" -" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];\n" -" mov.f32 %f34, %f30;\n" -" mov.f32 %f35, %f31;\n" -" mov.f32 %f36, %f32;\n" -" mov.f32 %f37, %f33;\n" -" cvt.rzi.ftz.s32.f32 %r45, %f37;\n" -" sub.ftz.f32 %f38, %f22, %f35;\n" -" sub.ftz.f32 %f39, %f21, %f34;\n" -" sub.ftz.f32 %f40, %f23, %f36;\n" -" mul.ftz.f32 %f41, %f38, %f38;\n" -" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n" -" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n" -" add.s32 %r46, %r45, %r33;\n" -" cvt.s64.s32 %rd33, %r46;\n" -" mul.wide.s32 %rd34, %r46, 16;\n" -" add.u64 %rd35, %rd34, %rd28;\n" -" ld.global.f32 %f44, [%rd35+0];\n" -" setp.gt.ftz.f32 %p4, %f44, %f43;\n" -" @!%p4 bra $Lt_0_24066;\n" -" rcp.approx.ftz.f32 %f45, %f43;\n" -" ld.global.f32 %f46, [%rd35+4];\n" -" mov.f32 %f47, 0f40000000; \n" -" setp.eq.ftz.f32 %p5, %f46, %f47;\n" -" @!%p5 bra $Lt_0_22786;\n" -" .loc 16 79 0\n" -" mul.ftz.f32 %f48, %f45, %f45;\n" -" mov.f32 %f49, %f48;\n" -" .loc 16 80 0\n" -" mul.ftz.f32 %f50, %f48, %f48;\n" -" bra.uni $Lt_0_23042;\n" -"$Lt_0_22786:\n" -" mov.f32 %f51, 0f3f800000; \n" -" setp.eq.ftz.f32 %p6, %f46, %f51;\n" -" @!%p6 bra $Lt_0_23298;\n" -" .loc 16 82 0\n" -" sqrt.approx.ftz.f32 %f52, %f45;\n" -" mul.ftz.f32 %f53, %f45, %f52;\n" -" mov.f32 %f50, %f53;\n" -" .loc 16 83 0\n" -" mul.ftz.f32 %f49, %f53, %f53;\n" -" bra.uni $Lt_0_23042;\n" -"$Lt_0_23298:\n" -" .loc 16 85 0\n" -" mul.ftz.f32 %f54, %f45, %f45;\n" -" mul.ftz.f32 %f55, %f45, %f54;\n" -" mov.f32 %f49, %f55;\n" -" .loc 16 86 0\n" -" mov.f32 %f50, %f55;\n" -"$Lt_0_23042:\n" -"$Lt_0_22530:\n" -" .loc 16 88 0\n" -" mul.ftz.f32 %f56, %f45, %f29;\n" -" mul.ftz.f32 %f57, %f49, %f56;\n" -" ld.global.v2.f32 {%f58,%f59}, [%rd35+8];\n" -" mul.ftz.f32 %f60, %f58, %f50;\n" -" sub.ftz.f32 %f61, %f60, %f59;\n" -" mul.ftz.f32 %f62, %f57, %f61;\n" -" .loc 16 90 0\n" -" fma.rn.ftz.f32 %f27, %f39, %f62, %f27;\n" -" .loc 16 91 0\n" -" fma.rn.ftz.f32 %f26, %f38, %f62, %f26;\n" -" .loc 16 92 0\n" -" fma.rn.ftz.f32 %f25, %f40, %f62, %f25;\n" -" ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r48, 0;\n" -" setp.le.s32 %p7, %r47, %r48;\n" -" @%p7 bra $Lt_0_23554;\n" -" .loc 16 94 0\n" -" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];\n" -" add.u64 %rd37, %rd36, %rd34;\n" -" ld.global.v4.f32 {%f63,%f64,%f65,_}, [%rd37+0];\n" -" mul.ftz.f32 %f66, %f29, %f49;\n" -" mul.ftz.f32 %f67, %f63, %f50;\n" -" sub.ftz.f32 %f68, %f67, %f64;\n" -" mul.ftz.f32 %f69, %f66, %f68;\n" -" sub.ftz.f32 %f70, %f69, %f65;\n" -" add.ftz.f32 %f28, %f28, %f70;\n" -"$Lt_0_23554:\n" -" ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r50, 0;\n" -" setp.le.s32 %p8, %r49, %r50;\n" -" @%p8 bra $Lt_0_24066;\n" -" .loc 16 97 0\n" -" mov.f32 %f71, %f6;\n" -" mul.ftz.f32 %f72, %f39, %f39;\n" -" fma.rn.ftz.f32 %f73, %f62, %f72, %f71;\n" -" mov.f32 %f6, %f73;\n" -" .loc 16 98 0\n" -" mov.f32 %f74, %f8;\n" -" fma.rn.ftz.f32 %f75, %f62, %f41, %f74;\n" -" mov.f32 %f8, %f75;\n" -" .loc 16 99 0\n" -" mov.f32 %f76, %f10;\n" -" mul.ftz.f32 %f77, %f40, %f40;\n" -" fma.rn.ftz.f32 %f78, %f62, %f77, %f76;\n" -" mov.f32 %f10, %f78;\n" -" .loc 16 100 0\n" -" mov.f32 %f79, %f12;\n" -" mul.ftz.f32 %f80, %f38, %f39;\n" -" fma.rn.ftz.f32 %f81, %f62, %f80, %f79;\n" -" mov.f32 %f12, %f81;\n" -" .loc 16 101 0\n" -" mov.f32 %f82, %f14;\n" -" mul.ftz.f32 %f83, %f39, %f40;\n" -" fma.rn.ftz.f32 %f84, %f62, %f83, %f82;\n" -" mov.f32 %f14, %f84;\n" -" .loc 16 102 0\n" -" mul.ftz.f32 %f85, %f38, %f40;\n" -" fma.rn.ftz.f32 %f15, %f62, %f85, %f15;\n" -" mov.f32 %f16, %f15;\n" -"$Lt_0_24066:\n" -"$Lt_0_22018:\n" -" .loc 16 58 0\n" -" mul.lo.u64 %rd38, %rd27, 4;\n" -" add.u64 %rd20, %rd20, %rd38;\n" -" setp.lt.u64 %p9, %rd20, %rd19;\n" -" @%p9 bra $Lt_0_21762;\n" -" bra.uni $Lt_0_21250;\n" -"$Lt_0_30466:\n" -" mov.f32 %f25, 0f00000000; \n" -" mov.f32 %f26, 0f00000000; \n" -" mov.f32 %f27, 0f00000000; \n" -" mov.f32 %f28, 0f00000000; \n" -"$Lt_0_21250:\n" -" mov.u32 %r51, 1;\n" -" setp.le.s32 %p10, %r1, %r51;\n" -" @%p10 bra $Lt_0_26882;\n" -" .loc 16 107 0\n" -" mov.u64 %rd39, __cuda___cuda_local_var_32608_55_non_const_red_acc108;\n" -" cvt.s64.s32 %rd40, %r2;\n" -" mul.wide.s32 %rd41, %r2, 4;\n" -" add.u64 %rd42, %rd39, %rd41;\n" -" mov.f32 %f86, %f27;\n" -" st.shared.f32 [%rd42+0], %f86;\n" -" mov.f32 %f87, %f26;\n" -" st.shared.f32 [%rd42+512], %f87;\n" -" mov.f32 %f88, %f25;\n" -" st.shared.f32 [%rd42+1024], %f88;\n" -" mov.f32 %f89, %f28;\n" -" st.shared.f32 [%rd42+1536], %f89;\n" -" shr.s32 %r52, %r1, 31;\n" -" mov.s32 %r53, 1;\n" -" and.b32 %r54, %r52, %r53;\n" -" add.s32 %r55, %r54, %r1;\n" -" shr.s32 %r56, %r55, 1;\n" -" mov.s32 %r57, %r56;\n" -" mov.u32 %r58, 0;\n" -" setp.ne.u32 %p11, %r56, %r58;\n" -" @!%p11 bra $Lt_0_25346;\n" -"$Lt_0_25858:\n" -" setp.ge.u32 %p12, %r13, %r57;\n" -" @%p12 bra $Lt_0_26114;\n" -" add.u32 %r59, %r2, %r57;\n" -" cvt.u64.u32 %rd43, %r59;\n" -" mul.wide.u32 %rd44, %r59, 4;\n" -" add.u64 %rd45, %rd39, %rd44;\n" -" ld.shared.f32 %f90, [%rd45+0];\n" -" add.ftz.f32 %f86, %f90, %f86;\n" -" st.shared.f32 [%rd42+0], %f86;\n" -" ld.shared.f32 %f91, [%rd45+512];\n" -" add.ftz.f32 %f87, %f91, %f87;\n" -" st.shared.f32 [%rd42+512], %f87;\n" -" ld.shared.f32 %f92, [%rd45+1024];\n" -" add.ftz.f32 %f88, %f92, %f88;\n" -" st.shared.f32 [%rd42+1024], %f88;\n" -" ld.shared.f32 %f93, [%rd45+1536];\n" -" add.ftz.f32 %f89, %f93, %f89;\n" -" st.shared.f32 [%rd42+1536], %f89;\n" -"$Lt_0_26114:\n" -" shr.u32 %r57, %r57, 1;\n" -" mov.u32 %r60, 0;\n" -" setp.ne.u32 %p13, %r57, %r60;\n" -" @%p13 bra $Lt_0_25858;\n" -"$Lt_0_25346:\n" -" mov.f32 %f27, %f86;\n" -" mov.f32 %f26, %f87;\n" -" mov.f32 %f25, %f88;\n" -" mov.f32 %f28, %f89;\n" -" ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r62, 0;\n" -" setp.le.s32 %p14, %r61, %r62;\n" -" @%p14 bra $Lt_0_26882;\n" -" mov.f32 %f86, %f6;\n" -" st.shared.f32 [%rd42+0], %f86;\n" -" mov.f32 %f87, %f8;\n" -" st.shared.f32 [%rd42+512], %f87;\n" -" mov.f32 %f88, %f10;\n" -" st.shared.f32 [%rd42+1024], %f88;\n" -" mov.f32 %f89, %f12;\n" -" st.shared.f32 [%rd42+1536], %f89;\n" -" mov.f32 %f94, %f14;\n" -" st.shared.f32 [%rd42+2048], %f94;\n" -" mov.f32 %f95, %f15;\n" -" st.shared.f32 [%rd42+2560], %f95;\n" -" mov.s32 %r63, %r56;\n" -" @!%p11 bra $Lt_0_27394;\n" -"$Lt_0_27906:\n" -" setp.ge.u32 %p15, %r13, %r63;\n" -" @%p15 bra $Lt_0_28162;\n" -" add.u32 %r64, %r2, %r63;\n" -" cvt.u64.u32 %rd46, %r64;\n" -" mul.wide.u32 %rd47, %r64, 4;\n" -" add.u64 %rd48, %rd39, %rd47;\n" -" ld.shared.f32 %f96, [%rd48+0];\n" -" add.ftz.f32 %f86, %f96, %f86;\n" -" st.shared.f32 [%rd42+0], %f86;\n" -" ld.shared.f32 %f97, [%rd48+512];\n" -" add.ftz.f32 %f87, %f97, %f87;\n" -" st.shared.f32 [%rd42+512], %f87;\n" -" ld.shared.f32 %f98, [%rd48+1024];\n" -" add.ftz.f32 %f88, %f98, %f88;\n" -" st.shared.f32 [%rd42+1024], %f88;\n" -" ld.shared.f32 %f99, [%rd48+1536];\n" -" add.ftz.f32 %f89, %f99, %f89;\n" -" st.shared.f32 [%rd42+1536], %f89;\n" -" ld.shared.f32 %f100, [%rd48+2048];\n" -" add.ftz.f32 %f94, %f100, %f94;\n" -" st.shared.f32 [%rd42+2048], %f94;\n" -" ld.shared.f32 %f101, [%rd48+2560];\n" -" add.ftz.f32 %f95, %f101, %f95;\n" -" st.shared.f32 [%rd42+2560], %f95;\n" -"$Lt_0_28162:\n" -" shr.u32 %r63, %r63, 1;\n" -" mov.u32 %r65, 0;\n" -" setp.ne.u32 %p16, %r63, %r65;\n" -" @%p16 bra $Lt_0_27906;\n" -"$Lt_0_27394:\n" -" mov.f32 %f6, %f86;\n" -" mov.f32 %f8, %f87;\n" -" mov.f32 %f10, %f88;\n" -" mov.f32 %f12, %f89;\n" -" mov.f32 %f14, %f94;\n" -" mov.f32 %f16, %f95;\n" -"$Lt_0_26882:\n" -"$Lt_0_24834:\n" -" mov.u32 %r66, 0;\n" -" setp.ne.s32 %p17, %r13, %r66;\n" -" @%p17 bra $Lt_0_28930;\n" -" ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];\n" -" add.u64 %rd50, %rd49, %rd5;\n" -" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r68, 0;\n" -" setp.le.s32 %p18, %r67, %r68;\n" -" @%p18 bra $Lt_0_29442;\n" -" st.global.f32 [%rd50+0], %f28;\n" -" cvt.s64.s32 %rd51, %r9;\n" -" mul.wide.s32 %rd52, %r9, 4;\n" -" add.u64 %rd50, %rd50, %rd52;\n" -"$Lt_0_29442:\n" -" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r70, 0;\n" -" setp.le.s32 %p19, %r69, %r70;\n" -" @%p19 bra $Lt_0_29954;\n" -" mov.f32 %f102, %f6;\n" -" st.global.f32 [%rd50+0], %f102;\n" -" cvt.s64.s32 %rd53, %r9;\n" -" mul.wide.s32 %rd54, %r9, 4;\n" -" add.u64 %rd55, %rd54, %rd50;\n" -" mov.f32 %f103, %f8;\n" -" st.global.f32 [%rd55+0], %f103;\n" -" add.u64 %rd56, %rd54, %rd55;\n" -" mov.f32 %f104, %f10;\n" -" st.global.f32 [%rd56+0], %f104;\n" -" add.u64 %rd57, %rd54, %rd56;\n" -" mov.f32 %f105, %f12;\n" -" st.global.f32 [%rd57+0], %f105;\n" -" add.u64 %rd50, %rd54, %rd57;\n" -" mov.f32 %f106, %f14;\n" -" st.global.f32 [%rd50+0], %f106;\n" -" mov.f32 %f107, %f16;\n" -" add.u64 %rd58, %rd54, %rd50;\n" -" st.global.f32 [%rd58+0], %f107;\n" -"$Lt_0_29954:\n" -" ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];\n" -" mul.lo.u64 %rd60, %rd4, 16;\n" -" add.u64 %rd61, %rd59, %rd60;\n" -" mov.f32 %f108, %f109;\n" -" st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f108};\n" -"$Lt_0_28930:\n" -"$Lt_0_20226:\n" -" .loc 16 110 0\n" -" exit;\n" -"$LDWend_kernel_pair:\n" -" }\n" -" .entry kernel_pair_fast (\n" -" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" -" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" -" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" -" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" -" {\n" -" .reg .u32 %r<74>;\n" -" .reg .u64 %rd<75>;\n" -" .reg .f32 %f<118>;\n" -" .reg .pred %p<24>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32625_33_non_const_sp_lj3268[16];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32623_34_non_const_lj13296[1936];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32624_34_non_const_lj35232[1936];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32702_55_non_const_red_acc7168[3072];\n" -" .loc 16 118 0\n" -"$LDWbegin_kernel_pair_fast:\n" -" cvt.s32.u32 %r1, %tid.x;\n" -" mov.u32 %r2, 3;\n" -" setp.gt.s32 %p1, %r1, %r2;\n" -" @%p1 bra $Lt_1_22530;\n" -" .loc 16 126 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32625_33_non_const_sp_lj3268;\n" -" cvt.s64.s32 %rd2, %r1;\n" -" mul.wide.s32 %rd3, %r1, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f32 %f1, [%rd5+0];\n" -" add.u64 %rd6, %rd3, %rd1;\n" -" st.shared.f32 [%rd6+0], %f1;\n" -"$Lt_1_22530:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32625_33_non_const_sp_lj3268;\n" -" mov.u32 %r3, 120;\n" -" setp.gt.s32 %p2, %r1, %r3;\n" -" @%p2 bra $Lt_1_23042;\n" -" .loc 16 128 0\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32623_34_non_const_lj13296;\n" -" cvt.s64.s32 %rd8, %r1;\n" -" mul.wide.s32 %rd9, %r1, 16;\n" -" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n" -" add.u64 %rd11, %rd10, %rd9;\n" -" add.u64 %rd12, %rd9, %rd7;\n" -" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n" -" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n" -" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r5, 0;\n" -" setp.le.s32 %p3, %r4, %r5;\n" -" @%p3 bra $Lt_1_23554;\n" -" .loc 16 130 0\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;\n" -" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n" -" add.u64 %rd15, %rd14, %rd9;\n" -" add.u64 %rd16, %rd9, %rd13;\n" -" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n" -" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n" -"$Lt_1_23554:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;\n" -"$Lt_1_23042:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32623_34_non_const_lj13296;\n" -" .loc 16 138 0\n" -" mov.f32 %f10, 0f00000000; \n" -" mov.f32 %f11, %f10;\n" -" mov.f32 %f12, 0f00000000; \n" -" mov.f32 %f13, %f12;\n" -" mov.f32 %f14, 0f00000000; \n" -" mov.f32 %f15, %f14;\n" -" mov.f32 %f16, 0f00000000; \n" -" mov.f32 %f17, %f16;\n" -" mov.f32 %f18, 0f00000000; \n" -" mov.f32 %f19, %f18;\n" -" mov.f32 %f20, 0f00000000; \n" -" mov.f32 %f21, %f20;\n" -" .loc 16 140 0\n" -" bar.sync 0;\n" -" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n" -" div.s32 %r7, %r1, %r6;\n" -" cvt.s32.u32 %r8, %ntid.x;\n" -" div.s32 %r9, %r8, %r6;\n" -" cvt.s32.u32 %r10, %ctaid.x;\n" -" mul.lo.s32 %r11, %r10, %r9;\n" -" add.s32 %r12, %r7, %r11;\n" -" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n" -" setp.ge.s32 %p4, %r12, %r13;\n" -" @%p4 bra $Lt_1_32770;\n" -" .loc 16 145 0\n" -" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" -" cvt.s64.s32 %rd17, %r14;\n" -" mul.wide.s32 %rd18, %r14, 4;\n" -" cvt.s64.s32 %rd19, %r12;\n" -" mul.wide.s32 %rd20, %r12, 4;\n" -" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n" -" add.u64 %rd22, %rd20, %rd21;\n" -" add.u64 %rd23, %rd18, %rd22;\n" -" ld.global.s32 %r15, [%rd23+0];\n" -" sub.s32 %r16, %r6, 1;\n" -" and.b32 %r17, %r16, %r1;\n" -" cvt.s64.s32 %rd24, %r17;\n" -" mul.wide.s32 %rd25, %r17, 4;\n" -" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n" -" setp.ne.u64 %p5, %rd26, %rd21;\n" -" @%p5 bra $Lt_1_24834;\n" -" cvt.s32.s64 %r18, %rd17;\n" -" mul.lo.s32 %r19, %r18, %r6;\n" -" mov.s32 %r20, %r19;\n" -" mul.lo.s32 %r21, %r16, %r12;\n" -" add.s32 %r22, %r18, %r21;\n" -" cvt.s64.s32 %rd27, %r22;\n" -" mul.wide.s32 %rd28, %r22, 4;\n" -" add.u64 %rd29, %rd23, %rd28;\n" -" and.b32 %r23, %r16, %r15;\n" -" cvt.s64.s32 %rd30, %r23;\n" -" div.s32 %r24, %r15, %r6;\n" -" mul.lo.s32 %r25, %r19, %r24;\n" -" cvt.s64.s32 %rd31, %r25;\n" -" add.u64 %rd32, %rd30, %rd31;\n" -" mul.lo.u64 %rd33, %rd32, 4;\n" -" add.u64 %rd34, %rd29, %rd33;\n" -" add.u64 %rd35, %rd25, %rd29;\n" -" bra.uni $Lt_1_24578;\n" -"$Lt_1_24834:\n" -" add.u64 %rd36, %rd18, %rd23;\n" -" ld.global.s32 %r26, [%rd36+0];\n" -" cvt.s64.s32 %rd37, %r26;\n" -" mul.wide.s32 %rd38, %r26, 4;\n" -" add.u64 %rd39, %rd26, %rd38;\n" -" cvt.s64.s32 %rd40, %r15;\n" -" mul.wide.s32 %rd41, %r15, 4;\n" -" add.u64 %rd34, %rd39, %rd41;\n" -" mov.s32 %r20, %r6;\n" -" add.u64 %rd35, %rd25, %rd39;\n" -"$Lt_1_24578:\n" -" .loc 16 148 0\n" -" ld.global.s32 %r27, [%rd22+0];\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" mov.s32 %r31, 0;\n" -" mov.u32 %r32, %r31;\n" -" mov.s32 %r33, 0;\n" -" mov.u32 %r34, %r33;\n" -" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];\n" -" mov.f32 %f26, %f22;\n" -" mov.f32 %f27, %f23;\n" -" mov.f32 %f28, %f24;\n" -" mov.f32 %f29, %f25;\n" -" setp.ge.u64 %p6, %rd35, %rd34;\n" -" @%p6 bra $Lt_1_34306;\n" -" cvt.rzi.ftz.s32.f32 %r35, %f29;\n" -" cvt.s64.s32 %rd42, %r20;\n" -" mul.lo.s32 %r36, %r35, 11;\n" -" cvt.rn.f32.s32 %f30, %r36;\n" -" mov.f32 %f31, 0f00000000; \n" -" mov.f32 %f32, 0f00000000; \n" -" mov.f32 %f33, 0f00000000; \n" -" mov.f32 %f34, 0f00000000; \n" -"$Lt_1_25602:\n" -" .loc 16 155 0\n" -" ld.global.s32 %r37, [%rd35+0];\n" -" .loc 16 156 0\n" -" shr.s32 %r38, %r37, 30;\n" -" and.b32 %r39, %r38, 3;\n" -" cvt.s64.s32 %rd43, %r39;\n" -" mul.wide.s32 %rd44, %r39, 4;\n" -" add.u64 %rd45, %rd1, %rd44;\n" -" ld.shared.f32 %f35, [%rd45+0];\n" -" .loc 16 159 0\n" -" and.b32 %r40, %r37, 1073741823;\n" -" mov.u32 %r41, %r40;\n" -" mov.s32 %r42, 0;\n" -" mov.u32 %r43, %r42;\n" -" mov.s32 %r44, 0;\n" -" mov.u32 %r45, %r44;\n" -" mov.s32 %r46, 0;\n" -" mov.u32 %r47, %r46;\n" -" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}];\n" -" mov.f32 %f40, %f36;\n" -" mov.f32 %f41, %f37;\n" -" mov.f32 %f42, %f38;\n" -" mov.f32 %f43, %f39;\n" -" sub.ftz.f32 %f44, %f27, %f41;\n" -" sub.ftz.f32 %f45, %f26, %f40;\n" -" sub.ftz.f32 %f46, %f28, %f42;\n" -" mul.ftz.f32 %f47, %f44, %f44;\n" -" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n" -" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n" -" add.ftz.f32 %f50, %f30, %f43;\n" -" cvt.rzi.ftz.s32.f32 %r48, %f50;\n" -" cvt.s64.s32 %rd46, %r48;\n" -" mul.wide.s32 %rd47, %r48, 16;\n" -" add.u64 %rd48, %rd47, %rd7;\n" -" ld.shared.f32 %f51, [%rd48+0];\n" -" setp.gt.ftz.f32 %p7, %f51, %f49;\n" -" @!%p7 bra $Lt_1_27906;\n" -" rcp.approx.ftz.f32 %f52, %f49;\n" -" ld.shared.f32 %f53, [%rd48+4];\n" -" mov.f32 %f54, 0f40000000; \n" -" setp.eq.ftz.f32 %p8, %f53, %f54;\n" -" @!%p8 bra $Lt_1_26626;\n" -" .loc 16 173 0\n" -" mul.ftz.f32 %f55, %f52, %f52;\n" -" mov.f32 %f56, %f55;\n" -" .loc 16 174 0\n" -" mul.ftz.f32 %f57, %f55, %f55;\n" -" bra.uni $Lt_1_26882;\n" -"$Lt_1_26626:\n" -" mov.f32 %f58, 0f3f800000; \n" -" setp.eq.ftz.f32 %p9, %f53, %f58;\n" -" @!%p9 bra $Lt_1_27138;\n" -" .loc 16 176 0\n" -" sqrt.approx.ftz.f32 %f59, %f52;\n" -" mul.ftz.f32 %f60, %f52, %f59;\n" -" mov.f32 %f57, %f60;\n" -" .loc 16 177 0\n" -" mul.ftz.f32 %f56, %f60, %f60;\n" -" bra.uni $Lt_1_26882;\n" -"$Lt_1_27138:\n" -" .loc 16 179 0\n" -" mul.ftz.f32 %f61, %f52, %f52;\n" -" mul.ftz.f32 %f62, %f52, %f61;\n" -" mov.f32 %f56, %f62;\n" -" .loc 16 180 0\n" -" mov.f32 %f57, %f62;\n" -"$Lt_1_26882:\n" -"$Lt_1_26370:\n" -" .loc 16 182 0\n" -" mul.ftz.f32 %f63, %f52, %f35;\n" -" mul.ftz.f32 %f64, %f56, %f63;\n" -" ld.shared.v2.f32 {%f65,%f66}, [%rd48+8];\n" -" mul.ftz.f32 %f67, %f65, %f57;\n" -" sub.ftz.f32 %f68, %f67, %f66;\n" -" mul.ftz.f32 %f69, %f64, %f68;\n" -" .loc 16 184 0\n" -" fma.rn.ftz.f32 %f33, %f45, %f69, %f33;\n" -" .loc 16 185 0\n" -" fma.rn.ftz.f32 %f32, %f44, %f69, %f32;\n" -" .loc 16 186 0\n" -" fma.rn.ftz.f32 %f31, %f46, %f69, %f31;\n" -" ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r50, 0;\n" -" setp.le.s32 %p10, %r49, %r50;\n" -" @%p10 bra $Lt_1_27394;\n" -" .loc 16 188 0\n" -" add.u64 %rd49, %rd47, %rd13;\n" -" ld.shared.v4.f32 {%f70,%f71,%f72,_}, [%rd49+0];\n" -" mul.ftz.f32 %f73, %f35, %f56;\n" -" mul.ftz.f32 %f74, %f70, %f57;\n" -" sub.ftz.f32 %f75, %f74, %f71;\n" -" mul.ftz.f32 %f76, %f73, %f75;\n" -" sub.ftz.f32 %f77, %f76, %f72;\n" -" add.ftz.f32 %f34, %f34, %f77;\n" -"$Lt_1_27394:\n" -" ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r52, 0;\n" -" setp.le.s32 %p11, %r51, %r52;\n" -" @%p11 bra $Lt_1_27906;\n" -" .loc 16 191 0\n" -" mov.f32 %f78, %f11;\n" -" mul.ftz.f32 %f79, %f45, %f45;\n" -" fma.rn.ftz.f32 %f80, %f69, %f79, %f78;\n" -" mov.f32 %f11, %f80;\n" -" .loc 16 192 0\n" -" mov.f32 %f81, %f13;\n" -" fma.rn.ftz.f32 %f82, %f69, %f47, %f81;\n" -" mov.f32 %f13, %f82;\n" -" .loc 16 193 0\n" -" mov.f32 %f83, %f15;\n" -" mul.ftz.f32 %f84, %f46, %f46;\n" -" fma.rn.ftz.f32 %f85, %f69, %f84, %f83;\n" -" mov.f32 %f15, %f85;\n" -" .loc 16 194 0\n" -" mov.f32 %f86, %f17;\n" -" mul.ftz.f32 %f87, %f44, %f45;\n" -" fma.rn.ftz.f32 %f88, %f69, %f87, %f86;\n" -" mov.f32 %f17, %f88;\n" -" .loc 16 195 0\n" -" mov.f32 %f89, %f19;\n" -" mul.ftz.f32 %f90, %f45, %f46;\n" -" fma.rn.ftz.f32 %f91, %f69, %f90, %f89;\n" -" mov.f32 %f19, %f91;\n" -" .loc 16 196 0\n" -" mul.ftz.f32 %f92, %f44, %f46;\n" -" fma.rn.ftz.f32 %f20, %f69, %f92, %f20;\n" -" mov.f32 %f21, %f20;\n" -"$Lt_1_27906:\n" -"$Lt_1_25858:\n" -" .loc 16 153 0\n" -" mul.lo.u64 %rd50, %rd42, 4;\n" -" add.u64 %rd35, %rd35, %rd50;\n" -" setp.lt.u64 %p12, %rd35, %rd34;\n" -" @%p12 bra $Lt_1_25602;\n" -" bra.uni $Lt_1_25090;\n" -"$Lt_1_34306:\n" -" mov.f32 %f31, 0f00000000; \n" -" mov.f32 %f32, 0f00000000; \n" -" mov.f32 %f33, 0f00000000; \n" -" mov.f32 %f34, 0f00000000; \n" -"$Lt_1_25090:\n" -" mov.u32 %r53, 1;\n" -" setp.le.s32 %p13, %r6, %r53;\n" -" @%p13 bra $Lt_1_30722;\n" -" .loc 16 201 0\n" -" mov.u64 %rd51, __cuda___cuda_local_var_32702_55_non_const_red_acc7168;\n" -" cvt.s64.s32 %rd52, %r1;\n" -" mul.wide.s32 %rd53, %r1, 4;\n" -" add.u64 %rd54, %rd51, %rd53;\n" -" mov.f32 %f93, %f33;\n" -" st.shared.f32 [%rd54+0], %f93;\n" -" mov.f32 %f94, %f32;\n" -" st.shared.f32 [%rd54+512], %f94;\n" -" mov.f32 %f95, %f31;\n" -" st.shared.f32 [%rd54+1024], %f95;\n" -" mov.f32 %f96, %f34;\n" -" st.shared.f32 [%rd54+1536], %f96;\n" -" shr.s32 %r54, %r6, 31;\n" -" mov.s32 %r55, 1;\n" -" and.b32 %r56, %r54, %r55;\n" -" add.s32 %r57, %r56, %r6;\n" -" shr.s32 %r58, %r57, 1;\n" -" mov.s32 %r59, %r58;\n" -" mov.u32 %r60, 0;\n" -" setp.ne.u32 %p14, %r58, %r60;\n" -" @!%p14 bra $Lt_1_29186;\n" -"$Lt_1_29698:\n" -" setp.ge.u32 %p15, %r17, %r59;\n" -" @%p15 bra $Lt_1_29954;\n" -" add.u32 %r61, %r1, %r59;\n" -" cvt.u64.u32 %rd55, %r61;\n" -" mul.wide.u32 %rd56, %r61, 4;\n" -" add.u64 %rd57, %rd51, %rd56;\n" -" ld.shared.f32 %f97, [%rd57+0];\n" -" add.ftz.f32 %f93, %f97, %f93;\n" -" st.shared.f32 [%rd54+0], %f93;\n" -" ld.shared.f32 %f98, [%rd57+512];\n" -" add.ftz.f32 %f94, %f98, %f94;\n" -" st.shared.f32 [%rd54+512], %f94;\n" -" ld.shared.f32 %f99, [%rd57+1024];\n" -" add.ftz.f32 %f95, %f99, %f95;\n" -" st.shared.f32 [%rd54+1024], %f95;\n" -" ld.shared.f32 %f100, [%rd57+1536];\n" -" add.ftz.f32 %f96, %f100, %f96;\n" -" st.shared.f32 [%rd54+1536], %f96;\n" -"$Lt_1_29954:\n" -" shr.u32 %r59, %r59, 1;\n" -" mov.u32 %r62, 0;\n" -" setp.ne.u32 %p16, %r59, %r62;\n" -" @%p16 bra $Lt_1_29698;\n" -"$Lt_1_29186:\n" -" mov.f32 %f33, %f93;\n" -" mov.f32 %f32, %f94;\n" -" mov.f32 %f31, %f95;\n" -" mov.f32 %f34, %f96;\n" -" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r64, 0;\n" -" setp.le.s32 %p17, %r63, %r64;\n" -" @%p17 bra $Lt_1_30722;\n" -" mov.f32 %f93, %f11;\n" -" st.shared.f32 [%rd54+0], %f93;\n" -" mov.f32 %f94, %f13;\n" -" st.shared.f32 [%rd54+512], %f94;\n" -" mov.f32 %f95, %f15;\n" -" st.shared.f32 [%rd54+1024], %f95;\n" -" mov.f32 %f96, %f17;\n" -" st.shared.f32 [%rd54+1536], %f96;\n" -" mov.f32 %f101, %f19;\n" -" st.shared.f32 [%rd54+2048], %f101;\n" -" mov.f32 %f102, %f20;\n" -" st.shared.f32 [%rd54+2560], %f102;\n" -" mov.s32 %r65, %r58;\n" -" @!%p14 bra $Lt_1_31234;\n" -"$Lt_1_31746:\n" -" setp.ge.u32 %p18, %r17, %r65;\n" -" @%p18 bra $Lt_1_32002;\n" -" add.u32 %r66, %r1, %r65;\n" -" cvt.u64.u32 %rd58, %r66;\n" -" mul.wide.u32 %rd59, %r66, 4;\n" -" add.u64 %rd60, %rd51, %rd59;\n" -" ld.shared.f32 %f103, [%rd60+0];\n" -" add.ftz.f32 %f93, %f103, %f93;\n" -" st.shared.f32 [%rd54+0], %f93;\n" -" ld.shared.f32 %f104, [%rd60+512];\n" -" add.ftz.f32 %f94, %f104, %f94;\n" -" st.shared.f32 [%rd54+512], %f94;\n" -" ld.shared.f32 %f105, [%rd60+1024];\n" -" add.ftz.f32 %f95, %f105, %f95;\n" -" st.shared.f32 [%rd54+1024], %f95;\n" -" ld.shared.f32 %f106, [%rd60+1536];\n" -" add.ftz.f32 %f96, %f106, %f96;\n" -" st.shared.f32 [%rd54+1536], %f96;\n" -" ld.shared.f32 %f107, [%rd60+2048];\n" -" add.ftz.f32 %f101, %f107, %f101;\n" -" st.shared.f32 [%rd54+2048], %f101;\n" -" ld.shared.f32 %f108, [%rd60+2560];\n" -" add.ftz.f32 %f102, %f108, %f102;\n" -" st.shared.f32 [%rd54+2560], %f102;\n" -"$Lt_1_32002:\n" -" shr.u32 %r65, %r65, 1;\n" -" mov.u32 %r67, 0;\n" -" setp.ne.u32 %p19, %r65, %r67;\n" -" @%p19 bra $Lt_1_31746;\n" -"$Lt_1_31234:\n" -" mov.f32 %f11, %f93;\n" -" mov.f32 %f13, %f94;\n" -" mov.f32 %f15, %f95;\n" -" mov.f32 %f17, %f96;\n" -" mov.f32 %f19, %f101;\n" -" mov.f32 %f21, %f102;\n" -"$Lt_1_30722:\n" -"$Lt_1_28674:\n" -" mov.u32 %r68, 0;\n" -" setp.ne.s32 %p20, %r17, %r68;\n" -" @%p20 bra $Lt_1_32770;\n" -" ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];\n" -" add.u64 %rd62, %rd61, %rd20;\n" -" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r70, 0;\n" -" setp.le.s32 %p21, %r69, %r70;\n" -" @%p21 bra $Lt_1_33282;\n" -" st.global.f32 [%rd62+0], %f34;\n" -" cvt.s64.s32 %rd63, %r13;\n" -" mul.wide.s32 %rd64, %r13, 4;\n" -" add.u64 %rd62, %rd62, %rd64;\n" -"$Lt_1_33282:\n" -" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r72, 0;\n" -" setp.le.s32 %p22, %r71, %r72;\n" -" @%p22 bra $Lt_1_33794;\n" -" mov.f32 %f109, %f11;\n" -" st.global.f32 [%rd62+0], %f109;\n" -" cvt.s64.s32 %rd65, %r13;\n" -" mul.wide.s32 %rd66, %r13, 4;\n" -" add.u64 %rd67, %rd66, %rd62;\n" -" mov.f32 %f110, %f13;\n" -" st.global.f32 [%rd67+0], %f110;\n" -" add.u64 %rd68, %rd66, %rd67;\n" -" mov.f32 %f111, %f15;\n" -" st.global.f32 [%rd68+0], %f111;\n" -" add.u64 %rd69, %rd66, %rd68;\n" -" mov.f32 %f112, %f17;\n" -" st.global.f32 [%rd69+0], %f112;\n" -" add.u64 %rd62, %rd66, %rd69;\n" -" mov.f32 %f113, %f19;\n" -" st.global.f32 [%rd62+0], %f113;\n" -" mov.f32 %f114, %f21;\n" -" add.u64 %rd70, %rd66, %rd62;\n" -" st.global.f32 [%rd70+0], %f114;\n" -"$Lt_1_33794:\n" -" ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];\n" -" mul.lo.u64 %rd72, %rd19, 16;\n" -" add.u64 %rd73, %rd71, %rd72;\n" -" mov.f32 %f115, %f116;\n" -" st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f115};\n" -"$Lt_1_32770:\n" -"$Lt_1_24066:\n" -" .loc 16 204 0\n" -" exit;\n" -"$LDWend_kernel_pair_fast:\n" -" }\n" -; diff --git a/lib/gpu/charmm_long.ptx b/lib/gpu/charmm_long.ptx deleted file mode 100644 index 39d7cd5923..0000000000 --- a/lib/gpu/charmm_long.ptx +++ /dev/null @@ -1,1200 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009e6b_00000000-9_lal_charmm_long.cpp3.i (/home/sjplimp/ccBI#.BwX2xw) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009e6b_00000000-8_lal_charmm_long.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_charmm_long.cu" - .file 17 "/usr/local/cuda/include/common_functions.h" - .file 18 "/usr/local/cuda/include/math_functions.h" - .file 19 "/usr/local/cuda/include/math_constants.h" - .file 20 "/usr/local/cuda/include/device_functions.h" - .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - .global .texref pos_tex; - .global .texref q_tex; - - .entry kernel_pair ( - .param .u64 __cudaparm_kernel_pair_x_, - .param .u64 __cudaparm_kernel_pair_lj1, - .param .s32 __cudaparm_kernel_pair_lj_types, - .param .u64 __cudaparm_kernel_pair_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_dev_nbor, - .param .u64 __cudaparm_kernel_pair_dev_packed, - .param .u64 __cudaparm_kernel_pair_ans, - .param .u64 __cudaparm_kernel_pair___val_paramengv, - .param .s32 __cudaparm_kernel_pair_eflag, - .param .s32 __cudaparm_kernel_pair_vflag, - .param .s32 __cudaparm_kernel_pair_inum, - .param .s32 __cudaparm_kernel_pair_nbor_pitch, - .param .u64 __cudaparm_kernel_pair_q_, - .param .f32 __cudaparm_kernel_pair_cut_coulsq, - .param .f32 __cudaparm_kernel_pair_qqrd2e, - .param .f32 __cudaparm_kernel_pair_g_ewald, - .param .f32 __cudaparm_kernel_pair_denom_lj, - .param .f32 __cudaparm_kernel_pair_cut_bothsq, - .param .f32 __cudaparm_kernel_pair_cut_ljsq, - .param .f32 __cudaparm_kernel_pair_cut_lj_innersq, - .param .s32 __cudaparm_kernel_pair_t_per_atom) - { - .reg .u32 %r<91>; - .reg .u64 %rd<65>; - .reg .f32 %f<190>; - .reg .pred %p<23>; - .shared .align 16 .b8 __cuda___cuda_local_var_32542_33_non_const_sp_lj120[32]; - .shared .align 4 .b8 __cuda___cuda_local_var_32646_55_non_const_red_acc152[3072]; - // __cuda_local_var_32554_10_non_const_f = 64 - // __cuda_local_var_32556_9_non_const_virial = 16 - // __cuda_local_var_32590_43_non_const_r6inv = 40 - // __cuda_local_var_32590_50_non_const_prefactor = 52 - // __cuda_local_var_32590_61_non_const__erfc = 48 - // __cuda_local_var_32590_68_non_const_switch1 = 44 - .loc 16 37 0 -$LDWbegin_kernel_pair: - .loc 16 42 0 - ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; - ldu.global.f32 %f1, [%rd1+0]; - .loc 16 43 0 - ld.global.f32 %f2, [%rd1+4]; - .loc 16 44 0 - ld.global.f32 %f3, [%rd1+8]; - .loc 16 45 0 - ld.global.f32 %f4, [%rd1+12]; - st.shared.v4.f32 [__cuda___cuda_local_var_32542_33_non_const_sp_lj120+0], {%f1,%f2,%f3,%f4}; - .loc 16 46 0 - ld.global.f32 %f5, [%rd1+16]; - .loc 16 47 0 - ld.global.f32 %f6, [%rd1+20]; - .loc 16 48 0 - ld.global.f32 %f7, [%rd1+24]; - .loc 16 49 0 - ld.global.f32 %f8, [%rd1+28]; - st.shared.v4.f32 [__cuda___cuda_local_var_32542_33_non_const_sp_lj120+16], {%f5,%f6,%f7,%f8}; - .loc 16 57 0 - mov.f32 %f9, 0f00000000; // 0 - mov.f32 %f10, %f9; - mov.f32 %f11, 0f00000000; // 0 - mov.f32 %f12, %f11; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, %f13; - mov.f32 %f15, 0f00000000; // 0 - mov.f32 %f16, %f15; - mov.f32 %f17, 0f00000000; // 0 - mov.f32 %f18, %f17; - mov.f32 %f19, 0f00000000; // 0 - mov.f32 %f20, %f19; - ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_pair_inum]; - setp.ge.s32 %p1, %r8, %r9; - @%p1 bra $Lt_0_33026; - .loc 16 62 0 - cvt.s64.s32 %rd2, %r8; - mul.wide.s32 %rd3, %r8, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor]; - add.u64 %rd5, %rd3, %rd4; - ld.global.s32 %r10, [%rd5+0]; - ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch]; - cvt.s64.s32 %rd6, %r11; - mul.wide.s32 %rd7, %r11, 4; - add.u64 %rd8, %rd7, %rd5; - ld.global.s32 %r12, [%rd8+0]; - sub.s32 %r13, %r1, 1; - and.b32 %r14, %r13, %r2; - cvt.s64.s32 %rd9, %r14; - mul.wide.s32 %rd10, %r14, 4; - ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed]; - setp.ne.u64 %p2, %rd11, %rd4; - @%p2 bra $Lt_0_24066; - cvt.s32.s64 %r15, %rd6; - mul.lo.s32 %r16, %r15, %r1; - mov.s32 %r17, %r16; - mul.lo.s32 %r18, %r13, %r8; - add.s32 %r19, %r15, %r18; - cvt.s64.s32 %rd12, %r19; - mul.wide.s32 %rd13, %r19, 4; - add.u64 %rd14, %rd8, %rd13; - and.b32 %r20, %r13, %r12; - cvt.s64.s32 %rd15, %r20; - div.s32 %r21, %r12, %r1; - mul.lo.s32 %r22, %r16, %r21; - cvt.s64.s32 %rd16, %r22; - add.u64 %rd17, %rd15, %rd16; - mul.lo.u64 %rd18, %rd17, 4; - add.u64 %rd19, %rd14, %rd18; - add.u64 %rd20, %rd10, %rd14; - bra.uni $Lt_0_23810; -$Lt_0_24066: - add.u64 %rd21, %rd7, %rd8; - ld.global.s32 %r23, [%rd21+0]; - cvt.s64.s32 %rd22, %r23; - mul.wide.s32 %rd23, %r23, 4; - add.u64 %rd24, %rd11, %rd23; - cvt.s64.s32 %rd25, %r12; - mul.wide.s32 %rd26, %r12, 4; - add.u64 %rd19, %rd24, %rd26; - mov.s32 %r17, %r1; - add.u64 %rd20, %rd10, %rd24; -$Lt_0_23810: - .loc 16 65 0 - mov.u32 %r24, %r10; - mov.s32 %r25, 0; - mov.u32 %r26, %r25; - mov.s32 %r27, 0; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r24,%r26,%r28,%r30}]; - mov.f32 %f25, %f21; - mov.f32 %f26, %f22; - mov.f32 %f27, %f23; - mov.f32 %f28, %f24; - .loc 16 66 0 - mov.u32 %r31, %r10; - mov.s32 %r32, 0; - mov.u32 %r33, %r32; - mov.s32 %r34, 0; - mov.u32 %r35, %r34; - mov.s32 %r36, 0; - mov.u32 %r37, %r36; - tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r31,%r33,%r35,%r37}]; - mov.f32 %f33, %f29; - setp.ge.u64 %p3, %rd20, %rd19; - @%p3 bra $Lt_0_34562; - cvt.s64.s32 %rd27, %r17; - ld.param.f32 %f34, [__cudaparm_kernel_pair_cut_bothsq]; - mov.f32 %f35, 0f00000000; // 0 - mov.f32 %f36, 0f00000000; // 0 - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 - mov.f32 %f39, 0f00000000; // 0 - mov.u64 %rd28, __cuda___cuda_local_var_32542_33_non_const_sp_lj120; -$Lt_0_24834: - // Loop body line 66, nesting depth: 1, estimated iterations: unknown - .loc 16 70 0 - ld.global.s32 %r38, [%rd20+0]; - .loc 16 73 0 - shr.s32 %r39, %r38, 30; - and.b32 %r40, %r39, 3; - cvt.s64.s32 %rd29, %r40; - mul.wide.s32 %rd30, %r40, 4; - add.u64 %rd31, %rd28, %rd30; - ld.shared.f32 %f40, [%rd31+0]; - .loc 16 74 0 - mov.f32 %f41, 0f3f800000; // 1 - ld.shared.f32 %f42, [%rd31+16]; - sub.ftz.f32 %f43, %f41, %f42; - .loc 16 77 0 - and.b32 %r41, %r38, 1073741823; - mov.u32 %r42, %r41; - mov.s32 %r43, 0; - mov.u32 %r44, %r43; - mov.s32 %r45, 0; - mov.u32 %r46, %r45; - mov.s32 %r47, 0; - mov.u32 %r48, %r47; - tex.1d.v4.f32.s32 {%f44,%f45,%f46,%f47},[pos_tex,{%r42,%r44,%r46,%r48}]; - mov.f32 %f48, %f44; - mov.f32 %f49, %f45; - mov.f32 %f50, %f46; - mov.f32 %f51, %f47; - sub.ftz.f32 %f52, %f26, %f49; - sub.ftz.f32 %f53, %f25, %f48; - sub.ftz.f32 %f54, %f27, %f50; - mul.ftz.f32 %f55, %f52, %f52; - fma.rn.ftz.f32 %f56, %f53, %f53, %f55; - fma.rn.ftz.f32 %f57, %f54, %f54, %f56; - setp.lt.ftz.f32 %p4, %f57, %f34; - @!%p4 bra $Lt_0_28162; - ld.param.f32 %f58, [__cudaparm_kernel_pair_cut_ljsq]; - setp.lt.ftz.f32 %p5, %f57, %f58; - rcp.approx.ftz.f32 %f59, %f57; - @!%p5 bra $Lt_0_25858; - .loc 16 92 0 - mul.ftz.f32 %f60, %f59, %f59; - mul.ftz.f32 %f61, %f59, %f60; - mov.f32 %f62, %f61; - .loc 16 93 0 - cvt.rzi.ftz.s32.f32 %r49, %f51; - cvt.rzi.ftz.s32.f32 %r50, %f28; - ld.param.u64 %rd32, [__cudaparm_kernel_pair_lj1]; - ld.param.s32 %r51, [__cudaparm_kernel_pair_lj_types]; - mul.lo.s32 %r52, %r51, %r50; - add.s32 %r53, %r49, %r52; - cvt.s64.s32 %rd33, %r53; - mul.wide.s32 %rd34, %r53, 16; - add.u64 %rd35, %rd32, %rd34; - mul.ftz.f32 %f63, %f61, %f40; - ld.global.v2.f32 {%f64,%f65}, [%rd35+0]; - mul.ftz.f32 %f66, %f64, %f61; - sub.ftz.f32 %f67, %f66, %f65; - mul.ftz.f32 %f68, %f63, %f67; - ld.param.f32 %f69, [__cudaparm_kernel_pair_cut_lj_innersq]; - setp.gt.ftz.f32 %p6, %f57, %f69; - @!%p6 bra $Lt_0_25602; - .loc 16 99 0 - add.ftz.f32 %f70, %f57, %f57; - sub.ftz.f32 %f71, %f58, %f57; - add.ftz.f32 %f72, %f70, %f58; - mul.ftz.f32 %f73, %f71, %f71; - mov.f32 %f74, 0f40400000; // 3 - mul.ftz.f32 %f75, %f74, %f69; - sub.ftz.f32 %f76, %f72, %f75; - ld.param.f32 %f77, [__cudaparm_kernel_pair_denom_lj]; - div.approx.ftz.f32 %f78, %f76, %f77; - mul.ftz.f32 %f79, %f73, %f78; - mov.f32 %f80, %f79; - .loc 16 102 0 - mov.f32 %f81, 0f41400000; // 12 - mul.ftz.f32 %f82, %f57, %f81; - mul.ftz.f32 %f83, %f71, %f82; - sub.ftz.f32 %f84, %f57, %f69; - mul.ftz.f32 %f85, %f83, %f84; - div.approx.ftz.f32 %f86, %f85, %f77; - ld.global.v2.f32 {%f87,%f88}, [%rd35+8]; - mul.ftz.f32 %f89, %f87, %f61; - sub.ftz.f32 %f90, %f89, %f88; - mul.ftz.f32 %f91, %f61, %f90; - mul.ftz.f32 %f92, %f86, %f91; - fma.rn.ftz.f32 %f68, %f68, %f79, %f92; - bra.uni $Lt_0_25602; -$Lt_0_25858: - .loc 16 105 0 - mov.f32 %f68, 0f00000000; // 0 -$Lt_0_25602: - ld.param.f32 %f93, [__cudaparm_kernel_pair_cut_coulsq]; - setp.gt.ftz.f32 %p7, %f93, %f57; - @!%p7 bra $Lt_0_26882; - .loc 20 518 0 - rsqrt.approx.ftz.f32 %f94, %f59; - ld.param.f32 %f95, [__cudaparm_kernel_pair_g_ewald]; - mul.ftz.f32 %f96, %f95, %f94; - mul.ftz.f32 %f97, %f96, %f96; - neg.ftz.f32 %f98, %f97; - mov.f32 %f99, 0f3fb8aa3b; // 1.4427 - mul.ftz.f32 %f100, %f98, %f99; - ex2.approx.ftz.f32 %f101, %f100; - .loc 16 112 0 - mov.f32 %f102, 0f3f800000; // 1 - mov.f32 %f103, 0f3ea7ba05; // 0.327591 - fma.rn.ftz.f32 %f104, %f103, %f96, %f102; - rcp.approx.ftz.f32 %f105, %f104; - mov.f32 %f106, 0f3e827906; // 0.25483 - mov.f32 %f107, 0fbe91a98e; // -0.284497 - mov.f32 %f108, 0f3fb5f0e3; // 1.42141 - mov.f32 %f109, 0fbfba00e3; // -1.45315 - mov.f32 %f110, 0f3f87dc22; // 1.06141 - fma.rn.ftz.f32 %f111, %f110, %f105, %f109; - fma.rn.ftz.f32 %f112, %f105, %f111, %f108; - fma.rn.ftz.f32 %f113, %f105, %f112, %f107; - fma.rn.ftz.f32 %f114, %f105, %f113, %f106; - mul.ftz.f32 %f115, %f105, %f114; - mul.ftz.f32 %f116, %f101, %f115; - mov.f32 %f117, %f116; - .loc 16 113 0 - mov.u32 %r54, %r41; - mov.s32 %r55, 0; - mov.u32 %r56, %r55; - mov.s32 %r57, 0; - mov.u32 %r58, %r57; - mov.s32 %r59, 0; - mov.u32 %r60, %r59; - tex.1d.v4.f32.s32 {%f118,%f119,%f120,%f121},[q_tex,{%r54,%r56,%r58,%r60}]; - mov.f32 %f122, %f118; - ld.param.f32 %f123, [__cudaparm_kernel_pair_qqrd2e]; - mul.ftz.f32 %f124, %f123, %f33; - mul.ftz.f32 %f125, %f124, %f122; - div.approx.ftz.f32 %f126, %f125, %f94; - mov.f32 %f127, %f126; - .loc 16 114 0 - mov.f32 %f128, 0f3f906ebb; // 1.12838 - mul.ftz.f32 %f129, %f96, %f128; - fma.rn.ftz.f32 %f130, %f101, %f129, %f116; - sub.ftz.f32 %f131, %f130, %f43; - mul.ftz.f32 %f132, %f126, %f131; - bra.uni $Lt_0_26626; -$Lt_0_26882: - .loc 16 116 0 - mov.f32 %f132, 0f00000000; // 0 -$Lt_0_26626: - .loc 16 120 0 - add.ftz.f32 %f133, %f132, %f68; - mul.ftz.f32 %f134, %f133, %f59; - fma.rn.ftz.f32 %f37, %f53, %f134, %f37; - .loc 16 121 0 - fma.rn.ftz.f32 %f36, %f52, %f134, %f36; - .loc 16 122 0 - fma.rn.ftz.f32 %f35, %f54, %f134, %f35; - ld.param.s32 %r61, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r62, 0; - setp.le.s32 %p8, %r61, %r62; - @%p8 bra $Lt_0_27650; - .loc 16 125 0 - mov.f32 %f135, %f127; - mov.f32 %f136, %f117; - sub.ftz.f32 %f137, %f136, %f43; - fma.rn.ftz.f32 %f138, %f135, %f137, %f38; - selp.f32 %f38, %f138, %f38, %p7; - @!%p5 bra $Lt_0_27650; - .loc 16 128 0 - cvt.rzi.ftz.s32.f32 %r63, %f51; - cvt.rzi.ftz.s32.f32 %r64, %f28; - ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj1]; - ld.param.s32 %r65, [__cudaparm_kernel_pair_lj_types]; - mul.lo.s32 %r66, %r65, %r64; - add.s32 %r67, %r63, %r66; - cvt.s64.s32 %rd37, %r67; - mul.wide.s32 %rd38, %r67, 16; - add.u64 %rd35, %rd36, %rd38; - mov.f32 %f139, %f62; - ld.global.v2.f32 {%f140,%f141}, [%rd35+8]; - mul.ftz.f32 %f142, %f140, %f139; - sub.ftz.f32 %f143, %f142, %f141; - mul.ftz.f32 %f144, %f139, %f143; - mov.f32 %f145, %f80; - mul.ftz.f32 %f146, %f145, %f144; - ld.param.f32 %f147, [__cudaparm_kernel_pair_cut_lj_innersq]; - setp.lt.ftz.f32 %p9, %f147, %f57; - selp.f32 %f148, %f146, %f144, %p9; - .loc 16 131 0 - fma.rn.ftz.f32 %f39, %f40, %f148, %f39; -$Lt_0_27650: -$Lt_0_27138: - ld.param.s32 %r68, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r69, 0; - setp.le.s32 %p10, %r68, %r69; - @%p10 bra $Lt_0_28162; - .loc 16 135 0 - mov.f32 %f149, %f10; - mul.ftz.f32 %f150, %f53, %f53; - fma.rn.ftz.f32 %f151, %f134, %f150, %f149; - mov.f32 %f10, %f151; - .loc 16 136 0 - mov.f32 %f152, %f12; - fma.rn.ftz.f32 %f153, %f134, %f55, %f152; - mov.f32 %f12, %f153; - .loc 16 137 0 - mov.f32 %f154, %f14; - mul.ftz.f32 %f155, %f54, %f54; - fma.rn.ftz.f32 %f156, %f134, %f155, %f154; - mov.f32 %f14, %f156; - .loc 16 138 0 - mov.f32 %f157, %f16; - mul.ftz.f32 %f158, %f52, %f53; - fma.rn.ftz.f32 %f159, %f134, %f158, %f157; - mov.f32 %f16, %f159; - .loc 16 139 0 - mov.f32 %f160, %f18; - mul.ftz.f32 %f161, %f53, %f54; - fma.rn.ftz.f32 %f162, %f134, %f161, %f160; - mov.f32 %f18, %f162; - .loc 16 140 0 - mul.ftz.f32 %f163, %f52, %f54; - fma.rn.ftz.f32 %f19, %f134, %f163, %f19; - mov.f32 %f20, %f19; -$Lt_0_28162: -$Lt_0_25090: - .loc 16 69 0 - mul.lo.u64 %rd39, %rd27, 4; - add.u64 %rd20, %rd20, %rd39; - setp.lt.u64 %p11, %rd20, %rd19; - @%p11 bra $Lt_0_24834; - bra.uni $Lt_0_24322; -$Lt_0_34562: - mov.f32 %f35, 0f00000000; // 0 - mov.f32 %f36, 0f00000000; // 0 - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 - mov.f32 %f39, 0f00000000; // 0 -$Lt_0_24322: - mov.u32 %r70, 1; - setp.le.s32 %p12, %r1, %r70; - @%p12 bra $Lt_0_30978; - .loc 16 145 0 - mov.u64 %rd40, __cuda___cuda_local_var_32646_55_non_const_red_acc152; - cvt.s64.s32 %rd41, %r2; - mul.wide.s32 %rd42, %r2, 4; - add.u64 %rd43, %rd40, %rd42; - mov.f32 %f164, %f37; - st.shared.f32 [%rd43+0], %f164; - mov.f32 %f165, %f36; - st.shared.f32 [%rd43+512], %f165; - mov.f32 %f166, %f35; - st.shared.f32 [%rd43+1024], %f166; - mov.f32 %f167, %f39; - st.shared.f32 [%rd43+1536], %f167; - mov.f32 %f168, %f38; - st.shared.f32 [%rd43+2048], %f168; - shr.s32 %r71, %r1, 31; - mov.s32 %r72, 1; - and.b32 %r73, %r71, %r72; - add.s32 %r74, %r73, %r1; - shr.s32 %r75, %r74, 1; - mov.s32 %r76, %r75; - mov.u32 %r77, 0; - setp.ne.u32 %p13, %r75, %r77; - @!%p13 bra $Lt_0_29442; -$Lt_0_29954: - setp.ge.u32 %p14, %r14, %r76; - @%p14 bra $Lt_0_30210; - add.u32 %r78, %r2, %r76; - cvt.u64.u32 %rd44, %r78; - mul.wide.u32 %rd45, %r78, 4; - add.u64 %rd46, %rd40, %rd45; - ld.shared.f32 %f169, [%rd46+0]; - add.ftz.f32 %f164, %f169, %f164; - st.shared.f32 [%rd43+0], %f164; - ld.shared.f32 %f170, [%rd46+512]; - add.ftz.f32 %f165, %f170, %f165; - st.shared.f32 [%rd43+512], %f165; - ld.shared.f32 %f171, [%rd46+1024]; - add.ftz.f32 %f166, %f171, %f166; - st.shared.f32 [%rd43+1024], %f166; - ld.shared.f32 %f172, [%rd46+1536]; - add.ftz.f32 %f167, %f172, %f167; - st.shared.f32 [%rd43+1536], %f167; - ld.shared.f32 %f173, [%rd46+2048]; - add.ftz.f32 %f168, %f173, %f168; - st.shared.f32 [%rd43+2048], %f168; -$Lt_0_30210: - shr.u32 %r76, %r76, 1; - mov.u32 %r79, 0; - setp.ne.u32 %p15, %r76, %r79; - @%p15 bra $Lt_0_29954; -$Lt_0_29442: - mov.f32 %f37, %f164; - mov.f32 %f36, %f165; - mov.f32 %f35, %f166; - mov.f32 %f39, %f167; - mov.f32 %f38, %f168; - ld.param.s32 %r80, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r81, 0; - setp.le.s32 %p16, %r80, %r81; - @%p16 bra $Lt_0_30978; - mov.f32 %f164, %f10; - st.shared.f32 [%rd43+0], %f164; - mov.f32 %f165, %f12; - st.shared.f32 [%rd43+512], %f165; - mov.f32 %f166, %f14; - st.shared.f32 [%rd43+1024], %f166; - mov.f32 %f167, %f16; - st.shared.f32 [%rd43+1536], %f167; - mov.f32 %f168, %f18; - st.shared.f32 [%rd43+2048], %f168; - mov.f32 %f174, %f19; - st.shared.f32 [%rd43+2560], %f174; - mov.s32 %r82, %r75; - @!%p13 bra $Lt_0_31490; -$Lt_0_32002: - setp.ge.u32 %p17, %r14, %r82; - @%p17 bra $Lt_0_32258; - add.u32 %r83, %r2, %r82; - cvt.u64.u32 %rd47, %r83; - mul.wide.u32 %rd48, %r83, 4; - add.u64 %rd49, %rd40, %rd48; - ld.shared.f32 %f175, [%rd49+0]; - add.ftz.f32 %f164, %f175, %f164; - st.shared.f32 [%rd43+0], %f164; - ld.shared.f32 %f176, [%rd49+512]; - add.ftz.f32 %f165, %f176, %f165; - st.shared.f32 [%rd43+512], %f165; - ld.shared.f32 %f177, [%rd49+1024]; - add.ftz.f32 %f166, %f177, %f166; - st.shared.f32 [%rd43+1024], %f166; - ld.shared.f32 %f178, [%rd49+1536]; - add.ftz.f32 %f167, %f178, %f167; - st.shared.f32 [%rd43+1536], %f167; - ld.shared.f32 %f179, [%rd49+2048]; - add.ftz.f32 %f168, %f179, %f168; - st.shared.f32 [%rd43+2048], %f168; - ld.shared.f32 %f180, [%rd49+2560]; - add.ftz.f32 %f174, %f180, %f174; - st.shared.f32 [%rd43+2560], %f174; -$Lt_0_32258: - shr.u32 %r82, %r82, 1; - mov.u32 %r84, 0; - setp.ne.u32 %p18, %r82, %r84; - @%p18 bra $Lt_0_32002; -$Lt_0_31490: - mov.f32 %f10, %f164; - mov.f32 %f12, %f165; - mov.f32 %f14, %f166; - mov.f32 %f16, %f167; - mov.f32 %f18, %f168; - mov.f32 %f20, %f174; -$Lt_0_30978: -$Lt_0_28930: - mov.u32 %r85, 0; - setp.ne.s32 %p19, %r14, %r85; - @%p19 bra $Lt_0_33026; - ld.param.u64 %rd50, [__cudaparm_kernel_pair___val_paramengv]; - add.u64 %rd51, %rd50, %rd3; - ld.param.s32 %r86, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r87, 0; - setp.le.s32 %p20, %r86, %r87; - @%p20 bra $Lt_0_33538; - st.global.f32 [%rd51+0], %f39; - cvt.s64.s32 %rd52, %r9; - mul.wide.s32 %rd53, %r9, 4; - add.u64 %rd54, %rd53, %rd51; - st.global.f32 [%rd54+0], %f38; - add.u64 %rd51, %rd53, %rd54; -$Lt_0_33538: - ld.param.s32 %r88, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r89, 0; - setp.le.s32 %p21, %r88, %r89; - @%p21 bra $Lt_0_34050; - mov.f32 %f181, %f10; - st.global.f32 [%rd51+0], %f181; - cvt.s64.s32 %rd55, %r9; - mul.wide.s32 %rd56, %r9, 4; - add.u64 %rd57, %rd56, %rd51; - mov.f32 %f182, %f12; - st.global.f32 [%rd57+0], %f182; - add.u64 %rd58, %rd56, %rd57; - mov.f32 %f183, %f14; - st.global.f32 [%rd58+0], %f183; - add.u64 %rd59, %rd56, %rd58; - mov.f32 %f184, %f16; - st.global.f32 [%rd59+0], %f184; - add.u64 %rd51, %rd56, %rd59; - mov.f32 %f185, %f18; - st.global.f32 [%rd51+0], %f185; - mov.f32 %f186, %f20; - add.u64 %rd60, %rd56, %rd51; - st.global.f32 [%rd60+0], %f186; -$Lt_0_34050: - ld.param.u64 %rd61, [__cudaparm_kernel_pair_ans]; - mul.lo.u64 %rd62, %rd2, 16; - add.u64 %rd63, %rd61, %rd62; - mov.f32 %f187, %f188; - st.global.v4.f32 [%rd63+0], {%f37,%f36,%f35,%f187}; -$Lt_0_33026: -$Lt_0_23298: - .loc 16 148 0 - exit; -$LDWend_kernel_pair: - } // kernel_pair - - .entry kernel_pair_fast ( - .param .u64 __cudaparm_kernel_pair_fast_x_, - .param .u64 __cudaparm_kernel_pair_fast_ljd_in, - .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, - .param .u64 __cudaparm_kernel_pair_fast_dev_packed, - .param .u64 __cudaparm_kernel_pair_fast_ans, - .param .u64 __cudaparm_kernel_pair_fast___val_paramengv, - .param .s32 __cudaparm_kernel_pair_fast_eflag, - .param .s32 __cudaparm_kernel_pair_fast_vflag, - .param .s32 __cudaparm_kernel_pair_fast_inum, - .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, - .param .u64 __cudaparm_kernel_pair_fast_q_, - .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq, - .param .f32 __cudaparm_kernel_pair_fast_qqrd2e, - .param .f32 __cudaparm_kernel_pair_fast_g_ewald, - .param .f32 __cudaparm_kernel_pair_fast_denom_lj, - .param .f32 __cudaparm_kernel_pair_fast_cut_bothsq, - .param .f32 __cudaparm_kernel_pair_fast_cut_ljsq, - .param .f32 __cudaparm_kernel_pair_fast_cut_lj_innersq, - .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) - { - .reg .u32 %r<86>; - .reg .u64 %rd<72>; - .reg .f32 %f<196>; - .reg .pred %p<25>; - .shared .align 4 .b8 __cuda___cuda_local_var_32666_33_non_const_sp_lj3336[32]; - .shared .align 8 .b8 __cuda___cuda_local_var_32665_34_non_const_ljd3368[1024]; - .shared .align 4 .b8 __cuda___cuda_local_var_32775_55_non_const_red_acc4392[3072]; - // __cuda_local_var_32675_10_non_const_f = 64 - // __cuda_local_var_32677_9_non_const_virial = 16 - // __cuda_local_var_32712_43_non_const_prefactor = 56 - // __cuda_local_var_32712_54_non_const__erfc = 52 - // __cuda_local_var_32712_61_non_const_switch1 = 48 - // __cuda_local_var_32713_15_non_const_lj3 = 44 - // __cuda_local_var_32713_20_non_const_lj4 = 40 - .loc 16 160 0 -$LDWbegin_kernel_pair_fast: - cvt.s32.u32 %r1, %tid.x; - cvt.s64.s32 %rd1, %r1; - mov.u32 %r2, 7; - setp.gt.s32 %p1, %r1, %r2; - @%p1 bra $Lt_1_24834; - .loc 16 167 0 - mov.u64 %rd2, __cuda___cuda_local_var_32666_33_non_const_sp_lj3336; - mul.lo.u64 %rd3, %rd1, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f32 %f1, [%rd5+0]; - add.u64 %rd6, %rd3, %rd2; - st.shared.f32 [%rd6+0], %f1; -$Lt_1_24834: - mov.u64 %rd7, __cuda___cuda_local_var_32665_34_non_const_ljd3368; - mov.u64 %rd2, __cuda___cuda_local_var_32666_33_non_const_sp_lj3336; - .loc 16 168 0 - mul.lo.u64 %rd8, %rd1, 8; - ld.param.u64 %rd9, [__cudaparm_kernel_pair_fast_ljd_in]; - add.u64 %rd10, %rd9, %rd8; - add.u64 %rd11, %rd8, %rd7; - ld.global.v2.f32 {%f2,%f3}, [%rd10+0]; - st.shared.v2.f32 [%rd11+0], {%f2,%f3}; - add.s32 %r3, %r1, 128; - mov.u32 %r4, 127; - setp.gt.s32 %p2, %r3, %r4; - @%p2 bra $Lt_1_25346; - ld.global.v2.f32 {%f4,%f5}, [%rd10+1024]; - st.shared.v2.f32 [%rd11+1024], {%f4,%f5}; -$Lt_1_25346: - .loc 16 178 0 - mov.f32 %f6, 0f00000000; // 0 - mov.f32 %f7, %f6; - mov.f32 %f8, 0f00000000; // 0 - mov.f32 %f9, %f8; - mov.f32 %f10, 0f00000000; // 0 - mov.f32 %f11, %f10; - mov.f32 %f12, 0f00000000; // 0 - mov.f32 %f13, %f12; - mov.f32 %f14, 0f00000000; // 0 - mov.f32 %f15, %f14; - mov.f32 %f16, 0f00000000; // 0 - mov.f32 %f17, %f16; - .loc 16 180 0 - bar.sync 0; - ld.param.s32 %r5, [__cudaparm_kernel_pair_fast_t_per_atom]; - div.s32 %r6, %r1, %r5; - cvt.s32.u32 %r7, %ntid.x; - div.s32 %r8, %r7, %r5; - cvt.s32.u32 %r9, %ctaid.x; - mul.lo.s32 %r10, %r9, %r8; - add.s32 %r11, %r6, %r10; - ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_inum]; - setp.ge.s32 %p3, %r11, %r12; - @%p3 bra $Lt_1_35586; - .loc 16 185 0 - cvt.s64.s32 %rd12, %r11; - mul.wide.s32 %rd13, %r11, 4; - ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_dev_nbor]; - add.u64 %rd15, %rd13, %rd14; - ld.global.s32 %r13, [%rd15+0]; - ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch]; - cvt.s64.s32 %rd16, %r14; - mul.wide.s32 %rd17, %r14, 4; - add.u64 %rd18, %rd17, %rd15; - ld.global.s32 %r15, [%rd18+0]; - sub.s32 %r16, %r5, 1; - and.b32 %r17, %r16, %r1; - cvt.s64.s32 %rd19, %r17; - mul.wide.s32 %rd20, %r17, 4; - ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_packed]; - setp.ne.u64 %p4, %rd21, %rd14; - @%p4 bra $Lt_1_26626; - cvt.s32.s64 %r18, %rd16; - mul.lo.s32 %r19, %r18, %r5; - mov.s32 %r20, %r19; - mul.lo.s32 %r21, %r16, %r11; - add.s32 %r22, %r18, %r21; - cvt.s64.s32 %rd22, %r22; - mul.wide.s32 %rd23, %r22, 4; - add.u64 %rd24, %rd18, %rd23; - and.b32 %r23, %r16, %r15; - cvt.s64.s32 %rd25, %r23; - div.s32 %r24, %r15, %r5; - mul.lo.s32 %r25, %r19, %r24; - cvt.s64.s32 %rd26, %r25; - add.u64 %rd27, %rd25, %rd26; - mul.lo.u64 %rd28, %rd27, 4; - add.u64 %rd29, %rd24, %rd28; - add.u64 %rd30, %rd20, %rd24; - bra.uni $Lt_1_26370; -$Lt_1_26626: - add.u64 %rd31, %rd17, %rd18; - ld.global.s32 %r26, [%rd31+0]; - cvt.s64.s32 %rd32, %r26; - mul.wide.s32 %rd33, %r26, 4; - add.u64 %rd34, %rd21, %rd33; - cvt.s64.s32 %rd35, %r15; - mul.wide.s32 %rd36, %r15, 4; - add.u64 %rd29, %rd34, %rd36; - mov.s32 %r20, %r5; - add.u64 %rd30, %rd20, %rd34; -$Lt_1_26370: - .loc 16 188 0 - mov.u32 %r27, %r13; - mov.s32 %r28, 0; - mov.u32 %r29, %r28; - mov.s32 %r30, 0; - mov.u32 %r31, %r30; - mov.s32 %r32, 0; - mov.u32 %r33, %r32; - tex.1d.v4.f32.s32 {%f18,%f19,%f20,%f21},[pos_tex,{%r27,%r29,%r31,%r33}]; - mov.f32 %f22, %f18; - mov.f32 %f23, %f19; - mov.f32 %f24, %f20; - mov.f32 %f25, %f21; - .loc 16 189 0 - mov.u32 %r34, %r13; - mov.s32 %r35, 0; - mov.u32 %r36, %r35; - mov.s32 %r37, 0; - mov.u32 %r38, %r37; - mov.s32 %r39, 0; - mov.u32 %r40, %r39; - tex.1d.v4.f32.s32 {%f26,%f27,%f28,%f29},[q_tex,{%r34,%r36,%r38,%r40}]; - mov.f32 %f30, %f26; - setp.ge.u64 %p5, %rd30, %rd29; - @%p5 bra $Lt_1_37122; - cvt.rzi.ftz.s32.f32 %r41, %f25; - cvt.s64.s32 %rd37, %r20; - ld.param.f32 %f31, [__cudaparm_kernel_pair_fast_cut_bothsq]; - mov.f32 %f32, 0f00000000; // 0 - mov.f32 %f33, 0f00000000; // 0 - mov.f32 %f34, 0f00000000; // 0 - mov.f32 %f35, 0f00000000; // 0 - mov.f32 %f36, 0f00000000; // 0 -$Lt_1_27394: - // Loop body line 189, nesting depth: 1, estimated iterations: unknown - .loc 16 193 0 - ld.global.s32 %r42, [%rd30+0]; - .loc 16 196 0 - shr.s32 %r43, %r42, 30; - and.b32 %r44, %r43, 3; - cvt.s64.s32 %rd38, %r44; - mul.wide.s32 %rd39, %r44, 4; - add.u64 %rd40, %rd2, %rd39; - ld.shared.f32 %f37, [%rd40+0]; - .loc 16 197 0 - mov.f32 %f38, 0f3f800000; // 1 - ld.shared.f32 %f39, [%rd40+16]; - sub.ftz.f32 %f40, %f38, %f39; - .loc 16 200 0 - and.b32 %r45, %r42, 1073741823; - mov.u32 %r46, %r45; - mov.s32 %r47, 0; - mov.u32 %r48, %r47; - mov.s32 %r49, 0; - mov.u32 %r50, %r49; - mov.s32 %r51, 0; - mov.u32 %r52, %r51; - tex.1d.v4.f32.s32 {%f41,%f42,%f43,%f44},[pos_tex,{%r46,%r48,%r50,%r52}]; - mov.f32 %f45, %f41; - mov.f32 %f46, %f42; - mov.f32 %f47, %f43; - mov.f32 %f48, %f44; - sub.ftz.f32 %f49, %f23, %f46; - sub.ftz.f32 %f50, %f22, %f45; - sub.ftz.f32 %f51, %f24, %f47; - mul.ftz.f32 %f52, %f49, %f49; - fma.rn.ftz.f32 %f53, %f50, %f50, %f52; - fma.rn.ftz.f32 %f54, %f51, %f51, %f53; - setp.lt.ftz.f32 %p6, %f54, %f31; - @!%p6 bra $Lt_1_30722; - ld.param.f32 %f55, [__cudaparm_kernel_pair_fast_cut_ljsq]; - setp.lt.ftz.f32 %p7, %f54, %f55; - rcp.approx.ftz.f32 %f56, %f54; - @!%p7 bra $Lt_1_28418; - .loc 16 215 0 - cvt.rzi.ftz.s32.f32 %r53, %f48; - cvt.s64.s32 %rd41, %r41; - mul.wide.s32 %rd42, %r41, 8; - add.u64 %rd43, %rd7, %rd42; - cvt.s64.s32 %rd44, %r53; - mul.wide.s32 %rd45, %r53, 8; - add.u64 %rd46, %rd7, %rd45; - ld.shared.v2.f32 {%f57,%f58}, [%rd43+0]; - ld.shared.v2.f32 {%f59,%f60}, [%rd46+0]; - mul.ftz.f32 %f61, %f57, %f59; - .loc 16 216 0 - add.ftz.f32 %f62, %f58, %f60; - mov.f32 %f63, 0f3f000000; // 0.5 - mul.ftz.f32 %f64, %f62, %f63; - .loc 16 220 0 - mul.ftz.f32 %f65, %f64, %f64; - sqrt.approx.ftz.f32 %f66, %f61; - mov.f32 %f67, 0f40800000; // 4 - mul.ftz.f32 %f68, %f66, %f67; - mul.ftz.f32 %f69, %f65, %f56; - mul.ftz.f32 %f70, %f69, %f69; - mul.ftz.f32 %f71, %f69, %f70; - mul.ftz.f32 %f72, %f68, %f71; - mov.f32 %f73, %f72; - .loc 16 221 0 - mul.ftz.f32 %f74, %f71, %f72; - mov.f32 %f75, %f74; - .loc 16 222 0 - mov.f32 %f76, 0f40c00000; // 6 - mul.ftz.f32 %f77, %f72, %f76; - mov.f32 %f78, 0f41400000; // 12 - mul.ftz.f32 %f79, %f78, %f74; - sub.ftz.f32 %f80, %f79, %f77; - mul.ftz.f32 %f81, %f37, %f80; - ld.param.f32 %f82, [__cudaparm_kernel_pair_fast_cut_lj_innersq]; - setp.gt.ftz.f32 %p8, %f54, %f82; - @!%p8 bra $Lt_1_28162; - .loc 16 228 0 - add.ftz.f32 %f83, %f54, %f54; - sub.ftz.f32 %f84, %f55, %f54; - add.ftz.f32 %f85, %f83, %f55; - mul.ftz.f32 %f86, %f84, %f84; - mov.f32 %f87, 0f40400000; // 3 - mul.ftz.f32 %f88, %f87, %f82; - sub.ftz.f32 %f89, %f85, %f88; - ld.param.f32 %f90, [__cudaparm_kernel_pair_fast_denom_lj]; - div.approx.ftz.f32 %f91, %f89, %f90; - mul.ftz.f32 %f92, %f86, %f91; - mov.f32 %f93, %f92; - .loc 16 231 0 - mov.f32 %f94, 0f41400000; // 12 - mul.ftz.f32 %f95, %f54, %f94; - mul.ftz.f32 %f96, %f84, %f95; - sub.ftz.f32 %f97, %f54, %f82; - mul.ftz.f32 %f98, %f96, %f97; - div.approx.ftz.f32 %f99, %f98, %f90; - sub.ftz.f32 %f100, %f74, %f72; - mul.ftz.f32 %f101, %f99, %f100; - fma.rn.ftz.f32 %f81, %f81, %f92, %f101; - bra.uni $Lt_1_28162; -$Lt_1_28418: - .loc 16 234 0 - mov.f32 %f81, 0f00000000; // 0 -$Lt_1_28162: - ld.param.f32 %f102, [__cudaparm_kernel_pair_fast_cut_coulsq]; - setp.gt.ftz.f32 %p9, %f102, %f54; - @!%p9 bra $Lt_1_29442; - .loc 20 518 0 - rsqrt.approx.ftz.f32 %f103, %f56; - ld.param.f32 %f104, [__cudaparm_kernel_pair_fast_g_ewald]; - mul.ftz.f32 %f105, %f104, %f103; - mul.ftz.f32 %f106, %f105, %f105; - neg.ftz.f32 %f107, %f106; - mov.f32 %f108, 0f3fb8aa3b; // 1.4427 - mul.ftz.f32 %f109, %f107, %f108; - ex2.approx.ftz.f32 %f110, %f109; - .loc 16 241 0 - mov.f32 %f111, 0f3f800000; // 1 - mov.f32 %f112, 0f3ea7ba05; // 0.327591 - fma.rn.ftz.f32 %f113, %f112, %f105, %f111; - rcp.approx.ftz.f32 %f114, %f113; - mov.f32 %f115, 0f3e827906; // 0.25483 - mov.f32 %f116, 0fbe91a98e; // -0.284497 - mov.f32 %f117, 0f3fb5f0e3; // 1.42141 - mov.f32 %f118, 0fbfba00e3; // -1.45315 - mov.f32 %f119, 0f3f87dc22; // 1.06141 - fma.rn.ftz.f32 %f120, %f119, %f114, %f118; - fma.rn.ftz.f32 %f121, %f114, %f120, %f117; - fma.rn.ftz.f32 %f122, %f114, %f121, %f116; - fma.rn.ftz.f32 %f123, %f114, %f122, %f115; - mul.ftz.f32 %f124, %f114, %f123; - mul.ftz.f32 %f125, %f110, %f124; - mov.f32 %f126, %f125; - .loc 16 242 0 - mov.u32 %r54, %r45; - mov.s32 %r55, 0; - mov.u32 %r56, %r55; - mov.s32 %r57, 0; - mov.u32 %r58, %r57; - mov.s32 %r59, 0; - mov.u32 %r60, %r59; - tex.1d.v4.f32.s32 {%f127,%f128,%f129,%f130},[q_tex,{%r54,%r56,%r58,%r60}]; - mov.f32 %f131, %f127; - ld.param.f32 %f132, [__cudaparm_kernel_pair_fast_qqrd2e]; - mul.ftz.f32 %f133, %f132, %f30; - mul.ftz.f32 %f134, %f133, %f131; - div.approx.ftz.f32 %f135, %f134, %f103; - mov.f32 %f136, %f135; - .loc 16 243 0 - mov.f32 %f137, 0f3f906ebb; // 1.12838 - mul.ftz.f32 %f138, %f105, %f137; - fma.rn.ftz.f32 %f139, %f110, %f138, %f125; - sub.ftz.f32 %f140, %f139, %f40; - mul.ftz.f32 %f141, %f135, %f140; - bra.uni $Lt_1_29186; -$Lt_1_29442: - .loc 16 245 0 - mov.f32 %f141, 0f00000000; // 0 -$Lt_1_29186: - .loc 16 249 0 - add.ftz.f32 %f142, %f141, %f81; - mul.ftz.f32 %f143, %f142, %f56; - fma.rn.ftz.f32 %f34, %f50, %f143, %f34; - .loc 16 250 0 - fma.rn.ftz.f32 %f33, %f49, %f143, %f33; - .loc 16 251 0 - fma.rn.ftz.f32 %f32, %f51, %f143, %f32; - ld.param.s32 %r61, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r62, 0; - setp.le.s32 %p10, %r61, %r62; - @%p10 bra $Lt_1_30210; - .loc 16 254 0 - mov.f32 %f144, %f136; - mov.f32 %f145, %f126; - sub.ftz.f32 %f146, %f145, %f40; - fma.rn.ftz.f32 %f147, %f144, %f146, %f35; - selp.f32 %f35, %f147, %f35, %p9; - @!%p7 bra $Lt_1_30210; - .loc 16 260 0 - mov.f32 %f148, %f75; - mov.f32 %f149, %f73; - sub.ftz.f32 %f150, %f148, %f149; - mov.f32 %f151, %f93; - mul.ftz.f32 %f152, %f151, %f150; - ld.param.f32 %f153, [__cudaparm_kernel_pair_fast_cut_lj_innersq]; - setp.lt.ftz.f32 %p11, %f153, %f54; - selp.f32 %f154, %f152, %f150, %p11; - fma.rn.ftz.f32 %f36, %f37, %f154, %f36; -$Lt_1_30210: -$Lt_1_29698: - ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r64, 0; - setp.le.s32 %p12, %r63, %r64; - @%p12 bra $Lt_1_30722; - .loc 16 264 0 - mov.f32 %f155, %f7; - mul.ftz.f32 %f156, %f50, %f50; - fma.rn.ftz.f32 %f157, %f143, %f156, %f155; - mov.f32 %f7, %f157; - .loc 16 265 0 - mov.f32 %f158, %f9; - fma.rn.ftz.f32 %f159, %f143, %f52, %f158; - mov.f32 %f9, %f159; - .loc 16 266 0 - mov.f32 %f160, %f11; - mul.ftz.f32 %f161, %f51, %f51; - fma.rn.ftz.f32 %f162, %f143, %f161, %f160; - mov.f32 %f11, %f162; - .loc 16 267 0 - mov.f32 %f163, %f13; - mul.ftz.f32 %f164, %f49, %f50; - fma.rn.ftz.f32 %f165, %f143, %f164, %f163; - mov.f32 %f13, %f165; - .loc 16 268 0 - mov.f32 %f166, %f15; - mul.ftz.f32 %f167, %f50, %f51; - fma.rn.ftz.f32 %f168, %f143, %f167, %f166; - mov.f32 %f15, %f168; - .loc 16 269 0 - mul.ftz.f32 %f169, %f49, %f51; - fma.rn.ftz.f32 %f16, %f143, %f169, %f16; - mov.f32 %f17, %f16; -$Lt_1_30722: -$Lt_1_27650: - .loc 16 192 0 - mul.lo.u64 %rd47, %rd37, 4; - add.u64 %rd30, %rd30, %rd47; - setp.lt.u64 %p13, %rd30, %rd29; - @%p13 bra $Lt_1_27394; - bra.uni $Lt_1_26882; -$Lt_1_37122: - mov.f32 %f32, 0f00000000; // 0 - mov.f32 %f33, 0f00000000; // 0 - mov.f32 %f34, 0f00000000; // 0 - mov.f32 %f35, 0f00000000; // 0 - mov.f32 %f36, 0f00000000; // 0 -$Lt_1_26882: - mov.u32 %r65, 1; - setp.le.s32 %p14, %r5, %r65; - @%p14 bra $Lt_1_33538; - .loc 16 274 0 - mov.u64 %rd48, __cuda___cuda_local_var_32775_55_non_const_red_acc4392; - mul.lo.u64 %rd49, %rd1, 4; - add.u64 %rd50, %rd48, %rd49; - mov.f32 %f170, %f34; - st.shared.f32 [%rd50+0], %f170; - mov.f32 %f171, %f33; - st.shared.f32 [%rd50+512], %f171; - mov.f32 %f172, %f32; - st.shared.f32 [%rd50+1024], %f172; - mov.f32 %f173, %f36; - st.shared.f32 [%rd50+1536], %f173; - mov.f32 %f174, %f35; - st.shared.f32 [%rd50+2048], %f174; - shr.s32 %r66, %r5, 31; - mov.s32 %r67, 1; - and.b32 %r68, %r66, %r67; - add.s32 %r69, %r68, %r5; - shr.s32 %r70, %r69, 1; - mov.s32 %r71, %r70; - mov.u32 %r72, 0; - setp.ne.u32 %p15, %r70, %r72; - @!%p15 bra $Lt_1_32002; -$Lt_1_32514: - setp.ge.u32 %p16, %r17, %r71; - @%p16 bra $Lt_1_32770; - add.u32 %r73, %r1, %r71; - cvt.u64.u32 %rd51, %r73; - mul.wide.u32 %rd52, %r73, 4; - add.u64 %rd53, %rd48, %rd52; - ld.shared.f32 %f175, [%rd53+0]; - add.ftz.f32 %f170, %f175, %f170; - st.shared.f32 [%rd50+0], %f170; - ld.shared.f32 %f176, [%rd53+512]; - add.ftz.f32 %f171, %f176, %f171; - st.shared.f32 [%rd50+512], %f171; - ld.shared.f32 %f177, [%rd53+1024]; - add.ftz.f32 %f172, %f177, %f172; - st.shared.f32 [%rd50+1024], %f172; - ld.shared.f32 %f178, [%rd53+1536]; - add.ftz.f32 %f173, %f178, %f173; - st.shared.f32 [%rd50+1536], %f173; - ld.shared.f32 %f179, [%rd53+2048]; - add.ftz.f32 %f174, %f179, %f174; - st.shared.f32 [%rd50+2048], %f174; -$Lt_1_32770: - shr.u32 %r71, %r71, 1; - mov.u32 %r74, 0; - setp.ne.u32 %p17, %r71, %r74; - @%p17 bra $Lt_1_32514; -$Lt_1_32002: - mov.f32 %f34, %f170; - mov.f32 %f33, %f171; - mov.f32 %f32, %f172; - mov.f32 %f36, %f173; - mov.f32 %f35, %f174; - ld.param.s32 %r75, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r76, 0; - setp.le.s32 %p18, %r75, %r76; - @%p18 bra $Lt_1_33538; - mov.f32 %f170, %f7; - st.shared.f32 [%rd50+0], %f170; - mov.f32 %f171, %f9; - st.shared.f32 [%rd50+512], %f171; - mov.f32 %f172, %f11; - st.shared.f32 [%rd50+1024], %f172; - mov.f32 %f173, %f13; - st.shared.f32 [%rd50+1536], %f173; - mov.f32 %f174, %f15; - st.shared.f32 [%rd50+2048], %f174; - mov.f32 %f180, %f16; - st.shared.f32 [%rd50+2560], %f180; - mov.s32 %r77, %r70; - @!%p15 bra $Lt_1_34050; -$Lt_1_34562: - setp.ge.u32 %p19, %r17, %r77; - @%p19 bra $Lt_1_34818; - add.u32 %r78, %r1, %r77; - cvt.u64.u32 %rd54, %r78; - mul.wide.u32 %rd55, %r78, 4; - add.u64 %rd56, %rd48, %rd55; - ld.shared.f32 %f181, [%rd56+0]; - add.ftz.f32 %f170, %f181, %f170; - st.shared.f32 [%rd50+0], %f170; - ld.shared.f32 %f182, [%rd56+512]; - add.ftz.f32 %f171, %f182, %f171; - st.shared.f32 [%rd50+512], %f171; - ld.shared.f32 %f183, [%rd56+1024]; - add.ftz.f32 %f172, %f183, %f172; - st.shared.f32 [%rd50+1024], %f172; - ld.shared.f32 %f184, [%rd56+1536]; - add.ftz.f32 %f173, %f184, %f173; - st.shared.f32 [%rd50+1536], %f173; - ld.shared.f32 %f185, [%rd56+2048]; - add.ftz.f32 %f174, %f185, %f174; - st.shared.f32 [%rd50+2048], %f174; - ld.shared.f32 %f186, [%rd56+2560]; - add.ftz.f32 %f180, %f186, %f180; - st.shared.f32 [%rd50+2560], %f180; -$Lt_1_34818: - shr.u32 %r77, %r77, 1; - mov.u32 %r79, 0; - setp.ne.u32 %p20, %r77, %r79; - @%p20 bra $Lt_1_34562; -$Lt_1_34050: - mov.f32 %f7, %f170; - mov.f32 %f9, %f171; - mov.f32 %f11, %f172; - mov.f32 %f13, %f173; - mov.f32 %f15, %f174; - mov.f32 %f17, %f180; -$Lt_1_33538: -$Lt_1_31490: - mov.u32 %r80, 0; - setp.ne.s32 %p21, %r17, %r80; - @%p21 bra $Lt_1_35586; - ld.param.u64 %rd57, [__cudaparm_kernel_pair_fast___val_paramengv]; - add.u64 %rd58, %rd57, %rd13; - ld.param.s32 %r81, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r82, 0; - setp.le.s32 %p22, %r81, %r82; - @%p22 bra $Lt_1_36098; - st.global.f32 [%rd58+0], %f36; - cvt.s64.s32 %rd59, %r12; - mul.wide.s32 %rd60, %r12, 4; - add.u64 %rd61, %rd60, %rd58; - st.global.f32 [%rd61+0], %f35; - add.u64 %rd58, %rd60, %rd61; -$Lt_1_36098: - ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r84, 0; - setp.le.s32 %p23, %r83, %r84; - @%p23 bra $Lt_1_36610; - mov.f32 %f187, %f7; - st.global.f32 [%rd58+0], %f187; - cvt.s64.s32 %rd62, %r12; - mul.wide.s32 %rd63, %r12, 4; - add.u64 %rd64, %rd63, %rd58; - mov.f32 %f188, %f9; - st.global.f32 [%rd64+0], %f188; - add.u64 %rd65, %rd63, %rd64; - mov.f32 %f189, %f11; - st.global.f32 [%rd65+0], %f189; - add.u64 %rd66, %rd63, %rd65; - mov.f32 %f190, %f13; - st.global.f32 [%rd66+0], %f190; - add.u64 %rd58, %rd63, %rd66; - mov.f32 %f191, %f15; - st.global.f32 [%rd58+0], %f191; - mov.f32 %f192, %f17; - add.u64 %rd67, %rd63, %rd58; - st.global.f32 [%rd67+0], %f192; -$Lt_1_36610: - ld.param.u64 %rd68, [__cudaparm_kernel_pair_fast_ans]; - mul.lo.u64 %rd69, %rd12, 16; - add.u64 %rd70, %rd68, %rd69; - mov.f32 %f193, %f194; - st.global.v4.f32 [%rd70+0], {%f34,%f33,%f32,%f193}; -$Lt_1_35586: -$Lt_1_25858: - .loc 16 277 0 - exit; -$LDWend_kernel_pair_fast: - } // kernel_pair_fast - diff --git a/lib/gpu/charmm_long_ptx.h b/lib/gpu/charmm_long_ptx.h deleted file mode 100644 index 92b2bf8ca1..0000000000 --- a/lib/gpu/charmm_long_ptx.h +++ /dev/null @@ -1,1139 +0,0 @@ -const char * charmm_long = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .global .texref pos_tex;\n" -" .global .texref q_tex;\n" -" .entry kernel_pair (\n" -" .param .u64 __cudaparm_kernel_pair_x_,\n" -" .param .u64 __cudaparm_kernel_pair_lj1,\n" -" .param .s32 __cudaparm_kernel_pair_lj_types,\n" -" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_ans,\n" -" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_inum,\n" -" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" -" .param .u64 __cudaparm_kernel_pair_q_,\n" -" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n" -" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n" -" .param .f32 __cudaparm_kernel_pair_g_ewald,\n" -" .param .f32 __cudaparm_kernel_pair_denom_lj,\n" -" .param .f32 __cudaparm_kernel_pair_cut_bothsq,\n" -" .param .f32 __cudaparm_kernel_pair_cut_ljsq,\n" -" .param .f32 __cudaparm_kernel_pair_cut_lj_innersq,\n" -" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" -" {\n" -" .reg .u32 %r<91>;\n" -" .reg .u64 %rd<65>;\n" -" .reg .f32 %f<190>;\n" -" .reg .pred %p<23>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32542_33_non_const_sp_lj120[32];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32646_55_non_const_red_acc152[3072];\n" -" .loc 16 37 0\n" -"$LDWbegin_kernel_pair:\n" -" .loc 16 42 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" -" ldu.global.f32 %f1, [%rd1+0];\n" -" .loc 16 43 0\n" -" ld.global.f32 %f2, [%rd1+4];\n" -" .loc 16 44 0\n" -" ld.global.f32 %f3, [%rd1+8];\n" -" .loc 16 45 0\n" -" ld.global.f32 %f4, [%rd1+12];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32542_33_non_const_sp_lj120+0], {%f1,%f2,%f3,%f4};\n" -" .loc 16 46 0\n" -" ld.global.f32 %f5, [%rd1+16];\n" -" .loc 16 47 0\n" -" ld.global.f32 %f6, [%rd1+20];\n" -" .loc 16 48 0\n" -" ld.global.f32 %f7, [%rd1+24];\n" -" .loc 16 49 0\n" -" ld.global.f32 %f8, [%rd1+28];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32542_33_non_const_sp_lj120+16], {%f5,%f6,%f7,%f8};\n" -" .loc 16 57 0\n" -" mov.f32 %f9, 0f00000000; \n" -" mov.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f00000000; \n" -" mov.f32 %f12, %f11;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, %f13;\n" -" mov.f32 %f15, 0f00000000; \n" -" mov.f32 %f16, %f15;\n" -" mov.f32 %f17, 0f00000000; \n" -" mov.f32 %f18, %f17;\n" -" mov.f32 %f19, 0f00000000; \n" -" mov.f32 %f20, %f19;\n" -" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n" -" setp.ge.s32 %p1, %r8, %r9;\n" -" @%p1 bra $Lt_0_33026;\n" -" .loc 16 62 0\n" -" cvt.s64.s32 %rd2, %r8;\n" -" mul.wide.s32 %rd3, %r8, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n" -" add.u64 %rd5, %rd3, %rd4;\n" -" ld.global.s32 %r10, [%rd5+0];\n" -" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n" -" cvt.s64.s32 %rd6, %r11;\n" -" mul.wide.s32 %rd7, %r11, 4;\n" -" add.u64 %rd8, %rd7, %rd5;\n" -" ld.global.s32 %r12, [%rd8+0];\n" -" sub.s32 %r13, %r1, 1;\n" -" and.b32 %r14, %r13, %r2;\n" -" cvt.s64.s32 %rd9, %r14;\n" -" mul.wide.s32 %rd10, %r14, 4;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n" -" setp.ne.u64 %p2, %rd11, %rd4;\n" -" @%p2 bra $Lt_0_24066;\n" -" cvt.s32.s64 %r15, %rd6;\n" -" mul.lo.s32 %r16, %r15, %r1;\n" -" mov.s32 %r17, %r16;\n" -" mul.lo.s32 %r18, %r13, %r8;\n" -" add.s32 %r19, %r15, %r18;\n" -" cvt.s64.s32 %rd12, %r19;\n" -" mul.wide.s32 %rd13, %r19, 4;\n" -" add.u64 %rd14, %rd8, %rd13;\n" -" and.b32 %r20, %r13, %r12;\n" -" cvt.s64.s32 %rd15, %r20;\n" -" div.s32 %r21, %r12, %r1;\n" -" mul.lo.s32 %r22, %r16, %r21;\n" -" cvt.s64.s32 %rd16, %r22;\n" -" add.u64 %rd17, %rd15, %rd16;\n" -" mul.lo.u64 %rd18, %rd17, 4;\n" -" add.u64 %rd19, %rd14, %rd18;\n" -" add.u64 %rd20, %rd10, %rd14;\n" -" bra.uni $Lt_0_23810;\n" -"$Lt_0_24066:\n" -" add.u64 %rd21, %rd7, %rd8;\n" -" ld.global.s32 %r23, [%rd21+0];\n" -" cvt.s64.s32 %rd22, %r23;\n" -" mul.wide.s32 %rd23, %r23, 4;\n" -" add.u64 %rd24, %rd11, %rd23;\n" -" cvt.s64.s32 %rd25, %r12;\n" -" mul.wide.s32 %rd26, %r12, 4;\n" -" add.u64 %rd19, %rd24, %rd26;\n" -" mov.s32 %r17, %r1;\n" -" add.u64 %rd20, %rd10, %rd24;\n" -"$Lt_0_23810:\n" -" .loc 16 65 0\n" -" mov.u32 %r24, %r10;\n" -" mov.s32 %r25, 0;\n" -" mov.u32 %r26, %r25;\n" -" mov.s32 %r27, 0;\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r24,%r26,%r28,%r30}];\n" -" mov.f32 %f25, %f21;\n" -" mov.f32 %f26, %f22;\n" -" mov.f32 %f27, %f23;\n" -" mov.f32 %f28, %f24;\n" -" .loc 16 66 0\n" -" mov.u32 %r31, %r10;\n" -" mov.s32 %r32, 0;\n" -" mov.u32 %r33, %r32;\n" -" mov.s32 %r34, 0;\n" -" mov.u32 %r35, %r34;\n" -" mov.s32 %r36, 0;\n" -" mov.u32 %r37, %r36;\n" -" tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r31,%r33,%r35,%r37}];\n" -" mov.f32 %f33, %f29;\n" -" setp.ge.u64 %p3, %rd20, %rd19;\n" -" @%p3 bra $Lt_0_34562;\n" -" cvt.s64.s32 %rd27, %r17;\n" -" ld.param.f32 %f34, [__cudaparm_kernel_pair_cut_bothsq];\n" -" mov.f32 %f35, 0f00000000; \n" -" mov.f32 %f36, 0f00000000; \n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -" mov.f32 %f39, 0f00000000; \n" -" mov.u64 %rd28, __cuda___cuda_local_var_32542_33_non_const_sp_lj120;\n" -"$Lt_0_24834:\n" -" .loc 16 70 0\n" -" ld.global.s32 %r38, [%rd20+0];\n" -" .loc 16 73 0\n" -" shr.s32 %r39, %r38, 30;\n" -" and.b32 %r40, %r39, 3;\n" -" cvt.s64.s32 %rd29, %r40;\n" -" mul.wide.s32 %rd30, %r40, 4;\n" -" add.u64 %rd31, %rd28, %rd30;\n" -" ld.shared.f32 %f40, [%rd31+0];\n" -" .loc 16 74 0\n" -" mov.f32 %f41, 0f3f800000; \n" -" ld.shared.f32 %f42, [%rd31+16];\n" -" sub.ftz.f32 %f43, %f41, %f42;\n" -" .loc 16 77 0\n" -" and.b32 %r41, %r38, 1073741823;\n" -" mov.u32 %r42, %r41;\n" -" mov.s32 %r43, 0;\n" -" mov.u32 %r44, %r43;\n" -" mov.s32 %r45, 0;\n" -" mov.u32 %r46, %r45;\n" -" mov.s32 %r47, 0;\n" -" mov.u32 %r48, %r47;\n" -" tex.1d.v4.f32.s32 {%f44,%f45,%f46,%f47},[pos_tex,{%r42,%r44,%r46,%r48}];\n" -" mov.f32 %f48, %f44;\n" -" mov.f32 %f49, %f45;\n" -" mov.f32 %f50, %f46;\n" -" mov.f32 %f51, %f47;\n" -" sub.ftz.f32 %f52, %f26, %f49;\n" -" sub.ftz.f32 %f53, %f25, %f48;\n" -" sub.ftz.f32 %f54, %f27, %f50;\n" -" mul.ftz.f32 %f55, %f52, %f52;\n" -" fma.rn.ftz.f32 %f56, %f53, %f53, %f55;\n" -" fma.rn.ftz.f32 %f57, %f54, %f54, %f56;\n" -" setp.lt.ftz.f32 %p4, %f57, %f34;\n" -" @!%p4 bra $Lt_0_28162;\n" -" ld.param.f32 %f58, [__cudaparm_kernel_pair_cut_ljsq];\n" -" setp.lt.ftz.f32 %p5, %f57, %f58;\n" -" rcp.approx.ftz.f32 %f59, %f57;\n" -" @!%p5 bra $Lt_0_25858;\n" -" .loc 16 92 0\n" -" mul.ftz.f32 %f60, %f59, %f59;\n" -" mul.ftz.f32 %f61, %f59, %f60;\n" -" mov.f32 %f62, %f61;\n" -" .loc 16 93 0\n" -" cvt.rzi.ftz.s32.f32 %r49, %f51;\n" -" cvt.rzi.ftz.s32.f32 %r50, %f28;\n" -" ld.param.u64 %rd32, [__cudaparm_kernel_pair_lj1];\n" -" ld.param.s32 %r51, [__cudaparm_kernel_pair_lj_types];\n" -" mul.lo.s32 %r52, %r51, %r50;\n" -" add.s32 %r53, %r49, %r52;\n" -" cvt.s64.s32 %rd33, %r53;\n" -" mul.wide.s32 %rd34, %r53, 16;\n" -" add.u64 %rd35, %rd32, %rd34;\n" -" mul.ftz.f32 %f63, %f61, %f40;\n" -" ld.global.v2.f32 {%f64,%f65}, [%rd35+0];\n" -" mul.ftz.f32 %f66, %f64, %f61;\n" -" sub.ftz.f32 %f67, %f66, %f65;\n" -" mul.ftz.f32 %f68, %f63, %f67;\n" -" ld.param.f32 %f69, [__cudaparm_kernel_pair_cut_lj_innersq];\n" -" setp.gt.ftz.f32 %p6, %f57, %f69;\n" -" @!%p6 bra $Lt_0_25602;\n" -" .loc 16 99 0\n" -" add.ftz.f32 %f70, %f57, %f57;\n" -" sub.ftz.f32 %f71, %f58, %f57;\n" -" add.ftz.f32 %f72, %f70, %f58;\n" -" mul.ftz.f32 %f73, %f71, %f71;\n" -" mov.f32 %f74, 0f40400000; \n" -" mul.ftz.f32 %f75, %f74, %f69;\n" -" sub.ftz.f32 %f76, %f72, %f75;\n" -" ld.param.f32 %f77, [__cudaparm_kernel_pair_denom_lj];\n" -" div.approx.ftz.f32 %f78, %f76, %f77;\n" -" mul.ftz.f32 %f79, %f73, %f78;\n" -" mov.f32 %f80, %f79;\n" -" .loc 16 102 0\n" -" mov.f32 %f81, 0f41400000; \n" -" mul.ftz.f32 %f82, %f57, %f81;\n" -" mul.ftz.f32 %f83, %f71, %f82;\n" -" sub.ftz.f32 %f84, %f57, %f69;\n" -" mul.ftz.f32 %f85, %f83, %f84;\n" -" div.approx.ftz.f32 %f86, %f85, %f77;\n" -" ld.global.v2.f32 {%f87,%f88}, [%rd35+8];\n" -" mul.ftz.f32 %f89, %f87, %f61;\n" -" sub.ftz.f32 %f90, %f89, %f88;\n" -" mul.ftz.f32 %f91, %f61, %f90;\n" -" mul.ftz.f32 %f92, %f86, %f91;\n" -" fma.rn.ftz.f32 %f68, %f68, %f79, %f92;\n" -" bra.uni $Lt_0_25602;\n" -"$Lt_0_25858:\n" -" .loc 16 105 0\n" -" mov.f32 %f68, 0f00000000; \n" -"$Lt_0_25602:\n" -" ld.param.f32 %f93, [__cudaparm_kernel_pair_cut_coulsq];\n" -" setp.gt.ftz.f32 %p7, %f93, %f57;\n" -" @!%p7 bra $Lt_0_26882;\n" -" .loc 20 518 0\n" -" rsqrt.approx.ftz.f32 %f94, %f59;\n" -" ld.param.f32 %f95, [__cudaparm_kernel_pair_g_ewald];\n" -" mul.ftz.f32 %f96, %f95, %f94;\n" -" mul.ftz.f32 %f97, %f96, %f96;\n" -" neg.ftz.f32 %f98, %f97;\n" -" mov.f32 %f99, 0f3fb8aa3b; \n" -" mul.ftz.f32 %f100, %f98, %f99;\n" -" ex2.approx.ftz.f32 %f101, %f100;\n" -" .loc 16 112 0\n" -" mov.f32 %f102, 0f3f800000; \n" -" mov.f32 %f103, 0f3ea7ba05; \n" -" fma.rn.ftz.f32 %f104, %f103, %f96, %f102;\n" -" rcp.approx.ftz.f32 %f105, %f104;\n" -" mov.f32 %f106, 0f3e827906; \n" -" mov.f32 %f107, 0fbe91a98e; \n" -" mov.f32 %f108, 0f3fb5f0e3; \n" -" mov.f32 %f109, 0fbfba00e3; \n" -" mov.f32 %f110, 0f3f87dc22; \n" -" fma.rn.ftz.f32 %f111, %f110, %f105, %f109;\n" -" fma.rn.ftz.f32 %f112, %f105, %f111, %f108;\n" -" fma.rn.ftz.f32 %f113, %f105, %f112, %f107;\n" -" fma.rn.ftz.f32 %f114, %f105, %f113, %f106;\n" -" mul.ftz.f32 %f115, %f105, %f114;\n" -" mul.ftz.f32 %f116, %f101, %f115;\n" -" mov.f32 %f117, %f116;\n" -" .loc 16 113 0\n" -" mov.u32 %r54, %r41;\n" -" mov.s32 %r55, 0;\n" -" mov.u32 %r56, %r55;\n" -" mov.s32 %r57, 0;\n" -" mov.u32 %r58, %r57;\n" -" mov.s32 %r59, 0;\n" -" mov.u32 %r60, %r59;\n" -" tex.1d.v4.f32.s32 {%f118,%f119,%f120,%f121},[q_tex,{%r54,%r56,%r58,%r60}];\n" -" mov.f32 %f122, %f118;\n" -" ld.param.f32 %f123, [__cudaparm_kernel_pair_qqrd2e];\n" -" mul.ftz.f32 %f124, %f123, %f33;\n" -" mul.ftz.f32 %f125, %f124, %f122;\n" -" div.approx.ftz.f32 %f126, %f125, %f94;\n" -" mov.f32 %f127, %f126;\n" -" .loc 16 114 0\n" -" mov.f32 %f128, 0f3f906ebb; \n" -" mul.ftz.f32 %f129, %f96, %f128;\n" -" fma.rn.ftz.f32 %f130, %f101, %f129, %f116;\n" -" sub.ftz.f32 %f131, %f130, %f43;\n" -" mul.ftz.f32 %f132, %f126, %f131;\n" -" bra.uni $Lt_0_26626;\n" -"$Lt_0_26882:\n" -" .loc 16 116 0\n" -" mov.f32 %f132, 0f00000000; \n" -"$Lt_0_26626:\n" -" .loc 16 120 0\n" -" add.ftz.f32 %f133, %f132, %f68;\n" -" mul.ftz.f32 %f134, %f133, %f59;\n" -" fma.rn.ftz.f32 %f37, %f53, %f134, %f37;\n" -" .loc 16 121 0\n" -" fma.rn.ftz.f32 %f36, %f52, %f134, %f36;\n" -" .loc 16 122 0\n" -" fma.rn.ftz.f32 %f35, %f54, %f134, %f35;\n" -" ld.param.s32 %r61, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r62, 0;\n" -" setp.le.s32 %p8, %r61, %r62;\n" -" @%p8 bra $Lt_0_27650;\n" -" .loc 16 125 0\n" -" mov.f32 %f135, %f127;\n" -" mov.f32 %f136, %f117;\n" -" sub.ftz.f32 %f137, %f136, %f43;\n" -" fma.rn.ftz.f32 %f138, %f135, %f137, %f38;\n" -" selp.f32 %f38, %f138, %f38, %p7;\n" -" @!%p5 bra $Lt_0_27650;\n" -" .loc 16 128 0\n" -" cvt.rzi.ftz.s32.f32 %r63, %f51;\n" -" cvt.rzi.ftz.s32.f32 %r64, %f28;\n" -" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj1];\n" -" ld.param.s32 %r65, [__cudaparm_kernel_pair_lj_types];\n" -" mul.lo.s32 %r66, %r65, %r64;\n" -" add.s32 %r67, %r63, %r66;\n" -" cvt.s64.s32 %rd37, %r67;\n" -" mul.wide.s32 %rd38, %r67, 16;\n" -" add.u64 %rd35, %rd36, %rd38;\n" -" mov.f32 %f139, %f62;\n" -" ld.global.v2.f32 {%f140,%f141}, [%rd35+8];\n" -" mul.ftz.f32 %f142, %f140, %f139;\n" -" sub.ftz.f32 %f143, %f142, %f141;\n" -" mul.ftz.f32 %f144, %f139, %f143;\n" -" mov.f32 %f145, %f80;\n" -" mul.ftz.f32 %f146, %f145, %f144;\n" -" ld.param.f32 %f147, [__cudaparm_kernel_pair_cut_lj_innersq];\n" -" setp.lt.ftz.f32 %p9, %f147, %f57;\n" -" selp.f32 %f148, %f146, %f144, %p9;\n" -" .loc 16 131 0\n" -" fma.rn.ftz.f32 %f39, %f40, %f148, %f39;\n" -"$Lt_0_27650:\n" -"$Lt_0_27138:\n" -" ld.param.s32 %r68, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r69, 0;\n" -" setp.le.s32 %p10, %r68, %r69;\n" -" @%p10 bra $Lt_0_28162;\n" -" .loc 16 135 0\n" -" mov.f32 %f149, %f10;\n" -" mul.ftz.f32 %f150, %f53, %f53;\n" -" fma.rn.ftz.f32 %f151, %f134, %f150, %f149;\n" -" mov.f32 %f10, %f151;\n" -" .loc 16 136 0\n" -" mov.f32 %f152, %f12;\n" -" fma.rn.ftz.f32 %f153, %f134, %f55, %f152;\n" -" mov.f32 %f12, %f153;\n" -" .loc 16 137 0\n" -" mov.f32 %f154, %f14;\n" -" mul.ftz.f32 %f155, %f54, %f54;\n" -" fma.rn.ftz.f32 %f156, %f134, %f155, %f154;\n" -" mov.f32 %f14, %f156;\n" -" .loc 16 138 0\n" -" mov.f32 %f157, %f16;\n" -" mul.ftz.f32 %f158, %f52, %f53;\n" -" fma.rn.ftz.f32 %f159, %f134, %f158, %f157;\n" -" mov.f32 %f16, %f159;\n" -" .loc 16 139 0\n" -" mov.f32 %f160, %f18;\n" -" mul.ftz.f32 %f161, %f53, %f54;\n" -" fma.rn.ftz.f32 %f162, %f134, %f161, %f160;\n" -" mov.f32 %f18, %f162;\n" -" .loc 16 140 0\n" -" mul.ftz.f32 %f163, %f52, %f54;\n" -" fma.rn.ftz.f32 %f19, %f134, %f163, %f19;\n" -" mov.f32 %f20, %f19;\n" -"$Lt_0_28162:\n" -"$Lt_0_25090:\n" -" .loc 16 69 0\n" -" mul.lo.u64 %rd39, %rd27, 4;\n" -" add.u64 %rd20, %rd20, %rd39;\n" -" setp.lt.u64 %p11, %rd20, %rd19;\n" -" @%p11 bra $Lt_0_24834;\n" -" bra.uni $Lt_0_24322;\n" -"$Lt_0_34562:\n" -" mov.f32 %f35, 0f00000000; \n" -" mov.f32 %f36, 0f00000000; \n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -" mov.f32 %f39, 0f00000000; \n" -"$Lt_0_24322:\n" -" mov.u32 %r70, 1;\n" -" setp.le.s32 %p12, %r1, %r70;\n" -" @%p12 bra $Lt_0_30978;\n" -" .loc 16 145 0\n" -" mov.u64 %rd40, __cuda___cuda_local_var_32646_55_non_const_red_acc152;\n" -" cvt.s64.s32 %rd41, %r2;\n" -" mul.wide.s32 %rd42, %r2, 4;\n" -" add.u64 %rd43, %rd40, %rd42;\n" -" mov.f32 %f164, %f37;\n" -" st.shared.f32 [%rd43+0], %f164;\n" -" mov.f32 %f165, %f36;\n" -" st.shared.f32 [%rd43+512], %f165;\n" -" mov.f32 %f166, %f35;\n" -" st.shared.f32 [%rd43+1024], %f166;\n" -" mov.f32 %f167, %f39;\n" -" st.shared.f32 [%rd43+1536], %f167;\n" -" mov.f32 %f168, %f38;\n" -" st.shared.f32 [%rd43+2048], %f168;\n" -" shr.s32 %r71, %r1, 31;\n" -" mov.s32 %r72, 1;\n" -" and.b32 %r73, %r71, %r72;\n" -" add.s32 %r74, %r73, %r1;\n" -" shr.s32 %r75, %r74, 1;\n" -" mov.s32 %r76, %r75;\n" -" mov.u32 %r77, 0;\n" -" setp.ne.u32 %p13, %r75, %r77;\n" -" @!%p13 bra $Lt_0_29442;\n" -"$Lt_0_29954:\n" -" setp.ge.u32 %p14, %r14, %r76;\n" -" @%p14 bra $Lt_0_30210;\n" -" add.u32 %r78, %r2, %r76;\n" -" cvt.u64.u32 %rd44, %r78;\n" -" mul.wide.u32 %rd45, %r78, 4;\n" -" add.u64 %rd46, %rd40, %rd45;\n" -" ld.shared.f32 %f169, [%rd46+0];\n" -" add.ftz.f32 %f164, %f169, %f164;\n" -" st.shared.f32 [%rd43+0], %f164;\n" -" ld.shared.f32 %f170, [%rd46+512];\n" -" add.ftz.f32 %f165, %f170, %f165;\n" -" st.shared.f32 [%rd43+512], %f165;\n" -" ld.shared.f32 %f171, [%rd46+1024];\n" -" add.ftz.f32 %f166, %f171, %f166;\n" -" st.shared.f32 [%rd43+1024], %f166;\n" -" ld.shared.f32 %f172, [%rd46+1536];\n" -" add.ftz.f32 %f167, %f172, %f167;\n" -" st.shared.f32 [%rd43+1536], %f167;\n" -" ld.shared.f32 %f173, [%rd46+2048];\n" -" add.ftz.f32 %f168, %f173, %f168;\n" -" st.shared.f32 [%rd43+2048], %f168;\n" -"$Lt_0_30210:\n" -" shr.u32 %r76, %r76, 1;\n" -" mov.u32 %r79, 0;\n" -" setp.ne.u32 %p15, %r76, %r79;\n" -" @%p15 bra $Lt_0_29954;\n" -"$Lt_0_29442:\n" -" mov.f32 %f37, %f164;\n" -" mov.f32 %f36, %f165;\n" -" mov.f32 %f35, %f166;\n" -" mov.f32 %f39, %f167;\n" -" mov.f32 %f38, %f168;\n" -" ld.param.s32 %r80, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r81, 0;\n" -" setp.le.s32 %p16, %r80, %r81;\n" -" @%p16 bra $Lt_0_30978;\n" -" mov.f32 %f164, %f10;\n" -" st.shared.f32 [%rd43+0], %f164;\n" -" mov.f32 %f165, %f12;\n" -" st.shared.f32 [%rd43+512], %f165;\n" -" mov.f32 %f166, %f14;\n" -" st.shared.f32 [%rd43+1024], %f166;\n" -" mov.f32 %f167, %f16;\n" -" st.shared.f32 [%rd43+1536], %f167;\n" -" mov.f32 %f168, %f18;\n" -" st.shared.f32 [%rd43+2048], %f168;\n" -" mov.f32 %f174, %f19;\n" -" st.shared.f32 [%rd43+2560], %f174;\n" -" mov.s32 %r82, %r75;\n" -" @!%p13 bra $Lt_0_31490;\n" -"$Lt_0_32002:\n" -" setp.ge.u32 %p17, %r14, %r82;\n" -" @%p17 bra $Lt_0_32258;\n" -" add.u32 %r83, %r2, %r82;\n" -" cvt.u64.u32 %rd47, %r83;\n" -" mul.wide.u32 %rd48, %r83, 4;\n" -" add.u64 %rd49, %rd40, %rd48;\n" -" ld.shared.f32 %f175, [%rd49+0];\n" -" add.ftz.f32 %f164, %f175, %f164;\n" -" st.shared.f32 [%rd43+0], %f164;\n" -" ld.shared.f32 %f176, [%rd49+512];\n" -" add.ftz.f32 %f165, %f176, %f165;\n" -" st.shared.f32 [%rd43+512], %f165;\n" -" ld.shared.f32 %f177, [%rd49+1024];\n" -" add.ftz.f32 %f166, %f177, %f166;\n" -" st.shared.f32 [%rd43+1024], %f166;\n" -" ld.shared.f32 %f178, [%rd49+1536];\n" -" add.ftz.f32 %f167, %f178, %f167;\n" -" st.shared.f32 [%rd43+1536], %f167;\n" -" ld.shared.f32 %f179, [%rd49+2048];\n" -" add.ftz.f32 %f168, %f179, %f168;\n" -" st.shared.f32 [%rd43+2048], %f168;\n" -" ld.shared.f32 %f180, [%rd49+2560];\n" -" add.ftz.f32 %f174, %f180, %f174;\n" -" st.shared.f32 [%rd43+2560], %f174;\n" -"$Lt_0_32258:\n" -" shr.u32 %r82, %r82, 1;\n" -" mov.u32 %r84, 0;\n" -" setp.ne.u32 %p18, %r82, %r84;\n" -" @%p18 bra $Lt_0_32002;\n" -"$Lt_0_31490:\n" -" mov.f32 %f10, %f164;\n" -" mov.f32 %f12, %f165;\n" -" mov.f32 %f14, %f166;\n" -" mov.f32 %f16, %f167;\n" -" mov.f32 %f18, %f168;\n" -" mov.f32 %f20, %f174;\n" -"$Lt_0_30978:\n" -"$Lt_0_28930:\n" -" mov.u32 %r85, 0;\n" -" setp.ne.s32 %p19, %r14, %r85;\n" -" @%p19 bra $Lt_0_33026;\n" -" ld.param.u64 %rd50, [__cudaparm_kernel_pair___val_paramengv];\n" -" add.u64 %rd51, %rd50, %rd3;\n" -" ld.param.s32 %r86, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r87, 0;\n" -" setp.le.s32 %p20, %r86, %r87;\n" -" @%p20 bra $Lt_0_33538;\n" -" st.global.f32 [%rd51+0], %f39;\n" -" cvt.s64.s32 %rd52, %r9;\n" -" mul.wide.s32 %rd53, %r9, 4;\n" -" add.u64 %rd54, %rd53, %rd51;\n" -" st.global.f32 [%rd54+0], %f38;\n" -" add.u64 %rd51, %rd53, %rd54;\n" -"$Lt_0_33538:\n" -" ld.param.s32 %r88, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r89, 0;\n" -" setp.le.s32 %p21, %r88, %r89;\n" -" @%p21 bra $Lt_0_34050;\n" -" mov.f32 %f181, %f10;\n" -" st.global.f32 [%rd51+0], %f181;\n" -" cvt.s64.s32 %rd55, %r9;\n" -" mul.wide.s32 %rd56, %r9, 4;\n" -" add.u64 %rd57, %rd56, %rd51;\n" -" mov.f32 %f182, %f12;\n" -" st.global.f32 [%rd57+0], %f182;\n" -" add.u64 %rd58, %rd56, %rd57;\n" -" mov.f32 %f183, %f14;\n" -" st.global.f32 [%rd58+0], %f183;\n" -" add.u64 %rd59, %rd56, %rd58;\n" -" mov.f32 %f184, %f16;\n" -" st.global.f32 [%rd59+0], %f184;\n" -" add.u64 %rd51, %rd56, %rd59;\n" -" mov.f32 %f185, %f18;\n" -" st.global.f32 [%rd51+0], %f185;\n" -" mov.f32 %f186, %f20;\n" -" add.u64 %rd60, %rd56, %rd51;\n" -" st.global.f32 [%rd60+0], %f186;\n" -"$Lt_0_34050:\n" -" ld.param.u64 %rd61, [__cudaparm_kernel_pair_ans];\n" -" mul.lo.u64 %rd62, %rd2, 16;\n" -" add.u64 %rd63, %rd61, %rd62;\n" -" mov.f32 %f187, %f188;\n" -" st.global.v4.f32 [%rd63+0], {%f37,%f36,%f35,%f187};\n" -"$Lt_0_33026:\n" -"$Lt_0_23298:\n" -" .loc 16 148 0\n" -" exit;\n" -"$LDWend_kernel_pair:\n" -" }\n" -" .entry kernel_pair_fast (\n" -" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" -" .param .u64 __cudaparm_kernel_pair_fast_ljd_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" -" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" -" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" -" .param .u64 __cudaparm_kernel_pair_fast_q_,\n" -" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n" -" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n" -" .param .f32 __cudaparm_kernel_pair_fast_g_ewald,\n" -" .param .f32 __cudaparm_kernel_pair_fast_denom_lj,\n" -" .param .f32 __cudaparm_kernel_pair_fast_cut_bothsq,\n" -" .param .f32 __cudaparm_kernel_pair_fast_cut_ljsq,\n" -" .param .f32 __cudaparm_kernel_pair_fast_cut_lj_innersq,\n" -" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" -" {\n" -" .reg .u32 %r<86>;\n" -" .reg .u64 %rd<72>;\n" -" .reg .f32 %f<196>;\n" -" .reg .pred %p<25>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32666_33_non_const_sp_lj3336[32];\n" -" .shared .align 8 .b8 __cuda___cuda_local_var_32665_34_non_const_ljd3368[1024];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32775_55_non_const_red_acc4392[3072];\n" -" .loc 16 160 0\n" -"$LDWbegin_kernel_pair_fast:\n" -" cvt.s32.u32 %r1, %tid.x;\n" -" cvt.s64.s32 %rd1, %r1;\n" -" mov.u32 %r2, 7;\n" -" setp.gt.s32 %p1, %r1, %r2;\n" -" @%p1 bra $Lt_1_24834;\n" -" .loc 16 167 0\n" -" mov.u64 %rd2, __cuda___cuda_local_var_32666_33_non_const_sp_lj3336;\n" -" mul.lo.u64 %rd3, %rd1, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f32 %f1, [%rd5+0];\n" -" add.u64 %rd6, %rd3, %rd2;\n" -" st.shared.f32 [%rd6+0], %f1;\n" -"$Lt_1_24834:\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32665_34_non_const_ljd3368;\n" -" mov.u64 %rd2, __cuda___cuda_local_var_32666_33_non_const_sp_lj3336;\n" -" .loc 16 168 0\n" -" mul.lo.u64 %rd8, %rd1, 8;\n" -" ld.param.u64 %rd9, [__cudaparm_kernel_pair_fast_ljd_in];\n" -" add.u64 %rd10, %rd9, %rd8;\n" -" add.u64 %rd11, %rd8, %rd7;\n" -" ld.global.v2.f32 {%f2,%f3}, [%rd10+0];\n" -" st.shared.v2.f32 [%rd11+0], {%f2,%f3};\n" -" add.s32 %r3, %r1, 128;\n" -" mov.u32 %r4, 127;\n" -" setp.gt.s32 %p2, %r3, %r4;\n" -" @%p2 bra $Lt_1_25346;\n" -" ld.global.v2.f32 {%f4,%f5}, [%rd10+1024];\n" -" st.shared.v2.f32 [%rd11+1024], {%f4,%f5};\n" -"$Lt_1_25346:\n" -" .loc 16 178 0\n" -" mov.f32 %f6, 0f00000000; \n" -" mov.f32 %f7, %f6;\n" -" mov.f32 %f8, 0f00000000; \n" -" mov.f32 %f9, %f8;\n" -" mov.f32 %f10, 0f00000000; \n" -" mov.f32 %f11, %f10;\n" -" mov.f32 %f12, 0f00000000; \n" -" mov.f32 %f13, %f12;\n" -" mov.f32 %f14, 0f00000000; \n" -" mov.f32 %f15, %f14;\n" -" mov.f32 %f16, 0f00000000; \n" -" mov.f32 %f17, %f16;\n" -" .loc 16 180 0\n" -" bar.sync 0;\n" -" ld.param.s32 %r5, [__cudaparm_kernel_pair_fast_t_per_atom];\n" -" div.s32 %r6, %r1, %r5;\n" -" cvt.s32.u32 %r7, %ntid.x;\n" -" div.s32 %r8, %r7, %r5;\n" -" cvt.s32.u32 %r9, %ctaid.x;\n" -" mul.lo.s32 %r10, %r9, %r8;\n" -" add.s32 %r11, %r6, %r10;\n" -" ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_inum];\n" -" setp.ge.s32 %p3, %r11, %r12;\n" -" @%p3 bra $Lt_1_35586;\n" -" .loc 16 185 0\n" -" cvt.s64.s32 %rd12, %r11;\n" -" mul.wide.s32 %rd13, %r11, 4;\n" -" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_dev_nbor];\n" -" add.u64 %rd15, %rd13, %rd14;\n" -" ld.global.s32 %r13, [%rd15+0];\n" -" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" -" cvt.s64.s32 %rd16, %r14;\n" -" mul.wide.s32 %rd17, %r14, 4;\n" -" add.u64 %rd18, %rd17, %rd15;\n" -" ld.global.s32 %r15, [%rd18+0];\n" -" sub.s32 %r16, %r5, 1;\n" -" and.b32 %r17, %r16, %r1;\n" -" cvt.s64.s32 %rd19, %r17;\n" -" mul.wide.s32 %rd20, %r17, 4;\n" -" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_packed];\n" -" setp.ne.u64 %p4, %rd21, %rd14;\n" -" @%p4 bra $Lt_1_26626;\n" -" cvt.s32.s64 %r18, %rd16;\n" -" mul.lo.s32 %r19, %r18, %r5;\n" -" mov.s32 %r20, %r19;\n" -" mul.lo.s32 %r21, %r16, %r11;\n" -" add.s32 %r22, %r18, %r21;\n" -" cvt.s64.s32 %rd22, %r22;\n" -" mul.wide.s32 %rd23, %r22, 4;\n" -" add.u64 %rd24, %rd18, %rd23;\n" -" and.b32 %r23, %r16, %r15;\n" -" cvt.s64.s32 %rd25, %r23;\n" -" div.s32 %r24, %r15, %r5;\n" -" mul.lo.s32 %r25, %r19, %r24;\n" -" cvt.s64.s32 %rd26, %r25;\n" -" add.u64 %rd27, %rd25, %rd26;\n" -" mul.lo.u64 %rd28, %rd27, 4;\n" -" add.u64 %rd29, %rd24, %rd28;\n" -" add.u64 %rd30, %rd20, %rd24;\n" -" bra.uni $Lt_1_26370;\n" -"$Lt_1_26626:\n" -" add.u64 %rd31, %rd17, %rd18;\n" -" ld.global.s32 %r26, [%rd31+0];\n" -" cvt.s64.s32 %rd32, %r26;\n" -" mul.wide.s32 %rd33, %r26, 4;\n" -" add.u64 %rd34, %rd21, %rd33;\n" -" cvt.s64.s32 %rd35, %r15;\n" -" mul.wide.s32 %rd36, %r15, 4;\n" -" add.u64 %rd29, %rd34, %rd36;\n" -" mov.s32 %r20, %r5;\n" -" add.u64 %rd30, %rd20, %rd34;\n" -"$Lt_1_26370:\n" -" .loc 16 188 0\n" -" mov.u32 %r27, %r13;\n" -" mov.s32 %r28, 0;\n" -" mov.u32 %r29, %r28;\n" -" mov.s32 %r30, 0;\n" -" mov.u32 %r31, %r30;\n" -" mov.s32 %r32, 0;\n" -" mov.u32 %r33, %r32;\n" -" tex.1d.v4.f32.s32 {%f18,%f19,%f20,%f21},[pos_tex,{%r27,%r29,%r31,%r33}];\n" -" mov.f32 %f22, %f18;\n" -" mov.f32 %f23, %f19;\n" -" mov.f32 %f24, %f20;\n" -" mov.f32 %f25, %f21;\n" -" .loc 16 189 0\n" -" mov.u32 %r34, %r13;\n" -" mov.s32 %r35, 0;\n" -" mov.u32 %r36, %r35;\n" -" mov.s32 %r37, 0;\n" -" mov.u32 %r38, %r37;\n" -" mov.s32 %r39, 0;\n" -" mov.u32 %r40, %r39;\n" -" tex.1d.v4.f32.s32 {%f26,%f27,%f28,%f29},[q_tex,{%r34,%r36,%r38,%r40}];\n" -" mov.f32 %f30, %f26;\n" -" setp.ge.u64 %p5, %rd30, %rd29;\n" -" @%p5 bra $Lt_1_37122;\n" -" cvt.rzi.ftz.s32.f32 %r41, %f25;\n" -" cvt.s64.s32 %rd37, %r20;\n" -" ld.param.f32 %f31, [__cudaparm_kernel_pair_fast_cut_bothsq];\n" -" mov.f32 %f32, 0f00000000; \n" -" mov.f32 %f33, 0f00000000; \n" -" mov.f32 %f34, 0f00000000; \n" -" mov.f32 %f35, 0f00000000; \n" -" mov.f32 %f36, 0f00000000; \n" -"$Lt_1_27394:\n" -" .loc 16 193 0\n" -" ld.global.s32 %r42, [%rd30+0];\n" -" .loc 16 196 0\n" -" shr.s32 %r43, %r42, 30;\n" -" and.b32 %r44, %r43, 3;\n" -" cvt.s64.s32 %rd38, %r44;\n" -" mul.wide.s32 %rd39, %r44, 4;\n" -" add.u64 %rd40, %rd2, %rd39;\n" -" ld.shared.f32 %f37, [%rd40+0];\n" -" .loc 16 197 0\n" -" mov.f32 %f38, 0f3f800000; \n" -" ld.shared.f32 %f39, [%rd40+16];\n" -" sub.ftz.f32 %f40, %f38, %f39;\n" -" .loc 16 200 0\n" -" and.b32 %r45, %r42, 1073741823;\n" -" mov.u32 %r46, %r45;\n" -" mov.s32 %r47, 0;\n" -" mov.u32 %r48, %r47;\n" -" mov.s32 %r49, 0;\n" -" mov.u32 %r50, %r49;\n" -" mov.s32 %r51, 0;\n" -" mov.u32 %r52, %r51;\n" -" tex.1d.v4.f32.s32 {%f41,%f42,%f43,%f44},[pos_tex,{%r46,%r48,%r50,%r52}];\n" -" mov.f32 %f45, %f41;\n" -" mov.f32 %f46, %f42;\n" -" mov.f32 %f47, %f43;\n" -" mov.f32 %f48, %f44;\n" -" sub.ftz.f32 %f49, %f23, %f46;\n" -" sub.ftz.f32 %f50, %f22, %f45;\n" -" sub.ftz.f32 %f51, %f24, %f47;\n" -" mul.ftz.f32 %f52, %f49, %f49;\n" -" fma.rn.ftz.f32 %f53, %f50, %f50, %f52;\n" -" fma.rn.ftz.f32 %f54, %f51, %f51, %f53;\n" -" setp.lt.ftz.f32 %p6, %f54, %f31;\n" -" @!%p6 bra $Lt_1_30722;\n" -" ld.param.f32 %f55, [__cudaparm_kernel_pair_fast_cut_ljsq];\n" -" setp.lt.ftz.f32 %p7, %f54, %f55;\n" -" rcp.approx.ftz.f32 %f56, %f54;\n" -" @!%p7 bra $Lt_1_28418;\n" -" .loc 16 215 0\n" -" cvt.rzi.ftz.s32.f32 %r53, %f48;\n" -" cvt.s64.s32 %rd41, %r41;\n" -" mul.wide.s32 %rd42, %r41, 8;\n" -" add.u64 %rd43, %rd7, %rd42;\n" -" cvt.s64.s32 %rd44, %r53;\n" -" mul.wide.s32 %rd45, %r53, 8;\n" -" add.u64 %rd46, %rd7, %rd45;\n" -" ld.shared.v2.f32 {%f57,%f58}, [%rd43+0];\n" -" ld.shared.v2.f32 {%f59,%f60}, [%rd46+0];\n" -" mul.ftz.f32 %f61, %f57, %f59;\n" -" .loc 16 216 0\n" -" add.ftz.f32 %f62, %f58, %f60;\n" -" mov.f32 %f63, 0f3f000000; \n" -" mul.ftz.f32 %f64, %f62, %f63;\n" -" .loc 16 220 0\n" -" mul.ftz.f32 %f65, %f64, %f64;\n" -" sqrt.approx.ftz.f32 %f66, %f61;\n" -" mov.f32 %f67, 0f40800000; \n" -" mul.ftz.f32 %f68, %f66, %f67;\n" -" mul.ftz.f32 %f69, %f65, %f56;\n" -" mul.ftz.f32 %f70, %f69, %f69;\n" -" mul.ftz.f32 %f71, %f69, %f70;\n" -" mul.ftz.f32 %f72, %f68, %f71;\n" -" mov.f32 %f73, %f72;\n" -" .loc 16 221 0\n" -" mul.ftz.f32 %f74, %f71, %f72;\n" -" mov.f32 %f75, %f74;\n" -" .loc 16 222 0\n" -" mov.f32 %f76, 0f40c00000; \n" -" mul.ftz.f32 %f77, %f72, %f76;\n" -" mov.f32 %f78, 0f41400000; \n" -" mul.ftz.f32 %f79, %f78, %f74;\n" -" sub.ftz.f32 %f80, %f79, %f77;\n" -" mul.ftz.f32 %f81, %f37, %f80;\n" -" ld.param.f32 %f82, [__cudaparm_kernel_pair_fast_cut_lj_innersq];\n" -" setp.gt.ftz.f32 %p8, %f54, %f82;\n" -" @!%p8 bra $Lt_1_28162;\n" -" .loc 16 228 0\n" -" add.ftz.f32 %f83, %f54, %f54;\n" -" sub.ftz.f32 %f84, %f55, %f54;\n" -" add.ftz.f32 %f85, %f83, %f55;\n" -" mul.ftz.f32 %f86, %f84, %f84;\n" -" mov.f32 %f87, 0f40400000; \n" -" mul.ftz.f32 %f88, %f87, %f82;\n" -" sub.ftz.f32 %f89, %f85, %f88;\n" -" ld.param.f32 %f90, [__cudaparm_kernel_pair_fast_denom_lj];\n" -" div.approx.ftz.f32 %f91, %f89, %f90;\n" -" mul.ftz.f32 %f92, %f86, %f91;\n" -" mov.f32 %f93, %f92;\n" -" .loc 16 231 0\n" -" mov.f32 %f94, 0f41400000; \n" -" mul.ftz.f32 %f95, %f54, %f94;\n" -" mul.ftz.f32 %f96, %f84, %f95;\n" -" sub.ftz.f32 %f97, %f54, %f82;\n" -" mul.ftz.f32 %f98, %f96, %f97;\n" -" div.approx.ftz.f32 %f99, %f98, %f90;\n" -" sub.ftz.f32 %f100, %f74, %f72;\n" -" mul.ftz.f32 %f101, %f99, %f100;\n" -" fma.rn.ftz.f32 %f81, %f81, %f92, %f101;\n" -" bra.uni $Lt_1_28162;\n" -"$Lt_1_28418:\n" -" .loc 16 234 0\n" -" mov.f32 %f81, 0f00000000; \n" -"$Lt_1_28162:\n" -" ld.param.f32 %f102, [__cudaparm_kernel_pair_fast_cut_coulsq];\n" -" setp.gt.ftz.f32 %p9, %f102, %f54;\n" -" @!%p9 bra $Lt_1_29442;\n" -" .loc 20 518 0\n" -" rsqrt.approx.ftz.f32 %f103, %f56;\n" -" ld.param.f32 %f104, [__cudaparm_kernel_pair_fast_g_ewald];\n" -" mul.ftz.f32 %f105, %f104, %f103;\n" -" mul.ftz.f32 %f106, %f105, %f105;\n" -" neg.ftz.f32 %f107, %f106;\n" -" mov.f32 %f108, 0f3fb8aa3b; \n" -" mul.ftz.f32 %f109, %f107, %f108;\n" -" ex2.approx.ftz.f32 %f110, %f109;\n" -" .loc 16 241 0\n" -" mov.f32 %f111, 0f3f800000; \n" -" mov.f32 %f112, 0f3ea7ba05; \n" -" fma.rn.ftz.f32 %f113, %f112, %f105, %f111;\n" -" rcp.approx.ftz.f32 %f114, %f113;\n" -" mov.f32 %f115, 0f3e827906; \n" -" mov.f32 %f116, 0fbe91a98e; \n" -" mov.f32 %f117, 0f3fb5f0e3; \n" -" mov.f32 %f118, 0fbfba00e3; \n" -" mov.f32 %f119, 0f3f87dc22; \n" -" fma.rn.ftz.f32 %f120, %f119, %f114, %f118;\n" -" fma.rn.ftz.f32 %f121, %f114, %f120, %f117;\n" -" fma.rn.ftz.f32 %f122, %f114, %f121, %f116;\n" -" fma.rn.ftz.f32 %f123, %f114, %f122, %f115;\n" -" mul.ftz.f32 %f124, %f114, %f123;\n" -" mul.ftz.f32 %f125, %f110, %f124;\n" -" mov.f32 %f126, %f125;\n" -" .loc 16 242 0\n" -" mov.u32 %r54, %r45;\n" -" mov.s32 %r55, 0;\n" -" mov.u32 %r56, %r55;\n" -" mov.s32 %r57, 0;\n" -" mov.u32 %r58, %r57;\n" -" mov.s32 %r59, 0;\n" -" mov.u32 %r60, %r59;\n" -" tex.1d.v4.f32.s32 {%f127,%f128,%f129,%f130},[q_tex,{%r54,%r56,%r58,%r60}];\n" -" mov.f32 %f131, %f127;\n" -" ld.param.f32 %f132, [__cudaparm_kernel_pair_fast_qqrd2e];\n" -" mul.ftz.f32 %f133, %f132, %f30;\n" -" mul.ftz.f32 %f134, %f133, %f131;\n" -" div.approx.ftz.f32 %f135, %f134, %f103;\n" -" mov.f32 %f136, %f135;\n" -" .loc 16 243 0\n" -" mov.f32 %f137, 0f3f906ebb; \n" -" mul.ftz.f32 %f138, %f105, %f137;\n" -" fma.rn.ftz.f32 %f139, %f110, %f138, %f125;\n" -" sub.ftz.f32 %f140, %f139, %f40;\n" -" mul.ftz.f32 %f141, %f135, %f140;\n" -" bra.uni $Lt_1_29186;\n" -"$Lt_1_29442:\n" -" .loc 16 245 0\n" -" mov.f32 %f141, 0f00000000; \n" -"$Lt_1_29186:\n" -" .loc 16 249 0\n" -" add.ftz.f32 %f142, %f141, %f81;\n" -" mul.ftz.f32 %f143, %f142, %f56;\n" -" fma.rn.ftz.f32 %f34, %f50, %f143, %f34;\n" -" .loc 16 250 0\n" -" fma.rn.ftz.f32 %f33, %f49, %f143, %f33;\n" -" .loc 16 251 0\n" -" fma.rn.ftz.f32 %f32, %f51, %f143, %f32;\n" -" ld.param.s32 %r61, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r62, 0;\n" -" setp.le.s32 %p10, %r61, %r62;\n" -" @%p10 bra $Lt_1_30210;\n" -" .loc 16 254 0\n" -" mov.f32 %f144, %f136;\n" -" mov.f32 %f145, %f126;\n" -" sub.ftz.f32 %f146, %f145, %f40;\n" -" fma.rn.ftz.f32 %f147, %f144, %f146, %f35;\n" -" selp.f32 %f35, %f147, %f35, %p9;\n" -" @!%p7 bra $Lt_1_30210;\n" -" .loc 16 260 0\n" -" mov.f32 %f148, %f75;\n" -" mov.f32 %f149, %f73;\n" -" sub.ftz.f32 %f150, %f148, %f149;\n" -" mov.f32 %f151, %f93;\n" -" mul.ftz.f32 %f152, %f151, %f150;\n" -" ld.param.f32 %f153, [__cudaparm_kernel_pair_fast_cut_lj_innersq];\n" -" setp.lt.ftz.f32 %p11, %f153, %f54;\n" -" selp.f32 %f154, %f152, %f150, %p11;\n" -" fma.rn.ftz.f32 %f36, %f37, %f154, %f36;\n" -"$Lt_1_30210:\n" -"$Lt_1_29698:\n" -" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r64, 0;\n" -" setp.le.s32 %p12, %r63, %r64;\n" -" @%p12 bra $Lt_1_30722;\n" -" .loc 16 264 0\n" -" mov.f32 %f155, %f7;\n" -" mul.ftz.f32 %f156, %f50, %f50;\n" -" fma.rn.ftz.f32 %f157, %f143, %f156, %f155;\n" -" mov.f32 %f7, %f157;\n" -" .loc 16 265 0\n" -" mov.f32 %f158, %f9;\n" -" fma.rn.ftz.f32 %f159, %f143, %f52, %f158;\n" -" mov.f32 %f9, %f159;\n" -" .loc 16 266 0\n" -" mov.f32 %f160, %f11;\n" -" mul.ftz.f32 %f161, %f51, %f51;\n" -" fma.rn.ftz.f32 %f162, %f143, %f161, %f160;\n" -" mov.f32 %f11, %f162;\n" -" .loc 16 267 0\n" -" mov.f32 %f163, %f13;\n" -" mul.ftz.f32 %f164, %f49, %f50;\n" -" fma.rn.ftz.f32 %f165, %f143, %f164, %f163;\n" -" mov.f32 %f13, %f165;\n" -" .loc 16 268 0\n" -" mov.f32 %f166, %f15;\n" -" mul.ftz.f32 %f167, %f50, %f51;\n" -" fma.rn.ftz.f32 %f168, %f143, %f167, %f166;\n" -" mov.f32 %f15, %f168;\n" -" .loc 16 269 0\n" -" mul.ftz.f32 %f169, %f49, %f51;\n" -" fma.rn.ftz.f32 %f16, %f143, %f169, %f16;\n" -" mov.f32 %f17, %f16;\n" -"$Lt_1_30722:\n" -"$Lt_1_27650:\n" -" .loc 16 192 0\n" -" mul.lo.u64 %rd47, %rd37, 4;\n" -" add.u64 %rd30, %rd30, %rd47;\n" -" setp.lt.u64 %p13, %rd30, %rd29;\n" -" @%p13 bra $Lt_1_27394;\n" -" bra.uni $Lt_1_26882;\n" -"$Lt_1_37122:\n" -" mov.f32 %f32, 0f00000000; \n" -" mov.f32 %f33, 0f00000000; \n" -" mov.f32 %f34, 0f00000000; \n" -" mov.f32 %f35, 0f00000000; \n" -" mov.f32 %f36, 0f00000000; \n" -"$Lt_1_26882:\n" -" mov.u32 %r65, 1;\n" -" setp.le.s32 %p14, %r5, %r65;\n" -" @%p14 bra $Lt_1_33538;\n" -" .loc 16 274 0\n" -" mov.u64 %rd48, __cuda___cuda_local_var_32775_55_non_const_red_acc4392;\n" -" mul.lo.u64 %rd49, %rd1, 4;\n" -" add.u64 %rd50, %rd48, %rd49;\n" -" mov.f32 %f170, %f34;\n" -" st.shared.f32 [%rd50+0], %f170;\n" -" mov.f32 %f171, %f33;\n" -" st.shared.f32 [%rd50+512], %f171;\n" -" mov.f32 %f172, %f32;\n" -" st.shared.f32 [%rd50+1024], %f172;\n" -" mov.f32 %f173, %f36;\n" -" st.shared.f32 [%rd50+1536], %f173;\n" -" mov.f32 %f174, %f35;\n" -" st.shared.f32 [%rd50+2048], %f174;\n" -" shr.s32 %r66, %r5, 31;\n" -" mov.s32 %r67, 1;\n" -" and.b32 %r68, %r66, %r67;\n" -" add.s32 %r69, %r68, %r5;\n" -" shr.s32 %r70, %r69, 1;\n" -" mov.s32 %r71, %r70;\n" -" mov.u32 %r72, 0;\n" -" setp.ne.u32 %p15, %r70, %r72;\n" -" @!%p15 bra $Lt_1_32002;\n" -"$Lt_1_32514:\n" -" setp.ge.u32 %p16, %r17, %r71;\n" -" @%p16 bra $Lt_1_32770;\n" -" add.u32 %r73, %r1, %r71;\n" -" cvt.u64.u32 %rd51, %r73;\n" -" mul.wide.u32 %rd52, %r73, 4;\n" -" add.u64 %rd53, %rd48, %rd52;\n" -" ld.shared.f32 %f175, [%rd53+0];\n" -" add.ftz.f32 %f170, %f175, %f170;\n" -" st.shared.f32 [%rd50+0], %f170;\n" -" ld.shared.f32 %f176, [%rd53+512];\n" -" add.ftz.f32 %f171, %f176, %f171;\n" -" st.shared.f32 [%rd50+512], %f171;\n" -" ld.shared.f32 %f177, [%rd53+1024];\n" -" add.ftz.f32 %f172, %f177, %f172;\n" -" st.shared.f32 [%rd50+1024], %f172;\n" -" ld.shared.f32 %f178, [%rd53+1536];\n" -" add.ftz.f32 %f173, %f178, %f173;\n" -" st.shared.f32 [%rd50+1536], %f173;\n" -" ld.shared.f32 %f179, [%rd53+2048];\n" -" add.ftz.f32 %f174, %f179, %f174;\n" -" st.shared.f32 [%rd50+2048], %f174;\n" -"$Lt_1_32770:\n" -" shr.u32 %r71, %r71, 1;\n" -" mov.u32 %r74, 0;\n" -" setp.ne.u32 %p17, %r71, %r74;\n" -" @%p17 bra $Lt_1_32514;\n" -"$Lt_1_32002:\n" -" mov.f32 %f34, %f170;\n" -" mov.f32 %f33, %f171;\n" -" mov.f32 %f32, %f172;\n" -" mov.f32 %f36, %f173;\n" -" mov.f32 %f35, %f174;\n" -" ld.param.s32 %r75, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r76, 0;\n" -" setp.le.s32 %p18, %r75, %r76;\n" -" @%p18 bra $Lt_1_33538;\n" -" mov.f32 %f170, %f7;\n" -" st.shared.f32 [%rd50+0], %f170;\n" -" mov.f32 %f171, %f9;\n" -" st.shared.f32 [%rd50+512], %f171;\n" -" mov.f32 %f172, %f11;\n" -" st.shared.f32 [%rd50+1024], %f172;\n" -" mov.f32 %f173, %f13;\n" -" st.shared.f32 [%rd50+1536], %f173;\n" -" mov.f32 %f174, %f15;\n" -" st.shared.f32 [%rd50+2048], %f174;\n" -" mov.f32 %f180, %f16;\n" -" st.shared.f32 [%rd50+2560], %f180;\n" -" mov.s32 %r77, %r70;\n" -" @!%p15 bra $Lt_1_34050;\n" -"$Lt_1_34562:\n" -" setp.ge.u32 %p19, %r17, %r77;\n" -" @%p19 bra $Lt_1_34818;\n" -" add.u32 %r78, %r1, %r77;\n" -" cvt.u64.u32 %rd54, %r78;\n" -" mul.wide.u32 %rd55, %r78, 4;\n" -" add.u64 %rd56, %rd48, %rd55;\n" -" ld.shared.f32 %f181, [%rd56+0];\n" -" add.ftz.f32 %f170, %f181, %f170;\n" -" st.shared.f32 [%rd50+0], %f170;\n" -" ld.shared.f32 %f182, [%rd56+512];\n" -" add.ftz.f32 %f171, %f182, %f171;\n" -" st.shared.f32 [%rd50+512], %f171;\n" -" ld.shared.f32 %f183, [%rd56+1024];\n" -" add.ftz.f32 %f172, %f183, %f172;\n" -" st.shared.f32 [%rd50+1024], %f172;\n" -" ld.shared.f32 %f184, [%rd56+1536];\n" -" add.ftz.f32 %f173, %f184, %f173;\n" -" st.shared.f32 [%rd50+1536], %f173;\n" -" ld.shared.f32 %f185, [%rd56+2048];\n" -" add.ftz.f32 %f174, %f185, %f174;\n" -" st.shared.f32 [%rd50+2048], %f174;\n" -" ld.shared.f32 %f186, [%rd56+2560];\n" -" add.ftz.f32 %f180, %f186, %f180;\n" -" st.shared.f32 [%rd50+2560], %f180;\n" -"$Lt_1_34818:\n" -" shr.u32 %r77, %r77, 1;\n" -" mov.u32 %r79, 0;\n" -" setp.ne.u32 %p20, %r77, %r79;\n" -" @%p20 bra $Lt_1_34562;\n" -"$Lt_1_34050:\n" -" mov.f32 %f7, %f170;\n" -" mov.f32 %f9, %f171;\n" -" mov.f32 %f11, %f172;\n" -" mov.f32 %f13, %f173;\n" -" mov.f32 %f15, %f174;\n" -" mov.f32 %f17, %f180;\n" -"$Lt_1_33538:\n" -"$Lt_1_31490:\n" -" mov.u32 %r80, 0;\n" -" setp.ne.s32 %p21, %r17, %r80;\n" -" @%p21 bra $Lt_1_35586;\n" -" ld.param.u64 %rd57, [__cudaparm_kernel_pair_fast___val_paramengv];\n" -" add.u64 %rd58, %rd57, %rd13;\n" -" ld.param.s32 %r81, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r82, 0;\n" -" setp.le.s32 %p22, %r81, %r82;\n" -" @%p22 bra $Lt_1_36098;\n" -" st.global.f32 [%rd58+0], %f36;\n" -" cvt.s64.s32 %rd59, %r12;\n" -" mul.wide.s32 %rd60, %r12, 4;\n" -" add.u64 %rd61, %rd60, %rd58;\n" -" st.global.f32 [%rd61+0], %f35;\n" -" add.u64 %rd58, %rd60, %rd61;\n" -"$Lt_1_36098:\n" -" ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r84, 0;\n" -" setp.le.s32 %p23, %r83, %r84;\n" -" @%p23 bra $Lt_1_36610;\n" -" mov.f32 %f187, %f7;\n" -" st.global.f32 [%rd58+0], %f187;\n" -" cvt.s64.s32 %rd62, %r12;\n" -" mul.wide.s32 %rd63, %r12, 4;\n" -" add.u64 %rd64, %rd63, %rd58;\n" -" mov.f32 %f188, %f9;\n" -" st.global.f32 [%rd64+0], %f188;\n" -" add.u64 %rd65, %rd63, %rd64;\n" -" mov.f32 %f189, %f11;\n" -" st.global.f32 [%rd65+0], %f189;\n" -" add.u64 %rd66, %rd63, %rd65;\n" -" mov.f32 %f190, %f13;\n" -" st.global.f32 [%rd66+0], %f190;\n" -" add.u64 %rd58, %rd63, %rd66;\n" -" mov.f32 %f191, %f15;\n" -" st.global.f32 [%rd58+0], %f191;\n" -" mov.f32 %f192, %f17;\n" -" add.u64 %rd67, %rd63, %rd58;\n" -" st.global.f32 [%rd67+0], %f192;\n" -"$Lt_1_36610:\n" -" ld.param.u64 %rd68, [__cudaparm_kernel_pair_fast_ans];\n" -" mul.lo.u64 %rd69, %rd12, 16;\n" -" add.u64 %rd70, %rd68, %rd69;\n" -" mov.f32 %f193, %f194;\n" -" st.global.v4.f32 [%rd70+0], {%f34,%f33,%f32,%f193};\n" -"$Lt_1_35586:\n" -"$Lt_1_25858:\n" -" .loc 16 277 0\n" -" exit;\n" -"$LDWend_kernel_pair_fast:\n" -" }\n" -; diff --git a/lib/gpu/coul_long.ptx b/lib/gpu/coul_long.ptx deleted file mode 100644 index 5576f36c59..0000000000 --- a/lib/gpu/coul_long.ptx +++ /dev/null @@ -1,1009 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009de1_00000000-9_lal_coul_long.cpp3.i (/home/sjplimp/ccBI#.NrfuKV) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009de1_00000000-8_lal_coul_long.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_coul_long.cu" - .file 17 "/usr/local/cuda/include/common_functions.h" - .file 18 "/usr/local/cuda/include/math_functions.h" - .file 19 "/usr/local/cuda/include/math_constants.h" - .file 20 "/usr/local/cuda/include/device_functions.h" - .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - .global .texref pos_tex; - .global .texref q_tex; - - .entry kernel_pair ( - .param .u64 __cudaparm_kernel_pair_x_, - .param .u64 __cudaparm_kernel_pair_lj1, - .param .u64 __cudaparm_kernel_pair_lj3, - .param .s32 __cudaparm_kernel_pair_lj_types, - .param .u64 __cudaparm_kernel_pair_sp_cl_in, - .param .u64 __cudaparm_kernel_pair_dev_nbor, - .param .u64 __cudaparm_kernel_pair_dev_packed, - .param .u64 __cudaparm_kernel_pair_ans, - .param .u64 __cudaparm_kernel_pair_engv, - .param .s32 __cudaparm_kernel_pair_eflag, - .param .s32 __cudaparm_kernel_pair_vflag, - .param .s32 __cudaparm_kernel_pair_inum, - .param .s32 __cudaparm_kernel_pair_nbor_pitch, - .param .u64 __cudaparm_kernel_pair_q_, - .param .f32 __cudaparm_kernel_pair_cut_coulsq, - .param .f32 __cudaparm_kernel_pair_qqrd2e, - .param .f32 __cudaparm_kernel_pair_g_ewald, - .param .s32 __cudaparm_kernel_pair_t_per_atom) - { - .reg .u32 %r<81>; - .reg .u64 %rd<58>; - .reg .f32 %f<132>; - .reg .pred %p<19>; - .shared .align 16 .b8 __cuda___cuda_local_var_32541_33_non_const_sp_cl112[16]; - .shared .align 4 .b8 __cuda___cuda_local_var_32611_37_non_const_red_acc128[3072]; - // __cuda_local_var_32548_10_non_const_f = 48 - // __cuda_local_var_32550_9_non_const_virial = 16 - .loc 16 36 0 -$LDWbegin_kernel_pair: - .loc 16 41 0 - ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_cl_in]; - ldu.global.f32 %f1, [%rd1+0]; - .loc 16 42 0 - ld.global.f32 %f2, [%rd1+4]; - .loc 16 43 0 - ld.global.f32 %f3, [%rd1+8]; - .loc 16 44 0 - ld.global.f32 %f4, [%rd1+12]; - st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_cl112+0], {%f1,%f2,%f3,%f4}; - .loc 16 51 0 - mov.f32 %f5, 0f00000000; // 0 - mov.f32 %f6, %f5; - mov.f32 %f7, 0f00000000; // 0 - mov.f32 %f8, %f7; - mov.f32 %f9, 0f00000000; // 0 - mov.f32 %f10, %f9; - mov.f32 %f11, 0f00000000; // 0 - mov.f32 %f12, %f11; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, %f13; - mov.f32 %f15, 0f00000000; // 0 - mov.f32 %f16, %f15; - ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_pair_inum]; - setp.ge.s32 %p1, %r8, %r9; - @%p1 bra $Lt_0_25858; - .loc 16 56 0 - cvt.s64.s32 %rd2, %r8; - mul.wide.s32 %rd3, %r8, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor]; - add.u64 %rd5, %rd3, %rd4; - ld.global.s32 %r10, [%rd5+0]; - ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch]; - cvt.s64.s32 %rd6, %r11; - mul.wide.s32 %rd7, %r11, 4; - add.u64 %rd8, %rd7, %rd5; - ld.global.s32 %r12, [%rd8+0]; - sub.s32 %r13, %r1, 1; - and.b32 %r14, %r13, %r2; - cvt.s64.s32 %rd9, %r14; - mul.wide.s32 %rd10, %r14, 4; - ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed]; - setp.ne.u64 %p2, %rd11, %rd4; - @%p2 bra $Lt_0_19458; - cvt.s32.s64 %r15, %rd6; - mul.lo.s32 %r16, %r15, %r1; - mov.s32 %r17, %r16; - mul.lo.s32 %r18, %r13, %r8; - add.s32 %r19, %r15, %r18; - cvt.s64.s32 %rd12, %r19; - mul.wide.s32 %rd13, %r19, 4; - add.u64 %rd14, %rd8, %rd13; - and.b32 %r20, %r13, %r12; - cvt.s64.s32 %rd15, %r20; - div.s32 %r21, %r12, %r1; - mul.lo.s32 %r22, %r16, %r21; - cvt.s64.s32 %rd16, %r22; - add.u64 %rd17, %rd15, %rd16; - mul.lo.u64 %rd18, %rd17, 4; - add.u64 %rd19, %rd14, %rd18; - add.u64 %rd20, %rd10, %rd14; - bra.uni $Lt_0_19202; -$Lt_0_19458: - add.u64 %rd21, %rd7, %rd8; - ld.global.s32 %r23, [%rd21+0]; - cvt.s64.s32 %rd22, %r23; - mul.wide.s32 %rd23, %r23, 4; - add.u64 %rd24, %rd11, %rd23; - cvt.s64.s32 %rd25, %r12; - mul.wide.s32 %rd26, %r12, 4; - add.u64 %rd19, %rd24, %rd26; - mov.s32 %r17, %r1; - add.u64 %rd20, %rd10, %rd24; -$Lt_0_19202: - .loc 16 59 0 - mov.u32 %r24, %r10; - mov.s32 %r25, 0; - mov.u32 %r26, %r25; - mov.s32 %r27, 0; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}]; - mov.f32 %f21, %f17; - mov.f32 %f22, %f18; - mov.f32 %f23, %f19; - .loc 16 60 0 - mov.u32 %r31, %r10; - mov.s32 %r32, 0; - mov.u32 %r33, %r32; - mov.s32 %r34, 0; - mov.u32 %r35, %r34; - mov.s32 %r36, 0; - mov.u32 %r37, %r36; - tex.1d.v4.f32.s32 {%f24,%f25,%f26,%f27},[q_tex,{%r31,%r33,%r35,%r37}]; - mov.f32 %f28, %f24; - setp.ge.u64 %p3, %rd20, %rd19; - @%p3 bra $Lt_0_27394; - cvt.s64.s32 %rd27, %r17; - ld.param.f32 %f29, [__cudaparm_kernel_pair_cut_coulsq]; - mov.f32 %f30, 0f00000000; // 0 - mov.f32 %f31, 0f00000000; // 0 - mov.f32 %f32, 0f00000000; // 0 - mov.f32 %f33, 0f00000000; // 0 - mov.u64 %rd28, __cuda___cuda_local_var_32541_33_non_const_sp_cl112; -$Lt_0_20226: - // Loop body line 60, nesting depth: 1, estimated iterations: unknown - .loc 16 63 0 - ld.global.s32 %r38, [%rd20+0]; - .loc 16 66 0 - mov.f32 %f34, 0f3f800000; // 1 - shr.s32 %r39, %r38, 30; - and.b32 %r40, %r39, 3; - cvt.s64.s32 %rd29, %r40; - mul.wide.s32 %rd30, %r40, 4; - add.u64 %rd31, %rd28, %rd30; - ld.shared.f32 %f35, [%rd31+0]; - sub.ftz.f32 %f36, %f34, %f35; - .loc 16 69 0 - and.b32 %r41, %r38, 1073741823; - mov.u32 %r42, %r41; - mov.s32 %r43, 0; - mov.u32 %r44, %r43; - mov.s32 %r45, 0; - mov.u32 %r46, %r45; - mov.s32 %r47, 0; - mov.u32 %r48, %r47; - tex.1d.v4.f32.s32 {%f37,%f38,%f39,%f40},[pos_tex,{%r42,%r44,%r46,%r48}]; - mov.f32 %f41, %f37; - mov.f32 %f42, %f38; - mov.f32 %f43, %f39; - sub.ftz.f32 %f44, %f22, %f42; - sub.ftz.f32 %f45, %f21, %f41; - sub.ftz.f32 %f46, %f23, %f43; - mul.ftz.f32 %f47, %f44, %f44; - fma.rn.ftz.f32 %f48, %f45, %f45, %f47; - fma.rn.ftz.f32 %f49, %f46, %f46, %f48; - setp.lt.ftz.f32 %p4, %f49, %f29; - @!%p4 bra $Lt_0_20994; - .loc 20 518 0 - rcp.approx.ftz.f32 %f50, %f49; - rsqrt.approx.ftz.f32 %f51, %f50; - ld.param.f32 %f52, [__cudaparm_kernel_pair_g_ewald]; - mul.ftz.f32 %f53, %f52, %f51; - mul.ftz.f32 %f54, %f53, %f53; - neg.ftz.f32 %f55, %f54; - mov.f32 %f56, 0f3fb8aa3b; // 1.4427 - mul.ftz.f32 %f57, %f55, %f56; - ex2.approx.ftz.f32 %f58, %f57; - .loc 16 85 0 - mov.f32 %f59, 0f3f800000; // 1 - mov.f32 %f60, 0f3ea7ba05; // 0.327591 - fma.rn.ftz.f32 %f61, %f60, %f53, %f59; - rcp.approx.ftz.f32 %f62, %f61; - mov.f32 %f63, 0f3e827906; // 0.25483 - mov.f32 %f64, 0fbe91a98e; // -0.284497 - mov.f32 %f65, 0f3fb5f0e3; // 1.42141 - mov.f32 %f66, 0fbfba00e3; // -1.45315 - mov.f32 %f67, 0f3f87dc22; // 1.06141 - fma.rn.ftz.f32 %f68, %f67, %f62, %f66; - fma.rn.ftz.f32 %f69, %f62, %f68, %f65; - fma.rn.ftz.f32 %f70, %f62, %f69, %f64; - fma.rn.ftz.f32 %f71, %f62, %f70, %f63; - mul.ftz.f32 %f72, %f62, %f71; - mul.ftz.f32 %f73, %f58, %f72; - .loc 16 86 0 - mov.u32 %r49, %r41; - mov.s32 %r50, 0; - mov.u32 %r51, %r50; - mov.s32 %r52, 0; - mov.u32 %r53, %r52; - mov.s32 %r54, 0; - mov.u32 %r55, %r54; - tex.1d.v4.f32.s32 {%f74,%f75,%f76,%f77},[q_tex,{%r49,%r51,%r53,%r55}]; - mov.f32 %f78, %f74; - .loc 16 87 0 - ld.param.f32 %f79, [__cudaparm_kernel_pair_qqrd2e]; - mul.ftz.f32 %f80, %f79, %f28; - mul.ftz.f32 %f81, %f80, %f78; - div.approx.ftz.f32 %f82, %f81, %f51; - mov.f32 %f83, 0f3f906ebb; // 1.12838 - mul.ftz.f32 %f84, %f53, %f83; - fma.rn.ftz.f32 %f85, %f58, %f84, %f73; - sub.ftz.f32 %f86, %f85, %f36; - mul.ftz.f32 %f87, %f82, %f86; - mul.ftz.f32 %f88, %f50, %f87; - .loc 16 89 0 - fma.rn.ftz.f32 %f32, %f45, %f88, %f32; - .loc 16 90 0 - fma.rn.ftz.f32 %f31, %f44, %f88, %f31; - .loc 16 91 0 - fma.rn.ftz.f32 %f30, %f46, %f88, %f30; - .loc 16 78 0 - sub.ftz.f32 %f89, %f73, %f36; - fma.rn.ftz.f32 %f90, %f82, %f89, %f33; - ld.param.s32 %r56, [__cudaparm_kernel_pair_eflag]; - mov.s32 %r57, 0; - setp.gt.s32 %p5, %r56, %r57; - selp.f32 %f33, %f90, %f33, %p5; - ld.param.s32 %r58, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r59, 0; - setp.le.s32 %p6, %r58, %r59; - @%p6 bra $Lt_0_20994; - .loc 16 97 0 - mov.f32 %f91, %f6; - mul.ftz.f32 %f92, %f45, %f45; - fma.rn.ftz.f32 %f93, %f88, %f92, %f91; - mov.f32 %f6, %f93; - .loc 16 98 0 - mov.f32 %f94, %f8; - fma.rn.ftz.f32 %f95, %f88, %f47, %f94; - mov.f32 %f8, %f95; - .loc 16 99 0 - mov.f32 %f96, %f10; - mul.ftz.f32 %f97, %f46, %f46; - fma.rn.ftz.f32 %f98, %f88, %f97, %f96; - mov.f32 %f10, %f98; - .loc 16 100 0 - mov.f32 %f99, %f12; - mul.ftz.f32 %f100, %f44, %f45; - fma.rn.ftz.f32 %f101, %f88, %f100, %f99; - mov.f32 %f12, %f101; - .loc 16 101 0 - mov.f32 %f102, %f14; - mul.ftz.f32 %f103, %f45, %f46; - fma.rn.ftz.f32 %f104, %f88, %f103, %f102; - mov.f32 %f14, %f104; - .loc 16 102 0 - mul.ftz.f32 %f105, %f44, %f46; - fma.rn.ftz.f32 %f15, %f88, %f105, %f15; - mov.f32 %f16, %f15; -$Lt_0_20994: -$Lt_0_20482: - .loc 16 62 0 - mul.lo.u64 %rd32, %rd27, 4; - add.u64 %rd20, %rd20, %rd32; - setp.lt.u64 %p7, %rd20, %rd19; - @%p7 bra $Lt_0_20226; - bra.uni $Lt_0_19714; -$Lt_0_27394: - mov.f32 %f30, 0f00000000; // 0 - mov.f32 %f31, 0f00000000; // 0 - mov.f32 %f32, 0f00000000; // 0 - mov.f32 %f33, 0f00000000; // 0 -$Lt_0_19714: - mov.u32 %r60, 1; - setp.le.s32 %p8, %r1, %r60; - @%p8 bra $Lt_0_23810; - .loc 16 112 0 - mov.u64 %rd33, __cuda___cuda_local_var_32611_37_non_const_red_acc128; - cvt.s64.s32 %rd34, %r2; - mul.wide.s32 %rd35, %r2, 4; - add.u64 %rd36, %rd33, %rd35; - mov.f32 %f106, %f32; - st.shared.f32 [%rd36+0], %f106; - .loc 16 113 0 - mov.f32 %f107, %f31; - st.shared.f32 [%rd36+512], %f107; - .loc 16 114 0 - mov.f32 %f108, %f30; - st.shared.f32 [%rd36+1024], %f108; - .loc 16 115 0 - mov.f32 %f109, %f33; - st.shared.f32 [%rd36+1536], %f109; - .loc 16 117 0 - shr.s32 %r61, %r1, 31; - mov.s32 %r62, 1; - and.b32 %r63, %r61, %r62; - add.s32 %r64, %r63, %r1; - shr.s32 %r65, %r64, 1; - mov.s32 %r66, %r65; - mov.u32 %r67, 0; - setp.ne.u32 %p9, %r65, %r67; - @!%p9 bra $Lt_0_22274; -$Lt_0_22786: - setp.ge.u32 %p10, %r14, %r66; - @%p10 bra $Lt_0_23042; - .loc 16 120 0 - add.u32 %r68, %r2, %r66; - cvt.u64.u32 %rd37, %r68; - mul.wide.u32 %rd38, %r68, 4; - add.u64 %rd39, %rd33, %rd38; - ld.shared.f32 %f110, [%rd39+0]; - add.ftz.f32 %f106, %f110, %f106; - st.shared.f32 [%rd36+0], %f106; - ld.shared.f32 %f111, [%rd39+512]; - add.ftz.f32 %f107, %f111, %f107; - st.shared.f32 [%rd36+512], %f107; - ld.shared.f32 %f112, [%rd39+1024]; - add.ftz.f32 %f108, %f112, %f108; - st.shared.f32 [%rd36+1024], %f108; - ld.shared.f32 %f113, [%rd39+1536]; - add.ftz.f32 %f109, %f113, %f109; - st.shared.f32 [%rd36+1536], %f109; -$Lt_0_23042: - .loc 16 117 0 - shr.u32 %r66, %r66, 1; - mov.u32 %r69, 0; - setp.ne.u32 %p11, %r66, %r69; - @%p11 bra $Lt_0_22786; -$Lt_0_22274: - .loc 16 124 0 - mov.f32 %f32, %f106; - .loc 16 125 0 - mov.f32 %f31, %f107; - .loc 16 126 0 - mov.f32 %f30, %f108; - .loc 16 127 0 - mov.f32 %f33, %f109; - ld.param.s32 %r70, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r71, 0; - setp.le.s32 %p12, %r70, %r71; - @%p12 bra $Lt_0_23810; - .loc 16 131 0 - mov.f32 %f106, %f6; - st.shared.f32 [%rd36+0], %f106; - mov.f32 %f107, %f8; - st.shared.f32 [%rd36+512], %f107; - mov.f32 %f108, %f10; - st.shared.f32 [%rd36+1024], %f108; - mov.f32 %f109, %f12; - st.shared.f32 [%rd36+1536], %f109; - mov.f32 %f114, %f14; - st.shared.f32 [%rd36+2048], %f114; - mov.f32 %f115, %f15; - st.shared.f32 [%rd36+2560], %f115; - .loc 16 133 0 - mov.s32 %r72, %r65; - @!%p9 bra $Lt_0_24322; -$Lt_0_24834: - setp.ge.u32 %p13, %r14, %r72; - @%p13 bra $Lt_0_25090; - .loc 16 136 0 - add.u32 %r73, %r2, %r72; - cvt.u64.u32 %rd40, %r73; - mul.wide.u32 %rd41, %r73, 4; - add.u64 %rd42, %rd33, %rd41; - ld.shared.f32 %f116, [%rd42+0]; - add.ftz.f32 %f106, %f116, %f106; - st.shared.f32 [%rd36+0], %f106; - ld.shared.f32 %f117, [%rd42+512]; - add.ftz.f32 %f107, %f117, %f107; - st.shared.f32 [%rd36+512], %f107; - ld.shared.f32 %f118, [%rd42+1024]; - add.ftz.f32 %f108, %f118, %f108; - st.shared.f32 [%rd36+1024], %f108; - ld.shared.f32 %f119, [%rd42+1536]; - add.ftz.f32 %f109, %f119, %f109; - st.shared.f32 [%rd36+1536], %f109; - ld.shared.f32 %f120, [%rd42+2048]; - add.ftz.f32 %f114, %f120, %f114; - st.shared.f32 [%rd36+2048], %f114; - ld.shared.f32 %f121, [%rd42+2560]; - add.ftz.f32 %f115, %f121, %f115; - st.shared.f32 [%rd36+2560], %f115; -$Lt_0_25090: - .loc 16 133 0 - shr.u32 %r72, %r72, 1; - mov.u32 %r74, 0; - setp.ne.u32 %p14, %r72, %r74; - @%p14 bra $Lt_0_24834; -$Lt_0_24322: - .loc 16 141 0 - mov.f32 %f6, %f106; - mov.f32 %f8, %f107; - mov.f32 %f10, %f108; - mov.f32 %f12, %f109; - mov.f32 %f14, %f114; - mov.f32 %f16, %f115; -$Lt_0_23810: -$Lt_0_21762: - mov.u32 %r75, 0; - setp.ne.s32 %p15, %r14, %r75; - @%p15 bra $Lt_0_25858; - .loc 16 147 0 - ld.param.u64 %rd43, [__cudaparm_kernel_pair_engv]; - add.u64 %rd44, %rd43, %rd3; - ld.param.s32 %r76, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r77, 0; - setp.le.s32 %p16, %r76, %r77; - @%p16 bra $Lt_0_26370; - .loc 16 149 0 - mov.f32 %f122, 0f00000000; // 0 - st.global.f32 [%rd44+0], %f122; - .loc 16 150 0 - cvt.s64.s32 %rd45, %r9; - mul.wide.s32 %rd46, %r9, 4; - add.u64 %rd47, %rd46, %rd44; - .loc 16 151 0 - st.global.f32 [%rd47+0], %f33; - .loc 16 152 0 - add.u64 %rd44, %rd46, %rd47; -$Lt_0_26370: - ld.param.s32 %r78, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r79, 0; - setp.le.s32 %p17, %r78, %r79; - @%p17 bra $Lt_0_26882; - .loc 16 156 0 - mov.f32 %f123, %f6; - st.global.f32 [%rd44+0], %f123; - .loc 16 157 0 - cvt.s64.s32 %rd48, %r9; - mul.wide.s32 %rd49, %r9, 4; - add.u64 %rd50, %rd49, %rd44; - .loc 16 156 0 - mov.f32 %f124, %f8; - st.global.f32 [%rd50+0], %f124; - .loc 16 157 0 - add.u64 %rd51, %rd49, %rd50; - .loc 16 156 0 - mov.f32 %f125, %f10; - st.global.f32 [%rd51+0], %f125; - .loc 16 157 0 - add.u64 %rd52, %rd49, %rd51; - .loc 16 156 0 - mov.f32 %f126, %f12; - st.global.f32 [%rd52+0], %f126; - .loc 16 157 0 - add.u64 %rd44, %rd49, %rd52; - .loc 16 156 0 - mov.f32 %f127, %f14; - st.global.f32 [%rd44+0], %f127; - mov.f32 %f128, %f16; - add.u64 %rd53, %rd49, %rd44; - st.global.f32 [%rd53+0], %f128; -$Lt_0_26882: - .loc 16 160 0 - ld.param.u64 %rd54, [__cudaparm_kernel_pair_ans]; - mul.lo.u64 %rd55, %rd2, 16; - add.u64 %rd56, %rd54, %rd55; - mov.f32 %f129, %f130; - st.global.v4.f32 [%rd56+0], {%f32,%f31,%f30,%f129}; -$Lt_0_25858: -$Lt_0_18690: - .loc 16 163 0 - exit; -$LDWend_kernel_pair: - } // kernel_pair - - .entry kernel_pair_fast ( - .param .u64 __cudaparm_kernel_pair_fast_x_, - .param .u64 __cudaparm_kernel_pair_fast_lj1_in, - .param .u64 __cudaparm_kernel_pair_fast_lj3_in, - .param .u64 __cudaparm_kernel_pair_fast_sp_cl_in, - .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, - .param .u64 __cudaparm_kernel_pair_fast_dev_packed, - .param .u64 __cudaparm_kernel_pair_fast_ans, - .param .u64 __cudaparm_kernel_pair_fast_engv, - .param .s32 __cudaparm_kernel_pair_fast_eflag, - .param .s32 __cudaparm_kernel_pair_fast_vflag, - .param .s32 __cudaparm_kernel_pair_fast_inum, - .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, - .param .u64 __cudaparm_kernel_pair_fast_q_, - .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq, - .param .f32 __cudaparm_kernel_pair_fast_qqrd2e, - .param .f32 __cudaparm_kernel_pair_fast_g_ewald, - .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) - { - .reg .u32 %r<82>; - .reg .u64 %rd<62>; - .reg .f32 %f<129>; - .reg .pred %p<20>; - .shared .align 4 .b8 __cuda___cuda_local_var_32678_33_non_const_sp_cl3304[16]; - .shared .align 4 .b8 __cuda___cuda_local_var_32748_37_non_const_red_acc3320[3072]; - // __cuda_local_var_32683_10_non_const_f = 48 - // __cuda_local_var_32685_9_non_const_virial = 16 - .loc 16 173 0 -$LDWbegin_kernel_pair_fast: - cvt.s32.u32 %r1, %tid.x; - mov.u32 %r2, 3; - setp.gt.s32 %p1, %r1, %r2; - @%p1 bra $Lt_1_19458; - .loc 16 179 0 - mov.u64 %rd1, __cuda___cuda_local_var_32678_33_non_const_sp_cl3304; - cvt.s64.s32 %rd2, %r1; - mul.wide.s32 %rd3, %r1, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_cl_in]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f32 %f1, [%rd5+0]; - add.u64 %rd6, %rd3, %rd1; - st.shared.f32 [%rd6+0], %f1; -$Lt_1_19458: - mov.u64 %rd1, __cuda___cuda_local_var_32678_33_non_const_sp_cl3304; - .loc 16 186 0 - mov.f32 %f2, 0f00000000; // 0 - mov.f32 %f3, %f2; - mov.f32 %f4, 0f00000000; // 0 - mov.f32 %f5, %f4; - mov.f32 %f6, 0f00000000; // 0 - mov.f32 %f7, %f6; - mov.f32 %f8, 0f00000000; // 0 - mov.f32 %f9, %f8; - mov.f32 %f10, 0f00000000; // 0 - mov.f32 %f11, %f10; - mov.f32 %f12, 0f00000000; // 0 - mov.f32 %f13, %f12; - .loc 16 188 0 - bar.sync 0; - ld.param.s32 %r3, [__cudaparm_kernel_pair_fast_t_per_atom]; - div.s32 %r4, %r1, %r3; - cvt.s32.u32 %r5, %ntid.x; - div.s32 %r6, %r5, %r3; - cvt.s32.u32 %r7, %ctaid.x; - mul.lo.s32 %r8, %r7, %r6; - add.s32 %r9, %r4, %r8; - ld.param.s32 %r10, [__cudaparm_kernel_pair_fast_inum]; - setp.ge.s32 %p2, %r9, %r10; - @%p2 bra $Lt_1_27138; - .loc 16 193 0 - cvt.s64.s32 %rd7, %r9; - mul.wide.s32 %rd8, %r9, 4; - ld.param.u64 %rd9, [__cudaparm_kernel_pair_fast_dev_nbor]; - add.u64 %rd10, %rd8, %rd9; - ld.global.s32 %r11, [%rd10+0]; - ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_nbor_pitch]; - cvt.s64.s32 %rd11, %r12; - mul.wide.s32 %rd12, %r12, 4; - add.u64 %rd13, %rd12, %rd10; - ld.global.s32 %r13, [%rd13+0]; - sub.s32 %r14, %r3, 1; - and.b32 %r15, %r14, %r1; - cvt.s64.s32 %rd14, %r15; - mul.wide.s32 %rd15, %r15, 4; - ld.param.u64 %rd16, [__cudaparm_kernel_pair_fast_dev_packed]; - setp.ne.u64 %p3, %rd16, %rd9; - @%p3 bra $Lt_1_20738; - cvt.s32.s64 %r16, %rd11; - mul.lo.s32 %r17, %r16, %r3; - mov.s32 %r18, %r17; - mul.lo.s32 %r19, %r14, %r9; - add.s32 %r20, %r16, %r19; - cvt.s64.s32 %rd17, %r20; - mul.wide.s32 %rd18, %r20, 4; - add.u64 %rd19, %rd13, %rd18; - and.b32 %r21, %r14, %r13; - cvt.s64.s32 %rd20, %r21; - div.s32 %r22, %r13, %r3; - mul.lo.s32 %r23, %r17, %r22; - cvt.s64.s32 %rd21, %r23; - add.u64 %rd22, %rd20, %rd21; - mul.lo.u64 %rd23, %rd22, 4; - add.u64 %rd24, %rd19, %rd23; - add.u64 %rd25, %rd15, %rd19; - bra.uni $Lt_1_20482; -$Lt_1_20738: - add.u64 %rd26, %rd12, %rd13; - ld.global.s32 %r24, [%rd26+0]; - cvt.s64.s32 %rd27, %r24; - mul.wide.s32 %rd28, %r24, 4; - add.u64 %rd29, %rd16, %rd28; - cvt.s64.s32 %rd30, %r13; - mul.wide.s32 %rd31, %r13, 4; - add.u64 %rd24, %rd29, %rd31; - mov.s32 %r18, %r3; - add.u64 %rd25, %rd15, %rd29; -$Lt_1_20482: - .loc 16 196 0 - mov.u32 %r25, %r11; - mov.s32 %r26, 0; - mov.u32 %r27, %r26; - mov.s32 %r28, 0; - mov.u32 %r29, %r28; - mov.s32 %r30, 0; - mov.u32 %r31, %r30; - tex.1d.v4.f32.s32 {%f14,%f15,%f16,%f17},[pos_tex,{%r25,%r27,%r29,%r31}]; - mov.f32 %f18, %f14; - mov.f32 %f19, %f15; - mov.f32 %f20, %f16; - .loc 16 197 0 - mov.u32 %r32, %r11; - mov.s32 %r33, 0; - mov.u32 %r34, %r33; - mov.s32 %r35, 0; - mov.u32 %r36, %r35; - mov.s32 %r37, 0; - mov.u32 %r38, %r37; - tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[q_tex,{%r32,%r34,%r36,%r38}]; - mov.f32 %f25, %f21; - setp.ge.u64 %p4, %rd25, %rd24; - @%p4 bra $Lt_1_28674; - cvt.s64.s32 %rd32, %r18; - ld.param.f32 %f26, [__cudaparm_kernel_pair_fast_cut_coulsq]; - mov.f32 %f27, 0f00000000; // 0 - mov.f32 %f28, 0f00000000; // 0 - mov.f32 %f29, 0f00000000; // 0 - mov.f32 %f30, 0f00000000; // 0 -$Lt_1_21506: - // Loop body line 197, nesting depth: 1, estimated iterations: unknown - .loc 16 200 0 - ld.global.s32 %r39, [%rd25+0]; - .loc 16 203 0 - mov.f32 %f31, 0f3f800000; // 1 - shr.s32 %r40, %r39, 30; - and.b32 %r41, %r40, 3; - cvt.s64.s32 %rd33, %r41; - mul.wide.s32 %rd34, %r41, 4; - add.u64 %rd35, %rd1, %rd34; - ld.shared.f32 %f32, [%rd35+0]; - sub.ftz.f32 %f33, %f31, %f32; - .loc 16 206 0 - and.b32 %r42, %r39, 1073741823; - mov.u32 %r43, %r42; - mov.s32 %r44, 0; - mov.u32 %r45, %r44; - mov.s32 %r46, 0; - mov.u32 %r47, %r46; - mov.s32 %r48, 0; - mov.u32 %r49, %r48; - tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r43,%r45,%r47,%r49}]; - mov.f32 %f38, %f34; - mov.f32 %f39, %f35; - mov.f32 %f40, %f36; - sub.ftz.f32 %f41, %f19, %f39; - sub.ftz.f32 %f42, %f18, %f38; - sub.ftz.f32 %f43, %f20, %f40; - mul.ftz.f32 %f44, %f41, %f41; - fma.rn.ftz.f32 %f45, %f42, %f42, %f44; - fma.rn.ftz.f32 %f46, %f43, %f43, %f45; - setp.lt.ftz.f32 %p5, %f46, %f26; - @!%p5 bra $Lt_1_22274; - .loc 20 518 0 - rcp.approx.ftz.f32 %f47, %f46; - rsqrt.approx.ftz.f32 %f48, %f47; - ld.param.f32 %f49, [__cudaparm_kernel_pair_fast_g_ewald]; - mul.ftz.f32 %f50, %f49, %f48; - mul.ftz.f32 %f51, %f50, %f50; - neg.ftz.f32 %f52, %f51; - mov.f32 %f53, 0f3fb8aa3b; // 1.4427 - mul.ftz.f32 %f54, %f52, %f53; - ex2.approx.ftz.f32 %f55, %f54; - .loc 16 222 0 - mov.f32 %f56, 0f3f800000; // 1 - mov.f32 %f57, 0f3ea7ba05; // 0.327591 - fma.rn.ftz.f32 %f58, %f57, %f50, %f56; - rcp.approx.ftz.f32 %f59, %f58; - mov.f32 %f60, 0f3e827906; // 0.25483 - mov.f32 %f61, 0fbe91a98e; // -0.284497 - mov.f32 %f62, 0f3fb5f0e3; // 1.42141 - mov.f32 %f63, 0fbfba00e3; // -1.45315 - mov.f32 %f64, 0f3f87dc22; // 1.06141 - fma.rn.ftz.f32 %f65, %f64, %f59, %f63; - fma.rn.ftz.f32 %f66, %f59, %f65, %f62; - fma.rn.ftz.f32 %f67, %f59, %f66, %f61; - fma.rn.ftz.f32 %f68, %f59, %f67, %f60; - mul.ftz.f32 %f69, %f59, %f68; - mul.ftz.f32 %f70, %f55, %f69; - .loc 16 223 0 - mov.u32 %r50, %r42; - mov.s32 %r51, 0; - mov.u32 %r52, %r51; - mov.s32 %r53, 0; - mov.u32 %r54, %r53; - mov.s32 %r55, 0; - mov.u32 %r56, %r55; - tex.1d.v4.f32.s32 {%f71,%f72,%f73,%f74},[q_tex,{%r50,%r52,%r54,%r56}]; - mov.f32 %f75, %f71; - .loc 16 224 0 - ld.param.f32 %f76, [__cudaparm_kernel_pair_fast_qqrd2e]; - mul.ftz.f32 %f77, %f76, %f25; - mul.ftz.f32 %f78, %f77, %f75; - div.approx.ftz.f32 %f79, %f78, %f48; - mov.f32 %f80, 0f3f906ebb; // 1.12838 - mul.ftz.f32 %f81, %f50, %f80; - fma.rn.ftz.f32 %f82, %f55, %f81, %f70; - sub.ftz.f32 %f83, %f82, %f33; - mul.ftz.f32 %f84, %f79, %f83; - mul.ftz.f32 %f85, %f47, %f84; - .loc 16 226 0 - fma.rn.ftz.f32 %f29, %f42, %f85, %f29; - .loc 16 227 0 - fma.rn.ftz.f32 %f28, %f41, %f85, %f28; - .loc 16 228 0 - fma.rn.ftz.f32 %f27, %f43, %f85, %f27; - .loc 16 215 0 - sub.ftz.f32 %f86, %f70, %f33; - fma.rn.ftz.f32 %f87, %f79, %f86, %f30; - ld.param.s32 %r57, [__cudaparm_kernel_pair_fast_eflag]; - mov.s32 %r58, 0; - setp.gt.s32 %p6, %r57, %r58; - selp.f32 %f30, %f87, %f30, %p6; - ld.param.s32 %r59, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r60, 0; - setp.le.s32 %p7, %r59, %r60; - @%p7 bra $Lt_1_22274; - .loc 16 234 0 - mov.f32 %f88, %f3; - mul.ftz.f32 %f89, %f42, %f42; - fma.rn.ftz.f32 %f90, %f85, %f89, %f88; - mov.f32 %f3, %f90; - .loc 16 235 0 - mov.f32 %f91, %f5; - fma.rn.ftz.f32 %f92, %f85, %f44, %f91; - mov.f32 %f5, %f92; - .loc 16 236 0 - mov.f32 %f93, %f7; - mul.ftz.f32 %f94, %f43, %f43; - fma.rn.ftz.f32 %f95, %f85, %f94, %f93; - mov.f32 %f7, %f95; - .loc 16 237 0 - mov.f32 %f96, %f9; - mul.ftz.f32 %f97, %f41, %f42; - fma.rn.ftz.f32 %f98, %f85, %f97, %f96; - mov.f32 %f9, %f98; - .loc 16 238 0 - mov.f32 %f99, %f11; - mul.ftz.f32 %f100, %f42, %f43; - fma.rn.ftz.f32 %f101, %f85, %f100, %f99; - mov.f32 %f11, %f101; - .loc 16 239 0 - mul.ftz.f32 %f102, %f41, %f43; - fma.rn.ftz.f32 %f12, %f85, %f102, %f12; - mov.f32 %f13, %f12; -$Lt_1_22274: -$Lt_1_21762: - .loc 16 199 0 - mul.lo.u64 %rd36, %rd32, 4; - add.u64 %rd25, %rd25, %rd36; - setp.lt.u64 %p8, %rd25, %rd24; - @%p8 bra $Lt_1_21506; - bra.uni $Lt_1_20994; -$Lt_1_28674: - mov.f32 %f27, 0f00000000; // 0 - mov.f32 %f28, 0f00000000; // 0 - mov.f32 %f29, 0f00000000; // 0 - mov.f32 %f30, 0f00000000; // 0 -$Lt_1_20994: - mov.u32 %r61, 1; - setp.le.s32 %p9, %r3, %r61; - @%p9 bra $Lt_1_25090; - .loc 16 249 0 - mov.u64 %rd37, __cuda___cuda_local_var_32748_37_non_const_red_acc3320; - cvt.s64.s32 %rd38, %r1; - mul.wide.s32 %rd39, %r1, 4; - add.u64 %rd40, %rd37, %rd39; - mov.f32 %f103, %f29; - st.shared.f32 [%rd40+0], %f103; - .loc 16 250 0 - mov.f32 %f104, %f28; - st.shared.f32 [%rd40+512], %f104; - .loc 16 251 0 - mov.f32 %f105, %f27; - st.shared.f32 [%rd40+1024], %f105; - .loc 16 252 0 - mov.f32 %f106, %f30; - st.shared.f32 [%rd40+1536], %f106; - .loc 16 254 0 - shr.s32 %r62, %r3, 31; - mov.s32 %r63, 1; - and.b32 %r64, %r62, %r63; - add.s32 %r65, %r64, %r3; - shr.s32 %r66, %r65, 1; - mov.s32 %r67, %r66; - mov.u32 %r68, 0; - setp.ne.u32 %p10, %r66, %r68; - @!%p10 bra $Lt_1_23554; -$Lt_1_24066: - setp.ge.u32 %p11, %r15, %r67; - @%p11 bra $Lt_1_24322; - .loc 16 257 0 - add.u32 %r69, %r1, %r67; - cvt.u64.u32 %rd41, %r69; - mul.wide.u32 %rd42, %r69, 4; - add.u64 %rd43, %rd37, %rd42; - ld.shared.f32 %f107, [%rd43+0]; - add.ftz.f32 %f103, %f107, %f103; - st.shared.f32 [%rd40+0], %f103; - ld.shared.f32 %f108, [%rd43+512]; - add.ftz.f32 %f104, %f108, %f104; - st.shared.f32 [%rd40+512], %f104; - ld.shared.f32 %f109, [%rd43+1024]; - add.ftz.f32 %f105, %f109, %f105; - st.shared.f32 [%rd40+1024], %f105; - ld.shared.f32 %f110, [%rd43+1536]; - add.ftz.f32 %f106, %f110, %f106; - st.shared.f32 [%rd40+1536], %f106; -$Lt_1_24322: - .loc 16 254 0 - shr.u32 %r67, %r67, 1; - mov.u32 %r70, 0; - setp.ne.u32 %p12, %r67, %r70; - @%p12 bra $Lt_1_24066; -$Lt_1_23554: - .loc 16 261 0 - mov.f32 %f29, %f103; - .loc 16 262 0 - mov.f32 %f28, %f104; - .loc 16 263 0 - mov.f32 %f27, %f105; - .loc 16 264 0 - mov.f32 %f30, %f106; - ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r72, 0; - setp.le.s32 %p13, %r71, %r72; - @%p13 bra $Lt_1_25090; - .loc 16 268 0 - mov.f32 %f103, %f3; - st.shared.f32 [%rd40+0], %f103; - mov.f32 %f104, %f5; - st.shared.f32 [%rd40+512], %f104; - mov.f32 %f105, %f7; - st.shared.f32 [%rd40+1024], %f105; - mov.f32 %f106, %f9; - st.shared.f32 [%rd40+1536], %f106; - mov.f32 %f111, %f11; - st.shared.f32 [%rd40+2048], %f111; - mov.f32 %f112, %f12; - st.shared.f32 [%rd40+2560], %f112; - .loc 16 270 0 - mov.s32 %r73, %r66; - @!%p10 bra $Lt_1_25602; -$Lt_1_26114: - setp.ge.u32 %p14, %r15, %r73; - @%p14 bra $Lt_1_26370; - .loc 16 273 0 - add.u32 %r74, %r1, %r73; - cvt.u64.u32 %rd44, %r74; - mul.wide.u32 %rd45, %r74, 4; - add.u64 %rd46, %rd37, %rd45; - ld.shared.f32 %f113, [%rd46+0]; - add.ftz.f32 %f103, %f113, %f103; - st.shared.f32 [%rd40+0], %f103; - ld.shared.f32 %f114, [%rd46+512]; - add.ftz.f32 %f104, %f114, %f104; - st.shared.f32 [%rd40+512], %f104; - ld.shared.f32 %f115, [%rd46+1024]; - add.ftz.f32 %f105, %f115, %f105; - st.shared.f32 [%rd40+1024], %f105; - ld.shared.f32 %f116, [%rd46+1536]; - add.ftz.f32 %f106, %f116, %f106; - st.shared.f32 [%rd40+1536], %f106; - ld.shared.f32 %f117, [%rd46+2048]; - add.ftz.f32 %f111, %f117, %f111; - st.shared.f32 [%rd40+2048], %f111; - ld.shared.f32 %f118, [%rd46+2560]; - add.ftz.f32 %f112, %f118, %f112; - st.shared.f32 [%rd40+2560], %f112; -$Lt_1_26370: - .loc 16 270 0 - shr.u32 %r73, %r73, 1; - mov.u32 %r75, 0; - setp.ne.u32 %p15, %r73, %r75; - @%p15 bra $Lt_1_26114; -$Lt_1_25602: - .loc 16 278 0 - mov.f32 %f3, %f103; - mov.f32 %f5, %f104; - mov.f32 %f7, %f105; - mov.f32 %f9, %f106; - mov.f32 %f11, %f111; - mov.f32 %f13, %f112; -$Lt_1_25090: -$Lt_1_23042: - mov.u32 %r76, 0; - setp.ne.s32 %p16, %r15, %r76; - @%p16 bra $Lt_1_27138; - .loc 16 284 0 - ld.param.u64 %rd47, [__cudaparm_kernel_pair_fast_engv]; - add.u64 %rd48, %rd47, %rd8; - ld.param.s32 %r77, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r78, 0; - setp.le.s32 %p17, %r77, %r78; - @%p17 bra $Lt_1_27650; - .loc 16 286 0 - mov.f32 %f119, 0f00000000; // 0 - st.global.f32 [%rd48+0], %f119; - .loc 16 287 0 - cvt.s64.s32 %rd49, %r10; - mul.wide.s32 %rd50, %r10, 4; - add.u64 %rd51, %rd50, %rd48; - .loc 16 288 0 - st.global.f32 [%rd51+0], %f30; - .loc 16 289 0 - add.u64 %rd48, %rd50, %rd51; -$Lt_1_27650: - ld.param.s32 %r79, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r80, 0; - setp.le.s32 %p18, %r79, %r80; - @%p18 bra $Lt_1_28162; - .loc 16 293 0 - mov.f32 %f120, %f3; - st.global.f32 [%rd48+0], %f120; - .loc 16 294 0 - cvt.s64.s32 %rd52, %r10; - mul.wide.s32 %rd53, %r10, 4; - add.u64 %rd54, %rd53, %rd48; - .loc 16 293 0 - mov.f32 %f121, %f5; - st.global.f32 [%rd54+0], %f121; - .loc 16 294 0 - add.u64 %rd55, %rd53, %rd54; - .loc 16 293 0 - mov.f32 %f122, %f7; - st.global.f32 [%rd55+0], %f122; - .loc 16 294 0 - add.u64 %rd56, %rd53, %rd55; - .loc 16 293 0 - mov.f32 %f123, %f9; - st.global.f32 [%rd56+0], %f123; - .loc 16 294 0 - add.u64 %rd48, %rd53, %rd56; - .loc 16 293 0 - mov.f32 %f124, %f11; - st.global.f32 [%rd48+0], %f124; - mov.f32 %f125, %f13; - add.u64 %rd57, %rd53, %rd48; - st.global.f32 [%rd57+0], %f125; -$Lt_1_28162: - .loc 16 297 0 - ld.param.u64 %rd58, [__cudaparm_kernel_pair_fast_ans]; - mul.lo.u64 %rd59, %rd7, 16; - add.u64 %rd60, %rd58, %rd59; - mov.f32 %f126, %f127; - st.global.v4.f32 [%rd60+0], {%f29,%f28,%f27,%f126}; -$Lt_1_27138: -$Lt_1_19970: - .loc 16 300 0 - exit; -$LDWend_kernel_pair_fast: - } // kernel_pair_fast - diff --git a/lib/gpu/coul_long_ptx.h b/lib/gpu/coul_long_ptx.h deleted file mode 100644 index 5091de01fa..0000000000 --- a/lib/gpu/coul_long_ptx.h +++ /dev/null @@ -1,957 +0,0 @@ -const char * coul_long = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .global .texref pos_tex;\n" -" .global .texref q_tex;\n" -" .entry kernel_pair (\n" -" .param .u64 __cudaparm_kernel_pair_x_,\n" -" .param .u64 __cudaparm_kernel_pair_lj1,\n" -" .param .u64 __cudaparm_kernel_pair_lj3,\n" -" .param .s32 __cudaparm_kernel_pair_lj_types,\n" -" .param .u64 __cudaparm_kernel_pair_sp_cl_in,\n" -" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_ans,\n" -" .param .u64 __cudaparm_kernel_pair_engv,\n" -" .param .s32 __cudaparm_kernel_pair_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_inum,\n" -" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" -" .param .u64 __cudaparm_kernel_pair_q_,\n" -" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n" -" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n" -" .param .f32 __cudaparm_kernel_pair_g_ewald,\n" -" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" -" {\n" -" .reg .u32 %r<81>;\n" -" .reg .u64 %rd<58>;\n" -" .reg .f32 %f<132>;\n" -" .reg .pred %p<19>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32541_33_non_const_sp_cl112[16];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32611_37_non_const_red_acc128[3072];\n" -" .loc 16 36 0\n" -"$LDWbegin_kernel_pair:\n" -" .loc 16 41 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_cl_in];\n" -" ldu.global.f32 %f1, [%rd1+0];\n" -" .loc 16 42 0\n" -" ld.global.f32 %f2, [%rd1+4];\n" -" .loc 16 43 0\n" -" ld.global.f32 %f3, [%rd1+8];\n" -" .loc 16 44 0\n" -" ld.global.f32 %f4, [%rd1+12];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_cl112+0], {%f1,%f2,%f3,%f4};\n" -" .loc 16 51 0\n" -" mov.f32 %f5, 0f00000000; \n" -" mov.f32 %f6, %f5;\n" -" mov.f32 %f7, 0f00000000; \n" -" mov.f32 %f8, %f7;\n" -" mov.f32 %f9, 0f00000000; \n" -" mov.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f00000000; \n" -" mov.f32 %f12, %f11;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, %f13;\n" -" mov.f32 %f15, 0f00000000; \n" -" mov.f32 %f16, %f15;\n" -" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n" -" setp.ge.s32 %p1, %r8, %r9;\n" -" @%p1 bra $Lt_0_25858;\n" -" .loc 16 56 0\n" -" cvt.s64.s32 %rd2, %r8;\n" -" mul.wide.s32 %rd3, %r8, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n" -" add.u64 %rd5, %rd3, %rd4;\n" -" ld.global.s32 %r10, [%rd5+0];\n" -" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n" -" cvt.s64.s32 %rd6, %r11;\n" -" mul.wide.s32 %rd7, %r11, 4;\n" -" add.u64 %rd8, %rd7, %rd5;\n" -" ld.global.s32 %r12, [%rd8+0];\n" -" sub.s32 %r13, %r1, 1;\n" -" and.b32 %r14, %r13, %r2;\n" -" cvt.s64.s32 %rd9, %r14;\n" -" mul.wide.s32 %rd10, %r14, 4;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n" -" setp.ne.u64 %p2, %rd11, %rd4;\n" -" @%p2 bra $Lt_0_19458;\n" -" cvt.s32.s64 %r15, %rd6;\n" -" mul.lo.s32 %r16, %r15, %r1;\n" -" mov.s32 %r17, %r16;\n" -" mul.lo.s32 %r18, %r13, %r8;\n" -" add.s32 %r19, %r15, %r18;\n" -" cvt.s64.s32 %rd12, %r19;\n" -" mul.wide.s32 %rd13, %r19, 4;\n" -" add.u64 %rd14, %rd8, %rd13;\n" -" and.b32 %r20, %r13, %r12;\n" -" cvt.s64.s32 %rd15, %r20;\n" -" div.s32 %r21, %r12, %r1;\n" -" mul.lo.s32 %r22, %r16, %r21;\n" -" cvt.s64.s32 %rd16, %r22;\n" -" add.u64 %rd17, %rd15, %rd16;\n" -" mul.lo.u64 %rd18, %rd17, 4;\n" -" add.u64 %rd19, %rd14, %rd18;\n" -" add.u64 %rd20, %rd10, %rd14;\n" -" bra.uni $Lt_0_19202;\n" -"$Lt_0_19458:\n" -" add.u64 %rd21, %rd7, %rd8;\n" -" ld.global.s32 %r23, [%rd21+0];\n" -" cvt.s64.s32 %rd22, %r23;\n" -" mul.wide.s32 %rd23, %r23, 4;\n" -" add.u64 %rd24, %rd11, %rd23;\n" -" cvt.s64.s32 %rd25, %r12;\n" -" mul.wide.s32 %rd26, %r12, 4;\n" -" add.u64 %rd19, %rd24, %rd26;\n" -" mov.s32 %r17, %r1;\n" -" add.u64 %rd20, %rd10, %rd24;\n" -"$Lt_0_19202:\n" -" .loc 16 59 0\n" -" mov.u32 %r24, %r10;\n" -" mov.s32 %r25, 0;\n" -" mov.u32 %r26, %r25;\n" -" mov.s32 %r27, 0;\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n" -" mov.f32 %f21, %f17;\n" -" mov.f32 %f22, %f18;\n" -" mov.f32 %f23, %f19;\n" -" .loc 16 60 0\n" -" mov.u32 %r31, %r10;\n" -" mov.s32 %r32, 0;\n" -" mov.u32 %r33, %r32;\n" -" mov.s32 %r34, 0;\n" -" mov.u32 %r35, %r34;\n" -" mov.s32 %r36, 0;\n" -" mov.u32 %r37, %r36;\n" -" tex.1d.v4.f32.s32 {%f24,%f25,%f26,%f27},[q_tex,{%r31,%r33,%r35,%r37}];\n" -" mov.f32 %f28, %f24;\n" -" setp.ge.u64 %p3, %rd20, %rd19;\n" -" @%p3 bra $Lt_0_27394;\n" -" cvt.s64.s32 %rd27, %r17;\n" -" ld.param.f32 %f29, [__cudaparm_kernel_pair_cut_coulsq];\n" -" mov.f32 %f30, 0f00000000; \n" -" mov.f32 %f31, 0f00000000; \n" -" mov.f32 %f32, 0f00000000; \n" -" mov.f32 %f33, 0f00000000; \n" -" mov.u64 %rd28, __cuda___cuda_local_var_32541_33_non_const_sp_cl112;\n" -"$Lt_0_20226:\n" -" .loc 16 63 0\n" -" ld.global.s32 %r38, [%rd20+0];\n" -" .loc 16 66 0\n" -" mov.f32 %f34, 0f3f800000; \n" -" shr.s32 %r39, %r38, 30;\n" -" and.b32 %r40, %r39, 3;\n" -" cvt.s64.s32 %rd29, %r40;\n" -" mul.wide.s32 %rd30, %r40, 4;\n" -" add.u64 %rd31, %rd28, %rd30;\n" -" ld.shared.f32 %f35, [%rd31+0];\n" -" sub.ftz.f32 %f36, %f34, %f35;\n" -" .loc 16 69 0\n" -" and.b32 %r41, %r38, 1073741823;\n" -" mov.u32 %r42, %r41;\n" -" mov.s32 %r43, 0;\n" -" mov.u32 %r44, %r43;\n" -" mov.s32 %r45, 0;\n" -" mov.u32 %r46, %r45;\n" -" mov.s32 %r47, 0;\n" -" mov.u32 %r48, %r47;\n" -" tex.1d.v4.f32.s32 {%f37,%f38,%f39,%f40},[pos_tex,{%r42,%r44,%r46,%r48}];\n" -" mov.f32 %f41, %f37;\n" -" mov.f32 %f42, %f38;\n" -" mov.f32 %f43, %f39;\n" -" sub.ftz.f32 %f44, %f22, %f42;\n" -" sub.ftz.f32 %f45, %f21, %f41;\n" -" sub.ftz.f32 %f46, %f23, %f43;\n" -" mul.ftz.f32 %f47, %f44, %f44;\n" -" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n" -" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n" -" setp.lt.ftz.f32 %p4, %f49, %f29;\n" -" @!%p4 bra $Lt_0_20994;\n" -" .loc 20 518 0\n" -" rcp.approx.ftz.f32 %f50, %f49;\n" -" rsqrt.approx.ftz.f32 %f51, %f50;\n" -" ld.param.f32 %f52, [__cudaparm_kernel_pair_g_ewald];\n" -" mul.ftz.f32 %f53, %f52, %f51;\n" -" mul.ftz.f32 %f54, %f53, %f53;\n" -" neg.ftz.f32 %f55, %f54;\n" -" mov.f32 %f56, 0f3fb8aa3b; \n" -" mul.ftz.f32 %f57, %f55, %f56;\n" -" ex2.approx.ftz.f32 %f58, %f57;\n" -" .loc 16 85 0\n" -" mov.f32 %f59, 0f3f800000; \n" -" mov.f32 %f60, 0f3ea7ba05; \n" -" fma.rn.ftz.f32 %f61, %f60, %f53, %f59;\n" -" rcp.approx.ftz.f32 %f62, %f61;\n" -" mov.f32 %f63, 0f3e827906; \n" -" mov.f32 %f64, 0fbe91a98e; \n" -" mov.f32 %f65, 0f3fb5f0e3; \n" -" mov.f32 %f66, 0fbfba00e3; \n" -" mov.f32 %f67, 0f3f87dc22; \n" -" fma.rn.ftz.f32 %f68, %f67, %f62, %f66;\n" -" fma.rn.ftz.f32 %f69, %f62, %f68, %f65;\n" -" fma.rn.ftz.f32 %f70, %f62, %f69, %f64;\n" -" fma.rn.ftz.f32 %f71, %f62, %f70, %f63;\n" -" mul.ftz.f32 %f72, %f62, %f71;\n" -" mul.ftz.f32 %f73, %f58, %f72;\n" -" .loc 16 86 0\n" -" mov.u32 %r49, %r41;\n" -" mov.s32 %r50, 0;\n" -" mov.u32 %r51, %r50;\n" -" mov.s32 %r52, 0;\n" -" mov.u32 %r53, %r52;\n" -" mov.s32 %r54, 0;\n" -" mov.u32 %r55, %r54;\n" -" tex.1d.v4.f32.s32 {%f74,%f75,%f76,%f77},[q_tex,{%r49,%r51,%r53,%r55}];\n" -" mov.f32 %f78, %f74;\n" -" .loc 16 87 0\n" -" ld.param.f32 %f79, [__cudaparm_kernel_pair_qqrd2e];\n" -" mul.ftz.f32 %f80, %f79, %f28;\n" -" mul.ftz.f32 %f81, %f80, %f78;\n" -" div.approx.ftz.f32 %f82, %f81, %f51;\n" -" mov.f32 %f83, 0f3f906ebb; \n" -" mul.ftz.f32 %f84, %f53, %f83;\n" -" fma.rn.ftz.f32 %f85, %f58, %f84, %f73;\n" -" sub.ftz.f32 %f86, %f85, %f36;\n" -" mul.ftz.f32 %f87, %f82, %f86;\n" -" mul.ftz.f32 %f88, %f50, %f87;\n" -" .loc 16 89 0\n" -" fma.rn.ftz.f32 %f32, %f45, %f88, %f32;\n" -" .loc 16 90 0\n" -" fma.rn.ftz.f32 %f31, %f44, %f88, %f31;\n" -" .loc 16 91 0\n" -" fma.rn.ftz.f32 %f30, %f46, %f88, %f30;\n" -" .loc 16 78 0\n" -" sub.ftz.f32 %f89, %f73, %f36;\n" -" fma.rn.ftz.f32 %f90, %f82, %f89, %f33;\n" -" ld.param.s32 %r56, [__cudaparm_kernel_pair_eflag];\n" -" mov.s32 %r57, 0;\n" -" setp.gt.s32 %p5, %r56, %r57;\n" -" selp.f32 %f33, %f90, %f33, %p5;\n" -" ld.param.s32 %r58, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r59, 0;\n" -" setp.le.s32 %p6, %r58, %r59;\n" -" @%p6 bra $Lt_0_20994;\n" -" .loc 16 97 0\n" -" mov.f32 %f91, %f6;\n" -" mul.ftz.f32 %f92, %f45, %f45;\n" -" fma.rn.ftz.f32 %f93, %f88, %f92, %f91;\n" -" mov.f32 %f6, %f93;\n" -" .loc 16 98 0\n" -" mov.f32 %f94, %f8;\n" -" fma.rn.ftz.f32 %f95, %f88, %f47, %f94;\n" -" mov.f32 %f8, %f95;\n" -" .loc 16 99 0\n" -" mov.f32 %f96, %f10;\n" -" mul.ftz.f32 %f97, %f46, %f46;\n" -" fma.rn.ftz.f32 %f98, %f88, %f97, %f96;\n" -" mov.f32 %f10, %f98;\n" -" .loc 16 100 0\n" -" mov.f32 %f99, %f12;\n" -" mul.ftz.f32 %f100, %f44, %f45;\n" -" fma.rn.ftz.f32 %f101, %f88, %f100, %f99;\n" -" mov.f32 %f12, %f101;\n" -" .loc 16 101 0\n" -" mov.f32 %f102, %f14;\n" -" mul.ftz.f32 %f103, %f45, %f46;\n" -" fma.rn.ftz.f32 %f104, %f88, %f103, %f102;\n" -" mov.f32 %f14, %f104;\n" -" .loc 16 102 0\n" -" mul.ftz.f32 %f105, %f44, %f46;\n" -" fma.rn.ftz.f32 %f15, %f88, %f105, %f15;\n" -" mov.f32 %f16, %f15;\n" -"$Lt_0_20994:\n" -"$Lt_0_20482:\n" -" .loc 16 62 0\n" -" mul.lo.u64 %rd32, %rd27, 4;\n" -" add.u64 %rd20, %rd20, %rd32;\n" -" setp.lt.u64 %p7, %rd20, %rd19;\n" -" @%p7 bra $Lt_0_20226;\n" -" bra.uni $Lt_0_19714;\n" -"$Lt_0_27394:\n" -" mov.f32 %f30, 0f00000000; \n" -" mov.f32 %f31, 0f00000000; \n" -" mov.f32 %f32, 0f00000000; \n" -" mov.f32 %f33, 0f00000000; \n" -"$Lt_0_19714:\n" -" mov.u32 %r60, 1;\n" -" setp.le.s32 %p8, %r1, %r60;\n" -" @%p8 bra $Lt_0_23810;\n" -" .loc 16 112 0\n" -" mov.u64 %rd33, __cuda___cuda_local_var_32611_37_non_const_red_acc128;\n" -" cvt.s64.s32 %rd34, %r2;\n" -" mul.wide.s32 %rd35, %r2, 4;\n" -" add.u64 %rd36, %rd33, %rd35;\n" -" mov.f32 %f106, %f32;\n" -" st.shared.f32 [%rd36+0], %f106;\n" -" .loc 16 113 0\n" -" mov.f32 %f107, %f31;\n" -" st.shared.f32 [%rd36+512], %f107;\n" -" .loc 16 114 0\n" -" mov.f32 %f108, %f30;\n" -" st.shared.f32 [%rd36+1024], %f108;\n" -" .loc 16 115 0\n" -" mov.f32 %f109, %f33;\n" -" st.shared.f32 [%rd36+1536], %f109;\n" -" .loc 16 117 0\n" -" shr.s32 %r61, %r1, 31;\n" -" mov.s32 %r62, 1;\n" -" and.b32 %r63, %r61, %r62;\n" -" add.s32 %r64, %r63, %r1;\n" -" shr.s32 %r65, %r64, 1;\n" -" mov.s32 %r66, %r65;\n" -" mov.u32 %r67, 0;\n" -" setp.ne.u32 %p9, %r65, %r67;\n" -" @!%p9 bra $Lt_0_22274;\n" -"$Lt_0_22786:\n" -" setp.ge.u32 %p10, %r14, %r66;\n" -" @%p10 bra $Lt_0_23042;\n" -" .loc 16 120 0\n" -" add.u32 %r68, %r2, %r66;\n" -" cvt.u64.u32 %rd37, %r68;\n" -" mul.wide.u32 %rd38, %r68, 4;\n" -" add.u64 %rd39, %rd33, %rd38;\n" -" ld.shared.f32 %f110, [%rd39+0];\n" -" add.ftz.f32 %f106, %f110, %f106;\n" -" st.shared.f32 [%rd36+0], %f106;\n" -" ld.shared.f32 %f111, [%rd39+512];\n" -" add.ftz.f32 %f107, %f111, %f107;\n" -" st.shared.f32 [%rd36+512], %f107;\n" -" ld.shared.f32 %f112, [%rd39+1024];\n" -" add.ftz.f32 %f108, %f112, %f108;\n" -" st.shared.f32 [%rd36+1024], %f108;\n" -" ld.shared.f32 %f113, [%rd39+1536];\n" -" add.ftz.f32 %f109, %f113, %f109;\n" -" st.shared.f32 [%rd36+1536], %f109;\n" -"$Lt_0_23042:\n" -" .loc 16 117 0\n" -" shr.u32 %r66, %r66, 1;\n" -" mov.u32 %r69, 0;\n" -" setp.ne.u32 %p11, %r66, %r69;\n" -" @%p11 bra $Lt_0_22786;\n" -"$Lt_0_22274:\n" -" .loc 16 124 0\n" -" mov.f32 %f32, %f106;\n" -" .loc 16 125 0\n" -" mov.f32 %f31, %f107;\n" -" .loc 16 126 0\n" -" mov.f32 %f30, %f108;\n" -" .loc 16 127 0\n" -" mov.f32 %f33, %f109;\n" -" ld.param.s32 %r70, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r71, 0;\n" -" setp.le.s32 %p12, %r70, %r71;\n" -" @%p12 bra $Lt_0_23810;\n" -" .loc 16 131 0\n" -" mov.f32 %f106, %f6;\n" -" st.shared.f32 [%rd36+0], %f106;\n" -" mov.f32 %f107, %f8;\n" -" st.shared.f32 [%rd36+512], %f107;\n" -" mov.f32 %f108, %f10;\n" -" st.shared.f32 [%rd36+1024], %f108;\n" -" mov.f32 %f109, %f12;\n" -" st.shared.f32 [%rd36+1536], %f109;\n" -" mov.f32 %f114, %f14;\n" -" st.shared.f32 [%rd36+2048], %f114;\n" -" mov.f32 %f115, %f15;\n" -" st.shared.f32 [%rd36+2560], %f115;\n" -" .loc 16 133 0\n" -" mov.s32 %r72, %r65;\n" -" @!%p9 bra $Lt_0_24322;\n" -"$Lt_0_24834:\n" -" setp.ge.u32 %p13, %r14, %r72;\n" -" @%p13 bra $Lt_0_25090;\n" -" .loc 16 136 0\n" -" add.u32 %r73, %r2, %r72;\n" -" cvt.u64.u32 %rd40, %r73;\n" -" mul.wide.u32 %rd41, %r73, 4;\n" -" add.u64 %rd42, %rd33, %rd41;\n" -" ld.shared.f32 %f116, [%rd42+0];\n" -" add.ftz.f32 %f106, %f116, %f106;\n" -" st.shared.f32 [%rd36+0], %f106;\n" -" ld.shared.f32 %f117, [%rd42+512];\n" -" add.ftz.f32 %f107, %f117, %f107;\n" -" st.shared.f32 [%rd36+512], %f107;\n" -" ld.shared.f32 %f118, [%rd42+1024];\n" -" add.ftz.f32 %f108, %f118, %f108;\n" -" st.shared.f32 [%rd36+1024], %f108;\n" -" ld.shared.f32 %f119, [%rd42+1536];\n" -" add.ftz.f32 %f109, %f119, %f109;\n" -" st.shared.f32 [%rd36+1536], %f109;\n" -" ld.shared.f32 %f120, [%rd42+2048];\n" -" add.ftz.f32 %f114, %f120, %f114;\n" -" st.shared.f32 [%rd36+2048], %f114;\n" -" ld.shared.f32 %f121, [%rd42+2560];\n" -" add.ftz.f32 %f115, %f121, %f115;\n" -" st.shared.f32 [%rd36+2560], %f115;\n" -"$Lt_0_25090:\n" -" .loc 16 133 0\n" -" shr.u32 %r72, %r72, 1;\n" -" mov.u32 %r74, 0;\n" -" setp.ne.u32 %p14, %r72, %r74;\n" -" @%p14 bra $Lt_0_24834;\n" -"$Lt_0_24322:\n" -" .loc 16 141 0\n" -" mov.f32 %f6, %f106;\n" -" mov.f32 %f8, %f107;\n" -" mov.f32 %f10, %f108;\n" -" mov.f32 %f12, %f109;\n" -" mov.f32 %f14, %f114;\n" -" mov.f32 %f16, %f115;\n" -"$Lt_0_23810:\n" -"$Lt_0_21762:\n" -" mov.u32 %r75, 0;\n" -" setp.ne.s32 %p15, %r14, %r75;\n" -" @%p15 bra $Lt_0_25858;\n" -" .loc 16 147 0\n" -" ld.param.u64 %rd43, [__cudaparm_kernel_pair_engv];\n" -" add.u64 %rd44, %rd43, %rd3;\n" -" ld.param.s32 %r76, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r77, 0;\n" -" setp.le.s32 %p16, %r76, %r77;\n" -" @%p16 bra $Lt_0_26370;\n" -" .loc 16 149 0\n" -" mov.f32 %f122, 0f00000000; \n" -" st.global.f32 [%rd44+0], %f122;\n" -" .loc 16 150 0\n" -" cvt.s64.s32 %rd45, %r9;\n" -" mul.wide.s32 %rd46, %r9, 4;\n" -" add.u64 %rd47, %rd46, %rd44;\n" -" .loc 16 151 0\n" -" st.global.f32 [%rd47+0], %f33;\n" -" .loc 16 152 0\n" -" add.u64 %rd44, %rd46, %rd47;\n" -"$Lt_0_26370:\n" -" ld.param.s32 %r78, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r79, 0;\n" -" setp.le.s32 %p17, %r78, %r79;\n" -" @%p17 bra $Lt_0_26882;\n" -" .loc 16 156 0\n" -" mov.f32 %f123, %f6;\n" -" st.global.f32 [%rd44+0], %f123;\n" -" .loc 16 157 0\n" -" cvt.s64.s32 %rd48, %r9;\n" -" mul.wide.s32 %rd49, %r9, 4;\n" -" add.u64 %rd50, %rd49, %rd44;\n" -" .loc 16 156 0\n" -" mov.f32 %f124, %f8;\n" -" st.global.f32 [%rd50+0], %f124;\n" -" .loc 16 157 0\n" -" add.u64 %rd51, %rd49, %rd50;\n" -" .loc 16 156 0\n" -" mov.f32 %f125, %f10;\n" -" st.global.f32 [%rd51+0], %f125;\n" -" .loc 16 157 0\n" -" add.u64 %rd52, %rd49, %rd51;\n" -" .loc 16 156 0\n" -" mov.f32 %f126, %f12;\n" -" st.global.f32 [%rd52+0], %f126;\n" -" .loc 16 157 0\n" -" add.u64 %rd44, %rd49, %rd52;\n" -" .loc 16 156 0\n" -" mov.f32 %f127, %f14;\n" -" st.global.f32 [%rd44+0], %f127;\n" -" mov.f32 %f128, %f16;\n" -" add.u64 %rd53, %rd49, %rd44;\n" -" st.global.f32 [%rd53+0], %f128;\n" -"$Lt_0_26882:\n" -" .loc 16 160 0\n" -" ld.param.u64 %rd54, [__cudaparm_kernel_pair_ans];\n" -" mul.lo.u64 %rd55, %rd2, 16;\n" -" add.u64 %rd56, %rd54, %rd55;\n" -" mov.f32 %f129, %f130;\n" -" st.global.v4.f32 [%rd56+0], {%f32,%f31,%f30,%f129};\n" -"$Lt_0_25858:\n" -"$Lt_0_18690:\n" -" .loc 16 163 0\n" -" exit;\n" -"$LDWend_kernel_pair:\n" -" }\n" -" .entry kernel_pair_fast (\n" -" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_sp_cl_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" -" .param .u64 __cudaparm_kernel_pair_fast_engv,\n" -" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" -" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" -" .param .u64 __cudaparm_kernel_pair_fast_q_,\n" -" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n" -" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n" -" .param .f32 __cudaparm_kernel_pair_fast_g_ewald,\n" -" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" -" {\n" -" .reg .u32 %r<82>;\n" -" .reg .u64 %rd<62>;\n" -" .reg .f32 %f<129>;\n" -" .reg .pred %p<20>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32678_33_non_const_sp_cl3304[16];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32748_37_non_const_red_acc3320[3072];\n" -" .loc 16 173 0\n" -"$LDWbegin_kernel_pair_fast:\n" -" cvt.s32.u32 %r1, %tid.x;\n" -" mov.u32 %r2, 3;\n" -" setp.gt.s32 %p1, %r1, %r2;\n" -" @%p1 bra $Lt_1_19458;\n" -" .loc 16 179 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32678_33_non_const_sp_cl3304;\n" -" cvt.s64.s32 %rd2, %r1;\n" -" mul.wide.s32 %rd3, %r1, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_cl_in];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f32 %f1, [%rd5+0];\n" -" add.u64 %rd6, %rd3, %rd1;\n" -" st.shared.f32 [%rd6+0], %f1;\n" -"$Lt_1_19458:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32678_33_non_const_sp_cl3304;\n" -" .loc 16 186 0\n" -" mov.f32 %f2, 0f00000000; \n" -" mov.f32 %f3, %f2;\n" -" mov.f32 %f4, 0f00000000; \n" -" mov.f32 %f5, %f4;\n" -" mov.f32 %f6, 0f00000000; \n" -" mov.f32 %f7, %f6;\n" -" mov.f32 %f8, 0f00000000; \n" -" mov.f32 %f9, %f8;\n" -" mov.f32 %f10, 0f00000000; \n" -" mov.f32 %f11, %f10;\n" -" mov.f32 %f12, 0f00000000; \n" -" mov.f32 %f13, %f12;\n" -" .loc 16 188 0\n" -" bar.sync 0;\n" -" ld.param.s32 %r3, [__cudaparm_kernel_pair_fast_t_per_atom];\n" -" div.s32 %r4, %r1, %r3;\n" -" cvt.s32.u32 %r5, %ntid.x;\n" -" div.s32 %r6, %r5, %r3;\n" -" cvt.s32.u32 %r7, %ctaid.x;\n" -" mul.lo.s32 %r8, %r7, %r6;\n" -" add.s32 %r9, %r4, %r8;\n" -" ld.param.s32 %r10, [__cudaparm_kernel_pair_fast_inum];\n" -" setp.ge.s32 %p2, %r9, %r10;\n" -" @%p2 bra $Lt_1_27138;\n" -" .loc 16 193 0\n" -" cvt.s64.s32 %rd7, %r9;\n" -" mul.wide.s32 %rd8, %r9, 4;\n" -" ld.param.u64 %rd9, [__cudaparm_kernel_pair_fast_dev_nbor];\n" -" add.u64 %rd10, %rd8, %rd9;\n" -" ld.global.s32 %r11, [%rd10+0];\n" -" ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" -" cvt.s64.s32 %rd11, %r12;\n" -" mul.wide.s32 %rd12, %r12, 4;\n" -" add.u64 %rd13, %rd12, %rd10;\n" -" ld.global.s32 %r13, [%rd13+0];\n" -" sub.s32 %r14, %r3, 1;\n" -" and.b32 %r15, %r14, %r1;\n" -" cvt.s64.s32 %rd14, %r15;\n" -" mul.wide.s32 %rd15, %r15, 4;\n" -" ld.param.u64 %rd16, [__cudaparm_kernel_pair_fast_dev_packed];\n" -" setp.ne.u64 %p3, %rd16, %rd9;\n" -" @%p3 bra $Lt_1_20738;\n" -" cvt.s32.s64 %r16, %rd11;\n" -" mul.lo.s32 %r17, %r16, %r3;\n" -" mov.s32 %r18, %r17;\n" -" mul.lo.s32 %r19, %r14, %r9;\n" -" add.s32 %r20, %r16, %r19;\n" -" cvt.s64.s32 %rd17, %r20;\n" -" mul.wide.s32 %rd18, %r20, 4;\n" -" add.u64 %rd19, %rd13, %rd18;\n" -" and.b32 %r21, %r14, %r13;\n" -" cvt.s64.s32 %rd20, %r21;\n" -" div.s32 %r22, %r13, %r3;\n" -" mul.lo.s32 %r23, %r17, %r22;\n" -" cvt.s64.s32 %rd21, %r23;\n" -" add.u64 %rd22, %rd20, %rd21;\n" -" mul.lo.u64 %rd23, %rd22, 4;\n" -" add.u64 %rd24, %rd19, %rd23;\n" -" add.u64 %rd25, %rd15, %rd19;\n" -" bra.uni $Lt_1_20482;\n" -"$Lt_1_20738:\n" -" add.u64 %rd26, %rd12, %rd13;\n" -" ld.global.s32 %r24, [%rd26+0];\n" -" cvt.s64.s32 %rd27, %r24;\n" -" mul.wide.s32 %rd28, %r24, 4;\n" -" add.u64 %rd29, %rd16, %rd28;\n" -" cvt.s64.s32 %rd30, %r13;\n" -" mul.wide.s32 %rd31, %r13, 4;\n" -" add.u64 %rd24, %rd29, %rd31;\n" -" mov.s32 %r18, %r3;\n" -" add.u64 %rd25, %rd15, %rd29;\n" -"$Lt_1_20482:\n" -" .loc 16 196 0\n" -" mov.u32 %r25, %r11;\n" -" mov.s32 %r26, 0;\n" -" mov.u32 %r27, %r26;\n" -" mov.s32 %r28, 0;\n" -" mov.u32 %r29, %r28;\n" -" mov.s32 %r30, 0;\n" -" mov.u32 %r31, %r30;\n" -" tex.1d.v4.f32.s32 {%f14,%f15,%f16,%f17},[pos_tex,{%r25,%r27,%r29,%r31}];\n" -" mov.f32 %f18, %f14;\n" -" mov.f32 %f19, %f15;\n" -" mov.f32 %f20, %f16;\n" -" .loc 16 197 0\n" -" mov.u32 %r32, %r11;\n" -" mov.s32 %r33, 0;\n" -" mov.u32 %r34, %r33;\n" -" mov.s32 %r35, 0;\n" -" mov.u32 %r36, %r35;\n" -" mov.s32 %r37, 0;\n" -" mov.u32 %r38, %r37;\n" -" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[q_tex,{%r32,%r34,%r36,%r38}];\n" -" mov.f32 %f25, %f21;\n" -" setp.ge.u64 %p4, %rd25, %rd24;\n" -" @%p4 bra $Lt_1_28674;\n" -" cvt.s64.s32 %rd32, %r18;\n" -" ld.param.f32 %f26, [__cudaparm_kernel_pair_fast_cut_coulsq];\n" -" mov.f32 %f27, 0f00000000; \n" -" mov.f32 %f28, 0f00000000; \n" -" mov.f32 %f29, 0f00000000; \n" -" mov.f32 %f30, 0f00000000; \n" -"$Lt_1_21506:\n" -" .loc 16 200 0\n" -" ld.global.s32 %r39, [%rd25+0];\n" -" .loc 16 203 0\n" -" mov.f32 %f31, 0f3f800000; \n" -" shr.s32 %r40, %r39, 30;\n" -" and.b32 %r41, %r40, 3;\n" -" cvt.s64.s32 %rd33, %r41;\n" -" mul.wide.s32 %rd34, %r41, 4;\n" -" add.u64 %rd35, %rd1, %rd34;\n" -" ld.shared.f32 %f32, [%rd35+0];\n" -" sub.ftz.f32 %f33, %f31, %f32;\n" -" .loc 16 206 0\n" -" and.b32 %r42, %r39, 1073741823;\n" -" mov.u32 %r43, %r42;\n" -" mov.s32 %r44, 0;\n" -" mov.u32 %r45, %r44;\n" -" mov.s32 %r46, 0;\n" -" mov.u32 %r47, %r46;\n" -" mov.s32 %r48, 0;\n" -" mov.u32 %r49, %r48;\n" -" tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r43,%r45,%r47,%r49}];\n" -" mov.f32 %f38, %f34;\n" -" mov.f32 %f39, %f35;\n" -" mov.f32 %f40, %f36;\n" -" sub.ftz.f32 %f41, %f19, %f39;\n" -" sub.ftz.f32 %f42, %f18, %f38;\n" -" sub.ftz.f32 %f43, %f20, %f40;\n" -" mul.ftz.f32 %f44, %f41, %f41;\n" -" fma.rn.ftz.f32 %f45, %f42, %f42, %f44;\n" -" fma.rn.ftz.f32 %f46, %f43, %f43, %f45;\n" -" setp.lt.ftz.f32 %p5, %f46, %f26;\n" -" @!%p5 bra $Lt_1_22274;\n" -" .loc 20 518 0\n" -" rcp.approx.ftz.f32 %f47, %f46;\n" -" rsqrt.approx.ftz.f32 %f48, %f47;\n" -" ld.param.f32 %f49, [__cudaparm_kernel_pair_fast_g_ewald];\n" -" mul.ftz.f32 %f50, %f49, %f48;\n" -" mul.ftz.f32 %f51, %f50, %f50;\n" -" neg.ftz.f32 %f52, %f51;\n" -" mov.f32 %f53, 0f3fb8aa3b; \n" -" mul.ftz.f32 %f54, %f52, %f53;\n" -" ex2.approx.ftz.f32 %f55, %f54;\n" -" .loc 16 222 0\n" -" mov.f32 %f56, 0f3f800000; \n" -" mov.f32 %f57, 0f3ea7ba05; \n" -" fma.rn.ftz.f32 %f58, %f57, %f50, %f56;\n" -" rcp.approx.ftz.f32 %f59, %f58;\n" -" mov.f32 %f60, 0f3e827906; \n" -" mov.f32 %f61, 0fbe91a98e; \n" -" mov.f32 %f62, 0f3fb5f0e3; \n" -" mov.f32 %f63, 0fbfba00e3; \n" -" mov.f32 %f64, 0f3f87dc22; \n" -" fma.rn.ftz.f32 %f65, %f64, %f59, %f63;\n" -" fma.rn.ftz.f32 %f66, %f59, %f65, %f62;\n" -" fma.rn.ftz.f32 %f67, %f59, %f66, %f61;\n" -" fma.rn.ftz.f32 %f68, %f59, %f67, %f60;\n" -" mul.ftz.f32 %f69, %f59, %f68;\n" -" mul.ftz.f32 %f70, %f55, %f69;\n" -" .loc 16 223 0\n" -" mov.u32 %r50, %r42;\n" -" mov.s32 %r51, 0;\n" -" mov.u32 %r52, %r51;\n" -" mov.s32 %r53, 0;\n" -" mov.u32 %r54, %r53;\n" -" mov.s32 %r55, 0;\n" -" mov.u32 %r56, %r55;\n" -" tex.1d.v4.f32.s32 {%f71,%f72,%f73,%f74},[q_tex,{%r50,%r52,%r54,%r56}];\n" -" mov.f32 %f75, %f71;\n" -" .loc 16 224 0\n" -" ld.param.f32 %f76, [__cudaparm_kernel_pair_fast_qqrd2e];\n" -" mul.ftz.f32 %f77, %f76, %f25;\n" -" mul.ftz.f32 %f78, %f77, %f75;\n" -" div.approx.ftz.f32 %f79, %f78, %f48;\n" -" mov.f32 %f80, 0f3f906ebb; \n" -" mul.ftz.f32 %f81, %f50, %f80;\n" -" fma.rn.ftz.f32 %f82, %f55, %f81, %f70;\n" -" sub.ftz.f32 %f83, %f82, %f33;\n" -" mul.ftz.f32 %f84, %f79, %f83;\n" -" mul.ftz.f32 %f85, %f47, %f84;\n" -" .loc 16 226 0\n" -" fma.rn.ftz.f32 %f29, %f42, %f85, %f29;\n" -" .loc 16 227 0\n" -" fma.rn.ftz.f32 %f28, %f41, %f85, %f28;\n" -" .loc 16 228 0\n" -" fma.rn.ftz.f32 %f27, %f43, %f85, %f27;\n" -" .loc 16 215 0\n" -" sub.ftz.f32 %f86, %f70, %f33;\n" -" fma.rn.ftz.f32 %f87, %f79, %f86, %f30;\n" -" ld.param.s32 %r57, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.s32 %r58, 0;\n" -" setp.gt.s32 %p6, %r57, %r58;\n" -" selp.f32 %f30, %f87, %f30, %p6;\n" -" ld.param.s32 %r59, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r60, 0;\n" -" setp.le.s32 %p7, %r59, %r60;\n" -" @%p7 bra $Lt_1_22274;\n" -" .loc 16 234 0\n" -" mov.f32 %f88, %f3;\n" -" mul.ftz.f32 %f89, %f42, %f42;\n" -" fma.rn.ftz.f32 %f90, %f85, %f89, %f88;\n" -" mov.f32 %f3, %f90;\n" -" .loc 16 235 0\n" -" mov.f32 %f91, %f5;\n" -" fma.rn.ftz.f32 %f92, %f85, %f44, %f91;\n" -" mov.f32 %f5, %f92;\n" -" .loc 16 236 0\n" -" mov.f32 %f93, %f7;\n" -" mul.ftz.f32 %f94, %f43, %f43;\n" -" fma.rn.ftz.f32 %f95, %f85, %f94, %f93;\n" -" mov.f32 %f7, %f95;\n" -" .loc 16 237 0\n" -" mov.f32 %f96, %f9;\n" -" mul.ftz.f32 %f97, %f41, %f42;\n" -" fma.rn.ftz.f32 %f98, %f85, %f97, %f96;\n" -" mov.f32 %f9, %f98;\n" -" .loc 16 238 0\n" -" mov.f32 %f99, %f11;\n" -" mul.ftz.f32 %f100, %f42, %f43;\n" -" fma.rn.ftz.f32 %f101, %f85, %f100, %f99;\n" -" mov.f32 %f11, %f101;\n" -" .loc 16 239 0\n" -" mul.ftz.f32 %f102, %f41, %f43;\n" -" fma.rn.ftz.f32 %f12, %f85, %f102, %f12;\n" -" mov.f32 %f13, %f12;\n" -"$Lt_1_22274:\n" -"$Lt_1_21762:\n" -" .loc 16 199 0\n" -" mul.lo.u64 %rd36, %rd32, 4;\n" -" add.u64 %rd25, %rd25, %rd36;\n" -" setp.lt.u64 %p8, %rd25, %rd24;\n" -" @%p8 bra $Lt_1_21506;\n" -" bra.uni $Lt_1_20994;\n" -"$Lt_1_28674:\n" -" mov.f32 %f27, 0f00000000; \n" -" mov.f32 %f28, 0f00000000; \n" -" mov.f32 %f29, 0f00000000; \n" -" mov.f32 %f30, 0f00000000; \n" -"$Lt_1_20994:\n" -" mov.u32 %r61, 1;\n" -" setp.le.s32 %p9, %r3, %r61;\n" -" @%p9 bra $Lt_1_25090;\n" -" .loc 16 249 0\n" -" mov.u64 %rd37, __cuda___cuda_local_var_32748_37_non_const_red_acc3320;\n" -" cvt.s64.s32 %rd38, %r1;\n" -" mul.wide.s32 %rd39, %r1, 4;\n" -" add.u64 %rd40, %rd37, %rd39;\n" -" mov.f32 %f103, %f29;\n" -" st.shared.f32 [%rd40+0], %f103;\n" -" .loc 16 250 0\n" -" mov.f32 %f104, %f28;\n" -" st.shared.f32 [%rd40+512], %f104;\n" -" .loc 16 251 0\n" -" mov.f32 %f105, %f27;\n" -" st.shared.f32 [%rd40+1024], %f105;\n" -" .loc 16 252 0\n" -" mov.f32 %f106, %f30;\n" -" st.shared.f32 [%rd40+1536], %f106;\n" -" .loc 16 254 0\n" -" shr.s32 %r62, %r3, 31;\n" -" mov.s32 %r63, 1;\n" -" and.b32 %r64, %r62, %r63;\n" -" add.s32 %r65, %r64, %r3;\n" -" shr.s32 %r66, %r65, 1;\n" -" mov.s32 %r67, %r66;\n" -" mov.u32 %r68, 0;\n" -" setp.ne.u32 %p10, %r66, %r68;\n" -" @!%p10 bra $Lt_1_23554;\n" -"$Lt_1_24066:\n" -" setp.ge.u32 %p11, %r15, %r67;\n" -" @%p11 bra $Lt_1_24322;\n" -" .loc 16 257 0\n" -" add.u32 %r69, %r1, %r67;\n" -" cvt.u64.u32 %rd41, %r69;\n" -" mul.wide.u32 %rd42, %r69, 4;\n" -" add.u64 %rd43, %rd37, %rd42;\n" -" ld.shared.f32 %f107, [%rd43+0];\n" -" add.ftz.f32 %f103, %f107, %f103;\n" -" st.shared.f32 [%rd40+0], %f103;\n" -" ld.shared.f32 %f108, [%rd43+512];\n" -" add.ftz.f32 %f104, %f108, %f104;\n" -" st.shared.f32 [%rd40+512], %f104;\n" -" ld.shared.f32 %f109, [%rd43+1024];\n" -" add.ftz.f32 %f105, %f109, %f105;\n" -" st.shared.f32 [%rd40+1024], %f105;\n" -" ld.shared.f32 %f110, [%rd43+1536];\n" -" add.ftz.f32 %f106, %f110, %f106;\n" -" st.shared.f32 [%rd40+1536], %f106;\n" -"$Lt_1_24322:\n" -" .loc 16 254 0\n" -" shr.u32 %r67, %r67, 1;\n" -" mov.u32 %r70, 0;\n" -" setp.ne.u32 %p12, %r67, %r70;\n" -" @%p12 bra $Lt_1_24066;\n" -"$Lt_1_23554:\n" -" .loc 16 261 0\n" -" mov.f32 %f29, %f103;\n" -" .loc 16 262 0\n" -" mov.f32 %f28, %f104;\n" -" .loc 16 263 0\n" -" mov.f32 %f27, %f105;\n" -" .loc 16 264 0\n" -" mov.f32 %f30, %f106;\n" -" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r72, 0;\n" -" setp.le.s32 %p13, %r71, %r72;\n" -" @%p13 bra $Lt_1_25090;\n" -" .loc 16 268 0\n" -" mov.f32 %f103, %f3;\n" -" st.shared.f32 [%rd40+0], %f103;\n" -" mov.f32 %f104, %f5;\n" -" st.shared.f32 [%rd40+512], %f104;\n" -" mov.f32 %f105, %f7;\n" -" st.shared.f32 [%rd40+1024], %f105;\n" -" mov.f32 %f106, %f9;\n" -" st.shared.f32 [%rd40+1536], %f106;\n" -" mov.f32 %f111, %f11;\n" -" st.shared.f32 [%rd40+2048], %f111;\n" -" mov.f32 %f112, %f12;\n" -" st.shared.f32 [%rd40+2560], %f112;\n" -" .loc 16 270 0\n" -" mov.s32 %r73, %r66;\n" -" @!%p10 bra $Lt_1_25602;\n" -"$Lt_1_26114:\n" -" setp.ge.u32 %p14, %r15, %r73;\n" -" @%p14 bra $Lt_1_26370;\n" -" .loc 16 273 0\n" -" add.u32 %r74, %r1, %r73;\n" -" cvt.u64.u32 %rd44, %r74;\n" -" mul.wide.u32 %rd45, %r74, 4;\n" -" add.u64 %rd46, %rd37, %rd45;\n" -" ld.shared.f32 %f113, [%rd46+0];\n" -" add.ftz.f32 %f103, %f113, %f103;\n" -" st.shared.f32 [%rd40+0], %f103;\n" -" ld.shared.f32 %f114, [%rd46+512];\n" -" add.ftz.f32 %f104, %f114, %f104;\n" -" st.shared.f32 [%rd40+512], %f104;\n" -" ld.shared.f32 %f115, [%rd46+1024];\n" -" add.ftz.f32 %f105, %f115, %f105;\n" -" st.shared.f32 [%rd40+1024], %f105;\n" -" ld.shared.f32 %f116, [%rd46+1536];\n" -" add.ftz.f32 %f106, %f116, %f106;\n" -" st.shared.f32 [%rd40+1536], %f106;\n" -" ld.shared.f32 %f117, [%rd46+2048];\n" -" add.ftz.f32 %f111, %f117, %f111;\n" -" st.shared.f32 [%rd40+2048], %f111;\n" -" ld.shared.f32 %f118, [%rd46+2560];\n" -" add.ftz.f32 %f112, %f118, %f112;\n" -" st.shared.f32 [%rd40+2560], %f112;\n" -"$Lt_1_26370:\n" -" .loc 16 270 0\n" -" shr.u32 %r73, %r73, 1;\n" -" mov.u32 %r75, 0;\n" -" setp.ne.u32 %p15, %r73, %r75;\n" -" @%p15 bra $Lt_1_26114;\n" -"$Lt_1_25602:\n" -" .loc 16 278 0\n" -" mov.f32 %f3, %f103;\n" -" mov.f32 %f5, %f104;\n" -" mov.f32 %f7, %f105;\n" -" mov.f32 %f9, %f106;\n" -" mov.f32 %f11, %f111;\n" -" mov.f32 %f13, %f112;\n" -"$Lt_1_25090:\n" -"$Lt_1_23042:\n" -" mov.u32 %r76, 0;\n" -" setp.ne.s32 %p16, %r15, %r76;\n" -" @%p16 bra $Lt_1_27138;\n" -" .loc 16 284 0\n" -" ld.param.u64 %rd47, [__cudaparm_kernel_pair_fast_engv];\n" -" add.u64 %rd48, %rd47, %rd8;\n" -" ld.param.s32 %r77, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r78, 0;\n" -" setp.le.s32 %p17, %r77, %r78;\n" -" @%p17 bra $Lt_1_27650;\n" -" .loc 16 286 0\n" -" mov.f32 %f119, 0f00000000; \n" -" st.global.f32 [%rd48+0], %f119;\n" -" .loc 16 287 0\n" -" cvt.s64.s32 %rd49, %r10;\n" -" mul.wide.s32 %rd50, %r10, 4;\n" -" add.u64 %rd51, %rd50, %rd48;\n" -" .loc 16 288 0\n" -" st.global.f32 [%rd51+0], %f30;\n" -" .loc 16 289 0\n" -" add.u64 %rd48, %rd50, %rd51;\n" -"$Lt_1_27650:\n" -" ld.param.s32 %r79, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r80, 0;\n" -" setp.le.s32 %p18, %r79, %r80;\n" -" @%p18 bra $Lt_1_28162;\n" -" .loc 16 293 0\n" -" mov.f32 %f120, %f3;\n" -" st.global.f32 [%rd48+0], %f120;\n" -" .loc 16 294 0\n" -" cvt.s64.s32 %rd52, %r10;\n" -" mul.wide.s32 %rd53, %r10, 4;\n" -" add.u64 %rd54, %rd53, %rd48;\n" -" .loc 16 293 0\n" -" mov.f32 %f121, %f5;\n" -" st.global.f32 [%rd54+0], %f121;\n" -" .loc 16 294 0\n" -" add.u64 %rd55, %rd53, %rd54;\n" -" .loc 16 293 0\n" -" mov.f32 %f122, %f7;\n" -" st.global.f32 [%rd55+0], %f122;\n" -" .loc 16 294 0\n" -" add.u64 %rd56, %rd53, %rd55;\n" -" .loc 16 293 0\n" -" mov.f32 %f123, %f9;\n" -" st.global.f32 [%rd56+0], %f123;\n" -" .loc 16 294 0\n" -" add.u64 %rd48, %rd53, %rd56;\n" -" .loc 16 293 0\n" -" mov.f32 %f124, %f11;\n" -" st.global.f32 [%rd48+0], %f124;\n" -" mov.f32 %f125, %f13;\n" -" add.u64 %rd57, %rd53, %rd48;\n" -" st.global.f32 [%rd57+0], %f125;\n" -"$Lt_1_28162:\n" -" .loc 16 297 0\n" -" ld.param.u64 %rd58, [__cudaparm_kernel_pair_fast_ans];\n" -" mul.lo.u64 %rd59, %rd7, 16;\n" -" add.u64 %rd60, %rd58, %rd59;\n" -" mov.f32 %f126, %f127;\n" -" st.global.v4.f32 [%rd60+0], {%f29,%f28,%f27,%f126};\n" -"$Lt_1_27138:\n" -"$Lt_1_19970:\n" -" .loc 16 300 0\n" -" exit;\n" -"$LDWend_kernel_pair_fast:\n" -" }\n" -; diff --git a/lib/gpu/cudpp.o b/lib/gpu/cudpp.o deleted file mode 100644 index dfce9b08ac..0000000000 Binary files a/lib/gpu/cudpp.o and /dev/null differ diff --git a/lib/gpu/cudpp_maximal_launch.o b/lib/gpu/cudpp_maximal_launch.o deleted file mode 100644 index 17e0b892a6..0000000000 Binary files a/lib/gpu/cudpp_maximal_launch.o and /dev/null differ diff --git a/lib/gpu/cudpp_plan.o b/lib/gpu/cudpp_plan.o deleted file mode 100644 index 219bfcc181..0000000000 Binary files a/lib/gpu/cudpp_plan.o and /dev/null differ diff --git a/lib/gpu/cudpp_plan_manager.o b/lib/gpu/cudpp_plan_manager.o deleted file mode 100644 index ff98061756..0000000000 Binary files a/lib/gpu/cudpp_plan_manager.o and /dev/null differ diff --git a/lib/gpu/device.ptx b/lib/gpu/device.ptx deleted file mode 100644 index c21e0acd2f..0000000000 --- a/lib/gpu/device.ptx +++ /dev/null @@ -1,134 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009a81_00000000-9_lal_device.cpp3.i (/home/sjplimp/ccBI#.zwVkZj) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009a81_00000000-8_lal_device.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_device.cu" - .file 17 "/usr/local/cuda/include/common_functions.h" - .file 18 "/usr/local/cuda/include/math_functions.h" - .file 19 "/usr/local/cuda/include/math_constants.h" - .file 20 "/usr/local/cuda/include/device_functions.h" - .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - - .entry kernel_zero ( - .param .u64 __cudaparm_kernel_zero_mem, - .param .s32 __cudaparm_kernel_zero_numel) - { - .reg .u32 %r<9>; - .reg .u64 %rd<6>; - .reg .pred %p<3>; - .loc 16 20 0 -$LDWbegin_kernel_zero: - cvt.s32.u32 %r1, %ctaid.x; - cvt.s32.u32 %r2, %ntid.x; - mul24.lo.s32 %r3, %r1, %r2; - mov.u32 %r4, %tid.x; - add.u32 %r5, %r3, %r4; - ld.param.s32 %r6, [__cudaparm_kernel_zero_numel]; - setp.le.s32 %p1, %r6, %r5; - @%p1 bra $Lt_0_1026; - .loc 16 24 0 - mov.s32 %r7, 0; - ld.param.u64 %rd1, [__cudaparm_kernel_zero_mem]; - cvt.s64.s32 %rd2, %r5; - mul.wide.s32 %rd3, %r5, 4; - add.u64 %rd4, %rd1, %rd3; - st.global.s32 [%rd4+0], %r7; -$Lt_0_1026: - .loc 16 25 0 - exit; -$LDWend_kernel_zero: - } // kernel_zero - - .entry kernel_info ( - .param .u64 __cudaparm_kernel_info_info) - { - .reg .u32 %r<16>; - .reg .u64 %rd<3>; - .loc 16 27 0 -$LDWbegin_kernel_info: - .loc 16 28 0 - ld.param.u64 %rd1, [__cudaparm_kernel_info_info]; - mov.s32 %r1, 200; - st.global.s32 [%rd1+0], %r1; - .loc 16 29 0 - mov.s32 %r2, 32; - st.global.s32 [%rd1+4], %r2; - .loc 16 30 0 - mov.s32 %r3, 32; - st.global.s32 [%rd1+8], %r3; - .loc 16 31 0 - mov.s32 %r4, 4; - st.global.s32 [%rd1+12], %r4; - .loc 16 32 0 - mov.s32 %r5, 8; - st.global.s32 [%rd1+16], %r5; - .loc 16 33 0 - mov.s32 %r6, 64; - st.global.s32 [%rd1+20], %r6; - .loc 16 34 0 - mov.s32 %r7, 128; - st.global.s32 [%rd1+24], %r7; - .loc 16 35 0 - mov.s32 %r8, 11; - st.global.s32 [%rd1+28], %r8; - .loc 16 36 0 - mov.s32 %r9, 8; - st.global.s32 [%rd1+32], %r9; - .loc 16 37 0 - mov.s32 %r10, 128; - st.global.s32 [%rd1+36], %r10; - .loc 16 38 0 - mov.s32 %r11, 128; - st.global.s32 [%rd1+40], %r11; - .loc 16 39 0 - mov.s32 %r12, 128; - st.global.s32 [%rd1+44], %r12; - .loc 16 40 0 - mov.s32 %r13, 128; - st.global.s32 [%rd1+48], %r13; - .loc 16 41 0 - mov.s32 %r14, 8; - st.global.s32 [%rd1+52], %r14; - .loc 16 42 0 - exit; -$LDWend_kernel_info: - } // kernel_info - diff --git a/lib/gpu/device_ptx.h b/lib/gpu/device_ptx.h deleted file mode 100644 index 5df5c329a3..0000000000 --- a/lib/gpu/device_ptx.h +++ /dev/null @@ -1,88 +0,0 @@ -const char * device = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .entry kernel_zero (\n" -" .param .u64 __cudaparm_kernel_zero_mem,\n" -" .param .s32 __cudaparm_kernel_zero_numel)\n" -" {\n" -" .reg .u32 %r<9>;\n" -" .reg .u64 %rd<6>;\n" -" .reg .pred %p<3>;\n" -" .loc 16 20 0\n" -"$LDWbegin_kernel_zero:\n" -" cvt.s32.u32 %r1, %ctaid.x;\n" -" cvt.s32.u32 %r2, %ntid.x;\n" -" mul24.lo.s32 %r3, %r1, %r2;\n" -" mov.u32 %r4, %tid.x;\n" -" add.u32 %r5, %r3, %r4;\n" -" ld.param.s32 %r6, [__cudaparm_kernel_zero_numel];\n" -" setp.le.s32 %p1, %r6, %r5;\n" -" @%p1 bra $Lt_0_1026;\n" -" .loc 16 24 0\n" -" mov.s32 %r7, 0;\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_zero_mem];\n" -" cvt.s64.s32 %rd2, %r5;\n" -" mul.wide.s32 %rd3, %r5, 4;\n" -" add.u64 %rd4, %rd1, %rd3;\n" -" st.global.s32 [%rd4+0], %r7;\n" -"$Lt_0_1026:\n" -" .loc 16 25 0\n" -" exit;\n" -"$LDWend_kernel_zero:\n" -" }\n" -" .entry kernel_info (\n" -" .param .u64 __cudaparm_kernel_info_info)\n" -" {\n" -" .reg .u32 %r<16>;\n" -" .reg .u64 %rd<3>;\n" -" .loc 16 27 0\n" -"$LDWbegin_kernel_info:\n" -" .loc 16 28 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_info_info];\n" -" mov.s32 %r1, 200;\n" -" st.global.s32 [%rd1+0], %r1;\n" -" .loc 16 29 0\n" -" mov.s32 %r2, 32;\n" -" st.global.s32 [%rd1+4], %r2;\n" -" .loc 16 30 0\n" -" mov.s32 %r3, 32;\n" -" st.global.s32 [%rd1+8], %r3;\n" -" .loc 16 31 0\n" -" mov.s32 %r4, 4;\n" -" st.global.s32 [%rd1+12], %r4;\n" -" .loc 16 32 0\n" -" mov.s32 %r5, 8;\n" -" st.global.s32 [%rd1+16], %r5;\n" -" .loc 16 33 0\n" -" mov.s32 %r6, 64;\n" -" st.global.s32 [%rd1+20], %r6;\n" -" .loc 16 34 0\n" -" mov.s32 %r7, 128;\n" -" st.global.s32 [%rd1+24], %r7;\n" -" .loc 16 35 0\n" -" mov.s32 %r8, 11;\n" -" st.global.s32 [%rd1+28], %r8;\n" -" .loc 16 36 0\n" -" mov.s32 %r9, 8;\n" -" st.global.s32 [%rd1+32], %r9;\n" -" .loc 16 37 0\n" -" mov.s32 %r10, 128;\n" -" st.global.s32 [%rd1+36], %r10;\n" -" .loc 16 38 0\n" -" mov.s32 %r11, 128;\n" -" st.global.s32 [%rd1+40], %r11;\n" -" .loc 16 39 0\n" -" mov.s32 %r12, 128;\n" -" st.global.s32 [%rd1+44], %r12;\n" -" .loc 16 40 0\n" -" mov.s32 %r13, 128;\n" -" st.global.s32 [%rd1+48], %r13;\n" -" .loc 16 41 0\n" -" mov.s32 %r14, 8;\n" -" st.global.s32 [%rd1+52], %r14;\n" -" .loc 16 42 0\n" -" exit;\n" -"$LDWend_kernel_info:\n" -" }\n" -; diff --git a/lib/gpu/ellipsoid_nbor.ptx b/lib/gpu/ellipsoid_nbor.ptx deleted file mode 100644 index 657177c8f2..0000000000 --- a/lib/gpu/ellipsoid_nbor.ptx +++ /dev/null @@ -1,329 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009ad9_00000000-9_lal_ellipsoid_nbor.cpp3.i (/home/sjplimp/ccBI#.7CLzz0) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009ad9_00000000-8_lal_ellipsoid_nbor.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_ellipsoid_nbor.cu" - .file 17 "/usr/local/cuda/include/common_functions.h" - .file 18 "/usr/local/cuda/include/math_functions.h" - .file 19 "/usr/local/cuda/include/math_constants.h" - .file 20 "/usr/local/cuda/include/device_functions.h" - .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - - .entry kernel_nbor ( - .param .u64 __cudaparm_kernel_nbor_x_, - .param .u64 __cudaparm_kernel_nbor_cut_form, - .param .s32 __cudaparm_kernel_nbor_ntypes, - .param .u64 __cudaparm_kernel_nbor_dev_nbor, - .param .s32 __cudaparm_kernel_nbor_nbor_pitch, - .param .s32 __cudaparm_kernel_nbor_start, - .param .s32 __cudaparm_kernel_nbor_inum, - .param .u64 __cudaparm_kernel_nbor_dev_ij, - .param .s32 __cudaparm_kernel_nbor_form_low, - .param .s32 __cudaparm_kernel_nbor_form_high) - { - .reg .u32 %r<26>; - .reg .u64 %rd<33>; - .reg .f32 %f<20>; - .reg .pred %p<8>; - .loc 16 29 0 -$LDWbegin_kernel_nbor: - cvt.s32.u32 %r1, %ctaid.x; - cvt.s32.u32 %r2, %ntid.x; - mul24.lo.s32 %r3, %r1, %r2; - mov.u32 %r4, %tid.x; - add.u32 %r5, %r3, %r4; - ld.param.s32 %r6, [__cudaparm_kernel_nbor_start]; - add.u32 %r7, %r6, %r5; - ld.param.s32 %r8, [__cudaparm_kernel_nbor_inum]; - setp.le.s32 %p1, %r8, %r7; - @%p1 bra $Lt_0_4354; - .loc 16 36 0 - cvt.s64.s32 %rd1, %r7; - ld.param.u64 %rd2, [__cudaparm_kernel_nbor_dev_ij]; - mul.wide.s32 %rd3, %r7, 4; - add.u64 %rd4, %rd2, %rd3; - ld.global.s32 %r9, [%rd4+0]; - .loc 16 38 0 - ld.param.s32 %r10, [__cudaparm_kernel_nbor_nbor_pitch]; - cvt.s64.s32 %rd5, %r10; - mul.wide.s32 %rd6, %r10, 4; - add.u64 %rd7, %rd6, %rd4; - ld.global.s32 %r11, [%rd7+0]; - .loc 16 39 0 - add.u64 %rd8, %rd6, %rd7; - mov.s64 %rd9, %rd8; - .loc 16 41 0 - ld.param.u64 %rd10, [__cudaparm_kernel_nbor_dev_nbor]; - add.u64 %rd11, %rd1, %rd5; - add.u64 %rd12, %rd5, %rd11; - mul.lo.u64 %rd13, %rd12, 4; - add.u64 %rd14, %rd10, %rd13; - .loc 16 43 0 - ld.param.u64 %rd15, [__cudaparm_kernel_nbor_x_]; - cvt.s64.s32 %rd16, %r9; - mul.wide.s32 %rd17, %r9, 16; - add.u64 %rd18, %rd15, %rd17; - ld.global.v4.f32 {%f1,%f2,%f3,%f4}, [%rd18+0]; - cvt.s32.s64 %r12, %rd5; - mul.lo.s32 %r13, %r12, %r11; - cvt.s64.s32 %rd19, %r13; - mul.wide.s32 %rd20, %r13, 4; - add.u64 %rd21, %rd8, %rd20; - setp.ge.u64 %p2, %rd8, %rd21; - @%p2 bra $Lt_0_6402; - cvt.rzi.ftz.s32.f32 %r14, %f4; - ld.param.s32 %r15, [__cudaparm_kernel_nbor_form_low]; - cvt.rn.f32.s32 %f5, %r15; - ld.param.s32 %r16, [__cudaparm_kernel_nbor_ntypes]; - mul.lo.s32 %r17, %r16, %r14; - ld.param.u64 %rd22, [__cudaparm_kernel_nbor_cut_form]; - mov.s32 %r18, 0; -$Lt_0_5378: - // Loop body line 43, nesting depth: 1, estimated iterations: unknown - .loc 16 49 0 - ld.global.s32 %r19, [%rd9+0]; - and.b32 %r20, %r19, 1073741823; - .loc 16 50 0 - cvt.s64.s32 %rd23, %r20; - mul.wide.s32 %rd24, %r20, 16; - add.u64 %rd25, %rd15, %rd24; - ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd25+0]; - .loc 16 53 0 - cvt.rzi.ftz.s32.f32 %r21, %f9; - add.s32 %r22, %r21, %r17; - cvt.s64.s32 %rd26, %r22; - mul.wide.s32 %rd27, %r22, 8; - add.u64 %rd28, %rd22, %rd27; - ld.global.f32 %f10, [%rd28+4]; - .loc 16 48 0 - setp.le.ftz.f32 %p3, %f5, %f10; - @!%p3 bra $Lt_0_6658; - ld.param.s32 %r23, [__cudaparm_kernel_nbor_form_high]; - cvt.rn.f32.s32 %f11, %r23; - setp.ge.ftz.f32 %p4, %f11, %f10; - @!%p4 bra $Lt_0_6658; - sub.ftz.f32 %f12, %f6, %f1; - sub.ftz.f32 %f13, %f7, %f2; - sub.ftz.f32 %f14, %f8, %f3; - ld.global.f32 %f15, [%rd28+0]; - mul.ftz.f32 %f16, %f12, %f12; - fma.rn.ftz.f32 %f17, %f13, %f13, %f16; - fma.rn.ftz.f32 %f18, %f14, %f14, %f17; - setp.gt.ftz.f32 %p5, %f15, %f18; - @!%p5 bra $Lt_0_6658; - .loc 16 64 0 - st.global.s32 [%rd14+0], %r20; - .loc 16 65 0 - add.u64 %rd14, %rd6, %rd14; - .loc 16 66 0 - add.s32 %r18, %r18, 1; -$Lt_0_6658: -$L_0_3842: - .loc 16 47 0 - add.u64 %rd9, %rd6, %rd9; - setp.gt.u64 %p6, %rd21, %rd9; - @%p6 bra $Lt_0_5378; - bra.uni $Lt_0_4866; -$Lt_0_6402: - mov.s32 %r18, 0; -$Lt_0_4866: - .loc 16 70 0 - add.s32 %r24, %r12, %r7; - cvt.s64.s32 %rd29, %r24; - mul.wide.s32 %rd30, %r24, 4; - add.u64 %rd31, %rd10, %rd30; - st.global.s32 [%rd31+0], %r18; -$Lt_0_4354: - .loc 16 72 0 - exit; -$LDWend_kernel_nbor: - } // kernel_nbor - - .entry kernel_nbor_fast ( - .param .u64 __cudaparm_kernel_nbor_fast_x_, - .param .u64 __cudaparm_kernel_nbor_fast_cut_form, - .param .u64 __cudaparm_kernel_nbor_fast_dev_nbor, - .param .s32 __cudaparm_kernel_nbor_fast_nbor_pitch, - .param .s32 __cudaparm_kernel_nbor_fast_start, - .param .s32 __cudaparm_kernel_nbor_fast_inum, - .param .u64 __cudaparm_kernel_nbor_fast_dev_ij, - .param .s32 __cudaparm_kernel_nbor_fast_form_low, - .param .s32 __cudaparm_kernel_nbor_fast_form_high) - { - .reg .u32 %r<28>; - .reg .u64 %rd<42>; - .reg .f32 %f<19>; - .reg .pred %p<9>; - .shared .align 4 .b8 __cuda___cuda_local_var_32570_31_non_const_form120[484]; - .shared .align 4 .b8 __cuda___cuda_local_var_32571_33_non_const_cutsq604[484]; - .loc 16 84 0 -$LDWbegin_kernel_nbor_fast: - cvt.s32.u32 %r1, %tid.x; - mov.u32 %r2, 120; - setp.gt.s32 %p1, %r1, %r2; - @%p1 bra $Lt_1_5122; - .loc 16 90 0 - mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120; - mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604; - cvt.s64.s32 %rd3, %r1; - mul.wide.s32 %rd4, %r1, 4; - ld.param.u64 %rd5, [__cudaparm_kernel_nbor_fast_cut_form]; - mul.wide.s32 %rd6, %r1, 8; - add.u64 %rd7, %rd5, %rd6; - ld.global.v2.f32 {%f1,%f2}, [%rd7+0]; - add.u64 %rd8, %rd4, %rd2; - st.shared.f32 [%rd8+0], %f1; - .loc 16 91 0 - cvt.rzi.ftz.s32.f32 %r3, %f2; - add.u64 %rd9, %rd4, %rd1; - st.shared.s32 [%rd9+0], %r3; -$Lt_1_5122: - mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120; - mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604; - .loc 16 94 0 - bar.sync 0; - cvt.s32.u32 %r4, %ctaid.x; - cvt.s32.u32 %r5, %ntid.x; - mul.lo.s32 %r6, %r4, %r5; - ld.param.s32 %r7, [__cudaparm_kernel_nbor_fast_start]; - add.s32 %r8, %r7, %r6; - add.s32 %r9, %r8, %r1; - ld.param.s32 %r10, [__cudaparm_kernel_nbor_fast_inum]; - setp.le.s32 %p2, %r10, %r9; - @%p2 bra $Lt_1_5634; - .loc 16 98 0 - cvt.s64.s32 %rd10, %r9; - ld.param.u64 %rd11, [__cudaparm_kernel_nbor_fast_dev_ij]; - mul.wide.s32 %rd12, %r9, 4; - add.u64 %rd13, %rd11, %rd12; - ld.global.s32 %r11, [%rd13+0]; - .loc 16 100 0 - ld.param.s32 %r12, [__cudaparm_kernel_nbor_fast_nbor_pitch]; - cvt.s64.s32 %rd14, %r12; - mul.wide.s32 %rd15, %r12, 4; - add.u64 %rd16, %rd15, %rd13; - ld.global.s32 %r13, [%rd16+0]; - .loc 16 101 0 - add.u64 %rd17, %rd15, %rd16; - mov.s64 %rd18, %rd17; - .loc 16 103 0 - ld.param.u64 %rd19, [__cudaparm_kernel_nbor_fast_dev_nbor]; - add.u64 %rd20, %rd10, %rd14; - add.u64 %rd21, %rd14, %rd20; - mul.lo.u64 %rd22, %rd21, 4; - add.u64 %rd23, %rd19, %rd22; - .loc 16 105 0 - ld.param.u64 %rd24, [__cudaparm_kernel_nbor_fast_x_]; - cvt.s64.s32 %rd25, %r11; - mul.wide.s32 %rd26, %r11, 16; - add.u64 %rd27, %rd24, %rd26; - ld.global.v4.f32 {%f3,%f4,%f5,%f6}, [%rd27+0]; - cvt.s32.s64 %r14, %rd14; - mul.lo.s32 %r15, %r14, %r13; - cvt.s64.s32 %rd28, %r15; - mul.wide.s32 %rd29, %r15, 4; - add.u64 %rd30, %rd17, %rd29; - setp.ge.u64 %p3, %rd17, %rd30; - @%p3 bra $Lt_1_7682; - cvt.rzi.ftz.s32.f32 %r16, %f6; - mul.lo.s32 %r17, %r16, 11; - ld.param.s32 %r18, [__cudaparm_kernel_nbor_fast_form_low]; - mov.s32 %r19, 0; -$Lt_1_6658: - // Loop body line 105, nesting depth: 1, estimated iterations: unknown - .loc 16 112 0 - ld.global.s32 %r20, [%rd18+0]; - and.b32 %r21, %r20, 1073741823; - .loc 16 113 0 - cvt.s64.s32 %rd31, %r21; - mul.wide.s32 %rd32, %r21, 16; - add.u64 %rd33, %rd24, %rd32; - ld.global.v4.f32 {%f7,%f8,%f9,%f10}, [%rd33+0]; - .loc 16 111 0 - cvt.rzi.ftz.s32.f32 %r22, %f10; - add.s32 %r23, %r22, %r17; - cvt.s64.s32 %rd34, %r23; - mul.wide.s32 %rd35, %r23, 4; - add.u64 %rd36, %rd35, %rd1; - ld.shared.s32 %r24, [%rd36+0]; - setp.lt.s32 %p4, %r24, %r18; - @%p4 bra $Lt_1_7938; - ld.param.s32 %r25, [__cudaparm_kernel_nbor_fast_form_high]; - setp.lt.s32 %p5, %r25, %r24; - @%p5 bra $Lt_1_7938; - sub.ftz.f32 %f11, %f7, %f3; - sub.ftz.f32 %f12, %f8, %f4; - sub.ftz.f32 %f13, %f9, %f5; - add.u64 %rd37, %rd35, %rd2; - ld.shared.f32 %f14, [%rd37+0]; - mul.ftz.f32 %f15, %f11, %f11; - fma.rn.ftz.f32 %f16, %f12, %f12, %f15; - fma.rn.ftz.f32 %f17, %f13, %f13, %f16; - setp.gt.ftz.f32 %p6, %f14, %f17; - @!%p6 bra $Lt_1_7938; - .loc 16 127 0 - st.global.s32 [%rd23+0], %r21; - .loc 16 128 0 - add.u64 %rd23, %rd15, %rd23; - .loc 16 129 0 - add.s32 %r19, %r19, 1; -$Lt_1_7938: -$L_1_4610: - .loc 16 110 0 - add.u64 %rd18, %rd15, %rd18; - setp.gt.u64 %p7, %rd30, %rd18; - @%p7 bra $Lt_1_6658; - bra.uni $Lt_1_6146; -$Lt_1_7682: - mov.s32 %r19, 0; -$Lt_1_6146: - .loc 16 133 0 - add.s32 %r26, %r14, %r9; - cvt.s64.s32 %rd38, %r26; - mul.wide.s32 %rd39, %r26, 4; - add.u64 %rd40, %rd19, %rd39; - st.global.s32 [%rd40+0], %r19; -$Lt_1_5634: - .loc 16 135 0 - exit; -$LDWend_kernel_nbor_fast: - } // kernel_nbor_fast - diff --git a/lib/gpu/ellipsoid_nbor_ptx.h b/lib/gpu/ellipsoid_nbor_ptx.h deleted file mode 100644 index d47e6ce892..0000000000 --- a/lib/gpu/ellipsoid_nbor_ptx.h +++ /dev/null @@ -1,281 +0,0 @@ -const char * ellipsoid_nbor = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .entry kernel_nbor (\n" -" .param .u64 __cudaparm_kernel_nbor_x_,\n" -" .param .u64 __cudaparm_kernel_nbor_cut_form,\n" -" .param .s32 __cudaparm_kernel_nbor_ntypes,\n" -" .param .u64 __cudaparm_kernel_nbor_dev_nbor,\n" -" .param .s32 __cudaparm_kernel_nbor_nbor_pitch,\n" -" .param .s32 __cudaparm_kernel_nbor_start,\n" -" .param .s32 __cudaparm_kernel_nbor_inum,\n" -" .param .u64 __cudaparm_kernel_nbor_dev_ij,\n" -" .param .s32 __cudaparm_kernel_nbor_form_low,\n" -" .param .s32 __cudaparm_kernel_nbor_form_high)\n" -" {\n" -" .reg .u32 %r<26>;\n" -" .reg .u64 %rd<33>;\n" -" .reg .f32 %f<20>;\n" -" .reg .pred %p<8>;\n" -" .loc 16 29 0\n" -"$LDWbegin_kernel_nbor:\n" -" cvt.s32.u32 %r1, %ctaid.x;\n" -" cvt.s32.u32 %r2, %ntid.x;\n" -" mul24.lo.s32 %r3, %r1, %r2;\n" -" mov.u32 %r4, %tid.x;\n" -" add.u32 %r5, %r3, %r4;\n" -" ld.param.s32 %r6, [__cudaparm_kernel_nbor_start];\n" -" add.u32 %r7, %r6, %r5;\n" -" ld.param.s32 %r8, [__cudaparm_kernel_nbor_inum];\n" -" setp.le.s32 %p1, %r8, %r7;\n" -" @%p1 bra $Lt_0_4354;\n" -" .loc 16 36 0\n" -" cvt.s64.s32 %rd1, %r7;\n" -" ld.param.u64 %rd2, [__cudaparm_kernel_nbor_dev_ij];\n" -" mul.wide.s32 %rd3, %r7, 4;\n" -" add.u64 %rd4, %rd2, %rd3;\n" -" ld.global.s32 %r9, [%rd4+0];\n" -" .loc 16 38 0\n" -" ld.param.s32 %r10, [__cudaparm_kernel_nbor_nbor_pitch];\n" -" cvt.s64.s32 %rd5, %r10;\n" -" mul.wide.s32 %rd6, %r10, 4;\n" -" add.u64 %rd7, %rd6, %rd4;\n" -" ld.global.s32 %r11, [%rd7+0];\n" -" .loc 16 39 0\n" -" add.u64 %rd8, %rd6, %rd7;\n" -" mov.s64 %rd9, %rd8;\n" -" .loc 16 41 0\n" -" ld.param.u64 %rd10, [__cudaparm_kernel_nbor_dev_nbor];\n" -" add.u64 %rd11, %rd1, %rd5;\n" -" add.u64 %rd12, %rd5, %rd11;\n" -" mul.lo.u64 %rd13, %rd12, 4;\n" -" add.u64 %rd14, %rd10, %rd13;\n" -" .loc 16 43 0\n" -" ld.param.u64 %rd15, [__cudaparm_kernel_nbor_x_];\n" -" cvt.s64.s32 %rd16, %r9;\n" -" mul.wide.s32 %rd17, %r9, 16;\n" -" add.u64 %rd18, %rd15, %rd17;\n" -" ld.global.v4.f32 {%f1,%f2,%f3,%f4}, [%rd18+0];\n" -" cvt.s32.s64 %r12, %rd5;\n" -" mul.lo.s32 %r13, %r12, %r11;\n" -" cvt.s64.s32 %rd19, %r13;\n" -" mul.wide.s32 %rd20, %r13, 4;\n" -" add.u64 %rd21, %rd8, %rd20;\n" -" setp.ge.u64 %p2, %rd8, %rd21;\n" -" @%p2 bra $Lt_0_6402;\n" -" cvt.rzi.ftz.s32.f32 %r14, %f4;\n" -" ld.param.s32 %r15, [__cudaparm_kernel_nbor_form_low];\n" -" cvt.rn.f32.s32 %f5, %r15;\n" -" ld.param.s32 %r16, [__cudaparm_kernel_nbor_ntypes];\n" -" mul.lo.s32 %r17, %r16, %r14;\n" -" ld.param.u64 %rd22, [__cudaparm_kernel_nbor_cut_form];\n" -" mov.s32 %r18, 0;\n" -"$Lt_0_5378:\n" -" .loc 16 49 0\n" -" ld.global.s32 %r19, [%rd9+0];\n" -" and.b32 %r20, %r19, 1073741823;\n" -" .loc 16 50 0\n" -" cvt.s64.s32 %rd23, %r20;\n" -" mul.wide.s32 %rd24, %r20, 16;\n" -" add.u64 %rd25, %rd15, %rd24;\n" -" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd25+0];\n" -" .loc 16 53 0\n" -" cvt.rzi.ftz.s32.f32 %r21, %f9;\n" -" add.s32 %r22, %r21, %r17;\n" -" cvt.s64.s32 %rd26, %r22;\n" -" mul.wide.s32 %rd27, %r22, 8;\n" -" add.u64 %rd28, %rd22, %rd27;\n" -" ld.global.f32 %f10, [%rd28+4];\n" -" .loc 16 48 0\n" -" setp.le.ftz.f32 %p3, %f5, %f10;\n" -" @!%p3 bra $Lt_0_6658;\n" -" ld.param.s32 %r23, [__cudaparm_kernel_nbor_form_high];\n" -" cvt.rn.f32.s32 %f11, %r23;\n" -" setp.ge.ftz.f32 %p4, %f11, %f10;\n" -" @!%p4 bra $Lt_0_6658;\n" -" sub.ftz.f32 %f12, %f6, %f1;\n" -" sub.ftz.f32 %f13, %f7, %f2;\n" -" sub.ftz.f32 %f14, %f8, %f3;\n" -" ld.global.f32 %f15, [%rd28+0];\n" -" mul.ftz.f32 %f16, %f12, %f12;\n" -" fma.rn.ftz.f32 %f17, %f13, %f13, %f16;\n" -" fma.rn.ftz.f32 %f18, %f14, %f14, %f17;\n" -" setp.gt.ftz.f32 %p5, %f15, %f18;\n" -" @!%p5 bra $Lt_0_6658;\n" -" .loc 16 64 0\n" -" st.global.s32 [%rd14+0], %r20;\n" -" .loc 16 65 0\n" -" add.u64 %rd14, %rd6, %rd14;\n" -" .loc 16 66 0\n" -" add.s32 %r18, %r18, 1;\n" -"$Lt_0_6658:\n" -"$L_0_3842:\n" -" .loc 16 47 0\n" -" add.u64 %rd9, %rd6, %rd9;\n" -" setp.gt.u64 %p6, %rd21, %rd9;\n" -" @%p6 bra $Lt_0_5378;\n" -" bra.uni $Lt_0_4866;\n" -"$Lt_0_6402:\n" -" mov.s32 %r18, 0;\n" -"$Lt_0_4866:\n" -" .loc 16 70 0\n" -" add.s32 %r24, %r12, %r7;\n" -" cvt.s64.s32 %rd29, %r24;\n" -" mul.wide.s32 %rd30, %r24, 4;\n" -" add.u64 %rd31, %rd10, %rd30;\n" -" st.global.s32 [%rd31+0], %r18;\n" -"$Lt_0_4354:\n" -" .loc 16 72 0\n" -" exit;\n" -"$LDWend_kernel_nbor:\n" -" }\n" -" .entry kernel_nbor_fast (\n" -" .param .u64 __cudaparm_kernel_nbor_fast_x_,\n" -" .param .u64 __cudaparm_kernel_nbor_fast_cut_form,\n" -" .param .u64 __cudaparm_kernel_nbor_fast_dev_nbor,\n" -" .param .s32 __cudaparm_kernel_nbor_fast_nbor_pitch,\n" -" .param .s32 __cudaparm_kernel_nbor_fast_start,\n" -" .param .s32 __cudaparm_kernel_nbor_fast_inum,\n" -" .param .u64 __cudaparm_kernel_nbor_fast_dev_ij,\n" -" .param .s32 __cudaparm_kernel_nbor_fast_form_low,\n" -" .param .s32 __cudaparm_kernel_nbor_fast_form_high)\n" -" {\n" -" .reg .u32 %r<28>;\n" -" .reg .u64 %rd<42>;\n" -" .reg .f32 %f<19>;\n" -" .reg .pred %p<9>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32570_31_non_const_form120[484];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32571_33_non_const_cutsq604[484];\n" -" .loc 16 84 0\n" -"$LDWbegin_kernel_nbor_fast:\n" -" cvt.s32.u32 %r1, %tid.x;\n" -" mov.u32 %r2, 120;\n" -" setp.gt.s32 %p1, %r1, %r2;\n" -" @%p1 bra $Lt_1_5122;\n" -" .loc 16 90 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120;\n" -" mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604;\n" -" cvt.s64.s32 %rd3, %r1;\n" -" mul.wide.s32 %rd4, %r1, 4;\n" -" ld.param.u64 %rd5, [__cudaparm_kernel_nbor_fast_cut_form];\n" -" mul.wide.s32 %rd6, %r1, 8;\n" -" add.u64 %rd7, %rd5, %rd6;\n" -" ld.global.v2.f32 {%f1,%f2}, [%rd7+0];\n" -" add.u64 %rd8, %rd4, %rd2;\n" -" st.shared.f32 [%rd8+0], %f1;\n" -" .loc 16 91 0\n" -" cvt.rzi.ftz.s32.f32 %r3, %f2;\n" -" add.u64 %rd9, %rd4, %rd1;\n" -" st.shared.s32 [%rd9+0], %r3;\n" -"$Lt_1_5122:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120;\n" -" mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604;\n" -" .loc 16 94 0\n" -" bar.sync 0;\n" -" cvt.s32.u32 %r4, %ctaid.x;\n" -" cvt.s32.u32 %r5, %ntid.x;\n" -" mul.lo.s32 %r6, %r4, %r5;\n" -" ld.param.s32 %r7, [__cudaparm_kernel_nbor_fast_start];\n" -" add.s32 %r8, %r7, %r6;\n" -" add.s32 %r9, %r8, %r1;\n" -" ld.param.s32 %r10, [__cudaparm_kernel_nbor_fast_inum];\n" -" setp.le.s32 %p2, %r10, %r9;\n" -" @%p2 bra $Lt_1_5634;\n" -" .loc 16 98 0\n" -" cvt.s64.s32 %rd10, %r9;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_nbor_fast_dev_ij];\n" -" mul.wide.s32 %rd12, %r9, 4;\n" -" add.u64 %rd13, %rd11, %rd12;\n" -" ld.global.s32 %r11, [%rd13+0];\n" -" .loc 16 100 0\n" -" ld.param.s32 %r12, [__cudaparm_kernel_nbor_fast_nbor_pitch];\n" -" cvt.s64.s32 %rd14, %r12;\n" -" mul.wide.s32 %rd15, %r12, 4;\n" -" add.u64 %rd16, %rd15, %rd13;\n" -" ld.global.s32 %r13, [%rd16+0];\n" -" .loc 16 101 0\n" -" add.u64 %rd17, %rd15, %rd16;\n" -" mov.s64 %rd18, %rd17;\n" -" .loc 16 103 0\n" -" ld.param.u64 %rd19, [__cudaparm_kernel_nbor_fast_dev_nbor];\n" -" add.u64 %rd20, %rd10, %rd14;\n" -" add.u64 %rd21, %rd14, %rd20;\n" -" mul.lo.u64 %rd22, %rd21, 4;\n" -" add.u64 %rd23, %rd19, %rd22;\n" -" .loc 16 105 0\n" -" ld.param.u64 %rd24, [__cudaparm_kernel_nbor_fast_x_];\n" -" cvt.s64.s32 %rd25, %r11;\n" -" mul.wide.s32 %rd26, %r11, 16;\n" -" add.u64 %rd27, %rd24, %rd26;\n" -" ld.global.v4.f32 {%f3,%f4,%f5,%f6}, [%rd27+0];\n" -" cvt.s32.s64 %r14, %rd14;\n" -" mul.lo.s32 %r15, %r14, %r13;\n" -" cvt.s64.s32 %rd28, %r15;\n" -" mul.wide.s32 %rd29, %r15, 4;\n" -" add.u64 %rd30, %rd17, %rd29;\n" -" setp.ge.u64 %p3, %rd17, %rd30;\n" -" @%p3 bra $Lt_1_7682;\n" -" cvt.rzi.ftz.s32.f32 %r16, %f6;\n" -" mul.lo.s32 %r17, %r16, 11;\n" -" ld.param.s32 %r18, [__cudaparm_kernel_nbor_fast_form_low];\n" -" mov.s32 %r19, 0;\n" -"$Lt_1_6658:\n" -" .loc 16 112 0\n" -" ld.global.s32 %r20, [%rd18+0];\n" -" and.b32 %r21, %r20, 1073741823;\n" -" .loc 16 113 0\n" -" cvt.s64.s32 %rd31, %r21;\n" -" mul.wide.s32 %rd32, %r21, 16;\n" -" add.u64 %rd33, %rd24, %rd32;\n" -" ld.global.v4.f32 {%f7,%f8,%f9,%f10}, [%rd33+0];\n" -" .loc 16 111 0\n" -" cvt.rzi.ftz.s32.f32 %r22, %f10;\n" -" add.s32 %r23, %r22, %r17;\n" -" cvt.s64.s32 %rd34, %r23;\n" -" mul.wide.s32 %rd35, %r23, 4;\n" -" add.u64 %rd36, %rd35, %rd1;\n" -" ld.shared.s32 %r24, [%rd36+0];\n" -" setp.lt.s32 %p4, %r24, %r18;\n" -" @%p4 bra $Lt_1_7938;\n" -" ld.param.s32 %r25, [__cudaparm_kernel_nbor_fast_form_high];\n" -" setp.lt.s32 %p5, %r25, %r24;\n" -" @%p5 bra $Lt_1_7938;\n" -" sub.ftz.f32 %f11, %f7, %f3;\n" -" sub.ftz.f32 %f12, %f8, %f4;\n" -" sub.ftz.f32 %f13, %f9, %f5;\n" -" add.u64 %rd37, %rd35, %rd2;\n" -" ld.shared.f32 %f14, [%rd37+0];\n" -" mul.ftz.f32 %f15, %f11, %f11;\n" -" fma.rn.ftz.f32 %f16, %f12, %f12, %f15;\n" -" fma.rn.ftz.f32 %f17, %f13, %f13, %f16;\n" -" setp.gt.ftz.f32 %p6, %f14, %f17;\n" -" @!%p6 bra $Lt_1_7938;\n" -" .loc 16 127 0\n" -" st.global.s32 [%rd23+0], %r21;\n" -" .loc 16 128 0\n" -" add.u64 %rd23, %rd15, %rd23;\n" -" .loc 16 129 0\n" -" add.s32 %r19, %r19, 1;\n" -"$Lt_1_7938:\n" -"$L_1_4610:\n" -" .loc 16 110 0\n" -" add.u64 %rd18, %rd15, %rd18;\n" -" setp.gt.u64 %p7, %rd30, %rd18;\n" -" @%p7 bra $Lt_1_6658;\n" -" bra.uni $Lt_1_6146;\n" -"$Lt_1_7682:\n" -" mov.s32 %r19, 0;\n" -"$Lt_1_6146:\n" -" .loc 16 133 0\n" -" add.s32 %r26, %r14, %r9;\n" -" cvt.s64.s32 %rd38, %r26;\n" -" mul.wide.s32 %rd39, %r26, 4;\n" -" add.u64 %rd40, %rd19, %rd39;\n" -" st.global.s32 [%rd40+0], %r19;\n" -"$Lt_1_5634:\n" -" .loc 16 135 0\n" -" exit;\n" -"$LDWend_kernel_nbor_fast:\n" -" }\n" -; diff --git a/lib/gpu/gayberne.ptx b/lib/gpu/gayberne.ptx deleted file mode 100644 index 806a9b39a0..0000000000 --- a/lib/gpu/gayberne.ptx +++ /dev/null @@ -1,1590 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009b6f_00000000-9_lal_gayberne.cpp3.i (/home/sjplimp/ccBI#.YH8XBK) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009b6f_00000000-8_lal_gayberne.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_ellipsoid_extra.h" - .file 17 "lal_gayberne.cu" - .file 18 "/usr/local/cuda/include/common_functions.h" - .file 19 "/usr/local/cuda/include/math_functions.h" - .file 20 "/usr/local/cuda/include/math_constants.h" - .file 21 "/usr/local/cuda/include/device_functions.h" - .file 22 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 24 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 26 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 27 "/usr/local/cuda/include/surface_functions.h" - .file 28 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 29 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - - .entry kernel_ellipsoid ( - .param .u64 __cudaparm_kernel_ellipsoid_x_, - .param .u64 __cudaparm_kernel_ellipsoid_q, - .param .u64 __cudaparm_kernel_ellipsoid_shape, - .param .u64 __cudaparm_kernel_ellipsoid_well, - .param .u64 __cudaparm_kernel_ellipsoid_gum, - .param .u64 __cudaparm_kernel_ellipsoid_sig_eps, - .param .s32 __cudaparm_kernel_ellipsoid_ntypes, - .param .u64 __cudaparm_kernel_ellipsoid_lshape, - .param .u64 __cudaparm_kernel_ellipsoid_dev_nbor, - .param .s32 __cudaparm_kernel_ellipsoid_stride, - .param .u64 __cudaparm_kernel_ellipsoid_ans, - .param .s32 __cudaparm_kernel_ellipsoid_astride, - .param .u64 __cudaparm_kernel_ellipsoid_engv, - .param .u64 __cudaparm_kernel_ellipsoid_err_flag, - .param .s32 __cudaparm_kernel_ellipsoid_eflag, - .param .s32 __cudaparm_kernel_ellipsoid_vflag, - .param .s32 __cudaparm_kernel_ellipsoid_inum, - .param .s32 __cudaparm_kernel_ellipsoid_t_per_atom) - { - .reg .u32 %r<67>; - .reg .u64 %rd<83>; - .reg .f32 %f<898>; - .reg .pred %p<35>; - .shared .align 16 .b8 __cuda___cuda_local_var_32950_33_non_const_sp_lj128[16]; - .shared .align 4 .b8 __cuda___cuda_local_var_33207_55_non_const_red_acc144[3584]; - // __cuda_local_var_32957_10_non_const_f = 48 - // __cuda_local_var_32961_10_non_const_tor = 64 - // __cuda_local_var_32965_9_non_const_virial = 16 - .loc 17 91 0 -$LDWbegin_kernel_ellipsoid: - .loc 17 96 0 - ld.param.u64 %rd1, [__cudaparm_kernel_ellipsoid_gum]; - ldu.global.f32 %f1, [%rd1+12]; - .loc 17 97 0 - ld.global.f32 %f2, [%rd1+16]; - .loc 17 98 0 - ld.global.f32 %f3, [%rd1+20]; - .loc 17 99 0 - ld.global.f32 %f4, [%rd1+24]; - st.shared.v4.f32 [__cuda___cuda_local_var_32950_33_non_const_sp_lj128+0], {%f1,%f2,%f3,%f4}; - .loc 17 112 0 - mov.f32 %f5, 0f00000000; // 0 - mov.f32 %f6, %f5; - mov.f32 %f7, 0f00000000; // 0 - mov.f32 %f8, %f7; - mov.f32 %f9, 0f00000000; // 0 - mov.f32 %f10, %f9; - mov.f32 %f11, 0f00000000; // 0 - mov.f32 %f12, %f11; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, %f13; - mov.f32 %f15, 0f00000000; // 0 - mov.f32 %f16, %f15; - ld.param.s32 %r1, [__cudaparm_kernel_ellipsoid_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_ellipsoid_inum]; - setp.le.s32 %p1, %r9, %r8; - @%p1 bra $Lt_0_55298; - .loc 17 117 0 - cvt.s64.s32 %rd2, %r8; - mul.wide.s32 %rd3, %r8, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_ellipsoid_dev_nbor]; - add.u64 %rd5, %rd4, %rd3; - ld.global.s32 %r10, [%rd5+0]; - ld.param.s32 %r11, [__cudaparm_kernel_ellipsoid_stride]; - cvt.s64.s32 %rd6, %r11; - mul.wide.s32 %rd7, %r11, 4; - add.u64 %rd8, %rd7, %rd5; - ld.global.s32 %r12, [%rd8+0]; - .loc 17 120 0 - cvt.s64.s32 %rd9, %r10; - mul.wide.s32 %rd10, %r10, 16; - ld.param.u64 %rd11, [__cudaparm_kernel_ellipsoid_x_]; - add.u64 %rd12, %rd10, %rd11; - ld.global.v4.f32 {%f17,%f18,%f19,%f20}, [%rd12+0]; - .loc 17 123 0 - cvt.rzi.ftz.s32.f32 %r13, %f20; - cvt.s64.s32 %rd13, %r13; - mul.wide.s32 %rd14, %r13, 16; - ld.param.u64 %rd15, [__cudaparm_kernel_ellipsoid_shape]; - add.u64 %rd16, %rd14, %rd15; - ld.global.v4.f32 {%f21,%f22,%f23,_}, [%rd16+0]; - .loc 17 126 0 - ld.param.u64 %rd17, [__cudaparm_kernel_ellipsoid_q]; - add.u64 %rd18, %rd10, %rd17; - ld.global.v4.f32 {%f24,%f25,%f26,%f27}, [%rd18+0]; - .loc 17 129 0 - ld.param.u64 %rd19, [__cudaparm_kernel_ellipsoid_well]; - add.u64 %rd20, %rd14, %rd19; - ld.global.v4.f32 {%f28,%f29,%f30,_}, [%rd20+0]; - .loc 17 130 0 - cvt.s32.s64 %r14, %rd6; - sub.s32 %r15, %r1, 1; - and.b32 %r16, %r15, %r2; - add.u64 %rd21, %rd7, %rd8; - mul.lo.s32 %r17, %r14, %r16; - cvt.s64.s32 %rd22, %r17; - mul.wide.s32 %rd23, %r17, 4; - add.u64 %rd24, %rd21, %rd23; - mov.s64 %rd25, %rd24; - mul.lo.s32 %r18, %r14, %r12; - cvt.s64.s32 %rd26, %r18; - mul.wide.s32 %rd27, %r18, 4; - add.u64 %rd28, %rd21, %rd27; - setp.ge.u64 %p2, %rd24, %rd28; - @%p2 bra $Lt_0_56834; - ld.param.s32 %r19, [__cudaparm_kernel_ellipsoid_eflag]; - mov.s32 %r20, 0; - setp.gt.s32 %p3, %r19, %r20; - ld.param.s32 %r21, [__cudaparm_kernel_ellipsoid_vflag]; - mov.s32 %r22, 0; - setp.gt.s32 %p4, %r21, %r22; - add.ftz.f32 %f31, %f25, %f25; - add.ftz.f32 %f32, %f27, %f27; - mul.ftz.f32 %f33, %f24, %f24; - mul.ftz.f32 %f34, %f25, %f25; - mul.ftz.f32 %f35, %f26, %f26; - mul.ftz.f32 %f36, %f27, %f27; - add.ftz.f32 %f37, %f26, %f26; - ld.param.s32 %r23, [__cudaparm_kernel_ellipsoid_ntypes]; - mul.lo.s32 %r24, %r23, %r13; - mul.ftz.f32 %f38, %f31, %f26; - mul.ftz.f32 %f39, %f31, %f27; - mul.ftz.f32 %f40, %f31, %f24; - mul.ftz.f32 %f41, %f32, %f24; - add.ftz.f32 %f42, %f33, %f34; - sub.ftz.f32 %f43, %f33, %f34; - mul.ftz.f32 %f44, %f37, %f24; - mul.ftz.f32 %f45, %f37, %f27; - sub.ftz.f32 %f46, %f38, %f41; - add.ftz.f32 %f47, %f38, %f41; - sub.ftz.f32 %f48, %f42, %f35; - add.ftz.f32 %f49, %f35, %f43; - sub.ftz.f32 %f50, %f43, %f35; - add.ftz.f32 %f51, %f39, %f44; - sub.ftz.f32 %f52, %f39, %f44; - sub.ftz.f32 %f53, %f45, %f40; - add.ftz.f32 %f54, %f40, %f45; - ld.param.u64 %rd29, [__cudaparm_kernel_ellipsoid_lshape]; - mul.lo.u64 %rd30, %rd13, 4; - add.u64 %rd31, %rd29, %rd30; - mul.ftz.f32 %f55, %f46, %f22; - mul.ftz.f32 %f56, %f46, %f29; - mul.ftz.f32 %f57, %f47, %f21; - mul.ftz.f32 %f58, %f47, %f28; - sub.ftz.f32 %f59, %f48, %f36; - sub.ftz.f32 %f60, %f49, %f36; - add.ftz.f32 %f61, %f36, %f50; - mul.ftz.f32 %f62, %f51, %f23; - mul.ftz.f32 %f63, %f51, %f30; - add.ftz.f32 %f64, %f51, %f51; - mul.ftz.f32 %f65, %f52, %f21; - mul.ftz.f32 %f66, %f52, %f28; - mul.ftz.f32 %f67, %f53, %f23; - mul.ftz.f32 %f68, %f53, %f30; - add.ftz.f32 %f69, %f53, %f53; - mul.ftz.f32 %f70, %f54, %f22; - mul.ftz.f32 %f71, %f54, %f29; - mul.ftz.f32 %f72, %f46, %f55; - mul.ftz.f32 %f73, %f54, %f55; - mul.ftz.f32 %f74, %f46, %f56; - mul.ftz.f32 %f75, %f54, %f56; - mul.ftz.f32 %f76, %f59, %f21; - mul.ftz.f32 %f77, %f59, %f28; - mul.ftz.f32 %f78, %f60, %f22; - mul.ftz.f32 %f79, %f55, %f60; - mul.ftz.f32 %f80, %f60, %f29; - mul.ftz.f32 %f81, %f56, %f60; - mul.ftz.f32 %f82, %f61, %f23; - mul.ftz.f32 %f83, %f61, %f30; - add.ftz.f32 %f84, %f61, %f61; - mul.ftz.f32 %f85, %f46, %f70; - mul.ftz.f32 %f86, %f60, %f70; - mul.ftz.f32 %f87, %f54, %f70; - mul.ftz.f32 %f88, %f46, %f71; - mul.ftz.f32 %f89, %f60, %f71; - mul.ftz.f32 %f90, %f54, %f71; - fma.rn.ftz.f32 %f91, %f59, %f76, %f72; - fma.rn.ftz.f32 %f92, %f76, %f52, %f73; - fma.rn.ftz.f32 %f93, %f59, %f77, %f74; - fma.rn.ftz.f32 %f94, %f77, %f52, %f75; - mul.ftz.f32 %f95, %f46, %f78; - mul.ftz.f32 %f96, %f60, %f78; - mul.ftz.f32 %f97, %f54, %f78; - fma.rn.ftz.f32 %f98, %f76, %f47, %f79; - mul.ftz.f32 %f99, %f46, %f80; - mul.ftz.f32 %f100, %f60, %f80; - mul.ftz.f32 %f101, %f54, %f80; - fma.rn.ftz.f32 %f102, %f77, %f47, %f81; - fma.rn.ftz.f32 %f103, %f59, %f65, %f85; - fma.rn.ftz.f32 %f104, %f47, %f65, %f86; - fma.rn.ftz.f32 %f105, %f52, %f65, %f87; - fma.rn.ftz.f32 %f106, %f59, %f66, %f88; - fma.rn.ftz.f32 %f107, %f47, %f66, %f89; - fma.rn.ftz.f32 %f108, %f52, %f66, %f90; - fma.rn.ftz.f32 %f109, %f51, %f62, %f91; - fma.rn.ftz.f32 %f110, %f62, %f61, %f92; - fma.rn.ftz.f32 %f111, %f51, %f63, %f93; - fma.rn.ftz.f32 %f112, %f63, %f61, %f94; - fma.rn.ftz.f32 %f113, %f59, %f57, %f95; - fma.rn.ftz.f32 %f114, %f47, %f57, %f96; - fma.rn.ftz.f32 %f115, %f57, %f52, %f97; - fma.rn.ftz.f32 %f116, %f62, %f53, %f98; - fma.rn.ftz.f32 %f117, %f59, %f58, %f99; - fma.rn.ftz.f32 %f118, %f47, %f58, %f100; - fma.rn.ftz.f32 %f119, %f58, %f52, %f101; - fma.rn.ftz.f32 %f120, %f63, %f53, %f102; - fma.rn.ftz.f32 %f121, %f51, %f82, %f103; - fma.rn.ftz.f32 %f122, %f53, %f82, %f104; - fma.rn.ftz.f32 %f123, %f61, %f82, %f105; - fma.rn.ftz.f32 %f124, %f51, %f83, %f106; - fma.rn.ftz.f32 %f125, %f53, %f83, %f107; - fma.rn.ftz.f32 %f126, %f61, %f83, %f108; - fma.rn.ftz.f32 %f127, %f51, %f67, %f113; - fma.rn.ftz.f32 %f128, %f53, %f67, %f114; - fma.rn.ftz.f32 %f129, %f67, %f61, %f115; - fma.rn.ftz.f32 %f130, %f51, %f68, %f117; - fma.rn.ftz.f32 %f131, %f53, %f68, %f118; - fma.rn.ftz.f32 %f132, %f68, %f61, %f119; - ld.param.u64 %rd32, [__cudaparm_kernel_ellipsoid_sig_eps]; - mov.f32 %f133, 0f00000000; // 0 - mov.f32 %f134, 0f00000000; // 0 - mov.f32 %f135, 0f00000000; // 0 - mov.f32 %f136, 0f00000000; // 0 - mov.f32 %f137, 0f00000000; // 0 - mov.f32 %f138, 0f00000000; // 0 - mov.f32 %f139, 0f00000000; // 0 - mov.u64 %rd33, __cuda___cuda_local_var_32950_33_non_const_sp_lj128; -$Lt_0_40962: - // Loop body line 130, nesting depth: 1, estimated iterations: unknown - .loc 17 135 0 - ld.global.s32 %r25, [%rd25+0]; - .loc 17 136 0 - shr.s32 %r26, %r25, 30; - and.b32 %r27, %r26, 3; - cvt.s64.s32 %rd34, %r27; - mul.wide.s32 %rd35, %r27, 4; - add.u64 %rd36, %rd33, %rd35; - ld.shared.f32 %f140, [%rd36+0]; - .loc 17 139 0 - and.b32 %r28, %r25, 1073741823; - cvt.s64.s32 %rd37, %r28; - mul.wide.s32 %rd38, %r28, 16; - add.u64 %rd39, %rd38, %rd11; - ld.global.v4.f32 {%f141,%f142,%f143,%f144}, [%rd39+0]; - .loc 17 153 0 - add.u64 %rd40, %rd38, %rd17; - ld.global.v4.f32 {%f145,%f146,%f147,%f148}, [%rd40+0]; - .loc 17 162 0 - cvt.rzi.ftz.s32.f32 %r29, %f144; - cvt.s64.s32 %rd41, %r29; - mul.wide.s32 %rd42, %r29, 16; - add.u64 %rd43, %rd42, %rd15; - ld.global.v4.f32 {%f149,%f150,%f151,_}, [%rd43+0]; - .loc 16 299 0 - sub.ftz.f32 %f152, %f141, %f17; - mov.f32 %f153, %f152; - .loc 16 300 0 - add.ftz.f32 %f154, %f146, %f146; - add.ftz.f32 %f155, %f148, %f148; - mul.ftz.f32 %f156, %f145, %f145; - mul.ftz.f32 %f157, %f146, %f146; - mul.ftz.f32 %f158, %f147, %f147; - mul.ftz.f32 %f159, %f148, %f148; - add.ftz.f32 %f160, %f147, %f147; - mul.ftz.f32 %f161, %f154, %f147; - mul.ftz.f32 %f162, %f154, %f148; - mul.ftz.f32 %f163, %f155, %f145; - add.ftz.f32 %f164, %f156, %f157; - mul.ftz.f32 %f165, %f160, %f145; - sub.ftz.f32 %f166, %f161, %f163; - sub.ftz.f32 %f167, %f164, %f158; - add.ftz.f32 %f168, %f162, %f165; - mul.ftz.f32 %f169, %f166, %f150; - sub.ftz.f32 %f170, %f167, %f159; - mul.ftz.f32 %f171, %f168, %f151; - mul.ftz.f32 %f172, %f166, %f169; - mul.ftz.f32 %f173, %f170, %f149; - fma.rn.ftz.f32 %f174, %f170, %f173, %f172; - fma.rn.ftz.f32 %f175, %f168, %f171, %f174; - add.ftz.f32 %f176, %f109, %f175; - mov.f32 %f177, %f176; - .loc 16 301 0 - mul.ftz.f32 %f178, %f154, %f145; - sub.ftz.f32 %f179, %f156, %f157; - mul.ftz.f32 %f180, %f160, %f148; - add.ftz.f32 %f181, %f161, %f163; - add.ftz.f32 %f182, %f158, %f179; - sub.ftz.f32 %f183, %f180, %f178; - mul.ftz.f32 %f184, %f181, %f149; - sub.ftz.f32 %f185, %f182, %f159; - mul.ftz.f32 %f186, %f183, %f151; - mul.ftz.f32 %f187, %f185, %f150; - mul.ftz.f32 %f188, %f166, %f187; - fma.rn.ftz.f32 %f189, %f170, %f184, %f188; - fma.rn.ftz.f32 %f190, %f168, %f186, %f189; - add.ftz.f32 %f191, %f127, %f190; - mov.f32 %f192, %f191; - .loc 16 302 0 - sub.ftz.f32 %f193, %f179, %f158; - sub.ftz.f32 %f194, %f162, %f165; - add.ftz.f32 %f195, %f178, %f180; - add.ftz.f32 %f196, %f159, %f193; - mul.ftz.f32 %f197, %f194, %f149; - mul.ftz.f32 %f198, %f195, %f150; - mul.ftz.f32 %f199, %f196, %f151; - mul.ftz.f32 %f200, %f166, %f198; - fma.rn.ftz.f32 %f201, %f170, %f197, %f200; - fma.rn.ftz.f32 %f202, %f168, %f199, %f201; - add.ftz.f32 %f203, %f121, %f202; - mov.f32 %f204, %f203; - .loc 16 303 0 - sub.ftz.f32 %f205, %f142, %f18; - mov.f32 %f206, %f205; - .loc 16 304 0 - mul.ftz.f32 %f207, %f169, %f185; - fma.rn.ftz.f32 %f208, %f173, %f181, %f207; - fma.rn.ftz.f32 %f209, %f171, %f183, %f208; - add.ftz.f32 %f210, %f116, %f209; - mov.f32 %f211, %f210; - .loc 16 305 0 - mul.ftz.f32 %f212, %f185, %f187; - fma.rn.ftz.f32 %f213, %f181, %f184, %f212; - fma.rn.ftz.f32 %f214, %f183, %f186, %f213; - add.ftz.f32 %f215, %f128, %f214; - mov.f32 %f216, %f215; - .loc 16 306 0 - mul.ftz.f32 %f217, %f185, %f198; - fma.rn.ftz.f32 %f218, %f181, %f197, %f217; - fma.rn.ftz.f32 %f219, %f183, %f199, %f218; - add.ftz.f32 %f220, %f122, %f219; - mov.f32 %f221, %f220; - .loc 16 307 0 - sub.ftz.f32 %f222, %f143, %f19; - mov.f32 %f223, %f222; - .loc 16 308 0 - mul.ftz.f32 %f224, %f195, %f169; - fma.rn.ftz.f32 %f225, %f173, %f194, %f224; - fma.rn.ftz.f32 %f226, %f171, %f196, %f225; - add.ftz.f32 %f227, %f110, %f226; - mov.f32 %f228, %f227; - .loc 16 309 0 - mul.ftz.f32 %f229, %f195, %f187; - fma.rn.ftz.f32 %f230, %f184, %f194, %f229; - fma.rn.ftz.f32 %f231, %f186, %f196, %f230; - add.ftz.f32 %f232, %f129, %f231; - mov.f32 %f233, %f232; - .loc 16 310 0 - mul.ftz.f32 %f234, %f195, %f198; - fma.rn.ftz.f32 %f235, %f194, %f197, %f234; - fma.rn.ftz.f32 %f236, %f196, %f199, %f235; - add.ftz.f32 %f237, %f123, %f236; - mov.f32 %f238, %f237; - abs.ftz.f32 %f239, %f210; - abs.ftz.f32 %f240, %f176; - setp.gt.ftz.f32 %p5, %f239, %f240; - @!%p5 bra $Lt_0_41218; - .loc 16 314 0 - mov.f32 %f177, %f210; - mov.f32 %f211, %f176; - .loc 16 315 0 - mov.f32 %f192, %f215; - mov.f32 %f216, %f191; - .loc 16 316 0 - mov.f32 %f204, %f220; - mov.f32 %f221, %f203; - .loc 16 317 0 - mov.f32 %f153, %f205; - mov.f32 %f206, %f152; -$Lt_0_41218: - mov.f32 %f241, %f177; - abs.ftz.f32 %f242, %f241; - abs.ftz.f32 %f243, %f227; - setp.lt.ftz.f32 %p6, %f242, %f243; - @!%p6 bra $Lt_0_41730; - .loc 16 321 0 - mov.f32 %f177, %f227; - mov.f32 %f228, %f241; - .loc 16 322 0 - mov.f32 %f244, %f192; - mov.f32 %f192, %f232; - mov.f32 %f233, %f244; - .loc 16 323 0 - mov.f32 %f245, %f204; - mov.f32 %f204, %f237; - mov.f32 %f238, %f245; - .loc 16 324 0 - mov.f32 %f246, %f153; - mov.f32 %f153, %f222; - mov.f32 %f223, %f246; -$Lt_0_41730: - mov.f32 %f247, %f177; - mov.f32 %f248, 0f00000000; // 0 - setp.neu.ftz.f32 %p7, %f247, %f248; - @!%p7 bra $Lt_0_42498; - bra.uni $Lt_0_43266; -$Lt_0_42498: - mov.f32 %f249, 0f00000000; // 0 - setp.neu.ftz.f32 %p8, %f211, %f249; - @!%p8 bra $Lt_0_43010; - .loc 16 338 0 - mov.f32 %f177, %f211; - mov.f32 %f211, %f247; - .loc 16 339 0 - mov.f32 %f250, %f192; - mov.f32 %f192, %f216; - mov.f32 %f216, %f250; - .loc 16 340 0 - mov.f32 %f251, %f204; - mov.f32 %f204, %f221; - mov.f32 %f221, %f251; - .loc 16 341 0 - mov.f32 %f252, %f153; - mov.f32 %f153, %f206; - mov.f32 %f206, %f252; - bra.uni $Lt_0_43266; -$Lt_0_43010: - mov.f32 %f253, 0f00000000; // 0 - setp.neu.ftz.f32 %p9, %f228, %f253; - @!%p9 bra $Lt_0_43522; - .loc 16 346 0 - mov.f32 %f177, %f228; - mov.f32 %f228, %f247; - .loc 16 347 0 - mov.f32 %f254, %f192; - mov.f32 %f192, %f233; - mov.f32 %f233, %f254; - .loc 16 348 0 - mov.f32 %f255, %f204; - mov.f32 %f204, %f238; - mov.f32 %f238, %f255; - .loc 16 349 0 - mov.f32 %f256, %f153; - mov.f32 %f153, %f223; - mov.f32 %f223, %f256; - bra.uni $Lt_0_43266; -$Lt_0_43522: - .loc 16 352 0 - mov.s32 %r30, 2; - ld.param.u64 %rd44, [__cudaparm_kernel_ellipsoid_err_flag]; - st.global.s32 [%rd44+0], %r30; -$Lt_0_43266: -$Lt_0_42754: -$Lt_0_42242: - .loc 16 355 0 - div.approx.ftz.f32 %f257, %f211, %f177; - mul.ftz.f32 %f258, %f192, %f257; - sub.ftz.f32 %f259, %f216, %f258; - mov.f32 %f216, %f259; - .loc 16 356 0 - mul.ftz.f32 %f260, %f204, %f257; - sub.ftz.f32 %f261, %f221, %f260; - mov.f32 %f221, %f261; - .loc 16 357 0 - mul.ftz.f32 %f262, %f153, %f257; - sub.ftz.f32 %f263, %f206, %f262; - mov.f32 %f206, %f263; - .loc 16 359 0 - div.approx.ftz.f32 %f264, %f228, %f177; - mul.ftz.f32 %f265, %f192, %f264; - sub.ftz.f32 %f233, %f233, %f265; - .loc 16 360 0 - mul.ftz.f32 %f266, %f204, %f264; - sub.ftz.f32 %f238, %f238, %f266; - .loc 16 361 0 - mul.ftz.f32 %f267, %f153, %f264; - sub.ftz.f32 %f223, %f223, %f267; - abs.ftz.f32 %f268, %f259; - abs.ftz.f32 %f269, %f233; - setp.lt.ftz.f32 %p10, %f268, %f269; - @!%p10 bra $Lt_0_43778; - .loc 16 366 0 - mov.f32 %f216, %f233; - mov.f32 %f233, %f259; - .loc 16 367 0 - mov.f32 %f221, %f238; - mov.f32 %f238, %f261; - .loc 16 368 0 - mov.f32 %f206, %f223; - mov.f32 %f223, %f263; -$Lt_0_43778: - mov.f32 %f270, %f216; - mov.f32 %f271, 0f00000000; // 0 - setp.neu.ftz.f32 %p11, %f270, %f271; - @!%p11 bra $Lt_0_44546; - bra.uni $Lt_0_44802; -$Lt_0_44546: - mov.f32 %f272, 0f00000000; // 0 - setp.neu.ftz.f32 %p12, %f233, %f272; - @!%p12 bra $Lt_0_44802; - .loc 16 383 0 - mov.f32 %f216, %f233; - mov.f32 %f233, %f270; - .loc 16 384 0 - mov.f32 %f273, %f221; - mov.f32 %f221, %f238; - mov.f32 %f238, %f273; - .loc 16 385 0 - mov.f32 %f274, %f206; - mov.f32 %f206, %f223; - mov.f32 %f223, %f274; -$Lt_0_44802: -$Lt_0_44290: - .loc 16 390 0 - div.approx.ftz.f32 %f275, %f233, %f216; - mul.ftz.f32 %f276, %f221, %f275; - sub.ftz.f32 %f238, %f238, %f276; - .loc 16 391 0 - mul.ftz.f32 %f277, %f206, %f275; - sub.ftz.f32 %f223, %f223, %f277; - mov.f32 %f278, 0f00000000; // 0 - setp.eq.ftz.f32 %p13, %f238, %f278; - @!%p13 bra $Lt_0_45314; - .loc 16 394 0 - mov.s32 %r31, 2; - ld.param.u64 %rd45, [__cudaparm_kernel_ellipsoid_err_flag]; - st.global.s32 [%rd45+0], %r31; -$Lt_0_45314: - .loc 17 179 0 - div.approx.ftz.f32 %f279, %f223, %f238; - mul.ftz.f32 %f280, %f205, %f205; - mul.ftz.f32 %f281, %f279, %f221; - fma.rn.ftz.f32 %f282, %f152, %f152, %f280; - sub.ftz.f32 %f283, %f206, %f281; - fma.rn.ftz.f32 %f284, %f222, %f222, %f282; - div.approx.ftz.f32 %f285, %f283, %f216; - rsqrt.approx.ftz.f32 %f286, %f284; - mul.ftz.f32 %f287, %f285, %f192; - fma.rn.ftz.f32 %f288, %f204, %f279, %f287; - sub.ftz.f32 %f289, %f153, %f288; - div.approx.ftz.f32 %f290, %f289, %f177; - mul.ftz.f32 %f291, %f286, %f290; - .loc 17 191 0 - mul.ftz.f32 %f292, %f285, %f286; - mul.ftz.f32 %f293, %f286, %f205; - mul.ftz.f32 %f294, %f286, %f152; - mul.ftz.f32 %f295, %f286, %f222; - mul.ftz.f32 %f296, %f279, %f286; - mul.ftz.f32 %f297, %f292, %f293; - fma.rn.ftz.f32 %f298, %f294, %f291, %f297; - fma.rn.ftz.f32 %f299, %f295, %f296, %f298; - mov.f32 %f300, 0f3f000000; // 0.5 - mul.ftz.f32 %f301, %f299, %f300; - rsqrt.approx.ftz.f32 %f302, %f301; - .loc 17 195 0 - rcp.approx.ftz.f32 %f303, %f286; - mul.ftz.f32 %f304, %f303, %f291; - .loc 17 200 0 - add.s32 %r32, %r29, %r24; - cvt.s64.s32 %rd46, %r32; - mul.wide.s32 %rd47, %r32, 8; - add.u64 %rd48, %rd32, %rd47; - ld.global.v2.f32 {%f305,%f306}, [%rd48+0]; - .loc 17 202 0 - sub.ftz.f32 %f307, %f303, %f302; - ld.global.f32 %f308, [%rd1+0]; - fma.rn.ftz.f32 %f309, %f308, %f305, %f307; - .loc 17 209 0 - div.approx.ftz.f32 %f310, %f305, %f309; - mul.ftz.f32 %f311, %f310, %f310; - mul.ftz.f32 %f312, %f310, %f311; - mul.ftz.f32 %f313, %f312, %f312; - mul.ftz.f32 %f314, %f313, %f313; - mul.ftz.f32 %f315, %f310, %f313; - add.ftz.f32 %f316, %f314, %f314; - mul.ftz.f32 %f317, %f310, %f316; - sub.ftz.f32 %f318, %f317, %f315; - div.approx.ftz.f32 %f319, %f318, %f305; - mov.f32 %f320, 0f41c00000; // 24 - mul.ftz.f32 %f321, %f319, %f320; - mul.ftz.f32 %f322, %f306, %f321; - .loc 17 214 0 - mul.ftz.f32 %f323, %f302, %f322; - mul.ftz.f32 %f324, %f323, %f302; - mul.ftz.f32 %f325, %f324, %f302; - mov.f32 %f326, 0f3f000000; // 0.5 - mul.ftz.f32 %f327, %f325, %f326; - mul.ftz.f32 %f328, %f327, %f286; - mul.ftz.f32 %f329, %f292, %f303; - mul.ftz.f32 %f330, %f296, %f303; - mul.ftz.f32 %f331, %f286, %f328; - mul.ftz.f32 %f332, %f293, %f329; - fma.rn.ftz.f32 %f333, %f294, %f304, %f332; - fma.rn.ftz.f32 %f334, %f295, %f330, %f333; - mul.ftz.f32 %f335, %f294, %f334; - sub.ftz.f32 %f336, %f304, %f335; - mul.ftz.f32 %f337, %f331, %f336; - fma.rn.ftz.f32 %f338, %f294, %f322, %f337; - .loc 17 215 0 - mul.ftz.f32 %f339, %f293, %f334; - sub.ftz.f32 %f340, %f329, %f339; - mul.ftz.f32 %f341, %f331, %f340; - fma.rn.ftz.f32 %f342, %f293, %f322, %f341; - .loc 17 216 0 - mul.ftz.f32 %f343, %f295, %f334; - sub.ftz.f32 %f344, %f330, %f343; - mul.ftz.f32 %f345, %f331, %f344; - fma.rn.ftz.f32 %f346, %f295, %f322, %f345; - .loc 17 226 0 - mul.ftz.f32 %f347, %f122, %f329; - mul.ftz.f32 %f348, %f330, %f331; - mul.ftz.f32 %f349, %f329, %f331; - mul.ftz.f32 %f350, %f329, %f128; - fma.rn.ftz.f32 %f351, %f304, %f121, %f347; - fma.rn.ftz.f32 %f352, %f304, %f127, %f350; - fma.rn.ftz.f32 %f353, %f330, %f123, %f351; - fma.rn.ftz.f32 %f354, %f330, %f129, %f352; - mul.ftz.f32 %f355, %f348, %f354; - neg.ftz.f32 %f356, %f349; - fma.rn.ftz.f32 %f357, %f356, %f353, %f355; - mul.ftz.f32 %f358, %f116, %f329; - mul.ftz.f32 %f359, %f331, %f304; - fma.rn.ftz.f32 %f360, %f109, %f304, %f358; - fma.rn.ftz.f32 %f361, %f330, %f110, %f360; - mul.ftz.f32 %f362, %f359, %f353; - neg.ftz.f32 %f363, %f348; - fma.rn.ftz.f32 %f364, %f361, %f363, %f362; - mul.ftz.f32 %f365, %f349, %f361; - neg.ftz.f32 %f366, %f359; - fma.rn.ftz.f32 %f367, %f366, %f354, %f365; - .loc 17 233 0 - ld.global.f32 %f368, [%rd31+0]; - mul.lo.u64 %rd49, %rd41, 4; - add.u64 %rd50, %rd29, %rd49; - ld.global.f32 %f369, [%rd50+0]; - add.ftz.f32 %f370, %f368, %f368; - mul.ftz.f32 %f371, %f369, %f370; - .loc 17 234 0 - mul.ftz.f32 %f372, %f210, %f203; - mul.ftz.f32 %f373, %f227, %f203; - mul.ftz.f32 %f374, %f220, %f176; - mul.ftz.f32 %f375, %f210, %f191; - mul.ftz.f32 %f376, %f227, %f191; - mul.ftz.f32 %f377, %f215, %f176; - mul.ftz.f32 %f378, %f374, %f232; - mul.ftz.f32 %f379, %f237, %f377; - sub.ftz.f32 %f380, %f379, %f378; - mul.ftz.f32 %f381, %f237, %f375; - sub.ftz.f32 %f382, %f380, %f381; - fma.rn.ftz.f32 %f383, %f232, %f372, %f382; - fma.rn.ftz.f32 %f384, %f220, %f376, %f383; - mul.ftz.f32 %f385, %f215, %f373; - sub.ftz.f32 %f386, %f384, %f385; - .loc 17 235 0 - ld.global.f32 %f387, [%rd1+4]; - .loc 17 240 0 - mul.ftz.f32 %f388, %f232, %f372; - sub.ftz.f32 %f389, %f388, %f378; - mul.ftz.f32 %f390, %f215, %f373; - sub.ftz.f32 %f391, %f389, %f390; - fma.rn.ftz.f32 %f392, %f220, %f376, %f391; - mul.ftz.f32 %f393, %f237, %f375; - sub.ftz.f32 %f394, %f392, %f393; - fma.rn.ftz.f32 %f395, %f237, %f377, %f394; - .loc 17 241 0 - div.approx.ftz.f32 %f396, %f371, %f386; - lg2.approx.ftz.f32 %f397, %f396; - mul.ftz.f32 %f398, %f397, %f387; - ex2.approx.ftz.f32 %f399, %f398; - mul.ftz.f32 %f400, %f399, %f387; - neg.ftz.f32 %f401, %f400; - .loc 17 274 0 - add.u64 %rd51, %rd42, %rd19; - ld.global.v4.f32 {%f402,%f403,%f404,_}, [%rd51+0]; - .loc 16 299 0 - mul.ftz.f32 %f405, %f294, %f303; - mov.f32 %f153, %f405; - .loc 16 300 0 - mul.ftz.f32 %f406, %f166, %f403; - mul.ftz.f32 %f407, %f168, %f404; - mul.ftz.f32 %f408, %f166, %f406; - mul.ftz.f32 %f409, %f170, %f402; - fma.rn.ftz.f32 %f410, %f170, %f409, %f408; - fma.rn.ftz.f32 %f411, %f168, %f407, %f410; - add.ftz.f32 %f412, %f111, %f411; - mov.f32 %f177, %f412; - .loc 16 301 0 - mul.ftz.f32 %f413, %f181, %f402; - mul.ftz.f32 %f414, %f183, %f404; - mul.ftz.f32 %f415, %f185, %f403; - mul.ftz.f32 %f416, %f166, %f415; - fma.rn.ftz.f32 %f417, %f170, %f413, %f416; - fma.rn.ftz.f32 %f418, %f168, %f414, %f417; - add.ftz.f32 %f419, %f130, %f418; - mov.f32 %f192, %f419; - .loc 16 302 0 - mul.ftz.f32 %f420, %f194, %f402; - mul.ftz.f32 %f421, %f195, %f403; - mul.ftz.f32 %f422, %f196, %f404; - mul.ftz.f32 %f423, %f166, %f421; - fma.rn.ftz.f32 %f424, %f170, %f420, %f423; - fma.rn.ftz.f32 %f425, %f168, %f422, %f424; - add.ftz.f32 %f426, %f124, %f425; - mov.f32 %f204, %f426; - .loc 16 303 0 - mul.ftz.f32 %f427, %f293, %f303; - mov.f32 %f206, %f427; - .loc 16 304 0 - mul.ftz.f32 %f428, %f406, %f185; - fma.rn.ftz.f32 %f429, %f409, %f181, %f428; - fma.rn.ftz.f32 %f430, %f407, %f183, %f429; - add.ftz.f32 %f431, %f120, %f430; - mov.f32 %f211, %f431; - .loc 16 305 0 - mul.ftz.f32 %f432, %f185, %f415; - fma.rn.ftz.f32 %f433, %f181, %f413, %f432; - fma.rn.ftz.f32 %f434, %f183, %f414, %f433; - add.ftz.f32 %f216, %f131, %f434; - .loc 16 306 0 - mul.ftz.f32 %f435, %f185, %f421; - fma.rn.ftz.f32 %f436, %f181, %f420, %f435; - fma.rn.ftz.f32 %f437, %f183, %f422, %f436; - add.ftz.f32 %f221, %f125, %f437; - .loc 16 307 0 - mul.ftz.f32 %f438, %f295, %f303; - mov.f32 %f223, %f438; - .loc 16 308 0 - mul.ftz.f32 %f439, %f195, %f406; - fma.rn.ftz.f32 %f440, %f409, %f194, %f439; - fma.rn.ftz.f32 %f441, %f407, %f196, %f440; - add.ftz.f32 %f442, %f112, %f441; - mov.f32 %f228, %f442; - .loc 16 309 0 - mul.ftz.f32 %f443, %f195, %f415; - fma.rn.ftz.f32 %f444, %f413, %f194, %f443; - fma.rn.ftz.f32 %f445, %f414, %f196, %f444; - add.ftz.f32 %f233, %f132, %f445; - .loc 16 310 0 - mul.ftz.f32 %f446, %f195, %f421; - fma.rn.ftz.f32 %f447, %f194, %f420, %f446; - fma.rn.ftz.f32 %f448, %f196, %f422, %f447; - add.ftz.f32 %f238, %f126, %f448; - abs.ftz.f32 %f449, %f431; - abs.ftz.f32 %f450, %f412; - setp.gt.ftz.f32 %p14, %f449, %f450; - @!%p14 bra $Lt_0_45826; - .loc 16 314 0 - mov.f32 %f177, %f431; - mov.f32 %f211, %f412; - .loc 16 315 0 - mov.f32 %f192, %f216; - mov.f32 %f216, %f419; - .loc 16 316 0 - mov.f32 %f204, %f221; - mov.f32 %f221, %f426; - .loc 16 317 0 - mov.f32 %f153, %f427; - mov.f32 %f206, %f405; -$Lt_0_45826: - mov.f32 %f451, %f177; - abs.ftz.f32 %f452, %f451; - abs.ftz.f32 %f453, %f442; - setp.lt.ftz.f32 %p15, %f452, %f453; - @!%p15 bra $Lt_0_46338; - .loc 16 321 0 - mov.f32 %f177, %f442; - mov.f32 %f228, %f451; - .loc 16 322 0 - mov.f32 %f454, %f192; - mov.f32 %f192, %f233; - mov.f32 %f233, %f454; - .loc 16 323 0 - mov.f32 %f455, %f204; - mov.f32 %f204, %f238; - mov.f32 %f238, %f455; - .loc 16 324 0 - mov.f32 %f456, %f153; - mov.f32 %f153, %f438; - mov.f32 %f223, %f456; -$Lt_0_46338: - mov.f32 %f457, %f177; - mov.f32 %f458, 0f00000000; // 0 - setp.neu.ftz.f32 %p16, %f457, %f458; - @!%p16 bra $Lt_0_47106; - bra.uni $Lt_0_47874; -$Lt_0_47106: - mov.f32 %f459, 0f00000000; // 0 - setp.neu.ftz.f32 %p17, %f211, %f459; - @!%p17 bra $Lt_0_47618; - .loc 16 338 0 - mov.f32 %f177, %f211; - mov.f32 %f211, %f457; - .loc 16 339 0 - mov.f32 %f460, %f192; - mov.f32 %f192, %f216; - mov.f32 %f216, %f460; - .loc 16 340 0 - mov.f32 %f461, %f204; - mov.f32 %f204, %f221; - mov.f32 %f221, %f461; - .loc 16 341 0 - mov.f32 %f462, %f153; - mov.f32 %f153, %f206; - mov.f32 %f206, %f462; - bra.uni $Lt_0_47874; -$Lt_0_47618: - mov.f32 %f463, 0f00000000; // 0 - setp.neu.ftz.f32 %p18, %f228, %f463; - @!%p18 bra $Lt_0_48130; - .loc 16 346 0 - mov.f32 %f177, %f228; - mov.f32 %f228, %f457; - .loc 16 347 0 - mov.f32 %f464, %f192; - mov.f32 %f192, %f233; - mov.f32 %f233, %f464; - .loc 16 348 0 - mov.f32 %f465, %f204; - mov.f32 %f204, %f238; - mov.f32 %f238, %f465; - .loc 16 349 0 - mov.f32 %f466, %f153; - mov.f32 %f153, %f223; - mov.f32 %f223, %f466; - bra.uni $Lt_0_47874; -$Lt_0_48130: - .loc 16 352 0 - mov.s32 %r33, 2; - ld.param.u64 %rd52, [__cudaparm_kernel_ellipsoid_err_flag]; - st.global.s32 [%rd52+0], %r33; -$Lt_0_47874: -$Lt_0_47362: -$Lt_0_46850: - .loc 16 355 0 - div.approx.ftz.f32 %f467, %f211, %f177; - mul.ftz.f32 %f468, %f192, %f467; - sub.ftz.f32 %f469, %f216, %f468; - mov.f32 %f216, %f469; - .loc 16 356 0 - mul.ftz.f32 %f470, %f204, %f467; - sub.ftz.f32 %f471, %f221, %f470; - mov.f32 %f221, %f471; - .loc 16 357 0 - mul.ftz.f32 %f472, %f153, %f467; - sub.ftz.f32 %f473, %f206, %f472; - mov.f32 %f206, %f473; - .loc 16 359 0 - div.approx.ftz.f32 %f474, %f228, %f177; - mul.ftz.f32 %f475, %f192, %f474; - sub.ftz.f32 %f233, %f233, %f475; - .loc 16 360 0 - mul.ftz.f32 %f476, %f204, %f474; - sub.ftz.f32 %f238, %f238, %f476; - .loc 16 361 0 - mul.ftz.f32 %f477, %f153, %f474; - sub.ftz.f32 %f223, %f223, %f477; - abs.ftz.f32 %f478, %f469; - abs.ftz.f32 %f479, %f233; - setp.lt.ftz.f32 %p19, %f478, %f479; - @!%p19 bra $Lt_0_48386; - .loc 16 366 0 - mov.f32 %f216, %f233; - mov.f32 %f233, %f469; - .loc 16 367 0 - mov.f32 %f221, %f238; - mov.f32 %f238, %f471; - .loc 16 368 0 - mov.f32 %f206, %f223; - mov.f32 %f223, %f473; -$Lt_0_48386: - mov.f32 %f480, %f216; - mov.f32 %f481, 0f00000000; // 0 - setp.neu.ftz.f32 %p20, %f480, %f481; - @!%p20 bra $Lt_0_49154; - bra.uni $Lt_0_49410; -$Lt_0_49154: - mov.f32 %f482, 0f00000000; // 0 - setp.neu.ftz.f32 %p21, %f233, %f482; - @!%p21 bra $Lt_0_49410; - .loc 16 383 0 - mov.f32 %f216, %f233; - mov.f32 %f233, %f480; - .loc 16 384 0 - mov.f32 %f483, %f221; - mov.f32 %f221, %f238; - mov.f32 %f238, %f483; - .loc 16 385 0 - mov.f32 %f484, %f206; - mov.f32 %f206, %f223; - mov.f32 %f223, %f484; -$Lt_0_49410: -$Lt_0_48898: - .loc 16 390 0 - div.approx.ftz.f32 %f485, %f233, %f216; - mul.ftz.f32 %f486, %f221, %f485; - sub.ftz.f32 %f238, %f238, %f486; - .loc 16 391 0 - mul.ftz.f32 %f487, %f206, %f485; - sub.ftz.f32 %f223, %f223, %f487; - mov.f32 %f488, 0f00000000; // 0 - setp.eq.ftz.f32 %p22, %f238, %f488; - @!%p22 bra $Lt_0_49922; - .loc 16 394 0 - mov.s32 %r34, 2; - ld.param.u64 %rd53, [__cudaparm_kernel_ellipsoid_err_flag]; - st.global.s32 [%rd53+0], %r34; -$Lt_0_49922: - .loc 17 286 0 - div.approx.ftz.f32 %f489, %f223, %f238; - mul.ftz.f32 %f490, %f489, %f221; - sub.ftz.f32 %f491, %f206, %f490; - div.approx.ftz.f32 %f492, %f491, %f216; - mul.ftz.f32 %f493, %f492, %f192; - fma.rn.ftz.f32 %f494, %f204, %f489, %f493; - sub.ftz.f32 %f495, %f153, %f494; - div.approx.ftz.f32 %f496, %f495, %f177; - mul.ftz.f32 %f497, %f286, %f496; - .loc 17 293 0 - mul.ftz.f32 %f498, %f492, %f286; - mul.ftz.f32 %f499, %f489, %f286; - mul.ftz.f32 %f500, %f286, %f405; - mul.ftz.f32 %f501, %f286, %f427; - mul.ftz.f32 %f502, %f286, %f438; - mul.ftz.f32 %f503, %f498, %f501; - fma.rn.ftz.f32 %f504, %f500, %f497, %f503; - fma.rn.ftz.f32 %f505, %f502, %f499, %f504; - add.ftz.f32 %f506, %f505, %f505; - ld.global.f32 %f507, [%rd1+8]; - .loc 17 296 0 - mul.ftz.f32 %f508, %f303, %f497; - .loc 17 301 0 - mov.f32 %f509, 0fbf800000; // -1 - add.ftz.f32 %f510, %f507, %f509; - lg2.approx.ftz.f32 %f511, %f506; - mul.ftz.f32 %f512, %f511, %f507; - ex2.approx.ftz.f32 %f513, %f512; - mov.f32 %f514, 0fc0800000; // -4 - mul.ftz.f32 %f515, %f286, %f514; - mul.ftz.f32 %f516, %f286, %f515; - lg2.approx.ftz.f32 %f517, %f513; - div.approx.ftz.f32 %f518, %f510, %f507; - mul.ftz.f32 %f519, %f517, %f518; - ex2.approx.ftz.f32 %f520, %f519; - mul.ftz.f32 %f521, %f516, %f507; - mul.ftz.f32 %f522, %f520, %f521; - .loc 17 303 0 - mul.ftz.f32 %f523, %f498, %f303; - mul.ftz.f32 %f524, %f499, %f303; - mul.ftz.f32 %f525, %f523, %f501; - fma.rn.ftz.f32 %f526, %f500, %f508, %f525; - fma.rn.ftz.f32 %f527, %f502, %f524, %f526; - mul.ftz.f32 %f528, %f500, %f527; - sub.ftz.f32 %f529, %f508, %f528; - mul.ftz.f32 %f530, %f522, %f529; - .loc 17 304 0 - mul.ftz.f32 %f531, %f501, %f527; - sub.ftz.f32 %f532, %f523, %f531; - mul.ftz.f32 %f533, %f522, %f532; - .loc 17 305 0 - mul.ftz.f32 %f534, %f502, %f527; - sub.ftz.f32 %f535, %f524, %f534; - mul.ftz.f32 %f536, %f522, %f535; - .loc 17 310 0 - mul.ftz.f32 %f537, %f125, %f523; - mul.ftz.f32 %f538, %f523, %f131; - fma.rn.ftz.f32 %f539, %f508, %f124, %f537; - fma.rn.ftz.f32 %f540, %f508, %f130, %f538; - fma.rn.ftz.f32 %f541, %f524, %f126, %f539; - fma.rn.ftz.f32 %f542, %f524, %f132, %f540; - mul.ftz.f32 %f543, %f523, %f541; - mul.ftz.f32 %f544, %f542, %f524; - sub.ftz.f32 %f545, %f544, %f543; - mul.ftz.f32 %f546, %f120, %f523; - fma.rn.ftz.f32 %f547, %f111, %f508, %f546; - fma.rn.ftz.f32 %f548, %f524, %f112, %f547; - mul.ftz.f32 %f549, %f524, %f548; - mul.ftz.f32 %f550, %f508, %f541; - sub.ftz.f32 %f551, %f550, %f549; - mul.ftz.f32 %f552, %f542, %f508; - mul.ftz.f32 %f553, %f548, %f523; - sub.ftz.f32 %f554, %f553, %f552; - .loc 17 312 0 - mul.ftz.f32 %f555, %f516, %f545; - .loc 17 313 0 - mul.ftz.f32 %f556, %f516, %f551; - .loc 17 314 0 - mul.ftz.f32 %f557, %f516, %f554; - .loc 16 396 0 - mov.f32 %f558, 0f40800000; // 4 - mul.ftz.f32 %f559, %f306, %f558; - mul.ftz.f32 %f560, %f399, %f140; - sub.ftz.f32 %f561, %f314, %f313; - mul.ftz.f32 %f562, %f513, %f560; - mul.ftz.f32 %f563, %f559, %f561; - fma.rn.ftz.f32 %f564, %f563, %f562, %f139; - selp.f32 %f139, %f564, %f139, %p3; - mul.ftz.f32 %f565, %f562, %f338; - mul.ftz.f32 %f566, %f562, %f342; - mul.ftz.f32 %f567, %f562, %f346; - mul.ftz.f32 %f568, %f399, %f563; - mul.ftz.f32 %f569, %f568, %f140; - neg.ftz.f32 %f570, %f569; - mul.ftz.f32 %f571, %f530, %f570; - sub.ftz.f32 %f572, %f571, %f565; - mul.ftz.f32 %f573, %f533, %f570; - sub.ftz.f32 %f574, %f573, %f566; - mul.ftz.f32 %f575, %f536, %f570; - sub.ftz.f32 %f576, %f575, %f567; - @!%p4 bra $Lt_0_50690; - .loc 17 326 0 - add.ftz.f32 %f138, %f572, %f138; - .loc 17 327 0 - mul.ftz.f32 %f577, %f303, %f500; - neg.ftz.f32 %f578, %f577; - mov.f32 %f579, %f6; - fma.rn.ftz.f32 %f580, %f578, %f572, %f579; - mov.f32 %f6, %f580; - .loc 17 329 0 - add.ftz.f32 %f137, %f574, %f137; - .loc 17 330 0 - mul.ftz.f32 %f581, %f303, %f501; - neg.ftz.f32 %f582, %f581; - mov.f32 %f583, %f8; - fma.rn.ftz.f32 %f584, %f582, %f574, %f583; - mov.f32 %f8, %f584; - .loc 17 331 0 - mov.f32 %f585, %f12; - fma.rn.ftz.f32 %f586, %f578, %f574, %f585; - mov.f32 %f12, %f586; - .loc 17 333 0 - add.ftz.f32 %f136, %f576, %f136; - .loc 17 334 0 - mov.f32 %f587, %f10; - mul.ftz.f32 %f588, %f303, %f502; - neg.ftz.f32 %f589, %f588; - fma.rn.ftz.f32 %f590, %f589, %f576, %f587; - mov.f32 %f10, %f590; - .loc 17 335 0 - mov.f32 %f591, %f14; - fma.rn.ftz.f32 %f592, %f578, %f576, %f591; - mov.f32 %f14, %f592; - .loc 17 336 0 - fma.rn.ftz.f32 %f15, %f582, %f576, %f15; - mov.f32 %f16, %f15; - bra.uni $Lt_0_50434; -$Lt_0_50690: - .loc 17 338 0 - add.ftz.f32 %f138, %f572, %f138; - .loc 17 339 0 - add.ftz.f32 %f137, %f574, %f137; - .loc 17 340 0 - add.ftz.f32 %f136, %f576, %f136; -$Lt_0_50434: - .loc 17 347 0 - rcp.approx.ftz.f32 %f593, %f395; - mul.ftz.f32 %f594, %f513, %f399; - mul.ftz.f32 %f595, %f594, %f140; - neg.ftz.f32 %f596, %f595; - mul.ftz.f32 %f597, %f513, %f563; - mul.ftz.f32 %f598, %f54, %f176; - mul.ftz.f32 %f599, %f60, %f176; - add.ftz.f32 %f600, %f176, %f176; - mul.ftz.f32 %f601, %f52, %f176; - mul.ftz.f32 %f602, %f47, %f176; - mul.ftz.f32 %f603, %f69, %f176; - mul.ftz.f32 %f604, %f61, %f176; - add.ftz.f32 %f605, %f227, %f227; - mul.ftz.f32 %f606, %f46, %f227; - mul.ftz.f32 %f607, %f59, %f227; - mul.ftz.f32 %f608, %f52, %f227; - mul.ftz.f32 %f609, %f47, %f227; - mul.ftz.f32 %f610, %f54, %f210; - add.ftz.f32 %f611, %f210, %f210; - mul.ftz.f32 %f612, %f46, %f210; - mul.ftz.f32 %f613, %f52, %f210; - mul.ftz.f32 %f614, %f51, %f210; - mul.ftz.f32 %f615, %f84, %f210; - mul.ftz.f32 %f616, %f46, %f203; - mul.ftz.f32 %f617, %f59, %f203; - mul.ftz.f32 %f618, %f51, %f203; - mul.ftz.f32 %f619, %f69, %f203; - mul.ftz.f32 %f620, %f227, %f220; - mul.ftz.f32 %f621, %f61, %f220; - add.ftz.f32 %f622, %f237, %f237; - mul.ftz.f32 %f623, %f237, %f210; - mul.ftz.f32 %f624, %f59, %f237; - mul.ftz.f32 %f625, %f597, %f140; - mul.ftz.f32 %f626, %f600, %f237; - mul.ftz.f32 %f627, %f60, %f605; - mul.ftz.f32 %f628, %f605, %f203; - mul.ftz.f32 %f629, %f237, %f191; - mul.ftz.f32 %f630, %f54, %f191; - mul.ftz.f32 %f631, %f220, %f191; - mul.ftz.f32 %f632, %f61, %f191; - add.ftz.f32 %f633, %f215, %f215; - mul.ftz.f32 %f634, %f227, %f215; - mul.ftz.f32 %f635, %f232, %f210; - mul.ftz.f32 %f636, %f53, %f232; - mul.ftz.f32 %f637, %f611, %f191; - mul.ftz.f32 %f638, %f52, %f611; - mul.ftz.f32 %f639, %f616, %f215; - mul.ftz.f32 %f640, %f617, %f215; - mul.ftz.f32 %f641, %f618, %f232; - mul.ftz.f32 %f642, %f618, %f215; - mul.ftz.f32 %f643, %f622, %f176; - mul.ftz.f32 %f644, %f624, %f191; - neg.ftz.f32 %f645, %f625; - mul.ftz.f32 %f646, %f46, %f629; - mul.ftz.f32 %f647, %f633, %f176; - mul.ftz.f32 %f648, %f61, %f633; - mul.ftz.f32 %f649, %f46, %f631; - sub.ftz.f32 %f650, %f649, %f639; - mul.ftz.f32 %f651, %f59, %f631; - sub.ftz.f32 %f652, %f651, %f640; - mul.ftz.f32 %f653, %f51, %f629; - sub.ftz.f32 %f654, %f653, %f641; - mul.ftz.f32 %f655, %f51, %f631; - sub.ftz.f32 %f656, %f655, %f642; - mul.ftz.f32 %f657, %f232, %f617; - sub.ftz.f32 %f658, %f657, %f644; - mul.ftz.f32 %f659, %f232, %f616; - sub.ftz.f32 %f660, %f659, %f646; - mul.ftz.f32 %f661, %f60, %f374; - sub.ftz.f32 %f662, %f650, %f661; - mul.ftz.f32 %f663, %f47, %f374; - sub.ftz.f32 %f664, %f652, %f663; - mul.ftz.f32 %f665, %f237, %f603; - sub.ftz.f32 %f666, %f654, %f665; - mul.ftz.f32 %f667, %f53, %f374; - sub.ftz.f32 %f668, %f656, %f667; - fma.rn.ftz.f32 %f669, %f47, %f626, %f658; - fma.rn.ftz.f32 %f670, %f60, %f643, %f660; - fma.rn.ftz.f32 %f671, %f60, %f372, %f662; - fma.rn.ftz.f32 %f672, %f47, %f372, %f664; - fma.rn.ftz.f32 %f673, %f176, %f621, %f666; - fma.rn.ftz.f32 %f674, %f53, %f372, %f668; - mul.ftz.f32 %f675, %f220, %f601; - sub.ftz.f32 %f676, %f669, %f675; - mul.ftz.f32 %f677, %f220, %f598; - sub.ftz.f32 %f678, %f670, %f677; - fma.rn.ftz.f32 %f679, %f54, %f647, %f671; - mul.ftz.f32 %f680, %f232, %f602; - sub.ftz.f32 %f681, %f672, %f680; - fma.rn.ftz.f32 %f682, %f227, %f619, %f673; - mul.ftz.f32 %f683, %f51, %f634; - sub.ftz.f32 %f684, %f674, %f683; - mul.ftz.f32 %f685, %f47, %f628; - sub.ftz.f32 %f686, %f676, %f685; - mul.ftz.f32 %f687, %f203, %f627; - sub.ftz.f32 %f688, %f678, %f687; - mul.ftz.f32 %f689, %f232, %f599; - sub.ftz.f32 %f690, %f679, %f689; - mul.ftz.f32 %f691, %f59, %f634; - sub.ftz.f32 %f692, %f681, %f691; - fma.rn.ftz.f32 %f693, %f237, %f614, %f682; - mul.ftz.f32 %f694, %f176, %f636; - sub.ftz.f32 %f695, %f684, %f694; - fma.rn.ftz.f32 %f696, %f203, %f613, %f686; - mul.ftz.f32 %f697, %f46, %f623; - sub.ftz.f32 %f698, %f688, %f697; - fma.rn.ftz.f32 %f699, %f60, %f376, %f690; - fma.rn.ftz.f32 %f700, %f52, %f647, %f692; - mul.ftz.f32 %f701, %f61, %f372; - sub.ftz.f32 %f702, %f693, %f701; - fma.rn.ftz.f32 %f703, %f176, %f648, %f695; - mul.ftz.f32 %f704, %f59, %f623; - sub.ftz.f32 %f705, %f696, %f704; - fma.rn.ftz.f32 %f706, %f46, %f620, %f698; - mul.ftz.f32 %f707, %f215, %f606; - sub.ftz.f32 %f708, %f699, %f707; - mul.ftz.f32 %f709, %f191, %f638; - sub.ftz.f32 %f710, %f700, %f709; - mul.ftz.f32 %f711, %f51, %f620; - sub.ftz.f32 %f712, %f702, %f711; - fma.rn.ftz.f32 %f713, %f51, %f635, %f703; - fma.rn.ftz.f32 %f714, %f220, %f607, %f705; - fma.rn.ftz.f32 %f715, %f203, %f610, %f706; - mul.ftz.f32 %f716, %f54, %f637; - sub.ftz.f32 %f717, %f708, %f716; - fma.rn.ftz.f32 %f718, %f59, %f635, %f710; - fma.rn.ftz.f32 %f719, %f232, %f604, %f712; - fma.rn.ftz.f32 %f720, %f53, %f376, %f713; - fma.rn.ftz.f32 %f721, %f191, %f608, %f714; - mul.ftz.f32 %f722, %f232, %f598; - sub.ftz.f32 %f723, %f715, %f722; - fma.rn.ftz.f32 %f724, %f232, %f612, %f717; - fma.rn.ftz.f32 %f725, %f191, %f609, %f718; - mul.ftz.f32 %f726, %f227, %f632; - sub.ftz.f32 %f727, %f726, %f719; - mul.ftz.f32 %f728, %f191, %f615; - sub.ftz.f32 %f729, %f720, %f728; - mul.ftz.f32 %f730, %f232, %f601; - sub.ftz.f32 %f731, %f721, %f730; - fma.rn.ftz.f32 %f732, %f227, %f630, %f723; - mul.ftz.f32 %f733, %f724, %f22; - mul.ftz.f32 %f734, %f725, %f21; - mul.ftz.f32 %f735, %f727, %f23; - mul.ftz.f32 %f736, %f729, %f23; - mul.ftz.f32 %f737, %f731, %f21; - mul.ftz.f32 %f738, %f732, %f22; - mul.ftz.f32 %f739, %f593, %f733; - mul.ftz.f32 %f740, %f593, %f734; - mul.ftz.f32 %f741, %f593, %f735; - mul.ftz.f32 %f742, %f593, %f736; - mul.ftz.f32 %f743, %f593, %f737; - mul.ftz.f32 %f744, %f593, %f738; - mul.ftz.f32 %f745, %f739, %f401; - mul.ftz.f32 %f746, %f740, %f401; - mul.ftz.f32 %f747, %f741, %f401; - mul.ftz.f32 %f748, %f742, %f401; - mul.ftz.f32 %f749, %f743, %f401; - mul.ftz.f32 %f750, %f744, %f401; - mul.ftz.f32 %f751, %f569, %f555; - mul.ftz.f32 %f752, %f52, %f749; - mul.ftz.f32 %f753, %f47, %f746; - sub.ftz.f32 %f754, %f753, %f752; - mul.ftz.f32 %f755, %f54, %f750; - mul.ftz.f32 %f756, %f745, %f60; - sub.ftz.f32 %f757, %f756, %f755; - add.ftz.f32 %f758, %f754, %f757; - mul.ftz.f32 %f759, %f61, %f747; - mul.ftz.f32 %f760, %f748, %f53; - sub.ftz.f32 %f761, %f760, %f759; - add.ftz.f32 %f762, %f758, %f761; - mul.ftz.f32 %f763, %f762, %f645; - sub.ftz.f32 %f764, %f763, %f751; - fma.rn.ftz.f32 %f765, %f357, %f596, %f764; - add.ftz.f32 %f135, %f135, %f765; - .loc 17 348 0 - mul.ftz.f32 %f766, %f54, %f227; - mul.ftz.f32 %f767, %f53, %f210; - mul.ftz.f32 %f768, %f47, %f203; - mul.ftz.f32 %f769, %f60, %f203; - add.ftz.f32 %f770, %f220, %f220; - mul.ftz.f32 %f771, %f47, %f191; - mul.ftz.f32 %f772, %f60, %f191; - mul.ftz.f32 %f773, %f53, %f191; - mul.ftz.f32 %f774, %f52, %f215; - mul.ftz.f32 %f775, %f54, %f215; - mul.ftz.f32 %f776, %f215, %f203; - mul.ftz.f32 %f777, %f232, %f203; - mul.ftz.f32 %f778, %f64, %f232; - mul.ftz.f32 %f779, %f59, %f770; - mul.ftz.f32 %f780, %f46, %f770; - mul.ftz.f32 %f781, %f52, %f631; - mul.ftz.f32 %f782, %f633, %f237; - mul.ftz.f32 %f783, %f51, %f633; - mul.ftz.f32 %f784, %f775, %f203; - mul.ftz.f32 %f785, %f61, %f776; - fma.rn.ftz.f32 %f786, %f59, %f782, %f781; - mul.ftz.f32 %f787, %f46, %f782; - sub.ftz.f32 %f788, %f787, %f784; - mul.ftz.f32 %f789, %f61, %f631; - sub.ftz.f32 %f790, %f789, %f785; - mul.ftz.f32 %f791, %f203, %f774; - sub.ftz.f32 %f792, %f786, %f791; - fma.rn.ftz.f32 %f793, %f54, %f631, %f788; - fma.rn.ftz.f32 %f794, %f237, %f783, %f790; - mul.ftz.f32 %f795, %f232, %f779; - sub.ftz.f32 %f796, %f792, %f795; - mul.ftz.f32 %f797, %f232, %f780; - sub.ftz.f32 %f798, %f793, %f797; - mul.ftz.f32 %f799, %f237, %f773; - sub.ftz.f32 %f800, %f794, %f799; - fma.rn.ftz.f32 %f801, %f232, %f768, %f796; - fma.rn.ftz.f32 %f802, %f232, %f769, %f798; - fma.rn.ftz.f32 %f803, %f53, %f777, %f800; - mul.ftz.f32 %f804, %f237, %f771; - sub.ftz.f32 %f805, %f801, %f804; - mul.ftz.f32 %f806, %f237, %f772; - sub.ftz.f32 %f807, %f802, %f806; - mul.ftz.f32 %f808, %f220, %f778; - sub.ftz.f32 %f809, %f803, %f808; - mul.ftz.f32 %f810, %f47, %f623; - sub.ftz.f32 %f811, %f805, %f810; - mul.ftz.f32 %f812, %f60, %f623; - sub.ftz.f32 %f813, %f807, %f812; - mul.ftz.f32 %f814, %f237, %f767; - sub.ftz.f32 %f815, %f809, %f814; - fma.rn.ftz.f32 %f816, %f47, %f620, %f811; - fma.rn.ftz.f32 %f817, %f60, %f620, %f813; - fma.rn.ftz.f32 %f818, %f53, %f620, %f815; - fma.rn.ftz.f32 %f819, %f232, %f613, %f816; - mul.ftz.f32 %f820, %f215, %f766; - sub.ftz.f32 %f821, %f817, %f820; - mul.ftz.f32 %f822, %f61, %f634; - sub.ftz.f32 %f823, %f818, %f822; - mul.ftz.f32 %f824, %f215, %f608; - sub.ftz.f32 %f825, %f819, %f824; - fma.rn.ftz.f32 %f826, %f232, %f610, %f821; - fma.rn.ftz.f32 %f827, %f61, %f635, %f823; - mul.ftz.f32 %f828, %f825, %f21; - mul.ftz.f32 %f829, %f826, %f22; - mul.ftz.f32 %f830, %f827, %f23; - mul.ftz.f32 %f831, %f593, %f828; - mul.ftz.f32 %f832, %f593, %f829; - mul.ftz.f32 %f833, %f593, %f830; - mul.ftz.f32 %f834, %f831, %f401; - mul.ftz.f32 %f835, %f832, %f401; - mul.ftz.f32 %f836, %f833, %f401; - mul.ftz.f32 %f837, %f569, %f556; - mul.ftz.f32 %f838, %f46, %f745; - mul.ftz.f32 %f839, %f835, %f54; - sub.ftz.f32 %f840, %f839, %f838; - mul.ftz.f32 %f841, %f59, %f746; - mul.ftz.f32 %f842, %f834, %f52; - sub.ftz.f32 %f843, %f842, %f841; - add.ftz.f32 %f844, %f840, %f843; - mul.ftz.f32 %f845, %f51, %f748; - mul.ftz.f32 %f846, %f836, %f61; - sub.ftz.f32 %f847, %f846, %f845; - add.ftz.f32 %f848, %f844, %f847; - mul.ftz.f32 %f849, %f848, %f645; - sub.ftz.f32 %f850, %f849, %f837; - fma.rn.ftz.f32 %f851, %f364, %f596, %f850; - add.ftz.f32 %f134, %f134, %f851; - .loc 17 349 0 - mul.ftz.f32 %f852, %f569, %f557; - mul.ftz.f32 %f853, %f47, %f834; - mul.ftz.f32 %f854, %f59, %f749; - sub.ftz.f32 %f855, %f854, %f853; - mul.ftz.f32 %f856, %f60, %f835; - mul.ftz.f32 %f857, %f750, %f46; - sub.ftz.f32 %f858, %f857, %f856; - add.ftz.f32 %f859, %f855, %f858; - mul.ftz.f32 %f860, %f53, %f836; - mul.ftz.f32 %f861, %f747, %f51; - sub.ftz.f32 %f862, %f861, %f860; - add.ftz.f32 %f863, %f859, %f862; - mul.ftz.f32 %f864, %f863, %f645; - sub.ftz.f32 %f865, %f864, %f852; - fma.rn.ftz.f32 %f866, %f367, %f596, %f865; - add.ftz.f32 %f133, %f133, %f866; - mul.lo.s32 %r35, %r14, %r1; - cvt.s64.s32 %rd54, %r35; - mul.wide.s32 %rd55, %r35, 4; - add.u64 %rd25, %rd25, %rd55; - setp.gt.u64 %p23, %rd28, %rd25; - @%p23 bra $Lt_0_40962; - bra.uni $Lt_0_40450; -$Lt_0_56834: - mov.f32 %f133, 0f00000000; // 0 - mov.f32 %f134, 0f00000000; // 0 - mov.f32 %f135, 0f00000000; // 0 - mov.f32 %f136, 0f00000000; // 0 - mov.f32 %f137, 0f00000000; // 0 - mov.f32 %f138, 0f00000000; // 0 - mov.f32 %f139, 0f00000000; // 0 -$Lt_0_40450: - mov.u32 %r36, 1; - setp.le.s32 %p24, %r1, %r36; - @%p24 bra $Lt_0_53250; - .loc 17 352 0 - mov.u64 %rd56, __cuda___cuda_local_var_33207_55_non_const_red_acc144; - cvt.s64.s32 %rd57, %r2; - mul.wide.s32 %rd58, %r2, 4; - add.u64 %rd59, %rd56, %rd58; - mov.f32 %f867, %f138; - st.shared.f32 [%rd59+0], %f867; - mov.f32 %f868, %f137; - st.shared.f32 [%rd59+512], %f868; - mov.f32 %f869, %f136; - st.shared.f32 [%rd59+1024], %f869; - mov.f32 %f870, %f135; - st.shared.f32 [%rd59+1536], %f870; - mov.f32 %f871, %f134; - st.shared.f32 [%rd59+2048], %f871; - mov.f32 %f872, %f133; - st.shared.f32 [%rd59+2560], %f872; - shr.s32 %r37, %r1, 31; - mov.s32 %r38, 1; - and.b32 %r39, %r37, %r38; - add.s32 %r40, %r39, %r1; - shr.s32 %r41, %r40, 1; - mov.s32 %r42, %r41; - mov.u32 %r43, 0; - setp.ne.u32 %p25, %r41, %r43; - @!%p25 bra $Lt_0_51714; -$Lt_0_52226: - setp.ge.u32 %p26, %r16, %r42; - @%p26 bra $Lt_0_52482; - add.u32 %r44, %r2, %r42; - cvt.u64.u32 %rd60, %r44; - mul.wide.u32 %rd61, %r44, 4; - add.u64 %rd62, %rd56, %rd61; - ld.shared.f32 %f873, [%rd62+0]; - add.ftz.f32 %f867, %f873, %f867; - st.shared.f32 [%rd59+0], %f867; - ld.shared.f32 %f874, [%rd62+512]; - add.ftz.f32 %f868, %f874, %f868; - st.shared.f32 [%rd59+512], %f868; - ld.shared.f32 %f875, [%rd62+1024]; - add.ftz.f32 %f869, %f875, %f869; - st.shared.f32 [%rd59+1024], %f869; - ld.shared.f32 %f876, [%rd62+1536]; - add.ftz.f32 %f870, %f876, %f870; - st.shared.f32 [%rd59+1536], %f870; - ld.shared.f32 %f877, [%rd62+2048]; - add.ftz.f32 %f871, %f877, %f871; - st.shared.f32 [%rd59+2048], %f871; - ld.shared.f32 %f878, [%rd62+2560]; - add.ftz.f32 %f872, %f878, %f872; - st.shared.f32 [%rd59+2560], %f872; -$Lt_0_52482: - shr.u32 %r42, %r42, 1; - mov.u32 %r45, 0; - setp.ne.u32 %p27, %r42, %r45; - @%p27 bra $Lt_0_52226; -$Lt_0_51714: - mov.f32 %f138, %f867; - mov.f32 %f137, %f868; - mov.f32 %f136, %f869; - mov.f32 %f135, %f870; - mov.f32 %f134, %f871; - mov.f32 %f133, %f872; - ld.param.s32 %r46, [__cudaparm_kernel_ellipsoid_eflag]; - mov.s32 %r47, 0; - set.gt.u32.s32 %r48, %r46, %r47; - neg.s32 %r49, %r48; - ld.param.s32 %r50, [__cudaparm_kernel_ellipsoid_vflag]; - mov.s32 %r51, 0; - set.gt.u32.s32 %r52, %r50, %r51; - neg.s32 %r53, %r52; - or.b32 %r54, %r49, %r53; - mov.u32 %r55, 0; - setp.eq.s32 %p28, %r54, %r55; - @%p28 bra $Lt_0_53250; - mov.f32 %f867, %f6; - st.shared.f32 [%rd59+0], %f867; - mov.f32 %f868, %f8; - st.shared.f32 [%rd59+512], %f868; - mov.f32 %f869, %f10; - st.shared.f32 [%rd59+1024], %f869; - mov.f32 %f870, %f12; - st.shared.f32 [%rd59+1536], %f870; - mov.f32 %f871, %f14; - st.shared.f32 [%rd59+2048], %f871; - mov.f32 %f872, %f15; - st.shared.f32 [%rd59+2560], %f872; - mov.f32 %f879, %f139; - st.shared.f32 [%rd59+3072], %f879; - mov.s32 %r56, %r41; - @!%p25 bra $Lt_0_53762; -$Lt_0_54274: - setp.ge.u32 %p29, %r16, %r56; - @%p29 bra $Lt_0_54530; - add.u32 %r57, %r2, %r56; - cvt.u64.u32 %rd63, %r57; - mul.wide.u32 %rd64, %r57, 4; - add.u64 %rd65, %rd56, %rd64; - ld.shared.f32 %f880, [%rd65+0]; - add.ftz.f32 %f867, %f880, %f867; - st.shared.f32 [%rd59+0], %f867; - ld.shared.f32 %f881, [%rd65+512]; - add.ftz.f32 %f868, %f881, %f868; - st.shared.f32 [%rd59+512], %f868; - ld.shared.f32 %f882, [%rd65+1024]; - add.ftz.f32 %f869, %f882, %f869; - st.shared.f32 [%rd59+1024], %f869; - ld.shared.f32 %f883, [%rd65+1536]; - add.ftz.f32 %f870, %f883, %f870; - st.shared.f32 [%rd59+1536], %f870; - ld.shared.f32 %f884, [%rd65+2048]; - add.ftz.f32 %f871, %f884, %f871; - st.shared.f32 [%rd59+2048], %f871; - ld.shared.f32 %f885, [%rd65+2560]; - add.ftz.f32 %f872, %f885, %f872; - st.shared.f32 [%rd59+2560], %f872; - ld.shared.f32 %f886, [%rd65+3072]; - add.ftz.f32 %f879, %f886, %f879; - st.shared.f32 [%rd59+3072], %f879; -$Lt_0_54530: - shr.u32 %r56, %r56, 1; - mov.u32 %r58, 0; - setp.ne.u32 %p30, %r56, %r58; - @%p30 bra $Lt_0_54274; -$Lt_0_53762: - mov.f32 %f6, %f867; - mov.f32 %f8, %f868; - mov.f32 %f10, %f869; - mov.f32 %f12, %f870; - mov.f32 %f14, %f871; - mov.f32 %f16, %f872; - mov.f32 %f139, %f879; -$Lt_0_53250: -$Lt_0_51202: - mov.u32 %r59, 0; - setp.ne.s32 %p31, %r16, %r59; - @%p31 bra $Lt_0_55298; - ld.param.u64 %rd66, [__cudaparm_kernel_ellipsoid_engv]; - add.u64 %rd67, %rd66, %rd3; - ld.param.s32 %r60, [__cudaparm_kernel_ellipsoid_astride]; - ld.param.s32 %r61, [__cudaparm_kernel_ellipsoid_eflag]; - mov.u32 %r62, 0; - setp.le.s32 %p32, %r61, %r62; - @%p32 bra $Lt_0_55810; - st.global.f32 [%rd67+0], %f139; - cvt.s64.s32 %rd68, %r60; - mul.wide.s32 %rd69, %r60, 4; - add.u64 %rd67, %rd67, %rd69; -$Lt_0_55810: - ld.param.s32 %r63, [__cudaparm_kernel_ellipsoid_vflag]; - mov.u32 %r64, 0; - setp.le.s32 %p33, %r63, %r64; - @%p33 bra $Lt_0_56322; - mov.f32 %f887, %f6; - st.global.f32 [%rd67+0], %f887; - cvt.s64.s32 %rd70, %r60; - mul.wide.s32 %rd71, %r60, 4; - add.u64 %rd72, %rd71, %rd67; - mov.f32 %f888, %f8; - st.global.f32 [%rd72+0], %f888; - add.u64 %rd73, %rd71, %rd72; - mov.f32 %f889, %f10; - st.global.f32 [%rd73+0], %f889; - add.u64 %rd74, %rd71, %rd73; - mov.f32 %f890, %f12; - st.global.f32 [%rd74+0], %f890; - add.u64 %rd67, %rd71, %rd74; - mov.f32 %f891, %f14; - st.global.f32 [%rd67+0], %f891; - mov.f32 %f892, %f16; - add.u64 %rd75, %rd71, %rd67; - st.global.f32 [%rd75+0], %f892; -$Lt_0_56322: - ld.param.u64 %rd76, [__cudaparm_kernel_ellipsoid_ans]; - mul.lo.u64 %rd77, %rd2, 16; - add.u64 %rd78, %rd76, %rd77; - mov.f32 %f893, %f894; - st.global.v4.f32 [%rd78+0], {%f138,%f137,%f136,%f893}; - add.s32 %r65, %r8, %r60; - cvt.s64.s32 %rd79, %r65; - mul.wide.s32 %rd80, %r65, 16; - add.u64 %rd81, %rd76, %rd80; - mov.f32 %f895, %f896; - st.global.v4.f32 [%rd81+0], {%f135,%f134,%f133,%f895}; -$Lt_0_55298: -$Lt_0_39938: - .loc 17 355 0 - exit; -$LDWend_kernel_ellipsoid: - } // kernel_ellipsoid - diff --git a/lib/gpu/gayberne_lj.ptx b/lib/gpu/gayberne_lj.ptx deleted file mode 100644 index d77eaa1059..0000000000 --- a/lib/gpu/gayberne_lj.ptx +++ /dev/null @@ -1,1915 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009b93_00000000-9_lal_gayberne_lj.cpp3.i (/home/sjplimp/ccBI#.hcleqA) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009b93_00000000-8_lal_gayberne_lj.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_ellipsoid_extra.h" - .file 17 "lal_gayberne_lj.cu" - .file 18 "/usr/local/cuda/include/common_functions.h" - .file 19 "/usr/local/cuda/include/math_functions.h" - .file 20 "/usr/local/cuda/include/math_constants.h" - .file 21 "/usr/local/cuda/include/device_functions.h" - .file 22 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 24 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 26 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 27 "/usr/local/cuda/include/surface_functions.h" - .file 28 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 29 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - - .entry kernel_sphere_ellipsoid ( - .param .u64 __cudaparm_kernel_sphere_ellipsoid_x_, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_q, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_shape, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_well, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_gum, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_sig_eps, - .param .s32 __cudaparm_kernel_sphere_ellipsoid_ntypes, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_lshape, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_dev_nbor, - .param .s32 __cudaparm_kernel_sphere_ellipsoid_stride, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_ans, - .param .u64 __cudaparm_kernel_sphere_ellipsoid___val_paramengv, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_err_flag, - .param .s32 __cudaparm_kernel_sphere_ellipsoid_eflag, - .param .s32 __cudaparm_kernel_sphere_ellipsoid_vflag, - .param .s32 __cudaparm_kernel_sphere_ellipsoid_start, - .param .s32 __cudaparm_kernel_sphere_ellipsoid_inum, - .param .s32 __cudaparm_kernel_sphere_ellipsoid_t_per_atom) - { - .reg .u32 %r<59>; - .reg .u64 %rd<79>; - .reg .f32 %f<432>; - .reg .pred %p<35>; - .shared .align 16 .b8 __cuda___cuda_local_var_32888_33_non_const_sp_lj124[16]; - .shared .align 4 .b8 __cuda___cuda_local_var_33089_55_non_const_red_acc140[3072]; - // __cuda_local_var_32895_10_non_const_f = 48 - // __cuda_local_var_32899_9_non_const_virial = 16 - .loc 17 28 0 -$LDWbegin_kernel_sphere_ellipsoid: - .loc 17 34 0 - ld.param.u64 %rd1, [__cudaparm_kernel_sphere_ellipsoid_gum]; - ldu.global.f32 %f1, [%rd1+12]; - .loc 17 35 0 - ld.global.f32 %f2, [%rd1+16]; - .loc 17 36 0 - ld.global.f32 %f3, [%rd1+20]; - .loc 17 37 0 - ld.global.f32 %f4, [%rd1+24]; - st.shared.v4.f32 [__cuda___cuda_local_var_32888_33_non_const_sp_lj124+0], {%f1,%f2,%f3,%f4}; - .loc 17 46 0 - mov.f32 %f5, 0f00000000; // 0 - mov.f32 %f6, %f5; - mov.f32 %f7, 0f00000000; // 0 - mov.f32 %f8, %f7; - mov.f32 %f9, 0f00000000; // 0 - mov.f32 %f10, %f9; - mov.f32 %f11, 0f00000000; // 0 - mov.f32 %f12, %f11; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, %f13; - mov.f32 %f15, 0f00000000; // 0 - mov.f32 %f16, %f15; - ld.param.s32 %r1, [__cudaparm_kernel_sphere_ellipsoid_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_sphere_ellipsoid_start]; - add.s32 %r10, %r9, %r8; - ld.param.s32 %r11, [__cudaparm_kernel_sphere_ellipsoid_inum]; - setp.ge.s32 %p1, %r10, %r11; - @%p1 bra $Lt_0_55042; - .loc 17 51 0 - cvt.s64.s32 %rd2, %r10; - mul.wide.s32 %rd3, %r10, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_sphere_ellipsoid_dev_nbor]; - add.u64 %rd5, %rd4, %rd3; - ld.global.s32 %r12, [%rd5+0]; - ld.param.s32 %r13, [__cudaparm_kernel_sphere_ellipsoid_stride]; - cvt.s64.s32 %rd6, %r13; - mul.wide.s32 %rd7, %r13, 4; - add.u64 %rd8, %rd7, %rd5; - ld.global.s32 %r14, [%rd8+0]; - .loc 17 54 0 - ld.param.u64 %rd9, [__cudaparm_kernel_sphere_ellipsoid_x_]; - cvt.s64.s32 %rd10, %r12; - mul.wide.s32 %rd11, %r12, 16; - add.u64 %rd12, %rd9, %rd11; - ld.global.v4.f32 {%f17,%f18,%f19,%f20}, [%rd12+0]; - .loc 17 57 0 - cvt.rzi.ftz.s32.f32 %r15, %f20; - cvt.s64.s32 %rd13, %r15; - mul.wide.s32 %rd14, %r15, 16; - ld.param.u64 %rd15, [__cudaparm_kernel_sphere_ellipsoid_shape]; - add.u64 %rd16, %rd14, %rd15; - ld.global.f32 %f21, [%rd16+0]; - .loc 17 58 0 - ld.param.u64 %rd17, [__cudaparm_kernel_sphere_ellipsoid_well]; - add.u64 %rd18, %rd14, %rd17; - ld.global.f32 %f22, [%rd18+0]; - cvt.s32.s64 %r16, %rd6; - sub.s32 %r17, %r1, 1; - and.b32 %r18, %r17, %r2; - add.u64 %rd19, %rd7, %rd8; - mul.lo.s32 %r19, %r16, %r18; - cvt.s64.s32 %rd20, %r19; - mul.wide.s32 %rd21, %r19, 4; - add.u64 %rd22, %rd19, %rd21; - mov.s64 %rd23, %rd22; - mul.lo.s32 %r20, %r16, %r14; - cvt.s64.s32 %rd24, %r20; - mul.wide.s32 %rd25, %r20, 4; - add.u64 %rd26, %rd19, %rd25; - setp.ge.u64 %p2, %rd22, %rd26; - @%p2 bra $Lt_0_56578; - ld.param.s32 %r21, [__cudaparm_kernel_sphere_ellipsoid_eflag]; - mov.s32 %r22, 0; - setp.gt.s32 %p3, %r21, %r22; - ld.param.s32 %r23, [__cudaparm_kernel_sphere_ellipsoid_vflag]; - mov.s32 %r24, 0; - setp.gt.s32 %p4, %r23, %r24; - ld.param.s32 %r25, [__cudaparm_kernel_sphere_ellipsoid_ntypes]; - mul.lo.s32 %r26, %r25, %r15; - ld.param.u64 %rd27, [__cudaparm_kernel_sphere_ellipsoid_lshape]; - mul.lo.u64 %rd28, %rd13, 4; - add.u64 %rd29, %rd27, %rd28; - ld.param.u64 %rd30, [__cudaparm_kernel_sphere_ellipsoid_sig_eps]; - ld.param.u64 %rd31, [__cudaparm_kernel_sphere_ellipsoid_q]; - mov.f32 %f23, 0f00000000; // 0 - mov.f32 %f24, 0f00000000; // 0 - mov.f32 %f25, 0f00000000; // 0 - mov.f32 %f26, 0f00000000; // 0 - mov.u64 %rd32, __cuda___cuda_local_var_32888_33_non_const_sp_lj124; -$Lt_0_40706: - // Loop body line 58, nesting depth: 1, estimated iterations: unknown - .loc 17 63 0 - ld.global.s32 %r27, [%rd23+0]; - .loc 17 64 0 - shr.s32 %r28, %r27, 30; - and.b32 %r29, %r28, 3; - cvt.s64.s32 %rd33, %r29; - mul.wide.s32 %rd34, %r29, 4; - add.u64 %rd35, %rd32, %rd34; - ld.shared.f32 %f27, [%rd35+0]; - .loc 17 67 0 - and.b32 %r30, %r27, 1073741823; - cvt.s64.s32 %rd36, %r30; - mul.wide.s32 %rd37, %r30, 16; - add.u64 %rd38, %rd37, %rd9; - ld.global.v4.f32 {%f28,%f29,%f30,%f31}, [%rd38+0]; - .loc 17 86 0 - add.u64 %rd39, %rd37, %rd31; - ld.global.v4.f32 {%f32,%f33,%f34,%f35}, [%rd39+0]; - .loc 17 95 0 - cvt.rzi.ftz.s32.f32 %r31, %f31; - cvt.s64.s32 %rd40, %r31; - mul.wide.s32 %rd41, %r31, 16; - add.u64 %rd42, %rd41, %rd15; - ld.global.v4.f32 {%f36,%f37,%f38,_}, [%rd42+0]; - .loc 16 299 0 - sub.ftz.f32 %f39, %f28, %f17; - mov.f32 %f40, %f39; - .loc 16 300 0 - add.ftz.f32 %f41, %f33, %f33; - add.ftz.f32 %f42, %f35, %f35; - mul.ftz.f32 %f43, %f32, %f32; - mul.ftz.f32 %f44, %f33, %f33; - mul.ftz.f32 %f45, %f34, %f34; - mul.ftz.f32 %f46, %f35, %f35; - add.ftz.f32 %f47, %f34, %f34; - mul.ftz.f32 %f48, %f41, %f34; - mul.ftz.f32 %f49, %f41, %f35; - mul.ftz.f32 %f50, %f42, %f32; - add.ftz.f32 %f51, %f43, %f44; - mul.ftz.f32 %f52, %f47, %f32; - sub.ftz.f32 %f53, %f48, %f50; - sub.ftz.f32 %f54, %f51, %f45; - add.ftz.f32 %f55, %f49, %f52; - mul.ftz.f32 %f56, %f53, %f37; - sub.ftz.f32 %f57, %f54, %f46; - mul.ftz.f32 %f58, %f55, %f38; - mul.ftz.f32 %f59, %f53, %f56; - mul.ftz.f32 %f60, %f57, %f36; - fma.rn.ftz.f32 %f61, %f57, %f60, %f59; - fma.rn.ftz.f32 %f62, %f55, %f58, %f61; - add.ftz.f32 %f63, %f62, %f21; - mov.f32 %f64, %f63; - .loc 16 301 0 - mul.ftz.f32 %f65, %f41, %f32; - sub.ftz.f32 %f66, %f43, %f44; - mul.ftz.f32 %f67, %f47, %f35; - add.ftz.f32 %f68, %f48, %f50; - add.ftz.f32 %f69, %f45, %f66; - sub.ftz.f32 %f70, %f67, %f65; - mul.ftz.f32 %f71, %f68, %f36; - sub.ftz.f32 %f72, %f69, %f46; - mul.ftz.f32 %f73, %f70, %f38; - mul.ftz.f32 %f74, %f72, %f37; - mul.ftz.f32 %f75, %f53, %f74; - fma.rn.ftz.f32 %f76, %f57, %f71, %f75; - fma.rn.ftz.f32 %f77, %f55, %f73, %f76; - mov.f32 %f78, %f77; - .loc 16 302 0 - sub.ftz.f32 %f79, %f66, %f45; - sub.ftz.f32 %f80, %f49, %f52; - add.ftz.f32 %f81, %f65, %f67; - add.ftz.f32 %f82, %f46, %f79; - mul.ftz.f32 %f83, %f80, %f36; - mul.ftz.f32 %f84, %f81, %f37; - mul.ftz.f32 %f85, %f82, %f38; - mul.ftz.f32 %f86, %f53, %f84; - fma.rn.ftz.f32 %f87, %f57, %f83, %f86; - fma.rn.ftz.f32 %f88, %f55, %f85, %f87; - mov.f32 %f89, %f88; - .loc 16 303 0 - sub.ftz.f32 %f90, %f29, %f18; - mov.f32 %f91, %f90; - .loc 16 304 0 - mul.ftz.f32 %f92, %f56, %f72; - fma.rn.ftz.f32 %f93, %f60, %f68, %f92; - fma.rn.ftz.f32 %f94, %f58, %f70, %f93; - mov.f32 %f95, %f94; - .loc 16 305 0 - mul.ftz.f32 %f96, %f72, %f74; - fma.rn.ftz.f32 %f97, %f68, %f71, %f96; - fma.rn.ftz.f32 %f98, %f70, %f73, %f97; - add.ftz.f32 %f99, %f98, %f21; - mov.f32 %f100, %f99; - .loc 16 306 0 - mul.ftz.f32 %f101, %f72, %f84; - fma.rn.ftz.f32 %f102, %f68, %f83, %f101; - fma.rn.ftz.f32 %f103, %f70, %f85, %f102; - mov.f32 %f104, %f103; - .loc 16 307 0 - sub.ftz.f32 %f105, %f30, %f19; - mov.f32 %f106, %f105; - .loc 16 308 0 - mul.ftz.f32 %f107, %f81, %f56; - fma.rn.ftz.f32 %f108, %f60, %f80, %f107; - fma.rn.ftz.f32 %f109, %f58, %f82, %f108; - mov.f32 %f110, %f109; - .loc 16 309 0 - mul.ftz.f32 %f111, %f81, %f74; - fma.rn.ftz.f32 %f112, %f71, %f80, %f111; - fma.rn.ftz.f32 %f113, %f73, %f82, %f112; - mov.f32 %f114, %f113; - .loc 16 310 0 - mul.ftz.f32 %f115, %f81, %f84; - fma.rn.ftz.f32 %f116, %f80, %f83, %f115; - fma.rn.ftz.f32 %f117, %f82, %f85, %f116; - add.ftz.f32 %f118, %f117, %f21; - mov.f32 %f119, %f118; - abs.ftz.f32 %f120, %f94; - abs.ftz.f32 %f121, %f63; - setp.gt.ftz.f32 %p5, %f120, %f121; - @!%p5 bra $Lt_0_40962; - .loc 16 314 0 - mov.f32 %f64, %f94; - mov.f32 %f95, %f63; - .loc 16 315 0 - mov.f32 %f78, %f99; - mov.f32 %f100, %f77; - .loc 16 316 0 - mov.f32 %f89, %f103; - mov.f32 %f104, %f88; - .loc 16 317 0 - mov.f32 %f40, %f90; - mov.f32 %f91, %f39; -$Lt_0_40962: - mov.f32 %f122, %f64; - abs.ftz.f32 %f123, %f122; - abs.ftz.f32 %f124, %f109; - setp.lt.ftz.f32 %p6, %f123, %f124; - @!%p6 bra $Lt_0_41474; - .loc 16 321 0 - mov.f32 %f64, %f109; - mov.f32 %f110, %f122; - .loc 16 322 0 - mov.f32 %f125, %f78; - mov.f32 %f78, %f113; - mov.f32 %f114, %f125; - .loc 16 323 0 - mov.f32 %f126, %f89; - mov.f32 %f89, %f118; - mov.f32 %f119, %f126; - .loc 16 324 0 - mov.f32 %f127, %f40; - mov.f32 %f40, %f105; - mov.f32 %f106, %f127; -$Lt_0_41474: - mov.f32 %f128, %f64; - mov.f32 %f129, 0f00000000; // 0 - setp.neu.ftz.f32 %p7, %f128, %f129; - @!%p7 bra $Lt_0_42242; - bra.uni $Lt_0_43010; -$Lt_0_42242: - mov.f32 %f130, 0f00000000; // 0 - setp.neu.ftz.f32 %p8, %f95, %f130; - @!%p8 bra $Lt_0_42754; - .loc 16 338 0 - mov.f32 %f64, %f95; - mov.f32 %f95, %f128; - .loc 16 339 0 - mov.f32 %f131, %f78; - mov.f32 %f78, %f100; - mov.f32 %f100, %f131; - .loc 16 340 0 - mov.f32 %f132, %f89; - mov.f32 %f89, %f104; - mov.f32 %f104, %f132; - .loc 16 341 0 - mov.f32 %f133, %f40; - mov.f32 %f40, %f91; - mov.f32 %f91, %f133; - bra.uni $Lt_0_43010; -$Lt_0_42754: - mov.f32 %f134, 0f00000000; // 0 - setp.neu.ftz.f32 %p9, %f110, %f134; - @!%p9 bra $Lt_0_43266; - .loc 16 346 0 - mov.f32 %f64, %f110; - mov.f32 %f110, %f128; - .loc 16 347 0 - mov.f32 %f135, %f78; - mov.f32 %f78, %f114; - mov.f32 %f114, %f135; - .loc 16 348 0 - mov.f32 %f136, %f89; - mov.f32 %f89, %f119; - mov.f32 %f119, %f136; - .loc 16 349 0 - mov.f32 %f137, %f40; - mov.f32 %f40, %f106; - mov.f32 %f106, %f137; - bra.uni $Lt_0_43010; -$Lt_0_43266: - .loc 16 352 0 - mov.s32 %r32, 2; - ld.param.u64 %rd43, [__cudaparm_kernel_sphere_ellipsoid_err_flag]; - st.global.s32 [%rd43+0], %r32; -$Lt_0_43010: -$Lt_0_42498: -$Lt_0_41986: - .loc 16 355 0 - div.approx.ftz.f32 %f138, %f95, %f64; - mul.ftz.f32 %f139, %f78, %f138; - sub.ftz.f32 %f140, %f100, %f139; - mov.f32 %f100, %f140; - .loc 16 356 0 - mul.ftz.f32 %f141, %f89, %f138; - sub.ftz.f32 %f142, %f104, %f141; - mov.f32 %f104, %f142; - .loc 16 357 0 - mul.ftz.f32 %f143, %f40, %f138; - sub.ftz.f32 %f144, %f91, %f143; - mov.f32 %f91, %f144; - .loc 16 359 0 - div.approx.ftz.f32 %f145, %f110, %f64; - mul.ftz.f32 %f146, %f78, %f145; - sub.ftz.f32 %f114, %f114, %f146; - .loc 16 360 0 - mul.ftz.f32 %f147, %f89, %f145; - sub.ftz.f32 %f119, %f119, %f147; - .loc 16 361 0 - mul.ftz.f32 %f148, %f40, %f145; - sub.ftz.f32 %f106, %f106, %f148; - abs.ftz.f32 %f149, %f140; - abs.ftz.f32 %f150, %f114; - setp.lt.ftz.f32 %p10, %f149, %f150; - @!%p10 bra $Lt_0_43522; - .loc 16 366 0 - mov.f32 %f100, %f114; - mov.f32 %f114, %f140; - .loc 16 367 0 - mov.f32 %f104, %f119; - mov.f32 %f119, %f142; - .loc 16 368 0 - mov.f32 %f91, %f106; - mov.f32 %f106, %f144; -$Lt_0_43522: - mov.f32 %f151, %f100; - mov.f32 %f152, 0f00000000; // 0 - setp.neu.ftz.f32 %p11, %f151, %f152; - @!%p11 bra $Lt_0_44290; - bra.uni $Lt_0_44546; -$Lt_0_44290: - mov.f32 %f153, 0f00000000; // 0 - setp.neu.ftz.f32 %p12, %f114, %f153; - @!%p12 bra $Lt_0_44546; - .loc 16 383 0 - mov.f32 %f100, %f114; - mov.f32 %f114, %f151; - .loc 16 384 0 - mov.f32 %f154, %f104; - mov.f32 %f104, %f119; - mov.f32 %f119, %f154; - .loc 16 385 0 - mov.f32 %f155, %f91; - mov.f32 %f91, %f106; - mov.f32 %f106, %f155; -$Lt_0_44546: -$Lt_0_44034: - .loc 16 390 0 - div.approx.ftz.f32 %f156, %f114, %f100; - mul.ftz.f32 %f157, %f104, %f156; - sub.ftz.f32 %f119, %f119, %f157; - .loc 16 391 0 - mul.ftz.f32 %f158, %f91, %f156; - sub.ftz.f32 %f106, %f106, %f158; - mov.f32 %f159, 0f00000000; // 0 - setp.eq.ftz.f32 %p13, %f119, %f159; - @!%p13 bra $Lt_0_45058; - .loc 16 394 0 - mov.s32 %r33, 2; - ld.param.u64 %rd44, [__cudaparm_kernel_sphere_ellipsoid_err_flag]; - st.global.s32 [%rd44+0], %r33; -$Lt_0_45058: - .loc 17 115 0 - div.approx.ftz.f32 %f160, %f106, %f119; - mul.ftz.f32 %f161, %f90, %f90; - mul.ftz.f32 %f162, %f160, %f104; - fma.rn.ftz.f32 %f163, %f39, %f39, %f161; - sub.ftz.f32 %f164, %f91, %f162; - fma.rn.ftz.f32 %f165, %f105, %f105, %f163; - div.approx.ftz.f32 %f166, %f164, %f100; - rsqrt.approx.ftz.f32 %f167, %f165; - mul.ftz.f32 %f168, %f166, %f78; - fma.rn.ftz.f32 %f169, %f89, %f160, %f168; - sub.ftz.f32 %f170, %f40, %f169; - div.approx.ftz.f32 %f171, %f170, %f64; - mul.ftz.f32 %f172, %f167, %f171; - .loc 17 127 0 - mul.ftz.f32 %f173, %f166, %f167; - mul.ftz.f32 %f174, %f167, %f90; - mul.ftz.f32 %f175, %f167, %f39; - mul.ftz.f32 %f176, %f167, %f105; - mul.ftz.f32 %f177, %f160, %f167; - mul.ftz.f32 %f178, %f173, %f174; - fma.rn.ftz.f32 %f179, %f175, %f172, %f178; - fma.rn.ftz.f32 %f180, %f176, %f177, %f179; - mov.f32 %f181, 0f3f000000; // 0.5 - mul.ftz.f32 %f182, %f180, %f181; - rsqrt.approx.ftz.f32 %f183, %f182; - .loc 17 131 0 - rcp.approx.ftz.f32 %f184, %f167; - mul.ftz.f32 %f185, %f184, %f172; - .loc 17 136 0 - add.s32 %r34, %r31, %r26; - cvt.s64.s32 %rd45, %r34; - mul.wide.s32 %rd46, %r34, 8; - add.u64 %rd47, %rd30, %rd46; - ld.global.v2.f32 {%f186,%f187}, [%rd47+0]; - .loc 17 138 0 - sub.ftz.f32 %f188, %f184, %f183; - ld.global.f32 %f189, [%rd1+0]; - fma.rn.ftz.f32 %f190, %f189, %f186, %f188; - .loc 17 145 0 - div.approx.ftz.f32 %f191, %f186, %f190; - mul.ftz.f32 %f192, %f191, %f191; - mul.ftz.f32 %f193, %f191, %f192; - mul.ftz.f32 %f194, %f193, %f193; - mul.ftz.f32 %f195, %f194, %f194; - mul.ftz.f32 %f196, %f191, %f194; - add.ftz.f32 %f197, %f195, %f195; - mul.ftz.f32 %f198, %f191, %f197; - sub.ftz.f32 %f199, %f198, %f196; - div.approx.ftz.f32 %f200, %f199, %f186; - mov.f32 %f201, 0f41c00000; // 24 - mul.ftz.f32 %f202, %f200, %f201; - mul.ftz.f32 %f203, %f187, %f202; - .loc 17 150 0 - mul.ftz.f32 %f204, %f183, %f203; - mul.ftz.f32 %f205, %f204, %f183; - mul.ftz.f32 %f206, %f205, %f183; - mov.f32 %f207, 0f3f000000; // 0.5 - mul.ftz.f32 %f208, %f206, %f207; - mul.ftz.f32 %f209, %f208, %f167; - mul.ftz.f32 %f210, %f173, %f184; - mul.ftz.f32 %f211, %f177, %f184; - mul.ftz.f32 %f212, %f167, %f209; - mul.ftz.f32 %f213, %f174, %f210; - fma.rn.ftz.f32 %f214, %f175, %f185, %f213; - fma.rn.ftz.f32 %f215, %f176, %f211, %f214; - mul.ftz.f32 %f216, %f175, %f215; - sub.ftz.f32 %f217, %f185, %f216; - mul.ftz.f32 %f218, %f212, %f217; - fma.rn.ftz.f32 %f219, %f175, %f203, %f218; - .loc 17 151 0 - mul.ftz.f32 %f220, %f174, %f215; - sub.ftz.f32 %f221, %f210, %f220; - mul.ftz.f32 %f222, %f212, %f221; - fma.rn.ftz.f32 %f223, %f174, %f203, %f222; - .loc 17 152 0 - mul.ftz.f32 %f224, %f176, %f215; - sub.ftz.f32 %f225, %f211, %f224; - mul.ftz.f32 %f226, %f212, %f225; - fma.rn.ftz.f32 %f227, %f176, %f203, %f226; - .loc 17 159 0 - ld.global.f32 %f228, [%rd29+0]; - mul.lo.u64 %rd48, %rd40, 4; - add.u64 %rd49, %rd27, %rd48; - ld.global.f32 %f229, [%rd49+0]; - add.ftz.f32 %f230, %f228, %f228; - mul.ftz.f32 %f231, %f229, %f230; - .loc 17 160 0 - mul.ftz.f32 %f232, %f103, %f63; - mul.ftz.f32 %f233, %f113, %f232; - mul.ftz.f32 %f234, %f99, %f63; - mul.ftz.f32 %f235, %f118, %f234; - sub.ftz.f32 %f236, %f235, %f233; - mul.ftz.f32 %f237, %f94, %f77; - mul.ftz.f32 %f238, %f118, %f237; - sub.ftz.f32 %f239, %f236, %f238; - mul.ftz.f32 %f240, %f94, %f88; - fma.rn.ftz.f32 %f241, %f113, %f240, %f239; - mul.ftz.f32 %f242, %f109, %f77; - fma.rn.ftz.f32 %f243, %f103, %f242, %f241; - mul.ftz.f32 %f244, %f109, %f88; - mul.ftz.f32 %f245, %f99, %f244; - sub.ftz.f32 %f246, %f243, %f245; - .loc 17 161 0 - ld.global.f32 %f247, [%rd1+4]; - .loc 17 172 0 - add.u64 %rd50, %rd41, %rd17; - ld.global.v4.f32 {%f248,%f249,%f250,_}, [%rd50+0]; - .loc 16 299 0 - mov.f32 %f40, %f39; - .loc 16 300 0 - mul.ftz.f32 %f251, %f53, %f249; - mul.ftz.f32 %f252, %f55, %f250; - mul.ftz.f32 %f253, %f53, %f251; - mul.ftz.f32 %f254, %f57, %f248; - fma.rn.ftz.f32 %f255, %f57, %f254, %f253; - fma.rn.ftz.f32 %f256, %f55, %f252, %f255; - add.ftz.f32 %f257, %f256, %f22; - mov.f32 %f64, %f257; - .loc 16 301 0 - mul.ftz.f32 %f258, %f68, %f248; - mul.ftz.f32 %f259, %f70, %f250; - mul.ftz.f32 %f260, %f72, %f249; - mul.ftz.f32 %f261, %f53, %f260; - fma.rn.ftz.f32 %f262, %f57, %f258, %f261; - fma.rn.ftz.f32 %f263, %f55, %f259, %f262; - mov.f32 %f78, %f263; - .loc 16 302 0 - mul.ftz.f32 %f264, %f80, %f248; - mul.ftz.f32 %f265, %f81, %f249; - mul.ftz.f32 %f266, %f82, %f250; - mul.ftz.f32 %f267, %f53, %f265; - fma.rn.ftz.f32 %f268, %f57, %f264, %f267; - fma.rn.ftz.f32 %f269, %f55, %f266, %f268; - mov.f32 %f89, %f269; - .loc 16 303 0 - mov.f32 %f91, %f90; - .loc 16 304 0 - mul.ftz.f32 %f270, %f251, %f72; - fma.rn.ftz.f32 %f271, %f254, %f68, %f270; - fma.rn.ftz.f32 %f272, %f252, %f70, %f271; - mov.f32 %f95, %f272; - .loc 16 305 0 - mul.ftz.f32 %f273, %f72, %f260; - fma.rn.ftz.f32 %f274, %f68, %f258, %f273; - fma.rn.ftz.f32 %f275, %f70, %f259, %f274; - add.ftz.f32 %f100, %f22, %f275; - .loc 16 306 0 - mul.ftz.f32 %f276, %f72, %f265; - fma.rn.ftz.f32 %f277, %f68, %f264, %f276; - fma.rn.ftz.f32 %f104, %f70, %f266, %f277; - .loc 16 307 0 - mov.f32 %f106, %f105; - .loc 16 308 0 - mul.ftz.f32 %f278, %f81, %f251; - fma.rn.ftz.f32 %f279, %f254, %f80, %f278; - fma.rn.ftz.f32 %f280, %f252, %f82, %f279; - mov.f32 %f110, %f280; - .loc 16 309 0 - mul.ftz.f32 %f281, %f81, %f260; - fma.rn.ftz.f32 %f282, %f258, %f80, %f281; - fma.rn.ftz.f32 %f114, %f259, %f82, %f282; - .loc 16 310 0 - mul.ftz.f32 %f283, %f81, %f265; - fma.rn.ftz.f32 %f284, %f80, %f264, %f283; - fma.rn.ftz.f32 %f285, %f82, %f266, %f284; - add.ftz.f32 %f119, %f22, %f285; - abs.ftz.f32 %f286, %f272; - abs.ftz.f32 %f287, %f257; - setp.gt.ftz.f32 %p14, %f286, %f287; - @!%p14 bra $Lt_0_45570; - .loc 16 314 0 - mov.f32 %f64, %f272; - mov.f32 %f95, %f257; - .loc 16 315 0 - mov.f32 %f78, %f100; - mov.f32 %f100, %f263; - .loc 16 316 0 - mov.f32 %f89, %f104; - mov.f32 %f104, %f269; - .loc 16 317 0 - mov.f32 %f40, %f90; - mov.f32 %f91, %f39; -$Lt_0_45570: - mov.f32 %f288, %f64; - abs.ftz.f32 %f289, %f288; - abs.ftz.f32 %f290, %f280; - setp.lt.ftz.f32 %p15, %f289, %f290; - @!%p15 bra $Lt_0_46082; - .loc 16 321 0 - mov.f32 %f64, %f280; - mov.f32 %f110, %f288; - .loc 16 322 0 - mov.f32 %f291, %f78; - mov.f32 %f78, %f114; - mov.f32 %f114, %f291; - .loc 16 323 0 - mov.f32 %f292, %f89; - mov.f32 %f89, %f119; - mov.f32 %f119, %f292; - .loc 16 324 0 - mov.f32 %f293, %f40; - mov.f32 %f40, %f105; - mov.f32 %f106, %f293; -$Lt_0_46082: - mov.f32 %f294, %f64; - mov.f32 %f295, 0f00000000; // 0 - setp.neu.ftz.f32 %p16, %f294, %f295; - @!%p16 bra $Lt_0_46850; - bra.uni $Lt_0_47618; -$Lt_0_46850: - mov.f32 %f296, 0f00000000; // 0 - setp.neu.ftz.f32 %p17, %f95, %f296; - @!%p17 bra $Lt_0_47362; - .loc 16 338 0 - mov.f32 %f64, %f95; - mov.f32 %f95, %f294; - .loc 16 339 0 - mov.f32 %f297, %f78; - mov.f32 %f78, %f100; - mov.f32 %f100, %f297; - .loc 16 340 0 - mov.f32 %f298, %f89; - mov.f32 %f89, %f104; - mov.f32 %f104, %f298; - .loc 16 341 0 - mov.f32 %f299, %f40; - mov.f32 %f40, %f91; - mov.f32 %f91, %f299; - bra.uni $Lt_0_47618; -$Lt_0_47362: - mov.f32 %f300, 0f00000000; // 0 - setp.neu.ftz.f32 %p18, %f110, %f300; - @!%p18 bra $Lt_0_47874; - .loc 16 346 0 - mov.f32 %f64, %f110; - mov.f32 %f110, %f294; - .loc 16 347 0 - mov.f32 %f301, %f78; - mov.f32 %f78, %f114; - mov.f32 %f114, %f301; - .loc 16 348 0 - mov.f32 %f302, %f89; - mov.f32 %f89, %f119; - mov.f32 %f119, %f302; - .loc 16 349 0 - mov.f32 %f303, %f40; - mov.f32 %f40, %f106; - mov.f32 %f106, %f303; - bra.uni $Lt_0_47618; -$Lt_0_47874: - .loc 16 352 0 - mov.s32 %r35, 2; - ld.param.u64 %rd51, [__cudaparm_kernel_sphere_ellipsoid_err_flag]; - st.global.s32 [%rd51+0], %r35; -$Lt_0_47618: -$Lt_0_47106: -$Lt_0_46594: - .loc 16 355 0 - div.approx.ftz.f32 %f304, %f95, %f64; - mul.ftz.f32 %f305, %f78, %f304; - sub.ftz.f32 %f306, %f100, %f305; - mov.f32 %f100, %f306; - .loc 16 356 0 - mul.ftz.f32 %f307, %f89, %f304; - sub.ftz.f32 %f308, %f104, %f307; - mov.f32 %f104, %f308; - .loc 16 357 0 - mul.ftz.f32 %f309, %f40, %f304; - sub.ftz.f32 %f310, %f91, %f309; - mov.f32 %f91, %f310; - .loc 16 359 0 - div.approx.ftz.f32 %f311, %f110, %f64; - mul.ftz.f32 %f312, %f78, %f311; - sub.ftz.f32 %f114, %f114, %f312; - .loc 16 360 0 - mul.ftz.f32 %f313, %f89, %f311; - sub.ftz.f32 %f119, %f119, %f313; - .loc 16 361 0 - mul.ftz.f32 %f314, %f40, %f311; - sub.ftz.f32 %f106, %f106, %f314; - abs.ftz.f32 %f315, %f306; - abs.ftz.f32 %f316, %f114; - setp.lt.ftz.f32 %p19, %f315, %f316; - @!%p19 bra $Lt_0_48130; - .loc 16 366 0 - mov.f32 %f100, %f114; - mov.f32 %f114, %f306; - .loc 16 367 0 - mov.f32 %f104, %f119; - mov.f32 %f119, %f308; - .loc 16 368 0 - mov.f32 %f91, %f106; - mov.f32 %f106, %f310; -$Lt_0_48130: - mov.f32 %f317, %f100; - mov.f32 %f318, 0f00000000; // 0 - setp.neu.ftz.f32 %p20, %f317, %f318; - @!%p20 bra $Lt_0_48898; - bra.uni $Lt_0_49154; -$Lt_0_48898: - mov.f32 %f319, 0f00000000; // 0 - setp.neu.ftz.f32 %p21, %f114, %f319; - @!%p21 bra $Lt_0_49154; - .loc 16 383 0 - mov.f32 %f100, %f114; - mov.f32 %f114, %f317; - .loc 16 384 0 - mov.f32 %f320, %f104; - mov.f32 %f104, %f119; - mov.f32 %f119, %f320; - .loc 16 385 0 - mov.f32 %f321, %f91; - mov.f32 %f91, %f106; - mov.f32 %f106, %f321; -$Lt_0_49154: -$Lt_0_48642: - .loc 16 390 0 - div.approx.ftz.f32 %f322, %f114, %f100; - mul.ftz.f32 %f323, %f104, %f322; - sub.ftz.f32 %f119, %f119, %f323; - .loc 16 391 0 - mul.ftz.f32 %f324, %f91, %f322; - sub.ftz.f32 %f106, %f106, %f324; - mov.f32 %f325, 0f00000000; // 0 - setp.eq.ftz.f32 %p22, %f119, %f325; - @!%p22 bra $Lt_0_49666; - .loc 16 394 0 - mov.s32 %r36, 2; - ld.param.u64 %rd52, [__cudaparm_kernel_sphere_ellipsoid_err_flag]; - st.global.s32 [%rd52+0], %r36; -$Lt_0_49666: - .loc 17 189 0 - div.approx.ftz.f32 %f326, %f106, %f119; - mul.ftz.f32 %f327, %f326, %f104; - sub.ftz.f32 %f328, %f91, %f327; - div.approx.ftz.f32 %f329, %f328, %f100; - mul.ftz.f32 %f330, %f329, %f78; - fma.rn.ftz.f32 %f331, %f89, %f326, %f330; - sub.ftz.f32 %f332, %f40, %f331; - div.approx.ftz.f32 %f333, %f332, %f64; - mul.ftz.f32 %f334, %f167, %f333; - .loc 17 193 0 - ld.global.f32 %f335, [%rd1+8]; - .loc 21 496 0 - mul.ftz.f32 %f336, %f329, %f167; - mul.ftz.f32 %f337, %f326, %f167; - mul.ftz.f32 %f338, %f336, %f174; - fma.rn.ftz.f32 %f339, %f175, %f334, %f338; - fma.rn.ftz.f32 %f340, %f176, %f337, %f339; - add.ftz.f32 %f341, %f340, %f340; - lg2.approx.ftz.f32 %f342, %f341; - .loc 21 538 0 - mul.ftz.f32 %f343, %f342, %f335; - ex2.approx.ftz.f32 %f344, %f343; - .loc 17 196 0 - mul.ftz.f32 %f345, %f184, %f334; - .loc 17 201 0 - mov.f32 %f346, 0fbf800000; // -1 - add.ftz.f32 %f347, %f335, %f346; - .loc 21 496 0 - lg2.approx.ftz.f32 %f348, %f344; - .loc 17 201 0 - div.approx.ftz.f32 %f349, %f347, %f335; - mul.ftz.f32 %f350, %f348, %f349; - ex2.approx.ftz.f32 %f351, %f350; - mov.f32 %f352, 0fc0800000; // -4 - mul.ftz.f32 %f353, %f167, %f352; - mul.ftz.f32 %f354, %f167, %f353; - mul.ftz.f32 %f355, %f335, %f354; - mul.ftz.f32 %f356, %f351, %f355; - .loc 17 203 0 - mul.ftz.f32 %f357, %f336, %f184; - mul.ftz.f32 %f358, %f337, %f184; - mul.ftz.f32 %f359, %f174, %f357; - fma.rn.ftz.f32 %f360, %f175, %f345, %f359; - fma.rn.ftz.f32 %f361, %f176, %f358, %f360; - mul.ftz.f32 %f362, %f175, %f361; - sub.ftz.f32 %f363, %f345, %f362; - mul.ftz.f32 %f364, %f356, %f363; - .loc 17 204 0 - mul.ftz.f32 %f365, %f174, %f361; - sub.ftz.f32 %f366, %f357, %f365; - mul.ftz.f32 %f367, %f356, %f366; - .loc 17 205 0 - mul.ftz.f32 %f368, %f176, %f361; - sub.ftz.f32 %f369, %f358, %f368; - mul.ftz.f32 %f370, %f356, %f369; - .loc 16 396 0 - mov.f32 %f371, 0f40800000; // 4 - mul.ftz.f32 %f372, %f187, %f371; - div.approx.ftz.f32 %f373, %f231, %f246; - lg2.approx.ftz.f32 %f374, %f373; - mul.ftz.f32 %f375, %f374, %f247; - ex2.approx.ftz.f32 %f376, %f375; - mul.ftz.f32 %f377, %f376, %f27; - sub.ftz.f32 %f378, %f195, %f194; - mul.ftz.f32 %f379, %f377, %f344; - mul.ftz.f32 %f380, %f372, %f378; - fma.rn.ftz.f32 %f381, %f380, %f379, %f26; - selp.f32 %f26, %f381, %f26, %p3; - mul.ftz.f32 %f382, %f379, %f219; - mul.ftz.f32 %f383, %f379, %f223; - mul.ftz.f32 %f384, %f379, %f227; - mul.ftz.f32 %f385, %f376, %f380; - mul.ftz.f32 %f386, %f385, %f27; - neg.ftz.f32 %f387, %f386; - mul.ftz.f32 %f388, %f364, %f387; - sub.ftz.f32 %f389, %f388, %f382; - mul.ftz.f32 %f390, %f367, %f387; - sub.ftz.f32 %f391, %f390, %f383; - mul.ftz.f32 %f392, %f370, %f387; - sub.ftz.f32 %f393, %f392, %f384; - @!%p4 bra $Lt_0_50434; - .loc 17 217 0 - add.ftz.f32 %f25, %f389, %f25; - .loc 17 218 0 - sub.ftz.f32 %f394, %f17, %f28; - mov.f32 %f395, %f6; - fma.rn.ftz.f32 %f396, %f394, %f389, %f395; - mov.f32 %f6, %f396; - .loc 17 220 0 - add.ftz.f32 %f24, %f391, %f24; - .loc 17 221 0 - sub.ftz.f32 %f397, %f18, %f29; - mov.f32 %f398, %f8; - fma.rn.ftz.f32 %f399, %f397, %f391, %f398; - mov.f32 %f8, %f399; - .loc 17 222 0 - mov.f32 %f400, %f12; - fma.rn.ftz.f32 %f401, %f394, %f391, %f400; - mov.f32 %f12, %f401; - .loc 17 224 0 - add.ftz.f32 %f23, %f393, %f23; - .loc 17 225 0 - mov.f32 %f402, %f10; - sub.ftz.f32 %f403, %f19, %f30; - fma.rn.ftz.f32 %f404, %f403, %f393, %f402; - mov.f32 %f10, %f404; - .loc 17 226 0 - mov.f32 %f405, %f14; - fma.rn.ftz.f32 %f406, %f394, %f393, %f405; - mov.f32 %f14, %f406; - .loc 17 227 0 - fma.rn.ftz.f32 %f15, %f397, %f393, %f15; - mov.f32 %f16, %f15; - bra.uni $Lt_0_50178; -$Lt_0_50434: - .loc 17 229 0 - add.ftz.f32 %f25, %f389, %f25; - .loc 17 230 0 - add.ftz.f32 %f24, %f391, %f24; - .loc 17 231 0 - add.ftz.f32 %f23, %f393, %f23; -$Lt_0_50178: - mul.lo.s32 %r37, %r16, %r1; - cvt.s64.s32 %rd53, %r37; - mul.wide.s32 %rd54, %r37, 4; - add.u64 %rd23, %rd23, %rd54; - setp.gt.u64 %p23, %rd26, %rd23; - @%p23 bra $Lt_0_40706; - bra.uni $Lt_0_40194; -$Lt_0_56578: - mov.f32 %f23, 0f00000000; // 0 - mov.f32 %f24, 0f00000000; // 0 - mov.f32 %f25, 0f00000000; // 0 - mov.f32 %f26, 0f00000000; // 0 -$Lt_0_40194: - mov.u32 %r38, 1; - setp.le.s32 %p24, %r1, %r38; - @%p24 bra $Lt_0_52994; - .loc 17 234 0 - mov.u64 %rd55, __cuda___cuda_local_var_33089_55_non_const_red_acc140; - cvt.s64.s32 %rd56, %r2; - mul.wide.s32 %rd57, %r2, 4; - add.u64 %rd58, %rd55, %rd57; - mov.f32 %f407, %f25; - st.shared.f32 [%rd58+0], %f407; - mov.f32 %f408, %f24; - st.shared.f32 [%rd58+512], %f408; - mov.f32 %f409, %f23; - st.shared.f32 [%rd58+1024], %f409; - mov.f32 %f410, %f26; - st.shared.f32 [%rd58+1536], %f410; - shr.s32 %r39, %r1, 31; - mov.s32 %r40, 1; - and.b32 %r41, %r39, %r40; - add.s32 %r42, %r41, %r1; - shr.s32 %r43, %r42, 1; - mov.s32 %r44, %r43; - mov.u32 %r45, 0; - setp.ne.u32 %p25, %r43, %r45; - @!%p25 bra $Lt_0_51458; -$Lt_0_51970: - setp.ge.u32 %p26, %r18, %r44; - @%p26 bra $Lt_0_52226; - add.u32 %r46, %r2, %r44; - cvt.u64.u32 %rd59, %r46; - mul.wide.u32 %rd60, %r46, 4; - add.u64 %rd61, %rd55, %rd60; - ld.shared.f32 %f411, [%rd61+0]; - add.ftz.f32 %f407, %f411, %f407; - st.shared.f32 [%rd58+0], %f407; - ld.shared.f32 %f412, [%rd61+512]; - add.ftz.f32 %f408, %f412, %f408; - st.shared.f32 [%rd58+512], %f408; - ld.shared.f32 %f413, [%rd61+1024]; - add.ftz.f32 %f409, %f413, %f409; - st.shared.f32 [%rd58+1024], %f409; - ld.shared.f32 %f414, [%rd61+1536]; - add.ftz.f32 %f410, %f414, %f410; - st.shared.f32 [%rd58+1536], %f410; -$Lt_0_52226: - shr.u32 %r44, %r44, 1; - mov.u32 %r47, 0; - setp.ne.u32 %p27, %r44, %r47; - @%p27 bra $Lt_0_51970; -$Lt_0_51458: - mov.f32 %f25, %f407; - mov.f32 %f24, %f408; - mov.f32 %f23, %f409; - mov.f32 %f26, %f410; - ld.param.s32 %r48, [__cudaparm_kernel_sphere_ellipsoid_vflag]; - mov.u32 %r49, 0; - setp.le.s32 %p28, %r48, %r49; - @%p28 bra $Lt_0_52994; - mov.f32 %f407, %f6; - st.shared.f32 [%rd58+0], %f407; - mov.f32 %f408, %f8; - st.shared.f32 [%rd58+512], %f408; - mov.f32 %f409, %f10; - st.shared.f32 [%rd58+1024], %f409; - mov.f32 %f410, %f12; - st.shared.f32 [%rd58+1536], %f410; - mov.f32 %f415, %f14; - st.shared.f32 [%rd58+2048], %f415; - mov.f32 %f416, %f15; - st.shared.f32 [%rd58+2560], %f416; - mov.s32 %r50, %r43; - @!%p25 bra $Lt_0_53506; -$Lt_0_54018: - setp.ge.u32 %p29, %r18, %r50; - @%p29 bra $Lt_0_54274; - add.u32 %r51, %r2, %r50; - cvt.u64.u32 %rd62, %r51; - mul.wide.u32 %rd63, %r51, 4; - add.u64 %rd64, %rd55, %rd63; - ld.shared.f32 %f417, [%rd64+0]; - add.ftz.f32 %f407, %f417, %f407; - st.shared.f32 [%rd58+0], %f407; - ld.shared.f32 %f418, [%rd64+512]; - add.ftz.f32 %f408, %f418, %f408; - st.shared.f32 [%rd58+512], %f408; - ld.shared.f32 %f419, [%rd64+1024]; - add.ftz.f32 %f409, %f419, %f409; - st.shared.f32 [%rd58+1024], %f409; - ld.shared.f32 %f420, [%rd64+1536]; - add.ftz.f32 %f410, %f420, %f410; - st.shared.f32 [%rd58+1536], %f410; - ld.shared.f32 %f421, [%rd64+2048]; - add.ftz.f32 %f415, %f421, %f415; - st.shared.f32 [%rd58+2048], %f415; - ld.shared.f32 %f422, [%rd64+2560]; - add.ftz.f32 %f416, %f422, %f416; - st.shared.f32 [%rd58+2560], %f416; -$Lt_0_54274: - shr.u32 %r50, %r50, 1; - mov.u32 %r52, 0; - setp.ne.u32 %p30, %r50, %r52; - @%p30 bra $Lt_0_54018; -$Lt_0_53506: - mov.f32 %f6, %f407; - mov.f32 %f8, %f408; - mov.f32 %f10, %f409; - mov.f32 %f12, %f410; - mov.f32 %f14, %f415; - mov.f32 %f16, %f416; -$Lt_0_52994: -$Lt_0_50946: - mov.u32 %r53, 0; - setp.ne.s32 %p31, %r18, %r53; - @%p31 bra $Lt_0_55042; - ld.param.u64 %rd65, [__cudaparm_kernel_sphere_ellipsoid___val_paramengv]; - add.u64 %rd66, %rd65, %rd3; - ld.param.s32 %r54, [__cudaparm_kernel_sphere_ellipsoid_eflag]; - mov.u32 %r55, 0; - setp.le.s32 %p32, %r54, %r55; - @%p32 bra $Lt_0_55554; - st.global.f32 [%rd66+0], %f26; - cvt.s64.s32 %rd67, %r11; - mul.wide.s32 %rd68, %r11, 4; - add.u64 %rd66, %rd66, %rd68; -$Lt_0_55554: - ld.param.s32 %r56, [__cudaparm_kernel_sphere_ellipsoid_vflag]; - mov.u32 %r57, 0; - setp.le.s32 %p33, %r56, %r57; - @%p33 bra $Lt_0_56066; - mov.f32 %f423, %f6; - st.global.f32 [%rd66+0], %f423; - cvt.s64.s32 %rd69, %r11; - mul.wide.s32 %rd70, %r11, 4; - add.u64 %rd71, %rd70, %rd66; - mov.f32 %f424, %f8; - st.global.f32 [%rd71+0], %f424; - add.u64 %rd72, %rd70, %rd71; - mov.f32 %f425, %f10; - st.global.f32 [%rd72+0], %f425; - add.u64 %rd73, %rd70, %rd72; - mov.f32 %f426, %f12; - st.global.f32 [%rd73+0], %f426; - add.u64 %rd66, %rd70, %rd73; - mov.f32 %f427, %f14; - st.global.f32 [%rd66+0], %f427; - mov.f32 %f428, %f16; - add.u64 %rd74, %rd70, %rd66; - st.global.f32 [%rd74+0], %f428; -$Lt_0_56066: - ld.param.u64 %rd75, [__cudaparm_kernel_sphere_ellipsoid_ans]; - mul.lo.u64 %rd76, %rd2, 16; - add.u64 %rd77, %rd75, %rd76; - mov.f32 %f429, %f430; - st.global.v4.f32 [%rd77+0], {%f25,%f24,%f23,%f429}; -$Lt_0_55042: -$Lt_0_39682: - .loc 17 237 0 - exit; -$LDWend_kernel_sphere_ellipsoid: - } // kernel_sphere_ellipsoid - - .entry kernel_lj ( - .param .u64 __cudaparm_kernel_lj_x_, - .param .u64 __cudaparm_kernel_lj_lj1, - .param .u64 __cudaparm_kernel_lj_lj3, - .param .s32 __cudaparm_kernel_lj_lj_types, - .param .u64 __cudaparm_kernel_lj_gum, - .param .s32 __cudaparm_kernel_lj_stride, - .param .u64 __cudaparm_kernel_lj_dev_ij, - .param .u64 __cudaparm_kernel_lj_ans, - .param .u64 __cudaparm_kernel_lj___val_paramengv, - .param .u64 __cudaparm_kernel_lj_err_flag, - .param .s32 __cudaparm_kernel_lj_eflag, - .param .s32 __cudaparm_kernel_lj_vflag, - .param .s32 __cudaparm_kernel_lj_start, - .param .s32 __cudaparm_kernel_lj_inum, - .param .s32 __cudaparm_kernel_lj_t_per_atom) - { - .reg .u32 %r<55>; - .reg .u64 %rd<60>; - .reg .f32 %f<115>; - .reg .pred %p<19>; - .shared .align 16 .b8 __cuda___cuda_local_var_33106_33_non_const_sp_lj3316[16]; - .shared .align 4 .b8 __cuda___cuda_local_var_33172_55_non_const_red_acc3332[3072]; - // __cuda_local_var_33117_9_non_const_virial = 16 - .loc 17 246 0 -$LDWbegin_kernel_lj: - .loc 17 252 0 - ld.param.u64 %rd1, [__cudaparm_kernel_lj_gum]; - ldu.global.f32 %f1, [%rd1+12]; - .loc 17 253 0 - ld.global.f32 %f2, [%rd1+16]; - .loc 17 254 0 - ld.global.f32 %f3, [%rd1+20]; - .loc 17 255 0 - ld.global.f32 %f4, [%rd1+24]; - st.shared.v4.f32 [__cuda___cuda_local_var_33106_33_non_const_sp_lj3316+0], {%f1,%f2,%f3,%f4}; - .loc 17 264 0 - mov.f32 %f5, 0f00000000; // 0 - mov.f32 %f6, %f5; - mov.f32 %f7, 0f00000000; // 0 - mov.f32 %f8, %f7; - mov.f32 %f9, 0f00000000; // 0 - mov.f32 %f10, %f9; - mov.f32 %f11, 0f00000000; // 0 - mov.f32 %f12, %f11; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, %f13; - mov.f32 %f15, 0f00000000; // 0 - mov.f32 %f16, %f15; - ld.param.s32 %r1, [__cudaparm_kernel_lj_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_lj_start]; - add.s32 %r10, %r9, %r8; - ld.param.s32 %r11, [__cudaparm_kernel_lj_inum]; - setp.ge.s32 %p1, %r10, %r11; - @%p1 bra $Lt_1_25346; - .loc 17 269 0 - cvt.s64.s32 %rd2, %r10; - mul.wide.s32 %rd3, %r10, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_lj_dev_ij]; - add.u64 %rd5, %rd4, %rd3; - ld.global.s32 %r12, [%rd5+0]; - ld.param.s32 %r13, [__cudaparm_kernel_lj_stride]; - cvt.s64.s32 %rd6, %r13; - mul.wide.s32 %rd7, %r13, 4; - add.u64 %rd8, %rd7, %rd5; - ld.global.s32 %r14, [%rd8+0]; - .loc 17 272 0 - ld.param.u64 %rd9, [__cudaparm_kernel_lj_x_]; - cvt.s64.s32 %rd10, %r12; - mul.wide.s32 %rd11, %r12, 16; - add.u64 %rd12, %rd9, %rd11; - ld.global.v4.f32 {%f17,%f18,%f19,%f20}, [%rd12+0]; - .loc 17 273 0 - cvt.s32.s64 %r15, %rd6; - sub.s32 %r16, %r1, 1; - and.b32 %r17, %r16, %r2; - add.u64 %rd13, %rd7, %rd8; - mul.lo.s32 %r18, %r15, %r17; - cvt.s64.s32 %rd14, %r18; - mul.wide.s32 %rd15, %r18, 4; - add.u64 %rd16, %rd13, %rd15; - mov.s64 %rd17, %rd16; - mul.lo.s32 %r19, %r15, %r14; - cvt.s64.s32 %rd18, %r19; - mul.wide.s32 %rd19, %r19, 4; - add.u64 %rd20, %rd13, %rd19; - setp.ge.u64 %p2, %rd16, %rd20; - @%p2 bra $Lt_1_26882; - cvt.rzi.ftz.s32.f32 %r20, %f20; - ld.param.s32 %r21, [__cudaparm_kernel_lj_lj_types]; - mul.lo.s32 %r22, %r21, %r20; - ld.param.u64 %rd21, [__cudaparm_kernel_lj_lj1]; - mov.f32 %f21, 0f00000000; // 0 - mov.f32 %f22, 0f00000000; // 0 - mov.f32 %f23, 0f00000000; // 0 - mov.f32 %f24, 0f00000000; // 0 - mov.u64 %rd22, __cuda___cuda_local_var_33106_33_non_const_sp_lj3316; -$Lt_1_19714: - // Loop body line 273, nesting depth: 1, estimated iterations: unknown - .loc 17 278 0 - ld.global.s32 %r23, [%rd17+0]; - .loc 17 279 0 - shr.s32 %r24, %r23, 30; - and.b32 %r25, %r24, 3; - cvt.s64.s32 %rd23, %r25; - mul.wide.s32 %rd24, %r25, 4; - add.u64 %rd25, %rd22, %rd24; - ld.shared.f32 %f25, [%rd25+0]; - .loc 17 282 0 - and.b32 %r26, %r23, 1073741823; - cvt.s64.s32 %rd26, %r26; - mul.wide.s32 %rd27, %r26, 16; - add.u64 %rd28, %rd9, %rd27; - ld.global.v4.f32 {%f26,%f27,%f28,%f29}, [%rd28+0]; - .loc 17 278 0 - cvt.rzi.ftz.s32.f32 %r27, %f29; - sub.ftz.f32 %f30, %f18, %f27; - sub.ftz.f32 %f31, %f17, %f26; - sub.ftz.f32 %f32, %f19, %f28; - mul.ftz.f32 %f33, %f30, %f30; - fma.rn.ftz.f32 %f34, %f31, %f31, %f33; - fma.rn.ftz.f32 %f35, %f32, %f32, %f34; - add.s32 %r28, %r27, %r22; - cvt.s64.s32 %rd29, %r28; - mul.wide.s32 %rd30, %r28, 16; - add.u64 %rd31, %rd30, %rd21; - ld.global.f32 %f36, [%rd31+8]; - setp.gt.ftz.f32 %p3, %f36, %f35; - @!%p3 bra $Lt_1_27138; - ld.global.f32 %f37, [%rd31+12]; - mov.f32 %f38, 0f00000000; // 0 - setp.eq.ftz.f32 %p4, %f37, %f38; - @!%p4 bra $Lt_1_27138; - .loc 17 296 0 - rcp.approx.ftz.f32 %f39, %f35; - mul.ftz.f32 %f40, %f39, %f39; - mul.ftz.f32 %f41, %f39, %f40; - mul.ftz.f32 %f42, %f39, %f41; - ld.global.v2.f32 {%f43,%f44}, [%rd31+0]; - mul.ftz.f32 %f45, %f43, %f41; - sub.ftz.f32 %f46, %f45, %f44; - mul.ftz.f32 %f47, %f42, %f46; - mul.ftz.f32 %f48, %f25, %f47; - .loc 17 298 0 - fma.rn.ftz.f32 %f23, %f31, %f48, %f23; - .loc 17 299 0 - fma.rn.ftz.f32 %f22, %f30, %f48, %f22; - .loc 17 300 0 - fma.rn.ftz.f32 %f21, %f32, %f48, %f21; - ld.param.s32 %r29, [__cudaparm_kernel_lj_eflag]; - mov.u32 %r30, 0; - setp.le.s32 %p5, %r29, %r30; - @%p5 bra $Lt_1_19970; - .loc 17 304 0 - ld.param.u64 %rd32, [__cudaparm_kernel_lj_lj3]; - add.u64 %rd33, %rd32, %rd30; - ld.global.v4.f32 {%f49,%f50,%f51,_}, [%rd33+0]; - mul.ftz.f32 %f52, %f49, %f41; - sub.ftz.f32 %f53, %f52, %f50; - mul.ftz.f32 %f54, %f41, %f53; - sub.ftz.f32 %f55, %f54, %f51; - fma.rn.ftz.f32 %f24, %f25, %f55, %f24; -$Lt_1_19970: - ld.param.s32 %r31, [__cudaparm_kernel_lj_vflag]; - mov.u32 %r32, 0; - setp.le.s32 %p6, %r31, %r32; - @%p6 bra $Lt_1_27138; - .loc 17 307 0 - mov.f32 %f56, %f6; - mul.ftz.f32 %f57, %f31, %f31; - fma.rn.ftz.f32 %f58, %f48, %f57, %f56; - mov.f32 %f6, %f58; - .loc 17 308 0 - mov.f32 %f59, %f8; - fma.rn.ftz.f32 %f60, %f48, %f33, %f59; - mov.f32 %f8, %f60; - .loc 17 309 0 - mov.f32 %f61, %f10; - mul.ftz.f32 %f62, %f32, %f32; - fma.rn.ftz.f32 %f63, %f48, %f62, %f61; - mov.f32 %f10, %f63; - .loc 17 310 0 - mov.f32 %f64, %f12; - mul.ftz.f32 %f65, %f30, %f31; - fma.rn.ftz.f32 %f66, %f48, %f65, %f64; - mov.f32 %f12, %f66; - .loc 17 311 0 - mov.f32 %f67, %f14; - mul.ftz.f32 %f68, %f31, %f32; - fma.rn.ftz.f32 %f69, %f48, %f68, %f67; - mov.f32 %f14, %f69; - .loc 17 312 0 - mul.ftz.f32 %f70, %f30, %f32; - fma.rn.ftz.f32 %f15, %f48, %f70, %f15; - mov.f32 %f16, %f15; -$Lt_1_27138: -$L_1_18178: - .loc 17 306 0 - mul.lo.s32 %r33, %r15, %r1; - cvt.s64.s32 %rd34, %r33; - mul.wide.s32 %rd35, %r33, 4; - add.u64 %rd17, %rd17, %rd35; - setp.gt.u64 %p7, %rd20, %rd17; - @%p7 bra $Lt_1_19714; - bra.uni $Lt_1_19202; -$Lt_1_26882: - mov.f32 %f21, 0f00000000; // 0 - mov.f32 %f22, 0f00000000; // 0 - mov.f32 %f23, 0f00000000; // 0 - mov.f32 %f24, 0f00000000; // 0 -$Lt_1_19202: - mov.u32 %r34, 1; - setp.le.s32 %p8, %r1, %r34; - @%p8 bra $Lt_1_23298; - .loc 17 317 0 - mov.u64 %rd36, __cuda___cuda_local_var_33172_55_non_const_red_acc3332; - cvt.s64.s32 %rd37, %r2; - mul.wide.s32 %rd38, %r2, 4; - add.u64 %rd39, %rd36, %rd38; - mov.f32 %f71, %f23; - st.shared.f32 [%rd39+0], %f71; - mov.f32 %f72, %f22; - st.shared.f32 [%rd39+512], %f72; - mov.f32 %f73, %f21; - st.shared.f32 [%rd39+1024], %f73; - mov.f32 %f74, %f24; - st.shared.f32 [%rd39+1536], %f74; - shr.s32 %r35, %r1, 31; - mov.s32 %r36, 1; - and.b32 %r37, %r35, %r36; - add.s32 %r38, %r37, %r1; - shr.s32 %r39, %r38, 1; - mov.s32 %r40, %r39; - mov.u32 %r41, 0; - setp.ne.u32 %p9, %r39, %r41; - @!%p9 bra $Lt_1_21762; -$Lt_1_22274: - setp.ge.u32 %p10, %r17, %r40; - @%p10 bra $Lt_1_22530; - add.u32 %r42, %r2, %r40; - cvt.u64.u32 %rd40, %r42; - mul.wide.u32 %rd41, %r42, 4; - add.u64 %rd42, %rd36, %rd41; - ld.shared.f32 %f75, [%rd42+0]; - add.ftz.f32 %f71, %f75, %f71; - st.shared.f32 [%rd39+0], %f71; - ld.shared.f32 %f76, [%rd42+512]; - add.ftz.f32 %f72, %f76, %f72; - st.shared.f32 [%rd39+512], %f72; - ld.shared.f32 %f77, [%rd42+1024]; - add.ftz.f32 %f73, %f77, %f73; - st.shared.f32 [%rd39+1024], %f73; - ld.shared.f32 %f78, [%rd42+1536]; - add.ftz.f32 %f74, %f78, %f74; - st.shared.f32 [%rd39+1536], %f74; -$Lt_1_22530: - shr.u32 %r40, %r40, 1; - mov.u32 %r43, 0; - setp.ne.u32 %p11, %r40, %r43; - @%p11 bra $Lt_1_22274; -$Lt_1_21762: - mov.f32 %f23, %f71; - mov.f32 %f22, %f72; - mov.f32 %f21, %f73; - mov.f32 %f24, %f74; - ld.param.s32 %r44, [__cudaparm_kernel_lj_vflag]; - mov.u32 %r45, 0; - setp.le.s32 %p12, %r44, %r45; - @%p12 bra $Lt_1_23298; - mov.f32 %f71, %f6; - st.shared.f32 [%rd39+0], %f71; - mov.f32 %f72, %f8; - st.shared.f32 [%rd39+512], %f72; - mov.f32 %f73, %f10; - st.shared.f32 [%rd39+1024], %f73; - mov.f32 %f74, %f12; - st.shared.f32 [%rd39+1536], %f74; - mov.f32 %f79, %f14; - st.shared.f32 [%rd39+2048], %f79; - mov.f32 %f80, %f15; - st.shared.f32 [%rd39+2560], %f80; - mov.s32 %r46, %r39; - @!%p9 bra $Lt_1_23810; -$Lt_1_24322: - setp.ge.u32 %p13, %r17, %r46; - @%p13 bra $Lt_1_24578; - add.u32 %r47, %r2, %r46; - cvt.u64.u32 %rd43, %r47; - mul.wide.u32 %rd44, %r47, 4; - add.u64 %rd45, %rd36, %rd44; - ld.shared.f32 %f81, [%rd45+0]; - add.ftz.f32 %f71, %f81, %f71; - st.shared.f32 [%rd39+0], %f71; - ld.shared.f32 %f82, [%rd45+512]; - add.ftz.f32 %f72, %f82, %f72; - st.shared.f32 [%rd39+512], %f72; - ld.shared.f32 %f83, [%rd45+1024]; - add.ftz.f32 %f73, %f83, %f73; - st.shared.f32 [%rd39+1024], %f73; - ld.shared.f32 %f84, [%rd45+1536]; - add.ftz.f32 %f74, %f84, %f74; - st.shared.f32 [%rd39+1536], %f74; - ld.shared.f32 %f85, [%rd45+2048]; - add.ftz.f32 %f79, %f85, %f79; - st.shared.f32 [%rd39+2048], %f79; - ld.shared.f32 %f86, [%rd45+2560]; - add.ftz.f32 %f80, %f86, %f80; - st.shared.f32 [%rd39+2560], %f80; -$Lt_1_24578: - shr.u32 %r46, %r46, 1; - mov.u32 %r48, 0; - setp.ne.u32 %p14, %r46, %r48; - @%p14 bra $Lt_1_24322; -$Lt_1_23810: - mov.f32 %f6, %f71; - mov.f32 %f8, %f72; - mov.f32 %f10, %f73; - mov.f32 %f12, %f74; - mov.f32 %f14, %f79; - mov.f32 %f16, %f80; -$Lt_1_23298: -$Lt_1_21250: - mov.u32 %r49, 0; - setp.ne.s32 %p15, %r17, %r49; - @%p15 bra $Lt_1_25346; - ld.param.u64 %rd46, [__cudaparm_kernel_lj___val_paramengv]; - add.u64 %rd47, %rd46, %rd3; - ld.param.s32 %r50, [__cudaparm_kernel_lj_eflag]; - mov.u32 %r51, 0; - setp.le.s32 %p16, %r50, %r51; - @%p16 bra $Lt_1_25858; - ld.global.f32 %f87, [%rd47+0]; - add.ftz.f32 %f88, %f87, %f24; - st.global.f32 [%rd47+0], %f88; - cvt.s64.s32 %rd48, %r11; - mul.wide.s32 %rd49, %r11, 4; - add.u64 %rd47, %rd47, %rd49; -$Lt_1_25858: - ld.param.s32 %r52, [__cudaparm_kernel_lj_vflag]; - mov.u32 %r53, 0; - setp.le.s32 %p17, %r52, %r53; - @%p17 bra $Lt_1_26370; - ld.global.f32 %f89, [%rd47+0]; - mov.f32 %f90, %f6; - add.ftz.f32 %f91, %f89, %f90; - st.global.f32 [%rd47+0], %f91; - cvt.s64.s32 %rd50, %r11; - mul.wide.s32 %rd51, %r11, 4; - add.u64 %rd52, %rd51, %rd47; - ld.global.f32 %f92, [%rd52+0]; - mov.f32 %f93, %f8; - add.ftz.f32 %f94, %f92, %f93; - st.global.f32 [%rd52+0], %f94; - add.u64 %rd53, %rd51, %rd52; - ld.global.f32 %f95, [%rd53+0]; - mov.f32 %f96, %f10; - add.ftz.f32 %f97, %f95, %f96; - st.global.f32 [%rd53+0], %f97; - add.u64 %rd54, %rd51, %rd53; - ld.global.f32 %f98, [%rd54+0]; - mov.f32 %f99, %f12; - add.ftz.f32 %f100, %f98, %f99; - st.global.f32 [%rd54+0], %f100; - add.u64 %rd55, %rd51, %rd54; - ld.global.f32 %f101, [%rd55+0]; - mov.f32 %f102, %f14; - add.ftz.f32 %f103, %f101, %f102; - st.global.f32 [%rd55+0], %f103; - add.u64 %rd47, %rd51, %rd55; - ld.global.f32 %f104, [%rd47+0]; - mov.f32 %f105, %f16; - add.ftz.f32 %f106, %f104, %f105; - st.global.f32 [%rd47+0], %f106; -$Lt_1_26370: - ld.param.u64 %rd56, [__cudaparm_kernel_lj_ans]; - mul.lo.u64 %rd57, %rd2, 16; - add.u64 %rd58, %rd56, %rd57; - ld.global.v4.f32 {%f107,%f108,%f109,%f110}, [%rd58+0]; - add.ftz.f32 %f111, %f108, %f22; - add.ftz.f32 %f112, %f109, %f21; - add.ftz.f32 %f113, %f107, %f23; - st.global.v4.f32 [%rd58+0], {%f113,%f111,%f112,%f110}; -$Lt_1_25346: -$Lt_1_18690: - .loc 17 320 0 - exit; -$LDWend_kernel_lj: - } // kernel_lj - - .entry kernel_lj_fast ( - .param .u64 __cudaparm_kernel_lj_fast_x_, - .param .u64 __cudaparm_kernel_lj_fast_lj1_in, - .param .u64 __cudaparm_kernel_lj_fast_lj3_in, - .param .u64 __cudaparm_kernel_lj_fast_gum, - .param .s32 __cudaparm_kernel_lj_fast_stride, - .param .u64 __cudaparm_kernel_lj_fast_dev_ij, - .param .u64 __cudaparm_kernel_lj_fast_ans, - .param .u64 __cudaparm_kernel_lj_fast___val_paramengv, - .param .u64 __cudaparm_kernel_lj_fast_err_flag, - .param .s32 __cudaparm_kernel_lj_fast_eflag, - .param .s32 __cudaparm_kernel_lj_fast_vflag, - .param .s32 __cudaparm_kernel_lj_fast_start, - .param .s32 __cudaparm_kernel_lj_fast_inum, - .param .s32 __cudaparm_kernel_lj_fast_t_per_atom) - { - .reg .u32 %r<57>; - .reg .u64 %rd<72>; - .reg .f32 %f<122>; - .reg .pred %p<22>; - .shared .align 4 .b8 __cuda___cuda_local_var_33188_33_non_const_sp_lj6500[16]; - .shared .align 16 .b8 __cuda___cuda_local_var_33189_34_non_const_lj16528[1936]; - .shared .align 16 .b8 __cuda___cuda_local_var_33190_34_non_const_lj38464[1936]; - .shared .align 4 .b8 __cuda___cuda_local_var_33260_55_non_const_red_acc10400[3072]; - // __cuda_local_var_33204_9_non_const_virial = 16 - .loc 17 328 0 -$LDWbegin_kernel_lj_fast: - cvt.s32.u32 %r1, %tid.x; - mov.u32 %r2, 3; - setp.gt.s32 %p1, %r1, %r2; - @%p1 bra $Lt_2_20994; - .loc 17 337 0 - mov.u64 %rd1, __cuda___cuda_local_var_33188_33_non_const_sp_lj6500; - cvt.s64.s32 %rd2, %r1; - mul.wide.s32 %rd3, %r1, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_lj_fast_gum]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f32 %f1, [%rd5+12]; - add.u64 %rd6, %rd3, %rd1; - st.shared.f32 [%rd6+0], %f1; -$Lt_2_20994: - mov.u64 %rd1, __cuda___cuda_local_var_33188_33_non_const_sp_lj6500; - mov.u32 %r3, 120; - setp.gt.s32 %p2, %r1, %r3; - @%p2 bra $Lt_2_21506; - .loc 17 339 0 - mov.u64 %rd7, __cuda___cuda_local_var_33189_34_non_const_lj16528; - cvt.s64.s32 %rd8, %r1; - mul.wide.s32 %rd9, %r1, 16; - ld.param.u64 %rd10, [__cudaparm_kernel_lj_fast_lj1_in]; - add.u64 %rd11, %rd10, %rd9; - add.u64 %rd12, %rd9, %rd7; - ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0]; - st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5}; - ld.param.s32 %r4, [__cudaparm_kernel_lj_fast_eflag]; - mov.u32 %r5, 0; - setp.le.s32 %p3, %r4, %r5; - @%p3 bra $Lt_2_22018; - .loc 17 341 0 - mov.u64 %rd13, __cuda___cuda_local_var_33190_34_non_const_lj38464; - ld.param.u64 %rd14, [__cudaparm_kernel_lj_fast_lj3_in]; - add.u64 %rd15, %rd14, %rd9; - add.u64 %rd16, %rd9, %rd13; - ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0]; - st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9}; -$Lt_2_22018: - mov.u64 %rd13, __cuda___cuda_local_var_33190_34_non_const_lj38464; -$Lt_2_21506: - mov.u64 %rd13, __cuda___cuda_local_var_33190_34_non_const_lj38464; - mov.u64 %rd7, __cuda___cuda_local_var_33189_34_non_const_lj16528; - .loc 17 351 0 - mov.f32 %f10, 0f00000000; // 0 - mov.f32 %f11, %f10; - mov.f32 %f12, 0f00000000; // 0 - mov.f32 %f13, %f12; - mov.f32 %f14, 0f00000000; // 0 - mov.f32 %f15, %f14; - mov.f32 %f16, 0f00000000; // 0 - mov.f32 %f17, %f16; - mov.f32 %f18, 0f00000000; // 0 - mov.f32 %f19, %f18; - mov.f32 %f20, 0f00000000; // 0 - mov.f32 %f21, %f20; - .loc 17 353 0 - bar.sync 0; - ld.param.s32 %r6, [__cudaparm_kernel_lj_fast_t_per_atom]; - div.s32 %r7, %r1, %r6; - cvt.s32.u32 %r8, %ntid.x; - div.s32 %r9, %r8, %r6; - cvt.s32.u32 %r10, %ctaid.x; - mul.lo.s32 %r11, %r10, %r9; - add.s32 %r12, %r7, %r11; - ld.param.s32 %r13, [__cudaparm_kernel_lj_fast_start]; - add.s32 %r14, %r13, %r12; - ld.param.s32 %r15, [__cudaparm_kernel_lj_fast_inum]; - setp.ge.s32 %p4, %r14, %r15; - @%p4 bra $Lt_2_29186; - .loc 17 358 0 - cvt.s64.s32 %rd17, %r14; - mul.wide.s32 %rd18, %r14, 4; - ld.param.u64 %rd19, [__cudaparm_kernel_lj_fast_dev_ij]; - add.u64 %rd20, %rd19, %rd18; - ld.global.s32 %r16, [%rd20+0]; - ld.param.s32 %r17, [__cudaparm_kernel_lj_fast_stride]; - cvt.s64.s32 %rd21, %r17; - mul.wide.s32 %rd22, %r17, 4; - add.u64 %rd23, %rd22, %rd20; - ld.global.s32 %r18, [%rd23+0]; - .loc 17 361 0 - ld.param.u64 %rd24, [__cudaparm_kernel_lj_fast_x_]; - cvt.s64.s32 %rd25, %r16; - mul.wide.s32 %rd26, %r16, 16; - add.u64 %rd27, %rd24, %rd26; - ld.global.v4.f32 {%f22,%f23,%f24,%f25}, [%rd27+0]; - .loc 17 363 0 - cvt.s32.s64 %r19, %rd21; - sub.s32 %r20, %r6, 1; - and.b32 %r21, %r20, %r1; - add.u64 %rd28, %rd22, %rd23; - mul.lo.s32 %r22, %r19, %r21; - cvt.s64.s32 %rd29, %r22; - mul.wide.s32 %rd30, %r22, 4; - add.u64 %rd31, %rd28, %rd30; - mov.s64 %rd32, %rd31; - mul.lo.s32 %r23, %r19, %r18; - cvt.s64.s32 %rd33, %r23; - mul.wide.s32 %rd34, %r23, 4; - add.u64 %rd35, %rd28, %rd34; - setp.ge.u64 %p5, %rd31, %rd35; - @%p5 bra $Lt_2_30722; - cvt.rzi.ftz.s32.f32 %r24, %f25; - mul.lo.s32 %r25, %r24, 11; - cvt.rn.f32.s32 %f26, %r25; - mov.f32 %f27, 0f00000000; // 0 - mov.f32 %f28, 0f00000000; // 0 - mov.f32 %f29, 0f00000000; // 0 - mov.f32 %f30, 0f00000000; // 0 -$Lt_2_23554: - // Loop body line 363, nesting depth: 1, estimated iterations: unknown - .loc 17 368 0 - ld.global.s32 %r26, [%rd32+0]; - .loc 17 369 0 - shr.s32 %r27, %r26, 30; - and.b32 %r28, %r27, 3; - cvt.s64.s32 %rd36, %r28; - mul.wide.s32 %rd37, %r28, 4; - add.u64 %rd38, %rd1, %rd37; - ld.shared.f32 %f31, [%rd38+0]; - .loc 17 372 0 - and.b32 %r29, %r26, 1073741823; - cvt.s64.s32 %rd39, %r29; - mul.wide.s32 %rd40, %r29, 16; - add.u64 %rd41, %rd24, %rd40; - ld.global.v4.f32 {%f32,%f33,%f34,%f35}, [%rd41+0]; - .loc 17 368 0 - sub.ftz.f32 %f36, %f23, %f33; - sub.ftz.f32 %f37, %f22, %f32; - sub.ftz.f32 %f38, %f24, %f34; - mul.ftz.f32 %f39, %f36, %f36; - fma.rn.ftz.f32 %f40, %f37, %f37, %f39; - fma.rn.ftz.f32 %f41, %f38, %f38, %f40; - add.ftz.f32 %f42, %f26, %f35; - cvt.rzi.ftz.s32.f32 %r30, %f42; - cvt.s64.s32 %rd42, %r30; - mul.wide.s32 %rd43, %r30, 16; - add.u64 %rd44, %rd43, %rd7; - ld.shared.f32 %f43, [%rd44+8]; - setp.gt.ftz.f32 %p6, %f43, %f41; - @!%p6 bra $Lt_2_30978; - ld.shared.f32 %f44, [%rd44+12]; - mov.f32 %f45, 0f00000000; // 0 - setp.eq.ftz.f32 %p7, %f44, %f45; - @!%p7 bra $Lt_2_30978; - .loc 17 384 0 - rcp.approx.ftz.f32 %f46, %f41; - mul.ftz.f32 %f47, %f46, %f46; - mul.ftz.f32 %f48, %f46, %f47; - mul.ftz.f32 %f49, %f46, %f31; - mul.ftz.f32 %f50, %f48, %f49; - ld.shared.v2.f32 {%f51,%f52}, [%rd44+0]; - mul.ftz.f32 %f53, %f51, %f48; - sub.ftz.f32 %f54, %f53, %f52; - mul.ftz.f32 %f55, %f50, %f54; - .loc 17 386 0 - fma.rn.ftz.f32 %f29, %f37, %f55, %f29; - .loc 17 387 0 - fma.rn.ftz.f32 %f28, %f36, %f55, %f28; - .loc 17 388 0 - fma.rn.ftz.f32 %f27, %f38, %f55, %f27; - ld.param.s32 %r31, [__cudaparm_kernel_lj_fast_eflag]; - mov.u32 %r32, 0; - setp.le.s32 %p8, %r31, %r32; - @%p8 bra $Lt_2_23810; - .loc 17 391 0 - add.u64 %rd45, %rd43, %rd13; - ld.shared.v4.f32 {%f56,%f57,%f58,_}, [%rd45+0]; - mul.ftz.f32 %f59, %f56, %f48; - sub.ftz.f32 %f60, %f59, %f57; - mul.ftz.f32 %f61, %f48, %f60; - .loc 17 392 0 - sub.ftz.f32 %f62, %f61, %f58; - fma.rn.ftz.f32 %f30, %f31, %f62, %f30; -$Lt_2_23810: - ld.param.s32 %r33, [__cudaparm_kernel_lj_fast_vflag]; - mov.u32 %r34, 0; - setp.le.s32 %p9, %r33, %r34; - @%p9 bra $Lt_2_30978; - .loc 17 395 0 - mov.f32 %f63, %f11; - mul.ftz.f32 %f64, %f37, %f37; - fma.rn.ftz.f32 %f65, %f55, %f64, %f63; - mov.f32 %f11, %f65; - .loc 17 396 0 - mov.f32 %f66, %f13; - fma.rn.ftz.f32 %f67, %f55, %f39, %f66; - mov.f32 %f13, %f67; - .loc 17 397 0 - mov.f32 %f68, %f15; - mul.ftz.f32 %f69, %f38, %f38; - fma.rn.ftz.f32 %f70, %f55, %f69, %f68; - mov.f32 %f15, %f70; - .loc 17 398 0 - mov.f32 %f71, %f17; - mul.ftz.f32 %f72, %f36, %f37; - fma.rn.ftz.f32 %f73, %f55, %f72, %f71; - mov.f32 %f17, %f73; - .loc 17 399 0 - mov.f32 %f74, %f19; - mul.ftz.f32 %f75, %f37, %f38; - fma.rn.ftz.f32 %f76, %f55, %f75, %f74; - mov.f32 %f19, %f76; - .loc 17 400 0 - mul.ftz.f32 %f77, %f36, %f38; - fma.rn.ftz.f32 %f20, %f55, %f77, %f20; - mov.f32 %f21, %f20; -$Lt_2_30978: -$L_2_20482: - .loc 17 394 0 - mul.lo.s32 %r35, %r19, %r6; - cvt.s64.s32 %rd46, %r35; - mul.wide.s32 %rd47, %r35, 4; - add.u64 %rd32, %rd32, %rd47; - setp.gt.u64 %p10, %rd35, %rd32; - @%p10 bra $Lt_2_23554; - bra.uni $Lt_2_23042; -$Lt_2_30722: - mov.f32 %f27, 0f00000000; // 0 - mov.f32 %f28, 0f00000000; // 0 - mov.f32 %f29, 0f00000000; // 0 - mov.f32 %f30, 0f00000000; // 0 -$Lt_2_23042: - mov.u32 %r36, 1; - setp.le.s32 %p11, %r6, %r36; - @%p11 bra $Lt_2_27138; - .loc 17 405 0 - mov.u64 %rd48, __cuda___cuda_local_var_33260_55_non_const_red_acc10400; - cvt.s64.s32 %rd49, %r1; - mul.wide.s32 %rd50, %r1, 4; - add.u64 %rd51, %rd48, %rd50; - mov.f32 %f78, %f29; - st.shared.f32 [%rd51+0], %f78; - mov.f32 %f79, %f28; - st.shared.f32 [%rd51+512], %f79; - mov.f32 %f80, %f27; - st.shared.f32 [%rd51+1024], %f80; - mov.f32 %f81, %f30; - st.shared.f32 [%rd51+1536], %f81; - shr.s32 %r37, %r6, 31; - mov.s32 %r38, 1; - and.b32 %r39, %r37, %r38; - add.s32 %r40, %r39, %r6; - shr.s32 %r41, %r40, 1; - mov.s32 %r42, %r41; - mov.u32 %r43, 0; - setp.ne.u32 %p12, %r41, %r43; - @!%p12 bra $Lt_2_25602; -$Lt_2_26114: - setp.ge.u32 %p13, %r21, %r42; - @%p13 bra $Lt_2_26370; - add.u32 %r44, %r1, %r42; - cvt.u64.u32 %rd52, %r44; - mul.wide.u32 %rd53, %r44, 4; - add.u64 %rd54, %rd48, %rd53; - ld.shared.f32 %f82, [%rd54+0]; - add.ftz.f32 %f78, %f82, %f78; - st.shared.f32 [%rd51+0], %f78; - ld.shared.f32 %f83, [%rd54+512]; - add.ftz.f32 %f79, %f83, %f79; - st.shared.f32 [%rd51+512], %f79; - ld.shared.f32 %f84, [%rd54+1024]; - add.ftz.f32 %f80, %f84, %f80; - st.shared.f32 [%rd51+1024], %f80; - ld.shared.f32 %f85, [%rd54+1536]; - add.ftz.f32 %f81, %f85, %f81; - st.shared.f32 [%rd51+1536], %f81; -$Lt_2_26370: - shr.u32 %r42, %r42, 1; - mov.u32 %r45, 0; - setp.ne.u32 %p14, %r42, %r45; - @%p14 bra $Lt_2_26114; -$Lt_2_25602: - mov.f32 %f29, %f78; - mov.f32 %f28, %f79; - mov.f32 %f27, %f80; - mov.f32 %f30, %f81; - ld.param.s32 %r46, [__cudaparm_kernel_lj_fast_vflag]; - mov.u32 %r47, 0; - setp.le.s32 %p15, %r46, %r47; - @%p15 bra $Lt_2_27138; - mov.f32 %f78, %f11; - st.shared.f32 [%rd51+0], %f78; - mov.f32 %f79, %f13; - st.shared.f32 [%rd51+512], %f79; - mov.f32 %f80, %f15; - st.shared.f32 [%rd51+1024], %f80; - mov.f32 %f81, %f17; - st.shared.f32 [%rd51+1536], %f81; - mov.f32 %f86, %f19; - st.shared.f32 [%rd51+2048], %f86; - mov.f32 %f87, %f20; - st.shared.f32 [%rd51+2560], %f87; - mov.s32 %r48, %r41; - @!%p12 bra $Lt_2_27650; -$Lt_2_28162: - setp.ge.u32 %p16, %r21, %r48; - @%p16 bra $Lt_2_28418; - add.u32 %r49, %r1, %r48; - cvt.u64.u32 %rd55, %r49; - mul.wide.u32 %rd56, %r49, 4; - add.u64 %rd57, %rd48, %rd56; - ld.shared.f32 %f88, [%rd57+0]; - add.ftz.f32 %f78, %f88, %f78; - st.shared.f32 [%rd51+0], %f78; - ld.shared.f32 %f89, [%rd57+512]; - add.ftz.f32 %f79, %f89, %f79; - st.shared.f32 [%rd51+512], %f79; - ld.shared.f32 %f90, [%rd57+1024]; - add.ftz.f32 %f80, %f90, %f80; - st.shared.f32 [%rd51+1024], %f80; - ld.shared.f32 %f91, [%rd57+1536]; - add.ftz.f32 %f81, %f91, %f81; - st.shared.f32 [%rd51+1536], %f81; - ld.shared.f32 %f92, [%rd57+2048]; - add.ftz.f32 %f86, %f92, %f86; - st.shared.f32 [%rd51+2048], %f86; - ld.shared.f32 %f93, [%rd57+2560]; - add.ftz.f32 %f87, %f93, %f87; - st.shared.f32 [%rd51+2560], %f87; -$Lt_2_28418: - shr.u32 %r48, %r48, 1; - mov.u32 %r50, 0; - setp.ne.u32 %p17, %r48, %r50; - @%p17 bra $Lt_2_28162; -$Lt_2_27650: - mov.f32 %f11, %f78; - mov.f32 %f13, %f79; - mov.f32 %f15, %f80; - mov.f32 %f17, %f81; - mov.f32 %f19, %f86; - mov.f32 %f21, %f87; -$Lt_2_27138: -$Lt_2_25090: - mov.u32 %r51, 0; - setp.ne.s32 %p18, %r21, %r51; - @%p18 bra $Lt_2_29186; - ld.param.u64 %rd58, [__cudaparm_kernel_lj_fast___val_paramengv]; - add.u64 %rd59, %rd58, %rd18; - ld.param.s32 %r52, [__cudaparm_kernel_lj_fast_eflag]; - mov.u32 %r53, 0; - setp.le.s32 %p19, %r52, %r53; - @%p19 bra $Lt_2_29698; - ld.global.f32 %f94, [%rd59+0]; - add.ftz.f32 %f95, %f94, %f30; - st.global.f32 [%rd59+0], %f95; - cvt.s64.s32 %rd60, %r15; - mul.wide.s32 %rd61, %r15, 4; - add.u64 %rd59, %rd59, %rd61; -$Lt_2_29698: - ld.param.s32 %r54, [__cudaparm_kernel_lj_fast_vflag]; - mov.u32 %r55, 0; - setp.le.s32 %p20, %r54, %r55; - @%p20 bra $Lt_2_30210; - ld.global.f32 %f96, [%rd59+0]; - mov.f32 %f97, %f11; - add.ftz.f32 %f98, %f96, %f97; - st.global.f32 [%rd59+0], %f98; - cvt.s64.s32 %rd62, %r15; - mul.wide.s32 %rd63, %r15, 4; - add.u64 %rd64, %rd63, %rd59; - ld.global.f32 %f99, [%rd64+0]; - mov.f32 %f100, %f13; - add.ftz.f32 %f101, %f99, %f100; - st.global.f32 [%rd64+0], %f101; - add.u64 %rd65, %rd63, %rd64; - ld.global.f32 %f102, [%rd65+0]; - mov.f32 %f103, %f15; - add.ftz.f32 %f104, %f102, %f103; - st.global.f32 [%rd65+0], %f104; - add.u64 %rd66, %rd63, %rd65; - ld.global.f32 %f105, [%rd66+0]; - mov.f32 %f106, %f17; - add.ftz.f32 %f107, %f105, %f106; - st.global.f32 [%rd66+0], %f107; - add.u64 %rd67, %rd63, %rd66; - ld.global.f32 %f108, [%rd67+0]; - mov.f32 %f109, %f19; - add.ftz.f32 %f110, %f108, %f109; - st.global.f32 [%rd67+0], %f110; - add.u64 %rd59, %rd63, %rd67; - ld.global.f32 %f111, [%rd59+0]; - mov.f32 %f112, %f21; - add.ftz.f32 %f113, %f111, %f112; - st.global.f32 [%rd59+0], %f113; -$Lt_2_30210: - ld.param.u64 %rd68, [__cudaparm_kernel_lj_fast_ans]; - mul.lo.u64 %rd69, %rd17, 16; - add.u64 %rd70, %rd68, %rd69; - ld.global.v4.f32 {%f114,%f115,%f116,%f117}, [%rd70+0]; - add.ftz.f32 %f118, %f115, %f28; - add.ftz.f32 %f119, %f116, %f27; - add.ftz.f32 %f120, %f114, %f29; - st.global.v4.f32 [%rd70+0], {%f120,%f118,%f119,%f117}; -$Lt_2_29186: -$Lt_2_22530: - .loc 17 408 0 - exit; -$LDWend_kernel_lj_fast: - } // kernel_lj_fast - diff --git a/lib/gpu/gayberne_lj_ptx.h b/lib/gpu/gayberne_lj_ptx.h deleted file mode 100644 index f4a7b21b12..0000000000 --- a/lib/gpu/gayberne_lj_ptx.h +++ /dev/null @@ -1,1860 +0,0 @@ -const char * gayberne_lj = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .entry kernel_sphere_ellipsoid (\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_x_,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_q,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_shape,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_well,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_gum,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_sig_eps,\n" -" .param .s32 __cudaparm_kernel_sphere_ellipsoid_ntypes,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_lshape,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_dev_nbor,\n" -" .param .s32 __cudaparm_kernel_sphere_ellipsoid_stride,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_ans,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid___val_paramengv,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_err_flag,\n" -" .param .s32 __cudaparm_kernel_sphere_ellipsoid_eflag,\n" -" .param .s32 __cudaparm_kernel_sphere_ellipsoid_vflag,\n" -" .param .s32 __cudaparm_kernel_sphere_ellipsoid_start,\n" -" .param .s32 __cudaparm_kernel_sphere_ellipsoid_inum,\n" -" .param .s32 __cudaparm_kernel_sphere_ellipsoid_t_per_atom)\n" -" {\n" -" .reg .u32 %r<59>;\n" -" .reg .u64 %rd<79>;\n" -" .reg .f32 %f<432>;\n" -" .reg .pred %p<35>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32888_33_non_const_sp_lj124[16];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_33089_55_non_const_red_acc140[3072];\n" -" .loc 17 28 0\n" -"$LDWbegin_kernel_sphere_ellipsoid:\n" -" .loc 17 34 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_sphere_ellipsoid_gum];\n" -" ldu.global.f32 %f1, [%rd1+12];\n" -" .loc 17 35 0\n" -" ld.global.f32 %f2, [%rd1+16];\n" -" .loc 17 36 0\n" -" ld.global.f32 %f3, [%rd1+20];\n" -" .loc 17 37 0\n" -" ld.global.f32 %f4, [%rd1+24];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32888_33_non_const_sp_lj124+0], {%f1,%f2,%f3,%f4};\n" -" .loc 17 46 0\n" -" mov.f32 %f5, 0f00000000; \n" -" mov.f32 %f6, %f5;\n" -" mov.f32 %f7, 0f00000000; \n" -" mov.f32 %f8, %f7;\n" -" mov.f32 %f9, 0f00000000; \n" -" mov.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f00000000; \n" -" mov.f32 %f12, %f11;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, %f13;\n" -" mov.f32 %f15, 0f00000000; \n" -" mov.f32 %f16, %f15;\n" -" ld.param.s32 %r1, [__cudaparm_kernel_sphere_ellipsoid_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_sphere_ellipsoid_start];\n" -" add.s32 %r10, %r9, %r8;\n" -" ld.param.s32 %r11, [__cudaparm_kernel_sphere_ellipsoid_inum];\n" -" setp.ge.s32 %p1, %r10, %r11;\n" -" @%p1 bra $Lt_0_55042;\n" -" .loc 17 51 0\n" -" cvt.s64.s32 %rd2, %r10;\n" -" mul.wide.s32 %rd3, %r10, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_sphere_ellipsoid_dev_nbor];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.s32 %r12, [%rd5+0];\n" -" ld.param.s32 %r13, [__cudaparm_kernel_sphere_ellipsoid_stride];\n" -" cvt.s64.s32 %rd6, %r13;\n" -" mul.wide.s32 %rd7, %r13, 4;\n" -" add.u64 %rd8, %rd7, %rd5;\n" -" ld.global.s32 %r14, [%rd8+0];\n" -" .loc 17 54 0\n" -" ld.param.u64 %rd9, [__cudaparm_kernel_sphere_ellipsoid_x_];\n" -" cvt.s64.s32 %rd10, %r12;\n" -" mul.wide.s32 %rd11, %r12, 16;\n" -" add.u64 %rd12, %rd9, %rd11;\n" -" ld.global.v4.f32 {%f17,%f18,%f19,%f20}, [%rd12+0];\n" -" .loc 17 57 0\n" -" cvt.rzi.ftz.s32.f32 %r15, %f20;\n" -" cvt.s64.s32 %rd13, %r15;\n" -" mul.wide.s32 %rd14, %r15, 16;\n" -" ld.param.u64 %rd15, [__cudaparm_kernel_sphere_ellipsoid_shape];\n" -" add.u64 %rd16, %rd14, %rd15;\n" -" ld.global.f32 %f21, [%rd16+0];\n" -" .loc 17 58 0\n" -" ld.param.u64 %rd17, [__cudaparm_kernel_sphere_ellipsoid_well];\n" -" add.u64 %rd18, %rd14, %rd17;\n" -" ld.global.f32 %f22, [%rd18+0];\n" -" cvt.s32.s64 %r16, %rd6;\n" -" sub.s32 %r17, %r1, 1;\n" -" and.b32 %r18, %r17, %r2;\n" -" add.u64 %rd19, %rd7, %rd8;\n" -" mul.lo.s32 %r19, %r16, %r18;\n" -" cvt.s64.s32 %rd20, %r19;\n" -" mul.wide.s32 %rd21, %r19, 4;\n" -" add.u64 %rd22, %rd19, %rd21;\n" -" mov.s64 %rd23, %rd22;\n" -" mul.lo.s32 %r20, %r16, %r14;\n" -" cvt.s64.s32 %rd24, %r20;\n" -" mul.wide.s32 %rd25, %r20, 4;\n" -" add.u64 %rd26, %rd19, %rd25;\n" -" setp.ge.u64 %p2, %rd22, %rd26;\n" -" @%p2 bra $Lt_0_56578;\n" -" ld.param.s32 %r21, [__cudaparm_kernel_sphere_ellipsoid_eflag];\n" -" mov.s32 %r22, 0;\n" -" setp.gt.s32 %p3, %r21, %r22;\n" -" ld.param.s32 %r23, [__cudaparm_kernel_sphere_ellipsoid_vflag];\n" -" mov.s32 %r24, 0;\n" -" setp.gt.s32 %p4, %r23, %r24;\n" -" ld.param.s32 %r25, [__cudaparm_kernel_sphere_ellipsoid_ntypes];\n" -" mul.lo.s32 %r26, %r25, %r15;\n" -" ld.param.u64 %rd27, [__cudaparm_kernel_sphere_ellipsoid_lshape];\n" -" mul.lo.u64 %rd28, %rd13, 4;\n" -" add.u64 %rd29, %rd27, %rd28;\n" -" ld.param.u64 %rd30, [__cudaparm_kernel_sphere_ellipsoid_sig_eps];\n" -" ld.param.u64 %rd31, [__cudaparm_kernel_sphere_ellipsoid_q];\n" -" mov.f32 %f23, 0f00000000; \n" -" mov.f32 %f24, 0f00000000; \n" -" mov.f32 %f25, 0f00000000; \n" -" mov.f32 %f26, 0f00000000; \n" -" mov.u64 %rd32, __cuda___cuda_local_var_32888_33_non_const_sp_lj124;\n" -"$Lt_0_40706:\n" -" .loc 17 63 0\n" -" ld.global.s32 %r27, [%rd23+0];\n" -" .loc 17 64 0\n" -" shr.s32 %r28, %r27, 30;\n" -" and.b32 %r29, %r28, 3;\n" -" cvt.s64.s32 %rd33, %r29;\n" -" mul.wide.s32 %rd34, %r29, 4;\n" -" add.u64 %rd35, %rd32, %rd34;\n" -" ld.shared.f32 %f27, [%rd35+0];\n" -" .loc 17 67 0\n" -" and.b32 %r30, %r27, 1073741823;\n" -" cvt.s64.s32 %rd36, %r30;\n" -" mul.wide.s32 %rd37, %r30, 16;\n" -" add.u64 %rd38, %rd37, %rd9;\n" -" ld.global.v4.f32 {%f28,%f29,%f30,%f31}, [%rd38+0];\n" -" .loc 17 86 0\n" -" add.u64 %rd39, %rd37, %rd31;\n" -" ld.global.v4.f32 {%f32,%f33,%f34,%f35}, [%rd39+0];\n" -" .loc 17 95 0\n" -" cvt.rzi.ftz.s32.f32 %r31, %f31;\n" -" cvt.s64.s32 %rd40, %r31;\n" -" mul.wide.s32 %rd41, %r31, 16;\n" -" add.u64 %rd42, %rd41, %rd15;\n" -" ld.global.v4.f32 {%f36,%f37,%f38,_}, [%rd42+0];\n" -" .loc 16 299 0\n" -" sub.ftz.f32 %f39, %f28, %f17;\n" -" mov.f32 %f40, %f39;\n" -" .loc 16 300 0\n" -" add.ftz.f32 %f41, %f33, %f33;\n" -" add.ftz.f32 %f42, %f35, %f35;\n" -" mul.ftz.f32 %f43, %f32, %f32;\n" -" mul.ftz.f32 %f44, %f33, %f33;\n" -" mul.ftz.f32 %f45, %f34, %f34;\n" -" mul.ftz.f32 %f46, %f35, %f35;\n" -" add.ftz.f32 %f47, %f34, %f34;\n" -" mul.ftz.f32 %f48, %f41, %f34;\n" -" mul.ftz.f32 %f49, %f41, %f35;\n" -" mul.ftz.f32 %f50, %f42, %f32;\n" -" add.ftz.f32 %f51, %f43, %f44;\n" -" mul.ftz.f32 %f52, %f47, %f32;\n" -" sub.ftz.f32 %f53, %f48, %f50;\n" -" sub.ftz.f32 %f54, %f51, %f45;\n" -" add.ftz.f32 %f55, %f49, %f52;\n" -" mul.ftz.f32 %f56, %f53, %f37;\n" -" sub.ftz.f32 %f57, %f54, %f46;\n" -" mul.ftz.f32 %f58, %f55, %f38;\n" -" mul.ftz.f32 %f59, %f53, %f56;\n" -" mul.ftz.f32 %f60, %f57, %f36;\n" -" fma.rn.ftz.f32 %f61, %f57, %f60, %f59;\n" -" fma.rn.ftz.f32 %f62, %f55, %f58, %f61;\n" -" add.ftz.f32 %f63, %f62, %f21;\n" -" mov.f32 %f64, %f63;\n" -" .loc 16 301 0\n" -" mul.ftz.f32 %f65, %f41, %f32;\n" -" sub.ftz.f32 %f66, %f43, %f44;\n" -" mul.ftz.f32 %f67, %f47, %f35;\n" -" add.ftz.f32 %f68, %f48, %f50;\n" -" add.ftz.f32 %f69, %f45, %f66;\n" -" sub.ftz.f32 %f70, %f67, %f65;\n" -" mul.ftz.f32 %f71, %f68, %f36;\n" -" sub.ftz.f32 %f72, %f69, %f46;\n" -" mul.ftz.f32 %f73, %f70, %f38;\n" -" mul.ftz.f32 %f74, %f72, %f37;\n" -" mul.ftz.f32 %f75, %f53, %f74;\n" -" fma.rn.ftz.f32 %f76, %f57, %f71, %f75;\n" -" fma.rn.ftz.f32 %f77, %f55, %f73, %f76;\n" -" mov.f32 %f78, %f77;\n" -" .loc 16 302 0\n" -" sub.ftz.f32 %f79, %f66, %f45;\n" -" sub.ftz.f32 %f80, %f49, %f52;\n" -" add.ftz.f32 %f81, %f65, %f67;\n" -" add.ftz.f32 %f82, %f46, %f79;\n" -" mul.ftz.f32 %f83, %f80, %f36;\n" -" mul.ftz.f32 %f84, %f81, %f37;\n" -" mul.ftz.f32 %f85, %f82, %f38;\n" -" mul.ftz.f32 %f86, %f53, %f84;\n" -" fma.rn.ftz.f32 %f87, %f57, %f83, %f86;\n" -" fma.rn.ftz.f32 %f88, %f55, %f85, %f87;\n" -" mov.f32 %f89, %f88;\n" -" .loc 16 303 0\n" -" sub.ftz.f32 %f90, %f29, %f18;\n" -" mov.f32 %f91, %f90;\n" -" .loc 16 304 0\n" -" mul.ftz.f32 %f92, %f56, %f72;\n" -" fma.rn.ftz.f32 %f93, %f60, %f68, %f92;\n" -" fma.rn.ftz.f32 %f94, %f58, %f70, %f93;\n" -" mov.f32 %f95, %f94;\n" -" .loc 16 305 0\n" -" mul.ftz.f32 %f96, %f72, %f74;\n" -" fma.rn.ftz.f32 %f97, %f68, %f71, %f96;\n" -" fma.rn.ftz.f32 %f98, %f70, %f73, %f97;\n" -" add.ftz.f32 %f99, %f98, %f21;\n" -" mov.f32 %f100, %f99;\n" -" .loc 16 306 0\n" -" mul.ftz.f32 %f101, %f72, %f84;\n" -" fma.rn.ftz.f32 %f102, %f68, %f83, %f101;\n" -" fma.rn.ftz.f32 %f103, %f70, %f85, %f102;\n" -" mov.f32 %f104, %f103;\n" -" .loc 16 307 0\n" -" sub.ftz.f32 %f105, %f30, %f19;\n" -" mov.f32 %f106, %f105;\n" -" .loc 16 308 0\n" -" mul.ftz.f32 %f107, %f81, %f56;\n" -" fma.rn.ftz.f32 %f108, %f60, %f80, %f107;\n" -" fma.rn.ftz.f32 %f109, %f58, %f82, %f108;\n" -" mov.f32 %f110, %f109;\n" -" .loc 16 309 0\n" -" mul.ftz.f32 %f111, %f81, %f74;\n" -" fma.rn.ftz.f32 %f112, %f71, %f80, %f111;\n" -" fma.rn.ftz.f32 %f113, %f73, %f82, %f112;\n" -" mov.f32 %f114, %f113;\n" -" .loc 16 310 0\n" -" mul.ftz.f32 %f115, %f81, %f84;\n" -" fma.rn.ftz.f32 %f116, %f80, %f83, %f115;\n" -" fma.rn.ftz.f32 %f117, %f82, %f85, %f116;\n" -" add.ftz.f32 %f118, %f117, %f21;\n" -" mov.f32 %f119, %f118;\n" -" abs.ftz.f32 %f120, %f94;\n" -" abs.ftz.f32 %f121, %f63;\n" -" setp.gt.ftz.f32 %p5, %f120, %f121;\n" -" @!%p5 bra $Lt_0_40962;\n" -" .loc 16 314 0\n" -" mov.f32 %f64, %f94;\n" -" mov.f32 %f95, %f63;\n" -" .loc 16 315 0\n" -" mov.f32 %f78, %f99;\n" -" mov.f32 %f100, %f77;\n" -" .loc 16 316 0\n" -" mov.f32 %f89, %f103;\n" -" mov.f32 %f104, %f88;\n" -" .loc 16 317 0\n" -" mov.f32 %f40, %f90;\n" -" mov.f32 %f91, %f39;\n" -"$Lt_0_40962:\n" -" mov.f32 %f122, %f64;\n" -" abs.ftz.f32 %f123, %f122;\n" -" abs.ftz.f32 %f124, %f109;\n" -" setp.lt.ftz.f32 %p6, %f123, %f124;\n" -" @!%p6 bra $Lt_0_41474;\n" -" .loc 16 321 0\n" -" mov.f32 %f64, %f109;\n" -" mov.f32 %f110, %f122;\n" -" .loc 16 322 0\n" -" mov.f32 %f125, %f78;\n" -" mov.f32 %f78, %f113;\n" -" mov.f32 %f114, %f125;\n" -" .loc 16 323 0\n" -" mov.f32 %f126, %f89;\n" -" mov.f32 %f89, %f118;\n" -" mov.f32 %f119, %f126;\n" -" .loc 16 324 0\n" -" mov.f32 %f127, %f40;\n" -" mov.f32 %f40, %f105;\n" -" mov.f32 %f106, %f127;\n" -"$Lt_0_41474:\n" -" mov.f32 %f128, %f64;\n" -" mov.f32 %f129, 0f00000000; \n" -" setp.neu.ftz.f32 %p7, %f128, %f129;\n" -" @!%p7 bra $Lt_0_42242;\n" -" bra.uni $Lt_0_43010;\n" -"$Lt_0_42242:\n" -" mov.f32 %f130, 0f00000000; \n" -" setp.neu.ftz.f32 %p8, %f95, %f130;\n" -" @!%p8 bra $Lt_0_42754;\n" -" .loc 16 338 0\n" -" mov.f32 %f64, %f95;\n" -" mov.f32 %f95, %f128;\n" -" .loc 16 339 0\n" -" mov.f32 %f131, %f78;\n" -" mov.f32 %f78, %f100;\n" -" mov.f32 %f100, %f131;\n" -" .loc 16 340 0\n" -" mov.f32 %f132, %f89;\n" -" mov.f32 %f89, %f104;\n" -" mov.f32 %f104, %f132;\n" -" .loc 16 341 0\n" -" mov.f32 %f133, %f40;\n" -" mov.f32 %f40, %f91;\n" -" mov.f32 %f91, %f133;\n" -" bra.uni $Lt_0_43010;\n" -"$Lt_0_42754:\n" -" mov.f32 %f134, 0f00000000; \n" -" setp.neu.ftz.f32 %p9, %f110, %f134;\n" -" @!%p9 bra $Lt_0_43266;\n" -" .loc 16 346 0\n" -" mov.f32 %f64, %f110;\n" -" mov.f32 %f110, %f128;\n" -" .loc 16 347 0\n" -" mov.f32 %f135, %f78;\n" -" mov.f32 %f78, %f114;\n" -" mov.f32 %f114, %f135;\n" -" .loc 16 348 0\n" -" mov.f32 %f136, %f89;\n" -" mov.f32 %f89, %f119;\n" -" mov.f32 %f119, %f136;\n" -" .loc 16 349 0\n" -" mov.f32 %f137, %f40;\n" -" mov.f32 %f40, %f106;\n" -" mov.f32 %f106, %f137;\n" -" bra.uni $Lt_0_43010;\n" -"$Lt_0_43266:\n" -" .loc 16 352 0\n" -" mov.s32 %r32, 2;\n" -" ld.param.u64 %rd43, [__cudaparm_kernel_sphere_ellipsoid_err_flag];\n" -" st.global.s32 [%rd43+0], %r32;\n" -"$Lt_0_43010:\n" -"$Lt_0_42498:\n" -"$Lt_0_41986:\n" -" .loc 16 355 0\n" -" div.approx.ftz.f32 %f138, %f95, %f64;\n" -" mul.ftz.f32 %f139, %f78, %f138;\n" -" sub.ftz.f32 %f140, %f100, %f139;\n" -" mov.f32 %f100, %f140;\n" -" .loc 16 356 0\n" -" mul.ftz.f32 %f141, %f89, %f138;\n" -" sub.ftz.f32 %f142, %f104, %f141;\n" -" mov.f32 %f104, %f142;\n" -" .loc 16 357 0\n" -" mul.ftz.f32 %f143, %f40, %f138;\n" -" sub.ftz.f32 %f144, %f91, %f143;\n" -" mov.f32 %f91, %f144;\n" -" .loc 16 359 0\n" -" div.approx.ftz.f32 %f145, %f110, %f64;\n" -" mul.ftz.f32 %f146, %f78, %f145;\n" -" sub.ftz.f32 %f114, %f114, %f146;\n" -" .loc 16 360 0\n" -" mul.ftz.f32 %f147, %f89, %f145;\n" -" sub.ftz.f32 %f119, %f119, %f147;\n" -" .loc 16 361 0\n" -" mul.ftz.f32 %f148, %f40, %f145;\n" -" sub.ftz.f32 %f106, %f106, %f148;\n" -" abs.ftz.f32 %f149, %f140;\n" -" abs.ftz.f32 %f150, %f114;\n" -" setp.lt.ftz.f32 %p10, %f149, %f150;\n" -" @!%p10 bra $Lt_0_43522;\n" -" .loc 16 366 0\n" -" mov.f32 %f100, %f114;\n" -" mov.f32 %f114, %f140;\n" -" .loc 16 367 0\n" -" mov.f32 %f104, %f119;\n" -" mov.f32 %f119, %f142;\n" -" .loc 16 368 0\n" -" mov.f32 %f91, %f106;\n" -" mov.f32 %f106, %f144;\n" -"$Lt_0_43522:\n" -" mov.f32 %f151, %f100;\n" -" mov.f32 %f152, 0f00000000; \n" -" setp.neu.ftz.f32 %p11, %f151, %f152;\n" -" @!%p11 bra $Lt_0_44290;\n" -" bra.uni $Lt_0_44546;\n" -"$Lt_0_44290:\n" -" mov.f32 %f153, 0f00000000; \n" -" setp.neu.ftz.f32 %p12, %f114, %f153;\n" -" @!%p12 bra $Lt_0_44546;\n" -" .loc 16 383 0\n" -" mov.f32 %f100, %f114;\n" -" mov.f32 %f114, %f151;\n" -" .loc 16 384 0\n" -" mov.f32 %f154, %f104;\n" -" mov.f32 %f104, %f119;\n" -" mov.f32 %f119, %f154;\n" -" .loc 16 385 0\n" -" mov.f32 %f155, %f91;\n" -" mov.f32 %f91, %f106;\n" -" mov.f32 %f106, %f155;\n" -"$Lt_0_44546:\n" -"$Lt_0_44034:\n" -" .loc 16 390 0\n" -" div.approx.ftz.f32 %f156, %f114, %f100;\n" -" mul.ftz.f32 %f157, %f104, %f156;\n" -" sub.ftz.f32 %f119, %f119, %f157;\n" -" .loc 16 391 0\n" -" mul.ftz.f32 %f158, %f91, %f156;\n" -" sub.ftz.f32 %f106, %f106, %f158;\n" -" mov.f32 %f159, 0f00000000; \n" -" setp.eq.ftz.f32 %p13, %f119, %f159;\n" -" @!%p13 bra $Lt_0_45058;\n" -" .loc 16 394 0\n" -" mov.s32 %r33, 2;\n" -" ld.param.u64 %rd44, [__cudaparm_kernel_sphere_ellipsoid_err_flag];\n" -" st.global.s32 [%rd44+0], %r33;\n" -"$Lt_0_45058:\n" -" .loc 17 115 0\n" -" div.approx.ftz.f32 %f160, %f106, %f119;\n" -" mul.ftz.f32 %f161, %f90, %f90;\n" -" mul.ftz.f32 %f162, %f160, %f104;\n" -" fma.rn.ftz.f32 %f163, %f39, %f39, %f161;\n" -" sub.ftz.f32 %f164, %f91, %f162;\n" -" fma.rn.ftz.f32 %f165, %f105, %f105, %f163;\n" -" div.approx.ftz.f32 %f166, %f164, %f100;\n" -" rsqrt.approx.ftz.f32 %f167, %f165;\n" -" mul.ftz.f32 %f168, %f166, %f78;\n" -" fma.rn.ftz.f32 %f169, %f89, %f160, %f168;\n" -" sub.ftz.f32 %f170, %f40, %f169;\n" -" div.approx.ftz.f32 %f171, %f170, %f64;\n" -" mul.ftz.f32 %f172, %f167, %f171;\n" -" .loc 17 127 0\n" -" mul.ftz.f32 %f173, %f166, %f167;\n" -" mul.ftz.f32 %f174, %f167, %f90;\n" -" mul.ftz.f32 %f175, %f167, %f39;\n" -" mul.ftz.f32 %f176, %f167, %f105;\n" -" mul.ftz.f32 %f177, %f160, %f167;\n" -" mul.ftz.f32 %f178, %f173, %f174;\n" -" fma.rn.ftz.f32 %f179, %f175, %f172, %f178;\n" -" fma.rn.ftz.f32 %f180, %f176, %f177, %f179;\n" -" mov.f32 %f181, 0f3f000000; \n" -" mul.ftz.f32 %f182, %f180, %f181;\n" -" rsqrt.approx.ftz.f32 %f183, %f182;\n" -" .loc 17 131 0\n" -" rcp.approx.ftz.f32 %f184, %f167;\n" -" mul.ftz.f32 %f185, %f184, %f172;\n" -" .loc 17 136 0\n" -" add.s32 %r34, %r31, %r26;\n" -" cvt.s64.s32 %rd45, %r34;\n" -" mul.wide.s32 %rd46, %r34, 8;\n" -" add.u64 %rd47, %rd30, %rd46;\n" -" ld.global.v2.f32 {%f186,%f187}, [%rd47+0];\n" -" .loc 17 138 0\n" -" sub.ftz.f32 %f188, %f184, %f183;\n" -" ld.global.f32 %f189, [%rd1+0];\n" -" fma.rn.ftz.f32 %f190, %f189, %f186, %f188;\n" -" .loc 17 145 0\n" -" div.approx.ftz.f32 %f191, %f186, %f190;\n" -" mul.ftz.f32 %f192, %f191, %f191;\n" -" mul.ftz.f32 %f193, %f191, %f192;\n" -" mul.ftz.f32 %f194, %f193, %f193;\n" -" mul.ftz.f32 %f195, %f194, %f194;\n" -" mul.ftz.f32 %f196, %f191, %f194;\n" -" add.ftz.f32 %f197, %f195, %f195;\n" -" mul.ftz.f32 %f198, %f191, %f197;\n" -" sub.ftz.f32 %f199, %f198, %f196;\n" -" div.approx.ftz.f32 %f200, %f199, %f186;\n" -" mov.f32 %f201, 0f41c00000; \n" -" mul.ftz.f32 %f202, %f200, %f201;\n" -" mul.ftz.f32 %f203, %f187, %f202;\n" -" .loc 17 150 0\n" -" mul.ftz.f32 %f204, %f183, %f203;\n" -" mul.ftz.f32 %f205, %f204, %f183;\n" -" mul.ftz.f32 %f206, %f205, %f183;\n" -" mov.f32 %f207, 0f3f000000; \n" -" mul.ftz.f32 %f208, %f206, %f207;\n" -" mul.ftz.f32 %f209, %f208, %f167;\n" -" mul.ftz.f32 %f210, %f173, %f184;\n" -" mul.ftz.f32 %f211, %f177, %f184;\n" -" mul.ftz.f32 %f212, %f167, %f209;\n" -" mul.ftz.f32 %f213, %f174, %f210;\n" -" fma.rn.ftz.f32 %f214, %f175, %f185, %f213;\n" -" fma.rn.ftz.f32 %f215, %f176, %f211, %f214;\n" -" mul.ftz.f32 %f216, %f175, %f215;\n" -" sub.ftz.f32 %f217, %f185, %f216;\n" -" mul.ftz.f32 %f218, %f212, %f217;\n" -" fma.rn.ftz.f32 %f219, %f175, %f203, %f218;\n" -" .loc 17 151 0\n" -" mul.ftz.f32 %f220, %f174, %f215;\n" -" sub.ftz.f32 %f221, %f210, %f220;\n" -" mul.ftz.f32 %f222, %f212, %f221;\n" -" fma.rn.ftz.f32 %f223, %f174, %f203, %f222;\n" -" .loc 17 152 0\n" -" mul.ftz.f32 %f224, %f176, %f215;\n" -" sub.ftz.f32 %f225, %f211, %f224;\n" -" mul.ftz.f32 %f226, %f212, %f225;\n" -" fma.rn.ftz.f32 %f227, %f176, %f203, %f226;\n" -" .loc 17 159 0\n" -" ld.global.f32 %f228, [%rd29+0];\n" -" mul.lo.u64 %rd48, %rd40, 4;\n" -" add.u64 %rd49, %rd27, %rd48;\n" -" ld.global.f32 %f229, [%rd49+0];\n" -" add.ftz.f32 %f230, %f228, %f228;\n" -" mul.ftz.f32 %f231, %f229, %f230;\n" -" .loc 17 160 0\n" -" mul.ftz.f32 %f232, %f103, %f63;\n" -" mul.ftz.f32 %f233, %f113, %f232;\n" -" mul.ftz.f32 %f234, %f99, %f63;\n" -" mul.ftz.f32 %f235, %f118, %f234;\n" -" sub.ftz.f32 %f236, %f235, %f233;\n" -" mul.ftz.f32 %f237, %f94, %f77;\n" -" mul.ftz.f32 %f238, %f118, %f237;\n" -" sub.ftz.f32 %f239, %f236, %f238;\n" -" mul.ftz.f32 %f240, %f94, %f88;\n" -" fma.rn.ftz.f32 %f241, %f113, %f240, %f239;\n" -" mul.ftz.f32 %f242, %f109, %f77;\n" -" fma.rn.ftz.f32 %f243, %f103, %f242, %f241;\n" -" mul.ftz.f32 %f244, %f109, %f88;\n" -" mul.ftz.f32 %f245, %f99, %f244;\n" -" sub.ftz.f32 %f246, %f243, %f245;\n" -" .loc 17 161 0\n" -" ld.global.f32 %f247, [%rd1+4];\n" -" .loc 17 172 0\n" -" add.u64 %rd50, %rd41, %rd17;\n" -" ld.global.v4.f32 {%f248,%f249,%f250,_}, [%rd50+0];\n" -" .loc 16 299 0\n" -" mov.f32 %f40, %f39;\n" -" .loc 16 300 0\n" -" mul.ftz.f32 %f251, %f53, %f249;\n" -" mul.ftz.f32 %f252, %f55, %f250;\n" -" mul.ftz.f32 %f253, %f53, %f251;\n" -" mul.ftz.f32 %f254, %f57, %f248;\n" -" fma.rn.ftz.f32 %f255, %f57, %f254, %f253;\n" -" fma.rn.ftz.f32 %f256, %f55, %f252, %f255;\n" -" add.ftz.f32 %f257, %f256, %f22;\n" -" mov.f32 %f64, %f257;\n" -" .loc 16 301 0\n" -" mul.ftz.f32 %f258, %f68, %f248;\n" -" mul.ftz.f32 %f259, %f70, %f250;\n" -" mul.ftz.f32 %f260, %f72, %f249;\n" -" mul.ftz.f32 %f261, %f53, %f260;\n" -" fma.rn.ftz.f32 %f262, %f57, %f258, %f261;\n" -" fma.rn.ftz.f32 %f263, %f55, %f259, %f262;\n" -" mov.f32 %f78, %f263;\n" -" .loc 16 302 0\n" -" mul.ftz.f32 %f264, %f80, %f248;\n" -" mul.ftz.f32 %f265, %f81, %f249;\n" -" mul.ftz.f32 %f266, %f82, %f250;\n" -" mul.ftz.f32 %f267, %f53, %f265;\n" -" fma.rn.ftz.f32 %f268, %f57, %f264, %f267;\n" -" fma.rn.ftz.f32 %f269, %f55, %f266, %f268;\n" -" mov.f32 %f89, %f269;\n" -" .loc 16 303 0\n" -" mov.f32 %f91, %f90;\n" -" .loc 16 304 0\n" -" mul.ftz.f32 %f270, %f251, %f72;\n" -" fma.rn.ftz.f32 %f271, %f254, %f68, %f270;\n" -" fma.rn.ftz.f32 %f272, %f252, %f70, %f271;\n" -" mov.f32 %f95, %f272;\n" -" .loc 16 305 0\n" -" mul.ftz.f32 %f273, %f72, %f260;\n" -" fma.rn.ftz.f32 %f274, %f68, %f258, %f273;\n" -" fma.rn.ftz.f32 %f275, %f70, %f259, %f274;\n" -" add.ftz.f32 %f100, %f22, %f275;\n" -" .loc 16 306 0\n" -" mul.ftz.f32 %f276, %f72, %f265;\n" -" fma.rn.ftz.f32 %f277, %f68, %f264, %f276;\n" -" fma.rn.ftz.f32 %f104, %f70, %f266, %f277;\n" -" .loc 16 307 0\n" -" mov.f32 %f106, %f105;\n" -" .loc 16 308 0\n" -" mul.ftz.f32 %f278, %f81, %f251;\n" -" fma.rn.ftz.f32 %f279, %f254, %f80, %f278;\n" -" fma.rn.ftz.f32 %f280, %f252, %f82, %f279;\n" -" mov.f32 %f110, %f280;\n" -" .loc 16 309 0\n" -" mul.ftz.f32 %f281, %f81, %f260;\n" -" fma.rn.ftz.f32 %f282, %f258, %f80, %f281;\n" -" fma.rn.ftz.f32 %f114, %f259, %f82, %f282;\n" -" .loc 16 310 0\n" -" mul.ftz.f32 %f283, %f81, %f265;\n" -" fma.rn.ftz.f32 %f284, %f80, %f264, %f283;\n" -" fma.rn.ftz.f32 %f285, %f82, %f266, %f284;\n" -" add.ftz.f32 %f119, %f22, %f285;\n" -" abs.ftz.f32 %f286, %f272;\n" -" abs.ftz.f32 %f287, %f257;\n" -" setp.gt.ftz.f32 %p14, %f286, %f287;\n" -" @!%p14 bra $Lt_0_45570;\n" -" .loc 16 314 0\n" -" mov.f32 %f64, %f272;\n" -" mov.f32 %f95, %f257;\n" -" .loc 16 315 0\n" -" mov.f32 %f78, %f100;\n" -" mov.f32 %f100, %f263;\n" -" .loc 16 316 0\n" -" mov.f32 %f89, %f104;\n" -" mov.f32 %f104, %f269;\n" -" .loc 16 317 0\n" -" mov.f32 %f40, %f90;\n" -" mov.f32 %f91, %f39;\n" -"$Lt_0_45570:\n" -" mov.f32 %f288, %f64;\n" -" abs.ftz.f32 %f289, %f288;\n" -" abs.ftz.f32 %f290, %f280;\n" -" setp.lt.ftz.f32 %p15, %f289, %f290;\n" -" @!%p15 bra $Lt_0_46082;\n" -" .loc 16 321 0\n" -" mov.f32 %f64, %f280;\n" -" mov.f32 %f110, %f288;\n" -" .loc 16 322 0\n" -" mov.f32 %f291, %f78;\n" -" mov.f32 %f78, %f114;\n" -" mov.f32 %f114, %f291;\n" -" .loc 16 323 0\n" -" mov.f32 %f292, %f89;\n" -" mov.f32 %f89, %f119;\n" -" mov.f32 %f119, %f292;\n" -" .loc 16 324 0\n" -" mov.f32 %f293, %f40;\n" -" mov.f32 %f40, %f105;\n" -" mov.f32 %f106, %f293;\n" -"$Lt_0_46082:\n" -" mov.f32 %f294, %f64;\n" -" mov.f32 %f295, 0f00000000; \n" -" setp.neu.ftz.f32 %p16, %f294, %f295;\n" -" @!%p16 bra $Lt_0_46850;\n" -" bra.uni $Lt_0_47618;\n" -"$Lt_0_46850:\n" -" mov.f32 %f296, 0f00000000; \n" -" setp.neu.ftz.f32 %p17, %f95, %f296;\n" -" @!%p17 bra $Lt_0_47362;\n" -" .loc 16 338 0\n" -" mov.f32 %f64, %f95;\n" -" mov.f32 %f95, %f294;\n" -" .loc 16 339 0\n" -" mov.f32 %f297, %f78;\n" -" mov.f32 %f78, %f100;\n" -" mov.f32 %f100, %f297;\n" -" .loc 16 340 0\n" -" mov.f32 %f298, %f89;\n" -" mov.f32 %f89, %f104;\n" -" mov.f32 %f104, %f298;\n" -" .loc 16 341 0\n" -" mov.f32 %f299, %f40;\n" -" mov.f32 %f40, %f91;\n" -" mov.f32 %f91, %f299;\n" -" bra.uni $Lt_0_47618;\n" -"$Lt_0_47362:\n" -" mov.f32 %f300, 0f00000000; \n" -" setp.neu.ftz.f32 %p18, %f110, %f300;\n" -" @!%p18 bra $Lt_0_47874;\n" -" .loc 16 346 0\n" -" mov.f32 %f64, %f110;\n" -" mov.f32 %f110, %f294;\n" -" .loc 16 347 0\n" -" mov.f32 %f301, %f78;\n" -" mov.f32 %f78, %f114;\n" -" mov.f32 %f114, %f301;\n" -" .loc 16 348 0\n" -" mov.f32 %f302, %f89;\n" -" mov.f32 %f89, %f119;\n" -" mov.f32 %f119, %f302;\n" -" .loc 16 349 0\n" -" mov.f32 %f303, %f40;\n" -" mov.f32 %f40, %f106;\n" -" mov.f32 %f106, %f303;\n" -" bra.uni $Lt_0_47618;\n" -"$Lt_0_47874:\n" -" .loc 16 352 0\n" -" mov.s32 %r35, 2;\n" -" ld.param.u64 %rd51, [__cudaparm_kernel_sphere_ellipsoid_err_flag];\n" -" st.global.s32 [%rd51+0], %r35;\n" -"$Lt_0_47618:\n" -"$Lt_0_47106:\n" -"$Lt_0_46594:\n" -" .loc 16 355 0\n" -" div.approx.ftz.f32 %f304, %f95, %f64;\n" -" mul.ftz.f32 %f305, %f78, %f304;\n" -" sub.ftz.f32 %f306, %f100, %f305;\n" -" mov.f32 %f100, %f306;\n" -" .loc 16 356 0\n" -" mul.ftz.f32 %f307, %f89, %f304;\n" -" sub.ftz.f32 %f308, %f104, %f307;\n" -" mov.f32 %f104, %f308;\n" -" .loc 16 357 0\n" -" mul.ftz.f32 %f309, %f40, %f304;\n" -" sub.ftz.f32 %f310, %f91, %f309;\n" -" mov.f32 %f91, %f310;\n" -" .loc 16 359 0\n" -" div.approx.ftz.f32 %f311, %f110, %f64;\n" -" mul.ftz.f32 %f312, %f78, %f311;\n" -" sub.ftz.f32 %f114, %f114, %f312;\n" -" .loc 16 360 0\n" -" mul.ftz.f32 %f313, %f89, %f311;\n" -" sub.ftz.f32 %f119, %f119, %f313;\n" -" .loc 16 361 0\n" -" mul.ftz.f32 %f314, %f40, %f311;\n" -" sub.ftz.f32 %f106, %f106, %f314;\n" -" abs.ftz.f32 %f315, %f306;\n" -" abs.ftz.f32 %f316, %f114;\n" -" setp.lt.ftz.f32 %p19, %f315, %f316;\n" -" @!%p19 bra $Lt_0_48130;\n" -" .loc 16 366 0\n" -" mov.f32 %f100, %f114;\n" -" mov.f32 %f114, %f306;\n" -" .loc 16 367 0\n" -" mov.f32 %f104, %f119;\n" -" mov.f32 %f119, %f308;\n" -" .loc 16 368 0\n" -" mov.f32 %f91, %f106;\n" -" mov.f32 %f106, %f310;\n" -"$Lt_0_48130:\n" -" mov.f32 %f317, %f100;\n" -" mov.f32 %f318, 0f00000000; \n" -" setp.neu.ftz.f32 %p20, %f317, %f318;\n" -" @!%p20 bra $Lt_0_48898;\n" -" bra.uni $Lt_0_49154;\n" -"$Lt_0_48898:\n" -" mov.f32 %f319, 0f00000000; \n" -" setp.neu.ftz.f32 %p21, %f114, %f319;\n" -" @!%p21 bra $Lt_0_49154;\n" -" .loc 16 383 0\n" -" mov.f32 %f100, %f114;\n" -" mov.f32 %f114, %f317;\n" -" .loc 16 384 0\n" -" mov.f32 %f320, %f104;\n" -" mov.f32 %f104, %f119;\n" -" mov.f32 %f119, %f320;\n" -" .loc 16 385 0\n" -" mov.f32 %f321, %f91;\n" -" mov.f32 %f91, %f106;\n" -" mov.f32 %f106, %f321;\n" -"$Lt_0_49154:\n" -"$Lt_0_48642:\n" -" .loc 16 390 0\n" -" div.approx.ftz.f32 %f322, %f114, %f100;\n" -" mul.ftz.f32 %f323, %f104, %f322;\n" -" sub.ftz.f32 %f119, %f119, %f323;\n" -" .loc 16 391 0\n" -" mul.ftz.f32 %f324, %f91, %f322;\n" -" sub.ftz.f32 %f106, %f106, %f324;\n" -" mov.f32 %f325, 0f00000000; \n" -" setp.eq.ftz.f32 %p22, %f119, %f325;\n" -" @!%p22 bra $Lt_0_49666;\n" -" .loc 16 394 0\n" -" mov.s32 %r36, 2;\n" -" ld.param.u64 %rd52, [__cudaparm_kernel_sphere_ellipsoid_err_flag];\n" -" st.global.s32 [%rd52+0], %r36;\n" -"$Lt_0_49666:\n" -" .loc 17 189 0\n" -" div.approx.ftz.f32 %f326, %f106, %f119;\n" -" mul.ftz.f32 %f327, %f326, %f104;\n" -" sub.ftz.f32 %f328, %f91, %f327;\n" -" div.approx.ftz.f32 %f329, %f328, %f100;\n" -" mul.ftz.f32 %f330, %f329, %f78;\n" -" fma.rn.ftz.f32 %f331, %f89, %f326, %f330;\n" -" sub.ftz.f32 %f332, %f40, %f331;\n" -" div.approx.ftz.f32 %f333, %f332, %f64;\n" -" mul.ftz.f32 %f334, %f167, %f333;\n" -" .loc 17 193 0\n" -" ld.global.f32 %f335, [%rd1+8];\n" -" .loc 21 496 0\n" -" mul.ftz.f32 %f336, %f329, %f167;\n" -" mul.ftz.f32 %f337, %f326, %f167;\n" -" mul.ftz.f32 %f338, %f336, %f174;\n" -" fma.rn.ftz.f32 %f339, %f175, %f334, %f338;\n" -" fma.rn.ftz.f32 %f340, %f176, %f337, %f339;\n" -" add.ftz.f32 %f341, %f340, %f340;\n" -" lg2.approx.ftz.f32 %f342, %f341;\n" -" .loc 21 538 0\n" -" mul.ftz.f32 %f343, %f342, %f335;\n" -" ex2.approx.ftz.f32 %f344, %f343;\n" -" .loc 17 196 0\n" -" mul.ftz.f32 %f345, %f184, %f334;\n" -" .loc 17 201 0\n" -" mov.f32 %f346, 0fbf800000; \n" -" add.ftz.f32 %f347, %f335, %f346;\n" -" .loc 21 496 0\n" -" lg2.approx.ftz.f32 %f348, %f344;\n" -" .loc 17 201 0\n" -" div.approx.ftz.f32 %f349, %f347, %f335;\n" -" mul.ftz.f32 %f350, %f348, %f349;\n" -" ex2.approx.ftz.f32 %f351, %f350;\n" -" mov.f32 %f352, 0fc0800000; \n" -" mul.ftz.f32 %f353, %f167, %f352;\n" -" mul.ftz.f32 %f354, %f167, %f353;\n" -" mul.ftz.f32 %f355, %f335, %f354;\n" -" mul.ftz.f32 %f356, %f351, %f355;\n" -" .loc 17 203 0\n" -" mul.ftz.f32 %f357, %f336, %f184;\n" -" mul.ftz.f32 %f358, %f337, %f184;\n" -" mul.ftz.f32 %f359, %f174, %f357;\n" -" fma.rn.ftz.f32 %f360, %f175, %f345, %f359;\n" -" fma.rn.ftz.f32 %f361, %f176, %f358, %f360;\n" -" mul.ftz.f32 %f362, %f175, %f361;\n" -" sub.ftz.f32 %f363, %f345, %f362;\n" -" mul.ftz.f32 %f364, %f356, %f363;\n" -" .loc 17 204 0\n" -" mul.ftz.f32 %f365, %f174, %f361;\n" -" sub.ftz.f32 %f366, %f357, %f365;\n" -" mul.ftz.f32 %f367, %f356, %f366;\n" -" .loc 17 205 0\n" -" mul.ftz.f32 %f368, %f176, %f361;\n" -" sub.ftz.f32 %f369, %f358, %f368;\n" -" mul.ftz.f32 %f370, %f356, %f369;\n" -" .loc 16 396 0\n" -" mov.f32 %f371, 0f40800000; \n" -" mul.ftz.f32 %f372, %f187, %f371;\n" -" div.approx.ftz.f32 %f373, %f231, %f246;\n" -" lg2.approx.ftz.f32 %f374, %f373;\n" -" mul.ftz.f32 %f375, %f374, %f247;\n" -" ex2.approx.ftz.f32 %f376, %f375;\n" -" mul.ftz.f32 %f377, %f376, %f27;\n" -" sub.ftz.f32 %f378, %f195, %f194;\n" -" mul.ftz.f32 %f379, %f377, %f344;\n" -" mul.ftz.f32 %f380, %f372, %f378;\n" -" fma.rn.ftz.f32 %f381, %f380, %f379, %f26;\n" -" selp.f32 %f26, %f381, %f26, %p3;\n" -" mul.ftz.f32 %f382, %f379, %f219;\n" -" mul.ftz.f32 %f383, %f379, %f223;\n" -" mul.ftz.f32 %f384, %f379, %f227;\n" -" mul.ftz.f32 %f385, %f376, %f380;\n" -" mul.ftz.f32 %f386, %f385, %f27;\n" -" neg.ftz.f32 %f387, %f386;\n" -" mul.ftz.f32 %f388, %f364, %f387;\n" -" sub.ftz.f32 %f389, %f388, %f382;\n" -" mul.ftz.f32 %f390, %f367, %f387;\n" -" sub.ftz.f32 %f391, %f390, %f383;\n" -" mul.ftz.f32 %f392, %f370, %f387;\n" -" sub.ftz.f32 %f393, %f392, %f384;\n" -" @!%p4 bra $Lt_0_50434;\n" -" .loc 17 217 0\n" -" add.ftz.f32 %f25, %f389, %f25;\n" -" .loc 17 218 0\n" -" sub.ftz.f32 %f394, %f17, %f28;\n" -" mov.f32 %f395, %f6;\n" -" fma.rn.ftz.f32 %f396, %f394, %f389, %f395;\n" -" mov.f32 %f6, %f396;\n" -" .loc 17 220 0\n" -" add.ftz.f32 %f24, %f391, %f24;\n" -" .loc 17 221 0\n" -" sub.ftz.f32 %f397, %f18, %f29;\n" -" mov.f32 %f398, %f8;\n" -" fma.rn.ftz.f32 %f399, %f397, %f391, %f398;\n" -" mov.f32 %f8, %f399;\n" -" .loc 17 222 0\n" -" mov.f32 %f400, %f12;\n" -" fma.rn.ftz.f32 %f401, %f394, %f391, %f400;\n" -" mov.f32 %f12, %f401;\n" -" .loc 17 224 0\n" -" add.ftz.f32 %f23, %f393, %f23;\n" -" .loc 17 225 0\n" -" mov.f32 %f402, %f10;\n" -" sub.ftz.f32 %f403, %f19, %f30;\n" -" fma.rn.ftz.f32 %f404, %f403, %f393, %f402;\n" -" mov.f32 %f10, %f404;\n" -" .loc 17 226 0\n" -" mov.f32 %f405, %f14;\n" -" fma.rn.ftz.f32 %f406, %f394, %f393, %f405;\n" -" mov.f32 %f14, %f406;\n" -" .loc 17 227 0\n" -" fma.rn.ftz.f32 %f15, %f397, %f393, %f15;\n" -" mov.f32 %f16, %f15;\n" -" bra.uni $Lt_0_50178;\n" -"$Lt_0_50434:\n" -" .loc 17 229 0\n" -" add.ftz.f32 %f25, %f389, %f25;\n" -" .loc 17 230 0\n" -" add.ftz.f32 %f24, %f391, %f24;\n" -" .loc 17 231 0\n" -" add.ftz.f32 %f23, %f393, %f23;\n" -"$Lt_0_50178:\n" -" mul.lo.s32 %r37, %r16, %r1;\n" -" cvt.s64.s32 %rd53, %r37;\n" -" mul.wide.s32 %rd54, %r37, 4;\n" -" add.u64 %rd23, %rd23, %rd54;\n" -" setp.gt.u64 %p23, %rd26, %rd23;\n" -" @%p23 bra $Lt_0_40706;\n" -" bra.uni $Lt_0_40194;\n" -"$Lt_0_56578:\n" -" mov.f32 %f23, 0f00000000; \n" -" mov.f32 %f24, 0f00000000; \n" -" mov.f32 %f25, 0f00000000; \n" -" mov.f32 %f26, 0f00000000; \n" -"$Lt_0_40194:\n" -" mov.u32 %r38, 1;\n" -" setp.le.s32 %p24, %r1, %r38;\n" -" @%p24 bra $Lt_0_52994;\n" -" .loc 17 234 0\n" -" mov.u64 %rd55, __cuda___cuda_local_var_33089_55_non_const_red_acc140;\n" -" cvt.s64.s32 %rd56, %r2;\n" -" mul.wide.s32 %rd57, %r2, 4;\n" -" add.u64 %rd58, %rd55, %rd57;\n" -" mov.f32 %f407, %f25;\n" -" st.shared.f32 [%rd58+0], %f407;\n" -" mov.f32 %f408, %f24;\n" -" st.shared.f32 [%rd58+512], %f408;\n" -" mov.f32 %f409, %f23;\n" -" st.shared.f32 [%rd58+1024], %f409;\n" -" mov.f32 %f410, %f26;\n" -" st.shared.f32 [%rd58+1536], %f410;\n" -" shr.s32 %r39, %r1, 31;\n" -" mov.s32 %r40, 1;\n" -" and.b32 %r41, %r39, %r40;\n" -" add.s32 %r42, %r41, %r1;\n" -" shr.s32 %r43, %r42, 1;\n" -" mov.s32 %r44, %r43;\n" -" mov.u32 %r45, 0;\n" -" setp.ne.u32 %p25, %r43, %r45;\n" -" @!%p25 bra $Lt_0_51458;\n" -"$Lt_0_51970:\n" -" setp.ge.u32 %p26, %r18, %r44;\n" -" @%p26 bra $Lt_0_52226;\n" -" add.u32 %r46, %r2, %r44;\n" -" cvt.u64.u32 %rd59, %r46;\n" -" mul.wide.u32 %rd60, %r46, 4;\n" -" add.u64 %rd61, %rd55, %rd60;\n" -" ld.shared.f32 %f411, [%rd61+0];\n" -" add.ftz.f32 %f407, %f411, %f407;\n" -" st.shared.f32 [%rd58+0], %f407;\n" -" ld.shared.f32 %f412, [%rd61+512];\n" -" add.ftz.f32 %f408, %f412, %f408;\n" -" st.shared.f32 [%rd58+512], %f408;\n" -" ld.shared.f32 %f413, [%rd61+1024];\n" -" add.ftz.f32 %f409, %f413, %f409;\n" -" st.shared.f32 [%rd58+1024], %f409;\n" -" ld.shared.f32 %f414, [%rd61+1536];\n" -" add.ftz.f32 %f410, %f414, %f410;\n" -" st.shared.f32 [%rd58+1536], %f410;\n" -"$Lt_0_52226:\n" -" shr.u32 %r44, %r44, 1;\n" -" mov.u32 %r47, 0;\n" -" setp.ne.u32 %p27, %r44, %r47;\n" -" @%p27 bra $Lt_0_51970;\n" -"$Lt_0_51458:\n" -" mov.f32 %f25, %f407;\n" -" mov.f32 %f24, %f408;\n" -" mov.f32 %f23, %f409;\n" -" mov.f32 %f26, %f410;\n" -" ld.param.s32 %r48, [__cudaparm_kernel_sphere_ellipsoid_vflag];\n" -" mov.u32 %r49, 0;\n" -" setp.le.s32 %p28, %r48, %r49;\n" -" @%p28 bra $Lt_0_52994;\n" -" mov.f32 %f407, %f6;\n" -" st.shared.f32 [%rd58+0], %f407;\n" -" mov.f32 %f408, %f8;\n" -" st.shared.f32 [%rd58+512], %f408;\n" -" mov.f32 %f409, %f10;\n" -" st.shared.f32 [%rd58+1024], %f409;\n" -" mov.f32 %f410, %f12;\n" -" st.shared.f32 [%rd58+1536], %f410;\n" -" mov.f32 %f415, %f14;\n" -" st.shared.f32 [%rd58+2048], %f415;\n" -" mov.f32 %f416, %f15;\n" -" st.shared.f32 [%rd58+2560], %f416;\n" -" mov.s32 %r50, %r43;\n" -" @!%p25 bra $Lt_0_53506;\n" -"$Lt_0_54018:\n" -" setp.ge.u32 %p29, %r18, %r50;\n" -" @%p29 bra $Lt_0_54274;\n" -" add.u32 %r51, %r2, %r50;\n" -" cvt.u64.u32 %rd62, %r51;\n" -" mul.wide.u32 %rd63, %r51, 4;\n" -" add.u64 %rd64, %rd55, %rd63;\n" -" ld.shared.f32 %f417, [%rd64+0];\n" -" add.ftz.f32 %f407, %f417, %f407;\n" -" st.shared.f32 [%rd58+0], %f407;\n" -" ld.shared.f32 %f418, [%rd64+512];\n" -" add.ftz.f32 %f408, %f418, %f408;\n" -" st.shared.f32 [%rd58+512], %f408;\n" -" ld.shared.f32 %f419, [%rd64+1024];\n" -" add.ftz.f32 %f409, %f419, %f409;\n" -" st.shared.f32 [%rd58+1024], %f409;\n" -" ld.shared.f32 %f420, [%rd64+1536];\n" -" add.ftz.f32 %f410, %f420, %f410;\n" -" st.shared.f32 [%rd58+1536], %f410;\n" -" ld.shared.f32 %f421, [%rd64+2048];\n" -" add.ftz.f32 %f415, %f421, %f415;\n" -" st.shared.f32 [%rd58+2048], %f415;\n" -" ld.shared.f32 %f422, [%rd64+2560];\n" -" add.ftz.f32 %f416, %f422, %f416;\n" -" st.shared.f32 [%rd58+2560], %f416;\n" -"$Lt_0_54274:\n" -" shr.u32 %r50, %r50, 1;\n" -" mov.u32 %r52, 0;\n" -" setp.ne.u32 %p30, %r50, %r52;\n" -" @%p30 bra $Lt_0_54018;\n" -"$Lt_0_53506:\n" -" mov.f32 %f6, %f407;\n" -" mov.f32 %f8, %f408;\n" -" mov.f32 %f10, %f409;\n" -" mov.f32 %f12, %f410;\n" -" mov.f32 %f14, %f415;\n" -" mov.f32 %f16, %f416;\n" -"$Lt_0_52994:\n" -"$Lt_0_50946:\n" -" mov.u32 %r53, 0;\n" -" setp.ne.s32 %p31, %r18, %r53;\n" -" @%p31 bra $Lt_0_55042;\n" -" ld.param.u64 %rd65, [__cudaparm_kernel_sphere_ellipsoid___val_paramengv];\n" -" add.u64 %rd66, %rd65, %rd3;\n" -" ld.param.s32 %r54, [__cudaparm_kernel_sphere_ellipsoid_eflag];\n" -" mov.u32 %r55, 0;\n" -" setp.le.s32 %p32, %r54, %r55;\n" -" @%p32 bra $Lt_0_55554;\n" -" st.global.f32 [%rd66+0], %f26;\n" -" cvt.s64.s32 %rd67, %r11;\n" -" mul.wide.s32 %rd68, %r11, 4;\n" -" add.u64 %rd66, %rd66, %rd68;\n" -"$Lt_0_55554:\n" -" ld.param.s32 %r56, [__cudaparm_kernel_sphere_ellipsoid_vflag];\n" -" mov.u32 %r57, 0;\n" -" setp.le.s32 %p33, %r56, %r57;\n" -" @%p33 bra $Lt_0_56066;\n" -" mov.f32 %f423, %f6;\n" -" st.global.f32 [%rd66+0], %f423;\n" -" cvt.s64.s32 %rd69, %r11;\n" -" mul.wide.s32 %rd70, %r11, 4;\n" -" add.u64 %rd71, %rd70, %rd66;\n" -" mov.f32 %f424, %f8;\n" -" st.global.f32 [%rd71+0], %f424;\n" -" add.u64 %rd72, %rd70, %rd71;\n" -" mov.f32 %f425, %f10;\n" -" st.global.f32 [%rd72+0], %f425;\n" -" add.u64 %rd73, %rd70, %rd72;\n" -" mov.f32 %f426, %f12;\n" -" st.global.f32 [%rd73+0], %f426;\n" -" add.u64 %rd66, %rd70, %rd73;\n" -" mov.f32 %f427, %f14;\n" -" st.global.f32 [%rd66+0], %f427;\n" -" mov.f32 %f428, %f16;\n" -" add.u64 %rd74, %rd70, %rd66;\n" -" st.global.f32 [%rd74+0], %f428;\n" -"$Lt_0_56066:\n" -" ld.param.u64 %rd75, [__cudaparm_kernel_sphere_ellipsoid_ans];\n" -" mul.lo.u64 %rd76, %rd2, 16;\n" -" add.u64 %rd77, %rd75, %rd76;\n" -" mov.f32 %f429, %f430;\n" -" st.global.v4.f32 [%rd77+0], {%f25,%f24,%f23,%f429};\n" -"$Lt_0_55042:\n" -"$Lt_0_39682:\n" -" .loc 17 237 0\n" -" exit;\n" -"$LDWend_kernel_sphere_ellipsoid:\n" -" }\n" -" .entry kernel_lj (\n" -" .param .u64 __cudaparm_kernel_lj_x_,\n" -" .param .u64 __cudaparm_kernel_lj_lj1,\n" -" .param .u64 __cudaparm_kernel_lj_lj3,\n" -" .param .s32 __cudaparm_kernel_lj_lj_types,\n" -" .param .u64 __cudaparm_kernel_lj_gum,\n" -" .param .s32 __cudaparm_kernel_lj_stride,\n" -" .param .u64 __cudaparm_kernel_lj_dev_ij,\n" -" .param .u64 __cudaparm_kernel_lj_ans,\n" -" .param .u64 __cudaparm_kernel_lj___val_paramengv,\n" -" .param .u64 __cudaparm_kernel_lj_err_flag,\n" -" .param .s32 __cudaparm_kernel_lj_eflag,\n" -" .param .s32 __cudaparm_kernel_lj_vflag,\n" -" .param .s32 __cudaparm_kernel_lj_start,\n" -" .param .s32 __cudaparm_kernel_lj_inum,\n" -" .param .s32 __cudaparm_kernel_lj_t_per_atom)\n" -" {\n" -" .reg .u32 %r<55>;\n" -" .reg .u64 %rd<60>;\n" -" .reg .f32 %f<115>;\n" -" .reg .pred %p<19>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_33106_33_non_const_sp_lj3316[16];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_33172_55_non_const_red_acc3332[3072];\n" -" .loc 17 246 0\n" -"$LDWbegin_kernel_lj:\n" -" .loc 17 252 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_lj_gum];\n" -" ldu.global.f32 %f1, [%rd1+12];\n" -" .loc 17 253 0\n" -" ld.global.f32 %f2, [%rd1+16];\n" -" .loc 17 254 0\n" -" ld.global.f32 %f3, [%rd1+20];\n" -" .loc 17 255 0\n" -" ld.global.f32 %f4, [%rd1+24];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_33106_33_non_const_sp_lj3316+0], {%f1,%f2,%f3,%f4};\n" -" .loc 17 264 0\n" -" mov.f32 %f5, 0f00000000; \n" -" mov.f32 %f6, %f5;\n" -" mov.f32 %f7, 0f00000000; \n" -" mov.f32 %f8, %f7;\n" -" mov.f32 %f9, 0f00000000; \n" -" mov.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f00000000; \n" -" mov.f32 %f12, %f11;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, %f13;\n" -" mov.f32 %f15, 0f00000000; \n" -" mov.f32 %f16, %f15;\n" -" ld.param.s32 %r1, [__cudaparm_kernel_lj_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_lj_start];\n" -" add.s32 %r10, %r9, %r8;\n" -" ld.param.s32 %r11, [__cudaparm_kernel_lj_inum];\n" -" setp.ge.s32 %p1, %r10, %r11;\n" -" @%p1 bra $Lt_1_25346;\n" -" .loc 17 269 0\n" -" cvt.s64.s32 %rd2, %r10;\n" -" mul.wide.s32 %rd3, %r10, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_lj_dev_ij];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.s32 %r12, [%rd5+0];\n" -" ld.param.s32 %r13, [__cudaparm_kernel_lj_stride];\n" -" cvt.s64.s32 %rd6, %r13;\n" -" mul.wide.s32 %rd7, %r13, 4;\n" -" add.u64 %rd8, %rd7, %rd5;\n" -" ld.global.s32 %r14, [%rd8+0];\n" -" .loc 17 272 0\n" -" ld.param.u64 %rd9, [__cudaparm_kernel_lj_x_];\n" -" cvt.s64.s32 %rd10, %r12;\n" -" mul.wide.s32 %rd11, %r12, 16;\n" -" add.u64 %rd12, %rd9, %rd11;\n" -" ld.global.v4.f32 {%f17,%f18,%f19,%f20}, [%rd12+0];\n" -" .loc 17 273 0\n" -" cvt.s32.s64 %r15, %rd6;\n" -" sub.s32 %r16, %r1, 1;\n" -" and.b32 %r17, %r16, %r2;\n" -" add.u64 %rd13, %rd7, %rd8;\n" -" mul.lo.s32 %r18, %r15, %r17;\n" -" cvt.s64.s32 %rd14, %r18;\n" -" mul.wide.s32 %rd15, %r18, 4;\n" -" add.u64 %rd16, %rd13, %rd15;\n" -" mov.s64 %rd17, %rd16;\n" -" mul.lo.s32 %r19, %r15, %r14;\n" -" cvt.s64.s32 %rd18, %r19;\n" -" mul.wide.s32 %rd19, %r19, 4;\n" -" add.u64 %rd20, %rd13, %rd19;\n" -" setp.ge.u64 %p2, %rd16, %rd20;\n" -" @%p2 bra $Lt_1_26882;\n" -" cvt.rzi.ftz.s32.f32 %r20, %f20;\n" -" ld.param.s32 %r21, [__cudaparm_kernel_lj_lj_types];\n" -" mul.lo.s32 %r22, %r21, %r20;\n" -" ld.param.u64 %rd21, [__cudaparm_kernel_lj_lj1];\n" -" mov.f32 %f21, 0f00000000; \n" -" mov.f32 %f22, 0f00000000; \n" -" mov.f32 %f23, 0f00000000; \n" -" mov.f32 %f24, 0f00000000; \n" -" mov.u64 %rd22, __cuda___cuda_local_var_33106_33_non_const_sp_lj3316;\n" -"$Lt_1_19714:\n" -" .loc 17 278 0\n" -" ld.global.s32 %r23, [%rd17+0];\n" -" .loc 17 279 0\n" -" shr.s32 %r24, %r23, 30;\n" -" and.b32 %r25, %r24, 3;\n" -" cvt.s64.s32 %rd23, %r25;\n" -" mul.wide.s32 %rd24, %r25, 4;\n" -" add.u64 %rd25, %rd22, %rd24;\n" -" ld.shared.f32 %f25, [%rd25+0];\n" -" .loc 17 282 0\n" -" and.b32 %r26, %r23, 1073741823;\n" -" cvt.s64.s32 %rd26, %r26;\n" -" mul.wide.s32 %rd27, %r26, 16;\n" -" add.u64 %rd28, %rd9, %rd27;\n" -" ld.global.v4.f32 {%f26,%f27,%f28,%f29}, [%rd28+0];\n" -" .loc 17 278 0\n" -" cvt.rzi.ftz.s32.f32 %r27, %f29;\n" -" sub.ftz.f32 %f30, %f18, %f27;\n" -" sub.ftz.f32 %f31, %f17, %f26;\n" -" sub.ftz.f32 %f32, %f19, %f28;\n" -" mul.ftz.f32 %f33, %f30, %f30;\n" -" fma.rn.ftz.f32 %f34, %f31, %f31, %f33;\n" -" fma.rn.ftz.f32 %f35, %f32, %f32, %f34;\n" -" add.s32 %r28, %r27, %r22;\n" -" cvt.s64.s32 %rd29, %r28;\n" -" mul.wide.s32 %rd30, %r28, 16;\n" -" add.u64 %rd31, %rd30, %rd21;\n" -" ld.global.f32 %f36, [%rd31+8];\n" -" setp.gt.ftz.f32 %p3, %f36, %f35;\n" -" @!%p3 bra $Lt_1_27138;\n" -" ld.global.f32 %f37, [%rd31+12];\n" -" mov.f32 %f38, 0f00000000; \n" -" setp.eq.ftz.f32 %p4, %f37, %f38;\n" -" @!%p4 bra $Lt_1_27138;\n" -" .loc 17 296 0\n" -" rcp.approx.ftz.f32 %f39, %f35;\n" -" mul.ftz.f32 %f40, %f39, %f39;\n" -" mul.ftz.f32 %f41, %f39, %f40;\n" -" mul.ftz.f32 %f42, %f39, %f41;\n" -" ld.global.v2.f32 {%f43,%f44}, [%rd31+0];\n" -" mul.ftz.f32 %f45, %f43, %f41;\n" -" sub.ftz.f32 %f46, %f45, %f44;\n" -" mul.ftz.f32 %f47, %f42, %f46;\n" -" mul.ftz.f32 %f48, %f25, %f47;\n" -" .loc 17 298 0\n" -" fma.rn.ftz.f32 %f23, %f31, %f48, %f23;\n" -" .loc 17 299 0\n" -" fma.rn.ftz.f32 %f22, %f30, %f48, %f22;\n" -" .loc 17 300 0\n" -" fma.rn.ftz.f32 %f21, %f32, %f48, %f21;\n" -" ld.param.s32 %r29, [__cudaparm_kernel_lj_eflag];\n" -" mov.u32 %r30, 0;\n" -" setp.le.s32 %p5, %r29, %r30;\n" -" @%p5 bra $Lt_1_19970;\n" -" .loc 17 304 0\n" -" ld.param.u64 %rd32, [__cudaparm_kernel_lj_lj3];\n" -" add.u64 %rd33, %rd32, %rd30;\n" -" ld.global.v4.f32 {%f49,%f50,%f51,_}, [%rd33+0];\n" -" mul.ftz.f32 %f52, %f49, %f41;\n" -" sub.ftz.f32 %f53, %f52, %f50;\n" -" mul.ftz.f32 %f54, %f41, %f53;\n" -" sub.ftz.f32 %f55, %f54, %f51;\n" -" fma.rn.ftz.f32 %f24, %f25, %f55, %f24;\n" -"$Lt_1_19970:\n" -" ld.param.s32 %r31, [__cudaparm_kernel_lj_vflag];\n" -" mov.u32 %r32, 0;\n" -" setp.le.s32 %p6, %r31, %r32;\n" -" @%p6 bra $Lt_1_27138;\n" -" .loc 17 307 0\n" -" mov.f32 %f56, %f6;\n" -" mul.ftz.f32 %f57, %f31, %f31;\n" -" fma.rn.ftz.f32 %f58, %f48, %f57, %f56;\n" -" mov.f32 %f6, %f58;\n" -" .loc 17 308 0\n" -" mov.f32 %f59, %f8;\n" -" fma.rn.ftz.f32 %f60, %f48, %f33, %f59;\n" -" mov.f32 %f8, %f60;\n" -" .loc 17 309 0\n" -" mov.f32 %f61, %f10;\n" -" mul.ftz.f32 %f62, %f32, %f32;\n" -" fma.rn.ftz.f32 %f63, %f48, %f62, %f61;\n" -" mov.f32 %f10, %f63;\n" -" .loc 17 310 0\n" -" mov.f32 %f64, %f12;\n" -" mul.ftz.f32 %f65, %f30, %f31;\n" -" fma.rn.ftz.f32 %f66, %f48, %f65, %f64;\n" -" mov.f32 %f12, %f66;\n" -" .loc 17 311 0\n" -" mov.f32 %f67, %f14;\n" -" mul.ftz.f32 %f68, %f31, %f32;\n" -" fma.rn.ftz.f32 %f69, %f48, %f68, %f67;\n" -" mov.f32 %f14, %f69;\n" -" .loc 17 312 0\n" -" mul.ftz.f32 %f70, %f30, %f32;\n" -" fma.rn.ftz.f32 %f15, %f48, %f70, %f15;\n" -" mov.f32 %f16, %f15;\n" -"$Lt_1_27138:\n" -"$L_1_18178:\n" -" .loc 17 306 0\n" -" mul.lo.s32 %r33, %r15, %r1;\n" -" cvt.s64.s32 %rd34, %r33;\n" -" mul.wide.s32 %rd35, %r33, 4;\n" -" add.u64 %rd17, %rd17, %rd35;\n" -" setp.gt.u64 %p7, %rd20, %rd17;\n" -" @%p7 bra $Lt_1_19714;\n" -" bra.uni $Lt_1_19202;\n" -"$Lt_1_26882:\n" -" mov.f32 %f21, 0f00000000; \n" -" mov.f32 %f22, 0f00000000; \n" -" mov.f32 %f23, 0f00000000; \n" -" mov.f32 %f24, 0f00000000; \n" -"$Lt_1_19202:\n" -" mov.u32 %r34, 1;\n" -" setp.le.s32 %p8, %r1, %r34;\n" -" @%p8 bra $Lt_1_23298;\n" -" .loc 17 317 0\n" -" mov.u64 %rd36, __cuda___cuda_local_var_33172_55_non_const_red_acc3332;\n" -" cvt.s64.s32 %rd37, %r2;\n" -" mul.wide.s32 %rd38, %r2, 4;\n" -" add.u64 %rd39, %rd36, %rd38;\n" -" mov.f32 %f71, %f23;\n" -" st.shared.f32 [%rd39+0], %f71;\n" -" mov.f32 %f72, %f22;\n" -" st.shared.f32 [%rd39+512], %f72;\n" -" mov.f32 %f73, %f21;\n" -" st.shared.f32 [%rd39+1024], %f73;\n" -" mov.f32 %f74, %f24;\n" -" st.shared.f32 [%rd39+1536], %f74;\n" -" shr.s32 %r35, %r1, 31;\n" -" mov.s32 %r36, 1;\n" -" and.b32 %r37, %r35, %r36;\n" -" add.s32 %r38, %r37, %r1;\n" -" shr.s32 %r39, %r38, 1;\n" -" mov.s32 %r40, %r39;\n" -" mov.u32 %r41, 0;\n" -" setp.ne.u32 %p9, %r39, %r41;\n" -" @!%p9 bra $Lt_1_21762;\n" -"$Lt_1_22274:\n" -" setp.ge.u32 %p10, %r17, %r40;\n" -" @%p10 bra $Lt_1_22530;\n" -" add.u32 %r42, %r2, %r40;\n" -" cvt.u64.u32 %rd40, %r42;\n" -" mul.wide.u32 %rd41, %r42, 4;\n" -" add.u64 %rd42, %rd36, %rd41;\n" -" ld.shared.f32 %f75, [%rd42+0];\n" -" add.ftz.f32 %f71, %f75, %f71;\n" -" st.shared.f32 [%rd39+0], %f71;\n" -" ld.shared.f32 %f76, [%rd42+512];\n" -" add.ftz.f32 %f72, %f76, %f72;\n" -" st.shared.f32 [%rd39+512], %f72;\n" -" ld.shared.f32 %f77, [%rd42+1024];\n" -" add.ftz.f32 %f73, %f77, %f73;\n" -" st.shared.f32 [%rd39+1024], %f73;\n" -" ld.shared.f32 %f78, [%rd42+1536];\n" -" add.ftz.f32 %f74, %f78, %f74;\n" -" st.shared.f32 [%rd39+1536], %f74;\n" -"$Lt_1_22530:\n" -" shr.u32 %r40, %r40, 1;\n" -" mov.u32 %r43, 0;\n" -" setp.ne.u32 %p11, %r40, %r43;\n" -" @%p11 bra $Lt_1_22274;\n" -"$Lt_1_21762:\n" -" mov.f32 %f23, %f71;\n" -" mov.f32 %f22, %f72;\n" -" mov.f32 %f21, %f73;\n" -" mov.f32 %f24, %f74;\n" -" ld.param.s32 %r44, [__cudaparm_kernel_lj_vflag];\n" -" mov.u32 %r45, 0;\n" -" setp.le.s32 %p12, %r44, %r45;\n" -" @%p12 bra $Lt_1_23298;\n" -" mov.f32 %f71, %f6;\n" -" st.shared.f32 [%rd39+0], %f71;\n" -" mov.f32 %f72, %f8;\n" -" st.shared.f32 [%rd39+512], %f72;\n" -" mov.f32 %f73, %f10;\n" -" st.shared.f32 [%rd39+1024], %f73;\n" -" mov.f32 %f74, %f12;\n" -" st.shared.f32 [%rd39+1536], %f74;\n" -" mov.f32 %f79, %f14;\n" -" st.shared.f32 [%rd39+2048], %f79;\n" -" mov.f32 %f80, %f15;\n" -" st.shared.f32 [%rd39+2560], %f80;\n" -" mov.s32 %r46, %r39;\n" -" @!%p9 bra $Lt_1_23810;\n" -"$Lt_1_24322:\n" -" setp.ge.u32 %p13, %r17, %r46;\n" -" @%p13 bra $Lt_1_24578;\n" -" add.u32 %r47, %r2, %r46;\n" -" cvt.u64.u32 %rd43, %r47;\n" -" mul.wide.u32 %rd44, %r47, 4;\n" -" add.u64 %rd45, %rd36, %rd44;\n" -" ld.shared.f32 %f81, [%rd45+0];\n" -" add.ftz.f32 %f71, %f81, %f71;\n" -" st.shared.f32 [%rd39+0], %f71;\n" -" ld.shared.f32 %f82, [%rd45+512];\n" -" add.ftz.f32 %f72, %f82, %f72;\n" -" st.shared.f32 [%rd39+512], %f72;\n" -" ld.shared.f32 %f83, [%rd45+1024];\n" -" add.ftz.f32 %f73, %f83, %f73;\n" -" st.shared.f32 [%rd39+1024], %f73;\n" -" ld.shared.f32 %f84, [%rd45+1536];\n" -" add.ftz.f32 %f74, %f84, %f74;\n" -" st.shared.f32 [%rd39+1536], %f74;\n" -" ld.shared.f32 %f85, [%rd45+2048];\n" -" add.ftz.f32 %f79, %f85, %f79;\n" -" st.shared.f32 [%rd39+2048], %f79;\n" -" ld.shared.f32 %f86, [%rd45+2560];\n" -" add.ftz.f32 %f80, %f86, %f80;\n" -" st.shared.f32 [%rd39+2560], %f80;\n" -"$Lt_1_24578:\n" -" shr.u32 %r46, %r46, 1;\n" -" mov.u32 %r48, 0;\n" -" setp.ne.u32 %p14, %r46, %r48;\n" -" @%p14 bra $Lt_1_24322;\n" -"$Lt_1_23810:\n" -" mov.f32 %f6, %f71;\n" -" mov.f32 %f8, %f72;\n" -" mov.f32 %f10, %f73;\n" -" mov.f32 %f12, %f74;\n" -" mov.f32 %f14, %f79;\n" -" mov.f32 %f16, %f80;\n" -"$Lt_1_23298:\n" -"$Lt_1_21250:\n" -" mov.u32 %r49, 0;\n" -" setp.ne.s32 %p15, %r17, %r49;\n" -" @%p15 bra $Lt_1_25346;\n" -" ld.param.u64 %rd46, [__cudaparm_kernel_lj___val_paramengv];\n" -" add.u64 %rd47, %rd46, %rd3;\n" -" ld.param.s32 %r50, [__cudaparm_kernel_lj_eflag];\n" -" mov.u32 %r51, 0;\n" -" setp.le.s32 %p16, %r50, %r51;\n" -" @%p16 bra $Lt_1_25858;\n" -" ld.global.f32 %f87, [%rd47+0];\n" -" add.ftz.f32 %f88, %f87, %f24;\n" -" st.global.f32 [%rd47+0], %f88;\n" -" cvt.s64.s32 %rd48, %r11;\n" -" mul.wide.s32 %rd49, %r11, 4;\n" -" add.u64 %rd47, %rd47, %rd49;\n" -"$Lt_1_25858:\n" -" ld.param.s32 %r52, [__cudaparm_kernel_lj_vflag];\n" -" mov.u32 %r53, 0;\n" -" setp.le.s32 %p17, %r52, %r53;\n" -" @%p17 bra $Lt_1_26370;\n" -" ld.global.f32 %f89, [%rd47+0];\n" -" mov.f32 %f90, %f6;\n" -" add.ftz.f32 %f91, %f89, %f90;\n" -" st.global.f32 [%rd47+0], %f91;\n" -" cvt.s64.s32 %rd50, %r11;\n" -" mul.wide.s32 %rd51, %r11, 4;\n" -" add.u64 %rd52, %rd51, %rd47;\n" -" ld.global.f32 %f92, [%rd52+0];\n" -" mov.f32 %f93, %f8;\n" -" add.ftz.f32 %f94, %f92, %f93;\n" -" st.global.f32 [%rd52+0], %f94;\n" -" add.u64 %rd53, %rd51, %rd52;\n" -" ld.global.f32 %f95, [%rd53+0];\n" -" mov.f32 %f96, %f10;\n" -" add.ftz.f32 %f97, %f95, %f96;\n" -" st.global.f32 [%rd53+0], %f97;\n" -" add.u64 %rd54, %rd51, %rd53;\n" -" ld.global.f32 %f98, [%rd54+0];\n" -" mov.f32 %f99, %f12;\n" -" add.ftz.f32 %f100, %f98, %f99;\n" -" st.global.f32 [%rd54+0], %f100;\n" -" add.u64 %rd55, %rd51, %rd54;\n" -" ld.global.f32 %f101, [%rd55+0];\n" -" mov.f32 %f102, %f14;\n" -" add.ftz.f32 %f103, %f101, %f102;\n" -" st.global.f32 [%rd55+0], %f103;\n" -" add.u64 %rd47, %rd51, %rd55;\n" -" ld.global.f32 %f104, [%rd47+0];\n" -" mov.f32 %f105, %f16;\n" -" add.ftz.f32 %f106, %f104, %f105;\n" -" st.global.f32 [%rd47+0], %f106;\n" -"$Lt_1_26370:\n" -" ld.param.u64 %rd56, [__cudaparm_kernel_lj_ans];\n" -" mul.lo.u64 %rd57, %rd2, 16;\n" -" add.u64 %rd58, %rd56, %rd57;\n" -" ld.global.v4.f32 {%f107,%f108,%f109,%f110}, [%rd58+0];\n" -" add.ftz.f32 %f111, %f108, %f22;\n" -" add.ftz.f32 %f112, %f109, %f21;\n" -" add.ftz.f32 %f113, %f107, %f23;\n" -" st.global.v4.f32 [%rd58+0], {%f113,%f111,%f112,%f110};\n" -"$Lt_1_25346:\n" -"$Lt_1_18690:\n" -" .loc 17 320 0\n" -" exit;\n" -"$LDWend_kernel_lj:\n" -" }\n" -" .entry kernel_lj_fast (\n" -" .param .u64 __cudaparm_kernel_lj_fast_x_,\n" -" .param .u64 __cudaparm_kernel_lj_fast_lj1_in,\n" -" .param .u64 __cudaparm_kernel_lj_fast_lj3_in,\n" -" .param .u64 __cudaparm_kernel_lj_fast_gum,\n" -" .param .s32 __cudaparm_kernel_lj_fast_stride,\n" -" .param .u64 __cudaparm_kernel_lj_fast_dev_ij,\n" -" .param .u64 __cudaparm_kernel_lj_fast_ans,\n" -" .param .u64 __cudaparm_kernel_lj_fast___val_paramengv,\n" -" .param .u64 __cudaparm_kernel_lj_fast_err_flag,\n" -" .param .s32 __cudaparm_kernel_lj_fast_eflag,\n" -" .param .s32 __cudaparm_kernel_lj_fast_vflag,\n" -" .param .s32 __cudaparm_kernel_lj_fast_start,\n" -" .param .s32 __cudaparm_kernel_lj_fast_inum,\n" -" .param .s32 __cudaparm_kernel_lj_fast_t_per_atom)\n" -" {\n" -" .reg .u32 %r<57>;\n" -" .reg .u64 %rd<72>;\n" -" .reg .f32 %f<122>;\n" -" .reg .pred %p<22>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_33188_33_non_const_sp_lj6500[16];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_33189_34_non_const_lj16528[1936];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_33190_34_non_const_lj38464[1936];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_33260_55_non_const_red_acc10400[3072];\n" -" .loc 17 328 0\n" -"$LDWbegin_kernel_lj_fast:\n" -" cvt.s32.u32 %r1, %tid.x;\n" -" mov.u32 %r2, 3;\n" -" setp.gt.s32 %p1, %r1, %r2;\n" -" @%p1 bra $Lt_2_20994;\n" -" .loc 17 337 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_33188_33_non_const_sp_lj6500;\n" -" cvt.s64.s32 %rd2, %r1;\n" -" mul.wide.s32 %rd3, %r1, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_lj_fast_gum];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f32 %f1, [%rd5+12];\n" -" add.u64 %rd6, %rd3, %rd1;\n" -" st.shared.f32 [%rd6+0], %f1;\n" -"$Lt_2_20994:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_33188_33_non_const_sp_lj6500;\n" -" mov.u32 %r3, 120;\n" -" setp.gt.s32 %p2, %r1, %r3;\n" -" @%p2 bra $Lt_2_21506;\n" -" .loc 17 339 0\n" -" mov.u64 %rd7, __cuda___cuda_local_var_33189_34_non_const_lj16528;\n" -" cvt.s64.s32 %rd8, %r1;\n" -" mul.wide.s32 %rd9, %r1, 16;\n" -" ld.param.u64 %rd10, [__cudaparm_kernel_lj_fast_lj1_in];\n" -" add.u64 %rd11, %rd10, %rd9;\n" -" add.u64 %rd12, %rd9, %rd7;\n" -" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n" -" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n" -" ld.param.s32 %r4, [__cudaparm_kernel_lj_fast_eflag];\n" -" mov.u32 %r5, 0;\n" -" setp.le.s32 %p3, %r4, %r5;\n" -" @%p3 bra $Lt_2_22018;\n" -" .loc 17 341 0\n" -" mov.u64 %rd13, __cuda___cuda_local_var_33190_34_non_const_lj38464;\n" -" ld.param.u64 %rd14, [__cudaparm_kernel_lj_fast_lj3_in];\n" -" add.u64 %rd15, %rd14, %rd9;\n" -" add.u64 %rd16, %rd9, %rd13;\n" -" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n" -" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n" -"$Lt_2_22018:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_33190_34_non_const_lj38464;\n" -"$Lt_2_21506:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_33190_34_non_const_lj38464;\n" -" mov.u64 %rd7, __cuda___cuda_local_var_33189_34_non_const_lj16528;\n" -" .loc 17 351 0\n" -" mov.f32 %f10, 0f00000000; \n" -" mov.f32 %f11, %f10;\n" -" mov.f32 %f12, 0f00000000; \n" -" mov.f32 %f13, %f12;\n" -" mov.f32 %f14, 0f00000000; \n" -" mov.f32 %f15, %f14;\n" -" mov.f32 %f16, 0f00000000; \n" -" mov.f32 %f17, %f16;\n" -" mov.f32 %f18, 0f00000000; \n" -" mov.f32 %f19, %f18;\n" -" mov.f32 %f20, 0f00000000; \n" -" mov.f32 %f21, %f20;\n" -" .loc 17 353 0\n" -" bar.sync 0;\n" -" ld.param.s32 %r6, [__cudaparm_kernel_lj_fast_t_per_atom];\n" -" div.s32 %r7, %r1, %r6;\n" -" cvt.s32.u32 %r8, %ntid.x;\n" -" div.s32 %r9, %r8, %r6;\n" -" cvt.s32.u32 %r10, %ctaid.x;\n" -" mul.lo.s32 %r11, %r10, %r9;\n" -" add.s32 %r12, %r7, %r11;\n" -" ld.param.s32 %r13, [__cudaparm_kernel_lj_fast_start];\n" -" add.s32 %r14, %r13, %r12;\n" -" ld.param.s32 %r15, [__cudaparm_kernel_lj_fast_inum];\n" -" setp.ge.s32 %p4, %r14, %r15;\n" -" @%p4 bra $Lt_2_29186;\n" -" .loc 17 358 0\n" -" cvt.s64.s32 %rd17, %r14;\n" -" mul.wide.s32 %rd18, %r14, 4;\n" -" ld.param.u64 %rd19, [__cudaparm_kernel_lj_fast_dev_ij];\n" -" add.u64 %rd20, %rd19, %rd18;\n" -" ld.global.s32 %r16, [%rd20+0];\n" -" ld.param.s32 %r17, [__cudaparm_kernel_lj_fast_stride];\n" -" cvt.s64.s32 %rd21, %r17;\n" -" mul.wide.s32 %rd22, %r17, 4;\n" -" add.u64 %rd23, %rd22, %rd20;\n" -" ld.global.s32 %r18, [%rd23+0];\n" -" .loc 17 361 0\n" -" ld.param.u64 %rd24, [__cudaparm_kernel_lj_fast_x_];\n" -" cvt.s64.s32 %rd25, %r16;\n" -" mul.wide.s32 %rd26, %r16, 16;\n" -" add.u64 %rd27, %rd24, %rd26;\n" -" ld.global.v4.f32 {%f22,%f23,%f24,%f25}, [%rd27+0];\n" -" .loc 17 363 0\n" -" cvt.s32.s64 %r19, %rd21;\n" -" sub.s32 %r20, %r6, 1;\n" -" and.b32 %r21, %r20, %r1;\n" -" add.u64 %rd28, %rd22, %rd23;\n" -" mul.lo.s32 %r22, %r19, %r21;\n" -" cvt.s64.s32 %rd29, %r22;\n" -" mul.wide.s32 %rd30, %r22, 4;\n" -" add.u64 %rd31, %rd28, %rd30;\n" -" mov.s64 %rd32, %rd31;\n" -" mul.lo.s32 %r23, %r19, %r18;\n" -" cvt.s64.s32 %rd33, %r23;\n" -" mul.wide.s32 %rd34, %r23, 4;\n" -" add.u64 %rd35, %rd28, %rd34;\n" -" setp.ge.u64 %p5, %rd31, %rd35;\n" -" @%p5 bra $Lt_2_30722;\n" -" cvt.rzi.ftz.s32.f32 %r24, %f25;\n" -" mul.lo.s32 %r25, %r24, 11;\n" -" cvt.rn.f32.s32 %f26, %r25;\n" -" mov.f32 %f27, 0f00000000; \n" -" mov.f32 %f28, 0f00000000; \n" -" mov.f32 %f29, 0f00000000; \n" -" mov.f32 %f30, 0f00000000; \n" -"$Lt_2_23554:\n" -" .loc 17 368 0\n" -" ld.global.s32 %r26, [%rd32+0];\n" -" .loc 17 369 0\n" -" shr.s32 %r27, %r26, 30;\n" -" and.b32 %r28, %r27, 3;\n" -" cvt.s64.s32 %rd36, %r28;\n" -" mul.wide.s32 %rd37, %r28, 4;\n" -" add.u64 %rd38, %rd1, %rd37;\n" -" ld.shared.f32 %f31, [%rd38+0];\n" -" .loc 17 372 0\n" -" and.b32 %r29, %r26, 1073741823;\n" -" cvt.s64.s32 %rd39, %r29;\n" -" mul.wide.s32 %rd40, %r29, 16;\n" -" add.u64 %rd41, %rd24, %rd40;\n" -" ld.global.v4.f32 {%f32,%f33,%f34,%f35}, [%rd41+0];\n" -" .loc 17 368 0\n" -" sub.ftz.f32 %f36, %f23, %f33;\n" -" sub.ftz.f32 %f37, %f22, %f32;\n" -" sub.ftz.f32 %f38, %f24, %f34;\n" -" mul.ftz.f32 %f39, %f36, %f36;\n" -" fma.rn.ftz.f32 %f40, %f37, %f37, %f39;\n" -" fma.rn.ftz.f32 %f41, %f38, %f38, %f40;\n" -" add.ftz.f32 %f42, %f26, %f35;\n" -" cvt.rzi.ftz.s32.f32 %r30, %f42;\n" -" cvt.s64.s32 %rd42, %r30;\n" -" mul.wide.s32 %rd43, %r30, 16;\n" -" add.u64 %rd44, %rd43, %rd7;\n" -" ld.shared.f32 %f43, [%rd44+8];\n" -" setp.gt.ftz.f32 %p6, %f43, %f41;\n" -" @!%p6 bra $Lt_2_30978;\n" -" ld.shared.f32 %f44, [%rd44+12];\n" -" mov.f32 %f45, 0f00000000; \n" -" setp.eq.ftz.f32 %p7, %f44, %f45;\n" -" @!%p7 bra $Lt_2_30978;\n" -" .loc 17 384 0\n" -" rcp.approx.ftz.f32 %f46, %f41;\n" -" mul.ftz.f32 %f47, %f46, %f46;\n" -" mul.ftz.f32 %f48, %f46, %f47;\n" -" mul.ftz.f32 %f49, %f46, %f31;\n" -" mul.ftz.f32 %f50, %f48, %f49;\n" -" ld.shared.v2.f32 {%f51,%f52}, [%rd44+0];\n" -" mul.ftz.f32 %f53, %f51, %f48;\n" -" sub.ftz.f32 %f54, %f53, %f52;\n" -" mul.ftz.f32 %f55, %f50, %f54;\n" -" .loc 17 386 0\n" -" fma.rn.ftz.f32 %f29, %f37, %f55, %f29;\n" -" .loc 17 387 0\n" -" fma.rn.ftz.f32 %f28, %f36, %f55, %f28;\n" -" .loc 17 388 0\n" -" fma.rn.ftz.f32 %f27, %f38, %f55, %f27;\n" -" ld.param.s32 %r31, [__cudaparm_kernel_lj_fast_eflag];\n" -" mov.u32 %r32, 0;\n" -" setp.le.s32 %p8, %r31, %r32;\n" -" @%p8 bra $Lt_2_23810;\n" -" .loc 17 391 0\n" -" add.u64 %rd45, %rd43, %rd13;\n" -" ld.shared.v4.f32 {%f56,%f57,%f58,_}, [%rd45+0];\n" -" mul.ftz.f32 %f59, %f56, %f48;\n" -" sub.ftz.f32 %f60, %f59, %f57;\n" -" mul.ftz.f32 %f61, %f48, %f60;\n" -" .loc 17 392 0\n" -" sub.ftz.f32 %f62, %f61, %f58;\n" -" fma.rn.ftz.f32 %f30, %f31, %f62, %f30;\n" -"$Lt_2_23810:\n" -" ld.param.s32 %r33, [__cudaparm_kernel_lj_fast_vflag];\n" -" mov.u32 %r34, 0;\n" -" setp.le.s32 %p9, %r33, %r34;\n" -" @%p9 bra $Lt_2_30978;\n" -" .loc 17 395 0\n" -" mov.f32 %f63, %f11;\n" -" mul.ftz.f32 %f64, %f37, %f37;\n" -" fma.rn.ftz.f32 %f65, %f55, %f64, %f63;\n" -" mov.f32 %f11, %f65;\n" -" .loc 17 396 0\n" -" mov.f32 %f66, %f13;\n" -" fma.rn.ftz.f32 %f67, %f55, %f39, %f66;\n" -" mov.f32 %f13, %f67;\n" -" .loc 17 397 0\n" -" mov.f32 %f68, %f15;\n" -" mul.ftz.f32 %f69, %f38, %f38;\n" -" fma.rn.ftz.f32 %f70, %f55, %f69, %f68;\n" -" mov.f32 %f15, %f70;\n" -" .loc 17 398 0\n" -" mov.f32 %f71, %f17;\n" -" mul.ftz.f32 %f72, %f36, %f37;\n" -" fma.rn.ftz.f32 %f73, %f55, %f72, %f71;\n" -" mov.f32 %f17, %f73;\n" -" .loc 17 399 0\n" -" mov.f32 %f74, %f19;\n" -" mul.ftz.f32 %f75, %f37, %f38;\n" -" fma.rn.ftz.f32 %f76, %f55, %f75, %f74;\n" -" mov.f32 %f19, %f76;\n" -" .loc 17 400 0\n" -" mul.ftz.f32 %f77, %f36, %f38;\n" -" fma.rn.ftz.f32 %f20, %f55, %f77, %f20;\n" -" mov.f32 %f21, %f20;\n" -"$Lt_2_30978:\n" -"$L_2_20482:\n" -" .loc 17 394 0\n" -" mul.lo.s32 %r35, %r19, %r6;\n" -" cvt.s64.s32 %rd46, %r35;\n" -" mul.wide.s32 %rd47, %r35, 4;\n" -" add.u64 %rd32, %rd32, %rd47;\n" -" setp.gt.u64 %p10, %rd35, %rd32;\n" -" @%p10 bra $Lt_2_23554;\n" -" bra.uni $Lt_2_23042;\n" -"$Lt_2_30722:\n" -" mov.f32 %f27, 0f00000000; \n" -" mov.f32 %f28, 0f00000000; \n" -" mov.f32 %f29, 0f00000000; \n" -" mov.f32 %f30, 0f00000000; \n" -"$Lt_2_23042:\n" -" mov.u32 %r36, 1;\n" -" setp.le.s32 %p11, %r6, %r36;\n" -" @%p11 bra $Lt_2_27138;\n" -" .loc 17 405 0\n" -" mov.u64 %rd48, __cuda___cuda_local_var_33260_55_non_const_red_acc10400;\n" -" cvt.s64.s32 %rd49, %r1;\n" -" mul.wide.s32 %rd50, %r1, 4;\n" -" add.u64 %rd51, %rd48, %rd50;\n" -" mov.f32 %f78, %f29;\n" -" st.shared.f32 [%rd51+0], %f78;\n" -" mov.f32 %f79, %f28;\n" -" st.shared.f32 [%rd51+512], %f79;\n" -" mov.f32 %f80, %f27;\n" -" st.shared.f32 [%rd51+1024], %f80;\n" -" mov.f32 %f81, %f30;\n" -" st.shared.f32 [%rd51+1536], %f81;\n" -" shr.s32 %r37, %r6, 31;\n" -" mov.s32 %r38, 1;\n" -" and.b32 %r39, %r37, %r38;\n" -" add.s32 %r40, %r39, %r6;\n" -" shr.s32 %r41, %r40, 1;\n" -" mov.s32 %r42, %r41;\n" -" mov.u32 %r43, 0;\n" -" setp.ne.u32 %p12, %r41, %r43;\n" -" @!%p12 bra $Lt_2_25602;\n" -"$Lt_2_26114:\n" -" setp.ge.u32 %p13, %r21, %r42;\n" -" @%p13 bra $Lt_2_26370;\n" -" add.u32 %r44, %r1, %r42;\n" -" cvt.u64.u32 %rd52, %r44;\n" -" mul.wide.u32 %rd53, %r44, 4;\n" -" add.u64 %rd54, %rd48, %rd53;\n" -" ld.shared.f32 %f82, [%rd54+0];\n" -" add.ftz.f32 %f78, %f82, %f78;\n" -" st.shared.f32 [%rd51+0], %f78;\n" -" ld.shared.f32 %f83, [%rd54+512];\n" -" add.ftz.f32 %f79, %f83, %f79;\n" -" st.shared.f32 [%rd51+512], %f79;\n" -" ld.shared.f32 %f84, [%rd54+1024];\n" -" add.ftz.f32 %f80, %f84, %f80;\n" -" st.shared.f32 [%rd51+1024], %f80;\n" -" ld.shared.f32 %f85, [%rd54+1536];\n" -" add.ftz.f32 %f81, %f85, %f81;\n" -" st.shared.f32 [%rd51+1536], %f81;\n" -"$Lt_2_26370:\n" -" shr.u32 %r42, %r42, 1;\n" -" mov.u32 %r45, 0;\n" -" setp.ne.u32 %p14, %r42, %r45;\n" -" @%p14 bra $Lt_2_26114;\n" -"$Lt_2_25602:\n" -" mov.f32 %f29, %f78;\n" -" mov.f32 %f28, %f79;\n" -" mov.f32 %f27, %f80;\n" -" mov.f32 %f30, %f81;\n" -" ld.param.s32 %r46, [__cudaparm_kernel_lj_fast_vflag];\n" -" mov.u32 %r47, 0;\n" -" setp.le.s32 %p15, %r46, %r47;\n" -" @%p15 bra $Lt_2_27138;\n" -" mov.f32 %f78, %f11;\n" -" st.shared.f32 [%rd51+0], %f78;\n" -" mov.f32 %f79, %f13;\n" -" st.shared.f32 [%rd51+512], %f79;\n" -" mov.f32 %f80, %f15;\n" -" st.shared.f32 [%rd51+1024], %f80;\n" -" mov.f32 %f81, %f17;\n" -" st.shared.f32 [%rd51+1536], %f81;\n" -" mov.f32 %f86, %f19;\n" -" st.shared.f32 [%rd51+2048], %f86;\n" -" mov.f32 %f87, %f20;\n" -" st.shared.f32 [%rd51+2560], %f87;\n" -" mov.s32 %r48, %r41;\n" -" @!%p12 bra $Lt_2_27650;\n" -"$Lt_2_28162:\n" -" setp.ge.u32 %p16, %r21, %r48;\n" -" @%p16 bra $Lt_2_28418;\n" -" add.u32 %r49, %r1, %r48;\n" -" cvt.u64.u32 %rd55, %r49;\n" -" mul.wide.u32 %rd56, %r49, 4;\n" -" add.u64 %rd57, %rd48, %rd56;\n" -" ld.shared.f32 %f88, [%rd57+0];\n" -" add.ftz.f32 %f78, %f88, %f78;\n" -" st.shared.f32 [%rd51+0], %f78;\n" -" ld.shared.f32 %f89, [%rd57+512];\n" -" add.ftz.f32 %f79, %f89, %f79;\n" -" st.shared.f32 [%rd51+512], %f79;\n" -" ld.shared.f32 %f90, [%rd57+1024];\n" -" add.ftz.f32 %f80, %f90, %f80;\n" -" st.shared.f32 [%rd51+1024], %f80;\n" -" ld.shared.f32 %f91, [%rd57+1536];\n" -" add.ftz.f32 %f81, %f91, %f81;\n" -" st.shared.f32 [%rd51+1536], %f81;\n" -" ld.shared.f32 %f92, [%rd57+2048];\n" -" add.ftz.f32 %f86, %f92, %f86;\n" -" st.shared.f32 [%rd51+2048], %f86;\n" -" ld.shared.f32 %f93, [%rd57+2560];\n" -" add.ftz.f32 %f87, %f93, %f87;\n" -" st.shared.f32 [%rd51+2560], %f87;\n" -"$Lt_2_28418:\n" -" shr.u32 %r48, %r48, 1;\n" -" mov.u32 %r50, 0;\n" -" setp.ne.u32 %p17, %r48, %r50;\n" -" @%p17 bra $Lt_2_28162;\n" -"$Lt_2_27650:\n" -" mov.f32 %f11, %f78;\n" -" mov.f32 %f13, %f79;\n" -" mov.f32 %f15, %f80;\n" -" mov.f32 %f17, %f81;\n" -" mov.f32 %f19, %f86;\n" -" mov.f32 %f21, %f87;\n" -"$Lt_2_27138:\n" -"$Lt_2_25090:\n" -" mov.u32 %r51, 0;\n" -" setp.ne.s32 %p18, %r21, %r51;\n" -" @%p18 bra $Lt_2_29186;\n" -" ld.param.u64 %rd58, [__cudaparm_kernel_lj_fast___val_paramengv];\n" -" add.u64 %rd59, %rd58, %rd18;\n" -" ld.param.s32 %r52, [__cudaparm_kernel_lj_fast_eflag];\n" -" mov.u32 %r53, 0;\n" -" setp.le.s32 %p19, %r52, %r53;\n" -" @%p19 bra $Lt_2_29698;\n" -" ld.global.f32 %f94, [%rd59+0];\n" -" add.ftz.f32 %f95, %f94, %f30;\n" -" st.global.f32 [%rd59+0], %f95;\n" -" cvt.s64.s32 %rd60, %r15;\n" -" mul.wide.s32 %rd61, %r15, 4;\n" -" add.u64 %rd59, %rd59, %rd61;\n" -"$Lt_2_29698:\n" -" ld.param.s32 %r54, [__cudaparm_kernel_lj_fast_vflag];\n" -" mov.u32 %r55, 0;\n" -" setp.le.s32 %p20, %r54, %r55;\n" -" @%p20 bra $Lt_2_30210;\n" -" ld.global.f32 %f96, [%rd59+0];\n" -" mov.f32 %f97, %f11;\n" -" add.ftz.f32 %f98, %f96, %f97;\n" -" st.global.f32 [%rd59+0], %f98;\n" -" cvt.s64.s32 %rd62, %r15;\n" -" mul.wide.s32 %rd63, %r15, 4;\n" -" add.u64 %rd64, %rd63, %rd59;\n" -" ld.global.f32 %f99, [%rd64+0];\n" -" mov.f32 %f100, %f13;\n" -" add.ftz.f32 %f101, %f99, %f100;\n" -" st.global.f32 [%rd64+0], %f101;\n" -" add.u64 %rd65, %rd63, %rd64;\n" -" ld.global.f32 %f102, [%rd65+0];\n" -" mov.f32 %f103, %f15;\n" -" add.ftz.f32 %f104, %f102, %f103;\n" -" st.global.f32 [%rd65+0], %f104;\n" -" add.u64 %rd66, %rd63, %rd65;\n" -" ld.global.f32 %f105, [%rd66+0];\n" -" mov.f32 %f106, %f17;\n" -" add.ftz.f32 %f107, %f105, %f106;\n" -" st.global.f32 [%rd66+0], %f107;\n" -" add.u64 %rd67, %rd63, %rd66;\n" -" ld.global.f32 %f108, [%rd67+0];\n" -" mov.f32 %f109, %f19;\n" -" add.ftz.f32 %f110, %f108, %f109;\n" -" st.global.f32 [%rd67+0], %f110;\n" -" add.u64 %rd59, %rd63, %rd67;\n" -" ld.global.f32 %f111, [%rd59+0];\n" -" mov.f32 %f112, %f21;\n" -" add.ftz.f32 %f113, %f111, %f112;\n" -" st.global.f32 [%rd59+0], %f113;\n" -"$Lt_2_30210:\n" -" ld.param.u64 %rd68, [__cudaparm_kernel_lj_fast_ans];\n" -" mul.lo.u64 %rd69, %rd17, 16;\n" -" add.u64 %rd70, %rd68, %rd69;\n" -" ld.global.v4.f32 {%f114,%f115,%f116,%f117}, [%rd70+0];\n" -" add.ftz.f32 %f118, %f115, %f28;\n" -" add.ftz.f32 %f119, %f116, %f27;\n" -" add.ftz.f32 %f120, %f114, %f29;\n" -" st.global.v4.f32 [%rd70+0], {%f120,%f118,%f119,%f117};\n" -"$Lt_2_29186:\n" -"$Lt_2_22530:\n" -" .loc 17 408 0\n" -" exit;\n" -"$LDWend_kernel_lj_fast:\n" -" }\n" -; diff --git a/lib/gpu/gayberne_ptx.h b/lib/gpu/gayberne_ptx.h deleted file mode 100644 index 55d334213c..0000000000 --- a/lib/gpu/gayberne_ptx.h +++ /dev/null @@ -1,1540 +0,0 @@ -const char * gayberne = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .entry kernel_ellipsoid (\n" -" .param .u64 __cudaparm_kernel_ellipsoid_x_,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_q,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_shape,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_well,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_gum,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_sig_eps,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_ntypes,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_lshape,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_dev_nbor,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_stride,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_ans,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_astride,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_engv,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_err_flag,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_eflag,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_vflag,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_inum,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_t_per_atom)\n" -" {\n" -" .reg .u32 %r<67>;\n" -" .reg .u64 %rd<83>;\n" -" .reg .f32 %f<898>;\n" -" .reg .pred %p<35>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32950_33_non_const_sp_lj128[16];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_33207_55_non_const_red_acc144[3584];\n" -" .loc 17 91 0\n" -"$LDWbegin_kernel_ellipsoid:\n" -" .loc 17 96 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_ellipsoid_gum];\n" -" ldu.global.f32 %f1, [%rd1+12];\n" -" .loc 17 97 0\n" -" ld.global.f32 %f2, [%rd1+16];\n" -" .loc 17 98 0\n" -" ld.global.f32 %f3, [%rd1+20];\n" -" .loc 17 99 0\n" -" ld.global.f32 %f4, [%rd1+24];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32950_33_non_const_sp_lj128+0], {%f1,%f2,%f3,%f4};\n" -" .loc 17 112 0\n" -" mov.f32 %f5, 0f00000000; \n" -" mov.f32 %f6, %f5;\n" -" mov.f32 %f7, 0f00000000; \n" -" mov.f32 %f8, %f7;\n" -" mov.f32 %f9, 0f00000000; \n" -" mov.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f00000000; \n" -" mov.f32 %f12, %f11;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, %f13;\n" -" mov.f32 %f15, 0f00000000; \n" -" mov.f32 %f16, %f15;\n" -" ld.param.s32 %r1, [__cudaparm_kernel_ellipsoid_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_ellipsoid_inum];\n" -" setp.le.s32 %p1, %r9, %r8;\n" -" @%p1 bra $Lt_0_55298;\n" -" .loc 17 117 0\n" -" cvt.s64.s32 %rd2, %r8;\n" -" mul.wide.s32 %rd3, %r8, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_ellipsoid_dev_nbor];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.s32 %r10, [%rd5+0];\n" -" ld.param.s32 %r11, [__cudaparm_kernel_ellipsoid_stride];\n" -" cvt.s64.s32 %rd6, %r11;\n" -" mul.wide.s32 %rd7, %r11, 4;\n" -" add.u64 %rd8, %rd7, %rd5;\n" -" ld.global.s32 %r12, [%rd8+0];\n" -" .loc 17 120 0\n" -" cvt.s64.s32 %rd9, %r10;\n" -" mul.wide.s32 %rd10, %r10, 16;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_ellipsoid_x_];\n" -" add.u64 %rd12, %rd10, %rd11;\n" -" ld.global.v4.f32 {%f17,%f18,%f19,%f20}, [%rd12+0];\n" -" .loc 17 123 0\n" -" cvt.rzi.ftz.s32.f32 %r13, %f20;\n" -" cvt.s64.s32 %rd13, %r13;\n" -" mul.wide.s32 %rd14, %r13, 16;\n" -" ld.param.u64 %rd15, [__cudaparm_kernel_ellipsoid_shape];\n" -" add.u64 %rd16, %rd14, %rd15;\n" -" ld.global.v4.f32 {%f21,%f22,%f23,_}, [%rd16+0];\n" -" .loc 17 126 0\n" -" ld.param.u64 %rd17, [__cudaparm_kernel_ellipsoid_q];\n" -" add.u64 %rd18, %rd10, %rd17;\n" -" ld.global.v4.f32 {%f24,%f25,%f26,%f27}, [%rd18+0];\n" -" .loc 17 129 0\n" -" ld.param.u64 %rd19, [__cudaparm_kernel_ellipsoid_well];\n" -" add.u64 %rd20, %rd14, %rd19;\n" -" ld.global.v4.f32 {%f28,%f29,%f30,_}, [%rd20+0];\n" -" .loc 17 130 0\n" -" cvt.s32.s64 %r14, %rd6;\n" -" sub.s32 %r15, %r1, 1;\n" -" and.b32 %r16, %r15, %r2;\n" -" add.u64 %rd21, %rd7, %rd8;\n" -" mul.lo.s32 %r17, %r14, %r16;\n" -" cvt.s64.s32 %rd22, %r17;\n" -" mul.wide.s32 %rd23, %r17, 4;\n" -" add.u64 %rd24, %rd21, %rd23;\n" -" mov.s64 %rd25, %rd24;\n" -" mul.lo.s32 %r18, %r14, %r12;\n" -" cvt.s64.s32 %rd26, %r18;\n" -" mul.wide.s32 %rd27, %r18, 4;\n" -" add.u64 %rd28, %rd21, %rd27;\n" -" setp.ge.u64 %p2, %rd24, %rd28;\n" -" @%p2 bra $Lt_0_56834;\n" -" ld.param.s32 %r19, [__cudaparm_kernel_ellipsoid_eflag];\n" -" mov.s32 %r20, 0;\n" -" setp.gt.s32 %p3, %r19, %r20;\n" -" ld.param.s32 %r21, [__cudaparm_kernel_ellipsoid_vflag];\n" -" mov.s32 %r22, 0;\n" -" setp.gt.s32 %p4, %r21, %r22;\n" -" add.ftz.f32 %f31, %f25, %f25;\n" -" add.ftz.f32 %f32, %f27, %f27;\n" -" mul.ftz.f32 %f33, %f24, %f24;\n" -" mul.ftz.f32 %f34, %f25, %f25;\n" -" mul.ftz.f32 %f35, %f26, %f26;\n" -" mul.ftz.f32 %f36, %f27, %f27;\n" -" add.ftz.f32 %f37, %f26, %f26;\n" -" ld.param.s32 %r23, [__cudaparm_kernel_ellipsoid_ntypes];\n" -" mul.lo.s32 %r24, %r23, %r13;\n" -" mul.ftz.f32 %f38, %f31, %f26;\n" -" mul.ftz.f32 %f39, %f31, %f27;\n" -" mul.ftz.f32 %f40, %f31, %f24;\n" -" mul.ftz.f32 %f41, %f32, %f24;\n" -" add.ftz.f32 %f42, %f33, %f34;\n" -" sub.ftz.f32 %f43, %f33, %f34;\n" -" mul.ftz.f32 %f44, %f37, %f24;\n" -" mul.ftz.f32 %f45, %f37, %f27;\n" -" sub.ftz.f32 %f46, %f38, %f41;\n" -" add.ftz.f32 %f47, %f38, %f41;\n" -" sub.ftz.f32 %f48, %f42, %f35;\n" -" add.ftz.f32 %f49, %f35, %f43;\n" -" sub.ftz.f32 %f50, %f43, %f35;\n" -" add.ftz.f32 %f51, %f39, %f44;\n" -" sub.ftz.f32 %f52, %f39, %f44;\n" -" sub.ftz.f32 %f53, %f45, %f40;\n" -" add.ftz.f32 %f54, %f40, %f45;\n" -" ld.param.u64 %rd29, [__cudaparm_kernel_ellipsoid_lshape];\n" -" mul.lo.u64 %rd30, %rd13, 4;\n" -" add.u64 %rd31, %rd29, %rd30;\n" -" mul.ftz.f32 %f55, %f46, %f22;\n" -" mul.ftz.f32 %f56, %f46, %f29;\n" -" mul.ftz.f32 %f57, %f47, %f21;\n" -" mul.ftz.f32 %f58, %f47, %f28;\n" -" sub.ftz.f32 %f59, %f48, %f36;\n" -" sub.ftz.f32 %f60, %f49, %f36;\n" -" add.ftz.f32 %f61, %f36, %f50;\n" -" mul.ftz.f32 %f62, %f51, %f23;\n" -" mul.ftz.f32 %f63, %f51, %f30;\n" -" add.ftz.f32 %f64, %f51, %f51;\n" -" mul.ftz.f32 %f65, %f52, %f21;\n" -" mul.ftz.f32 %f66, %f52, %f28;\n" -" mul.ftz.f32 %f67, %f53, %f23;\n" -" mul.ftz.f32 %f68, %f53, %f30;\n" -" add.ftz.f32 %f69, %f53, %f53;\n" -" mul.ftz.f32 %f70, %f54, %f22;\n" -" mul.ftz.f32 %f71, %f54, %f29;\n" -" mul.ftz.f32 %f72, %f46, %f55;\n" -" mul.ftz.f32 %f73, %f54, %f55;\n" -" mul.ftz.f32 %f74, %f46, %f56;\n" -" mul.ftz.f32 %f75, %f54, %f56;\n" -" mul.ftz.f32 %f76, %f59, %f21;\n" -" mul.ftz.f32 %f77, %f59, %f28;\n" -" mul.ftz.f32 %f78, %f60, %f22;\n" -" mul.ftz.f32 %f79, %f55, %f60;\n" -" mul.ftz.f32 %f80, %f60, %f29;\n" -" mul.ftz.f32 %f81, %f56, %f60;\n" -" mul.ftz.f32 %f82, %f61, %f23;\n" -" mul.ftz.f32 %f83, %f61, %f30;\n" -" add.ftz.f32 %f84, %f61, %f61;\n" -" mul.ftz.f32 %f85, %f46, %f70;\n" -" mul.ftz.f32 %f86, %f60, %f70;\n" -" mul.ftz.f32 %f87, %f54, %f70;\n" -" mul.ftz.f32 %f88, %f46, %f71;\n" -" mul.ftz.f32 %f89, %f60, %f71;\n" -" mul.ftz.f32 %f90, %f54, %f71;\n" -" fma.rn.ftz.f32 %f91, %f59, %f76, %f72;\n" -" fma.rn.ftz.f32 %f92, %f76, %f52, %f73;\n" -" fma.rn.ftz.f32 %f93, %f59, %f77, %f74;\n" -" fma.rn.ftz.f32 %f94, %f77, %f52, %f75;\n" -" mul.ftz.f32 %f95, %f46, %f78;\n" -" mul.ftz.f32 %f96, %f60, %f78;\n" -" mul.ftz.f32 %f97, %f54, %f78;\n" -" fma.rn.ftz.f32 %f98, %f76, %f47, %f79;\n" -" mul.ftz.f32 %f99, %f46, %f80;\n" -" mul.ftz.f32 %f100, %f60, %f80;\n" -" mul.ftz.f32 %f101, %f54, %f80;\n" -" fma.rn.ftz.f32 %f102, %f77, %f47, %f81;\n" -" fma.rn.ftz.f32 %f103, %f59, %f65, %f85;\n" -" fma.rn.ftz.f32 %f104, %f47, %f65, %f86;\n" -" fma.rn.ftz.f32 %f105, %f52, %f65, %f87;\n" -" fma.rn.ftz.f32 %f106, %f59, %f66, %f88;\n" -" fma.rn.ftz.f32 %f107, %f47, %f66, %f89;\n" -" fma.rn.ftz.f32 %f108, %f52, %f66, %f90;\n" -" fma.rn.ftz.f32 %f109, %f51, %f62, %f91;\n" -" fma.rn.ftz.f32 %f110, %f62, %f61, %f92;\n" -" fma.rn.ftz.f32 %f111, %f51, %f63, %f93;\n" -" fma.rn.ftz.f32 %f112, %f63, %f61, %f94;\n" -" fma.rn.ftz.f32 %f113, %f59, %f57, %f95;\n" -" fma.rn.ftz.f32 %f114, %f47, %f57, %f96;\n" -" fma.rn.ftz.f32 %f115, %f57, %f52, %f97;\n" -" fma.rn.ftz.f32 %f116, %f62, %f53, %f98;\n" -" fma.rn.ftz.f32 %f117, %f59, %f58, %f99;\n" -" fma.rn.ftz.f32 %f118, %f47, %f58, %f100;\n" -" fma.rn.ftz.f32 %f119, %f58, %f52, %f101;\n" -" fma.rn.ftz.f32 %f120, %f63, %f53, %f102;\n" -" fma.rn.ftz.f32 %f121, %f51, %f82, %f103;\n" -" fma.rn.ftz.f32 %f122, %f53, %f82, %f104;\n" -" fma.rn.ftz.f32 %f123, %f61, %f82, %f105;\n" -" fma.rn.ftz.f32 %f124, %f51, %f83, %f106;\n" -" fma.rn.ftz.f32 %f125, %f53, %f83, %f107;\n" -" fma.rn.ftz.f32 %f126, %f61, %f83, %f108;\n" -" fma.rn.ftz.f32 %f127, %f51, %f67, %f113;\n" -" fma.rn.ftz.f32 %f128, %f53, %f67, %f114;\n" -" fma.rn.ftz.f32 %f129, %f67, %f61, %f115;\n" -" fma.rn.ftz.f32 %f130, %f51, %f68, %f117;\n" -" fma.rn.ftz.f32 %f131, %f53, %f68, %f118;\n" -" fma.rn.ftz.f32 %f132, %f68, %f61, %f119;\n" -" ld.param.u64 %rd32, [__cudaparm_kernel_ellipsoid_sig_eps];\n" -" mov.f32 %f133, 0f00000000; \n" -" mov.f32 %f134, 0f00000000; \n" -" mov.f32 %f135, 0f00000000; \n" -" mov.f32 %f136, 0f00000000; \n" -" mov.f32 %f137, 0f00000000; \n" -" mov.f32 %f138, 0f00000000; \n" -" mov.f32 %f139, 0f00000000; \n" -" mov.u64 %rd33, __cuda___cuda_local_var_32950_33_non_const_sp_lj128;\n" -"$Lt_0_40962:\n" -" .loc 17 135 0\n" -" ld.global.s32 %r25, [%rd25+0];\n" -" .loc 17 136 0\n" -" shr.s32 %r26, %r25, 30;\n" -" and.b32 %r27, %r26, 3;\n" -" cvt.s64.s32 %rd34, %r27;\n" -" mul.wide.s32 %rd35, %r27, 4;\n" -" add.u64 %rd36, %rd33, %rd35;\n" -" ld.shared.f32 %f140, [%rd36+0];\n" -" .loc 17 139 0\n" -" and.b32 %r28, %r25, 1073741823;\n" -" cvt.s64.s32 %rd37, %r28;\n" -" mul.wide.s32 %rd38, %r28, 16;\n" -" add.u64 %rd39, %rd38, %rd11;\n" -" ld.global.v4.f32 {%f141,%f142,%f143,%f144}, [%rd39+0];\n" -" .loc 17 153 0\n" -" add.u64 %rd40, %rd38, %rd17;\n" -" ld.global.v4.f32 {%f145,%f146,%f147,%f148}, [%rd40+0];\n" -" .loc 17 162 0\n" -" cvt.rzi.ftz.s32.f32 %r29, %f144;\n" -" cvt.s64.s32 %rd41, %r29;\n" -" mul.wide.s32 %rd42, %r29, 16;\n" -" add.u64 %rd43, %rd42, %rd15;\n" -" ld.global.v4.f32 {%f149,%f150,%f151,_}, [%rd43+0];\n" -" .loc 16 299 0\n" -" sub.ftz.f32 %f152, %f141, %f17;\n" -" mov.f32 %f153, %f152;\n" -" .loc 16 300 0\n" -" add.ftz.f32 %f154, %f146, %f146;\n" -" add.ftz.f32 %f155, %f148, %f148;\n" -" mul.ftz.f32 %f156, %f145, %f145;\n" -" mul.ftz.f32 %f157, %f146, %f146;\n" -" mul.ftz.f32 %f158, %f147, %f147;\n" -" mul.ftz.f32 %f159, %f148, %f148;\n" -" add.ftz.f32 %f160, %f147, %f147;\n" -" mul.ftz.f32 %f161, %f154, %f147;\n" -" mul.ftz.f32 %f162, %f154, %f148;\n" -" mul.ftz.f32 %f163, %f155, %f145;\n" -" add.ftz.f32 %f164, %f156, %f157;\n" -" mul.ftz.f32 %f165, %f160, %f145;\n" -" sub.ftz.f32 %f166, %f161, %f163;\n" -" sub.ftz.f32 %f167, %f164, %f158;\n" -" add.ftz.f32 %f168, %f162, %f165;\n" -" mul.ftz.f32 %f169, %f166, %f150;\n" -" sub.ftz.f32 %f170, %f167, %f159;\n" -" mul.ftz.f32 %f171, %f168, %f151;\n" -" mul.ftz.f32 %f172, %f166, %f169;\n" -" mul.ftz.f32 %f173, %f170, %f149;\n" -" fma.rn.ftz.f32 %f174, %f170, %f173, %f172;\n" -" fma.rn.ftz.f32 %f175, %f168, %f171, %f174;\n" -" add.ftz.f32 %f176, %f109, %f175;\n" -" mov.f32 %f177, %f176;\n" -" .loc 16 301 0\n" -" mul.ftz.f32 %f178, %f154, %f145;\n" -" sub.ftz.f32 %f179, %f156, %f157;\n" -" mul.ftz.f32 %f180, %f160, %f148;\n" -" add.ftz.f32 %f181, %f161, %f163;\n" -" add.ftz.f32 %f182, %f158, %f179;\n" -" sub.ftz.f32 %f183, %f180, %f178;\n" -" mul.ftz.f32 %f184, %f181, %f149;\n" -" sub.ftz.f32 %f185, %f182, %f159;\n" -" mul.ftz.f32 %f186, %f183, %f151;\n" -" mul.ftz.f32 %f187, %f185, %f150;\n" -" mul.ftz.f32 %f188, %f166, %f187;\n" -" fma.rn.ftz.f32 %f189, %f170, %f184, %f188;\n" -" fma.rn.ftz.f32 %f190, %f168, %f186, %f189;\n" -" add.ftz.f32 %f191, %f127, %f190;\n" -" mov.f32 %f192, %f191;\n" -" .loc 16 302 0\n" -" sub.ftz.f32 %f193, %f179, %f158;\n" -" sub.ftz.f32 %f194, %f162, %f165;\n" -" add.ftz.f32 %f195, %f178, %f180;\n" -" add.ftz.f32 %f196, %f159, %f193;\n" -" mul.ftz.f32 %f197, %f194, %f149;\n" -" mul.ftz.f32 %f198, %f195, %f150;\n" -" mul.ftz.f32 %f199, %f196, %f151;\n" -" mul.ftz.f32 %f200, %f166, %f198;\n" -" fma.rn.ftz.f32 %f201, %f170, %f197, %f200;\n" -" fma.rn.ftz.f32 %f202, %f168, %f199, %f201;\n" -" add.ftz.f32 %f203, %f121, %f202;\n" -" mov.f32 %f204, %f203;\n" -" .loc 16 303 0\n" -" sub.ftz.f32 %f205, %f142, %f18;\n" -" mov.f32 %f206, %f205;\n" -" .loc 16 304 0\n" -" mul.ftz.f32 %f207, %f169, %f185;\n" -" fma.rn.ftz.f32 %f208, %f173, %f181, %f207;\n" -" fma.rn.ftz.f32 %f209, %f171, %f183, %f208;\n" -" add.ftz.f32 %f210, %f116, %f209;\n" -" mov.f32 %f211, %f210;\n" -" .loc 16 305 0\n" -" mul.ftz.f32 %f212, %f185, %f187;\n" -" fma.rn.ftz.f32 %f213, %f181, %f184, %f212;\n" -" fma.rn.ftz.f32 %f214, %f183, %f186, %f213;\n" -" add.ftz.f32 %f215, %f128, %f214;\n" -" mov.f32 %f216, %f215;\n" -" .loc 16 306 0\n" -" mul.ftz.f32 %f217, %f185, %f198;\n" -" fma.rn.ftz.f32 %f218, %f181, %f197, %f217;\n" -" fma.rn.ftz.f32 %f219, %f183, %f199, %f218;\n" -" add.ftz.f32 %f220, %f122, %f219;\n" -" mov.f32 %f221, %f220;\n" -" .loc 16 307 0\n" -" sub.ftz.f32 %f222, %f143, %f19;\n" -" mov.f32 %f223, %f222;\n" -" .loc 16 308 0\n" -" mul.ftz.f32 %f224, %f195, %f169;\n" -" fma.rn.ftz.f32 %f225, %f173, %f194, %f224;\n" -" fma.rn.ftz.f32 %f226, %f171, %f196, %f225;\n" -" add.ftz.f32 %f227, %f110, %f226;\n" -" mov.f32 %f228, %f227;\n" -" .loc 16 309 0\n" -" mul.ftz.f32 %f229, %f195, %f187;\n" -" fma.rn.ftz.f32 %f230, %f184, %f194, %f229;\n" -" fma.rn.ftz.f32 %f231, %f186, %f196, %f230;\n" -" add.ftz.f32 %f232, %f129, %f231;\n" -" mov.f32 %f233, %f232;\n" -" .loc 16 310 0\n" -" mul.ftz.f32 %f234, %f195, %f198;\n" -" fma.rn.ftz.f32 %f235, %f194, %f197, %f234;\n" -" fma.rn.ftz.f32 %f236, %f196, %f199, %f235;\n" -" add.ftz.f32 %f237, %f123, %f236;\n" -" mov.f32 %f238, %f237;\n" -" abs.ftz.f32 %f239, %f210;\n" -" abs.ftz.f32 %f240, %f176;\n" -" setp.gt.ftz.f32 %p5, %f239, %f240;\n" -" @!%p5 bra $Lt_0_41218;\n" -" .loc 16 314 0\n" -" mov.f32 %f177, %f210;\n" -" mov.f32 %f211, %f176;\n" -" .loc 16 315 0\n" -" mov.f32 %f192, %f215;\n" -" mov.f32 %f216, %f191;\n" -" .loc 16 316 0\n" -" mov.f32 %f204, %f220;\n" -" mov.f32 %f221, %f203;\n" -" .loc 16 317 0\n" -" mov.f32 %f153, %f205;\n" -" mov.f32 %f206, %f152;\n" -"$Lt_0_41218:\n" -" mov.f32 %f241, %f177;\n" -" abs.ftz.f32 %f242, %f241;\n" -" abs.ftz.f32 %f243, %f227;\n" -" setp.lt.ftz.f32 %p6, %f242, %f243;\n" -" @!%p6 bra $Lt_0_41730;\n" -" .loc 16 321 0\n" -" mov.f32 %f177, %f227;\n" -" mov.f32 %f228, %f241;\n" -" .loc 16 322 0\n" -" mov.f32 %f244, %f192;\n" -" mov.f32 %f192, %f232;\n" -" mov.f32 %f233, %f244;\n" -" .loc 16 323 0\n" -" mov.f32 %f245, %f204;\n" -" mov.f32 %f204, %f237;\n" -" mov.f32 %f238, %f245;\n" -" .loc 16 324 0\n" -" mov.f32 %f246, %f153;\n" -" mov.f32 %f153, %f222;\n" -" mov.f32 %f223, %f246;\n" -"$Lt_0_41730:\n" -" mov.f32 %f247, %f177;\n" -" mov.f32 %f248, 0f00000000; \n" -" setp.neu.ftz.f32 %p7, %f247, %f248;\n" -" @!%p7 bra $Lt_0_42498;\n" -" bra.uni $Lt_0_43266;\n" -"$Lt_0_42498:\n" -" mov.f32 %f249, 0f00000000; \n" -" setp.neu.ftz.f32 %p8, %f211, %f249;\n" -" @!%p8 bra $Lt_0_43010;\n" -" .loc 16 338 0\n" -" mov.f32 %f177, %f211;\n" -" mov.f32 %f211, %f247;\n" -" .loc 16 339 0\n" -" mov.f32 %f250, %f192;\n" -" mov.f32 %f192, %f216;\n" -" mov.f32 %f216, %f250;\n" -" .loc 16 340 0\n" -" mov.f32 %f251, %f204;\n" -" mov.f32 %f204, %f221;\n" -" mov.f32 %f221, %f251;\n" -" .loc 16 341 0\n" -" mov.f32 %f252, %f153;\n" -" mov.f32 %f153, %f206;\n" -" mov.f32 %f206, %f252;\n" -" bra.uni $Lt_0_43266;\n" -"$Lt_0_43010:\n" -" mov.f32 %f253, 0f00000000; \n" -" setp.neu.ftz.f32 %p9, %f228, %f253;\n" -" @!%p9 bra $Lt_0_43522;\n" -" .loc 16 346 0\n" -" mov.f32 %f177, %f228;\n" -" mov.f32 %f228, %f247;\n" -" .loc 16 347 0\n" -" mov.f32 %f254, %f192;\n" -" mov.f32 %f192, %f233;\n" -" mov.f32 %f233, %f254;\n" -" .loc 16 348 0\n" -" mov.f32 %f255, %f204;\n" -" mov.f32 %f204, %f238;\n" -" mov.f32 %f238, %f255;\n" -" .loc 16 349 0\n" -" mov.f32 %f256, %f153;\n" -" mov.f32 %f153, %f223;\n" -" mov.f32 %f223, %f256;\n" -" bra.uni $Lt_0_43266;\n" -"$Lt_0_43522:\n" -" .loc 16 352 0\n" -" mov.s32 %r30, 2;\n" -" ld.param.u64 %rd44, [__cudaparm_kernel_ellipsoid_err_flag];\n" -" st.global.s32 [%rd44+0], %r30;\n" -"$Lt_0_43266:\n" -"$Lt_0_42754:\n" -"$Lt_0_42242:\n" -" .loc 16 355 0\n" -" div.approx.ftz.f32 %f257, %f211, %f177;\n" -" mul.ftz.f32 %f258, %f192, %f257;\n" -" sub.ftz.f32 %f259, %f216, %f258;\n" -" mov.f32 %f216, %f259;\n" -" .loc 16 356 0\n" -" mul.ftz.f32 %f260, %f204, %f257;\n" -" sub.ftz.f32 %f261, %f221, %f260;\n" -" mov.f32 %f221, %f261;\n" -" .loc 16 357 0\n" -" mul.ftz.f32 %f262, %f153, %f257;\n" -" sub.ftz.f32 %f263, %f206, %f262;\n" -" mov.f32 %f206, %f263;\n" -" .loc 16 359 0\n" -" div.approx.ftz.f32 %f264, %f228, %f177;\n" -" mul.ftz.f32 %f265, %f192, %f264;\n" -" sub.ftz.f32 %f233, %f233, %f265;\n" -" .loc 16 360 0\n" -" mul.ftz.f32 %f266, %f204, %f264;\n" -" sub.ftz.f32 %f238, %f238, %f266;\n" -" .loc 16 361 0\n" -" mul.ftz.f32 %f267, %f153, %f264;\n" -" sub.ftz.f32 %f223, %f223, %f267;\n" -" abs.ftz.f32 %f268, %f259;\n" -" abs.ftz.f32 %f269, %f233;\n" -" setp.lt.ftz.f32 %p10, %f268, %f269;\n" -" @!%p10 bra $Lt_0_43778;\n" -" .loc 16 366 0\n" -" mov.f32 %f216, %f233;\n" -" mov.f32 %f233, %f259;\n" -" .loc 16 367 0\n" -" mov.f32 %f221, %f238;\n" -" mov.f32 %f238, %f261;\n" -" .loc 16 368 0\n" -" mov.f32 %f206, %f223;\n" -" mov.f32 %f223, %f263;\n" -"$Lt_0_43778:\n" -" mov.f32 %f270, %f216;\n" -" mov.f32 %f271, 0f00000000; \n" -" setp.neu.ftz.f32 %p11, %f270, %f271;\n" -" @!%p11 bra $Lt_0_44546;\n" -" bra.uni $Lt_0_44802;\n" -"$Lt_0_44546:\n" -" mov.f32 %f272, 0f00000000; \n" -" setp.neu.ftz.f32 %p12, %f233, %f272;\n" -" @!%p12 bra $Lt_0_44802;\n" -" .loc 16 383 0\n" -" mov.f32 %f216, %f233;\n" -" mov.f32 %f233, %f270;\n" -" .loc 16 384 0\n" -" mov.f32 %f273, %f221;\n" -" mov.f32 %f221, %f238;\n" -" mov.f32 %f238, %f273;\n" -" .loc 16 385 0\n" -" mov.f32 %f274, %f206;\n" -" mov.f32 %f206, %f223;\n" -" mov.f32 %f223, %f274;\n" -"$Lt_0_44802:\n" -"$Lt_0_44290:\n" -" .loc 16 390 0\n" -" div.approx.ftz.f32 %f275, %f233, %f216;\n" -" mul.ftz.f32 %f276, %f221, %f275;\n" -" sub.ftz.f32 %f238, %f238, %f276;\n" -" .loc 16 391 0\n" -" mul.ftz.f32 %f277, %f206, %f275;\n" -" sub.ftz.f32 %f223, %f223, %f277;\n" -" mov.f32 %f278, 0f00000000; \n" -" setp.eq.ftz.f32 %p13, %f238, %f278;\n" -" @!%p13 bra $Lt_0_45314;\n" -" .loc 16 394 0\n" -" mov.s32 %r31, 2;\n" -" ld.param.u64 %rd45, [__cudaparm_kernel_ellipsoid_err_flag];\n" -" st.global.s32 [%rd45+0], %r31;\n" -"$Lt_0_45314:\n" -" .loc 17 179 0\n" -" div.approx.ftz.f32 %f279, %f223, %f238;\n" -" mul.ftz.f32 %f280, %f205, %f205;\n" -" mul.ftz.f32 %f281, %f279, %f221;\n" -" fma.rn.ftz.f32 %f282, %f152, %f152, %f280;\n" -" sub.ftz.f32 %f283, %f206, %f281;\n" -" fma.rn.ftz.f32 %f284, %f222, %f222, %f282;\n" -" div.approx.ftz.f32 %f285, %f283, %f216;\n" -" rsqrt.approx.ftz.f32 %f286, %f284;\n" -" mul.ftz.f32 %f287, %f285, %f192;\n" -" fma.rn.ftz.f32 %f288, %f204, %f279, %f287;\n" -" sub.ftz.f32 %f289, %f153, %f288;\n" -" div.approx.ftz.f32 %f290, %f289, %f177;\n" -" mul.ftz.f32 %f291, %f286, %f290;\n" -" .loc 17 191 0\n" -" mul.ftz.f32 %f292, %f285, %f286;\n" -" mul.ftz.f32 %f293, %f286, %f205;\n" -" mul.ftz.f32 %f294, %f286, %f152;\n" -" mul.ftz.f32 %f295, %f286, %f222;\n" -" mul.ftz.f32 %f296, %f279, %f286;\n" -" mul.ftz.f32 %f297, %f292, %f293;\n" -" fma.rn.ftz.f32 %f298, %f294, %f291, %f297;\n" -" fma.rn.ftz.f32 %f299, %f295, %f296, %f298;\n" -" mov.f32 %f300, 0f3f000000; \n" -" mul.ftz.f32 %f301, %f299, %f300;\n" -" rsqrt.approx.ftz.f32 %f302, %f301;\n" -" .loc 17 195 0\n" -" rcp.approx.ftz.f32 %f303, %f286;\n" -" mul.ftz.f32 %f304, %f303, %f291;\n" -" .loc 17 200 0\n" -" add.s32 %r32, %r29, %r24;\n" -" cvt.s64.s32 %rd46, %r32;\n" -" mul.wide.s32 %rd47, %r32, 8;\n" -" add.u64 %rd48, %rd32, %rd47;\n" -" ld.global.v2.f32 {%f305,%f306}, [%rd48+0];\n" -" .loc 17 202 0\n" -" sub.ftz.f32 %f307, %f303, %f302;\n" -" ld.global.f32 %f308, [%rd1+0];\n" -" fma.rn.ftz.f32 %f309, %f308, %f305, %f307;\n" -" .loc 17 209 0\n" -" div.approx.ftz.f32 %f310, %f305, %f309;\n" -" mul.ftz.f32 %f311, %f310, %f310;\n" -" mul.ftz.f32 %f312, %f310, %f311;\n" -" mul.ftz.f32 %f313, %f312, %f312;\n" -" mul.ftz.f32 %f314, %f313, %f313;\n" -" mul.ftz.f32 %f315, %f310, %f313;\n" -" add.ftz.f32 %f316, %f314, %f314;\n" -" mul.ftz.f32 %f317, %f310, %f316;\n" -" sub.ftz.f32 %f318, %f317, %f315;\n" -" div.approx.ftz.f32 %f319, %f318, %f305;\n" -" mov.f32 %f320, 0f41c00000; \n" -" mul.ftz.f32 %f321, %f319, %f320;\n" -" mul.ftz.f32 %f322, %f306, %f321;\n" -" .loc 17 214 0\n" -" mul.ftz.f32 %f323, %f302, %f322;\n" -" mul.ftz.f32 %f324, %f323, %f302;\n" -" mul.ftz.f32 %f325, %f324, %f302;\n" -" mov.f32 %f326, 0f3f000000; \n" -" mul.ftz.f32 %f327, %f325, %f326;\n" -" mul.ftz.f32 %f328, %f327, %f286;\n" -" mul.ftz.f32 %f329, %f292, %f303;\n" -" mul.ftz.f32 %f330, %f296, %f303;\n" -" mul.ftz.f32 %f331, %f286, %f328;\n" -" mul.ftz.f32 %f332, %f293, %f329;\n" -" fma.rn.ftz.f32 %f333, %f294, %f304, %f332;\n" -" fma.rn.ftz.f32 %f334, %f295, %f330, %f333;\n" -" mul.ftz.f32 %f335, %f294, %f334;\n" -" sub.ftz.f32 %f336, %f304, %f335;\n" -" mul.ftz.f32 %f337, %f331, %f336;\n" -" fma.rn.ftz.f32 %f338, %f294, %f322, %f337;\n" -" .loc 17 215 0\n" -" mul.ftz.f32 %f339, %f293, %f334;\n" -" sub.ftz.f32 %f340, %f329, %f339;\n" -" mul.ftz.f32 %f341, %f331, %f340;\n" -" fma.rn.ftz.f32 %f342, %f293, %f322, %f341;\n" -" .loc 17 216 0\n" -" mul.ftz.f32 %f343, %f295, %f334;\n" -" sub.ftz.f32 %f344, %f330, %f343;\n" -" mul.ftz.f32 %f345, %f331, %f344;\n" -" fma.rn.ftz.f32 %f346, %f295, %f322, %f345;\n" -" .loc 17 226 0\n" -" mul.ftz.f32 %f347, %f122, %f329;\n" -" mul.ftz.f32 %f348, %f330, %f331;\n" -" mul.ftz.f32 %f349, %f329, %f331;\n" -" mul.ftz.f32 %f350, %f329, %f128;\n" -" fma.rn.ftz.f32 %f351, %f304, %f121, %f347;\n" -" fma.rn.ftz.f32 %f352, %f304, %f127, %f350;\n" -" fma.rn.ftz.f32 %f353, %f330, %f123, %f351;\n" -" fma.rn.ftz.f32 %f354, %f330, %f129, %f352;\n" -" mul.ftz.f32 %f355, %f348, %f354;\n" -" neg.ftz.f32 %f356, %f349;\n" -" fma.rn.ftz.f32 %f357, %f356, %f353, %f355;\n" -" mul.ftz.f32 %f358, %f116, %f329;\n" -" mul.ftz.f32 %f359, %f331, %f304;\n" -" fma.rn.ftz.f32 %f360, %f109, %f304, %f358;\n" -" fma.rn.ftz.f32 %f361, %f330, %f110, %f360;\n" -" mul.ftz.f32 %f362, %f359, %f353;\n" -" neg.ftz.f32 %f363, %f348;\n" -" fma.rn.ftz.f32 %f364, %f361, %f363, %f362;\n" -" mul.ftz.f32 %f365, %f349, %f361;\n" -" neg.ftz.f32 %f366, %f359;\n" -" fma.rn.ftz.f32 %f367, %f366, %f354, %f365;\n" -" .loc 17 233 0\n" -" ld.global.f32 %f368, [%rd31+0];\n" -" mul.lo.u64 %rd49, %rd41, 4;\n" -" add.u64 %rd50, %rd29, %rd49;\n" -" ld.global.f32 %f369, [%rd50+0];\n" -" add.ftz.f32 %f370, %f368, %f368;\n" -" mul.ftz.f32 %f371, %f369, %f370;\n" -" .loc 17 234 0\n" -" mul.ftz.f32 %f372, %f210, %f203;\n" -" mul.ftz.f32 %f373, %f227, %f203;\n" -" mul.ftz.f32 %f374, %f220, %f176;\n" -" mul.ftz.f32 %f375, %f210, %f191;\n" -" mul.ftz.f32 %f376, %f227, %f191;\n" -" mul.ftz.f32 %f377, %f215, %f176;\n" -" mul.ftz.f32 %f378, %f374, %f232;\n" -" mul.ftz.f32 %f379, %f237, %f377;\n" -" sub.ftz.f32 %f380, %f379, %f378;\n" -" mul.ftz.f32 %f381, %f237, %f375;\n" -" sub.ftz.f32 %f382, %f380, %f381;\n" -" fma.rn.ftz.f32 %f383, %f232, %f372, %f382;\n" -" fma.rn.ftz.f32 %f384, %f220, %f376, %f383;\n" -" mul.ftz.f32 %f385, %f215, %f373;\n" -" sub.ftz.f32 %f386, %f384, %f385;\n" -" .loc 17 235 0\n" -" ld.global.f32 %f387, [%rd1+4];\n" -" .loc 17 240 0\n" -" mul.ftz.f32 %f388, %f232, %f372;\n" -" sub.ftz.f32 %f389, %f388, %f378;\n" -" mul.ftz.f32 %f390, %f215, %f373;\n" -" sub.ftz.f32 %f391, %f389, %f390;\n" -" fma.rn.ftz.f32 %f392, %f220, %f376, %f391;\n" -" mul.ftz.f32 %f393, %f237, %f375;\n" -" sub.ftz.f32 %f394, %f392, %f393;\n" -" fma.rn.ftz.f32 %f395, %f237, %f377, %f394;\n" -" .loc 17 241 0\n" -" div.approx.ftz.f32 %f396, %f371, %f386;\n" -" lg2.approx.ftz.f32 %f397, %f396;\n" -" mul.ftz.f32 %f398, %f397, %f387;\n" -" ex2.approx.ftz.f32 %f399, %f398;\n" -" mul.ftz.f32 %f400, %f399, %f387;\n" -" neg.ftz.f32 %f401, %f400;\n" -" .loc 17 274 0\n" -" add.u64 %rd51, %rd42, %rd19;\n" -" ld.global.v4.f32 {%f402,%f403,%f404,_}, [%rd51+0];\n" -" .loc 16 299 0\n" -" mul.ftz.f32 %f405, %f294, %f303;\n" -" mov.f32 %f153, %f405;\n" -" .loc 16 300 0\n" -" mul.ftz.f32 %f406, %f166, %f403;\n" -" mul.ftz.f32 %f407, %f168, %f404;\n" -" mul.ftz.f32 %f408, %f166, %f406;\n" -" mul.ftz.f32 %f409, %f170, %f402;\n" -" fma.rn.ftz.f32 %f410, %f170, %f409, %f408;\n" -" fma.rn.ftz.f32 %f411, %f168, %f407, %f410;\n" -" add.ftz.f32 %f412, %f111, %f411;\n" -" mov.f32 %f177, %f412;\n" -" .loc 16 301 0\n" -" mul.ftz.f32 %f413, %f181, %f402;\n" -" mul.ftz.f32 %f414, %f183, %f404;\n" -" mul.ftz.f32 %f415, %f185, %f403;\n" -" mul.ftz.f32 %f416, %f166, %f415;\n" -" fma.rn.ftz.f32 %f417, %f170, %f413, %f416;\n" -" fma.rn.ftz.f32 %f418, %f168, %f414, %f417;\n" -" add.ftz.f32 %f419, %f130, %f418;\n" -" mov.f32 %f192, %f419;\n" -" .loc 16 302 0\n" -" mul.ftz.f32 %f420, %f194, %f402;\n" -" mul.ftz.f32 %f421, %f195, %f403;\n" -" mul.ftz.f32 %f422, %f196, %f404;\n" -" mul.ftz.f32 %f423, %f166, %f421;\n" -" fma.rn.ftz.f32 %f424, %f170, %f420, %f423;\n" -" fma.rn.ftz.f32 %f425, %f168, %f422, %f424;\n" -" add.ftz.f32 %f426, %f124, %f425;\n" -" mov.f32 %f204, %f426;\n" -" .loc 16 303 0\n" -" mul.ftz.f32 %f427, %f293, %f303;\n" -" mov.f32 %f206, %f427;\n" -" .loc 16 304 0\n" -" mul.ftz.f32 %f428, %f406, %f185;\n" -" fma.rn.ftz.f32 %f429, %f409, %f181, %f428;\n" -" fma.rn.ftz.f32 %f430, %f407, %f183, %f429;\n" -" add.ftz.f32 %f431, %f120, %f430;\n" -" mov.f32 %f211, %f431;\n" -" .loc 16 305 0\n" -" mul.ftz.f32 %f432, %f185, %f415;\n" -" fma.rn.ftz.f32 %f433, %f181, %f413, %f432;\n" -" fma.rn.ftz.f32 %f434, %f183, %f414, %f433;\n" -" add.ftz.f32 %f216, %f131, %f434;\n" -" .loc 16 306 0\n" -" mul.ftz.f32 %f435, %f185, %f421;\n" -" fma.rn.ftz.f32 %f436, %f181, %f420, %f435;\n" -" fma.rn.ftz.f32 %f437, %f183, %f422, %f436;\n" -" add.ftz.f32 %f221, %f125, %f437;\n" -" .loc 16 307 0\n" -" mul.ftz.f32 %f438, %f295, %f303;\n" -" mov.f32 %f223, %f438;\n" -" .loc 16 308 0\n" -" mul.ftz.f32 %f439, %f195, %f406;\n" -" fma.rn.ftz.f32 %f440, %f409, %f194, %f439;\n" -" fma.rn.ftz.f32 %f441, %f407, %f196, %f440;\n" -" add.ftz.f32 %f442, %f112, %f441;\n" -" mov.f32 %f228, %f442;\n" -" .loc 16 309 0\n" -" mul.ftz.f32 %f443, %f195, %f415;\n" -" fma.rn.ftz.f32 %f444, %f413, %f194, %f443;\n" -" fma.rn.ftz.f32 %f445, %f414, %f196, %f444;\n" -" add.ftz.f32 %f233, %f132, %f445;\n" -" .loc 16 310 0\n" -" mul.ftz.f32 %f446, %f195, %f421;\n" -" fma.rn.ftz.f32 %f447, %f194, %f420, %f446;\n" -" fma.rn.ftz.f32 %f448, %f196, %f422, %f447;\n" -" add.ftz.f32 %f238, %f126, %f448;\n" -" abs.ftz.f32 %f449, %f431;\n" -" abs.ftz.f32 %f450, %f412;\n" -" setp.gt.ftz.f32 %p14, %f449, %f450;\n" -" @!%p14 bra $Lt_0_45826;\n" -" .loc 16 314 0\n" -" mov.f32 %f177, %f431;\n" -" mov.f32 %f211, %f412;\n" -" .loc 16 315 0\n" -" mov.f32 %f192, %f216;\n" -" mov.f32 %f216, %f419;\n" -" .loc 16 316 0\n" -" mov.f32 %f204, %f221;\n" -" mov.f32 %f221, %f426;\n" -" .loc 16 317 0\n" -" mov.f32 %f153, %f427;\n" -" mov.f32 %f206, %f405;\n" -"$Lt_0_45826:\n" -" mov.f32 %f451, %f177;\n" -" abs.ftz.f32 %f452, %f451;\n" -" abs.ftz.f32 %f453, %f442;\n" -" setp.lt.ftz.f32 %p15, %f452, %f453;\n" -" @!%p15 bra $Lt_0_46338;\n" -" .loc 16 321 0\n" -" mov.f32 %f177, %f442;\n" -" mov.f32 %f228, %f451;\n" -" .loc 16 322 0\n" -" mov.f32 %f454, %f192;\n" -" mov.f32 %f192, %f233;\n" -" mov.f32 %f233, %f454;\n" -" .loc 16 323 0\n" -" mov.f32 %f455, %f204;\n" -" mov.f32 %f204, %f238;\n" -" mov.f32 %f238, %f455;\n" -" .loc 16 324 0\n" -" mov.f32 %f456, %f153;\n" -" mov.f32 %f153, %f438;\n" -" mov.f32 %f223, %f456;\n" -"$Lt_0_46338:\n" -" mov.f32 %f457, %f177;\n" -" mov.f32 %f458, 0f00000000; \n" -" setp.neu.ftz.f32 %p16, %f457, %f458;\n" -" @!%p16 bra $Lt_0_47106;\n" -" bra.uni $Lt_0_47874;\n" -"$Lt_0_47106:\n" -" mov.f32 %f459, 0f00000000; \n" -" setp.neu.ftz.f32 %p17, %f211, %f459;\n" -" @!%p17 bra $Lt_0_47618;\n" -" .loc 16 338 0\n" -" mov.f32 %f177, %f211;\n" -" mov.f32 %f211, %f457;\n" -" .loc 16 339 0\n" -" mov.f32 %f460, %f192;\n" -" mov.f32 %f192, %f216;\n" -" mov.f32 %f216, %f460;\n" -" .loc 16 340 0\n" -" mov.f32 %f461, %f204;\n" -" mov.f32 %f204, %f221;\n" -" mov.f32 %f221, %f461;\n" -" .loc 16 341 0\n" -" mov.f32 %f462, %f153;\n" -" mov.f32 %f153, %f206;\n" -" mov.f32 %f206, %f462;\n" -" bra.uni $Lt_0_47874;\n" -"$Lt_0_47618:\n" -" mov.f32 %f463, 0f00000000; \n" -" setp.neu.ftz.f32 %p18, %f228, %f463;\n" -" @!%p18 bra $Lt_0_48130;\n" -" .loc 16 346 0\n" -" mov.f32 %f177, %f228;\n" -" mov.f32 %f228, %f457;\n" -" .loc 16 347 0\n" -" mov.f32 %f464, %f192;\n" -" mov.f32 %f192, %f233;\n" -" mov.f32 %f233, %f464;\n" -" .loc 16 348 0\n" -" mov.f32 %f465, %f204;\n" -" mov.f32 %f204, %f238;\n" -" mov.f32 %f238, %f465;\n" -" .loc 16 349 0\n" -" mov.f32 %f466, %f153;\n" -" mov.f32 %f153, %f223;\n" -" mov.f32 %f223, %f466;\n" -" bra.uni $Lt_0_47874;\n" -"$Lt_0_48130:\n" -" .loc 16 352 0\n" -" mov.s32 %r33, 2;\n" -" ld.param.u64 %rd52, [__cudaparm_kernel_ellipsoid_err_flag];\n" -" st.global.s32 [%rd52+0], %r33;\n" -"$Lt_0_47874:\n" -"$Lt_0_47362:\n" -"$Lt_0_46850:\n" -" .loc 16 355 0\n" -" div.approx.ftz.f32 %f467, %f211, %f177;\n" -" mul.ftz.f32 %f468, %f192, %f467;\n" -" sub.ftz.f32 %f469, %f216, %f468;\n" -" mov.f32 %f216, %f469;\n" -" .loc 16 356 0\n" -" mul.ftz.f32 %f470, %f204, %f467;\n" -" sub.ftz.f32 %f471, %f221, %f470;\n" -" mov.f32 %f221, %f471;\n" -" .loc 16 357 0\n" -" mul.ftz.f32 %f472, %f153, %f467;\n" -" sub.ftz.f32 %f473, %f206, %f472;\n" -" mov.f32 %f206, %f473;\n" -" .loc 16 359 0\n" -" div.approx.ftz.f32 %f474, %f228, %f177;\n" -" mul.ftz.f32 %f475, %f192, %f474;\n" -" sub.ftz.f32 %f233, %f233, %f475;\n" -" .loc 16 360 0\n" -" mul.ftz.f32 %f476, %f204, %f474;\n" -" sub.ftz.f32 %f238, %f238, %f476;\n" -" .loc 16 361 0\n" -" mul.ftz.f32 %f477, %f153, %f474;\n" -" sub.ftz.f32 %f223, %f223, %f477;\n" -" abs.ftz.f32 %f478, %f469;\n" -" abs.ftz.f32 %f479, %f233;\n" -" setp.lt.ftz.f32 %p19, %f478, %f479;\n" -" @!%p19 bra $Lt_0_48386;\n" -" .loc 16 366 0\n" -" mov.f32 %f216, %f233;\n" -" mov.f32 %f233, %f469;\n" -" .loc 16 367 0\n" -" mov.f32 %f221, %f238;\n" -" mov.f32 %f238, %f471;\n" -" .loc 16 368 0\n" -" mov.f32 %f206, %f223;\n" -" mov.f32 %f223, %f473;\n" -"$Lt_0_48386:\n" -" mov.f32 %f480, %f216;\n" -" mov.f32 %f481, 0f00000000; \n" -" setp.neu.ftz.f32 %p20, %f480, %f481;\n" -" @!%p20 bra $Lt_0_49154;\n" -" bra.uni $Lt_0_49410;\n" -"$Lt_0_49154:\n" -" mov.f32 %f482, 0f00000000; \n" -" setp.neu.ftz.f32 %p21, %f233, %f482;\n" -" @!%p21 bra $Lt_0_49410;\n" -" .loc 16 383 0\n" -" mov.f32 %f216, %f233;\n" -" mov.f32 %f233, %f480;\n" -" .loc 16 384 0\n" -" mov.f32 %f483, %f221;\n" -" mov.f32 %f221, %f238;\n" -" mov.f32 %f238, %f483;\n" -" .loc 16 385 0\n" -" mov.f32 %f484, %f206;\n" -" mov.f32 %f206, %f223;\n" -" mov.f32 %f223, %f484;\n" -"$Lt_0_49410:\n" -"$Lt_0_48898:\n" -" .loc 16 390 0\n" -" div.approx.ftz.f32 %f485, %f233, %f216;\n" -" mul.ftz.f32 %f486, %f221, %f485;\n" -" sub.ftz.f32 %f238, %f238, %f486;\n" -" .loc 16 391 0\n" -" mul.ftz.f32 %f487, %f206, %f485;\n" -" sub.ftz.f32 %f223, %f223, %f487;\n" -" mov.f32 %f488, 0f00000000; \n" -" setp.eq.ftz.f32 %p22, %f238, %f488;\n" -" @!%p22 bra $Lt_0_49922;\n" -" .loc 16 394 0\n" -" mov.s32 %r34, 2;\n" -" ld.param.u64 %rd53, [__cudaparm_kernel_ellipsoid_err_flag];\n" -" st.global.s32 [%rd53+0], %r34;\n" -"$Lt_0_49922:\n" -" .loc 17 286 0\n" -" div.approx.ftz.f32 %f489, %f223, %f238;\n" -" mul.ftz.f32 %f490, %f489, %f221;\n" -" sub.ftz.f32 %f491, %f206, %f490;\n" -" div.approx.ftz.f32 %f492, %f491, %f216;\n" -" mul.ftz.f32 %f493, %f492, %f192;\n" -" fma.rn.ftz.f32 %f494, %f204, %f489, %f493;\n" -" sub.ftz.f32 %f495, %f153, %f494;\n" -" div.approx.ftz.f32 %f496, %f495, %f177;\n" -" mul.ftz.f32 %f497, %f286, %f496;\n" -" .loc 17 293 0\n" -" mul.ftz.f32 %f498, %f492, %f286;\n" -" mul.ftz.f32 %f499, %f489, %f286;\n" -" mul.ftz.f32 %f500, %f286, %f405;\n" -" mul.ftz.f32 %f501, %f286, %f427;\n" -" mul.ftz.f32 %f502, %f286, %f438;\n" -" mul.ftz.f32 %f503, %f498, %f501;\n" -" fma.rn.ftz.f32 %f504, %f500, %f497, %f503;\n" -" fma.rn.ftz.f32 %f505, %f502, %f499, %f504;\n" -" add.ftz.f32 %f506, %f505, %f505;\n" -" ld.global.f32 %f507, [%rd1+8];\n" -" .loc 17 296 0\n" -" mul.ftz.f32 %f508, %f303, %f497;\n" -" .loc 17 301 0\n" -" mov.f32 %f509, 0fbf800000; \n" -" add.ftz.f32 %f510, %f507, %f509;\n" -" lg2.approx.ftz.f32 %f511, %f506;\n" -" mul.ftz.f32 %f512, %f511, %f507;\n" -" ex2.approx.ftz.f32 %f513, %f512;\n" -" mov.f32 %f514, 0fc0800000; \n" -" mul.ftz.f32 %f515, %f286, %f514;\n" -" mul.ftz.f32 %f516, %f286, %f515;\n" -" lg2.approx.ftz.f32 %f517, %f513;\n" -" div.approx.ftz.f32 %f518, %f510, %f507;\n" -" mul.ftz.f32 %f519, %f517, %f518;\n" -" ex2.approx.ftz.f32 %f520, %f519;\n" -" mul.ftz.f32 %f521, %f516, %f507;\n" -" mul.ftz.f32 %f522, %f520, %f521;\n" -" .loc 17 303 0\n" -" mul.ftz.f32 %f523, %f498, %f303;\n" -" mul.ftz.f32 %f524, %f499, %f303;\n" -" mul.ftz.f32 %f525, %f523, %f501;\n" -" fma.rn.ftz.f32 %f526, %f500, %f508, %f525;\n" -" fma.rn.ftz.f32 %f527, %f502, %f524, %f526;\n" -" mul.ftz.f32 %f528, %f500, %f527;\n" -" sub.ftz.f32 %f529, %f508, %f528;\n" -" mul.ftz.f32 %f530, %f522, %f529;\n" -" .loc 17 304 0\n" -" mul.ftz.f32 %f531, %f501, %f527;\n" -" sub.ftz.f32 %f532, %f523, %f531;\n" -" mul.ftz.f32 %f533, %f522, %f532;\n" -" .loc 17 305 0\n" -" mul.ftz.f32 %f534, %f502, %f527;\n" -" sub.ftz.f32 %f535, %f524, %f534;\n" -" mul.ftz.f32 %f536, %f522, %f535;\n" -" .loc 17 310 0\n" -" mul.ftz.f32 %f537, %f125, %f523;\n" -" mul.ftz.f32 %f538, %f523, %f131;\n" -" fma.rn.ftz.f32 %f539, %f508, %f124, %f537;\n" -" fma.rn.ftz.f32 %f540, %f508, %f130, %f538;\n" -" fma.rn.ftz.f32 %f541, %f524, %f126, %f539;\n" -" fma.rn.ftz.f32 %f542, %f524, %f132, %f540;\n" -" mul.ftz.f32 %f543, %f523, %f541;\n" -" mul.ftz.f32 %f544, %f542, %f524;\n" -" sub.ftz.f32 %f545, %f544, %f543;\n" -" mul.ftz.f32 %f546, %f120, %f523;\n" -" fma.rn.ftz.f32 %f547, %f111, %f508, %f546;\n" -" fma.rn.ftz.f32 %f548, %f524, %f112, %f547;\n" -" mul.ftz.f32 %f549, %f524, %f548;\n" -" mul.ftz.f32 %f550, %f508, %f541;\n" -" sub.ftz.f32 %f551, %f550, %f549;\n" -" mul.ftz.f32 %f552, %f542, %f508;\n" -" mul.ftz.f32 %f553, %f548, %f523;\n" -" sub.ftz.f32 %f554, %f553, %f552;\n" -" .loc 17 312 0\n" -" mul.ftz.f32 %f555, %f516, %f545;\n" -" .loc 17 313 0\n" -" mul.ftz.f32 %f556, %f516, %f551;\n" -" .loc 17 314 0\n" -" mul.ftz.f32 %f557, %f516, %f554;\n" -" .loc 16 396 0\n" -" mov.f32 %f558, 0f40800000; \n" -" mul.ftz.f32 %f559, %f306, %f558;\n" -" mul.ftz.f32 %f560, %f399, %f140;\n" -" sub.ftz.f32 %f561, %f314, %f313;\n" -" mul.ftz.f32 %f562, %f513, %f560;\n" -" mul.ftz.f32 %f563, %f559, %f561;\n" -" fma.rn.ftz.f32 %f564, %f563, %f562, %f139;\n" -" selp.f32 %f139, %f564, %f139, %p3;\n" -" mul.ftz.f32 %f565, %f562, %f338;\n" -" mul.ftz.f32 %f566, %f562, %f342;\n" -" mul.ftz.f32 %f567, %f562, %f346;\n" -" mul.ftz.f32 %f568, %f399, %f563;\n" -" mul.ftz.f32 %f569, %f568, %f140;\n" -" neg.ftz.f32 %f570, %f569;\n" -" mul.ftz.f32 %f571, %f530, %f570;\n" -" sub.ftz.f32 %f572, %f571, %f565;\n" -" mul.ftz.f32 %f573, %f533, %f570;\n" -" sub.ftz.f32 %f574, %f573, %f566;\n" -" mul.ftz.f32 %f575, %f536, %f570;\n" -" sub.ftz.f32 %f576, %f575, %f567;\n" -" @!%p4 bra $Lt_0_50690;\n" -" .loc 17 326 0\n" -" add.ftz.f32 %f138, %f572, %f138;\n" -" .loc 17 327 0\n" -" mul.ftz.f32 %f577, %f303, %f500;\n" -" neg.ftz.f32 %f578, %f577;\n" -" mov.f32 %f579, %f6;\n" -" fma.rn.ftz.f32 %f580, %f578, %f572, %f579;\n" -" mov.f32 %f6, %f580;\n" -" .loc 17 329 0\n" -" add.ftz.f32 %f137, %f574, %f137;\n" -" .loc 17 330 0\n" -" mul.ftz.f32 %f581, %f303, %f501;\n" -" neg.ftz.f32 %f582, %f581;\n" -" mov.f32 %f583, %f8;\n" -" fma.rn.ftz.f32 %f584, %f582, %f574, %f583;\n" -" mov.f32 %f8, %f584;\n" -" .loc 17 331 0\n" -" mov.f32 %f585, %f12;\n" -" fma.rn.ftz.f32 %f586, %f578, %f574, %f585;\n" -" mov.f32 %f12, %f586;\n" -" .loc 17 333 0\n" -" add.ftz.f32 %f136, %f576, %f136;\n" -" .loc 17 334 0\n" -" mov.f32 %f587, %f10;\n" -" mul.ftz.f32 %f588, %f303, %f502;\n" -" neg.ftz.f32 %f589, %f588;\n" -" fma.rn.ftz.f32 %f590, %f589, %f576, %f587;\n" -" mov.f32 %f10, %f590;\n" -" .loc 17 335 0\n" -" mov.f32 %f591, %f14;\n" -" fma.rn.ftz.f32 %f592, %f578, %f576, %f591;\n" -" mov.f32 %f14, %f592;\n" -" .loc 17 336 0\n" -" fma.rn.ftz.f32 %f15, %f582, %f576, %f15;\n" -" mov.f32 %f16, %f15;\n" -" bra.uni $Lt_0_50434;\n" -"$Lt_0_50690:\n" -" .loc 17 338 0\n" -" add.ftz.f32 %f138, %f572, %f138;\n" -" .loc 17 339 0\n" -" add.ftz.f32 %f137, %f574, %f137;\n" -" .loc 17 340 0\n" -" add.ftz.f32 %f136, %f576, %f136;\n" -"$Lt_0_50434:\n" -" .loc 17 347 0\n" -" rcp.approx.ftz.f32 %f593, %f395;\n" -" mul.ftz.f32 %f594, %f513, %f399;\n" -" mul.ftz.f32 %f595, %f594, %f140;\n" -" neg.ftz.f32 %f596, %f595;\n" -" mul.ftz.f32 %f597, %f513, %f563;\n" -" mul.ftz.f32 %f598, %f54, %f176;\n" -" mul.ftz.f32 %f599, %f60, %f176;\n" -" add.ftz.f32 %f600, %f176, %f176;\n" -" mul.ftz.f32 %f601, %f52, %f176;\n" -" mul.ftz.f32 %f602, %f47, %f176;\n" -" mul.ftz.f32 %f603, %f69, %f176;\n" -" mul.ftz.f32 %f604, %f61, %f176;\n" -" add.ftz.f32 %f605, %f227, %f227;\n" -" mul.ftz.f32 %f606, %f46, %f227;\n" -" mul.ftz.f32 %f607, %f59, %f227;\n" -" mul.ftz.f32 %f608, %f52, %f227;\n" -" mul.ftz.f32 %f609, %f47, %f227;\n" -" mul.ftz.f32 %f610, %f54, %f210;\n" -" add.ftz.f32 %f611, %f210, %f210;\n" -" mul.ftz.f32 %f612, %f46, %f210;\n" -" mul.ftz.f32 %f613, %f52, %f210;\n" -" mul.ftz.f32 %f614, %f51, %f210;\n" -" mul.ftz.f32 %f615, %f84, %f210;\n" -" mul.ftz.f32 %f616, %f46, %f203;\n" -" mul.ftz.f32 %f617, %f59, %f203;\n" -" mul.ftz.f32 %f618, %f51, %f203;\n" -" mul.ftz.f32 %f619, %f69, %f203;\n" -" mul.ftz.f32 %f620, %f227, %f220;\n" -" mul.ftz.f32 %f621, %f61, %f220;\n" -" add.ftz.f32 %f622, %f237, %f237;\n" -" mul.ftz.f32 %f623, %f237, %f210;\n" -" mul.ftz.f32 %f624, %f59, %f237;\n" -" mul.ftz.f32 %f625, %f597, %f140;\n" -" mul.ftz.f32 %f626, %f600, %f237;\n" -" mul.ftz.f32 %f627, %f60, %f605;\n" -" mul.ftz.f32 %f628, %f605, %f203;\n" -" mul.ftz.f32 %f629, %f237, %f191;\n" -" mul.ftz.f32 %f630, %f54, %f191;\n" -" mul.ftz.f32 %f631, %f220, %f191;\n" -" mul.ftz.f32 %f632, %f61, %f191;\n" -" add.ftz.f32 %f633, %f215, %f215;\n" -" mul.ftz.f32 %f634, %f227, %f215;\n" -" mul.ftz.f32 %f635, %f232, %f210;\n" -" mul.ftz.f32 %f636, %f53, %f232;\n" -" mul.ftz.f32 %f637, %f611, %f191;\n" -" mul.ftz.f32 %f638, %f52, %f611;\n" -" mul.ftz.f32 %f639, %f616, %f215;\n" -" mul.ftz.f32 %f640, %f617, %f215;\n" -" mul.ftz.f32 %f641, %f618, %f232;\n" -" mul.ftz.f32 %f642, %f618, %f215;\n" -" mul.ftz.f32 %f643, %f622, %f176;\n" -" mul.ftz.f32 %f644, %f624, %f191;\n" -" neg.ftz.f32 %f645, %f625;\n" -" mul.ftz.f32 %f646, %f46, %f629;\n" -" mul.ftz.f32 %f647, %f633, %f176;\n" -" mul.ftz.f32 %f648, %f61, %f633;\n" -" mul.ftz.f32 %f649, %f46, %f631;\n" -" sub.ftz.f32 %f650, %f649, %f639;\n" -" mul.ftz.f32 %f651, %f59, %f631;\n" -" sub.ftz.f32 %f652, %f651, %f640;\n" -" mul.ftz.f32 %f653, %f51, %f629;\n" -" sub.ftz.f32 %f654, %f653, %f641;\n" -" mul.ftz.f32 %f655, %f51, %f631;\n" -" sub.ftz.f32 %f656, %f655, %f642;\n" -" mul.ftz.f32 %f657, %f232, %f617;\n" -" sub.ftz.f32 %f658, %f657, %f644;\n" -" mul.ftz.f32 %f659, %f232, %f616;\n" -" sub.ftz.f32 %f660, %f659, %f646;\n" -" mul.ftz.f32 %f661, %f60, %f374;\n" -" sub.ftz.f32 %f662, %f650, %f661;\n" -" mul.ftz.f32 %f663, %f47, %f374;\n" -" sub.ftz.f32 %f664, %f652, %f663;\n" -" mul.ftz.f32 %f665, %f237, %f603;\n" -" sub.ftz.f32 %f666, %f654, %f665;\n" -" mul.ftz.f32 %f667, %f53, %f374;\n" -" sub.ftz.f32 %f668, %f656, %f667;\n" -" fma.rn.ftz.f32 %f669, %f47, %f626, %f658;\n" -" fma.rn.ftz.f32 %f670, %f60, %f643, %f660;\n" -" fma.rn.ftz.f32 %f671, %f60, %f372, %f662;\n" -" fma.rn.ftz.f32 %f672, %f47, %f372, %f664;\n" -" fma.rn.ftz.f32 %f673, %f176, %f621, %f666;\n" -" fma.rn.ftz.f32 %f674, %f53, %f372, %f668;\n" -" mul.ftz.f32 %f675, %f220, %f601;\n" -" sub.ftz.f32 %f676, %f669, %f675;\n" -" mul.ftz.f32 %f677, %f220, %f598;\n" -" sub.ftz.f32 %f678, %f670, %f677;\n" -" fma.rn.ftz.f32 %f679, %f54, %f647, %f671;\n" -" mul.ftz.f32 %f680, %f232, %f602;\n" -" sub.ftz.f32 %f681, %f672, %f680;\n" -" fma.rn.ftz.f32 %f682, %f227, %f619, %f673;\n" -" mul.ftz.f32 %f683, %f51, %f634;\n" -" sub.ftz.f32 %f684, %f674, %f683;\n" -" mul.ftz.f32 %f685, %f47, %f628;\n" -" sub.ftz.f32 %f686, %f676, %f685;\n" -" mul.ftz.f32 %f687, %f203, %f627;\n" -" sub.ftz.f32 %f688, %f678, %f687;\n" -" mul.ftz.f32 %f689, %f232, %f599;\n" -" sub.ftz.f32 %f690, %f679, %f689;\n" -" mul.ftz.f32 %f691, %f59, %f634;\n" -" sub.ftz.f32 %f692, %f681, %f691;\n" -" fma.rn.ftz.f32 %f693, %f237, %f614, %f682;\n" -" mul.ftz.f32 %f694, %f176, %f636;\n" -" sub.ftz.f32 %f695, %f684, %f694;\n" -" fma.rn.ftz.f32 %f696, %f203, %f613, %f686;\n" -" mul.ftz.f32 %f697, %f46, %f623;\n" -" sub.ftz.f32 %f698, %f688, %f697;\n" -" fma.rn.ftz.f32 %f699, %f60, %f376, %f690;\n" -" fma.rn.ftz.f32 %f700, %f52, %f647, %f692;\n" -" mul.ftz.f32 %f701, %f61, %f372;\n" -" sub.ftz.f32 %f702, %f693, %f701;\n" -" fma.rn.ftz.f32 %f703, %f176, %f648, %f695;\n" -" mul.ftz.f32 %f704, %f59, %f623;\n" -" sub.ftz.f32 %f705, %f696, %f704;\n" -" fma.rn.ftz.f32 %f706, %f46, %f620, %f698;\n" -" mul.ftz.f32 %f707, %f215, %f606;\n" -" sub.ftz.f32 %f708, %f699, %f707;\n" -" mul.ftz.f32 %f709, %f191, %f638;\n" -" sub.ftz.f32 %f710, %f700, %f709;\n" -" mul.ftz.f32 %f711, %f51, %f620;\n" -" sub.ftz.f32 %f712, %f702, %f711;\n" -" fma.rn.ftz.f32 %f713, %f51, %f635, %f703;\n" -" fma.rn.ftz.f32 %f714, %f220, %f607, %f705;\n" -" fma.rn.ftz.f32 %f715, %f203, %f610, %f706;\n" -" mul.ftz.f32 %f716, %f54, %f637;\n" -" sub.ftz.f32 %f717, %f708, %f716;\n" -" fma.rn.ftz.f32 %f718, %f59, %f635, %f710;\n" -" fma.rn.ftz.f32 %f719, %f232, %f604, %f712;\n" -" fma.rn.ftz.f32 %f720, %f53, %f376, %f713;\n" -" fma.rn.ftz.f32 %f721, %f191, %f608, %f714;\n" -" mul.ftz.f32 %f722, %f232, %f598;\n" -" sub.ftz.f32 %f723, %f715, %f722;\n" -" fma.rn.ftz.f32 %f724, %f232, %f612, %f717;\n" -" fma.rn.ftz.f32 %f725, %f191, %f609, %f718;\n" -" mul.ftz.f32 %f726, %f227, %f632;\n" -" sub.ftz.f32 %f727, %f726, %f719;\n" -" mul.ftz.f32 %f728, %f191, %f615;\n" -" sub.ftz.f32 %f729, %f720, %f728;\n" -" mul.ftz.f32 %f730, %f232, %f601;\n" -" sub.ftz.f32 %f731, %f721, %f730;\n" -" fma.rn.ftz.f32 %f732, %f227, %f630, %f723;\n" -" mul.ftz.f32 %f733, %f724, %f22;\n" -" mul.ftz.f32 %f734, %f725, %f21;\n" -" mul.ftz.f32 %f735, %f727, %f23;\n" -" mul.ftz.f32 %f736, %f729, %f23;\n" -" mul.ftz.f32 %f737, %f731, %f21;\n" -" mul.ftz.f32 %f738, %f732, %f22;\n" -" mul.ftz.f32 %f739, %f593, %f733;\n" -" mul.ftz.f32 %f740, %f593, %f734;\n" -" mul.ftz.f32 %f741, %f593, %f735;\n" -" mul.ftz.f32 %f742, %f593, %f736;\n" -" mul.ftz.f32 %f743, %f593, %f737;\n" -" mul.ftz.f32 %f744, %f593, %f738;\n" -" mul.ftz.f32 %f745, %f739, %f401;\n" -" mul.ftz.f32 %f746, %f740, %f401;\n" -" mul.ftz.f32 %f747, %f741, %f401;\n" -" mul.ftz.f32 %f748, %f742, %f401;\n" -" mul.ftz.f32 %f749, %f743, %f401;\n" -" mul.ftz.f32 %f750, %f744, %f401;\n" -" mul.ftz.f32 %f751, %f569, %f555;\n" -" mul.ftz.f32 %f752, %f52, %f749;\n" -" mul.ftz.f32 %f753, %f47, %f746;\n" -" sub.ftz.f32 %f754, %f753, %f752;\n" -" mul.ftz.f32 %f755, %f54, %f750;\n" -" mul.ftz.f32 %f756, %f745, %f60;\n" -" sub.ftz.f32 %f757, %f756, %f755;\n" -" add.ftz.f32 %f758, %f754, %f757;\n" -" mul.ftz.f32 %f759, %f61, %f747;\n" -" mul.ftz.f32 %f760, %f748, %f53;\n" -" sub.ftz.f32 %f761, %f760, %f759;\n" -" add.ftz.f32 %f762, %f758, %f761;\n" -" mul.ftz.f32 %f763, %f762, %f645;\n" -" sub.ftz.f32 %f764, %f763, %f751;\n" -" fma.rn.ftz.f32 %f765, %f357, %f596, %f764;\n" -" add.ftz.f32 %f135, %f135, %f765;\n" -" .loc 17 348 0\n" -" mul.ftz.f32 %f766, %f54, %f227;\n" -" mul.ftz.f32 %f767, %f53, %f210;\n" -" mul.ftz.f32 %f768, %f47, %f203;\n" -" mul.ftz.f32 %f769, %f60, %f203;\n" -" add.ftz.f32 %f770, %f220, %f220;\n" -" mul.ftz.f32 %f771, %f47, %f191;\n" -" mul.ftz.f32 %f772, %f60, %f191;\n" -" mul.ftz.f32 %f773, %f53, %f191;\n" -" mul.ftz.f32 %f774, %f52, %f215;\n" -" mul.ftz.f32 %f775, %f54, %f215;\n" -" mul.ftz.f32 %f776, %f215, %f203;\n" -" mul.ftz.f32 %f777, %f232, %f203;\n" -" mul.ftz.f32 %f778, %f64, %f232;\n" -" mul.ftz.f32 %f779, %f59, %f770;\n" -" mul.ftz.f32 %f780, %f46, %f770;\n" -" mul.ftz.f32 %f781, %f52, %f631;\n" -" mul.ftz.f32 %f782, %f633, %f237;\n" -" mul.ftz.f32 %f783, %f51, %f633;\n" -" mul.ftz.f32 %f784, %f775, %f203;\n" -" mul.ftz.f32 %f785, %f61, %f776;\n" -" fma.rn.ftz.f32 %f786, %f59, %f782, %f781;\n" -" mul.ftz.f32 %f787, %f46, %f782;\n" -" sub.ftz.f32 %f788, %f787, %f784;\n" -" mul.ftz.f32 %f789, %f61, %f631;\n" -" sub.ftz.f32 %f790, %f789, %f785;\n" -" mul.ftz.f32 %f791, %f203, %f774;\n" -" sub.ftz.f32 %f792, %f786, %f791;\n" -" fma.rn.ftz.f32 %f793, %f54, %f631, %f788;\n" -" fma.rn.ftz.f32 %f794, %f237, %f783, %f790;\n" -" mul.ftz.f32 %f795, %f232, %f779;\n" -" sub.ftz.f32 %f796, %f792, %f795;\n" -" mul.ftz.f32 %f797, %f232, %f780;\n" -" sub.ftz.f32 %f798, %f793, %f797;\n" -" mul.ftz.f32 %f799, %f237, %f773;\n" -" sub.ftz.f32 %f800, %f794, %f799;\n" -" fma.rn.ftz.f32 %f801, %f232, %f768, %f796;\n" -" fma.rn.ftz.f32 %f802, %f232, %f769, %f798;\n" -" fma.rn.ftz.f32 %f803, %f53, %f777, %f800;\n" -" mul.ftz.f32 %f804, %f237, %f771;\n" -" sub.ftz.f32 %f805, %f801, %f804;\n" -" mul.ftz.f32 %f806, %f237, %f772;\n" -" sub.ftz.f32 %f807, %f802, %f806;\n" -" mul.ftz.f32 %f808, %f220, %f778;\n" -" sub.ftz.f32 %f809, %f803, %f808;\n" -" mul.ftz.f32 %f810, %f47, %f623;\n" -" sub.ftz.f32 %f811, %f805, %f810;\n" -" mul.ftz.f32 %f812, %f60, %f623;\n" -" sub.ftz.f32 %f813, %f807, %f812;\n" -" mul.ftz.f32 %f814, %f237, %f767;\n" -" sub.ftz.f32 %f815, %f809, %f814;\n" -" fma.rn.ftz.f32 %f816, %f47, %f620, %f811;\n" -" fma.rn.ftz.f32 %f817, %f60, %f620, %f813;\n" -" fma.rn.ftz.f32 %f818, %f53, %f620, %f815;\n" -" fma.rn.ftz.f32 %f819, %f232, %f613, %f816;\n" -" mul.ftz.f32 %f820, %f215, %f766;\n" -" sub.ftz.f32 %f821, %f817, %f820;\n" -" mul.ftz.f32 %f822, %f61, %f634;\n" -" sub.ftz.f32 %f823, %f818, %f822;\n" -" mul.ftz.f32 %f824, %f215, %f608;\n" -" sub.ftz.f32 %f825, %f819, %f824;\n" -" fma.rn.ftz.f32 %f826, %f232, %f610, %f821;\n" -" fma.rn.ftz.f32 %f827, %f61, %f635, %f823;\n" -" mul.ftz.f32 %f828, %f825, %f21;\n" -" mul.ftz.f32 %f829, %f826, %f22;\n" -" mul.ftz.f32 %f830, %f827, %f23;\n" -" mul.ftz.f32 %f831, %f593, %f828;\n" -" mul.ftz.f32 %f832, %f593, %f829;\n" -" mul.ftz.f32 %f833, %f593, %f830;\n" -" mul.ftz.f32 %f834, %f831, %f401;\n" -" mul.ftz.f32 %f835, %f832, %f401;\n" -" mul.ftz.f32 %f836, %f833, %f401;\n" -" mul.ftz.f32 %f837, %f569, %f556;\n" -" mul.ftz.f32 %f838, %f46, %f745;\n" -" mul.ftz.f32 %f839, %f835, %f54;\n" -" sub.ftz.f32 %f840, %f839, %f838;\n" -" mul.ftz.f32 %f841, %f59, %f746;\n" -" mul.ftz.f32 %f842, %f834, %f52;\n" -" sub.ftz.f32 %f843, %f842, %f841;\n" -" add.ftz.f32 %f844, %f840, %f843;\n" -" mul.ftz.f32 %f845, %f51, %f748;\n" -" mul.ftz.f32 %f846, %f836, %f61;\n" -" sub.ftz.f32 %f847, %f846, %f845;\n" -" add.ftz.f32 %f848, %f844, %f847;\n" -" mul.ftz.f32 %f849, %f848, %f645;\n" -" sub.ftz.f32 %f850, %f849, %f837;\n" -" fma.rn.ftz.f32 %f851, %f364, %f596, %f850;\n" -" add.ftz.f32 %f134, %f134, %f851;\n" -" .loc 17 349 0\n" -" mul.ftz.f32 %f852, %f569, %f557;\n" -" mul.ftz.f32 %f853, %f47, %f834;\n" -" mul.ftz.f32 %f854, %f59, %f749;\n" -" sub.ftz.f32 %f855, %f854, %f853;\n" -" mul.ftz.f32 %f856, %f60, %f835;\n" -" mul.ftz.f32 %f857, %f750, %f46;\n" -" sub.ftz.f32 %f858, %f857, %f856;\n" -" add.ftz.f32 %f859, %f855, %f858;\n" -" mul.ftz.f32 %f860, %f53, %f836;\n" -" mul.ftz.f32 %f861, %f747, %f51;\n" -" sub.ftz.f32 %f862, %f861, %f860;\n" -" add.ftz.f32 %f863, %f859, %f862;\n" -" mul.ftz.f32 %f864, %f863, %f645;\n" -" sub.ftz.f32 %f865, %f864, %f852;\n" -" fma.rn.ftz.f32 %f866, %f367, %f596, %f865;\n" -" add.ftz.f32 %f133, %f133, %f866;\n" -" mul.lo.s32 %r35, %r14, %r1;\n" -" cvt.s64.s32 %rd54, %r35;\n" -" mul.wide.s32 %rd55, %r35, 4;\n" -" add.u64 %rd25, %rd25, %rd55;\n" -" setp.gt.u64 %p23, %rd28, %rd25;\n" -" @%p23 bra $Lt_0_40962;\n" -" bra.uni $Lt_0_40450;\n" -"$Lt_0_56834:\n" -" mov.f32 %f133, 0f00000000; \n" -" mov.f32 %f134, 0f00000000; \n" -" mov.f32 %f135, 0f00000000; \n" -" mov.f32 %f136, 0f00000000; \n" -" mov.f32 %f137, 0f00000000; \n" -" mov.f32 %f138, 0f00000000; \n" -" mov.f32 %f139, 0f00000000; \n" -"$Lt_0_40450:\n" -" mov.u32 %r36, 1;\n" -" setp.le.s32 %p24, %r1, %r36;\n" -" @%p24 bra $Lt_0_53250;\n" -" .loc 17 352 0\n" -" mov.u64 %rd56, __cuda___cuda_local_var_33207_55_non_const_red_acc144;\n" -" cvt.s64.s32 %rd57, %r2;\n" -" mul.wide.s32 %rd58, %r2, 4;\n" -" add.u64 %rd59, %rd56, %rd58;\n" -" mov.f32 %f867, %f138;\n" -" st.shared.f32 [%rd59+0], %f867;\n" -" mov.f32 %f868, %f137;\n" -" st.shared.f32 [%rd59+512], %f868;\n" -" mov.f32 %f869, %f136;\n" -" st.shared.f32 [%rd59+1024], %f869;\n" -" mov.f32 %f870, %f135;\n" -" st.shared.f32 [%rd59+1536], %f870;\n" -" mov.f32 %f871, %f134;\n" -" st.shared.f32 [%rd59+2048], %f871;\n" -" mov.f32 %f872, %f133;\n" -" st.shared.f32 [%rd59+2560], %f872;\n" -" shr.s32 %r37, %r1, 31;\n" -" mov.s32 %r38, 1;\n" -" and.b32 %r39, %r37, %r38;\n" -" add.s32 %r40, %r39, %r1;\n" -" shr.s32 %r41, %r40, 1;\n" -" mov.s32 %r42, %r41;\n" -" mov.u32 %r43, 0;\n" -" setp.ne.u32 %p25, %r41, %r43;\n" -" @!%p25 bra $Lt_0_51714;\n" -"$Lt_0_52226:\n" -" setp.ge.u32 %p26, %r16, %r42;\n" -" @%p26 bra $Lt_0_52482;\n" -" add.u32 %r44, %r2, %r42;\n" -" cvt.u64.u32 %rd60, %r44;\n" -" mul.wide.u32 %rd61, %r44, 4;\n" -" add.u64 %rd62, %rd56, %rd61;\n" -" ld.shared.f32 %f873, [%rd62+0];\n" -" add.ftz.f32 %f867, %f873, %f867;\n" -" st.shared.f32 [%rd59+0], %f867;\n" -" ld.shared.f32 %f874, [%rd62+512];\n" -" add.ftz.f32 %f868, %f874, %f868;\n" -" st.shared.f32 [%rd59+512], %f868;\n" -" ld.shared.f32 %f875, [%rd62+1024];\n" -" add.ftz.f32 %f869, %f875, %f869;\n" -" st.shared.f32 [%rd59+1024], %f869;\n" -" ld.shared.f32 %f876, [%rd62+1536];\n" -" add.ftz.f32 %f870, %f876, %f870;\n" -" st.shared.f32 [%rd59+1536], %f870;\n" -" ld.shared.f32 %f877, [%rd62+2048];\n" -" add.ftz.f32 %f871, %f877, %f871;\n" -" st.shared.f32 [%rd59+2048], %f871;\n" -" ld.shared.f32 %f878, [%rd62+2560];\n" -" add.ftz.f32 %f872, %f878, %f872;\n" -" st.shared.f32 [%rd59+2560], %f872;\n" -"$Lt_0_52482:\n" -" shr.u32 %r42, %r42, 1;\n" -" mov.u32 %r45, 0;\n" -" setp.ne.u32 %p27, %r42, %r45;\n" -" @%p27 bra $Lt_0_52226;\n" -"$Lt_0_51714:\n" -" mov.f32 %f138, %f867;\n" -" mov.f32 %f137, %f868;\n" -" mov.f32 %f136, %f869;\n" -" mov.f32 %f135, %f870;\n" -" mov.f32 %f134, %f871;\n" -" mov.f32 %f133, %f872;\n" -" ld.param.s32 %r46, [__cudaparm_kernel_ellipsoid_eflag];\n" -" mov.s32 %r47, 0;\n" -" set.gt.u32.s32 %r48, %r46, %r47;\n" -" neg.s32 %r49, %r48;\n" -" ld.param.s32 %r50, [__cudaparm_kernel_ellipsoid_vflag];\n" -" mov.s32 %r51, 0;\n" -" set.gt.u32.s32 %r52, %r50, %r51;\n" -" neg.s32 %r53, %r52;\n" -" or.b32 %r54, %r49, %r53;\n" -" mov.u32 %r55, 0;\n" -" setp.eq.s32 %p28, %r54, %r55;\n" -" @%p28 bra $Lt_0_53250;\n" -" mov.f32 %f867, %f6;\n" -" st.shared.f32 [%rd59+0], %f867;\n" -" mov.f32 %f868, %f8;\n" -" st.shared.f32 [%rd59+512], %f868;\n" -" mov.f32 %f869, %f10;\n" -" st.shared.f32 [%rd59+1024], %f869;\n" -" mov.f32 %f870, %f12;\n" -" st.shared.f32 [%rd59+1536], %f870;\n" -" mov.f32 %f871, %f14;\n" -" st.shared.f32 [%rd59+2048], %f871;\n" -" mov.f32 %f872, %f15;\n" -" st.shared.f32 [%rd59+2560], %f872;\n" -" mov.f32 %f879, %f139;\n" -" st.shared.f32 [%rd59+3072], %f879;\n" -" mov.s32 %r56, %r41;\n" -" @!%p25 bra $Lt_0_53762;\n" -"$Lt_0_54274:\n" -" setp.ge.u32 %p29, %r16, %r56;\n" -" @%p29 bra $Lt_0_54530;\n" -" add.u32 %r57, %r2, %r56;\n" -" cvt.u64.u32 %rd63, %r57;\n" -" mul.wide.u32 %rd64, %r57, 4;\n" -" add.u64 %rd65, %rd56, %rd64;\n" -" ld.shared.f32 %f880, [%rd65+0];\n" -" add.ftz.f32 %f867, %f880, %f867;\n" -" st.shared.f32 [%rd59+0], %f867;\n" -" ld.shared.f32 %f881, [%rd65+512];\n" -" add.ftz.f32 %f868, %f881, %f868;\n" -" st.shared.f32 [%rd59+512], %f868;\n" -" ld.shared.f32 %f882, [%rd65+1024];\n" -" add.ftz.f32 %f869, %f882, %f869;\n" -" st.shared.f32 [%rd59+1024], %f869;\n" -" ld.shared.f32 %f883, [%rd65+1536];\n" -" add.ftz.f32 %f870, %f883, %f870;\n" -" st.shared.f32 [%rd59+1536], %f870;\n" -" ld.shared.f32 %f884, [%rd65+2048];\n" -" add.ftz.f32 %f871, %f884, %f871;\n" -" st.shared.f32 [%rd59+2048], %f871;\n" -" ld.shared.f32 %f885, [%rd65+2560];\n" -" add.ftz.f32 %f872, %f885, %f872;\n" -" st.shared.f32 [%rd59+2560], %f872;\n" -" ld.shared.f32 %f886, [%rd65+3072];\n" -" add.ftz.f32 %f879, %f886, %f879;\n" -" st.shared.f32 [%rd59+3072], %f879;\n" -"$Lt_0_54530:\n" -" shr.u32 %r56, %r56, 1;\n" -" mov.u32 %r58, 0;\n" -" setp.ne.u32 %p30, %r56, %r58;\n" -" @%p30 bra $Lt_0_54274;\n" -"$Lt_0_53762:\n" -" mov.f32 %f6, %f867;\n" -" mov.f32 %f8, %f868;\n" -" mov.f32 %f10, %f869;\n" -" mov.f32 %f12, %f870;\n" -" mov.f32 %f14, %f871;\n" -" mov.f32 %f16, %f872;\n" -" mov.f32 %f139, %f879;\n" -"$Lt_0_53250:\n" -"$Lt_0_51202:\n" -" mov.u32 %r59, 0;\n" -" setp.ne.s32 %p31, %r16, %r59;\n" -" @%p31 bra $Lt_0_55298;\n" -" ld.param.u64 %rd66, [__cudaparm_kernel_ellipsoid_engv];\n" -" add.u64 %rd67, %rd66, %rd3;\n" -" ld.param.s32 %r60, [__cudaparm_kernel_ellipsoid_astride];\n" -" ld.param.s32 %r61, [__cudaparm_kernel_ellipsoid_eflag];\n" -" mov.u32 %r62, 0;\n" -" setp.le.s32 %p32, %r61, %r62;\n" -" @%p32 bra $Lt_0_55810;\n" -" st.global.f32 [%rd67+0], %f139;\n" -" cvt.s64.s32 %rd68, %r60;\n" -" mul.wide.s32 %rd69, %r60, 4;\n" -" add.u64 %rd67, %rd67, %rd69;\n" -"$Lt_0_55810:\n" -" ld.param.s32 %r63, [__cudaparm_kernel_ellipsoid_vflag];\n" -" mov.u32 %r64, 0;\n" -" setp.le.s32 %p33, %r63, %r64;\n" -" @%p33 bra $Lt_0_56322;\n" -" mov.f32 %f887, %f6;\n" -" st.global.f32 [%rd67+0], %f887;\n" -" cvt.s64.s32 %rd70, %r60;\n" -" mul.wide.s32 %rd71, %r60, 4;\n" -" add.u64 %rd72, %rd71, %rd67;\n" -" mov.f32 %f888, %f8;\n" -" st.global.f32 [%rd72+0], %f888;\n" -" add.u64 %rd73, %rd71, %rd72;\n" -" mov.f32 %f889, %f10;\n" -" st.global.f32 [%rd73+0], %f889;\n" -" add.u64 %rd74, %rd71, %rd73;\n" -" mov.f32 %f890, %f12;\n" -" st.global.f32 [%rd74+0], %f890;\n" -" add.u64 %rd67, %rd71, %rd74;\n" -" mov.f32 %f891, %f14;\n" -" st.global.f32 [%rd67+0], %f891;\n" -" mov.f32 %f892, %f16;\n" -" add.u64 %rd75, %rd71, %rd67;\n" -" st.global.f32 [%rd75+0], %f892;\n" -"$Lt_0_56322:\n" -" ld.param.u64 %rd76, [__cudaparm_kernel_ellipsoid_ans];\n" -" mul.lo.u64 %rd77, %rd2, 16;\n" -" add.u64 %rd78, %rd76, %rd77;\n" -" mov.f32 %f893, %f894;\n" -" st.global.v4.f32 [%rd78+0], {%f138,%f137,%f136,%f893};\n" -" add.s32 %r65, %r8, %r60;\n" -" cvt.s64.s32 %rd79, %r65;\n" -" mul.wide.s32 %rd80, %r65, 16;\n" -" add.u64 %rd81, %rd76, %rd80;\n" -" mov.f32 %f895, %f896;\n" -" st.global.v4.f32 [%rd81+0], {%f135,%f134,%f133,%f895};\n" -"$Lt_0_55298:\n" -"$Lt_0_39938:\n" -" .loc 17 355 0\n" -" exit;\n" -"$LDWend_kernel_ellipsoid:\n" -" }\n" -; diff --git a/lib/gpu/lal_ans.o b/lib/gpu/lal_ans.o deleted file mode 100644 index bf704f85c5..0000000000 Binary files a/lib/gpu/lal_ans.o and /dev/null differ diff --git a/lib/gpu/lal_atom.o b/lib/gpu/lal_atom.o deleted file mode 100644 index 1c3379497e..0000000000 Binary files a/lib/gpu/lal_atom.o and /dev/null differ diff --git a/lib/gpu/lal_base_atomic.o b/lib/gpu/lal_base_atomic.o deleted file mode 100644 index 9ed10a2d39..0000000000 Binary files a/lib/gpu/lal_base_atomic.o and /dev/null differ diff --git a/lib/gpu/lal_base_charge.o b/lib/gpu/lal_base_charge.o deleted file mode 100644 index ab735d74db..0000000000 Binary files a/lib/gpu/lal_base_charge.o and /dev/null differ diff --git a/lib/gpu/lal_base_ellipsoid.o b/lib/gpu/lal_base_ellipsoid.o deleted file mode 100644 index 8fe2f1f638..0000000000 Binary files a/lib/gpu/lal_base_ellipsoid.o and /dev/null differ diff --git a/lib/gpu/lal_cg_cmm.o b/lib/gpu/lal_cg_cmm.o deleted file mode 100644 index 88fdd2fc7a..0000000000 Binary files a/lib/gpu/lal_cg_cmm.o and /dev/null differ diff --git a/lib/gpu/lal_cg_cmm_ext.o b/lib/gpu/lal_cg_cmm_ext.o deleted file mode 100644 index 93408bb679..0000000000 Binary files a/lib/gpu/lal_cg_cmm_ext.o and /dev/null differ diff --git a/lib/gpu/lal_cg_cmm_long.o b/lib/gpu/lal_cg_cmm_long.o deleted file mode 100644 index 45b416f03b..0000000000 Binary files a/lib/gpu/lal_cg_cmm_long.o and /dev/null differ diff --git a/lib/gpu/lal_cg_cmm_long_ext.o b/lib/gpu/lal_cg_cmm_long_ext.o deleted file mode 100644 index 5131cb7530..0000000000 Binary files a/lib/gpu/lal_cg_cmm_long_ext.o and /dev/null differ diff --git a/lib/gpu/lal_charmm_long.o b/lib/gpu/lal_charmm_long.o deleted file mode 100644 index f5baeb52bf..0000000000 Binary files a/lib/gpu/lal_charmm_long.o and /dev/null differ diff --git a/lib/gpu/lal_charmm_long_ext.o b/lib/gpu/lal_charmm_long_ext.o deleted file mode 100644 index 5a5fe16c2e..0000000000 Binary files a/lib/gpu/lal_charmm_long_ext.o and /dev/null differ diff --git a/lib/gpu/lal_coul_long.o b/lib/gpu/lal_coul_long.o deleted file mode 100644 index bf1455194f..0000000000 Binary files a/lib/gpu/lal_coul_long.o and /dev/null differ diff --git a/lib/gpu/lal_coul_long_ext.o b/lib/gpu/lal_coul_long_ext.o deleted file mode 100644 index d2407cfe75..0000000000 Binary files a/lib/gpu/lal_coul_long_ext.o and /dev/null differ diff --git a/lib/gpu/lal_device.o b/lib/gpu/lal_device.o deleted file mode 100644 index 5c8d20504b..0000000000 Binary files a/lib/gpu/lal_device.o and /dev/null differ diff --git a/lib/gpu/lal_gayberne.o b/lib/gpu/lal_gayberne.o deleted file mode 100644 index 71b1d41474..0000000000 Binary files a/lib/gpu/lal_gayberne.o and /dev/null differ diff --git a/lib/gpu/lal_gayberne_ext.o b/lib/gpu/lal_gayberne_ext.o deleted file mode 100644 index 0dab91ad12..0000000000 Binary files a/lib/gpu/lal_gayberne_ext.o and /dev/null differ diff --git a/lib/gpu/lal_lj.o b/lib/gpu/lal_lj.o deleted file mode 100644 index 46c1921dc3..0000000000 Binary files a/lib/gpu/lal_lj.o and /dev/null differ diff --git a/lib/gpu/lal_lj96.o b/lib/gpu/lal_lj96.o deleted file mode 100644 index 7263e2b7d7..0000000000 Binary files a/lib/gpu/lal_lj96.o and /dev/null differ diff --git a/lib/gpu/lal_lj96_ext.o b/lib/gpu/lal_lj96_ext.o deleted file mode 100644 index 91827d1ae8..0000000000 Binary files a/lib/gpu/lal_lj96_ext.o and /dev/null differ diff --git a/lib/gpu/lal_lj_class2_long.o b/lib/gpu/lal_lj_class2_long.o deleted file mode 100644 index b6006a7be5..0000000000 Binary files a/lib/gpu/lal_lj_class2_long.o and /dev/null differ diff --git a/lib/gpu/lal_lj_class2_long_ext.o b/lib/gpu/lal_lj_class2_long_ext.o deleted file mode 100644 index 49bee045b2..0000000000 Binary files a/lib/gpu/lal_lj_class2_long_ext.o and /dev/null differ diff --git a/lib/gpu/lal_lj_coul.o b/lib/gpu/lal_lj_coul.o deleted file mode 100644 index 6e251cfa18..0000000000 Binary files a/lib/gpu/lal_lj_coul.o and /dev/null differ diff --git a/lib/gpu/lal_lj_coul_ext.o b/lib/gpu/lal_lj_coul_ext.o deleted file mode 100644 index 239a24308b..0000000000 Binary files a/lib/gpu/lal_lj_coul_ext.o and /dev/null differ diff --git a/lib/gpu/lal_lj_coul_long.o b/lib/gpu/lal_lj_coul_long.o deleted file mode 100644 index 15ce8447a0..0000000000 Binary files a/lib/gpu/lal_lj_coul_long.o and /dev/null differ diff --git a/lib/gpu/lal_lj_coul_long_ext.o b/lib/gpu/lal_lj_coul_long_ext.o deleted file mode 100644 index 8946ee721c..0000000000 Binary files a/lib/gpu/lal_lj_coul_long_ext.o and /dev/null differ diff --git a/lib/gpu/lal_lj_expand.o b/lib/gpu/lal_lj_expand.o deleted file mode 100644 index 19e9fe682f..0000000000 Binary files a/lib/gpu/lal_lj_expand.o and /dev/null differ diff --git a/lib/gpu/lal_lj_expand_ext.o b/lib/gpu/lal_lj_expand_ext.o deleted file mode 100644 index a900ac9eb8..0000000000 Binary files a/lib/gpu/lal_lj_expand_ext.o and /dev/null differ diff --git a/lib/gpu/lal_lj_ext.o b/lib/gpu/lal_lj_ext.o deleted file mode 100644 index d1b8652b8a..0000000000 Binary files a/lib/gpu/lal_lj_ext.o and /dev/null differ diff --git a/lib/gpu/lal_morse.o b/lib/gpu/lal_morse.o deleted file mode 100644 index e325b2c5f7..0000000000 Binary files a/lib/gpu/lal_morse.o and /dev/null differ diff --git a/lib/gpu/lal_morse_ext.o b/lib/gpu/lal_morse_ext.o deleted file mode 100644 index b74e3c2d52..0000000000 Binary files a/lib/gpu/lal_morse_ext.o and /dev/null differ diff --git a/lib/gpu/lal_neighbor.o b/lib/gpu/lal_neighbor.o deleted file mode 100644 index f27fe9144c..0000000000 Binary files a/lib/gpu/lal_neighbor.o and /dev/null differ diff --git a/lib/gpu/lal_neighbor_shared.o b/lib/gpu/lal_neighbor_shared.o deleted file mode 100644 index b77e70255a..0000000000 Binary files a/lib/gpu/lal_neighbor_shared.o and /dev/null differ diff --git a/lib/gpu/lal_pppm.o b/lib/gpu/lal_pppm.o deleted file mode 100644 index 7648e2b13e..0000000000 Binary files a/lib/gpu/lal_pppm.o and /dev/null differ diff --git a/lib/gpu/lal_pppm_ext.o b/lib/gpu/lal_pppm_ext.o deleted file mode 100644 index d6413d9a80..0000000000 Binary files a/lib/gpu/lal_pppm_ext.o and /dev/null differ diff --git a/lib/gpu/lal_re_squared.o b/lib/gpu/lal_re_squared.o deleted file mode 100644 index 770bd3d8c3..0000000000 Binary files a/lib/gpu/lal_re_squared.o and /dev/null differ diff --git a/lib/gpu/lal_re_squared_ext.o b/lib/gpu/lal_re_squared_ext.o deleted file mode 100644 index 0445f65d6d..0000000000 Binary files a/lib/gpu/lal_re_squared_ext.o and /dev/null differ diff --git a/lib/gpu/lj.ptx b/lib/gpu/lj.ptx deleted file mode 100644 index a9cb5c978f..0000000000 --- a/lib/gpu/lj.ptx +++ /dev/null @@ -1,901 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009c40_00000000-9_lal_lj.cpp3.i (/home/sjplimp/ccBI#.N4UW9Z) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009c40_00000000-8_lal_lj.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_lj.cu" - .file 17 "/usr/local/cuda/include/common_functions.h" - .file 18 "/usr/local/cuda/include/math_functions.h" - .file 19 "/usr/local/cuda/include/math_constants.h" - .file 20 "/usr/local/cuda/include/device_functions.h" - .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - .global .texref pos_tex; - - .entry kernel_pair ( - .param .u64 __cudaparm_kernel_pair_x_, - .param .u64 __cudaparm_kernel_pair_lj1, - .param .u64 __cudaparm_kernel_pair_lj3, - .param .s32 __cudaparm_kernel_pair_lj_types, - .param .u64 __cudaparm_kernel_pair_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_dev_nbor, - .param .u64 __cudaparm_kernel_pair_dev_packed, - .param .u64 __cudaparm_kernel_pair_ans, - .param .u64 __cudaparm_kernel_pair___val_paramengv, - .param .s32 __cudaparm_kernel_pair_eflag, - .param .s32 __cudaparm_kernel_pair_vflag, - .param .s32 __cudaparm_kernel_pair_inum, - .param .s32 __cudaparm_kernel_pair_nbor_pitch, - .param .s32 __cudaparm_kernel_pair_t_per_atom) - { - .reg .u32 %r<72>; - .reg .u64 %rd<63>; - .reg .f32 %f<102>; - .reg .pred %p<19>; - .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16]; - .shared .align 4 .b8 __cuda___cuda_local_var_32600_55_non_const_red_acc108[3072]; - // __cuda_local_var_32543_10_non_const_f = 48 - // __cuda_local_var_32545_9_non_const_virial = 16 - .loc 16 31 0 -$LDWbegin_kernel_pair: - .loc 16 36 0 - ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; - ldu.global.f32 %f1, [%rd1+0]; - .loc 16 37 0 - ld.global.f32 %f2, [%rd1+4]; - .loc 16 38 0 - ld.global.f32 %f3, [%rd1+8]; - .loc 16 39 0 - ld.global.f32 %f4, [%rd1+12]; - st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4}; - .loc 16 46 0 - mov.f32 %f5, 0f00000000; // 0 - mov.f32 %f6, %f5; - mov.f32 %f7, 0f00000000; // 0 - mov.f32 %f8, %f7; - mov.f32 %f9, 0f00000000; // 0 - mov.f32 %f10, %f9; - mov.f32 %f11, 0f00000000; // 0 - mov.f32 %f12, %f11; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, %f13; - mov.f32 %f15, 0f00000000; // 0 - mov.f32 %f16, %f15; - ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_pair_inum]; - setp.ge.s32 %p1, %r8, %r9; - @%p1 bra $Lt_0_26370; - .loc 16 51 0 - ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch]; - cvt.s64.s32 %rd2, %r10; - mul.wide.s32 %rd3, %r10, 4; - cvt.s64.s32 %rd4, %r8; - mul.wide.s32 %rd5, %r8, 4; - ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor]; - add.u64 %rd7, %rd5, %rd6; - add.u64 %rd8, %rd3, %rd7; - ld.global.s32 %r11, [%rd8+0]; - sub.s32 %r12, %r1, 1; - and.b32 %r13, %r12, %r2; - cvt.s64.s32 %rd9, %r13; - mul.wide.s32 %rd10, %r13, 4; - ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed]; - setp.ne.u64 %p2, %rd11, %rd6; - @%p2 bra $Lt_0_19458; - cvt.s32.s64 %r14, %rd2; - mul.lo.s32 %r15, %r14, %r1; - mov.s32 %r16, %r15; - mul.lo.s32 %r17, %r12, %r8; - add.s32 %r18, %r14, %r17; - cvt.s64.s32 %rd12, %r18; - mul.wide.s32 %rd13, %r18, 4; - add.u64 %rd14, %rd8, %rd13; - and.b32 %r19, %r12, %r11; - cvt.s64.s32 %rd15, %r19; - div.s32 %r20, %r11, %r1; - mul.lo.s32 %r21, %r15, %r20; - cvt.s64.s32 %rd16, %r21; - add.u64 %rd17, %rd15, %rd16; - mul.lo.u64 %rd18, %rd17, 4; - add.u64 %rd19, %rd14, %rd18; - add.u64 %rd20, %rd10, %rd14; - bra.uni $Lt_0_19202; -$Lt_0_19458: - add.u64 %rd21, %rd3, %rd8; - ld.global.s32 %r22, [%rd21+0]; - cvt.s64.s32 %rd22, %r22; - mul.wide.s32 %rd23, %r22, 4; - add.u64 %rd24, %rd11, %rd23; - cvt.s64.s32 %rd25, %r11; - mul.wide.s32 %rd26, %r11, 4; - add.u64 %rd19, %rd24, %rd26; - mov.s32 %r16, %r1; - add.u64 %rd20, %rd10, %rd24; -$Lt_0_19202: - .loc 16 54 0 - ld.global.s32 %r23, [%rd7+0]; - mov.u32 %r24, %r23; - mov.s32 %r25, 0; - mov.u32 %r26, %r25; - mov.s32 %r27, 0; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}]; - mov.f32 %f21, %f17; - mov.f32 %f22, %f18; - mov.f32 %f23, %f19; - mov.f32 %f24, %f20; - setp.ge.u64 %p3, %rd20, %rd19; - @%p3 bra $Lt_0_27906; - cvt.rzi.ftz.s32.f32 %r31, %f24; - cvt.s64.s32 %rd27, %r16; - ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types]; - mul.lo.s32 %r33, %r32, %r31; - ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1]; - mov.f32 %f25, 0f00000000; // 0 - mov.f32 %f26, 0f00000000; // 0 - mov.f32 %f27, 0f00000000; // 0 - mov.f32 %f28, 0f00000000; // 0 - mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92; -$Lt_0_20226: - // Loop body line 54, nesting depth: 1, estimated iterations: unknown - .loc 16 60 0 - ld.global.s32 %r34, [%rd20+0]; - .loc 16 61 0 - shr.s32 %r35, %r34, 30; - and.b32 %r36, %r35, 3; - cvt.s64.s32 %rd30, %r36; - mul.wide.s32 %rd31, %r36, 4; - add.u64 %rd32, %rd29, %rd31; - ld.shared.f32 %f29, [%rd32+0]; - .loc 16 64 0 - and.b32 %r37, %r34, 1073741823; - mov.u32 %r38, %r37; - mov.s32 %r39, 0; - mov.u32 %r40, %r39; - mov.s32 %r41, 0; - mov.u32 %r42, %r41; - mov.s32 %r43, 0; - mov.u32 %r44, %r43; - tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}]; - mov.f32 %f34, %f30; - mov.f32 %f35, %f31; - mov.f32 %f36, %f32; - mov.f32 %f37, %f33; - cvt.rzi.ftz.s32.f32 %r45, %f37; - sub.ftz.f32 %f38, %f22, %f35; - sub.ftz.f32 %f39, %f21, %f34; - sub.ftz.f32 %f40, %f23, %f36; - mul.ftz.f32 %f41, %f38, %f38; - fma.rn.ftz.f32 %f42, %f39, %f39, %f41; - fma.rn.ftz.f32 %f43, %f40, %f40, %f42; - add.s32 %r46, %r45, %r33; - cvt.s64.s32 %rd33, %r46; - mul.wide.s32 %rd34, %r46, 16; - add.u64 %rd35, %rd34, %rd28; - ld.global.f32 %f44, [%rd35+8]; - setp.gt.ftz.f32 %p4, %f44, %f43; - @!%p4 bra $Lt_0_21506; - .loc 16 78 0 - rcp.approx.ftz.f32 %f45, %f43; - mul.ftz.f32 %f46, %f45, %f45; - mul.ftz.f32 %f47, %f45, %f46; - mul.ftz.f32 %f48, %f45, %f47; - ld.global.v2.f32 {%f49,%f50}, [%rd35+0]; - mul.ftz.f32 %f51, %f49, %f47; - sub.ftz.f32 %f52, %f51, %f50; - mul.ftz.f32 %f53, %f48, %f52; - mul.ftz.f32 %f54, %f29, %f53; - .loc 16 80 0 - fma.rn.ftz.f32 %f27, %f39, %f54, %f27; - .loc 16 81 0 - fma.rn.ftz.f32 %f26, %f38, %f54, %f26; - .loc 16 82 0 - fma.rn.ftz.f32 %f25, %f40, %f54, %f25; - ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r48, 0; - setp.le.s32 %p5, %r47, %r48; - @%p5 bra $Lt_0_20994; - .loc 16 86 0 - ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3]; - add.u64 %rd37, %rd36, %rd34; - ld.global.v4.f32 {%f55,%f56,%f57,_}, [%rd37+0]; - mul.ftz.f32 %f58, %f55, %f47; - sub.ftz.f32 %f59, %f58, %f56; - mul.ftz.f32 %f60, %f47, %f59; - sub.ftz.f32 %f61, %f60, %f57; - fma.rn.ftz.f32 %f28, %f29, %f61, %f28; -$Lt_0_20994: - ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r50, 0; - setp.le.s32 %p6, %r49, %r50; - @%p6 bra $Lt_0_21506; - .loc 16 89 0 - mov.f32 %f62, %f6; - mul.ftz.f32 %f63, %f39, %f39; - fma.rn.ftz.f32 %f64, %f54, %f63, %f62; - mov.f32 %f6, %f64; - .loc 16 90 0 - mov.f32 %f65, %f8; - fma.rn.ftz.f32 %f66, %f54, %f41, %f65; - mov.f32 %f8, %f66; - .loc 16 91 0 - mov.f32 %f67, %f10; - mul.ftz.f32 %f68, %f40, %f40; - fma.rn.ftz.f32 %f69, %f54, %f68, %f67; - mov.f32 %f10, %f69; - .loc 16 92 0 - mov.f32 %f70, %f12; - mul.ftz.f32 %f71, %f38, %f39; - fma.rn.ftz.f32 %f72, %f54, %f71, %f70; - mov.f32 %f12, %f72; - .loc 16 93 0 - mov.f32 %f73, %f14; - mul.ftz.f32 %f74, %f39, %f40; - fma.rn.ftz.f32 %f75, %f54, %f74, %f73; - mov.f32 %f14, %f75; - .loc 16 94 0 - mul.ftz.f32 %f76, %f38, %f40; - fma.rn.ftz.f32 %f15, %f54, %f76, %f15; - mov.f32 %f16, %f15; -$Lt_0_21506: -$Lt_0_20482: - .loc 16 58 0 - mul.lo.u64 %rd38, %rd27, 4; - add.u64 %rd20, %rd20, %rd38; - setp.lt.u64 %p7, %rd20, %rd19; - @%p7 bra $Lt_0_20226; - bra.uni $Lt_0_19714; -$Lt_0_27906: - mov.f32 %f25, 0f00000000; // 0 - mov.f32 %f26, 0f00000000; // 0 - mov.f32 %f27, 0f00000000; // 0 - mov.f32 %f28, 0f00000000; // 0 -$Lt_0_19714: - mov.u32 %r51, 1; - setp.le.s32 %p8, %r1, %r51; - @%p8 bra $Lt_0_24322; - .loc 16 99 0 - mov.u64 %rd39, __cuda___cuda_local_var_32600_55_non_const_red_acc108; - cvt.s64.s32 %rd40, %r2; - mul.wide.s32 %rd41, %r2, 4; - add.u64 %rd42, %rd39, %rd41; - mov.f32 %f77, %f27; - st.shared.f32 [%rd42+0], %f77; - mov.f32 %f78, %f26; - st.shared.f32 [%rd42+512], %f78; - mov.f32 %f79, %f25; - st.shared.f32 [%rd42+1024], %f79; - mov.f32 %f80, %f28; - st.shared.f32 [%rd42+1536], %f80; - shr.s32 %r52, %r1, 31; - mov.s32 %r53, 1; - and.b32 %r54, %r52, %r53; - add.s32 %r55, %r54, %r1; - shr.s32 %r56, %r55, 1; - mov.s32 %r57, %r56; - mov.u32 %r58, 0; - setp.ne.u32 %p9, %r56, %r58; - @!%p9 bra $Lt_0_22786; -$Lt_0_23298: - setp.ge.u32 %p10, %r13, %r57; - @%p10 bra $Lt_0_23554; - add.u32 %r59, %r2, %r57; - cvt.u64.u32 %rd43, %r59; - mul.wide.u32 %rd44, %r59, 4; - add.u64 %rd45, %rd39, %rd44; - ld.shared.f32 %f81, [%rd45+0]; - add.ftz.f32 %f77, %f81, %f77; - st.shared.f32 [%rd42+0], %f77; - ld.shared.f32 %f82, [%rd45+512]; - add.ftz.f32 %f78, %f82, %f78; - st.shared.f32 [%rd42+512], %f78; - ld.shared.f32 %f83, [%rd45+1024]; - add.ftz.f32 %f79, %f83, %f79; - st.shared.f32 [%rd42+1024], %f79; - ld.shared.f32 %f84, [%rd45+1536]; - add.ftz.f32 %f80, %f84, %f80; - st.shared.f32 [%rd42+1536], %f80; -$Lt_0_23554: - shr.u32 %r57, %r57, 1; - mov.u32 %r60, 0; - setp.ne.u32 %p11, %r57, %r60; - @%p11 bra $Lt_0_23298; -$Lt_0_22786: - mov.f32 %f27, %f77; - mov.f32 %f26, %f78; - mov.f32 %f25, %f79; - mov.f32 %f28, %f80; - ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r62, 0; - setp.le.s32 %p12, %r61, %r62; - @%p12 bra $Lt_0_24322; - mov.f32 %f77, %f6; - st.shared.f32 [%rd42+0], %f77; - mov.f32 %f78, %f8; - st.shared.f32 [%rd42+512], %f78; - mov.f32 %f79, %f10; - st.shared.f32 [%rd42+1024], %f79; - mov.f32 %f80, %f12; - st.shared.f32 [%rd42+1536], %f80; - mov.f32 %f85, %f14; - st.shared.f32 [%rd42+2048], %f85; - mov.f32 %f86, %f15; - st.shared.f32 [%rd42+2560], %f86; - mov.s32 %r63, %r56; - @!%p9 bra $Lt_0_24834; -$Lt_0_25346: - setp.ge.u32 %p13, %r13, %r63; - @%p13 bra $Lt_0_25602; - add.u32 %r64, %r2, %r63; - cvt.u64.u32 %rd46, %r64; - mul.wide.u32 %rd47, %r64, 4; - add.u64 %rd48, %rd39, %rd47; - ld.shared.f32 %f87, [%rd48+0]; - add.ftz.f32 %f77, %f87, %f77; - st.shared.f32 [%rd42+0], %f77; - ld.shared.f32 %f88, [%rd48+512]; - add.ftz.f32 %f78, %f88, %f78; - st.shared.f32 [%rd42+512], %f78; - ld.shared.f32 %f89, [%rd48+1024]; - add.ftz.f32 %f79, %f89, %f79; - st.shared.f32 [%rd42+1024], %f79; - ld.shared.f32 %f90, [%rd48+1536]; - add.ftz.f32 %f80, %f90, %f80; - st.shared.f32 [%rd42+1536], %f80; - ld.shared.f32 %f91, [%rd48+2048]; - add.ftz.f32 %f85, %f91, %f85; - st.shared.f32 [%rd42+2048], %f85; - ld.shared.f32 %f92, [%rd48+2560]; - add.ftz.f32 %f86, %f92, %f86; - st.shared.f32 [%rd42+2560], %f86; -$Lt_0_25602: - shr.u32 %r63, %r63, 1; - mov.u32 %r65, 0; - setp.ne.u32 %p14, %r63, %r65; - @%p14 bra $Lt_0_25346; -$Lt_0_24834: - mov.f32 %f6, %f77; - mov.f32 %f8, %f78; - mov.f32 %f10, %f79; - mov.f32 %f12, %f80; - mov.f32 %f14, %f85; - mov.f32 %f16, %f86; -$Lt_0_24322: -$Lt_0_22274: - mov.u32 %r66, 0; - setp.ne.s32 %p15, %r13, %r66; - @%p15 bra $Lt_0_26370; - ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv]; - add.u64 %rd50, %rd49, %rd5; - ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r68, 0; - setp.le.s32 %p16, %r67, %r68; - @%p16 bra $Lt_0_26882; - st.global.f32 [%rd50+0], %f28; - cvt.s64.s32 %rd51, %r9; - mul.wide.s32 %rd52, %r9, 4; - add.u64 %rd50, %rd50, %rd52; -$Lt_0_26882: - ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r70, 0; - setp.le.s32 %p17, %r69, %r70; - @%p17 bra $Lt_0_27394; - mov.f32 %f93, %f6; - st.global.f32 [%rd50+0], %f93; - cvt.s64.s32 %rd53, %r9; - mul.wide.s32 %rd54, %r9, 4; - add.u64 %rd55, %rd54, %rd50; - mov.f32 %f94, %f8; - st.global.f32 [%rd55+0], %f94; - add.u64 %rd56, %rd54, %rd55; - mov.f32 %f95, %f10; - st.global.f32 [%rd56+0], %f95; - add.u64 %rd57, %rd54, %rd56; - mov.f32 %f96, %f12; - st.global.f32 [%rd57+0], %f96; - add.u64 %rd50, %rd54, %rd57; - mov.f32 %f97, %f14; - st.global.f32 [%rd50+0], %f97; - mov.f32 %f98, %f16; - add.u64 %rd58, %rd54, %rd50; - st.global.f32 [%rd58+0], %f98; -$Lt_0_27394: - ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans]; - mul.lo.u64 %rd60, %rd4, 16; - add.u64 %rd61, %rd59, %rd60; - mov.f32 %f99, %f100; - st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f99}; -$Lt_0_26370: -$Lt_0_18690: - .loc 16 102 0 - exit; -$LDWend_kernel_pair: - } // kernel_pair - - .entry kernel_pair_fast ( - .param .u64 __cudaparm_kernel_pair_fast_x_, - .param .u64 __cudaparm_kernel_pair_fast_lj1_in, - .param .u64 __cudaparm_kernel_pair_fast_lj3_in, - .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, - .param .u64 __cudaparm_kernel_pair_fast_dev_packed, - .param .u64 __cudaparm_kernel_pair_fast_ans, - .param .u64 __cudaparm_kernel_pair_fast___val_paramengv, - .param .s32 __cudaparm_kernel_pair_fast_eflag, - .param .s32 __cudaparm_kernel_pair_fast_vflag, - .param .s32 __cudaparm_kernel_pair_fast_inum, - .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, - .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) - { - .reg .u32 %r<74>; - .reg .u64 %rd<75>; - .reg .f32 %f<109>; - .reg .pred %p<22>; - .shared .align 4 .b8 __cuda___cuda_local_var_32617_33_non_const_sp_lj3268[16]; - .shared .align 16 .b8 __cuda___cuda_local_var_32615_34_non_const_lj13296[1936]; - .shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_lj35232[1936]; - .shared .align 4 .b8 __cuda___cuda_local_var_32685_55_non_const_red_acc7168[3072]; - // __cuda_local_var_32627_10_non_const_f = 48 - // __cuda_local_var_32629_9_non_const_virial = 16 - .loc 16 110 0 -$LDWbegin_kernel_pair_fast: - cvt.s32.u32 %r1, %tid.x; - mov.u32 %r2, 3; - setp.gt.s32 %p1, %r1, %r2; - @%p1 bra $Lt_1_20994; - .loc 16 118 0 - mov.u64 %rd1, __cuda___cuda_local_var_32617_33_non_const_sp_lj3268; - cvt.s64.s32 %rd2, %r1; - mul.wide.s32 %rd3, %r1, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f32 %f1, [%rd5+0]; - add.u64 %rd6, %rd3, %rd1; - st.shared.f32 [%rd6+0], %f1; -$Lt_1_20994: - mov.u64 %rd1, __cuda___cuda_local_var_32617_33_non_const_sp_lj3268; - mov.u32 %r3, 120; - setp.gt.s32 %p2, %r1, %r3; - @%p2 bra $Lt_1_21506; - .loc 16 120 0 - mov.u64 %rd7, __cuda___cuda_local_var_32615_34_non_const_lj13296; - cvt.s64.s32 %rd8, %r1; - mul.wide.s32 %rd9, %r1, 16; - ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in]; - add.u64 %rd11, %rd10, %rd9; - add.u64 %rd12, %rd9, %rd7; - ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0]; - st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5}; - ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r5, 0; - setp.le.s32 %p3, %r4, %r5; - @%p3 bra $Lt_1_22018; - .loc 16 122 0 - mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232; - ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in]; - add.u64 %rd15, %rd14, %rd9; - add.u64 %rd16, %rd9, %rd13; - ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0]; - st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9}; -$Lt_1_22018: - mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232; -$Lt_1_21506: - mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232; - mov.u64 %rd7, __cuda___cuda_local_var_32615_34_non_const_lj13296; - .loc 16 130 0 - mov.f32 %f10, 0f00000000; // 0 - mov.f32 %f11, %f10; - mov.f32 %f12, 0f00000000; // 0 - mov.f32 %f13, %f12; - mov.f32 %f14, 0f00000000; // 0 - mov.f32 %f15, %f14; - mov.f32 %f16, 0f00000000; // 0 - mov.f32 %f17, %f16; - mov.f32 %f18, 0f00000000; // 0 - mov.f32 %f19, %f18; - mov.f32 %f20, 0f00000000; // 0 - mov.f32 %f21, %f20; - .loc 16 132 0 - bar.sync 0; - ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom]; - div.s32 %r7, %r1, %r6; - cvt.s32.u32 %r8, %ntid.x; - div.s32 %r9, %r8, %r6; - cvt.s32.u32 %r10, %ctaid.x; - mul.lo.s32 %r11, %r10, %r9; - add.s32 %r12, %r7, %r11; - ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum]; - setp.ge.s32 %p4, %r12, %r13; - @%p4 bra $Lt_1_30210; - .loc 16 137 0 - ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch]; - cvt.s64.s32 %rd17, %r14; - mul.wide.s32 %rd18, %r14, 4; - cvt.s64.s32 %rd19, %r12; - mul.wide.s32 %rd20, %r12, 4; - ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor]; - add.u64 %rd22, %rd20, %rd21; - add.u64 %rd23, %rd18, %rd22; - ld.global.s32 %r15, [%rd23+0]; - sub.s32 %r16, %r6, 1; - and.b32 %r17, %r16, %r1; - cvt.s64.s32 %rd24, %r17; - mul.wide.s32 %rd25, %r17, 4; - ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed]; - setp.ne.u64 %p5, %rd26, %rd21; - @%p5 bra $Lt_1_23298; - cvt.s32.s64 %r18, %rd17; - mul.lo.s32 %r19, %r18, %r6; - mov.s32 %r20, %r19; - mul.lo.s32 %r21, %r16, %r12; - add.s32 %r22, %r18, %r21; - cvt.s64.s32 %rd27, %r22; - mul.wide.s32 %rd28, %r22, 4; - add.u64 %rd29, %rd23, %rd28; - and.b32 %r23, %r16, %r15; - cvt.s64.s32 %rd30, %r23; - div.s32 %r24, %r15, %r6; - mul.lo.s32 %r25, %r19, %r24; - cvt.s64.s32 %rd31, %r25; - add.u64 %rd32, %rd30, %rd31; - mul.lo.u64 %rd33, %rd32, 4; - add.u64 %rd34, %rd29, %rd33; - add.u64 %rd35, %rd25, %rd29; - bra.uni $Lt_1_23042; -$Lt_1_23298: - add.u64 %rd36, %rd18, %rd23; - ld.global.s32 %r26, [%rd36+0]; - cvt.s64.s32 %rd37, %r26; - mul.wide.s32 %rd38, %r26, 4; - add.u64 %rd39, %rd26, %rd38; - cvt.s64.s32 %rd40, %r15; - mul.wide.s32 %rd41, %r15, 4; - add.u64 %rd34, %rd39, %rd41; - mov.s32 %r20, %r6; - add.u64 %rd35, %rd25, %rd39; -$Lt_1_23042: - .loc 16 140 0 - ld.global.s32 %r27, [%rd22+0]; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - mov.s32 %r31, 0; - mov.u32 %r32, %r31; - mov.s32 %r33, 0; - mov.u32 %r34, %r33; - tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}]; - mov.f32 %f26, %f22; - mov.f32 %f27, %f23; - mov.f32 %f28, %f24; - mov.f32 %f29, %f25; - setp.ge.u64 %p6, %rd35, %rd34; - @%p6 bra $Lt_1_31746; - cvt.rzi.ftz.s32.f32 %r35, %f29; - cvt.s64.s32 %rd42, %r20; - mul.lo.s32 %r36, %r35, 11; - cvt.rn.f32.s32 %f30, %r36; - mov.f32 %f31, 0f00000000; // 0 - mov.f32 %f32, 0f00000000; // 0 - mov.f32 %f33, 0f00000000; // 0 - mov.f32 %f34, 0f00000000; // 0 -$Lt_1_24066: - // Loop body line 140, nesting depth: 1, estimated iterations: unknown - .loc 16 147 0 - ld.global.s32 %r37, [%rd35+0]; - .loc 16 148 0 - shr.s32 %r38, %r37, 30; - and.b32 %r39, %r38, 3; - cvt.s64.s32 %rd43, %r39; - mul.wide.s32 %rd44, %r39, 4; - add.u64 %rd45, %rd1, %rd44; - ld.shared.f32 %f35, [%rd45+0]; - .loc 16 151 0 - and.b32 %r40, %r37, 1073741823; - mov.u32 %r41, %r40; - mov.s32 %r42, 0; - mov.u32 %r43, %r42; - mov.s32 %r44, 0; - mov.u32 %r45, %r44; - mov.s32 %r46, 0; - mov.u32 %r47, %r46; - tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}]; - mov.f32 %f40, %f36; - mov.f32 %f41, %f37; - mov.f32 %f42, %f38; - mov.f32 %f43, %f39; - sub.ftz.f32 %f44, %f27, %f41; - sub.ftz.f32 %f45, %f26, %f40; - sub.ftz.f32 %f46, %f28, %f42; - mul.ftz.f32 %f47, %f44, %f44; - fma.rn.ftz.f32 %f48, %f45, %f45, %f47; - fma.rn.ftz.f32 %f49, %f46, %f46, %f48; - add.ftz.f32 %f50, %f30, %f43; - cvt.rzi.ftz.s32.f32 %r48, %f50; - cvt.s64.s32 %rd46, %r48; - mul.wide.s32 %rd47, %r48, 16; - add.u64 %rd48, %rd47, %rd7; - ld.shared.f32 %f51, [%rd48+8]; - setp.gt.ftz.f32 %p7, %f51, %f49; - @!%p7 bra $Lt_1_25346; - .loc 16 163 0 - rcp.approx.ftz.f32 %f52, %f49; - mul.ftz.f32 %f53, %f52, %f52; - mul.ftz.f32 %f54, %f52, %f53; - mul.ftz.f32 %f55, %f52, %f35; - mul.ftz.f32 %f56, %f54, %f55; - ld.shared.v2.f32 {%f57,%f58}, [%rd48+0]; - mul.ftz.f32 %f59, %f57, %f54; - sub.ftz.f32 %f60, %f59, %f58; - mul.ftz.f32 %f61, %f56, %f60; - .loc 16 165 0 - fma.rn.ftz.f32 %f33, %f45, %f61, %f33; - .loc 16 166 0 - fma.rn.ftz.f32 %f32, %f44, %f61, %f32; - .loc 16 167 0 - fma.rn.ftz.f32 %f31, %f46, %f61, %f31; - ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r50, 0; - setp.le.s32 %p8, %r49, %r50; - @%p8 bra $Lt_1_24834; - .loc 16 170 0 - add.u64 %rd49, %rd47, %rd13; - ld.shared.v4.f32 {%f62,%f63,%f64,_}, [%rd49+0]; - mul.ftz.f32 %f65, %f62, %f54; - sub.ftz.f32 %f66, %f65, %f63; - mul.ftz.f32 %f67, %f54, %f66; - .loc 16 171 0 - sub.ftz.f32 %f68, %f67, %f64; - fma.rn.ftz.f32 %f34, %f35, %f68, %f34; -$Lt_1_24834: - ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r52, 0; - setp.le.s32 %p9, %r51, %r52; - @%p9 bra $Lt_1_25346; - .loc 16 174 0 - mov.f32 %f69, %f11; - mul.ftz.f32 %f70, %f45, %f45; - fma.rn.ftz.f32 %f71, %f61, %f70, %f69; - mov.f32 %f11, %f71; - .loc 16 175 0 - mov.f32 %f72, %f13; - fma.rn.ftz.f32 %f73, %f61, %f47, %f72; - mov.f32 %f13, %f73; - .loc 16 176 0 - mov.f32 %f74, %f15; - mul.ftz.f32 %f75, %f46, %f46; - fma.rn.ftz.f32 %f76, %f61, %f75, %f74; - mov.f32 %f15, %f76; - .loc 16 177 0 - mov.f32 %f77, %f17; - mul.ftz.f32 %f78, %f44, %f45; - fma.rn.ftz.f32 %f79, %f61, %f78, %f77; - mov.f32 %f17, %f79; - .loc 16 178 0 - mov.f32 %f80, %f19; - mul.ftz.f32 %f81, %f45, %f46; - fma.rn.ftz.f32 %f82, %f61, %f81, %f80; - mov.f32 %f19, %f82; - .loc 16 179 0 - mul.ftz.f32 %f83, %f44, %f46; - fma.rn.ftz.f32 %f20, %f61, %f83, %f20; - mov.f32 %f21, %f20; -$Lt_1_25346: -$Lt_1_24322: - .loc 16 145 0 - mul.lo.u64 %rd50, %rd42, 4; - add.u64 %rd35, %rd35, %rd50; - setp.lt.u64 %p10, %rd35, %rd34; - @%p10 bra $Lt_1_24066; - bra.uni $Lt_1_23554; -$Lt_1_31746: - mov.f32 %f31, 0f00000000; // 0 - mov.f32 %f32, 0f00000000; // 0 - mov.f32 %f33, 0f00000000; // 0 - mov.f32 %f34, 0f00000000; // 0 -$Lt_1_23554: - mov.u32 %r53, 1; - setp.le.s32 %p11, %r6, %r53; - @%p11 bra $Lt_1_28162; - .loc 16 184 0 - mov.u64 %rd51, __cuda___cuda_local_var_32685_55_non_const_red_acc7168; - cvt.s64.s32 %rd52, %r1; - mul.wide.s32 %rd53, %r1, 4; - add.u64 %rd54, %rd51, %rd53; - mov.f32 %f84, %f33; - st.shared.f32 [%rd54+0], %f84; - mov.f32 %f85, %f32; - st.shared.f32 [%rd54+512], %f85; - mov.f32 %f86, %f31; - st.shared.f32 [%rd54+1024], %f86; - mov.f32 %f87, %f34; - st.shared.f32 [%rd54+1536], %f87; - shr.s32 %r54, %r6, 31; - mov.s32 %r55, 1; - and.b32 %r56, %r54, %r55; - add.s32 %r57, %r56, %r6; - shr.s32 %r58, %r57, 1; - mov.s32 %r59, %r58; - mov.u32 %r60, 0; - setp.ne.u32 %p12, %r58, %r60; - @!%p12 bra $Lt_1_26626; -$Lt_1_27138: - setp.ge.u32 %p13, %r17, %r59; - @%p13 bra $Lt_1_27394; - add.u32 %r61, %r1, %r59; - cvt.u64.u32 %rd55, %r61; - mul.wide.u32 %rd56, %r61, 4; - add.u64 %rd57, %rd51, %rd56; - ld.shared.f32 %f88, [%rd57+0]; - add.ftz.f32 %f84, %f88, %f84; - st.shared.f32 [%rd54+0], %f84; - ld.shared.f32 %f89, [%rd57+512]; - add.ftz.f32 %f85, %f89, %f85; - st.shared.f32 [%rd54+512], %f85; - ld.shared.f32 %f90, [%rd57+1024]; - add.ftz.f32 %f86, %f90, %f86; - st.shared.f32 [%rd54+1024], %f86; - ld.shared.f32 %f91, [%rd57+1536]; - add.ftz.f32 %f87, %f91, %f87; - st.shared.f32 [%rd54+1536], %f87; -$Lt_1_27394: - shr.u32 %r59, %r59, 1; - mov.u32 %r62, 0; - setp.ne.u32 %p14, %r59, %r62; - @%p14 bra $Lt_1_27138; -$Lt_1_26626: - mov.f32 %f33, %f84; - mov.f32 %f32, %f85; - mov.f32 %f31, %f86; - mov.f32 %f34, %f87; - ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r64, 0; - setp.le.s32 %p15, %r63, %r64; - @%p15 bra $Lt_1_28162; - mov.f32 %f84, %f11; - st.shared.f32 [%rd54+0], %f84; - mov.f32 %f85, %f13; - st.shared.f32 [%rd54+512], %f85; - mov.f32 %f86, %f15; - st.shared.f32 [%rd54+1024], %f86; - mov.f32 %f87, %f17; - st.shared.f32 [%rd54+1536], %f87; - mov.f32 %f92, %f19; - st.shared.f32 [%rd54+2048], %f92; - mov.f32 %f93, %f20; - st.shared.f32 [%rd54+2560], %f93; - mov.s32 %r65, %r58; - @!%p12 bra $Lt_1_28674; -$Lt_1_29186: - setp.ge.u32 %p16, %r17, %r65; - @%p16 bra $Lt_1_29442; - add.u32 %r66, %r1, %r65; - cvt.u64.u32 %rd58, %r66; - mul.wide.u32 %rd59, %r66, 4; - add.u64 %rd60, %rd51, %rd59; - ld.shared.f32 %f94, [%rd60+0]; - add.ftz.f32 %f84, %f94, %f84; - st.shared.f32 [%rd54+0], %f84; - ld.shared.f32 %f95, [%rd60+512]; - add.ftz.f32 %f85, %f95, %f85; - st.shared.f32 [%rd54+512], %f85; - ld.shared.f32 %f96, [%rd60+1024]; - add.ftz.f32 %f86, %f96, %f86; - st.shared.f32 [%rd54+1024], %f86; - ld.shared.f32 %f97, [%rd60+1536]; - add.ftz.f32 %f87, %f97, %f87; - st.shared.f32 [%rd54+1536], %f87; - ld.shared.f32 %f98, [%rd60+2048]; - add.ftz.f32 %f92, %f98, %f92; - st.shared.f32 [%rd54+2048], %f92; - ld.shared.f32 %f99, [%rd60+2560]; - add.ftz.f32 %f93, %f99, %f93; - st.shared.f32 [%rd54+2560], %f93; -$Lt_1_29442: - shr.u32 %r65, %r65, 1; - mov.u32 %r67, 0; - setp.ne.u32 %p17, %r65, %r67; - @%p17 bra $Lt_1_29186; -$Lt_1_28674: - mov.f32 %f11, %f84; - mov.f32 %f13, %f85; - mov.f32 %f15, %f86; - mov.f32 %f17, %f87; - mov.f32 %f19, %f92; - mov.f32 %f21, %f93; -$Lt_1_28162: -$Lt_1_26114: - mov.u32 %r68, 0; - setp.ne.s32 %p18, %r17, %r68; - @%p18 bra $Lt_1_30210; - ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv]; - add.u64 %rd62, %rd61, %rd20; - ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r70, 0; - setp.le.s32 %p19, %r69, %r70; - @%p19 bra $Lt_1_30722; - st.global.f32 [%rd62+0], %f34; - cvt.s64.s32 %rd63, %r13; - mul.wide.s32 %rd64, %r13, 4; - add.u64 %rd62, %rd62, %rd64; -$Lt_1_30722: - ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r72, 0; - setp.le.s32 %p20, %r71, %r72; - @%p20 bra $Lt_1_31234; - mov.f32 %f100, %f11; - st.global.f32 [%rd62+0], %f100; - cvt.s64.s32 %rd65, %r13; - mul.wide.s32 %rd66, %r13, 4; - add.u64 %rd67, %rd66, %rd62; - mov.f32 %f101, %f13; - st.global.f32 [%rd67+0], %f101; - add.u64 %rd68, %rd66, %rd67; - mov.f32 %f102, %f15; - st.global.f32 [%rd68+0], %f102; - add.u64 %rd69, %rd66, %rd68; - mov.f32 %f103, %f17; - st.global.f32 [%rd69+0], %f103; - add.u64 %rd62, %rd66, %rd69; - mov.f32 %f104, %f19; - st.global.f32 [%rd62+0], %f104; - mov.f32 %f105, %f21; - add.u64 %rd70, %rd66, %rd62; - st.global.f32 [%rd70+0], %f105; -$Lt_1_31234: - ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans]; - mul.lo.u64 %rd72, %rd19, 16; - add.u64 %rd73, %rd71, %rd72; - mov.f32 %f106, %f107; - st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f106}; -$Lt_1_30210: -$Lt_1_22530: - .loc 16 187 0 - exit; -$LDWend_kernel_pair_fast: - } // kernel_pair_fast - diff --git a/lib/gpu/lj96.ptx b/lib/gpu/lj96.ptx deleted file mode 100644 index d9211d1b2c..0000000000 --- a/lib/gpu/lj96.ptx +++ /dev/null @@ -1,901 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009c89_00000000-9_lal_lj96.cpp3.i (/home/sjplimp/ccBI#.pOwwSL) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009c89_00000000-8_lal_lj96.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_lj96.cu" - .file 17 "/usr/local/cuda/include/common_functions.h" - .file 18 "/usr/local/cuda/include/math_functions.h" - .file 19 "/usr/local/cuda/include/math_constants.h" - .file 20 "/usr/local/cuda/include/device_functions.h" - .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - .global .texref pos_tex; - - .entry kernel_pair ( - .param .u64 __cudaparm_kernel_pair_x_, - .param .u64 __cudaparm_kernel_pair_lj1, - .param .u64 __cudaparm_kernel_pair_lj3, - .param .s32 __cudaparm_kernel_pair_lj_types, - .param .u64 __cudaparm_kernel_pair_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_dev_nbor, - .param .u64 __cudaparm_kernel_pair_dev_packed, - .param .u64 __cudaparm_kernel_pair_ans, - .param .u64 __cudaparm_kernel_pair___val_paramengv, - .param .s32 __cudaparm_kernel_pair_eflag, - .param .s32 __cudaparm_kernel_pair_vflag, - .param .s32 __cudaparm_kernel_pair_inum, - .param .s32 __cudaparm_kernel_pair_nbor_pitch, - .param .s32 __cudaparm_kernel_pair_t_per_atom) - { - .reg .u32 %r<72>; - .reg .u64 %rd<63>; - .reg .f32 %f<103>; - .reg .pred %p<19>; - .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16]; - .shared .align 4 .b8 __cuda___cuda_local_var_32601_55_non_const_red_acc108[3072]; - // __cuda_local_var_32543_10_non_const_f = 48 - // __cuda_local_var_32545_9_non_const_virial = 16 - .loc 16 31 0 -$LDWbegin_kernel_pair: - .loc 16 36 0 - ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; - ldu.global.f32 %f1, [%rd1+0]; - .loc 16 37 0 - ld.global.f32 %f2, [%rd1+4]; - .loc 16 38 0 - ld.global.f32 %f3, [%rd1+8]; - .loc 16 39 0 - ld.global.f32 %f4, [%rd1+12]; - st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4}; - .loc 16 46 0 - mov.f32 %f5, 0f00000000; // 0 - mov.f32 %f6, %f5; - mov.f32 %f7, 0f00000000; // 0 - mov.f32 %f8, %f7; - mov.f32 %f9, 0f00000000; // 0 - mov.f32 %f10, %f9; - mov.f32 %f11, 0f00000000; // 0 - mov.f32 %f12, %f11; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, %f13; - mov.f32 %f15, 0f00000000; // 0 - mov.f32 %f16, %f15; - ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_pair_inum]; - setp.ge.s32 %p1, %r8, %r9; - @%p1 bra $Lt_0_26370; - .loc 16 51 0 - ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch]; - cvt.s64.s32 %rd2, %r10; - mul.wide.s32 %rd3, %r10, 4; - cvt.s64.s32 %rd4, %r8; - mul.wide.s32 %rd5, %r8, 4; - ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor]; - add.u64 %rd7, %rd5, %rd6; - add.u64 %rd8, %rd3, %rd7; - ld.global.s32 %r11, [%rd8+0]; - sub.s32 %r12, %r1, 1; - and.b32 %r13, %r12, %r2; - cvt.s64.s32 %rd9, %r13; - mul.wide.s32 %rd10, %r13, 4; - ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed]; - setp.ne.u64 %p2, %rd11, %rd6; - @%p2 bra $Lt_0_19458; - cvt.s32.s64 %r14, %rd2; - mul.lo.s32 %r15, %r14, %r1; - mov.s32 %r16, %r15; - mul.lo.s32 %r17, %r12, %r8; - add.s32 %r18, %r14, %r17; - cvt.s64.s32 %rd12, %r18; - mul.wide.s32 %rd13, %r18, 4; - add.u64 %rd14, %rd8, %rd13; - and.b32 %r19, %r12, %r11; - cvt.s64.s32 %rd15, %r19; - div.s32 %r20, %r11, %r1; - mul.lo.s32 %r21, %r15, %r20; - cvt.s64.s32 %rd16, %r21; - add.u64 %rd17, %rd15, %rd16; - mul.lo.u64 %rd18, %rd17, 4; - add.u64 %rd19, %rd14, %rd18; - add.u64 %rd20, %rd10, %rd14; - bra.uni $Lt_0_19202; -$Lt_0_19458: - add.u64 %rd21, %rd3, %rd8; - ld.global.s32 %r22, [%rd21+0]; - cvt.s64.s32 %rd22, %r22; - mul.wide.s32 %rd23, %r22, 4; - add.u64 %rd24, %rd11, %rd23; - cvt.s64.s32 %rd25, %r11; - mul.wide.s32 %rd26, %r11, 4; - add.u64 %rd19, %rd24, %rd26; - mov.s32 %r16, %r1; - add.u64 %rd20, %rd10, %rd24; -$Lt_0_19202: - .loc 16 54 0 - ld.global.s32 %r23, [%rd7+0]; - mov.u32 %r24, %r23; - mov.s32 %r25, 0; - mov.u32 %r26, %r25; - mov.s32 %r27, 0; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}]; - mov.f32 %f21, %f17; - mov.f32 %f22, %f18; - mov.f32 %f23, %f19; - mov.f32 %f24, %f20; - setp.ge.u64 %p3, %rd20, %rd19; - @%p3 bra $Lt_0_27906; - cvt.rzi.ftz.s32.f32 %r31, %f24; - cvt.s64.s32 %rd27, %r16; - ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types]; - mul.lo.s32 %r33, %r32, %r31; - ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1]; - mov.f32 %f25, 0f00000000; // 0 - mov.f32 %f26, 0f00000000; // 0 - mov.f32 %f27, 0f00000000; // 0 - mov.f32 %f28, 0f00000000; // 0 - mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92; -$Lt_0_20226: - // Loop body line 54, nesting depth: 1, estimated iterations: unknown - .loc 16 60 0 - ld.global.s32 %r34, [%rd20+0]; - .loc 16 61 0 - shr.s32 %r35, %r34, 30; - and.b32 %r36, %r35, 3; - cvt.s64.s32 %rd30, %r36; - mul.wide.s32 %rd31, %r36, 4; - add.u64 %rd32, %rd29, %rd31; - ld.shared.f32 %f29, [%rd32+0]; - .loc 16 64 0 - and.b32 %r37, %r34, 1073741823; - mov.u32 %r38, %r37; - mov.s32 %r39, 0; - mov.u32 %r40, %r39; - mov.s32 %r41, 0; - mov.u32 %r42, %r41; - mov.s32 %r43, 0; - mov.u32 %r44, %r43; - tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}]; - mov.f32 %f34, %f30; - mov.f32 %f35, %f31; - mov.f32 %f36, %f32; - mov.f32 %f37, %f33; - cvt.rzi.ftz.s32.f32 %r45, %f37; - sub.ftz.f32 %f38, %f22, %f35; - sub.ftz.f32 %f39, %f21, %f34; - sub.ftz.f32 %f40, %f23, %f36; - mul.ftz.f32 %f41, %f38, %f38; - fma.rn.ftz.f32 %f42, %f39, %f39, %f41; - fma.rn.ftz.f32 %f43, %f40, %f40, %f42; - add.s32 %r46, %r45, %r33; - cvt.s64.s32 %rd33, %r46; - mul.wide.s32 %rd34, %r46, 16; - add.u64 %rd35, %rd34, %rd28; - ld.global.f32 %f44, [%rd35+8]; - setp.gt.ftz.f32 %p4, %f44, %f43; - @!%p4 bra $Lt_0_21506; - .loc 16 79 0 - rcp.approx.ftz.f32 %f45, %f43; - mul.ftz.f32 %f46, %f45, %f45; - mul.ftz.f32 %f47, %f45, %f46; - sqrt.approx.ftz.f32 %f48, %f47; - mul.ftz.f32 %f49, %f45, %f47; - ld.global.v2.f32 {%f50,%f51}, [%rd35+0]; - mul.ftz.f32 %f52, %f50, %f48; - sub.ftz.f32 %f53, %f52, %f51; - mul.ftz.f32 %f54, %f49, %f53; - mul.ftz.f32 %f55, %f29, %f54; - .loc 16 81 0 - fma.rn.ftz.f32 %f27, %f39, %f55, %f27; - .loc 16 82 0 - fma.rn.ftz.f32 %f26, %f38, %f55, %f26; - .loc 16 83 0 - fma.rn.ftz.f32 %f25, %f40, %f55, %f25; - ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r48, 0; - setp.le.s32 %p5, %r47, %r48; - @%p5 bra $Lt_0_20994; - .loc 16 87 0 - ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3]; - add.u64 %rd37, %rd36, %rd34; - ld.global.v4.f32 {%f56,%f57,%f58,_}, [%rd37+0]; - mul.ftz.f32 %f59, %f56, %f48; - sub.ftz.f32 %f60, %f59, %f57; - mul.ftz.f32 %f61, %f47, %f60; - sub.ftz.f32 %f62, %f61, %f58; - fma.rn.ftz.f32 %f28, %f29, %f62, %f28; -$Lt_0_20994: - ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r50, 0; - setp.le.s32 %p6, %r49, %r50; - @%p6 bra $Lt_0_21506; - .loc 16 90 0 - mov.f32 %f63, %f6; - mul.ftz.f32 %f64, %f39, %f39; - fma.rn.ftz.f32 %f65, %f55, %f64, %f63; - mov.f32 %f6, %f65; - .loc 16 91 0 - mov.f32 %f66, %f8; - fma.rn.ftz.f32 %f67, %f55, %f41, %f66; - mov.f32 %f8, %f67; - .loc 16 92 0 - mov.f32 %f68, %f10; - mul.ftz.f32 %f69, %f40, %f40; - fma.rn.ftz.f32 %f70, %f55, %f69, %f68; - mov.f32 %f10, %f70; - .loc 16 93 0 - mov.f32 %f71, %f12; - mul.ftz.f32 %f72, %f38, %f39; - fma.rn.ftz.f32 %f73, %f55, %f72, %f71; - mov.f32 %f12, %f73; - .loc 16 94 0 - mov.f32 %f74, %f14; - mul.ftz.f32 %f75, %f39, %f40; - fma.rn.ftz.f32 %f76, %f55, %f75, %f74; - mov.f32 %f14, %f76; - .loc 16 95 0 - mul.ftz.f32 %f77, %f38, %f40; - fma.rn.ftz.f32 %f15, %f55, %f77, %f15; - mov.f32 %f16, %f15; -$Lt_0_21506: -$Lt_0_20482: - .loc 16 58 0 - mul.lo.u64 %rd38, %rd27, 4; - add.u64 %rd20, %rd20, %rd38; - setp.lt.u64 %p7, %rd20, %rd19; - @%p7 bra $Lt_0_20226; - bra.uni $Lt_0_19714; -$Lt_0_27906: - mov.f32 %f25, 0f00000000; // 0 - mov.f32 %f26, 0f00000000; // 0 - mov.f32 %f27, 0f00000000; // 0 - mov.f32 %f28, 0f00000000; // 0 -$Lt_0_19714: - mov.u32 %r51, 1; - setp.le.s32 %p8, %r1, %r51; - @%p8 bra $Lt_0_24322; - .loc 16 100 0 - mov.u64 %rd39, __cuda___cuda_local_var_32601_55_non_const_red_acc108; - cvt.s64.s32 %rd40, %r2; - mul.wide.s32 %rd41, %r2, 4; - add.u64 %rd42, %rd39, %rd41; - mov.f32 %f78, %f27; - st.shared.f32 [%rd42+0], %f78; - mov.f32 %f79, %f26; - st.shared.f32 [%rd42+512], %f79; - mov.f32 %f80, %f25; - st.shared.f32 [%rd42+1024], %f80; - mov.f32 %f81, %f28; - st.shared.f32 [%rd42+1536], %f81; - shr.s32 %r52, %r1, 31; - mov.s32 %r53, 1; - and.b32 %r54, %r52, %r53; - add.s32 %r55, %r54, %r1; - shr.s32 %r56, %r55, 1; - mov.s32 %r57, %r56; - mov.u32 %r58, 0; - setp.ne.u32 %p9, %r56, %r58; - @!%p9 bra $Lt_0_22786; -$Lt_0_23298: - setp.ge.u32 %p10, %r13, %r57; - @%p10 bra $Lt_0_23554; - add.u32 %r59, %r2, %r57; - cvt.u64.u32 %rd43, %r59; - mul.wide.u32 %rd44, %r59, 4; - add.u64 %rd45, %rd39, %rd44; - ld.shared.f32 %f82, [%rd45+0]; - add.ftz.f32 %f78, %f82, %f78; - st.shared.f32 [%rd42+0], %f78; - ld.shared.f32 %f83, [%rd45+512]; - add.ftz.f32 %f79, %f83, %f79; - st.shared.f32 [%rd42+512], %f79; - ld.shared.f32 %f84, [%rd45+1024]; - add.ftz.f32 %f80, %f84, %f80; - st.shared.f32 [%rd42+1024], %f80; - ld.shared.f32 %f85, [%rd45+1536]; - add.ftz.f32 %f81, %f85, %f81; - st.shared.f32 [%rd42+1536], %f81; -$Lt_0_23554: - shr.u32 %r57, %r57, 1; - mov.u32 %r60, 0; - setp.ne.u32 %p11, %r57, %r60; - @%p11 bra $Lt_0_23298; -$Lt_0_22786: - mov.f32 %f27, %f78; - mov.f32 %f26, %f79; - mov.f32 %f25, %f80; - mov.f32 %f28, %f81; - ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r62, 0; - setp.le.s32 %p12, %r61, %r62; - @%p12 bra $Lt_0_24322; - mov.f32 %f78, %f6; - st.shared.f32 [%rd42+0], %f78; - mov.f32 %f79, %f8; - st.shared.f32 [%rd42+512], %f79; - mov.f32 %f80, %f10; - st.shared.f32 [%rd42+1024], %f80; - mov.f32 %f81, %f12; - st.shared.f32 [%rd42+1536], %f81; - mov.f32 %f86, %f14; - st.shared.f32 [%rd42+2048], %f86; - mov.f32 %f87, %f15; - st.shared.f32 [%rd42+2560], %f87; - mov.s32 %r63, %r56; - @!%p9 bra $Lt_0_24834; -$Lt_0_25346: - setp.ge.u32 %p13, %r13, %r63; - @%p13 bra $Lt_0_25602; - add.u32 %r64, %r2, %r63; - cvt.u64.u32 %rd46, %r64; - mul.wide.u32 %rd47, %r64, 4; - add.u64 %rd48, %rd39, %rd47; - ld.shared.f32 %f88, [%rd48+0]; - add.ftz.f32 %f78, %f88, %f78; - st.shared.f32 [%rd42+0], %f78; - ld.shared.f32 %f89, [%rd48+512]; - add.ftz.f32 %f79, %f89, %f79; - st.shared.f32 [%rd42+512], %f79; - ld.shared.f32 %f90, [%rd48+1024]; - add.ftz.f32 %f80, %f90, %f80; - st.shared.f32 [%rd42+1024], %f80; - ld.shared.f32 %f91, [%rd48+1536]; - add.ftz.f32 %f81, %f91, %f81; - st.shared.f32 [%rd42+1536], %f81; - ld.shared.f32 %f92, [%rd48+2048]; - add.ftz.f32 %f86, %f92, %f86; - st.shared.f32 [%rd42+2048], %f86; - ld.shared.f32 %f93, [%rd48+2560]; - add.ftz.f32 %f87, %f93, %f87; - st.shared.f32 [%rd42+2560], %f87; -$Lt_0_25602: - shr.u32 %r63, %r63, 1; - mov.u32 %r65, 0; - setp.ne.u32 %p14, %r63, %r65; - @%p14 bra $Lt_0_25346; -$Lt_0_24834: - mov.f32 %f6, %f78; - mov.f32 %f8, %f79; - mov.f32 %f10, %f80; - mov.f32 %f12, %f81; - mov.f32 %f14, %f86; - mov.f32 %f16, %f87; -$Lt_0_24322: -$Lt_0_22274: - mov.u32 %r66, 0; - setp.ne.s32 %p15, %r13, %r66; - @%p15 bra $Lt_0_26370; - ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv]; - add.u64 %rd50, %rd49, %rd5; - ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r68, 0; - setp.le.s32 %p16, %r67, %r68; - @%p16 bra $Lt_0_26882; - st.global.f32 [%rd50+0], %f28; - cvt.s64.s32 %rd51, %r9; - mul.wide.s32 %rd52, %r9, 4; - add.u64 %rd50, %rd50, %rd52; -$Lt_0_26882: - ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r70, 0; - setp.le.s32 %p17, %r69, %r70; - @%p17 bra $Lt_0_27394; - mov.f32 %f94, %f6; - st.global.f32 [%rd50+0], %f94; - cvt.s64.s32 %rd53, %r9; - mul.wide.s32 %rd54, %r9, 4; - add.u64 %rd55, %rd54, %rd50; - mov.f32 %f95, %f8; - st.global.f32 [%rd55+0], %f95; - add.u64 %rd56, %rd54, %rd55; - mov.f32 %f96, %f10; - st.global.f32 [%rd56+0], %f96; - add.u64 %rd57, %rd54, %rd56; - mov.f32 %f97, %f12; - st.global.f32 [%rd57+0], %f97; - add.u64 %rd50, %rd54, %rd57; - mov.f32 %f98, %f14; - st.global.f32 [%rd50+0], %f98; - mov.f32 %f99, %f16; - add.u64 %rd58, %rd54, %rd50; - st.global.f32 [%rd58+0], %f99; -$Lt_0_27394: - ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans]; - mul.lo.u64 %rd60, %rd4, 16; - add.u64 %rd61, %rd59, %rd60; - mov.f32 %f100, %f101; - st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f100}; -$Lt_0_26370: -$Lt_0_18690: - .loc 16 103 0 - exit; -$LDWend_kernel_pair: - } // kernel_pair - - .entry kernel_pair_fast ( - .param .u64 __cudaparm_kernel_pair_fast_x_, - .param .u64 __cudaparm_kernel_pair_fast_lj1_in, - .param .u64 __cudaparm_kernel_pair_fast_lj3_in, - .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, - .param .u64 __cudaparm_kernel_pair_fast_dev_packed, - .param .u64 __cudaparm_kernel_pair_fast_ans, - .param .u64 __cudaparm_kernel_pair_fast___val_paramengv, - .param .s32 __cudaparm_kernel_pair_fast_eflag, - .param .s32 __cudaparm_kernel_pair_fast_vflag, - .param .s32 __cudaparm_kernel_pair_fast_inum, - .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, - .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) - { - .reg .u32 %r<74>; - .reg .u64 %rd<75>; - .reg .f32 %f<109>; - .reg .pred %p<22>; - .shared .align 4 .b8 __cuda___cuda_local_var_32618_33_non_const_sp_lj3268[16]; - .shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_lj13296[1936]; - .shared .align 16 .b8 __cuda___cuda_local_var_32617_34_non_const_lj35232[1936]; - .shared .align 4 .b8 __cuda___cuda_local_var_32687_55_non_const_red_acc7168[3072]; - // __cuda_local_var_32628_10_non_const_f = 48 - // __cuda_local_var_32630_9_non_const_virial = 16 - .loc 16 111 0 -$LDWbegin_kernel_pair_fast: - cvt.s32.u32 %r1, %tid.x; - mov.u32 %r2, 3; - setp.gt.s32 %p1, %r1, %r2; - @%p1 bra $Lt_1_20994; - .loc 16 119 0 - mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268; - cvt.s64.s32 %rd2, %r1; - mul.wide.s32 %rd3, %r1, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f32 %f1, [%rd5+0]; - add.u64 %rd6, %rd3, %rd1; - st.shared.f32 [%rd6+0], %f1; -$Lt_1_20994: - mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268; - mov.u32 %r3, 120; - setp.gt.s32 %p2, %r1, %r3; - @%p2 bra $Lt_1_21506; - .loc 16 121 0 - mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_lj13296; - cvt.s64.s32 %rd8, %r1; - mul.wide.s32 %rd9, %r1, 16; - ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in]; - add.u64 %rd11, %rd10, %rd9; - add.u64 %rd12, %rd9, %rd7; - ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0]; - st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5}; - ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r5, 0; - setp.le.s32 %p3, %r4, %r5; - @%p3 bra $Lt_1_22018; - .loc 16 123 0 - mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232; - ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in]; - add.u64 %rd15, %rd14, %rd9; - add.u64 %rd16, %rd9, %rd13; - ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0]; - st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9}; -$Lt_1_22018: - mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232; -$Lt_1_21506: - mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232; - mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_lj13296; - .loc 16 131 0 - mov.f32 %f10, 0f00000000; // 0 - mov.f32 %f11, %f10; - mov.f32 %f12, 0f00000000; // 0 - mov.f32 %f13, %f12; - mov.f32 %f14, 0f00000000; // 0 - mov.f32 %f15, %f14; - mov.f32 %f16, 0f00000000; // 0 - mov.f32 %f17, %f16; - mov.f32 %f18, 0f00000000; // 0 - mov.f32 %f19, %f18; - mov.f32 %f20, 0f00000000; // 0 - mov.f32 %f21, %f20; - .loc 16 133 0 - bar.sync 0; - ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom]; - div.s32 %r7, %r1, %r6; - cvt.s32.u32 %r8, %ntid.x; - div.s32 %r9, %r8, %r6; - cvt.s32.u32 %r10, %ctaid.x; - mul.lo.s32 %r11, %r10, %r9; - add.s32 %r12, %r7, %r11; - ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum]; - setp.ge.s32 %p4, %r12, %r13; - @%p4 bra $Lt_1_30210; - .loc 16 138 0 - ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch]; - cvt.s64.s32 %rd17, %r14; - mul.wide.s32 %rd18, %r14, 4; - cvt.s64.s32 %rd19, %r12; - mul.wide.s32 %rd20, %r12, 4; - ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor]; - add.u64 %rd22, %rd20, %rd21; - add.u64 %rd23, %rd18, %rd22; - ld.global.s32 %r15, [%rd23+0]; - sub.s32 %r16, %r6, 1; - and.b32 %r17, %r16, %r1; - cvt.s64.s32 %rd24, %r17; - mul.wide.s32 %rd25, %r17, 4; - ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed]; - setp.ne.u64 %p5, %rd26, %rd21; - @%p5 bra $Lt_1_23298; - cvt.s32.s64 %r18, %rd17; - mul.lo.s32 %r19, %r18, %r6; - mov.s32 %r20, %r19; - mul.lo.s32 %r21, %r16, %r12; - add.s32 %r22, %r18, %r21; - cvt.s64.s32 %rd27, %r22; - mul.wide.s32 %rd28, %r22, 4; - add.u64 %rd29, %rd23, %rd28; - and.b32 %r23, %r16, %r15; - cvt.s64.s32 %rd30, %r23; - div.s32 %r24, %r15, %r6; - mul.lo.s32 %r25, %r19, %r24; - cvt.s64.s32 %rd31, %r25; - add.u64 %rd32, %rd30, %rd31; - mul.lo.u64 %rd33, %rd32, 4; - add.u64 %rd34, %rd29, %rd33; - add.u64 %rd35, %rd25, %rd29; - bra.uni $Lt_1_23042; -$Lt_1_23298: - add.u64 %rd36, %rd18, %rd23; - ld.global.s32 %r26, [%rd36+0]; - cvt.s64.s32 %rd37, %r26; - mul.wide.s32 %rd38, %r26, 4; - add.u64 %rd39, %rd26, %rd38; - cvt.s64.s32 %rd40, %r15; - mul.wide.s32 %rd41, %r15, 4; - add.u64 %rd34, %rd39, %rd41; - mov.s32 %r20, %r6; - add.u64 %rd35, %rd25, %rd39; -$Lt_1_23042: - .loc 16 141 0 - ld.global.s32 %r27, [%rd22+0]; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - mov.s32 %r31, 0; - mov.u32 %r32, %r31; - mov.s32 %r33, 0; - mov.u32 %r34, %r33; - tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}]; - mov.f32 %f26, %f22; - mov.f32 %f27, %f23; - mov.f32 %f28, %f24; - mov.f32 %f29, %f25; - setp.ge.u64 %p6, %rd35, %rd34; - @%p6 bra $Lt_1_31746; - cvt.rzi.ftz.s32.f32 %r35, %f29; - cvt.s64.s32 %rd42, %r20; - mul.lo.s32 %r36, %r35, 11; - cvt.rn.f32.s32 %f30, %r36; - mov.f32 %f31, 0f00000000; // 0 - mov.f32 %f32, 0f00000000; // 0 - mov.f32 %f33, 0f00000000; // 0 - mov.f32 %f34, 0f00000000; // 0 -$Lt_1_24066: - // Loop body line 141, nesting depth: 1, estimated iterations: unknown - .loc 16 148 0 - ld.global.s32 %r37, [%rd35+0]; - .loc 16 152 0 - and.b32 %r38, %r37, 1073741823; - mov.u32 %r39, %r38; - mov.s32 %r40, 0; - mov.u32 %r41, %r40; - mov.s32 %r42, 0; - mov.u32 %r43, %r42; - mov.s32 %r44, 0; - mov.u32 %r45, %r44; - tex.1d.v4.f32.s32 {%f35,%f36,%f37,%f38},[pos_tex,{%r39,%r41,%r43,%r45}]; - mov.f32 %f39, %f35; - mov.f32 %f40, %f36; - mov.f32 %f41, %f37; - mov.f32 %f42, %f38; - sub.ftz.f32 %f43, %f27, %f40; - sub.ftz.f32 %f44, %f26, %f39; - sub.ftz.f32 %f45, %f28, %f41; - mul.ftz.f32 %f46, %f43, %f43; - fma.rn.ftz.f32 %f47, %f44, %f44, %f46; - fma.rn.ftz.f32 %f48, %f45, %f45, %f47; - add.ftz.f32 %f49, %f30, %f42; - cvt.rzi.ftz.s32.f32 %r46, %f49; - cvt.s64.s32 %rd43, %r46; - mul.wide.s32 %rd44, %r46, 16; - add.u64 %rd45, %rd44, %rd7; - ld.shared.f32 %f50, [%rd45+8]; - setp.gt.ftz.f32 %p7, %f50, %f48; - @!%p7 bra $Lt_1_25346; - .loc 16 165 0 - rcp.approx.ftz.f32 %f51, %f48; - mul.ftz.f32 %f52, %f51, %f51; - mul.ftz.f32 %f53, %f51, %f52; - sqrt.approx.ftz.f32 %f54, %f53; - mul.ftz.f32 %f55, %f51, %f53; - ld.shared.v2.f32 {%f56,%f57}, [%rd45+0]; - mul.ftz.f32 %f58, %f56, %f54; - sub.ftz.f32 %f59, %f58, %f57; - mul.ftz.f32 %f60, %f55, %f59; - .loc 16 167 0 - fma.rn.ftz.f32 %f33, %f44, %f60, %f33; - .loc 16 168 0 - fma.rn.ftz.f32 %f32, %f43, %f60, %f32; - .loc 16 169 0 - fma.rn.ftz.f32 %f31, %f45, %f60, %f31; - ld.param.s32 %r47, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r48, 0; - setp.le.s32 %p8, %r47, %r48; - @%p8 bra $Lt_1_24834; - .loc 16 172 0 - add.u64 %rd46, %rd44, %rd13; - ld.shared.v4.f32 {%f61,%f62,%f63,_}, [%rd46+0]; - mul.ftz.f32 %f64, %f61, %f54; - sub.ftz.f32 %f65, %f64, %f62; - mul.ftz.f32 %f66, %f53, %f65; - .loc 16 173 0 - shr.s32 %r49, %r37, 30; - and.b32 %r50, %r49, 3; - cvt.s64.s32 %rd47, %r50; - mul.wide.s32 %rd48, %r50, 4; - add.u64 %rd49, %rd1, %rd48; - ld.shared.f32 %f67, [%rd49+0]; - sub.ftz.f32 %f68, %f66, %f63; - fma.rn.ftz.f32 %f34, %f67, %f68, %f34; -$Lt_1_24834: - ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r52, 0; - setp.le.s32 %p9, %r51, %r52; - @%p9 bra $Lt_1_25346; - .loc 16 176 0 - mov.f32 %f69, %f11; - mul.ftz.f32 %f70, %f44, %f44; - fma.rn.ftz.f32 %f71, %f60, %f70, %f69; - mov.f32 %f11, %f71; - .loc 16 177 0 - mov.f32 %f72, %f13; - fma.rn.ftz.f32 %f73, %f60, %f46, %f72; - mov.f32 %f13, %f73; - .loc 16 178 0 - mov.f32 %f74, %f15; - mul.ftz.f32 %f75, %f45, %f45; - fma.rn.ftz.f32 %f76, %f60, %f75, %f74; - mov.f32 %f15, %f76; - .loc 16 179 0 - mov.f32 %f77, %f17; - mul.ftz.f32 %f78, %f43, %f44; - fma.rn.ftz.f32 %f79, %f60, %f78, %f77; - mov.f32 %f17, %f79; - .loc 16 180 0 - mov.f32 %f80, %f19; - mul.ftz.f32 %f81, %f44, %f45; - fma.rn.ftz.f32 %f82, %f60, %f81, %f80; - mov.f32 %f19, %f82; - .loc 16 181 0 - mul.ftz.f32 %f83, %f43, %f45; - fma.rn.ftz.f32 %f20, %f60, %f83, %f20; - mov.f32 %f21, %f20; -$Lt_1_25346: -$Lt_1_24322: - .loc 16 146 0 - mul.lo.u64 %rd50, %rd42, 4; - add.u64 %rd35, %rd35, %rd50; - setp.lt.u64 %p10, %rd35, %rd34; - @%p10 bra $Lt_1_24066; - bra.uni $Lt_1_23554; -$Lt_1_31746: - mov.f32 %f31, 0f00000000; // 0 - mov.f32 %f32, 0f00000000; // 0 - mov.f32 %f33, 0f00000000; // 0 - mov.f32 %f34, 0f00000000; // 0 -$Lt_1_23554: - mov.u32 %r53, 1; - setp.le.s32 %p11, %r6, %r53; - @%p11 bra $Lt_1_28162; - .loc 16 186 0 - mov.u64 %rd51, __cuda___cuda_local_var_32687_55_non_const_red_acc7168; - cvt.s64.s32 %rd52, %r1; - mul.wide.s32 %rd53, %r1, 4; - add.u64 %rd54, %rd51, %rd53; - mov.f32 %f84, %f33; - st.shared.f32 [%rd54+0], %f84; - mov.f32 %f85, %f32; - st.shared.f32 [%rd54+512], %f85; - mov.f32 %f86, %f31; - st.shared.f32 [%rd54+1024], %f86; - mov.f32 %f87, %f34; - st.shared.f32 [%rd54+1536], %f87; - shr.s32 %r54, %r6, 31; - mov.s32 %r55, 1; - and.b32 %r56, %r54, %r55; - add.s32 %r57, %r56, %r6; - shr.s32 %r58, %r57, 1; - mov.s32 %r59, %r58; - mov.u32 %r60, 0; - setp.ne.u32 %p12, %r58, %r60; - @!%p12 bra $Lt_1_26626; -$Lt_1_27138: - setp.ge.u32 %p13, %r17, %r59; - @%p13 bra $Lt_1_27394; - add.u32 %r61, %r1, %r59; - cvt.u64.u32 %rd55, %r61; - mul.wide.u32 %rd56, %r61, 4; - add.u64 %rd57, %rd51, %rd56; - ld.shared.f32 %f88, [%rd57+0]; - add.ftz.f32 %f84, %f88, %f84; - st.shared.f32 [%rd54+0], %f84; - ld.shared.f32 %f89, [%rd57+512]; - add.ftz.f32 %f85, %f89, %f85; - st.shared.f32 [%rd54+512], %f85; - ld.shared.f32 %f90, [%rd57+1024]; - add.ftz.f32 %f86, %f90, %f86; - st.shared.f32 [%rd54+1024], %f86; - ld.shared.f32 %f91, [%rd57+1536]; - add.ftz.f32 %f87, %f91, %f87; - st.shared.f32 [%rd54+1536], %f87; -$Lt_1_27394: - shr.u32 %r59, %r59, 1; - mov.u32 %r62, 0; - setp.ne.u32 %p14, %r59, %r62; - @%p14 bra $Lt_1_27138; -$Lt_1_26626: - mov.f32 %f33, %f84; - mov.f32 %f32, %f85; - mov.f32 %f31, %f86; - mov.f32 %f34, %f87; - ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r64, 0; - setp.le.s32 %p15, %r63, %r64; - @%p15 bra $Lt_1_28162; - mov.f32 %f84, %f11; - st.shared.f32 [%rd54+0], %f84; - mov.f32 %f85, %f13; - st.shared.f32 [%rd54+512], %f85; - mov.f32 %f86, %f15; - st.shared.f32 [%rd54+1024], %f86; - mov.f32 %f87, %f17; - st.shared.f32 [%rd54+1536], %f87; - mov.f32 %f92, %f19; - st.shared.f32 [%rd54+2048], %f92; - mov.f32 %f93, %f20; - st.shared.f32 [%rd54+2560], %f93; - mov.s32 %r65, %r58; - @!%p12 bra $Lt_1_28674; -$Lt_1_29186: - setp.ge.u32 %p16, %r17, %r65; - @%p16 bra $Lt_1_29442; - add.u32 %r66, %r1, %r65; - cvt.u64.u32 %rd58, %r66; - mul.wide.u32 %rd59, %r66, 4; - add.u64 %rd60, %rd51, %rd59; - ld.shared.f32 %f94, [%rd60+0]; - add.ftz.f32 %f84, %f94, %f84; - st.shared.f32 [%rd54+0], %f84; - ld.shared.f32 %f95, [%rd60+512]; - add.ftz.f32 %f85, %f95, %f85; - st.shared.f32 [%rd54+512], %f85; - ld.shared.f32 %f96, [%rd60+1024]; - add.ftz.f32 %f86, %f96, %f86; - st.shared.f32 [%rd54+1024], %f86; - ld.shared.f32 %f97, [%rd60+1536]; - add.ftz.f32 %f87, %f97, %f87; - st.shared.f32 [%rd54+1536], %f87; - ld.shared.f32 %f98, [%rd60+2048]; - add.ftz.f32 %f92, %f98, %f92; - st.shared.f32 [%rd54+2048], %f92; - ld.shared.f32 %f99, [%rd60+2560]; - add.ftz.f32 %f93, %f99, %f93; - st.shared.f32 [%rd54+2560], %f93; -$Lt_1_29442: - shr.u32 %r65, %r65, 1; - mov.u32 %r67, 0; - setp.ne.u32 %p17, %r65, %r67; - @%p17 bra $Lt_1_29186; -$Lt_1_28674: - mov.f32 %f11, %f84; - mov.f32 %f13, %f85; - mov.f32 %f15, %f86; - mov.f32 %f17, %f87; - mov.f32 %f19, %f92; - mov.f32 %f21, %f93; -$Lt_1_28162: -$Lt_1_26114: - mov.u32 %r68, 0; - setp.ne.s32 %p18, %r17, %r68; - @%p18 bra $Lt_1_30210; - ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv]; - add.u64 %rd62, %rd61, %rd20; - ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r70, 0; - setp.le.s32 %p19, %r69, %r70; - @%p19 bra $Lt_1_30722; - st.global.f32 [%rd62+0], %f34; - cvt.s64.s32 %rd63, %r13; - mul.wide.s32 %rd64, %r13, 4; - add.u64 %rd62, %rd62, %rd64; -$Lt_1_30722: - ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r72, 0; - setp.le.s32 %p20, %r71, %r72; - @%p20 bra $Lt_1_31234; - mov.f32 %f100, %f11; - st.global.f32 [%rd62+0], %f100; - cvt.s64.s32 %rd65, %r13; - mul.wide.s32 %rd66, %r13, 4; - add.u64 %rd67, %rd66, %rd62; - mov.f32 %f101, %f13; - st.global.f32 [%rd67+0], %f101; - add.u64 %rd68, %rd66, %rd67; - mov.f32 %f102, %f15; - st.global.f32 [%rd68+0], %f102; - add.u64 %rd69, %rd66, %rd68; - mov.f32 %f103, %f17; - st.global.f32 [%rd69+0], %f103; - add.u64 %rd62, %rd66, %rd69; - mov.f32 %f104, %f19; - st.global.f32 [%rd62+0], %f104; - mov.f32 %f105, %f21; - add.u64 %rd70, %rd66, %rd62; - st.global.f32 [%rd70+0], %f105; -$Lt_1_31234: - ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans]; - mul.lo.u64 %rd72, %rd19, 16; - add.u64 %rd73, %rd71, %rd72; - mov.f32 %f106, %f107; - st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f106}; -$Lt_1_30210: -$Lt_1_22530: - .loc 16 189 0 - exit; -$LDWend_kernel_pair_fast: - } // kernel_pair_fast - diff --git a/lib/gpu/lj96_ptx.h b/lib/gpu/lj96_ptx.h deleted file mode 100644 index 7a942605dc..0000000000 --- a/lib/gpu/lj96_ptx.h +++ /dev/null @@ -1,849 +0,0 @@ -const char * lj96 = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .global .texref pos_tex;\n" -" .entry kernel_pair (\n" -" .param .u64 __cudaparm_kernel_pair_x_,\n" -" .param .u64 __cudaparm_kernel_pair_lj1,\n" -" .param .u64 __cudaparm_kernel_pair_lj3,\n" -" .param .s32 __cudaparm_kernel_pair_lj_types,\n" -" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_ans,\n" -" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_inum,\n" -" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" -" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" -" {\n" -" .reg .u32 %r<72>;\n" -" .reg .u64 %rd<63>;\n" -" .reg .f32 %f<103>;\n" -" .reg .pred %p<19>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32601_55_non_const_red_acc108[3072];\n" -" .loc 16 31 0\n" -"$LDWbegin_kernel_pair:\n" -" .loc 16 36 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" -" ldu.global.f32 %f1, [%rd1+0];\n" -" .loc 16 37 0\n" -" ld.global.f32 %f2, [%rd1+4];\n" -" .loc 16 38 0\n" -" ld.global.f32 %f3, [%rd1+8];\n" -" .loc 16 39 0\n" -" ld.global.f32 %f4, [%rd1+12];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n" -" .loc 16 46 0\n" -" mov.f32 %f5, 0f00000000; \n" -" mov.f32 %f6, %f5;\n" -" mov.f32 %f7, 0f00000000; \n" -" mov.f32 %f8, %f7;\n" -" mov.f32 %f9, 0f00000000; \n" -" mov.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f00000000; \n" -" mov.f32 %f12, %f11;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, %f13;\n" -" mov.f32 %f15, 0f00000000; \n" -" mov.f32 %f16, %f15;\n" -" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n" -" setp.ge.s32 %p1, %r8, %r9;\n" -" @%p1 bra $Lt_0_26370;\n" -" .loc 16 51 0\n" -" ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];\n" -" cvt.s64.s32 %rd2, %r10;\n" -" mul.wide.s32 %rd3, %r10, 4;\n" -" cvt.s64.s32 %rd4, %r8;\n" -" mul.wide.s32 %rd5, %r8, 4;\n" -" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n" -" add.u64 %rd7, %rd5, %rd6;\n" -" add.u64 %rd8, %rd3, %rd7;\n" -" ld.global.s32 %r11, [%rd8+0];\n" -" sub.s32 %r12, %r1, 1;\n" -" and.b32 %r13, %r12, %r2;\n" -" cvt.s64.s32 %rd9, %r13;\n" -" mul.wide.s32 %rd10, %r13, 4;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n" -" setp.ne.u64 %p2, %rd11, %rd6;\n" -" @%p2 bra $Lt_0_19458;\n" -" cvt.s32.s64 %r14, %rd2;\n" -" mul.lo.s32 %r15, %r14, %r1;\n" -" mov.s32 %r16, %r15;\n" -" mul.lo.s32 %r17, %r12, %r8;\n" -" add.s32 %r18, %r14, %r17;\n" -" cvt.s64.s32 %rd12, %r18;\n" -" mul.wide.s32 %rd13, %r18, 4;\n" -" add.u64 %rd14, %rd8, %rd13;\n" -" and.b32 %r19, %r12, %r11;\n" -" cvt.s64.s32 %rd15, %r19;\n" -" div.s32 %r20, %r11, %r1;\n" -" mul.lo.s32 %r21, %r15, %r20;\n" -" cvt.s64.s32 %rd16, %r21;\n" -" add.u64 %rd17, %rd15, %rd16;\n" -" mul.lo.u64 %rd18, %rd17, 4;\n" -" add.u64 %rd19, %rd14, %rd18;\n" -" add.u64 %rd20, %rd10, %rd14;\n" -" bra.uni $Lt_0_19202;\n" -"$Lt_0_19458:\n" -" add.u64 %rd21, %rd3, %rd8;\n" -" ld.global.s32 %r22, [%rd21+0];\n" -" cvt.s64.s32 %rd22, %r22;\n" -" mul.wide.s32 %rd23, %r22, 4;\n" -" add.u64 %rd24, %rd11, %rd23;\n" -" cvt.s64.s32 %rd25, %r11;\n" -" mul.wide.s32 %rd26, %r11, 4;\n" -" add.u64 %rd19, %rd24, %rd26;\n" -" mov.s32 %r16, %r1;\n" -" add.u64 %rd20, %rd10, %rd24;\n" -"$Lt_0_19202:\n" -" .loc 16 54 0\n" -" ld.global.s32 %r23, [%rd7+0];\n" -" mov.u32 %r24, %r23;\n" -" mov.s32 %r25, 0;\n" -" mov.u32 %r26, %r25;\n" -" mov.s32 %r27, 0;\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n" -" mov.f32 %f21, %f17;\n" -" mov.f32 %f22, %f18;\n" -" mov.f32 %f23, %f19;\n" -" mov.f32 %f24, %f20;\n" -" setp.ge.u64 %p3, %rd20, %rd19;\n" -" @%p3 bra $Lt_0_27906;\n" -" cvt.rzi.ftz.s32.f32 %r31, %f24;\n" -" cvt.s64.s32 %rd27, %r16;\n" -" ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];\n" -" mul.lo.s32 %r33, %r32, %r31;\n" -" ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];\n" -" mov.f32 %f25, 0f00000000; \n" -" mov.f32 %f26, 0f00000000; \n" -" mov.f32 %f27, 0f00000000; \n" -" mov.f32 %f28, 0f00000000; \n" -" mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;\n" -"$Lt_0_20226:\n" -" .loc 16 60 0\n" -" ld.global.s32 %r34, [%rd20+0];\n" -" .loc 16 61 0\n" -" shr.s32 %r35, %r34, 30;\n" -" and.b32 %r36, %r35, 3;\n" -" cvt.s64.s32 %rd30, %r36;\n" -" mul.wide.s32 %rd31, %r36, 4;\n" -" add.u64 %rd32, %rd29, %rd31;\n" -" ld.shared.f32 %f29, [%rd32+0];\n" -" .loc 16 64 0\n" -" and.b32 %r37, %r34, 1073741823;\n" -" mov.u32 %r38, %r37;\n" -" mov.s32 %r39, 0;\n" -" mov.u32 %r40, %r39;\n" -" mov.s32 %r41, 0;\n" -" mov.u32 %r42, %r41;\n" -" mov.s32 %r43, 0;\n" -" mov.u32 %r44, %r43;\n" -" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];\n" -" mov.f32 %f34, %f30;\n" -" mov.f32 %f35, %f31;\n" -" mov.f32 %f36, %f32;\n" -" mov.f32 %f37, %f33;\n" -" cvt.rzi.ftz.s32.f32 %r45, %f37;\n" -" sub.ftz.f32 %f38, %f22, %f35;\n" -" sub.ftz.f32 %f39, %f21, %f34;\n" -" sub.ftz.f32 %f40, %f23, %f36;\n" -" mul.ftz.f32 %f41, %f38, %f38;\n" -" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n" -" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n" -" add.s32 %r46, %r45, %r33;\n" -" cvt.s64.s32 %rd33, %r46;\n" -" mul.wide.s32 %rd34, %r46, 16;\n" -" add.u64 %rd35, %rd34, %rd28;\n" -" ld.global.f32 %f44, [%rd35+8];\n" -" setp.gt.ftz.f32 %p4, %f44, %f43;\n" -" @!%p4 bra $Lt_0_21506;\n" -" .loc 16 79 0\n" -" rcp.approx.ftz.f32 %f45, %f43;\n" -" mul.ftz.f32 %f46, %f45, %f45;\n" -" mul.ftz.f32 %f47, %f45, %f46;\n" -" sqrt.approx.ftz.f32 %f48, %f47;\n" -" mul.ftz.f32 %f49, %f45, %f47;\n" -" ld.global.v2.f32 {%f50,%f51}, [%rd35+0];\n" -" mul.ftz.f32 %f52, %f50, %f48;\n" -" sub.ftz.f32 %f53, %f52, %f51;\n" -" mul.ftz.f32 %f54, %f49, %f53;\n" -" mul.ftz.f32 %f55, %f29, %f54;\n" -" .loc 16 81 0\n" -" fma.rn.ftz.f32 %f27, %f39, %f55, %f27;\n" -" .loc 16 82 0\n" -" fma.rn.ftz.f32 %f26, %f38, %f55, %f26;\n" -" .loc 16 83 0\n" -" fma.rn.ftz.f32 %f25, %f40, %f55, %f25;\n" -" ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r48, 0;\n" -" setp.le.s32 %p5, %r47, %r48;\n" -" @%p5 bra $Lt_0_20994;\n" -" .loc 16 87 0\n" -" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];\n" -" add.u64 %rd37, %rd36, %rd34;\n" -" ld.global.v4.f32 {%f56,%f57,%f58,_}, [%rd37+0];\n" -" mul.ftz.f32 %f59, %f56, %f48;\n" -" sub.ftz.f32 %f60, %f59, %f57;\n" -" mul.ftz.f32 %f61, %f47, %f60;\n" -" sub.ftz.f32 %f62, %f61, %f58;\n" -" fma.rn.ftz.f32 %f28, %f29, %f62, %f28;\n" -"$Lt_0_20994:\n" -" ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r50, 0;\n" -" setp.le.s32 %p6, %r49, %r50;\n" -" @%p6 bra $Lt_0_21506;\n" -" .loc 16 90 0\n" -" mov.f32 %f63, %f6;\n" -" mul.ftz.f32 %f64, %f39, %f39;\n" -" fma.rn.ftz.f32 %f65, %f55, %f64, %f63;\n" -" mov.f32 %f6, %f65;\n" -" .loc 16 91 0\n" -" mov.f32 %f66, %f8;\n" -" fma.rn.ftz.f32 %f67, %f55, %f41, %f66;\n" -" mov.f32 %f8, %f67;\n" -" .loc 16 92 0\n" -" mov.f32 %f68, %f10;\n" -" mul.ftz.f32 %f69, %f40, %f40;\n" -" fma.rn.ftz.f32 %f70, %f55, %f69, %f68;\n" -" mov.f32 %f10, %f70;\n" -" .loc 16 93 0\n" -" mov.f32 %f71, %f12;\n" -" mul.ftz.f32 %f72, %f38, %f39;\n" -" fma.rn.ftz.f32 %f73, %f55, %f72, %f71;\n" -" mov.f32 %f12, %f73;\n" -" .loc 16 94 0\n" -" mov.f32 %f74, %f14;\n" -" mul.ftz.f32 %f75, %f39, %f40;\n" -" fma.rn.ftz.f32 %f76, %f55, %f75, %f74;\n" -" mov.f32 %f14, %f76;\n" -" .loc 16 95 0\n" -" mul.ftz.f32 %f77, %f38, %f40;\n" -" fma.rn.ftz.f32 %f15, %f55, %f77, %f15;\n" -" mov.f32 %f16, %f15;\n" -"$Lt_0_21506:\n" -"$Lt_0_20482:\n" -" .loc 16 58 0\n" -" mul.lo.u64 %rd38, %rd27, 4;\n" -" add.u64 %rd20, %rd20, %rd38;\n" -" setp.lt.u64 %p7, %rd20, %rd19;\n" -" @%p7 bra $Lt_0_20226;\n" -" bra.uni $Lt_0_19714;\n" -"$Lt_0_27906:\n" -" mov.f32 %f25, 0f00000000; \n" -" mov.f32 %f26, 0f00000000; \n" -" mov.f32 %f27, 0f00000000; \n" -" mov.f32 %f28, 0f00000000; \n" -"$Lt_0_19714:\n" -" mov.u32 %r51, 1;\n" -" setp.le.s32 %p8, %r1, %r51;\n" -" @%p8 bra $Lt_0_24322;\n" -" .loc 16 100 0\n" -" mov.u64 %rd39, __cuda___cuda_local_var_32601_55_non_const_red_acc108;\n" -" cvt.s64.s32 %rd40, %r2;\n" -" mul.wide.s32 %rd41, %r2, 4;\n" -" add.u64 %rd42, %rd39, %rd41;\n" -" mov.f32 %f78, %f27;\n" -" st.shared.f32 [%rd42+0], %f78;\n" -" mov.f32 %f79, %f26;\n" -" st.shared.f32 [%rd42+512], %f79;\n" -" mov.f32 %f80, %f25;\n" -" st.shared.f32 [%rd42+1024], %f80;\n" -" mov.f32 %f81, %f28;\n" -" st.shared.f32 [%rd42+1536], %f81;\n" -" shr.s32 %r52, %r1, 31;\n" -" mov.s32 %r53, 1;\n" -" and.b32 %r54, %r52, %r53;\n" -" add.s32 %r55, %r54, %r1;\n" -" shr.s32 %r56, %r55, 1;\n" -" mov.s32 %r57, %r56;\n" -" mov.u32 %r58, 0;\n" -" setp.ne.u32 %p9, %r56, %r58;\n" -" @!%p9 bra $Lt_0_22786;\n" -"$Lt_0_23298:\n" -" setp.ge.u32 %p10, %r13, %r57;\n" -" @%p10 bra $Lt_0_23554;\n" -" add.u32 %r59, %r2, %r57;\n" -" cvt.u64.u32 %rd43, %r59;\n" -" mul.wide.u32 %rd44, %r59, 4;\n" -" add.u64 %rd45, %rd39, %rd44;\n" -" ld.shared.f32 %f82, [%rd45+0];\n" -" add.ftz.f32 %f78, %f82, %f78;\n" -" st.shared.f32 [%rd42+0], %f78;\n" -" ld.shared.f32 %f83, [%rd45+512];\n" -" add.ftz.f32 %f79, %f83, %f79;\n" -" st.shared.f32 [%rd42+512], %f79;\n" -" ld.shared.f32 %f84, [%rd45+1024];\n" -" add.ftz.f32 %f80, %f84, %f80;\n" -" st.shared.f32 [%rd42+1024], %f80;\n" -" ld.shared.f32 %f85, [%rd45+1536];\n" -" add.ftz.f32 %f81, %f85, %f81;\n" -" st.shared.f32 [%rd42+1536], %f81;\n" -"$Lt_0_23554:\n" -" shr.u32 %r57, %r57, 1;\n" -" mov.u32 %r60, 0;\n" -" setp.ne.u32 %p11, %r57, %r60;\n" -" @%p11 bra $Lt_0_23298;\n" -"$Lt_0_22786:\n" -" mov.f32 %f27, %f78;\n" -" mov.f32 %f26, %f79;\n" -" mov.f32 %f25, %f80;\n" -" mov.f32 %f28, %f81;\n" -" ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r62, 0;\n" -" setp.le.s32 %p12, %r61, %r62;\n" -" @%p12 bra $Lt_0_24322;\n" -" mov.f32 %f78, %f6;\n" -" st.shared.f32 [%rd42+0], %f78;\n" -" mov.f32 %f79, %f8;\n" -" st.shared.f32 [%rd42+512], %f79;\n" -" mov.f32 %f80, %f10;\n" -" st.shared.f32 [%rd42+1024], %f80;\n" -" mov.f32 %f81, %f12;\n" -" st.shared.f32 [%rd42+1536], %f81;\n" -" mov.f32 %f86, %f14;\n" -" st.shared.f32 [%rd42+2048], %f86;\n" -" mov.f32 %f87, %f15;\n" -" st.shared.f32 [%rd42+2560], %f87;\n" -" mov.s32 %r63, %r56;\n" -" @!%p9 bra $Lt_0_24834;\n" -"$Lt_0_25346:\n" -" setp.ge.u32 %p13, %r13, %r63;\n" -" @%p13 bra $Lt_0_25602;\n" -" add.u32 %r64, %r2, %r63;\n" -" cvt.u64.u32 %rd46, %r64;\n" -" mul.wide.u32 %rd47, %r64, 4;\n" -" add.u64 %rd48, %rd39, %rd47;\n" -" ld.shared.f32 %f88, [%rd48+0];\n" -" add.ftz.f32 %f78, %f88, %f78;\n" -" st.shared.f32 [%rd42+0], %f78;\n" -" ld.shared.f32 %f89, [%rd48+512];\n" -" add.ftz.f32 %f79, %f89, %f79;\n" -" st.shared.f32 [%rd42+512], %f79;\n" -" ld.shared.f32 %f90, [%rd48+1024];\n" -" add.ftz.f32 %f80, %f90, %f80;\n" -" st.shared.f32 [%rd42+1024], %f80;\n" -" ld.shared.f32 %f91, [%rd48+1536];\n" -" add.ftz.f32 %f81, %f91, %f81;\n" -" st.shared.f32 [%rd42+1536], %f81;\n" -" ld.shared.f32 %f92, [%rd48+2048];\n" -" add.ftz.f32 %f86, %f92, %f86;\n" -" st.shared.f32 [%rd42+2048], %f86;\n" -" ld.shared.f32 %f93, [%rd48+2560];\n" -" add.ftz.f32 %f87, %f93, %f87;\n" -" st.shared.f32 [%rd42+2560], %f87;\n" -"$Lt_0_25602:\n" -" shr.u32 %r63, %r63, 1;\n" -" mov.u32 %r65, 0;\n" -" setp.ne.u32 %p14, %r63, %r65;\n" -" @%p14 bra $Lt_0_25346;\n" -"$Lt_0_24834:\n" -" mov.f32 %f6, %f78;\n" -" mov.f32 %f8, %f79;\n" -" mov.f32 %f10, %f80;\n" -" mov.f32 %f12, %f81;\n" -" mov.f32 %f14, %f86;\n" -" mov.f32 %f16, %f87;\n" -"$Lt_0_24322:\n" -"$Lt_0_22274:\n" -" mov.u32 %r66, 0;\n" -" setp.ne.s32 %p15, %r13, %r66;\n" -" @%p15 bra $Lt_0_26370;\n" -" ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];\n" -" add.u64 %rd50, %rd49, %rd5;\n" -" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r68, 0;\n" -" setp.le.s32 %p16, %r67, %r68;\n" -" @%p16 bra $Lt_0_26882;\n" -" st.global.f32 [%rd50+0], %f28;\n" -" cvt.s64.s32 %rd51, %r9;\n" -" mul.wide.s32 %rd52, %r9, 4;\n" -" add.u64 %rd50, %rd50, %rd52;\n" -"$Lt_0_26882:\n" -" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r70, 0;\n" -" setp.le.s32 %p17, %r69, %r70;\n" -" @%p17 bra $Lt_0_27394;\n" -" mov.f32 %f94, %f6;\n" -" st.global.f32 [%rd50+0], %f94;\n" -" cvt.s64.s32 %rd53, %r9;\n" -" mul.wide.s32 %rd54, %r9, 4;\n" -" add.u64 %rd55, %rd54, %rd50;\n" -" mov.f32 %f95, %f8;\n" -" st.global.f32 [%rd55+0], %f95;\n" -" add.u64 %rd56, %rd54, %rd55;\n" -" mov.f32 %f96, %f10;\n" -" st.global.f32 [%rd56+0], %f96;\n" -" add.u64 %rd57, %rd54, %rd56;\n" -" mov.f32 %f97, %f12;\n" -" st.global.f32 [%rd57+0], %f97;\n" -" add.u64 %rd50, %rd54, %rd57;\n" -" mov.f32 %f98, %f14;\n" -" st.global.f32 [%rd50+0], %f98;\n" -" mov.f32 %f99, %f16;\n" -" add.u64 %rd58, %rd54, %rd50;\n" -" st.global.f32 [%rd58+0], %f99;\n" -"$Lt_0_27394:\n" -" ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];\n" -" mul.lo.u64 %rd60, %rd4, 16;\n" -" add.u64 %rd61, %rd59, %rd60;\n" -" mov.f32 %f100, %f101;\n" -" st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f100};\n" -"$Lt_0_26370:\n" -"$Lt_0_18690:\n" -" .loc 16 103 0\n" -" exit;\n" -"$LDWend_kernel_pair:\n" -" }\n" -" .entry kernel_pair_fast (\n" -" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" -" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" -" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" -" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" -" {\n" -" .reg .u32 %r<74>;\n" -" .reg .u64 %rd<75>;\n" -" .reg .f32 %f<109>;\n" -" .reg .pred %p<22>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32618_33_non_const_sp_lj3268[16];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_lj13296[1936];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32617_34_non_const_lj35232[1936];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32687_55_non_const_red_acc7168[3072];\n" -" .loc 16 111 0\n" -"$LDWbegin_kernel_pair_fast:\n" -" cvt.s32.u32 %r1, %tid.x;\n" -" mov.u32 %r2, 3;\n" -" setp.gt.s32 %p1, %r1, %r2;\n" -" @%p1 bra $Lt_1_20994;\n" -" .loc 16 119 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;\n" -" cvt.s64.s32 %rd2, %r1;\n" -" mul.wide.s32 %rd3, %r1, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f32 %f1, [%rd5+0];\n" -" add.u64 %rd6, %rd3, %rd1;\n" -" st.shared.f32 [%rd6+0], %f1;\n" -"$Lt_1_20994:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;\n" -" mov.u32 %r3, 120;\n" -" setp.gt.s32 %p2, %r1, %r3;\n" -" @%p2 bra $Lt_1_21506;\n" -" .loc 16 121 0\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_lj13296;\n" -" cvt.s64.s32 %rd8, %r1;\n" -" mul.wide.s32 %rd9, %r1, 16;\n" -" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n" -" add.u64 %rd11, %rd10, %rd9;\n" -" add.u64 %rd12, %rd9, %rd7;\n" -" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n" -" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n" -" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r5, 0;\n" -" setp.le.s32 %p3, %r4, %r5;\n" -" @%p3 bra $Lt_1_22018;\n" -" .loc 16 123 0\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232;\n" -" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n" -" add.u64 %rd15, %rd14, %rd9;\n" -" add.u64 %rd16, %rd9, %rd13;\n" -" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n" -" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n" -"$Lt_1_22018:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232;\n" -"$Lt_1_21506:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232;\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_lj13296;\n" -" .loc 16 131 0\n" -" mov.f32 %f10, 0f00000000; \n" -" mov.f32 %f11, %f10;\n" -" mov.f32 %f12, 0f00000000; \n" -" mov.f32 %f13, %f12;\n" -" mov.f32 %f14, 0f00000000; \n" -" mov.f32 %f15, %f14;\n" -" mov.f32 %f16, 0f00000000; \n" -" mov.f32 %f17, %f16;\n" -" mov.f32 %f18, 0f00000000; \n" -" mov.f32 %f19, %f18;\n" -" mov.f32 %f20, 0f00000000; \n" -" mov.f32 %f21, %f20;\n" -" .loc 16 133 0\n" -" bar.sync 0;\n" -" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n" -" div.s32 %r7, %r1, %r6;\n" -" cvt.s32.u32 %r8, %ntid.x;\n" -" div.s32 %r9, %r8, %r6;\n" -" cvt.s32.u32 %r10, %ctaid.x;\n" -" mul.lo.s32 %r11, %r10, %r9;\n" -" add.s32 %r12, %r7, %r11;\n" -" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n" -" setp.ge.s32 %p4, %r12, %r13;\n" -" @%p4 bra $Lt_1_30210;\n" -" .loc 16 138 0\n" -" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" -" cvt.s64.s32 %rd17, %r14;\n" -" mul.wide.s32 %rd18, %r14, 4;\n" -" cvt.s64.s32 %rd19, %r12;\n" -" mul.wide.s32 %rd20, %r12, 4;\n" -" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n" -" add.u64 %rd22, %rd20, %rd21;\n" -" add.u64 %rd23, %rd18, %rd22;\n" -" ld.global.s32 %r15, [%rd23+0];\n" -" sub.s32 %r16, %r6, 1;\n" -" and.b32 %r17, %r16, %r1;\n" -" cvt.s64.s32 %rd24, %r17;\n" -" mul.wide.s32 %rd25, %r17, 4;\n" -" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n" -" setp.ne.u64 %p5, %rd26, %rd21;\n" -" @%p5 bra $Lt_1_23298;\n" -" cvt.s32.s64 %r18, %rd17;\n" -" mul.lo.s32 %r19, %r18, %r6;\n" -" mov.s32 %r20, %r19;\n" -" mul.lo.s32 %r21, %r16, %r12;\n" -" add.s32 %r22, %r18, %r21;\n" -" cvt.s64.s32 %rd27, %r22;\n" -" mul.wide.s32 %rd28, %r22, 4;\n" -" add.u64 %rd29, %rd23, %rd28;\n" -" and.b32 %r23, %r16, %r15;\n" -" cvt.s64.s32 %rd30, %r23;\n" -" div.s32 %r24, %r15, %r6;\n" -" mul.lo.s32 %r25, %r19, %r24;\n" -" cvt.s64.s32 %rd31, %r25;\n" -" add.u64 %rd32, %rd30, %rd31;\n" -" mul.lo.u64 %rd33, %rd32, 4;\n" -" add.u64 %rd34, %rd29, %rd33;\n" -" add.u64 %rd35, %rd25, %rd29;\n" -" bra.uni $Lt_1_23042;\n" -"$Lt_1_23298:\n" -" add.u64 %rd36, %rd18, %rd23;\n" -" ld.global.s32 %r26, [%rd36+0];\n" -" cvt.s64.s32 %rd37, %r26;\n" -" mul.wide.s32 %rd38, %r26, 4;\n" -" add.u64 %rd39, %rd26, %rd38;\n" -" cvt.s64.s32 %rd40, %r15;\n" -" mul.wide.s32 %rd41, %r15, 4;\n" -" add.u64 %rd34, %rd39, %rd41;\n" -" mov.s32 %r20, %r6;\n" -" add.u64 %rd35, %rd25, %rd39;\n" -"$Lt_1_23042:\n" -" .loc 16 141 0\n" -" ld.global.s32 %r27, [%rd22+0];\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" mov.s32 %r31, 0;\n" -" mov.u32 %r32, %r31;\n" -" mov.s32 %r33, 0;\n" -" mov.u32 %r34, %r33;\n" -" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];\n" -" mov.f32 %f26, %f22;\n" -" mov.f32 %f27, %f23;\n" -" mov.f32 %f28, %f24;\n" -" mov.f32 %f29, %f25;\n" -" setp.ge.u64 %p6, %rd35, %rd34;\n" -" @%p6 bra $Lt_1_31746;\n" -" cvt.rzi.ftz.s32.f32 %r35, %f29;\n" -" cvt.s64.s32 %rd42, %r20;\n" -" mul.lo.s32 %r36, %r35, 11;\n" -" cvt.rn.f32.s32 %f30, %r36;\n" -" mov.f32 %f31, 0f00000000; \n" -" mov.f32 %f32, 0f00000000; \n" -" mov.f32 %f33, 0f00000000; \n" -" mov.f32 %f34, 0f00000000; \n" -"$Lt_1_24066:\n" -" .loc 16 148 0\n" -" ld.global.s32 %r37, [%rd35+0];\n" -" .loc 16 152 0\n" -" and.b32 %r38, %r37, 1073741823;\n" -" mov.u32 %r39, %r38;\n" -" mov.s32 %r40, 0;\n" -" mov.u32 %r41, %r40;\n" -" mov.s32 %r42, 0;\n" -" mov.u32 %r43, %r42;\n" -" mov.s32 %r44, 0;\n" -" mov.u32 %r45, %r44;\n" -" tex.1d.v4.f32.s32 {%f35,%f36,%f37,%f38},[pos_tex,{%r39,%r41,%r43,%r45}];\n" -" mov.f32 %f39, %f35;\n" -" mov.f32 %f40, %f36;\n" -" mov.f32 %f41, %f37;\n" -" mov.f32 %f42, %f38;\n" -" sub.ftz.f32 %f43, %f27, %f40;\n" -" sub.ftz.f32 %f44, %f26, %f39;\n" -" sub.ftz.f32 %f45, %f28, %f41;\n" -" mul.ftz.f32 %f46, %f43, %f43;\n" -" fma.rn.ftz.f32 %f47, %f44, %f44, %f46;\n" -" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n" -" add.ftz.f32 %f49, %f30, %f42;\n" -" cvt.rzi.ftz.s32.f32 %r46, %f49;\n" -" cvt.s64.s32 %rd43, %r46;\n" -" mul.wide.s32 %rd44, %r46, 16;\n" -" add.u64 %rd45, %rd44, %rd7;\n" -" ld.shared.f32 %f50, [%rd45+8];\n" -" setp.gt.ftz.f32 %p7, %f50, %f48;\n" -" @!%p7 bra $Lt_1_25346;\n" -" .loc 16 165 0\n" -" rcp.approx.ftz.f32 %f51, %f48;\n" -" mul.ftz.f32 %f52, %f51, %f51;\n" -" mul.ftz.f32 %f53, %f51, %f52;\n" -" sqrt.approx.ftz.f32 %f54, %f53;\n" -" mul.ftz.f32 %f55, %f51, %f53;\n" -" ld.shared.v2.f32 {%f56,%f57}, [%rd45+0];\n" -" mul.ftz.f32 %f58, %f56, %f54;\n" -" sub.ftz.f32 %f59, %f58, %f57;\n" -" mul.ftz.f32 %f60, %f55, %f59;\n" -" .loc 16 167 0\n" -" fma.rn.ftz.f32 %f33, %f44, %f60, %f33;\n" -" .loc 16 168 0\n" -" fma.rn.ftz.f32 %f32, %f43, %f60, %f32;\n" -" .loc 16 169 0\n" -" fma.rn.ftz.f32 %f31, %f45, %f60, %f31;\n" -" ld.param.s32 %r47, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r48, 0;\n" -" setp.le.s32 %p8, %r47, %r48;\n" -" @%p8 bra $Lt_1_24834;\n" -" .loc 16 172 0\n" -" add.u64 %rd46, %rd44, %rd13;\n" -" ld.shared.v4.f32 {%f61,%f62,%f63,_}, [%rd46+0];\n" -" mul.ftz.f32 %f64, %f61, %f54;\n" -" sub.ftz.f32 %f65, %f64, %f62;\n" -" mul.ftz.f32 %f66, %f53, %f65;\n" -" .loc 16 173 0\n" -" shr.s32 %r49, %r37, 30;\n" -" and.b32 %r50, %r49, 3;\n" -" cvt.s64.s32 %rd47, %r50;\n" -" mul.wide.s32 %rd48, %r50, 4;\n" -" add.u64 %rd49, %rd1, %rd48;\n" -" ld.shared.f32 %f67, [%rd49+0];\n" -" sub.ftz.f32 %f68, %f66, %f63;\n" -" fma.rn.ftz.f32 %f34, %f67, %f68, %f34;\n" -"$Lt_1_24834:\n" -" ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r52, 0;\n" -" setp.le.s32 %p9, %r51, %r52;\n" -" @%p9 bra $Lt_1_25346;\n" -" .loc 16 176 0\n" -" mov.f32 %f69, %f11;\n" -" mul.ftz.f32 %f70, %f44, %f44;\n" -" fma.rn.ftz.f32 %f71, %f60, %f70, %f69;\n" -" mov.f32 %f11, %f71;\n" -" .loc 16 177 0\n" -" mov.f32 %f72, %f13;\n" -" fma.rn.ftz.f32 %f73, %f60, %f46, %f72;\n" -" mov.f32 %f13, %f73;\n" -" .loc 16 178 0\n" -" mov.f32 %f74, %f15;\n" -" mul.ftz.f32 %f75, %f45, %f45;\n" -" fma.rn.ftz.f32 %f76, %f60, %f75, %f74;\n" -" mov.f32 %f15, %f76;\n" -" .loc 16 179 0\n" -" mov.f32 %f77, %f17;\n" -" mul.ftz.f32 %f78, %f43, %f44;\n" -" fma.rn.ftz.f32 %f79, %f60, %f78, %f77;\n" -" mov.f32 %f17, %f79;\n" -" .loc 16 180 0\n" -" mov.f32 %f80, %f19;\n" -" mul.ftz.f32 %f81, %f44, %f45;\n" -" fma.rn.ftz.f32 %f82, %f60, %f81, %f80;\n" -" mov.f32 %f19, %f82;\n" -" .loc 16 181 0\n" -" mul.ftz.f32 %f83, %f43, %f45;\n" -" fma.rn.ftz.f32 %f20, %f60, %f83, %f20;\n" -" mov.f32 %f21, %f20;\n" -"$Lt_1_25346:\n" -"$Lt_1_24322:\n" -" .loc 16 146 0\n" -" mul.lo.u64 %rd50, %rd42, 4;\n" -" add.u64 %rd35, %rd35, %rd50;\n" -" setp.lt.u64 %p10, %rd35, %rd34;\n" -" @%p10 bra $Lt_1_24066;\n" -" bra.uni $Lt_1_23554;\n" -"$Lt_1_31746:\n" -" mov.f32 %f31, 0f00000000; \n" -" mov.f32 %f32, 0f00000000; \n" -" mov.f32 %f33, 0f00000000; \n" -" mov.f32 %f34, 0f00000000; \n" -"$Lt_1_23554:\n" -" mov.u32 %r53, 1;\n" -" setp.le.s32 %p11, %r6, %r53;\n" -" @%p11 bra $Lt_1_28162;\n" -" .loc 16 186 0\n" -" mov.u64 %rd51, __cuda___cuda_local_var_32687_55_non_const_red_acc7168;\n" -" cvt.s64.s32 %rd52, %r1;\n" -" mul.wide.s32 %rd53, %r1, 4;\n" -" add.u64 %rd54, %rd51, %rd53;\n" -" mov.f32 %f84, %f33;\n" -" st.shared.f32 [%rd54+0], %f84;\n" -" mov.f32 %f85, %f32;\n" -" st.shared.f32 [%rd54+512], %f85;\n" -" mov.f32 %f86, %f31;\n" -" st.shared.f32 [%rd54+1024], %f86;\n" -" mov.f32 %f87, %f34;\n" -" st.shared.f32 [%rd54+1536], %f87;\n" -" shr.s32 %r54, %r6, 31;\n" -" mov.s32 %r55, 1;\n" -" and.b32 %r56, %r54, %r55;\n" -" add.s32 %r57, %r56, %r6;\n" -" shr.s32 %r58, %r57, 1;\n" -" mov.s32 %r59, %r58;\n" -" mov.u32 %r60, 0;\n" -" setp.ne.u32 %p12, %r58, %r60;\n" -" @!%p12 bra $Lt_1_26626;\n" -"$Lt_1_27138:\n" -" setp.ge.u32 %p13, %r17, %r59;\n" -" @%p13 bra $Lt_1_27394;\n" -" add.u32 %r61, %r1, %r59;\n" -" cvt.u64.u32 %rd55, %r61;\n" -" mul.wide.u32 %rd56, %r61, 4;\n" -" add.u64 %rd57, %rd51, %rd56;\n" -" ld.shared.f32 %f88, [%rd57+0];\n" -" add.ftz.f32 %f84, %f88, %f84;\n" -" st.shared.f32 [%rd54+0], %f84;\n" -" ld.shared.f32 %f89, [%rd57+512];\n" -" add.ftz.f32 %f85, %f89, %f85;\n" -" st.shared.f32 [%rd54+512], %f85;\n" -" ld.shared.f32 %f90, [%rd57+1024];\n" -" add.ftz.f32 %f86, %f90, %f86;\n" -" st.shared.f32 [%rd54+1024], %f86;\n" -" ld.shared.f32 %f91, [%rd57+1536];\n" -" add.ftz.f32 %f87, %f91, %f87;\n" -" st.shared.f32 [%rd54+1536], %f87;\n" -"$Lt_1_27394:\n" -" shr.u32 %r59, %r59, 1;\n" -" mov.u32 %r62, 0;\n" -" setp.ne.u32 %p14, %r59, %r62;\n" -" @%p14 bra $Lt_1_27138;\n" -"$Lt_1_26626:\n" -" mov.f32 %f33, %f84;\n" -" mov.f32 %f32, %f85;\n" -" mov.f32 %f31, %f86;\n" -" mov.f32 %f34, %f87;\n" -" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r64, 0;\n" -" setp.le.s32 %p15, %r63, %r64;\n" -" @%p15 bra $Lt_1_28162;\n" -" mov.f32 %f84, %f11;\n" -" st.shared.f32 [%rd54+0], %f84;\n" -" mov.f32 %f85, %f13;\n" -" st.shared.f32 [%rd54+512], %f85;\n" -" mov.f32 %f86, %f15;\n" -" st.shared.f32 [%rd54+1024], %f86;\n" -" mov.f32 %f87, %f17;\n" -" st.shared.f32 [%rd54+1536], %f87;\n" -" mov.f32 %f92, %f19;\n" -" st.shared.f32 [%rd54+2048], %f92;\n" -" mov.f32 %f93, %f20;\n" -" st.shared.f32 [%rd54+2560], %f93;\n" -" mov.s32 %r65, %r58;\n" -" @!%p12 bra $Lt_1_28674;\n" -"$Lt_1_29186:\n" -" setp.ge.u32 %p16, %r17, %r65;\n" -" @%p16 bra $Lt_1_29442;\n" -" add.u32 %r66, %r1, %r65;\n" -" cvt.u64.u32 %rd58, %r66;\n" -" mul.wide.u32 %rd59, %r66, 4;\n" -" add.u64 %rd60, %rd51, %rd59;\n" -" ld.shared.f32 %f94, [%rd60+0];\n" -" add.ftz.f32 %f84, %f94, %f84;\n" -" st.shared.f32 [%rd54+0], %f84;\n" -" ld.shared.f32 %f95, [%rd60+512];\n" -" add.ftz.f32 %f85, %f95, %f85;\n" -" st.shared.f32 [%rd54+512], %f85;\n" -" ld.shared.f32 %f96, [%rd60+1024];\n" -" add.ftz.f32 %f86, %f96, %f86;\n" -" st.shared.f32 [%rd54+1024], %f86;\n" -" ld.shared.f32 %f97, [%rd60+1536];\n" -" add.ftz.f32 %f87, %f97, %f87;\n" -" st.shared.f32 [%rd54+1536], %f87;\n" -" ld.shared.f32 %f98, [%rd60+2048];\n" -" add.ftz.f32 %f92, %f98, %f92;\n" -" st.shared.f32 [%rd54+2048], %f92;\n" -" ld.shared.f32 %f99, [%rd60+2560];\n" -" add.ftz.f32 %f93, %f99, %f93;\n" -" st.shared.f32 [%rd54+2560], %f93;\n" -"$Lt_1_29442:\n" -" shr.u32 %r65, %r65, 1;\n" -" mov.u32 %r67, 0;\n" -" setp.ne.u32 %p17, %r65, %r67;\n" -" @%p17 bra $Lt_1_29186;\n" -"$Lt_1_28674:\n" -" mov.f32 %f11, %f84;\n" -" mov.f32 %f13, %f85;\n" -" mov.f32 %f15, %f86;\n" -" mov.f32 %f17, %f87;\n" -" mov.f32 %f19, %f92;\n" -" mov.f32 %f21, %f93;\n" -"$Lt_1_28162:\n" -"$Lt_1_26114:\n" -" mov.u32 %r68, 0;\n" -" setp.ne.s32 %p18, %r17, %r68;\n" -" @%p18 bra $Lt_1_30210;\n" -" ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];\n" -" add.u64 %rd62, %rd61, %rd20;\n" -" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r70, 0;\n" -" setp.le.s32 %p19, %r69, %r70;\n" -" @%p19 bra $Lt_1_30722;\n" -" st.global.f32 [%rd62+0], %f34;\n" -" cvt.s64.s32 %rd63, %r13;\n" -" mul.wide.s32 %rd64, %r13, 4;\n" -" add.u64 %rd62, %rd62, %rd64;\n" -"$Lt_1_30722:\n" -" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r72, 0;\n" -" setp.le.s32 %p20, %r71, %r72;\n" -" @%p20 bra $Lt_1_31234;\n" -" mov.f32 %f100, %f11;\n" -" st.global.f32 [%rd62+0], %f100;\n" -" cvt.s64.s32 %rd65, %r13;\n" -" mul.wide.s32 %rd66, %r13, 4;\n" -" add.u64 %rd67, %rd66, %rd62;\n" -" mov.f32 %f101, %f13;\n" -" st.global.f32 [%rd67+0], %f101;\n" -" add.u64 %rd68, %rd66, %rd67;\n" -" mov.f32 %f102, %f15;\n" -" st.global.f32 [%rd68+0], %f102;\n" -" add.u64 %rd69, %rd66, %rd68;\n" -" mov.f32 %f103, %f17;\n" -" st.global.f32 [%rd69+0], %f103;\n" -" add.u64 %rd62, %rd66, %rd69;\n" -" mov.f32 %f104, %f19;\n" -" st.global.f32 [%rd62+0], %f104;\n" -" mov.f32 %f105, %f21;\n" -" add.u64 %rd70, %rd66, %rd62;\n" -" st.global.f32 [%rd70+0], %f105;\n" -"$Lt_1_31234:\n" -" ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];\n" -" mul.lo.u64 %rd72, %rd19, 16;\n" -" add.u64 %rd73, %rd71, %rd72;\n" -" mov.f32 %f106, %f107;\n" -" st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f106};\n" -"$Lt_1_30210:\n" -"$Lt_1_22530:\n" -" .loc 16 189 0\n" -" exit;\n" -"$LDWend_kernel_pair_fast:\n" -" }\n" -; diff --git a/lib/gpu/lj_class2_long.ptx b/lib/gpu/lj_class2_long.ptx deleted file mode 100644 index 3ffb43ace8..0000000000 --- a/lib/gpu/lj_class2_long.ptx +++ /dev/null @@ -1,1133 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009d9c_00000000-9_lal_lj_class2_long.cpp3.i (/home/sjplimp/ccBI#.JI7tD2) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009d9c_00000000-8_lal_lj_class2_long.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_lj_class2_long.cu" - .file 17 "/usr/local/cuda/include/common_functions.h" - .file 18 "/usr/local/cuda/include/math_functions.h" - .file 19 "/usr/local/cuda/include/math_constants.h" - .file 20 "/usr/local/cuda/include/device_functions.h" - .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - .global .texref pos_tex; - .global .texref q_tex; - - .entry kernel_pair ( - .param .u64 __cudaparm_kernel_pair_x_, - .param .u64 __cudaparm_kernel_pair_lj1, - .param .u64 __cudaparm_kernel_pair_lj3, - .param .s32 __cudaparm_kernel_pair_lj_types, - .param .u64 __cudaparm_kernel_pair_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_dev_nbor, - .param .u64 __cudaparm_kernel_pair_dev_packed, - .param .u64 __cudaparm_kernel_pair_ans, - .param .u64 __cudaparm_kernel_pair___val_paramengv, - .param .s32 __cudaparm_kernel_pair_eflag, - .param .s32 __cudaparm_kernel_pair_vflag, - .param .s32 __cudaparm_kernel_pair_inum, - .param .s32 __cudaparm_kernel_pair_nbor_pitch, - .param .u64 __cudaparm_kernel_pair_q_, - .param .f32 __cudaparm_kernel_pair_cut_coulsq, - .param .f32 __cudaparm_kernel_pair_qqrd2e, - .param .f32 __cudaparm_kernel_pair_g_ewald, - .param .s32 __cudaparm_kernel_pair_t_per_atom) - { - .reg .u32 %r<86>; - .reg .u64 %rd<64>; - .reg .f32 %f<167>; - .reg .pred %p<21>; - .shared .align 16 .b8 __cuda___cuda_local_var_32541_33_non_const_sp_lj112[32]; - .shared .align 4 .b8 __cuda___cuda_local_var_32635_55_non_const_red_acc144[3072]; - // __cuda_local_var_32553_10_non_const_f = 64 - // __cuda_local_var_32555_9_non_const_virial = 16 - // __cuda_local_var_32589_43_non_const_r6inv = 44 - // __cuda_local_var_32589_50_non_const_r3inv = 40 - // __cuda_local_var_32589_57_non_const_prefactor = 52 - // __cuda_local_var_32589_68_non_const__erfc = 48 - .loc 16 36 0 -$LDWbegin_kernel_pair: - .loc 16 41 0 - ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; - ldu.global.f32 %f1, [%rd1+0]; - .loc 16 42 0 - ld.global.f32 %f2, [%rd1+4]; - .loc 16 43 0 - ld.global.f32 %f3, [%rd1+8]; - .loc 16 44 0 - ld.global.f32 %f4, [%rd1+12]; - st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_lj112+0], {%f1,%f2,%f3,%f4}; - .loc 16 45 0 - ld.global.f32 %f5, [%rd1+16]; - .loc 16 46 0 - ld.global.f32 %f6, [%rd1+20]; - .loc 16 47 0 - ld.global.f32 %f7, [%rd1+24]; - .loc 16 48 0 - ld.global.f32 %f8, [%rd1+28]; - st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_lj112+16], {%f5,%f6,%f7,%f8}; - .loc 16 56 0 - mov.f32 %f9, 0f00000000; // 0 - mov.f32 %f10, %f9; - mov.f32 %f11, 0f00000000; // 0 - mov.f32 %f12, %f11; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, %f13; - mov.f32 %f15, 0f00000000; // 0 - mov.f32 %f16, %f15; - mov.f32 %f17, 0f00000000; // 0 - mov.f32 %f18, %f17; - mov.f32 %f19, 0f00000000; // 0 - mov.f32 %f20, %f19; - ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_pair_inum]; - setp.ge.s32 %p1, %r8, %r9; - @%p1 bra $Lt_0_30978; - .loc 16 61 0 - cvt.s64.s32 %rd2, %r8; - mul.wide.s32 %rd3, %r8, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor]; - add.u64 %rd5, %rd3, %rd4; - ld.global.s32 %r10, [%rd5+0]; - ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch]; - cvt.s64.s32 %rd6, %r11; - mul.wide.s32 %rd7, %r11, 4; - add.u64 %rd8, %rd7, %rd5; - ld.global.s32 %r12, [%rd8+0]; - sub.s32 %r13, %r1, 1; - and.b32 %r14, %r13, %r2; - cvt.s64.s32 %rd9, %r14; - mul.wide.s32 %rd10, %r14, 4; - ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed]; - setp.ne.u64 %p2, %rd11, %rd4; - @%p2 bra $Lt_0_22530; - cvt.s32.s64 %r15, %rd6; - mul.lo.s32 %r16, %r15, %r1; - mov.s32 %r17, %r16; - mul.lo.s32 %r18, %r13, %r8; - add.s32 %r19, %r15, %r18; - cvt.s64.s32 %rd12, %r19; - mul.wide.s32 %rd13, %r19, 4; - add.u64 %rd14, %rd8, %rd13; - and.b32 %r20, %r13, %r12; - cvt.s64.s32 %rd15, %r20; - div.s32 %r21, %r12, %r1; - mul.lo.s32 %r22, %r16, %r21; - cvt.s64.s32 %rd16, %r22; - add.u64 %rd17, %rd15, %rd16; - mul.lo.u64 %rd18, %rd17, 4; - add.u64 %rd19, %rd14, %rd18; - add.u64 %rd20, %rd10, %rd14; - bra.uni $Lt_0_22274; -$Lt_0_22530: - add.u64 %rd21, %rd7, %rd8; - ld.global.s32 %r23, [%rd21+0]; - cvt.s64.s32 %rd22, %r23; - mul.wide.s32 %rd23, %r23, 4; - add.u64 %rd24, %rd11, %rd23; - cvt.s64.s32 %rd25, %r12; - mul.wide.s32 %rd26, %r12, 4; - add.u64 %rd19, %rd24, %rd26; - mov.s32 %r17, %r1; - add.u64 %rd20, %rd10, %rd24; -$Lt_0_22274: - .loc 16 64 0 - mov.u32 %r24, %r10; - mov.s32 %r25, 0; - mov.u32 %r26, %r25; - mov.s32 %r27, 0; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r24,%r26,%r28,%r30}]; - mov.f32 %f25, %f21; - mov.f32 %f26, %f22; - mov.f32 %f27, %f23; - mov.f32 %f28, %f24; - .loc 16 65 0 - mov.u32 %r31, %r10; - mov.s32 %r32, 0; - mov.u32 %r33, %r32; - mov.s32 %r34, 0; - mov.u32 %r35, %r34; - mov.s32 %r36, 0; - mov.u32 %r37, %r36; - tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r31,%r33,%r35,%r37}]; - mov.f32 %f33, %f29; - setp.ge.u64 %p3, %rd20, %rd19; - @%p3 bra $Lt_0_32514; - cvt.rzi.ftz.s32.f32 %r38, %f28; - cvt.s64.s32 %rd27, %r17; - ld.param.s32 %r39, [__cudaparm_kernel_pair_lj_types]; - mul.lo.s32 %r40, %r39, %r38; - ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1]; - mov.f32 %f34, 0f00000000; // 0 - mov.f32 %f35, 0f00000000; // 0 - mov.f32 %f36, 0f00000000; // 0 - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 - mov.u64 %rd29, __cuda___cuda_local_var_32541_33_non_const_sp_lj112; -$Lt_0_23298: - // Loop body line 65, nesting depth: 1, estimated iterations: unknown - .loc 16 69 0 - ld.global.s32 %r41, [%rd20+0]; - .loc 16 72 0 - shr.s32 %r42, %r41, 30; - and.b32 %r43, %r42, 3; - cvt.s64.s32 %rd30, %r43; - mul.wide.s32 %rd31, %r43, 4; - add.u64 %rd32, %rd29, %rd31; - ld.shared.f32 %f39, [%rd32+0]; - .loc 16 73 0 - mov.f32 %f40, 0f3f800000; // 1 - ld.shared.f32 %f41, [%rd32+16]; - sub.ftz.f32 %f42, %f40, %f41; - .loc 16 76 0 - and.b32 %r44, %r41, 1073741823; - mov.u32 %r45, %r44; - mov.s32 %r46, 0; - mov.u32 %r47, %r46; - mov.s32 %r48, 0; - mov.u32 %r49, %r48; - mov.s32 %r50, 0; - mov.u32 %r51, %r50; - tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r45,%r47,%r49,%r51}]; - mov.f32 %f47, %f43; - mov.f32 %f48, %f44; - mov.f32 %f49, %f45; - mov.f32 %f50, %f46; - cvt.rzi.ftz.s32.f32 %r52, %f50; - sub.ftz.f32 %f51, %f26, %f48; - sub.ftz.f32 %f52, %f25, %f47; - sub.ftz.f32 %f53, %f27, %f49; - mul.ftz.f32 %f54, %f51, %f51; - fma.rn.ftz.f32 %f55, %f52, %f52, %f54; - fma.rn.ftz.f32 %f56, %f53, %f53, %f55; - add.s32 %r53, %r52, %r40; - cvt.s64.s32 %rd33, %r53; - mul.wide.s32 %rd34, %r53, 16; - add.u64 %rd35, %rd34, %rd28; - ld.global.f32 %f57, [%rd35+8]; - setp.gt.ftz.f32 %p4, %f57, %f56; - @!%p4 bra $Lt_0_26114; - rcp.approx.ftz.f32 %f58, %f56; - ld.global.f32 %f59, [%rd35+12]; - setp.lt.ftz.f32 %p5, %f56, %f59; - @!%p5 bra $Lt_0_24322; - .loc 16 92 0 - rsqrt.approx.ftz.f32 %f60, %f56; - mul.ftz.f32 %f61, %f58, %f60; - mov.f32 %f62, %f61; - .loc 16 93 0 - mul.ftz.f32 %f63, %f61, %f61; - mov.f32 %f64, %f63; - .loc 16 94 0 - mul.ftz.f32 %f65, %f63, %f39; - ld.global.v2.f32 {%f66,%f67}, [%rd35+0]; - mul.ftz.f32 %f68, %f66, %f61; - sub.ftz.f32 %f69, %f68, %f67; - mul.ftz.f32 %f70, %f65, %f69; - bra.uni $Lt_0_24066; -$Lt_0_24322: - .loc 16 96 0 - mov.f32 %f70, 0f00000000; // 0 -$Lt_0_24066: - ld.param.f32 %f71, [__cudaparm_kernel_pair_cut_coulsq]; - setp.gt.ftz.f32 %p6, %f71, %f56; - @!%p6 bra $Lt_0_24834; - .loc 20 518 0 - rsqrt.approx.ftz.f32 %f72, %f58; - ld.param.f32 %f73, [__cudaparm_kernel_pair_g_ewald]; - mul.ftz.f32 %f74, %f73, %f72; - mul.ftz.f32 %f75, %f74, %f74; - neg.ftz.f32 %f76, %f75; - mov.f32 %f77, 0f3fb8aa3b; // 1.4427 - mul.ftz.f32 %f78, %f76, %f77; - ex2.approx.ftz.f32 %f79, %f78; - .loc 16 103 0 - mov.f32 %f80, 0f3f800000; // 1 - mov.f32 %f81, 0f3ea7ba05; // 0.327591 - fma.rn.ftz.f32 %f82, %f81, %f74, %f80; - rcp.approx.ftz.f32 %f83, %f82; - mov.f32 %f84, 0f3e827906; // 0.25483 - mov.f32 %f85, 0fbe91a98e; // -0.284497 - mov.f32 %f86, 0f3fb5f0e3; // 1.42141 - mov.f32 %f87, 0fbfba00e3; // -1.45315 - mov.f32 %f88, 0f3f87dc22; // 1.06141 - fma.rn.ftz.f32 %f89, %f88, %f83, %f87; - fma.rn.ftz.f32 %f90, %f83, %f89, %f86; - fma.rn.ftz.f32 %f91, %f83, %f90, %f85; - fma.rn.ftz.f32 %f92, %f83, %f91, %f84; - mul.ftz.f32 %f93, %f83, %f92; - mul.ftz.f32 %f94, %f79, %f93; - mov.f32 %f95, %f94; - .loc 16 104 0 - mov.u32 %r54, %r44; - mov.s32 %r55, 0; - mov.u32 %r56, %r55; - mov.s32 %r57, 0; - mov.u32 %r58, %r57; - mov.s32 %r59, 0; - mov.u32 %r60, %r59; - tex.1d.v4.f32.s32 {%f96,%f97,%f98,%f99},[q_tex,{%r54,%r56,%r58,%r60}]; - mov.f32 %f100, %f96; - ld.param.f32 %f101, [__cudaparm_kernel_pair_qqrd2e]; - mul.ftz.f32 %f102, %f101, %f33; - mul.ftz.f32 %f103, %f102, %f100; - div.approx.ftz.f32 %f104, %f103, %f72; - mov.f32 %f105, %f104; - .loc 16 105 0 - mov.f32 %f106, 0f3f906ebb; // 1.12838 - mul.ftz.f32 %f107, %f74, %f106; - fma.rn.ftz.f32 %f108, %f79, %f107, %f94; - sub.ftz.f32 %f109, %f108, %f42; - mul.ftz.f32 %f110, %f104, %f109; - bra.uni $Lt_0_24578; -$Lt_0_24834: - .loc 16 107 0 - mov.f32 %f110, 0f00000000; // 0 -$Lt_0_24578: - .loc 16 111 0 - add.ftz.f32 %f111, %f110, %f70; - mul.ftz.f32 %f112, %f111, %f58; - fma.rn.ftz.f32 %f36, %f52, %f112, %f36; - .loc 16 112 0 - fma.rn.ftz.f32 %f35, %f51, %f112, %f35; - .loc 16 113 0 - fma.rn.ftz.f32 %f34, %f53, %f112, %f34; - ld.param.s32 %r61, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r62, 0; - setp.le.s32 %p7, %r61, %r62; - @%p7 bra $Lt_0_25602; - .loc 16 116 0 - mov.f32 %f113, %f105; - mov.f32 %f114, %f95; - sub.ftz.f32 %f115, %f114, %f42; - fma.rn.ftz.f32 %f116, %f113, %f115, %f37; - selp.f32 %f37, %f116, %f37, %p6; - @!%p5 bra $Lt_0_25602; - .loc 16 120 0 - ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3]; - add.u64 %rd37, %rd36, %rd34; - ld.global.v4.f32 {%f117,%f118,%f119,_}, [%rd37+0]; - mov.f32 %f120, %f64; - mov.f32 %f121, %f62; - mul.ftz.f32 %f122, %f117, %f121; - sub.ftz.f32 %f123, %f122, %f118; - mul.ftz.f32 %f124, %f120, %f123; - sub.ftz.f32 %f125, %f124, %f119; - fma.rn.ftz.f32 %f38, %f39, %f125, %f38; -$Lt_0_25602: -$Lt_0_25090: - ld.param.s32 %r63, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r64, 0; - setp.le.s32 %p8, %r63, %r64; - @%p8 bra $Lt_0_26114; - .loc 16 124 0 - mov.f32 %f126, %f10; - mul.ftz.f32 %f127, %f52, %f52; - fma.rn.ftz.f32 %f128, %f112, %f127, %f126; - mov.f32 %f10, %f128; - .loc 16 125 0 - mov.f32 %f129, %f12; - fma.rn.ftz.f32 %f130, %f112, %f54, %f129; - mov.f32 %f12, %f130; - .loc 16 126 0 - mov.f32 %f131, %f14; - mul.ftz.f32 %f132, %f53, %f53; - fma.rn.ftz.f32 %f133, %f112, %f132, %f131; - mov.f32 %f14, %f133; - .loc 16 127 0 - mov.f32 %f134, %f16; - mul.ftz.f32 %f135, %f51, %f52; - fma.rn.ftz.f32 %f136, %f112, %f135, %f134; - mov.f32 %f16, %f136; - .loc 16 128 0 - mov.f32 %f137, %f18; - mul.ftz.f32 %f138, %f52, %f53; - fma.rn.ftz.f32 %f139, %f112, %f138, %f137; - mov.f32 %f18, %f139; - .loc 16 129 0 - mul.ftz.f32 %f140, %f51, %f53; - fma.rn.ftz.f32 %f19, %f112, %f140, %f19; - mov.f32 %f20, %f19; -$Lt_0_26114: -$Lt_0_23554: - .loc 16 68 0 - mul.lo.u64 %rd38, %rd27, 4; - add.u64 %rd20, %rd20, %rd38; - setp.lt.u64 %p9, %rd20, %rd19; - @%p9 bra $Lt_0_23298; - bra.uni $Lt_0_22786; -$Lt_0_32514: - mov.f32 %f34, 0f00000000; // 0 - mov.f32 %f35, 0f00000000; // 0 - mov.f32 %f36, 0f00000000; // 0 - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 -$Lt_0_22786: - mov.u32 %r65, 1; - setp.le.s32 %p10, %r1, %r65; - @%p10 bra $Lt_0_28930; - .loc 16 134 0 - mov.u64 %rd39, __cuda___cuda_local_var_32635_55_non_const_red_acc144; - cvt.s64.s32 %rd40, %r2; - mul.wide.s32 %rd41, %r2, 4; - add.u64 %rd42, %rd39, %rd41; - mov.f32 %f141, %f36; - st.shared.f32 [%rd42+0], %f141; - mov.f32 %f142, %f35; - st.shared.f32 [%rd42+512], %f142; - mov.f32 %f143, %f34; - st.shared.f32 [%rd42+1024], %f143; - mov.f32 %f144, %f38; - st.shared.f32 [%rd42+1536], %f144; - mov.f32 %f145, %f37; - st.shared.f32 [%rd42+2048], %f145; - shr.s32 %r66, %r1, 31; - mov.s32 %r67, 1; - and.b32 %r68, %r66, %r67; - add.s32 %r69, %r68, %r1; - shr.s32 %r70, %r69, 1; - mov.s32 %r71, %r70; - mov.u32 %r72, 0; - setp.ne.u32 %p11, %r70, %r72; - @!%p11 bra $Lt_0_27394; -$Lt_0_27906: - setp.ge.u32 %p12, %r14, %r71; - @%p12 bra $Lt_0_28162; - add.u32 %r73, %r2, %r71; - cvt.u64.u32 %rd43, %r73; - mul.wide.u32 %rd44, %r73, 4; - add.u64 %rd45, %rd39, %rd44; - ld.shared.f32 %f146, [%rd45+0]; - add.ftz.f32 %f141, %f146, %f141; - st.shared.f32 [%rd42+0], %f141; - ld.shared.f32 %f147, [%rd45+512]; - add.ftz.f32 %f142, %f147, %f142; - st.shared.f32 [%rd42+512], %f142; - ld.shared.f32 %f148, [%rd45+1024]; - add.ftz.f32 %f143, %f148, %f143; - st.shared.f32 [%rd42+1024], %f143; - ld.shared.f32 %f149, [%rd45+1536]; - add.ftz.f32 %f144, %f149, %f144; - st.shared.f32 [%rd42+1536], %f144; - ld.shared.f32 %f150, [%rd45+2048]; - add.ftz.f32 %f145, %f150, %f145; - st.shared.f32 [%rd42+2048], %f145; -$Lt_0_28162: - shr.u32 %r71, %r71, 1; - mov.u32 %r74, 0; - setp.ne.u32 %p13, %r71, %r74; - @%p13 bra $Lt_0_27906; -$Lt_0_27394: - mov.f32 %f36, %f141; - mov.f32 %f35, %f142; - mov.f32 %f34, %f143; - mov.f32 %f38, %f144; - mov.f32 %f37, %f145; - ld.param.s32 %r75, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r76, 0; - setp.le.s32 %p14, %r75, %r76; - @%p14 bra $Lt_0_28930; - mov.f32 %f141, %f10; - st.shared.f32 [%rd42+0], %f141; - mov.f32 %f142, %f12; - st.shared.f32 [%rd42+512], %f142; - mov.f32 %f143, %f14; - st.shared.f32 [%rd42+1024], %f143; - mov.f32 %f144, %f16; - st.shared.f32 [%rd42+1536], %f144; - mov.f32 %f145, %f18; - st.shared.f32 [%rd42+2048], %f145; - mov.f32 %f151, %f19; - st.shared.f32 [%rd42+2560], %f151; - mov.s32 %r77, %r70; - @!%p11 bra $Lt_0_29442; -$Lt_0_29954: - setp.ge.u32 %p15, %r14, %r77; - @%p15 bra $Lt_0_30210; - add.u32 %r78, %r2, %r77; - cvt.u64.u32 %rd46, %r78; - mul.wide.u32 %rd47, %r78, 4; - add.u64 %rd48, %rd39, %rd47; - ld.shared.f32 %f152, [%rd48+0]; - add.ftz.f32 %f141, %f152, %f141; - st.shared.f32 [%rd42+0], %f141; - ld.shared.f32 %f153, [%rd48+512]; - add.ftz.f32 %f142, %f153, %f142; - st.shared.f32 [%rd42+512], %f142; - ld.shared.f32 %f154, [%rd48+1024]; - add.ftz.f32 %f143, %f154, %f143; - st.shared.f32 [%rd42+1024], %f143; - ld.shared.f32 %f155, [%rd48+1536]; - add.ftz.f32 %f144, %f155, %f144; - st.shared.f32 [%rd42+1536], %f144; - ld.shared.f32 %f156, [%rd48+2048]; - add.ftz.f32 %f145, %f156, %f145; - st.shared.f32 [%rd42+2048], %f145; - ld.shared.f32 %f157, [%rd48+2560]; - add.ftz.f32 %f151, %f157, %f151; - st.shared.f32 [%rd42+2560], %f151; -$Lt_0_30210: - shr.u32 %r77, %r77, 1; - mov.u32 %r79, 0; - setp.ne.u32 %p16, %r77, %r79; - @%p16 bra $Lt_0_29954; -$Lt_0_29442: - mov.f32 %f10, %f141; - mov.f32 %f12, %f142; - mov.f32 %f14, %f143; - mov.f32 %f16, %f144; - mov.f32 %f18, %f145; - mov.f32 %f20, %f151; -$Lt_0_28930: -$Lt_0_26882: - mov.u32 %r80, 0; - setp.ne.s32 %p17, %r14, %r80; - @%p17 bra $Lt_0_30978; - ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv]; - add.u64 %rd50, %rd49, %rd3; - ld.param.s32 %r81, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r82, 0; - setp.le.s32 %p18, %r81, %r82; - @%p18 bra $Lt_0_31490; - st.global.f32 [%rd50+0], %f38; - cvt.s64.s32 %rd51, %r9; - mul.wide.s32 %rd52, %r9, 4; - add.u64 %rd53, %rd52, %rd50; - st.global.f32 [%rd53+0], %f37; - add.u64 %rd50, %rd52, %rd53; -$Lt_0_31490: - ld.param.s32 %r83, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r84, 0; - setp.le.s32 %p19, %r83, %r84; - @%p19 bra $Lt_0_32002; - mov.f32 %f158, %f10; - st.global.f32 [%rd50+0], %f158; - cvt.s64.s32 %rd54, %r9; - mul.wide.s32 %rd55, %r9, 4; - add.u64 %rd56, %rd55, %rd50; - mov.f32 %f159, %f12; - st.global.f32 [%rd56+0], %f159; - add.u64 %rd57, %rd55, %rd56; - mov.f32 %f160, %f14; - st.global.f32 [%rd57+0], %f160; - add.u64 %rd58, %rd55, %rd57; - mov.f32 %f161, %f16; - st.global.f32 [%rd58+0], %f161; - add.u64 %rd50, %rd55, %rd58; - mov.f32 %f162, %f18; - st.global.f32 [%rd50+0], %f162; - mov.f32 %f163, %f20; - add.u64 %rd59, %rd55, %rd50; - st.global.f32 [%rd59+0], %f163; -$Lt_0_32002: - ld.param.u64 %rd60, [__cudaparm_kernel_pair_ans]; - mul.lo.u64 %rd61, %rd2, 16; - add.u64 %rd62, %rd60, %rd61; - mov.f32 %f164, %f165; - st.global.v4.f32 [%rd62+0], {%f36,%f35,%f34,%f164}; -$Lt_0_30978: -$Lt_0_21762: - .loc 16 137 0 - exit; -$LDWend_kernel_pair: - } // kernel_pair - - .entry kernel_pair_fast ( - .param .u64 __cudaparm_kernel_pair_fast_x_, - .param .u64 __cudaparm_kernel_pair_fast_lj1_in, - .param .u64 __cudaparm_kernel_pair_fast_lj3_in, - .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, - .param .u64 __cudaparm_kernel_pair_fast_dev_packed, - .param .u64 __cudaparm_kernel_pair_fast_ans, - .param .u64 __cudaparm_kernel_pair_fast___val_paramengv, - .param .s32 __cudaparm_kernel_pair_fast_eflag, - .param .s32 __cudaparm_kernel_pair_fast_vflag, - .param .s32 __cudaparm_kernel_pair_fast_inum, - .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, - .param .u64 __cudaparm_kernel_pair_fast_q_, - .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq, - .param .f32 __cudaparm_kernel_pair_fast_qqrd2e, - .param .f32 __cudaparm_kernel_pair_fast_g_ewald, - .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) - { - .reg .u32 %r<88>; - .reg .u64 %rd<76>; - .reg .f32 %f<170>; - .reg .pred %p<24>; - .shared .align 4 .b8 __cuda___cuda_local_var_32654_33_non_const_sp_lj3320[32]; - .shared .align 16 .b8 __cuda___cuda_local_var_32652_34_non_const_lj13360[1936]; - .shared .align 16 .b8 __cuda___cuda_local_var_32653_34_non_const_lj35296[1936]; - .shared .align 4 .b8 __cuda___cuda_local_var_32749_55_non_const_red_acc7232[3072]; - // __cuda_local_var_32665_10_non_const_f = 64 - // __cuda_local_var_32667_9_non_const_virial = 16 - // __cuda_local_var_32703_43_non_const_r6inv = 44 - // __cuda_local_var_32703_50_non_const_r3inv = 40 - // __cuda_local_var_32703_57_non_const_prefactor = 52 - // __cuda_local_var_32703_68_non_const__erfc = 48 - .loc 16 147 0 -$LDWbegin_kernel_pair_fast: - cvt.s32.u32 %r1, %tid.x; - mov.u32 %r2, 7; - setp.gt.s32 %p1, %r1, %r2; - @%p1 bra $Lt_1_24066; - .loc 16 155 0 - mov.u64 %rd1, __cuda___cuda_local_var_32654_33_non_const_sp_lj3320; - cvt.s64.s32 %rd2, %r1; - mul.wide.s32 %rd3, %r1, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f32 %f1, [%rd5+0]; - add.u64 %rd6, %rd3, %rd1; - st.shared.f32 [%rd6+0], %f1; -$Lt_1_24066: - mov.u64 %rd1, __cuda___cuda_local_var_32654_33_non_const_sp_lj3320; - mov.u32 %r3, 120; - setp.gt.s32 %p2, %r1, %r3; - @%p2 bra $Lt_1_24578; - .loc 16 157 0 - mov.u64 %rd7, __cuda___cuda_local_var_32652_34_non_const_lj13360; - cvt.s64.s32 %rd8, %r1; - mul.wide.s32 %rd9, %r1, 16; - ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in]; - add.u64 %rd11, %rd10, %rd9; - add.u64 %rd12, %rd9, %rd7; - ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0]; - st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5}; - ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r5, 0; - setp.le.s32 %p3, %r4, %r5; - @%p3 bra $Lt_1_25090; - .loc 16 159 0 - mov.u64 %rd13, __cuda___cuda_local_var_32653_34_non_const_lj35296; - ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in]; - add.u64 %rd15, %rd14, %rd9; - add.u64 %rd16, %rd9, %rd13; - ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0]; - st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9}; -$Lt_1_25090: - mov.u64 %rd13, __cuda___cuda_local_var_32653_34_non_const_lj35296; -$Lt_1_24578: - mov.u64 %rd13, __cuda___cuda_local_var_32653_34_non_const_lj35296; - mov.u64 %rd7, __cuda___cuda_local_var_32652_34_non_const_lj13360; - .loc 16 168 0 - mov.f32 %f10, 0f00000000; // 0 - mov.f32 %f11, %f10; - mov.f32 %f12, 0f00000000; // 0 - mov.f32 %f13, %f12; - mov.f32 %f14, 0f00000000; // 0 - mov.f32 %f15, %f14; - mov.f32 %f16, 0f00000000; // 0 - mov.f32 %f17, %f16; - mov.f32 %f18, 0f00000000; // 0 - mov.f32 %f19, %f18; - mov.f32 %f20, 0f00000000; // 0 - mov.f32 %f21, %f20; - .loc 16 170 0 - bar.sync 0; - ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom]; - div.s32 %r7, %r1, %r6; - cvt.s32.u32 %r8, %ntid.x; - div.s32 %r9, %r8, %r6; - cvt.s32.u32 %r10, %ctaid.x; - mul.lo.s32 %r11, %r10, %r9; - add.s32 %r12, %r7, %r11; - ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum]; - setp.ge.s32 %p4, %r12, %r13; - @%p4 bra $Lt_1_34818; - .loc 16 175 0 - cvt.s64.s32 %rd17, %r12; - mul.wide.s32 %rd18, %r12, 4; - ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor]; - add.u64 %rd20, %rd18, %rd19; - ld.global.s32 %r14, [%rd20+0]; - ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch]; - cvt.s64.s32 %rd21, %r15; - mul.wide.s32 %rd22, %r15, 4; - add.u64 %rd23, %rd22, %rd20; - ld.global.s32 %r16, [%rd23+0]; - sub.s32 %r17, %r6, 1; - and.b32 %r18, %r17, %r1; - cvt.s64.s32 %rd24, %r18; - mul.wide.s32 %rd25, %r18, 4; - ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed]; - setp.ne.u64 %p5, %rd26, %rd19; - @%p5 bra $Lt_1_26370; - cvt.s32.s64 %r19, %rd21; - mul.lo.s32 %r20, %r19, %r6; - mov.s32 %r21, %r20; - mul.lo.s32 %r22, %r17, %r12; - add.s32 %r23, %r19, %r22; - cvt.s64.s32 %rd27, %r23; - mul.wide.s32 %rd28, %r23, 4; - add.u64 %rd29, %rd23, %rd28; - and.b32 %r24, %r17, %r16; - cvt.s64.s32 %rd30, %r24; - div.s32 %r25, %r16, %r6; - mul.lo.s32 %r26, %r20, %r25; - cvt.s64.s32 %rd31, %r26; - add.u64 %rd32, %rd30, %rd31; - mul.lo.u64 %rd33, %rd32, 4; - add.u64 %rd34, %rd29, %rd33; - add.u64 %rd35, %rd25, %rd29; - bra.uni $Lt_1_26114; -$Lt_1_26370: - add.u64 %rd36, %rd22, %rd23; - ld.global.s32 %r27, [%rd36+0]; - cvt.s64.s32 %rd37, %r27; - mul.wide.s32 %rd38, %r27, 4; - add.u64 %rd39, %rd26, %rd38; - cvt.s64.s32 %rd40, %r16; - mul.wide.s32 %rd41, %r16, 4; - add.u64 %rd34, %rd39, %rd41; - mov.s32 %r21, %r6; - add.u64 %rd35, %rd25, %rd39; -$Lt_1_26114: - .loc 16 178 0 - mov.u32 %r28, %r14; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - mov.s32 %r31, 0; - mov.u32 %r32, %r31; - mov.s32 %r33, 0; - mov.u32 %r34, %r33; - tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}]; - mov.f32 %f26, %f22; - mov.f32 %f27, %f23; - mov.f32 %f28, %f24; - mov.f32 %f29, %f25; - .loc 16 179 0 - mov.u32 %r35, %r14; - mov.s32 %r36, 0; - mov.u32 %r37, %r36; - mov.s32 %r38, 0; - mov.u32 %r39, %r38; - mov.s32 %r40, 0; - mov.u32 %r41, %r40; - tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[q_tex,{%r35,%r37,%r39,%r41}]; - mov.f32 %f34, %f30; - setp.ge.u64 %p6, %rd35, %rd34; - @%p6 bra $Lt_1_36354; - cvt.rzi.ftz.s32.f32 %r42, %f29; - cvt.s64.s32 %rd42, %r21; - mul.lo.s32 %r43, %r42, 11; - cvt.rn.f32.s32 %f35, %r43; - mov.f32 %f36, 0f00000000; // 0 - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 - mov.f32 %f39, 0f00000000; // 0 - mov.f32 %f40, 0f00000000; // 0 -$Lt_1_27138: - // Loop body line 179, nesting depth: 1, estimated iterations: unknown - .loc 16 184 0 - ld.global.s32 %r44, [%rd35+0]; - .loc 16 187 0 - shr.s32 %r45, %r44, 30; - and.b32 %r46, %r45, 3; - cvt.s64.s32 %rd43, %r46; - mul.wide.s32 %rd44, %r46, 4; - add.u64 %rd45, %rd1, %rd44; - ld.shared.f32 %f41, [%rd45+0]; - .loc 16 188 0 - mov.f32 %f42, 0f3f800000; // 1 - ld.shared.f32 %f43, [%rd45+16]; - sub.ftz.f32 %f44, %f42, %f43; - .loc 16 191 0 - and.b32 %r47, %r44, 1073741823; - mov.u32 %r48, %r47; - mov.s32 %r49, 0; - mov.u32 %r50, %r49; - mov.s32 %r51, 0; - mov.u32 %r52, %r51; - mov.s32 %r53, 0; - mov.u32 %r54, %r53; - tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[pos_tex,{%r48,%r50,%r52,%r54}]; - mov.f32 %f49, %f45; - mov.f32 %f50, %f46; - mov.f32 %f51, %f47; - mov.f32 %f52, %f48; - sub.ftz.f32 %f53, %f27, %f50; - sub.ftz.f32 %f54, %f26, %f49; - sub.ftz.f32 %f55, %f28, %f51; - mul.ftz.f32 %f56, %f53, %f53; - fma.rn.ftz.f32 %f57, %f54, %f54, %f56; - fma.rn.ftz.f32 %f58, %f55, %f55, %f57; - add.ftz.f32 %f59, %f35, %f52; - cvt.rzi.ftz.s32.f32 %r55, %f59; - cvt.s64.s32 %rd46, %r55; - mul.wide.s32 %rd47, %r55, 16; - add.u64 %rd48, %rd47, %rd7; - ld.shared.f32 %f60, [%rd48+8]; - setp.gt.ftz.f32 %p7, %f60, %f58; - @!%p7 bra $Lt_1_29954; - rcp.approx.ftz.f32 %f61, %f58; - ld.shared.f32 %f62, [%rd48+12]; - setp.lt.ftz.f32 %p8, %f58, %f62; - @!%p8 bra $Lt_1_28162; - .loc 16 206 0 - rsqrt.approx.ftz.f32 %f63, %f58; - mul.ftz.f32 %f64, %f61, %f63; - mov.f32 %f65, %f64; - .loc 16 207 0 - mul.ftz.f32 %f66, %f64, %f64; - mov.f32 %f67, %f66; - .loc 16 208 0 - mul.ftz.f32 %f68, %f66, %f41; - ld.shared.v2.f32 {%f69,%f70}, [%rd48+0]; - mul.ftz.f32 %f71, %f69, %f64; - sub.ftz.f32 %f72, %f71, %f70; - mul.ftz.f32 %f73, %f68, %f72; - bra.uni $Lt_1_27906; -$Lt_1_28162: - .loc 16 210 0 - mov.f32 %f73, 0f00000000; // 0 -$Lt_1_27906: - ld.param.f32 %f74, [__cudaparm_kernel_pair_fast_cut_coulsq]; - setp.gt.ftz.f32 %p9, %f74, %f58; - @!%p9 bra $Lt_1_28674; - .loc 20 518 0 - rsqrt.approx.ftz.f32 %f75, %f61; - ld.param.f32 %f76, [__cudaparm_kernel_pair_fast_g_ewald]; - mul.ftz.f32 %f77, %f76, %f75; - mul.ftz.f32 %f78, %f77, %f77; - neg.ftz.f32 %f79, %f78; - mov.f32 %f80, 0f3fb8aa3b; // 1.4427 - mul.ftz.f32 %f81, %f79, %f80; - ex2.approx.ftz.f32 %f82, %f81; - .loc 16 217 0 - mov.f32 %f83, 0f3f800000; // 1 - mov.f32 %f84, 0f3ea7ba05; // 0.327591 - fma.rn.ftz.f32 %f85, %f84, %f77, %f83; - rcp.approx.ftz.f32 %f86, %f85; - mov.f32 %f87, 0f3e827906; // 0.25483 - mov.f32 %f88, 0fbe91a98e; // -0.284497 - mov.f32 %f89, 0f3fb5f0e3; // 1.42141 - mov.f32 %f90, 0fbfba00e3; // -1.45315 - mov.f32 %f91, 0f3f87dc22; // 1.06141 - fma.rn.ftz.f32 %f92, %f91, %f86, %f90; - fma.rn.ftz.f32 %f93, %f86, %f92, %f89; - fma.rn.ftz.f32 %f94, %f86, %f93, %f88; - fma.rn.ftz.f32 %f95, %f86, %f94, %f87; - mul.ftz.f32 %f96, %f86, %f95; - mul.ftz.f32 %f97, %f82, %f96; - mov.f32 %f98, %f97; - .loc 16 218 0 - mov.u32 %r56, %r47; - mov.s32 %r57, 0; - mov.u32 %r58, %r57; - mov.s32 %r59, 0; - mov.u32 %r60, %r59; - mov.s32 %r61, 0; - mov.u32 %r62, %r61; - tex.1d.v4.f32.s32 {%f99,%f100,%f101,%f102},[q_tex,{%r56,%r58,%r60,%r62}]; - mov.f32 %f103, %f99; - ld.param.f32 %f104, [__cudaparm_kernel_pair_fast_qqrd2e]; - mul.ftz.f32 %f105, %f104, %f34; - mul.ftz.f32 %f106, %f105, %f103; - div.approx.ftz.f32 %f107, %f106, %f75; - mov.f32 %f108, %f107; - .loc 16 219 0 - mov.f32 %f109, 0f3f906ebb; // 1.12838 - mul.ftz.f32 %f110, %f77, %f109; - fma.rn.ftz.f32 %f111, %f82, %f110, %f97; - sub.ftz.f32 %f112, %f111, %f44; - mul.ftz.f32 %f113, %f107, %f112; - bra.uni $Lt_1_28418; -$Lt_1_28674: - .loc 16 221 0 - mov.f32 %f113, 0f00000000; // 0 -$Lt_1_28418: - .loc 16 225 0 - add.ftz.f32 %f114, %f113, %f73; - mul.ftz.f32 %f115, %f114, %f61; - fma.rn.ftz.f32 %f38, %f54, %f115, %f38; - .loc 16 226 0 - fma.rn.ftz.f32 %f37, %f53, %f115, %f37; - .loc 16 227 0 - fma.rn.ftz.f32 %f36, %f55, %f115, %f36; - ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r64, 0; - setp.le.s32 %p10, %r63, %r64; - @%p10 bra $Lt_1_29442; - .loc 16 230 0 - mov.f32 %f116, %f108; - mov.f32 %f117, %f98; - sub.ftz.f32 %f118, %f117, %f44; - fma.rn.ftz.f32 %f119, %f116, %f118, %f39; - selp.f32 %f39, %f119, %f39, %p9; - @!%p8 bra $Lt_1_29442; - .loc 16 233 0 - add.u64 %rd49, %rd47, %rd13; - mov.f32 %f120, %f67; - ld.shared.v4.f32 {%f121,%f122,%f123,_}, [%rd49+0]; - mov.f32 %f124, %f65; - mul.ftz.f32 %f125, %f121, %f124; - sub.ftz.f32 %f126, %f125, %f122; - mul.ftz.f32 %f127, %f120, %f126; - .loc 16 234 0 - sub.ftz.f32 %f128, %f127, %f123; - fma.rn.ftz.f32 %f40, %f41, %f128, %f40; -$Lt_1_29442: -$Lt_1_28930: - ld.param.s32 %r65, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r66, 0; - setp.le.s32 %p11, %r65, %r66; - @%p11 bra $Lt_1_29954; - .loc 16 238 0 - mov.f32 %f129, %f11; - mul.ftz.f32 %f130, %f54, %f54; - fma.rn.ftz.f32 %f131, %f115, %f130, %f129; - mov.f32 %f11, %f131; - .loc 16 239 0 - mov.f32 %f132, %f13; - fma.rn.ftz.f32 %f133, %f115, %f56, %f132; - mov.f32 %f13, %f133; - .loc 16 240 0 - mov.f32 %f134, %f15; - mul.ftz.f32 %f135, %f55, %f55; - fma.rn.ftz.f32 %f136, %f115, %f135, %f134; - mov.f32 %f15, %f136; - .loc 16 241 0 - mov.f32 %f137, %f17; - mul.ftz.f32 %f138, %f53, %f54; - fma.rn.ftz.f32 %f139, %f115, %f138, %f137; - mov.f32 %f17, %f139; - .loc 16 242 0 - mov.f32 %f140, %f19; - mul.ftz.f32 %f141, %f54, %f55; - fma.rn.ftz.f32 %f142, %f115, %f141, %f140; - mov.f32 %f19, %f142; - .loc 16 243 0 - mul.ftz.f32 %f143, %f53, %f55; - fma.rn.ftz.f32 %f20, %f115, %f143, %f20; - mov.f32 %f21, %f20; -$Lt_1_29954: -$Lt_1_27394: - .loc 16 183 0 - mul.lo.u64 %rd50, %rd42, 4; - add.u64 %rd35, %rd35, %rd50; - setp.lt.u64 %p12, %rd35, %rd34; - @%p12 bra $Lt_1_27138; - bra.uni $Lt_1_26626; -$Lt_1_36354: - mov.f32 %f36, 0f00000000; // 0 - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 - mov.f32 %f39, 0f00000000; // 0 - mov.f32 %f40, 0f00000000; // 0 -$Lt_1_26626: - mov.u32 %r67, 1; - setp.le.s32 %p13, %r6, %r67; - @%p13 bra $Lt_1_32770; - .loc 16 248 0 - mov.u64 %rd51, __cuda___cuda_local_var_32749_55_non_const_red_acc7232; - cvt.s64.s32 %rd52, %r1; - mul.wide.s32 %rd53, %r1, 4; - add.u64 %rd54, %rd51, %rd53; - mov.f32 %f144, %f38; - st.shared.f32 [%rd54+0], %f144; - mov.f32 %f145, %f37; - st.shared.f32 [%rd54+512], %f145; - mov.f32 %f146, %f36; - st.shared.f32 [%rd54+1024], %f146; - mov.f32 %f147, %f40; - st.shared.f32 [%rd54+1536], %f147; - mov.f32 %f148, %f39; - st.shared.f32 [%rd54+2048], %f148; - shr.s32 %r68, %r6, 31; - mov.s32 %r69, 1; - and.b32 %r70, %r68, %r69; - add.s32 %r71, %r70, %r6; - shr.s32 %r72, %r71, 1; - mov.s32 %r73, %r72; - mov.u32 %r74, 0; - setp.ne.u32 %p14, %r72, %r74; - @!%p14 bra $Lt_1_31234; -$Lt_1_31746: - setp.ge.u32 %p15, %r18, %r73; - @%p15 bra $Lt_1_32002; - add.u32 %r75, %r1, %r73; - cvt.u64.u32 %rd55, %r75; - mul.wide.u32 %rd56, %r75, 4; - add.u64 %rd57, %rd51, %rd56; - ld.shared.f32 %f149, [%rd57+0]; - add.ftz.f32 %f144, %f149, %f144; - st.shared.f32 [%rd54+0], %f144; - ld.shared.f32 %f150, [%rd57+512]; - add.ftz.f32 %f145, %f150, %f145; - st.shared.f32 [%rd54+512], %f145; - ld.shared.f32 %f151, [%rd57+1024]; - add.ftz.f32 %f146, %f151, %f146; - st.shared.f32 [%rd54+1024], %f146; - ld.shared.f32 %f152, [%rd57+1536]; - add.ftz.f32 %f147, %f152, %f147; - st.shared.f32 [%rd54+1536], %f147; - ld.shared.f32 %f153, [%rd57+2048]; - add.ftz.f32 %f148, %f153, %f148; - st.shared.f32 [%rd54+2048], %f148; -$Lt_1_32002: - shr.u32 %r73, %r73, 1; - mov.u32 %r76, 0; - setp.ne.u32 %p16, %r73, %r76; - @%p16 bra $Lt_1_31746; -$Lt_1_31234: - mov.f32 %f38, %f144; - mov.f32 %f37, %f145; - mov.f32 %f36, %f146; - mov.f32 %f40, %f147; - mov.f32 %f39, %f148; - ld.param.s32 %r77, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r78, 0; - setp.le.s32 %p17, %r77, %r78; - @%p17 bra $Lt_1_32770; - mov.f32 %f144, %f11; - st.shared.f32 [%rd54+0], %f144; - mov.f32 %f145, %f13; - st.shared.f32 [%rd54+512], %f145; - mov.f32 %f146, %f15; - st.shared.f32 [%rd54+1024], %f146; - mov.f32 %f147, %f17; - st.shared.f32 [%rd54+1536], %f147; - mov.f32 %f148, %f19; - st.shared.f32 [%rd54+2048], %f148; - mov.f32 %f154, %f20; - st.shared.f32 [%rd54+2560], %f154; - mov.s32 %r79, %r72; - @!%p14 bra $Lt_1_33282; -$Lt_1_33794: - setp.ge.u32 %p18, %r18, %r79; - @%p18 bra $Lt_1_34050; - add.u32 %r80, %r1, %r79; - cvt.u64.u32 %rd58, %r80; - mul.wide.u32 %rd59, %r80, 4; - add.u64 %rd60, %rd51, %rd59; - ld.shared.f32 %f155, [%rd60+0]; - add.ftz.f32 %f144, %f155, %f144; - st.shared.f32 [%rd54+0], %f144; - ld.shared.f32 %f156, [%rd60+512]; - add.ftz.f32 %f145, %f156, %f145; - st.shared.f32 [%rd54+512], %f145; - ld.shared.f32 %f157, [%rd60+1024]; - add.ftz.f32 %f146, %f157, %f146; - st.shared.f32 [%rd54+1024], %f146; - ld.shared.f32 %f158, [%rd60+1536]; - add.ftz.f32 %f147, %f158, %f147; - st.shared.f32 [%rd54+1536], %f147; - ld.shared.f32 %f159, [%rd60+2048]; - add.ftz.f32 %f148, %f159, %f148; - st.shared.f32 [%rd54+2048], %f148; - ld.shared.f32 %f160, [%rd60+2560]; - add.ftz.f32 %f154, %f160, %f154; - st.shared.f32 [%rd54+2560], %f154; -$Lt_1_34050: - shr.u32 %r79, %r79, 1; - mov.u32 %r81, 0; - setp.ne.u32 %p19, %r79, %r81; - @%p19 bra $Lt_1_33794; -$Lt_1_33282: - mov.f32 %f11, %f144; - mov.f32 %f13, %f145; - mov.f32 %f15, %f146; - mov.f32 %f17, %f147; - mov.f32 %f19, %f148; - mov.f32 %f21, %f154; -$Lt_1_32770: -$Lt_1_30722: - mov.u32 %r82, 0; - setp.ne.s32 %p20, %r18, %r82; - @%p20 bra $Lt_1_34818; - ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv]; - add.u64 %rd62, %rd61, %rd18; - ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r84, 0; - setp.le.s32 %p21, %r83, %r84; - @%p21 bra $Lt_1_35330; - st.global.f32 [%rd62+0], %f40; - cvt.s64.s32 %rd63, %r13; - mul.wide.s32 %rd64, %r13, 4; - add.u64 %rd65, %rd64, %rd62; - st.global.f32 [%rd65+0], %f39; - add.u64 %rd62, %rd64, %rd65; -$Lt_1_35330: - ld.param.s32 %r85, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r86, 0; - setp.le.s32 %p22, %r85, %r86; - @%p22 bra $Lt_1_35842; - mov.f32 %f161, %f11; - st.global.f32 [%rd62+0], %f161; - cvt.s64.s32 %rd66, %r13; - mul.wide.s32 %rd67, %r13, 4; - add.u64 %rd68, %rd67, %rd62; - mov.f32 %f162, %f13; - st.global.f32 [%rd68+0], %f162; - add.u64 %rd69, %rd67, %rd68; - mov.f32 %f163, %f15; - st.global.f32 [%rd69+0], %f163; - add.u64 %rd70, %rd67, %rd69; - mov.f32 %f164, %f17; - st.global.f32 [%rd70+0], %f164; - add.u64 %rd62, %rd67, %rd70; - mov.f32 %f165, %f19; - st.global.f32 [%rd62+0], %f165; - mov.f32 %f166, %f21; - add.u64 %rd71, %rd67, %rd62; - st.global.f32 [%rd71+0], %f166; -$Lt_1_35842: - ld.param.u64 %rd72, [__cudaparm_kernel_pair_fast_ans]; - mul.lo.u64 %rd73, %rd17, 16; - add.u64 %rd74, %rd72, %rd73; - mov.f32 %f167, %f168; - st.global.v4.f32 [%rd74+0], {%f38,%f37,%f36,%f167}; -$Lt_1_34818: -$Lt_1_25602: - .loc 16 251 0 - exit; -$LDWend_kernel_pair_fast: - } // kernel_pair_fast - diff --git a/lib/gpu/lj_class2_long_ptx.h b/lib/gpu/lj_class2_long_ptx.h deleted file mode 100644 index 2876e0c3e3..0000000000 --- a/lib/gpu/lj_class2_long_ptx.h +++ /dev/null @@ -1,1073 +0,0 @@ -const char * lj_class2_long = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .global .texref pos_tex;\n" -" .global .texref q_tex;\n" -" .entry kernel_pair (\n" -" .param .u64 __cudaparm_kernel_pair_x_,\n" -" .param .u64 __cudaparm_kernel_pair_lj1,\n" -" .param .u64 __cudaparm_kernel_pair_lj3,\n" -" .param .s32 __cudaparm_kernel_pair_lj_types,\n" -" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_ans,\n" -" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_inum,\n" -" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" -" .param .u64 __cudaparm_kernel_pair_q_,\n" -" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n" -" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n" -" .param .f32 __cudaparm_kernel_pair_g_ewald,\n" -" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" -" {\n" -" .reg .u32 %r<86>;\n" -" .reg .u64 %rd<64>;\n" -" .reg .f32 %f<167>;\n" -" .reg .pred %p<21>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32541_33_non_const_sp_lj112[32];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32635_55_non_const_red_acc144[3072];\n" -" .loc 16 36 0\n" -"$LDWbegin_kernel_pair:\n" -" .loc 16 41 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" -" ldu.global.f32 %f1, [%rd1+0];\n" -" .loc 16 42 0\n" -" ld.global.f32 %f2, [%rd1+4];\n" -" .loc 16 43 0\n" -" ld.global.f32 %f3, [%rd1+8];\n" -" .loc 16 44 0\n" -" ld.global.f32 %f4, [%rd1+12];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_lj112+0], {%f1,%f2,%f3,%f4};\n" -" .loc 16 45 0\n" -" ld.global.f32 %f5, [%rd1+16];\n" -" .loc 16 46 0\n" -" ld.global.f32 %f6, [%rd1+20];\n" -" .loc 16 47 0\n" -" ld.global.f32 %f7, [%rd1+24];\n" -" .loc 16 48 0\n" -" ld.global.f32 %f8, [%rd1+28];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_lj112+16], {%f5,%f6,%f7,%f8};\n" -" .loc 16 56 0\n" -" mov.f32 %f9, 0f00000000; \n" -" mov.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f00000000; \n" -" mov.f32 %f12, %f11;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, %f13;\n" -" mov.f32 %f15, 0f00000000; \n" -" mov.f32 %f16, %f15;\n" -" mov.f32 %f17, 0f00000000; \n" -" mov.f32 %f18, %f17;\n" -" mov.f32 %f19, 0f00000000; \n" -" mov.f32 %f20, %f19;\n" -" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n" -" setp.ge.s32 %p1, %r8, %r9;\n" -" @%p1 bra $Lt_0_30978;\n" -" .loc 16 61 0\n" -" cvt.s64.s32 %rd2, %r8;\n" -" mul.wide.s32 %rd3, %r8, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n" -" add.u64 %rd5, %rd3, %rd4;\n" -" ld.global.s32 %r10, [%rd5+0];\n" -" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n" -" cvt.s64.s32 %rd6, %r11;\n" -" mul.wide.s32 %rd7, %r11, 4;\n" -" add.u64 %rd8, %rd7, %rd5;\n" -" ld.global.s32 %r12, [%rd8+0];\n" -" sub.s32 %r13, %r1, 1;\n" -" and.b32 %r14, %r13, %r2;\n" -" cvt.s64.s32 %rd9, %r14;\n" -" mul.wide.s32 %rd10, %r14, 4;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n" -" setp.ne.u64 %p2, %rd11, %rd4;\n" -" @%p2 bra $Lt_0_22530;\n" -" cvt.s32.s64 %r15, %rd6;\n" -" mul.lo.s32 %r16, %r15, %r1;\n" -" mov.s32 %r17, %r16;\n" -" mul.lo.s32 %r18, %r13, %r8;\n" -" add.s32 %r19, %r15, %r18;\n" -" cvt.s64.s32 %rd12, %r19;\n" -" mul.wide.s32 %rd13, %r19, 4;\n" -" add.u64 %rd14, %rd8, %rd13;\n" -" and.b32 %r20, %r13, %r12;\n" -" cvt.s64.s32 %rd15, %r20;\n" -" div.s32 %r21, %r12, %r1;\n" -" mul.lo.s32 %r22, %r16, %r21;\n" -" cvt.s64.s32 %rd16, %r22;\n" -" add.u64 %rd17, %rd15, %rd16;\n" -" mul.lo.u64 %rd18, %rd17, 4;\n" -" add.u64 %rd19, %rd14, %rd18;\n" -" add.u64 %rd20, %rd10, %rd14;\n" -" bra.uni $Lt_0_22274;\n" -"$Lt_0_22530:\n" -" add.u64 %rd21, %rd7, %rd8;\n" -" ld.global.s32 %r23, [%rd21+0];\n" -" cvt.s64.s32 %rd22, %r23;\n" -" mul.wide.s32 %rd23, %r23, 4;\n" -" add.u64 %rd24, %rd11, %rd23;\n" -" cvt.s64.s32 %rd25, %r12;\n" -" mul.wide.s32 %rd26, %r12, 4;\n" -" add.u64 %rd19, %rd24, %rd26;\n" -" mov.s32 %r17, %r1;\n" -" add.u64 %rd20, %rd10, %rd24;\n" -"$Lt_0_22274:\n" -" .loc 16 64 0\n" -" mov.u32 %r24, %r10;\n" -" mov.s32 %r25, 0;\n" -" mov.u32 %r26, %r25;\n" -" mov.s32 %r27, 0;\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r24,%r26,%r28,%r30}];\n" -" mov.f32 %f25, %f21;\n" -" mov.f32 %f26, %f22;\n" -" mov.f32 %f27, %f23;\n" -" mov.f32 %f28, %f24;\n" -" .loc 16 65 0\n" -" mov.u32 %r31, %r10;\n" -" mov.s32 %r32, 0;\n" -" mov.u32 %r33, %r32;\n" -" mov.s32 %r34, 0;\n" -" mov.u32 %r35, %r34;\n" -" mov.s32 %r36, 0;\n" -" mov.u32 %r37, %r36;\n" -" tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r31,%r33,%r35,%r37}];\n" -" mov.f32 %f33, %f29;\n" -" setp.ge.u64 %p3, %rd20, %rd19;\n" -" @%p3 bra $Lt_0_32514;\n" -" cvt.rzi.ftz.s32.f32 %r38, %f28;\n" -" cvt.s64.s32 %rd27, %r17;\n" -" ld.param.s32 %r39, [__cudaparm_kernel_pair_lj_types];\n" -" mul.lo.s32 %r40, %r39, %r38;\n" -" ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];\n" -" mov.f32 %f34, 0f00000000; \n" -" mov.f32 %f35, 0f00000000; \n" -" mov.f32 %f36, 0f00000000; \n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -" mov.u64 %rd29, __cuda___cuda_local_var_32541_33_non_const_sp_lj112;\n" -"$Lt_0_23298:\n" -" .loc 16 69 0\n" -" ld.global.s32 %r41, [%rd20+0];\n" -" .loc 16 72 0\n" -" shr.s32 %r42, %r41, 30;\n" -" and.b32 %r43, %r42, 3;\n" -" cvt.s64.s32 %rd30, %r43;\n" -" mul.wide.s32 %rd31, %r43, 4;\n" -" add.u64 %rd32, %rd29, %rd31;\n" -" ld.shared.f32 %f39, [%rd32+0];\n" -" .loc 16 73 0\n" -" mov.f32 %f40, 0f3f800000; \n" -" ld.shared.f32 %f41, [%rd32+16];\n" -" sub.ftz.f32 %f42, %f40, %f41;\n" -" .loc 16 76 0\n" -" and.b32 %r44, %r41, 1073741823;\n" -" mov.u32 %r45, %r44;\n" -" mov.s32 %r46, 0;\n" -" mov.u32 %r47, %r46;\n" -" mov.s32 %r48, 0;\n" -" mov.u32 %r49, %r48;\n" -" mov.s32 %r50, 0;\n" -" mov.u32 %r51, %r50;\n" -" tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r45,%r47,%r49,%r51}];\n" -" mov.f32 %f47, %f43;\n" -" mov.f32 %f48, %f44;\n" -" mov.f32 %f49, %f45;\n" -" mov.f32 %f50, %f46;\n" -" cvt.rzi.ftz.s32.f32 %r52, %f50;\n" -" sub.ftz.f32 %f51, %f26, %f48;\n" -" sub.ftz.f32 %f52, %f25, %f47;\n" -" sub.ftz.f32 %f53, %f27, %f49;\n" -" mul.ftz.f32 %f54, %f51, %f51;\n" -" fma.rn.ftz.f32 %f55, %f52, %f52, %f54;\n" -" fma.rn.ftz.f32 %f56, %f53, %f53, %f55;\n" -" add.s32 %r53, %r52, %r40;\n" -" cvt.s64.s32 %rd33, %r53;\n" -" mul.wide.s32 %rd34, %r53, 16;\n" -" add.u64 %rd35, %rd34, %rd28;\n" -" ld.global.f32 %f57, [%rd35+8];\n" -" setp.gt.ftz.f32 %p4, %f57, %f56;\n" -" @!%p4 bra $Lt_0_26114;\n" -" rcp.approx.ftz.f32 %f58, %f56;\n" -" ld.global.f32 %f59, [%rd35+12];\n" -" setp.lt.ftz.f32 %p5, %f56, %f59;\n" -" @!%p5 bra $Lt_0_24322;\n" -" .loc 16 92 0\n" -" rsqrt.approx.ftz.f32 %f60, %f56;\n" -" mul.ftz.f32 %f61, %f58, %f60;\n" -" mov.f32 %f62, %f61;\n" -" .loc 16 93 0\n" -" mul.ftz.f32 %f63, %f61, %f61;\n" -" mov.f32 %f64, %f63;\n" -" .loc 16 94 0\n" -" mul.ftz.f32 %f65, %f63, %f39;\n" -" ld.global.v2.f32 {%f66,%f67}, [%rd35+0];\n" -" mul.ftz.f32 %f68, %f66, %f61;\n" -" sub.ftz.f32 %f69, %f68, %f67;\n" -" mul.ftz.f32 %f70, %f65, %f69;\n" -" bra.uni $Lt_0_24066;\n" -"$Lt_0_24322:\n" -" .loc 16 96 0\n" -" mov.f32 %f70, 0f00000000; \n" -"$Lt_0_24066:\n" -" ld.param.f32 %f71, [__cudaparm_kernel_pair_cut_coulsq];\n" -" setp.gt.ftz.f32 %p6, %f71, %f56;\n" -" @!%p6 bra $Lt_0_24834;\n" -" .loc 20 518 0\n" -" rsqrt.approx.ftz.f32 %f72, %f58;\n" -" ld.param.f32 %f73, [__cudaparm_kernel_pair_g_ewald];\n" -" mul.ftz.f32 %f74, %f73, %f72;\n" -" mul.ftz.f32 %f75, %f74, %f74;\n" -" neg.ftz.f32 %f76, %f75;\n" -" mov.f32 %f77, 0f3fb8aa3b; \n" -" mul.ftz.f32 %f78, %f76, %f77;\n" -" ex2.approx.ftz.f32 %f79, %f78;\n" -" .loc 16 103 0\n" -" mov.f32 %f80, 0f3f800000; \n" -" mov.f32 %f81, 0f3ea7ba05; \n" -" fma.rn.ftz.f32 %f82, %f81, %f74, %f80;\n" -" rcp.approx.ftz.f32 %f83, %f82;\n" -" mov.f32 %f84, 0f3e827906; \n" -" mov.f32 %f85, 0fbe91a98e; \n" -" mov.f32 %f86, 0f3fb5f0e3; \n" -" mov.f32 %f87, 0fbfba00e3; \n" -" mov.f32 %f88, 0f3f87dc22; \n" -" fma.rn.ftz.f32 %f89, %f88, %f83, %f87;\n" -" fma.rn.ftz.f32 %f90, %f83, %f89, %f86;\n" -" fma.rn.ftz.f32 %f91, %f83, %f90, %f85;\n" -" fma.rn.ftz.f32 %f92, %f83, %f91, %f84;\n" -" mul.ftz.f32 %f93, %f83, %f92;\n" -" mul.ftz.f32 %f94, %f79, %f93;\n" -" mov.f32 %f95, %f94;\n" -" .loc 16 104 0\n" -" mov.u32 %r54, %r44;\n" -" mov.s32 %r55, 0;\n" -" mov.u32 %r56, %r55;\n" -" mov.s32 %r57, 0;\n" -" mov.u32 %r58, %r57;\n" -" mov.s32 %r59, 0;\n" -" mov.u32 %r60, %r59;\n" -" tex.1d.v4.f32.s32 {%f96,%f97,%f98,%f99},[q_tex,{%r54,%r56,%r58,%r60}];\n" -" mov.f32 %f100, %f96;\n" -" ld.param.f32 %f101, [__cudaparm_kernel_pair_qqrd2e];\n" -" mul.ftz.f32 %f102, %f101, %f33;\n" -" mul.ftz.f32 %f103, %f102, %f100;\n" -" div.approx.ftz.f32 %f104, %f103, %f72;\n" -" mov.f32 %f105, %f104;\n" -" .loc 16 105 0\n" -" mov.f32 %f106, 0f3f906ebb; \n" -" mul.ftz.f32 %f107, %f74, %f106;\n" -" fma.rn.ftz.f32 %f108, %f79, %f107, %f94;\n" -" sub.ftz.f32 %f109, %f108, %f42;\n" -" mul.ftz.f32 %f110, %f104, %f109;\n" -" bra.uni $Lt_0_24578;\n" -"$Lt_0_24834:\n" -" .loc 16 107 0\n" -" mov.f32 %f110, 0f00000000; \n" -"$Lt_0_24578:\n" -" .loc 16 111 0\n" -" add.ftz.f32 %f111, %f110, %f70;\n" -" mul.ftz.f32 %f112, %f111, %f58;\n" -" fma.rn.ftz.f32 %f36, %f52, %f112, %f36;\n" -" .loc 16 112 0\n" -" fma.rn.ftz.f32 %f35, %f51, %f112, %f35;\n" -" .loc 16 113 0\n" -" fma.rn.ftz.f32 %f34, %f53, %f112, %f34;\n" -" ld.param.s32 %r61, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r62, 0;\n" -" setp.le.s32 %p7, %r61, %r62;\n" -" @%p7 bra $Lt_0_25602;\n" -" .loc 16 116 0\n" -" mov.f32 %f113, %f105;\n" -" mov.f32 %f114, %f95;\n" -" sub.ftz.f32 %f115, %f114, %f42;\n" -" fma.rn.ftz.f32 %f116, %f113, %f115, %f37;\n" -" selp.f32 %f37, %f116, %f37, %p6;\n" -" @!%p5 bra $Lt_0_25602;\n" -" .loc 16 120 0\n" -" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];\n" -" add.u64 %rd37, %rd36, %rd34;\n" -" ld.global.v4.f32 {%f117,%f118,%f119,_}, [%rd37+0];\n" -" mov.f32 %f120, %f64;\n" -" mov.f32 %f121, %f62;\n" -" mul.ftz.f32 %f122, %f117, %f121;\n" -" sub.ftz.f32 %f123, %f122, %f118;\n" -" mul.ftz.f32 %f124, %f120, %f123;\n" -" sub.ftz.f32 %f125, %f124, %f119;\n" -" fma.rn.ftz.f32 %f38, %f39, %f125, %f38;\n" -"$Lt_0_25602:\n" -"$Lt_0_25090:\n" -" ld.param.s32 %r63, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r64, 0;\n" -" setp.le.s32 %p8, %r63, %r64;\n" -" @%p8 bra $Lt_0_26114;\n" -" .loc 16 124 0\n" -" mov.f32 %f126, %f10;\n" -" mul.ftz.f32 %f127, %f52, %f52;\n" -" fma.rn.ftz.f32 %f128, %f112, %f127, %f126;\n" -" mov.f32 %f10, %f128;\n" -" .loc 16 125 0\n" -" mov.f32 %f129, %f12;\n" -" fma.rn.ftz.f32 %f130, %f112, %f54, %f129;\n" -" mov.f32 %f12, %f130;\n" -" .loc 16 126 0\n" -" mov.f32 %f131, %f14;\n" -" mul.ftz.f32 %f132, %f53, %f53;\n" -" fma.rn.ftz.f32 %f133, %f112, %f132, %f131;\n" -" mov.f32 %f14, %f133;\n" -" .loc 16 127 0\n" -" mov.f32 %f134, %f16;\n" -" mul.ftz.f32 %f135, %f51, %f52;\n" -" fma.rn.ftz.f32 %f136, %f112, %f135, %f134;\n" -" mov.f32 %f16, %f136;\n" -" .loc 16 128 0\n" -" mov.f32 %f137, %f18;\n" -" mul.ftz.f32 %f138, %f52, %f53;\n" -" fma.rn.ftz.f32 %f139, %f112, %f138, %f137;\n" -" mov.f32 %f18, %f139;\n" -" .loc 16 129 0\n" -" mul.ftz.f32 %f140, %f51, %f53;\n" -" fma.rn.ftz.f32 %f19, %f112, %f140, %f19;\n" -" mov.f32 %f20, %f19;\n" -"$Lt_0_26114:\n" -"$Lt_0_23554:\n" -" .loc 16 68 0\n" -" mul.lo.u64 %rd38, %rd27, 4;\n" -" add.u64 %rd20, %rd20, %rd38;\n" -" setp.lt.u64 %p9, %rd20, %rd19;\n" -" @%p9 bra $Lt_0_23298;\n" -" bra.uni $Lt_0_22786;\n" -"$Lt_0_32514:\n" -" mov.f32 %f34, 0f00000000; \n" -" mov.f32 %f35, 0f00000000; \n" -" mov.f32 %f36, 0f00000000; \n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -"$Lt_0_22786:\n" -" mov.u32 %r65, 1;\n" -" setp.le.s32 %p10, %r1, %r65;\n" -" @%p10 bra $Lt_0_28930;\n" -" .loc 16 134 0\n" -" mov.u64 %rd39, __cuda___cuda_local_var_32635_55_non_const_red_acc144;\n" -" cvt.s64.s32 %rd40, %r2;\n" -" mul.wide.s32 %rd41, %r2, 4;\n" -" add.u64 %rd42, %rd39, %rd41;\n" -" mov.f32 %f141, %f36;\n" -" st.shared.f32 [%rd42+0], %f141;\n" -" mov.f32 %f142, %f35;\n" -" st.shared.f32 [%rd42+512], %f142;\n" -" mov.f32 %f143, %f34;\n" -" st.shared.f32 [%rd42+1024], %f143;\n" -" mov.f32 %f144, %f38;\n" -" st.shared.f32 [%rd42+1536], %f144;\n" -" mov.f32 %f145, %f37;\n" -" st.shared.f32 [%rd42+2048], %f145;\n" -" shr.s32 %r66, %r1, 31;\n" -" mov.s32 %r67, 1;\n" -" and.b32 %r68, %r66, %r67;\n" -" add.s32 %r69, %r68, %r1;\n" -" shr.s32 %r70, %r69, 1;\n" -" mov.s32 %r71, %r70;\n" -" mov.u32 %r72, 0;\n" -" setp.ne.u32 %p11, %r70, %r72;\n" -" @!%p11 bra $Lt_0_27394;\n" -"$Lt_0_27906:\n" -" setp.ge.u32 %p12, %r14, %r71;\n" -" @%p12 bra $Lt_0_28162;\n" -" add.u32 %r73, %r2, %r71;\n" -" cvt.u64.u32 %rd43, %r73;\n" -" mul.wide.u32 %rd44, %r73, 4;\n" -" add.u64 %rd45, %rd39, %rd44;\n" -" ld.shared.f32 %f146, [%rd45+0];\n" -" add.ftz.f32 %f141, %f146, %f141;\n" -" st.shared.f32 [%rd42+0], %f141;\n" -" ld.shared.f32 %f147, [%rd45+512];\n" -" add.ftz.f32 %f142, %f147, %f142;\n" -" st.shared.f32 [%rd42+512], %f142;\n" -" ld.shared.f32 %f148, [%rd45+1024];\n" -" add.ftz.f32 %f143, %f148, %f143;\n" -" st.shared.f32 [%rd42+1024], %f143;\n" -" ld.shared.f32 %f149, [%rd45+1536];\n" -" add.ftz.f32 %f144, %f149, %f144;\n" -" st.shared.f32 [%rd42+1536], %f144;\n" -" ld.shared.f32 %f150, [%rd45+2048];\n" -" add.ftz.f32 %f145, %f150, %f145;\n" -" st.shared.f32 [%rd42+2048], %f145;\n" -"$Lt_0_28162:\n" -" shr.u32 %r71, %r71, 1;\n" -" mov.u32 %r74, 0;\n" -" setp.ne.u32 %p13, %r71, %r74;\n" -" @%p13 bra $Lt_0_27906;\n" -"$Lt_0_27394:\n" -" mov.f32 %f36, %f141;\n" -" mov.f32 %f35, %f142;\n" -" mov.f32 %f34, %f143;\n" -" mov.f32 %f38, %f144;\n" -" mov.f32 %f37, %f145;\n" -" ld.param.s32 %r75, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r76, 0;\n" -" setp.le.s32 %p14, %r75, %r76;\n" -" @%p14 bra $Lt_0_28930;\n" -" mov.f32 %f141, %f10;\n" -" st.shared.f32 [%rd42+0], %f141;\n" -" mov.f32 %f142, %f12;\n" -" st.shared.f32 [%rd42+512], %f142;\n" -" mov.f32 %f143, %f14;\n" -" st.shared.f32 [%rd42+1024], %f143;\n" -" mov.f32 %f144, %f16;\n" -" st.shared.f32 [%rd42+1536], %f144;\n" -" mov.f32 %f145, %f18;\n" -" st.shared.f32 [%rd42+2048], %f145;\n" -" mov.f32 %f151, %f19;\n" -" st.shared.f32 [%rd42+2560], %f151;\n" -" mov.s32 %r77, %r70;\n" -" @!%p11 bra $Lt_0_29442;\n" -"$Lt_0_29954:\n" -" setp.ge.u32 %p15, %r14, %r77;\n" -" @%p15 bra $Lt_0_30210;\n" -" add.u32 %r78, %r2, %r77;\n" -" cvt.u64.u32 %rd46, %r78;\n" -" mul.wide.u32 %rd47, %r78, 4;\n" -" add.u64 %rd48, %rd39, %rd47;\n" -" ld.shared.f32 %f152, [%rd48+0];\n" -" add.ftz.f32 %f141, %f152, %f141;\n" -" st.shared.f32 [%rd42+0], %f141;\n" -" ld.shared.f32 %f153, [%rd48+512];\n" -" add.ftz.f32 %f142, %f153, %f142;\n" -" st.shared.f32 [%rd42+512], %f142;\n" -" ld.shared.f32 %f154, [%rd48+1024];\n" -" add.ftz.f32 %f143, %f154, %f143;\n" -" st.shared.f32 [%rd42+1024], %f143;\n" -" ld.shared.f32 %f155, [%rd48+1536];\n" -" add.ftz.f32 %f144, %f155, %f144;\n" -" st.shared.f32 [%rd42+1536], %f144;\n" -" ld.shared.f32 %f156, [%rd48+2048];\n" -" add.ftz.f32 %f145, %f156, %f145;\n" -" st.shared.f32 [%rd42+2048], %f145;\n" -" ld.shared.f32 %f157, [%rd48+2560];\n" -" add.ftz.f32 %f151, %f157, %f151;\n" -" st.shared.f32 [%rd42+2560], %f151;\n" -"$Lt_0_30210:\n" -" shr.u32 %r77, %r77, 1;\n" -" mov.u32 %r79, 0;\n" -" setp.ne.u32 %p16, %r77, %r79;\n" -" @%p16 bra $Lt_0_29954;\n" -"$Lt_0_29442:\n" -" mov.f32 %f10, %f141;\n" -" mov.f32 %f12, %f142;\n" -" mov.f32 %f14, %f143;\n" -" mov.f32 %f16, %f144;\n" -" mov.f32 %f18, %f145;\n" -" mov.f32 %f20, %f151;\n" -"$Lt_0_28930:\n" -"$Lt_0_26882:\n" -" mov.u32 %r80, 0;\n" -" setp.ne.s32 %p17, %r14, %r80;\n" -" @%p17 bra $Lt_0_30978;\n" -" ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];\n" -" add.u64 %rd50, %rd49, %rd3;\n" -" ld.param.s32 %r81, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r82, 0;\n" -" setp.le.s32 %p18, %r81, %r82;\n" -" @%p18 bra $Lt_0_31490;\n" -" st.global.f32 [%rd50+0], %f38;\n" -" cvt.s64.s32 %rd51, %r9;\n" -" mul.wide.s32 %rd52, %r9, 4;\n" -" add.u64 %rd53, %rd52, %rd50;\n" -" st.global.f32 [%rd53+0], %f37;\n" -" add.u64 %rd50, %rd52, %rd53;\n" -"$Lt_0_31490:\n" -" ld.param.s32 %r83, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r84, 0;\n" -" setp.le.s32 %p19, %r83, %r84;\n" -" @%p19 bra $Lt_0_32002;\n" -" mov.f32 %f158, %f10;\n" -" st.global.f32 [%rd50+0], %f158;\n" -" cvt.s64.s32 %rd54, %r9;\n" -" mul.wide.s32 %rd55, %r9, 4;\n" -" add.u64 %rd56, %rd55, %rd50;\n" -" mov.f32 %f159, %f12;\n" -" st.global.f32 [%rd56+0], %f159;\n" -" add.u64 %rd57, %rd55, %rd56;\n" -" mov.f32 %f160, %f14;\n" -" st.global.f32 [%rd57+0], %f160;\n" -" add.u64 %rd58, %rd55, %rd57;\n" -" mov.f32 %f161, %f16;\n" -" st.global.f32 [%rd58+0], %f161;\n" -" add.u64 %rd50, %rd55, %rd58;\n" -" mov.f32 %f162, %f18;\n" -" st.global.f32 [%rd50+0], %f162;\n" -" mov.f32 %f163, %f20;\n" -" add.u64 %rd59, %rd55, %rd50;\n" -" st.global.f32 [%rd59+0], %f163;\n" -"$Lt_0_32002:\n" -" ld.param.u64 %rd60, [__cudaparm_kernel_pair_ans];\n" -" mul.lo.u64 %rd61, %rd2, 16;\n" -" add.u64 %rd62, %rd60, %rd61;\n" -" mov.f32 %f164, %f165;\n" -" st.global.v4.f32 [%rd62+0], {%f36,%f35,%f34,%f164};\n" -"$Lt_0_30978:\n" -"$Lt_0_21762:\n" -" .loc 16 137 0\n" -" exit;\n" -"$LDWend_kernel_pair:\n" -" }\n" -" .entry kernel_pair_fast (\n" -" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" -" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" -" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" -" .param .u64 __cudaparm_kernel_pair_fast_q_,\n" -" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n" -" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n" -" .param .f32 __cudaparm_kernel_pair_fast_g_ewald,\n" -" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" -" {\n" -" .reg .u32 %r<88>;\n" -" .reg .u64 %rd<76>;\n" -" .reg .f32 %f<170>;\n" -" .reg .pred %p<24>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32654_33_non_const_sp_lj3320[32];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32652_34_non_const_lj13360[1936];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32653_34_non_const_lj35296[1936];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32749_55_non_const_red_acc7232[3072];\n" -" .loc 16 147 0\n" -"$LDWbegin_kernel_pair_fast:\n" -" cvt.s32.u32 %r1, %tid.x;\n" -" mov.u32 %r2, 7;\n" -" setp.gt.s32 %p1, %r1, %r2;\n" -" @%p1 bra $Lt_1_24066;\n" -" .loc 16 155 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32654_33_non_const_sp_lj3320;\n" -" cvt.s64.s32 %rd2, %r1;\n" -" mul.wide.s32 %rd3, %r1, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f32 %f1, [%rd5+0];\n" -" add.u64 %rd6, %rd3, %rd1;\n" -" st.shared.f32 [%rd6+0], %f1;\n" -"$Lt_1_24066:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32654_33_non_const_sp_lj3320;\n" -" mov.u32 %r3, 120;\n" -" setp.gt.s32 %p2, %r1, %r3;\n" -" @%p2 bra $Lt_1_24578;\n" -" .loc 16 157 0\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32652_34_non_const_lj13360;\n" -" cvt.s64.s32 %rd8, %r1;\n" -" mul.wide.s32 %rd9, %r1, 16;\n" -" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n" -" add.u64 %rd11, %rd10, %rd9;\n" -" add.u64 %rd12, %rd9, %rd7;\n" -" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n" -" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n" -" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r5, 0;\n" -" setp.le.s32 %p3, %r4, %r5;\n" -" @%p3 bra $Lt_1_25090;\n" -" .loc 16 159 0\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32653_34_non_const_lj35296;\n" -" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n" -" add.u64 %rd15, %rd14, %rd9;\n" -" add.u64 %rd16, %rd9, %rd13;\n" -" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n" -" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n" -"$Lt_1_25090:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32653_34_non_const_lj35296;\n" -"$Lt_1_24578:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32653_34_non_const_lj35296;\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32652_34_non_const_lj13360;\n" -" .loc 16 168 0\n" -" mov.f32 %f10, 0f00000000; \n" -" mov.f32 %f11, %f10;\n" -" mov.f32 %f12, 0f00000000; \n" -" mov.f32 %f13, %f12;\n" -" mov.f32 %f14, 0f00000000; \n" -" mov.f32 %f15, %f14;\n" -" mov.f32 %f16, 0f00000000; \n" -" mov.f32 %f17, %f16;\n" -" mov.f32 %f18, 0f00000000; \n" -" mov.f32 %f19, %f18;\n" -" mov.f32 %f20, 0f00000000; \n" -" mov.f32 %f21, %f20;\n" -" .loc 16 170 0\n" -" bar.sync 0;\n" -" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n" -" div.s32 %r7, %r1, %r6;\n" -" cvt.s32.u32 %r8, %ntid.x;\n" -" div.s32 %r9, %r8, %r6;\n" -" cvt.s32.u32 %r10, %ctaid.x;\n" -" mul.lo.s32 %r11, %r10, %r9;\n" -" add.s32 %r12, %r7, %r11;\n" -" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n" -" setp.ge.s32 %p4, %r12, %r13;\n" -" @%p4 bra $Lt_1_34818;\n" -" .loc 16 175 0\n" -" cvt.s64.s32 %rd17, %r12;\n" -" mul.wide.s32 %rd18, %r12, 4;\n" -" ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n" -" add.u64 %rd20, %rd18, %rd19;\n" -" ld.global.s32 %r14, [%rd20+0];\n" -" ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" -" cvt.s64.s32 %rd21, %r15;\n" -" mul.wide.s32 %rd22, %r15, 4;\n" -" add.u64 %rd23, %rd22, %rd20;\n" -" ld.global.s32 %r16, [%rd23+0];\n" -" sub.s32 %r17, %r6, 1;\n" -" and.b32 %r18, %r17, %r1;\n" -" cvt.s64.s32 %rd24, %r18;\n" -" mul.wide.s32 %rd25, %r18, 4;\n" -" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n" -" setp.ne.u64 %p5, %rd26, %rd19;\n" -" @%p5 bra $Lt_1_26370;\n" -" cvt.s32.s64 %r19, %rd21;\n" -" mul.lo.s32 %r20, %r19, %r6;\n" -" mov.s32 %r21, %r20;\n" -" mul.lo.s32 %r22, %r17, %r12;\n" -" add.s32 %r23, %r19, %r22;\n" -" cvt.s64.s32 %rd27, %r23;\n" -" mul.wide.s32 %rd28, %r23, 4;\n" -" add.u64 %rd29, %rd23, %rd28;\n" -" and.b32 %r24, %r17, %r16;\n" -" cvt.s64.s32 %rd30, %r24;\n" -" div.s32 %r25, %r16, %r6;\n" -" mul.lo.s32 %r26, %r20, %r25;\n" -" cvt.s64.s32 %rd31, %r26;\n" -" add.u64 %rd32, %rd30, %rd31;\n" -" mul.lo.u64 %rd33, %rd32, 4;\n" -" add.u64 %rd34, %rd29, %rd33;\n" -" add.u64 %rd35, %rd25, %rd29;\n" -" bra.uni $Lt_1_26114;\n" -"$Lt_1_26370:\n" -" add.u64 %rd36, %rd22, %rd23;\n" -" ld.global.s32 %r27, [%rd36+0];\n" -" cvt.s64.s32 %rd37, %r27;\n" -" mul.wide.s32 %rd38, %r27, 4;\n" -" add.u64 %rd39, %rd26, %rd38;\n" -" cvt.s64.s32 %rd40, %r16;\n" -" mul.wide.s32 %rd41, %r16, 4;\n" -" add.u64 %rd34, %rd39, %rd41;\n" -" mov.s32 %r21, %r6;\n" -" add.u64 %rd35, %rd25, %rd39;\n" -"$Lt_1_26114:\n" -" .loc 16 178 0\n" -" mov.u32 %r28, %r14;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" mov.s32 %r31, 0;\n" -" mov.u32 %r32, %r31;\n" -" mov.s32 %r33, 0;\n" -" mov.u32 %r34, %r33;\n" -" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];\n" -" mov.f32 %f26, %f22;\n" -" mov.f32 %f27, %f23;\n" -" mov.f32 %f28, %f24;\n" -" mov.f32 %f29, %f25;\n" -" .loc 16 179 0\n" -" mov.u32 %r35, %r14;\n" -" mov.s32 %r36, 0;\n" -" mov.u32 %r37, %r36;\n" -" mov.s32 %r38, 0;\n" -" mov.u32 %r39, %r38;\n" -" mov.s32 %r40, 0;\n" -" mov.u32 %r41, %r40;\n" -" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[q_tex,{%r35,%r37,%r39,%r41}];\n" -" mov.f32 %f34, %f30;\n" -" setp.ge.u64 %p6, %rd35, %rd34;\n" -" @%p6 bra $Lt_1_36354;\n" -" cvt.rzi.ftz.s32.f32 %r42, %f29;\n" -" cvt.s64.s32 %rd42, %r21;\n" -" mul.lo.s32 %r43, %r42, 11;\n" -" cvt.rn.f32.s32 %f35, %r43;\n" -" mov.f32 %f36, 0f00000000; \n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -" mov.f32 %f39, 0f00000000; \n" -" mov.f32 %f40, 0f00000000; \n" -"$Lt_1_27138:\n" -" .loc 16 184 0\n" -" ld.global.s32 %r44, [%rd35+0];\n" -" .loc 16 187 0\n" -" shr.s32 %r45, %r44, 30;\n" -" and.b32 %r46, %r45, 3;\n" -" cvt.s64.s32 %rd43, %r46;\n" -" mul.wide.s32 %rd44, %r46, 4;\n" -" add.u64 %rd45, %rd1, %rd44;\n" -" ld.shared.f32 %f41, [%rd45+0];\n" -" .loc 16 188 0\n" -" mov.f32 %f42, 0f3f800000; \n" -" ld.shared.f32 %f43, [%rd45+16];\n" -" sub.ftz.f32 %f44, %f42, %f43;\n" -" .loc 16 191 0\n" -" and.b32 %r47, %r44, 1073741823;\n" -" mov.u32 %r48, %r47;\n" -" mov.s32 %r49, 0;\n" -" mov.u32 %r50, %r49;\n" -" mov.s32 %r51, 0;\n" -" mov.u32 %r52, %r51;\n" -" mov.s32 %r53, 0;\n" -" mov.u32 %r54, %r53;\n" -" tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[pos_tex,{%r48,%r50,%r52,%r54}];\n" -" mov.f32 %f49, %f45;\n" -" mov.f32 %f50, %f46;\n" -" mov.f32 %f51, %f47;\n" -" mov.f32 %f52, %f48;\n" -" sub.ftz.f32 %f53, %f27, %f50;\n" -" sub.ftz.f32 %f54, %f26, %f49;\n" -" sub.ftz.f32 %f55, %f28, %f51;\n" -" mul.ftz.f32 %f56, %f53, %f53;\n" -" fma.rn.ftz.f32 %f57, %f54, %f54, %f56;\n" -" fma.rn.ftz.f32 %f58, %f55, %f55, %f57;\n" -" add.ftz.f32 %f59, %f35, %f52;\n" -" cvt.rzi.ftz.s32.f32 %r55, %f59;\n" -" cvt.s64.s32 %rd46, %r55;\n" -" mul.wide.s32 %rd47, %r55, 16;\n" -" add.u64 %rd48, %rd47, %rd7;\n" -" ld.shared.f32 %f60, [%rd48+8];\n" -" setp.gt.ftz.f32 %p7, %f60, %f58;\n" -" @!%p7 bra $Lt_1_29954;\n" -" rcp.approx.ftz.f32 %f61, %f58;\n" -" ld.shared.f32 %f62, [%rd48+12];\n" -" setp.lt.ftz.f32 %p8, %f58, %f62;\n" -" @!%p8 bra $Lt_1_28162;\n" -" .loc 16 206 0\n" -" rsqrt.approx.ftz.f32 %f63, %f58;\n" -" mul.ftz.f32 %f64, %f61, %f63;\n" -" mov.f32 %f65, %f64;\n" -" .loc 16 207 0\n" -" mul.ftz.f32 %f66, %f64, %f64;\n" -" mov.f32 %f67, %f66;\n" -" .loc 16 208 0\n" -" mul.ftz.f32 %f68, %f66, %f41;\n" -" ld.shared.v2.f32 {%f69,%f70}, [%rd48+0];\n" -" mul.ftz.f32 %f71, %f69, %f64;\n" -" sub.ftz.f32 %f72, %f71, %f70;\n" -" mul.ftz.f32 %f73, %f68, %f72;\n" -" bra.uni $Lt_1_27906;\n" -"$Lt_1_28162:\n" -" .loc 16 210 0\n" -" mov.f32 %f73, 0f00000000; \n" -"$Lt_1_27906:\n" -" ld.param.f32 %f74, [__cudaparm_kernel_pair_fast_cut_coulsq];\n" -" setp.gt.ftz.f32 %p9, %f74, %f58;\n" -" @!%p9 bra $Lt_1_28674;\n" -" .loc 20 518 0\n" -" rsqrt.approx.ftz.f32 %f75, %f61;\n" -" ld.param.f32 %f76, [__cudaparm_kernel_pair_fast_g_ewald];\n" -" mul.ftz.f32 %f77, %f76, %f75;\n" -" mul.ftz.f32 %f78, %f77, %f77;\n" -" neg.ftz.f32 %f79, %f78;\n" -" mov.f32 %f80, 0f3fb8aa3b; \n" -" mul.ftz.f32 %f81, %f79, %f80;\n" -" ex2.approx.ftz.f32 %f82, %f81;\n" -" .loc 16 217 0\n" -" mov.f32 %f83, 0f3f800000; \n" -" mov.f32 %f84, 0f3ea7ba05; \n" -" fma.rn.ftz.f32 %f85, %f84, %f77, %f83;\n" -" rcp.approx.ftz.f32 %f86, %f85;\n" -" mov.f32 %f87, 0f3e827906; \n" -" mov.f32 %f88, 0fbe91a98e; \n" -" mov.f32 %f89, 0f3fb5f0e3; \n" -" mov.f32 %f90, 0fbfba00e3; \n" -" mov.f32 %f91, 0f3f87dc22; \n" -" fma.rn.ftz.f32 %f92, %f91, %f86, %f90;\n" -" fma.rn.ftz.f32 %f93, %f86, %f92, %f89;\n" -" fma.rn.ftz.f32 %f94, %f86, %f93, %f88;\n" -" fma.rn.ftz.f32 %f95, %f86, %f94, %f87;\n" -" mul.ftz.f32 %f96, %f86, %f95;\n" -" mul.ftz.f32 %f97, %f82, %f96;\n" -" mov.f32 %f98, %f97;\n" -" .loc 16 218 0\n" -" mov.u32 %r56, %r47;\n" -" mov.s32 %r57, 0;\n" -" mov.u32 %r58, %r57;\n" -" mov.s32 %r59, 0;\n" -" mov.u32 %r60, %r59;\n" -" mov.s32 %r61, 0;\n" -" mov.u32 %r62, %r61;\n" -" tex.1d.v4.f32.s32 {%f99,%f100,%f101,%f102},[q_tex,{%r56,%r58,%r60,%r62}];\n" -" mov.f32 %f103, %f99;\n" -" ld.param.f32 %f104, [__cudaparm_kernel_pair_fast_qqrd2e];\n" -" mul.ftz.f32 %f105, %f104, %f34;\n" -" mul.ftz.f32 %f106, %f105, %f103;\n" -" div.approx.ftz.f32 %f107, %f106, %f75;\n" -" mov.f32 %f108, %f107;\n" -" .loc 16 219 0\n" -" mov.f32 %f109, 0f3f906ebb; \n" -" mul.ftz.f32 %f110, %f77, %f109;\n" -" fma.rn.ftz.f32 %f111, %f82, %f110, %f97;\n" -" sub.ftz.f32 %f112, %f111, %f44;\n" -" mul.ftz.f32 %f113, %f107, %f112;\n" -" bra.uni $Lt_1_28418;\n" -"$Lt_1_28674:\n" -" .loc 16 221 0\n" -" mov.f32 %f113, 0f00000000; \n" -"$Lt_1_28418:\n" -" .loc 16 225 0\n" -" add.ftz.f32 %f114, %f113, %f73;\n" -" mul.ftz.f32 %f115, %f114, %f61;\n" -" fma.rn.ftz.f32 %f38, %f54, %f115, %f38;\n" -" .loc 16 226 0\n" -" fma.rn.ftz.f32 %f37, %f53, %f115, %f37;\n" -" .loc 16 227 0\n" -" fma.rn.ftz.f32 %f36, %f55, %f115, %f36;\n" -" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r64, 0;\n" -" setp.le.s32 %p10, %r63, %r64;\n" -" @%p10 bra $Lt_1_29442;\n" -" .loc 16 230 0\n" -" mov.f32 %f116, %f108;\n" -" mov.f32 %f117, %f98;\n" -" sub.ftz.f32 %f118, %f117, %f44;\n" -" fma.rn.ftz.f32 %f119, %f116, %f118, %f39;\n" -" selp.f32 %f39, %f119, %f39, %p9;\n" -" @!%p8 bra $Lt_1_29442;\n" -" .loc 16 233 0\n" -" add.u64 %rd49, %rd47, %rd13;\n" -" mov.f32 %f120, %f67;\n" -" ld.shared.v4.f32 {%f121,%f122,%f123,_}, [%rd49+0];\n" -" mov.f32 %f124, %f65;\n" -" mul.ftz.f32 %f125, %f121, %f124;\n" -" sub.ftz.f32 %f126, %f125, %f122;\n" -" mul.ftz.f32 %f127, %f120, %f126;\n" -" .loc 16 234 0\n" -" sub.ftz.f32 %f128, %f127, %f123;\n" -" fma.rn.ftz.f32 %f40, %f41, %f128, %f40;\n" -"$Lt_1_29442:\n" -"$Lt_1_28930:\n" -" ld.param.s32 %r65, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r66, 0;\n" -" setp.le.s32 %p11, %r65, %r66;\n" -" @%p11 bra $Lt_1_29954;\n" -" .loc 16 238 0\n" -" mov.f32 %f129, %f11;\n" -" mul.ftz.f32 %f130, %f54, %f54;\n" -" fma.rn.ftz.f32 %f131, %f115, %f130, %f129;\n" -" mov.f32 %f11, %f131;\n" -" .loc 16 239 0\n" -" mov.f32 %f132, %f13;\n" -" fma.rn.ftz.f32 %f133, %f115, %f56, %f132;\n" -" mov.f32 %f13, %f133;\n" -" .loc 16 240 0\n" -" mov.f32 %f134, %f15;\n" -" mul.ftz.f32 %f135, %f55, %f55;\n" -" fma.rn.ftz.f32 %f136, %f115, %f135, %f134;\n" -" mov.f32 %f15, %f136;\n" -" .loc 16 241 0\n" -" mov.f32 %f137, %f17;\n" -" mul.ftz.f32 %f138, %f53, %f54;\n" -" fma.rn.ftz.f32 %f139, %f115, %f138, %f137;\n" -" mov.f32 %f17, %f139;\n" -" .loc 16 242 0\n" -" mov.f32 %f140, %f19;\n" -" mul.ftz.f32 %f141, %f54, %f55;\n" -" fma.rn.ftz.f32 %f142, %f115, %f141, %f140;\n" -" mov.f32 %f19, %f142;\n" -" .loc 16 243 0\n" -" mul.ftz.f32 %f143, %f53, %f55;\n" -" fma.rn.ftz.f32 %f20, %f115, %f143, %f20;\n" -" mov.f32 %f21, %f20;\n" -"$Lt_1_29954:\n" -"$Lt_1_27394:\n" -" .loc 16 183 0\n" -" mul.lo.u64 %rd50, %rd42, 4;\n" -" add.u64 %rd35, %rd35, %rd50;\n" -" setp.lt.u64 %p12, %rd35, %rd34;\n" -" @%p12 bra $Lt_1_27138;\n" -" bra.uni $Lt_1_26626;\n" -"$Lt_1_36354:\n" -" mov.f32 %f36, 0f00000000; \n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -" mov.f32 %f39, 0f00000000; \n" -" mov.f32 %f40, 0f00000000; \n" -"$Lt_1_26626:\n" -" mov.u32 %r67, 1;\n" -" setp.le.s32 %p13, %r6, %r67;\n" -" @%p13 bra $Lt_1_32770;\n" -" .loc 16 248 0\n" -" mov.u64 %rd51, __cuda___cuda_local_var_32749_55_non_const_red_acc7232;\n" -" cvt.s64.s32 %rd52, %r1;\n" -" mul.wide.s32 %rd53, %r1, 4;\n" -" add.u64 %rd54, %rd51, %rd53;\n" -" mov.f32 %f144, %f38;\n" -" st.shared.f32 [%rd54+0], %f144;\n" -" mov.f32 %f145, %f37;\n" -" st.shared.f32 [%rd54+512], %f145;\n" -" mov.f32 %f146, %f36;\n" -" st.shared.f32 [%rd54+1024], %f146;\n" -" mov.f32 %f147, %f40;\n" -" st.shared.f32 [%rd54+1536], %f147;\n" -" mov.f32 %f148, %f39;\n" -" st.shared.f32 [%rd54+2048], %f148;\n" -" shr.s32 %r68, %r6, 31;\n" -" mov.s32 %r69, 1;\n" -" and.b32 %r70, %r68, %r69;\n" -" add.s32 %r71, %r70, %r6;\n" -" shr.s32 %r72, %r71, 1;\n" -" mov.s32 %r73, %r72;\n" -" mov.u32 %r74, 0;\n" -" setp.ne.u32 %p14, %r72, %r74;\n" -" @!%p14 bra $Lt_1_31234;\n" -"$Lt_1_31746:\n" -" setp.ge.u32 %p15, %r18, %r73;\n" -" @%p15 bra $Lt_1_32002;\n" -" add.u32 %r75, %r1, %r73;\n" -" cvt.u64.u32 %rd55, %r75;\n" -" mul.wide.u32 %rd56, %r75, 4;\n" -" add.u64 %rd57, %rd51, %rd56;\n" -" ld.shared.f32 %f149, [%rd57+0];\n" -" add.ftz.f32 %f144, %f149, %f144;\n" -" st.shared.f32 [%rd54+0], %f144;\n" -" ld.shared.f32 %f150, [%rd57+512];\n" -" add.ftz.f32 %f145, %f150, %f145;\n" -" st.shared.f32 [%rd54+512], %f145;\n" -" ld.shared.f32 %f151, [%rd57+1024];\n" -" add.ftz.f32 %f146, %f151, %f146;\n" -" st.shared.f32 [%rd54+1024], %f146;\n" -" ld.shared.f32 %f152, [%rd57+1536];\n" -" add.ftz.f32 %f147, %f152, %f147;\n" -" st.shared.f32 [%rd54+1536], %f147;\n" -" ld.shared.f32 %f153, [%rd57+2048];\n" -" add.ftz.f32 %f148, %f153, %f148;\n" -" st.shared.f32 [%rd54+2048], %f148;\n" -"$Lt_1_32002:\n" -" shr.u32 %r73, %r73, 1;\n" -" mov.u32 %r76, 0;\n" -" setp.ne.u32 %p16, %r73, %r76;\n" -" @%p16 bra $Lt_1_31746;\n" -"$Lt_1_31234:\n" -" mov.f32 %f38, %f144;\n" -" mov.f32 %f37, %f145;\n" -" mov.f32 %f36, %f146;\n" -" mov.f32 %f40, %f147;\n" -" mov.f32 %f39, %f148;\n" -" ld.param.s32 %r77, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r78, 0;\n" -" setp.le.s32 %p17, %r77, %r78;\n" -" @%p17 bra $Lt_1_32770;\n" -" mov.f32 %f144, %f11;\n" -" st.shared.f32 [%rd54+0], %f144;\n" -" mov.f32 %f145, %f13;\n" -" st.shared.f32 [%rd54+512], %f145;\n" -" mov.f32 %f146, %f15;\n" -" st.shared.f32 [%rd54+1024], %f146;\n" -" mov.f32 %f147, %f17;\n" -" st.shared.f32 [%rd54+1536], %f147;\n" -" mov.f32 %f148, %f19;\n" -" st.shared.f32 [%rd54+2048], %f148;\n" -" mov.f32 %f154, %f20;\n" -" st.shared.f32 [%rd54+2560], %f154;\n" -" mov.s32 %r79, %r72;\n" -" @!%p14 bra $Lt_1_33282;\n" -"$Lt_1_33794:\n" -" setp.ge.u32 %p18, %r18, %r79;\n" -" @%p18 bra $Lt_1_34050;\n" -" add.u32 %r80, %r1, %r79;\n" -" cvt.u64.u32 %rd58, %r80;\n" -" mul.wide.u32 %rd59, %r80, 4;\n" -" add.u64 %rd60, %rd51, %rd59;\n" -" ld.shared.f32 %f155, [%rd60+0];\n" -" add.ftz.f32 %f144, %f155, %f144;\n" -" st.shared.f32 [%rd54+0], %f144;\n" -" ld.shared.f32 %f156, [%rd60+512];\n" -" add.ftz.f32 %f145, %f156, %f145;\n" -" st.shared.f32 [%rd54+512], %f145;\n" -" ld.shared.f32 %f157, [%rd60+1024];\n" -" add.ftz.f32 %f146, %f157, %f146;\n" -" st.shared.f32 [%rd54+1024], %f146;\n" -" ld.shared.f32 %f158, [%rd60+1536];\n" -" add.ftz.f32 %f147, %f158, %f147;\n" -" st.shared.f32 [%rd54+1536], %f147;\n" -" ld.shared.f32 %f159, [%rd60+2048];\n" -" add.ftz.f32 %f148, %f159, %f148;\n" -" st.shared.f32 [%rd54+2048], %f148;\n" -" ld.shared.f32 %f160, [%rd60+2560];\n" -" add.ftz.f32 %f154, %f160, %f154;\n" -" st.shared.f32 [%rd54+2560], %f154;\n" -"$Lt_1_34050:\n" -" shr.u32 %r79, %r79, 1;\n" -" mov.u32 %r81, 0;\n" -" setp.ne.u32 %p19, %r79, %r81;\n" -" @%p19 bra $Lt_1_33794;\n" -"$Lt_1_33282:\n" -" mov.f32 %f11, %f144;\n" -" mov.f32 %f13, %f145;\n" -" mov.f32 %f15, %f146;\n" -" mov.f32 %f17, %f147;\n" -" mov.f32 %f19, %f148;\n" -" mov.f32 %f21, %f154;\n" -"$Lt_1_32770:\n" -"$Lt_1_30722:\n" -" mov.u32 %r82, 0;\n" -" setp.ne.s32 %p20, %r18, %r82;\n" -" @%p20 bra $Lt_1_34818;\n" -" ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];\n" -" add.u64 %rd62, %rd61, %rd18;\n" -" ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r84, 0;\n" -" setp.le.s32 %p21, %r83, %r84;\n" -" @%p21 bra $Lt_1_35330;\n" -" st.global.f32 [%rd62+0], %f40;\n" -" cvt.s64.s32 %rd63, %r13;\n" -" mul.wide.s32 %rd64, %r13, 4;\n" -" add.u64 %rd65, %rd64, %rd62;\n" -" st.global.f32 [%rd65+0], %f39;\n" -" add.u64 %rd62, %rd64, %rd65;\n" -"$Lt_1_35330:\n" -" ld.param.s32 %r85, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r86, 0;\n" -" setp.le.s32 %p22, %r85, %r86;\n" -" @%p22 bra $Lt_1_35842;\n" -" mov.f32 %f161, %f11;\n" -" st.global.f32 [%rd62+0], %f161;\n" -" cvt.s64.s32 %rd66, %r13;\n" -" mul.wide.s32 %rd67, %r13, 4;\n" -" add.u64 %rd68, %rd67, %rd62;\n" -" mov.f32 %f162, %f13;\n" -" st.global.f32 [%rd68+0], %f162;\n" -" add.u64 %rd69, %rd67, %rd68;\n" -" mov.f32 %f163, %f15;\n" -" st.global.f32 [%rd69+0], %f163;\n" -" add.u64 %rd70, %rd67, %rd69;\n" -" mov.f32 %f164, %f17;\n" -" st.global.f32 [%rd70+0], %f164;\n" -" add.u64 %rd62, %rd67, %rd70;\n" -" mov.f32 %f165, %f19;\n" -" st.global.f32 [%rd62+0], %f165;\n" -" mov.f32 %f166, %f21;\n" -" add.u64 %rd71, %rd67, %rd62;\n" -" st.global.f32 [%rd71+0], %f166;\n" -"$Lt_1_35842:\n" -" ld.param.u64 %rd72, [__cudaparm_kernel_pair_fast_ans];\n" -" mul.lo.u64 %rd73, %rd17, 16;\n" -" add.u64 %rd74, %rd72, %rd73;\n" -" mov.f32 %f167, %f168;\n" -" st.global.v4.f32 [%rd74+0], {%f38,%f37,%f36,%f167};\n" -"$Lt_1_34818:\n" -"$Lt_1_25602:\n" -" .loc 16 251 0\n" -" exit;\n" -"$LDWend_kernel_pair_fast:\n" -" }\n" -; diff --git a/lib/gpu/lj_coul.ptx b/lib/gpu/lj_coul.ptx deleted file mode 100644 index d1135ffe03..0000000000 --- a/lib/gpu/lj_coul.ptx +++ /dev/null @@ -1,1056 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009d12_00000000-9_lal_lj_coul.cpp3.i (/home/sjplimp/ccBI#.MygJKm) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009d12_00000000-8_lal_lj_coul.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_lj_coul.cu" - .file 17 "/usr/local/cuda/include/common_functions.h" - .file 18 "/usr/local/cuda/include/math_functions.h" - .file 19 "/usr/local/cuda/include/math_constants.h" - .file 20 "/usr/local/cuda/include/device_functions.h" - .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - .global .texref pos_tex; - .global .texref q_tex; - - .entry kernel_pair ( - .param .u64 __cudaparm_kernel_pair_x_, - .param .u64 __cudaparm_kernel_pair_lj1, - .param .u64 __cudaparm_kernel_pair_lj3, - .param .s32 __cudaparm_kernel_pair_lj_types, - .param .u64 __cudaparm_kernel_pair_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_dev_nbor, - .param .u64 __cudaparm_kernel_pair_dev_packed, - .param .u64 __cudaparm_kernel_pair_ans, - .param .u64 __cudaparm_kernel_pair___val_paramengv, - .param .s32 __cudaparm_kernel_pair_eflag, - .param .s32 __cudaparm_kernel_pair_vflag, - .param .s32 __cudaparm_kernel_pair_inum, - .param .s32 __cudaparm_kernel_pair_nbor_pitch, - .param .u64 __cudaparm_kernel_pair_q_, - .param .u64 __cudaparm_kernel_pair_cutsq, - .param .f32 __cudaparm_kernel_pair_qqrd2e, - .param .s32 __cudaparm_kernel_pair_t_per_atom) - { - .reg .u32 %r<86>; - .reg .u64 %rd<67>; - .reg .f32 %f<130>; - .reg .pred %p<21>; - .shared .align 16 .b8 __cuda___cuda_local_var_32541_33_non_const_sp_lj112[32]; - .shared .align 4 .b8 __cuda___cuda_local_var_32626_55_non_const_red_acc144[3072]; - // __cuda_local_var_32553_10_non_const_f = 48 - // __cuda_local_var_32555_9_non_const_virial = 16 - // __cuda_local_var_32589_43_non_const_r6inv = 40 - .loc 16 36 0 -$LDWbegin_kernel_pair: - .loc 16 41 0 - ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; - ldu.global.f32 %f1, [%rd1+0]; - .loc 16 42 0 - ld.global.f32 %f2, [%rd1+4]; - .loc 16 43 0 - ld.global.f32 %f3, [%rd1+8]; - .loc 16 44 0 - ld.global.f32 %f4, [%rd1+12]; - st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_lj112+0], {%f1,%f2,%f3,%f4}; - .loc 16 45 0 - ld.global.f32 %f5, [%rd1+16]; - .loc 16 46 0 - ld.global.f32 %f6, [%rd1+20]; - .loc 16 47 0 - ld.global.f32 %f7, [%rd1+24]; - .loc 16 48 0 - ld.global.f32 %f8, [%rd1+28]; - st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_lj112+16], {%f5,%f6,%f7,%f8}; - .loc 16 56 0 - mov.f32 %f9, 0f00000000; // 0 - mov.f32 %f10, %f9; - mov.f32 %f11, 0f00000000; // 0 - mov.f32 %f12, %f11; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, %f13; - mov.f32 %f15, 0f00000000; // 0 - mov.f32 %f16, %f15; - mov.f32 %f17, 0f00000000; // 0 - mov.f32 %f18, %f17; - mov.f32 %f19, 0f00000000; // 0 - mov.f32 %f20, %f19; - ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_pair_inum]; - setp.ge.s32 %p1, %r8, %r9; - @%p1 bra $Lt_0_30210; - .loc 16 61 0 - cvt.s64.s32 %rd2, %r8; - mul.wide.s32 %rd3, %r8, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor]; - add.u64 %rd5, %rd3, %rd4; - ld.global.s32 %r10, [%rd5+0]; - ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch]; - cvt.s64.s32 %rd6, %r11; - mul.wide.s32 %rd7, %r11, 4; - add.u64 %rd8, %rd7, %rd5; - ld.global.s32 %r12, [%rd8+0]; - sub.s32 %r13, %r1, 1; - and.b32 %r14, %r13, %r2; - cvt.s64.s32 %rd9, %r14; - mul.wide.s32 %rd10, %r14, 4; - ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed]; - setp.ne.u64 %p2, %rd11, %rd4; - @%p2 bra $Lt_0_21762; - cvt.s32.s64 %r15, %rd6; - mul.lo.s32 %r16, %r15, %r1; - mov.s32 %r17, %r16; - mul.lo.s32 %r18, %r13, %r8; - add.s32 %r19, %r15, %r18; - cvt.s64.s32 %rd12, %r19; - mul.wide.s32 %rd13, %r19, 4; - add.u64 %rd14, %rd8, %rd13; - and.b32 %r20, %r13, %r12; - cvt.s64.s32 %rd15, %r20; - div.s32 %r21, %r12, %r1; - mul.lo.s32 %r22, %r16, %r21; - cvt.s64.s32 %rd16, %r22; - add.u64 %rd17, %rd15, %rd16; - mul.lo.u64 %rd18, %rd17, 4; - add.u64 %rd19, %rd14, %rd18; - add.u64 %rd20, %rd10, %rd14; - bra.uni $Lt_0_21506; -$Lt_0_21762: - add.u64 %rd21, %rd7, %rd8; - ld.global.s32 %r23, [%rd21+0]; - cvt.s64.s32 %rd22, %r23; - mul.wide.s32 %rd23, %r23, 4; - add.u64 %rd24, %rd11, %rd23; - cvt.s64.s32 %rd25, %r12; - mul.wide.s32 %rd26, %r12, 4; - add.u64 %rd19, %rd24, %rd26; - mov.s32 %r17, %r1; - add.u64 %rd20, %rd10, %rd24; -$Lt_0_21506: - .loc 16 64 0 - mov.u32 %r24, %r10; - mov.s32 %r25, 0; - mov.u32 %r26, %r25; - mov.s32 %r27, 0; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r24,%r26,%r28,%r30}]; - mov.f32 %f25, %f21; - mov.f32 %f26, %f22; - mov.f32 %f27, %f23; - mov.f32 %f28, %f24; - .loc 16 65 0 - mov.u32 %r31, %r10; - mov.s32 %r32, 0; - mov.u32 %r33, %r32; - mov.s32 %r34, 0; - mov.u32 %r35, %r34; - mov.s32 %r36, 0; - mov.u32 %r37, %r36; - tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r31,%r33,%r35,%r37}]; - mov.f32 %f33, %f29; - setp.ge.u64 %p3, %rd20, %rd19; - @%p3 bra $Lt_0_31746; - cvt.rzi.ftz.s32.f32 %r38, %f28; - cvt.s64.s32 %rd27, %r17; - ld.param.s32 %r39, [__cudaparm_kernel_pair_lj_types]; - mul.lo.s32 %r40, %r39, %r38; - ld.param.u64 %rd28, [__cudaparm_kernel_pair_cutsq]; - mov.f32 %f34, 0f00000000; // 0 - mov.f32 %f35, 0f00000000; // 0 - mov.f32 %f36, 0f00000000; // 0 - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 - mov.u64 %rd29, __cuda___cuda_local_var_32541_33_non_const_sp_lj112; -$Lt_0_22530: - // Loop body line 65, nesting depth: 1, estimated iterations: unknown - .loc 16 69 0 - ld.global.s32 %r41, [%rd20+0]; - .loc 16 72 0 - shr.s32 %r42, %r41, 30; - and.b32 %r43, %r42, 3; - cvt.s64.s32 %rd30, %r43; - mul.wide.s32 %rd31, %r43, 4; - add.u64 %rd32, %rd29, %rd31; - ld.shared.f32 %f39, [%rd32+0]; - .loc 16 76 0 - and.b32 %r44, %r41, 1073741823; - mov.u32 %r45, %r44; - mov.s32 %r46, 0; - mov.u32 %r47, %r46; - mov.s32 %r48, 0; - mov.u32 %r49, %r48; - mov.s32 %r50, 0; - mov.u32 %r51, %r50; - tex.1d.v4.f32.s32 {%f40,%f41,%f42,%f43},[pos_tex,{%r45,%r47,%r49,%r51}]; - mov.f32 %f44, %f40; - mov.f32 %f45, %f41; - mov.f32 %f46, %f42; - mov.f32 %f47, %f43; - cvt.rzi.ftz.s32.f32 %r52, %f47; - sub.ftz.f32 %f48, %f26, %f45; - sub.ftz.f32 %f49, %f25, %f44; - sub.ftz.f32 %f50, %f27, %f46; - mul.ftz.f32 %f51, %f48, %f48; - fma.rn.ftz.f32 %f52, %f49, %f49, %f51; - add.s32 %r53, %r52, %r40; - cvt.s64.s32 %rd33, %r53; - fma.rn.ftz.f32 %f53, %f50, %f50, %f52; - mul.wide.s32 %rd34, %r53, 4; - add.u64 %rd35, %rd28, %rd34; - ld.global.f32 %f54, [%rd35+0]; - setp.gt.ftz.f32 %p4, %f54, %f53; - @!%p4 bra $Lt_0_25346; - mul.lo.u64 %rd36, %rd33, 16; - rcp.approx.ftz.f32 %f55, %f53; - ld.param.u64 %rd37, [__cudaparm_kernel_pair_lj1]; - add.u64 %rd38, %rd37, %rd36; - ld.global.f32 %f56, [%rd38+8]; - setp.lt.ftz.f32 %p5, %f53, %f56; - @!%p5 bra $Lt_0_23554; - .loc 16 91 0 - mul.ftz.f32 %f57, %f55, %f55; - mul.ftz.f32 %f58, %f55, %f57; - mov.f32 %f59, %f58; - .loc 16 92 0 - mul.ftz.f32 %f60, %f58, %f39; - ld.global.v2.f32 {%f61,%f62}, [%rd38+0]; - mul.ftz.f32 %f63, %f61, %f58; - sub.ftz.f32 %f64, %f63, %f62; - mul.ftz.f32 %f65, %f60, %f64; - bra.uni $Lt_0_23298; -$Lt_0_23554: - .loc 16 94 0 - mov.f32 %f65, 0f00000000; // 0 -$Lt_0_23298: - ld.global.f32 %f66, [%rd38+12]; - setp.gt.ftz.f32 %p6, %f66, %f53; - @!%p6 bra $Lt_0_24066; - .loc 16 97 0 - mov.u32 %r54, %r44; - mov.s32 %r55, 0; - mov.u32 %r56, %r55; - mov.s32 %r57, 0; - mov.u32 %r58, %r57; - mov.s32 %r59, 0; - mov.u32 %r60, %r59; - tex.1d.v4.f32.s32 {%f67,%f68,%f69,%f70},[q_tex,{%r54,%r56,%r58,%r60}]; - mov.f32 %f71, %f67; - ld.shared.f32 %f72, [%rd32+16]; - ld.param.f32 %f73, [__cudaparm_kernel_pair_qqrd2e]; - mul.ftz.f32 %f74, %f73, %f33; - mul.ftz.f32 %f75, %f71, %f74; - rsqrt.approx.ftz.f32 %f76, %f53; - mul.ftz.f32 %f77, %f75, %f76; - mul.ftz.f32 %f78, %f72, %f77; - bra.uni $Lt_0_23810; -$Lt_0_24066: - .loc 16 99 0 - mov.f32 %f78, 0f00000000; // 0 -$Lt_0_23810: - .loc 16 103 0 - add.ftz.f32 %f79, %f78, %f65; - mul.ftz.f32 %f80, %f79, %f55; - fma.rn.ftz.f32 %f36, %f49, %f80, %f36; - .loc 16 104 0 - fma.rn.ftz.f32 %f35, %f48, %f80, %f35; - .loc 16 105 0 - fma.rn.ftz.f32 %f34, %f50, %f80, %f34; - ld.param.s32 %r61, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r62, 0; - setp.le.s32 %p7, %r61, %r62; - @%p7 bra $Lt_0_24834; - .loc 16 108 0 - add.ftz.f32 %f37, %f78, %f37; - @!%p5 bra $Lt_0_24834; - .loc 16 111 0 - ld.param.u64 %rd39, [__cudaparm_kernel_pair_lj3]; - add.u64 %rd40, %rd39, %rd36; - mov.f32 %f81, %f59; - ld.global.v4.f32 {%f82,%f83,%f84,_}, [%rd40+0]; - mul.ftz.f32 %f85, %f82, %f81; - sub.ftz.f32 %f86, %f85, %f83; - mul.ftz.f32 %f87, %f81, %f86; - sub.ftz.f32 %f88, %f87, %f84; - fma.rn.ftz.f32 %f38, %f39, %f88, %f38; -$Lt_0_24834: -$Lt_0_24322: - ld.param.s32 %r63, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r64, 0; - setp.le.s32 %p8, %r63, %r64; - @%p8 bra $Lt_0_25346; - .loc 16 115 0 - mov.f32 %f89, %f10; - mul.ftz.f32 %f90, %f49, %f49; - fma.rn.ftz.f32 %f91, %f80, %f90, %f89; - mov.f32 %f10, %f91; - .loc 16 116 0 - mov.f32 %f92, %f12; - fma.rn.ftz.f32 %f93, %f80, %f51, %f92; - mov.f32 %f12, %f93; - .loc 16 117 0 - mov.f32 %f94, %f14; - mul.ftz.f32 %f95, %f50, %f50; - fma.rn.ftz.f32 %f96, %f80, %f95, %f94; - mov.f32 %f14, %f96; - .loc 16 118 0 - mov.f32 %f97, %f16; - mul.ftz.f32 %f98, %f48, %f49; - fma.rn.ftz.f32 %f99, %f80, %f98, %f97; - mov.f32 %f16, %f99; - .loc 16 119 0 - mov.f32 %f100, %f18; - mul.ftz.f32 %f101, %f49, %f50; - fma.rn.ftz.f32 %f102, %f80, %f101, %f100; - mov.f32 %f18, %f102; - .loc 16 120 0 - mul.ftz.f32 %f103, %f48, %f50; - fma.rn.ftz.f32 %f19, %f80, %f103, %f19; - mov.f32 %f20, %f19; -$Lt_0_25346: -$Lt_0_22786: - .loc 16 68 0 - mul.lo.u64 %rd41, %rd27, 4; - add.u64 %rd20, %rd20, %rd41; - setp.lt.u64 %p9, %rd20, %rd19; - @%p9 bra $Lt_0_22530; - bra.uni $Lt_0_22018; -$Lt_0_31746: - mov.f32 %f34, 0f00000000; // 0 - mov.f32 %f35, 0f00000000; // 0 - mov.f32 %f36, 0f00000000; // 0 - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 -$Lt_0_22018: - mov.u32 %r65, 1; - setp.le.s32 %p10, %r1, %r65; - @%p10 bra $Lt_0_28162; - .loc 16 125 0 - mov.u64 %rd42, __cuda___cuda_local_var_32626_55_non_const_red_acc144; - cvt.s64.s32 %rd43, %r2; - mul.wide.s32 %rd44, %r2, 4; - add.u64 %rd45, %rd42, %rd44; - mov.f32 %f104, %f36; - st.shared.f32 [%rd45+0], %f104; - mov.f32 %f105, %f35; - st.shared.f32 [%rd45+512], %f105; - mov.f32 %f106, %f34; - st.shared.f32 [%rd45+1024], %f106; - mov.f32 %f107, %f38; - st.shared.f32 [%rd45+1536], %f107; - mov.f32 %f108, %f37; - st.shared.f32 [%rd45+2048], %f108; - shr.s32 %r66, %r1, 31; - mov.s32 %r67, 1; - and.b32 %r68, %r66, %r67; - add.s32 %r69, %r68, %r1; - shr.s32 %r70, %r69, 1; - mov.s32 %r71, %r70; - mov.u32 %r72, 0; - setp.ne.u32 %p11, %r70, %r72; - @!%p11 bra $Lt_0_26626; -$Lt_0_27138: - setp.ge.u32 %p12, %r14, %r71; - @%p12 bra $Lt_0_27394; - add.u32 %r73, %r2, %r71; - cvt.u64.u32 %rd46, %r73; - mul.wide.u32 %rd47, %r73, 4; - add.u64 %rd48, %rd42, %rd47; - ld.shared.f32 %f109, [%rd48+0]; - add.ftz.f32 %f104, %f109, %f104; - st.shared.f32 [%rd45+0], %f104; - ld.shared.f32 %f110, [%rd48+512]; - add.ftz.f32 %f105, %f110, %f105; - st.shared.f32 [%rd45+512], %f105; - ld.shared.f32 %f111, [%rd48+1024]; - add.ftz.f32 %f106, %f111, %f106; - st.shared.f32 [%rd45+1024], %f106; - ld.shared.f32 %f112, [%rd48+1536]; - add.ftz.f32 %f107, %f112, %f107; - st.shared.f32 [%rd45+1536], %f107; - ld.shared.f32 %f113, [%rd48+2048]; - add.ftz.f32 %f108, %f113, %f108; - st.shared.f32 [%rd45+2048], %f108; -$Lt_0_27394: - shr.u32 %r71, %r71, 1; - mov.u32 %r74, 0; - setp.ne.u32 %p13, %r71, %r74; - @%p13 bra $Lt_0_27138; -$Lt_0_26626: - mov.f32 %f36, %f104; - mov.f32 %f35, %f105; - mov.f32 %f34, %f106; - mov.f32 %f38, %f107; - mov.f32 %f37, %f108; - ld.param.s32 %r75, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r76, 0; - setp.le.s32 %p14, %r75, %r76; - @%p14 bra $Lt_0_28162; - mov.f32 %f104, %f10; - st.shared.f32 [%rd45+0], %f104; - mov.f32 %f105, %f12; - st.shared.f32 [%rd45+512], %f105; - mov.f32 %f106, %f14; - st.shared.f32 [%rd45+1024], %f106; - mov.f32 %f107, %f16; - st.shared.f32 [%rd45+1536], %f107; - mov.f32 %f108, %f18; - st.shared.f32 [%rd45+2048], %f108; - mov.f32 %f114, %f19; - st.shared.f32 [%rd45+2560], %f114; - mov.s32 %r77, %r70; - @!%p11 bra $Lt_0_28674; -$Lt_0_29186: - setp.ge.u32 %p15, %r14, %r77; - @%p15 bra $Lt_0_29442; - add.u32 %r78, %r2, %r77; - cvt.u64.u32 %rd49, %r78; - mul.wide.u32 %rd50, %r78, 4; - add.u64 %rd51, %rd42, %rd50; - ld.shared.f32 %f115, [%rd51+0]; - add.ftz.f32 %f104, %f115, %f104; - st.shared.f32 [%rd45+0], %f104; - ld.shared.f32 %f116, [%rd51+512]; - add.ftz.f32 %f105, %f116, %f105; - st.shared.f32 [%rd45+512], %f105; - ld.shared.f32 %f117, [%rd51+1024]; - add.ftz.f32 %f106, %f117, %f106; - st.shared.f32 [%rd45+1024], %f106; - ld.shared.f32 %f118, [%rd51+1536]; - add.ftz.f32 %f107, %f118, %f107; - st.shared.f32 [%rd45+1536], %f107; - ld.shared.f32 %f119, [%rd51+2048]; - add.ftz.f32 %f108, %f119, %f108; - st.shared.f32 [%rd45+2048], %f108; - ld.shared.f32 %f120, [%rd51+2560]; - add.ftz.f32 %f114, %f120, %f114; - st.shared.f32 [%rd45+2560], %f114; -$Lt_0_29442: - shr.u32 %r77, %r77, 1; - mov.u32 %r79, 0; - setp.ne.u32 %p16, %r77, %r79; - @%p16 bra $Lt_0_29186; -$Lt_0_28674: - mov.f32 %f10, %f104; - mov.f32 %f12, %f105; - mov.f32 %f14, %f106; - mov.f32 %f16, %f107; - mov.f32 %f18, %f108; - mov.f32 %f20, %f114; -$Lt_0_28162: -$Lt_0_26114: - mov.u32 %r80, 0; - setp.ne.s32 %p17, %r14, %r80; - @%p17 bra $Lt_0_30210; - ld.param.u64 %rd52, [__cudaparm_kernel_pair___val_paramengv]; - add.u64 %rd53, %rd52, %rd3; - ld.param.s32 %r81, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r82, 0; - setp.le.s32 %p18, %r81, %r82; - @%p18 bra $Lt_0_30722; - st.global.f32 [%rd53+0], %f38; - cvt.s64.s32 %rd54, %r9; - mul.wide.s32 %rd55, %r9, 4; - add.u64 %rd56, %rd55, %rd53; - st.global.f32 [%rd56+0], %f37; - add.u64 %rd53, %rd55, %rd56; -$Lt_0_30722: - ld.param.s32 %r83, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r84, 0; - setp.le.s32 %p19, %r83, %r84; - @%p19 bra $Lt_0_31234; - mov.f32 %f121, %f10; - st.global.f32 [%rd53+0], %f121; - cvt.s64.s32 %rd57, %r9; - mul.wide.s32 %rd58, %r9, 4; - add.u64 %rd59, %rd58, %rd53; - mov.f32 %f122, %f12; - st.global.f32 [%rd59+0], %f122; - add.u64 %rd60, %rd58, %rd59; - mov.f32 %f123, %f14; - st.global.f32 [%rd60+0], %f123; - add.u64 %rd61, %rd58, %rd60; - mov.f32 %f124, %f16; - st.global.f32 [%rd61+0], %f124; - add.u64 %rd53, %rd58, %rd61; - mov.f32 %f125, %f18; - st.global.f32 [%rd53+0], %f125; - mov.f32 %f126, %f20; - add.u64 %rd62, %rd58, %rd53; - st.global.f32 [%rd62+0], %f126; -$Lt_0_31234: - ld.param.u64 %rd63, [__cudaparm_kernel_pair_ans]; - mul.lo.u64 %rd64, %rd2, 16; - add.u64 %rd65, %rd63, %rd64; - mov.f32 %f127, %f128; - st.global.v4.f32 [%rd65+0], {%f36,%f35,%f34,%f127}; -$Lt_0_30210: -$Lt_0_20994: - .loc 16 128 0 - exit; -$LDWend_kernel_pair: - } // kernel_pair - - .entry kernel_pair_fast ( - .param .u64 __cudaparm_kernel_pair_fast_x_, - .param .u64 __cudaparm_kernel_pair_fast_lj1_in, - .param .u64 __cudaparm_kernel_pair_fast_lj3_in, - .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, - .param .u64 __cudaparm_kernel_pair_fast_dev_packed, - .param .u64 __cudaparm_kernel_pair_fast_ans, - .param .u64 __cudaparm_kernel_pair_fast___val_paramengv, - .param .s32 __cudaparm_kernel_pair_fast_eflag, - .param .s32 __cudaparm_kernel_pair_fast_vflag, - .param .s32 __cudaparm_kernel_pair_fast_inum, - .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, - .param .u64 __cudaparm_kernel_pair_fast_q_, - .param .u64 __cudaparm_kernel_pair_fast__cutsq, - .param .f32 __cudaparm_kernel_pair_fast_qqrd2e, - .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) - { - .reg .u32 %r<88>; - .reg .u64 %rd<83>; - .reg .f32 %f<134>; - .reg .pred %p<24>; - .shared .align 4 .b8 __cuda___cuda_local_var_32646_33_non_const_sp_lj3320[32]; - .shared .align 16 .b8 __cuda___cuda_local_var_32643_34_non_const_lj13360[1936]; - .shared .align 4 .b8 __cuda___cuda_local_var_32645_33_non_const_cutsq5296[484]; - .shared .align 16 .b8 __cuda___cuda_local_var_32644_34_non_const_lj35792[1936]; - .shared .align 4 .b8 __cuda___cuda_local_var_32733_55_non_const_red_acc7728[3072]; - // __cuda_local_var_32658_10_non_const_f = 48 - // __cuda_local_var_32660_9_non_const_virial = 16 - // __cuda_local_var_32696_43_non_const_r6inv = 40 - .loc 16 138 0 -$LDWbegin_kernel_pair_fast: - cvt.s32.u32 %r1, %tid.x; - mov.u32 %r2, 7; - setp.gt.s32 %p1, %r1, %r2; - @%p1 bra $Lt_1_23298; - .loc 16 147 0 - mov.u64 %rd1, __cuda___cuda_local_var_32646_33_non_const_sp_lj3320; - cvt.s64.s32 %rd2, %r1; - mul.wide.s32 %rd3, %r1, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f32 %f1, [%rd5+0]; - add.u64 %rd6, %rd3, %rd1; - st.shared.f32 [%rd6+0], %f1; -$Lt_1_23298: - mov.u64 %rd1, __cuda___cuda_local_var_32646_33_non_const_sp_lj3320; - mov.u32 %r3, 120; - setp.gt.s32 %p2, %r1, %r3; - @%p2 bra $Lt_1_23810; - .loc 16 149 0 - mov.u64 %rd7, __cuda___cuda_local_var_32643_34_non_const_lj13360; - mov.u64 %rd8, __cuda___cuda_local_var_32645_33_non_const_cutsq5296; - cvt.s64.s32 %rd9, %r1; - mul.wide.s32 %rd10, %r1, 16; - ld.param.u64 %rd11, [__cudaparm_kernel_pair_fast_lj1_in]; - add.u64 %rd12, %rd11, %rd10; - add.u64 %rd13, %rd10, %rd7; - ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd12+0]; - st.shared.v4.f32 [%rd13+0], {%f2,%f3,%f4,%f5}; - .loc 16 150 0 - mul.wide.s32 %rd14, %r1, 4; - ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast__cutsq]; - add.u64 %rd16, %rd15, %rd14; - ld.global.f32 %f6, [%rd16+0]; - add.u64 %rd17, %rd14, %rd8; - st.shared.f32 [%rd17+0], %f6; - ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r5, 0; - setp.le.s32 %p3, %r4, %r5; - @%p3 bra $Lt_1_24322; - .loc 16 152 0 - mov.u64 %rd18, __cuda___cuda_local_var_32644_34_non_const_lj35792; - ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_lj3_in]; - add.u64 %rd20, %rd19, %rd10; - add.u64 %rd21, %rd10, %rd18; - ld.global.v4.f32 {%f7,%f8,%f9,%f10}, [%rd20+0]; - st.shared.v4.f32 [%rd21+0], {%f7,%f8,%f9,%f10}; -$Lt_1_24322: - mov.u64 %rd18, __cuda___cuda_local_var_32644_34_non_const_lj35792; -$Lt_1_23810: - mov.u64 %rd7, __cuda___cuda_local_var_32643_34_non_const_lj13360; - mov.u64 %rd8, __cuda___cuda_local_var_32645_33_non_const_cutsq5296; - mov.u64 %rd18, __cuda___cuda_local_var_32644_34_non_const_lj35792; - .loc 16 161 0 - mov.f32 %f11, 0f00000000; // 0 - mov.f32 %f12, %f11; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, %f13; - mov.f32 %f15, 0f00000000; // 0 - mov.f32 %f16, %f15; - mov.f32 %f17, 0f00000000; // 0 - mov.f32 %f18, %f17; - mov.f32 %f19, 0f00000000; // 0 - mov.f32 %f20, %f19; - mov.f32 %f21, 0f00000000; // 0 - mov.f32 %f22, %f21; - .loc 16 163 0 - bar.sync 0; - ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom]; - div.s32 %r7, %r1, %r6; - cvt.s32.u32 %r8, %ntid.x; - div.s32 %r9, %r8, %r6; - cvt.s32.u32 %r10, %ctaid.x; - mul.lo.s32 %r11, %r10, %r9; - add.s32 %r12, %r7, %r11; - ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum]; - setp.ge.s32 %p4, %r12, %r13; - @%p4 bra $Lt_1_34050; - .loc 16 168 0 - cvt.s64.s32 %rd22, %r12; - mul.wide.s32 %rd23, %r12, 4; - ld.param.u64 %rd24, [__cudaparm_kernel_pair_fast_dev_nbor]; - add.u64 %rd25, %rd23, %rd24; - ld.global.s32 %r14, [%rd25+0]; - ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch]; - cvt.s64.s32 %rd26, %r15; - mul.wide.s32 %rd27, %r15, 4; - add.u64 %rd28, %rd27, %rd25; - ld.global.s32 %r16, [%rd28+0]; - sub.s32 %r17, %r6, 1; - and.b32 %r18, %r17, %r1; - cvt.s64.s32 %rd29, %r18; - mul.wide.s32 %rd30, %r18, 4; - ld.param.u64 %rd31, [__cudaparm_kernel_pair_fast_dev_packed]; - setp.ne.u64 %p5, %rd31, %rd24; - @%p5 bra $Lt_1_25602; - cvt.s32.s64 %r19, %rd26; - mul.lo.s32 %r20, %r19, %r6; - mov.s32 %r21, %r20; - mul.lo.s32 %r22, %r17, %r12; - add.s32 %r23, %r19, %r22; - cvt.s64.s32 %rd32, %r23; - mul.wide.s32 %rd33, %r23, 4; - add.u64 %rd34, %rd28, %rd33; - and.b32 %r24, %r17, %r16; - cvt.s64.s32 %rd35, %r24; - div.s32 %r25, %r16, %r6; - mul.lo.s32 %r26, %r20, %r25; - cvt.s64.s32 %rd36, %r26; - add.u64 %rd37, %rd35, %rd36; - mul.lo.u64 %rd38, %rd37, 4; - add.u64 %rd39, %rd34, %rd38; - add.u64 %rd40, %rd30, %rd34; - bra.uni $Lt_1_25346; -$Lt_1_25602: - add.u64 %rd41, %rd27, %rd28; - ld.global.s32 %r27, [%rd41+0]; - cvt.s64.s32 %rd42, %r27; - mul.wide.s32 %rd43, %r27, 4; - add.u64 %rd44, %rd31, %rd43; - cvt.s64.s32 %rd45, %r16; - mul.wide.s32 %rd46, %r16, 4; - add.u64 %rd39, %rd44, %rd46; - mov.s32 %r21, %r6; - add.u64 %rd40, %rd30, %rd44; -$Lt_1_25346: - .loc 16 171 0 - mov.u32 %r28, %r14; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - mov.s32 %r31, 0; - mov.u32 %r32, %r31; - mov.s32 %r33, 0; - mov.u32 %r34, %r33; - tex.1d.v4.f32.s32 {%f23,%f24,%f25,%f26},[pos_tex,{%r28,%r30,%r32,%r34}]; - mov.f32 %f27, %f23; - mov.f32 %f28, %f24; - mov.f32 %f29, %f25; - mov.f32 %f30, %f26; - .loc 16 172 0 - mov.u32 %r35, %r14; - mov.s32 %r36, 0; - mov.u32 %r37, %r36; - mov.s32 %r38, 0; - mov.u32 %r39, %r38; - mov.s32 %r40, 0; - mov.u32 %r41, %r40; - tex.1d.v4.f32.s32 {%f31,%f32,%f33,%f34},[q_tex,{%r35,%r37,%r39,%r41}]; - mov.f32 %f35, %f31; - setp.ge.u64 %p6, %rd40, %rd39; - @%p6 bra $Lt_1_35586; - cvt.rzi.ftz.s32.f32 %r42, %f30; - cvt.s64.s32 %rd47, %r21; - mul.lo.s32 %r43, %r42, 11; - cvt.rn.f32.s32 %f36, %r43; - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 - mov.f32 %f39, 0f00000000; // 0 - mov.f32 %f40, 0f00000000; // 0 - mov.f32 %f41, 0f00000000; // 0 -$Lt_1_26370: - // Loop body line 172, nesting depth: 1, estimated iterations: unknown - .loc 16 177 0 - ld.global.s32 %r44, [%rd40+0]; - .loc 16 180 0 - shr.s32 %r45, %r44, 30; - and.b32 %r46, %r45, 3; - cvt.s64.s32 %rd48, %r46; - mul.wide.s32 %rd49, %r46, 4; - add.u64 %rd50, %rd1, %rd49; - ld.shared.f32 %f42, [%rd50+0]; - .loc 16 184 0 - and.b32 %r47, %r44, 1073741823; - mov.u32 %r48, %r47; - mov.s32 %r49, 0; - mov.u32 %r50, %r49; - mov.s32 %r51, 0; - mov.u32 %r52, %r51; - mov.s32 %r53, 0; - mov.u32 %r54, %r53; - tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r48,%r50,%r52,%r54}]; - mov.f32 %f47, %f43; - mov.f32 %f48, %f44; - mov.f32 %f49, %f45; - mov.f32 %f50, %f46; - sub.ftz.f32 %f51, %f28, %f48; - sub.ftz.f32 %f52, %f27, %f47; - sub.ftz.f32 %f53, %f29, %f49; - mul.ftz.f32 %f54, %f51, %f51; - fma.rn.ftz.f32 %f55, %f52, %f52, %f54; - fma.rn.ftz.f32 %f56, %f53, %f53, %f55; - add.ftz.f32 %f57, %f36, %f50; - cvt.rzi.ftz.s32.f32 %r55, %f57; - cvt.s64.s32 %rd51, %r55; - mul.wide.s32 %rd52, %r55, 4; - add.u64 %rd53, %rd8, %rd52; - ld.shared.f32 %f58, [%rd53+0]; - setp.gt.ftz.f32 %p7, %f58, %f56; - @!%p7 bra $Lt_1_29186; - rcp.approx.ftz.f32 %f59, %f56; - mul.lo.u64 %rd54, %rd51, 16; - add.u64 %rd55, %rd54, %rd7; - ld.shared.f32 %f60, [%rd55+8]; - setp.lt.ftz.f32 %p8, %f56, %f60; - @!%p8 bra $Lt_1_27394; - .loc 16 198 0 - mul.ftz.f32 %f61, %f59, %f59; - mul.ftz.f32 %f62, %f59, %f61; - mov.f32 %f63, %f62; - .loc 16 199 0 - mul.ftz.f32 %f64, %f62, %f42; - ld.shared.v2.f32 {%f65,%f66}, [%rd55+0]; - mul.ftz.f32 %f67, %f65, %f62; - sub.ftz.f32 %f68, %f67, %f66; - mul.ftz.f32 %f69, %f64, %f68; - bra.uni $Lt_1_27138; -$Lt_1_27394: - .loc 16 201 0 - mov.f32 %f69, 0f00000000; // 0 -$Lt_1_27138: - ld.shared.f32 %f70, [%rd55+12]; - setp.gt.ftz.f32 %p9, %f70, %f56; - @!%p9 bra $Lt_1_27906; - .loc 16 204 0 - mov.u32 %r56, %r47; - mov.s32 %r57, 0; - mov.u32 %r58, %r57; - mov.s32 %r59, 0; - mov.u32 %r60, %r59; - mov.s32 %r61, 0; - mov.u32 %r62, %r61; - tex.1d.v4.f32.s32 {%f71,%f72,%f73,%f74},[q_tex,{%r56,%r58,%r60,%r62}]; - mov.f32 %f75, %f71; - ld.shared.f32 %f76, [%rd50+16]; - ld.param.f32 %f77, [__cudaparm_kernel_pair_fast_qqrd2e]; - mul.ftz.f32 %f78, %f77, %f35; - mul.ftz.f32 %f79, %f75, %f78; - rsqrt.approx.ftz.f32 %f80, %f56; - mul.ftz.f32 %f81, %f79, %f80; - mul.ftz.f32 %f82, %f76, %f81; - bra.uni $Lt_1_27650; -$Lt_1_27906: - .loc 16 206 0 - mov.f32 %f82, 0f00000000; // 0 -$Lt_1_27650: - .loc 16 210 0 - add.ftz.f32 %f83, %f82, %f69; - mul.ftz.f32 %f84, %f83, %f59; - fma.rn.ftz.f32 %f39, %f52, %f84, %f39; - .loc 16 211 0 - fma.rn.ftz.f32 %f38, %f51, %f84, %f38; - .loc 16 212 0 - fma.rn.ftz.f32 %f37, %f53, %f84, %f37; - ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r64, 0; - setp.le.s32 %p10, %r63, %r64; - @%p10 bra $Lt_1_28674; - .loc 16 215 0 - add.ftz.f32 %f40, %f82, %f40; - @!%p8 bra $Lt_1_28674; - .loc 16 217 0 - add.u64 %rd56, %rd54, %rd18; - mov.f32 %f85, %f63; - ld.shared.v4.f32 {%f86,%f87,%f88,_}, [%rd56+0]; - mul.ftz.f32 %f89, %f86, %f85; - sub.ftz.f32 %f90, %f89, %f87; - mul.ftz.f32 %f91, %f85, %f90; - .loc 16 218 0 - sub.ftz.f32 %f92, %f91, %f88; - fma.rn.ftz.f32 %f41, %f42, %f92, %f41; -$Lt_1_28674: -$Lt_1_28162: - ld.param.s32 %r65, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r66, 0; - setp.le.s32 %p11, %r65, %r66; - @%p11 bra $Lt_1_29186; - .loc 16 222 0 - mov.f32 %f93, %f12; - mul.ftz.f32 %f94, %f52, %f52; - fma.rn.ftz.f32 %f95, %f84, %f94, %f93; - mov.f32 %f12, %f95; - .loc 16 223 0 - mov.f32 %f96, %f14; - fma.rn.ftz.f32 %f97, %f84, %f54, %f96; - mov.f32 %f14, %f97; - .loc 16 224 0 - mov.f32 %f98, %f16; - mul.ftz.f32 %f99, %f53, %f53; - fma.rn.ftz.f32 %f100, %f84, %f99, %f98; - mov.f32 %f16, %f100; - .loc 16 225 0 - mov.f32 %f101, %f18; - mul.ftz.f32 %f102, %f51, %f52; - fma.rn.ftz.f32 %f103, %f84, %f102, %f101; - mov.f32 %f18, %f103; - .loc 16 226 0 - mov.f32 %f104, %f20; - mul.ftz.f32 %f105, %f52, %f53; - fma.rn.ftz.f32 %f106, %f84, %f105, %f104; - mov.f32 %f20, %f106; - .loc 16 227 0 - mul.ftz.f32 %f107, %f51, %f53; - fma.rn.ftz.f32 %f21, %f84, %f107, %f21; - mov.f32 %f22, %f21; -$Lt_1_29186: -$Lt_1_26626: - .loc 16 176 0 - mul.lo.u64 %rd57, %rd47, 4; - add.u64 %rd40, %rd40, %rd57; - setp.lt.u64 %p12, %rd40, %rd39; - @%p12 bra $Lt_1_26370; - bra.uni $Lt_1_25858; -$Lt_1_35586: - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 - mov.f32 %f39, 0f00000000; // 0 - mov.f32 %f40, 0f00000000; // 0 - mov.f32 %f41, 0f00000000; // 0 -$Lt_1_25858: - mov.u32 %r67, 1; - setp.le.s32 %p13, %r6, %r67; - @%p13 bra $Lt_1_32002; - .loc 16 232 0 - mov.u64 %rd58, __cuda___cuda_local_var_32733_55_non_const_red_acc7728; - cvt.s64.s32 %rd59, %r1; - mul.wide.s32 %rd60, %r1, 4; - add.u64 %rd61, %rd58, %rd60; - mov.f32 %f108, %f39; - st.shared.f32 [%rd61+0], %f108; - mov.f32 %f109, %f38; - st.shared.f32 [%rd61+512], %f109; - mov.f32 %f110, %f37; - st.shared.f32 [%rd61+1024], %f110; - mov.f32 %f111, %f41; - st.shared.f32 [%rd61+1536], %f111; - mov.f32 %f112, %f40; - st.shared.f32 [%rd61+2048], %f112; - shr.s32 %r68, %r6, 31; - mov.s32 %r69, 1; - and.b32 %r70, %r68, %r69; - add.s32 %r71, %r70, %r6; - shr.s32 %r72, %r71, 1; - mov.s32 %r73, %r72; - mov.u32 %r74, 0; - setp.ne.u32 %p14, %r72, %r74; - @!%p14 bra $Lt_1_30466; -$Lt_1_30978: - setp.ge.u32 %p15, %r18, %r73; - @%p15 bra $Lt_1_31234; - add.u32 %r75, %r1, %r73; - cvt.u64.u32 %rd62, %r75; - mul.wide.u32 %rd63, %r75, 4; - add.u64 %rd64, %rd58, %rd63; - ld.shared.f32 %f113, [%rd64+0]; - add.ftz.f32 %f108, %f113, %f108; - st.shared.f32 [%rd61+0], %f108; - ld.shared.f32 %f114, [%rd64+512]; - add.ftz.f32 %f109, %f114, %f109; - st.shared.f32 [%rd61+512], %f109; - ld.shared.f32 %f115, [%rd64+1024]; - add.ftz.f32 %f110, %f115, %f110; - st.shared.f32 [%rd61+1024], %f110; - ld.shared.f32 %f116, [%rd64+1536]; - add.ftz.f32 %f111, %f116, %f111; - st.shared.f32 [%rd61+1536], %f111; - ld.shared.f32 %f117, [%rd64+2048]; - add.ftz.f32 %f112, %f117, %f112; - st.shared.f32 [%rd61+2048], %f112; -$Lt_1_31234: - shr.u32 %r73, %r73, 1; - mov.u32 %r76, 0; - setp.ne.u32 %p16, %r73, %r76; - @%p16 bra $Lt_1_30978; -$Lt_1_30466: - mov.f32 %f39, %f108; - mov.f32 %f38, %f109; - mov.f32 %f37, %f110; - mov.f32 %f41, %f111; - mov.f32 %f40, %f112; - ld.param.s32 %r77, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r78, 0; - setp.le.s32 %p17, %r77, %r78; - @%p17 bra $Lt_1_32002; - mov.f32 %f108, %f12; - st.shared.f32 [%rd61+0], %f108; - mov.f32 %f109, %f14; - st.shared.f32 [%rd61+512], %f109; - mov.f32 %f110, %f16; - st.shared.f32 [%rd61+1024], %f110; - mov.f32 %f111, %f18; - st.shared.f32 [%rd61+1536], %f111; - mov.f32 %f112, %f20; - st.shared.f32 [%rd61+2048], %f112; - mov.f32 %f118, %f21; - st.shared.f32 [%rd61+2560], %f118; - mov.s32 %r79, %r72; - @!%p14 bra $Lt_1_32514; -$Lt_1_33026: - setp.ge.u32 %p18, %r18, %r79; - @%p18 bra $Lt_1_33282; - add.u32 %r80, %r1, %r79; - cvt.u64.u32 %rd65, %r80; - mul.wide.u32 %rd66, %r80, 4; - add.u64 %rd67, %rd58, %rd66; - ld.shared.f32 %f119, [%rd67+0]; - add.ftz.f32 %f108, %f119, %f108; - st.shared.f32 [%rd61+0], %f108; - ld.shared.f32 %f120, [%rd67+512]; - add.ftz.f32 %f109, %f120, %f109; - st.shared.f32 [%rd61+512], %f109; - ld.shared.f32 %f121, [%rd67+1024]; - add.ftz.f32 %f110, %f121, %f110; - st.shared.f32 [%rd61+1024], %f110; - ld.shared.f32 %f122, [%rd67+1536]; - add.ftz.f32 %f111, %f122, %f111; - st.shared.f32 [%rd61+1536], %f111; - ld.shared.f32 %f123, [%rd67+2048]; - add.ftz.f32 %f112, %f123, %f112; - st.shared.f32 [%rd61+2048], %f112; - ld.shared.f32 %f124, [%rd67+2560]; - add.ftz.f32 %f118, %f124, %f118; - st.shared.f32 [%rd61+2560], %f118; -$Lt_1_33282: - shr.u32 %r79, %r79, 1; - mov.u32 %r81, 0; - setp.ne.u32 %p19, %r79, %r81; - @%p19 bra $Lt_1_33026; -$Lt_1_32514: - mov.f32 %f12, %f108; - mov.f32 %f14, %f109; - mov.f32 %f16, %f110; - mov.f32 %f18, %f111; - mov.f32 %f20, %f112; - mov.f32 %f22, %f118; -$Lt_1_32002: -$Lt_1_29954: - mov.u32 %r82, 0; - setp.ne.s32 %p20, %r18, %r82; - @%p20 bra $Lt_1_34050; - ld.param.u64 %rd68, [__cudaparm_kernel_pair_fast___val_paramengv]; - add.u64 %rd69, %rd68, %rd23; - ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r84, 0; - setp.le.s32 %p21, %r83, %r84; - @%p21 bra $Lt_1_34562; - st.global.f32 [%rd69+0], %f41; - cvt.s64.s32 %rd70, %r13; - mul.wide.s32 %rd71, %r13, 4; - add.u64 %rd72, %rd71, %rd69; - st.global.f32 [%rd72+0], %f40; - add.u64 %rd69, %rd71, %rd72; -$Lt_1_34562: - ld.param.s32 %r85, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r86, 0; - setp.le.s32 %p22, %r85, %r86; - @%p22 bra $Lt_1_35074; - mov.f32 %f125, %f12; - st.global.f32 [%rd69+0], %f125; - cvt.s64.s32 %rd73, %r13; - mul.wide.s32 %rd74, %r13, 4; - add.u64 %rd75, %rd74, %rd69; - mov.f32 %f126, %f14; - st.global.f32 [%rd75+0], %f126; - add.u64 %rd76, %rd74, %rd75; - mov.f32 %f127, %f16; - st.global.f32 [%rd76+0], %f127; - add.u64 %rd77, %rd74, %rd76; - mov.f32 %f128, %f18; - st.global.f32 [%rd77+0], %f128; - add.u64 %rd69, %rd74, %rd77; - mov.f32 %f129, %f20; - st.global.f32 [%rd69+0], %f129; - mov.f32 %f130, %f22; - add.u64 %rd78, %rd74, %rd69; - st.global.f32 [%rd78+0], %f130; -$Lt_1_35074: - ld.param.u64 %rd79, [__cudaparm_kernel_pair_fast_ans]; - mul.lo.u64 %rd80, %rd22, 16; - add.u64 %rd81, %rd79, %rd80; - mov.f32 %f131, %f132; - st.global.v4.f32 [%rd81+0], {%f39,%f38,%f37,%f131}; -$Lt_1_34050: -$Lt_1_24834: - .loc 16 235 0 - exit; -$LDWend_kernel_pair_fast: - } // kernel_pair_fast - diff --git a/lib/gpu/lj_coul_long.ptx b/lib/gpu/lj_coul_long.ptx deleted file mode 100644 index d824a1a1cf..0000000000 --- a/lib/gpu/lj_coul_long.ptx +++ /dev/null @@ -1,1123 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009d58_00000000-9_lal_lj_coul_long.cpp3.i (/home/sjplimp/ccBI#.SRe06h) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009d58_00000000-8_lal_lj_coul_long.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_lj_coul_long.cu" - .file 17 "/usr/local/cuda/include/common_functions.h" - .file 18 "/usr/local/cuda/include/math_functions.h" - .file 19 "/usr/local/cuda/include/math_constants.h" - .file 20 "/usr/local/cuda/include/device_functions.h" - .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - .global .texref pos_tex; - .global .texref q_tex; - - .entry kernel_pair ( - .param .u64 __cudaparm_kernel_pair_x_, - .param .u64 __cudaparm_kernel_pair_lj1, - .param .u64 __cudaparm_kernel_pair_lj3, - .param .s32 __cudaparm_kernel_pair_lj_types, - .param .u64 __cudaparm_kernel_pair_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_dev_nbor, - .param .u64 __cudaparm_kernel_pair_dev_packed, - .param .u64 __cudaparm_kernel_pair_ans, - .param .u64 __cudaparm_kernel_pair___val_paramengv, - .param .s32 __cudaparm_kernel_pair_eflag, - .param .s32 __cudaparm_kernel_pair_vflag, - .param .s32 __cudaparm_kernel_pair_inum, - .param .s32 __cudaparm_kernel_pair_nbor_pitch, - .param .u64 __cudaparm_kernel_pair_q_, - .param .f32 __cudaparm_kernel_pair_cut_coulsq, - .param .f32 __cudaparm_kernel_pair_qqrd2e, - .param .f32 __cudaparm_kernel_pair_g_ewald, - .param .s32 __cudaparm_kernel_pair_t_per_atom) - { - .reg .u32 %r<86>; - .reg .u64 %rd<64>; - .reg .f32 %f<164>; - .reg .pred %p<21>; - .shared .align 16 .b8 __cuda___cuda_local_var_32541_33_non_const_sp_lj112[32]; - .shared .align 4 .b8 __cuda___cuda_local_var_32633_55_non_const_red_acc144[3072]; - // __cuda_local_var_32553_10_non_const_f = 64 - // __cuda_local_var_32555_9_non_const_virial = 16 - // __cuda_local_var_32589_43_non_const_r6inv = 40 - // __cuda_local_var_32589_50_non_const_prefactor = 48 - // __cuda_local_var_32589_61_non_const__erfc = 44 - .loc 16 36 0 -$LDWbegin_kernel_pair: - .loc 16 41 0 - ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; - ldu.global.f32 %f1, [%rd1+0]; - .loc 16 42 0 - ld.global.f32 %f2, [%rd1+4]; - .loc 16 43 0 - ld.global.f32 %f3, [%rd1+8]; - .loc 16 44 0 - ld.global.f32 %f4, [%rd1+12]; - st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_lj112+0], {%f1,%f2,%f3,%f4}; - .loc 16 45 0 - ld.global.f32 %f5, [%rd1+16]; - .loc 16 46 0 - ld.global.f32 %f6, [%rd1+20]; - .loc 16 47 0 - ld.global.f32 %f7, [%rd1+24]; - .loc 16 48 0 - ld.global.f32 %f8, [%rd1+28]; - st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_lj112+16], {%f5,%f6,%f7,%f8}; - .loc 16 56 0 - mov.f32 %f9, 0f00000000; // 0 - mov.f32 %f10, %f9; - mov.f32 %f11, 0f00000000; // 0 - mov.f32 %f12, %f11; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, %f13; - mov.f32 %f15, 0f00000000; // 0 - mov.f32 %f16, %f15; - mov.f32 %f17, 0f00000000; // 0 - mov.f32 %f18, %f17; - mov.f32 %f19, 0f00000000; // 0 - mov.f32 %f20, %f19; - ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_pair_inum]; - setp.ge.s32 %p1, %r8, %r9; - @%p1 bra $Lt_0_30978; - .loc 16 61 0 - cvt.s64.s32 %rd2, %r8; - mul.wide.s32 %rd3, %r8, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor]; - add.u64 %rd5, %rd3, %rd4; - ld.global.s32 %r10, [%rd5+0]; - ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch]; - cvt.s64.s32 %rd6, %r11; - mul.wide.s32 %rd7, %r11, 4; - add.u64 %rd8, %rd7, %rd5; - ld.global.s32 %r12, [%rd8+0]; - sub.s32 %r13, %r1, 1; - and.b32 %r14, %r13, %r2; - cvt.s64.s32 %rd9, %r14; - mul.wide.s32 %rd10, %r14, 4; - ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed]; - setp.ne.u64 %p2, %rd11, %rd4; - @%p2 bra $Lt_0_22530; - cvt.s32.s64 %r15, %rd6; - mul.lo.s32 %r16, %r15, %r1; - mov.s32 %r17, %r16; - mul.lo.s32 %r18, %r13, %r8; - add.s32 %r19, %r15, %r18; - cvt.s64.s32 %rd12, %r19; - mul.wide.s32 %rd13, %r19, 4; - add.u64 %rd14, %rd8, %rd13; - and.b32 %r20, %r13, %r12; - cvt.s64.s32 %rd15, %r20; - div.s32 %r21, %r12, %r1; - mul.lo.s32 %r22, %r16, %r21; - cvt.s64.s32 %rd16, %r22; - add.u64 %rd17, %rd15, %rd16; - mul.lo.u64 %rd18, %rd17, 4; - add.u64 %rd19, %rd14, %rd18; - add.u64 %rd20, %rd10, %rd14; - bra.uni $Lt_0_22274; -$Lt_0_22530: - add.u64 %rd21, %rd7, %rd8; - ld.global.s32 %r23, [%rd21+0]; - cvt.s64.s32 %rd22, %r23; - mul.wide.s32 %rd23, %r23, 4; - add.u64 %rd24, %rd11, %rd23; - cvt.s64.s32 %rd25, %r12; - mul.wide.s32 %rd26, %r12, 4; - add.u64 %rd19, %rd24, %rd26; - mov.s32 %r17, %r1; - add.u64 %rd20, %rd10, %rd24; -$Lt_0_22274: - .loc 16 64 0 - mov.u32 %r24, %r10; - mov.s32 %r25, 0; - mov.u32 %r26, %r25; - mov.s32 %r27, 0; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r24,%r26,%r28,%r30}]; - mov.f32 %f25, %f21; - mov.f32 %f26, %f22; - mov.f32 %f27, %f23; - mov.f32 %f28, %f24; - .loc 16 65 0 - mov.u32 %r31, %r10; - mov.s32 %r32, 0; - mov.u32 %r33, %r32; - mov.s32 %r34, 0; - mov.u32 %r35, %r34; - mov.s32 %r36, 0; - mov.u32 %r37, %r36; - tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r31,%r33,%r35,%r37}]; - mov.f32 %f33, %f29; - setp.ge.u64 %p3, %rd20, %rd19; - @%p3 bra $Lt_0_32514; - cvt.rzi.ftz.s32.f32 %r38, %f28; - cvt.s64.s32 %rd27, %r17; - ld.param.s32 %r39, [__cudaparm_kernel_pair_lj_types]; - mul.lo.s32 %r40, %r39, %r38; - ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1]; - mov.f32 %f34, 0f00000000; // 0 - mov.f32 %f35, 0f00000000; // 0 - mov.f32 %f36, 0f00000000; // 0 - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 - mov.u64 %rd29, __cuda___cuda_local_var_32541_33_non_const_sp_lj112; -$Lt_0_23298: - // Loop body line 65, nesting depth: 1, estimated iterations: unknown - .loc 16 69 0 - ld.global.s32 %r41, [%rd20+0]; - .loc 16 72 0 - shr.s32 %r42, %r41, 30; - and.b32 %r43, %r42, 3; - cvt.s64.s32 %rd30, %r43; - mul.wide.s32 %rd31, %r43, 4; - add.u64 %rd32, %rd29, %rd31; - ld.shared.f32 %f39, [%rd32+0]; - .loc 16 73 0 - mov.f32 %f40, 0f3f800000; // 1 - ld.shared.f32 %f41, [%rd32+16]; - sub.ftz.f32 %f42, %f40, %f41; - .loc 16 76 0 - and.b32 %r44, %r41, 1073741823; - mov.u32 %r45, %r44; - mov.s32 %r46, 0; - mov.u32 %r47, %r46; - mov.s32 %r48, 0; - mov.u32 %r49, %r48; - mov.s32 %r50, 0; - mov.u32 %r51, %r50; - tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r45,%r47,%r49,%r51}]; - mov.f32 %f47, %f43; - mov.f32 %f48, %f44; - mov.f32 %f49, %f45; - mov.f32 %f50, %f46; - cvt.rzi.ftz.s32.f32 %r52, %f50; - sub.ftz.f32 %f51, %f26, %f48; - sub.ftz.f32 %f52, %f25, %f47; - sub.ftz.f32 %f53, %f27, %f49; - mul.ftz.f32 %f54, %f51, %f51; - fma.rn.ftz.f32 %f55, %f52, %f52, %f54; - fma.rn.ftz.f32 %f56, %f53, %f53, %f55; - add.s32 %r53, %r52, %r40; - cvt.s64.s32 %rd33, %r53; - mul.wide.s32 %rd34, %r53, 16; - add.u64 %rd35, %rd34, %rd28; - ld.global.f32 %f57, [%rd35+8]; - setp.gt.ftz.f32 %p4, %f57, %f56; - @!%p4 bra $Lt_0_26114; - rcp.approx.ftz.f32 %f58, %f56; - ld.global.f32 %f59, [%rd35+12]; - setp.lt.ftz.f32 %p5, %f56, %f59; - @!%p5 bra $Lt_0_24322; - .loc 16 91 0 - mul.ftz.f32 %f60, %f58, %f58; - mul.ftz.f32 %f61, %f58, %f60; - mov.f32 %f62, %f61; - .loc 16 92 0 - mul.ftz.f32 %f63, %f61, %f39; - ld.global.v2.f32 {%f64,%f65}, [%rd35+0]; - mul.ftz.f32 %f66, %f64, %f61; - sub.ftz.f32 %f67, %f66, %f65; - mul.ftz.f32 %f68, %f63, %f67; - bra.uni $Lt_0_24066; -$Lt_0_24322: - .loc 16 94 0 - mov.f32 %f68, 0f00000000; // 0 -$Lt_0_24066: - ld.param.f32 %f69, [__cudaparm_kernel_pair_cut_coulsq]; - setp.gt.ftz.f32 %p6, %f69, %f56; - @!%p6 bra $Lt_0_24834; - .loc 20 518 0 - rsqrt.approx.ftz.f32 %f70, %f58; - ld.param.f32 %f71, [__cudaparm_kernel_pair_g_ewald]; - mul.ftz.f32 %f72, %f71, %f70; - mul.ftz.f32 %f73, %f72, %f72; - neg.ftz.f32 %f74, %f73; - mov.f32 %f75, 0f3fb8aa3b; // 1.4427 - mul.ftz.f32 %f76, %f74, %f75; - ex2.approx.ftz.f32 %f77, %f76; - .loc 16 101 0 - mov.f32 %f78, 0f3f800000; // 1 - mov.f32 %f79, 0f3ea7ba05; // 0.327591 - fma.rn.ftz.f32 %f80, %f79, %f72, %f78; - rcp.approx.ftz.f32 %f81, %f80; - mov.f32 %f82, 0f3e827906; // 0.25483 - mov.f32 %f83, 0fbe91a98e; // -0.284497 - mov.f32 %f84, 0f3fb5f0e3; // 1.42141 - mov.f32 %f85, 0fbfba00e3; // -1.45315 - mov.f32 %f86, 0f3f87dc22; // 1.06141 - fma.rn.ftz.f32 %f87, %f86, %f81, %f85; - fma.rn.ftz.f32 %f88, %f81, %f87, %f84; - fma.rn.ftz.f32 %f89, %f81, %f88, %f83; - fma.rn.ftz.f32 %f90, %f81, %f89, %f82; - mul.ftz.f32 %f91, %f81, %f90; - mul.ftz.f32 %f92, %f77, %f91; - mov.f32 %f93, %f92; - .loc 16 102 0 - mov.u32 %r54, %r44; - mov.s32 %r55, 0; - mov.u32 %r56, %r55; - mov.s32 %r57, 0; - mov.u32 %r58, %r57; - mov.s32 %r59, 0; - mov.u32 %r60, %r59; - tex.1d.v4.f32.s32 {%f94,%f95,%f96,%f97},[q_tex,{%r54,%r56,%r58,%r60}]; - mov.f32 %f98, %f94; - ld.param.f32 %f99, [__cudaparm_kernel_pair_qqrd2e]; - mul.ftz.f32 %f100, %f99, %f33; - mul.ftz.f32 %f101, %f100, %f98; - div.approx.ftz.f32 %f102, %f101, %f70; - mov.f32 %f103, %f102; - .loc 16 103 0 - mov.f32 %f104, 0f3f906ebb; // 1.12838 - mul.ftz.f32 %f105, %f72, %f104; - fma.rn.ftz.f32 %f106, %f77, %f105, %f92; - sub.ftz.f32 %f107, %f106, %f42; - mul.ftz.f32 %f108, %f102, %f107; - bra.uni $Lt_0_24578; -$Lt_0_24834: - .loc 16 105 0 - mov.f32 %f108, 0f00000000; // 0 -$Lt_0_24578: - .loc 16 109 0 - add.ftz.f32 %f109, %f108, %f68; - mul.ftz.f32 %f110, %f109, %f58; - fma.rn.ftz.f32 %f36, %f52, %f110, %f36; - .loc 16 110 0 - fma.rn.ftz.f32 %f35, %f51, %f110, %f35; - .loc 16 111 0 - fma.rn.ftz.f32 %f34, %f53, %f110, %f34; - ld.param.s32 %r61, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r62, 0; - setp.le.s32 %p7, %r61, %r62; - @%p7 bra $Lt_0_25602; - .loc 16 114 0 - mov.f32 %f111, %f103; - mov.f32 %f112, %f93; - sub.ftz.f32 %f113, %f112, %f42; - fma.rn.ftz.f32 %f114, %f111, %f113, %f37; - selp.f32 %f37, %f114, %f37, %p6; - @!%p5 bra $Lt_0_25602; - .loc 16 118 0 - ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3]; - add.u64 %rd37, %rd36, %rd34; - mov.f32 %f115, %f62; - ld.global.v4.f32 {%f116,%f117,%f118,_}, [%rd37+0]; - mul.ftz.f32 %f119, %f116, %f115; - sub.ftz.f32 %f120, %f119, %f117; - mul.ftz.f32 %f121, %f115, %f120; - sub.ftz.f32 %f122, %f121, %f118; - fma.rn.ftz.f32 %f38, %f39, %f122, %f38; -$Lt_0_25602: -$Lt_0_25090: - ld.param.s32 %r63, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r64, 0; - setp.le.s32 %p8, %r63, %r64; - @%p8 bra $Lt_0_26114; - .loc 16 122 0 - mov.f32 %f123, %f10; - mul.ftz.f32 %f124, %f52, %f52; - fma.rn.ftz.f32 %f125, %f110, %f124, %f123; - mov.f32 %f10, %f125; - .loc 16 123 0 - mov.f32 %f126, %f12; - fma.rn.ftz.f32 %f127, %f110, %f54, %f126; - mov.f32 %f12, %f127; - .loc 16 124 0 - mov.f32 %f128, %f14; - mul.ftz.f32 %f129, %f53, %f53; - fma.rn.ftz.f32 %f130, %f110, %f129, %f128; - mov.f32 %f14, %f130; - .loc 16 125 0 - mov.f32 %f131, %f16; - mul.ftz.f32 %f132, %f51, %f52; - fma.rn.ftz.f32 %f133, %f110, %f132, %f131; - mov.f32 %f16, %f133; - .loc 16 126 0 - mov.f32 %f134, %f18; - mul.ftz.f32 %f135, %f52, %f53; - fma.rn.ftz.f32 %f136, %f110, %f135, %f134; - mov.f32 %f18, %f136; - .loc 16 127 0 - mul.ftz.f32 %f137, %f51, %f53; - fma.rn.ftz.f32 %f19, %f110, %f137, %f19; - mov.f32 %f20, %f19; -$Lt_0_26114: -$Lt_0_23554: - .loc 16 68 0 - mul.lo.u64 %rd38, %rd27, 4; - add.u64 %rd20, %rd20, %rd38; - setp.lt.u64 %p9, %rd20, %rd19; - @%p9 bra $Lt_0_23298; - bra.uni $Lt_0_22786; -$Lt_0_32514: - mov.f32 %f34, 0f00000000; // 0 - mov.f32 %f35, 0f00000000; // 0 - mov.f32 %f36, 0f00000000; // 0 - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 -$Lt_0_22786: - mov.u32 %r65, 1; - setp.le.s32 %p10, %r1, %r65; - @%p10 bra $Lt_0_28930; - .loc 16 132 0 - mov.u64 %rd39, __cuda___cuda_local_var_32633_55_non_const_red_acc144; - cvt.s64.s32 %rd40, %r2; - mul.wide.s32 %rd41, %r2, 4; - add.u64 %rd42, %rd39, %rd41; - mov.f32 %f138, %f36; - st.shared.f32 [%rd42+0], %f138; - mov.f32 %f139, %f35; - st.shared.f32 [%rd42+512], %f139; - mov.f32 %f140, %f34; - st.shared.f32 [%rd42+1024], %f140; - mov.f32 %f141, %f38; - st.shared.f32 [%rd42+1536], %f141; - mov.f32 %f142, %f37; - st.shared.f32 [%rd42+2048], %f142; - shr.s32 %r66, %r1, 31; - mov.s32 %r67, 1; - and.b32 %r68, %r66, %r67; - add.s32 %r69, %r68, %r1; - shr.s32 %r70, %r69, 1; - mov.s32 %r71, %r70; - mov.u32 %r72, 0; - setp.ne.u32 %p11, %r70, %r72; - @!%p11 bra $Lt_0_27394; -$Lt_0_27906: - setp.ge.u32 %p12, %r14, %r71; - @%p12 bra $Lt_0_28162; - add.u32 %r73, %r2, %r71; - cvt.u64.u32 %rd43, %r73; - mul.wide.u32 %rd44, %r73, 4; - add.u64 %rd45, %rd39, %rd44; - ld.shared.f32 %f143, [%rd45+0]; - add.ftz.f32 %f138, %f143, %f138; - st.shared.f32 [%rd42+0], %f138; - ld.shared.f32 %f144, [%rd45+512]; - add.ftz.f32 %f139, %f144, %f139; - st.shared.f32 [%rd42+512], %f139; - ld.shared.f32 %f145, [%rd45+1024]; - add.ftz.f32 %f140, %f145, %f140; - st.shared.f32 [%rd42+1024], %f140; - ld.shared.f32 %f146, [%rd45+1536]; - add.ftz.f32 %f141, %f146, %f141; - st.shared.f32 [%rd42+1536], %f141; - ld.shared.f32 %f147, [%rd45+2048]; - add.ftz.f32 %f142, %f147, %f142; - st.shared.f32 [%rd42+2048], %f142; -$Lt_0_28162: - shr.u32 %r71, %r71, 1; - mov.u32 %r74, 0; - setp.ne.u32 %p13, %r71, %r74; - @%p13 bra $Lt_0_27906; -$Lt_0_27394: - mov.f32 %f36, %f138; - mov.f32 %f35, %f139; - mov.f32 %f34, %f140; - mov.f32 %f38, %f141; - mov.f32 %f37, %f142; - ld.param.s32 %r75, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r76, 0; - setp.le.s32 %p14, %r75, %r76; - @%p14 bra $Lt_0_28930; - mov.f32 %f138, %f10; - st.shared.f32 [%rd42+0], %f138; - mov.f32 %f139, %f12; - st.shared.f32 [%rd42+512], %f139; - mov.f32 %f140, %f14; - st.shared.f32 [%rd42+1024], %f140; - mov.f32 %f141, %f16; - st.shared.f32 [%rd42+1536], %f141; - mov.f32 %f142, %f18; - st.shared.f32 [%rd42+2048], %f142; - mov.f32 %f148, %f19; - st.shared.f32 [%rd42+2560], %f148; - mov.s32 %r77, %r70; - @!%p11 bra $Lt_0_29442; -$Lt_0_29954: - setp.ge.u32 %p15, %r14, %r77; - @%p15 bra $Lt_0_30210; - add.u32 %r78, %r2, %r77; - cvt.u64.u32 %rd46, %r78; - mul.wide.u32 %rd47, %r78, 4; - add.u64 %rd48, %rd39, %rd47; - ld.shared.f32 %f149, [%rd48+0]; - add.ftz.f32 %f138, %f149, %f138; - st.shared.f32 [%rd42+0], %f138; - ld.shared.f32 %f150, [%rd48+512]; - add.ftz.f32 %f139, %f150, %f139; - st.shared.f32 [%rd42+512], %f139; - ld.shared.f32 %f151, [%rd48+1024]; - add.ftz.f32 %f140, %f151, %f140; - st.shared.f32 [%rd42+1024], %f140; - ld.shared.f32 %f152, [%rd48+1536]; - add.ftz.f32 %f141, %f152, %f141; - st.shared.f32 [%rd42+1536], %f141; - ld.shared.f32 %f153, [%rd48+2048]; - add.ftz.f32 %f142, %f153, %f142; - st.shared.f32 [%rd42+2048], %f142; - ld.shared.f32 %f154, [%rd48+2560]; - add.ftz.f32 %f148, %f154, %f148; - st.shared.f32 [%rd42+2560], %f148; -$Lt_0_30210: - shr.u32 %r77, %r77, 1; - mov.u32 %r79, 0; - setp.ne.u32 %p16, %r77, %r79; - @%p16 bra $Lt_0_29954; -$Lt_0_29442: - mov.f32 %f10, %f138; - mov.f32 %f12, %f139; - mov.f32 %f14, %f140; - mov.f32 %f16, %f141; - mov.f32 %f18, %f142; - mov.f32 %f20, %f148; -$Lt_0_28930: -$Lt_0_26882: - mov.u32 %r80, 0; - setp.ne.s32 %p17, %r14, %r80; - @%p17 bra $Lt_0_30978; - ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv]; - add.u64 %rd50, %rd49, %rd3; - ld.param.s32 %r81, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r82, 0; - setp.le.s32 %p18, %r81, %r82; - @%p18 bra $Lt_0_31490; - st.global.f32 [%rd50+0], %f38; - cvt.s64.s32 %rd51, %r9; - mul.wide.s32 %rd52, %r9, 4; - add.u64 %rd53, %rd52, %rd50; - st.global.f32 [%rd53+0], %f37; - add.u64 %rd50, %rd52, %rd53; -$Lt_0_31490: - ld.param.s32 %r83, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r84, 0; - setp.le.s32 %p19, %r83, %r84; - @%p19 bra $Lt_0_32002; - mov.f32 %f155, %f10; - st.global.f32 [%rd50+0], %f155; - cvt.s64.s32 %rd54, %r9; - mul.wide.s32 %rd55, %r9, 4; - add.u64 %rd56, %rd55, %rd50; - mov.f32 %f156, %f12; - st.global.f32 [%rd56+0], %f156; - add.u64 %rd57, %rd55, %rd56; - mov.f32 %f157, %f14; - st.global.f32 [%rd57+0], %f157; - add.u64 %rd58, %rd55, %rd57; - mov.f32 %f158, %f16; - st.global.f32 [%rd58+0], %f158; - add.u64 %rd50, %rd55, %rd58; - mov.f32 %f159, %f18; - st.global.f32 [%rd50+0], %f159; - mov.f32 %f160, %f20; - add.u64 %rd59, %rd55, %rd50; - st.global.f32 [%rd59+0], %f160; -$Lt_0_32002: - ld.param.u64 %rd60, [__cudaparm_kernel_pair_ans]; - mul.lo.u64 %rd61, %rd2, 16; - add.u64 %rd62, %rd60, %rd61; - mov.f32 %f161, %f162; - st.global.v4.f32 [%rd62+0], {%f36,%f35,%f34,%f161}; -$Lt_0_30978: -$Lt_0_21762: - .loc 16 135 0 - exit; -$LDWend_kernel_pair: - } // kernel_pair - - .entry kernel_pair_fast ( - .param .u64 __cudaparm_kernel_pair_fast_x_, - .param .u64 __cudaparm_kernel_pair_fast_lj1_in, - .param .u64 __cudaparm_kernel_pair_fast_lj3_in, - .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, - .param .u64 __cudaparm_kernel_pair_fast_dev_packed, - .param .u64 __cudaparm_kernel_pair_fast_ans, - .param .u64 __cudaparm_kernel_pair_fast___val_paramengv, - .param .s32 __cudaparm_kernel_pair_fast_eflag, - .param .s32 __cudaparm_kernel_pair_fast_vflag, - .param .s32 __cudaparm_kernel_pair_fast_inum, - .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, - .param .u64 __cudaparm_kernel_pair_fast_q_, - .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq, - .param .f32 __cudaparm_kernel_pair_fast_qqrd2e, - .param .f32 __cudaparm_kernel_pair_fast_g_ewald, - .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) - { - .reg .u32 %r<88>; - .reg .u64 %rd<76>; - .reg .f32 %f<167>; - .reg .pred %p<24>; - .shared .align 4 .b8 __cuda___cuda_local_var_32652_33_non_const_sp_lj3320[32]; - .shared .align 16 .b8 __cuda___cuda_local_var_32650_34_non_const_lj13360[1936]; - .shared .align 16 .b8 __cuda___cuda_local_var_32651_34_non_const_lj35296[1936]; - .shared .align 4 .b8 __cuda___cuda_local_var_32745_55_non_const_red_acc7232[3072]; - // __cuda_local_var_32663_10_non_const_f = 64 - // __cuda_local_var_32665_9_non_const_virial = 16 - // __cuda_local_var_32701_43_non_const_r6inv = 40 - // __cuda_local_var_32701_50_non_const_prefactor = 48 - // __cuda_local_var_32701_61_non_const__erfc = 44 - .loc 16 145 0 -$LDWbegin_kernel_pair_fast: - cvt.s32.u32 %r1, %tid.x; - mov.u32 %r2, 7; - setp.gt.s32 %p1, %r1, %r2; - @%p1 bra $Lt_1_24066; - .loc 16 153 0 - mov.u64 %rd1, __cuda___cuda_local_var_32652_33_non_const_sp_lj3320; - cvt.s64.s32 %rd2, %r1; - mul.wide.s32 %rd3, %r1, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f32 %f1, [%rd5+0]; - add.u64 %rd6, %rd3, %rd1; - st.shared.f32 [%rd6+0], %f1; -$Lt_1_24066: - mov.u64 %rd1, __cuda___cuda_local_var_32652_33_non_const_sp_lj3320; - mov.u32 %r3, 120; - setp.gt.s32 %p2, %r1, %r3; - @%p2 bra $Lt_1_24578; - .loc 16 155 0 - mov.u64 %rd7, __cuda___cuda_local_var_32650_34_non_const_lj13360; - cvt.s64.s32 %rd8, %r1; - mul.wide.s32 %rd9, %r1, 16; - ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in]; - add.u64 %rd11, %rd10, %rd9; - add.u64 %rd12, %rd9, %rd7; - ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0]; - st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5}; - ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r5, 0; - setp.le.s32 %p3, %r4, %r5; - @%p3 bra $Lt_1_25090; - .loc 16 157 0 - mov.u64 %rd13, __cuda___cuda_local_var_32651_34_non_const_lj35296; - ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in]; - add.u64 %rd15, %rd14, %rd9; - add.u64 %rd16, %rd9, %rd13; - ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0]; - st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9}; -$Lt_1_25090: - mov.u64 %rd13, __cuda___cuda_local_var_32651_34_non_const_lj35296; -$Lt_1_24578: - mov.u64 %rd13, __cuda___cuda_local_var_32651_34_non_const_lj35296; - mov.u64 %rd7, __cuda___cuda_local_var_32650_34_non_const_lj13360; - .loc 16 166 0 - mov.f32 %f10, 0f00000000; // 0 - mov.f32 %f11, %f10; - mov.f32 %f12, 0f00000000; // 0 - mov.f32 %f13, %f12; - mov.f32 %f14, 0f00000000; // 0 - mov.f32 %f15, %f14; - mov.f32 %f16, 0f00000000; // 0 - mov.f32 %f17, %f16; - mov.f32 %f18, 0f00000000; // 0 - mov.f32 %f19, %f18; - mov.f32 %f20, 0f00000000; // 0 - mov.f32 %f21, %f20; - .loc 16 168 0 - bar.sync 0; - ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom]; - div.s32 %r7, %r1, %r6; - cvt.s32.u32 %r8, %ntid.x; - div.s32 %r9, %r8, %r6; - cvt.s32.u32 %r10, %ctaid.x; - mul.lo.s32 %r11, %r10, %r9; - add.s32 %r12, %r7, %r11; - ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum]; - setp.ge.s32 %p4, %r12, %r13; - @%p4 bra $Lt_1_34818; - .loc 16 173 0 - cvt.s64.s32 %rd17, %r12; - mul.wide.s32 %rd18, %r12, 4; - ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor]; - add.u64 %rd20, %rd18, %rd19; - ld.global.s32 %r14, [%rd20+0]; - ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch]; - cvt.s64.s32 %rd21, %r15; - mul.wide.s32 %rd22, %r15, 4; - add.u64 %rd23, %rd22, %rd20; - ld.global.s32 %r16, [%rd23+0]; - sub.s32 %r17, %r6, 1; - and.b32 %r18, %r17, %r1; - cvt.s64.s32 %rd24, %r18; - mul.wide.s32 %rd25, %r18, 4; - ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed]; - setp.ne.u64 %p5, %rd26, %rd19; - @%p5 bra $Lt_1_26370; - cvt.s32.s64 %r19, %rd21; - mul.lo.s32 %r20, %r19, %r6; - mov.s32 %r21, %r20; - mul.lo.s32 %r22, %r17, %r12; - add.s32 %r23, %r19, %r22; - cvt.s64.s32 %rd27, %r23; - mul.wide.s32 %rd28, %r23, 4; - add.u64 %rd29, %rd23, %rd28; - and.b32 %r24, %r17, %r16; - cvt.s64.s32 %rd30, %r24; - div.s32 %r25, %r16, %r6; - mul.lo.s32 %r26, %r20, %r25; - cvt.s64.s32 %rd31, %r26; - add.u64 %rd32, %rd30, %rd31; - mul.lo.u64 %rd33, %rd32, 4; - add.u64 %rd34, %rd29, %rd33; - add.u64 %rd35, %rd25, %rd29; - bra.uni $Lt_1_26114; -$Lt_1_26370: - add.u64 %rd36, %rd22, %rd23; - ld.global.s32 %r27, [%rd36+0]; - cvt.s64.s32 %rd37, %r27; - mul.wide.s32 %rd38, %r27, 4; - add.u64 %rd39, %rd26, %rd38; - cvt.s64.s32 %rd40, %r16; - mul.wide.s32 %rd41, %r16, 4; - add.u64 %rd34, %rd39, %rd41; - mov.s32 %r21, %r6; - add.u64 %rd35, %rd25, %rd39; -$Lt_1_26114: - .loc 16 176 0 - mov.u32 %r28, %r14; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - mov.s32 %r31, 0; - mov.u32 %r32, %r31; - mov.s32 %r33, 0; - mov.u32 %r34, %r33; - tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}]; - mov.f32 %f26, %f22; - mov.f32 %f27, %f23; - mov.f32 %f28, %f24; - mov.f32 %f29, %f25; - .loc 16 177 0 - mov.u32 %r35, %r14; - mov.s32 %r36, 0; - mov.u32 %r37, %r36; - mov.s32 %r38, 0; - mov.u32 %r39, %r38; - mov.s32 %r40, 0; - mov.u32 %r41, %r40; - tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[q_tex,{%r35,%r37,%r39,%r41}]; - mov.f32 %f34, %f30; - setp.ge.u64 %p6, %rd35, %rd34; - @%p6 bra $Lt_1_36354; - cvt.rzi.ftz.s32.f32 %r42, %f29; - cvt.s64.s32 %rd42, %r21; - mul.lo.s32 %r43, %r42, 11; - cvt.rn.f32.s32 %f35, %r43; - mov.f32 %f36, 0f00000000; // 0 - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 - mov.f32 %f39, 0f00000000; // 0 - mov.f32 %f40, 0f00000000; // 0 -$Lt_1_27138: - // Loop body line 177, nesting depth: 1, estimated iterations: unknown - .loc 16 182 0 - ld.global.s32 %r44, [%rd35+0]; - .loc 16 185 0 - shr.s32 %r45, %r44, 30; - and.b32 %r46, %r45, 3; - cvt.s64.s32 %rd43, %r46; - mul.wide.s32 %rd44, %r46, 4; - add.u64 %rd45, %rd1, %rd44; - ld.shared.f32 %f41, [%rd45+0]; - .loc 16 186 0 - mov.f32 %f42, 0f3f800000; // 1 - ld.shared.f32 %f43, [%rd45+16]; - sub.ftz.f32 %f44, %f42, %f43; - .loc 16 189 0 - and.b32 %r47, %r44, 1073741823; - mov.u32 %r48, %r47; - mov.s32 %r49, 0; - mov.u32 %r50, %r49; - mov.s32 %r51, 0; - mov.u32 %r52, %r51; - mov.s32 %r53, 0; - mov.u32 %r54, %r53; - tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[pos_tex,{%r48,%r50,%r52,%r54}]; - mov.f32 %f49, %f45; - mov.f32 %f50, %f46; - mov.f32 %f51, %f47; - mov.f32 %f52, %f48; - sub.ftz.f32 %f53, %f27, %f50; - sub.ftz.f32 %f54, %f26, %f49; - sub.ftz.f32 %f55, %f28, %f51; - mul.ftz.f32 %f56, %f53, %f53; - fma.rn.ftz.f32 %f57, %f54, %f54, %f56; - fma.rn.ftz.f32 %f58, %f55, %f55, %f57; - add.ftz.f32 %f59, %f35, %f52; - cvt.rzi.ftz.s32.f32 %r55, %f59; - cvt.s64.s32 %rd46, %r55; - mul.wide.s32 %rd47, %r55, 16; - add.u64 %rd48, %rd47, %rd7; - ld.shared.f32 %f60, [%rd48+8]; - setp.gt.ftz.f32 %p7, %f60, %f58; - @!%p7 bra $Lt_1_29954; - rcp.approx.ftz.f32 %f61, %f58; - ld.shared.f32 %f62, [%rd48+12]; - setp.lt.ftz.f32 %p8, %f58, %f62; - @!%p8 bra $Lt_1_28162; - .loc 16 203 0 - mul.ftz.f32 %f63, %f61, %f61; - mul.ftz.f32 %f64, %f61, %f63; - mov.f32 %f65, %f64; - .loc 16 204 0 - mul.ftz.f32 %f66, %f64, %f41; - ld.shared.v2.f32 {%f67,%f68}, [%rd48+0]; - mul.ftz.f32 %f69, %f67, %f64; - sub.ftz.f32 %f70, %f69, %f68; - mul.ftz.f32 %f71, %f66, %f70; - bra.uni $Lt_1_27906; -$Lt_1_28162: - .loc 16 206 0 - mov.f32 %f71, 0f00000000; // 0 -$Lt_1_27906: - ld.param.f32 %f72, [__cudaparm_kernel_pair_fast_cut_coulsq]; - setp.gt.ftz.f32 %p9, %f72, %f58; - @!%p9 bra $Lt_1_28674; - .loc 20 518 0 - rsqrt.approx.ftz.f32 %f73, %f61; - ld.param.f32 %f74, [__cudaparm_kernel_pair_fast_g_ewald]; - mul.ftz.f32 %f75, %f74, %f73; - mul.ftz.f32 %f76, %f75, %f75; - neg.ftz.f32 %f77, %f76; - mov.f32 %f78, 0f3fb8aa3b; // 1.4427 - mul.ftz.f32 %f79, %f77, %f78; - ex2.approx.ftz.f32 %f80, %f79; - .loc 16 213 0 - mov.f32 %f81, 0f3f800000; // 1 - mov.f32 %f82, 0f3ea7ba05; // 0.327591 - fma.rn.ftz.f32 %f83, %f82, %f75, %f81; - rcp.approx.ftz.f32 %f84, %f83; - mov.f32 %f85, 0f3e827906; // 0.25483 - mov.f32 %f86, 0fbe91a98e; // -0.284497 - mov.f32 %f87, 0f3fb5f0e3; // 1.42141 - mov.f32 %f88, 0fbfba00e3; // -1.45315 - mov.f32 %f89, 0f3f87dc22; // 1.06141 - fma.rn.ftz.f32 %f90, %f89, %f84, %f88; - fma.rn.ftz.f32 %f91, %f84, %f90, %f87; - fma.rn.ftz.f32 %f92, %f84, %f91, %f86; - fma.rn.ftz.f32 %f93, %f84, %f92, %f85; - mul.ftz.f32 %f94, %f84, %f93; - mul.ftz.f32 %f95, %f80, %f94; - mov.f32 %f96, %f95; - .loc 16 214 0 - mov.u32 %r56, %r47; - mov.s32 %r57, 0; - mov.u32 %r58, %r57; - mov.s32 %r59, 0; - mov.u32 %r60, %r59; - mov.s32 %r61, 0; - mov.u32 %r62, %r61; - tex.1d.v4.f32.s32 {%f97,%f98,%f99,%f100},[q_tex,{%r56,%r58,%r60,%r62}]; - mov.f32 %f101, %f97; - ld.param.f32 %f102, [__cudaparm_kernel_pair_fast_qqrd2e]; - mul.ftz.f32 %f103, %f102, %f34; - mul.ftz.f32 %f104, %f103, %f101; - div.approx.ftz.f32 %f105, %f104, %f73; - mov.f32 %f106, %f105; - .loc 16 215 0 - mov.f32 %f107, 0f3f906ebb; // 1.12838 - mul.ftz.f32 %f108, %f75, %f107; - fma.rn.ftz.f32 %f109, %f80, %f108, %f95; - sub.ftz.f32 %f110, %f109, %f44; - mul.ftz.f32 %f111, %f105, %f110; - bra.uni $Lt_1_28418; -$Lt_1_28674: - .loc 16 217 0 - mov.f32 %f111, 0f00000000; // 0 -$Lt_1_28418: - .loc 16 221 0 - add.ftz.f32 %f112, %f111, %f71; - mul.ftz.f32 %f113, %f112, %f61; - fma.rn.ftz.f32 %f38, %f54, %f113, %f38; - .loc 16 222 0 - fma.rn.ftz.f32 %f37, %f53, %f113, %f37; - .loc 16 223 0 - fma.rn.ftz.f32 %f36, %f55, %f113, %f36; - ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r64, 0; - setp.le.s32 %p10, %r63, %r64; - @%p10 bra $Lt_1_29442; - .loc 16 226 0 - mov.f32 %f114, %f106; - mov.f32 %f115, %f96; - sub.ftz.f32 %f116, %f115, %f44; - fma.rn.ftz.f32 %f117, %f114, %f116, %f39; - selp.f32 %f39, %f117, %f39, %p9; - @!%p8 bra $Lt_1_29442; - .loc 16 229 0 - add.u64 %rd49, %rd47, %rd13; - mov.f32 %f118, %f65; - ld.shared.v4.f32 {%f119,%f120,%f121,_}, [%rd49+0]; - mul.ftz.f32 %f122, %f119, %f118; - sub.ftz.f32 %f123, %f122, %f120; - mul.ftz.f32 %f124, %f118, %f123; - .loc 16 230 0 - sub.ftz.f32 %f125, %f124, %f121; - fma.rn.ftz.f32 %f40, %f41, %f125, %f40; -$Lt_1_29442: -$Lt_1_28930: - ld.param.s32 %r65, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r66, 0; - setp.le.s32 %p11, %r65, %r66; - @%p11 bra $Lt_1_29954; - .loc 16 234 0 - mov.f32 %f126, %f11; - mul.ftz.f32 %f127, %f54, %f54; - fma.rn.ftz.f32 %f128, %f113, %f127, %f126; - mov.f32 %f11, %f128; - .loc 16 235 0 - mov.f32 %f129, %f13; - fma.rn.ftz.f32 %f130, %f113, %f56, %f129; - mov.f32 %f13, %f130; - .loc 16 236 0 - mov.f32 %f131, %f15; - mul.ftz.f32 %f132, %f55, %f55; - fma.rn.ftz.f32 %f133, %f113, %f132, %f131; - mov.f32 %f15, %f133; - .loc 16 237 0 - mov.f32 %f134, %f17; - mul.ftz.f32 %f135, %f53, %f54; - fma.rn.ftz.f32 %f136, %f113, %f135, %f134; - mov.f32 %f17, %f136; - .loc 16 238 0 - mov.f32 %f137, %f19; - mul.ftz.f32 %f138, %f54, %f55; - fma.rn.ftz.f32 %f139, %f113, %f138, %f137; - mov.f32 %f19, %f139; - .loc 16 239 0 - mul.ftz.f32 %f140, %f53, %f55; - fma.rn.ftz.f32 %f20, %f113, %f140, %f20; - mov.f32 %f21, %f20; -$Lt_1_29954: -$Lt_1_27394: - .loc 16 181 0 - mul.lo.u64 %rd50, %rd42, 4; - add.u64 %rd35, %rd35, %rd50; - setp.lt.u64 %p12, %rd35, %rd34; - @%p12 bra $Lt_1_27138; - bra.uni $Lt_1_26626; -$Lt_1_36354: - mov.f32 %f36, 0f00000000; // 0 - mov.f32 %f37, 0f00000000; // 0 - mov.f32 %f38, 0f00000000; // 0 - mov.f32 %f39, 0f00000000; // 0 - mov.f32 %f40, 0f00000000; // 0 -$Lt_1_26626: - mov.u32 %r67, 1; - setp.le.s32 %p13, %r6, %r67; - @%p13 bra $Lt_1_32770; - .loc 16 244 0 - mov.u64 %rd51, __cuda___cuda_local_var_32745_55_non_const_red_acc7232; - cvt.s64.s32 %rd52, %r1; - mul.wide.s32 %rd53, %r1, 4; - add.u64 %rd54, %rd51, %rd53; - mov.f32 %f141, %f38; - st.shared.f32 [%rd54+0], %f141; - mov.f32 %f142, %f37; - st.shared.f32 [%rd54+512], %f142; - mov.f32 %f143, %f36; - st.shared.f32 [%rd54+1024], %f143; - mov.f32 %f144, %f40; - st.shared.f32 [%rd54+1536], %f144; - mov.f32 %f145, %f39; - st.shared.f32 [%rd54+2048], %f145; - shr.s32 %r68, %r6, 31; - mov.s32 %r69, 1; - and.b32 %r70, %r68, %r69; - add.s32 %r71, %r70, %r6; - shr.s32 %r72, %r71, 1; - mov.s32 %r73, %r72; - mov.u32 %r74, 0; - setp.ne.u32 %p14, %r72, %r74; - @!%p14 bra $Lt_1_31234; -$Lt_1_31746: - setp.ge.u32 %p15, %r18, %r73; - @%p15 bra $Lt_1_32002; - add.u32 %r75, %r1, %r73; - cvt.u64.u32 %rd55, %r75; - mul.wide.u32 %rd56, %r75, 4; - add.u64 %rd57, %rd51, %rd56; - ld.shared.f32 %f146, [%rd57+0]; - add.ftz.f32 %f141, %f146, %f141; - st.shared.f32 [%rd54+0], %f141; - ld.shared.f32 %f147, [%rd57+512]; - add.ftz.f32 %f142, %f147, %f142; - st.shared.f32 [%rd54+512], %f142; - ld.shared.f32 %f148, [%rd57+1024]; - add.ftz.f32 %f143, %f148, %f143; - st.shared.f32 [%rd54+1024], %f143; - ld.shared.f32 %f149, [%rd57+1536]; - add.ftz.f32 %f144, %f149, %f144; - st.shared.f32 [%rd54+1536], %f144; - ld.shared.f32 %f150, [%rd57+2048]; - add.ftz.f32 %f145, %f150, %f145; - st.shared.f32 [%rd54+2048], %f145; -$Lt_1_32002: - shr.u32 %r73, %r73, 1; - mov.u32 %r76, 0; - setp.ne.u32 %p16, %r73, %r76; - @%p16 bra $Lt_1_31746; -$Lt_1_31234: - mov.f32 %f38, %f141; - mov.f32 %f37, %f142; - mov.f32 %f36, %f143; - mov.f32 %f40, %f144; - mov.f32 %f39, %f145; - ld.param.s32 %r77, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r78, 0; - setp.le.s32 %p17, %r77, %r78; - @%p17 bra $Lt_1_32770; - mov.f32 %f141, %f11; - st.shared.f32 [%rd54+0], %f141; - mov.f32 %f142, %f13; - st.shared.f32 [%rd54+512], %f142; - mov.f32 %f143, %f15; - st.shared.f32 [%rd54+1024], %f143; - mov.f32 %f144, %f17; - st.shared.f32 [%rd54+1536], %f144; - mov.f32 %f145, %f19; - st.shared.f32 [%rd54+2048], %f145; - mov.f32 %f151, %f20; - st.shared.f32 [%rd54+2560], %f151; - mov.s32 %r79, %r72; - @!%p14 bra $Lt_1_33282; -$Lt_1_33794: - setp.ge.u32 %p18, %r18, %r79; - @%p18 bra $Lt_1_34050; - add.u32 %r80, %r1, %r79; - cvt.u64.u32 %rd58, %r80; - mul.wide.u32 %rd59, %r80, 4; - add.u64 %rd60, %rd51, %rd59; - ld.shared.f32 %f152, [%rd60+0]; - add.ftz.f32 %f141, %f152, %f141; - st.shared.f32 [%rd54+0], %f141; - ld.shared.f32 %f153, [%rd60+512]; - add.ftz.f32 %f142, %f153, %f142; - st.shared.f32 [%rd54+512], %f142; - ld.shared.f32 %f154, [%rd60+1024]; - add.ftz.f32 %f143, %f154, %f143; - st.shared.f32 [%rd54+1024], %f143; - ld.shared.f32 %f155, [%rd60+1536]; - add.ftz.f32 %f144, %f155, %f144; - st.shared.f32 [%rd54+1536], %f144; - ld.shared.f32 %f156, [%rd60+2048]; - add.ftz.f32 %f145, %f156, %f145; - st.shared.f32 [%rd54+2048], %f145; - ld.shared.f32 %f157, [%rd60+2560]; - add.ftz.f32 %f151, %f157, %f151; - st.shared.f32 [%rd54+2560], %f151; -$Lt_1_34050: - shr.u32 %r79, %r79, 1; - mov.u32 %r81, 0; - setp.ne.u32 %p19, %r79, %r81; - @%p19 bra $Lt_1_33794; -$Lt_1_33282: - mov.f32 %f11, %f141; - mov.f32 %f13, %f142; - mov.f32 %f15, %f143; - mov.f32 %f17, %f144; - mov.f32 %f19, %f145; - mov.f32 %f21, %f151; -$Lt_1_32770: -$Lt_1_30722: - mov.u32 %r82, 0; - setp.ne.s32 %p20, %r18, %r82; - @%p20 bra $Lt_1_34818; - ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv]; - add.u64 %rd62, %rd61, %rd18; - ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r84, 0; - setp.le.s32 %p21, %r83, %r84; - @%p21 bra $Lt_1_35330; - st.global.f32 [%rd62+0], %f40; - cvt.s64.s32 %rd63, %r13; - mul.wide.s32 %rd64, %r13, 4; - add.u64 %rd65, %rd64, %rd62; - st.global.f32 [%rd65+0], %f39; - add.u64 %rd62, %rd64, %rd65; -$Lt_1_35330: - ld.param.s32 %r85, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r86, 0; - setp.le.s32 %p22, %r85, %r86; - @%p22 bra $Lt_1_35842; - mov.f32 %f158, %f11; - st.global.f32 [%rd62+0], %f158; - cvt.s64.s32 %rd66, %r13; - mul.wide.s32 %rd67, %r13, 4; - add.u64 %rd68, %rd67, %rd62; - mov.f32 %f159, %f13; - st.global.f32 [%rd68+0], %f159; - add.u64 %rd69, %rd67, %rd68; - mov.f32 %f160, %f15; - st.global.f32 [%rd69+0], %f160; - add.u64 %rd70, %rd67, %rd69; - mov.f32 %f161, %f17; - st.global.f32 [%rd70+0], %f161; - add.u64 %rd62, %rd67, %rd70; - mov.f32 %f162, %f19; - st.global.f32 [%rd62+0], %f162; - mov.f32 %f163, %f21; - add.u64 %rd71, %rd67, %rd62; - st.global.f32 [%rd71+0], %f163; -$Lt_1_35842: - ld.param.u64 %rd72, [__cudaparm_kernel_pair_fast_ans]; - mul.lo.u64 %rd73, %rd17, 16; - add.u64 %rd74, %rd72, %rd73; - mov.f32 %f164, %f165; - st.global.v4.f32 [%rd74+0], {%f38,%f37,%f36,%f164}; -$Lt_1_34818: -$Lt_1_25602: - .loc 16 247 0 - exit; -$LDWend_kernel_pair_fast: - } // kernel_pair_fast - diff --git a/lib/gpu/lj_coul_long_ptx.h b/lib/gpu/lj_coul_long_ptx.h deleted file mode 100644 index 9e7a32c872..0000000000 --- a/lib/gpu/lj_coul_long_ptx.h +++ /dev/null @@ -1,1065 +0,0 @@ -const char * lj_coul_long = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .global .texref pos_tex;\n" -" .global .texref q_tex;\n" -" .entry kernel_pair (\n" -" .param .u64 __cudaparm_kernel_pair_x_,\n" -" .param .u64 __cudaparm_kernel_pair_lj1,\n" -" .param .u64 __cudaparm_kernel_pair_lj3,\n" -" .param .s32 __cudaparm_kernel_pair_lj_types,\n" -" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_ans,\n" -" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_inum,\n" -" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" -" .param .u64 __cudaparm_kernel_pair_q_,\n" -" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n" -" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n" -" .param .f32 __cudaparm_kernel_pair_g_ewald,\n" -" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" -" {\n" -" .reg .u32 %r<86>;\n" -" .reg .u64 %rd<64>;\n" -" .reg .f32 %f<164>;\n" -" .reg .pred %p<21>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32541_33_non_const_sp_lj112[32];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32633_55_non_const_red_acc144[3072];\n" -" .loc 16 36 0\n" -"$LDWbegin_kernel_pair:\n" -" .loc 16 41 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" -" ldu.global.f32 %f1, [%rd1+0];\n" -" .loc 16 42 0\n" -" ld.global.f32 %f2, [%rd1+4];\n" -" .loc 16 43 0\n" -" ld.global.f32 %f3, [%rd1+8];\n" -" .loc 16 44 0\n" -" ld.global.f32 %f4, [%rd1+12];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_lj112+0], {%f1,%f2,%f3,%f4};\n" -" .loc 16 45 0\n" -" ld.global.f32 %f5, [%rd1+16];\n" -" .loc 16 46 0\n" -" ld.global.f32 %f6, [%rd1+20];\n" -" .loc 16 47 0\n" -" ld.global.f32 %f7, [%rd1+24];\n" -" .loc 16 48 0\n" -" ld.global.f32 %f8, [%rd1+28];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_lj112+16], {%f5,%f6,%f7,%f8};\n" -" .loc 16 56 0\n" -" mov.f32 %f9, 0f00000000; \n" -" mov.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f00000000; \n" -" mov.f32 %f12, %f11;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, %f13;\n" -" mov.f32 %f15, 0f00000000; \n" -" mov.f32 %f16, %f15;\n" -" mov.f32 %f17, 0f00000000; \n" -" mov.f32 %f18, %f17;\n" -" mov.f32 %f19, 0f00000000; \n" -" mov.f32 %f20, %f19;\n" -" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n" -" setp.ge.s32 %p1, %r8, %r9;\n" -" @%p1 bra $Lt_0_30978;\n" -" .loc 16 61 0\n" -" cvt.s64.s32 %rd2, %r8;\n" -" mul.wide.s32 %rd3, %r8, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n" -" add.u64 %rd5, %rd3, %rd4;\n" -" ld.global.s32 %r10, [%rd5+0];\n" -" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n" -" cvt.s64.s32 %rd6, %r11;\n" -" mul.wide.s32 %rd7, %r11, 4;\n" -" add.u64 %rd8, %rd7, %rd5;\n" -" ld.global.s32 %r12, [%rd8+0];\n" -" sub.s32 %r13, %r1, 1;\n" -" and.b32 %r14, %r13, %r2;\n" -" cvt.s64.s32 %rd9, %r14;\n" -" mul.wide.s32 %rd10, %r14, 4;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n" -" setp.ne.u64 %p2, %rd11, %rd4;\n" -" @%p2 bra $Lt_0_22530;\n" -" cvt.s32.s64 %r15, %rd6;\n" -" mul.lo.s32 %r16, %r15, %r1;\n" -" mov.s32 %r17, %r16;\n" -" mul.lo.s32 %r18, %r13, %r8;\n" -" add.s32 %r19, %r15, %r18;\n" -" cvt.s64.s32 %rd12, %r19;\n" -" mul.wide.s32 %rd13, %r19, 4;\n" -" add.u64 %rd14, %rd8, %rd13;\n" -" and.b32 %r20, %r13, %r12;\n" -" cvt.s64.s32 %rd15, %r20;\n" -" div.s32 %r21, %r12, %r1;\n" -" mul.lo.s32 %r22, %r16, %r21;\n" -" cvt.s64.s32 %rd16, %r22;\n" -" add.u64 %rd17, %rd15, %rd16;\n" -" mul.lo.u64 %rd18, %rd17, 4;\n" -" add.u64 %rd19, %rd14, %rd18;\n" -" add.u64 %rd20, %rd10, %rd14;\n" -" bra.uni $Lt_0_22274;\n" -"$Lt_0_22530:\n" -" add.u64 %rd21, %rd7, %rd8;\n" -" ld.global.s32 %r23, [%rd21+0];\n" -" cvt.s64.s32 %rd22, %r23;\n" -" mul.wide.s32 %rd23, %r23, 4;\n" -" add.u64 %rd24, %rd11, %rd23;\n" -" cvt.s64.s32 %rd25, %r12;\n" -" mul.wide.s32 %rd26, %r12, 4;\n" -" add.u64 %rd19, %rd24, %rd26;\n" -" mov.s32 %r17, %r1;\n" -" add.u64 %rd20, %rd10, %rd24;\n" -"$Lt_0_22274:\n" -" .loc 16 64 0\n" -" mov.u32 %r24, %r10;\n" -" mov.s32 %r25, 0;\n" -" mov.u32 %r26, %r25;\n" -" mov.s32 %r27, 0;\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r24,%r26,%r28,%r30}];\n" -" mov.f32 %f25, %f21;\n" -" mov.f32 %f26, %f22;\n" -" mov.f32 %f27, %f23;\n" -" mov.f32 %f28, %f24;\n" -" .loc 16 65 0\n" -" mov.u32 %r31, %r10;\n" -" mov.s32 %r32, 0;\n" -" mov.u32 %r33, %r32;\n" -" mov.s32 %r34, 0;\n" -" mov.u32 %r35, %r34;\n" -" mov.s32 %r36, 0;\n" -" mov.u32 %r37, %r36;\n" -" tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r31,%r33,%r35,%r37}];\n" -" mov.f32 %f33, %f29;\n" -" setp.ge.u64 %p3, %rd20, %rd19;\n" -" @%p3 bra $Lt_0_32514;\n" -" cvt.rzi.ftz.s32.f32 %r38, %f28;\n" -" cvt.s64.s32 %rd27, %r17;\n" -" ld.param.s32 %r39, [__cudaparm_kernel_pair_lj_types];\n" -" mul.lo.s32 %r40, %r39, %r38;\n" -" ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];\n" -" mov.f32 %f34, 0f00000000; \n" -" mov.f32 %f35, 0f00000000; \n" -" mov.f32 %f36, 0f00000000; \n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -" mov.u64 %rd29, __cuda___cuda_local_var_32541_33_non_const_sp_lj112;\n" -"$Lt_0_23298:\n" -" .loc 16 69 0\n" -" ld.global.s32 %r41, [%rd20+0];\n" -" .loc 16 72 0\n" -" shr.s32 %r42, %r41, 30;\n" -" and.b32 %r43, %r42, 3;\n" -" cvt.s64.s32 %rd30, %r43;\n" -" mul.wide.s32 %rd31, %r43, 4;\n" -" add.u64 %rd32, %rd29, %rd31;\n" -" ld.shared.f32 %f39, [%rd32+0];\n" -" .loc 16 73 0\n" -" mov.f32 %f40, 0f3f800000; \n" -" ld.shared.f32 %f41, [%rd32+16];\n" -" sub.ftz.f32 %f42, %f40, %f41;\n" -" .loc 16 76 0\n" -" and.b32 %r44, %r41, 1073741823;\n" -" mov.u32 %r45, %r44;\n" -" mov.s32 %r46, 0;\n" -" mov.u32 %r47, %r46;\n" -" mov.s32 %r48, 0;\n" -" mov.u32 %r49, %r48;\n" -" mov.s32 %r50, 0;\n" -" mov.u32 %r51, %r50;\n" -" tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r45,%r47,%r49,%r51}];\n" -" mov.f32 %f47, %f43;\n" -" mov.f32 %f48, %f44;\n" -" mov.f32 %f49, %f45;\n" -" mov.f32 %f50, %f46;\n" -" cvt.rzi.ftz.s32.f32 %r52, %f50;\n" -" sub.ftz.f32 %f51, %f26, %f48;\n" -" sub.ftz.f32 %f52, %f25, %f47;\n" -" sub.ftz.f32 %f53, %f27, %f49;\n" -" mul.ftz.f32 %f54, %f51, %f51;\n" -" fma.rn.ftz.f32 %f55, %f52, %f52, %f54;\n" -" fma.rn.ftz.f32 %f56, %f53, %f53, %f55;\n" -" add.s32 %r53, %r52, %r40;\n" -" cvt.s64.s32 %rd33, %r53;\n" -" mul.wide.s32 %rd34, %r53, 16;\n" -" add.u64 %rd35, %rd34, %rd28;\n" -" ld.global.f32 %f57, [%rd35+8];\n" -" setp.gt.ftz.f32 %p4, %f57, %f56;\n" -" @!%p4 bra $Lt_0_26114;\n" -" rcp.approx.ftz.f32 %f58, %f56;\n" -" ld.global.f32 %f59, [%rd35+12];\n" -" setp.lt.ftz.f32 %p5, %f56, %f59;\n" -" @!%p5 bra $Lt_0_24322;\n" -" .loc 16 91 0\n" -" mul.ftz.f32 %f60, %f58, %f58;\n" -" mul.ftz.f32 %f61, %f58, %f60;\n" -" mov.f32 %f62, %f61;\n" -" .loc 16 92 0\n" -" mul.ftz.f32 %f63, %f61, %f39;\n" -" ld.global.v2.f32 {%f64,%f65}, [%rd35+0];\n" -" mul.ftz.f32 %f66, %f64, %f61;\n" -" sub.ftz.f32 %f67, %f66, %f65;\n" -" mul.ftz.f32 %f68, %f63, %f67;\n" -" bra.uni $Lt_0_24066;\n" -"$Lt_0_24322:\n" -" .loc 16 94 0\n" -" mov.f32 %f68, 0f00000000; \n" -"$Lt_0_24066:\n" -" ld.param.f32 %f69, [__cudaparm_kernel_pair_cut_coulsq];\n" -" setp.gt.ftz.f32 %p6, %f69, %f56;\n" -" @!%p6 bra $Lt_0_24834;\n" -" .loc 20 518 0\n" -" rsqrt.approx.ftz.f32 %f70, %f58;\n" -" ld.param.f32 %f71, [__cudaparm_kernel_pair_g_ewald];\n" -" mul.ftz.f32 %f72, %f71, %f70;\n" -" mul.ftz.f32 %f73, %f72, %f72;\n" -" neg.ftz.f32 %f74, %f73;\n" -" mov.f32 %f75, 0f3fb8aa3b; \n" -" mul.ftz.f32 %f76, %f74, %f75;\n" -" ex2.approx.ftz.f32 %f77, %f76;\n" -" .loc 16 101 0\n" -" mov.f32 %f78, 0f3f800000; \n" -" mov.f32 %f79, 0f3ea7ba05; \n" -" fma.rn.ftz.f32 %f80, %f79, %f72, %f78;\n" -" rcp.approx.ftz.f32 %f81, %f80;\n" -" mov.f32 %f82, 0f3e827906; \n" -" mov.f32 %f83, 0fbe91a98e; \n" -" mov.f32 %f84, 0f3fb5f0e3; \n" -" mov.f32 %f85, 0fbfba00e3; \n" -" mov.f32 %f86, 0f3f87dc22; \n" -" fma.rn.ftz.f32 %f87, %f86, %f81, %f85;\n" -" fma.rn.ftz.f32 %f88, %f81, %f87, %f84;\n" -" fma.rn.ftz.f32 %f89, %f81, %f88, %f83;\n" -" fma.rn.ftz.f32 %f90, %f81, %f89, %f82;\n" -" mul.ftz.f32 %f91, %f81, %f90;\n" -" mul.ftz.f32 %f92, %f77, %f91;\n" -" mov.f32 %f93, %f92;\n" -" .loc 16 102 0\n" -" mov.u32 %r54, %r44;\n" -" mov.s32 %r55, 0;\n" -" mov.u32 %r56, %r55;\n" -" mov.s32 %r57, 0;\n" -" mov.u32 %r58, %r57;\n" -" mov.s32 %r59, 0;\n" -" mov.u32 %r60, %r59;\n" -" tex.1d.v4.f32.s32 {%f94,%f95,%f96,%f97},[q_tex,{%r54,%r56,%r58,%r60}];\n" -" mov.f32 %f98, %f94;\n" -" ld.param.f32 %f99, [__cudaparm_kernel_pair_qqrd2e];\n" -" mul.ftz.f32 %f100, %f99, %f33;\n" -" mul.ftz.f32 %f101, %f100, %f98;\n" -" div.approx.ftz.f32 %f102, %f101, %f70;\n" -" mov.f32 %f103, %f102;\n" -" .loc 16 103 0\n" -" mov.f32 %f104, 0f3f906ebb; \n" -" mul.ftz.f32 %f105, %f72, %f104;\n" -" fma.rn.ftz.f32 %f106, %f77, %f105, %f92;\n" -" sub.ftz.f32 %f107, %f106, %f42;\n" -" mul.ftz.f32 %f108, %f102, %f107;\n" -" bra.uni $Lt_0_24578;\n" -"$Lt_0_24834:\n" -" .loc 16 105 0\n" -" mov.f32 %f108, 0f00000000; \n" -"$Lt_0_24578:\n" -" .loc 16 109 0\n" -" add.ftz.f32 %f109, %f108, %f68;\n" -" mul.ftz.f32 %f110, %f109, %f58;\n" -" fma.rn.ftz.f32 %f36, %f52, %f110, %f36;\n" -" .loc 16 110 0\n" -" fma.rn.ftz.f32 %f35, %f51, %f110, %f35;\n" -" .loc 16 111 0\n" -" fma.rn.ftz.f32 %f34, %f53, %f110, %f34;\n" -" ld.param.s32 %r61, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r62, 0;\n" -" setp.le.s32 %p7, %r61, %r62;\n" -" @%p7 bra $Lt_0_25602;\n" -" .loc 16 114 0\n" -" mov.f32 %f111, %f103;\n" -" mov.f32 %f112, %f93;\n" -" sub.ftz.f32 %f113, %f112, %f42;\n" -" fma.rn.ftz.f32 %f114, %f111, %f113, %f37;\n" -" selp.f32 %f37, %f114, %f37, %p6;\n" -" @!%p5 bra $Lt_0_25602;\n" -" .loc 16 118 0\n" -" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];\n" -" add.u64 %rd37, %rd36, %rd34;\n" -" mov.f32 %f115, %f62;\n" -" ld.global.v4.f32 {%f116,%f117,%f118,_}, [%rd37+0];\n" -" mul.ftz.f32 %f119, %f116, %f115;\n" -" sub.ftz.f32 %f120, %f119, %f117;\n" -" mul.ftz.f32 %f121, %f115, %f120;\n" -" sub.ftz.f32 %f122, %f121, %f118;\n" -" fma.rn.ftz.f32 %f38, %f39, %f122, %f38;\n" -"$Lt_0_25602:\n" -"$Lt_0_25090:\n" -" ld.param.s32 %r63, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r64, 0;\n" -" setp.le.s32 %p8, %r63, %r64;\n" -" @%p8 bra $Lt_0_26114;\n" -" .loc 16 122 0\n" -" mov.f32 %f123, %f10;\n" -" mul.ftz.f32 %f124, %f52, %f52;\n" -" fma.rn.ftz.f32 %f125, %f110, %f124, %f123;\n" -" mov.f32 %f10, %f125;\n" -" .loc 16 123 0\n" -" mov.f32 %f126, %f12;\n" -" fma.rn.ftz.f32 %f127, %f110, %f54, %f126;\n" -" mov.f32 %f12, %f127;\n" -" .loc 16 124 0\n" -" mov.f32 %f128, %f14;\n" -" mul.ftz.f32 %f129, %f53, %f53;\n" -" fma.rn.ftz.f32 %f130, %f110, %f129, %f128;\n" -" mov.f32 %f14, %f130;\n" -" .loc 16 125 0\n" -" mov.f32 %f131, %f16;\n" -" mul.ftz.f32 %f132, %f51, %f52;\n" -" fma.rn.ftz.f32 %f133, %f110, %f132, %f131;\n" -" mov.f32 %f16, %f133;\n" -" .loc 16 126 0\n" -" mov.f32 %f134, %f18;\n" -" mul.ftz.f32 %f135, %f52, %f53;\n" -" fma.rn.ftz.f32 %f136, %f110, %f135, %f134;\n" -" mov.f32 %f18, %f136;\n" -" .loc 16 127 0\n" -" mul.ftz.f32 %f137, %f51, %f53;\n" -" fma.rn.ftz.f32 %f19, %f110, %f137, %f19;\n" -" mov.f32 %f20, %f19;\n" -"$Lt_0_26114:\n" -"$Lt_0_23554:\n" -" .loc 16 68 0\n" -" mul.lo.u64 %rd38, %rd27, 4;\n" -" add.u64 %rd20, %rd20, %rd38;\n" -" setp.lt.u64 %p9, %rd20, %rd19;\n" -" @%p9 bra $Lt_0_23298;\n" -" bra.uni $Lt_0_22786;\n" -"$Lt_0_32514:\n" -" mov.f32 %f34, 0f00000000; \n" -" mov.f32 %f35, 0f00000000; \n" -" mov.f32 %f36, 0f00000000; \n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -"$Lt_0_22786:\n" -" mov.u32 %r65, 1;\n" -" setp.le.s32 %p10, %r1, %r65;\n" -" @%p10 bra $Lt_0_28930;\n" -" .loc 16 132 0\n" -" mov.u64 %rd39, __cuda___cuda_local_var_32633_55_non_const_red_acc144;\n" -" cvt.s64.s32 %rd40, %r2;\n" -" mul.wide.s32 %rd41, %r2, 4;\n" -" add.u64 %rd42, %rd39, %rd41;\n" -" mov.f32 %f138, %f36;\n" -" st.shared.f32 [%rd42+0], %f138;\n" -" mov.f32 %f139, %f35;\n" -" st.shared.f32 [%rd42+512], %f139;\n" -" mov.f32 %f140, %f34;\n" -" st.shared.f32 [%rd42+1024], %f140;\n" -" mov.f32 %f141, %f38;\n" -" st.shared.f32 [%rd42+1536], %f141;\n" -" mov.f32 %f142, %f37;\n" -" st.shared.f32 [%rd42+2048], %f142;\n" -" shr.s32 %r66, %r1, 31;\n" -" mov.s32 %r67, 1;\n" -" and.b32 %r68, %r66, %r67;\n" -" add.s32 %r69, %r68, %r1;\n" -" shr.s32 %r70, %r69, 1;\n" -" mov.s32 %r71, %r70;\n" -" mov.u32 %r72, 0;\n" -" setp.ne.u32 %p11, %r70, %r72;\n" -" @!%p11 bra $Lt_0_27394;\n" -"$Lt_0_27906:\n" -" setp.ge.u32 %p12, %r14, %r71;\n" -" @%p12 bra $Lt_0_28162;\n" -" add.u32 %r73, %r2, %r71;\n" -" cvt.u64.u32 %rd43, %r73;\n" -" mul.wide.u32 %rd44, %r73, 4;\n" -" add.u64 %rd45, %rd39, %rd44;\n" -" ld.shared.f32 %f143, [%rd45+0];\n" -" add.ftz.f32 %f138, %f143, %f138;\n" -" st.shared.f32 [%rd42+0], %f138;\n" -" ld.shared.f32 %f144, [%rd45+512];\n" -" add.ftz.f32 %f139, %f144, %f139;\n" -" st.shared.f32 [%rd42+512], %f139;\n" -" ld.shared.f32 %f145, [%rd45+1024];\n" -" add.ftz.f32 %f140, %f145, %f140;\n" -" st.shared.f32 [%rd42+1024], %f140;\n" -" ld.shared.f32 %f146, [%rd45+1536];\n" -" add.ftz.f32 %f141, %f146, %f141;\n" -" st.shared.f32 [%rd42+1536], %f141;\n" -" ld.shared.f32 %f147, [%rd45+2048];\n" -" add.ftz.f32 %f142, %f147, %f142;\n" -" st.shared.f32 [%rd42+2048], %f142;\n" -"$Lt_0_28162:\n" -" shr.u32 %r71, %r71, 1;\n" -" mov.u32 %r74, 0;\n" -" setp.ne.u32 %p13, %r71, %r74;\n" -" @%p13 bra $Lt_0_27906;\n" -"$Lt_0_27394:\n" -" mov.f32 %f36, %f138;\n" -" mov.f32 %f35, %f139;\n" -" mov.f32 %f34, %f140;\n" -" mov.f32 %f38, %f141;\n" -" mov.f32 %f37, %f142;\n" -" ld.param.s32 %r75, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r76, 0;\n" -" setp.le.s32 %p14, %r75, %r76;\n" -" @%p14 bra $Lt_0_28930;\n" -" mov.f32 %f138, %f10;\n" -" st.shared.f32 [%rd42+0], %f138;\n" -" mov.f32 %f139, %f12;\n" -" st.shared.f32 [%rd42+512], %f139;\n" -" mov.f32 %f140, %f14;\n" -" st.shared.f32 [%rd42+1024], %f140;\n" -" mov.f32 %f141, %f16;\n" -" st.shared.f32 [%rd42+1536], %f141;\n" -" mov.f32 %f142, %f18;\n" -" st.shared.f32 [%rd42+2048], %f142;\n" -" mov.f32 %f148, %f19;\n" -" st.shared.f32 [%rd42+2560], %f148;\n" -" mov.s32 %r77, %r70;\n" -" @!%p11 bra $Lt_0_29442;\n" -"$Lt_0_29954:\n" -" setp.ge.u32 %p15, %r14, %r77;\n" -" @%p15 bra $Lt_0_30210;\n" -" add.u32 %r78, %r2, %r77;\n" -" cvt.u64.u32 %rd46, %r78;\n" -" mul.wide.u32 %rd47, %r78, 4;\n" -" add.u64 %rd48, %rd39, %rd47;\n" -" ld.shared.f32 %f149, [%rd48+0];\n" -" add.ftz.f32 %f138, %f149, %f138;\n" -" st.shared.f32 [%rd42+0], %f138;\n" -" ld.shared.f32 %f150, [%rd48+512];\n" -" add.ftz.f32 %f139, %f150, %f139;\n" -" st.shared.f32 [%rd42+512], %f139;\n" -" ld.shared.f32 %f151, [%rd48+1024];\n" -" add.ftz.f32 %f140, %f151, %f140;\n" -" st.shared.f32 [%rd42+1024], %f140;\n" -" ld.shared.f32 %f152, [%rd48+1536];\n" -" add.ftz.f32 %f141, %f152, %f141;\n" -" st.shared.f32 [%rd42+1536], %f141;\n" -" ld.shared.f32 %f153, [%rd48+2048];\n" -" add.ftz.f32 %f142, %f153, %f142;\n" -" st.shared.f32 [%rd42+2048], %f142;\n" -" ld.shared.f32 %f154, [%rd48+2560];\n" -" add.ftz.f32 %f148, %f154, %f148;\n" -" st.shared.f32 [%rd42+2560], %f148;\n" -"$Lt_0_30210:\n" -" shr.u32 %r77, %r77, 1;\n" -" mov.u32 %r79, 0;\n" -" setp.ne.u32 %p16, %r77, %r79;\n" -" @%p16 bra $Lt_0_29954;\n" -"$Lt_0_29442:\n" -" mov.f32 %f10, %f138;\n" -" mov.f32 %f12, %f139;\n" -" mov.f32 %f14, %f140;\n" -" mov.f32 %f16, %f141;\n" -" mov.f32 %f18, %f142;\n" -" mov.f32 %f20, %f148;\n" -"$Lt_0_28930:\n" -"$Lt_0_26882:\n" -" mov.u32 %r80, 0;\n" -" setp.ne.s32 %p17, %r14, %r80;\n" -" @%p17 bra $Lt_0_30978;\n" -" ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];\n" -" add.u64 %rd50, %rd49, %rd3;\n" -" ld.param.s32 %r81, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r82, 0;\n" -" setp.le.s32 %p18, %r81, %r82;\n" -" @%p18 bra $Lt_0_31490;\n" -" st.global.f32 [%rd50+0], %f38;\n" -" cvt.s64.s32 %rd51, %r9;\n" -" mul.wide.s32 %rd52, %r9, 4;\n" -" add.u64 %rd53, %rd52, %rd50;\n" -" st.global.f32 [%rd53+0], %f37;\n" -" add.u64 %rd50, %rd52, %rd53;\n" -"$Lt_0_31490:\n" -" ld.param.s32 %r83, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r84, 0;\n" -" setp.le.s32 %p19, %r83, %r84;\n" -" @%p19 bra $Lt_0_32002;\n" -" mov.f32 %f155, %f10;\n" -" st.global.f32 [%rd50+0], %f155;\n" -" cvt.s64.s32 %rd54, %r9;\n" -" mul.wide.s32 %rd55, %r9, 4;\n" -" add.u64 %rd56, %rd55, %rd50;\n" -" mov.f32 %f156, %f12;\n" -" st.global.f32 [%rd56+0], %f156;\n" -" add.u64 %rd57, %rd55, %rd56;\n" -" mov.f32 %f157, %f14;\n" -" st.global.f32 [%rd57+0], %f157;\n" -" add.u64 %rd58, %rd55, %rd57;\n" -" mov.f32 %f158, %f16;\n" -" st.global.f32 [%rd58+0], %f158;\n" -" add.u64 %rd50, %rd55, %rd58;\n" -" mov.f32 %f159, %f18;\n" -" st.global.f32 [%rd50+0], %f159;\n" -" mov.f32 %f160, %f20;\n" -" add.u64 %rd59, %rd55, %rd50;\n" -" st.global.f32 [%rd59+0], %f160;\n" -"$Lt_0_32002:\n" -" ld.param.u64 %rd60, [__cudaparm_kernel_pair_ans];\n" -" mul.lo.u64 %rd61, %rd2, 16;\n" -" add.u64 %rd62, %rd60, %rd61;\n" -" mov.f32 %f161, %f162;\n" -" st.global.v4.f32 [%rd62+0], {%f36,%f35,%f34,%f161};\n" -"$Lt_0_30978:\n" -"$Lt_0_21762:\n" -" .loc 16 135 0\n" -" exit;\n" -"$LDWend_kernel_pair:\n" -" }\n" -" .entry kernel_pair_fast (\n" -" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" -" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" -" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" -" .param .u64 __cudaparm_kernel_pair_fast_q_,\n" -" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n" -" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n" -" .param .f32 __cudaparm_kernel_pair_fast_g_ewald,\n" -" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" -" {\n" -" .reg .u32 %r<88>;\n" -" .reg .u64 %rd<76>;\n" -" .reg .f32 %f<167>;\n" -" .reg .pred %p<24>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32652_33_non_const_sp_lj3320[32];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32650_34_non_const_lj13360[1936];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32651_34_non_const_lj35296[1936];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32745_55_non_const_red_acc7232[3072];\n" -" .loc 16 145 0\n" -"$LDWbegin_kernel_pair_fast:\n" -" cvt.s32.u32 %r1, %tid.x;\n" -" mov.u32 %r2, 7;\n" -" setp.gt.s32 %p1, %r1, %r2;\n" -" @%p1 bra $Lt_1_24066;\n" -" .loc 16 153 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32652_33_non_const_sp_lj3320;\n" -" cvt.s64.s32 %rd2, %r1;\n" -" mul.wide.s32 %rd3, %r1, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f32 %f1, [%rd5+0];\n" -" add.u64 %rd6, %rd3, %rd1;\n" -" st.shared.f32 [%rd6+0], %f1;\n" -"$Lt_1_24066:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32652_33_non_const_sp_lj3320;\n" -" mov.u32 %r3, 120;\n" -" setp.gt.s32 %p2, %r1, %r3;\n" -" @%p2 bra $Lt_1_24578;\n" -" .loc 16 155 0\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32650_34_non_const_lj13360;\n" -" cvt.s64.s32 %rd8, %r1;\n" -" mul.wide.s32 %rd9, %r1, 16;\n" -" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n" -" add.u64 %rd11, %rd10, %rd9;\n" -" add.u64 %rd12, %rd9, %rd7;\n" -" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n" -" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n" -" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r5, 0;\n" -" setp.le.s32 %p3, %r4, %r5;\n" -" @%p3 bra $Lt_1_25090;\n" -" .loc 16 157 0\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32651_34_non_const_lj35296;\n" -" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n" -" add.u64 %rd15, %rd14, %rd9;\n" -" add.u64 %rd16, %rd9, %rd13;\n" -" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n" -" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n" -"$Lt_1_25090:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32651_34_non_const_lj35296;\n" -"$Lt_1_24578:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32651_34_non_const_lj35296;\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32650_34_non_const_lj13360;\n" -" .loc 16 166 0\n" -" mov.f32 %f10, 0f00000000; \n" -" mov.f32 %f11, %f10;\n" -" mov.f32 %f12, 0f00000000; \n" -" mov.f32 %f13, %f12;\n" -" mov.f32 %f14, 0f00000000; \n" -" mov.f32 %f15, %f14;\n" -" mov.f32 %f16, 0f00000000; \n" -" mov.f32 %f17, %f16;\n" -" mov.f32 %f18, 0f00000000; \n" -" mov.f32 %f19, %f18;\n" -" mov.f32 %f20, 0f00000000; \n" -" mov.f32 %f21, %f20;\n" -" .loc 16 168 0\n" -" bar.sync 0;\n" -" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n" -" div.s32 %r7, %r1, %r6;\n" -" cvt.s32.u32 %r8, %ntid.x;\n" -" div.s32 %r9, %r8, %r6;\n" -" cvt.s32.u32 %r10, %ctaid.x;\n" -" mul.lo.s32 %r11, %r10, %r9;\n" -" add.s32 %r12, %r7, %r11;\n" -" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n" -" setp.ge.s32 %p4, %r12, %r13;\n" -" @%p4 bra $Lt_1_34818;\n" -" .loc 16 173 0\n" -" cvt.s64.s32 %rd17, %r12;\n" -" mul.wide.s32 %rd18, %r12, 4;\n" -" ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n" -" add.u64 %rd20, %rd18, %rd19;\n" -" ld.global.s32 %r14, [%rd20+0];\n" -" ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" -" cvt.s64.s32 %rd21, %r15;\n" -" mul.wide.s32 %rd22, %r15, 4;\n" -" add.u64 %rd23, %rd22, %rd20;\n" -" ld.global.s32 %r16, [%rd23+0];\n" -" sub.s32 %r17, %r6, 1;\n" -" and.b32 %r18, %r17, %r1;\n" -" cvt.s64.s32 %rd24, %r18;\n" -" mul.wide.s32 %rd25, %r18, 4;\n" -" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n" -" setp.ne.u64 %p5, %rd26, %rd19;\n" -" @%p5 bra $Lt_1_26370;\n" -" cvt.s32.s64 %r19, %rd21;\n" -" mul.lo.s32 %r20, %r19, %r6;\n" -" mov.s32 %r21, %r20;\n" -" mul.lo.s32 %r22, %r17, %r12;\n" -" add.s32 %r23, %r19, %r22;\n" -" cvt.s64.s32 %rd27, %r23;\n" -" mul.wide.s32 %rd28, %r23, 4;\n" -" add.u64 %rd29, %rd23, %rd28;\n" -" and.b32 %r24, %r17, %r16;\n" -" cvt.s64.s32 %rd30, %r24;\n" -" div.s32 %r25, %r16, %r6;\n" -" mul.lo.s32 %r26, %r20, %r25;\n" -" cvt.s64.s32 %rd31, %r26;\n" -" add.u64 %rd32, %rd30, %rd31;\n" -" mul.lo.u64 %rd33, %rd32, 4;\n" -" add.u64 %rd34, %rd29, %rd33;\n" -" add.u64 %rd35, %rd25, %rd29;\n" -" bra.uni $Lt_1_26114;\n" -"$Lt_1_26370:\n" -" add.u64 %rd36, %rd22, %rd23;\n" -" ld.global.s32 %r27, [%rd36+0];\n" -" cvt.s64.s32 %rd37, %r27;\n" -" mul.wide.s32 %rd38, %r27, 4;\n" -" add.u64 %rd39, %rd26, %rd38;\n" -" cvt.s64.s32 %rd40, %r16;\n" -" mul.wide.s32 %rd41, %r16, 4;\n" -" add.u64 %rd34, %rd39, %rd41;\n" -" mov.s32 %r21, %r6;\n" -" add.u64 %rd35, %rd25, %rd39;\n" -"$Lt_1_26114:\n" -" .loc 16 176 0\n" -" mov.u32 %r28, %r14;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" mov.s32 %r31, 0;\n" -" mov.u32 %r32, %r31;\n" -" mov.s32 %r33, 0;\n" -" mov.u32 %r34, %r33;\n" -" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];\n" -" mov.f32 %f26, %f22;\n" -" mov.f32 %f27, %f23;\n" -" mov.f32 %f28, %f24;\n" -" mov.f32 %f29, %f25;\n" -" .loc 16 177 0\n" -" mov.u32 %r35, %r14;\n" -" mov.s32 %r36, 0;\n" -" mov.u32 %r37, %r36;\n" -" mov.s32 %r38, 0;\n" -" mov.u32 %r39, %r38;\n" -" mov.s32 %r40, 0;\n" -" mov.u32 %r41, %r40;\n" -" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[q_tex,{%r35,%r37,%r39,%r41}];\n" -" mov.f32 %f34, %f30;\n" -" setp.ge.u64 %p6, %rd35, %rd34;\n" -" @%p6 bra $Lt_1_36354;\n" -" cvt.rzi.ftz.s32.f32 %r42, %f29;\n" -" cvt.s64.s32 %rd42, %r21;\n" -" mul.lo.s32 %r43, %r42, 11;\n" -" cvt.rn.f32.s32 %f35, %r43;\n" -" mov.f32 %f36, 0f00000000; \n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -" mov.f32 %f39, 0f00000000; \n" -" mov.f32 %f40, 0f00000000; \n" -"$Lt_1_27138:\n" -" .loc 16 182 0\n" -" ld.global.s32 %r44, [%rd35+0];\n" -" .loc 16 185 0\n" -" shr.s32 %r45, %r44, 30;\n" -" and.b32 %r46, %r45, 3;\n" -" cvt.s64.s32 %rd43, %r46;\n" -" mul.wide.s32 %rd44, %r46, 4;\n" -" add.u64 %rd45, %rd1, %rd44;\n" -" ld.shared.f32 %f41, [%rd45+0];\n" -" .loc 16 186 0\n" -" mov.f32 %f42, 0f3f800000; \n" -" ld.shared.f32 %f43, [%rd45+16];\n" -" sub.ftz.f32 %f44, %f42, %f43;\n" -" .loc 16 189 0\n" -" and.b32 %r47, %r44, 1073741823;\n" -" mov.u32 %r48, %r47;\n" -" mov.s32 %r49, 0;\n" -" mov.u32 %r50, %r49;\n" -" mov.s32 %r51, 0;\n" -" mov.u32 %r52, %r51;\n" -" mov.s32 %r53, 0;\n" -" mov.u32 %r54, %r53;\n" -" tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[pos_tex,{%r48,%r50,%r52,%r54}];\n" -" mov.f32 %f49, %f45;\n" -" mov.f32 %f50, %f46;\n" -" mov.f32 %f51, %f47;\n" -" mov.f32 %f52, %f48;\n" -" sub.ftz.f32 %f53, %f27, %f50;\n" -" sub.ftz.f32 %f54, %f26, %f49;\n" -" sub.ftz.f32 %f55, %f28, %f51;\n" -" mul.ftz.f32 %f56, %f53, %f53;\n" -" fma.rn.ftz.f32 %f57, %f54, %f54, %f56;\n" -" fma.rn.ftz.f32 %f58, %f55, %f55, %f57;\n" -" add.ftz.f32 %f59, %f35, %f52;\n" -" cvt.rzi.ftz.s32.f32 %r55, %f59;\n" -" cvt.s64.s32 %rd46, %r55;\n" -" mul.wide.s32 %rd47, %r55, 16;\n" -" add.u64 %rd48, %rd47, %rd7;\n" -" ld.shared.f32 %f60, [%rd48+8];\n" -" setp.gt.ftz.f32 %p7, %f60, %f58;\n" -" @!%p7 bra $Lt_1_29954;\n" -" rcp.approx.ftz.f32 %f61, %f58;\n" -" ld.shared.f32 %f62, [%rd48+12];\n" -" setp.lt.ftz.f32 %p8, %f58, %f62;\n" -" @!%p8 bra $Lt_1_28162;\n" -" .loc 16 203 0\n" -" mul.ftz.f32 %f63, %f61, %f61;\n" -" mul.ftz.f32 %f64, %f61, %f63;\n" -" mov.f32 %f65, %f64;\n" -" .loc 16 204 0\n" -" mul.ftz.f32 %f66, %f64, %f41;\n" -" ld.shared.v2.f32 {%f67,%f68}, [%rd48+0];\n" -" mul.ftz.f32 %f69, %f67, %f64;\n" -" sub.ftz.f32 %f70, %f69, %f68;\n" -" mul.ftz.f32 %f71, %f66, %f70;\n" -" bra.uni $Lt_1_27906;\n" -"$Lt_1_28162:\n" -" .loc 16 206 0\n" -" mov.f32 %f71, 0f00000000; \n" -"$Lt_1_27906:\n" -" ld.param.f32 %f72, [__cudaparm_kernel_pair_fast_cut_coulsq];\n" -" setp.gt.ftz.f32 %p9, %f72, %f58;\n" -" @!%p9 bra $Lt_1_28674;\n" -" .loc 20 518 0\n" -" rsqrt.approx.ftz.f32 %f73, %f61;\n" -" ld.param.f32 %f74, [__cudaparm_kernel_pair_fast_g_ewald];\n" -" mul.ftz.f32 %f75, %f74, %f73;\n" -" mul.ftz.f32 %f76, %f75, %f75;\n" -" neg.ftz.f32 %f77, %f76;\n" -" mov.f32 %f78, 0f3fb8aa3b; \n" -" mul.ftz.f32 %f79, %f77, %f78;\n" -" ex2.approx.ftz.f32 %f80, %f79;\n" -" .loc 16 213 0\n" -" mov.f32 %f81, 0f3f800000; \n" -" mov.f32 %f82, 0f3ea7ba05; \n" -" fma.rn.ftz.f32 %f83, %f82, %f75, %f81;\n" -" rcp.approx.ftz.f32 %f84, %f83;\n" -" mov.f32 %f85, 0f3e827906; \n" -" mov.f32 %f86, 0fbe91a98e; \n" -" mov.f32 %f87, 0f3fb5f0e3; \n" -" mov.f32 %f88, 0fbfba00e3; \n" -" mov.f32 %f89, 0f3f87dc22; \n" -" fma.rn.ftz.f32 %f90, %f89, %f84, %f88;\n" -" fma.rn.ftz.f32 %f91, %f84, %f90, %f87;\n" -" fma.rn.ftz.f32 %f92, %f84, %f91, %f86;\n" -" fma.rn.ftz.f32 %f93, %f84, %f92, %f85;\n" -" mul.ftz.f32 %f94, %f84, %f93;\n" -" mul.ftz.f32 %f95, %f80, %f94;\n" -" mov.f32 %f96, %f95;\n" -" .loc 16 214 0\n" -" mov.u32 %r56, %r47;\n" -" mov.s32 %r57, 0;\n" -" mov.u32 %r58, %r57;\n" -" mov.s32 %r59, 0;\n" -" mov.u32 %r60, %r59;\n" -" mov.s32 %r61, 0;\n" -" mov.u32 %r62, %r61;\n" -" tex.1d.v4.f32.s32 {%f97,%f98,%f99,%f100},[q_tex,{%r56,%r58,%r60,%r62}];\n" -" mov.f32 %f101, %f97;\n" -" ld.param.f32 %f102, [__cudaparm_kernel_pair_fast_qqrd2e];\n" -" mul.ftz.f32 %f103, %f102, %f34;\n" -" mul.ftz.f32 %f104, %f103, %f101;\n" -" div.approx.ftz.f32 %f105, %f104, %f73;\n" -" mov.f32 %f106, %f105;\n" -" .loc 16 215 0\n" -" mov.f32 %f107, 0f3f906ebb; \n" -" mul.ftz.f32 %f108, %f75, %f107;\n" -" fma.rn.ftz.f32 %f109, %f80, %f108, %f95;\n" -" sub.ftz.f32 %f110, %f109, %f44;\n" -" mul.ftz.f32 %f111, %f105, %f110;\n" -" bra.uni $Lt_1_28418;\n" -"$Lt_1_28674:\n" -" .loc 16 217 0\n" -" mov.f32 %f111, 0f00000000; \n" -"$Lt_1_28418:\n" -" .loc 16 221 0\n" -" add.ftz.f32 %f112, %f111, %f71;\n" -" mul.ftz.f32 %f113, %f112, %f61;\n" -" fma.rn.ftz.f32 %f38, %f54, %f113, %f38;\n" -" .loc 16 222 0\n" -" fma.rn.ftz.f32 %f37, %f53, %f113, %f37;\n" -" .loc 16 223 0\n" -" fma.rn.ftz.f32 %f36, %f55, %f113, %f36;\n" -" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r64, 0;\n" -" setp.le.s32 %p10, %r63, %r64;\n" -" @%p10 bra $Lt_1_29442;\n" -" .loc 16 226 0\n" -" mov.f32 %f114, %f106;\n" -" mov.f32 %f115, %f96;\n" -" sub.ftz.f32 %f116, %f115, %f44;\n" -" fma.rn.ftz.f32 %f117, %f114, %f116, %f39;\n" -" selp.f32 %f39, %f117, %f39, %p9;\n" -" @!%p8 bra $Lt_1_29442;\n" -" .loc 16 229 0\n" -" add.u64 %rd49, %rd47, %rd13;\n" -" mov.f32 %f118, %f65;\n" -" ld.shared.v4.f32 {%f119,%f120,%f121,_}, [%rd49+0];\n" -" mul.ftz.f32 %f122, %f119, %f118;\n" -" sub.ftz.f32 %f123, %f122, %f120;\n" -" mul.ftz.f32 %f124, %f118, %f123;\n" -" .loc 16 230 0\n" -" sub.ftz.f32 %f125, %f124, %f121;\n" -" fma.rn.ftz.f32 %f40, %f41, %f125, %f40;\n" -"$Lt_1_29442:\n" -"$Lt_1_28930:\n" -" ld.param.s32 %r65, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r66, 0;\n" -" setp.le.s32 %p11, %r65, %r66;\n" -" @%p11 bra $Lt_1_29954;\n" -" .loc 16 234 0\n" -" mov.f32 %f126, %f11;\n" -" mul.ftz.f32 %f127, %f54, %f54;\n" -" fma.rn.ftz.f32 %f128, %f113, %f127, %f126;\n" -" mov.f32 %f11, %f128;\n" -" .loc 16 235 0\n" -" mov.f32 %f129, %f13;\n" -" fma.rn.ftz.f32 %f130, %f113, %f56, %f129;\n" -" mov.f32 %f13, %f130;\n" -" .loc 16 236 0\n" -" mov.f32 %f131, %f15;\n" -" mul.ftz.f32 %f132, %f55, %f55;\n" -" fma.rn.ftz.f32 %f133, %f113, %f132, %f131;\n" -" mov.f32 %f15, %f133;\n" -" .loc 16 237 0\n" -" mov.f32 %f134, %f17;\n" -" mul.ftz.f32 %f135, %f53, %f54;\n" -" fma.rn.ftz.f32 %f136, %f113, %f135, %f134;\n" -" mov.f32 %f17, %f136;\n" -" .loc 16 238 0\n" -" mov.f32 %f137, %f19;\n" -" mul.ftz.f32 %f138, %f54, %f55;\n" -" fma.rn.ftz.f32 %f139, %f113, %f138, %f137;\n" -" mov.f32 %f19, %f139;\n" -" .loc 16 239 0\n" -" mul.ftz.f32 %f140, %f53, %f55;\n" -" fma.rn.ftz.f32 %f20, %f113, %f140, %f20;\n" -" mov.f32 %f21, %f20;\n" -"$Lt_1_29954:\n" -"$Lt_1_27394:\n" -" .loc 16 181 0\n" -" mul.lo.u64 %rd50, %rd42, 4;\n" -" add.u64 %rd35, %rd35, %rd50;\n" -" setp.lt.u64 %p12, %rd35, %rd34;\n" -" @%p12 bra $Lt_1_27138;\n" -" bra.uni $Lt_1_26626;\n" -"$Lt_1_36354:\n" -" mov.f32 %f36, 0f00000000; \n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -" mov.f32 %f39, 0f00000000; \n" -" mov.f32 %f40, 0f00000000; \n" -"$Lt_1_26626:\n" -" mov.u32 %r67, 1;\n" -" setp.le.s32 %p13, %r6, %r67;\n" -" @%p13 bra $Lt_1_32770;\n" -" .loc 16 244 0\n" -" mov.u64 %rd51, __cuda___cuda_local_var_32745_55_non_const_red_acc7232;\n" -" cvt.s64.s32 %rd52, %r1;\n" -" mul.wide.s32 %rd53, %r1, 4;\n" -" add.u64 %rd54, %rd51, %rd53;\n" -" mov.f32 %f141, %f38;\n" -" st.shared.f32 [%rd54+0], %f141;\n" -" mov.f32 %f142, %f37;\n" -" st.shared.f32 [%rd54+512], %f142;\n" -" mov.f32 %f143, %f36;\n" -" st.shared.f32 [%rd54+1024], %f143;\n" -" mov.f32 %f144, %f40;\n" -" st.shared.f32 [%rd54+1536], %f144;\n" -" mov.f32 %f145, %f39;\n" -" st.shared.f32 [%rd54+2048], %f145;\n" -" shr.s32 %r68, %r6, 31;\n" -" mov.s32 %r69, 1;\n" -" and.b32 %r70, %r68, %r69;\n" -" add.s32 %r71, %r70, %r6;\n" -" shr.s32 %r72, %r71, 1;\n" -" mov.s32 %r73, %r72;\n" -" mov.u32 %r74, 0;\n" -" setp.ne.u32 %p14, %r72, %r74;\n" -" @!%p14 bra $Lt_1_31234;\n" -"$Lt_1_31746:\n" -" setp.ge.u32 %p15, %r18, %r73;\n" -" @%p15 bra $Lt_1_32002;\n" -" add.u32 %r75, %r1, %r73;\n" -" cvt.u64.u32 %rd55, %r75;\n" -" mul.wide.u32 %rd56, %r75, 4;\n" -" add.u64 %rd57, %rd51, %rd56;\n" -" ld.shared.f32 %f146, [%rd57+0];\n" -" add.ftz.f32 %f141, %f146, %f141;\n" -" st.shared.f32 [%rd54+0], %f141;\n" -" ld.shared.f32 %f147, [%rd57+512];\n" -" add.ftz.f32 %f142, %f147, %f142;\n" -" st.shared.f32 [%rd54+512], %f142;\n" -" ld.shared.f32 %f148, [%rd57+1024];\n" -" add.ftz.f32 %f143, %f148, %f143;\n" -" st.shared.f32 [%rd54+1024], %f143;\n" -" ld.shared.f32 %f149, [%rd57+1536];\n" -" add.ftz.f32 %f144, %f149, %f144;\n" -" st.shared.f32 [%rd54+1536], %f144;\n" -" ld.shared.f32 %f150, [%rd57+2048];\n" -" add.ftz.f32 %f145, %f150, %f145;\n" -" st.shared.f32 [%rd54+2048], %f145;\n" -"$Lt_1_32002:\n" -" shr.u32 %r73, %r73, 1;\n" -" mov.u32 %r76, 0;\n" -" setp.ne.u32 %p16, %r73, %r76;\n" -" @%p16 bra $Lt_1_31746;\n" -"$Lt_1_31234:\n" -" mov.f32 %f38, %f141;\n" -" mov.f32 %f37, %f142;\n" -" mov.f32 %f36, %f143;\n" -" mov.f32 %f40, %f144;\n" -" mov.f32 %f39, %f145;\n" -" ld.param.s32 %r77, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r78, 0;\n" -" setp.le.s32 %p17, %r77, %r78;\n" -" @%p17 bra $Lt_1_32770;\n" -" mov.f32 %f141, %f11;\n" -" st.shared.f32 [%rd54+0], %f141;\n" -" mov.f32 %f142, %f13;\n" -" st.shared.f32 [%rd54+512], %f142;\n" -" mov.f32 %f143, %f15;\n" -" st.shared.f32 [%rd54+1024], %f143;\n" -" mov.f32 %f144, %f17;\n" -" st.shared.f32 [%rd54+1536], %f144;\n" -" mov.f32 %f145, %f19;\n" -" st.shared.f32 [%rd54+2048], %f145;\n" -" mov.f32 %f151, %f20;\n" -" st.shared.f32 [%rd54+2560], %f151;\n" -" mov.s32 %r79, %r72;\n" -" @!%p14 bra $Lt_1_33282;\n" -"$Lt_1_33794:\n" -" setp.ge.u32 %p18, %r18, %r79;\n" -" @%p18 bra $Lt_1_34050;\n" -" add.u32 %r80, %r1, %r79;\n" -" cvt.u64.u32 %rd58, %r80;\n" -" mul.wide.u32 %rd59, %r80, 4;\n" -" add.u64 %rd60, %rd51, %rd59;\n" -" ld.shared.f32 %f152, [%rd60+0];\n" -" add.ftz.f32 %f141, %f152, %f141;\n" -" st.shared.f32 [%rd54+0], %f141;\n" -" ld.shared.f32 %f153, [%rd60+512];\n" -" add.ftz.f32 %f142, %f153, %f142;\n" -" st.shared.f32 [%rd54+512], %f142;\n" -" ld.shared.f32 %f154, [%rd60+1024];\n" -" add.ftz.f32 %f143, %f154, %f143;\n" -" st.shared.f32 [%rd54+1024], %f143;\n" -" ld.shared.f32 %f155, [%rd60+1536];\n" -" add.ftz.f32 %f144, %f155, %f144;\n" -" st.shared.f32 [%rd54+1536], %f144;\n" -" ld.shared.f32 %f156, [%rd60+2048];\n" -" add.ftz.f32 %f145, %f156, %f145;\n" -" st.shared.f32 [%rd54+2048], %f145;\n" -" ld.shared.f32 %f157, [%rd60+2560];\n" -" add.ftz.f32 %f151, %f157, %f151;\n" -" st.shared.f32 [%rd54+2560], %f151;\n" -"$Lt_1_34050:\n" -" shr.u32 %r79, %r79, 1;\n" -" mov.u32 %r81, 0;\n" -" setp.ne.u32 %p19, %r79, %r81;\n" -" @%p19 bra $Lt_1_33794;\n" -"$Lt_1_33282:\n" -" mov.f32 %f11, %f141;\n" -" mov.f32 %f13, %f142;\n" -" mov.f32 %f15, %f143;\n" -" mov.f32 %f17, %f144;\n" -" mov.f32 %f19, %f145;\n" -" mov.f32 %f21, %f151;\n" -"$Lt_1_32770:\n" -"$Lt_1_30722:\n" -" mov.u32 %r82, 0;\n" -" setp.ne.s32 %p20, %r18, %r82;\n" -" @%p20 bra $Lt_1_34818;\n" -" ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];\n" -" add.u64 %rd62, %rd61, %rd18;\n" -" ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r84, 0;\n" -" setp.le.s32 %p21, %r83, %r84;\n" -" @%p21 bra $Lt_1_35330;\n" -" st.global.f32 [%rd62+0], %f40;\n" -" cvt.s64.s32 %rd63, %r13;\n" -" mul.wide.s32 %rd64, %r13, 4;\n" -" add.u64 %rd65, %rd64, %rd62;\n" -" st.global.f32 [%rd65+0], %f39;\n" -" add.u64 %rd62, %rd64, %rd65;\n" -"$Lt_1_35330:\n" -" ld.param.s32 %r85, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r86, 0;\n" -" setp.le.s32 %p22, %r85, %r86;\n" -" @%p22 bra $Lt_1_35842;\n" -" mov.f32 %f158, %f11;\n" -" st.global.f32 [%rd62+0], %f158;\n" -" cvt.s64.s32 %rd66, %r13;\n" -" mul.wide.s32 %rd67, %r13, 4;\n" -" add.u64 %rd68, %rd67, %rd62;\n" -" mov.f32 %f159, %f13;\n" -" st.global.f32 [%rd68+0], %f159;\n" -" add.u64 %rd69, %rd67, %rd68;\n" -" mov.f32 %f160, %f15;\n" -" st.global.f32 [%rd69+0], %f160;\n" -" add.u64 %rd70, %rd67, %rd69;\n" -" mov.f32 %f161, %f17;\n" -" st.global.f32 [%rd70+0], %f161;\n" -" add.u64 %rd62, %rd67, %rd70;\n" -" mov.f32 %f162, %f19;\n" -" st.global.f32 [%rd62+0], %f162;\n" -" mov.f32 %f163, %f21;\n" -" add.u64 %rd71, %rd67, %rd62;\n" -" st.global.f32 [%rd71+0], %f163;\n" -"$Lt_1_35842:\n" -" ld.param.u64 %rd72, [__cudaparm_kernel_pair_fast_ans];\n" -" mul.lo.u64 %rd73, %rd17, 16;\n" -" add.u64 %rd74, %rd72, %rd73;\n" -" mov.f32 %f164, %f165;\n" -" st.global.v4.f32 [%rd74+0], {%f38,%f37,%f36,%f164};\n" -"$Lt_1_34818:\n" -"$Lt_1_25602:\n" -" .loc 16 247 0\n" -" exit;\n" -"$LDWend_kernel_pair_fast:\n" -" }\n" -; diff --git a/lib/gpu/lj_coul_ptx.h b/lib/gpu/lj_coul_ptx.h deleted file mode 100644 index 1241877c42..0000000000 --- a/lib/gpu/lj_coul_ptx.h +++ /dev/null @@ -1,1002 +0,0 @@ -const char * lj_coul = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .global .texref pos_tex;\n" -" .global .texref q_tex;\n" -" .entry kernel_pair (\n" -" .param .u64 __cudaparm_kernel_pair_x_,\n" -" .param .u64 __cudaparm_kernel_pair_lj1,\n" -" .param .u64 __cudaparm_kernel_pair_lj3,\n" -" .param .s32 __cudaparm_kernel_pair_lj_types,\n" -" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_ans,\n" -" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_inum,\n" -" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" -" .param .u64 __cudaparm_kernel_pair_q_,\n" -" .param .u64 __cudaparm_kernel_pair_cutsq,\n" -" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n" -" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" -" {\n" -" .reg .u32 %r<86>;\n" -" .reg .u64 %rd<67>;\n" -" .reg .f32 %f<130>;\n" -" .reg .pred %p<21>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32541_33_non_const_sp_lj112[32];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32626_55_non_const_red_acc144[3072];\n" -" .loc 16 36 0\n" -"$LDWbegin_kernel_pair:\n" -" .loc 16 41 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" -" ldu.global.f32 %f1, [%rd1+0];\n" -" .loc 16 42 0\n" -" ld.global.f32 %f2, [%rd1+4];\n" -" .loc 16 43 0\n" -" ld.global.f32 %f3, [%rd1+8];\n" -" .loc 16 44 0\n" -" ld.global.f32 %f4, [%rd1+12];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_lj112+0], {%f1,%f2,%f3,%f4};\n" -" .loc 16 45 0\n" -" ld.global.f32 %f5, [%rd1+16];\n" -" .loc 16 46 0\n" -" ld.global.f32 %f6, [%rd1+20];\n" -" .loc 16 47 0\n" -" ld.global.f32 %f7, [%rd1+24];\n" -" .loc 16 48 0\n" -" ld.global.f32 %f8, [%rd1+28];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_lj112+16], {%f5,%f6,%f7,%f8};\n" -" .loc 16 56 0\n" -" mov.f32 %f9, 0f00000000; \n" -" mov.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f00000000; \n" -" mov.f32 %f12, %f11;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, %f13;\n" -" mov.f32 %f15, 0f00000000; \n" -" mov.f32 %f16, %f15;\n" -" mov.f32 %f17, 0f00000000; \n" -" mov.f32 %f18, %f17;\n" -" mov.f32 %f19, 0f00000000; \n" -" mov.f32 %f20, %f19;\n" -" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n" -" setp.ge.s32 %p1, %r8, %r9;\n" -" @%p1 bra $Lt_0_30210;\n" -" .loc 16 61 0\n" -" cvt.s64.s32 %rd2, %r8;\n" -" mul.wide.s32 %rd3, %r8, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n" -" add.u64 %rd5, %rd3, %rd4;\n" -" ld.global.s32 %r10, [%rd5+0];\n" -" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n" -" cvt.s64.s32 %rd6, %r11;\n" -" mul.wide.s32 %rd7, %r11, 4;\n" -" add.u64 %rd8, %rd7, %rd5;\n" -" ld.global.s32 %r12, [%rd8+0];\n" -" sub.s32 %r13, %r1, 1;\n" -" and.b32 %r14, %r13, %r2;\n" -" cvt.s64.s32 %rd9, %r14;\n" -" mul.wide.s32 %rd10, %r14, 4;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n" -" setp.ne.u64 %p2, %rd11, %rd4;\n" -" @%p2 bra $Lt_0_21762;\n" -" cvt.s32.s64 %r15, %rd6;\n" -" mul.lo.s32 %r16, %r15, %r1;\n" -" mov.s32 %r17, %r16;\n" -" mul.lo.s32 %r18, %r13, %r8;\n" -" add.s32 %r19, %r15, %r18;\n" -" cvt.s64.s32 %rd12, %r19;\n" -" mul.wide.s32 %rd13, %r19, 4;\n" -" add.u64 %rd14, %rd8, %rd13;\n" -" and.b32 %r20, %r13, %r12;\n" -" cvt.s64.s32 %rd15, %r20;\n" -" div.s32 %r21, %r12, %r1;\n" -" mul.lo.s32 %r22, %r16, %r21;\n" -" cvt.s64.s32 %rd16, %r22;\n" -" add.u64 %rd17, %rd15, %rd16;\n" -" mul.lo.u64 %rd18, %rd17, 4;\n" -" add.u64 %rd19, %rd14, %rd18;\n" -" add.u64 %rd20, %rd10, %rd14;\n" -" bra.uni $Lt_0_21506;\n" -"$Lt_0_21762:\n" -" add.u64 %rd21, %rd7, %rd8;\n" -" ld.global.s32 %r23, [%rd21+0];\n" -" cvt.s64.s32 %rd22, %r23;\n" -" mul.wide.s32 %rd23, %r23, 4;\n" -" add.u64 %rd24, %rd11, %rd23;\n" -" cvt.s64.s32 %rd25, %r12;\n" -" mul.wide.s32 %rd26, %r12, 4;\n" -" add.u64 %rd19, %rd24, %rd26;\n" -" mov.s32 %r17, %r1;\n" -" add.u64 %rd20, %rd10, %rd24;\n" -"$Lt_0_21506:\n" -" .loc 16 64 0\n" -" mov.u32 %r24, %r10;\n" -" mov.s32 %r25, 0;\n" -" mov.u32 %r26, %r25;\n" -" mov.s32 %r27, 0;\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r24,%r26,%r28,%r30}];\n" -" mov.f32 %f25, %f21;\n" -" mov.f32 %f26, %f22;\n" -" mov.f32 %f27, %f23;\n" -" mov.f32 %f28, %f24;\n" -" .loc 16 65 0\n" -" mov.u32 %r31, %r10;\n" -" mov.s32 %r32, 0;\n" -" mov.u32 %r33, %r32;\n" -" mov.s32 %r34, 0;\n" -" mov.u32 %r35, %r34;\n" -" mov.s32 %r36, 0;\n" -" mov.u32 %r37, %r36;\n" -" tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r31,%r33,%r35,%r37}];\n" -" mov.f32 %f33, %f29;\n" -" setp.ge.u64 %p3, %rd20, %rd19;\n" -" @%p3 bra $Lt_0_31746;\n" -" cvt.rzi.ftz.s32.f32 %r38, %f28;\n" -" cvt.s64.s32 %rd27, %r17;\n" -" ld.param.s32 %r39, [__cudaparm_kernel_pair_lj_types];\n" -" mul.lo.s32 %r40, %r39, %r38;\n" -" ld.param.u64 %rd28, [__cudaparm_kernel_pair_cutsq];\n" -" mov.f32 %f34, 0f00000000; \n" -" mov.f32 %f35, 0f00000000; \n" -" mov.f32 %f36, 0f00000000; \n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -" mov.u64 %rd29, __cuda___cuda_local_var_32541_33_non_const_sp_lj112;\n" -"$Lt_0_22530:\n" -" .loc 16 69 0\n" -" ld.global.s32 %r41, [%rd20+0];\n" -" .loc 16 72 0\n" -" shr.s32 %r42, %r41, 30;\n" -" and.b32 %r43, %r42, 3;\n" -" cvt.s64.s32 %rd30, %r43;\n" -" mul.wide.s32 %rd31, %r43, 4;\n" -" add.u64 %rd32, %rd29, %rd31;\n" -" ld.shared.f32 %f39, [%rd32+0];\n" -" .loc 16 76 0\n" -" and.b32 %r44, %r41, 1073741823;\n" -" mov.u32 %r45, %r44;\n" -" mov.s32 %r46, 0;\n" -" mov.u32 %r47, %r46;\n" -" mov.s32 %r48, 0;\n" -" mov.u32 %r49, %r48;\n" -" mov.s32 %r50, 0;\n" -" mov.u32 %r51, %r50;\n" -" tex.1d.v4.f32.s32 {%f40,%f41,%f42,%f43},[pos_tex,{%r45,%r47,%r49,%r51}];\n" -" mov.f32 %f44, %f40;\n" -" mov.f32 %f45, %f41;\n" -" mov.f32 %f46, %f42;\n" -" mov.f32 %f47, %f43;\n" -" cvt.rzi.ftz.s32.f32 %r52, %f47;\n" -" sub.ftz.f32 %f48, %f26, %f45;\n" -" sub.ftz.f32 %f49, %f25, %f44;\n" -" sub.ftz.f32 %f50, %f27, %f46;\n" -" mul.ftz.f32 %f51, %f48, %f48;\n" -" fma.rn.ftz.f32 %f52, %f49, %f49, %f51;\n" -" add.s32 %r53, %r52, %r40;\n" -" cvt.s64.s32 %rd33, %r53;\n" -" fma.rn.ftz.f32 %f53, %f50, %f50, %f52;\n" -" mul.wide.s32 %rd34, %r53, 4;\n" -" add.u64 %rd35, %rd28, %rd34;\n" -" ld.global.f32 %f54, [%rd35+0];\n" -" setp.gt.ftz.f32 %p4, %f54, %f53;\n" -" @!%p4 bra $Lt_0_25346;\n" -" mul.lo.u64 %rd36, %rd33, 16;\n" -" rcp.approx.ftz.f32 %f55, %f53;\n" -" ld.param.u64 %rd37, [__cudaparm_kernel_pair_lj1];\n" -" add.u64 %rd38, %rd37, %rd36;\n" -" ld.global.f32 %f56, [%rd38+8];\n" -" setp.lt.ftz.f32 %p5, %f53, %f56;\n" -" @!%p5 bra $Lt_0_23554;\n" -" .loc 16 91 0\n" -" mul.ftz.f32 %f57, %f55, %f55;\n" -" mul.ftz.f32 %f58, %f55, %f57;\n" -" mov.f32 %f59, %f58;\n" -" .loc 16 92 0\n" -" mul.ftz.f32 %f60, %f58, %f39;\n" -" ld.global.v2.f32 {%f61,%f62}, [%rd38+0];\n" -" mul.ftz.f32 %f63, %f61, %f58;\n" -" sub.ftz.f32 %f64, %f63, %f62;\n" -" mul.ftz.f32 %f65, %f60, %f64;\n" -" bra.uni $Lt_0_23298;\n" -"$Lt_0_23554:\n" -" .loc 16 94 0\n" -" mov.f32 %f65, 0f00000000; \n" -"$Lt_0_23298:\n" -" ld.global.f32 %f66, [%rd38+12];\n" -" setp.gt.ftz.f32 %p6, %f66, %f53;\n" -" @!%p6 bra $Lt_0_24066;\n" -" .loc 16 97 0\n" -" mov.u32 %r54, %r44;\n" -" mov.s32 %r55, 0;\n" -" mov.u32 %r56, %r55;\n" -" mov.s32 %r57, 0;\n" -" mov.u32 %r58, %r57;\n" -" mov.s32 %r59, 0;\n" -" mov.u32 %r60, %r59;\n" -" tex.1d.v4.f32.s32 {%f67,%f68,%f69,%f70},[q_tex,{%r54,%r56,%r58,%r60}];\n" -" mov.f32 %f71, %f67;\n" -" ld.shared.f32 %f72, [%rd32+16];\n" -" ld.param.f32 %f73, [__cudaparm_kernel_pair_qqrd2e];\n" -" mul.ftz.f32 %f74, %f73, %f33;\n" -" mul.ftz.f32 %f75, %f71, %f74;\n" -" rsqrt.approx.ftz.f32 %f76, %f53;\n" -" mul.ftz.f32 %f77, %f75, %f76;\n" -" mul.ftz.f32 %f78, %f72, %f77;\n" -" bra.uni $Lt_0_23810;\n" -"$Lt_0_24066:\n" -" .loc 16 99 0\n" -" mov.f32 %f78, 0f00000000; \n" -"$Lt_0_23810:\n" -" .loc 16 103 0\n" -" add.ftz.f32 %f79, %f78, %f65;\n" -" mul.ftz.f32 %f80, %f79, %f55;\n" -" fma.rn.ftz.f32 %f36, %f49, %f80, %f36;\n" -" .loc 16 104 0\n" -" fma.rn.ftz.f32 %f35, %f48, %f80, %f35;\n" -" .loc 16 105 0\n" -" fma.rn.ftz.f32 %f34, %f50, %f80, %f34;\n" -" ld.param.s32 %r61, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r62, 0;\n" -" setp.le.s32 %p7, %r61, %r62;\n" -" @%p7 bra $Lt_0_24834;\n" -" .loc 16 108 0\n" -" add.ftz.f32 %f37, %f78, %f37;\n" -" @!%p5 bra $Lt_0_24834;\n" -" .loc 16 111 0\n" -" ld.param.u64 %rd39, [__cudaparm_kernel_pair_lj3];\n" -" add.u64 %rd40, %rd39, %rd36;\n" -" mov.f32 %f81, %f59;\n" -" ld.global.v4.f32 {%f82,%f83,%f84,_}, [%rd40+0];\n" -" mul.ftz.f32 %f85, %f82, %f81;\n" -" sub.ftz.f32 %f86, %f85, %f83;\n" -" mul.ftz.f32 %f87, %f81, %f86;\n" -" sub.ftz.f32 %f88, %f87, %f84;\n" -" fma.rn.ftz.f32 %f38, %f39, %f88, %f38;\n" -"$Lt_0_24834:\n" -"$Lt_0_24322:\n" -" ld.param.s32 %r63, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r64, 0;\n" -" setp.le.s32 %p8, %r63, %r64;\n" -" @%p8 bra $Lt_0_25346;\n" -" .loc 16 115 0\n" -" mov.f32 %f89, %f10;\n" -" mul.ftz.f32 %f90, %f49, %f49;\n" -" fma.rn.ftz.f32 %f91, %f80, %f90, %f89;\n" -" mov.f32 %f10, %f91;\n" -" .loc 16 116 0\n" -" mov.f32 %f92, %f12;\n" -" fma.rn.ftz.f32 %f93, %f80, %f51, %f92;\n" -" mov.f32 %f12, %f93;\n" -" .loc 16 117 0\n" -" mov.f32 %f94, %f14;\n" -" mul.ftz.f32 %f95, %f50, %f50;\n" -" fma.rn.ftz.f32 %f96, %f80, %f95, %f94;\n" -" mov.f32 %f14, %f96;\n" -" .loc 16 118 0\n" -" mov.f32 %f97, %f16;\n" -" mul.ftz.f32 %f98, %f48, %f49;\n" -" fma.rn.ftz.f32 %f99, %f80, %f98, %f97;\n" -" mov.f32 %f16, %f99;\n" -" .loc 16 119 0\n" -" mov.f32 %f100, %f18;\n" -" mul.ftz.f32 %f101, %f49, %f50;\n" -" fma.rn.ftz.f32 %f102, %f80, %f101, %f100;\n" -" mov.f32 %f18, %f102;\n" -" .loc 16 120 0\n" -" mul.ftz.f32 %f103, %f48, %f50;\n" -" fma.rn.ftz.f32 %f19, %f80, %f103, %f19;\n" -" mov.f32 %f20, %f19;\n" -"$Lt_0_25346:\n" -"$Lt_0_22786:\n" -" .loc 16 68 0\n" -" mul.lo.u64 %rd41, %rd27, 4;\n" -" add.u64 %rd20, %rd20, %rd41;\n" -" setp.lt.u64 %p9, %rd20, %rd19;\n" -" @%p9 bra $Lt_0_22530;\n" -" bra.uni $Lt_0_22018;\n" -"$Lt_0_31746:\n" -" mov.f32 %f34, 0f00000000; \n" -" mov.f32 %f35, 0f00000000; \n" -" mov.f32 %f36, 0f00000000; \n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -"$Lt_0_22018:\n" -" mov.u32 %r65, 1;\n" -" setp.le.s32 %p10, %r1, %r65;\n" -" @%p10 bra $Lt_0_28162;\n" -" .loc 16 125 0\n" -" mov.u64 %rd42, __cuda___cuda_local_var_32626_55_non_const_red_acc144;\n" -" cvt.s64.s32 %rd43, %r2;\n" -" mul.wide.s32 %rd44, %r2, 4;\n" -" add.u64 %rd45, %rd42, %rd44;\n" -" mov.f32 %f104, %f36;\n" -" st.shared.f32 [%rd45+0], %f104;\n" -" mov.f32 %f105, %f35;\n" -" st.shared.f32 [%rd45+512], %f105;\n" -" mov.f32 %f106, %f34;\n" -" st.shared.f32 [%rd45+1024], %f106;\n" -" mov.f32 %f107, %f38;\n" -" st.shared.f32 [%rd45+1536], %f107;\n" -" mov.f32 %f108, %f37;\n" -" st.shared.f32 [%rd45+2048], %f108;\n" -" shr.s32 %r66, %r1, 31;\n" -" mov.s32 %r67, 1;\n" -" and.b32 %r68, %r66, %r67;\n" -" add.s32 %r69, %r68, %r1;\n" -" shr.s32 %r70, %r69, 1;\n" -" mov.s32 %r71, %r70;\n" -" mov.u32 %r72, 0;\n" -" setp.ne.u32 %p11, %r70, %r72;\n" -" @!%p11 bra $Lt_0_26626;\n" -"$Lt_0_27138:\n" -" setp.ge.u32 %p12, %r14, %r71;\n" -" @%p12 bra $Lt_0_27394;\n" -" add.u32 %r73, %r2, %r71;\n" -" cvt.u64.u32 %rd46, %r73;\n" -" mul.wide.u32 %rd47, %r73, 4;\n" -" add.u64 %rd48, %rd42, %rd47;\n" -" ld.shared.f32 %f109, [%rd48+0];\n" -" add.ftz.f32 %f104, %f109, %f104;\n" -" st.shared.f32 [%rd45+0], %f104;\n" -" ld.shared.f32 %f110, [%rd48+512];\n" -" add.ftz.f32 %f105, %f110, %f105;\n" -" st.shared.f32 [%rd45+512], %f105;\n" -" ld.shared.f32 %f111, [%rd48+1024];\n" -" add.ftz.f32 %f106, %f111, %f106;\n" -" st.shared.f32 [%rd45+1024], %f106;\n" -" ld.shared.f32 %f112, [%rd48+1536];\n" -" add.ftz.f32 %f107, %f112, %f107;\n" -" st.shared.f32 [%rd45+1536], %f107;\n" -" ld.shared.f32 %f113, [%rd48+2048];\n" -" add.ftz.f32 %f108, %f113, %f108;\n" -" st.shared.f32 [%rd45+2048], %f108;\n" -"$Lt_0_27394:\n" -" shr.u32 %r71, %r71, 1;\n" -" mov.u32 %r74, 0;\n" -" setp.ne.u32 %p13, %r71, %r74;\n" -" @%p13 bra $Lt_0_27138;\n" -"$Lt_0_26626:\n" -" mov.f32 %f36, %f104;\n" -" mov.f32 %f35, %f105;\n" -" mov.f32 %f34, %f106;\n" -" mov.f32 %f38, %f107;\n" -" mov.f32 %f37, %f108;\n" -" ld.param.s32 %r75, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r76, 0;\n" -" setp.le.s32 %p14, %r75, %r76;\n" -" @%p14 bra $Lt_0_28162;\n" -" mov.f32 %f104, %f10;\n" -" st.shared.f32 [%rd45+0], %f104;\n" -" mov.f32 %f105, %f12;\n" -" st.shared.f32 [%rd45+512], %f105;\n" -" mov.f32 %f106, %f14;\n" -" st.shared.f32 [%rd45+1024], %f106;\n" -" mov.f32 %f107, %f16;\n" -" st.shared.f32 [%rd45+1536], %f107;\n" -" mov.f32 %f108, %f18;\n" -" st.shared.f32 [%rd45+2048], %f108;\n" -" mov.f32 %f114, %f19;\n" -" st.shared.f32 [%rd45+2560], %f114;\n" -" mov.s32 %r77, %r70;\n" -" @!%p11 bra $Lt_0_28674;\n" -"$Lt_0_29186:\n" -" setp.ge.u32 %p15, %r14, %r77;\n" -" @%p15 bra $Lt_0_29442;\n" -" add.u32 %r78, %r2, %r77;\n" -" cvt.u64.u32 %rd49, %r78;\n" -" mul.wide.u32 %rd50, %r78, 4;\n" -" add.u64 %rd51, %rd42, %rd50;\n" -" ld.shared.f32 %f115, [%rd51+0];\n" -" add.ftz.f32 %f104, %f115, %f104;\n" -" st.shared.f32 [%rd45+0], %f104;\n" -" ld.shared.f32 %f116, [%rd51+512];\n" -" add.ftz.f32 %f105, %f116, %f105;\n" -" st.shared.f32 [%rd45+512], %f105;\n" -" ld.shared.f32 %f117, [%rd51+1024];\n" -" add.ftz.f32 %f106, %f117, %f106;\n" -" st.shared.f32 [%rd45+1024], %f106;\n" -" ld.shared.f32 %f118, [%rd51+1536];\n" -" add.ftz.f32 %f107, %f118, %f107;\n" -" st.shared.f32 [%rd45+1536], %f107;\n" -" ld.shared.f32 %f119, [%rd51+2048];\n" -" add.ftz.f32 %f108, %f119, %f108;\n" -" st.shared.f32 [%rd45+2048], %f108;\n" -" ld.shared.f32 %f120, [%rd51+2560];\n" -" add.ftz.f32 %f114, %f120, %f114;\n" -" st.shared.f32 [%rd45+2560], %f114;\n" -"$Lt_0_29442:\n" -" shr.u32 %r77, %r77, 1;\n" -" mov.u32 %r79, 0;\n" -" setp.ne.u32 %p16, %r77, %r79;\n" -" @%p16 bra $Lt_0_29186;\n" -"$Lt_0_28674:\n" -" mov.f32 %f10, %f104;\n" -" mov.f32 %f12, %f105;\n" -" mov.f32 %f14, %f106;\n" -" mov.f32 %f16, %f107;\n" -" mov.f32 %f18, %f108;\n" -" mov.f32 %f20, %f114;\n" -"$Lt_0_28162:\n" -"$Lt_0_26114:\n" -" mov.u32 %r80, 0;\n" -" setp.ne.s32 %p17, %r14, %r80;\n" -" @%p17 bra $Lt_0_30210;\n" -" ld.param.u64 %rd52, [__cudaparm_kernel_pair___val_paramengv];\n" -" add.u64 %rd53, %rd52, %rd3;\n" -" ld.param.s32 %r81, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r82, 0;\n" -" setp.le.s32 %p18, %r81, %r82;\n" -" @%p18 bra $Lt_0_30722;\n" -" st.global.f32 [%rd53+0], %f38;\n" -" cvt.s64.s32 %rd54, %r9;\n" -" mul.wide.s32 %rd55, %r9, 4;\n" -" add.u64 %rd56, %rd55, %rd53;\n" -" st.global.f32 [%rd56+0], %f37;\n" -" add.u64 %rd53, %rd55, %rd56;\n" -"$Lt_0_30722:\n" -" ld.param.s32 %r83, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r84, 0;\n" -" setp.le.s32 %p19, %r83, %r84;\n" -" @%p19 bra $Lt_0_31234;\n" -" mov.f32 %f121, %f10;\n" -" st.global.f32 [%rd53+0], %f121;\n" -" cvt.s64.s32 %rd57, %r9;\n" -" mul.wide.s32 %rd58, %r9, 4;\n" -" add.u64 %rd59, %rd58, %rd53;\n" -" mov.f32 %f122, %f12;\n" -" st.global.f32 [%rd59+0], %f122;\n" -" add.u64 %rd60, %rd58, %rd59;\n" -" mov.f32 %f123, %f14;\n" -" st.global.f32 [%rd60+0], %f123;\n" -" add.u64 %rd61, %rd58, %rd60;\n" -" mov.f32 %f124, %f16;\n" -" st.global.f32 [%rd61+0], %f124;\n" -" add.u64 %rd53, %rd58, %rd61;\n" -" mov.f32 %f125, %f18;\n" -" st.global.f32 [%rd53+0], %f125;\n" -" mov.f32 %f126, %f20;\n" -" add.u64 %rd62, %rd58, %rd53;\n" -" st.global.f32 [%rd62+0], %f126;\n" -"$Lt_0_31234:\n" -" ld.param.u64 %rd63, [__cudaparm_kernel_pair_ans];\n" -" mul.lo.u64 %rd64, %rd2, 16;\n" -" add.u64 %rd65, %rd63, %rd64;\n" -" mov.f32 %f127, %f128;\n" -" st.global.v4.f32 [%rd65+0], {%f36,%f35,%f34,%f127};\n" -"$Lt_0_30210:\n" -"$Lt_0_20994:\n" -" .loc 16 128 0\n" -" exit;\n" -"$LDWend_kernel_pair:\n" -" }\n" -" .entry kernel_pair_fast (\n" -" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" -" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" -" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" -" .param .u64 __cudaparm_kernel_pair_fast_q_,\n" -" .param .u64 __cudaparm_kernel_pair_fast__cutsq,\n" -" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n" -" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" -" {\n" -" .reg .u32 %r<88>;\n" -" .reg .u64 %rd<83>;\n" -" .reg .f32 %f<134>;\n" -" .reg .pred %p<24>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32646_33_non_const_sp_lj3320[32];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32643_34_non_const_lj13360[1936];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32645_33_non_const_cutsq5296[484];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32644_34_non_const_lj35792[1936];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32733_55_non_const_red_acc7728[3072];\n" -" .loc 16 138 0\n" -"$LDWbegin_kernel_pair_fast:\n" -" cvt.s32.u32 %r1, %tid.x;\n" -" mov.u32 %r2, 7;\n" -" setp.gt.s32 %p1, %r1, %r2;\n" -" @%p1 bra $Lt_1_23298;\n" -" .loc 16 147 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32646_33_non_const_sp_lj3320;\n" -" cvt.s64.s32 %rd2, %r1;\n" -" mul.wide.s32 %rd3, %r1, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f32 %f1, [%rd5+0];\n" -" add.u64 %rd6, %rd3, %rd1;\n" -" st.shared.f32 [%rd6+0], %f1;\n" -"$Lt_1_23298:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32646_33_non_const_sp_lj3320;\n" -" mov.u32 %r3, 120;\n" -" setp.gt.s32 %p2, %r1, %r3;\n" -" @%p2 bra $Lt_1_23810;\n" -" .loc 16 149 0\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32643_34_non_const_lj13360;\n" -" mov.u64 %rd8, __cuda___cuda_local_var_32645_33_non_const_cutsq5296;\n" -" cvt.s64.s32 %rd9, %r1;\n" -" mul.wide.s32 %rd10, %r1, 16;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_pair_fast_lj1_in];\n" -" add.u64 %rd12, %rd11, %rd10;\n" -" add.u64 %rd13, %rd10, %rd7;\n" -" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd12+0];\n" -" st.shared.v4.f32 [%rd13+0], {%f2,%f3,%f4,%f5};\n" -" .loc 16 150 0\n" -" mul.wide.s32 %rd14, %r1, 4;\n" -" ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast__cutsq];\n" -" add.u64 %rd16, %rd15, %rd14;\n" -" ld.global.f32 %f6, [%rd16+0];\n" -" add.u64 %rd17, %rd14, %rd8;\n" -" st.shared.f32 [%rd17+0], %f6;\n" -" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r5, 0;\n" -" setp.le.s32 %p3, %r4, %r5;\n" -" @%p3 bra $Lt_1_24322;\n" -" .loc 16 152 0\n" -" mov.u64 %rd18, __cuda___cuda_local_var_32644_34_non_const_lj35792;\n" -" ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_lj3_in];\n" -" add.u64 %rd20, %rd19, %rd10;\n" -" add.u64 %rd21, %rd10, %rd18;\n" -" ld.global.v4.f32 {%f7,%f8,%f9,%f10}, [%rd20+0];\n" -" st.shared.v4.f32 [%rd21+0], {%f7,%f8,%f9,%f10};\n" -"$Lt_1_24322:\n" -" mov.u64 %rd18, __cuda___cuda_local_var_32644_34_non_const_lj35792;\n" -"$Lt_1_23810:\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32643_34_non_const_lj13360;\n" -" mov.u64 %rd8, __cuda___cuda_local_var_32645_33_non_const_cutsq5296;\n" -" mov.u64 %rd18, __cuda___cuda_local_var_32644_34_non_const_lj35792;\n" -" .loc 16 161 0\n" -" mov.f32 %f11, 0f00000000; \n" -" mov.f32 %f12, %f11;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, %f13;\n" -" mov.f32 %f15, 0f00000000; \n" -" mov.f32 %f16, %f15;\n" -" mov.f32 %f17, 0f00000000; \n" -" mov.f32 %f18, %f17;\n" -" mov.f32 %f19, 0f00000000; \n" -" mov.f32 %f20, %f19;\n" -" mov.f32 %f21, 0f00000000; \n" -" mov.f32 %f22, %f21;\n" -" .loc 16 163 0\n" -" bar.sync 0;\n" -" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n" -" div.s32 %r7, %r1, %r6;\n" -" cvt.s32.u32 %r8, %ntid.x;\n" -" div.s32 %r9, %r8, %r6;\n" -" cvt.s32.u32 %r10, %ctaid.x;\n" -" mul.lo.s32 %r11, %r10, %r9;\n" -" add.s32 %r12, %r7, %r11;\n" -" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n" -" setp.ge.s32 %p4, %r12, %r13;\n" -" @%p4 bra $Lt_1_34050;\n" -" .loc 16 168 0\n" -" cvt.s64.s32 %rd22, %r12;\n" -" mul.wide.s32 %rd23, %r12, 4;\n" -" ld.param.u64 %rd24, [__cudaparm_kernel_pair_fast_dev_nbor];\n" -" add.u64 %rd25, %rd23, %rd24;\n" -" ld.global.s32 %r14, [%rd25+0];\n" -" ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" -" cvt.s64.s32 %rd26, %r15;\n" -" mul.wide.s32 %rd27, %r15, 4;\n" -" add.u64 %rd28, %rd27, %rd25;\n" -" ld.global.s32 %r16, [%rd28+0];\n" -" sub.s32 %r17, %r6, 1;\n" -" and.b32 %r18, %r17, %r1;\n" -" cvt.s64.s32 %rd29, %r18;\n" -" mul.wide.s32 %rd30, %r18, 4;\n" -" ld.param.u64 %rd31, [__cudaparm_kernel_pair_fast_dev_packed];\n" -" setp.ne.u64 %p5, %rd31, %rd24;\n" -" @%p5 bra $Lt_1_25602;\n" -" cvt.s32.s64 %r19, %rd26;\n" -" mul.lo.s32 %r20, %r19, %r6;\n" -" mov.s32 %r21, %r20;\n" -" mul.lo.s32 %r22, %r17, %r12;\n" -" add.s32 %r23, %r19, %r22;\n" -" cvt.s64.s32 %rd32, %r23;\n" -" mul.wide.s32 %rd33, %r23, 4;\n" -" add.u64 %rd34, %rd28, %rd33;\n" -" and.b32 %r24, %r17, %r16;\n" -" cvt.s64.s32 %rd35, %r24;\n" -" div.s32 %r25, %r16, %r6;\n" -" mul.lo.s32 %r26, %r20, %r25;\n" -" cvt.s64.s32 %rd36, %r26;\n" -" add.u64 %rd37, %rd35, %rd36;\n" -" mul.lo.u64 %rd38, %rd37, 4;\n" -" add.u64 %rd39, %rd34, %rd38;\n" -" add.u64 %rd40, %rd30, %rd34;\n" -" bra.uni $Lt_1_25346;\n" -"$Lt_1_25602:\n" -" add.u64 %rd41, %rd27, %rd28;\n" -" ld.global.s32 %r27, [%rd41+0];\n" -" cvt.s64.s32 %rd42, %r27;\n" -" mul.wide.s32 %rd43, %r27, 4;\n" -" add.u64 %rd44, %rd31, %rd43;\n" -" cvt.s64.s32 %rd45, %r16;\n" -" mul.wide.s32 %rd46, %r16, 4;\n" -" add.u64 %rd39, %rd44, %rd46;\n" -" mov.s32 %r21, %r6;\n" -" add.u64 %rd40, %rd30, %rd44;\n" -"$Lt_1_25346:\n" -" .loc 16 171 0\n" -" mov.u32 %r28, %r14;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" mov.s32 %r31, 0;\n" -" mov.u32 %r32, %r31;\n" -" mov.s32 %r33, 0;\n" -" mov.u32 %r34, %r33;\n" -" tex.1d.v4.f32.s32 {%f23,%f24,%f25,%f26},[pos_tex,{%r28,%r30,%r32,%r34}];\n" -" mov.f32 %f27, %f23;\n" -" mov.f32 %f28, %f24;\n" -" mov.f32 %f29, %f25;\n" -" mov.f32 %f30, %f26;\n" -" .loc 16 172 0\n" -" mov.u32 %r35, %r14;\n" -" mov.s32 %r36, 0;\n" -" mov.u32 %r37, %r36;\n" -" mov.s32 %r38, 0;\n" -" mov.u32 %r39, %r38;\n" -" mov.s32 %r40, 0;\n" -" mov.u32 %r41, %r40;\n" -" tex.1d.v4.f32.s32 {%f31,%f32,%f33,%f34},[q_tex,{%r35,%r37,%r39,%r41}];\n" -" mov.f32 %f35, %f31;\n" -" setp.ge.u64 %p6, %rd40, %rd39;\n" -" @%p6 bra $Lt_1_35586;\n" -" cvt.rzi.ftz.s32.f32 %r42, %f30;\n" -" cvt.s64.s32 %rd47, %r21;\n" -" mul.lo.s32 %r43, %r42, 11;\n" -" cvt.rn.f32.s32 %f36, %r43;\n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -" mov.f32 %f39, 0f00000000; \n" -" mov.f32 %f40, 0f00000000; \n" -" mov.f32 %f41, 0f00000000; \n" -"$Lt_1_26370:\n" -" .loc 16 177 0\n" -" ld.global.s32 %r44, [%rd40+0];\n" -" .loc 16 180 0\n" -" shr.s32 %r45, %r44, 30;\n" -" and.b32 %r46, %r45, 3;\n" -" cvt.s64.s32 %rd48, %r46;\n" -" mul.wide.s32 %rd49, %r46, 4;\n" -" add.u64 %rd50, %rd1, %rd49;\n" -" ld.shared.f32 %f42, [%rd50+0];\n" -" .loc 16 184 0\n" -" and.b32 %r47, %r44, 1073741823;\n" -" mov.u32 %r48, %r47;\n" -" mov.s32 %r49, 0;\n" -" mov.u32 %r50, %r49;\n" -" mov.s32 %r51, 0;\n" -" mov.u32 %r52, %r51;\n" -" mov.s32 %r53, 0;\n" -" mov.u32 %r54, %r53;\n" -" tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r48,%r50,%r52,%r54}];\n" -" mov.f32 %f47, %f43;\n" -" mov.f32 %f48, %f44;\n" -" mov.f32 %f49, %f45;\n" -" mov.f32 %f50, %f46;\n" -" sub.ftz.f32 %f51, %f28, %f48;\n" -" sub.ftz.f32 %f52, %f27, %f47;\n" -" sub.ftz.f32 %f53, %f29, %f49;\n" -" mul.ftz.f32 %f54, %f51, %f51;\n" -" fma.rn.ftz.f32 %f55, %f52, %f52, %f54;\n" -" fma.rn.ftz.f32 %f56, %f53, %f53, %f55;\n" -" add.ftz.f32 %f57, %f36, %f50;\n" -" cvt.rzi.ftz.s32.f32 %r55, %f57;\n" -" cvt.s64.s32 %rd51, %r55;\n" -" mul.wide.s32 %rd52, %r55, 4;\n" -" add.u64 %rd53, %rd8, %rd52;\n" -" ld.shared.f32 %f58, [%rd53+0];\n" -" setp.gt.ftz.f32 %p7, %f58, %f56;\n" -" @!%p7 bra $Lt_1_29186;\n" -" rcp.approx.ftz.f32 %f59, %f56;\n" -" mul.lo.u64 %rd54, %rd51, 16;\n" -" add.u64 %rd55, %rd54, %rd7;\n" -" ld.shared.f32 %f60, [%rd55+8];\n" -" setp.lt.ftz.f32 %p8, %f56, %f60;\n" -" @!%p8 bra $Lt_1_27394;\n" -" .loc 16 198 0\n" -" mul.ftz.f32 %f61, %f59, %f59;\n" -" mul.ftz.f32 %f62, %f59, %f61;\n" -" mov.f32 %f63, %f62;\n" -" .loc 16 199 0\n" -" mul.ftz.f32 %f64, %f62, %f42;\n" -" ld.shared.v2.f32 {%f65,%f66}, [%rd55+0];\n" -" mul.ftz.f32 %f67, %f65, %f62;\n" -" sub.ftz.f32 %f68, %f67, %f66;\n" -" mul.ftz.f32 %f69, %f64, %f68;\n" -" bra.uni $Lt_1_27138;\n" -"$Lt_1_27394:\n" -" .loc 16 201 0\n" -" mov.f32 %f69, 0f00000000; \n" -"$Lt_1_27138:\n" -" ld.shared.f32 %f70, [%rd55+12];\n" -" setp.gt.ftz.f32 %p9, %f70, %f56;\n" -" @!%p9 bra $Lt_1_27906;\n" -" .loc 16 204 0\n" -" mov.u32 %r56, %r47;\n" -" mov.s32 %r57, 0;\n" -" mov.u32 %r58, %r57;\n" -" mov.s32 %r59, 0;\n" -" mov.u32 %r60, %r59;\n" -" mov.s32 %r61, 0;\n" -" mov.u32 %r62, %r61;\n" -" tex.1d.v4.f32.s32 {%f71,%f72,%f73,%f74},[q_tex,{%r56,%r58,%r60,%r62}];\n" -" mov.f32 %f75, %f71;\n" -" ld.shared.f32 %f76, [%rd50+16];\n" -" ld.param.f32 %f77, [__cudaparm_kernel_pair_fast_qqrd2e];\n" -" mul.ftz.f32 %f78, %f77, %f35;\n" -" mul.ftz.f32 %f79, %f75, %f78;\n" -" rsqrt.approx.ftz.f32 %f80, %f56;\n" -" mul.ftz.f32 %f81, %f79, %f80;\n" -" mul.ftz.f32 %f82, %f76, %f81;\n" -" bra.uni $Lt_1_27650;\n" -"$Lt_1_27906:\n" -" .loc 16 206 0\n" -" mov.f32 %f82, 0f00000000; \n" -"$Lt_1_27650:\n" -" .loc 16 210 0\n" -" add.ftz.f32 %f83, %f82, %f69;\n" -" mul.ftz.f32 %f84, %f83, %f59;\n" -" fma.rn.ftz.f32 %f39, %f52, %f84, %f39;\n" -" .loc 16 211 0\n" -" fma.rn.ftz.f32 %f38, %f51, %f84, %f38;\n" -" .loc 16 212 0\n" -" fma.rn.ftz.f32 %f37, %f53, %f84, %f37;\n" -" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r64, 0;\n" -" setp.le.s32 %p10, %r63, %r64;\n" -" @%p10 bra $Lt_1_28674;\n" -" .loc 16 215 0\n" -" add.ftz.f32 %f40, %f82, %f40;\n" -" @!%p8 bra $Lt_1_28674;\n" -" .loc 16 217 0\n" -" add.u64 %rd56, %rd54, %rd18;\n" -" mov.f32 %f85, %f63;\n" -" ld.shared.v4.f32 {%f86,%f87,%f88,_}, [%rd56+0];\n" -" mul.ftz.f32 %f89, %f86, %f85;\n" -" sub.ftz.f32 %f90, %f89, %f87;\n" -" mul.ftz.f32 %f91, %f85, %f90;\n" -" .loc 16 218 0\n" -" sub.ftz.f32 %f92, %f91, %f88;\n" -" fma.rn.ftz.f32 %f41, %f42, %f92, %f41;\n" -"$Lt_1_28674:\n" -"$Lt_1_28162:\n" -" ld.param.s32 %r65, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r66, 0;\n" -" setp.le.s32 %p11, %r65, %r66;\n" -" @%p11 bra $Lt_1_29186;\n" -" .loc 16 222 0\n" -" mov.f32 %f93, %f12;\n" -" mul.ftz.f32 %f94, %f52, %f52;\n" -" fma.rn.ftz.f32 %f95, %f84, %f94, %f93;\n" -" mov.f32 %f12, %f95;\n" -" .loc 16 223 0\n" -" mov.f32 %f96, %f14;\n" -" fma.rn.ftz.f32 %f97, %f84, %f54, %f96;\n" -" mov.f32 %f14, %f97;\n" -" .loc 16 224 0\n" -" mov.f32 %f98, %f16;\n" -" mul.ftz.f32 %f99, %f53, %f53;\n" -" fma.rn.ftz.f32 %f100, %f84, %f99, %f98;\n" -" mov.f32 %f16, %f100;\n" -" .loc 16 225 0\n" -" mov.f32 %f101, %f18;\n" -" mul.ftz.f32 %f102, %f51, %f52;\n" -" fma.rn.ftz.f32 %f103, %f84, %f102, %f101;\n" -" mov.f32 %f18, %f103;\n" -" .loc 16 226 0\n" -" mov.f32 %f104, %f20;\n" -" mul.ftz.f32 %f105, %f52, %f53;\n" -" fma.rn.ftz.f32 %f106, %f84, %f105, %f104;\n" -" mov.f32 %f20, %f106;\n" -" .loc 16 227 0\n" -" mul.ftz.f32 %f107, %f51, %f53;\n" -" fma.rn.ftz.f32 %f21, %f84, %f107, %f21;\n" -" mov.f32 %f22, %f21;\n" -"$Lt_1_29186:\n" -"$Lt_1_26626:\n" -" .loc 16 176 0\n" -" mul.lo.u64 %rd57, %rd47, 4;\n" -" add.u64 %rd40, %rd40, %rd57;\n" -" setp.lt.u64 %p12, %rd40, %rd39;\n" -" @%p12 bra $Lt_1_26370;\n" -" bra.uni $Lt_1_25858;\n" -"$Lt_1_35586:\n" -" mov.f32 %f37, 0f00000000; \n" -" mov.f32 %f38, 0f00000000; \n" -" mov.f32 %f39, 0f00000000; \n" -" mov.f32 %f40, 0f00000000; \n" -" mov.f32 %f41, 0f00000000; \n" -"$Lt_1_25858:\n" -" mov.u32 %r67, 1;\n" -" setp.le.s32 %p13, %r6, %r67;\n" -" @%p13 bra $Lt_1_32002;\n" -" .loc 16 232 0\n" -" mov.u64 %rd58, __cuda___cuda_local_var_32733_55_non_const_red_acc7728;\n" -" cvt.s64.s32 %rd59, %r1;\n" -" mul.wide.s32 %rd60, %r1, 4;\n" -" add.u64 %rd61, %rd58, %rd60;\n" -" mov.f32 %f108, %f39;\n" -" st.shared.f32 [%rd61+0], %f108;\n" -" mov.f32 %f109, %f38;\n" -" st.shared.f32 [%rd61+512], %f109;\n" -" mov.f32 %f110, %f37;\n" -" st.shared.f32 [%rd61+1024], %f110;\n" -" mov.f32 %f111, %f41;\n" -" st.shared.f32 [%rd61+1536], %f111;\n" -" mov.f32 %f112, %f40;\n" -" st.shared.f32 [%rd61+2048], %f112;\n" -" shr.s32 %r68, %r6, 31;\n" -" mov.s32 %r69, 1;\n" -" and.b32 %r70, %r68, %r69;\n" -" add.s32 %r71, %r70, %r6;\n" -" shr.s32 %r72, %r71, 1;\n" -" mov.s32 %r73, %r72;\n" -" mov.u32 %r74, 0;\n" -" setp.ne.u32 %p14, %r72, %r74;\n" -" @!%p14 bra $Lt_1_30466;\n" -"$Lt_1_30978:\n" -" setp.ge.u32 %p15, %r18, %r73;\n" -" @%p15 bra $Lt_1_31234;\n" -" add.u32 %r75, %r1, %r73;\n" -" cvt.u64.u32 %rd62, %r75;\n" -" mul.wide.u32 %rd63, %r75, 4;\n" -" add.u64 %rd64, %rd58, %rd63;\n" -" ld.shared.f32 %f113, [%rd64+0];\n" -" add.ftz.f32 %f108, %f113, %f108;\n" -" st.shared.f32 [%rd61+0], %f108;\n" -" ld.shared.f32 %f114, [%rd64+512];\n" -" add.ftz.f32 %f109, %f114, %f109;\n" -" st.shared.f32 [%rd61+512], %f109;\n" -" ld.shared.f32 %f115, [%rd64+1024];\n" -" add.ftz.f32 %f110, %f115, %f110;\n" -" st.shared.f32 [%rd61+1024], %f110;\n" -" ld.shared.f32 %f116, [%rd64+1536];\n" -" add.ftz.f32 %f111, %f116, %f111;\n" -" st.shared.f32 [%rd61+1536], %f111;\n" -" ld.shared.f32 %f117, [%rd64+2048];\n" -" add.ftz.f32 %f112, %f117, %f112;\n" -" st.shared.f32 [%rd61+2048], %f112;\n" -"$Lt_1_31234:\n" -" shr.u32 %r73, %r73, 1;\n" -" mov.u32 %r76, 0;\n" -" setp.ne.u32 %p16, %r73, %r76;\n" -" @%p16 bra $Lt_1_30978;\n" -"$Lt_1_30466:\n" -" mov.f32 %f39, %f108;\n" -" mov.f32 %f38, %f109;\n" -" mov.f32 %f37, %f110;\n" -" mov.f32 %f41, %f111;\n" -" mov.f32 %f40, %f112;\n" -" ld.param.s32 %r77, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r78, 0;\n" -" setp.le.s32 %p17, %r77, %r78;\n" -" @%p17 bra $Lt_1_32002;\n" -" mov.f32 %f108, %f12;\n" -" st.shared.f32 [%rd61+0], %f108;\n" -" mov.f32 %f109, %f14;\n" -" st.shared.f32 [%rd61+512], %f109;\n" -" mov.f32 %f110, %f16;\n" -" st.shared.f32 [%rd61+1024], %f110;\n" -" mov.f32 %f111, %f18;\n" -" st.shared.f32 [%rd61+1536], %f111;\n" -" mov.f32 %f112, %f20;\n" -" st.shared.f32 [%rd61+2048], %f112;\n" -" mov.f32 %f118, %f21;\n" -" st.shared.f32 [%rd61+2560], %f118;\n" -" mov.s32 %r79, %r72;\n" -" @!%p14 bra $Lt_1_32514;\n" -"$Lt_1_33026:\n" -" setp.ge.u32 %p18, %r18, %r79;\n" -" @%p18 bra $Lt_1_33282;\n" -" add.u32 %r80, %r1, %r79;\n" -" cvt.u64.u32 %rd65, %r80;\n" -" mul.wide.u32 %rd66, %r80, 4;\n" -" add.u64 %rd67, %rd58, %rd66;\n" -" ld.shared.f32 %f119, [%rd67+0];\n" -" add.ftz.f32 %f108, %f119, %f108;\n" -" st.shared.f32 [%rd61+0], %f108;\n" -" ld.shared.f32 %f120, [%rd67+512];\n" -" add.ftz.f32 %f109, %f120, %f109;\n" -" st.shared.f32 [%rd61+512], %f109;\n" -" ld.shared.f32 %f121, [%rd67+1024];\n" -" add.ftz.f32 %f110, %f121, %f110;\n" -" st.shared.f32 [%rd61+1024], %f110;\n" -" ld.shared.f32 %f122, [%rd67+1536];\n" -" add.ftz.f32 %f111, %f122, %f111;\n" -" st.shared.f32 [%rd61+1536], %f111;\n" -" ld.shared.f32 %f123, [%rd67+2048];\n" -" add.ftz.f32 %f112, %f123, %f112;\n" -" st.shared.f32 [%rd61+2048], %f112;\n" -" ld.shared.f32 %f124, [%rd67+2560];\n" -" add.ftz.f32 %f118, %f124, %f118;\n" -" st.shared.f32 [%rd61+2560], %f118;\n" -"$Lt_1_33282:\n" -" shr.u32 %r79, %r79, 1;\n" -" mov.u32 %r81, 0;\n" -" setp.ne.u32 %p19, %r79, %r81;\n" -" @%p19 bra $Lt_1_33026;\n" -"$Lt_1_32514:\n" -" mov.f32 %f12, %f108;\n" -" mov.f32 %f14, %f109;\n" -" mov.f32 %f16, %f110;\n" -" mov.f32 %f18, %f111;\n" -" mov.f32 %f20, %f112;\n" -" mov.f32 %f22, %f118;\n" -"$Lt_1_32002:\n" -"$Lt_1_29954:\n" -" mov.u32 %r82, 0;\n" -" setp.ne.s32 %p20, %r18, %r82;\n" -" @%p20 bra $Lt_1_34050;\n" -" ld.param.u64 %rd68, [__cudaparm_kernel_pair_fast___val_paramengv];\n" -" add.u64 %rd69, %rd68, %rd23;\n" -" ld.param.s32 %r83, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r84, 0;\n" -" setp.le.s32 %p21, %r83, %r84;\n" -" @%p21 bra $Lt_1_34562;\n" -" st.global.f32 [%rd69+0], %f41;\n" -" cvt.s64.s32 %rd70, %r13;\n" -" mul.wide.s32 %rd71, %r13, 4;\n" -" add.u64 %rd72, %rd71, %rd69;\n" -" st.global.f32 [%rd72+0], %f40;\n" -" add.u64 %rd69, %rd71, %rd72;\n" -"$Lt_1_34562:\n" -" ld.param.s32 %r85, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r86, 0;\n" -" setp.le.s32 %p22, %r85, %r86;\n" -" @%p22 bra $Lt_1_35074;\n" -" mov.f32 %f125, %f12;\n" -" st.global.f32 [%rd69+0], %f125;\n" -" cvt.s64.s32 %rd73, %r13;\n" -" mul.wide.s32 %rd74, %r13, 4;\n" -" add.u64 %rd75, %rd74, %rd69;\n" -" mov.f32 %f126, %f14;\n" -" st.global.f32 [%rd75+0], %f126;\n" -" add.u64 %rd76, %rd74, %rd75;\n" -" mov.f32 %f127, %f16;\n" -" st.global.f32 [%rd76+0], %f127;\n" -" add.u64 %rd77, %rd74, %rd76;\n" -" mov.f32 %f128, %f18;\n" -" st.global.f32 [%rd77+0], %f128;\n" -" add.u64 %rd69, %rd74, %rd77;\n" -" mov.f32 %f129, %f20;\n" -" st.global.f32 [%rd69+0], %f129;\n" -" mov.f32 %f130, %f22;\n" -" add.u64 %rd78, %rd74, %rd69;\n" -" st.global.f32 [%rd78+0], %f130;\n" -"$Lt_1_35074:\n" -" ld.param.u64 %rd79, [__cudaparm_kernel_pair_fast_ans];\n" -" mul.lo.u64 %rd80, %rd22, 16;\n" -" add.u64 %rd81, %rd79, %rd80;\n" -" mov.f32 %f131, %f132;\n" -" st.global.v4.f32 [%rd81+0], {%f39,%f38,%f37,%f131};\n" -"$Lt_1_34050:\n" -"$Lt_1_24834:\n" -" .loc 16 235 0\n" -" exit;\n" -"$LDWend_kernel_pair_fast:\n" -" }\n" -; diff --git a/lib/gpu/lj_expand.ptx b/lib/gpu/lj_expand.ptx deleted file mode 100644 index 919bd9187d..0000000000 --- a/lib/gpu/lj_expand.ptx +++ /dev/null @@ -1,912 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009ccd_00000000-9_lal_lj_expand.cpp3.i (/home/sjplimp/ccBI#.06ur5E) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009ccd_00000000-8_lal_lj_expand.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_lj_expand.cu" - .file 17 "/usr/local/cuda/include/common_functions.h" - .file 18 "/usr/local/cuda/include/math_functions.h" - .file 19 "/usr/local/cuda/include/math_constants.h" - .file 20 "/usr/local/cuda/include/device_functions.h" - .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - .global .texref pos_tex; - - .entry kernel_pair ( - .param .u64 __cudaparm_kernel_pair_x_, - .param .u64 __cudaparm_kernel_pair_lj1, - .param .u64 __cudaparm_kernel_pair_lj3, - .param .s32 __cudaparm_kernel_pair_lj_types, - .param .u64 __cudaparm_kernel_pair_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_dev_nbor, - .param .u64 __cudaparm_kernel_pair_dev_packed, - .param .u64 __cudaparm_kernel_pair_ans, - .param .u64 __cudaparm_kernel_pair___val_paramengv, - .param .s32 __cudaparm_kernel_pair_eflag, - .param .s32 __cudaparm_kernel_pair_vflag, - .param .s32 __cudaparm_kernel_pair_inum, - .param .s32 __cudaparm_kernel_pair_nbor_pitch, - .param .s32 __cudaparm_kernel_pair_t_per_atom) - { - .reg .u32 %r<72>; - .reg .u64 %rd<63>; - .reg .f32 %f<107>; - .reg .pred %p<19>; - .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16]; - .shared .align 4 .b8 __cuda___cuda_local_var_32603_55_non_const_red_acc108[3072]; - // __cuda_local_var_32543_10_non_const_f = 48 - // __cuda_local_var_32545_9_non_const_virial = 16 - .loc 16 31 0 -$LDWbegin_kernel_pair: - .loc 16 36 0 - ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; - ldu.global.f32 %f1, [%rd1+0]; - .loc 16 37 0 - ld.global.f32 %f2, [%rd1+4]; - .loc 16 38 0 - ld.global.f32 %f3, [%rd1+8]; - .loc 16 39 0 - ld.global.f32 %f4, [%rd1+12]; - st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4}; - .loc 16 46 0 - mov.f32 %f5, 0f00000000; // 0 - mov.f32 %f6, %f5; - mov.f32 %f7, 0f00000000; // 0 - mov.f32 %f8, %f7; - mov.f32 %f9, 0f00000000; // 0 - mov.f32 %f10, %f9; - mov.f32 %f11, 0f00000000; // 0 - mov.f32 %f12, %f11; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, %f13; - mov.f32 %f15, 0f00000000; // 0 - mov.f32 %f16, %f15; - ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_pair_inum]; - setp.ge.s32 %p1, %r8, %r9; - @%p1 bra $Lt_0_26370; - .loc 16 51 0 - ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch]; - cvt.s64.s32 %rd2, %r10; - mul.wide.s32 %rd3, %r10, 4; - cvt.s64.s32 %rd4, %r8; - mul.wide.s32 %rd5, %r8, 4; - ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor]; - add.u64 %rd7, %rd5, %rd6; - add.u64 %rd8, %rd3, %rd7; - ld.global.s32 %r11, [%rd8+0]; - sub.s32 %r12, %r1, 1; - and.b32 %r13, %r12, %r2; - cvt.s64.s32 %rd9, %r13; - mul.wide.s32 %rd10, %r13, 4; - ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed]; - setp.ne.u64 %p2, %rd11, %rd6; - @%p2 bra $Lt_0_19458; - cvt.s32.s64 %r14, %rd2; - mul.lo.s32 %r15, %r14, %r1; - mov.s32 %r16, %r15; - mul.lo.s32 %r17, %r12, %r8; - add.s32 %r18, %r14, %r17; - cvt.s64.s32 %rd12, %r18; - mul.wide.s32 %rd13, %r18, 4; - add.u64 %rd14, %rd8, %rd13; - and.b32 %r19, %r12, %r11; - cvt.s64.s32 %rd15, %r19; - div.s32 %r20, %r11, %r1; - mul.lo.s32 %r21, %r15, %r20; - cvt.s64.s32 %rd16, %r21; - add.u64 %rd17, %rd15, %rd16; - mul.lo.u64 %rd18, %rd17, 4; - add.u64 %rd19, %rd14, %rd18; - add.u64 %rd20, %rd10, %rd14; - bra.uni $Lt_0_19202; -$Lt_0_19458: - add.u64 %rd21, %rd3, %rd8; - ld.global.s32 %r22, [%rd21+0]; - cvt.s64.s32 %rd22, %r22; - mul.wide.s32 %rd23, %r22, 4; - add.u64 %rd24, %rd11, %rd23; - cvt.s64.s32 %rd25, %r11; - mul.wide.s32 %rd26, %r11, 4; - add.u64 %rd19, %rd24, %rd26; - mov.s32 %r16, %r1; - add.u64 %rd20, %rd10, %rd24; -$Lt_0_19202: - .loc 16 54 0 - ld.global.s32 %r23, [%rd7+0]; - mov.u32 %r24, %r23; - mov.s32 %r25, 0; - mov.u32 %r26, %r25; - mov.s32 %r27, 0; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}]; - mov.f32 %f21, %f17; - mov.f32 %f22, %f18; - mov.f32 %f23, %f19; - mov.f32 %f24, %f20; - setp.ge.u64 %p3, %rd20, %rd19; - @%p3 bra $Lt_0_27906; - cvt.rzi.ftz.s32.f32 %r31, %f24; - cvt.s64.s32 %rd27, %r16; - ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types]; - mul.lo.s32 %r33, %r32, %r31; - ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1]; - mov.f32 %f25, 0f00000000; // 0 - mov.f32 %f26, 0f00000000; // 0 - mov.f32 %f27, 0f00000000; // 0 - mov.f32 %f28, 0f00000000; // 0 - mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92; -$Lt_0_20226: - // Loop body line 54, nesting depth: 1, estimated iterations: unknown - .loc 16 60 0 - ld.global.s32 %r34, [%rd20+0]; - .loc 16 61 0 - shr.s32 %r35, %r34, 30; - and.b32 %r36, %r35, 3; - cvt.s64.s32 %rd30, %r36; - mul.wide.s32 %rd31, %r36, 4; - add.u64 %rd32, %rd29, %rd31; - ld.shared.f32 %f29, [%rd32+0]; - .loc 16 64 0 - and.b32 %r37, %r34, 1073741823; - mov.u32 %r38, %r37; - mov.s32 %r39, 0; - mov.u32 %r40, %r39; - mov.s32 %r41, 0; - mov.u32 %r42, %r41; - mov.s32 %r43, 0; - mov.u32 %r44, %r43; - tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}]; - mov.f32 %f34, %f30; - mov.f32 %f35, %f31; - mov.f32 %f36, %f32; - mov.f32 %f37, %f33; - cvt.rzi.ftz.s32.f32 %r45, %f37; - sub.ftz.f32 %f38, %f22, %f35; - sub.ftz.f32 %f39, %f21, %f34; - sub.ftz.f32 %f40, %f23, %f36; - mul.ftz.f32 %f41, %f38, %f38; - fma.rn.ftz.f32 %f42, %f39, %f39, %f41; - fma.rn.ftz.f32 %f43, %f40, %f40, %f42; - add.s32 %r46, %r45, %r33; - cvt.s64.s32 %rd33, %r46; - mul.wide.s32 %rd34, %r46, 16; - add.u64 %rd35, %rd34, %rd28; - ld.global.f32 %f44, [%rd35+8]; - setp.gt.ftz.f32 %p4, %f44, %f43; - @!%p4 bra $Lt_0_21506; - .loc 16 76 0 - sqrt.approx.ftz.f32 %f45, %f43; - ld.global.v4.f32 {%f46,%f47,_,%f48}, [%rd35+0]; - sub.ftz.f32 %f49, %f45, %f48; - .loc 16 81 0 - mul.ftz.f32 %f50, %f49, %f49; - rcp.approx.ftz.f32 %f51, %f50; - mul.ftz.f32 %f52, %f51, %f51; - mul.ftz.f32 %f53, %f51, %f52; - div.approx.ftz.f32 %f54, %f29, %f49; - div.approx.ftz.f32 %f55, %f54, %f45; - mul.ftz.f32 %f56, %f46, %f53; - sub.ftz.f32 %f57, %f56, %f47; - mul.ftz.f32 %f58, %f53, %f57; - mul.ftz.f32 %f59, %f55, %f58; - .loc 16 83 0 - fma.rn.ftz.f32 %f27, %f39, %f59, %f27; - .loc 16 84 0 - fma.rn.ftz.f32 %f26, %f38, %f59, %f26; - .loc 16 85 0 - fma.rn.ftz.f32 %f25, %f40, %f59, %f25; - ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r48, 0; - setp.le.s32 %p5, %r47, %r48; - @%p5 bra $Lt_0_20994; - .loc 16 89 0 - ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3]; - add.u64 %rd37, %rd36, %rd34; - ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd37+0]; - mul.ftz.f32 %f63, %f60, %f53; - sub.ftz.f32 %f64, %f63, %f61; - mul.ftz.f32 %f65, %f53, %f64; - sub.ftz.f32 %f66, %f65, %f62; - fma.rn.ftz.f32 %f28, %f29, %f66, %f28; -$Lt_0_20994: - ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r50, 0; - setp.le.s32 %p6, %r49, %r50; - @%p6 bra $Lt_0_21506; - .loc 16 92 0 - mov.f32 %f67, %f6; - mul.ftz.f32 %f68, %f39, %f39; - fma.rn.ftz.f32 %f69, %f59, %f68, %f67; - mov.f32 %f6, %f69; - .loc 16 93 0 - mov.f32 %f70, %f8; - fma.rn.ftz.f32 %f71, %f59, %f41, %f70; - mov.f32 %f8, %f71; - .loc 16 94 0 - mov.f32 %f72, %f10; - mul.ftz.f32 %f73, %f40, %f40; - fma.rn.ftz.f32 %f74, %f59, %f73, %f72; - mov.f32 %f10, %f74; - .loc 16 95 0 - mov.f32 %f75, %f12; - mul.ftz.f32 %f76, %f38, %f39; - fma.rn.ftz.f32 %f77, %f59, %f76, %f75; - mov.f32 %f12, %f77; - .loc 16 96 0 - mov.f32 %f78, %f14; - mul.ftz.f32 %f79, %f39, %f40; - fma.rn.ftz.f32 %f80, %f59, %f79, %f78; - mov.f32 %f14, %f80; - .loc 16 97 0 - mul.ftz.f32 %f81, %f38, %f40; - fma.rn.ftz.f32 %f15, %f59, %f81, %f15; - mov.f32 %f16, %f15; -$Lt_0_21506: -$Lt_0_20482: - .loc 16 58 0 - mul.lo.u64 %rd38, %rd27, 4; - add.u64 %rd20, %rd20, %rd38; - setp.lt.u64 %p7, %rd20, %rd19; - @%p7 bra $Lt_0_20226; - bra.uni $Lt_0_19714; -$Lt_0_27906: - mov.f32 %f25, 0f00000000; // 0 - mov.f32 %f26, 0f00000000; // 0 - mov.f32 %f27, 0f00000000; // 0 - mov.f32 %f28, 0f00000000; // 0 -$Lt_0_19714: - mov.u32 %r51, 1; - setp.le.s32 %p8, %r1, %r51; - @%p8 bra $Lt_0_24322; - .loc 16 102 0 - mov.u64 %rd39, __cuda___cuda_local_var_32603_55_non_const_red_acc108; - cvt.s64.s32 %rd40, %r2; - mul.wide.s32 %rd41, %r2, 4; - add.u64 %rd42, %rd39, %rd41; - mov.f32 %f82, %f27; - st.shared.f32 [%rd42+0], %f82; - mov.f32 %f83, %f26; - st.shared.f32 [%rd42+512], %f83; - mov.f32 %f84, %f25; - st.shared.f32 [%rd42+1024], %f84; - mov.f32 %f85, %f28; - st.shared.f32 [%rd42+1536], %f85; - shr.s32 %r52, %r1, 31; - mov.s32 %r53, 1; - and.b32 %r54, %r52, %r53; - add.s32 %r55, %r54, %r1; - shr.s32 %r56, %r55, 1; - mov.s32 %r57, %r56; - mov.u32 %r58, 0; - setp.ne.u32 %p9, %r56, %r58; - @!%p9 bra $Lt_0_22786; -$Lt_0_23298: - setp.ge.u32 %p10, %r13, %r57; - @%p10 bra $Lt_0_23554; - add.u32 %r59, %r2, %r57; - cvt.u64.u32 %rd43, %r59; - mul.wide.u32 %rd44, %r59, 4; - add.u64 %rd45, %rd39, %rd44; - ld.shared.f32 %f86, [%rd45+0]; - add.ftz.f32 %f82, %f86, %f82; - st.shared.f32 [%rd42+0], %f82; - ld.shared.f32 %f87, [%rd45+512]; - add.ftz.f32 %f83, %f87, %f83; - st.shared.f32 [%rd42+512], %f83; - ld.shared.f32 %f88, [%rd45+1024]; - add.ftz.f32 %f84, %f88, %f84; - st.shared.f32 [%rd42+1024], %f84; - ld.shared.f32 %f89, [%rd45+1536]; - add.ftz.f32 %f85, %f89, %f85; - st.shared.f32 [%rd42+1536], %f85; -$Lt_0_23554: - shr.u32 %r57, %r57, 1; - mov.u32 %r60, 0; - setp.ne.u32 %p11, %r57, %r60; - @%p11 bra $Lt_0_23298; -$Lt_0_22786: - mov.f32 %f27, %f82; - mov.f32 %f26, %f83; - mov.f32 %f25, %f84; - mov.f32 %f28, %f85; - ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r62, 0; - setp.le.s32 %p12, %r61, %r62; - @%p12 bra $Lt_0_24322; - mov.f32 %f82, %f6; - st.shared.f32 [%rd42+0], %f82; - mov.f32 %f83, %f8; - st.shared.f32 [%rd42+512], %f83; - mov.f32 %f84, %f10; - st.shared.f32 [%rd42+1024], %f84; - mov.f32 %f85, %f12; - st.shared.f32 [%rd42+1536], %f85; - mov.f32 %f90, %f14; - st.shared.f32 [%rd42+2048], %f90; - mov.f32 %f91, %f15; - st.shared.f32 [%rd42+2560], %f91; - mov.s32 %r63, %r56; - @!%p9 bra $Lt_0_24834; -$Lt_0_25346: - setp.ge.u32 %p13, %r13, %r63; - @%p13 bra $Lt_0_25602; - add.u32 %r64, %r2, %r63; - cvt.u64.u32 %rd46, %r64; - mul.wide.u32 %rd47, %r64, 4; - add.u64 %rd48, %rd39, %rd47; - ld.shared.f32 %f92, [%rd48+0]; - add.ftz.f32 %f82, %f92, %f82; - st.shared.f32 [%rd42+0], %f82; - ld.shared.f32 %f93, [%rd48+512]; - add.ftz.f32 %f83, %f93, %f83; - st.shared.f32 [%rd42+512], %f83; - ld.shared.f32 %f94, [%rd48+1024]; - add.ftz.f32 %f84, %f94, %f84; - st.shared.f32 [%rd42+1024], %f84; - ld.shared.f32 %f95, [%rd48+1536]; - add.ftz.f32 %f85, %f95, %f85; - st.shared.f32 [%rd42+1536], %f85; - ld.shared.f32 %f96, [%rd48+2048]; - add.ftz.f32 %f90, %f96, %f90; - st.shared.f32 [%rd42+2048], %f90; - ld.shared.f32 %f97, [%rd48+2560]; - add.ftz.f32 %f91, %f97, %f91; - st.shared.f32 [%rd42+2560], %f91; -$Lt_0_25602: - shr.u32 %r63, %r63, 1; - mov.u32 %r65, 0; - setp.ne.u32 %p14, %r63, %r65; - @%p14 bra $Lt_0_25346; -$Lt_0_24834: - mov.f32 %f6, %f82; - mov.f32 %f8, %f83; - mov.f32 %f10, %f84; - mov.f32 %f12, %f85; - mov.f32 %f14, %f90; - mov.f32 %f16, %f91; -$Lt_0_24322: -$Lt_0_22274: - mov.u32 %r66, 0; - setp.ne.s32 %p15, %r13, %r66; - @%p15 bra $Lt_0_26370; - ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv]; - add.u64 %rd50, %rd49, %rd5; - ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r68, 0; - setp.le.s32 %p16, %r67, %r68; - @%p16 bra $Lt_0_26882; - st.global.f32 [%rd50+0], %f28; - cvt.s64.s32 %rd51, %r9; - mul.wide.s32 %rd52, %r9, 4; - add.u64 %rd50, %rd50, %rd52; -$Lt_0_26882: - ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r70, 0; - setp.le.s32 %p17, %r69, %r70; - @%p17 bra $Lt_0_27394; - mov.f32 %f98, %f6; - st.global.f32 [%rd50+0], %f98; - cvt.s64.s32 %rd53, %r9; - mul.wide.s32 %rd54, %r9, 4; - add.u64 %rd55, %rd54, %rd50; - mov.f32 %f99, %f8; - st.global.f32 [%rd55+0], %f99; - add.u64 %rd56, %rd54, %rd55; - mov.f32 %f100, %f10; - st.global.f32 [%rd56+0], %f100; - add.u64 %rd57, %rd54, %rd56; - mov.f32 %f101, %f12; - st.global.f32 [%rd57+0], %f101; - add.u64 %rd50, %rd54, %rd57; - mov.f32 %f102, %f14; - st.global.f32 [%rd50+0], %f102; - mov.f32 %f103, %f16; - add.u64 %rd58, %rd54, %rd50; - st.global.f32 [%rd58+0], %f103; -$Lt_0_27394: - ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans]; - mul.lo.u64 %rd60, %rd4, 16; - add.u64 %rd61, %rd59, %rd60; - mov.f32 %f104, %f105; - st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f104}; -$Lt_0_26370: -$Lt_0_18690: - .loc 16 105 0 - exit; -$LDWend_kernel_pair: - } // kernel_pair - - .entry kernel_pair_fast ( - .param .u64 __cudaparm_kernel_pair_fast_x_, - .param .u64 __cudaparm_kernel_pair_fast_lj1_in, - .param .u64 __cudaparm_kernel_pair_fast_lj3_in, - .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, - .param .u64 __cudaparm_kernel_pair_fast_dev_packed, - .param .u64 __cudaparm_kernel_pair_fast_ans, - .param .u64 __cudaparm_kernel_pair_fast___val_paramengv, - .param .s32 __cudaparm_kernel_pair_fast_eflag, - .param .s32 __cudaparm_kernel_pair_fast_vflag, - .param .s32 __cudaparm_kernel_pair_fast_inum, - .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, - .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) - { - .reg .u32 %r<74>; - .reg .u64 %rd<75>; - .reg .f32 %f<114>; - .reg .pred %p<22>; - .shared .align 4 .b8 __cuda___cuda_local_var_32620_33_non_const_sp_lj3268[16]; - .shared .align 16 .b8 __cuda___cuda_local_var_32618_34_non_const_lj13296[1936]; - .shared .align 16 .b8 __cuda___cuda_local_var_32619_34_non_const_lj35232[1936]; - .shared .align 4 .b8 __cuda___cuda_local_var_32692_55_non_const_red_acc7168[3072]; - // __cuda_local_var_32630_10_non_const_f = 48 - // __cuda_local_var_32632_9_non_const_virial = 16 - .loc 16 113 0 -$LDWbegin_kernel_pair_fast: - cvt.s32.u32 %r1, %tid.x; - mov.u32 %r2, 3; - setp.gt.s32 %p1, %r1, %r2; - @%p1 bra $Lt_1_20994; - .loc 16 121 0 - mov.u64 %rd1, __cuda___cuda_local_var_32620_33_non_const_sp_lj3268; - cvt.s64.s32 %rd2, %r1; - mul.wide.s32 %rd3, %r1, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f32 %f1, [%rd5+0]; - add.u64 %rd6, %rd3, %rd1; - st.shared.f32 [%rd6+0], %f1; -$Lt_1_20994: - mov.u64 %rd1, __cuda___cuda_local_var_32620_33_non_const_sp_lj3268; - mov.u32 %r3, 120; - setp.gt.s32 %p2, %r1, %r3; - @%p2 bra $Lt_1_21506; - .loc 16 123 0 - mov.u64 %rd7, __cuda___cuda_local_var_32618_34_non_const_lj13296; - cvt.s64.s32 %rd8, %r1; - mul.wide.s32 %rd9, %r1, 16; - ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in]; - add.u64 %rd11, %rd10, %rd9; - add.u64 %rd12, %rd9, %rd7; - ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0]; - st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5}; - ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r5, 0; - setp.le.s32 %p3, %r4, %r5; - @%p3 bra $Lt_1_22018; - .loc 16 125 0 - mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232; - ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in]; - add.u64 %rd15, %rd14, %rd9; - add.u64 %rd16, %rd9, %rd13; - ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0]; - st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9}; -$Lt_1_22018: - mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232; -$Lt_1_21506: - mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232; - mov.u64 %rd7, __cuda___cuda_local_var_32618_34_non_const_lj13296; - .loc 16 133 0 - mov.f32 %f10, 0f00000000; // 0 - mov.f32 %f11, %f10; - mov.f32 %f12, 0f00000000; // 0 - mov.f32 %f13, %f12; - mov.f32 %f14, 0f00000000; // 0 - mov.f32 %f15, %f14; - mov.f32 %f16, 0f00000000; // 0 - mov.f32 %f17, %f16; - mov.f32 %f18, 0f00000000; // 0 - mov.f32 %f19, %f18; - mov.f32 %f20, 0f00000000; // 0 - mov.f32 %f21, %f20; - .loc 16 135 0 - bar.sync 0; - ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom]; - div.s32 %r7, %r1, %r6; - cvt.s32.u32 %r8, %ntid.x; - div.s32 %r9, %r8, %r6; - cvt.s32.u32 %r10, %ctaid.x; - mul.lo.s32 %r11, %r10, %r9; - add.s32 %r12, %r7, %r11; - ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum]; - setp.ge.s32 %p4, %r12, %r13; - @%p4 bra $Lt_1_30210; - .loc 16 140 0 - ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch]; - cvt.s64.s32 %rd17, %r14; - mul.wide.s32 %rd18, %r14, 4; - cvt.s64.s32 %rd19, %r12; - mul.wide.s32 %rd20, %r12, 4; - ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor]; - add.u64 %rd22, %rd20, %rd21; - add.u64 %rd23, %rd18, %rd22; - ld.global.s32 %r15, [%rd23+0]; - sub.s32 %r16, %r6, 1; - and.b32 %r17, %r16, %r1; - cvt.s64.s32 %rd24, %r17; - mul.wide.s32 %rd25, %r17, 4; - ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed]; - setp.ne.u64 %p5, %rd26, %rd21; - @%p5 bra $Lt_1_23298; - cvt.s32.s64 %r18, %rd17; - mul.lo.s32 %r19, %r18, %r6; - mov.s32 %r20, %r19; - mul.lo.s32 %r21, %r16, %r12; - add.s32 %r22, %r18, %r21; - cvt.s64.s32 %rd27, %r22; - mul.wide.s32 %rd28, %r22, 4; - add.u64 %rd29, %rd23, %rd28; - and.b32 %r23, %r16, %r15; - cvt.s64.s32 %rd30, %r23; - div.s32 %r24, %r15, %r6; - mul.lo.s32 %r25, %r19, %r24; - cvt.s64.s32 %rd31, %r25; - add.u64 %rd32, %rd30, %rd31; - mul.lo.u64 %rd33, %rd32, 4; - add.u64 %rd34, %rd29, %rd33; - add.u64 %rd35, %rd25, %rd29; - bra.uni $Lt_1_23042; -$Lt_1_23298: - add.u64 %rd36, %rd18, %rd23; - ld.global.s32 %r26, [%rd36+0]; - cvt.s64.s32 %rd37, %r26; - mul.wide.s32 %rd38, %r26, 4; - add.u64 %rd39, %rd26, %rd38; - cvt.s64.s32 %rd40, %r15; - mul.wide.s32 %rd41, %r15, 4; - add.u64 %rd34, %rd39, %rd41; - mov.s32 %r20, %r6; - add.u64 %rd35, %rd25, %rd39; -$Lt_1_23042: - .loc 16 143 0 - ld.global.s32 %r27, [%rd22+0]; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - mov.s32 %r31, 0; - mov.u32 %r32, %r31; - mov.s32 %r33, 0; - mov.u32 %r34, %r33; - tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}]; - mov.f32 %f26, %f22; - mov.f32 %f27, %f23; - mov.f32 %f28, %f24; - mov.f32 %f29, %f25; - setp.ge.u64 %p6, %rd35, %rd34; - @%p6 bra $Lt_1_31746; - cvt.rzi.ftz.s32.f32 %r35, %f29; - cvt.s64.s32 %rd42, %r20; - mul.lo.s32 %r36, %r35, 11; - cvt.rn.f32.s32 %f30, %r36; - mov.f32 %f31, 0f00000000; // 0 - mov.f32 %f32, 0f00000000; // 0 - mov.f32 %f33, 0f00000000; // 0 - mov.f32 %f34, 0f00000000; // 0 -$Lt_1_24066: - // Loop body line 143, nesting depth: 1, estimated iterations: unknown - .loc 16 150 0 - ld.global.s32 %r37, [%rd35+0]; - .loc 16 151 0 - shr.s32 %r38, %r37, 30; - and.b32 %r39, %r38, 3; - cvt.s64.s32 %rd43, %r39; - mul.wide.s32 %rd44, %r39, 4; - add.u64 %rd45, %rd1, %rd44; - ld.shared.f32 %f35, [%rd45+0]; - .loc 16 154 0 - and.b32 %r40, %r37, 1073741823; - mov.u32 %r41, %r40; - mov.s32 %r42, 0; - mov.u32 %r43, %r42; - mov.s32 %r44, 0; - mov.u32 %r45, %r44; - mov.s32 %r46, 0; - mov.u32 %r47, %r46; - tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}]; - mov.f32 %f40, %f36; - mov.f32 %f41, %f37; - mov.f32 %f42, %f38; - mov.f32 %f43, %f39; - sub.ftz.f32 %f44, %f27, %f41; - sub.ftz.f32 %f45, %f26, %f40; - sub.ftz.f32 %f46, %f28, %f42; - mul.ftz.f32 %f47, %f44, %f44; - fma.rn.ftz.f32 %f48, %f45, %f45, %f47; - fma.rn.ftz.f32 %f49, %f46, %f46, %f48; - add.ftz.f32 %f50, %f30, %f43; - cvt.rzi.ftz.s32.f32 %r48, %f50; - cvt.s64.s32 %rd46, %r48; - mul.wide.s32 %rd47, %r48, 16; - add.u64 %rd48, %rd47, %rd7; - ld.shared.f32 %f51, [%rd48+8]; - setp.gt.ftz.f32 %p7, %f51, %f49; - @!%p7 bra $Lt_1_25346; - .loc 16 165 0 - sqrt.approx.ftz.f32 %f52, %f49; - ld.shared.v4.f32 {%f53,%f54,_,%f55}, [%rd48+0]; - sub.ftz.f32 %f56, %f52, %f55; - .loc 16 169 0 - mul.ftz.f32 %f57, %f56, %f56; - rcp.approx.ftz.f32 %f58, %f57; - mul.ftz.f32 %f59, %f58, %f58; - mul.ftz.f32 %f60, %f58, %f59; - mul.ftz.f32 %f61, %f53, %f60; - sub.ftz.f32 %f62, %f61, %f54; - mul.ftz.f32 %f63, %f60, %f62; - .loc 16 170 0 - div.approx.ftz.f32 %f64, %f35, %f56; - div.approx.ftz.f32 %f65, %f64, %f52; - mul.ftz.f32 %f66, %f63, %f65; - .loc 16 172 0 - fma.rn.ftz.f32 %f33, %f45, %f66, %f33; - .loc 16 173 0 - fma.rn.ftz.f32 %f32, %f44, %f66, %f32; - .loc 16 174 0 - fma.rn.ftz.f32 %f31, %f46, %f66, %f31; - ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r50, 0; - setp.le.s32 %p8, %r49, %r50; - @%p8 bra $Lt_1_24834; - .loc 16 177 0 - add.u64 %rd49, %rd47, %rd13; - ld.shared.v4.f32 {%f67,%f68,%f69,_}, [%rd49+0]; - mul.ftz.f32 %f70, %f67, %f60; - sub.ftz.f32 %f71, %f70, %f68; - mul.ftz.f32 %f72, %f60, %f71; - .loc 16 178 0 - sub.ftz.f32 %f73, %f72, %f69; - fma.rn.ftz.f32 %f34, %f35, %f73, %f34; -$Lt_1_24834: - ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r52, 0; - setp.le.s32 %p9, %r51, %r52; - @%p9 bra $Lt_1_25346; - .loc 16 181 0 - mov.f32 %f74, %f11; - mul.ftz.f32 %f75, %f45, %f45; - fma.rn.ftz.f32 %f76, %f66, %f75, %f74; - mov.f32 %f11, %f76; - .loc 16 182 0 - mov.f32 %f77, %f13; - fma.rn.ftz.f32 %f78, %f66, %f47, %f77; - mov.f32 %f13, %f78; - .loc 16 183 0 - mov.f32 %f79, %f15; - mul.ftz.f32 %f80, %f46, %f46; - fma.rn.ftz.f32 %f81, %f66, %f80, %f79; - mov.f32 %f15, %f81; - .loc 16 184 0 - mov.f32 %f82, %f17; - mul.ftz.f32 %f83, %f44, %f45; - fma.rn.ftz.f32 %f84, %f66, %f83, %f82; - mov.f32 %f17, %f84; - .loc 16 185 0 - mov.f32 %f85, %f19; - mul.ftz.f32 %f86, %f45, %f46; - fma.rn.ftz.f32 %f87, %f66, %f86, %f85; - mov.f32 %f19, %f87; - .loc 16 186 0 - mul.ftz.f32 %f88, %f44, %f46; - fma.rn.ftz.f32 %f20, %f66, %f88, %f20; - mov.f32 %f21, %f20; -$Lt_1_25346: -$Lt_1_24322: - .loc 16 148 0 - mul.lo.u64 %rd50, %rd42, 4; - add.u64 %rd35, %rd35, %rd50; - setp.lt.u64 %p10, %rd35, %rd34; - @%p10 bra $Lt_1_24066; - bra.uni $Lt_1_23554; -$Lt_1_31746: - mov.f32 %f31, 0f00000000; // 0 - mov.f32 %f32, 0f00000000; // 0 - mov.f32 %f33, 0f00000000; // 0 - mov.f32 %f34, 0f00000000; // 0 -$Lt_1_23554: - mov.u32 %r53, 1; - setp.le.s32 %p11, %r6, %r53; - @%p11 bra $Lt_1_28162; - .loc 16 191 0 - mov.u64 %rd51, __cuda___cuda_local_var_32692_55_non_const_red_acc7168; - cvt.s64.s32 %rd52, %r1; - mul.wide.s32 %rd53, %r1, 4; - add.u64 %rd54, %rd51, %rd53; - mov.f32 %f89, %f33; - st.shared.f32 [%rd54+0], %f89; - mov.f32 %f90, %f32; - st.shared.f32 [%rd54+512], %f90; - mov.f32 %f91, %f31; - st.shared.f32 [%rd54+1024], %f91; - mov.f32 %f92, %f34; - st.shared.f32 [%rd54+1536], %f92; - shr.s32 %r54, %r6, 31; - mov.s32 %r55, 1; - and.b32 %r56, %r54, %r55; - add.s32 %r57, %r56, %r6; - shr.s32 %r58, %r57, 1; - mov.s32 %r59, %r58; - mov.u32 %r60, 0; - setp.ne.u32 %p12, %r58, %r60; - @!%p12 bra $Lt_1_26626; -$Lt_1_27138: - setp.ge.u32 %p13, %r17, %r59; - @%p13 bra $Lt_1_27394; - add.u32 %r61, %r1, %r59; - cvt.u64.u32 %rd55, %r61; - mul.wide.u32 %rd56, %r61, 4; - add.u64 %rd57, %rd51, %rd56; - ld.shared.f32 %f93, [%rd57+0]; - add.ftz.f32 %f89, %f93, %f89; - st.shared.f32 [%rd54+0], %f89; - ld.shared.f32 %f94, [%rd57+512]; - add.ftz.f32 %f90, %f94, %f90; - st.shared.f32 [%rd54+512], %f90; - ld.shared.f32 %f95, [%rd57+1024]; - add.ftz.f32 %f91, %f95, %f91; - st.shared.f32 [%rd54+1024], %f91; - ld.shared.f32 %f96, [%rd57+1536]; - add.ftz.f32 %f92, %f96, %f92; - st.shared.f32 [%rd54+1536], %f92; -$Lt_1_27394: - shr.u32 %r59, %r59, 1; - mov.u32 %r62, 0; - setp.ne.u32 %p14, %r59, %r62; - @%p14 bra $Lt_1_27138; -$Lt_1_26626: - mov.f32 %f33, %f89; - mov.f32 %f32, %f90; - mov.f32 %f31, %f91; - mov.f32 %f34, %f92; - ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r64, 0; - setp.le.s32 %p15, %r63, %r64; - @%p15 bra $Lt_1_28162; - mov.f32 %f89, %f11; - st.shared.f32 [%rd54+0], %f89; - mov.f32 %f90, %f13; - st.shared.f32 [%rd54+512], %f90; - mov.f32 %f91, %f15; - st.shared.f32 [%rd54+1024], %f91; - mov.f32 %f92, %f17; - st.shared.f32 [%rd54+1536], %f92; - mov.f32 %f97, %f19; - st.shared.f32 [%rd54+2048], %f97; - mov.f32 %f98, %f20; - st.shared.f32 [%rd54+2560], %f98; - mov.s32 %r65, %r58; - @!%p12 bra $Lt_1_28674; -$Lt_1_29186: - setp.ge.u32 %p16, %r17, %r65; - @%p16 bra $Lt_1_29442; - add.u32 %r66, %r1, %r65; - cvt.u64.u32 %rd58, %r66; - mul.wide.u32 %rd59, %r66, 4; - add.u64 %rd60, %rd51, %rd59; - ld.shared.f32 %f99, [%rd60+0]; - add.ftz.f32 %f89, %f99, %f89; - st.shared.f32 [%rd54+0], %f89; - ld.shared.f32 %f100, [%rd60+512]; - add.ftz.f32 %f90, %f100, %f90; - st.shared.f32 [%rd54+512], %f90; - ld.shared.f32 %f101, [%rd60+1024]; - add.ftz.f32 %f91, %f101, %f91; - st.shared.f32 [%rd54+1024], %f91; - ld.shared.f32 %f102, [%rd60+1536]; - add.ftz.f32 %f92, %f102, %f92; - st.shared.f32 [%rd54+1536], %f92; - ld.shared.f32 %f103, [%rd60+2048]; - add.ftz.f32 %f97, %f103, %f97; - st.shared.f32 [%rd54+2048], %f97; - ld.shared.f32 %f104, [%rd60+2560]; - add.ftz.f32 %f98, %f104, %f98; - st.shared.f32 [%rd54+2560], %f98; -$Lt_1_29442: - shr.u32 %r65, %r65, 1; - mov.u32 %r67, 0; - setp.ne.u32 %p17, %r65, %r67; - @%p17 bra $Lt_1_29186; -$Lt_1_28674: - mov.f32 %f11, %f89; - mov.f32 %f13, %f90; - mov.f32 %f15, %f91; - mov.f32 %f17, %f92; - mov.f32 %f19, %f97; - mov.f32 %f21, %f98; -$Lt_1_28162: -$Lt_1_26114: - mov.u32 %r68, 0; - setp.ne.s32 %p18, %r17, %r68; - @%p18 bra $Lt_1_30210; - ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv]; - add.u64 %rd62, %rd61, %rd20; - ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r70, 0; - setp.le.s32 %p19, %r69, %r70; - @%p19 bra $Lt_1_30722; - st.global.f32 [%rd62+0], %f34; - cvt.s64.s32 %rd63, %r13; - mul.wide.s32 %rd64, %r13, 4; - add.u64 %rd62, %rd62, %rd64; -$Lt_1_30722: - ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r72, 0; - setp.le.s32 %p20, %r71, %r72; - @%p20 bra $Lt_1_31234; - mov.f32 %f105, %f11; - st.global.f32 [%rd62+0], %f105; - cvt.s64.s32 %rd65, %r13; - mul.wide.s32 %rd66, %r13, 4; - add.u64 %rd67, %rd66, %rd62; - mov.f32 %f106, %f13; - st.global.f32 [%rd67+0], %f106; - add.u64 %rd68, %rd66, %rd67; - mov.f32 %f107, %f15; - st.global.f32 [%rd68+0], %f107; - add.u64 %rd69, %rd66, %rd68; - mov.f32 %f108, %f17; - st.global.f32 [%rd69+0], %f108; - add.u64 %rd62, %rd66, %rd69; - mov.f32 %f109, %f19; - st.global.f32 [%rd62+0], %f109; - mov.f32 %f110, %f21; - add.u64 %rd70, %rd66, %rd62; - st.global.f32 [%rd70+0], %f110; -$Lt_1_31234: - ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans]; - mul.lo.u64 %rd72, %rd19, 16; - add.u64 %rd73, %rd71, %rd72; - mov.f32 %f111, %f112; - st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f111}; -$Lt_1_30210: -$Lt_1_22530: - .loc 16 194 0 - exit; -$LDWend_kernel_pair_fast: - } // kernel_pair_fast - diff --git a/lib/gpu/lj_expand_ptx.h b/lib/gpu/lj_expand_ptx.h deleted file mode 100644 index fa9b5450df..0000000000 --- a/lib/gpu/lj_expand_ptx.h +++ /dev/null @@ -1,860 +0,0 @@ -const char * lj_expand = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .global .texref pos_tex;\n" -" .entry kernel_pair (\n" -" .param .u64 __cudaparm_kernel_pair_x_,\n" -" .param .u64 __cudaparm_kernel_pair_lj1,\n" -" .param .u64 __cudaparm_kernel_pair_lj3,\n" -" .param .s32 __cudaparm_kernel_pair_lj_types,\n" -" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_ans,\n" -" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_inum,\n" -" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" -" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" -" {\n" -" .reg .u32 %r<72>;\n" -" .reg .u64 %rd<63>;\n" -" .reg .f32 %f<107>;\n" -" .reg .pred %p<19>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32603_55_non_const_red_acc108[3072];\n" -" .loc 16 31 0\n" -"$LDWbegin_kernel_pair:\n" -" .loc 16 36 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" -" ldu.global.f32 %f1, [%rd1+0];\n" -" .loc 16 37 0\n" -" ld.global.f32 %f2, [%rd1+4];\n" -" .loc 16 38 0\n" -" ld.global.f32 %f3, [%rd1+8];\n" -" .loc 16 39 0\n" -" ld.global.f32 %f4, [%rd1+12];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n" -" .loc 16 46 0\n" -" mov.f32 %f5, 0f00000000; \n" -" mov.f32 %f6, %f5;\n" -" mov.f32 %f7, 0f00000000; \n" -" mov.f32 %f8, %f7;\n" -" mov.f32 %f9, 0f00000000; \n" -" mov.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f00000000; \n" -" mov.f32 %f12, %f11;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, %f13;\n" -" mov.f32 %f15, 0f00000000; \n" -" mov.f32 %f16, %f15;\n" -" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n" -" setp.ge.s32 %p1, %r8, %r9;\n" -" @%p1 bra $Lt_0_26370;\n" -" .loc 16 51 0\n" -" ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];\n" -" cvt.s64.s32 %rd2, %r10;\n" -" mul.wide.s32 %rd3, %r10, 4;\n" -" cvt.s64.s32 %rd4, %r8;\n" -" mul.wide.s32 %rd5, %r8, 4;\n" -" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n" -" add.u64 %rd7, %rd5, %rd6;\n" -" add.u64 %rd8, %rd3, %rd7;\n" -" ld.global.s32 %r11, [%rd8+0];\n" -" sub.s32 %r12, %r1, 1;\n" -" and.b32 %r13, %r12, %r2;\n" -" cvt.s64.s32 %rd9, %r13;\n" -" mul.wide.s32 %rd10, %r13, 4;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n" -" setp.ne.u64 %p2, %rd11, %rd6;\n" -" @%p2 bra $Lt_0_19458;\n" -" cvt.s32.s64 %r14, %rd2;\n" -" mul.lo.s32 %r15, %r14, %r1;\n" -" mov.s32 %r16, %r15;\n" -" mul.lo.s32 %r17, %r12, %r8;\n" -" add.s32 %r18, %r14, %r17;\n" -" cvt.s64.s32 %rd12, %r18;\n" -" mul.wide.s32 %rd13, %r18, 4;\n" -" add.u64 %rd14, %rd8, %rd13;\n" -" and.b32 %r19, %r12, %r11;\n" -" cvt.s64.s32 %rd15, %r19;\n" -" div.s32 %r20, %r11, %r1;\n" -" mul.lo.s32 %r21, %r15, %r20;\n" -" cvt.s64.s32 %rd16, %r21;\n" -" add.u64 %rd17, %rd15, %rd16;\n" -" mul.lo.u64 %rd18, %rd17, 4;\n" -" add.u64 %rd19, %rd14, %rd18;\n" -" add.u64 %rd20, %rd10, %rd14;\n" -" bra.uni $Lt_0_19202;\n" -"$Lt_0_19458:\n" -" add.u64 %rd21, %rd3, %rd8;\n" -" ld.global.s32 %r22, [%rd21+0];\n" -" cvt.s64.s32 %rd22, %r22;\n" -" mul.wide.s32 %rd23, %r22, 4;\n" -" add.u64 %rd24, %rd11, %rd23;\n" -" cvt.s64.s32 %rd25, %r11;\n" -" mul.wide.s32 %rd26, %r11, 4;\n" -" add.u64 %rd19, %rd24, %rd26;\n" -" mov.s32 %r16, %r1;\n" -" add.u64 %rd20, %rd10, %rd24;\n" -"$Lt_0_19202:\n" -" .loc 16 54 0\n" -" ld.global.s32 %r23, [%rd7+0];\n" -" mov.u32 %r24, %r23;\n" -" mov.s32 %r25, 0;\n" -" mov.u32 %r26, %r25;\n" -" mov.s32 %r27, 0;\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n" -" mov.f32 %f21, %f17;\n" -" mov.f32 %f22, %f18;\n" -" mov.f32 %f23, %f19;\n" -" mov.f32 %f24, %f20;\n" -" setp.ge.u64 %p3, %rd20, %rd19;\n" -" @%p3 bra $Lt_0_27906;\n" -" cvt.rzi.ftz.s32.f32 %r31, %f24;\n" -" cvt.s64.s32 %rd27, %r16;\n" -" ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];\n" -" mul.lo.s32 %r33, %r32, %r31;\n" -" ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];\n" -" mov.f32 %f25, 0f00000000; \n" -" mov.f32 %f26, 0f00000000; \n" -" mov.f32 %f27, 0f00000000; \n" -" mov.f32 %f28, 0f00000000; \n" -" mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;\n" -"$Lt_0_20226:\n" -" .loc 16 60 0\n" -" ld.global.s32 %r34, [%rd20+0];\n" -" .loc 16 61 0\n" -" shr.s32 %r35, %r34, 30;\n" -" and.b32 %r36, %r35, 3;\n" -" cvt.s64.s32 %rd30, %r36;\n" -" mul.wide.s32 %rd31, %r36, 4;\n" -" add.u64 %rd32, %rd29, %rd31;\n" -" ld.shared.f32 %f29, [%rd32+0];\n" -" .loc 16 64 0\n" -" and.b32 %r37, %r34, 1073741823;\n" -" mov.u32 %r38, %r37;\n" -" mov.s32 %r39, 0;\n" -" mov.u32 %r40, %r39;\n" -" mov.s32 %r41, 0;\n" -" mov.u32 %r42, %r41;\n" -" mov.s32 %r43, 0;\n" -" mov.u32 %r44, %r43;\n" -" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];\n" -" mov.f32 %f34, %f30;\n" -" mov.f32 %f35, %f31;\n" -" mov.f32 %f36, %f32;\n" -" mov.f32 %f37, %f33;\n" -" cvt.rzi.ftz.s32.f32 %r45, %f37;\n" -" sub.ftz.f32 %f38, %f22, %f35;\n" -" sub.ftz.f32 %f39, %f21, %f34;\n" -" sub.ftz.f32 %f40, %f23, %f36;\n" -" mul.ftz.f32 %f41, %f38, %f38;\n" -" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n" -" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n" -" add.s32 %r46, %r45, %r33;\n" -" cvt.s64.s32 %rd33, %r46;\n" -" mul.wide.s32 %rd34, %r46, 16;\n" -" add.u64 %rd35, %rd34, %rd28;\n" -" ld.global.f32 %f44, [%rd35+8];\n" -" setp.gt.ftz.f32 %p4, %f44, %f43;\n" -" @!%p4 bra $Lt_0_21506;\n" -" .loc 16 76 0\n" -" sqrt.approx.ftz.f32 %f45, %f43;\n" -" ld.global.v4.f32 {%f46,%f47,_,%f48}, [%rd35+0];\n" -" sub.ftz.f32 %f49, %f45, %f48;\n" -" .loc 16 81 0\n" -" mul.ftz.f32 %f50, %f49, %f49;\n" -" rcp.approx.ftz.f32 %f51, %f50;\n" -" mul.ftz.f32 %f52, %f51, %f51;\n" -" mul.ftz.f32 %f53, %f51, %f52;\n" -" div.approx.ftz.f32 %f54, %f29, %f49;\n" -" div.approx.ftz.f32 %f55, %f54, %f45;\n" -" mul.ftz.f32 %f56, %f46, %f53;\n" -" sub.ftz.f32 %f57, %f56, %f47;\n" -" mul.ftz.f32 %f58, %f53, %f57;\n" -" mul.ftz.f32 %f59, %f55, %f58;\n" -" .loc 16 83 0\n" -" fma.rn.ftz.f32 %f27, %f39, %f59, %f27;\n" -" .loc 16 84 0\n" -" fma.rn.ftz.f32 %f26, %f38, %f59, %f26;\n" -" .loc 16 85 0\n" -" fma.rn.ftz.f32 %f25, %f40, %f59, %f25;\n" -" ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r48, 0;\n" -" setp.le.s32 %p5, %r47, %r48;\n" -" @%p5 bra $Lt_0_20994;\n" -" .loc 16 89 0\n" -" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];\n" -" add.u64 %rd37, %rd36, %rd34;\n" -" ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd37+0];\n" -" mul.ftz.f32 %f63, %f60, %f53;\n" -" sub.ftz.f32 %f64, %f63, %f61;\n" -" mul.ftz.f32 %f65, %f53, %f64;\n" -" sub.ftz.f32 %f66, %f65, %f62;\n" -" fma.rn.ftz.f32 %f28, %f29, %f66, %f28;\n" -"$Lt_0_20994:\n" -" ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r50, 0;\n" -" setp.le.s32 %p6, %r49, %r50;\n" -" @%p6 bra $Lt_0_21506;\n" -" .loc 16 92 0\n" -" mov.f32 %f67, %f6;\n" -" mul.ftz.f32 %f68, %f39, %f39;\n" -" fma.rn.ftz.f32 %f69, %f59, %f68, %f67;\n" -" mov.f32 %f6, %f69;\n" -" .loc 16 93 0\n" -" mov.f32 %f70, %f8;\n" -" fma.rn.ftz.f32 %f71, %f59, %f41, %f70;\n" -" mov.f32 %f8, %f71;\n" -" .loc 16 94 0\n" -" mov.f32 %f72, %f10;\n" -" mul.ftz.f32 %f73, %f40, %f40;\n" -" fma.rn.ftz.f32 %f74, %f59, %f73, %f72;\n" -" mov.f32 %f10, %f74;\n" -" .loc 16 95 0\n" -" mov.f32 %f75, %f12;\n" -" mul.ftz.f32 %f76, %f38, %f39;\n" -" fma.rn.ftz.f32 %f77, %f59, %f76, %f75;\n" -" mov.f32 %f12, %f77;\n" -" .loc 16 96 0\n" -" mov.f32 %f78, %f14;\n" -" mul.ftz.f32 %f79, %f39, %f40;\n" -" fma.rn.ftz.f32 %f80, %f59, %f79, %f78;\n" -" mov.f32 %f14, %f80;\n" -" .loc 16 97 0\n" -" mul.ftz.f32 %f81, %f38, %f40;\n" -" fma.rn.ftz.f32 %f15, %f59, %f81, %f15;\n" -" mov.f32 %f16, %f15;\n" -"$Lt_0_21506:\n" -"$Lt_0_20482:\n" -" .loc 16 58 0\n" -" mul.lo.u64 %rd38, %rd27, 4;\n" -" add.u64 %rd20, %rd20, %rd38;\n" -" setp.lt.u64 %p7, %rd20, %rd19;\n" -" @%p7 bra $Lt_0_20226;\n" -" bra.uni $Lt_0_19714;\n" -"$Lt_0_27906:\n" -" mov.f32 %f25, 0f00000000; \n" -" mov.f32 %f26, 0f00000000; \n" -" mov.f32 %f27, 0f00000000; \n" -" mov.f32 %f28, 0f00000000; \n" -"$Lt_0_19714:\n" -" mov.u32 %r51, 1;\n" -" setp.le.s32 %p8, %r1, %r51;\n" -" @%p8 bra $Lt_0_24322;\n" -" .loc 16 102 0\n" -" mov.u64 %rd39, __cuda___cuda_local_var_32603_55_non_const_red_acc108;\n" -" cvt.s64.s32 %rd40, %r2;\n" -" mul.wide.s32 %rd41, %r2, 4;\n" -" add.u64 %rd42, %rd39, %rd41;\n" -" mov.f32 %f82, %f27;\n" -" st.shared.f32 [%rd42+0], %f82;\n" -" mov.f32 %f83, %f26;\n" -" st.shared.f32 [%rd42+512], %f83;\n" -" mov.f32 %f84, %f25;\n" -" st.shared.f32 [%rd42+1024], %f84;\n" -" mov.f32 %f85, %f28;\n" -" st.shared.f32 [%rd42+1536], %f85;\n" -" shr.s32 %r52, %r1, 31;\n" -" mov.s32 %r53, 1;\n" -" and.b32 %r54, %r52, %r53;\n" -" add.s32 %r55, %r54, %r1;\n" -" shr.s32 %r56, %r55, 1;\n" -" mov.s32 %r57, %r56;\n" -" mov.u32 %r58, 0;\n" -" setp.ne.u32 %p9, %r56, %r58;\n" -" @!%p9 bra $Lt_0_22786;\n" -"$Lt_0_23298:\n" -" setp.ge.u32 %p10, %r13, %r57;\n" -" @%p10 bra $Lt_0_23554;\n" -" add.u32 %r59, %r2, %r57;\n" -" cvt.u64.u32 %rd43, %r59;\n" -" mul.wide.u32 %rd44, %r59, 4;\n" -" add.u64 %rd45, %rd39, %rd44;\n" -" ld.shared.f32 %f86, [%rd45+0];\n" -" add.ftz.f32 %f82, %f86, %f82;\n" -" st.shared.f32 [%rd42+0], %f82;\n" -" ld.shared.f32 %f87, [%rd45+512];\n" -" add.ftz.f32 %f83, %f87, %f83;\n" -" st.shared.f32 [%rd42+512], %f83;\n" -" ld.shared.f32 %f88, [%rd45+1024];\n" -" add.ftz.f32 %f84, %f88, %f84;\n" -" st.shared.f32 [%rd42+1024], %f84;\n" -" ld.shared.f32 %f89, [%rd45+1536];\n" -" add.ftz.f32 %f85, %f89, %f85;\n" -" st.shared.f32 [%rd42+1536], %f85;\n" -"$Lt_0_23554:\n" -" shr.u32 %r57, %r57, 1;\n" -" mov.u32 %r60, 0;\n" -" setp.ne.u32 %p11, %r57, %r60;\n" -" @%p11 bra $Lt_0_23298;\n" -"$Lt_0_22786:\n" -" mov.f32 %f27, %f82;\n" -" mov.f32 %f26, %f83;\n" -" mov.f32 %f25, %f84;\n" -" mov.f32 %f28, %f85;\n" -" ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r62, 0;\n" -" setp.le.s32 %p12, %r61, %r62;\n" -" @%p12 bra $Lt_0_24322;\n" -" mov.f32 %f82, %f6;\n" -" st.shared.f32 [%rd42+0], %f82;\n" -" mov.f32 %f83, %f8;\n" -" st.shared.f32 [%rd42+512], %f83;\n" -" mov.f32 %f84, %f10;\n" -" st.shared.f32 [%rd42+1024], %f84;\n" -" mov.f32 %f85, %f12;\n" -" st.shared.f32 [%rd42+1536], %f85;\n" -" mov.f32 %f90, %f14;\n" -" st.shared.f32 [%rd42+2048], %f90;\n" -" mov.f32 %f91, %f15;\n" -" st.shared.f32 [%rd42+2560], %f91;\n" -" mov.s32 %r63, %r56;\n" -" @!%p9 bra $Lt_0_24834;\n" -"$Lt_0_25346:\n" -" setp.ge.u32 %p13, %r13, %r63;\n" -" @%p13 bra $Lt_0_25602;\n" -" add.u32 %r64, %r2, %r63;\n" -" cvt.u64.u32 %rd46, %r64;\n" -" mul.wide.u32 %rd47, %r64, 4;\n" -" add.u64 %rd48, %rd39, %rd47;\n" -" ld.shared.f32 %f92, [%rd48+0];\n" -" add.ftz.f32 %f82, %f92, %f82;\n" -" st.shared.f32 [%rd42+0], %f82;\n" -" ld.shared.f32 %f93, [%rd48+512];\n" -" add.ftz.f32 %f83, %f93, %f83;\n" -" st.shared.f32 [%rd42+512], %f83;\n" -" ld.shared.f32 %f94, [%rd48+1024];\n" -" add.ftz.f32 %f84, %f94, %f84;\n" -" st.shared.f32 [%rd42+1024], %f84;\n" -" ld.shared.f32 %f95, [%rd48+1536];\n" -" add.ftz.f32 %f85, %f95, %f85;\n" -" st.shared.f32 [%rd42+1536], %f85;\n" -" ld.shared.f32 %f96, [%rd48+2048];\n" -" add.ftz.f32 %f90, %f96, %f90;\n" -" st.shared.f32 [%rd42+2048], %f90;\n" -" ld.shared.f32 %f97, [%rd48+2560];\n" -" add.ftz.f32 %f91, %f97, %f91;\n" -" st.shared.f32 [%rd42+2560], %f91;\n" -"$Lt_0_25602:\n" -" shr.u32 %r63, %r63, 1;\n" -" mov.u32 %r65, 0;\n" -" setp.ne.u32 %p14, %r63, %r65;\n" -" @%p14 bra $Lt_0_25346;\n" -"$Lt_0_24834:\n" -" mov.f32 %f6, %f82;\n" -" mov.f32 %f8, %f83;\n" -" mov.f32 %f10, %f84;\n" -" mov.f32 %f12, %f85;\n" -" mov.f32 %f14, %f90;\n" -" mov.f32 %f16, %f91;\n" -"$Lt_0_24322:\n" -"$Lt_0_22274:\n" -" mov.u32 %r66, 0;\n" -" setp.ne.s32 %p15, %r13, %r66;\n" -" @%p15 bra $Lt_0_26370;\n" -" ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];\n" -" add.u64 %rd50, %rd49, %rd5;\n" -" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r68, 0;\n" -" setp.le.s32 %p16, %r67, %r68;\n" -" @%p16 bra $Lt_0_26882;\n" -" st.global.f32 [%rd50+0], %f28;\n" -" cvt.s64.s32 %rd51, %r9;\n" -" mul.wide.s32 %rd52, %r9, 4;\n" -" add.u64 %rd50, %rd50, %rd52;\n" -"$Lt_0_26882:\n" -" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r70, 0;\n" -" setp.le.s32 %p17, %r69, %r70;\n" -" @%p17 bra $Lt_0_27394;\n" -" mov.f32 %f98, %f6;\n" -" st.global.f32 [%rd50+0], %f98;\n" -" cvt.s64.s32 %rd53, %r9;\n" -" mul.wide.s32 %rd54, %r9, 4;\n" -" add.u64 %rd55, %rd54, %rd50;\n" -" mov.f32 %f99, %f8;\n" -" st.global.f32 [%rd55+0], %f99;\n" -" add.u64 %rd56, %rd54, %rd55;\n" -" mov.f32 %f100, %f10;\n" -" st.global.f32 [%rd56+0], %f100;\n" -" add.u64 %rd57, %rd54, %rd56;\n" -" mov.f32 %f101, %f12;\n" -" st.global.f32 [%rd57+0], %f101;\n" -" add.u64 %rd50, %rd54, %rd57;\n" -" mov.f32 %f102, %f14;\n" -" st.global.f32 [%rd50+0], %f102;\n" -" mov.f32 %f103, %f16;\n" -" add.u64 %rd58, %rd54, %rd50;\n" -" st.global.f32 [%rd58+0], %f103;\n" -"$Lt_0_27394:\n" -" ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];\n" -" mul.lo.u64 %rd60, %rd4, 16;\n" -" add.u64 %rd61, %rd59, %rd60;\n" -" mov.f32 %f104, %f105;\n" -" st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f104};\n" -"$Lt_0_26370:\n" -"$Lt_0_18690:\n" -" .loc 16 105 0\n" -" exit;\n" -"$LDWend_kernel_pair:\n" -" }\n" -" .entry kernel_pair_fast (\n" -" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" -" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" -" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" -" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" -" {\n" -" .reg .u32 %r<74>;\n" -" .reg .u64 %rd<75>;\n" -" .reg .f32 %f<114>;\n" -" .reg .pred %p<22>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32620_33_non_const_sp_lj3268[16];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32618_34_non_const_lj13296[1936];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32619_34_non_const_lj35232[1936];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32692_55_non_const_red_acc7168[3072];\n" -" .loc 16 113 0\n" -"$LDWbegin_kernel_pair_fast:\n" -" cvt.s32.u32 %r1, %tid.x;\n" -" mov.u32 %r2, 3;\n" -" setp.gt.s32 %p1, %r1, %r2;\n" -" @%p1 bra $Lt_1_20994;\n" -" .loc 16 121 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32620_33_non_const_sp_lj3268;\n" -" cvt.s64.s32 %rd2, %r1;\n" -" mul.wide.s32 %rd3, %r1, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f32 %f1, [%rd5+0];\n" -" add.u64 %rd6, %rd3, %rd1;\n" -" st.shared.f32 [%rd6+0], %f1;\n" -"$Lt_1_20994:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32620_33_non_const_sp_lj3268;\n" -" mov.u32 %r3, 120;\n" -" setp.gt.s32 %p2, %r1, %r3;\n" -" @%p2 bra $Lt_1_21506;\n" -" .loc 16 123 0\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32618_34_non_const_lj13296;\n" -" cvt.s64.s32 %rd8, %r1;\n" -" mul.wide.s32 %rd9, %r1, 16;\n" -" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n" -" add.u64 %rd11, %rd10, %rd9;\n" -" add.u64 %rd12, %rd9, %rd7;\n" -" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n" -" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n" -" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r5, 0;\n" -" setp.le.s32 %p3, %r4, %r5;\n" -" @%p3 bra $Lt_1_22018;\n" -" .loc 16 125 0\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232;\n" -" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n" -" add.u64 %rd15, %rd14, %rd9;\n" -" add.u64 %rd16, %rd9, %rd13;\n" -" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n" -" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n" -"$Lt_1_22018:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232;\n" -"$Lt_1_21506:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232;\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32618_34_non_const_lj13296;\n" -" .loc 16 133 0\n" -" mov.f32 %f10, 0f00000000; \n" -" mov.f32 %f11, %f10;\n" -" mov.f32 %f12, 0f00000000; \n" -" mov.f32 %f13, %f12;\n" -" mov.f32 %f14, 0f00000000; \n" -" mov.f32 %f15, %f14;\n" -" mov.f32 %f16, 0f00000000; \n" -" mov.f32 %f17, %f16;\n" -" mov.f32 %f18, 0f00000000; \n" -" mov.f32 %f19, %f18;\n" -" mov.f32 %f20, 0f00000000; \n" -" mov.f32 %f21, %f20;\n" -" .loc 16 135 0\n" -" bar.sync 0;\n" -" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n" -" div.s32 %r7, %r1, %r6;\n" -" cvt.s32.u32 %r8, %ntid.x;\n" -" div.s32 %r9, %r8, %r6;\n" -" cvt.s32.u32 %r10, %ctaid.x;\n" -" mul.lo.s32 %r11, %r10, %r9;\n" -" add.s32 %r12, %r7, %r11;\n" -" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n" -" setp.ge.s32 %p4, %r12, %r13;\n" -" @%p4 bra $Lt_1_30210;\n" -" .loc 16 140 0\n" -" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" -" cvt.s64.s32 %rd17, %r14;\n" -" mul.wide.s32 %rd18, %r14, 4;\n" -" cvt.s64.s32 %rd19, %r12;\n" -" mul.wide.s32 %rd20, %r12, 4;\n" -" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n" -" add.u64 %rd22, %rd20, %rd21;\n" -" add.u64 %rd23, %rd18, %rd22;\n" -" ld.global.s32 %r15, [%rd23+0];\n" -" sub.s32 %r16, %r6, 1;\n" -" and.b32 %r17, %r16, %r1;\n" -" cvt.s64.s32 %rd24, %r17;\n" -" mul.wide.s32 %rd25, %r17, 4;\n" -" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n" -" setp.ne.u64 %p5, %rd26, %rd21;\n" -" @%p5 bra $Lt_1_23298;\n" -" cvt.s32.s64 %r18, %rd17;\n" -" mul.lo.s32 %r19, %r18, %r6;\n" -" mov.s32 %r20, %r19;\n" -" mul.lo.s32 %r21, %r16, %r12;\n" -" add.s32 %r22, %r18, %r21;\n" -" cvt.s64.s32 %rd27, %r22;\n" -" mul.wide.s32 %rd28, %r22, 4;\n" -" add.u64 %rd29, %rd23, %rd28;\n" -" and.b32 %r23, %r16, %r15;\n" -" cvt.s64.s32 %rd30, %r23;\n" -" div.s32 %r24, %r15, %r6;\n" -" mul.lo.s32 %r25, %r19, %r24;\n" -" cvt.s64.s32 %rd31, %r25;\n" -" add.u64 %rd32, %rd30, %rd31;\n" -" mul.lo.u64 %rd33, %rd32, 4;\n" -" add.u64 %rd34, %rd29, %rd33;\n" -" add.u64 %rd35, %rd25, %rd29;\n" -" bra.uni $Lt_1_23042;\n" -"$Lt_1_23298:\n" -" add.u64 %rd36, %rd18, %rd23;\n" -" ld.global.s32 %r26, [%rd36+0];\n" -" cvt.s64.s32 %rd37, %r26;\n" -" mul.wide.s32 %rd38, %r26, 4;\n" -" add.u64 %rd39, %rd26, %rd38;\n" -" cvt.s64.s32 %rd40, %r15;\n" -" mul.wide.s32 %rd41, %r15, 4;\n" -" add.u64 %rd34, %rd39, %rd41;\n" -" mov.s32 %r20, %r6;\n" -" add.u64 %rd35, %rd25, %rd39;\n" -"$Lt_1_23042:\n" -" .loc 16 143 0\n" -" ld.global.s32 %r27, [%rd22+0];\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" mov.s32 %r31, 0;\n" -" mov.u32 %r32, %r31;\n" -" mov.s32 %r33, 0;\n" -" mov.u32 %r34, %r33;\n" -" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];\n" -" mov.f32 %f26, %f22;\n" -" mov.f32 %f27, %f23;\n" -" mov.f32 %f28, %f24;\n" -" mov.f32 %f29, %f25;\n" -" setp.ge.u64 %p6, %rd35, %rd34;\n" -" @%p6 bra $Lt_1_31746;\n" -" cvt.rzi.ftz.s32.f32 %r35, %f29;\n" -" cvt.s64.s32 %rd42, %r20;\n" -" mul.lo.s32 %r36, %r35, 11;\n" -" cvt.rn.f32.s32 %f30, %r36;\n" -" mov.f32 %f31, 0f00000000; \n" -" mov.f32 %f32, 0f00000000; \n" -" mov.f32 %f33, 0f00000000; \n" -" mov.f32 %f34, 0f00000000; \n" -"$Lt_1_24066:\n" -" .loc 16 150 0\n" -" ld.global.s32 %r37, [%rd35+0];\n" -" .loc 16 151 0\n" -" shr.s32 %r38, %r37, 30;\n" -" and.b32 %r39, %r38, 3;\n" -" cvt.s64.s32 %rd43, %r39;\n" -" mul.wide.s32 %rd44, %r39, 4;\n" -" add.u64 %rd45, %rd1, %rd44;\n" -" ld.shared.f32 %f35, [%rd45+0];\n" -" .loc 16 154 0\n" -" and.b32 %r40, %r37, 1073741823;\n" -" mov.u32 %r41, %r40;\n" -" mov.s32 %r42, 0;\n" -" mov.u32 %r43, %r42;\n" -" mov.s32 %r44, 0;\n" -" mov.u32 %r45, %r44;\n" -" mov.s32 %r46, 0;\n" -" mov.u32 %r47, %r46;\n" -" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}];\n" -" mov.f32 %f40, %f36;\n" -" mov.f32 %f41, %f37;\n" -" mov.f32 %f42, %f38;\n" -" mov.f32 %f43, %f39;\n" -" sub.ftz.f32 %f44, %f27, %f41;\n" -" sub.ftz.f32 %f45, %f26, %f40;\n" -" sub.ftz.f32 %f46, %f28, %f42;\n" -" mul.ftz.f32 %f47, %f44, %f44;\n" -" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n" -" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n" -" add.ftz.f32 %f50, %f30, %f43;\n" -" cvt.rzi.ftz.s32.f32 %r48, %f50;\n" -" cvt.s64.s32 %rd46, %r48;\n" -" mul.wide.s32 %rd47, %r48, 16;\n" -" add.u64 %rd48, %rd47, %rd7;\n" -" ld.shared.f32 %f51, [%rd48+8];\n" -" setp.gt.ftz.f32 %p7, %f51, %f49;\n" -" @!%p7 bra $Lt_1_25346;\n" -" .loc 16 165 0\n" -" sqrt.approx.ftz.f32 %f52, %f49;\n" -" ld.shared.v4.f32 {%f53,%f54,_,%f55}, [%rd48+0];\n" -" sub.ftz.f32 %f56, %f52, %f55;\n" -" .loc 16 169 0\n" -" mul.ftz.f32 %f57, %f56, %f56;\n" -" rcp.approx.ftz.f32 %f58, %f57;\n" -" mul.ftz.f32 %f59, %f58, %f58;\n" -" mul.ftz.f32 %f60, %f58, %f59;\n" -" mul.ftz.f32 %f61, %f53, %f60;\n" -" sub.ftz.f32 %f62, %f61, %f54;\n" -" mul.ftz.f32 %f63, %f60, %f62;\n" -" .loc 16 170 0\n" -" div.approx.ftz.f32 %f64, %f35, %f56;\n" -" div.approx.ftz.f32 %f65, %f64, %f52;\n" -" mul.ftz.f32 %f66, %f63, %f65;\n" -" .loc 16 172 0\n" -" fma.rn.ftz.f32 %f33, %f45, %f66, %f33;\n" -" .loc 16 173 0\n" -" fma.rn.ftz.f32 %f32, %f44, %f66, %f32;\n" -" .loc 16 174 0\n" -" fma.rn.ftz.f32 %f31, %f46, %f66, %f31;\n" -" ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r50, 0;\n" -" setp.le.s32 %p8, %r49, %r50;\n" -" @%p8 bra $Lt_1_24834;\n" -" .loc 16 177 0\n" -" add.u64 %rd49, %rd47, %rd13;\n" -" ld.shared.v4.f32 {%f67,%f68,%f69,_}, [%rd49+0];\n" -" mul.ftz.f32 %f70, %f67, %f60;\n" -" sub.ftz.f32 %f71, %f70, %f68;\n" -" mul.ftz.f32 %f72, %f60, %f71;\n" -" .loc 16 178 0\n" -" sub.ftz.f32 %f73, %f72, %f69;\n" -" fma.rn.ftz.f32 %f34, %f35, %f73, %f34;\n" -"$Lt_1_24834:\n" -" ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r52, 0;\n" -" setp.le.s32 %p9, %r51, %r52;\n" -" @%p9 bra $Lt_1_25346;\n" -" .loc 16 181 0\n" -" mov.f32 %f74, %f11;\n" -" mul.ftz.f32 %f75, %f45, %f45;\n" -" fma.rn.ftz.f32 %f76, %f66, %f75, %f74;\n" -" mov.f32 %f11, %f76;\n" -" .loc 16 182 0\n" -" mov.f32 %f77, %f13;\n" -" fma.rn.ftz.f32 %f78, %f66, %f47, %f77;\n" -" mov.f32 %f13, %f78;\n" -" .loc 16 183 0\n" -" mov.f32 %f79, %f15;\n" -" mul.ftz.f32 %f80, %f46, %f46;\n" -" fma.rn.ftz.f32 %f81, %f66, %f80, %f79;\n" -" mov.f32 %f15, %f81;\n" -" .loc 16 184 0\n" -" mov.f32 %f82, %f17;\n" -" mul.ftz.f32 %f83, %f44, %f45;\n" -" fma.rn.ftz.f32 %f84, %f66, %f83, %f82;\n" -" mov.f32 %f17, %f84;\n" -" .loc 16 185 0\n" -" mov.f32 %f85, %f19;\n" -" mul.ftz.f32 %f86, %f45, %f46;\n" -" fma.rn.ftz.f32 %f87, %f66, %f86, %f85;\n" -" mov.f32 %f19, %f87;\n" -" .loc 16 186 0\n" -" mul.ftz.f32 %f88, %f44, %f46;\n" -" fma.rn.ftz.f32 %f20, %f66, %f88, %f20;\n" -" mov.f32 %f21, %f20;\n" -"$Lt_1_25346:\n" -"$Lt_1_24322:\n" -" .loc 16 148 0\n" -" mul.lo.u64 %rd50, %rd42, 4;\n" -" add.u64 %rd35, %rd35, %rd50;\n" -" setp.lt.u64 %p10, %rd35, %rd34;\n" -" @%p10 bra $Lt_1_24066;\n" -" bra.uni $Lt_1_23554;\n" -"$Lt_1_31746:\n" -" mov.f32 %f31, 0f00000000; \n" -" mov.f32 %f32, 0f00000000; \n" -" mov.f32 %f33, 0f00000000; \n" -" mov.f32 %f34, 0f00000000; \n" -"$Lt_1_23554:\n" -" mov.u32 %r53, 1;\n" -" setp.le.s32 %p11, %r6, %r53;\n" -" @%p11 bra $Lt_1_28162;\n" -" .loc 16 191 0\n" -" mov.u64 %rd51, __cuda___cuda_local_var_32692_55_non_const_red_acc7168;\n" -" cvt.s64.s32 %rd52, %r1;\n" -" mul.wide.s32 %rd53, %r1, 4;\n" -" add.u64 %rd54, %rd51, %rd53;\n" -" mov.f32 %f89, %f33;\n" -" st.shared.f32 [%rd54+0], %f89;\n" -" mov.f32 %f90, %f32;\n" -" st.shared.f32 [%rd54+512], %f90;\n" -" mov.f32 %f91, %f31;\n" -" st.shared.f32 [%rd54+1024], %f91;\n" -" mov.f32 %f92, %f34;\n" -" st.shared.f32 [%rd54+1536], %f92;\n" -" shr.s32 %r54, %r6, 31;\n" -" mov.s32 %r55, 1;\n" -" and.b32 %r56, %r54, %r55;\n" -" add.s32 %r57, %r56, %r6;\n" -" shr.s32 %r58, %r57, 1;\n" -" mov.s32 %r59, %r58;\n" -" mov.u32 %r60, 0;\n" -" setp.ne.u32 %p12, %r58, %r60;\n" -" @!%p12 bra $Lt_1_26626;\n" -"$Lt_1_27138:\n" -" setp.ge.u32 %p13, %r17, %r59;\n" -" @%p13 bra $Lt_1_27394;\n" -" add.u32 %r61, %r1, %r59;\n" -" cvt.u64.u32 %rd55, %r61;\n" -" mul.wide.u32 %rd56, %r61, 4;\n" -" add.u64 %rd57, %rd51, %rd56;\n" -" ld.shared.f32 %f93, [%rd57+0];\n" -" add.ftz.f32 %f89, %f93, %f89;\n" -" st.shared.f32 [%rd54+0], %f89;\n" -" ld.shared.f32 %f94, [%rd57+512];\n" -" add.ftz.f32 %f90, %f94, %f90;\n" -" st.shared.f32 [%rd54+512], %f90;\n" -" ld.shared.f32 %f95, [%rd57+1024];\n" -" add.ftz.f32 %f91, %f95, %f91;\n" -" st.shared.f32 [%rd54+1024], %f91;\n" -" ld.shared.f32 %f96, [%rd57+1536];\n" -" add.ftz.f32 %f92, %f96, %f92;\n" -" st.shared.f32 [%rd54+1536], %f92;\n" -"$Lt_1_27394:\n" -" shr.u32 %r59, %r59, 1;\n" -" mov.u32 %r62, 0;\n" -" setp.ne.u32 %p14, %r59, %r62;\n" -" @%p14 bra $Lt_1_27138;\n" -"$Lt_1_26626:\n" -" mov.f32 %f33, %f89;\n" -" mov.f32 %f32, %f90;\n" -" mov.f32 %f31, %f91;\n" -" mov.f32 %f34, %f92;\n" -" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r64, 0;\n" -" setp.le.s32 %p15, %r63, %r64;\n" -" @%p15 bra $Lt_1_28162;\n" -" mov.f32 %f89, %f11;\n" -" st.shared.f32 [%rd54+0], %f89;\n" -" mov.f32 %f90, %f13;\n" -" st.shared.f32 [%rd54+512], %f90;\n" -" mov.f32 %f91, %f15;\n" -" st.shared.f32 [%rd54+1024], %f91;\n" -" mov.f32 %f92, %f17;\n" -" st.shared.f32 [%rd54+1536], %f92;\n" -" mov.f32 %f97, %f19;\n" -" st.shared.f32 [%rd54+2048], %f97;\n" -" mov.f32 %f98, %f20;\n" -" st.shared.f32 [%rd54+2560], %f98;\n" -" mov.s32 %r65, %r58;\n" -" @!%p12 bra $Lt_1_28674;\n" -"$Lt_1_29186:\n" -" setp.ge.u32 %p16, %r17, %r65;\n" -" @%p16 bra $Lt_1_29442;\n" -" add.u32 %r66, %r1, %r65;\n" -" cvt.u64.u32 %rd58, %r66;\n" -" mul.wide.u32 %rd59, %r66, 4;\n" -" add.u64 %rd60, %rd51, %rd59;\n" -" ld.shared.f32 %f99, [%rd60+0];\n" -" add.ftz.f32 %f89, %f99, %f89;\n" -" st.shared.f32 [%rd54+0], %f89;\n" -" ld.shared.f32 %f100, [%rd60+512];\n" -" add.ftz.f32 %f90, %f100, %f90;\n" -" st.shared.f32 [%rd54+512], %f90;\n" -" ld.shared.f32 %f101, [%rd60+1024];\n" -" add.ftz.f32 %f91, %f101, %f91;\n" -" st.shared.f32 [%rd54+1024], %f91;\n" -" ld.shared.f32 %f102, [%rd60+1536];\n" -" add.ftz.f32 %f92, %f102, %f92;\n" -" st.shared.f32 [%rd54+1536], %f92;\n" -" ld.shared.f32 %f103, [%rd60+2048];\n" -" add.ftz.f32 %f97, %f103, %f97;\n" -" st.shared.f32 [%rd54+2048], %f97;\n" -" ld.shared.f32 %f104, [%rd60+2560];\n" -" add.ftz.f32 %f98, %f104, %f98;\n" -" st.shared.f32 [%rd54+2560], %f98;\n" -"$Lt_1_29442:\n" -" shr.u32 %r65, %r65, 1;\n" -" mov.u32 %r67, 0;\n" -" setp.ne.u32 %p17, %r65, %r67;\n" -" @%p17 bra $Lt_1_29186;\n" -"$Lt_1_28674:\n" -" mov.f32 %f11, %f89;\n" -" mov.f32 %f13, %f90;\n" -" mov.f32 %f15, %f91;\n" -" mov.f32 %f17, %f92;\n" -" mov.f32 %f19, %f97;\n" -" mov.f32 %f21, %f98;\n" -"$Lt_1_28162:\n" -"$Lt_1_26114:\n" -" mov.u32 %r68, 0;\n" -" setp.ne.s32 %p18, %r17, %r68;\n" -" @%p18 bra $Lt_1_30210;\n" -" ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];\n" -" add.u64 %rd62, %rd61, %rd20;\n" -" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r70, 0;\n" -" setp.le.s32 %p19, %r69, %r70;\n" -" @%p19 bra $Lt_1_30722;\n" -" st.global.f32 [%rd62+0], %f34;\n" -" cvt.s64.s32 %rd63, %r13;\n" -" mul.wide.s32 %rd64, %r13, 4;\n" -" add.u64 %rd62, %rd62, %rd64;\n" -"$Lt_1_30722:\n" -" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r72, 0;\n" -" setp.le.s32 %p20, %r71, %r72;\n" -" @%p20 bra $Lt_1_31234;\n" -" mov.f32 %f105, %f11;\n" -" st.global.f32 [%rd62+0], %f105;\n" -" cvt.s64.s32 %rd65, %r13;\n" -" mul.wide.s32 %rd66, %r13, 4;\n" -" add.u64 %rd67, %rd66, %rd62;\n" -" mov.f32 %f106, %f13;\n" -" st.global.f32 [%rd67+0], %f106;\n" -" add.u64 %rd68, %rd66, %rd67;\n" -" mov.f32 %f107, %f15;\n" -" st.global.f32 [%rd68+0], %f107;\n" -" add.u64 %rd69, %rd66, %rd68;\n" -" mov.f32 %f108, %f17;\n" -" st.global.f32 [%rd69+0], %f108;\n" -" add.u64 %rd62, %rd66, %rd69;\n" -" mov.f32 %f109, %f19;\n" -" st.global.f32 [%rd62+0], %f109;\n" -" mov.f32 %f110, %f21;\n" -" add.u64 %rd70, %rd66, %rd62;\n" -" st.global.f32 [%rd70+0], %f110;\n" -"$Lt_1_31234:\n" -" ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];\n" -" mul.lo.u64 %rd72, %rd19, 16;\n" -" add.u64 %rd73, %rd71, %rd72;\n" -" mov.f32 %f111, %f112;\n" -" st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f111};\n" -"$Lt_1_30210:\n" -"$Lt_1_22530:\n" -" .loc 16 194 0\n" -" exit;\n" -"$LDWend_kernel_pair_fast:\n" -" }\n" -; diff --git a/lib/gpu/lj_ptx.h b/lib/gpu/lj_ptx.h deleted file mode 100644 index 3d9df759d8..0000000000 --- a/lib/gpu/lj_ptx.h +++ /dev/null @@ -1,849 +0,0 @@ -const char * lj = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .global .texref pos_tex;\n" -" .entry kernel_pair (\n" -" .param .u64 __cudaparm_kernel_pair_x_,\n" -" .param .u64 __cudaparm_kernel_pair_lj1,\n" -" .param .u64 __cudaparm_kernel_pair_lj3,\n" -" .param .s32 __cudaparm_kernel_pair_lj_types,\n" -" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_ans,\n" -" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_inum,\n" -" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" -" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" -" {\n" -" .reg .u32 %r<72>;\n" -" .reg .u64 %rd<63>;\n" -" .reg .f32 %f<102>;\n" -" .reg .pred %p<19>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32600_55_non_const_red_acc108[3072];\n" -" .loc 16 31 0\n" -"$LDWbegin_kernel_pair:\n" -" .loc 16 36 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" -" ldu.global.f32 %f1, [%rd1+0];\n" -" .loc 16 37 0\n" -" ld.global.f32 %f2, [%rd1+4];\n" -" .loc 16 38 0\n" -" ld.global.f32 %f3, [%rd1+8];\n" -" .loc 16 39 0\n" -" ld.global.f32 %f4, [%rd1+12];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n" -" .loc 16 46 0\n" -" mov.f32 %f5, 0f00000000; \n" -" mov.f32 %f6, %f5;\n" -" mov.f32 %f7, 0f00000000; \n" -" mov.f32 %f8, %f7;\n" -" mov.f32 %f9, 0f00000000; \n" -" mov.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f00000000; \n" -" mov.f32 %f12, %f11;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, %f13;\n" -" mov.f32 %f15, 0f00000000; \n" -" mov.f32 %f16, %f15;\n" -" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n" -" setp.ge.s32 %p1, %r8, %r9;\n" -" @%p1 bra $Lt_0_26370;\n" -" .loc 16 51 0\n" -" ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];\n" -" cvt.s64.s32 %rd2, %r10;\n" -" mul.wide.s32 %rd3, %r10, 4;\n" -" cvt.s64.s32 %rd4, %r8;\n" -" mul.wide.s32 %rd5, %r8, 4;\n" -" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n" -" add.u64 %rd7, %rd5, %rd6;\n" -" add.u64 %rd8, %rd3, %rd7;\n" -" ld.global.s32 %r11, [%rd8+0];\n" -" sub.s32 %r12, %r1, 1;\n" -" and.b32 %r13, %r12, %r2;\n" -" cvt.s64.s32 %rd9, %r13;\n" -" mul.wide.s32 %rd10, %r13, 4;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n" -" setp.ne.u64 %p2, %rd11, %rd6;\n" -" @%p2 bra $Lt_0_19458;\n" -" cvt.s32.s64 %r14, %rd2;\n" -" mul.lo.s32 %r15, %r14, %r1;\n" -" mov.s32 %r16, %r15;\n" -" mul.lo.s32 %r17, %r12, %r8;\n" -" add.s32 %r18, %r14, %r17;\n" -" cvt.s64.s32 %rd12, %r18;\n" -" mul.wide.s32 %rd13, %r18, 4;\n" -" add.u64 %rd14, %rd8, %rd13;\n" -" and.b32 %r19, %r12, %r11;\n" -" cvt.s64.s32 %rd15, %r19;\n" -" div.s32 %r20, %r11, %r1;\n" -" mul.lo.s32 %r21, %r15, %r20;\n" -" cvt.s64.s32 %rd16, %r21;\n" -" add.u64 %rd17, %rd15, %rd16;\n" -" mul.lo.u64 %rd18, %rd17, 4;\n" -" add.u64 %rd19, %rd14, %rd18;\n" -" add.u64 %rd20, %rd10, %rd14;\n" -" bra.uni $Lt_0_19202;\n" -"$Lt_0_19458:\n" -" add.u64 %rd21, %rd3, %rd8;\n" -" ld.global.s32 %r22, [%rd21+0];\n" -" cvt.s64.s32 %rd22, %r22;\n" -" mul.wide.s32 %rd23, %r22, 4;\n" -" add.u64 %rd24, %rd11, %rd23;\n" -" cvt.s64.s32 %rd25, %r11;\n" -" mul.wide.s32 %rd26, %r11, 4;\n" -" add.u64 %rd19, %rd24, %rd26;\n" -" mov.s32 %r16, %r1;\n" -" add.u64 %rd20, %rd10, %rd24;\n" -"$Lt_0_19202:\n" -" .loc 16 54 0\n" -" ld.global.s32 %r23, [%rd7+0];\n" -" mov.u32 %r24, %r23;\n" -" mov.s32 %r25, 0;\n" -" mov.u32 %r26, %r25;\n" -" mov.s32 %r27, 0;\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n" -" mov.f32 %f21, %f17;\n" -" mov.f32 %f22, %f18;\n" -" mov.f32 %f23, %f19;\n" -" mov.f32 %f24, %f20;\n" -" setp.ge.u64 %p3, %rd20, %rd19;\n" -" @%p3 bra $Lt_0_27906;\n" -" cvt.rzi.ftz.s32.f32 %r31, %f24;\n" -" cvt.s64.s32 %rd27, %r16;\n" -" ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];\n" -" mul.lo.s32 %r33, %r32, %r31;\n" -" ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];\n" -" mov.f32 %f25, 0f00000000; \n" -" mov.f32 %f26, 0f00000000; \n" -" mov.f32 %f27, 0f00000000; \n" -" mov.f32 %f28, 0f00000000; \n" -" mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;\n" -"$Lt_0_20226:\n" -" .loc 16 60 0\n" -" ld.global.s32 %r34, [%rd20+0];\n" -" .loc 16 61 0\n" -" shr.s32 %r35, %r34, 30;\n" -" and.b32 %r36, %r35, 3;\n" -" cvt.s64.s32 %rd30, %r36;\n" -" mul.wide.s32 %rd31, %r36, 4;\n" -" add.u64 %rd32, %rd29, %rd31;\n" -" ld.shared.f32 %f29, [%rd32+0];\n" -" .loc 16 64 0\n" -" and.b32 %r37, %r34, 1073741823;\n" -" mov.u32 %r38, %r37;\n" -" mov.s32 %r39, 0;\n" -" mov.u32 %r40, %r39;\n" -" mov.s32 %r41, 0;\n" -" mov.u32 %r42, %r41;\n" -" mov.s32 %r43, 0;\n" -" mov.u32 %r44, %r43;\n" -" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];\n" -" mov.f32 %f34, %f30;\n" -" mov.f32 %f35, %f31;\n" -" mov.f32 %f36, %f32;\n" -" mov.f32 %f37, %f33;\n" -" cvt.rzi.ftz.s32.f32 %r45, %f37;\n" -" sub.ftz.f32 %f38, %f22, %f35;\n" -" sub.ftz.f32 %f39, %f21, %f34;\n" -" sub.ftz.f32 %f40, %f23, %f36;\n" -" mul.ftz.f32 %f41, %f38, %f38;\n" -" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n" -" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n" -" add.s32 %r46, %r45, %r33;\n" -" cvt.s64.s32 %rd33, %r46;\n" -" mul.wide.s32 %rd34, %r46, 16;\n" -" add.u64 %rd35, %rd34, %rd28;\n" -" ld.global.f32 %f44, [%rd35+8];\n" -" setp.gt.ftz.f32 %p4, %f44, %f43;\n" -" @!%p4 bra $Lt_0_21506;\n" -" .loc 16 78 0\n" -" rcp.approx.ftz.f32 %f45, %f43;\n" -" mul.ftz.f32 %f46, %f45, %f45;\n" -" mul.ftz.f32 %f47, %f45, %f46;\n" -" mul.ftz.f32 %f48, %f45, %f47;\n" -" ld.global.v2.f32 {%f49,%f50}, [%rd35+0];\n" -" mul.ftz.f32 %f51, %f49, %f47;\n" -" sub.ftz.f32 %f52, %f51, %f50;\n" -" mul.ftz.f32 %f53, %f48, %f52;\n" -" mul.ftz.f32 %f54, %f29, %f53;\n" -" .loc 16 80 0\n" -" fma.rn.ftz.f32 %f27, %f39, %f54, %f27;\n" -" .loc 16 81 0\n" -" fma.rn.ftz.f32 %f26, %f38, %f54, %f26;\n" -" .loc 16 82 0\n" -" fma.rn.ftz.f32 %f25, %f40, %f54, %f25;\n" -" ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r48, 0;\n" -" setp.le.s32 %p5, %r47, %r48;\n" -" @%p5 bra $Lt_0_20994;\n" -" .loc 16 86 0\n" -" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];\n" -" add.u64 %rd37, %rd36, %rd34;\n" -" ld.global.v4.f32 {%f55,%f56,%f57,_}, [%rd37+0];\n" -" mul.ftz.f32 %f58, %f55, %f47;\n" -" sub.ftz.f32 %f59, %f58, %f56;\n" -" mul.ftz.f32 %f60, %f47, %f59;\n" -" sub.ftz.f32 %f61, %f60, %f57;\n" -" fma.rn.ftz.f32 %f28, %f29, %f61, %f28;\n" -"$Lt_0_20994:\n" -" ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r50, 0;\n" -" setp.le.s32 %p6, %r49, %r50;\n" -" @%p6 bra $Lt_0_21506;\n" -" .loc 16 89 0\n" -" mov.f32 %f62, %f6;\n" -" mul.ftz.f32 %f63, %f39, %f39;\n" -" fma.rn.ftz.f32 %f64, %f54, %f63, %f62;\n" -" mov.f32 %f6, %f64;\n" -" .loc 16 90 0\n" -" mov.f32 %f65, %f8;\n" -" fma.rn.ftz.f32 %f66, %f54, %f41, %f65;\n" -" mov.f32 %f8, %f66;\n" -" .loc 16 91 0\n" -" mov.f32 %f67, %f10;\n" -" mul.ftz.f32 %f68, %f40, %f40;\n" -" fma.rn.ftz.f32 %f69, %f54, %f68, %f67;\n" -" mov.f32 %f10, %f69;\n" -" .loc 16 92 0\n" -" mov.f32 %f70, %f12;\n" -" mul.ftz.f32 %f71, %f38, %f39;\n" -" fma.rn.ftz.f32 %f72, %f54, %f71, %f70;\n" -" mov.f32 %f12, %f72;\n" -" .loc 16 93 0\n" -" mov.f32 %f73, %f14;\n" -" mul.ftz.f32 %f74, %f39, %f40;\n" -" fma.rn.ftz.f32 %f75, %f54, %f74, %f73;\n" -" mov.f32 %f14, %f75;\n" -" .loc 16 94 0\n" -" mul.ftz.f32 %f76, %f38, %f40;\n" -" fma.rn.ftz.f32 %f15, %f54, %f76, %f15;\n" -" mov.f32 %f16, %f15;\n" -"$Lt_0_21506:\n" -"$Lt_0_20482:\n" -" .loc 16 58 0\n" -" mul.lo.u64 %rd38, %rd27, 4;\n" -" add.u64 %rd20, %rd20, %rd38;\n" -" setp.lt.u64 %p7, %rd20, %rd19;\n" -" @%p7 bra $Lt_0_20226;\n" -" bra.uni $Lt_0_19714;\n" -"$Lt_0_27906:\n" -" mov.f32 %f25, 0f00000000; \n" -" mov.f32 %f26, 0f00000000; \n" -" mov.f32 %f27, 0f00000000; \n" -" mov.f32 %f28, 0f00000000; \n" -"$Lt_0_19714:\n" -" mov.u32 %r51, 1;\n" -" setp.le.s32 %p8, %r1, %r51;\n" -" @%p8 bra $Lt_0_24322;\n" -" .loc 16 99 0\n" -" mov.u64 %rd39, __cuda___cuda_local_var_32600_55_non_const_red_acc108;\n" -" cvt.s64.s32 %rd40, %r2;\n" -" mul.wide.s32 %rd41, %r2, 4;\n" -" add.u64 %rd42, %rd39, %rd41;\n" -" mov.f32 %f77, %f27;\n" -" st.shared.f32 [%rd42+0], %f77;\n" -" mov.f32 %f78, %f26;\n" -" st.shared.f32 [%rd42+512], %f78;\n" -" mov.f32 %f79, %f25;\n" -" st.shared.f32 [%rd42+1024], %f79;\n" -" mov.f32 %f80, %f28;\n" -" st.shared.f32 [%rd42+1536], %f80;\n" -" shr.s32 %r52, %r1, 31;\n" -" mov.s32 %r53, 1;\n" -" and.b32 %r54, %r52, %r53;\n" -" add.s32 %r55, %r54, %r1;\n" -" shr.s32 %r56, %r55, 1;\n" -" mov.s32 %r57, %r56;\n" -" mov.u32 %r58, 0;\n" -" setp.ne.u32 %p9, %r56, %r58;\n" -" @!%p9 bra $Lt_0_22786;\n" -"$Lt_0_23298:\n" -" setp.ge.u32 %p10, %r13, %r57;\n" -" @%p10 bra $Lt_0_23554;\n" -" add.u32 %r59, %r2, %r57;\n" -" cvt.u64.u32 %rd43, %r59;\n" -" mul.wide.u32 %rd44, %r59, 4;\n" -" add.u64 %rd45, %rd39, %rd44;\n" -" ld.shared.f32 %f81, [%rd45+0];\n" -" add.ftz.f32 %f77, %f81, %f77;\n" -" st.shared.f32 [%rd42+0], %f77;\n" -" ld.shared.f32 %f82, [%rd45+512];\n" -" add.ftz.f32 %f78, %f82, %f78;\n" -" st.shared.f32 [%rd42+512], %f78;\n" -" ld.shared.f32 %f83, [%rd45+1024];\n" -" add.ftz.f32 %f79, %f83, %f79;\n" -" st.shared.f32 [%rd42+1024], %f79;\n" -" ld.shared.f32 %f84, [%rd45+1536];\n" -" add.ftz.f32 %f80, %f84, %f80;\n" -" st.shared.f32 [%rd42+1536], %f80;\n" -"$Lt_0_23554:\n" -" shr.u32 %r57, %r57, 1;\n" -" mov.u32 %r60, 0;\n" -" setp.ne.u32 %p11, %r57, %r60;\n" -" @%p11 bra $Lt_0_23298;\n" -"$Lt_0_22786:\n" -" mov.f32 %f27, %f77;\n" -" mov.f32 %f26, %f78;\n" -" mov.f32 %f25, %f79;\n" -" mov.f32 %f28, %f80;\n" -" ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r62, 0;\n" -" setp.le.s32 %p12, %r61, %r62;\n" -" @%p12 bra $Lt_0_24322;\n" -" mov.f32 %f77, %f6;\n" -" st.shared.f32 [%rd42+0], %f77;\n" -" mov.f32 %f78, %f8;\n" -" st.shared.f32 [%rd42+512], %f78;\n" -" mov.f32 %f79, %f10;\n" -" st.shared.f32 [%rd42+1024], %f79;\n" -" mov.f32 %f80, %f12;\n" -" st.shared.f32 [%rd42+1536], %f80;\n" -" mov.f32 %f85, %f14;\n" -" st.shared.f32 [%rd42+2048], %f85;\n" -" mov.f32 %f86, %f15;\n" -" st.shared.f32 [%rd42+2560], %f86;\n" -" mov.s32 %r63, %r56;\n" -" @!%p9 bra $Lt_0_24834;\n" -"$Lt_0_25346:\n" -" setp.ge.u32 %p13, %r13, %r63;\n" -" @%p13 bra $Lt_0_25602;\n" -" add.u32 %r64, %r2, %r63;\n" -" cvt.u64.u32 %rd46, %r64;\n" -" mul.wide.u32 %rd47, %r64, 4;\n" -" add.u64 %rd48, %rd39, %rd47;\n" -" ld.shared.f32 %f87, [%rd48+0];\n" -" add.ftz.f32 %f77, %f87, %f77;\n" -" st.shared.f32 [%rd42+0], %f77;\n" -" ld.shared.f32 %f88, [%rd48+512];\n" -" add.ftz.f32 %f78, %f88, %f78;\n" -" st.shared.f32 [%rd42+512], %f78;\n" -" ld.shared.f32 %f89, [%rd48+1024];\n" -" add.ftz.f32 %f79, %f89, %f79;\n" -" st.shared.f32 [%rd42+1024], %f79;\n" -" ld.shared.f32 %f90, [%rd48+1536];\n" -" add.ftz.f32 %f80, %f90, %f80;\n" -" st.shared.f32 [%rd42+1536], %f80;\n" -" ld.shared.f32 %f91, [%rd48+2048];\n" -" add.ftz.f32 %f85, %f91, %f85;\n" -" st.shared.f32 [%rd42+2048], %f85;\n" -" ld.shared.f32 %f92, [%rd48+2560];\n" -" add.ftz.f32 %f86, %f92, %f86;\n" -" st.shared.f32 [%rd42+2560], %f86;\n" -"$Lt_0_25602:\n" -" shr.u32 %r63, %r63, 1;\n" -" mov.u32 %r65, 0;\n" -" setp.ne.u32 %p14, %r63, %r65;\n" -" @%p14 bra $Lt_0_25346;\n" -"$Lt_0_24834:\n" -" mov.f32 %f6, %f77;\n" -" mov.f32 %f8, %f78;\n" -" mov.f32 %f10, %f79;\n" -" mov.f32 %f12, %f80;\n" -" mov.f32 %f14, %f85;\n" -" mov.f32 %f16, %f86;\n" -"$Lt_0_24322:\n" -"$Lt_0_22274:\n" -" mov.u32 %r66, 0;\n" -" setp.ne.s32 %p15, %r13, %r66;\n" -" @%p15 bra $Lt_0_26370;\n" -" ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];\n" -" add.u64 %rd50, %rd49, %rd5;\n" -" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r68, 0;\n" -" setp.le.s32 %p16, %r67, %r68;\n" -" @%p16 bra $Lt_0_26882;\n" -" st.global.f32 [%rd50+0], %f28;\n" -" cvt.s64.s32 %rd51, %r9;\n" -" mul.wide.s32 %rd52, %r9, 4;\n" -" add.u64 %rd50, %rd50, %rd52;\n" -"$Lt_0_26882:\n" -" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r70, 0;\n" -" setp.le.s32 %p17, %r69, %r70;\n" -" @%p17 bra $Lt_0_27394;\n" -" mov.f32 %f93, %f6;\n" -" st.global.f32 [%rd50+0], %f93;\n" -" cvt.s64.s32 %rd53, %r9;\n" -" mul.wide.s32 %rd54, %r9, 4;\n" -" add.u64 %rd55, %rd54, %rd50;\n" -" mov.f32 %f94, %f8;\n" -" st.global.f32 [%rd55+0], %f94;\n" -" add.u64 %rd56, %rd54, %rd55;\n" -" mov.f32 %f95, %f10;\n" -" st.global.f32 [%rd56+0], %f95;\n" -" add.u64 %rd57, %rd54, %rd56;\n" -" mov.f32 %f96, %f12;\n" -" st.global.f32 [%rd57+0], %f96;\n" -" add.u64 %rd50, %rd54, %rd57;\n" -" mov.f32 %f97, %f14;\n" -" st.global.f32 [%rd50+0], %f97;\n" -" mov.f32 %f98, %f16;\n" -" add.u64 %rd58, %rd54, %rd50;\n" -" st.global.f32 [%rd58+0], %f98;\n" -"$Lt_0_27394:\n" -" ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];\n" -" mul.lo.u64 %rd60, %rd4, 16;\n" -" add.u64 %rd61, %rd59, %rd60;\n" -" mov.f32 %f99, %f100;\n" -" st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f99};\n" -"$Lt_0_26370:\n" -"$Lt_0_18690:\n" -" .loc 16 102 0\n" -" exit;\n" -"$LDWend_kernel_pair:\n" -" }\n" -" .entry kernel_pair_fast (\n" -" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" -" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" -" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" -" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" -" {\n" -" .reg .u32 %r<74>;\n" -" .reg .u64 %rd<75>;\n" -" .reg .f32 %f<109>;\n" -" .reg .pred %p<22>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32617_33_non_const_sp_lj3268[16];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32615_34_non_const_lj13296[1936];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_lj35232[1936];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32685_55_non_const_red_acc7168[3072];\n" -" .loc 16 110 0\n" -"$LDWbegin_kernel_pair_fast:\n" -" cvt.s32.u32 %r1, %tid.x;\n" -" mov.u32 %r2, 3;\n" -" setp.gt.s32 %p1, %r1, %r2;\n" -" @%p1 bra $Lt_1_20994;\n" -" .loc 16 118 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32617_33_non_const_sp_lj3268;\n" -" cvt.s64.s32 %rd2, %r1;\n" -" mul.wide.s32 %rd3, %r1, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f32 %f1, [%rd5+0];\n" -" add.u64 %rd6, %rd3, %rd1;\n" -" st.shared.f32 [%rd6+0], %f1;\n" -"$Lt_1_20994:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32617_33_non_const_sp_lj3268;\n" -" mov.u32 %r3, 120;\n" -" setp.gt.s32 %p2, %r1, %r3;\n" -" @%p2 bra $Lt_1_21506;\n" -" .loc 16 120 0\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32615_34_non_const_lj13296;\n" -" cvt.s64.s32 %rd8, %r1;\n" -" mul.wide.s32 %rd9, %r1, 16;\n" -" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n" -" add.u64 %rd11, %rd10, %rd9;\n" -" add.u64 %rd12, %rd9, %rd7;\n" -" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n" -" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n" -" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r5, 0;\n" -" setp.le.s32 %p3, %r4, %r5;\n" -" @%p3 bra $Lt_1_22018;\n" -" .loc 16 122 0\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232;\n" -" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n" -" add.u64 %rd15, %rd14, %rd9;\n" -" add.u64 %rd16, %rd9, %rd13;\n" -" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n" -" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n" -"$Lt_1_22018:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232;\n" -"$Lt_1_21506:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232;\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32615_34_non_const_lj13296;\n" -" .loc 16 130 0\n" -" mov.f32 %f10, 0f00000000; \n" -" mov.f32 %f11, %f10;\n" -" mov.f32 %f12, 0f00000000; \n" -" mov.f32 %f13, %f12;\n" -" mov.f32 %f14, 0f00000000; \n" -" mov.f32 %f15, %f14;\n" -" mov.f32 %f16, 0f00000000; \n" -" mov.f32 %f17, %f16;\n" -" mov.f32 %f18, 0f00000000; \n" -" mov.f32 %f19, %f18;\n" -" mov.f32 %f20, 0f00000000; \n" -" mov.f32 %f21, %f20;\n" -" .loc 16 132 0\n" -" bar.sync 0;\n" -" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n" -" div.s32 %r7, %r1, %r6;\n" -" cvt.s32.u32 %r8, %ntid.x;\n" -" div.s32 %r9, %r8, %r6;\n" -" cvt.s32.u32 %r10, %ctaid.x;\n" -" mul.lo.s32 %r11, %r10, %r9;\n" -" add.s32 %r12, %r7, %r11;\n" -" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n" -" setp.ge.s32 %p4, %r12, %r13;\n" -" @%p4 bra $Lt_1_30210;\n" -" .loc 16 137 0\n" -" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" -" cvt.s64.s32 %rd17, %r14;\n" -" mul.wide.s32 %rd18, %r14, 4;\n" -" cvt.s64.s32 %rd19, %r12;\n" -" mul.wide.s32 %rd20, %r12, 4;\n" -" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n" -" add.u64 %rd22, %rd20, %rd21;\n" -" add.u64 %rd23, %rd18, %rd22;\n" -" ld.global.s32 %r15, [%rd23+0];\n" -" sub.s32 %r16, %r6, 1;\n" -" and.b32 %r17, %r16, %r1;\n" -" cvt.s64.s32 %rd24, %r17;\n" -" mul.wide.s32 %rd25, %r17, 4;\n" -" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n" -" setp.ne.u64 %p5, %rd26, %rd21;\n" -" @%p5 bra $Lt_1_23298;\n" -" cvt.s32.s64 %r18, %rd17;\n" -" mul.lo.s32 %r19, %r18, %r6;\n" -" mov.s32 %r20, %r19;\n" -" mul.lo.s32 %r21, %r16, %r12;\n" -" add.s32 %r22, %r18, %r21;\n" -" cvt.s64.s32 %rd27, %r22;\n" -" mul.wide.s32 %rd28, %r22, 4;\n" -" add.u64 %rd29, %rd23, %rd28;\n" -" and.b32 %r23, %r16, %r15;\n" -" cvt.s64.s32 %rd30, %r23;\n" -" div.s32 %r24, %r15, %r6;\n" -" mul.lo.s32 %r25, %r19, %r24;\n" -" cvt.s64.s32 %rd31, %r25;\n" -" add.u64 %rd32, %rd30, %rd31;\n" -" mul.lo.u64 %rd33, %rd32, 4;\n" -" add.u64 %rd34, %rd29, %rd33;\n" -" add.u64 %rd35, %rd25, %rd29;\n" -" bra.uni $Lt_1_23042;\n" -"$Lt_1_23298:\n" -" add.u64 %rd36, %rd18, %rd23;\n" -" ld.global.s32 %r26, [%rd36+0];\n" -" cvt.s64.s32 %rd37, %r26;\n" -" mul.wide.s32 %rd38, %r26, 4;\n" -" add.u64 %rd39, %rd26, %rd38;\n" -" cvt.s64.s32 %rd40, %r15;\n" -" mul.wide.s32 %rd41, %r15, 4;\n" -" add.u64 %rd34, %rd39, %rd41;\n" -" mov.s32 %r20, %r6;\n" -" add.u64 %rd35, %rd25, %rd39;\n" -"$Lt_1_23042:\n" -" .loc 16 140 0\n" -" ld.global.s32 %r27, [%rd22+0];\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" mov.s32 %r31, 0;\n" -" mov.u32 %r32, %r31;\n" -" mov.s32 %r33, 0;\n" -" mov.u32 %r34, %r33;\n" -" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];\n" -" mov.f32 %f26, %f22;\n" -" mov.f32 %f27, %f23;\n" -" mov.f32 %f28, %f24;\n" -" mov.f32 %f29, %f25;\n" -" setp.ge.u64 %p6, %rd35, %rd34;\n" -" @%p6 bra $Lt_1_31746;\n" -" cvt.rzi.ftz.s32.f32 %r35, %f29;\n" -" cvt.s64.s32 %rd42, %r20;\n" -" mul.lo.s32 %r36, %r35, 11;\n" -" cvt.rn.f32.s32 %f30, %r36;\n" -" mov.f32 %f31, 0f00000000; \n" -" mov.f32 %f32, 0f00000000; \n" -" mov.f32 %f33, 0f00000000; \n" -" mov.f32 %f34, 0f00000000; \n" -"$Lt_1_24066:\n" -" .loc 16 147 0\n" -" ld.global.s32 %r37, [%rd35+0];\n" -" .loc 16 148 0\n" -" shr.s32 %r38, %r37, 30;\n" -" and.b32 %r39, %r38, 3;\n" -" cvt.s64.s32 %rd43, %r39;\n" -" mul.wide.s32 %rd44, %r39, 4;\n" -" add.u64 %rd45, %rd1, %rd44;\n" -" ld.shared.f32 %f35, [%rd45+0];\n" -" .loc 16 151 0\n" -" and.b32 %r40, %r37, 1073741823;\n" -" mov.u32 %r41, %r40;\n" -" mov.s32 %r42, 0;\n" -" mov.u32 %r43, %r42;\n" -" mov.s32 %r44, 0;\n" -" mov.u32 %r45, %r44;\n" -" mov.s32 %r46, 0;\n" -" mov.u32 %r47, %r46;\n" -" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}];\n" -" mov.f32 %f40, %f36;\n" -" mov.f32 %f41, %f37;\n" -" mov.f32 %f42, %f38;\n" -" mov.f32 %f43, %f39;\n" -" sub.ftz.f32 %f44, %f27, %f41;\n" -" sub.ftz.f32 %f45, %f26, %f40;\n" -" sub.ftz.f32 %f46, %f28, %f42;\n" -" mul.ftz.f32 %f47, %f44, %f44;\n" -" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n" -" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n" -" add.ftz.f32 %f50, %f30, %f43;\n" -" cvt.rzi.ftz.s32.f32 %r48, %f50;\n" -" cvt.s64.s32 %rd46, %r48;\n" -" mul.wide.s32 %rd47, %r48, 16;\n" -" add.u64 %rd48, %rd47, %rd7;\n" -" ld.shared.f32 %f51, [%rd48+8];\n" -" setp.gt.ftz.f32 %p7, %f51, %f49;\n" -" @!%p7 bra $Lt_1_25346;\n" -" .loc 16 163 0\n" -" rcp.approx.ftz.f32 %f52, %f49;\n" -" mul.ftz.f32 %f53, %f52, %f52;\n" -" mul.ftz.f32 %f54, %f52, %f53;\n" -" mul.ftz.f32 %f55, %f52, %f35;\n" -" mul.ftz.f32 %f56, %f54, %f55;\n" -" ld.shared.v2.f32 {%f57,%f58}, [%rd48+0];\n" -" mul.ftz.f32 %f59, %f57, %f54;\n" -" sub.ftz.f32 %f60, %f59, %f58;\n" -" mul.ftz.f32 %f61, %f56, %f60;\n" -" .loc 16 165 0\n" -" fma.rn.ftz.f32 %f33, %f45, %f61, %f33;\n" -" .loc 16 166 0\n" -" fma.rn.ftz.f32 %f32, %f44, %f61, %f32;\n" -" .loc 16 167 0\n" -" fma.rn.ftz.f32 %f31, %f46, %f61, %f31;\n" -" ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r50, 0;\n" -" setp.le.s32 %p8, %r49, %r50;\n" -" @%p8 bra $Lt_1_24834;\n" -" .loc 16 170 0\n" -" add.u64 %rd49, %rd47, %rd13;\n" -" ld.shared.v4.f32 {%f62,%f63,%f64,_}, [%rd49+0];\n" -" mul.ftz.f32 %f65, %f62, %f54;\n" -" sub.ftz.f32 %f66, %f65, %f63;\n" -" mul.ftz.f32 %f67, %f54, %f66;\n" -" .loc 16 171 0\n" -" sub.ftz.f32 %f68, %f67, %f64;\n" -" fma.rn.ftz.f32 %f34, %f35, %f68, %f34;\n" -"$Lt_1_24834:\n" -" ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r52, 0;\n" -" setp.le.s32 %p9, %r51, %r52;\n" -" @%p9 bra $Lt_1_25346;\n" -" .loc 16 174 0\n" -" mov.f32 %f69, %f11;\n" -" mul.ftz.f32 %f70, %f45, %f45;\n" -" fma.rn.ftz.f32 %f71, %f61, %f70, %f69;\n" -" mov.f32 %f11, %f71;\n" -" .loc 16 175 0\n" -" mov.f32 %f72, %f13;\n" -" fma.rn.ftz.f32 %f73, %f61, %f47, %f72;\n" -" mov.f32 %f13, %f73;\n" -" .loc 16 176 0\n" -" mov.f32 %f74, %f15;\n" -" mul.ftz.f32 %f75, %f46, %f46;\n" -" fma.rn.ftz.f32 %f76, %f61, %f75, %f74;\n" -" mov.f32 %f15, %f76;\n" -" .loc 16 177 0\n" -" mov.f32 %f77, %f17;\n" -" mul.ftz.f32 %f78, %f44, %f45;\n" -" fma.rn.ftz.f32 %f79, %f61, %f78, %f77;\n" -" mov.f32 %f17, %f79;\n" -" .loc 16 178 0\n" -" mov.f32 %f80, %f19;\n" -" mul.ftz.f32 %f81, %f45, %f46;\n" -" fma.rn.ftz.f32 %f82, %f61, %f81, %f80;\n" -" mov.f32 %f19, %f82;\n" -" .loc 16 179 0\n" -" mul.ftz.f32 %f83, %f44, %f46;\n" -" fma.rn.ftz.f32 %f20, %f61, %f83, %f20;\n" -" mov.f32 %f21, %f20;\n" -"$Lt_1_25346:\n" -"$Lt_1_24322:\n" -" .loc 16 145 0\n" -" mul.lo.u64 %rd50, %rd42, 4;\n" -" add.u64 %rd35, %rd35, %rd50;\n" -" setp.lt.u64 %p10, %rd35, %rd34;\n" -" @%p10 bra $Lt_1_24066;\n" -" bra.uni $Lt_1_23554;\n" -"$Lt_1_31746:\n" -" mov.f32 %f31, 0f00000000; \n" -" mov.f32 %f32, 0f00000000; \n" -" mov.f32 %f33, 0f00000000; \n" -" mov.f32 %f34, 0f00000000; \n" -"$Lt_1_23554:\n" -" mov.u32 %r53, 1;\n" -" setp.le.s32 %p11, %r6, %r53;\n" -" @%p11 bra $Lt_1_28162;\n" -" .loc 16 184 0\n" -" mov.u64 %rd51, __cuda___cuda_local_var_32685_55_non_const_red_acc7168;\n" -" cvt.s64.s32 %rd52, %r1;\n" -" mul.wide.s32 %rd53, %r1, 4;\n" -" add.u64 %rd54, %rd51, %rd53;\n" -" mov.f32 %f84, %f33;\n" -" st.shared.f32 [%rd54+0], %f84;\n" -" mov.f32 %f85, %f32;\n" -" st.shared.f32 [%rd54+512], %f85;\n" -" mov.f32 %f86, %f31;\n" -" st.shared.f32 [%rd54+1024], %f86;\n" -" mov.f32 %f87, %f34;\n" -" st.shared.f32 [%rd54+1536], %f87;\n" -" shr.s32 %r54, %r6, 31;\n" -" mov.s32 %r55, 1;\n" -" and.b32 %r56, %r54, %r55;\n" -" add.s32 %r57, %r56, %r6;\n" -" shr.s32 %r58, %r57, 1;\n" -" mov.s32 %r59, %r58;\n" -" mov.u32 %r60, 0;\n" -" setp.ne.u32 %p12, %r58, %r60;\n" -" @!%p12 bra $Lt_1_26626;\n" -"$Lt_1_27138:\n" -" setp.ge.u32 %p13, %r17, %r59;\n" -" @%p13 bra $Lt_1_27394;\n" -" add.u32 %r61, %r1, %r59;\n" -" cvt.u64.u32 %rd55, %r61;\n" -" mul.wide.u32 %rd56, %r61, 4;\n" -" add.u64 %rd57, %rd51, %rd56;\n" -" ld.shared.f32 %f88, [%rd57+0];\n" -" add.ftz.f32 %f84, %f88, %f84;\n" -" st.shared.f32 [%rd54+0], %f84;\n" -" ld.shared.f32 %f89, [%rd57+512];\n" -" add.ftz.f32 %f85, %f89, %f85;\n" -" st.shared.f32 [%rd54+512], %f85;\n" -" ld.shared.f32 %f90, [%rd57+1024];\n" -" add.ftz.f32 %f86, %f90, %f86;\n" -" st.shared.f32 [%rd54+1024], %f86;\n" -" ld.shared.f32 %f91, [%rd57+1536];\n" -" add.ftz.f32 %f87, %f91, %f87;\n" -" st.shared.f32 [%rd54+1536], %f87;\n" -"$Lt_1_27394:\n" -" shr.u32 %r59, %r59, 1;\n" -" mov.u32 %r62, 0;\n" -" setp.ne.u32 %p14, %r59, %r62;\n" -" @%p14 bra $Lt_1_27138;\n" -"$Lt_1_26626:\n" -" mov.f32 %f33, %f84;\n" -" mov.f32 %f32, %f85;\n" -" mov.f32 %f31, %f86;\n" -" mov.f32 %f34, %f87;\n" -" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r64, 0;\n" -" setp.le.s32 %p15, %r63, %r64;\n" -" @%p15 bra $Lt_1_28162;\n" -" mov.f32 %f84, %f11;\n" -" st.shared.f32 [%rd54+0], %f84;\n" -" mov.f32 %f85, %f13;\n" -" st.shared.f32 [%rd54+512], %f85;\n" -" mov.f32 %f86, %f15;\n" -" st.shared.f32 [%rd54+1024], %f86;\n" -" mov.f32 %f87, %f17;\n" -" st.shared.f32 [%rd54+1536], %f87;\n" -" mov.f32 %f92, %f19;\n" -" st.shared.f32 [%rd54+2048], %f92;\n" -" mov.f32 %f93, %f20;\n" -" st.shared.f32 [%rd54+2560], %f93;\n" -" mov.s32 %r65, %r58;\n" -" @!%p12 bra $Lt_1_28674;\n" -"$Lt_1_29186:\n" -" setp.ge.u32 %p16, %r17, %r65;\n" -" @%p16 bra $Lt_1_29442;\n" -" add.u32 %r66, %r1, %r65;\n" -" cvt.u64.u32 %rd58, %r66;\n" -" mul.wide.u32 %rd59, %r66, 4;\n" -" add.u64 %rd60, %rd51, %rd59;\n" -" ld.shared.f32 %f94, [%rd60+0];\n" -" add.ftz.f32 %f84, %f94, %f84;\n" -" st.shared.f32 [%rd54+0], %f84;\n" -" ld.shared.f32 %f95, [%rd60+512];\n" -" add.ftz.f32 %f85, %f95, %f85;\n" -" st.shared.f32 [%rd54+512], %f85;\n" -" ld.shared.f32 %f96, [%rd60+1024];\n" -" add.ftz.f32 %f86, %f96, %f86;\n" -" st.shared.f32 [%rd54+1024], %f86;\n" -" ld.shared.f32 %f97, [%rd60+1536];\n" -" add.ftz.f32 %f87, %f97, %f87;\n" -" st.shared.f32 [%rd54+1536], %f87;\n" -" ld.shared.f32 %f98, [%rd60+2048];\n" -" add.ftz.f32 %f92, %f98, %f92;\n" -" st.shared.f32 [%rd54+2048], %f92;\n" -" ld.shared.f32 %f99, [%rd60+2560];\n" -" add.ftz.f32 %f93, %f99, %f93;\n" -" st.shared.f32 [%rd54+2560], %f93;\n" -"$Lt_1_29442:\n" -" shr.u32 %r65, %r65, 1;\n" -" mov.u32 %r67, 0;\n" -" setp.ne.u32 %p17, %r65, %r67;\n" -" @%p17 bra $Lt_1_29186;\n" -"$Lt_1_28674:\n" -" mov.f32 %f11, %f84;\n" -" mov.f32 %f13, %f85;\n" -" mov.f32 %f15, %f86;\n" -" mov.f32 %f17, %f87;\n" -" mov.f32 %f19, %f92;\n" -" mov.f32 %f21, %f93;\n" -"$Lt_1_28162:\n" -"$Lt_1_26114:\n" -" mov.u32 %r68, 0;\n" -" setp.ne.s32 %p18, %r17, %r68;\n" -" @%p18 bra $Lt_1_30210;\n" -" ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];\n" -" add.u64 %rd62, %rd61, %rd20;\n" -" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r70, 0;\n" -" setp.le.s32 %p19, %r69, %r70;\n" -" @%p19 bra $Lt_1_30722;\n" -" st.global.f32 [%rd62+0], %f34;\n" -" cvt.s64.s32 %rd63, %r13;\n" -" mul.wide.s32 %rd64, %r13, 4;\n" -" add.u64 %rd62, %rd62, %rd64;\n" -"$Lt_1_30722:\n" -" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r72, 0;\n" -" setp.le.s32 %p20, %r71, %r72;\n" -" @%p20 bra $Lt_1_31234;\n" -" mov.f32 %f100, %f11;\n" -" st.global.f32 [%rd62+0], %f100;\n" -" cvt.s64.s32 %rd65, %r13;\n" -" mul.wide.s32 %rd66, %r13, 4;\n" -" add.u64 %rd67, %rd66, %rd62;\n" -" mov.f32 %f101, %f13;\n" -" st.global.f32 [%rd67+0], %f101;\n" -" add.u64 %rd68, %rd66, %rd67;\n" -" mov.f32 %f102, %f15;\n" -" st.global.f32 [%rd68+0], %f102;\n" -" add.u64 %rd69, %rd66, %rd68;\n" -" mov.f32 %f103, %f17;\n" -" st.global.f32 [%rd69+0], %f103;\n" -" add.u64 %rd62, %rd66, %rd69;\n" -" mov.f32 %f104, %f19;\n" -" st.global.f32 [%rd62+0], %f104;\n" -" mov.f32 %f105, %f21;\n" -" add.u64 %rd70, %rd66, %rd62;\n" -" st.global.f32 [%rd70+0], %f105;\n" -"$Lt_1_31234:\n" -" ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];\n" -" mul.lo.u64 %rd72, %rd19, 16;\n" -" add.u64 %rd73, %rd71, %rd72;\n" -" mov.f32 %f106, %f107;\n" -" st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f106};\n" -"$Lt_1_30210:\n" -"$Lt_1_22530:\n" -" .loc 16 187 0\n" -" exit;\n" -"$LDWend_kernel_pair_fast:\n" -" }\n" -; diff --git a/lib/gpu/morse.ptx b/lib/gpu/morse.ptx deleted file mode 100644 index 4075a7371d..0000000000 --- a/lib/gpu/morse.ptx +++ /dev/null @@ -1,921 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009e26_00000000-9_lal_morse.cpp3.i (/home/sjplimp/ccBI#.ffCTdB) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009e26_00000000-8_lal_morse.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_morse.cu" - .file 17 "/usr/local/cuda/include/common_functions.h" - .file 18 "/usr/local/cuda/include/math_functions.h" - .file 19 "/usr/local/cuda/include/math_constants.h" - .file 20 "/usr/local/cuda/include/device_functions.h" - .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - .global .texref pos_tex; - - .entry kernel_pair ( - .param .u64 __cudaparm_kernel_pair_x_, - .param .u64 __cudaparm_kernel_pair_mor1, - .param .u64 __cudaparm_kernel_pair_mor2, - .param .s32 __cudaparm_kernel_pair_lj_types, - .param .u64 __cudaparm_kernel_pair_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_dev_nbor, - .param .u64 __cudaparm_kernel_pair_dev_packed, - .param .u64 __cudaparm_kernel_pair_ans, - .param .u64 __cudaparm_kernel_pair___val_paramengv, - .param .s32 __cudaparm_kernel_pair_eflag, - .param .s32 __cudaparm_kernel_pair_vflag, - .param .s32 __cudaparm_kernel_pair_inum, - .param .s32 __cudaparm_kernel_pair_nbor_pitch, - .param .s32 __cudaparm_kernel_pair_t_per_atom) - { - .reg .u32 %r<72>; - .reg .u64 %rd<64>; - .reg .f32 %f<104>; - .reg .f64 %fd<10>; - .reg .pred %p<19>; - .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16]; - .shared .align 4 .b8 __cuda___cuda_local_var_32601_55_non_const_red_acc108[3072]; - // __cuda_local_var_32543_10_non_const_f = 48 - // __cuda_local_var_32545_9_non_const_virial = 16 - .loc 16 31 0 -$LDWbegin_kernel_pair: - .loc 16 36 0 - ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in]; - ldu.global.f32 %f1, [%rd1+0]; - .loc 16 37 0 - ld.global.f32 %f2, [%rd1+4]; - .loc 16 38 0 - ld.global.f32 %f3, [%rd1+8]; - .loc 16 39 0 - ld.global.f32 %f4, [%rd1+12]; - st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4}; - .loc 16 46 0 - mov.f32 %f5, 0f00000000; // 0 - mov.f32 %f6, %f5; - mov.f32 %f7, 0f00000000; // 0 - mov.f32 %f8, %f7; - mov.f32 %f9, 0f00000000; // 0 - mov.f32 %f10, %f9; - mov.f32 %f11, 0f00000000; // 0 - mov.f32 %f12, %f11; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, %f13; - mov.f32 %f15, 0f00000000; // 0 - mov.f32 %f16, %f15; - ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_pair_inum]; - setp.ge.s32 %p1, %r8, %r9; - @%p1 bra $Lt_0_26370; - .loc 16 51 0 - ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch]; - cvt.s64.s32 %rd2, %r10; - mul.wide.s32 %rd3, %r10, 4; - cvt.s64.s32 %rd4, %r8; - mul.wide.s32 %rd5, %r8, 4; - ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor]; - add.u64 %rd7, %rd5, %rd6; - add.u64 %rd8, %rd3, %rd7; - ld.global.s32 %r11, [%rd8+0]; - sub.s32 %r12, %r1, 1; - and.b32 %r13, %r12, %r2; - cvt.s64.s32 %rd9, %r13; - mul.wide.s32 %rd10, %r13, 4; - ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed]; - setp.ne.u64 %p2, %rd11, %rd6; - @%p2 bra $Lt_0_19458; - cvt.s32.s64 %r14, %rd2; - mul.lo.s32 %r15, %r14, %r1; - mov.s32 %r16, %r15; - mul.lo.s32 %r17, %r12, %r8; - add.s32 %r18, %r14, %r17; - cvt.s64.s32 %rd12, %r18; - mul.wide.s32 %rd13, %r18, 4; - add.u64 %rd14, %rd8, %rd13; - and.b32 %r19, %r12, %r11; - cvt.s64.s32 %rd15, %r19; - div.s32 %r20, %r11, %r1; - mul.lo.s32 %r21, %r15, %r20; - cvt.s64.s32 %rd16, %r21; - add.u64 %rd17, %rd15, %rd16; - mul.lo.u64 %rd18, %rd17, 4; - add.u64 %rd19, %rd14, %rd18; - add.u64 %rd20, %rd10, %rd14; - bra.uni $Lt_0_19202; -$Lt_0_19458: - add.u64 %rd21, %rd3, %rd8; - ld.global.s32 %r22, [%rd21+0]; - cvt.s64.s32 %rd22, %r22; - mul.wide.s32 %rd23, %r22, 4; - add.u64 %rd24, %rd11, %rd23; - cvt.s64.s32 %rd25, %r11; - mul.wide.s32 %rd26, %r11, 4; - add.u64 %rd19, %rd24, %rd26; - mov.s32 %r16, %r1; - add.u64 %rd20, %rd10, %rd24; -$Lt_0_19202: - .loc 16 54 0 - ld.global.s32 %r23, [%rd7+0]; - mov.u32 %r24, %r23; - mov.s32 %r25, 0; - mov.u32 %r26, %r25; - mov.s32 %r27, 0; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}]; - mov.f32 %f21, %f17; - mov.f32 %f22, %f18; - mov.f32 %f23, %f19; - mov.f32 %f24, %f20; - setp.ge.u64 %p3, %rd20, %rd19; - @%p3 bra $Lt_0_27906; - cvt.rzi.ftz.s32.f32 %r31, %f24; - cvt.s64.s32 %rd27, %r16; - ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types]; - mul.lo.s32 %r33, %r32, %r31; - ld.param.u64 %rd28, [__cudaparm_kernel_pair_mor1]; - mov.f32 %f25, 0f00000000; // 0 - mov.f32 %f26, 0f00000000; // 0 - mov.f32 %f27, 0f00000000; // 0 - mov.f32 %f28, 0f00000000; // 0 - mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92; -$Lt_0_20226: - // Loop body line 54, nesting depth: 1, estimated iterations: unknown - .loc 16 60 0 - ld.global.s32 %r34, [%rd20+0]; - .loc 16 61 0 - shr.s32 %r35, %r34, 30; - and.b32 %r36, %r35, 3; - cvt.s64.s32 %rd30, %r36; - mul.wide.s32 %rd31, %r36, 4; - add.u64 %rd32, %rd29, %rd31; - ld.shared.f32 %f29, [%rd32+0]; - .loc 16 64 0 - and.b32 %r37, %r34, 1073741823; - mov.u32 %r38, %r37; - mov.s32 %r39, 0; - mov.u32 %r40, %r39; - mov.s32 %r41, 0; - mov.u32 %r42, %r41; - mov.s32 %r43, 0; - mov.u32 %r44, %r43; - tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}]; - mov.f32 %f34, %f30; - mov.f32 %f35, %f31; - mov.f32 %f36, %f32; - mov.f32 %f37, %f33; - cvt.rzi.ftz.s32.f32 %r45, %f37; - sub.ftz.f32 %f38, %f22, %f35; - sub.ftz.f32 %f39, %f21, %f34; - sub.ftz.f32 %f40, %f23, %f36; - mul.ftz.f32 %f41, %f38, %f38; - fma.rn.ftz.f32 %f42, %f39, %f39, %f41; - add.s32 %r46, %r45, %r33; - cvt.s64.s32 %rd33, %r46; - fma.rn.ftz.f32 %f43, %f40, %f40, %f42; - mul.wide.s32 %rd34, %r46, 16; - add.u64 %rd35, %rd28, %rd34; - ld.global.f32 %f44, [%rd35+0]; - setp.gt.ftz.f32 %p4, %f44, %f43; - @!%p4 bra $Lt_0_21506; - .loc 16 77 0 - sqrt.approx.ftz.f32 %f45, %f43; - ld.global.v4.f32 {_,%f46,%f47,%f48}, [%rd35+0]; - sub.ftz.f32 %f49, %f45, %f47; - mul.ftz.f32 %f50, %f48, %f49; - neg.ftz.f32 %f51, %f50; - .loc 16 79 0 - mov.f32 %f52, 0f3fb8aa3b; // 1.4427 - mul.ftz.f32 %f53, %f51, %f52; - ex2.approx.ftz.f32 %f54, %f53; - mul.ftz.f32 %f55, %f54, %f54; - sub.ftz.f32 %f56, %f55, %f54; - mul.ftz.f32 %f57, %f46, %f56; - .loc 16 81 0 - div.approx.ftz.f32 %f58, %f57, %f45; - mul.ftz.f32 %f59, %f58, %f29; - fma.rn.ftz.f32 %f27, %f39, %f59, %f27; - .loc 16 82 0 - fma.rn.ftz.f32 %f26, %f38, %f59, %f26; - .loc 16 83 0 - fma.rn.ftz.f32 %f25, %f40, %f59, %f25; - ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r48, 0; - setp.le.s32 %p5, %r47, %r48; - @%p5 bra $Lt_0_20994; - .loc 16 87 0 - cvt.ftz.f64.f32 %fd1, %f54; - ld.param.u64 %rd36, [__cudaparm_kernel_pair_mor2]; - mul.lo.u64 %rd37, %rd33, 8; - add.u64 %rd38, %rd36, %rd37; - ld.global.v2.f32 {%f60,%f61}, [%rd38+0]; - cvt.ftz.f64.f32 %fd2, %f61; - cvt.ftz.f64.f32 %fd3, %f60; - mul.ftz.f32 %f62, %f54, %f54; - cvt.ftz.f64.f32 %fd4, %f62; - add.f64 %fd5, %fd1, %fd1; - sub.f64 %fd6, %fd4, %fd5; - mul.f64 %fd7, %fd3, %fd6; - sub.f64 %fd8, %fd7, %fd2; - cvt.rn.ftz.f32.f64 %f63, %fd8; - fma.rn.ftz.f32 %f28, %f29, %f63, %f28; -$Lt_0_20994: - ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r50, 0; - setp.le.s32 %p6, %r49, %r50; - @%p6 bra $Lt_0_21506; - .loc 16 90 0 - mov.f32 %f64, %f6; - mul.ftz.f32 %f65, %f39, %f39; - fma.rn.ftz.f32 %f66, %f59, %f65, %f64; - mov.f32 %f6, %f66; - .loc 16 91 0 - mov.f32 %f67, %f8; - fma.rn.ftz.f32 %f68, %f59, %f41, %f67; - mov.f32 %f8, %f68; - .loc 16 92 0 - mov.f32 %f69, %f10; - mul.ftz.f32 %f70, %f40, %f40; - fma.rn.ftz.f32 %f71, %f59, %f70, %f69; - mov.f32 %f10, %f71; - .loc 16 93 0 - mov.f32 %f72, %f12; - mul.ftz.f32 %f73, %f38, %f39; - fma.rn.ftz.f32 %f74, %f59, %f73, %f72; - mov.f32 %f12, %f74; - .loc 16 94 0 - mov.f32 %f75, %f14; - mul.ftz.f32 %f76, %f39, %f40; - fma.rn.ftz.f32 %f77, %f59, %f76, %f75; - mov.f32 %f14, %f77; - .loc 16 95 0 - mul.ftz.f32 %f78, %f38, %f40; - fma.rn.ftz.f32 %f15, %f59, %f78, %f15; - mov.f32 %f16, %f15; -$Lt_0_21506: -$Lt_0_20482: - .loc 16 58 0 - mul.lo.u64 %rd39, %rd27, 4; - add.u64 %rd20, %rd20, %rd39; - setp.lt.u64 %p7, %rd20, %rd19; - @%p7 bra $Lt_0_20226; - bra.uni $Lt_0_19714; -$Lt_0_27906: - mov.f32 %f25, 0f00000000; // 0 - mov.f32 %f26, 0f00000000; // 0 - mov.f32 %f27, 0f00000000; // 0 - mov.f32 %f28, 0f00000000; // 0 -$Lt_0_19714: - mov.u32 %r51, 1; - setp.le.s32 %p8, %r1, %r51; - @%p8 bra $Lt_0_24322; - .loc 16 100 0 - mov.u64 %rd40, __cuda___cuda_local_var_32601_55_non_const_red_acc108; - cvt.s64.s32 %rd41, %r2; - mul.wide.s32 %rd42, %r2, 4; - add.u64 %rd43, %rd40, %rd42; - mov.f32 %f79, %f27; - st.shared.f32 [%rd43+0], %f79; - mov.f32 %f80, %f26; - st.shared.f32 [%rd43+512], %f80; - mov.f32 %f81, %f25; - st.shared.f32 [%rd43+1024], %f81; - mov.f32 %f82, %f28; - st.shared.f32 [%rd43+1536], %f82; - shr.s32 %r52, %r1, 31; - mov.s32 %r53, 1; - and.b32 %r54, %r52, %r53; - add.s32 %r55, %r54, %r1; - shr.s32 %r56, %r55, 1; - mov.s32 %r57, %r56; - mov.u32 %r58, 0; - setp.ne.u32 %p9, %r56, %r58; - @!%p9 bra $Lt_0_22786; -$Lt_0_23298: - setp.ge.u32 %p10, %r13, %r57; - @%p10 bra $Lt_0_23554; - add.u32 %r59, %r2, %r57; - cvt.u64.u32 %rd44, %r59; - mul.wide.u32 %rd45, %r59, 4; - add.u64 %rd46, %rd40, %rd45; - ld.shared.f32 %f83, [%rd46+0]; - add.ftz.f32 %f79, %f83, %f79; - st.shared.f32 [%rd43+0], %f79; - ld.shared.f32 %f84, [%rd46+512]; - add.ftz.f32 %f80, %f84, %f80; - st.shared.f32 [%rd43+512], %f80; - ld.shared.f32 %f85, [%rd46+1024]; - add.ftz.f32 %f81, %f85, %f81; - st.shared.f32 [%rd43+1024], %f81; - ld.shared.f32 %f86, [%rd46+1536]; - add.ftz.f32 %f82, %f86, %f82; - st.shared.f32 [%rd43+1536], %f82; -$Lt_0_23554: - shr.u32 %r57, %r57, 1; - mov.u32 %r60, 0; - setp.ne.u32 %p11, %r57, %r60; - @%p11 bra $Lt_0_23298; -$Lt_0_22786: - mov.f32 %f27, %f79; - mov.f32 %f26, %f80; - mov.f32 %f25, %f81; - mov.f32 %f28, %f82; - ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r62, 0; - setp.le.s32 %p12, %r61, %r62; - @%p12 bra $Lt_0_24322; - mov.f32 %f79, %f6; - st.shared.f32 [%rd43+0], %f79; - mov.f32 %f80, %f8; - st.shared.f32 [%rd43+512], %f80; - mov.f32 %f81, %f10; - st.shared.f32 [%rd43+1024], %f81; - mov.f32 %f82, %f12; - st.shared.f32 [%rd43+1536], %f82; - mov.f32 %f87, %f14; - st.shared.f32 [%rd43+2048], %f87; - mov.f32 %f88, %f15; - st.shared.f32 [%rd43+2560], %f88; - mov.s32 %r63, %r56; - @!%p9 bra $Lt_0_24834; -$Lt_0_25346: - setp.ge.u32 %p13, %r13, %r63; - @%p13 bra $Lt_0_25602; - add.u32 %r64, %r2, %r63; - cvt.u64.u32 %rd47, %r64; - mul.wide.u32 %rd48, %r64, 4; - add.u64 %rd49, %rd40, %rd48; - ld.shared.f32 %f89, [%rd49+0]; - add.ftz.f32 %f79, %f89, %f79; - st.shared.f32 [%rd43+0], %f79; - ld.shared.f32 %f90, [%rd49+512]; - add.ftz.f32 %f80, %f90, %f80; - st.shared.f32 [%rd43+512], %f80; - ld.shared.f32 %f91, [%rd49+1024]; - add.ftz.f32 %f81, %f91, %f81; - st.shared.f32 [%rd43+1024], %f81; - ld.shared.f32 %f92, [%rd49+1536]; - add.ftz.f32 %f82, %f92, %f82; - st.shared.f32 [%rd43+1536], %f82; - ld.shared.f32 %f93, [%rd49+2048]; - add.ftz.f32 %f87, %f93, %f87; - st.shared.f32 [%rd43+2048], %f87; - ld.shared.f32 %f94, [%rd49+2560]; - add.ftz.f32 %f88, %f94, %f88; - st.shared.f32 [%rd43+2560], %f88; -$Lt_0_25602: - shr.u32 %r63, %r63, 1; - mov.u32 %r65, 0; - setp.ne.u32 %p14, %r63, %r65; - @%p14 bra $Lt_0_25346; -$Lt_0_24834: - mov.f32 %f6, %f79; - mov.f32 %f8, %f80; - mov.f32 %f10, %f81; - mov.f32 %f12, %f82; - mov.f32 %f14, %f87; - mov.f32 %f16, %f88; -$Lt_0_24322: -$Lt_0_22274: - mov.u32 %r66, 0; - setp.ne.s32 %p15, %r13, %r66; - @%p15 bra $Lt_0_26370; - ld.param.u64 %rd50, [__cudaparm_kernel_pair___val_paramengv]; - add.u64 %rd51, %rd50, %rd5; - ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag]; - mov.u32 %r68, 0; - setp.le.s32 %p16, %r67, %r68; - @%p16 bra $Lt_0_26882; - st.global.f32 [%rd51+0], %f28; - cvt.s64.s32 %rd52, %r9; - mul.wide.s32 %rd53, %r9, 4; - add.u64 %rd51, %rd51, %rd53; -$Lt_0_26882: - ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag]; - mov.u32 %r70, 0; - setp.le.s32 %p17, %r69, %r70; - @%p17 bra $Lt_0_27394; - mov.f32 %f95, %f6; - st.global.f32 [%rd51+0], %f95; - cvt.s64.s32 %rd54, %r9; - mul.wide.s32 %rd55, %r9, 4; - add.u64 %rd56, %rd55, %rd51; - mov.f32 %f96, %f8; - st.global.f32 [%rd56+0], %f96; - add.u64 %rd57, %rd55, %rd56; - mov.f32 %f97, %f10; - st.global.f32 [%rd57+0], %f97; - add.u64 %rd58, %rd55, %rd57; - mov.f32 %f98, %f12; - st.global.f32 [%rd58+0], %f98; - add.u64 %rd51, %rd55, %rd58; - mov.f32 %f99, %f14; - st.global.f32 [%rd51+0], %f99; - mov.f32 %f100, %f16; - add.u64 %rd59, %rd55, %rd51; - st.global.f32 [%rd59+0], %f100; -$Lt_0_27394: - ld.param.u64 %rd60, [__cudaparm_kernel_pair_ans]; - mul.lo.u64 %rd61, %rd4, 16; - add.u64 %rd62, %rd60, %rd61; - mov.f32 %f101, %f102; - st.global.v4.f32 [%rd62+0], {%f27,%f26,%f25,%f101}; -$Lt_0_26370: -$Lt_0_18690: - .loc 16 103 0 - exit; -$LDWend_kernel_pair: - } // kernel_pair - - .entry kernel_pair_fast ( - .param .u64 __cudaparm_kernel_pair_fast_x_, - .param .u64 __cudaparm_kernel_pair_fast_mor1_in, - .param .u64 __cudaparm_kernel_pair_fast_mor2_in, - .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in, - .param .u64 __cudaparm_kernel_pair_fast_dev_nbor, - .param .u64 __cudaparm_kernel_pair_fast_dev_packed, - .param .u64 __cudaparm_kernel_pair_fast_ans, - .param .u64 __cudaparm_kernel_pair_fast___val_paramengv, - .param .s32 __cudaparm_kernel_pair_fast_eflag, - .param .s32 __cudaparm_kernel_pair_fast_vflag, - .param .s32 __cudaparm_kernel_pair_fast_inum, - .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch, - .param .s32 __cudaparm_kernel_pair_fast_t_per_atom) - { - .reg .u32 %r<74>; - .reg .u64 %rd<77>; - .reg .f32 %f<110>; - .reg .pred %p<22>; - .shared .align 4 .b8 __cuda___cuda_local_var_32618_33_non_const_sp_lj3268[16]; - .shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_mor13296[1936]; - .shared .align 8 .b8 __cuda___cuda_local_var_32617_34_non_const_mor25232[968]; - .shared .align 4 .b8 __cuda___cuda_local_var_32688_55_non_const_red_acc6200[3072]; - // __cuda_local_var_32628_10_non_const_f = 48 - // __cuda_local_var_32630_9_non_const_virial = 16 - .loc 16 111 0 -$LDWbegin_kernel_pair_fast: - cvt.s32.u32 %r1, %tid.x; - mov.u32 %r2, 3; - setp.gt.s32 %p1, %r1, %r2; - @%p1 bra $Lt_1_20994; - .loc 16 119 0 - mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268; - cvt.s64.s32 %rd2, %r1; - mul.wide.s32 %rd3, %r1, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f32 %f1, [%rd5+0]; - add.u64 %rd6, %rd3, %rd1; - st.shared.f32 [%rd6+0], %f1; -$Lt_1_20994: - mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268; - mov.u32 %r3, 120; - setp.gt.s32 %p2, %r1, %r3; - @%p2 bra $Lt_1_21506; - .loc 16 121 0 - mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_mor13296; - cvt.s64.s32 %rd8, %r1; - mul.wide.s32 %rd9, %r1, 16; - ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_mor1_in]; - add.u64 %rd11, %rd10, %rd9; - add.u64 %rd12, %rd9, %rd7; - ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0]; - st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5}; - ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r5, 0; - setp.le.s32 %p3, %r4, %r5; - @%p3 bra $Lt_1_22018; - .loc 16 123 0 - mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232; - mul.lo.u64 %rd14, %rd8, 8; - ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast_mor2_in]; - add.u64 %rd16, %rd15, %rd14; - add.u64 %rd17, %rd14, %rd13; - ld.global.v2.f32 {%f6,%f7}, [%rd16+0]; - st.shared.v2.f32 [%rd17+0], {%f6,%f7}; -$Lt_1_22018: - mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232; -$Lt_1_21506: - mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232; - mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_mor13296; - .loc 16 131 0 - mov.f32 %f8, 0f00000000; // 0 - mov.f32 %f9, %f8; - mov.f32 %f10, 0f00000000; // 0 - mov.f32 %f11, %f10; - mov.f32 %f12, 0f00000000; // 0 - mov.f32 %f13, %f12; - mov.f32 %f14, 0f00000000; // 0 - mov.f32 %f15, %f14; - mov.f32 %f16, 0f00000000; // 0 - mov.f32 %f17, %f16; - mov.f32 %f18, 0f00000000; // 0 - mov.f32 %f19, %f18; - .loc 16 133 0 - bar.sync 0; - ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom]; - div.s32 %r7, %r1, %r6; - cvt.s32.u32 %r8, %ntid.x; - div.s32 %r9, %r8, %r6; - cvt.s32.u32 %r10, %ctaid.x; - mul.lo.s32 %r11, %r10, %r9; - add.s32 %r12, %r7, %r11; - ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum]; - setp.ge.s32 %p4, %r12, %r13; - @%p4 bra $Lt_1_30210; - .loc 16 138 0 - ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch]; - cvt.s64.s32 %rd18, %r14; - mul.wide.s32 %rd19, %r14, 4; - cvt.s64.s32 %rd20, %r12; - mul.wide.s32 %rd21, %r12, 4; - ld.param.u64 %rd22, [__cudaparm_kernel_pair_fast_dev_nbor]; - add.u64 %rd23, %rd21, %rd22; - add.u64 %rd24, %rd19, %rd23; - ld.global.s32 %r15, [%rd24+0]; - sub.s32 %r16, %r6, 1; - and.b32 %r17, %r16, %r1; - cvt.s64.s32 %rd25, %r17; - mul.wide.s32 %rd26, %r17, 4; - ld.param.u64 %rd27, [__cudaparm_kernel_pair_fast_dev_packed]; - setp.ne.u64 %p5, %rd27, %rd22; - @%p5 bra $Lt_1_23298; - cvt.s32.s64 %r18, %rd18; - mul.lo.s32 %r19, %r18, %r6; - mov.s32 %r20, %r19; - mul.lo.s32 %r21, %r16, %r12; - add.s32 %r22, %r18, %r21; - cvt.s64.s32 %rd28, %r22; - mul.wide.s32 %rd29, %r22, 4; - add.u64 %rd30, %rd24, %rd29; - and.b32 %r23, %r16, %r15; - cvt.s64.s32 %rd31, %r23; - div.s32 %r24, %r15, %r6; - mul.lo.s32 %r25, %r19, %r24; - cvt.s64.s32 %rd32, %r25; - add.u64 %rd33, %rd31, %rd32; - mul.lo.u64 %rd34, %rd33, 4; - add.u64 %rd35, %rd30, %rd34; - add.u64 %rd36, %rd26, %rd30; - bra.uni $Lt_1_23042; -$Lt_1_23298: - add.u64 %rd37, %rd19, %rd24; - ld.global.s32 %r26, [%rd37+0]; - cvt.s64.s32 %rd38, %r26; - mul.wide.s32 %rd39, %r26, 4; - add.u64 %rd40, %rd27, %rd39; - cvt.s64.s32 %rd41, %r15; - mul.wide.s32 %rd42, %r15, 4; - add.u64 %rd35, %rd40, %rd42; - mov.s32 %r20, %r6; - add.u64 %rd36, %rd26, %rd40; -$Lt_1_23042: - .loc 16 141 0 - ld.global.s32 %r27, [%rd23+0]; - mov.u32 %r28, %r27; - mov.s32 %r29, 0; - mov.u32 %r30, %r29; - mov.s32 %r31, 0; - mov.u32 %r32, %r31; - mov.s32 %r33, 0; - mov.u32 %r34, %r33; - tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[pos_tex,{%r28,%r30,%r32,%r34}]; - mov.f32 %f24, %f20; - mov.f32 %f25, %f21; - mov.f32 %f26, %f22; - mov.f32 %f27, %f23; - setp.ge.u64 %p6, %rd36, %rd35; - @%p6 bra $Lt_1_31746; - cvt.rzi.ftz.s32.f32 %r35, %f27; - cvt.s64.s32 %rd43, %r20; - mul.lo.s32 %r36, %r35, 11; - cvt.rn.f32.s32 %f28, %r36; - mov.f32 %f29, 0f00000000; // 0 - mov.f32 %f30, 0f00000000; // 0 - mov.f32 %f31, 0f00000000; // 0 - mov.f32 %f32, 0f00000000; // 0 -$Lt_1_24066: - // Loop body line 141, nesting depth: 1, estimated iterations: unknown - .loc 16 148 0 - ld.global.s32 %r37, [%rd36+0]; - .loc 16 149 0 - shr.s32 %r38, %r37, 30; - and.b32 %r39, %r38, 3; - cvt.s64.s32 %rd44, %r39; - mul.wide.s32 %rd45, %r39, 4; - add.u64 %rd46, %rd1, %rd45; - ld.shared.f32 %f33, [%rd46+0]; - .loc 16 152 0 - and.b32 %r40, %r37, 1073741823; - mov.u32 %r41, %r40; - mov.s32 %r42, 0; - mov.u32 %r43, %r42; - mov.s32 %r44, 0; - mov.u32 %r45, %r44; - mov.s32 %r46, 0; - mov.u32 %r47, %r46; - tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r41,%r43,%r45,%r47}]; - mov.f32 %f38, %f34; - mov.f32 %f39, %f35; - mov.f32 %f40, %f36; - mov.f32 %f41, %f37; - sub.ftz.f32 %f42, %f25, %f39; - sub.ftz.f32 %f43, %f24, %f38; - sub.ftz.f32 %f44, %f26, %f40; - mul.ftz.f32 %f45, %f42, %f42; - fma.rn.ftz.f32 %f46, %f43, %f43, %f45; - fma.rn.ftz.f32 %f47, %f44, %f44, %f46; - add.ftz.f32 %f48, %f28, %f41; - cvt.rzi.ftz.s32.f32 %r48, %f48; - cvt.s64.s32 %rd47, %r48; - mul.wide.s32 %rd48, %r48, 16; - add.u64 %rd49, %rd7, %rd48; - ld.shared.f32 %f49, [%rd49+0]; - setp.gt.ftz.f32 %p7, %f49, %f47; - @!%p7 bra $Lt_1_25346; - .loc 16 163 0 - sqrt.approx.ftz.f32 %f50, %f47; - ld.shared.v4.f32 {_,%f51,%f52,%f53}, [%rd49+0]; - sub.ftz.f32 %f54, %f50, %f52; - .loc 16 164 0 - mul.ftz.f32 %f55, %f53, %f54; - neg.ftz.f32 %f56, %f55; - .loc 16 166 0 - mov.f32 %f57, 0f3fb8aa3b; // 1.4427 - mul.ftz.f32 %f58, %f56, %f57; - ex2.approx.ftz.f32 %f59, %f58; - mul.ftz.f32 %f60, %f59, %f59; - sub.ftz.f32 %f61, %f60, %f59; - mul.ftz.f32 %f62, %f51, %f61; - .loc 16 168 0 - div.approx.ftz.f32 %f63, %f62, %f50; - mul.ftz.f32 %f64, %f63, %f33; - fma.rn.ftz.f32 %f31, %f43, %f64, %f31; - .loc 16 169 0 - fma.rn.ftz.f32 %f30, %f42, %f64, %f30; - .loc 16 170 0 - fma.rn.ftz.f32 %f29, %f44, %f64, %f29; - ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r50, 0; - setp.le.s32 %p8, %r49, %r50; - @%p8 bra $Lt_1_24834; - .loc 16 173 0 - mul.lo.u64 %rd50, %rd47, 8; - add.u64 %rd51, %rd13, %rd50; - ld.shared.v2.f32 {%f65,%f66}, [%rd51+0]; - sub.ftz.f32 %f67, %f61, %f59; - mul.ftz.f32 %f68, %f65, %f67; - sub.ftz.f32 %f69, %f68, %f66; - .loc 16 174 0 - fma.rn.ftz.f32 %f32, %f33, %f69, %f32; -$Lt_1_24834: - ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r52, 0; - setp.le.s32 %p9, %r51, %r52; - @%p9 bra $Lt_1_25346; - .loc 16 177 0 - mov.f32 %f70, %f9; - mul.ftz.f32 %f71, %f43, %f43; - fma.rn.ftz.f32 %f72, %f64, %f71, %f70; - mov.f32 %f9, %f72; - .loc 16 178 0 - mov.f32 %f73, %f11; - fma.rn.ftz.f32 %f74, %f64, %f45, %f73; - mov.f32 %f11, %f74; - .loc 16 179 0 - mov.f32 %f75, %f13; - mul.ftz.f32 %f76, %f44, %f44; - fma.rn.ftz.f32 %f77, %f64, %f76, %f75; - mov.f32 %f13, %f77; - .loc 16 180 0 - mov.f32 %f78, %f15; - mul.ftz.f32 %f79, %f42, %f43; - fma.rn.ftz.f32 %f80, %f64, %f79, %f78; - mov.f32 %f15, %f80; - .loc 16 181 0 - mov.f32 %f81, %f17; - mul.ftz.f32 %f82, %f43, %f44; - fma.rn.ftz.f32 %f83, %f64, %f82, %f81; - mov.f32 %f17, %f83; - .loc 16 182 0 - mul.ftz.f32 %f84, %f42, %f44; - fma.rn.ftz.f32 %f18, %f64, %f84, %f18; - mov.f32 %f19, %f18; -$Lt_1_25346: -$Lt_1_24322: - .loc 16 146 0 - mul.lo.u64 %rd52, %rd43, 4; - add.u64 %rd36, %rd36, %rd52; - setp.lt.u64 %p10, %rd36, %rd35; - @%p10 bra $Lt_1_24066; - bra.uni $Lt_1_23554; -$Lt_1_31746: - mov.f32 %f29, 0f00000000; // 0 - mov.f32 %f30, 0f00000000; // 0 - mov.f32 %f31, 0f00000000; // 0 - mov.f32 %f32, 0f00000000; // 0 -$Lt_1_23554: - mov.u32 %r53, 1; - setp.le.s32 %p11, %r6, %r53; - @%p11 bra $Lt_1_28162; - .loc 16 187 0 - mov.u64 %rd53, __cuda___cuda_local_var_32688_55_non_const_red_acc6200; - cvt.s64.s32 %rd54, %r1; - mul.wide.s32 %rd55, %r1, 4; - add.u64 %rd56, %rd53, %rd55; - mov.f32 %f85, %f31; - st.shared.f32 [%rd56+0], %f85; - mov.f32 %f86, %f30; - st.shared.f32 [%rd56+512], %f86; - mov.f32 %f87, %f29; - st.shared.f32 [%rd56+1024], %f87; - mov.f32 %f88, %f32; - st.shared.f32 [%rd56+1536], %f88; - shr.s32 %r54, %r6, 31; - mov.s32 %r55, 1; - and.b32 %r56, %r54, %r55; - add.s32 %r57, %r56, %r6; - shr.s32 %r58, %r57, 1; - mov.s32 %r59, %r58; - mov.u32 %r60, 0; - setp.ne.u32 %p12, %r58, %r60; - @!%p12 bra $Lt_1_26626; -$Lt_1_27138: - setp.ge.u32 %p13, %r17, %r59; - @%p13 bra $Lt_1_27394; - add.u32 %r61, %r1, %r59; - cvt.u64.u32 %rd57, %r61; - mul.wide.u32 %rd58, %r61, 4; - add.u64 %rd59, %rd53, %rd58; - ld.shared.f32 %f89, [%rd59+0]; - add.ftz.f32 %f85, %f89, %f85; - st.shared.f32 [%rd56+0], %f85; - ld.shared.f32 %f90, [%rd59+512]; - add.ftz.f32 %f86, %f90, %f86; - st.shared.f32 [%rd56+512], %f86; - ld.shared.f32 %f91, [%rd59+1024]; - add.ftz.f32 %f87, %f91, %f87; - st.shared.f32 [%rd56+1024], %f87; - ld.shared.f32 %f92, [%rd59+1536]; - add.ftz.f32 %f88, %f92, %f88; - st.shared.f32 [%rd56+1536], %f88; -$Lt_1_27394: - shr.u32 %r59, %r59, 1; - mov.u32 %r62, 0; - setp.ne.u32 %p14, %r59, %r62; - @%p14 bra $Lt_1_27138; -$Lt_1_26626: - mov.f32 %f31, %f85; - mov.f32 %f30, %f86; - mov.f32 %f29, %f87; - mov.f32 %f32, %f88; - ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r64, 0; - setp.le.s32 %p15, %r63, %r64; - @%p15 bra $Lt_1_28162; - mov.f32 %f85, %f9; - st.shared.f32 [%rd56+0], %f85; - mov.f32 %f86, %f11; - st.shared.f32 [%rd56+512], %f86; - mov.f32 %f87, %f13; - st.shared.f32 [%rd56+1024], %f87; - mov.f32 %f88, %f15; - st.shared.f32 [%rd56+1536], %f88; - mov.f32 %f93, %f17; - st.shared.f32 [%rd56+2048], %f93; - mov.f32 %f94, %f18; - st.shared.f32 [%rd56+2560], %f94; - mov.s32 %r65, %r58; - @!%p12 bra $Lt_1_28674; -$Lt_1_29186: - setp.ge.u32 %p16, %r17, %r65; - @%p16 bra $Lt_1_29442; - add.u32 %r66, %r1, %r65; - cvt.u64.u32 %rd60, %r66; - mul.wide.u32 %rd61, %r66, 4; - add.u64 %rd62, %rd53, %rd61; - ld.shared.f32 %f95, [%rd62+0]; - add.ftz.f32 %f85, %f95, %f85; - st.shared.f32 [%rd56+0], %f85; - ld.shared.f32 %f96, [%rd62+512]; - add.ftz.f32 %f86, %f96, %f86; - st.shared.f32 [%rd56+512], %f86; - ld.shared.f32 %f97, [%rd62+1024]; - add.ftz.f32 %f87, %f97, %f87; - st.shared.f32 [%rd56+1024], %f87; - ld.shared.f32 %f98, [%rd62+1536]; - add.ftz.f32 %f88, %f98, %f88; - st.shared.f32 [%rd56+1536], %f88; - ld.shared.f32 %f99, [%rd62+2048]; - add.ftz.f32 %f93, %f99, %f93; - st.shared.f32 [%rd56+2048], %f93; - ld.shared.f32 %f100, [%rd62+2560]; - add.ftz.f32 %f94, %f100, %f94; - st.shared.f32 [%rd56+2560], %f94; -$Lt_1_29442: - shr.u32 %r65, %r65, 1; - mov.u32 %r67, 0; - setp.ne.u32 %p17, %r65, %r67; - @%p17 bra $Lt_1_29186; -$Lt_1_28674: - mov.f32 %f9, %f85; - mov.f32 %f11, %f86; - mov.f32 %f13, %f87; - mov.f32 %f15, %f88; - mov.f32 %f17, %f93; - mov.f32 %f19, %f94; -$Lt_1_28162: -$Lt_1_26114: - mov.u32 %r68, 0; - setp.ne.s32 %p18, %r17, %r68; - @%p18 bra $Lt_1_30210; - ld.param.u64 %rd63, [__cudaparm_kernel_pair_fast___val_paramengv]; - add.u64 %rd64, %rd63, %rd21; - ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag]; - mov.u32 %r70, 0; - setp.le.s32 %p19, %r69, %r70; - @%p19 bra $Lt_1_30722; - st.global.f32 [%rd64+0], %f32; - cvt.s64.s32 %rd65, %r13; - mul.wide.s32 %rd66, %r13, 4; - add.u64 %rd64, %rd64, %rd66; -$Lt_1_30722: - ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag]; - mov.u32 %r72, 0; - setp.le.s32 %p20, %r71, %r72; - @%p20 bra $Lt_1_31234; - mov.f32 %f101, %f9; - st.global.f32 [%rd64+0], %f101; - cvt.s64.s32 %rd67, %r13; - mul.wide.s32 %rd68, %r13, 4; - add.u64 %rd69, %rd68, %rd64; - mov.f32 %f102, %f11; - st.global.f32 [%rd69+0], %f102; - add.u64 %rd70, %rd68, %rd69; - mov.f32 %f103, %f13; - st.global.f32 [%rd70+0], %f103; - add.u64 %rd71, %rd68, %rd70; - mov.f32 %f104, %f15; - st.global.f32 [%rd71+0], %f104; - add.u64 %rd64, %rd68, %rd71; - mov.f32 %f105, %f17; - st.global.f32 [%rd64+0], %f105; - mov.f32 %f106, %f19; - add.u64 %rd72, %rd68, %rd64; - st.global.f32 [%rd72+0], %f106; -$Lt_1_31234: - ld.param.u64 %rd73, [__cudaparm_kernel_pair_fast_ans]; - mul.lo.u64 %rd74, %rd20, 16; - add.u64 %rd75, %rd73, %rd74; - mov.f32 %f107, %f108; - st.global.v4.f32 [%rd75+0], {%f31,%f30,%f29,%f107}; -$Lt_1_30210: -$Lt_1_22530: - .loc 16 190 0 - exit; -$LDWend_kernel_pair_fast: - } // kernel_pair_fast - diff --git a/lib/gpu/morse_ptx.h b/lib/gpu/morse_ptx.h deleted file mode 100644 index d860de3076..0000000000 --- a/lib/gpu/morse_ptx.h +++ /dev/null @@ -1,869 +0,0 @@ -const char * morse = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .global .texref pos_tex;\n" -" .entry kernel_pair (\n" -" .param .u64 __cudaparm_kernel_pair_x_,\n" -" .param .u64 __cudaparm_kernel_pair_mor1,\n" -" .param .u64 __cudaparm_kernel_pair_mor2,\n" -" .param .s32 __cudaparm_kernel_pair_lj_types,\n" -" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_ans,\n" -" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_inum,\n" -" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n" -" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n" -" {\n" -" .reg .u32 %r<72>;\n" -" .reg .u64 %rd<64>;\n" -" .reg .f32 %f<104>;\n" -" .reg .f64 %fd<10>;\n" -" .reg .pred %p<19>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32601_55_non_const_red_acc108[3072];\n" -" .loc 16 31 0\n" -"$LDWbegin_kernel_pair:\n" -" .loc 16 36 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" -" ldu.global.f32 %f1, [%rd1+0];\n" -" .loc 16 37 0\n" -" ld.global.f32 %f2, [%rd1+4];\n" -" .loc 16 38 0\n" -" ld.global.f32 %f3, [%rd1+8];\n" -" .loc 16 39 0\n" -" ld.global.f32 %f4, [%rd1+12];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n" -" .loc 16 46 0\n" -" mov.f32 %f5, 0f00000000; \n" -" mov.f32 %f6, %f5;\n" -" mov.f32 %f7, 0f00000000; \n" -" mov.f32 %f8, %f7;\n" -" mov.f32 %f9, 0f00000000; \n" -" mov.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f00000000; \n" -" mov.f32 %f12, %f11;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, %f13;\n" -" mov.f32 %f15, 0f00000000; \n" -" mov.f32 %f16, %f15;\n" -" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n" -" setp.ge.s32 %p1, %r8, %r9;\n" -" @%p1 bra $Lt_0_26370;\n" -" .loc 16 51 0\n" -" ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];\n" -" cvt.s64.s32 %rd2, %r10;\n" -" mul.wide.s32 %rd3, %r10, 4;\n" -" cvt.s64.s32 %rd4, %r8;\n" -" mul.wide.s32 %rd5, %r8, 4;\n" -" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n" -" add.u64 %rd7, %rd5, %rd6;\n" -" add.u64 %rd8, %rd3, %rd7;\n" -" ld.global.s32 %r11, [%rd8+0];\n" -" sub.s32 %r12, %r1, 1;\n" -" and.b32 %r13, %r12, %r2;\n" -" cvt.s64.s32 %rd9, %r13;\n" -" mul.wide.s32 %rd10, %r13, 4;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n" -" setp.ne.u64 %p2, %rd11, %rd6;\n" -" @%p2 bra $Lt_0_19458;\n" -" cvt.s32.s64 %r14, %rd2;\n" -" mul.lo.s32 %r15, %r14, %r1;\n" -" mov.s32 %r16, %r15;\n" -" mul.lo.s32 %r17, %r12, %r8;\n" -" add.s32 %r18, %r14, %r17;\n" -" cvt.s64.s32 %rd12, %r18;\n" -" mul.wide.s32 %rd13, %r18, 4;\n" -" add.u64 %rd14, %rd8, %rd13;\n" -" and.b32 %r19, %r12, %r11;\n" -" cvt.s64.s32 %rd15, %r19;\n" -" div.s32 %r20, %r11, %r1;\n" -" mul.lo.s32 %r21, %r15, %r20;\n" -" cvt.s64.s32 %rd16, %r21;\n" -" add.u64 %rd17, %rd15, %rd16;\n" -" mul.lo.u64 %rd18, %rd17, 4;\n" -" add.u64 %rd19, %rd14, %rd18;\n" -" add.u64 %rd20, %rd10, %rd14;\n" -" bra.uni $Lt_0_19202;\n" -"$Lt_0_19458:\n" -" add.u64 %rd21, %rd3, %rd8;\n" -" ld.global.s32 %r22, [%rd21+0];\n" -" cvt.s64.s32 %rd22, %r22;\n" -" mul.wide.s32 %rd23, %r22, 4;\n" -" add.u64 %rd24, %rd11, %rd23;\n" -" cvt.s64.s32 %rd25, %r11;\n" -" mul.wide.s32 %rd26, %r11, 4;\n" -" add.u64 %rd19, %rd24, %rd26;\n" -" mov.s32 %r16, %r1;\n" -" add.u64 %rd20, %rd10, %rd24;\n" -"$Lt_0_19202:\n" -" .loc 16 54 0\n" -" ld.global.s32 %r23, [%rd7+0];\n" -" mov.u32 %r24, %r23;\n" -" mov.s32 %r25, 0;\n" -" mov.u32 %r26, %r25;\n" -" mov.s32 %r27, 0;\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n" -" mov.f32 %f21, %f17;\n" -" mov.f32 %f22, %f18;\n" -" mov.f32 %f23, %f19;\n" -" mov.f32 %f24, %f20;\n" -" setp.ge.u64 %p3, %rd20, %rd19;\n" -" @%p3 bra $Lt_0_27906;\n" -" cvt.rzi.ftz.s32.f32 %r31, %f24;\n" -" cvt.s64.s32 %rd27, %r16;\n" -" ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];\n" -" mul.lo.s32 %r33, %r32, %r31;\n" -" ld.param.u64 %rd28, [__cudaparm_kernel_pair_mor1];\n" -" mov.f32 %f25, 0f00000000; \n" -" mov.f32 %f26, 0f00000000; \n" -" mov.f32 %f27, 0f00000000; \n" -" mov.f32 %f28, 0f00000000; \n" -" mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;\n" -"$Lt_0_20226:\n" -" .loc 16 60 0\n" -" ld.global.s32 %r34, [%rd20+0];\n" -" .loc 16 61 0\n" -" shr.s32 %r35, %r34, 30;\n" -" and.b32 %r36, %r35, 3;\n" -" cvt.s64.s32 %rd30, %r36;\n" -" mul.wide.s32 %rd31, %r36, 4;\n" -" add.u64 %rd32, %rd29, %rd31;\n" -" ld.shared.f32 %f29, [%rd32+0];\n" -" .loc 16 64 0\n" -" and.b32 %r37, %r34, 1073741823;\n" -" mov.u32 %r38, %r37;\n" -" mov.s32 %r39, 0;\n" -" mov.u32 %r40, %r39;\n" -" mov.s32 %r41, 0;\n" -" mov.u32 %r42, %r41;\n" -" mov.s32 %r43, 0;\n" -" mov.u32 %r44, %r43;\n" -" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];\n" -" mov.f32 %f34, %f30;\n" -" mov.f32 %f35, %f31;\n" -" mov.f32 %f36, %f32;\n" -" mov.f32 %f37, %f33;\n" -" cvt.rzi.ftz.s32.f32 %r45, %f37;\n" -" sub.ftz.f32 %f38, %f22, %f35;\n" -" sub.ftz.f32 %f39, %f21, %f34;\n" -" sub.ftz.f32 %f40, %f23, %f36;\n" -" mul.ftz.f32 %f41, %f38, %f38;\n" -" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n" -" add.s32 %r46, %r45, %r33;\n" -" cvt.s64.s32 %rd33, %r46;\n" -" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n" -" mul.wide.s32 %rd34, %r46, 16;\n" -" add.u64 %rd35, %rd28, %rd34;\n" -" ld.global.f32 %f44, [%rd35+0];\n" -" setp.gt.ftz.f32 %p4, %f44, %f43;\n" -" @!%p4 bra $Lt_0_21506;\n" -" .loc 16 77 0\n" -" sqrt.approx.ftz.f32 %f45, %f43;\n" -" ld.global.v4.f32 {_,%f46,%f47,%f48}, [%rd35+0];\n" -" sub.ftz.f32 %f49, %f45, %f47;\n" -" mul.ftz.f32 %f50, %f48, %f49;\n" -" neg.ftz.f32 %f51, %f50;\n" -" .loc 16 79 0\n" -" mov.f32 %f52, 0f3fb8aa3b; \n" -" mul.ftz.f32 %f53, %f51, %f52;\n" -" ex2.approx.ftz.f32 %f54, %f53;\n" -" mul.ftz.f32 %f55, %f54, %f54;\n" -" sub.ftz.f32 %f56, %f55, %f54;\n" -" mul.ftz.f32 %f57, %f46, %f56;\n" -" .loc 16 81 0\n" -" div.approx.ftz.f32 %f58, %f57, %f45;\n" -" mul.ftz.f32 %f59, %f58, %f29;\n" -" fma.rn.ftz.f32 %f27, %f39, %f59, %f27;\n" -" .loc 16 82 0\n" -" fma.rn.ftz.f32 %f26, %f38, %f59, %f26;\n" -" .loc 16 83 0\n" -" fma.rn.ftz.f32 %f25, %f40, %f59, %f25;\n" -" ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r48, 0;\n" -" setp.le.s32 %p5, %r47, %r48;\n" -" @%p5 bra $Lt_0_20994;\n" -" .loc 16 87 0\n" -" cvt.ftz.f64.f32 %fd1, %f54;\n" -" ld.param.u64 %rd36, [__cudaparm_kernel_pair_mor2];\n" -" mul.lo.u64 %rd37, %rd33, 8;\n" -" add.u64 %rd38, %rd36, %rd37;\n" -" ld.global.v2.f32 {%f60,%f61}, [%rd38+0];\n" -" cvt.ftz.f64.f32 %fd2, %f61;\n" -" cvt.ftz.f64.f32 %fd3, %f60;\n" -" mul.ftz.f32 %f62, %f54, %f54;\n" -" cvt.ftz.f64.f32 %fd4, %f62;\n" -" add.f64 %fd5, %fd1, %fd1;\n" -" sub.f64 %fd6, %fd4, %fd5;\n" -" mul.f64 %fd7, %fd3, %fd6;\n" -" sub.f64 %fd8, %fd7, %fd2;\n" -" cvt.rn.ftz.f32.f64 %f63, %fd8;\n" -" fma.rn.ftz.f32 %f28, %f29, %f63, %f28;\n" -"$Lt_0_20994:\n" -" ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r50, 0;\n" -" setp.le.s32 %p6, %r49, %r50;\n" -" @%p6 bra $Lt_0_21506;\n" -" .loc 16 90 0\n" -" mov.f32 %f64, %f6;\n" -" mul.ftz.f32 %f65, %f39, %f39;\n" -" fma.rn.ftz.f32 %f66, %f59, %f65, %f64;\n" -" mov.f32 %f6, %f66;\n" -" .loc 16 91 0\n" -" mov.f32 %f67, %f8;\n" -" fma.rn.ftz.f32 %f68, %f59, %f41, %f67;\n" -" mov.f32 %f8, %f68;\n" -" .loc 16 92 0\n" -" mov.f32 %f69, %f10;\n" -" mul.ftz.f32 %f70, %f40, %f40;\n" -" fma.rn.ftz.f32 %f71, %f59, %f70, %f69;\n" -" mov.f32 %f10, %f71;\n" -" .loc 16 93 0\n" -" mov.f32 %f72, %f12;\n" -" mul.ftz.f32 %f73, %f38, %f39;\n" -" fma.rn.ftz.f32 %f74, %f59, %f73, %f72;\n" -" mov.f32 %f12, %f74;\n" -" .loc 16 94 0\n" -" mov.f32 %f75, %f14;\n" -" mul.ftz.f32 %f76, %f39, %f40;\n" -" fma.rn.ftz.f32 %f77, %f59, %f76, %f75;\n" -" mov.f32 %f14, %f77;\n" -" .loc 16 95 0\n" -" mul.ftz.f32 %f78, %f38, %f40;\n" -" fma.rn.ftz.f32 %f15, %f59, %f78, %f15;\n" -" mov.f32 %f16, %f15;\n" -"$Lt_0_21506:\n" -"$Lt_0_20482:\n" -" .loc 16 58 0\n" -" mul.lo.u64 %rd39, %rd27, 4;\n" -" add.u64 %rd20, %rd20, %rd39;\n" -" setp.lt.u64 %p7, %rd20, %rd19;\n" -" @%p7 bra $Lt_0_20226;\n" -" bra.uni $Lt_0_19714;\n" -"$Lt_0_27906:\n" -" mov.f32 %f25, 0f00000000; \n" -" mov.f32 %f26, 0f00000000; \n" -" mov.f32 %f27, 0f00000000; \n" -" mov.f32 %f28, 0f00000000; \n" -"$Lt_0_19714:\n" -" mov.u32 %r51, 1;\n" -" setp.le.s32 %p8, %r1, %r51;\n" -" @%p8 bra $Lt_0_24322;\n" -" .loc 16 100 0\n" -" mov.u64 %rd40, __cuda___cuda_local_var_32601_55_non_const_red_acc108;\n" -" cvt.s64.s32 %rd41, %r2;\n" -" mul.wide.s32 %rd42, %r2, 4;\n" -" add.u64 %rd43, %rd40, %rd42;\n" -" mov.f32 %f79, %f27;\n" -" st.shared.f32 [%rd43+0], %f79;\n" -" mov.f32 %f80, %f26;\n" -" st.shared.f32 [%rd43+512], %f80;\n" -" mov.f32 %f81, %f25;\n" -" st.shared.f32 [%rd43+1024], %f81;\n" -" mov.f32 %f82, %f28;\n" -" st.shared.f32 [%rd43+1536], %f82;\n" -" shr.s32 %r52, %r1, 31;\n" -" mov.s32 %r53, 1;\n" -" and.b32 %r54, %r52, %r53;\n" -" add.s32 %r55, %r54, %r1;\n" -" shr.s32 %r56, %r55, 1;\n" -" mov.s32 %r57, %r56;\n" -" mov.u32 %r58, 0;\n" -" setp.ne.u32 %p9, %r56, %r58;\n" -" @!%p9 bra $Lt_0_22786;\n" -"$Lt_0_23298:\n" -" setp.ge.u32 %p10, %r13, %r57;\n" -" @%p10 bra $Lt_0_23554;\n" -" add.u32 %r59, %r2, %r57;\n" -" cvt.u64.u32 %rd44, %r59;\n" -" mul.wide.u32 %rd45, %r59, 4;\n" -" add.u64 %rd46, %rd40, %rd45;\n" -" ld.shared.f32 %f83, [%rd46+0];\n" -" add.ftz.f32 %f79, %f83, %f79;\n" -" st.shared.f32 [%rd43+0], %f79;\n" -" ld.shared.f32 %f84, [%rd46+512];\n" -" add.ftz.f32 %f80, %f84, %f80;\n" -" st.shared.f32 [%rd43+512], %f80;\n" -" ld.shared.f32 %f85, [%rd46+1024];\n" -" add.ftz.f32 %f81, %f85, %f81;\n" -" st.shared.f32 [%rd43+1024], %f81;\n" -" ld.shared.f32 %f86, [%rd46+1536];\n" -" add.ftz.f32 %f82, %f86, %f82;\n" -" st.shared.f32 [%rd43+1536], %f82;\n" -"$Lt_0_23554:\n" -" shr.u32 %r57, %r57, 1;\n" -" mov.u32 %r60, 0;\n" -" setp.ne.u32 %p11, %r57, %r60;\n" -" @%p11 bra $Lt_0_23298;\n" -"$Lt_0_22786:\n" -" mov.f32 %f27, %f79;\n" -" mov.f32 %f26, %f80;\n" -" mov.f32 %f25, %f81;\n" -" mov.f32 %f28, %f82;\n" -" ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r62, 0;\n" -" setp.le.s32 %p12, %r61, %r62;\n" -" @%p12 bra $Lt_0_24322;\n" -" mov.f32 %f79, %f6;\n" -" st.shared.f32 [%rd43+0], %f79;\n" -" mov.f32 %f80, %f8;\n" -" st.shared.f32 [%rd43+512], %f80;\n" -" mov.f32 %f81, %f10;\n" -" st.shared.f32 [%rd43+1024], %f81;\n" -" mov.f32 %f82, %f12;\n" -" st.shared.f32 [%rd43+1536], %f82;\n" -" mov.f32 %f87, %f14;\n" -" st.shared.f32 [%rd43+2048], %f87;\n" -" mov.f32 %f88, %f15;\n" -" st.shared.f32 [%rd43+2560], %f88;\n" -" mov.s32 %r63, %r56;\n" -" @!%p9 bra $Lt_0_24834;\n" -"$Lt_0_25346:\n" -" setp.ge.u32 %p13, %r13, %r63;\n" -" @%p13 bra $Lt_0_25602;\n" -" add.u32 %r64, %r2, %r63;\n" -" cvt.u64.u32 %rd47, %r64;\n" -" mul.wide.u32 %rd48, %r64, 4;\n" -" add.u64 %rd49, %rd40, %rd48;\n" -" ld.shared.f32 %f89, [%rd49+0];\n" -" add.ftz.f32 %f79, %f89, %f79;\n" -" st.shared.f32 [%rd43+0], %f79;\n" -" ld.shared.f32 %f90, [%rd49+512];\n" -" add.ftz.f32 %f80, %f90, %f80;\n" -" st.shared.f32 [%rd43+512], %f80;\n" -" ld.shared.f32 %f91, [%rd49+1024];\n" -" add.ftz.f32 %f81, %f91, %f81;\n" -" st.shared.f32 [%rd43+1024], %f81;\n" -" ld.shared.f32 %f92, [%rd49+1536];\n" -" add.ftz.f32 %f82, %f92, %f82;\n" -" st.shared.f32 [%rd43+1536], %f82;\n" -" ld.shared.f32 %f93, [%rd49+2048];\n" -" add.ftz.f32 %f87, %f93, %f87;\n" -" st.shared.f32 [%rd43+2048], %f87;\n" -" ld.shared.f32 %f94, [%rd49+2560];\n" -" add.ftz.f32 %f88, %f94, %f88;\n" -" st.shared.f32 [%rd43+2560], %f88;\n" -"$Lt_0_25602:\n" -" shr.u32 %r63, %r63, 1;\n" -" mov.u32 %r65, 0;\n" -" setp.ne.u32 %p14, %r63, %r65;\n" -" @%p14 bra $Lt_0_25346;\n" -"$Lt_0_24834:\n" -" mov.f32 %f6, %f79;\n" -" mov.f32 %f8, %f80;\n" -" mov.f32 %f10, %f81;\n" -" mov.f32 %f12, %f82;\n" -" mov.f32 %f14, %f87;\n" -" mov.f32 %f16, %f88;\n" -"$Lt_0_24322:\n" -"$Lt_0_22274:\n" -" mov.u32 %r66, 0;\n" -" setp.ne.s32 %p15, %r13, %r66;\n" -" @%p15 bra $Lt_0_26370;\n" -" ld.param.u64 %rd50, [__cudaparm_kernel_pair___val_paramengv];\n" -" add.u64 %rd51, %rd50, %rd5;\n" -" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n" -" mov.u32 %r68, 0;\n" -" setp.le.s32 %p16, %r67, %r68;\n" -" @%p16 bra $Lt_0_26882;\n" -" st.global.f32 [%rd51+0], %f28;\n" -" cvt.s64.s32 %rd52, %r9;\n" -" mul.wide.s32 %rd53, %r9, 4;\n" -" add.u64 %rd51, %rd51, %rd53;\n" -"$Lt_0_26882:\n" -" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n" -" mov.u32 %r70, 0;\n" -" setp.le.s32 %p17, %r69, %r70;\n" -" @%p17 bra $Lt_0_27394;\n" -" mov.f32 %f95, %f6;\n" -" st.global.f32 [%rd51+0], %f95;\n" -" cvt.s64.s32 %rd54, %r9;\n" -" mul.wide.s32 %rd55, %r9, 4;\n" -" add.u64 %rd56, %rd55, %rd51;\n" -" mov.f32 %f96, %f8;\n" -" st.global.f32 [%rd56+0], %f96;\n" -" add.u64 %rd57, %rd55, %rd56;\n" -" mov.f32 %f97, %f10;\n" -" st.global.f32 [%rd57+0], %f97;\n" -" add.u64 %rd58, %rd55, %rd57;\n" -" mov.f32 %f98, %f12;\n" -" st.global.f32 [%rd58+0], %f98;\n" -" add.u64 %rd51, %rd55, %rd58;\n" -" mov.f32 %f99, %f14;\n" -" st.global.f32 [%rd51+0], %f99;\n" -" mov.f32 %f100, %f16;\n" -" add.u64 %rd59, %rd55, %rd51;\n" -" st.global.f32 [%rd59+0], %f100;\n" -"$Lt_0_27394:\n" -" ld.param.u64 %rd60, [__cudaparm_kernel_pair_ans];\n" -" mul.lo.u64 %rd61, %rd4, 16;\n" -" add.u64 %rd62, %rd60, %rd61;\n" -" mov.f32 %f101, %f102;\n" -" st.global.v4.f32 [%rd62+0], {%f27,%f26,%f25,%f101};\n" -"$Lt_0_26370:\n" -"$Lt_0_18690:\n" -" .loc 16 103 0\n" -" exit;\n" -"$LDWend_kernel_pair:\n" -" }\n" -" .entry kernel_pair_fast (\n" -" .param .u64 __cudaparm_kernel_pair_fast_x_,\n" -" .param .u64 __cudaparm_kernel_pair_fast_mor1_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_mor2_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n" -" .param .u64 __cudaparm_kernel_pair_fast_ans,\n" -" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n" -" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n" -" .param .s32 __cudaparm_kernel_pair_fast_inum,\n" -" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n" -" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n" -" {\n" -" .reg .u32 %r<74>;\n" -" .reg .u64 %rd<77>;\n" -" .reg .f32 %f<110>;\n" -" .reg .pred %p<22>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32618_33_non_const_sp_lj3268[16];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_mor13296[1936];\n" -" .shared .align 8 .b8 __cuda___cuda_local_var_32617_34_non_const_mor25232[968];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32688_55_non_const_red_acc6200[3072];\n" -" .loc 16 111 0\n" -"$LDWbegin_kernel_pair_fast:\n" -" cvt.s32.u32 %r1, %tid.x;\n" -" mov.u32 %r2, 3;\n" -" setp.gt.s32 %p1, %r1, %r2;\n" -" @%p1 bra $Lt_1_20994;\n" -" .loc 16 119 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;\n" -" cvt.s64.s32 %rd2, %r1;\n" -" mul.wide.s32 %rd3, %r1, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f32 %f1, [%rd5+0];\n" -" add.u64 %rd6, %rd3, %rd1;\n" -" st.shared.f32 [%rd6+0], %f1;\n" -"$Lt_1_20994:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;\n" -" mov.u32 %r3, 120;\n" -" setp.gt.s32 %p2, %r1, %r3;\n" -" @%p2 bra $Lt_1_21506;\n" -" .loc 16 121 0\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_mor13296;\n" -" cvt.s64.s32 %rd8, %r1;\n" -" mul.wide.s32 %rd9, %r1, 16;\n" -" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_mor1_in];\n" -" add.u64 %rd11, %rd10, %rd9;\n" -" add.u64 %rd12, %rd9, %rd7;\n" -" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n" -" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n" -" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r5, 0;\n" -" setp.le.s32 %p3, %r4, %r5;\n" -" @%p3 bra $Lt_1_22018;\n" -" .loc 16 123 0\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232;\n" -" mul.lo.u64 %rd14, %rd8, 8;\n" -" ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast_mor2_in];\n" -" add.u64 %rd16, %rd15, %rd14;\n" -" add.u64 %rd17, %rd14, %rd13;\n" -" ld.global.v2.f32 {%f6,%f7}, [%rd16+0];\n" -" st.shared.v2.f32 [%rd17+0], {%f6,%f7};\n" -"$Lt_1_22018:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232;\n" -"$Lt_1_21506:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232;\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_mor13296;\n" -" .loc 16 131 0\n" -" mov.f32 %f8, 0f00000000; \n" -" mov.f32 %f9, %f8;\n" -" mov.f32 %f10, 0f00000000; \n" -" mov.f32 %f11, %f10;\n" -" mov.f32 %f12, 0f00000000; \n" -" mov.f32 %f13, %f12;\n" -" mov.f32 %f14, 0f00000000; \n" -" mov.f32 %f15, %f14;\n" -" mov.f32 %f16, 0f00000000; \n" -" mov.f32 %f17, %f16;\n" -" mov.f32 %f18, 0f00000000; \n" -" mov.f32 %f19, %f18;\n" -" .loc 16 133 0\n" -" bar.sync 0;\n" -" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n" -" div.s32 %r7, %r1, %r6;\n" -" cvt.s32.u32 %r8, %ntid.x;\n" -" div.s32 %r9, %r8, %r6;\n" -" cvt.s32.u32 %r10, %ctaid.x;\n" -" mul.lo.s32 %r11, %r10, %r9;\n" -" add.s32 %r12, %r7, %r11;\n" -" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n" -" setp.ge.s32 %p4, %r12, %r13;\n" -" @%p4 bra $Lt_1_30210;\n" -" .loc 16 138 0\n" -" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n" -" cvt.s64.s32 %rd18, %r14;\n" -" mul.wide.s32 %rd19, %r14, 4;\n" -" cvt.s64.s32 %rd20, %r12;\n" -" mul.wide.s32 %rd21, %r12, 4;\n" -" ld.param.u64 %rd22, [__cudaparm_kernel_pair_fast_dev_nbor];\n" -" add.u64 %rd23, %rd21, %rd22;\n" -" add.u64 %rd24, %rd19, %rd23;\n" -" ld.global.s32 %r15, [%rd24+0];\n" -" sub.s32 %r16, %r6, 1;\n" -" and.b32 %r17, %r16, %r1;\n" -" cvt.s64.s32 %rd25, %r17;\n" -" mul.wide.s32 %rd26, %r17, 4;\n" -" ld.param.u64 %rd27, [__cudaparm_kernel_pair_fast_dev_packed];\n" -" setp.ne.u64 %p5, %rd27, %rd22;\n" -" @%p5 bra $Lt_1_23298;\n" -" cvt.s32.s64 %r18, %rd18;\n" -" mul.lo.s32 %r19, %r18, %r6;\n" -" mov.s32 %r20, %r19;\n" -" mul.lo.s32 %r21, %r16, %r12;\n" -" add.s32 %r22, %r18, %r21;\n" -" cvt.s64.s32 %rd28, %r22;\n" -" mul.wide.s32 %rd29, %r22, 4;\n" -" add.u64 %rd30, %rd24, %rd29;\n" -" and.b32 %r23, %r16, %r15;\n" -" cvt.s64.s32 %rd31, %r23;\n" -" div.s32 %r24, %r15, %r6;\n" -" mul.lo.s32 %r25, %r19, %r24;\n" -" cvt.s64.s32 %rd32, %r25;\n" -" add.u64 %rd33, %rd31, %rd32;\n" -" mul.lo.u64 %rd34, %rd33, 4;\n" -" add.u64 %rd35, %rd30, %rd34;\n" -" add.u64 %rd36, %rd26, %rd30;\n" -" bra.uni $Lt_1_23042;\n" -"$Lt_1_23298:\n" -" add.u64 %rd37, %rd19, %rd24;\n" -" ld.global.s32 %r26, [%rd37+0];\n" -" cvt.s64.s32 %rd38, %r26;\n" -" mul.wide.s32 %rd39, %r26, 4;\n" -" add.u64 %rd40, %rd27, %rd39;\n" -" cvt.s64.s32 %rd41, %r15;\n" -" mul.wide.s32 %rd42, %r15, 4;\n" -" add.u64 %rd35, %rd40, %rd42;\n" -" mov.s32 %r20, %r6;\n" -" add.u64 %rd36, %rd26, %rd40;\n" -"$Lt_1_23042:\n" -" .loc 16 141 0\n" -" ld.global.s32 %r27, [%rd23+0];\n" -" mov.u32 %r28, %r27;\n" -" mov.s32 %r29, 0;\n" -" mov.u32 %r30, %r29;\n" -" mov.s32 %r31, 0;\n" -" mov.u32 %r32, %r31;\n" -" mov.s32 %r33, 0;\n" -" mov.u32 %r34, %r33;\n" -" tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[pos_tex,{%r28,%r30,%r32,%r34}];\n" -" mov.f32 %f24, %f20;\n" -" mov.f32 %f25, %f21;\n" -" mov.f32 %f26, %f22;\n" -" mov.f32 %f27, %f23;\n" -" setp.ge.u64 %p6, %rd36, %rd35;\n" -" @%p6 bra $Lt_1_31746;\n" -" cvt.rzi.ftz.s32.f32 %r35, %f27;\n" -" cvt.s64.s32 %rd43, %r20;\n" -" mul.lo.s32 %r36, %r35, 11;\n" -" cvt.rn.f32.s32 %f28, %r36;\n" -" mov.f32 %f29, 0f00000000; \n" -" mov.f32 %f30, 0f00000000; \n" -" mov.f32 %f31, 0f00000000; \n" -" mov.f32 %f32, 0f00000000; \n" -"$Lt_1_24066:\n" -" .loc 16 148 0\n" -" ld.global.s32 %r37, [%rd36+0];\n" -" .loc 16 149 0\n" -" shr.s32 %r38, %r37, 30;\n" -" and.b32 %r39, %r38, 3;\n" -" cvt.s64.s32 %rd44, %r39;\n" -" mul.wide.s32 %rd45, %r39, 4;\n" -" add.u64 %rd46, %rd1, %rd45;\n" -" ld.shared.f32 %f33, [%rd46+0];\n" -" .loc 16 152 0\n" -" and.b32 %r40, %r37, 1073741823;\n" -" mov.u32 %r41, %r40;\n" -" mov.s32 %r42, 0;\n" -" mov.u32 %r43, %r42;\n" -" mov.s32 %r44, 0;\n" -" mov.u32 %r45, %r44;\n" -" mov.s32 %r46, 0;\n" -" mov.u32 %r47, %r46;\n" -" tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r41,%r43,%r45,%r47}];\n" -" mov.f32 %f38, %f34;\n" -" mov.f32 %f39, %f35;\n" -" mov.f32 %f40, %f36;\n" -" mov.f32 %f41, %f37;\n" -" sub.ftz.f32 %f42, %f25, %f39;\n" -" sub.ftz.f32 %f43, %f24, %f38;\n" -" sub.ftz.f32 %f44, %f26, %f40;\n" -" mul.ftz.f32 %f45, %f42, %f42;\n" -" fma.rn.ftz.f32 %f46, %f43, %f43, %f45;\n" -" fma.rn.ftz.f32 %f47, %f44, %f44, %f46;\n" -" add.ftz.f32 %f48, %f28, %f41;\n" -" cvt.rzi.ftz.s32.f32 %r48, %f48;\n" -" cvt.s64.s32 %rd47, %r48;\n" -" mul.wide.s32 %rd48, %r48, 16;\n" -" add.u64 %rd49, %rd7, %rd48;\n" -" ld.shared.f32 %f49, [%rd49+0];\n" -" setp.gt.ftz.f32 %p7, %f49, %f47;\n" -" @!%p7 bra $Lt_1_25346;\n" -" .loc 16 163 0\n" -" sqrt.approx.ftz.f32 %f50, %f47;\n" -" ld.shared.v4.f32 {_,%f51,%f52,%f53}, [%rd49+0];\n" -" sub.ftz.f32 %f54, %f50, %f52;\n" -" .loc 16 164 0\n" -" mul.ftz.f32 %f55, %f53, %f54;\n" -" neg.ftz.f32 %f56, %f55;\n" -" .loc 16 166 0\n" -" mov.f32 %f57, 0f3fb8aa3b; \n" -" mul.ftz.f32 %f58, %f56, %f57;\n" -" ex2.approx.ftz.f32 %f59, %f58;\n" -" mul.ftz.f32 %f60, %f59, %f59;\n" -" sub.ftz.f32 %f61, %f60, %f59;\n" -" mul.ftz.f32 %f62, %f51, %f61;\n" -" .loc 16 168 0\n" -" div.approx.ftz.f32 %f63, %f62, %f50;\n" -" mul.ftz.f32 %f64, %f63, %f33;\n" -" fma.rn.ftz.f32 %f31, %f43, %f64, %f31;\n" -" .loc 16 169 0\n" -" fma.rn.ftz.f32 %f30, %f42, %f64, %f30;\n" -" .loc 16 170 0\n" -" fma.rn.ftz.f32 %f29, %f44, %f64, %f29;\n" -" ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r50, 0;\n" -" setp.le.s32 %p8, %r49, %r50;\n" -" @%p8 bra $Lt_1_24834;\n" -" .loc 16 173 0\n" -" mul.lo.u64 %rd50, %rd47, 8;\n" -" add.u64 %rd51, %rd13, %rd50;\n" -" ld.shared.v2.f32 {%f65,%f66}, [%rd51+0];\n" -" sub.ftz.f32 %f67, %f61, %f59;\n" -" mul.ftz.f32 %f68, %f65, %f67;\n" -" sub.ftz.f32 %f69, %f68, %f66;\n" -" .loc 16 174 0\n" -" fma.rn.ftz.f32 %f32, %f33, %f69, %f32;\n" -"$Lt_1_24834:\n" -" ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r52, 0;\n" -" setp.le.s32 %p9, %r51, %r52;\n" -" @%p9 bra $Lt_1_25346;\n" -" .loc 16 177 0\n" -" mov.f32 %f70, %f9;\n" -" mul.ftz.f32 %f71, %f43, %f43;\n" -" fma.rn.ftz.f32 %f72, %f64, %f71, %f70;\n" -" mov.f32 %f9, %f72;\n" -" .loc 16 178 0\n" -" mov.f32 %f73, %f11;\n" -" fma.rn.ftz.f32 %f74, %f64, %f45, %f73;\n" -" mov.f32 %f11, %f74;\n" -" .loc 16 179 0\n" -" mov.f32 %f75, %f13;\n" -" mul.ftz.f32 %f76, %f44, %f44;\n" -" fma.rn.ftz.f32 %f77, %f64, %f76, %f75;\n" -" mov.f32 %f13, %f77;\n" -" .loc 16 180 0\n" -" mov.f32 %f78, %f15;\n" -" mul.ftz.f32 %f79, %f42, %f43;\n" -" fma.rn.ftz.f32 %f80, %f64, %f79, %f78;\n" -" mov.f32 %f15, %f80;\n" -" .loc 16 181 0\n" -" mov.f32 %f81, %f17;\n" -" mul.ftz.f32 %f82, %f43, %f44;\n" -" fma.rn.ftz.f32 %f83, %f64, %f82, %f81;\n" -" mov.f32 %f17, %f83;\n" -" .loc 16 182 0\n" -" mul.ftz.f32 %f84, %f42, %f44;\n" -" fma.rn.ftz.f32 %f18, %f64, %f84, %f18;\n" -" mov.f32 %f19, %f18;\n" -"$Lt_1_25346:\n" -"$Lt_1_24322:\n" -" .loc 16 146 0\n" -" mul.lo.u64 %rd52, %rd43, 4;\n" -" add.u64 %rd36, %rd36, %rd52;\n" -" setp.lt.u64 %p10, %rd36, %rd35;\n" -" @%p10 bra $Lt_1_24066;\n" -" bra.uni $Lt_1_23554;\n" -"$Lt_1_31746:\n" -" mov.f32 %f29, 0f00000000; \n" -" mov.f32 %f30, 0f00000000; \n" -" mov.f32 %f31, 0f00000000; \n" -" mov.f32 %f32, 0f00000000; \n" -"$Lt_1_23554:\n" -" mov.u32 %r53, 1;\n" -" setp.le.s32 %p11, %r6, %r53;\n" -" @%p11 bra $Lt_1_28162;\n" -" .loc 16 187 0\n" -" mov.u64 %rd53, __cuda___cuda_local_var_32688_55_non_const_red_acc6200;\n" -" cvt.s64.s32 %rd54, %r1;\n" -" mul.wide.s32 %rd55, %r1, 4;\n" -" add.u64 %rd56, %rd53, %rd55;\n" -" mov.f32 %f85, %f31;\n" -" st.shared.f32 [%rd56+0], %f85;\n" -" mov.f32 %f86, %f30;\n" -" st.shared.f32 [%rd56+512], %f86;\n" -" mov.f32 %f87, %f29;\n" -" st.shared.f32 [%rd56+1024], %f87;\n" -" mov.f32 %f88, %f32;\n" -" st.shared.f32 [%rd56+1536], %f88;\n" -" shr.s32 %r54, %r6, 31;\n" -" mov.s32 %r55, 1;\n" -" and.b32 %r56, %r54, %r55;\n" -" add.s32 %r57, %r56, %r6;\n" -" shr.s32 %r58, %r57, 1;\n" -" mov.s32 %r59, %r58;\n" -" mov.u32 %r60, 0;\n" -" setp.ne.u32 %p12, %r58, %r60;\n" -" @!%p12 bra $Lt_1_26626;\n" -"$Lt_1_27138:\n" -" setp.ge.u32 %p13, %r17, %r59;\n" -" @%p13 bra $Lt_1_27394;\n" -" add.u32 %r61, %r1, %r59;\n" -" cvt.u64.u32 %rd57, %r61;\n" -" mul.wide.u32 %rd58, %r61, 4;\n" -" add.u64 %rd59, %rd53, %rd58;\n" -" ld.shared.f32 %f89, [%rd59+0];\n" -" add.ftz.f32 %f85, %f89, %f85;\n" -" st.shared.f32 [%rd56+0], %f85;\n" -" ld.shared.f32 %f90, [%rd59+512];\n" -" add.ftz.f32 %f86, %f90, %f86;\n" -" st.shared.f32 [%rd56+512], %f86;\n" -" ld.shared.f32 %f91, [%rd59+1024];\n" -" add.ftz.f32 %f87, %f91, %f87;\n" -" st.shared.f32 [%rd56+1024], %f87;\n" -" ld.shared.f32 %f92, [%rd59+1536];\n" -" add.ftz.f32 %f88, %f92, %f88;\n" -" st.shared.f32 [%rd56+1536], %f88;\n" -"$Lt_1_27394:\n" -" shr.u32 %r59, %r59, 1;\n" -" mov.u32 %r62, 0;\n" -" setp.ne.u32 %p14, %r59, %r62;\n" -" @%p14 bra $Lt_1_27138;\n" -"$Lt_1_26626:\n" -" mov.f32 %f31, %f85;\n" -" mov.f32 %f30, %f86;\n" -" mov.f32 %f29, %f87;\n" -" mov.f32 %f32, %f88;\n" -" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r64, 0;\n" -" setp.le.s32 %p15, %r63, %r64;\n" -" @%p15 bra $Lt_1_28162;\n" -" mov.f32 %f85, %f9;\n" -" st.shared.f32 [%rd56+0], %f85;\n" -" mov.f32 %f86, %f11;\n" -" st.shared.f32 [%rd56+512], %f86;\n" -" mov.f32 %f87, %f13;\n" -" st.shared.f32 [%rd56+1024], %f87;\n" -" mov.f32 %f88, %f15;\n" -" st.shared.f32 [%rd56+1536], %f88;\n" -" mov.f32 %f93, %f17;\n" -" st.shared.f32 [%rd56+2048], %f93;\n" -" mov.f32 %f94, %f18;\n" -" st.shared.f32 [%rd56+2560], %f94;\n" -" mov.s32 %r65, %r58;\n" -" @!%p12 bra $Lt_1_28674;\n" -"$Lt_1_29186:\n" -" setp.ge.u32 %p16, %r17, %r65;\n" -" @%p16 bra $Lt_1_29442;\n" -" add.u32 %r66, %r1, %r65;\n" -" cvt.u64.u32 %rd60, %r66;\n" -" mul.wide.u32 %rd61, %r66, 4;\n" -" add.u64 %rd62, %rd53, %rd61;\n" -" ld.shared.f32 %f95, [%rd62+0];\n" -" add.ftz.f32 %f85, %f95, %f85;\n" -" st.shared.f32 [%rd56+0], %f85;\n" -" ld.shared.f32 %f96, [%rd62+512];\n" -" add.ftz.f32 %f86, %f96, %f86;\n" -" st.shared.f32 [%rd56+512], %f86;\n" -" ld.shared.f32 %f97, [%rd62+1024];\n" -" add.ftz.f32 %f87, %f97, %f87;\n" -" st.shared.f32 [%rd56+1024], %f87;\n" -" ld.shared.f32 %f98, [%rd62+1536];\n" -" add.ftz.f32 %f88, %f98, %f88;\n" -" st.shared.f32 [%rd56+1536], %f88;\n" -" ld.shared.f32 %f99, [%rd62+2048];\n" -" add.ftz.f32 %f93, %f99, %f93;\n" -" st.shared.f32 [%rd56+2048], %f93;\n" -" ld.shared.f32 %f100, [%rd62+2560];\n" -" add.ftz.f32 %f94, %f100, %f94;\n" -" st.shared.f32 [%rd56+2560], %f94;\n" -"$Lt_1_29442:\n" -" shr.u32 %r65, %r65, 1;\n" -" mov.u32 %r67, 0;\n" -" setp.ne.u32 %p17, %r65, %r67;\n" -" @%p17 bra $Lt_1_29186;\n" -"$Lt_1_28674:\n" -" mov.f32 %f9, %f85;\n" -" mov.f32 %f11, %f86;\n" -" mov.f32 %f13, %f87;\n" -" mov.f32 %f15, %f88;\n" -" mov.f32 %f17, %f93;\n" -" mov.f32 %f19, %f94;\n" -"$Lt_1_28162:\n" -"$Lt_1_26114:\n" -" mov.u32 %r68, 0;\n" -" setp.ne.s32 %p18, %r17, %r68;\n" -" @%p18 bra $Lt_1_30210;\n" -" ld.param.u64 %rd63, [__cudaparm_kernel_pair_fast___val_paramengv];\n" -" add.u64 %rd64, %rd63, %rd21;\n" -" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n" -" mov.u32 %r70, 0;\n" -" setp.le.s32 %p19, %r69, %r70;\n" -" @%p19 bra $Lt_1_30722;\n" -" st.global.f32 [%rd64+0], %f32;\n" -" cvt.s64.s32 %rd65, %r13;\n" -" mul.wide.s32 %rd66, %r13, 4;\n" -" add.u64 %rd64, %rd64, %rd66;\n" -"$Lt_1_30722:\n" -" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n" -" mov.u32 %r72, 0;\n" -" setp.le.s32 %p20, %r71, %r72;\n" -" @%p20 bra $Lt_1_31234;\n" -" mov.f32 %f101, %f9;\n" -" st.global.f32 [%rd64+0], %f101;\n" -" cvt.s64.s32 %rd67, %r13;\n" -" mul.wide.s32 %rd68, %r13, 4;\n" -" add.u64 %rd69, %rd68, %rd64;\n" -" mov.f32 %f102, %f11;\n" -" st.global.f32 [%rd69+0], %f102;\n" -" add.u64 %rd70, %rd68, %rd69;\n" -" mov.f32 %f103, %f13;\n" -" st.global.f32 [%rd70+0], %f103;\n" -" add.u64 %rd71, %rd68, %rd70;\n" -" mov.f32 %f104, %f15;\n" -" st.global.f32 [%rd71+0], %f104;\n" -" add.u64 %rd64, %rd68, %rd71;\n" -" mov.f32 %f105, %f17;\n" -" st.global.f32 [%rd64+0], %f105;\n" -" mov.f32 %f106, %f19;\n" -" add.u64 %rd72, %rd68, %rd64;\n" -" st.global.f32 [%rd72+0], %f106;\n" -"$Lt_1_31234:\n" -" ld.param.u64 %rd73, [__cudaparm_kernel_pair_fast_ans];\n" -" mul.lo.u64 %rd74, %rd20, 16;\n" -" add.u64 %rd75, %rd73, %rd74;\n" -" mov.f32 %f107, %f108;\n" -" st.global.v4.f32 [%rd75+0], {%f31,%f30,%f29,%f107};\n" -"$Lt_1_30210:\n" -"$Lt_1_22530:\n" -" .loc 16 190 0\n" -" exit;\n" -"$LDWend_kernel_pair_fast:\n" -" }\n" -; diff --git a/lib/gpu/neighbor_cpu.ptx b/lib/gpu/neighbor_cpu.ptx deleted file mode 100644 index 354ea985b1..0000000000 --- a/lib/gpu/neighbor_cpu.ptx +++ /dev/null @@ -1,132 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009a34_00000000-9_lal_neighbor_cpu.cpp3.i (/home/sjplimp/ccBI#.V8lyjI) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009a34_00000000-8_lal_neighbor_cpu.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_neighbor_cpu.cu" - .file 17 "/usr/local/cuda/include/common_functions.h" - .file 18 "/usr/local/cuda/include/math_functions.h" - .file 19 "/usr/local/cuda/include/math_constants.h" - .file 20 "/usr/local/cuda/include/device_functions.h" - .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - - .entry kernel_unpack ( - .param .u64 __cudaparm_kernel_unpack_dev_nbor, - .param .u64 __cudaparm_kernel_unpack_dev_ij, - .param .s32 __cudaparm_kernel_unpack_inum, - .param .s32 __cudaparm_kernel_unpack_t_per_atom) - { - .reg .u32 %r<19>; - .reg .u64 %rd<33>; - .reg .pred %p<5>; - .loc 16 21 0 -$LDWbegin_kernel_unpack: - ld.param.s32 %r1, [__cudaparm_kernel_unpack_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_unpack_inum]; - setp.ge.s32 %p1, %r8, %r9; - @%p1 bra $Lt_0_2050; - .loc 16 30 0 - cvt.s64.s32 %rd1, %r9; - ld.param.u64 %rd2, [__cudaparm_kernel_unpack_dev_nbor]; - cvt.s64.s32 %rd3, %r8; - add.u64 %rd4, %rd3, %rd1; - mul.lo.u64 %rd5, %rd4, 4; - add.u64 %rd6, %rd2, %rd5; - mul.wide.s32 %rd7, %r9, 4; - add.u64 %rd8, %rd6, %rd7; - ld.param.u64 %rd9, [__cudaparm_kernel_unpack_dev_ij]; - ld.global.s32 %r10, [%rd8+0]; - cvt.s64.s32 %rd10, %r10; - mul.wide.s32 %rd11, %r10, 4; - add.u64 %rd12, %rd9, %rd11; - .loc 16 31 0 - ld.global.s32 %r11, [%rd6+0]; - cvt.s64.s32 %rd13, %r11; - mul.wide.s32 %rd14, %r11, 4; - add.u64 %rd15, %rd12, %rd14; - .loc 16 33 0 - sub.s32 %r12, %r1, 1; - and.b32 %r13, %r12, %r2; - mul.lo.s32 %r14, %r12, %r8; - add.s32 %r15, %r13, %r14; - cvt.s64.s32 %rd16, %r15; - mul.wide.s32 %rd17, %r15, 4; - add.u64 %rd18, %rd8, %rd17; - .loc 16 34 0 - cvt.s64.s32 %rd19, %r13; - mul.wide.s32 %rd20, %r13, 4; - add.u64 %rd21, %rd12, %rd20; - setp.ge.u64 %p2, %rd21, %rd15; - @%p2 bra $Lt_0_2562; - sub.u64 %rd22, %rd15, %rd21; - add.u64 %rd23, %rd22, 3; - shr.s64 %rd24, %rd23, 63; - mov.s64 %rd25, 3; - and.b64 %rd26, %rd24, %rd25; - add.s64 %rd27, %rd26, %rd23; - shr.s64 %rd28, %rd27, 2; - mul.lo.s32 %r16, %r9, %r1; - mov.s64 %rd29, %rd28; -$Lt_0_3074: - // Loop body line 34, nesting depth: 1, estimated iterations: unknown - .loc 16 37 0 - ld.global.s32 %r17, [%rd21+0]; - st.global.s32 [%rd18+0], %r17; - .loc 16 38 0 - cvt.s64.s32 %rd30, %r16; - mul.wide.s32 %rd31, %r16, 4; - add.u64 %rd18, %rd18, %rd31; - add.u64 %rd21, %rd21, 4; - setp.ne.u64 %p3, %rd21, %rd15; - @%p3 bra $Lt_0_3074; -$Lt_0_2562: -$Lt_0_2050: - .loc 16 41 0 - exit; -$LDWend_kernel_unpack: - } // kernel_unpack - diff --git a/lib/gpu/neighbor_cpu_ptx.h b/lib/gpu/neighbor_cpu_ptx.h deleted file mode 100644 index ac438d1abb..0000000000 --- a/lib/gpu/neighbor_cpu_ptx.h +++ /dev/null @@ -1,86 +0,0 @@ -const char * neighbor_cpu = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .entry kernel_unpack (\n" -" .param .u64 __cudaparm_kernel_unpack_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_unpack_dev_ij,\n" -" .param .s32 __cudaparm_kernel_unpack_inum,\n" -" .param .s32 __cudaparm_kernel_unpack_t_per_atom)\n" -" {\n" -" .reg .u32 %r<19>;\n" -" .reg .u64 %rd<33>;\n" -" .reg .pred %p<5>;\n" -" .loc 16 21 0\n" -"$LDWbegin_kernel_unpack:\n" -" ld.param.s32 %r1, [__cudaparm_kernel_unpack_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_unpack_inum];\n" -" setp.ge.s32 %p1, %r8, %r9;\n" -" @%p1 bra $Lt_0_2050;\n" -" .loc 16 30 0\n" -" cvt.s64.s32 %rd1, %r9;\n" -" ld.param.u64 %rd2, [__cudaparm_kernel_unpack_dev_nbor];\n" -" cvt.s64.s32 %rd3, %r8;\n" -" add.u64 %rd4, %rd3, %rd1;\n" -" mul.lo.u64 %rd5, %rd4, 4;\n" -" add.u64 %rd6, %rd2, %rd5;\n" -" mul.wide.s32 %rd7, %r9, 4;\n" -" add.u64 %rd8, %rd6, %rd7;\n" -" ld.param.u64 %rd9, [__cudaparm_kernel_unpack_dev_ij];\n" -" ld.global.s32 %r10, [%rd8+0];\n" -" cvt.s64.s32 %rd10, %r10;\n" -" mul.wide.s32 %rd11, %r10, 4;\n" -" add.u64 %rd12, %rd9, %rd11;\n" -" .loc 16 31 0\n" -" ld.global.s32 %r11, [%rd6+0];\n" -" cvt.s64.s32 %rd13, %r11;\n" -" mul.wide.s32 %rd14, %r11, 4;\n" -" add.u64 %rd15, %rd12, %rd14;\n" -" .loc 16 33 0\n" -" sub.s32 %r12, %r1, 1;\n" -" and.b32 %r13, %r12, %r2;\n" -" mul.lo.s32 %r14, %r12, %r8;\n" -" add.s32 %r15, %r13, %r14;\n" -" cvt.s64.s32 %rd16, %r15;\n" -" mul.wide.s32 %rd17, %r15, 4;\n" -" add.u64 %rd18, %rd8, %rd17;\n" -" .loc 16 34 0\n" -" cvt.s64.s32 %rd19, %r13;\n" -" mul.wide.s32 %rd20, %r13, 4;\n" -" add.u64 %rd21, %rd12, %rd20;\n" -" setp.ge.u64 %p2, %rd21, %rd15;\n" -" @%p2 bra $Lt_0_2562;\n" -" sub.u64 %rd22, %rd15, %rd21;\n" -" add.u64 %rd23, %rd22, 3;\n" -" shr.s64 %rd24, %rd23, 63;\n" -" mov.s64 %rd25, 3;\n" -" and.b64 %rd26, %rd24, %rd25;\n" -" add.s64 %rd27, %rd26, %rd23;\n" -" shr.s64 %rd28, %rd27, 2;\n" -" mul.lo.s32 %r16, %r9, %r1;\n" -" mov.s64 %rd29, %rd28;\n" -"$Lt_0_3074:\n" -" .loc 16 37 0\n" -" ld.global.s32 %r17, [%rd21+0];\n" -" st.global.s32 [%rd18+0], %r17;\n" -" .loc 16 38 0\n" -" cvt.s64.s32 %rd30, %r16;\n" -" mul.wide.s32 %rd31, %r16, 4;\n" -" add.u64 %rd18, %rd18, %rd31;\n" -" add.u64 %rd21, %rd21, 4;\n" -" setp.ne.u64 %p3, %rd21, %rd15;\n" -" @%p3 bra $Lt_0_3074;\n" -"$Lt_0_2562:\n" -"$Lt_0_2050:\n" -" .loc 16 41 0\n" -" exit;\n" -"$LDWend_kernel_unpack:\n" -" }\n" -; diff --git a/lib/gpu/neighbor_gpu.ptx b/lib/gpu/neighbor_gpu.ptx deleted file mode 100644 index c43102765a..0000000000 --- a/lib/gpu/neighbor_gpu.ptx +++ /dev/null @@ -1,870 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009a53_00000000-9_lal_neighbor_gpu.cpp3.i (/home/sjplimp/ccBI#.a5G2Mh) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009a53_00000000-8_lal_neighbor_gpu.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_neighbor_gpu.cu" - .file 17 "/usr/local/cuda/include/common_functions.h" - .file 18 "/usr/local/cuda/include/math_functions.h" - .file 19 "/usr/local/cuda/include/math_constants.h" - .file 20 "/usr/local/cuda/include/device_functions.h" - .file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - .global .texref neigh_tex; - - .entry calc_cell_id ( - .param .u64 __cudaparm_calc_cell_id_pos, - .param .u64 __cudaparm_calc_cell_id_cell_id, - .param .u64 __cudaparm_calc_cell_id_particle_id, - .param .f32 __cudaparm_calc_cell_id_boxlo0, - .param .f32 __cudaparm_calc_cell_id_boxlo1, - .param .f32 __cudaparm_calc_cell_id_boxlo2, - .param .f32 __cudaparm_calc_cell_id_boxhi0, - .param .f32 __cudaparm_calc_cell_id_boxhi1, - .param .f32 __cudaparm_calc_cell_id_boxhi2, - .param .f32 __cudaparm_calc_cell_id_cell_size, - .param .s32 __cudaparm_calc_cell_id_ncellx, - .param .s32 __cudaparm_calc_cell_id_ncelly, - .param .s32 __cudaparm_calc_cell_id_nall) - { - .reg .u32 %r<25>; - .reg .u64 %rd<8>; - .reg .f32 %f<35>; - .reg .f64 %fd<11>; - .reg .pred %p<3>; - .loc 16 29 0 -$LDWbegin_calc_cell_id: - mov.u32 %r1, %tid.x; - mov.u32 %r2, %ctaid.x; - mov.u32 %r3, %ntid.x; - mul.lo.u32 %r4, %r2, %r3; - add.u32 %r5, %r1, %r4; - ld.param.s32 %r6, [__cudaparm_calc_cell_id_nall]; - setp.le.s32 %p1, %r6, %r5; - @%p1 bra $Lt_0_1026; - .loc 16 33 0 - mov.u32 %r7, %r5; - mov.s32 %r8, 0; - mov.u32 %r9, %r8; - mov.s32 %r10, 0; - mov.u32 %r11, %r10; - mov.s32 %r12, 0; - mov.u32 %r13, %r12; - tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[neigh_tex,{%r7,%r9,%r11,%r13}]; - mov.f32 %f5, %f1; - mov.f32 %f6, %f2; - mov.f32 %f7, %f3; - .loc 16 46 0 - ld.param.f32 %f8, [__cudaparm_calc_cell_id_cell_size]; - neg.ftz.f32 %f9, %f8; - ld.param.f32 %f10, [__cudaparm_calc_cell_id_boxlo0]; - ld.param.f32 %f11, [__cudaparm_calc_cell_id_boxlo2]; - ld.param.f32 %f12, [__cudaparm_calc_cell_id_boxlo1]; - ld.param.s32 %r14, [__cudaparm_calc_cell_id_ncellx]; - ld.param.s32 %r15, [__cudaparm_calc_cell_id_ncelly]; - ld.param.f32 %f13, [__cudaparm_calc_cell_id_boxhi2]; - sub.ftz.f32 %f14, %f13, %f11; - add.ftz.f32 %f15, %f8, %f14; - sub.ftz.f32 %f16, %f7, %f11; - max.ftz.f32 %f17, %f9, %f16; - min.ftz.f32 %f18, %f15, %f17; - div.approx.ftz.f32 %f19, %f18, %f8; - cvt.ftz.f64.f32 %fd1, %f19; - mov.f64 %fd2, 0d3ff0000000000000; // 1 - add.f64 %fd3, %fd1, %fd2; - cvt.rzi.u32.f64 %r16, %fd3; - mul.lo.u32 %r17, %r14, %r16; - mul.lo.u32 %r18, %r15, %r17; - ld.param.f32 %f20, [__cudaparm_calc_cell_id_boxhi1]; - sub.ftz.f32 %f21, %f20, %f12; - add.ftz.f32 %f22, %f8, %f21; - sub.ftz.f32 %f23, %f6, %f12; - max.ftz.f32 %f24, %f9, %f23; - min.ftz.f32 %f25, %f22, %f24; - div.approx.ftz.f32 %f26, %f25, %f8; - cvt.ftz.f64.f32 %fd4, %f26; - mov.f64 %fd5, 0d3ff0000000000000; // 1 - add.f64 %fd6, %fd4, %fd5; - cvt.rzi.u32.f64 %r19, %fd6; - mul.lo.u32 %r20, %r14, %r19; - add.u32 %r21, %r18, %r20; - ld.param.f32 %f27, [__cudaparm_calc_cell_id_boxhi0]; - sub.ftz.f32 %f28, %f27, %f10; - add.ftz.f32 %f29, %f8, %f28; - sub.ftz.f32 %f30, %f5, %f10; - max.ftz.f32 %f31, %f9, %f30; - min.ftz.f32 %f32, %f29, %f31; - div.approx.ftz.f32 %f33, %f32, %f8; - cvt.ftz.f64.f32 %fd7, %f33; - mov.f64 %fd8, 0d3ff0000000000000; // 1 - add.f64 %fd9, %fd7, %fd8; - cvt.rzi.u32.f64 %r22, %fd9; - add.u32 %r23, %r21, %r22; - .loc 16 50 0 - cvt.s64.s32 %rd1, %r5; - mul.wide.s32 %rd2, %r5, 4; - ld.param.u64 %rd3, [__cudaparm_calc_cell_id_cell_id]; - add.u64 %rd4, %rd3, %rd2; - st.global.u32 [%rd4+0], %r23; - .loc 16 51 0 - ld.param.u64 %rd5, [__cudaparm_calc_cell_id_particle_id]; - add.u64 %rd6, %rd5, %rd2; - st.global.s32 [%rd6+0], %r5; -$Lt_0_1026: - .loc 16 53 0 - exit; -$LDWend_calc_cell_id: - } // calc_cell_id - - .entry kernel_calc_cell_counts ( - .param .u64 __cudaparm_kernel_calc_cell_counts_cell_id, - .param .u64 __cudaparm_kernel_calc_cell_counts_cell_counts, - .param .s32 __cudaparm_kernel_calc_cell_counts_nall, - .param .s32 __cudaparm_kernel_calc_cell_counts_ncell) - { - .reg .u32 %r<33>; - .reg .u64 %rd<15>; - .reg .pred %p<13>; - .loc 16 56 0 -$LDWbegin_kernel_calc_cell_counts: - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mul.lo.u32 %r3, %r1, %r2; - mov.u32 %r4, %tid.x; - add.u32 %r5, %r4, %r3; - ld.param.s32 %r6, [__cudaparm_kernel_calc_cell_counts_nall]; - setp.gt.s32 %p1, %r6, %r5; - @!%p1 bra $Lt_1_7426; - .loc 16 59 0 - ld.param.u64 %rd1, [__cudaparm_kernel_calc_cell_counts_cell_id]; - cvt.s64.s32 %rd2, %r5; - mul.wide.s32 %rd3, %r5, 4; - add.u64 %rd4, %rd1, %rd3; - ld.global.u32 %r7, [%rd4+0]; - mov.u32 %r8, 0; - setp.ne.s32 %p2, %r5, %r8; - @%p2 bra $Lt_1_7938; - add.s32 %r9, %r7, 1; - mov.u32 %r10, 0; - setp.le.s32 %p3, %r9, %r10; - @%p3 bra $Lt_1_8450; - mov.s32 %r11, %r9; - ld.param.u64 %rd5, [__cudaparm_kernel_calc_cell_counts_cell_counts]; - mov.s32 %r12, 0; - mov.s32 %r13, %r11; -$Lt_1_8962: - // Loop body line 59, nesting depth: 1, estimated iterations: unknown - .loc 16 64 0 - mov.s32 %r14, 0; - st.global.s32 [%rd5+0], %r14; - add.s32 %r12, %r12, 1; - add.u64 %rd5, %rd5, 4; - setp.ne.s32 %p4, %r9, %r12; - @%p4 bra $Lt_1_8962; -$Lt_1_8450: -$Lt_1_7938: - sub.s32 %r15, %r6, 1; - setp.ne.s32 %p5, %r5, %r15; - @%p5 bra $Lt_1_9474; - .loc 16 67 0 - add.s32 %r9, %r7, 1; - mov.s32 %r16, %r9; - ld.param.s32 %r17, [__cudaparm_kernel_calc_cell_counts_ncell]; - setp.gt.s32 %p6, %r9, %r17; - @%p6 bra $Lt_1_9986; - sub.s32 %r18, %r17, %r7; - add.s32 %r19, %r17, 1; - ld.param.u64 %rd6, [__cudaparm_kernel_calc_cell_counts_cell_counts]; - cvt.s64.s32 %rd7, %r9; - mul.wide.s32 %rd8, %r9, 4; - add.u64 %rd9, %rd6, %rd8; - mov.s32 %r20, %r18; -$Lt_1_10498: - // Loop body line 67, nesting depth: 1, estimated iterations: unknown - .loc 16 68 0 - st.global.s32 [%rd9+0], %r6; - add.s32 %r16, %r16, 1; - add.u64 %rd9, %rd9, 4; - setp.ne.s32 %p7, %r19, %r16; - @%p7 bra $Lt_1_10498; -$Lt_1_9986: -$Lt_1_9474: - selp.s32 %r21, 1, 0, %p1; - mov.s32 %r22, 0; - set.gt.u32.s32 %r23, %r5, %r22; - neg.s32 %r24, %r23; - and.b32 %r25, %r21, %r24; - mov.u32 %r26, 0; - setp.eq.s32 %p8, %r25, %r26; - @%p8 bra $Lt_1_11010; - .loc 16 72 0 - ld.global.u32 %r27, [%rd4+-4]; - setp.eq.s32 %p9, %r7, %r27; - @%p9 bra $Lt_1_11522; - .loc 16 74 0 - add.s32 %r28, %r27, 1; - mov.s32 %r29, %r28; - setp.gt.s32 %p10, %r28, %r7; - @%p10 bra $Lt_1_12034; - sub.s32 %r30, %r7, %r27; - add.s32 %r9, %r7, 1; - ld.param.u64 %rd10, [__cudaparm_kernel_calc_cell_counts_cell_counts]; - cvt.s64.s32 %rd11, %r28; - mul.wide.s32 %rd12, %r28, 4; - add.u64 %rd13, %rd10, %rd12; - mov.s32 %r31, %r30; -$Lt_1_12546: - // Loop body line 74, nesting depth: 1, estimated iterations: unknown - .loc 16 75 0 - st.global.s32 [%rd13+0], %r5; - add.s32 %r29, %r29, 1; - add.u64 %rd13, %rd13, 4; - setp.ne.s32 %p11, %r9, %r29; - @%p11 bra $Lt_1_12546; -$Lt_1_12034: -$Lt_1_11522: -$Lt_1_11010: -$Lt_1_7426: - .loc 16 79 0 - exit; -$LDWend_kernel_calc_cell_counts: - } // kernel_calc_cell_counts - - .entry transpose ( - .param .u64 __cudaparm_transpose_out, - .param .u64 __cudaparm_transpose_in, - .param .s32 __cudaparm_transpose_columns_in, - .param .s32 __cudaparm_transpose_rows_in) - { - .reg .u32 %r<32>; - .reg .u64 %rd<23>; - .reg .f32 %f<4>; - .reg .pred %p<4>; - .shared .align 4 .b8 __cuda___cuda_local_var_32571_32_non_const_block112[288]; - .loc 16 86 0 -$LDWbegin_transpose: - mov.u32 %r1, %ctaid.x; - mul.lo.u32 %r2, %r1, 8; - mov.u32 %r3, %ctaid.y; - mul.lo.u32 %r4, %r3, 8; - mov.u32 %r5, %tid.x; - add.u32 %r6, %r2, %r5; - mov.u32 %r7, %tid.y; - add.u32 %r8, %r4, %r7; - ld.param.s32 %r9, [__cudaparm_transpose_rows_in]; - ld.param.s32 %r10, [__cudaparm_transpose_columns_in]; - set.gt.u32.u32 %r11, %r9, %r8; - neg.s32 %r12, %r11; - set.gt.u32.u32 %r13, %r10, %r6; - neg.s32 %r14, %r13; - and.b32 %r15, %r12, %r14; - mov.u32 %r16, 0; - setp.eq.s32 %p1, %r15, %r16; - @%p1 bra $Lt_2_2306; - .loc 16 98 0 - mov.u64 %rd1, __cuda___cuda_local_var_32571_32_non_const_block112; - ld.param.u64 %rd2, [__cudaparm_transpose_in]; - mul.lo.u32 %r17, %r10, %r8; - add.u32 %r18, %r6, %r17; - cvt.u64.u32 %rd3, %r18; - mul.wide.u32 %rd4, %r18, 4; - add.u64 %rd5, %rd2, %rd4; - ld.global.s32 %r19, [%rd5+0]; - cvt.rn.f32.s32 %f1, %r19; - cvt.u64.u32 %rd6, %r5; - cvt.u64.u32 %rd7, %r7; - mul.wide.u32 %rd8, %r7, 9; - add.u64 %rd9, %rd6, %rd8; - mul.lo.u64 %rd10, %rd9, 4; - add.u64 %rd11, %rd1, %rd10; - st.shared.f32 [%rd11+0], %f1; -$Lt_2_2306: - mov.u64 %rd1, __cuda___cuda_local_var_32571_32_non_const_block112; - .loc 16 100 0 - bar.sync 0; - add.u32 %r20, %r2, %r7; - add.u32 %r21, %r4, %r5; - set.gt.u32.u32 %r22, %r9, %r21; - neg.s32 %r23, %r22; - set.gt.u32.u32 %r24, %r10, %r20; - neg.s32 %r25, %r24; - and.b32 %r26, %r23, %r25; - mov.u32 %r27, 0; - setp.eq.s32 %p2, %r26, %r27; - @%p2 bra $Lt_2_2818; - .loc 16 105 0 - cvt.u64.u32 %rd12, %r7; - cvt.u64.u32 %rd13, %r5; - mul.wide.u32 %rd14, %r5, 9; - add.u64 %rd15, %rd12, %rd14; - mul.lo.u64 %rd16, %rd15, 4; - add.u64 %rd17, %rd1, %rd16; - ld.shared.f32 %f2, [%rd17+0]; - cvt.rzi.ftz.s32.f32 %r28, %f2; - ld.param.u64 %rd18, [__cudaparm_transpose_out]; - mul.lo.u32 %r29, %r9, %r20; - add.u32 %r30, %r21, %r29; - cvt.u64.u32 %rd19, %r30; - mul.wide.u32 %rd20, %r30, 4; - add.u64 %rd21, %rd18, %rd20; - st.global.s32 [%rd21+0], %r28; -$Lt_2_2818: - .loc 16 106 0 - exit; -$LDWend_transpose: - } // transpose - - .entry calc_neigh_list_cell ( - .param .u64 __cudaparm_calc_neigh_list_cell_x_, - .param .u64 __cudaparm_calc_neigh_list_cell_cell_particle_id, - .param .u64 __cudaparm_calc_neigh_list_cell_cell_counts, - .param .u64 __cudaparm_calc_neigh_list_cell_nbor_list, - .param .u64 __cudaparm_calc_neigh_list_cell_host_nbor_list, - .param .u64 __cudaparm_calc_neigh_list_cell_host_numj, - .param .s32 __cudaparm_calc_neigh_list_cell_neigh_bin_size, - .param .f32 __cudaparm_calc_neigh_list_cell_cell_size, - .param .s32 __cudaparm_calc_neigh_list_cell_ncellx, - .param .s32 __cudaparm_calc_neigh_list_cell_ncelly, - .param .s32 __cudaparm_calc_neigh_list_cell_ncellz, - .param .s32 __cudaparm_calc_neigh_list_cell_inum, - .param .s32 __cudaparm_calc_neigh_list_cell_nt, - .param .s32 __cudaparm_calc_neigh_list_cell_nall, - .param .s32 __cudaparm_calc_neigh_list_cell_t_per_atom) - { - .reg .u32 %r<118>; - .reg .u64 %rd<52>; - .reg .f32 %f<41>; - .reg .f64 %fd<4>; - .reg .pred %p<23>; - .shared .align 16 .b8 __cuda___cuda_local_var_32609_34_non_const_pos_sh496[2048]; - .shared .align 4 .b8 __cuda___cuda_local_var_32608_31_non_const_cell_list_sh2544[512]; - // __cuda_local_var_32624_12_non_const_atom_i = 16 - .loc 16 116 0 -$LDWbegin_calc_neigh_list_cell: - .loc 16 128 0 - ld.param.s32 %r1, [__cudaparm_calc_neigh_list_cell_ncelly]; - mov.u32 %r2, %ctaid.y; - rem.u32 %r3, %r2, %r1; - div.u32 %r4, %r2, %r1; - ld.param.s32 %r5, [__cudaparm_calc_neigh_list_cell_ncellx]; - mul.lo.s32 %r6, %r5, %r3; - mul.lo.s32 %r7, %r5, %r4; - mul.lo.s32 %r8, %r7, %r1; - cvt.s32.u32 %r9, %ctaid.x; - ld.param.u64 %rd1, [__cudaparm_calc_neigh_list_cell_cell_counts]; - add.s32 %r10, %r6, %r8; - add.s32 %r11, %r9, %r10; - cvt.s64.s32 %rd2, %r11; - mul.wide.s32 %rd3, %r11, 4; - add.u64 %rd4, %rd1, %rd3; - ldu.global.s32 %r12, [%rd4+0]; - .loc 16 129 0 - ldu.global.s32 %r13, [%rd4+4]; - .loc 16 137 0 - sub.s32 %r14, %r13, %r12; - mov.u32 %r15, %ntid.x; - cvt.rn.f32.u32 %f1, %r15; - cvt.rn.f32.s32 %f2, %r14; - div.approx.ftz.f32 %f3, %f2, %f1; - cvt.rpi.ftz.f32.f32 %f4, %f3; - cvt.rzi.ftz.s32.f32 %r16, %f4; - mov.u32 %r17, 0; - setp.le.s32 %p1, %r16, %r17; - @%p1 bra $Lt_3_14082; - sub.s32 %r18, %r3, 1; - mov.s32 %r19, 0; - max.s32 %r20, %r18, %r19; - sub.s32 %r21, %r1, 1; - add.s32 %r22, %r3, 1; - min.s32 %r23, %r21, %r22; - ld.param.s32 %r24, [__cudaparm_calc_neigh_list_cell_ncellz]; - sub.s32 %r25, %r24, 1; - add.s32 %r26, %r4, 1; - min.s32 %r27, %r25, %r26; - sub.s32 %r28, %r9, 1; - mov.s32 %r29, 0; - max.s32 %r30, %r28, %r29; - add.s32 %r31, %r9, 1; - sub.s32 %r32, %r5, 1; - min.s32 %r33, %r31, %r32; - mov.s32 %r34, %r16; - cvt.s32.u32 %r35, %tid.x; - add.s32 %r36, %r12, %r35; - mov.u32 %r37, 0; - ld.param.s32 %r38, [__cudaparm_calc_neigh_list_cell_inum]; - cvt.s64.s32 %rd5, %r38; - sub.s32 %r39, %r4, 1; - mov.s32 %r40, %r36; - mov.s32 %r41, 0; - max.s32 %r42, %r39, %r41; - setp.ge.s32 %p2, %r27, %r42; - ld.param.s32 %r43, [__cudaparm_calc_neigh_list_cell_nt]; - ld.param.s32 %r44, [__cudaparm_calc_neigh_list_cell_nall]; - mov.s32 %r45, 0; - mov.u64 %rd6, __cuda___cuda_local_var_32609_34_non_const_pos_sh496; - mov.u64 %rd7, __cuda___cuda_local_var_32608_31_non_const_cell_list_sh2544; - mov.s32 %r46, %r34; -$Lt_3_14594: - // Loop body line 137, nesting depth: 1, estimated iterations: unknown - .loc 16 140 0 - mov.s32 %r47, %r44; - setp.ge.s32 %p3, %r40, %r13; - @%p3 bra $Lt_3_14850; - .loc 16 146 0 - ld.param.u64 %rd8, [__cudaparm_calc_neigh_list_cell_cell_particle_id]; - add.u32 %r48, %r36, %r37; - cvt.s64.s32 %rd9, %r48; - mul.wide.s32 %rd10, %r48, 4; - add.u64 %rd11, %rd8, %rd10; - ld.global.s32 %r47, [%rd11+0]; -$Lt_3_14850: - setp.lt.s32 %p4, %r47, %r43; - @!%p4 bra $Lt_3_15362; - .loc 16 149 0 - mov.u32 %r49, %r47; - mov.s32 %r50, 0; - mov.u32 %r51, %r50; - mov.s32 %r52, 0; - mov.u32 %r53, %r52; - mov.s32 %r54, 0; - mov.u32 %r55, %r54; - tex.1d.v4.f32.s32 {%f5,%f6,%f7,%f8},[neigh_tex,{%r49,%r51,%r53,%r55}]; - mov.f32 %f9, %f5; - mov.f32 %f10, %f6; - mov.f32 %f11, %f7; - mov.f32 %f12, %f9; - mov.f32 %f13, %f10; - mov.f32 %f14, %f11; -$Lt_3_15362: - cvt.s64.s32 %rd12, %r47; - mul.wide.s32 %rd13, %r47, 4; - setp.ge.s32 %p5, %r47, %r38; - @%p5 bra $Lt_3_16130; - .loc 16 153 0 - ld.param.u64 %rd14, [__cudaparm_calc_neigh_list_cell_nbor_list]; - add.u64 %rd15, %rd12, %rd5; - mul.lo.u64 %rd16, %rd15, 4; - add.u64 %rd17, %rd14, %rd16; - mov.s64 %rd18, %rd17; - .loc 16 154 0 - ld.param.s32 %r56, [__cudaparm_calc_neigh_list_cell_t_per_atom]; - sub.s32 %r57, %r56, 1; - mul.lo.s32 %r58, %r47, %r57; - cvt.s64.s32 %rd19, %r58; - add.u64 %rd20, %rd19, %rd5; - mul.lo.u64 %rd21, %rd20, 4; - add.u64 %rd22, %rd17, %rd21; - .loc 16 155 0 - mul.lo.s32 %r59, %r56, %r38; - sub.s32 %r60, %r59, %r56; - .loc 16 156 0 - add.u64 %rd23, %rd13, %rd14; - st.global.s32 [%rd23+0], %r47; - bra.uni $Lt_3_15874; -$Lt_3_16130: - .loc 16 159 0 - ld.param.u64 %rd24, [__cudaparm_calc_neigh_list_cell_host_numj]; - add.u64 %rd25, %rd24, %rd13; - mul.lo.u64 %rd26, %rd5, 4; - sub.u64 %rd18, %rd25, %rd26; - .loc 16 160 0 - ld.param.u64 %rd27, [__cudaparm_calc_neigh_list_cell_host_nbor_list]; - ld.param.s32 %r61, [__cudaparm_calc_neigh_list_cell_neigh_bin_size]; - sub.s32 %r62, %r47, %r38; - mul.lo.s32 %r63, %r61, %r62; - cvt.s64.s32 %rd28, %r63; - mul.wide.s32 %rd29, %r63, 4; - add.u64 %rd22, %rd27, %rd29; - mov.s32 %r60, 0; -$Lt_3_15874: - .loc 16 165 0 - mov.s32 %r64, %r42; - @!%p2 bra $Lt_3_24066; - sub.s32 %r65, %r27, %r42; - add.s32 %r66, %r65, 1; - setp.le.s32 %p6, %r20, %r23; - add.s32 %r67, %r27, 1; - mov.s32 %r68, 0; - mov.s32 %r69, %r66; -$Lt_3_16898: - // Loop body line 165, nesting depth: 2, estimated iterations: unknown - .loc 16 166 0 - mov.s32 %r70, %r20; - @!%p6 bra $Lt_3_17154; - sub.s32 %r71, %r23, %r20; - add.s32 %r72, %r71, 1; - setp.ge.s32 %p7, %r33, %r30; - add.s32 %r73, %r23, 1; - mov.s32 %r74, %r72; -$Lt_3_17666: - // Loop body line 166, nesting depth: 3, estimated iterations: unknown - @!%p7 bra $Lt_3_17922; - sub.s32 %r75, %r33, %r30; - add.s32 %r76, %r75, 1; - mul.lo.s32 %r77, %r70, %r5; - mul.lo.s32 %r78, %r64, %r5; - mul.lo.s32 %r79, %r78, %r1; - add.s32 %r80, %r33, 1; - add.s32 %r81, %r77, %r79; - add.s32 %r82, %r81, %r30; - add.s32 %r83, %r80, %r81; - cvt.s64.s32 %rd30, %r82; - mul.wide.s32 %rd31, %r82, 4; - add.u64 %rd32, %rd1, %rd31; - mov.s32 %r84, %r76; -$Lt_3_18434: - // Loop body line 166, nesting depth: 4, estimated iterations: unknown - .loc 16 171 0 - ld.global.s32 %r85, [%rd32+0]; - .loc 16 172 0 - ld.global.s32 %r86, [%rd32+4]; - .loc 16 176 0 - sub.s32 %r87, %r86, %r85; - cvt.rn.f32.s32 %f15, %r87; - mov.f32 %f16, 0f43000000; // 128 - div.approx.ftz.f32 %f17, %f15, %f16; - cvt.rpi.ftz.f32.f32 %f18, %f17; - cvt.rzi.ftz.s32.f32 %r88, %f18; - mov.u32 %r89, 0; - setp.le.s32 %p8, %r88, %r89; - @%p8 bra $Lt_3_18690; - mov.s32 %r90, %r88; - mov.s32 %r91, 0; - setp.lt.s32 %p9, %r47, %r43; - mul.lo.s32 %r92, %r88, 128; - mov.s32 %r93, %r90; -$Lt_3_19202: - // Loop body line 176, nesting depth: 5, estimated iterations: unknown - sub.s32 %r94, %r87, %r91; - mov.s32 %r95, 128; - min.s32 %r96, %r94, %r95; - setp.le.s32 %p10, %r96, %r35; - @%p10 bra $Lt_3_19458; - .loc 16 183 0 - ld.param.u64 %rd33, [__cudaparm_calc_neigh_list_cell_cell_particle_id]; - add.s32 %r97, %r91, %r35; - add.s32 %r98, %r85, %r97; - cvt.s64.s32 %rd34, %r98; - mul.wide.s32 %rd35, %r98, 4; - add.u64 %rd36, %rd33, %rd35; - ld.global.s32 %r99, [%rd36+0]; - .loc 16 184 0 - cvt.s64.s32 %rd37, %r35; - mul.wide.s32 %rd38, %r35, 4; - add.u64 %rd39, %rd7, %rd38; - st.shared.s32 [%rd39+0], %r99; - .loc 16 185 0 - mov.u32 %r100, %r99; - mov.s32 %r101, 0; - mov.u32 %r102, %r101; - mov.s32 %r103, 0; - mov.u32 %r104, %r103; - mov.s32 %r105, 0; - mov.u32 %r106, %r105; - tex.1d.v4.f32.s32 {%f19,%f20,%f21,%f22},[neigh_tex,{%r100,%r102,%r104,%r106}]; - mov.f32 %f23, %f19; - mov.f32 %f24, %f20; - mov.f32 %f25, %f21; - .loc 16 186 0 - mul.lo.u64 %rd40, %rd37, 16; - add.u64 %rd41, %rd6, %rd40; - st.shared.v2.f32 [%rd41+0], {%f23,%f24}; - .loc 16 188 0 - st.shared.f32 [%rd41+8], %f25; -$Lt_3_19458: - .loc 16 190 0 - bar.sync 0; - @!%p9 bra $Lt_3_20482; - mov.u32 %r107, 0; - setp.le.s32 %p11, %r96, %r107; - @%p11 bra $Lt_3_20482; - mov.s32 %r108, %r96; - mov.s64 %rd42, 0; - ld.param.f32 %f26, [__cudaparm_calc_neigh_list_cell_cell_size]; - mul.ftz.f32 %f27, %f26, %f26; - mov.s64 %rd43, %rd6; - mov.f32 %f28, %f14; - mov.f32 %f29, %f13; - mov.f32 %f30, %f12; - mov.s32 %r109, 0; - mov.s32 %r110, %r108; -$Lt_3_20994: - // Loop body line 190, nesting depth: 6, estimated iterations: unknown - ld.shared.v4.f32 {%f31,%f32,%f33,_}, [%rd43+0]; - .loc 16 196 0 - sub.ftz.f32 %f34, %f30, %f31; - .loc 16 197 0 - sub.ftz.f32 %f35, %f29, %f32; - .loc 16 198 0 - sub.ftz.f32 %f36, %f28, %f33; - .loc 16 195 0 - mul.ftz.f32 %f37, %f35, %f35; - fma.rn.ftz.f32 %f38, %f34, %f34, %f37; - fma.rn.ftz.f32 %f39, %f36, %f36, %f38; - setp.gt.ftz.f32 %p12, %f27, %f39; - @!%p12 bra $Lt_3_25346; - cvt.ftz.f64.f32 %fd1, %f39; - mov.f64 %fd2, 0d3ee4f8b588e368f1; // 1e-05 - setp.gt.f64 %p13, %fd1, %fd2; - @!%p13 bra $Lt_3_25346; - .loc 16 202 0 - add.s32 %r68, %r68, 1; - ld.param.s32 %r111, [__cudaparm_calc_neigh_list_cell_neigh_bin_size]; - setp.lt.s32 %p14, %r111, %r68; - @%p14 bra $Lt_3_25346; - .loc 16 204 0 - mul.lo.u64 %rd44, %rd42, 4; - add.u64 %rd45, %rd7, %rd44; - ld.shared.s32 %r112, [%rd45+0]; - st.global.s32 [%rd22+0], %r112; - cvt.s64.s32 %rd46, %r60; - mul.wide.s32 %rd47, %r60, 4; - add.u64 %rd48, %rd22, %rd47; - add.u64 %rd49, %rd48, 4; - add.u64 %rd50, %rd22, 4; - ld.param.s32 %r113, [__cudaparm_calc_neigh_list_cell_t_per_atom]; - sub.s32 %r114, %r113, 1; - and.b32 %r115, %r68, %r114; - mov.s32 %r116, 0; - setp.eq.s32 %p15, %r115, %r116; - selp.u64 %rd22, %rd49, %rd50, %p15; -$Lt_3_25346: -$L_3_13570: - .loc 16 202 0 - add.s32 %r109, %r109, 1; - add.s64 %rd42, %rd42, 1; - add.u64 %rd43, %rd43, 16; - setp.ne.s32 %p16, %r96, %r109; - @%p16 bra $Lt_3_20994; -$Lt_3_20482: -$Lt_3_19970: - .loc 16 212 0 - bar.sync 0; - add.s32 %r91, %r91, 128; - setp.ne.s32 %p17, %r91, %r92; - @%p17 bra $Lt_3_19202; -$Lt_3_18690: - add.s32 %r82, %r82, 1; - add.u64 %rd32, %rd32, 4; - setp.ne.s32 %p18, %r82, %r83; - @%p18 bra $Lt_3_18434; -$Lt_3_17922: - add.s32 %r70, %r70, 1; - setp.ne.s32 %p19, %r73, %r70; - @%p19 bra $Lt_3_17666; -$Lt_3_17154: - add.s32 %r64, %r64, 1; - setp.ne.s32 %p20, %r67, %r64; - @%p20 bra $Lt_3_16898; - bra.uni $Lt_3_16386; -$Lt_3_24066: - mov.s32 %r68, 0; -$Lt_3_16386: - @!%p4 bra $Lt_3_23042; - .loc 16 218 0 - st.global.s32 [%rd18+0], %r68; -$Lt_3_23042: - add.s32 %r45, %r45, 1; - add.u32 %r37, %r37, %r15; - add.s32 %r40, %r40, %r15; - setp.ne.s32 %p21, %r16, %r45; - @%p21 bra $Lt_3_14594; -$Lt_3_14082: - .loc 16 220 0 - exit; -$LDWend_calc_neigh_list_cell: - } // calc_neigh_list_cell - - .entry kernel_special ( - .param .u64 __cudaparm_kernel_special_dev_nbor, - .param .u64 __cudaparm_kernel_special_host_nbor_list, - .param .u64 __cudaparm_kernel_special_host_numj, - .param .u64 __cudaparm_kernel_special_tag, - .param .u64 __cudaparm_kernel_special_nspecial, - .param .u64 __cudaparm_kernel_special_special, - .param .s32 __cudaparm_kernel_special_inum, - .param .s32 __cudaparm_kernel_special_nt, - .param .s32 __cudaparm_kernel_special_max_nbors, - .param .s32 __cudaparm_kernel_special_t_per_atom) - { - .reg .u32 %r<45>; - .reg .u64 %rd<45>; - .reg .pred %p<11>; - .loc 16 226 0 -$LDWbegin_kernel_special: - ld.param.s32 %r1, [__cudaparm_kernel_special_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_special_nt]; - setp.ge.s32 %p1, %r8, %r9; - @%p1 bra $Lt_4_6146; - .loc 16 236 0 - ld.param.u64 %rd1, [__cudaparm_kernel_special_nspecial]; - mul.lo.s32 %r10, %r8, 3; - cvt.s64.s32 %rd2, %r10; - mul.wide.s32 %rd3, %r10, 4; - add.u64 %rd4, %rd1, %rd3; - ld.global.s32 %r11, [%rd4+0]; - .loc 16 237 0 - ld.global.s32 %r12, [%rd4+4]; - .loc 16 238 0 - ld.global.s32 %r13, [%rd4+8]; - ld.param.s32 %r14, [__cudaparm_kernel_special_inum]; - setp.ge.s32 %p2, %r8, %r14; - @%p2 bra $Lt_4_6914; - .loc 16 244 0 - ld.param.u64 %rd5, [__cudaparm_kernel_special_dev_nbor]; - cvt.s64.s32 %rd6, %r8; - cvt.s64.s32 %rd7, %r14; - add.u64 %rd8, %rd6, %rd7; - mul.lo.u64 %rd9, %rd8, 4; - add.u64 %rd10, %rd5, %rd9; - ld.global.s32 %r15, [%rd10+0]; - .loc 16 246 0 - mul.lo.s32 %r16, %r14, %r1; - mov.s32 %r17, %r16; - .loc 16 248 0 - sub.s32 %r18, %r1, 1; - mul.lo.s32 %r19, %r18, %r8; - add.s32 %r20, %r14, %r19; - cvt.s64.s32 %rd11, %r20; - mul.wide.s32 %rd12, %r20, 4; - add.u64 %rd13, %rd10, %rd12; - and.b32 %r21, %r18, %r15; - cvt.s64.s32 %rd14, %r21; - div.s32 %r22, %r15, %r1; - mul.lo.s32 %r23, %r16, %r22; - cvt.s64.s32 %rd15, %r23; - add.u64 %rd16, %rd14, %rd15; - mul.lo.u64 %rd17, %rd16, 4; - add.u64 %rd18, %rd13, %rd17; - .loc 16 249 0 - and.b32 %r24, %r18, %r2; - cvt.s64.s32 %rd19, %r24; - mul.wide.s32 %rd20, %r24, 4; - add.u64 %rd21, %rd13, %rd20; - bra.uni $Lt_4_6658; -$Lt_4_6914: - .loc 16 252 0 - sub.s32 %r25, %r8, %r14; - ld.param.u64 %rd22, [__cudaparm_kernel_special_host_nbor_list]; - ld.param.s32 %r26, [__cudaparm_kernel_special_max_nbors]; - mul.lo.s32 %r27, %r26, %r25; - cvt.s64.s32 %rd23, %r27; - mul.wide.s32 %rd24, %r27, 4; - add.u64 %rd25, %rd22, %rd24; - mov.s64 %rd21, %rd25; - .loc 16 254 0 - ld.param.u64 %rd26, [__cudaparm_kernel_special_host_numj]; - cvt.s64.s32 %rd27, %r25; - mul.wide.s32 %rd28, %r25, 4; - add.u64 %rd29, %rd26, %rd28; - ld.global.s32 %r28, [%rd29+0]; - cvt.s64.s32 %rd30, %r28; - mul.wide.s32 %rd31, %r28, 4; - add.u64 %rd18, %rd25, %rd31; - mov.s32 %r17, 1; -$Lt_4_6658: - setp.ge.u64 %p3, %rd21, %rd18; - @%p3 bra $Lt_4_7170; - mov.s32 %r29, 0; - setp.gt.s32 %p4, %r13, %r29; - cvt.s64.s32 %rd32, %r17; - ld.param.u64 %rd33, [__cudaparm_kernel_special_tag]; -$Lt_4_7682: - // Loop body line 254, nesting depth: 1, estimated iterations: unknown - .loc 16 258 0 - ld.global.s32 %r30, [%rd21+0]; - .loc 16 259 0 - cvt.s64.s32 %rd34, %r30; - mul.wide.s32 %rd35, %r30, 4; - add.u64 %rd36, %rd33, %rd35; - ld.global.s32 %r31, [%rd36+0]; - @!%p4 bra $Lt_4_7938; - mov.s32 %r32, %r13; - cvt.s64.s32 %rd37, %r8; - cvt.s64.s32 %rd38, %r9; - mul.wide.s32 %rd39, %r9, 4; - ld.param.u64 %rd40, [__cudaparm_kernel_special_special]; - mul.wide.s32 %rd41, %r8, 4; - add.u64 %rd42, %rd40, %rd41; - mov.s32 %r33, 0; - mov.s32 %r34, %r32; -$Lt_4_8450: - // Loop body line 259, nesting depth: 1, estimated iterations: unknown - ld.global.s32 %r35, [%rd42+0]; - setp.ne.s32 %p5, %r35, %r31; - @%p5 bra $Lt_4_8706; - .loc 16 269 0 - setp.le.s32 %p6, %r11, %r33; - mov.s32 %r36, 3; - mov.s32 %r37, 2; - selp.s32 %r38, %r36, %r37, %p6; - mov.s32 %r39, 2; - mov.s32 %r40, 1; - selp.s32 %r41, %r39, %r40, %p6; - setp.le.s32 %p7, %r12, %r33; - selp.s32 %r42, %r38, %r41, %p7; - shl.b32 %r43, %r42, 30; - xor.b32 %r30, %r30, %r43; - .loc 16 270 0 - st.global.s32 [%rd21+0], %r30; -$Lt_4_8706: - add.s32 %r33, %r33, 1; - add.u64 %rd42, %rd39, %rd42; - setp.ne.s32 %p8, %r13, %r33; - @%p8 bra $Lt_4_8450; -$Lt_4_7938: - .loc 16 257 0 - mul.lo.u64 %rd43, %rd32, 4; - add.u64 %rd21, %rd21, %rd43; - setp.lt.u64 %p9, %rd21, %rd18; - @%p9 bra $Lt_4_7682; -$Lt_4_7170: -$Lt_4_6146: - .loc 16 276 0 - exit; -$LDWend_kernel_special: - } // kernel_special - diff --git a/lib/gpu/neighbor_gpu_ptx.h b/lib/gpu/neighbor_gpu_ptx.h deleted file mode 100644 index 5080ff7426..0000000000 --- a/lib/gpu/neighbor_gpu_ptx.h +++ /dev/null @@ -1,809 +0,0 @@ -const char * neighbor_gpu = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .global .texref neigh_tex;\n" -" .entry calc_cell_id (\n" -" .param .u64 __cudaparm_calc_cell_id_pos,\n" -" .param .u64 __cudaparm_calc_cell_id_cell_id,\n" -" .param .u64 __cudaparm_calc_cell_id_particle_id,\n" -" .param .f32 __cudaparm_calc_cell_id_boxlo0,\n" -" .param .f32 __cudaparm_calc_cell_id_boxlo1,\n" -" .param .f32 __cudaparm_calc_cell_id_boxlo2,\n" -" .param .f32 __cudaparm_calc_cell_id_boxhi0,\n" -" .param .f32 __cudaparm_calc_cell_id_boxhi1,\n" -" .param .f32 __cudaparm_calc_cell_id_boxhi2,\n" -" .param .f32 __cudaparm_calc_cell_id_cell_size,\n" -" .param .s32 __cudaparm_calc_cell_id_ncellx,\n" -" .param .s32 __cudaparm_calc_cell_id_ncelly,\n" -" .param .s32 __cudaparm_calc_cell_id_nall)\n" -" {\n" -" .reg .u32 %r<25>;\n" -" .reg .u64 %rd<8>;\n" -" .reg .f32 %f<35>;\n" -" .reg .f64 %fd<11>;\n" -" .reg .pred %p<3>;\n" -" .loc 16 29 0\n" -"$LDWbegin_calc_cell_id:\n" -" mov.u32 %r1, %tid.x;\n" -" mov.u32 %r2, %ctaid.x;\n" -" mov.u32 %r3, %ntid.x;\n" -" mul.lo.u32 %r4, %r2, %r3;\n" -" add.u32 %r5, %r1, %r4;\n" -" ld.param.s32 %r6, [__cudaparm_calc_cell_id_nall];\n" -" setp.le.s32 %p1, %r6, %r5;\n" -" @%p1 bra $Lt_0_1026;\n" -" .loc 16 33 0\n" -" mov.u32 %r7, %r5;\n" -" mov.s32 %r8, 0;\n" -" mov.u32 %r9, %r8;\n" -" mov.s32 %r10, 0;\n" -" mov.u32 %r11, %r10;\n" -" mov.s32 %r12, 0;\n" -" mov.u32 %r13, %r12;\n" -" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[neigh_tex,{%r7,%r9,%r11,%r13}];\n" -" mov.f32 %f5, %f1;\n" -" mov.f32 %f6, %f2;\n" -" mov.f32 %f7, %f3;\n" -" .loc 16 46 0\n" -" ld.param.f32 %f8, [__cudaparm_calc_cell_id_cell_size];\n" -" neg.ftz.f32 %f9, %f8;\n" -" ld.param.f32 %f10, [__cudaparm_calc_cell_id_boxlo0];\n" -" ld.param.f32 %f11, [__cudaparm_calc_cell_id_boxlo2];\n" -" ld.param.f32 %f12, [__cudaparm_calc_cell_id_boxlo1];\n" -" ld.param.s32 %r14, [__cudaparm_calc_cell_id_ncellx];\n" -" ld.param.s32 %r15, [__cudaparm_calc_cell_id_ncelly];\n" -" ld.param.f32 %f13, [__cudaparm_calc_cell_id_boxhi2];\n" -" sub.ftz.f32 %f14, %f13, %f11;\n" -" add.ftz.f32 %f15, %f8, %f14;\n" -" sub.ftz.f32 %f16, %f7, %f11;\n" -" max.ftz.f32 %f17, %f9, %f16;\n" -" min.ftz.f32 %f18, %f15, %f17;\n" -" div.approx.ftz.f32 %f19, %f18, %f8;\n" -" cvt.ftz.f64.f32 %fd1, %f19;\n" -" mov.f64 %fd2, 0d3ff0000000000000; \n" -" add.f64 %fd3, %fd1, %fd2;\n" -" cvt.rzi.u32.f64 %r16, %fd3;\n" -" mul.lo.u32 %r17, %r14, %r16;\n" -" mul.lo.u32 %r18, %r15, %r17;\n" -" ld.param.f32 %f20, [__cudaparm_calc_cell_id_boxhi1];\n" -" sub.ftz.f32 %f21, %f20, %f12;\n" -" add.ftz.f32 %f22, %f8, %f21;\n" -" sub.ftz.f32 %f23, %f6, %f12;\n" -" max.ftz.f32 %f24, %f9, %f23;\n" -" min.ftz.f32 %f25, %f22, %f24;\n" -" div.approx.ftz.f32 %f26, %f25, %f8;\n" -" cvt.ftz.f64.f32 %fd4, %f26;\n" -" mov.f64 %fd5, 0d3ff0000000000000; \n" -" add.f64 %fd6, %fd4, %fd5;\n" -" cvt.rzi.u32.f64 %r19, %fd6;\n" -" mul.lo.u32 %r20, %r14, %r19;\n" -" add.u32 %r21, %r18, %r20;\n" -" ld.param.f32 %f27, [__cudaparm_calc_cell_id_boxhi0];\n" -" sub.ftz.f32 %f28, %f27, %f10;\n" -" add.ftz.f32 %f29, %f8, %f28;\n" -" sub.ftz.f32 %f30, %f5, %f10;\n" -" max.ftz.f32 %f31, %f9, %f30;\n" -" min.ftz.f32 %f32, %f29, %f31;\n" -" div.approx.ftz.f32 %f33, %f32, %f8;\n" -" cvt.ftz.f64.f32 %fd7, %f33;\n" -" mov.f64 %fd8, 0d3ff0000000000000; \n" -" add.f64 %fd9, %fd7, %fd8;\n" -" cvt.rzi.u32.f64 %r22, %fd9;\n" -" add.u32 %r23, %r21, %r22;\n" -" .loc 16 50 0\n" -" cvt.s64.s32 %rd1, %r5;\n" -" mul.wide.s32 %rd2, %r5, 4;\n" -" ld.param.u64 %rd3, [__cudaparm_calc_cell_id_cell_id];\n" -" add.u64 %rd4, %rd3, %rd2;\n" -" st.global.u32 [%rd4+0], %r23;\n" -" .loc 16 51 0\n" -" ld.param.u64 %rd5, [__cudaparm_calc_cell_id_particle_id];\n" -" add.u64 %rd6, %rd5, %rd2;\n" -" st.global.s32 [%rd6+0], %r5;\n" -"$Lt_0_1026:\n" -" .loc 16 53 0\n" -" exit;\n" -"$LDWend_calc_cell_id:\n" -" }\n" -" .entry kernel_calc_cell_counts (\n" -" .param .u64 __cudaparm_kernel_calc_cell_counts_cell_id,\n" -" .param .u64 __cudaparm_kernel_calc_cell_counts_cell_counts,\n" -" .param .s32 __cudaparm_kernel_calc_cell_counts_nall,\n" -" .param .s32 __cudaparm_kernel_calc_cell_counts_ncell)\n" -" {\n" -" .reg .u32 %r<33>;\n" -" .reg .u64 %rd<15>;\n" -" .reg .pred %p<13>;\n" -" .loc 16 56 0\n" -"$LDWbegin_kernel_calc_cell_counts:\n" -" mov.u32 %r1, %ctaid.x;\n" -" mov.u32 %r2, %ntid.x;\n" -" mul.lo.u32 %r3, %r1, %r2;\n" -" mov.u32 %r4, %tid.x;\n" -" add.u32 %r5, %r4, %r3;\n" -" ld.param.s32 %r6, [__cudaparm_kernel_calc_cell_counts_nall];\n" -" setp.gt.s32 %p1, %r6, %r5;\n" -" @!%p1 bra $Lt_1_7426;\n" -" .loc 16 59 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_calc_cell_counts_cell_id];\n" -" cvt.s64.s32 %rd2, %r5;\n" -" mul.wide.s32 %rd3, %r5, 4;\n" -" add.u64 %rd4, %rd1, %rd3;\n" -" ld.global.u32 %r7, [%rd4+0];\n" -" mov.u32 %r8, 0;\n" -" setp.ne.s32 %p2, %r5, %r8;\n" -" @%p2 bra $Lt_1_7938;\n" -" add.s32 %r9, %r7, 1;\n" -" mov.u32 %r10, 0;\n" -" setp.le.s32 %p3, %r9, %r10;\n" -" @%p3 bra $Lt_1_8450;\n" -" mov.s32 %r11, %r9;\n" -" ld.param.u64 %rd5, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n" -" mov.s32 %r12, 0;\n" -" mov.s32 %r13, %r11;\n" -"$Lt_1_8962:\n" -" .loc 16 64 0\n" -" mov.s32 %r14, 0;\n" -" st.global.s32 [%rd5+0], %r14;\n" -" add.s32 %r12, %r12, 1;\n" -" add.u64 %rd5, %rd5, 4;\n" -" setp.ne.s32 %p4, %r9, %r12;\n" -" @%p4 bra $Lt_1_8962;\n" -"$Lt_1_8450:\n" -"$Lt_1_7938:\n" -" sub.s32 %r15, %r6, 1;\n" -" setp.ne.s32 %p5, %r5, %r15;\n" -" @%p5 bra $Lt_1_9474;\n" -" .loc 16 67 0\n" -" add.s32 %r9, %r7, 1;\n" -" mov.s32 %r16, %r9;\n" -" ld.param.s32 %r17, [__cudaparm_kernel_calc_cell_counts_ncell];\n" -" setp.gt.s32 %p6, %r9, %r17;\n" -" @%p6 bra $Lt_1_9986;\n" -" sub.s32 %r18, %r17, %r7;\n" -" add.s32 %r19, %r17, 1;\n" -" ld.param.u64 %rd6, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n" -" cvt.s64.s32 %rd7, %r9;\n" -" mul.wide.s32 %rd8, %r9, 4;\n" -" add.u64 %rd9, %rd6, %rd8;\n" -" mov.s32 %r20, %r18;\n" -"$Lt_1_10498:\n" -" .loc 16 68 0\n" -" st.global.s32 [%rd9+0], %r6;\n" -" add.s32 %r16, %r16, 1;\n" -" add.u64 %rd9, %rd9, 4;\n" -" setp.ne.s32 %p7, %r19, %r16;\n" -" @%p7 bra $Lt_1_10498;\n" -"$Lt_1_9986:\n" -"$Lt_1_9474:\n" -" selp.s32 %r21, 1, 0, %p1;\n" -" mov.s32 %r22, 0;\n" -" set.gt.u32.s32 %r23, %r5, %r22;\n" -" neg.s32 %r24, %r23;\n" -" and.b32 %r25, %r21, %r24;\n" -" mov.u32 %r26, 0;\n" -" setp.eq.s32 %p8, %r25, %r26;\n" -" @%p8 bra $Lt_1_11010;\n" -" .loc 16 72 0\n" -" ld.global.u32 %r27, [%rd4+-4];\n" -" setp.eq.s32 %p9, %r7, %r27;\n" -" @%p9 bra $Lt_1_11522;\n" -" .loc 16 74 0\n" -" add.s32 %r28, %r27, 1;\n" -" mov.s32 %r29, %r28;\n" -" setp.gt.s32 %p10, %r28, %r7;\n" -" @%p10 bra $Lt_1_12034;\n" -" sub.s32 %r30, %r7, %r27;\n" -" add.s32 %r9, %r7, 1;\n" -" ld.param.u64 %rd10, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n" -" cvt.s64.s32 %rd11, %r28;\n" -" mul.wide.s32 %rd12, %r28, 4;\n" -" add.u64 %rd13, %rd10, %rd12;\n" -" mov.s32 %r31, %r30;\n" -"$Lt_1_12546:\n" -" .loc 16 75 0\n" -" st.global.s32 [%rd13+0], %r5;\n" -" add.s32 %r29, %r29, 1;\n" -" add.u64 %rd13, %rd13, 4;\n" -" setp.ne.s32 %p11, %r9, %r29;\n" -" @%p11 bra $Lt_1_12546;\n" -"$Lt_1_12034:\n" -"$Lt_1_11522:\n" -"$Lt_1_11010:\n" -"$Lt_1_7426:\n" -" .loc 16 79 0\n" -" exit;\n" -"$LDWend_kernel_calc_cell_counts:\n" -" }\n" -" .entry transpose (\n" -" .param .u64 __cudaparm_transpose_out,\n" -" .param .u64 __cudaparm_transpose_in,\n" -" .param .s32 __cudaparm_transpose_columns_in,\n" -" .param .s32 __cudaparm_transpose_rows_in)\n" -" {\n" -" .reg .u32 %r<32>;\n" -" .reg .u64 %rd<23>;\n" -" .reg .f32 %f<4>;\n" -" .reg .pred %p<4>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32571_32_non_const_block112[288];\n" -" .loc 16 86 0\n" -"$LDWbegin_transpose:\n" -" mov.u32 %r1, %ctaid.x;\n" -" mul.lo.u32 %r2, %r1, 8;\n" -" mov.u32 %r3, %ctaid.y;\n" -" mul.lo.u32 %r4, %r3, 8;\n" -" mov.u32 %r5, %tid.x;\n" -" add.u32 %r6, %r2, %r5;\n" -" mov.u32 %r7, %tid.y;\n" -" add.u32 %r8, %r4, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_transpose_rows_in];\n" -" ld.param.s32 %r10, [__cudaparm_transpose_columns_in];\n" -" set.gt.u32.u32 %r11, %r9, %r8;\n" -" neg.s32 %r12, %r11;\n" -" set.gt.u32.u32 %r13, %r10, %r6;\n" -" neg.s32 %r14, %r13;\n" -" and.b32 %r15, %r12, %r14;\n" -" mov.u32 %r16, 0;\n" -" setp.eq.s32 %p1, %r15, %r16;\n" -" @%p1 bra $Lt_2_2306;\n" -" .loc 16 98 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32571_32_non_const_block112;\n" -" ld.param.u64 %rd2, [__cudaparm_transpose_in];\n" -" mul.lo.u32 %r17, %r10, %r8;\n" -" add.u32 %r18, %r6, %r17;\n" -" cvt.u64.u32 %rd3, %r18;\n" -" mul.wide.u32 %rd4, %r18, 4;\n" -" add.u64 %rd5, %rd2, %rd4;\n" -" ld.global.s32 %r19, [%rd5+0];\n" -" cvt.rn.f32.s32 %f1, %r19;\n" -" cvt.u64.u32 %rd6, %r5;\n" -" cvt.u64.u32 %rd7, %r7;\n" -" mul.wide.u32 %rd8, %r7, 9;\n" -" add.u64 %rd9, %rd6, %rd8;\n" -" mul.lo.u64 %rd10, %rd9, 4;\n" -" add.u64 %rd11, %rd1, %rd10;\n" -" st.shared.f32 [%rd11+0], %f1;\n" -"$Lt_2_2306:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32571_32_non_const_block112;\n" -" .loc 16 100 0\n" -" bar.sync 0;\n" -" add.u32 %r20, %r2, %r7;\n" -" add.u32 %r21, %r4, %r5;\n" -" set.gt.u32.u32 %r22, %r9, %r21;\n" -" neg.s32 %r23, %r22;\n" -" set.gt.u32.u32 %r24, %r10, %r20;\n" -" neg.s32 %r25, %r24;\n" -" and.b32 %r26, %r23, %r25;\n" -" mov.u32 %r27, 0;\n" -" setp.eq.s32 %p2, %r26, %r27;\n" -" @%p2 bra $Lt_2_2818;\n" -" .loc 16 105 0\n" -" cvt.u64.u32 %rd12, %r7;\n" -" cvt.u64.u32 %rd13, %r5;\n" -" mul.wide.u32 %rd14, %r5, 9;\n" -" add.u64 %rd15, %rd12, %rd14;\n" -" mul.lo.u64 %rd16, %rd15, 4;\n" -" add.u64 %rd17, %rd1, %rd16;\n" -" ld.shared.f32 %f2, [%rd17+0];\n" -" cvt.rzi.ftz.s32.f32 %r28, %f2;\n" -" ld.param.u64 %rd18, [__cudaparm_transpose_out];\n" -" mul.lo.u32 %r29, %r9, %r20;\n" -" add.u32 %r30, %r21, %r29;\n" -" cvt.u64.u32 %rd19, %r30;\n" -" mul.wide.u32 %rd20, %r30, 4;\n" -" add.u64 %rd21, %rd18, %rd20;\n" -" st.global.s32 [%rd21+0], %r28;\n" -"$Lt_2_2818:\n" -" .loc 16 106 0\n" -" exit;\n" -"$LDWend_transpose:\n" -" }\n" -" .entry calc_neigh_list_cell (\n" -" .param .u64 __cudaparm_calc_neigh_list_cell_x_,\n" -" .param .u64 __cudaparm_calc_neigh_list_cell_cell_particle_id,\n" -" .param .u64 __cudaparm_calc_neigh_list_cell_cell_counts,\n" -" .param .u64 __cudaparm_calc_neigh_list_cell_nbor_list,\n" -" .param .u64 __cudaparm_calc_neigh_list_cell_host_nbor_list,\n" -" .param .u64 __cudaparm_calc_neigh_list_cell_host_numj,\n" -" .param .s32 __cudaparm_calc_neigh_list_cell_neigh_bin_size,\n" -" .param .f32 __cudaparm_calc_neigh_list_cell_cell_size,\n" -" .param .s32 __cudaparm_calc_neigh_list_cell_ncellx,\n" -" .param .s32 __cudaparm_calc_neigh_list_cell_ncelly,\n" -" .param .s32 __cudaparm_calc_neigh_list_cell_ncellz,\n" -" .param .s32 __cudaparm_calc_neigh_list_cell_inum,\n" -" .param .s32 __cudaparm_calc_neigh_list_cell_nt,\n" -" .param .s32 __cudaparm_calc_neigh_list_cell_nall,\n" -" .param .s32 __cudaparm_calc_neigh_list_cell_t_per_atom)\n" -" {\n" -" .reg .u32 %r<118>;\n" -" .reg .u64 %rd<52>;\n" -" .reg .f32 %f<41>;\n" -" .reg .f64 %fd<4>;\n" -" .reg .pred %p<23>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32609_34_non_const_pos_sh496[2048];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32608_31_non_const_cell_list_sh2544[512];\n" -" .loc 16 116 0\n" -"$LDWbegin_calc_neigh_list_cell:\n" -" .loc 16 128 0\n" -" ld.param.s32 %r1, [__cudaparm_calc_neigh_list_cell_ncelly];\n" -" mov.u32 %r2, %ctaid.y;\n" -" rem.u32 %r3, %r2, %r1;\n" -" div.u32 %r4, %r2, %r1;\n" -" ld.param.s32 %r5, [__cudaparm_calc_neigh_list_cell_ncellx];\n" -" mul.lo.s32 %r6, %r5, %r3;\n" -" mul.lo.s32 %r7, %r5, %r4;\n" -" mul.lo.s32 %r8, %r7, %r1;\n" -" cvt.s32.u32 %r9, %ctaid.x;\n" -" ld.param.u64 %rd1, [__cudaparm_calc_neigh_list_cell_cell_counts];\n" -" add.s32 %r10, %r6, %r8;\n" -" add.s32 %r11, %r9, %r10;\n" -" cvt.s64.s32 %rd2, %r11;\n" -" mul.wide.s32 %rd3, %r11, 4;\n" -" add.u64 %rd4, %rd1, %rd3;\n" -" ldu.global.s32 %r12, [%rd4+0];\n" -" .loc 16 129 0\n" -" ldu.global.s32 %r13, [%rd4+4];\n" -" .loc 16 137 0\n" -" sub.s32 %r14, %r13, %r12;\n" -" mov.u32 %r15, %ntid.x;\n" -" cvt.rn.f32.u32 %f1, %r15;\n" -" cvt.rn.f32.s32 %f2, %r14;\n" -" div.approx.ftz.f32 %f3, %f2, %f1;\n" -" cvt.rpi.ftz.f32.f32 %f4, %f3;\n" -" cvt.rzi.ftz.s32.f32 %r16, %f4;\n" -" mov.u32 %r17, 0;\n" -" setp.le.s32 %p1, %r16, %r17;\n" -" @%p1 bra $Lt_3_14082;\n" -" sub.s32 %r18, %r3, 1;\n" -" mov.s32 %r19, 0;\n" -" max.s32 %r20, %r18, %r19;\n" -" sub.s32 %r21, %r1, 1;\n" -" add.s32 %r22, %r3, 1;\n" -" min.s32 %r23, %r21, %r22;\n" -" ld.param.s32 %r24, [__cudaparm_calc_neigh_list_cell_ncellz];\n" -" sub.s32 %r25, %r24, 1;\n" -" add.s32 %r26, %r4, 1;\n" -" min.s32 %r27, %r25, %r26;\n" -" sub.s32 %r28, %r9, 1;\n" -" mov.s32 %r29, 0;\n" -" max.s32 %r30, %r28, %r29;\n" -" add.s32 %r31, %r9, 1;\n" -" sub.s32 %r32, %r5, 1;\n" -" min.s32 %r33, %r31, %r32;\n" -" mov.s32 %r34, %r16;\n" -" cvt.s32.u32 %r35, %tid.x;\n" -" add.s32 %r36, %r12, %r35;\n" -" mov.u32 %r37, 0;\n" -" ld.param.s32 %r38, [__cudaparm_calc_neigh_list_cell_inum];\n" -" cvt.s64.s32 %rd5, %r38;\n" -" sub.s32 %r39, %r4, 1;\n" -" mov.s32 %r40, %r36;\n" -" mov.s32 %r41, 0;\n" -" max.s32 %r42, %r39, %r41;\n" -" setp.ge.s32 %p2, %r27, %r42;\n" -" ld.param.s32 %r43, [__cudaparm_calc_neigh_list_cell_nt];\n" -" ld.param.s32 %r44, [__cudaparm_calc_neigh_list_cell_nall];\n" -" mov.s32 %r45, 0;\n" -" mov.u64 %rd6, __cuda___cuda_local_var_32609_34_non_const_pos_sh496;\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32608_31_non_const_cell_list_sh2544;\n" -" mov.s32 %r46, %r34;\n" -"$Lt_3_14594:\n" -" .loc 16 140 0\n" -" mov.s32 %r47, %r44;\n" -" setp.ge.s32 %p3, %r40, %r13;\n" -" @%p3 bra $Lt_3_14850;\n" -" .loc 16 146 0\n" -" ld.param.u64 %rd8, [__cudaparm_calc_neigh_list_cell_cell_particle_id];\n" -" add.u32 %r48, %r36, %r37;\n" -" cvt.s64.s32 %rd9, %r48;\n" -" mul.wide.s32 %rd10, %r48, 4;\n" -" add.u64 %rd11, %rd8, %rd10;\n" -" ld.global.s32 %r47, [%rd11+0];\n" -"$Lt_3_14850:\n" -" setp.lt.s32 %p4, %r47, %r43;\n" -" @!%p4 bra $Lt_3_15362;\n" -" .loc 16 149 0\n" -" mov.u32 %r49, %r47;\n" -" mov.s32 %r50, 0;\n" -" mov.u32 %r51, %r50;\n" -" mov.s32 %r52, 0;\n" -" mov.u32 %r53, %r52;\n" -" mov.s32 %r54, 0;\n" -" mov.u32 %r55, %r54;\n" -" tex.1d.v4.f32.s32 {%f5,%f6,%f7,%f8},[neigh_tex,{%r49,%r51,%r53,%r55}];\n" -" mov.f32 %f9, %f5;\n" -" mov.f32 %f10, %f6;\n" -" mov.f32 %f11, %f7;\n" -" mov.f32 %f12, %f9;\n" -" mov.f32 %f13, %f10;\n" -" mov.f32 %f14, %f11;\n" -"$Lt_3_15362:\n" -" cvt.s64.s32 %rd12, %r47;\n" -" mul.wide.s32 %rd13, %r47, 4;\n" -" setp.ge.s32 %p5, %r47, %r38;\n" -" @%p5 bra $Lt_3_16130;\n" -" .loc 16 153 0\n" -" ld.param.u64 %rd14, [__cudaparm_calc_neigh_list_cell_nbor_list];\n" -" add.u64 %rd15, %rd12, %rd5;\n" -" mul.lo.u64 %rd16, %rd15, 4;\n" -" add.u64 %rd17, %rd14, %rd16;\n" -" mov.s64 %rd18, %rd17;\n" -" .loc 16 154 0\n" -" ld.param.s32 %r56, [__cudaparm_calc_neigh_list_cell_t_per_atom];\n" -" sub.s32 %r57, %r56, 1;\n" -" mul.lo.s32 %r58, %r47, %r57;\n" -" cvt.s64.s32 %rd19, %r58;\n" -" add.u64 %rd20, %rd19, %rd5;\n" -" mul.lo.u64 %rd21, %rd20, 4;\n" -" add.u64 %rd22, %rd17, %rd21;\n" -" .loc 16 155 0\n" -" mul.lo.s32 %r59, %r56, %r38;\n" -" sub.s32 %r60, %r59, %r56;\n" -" .loc 16 156 0\n" -" add.u64 %rd23, %rd13, %rd14;\n" -" st.global.s32 [%rd23+0], %r47;\n" -" bra.uni $Lt_3_15874;\n" -"$Lt_3_16130:\n" -" .loc 16 159 0\n" -" ld.param.u64 %rd24, [__cudaparm_calc_neigh_list_cell_host_numj];\n" -" add.u64 %rd25, %rd24, %rd13;\n" -" mul.lo.u64 %rd26, %rd5, 4;\n" -" sub.u64 %rd18, %rd25, %rd26;\n" -" .loc 16 160 0\n" -" ld.param.u64 %rd27, [__cudaparm_calc_neigh_list_cell_host_nbor_list];\n" -" ld.param.s32 %r61, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];\n" -" sub.s32 %r62, %r47, %r38;\n" -" mul.lo.s32 %r63, %r61, %r62;\n" -" cvt.s64.s32 %rd28, %r63;\n" -" mul.wide.s32 %rd29, %r63, 4;\n" -" add.u64 %rd22, %rd27, %rd29;\n" -" mov.s32 %r60, 0;\n" -"$Lt_3_15874:\n" -" .loc 16 165 0\n" -" mov.s32 %r64, %r42;\n" -" @!%p2 bra $Lt_3_24066;\n" -" sub.s32 %r65, %r27, %r42;\n" -" add.s32 %r66, %r65, 1;\n" -" setp.le.s32 %p6, %r20, %r23;\n" -" add.s32 %r67, %r27, 1;\n" -" mov.s32 %r68, 0;\n" -" mov.s32 %r69, %r66;\n" -"$Lt_3_16898:\n" -" .loc 16 166 0\n" -" mov.s32 %r70, %r20;\n" -" @!%p6 bra $Lt_3_17154;\n" -" sub.s32 %r71, %r23, %r20;\n" -" add.s32 %r72, %r71, 1;\n" -" setp.ge.s32 %p7, %r33, %r30;\n" -" add.s32 %r73, %r23, 1;\n" -" mov.s32 %r74, %r72;\n" -"$Lt_3_17666:\n" -" @!%p7 bra $Lt_3_17922;\n" -" sub.s32 %r75, %r33, %r30;\n" -" add.s32 %r76, %r75, 1;\n" -" mul.lo.s32 %r77, %r70, %r5;\n" -" mul.lo.s32 %r78, %r64, %r5;\n" -" mul.lo.s32 %r79, %r78, %r1;\n" -" add.s32 %r80, %r33, 1;\n" -" add.s32 %r81, %r77, %r79;\n" -" add.s32 %r82, %r81, %r30;\n" -" add.s32 %r83, %r80, %r81;\n" -" cvt.s64.s32 %rd30, %r82;\n" -" mul.wide.s32 %rd31, %r82, 4;\n" -" add.u64 %rd32, %rd1, %rd31;\n" -" mov.s32 %r84, %r76;\n" -"$Lt_3_18434:\n" -" .loc 16 171 0\n" -" ld.global.s32 %r85, [%rd32+0];\n" -" .loc 16 172 0\n" -" ld.global.s32 %r86, [%rd32+4];\n" -" .loc 16 176 0\n" -" sub.s32 %r87, %r86, %r85;\n" -" cvt.rn.f32.s32 %f15, %r87;\n" -" mov.f32 %f16, 0f43000000; \n" -" div.approx.ftz.f32 %f17, %f15, %f16;\n" -" cvt.rpi.ftz.f32.f32 %f18, %f17;\n" -" cvt.rzi.ftz.s32.f32 %r88, %f18;\n" -" mov.u32 %r89, 0;\n" -" setp.le.s32 %p8, %r88, %r89;\n" -" @%p8 bra $Lt_3_18690;\n" -" mov.s32 %r90, %r88;\n" -" mov.s32 %r91, 0;\n" -" setp.lt.s32 %p9, %r47, %r43;\n" -" mul.lo.s32 %r92, %r88, 128;\n" -" mov.s32 %r93, %r90;\n" -"$Lt_3_19202:\n" -" sub.s32 %r94, %r87, %r91;\n" -" mov.s32 %r95, 128;\n" -" min.s32 %r96, %r94, %r95;\n" -" setp.le.s32 %p10, %r96, %r35;\n" -" @%p10 bra $Lt_3_19458;\n" -" .loc 16 183 0\n" -" ld.param.u64 %rd33, [__cudaparm_calc_neigh_list_cell_cell_particle_id];\n" -" add.s32 %r97, %r91, %r35;\n" -" add.s32 %r98, %r85, %r97;\n" -" cvt.s64.s32 %rd34, %r98;\n" -" mul.wide.s32 %rd35, %r98, 4;\n" -" add.u64 %rd36, %rd33, %rd35;\n" -" ld.global.s32 %r99, [%rd36+0];\n" -" .loc 16 184 0\n" -" cvt.s64.s32 %rd37, %r35;\n" -" mul.wide.s32 %rd38, %r35, 4;\n" -" add.u64 %rd39, %rd7, %rd38;\n" -" st.shared.s32 [%rd39+0], %r99;\n" -" .loc 16 185 0\n" -" mov.u32 %r100, %r99;\n" -" mov.s32 %r101, 0;\n" -" mov.u32 %r102, %r101;\n" -" mov.s32 %r103, 0;\n" -" mov.u32 %r104, %r103;\n" -" mov.s32 %r105, 0;\n" -" mov.u32 %r106, %r105;\n" -" tex.1d.v4.f32.s32 {%f19,%f20,%f21,%f22},[neigh_tex,{%r100,%r102,%r104,%r106}];\n" -" mov.f32 %f23, %f19;\n" -" mov.f32 %f24, %f20;\n" -" mov.f32 %f25, %f21;\n" -" .loc 16 186 0\n" -" mul.lo.u64 %rd40, %rd37, 16;\n" -" add.u64 %rd41, %rd6, %rd40;\n" -" st.shared.v2.f32 [%rd41+0], {%f23,%f24};\n" -" .loc 16 188 0\n" -" st.shared.f32 [%rd41+8], %f25;\n" -"$Lt_3_19458:\n" -" .loc 16 190 0\n" -" bar.sync 0;\n" -" @!%p9 bra $Lt_3_20482;\n" -" mov.u32 %r107, 0;\n" -" setp.le.s32 %p11, %r96, %r107;\n" -" @%p11 bra $Lt_3_20482;\n" -" mov.s32 %r108, %r96;\n" -" mov.s64 %rd42, 0;\n" -" ld.param.f32 %f26, [__cudaparm_calc_neigh_list_cell_cell_size];\n" -" mul.ftz.f32 %f27, %f26, %f26;\n" -" mov.s64 %rd43, %rd6;\n" -" mov.f32 %f28, %f14;\n" -" mov.f32 %f29, %f13;\n" -" mov.f32 %f30, %f12;\n" -" mov.s32 %r109, 0;\n" -" mov.s32 %r110, %r108;\n" -"$Lt_3_20994:\n" -" ld.shared.v4.f32 {%f31,%f32,%f33,_}, [%rd43+0];\n" -" .loc 16 196 0\n" -" sub.ftz.f32 %f34, %f30, %f31;\n" -" .loc 16 197 0\n" -" sub.ftz.f32 %f35, %f29, %f32;\n" -" .loc 16 198 0\n" -" sub.ftz.f32 %f36, %f28, %f33;\n" -" .loc 16 195 0\n" -" mul.ftz.f32 %f37, %f35, %f35;\n" -" fma.rn.ftz.f32 %f38, %f34, %f34, %f37;\n" -" fma.rn.ftz.f32 %f39, %f36, %f36, %f38;\n" -" setp.gt.ftz.f32 %p12, %f27, %f39;\n" -" @!%p12 bra $Lt_3_25346;\n" -" cvt.ftz.f64.f32 %fd1, %f39;\n" -" mov.f64 %fd2, 0d3ee4f8b588e368f1; \n" -" setp.gt.f64 %p13, %fd1, %fd2;\n" -" @!%p13 bra $Lt_3_25346;\n" -" .loc 16 202 0\n" -" add.s32 %r68, %r68, 1;\n" -" ld.param.s32 %r111, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];\n" -" setp.lt.s32 %p14, %r111, %r68;\n" -" @%p14 bra $Lt_3_25346;\n" -" .loc 16 204 0\n" -" mul.lo.u64 %rd44, %rd42, 4;\n" -" add.u64 %rd45, %rd7, %rd44;\n" -" ld.shared.s32 %r112, [%rd45+0];\n" -" st.global.s32 [%rd22+0], %r112;\n" -" cvt.s64.s32 %rd46, %r60;\n" -" mul.wide.s32 %rd47, %r60, 4;\n" -" add.u64 %rd48, %rd22, %rd47;\n" -" add.u64 %rd49, %rd48, 4;\n" -" add.u64 %rd50, %rd22, 4;\n" -" ld.param.s32 %r113, [__cudaparm_calc_neigh_list_cell_t_per_atom];\n" -" sub.s32 %r114, %r113, 1;\n" -" and.b32 %r115, %r68, %r114;\n" -" mov.s32 %r116, 0;\n" -" setp.eq.s32 %p15, %r115, %r116;\n" -" selp.u64 %rd22, %rd49, %rd50, %p15;\n" -"$Lt_3_25346:\n" -"$L_3_13570:\n" -" .loc 16 202 0\n" -" add.s32 %r109, %r109, 1;\n" -" add.s64 %rd42, %rd42, 1;\n" -" add.u64 %rd43, %rd43, 16;\n" -" setp.ne.s32 %p16, %r96, %r109;\n" -" @%p16 bra $Lt_3_20994;\n" -"$Lt_3_20482:\n" -"$Lt_3_19970:\n" -" .loc 16 212 0\n" -" bar.sync 0;\n" -" add.s32 %r91, %r91, 128;\n" -" setp.ne.s32 %p17, %r91, %r92;\n" -" @%p17 bra $Lt_3_19202;\n" -"$Lt_3_18690:\n" -" add.s32 %r82, %r82, 1;\n" -" add.u64 %rd32, %rd32, 4;\n" -" setp.ne.s32 %p18, %r82, %r83;\n" -" @%p18 bra $Lt_3_18434;\n" -"$Lt_3_17922:\n" -" add.s32 %r70, %r70, 1;\n" -" setp.ne.s32 %p19, %r73, %r70;\n" -" @%p19 bra $Lt_3_17666;\n" -"$Lt_3_17154:\n" -" add.s32 %r64, %r64, 1;\n" -" setp.ne.s32 %p20, %r67, %r64;\n" -" @%p20 bra $Lt_3_16898;\n" -" bra.uni $Lt_3_16386;\n" -"$Lt_3_24066:\n" -" mov.s32 %r68, 0;\n" -"$Lt_3_16386:\n" -" @!%p4 bra $Lt_3_23042;\n" -" .loc 16 218 0\n" -" st.global.s32 [%rd18+0], %r68;\n" -"$Lt_3_23042:\n" -" add.s32 %r45, %r45, 1;\n" -" add.u32 %r37, %r37, %r15;\n" -" add.s32 %r40, %r40, %r15;\n" -" setp.ne.s32 %p21, %r16, %r45;\n" -" @%p21 bra $Lt_3_14594;\n" -"$Lt_3_14082:\n" -" .loc 16 220 0\n" -" exit;\n" -"$LDWend_calc_neigh_list_cell:\n" -" }\n" -" .entry kernel_special (\n" -" .param .u64 __cudaparm_kernel_special_dev_nbor,\n" -" .param .u64 __cudaparm_kernel_special_host_nbor_list,\n" -" .param .u64 __cudaparm_kernel_special_host_numj,\n" -" .param .u64 __cudaparm_kernel_special_tag,\n" -" .param .u64 __cudaparm_kernel_special_nspecial,\n" -" .param .u64 __cudaparm_kernel_special_special,\n" -" .param .s32 __cudaparm_kernel_special_inum,\n" -" .param .s32 __cudaparm_kernel_special_nt,\n" -" .param .s32 __cudaparm_kernel_special_max_nbors,\n" -" .param .s32 __cudaparm_kernel_special_t_per_atom)\n" -" {\n" -" .reg .u32 %r<45>;\n" -" .reg .u64 %rd<45>;\n" -" .reg .pred %p<11>;\n" -" .loc 16 226 0\n" -"$LDWbegin_kernel_special:\n" -" ld.param.s32 %r1, [__cudaparm_kernel_special_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_special_nt];\n" -" setp.ge.s32 %p1, %r8, %r9;\n" -" @%p1 bra $Lt_4_6146;\n" -" .loc 16 236 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_special_nspecial];\n" -" mul.lo.s32 %r10, %r8, 3;\n" -" cvt.s64.s32 %rd2, %r10;\n" -" mul.wide.s32 %rd3, %r10, 4;\n" -" add.u64 %rd4, %rd1, %rd3;\n" -" ld.global.s32 %r11, [%rd4+0];\n" -" .loc 16 237 0\n" -" ld.global.s32 %r12, [%rd4+4];\n" -" .loc 16 238 0\n" -" ld.global.s32 %r13, [%rd4+8];\n" -" ld.param.s32 %r14, [__cudaparm_kernel_special_inum];\n" -" setp.ge.s32 %p2, %r8, %r14;\n" -" @%p2 bra $Lt_4_6914;\n" -" .loc 16 244 0\n" -" ld.param.u64 %rd5, [__cudaparm_kernel_special_dev_nbor];\n" -" cvt.s64.s32 %rd6, %r8;\n" -" cvt.s64.s32 %rd7, %r14;\n" -" add.u64 %rd8, %rd6, %rd7;\n" -" mul.lo.u64 %rd9, %rd8, 4;\n" -" add.u64 %rd10, %rd5, %rd9;\n" -" ld.global.s32 %r15, [%rd10+0];\n" -" .loc 16 246 0\n" -" mul.lo.s32 %r16, %r14, %r1;\n" -" mov.s32 %r17, %r16;\n" -" .loc 16 248 0\n" -" sub.s32 %r18, %r1, 1;\n" -" mul.lo.s32 %r19, %r18, %r8;\n" -" add.s32 %r20, %r14, %r19;\n" -" cvt.s64.s32 %rd11, %r20;\n" -" mul.wide.s32 %rd12, %r20, 4;\n" -" add.u64 %rd13, %rd10, %rd12;\n" -" and.b32 %r21, %r18, %r15;\n" -" cvt.s64.s32 %rd14, %r21;\n" -" div.s32 %r22, %r15, %r1;\n" -" mul.lo.s32 %r23, %r16, %r22;\n" -" cvt.s64.s32 %rd15, %r23;\n" -" add.u64 %rd16, %rd14, %rd15;\n" -" mul.lo.u64 %rd17, %rd16, 4;\n" -" add.u64 %rd18, %rd13, %rd17;\n" -" .loc 16 249 0\n" -" and.b32 %r24, %r18, %r2;\n" -" cvt.s64.s32 %rd19, %r24;\n" -" mul.wide.s32 %rd20, %r24, 4;\n" -" add.u64 %rd21, %rd13, %rd20;\n" -" bra.uni $Lt_4_6658;\n" -"$Lt_4_6914:\n" -" .loc 16 252 0\n" -" sub.s32 %r25, %r8, %r14;\n" -" ld.param.u64 %rd22, [__cudaparm_kernel_special_host_nbor_list];\n" -" ld.param.s32 %r26, [__cudaparm_kernel_special_max_nbors];\n" -" mul.lo.s32 %r27, %r26, %r25;\n" -" cvt.s64.s32 %rd23, %r27;\n" -" mul.wide.s32 %rd24, %r27, 4;\n" -" add.u64 %rd25, %rd22, %rd24;\n" -" mov.s64 %rd21, %rd25;\n" -" .loc 16 254 0\n" -" ld.param.u64 %rd26, [__cudaparm_kernel_special_host_numj];\n" -" cvt.s64.s32 %rd27, %r25;\n" -" mul.wide.s32 %rd28, %r25, 4;\n" -" add.u64 %rd29, %rd26, %rd28;\n" -" ld.global.s32 %r28, [%rd29+0];\n" -" cvt.s64.s32 %rd30, %r28;\n" -" mul.wide.s32 %rd31, %r28, 4;\n" -" add.u64 %rd18, %rd25, %rd31;\n" -" mov.s32 %r17, 1;\n" -"$Lt_4_6658:\n" -" setp.ge.u64 %p3, %rd21, %rd18;\n" -" @%p3 bra $Lt_4_7170;\n" -" mov.s32 %r29, 0;\n" -" setp.gt.s32 %p4, %r13, %r29;\n" -" cvt.s64.s32 %rd32, %r17;\n" -" ld.param.u64 %rd33, [__cudaparm_kernel_special_tag];\n" -"$Lt_4_7682:\n" -" .loc 16 258 0\n" -" ld.global.s32 %r30, [%rd21+0];\n" -" .loc 16 259 0\n" -" cvt.s64.s32 %rd34, %r30;\n" -" mul.wide.s32 %rd35, %r30, 4;\n" -" add.u64 %rd36, %rd33, %rd35;\n" -" ld.global.s32 %r31, [%rd36+0];\n" -" @!%p4 bra $Lt_4_7938;\n" -" mov.s32 %r32, %r13;\n" -" cvt.s64.s32 %rd37, %r8;\n" -" cvt.s64.s32 %rd38, %r9;\n" -" mul.wide.s32 %rd39, %r9, 4;\n" -" ld.param.u64 %rd40, [__cudaparm_kernel_special_special];\n" -" mul.wide.s32 %rd41, %r8, 4;\n" -" add.u64 %rd42, %rd40, %rd41;\n" -" mov.s32 %r33, 0;\n" -" mov.s32 %r34, %r32;\n" -"$Lt_4_8450:\n" -" ld.global.s32 %r35, [%rd42+0];\n" -" setp.ne.s32 %p5, %r35, %r31;\n" -" @%p5 bra $Lt_4_8706;\n" -" .loc 16 269 0\n" -" setp.le.s32 %p6, %r11, %r33;\n" -" mov.s32 %r36, 3;\n" -" mov.s32 %r37, 2;\n" -" selp.s32 %r38, %r36, %r37, %p6;\n" -" mov.s32 %r39, 2;\n" -" mov.s32 %r40, 1;\n" -" selp.s32 %r41, %r39, %r40, %p6;\n" -" setp.le.s32 %p7, %r12, %r33;\n" -" selp.s32 %r42, %r38, %r41, %p7;\n" -" shl.b32 %r43, %r42, 30;\n" -" xor.b32 %r30, %r30, %r43;\n" -" .loc 16 270 0\n" -" st.global.s32 [%rd21+0], %r30;\n" -"$Lt_4_8706:\n" -" add.s32 %r33, %r33, 1;\n" -" add.u64 %rd42, %rd39, %rd42;\n" -" setp.ne.s32 %p8, %r13, %r33;\n" -" @%p8 bra $Lt_4_8450;\n" -"$Lt_4_7938:\n" -" .loc 16 257 0\n" -" mul.lo.u64 %rd43, %rd32, 4;\n" -" add.u64 %rd21, %rd21, %rd43;\n" -" setp.lt.u64 %p9, %rd21, %rd18;\n" -" @%p9 bra $Lt_4_7682;\n" -"$Lt_4_7170:\n" -"$Lt_4_6146:\n" -" .loc 16 276 0\n" -" exit;\n" -"$LDWend_kernel_special:\n" -" }\n" -; diff --git a/lib/gpu/pppm_d.ptx b/lib/gpu/pppm_d.ptx deleted file mode 100644 index 9f1eb71503..0000000000 --- a/lib/gpu/pppm_d.ptx +++ /dev/null @@ -1,900 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009b29_00000000-9_lal_pppm.cpp3.i (/home/sjplimp/ccBI#.sIoydv) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009b29_00000000-8_lal_pppm.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 17 "lal_pppm.cu" - .file 18 "/usr/local/cuda/include/common_functions.h" - .file 19 "/usr/local/cuda/include/math_functions.h" - .file 20 "/usr/local/cuda/include/math_constants.h" - .file 21 "/usr/local/cuda/include/device_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - .global .texref pos_tex; - .global .texref q_tex; - - .entry particle_map ( - .param .u64 __cudaparm_particle_map_x_, - .param .u64 __cudaparm_particle_map_q_, - .param .f64 __cudaparm_particle_map_delvolinv, - .param .s32 __cudaparm_particle_map_nlocal, - .param .u64 __cudaparm_particle_map_counts, - .param .u64 __cudaparm_particle_map_ans, - .param .f64 __cudaparm_particle_map_b_lo_x, - .param .f64 __cudaparm_particle_map_b_lo_y, - .param .f64 __cudaparm_particle_map_b_lo_z, - .param .f64 __cudaparm_particle_map_delxinv, - .param .f64 __cudaparm_particle_map_delyinv, - .param .f64 __cudaparm_particle_map_delzinv, - .param .s32 __cudaparm_particle_map_nlocal_x, - .param .s32 __cudaparm_particle_map_nlocal_y, - .param .s32 __cudaparm_particle_map_nlocal_z, - .param .s32 __cudaparm_particle_map_atom_stride, - .param .s32 __cudaparm_particle_map_max_atoms, - .param .u64 __cudaparm_particle_map_error) - { - .reg .u32 %r<50>; - .reg .u64 %rd<12>; - .reg .f32 %f<14>; - .reg .f64 %fd<36>; - .reg .pred %p<11>; - .loc 17 50 0 -$LDWbegin_particle_map: - cvt.s32.u32 %r1, %ntid.x; - cvt.s32.u32 %r2, %ctaid.x; - mul24.lo.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %nctaid.x; - mul24.lo.s32 %r5, %r4, %r1; - mov.u32 %r6, %tid.x; - add.u32 %r7, %r3, %r6; - sub.s32 %r8, %r5, 1; - mul.lo.s32 %r9, %r7, 64; - div.s32 %r10, %r9, %r5; - mul.lo.s32 %r11, %r8, %r10; - sub.s32 %r12, %r9, %r11; - ld.param.s32 %r13, [__cudaparm_particle_map_nlocal]; - setp.le.s32 %p1, %r13, %r12; - @%p1 bra $Lt_0_7426; - .loc 17 62 0 - mov.u32 %r14, %r12; - mov.s32 %r15, 0; - mov.u32 %r16, %r15; - mov.s32 %r17, 0; - mov.u32 %r18, %r17; - mov.s32 %r19, 0; - mov.u32 %r20, %r19; - tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}]; - mov.f32 %f5, %f1; - mov.f32 %f6, %f2; - mov.f32 %f7, %f3; - .loc 17 64 0 - mov.u32 %r21, %r12; - mov.s32 %r22, 0; - mov.u32 %r23, %r22; - mov.s32 %r24, 0; - mov.u32 %r25, %r24; - mov.s32 %r26, 0; - mov.u32 %r27, %r26; - tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}]; - mov.f32 %f12, %f8; - cvt.ftz.f64.f32 %fd1, %f12; - ld.param.f64 %fd2, [__cudaparm_particle_map_delvolinv]; - mul.f64 %fd3, %fd1, %fd2; - mov.f64 %fd4, 0d0000000000000000; // 0 - setp.neu.f64 %p2, %fd3, %fd4; - @!%p2 bra $Lt_0_7426; - .loc 17 67 0 - ld.param.f64 %fd5, [__cudaparm_particle_map_delxinv]; - cvt.ftz.f64.f32 %fd6, %f5; - ld.param.f64 %fd7, [__cudaparm_particle_map_b_lo_x]; - sub.f64 %fd8, %fd6, %fd7; - mul.f64 %fd9, %fd5, %fd8; - mov.f64 %fd10, 0d0000000000000000; // 0 - setp.lt.f64 %p3, %fd9, %fd10; - @%p3 bra $Lt_0_8706; - ld.param.f64 %fd11, [__cudaparm_particle_map_delyinv]; - cvt.ftz.f64.f32 %fd12, %f6; - ld.param.f64 %fd13, [__cudaparm_particle_map_b_lo_y]; - sub.f64 %fd14, %fd12, %fd13; - mul.f64 %fd15, %fd11, %fd14; - mov.f64 %fd16, 0d0000000000000000; // 0 - setp.lt.f64 %p4, %fd15, %fd16; - @%p4 bra $Lt_0_8706; - ld.param.f64 %fd17, [__cudaparm_particle_map_delzinv]; - cvt.ftz.f64.f32 %fd18, %f7; - ld.param.f64 %fd19, [__cudaparm_particle_map_b_lo_z]; - sub.f64 %fd20, %fd18, %fd19; - mul.f64 %fd21, %fd17, %fd20; - mov.f64 %fd22, 0d0000000000000000; // 0 - setp.lt.f64 %p5, %fd21, %fd22; - @%p5 bra $Lt_0_8706; - cvt.rzi.s32.f64 %r28, %fd9; - ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x]; - setp.ge.s32 %p6, %r28, %r29; - @%p6 bra $Lt_0_8706; - cvt.rzi.s32.f64 %r30, %fd15; - ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y]; - setp.ge.s32 %p7, %r30, %r31; - @%p7 bra $Lt_0_8706; - cvt.rzi.s32.f64 %r32, %fd21; - ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z]; - setp.gt.s32 %p8, %r33, %r32; - @%p8 bra $L_0_4866; -$Lt_0_8706: -$L_0_5122: - .loc 17 76 0 - mov.s32 %r34, 1; - ld.param.u64 %rd1, [__cudaparm_particle_map_error]; - st.global.s32 [%rd1+0], %r34; - bra.uni $Lt_0_7426; -$L_0_4866: - .loc 17 83 0 - mul.lo.s32 %r35, %r32, %r31; - add.s32 %r36, %r30, %r35; - mul.lo.s32 %r37, %r36, %r29; - add.s32 %r38, %r28, %r37; - ld.param.u64 %rd2, [__cudaparm_particle_map_counts]; - cvt.s64.s32 %rd3, %r38; - mul.wide.s32 %rd4, %r38, 4; - add.u64 %rd5, %rd2, %rd4; - mov.s32 %r39, 1; - atom.global.add.s32 %r40, [%rd5], %r39; - mov.s32 %r41, %r40; - ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms]; - setp.gt.s32 %p9, %r42, %r41; - @%p9 bra $Lt_0_7682; - .loc 17 85 0 - mov.s32 %r43, 2; - ld.param.u64 %rd6, [__cudaparm_particle_map_error]; - st.global.s32 [%rd6+0], %r43; - .loc 16 118 0 - mov.s32 %r44, -1; - atom.global.add.s32 %r45, [%rd5], %r44; - bra.uni $Lt_0_7426; -$Lt_0_7682: - .loc 17 88 0 - ld.param.u64 %rd7, [__cudaparm_particle_map_ans]; - ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride]; - mul.lo.s32 %r47, %r46, %r41; - add.s32 %r48, %r38, %r47; - cvt.s64.s32 %rd8, %r48; - mul.wide.s32 %rd9, %r48, 32; - add.u64 %rd10, %rd7, %rd9; - cvt.rn.f64.s32 %fd23, %r28; - mov.f64 %fd24, 0d3fe0000000000000; // 0.5 - add.f64 %fd25, %fd23, %fd24; - sub.f64 %fd26, %fd25, %fd9; - cvt.rn.f64.s32 %fd27, %r30; - mov.f64 %fd28, 0d3fe0000000000000; // 0.5 - add.f64 %fd29, %fd27, %fd28; - sub.f64 %fd30, %fd29, %fd15; - st.global.v2.f64 [%rd10+0], {%fd26,%fd30}; - cvt.rn.f64.s32 %fd31, %r32; - mov.f64 %fd32, 0d3fe0000000000000; // 0.5 - add.f64 %fd33, %fd31, %fd32; - sub.f64 %fd34, %fd33, %fd21; - st.global.v2.f64 [%rd10+16], {%fd34,%fd3}; -$Lt_0_7426: -$L_0_4610: -$Lt_0_6914: -$Lt_0_6402: - .loc 17 92 0 - exit; -$LDWend_particle_map: - } // particle_map - - .entry make_rho ( - .param .u64 __cudaparm_make_rho_counts, - .param .u64 __cudaparm_make_rho_atoms, - .param .u64 __cudaparm_make_rho_brick, - .param .u64 __cudaparm_make_rho__rho_coeff, - .param .s32 __cudaparm_make_rho_atom_stride, - .param .s32 __cudaparm_make_rho_npts_x, - .param .s32 __cudaparm_make_rho_npts_y, - .param .s32 __cudaparm_make_rho_npts_z, - .param .s32 __cudaparm_make_rho_nlocal_x, - .param .s32 __cudaparm_make_rho_nlocal_y, - .param .s32 __cudaparm_make_rho_nlocal_z, - .param .s32 __cudaparm_make_rho_order_m_1, - .param .s32 __cudaparm_make_rho_order, - .param .s32 __cudaparm_make_rho_order2) - { - .reg .u32 %r<119>; - .reg .u64 %rd<57>; - .reg .f64 %fd<26>; - .reg .pred %p<27>; - .shared .align 8 .b8 __cuda___cuda_local_var_32578_34_non_const_rho_coeff200[512]; - .shared .align 8 .b8 __cuda___cuda_local_var_32579_34_non_const_front712[640]; - .shared .align 8 .b8 __cuda___cuda_local_var_32580_34_non_const_ans1352[4096]; - .loc 17 101 0 -$LDWbegin_make_rho: - ld.param.s32 %r1, [__cudaparm_make_rho_order2]; - ld.param.s32 %r2, [__cudaparm_make_rho_order]; - add.s32 %r3, %r1, %r2; - cvt.s32.u32 %r4, %tid.x; - setp.le.s32 %p1, %r3, %r4; - @%p1 bra $Lt_1_16898; - .loc 17 108 0 - mov.u64 %rd1, __cuda___cuda_local_var_32578_34_non_const_rho_coeff200; - cvt.s64.s32 %rd2, %r4; - mul.wide.s32 %rd3, %r4, 8; - ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f64 %fd1, [%rd5+0]; - add.u64 %rd6, %rd3, %rd1; - st.shared.f64 [%rd6+0], %fd1; -$Lt_1_16898: - mov.u64 %rd1, __cuda___cuda_local_var_32578_34_non_const_rho_coeff200; - shr.s32 %r5, %r4, 31; - mov.s32 %r6, 31; - and.b32 %r7, %r5, %r6; - add.s32 %r8, %r7, %r4; - shr.s32 %r9, %r8, 5; - mul.lo.s32 %r10, %r9, 32; - sub.s32 %r11, %r4, %r10; - setp.lt.s32 %p2, %r11, %r2; - @!%p2 bra $Lt_1_17410; - .loc 17 114 0 - mov.u64 %rd7, __cuda___cuda_local_var_32579_34_non_const_front712; - mov.f64 %fd2, 0d0000000000000000; // 0 - cvt.s64.s32 %rd8, %r11; - shr.s32 %r12, %r4, 31; - mov.s32 %r13, 31; - and.b32 %r14, %r12, %r13; - add.s32 %r15, %r14, %r4; - shr.s32 %r16, %r15, 5; - cvt.s64.s32 %rd9, %r16; - mul.wide.s32 %rd10, %r16, 40; - add.u64 %rd11, %rd8, %rd10; - mul.lo.u64 %rd12, %rd11, 8; - add.u64 %rd13, %rd7, %rd12; - st.shared.f64 [%rd13+256], %fd2; -$Lt_1_17410: - mov.u64 %rd7, __cuda___cuda_local_var_32579_34_non_const_front712; - .loc 17 116 0 - bar.sync 0; - ld.param.s32 %r17, [__cudaparm_make_rho_npts_x]; - shr.s32 %r18, %r17, 31; - mov.s32 %r19, 31; - and.b32 %r20, %r18, %r19; - add.s32 %r21, %r20, %r17; - shr.s32 %r22, %r21, 5; - add.s32 %r23, %r22, 1; - mov.u32 %r24, 0; - setp.le.s32 %p3, %r23, %r24; - @%p3 bra $Lt_1_17922; - shr.s32 %r25, %r4, 31; - mov.s32 %r26, 31; - and.b32 %r27, %r25, %r26; - add.s32 %r28, %r27, %r4; - shr.s32 %r29, %r28, 5; - add.s32 %r30, %r11, 32; - ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y]; - ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x]; - mul.lo.s32 %r33, %r31, %r32; - mov.u32 %r34, %ctaid.x; - mul.lo.u32 %r35, %r34, 2; - add.u32 %r36, %r29, %r35; - ld.param.s32 %r37, [__cudaparm_make_rho_npts_y]; - div.s32 %r38, %r36, %r37; - ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1]; - setp.lt.s32 %p4, %r38, %r39; - sub.s32 %r40, %r39, %r38; - mov.s32 %r41, 0; - selp.s32 %r42, %r40, %r41, %p4; - ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z]; - setp.ge.s32 %p5, %r38, %r43; - sub.s32 %r44, %r43, %r38; - add.s32 %r45, %r44, %r2; - sub.s32 %r46, %r45, 1; - selp.s32 %r47, %r46, %r2, %p5; - rem.s32 %r48, %r36, %r37; - setp.lt.s32 %p6, %r48, %r39; - sub.s32 %r49, %r39, %r48; - mov.s32 %r50, 0; - selp.s32 %r51, %r49, %r50, %p6; - setp.ge.s32 %p7, %r48, %r31; - sub.s32 %r52, %r31, %r48; - add.s32 %r53, %r52, %r2; - sub.s32 %r54, %r53, 1; - selp.s32 %r55, %r54, %r2, %p7; - mov.s32 %r56, %r23; - mov.s32 %r57, 0; - setp.gt.s32 %p8, %r2, %r57; - mov.s32 %r58, 0; - cvt.s64.s32 %rd14, %r11; - cvt.s64.s32 %rd15, %r29; - mul.lo.s32 %r59, %r23, 32; - mul.wide.s32 %rd16, %r29, 40; - add.u64 %rd17, %rd14, %rd16; - ld.param.s32 %r60, [__cudaparm_make_rho_npts_z]; - setp.gt.s32 %p9, %r60, %r38; - mul.lo.u64 %rd18, %rd17, 8; - selp.s32 %r61, 1, 0, %p9; - add.u64 %rd19, %rd18, %rd7; - mov.u64 %rd20, __cuda___cuda_local_var_32580_34_non_const_ans1352; - mov.s32 %r62, %r56; -$Lt_1_18434: - // Loop body line 116, nesting depth: 1, estimated iterations: unknown - @!%p8 bra $Lt_1_18690; - mov.s32 %r63, %r2; - cvt.s64.s32 %rd21, %r4; - mul.wide.s32 %rd22, %r4, 8; - add.u64 %rd23, %rd20, %rd22; - mov.s32 %r64, 0; - mov.s32 %r65, %r63; -$Lt_1_19202: - // Loop body line 116, nesting depth: 2, estimated iterations: unknown - .loc 17 140 0 - mov.f64 %fd3, 0d0000000000000000; // 0 - st.shared.f64 [%rd23+0], %fd3; - add.s32 %r64, %r64, 1; - add.u64 %rd23, %rd23, 512; - setp.ne.s32 %p10, %r64, %r2; - @%p10 bra $Lt_1_19202; -$Lt_1_18690: - add.s32 %r66, %r11, %r58; - set.lt.u32.s32 %r67, %r66, %r32; - neg.s32 %r68, %r67; - and.b32 %r69, %r61, %r68; - mov.u32 %r70, 0; - setp.eq.s32 %p11, %r69, %r70; - @%p11 bra $Lt_1_20226; - .loc 17 143 0 - mov.s32 %r71, %r42; - setp.ge.s32 %p12, %r42, %r47; - @%p12 bra $Lt_1_20226; - sub.s32 %r72, %r47, %r42; - setp.lt.s32 %p13, %r51, %r55; - mov.s32 %r73, %r72; -$Lt_1_20738: - // Loop body line 143, nesting depth: 2, estimated iterations: unknown - .loc 17 145 0 - mov.s32 %r74, %r51; - @!%p13 bra $Lt_1_20994; - sub.s32 %r75, %r55, %r51; - sub.s32 %r76, %r71, %r42; - add.s32 %r77, %r38, %r42; - add.s32 %r78, %r48, %r51; - sub.s32 %r79, %r77, %r39; - sub.s32 %r80, %r78, %r39; - add.s32 %r81, %r76, %r79; - mul.lo.s32 %r82, %r33, %r81; - ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride]; - ld.param.u64 %rd24, [__cudaparm_make_rho_counts]; - mov.s32 %r84, %r75; -$Lt_1_21506: - // Loop body line 145, nesting depth: 3, estimated iterations: unknown - .loc 17 147 0 - sub.s32 %r85, %r74, %r51; - add.s32 %r86, %r85, %r80; - mul.lo.s32 %r87, %r86, %r32; - add.s32 %r88, %r82, %r87; - add.s32 %r89, %r66, %r88; - cvt.s64.s32 %rd25, %r89; - mul.wide.s32 %rd26, %r89, 4; - add.u64 %rd27, %rd24, %rd26; - ld.global.s32 %r90, [%rd27+0]; - mul.lo.s32 %r91, %r90, %r83; - .loc 17 148 0 - mov.s32 %r92, %r89; - setp.ge.s32 %p14, %r89, %r91; - @%p14 bra $Lt_1_21762; - sub.s32 %r93, %r3, 1; - cvt.s64.s32 %rd28, %r83; - mul.wide.s32 %rd29, %r83, 32; - mov.s32 %r94, -1; - setp.gt.s32 %p15, %r93, %r94; - ld.param.u64 %rd30, [__cudaparm_make_rho_atoms]; - mul.lo.u64 %rd31, %rd25, 32; - add.u64 %rd32, %rd30, %rd31; -$Lt_1_22274: - // Loop body line 148, nesting depth: 4, estimated iterations: unknown - .loc 17 149 0 - ld.global.f64 %fd4, [%rd32+0]; - @!%p15 bra $Lt_1_29954; - sub.s32 %r95, %r93, %r74; - mov.s32 %r96, -1; - sub.s32 %r97, %r96, %r74; - cvt.s64.s32 %rd33, %r2; - mul.wide.s32 %rd34, %r2, 8; - ld.global.f64 %fd5, [%rd32+8]; - ld.global.f64 %fd6, [%rd32+16]; - cvt.s64.s32 %rd35, %r95; - mul.wide.s32 %rd36, %r95, 8; - add.u64 %rd37, %rd1, %rd36; - sub.s32 %r98, %r93, %r71; - cvt.s64.s32 %rd38, %r98; - mul.wide.s32 %rd39, %r98, 8; - add.u64 %rd40, %rd1, %rd39; - mov.f64 %fd7, 0d0000000000000000; // 0 - mov.f64 %fd8, 0d0000000000000000; // 0 -$Lt_1_23042: - // Loop body line 149, nesting depth: 5, estimated iterations: unknown - .loc 17 154 0 - ld.shared.f64 %fd9, [%rd37+0]; - mad.rn.f64 %fd8, %fd8, %fd5, %fd9; - .loc 17 155 0 - ld.shared.f64 %fd10, [%rd40+0]; - mad.rn.f64 %fd7, %fd7, %fd6, %fd10; - sub.u64 %rd40, %rd40, %rd34; - sub.s32 %r95, %r95, %r2; - sub.u64 %rd37, %rd37, %rd34; - setp.gt.s32 %p16, %r95, %r97; - @%p16 bra $Lt_1_23042; - bra.uni $Lt_1_22530; -$Lt_1_29954: - mov.f64 %fd7, 0d0000000000000000; // 0 - mov.f64 %fd8, 0d0000000000000000; // 0 -$Lt_1_22530: - .loc 17 157 0 - ld.global.f64 %fd11, [%rd32+24]; - mul.f64 %fd12, %fd7, %fd8; - mul.f64 %fd13, %fd11, %fd12; - @!%p8 bra $Lt_1_23554; - mov.s32 %r99, %r2; - cvt.s64.s32 %rd41, %r4; - mul.wide.s32 %rd42, %r4, 8; - add.u64 %rd43, %rd20, %rd42; - mov.s32 %r100, 0; - mov.s32 %r101, %r99; -$Lt_1_24066: - // Loop body line 157, nesting depth: 5, estimated iterations: unknown - .loc 17 161 0 - add.s32 %r102, %r100, %r1; - mov.s32 %r103, %r102; - setp.lt.s32 %p17, %r102, %r100; - @%p17 bra $Lt_1_30466; - cvt.s64.s32 %rd44, %r2; - mul.wide.s32 %rd34, %r2, 8; - cvt.s64.s32 %rd45, %r102; - mul.wide.s32 %rd46, %r102, 8; - add.u64 %rd47, %rd1, %rd46; - mov.f64 %fd14, 0d0000000000000000; // 0 -$Lt_1_24834: - // Loop body line 161, nesting depth: 6, estimated iterations: unknown - .loc 17 162 0 - ld.shared.f64 %fd15, [%rd47+0]; - mad.rn.f64 %fd14, %fd4, %fd14, %fd15; - sub.s32 %r103, %r103, %r2; - sub.u64 %rd47, %rd47, %rd34; - setp.ge.s32 %p18, %r103, %r100; - @%p18 bra $Lt_1_24834; - bra.uni $Lt_1_24322; -$Lt_1_30466: - mov.f64 %fd14, 0d0000000000000000; // 0 -$Lt_1_24322: - .loc 17 163 0 - ld.shared.f64 %fd16, [%rd43+0]; - mad.rn.f64 %fd17, %fd14, %fd13, %fd16; - st.shared.f64 [%rd43+0], %fd17; - add.s32 %r100, %r100, 1; - add.u64 %rd43, %rd43, 512; - setp.ne.s32 %p19, %r100, %r2; - @%p19 bra $Lt_1_24066; -$Lt_1_23554: - add.s32 %r92, %r92, %r83; - add.u64 %rd32, %rd29, %rd32; - setp.gt.s32 %p20, %r91, %r92; - @%p20 bra $Lt_1_22274; -$Lt_1_21762: - add.s32 %r74, %r74, 1; - setp.ne.s32 %p21, %r55, %r74; - @%p21 bra $Lt_1_21506; -$Lt_1_20994: - add.s32 %r71, %r71, 1; - setp.ne.s32 %p22, %r47, %r71; - @%p22 bra $Lt_1_20738; -$Lt_1_20226: -$Lt_1_19714: - .loc 17 172 0 - bar.sync 0; - @!%p2 bra $Lt_1_26626; - .loc 17 174 0 - ld.shared.f64 %fd18, [%rd19+256]; - st.shared.f64 [%rd19+0], %fd18; - .loc 17 175 0 - mov.f64 %fd19, 0d0000000000000000; // 0 - st.shared.f64 [%rd19+256], %fd19; - bra.uni $Lt_1_26370; -$Lt_1_26626: - .loc 17 177 0 - mov.f64 %fd20, 0d0000000000000000; // 0 - st.shared.f64 [%rd19+0], %fd20; -$Lt_1_26370: - @!%p8 bra $Lt_1_26882; - mov.s32 %r104, %r2; - cvt.s64.s32 %rd48, %r4; - mov.s32 %r105, %r11; - add.s32 %r106, %r11, %r2; - mul.wide.s32 %rd49, %r4, 8; - add.u64 %rd50, %rd20, %rd49; - mov.s64 %rd51, %rd19; - mov.s32 %r107, %r104; -$Lt_1_27394: - // Loop body line 177, nesting depth: 2, estimated iterations: unknown - .loc 17 180 0 - ld.shared.f64 %fd21, [%rd50+0]; - ld.shared.f64 %fd22, [%rd51+0]; - add.f64 %fd23, %fd21, %fd22; - st.shared.f64 [%rd51+0], %fd23; - .loc 17 181 0 - bar.sync 0; - add.s32 %r105, %r105, 1; - add.u64 %rd51, %rd51, 8; - add.u64 %rd50, %rd50, 512; - setp.ne.s32 %p23, %r105, %r106; - @%p23 bra $Lt_1_27394; -$Lt_1_26882: - set.lt.u32.s32 %r108, %r66, %r17; - neg.s32 %r109, %r108; - and.b32 %r110, %r61, %r109; - mov.u32 %r111, 0; - setp.eq.s32 %p24, %r110, %r111; - @%p24 bra $Lt_1_27906; - .loc 17 185 0 - ld.shared.f64 %fd24, [%rd19+0]; - ld.param.u64 %rd52, [__cudaparm_make_rho_brick]; - add.s32 %r112, %r11, %r58; - mul.lo.s32 %r113, %r37, %r17; - mul.lo.s32 %r114, %r38, %r113; - mul.lo.s32 %r115, %r48, %r17; - add.s32 %r116, %r114, %r115; - add.s32 %r117, %r112, %r116; - cvt.s64.s32 %rd53, %r117; - mul.wide.s32 %rd54, %r117, 8; - add.u64 %rd55, %rd52, %rd54; - st.global.f64 [%rd55+0], %fd24; -$Lt_1_27906: - add.s32 %r58, %r58, 32; - setp.ne.s32 %p25, %r58, %r59; - @%p25 bra $Lt_1_18434; -$Lt_1_17922: - .loc 17 189 0 - exit; -$LDWend_make_rho: - } // make_rho - - .entry interp ( - .param .u64 __cudaparm_interp_x_, - .param .u64 __cudaparm_interp_q_, - .param .s32 __cudaparm_interp_nlocal, - .param .u64 __cudaparm_interp_brick, - .param .u64 __cudaparm_interp__rho_coeff, - .param .s32 __cudaparm_interp_npts_x, - .param .s32 __cudaparm_interp_npts_yx, - .param .f64 __cudaparm_interp_b_lo_x, - .param .f64 __cudaparm_interp_b_lo_y, - .param .f64 __cudaparm_interp_b_lo_z, - .param .f64 __cudaparm_interp_delxinv, - .param .f64 __cudaparm_interp_delyinv, - .param .f64 __cudaparm_interp_delzinv, - .param .s32 __cudaparm_interp_order, - .param .s32 __cudaparm_interp_order2, - .param .f64 __cudaparm_interp_qqrd2e_scale, - .param .u64 __cudaparm_interp_ans) - { - .reg .u32 %r<56>; - .reg .u64 %rd<37>; - .reg .f32 %f<19>; - .reg .f64 %fd<63>; - .reg .pred %p<14>; - .shared .align 8 .b8 __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568[512]; - .shared .align 8 .b8 __cuda___cuda_local_var_32677_34_non_const_rho1d_06080[4096]; - .shared .align 8 .b8 __cuda___cuda_local_var_32678_34_non_const_rho1d_110176[4096]; - // __cuda_local_var_32694_12_non_const_ek = 16 - .loc 17 199 0 -$LDWbegin_interp: - ld.param.s32 %r1, [__cudaparm_interp_order2]; - ld.param.s32 %r2, [__cudaparm_interp_order]; - add.s32 %r3, %r1, %r2; - cvt.s32.u32 %r4, %tid.x; - setp.le.s32 %p1, %r3, %r4; - @%p1 bra $Lt_2_8706; - .loc 17 206 0 - mov.u64 %rd1, __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568; - cvt.s64.s32 %rd2, %r4; - mul.wide.s32 %rd3, %r4, 8; - ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f64 %fd1, [%rd5+0]; - add.u64 %rd6, %rd3, %rd1; - st.shared.f64 [%rd6+0], %fd1; -$Lt_2_8706: - mov.u64 %rd1, __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568; - .loc 17 207 0 - bar.sync 0; - mov.u32 %r5, %ctaid.x; - mov.u32 %r6, %ntid.x; - mul.lo.u32 %r7, %r5, %r6; - add.u32 %r8, %r4, %r7; - ld.param.s32 %r9, [__cudaparm_interp_nlocal]; - setp.le.s32 %p2, %r9, %r8; - @%p2 bra $Lt_2_9218; - .loc 17 215 0 - mov.u32 %r10, %r8; - mov.s32 %r11, 0; - mov.u32 %r12, %r11; - mov.s32 %r13, 0; - mov.u32 %r14, %r13; - mov.s32 %r15, 0; - mov.u32 %r16, %r15; - tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r10,%r12,%r14,%r16}]; - mov.f32 %f5, %f1; - mov.f32 %f6, %f2; - mov.f32 %f7, %f3; - .loc 17 216 0 - mov.u32 %r17, %r8; - mov.s32 %r18, 0; - mov.u32 %r19, %r18; - mov.s32 %r20, 0; - mov.u32 %r21, %r20; - mov.s32 %r22, 0; - mov.u32 %r23, %r22; - tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r17,%r19,%r21,%r23}]; - mov.f32 %f12, %f8; - cvt.ftz.f64.f32 %fd2, %f12; - ld.param.f64 %fd3, [__cudaparm_interp_qqrd2e_scale]; - mul.f64 %fd4, %fd2, %fd3; - mov.f64 %fd5, 0d0000000000000000; // 0 - setp.neu.f64 %p3, %fd4, %fd5; - @!%p3 bra $Lt_2_9986; - mov.s32 %r24, 0; - setp.gt.s32 %p4, %r2, %r24; - ld.param.f64 %fd6, [__cudaparm_interp_delxinv]; - cvt.ftz.f64.f32 %fd7, %f5; - ld.param.f64 %fd8, [__cudaparm_interp_b_lo_x]; - sub.f64 %fd9, %fd7, %fd8; - mul.f64 %fd10, %fd6, %fd9; - @!%p4 bra $Lt_2_16386; - mov.u64 %rd7, __cuda___cuda_local_var_32677_34_non_const_rho1d_06080; - mov.u64 %rd8, __cuda___cuda_local_var_32678_34_non_const_rho1d_110176; - cvt.rzi.s32.f64 %r25, %fd10; - cvt.rn.f64.s32 %fd11, %r25; - mov.f64 %fd12, 0d3fe0000000000000; // 0.5 - add.f64 %fd13, %fd11, %fd12; - sub.f64 %fd14, %fd13, %fd10; - ld.param.f64 %fd15, [__cudaparm_interp_delyinv]; - cvt.ftz.f64.f32 %fd16, %f6; - ld.param.f64 %fd17, [__cudaparm_interp_b_lo_y]; - sub.f64 %fd18, %fd16, %fd17; - mul.f64 %fd19, %fd15, %fd18; - cvt.rzi.s32.f64 %r26, %fd19; - cvt.rn.f64.s32 %fd20, %r26; - mov.f64 %fd21, 0d3fe0000000000000; // 0.5 - add.f64 %fd22, %fd20, %fd21; - sub.f64 %fd23, %fd22, %fd19; - mov.s32 %r27, %r2; - cvt.s64.s32 %rd9, %r4; - mov.s32 %r28, %r1; - mul.wide.s32 %rd3, %r4, 8; - add.u64 %rd10, %rd3, %rd7; - add.u64 %rd11, %rd3, %rd8; - mov.s32 %r29, 0; - mov.s32 %r30, %r27; -$Lt_2_10754: - // Loop body line 216, nesting depth: 1, estimated iterations: unknown - .loc 17 235 0 - mov.f64 %fd24, 0d0000000000000000; // 0 - mov.f64 %fd25, 0d0000000000000000; // 0 - st.shared.f64 [%rd10+0], %fd25; - .loc 17 236 0 - mov.f64 %fd26, 0d0000000000000000; // 0 - mov.f64 %fd27, 0d0000000000000000; // 0 - st.shared.f64 [%rd11+0], %fd27; - .loc 17 237 0 - mov.s32 %r31, %r28; - setp.lt.s32 %p5, %r28, %r29; - @%p5 bra $Lt_2_11010; - cvt.s64.s32 %rd12, %r2; - mul.wide.s32 %rd13, %r2, 8; - cvt.s64.s32 %rd14, %r28; - mul.wide.s32 %rd15, %r28, 8; - add.u64 %rd16, %rd1, %rd15; -$Lt_2_11522: - // Loop body line 237, nesting depth: 2, estimated iterations: unknown - .loc 17 238 0 - ld.shared.f64 %fd28, [%rd16+0]; - mad.rn.f64 %fd24, %fd24, %fd14, %fd28; - st.shared.f64 [%rd10+0], %fd24; - .loc 17 239 0 - mad.rn.f64 %fd26, %fd26, %fd23, %fd28; - st.shared.f64 [%rd11+0], %fd26; - sub.s32 %r31, %r31, %r2; - sub.u64 %rd16, %rd16, %rd13; - setp.ge.s32 %p6, %r31, %r29; - @%p6 bra $Lt_2_11522; -$Lt_2_11010: - add.s32 %r29, %r29, 1; - add.s32 %r28, %r28, 1; - add.u64 %rd11, %rd11, 512; - add.u64 %rd10, %rd10, 512; - setp.ne.s32 %p7, %r28, %r3; - @%p7 bra $Lt_2_10754; - bra.uni $Lt_2_10242; -$Lt_2_16386: - cvt.rzi.s32.f64 %r25, %fd10; - mov.u64 %rd8, __cuda___cuda_local_var_32678_34_non_const_rho1d_110176; - mov.u64 %rd7, __cuda___cuda_local_var_32677_34_non_const_rho1d_06080; -$Lt_2_10242: - .loc 17 243 0 - ld.param.f64 %fd29, [__cudaparm_interp_delzinv]; - cvt.ftz.f64.f32 %fd30, %f7; - ld.param.f64 %fd31, [__cudaparm_interp_b_lo_z]; - sub.f64 %fd32, %fd30, %fd31; - mul.f64 %fd33, %fd29, %fd32; - cvt.rzi.s32.f64 %r32, %fd33; - ld.param.s32 %r33, [__cudaparm_interp_npts_yx]; - mul.lo.s32 %r34, %r32, %r33; - add.s32 %r35, %r25, %r34; - @!%p4 bra $Lt_2_16898; - cvt.rn.f64.s32 %fd34, %r32; - mov.f64 %fd35, 0d3fe0000000000000; // 0.5 - add.f64 %fd36, %fd34, %fd35; - sub.f64 %fd37, %fd36, %fd33; - mov.s32 %r36, %r2; - cvt.ftz.f64.f32 %fd38, %f6; - cvt.s64.s32 %rd17, %r4; - ld.param.f64 %fd39, [__cudaparm_interp_delyinv]; - ld.param.f64 %fd40, [__cudaparm_interp_b_lo_y]; - sub.f64 %fd41, %fd38, %fd40; - mul.f64 %fd42, %fd39, %fd41; - cvt.rzi.s32.f64 %r37, %fd42; - mul.wide.s32 %rd3, %r4, 8; - ld.param.s32 %r38, [__cudaparm_interp_npts_x]; - mul.lo.s32 %r39, %r37, %r38; - add.u64 %rd18, %rd3, %rd7; - add.u64 %rd19, %rd3, %rd8; - cvt.s64.s32 %rd20, %r38; - mul.wide.s32 %rd21, %r38, 32; - add.s32 %r40, %r39, %r35; - mov.s32 %r41, %r40; - ld.param.u64 %rd22, [__cudaparm_interp_brick]; - mov.s32 %r42, 0; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, 0f00000000; // 0 - mov.f32 %f15, 0f00000000; // 0 - mov.s32 %r43, %r36; -$Lt_2_12802: - // Loop body line 243, nesting depth: 1, estimated iterations: unknown - .loc 17 246 0 - add.s32 %r44, %r42, %r1; - mov.s32 %r45, %r44; - setp.lt.s32 %p8, %r44, %r42; - @%p8 bra $Lt_2_17154; - cvt.s64.s32 %rd23, %r2; - mul.wide.s32 %rd13, %r2, 8; - cvt.s64.s32 %rd24, %r44; - mul.wide.s32 %rd25, %r44, 8; - add.u64 %rd26, %rd1, %rd25; - mov.f64 %fd43, 0d0000000000000000; // 0 -$Lt_2_13570: - // Loop body line 246, nesting depth: 2, estimated iterations: unknown - .loc 17 247 0 - ld.shared.f64 %fd44, [%rd26+0]; - mad.rn.f64 %fd43, %fd37, %fd43, %fd44; - sub.s32 %r45, %r45, %r2; - sub.u64 %rd26, %rd26, %rd13; - setp.ge.s32 %p9, %r45, %r42; - @%p9 bra $Lt_2_13570; - bra.uni $Lt_2_13058; -$Lt_2_17154: - mov.f64 %fd43, 0d0000000000000000; // 0 -$Lt_2_13058: - .loc 17 249 0 - mov.s32 %r46, %r41; - mov.s32 %r47, %r2; - mov.s32 %r48, %r46; - mul.f64 %fd45, %fd4, %fd43; - mov.s64 %rd27, %rd19; - cvt.s64.s32 %rd28, %r46; - mul.wide.s32 %rd29, %r46, 32; - mov.s32 %r49, 0; - mov.s32 %r50, %r47; -$Lt_2_14594: - // Loop body line 249, nesting depth: 2, estimated iterations: unknown - mov.s32 %r51, %r2; - mov.s32 %r52, %r48; - add.s32 %r53, %r48, %r2; - mov.s64 %rd30, %rd18; - ld.shared.f64 %fd46, [%rd27+0]; - add.u64 %rd31, %rd29, %rd22; - mul.f64 %fd47, %fd45, %fd46; - mov.s32 %r54, %r51; -$Lt_2_15362: - // Loop body line 249, nesting depth: 3, estimated iterations: unknown - .loc 17 253 0 - ld.shared.f64 %fd48, [%rd30+0]; - mul.f64 %fd49, %fd48, %fd47; - .loc 17 255 0 - cvt.ftz.f64.f32 %fd50, %f15; - ld.global.v2.f64 {%fd51,%fd52}, [%rd31+0]; - mul.f64 %fd53, %fd49, %fd51; - sub.f64 %fd54, %fd50, %fd53; - cvt.rn.ftz.f32.f64 %f15, %fd54; - .loc 17 256 0 - cvt.ftz.f64.f32 %fd55, %f14; - mul.f64 %fd56, %fd49, %fd52; - sub.f64 %fd57, %fd55, %fd56; - cvt.rn.ftz.f32.f64 %f14, %fd57; - .loc 17 257 0 - cvt.ftz.f64.f32 %fd58, %f13; - ld.global.f64 %fd59, [%rd31+16]; - mul.f64 %fd60, %fd49, %fd59; - sub.f64 %fd61, %fd58, %fd60; - cvt.rn.ftz.f32.f64 %f13, %fd61; - add.s32 %r52, %r52, 1; - add.u64 %rd31, %rd31, 32; - add.u64 %rd30, %rd30, 512; - setp.ne.s32 %p10, %r52, %r53; - @%p10 bra $Lt_2_15362; - add.s32 %r49, %r49, 1; - add.s32 %r48, %r48, %r38; - add.u64 %rd29, %rd29, %rd21; - add.u64 %rd27, %rd27, 512; - setp.ne.s32 %p11, %r49, %r2; - @%p11 bra $Lt_2_14594; - add.s32 %r42, %r42, 1; - add.s32 %r41, %r46, %r33; - setp.ne.s32 %p12, %r42, %r2; - @%p12 bra $Lt_2_12802; - bra.uni $Lt_2_9730; -$Lt_2_16898: - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, 0f00000000; // 0 - mov.f32 %f15, 0f00000000; // 0 - bra.uni $Lt_2_9730; -$Lt_2_9986: - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, 0f00000000; // 0 - mov.f32 %f15, 0f00000000; // 0 -$Lt_2_9730: - .loc 17 264 0 - ld.param.u64 %rd32, [__cudaparm_interp_ans]; - cvt.s64.s32 %rd33, %r8; - mul.wide.s32 %rd34, %r8, 16; - add.u64 %rd35, %rd32, %rd34; - mov.f32 %f16, %f17; - st.global.v4.f32 [%rd35+0], {%f15,%f14,%f13,%f16}; -$Lt_2_9218: - .loc 17 266 0 - exit; -$LDWend_interp: - } // interp - diff --git a/lib/gpu/pppm_d_ptx.h b/lib/gpu/pppm_d_ptx.h deleted file mode 100644 index b7b2d2f1d0..0000000000 --- a/lib/gpu/pppm_d_ptx.h +++ /dev/null @@ -1,837 +0,0 @@ -const char * pppm_d = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .global .texref pos_tex;\n" -" .global .texref q_tex;\n" -" .entry particle_map (\n" -" .param .u64 __cudaparm_particle_map_x_,\n" -" .param .u64 __cudaparm_particle_map_q_,\n" -" .param .f64 __cudaparm_particle_map_delvolinv,\n" -" .param .s32 __cudaparm_particle_map_nlocal,\n" -" .param .u64 __cudaparm_particle_map_counts,\n" -" .param .u64 __cudaparm_particle_map_ans,\n" -" .param .f64 __cudaparm_particle_map_b_lo_x,\n" -" .param .f64 __cudaparm_particle_map_b_lo_y,\n" -" .param .f64 __cudaparm_particle_map_b_lo_z,\n" -" .param .f64 __cudaparm_particle_map_delxinv,\n" -" .param .f64 __cudaparm_particle_map_delyinv,\n" -" .param .f64 __cudaparm_particle_map_delzinv,\n" -" .param .s32 __cudaparm_particle_map_nlocal_x,\n" -" .param .s32 __cudaparm_particle_map_nlocal_y,\n" -" .param .s32 __cudaparm_particle_map_nlocal_z,\n" -" .param .s32 __cudaparm_particle_map_atom_stride,\n" -" .param .s32 __cudaparm_particle_map_max_atoms,\n" -" .param .u64 __cudaparm_particle_map_error)\n" -" {\n" -" .reg .u32 %r<50>;\n" -" .reg .u64 %rd<12>;\n" -" .reg .f32 %f<14>;\n" -" .reg .f64 %fd<36>;\n" -" .reg .pred %p<11>;\n" -" .loc 17 50 0\n" -"$LDWbegin_particle_map:\n" -" cvt.s32.u32 %r1, %ntid.x;\n" -" cvt.s32.u32 %r2, %ctaid.x;\n" -" mul24.lo.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %nctaid.x;\n" -" mul24.lo.s32 %r5, %r4, %r1;\n" -" mov.u32 %r6, %tid.x;\n" -" add.u32 %r7, %r3, %r6;\n" -" sub.s32 %r8, %r5, 1;\n" -" mul.lo.s32 %r9, %r7, 64;\n" -" div.s32 %r10, %r9, %r5;\n" -" mul.lo.s32 %r11, %r8, %r10;\n" -" sub.s32 %r12, %r9, %r11;\n" -" ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];\n" -" setp.le.s32 %p1, %r13, %r12;\n" -" @%p1 bra $Lt_0_7426;\n" -" .loc 17 62 0\n" -" mov.u32 %r14, %r12;\n" -" mov.s32 %r15, 0;\n" -" mov.u32 %r16, %r15;\n" -" mov.s32 %r17, 0;\n" -" mov.u32 %r18, %r17;\n" -" mov.s32 %r19, 0;\n" -" mov.u32 %r20, %r19;\n" -" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}];\n" -" mov.f32 %f5, %f1;\n" -" mov.f32 %f6, %f2;\n" -" mov.f32 %f7, %f3;\n" -" .loc 17 64 0\n" -" mov.u32 %r21, %r12;\n" -" mov.s32 %r22, 0;\n" -" mov.u32 %r23, %r22;\n" -" mov.s32 %r24, 0;\n" -" mov.u32 %r25, %r24;\n" -" mov.s32 %r26, 0;\n" -" mov.u32 %r27, %r26;\n" -" tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}];\n" -" mov.f32 %f12, %f8;\n" -" cvt.ftz.f64.f32 %fd1, %f12;\n" -" ld.param.f64 %fd2, [__cudaparm_particle_map_delvolinv];\n" -" mul.f64 %fd3, %fd1, %fd2;\n" -" mov.f64 %fd4, 0d0000000000000000; \n" -" setp.neu.f64 %p2, %fd3, %fd4;\n" -" @!%p2 bra $Lt_0_7426;\n" -" .loc 17 67 0\n" -" ld.param.f64 %fd5, [__cudaparm_particle_map_delxinv];\n" -" cvt.ftz.f64.f32 %fd6, %f5;\n" -" ld.param.f64 %fd7, [__cudaparm_particle_map_b_lo_x];\n" -" sub.f64 %fd8, %fd6, %fd7;\n" -" mul.f64 %fd9, %fd5, %fd8;\n" -" mov.f64 %fd10, 0d0000000000000000; \n" -" setp.lt.f64 %p3, %fd9, %fd10;\n" -" @%p3 bra $Lt_0_8706;\n" -" ld.param.f64 %fd11, [__cudaparm_particle_map_delyinv];\n" -" cvt.ftz.f64.f32 %fd12, %f6;\n" -" ld.param.f64 %fd13, [__cudaparm_particle_map_b_lo_y];\n" -" sub.f64 %fd14, %fd12, %fd13;\n" -" mul.f64 %fd15, %fd11, %fd14;\n" -" mov.f64 %fd16, 0d0000000000000000; \n" -" setp.lt.f64 %p4, %fd15, %fd16;\n" -" @%p4 bra $Lt_0_8706;\n" -" ld.param.f64 %fd17, [__cudaparm_particle_map_delzinv];\n" -" cvt.ftz.f64.f32 %fd18, %f7;\n" -" ld.param.f64 %fd19, [__cudaparm_particle_map_b_lo_z];\n" -" sub.f64 %fd20, %fd18, %fd19;\n" -" mul.f64 %fd21, %fd17, %fd20;\n" -" mov.f64 %fd22, 0d0000000000000000; \n" -" setp.lt.f64 %p5, %fd21, %fd22;\n" -" @%p5 bra $Lt_0_8706;\n" -" cvt.rzi.s32.f64 %r28, %fd9;\n" -" ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];\n" -" setp.ge.s32 %p6, %r28, %r29;\n" -" @%p6 bra $Lt_0_8706;\n" -" cvt.rzi.s32.f64 %r30, %fd15;\n" -" ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];\n" -" setp.ge.s32 %p7, %r30, %r31;\n" -" @%p7 bra $Lt_0_8706;\n" -" cvt.rzi.s32.f64 %r32, %fd21;\n" -" ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];\n" -" setp.gt.s32 %p8, %r33, %r32;\n" -" @%p8 bra $L_0_4866;\n" -"$Lt_0_8706:\n" -"$L_0_5122:\n" -" .loc 17 76 0\n" -" mov.s32 %r34, 1;\n" -" ld.param.u64 %rd1, [__cudaparm_particle_map_error];\n" -" st.global.s32 [%rd1+0], %r34;\n" -" bra.uni $Lt_0_7426;\n" -"$L_0_4866:\n" -" .loc 17 83 0\n" -" mul.lo.s32 %r35, %r32, %r31;\n" -" add.s32 %r36, %r30, %r35;\n" -" mul.lo.s32 %r37, %r36, %r29;\n" -" add.s32 %r38, %r28, %r37;\n" -" ld.param.u64 %rd2, [__cudaparm_particle_map_counts];\n" -" cvt.s64.s32 %rd3, %r38;\n" -" mul.wide.s32 %rd4, %r38, 4;\n" -" add.u64 %rd5, %rd2, %rd4;\n" -" mov.s32 %r39, 1;\n" -" atom.global.add.s32 %r40, [%rd5], %r39;\n" -" mov.s32 %r41, %r40;\n" -" ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];\n" -" setp.gt.s32 %p9, %r42, %r41;\n" -" @%p9 bra $Lt_0_7682;\n" -" .loc 17 85 0\n" -" mov.s32 %r43, 2;\n" -" ld.param.u64 %rd6, [__cudaparm_particle_map_error];\n" -" st.global.s32 [%rd6+0], %r43;\n" -" .loc 16 118 0\n" -" mov.s32 %r44, -1;\n" -" atom.global.add.s32 %r45, [%rd5], %r44;\n" -" bra.uni $Lt_0_7426;\n" -"$Lt_0_7682:\n" -" .loc 17 88 0\n" -" ld.param.u64 %rd7, [__cudaparm_particle_map_ans];\n" -" ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];\n" -" mul.lo.s32 %r47, %r46, %r41;\n" -" add.s32 %r48, %r38, %r47;\n" -" cvt.s64.s32 %rd8, %r48;\n" -" mul.wide.s32 %rd9, %r48, 32;\n" -" add.u64 %rd10, %rd7, %rd9;\n" -" cvt.rn.f64.s32 %fd23, %r28;\n" -" mov.f64 %fd24, 0d3fe0000000000000; \n" -" add.f64 %fd25, %fd23, %fd24;\n" -" sub.f64 %fd26, %fd25, %fd9;\n" -" cvt.rn.f64.s32 %fd27, %r30;\n" -" mov.f64 %fd28, 0d3fe0000000000000; \n" -" add.f64 %fd29, %fd27, %fd28;\n" -" sub.f64 %fd30, %fd29, %fd15;\n" -" st.global.v2.f64 [%rd10+0], {%fd26,%fd30};\n" -" cvt.rn.f64.s32 %fd31, %r32;\n" -" mov.f64 %fd32, 0d3fe0000000000000; \n" -" add.f64 %fd33, %fd31, %fd32;\n" -" sub.f64 %fd34, %fd33, %fd21;\n" -" st.global.v2.f64 [%rd10+16], {%fd34,%fd3};\n" -"$Lt_0_7426:\n" -"$L_0_4610:\n" -"$Lt_0_6914:\n" -"$Lt_0_6402:\n" -" .loc 17 92 0\n" -" exit;\n" -"$LDWend_particle_map:\n" -" }\n" -" .entry make_rho (\n" -" .param .u64 __cudaparm_make_rho_counts,\n" -" .param .u64 __cudaparm_make_rho_atoms,\n" -" .param .u64 __cudaparm_make_rho_brick,\n" -" .param .u64 __cudaparm_make_rho__rho_coeff,\n" -" .param .s32 __cudaparm_make_rho_atom_stride,\n" -" .param .s32 __cudaparm_make_rho_npts_x,\n" -" .param .s32 __cudaparm_make_rho_npts_y,\n" -" .param .s32 __cudaparm_make_rho_npts_z,\n" -" .param .s32 __cudaparm_make_rho_nlocal_x,\n" -" .param .s32 __cudaparm_make_rho_nlocal_y,\n" -" .param .s32 __cudaparm_make_rho_nlocal_z,\n" -" .param .s32 __cudaparm_make_rho_order_m_1,\n" -" .param .s32 __cudaparm_make_rho_order,\n" -" .param .s32 __cudaparm_make_rho_order2)\n" -" {\n" -" .reg .u32 %r<119>;\n" -" .reg .u64 %rd<57>;\n" -" .reg .f64 %fd<26>;\n" -" .reg .pred %p<27>;\n" -" .shared .align 8 .b8 __cuda___cuda_local_var_32578_34_non_const_rho_coeff200[512];\n" -" .shared .align 8 .b8 __cuda___cuda_local_var_32579_34_non_const_front712[640];\n" -" .shared .align 8 .b8 __cuda___cuda_local_var_32580_34_non_const_ans1352[4096];\n" -" .loc 17 101 0\n" -"$LDWbegin_make_rho:\n" -" ld.param.s32 %r1, [__cudaparm_make_rho_order2];\n" -" ld.param.s32 %r2, [__cudaparm_make_rho_order];\n" -" add.s32 %r3, %r1, %r2;\n" -" cvt.s32.u32 %r4, %tid.x;\n" -" setp.le.s32 %p1, %r3, %r4;\n" -" @%p1 bra $Lt_1_16898;\n" -" .loc 17 108 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32578_34_non_const_rho_coeff200;\n" -" cvt.s64.s32 %rd2, %r4;\n" -" mul.wide.s32 %rd3, %r4, 8;\n" -" ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f64 %fd1, [%rd5+0];\n" -" add.u64 %rd6, %rd3, %rd1;\n" -" st.shared.f64 [%rd6+0], %fd1;\n" -"$Lt_1_16898:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32578_34_non_const_rho_coeff200;\n" -" shr.s32 %r5, %r4, 31;\n" -" mov.s32 %r6, 31;\n" -" and.b32 %r7, %r5, %r6;\n" -" add.s32 %r8, %r7, %r4;\n" -" shr.s32 %r9, %r8, 5;\n" -" mul.lo.s32 %r10, %r9, 32;\n" -" sub.s32 %r11, %r4, %r10;\n" -" setp.lt.s32 %p2, %r11, %r2;\n" -" @!%p2 bra $Lt_1_17410;\n" -" .loc 17 114 0\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32579_34_non_const_front712;\n" -" mov.f64 %fd2, 0d0000000000000000; \n" -" cvt.s64.s32 %rd8, %r11;\n" -" shr.s32 %r12, %r4, 31;\n" -" mov.s32 %r13, 31;\n" -" and.b32 %r14, %r12, %r13;\n" -" add.s32 %r15, %r14, %r4;\n" -" shr.s32 %r16, %r15, 5;\n" -" cvt.s64.s32 %rd9, %r16;\n" -" mul.wide.s32 %rd10, %r16, 40;\n" -" add.u64 %rd11, %rd8, %rd10;\n" -" mul.lo.u64 %rd12, %rd11, 8;\n" -" add.u64 %rd13, %rd7, %rd12;\n" -" st.shared.f64 [%rd13+256], %fd2;\n" -"$Lt_1_17410:\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32579_34_non_const_front712;\n" -" .loc 17 116 0\n" -" bar.sync 0;\n" -" ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];\n" -" shr.s32 %r18, %r17, 31;\n" -" mov.s32 %r19, 31;\n" -" and.b32 %r20, %r18, %r19;\n" -" add.s32 %r21, %r20, %r17;\n" -" shr.s32 %r22, %r21, 5;\n" -" add.s32 %r23, %r22, 1;\n" -" mov.u32 %r24, 0;\n" -" setp.le.s32 %p3, %r23, %r24;\n" -" @%p3 bra $Lt_1_17922;\n" -" shr.s32 %r25, %r4, 31;\n" -" mov.s32 %r26, 31;\n" -" and.b32 %r27, %r25, %r26;\n" -" add.s32 %r28, %r27, %r4;\n" -" shr.s32 %r29, %r28, 5;\n" -" add.s32 %r30, %r11, 32;\n" -" ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];\n" -" ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];\n" -" mul.lo.s32 %r33, %r31, %r32;\n" -" mov.u32 %r34, %ctaid.x;\n" -" mul.lo.u32 %r35, %r34, 2;\n" -" add.u32 %r36, %r29, %r35;\n" -" ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];\n" -" div.s32 %r38, %r36, %r37;\n" -" ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];\n" -" setp.lt.s32 %p4, %r38, %r39;\n" -" sub.s32 %r40, %r39, %r38;\n" -" mov.s32 %r41, 0;\n" -" selp.s32 %r42, %r40, %r41, %p4;\n" -" ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];\n" -" setp.ge.s32 %p5, %r38, %r43;\n" -" sub.s32 %r44, %r43, %r38;\n" -" add.s32 %r45, %r44, %r2;\n" -" sub.s32 %r46, %r45, 1;\n" -" selp.s32 %r47, %r46, %r2, %p5;\n" -" rem.s32 %r48, %r36, %r37;\n" -" setp.lt.s32 %p6, %r48, %r39;\n" -" sub.s32 %r49, %r39, %r48;\n" -" mov.s32 %r50, 0;\n" -" selp.s32 %r51, %r49, %r50, %p6;\n" -" setp.ge.s32 %p7, %r48, %r31;\n" -" sub.s32 %r52, %r31, %r48;\n" -" add.s32 %r53, %r52, %r2;\n" -" sub.s32 %r54, %r53, 1;\n" -" selp.s32 %r55, %r54, %r2, %p7;\n" -" mov.s32 %r56, %r23;\n" -" mov.s32 %r57, 0;\n" -" setp.gt.s32 %p8, %r2, %r57;\n" -" mov.s32 %r58, 0;\n" -" cvt.s64.s32 %rd14, %r11;\n" -" cvt.s64.s32 %rd15, %r29;\n" -" mul.lo.s32 %r59, %r23, 32;\n" -" mul.wide.s32 %rd16, %r29, 40;\n" -" add.u64 %rd17, %rd14, %rd16;\n" -" ld.param.s32 %r60, [__cudaparm_make_rho_npts_z];\n" -" setp.gt.s32 %p9, %r60, %r38;\n" -" mul.lo.u64 %rd18, %rd17, 8;\n" -" selp.s32 %r61, 1, 0, %p9;\n" -" add.u64 %rd19, %rd18, %rd7;\n" -" mov.u64 %rd20, __cuda___cuda_local_var_32580_34_non_const_ans1352;\n" -" mov.s32 %r62, %r56;\n" -"$Lt_1_18434:\n" -" @!%p8 bra $Lt_1_18690;\n" -" mov.s32 %r63, %r2;\n" -" cvt.s64.s32 %rd21, %r4;\n" -" mul.wide.s32 %rd22, %r4, 8;\n" -" add.u64 %rd23, %rd20, %rd22;\n" -" mov.s32 %r64, 0;\n" -" mov.s32 %r65, %r63;\n" -"$Lt_1_19202:\n" -" .loc 17 140 0\n" -" mov.f64 %fd3, 0d0000000000000000; \n" -" st.shared.f64 [%rd23+0], %fd3;\n" -" add.s32 %r64, %r64, 1;\n" -" add.u64 %rd23, %rd23, 512;\n" -" setp.ne.s32 %p10, %r64, %r2;\n" -" @%p10 bra $Lt_1_19202;\n" -"$Lt_1_18690:\n" -" add.s32 %r66, %r11, %r58;\n" -" set.lt.u32.s32 %r67, %r66, %r32;\n" -" neg.s32 %r68, %r67;\n" -" and.b32 %r69, %r61, %r68;\n" -" mov.u32 %r70, 0;\n" -" setp.eq.s32 %p11, %r69, %r70;\n" -" @%p11 bra $Lt_1_20226;\n" -" .loc 17 143 0\n" -" mov.s32 %r71, %r42;\n" -" setp.ge.s32 %p12, %r42, %r47;\n" -" @%p12 bra $Lt_1_20226;\n" -" sub.s32 %r72, %r47, %r42;\n" -" setp.lt.s32 %p13, %r51, %r55;\n" -" mov.s32 %r73, %r72;\n" -"$Lt_1_20738:\n" -" .loc 17 145 0\n" -" mov.s32 %r74, %r51;\n" -" @!%p13 bra $Lt_1_20994;\n" -" sub.s32 %r75, %r55, %r51;\n" -" sub.s32 %r76, %r71, %r42;\n" -" add.s32 %r77, %r38, %r42;\n" -" add.s32 %r78, %r48, %r51;\n" -" sub.s32 %r79, %r77, %r39;\n" -" sub.s32 %r80, %r78, %r39;\n" -" add.s32 %r81, %r76, %r79;\n" -" mul.lo.s32 %r82, %r33, %r81;\n" -" ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];\n" -" ld.param.u64 %rd24, [__cudaparm_make_rho_counts];\n" -" mov.s32 %r84, %r75;\n" -"$Lt_1_21506:\n" -" .loc 17 147 0\n" -" sub.s32 %r85, %r74, %r51;\n" -" add.s32 %r86, %r85, %r80;\n" -" mul.lo.s32 %r87, %r86, %r32;\n" -" add.s32 %r88, %r82, %r87;\n" -" add.s32 %r89, %r66, %r88;\n" -" cvt.s64.s32 %rd25, %r89;\n" -" mul.wide.s32 %rd26, %r89, 4;\n" -" add.u64 %rd27, %rd24, %rd26;\n" -" ld.global.s32 %r90, [%rd27+0];\n" -" mul.lo.s32 %r91, %r90, %r83;\n" -" .loc 17 148 0\n" -" mov.s32 %r92, %r89;\n" -" setp.ge.s32 %p14, %r89, %r91;\n" -" @%p14 bra $Lt_1_21762;\n" -" sub.s32 %r93, %r3, 1;\n" -" cvt.s64.s32 %rd28, %r83;\n" -" mul.wide.s32 %rd29, %r83, 32;\n" -" mov.s32 %r94, -1;\n" -" setp.gt.s32 %p15, %r93, %r94;\n" -" ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];\n" -" mul.lo.u64 %rd31, %rd25, 32;\n" -" add.u64 %rd32, %rd30, %rd31;\n" -"$Lt_1_22274:\n" -" .loc 17 149 0\n" -" ld.global.f64 %fd4, [%rd32+0];\n" -" @!%p15 bra $Lt_1_29954;\n" -" sub.s32 %r95, %r93, %r74;\n" -" mov.s32 %r96, -1;\n" -" sub.s32 %r97, %r96, %r74;\n" -" cvt.s64.s32 %rd33, %r2;\n" -" mul.wide.s32 %rd34, %r2, 8;\n" -" ld.global.f64 %fd5, [%rd32+8];\n" -" ld.global.f64 %fd6, [%rd32+16];\n" -" cvt.s64.s32 %rd35, %r95;\n" -" mul.wide.s32 %rd36, %r95, 8;\n" -" add.u64 %rd37, %rd1, %rd36;\n" -" sub.s32 %r98, %r93, %r71;\n" -" cvt.s64.s32 %rd38, %r98;\n" -" mul.wide.s32 %rd39, %r98, 8;\n" -" add.u64 %rd40, %rd1, %rd39;\n" -" mov.f64 %fd7, 0d0000000000000000; \n" -" mov.f64 %fd8, 0d0000000000000000; \n" -"$Lt_1_23042:\n" -" .loc 17 154 0\n" -" ld.shared.f64 %fd9, [%rd37+0];\n" -" mad.rn.f64 %fd8, %fd8, %fd5, %fd9;\n" -" .loc 17 155 0\n" -" ld.shared.f64 %fd10, [%rd40+0];\n" -" mad.rn.f64 %fd7, %fd7, %fd6, %fd10;\n" -" sub.u64 %rd40, %rd40, %rd34;\n" -" sub.s32 %r95, %r95, %r2;\n" -" sub.u64 %rd37, %rd37, %rd34;\n" -" setp.gt.s32 %p16, %r95, %r97;\n" -" @%p16 bra $Lt_1_23042;\n" -" bra.uni $Lt_1_22530;\n" -"$Lt_1_29954:\n" -" mov.f64 %fd7, 0d0000000000000000; \n" -" mov.f64 %fd8, 0d0000000000000000; \n" -"$Lt_1_22530:\n" -" .loc 17 157 0\n" -" ld.global.f64 %fd11, [%rd32+24];\n" -" mul.f64 %fd12, %fd7, %fd8;\n" -" mul.f64 %fd13, %fd11, %fd12;\n" -" @!%p8 bra $Lt_1_23554;\n" -" mov.s32 %r99, %r2;\n" -" cvt.s64.s32 %rd41, %r4;\n" -" mul.wide.s32 %rd42, %r4, 8;\n" -" add.u64 %rd43, %rd20, %rd42;\n" -" mov.s32 %r100, 0;\n" -" mov.s32 %r101, %r99;\n" -"$Lt_1_24066:\n" -" .loc 17 161 0\n" -" add.s32 %r102, %r100, %r1;\n" -" mov.s32 %r103, %r102;\n" -" setp.lt.s32 %p17, %r102, %r100;\n" -" @%p17 bra $Lt_1_30466;\n" -" cvt.s64.s32 %rd44, %r2;\n" -" mul.wide.s32 %rd34, %r2, 8;\n" -" cvt.s64.s32 %rd45, %r102;\n" -" mul.wide.s32 %rd46, %r102, 8;\n" -" add.u64 %rd47, %rd1, %rd46;\n" -" mov.f64 %fd14, 0d0000000000000000; \n" -"$Lt_1_24834:\n" -" .loc 17 162 0\n" -" ld.shared.f64 %fd15, [%rd47+0];\n" -" mad.rn.f64 %fd14, %fd4, %fd14, %fd15;\n" -" sub.s32 %r103, %r103, %r2;\n" -" sub.u64 %rd47, %rd47, %rd34;\n" -" setp.ge.s32 %p18, %r103, %r100;\n" -" @%p18 bra $Lt_1_24834;\n" -" bra.uni $Lt_1_24322;\n" -"$Lt_1_30466:\n" -" mov.f64 %fd14, 0d0000000000000000; \n" -"$Lt_1_24322:\n" -" .loc 17 163 0\n" -" ld.shared.f64 %fd16, [%rd43+0];\n" -" mad.rn.f64 %fd17, %fd14, %fd13, %fd16;\n" -" st.shared.f64 [%rd43+0], %fd17;\n" -" add.s32 %r100, %r100, 1;\n" -" add.u64 %rd43, %rd43, 512;\n" -" setp.ne.s32 %p19, %r100, %r2;\n" -" @%p19 bra $Lt_1_24066;\n" -"$Lt_1_23554:\n" -" add.s32 %r92, %r92, %r83;\n" -" add.u64 %rd32, %rd29, %rd32;\n" -" setp.gt.s32 %p20, %r91, %r92;\n" -" @%p20 bra $Lt_1_22274;\n" -"$Lt_1_21762:\n" -" add.s32 %r74, %r74, 1;\n" -" setp.ne.s32 %p21, %r55, %r74;\n" -" @%p21 bra $Lt_1_21506;\n" -"$Lt_1_20994:\n" -" add.s32 %r71, %r71, 1;\n" -" setp.ne.s32 %p22, %r47, %r71;\n" -" @%p22 bra $Lt_1_20738;\n" -"$Lt_1_20226:\n" -"$Lt_1_19714:\n" -" .loc 17 172 0\n" -" bar.sync 0;\n" -" @!%p2 bra $Lt_1_26626;\n" -" .loc 17 174 0\n" -" ld.shared.f64 %fd18, [%rd19+256];\n" -" st.shared.f64 [%rd19+0], %fd18;\n" -" .loc 17 175 0\n" -" mov.f64 %fd19, 0d0000000000000000; \n" -" st.shared.f64 [%rd19+256], %fd19;\n" -" bra.uni $Lt_1_26370;\n" -"$Lt_1_26626:\n" -" .loc 17 177 0\n" -" mov.f64 %fd20, 0d0000000000000000; \n" -" st.shared.f64 [%rd19+0], %fd20;\n" -"$Lt_1_26370:\n" -" @!%p8 bra $Lt_1_26882;\n" -" mov.s32 %r104, %r2;\n" -" cvt.s64.s32 %rd48, %r4;\n" -" mov.s32 %r105, %r11;\n" -" add.s32 %r106, %r11, %r2;\n" -" mul.wide.s32 %rd49, %r4, 8;\n" -" add.u64 %rd50, %rd20, %rd49;\n" -" mov.s64 %rd51, %rd19;\n" -" mov.s32 %r107, %r104;\n" -"$Lt_1_27394:\n" -" .loc 17 180 0\n" -" ld.shared.f64 %fd21, [%rd50+0];\n" -" ld.shared.f64 %fd22, [%rd51+0];\n" -" add.f64 %fd23, %fd21, %fd22;\n" -" st.shared.f64 [%rd51+0], %fd23;\n" -" .loc 17 181 0\n" -" bar.sync 0;\n" -" add.s32 %r105, %r105, 1;\n" -" add.u64 %rd51, %rd51, 8;\n" -" add.u64 %rd50, %rd50, 512;\n" -" setp.ne.s32 %p23, %r105, %r106;\n" -" @%p23 bra $Lt_1_27394;\n" -"$Lt_1_26882:\n" -" set.lt.u32.s32 %r108, %r66, %r17;\n" -" neg.s32 %r109, %r108;\n" -" and.b32 %r110, %r61, %r109;\n" -" mov.u32 %r111, 0;\n" -" setp.eq.s32 %p24, %r110, %r111;\n" -" @%p24 bra $Lt_1_27906;\n" -" .loc 17 185 0\n" -" ld.shared.f64 %fd24, [%rd19+0];\n" -" ld.param.u64 %rd52, [__cudaparm_make_rho_brick];\n" -" add.s32 %r112, %r11, %r58;\n" -" mul.lo.s32 %r113, %r37, %r17;\n" -" mul.lo.s32 %r114, %r38, %r113;\n" -" mul.lo.s32 %r115, %r48, %r17;\n" -" add.s32 %r116, %r114, %r115;\n" -" add.s32 %r117, %r112, %r116;\n" -" cvt.s64.s32 %rd53, %r117;\n" -" mul.wide.s32 %rd54, %r117, 8;\n" -" add.u64 %rd55, %rd52, %rd54;\n" -" st.global.f64 [%rd55+0], %fd24;\n" -"$Lt_1_27906:\n" -" add.s32 %r58, %r58, 32;\n" -" setp.ne.s32 %p25, %r58, %r59;\n" -" @%p25 bra $Lt_1_18434;\n" -"$Lt_1_17922:\n" -" .loc 17 189 0\n" -" exit;\n" -"$LDWend_make_rho:\n" -" }\n" -" .entry interp (\n" -" .param .u64 __cudaparm_interp_x_,\n" -" .param .u64 __cudaparm_interp_q_,\n" -" .param .s32 __cudaparm_interp_nlocal,\n" -" .param .u64 __cudaparm_interp_brick,\n" -" .param .u64 __cudaparm_interp__rho_coeff,\n" -" .param .s32 __cudaparm_interp_npts_x,\n" -" .param .s32 __cudaparm_interp_npts_yx,\n" -" .param .f64 __cudaparm_interp_b_lo_x,\n" -" .param .f64 __cudaparm_interp_b_lo_y,\n" -" .param .f64 __cudaparm_interp_b_lo_z,\n" -" .param .f64 __cudaparm_interp_delxinv,\n" -" .param .f64 __cudaparm_interp_delyinv,\n" -" .param .f64 __cudaparm_interp_delzinv,\n" -" .param .s32 __cudaparm_interp_order,\n" -" .param .s32 __cudaparm_interp_order2,\n" -" .param .f64 __cudaparm_interp_qqrd2e_scale,\n" -" .param .u64 __cudaparm_interp_ans)\n" -" {\n" -" .reg .u32 %r<56>;\n" -" .reg .u64 %rd<37>;\n" -" .reg .f32 %f<19>;\n" -" .reg .f64 %fd<63>;\n" -" .reg .pred %p<14>;\n" -" .shared .align 8 .b8 __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568[512];\n" -" .shared .align 8 .b8 __cuda___cuda_local_var_32677_34_non_const_rho1d_06080[4096];\n" -" .shared .align 8 .b8 __cuda___cuda_local_var_32678_34_non_const_rho1d_110176[4096];\n" -" .loc 17 199 0\n" -"$LDWbegin_interp:\n" -" ld.param.s32 %r1, [__cudaparm_interp_order2];\n" -" ld.param.s32 %r2, [__cudaparm_interp_order];\n" -" add.s32 %r3, %r1, %r2;\n" -" cvt.s32.u32 %r4, %tid.x;\n" -" setp.le.s32 %p1, %r3, %r4;\n" -" @%p1 bra $Lt_2_8706;\n" -" .loc 17 206 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568;\n" -" cvt.s64.s32 %rd2, %r4;\n" -" mul.wide.s32 %rd3, %r4, 8;\n" -" ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f64 %fd1, [%rd5+0];\n" -" add.u64 %rd6, %rd3, %rd1;\n" -" st.shared.f64 [%rd6+0], %fd1;\n" -"$Lt_2_8706:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568;\n" -" .loc 17 207 0\n" -" bar.sync 0;\n" -" mov.u32 %r5, %ctaid.x;\n" -" mov.u32 %r6, %ntid.x;\n" -" mul.lo.u32 %r7, %r5, %r6;\n" -" add.u32 %r8, %r4, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_interp_nlocal];\n" -" setp.le.s32 %p2, %r9, %r8;\n" -" @%p2 bra $Lt_2_9218;\n" -" .loc 17 215 0\n" -" mov.u32 %r10, %r8;\n" -" mov.s32 %r11, 0;\n" -" mov.u32 %r12, %r11;\n" -" mov.s32 %r13, 0;\n" -" mov.u32 %r14, %r13;\n" -" mov.s32 %r15, 0;\n" -" mov.u32 %r16, %r15;\n" -" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r10,%r12,%r14,%r16}];\n" -" mov.f32 %f5, %f1;\n" -" mov.f32 %f6, %f2;\n" -" mov.f32 %f7, %f3;\n" -" .loc 17 216 0\n" -" mov.u32 %r17, %r8;\n" -" mov.s32 %r18, 0;\n" -" mov.u32 %r19, %r18;\n" -" mov.s32 %r20, 0;\n" -" mov.u32 %r21, %r20;\n" -" mov.s32 %r22, 0;\n" -" mov.u32 %r23, %r22;\n" -" tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r17,%r19,%r21,%r23}];\n" -" mov.f32 %f12, %f8;\n" -" cvt.ftz.f64.f32 %fd2, %f12;\n" -" ld.param.f64 %fd3, [__cudaparm_interp_qqrd2e_scale];\n" -" mul.f64 %fd4, %fd2, %fd3;\n" -" mov.f64 %fd5, 0d0000000000000000; \n" -" setp.neu.f64 %p3, %fd4, %fd5;\n" -" @!%p3 bra $Lt_2_9986;\n" -" mov.s32 %r24, 0;\n" -" setp.gt.s32 %p4, %r2, %r24;\n" -" ld.param.f64 %fd6, [__cudaparm_interp_delxinv];\n" -" cvt.ftz.f64.f32 %fd7, %f5;\n" -" ld.param.f64 %fd8, [__cudaparm_interp_b_lo_x];\n" -" sub.f64 %fd9, %fd7, %fd8;\n" -" mul.f64 %fd10, %fd6, %fd9;\n" -" @!%p4 bra $Lt_2_16386;\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32677_34_non_const_rho1d_06080;\n" -" mov.u64 %rd8, __cuda___cuda_local_var_32678_34_non_const_rho1d_110176;\n" -" cvt.rzi.s32.f64 %r25, %fd10;\n" -" cvt.rn.f64.s32 %fd11, %r25;\n" -" mov.f64 %fd12, 0d3fe0000000000000; \n" -" add.f64 %fd13, %fd11, %fd12;\n" -" sub.f64 %fd14, %fd13, %fd10;\n" -" ld.param.f64 %fd15, [__cudaparm_interp_delyinv];\n" -" cvt.ftz.f64.f32 %fd16, %f6;\n" -" ld.param.f64 %fd17, [__cudaparm_interp_b_lo_y];\n" -" sub.f64 %fd18, %fd16, %fd17;\n" -" mul.f64 %fd19, %fd15, %fd18;\n" -" cvt.rzi.s32.f64 %r26, %fd19;\n" -" cvt.rn.f64.s32 %fd20, %r26;\n" -" mov.f64 %fd21, 0d3fe0000000000000; \n" -" add.f64 %fd22, %fd20, %fd21;\n" -" sub.f64 %fd23, %fd22, %fd19;\n" -" mov.s32 %r27, %r2;\n" -" cvt.s64.s32 %rd9, %r4;\n" -" mov.s32 %r28, %r1;\n" -" mul.wide.s32 %rd3, %r4, 8;\n" -" add.u64 %rd10, %rd3, %rd7;\n" -" add.u64 %rd11, %rd3, %rd8;\n" -" mov.s32 %r29, 0;\n" -" mov.s32 %r30, %r27;\n" -"$Lt_2_10754:\n" -" .loc 17 235 0\n" -" mov.f64 %fd24, 0d0000000000000000; \n" -" mov.f64 %fd25, 0d0000000000000000; \n" -" st.shared.f64 [%rd10+0], %fd25;\n" -" .loc 17 236 0\n" -" mov.f64 %fd26, 0d0000000000000000; \n" -" mov.f64 %fd27, 0d0000000000000000; \n" -" st.shared.f64 [%rd11+0], %fd27;\n" -" .loc 17 237 0\n" -" mov.s32 %r31, %r28;\n" -" setp.lt.s32 %p5, %r28, %r29;\n" -" @%p5 bra $Lt_2_11010;\n" -" cvt.s64.s32 %rd12, %r2;\n" -" mul.wide.s32 %rd13, %r2, 8;\n" -" cvt.s64.s32 %rd14, %r28;\n" -" mul.wide.s32 %rd15, %r28, 8;\n" -" add.u64 %rd16, %rd1, %rd15;\n" -"$Lt_2_11522:\n" -" .loc 17 238 0\n" -" ld.shared.f64 %fd28, [%rd16+0];\n" -" mad.rn.f64 %fd24, %fd24, %fd14, %fd28;\n" -" st.shared.f64 [%rd10+0], %fd24;\n" -" .loc 17 239 0\n" -" mad.rn.f64 %fd26, %fd26, %fd23, %fd28;\n" -" st.shared.f64 [%rd11+0], %fd26;\n" -" sub.s32 %r31, %r31, %r2;\n" -" sub.u64 %rd16, %rd16, %rd13;\n" -" setp.ge.s32 %p6, %r31, %r29;\n" -" @%p6 bra $Lt_2_11522;\n" -"$Lt_2_11010:\n" -" add.s32 %r29, %r29, 1;\n" -" add.s32 %r28, %r28, 1;\n" -" add.u64 %rd11, %rd11, 512;\n" -" add.u64 %rd10, %rd10, 512;\n" -" setp.ne.s32 %p7, %r28, %r3;\n" -" @%p7 bra $Lt_2_10754;\n" -" bra.uni $Lt_2_10242;\n" -"$Lt_2_16386:\n" -" cvt.rzi.s32.f64 %r25, %fd10;\n" -" mov.u64 %rd8, __cuda___cuda_local_var_32678_34_non_const_rho1d_110176;\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32677_34_non_const_rho1d_06080;\n" -"$Lt_2_10242:\n" -" .loc 17 243 0\n" -" ld.param.f64 %fd29, [__cudaparm_interp_delzinv];\n" -" cvt.ftz.f64.f32 %fd30, %f7;\n" -" ld.param.f64 %fd31, [__cudaparm_interp_b_lo_z];\n" -" sub.f64 %fd32, %fd30, %fd31;\n" -" mul.f64 %fd33, %fd29, %fd32;\n" -" cvt.rzi.s32.f64 %r32, %fd33;\n" -" ld.param.s32 %r33, [__cudaparm_interp_npts_yx];\n" -" mul.lo.s32 %r34, %r32, %r33;\n" -" add.s32 %r35, %r25, %r34;\n" -" @!%p4 bra $Lt_2_16898;\n" -" cvt.rn.f64.s32 %fd34, %r32;\n" -" mov.f64 %fd35, 0d3fe0000000000000; \n" -" add.f64 %fd36, %fd34, %fd35;\n" -" sub.f64 %fd37, %fd36, %fd33;\n" -" mov.s32 %r36, %r2;\n" -" cvt.ftz.f64.f32 %fd38, %f6;\n" -" cvt.s64.s32 %rd17, %r4;\n" -" ld.param.f64 %fd39, [__cudaparm_interp_delyinv];\n" -" ld.param.f64 %fd40, [__cudaparm_interp_b_lo_y];\n" -" sub.f64 %fd41, %fd38, %fd40;\n" -" mul.f64 %fd42, %fd39, %fd41;\n" -" cvt.rzi.s32.f64 %r37, %fd42;\n" -" mul.wide.s32 %rd3, %r4, 8;\n" -" ld.param.s32 %r38, [__cudaparm_interp_npts_x];\n" -" mul.lo.s32 %r39, %r37, %r38;\n" -" add.u64 %rd18, %rd3, %rd7;\n" -" add.u64 %rd19, %rd3, %rd8;\n" -" cvt.s64.s32 %rd20, %r38;\n" -" mul.wide.s32 %rd21, %r38, 32;\n" -" add.s32 %r40, %r39, %r35;\n" -" mov.s32 %r41, %r40;\n" -" ld.param.u64 %rd22, [__cudaparm_interp_brick];\n" -" mov.s32 %r42, 0;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, 0f00000000; \n" -" mov.f32 %f15, 0f00000000; \n" -" mov.s32 %r43, %r36;\n" -"$Lt_2_12802:\n" -" .loc 17 246 0\n" -" add.s32 %r44, %r42, %r1;\n" -" mov.s32 %r45, %r44;\n" -" setp.lt.s32 %p8, %r44, %r42;\n" -" @%p8 bra $Lt_2_17154;\n" -" cvt.s64.s32 %rd23, %r2;\n" -" mul.wide.s32 %rd13, %r2, 8;\n" -" cvt.s64.s32 %rd24, %r44;\n" -" mul.wide.s32 %rd25, %r44, 8;\n" -" add.u64 %rd26, %rd1, %rd25;\n" -" mov.f64 %fd43, 0d0000000000000000; \n" -"$Lt_2_13570:\n" -" .loc 17 247 0\n" -" ld.shared.f64 %fd44, [%rd26+0];\n" -" mad.rn.f64 %fd43, %fd37, %fd43, %fd44;\n" -" sub.s32 %r45, %r45, %r2;\n" -" sub.u64 %rd26, %rd26, %rd13;\n" -" setp.ge.s32 %p9, %r45, %r42;\n" -" @%p9 bra $Lt_2_13570;\n" -" bra.uni $Lt_2_13058;\n" -"$Lt_2_17154:\n" -" mov.f64 %fd43, 0d0000000000000000; \n" -"$Lt_2_13058:\n" -" .loc 17 249 0\n" -" mov.s32 %r46, %r41;\n" -" mov.s32 %r47, %r2;\n" -" mov.s32 %r48, %r46;\n" -" mul.f64 %fd45, %fd4, %fd43;\n" -" mov.s64 %rd27, %rd19;\n" -" cvt.s64.s32 %rd28, %r46;\n" -" mul.wide.s32 %rd29, %r46, 32;\n" -" mov.s32 %r49, 0;\n" -" mov.s32 %r50, %r47;\n" -"$Lt_2_14594:\n" -" mov.s32 %r51, %r2;\n" -" mov.s32 %r52, %r48;\n" -" add.s32 %r53, %r48, %r2;\n" -" mov.s64 %rd30, %rd18;\n" -" ld.shared.f64 %fd46, [%rd27+0];\n" -" add.u64 %rd31, %rd29, %rd22;\n" -" mul.f64 %fd47, %fd45, %fd46;\n" -" mov.s32 %r54, %r51;\n" -"$Lt_2_15362:\n" -" .loc 17 253 0\n" -" ld.shared.f64 %fd48, [%rd30+0];\n" -" mul.f64 %fd49, %fd48, %fd47;\n" -" .loc 17 255 0\n" -" cvt.ftz.f64.f32 %fd50, %f15;\n" -" ld.global.v2.f64 {%fd51,%fd52}, [%rd31+0];\n" -" mul.f64 %fd53, %fd49, %fd51;\n" -" sub.f64 %fd54, %fd50, %fd53;\n" -" cvt.rn.ftz.f32.f64 %f15, %fd54;\n" -" .loc 17 256 0\n" -" cvt.ftz.f64.f32 %fd55, %f14;\n" -" mul.f64 %fd56, %fd49, %fd52;\n" -" sub.f64 %fd57, %fd55, %fd56;\n" -" cvt.rn.ftz.f32.f64 %f14, %fd57;\n" -" .loc 17 257 0\n" -" cvt.ftz.f64.f32 %fd58, %f13;\n" -" ld.global.f64 %fd59, [%rd31+16];\n" -" mul.f64 %fd60, %fd49, %fd59;\n" -" sub.f64 %fd61, %fd58, %fd60;\n" -" cvt.rn.ftz.f32.f64 %f13, %fd61;\n" -" add.s32 %r52, %r52, 1;\n" -" add.u64 %rd31, %rd31, 32;\n" -" add.u64 %rd30, %rd30, 512;\n" -" setp.ne.s32 %p10, %r52, %r53;\n" -" @%p10 bra $Lt_2_15362;\n" -" add.s32 %r49, %r49, 1;\n" -" add.s32 %r48, %r48, %r38;\n" -" add.u64 %rd29, %rd29, %rd21;\n" -" add.u64 %rd27, %rd27, 512;\n" -" setp.ne.s32 %p11, %r49, %r2;\n" -" @%p11 bra $Lt_2_14594;\n" -" add.s32 %r42, %r42, 1;\n" -" add.s32 %r41, %r46, %r33;\n" -" setp.ne.s32 %p12, %r42, %r2;\n" -" @%p12 bra $Lt_2_12802;\n" -" bra.uni $Lt_2_9730;\n" -"$Lt_2_16898:\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, 0f00000000; \n" -" mov.f32 %f15, 0f00000000; \n" -" bra.uni $Lt_2_9730;\n" -"$Lt_2_9986:\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, 0f00000000; \n" -" mov.f32 %f15, 0f00000000; \n" -"$Lt_2_9730:\n" -" .loc 17 264 0\n" -" ld.param.u64 %rd32, [__cudaparm_interp_ans];\n" -" cvt.s64.s32 %rd33, %r8;\n" -" mul.wide.s32 %rd34, %r8, 16;\n" -" add.u64 %rd35, %rd32, %rd34;\n" -" mov.f32 %f16, %f17;\n" -" st.global.v4.f32 [%rd35+0], {%f15,%f14,%f13,%f16};\n" -"$Lt_2_9218:\n" -" .loc 17 266 0\n" -" exit;\n" -"$LDWend_interp:\n" -" }\n" -; diff --git a/lib/gpu/pppm_f.ptx b/lib/gpu/pppm_f.ptx deleted file mode 100644 index 85e890e5c3..0000000000 --- a/lib/gpu/pppm_f.ptx +++ /dev/null @@ -1,881 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009b0b_00000000-9_lal_pppm.cpp3.i (/home/sjplimp/ccBI#.wCkpTI) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009b0b_00000000-8_lal_pppm.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 17 "lal_pppm.cu" - .file 18 "/usr/local/cuda/include/common_functions.h" - .file 19 "/usr/local/cuda/include/math_functions.h" - .file 20 "/usr/local/cuda/include/math_constants.h" - .file 21 "/usr/local/cuda/include/device_functions.h" - .file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 26 "/usr/local/cuda/include/surface_functions.h" - .file 27 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - .global .texref pos_tex; - .global .texref q_tex; - - .entry particle_map ( - .param .u64 __cudaparm_particle_map_x_, - .param .u64 __cudaparm_particle_map_q_, - .param .f32 __cudaparm_particle_map_delvolinv, - .param .s32 __cudaparm_particle_map_nlocal, - .param .u64 __cudaparm_particle_map_counts, - .param .u64 __cudaparm_particle_map_ans, - .param .f32 __cudaparm_particle_map_b_lo_x, - .param .f32 __cudaparm_particle_map_b_lo_y, - .param .f32 __cudaparm_particle_map_b_lo_z, - .param .f32 __cudaparm_particle_map_delxinv, - .param .f32 __cudaparm_particle_map_delyinv, - .param .f32 __cudaparm_particle_map_delzinv, - .param .s32 __cudaparm_particle_map_nlocal_x, - .param .s32 __cudaparm_particle_map_nlocal_y, - .param .s32 __cudaparm_particle_map_nlocal_z, - .param .s32 __cudaparm_particle_map_atom_stride, - .param .s32 __cudaparm_particle_map_max_atoms, - .param .u64 __cudaparm_particle_map_error) - { - .reg .u32 %r<50>; - .reg .u64 %rd<12>; - .reg .f32 %f<44>; - .reg .pred %p<11>; - .loc 17 50 0 -$LDWbegin_particle_map: - cvt.s32.u32 %r1, %ntid.x; - cvt.s32.u32 %r2, %ctaid.x; - mul24.lo.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %nctaid.x; - mul24.lo.s32 %r5, %r4, %r1; - mov.u32 %r6, %tid.x; - add.u32 %r7, %r3, %r6; - sub.s32 %r8, %r5, 1; - mul.lo.s32 %r9, %r7, 64; - div.s32 %r10, %r9, %r5; - mul.lo.s32 %r11, %r8, %r10; - sub.s32 %r12, %r9, %r11; - ld.param.s32 %r13, [__cudaparm_particle_map_nlocal]; - setp.le.s32 %p1, %r13, %r12; - @%p1 bra $Lt_0_7426; - .loc 17 62 0 - mov.u32 %r14, %r12; - mov.s32 %r15, 0; - mov.u32 %r16, %r15; - mov.s32 %r17, 0; - mov.u32 %r18, %r17; - mov.s32 %r19, 0; - mov.u32 %r20, %r19; - tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}]; - mov.f32 %f5, %f1; - mov.f32 %f6, %f2; - mov.f32 %f7, %f3; - .loc 17 64 0 - mov.u32 %r21, %r12; - mov.s32 %r22, 0; - mov.u32 %r23, %r22; - mov.s32 %r24, 0; - mov.u32 %r25, %r24; - mov.s32 %r26, 0; - mov.u32 %r27, %r26; - tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}]; - mov.f32 %f12, %f8; - ld.param.f32 %f13, [__cudaparm_particle_map_delvolinv]; - mul.ftz.f32 %f14, %f13, %f12; - mov.f32 %f15, 0f00000000; // 0 - setp.neu.ftz.f32 %p2, %f14, %f15; - @!%p2 bra $Lt_0_7426; - .loc 17 67 0 - ld.param.f32 %f16, [__cudaparm_particle_map_b_lo_x]; - sub.ftz.f32 %f17, %f5, %f16; - ld.param.f32 %f18, [__cudaparm_particle_map_delxinv]; - mul.ftz.f32 %f19, %f18, %f17; - mov.f32 %f20, 0f00000000; // 0 - setp.lt.ftz.f32 %p3, %f19, %f20; - @%p3 bra $Lt_0_8706; - ld.param.f32 %f21, [__cudaparm_particle_map_b_lo_y]; - sub.ftz.f32 %f22, %f6, %f21; - ld.param.f32 %f23, [__cudaparm_particle_map_delyinv]; - mul.ftz.f32 %f24, %f23, %f22; - mov.f32 %f25, 0f00000000; // 0 - setp.lt.ftz.f32 %p4, %f24, %f25; - @%p4 bra $Lt_0_8706; - ld.param.f32 %f26, [__cudaparm_particle_map_b_lo_z]; - sub.ftz.f32 %f27, %f7, %f26; - ld.param.f32 %f28, [__cudaparm_particle_map_delzinv]; - mul.ftz.f32 %f29, %f28, %f27; - mov.f32 %f30, 0f00000000; // 0 - setp.lt.ftz.f32 %p5, %f29, %f30; - @%p5 bra $Lt_0_8706; - cvt.rzi.ftz.s32.f32 %r28, %f19; - ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x]; - setp.ge.s32 %p6, %r28, %r29; - @%p6 bra $Lt_0_8706; - cvt.rzi.ftz.s32.f32 %r30, %f24; - ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y]; - setp.ge.s32 %p7, %r30, %r31; - @%p7 bra $Lt_0_8706; - cvt.rzi.ftz.s32.f32 %r32, %f29; - ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z]; - setp.gt.s32 %p8, %r33, %r32; - @%p8 bra $L_0_4866; -$Lt_0_8706: -$L_0_5122: - .loc 17 76 0 - mov.s32 %r34, 1; - ld.param.u64 %rd1, [__cudaparm_particle_map_error]; - st.global.s32 [%rd1+0], %r34; - bra.uni $Lt_0_7426; -$L_0_4866: - .loc 17 83 0 - mul.lo.s32 %r35, %r32, %r31; - add.s32 %r36, %r30, %r35; - mul.lo.s32 %r37, %r36, %r29; - add.s32 %r38, %r28, %r37; - ld.param.u64 %rd2, [__cudaparm_particle_map_counts]; - cvt.s64.s32 %rd3, %r38; - mul.wide.s32 %rd4, %r38, 4; - add.u64 %rd5, %rd2, %rd4; - mov.s32 %r39, 1; - atom.global.add.s32 %r40, [%rd5], %r39; - mov.s32 %r41, %r40; - ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms]; - setp.gt.s32 %p9, %r42, %r41; - @%p9 bra $Lt_0_7682; - .loc 17 85 0 - mov.s32 %r43, 2; - ld.param.u64 %rd6, [__cudaparm_particle_map_error]; - st.global.s32 [%rd6+0], %r43; - .loc 16 118 0 - mov.s32 %r44, -1; - atom.global.add.s32 %r45, [%rd5], %r44; - bra.uni $Lt_0_7426; -$Lt_0_7682: - .loc 17 88 0 - ld.param.u64 %rd7, [__cudaparm_particle_map_ans]; - ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride]; - mul.lo.s32 %r47, %r46, %r41; - add.s32 %r48, %r38, %r47; - cvt.s64.s32 %rd8, %r48; - mul.wide.s32 %rd9, %r48, 16; - add.u64 %rd10, %rd7, %rd9; - cvt.rn.f32.s32 %f31, %r28; - mov.f32 %f32, 0f3f000000; // 0.5 - add.ftz.f32 %f33, %f31, %f32; - sub.ftz.f32 %f34, %f33, %f19; - cvt.rn.f32.s32 %f35, %r30; - mov.f32 %f36, 0f3f000000; // 0.5 - add.ftz.f32 %f37, %f35, %f36; - sub.ftz.f32 %f38, %f37, %f24; - cvt.rn.f32.s32 %f39, %r32; - mov.f32 %f40, 0f3f000000; // 0.5 - add.ftz.f32 %f41, %f39, %f40; - sub.ftz.f32 %f42, %f41, %f29; - st.global.v4.f32 [%rd10+0], {%f34,%f38,%f42,%f14}; -$Lt_0_7426: -$L_0_4610: -$Lt_0_6914: -$Lt_0_6402: - .loc 17 92 0 - exit; -$LDWend_particle_map: - } // particle_map - - .entry make_rho ( - .param .u64 __cudaparm_make_rho_counts, - .param .u64 __cudaparm_make_rho_atoms, - .param .u64 __cudaparm_make_rho_brick, - .param .u64 __cudaparm_make_rho__rho_coeff, - .param .s32 __cudaparm_make_rho_atom_stride, - .param .s32 __cudaparm_make_rho_npts_x, - .param .s32 __cudaparm_make_rho_npts_y, - .param .s32 __cudaparm_make_rho_npts_z, - .param .s32 __cudaparm_make_rho_nlocal_x, - .param .s32 __cudaparm_make_rho_nlocal_y, - .param .s32 __cudaparm_make_rho_nlocal_z, - .param .s32 __cudaparm_make_rho_order_m_1, - .param .s32 __cudaparm_make_rho_order, - .param .s32 __cudaparm_make_rho_order2) - { - .reg .u32 %r<119>; - .reg .u64 %rd<57>; - .reg .f32 %f<26>; - .reg .pred %p<27>; - .shared .align 4 .b8 __cuda___cuda_local_var_32578_33_non_const_rho_coeff168[256]; - .shared .align 4 .b8 __cuda___cuda_local_var_32579_33_non_const_front424[320]; - .shared .align 4 .b8 __cuda___cuda_local_var_32580_33_non_const_ans744[2048]; - .loc 17 101 0 -$LDWbegin_make_rho: - ld.param.s32 %r1, [__cudaparm_make_rho_order2]; - ld.param.s32 %r2, [__cudaparm_make_rho_order]; - add.s32 %r3, %r1, %r2; - cvt.s32.u32 %r4, %tid.x; - setp.le.s32 %p1, %r3, %r4; - @%p1 bra $Lt_1_16898; - .loc 17 108 0 - mov.u64 %rd1, __cuda___cuda_local_var_32578_33_non_const_rho_coeff168; - cvt.s64.s32 %rd2, %r4; - mul.wide.s32 %rd3, %r4, 4; - ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f32 %f1, [%rd5+0]; - add.u64 %rd6, %rd3, %rd1; - st.shared.f32 [%rd6+0], %f1; -$Lt_1_16898: - mov.u64 %rd1, __cuda___cuda_local_var_32578_33_non_const_rho_coeff168; - shr.s32 %r5, %r4, 31; - mov.s32 %r6, 31; - and.b32 %r7, %r5, %r6; - add.s32 %r8, %r7, %r4; - shr.s32 %r9, %r8, 5; - mul.lo.s32 %r10, %r9, 32; - sub.s32 %r11, %r4, %r10; - setp.lt.s32 %p2, %r11, %r2; - @!%p2 bra $Lt_1_17410; - .loc 17 114 0 - mov.u64 %rd7, __cuda___cuda_local_var_32579_33_non_const_front424; - mov.f32 %f2, 0f00000000; // 0 - cvt.s64.s32 %rd8, %r11; - shr.s32 %r12, %r4, 31; - mov.s32 %r13, 31; - and.b32 %r14, %r12, %r13; - add.s32 %r15, %r14, %r4; - shr.s32 %r16, %r15, 5; - cvt.s64.s32 %rd9, %r16; - mul.wide.s32 %rd10, %r16, 40; - add.u64 %rd11, %rd8, %rd10; - mul.lo.u64 %rd12, %rd11, 4; - add.u64 %rd13, %rd7, %rd12; - st.shared.f32 [%rd13+128], %f2; -$Lt_1_17410: - mov.u64 %rd7, __cuda___cuda_local_var_32579_33_non_const_front424; - .loc 17 116 0 - bar.sync 0; - ld.param.s32 %r17, [__cudaparm_make_rho_npts_x]; - shr.s32 %r18, %r17, 31; - mov.s32 %r19, 31; - and.b32 %r20, %r18, %r19; - add.s32 %r21, %r20, %r17; - shr.s32 %r22, %r21, 5; - add.s32 %r23, %r22, 1; - mov.u32 %r24, 0; - setp.le.s32 %p3, %r23, %r24; - @%p3 bra $Lt_1_17922; - shr.s32 %r25, %r4, 31; - mov.s32 %r26, 31; - and.b32 %r27, %r25, %r26; - add.s32 %r28, %r27, %r4; - shr.s32 %r29, %r28, 5; - add.s32 %r30, %r11, 32; - ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y]; - ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x]; - mul.lo.s32 %r33, %r31, %r32; - mov.u32 %r34, %ctaid.x; - mul.lo.u32 %r35, %r34, 2; - add.u32 %r36, %r29, %r35; - ld.param.s32 %r37, [__cudaparm_make_rho_npts_y]; - div.s32 %r38, %r36, %r37; - ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1]; - setp.lt.s32 %p4, %r38, %r39; - sub.s32 %r40, %r39, %r38; - mov.s32 %r41, 0; - selp.s32 %r42, %r40, %r41, %p4; - ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z]; - setp.ge.s32 %p5, %r38, %r43; - sub.s32 %r44, %r43, %r38; - add.s32 %r45, %r44, %r2; - sub.s32 %r46, %r45, 1; - selp.s32 %r47, %r46, %r2, %p5; - rem.s32 %r48, %r36, %r37; - setp.lt.s32 %p6, %r48, %r39; - sub.s32 %r49, %r39, %r48; - mov.s32 %r50, 0; - selp.s32 %r51, %r49, %r50, %p6; - setp.ge.s32 %p7, %r48, %r31; - sub.s32 %r52, %r31, %r48; - add.s32 %r53, %r52, %r2; - sub.s32 %r54, %r53, 1; - selp.s32 %r55, %r54, %r2, %p7; - mov.s32 %r56, %r23; - mov.s32 %r57, 0; - setp.gt.s32 %p8, %r2, %r57; - mov.s32 %r58, 0; - cvt.s64.s32 %rd14, %r11; - cvt.s64.s32 %rd15, %r29; - mul.lo.s32 %r59, %r23, 32; - mul.wide.s32 %rd16, %r29, 40; - add.u64 %rd17, %rd14, %rd16; - ld.param.s32 %r60, [__cudaparm_make_rho_npts_z]; - setp.gt.s32 %p9, %r60, %r38; - mul.lo.u64 %rd18, %rd17, 4; - selp.s32 %r61, 1, 0, %p9; - add.u64 %rd19, %rd18, %rd7; - mov.u64 %rd20, __cuda___cuda_local_var_32580_33_non_const_ans744; - mov.s32 %r62, %r56; -$Lt_1_18434: - // Loop body line 116, nesting depth: 1, estimated iterations: unknown - @!%p8 bra $Lt_1_18690; - mov.s32 %r63, %r2; - cvt.s64.s32 %rd21, %r4; - mul.wide.s32 %rd22, %r4, 4; - add.u64 %rd23, %rd20, %rd22; - mov.s32 %r64, 0; - mov.s32 %r65, %r63; -$Lt_1_19202: - // Loop body line 116, nesting depth: 2, estimated iterations: unknown - .loc 17 140 0 - mov.f32 %f3, 0f00000000; // 0 - st.shared.f32 [%rd23+0], %f3; - add.s32 %r64, %r64, 1; - add.u64 %rd23, %rd23, 256; - setp.ne.s32 %p10, %r64, %r2; - @%p10 bra $Lt_1_19202; -$Lt_1_18690: - add.s32 %r66, %r11, %r58; - set.lt.u32.s32 %r67, %r66, %r32; - neg.s32 %r68, %r67; - and.b32 %r69, %r61, %r68; - mov.u32 %r70, 0; - setp.eq.s32 %p11, %r69, %r70; - @%p11 bra $Lt_1_20226; - .loc 17 143 0 - mov.s32 %r71, %r42; - setp.ge.s32 %p12, %r42, %r47; - @%p12 bra $Lt_1_20226; - sub.s32 %r72, %r47, %r42; - setp.lt.s32 %p13, %r51, %r55; - mov.s32 %r73, %r72; -$Lt_1_20738: - // Loop body line 143, nesting depth: 2, estimated iterations: unknown - .loc 17 145 0 - mov.s32 %r74, %r51; - @!%p13 bra $Lt_1_20994; - sub.s32 %r75, %r55, %r51; - sub.s32 %r76, %r71, %r42; - add.s32 %r77, %r38, %r42; - add.s32 %r78, %r48, %r51; - sub.s32 %r79, %r77, %r39; - sub.s32 %r80, %r78, %r39; - add.s32 %r81, %r76, %r79; - mul.lo.s32 %r82, %r33, %r81; - ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride]; - ld.param.u64 %rd24, [__cudaparm_make_rho_counts]; - mov.s32 %r84, %r75; -$Lt_1_21506: - // Loop body line 145, nesting depth: 3, estimated iterations: unknown - .loc 17 147 0 - sub.s32 %r85, %r74, %r51; - add.s32 %r86, %r85, %r80; - mul.lo.s32 %r87, %r86, %r32; - add.s32 %r88, %r82, %r87; - add.s32 %r89, %r66, %r88; - cvt.s64.s32 %rd25, %r89; - mul.wide.s32 %rd26, %r89, 4; - add.u64 %rd27, %rd24, %rd26; - ld.global.s32 %r90, [%rd27+0]; - mul.lo.s32 %r91, %r90, %r83; - .loc 17 148 0 - mov.s32 %r92, %r89; - setp.ge.s32 %p14, %r89, %r91; - @%p14 bra $Lt_1_21762; - sub.s32 %r93, %r3, 1; - cvt.s64.s32 %rd28, %r83; - mul.wide.s32 %rd29, %r83, 16; - mov.s32 %r94, -1; - setp.gt.s32 %p15, %r93, %r94; - ld.param.u64 %rd30, [__cudaparm_make_rho_atoms]; - mul.lo.u64 %rd31, %rd25, 16; - add.u64 %rd32, %rd30, %rd31; -$Lt_1_22274: - // Loop body line 148, nesting depth: 4, estimated iterations: unknown - .loc 17 149 0 - ld.global.f32 %f4, [%rd32+0]; - @!%p15 bra $Lt_1_29954; - sub.s32 %r95, %r93, %r74; - mov.s32 %r96, -1; - sub.s32 %r97, %r96, %r74; - cvt.s64.s32 %rd33, %r2; - mul.wide.s32 %rd34, %r2, 4; - ld.global.f32 %f5, [%rd32+4]; - ld.global.f32 %f6, [%rd32+8]; - cvt.s64.s32 %rd35, %r95; - mul.wide.s32 %rd36, %r95, 4; - add.u64 %rd37, %rd1, %rd36; - sub.s32 %r98, %r93, %r71; - cvt.s64.s32 %rd38, %r98; - mul.wide.s32 %rd39, %r98, 4; - add.u64 %rd40, %rd1, %rd39; - mov.f32 %f7, 0f00000000; // 0 - mov.f32 %f8, 0f00000000; // 0 -$Lt_1_23042: - // Loop body line 149, nesting depth: 5, estimated iterations: unknown - .loc 17 154 0 - ld.shared.f32 %f9, [%rd37+0]; - fma.rn.ftz.f32 %f8, %f8, %f5, %f9; - .loc 17 155 0 - ld.shared.f32 %f10, [%rd40+0]; - fma.rn.ftz.f32 %f7, %f7, %f6, %f10; - sub.u64 %rd40, %rd40, %rd34; - sub.s32 %r95, %r95, %r2; - sub.u64 %rd37, %rd37, %rd34; - setp.gt.s32 %p16, %r95, %r97; - @%p16 bra $Lt_1_23042; - bra.uni $Lt_1_22530; -$Lt_1_29954: - mov.f32 %f7, 0f00000000; // 0 - mov.f32 %f8, 0f00000000; // 0 -$Lt_1_22530: - .loc 17 157 0 - ld.global.f32 %f11, [%rd32+12]; - mul.ftz.f32 %f12, %f7, %f8; - mul.ftz.f32 %f13, %f11, %f12; - @!%p8 bra $Lt_1_23554; - mov.s32 %r99, %r2; - cvt.s64.s32 %rd41, %r4; - mul.wide.s32 %rd42, %r4, 4; - add.u64 %rd43, %rd20, %rd42; - mov.s32 %r100, 0; - mov.s32 %r101, %r99; -$Lt_1_24066: - // Loop body line 157, nesting depth: 5, estimated iterations: unknown - .loc 17 161 0 - add.s32 %r102, %r100, %r1; - mov.s32 %r103, %r102; - setp.lt.s32 %p17, %r102, %r100; - @%p17 bra $Lt_1_30466; - cvt.s64.s32 %rd44, %r2; - mul.wide.s32 %rd34, %r2, 4; - cvt.s64.s32 %rd45, %r102; - mul.wide.s32 %rd46, %r102, 4; - add.u64 %rd47, %rd1, %rd46; - mov.f32 %f14, 0f00000000; // 0 -$Lt_1_24834: - // Loop body line 161, nesting depth: 6, estimated iterations: unknown - .loc 17 162 0 - ld.shared.f32 %f15, [%rd47+0]; - fma.rn.ftz.f32 %f14, %f4, %f14, %f15; - sub.s32 %r103, %r103, %r2; - sub.u64 %rd47, %rd47, %rd34; - setp.ge.s32 %p18, %r103, %r100; - @%p18 bra $Lt_1_24834; - bra.uni $Lt_1_24322; -$Lt_1_30466: - mov.f32 %f14, 0f00000000; // 0 -$Lt_1_24322: - .loc 17 163 0 - ld.shared.f32 %f16, [%rd43+0]; - fma.rn.ftz.f32 %f17, %f14, %f13, %f16; - st.shared.f32 [%rd43+0], %f17; - add.s32 %r100, %r100, 1; - add.u64 %rd43, %rd43, 256; - setp.ne.s32 %p19, %r100, %r2; - @%p19 bra $Lt_1_24066; -$Lt_1_23554: - add.s32 %r92, %r92, %r83; - add.u64 %rd32, %rd29, %rd32; - setp.gt.s32 %p20, %r91, %r92; - @%p20 bra $Lt_1_22274; -$Lt_1_21762: - add.s32 %r74, %r74, 1; - setp.ne.s32 %p21, %r55, %r74; - @%p21 bra $Lt_1_21506; -$Lt_1_20994: - add.s32 %r71, %r71, 1; - setp.ne.s32 %p22, %r47, %r71; - @%p22 bra $Lt_1_20738; -$Lt_1_20226: -$Lt_1_19714: - .loc 17 172 0 - bar.sync 0; - @!%p2 bra $Lt_1_26626; - .loc 17 174 0 - ld.shared.f32 %f18, [%rd19+128]; - st.shared.f32 [%rd19+0], %f18; - .loc 17 175 0 - mov.f32 %f19, 0f00000000; // 0 - st.shared.f32 [%rd19+128], %f19; - bra.uni $Lt_1_26370; -$Lt_1_26626: - .loc 17 177 0 - mov.f32 %f20, 0f00000000; // 0 - st.shared.f32 [%rd19+0], %f20; -$Lt_1_26370: - @!%p8 bra $Lt_1_26882; - mov.s32 %r104, %r2; - cvt.s64.s32 %rd48, %r4; - mov.s32 %r105, %r11; - add.s32 %r106, %r11, %r2; - mul.wide.s32 %rd49, %r4, 4; - add.u64 %rd50, %rd20, %rd49; - mov.s64 %rd51, %rd19; - mov.s32 %r107, %r104; -$Lt_1_27394: - // Loop body line 177, nesting depth: 2, estimated iterations: unknown - .loc 17 180 0 - ld.shared.f32 %f21, [%rd50+0]; - ld.shared.f32 %f22, [%rd51+0]; - add.ftz.f32 %f23, %f21, %f22; - st.shared.f32 [%rd51+0], %f23; - .loc 17 181 0 - bar.sync 0; - add.s32 %r105, %r105, 1; - add.u64 %rd51, %rd51, 4; - add.u64 %rd50, %rd50, 256; - setp.ne.s32 %p23, %r105, %r106; - @%p23 bra $Lt_1_27394; -$Lt_1_26882: - set.lt.u32.s32 %r108, %r66, %r17; - neg.s32 %r109, %r108; - and.b32 %r110, %r61, %r109; - mov.u32 %r111, 0; - setp.eq.s32 %p24, %r110, %r111; - @%p24 bra $Lt_1_27906; - .loc 17 185 0 - ld.shared.f32 %f24, [%rd19+0]; - ld.param.u64 %rd52, [__cudaparm_make_rho_brick]; - add.s32 %r112, %r11, %r58; - mul.lo.s32 %r113, %r37, %r17; - mul.lo.s32 %r114, %r38, %r113; - mul.lo.s32 %r115, %r48, %r17; - add.s32 %r116, %r114, %r115; - add.s32 %r117, %r112, %r116; - cvt.s64.s32 %rd53, %r117; - mul.wide.s32 %rd54, %r117, 4; - add.u64 %rd55, %rd52, %rd54; - st.global.f32 [%rd55+0], %f24; -$Lt_1_27906: - add.s32 %r58, %r58, 32; - setp.ne.s32 %p25, %r58, %r59; - @%p25 bra $Lt_1_18434; -$Lt_1_17922: - .loc 17 189 0 - exit; -$LDWend_make_rho: - } // make_rho - - .entry interp ( - .param .u64 __cudaparm_interp_x_, - .param .u64 __cudaparm_interp_q_, - .param .s32 __cudaparm_interp_nlocal, - .param .u64 __cudaparm_interp_brick, - .param .u64 __cudaparm_interp__rho_coeff, - .param .s32 __cudaparm_interp_npts_x, - .param .s32 __cudaparm_interp_npts_yx, - .param .f32 __cudaparm_interp_b_lo_x, - .param .f32 __cudaparm_interp_b_lo_y, - .param .f32 __cudaparm_interp_b_lo_z, - .param .f32 __cudaparm_interp_delxinv, - .param .f32 __cudaparm_interp_delyinv, - .param .f32 __cudaparm_interp_delzinv, - .param .s32 __cudaparm_interp_order, - .param .s32 __cudaparm_interp_order2, - .param .f32 __cudaparm_interp_qqrd2e_scale, - .param .u64 __cudaparm_interp_ans) - { - .reg .u32 %r<56>; - .reg .u64 %rd<37>; - .reg .f32 %f<69>; - .reg .pred %p<14>; - .shared .align 4 .b8 __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888[256]; - .shared .align 4 .b8 __cuda___cuda_local_var_32677_33_non_const_rho1d_03144[2048]; - .shared .align 4 .b8 __cuda___cuda_local_var_32678_33_non_const_rho1d_15192[2048]; - // __cuda_local_var_32694_12_non_const_ek = 16 - .loc 17 199 0 -$LDWbegin_interp: - ld.param.s32 %r1, [__cudaparm_interp_order2]; - ld.param.s32 %r2, [__cudaparm_interp_order]; - add.s32 %r3, %r1, %r2; - cvt.s32.u32 %r4, %tid.x; - setp.le.s32 %p1, %r3, %r4; - @%p1 bra $Lt_2_8706; - .loc 17 206 0 - mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888; - cvt.s64.s32 %rd2, %r4; - mul.wide.s32 %rd3, %r4, 4; - ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f32 %f1, [%rd5+0]; - add.u64 %rd6, %rd3, %rd1; - st.shared.f32 [%rd6+0], %f1; -$Lt_2_8706: - mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888; - .loc 17 207 0 - bar.sync 0; - mov.u32 %r5, %ctaid.x; - mov.u32 %r6, %ntid.x; - mul.lo.u32 %r7, %r5, %r6; - add.u32 %r8, %r4, %r7; - ld.param.s32 %r9, [__cudaparm_interp_nlocal]; - setp.le.s32 %p2, %r9, %r8; - @%p2 bra $Lt_2_9218; - .loc 17 215 0 - mov.u32 %r10, %r8; - mov.s32 %r11, 0; - mov.u32 %r12, %r11; - mov.s32 %r13, 0; - mov.u32 %r14, %r13; - mov.s32 %r15, 0; - mov.u32 %r16, %r15; - tex.1d.v4.f32.s32 {%f2,%f3,%f4,%f5},[pos_tex,{%r10,%r12,%r14,%r16}]; - mov.f32 %f6, %f2; - mov.f32 %f7, %f3; - mov.f32 %f8, %f4; - .loc 17 216 0 - mov.u32 %r17, %r8; - mov.s32 %r18, 0; - mov.u32 %r19, %r18; - mov.s32 %r20, 0; - mov.u32 %r21, %r20; - mov.s32 %r22, 0; - mov.u32 %r23, %r22; - tex.1d.v4.f32.s32 {%f9,%f10,%f11,%f12},[q_tex,{%r17,%r19,%r21,%r23}]; - mov.f32 %f13, %f9; - ld.param.f32 %f14, [__cudaparm_interp_qqrd2e_scale]; - mul.ftz.f32 %f15, %f14, %f13; - mov.f32 %f16, 0f00000000; // 0 - setp.neu.ftz.f32 %p3, %f15, %f16; - @!%p3 bra $Lt_2_9986; - mov.s32 %r24, 0; - setp.gt.s32 %p4, %r2, %r24; - ld.param.f32 %f17, [__cudaparm_interp_b_lo_x]; - sub.ftz.f32 %f18, %f6, %f17; - ld.param.f32 %f19, [__cudaparm_interp_delxinv]; - mul.ftz.f32 %f20, %f19, %f18; - @!%p4 bra $Lt_2_16386; - mov.u64 %rd7, __cuda___cuda_local_var_32677_33_non_const_rho1d_03144; - mov.u64 %rd8, __cuda___cuda_local_var_32678_33_non_const_rho1d_15192; - cvt.rzi.ftz.s32.f32 %r25, %f20; - cvt.rn.f32.s32 %f21, %r25; - mov.f32 %f22, 0f3f000000; // 0.5 - add.ftz.f32 %f23, %f21, %f22; - sub.ftz.f32 %f24, %f23, %f20; - ld.param.f32 %f25, [__cudaparm_interp_b_lo_y]; - sub.ftz.f32 %f26, %f7, %f25; - ld.param.f32 %f27, [__cudaparm_interp_delyinv]; - mul.ftz.f32 %f28, %f27, %f26; - cvt.rzi.ftz.s32.f32 %r26, %f28; - cvt.rn.f32.s32 %f29, %r26; - mov.f32 %f30, 0f3f000000; // 0.5 - add.ftz.f32 %f31, %f29, %f30; - sub.ftz.f32 %f32, %f31, %f28; - mov.s32 %r27, %r2; - cvt.s64.s32 %rd9, %r4; - mov.s32 %r28, %r1; - mul.wide.s32 %rd3, %r4, 4; - add.u64 %rd10, %rd3, %rd7; - add.u64 %rd11, %rd3, %rd8; - mov.s32 %r29, 0; - mov.s32 %r30, %r27; -$Lt_2_10754: - // Loop body line 216, nesting depth: 1, estimated iterations: unknown - .loc 17 235 0 - mov.f32 %f33, 0f00000000; // 0 - mov.f32 %f34, 0f00000000; // 0 - st.shared.f32 [%rd10+0], %f34; - .loc 17 236 0 - mov.f32 %f35, 0f00000000; // 0 - mov.f32 %f36, 0f00000000; // 0 - st.shared.f32 [%rd11+0], %f36; - .loc 17 237 0 - mov.s32 %r31, %r28; - setp.lt.s32 %p5, %r28, %r29; - @%p5 bra $Lt_2_11010; - cvt.s64.s32 %rd12, %r2; - mul.wide.s32 %rd13, %r2, 4; - cvt.s64.s32 %rd14, %r28; - mul.wide.s32 %rd15, %r28, 4; - add.u64 %rd16, %rd1, %rd15; -$Lt_2_11522: - // Loop body line 237, nesting depth: 2, estimated iterations: unknown - .loc 17 238 0 - ld.shared.f32 %f37, [%rd16+0]; - fma.rn.ftz.f32 %f33, %f33, %f24, %f37; - st.shared.f32 [%rd10+0], %f33; - .loc 17 239 0 - fma.rn.ftz.f32 %f35, %f35, %f32, %f37; - st.shared.f32 [%rd11+0], %f35; - sub.s32 %r31, %r31, %r2; - sub.u64 %rd16, %rd16, %rd13; - setp.ge.s32 %p6, %r31, %r29; - @%p6 bra $Lt_2_11522; -$Lt_2_11010: - add.s32 %r29, %r29, 1; - add.s32 %r28, %r28, 1; - add.u64 %rd11, %rd11, 256; - add.u64 %rd10, %rd10, 256; - setp.ne.s32 %p7, %r28, %r3; - @%p7 bra $Lt_2_10754; - bra.uni $Lt_2_10242; -$Lt_2_16386: - cvt.rzi.ftz.s32.f32 %r25, %f20; - mov.u64 %rd8, __cuda___cuda_local_var_32678_33_non_const_rho1d_15192; - mov.u64 %rd7, __cuda___cuda_local_var_32677_33_non_const_rho1d_03144; -$Lt_2_10242: - .loc 17 243 0 - ld.param.f32 %f38, [__cudaparm_interp_b_lo_z]; - sub.ftz.f32 %f39, %f8, %f38; - ld.param.f32 %f40, [__cudaparm_interp_delzinv]; - mul.ftz.f32 %f41, %f40, %f39; - cvt.rzi.ftz.s32.f32 %r32, %f41; - ld.param.s32 %r33, [__cudaparm_interp_npts_yx]; - mul.lo.s32 %r34, %r32, %r33; - add.s32 %r35, %r25, %r34; - @!%p4 bra $Lt_2_16898; - cvt.rn.f32.s32 %f42, %r32; - mov.f32 %f43, 0f3f000000; // 0.5 - add.ftz.f32 %f44, %f42, %f43; - sub.ftz.f32 %f45, %f44, %f41; - mov.s32 %r36, %r2; - ld.param.f32 %f46, [__cudaparm_interp_b_lo_y]; - sub.ftz.f32 %f47, %f7, %f46; - cvt.s64.s32 %rd17, %r4; - ld.param.f32 %f48, [__cudaparm_interp_delyinv]; - mul.ftz.f32 %f49, %f48, %f47; - cvt.rzi.ftz.s32.f32 %r37, %f49; - ld.param.s32 %r38, [__cudaparm_interp_npts_x]; - mul.lo.s32 %r39, %r37, %r38; - mul.wide.s32 %rd3, %r4, 4; - add.s32 %r40, %r39, %r35; - add.u64 %rd18, %rd3, %rd7; - add.u64 %rd19, %rd3, %rd8; - cvt.s64.s32 %rd20, %r38; - mul.wide.s32 %rd21, %r38, 16; - mov.s32 %r41, %r40; - ld.param.u64 %rd22, [__cudaparm_interp_brick]; - mov.s32 %r42, 0; - mov.f32 %f50, 0f00000000; // 0 - mov.f32 %f51, 0f00000000; // 0 - mov.f32 %f52, 0f00000000; // 0 - mov.s32 %r43, %r36; -$Lt_2_12802: - // Loop body line 243, nesting depth: 1, estimated iterations: unknown - .loc 17 246 0 - add.s32 %r44, %r42, %r1; - mov.s32 %r45, %r44; - setp.lt.s32 %p8, %r44, %r42; - @%p8 bra $Lt_2_17154; - cvt.s64.s32 %rd23, %r2; - mul.wide.s32 %rd13, %r2, 4; - cvt.s64.s32 %rd24, %r44; - mul.wide.s32 %rd25, %r44, 4; - add.u64 %rd26, %rd1, %rd25; - mov.f32 %f53, 0f00000000; // 0 -$Lt_2_13570: - // Loop body line 246, nesting depth: 2, estimated iterations: unknown - .loc 17 247 0 - ld.shared.f32 %f54, [%rd26+0]; - fma.rn.ftz.f32 %f53, %f45, %f53, %f54; - sub.s32 %r45, %r45, %r2; - sub.u64 %rd26, %rd26, %rd13; - setp.ge.s32 %p9, %r45, %r42; - @%p9 bra $Lt_2_13570; - bra.uni $Lt_2_13058; -$Lt_2_17154: - mov.f32 %f53, 0f00000000; // 0 -$Lt_2_13058: - .loc 17 249 0 - mov.s32 %r46, %r41; - mov.s32 %r47, %r2; - mul.ftz.f32 %f55, %f15, %f53; - mov.s32 %r48, %r46; - mov.s64 %rd27, %rd19; - cvt.s64.s32 %rd28, %r46; - mul.wide.s32 %rd29, %r46, 16; - mov.s32 %r49, 0; - mov.s32 %r50, %r47; -$Lt_2_14594: - // Loop body line 249, nesting depth: 2, estimated iterations: unknown - mov.s32 %r51, %r2; - mov.s32 %r52, %r48; - add.s32 %r53, %r48, %r2; - mov.s64 %rd30, %rd18; - ld.shared.f32 %f56, [%rd27+0]; - add.u64 %rd31, %rd29, %rd22; - mul.ftz.f32 %f57, %f55, %f56; - mov.s32 %r54, %r51; -$Lt_2_15362: - // Loop body line 249, nesting depth: 3, estimated iterations: unknown - .loc 17 253 0 - ld.shared.f32 %f58, [%rd30+0]; - mul.ftz.f32 %f59, %f58, %f57; - ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd31+0]; - .loc 17 255 0 - mul.ftz.f32 %f63, %f59, %f60; - sub.ftz.f32 %f52, %f52, %f63; - .loc 17 256 0 - mul.ftz.f32 %f64, %f59, %f61; - sub.ftz.f32 %f51, %f51, %f64; - .loc 17 257 0 - mul.ftz.f32 %f65, %f59, %f62; - sub.ftz.f32 %f50, %f50, %f65; - add.s32 %r52, %r52, 1; - add.u64 %rd31, %rd31, 16; - add.u64 %rd30, %rd30, 256; - setp.ne.s32 %p10, %r52, %r53; - @%p10 bra $Lt_2_15362; - add.s32 %r49, %r49, 1; - add.s32 %r48, %r48, %r38; - add.u64 %rd29, %rd29, %rd21; - add.u64 %rd27, %rd27, 256; - setp.ne.s32 %p11, %r49, %r2; - @%p11 bra $Lt_2_14594; - add.s32 %r42, %r42, 1; - add.s32 %r41, %r46, %r33; - setp.ne.s32 %p12, %r42, %r2; - @%p12 bra $Lt_2_12802; - bra.uni $Lt_2_9730; -$Lt_2_16898: - mov.f32 %f50, 0f00000000; // 0 - mov.f32 %f51, 0f00000000; // 0 - mov.f32 %f52, 0f00000000; // 0 - bra.uni $Lt_2_9730; -$Lt_2_9986: - mov.f32 %f50, 0f00000000; // 0 - mov.f32 %f51, 0f00000000; // 0 - mov.f32 %f52, 0f00000000; // 0 -$Lt_2_9730: - .loc 17 264 0 - ld.param.u64 %rd32, [__cudaparm_interp_ans]; - cvt.s64.s32 %rd33, %r8; - mul.wide.s32 %rd34, %r8, 16; - add.u64 %rd35, %rd32, %rd34; - mov.f32 %f66, %f67; - st.global.v4.f32 [%rd35+0], {%f52,%f51,%f50,%f66}; -$Lt_2_9218: - .loc 17 266 0 - exit; -$LDWend_interp: - } // interp - diff --git a/lib/gpu/pppm_f_ptx.h b/lib/gpu/pppm_f_ptx.h deleted file mode 100644 index 388926ff11..0000000000 --- a/lib/gpu/pppm_f_ptx.h +++ /dev/null @@ -1,818 +0,0 @@ -const char * pppm_f = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .global .texref pos_tex;\n" -" .global .texref q_tex;\n" -" .entry particle_map (\n" -" .param .u64 __cudaparm_particle_map_x_,\n" -" .param .u64 __cudaparm_particle_map_q_,\n" -" .param .f32 __cudaparm_particle_map_delvolinv,\n" -" .param .s32 __cudaparm_particle_map_nlocal,\n" -" .param .u64 __cudaparm_particle_map_counts,\n" -" .param .u64 __cudaparm_particle_map_ans,\n" -" .param .f32 __cudaparm_particle_map_b_lo_x,\n" -" .param .f32 __cudaparm_particle_map_b_lo_y,\n" -" .param .f32 __cudaparm_particle_map_b_lo_z,\n" -" .param .f32 __cudaparm_particle_map_delxinv,\n" -" .param .f32 __cudaparm_particle_map_delyinv,\n" -" .param .f32 __cudaparm_particle_map_delzinv,\n" -" .param .s32 __cudaparm_particle_map_nlocal_x,\n" -" .param .s32 __cudaparm_particle_map_nlocal_y,\n" -" .param .s32 __cudaparm_particle_map_nlocal_z,\n" -" .param .s32 __cudaparm_particle_map_atom_stride,\n" -" .param .s32 __cudaparm_particle_map_max_atoms,\n" -" .param .u64 __cudaparm_particle_map_error)\n" -" {\n" -" .reg .u32 %r<50>;\n" -" .reg .u64 %rd<12>;\n" -" .reg .f32 %f<44>;\n" -" .reg .pred %p<11>;\n" -" .loc 17 50 0\n" -"$LDWbegin_particle_map:\n" -" cvt.s32.u32 %r1, %ntid.x;\n" -" cvt.s32.u32 %r2, %ctaid.x;\n" -" mul24.lo.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %nctaid.x;\n" -" mul24.lo.s32 %r5, %r4, %r1;\n" -" mov.u32 %r6, %tid.x;\n" -" add.u32 %r7, %r3, %r6;\n" -" sub.s32 %r8, %r5, 1;\n" -" mul.lo.s32 %r9, %r7, 64;\n" -" div.s32 %r10, %r9, %r5;\n" -" mul.lo.s32 %r11, %r8, %r10;\n" -" sub.s32 %r12, %r9, %r11;\n" -" ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];\n" -" setp.le.s32 %p1, %r13, %r12;\n" -" @%p1 bra $Lt_0_7426;\n" -" .loc 17 62 0\n" -" mov.u32 %r14, %r12;\n" -" mov.s32 %r15, 0;\n" -" mov.u32 %r16, %r15;\n" -" mov.s32 %r17, 0;\n" -" mov.u32 %r18, %r17;\n" -" mov.s32 %r19, 0;\n" -" mov.u32 %r20, %r19;\n" -" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}];\n" -" mov.f32 %f5, %f1;\n" -" mov.f32 %f6, %f2;\n" -" mov.f32 %f7, %f3;\n" -" .loc 17 64 0\n" -" mov.u32 %r21, %r12;\n" -" mov.s32 %r22, 0;\n" -" mov.u32 %r23, %r22;\n" -" mov.s32 %r24, 0;\n" -" mov.u32 %r25, %r24;\n" -" mov.s32 %r26, 0;\n" -" mov.u32 %r27, %r26;\n" -" tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}];\n" -" mov.f32 %f12, %f8;\n" -" ld.param.f32 %f13, [__cudaparm_particle_map_delvolinv];\n" -" mul.ftz.f32 %f14, %f13, %f12;\n" -" mov.f32 %f15, 0f00000000; \n" -" setp.neu.ftz.f32 %p2, %f14, %f15;\n" -" @!%p2 bra $Lt_0_7426;\n" -" .loc 17 67 0\n" -" ld.param.f32 %f16, [__cudaparm_particle_map_b_lo_x];\n" -" sub.ftz.f32 %f17, %f5, %f16;\n" -" ld.param.f32 %f18, [__cudaparm_particle_map_delxinv];\n" -" mul.ftz.f32 %f19, %f18, %f17;\n" -" mov.f32 %f20, 0f00000000; \n" -" setp.lt.ftz.f32 %p3, %f19, %f20;\n" -" @%p3 bra $Lt_0_8706;\n" -" ld.param.f32 %f21, [__cudaparm_particle_map_b_lo_y];\n" -" sub.ftz.f32 %f22, %f6, %f21;\n" -" ld.param.f32 %f23, [__cudaparm_particle_map_delyinv];\n" -" mul.ftz.f32 %f24, %f23, %f22;\n" -" mov.f32 %f25, 0f00000000; \n" -" setp.lt.ftz.f32 %p4, %f24, %f25;\n" -" @%p4 bra $Lt_0_8706;\n" -" ld.param.f32 %f26, [__cudaparm_particle_map_b_lo_z];\n" -" sub.ftz.f32 %f27, %f7, %f26;\n" -" ld.param.f32 %f28, [__cudaparm_particle_map_delzinv];\n" -" mul.ftz.f32 %f29, %f28, %f27;\n" -" mov.f32 %f30, 0f00000000; \n" -" setp.lt.ftz.f32 %p5, %f29, %f30;\n" -" @%p5 bra $Lt_0_8706;\n" -" cvt.rzi.ftz.s32.f32 %r28, %f19;\n" -" ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];\n" -" setp.ge.s32 %p6, %r28, %r29;\n" -" @%p6 bra $Lt_0_8706;\n" -" cvt.rzi.ftz.s32.f32 %r30, %f24;\n" -" ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];\n" -" setp.ge.s32 %p7, %r30, %r31;\n" -" @%p7 bra $Lt_0_8706;\n" -" cvt.rzi.ftz.s32.f32 %r32, %f29;\n" -" ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];\n" -" setp.gt.s32 %p8, %r33, %r32;\n" -" @%p8 bra $L_0_4866;\n" -"$Lt_0_8706:\n" -"$L_0_5122:\n" -" .loc 17 76 0\n" -" mov.s32 %r34, 1;\n" -" ld.param.u64 %rd1, [__cudaparm_particle_map_error];\n" -" st.global.s32 [%rd1+0], %r34;\n" -" bra.uni $Lt_0_7426;\n" -"$L_0_4866:\n" -" .loc 17 83 0\n" -" mul.lo.s32 %r35, %r32, %r31;\n" -" add.s32 %r36, %r30, %r35;\n" -" mul.lo.s32 %r37, %r36, %r29;\n" -" add.s32 %r38, %r28, %r37;\n" -" ld.param.u64 %rd2, [__cudaparm_particle_map_counts];\n" -" cvt.s64.s32 %rd3, %r38;\n" -" mul.wide.s32 %rd4, %r38, 4;\n" -" add.u64 %rd5, %rd2, %rd4;\n" -" mov.s32 %r39, 1;\n" -" atom.global.add.s32 %r40, [%rd5], %r39;\n" -" mov.s32 %r41, %r40;\n" -" ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];\n" -" setp.gt.s32 %p9, %r42, %r41;\n" -" @%p9 bra $Lt_0_7682;\n" -" .loc 17 85 0\n" -" mov.s32 %r43, 2;\n" -" ld.param.u64 %rd6, [__cudaparm_particle_map_error];\n" -" st.global.s32 [%rd6+0], %r43;\n" -" .loc 16 118 0\n" -" mov.s32 %r44, -1;\n" -" atom.global.add.s32 %r45, [%rd5], %r44;\n" -" bra.uni $Lt_0_7426;\n" -"$Lt_0_7682:\n" -" .loc 17 88 0\n" -" ld.param.u64 %rd7, [__cudaparm_particle_map_ans];\n" -" ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];\n" -" mul.lo.s32 %r47, %r46, %r41;\n" -" add.s32 %r48, %r38, %r47;\n" -" cvt.s64.s32 %rd8, %r48;\n" -" mul.wide.s32 %rd9, %r48, 16;\n" -" add.u64 %rd10, %rd7, %rd9;\n" -" cvt.rn.f32.s32 %f31, %r28;\n" -" mov.f32 %f32, 0f3f000000; \n" -" add.ftz.f32 %f33, %f31, %f32;\n" -" sub.ftz.f32 %f34, %f33, %f19;\n" -" cvt.rn.f32.s32 %f35, %r30;\n" -" mov.f32 %f36, 0f3f000000; \n" -" add.ftz.f32 %f37, %f35, %f36;\n" -" sub.ftz.f32 %f38, %f37, %f24;\n" -" cvt.rn.f32.s32 %f39, %r32;\n" -" mov.f32 %f40, 0f3f000000; \n" -" add.ftz.f32 %f41, %f39, %f40;\n" -" sub.ftz.f32 %f42, %f41, %f29;\n" -" st.global.v4.f32 [%rd10+0], {%f34,%f38,%f42,%f14};\n" -"$Lt_0_7426:\n" -"$L_0_4610:\n" -"$Lt_0_6914:\n" -"$Lt_0_6402:\n" -" .loc 17 92 0\n" -" exit;\n" -"$LDWend_particle_map:\n" -" }\n" -" .entry make_rho (\n" -" .param .u64 __cudaparm_make_rho_counts,\n" -" .param .u64 __cudaparm_make_rho_atoms,\n" -" .param .u64 __cudaparm_make_rho_brick,\n" -" .param .u64 __cudaparm_make_rho__rho_coeff,\n" -" .param .s32 __cudaparm_make_rho_atom_stride,\n" -" .param .s32 __cudaparm_make_rho_npts_x,\n" -" .param .s32 __cudaparm_make_rho_npts_y,\n" -" .param .s32 __cudaparm_make_rho_npts_z,\n" -" .param .s32 __cudaparm_make_rho_nlocal_x,\n" -" .param .s32 __cudaparm_make_rho_nlocal_y,\n" -" .param .s32 __cudaparm_make_rho_nlocal_z,\n" -" .param .s32 __cudaparm_make_rho_order_m_1,\n" -" .param .s32 __cudaparm_make_rho_order,\n" -" .param .s32 __cudaparm_make_rho_order2)\n" -" {\n" -" .reg .u32 %r<119>;\n" -" .reg .u64 %rd<57>;\n" -" .reg .f32 %f<26>;\n" -" .reg .pred %p<27>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32578_33_non_const_rho_coeff168[256];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32579_33_non_const_front424[320];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32580_33_non_const_ans744[2048];\n" -" .loc 17 101 0\n" -"$LDWbegin_make_rho:\n" -" ld.param.s32 %r1, [__cudaparm_make_rho_order2];\n" -" ld.param.s32 %r2, [__cudaparm_make_rho_order];\n" -" add.s32 %r3, %r1, %r2;\n" -" cvt.s32.u32 %r4, %tid.x;\n" -" setp.le.s32 %p1, %r3, %r4;\n" -" @%p1 bra $Lt_1_16898;\n" -" .loc 17 108 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32578_33_non_const_rho_coeff168;\n" -" cvt.s64.s32 %rd2, %r4;\n" -" mul.wide.s32 %rd3, %r4, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f32 %f1, [%rd5+0];\n" -" add.u64 %rd6, %rd3, %rd1;\n" -" st.shared.f32 [%rd6+0], %f1;\n" -"$Lt_1_16898:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32578_33_non_const_rho_coeff168;\n" -" shr.s32 %r5, %r4, 31;\n" -" mov.s32 %r6, 31;\n" -" and.b32 %r7, %r5, %r6;\n" -" add.s32 %r8, %r7, %r4;\n" -" shr.s32 %r9, %r8, 5;\n" -" mul.lo.s32 %r10, %r9, 32;\n" -" sub.s32 %r11, %r4, %r10;\n" -" setp.lt.s32 %p2, %r11, %r2;\n" -" @!%p2 bra $Lt_1_17410;\n" -" .loc 17 114 0\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32579_33_non_const_front424;\n" -" mov.f32 %f2, 0f00000000; \n" -" cvt.s64.s32 %rd8, %r11;\n" -" shr.s32 %r12, %r4, 31;\n" -" mov.s32 %r13, 31;\n" -" and.b32 %r14, %r12, %r13;\n" -" add.s32 %r15, %r14, %r4;\n" -" shr.s32 %r16, %r15, 5;\n" -" cvt.s64.s32 %rd9, %r16;\n" -" mul.wide.s32 %rd10, %r16, 40;\n" -" add.u64 %rd11, %rd8, %rd10;\n" -" mul.lo.u64 %rd12, %rd11, 4;\n" -" add.u64 %rd13, %rd7, %rd12;\n" -" st.shared.f32 [%rd13+128], %f2;\n" -"$Lt_1_17410:\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32579_33_non_const_front424;\n" -" .loc 17 116 0\n" -" bar.sync 0;\n" -" ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];\n" -" shr.s32 %r18, %r17, 31;\n" -" mov.s32 %r19, 31;\n" -" and.b32 %r20, %r18, %r19;\n" -" add.s32 %r21, %r20, %r17;\n" -" shr.s32 %r22, %r21, 5;\n" -" add.s32 %r23, %r22, 1;\n" -" mov.u32 %r24, 0;\n" -" setp.le.s32 %p3, %r23, %r24;\n" -" @%p3 bra $Lt_1_17922;\n" -" shr.s32 %r25, %r4, 31;\n" -" mov.s32 %r26, 31;\n" -" and.b32 %r27, %r25, %r26;\n" -" add.s32 %r28, %r27, %r4;\n" -" shr.s32 %r29, %r28, 5;\n" -" add.s32 %r30, %r11, 32;\n" -" ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];\n" -" ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];\n" -" mul.lo.s32 %r33, %r31, %r32;\n" -" mov.u32 %r34, %ctaid.x;\n" -" mul.lo.u32 %r35, %r34, 2;\n" -" add.u32 %r36, %r29, %r35;\n" -" ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];\n" -" div.s32 %r38, %r36, %r37;\n" -" ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];\n" -" setp.lt.s32 %p4, %r38, %r39;\n" -" sub.s32 %r40, %r39, %r38;\n" -" mov.s32 %r41, 0;\n" -" selp.s32 %r42, %r40, %r41, %p4;\n" -" ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];\n" -" setp.ge.s32 %p5, %r38, %r43;\n" -" sub.s32 %r44, %r43, %r38;\n" -" add.s32 %r45, %r44, %r2;\n" -" sub.s32 %r46, %r45, 1;\n" -" selp.s32 %r47, %r46, %r2, %p5;\n" -" rem.s32 %r48, %r36, %r37;\n" -" setp.lt.s32 %p6, %r48, %r39;\n" -" sub.s32 %r49, %r39, %r48;\n" -" mov.s32 %r50, 0;\n" -" selp.s32 %r51, %r49, %r50, %p6;\n" -" setp.ge.s32 %p7, %r48, %r31;\n" -" sub.s32 %r52, %r31, %r48;\n" -" add.s32 %r53, %r52, %r2;\n" -" sub.s32 %r54, %r53, 1;\n" -" selp.s32 %r55, %r54, %r2, %p7;\n" -" mov.s32 %r56, %r23;\n" -" mov.s32 %r57, 0;\n" -" setp.gt.s32 %p8, %r2, %r57;\n" -" mov.s32 %r58, 0;\n" -" cvt.s64.s32 %rd14, %r11;\n" -" cvt.s64.s32 %rd15, %r29;\n" -" mul.lo.s32 %r59, %r23, 32;\n" -" mul.wide.s32 %rd16, %r29, 40;\n" -" add.u64 %rd17, %rd14, %rd16;\n" -" ld.param.s32 %r60, [__cudaparm_make_rho_npts_z];\n" -" setp.gt.s32 %p9, %r60, %r38;\n" -" mul.lo.u64 %rd18, %rd17, 4;\n" -" selp.s32 %r61, 1, 0, %p9;\n" -" add.u64 %rd19, %rd18, %rd7;\n" -" mov.u64 %rd20, __cuda___cuda_local_var_32580_33_non_const_ans744;\n" -" mov.s32 %r62, %r56;\n" -"$Lt_1_18434:\n" -" @!%p8 bra $Lt_1_18690;\n" -" mov.s32 %r63, %r2;\n" -" cvt.s64.s32 %rd21, %r4;\n" -" mul.wide.s32 %rd22, %r4, 4;\n" -" add.u64 %rd23, %rd20, %rd22;\n" -" mov.s32 %r64, 0;\n" -" mov.s32 %r65, %r63;\n" -"$Lt_1_19202:\n" -" .loc 17 140 0\n" -" mov.f32 %f3, 0f00000000; \n" -" st.shared.f32 [%rd23+0], %f3;\n" -" add.s32 %r64, %r64, 1;\n" -" add.u64 %rd23, %rd23, 256;\n" -" setp.ne.s32 %p10, %r64, %r2;\n" -" @%p10 bra $Lt_1_19202;\n" -"$Lt_1_18690:\n" -" add.s32 %r66, %r11, %r58;\n" -" set.lt.u32.s32 %r67, %r66, %r32;\n" -" neg.s32 %r68, %r67;\n" -" and.b32 %r69, %r61, %r68;\n" -" mov.u32 %r70, 0;\n" -" setp.eq.s32 %p11, %r69, %r70;\n" -" @%p11 bra $Lt_1_20226;\n" -" .loc 17 143 0\n" -" mov.s32 %r71, %r42;\n" -" setp.ge.s32 %p12, %r42, %r47;\n" -" @%p12 bra $Lt_1_20226;\n" -" sub.s32 %r72, %r47, %r42;\n" -" setp.lt.s32 %p13, %r51, %r55;\n" -" mov.s32 %r73, %r72;\n" -"$Lt_1_20738:\n" -" .loc 17 145 0\n" -" mov.s32 %r74, %r51;\n" -" @!%p13 bra $Lt_1_20994;\n" -" sub.s32 %r75, %r55, %r51;\n" -" sub.s32 %r76, %r71, %r42;\n" -" add.s32 %r77, %r38, %r42;\n" -" add.s32 %r78, %r48, %r51;\n" -" sub.s32 %r79, %r77, %r39;\n" -" sub.s32 %r80, %r78, %r39;\n" -" add.s32 %r81, %r76, %r79;\n" -" mul.lo.s32 %r82, %r33, %r81;\n" -" ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];\n" -" ld.param.u64 %rd24, [__cudaparm_make_rho_counts];\n" -" mov.s32 %r84, %r75;\n" -"$Lt_1_21506:\n" -" .loc 17 147 0\n" -" sub.s32 %r85, %r74, %r51;\n" -" add.s32 %r86, %r85, %r80;\n" -" mul.lo.s32 %r87, %r86, %r32;\n" -" add.s32 %r88, %r82, %r87;\n" -" add.s32 %r89, %r66, %r88;\n" -" cvt.s64.s32 %rd25, %r89;\n" -" mul.wide.s32 %rd26, %r89, 4;\n" -" add.u64 %rd27, %rd24, %rd26;\n" -" ld.global.s32 %r90, [%rd27+0];\n" -" mul.lo.s32 %r91, %r90, %r83;\n" -" .loc 17 148 0\n" -" mov.s32 %r92, %r89;\n" -" setp.ge.s32 %p14, %r89, %r91;\n" -" @%p14 bra $Lt_1_21762;\n" -" sub.s32 %r93, %r3, 1;\n" -" cvt.s64.s32 %rd28, %r83;\n" -" mul.wide.s32 %rd29, %r83, 16;\n" -" mov.s32 %r94, -1;\n" -" setp.gt.s32 %p15, %r93, %r94;\n" -" ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];\n" -" mul.lo.u64 %rd31, %rd25, 16;\n" -" add.u64 %rd32, %rd30, %rd31;\n" -"$Lt_1_22274:\n" -" .loc 17 149 0\n" -" ld.global.f32 %f4, [%rd32+0];\n" -" @!%p15 bra $Lt_1_29954;\n" -" sub.s32 %r95, %r93, %r74;\n" -" mov.s32 %r96, -1;\n" -" sub.s32 %r97, %r96, %r74;\n" -" cvt.s64.s32 %rd33, %r2;\n" -" mul.wide.s32 %rd34, %r2, 4;\n" -" ld.global.f32 %f5, [%rd32+4];\n" -" ld.global.f32 %f6, [%rd32+8];\n" -" cvt.s64.s32 %rd35, %r95;\n" -" mul.wide.s32 %rd36, %r95, 4;\n" -" add.u64 %rd37, %rd1, %rd36;\n" -" sub.s32 %r98, %r93, %r71;\n" -" cvt.s64.s32 %rd38, %r98;\n" -" mul.wide.s32 %rd39, %r98, 4;\n" -" add.u64 %rd40, %rd1, %rd39;\n" -" mov.f32 %f7, 0f00000000; \n" -" mov.f32 %f8, 0f00000000; \n" -"$Lt_1_23042:\n" -" .loc 17 154 0\n" -" ld.shared.f32 %f9, [%rd37+0];\n" -" fma.rn.ftz.f32 %f8, %f8, %f5, %f9;\n" -" .loc 17 155 0\n" -" ld.shared.f32 %f10, [%rd40+0];\n" -" fma.rn.ftz.f32 %f7, %f7, %f6, %f10;\n" -" sub.u64 %rd40, %rd40, %rd34;\n" -" sub.s32 %r95, %r95, %r2;\n" -" sub.u64 %rd37, %rd37, %rd34;\n" -" setp.gt.s32 %p16, %r95, %r97;\n" -" @%p16 bra $Lt_1_23042;\n" -" bra.uni $Lt_1_22530;\n" -"$Lt_1_29954:\n" -" mov.f32 %f7, 0f00000000; \n" -" mov.f32 %f8, 0f00000000; \n" -"$Lt_1_22530:\n" -" .loc 17 157 0\n" -" ld.global.f32 %f11, [%rd32+12];\n" -" mul.ftz.f32 %f12, %f7, %f8;\n" -" mul.ftz.f32 %f13, %f11, %f12;\n" -" @!%p8 bra $Lt_1_23554;\n" -" mov.s32 %r99, %r2;\n" -" cvt.s64.s32 %rd41, %r4;\n" -" mul.wide.s32 %rd42, %r4, 4;\n" -" add.u64 %rd43, %rd20, %rd42;\n" -" mov.s32 %r100, 0;\n" -" mov.s32 %r101, %r99;\n" -"$Lt_1_24066:\n" -" .loc 17 161 0\n" -" add.s32 %r102, %r100, %r1;\n" -" mov.s32 %r103, %r102;\n" -" setp.lt.s32 %p17, %r102, %r100;\n" -" @%p17 bra $Lt_1_30466;\n" -" cvt.s64.s32 %rd44, %r2;\n" -" mul.wide.s32 %rd34, %r2, 4;\n" -" cvt.s64.s32 %rd45, %r102;\n" -" mul.wide.s32 %rd46, %r102, 4;\n" -" add.u64 %rd47, %rd1, %rd46;\n" -" mov.f32 %f14, 0f00000000; \n" -"$Lt_1_24834:\n" -" .loc 17 162 0\n" -" ld.shared.f32 %f15, [%rd47+0];\n" -" fma.rn.ftz.f32 %f14, %f4, %f14, %f15;\n" -" sub.s32 %r103, %r103, %r2;\n" -" sub.u64 %rd47, %rd47, %rd34;\n" -" setp.ge.s32 %p18, %r103, %r100;\n" -" @%p18 bra $Lt_1_24834;\n" -" bra.uni $Lt_1_24322;\n" -"$Lt_1_30466:\n" -" mov.f32 %f14, 0f00000000; \n" -"$Lt_1_24322:\n" -" .loc 17 163 0\n" -" ld.shared.f32 %f16, [%rd43+0];\n" -" fma.rn.ftz.f32 %f17, %f14, %f13, %f16;\n" -" st.shared.f32 [%rd43+0], %f17;\n" -" add.s32 %r100, %r100, 1;\n" -" add.u64 %rd43, %rd43, 256;\n" -" setp.ne.s32 %p19, %r100, %r2;\n" -" @%p19 bra $Lt_1_24066;\n" -"$Lt_1_23554:\n" -" add.s32 %r92, %r92, %r83;\n" -" add.u64 %rd32, %rd29, %rd32;\n" -" setp.gt.s32 %p20, %r91, %r92;\n" -" @%p20 bra $Lt_1_22274;\n" -"$Lt_1_21762:\n" -" add.s32 %r74, %r74, 1;\n" -" setp.ne.s32 %p21, %r55, %r74;\n" -" @%p21 bra $Lt_1_21506;\n" -"$Lt_1_20994:\n" -" add.s32 %r71, %r71, 1;\n" -" setp.ne.s32 %p22, %r47, %r71;\n" -" @%p22 bra $Lt_1_20738;\n" -"$Lt_1_20226:\n" -"$Lt_1_19714:\n" -" .loc 17 172 0\n" -" bar.sync 0;\n" -" @!%p2 bra $Lt_1_26626;\n" -" .loc 17 174 0\n" -" ld.shared.f32 %f18, [%rd19+128];\n" -" st.shared.f32 [%rd19+0], %f18;\n" -" .loc 17 175 0\n" -" mov.f32 %f19, 0f00000000; \n" -" st.shared.f32 [%rd19+128], %f19;\n" -" bra.uni $Lt_1_26370;\n" -"$Lt_1_26626:\n" -" .loc 17 177 0\n" -" mov.f32 %f20, 0f00000000; \n" -" st.shared.f32 [%rd19+0], %f20;\n" -"$Lt_1_26370:\n" -" @!%p8 bra $Lt_1_26882;\n" -" mov.s32 %r104, %r2;\n" -" cvt.s64.s32 %rd48, %r4;\n" -" mov.s32 %r105, %r11;\n" -" add.s32 %r106, %r11, %r2;\n" -" mul.wide.s32 %rd49, %r4, 4;\n" -" add.u64 %rd50, %rd20, %rd49;\n" -" mov.s64 %rd51, %rd19;\n" -" mov.s32 %r107, %r104;\n" -"$Lt_1_27394:\n" -" .loc 17 180 0\n" -" ld.shared.f32 %f21, [%rd50+0];\n" -" ld.shared.f32 %f22, [%rd51+0];\n" -" add.ftz.f32 %f23, %f21, %f22;\n" -" st.shared.f32 [%rd51+0], %f23;\n" -" .loc 17 181 0\n" -" bar.sync 0;\n" -" add.s32 %r105, %r105, 1;\n" -" add.u64 %rd51, %rd51, 4;\n" -" add.u64 %rd50, %rd50, 256;\n" -" setp.ne.s32 %p23, %r105, %r106;\n" -" @%p23 bra $Lt_1_27394;\n" -"$Lt_1_26882:\n" -" set.lt.u32.s32 %r108, %r66, %r17;\n" -" neg.s32 %r109, %r108;\n" -" and.b32 %r110, %r61, %r109;\n" -" mov.u32 %r111, 0;\n" -" setp.eq.s32 %p24, %r110, %r111;\n" -" @%p24 bra $Lt_1_27906;\n" -" .loc 17 185 0\n" -" ld.shared.f32 %f24, [%rd19+0];\n" -" ld.param.u64 %rd52, [__cudaparm_make_rho_brick];\n" -" add.s32 %r112, %r11, %r58;\n" -" mul.lo.s32 %r113, %r37, %r17;\n" -" mul.lo.s32 %r114, %r38, %r113;\n" -" mul.lo.s32 %r115, %r48, %r17;\n" -" add.s32 %r116, %r114, %r115;\n" -" add.s32 %r117, %r112, %r116;\n" -" cvt.s64.s32 %rd53, %r117;\n" -" mul.wide.s32 %rd54, %r117, 4;\n" -" add.u64 %rd55, %rd52, %rd54;\n" -" st.global.f32 [%rd55+0], %f24;\n" -"$Lt_1_27906:\n" -" add.s32 %r58, %r58, 32;\n" -" setp.ne.s32 %p25, %r58, %r59;\n" -" @%p25 bra $Lt_1_18434;\n" -"$Lt_1_17922:\n" -" .loc 17 189 0\n" -" exit;\n" -"$LDWend_make_rho:\n" -" }\n" -" .entry interp (\n" -" .param .u64 __cudaparm_interp_x_,\n" -" .param .u64 __cudaparm_interp_q_,\n" -" .param .s32 __cudaparm_interp_nlocal,\n" -" .param .u64 __cudaparm_interp_brick,\n" -" .param .u64 __cudaparm_interp__rho_coeff,\n" -" .param .s32 __cudaparm_interp_npts_x,\n" -" .param .s32 __cudaparm_interp_npts_yx,\n" -" .param .f32 __cudaparm_interp_b_lo_x,\n" -" .param .f32 __cudaparm_interp_b_lo_y,\n" -" .param .f32 __cudaparm_interp_b_lo_z,\n" -" .param .f32 __cudaparm_interp_delxinv,\n" -" .param .f32 __cudaparm_interp_delyinv,\n" -" .param .f32 __cudaparm_interp_delzinv,\n" -" .param .s32 __cudaparm_interp_order,\n" -" .param .s32 __cudaparm_interp_order2,\n" -" .param .f32 __cudaparm_interp_qqrd2e_scale,\n" -" .param .u64 __cudaparm_interp_ans)\n" -" {\n" -" .reg .u32 %r<56>;\n" -" .reg .u64 %rd<37>;\n" -" .reg .f32 %f<69>;\n" -" .reg .pred %p<14>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888[256];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32677_33_non_const_rho1d_03144[2048];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_32678_33_non_const_rho1d_15192[2048];\n" -" .loc 17 199 0\n" -"$LDWbegin_interp:\n" -" ld.param.s32 %r1, [__cudaparm_interp_order2];\n" -" ld.param.s32 %r2, [__cudaparm_interp_order];\n" -" add.s32 %r3, %r1, %r2;\n" -" cvt.s32.u32 %r4, %tid.x;\n" -" setp.le.s32 %p1, %r3, %r4;\n" -" @%p1 bra $Lt_2_8706;\n" -" .loc 17 206 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888;\n" -" cvt.s64.s32 %rd2, %r4;\n" -" mul.wide.s32 %rd3, %r4, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f32 %f1, [%rd5+0];\n" -" add.u64 %rd6, %rd3, %rd1;\n" -" st.shared.f32 [%rd6+0], %f1;\n" -"$Lt_2_8706:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888;\n" -" .loc 17 207 0\n" -" bar.sync 0;\n" -" mov.u32 %r5, %ctaid.x;\n" -" mov.u32 %r6, %ntid.x;\n" -" mul.lo.u32 %r7, %r5, %r6;\n" -" add.u32 %r8, %r4, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_interp_nlocal];\n" -" setp.le.s32 %p2, %r9, %r8;\n" -" @%p2 bra $Lt_2_9218;\n" -" .loc 17 215 0\n" -" mov.u32 %r10, %r8;\n" -" mov.s32 %r11, 0;\n" -" mov.u32 %r12, %r11;\n" -" mov.s32 %r13, 0;\n" -" mov.u32 %r14, %r13;\n" -" mov.s32 %r15, 0;\n" -" mov.u32 %r16, %r15;\n" -" tex.1d.v4.f32.s32 {%f2,%f3,%f4,%f5},[pos_tex,{%r10,%r12,%r14,%r16}];\n" -" mov.f32 %f6, %f2;\n" -" mov.f32 %f7, %f3;\n" -" mov.f32 %f8, %f4;\n" -" .loc 17 216 0\n" -" mov.u32 %r17, %r8;\n" -" mov.s32 %r18, 0;\n" -" mov.u32 %r19, %r18;\n" -" mov.s32 %r20, 0;\n" -" mov.u32 %r21, %r20;\n" -" mov.s32 %r22, 0;\n" -" mov.u32 %r23, %r22;\n" -" tex.1d.v4.f32.s32 {%f9,%f10,%f11,%f12},[q_tex,{%r17,%r19,%r21,%r23}];\n" -" mov.f32 %f13, %f9;\n" -" ld.param.f32 %f14, [__cudaparm_interp_qqrd2e_scale];\n" -" mul.ftz.f32 %f15, %f14, %f13;\n" -" mov.f32 %f16, 0f00000000; \n" -" setp.neu.ftz.f32 %p3, %f15, %f16;\n" -" @!%p3 bra $Lt_2_9986;\n" -" mov.s32 %r24, 0;\n" -" setp.gt.s32 %p4, %r2, %r24;\n" -" ld.param.f32 %f17, [__cudaparm_interp_b_lo_x];\n" -" sub.ftz.f32 %f18, %f6, %f17;\n" -" ld.param.f32 %f19, [__cudaparm_interp_delxinv];\n" -" mul.ftz.f32 %f20, %f19, %f18;\n" -" @!%p4 bra $Lt_2_16386;\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32677_33_non_const_rho1d_03144;\n" -" mov.u64 %rd8, __cuda___cuda_local_var_32678_33_non_const_rho1d_15192;\n" -" cvt.rzi.ftz.s32.f32 %r25, %f20;\n" -" cvt.rn.f32.s32 %f21, %r25;\n" -" mov.f32 %f22, 0f3f000000; \n" -" add.ftz.f32 %f23, %f21, %f22;\n" -" sub.ftz.f32 %f24, %f23, %f20;\n" -" ld.param.f32 %f25, [__cudaparm_interp_b_lo_y];\n" -" sub.ftz.f32 %f26, %f7, %f25;\n" -" ld.param.f32 %f27, [__cudaparm_interp_delyinv];\n" -" mul.ftz.f32 %f28, %f27, %f26;\n" -" cvt.rzi.ftz.s32.f32 %r26, %f28;\n" -" cvt.rn.f32.s32 %f29, %r26;\n" -" mov.f32 %f30, 0f3f000000; \n" -" add.ftz.f32 %f31, %f29, %f30;\n" -" sub.ftz.f32 %f32, %f31, %f28;\n" -" mov.s32 %r27, %r2;\n" -" cvt.s64.s32 %rd9, %r4;\n" -" mov.s32 %r28, %r1;\n" -" mul.wide.s32 %rd3, %r4, 4;\n" -" add.u64 %rd10, %rd3, %rd7;\n" -" add.u64 %rd11, %rd3, %rd8;\n" -" mov.s32 %r29, 0;\n" -" mov.s32 %r30, %r27;\n" -"$Lt_2_10754:\n" -" .loc 17 235 0\n" -" mov.f32 %f33, 0f00000000; \n" -" mov.f32 %f34, 0f00000000; \n" -" st.shared.f32 [%rd10+0], %f34;\n" -" .loc 17 236 0\n" -" mov.f32 %f35, 0f00000000; \n" -" mov.f32 %f36, 0f00000000; \n" -" st.shared.f32 [%rd11+0], %f36;\n" -" .loc 17 237 0\n" -" mov.s32 %r31, %r28;\n" -" setp.lt.s32 %p5, %r28, %r29;\n" -" @%p5 bra $Lt_2_11010;\n" -" cvt.s64.s32 %rd12, %r2;\n" -" mul.wide.s32 %rd13, %r2, 4;\n" -" cvt.s64.s32 %rd14, %r28;\n" -" mul.wide.s32 %rd15, %r28, 4;\n" -" add.u64 %rd16, %rd1, %rd15;\n" -"$Lt_2_11522:\n" -" .loc 17 238 0\n" -" ld.shared.f32 %f37, [%rd16+0];\n" -" fma.rn.ftz.f32 %f33, %f33, %f24, %f37;\n" -" st.shared.f32 [%rd10+0], %f33;\n" -" .loc 17 239 0\n" -" fma.rn.ftz.f32 %f35, %f35, %f32, %f37;\n" -" st.shared.f32 [%rd11+0], %f35;\n" -" sub.s32 %r31, %r31, %r2;\n" -" sub.u64 %rd16, %rd16, %rd13;\n" -" setp.ge.s32 %p6, %r31, %r29;\n" -" @%p6 bra $Lt_2_11522;\n" -"$Lt_2_11010:\n" -" add.s32 %r29, %r29, 1;\n" -" add.s32 %r28, %r28, 1;\n" -" add.u64 %rd11, %rd11, 256;\n" -" add.u64 %rd10, %rd10, 256;\n" -" setp.ne.s32 %p7, %r28, %r3;\n" -" @%p7 bra $Lt_2_10754;\n" -" bra.uni $Lt_2_10242;\n" -"$Lt_2_16386:\n" -" cvt.rzi.ftz.s32.f32 %r25, %f20;\n" -" mov.u64 %rd8, __cuda___cuda_local_var_32678_33_non_const_rho1d_15192;\n" -" mov.u64 %rd7, __cuda___cuda_local_var_32677_33_non_const_rho1d_03144;\n" -"$Lt_2_10242:\n" -" .loc 17 243 0\n" -" ld.param.f32 %f38, [__cudaparm_interp_b_lo_z];\n" -" sub.ftz.f32 %f39, %f8, %f38;\n" -" ld.param.f32 %f40, [__cudaparm_interp_delzinv];\n" -" mul.ftz.f32 %f41, %f40, %f39;\n" -" cvt.rzi.ftz.s32.f32 %r32, %f41;\n" -" ld.param.s32 %r33, [__cudaparm_interp_npts_yx];\n" -" mul.lo.s32 %r34, %r32, %r33;\n" -" add.s32 %r35, %r25, %r34;\n" -" @!%p4 bra $Lt_2_16898;\n" -" cvt.rn.f32.s32 %f42, %r32;\n" -" mov.f32 %f43, 0f3f000000; \n" -" add.ftz.f32 %f44, %f42, %f43;\n" -" sub.ftz.f32 %f45, %f44, %f41;\n" -" mov.s32 %r36, %r2;\n" -" ld.param.f32 %f46, [__cudaparm_interp_b_lo_y];\n" -" sub.ftz.f32 %f47, %f7, %f46;\n" -" cvt.s64.s32 %rd17, %r4;\n" -" ld.param.f32 %f48, [__cudaparm_interp_delyinv];\n" -" mul.ftz.f32 %f49, %f48, %f47;\n" -" cvt.rzi.ftz.s32.f32 %r37, %f49;\n" -" ld.param.s32 %r38, [__cudaparm_interp_npts_x];\n" -" mul.lo.s32 %r39, %r37, %r38;\n" -" mul.wide.s32 %rd3, %r4, 4;\n" -" add.s32 %r40, %r39, %r35;\n" -" add.u64 %rd18, %rd3, %rd7;\n" -" add.u64 %rd19, %rd3, %rd8;\n" -" cvt.s64.s32 %rd20, %r38;\n" -" mul.wide.s32 %rd21, %r38, 16;\n" -" mov.s32 %r41, %r40;\n" -" ld.param.u64 %rd22, [__cudaparm_interp_brick];\n" -" mov.s32 %r42, 0;\n" -" mov.f32 %f50, 0f00000000; \n" -" mov.f32 %f51, 0f00000000; \n" -" mov.f32 %f52, 0f00000000; \n" -" mov.s32 %r43, %r36;\n" -"$Lt_2_12802:\n" -" .loc 17 246 0\n" -" add.s32 %r44, %r42, %r1;\n" -" mov.s32 %r45, %r44;\n" -" setp.lt.s32 %p8, %r44, %r42;\n" -" @%p8 bra $Lt_2_17154;\n" -" cvt.s64.s32 %rd23, %r2;\n" -" mul.wide.s32 %rd13, %r2, 4;\n" -" cvt.s64.s32 %rd24, %r44;\n" -" mul.wide.s32 %rd25, %r44, 4;\n" -" add.u64 %rd26, %rd1, %rd25;\n" -" mov.f32 %f53, 0f00000000; \n" -"$Lt_2_13570:\n" -" .loc 17 247 0\n" -" ld.shared.f32 %f54, [%rd26+0];\n" -" fma.rn.ftz.f32 %f53, %f45, %f53, %f54;\n" -" sub.s32 %r45, %r45, %r2;\n" -" sub.u64 %rd26, %rd26, %rd13;\n" -" setp.ge.s32 %p9, %r45, %r42;\n" -" @%p9 bra $Lt_2_13570;\n" -" bra.uni $Lt_2_13058;\n" -"$Lt_2_17154:\n" -" mov.f32 %f53, 0f00000000; \n" -"$Lt_2_13058:\n" -" .loc 17 249 0\n" -" mov.s32 %r46, %r41;\n" -" mov.s32 %r47, %r2;\n" -" mul.ftz.f32 %f55, %f15, %f53;\n" -" mov.s32 %r48, %r46;\n" -" mov.s64 %rd27, %rd19;\n" -" cvt.s64.s32 %rd28, %r46;\n" -" mul.wide.s32 %rd29, %r46, 16;\n" -" mov.s32 %r49, 0;\n" -" mov.s32 %r50, %r47;\n" -"$Lt_2_14594:\n" -" mov.s32 %r51, %r2;\n" -" mov.s32 %r52, %r48;\n" -" add.s32 %r53, %r48, %r2;\n" -" mov.s64 %rd30, %rd18;\n" -" ld.shared.f32 %f56, [%rd27+0];\n" -" add.u64 %rd31, %rd29, %rd22;\n" -" mul.ftz.f32 %f57, %f55, %f56;\n" -" mov.s32 %r54, %r51;\n" -"$Lt_2_15362:\n" -" .loc 17 253 0\n" -" ld.shared.f32 %f58, [%rd30+0];\n" -" mul.ftz.f32 %f59, %f58, %f57;\n" -" ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd31+0];\n" -" .loc 17 255 0\n" -" mul.ftz.f32 %f63, %f59, %f60;\n" -" sub.ftz.f32 %f52, %f52, %f63;\n" -" .loc 17 256 0\n" -" mul.ftz.f32 %f64, %f59, %f61;\n" -" sub.ftz.f32 %f51, %f51, %f64;\n" -" .loc 17 257 0\n" -" mul.ftz.f32 %f65, %f59, %f62;\n" -" sub.ftz.f32 %f50, %f50, %f65;\n" -" add.s32 %r52, %r52, 1;\n" -" add.u64 %rd31, %rd31, 16;\n" -" add.u64 %rd30, %rd30, 256;\n" -" setp.ne.s32 %p10, %r52, %r53;\n" -" @%p10 bra $Lt_2_15362;\n" -" add.s32 %r49, %r49, 1;\n" -" add.s32 %r48, %r48, %r38;\n" -" add.u64 %rd29, %rd29, %rd21;\n" -" add.u64 %rd27, %rd27, 256;\n" -" setp.ne.s32 %p11, %r49, %r2;\n" -" @%p11 bra $Lt_2_14594;\n" -" add.s32 %r42, %r42, 1;\n" -" add.s32 %r41, %r46, %r33;\n" -" setp.ne.s32 %p12, %r42, %r2;\n" -" @%p12 bra $Lt_2_12802;\n" -" bra.uni $Lt_2_9730;\n" -"$Lt_2_16898:\n" -" mov.f32 %f50, 0f00000000; \n" -" mov.f32 %f51, 0f00000000; \n" -" mov.f32 %f52, 0f00000000; \n" -" bra.uni $Lt_2_9730;\n" -"$Lt_2_9986:\n" -" mov.f32 %f50, 0f00000000; \n" -" mov.f32 %f51, 0f00000000; \n" -" mov.f32 %f52, 0f00000000; \n" -"$Lt_2_9730:\n" -" .loc 17 264 0\n" -" ld.param.u64 %rd32, [__cudaparm_interp_ans];\n" -" cvt.s64.s32 %rd33, %r8;\n" -" mul.wide.s32 %rd34, %r8, 16;\n" -" add.u64 %rd35, %rd32, %rd34;\n" -" mov.f32 %f66, %f67;\n" -" st.global.v4.f32 [%rd35+0], {%f52,%f51,%f50,%f66};\n" -"$Lt_2_9218:\n" -" .loc 17 266 0\n" -" exit;\n" -"$LDWend_interp:\n" -" }\n" -; diff --git a/lib/gpu/radixsort_app.cu_o b/lib/gpu/radixsort_app.cu_o deleted file mode 100644 index 6be742d623..0000000000 Binary files a/lib/gpu/radixsort_app.cu_o and /dev/null differ diff --git a/lib/gpu/re_squared.ptx b/lib/gpu/re_squared.ptx deleted file mode 100644 index e0e5821059..0000000000 --- a/lib/gpu/re_squared.ptx +++ /dev/null @@ -1,2357 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009bda_00000000-9_lal_re_squared.cpp3.i (/home/sjplimp/ccBI#.sX5b7D) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009bda_00000000-8_lal_re_squared.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_ellipsoid_extra.h" - .file 17 "lal_re_squared.cu" - .file 18 "/usr/local/cuda/include/common_functions.h" - .file 19 "/usr/local/cuda/include/math_functions.h" - .file 20 "/usr/local/cuda/include/math_constants.h" - .file 21 "/usr/local/cuda/include/device_functions.h" - .file 22 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 24 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 26 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 27 "/usr/local/cuda/include/surface_functions.h" - .file 28 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 29 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - - .entry kernel_ellipsoid ( - .param .u64 __cudaparm_kernel_ellipsoid_x_, - .param .u64 __cudaparm_kernel_ellipsoid_q, - .param .u64 __cudaparm_kernel_ellipsoid_shape, - .param .u64 __cudaparm_kernel_ellipsoid_well, - .param .u64 __cudaparm_kernel_ellipsoid_splj, - .param .u64 __cudaparm_kernel_ellipsoid_sig_eps, - .param .s32 __cudaparm_kernel_ellipsoid_ntypes, - .param .u64 __cudaparm_kernel_ellipsoid_dev_nbor, - .param .s32 __cudaparm_kernel_ellipsoid_stride, - .param .u64 __cudaparm_kernel_ellipsoid_ans, - .param .s32 __cudaparm_kernel_ellipsoid_astride, - .param .u64 __cudaparm_kernel_ellipsoid_engv, - .param .u64 __cudaparm_kernel_ellipsoid_err_flag, - .param .s32 __cudaparm_kernel_ellipsoid_eflag, - .param .s32 __cudaparm_kernel_ellipsoid_vflag, - .param .s32 __cudaparm_kernel_ellipsoid_inum, - .param .s32 __cudaparm_kernel_ellipsoid_t_per_atom) - { - .reg .u32 %r<65>; - .reg .u64 %rd<78>; - .reg .f32 %f<1598>; - .reg .pred %p<34>; - .shared .align 16 .b8 __cuda___cuda_local_var_32902_33_non_const_sp_lj120[16]; - .shared .align 4 .b8 __cuda___cuda_local_var_33303_55_non_const_red_acc136[3584]; - .shared .f32 __cuda_local_var_32908_33_non_const_b_alpha; - .shared .f32 __cuda_local_var_32908_42_non_const_cr60; - // __cuda_local_var_32913_10_non_const_f = 64 - // __cuda_local_var_32917_10_non_const_tor = 80 - // __cuda_local_var_32921_9_non_const_virial = 16 - // __cuda_local_var_33168_15_non_const_u = 40 - .loc 17 43 0 -$LDWbegin_kernel_ellipsoid: - .loc 17 48 0 - ld.param.u64 %rd1, [__cudaparm_kernel_ellipsoid_splj]; - ldu.global.f32 %f1, [%rd1+0]; - .loc 17 49 0 - ld.global.f32 %f2, [%rd1+4]; - .loc 17 50 0 - ld.global.f32 %f3, [%rd1+8]; - .loc 17 51 0 - ld.global.f32 %f4, [%rd1+12]; - st.shared.v4.f32 [__cuda___cuda_local_var_32902_33_non_const_sp_lj120+0], {%f1,%f2,%f3,%f4}; - .loc 17 54 0 - mov.f32 %f5, 0f3f4db6db; // 0.803571 - st.shared.f32 [__cuda_local_var_32908_33_non_const_b_alpha], %f5; - .loc 17 55 0 - mov.f32 %f6, 0f42700000; // 60 - lg2.approx.ftz.f32 %f7, %f6; - mov.f32 %f8, 0f3eaaaaab; // 0.333333 - mul.ftz.f32 %f9, %f7, %f8; - ex2.approx.ftz.f32 %f10, %f9; - mov.f32 %f11, 0f42700000; // 60 - mul.ftz.f32 %f12, %f10, %f10; - div.approx.ftz.f32 %f13, %f11, %f12; - sub.ftz.f32 %f14, %f10, %f13; - mov.f32 %f15, 0f3eaaaaab; // 0.333333 - mul.ftz.f32 %f16, %f14, %f15; - sub.ftz.f32 %f17, %f10, %f16; - st.shared.f32 [__cuda_local_var_32908_42_non_const_cr60], %f17; - .loc 17 68 0 - mov.f32 %f18, 0f00000000; // 0 - mov.f32 %f19, %f18; - mov.f32 %f20, 0f00000000; // 0 - mov.f32 %f21, %f20; - mov.f32 %f22, 0f00000000; // 0 - mov.f32 %f23, %f22; - mov.f32 %f24, 0f00000000; // 0 - mov.f32 %f25, %f24; - mov.f32 %f26, 0f00000000; // 0 - mov.f32 %f27, %f26; - mov.f32 %f28, 0f00000000; // 0 - mov.f32 %f29, %f28; - ld.param.s32 %r1, [__cudaparm_kernel_ellipsoid_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_ellipsoid_inum]; - setp.le.s32 %p1, %r9, %r8; - @%p1 bra $Lt_0_67842; - .loc 17 73 0 - cvt.s64.s32 %rd2, %r8; - mul.wide.s32 %rd3, %r8, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_ellipsoid_dev_nbor]; - add.u64 %rd5, %rd4, %rd3; - ld.global.s32 %r10, [%rd5+0]; - ld.param.s32 %r11, [__cudaparm_kernel_ellipsoid_stride]; - cvt.s64.s32 %rd6, %r11; - mul.wide.s32 %rd7, %r11, 4; - add.u64 %rd8, %rd7, %rd5; - ld.global.s32 %r12, [%rd8+0]; - .loc 17 76 0 - cvt.s64.s32 %rd9, %r10; - mul.wide.s32 %rd10, %r10, 16; - ld.param.u64 %rd11, [__cudaparm_kernel_ellipsoid_x_]; - add.u64 %rd12, %rd10, %rd11; - ld.global.v4.f32 {%f30,%f31,%f32,%f33}, [%rd12+0]; - .loc 17 88 0 - cvt.rzi.ftz.s32.f32 %r13, %f33; - cvt.s64.s32 %rd13, %r13; - mul.wide.s32 %rd14, %r13, 16; - ld.param.u64 %rd15, [__cudaparm_kernel_ellipsoid_shape]; - add.u64 %rd16, %rd14, %rd15; - ld.global.v4.f32 {%f34,%f35,%f36,_}, [%rd16+0]; - .loc 17 97 0 - ld.param.u64 %rd17, [__cudaparm_kernel_ellipsoid_q]; - add.u64 %rd18, %rd10, %rd17; - ld.global.v4.f32 {%f37,%f38,%f39,%f40}, [%rd18+0]; - .loc 17 98 0 - ld.param.u64 %rd19, [__cudaparm_kernel_ellipsoid_well]; - add.u64 %rd20, %rd14, %rd19; - ld.global.v4.f32 {%f41,%f42,%f43,_}, [%rd20+0]; - .loc 17 117 0 - cvt.s32.s64 %r14, %rd6; - sub.s32 %r15, %r1, 1; - and.b32 %r16, %r15, %r2; - add.u64 %rd21, %rd7, %rd8; - mul.lo.s32 %r17, %r14, %r16; - cvt.s64.s32 %rd22, %r17; - mul.wide.s32 %rd23, %r17, 4; - add.u64 %rd24, %rd21, %rd23; - mov.s64 %rd25, %rd24; - mul.lo.s32 %r18, %r14, %r12; - cvt.s64.s32 %rd26, %r18; - mul.wide.s32 %rd27, %r18, 4; - add.u64 %rd28, %rd21, %rd27; - setp.ge.u64 %p2, %rd24, %rd28; - @%p2 bra $Lt_0_69634; - ld.param.s32 %r19, [__cudaparm_kernel_ellipsoid_vflag]; - mov.s32 %r20, 0; - setp.gt.s32 %p3, %r19, %r20; - mul.ftz.f32 %f44, %f35, %f35; - add.ftz.f32 %f45, %f38, %f38; - add.ftz.f32 %f46, %f40, %f40; - mul.ftz.f32 %f47, %f37, %f37; - mul.ftz.f32 %f48, %f38, %f38; - mul.ftz.f32 %f49, %f39, %f39; - mul.ftz.f32 %f50, %f40, %f40; - mul.ftz.f32 %f51, %f34, %f34; - add.ftz.f32 %f52, %f39, %f39; - mul.ftz.f32 %f53, %f36, %f36; - mul.ftz.f32 %f54, %f34, %f35; - add.ftz.f32 %f55, %f34, %f34; - add.ftz.f32 %f56, %f35, %f35; - add.ftz.f32 %f57, %f36, %f36; - ld.param.s32 %r21, [__cudaparm_kernel_ellipsoid_ntypes]; - mul.lo.s32 %r22, %r21, %r13; - rcp.approx.ftz.f32 %f58, %f44; - mul.ftz.f32 %f59, %f45, %f39; - mul.ftz.f32 %f60, %f45, %f40; - mul.ftz.f32 %f61, %f45, %f37; - mul.ftz.f32 %f62, %f46, %f37; - add.ftz.f32 %f63, %f47, %f48; - sub.ftz.f32 %f64, %f47, %f48; - rcp.approx.ftz.f32 %f65, %f51; - mul.ftz.f32 %f66, %f52, %f37; - mul.ftz.f32 %f67, %f52, %f40; - rcp.approx.ftz.f32 %f68, %f53; - mul.ftz.f32 %f69, %f54, %f36; - sub.ftz.f32 %f70, %f59, %f62; - add.ftz.f32 %f71, %f59, %f62; - sub.ftz.f32 %f72, %f62, %f59; - sub.ftz.f32 %f73, %f63, %f49; - add.ftz.f32 %f74, %f49, %f64; - sub.ftz.f32 %f75, %f64, %f49; - add.ftz.f32 %f76, %f60, %f66; - sub.ftz.f32 %f77, %f60, %f66; - sub.ftz.f32 %f78, %f66, %f60; - sub.ftz.f32 %f79, %f67, %f61; - add.ftz.f32 %f80, %f61, %f67; - sub.ftz.f32 %f81, %f61, %f67; - mul.ftz.f32 %f82, %f44, %f70; - mul.ftz.f32 %f83, %f70, %f42; - mul.ftz.f32 %f84, %f51, %f71; - mul.ftz.f32 %f85, %f71, %f41; - neg.ftz.f32 %f86, %f71; - sub.ftz.f32 %f87, %f73, %f50; - sub.ftz.f32 %f88, %f50, %f73; - sub.ftz.f32 %f89, %f74, %f50; - sub.ftz.f32 %f90, %f50, %f74; - add.ftz.f32 %f91, %f50, %f75; - mul.ftz.f32 %f92, %f53, %f76; - mul.ftz.f32 %f93, %f76, %f43; - neg.ftz.f32 %f94, %f76; - mul.ftz.f32 %f95, %f51, %f77; - mul.ftz.f32 %f96, %f77, %f41; - mul.ftz.f32 %f97, %f53, %f79; - mul.ftz.f32 %f98, %f79, %f43; - mul.ftz.f32 %f99, %f44, %f80; - mul.ftz.f32 %f100, %f80, %f42; - mul.ftz.f32 %f101, %f70, %f82; - mul.ftz.f32 %f102, %f80, %f82; - mul.ftz.f32 %f103, %f72, %f82; - mul.ftz.f32 %f104, %f70, %f83; - mul.ftz.f32 %f105, %f80, %f83; - mov.f32 %f106, 0f00000000; // 0 - mov.f32 %f107, 0f00000000; // 0 - fma.rn.ftz.f32 %f108, %f107, %f84, %f106; - mov.f32 %f109, 0f00000000; // 0 - mov.f32 %f110, 0f00000000; // 0 - fma.rn.ftz.f32 %f111, %f84, %f110, %f109; - mul.ftz.f32 %f112, %f51, %f87; - mul.ftz.f32 %f113, %f87, %f41; - mul.ftz.f32 %f114, %f82, %f89; - mul.ftz.f32 %f115, %f44, %f89; - mul.ftz.f32 %f116, %f83, %f89; - mul.ftz.f32 %f117, %f89, %f42; - mul.ftz.f32 %f118, %f82, %f90; - mul.ftz.f32 %f119, %f53, %f91; - mul.ftz.f32 %f120, %f91, %f43; - neg.ftz.f32 %f121, %f91; - mov.f32 %f122, 0f00000000; // 0 - mov.f32 %f123, 0f00000000; // 0 - fma.rn.ftz.f32 %f124, %f123, %f95, %f122; - mov.f32 %f125, 0f00000000; // 0 - mov.f32 %f126, 0f00000000; // 0 - fma.rn.ftz.f32 %f127, %f95, %f126, %f125; - mul.ftz.f32 %f128, %f70, %f99; - mul.ftz.f32 %f129, %f89, %f99; - mul.ftz.f32 %f130, %f80, %f99; - mul.ftz.f32 %f131, %f72, %f99; - mul.ftz.f32 %f132, %f90, %f99; - mul.ftz.f32 %f133, %f70, %f100; - mul.ftz.f32 %f134, %f89, %f100; - mul.ftz.f32 %f135, %f80, %f100; - neg.ftz.f32 %f136, %f102; - mov.f32 %f137, 0f00000000; // 0 - fma.rn.ftz.f32 %f138, %f137, %f97, %f108; - mov.f32 %f139, 0f00000000; // 0 - fma.rn.ftz.f32 %f140, %f97, %f139, %f108; - mov.f32 %f141, 0f00000000; // 0 - fma.rn.ftz.f32 %f142, %f97, %f141, %f111; - fma.rn.ftz.f32 %f143, %f87, %f112, %f101; - fma.rn.ftz.f32 %f144, %f112, %f77, %f102; - mov.f32 %f145, 0f00000000; // 0 - mov.f32 %f146, 0f00000000; // 0 - fma.rn.ftz.f32 %f147, %f146, %f112, %f145; - mov.f32 %f148, 0f00000000; // 0 - mov.f32 %f149, 0f00000000; // 0 - fma.rn.ftz.f32 %f150, %f112, %f149, %f148; - fma.rn.ftz.f32 %f151, %f77, %f112, %f102; - fma.rn.ftz.f32 %f152, %f112, %f88, %f103; - fma.rn.ftz.f32 %f153, %f112, %f87, %f101; - fma.rn.ftz.f32 %f154, %f87, %f113, %f104; - fma.rn.ftz.f32 %f155, %f113, %f77, %f105; - fma.rn.ftz.f32 %f156, %f112, %f71, %f114; - mul.ftz.f32 %f157, %f70, %f115; - mul.ftz.f32 %f158, %f89, %f115; - mul.ftz.f32 %f159, %f80, %f115; - mul.ftz.f32 %f160, %f72, %f115; - mul.ftz.f32 %f161, %f90, %f115; - fma.rn.ftz.f32 %f162, %f113, %f71, %f116; - mul.ftz.f32 %f163, %f70, %f117; - mul.ftz.f32 %f164, %f89, %f117; - mul.ftz.f32 %f165, %f80, %f117; - fma.rn.ftz.f32 %f166, %f112, %f86, %f118; - fma.rn.ftz.f32 %f167, %f86, %f112, %f118; - mov.f32 %f168, 0f00000000; // 0 - fma.rn.ftz.f32 %f169, %f168, %f119, %f124; - mov.f32 %f170, 0f00000000; // 0 - fma.rn.ftz.f32 %f171, %f119, %f170, %f127; - fma.rn.ftz.f32 %f172, %f87, %f95, %f128; - fma.rn.ftz.f32 %f173, %f71, %f95, %f129; - fma.rn.ftz.f32 %f174, %f95, %f71, %f129; - fma.rn.ftz.f32 %f175, %f77, %f95, %f130; - neg.ftz.f32 %f176, %f130; - fma.rn.ftz.f32 %f177, %f95, %f88, %f131; - fma.rn.ftz.f32 %f178, %f88, %f95, %f131; - fma.rn.ftz.f32 %f179, %f86, %f95, %f132; - fma.rn.ftz.f32 %f180, %f87, %f96, %f133; - fma.rn.ftz.f32 %f181, %f71, %f96, %f134; - fma.rn.ftz.f32 %f182, %f77, %f96, %f135; - fma.rn.ftz.f32 %f183, %f112, %f78, %f136; - add.ftz.f32 %f184, %f140, %f142; - fma.rn.ftz.f32 %f185, %f92, %f76, %f143; - fma.rn.ftz.f32 %f186, %f92, %f91, %f144; - mov.f32 %f187, 0f00000000; // 0 - fma.rn.ftz.f32 %f188, %f92, %f187, %f147; - mov.f32 %f189, 0f00000000; // 0 - fma.rn.ftz.f32 %f190, %f92, %f189, %f150; - fma.rn.ftz.f32 %f191, %f92, %f91, %f151; - fma.rn.ftz.f32 %f192, %f92, %f94, %f152; - fma.rn.ftz.f32 %f193, %f92, %f76, %f153; - fma.rn.ftz.f32 %f194, %f93, %f76, %f154; - fma.rn.ftz.f32 %f195, %f93, %f91, %f155; - fma.rn.ftz.f32 %f196, %f92, %f79, %f156; - fma.rn.ftz.f32 %f197, %f87, %f84, %f157; - fma.rn.ftz.f32 %f198, %f84, %f87, %f157; - fma.rn.ftz.f32 %f199, %f71, %f84, %f158; - fma.rn.ftz.f32 %f200, %f84, %f71, %f158; - fma.rn.ftz.f32 %f201, %f77, %f84, %f159; - neg.ftz.f32 %f202, %f159; - fma.rn.ftz.f32 %f203, %f88, %f84, %f160; - fma.rn.ftz.f32 %f204, %f84, %f88, %f160; - fma.rn.ftz.f32 %f205, %f86, %f84, %f161; - fma.rn.ftz.f32 %f206, %f93, %f79, %f162; - fma.rn.ftz.f32 %f207, %f87, %f85, %f163; - fma.rn.ftz.f32 %f208, %f71, %f85, %f164; - fma.rn.ftz.f32 %f209, %f77, %f85, %f165; - fma.rn.ftz.f32 %f210, %f92, %f81, %f166; - fma.rn.ftz.f32 %f211, %f92, %f81, %f167; - add.ftz.f32 %f212, %f169, %f171; - fma.rn.ftz.f32 %f213, %f76, %f119, %f172; - fma.rn.ftz.f32 %f214, %f79, %f119, %f173; - fma.rn.ftz.f32 %f215, %f119, %f79, %f174; - fma.rn.ftz.f32 %f216, %f91, %f119, %f175; - fma.rn.ftz.f32 %f217, %f78, %f95, %f176; - fma.rn.ftz.f32 %f218, %f119, %f94, %f177; - fma.rn.ftz.f32 %f219, %f94, %f119, %f178; - fma.rn.ftz.f32 %f220, %f81, %f119, %f179; - fma.rn.ftz.f32 %f221, %f76, %f120, %f180; - fma.rn.ftz.f32 %f222, %f79, %f120, %f181; - fma.rn.ftz.f32 %f223, %f91, %f120, %f182; - fma.rn.ftz.f32 %f224, %f92, %f121, %f183; - add.ftz.f32 %f225, %f188, %f190; - add.ftz.f32 %f226, %f186, %f191; - add.ftz.f32 %f227, %f169, %f196; - fma.rn.ftz.f32 %f228, %f97, %f76, %f197; - fma.rn.ftz.f32 %f229, %f97, %f76, %f198; - fma.rn.ftz.f32 %f230, %f97, %f79, %f199; - fma.rn.ftz.f32 %f231, %f97, %f79, %f200; - fma.rn.ftz.f32 %f232, %f97, %f91, %f201; - fma.rn.ftz.f32 %f233, %f91, %f97, %f201; - fma.rn.ftz.f32 %f234, %f84, %f78, %f202; - fma.rn.ftz.f32 %f235, %f78, %f84, %f202; - fma.rn.ftz.f32 %f236, %f97, %f94, %f203; - fma.rn.ftz.f32 %f237, %f97, %f94, %f204; - fma.rn.ftz.f32 %f238, %f81, %f97, %f205; - fma.rn.ftz.f32 %f239, %f97, %f81, %f205; - fma.rn.ftz.f32 %f240, %f98, %f76, %f207; - fma.rn.ftz.f32 %f241, %f98, %f79, %f208; - fma.rn.ftz.f32 %f242, %f98, %f91, %f209; - add.ftz.f32 %f243, %f210, %f211; - add.ftz.f32 %f244, %f140, %f213; - add.ftz.f32 %f245, %f142, %f213; - add.ftz.f32 %f246, %f214, %f215; - add.ftz.f32 %f247, %f192, %f216; - fma.rn.ftz.f32 %f248, %f121, %f119, %f217; - add.ftz.f32 %f249, %f218, %f219; - add.ftz.f32 %f250, %f190, %f220; - add.ftz.f32 %f251, %f138, %f224; - add.ftz.f32 %f252, %f140, %f224; - add.ftz.f32 %f253, %f228, %f229; - add.ftz.f32 %f254, %f190, %f232; - add.ftz.f32 %f255, %f190, %f233; - fma.rn.ftz.f32 %f256, %f97, %f121, %f234; - fma.rn.ftz.f32 %f257, %f97, %f121, %f235; - add.ftz.f32 %f258, %f169, %f236; - add.ftz.f32 %f259, %f169, %f237; - add.ftz.f32 %f260, %f193, %f238; - add.ftz.f32 %f261, %f193, %f239; - add.ftz.f32 %f262, %f230, %f248; - add.ftz.f32 %f263, %f231, %f248; - add.ftz.f32 %f264, %f256, %f257; - ld.param.u64 %rd29, [__cudaparm_kernel_ellipsoid_sig_eps]; - mov.f32 %f265, 0f00000000; // 0 - mov.f32 %f266, 0f00000000; // 0 - mov.f32 %f267, 0f00000000; // 0 - mov.f32 %f268, 0f00000000; // 0 - mov.f32 %f269, 0f00000000; // 0 - mov.f32 %f270, 0f00000000; // 0 - mov.f32 %f271, 0f00000000; // 0 - mov.u64 %rd30, __cuda___cuda_local_var_32902_33_non_const_sp_lj120; -$Lt_0_46338: - // Loop body line 117, nesting depth: 1, estimated iterations: unknown - .loc 17 121 0 - ld.global.s32 %r23, [%rd25+0]; - .loc 17 125 0 - and.b32 %r24, %r23, 1073741823; - cvt.s64.s32 %rd31, %r24; - mul.wide.s32 %rd32, %r24, 16; - add.u64 %rd33, %rd32, %rd11; - ld.global.v4.f32 {%f272,%f273,%f274,%f275}, [%rd33+0]; - .loc 17 136 0 - sub.ftz.f32 %f276, %f273, %f31; - sub.ftz.f32 %f277, %f272, %f30; - sub.ftz.f32 %f278, %f274, %f32; - mul.ftz.f32 %f279, %f276, %f276; - fma.rn.ftz.f32 %f280, %f277, %f277, %f279; - fma.rn.ftz.f32 %f281, %f278, %f278, %f280; - rsqrt.approx.ftz.f32 %f282, %f281; - mul.ftz.f32 %f283, %f277, %f282; - .loc 17 137 0 - mul.ftz.f32 %f284, %f276, %f282; - .loc 17 145 0 - cvt.rzi.ftz.s32.f32 %r25, %f275; - cvt.s64.s32 %rd34, %r25; - mul.wide.s32 %rd35, %r25, 16; - add.u64 %rd36, %rd35, %rd15; - ld.global.v4.f32 {%f285,%f286,%f287,_}, [%rd36+0]; - .loc 17 152 0 - add.u64 %rd37, %rd32, %rd17; - ld.global.v4.f32 {%f288,%f289,%f290,%f291}, [%rd37+0]; - .loc 16 299 0 - mov.f32 %f292, %f283; - .loc 16 300 0 - mul.ftz.f32 %f293, %f286, %f286; - add.ftz.f32 %f294, %f289, %f289; - add.ftz.f32 %f295, %f291, %f291; - mul.ftz.f32 %f296, %f288, %f288; - mul.ftz.f32 %f297, %f289, %f289; - mul.ftz.f32 %f298, %f290, %f290; - mul.ftz.f32 %f299, %f291, %f291; - mul.ftz.f32 %f300, %f285, %f285; - add.ftz.f32 %f301, %f290, %f290; - mul.ftz.f32 %f302, %f287, %f287; - mul.ftz.f32 %f303, %f294, %f290; - mul.ftz.f32 %f304, %f294, %f291; - mul.ftz.f32 %f305, %f295, %f288; - add.ftz.f32 %f306, %f296, %f297; - mul.ftz.f32 %f307, %f301, %f288; - sub.ftz.f32 %f308, %f303, %f305; - sub.ftz.f32 %f309, %f306, %f298; - add.ftz.f32 %f310, %f304, %f307; - mul.ftz.f32 %f311, %f293, %f308; - sub.ftz.f32 %f312, %f309, %f299; - mul.ftz.f32 %f313, %f302, %f310; - mul.ftz.f32 %f314, %f308, %f311; - mul.ftz.f32 %f315, %f300, %f312; - fma.rn.ftz.f32 %f316, %f312, %f315, %f314; - fma.rn.ftz.f32 %f317, %f313, %f310, %f316; - add.ftz.f32 %f318, %f185, %f317; - mov.f32 %f319, %f318; - .loc 16 301 0 - mul.ftz.f32 %f320, %f294, %f288; - sub.ftz.f32 %f321, %f296, %f297; - mul.ftz.f32 %f322, %f301, %f291; - add.ftz.f32 %f323, %f303, %f305; - add.ftz.f32 %f324, %f298, %f321; - sub.ftz.f32 %f325, %f322, %f320; - sub.ftz.f32 %f326, %f324, %f299; - mul.ftz.f32 %f327, %f311, %f326; - fma.rn.ftz.f32 %f328, %f315, %f323, %f327; - fma.rn.ftz.f32 %f329, %f313, %f325, %f328; - add.ftz.f32 %f330, %f196, %f329; - mov.f32 %f331, %f330; - .loc 16 302 0 - sub.ftz.f32 %f332, %f321, %f298; - sub.ftz.f32 %f333, %f304, %f307; - add.ftz.f32 %f334, %f320, %f322; - add.ftz.f32 %f335, %f299, %f332; - mul.ftz.f32 %f336, %f334, %f311; - fma.rn.ftz.f32 %f337, %f315, %f333, %f336; - fma.rn.ftz.f32 %f338, %f313, %f335, %f337; - add.ftz.f32 %f339, %f186, %f338; - mov.f32 %f340, %f339; - .loc 16 303 0 - mov.f32 %f341, %f284; - .loc 16 304 0 - mul.ftz.f32 %f342, %f300, %f323; - mul.ftz.f32 %f343, %f302, %f325; - mul.ftz.f32 %f344, %f293, %f326; - mul.ftz.f32 %f345, %f308, %f344; - fma.rn.ftz.f32 %f346, %f312, %f342, %f345; - fma.rn.ftz.f32 %f347, %f343, %f310, %f346; - add.ftz.f32 %f348, %f228, %f347; - mov.f32 %f349, %f348; - .loc 16 305 0 - mul.ftz.f32 %f350, %f326, %f344; - fma.rn.ftz.f32 %f351, %f323, %f342, %f350; - fma.rn.ftz.f32 %f352, %f343, %f325, %f351; - add.ftz.f32 %f353, %f230, %f352; - .loc 16 306 0 - mul.ftz.f32 %f354, %f334, %f344; - fma.rn.ftz.f32 %f355, %f333, %f342, %f354; - fma.rn.ftz.f32 %f356, %f343, %f335, %f355; - add.ftz.f32 %f357, %f232, %f356; - .loc 16 307 0 - mul.ftz.f32 %f358, %f278, %f282; - mov.f32 %f359, %f358; - .loc 16 308 0 - mul.ftz.f32 %f360, %f300, %f333; - mul.ftz.f32 %f361, %f293, %f334; - mul.ftz.f32 %f362, %f302, %f335; - mul.ftz.f32 %f363, %f308, %f361; - fma.rn.ftz.f32 %f364, %f312, %f360, %f363; - fma.rn.ftz.f32 %f365, %f310, %f362, %f364; - add.ftz.f32 %f366, %f213, %f365; - mov.f32 %f367, %f366; - .loc 16 309 0 - mul.ftz.f32 %f368, %f326, %f361; - fma.rn.ftz.f32 %f369, %f323, %f360, %f368; - fma.rn.ftz.f32 %f370, %f325, %f362, %f369; - add.ftz.f32 %f371, %f214, %f370; - .loc 16 310 0 - mul.ftz.f32 %f372, %f334, %f361; - fma.rn.ftz.f32 %f373, %f333, %f360, %f372; - fma.rn.ftz.f32 %f374, %f335, %f362, %f373; - add.ftz.f32 %f375, %f216, %f374; - abs.ftz.f32 %f376, %f348; - abs.ftz.f32 %f377, %f318; - setp.gt.ftz.f32 %p4, %f376, %f377; - @!%p4 bra $Lt_0_46594; - .loc 16 314 0 - mov.f32 %f319, %f348; - mov.f32 %f349, %f318; - .loc 16 315 0 - mov.f32 %f331, %f353; - mov.f32 %f353, %f330; - .loc 16 316 0 - mov.f32 %f340, %f357; - mov.f32 %f357, %f339; - .loc 16 317 0 - mov.f32 %f292, %f284; - mov.f32 %f341, %f283; -$Lt_0_46594: - mov.f32 %f378, %f319; - abs.ftz.f32 %f379, %f378; - abs.ftz.f32 %f380, %f366; - setp.lt.ftz.f32 %p5, %f379, %f380; - @!%p5 bra $Lt_0_47106; - .loc 16 321 0 - mov.f32 %f319, %f366; - mov.f32 %f367, %f378; - .loc 16 322 0 - mov.f32 %f381, %f331; - mov.f32 %f331, %f371; - mov.f32 %f371, %f381; - .loc 16 323 0 - mov.f32 %f382, %f340; - mov.f32 %f340, %f375; - mov.f32 %f375, %f382; - .loc 16 324 0 - mov.f32 %f383, %f292; - mov.f32 %f292, %f358; - mov.f32 %f359, %f383; -$Lt_0_47106: - mov.f32 %f384, %f319; - mov.f32 %f385, 0f00000000; // 0 - setp.neu.ftz.f32 %p6, %f384, %f385; - @!%p6 bra $Lt_0_47874; - bra.uni $Lt_0_48642; -$Lt_0_47874: - mov.f32 %f386, 0f00000000; // 0 - setp.neu.ftz.f32 %p7, %f349, %f386; - @!%p7 bra $Lt_0_48386; - .loc 16 338 0 - mov.f32 %f319, %f349; - mov.f32 %f349, %f384; - .loc 16 339 0 - mov.f32 %f387, %f331; - mov.f32 %f331, %f353; - mov.f32 %f353, %f387; - .loc 16 340 0 - mov.f32 %f388, %f340; - mov.f32 %f340, %f357; - mov.f32 %f357, %f388; - .loc 16 341 0 - mov.f32 %f389, %f292; - mov.f32 %f292, %f341; - mov.f32 %f341, %f389; - bra.uni $Lt_0_48642; -$Lt_0_48386: - mov.f32 %f390, 0f00000000; // 0 - setp.neu.ftz.f32 %p8, %f367, %f390; - @!%p8 bra $Lt_0_48898; - .loc 16 346 0 - mov.f32 %f319, %f367; - mov.f32 %f367, %f384; - .loc 16 347 0 - mov.f32 %f391, %f331; - mov.f32 %f331, %f371; - mov.f32 %f371, %f391; - .loc 16 348 0 - mov.f32 %f392, %f340; - mov.f32 %f340, %f375; - mov.f32 %f375, %f392; - .loc 16 349 0 - mov.f32 %f393, %f292; - mov.f32 %f292, %f359; - mov.f32 %f359, %f393; - bra.uni $Lt_0_48642; -$Lt_0_48898: - .loc 16 352 0 - mov.s32 %r26, 2; - ld.param.u64 %rd38, [__cudaparm_kernel_ellipsoid_err_flag]; - st.global.s32 [%rd38+0], %r26; -$Lt_0_48642: -$Lt_0_48130: -$Lt_0_47618: - .loc 16 355 0 - div.approx.ftz.f32 %f394, %f349, %f319; - mul.ftz.f32 %f395, %f331, %f394; - sub.ftz.f32 %f396, %f353, %f395; - mov.f32 %f353, %f396; - .loc 16 356 0 - mul.ftz.f32 %f397, %f340, %f394; - sub.ftz.f32 %f398, %f357, %f397; - mov.f32 %f357, %f398; - .loc 16 357 0 - mul.ftz.f32 %f399, %f292, %f394; - sub.ftz.f32 %f400, %f341, %f399; - mov.f32 %f341, %f400; - .loc 16 359 0 - div.approx.ftz.f32 %f401, %f367, %f319; - mul.ftz.f32 %f402, %f331, %f401; - sub.ftz.f32 %f371, %f371, %f402; - .loc 16 360 0 - mul.ftz.f32 %f403, %f340, %f401; - sub.ftz.f32 %f375, %f375, %f403; - .loc 16 361 0 - mul.ftz.f32 %f404, %f292, %f401; - sub.ftz.f32 %f359, %f359, %f404; - abs.ftz.f32 %f405, %f396; - abs.ftz.f32 %f406, %f371; - setp.lt.ftz.f32 %p9, %f405, %f406; - @!%p9 bra $Lt_0_49154; - .loc 16 366 0 - mov.f32 %f353, %f371; - mov.f32 %f371, %f396; - .loc 16 367 0 - mov.f32 %f357, %f375; - mov.f32 %f375, %f398; - .loc 16 368 0 - mov.f32 %f341, %f359; - mov.f32 %f359, %f400; -$Lt_0_49154: - mov.f32 %f407, %f353; - mov.f32 %f408, 0f00000000; // 0 - setp.neu.ftz.f32 %p10, %f407, %f408; - @!%p10 bra $Lt_0_49922; - bra.uni $Lt_0_50178; -$Lt_0_49922: - mov.f32 %f409, 0f00000000; // 0 - setp.neu.ftz.f32 %p11, %f371, %f409; - @!%p11 bra $Lt_0_50178; - .loc 16 383 0 - mov.f32 %f353, %f371; - mov.f32 %f371, %f407; - .loc 16 384 0 - mov.f32 %f410, %f357; - mov.f32 %f357, %f375; - mov.f32 %f375, %f410; - .loc 16 385 0 - mov.f32 %f411, %f341; - mov.f32 %f341, %f359; - mov.f32 %f359, %f411; -$Lt_0_50178: -$Lt_0_49666: - .loc 16 390 0 - div.approx.ftz.f32 %f412, %f371, %f353; - mul.ftz.f32 %f413, %f357, %f412; - sub.ftz.f32 %f375, %f375, %f413; - .loc 16 391 0 - mul.ftz.f32 %f414, %f341, %f412; - sub.ftz.f32 %f359, %f359, %f414; - mov.f32 %f415, 0f00000000; // 0 - setp.eq.ftz.f32 %p12, %f375, %f415; - @!%p12 bra $Lt_0_50690; - .loc 16 394 0 - mov.s32 %r27, 2; - ld.param.u64 %rd39, [__cudaparm_kernel_ellipsoid_err_flag]; - st.global.s32 [%rd39+0], %r27; -$Lt_0_50690: - .loc 16 396 0 - div.approx.ftz.f32 %f416, %f359, %f375; - .loc 16 399 0 - mul.ftz.f32 %f417, %f416, %f357; - sub.ftz.f32 %f418, %f341, %f417; - div.approx.ftz.f32 %f419, %f418, %f353; - .loc 16 403 0 - mul.ftz.f32 %f420, %f419, %f331; - fma.rn.ftz.f32 %f421, %f340, %f416, %f420; - sub.ftz.f32 %f422, %f292, %f421; - div.approx.ftz.f32 %f423, %f422, %f319; - .loc 17 161 0 - mul.ftz.f32 %f424, %f419, %f284; - fma.rn.ftz.f32 %f425, %f283, %f423, %f424; - fma.rn.ftz.f32 %f426, %f358, %f416, %f425; - mov.f32 %f427, 0f3f000000; // 0.5 - mul.ftz.f32 %f428, %f426, %f427; - rsqrt.approx.ftz.f32 %f429, %f428; - .loc 17 170 0 - mul.ftz.f32 %f430, %f89, %f284; - mul.ftz.f32 %f431, %f71, %f284; - mul.ftz.f32 %f432, %f79, %f284; - fma.rn.ftz.f32 %f433, %f283, %f70, %f430; - fma.rn.ftz.f32 %f434, %f87, %f283, %f431; - fma.rn.ftz.f32 %f435, %f283, %f76, %f432; - fma.rn.ftz.f32 %f436, %f358, %f80, %f433; - fma.rn.ftz.f32 %f437, %f77, %f358, %f434; - fma.rn.ftz.f32 %f438, %f358, %f91, %f435; - mul.ftz.f32 %f439, %f58, %f436; - mul.ftz.f32 %f440, %f65, %f437; - mul.ftz.f32 %f441, %f68, %f438; - mul.ftz.f32 %f442, %f436, %f439; - fma.rn.ftz.f32 %f443, %f437, %f440, %f442; - fma.rn.ftz.f32 %f444, %f438, %f441, %f443; - sqrt.approx.ftz.f32 %f445, %f444; - .loc 17 171 0 - mul.ftz.f32 %f446, %f326, %f284; - mul.ftz.f32 %f447, %f323, %f284; - mul.ftz.f32 %f448, %f325, %f284; - fma.rn.ftz.f32 %f449, %f283, %f308, %f446; - fma.rn.ftz.f32 %f450, %f283, %f312, %f447; - fma.rn.ftz.f32 %f451, %f283, %f310, %f448; - fma.rn.ftz.f32 %f452, %f358, %f334, %f449; - fma.rn.ftz.f32 %f453, %f358, %f333, %f450; - fma.rn.ftz.f32 %f454, %f358, %f335, %f451; - div.approx.ftz.f32 %f455, %f452, %f293; - div.approx.ftz.f32 %f456, %f453, %f300; - div.approx.ftz.f32 %f457, %f454, %f302; - mul.ftz.f32 %f458, %f452, %f455; - fma.rn.ftz.f32 %f459, %f453, %f456, %f458; - fma.rn.ftz.f32 %f460, %f454, %f457, %f459; - sqrt.approx.ftz.f32 %f461, %f460; - .loc 17 184 0 - mul.ftz.f32 %f462, %f317, %f461; - mul.ftz.f32 %f463, %f338, %f461; - mul.ftz.f32 %f464, %f329, %f461; - mul.ftz.f32 %f465, %f365, %f461; - mul.ftz.f32 %f466, %f370, %f461; - mul.ftz.f32 %f467, %f374, %f461; - fma.rn.ftz.f32 %f468, %f185, %f445, %f462; - fma.rn.ftz.f32 %f469, %f186, %f445, %f463; - fma.rn.ftz.f32 %f470, %f196, %f445, %f464; - mul.ftz.f32 %f471, %f347, %f461; - mul.ftz.f32 %f472, %f352, %f461; - mul.ftz.f32 %f473, %f356, %f461; - fma.rn.ftz.f32 %f474, %f213, %f445, %f465; - fma.rn.ftz.f32 %f475, %f214, %f445, %f466; - fma.rn.ftz.f32 %f476, %f216, %f445, %f467; - fma.rn.ftz.f32 %f477, %f228, %f445, %f471; - fma.rn.ftz.f32 %f478, %f230, %f445, %f472; - fma.rn.ftz.f32 %f479, %f232, %f445, %f473; - mul.ftz.f32 %f480, %f470, %f474; - mul.ftz.f32 %f481, %f469, %f474; - mul.ftz.f32 %f482, %f470, %f477; - mul.ftz.f32 %f483, %f469, %f477; - mul.ftz.f32 %f484, %f468, %f478; - mul.ftz.f32 %f485, %f468, %f479; - mul.ftz.f32 %f486, %f475, %f485; - mul.ftz.f32 %f487, %f476, %f484; - sub.ftz.f32 %f488, %f487, %f486; - mul.ftz.f32 %f489, %f476, %f482; - sub.ftz.f32 %f490, %f488, %f489; - fma.rn.ftz.f32 %f491, %f475, %f483, %f490; - fma.rn.ftz.f32 %f492, %f479, %f480, %f491; - mul.ftz.f32 %f493, %f478, %f481; - sub.ftz.f32 %f494, %f492, %f493; - .loc 17 201 0 - add.s32 %r28, %r25, %r22; - cvt.s64.s32 %rd40, %r28; - mul.wide.s32 %rd41, %r28, 8; - add.u64 %rd42, %rd29, %rd41; - ld.global.v2.f32 {%f495,%f496}, [%rd42+0]; - .loc 17 202 0 - shr.s32 %r29, %r23, 30; - and.b32 %r30, %r29, 3; - cvt.s64.s32 %rd43, %r30; - mul.wide.s32 %rd44, %r30, 4; - add.u64 %rd45, %rd30, %rd44; - ld.shared.f32 %f497, [%rd45+0]; - mul.ftz.f32 %f498, %f497, %f496; - .loc 17 207 0 - add.u64 %rd46, %rd35, %rd19; - ld.global.v4.f32 {%f499,%f500,%f501,_}, [%rd46+0]; - .loc 16 299 0 - mov.f32 %f292, %f283; - .loc 16 300 0 - mul.ftz.f32 %f502, %f308, %f500; - mul.ftz.f32 %f503, %f310, %f501; - mul.ftz.f32 %f504, %f308, %f502; - mul.ftz.f32 %f505, %f312, %f499; - fma.rn.ftz.f32 %f506, %f312, %f505, %f504; - fma.rn.ftz.f32 %f507, %f503, %f310, %f506; - add.ftz.f32 %f508, %f194, %f507; - mov.f32 %f319, %f508; - .loc 16 301 0 - mul.ftz.f32 %f509, %f502, %f326; - fma.rn.ftz.f32 %f510, %f505, %f323, %f509; - fma.rn.ftz.f32 %f511, %f503, %f325, %f510; - add.ftz.f32 %f512, %f206, %f511; - mov.f32 %f331, %f512; - .loc 16 302 0 - mul.ftz.f32 %f513, %f334, %f502; - fma.rn.ftz.f32 %f514, %f505, %f333, %f513; - fma.rn.ftz.f32 %f515, %f503, %f335, %f514; - add.ftz.f32 %f516, %f195, %f515; - mov.f32 %f340, %f516; - .loc 16 303 0 - mov.f32 %f341, %f284; - .loc 16 304 0 - mul.ftz.f32 %f517, %f323, %f499; - mul.ftz.f32 %f518, %f325, %f501; - mul.ftz.f32 %f519, %f326, %f500; - mul.ftz.f32 %f520, %f308, %f519; - fma.rn.ftz.f32 %f521, %f312, %f517, %f520; - fma.rn.ftz.f32 %f522, %f518, %f310, %f521; - add.ftz.f32 %f523, %f240, %f522; - mov.f32 %f349, %f523; - .loc 16 305 0 - mul.ftz.f32 %f524, %f326, %f519; - fma.rn.ftz.f32 %f525, %f323, %f517, %f524; - fma.rn.ftz.f32 %f526, %f518, %f325, %f525; - add.ftz.f32 %f353, %f241, %f526; - .loc 16 306 0 - mul.ftz.f32 %f527, %f334, %f519; - fma.rn.ftz.f32 %f528, %f333, %f517, %f527; - fma.rn.ftz.f32 %f529, %f518, %f335, %f528; - add.ftz.f32 %f357, %f242, %f529; - .loc 16 307 0 - mov.f32 %f359, %f358; - .loc 16 308 0 - mul.ftz.f32 %f530, %f333, %f499; - mul.ftz.f32 %f531, %f334, %f500; - mul.ftz.f32 %f532, %f335, %f501; - mul.ftz.f32 %f533, %f308, %f531; - fma.rn.ftz.f32 %f534, %f312, %f530, %f533; - fma.rn.ftz.f32 %f535, %f310, %f532, %f534; - add.ftz.f32 %f536, %f221, %f535; - mov.f32 %f367, %f536; - .loc 16 309 0 - mul.ftz.f32 %f537, %f326, %f531; - fma.rn.ftz.f32 %f538, %f323, %f530, %f537; - fma.rn.ftz.f32 %f539, %f325, %f532, %f538; - add.ftz.f32 %f371, %f222, %f539; - .loc 16 310 0 - mul.ftz.f32 %f540, %f334, %f531; - fma.rn.ftz.f32 %f541, %f333, %f530, %f540; - fma.rn.ftz.f32 %f542, %f335, %f532, %f541; - add.ftz.f32 %f375, %f223, %f542; - abs.ftz.f32 %f543, %f523; - abs.ftz.f32 %f544, %f508; - setp.gt.ftz.f32 %p13, %f543, %f544; - @!%p13 bra $Lt_0_51202; - .loc 16 314 0 - mov.f32 %f319, %f523; - mov.f32 %f349, %f508; - .loc 16 315 0 - mov.f32 %f331, %f353; - mov.f32 %f353, %f512; - .loc 16 316 0 - mov.f32 %f340, %f357; - mov.f32 %f357, %f516; - .loc 16 317 0 - mov.f32 %f292, %f284; - mov.f32 %f341, %f283; -$Lt_0_51202: - mov.f32 %f545, %f319; - abs.ftz.f32 %f546, %f545; - abs.ftz.f32 %f547, %f536; - setp.lt.ftz.f32 %p14, %f546, %f547; - @!%p14 bra $Lt_0_51714; - .loc 16 321 0 - mov.f32 %f319, %f536; - mov.f32 %f367, %f545; - .loc 16 322 0 - mov.f32 %f548, %f331; - mov.f32 %f331, %f371; - mov.f32 %f371, %f548; - .loc 16 323 0 - mov.f32 %f549, %f340; - mov.f32 %f340, %f375; - mov.f32 %f375, %f549; - .loc 16 324 0 - mov.f32 %f550, %f292; - mov.f32 %f292, %f358; - mov.f32 %f359, %f550; -$Lt_0_51714: - mov.f32 %f551, %f319; - mov.f32 %f552, 0f00000000; // 0 - setp.neu.ftz.f32 %p15, %f551, %f552; - @!%p15 bra $Lt_0_52482; - bra.uni $Lt_0_53250; -$Lt_0_52482: - mov.f32 %f553, 0f00000000; // 0 - setp.neu.ftz.f32 %p16, %f349, %f553; - @!%p16 bra $Lt_0_52994; - .loc 16 338 0 - mov.f32 %f319, %f349; - mov.f32 %f349, %f551; - .loc 16 339 0 - mov.f32 %f554, %f331; - mov.f32 %f331, %f353; - mov.f32 %f353, %f554; - .loc 16 340 0 - mov.f32 %f555, %f340; - mov.f32 %f340, %f357; - mov.f32 %f357, %f555; - .loc 16 341 0 - mov.f32 %f556, %f292; - mov.f32 %f292, %f341; - mov.f32 %f341, %f556; - bra.uni $Lt_0_53250; -$Lt_0_52994: - mov.f32 %f557, 0f00000000; // 0 - setp.neu.ftz.f32 %p17, %f367, %f557; - @!%p17 bra $Lt_0_53506; - .loc 16 346 0 - mov.f32 %f319, %f367; - mov.f32 %f367, %f551; - .loc 16 347 0 - mov.f32 %f558, %f331; - mov.f32 %f331, %f371; - mov.f32 %f371, %f558; - .loc 16 348 0 - mov.f32 %f559, %f340; - mov.f32 %f340, %f375; - mov.f32 %f375, %f559; - .loc 16 349 0 - mov.f32 %f560, %f292; - mov.f32 %f292, %f359; - mov.f32 %f359, %f560; - bra.uni $Lt_0_53250; -$Lt_0_53506: - .loc 16 352 0 - mov.s32 %r31, 2; - ld.param.u64 %rd47, [__cudaparm_kernel_ellipsoid_err_flag]; - st.global.s32 [%rd47+0], %r31; -$Lt_0_53250: -$Lt_0_52738: -$Lt_0_52226: - .loc 16 355 0 - div.approx.ftz.f32 %f561, %f349, %f319; - mul.ftz.f32 %f562, %f331, %f561; - sub.ftz.f32 %f563, %f353, %f562; - mov.f32 %f353, %f563; - .loc 16 356 0 - mul.ftz.f32 %f564, %f340, %f561; - sub.ftz.f32 %f565, %f357, %f564; - mov.f32 %f357, %f565; - .loc 16 357 0 - mul.ftz.f32 %f566, %f292, %f561; - sub.ftz.f32 %f567, %f341, %f566; - mov.f32 %f341, %f567; - .loc 16 359 0 - div.approx.ftz.f32 %f568, %f367, %f319; - mul.ftz.f32 %f569, %f331, %f568; - sub.ftz.f32 %f371, %f371, %f569; - .loc 16 360 0 - mul.ftz.f32 %f570, %f340, %f568; - sub.ftz.f32 %f375, %f375, %f570; - .loc 16 361 0 - mul.ftz.f32 %f571, %f292, %f568; - sub.ftz.f32 %f359, %f359, %f571; - abs.ftz.f32 %f572, %f563; - abs.ftz.f32 %f573, %f371; - setp.lt.ftz.f32 %p18, %f572, %f573; - @!%p18 bra $Lt_0_53762; - .loc 16 366 0 - mov.f32 %f353, %f371; - mov.f32 %f371, %f563; - .loc 16 367 0 - mov.f32 %f357, %f375; - mov.f32 %f375, %f565; - .loc 16 368 0 - mov.f32 %f341, %f359; - mov.f32 %f359, %f567; -$Lt_0_53762: - mov.f32 %f574, %f353; - mov.f32 %f575, 0f00000000; // 0 - setp.neu.ftz.f32 %p19, %f574, %f575; - @!%p19 bra $Lt_0_54530; - bra.uni $Lt_0_54786; -$Lt_0_54530: - mov.f32 %f576, 0f00000000; // 0 - setp.neu.ftz.f32 %p20, %f371, %f576; - @!%p20 bra $Lt_0_54786; - .loc 16 383 0 - mov.f32 %f353, %f371; - mov.f32 %f371, %f574; - .loc 16 384 0 - mov.f32 %f577, %f357; - mov.f32 %f357, %f375; - mov.f32 %f375, %f577; - .loc 16 385 0 - mov.f32 %f578, %f341; - mov.f32 %f341, %f359; - mov.f32 %f359, %f578; -$Lt_0_54786: -$Lt_0_54274: - .loc 16 390 0 - div.approx.ftz.f32 %f579, %f371, %f353; - mul.ftz.f32 %f580, %f357, %f579; - sub.ftz.f32 %f375, %f375, %f580; - .loc 16 391 0 - mul.ftz.f32 %f581, %f341, %f579; - sub.ftz.f32 %f359, %f359, %f581; - mov.f32 %f582, 0f00000000; // 0 - setp.eq.ftz.f32 %p21, %f375, %f582; - @!%p21 bra $Lt_0_55298; - .loc 16 394 0 - mov.s32 %r32, 2; - ld.param.u64 %rd48, [__cudaparm_kernel_ellipsoid_err_flag]; - st.global.s32 [%rd48+0], %r32; -$Lt_0_55298: - .loc 17 213 0 - div.approx.ftz.f32 %f583, %f359, %f375; - mul.ftz.f32 %f584, %f583, %f357; - sub.ftz.f32 %f585, %f341, %f584; - div.approx.ftz.f32 %f586, %f585, %f353; - mul.ftz.f32 %f587, %f586, %f331; - fma.rn.ftz.f32 %f588, %f340, %f583, %f587; - mul.ftz.f32 %f589, %f586, %f284; - sub.ftz.f32 %f590, %f292, %f588; - div.approx.ftz.f32 %f591, %f590, %f319; - fma.rn.ftz.f32 %f592, %f283, %f591, %f589; - fma.rn.ftz.f32 %f593, %f358, %f583, %f592; - add.ftz.f32 %f594, %f593, %f593; - .loc 17 220 0 - rcp.approx.ftz.f32 %f595, %f282; - sub.ftz.f32 %f596, %f595, %f429; - mov.f32 %f597, 0f3f000000; // 0.5 - mul.ftz.f32 %f598, %f596, %f597; - add.ftz.f32 %f599, %f598, %f287; - add.ftz.f32 %f600, %f598, %f286; - add.ftz.f32 %f601, %f598, %f285; - add.ftz.f32 %f602, %f598, %f36; - add.ftz.f32 %f603, %f598, %f34; - add.ftz.f32 %f604, %f598, %f35; - mul.ftz.f32 %f605, %f603, %f604; - mul.ftz.f32 %f606, %f602, %f605; - mul.ftz.f32 %f607, %f601, %f606; - mul.ftz.f32 %f608, %f600, %f607; - mul.ftz.f32 %f609, %f599, %f608; - .loc 17 223 0 - mul.ftz.f32 %f610, %f461, %f461; - mul.ftz.f32 %f611, %f285, %f286; - mul.ftz.f32 %f612, %f445, %f445; - rcp.approx.ftz.f32 %f613, %f445; - rcp.approx.ftz.f32 %f614, %f461; - mul.ftz.f32 %f615, %f611, %f287; - add.ftz.f32 %f616, %f613, %f614; - mul.ftz.f32 %f617, %f610, %f615; - mul.ftz.f32 %f618, %f615, %f69; - div.approx.ftz.f32 %f619, %f616, %f494; - fma.rn.ftz.f32 %f620, %f69, %f612, %f617; - rsqrt.approx.ftz.f32 %f621, %f619; - div.approx.ftz.f32 %f622, %f620, %f621; - mul.ftz.f32 %f623, %f622, %f594; - div.approx.ftz.f32 %f624, %f495, %f596; - mul.ftz.f32 %f625, %f623, %f624; - mov.f32 %f626, 0f3f800000; // 1 - mov.f32 %f627, 0f40400000; // 3 - fma.rn.ftz.f32 %f628, %f627, %f625, %f626; - mul.ftz.f32 %f629, %f618, %f628; - .loc 17 228 0 - div.approx.ftz.f32 %f630, %f596, %f17; - add.ftz.f32 %f631, %f630, %f287; - add.ftz.f32 %f632, %f630, %f286; - add.ftz.f32 %f633, %f630, %f285; - add.ftz.f32 %f634, %f630, %f36; - add.ftz.f32 %f635, %f630, %f34; - add.ftz.f32 %f636, %f630, %f35; - mul.ftz.f32 %f637, %f635, %f636; - mul.ftz.f32 %f638, %f634, %f637; - mul.ftz.f32 %f639, %f633, %f638; - mul.ftz.f32 %f640, %f632, %f639; - mul.ftz.f32 %f641, %f631, %f640; - .loc 17 231 0 - mov.f32 %f642, 0f3f800000; // 1 - mov.f32 %f643, 0f3f4db6db; // 0.803571 - fma.rn.ftz.f32 %f644, %f643, %f625, %f642; - mul.ftz.f32 %f645, %f618, %f644; - .loc 17 233 0 - mul.ftz.f32 %f646, %f624, %f624; - mul.ftz.f32 %f647, %f624, %f646; - mul.ftz.f32 %f648, %f647, %f647; - .loc 17 236 0 - div.approx.ftz.f32 %f649, %f629, %f609; - div.approx.ftz.f32 %f650, %f645, %f641; - mul.ftz.f32 %f651, %f649, %f498; - mul.ftz.f32 %f652, %f650, %f498; - mov.f32 %f653, 0fc2100000; // -36 - div.approx.ftz.f32 %f654, %f651, %f653; - mul.ftz.f32 %f655, %f652, %f648; - mov.f32 %f656, 0f44fd2000; // 2025 - div.approx.ftz.f32 %f657, %f655, %f656; - add.ftz.f32 %f658, %f654, %f657; - add.ftz.f32 %f271, %f271, %f658; - .loc 17 246 0 - div.approx.ftz.f32 %f659, %f613, %f612; - mul.ftz.f32 %f660, %f659, %f440; - neg.ftz.f32 %f661, %f660; - .loc 17 247 0 - mul.ftz.f32 %f662, %f659, %f439; - neg.ftz.f32 %f663, %f662; - .loc 17 248 0 - mul.ftz.f32 %f664, %f659, %f441; - neg.ftz.f32 %f665, %f664; - .loc 17 249 0 - div.approx.ftz.f32 %f666, %f614, %f610; - mul.ftz.f32 %f667, %f666, %f456; - neg.ftz.f32 %f668, %f667; - .loc 17 250 0 - mul.ftz.f32 %f669, %f666, %f455; - neg.ftz.f32 %f670, %f669; - .loc 17 251 0 - mul.ftz.f32 %f671, %f666, %f457; - neg.ftz.f32 %f672, %f671; - .loc 21 544 0 - add.ftz.f32 %f673, %f622, %f622; - div.approx.ftz.f32 %f674, %f673, %f620; - mul.ftz.f32 %f675, %f615, %f674; - div.approx.ftz.f32 %f676, %f675, %f666; - mul.ftz.f32 %f677, %f69, %f674; - div.approx.ftz.f32 %f678, %f677, %f659; - .loc 17 278 0 - mov.f32 %f679, 0f40800000; // 4 - mul.ftz.f32 %f680, %f591, %f679; - .loc 17 286 0 - add.ftz.f32 %f681, %f55, %f596; - rcp.approx.ftz.f32 %f682, %f681; - add.ftz.f32 %f683, %f56, %f596; - rcp.approx.ftz.f32 %f684, %f683; - add.ftz.f32 %f685, %f682, %f684; - add.ftz.f32 %f686, %f57, %f596; - rcp.approx.ftz.f32 %f687, %f686; - add.ftz.f32 %f688, %f685, %f687; - add.ftz.f32 %f689, %f285, %f285; - add.ftz.f32 %f690, %f596, %f689; - rcp.approx.ftz.f32 %f691, %f690; - add.ftz.f32 %f692, %f688, %f691; - add.ftz.f32 %f693, %f286, %f286; - add.ftz.f32 %f694, %f596, %f693; - rcp.approx.ftz.f32 %f695, %f694; - add.ftz.f32 %f696, %f692, %f695; - add.ftz.f32 %f697, %f287, %f287; - add.ftz.f32 %f698, %f596, %f697; - rcp.approx.ftz.f32 %f699, %f698; - add.ftz.f32 %f700, %f696, %f699; - .loc 17 293 0 - mul.ftz.f32 %f701, %f622, %f495; - mul.ftz.f32 %f702, %f701, %f594; - mov.f32 %f703, 0f40400000; // 3 - fma.rn.ftz.f32 %f704, %f703, %f702, %f596; - rcp.approx.ftz.f32 %f705, %f704; - rcp.approx.ftz.f32 %f706, %f596; - sub.ftz.f32 %f707, %f706, %f705; - add.ftz.f32 %f708, %f700, %f707; - .loc 17 297 0 - fma.rn.ftz.f32 %f709, %f17, %f34, %f596; - rcp.approx.ftz.f32 %f710, %f709; - fma.rn.ftz.f32 %f711, %f17, %f35, %f596; - rcp.approx.ftz.f32 %f712, %f711; - add.ftz.f32 %f713, %f710, %f712; - fma.rn.ftz.f32 %f714, %f17, %f36, %f596; - rcp.approx.ftz.f32 %f715, %f714; - add.ftz.f32 %f716, %f713, %f715; - fma.rn.ftz.f32 %f717, %f17, %f285, %f596; - rcp.approx.ftz.f32 %f718, %f717; - add.ftz.f32 %f719, %f716, %f718; - fma.rn.ftz.f32 %f720, %f17, %f286, %f596; - rcp.approx.ftz.f32 %f721, %f720; - add.ftz.f32 %f722, %f719, %f721; - fma.rn.ftz.f32 %f723, %f17, %f287, %f596; - rcp.approx.ftz.f32 %f724, %f723; - add.ftz.f32 %f725, %f722, %f724; - .loc 17 304 0 - mov.f32 %f726, 0f40e00000; // 7 - div.approx.ftz.f32 %f727, %f726, %f596; - mov.f32 %f728, 0f3f4db6db; // 0.803571 - fma.rn.ftz.f32 %f729, %f728, %f702, %f596; - rcp.approx.ftz.f32 %f730, %f729; - sub.ftz.f32 %f731, %f727, %f730; - add.ftz.f32 %f732, %f731, %f725; - .loc 17 314 0 - mul.ftz.f32 %f733, %f283, %f283; - neg.ftz.f32 %f734, %f733; - mov.f32 %f735, %f734; - .loc 17 315 0 - mul.ftz.f32 %f736, %f284, %f283; - neg.ftz.f32 %f737, %f736; - mov.f32 %f738, %f737; - .loc 17 316 0 - mul.ftz.f32 %f739, %f358, %f283; - neg.ftz.f32 %f740, %f739; - mov.f32 %f741, %f740; - .loc 17 317 0 - mov.f32 %f742, 0f3f800000; // 1 - sub.ftz.f32 %f743, %f742, %f733; - mov.f32 %f744, %f743; - .loc 17 318 0 - mul.ftz.f32 %f745, %f282, %f743; - mov.f32 %f746, %f745; - .loc 17 319 0 - mov.f32 %f747, %f738; - mul.ftz.f32 %f748, %f747, %f282; - mov.f32 %f749, %f748; - .loc 17 320 0 - mov.f32 %f750, %f741; - mul.ftz.f32 %f751, %f750, %f282; - mov.f32 %f752, %f751; - .loc 17 325 0 - mul.ftz.f32 %f753, %f71, %f748; - mul.ftz.f32 %f754, %f79, %f748; - mul.ftz.f32 %f755, %f323, %f748; - mul.ftz.f32 %f756, %f325, %f748; - mul.ftz.f32 %f757, %f89, %f748; - mul.ftz.f32 %f758, %f326, %f748; - mul.ftz.f32 %f759, %f612, %f185; - mul.ftz.f32 %f760, %f610, %f317; - neg.ftz.f32 %f761, %f759; - fma.rn.ftz.f32 %f762, %f745, %f308, %f758; - fma.rn.ftz.f32 %f763, %f312, %f745, %f755; - fma.rn.ftz.f32 %f764, %f745, %f310, %f756; - fma.rn.ftz.f32 %f765, %f745, %f70, %f757; - fma.rn.ftz.f32 %f766, %f87, %f745, %f753; - fma.rn.ftz.f32 %f767, %f745, %f76, %f754; - fma.rn.ftz.f32 %f768, %f751, %f334, %f762; - fma.rn.ftz.f32 %f769, %f333, %f751, %f763; - fma.rn.ftz.f32 %f770, %f751, %f335, %f764; - fma.rn.ftz.f32 %f771, %f751, %f80, %f765; - fma.rn.ftz.f32 %f772, %f77, %f751, %f766; - fma.rn.ftz.f32 %f773, %f751, %f91, %f767; - mul.ftz.f32 %f774, %f768, %f670; - mul.ftz.f32 %f775, %f771, %f663; - fma.rn.ftz.f32 %f776, %f668, %f769, %f774; - fma.rn.ftz.f32 %f777, %f661, %f772, %f775; - fma.rn.ftz.f32 %f778, %f672, %f770, %f776; - fma.rn.ftz.f32 %f779, %f665, %f773, %f777; - mul.ftz.f32 %f780, %f760, %f778; - mul.ftz.f32 %f781, %f761, %f779; - sub.ftz.f32 %f782, %f781, %f780; - .loc 17 326 0 - mul.ftz.f32 %f783, %f612, %f196; - mul.ftz.f32 %f784, %f610, %f329; - neg.ftz.f32 %f785, %f783; - mul.ftz.f32 %f786, %f784, %f778; - mul.ftz.f32 %f787, %f785, %f779; - sub.ftz.f32 %f788, %f787, %f786; - .loc 17 327 0 - mul.ftz.f32 %f789, %f612, %f186; - mul.ftz.f32 %f790, %f610, %f338; - neg.ftz.f32 %f791, %f789; - mul.ftz.f32 %f792, %f790, %f778; - mul.ftz.f32 %f793, %f791, %f779; - sub.ftz.f32 %f794, %f793, %f792; - .loc 17 328 0 - mul.ftz.f32 %f795, %f612, %f228; - mul.ftz.f32 %f796, %f610, %f347; - neg.ftz.f32 %f797, %f795; - mul.ftz.f32 %f798, %f796, %f778; - mul.ftz.f32 %f799, %f797, %f779; - sub.ftz.f32 %f800, %f799, %f798; - .loc 17 329 0 - mul.ftz.f32 %f801, %f612, %f230; - mul.ftz.f32 %f802, %f610, %f352; - neg.ftz.f32 %f803, %f801; - mul.ftz.f32 %f804, %f802, %f778; - mul.ftz.f32 %f805, %f803, %f779; - sub.ftz.f32 %f806, %f805, %f804; - .loc 17 330 0 - mul.ftz.f32 %f807, %f612, %f232; - mul.ftz.f32 %f808, %f610, %f356; - neg.ftz.f32 %f809, %f807; - mul.ftz.f32 %f810, %f808, %f778; - mul.ftz.f32 %f811, %f809, %f779; - sub.ftz.f32 %f812, %f811, %f810; - .loc 17 331 0 - mul.ftz.f32 %f813, %f612, %f213; - mul.ftz.f32 %f814, %f610, %f365; - neg.ftz.f32 %f815, %f813; - mul.ftz.f32 %f816, %f814, %f778; - mul.ftz.f32 %f817, %f815, %f779; - sub.ftz.f32 %f818, %f817, %f816; - .loc 17 332 0 - mul.ftz.f32 %f819, %f612, %f214; - mul.ftz.f32 %f820, %f610, %f370; - neg.ftz.f32 %f821, %f819; - mul.ftz.f32 %f822, %f820, %f778; - mul.ftz.f32 %f823, %f821, %f779; - sub.ftz.f32 %f824, %f823, %f822; - .loc 17 333 0 - mul.ftz.f32 %f825, %f612, %f216; - mul.ftz.f32 %f826, %f610, %f374; - neg.ftz.f32 %f827, %f825; - mul.ftz.f32 %f828, %f826, %f778; - mul.ftz.f32 %f829, %f827, %f779; - sub.ftz.f32 %f830, %f829, %f828; - .loc 17 334 0 - mul.ftz.f32 %f831, %f479, %f782; - mul.ftz.f32 %f832, %f475, %f831; - mul.ftz.f32 %f833, %f478, %f782; - mul.ftz.f32 %f834, %f476, %f833; - sub.ftz.f32 %f835, %f834, %f832; - mul.ftz.f32 %f836, %f477, %f788; - mul.ftz.f32 %f837, %f476, %f836; - sub.ftz.f32 %f838, %f835, %f837; - mul.ftz.f32 %f839, %f477, %f794; - fma.rn.ftz.f32 %f840, %f475, %f839, %f838; - mul.ftz.f32 %f841, %f474, %f788; - fma.rn.ftz.f32 %f842, %f479, %f841, %f840; - mul.ftz.f32 %f843, %f474, %f794; - mul.ftz.f32 %f844, %f478, %f843; - sub.ftz.f32 %f845, %f842, %f844; - mul.ftz.f32 %f846, %f468, %f806; - fma.rn.ftz.f32 %f847, %f476, %f846, %f845; - mul.ftz.f32 %f848, %f468, %f812; - mul.ftz.f32 %f849, %f475, %f848; - sub.ftz.f32 %f850, %f847, %f849; - mul.ftz.f32 %f851, %f470, %f800; - mul.ftz.f32 %f852, %f476, %f851; - sub.ftz.f32 %f853, %f850, %f852; - mul.ftz.f32 %f854, %f469, %f800; - fma.rn.ftz.f32 %f855, %f475, %f854, %f853; - fma.rn.ftz.f32 %f856, %f812, %f480, %f855; - mul.ftz.f32 %f857, %f806, %f481; - sub.ftz.f32 %f858, %f856, %f857; - fma.rn.ftz.f32 %f859, %f830, %f484, %f858; - mul.ftz.f32 %f860, %f824, %f485; - sub.ftz.f32 %f861, %f859, %f860; - mul.ftz.f32 %f862, %f830, %f482; - sub.ftz.f32 %f863, %f861, %f862; - fma.rn.ftz.f32 %f864, %f824, %f483, %f863; - mul.ftz.f32 %f865, %f470, %f818; - fma.rn.ftz.f32 %f866, %f479, %f865, %f864; - mul.ftz.f32 %f867, %f469, %f818; - mul.ftz.f32 %f868, %f478, %f867; - sub.ftz.f32 %f869, %f866, %f868; - .loc 17 335 0 - add.ftz.f32 %f870, %f616, %f616; - div.approx.ftz.f32 %f871, %f622, %f870; - add.ftz.f32 %f872, %f778, %f779; - mul.ftz.f32 %f873, %f871, %f872; - .loc 17 336 0 - add.ftz.f32 %f874, %f494, %f494; - div.approx.ftz.f32 %f875, %f622, %f874; - mul.ftz.f32 %f876, %f869, %f875; - sub.ftz.f32 %f877, %f873, %f876; - .loc 17 337 0 - mul.ftz.f32 %f878, %f676, %f778; - fma.rn.ftz.f32 %f879, %f779, %f678, %f878; - sub.ftz.f32 %f880, %f877, %f879; - .loc 17 340 0 - mul.ftz.f32 %f881, %f429, %f429; - mov.f32 %f882, 0f40400000; // 3 - mul.ftz.f32 %f883, %f495, %f882; - mov.f32 %f884, 0f40800000; // 4 - mul.ftz.f32 %f885, %f583, %f884; - mul.ftz.f32 %f886, %f881, %f429; - mov.f32 %f887, 0f3f000000; // 0.5 - mul.ftz.f32 %f888, %f886, %f887; - mul.ftz.f32 %f889, %f888, %f419; - mul.ftz.f32 %f890, %f888, %f423; - mul.ftz.f32 %f891, %f888, %f416; - mov.f32 %f892, 0f40800000; // 4 - mul.ftz.f32 %f893, %f586, %f892; - mul.ftz.f32 %f894, %f889, %f748; - mul.ftz.f32 %f895, %f893, %f748; - fma.rn.ftz.f32 %f896, %f890, %f745, %f894; - fma.rn.ftz.f32 %f897, %f680, %f745, %f895; - mul.ftz.f32 %f898, %f883, %f705; - fma.rn.ftz.f32 %f899, %f891, %f751, %f896; - fma.rn.ftz.f32 %f900, %f885, %f751, %f897; - add.ftz.f32 %f901, %f899, %f283; - mul.ftz.f32 %f902, %f622, %f900; - fma.rn.ftz.f32 %f903, %f594, %f880, %f902; - mul.ftz.f32 %f904, %f901, %f708; - mul.ftz.f32 %f905, %f898, %f903; - sub.ftz.f32 %f906, %f905, %f904; - .loc 17 341 0 - mov.f32 %f907, 0f3f4db6db; // 0.803571 - mul.ftz.f32 %f908, %f495, %f907; - mul.ftz.f32 %f909, %f908, %f730; - mul.ftz.f32 %f910, %f901, %f732; - mul.ftz.f32 %f911, %f909, %f903; - sub.ftz.f32 %f912, %f911, %f910; - .loc 17 344 0 - mul.ftz.f32 %f913, %f657, %f912; - fma.rn.ftz.f32 %f914, %f906, %f654, %f913; - add.ftz.f32 %f270, %f914, %f270; - @!%p3 bra $Lt_0_56322; - .loc 17 346 0 - mov.f32 %f915, %f19; - mul.ftz.f32 %f916, %f277, %f914; - sub.ftz.f32 %f917, %f915, %f916; - mov.f32 %f19, %f917; -$Lt_0_56322: - .loc 17 314 0 - mov.f32 %f918, %f737; - .loc 17 315 0 - mul.ftz.f32 %f919, %f284, %f284; - neg.ftz.f32 %f920, %f919; - mov.f32 %f921, %f920; - .loc 17 316 0 - mul.ftz.f32 %f922, %f358, %f284; - neg.ftz.f32 %f923, %f922; - mov.f32 %f924, %f923; - .loc 17 317 0 - mov.f32 %f925, 0f3f800000; // 1 - sub.ftz.f32 %f926, %f925, %f919; - mov.f32 %f927, %f926; - .loc 17 318 0 - mov.f32 %f928, %f918; - mul.ftz.f32 %f929, %f928, %f282; - mov.f32 %f930, %f929; - .loc 17 319 0 - mul.ftz.f32 %f931, %f282, %f926; - mov.f32 %f932, %f931; - .loc 17 320 0 - mov.f32 %f933, %f924; - mul.ftz.f32 %f934, %f933, %f282; - mov.f32 %f935, %f934; - .loc 17 325 0 - mul.ftz.f32 %f936, %f326, %f931; - mul.ftz.f32 %f937, %f323, %f931; - mul.ftz.f32 %f938, %f325, %f931; - mul.ftz.f32 %f939, %f89, %f931; - mul.ftz.f32 %f940, %f71, %f931; - mul.ftz.f32 %f941, %f79, %f931; - fma.rn.ftz.f32 %f942, %f929, %f308, %f936; - fma.rn.ftz.f32 %f943, %f312, %f929, %f937; - fma.rn.ftz.f32 %f944, %f929, %f310, %f938; - fma.rn.ftz.f32 %f945, %f929, %f70, %f939; - fma.rn.ftz.f32 %f946, %f87, %f929, %f940; - fma.rn.ftz.f32 %f947, %f929, %f76, %f941; - fma.rn.ftz.f32 %f948, %f934, %f334, %f942; - fma.rn.ftz.f32 %f949, %f333, %f934, %f943; - fma.rn.ftz.f32 %f950, %f934, %f335, %f944; - fma.rn.ftz.f32 %f951, %f934, %f80, %f945; - fma.rn.ftz.f32 %f952, %f77, %f934, %f946; - fma.rn.ftz.f32 %f953, %f934, %f91, %f947; - mul.ftz.f32 %f954, %f948, %f670; - mul.ftz.f32 %f955, %f951, %f663; - fma.rn.ftz.f32 %f956, %f668, %f949, %f954; - fma.rn.ftz.f32 %f957, %f661, %f952, %f955; - fma.rn.ftz.f32 %f958, %f672, %f950, %f956; - fma.rn.ftz.f32 %f959, %f665, %f953, %f957; - mul.ftz.f32 %f960, %f760, %f958; - mul.ftz.f32 %f961, %f761, %f959; - sub.ftz.f32 %f962, %f961, %f960; - .loc 17 326 0 - mul.ftz.f32 %f963, %f784, %f958; - mul.ftz.f32 %f964, %f785, %f959; - sub.ftz.f32 %f965, %f964, %f963; - .loc 17 327 0 - mul.ftz.f32 %f966, %f790, %f958; - mul.ftz.f32 %f967, %f791, %f959; - sub.ftz.f32 %f968, %f967, %f966; - .loc 17 328 0 - mul.ftz.f32 %f969, %f796, %f958; - mul.ftz.f32 %f970, %f797, %f959; - sub.ftz.f32 %f971, %f970, %f969; - .loc 17 329 0 - mul.ftz.f32 %f972, %f802, %f958; - mul.ftz.f32 %f973, %f803, %f959; - sub.ftz.f32 %f974, %f973, %f972; - .loc 17 330 0 - mul.ftz.f32 %f975, %f808, %f958; - mul.ftz.f32 %f976, %f809, %f959; - sub.ftz.f32 %f977, %f976, %f975; - .loc 17 331 0 - mul.ftz.f32 %f978, %f814, %f958; - mul.ftz.f32 %f979, %f815, %f959; - sub.ftz.f32 %f980, %f979, %f978; - .loc 17 332 0 - mul.ftz.f32 %f981, %f820, %f958; - mul.ftz.f32 %f982, %f821, %f959; - sub.ftz.f32 %f983, %f982, %f981; - .loc 17 333 0 - mul.ftz.f32 %f984, %f826, %f958; - mul.ftz.f32 %f985, %f827, %f959; - sub.ftz.f32 %f986, %f985, %f984; - .loc 17 334 0 - mul.ftz.f32 %f987, %f479, %f962; - mul.ftz.f32 %f988, %f475, %f987; - mul.ftz.f32 %f989, %f478, %f962; - mul.ftz.f32 %f990, %f476, %f989; - sub.ftz.f32 %f991, %f990, %f988; - mul.ftz.f32 %f992, %f477, %f965; - mul.ftz.f32 %f993, %f476, %f992; - sub.ftz.f32 %f994, %f991, %f993; - mul.ftz.f32 %f995, %f477, %f968; - fma.rn.ftz.f32 %f996, %f475, %f995, %f994; - mul.ftz.f32 %f997, %f474, %f965; - fma.rn.ftz.f32 %f998, %f479, %f997, %f996; - mul.ftz.f32 %f999, %f474, %f968; - mul.ftz.f32 %f1000, %f478, %f999; - sub.ftz.f32 %f1001, %f998, %f1000; - mul.ftz.f32 %f1002, %f468, %f974; - fma.rn.ftz.f32 %f1003, %f476, %f1002, %f1001; - mul.ftz.f32 %f1004, %f468, %f977; - mul.ftz.f32 %f1005, %f475, %f1004; - sub.ftz.f32 %f1006, %f1003, %f1005; - mul.ftz.f32 %f1007, %f470, %f971; - mul.ftz.f32 %f1008, %f476, %f1007; - sub.ftz.f32 %f1009, %f1006, %f1008; - mul.ftz.f32 %f1010, %f469, %f971; - fma.rn.ftz.f32 %f1011, %f475, %f1010, %f1009; - fma.rn.ftz.f32 %f1012, %f977, %f480, %f1011; - mul.ftz.f32 %f1013, %f974, %f481; - sub.ftz.f32 %f1014, %f1012, %f1013; - fma.rn.ftz.f32 %f1015, %f986, %f484, %f1014; - mul.ftz.f32 %f1016, %f983, %f485; - sub.ftz.f32 %f1017, %f1015, %f1016; - mul.ftz.f32 %f1018, %f986, %f482; - sub.ftz.f32 %f1019, %f1017, %f1018; - fma.rn.ftz.f32 %f1020, %f983, %f483, %f1019; - mul.ftz.f32 %f1021, %f470, %f980; - fma.rn.ftz.f32 %f1022, %f479, %f1021, %f1020; - mul.ftz.f32 %f1023, %f469, %f980; - mul.ftz.f32 %f1024, %f478, %f1023; - sub.ftz.f32 %f1025, %f1022, %f1024; - .loc 17 335 0 - add.ftz.f32 %f1026, %f958, %f959; - mul.ftz.f32 %f1027, %f871, %f1026; - .loc 17 336 0 - mul.ftz.f32 %f1028, %f1025, %f875; - sub.ftz.f32 %f1029, %f1027, %f1028; - .loc 17 337 0 - mul.ftz.f32 %f1030, %f676, %f958; - fma.rn.ftz.f32 %f1031, %f959, %f678, %f1030; - sub.ftz.f32 %f1032, %f1029, %f1031; - .loc 17 340 0 - mul.ftz.f32 %f1033, %f889, %f931; - mul.ftz.f32 %f1034, %f893, %f931; - fma.rn.ftz.f32 %f1035, %f890, %f929, %f1033; - fma.rn.ftz.f32 %f1036, %f680, %f929, %f1034; - fma.rn.ftz.f32 %f1037, %f891, %f934, %f1035; - fma.rn.ftz.f32 %f1038, %f885, %f934, %f1036; - add.ftz.f32 %f1039, %f1037, %f284; - mul.ftz.f32 %f1040, %f622, %f1038; - fma.rn.ftz.f32 %f1041, %f594, %f1032, %f1040; - mul.ftz.f32 %f1042, %f1039, %f708; - mul.ftz.f32 %f1043, %f898, %f1041; - sub.ftz.f32 %f1044, %f1043, %f1042; - .loc 17 341 0 - mul.ftz.f32 %f1045, %f1039, %f732; - mul.ftz.f32 %f1046, %f909, %f1041; - sub.ftz.f32 %f1047, %f1046, %f1045; - .loc 17 348 0 - mul.ftz.f32 %f1048, %f657, %f1047; - fma.rn.ftz.f32 %f914, %f1044, %f654, %f1048; - add.ftz.f32 %f269, %f914, %f269; - @!%p3 bra $Lt_0_59906; - .loc 17 350 0 - mov.f32 %f1049, %f21; - mul.ftz.f32 %f1050, %f276, %f914; - sub.ftz.f32 %f1051, %f1049, %f1050; - mov.f32 %f21, %f1051; - .loc 17 351 0 - mov.f32 %f1052, %f25; - mul.ftz.f32 %f1053, %f277, %f914; - sub.ftz.f32 %f1054, %f1052, %f1053; - mov.f32 %f25, %f1054; -$Lt_0_59906: - .loc 17 314 0 - mov.f32 %f1055, %f740; - .loc 17 315 0 - mov.f32 %f1056, %f923; - .loc 17 316 0 - mul.ftz.f32 %f1057, %f358, %f358; - neg.ftz.f32 %f1058, %f1057; - mov.f32 %f1059, %f1058; - .loc 17 317 0 - mov.f32 %f1060, 0f3f800000; // 1 - sub.ftz.f32 %f1061, %f1060, %f1057; - mov.f32 %f1062, %f1061; - .loc 17 318 0 - mov.f32 %f1063, %f1055; - mul.ftz.f32 %f1064, %f1063, %f282; - mov.f32 %f1065, %f1064; - .loc 17 319 0 - mov.f32 %f1066, %f1056; - mul.ftz.f32 %f1067, %f1066, %f282; - mov.f32 %f1068, %f1067; - .loc 17 320 0 - mul.ftz.f32 %f1069, %f282, %f1061; - mov.f32 %f1070, %f1069; - .loc 17 325 0 - mul.ftz.f32 %f1071, %f71, %f1067; - mul.ftz.f32 %f1072, %f79, %f1067; - mul.ftz.f32 %f1073, %f323, %f1067; - mul.ftz.f32 %f1074, %f325, %f1067; - fma.rn.ftz.f32 %f1075, %f87, %f1064, %f1071; - mul.ftz.f32 %f1076, %f89, %f1067; - fma.rn.ftz.f32 %f1077, %f1064, %f76, %f1072; - fma.rn.ftz.f32 %f1078, %f312, %f1064, %f1073; - mul.ftz.f32 %f1079, %f326, %f1067; - fma.rn.ftz.f32 %f1080, %f1064, %f310, %f1074; - fma.rn.ftz.f32 %f1081, %f1064, %f70, %f1076; - fma.rn.ftz.f32 %f1082, %f1064, %f308, %f1079; - fma.rn.ftz.f32 %f1083, %f1069, %f334, %f1082; - fma.rn.ftz.f32 %f1084, %f333, %f1069, %f1078; - fma.rn.ftz.f32 %f1085, %f1069, %f335, %f1080; - fma.rn.ftz.f32 %f1086, %f1069, %f80, %f1081; - fma.rn.ftz.f32 %f1087, %f77, %f1069, %f1075; - fma.rn.ftz.f32 %f1088, %f1069, %f91, %f1077; - mul.ftz.f32 %f1089, %f1083, %f670; - mul.ftz.f32 %f1090, %f1086, %f663; - fma.rn.ftz.f32 %f1091, %f668, %f1084, %f1089; - fma.rn.ftz.f32 %f1092, %f661, %f1087, %f1090; - fma.rn.ftz.f32 %f1093, %f672, %f1085, %f1091; - fma.rn.ftz.f32 %f1094, %f665, %f1088, %f1092; - mul.ftz.f32 %f1095, %f760, %f1093; - mul.ftz.f32 %f1096, %f761, %f1094; - sub.ftz.f32 %f1097, %f1096, %f1095; - .loc 17 326 0 - mul.ftz.f32 %f1098, %f784, %f1093; - mul.ftz.f32 %f1099, %f785, %f1094; - sub.ftz.f32 %f1100, %f1099, %f1098; - .loc 17 327 0 - mul.ftz.f32 %f1101, %f790, %f1093; - mul.ftz.f32 %f1102, %f791, %f1094; - sub.ftz.f32 %f1103, %f1102, %f1101; - .loc 17 328 0 - mul.ftz.f32 %f1104, %f796, %f1093; - mul.ftz.f32 %f1105, %f797, %f1094; - sub.ftz.f32 %f1106, %f1105, %f1104; - .loc 17 329 0 - mul.ftz.f32 %f1107, %f802, %f1093; - mul.ftz.f32 %f1108, %f803, %f1094; - sub.ftz.f32 %f1109, %f1108, %f1107; - .loc 17 330 0 - mul.ftz.f32 %f1110, %f808, %f1093; - mul.ftz.f32 %f1111, %f809, %f1094; - sub.ftz.f32 %f1112, %f1111, %f1110; - .loc 17 331 0 - mul.ftz.f32 %f1113, %f814, %f1093; - mul.ftz.f32 %f1114, %f815, %f1094; - sub.ftz.f32 %f1115, %f1114, %f1113; - .loc 17 332 0 - mul.ftz.f32 %f1116, %f820, %f1093; - mul.ftz.f32 %f1117, %f821, %f1094; - sub.ftz.f32 %f1118, %f1117, %f1116; - .loc 17 333 0 - mul.ftz.f32 %f1119, %f826, %f1093; - mul.ftz.f32 %f1120, %f827, %f1094; - sub.ftz.f32 %f1121, %f1120, %f1119; - .loc 17 334 0 - mul.ftz.f32 %f1122, %f479, %f1097; - mul.ftz.f32 %f1123, %f475, %f1122; - mul.ftz.f32 %f1124, %f478, %f1097; - mul.ftz.f32 %f1125, %f476, %f1124; - sub.ftz.f32 %f1126, %f1125, %f1123; - mul.ftz.f32 %f1127, %f477, %f1100; - mul.ftz.f32 %f1128, %f476, %f1127; - sub.ftz.f32 %f1129, %f1126, %f1128; - mul.ftz.f32 %f1130, %f477, %f1103; - fma.rn.ftz.f32 %f1131, %f475, %f1130, %f1129; - mul.ftz.f32 %f1132, %f474, %f1100; - fma.rn.ftz.f32 %f1133, %f479, %f1132, %f1131; - mul.ftz.f32 %f1134, %f474, %f1103; - mul.ftz.f32 %f1135, %f478, %f1134; - sub.ftz.f32 %f1136, %f1133, %f1135; - mul.ftz.f32 %f1137, %f468, %f1109; - fma.rn.ftz.f32 %f1138, %f476, %f1137, %f1136; - mul.ftz.f32 %f1139, %f468, %f1112; - mul.ftz.f32 %f1140, %f475, %f1139; - sub.ftz.f32 %f1141, %f1138, %f1140; - mul.ftz.f32 %f1142, %f470, %f1106; - mul.ftz.f32 %f1143, %f476, %f1142; - sub.ftz.f32 %f1144, %f1141, %f1143; - mul.ftz.f32 %f1145, %f469, %f1106; - fma.rn.ftz.f32 %f1146, %f475, %f1145, %f1144; - fma.rn.ftz.f32 %f1147, %f1112, %f480, %f1146; - mul.ftz.f32 %f1148, %f1109, %f481; - sub.ftz.f32 %f1149, %f1147, %f1148; - fma.rn.ftz.f32 %f1150, %f1121, %f484, %f1149; - mul.ftz.f32 %f1151, %f1118, %f485; - sub.ftz.f32 %f1152, %f1150, %f1151; - mul.ftz.f32 %f1153, %f1121, %f482; - sub.ftz.f32 %f1154, %f1152, %f1153; - fma.rn.ftz.f32 %f1155, %f1118, %f483, %f1154; - mul.ftz.f32 %f1156, %f470, %f1115; - fma.rn.ftz.f32 %f1157, %f479, %f1156, %f1155; - mul.ftz.f32 %f1158, %f469, %f1115; - mul.ftz.f32 %f1159, %f478, %f1158; - sub.ftz.f32 %f1160, %f1157, %f1159; - .loc 17 335 0 - add.ftz.f32 %f1161, %f1093, %f1094; - mul.ftz.f32 %f1162, %f871, %f1161; - .loc 17 336 0 - mul.ftz.f32 %f1163, %f1160, %f875; - sub.ftz.f32 %f1164, %f1162, %f1163; - .loc 17 337 0 - mul.ftz.f32 %f1165, %f676, %f1093; - fma.rn.ftz.f32 %f1166, %f1094, %f678, %f1165; - sub.ftz.f32 %f1167, %f1164, %f1166; - .loc 17 340 0 - mul.ftz.f32 %f1168, %f889, %f1067; - mul.ftz.f32 %f1169, %f893, %f1067; - fma.rn.ftz.f32 %f1170, %f890, %f1064, %f1168; - fma.rn.ftz.f32 %f1171, %f680, %f1064, %f1169; - fma.rn.ftz.f32 %f1172, %f891, %f1069, %f1170; - fma.rn.ftz.f32 %f1173, %f885, %f1069, %f1171; - add.ftz.f32 %f1174, %f1172, %f358; - mul.ftz.f32 %f1175, %f622, %f1173; - fma.rn.ftz.f32 %f1176, %f594, %f1167, %f1175; - mul.ftz.f32 %f1177, %f1174, %f708; - mul.ftz.f32 %f1178, %f898, %f1176; - sub.ftz.f32 %f1179, %f1178, %f1177; - .loc 17 341 0 - mul.ftz.f32 %f1180, %f1174, %f732; - mul.ftz.f32 %f1181, %f909, %f1176; - sub.ftz.f32 %f1182, %f1181, %f1180; - .loc 17 354 0 - mul.ftz.f32 %f1183, %f657, %f1182; - fma.rn.ftz.f32 %f914, %f1179, %f654, %f1183; - add.ftz.f32 %f268, %f914, %f268; - @!%p3 bra $Lt_0_62978; - .loc 17 356 0 - mov.f32 %f1184, %f23; - mul.ftz.f32 %f1185, %f278, %f914; - sub.ftz.f32 %f1186, %f1184, %f1185; - mov.f32 %f23, %f1186; - .loc 17 357 0 - mov.f32 %f1187, %f27; - mul.ftz.f32 %f1188, %f277, %f914; - sub.ftz.f32 %f1189, %f1187, %f1188; - mov.f32 %f27, %f1189; - .loc 17 358 0 - mul.ftz.f32 %f1190, %f276, %f914; - sub.ftz.f32 %f28, %f28, %f1190; - mov.f32 %f29, %f28; -$Lt_0_62978: - .loc 17 381 0 - mul.ftz.f32 %f1191, %f80, %f284; - mul.ftz.f32 %f1192, %f78, %f284; - mul.ftz.f32 %f1193, %f91, %f284; - neg.ftz.f32 %f1194, %f1191; - mov.f32 %f1195, 0f00000000; // 0 - fma.rn.ftz.f32 %f1196, %f1195, %f283, %f1192; - neg.ftz.f32 %f1197, %f1193; - mov.f32 %f1198, 0f00000000; // 0 - fma.rn.ftz.f32 %f1199, %f283, %f1198, %f1194; - fma.rn.ftz.f32 %f1200, %f71, %f358, %f1196; - mov.f32 %f1201, 0f00000000; // 0 - fma.rn.ftz.f32 %f1202, %f283, %f1201, %f1197; - fma.rn.ftz.f32 %f1203, %f358, %f89, %f1199; - fma.rn.ftz.f32 %f1204, %f358, %f79, %f1202; - mul.ftz.f32 %f1205, %f1203, %f662; - neg.ftz.f32 %f1206, %f1205; - fma.rn.ftz.f32 %f1207, %f661, %f1200, %f1206; - fma.rn.ftz.f32 %f1208, %f665, %f1204, %f1207; - mul.ftz.f32 %f1209, %f759, %f1208; - mul.ftz.f32 %f1210, %f783, %f1208; - mul.ftz.f32 %f1211, %f789, %f1208; - mul.ftz.f32 %f1212, %f801, %f1208; - mul.ftz.f32 %f1213, %f807, %f1208; - mul.ftz.f32 %f1214, %f795, %f1208; - mul.ftz.f32 %f1215, %f825, %f1208; - mul.ftz.f32 %f1216, %f819, %f1208; - mul.ftz.f32 %f1217, %f813, %f1208; - neg.ftz.f32 %f1218, %f1209; - neg.ftz.f32 %f1219, %f1210; - neg.ftz.f32 %f1220, %f1211; - neg.ftz.f32 %f1221, %f1212; - neg.ftz.f32 %f1222, %f1213; - neg.ftz.f32 %f1223, %f1214; - neg.ftz.f32 %f1224, %f1215; - neg.ftz.f32 %f1225, %f1216; - neg.ftz.f32 %f1226, %f1217; - fma.rn.ftz.f32 %f1227, %f225, %f445, %f1218; - fma.rn.ftz.f32 %f1228, %f251, %f445, %f1219; - fma.rn.ftz.f32 %f1229, %f227, %f445, %f1220; - fma.rn.ftz.f32 %f1230, %f264, %f445, %f1221; - fma.rn.ftz.f32 %f1231, %f262, %f445, %f1222; - fma.rn.ftz.f32 %f1232, %f252, %f445, %f1223; - fma.rn.ftz.f32 %f1233, %f246, %f445, %f1224; - fma.rn.ftz.f32 %f1234, %f263, %f445, %f1225; - fma.rn.ftz.f32 %f1235, %f227, %f445, %f1226; - mul.ftz.f32 %f1236, %f479, %f1227; - mul.ftz.f32 %f1237, %f475, %f1236; - mul.ftz.f32 %f1238, %f478, %f1227; - mul.ftz.f32 %f1239, %f476, %f1238; - sub.ftz.f32 %f1240, %f1239, %f1237; - mul.ftz.f32 %f1241, %f477, %f1228; - mul.ftz.f32 %f1242, %f476, %f1241; - sub.ftz.f32 %f1243, %f1240, %f1242; - mul.ftz.f32 %f1244, %f477, %f1229; - fma.rn.ftz.f32 %f1245, %f475, %f1244, %f1243; - mul.ftz.f32 %f1246, %f474, %f1228; - fma.rn.ftz.f32 %f1247, %f479, %f1246, %f1245; - mul.ftz.f32 %f1248, %f474, %f1229; - mul.ftz.f32 %f1249, %f478, %f1248; - sub.ftz.f32 %f1250, %f1247, %f1249; - mul.ftz.f32 %f1251, %f468, %f1230; - fma.rn.ftz.f32 %f1252, %f476, %f1251, %f1250; - mul.ftz.f32 %f1253, %f468, %f1231; - mul.ftz.f32 %f1254, %f475, %f1253; - sub.ftz.f32 %f1255, %f1252, %f1254; - mul.ftz.f32 %f1256, %f470, %f1232; - mul.ftz.f32 %f1257, %f476, %f1256; - sub.ftz.f32 %f1258, %f1255, %f1257; - mul.ftz.f32 %f1259, %f469, %f1232; - fma.rn.ftz.f32 %f1260, %f475, %f1259, %f1258; - fma.rn.ftz.f32 %f1261, %f1231, %f480, %f1260; - mul.ftz.f32 %f1262, %f1230, %f481; - sub.ftz.f32 %f1263, %f1261, %f1262; - fma.rn.ftz.f32 %f1264, %f1233, %f484, %f1263; - mul.ftz.f32 %f1265, %f1234, %f485; - sub.ftz.f32 %f1266, %f1264, %f1265; - mul.ftz.f32 %f1267, %f1233, %f482; - sub.ftz.f32 %f1268, %f1266, %f1267; - fma.rn.ftz.f32 %f1269, %f1234, %f483, %f1268; - mul.ftz.f32 %f1270, %f470, %f1235; - fma.rn.ftz.f32 %f1271, %f479, %f1270, %f1269; - mul.ftz.f32 %f1272, %f469, %f1235; - mul.ftz.f32 %f1273, %f478, %f1272; - sub.ftz.f32 %f1274, %f1271, %f1273; - .loc 17 392 0 - mul.ftz.f32 %f1275, %f80, %f586; - mul.ftz.f32 %f1276, %f78, %f586; - mul.ftz.f32 %f1277, %f91, %f586; - mul.ftz.f32 %f1278, %f117, %f893; - mul.ftz.f32 %f1279, %f85, %f893; - mul.ftz.f32 %f1280, %f98, %f893; - neg.ftz.f32 %f1281, %f1275; - neg.ftz.f32 %f1282, %f1277; - mul.ftz.f32 %f1283, %f875, %f1274; - mul.ftz.f32 %f1284, %f889, %f224; - mul.ftz.f32 %f1285, %f889, %f248; - mov.f32 %f1286, 0f00000000; // 0 - fma.rn.ftz.f32 %f1287, %f591, %f1286, %f1281; - mov.f32 %f1288, 0f00000000; // 0 - fma.rn.ftz.f32 %f1289, %f1288, %f591, %f1276; - mov.f32 %f1290, 0f00000000; // 0 - fma.rn.ftz.f32 %f1291, %f591, %f1290, %f1282; - fma.rn.ftz.f32 %f1292, %f188, %f890, %f1284; - mul.ftz.f32 %f1293, %f889, %f257; - fma.rn.ftz.f32 %f1294, %f890, %f169, %f1285; - fma.rn.ftz.f32 %f1295, %f680, %f83, %f1278; - fma.rn.ftz.f32 %f1296, %f113, %f680, %f1279; - fma.rn.ftz.f32 %f1297, %f680, %f93, %f1280; - fma.rn.ftz.f32 %f1298, %f583, %f89, %f1287; - fma.rn.ftz.f32 %f1299, %f71, %f583, %f1289; - fma.rn.ftz.f32 %f1300, %f583, %f79, %f1291; - fma.rn.ftz.f32 %f1301, %f196, %f891, %f1292; - fma.rn.ftz.f32 %f1302, %f890, %f140, %f1293; - fma.rn.ftz.f32 %f1303, %f891, %f214, %f1294; - fma.rn.ftz.f32 %f1304, %f885, %f100, %f1295; - fma.rn.ftz.f32 %f1305, %f885, %f96, %f1296; - fma.rn.ftz.f32 %f1306, %f885, %f120, %f1297; - fma.rn.ftz.f32 %f1307, %f891, %f230, %f1302; - mul.ftz.f32 %f1308, %f1298, %f1304; - mul.ftz.f32 %f1309, %f1307, %f419; - fma.rn.ftz.f32 %f1310, %f1305, %f1299, %f1308; - fma.rn.ftz.f32 %f1311, %f423, %f1301, %f1309; - fma.rn.ftz.f32 %f1312, %f1306, %f1300, %f1310; - fma.rn.ftz.f32 %f1313, %f416, %f1303, %f1311; - mul.ftz.f32 %f1314, %f622, %f1312; - mul.ftz.f32 %f1315, %f1208, %f871; - sub.ftz.f32 %f1316, %f1315, %f1283; - neg.ftz.f32 %f1317, %f1314; - mul.ftz.f32 %f1318, %f1208, %f678; - sub.ftz.f32 %f1319, %f1316, %f1318; - fma.rn.ftz.f32 %f1320, %f594, %f1319, %f1317; - mul.ftz.f32 %f1321, %f1313, %f732; - fma.rn.ftz.f32 %f1322, %f909, %f1320, %f1321; - mul.ftz.f32 %f1323, %f657, %f1322; - mul.ftz.f32 %f1324, %f1313, %f708; - fma.rn.ftz.f32 %f1325, %f898, %f1320, %f1324; - fma.rn.ftz.f32 %f1326, %f1325, %f654, %f1323; - sub.ftz.f32 %f267, %f267, %f1326; - .loc 17 407 0 - mov.f32 %f1327, 0f00000000; // 0 - fma.rn.ftz.f32 %f1328, %f283, %f80, %f1327; - mov.f32 %f1329, 0f00000000; // 0 - fma.rn.ftz.f32 %f1330, %f77, %f283, %f1329; - mov.f32 %f1331, 0f00000000; // 0 - fma.rn.ftz.f32 %f1332, %f283, %f91, %f1331; - fma.rn.ftz.f32 %f1333, %f358, %f72, %f1328; - fma.rn.ftz.f32 %f1334, %f88, %f358, %f1330; - fma.rn.ftz.f32 %f1335, %f358, %f94, %f1332; - mul.ftz.f32 %f1336, %f1333, %f662; - neg.ftz.f32 %f1337, %f1336; - fma.rn.ftz.f32 %f1338, %f661, %f1334, %f1337; - fma.rn.ftz.f32 %f1339, %f665, %f1335, %f1338; - mul.ftz.f32 %f1340, %f759, %f1339; - mul.ftz.f32 %f1341, %f783, %f1339; - mul.ftz.f32 %f1342, %f789, %f1339; - mul.ftz.f32 %f1343, %f801, %f1339; - mul.ftz.f32 %f1344, %f807, %f1339; - mul.ftz.f32 %f1345, %f795, %f1339; - mul.ftz.f32 %f1346, %f825, %f1339; - mul.ftz.f32 %f1347, %f819, %f1339; - mul.ftz.f32 %f1348, %f813, %f1339; - neg.ftz.f32 %f1349, %f1340; - neg.ftz.f32 %f1350, %f1341; - neg.ftz.f32 %f1351, %f1342; - neg.ftz.f32 %f1352, %f1343; - neg.ftz.f32 %f1353, %f1344; - neg.ftz.f32 %f1354, %f1345; - neg.ftz.f32 %f1355, %f1346; - neg.ftz.f32 %f1356, %f1347; - neg.ftz.f32 %f1357, %f1348; - fma.rn.ftz.f32 %f1358, %f226, %f445, %f1349; - fma.rn.ftz.f32 %f1359, %f255, %f445, %f1350; - fma.rn.ftz.f32 %f1360, %f247, %f445, %f1351; - fma.rn.ftz.f32 %f1361, %f184, %f445, %f1352; - fma.rn.ftz.f32 %f1362, %f258, %f445, %f1353; - fma.rn.ftz.f32 %f1363, %f254, %f445, %f1354; - fma.rn.ftz.f32 %f1364, %f249, %f445, %f1355; - fma.rn.ftz.f32 %f1365, %f259, %f445, %f1356; - fma.rn.ftz.f32 %f1366, %f247, %f445, %f1357; - mul.ftz.f32 %f1367, %f479, %f1358; - mul.ftz.f32 %f1368, %f475, %f1367; - mul.ftz.f32 %f1369, %f478, %f1358; - mul.ftz.f32 %f1370, %f476, %f1369; - sub.ftz.f32 %f1371, %f1370, %f1368; - mul.ftz.f32 %f1372, %f477, %f1359; - mul.ftz.f32 %f1373, %f476, %f1372; - sub.ftz.f32 %f1374, %f1371, %f1373; - mul.ftz.f32 %f1375, %f477, %f1360; - fma.rn.ftz.f32 %f1376, %f475, %f1375, %f1374; - mul.ftz.f32 %f1377, %f474, %f1359; - fma.rn.ftz.f32 %f1378, %f479, %f1377, %f1376; - mul.ftz.f32 %f1379, %f474, %f1360; - mul.ftz.f32 %f1380, %f478, %f1379; - sub.ftz.f32 %f1381, %f1378, %f1380; - mul.ftz.f32 %f1382, %f468, %f1361; - fma.rn.ftz.f32 %f1383, %f476, %f1382, %f1381; - mul.ftz.f32 %f1384, %f468, %f1362; - mul.ftz.f32 %f1385, %f475, %f1384; - sub.ftz.f32 %f1386, %f1383, %f1385; - mul.ftz.f32 %f1387, %f470, %f1363; - mul.ftz.f32 %f1388, %f476, %f1387; - sub.ftz.f32 %f1389, %f1386, %f1388; - mul.ftz.f32 %f1390, %f469, %f1363; - fma.rn.ftz.f32 %f1391, %f475, %f1390, %f1389; - fma.rn.ftz.f32 %f1392, %f1362, %f480, %f1391; - mul.ftz.f32 %f1393, %f1361, %f481; - sub.ftz.f32 %f1394, %f1392, %f1393; - fma.rn.ftz.f32 %f1395, %f1364, %f484, %f1394; - mul.ftz.f32 %f1396, %f1365, %f485; - sub.ftz.f32 %f1397, %f1395, %f1396; - mul.ftz.f32 %f1398, %f1364, %f482; - sub.ftz.f32 %f1399, %f1397, %f1398; - fma.rn.ftz.f32 %f1400, %f1365, %f483, %f1399; - mul.ftz.f32 %f1401, %f470, %f1366; - fma.rn.ftz.f32 %f1402, %f479, %f1401, %f1400; - mul.ftz.f32 %f1403, %f469, %f1366; - mul.ftz.f32 %f1404, %f478, %f1403; - sub.ftz.f32 %f1405, %f1402, %f1404; - .loc 17 418 0 - mul.ftz.f32 %f1406, %f889, %f140; - mul.ftz.f32 %f1407, %f889, %f169; - mul.ftz.f32 %f1408, %f875, %f1405; - mul.ftz.f32 %f1409, %f889, %f190; - fma.rn.ftz.f32 %f1410, %f890, %f216, %f1407; - fma.rn.ftz.f32 %f1411, %f191, %f890, %f1409; - fma.rn.ftz.f32 %f1412, %f890, %f232, %f1406; - fma.rn.ftz.f32 %f1413, %f891, %f219, %f1410; - mov.f32 %f1414, 0f00000000; // 0 - fma.rn.ftz.f32 %f1415, %f591, %f80, %f1414; - mov.f32 %f1416, 0f00000000; // 0 - fma.rn.ftz.f32 %f1417, %f77, %f591, %f1416; - mov.f32 %f1418, 0f00000000; // 0 - fma.rn.ftz.f32 %f1419, %f591, %f91, %f1418; - fma.rn.ftz.f32 %f1420, %f192, %f891, %f1411; - fma.rn.ftz.f32 %f1421, %f891, %f236, %f1412; - fma.rn.ftz.f32 %f1422, %f583, %f72, %f1415; - fma.rn.ftz.f32 %f1423, %f88, %f583, %f1417; - fma.rn.ftz.f32 %f1424, %f583, %f94, %f1419; - mul.ftz.f32 %f1425, %f1421, %f419; - fma.rn.ftz.f32 %f1426, %f423, %f1420, %f1425; - mul.ftz.f32 %f1427, %f1422, %f1304; - fma.rn.ftz.f32 %f1428, %f416, %f1413, %f1426; - fma.rn.ftz.f32 %f1429, %f1305, %f1423, %f1427; - fma.rn.ftz.f32 %f1430, %f1306, %f1424, %f1429; - mul.ftz.f32 %f1431, %f622, %f1430; - mul.ftz.f32 %f1432, %f1339, %f871; - sub.ftz.f32 %f1433, %f1432, %f1408; - neg.ftz.f32 %f1434, %f1431; - mul.ftz.f32 %f1435, %f1339, %f678; - sub.ftz.f32 %f1436, %f1433, %f1435; - fma.rn.ftz.f32 %f1437, %f594, %f1436, %f1434; - mul.ftz.f32 %f1438, %f1428, %f732; - fma.rn.ftz.f32 %f1439, %f909, %f1437, %f1438; - mul.ftz.f32 %f1440, %f657, %f1439; - mul.ftz.f32 %f1441, %f1428, %f708; - fma.rn.ftz.f32 %f1442, %f898, %f1437, %f1441; - fma.rn.ftz.f32 %f1443, %f1442, %f654, %f1440; - sub.ftz.f32 %f266, %f266, %f1443; - .loc 17 433 0 - mul.ftz.f32 %f1444, %f70, %f284; - mul.ftz.f32 %f1445, %f87, %f284; - mul.ftz.f32 %f1446, %f76, %f284; - fma.rn.ftz.f32 %f1447, %f283, %f90, %f1444; - fma.rn.ftz.f32 %f1448, %f86, %f283, %f1445; - fma.rn.ftz.f32 %f1449, %f283, %f81, %f1446; - mov.f32 %f1450, 0f00000000; // 0 - fma.rn.ftz.f32 %f1451, %f358, %f1450, %f1447; - mov.f32 %f1452, 0f00000000; // 0 - fma.rn.ftz.f32 %f1453, %f1452, %f358, %f1448; - mov.f32 %f1454, 0f00000000; // 0 - fma.rn.ftz.f32 %f1455, %f358, %f1454, %f1449; - mul.ftz.f32 %f1456, %f1451, %f662; - neg.ftz.f32 %f1457, %f1456; - fma.rn.ftz.f32 %f1458, %f661, %f1453, %f1457; - fma.rn.ftz.f32 %f1459, %f665, %f1455, %f1458; - mul.ftz.f32 %f1460, %f759, %f1459; - mul.ftz.f32 %f1461, %f783, %f1459; - mul.ftz.f32 %f1462, %f789, %f1459; - mul.ftz.f32 %f1463, %f801, %f1459; - mul.ftz.f32 %f1464, %f807, %f1459; - mul.ftz.f32 %f1465, %f795, %f1459; - mul.ftz.f32 %f1466, %f825, %f1459; - mul.ftz.f32 %f1467, %f819, %f1459; - mul.ftz.f32 %f1468, %f813, %f1459; - neg.ftz.f32 %f1469, %f1460; - neg.ftz.f32 %f1470, %f1461; - neg.ftz.f32 %f1471, %f1462; - neg.ftz.f32 %f1472, %f1463; - neg.ftz.f32 %f1473, %f1464; - neg.ftz.f32 %f1474, %f1465; - neg.ftz.f32 %f1475, %f1466; - neg.ftz.f32 %f1476, %f1467; - neg.ftz.f32 %f1477, %f1468; - fma.rn.ftz.f32 %f1478, %f243, %f445, %f1469; - fma.rn.ftz.f32 %f1479, %f260, %f445, %f1470; - fma.rn.ftz.f32 %f1480, %f250, %f445, %f1471; - fma.rn.ftz.f32 %f1481, %f253, %f445, %f1472; - fma.rn.ftz.f32 %f1482, %f244, %f445, %f1473; - fma.rn.ftz.f32 %f1483, %f261, %f445, %f1474; - fma.rn.ftz.f32 %f1484, %f212, %f445, %f1475; - fma.rn.ftz.f32 %f1485, %f245, %f445, %f1476; - fma.rn.ftz.f32 %f1486, %f250, %f445, %f1477; - mul.ftz.f32 %f1487, %f479, %f1478; - mul.ftz.f32 %f1488, %f475, %f1487; - mul.ftz.f32 %f1489, %f478, %f1478; - mul.ftz.f32 %f1490, %f476, %f1489; - sub.ftz.f32 %f1491, %f1490, %f1488; - mul.ftz.f32 %f1492, %f477, %f1479; - mul.ftz.f32 %f1493, %f476, %f1492; - sub.ftz.f32 %f1494, %f1491, %f1493; - mul.ftz.f32 %f1495, %f477, %f1480; - fma.rn.ftz.f32 %f1496, %f475, %f1495, %f1494; - mul.ftz.f32 %f1497, %f474, %f1479; - fma.rn.ftz.f32 %f1498, %f479, %f1497, %f1496; - mul.ftz.f32 %f1499, %f474, %f1480; - mul.ftz.f32 %f1500, %f478, %f1499; - sub.ftz.f32 %f1501, %f1498, %f1500; - mul.ftz.f32 %f1502, %f468, %f1481; - fma.rn.ftz.f32 %f1503, %f476, %f1502, %f1501; - mul.ftz.f32 %f1504, %f468, %f1482; - mul.ftz.f32 %f1505, %f475, %f1504; - sub.ftz.f32 %f1506, %f1503, %f1505; - mul.ftz.f32 %f1507, %f470, %f1483; - mul.ftz.f32 %f1508, %f476, %f1507; - sub.ftz.f32 %f1509, %f1506, %f1508; - mul.ftz.f32 %f1510, %f469, %f1483; - fma.rn.ftz.f32 %f1511, %f475, %f1510, %f1509; - fma.rn.ftz.f32 %f1512, %f1482, %f480, %f1511; - mul.ftz.f32 %f1513, %f1481, %f481; - sub.ftz.f32 %f1514, %f1512, %f1513; - fma.rn.ftz.f32 %f1515, %f1484, %f484, %f1514; - mul.ftz.f32 %f1516, %f1485, %f485; - sub.ftz.f32 %f1517, %f1515, %f1516; - mul.ftz.f32 %f1518, %f1484, %f482; - sub.ftz.f32 %f1519, %f1517, %f1518; - fma.rn.ftz.f32 %f1520, %f1485, %f483, %f1519; - mul.ftz.f32 %f1521, %f470, %f1486; - fma.rn.ftz.f32 %f1522, %f479, %f1521, %f1520; - mul.ftz.f32 %f1523, %f469, %f1486; - mul.ftz.f32 %f1524, %f478, %f1523; - sub.ftz.f32 %f1525, %f1522, %f1524; - .loc 17 444 0 - mul.ftz.f32 %f1526, %f70, %f586; - mul.ftz.f32 %f1527, %f87, %f586; - mul.ftz.f32 %f1528, %f76, %f586; - mul.ftz.f32 %f1529, %f875, %f1525; - mul.ftz.f32 %f1530, %f889, %f193; - mul.ftz.f32 %f1531, %f889, %f213; - fma.rn.ftz.f32 %f1532, %f211, %f890, %f1530; - mul.ftz.f32 %f1533, %f889, %f228; - fma.rn.ftz.f32 %f1534, %f890, %f220, %f1531; - fma.rn.ftz.f32 %f1535, %f591, %f90, %f1526; - fma.rn.ftz.f32 %f1536, %f86, %f591, %f1527; - fma.rn.ftz.f32 %f1537, %f591, %f81, %f1528; - fma.rn.ftz.f32 %f1538, %f190, %f891, %f1532; - fma.rn.ftz.f32 %f1539, %f890, %f239, %f1533; - fma.rn.ftz.f32 %f1540, %f891, %f169, %f1534; - mov.f32 %f1541, 0f00000000; // 0 - fma.rn.ftz.f32 %f1542, %f583, %f1541, %f1535; - mov.f32 %f1543, 0f00000000; // 0 - fma.rn.ftz.f32 %f1544, %f1543, %f583, %f1536; - mov.f32 %f1545, 0f00000000; // 0 - fma.rn.ftz.f32 %f1546, %f583, %f1545, %f1537; - fma.rn.ftz.f32 %f1547, %f891, %f140, %f1539; - mul.ftz.f32 %f1548, %f1547, %f419; - mul.ftz.f32 %f1549, %f1542, %f1304; - fma.rn.ftz.f32 %f1550, %f423, %f1538, %f1548; - fma.rn.ftz.f32 %f1551, %f1305, %f1544, %f1549; - fma.rn.ftz.f32 %f1552, %f416, %f1540, %f1550; - fma.rn.ftz.f32 %f1553, %f1306, %f1546, %f1551; - mul.ftz.f32 %f1554, %f622, %f1553; - mul.ftz.f32 %f1555, %f1459, %f871; - sub.ftz.f32 %f1556, %f1555, %f1529; - neg.ftz.f32 %f1557, %f1554; - mul.ftz.f32 %f1558, %f1459, %f678; - sub.ftz.f32 %f1559, %f1556, %f1558; - fma.rn.ftz.f32 %f1560, %f594, %f1559, %f1557; - mul.ftz.f32 %f1561, %f1552, %f732; - fma.rn.ftz.f32 %f1562, %f909, %f1560, %f1561; - mul.ftz.f32 %f1563, %f657, %f1562; - mul.ftz.f32 %f1564, %f1552, %f708; - fma.rn.ftz.f32 %f1565, %f898, %f1560, %f1564; - fma.rn.ftz.f32 %f1566, %f1565, %f654, %f1563; - sub.ftz.f32 %f265, %f265, %f1566; - mul.lo.s32 %r33, %r14, %r1; - cvt.s64.s32 %rd49, %r33; - mul.wide.s32 %rd50, %r33, 4; - add.u64 %rd25, %rd25, %rd50; - setp.gt.u64 %p22, %rd28, %rd25; - @%p22 bra $Lt_0_46338; - bra.uni $Lt_0_45826; -$Lt_0_69634: - mov.f32 %f265, 0f00000000; // 0 - mov.f32 %f266, 0f00000000; // 0 - mov.f32 %f267, 0f00000000; // 0 - mov.f32 %f268, 0f00000000; // 0 - mov.f32 %f269, 0f00000000; // 0 - mov.f32 %f270, 0f00000000; // 0 - mov.f32 %f271, 0f00000000; // 0 -$Lt_0_45826: - mov.u32 %r34, 1; - setp.le.s32 %p23, %r1, %r34; - @%p23 bra $Lt_0_65794; - .loc 17 448 0 - mov.u64 %rd51, __cuda___cuda_local_var_33303_55_non_const_red_acc136; - cvt.s64.s32 %rd52, %r2; - mul.wide.s32 %rd53, %r2, 4; - add.u64 %rd54, %rd51, %rd53; - mov.f32 %f1567, %f270; - st.shared.f32 [%rd54+0], %f1567; - mov.f32 %f1568, %f269; - st.shared.f32 [%rd54+512], %f1568; - mov.f32 %f1569, %f268; - st.shared.f32 [%rd54+1024], %f1569; - mov.f32 %f1570, %f267; - st.shared.f32 [%rd54+1536], %f1570; - mov.f32 %f1571, %f266; - st.shared.f32 [%rd54+2048], %f1571; - mov.f32 %f1572, %f265; - st.shared.f32 [%rd54+2560], %f1572; - shr.s32 %r35, %r1, 31; - mov.s32 %r36, 1; - and.b32 %r37, %r35, %r36; - add.s32 %r38, %r37, %r1; - shr.s32 %r39, %r38, 1; - mov.s32 %r40, %r39; - mov.u32 %r41, 0; - setp.ne.u32 %p24, %r39, %r41; - @!%p24 bra $Lt_0_64258; -$Lt_0_64770: - setp.ge.u32 %p25, %r16, %r40; - @%p25 bra $Lt_0_65026; - add.u32 %r42, %r2, %r40; - cvt.u64.u32 %rd55, %r42; - mul.wide.u32 %rd56, %r42, 4; - add.u64 %rd57, %rd51, %rd56; - ld.shared.f32 %f1573, [%rd57+0]; - add.ftz.f32 %f1567, %f1573, %f1567; - st.shared.f32 [%rd54+0], %f1567; - ld.shared.f32 %f1574, [%rd57+512]; - add.ftz.f32 %f1568, %f1574, %f1568; - st.shared.f32 [%rd54+512], %f1568; - ld.shared.f32 %f1575, [%rd57+1024]; - add.ftz.f32 %f1569, %f1575, %f1569; - st.shared.f32 [%rd54+1024], %f1569; - ld.shared.f32 %f1576, [%rd57+1536]; - add.ftz.f32 %f1570, %f1576, %f1570; - st.shared.f32 [%rd54+1536], %f1570; - ld.shared.f32 %f1577, [%rd57+2048]; - add.ftz.f32 %f1571, %f1577, %f1571; - st.shared.f32 [%rd54+2048], %f1571; - ld.shared.f32 %f1578, [%rd57+2560]; - add.ftz.f32 %f1572, %f1578, %f1572; - st.shared.f32 [%rd54+2560], %f1572; -$Lt_0_65026: - shr.u32 %r40, %r40, 1; - mov.u32 %r43, 0; - setp.ne.u32 %p26, %r40, %r43; - @%p26 bra $Lt_0_64770; -$Lt_0_64258: - mov.f32 %f270, %f1567; - mov.f32 %f269, %f1568; - mov.f32 %f268, %f1569; - mov.f32 %f267, %f1570; - mov.f32 %f266, %f1571; - mov.f32 %f265, %f1572; - ld.param.s32 %r44, [__cudaparm_kernel_ellipsoid_eflag]; - mov.s32 %r45, 0; - set.gt.u32.s32 %r46, %r44, %r45; - neg.s32 %r47, %r46; - ld.param.s32 %r48, [__cudaparm_kernel_ellipsoid_vflag]; - mov.s32 %r49, 0; - set.gt.u32.s32 %r50, %r48, %r49; - neg.s32 %r51, %r50; - or.b32 %r52, %r47, %r51; - mov.u32 %r53, 0; - setp.eq.s32 %p27, %r52, %r53; - @%p27 bra $Lt_0_65794; - mov.f32 %f1567, %f19; - st.shared.f32 [%rd54+0], %f1567; - mov.f32 %f1568, %f21; - st.shared.f32 [%rd54+512], %f1568; - mov.f32 %f1569, %f23; - st.shared.f32 [%rd54+1024], %f1569; - mov.f32 %f1570, %f25; - st.shared.f32 [%rd54+1536], %f1570; - mov.f32 %f1571, %f27; - st.shared.f32 [%rd54+2048], %f1571; - mov.f32 %f1572, %f28; - st.shared.f32 [%rd54+2560], %f1572; - mov.f32 %f1579, %f271; - st.shared.f32 [%rd54+3072], %f1579; - mov.s32 %r54, %r39; - @!%p24 bra $Lt_0_66306; -$Lt_0_66818: - setp.ge.u32 %p28, %r16, %r54; - @%p28 bra $Lt_0_67074; - add.u32 %r55, %r2, %r54; - cvt.u64.u32 %rd58, %r55; - mul.wide.u32 %rd59, %r55, 4; - add.u64 %rd60, %rd51, %rd59; - ld.shared.f32 %f1580, [%rd60+0]; - add.ftz.f32 %f1567, %f1580, %f1567; - st.shared.f32 [%rd54+0], %f1567; - ld.shared.f32 %f1581, [%rd60+512]; - add.ftz.f32 %f1568, %f1581, %f1568; - st.shared.f32 [%rd54+512], %f1568; - ld.shared.f32 %f1582, [%rd60+1024]; - add.ftz.f32 %f1569, %f1582, %f1569; - st.shared.f32 [%rd54+1024], %f1569; - ld.shared.f32 %f1583, [%rd60+1536]; - add.ftz.f32 %f1570, %f1583, %f1570; - st.shared.f32 [%rd54+1536], %f1570; - ld.shared.f32 %f1584, [%rd60+2048]; - add.ftz.f32 %f1571, %f1584, %f1571; - st.shared.f32 [%rd54+2048], %f1571; - ld.shared.f32 %f1585, [%rd60+2560]; - add.ftz.f32 %f1572, %f1585, %f1572; - st.shared.f32 [%rd54+2560], %f1572; - ld.shared.f32 %f1586, [%rd60+3072]; - add.ftz.f32 %f1579, %f1586, %f1579; - st.shared.f32 [%rd54+3072], %f1579; -$Lt_0_67074: - shr.u32 %r54, %r54, 1; - mov.u32 %r56, 0; - setp.ne.u32 %p29, %r54, %r56; - @%p29 bra $Lt_0_66818; -$Lt_0_66306: - mov.f32 %f19, %f1567; - mov.f32 %f21, %f1568; - mov.f32 %f23, %f1569; - mov.f32 %f25, %f1570; - mov.f32 %f27, %f1571; - mov.f32 %f29, %f1572; - mov.f32 %f271, %f1579; -$Lt_0_65794: -$Lt_0_63746: - mov.u32 %r57, 0; - setp.ne.s32 %p30, %r16, %r57; - @%p30 bra $Lt_0_67842; - ld.param.u64 %rd61, [__cudaparm_kernel_ellipsoid_engv]; - add.u64 %rd62, %rd61, %rd3; - ld.param.s32 %r58, [__cudaparm_kernel_ellipsoid_astride]; - ld.param.s32 %r59, [__cudaparm_kernel_ellipsoid_eflag]; - mov.u32 %r60, 0; - setp.le.s32 %p31, %r59, %r60; - @%p31 bra $Lt_0_68354; - st.global.f32 [%rd62+0], %f271; - cvt.s64.s32 %rd63, %r58; - mul.wide.s32 %rd64, %r58, 4; - add.u64 %rd62, %rd62, %rd64; -$Lt_0_68354: - ld.param.s32 %r61, [__cudaparm_kernel_ellipsoid_vflag]; - mov.u32 %r62, 0; - setp.le.s32 %p32, %r61, %r62; - @%p32 bra $Lt_0_68866; - mov.f32 %f1587, %f19; - st.global.f32 [%rd62+0], %f1587; - cvt.s64.s32 %rd65, %r58; - mul.wide.s32 %rd66, %r58, 4; - add.u64 %rd67, %rd66, %rd62; - mov.f32 %f1588, %f21; - st.global.f32 [%rd67+0], %f1588; - add.u64 %rd68, %rd66, %rd67; - mov.f32 %f1589, %f23; - st.global.f32 [%rd68+0], %f1589; - add.u64 %rd69, %rd66, %rd68; - mov.f32 %f1590, %f25; - st.global.f32 [%rd69+0], %f1590; - add.u64 %rd62, %rd66, %rd69; - mov.f32 %f1591, %f27; - st.global.f32 [%rd62+0], %f1591; - mov.f32 %f1592, %f29; - add.u64 %rd70, %rd66, %rd62; - st.global.f32 [%rd70+0], %f1592; -$Lt_0_68866: - ld.param.u64 %rd71, [__cudaparm_kernel_ellipsoid_ans]; - mul.lo.u64 %rd72, %rd2, 16; - add.u64 %rd73, %rd71, %rd72; - mov.f32 %f1593, %f1594; - st.global.v4.f32 [%rd73+0], {%f270,%f269,%f268,%f1593}; - add.s32 %r63, %r8, %r58; - cvt.s64.s32 %rd74, %r63; - mul.wide.s32 %rd75, %r63, 16; - add.u64 %rd76, %rd71, %rd75; - mov.f32 %f1595, %f1596; - st.global.v4.f32 [%rd76+0], {%f267,%f266,%f265,%f1595}; -$Lt_0_67842: -$Lt_0_45314: - .loc 17 451 0 - exit; -$LDWend_kernel_ellipsoid: - } // kernel_ellipsoid - diff --git a/lib/gpu/re_squared_lj.ptx b/lib/gpu/re_squared_lj.ptx deleted file mode 100644 index 117340d909..0000000000 --- a/lib/gpu/re_squared_lj.ptx +++ /dev/null @@ -1,3549 +0,0 @@ - .version 2.3 - .target sm_20 - .address_size 64 - // compiled with /usr/local/cuda/open64/lib//be - // nvopencc 4.0 built on 2011-05-12 - - //----------------------------------------------------------- - // Compiling /tmp/tmpxft_00009bfa_00000000-9_lal_re_squared_lj.cpp3.i (/home/sjplimp/ccBI#.L3ZA3I) - //----------------------------------------------------------- - - //----------------------------------------------------------- - // Options: - //----------------------------------------------------------- - // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 - // -O3 (Optimization level) - // -g0 (Debug level) - // -m2 (Report advisories) - //----------------------------------------------------------- - - .file 1 "" - .file 2 "/tmp/tmpxft_00009bfa_00000000-8_lal_re_squared_lj.cudafe2.gpu" - .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h" - .file 4 "/usr/local/cuda/include/crt/device_runtime.h" - .file 5 "/usr/local/cuda/include/host_defines.h" - .file 6 "/usr/local/cuda/include/builtin_types.h" - .file 7 "/usr/local/cuda/include/device_types.h" - .file 8 "/usr/local/cuda/include/driver_types.h" - .file 9 "/usr/local/cuda/include/surface_types.h" - .file 10 "/usr/local/cuda/include/texture_types.h" - .file 11 "/usr/local/cuda/include/vector_types.h" - .file 12 "/usr/local/cuda/include/device_launch_parameters.h" - .file 13 "/usr/local/cuda/include/crt/storage_class.h" - .file 14 "/usr/include/bits/types.h" - .file 15 "/usr/include/time.h" - .file 16 "lal_ellipsoid_extra.h" - .file 17 "lal_re_squared_lj.cu" - .file 18 "/usr/local/cuda/include/common_functions.h" - .file 19 "/usr/local/cuda/include/math_functions.h" - .file 20 "/usr/local/cuda/include/math_constants.h" - .file 21 "/usr/local/cuda/include/device_functions.h" - .file 22 "/usr/local/cuda/include/sm_11_atomic_functions.h" - .file 23 "/usr/local/cuda/include/sm_12_atomic_functions.h" - .file 24 "/usr/local/cuda/include/sm_13_double_functions.h" - .file 25 "/usr/local/cuda/include/sm_20_atomic_functions.h" - .file 26 "/usr/local/cuda/include/sm_20_intrinsics.h" - .file 27 "/usr/local/cuda/include/surface_functions.h" - .file 28 "/usr/local/cuda/include/texture_fetch_functions.h" - .file 29 "/usr/local/cuda/include/math_functions_dbl_ptx3.h" - - - .entry kernel_ellipsoid_sphere ( - .param .u64 __cudaparm_kernel_ellipsoid_sphere_x_, - .param .u64 __cudaparm_kernel_ellipsoid_sphere_q, - .param .u64 __cudaparm_kernel_ellipsoid_sphere_shape, - .param .u64 __cudaparm_kernel_ellipsoid_sphere_well, - .param .u64 __cudaparm_kernel_ellipsoid_sphere_splj, - .param .u64 __cudaparm_kernel_ellipsoid_sphere_sig_eps, - .param .s32 __cudaparm_kernel_ellipsoid_sphere_ntypes, - .param .u64 __cudaparm_kernel_ellipsoid_sphere_dev_nbor, - .param .s32 __cudaparm_kernel_ellipsoid_sphere_stride, - .param .u64 __cudaparm_kernel_ellipsoid_sphere_ans, - .param .s32 __cudaparm_kernel_ellipsoid_sphere_astride, - .param .u64 __cudaparm_kernel_ellipsoid_sphere_engv, - .param .u64 __cudaparm_kernel_ellipsoid_sphere_err_flag, - .param .s32 __cudaparm_kernel_ellipsoid_sphere_eflag, - .param .s32 __cudaparm_kernel_ellipsoid_sphere_vflag, - .param .s32 __cudaparm_kernel_ellipsoid_sphere_inum, - .param .s32 __cudaparm_kernel_ellipsoid_sphere_t_per_atom) - { - .reg .u32 %r<66>; - .reg .u64 %rd<73>; - .reg .f32 %f<777>; - .reg .pred %p<34>; - .shared .align 16 .b8 __cuda___cuda_local_var_32886_33_non_const_sp_lj120[16]; - .shared .align 4 .b8 __cuda___cuda_local_var_33120_37_non_const_red_acc136[3584]; - .shared .f32 __cuda_local_var_32892_33_non_const_b_alpha; - .shared .f32 __cuda_local_var_32892_42_non_const_cr60; - .shared .f32 __cuda_local_var_32892_48_non_const_solv_f_a; - .shared .f32 __cuda_local_var_32892_58_non_const_solv_f_r; - // __cuda_local_var_32907_9_non_const_virial = 32 - // __cuda_local_var_33040_15_non_const_u = 56 - .loc 17 27 0 -$LDWbegin_kernel_ellipsoid_sphere: - .loc 17 32 0 - ld.param.u64 %rd1, [__cudaparm_kernel_ellipsoid_sphere_splj]; - ldu.global.f32 %f1, [%rd1+0]; - .loc 17 33 0 - ld.global.f32 %f2, [%rd1+4]; - .loc 17 34 0 - ld.global.f32 %f3, [%rd1+8]; - .loc 17 35 0 - ld.global.f32 %f4, [%rd1+12]; - st.shared.v4.f32 [__cuda___cuda_local_var_32886_33_non_const_sp_lj120+0], {%f1,%f2,%f3,%f4}; - .loc 17 38 0 - mov.f32 %f5, 0f3f4db6db; // 0.803571 - st.shared.f32 [__cuda_local_var_32892_33_non_const_b_alpha], %f5; - .loc 17 39 0 - mov.f32 %f6, 0f42700000; // 60 - lg2.approx.ftz.f32 %f7, %f6; - mov.f32 %f8, 0f3eaaaaab; // 0.333333 - mul.ftz.f32 %f9, %f7, %f8; - ex2.approx.ftz.f32 %f10, %f9; - mov.f32 %f11, 0f42700000; // 60 - mul.ftz.f32 %f12, %f10, %f10; - div.approx.ftz.f32 %f13, %f11, %f12; - sub.ftz.f32 %f14, %f10, %f13; - mov.f32 %f15, 0f3eaaaaab; // 0.333333 - mul.ftz.f32 %f16, %f14, %f15; - sub.ftz.f32 %f17, %f10, %f16; - st.shared.f32 [__cuda_local_var_32892_42_non_const_cr60], %f17; - .loc 21 544 0 - mov.f32 %f18, 0f3f800000; // 1 - mov.f32 %f19, 0fbf52c7ea; // -0.823363 - mov.f32 %f20, 0fc0b59883; // -5.67487 - fma.rn.ftz.f32 %f21, %f18, %f19, %f20; - mov.f32 %f22, 0f41455dc0; // 12.3354 - mov.f32 %f23, 0f3f800000; // 1 - mov.f32 %f24, 0f41e6bd60; // 28.8425 - fma.rn.ftz.f32 %f25, %f22, %f23, %f24; - mov.f32 %f26, 0f3f800000; // 1 - mov.f32 %f27, 0fc0d21907; // -6.56556 - fma.rn.ftz.f32 %f28, %f21, %f26, %f27; - mov.f32 %f29, 0f3f800000; // 1 - mov.f32 %f30, 0f419d92c8; // 19.6967 - fma.rn.ftz.f32 %f31, %f25, %f29, %f30; - rcp.approx.ftz.f32 %f32, %f31; - mov.f32 %f33, 0f3f800000; // 1 - fma.rn.ftz.f32 %f34, %f28, %f32, %f33; - mov.b32 %r1, %f34; - mov.b32 %f35, %r1; - mov.f32 %f36, 0f41800000; // 16 - mul.ftz.f32 %f37, %f35, %f36; - mov.f32 %f38, 0f40400000; // 3 - mov.f32 %f39, 0fc2100000; // -36 - mul.ftz.f32 %f40, %f37, %f39; - div.approx.ftz.f32 %f41, %f38, %f40; - .loc 17 40 0 - st.shared.f32 [__cuda_local_var_32892_48_non_const_solv_f_a], %f41; - .loc 21 544 0 - mov.f32 %f42, 0f40400000; // 3 - mov.f32 %f43, 0f44fd2000; // 2025 - mul.ftz.f32 %f44, %f37, %f43; - div.approx.ftz.f32 %f45, %f42, %f44; - .loc 17 41 0 - st.shared.f32 [__cuda_local_var_32892_58_non_const_solv_f_r], %f45; - .loc 17 54 0 - mov.f32 %f46, 0f00000000; // 0 - mov.f32 %f47, %f46; - mov.f32 %f48, 0f00000000; // 0 - mov.f32 %f49, %f48; - mov.f32 %f50, 0f00000000; // 0 - mov.f32 %f51, %f50; - mov.f32 %f52, 0f00000000; // 0 - mov.f32 %f53, %f52; - mov.f32 %f54, 0f00000000; // 0 - mov.f32 %f55, %f54; - mov.f32 %f56, 0f00000000; // 0 - mov.f32 %f57, %f56; - ld.param.s32 %r2, [__cudaparm_kernel_ellipsoid_sphere_t_per_atom]; - cvt.s32.u32 %r3, %tid.x; - div.s32 %r4, %r3, %r2; - cvt.s32.u32 %r5, %ntid.x; - div.s32 %r6, %r5, %r2; - cvt.s32.u32 %r7, %ctaid.x; - mul.lo.s32 %r8, %r7, %r6; - add.s32 %r9, %r4, %r8; - ld.param.s32 %r10, [__cudaparm_kernel_ellipsoid_sphere_inum]; - setp.le.s32 %p1, %r10, %r9; - @%p1 bra $Lt_0_73474; - .loc 17 59 0 - cvt.s64.s32 %rd2, %r9; - mul.wide.s32 %rd3, %r9, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_ellipsoid_sphere_dev_nbor]; - add.u64 %rd5, %rd4, %rd3; - ld.global.s32 %r11, [%rd5+0]; - ld.param.s32 %r12, [__cudaparm_kernel_ellipsoid_sphere_stride]; - cvt.s64.s32 %rd6, %r12; - mul.wide.s32 %rd7, %r12, 4; - add.u64 %rd8, %rd7, %rd5; - ld.global.s32 %r13, [%rd8+0]; - .loc 17 62 0 - cvt.s64.s32 %rd9, %r11; - mul.wide.s32 %rd10, %r11, 16; - ld.param.u64 %rd11, [__cudaparm_kernel_ellipsoid_sphere_x_]; - add.u64 %rd12, %rd10, %rd11; - ld.global.v4.f32 {%f58,%f59,%f60,%f61}, [%rd12+0]; - .loc 17 70 0 - cvt.rzi.ftz.s32.f32 %r14, %f61; - cvt.s64.s32 %rd13, %r14; - mul.wide.s32 %rd14, %r14, 16; - ld.param.u64 %rd15, [__cudaparm_kernel_ellipsoid_sphere_shape]; - add.u64 %rd16, %rd15, %rd14; - ld.global.v4.f32 {%f62,%f63,%f64,_}, [%rd16+0]; - .loc 17 74 0 - ld.param.u64 %rd17, [__cudaparm_kernel_ellipsoid_sphere_q]; - add.u64 %rd18, %rd17, %rd10; - ld.global.v4.f32 {%f65,%f66,%f67,%f68}, [%rd18+0]; - .loc 17 75 0 - ld.param.u64 %rd19, [__cudaparm_kernel_ellipsoid_sphere_well]; - add.u64 %rd20, %rd19, %rd14; - ld.global.v4.f32 {%f69,%f70,%f71,_}, [%rd20+0]; - .loc 17 78 0 - cvt.s32.s64 %r15, %rd6; - sub.s32 %r16, %r2, 1; - and.b32 %r17, %r16, %r3; - add.u64 %rd21, %rd7, %rd8; - mul.lo.s32 %r18, %r15, %r17; - cvt.s64.s32 %rd22, %r18; - mul.wide.s32 %rd23, %r18, 4; - add.u64 %rd24, %rd21, %rd23; - mov.s64 %rd25, %rd24; - mul.lo.s32 %r19, %r15, %r13; - cvt.s64.s32 %rd26, %r19; - mul.wide.s32 %rd27, %r19, 4; - add.u64 %rd28, %rd21, %rd27; - setp.ge.u64 %p2, %rd24, %rd28; - @%p2 bra $Lt_0_75266; - ld.param.s32 %r20, [__cudaparm_kernel_ellipsoid_sphere_vflag]; - mov.s32 %r21, 0; - setp.gt.s32 %p3, %r20, %r21; - add.ftz.f32 %f72, %f66, %f66; - add.ftz.f32 %f73, %f68, %f68; - mul.ftz.f32 %f74, %f65, %f65; - mul.ftz.f32 %f75, %f66, %f66; - mul.ftz.f32 %f76, %f67, %f67; - mul.ftz.f32 %f77, %f68, %f68; - add.ftz.f32 %f78, %f67, %f67; - mul.ftz.f32 %f79, %f62, %f63; - add.ftz.f32 %f80, %f62, %f62; - add.ftz.f32 %f81, %f63, %f63; - add.ftz.f32 %f82, %f64, %f64; - ld.param.s32 %r22, [__cudaparm_kernel_ellipsoid_sphere_ntypes]; - mul.lo.s32 %r23, %r22, %r14; - mul.ftz.f32 %f83, %f72, %f67; - mul.ftz.f32 %f84, %f72, %f68; - mul.ftz.f32 %f85, %f72, %f65; - mul.ftz.f32 %f86, %f73, %f65; - add.ftz.f32 %f87, %f74, %f75; - sub.ftz.f32 %f88, %f74, %f75; - mul.ftz.f32 %f89, %f78, %f65; - mul.ftz.f32 %f90, %f78, %f68; - mul.ftz.f32 %f91, %f79, %f64; - sub.ftz.f32 %f92, %f83, %f86; - add.ftz.f32 %f93, %f83, %f86; - sub.ftz.f32 %f94, %f86, %f83; - sub.ftz.f32 %f95, %f87, %f76; - add.ftz.f32 %f96, %f76, %f88; - sub.ftz.f32 %f97, %f88, %f76; - add.ftz.f32 %f98, %f84, %f89; - sub.ftz.f32 %f99, %f84, %f89; - sub.ftz.f32 %f100, %f89, %f84; - sub.ftz.f32 %f101, %f90, %f85; - add.ftz.f32 %f102, %f85, %f90; - sub.ftz.f32 %f103, %f85, %f90; - mul.ftz.f32 %f104, %f92, %f70; - mul.ftz.f32 %f105, %f93, %f69; - neg.ftz.f32 %f106, %f93; - sub.ftz.f32 %f107, %f95, %f77; - sub.ftz.f32 %f108, %f77, %f95; - sub.ftz.f32 %f109, %f96, %f77; - sub.ftz.f32 %f110, %f77, %f96; - add.ftz.f32 %f111, %f77, %f97; - mul.ftz.f32 %f112, %f98, %f71; - neg.ftz.f32 %f113, %f98; - mul.ftz.f32 %f114, %f99, %f69; - mul.ftz.f32 %f115, %f101, %f71; - mul.ftz.f32 %f116, %f102, %f70; - mul.ftz.f32 %f117, %f92, %f104; - mul.ftz.f32 %f118, %f102, %f104; - mul.ftz.f32 %f119, %f107, %f69; - mul.ftz.f32 %f120, %f104, %f109; - mul.ftz.f32 %f121, %f109, %f70; - mul.ftz.f32 %f122, %f111, %f71; - neg.ftz.f32 %f123, %f111; - mul.ftz.f32 %f124, %f92, %f116; - mul.ftz.f32 %f125, %f109, %f116; - mul.ftz.f32 %f126, %f102, %f116; - fma.rn.ftz.f32 %f127, %f107, %f119, %f117; - fma.rn.ftz.f32 %f128, %f119, %f99, %f118; - fma.rn.ftz.f32 %f129, %f119, %f93, %f120; - mul.ftz.f32 %f130, %f92, %f121; - mul.ftz.f32 %f131, %f109, %f121; - mul.ftz.f32 %f132, %f102, %f121; - fma.rn.ftz.f32 %f133, %f107, %f114, %f124; - fma.rn.ftz.f32 %f134, %f93, %f114, %f125; - fma.rn.ftz.f32 %f135, %f99, %f114, %f126; - fma.rn.ftz.f32 %f136, %f112, %f98, %f127; - fma.rn.ftz.f32 %f137, %f112, %f111, %f128; - fma.rn.ftz.f32 %f138, %f112, %f101, %f129; - fma.rn.ftz.f32 %f139, %f107, %f105, %f130; - fma.rn.ftz.f32 %f140, %f93, %f105, %f131; - fma.rn.ftz.f32 %f141, %f99, %f105, %f132; - fma.rn.ftz.f32 %f142, %f98, %f122, %f133; - fma.rn.ftz.f32 %f143, %f101, %f122, %f134; - fma.rn.ftz.f32 %f144, %f111, %f122, %f135; - mov.f32 %f145, 0f3f800000; // 1 - add.ftz.f32 %f146, %f136, %f145; - fma.rn.ftz.f32 %f147, %f115, %f98, %f139; - fma.rn.ftz.f32 %f148, %f115, %f101, %f140; - fma.rn.ftz.f32 %f149, %f115, %f111, %f141; - abs.ftz.f32 %f150, %f142; - mov.f32 %f151, 0f3f800000; // 1 - add.ftz.f32 %f152, %f144, %f151; - abs.ftz.f32 %f153, %f146; - abs.ftz.f32 %f154, %f147; - mov.f32 %f155, 0f3f800000; // 1 - add.ftz.f32 %f156, %f148, %f155; - setp.lt.ftz.f32 %p4, %f153, %f154; - ld.param.u64 %rd29, [__cudaparm_kernel_ellipsoid_sphere_sig_eps]; - mov.f32 %f157, 0f00000000; // 0 - mov.f32 %f158, 0f00000000; // 0 - mov.f32 %f159, 0f00000000; // 0 - mov.f32 %f160, 0f00000000; // 0 - mov.f32 %f161, 0f00000000; // 0 - mov.f32 %f162, 0f00000000; // 0 - mov.f32 %f163, 0f00000000; // 0 - mov.u64 %rd30, __cuda___cuda_local_var_32886_33_non_const_sp_lj120; -$Lt_0_51970: - // Loop body line 78, nesting depth: 1, estimated iterations: unknown - .loc 17 83 0 - ld.global.s32 %r24, [%rd25+0]; - .loc 17 87 0 - and.b32 %r25, %r24, 1073741823; - cvt.s64.s32 %rd31, %r25; - mul.wide.s32 %rd32, %r25, 16; - add.u64 %rd33, %rd11, %rd32; - ld.global.v4.f32 {%f164,%f165,%f166,%f167}, [%rd33+0]; - .loc 17 98 0 - sub.ftz.f32 %f168, %f165, %f59; - sub.ftz.f32 %f169, %f164, %f58; - sub.ftz.f32 %f170, %f166, %f60; - mul.ftz.f32 %f171, %f168, %f168; - fma.rn.ftz.f32 %f172, %f169, %f169, %f171; - fma.rn.ftz.f32 %f173, %f170, %f170, %f172; - rsqrt.approx.ftz.f32 %f174, %f173; - mul.ftz.f32 %f175, %f169, %f174; - .loc 17 99 0 - mul.ftz.f32 %f176, %f168, %f174; - .loc 17 104 0 - cvt.rzi.ftz.s32.f32 %r26, %f167; - add.s32 %r27, %r26, %r23; - cvt.s64.s32 %rd34, %r27; - mul.wide.s32 %rd35, %r27, 8; - add.u64 %rd36, %rd29, %rd35; - ld.global.v2.f32 {%f177,%f178}, [%rd36+0]; - .loc 17 105 0 - shr.s32 %r28, %r24, 30; - and.b32 %r29, %r28, 3; - cvt.s64.s32 %rd37, %r29; - mul.wide.s32 %rd38, %r29, 4; - add.u64 %rd39, %rd30, %rd38; - ld.shared.f32 %f179, [%rd39+0]; - mul.ftz.f32 %f180, %f179, %f178; - .loc 16 299 0 - mov.f32 %f181, %f175; - .loc 16 300 0 - mov.f32 %f182, 0f3f000000; // 0.5 - mul.ftz.f32 %f183, %f177, %f182; - add.ftz.f32 %f184, %f183, %f63; - add.ftz.f32 %f185, %f183, %f62; - add.ftz.f32 %f186, %f183, %f64; - mul.ftz.f32 %f187, %f184, %f184; - mul.ftz.f32 %f188, %f185, %f185; - mul.ftz.f32 %f189, %f186, %f186; - mov.f32 %f190, 0f3f000000; // 0.5 - mul.ftz.f32 %f191, %f187, %f190; - mov.f32 %f192, 0f3f000000; // 0.5 - mul.ftz.f32 %f193, %f188, %f192; - mov.f32 %f194, 0f3f000000; // 0.5 - mul.ftz.f32 %f195, %f189, %f194; - mul.ftz.f32 %f196, %f92, %f191; - mul.ftz.f32 %f197, %f98, %f195; - mul.ftz.f32 %f198, %f193, %f107; - mul.ftz.f32 %f199, %f92, %f196; - fma.rn.ftz.f32 %f200, %f107, %f198, %f199; - fma.rn.ftz.f32 %f201, %f197, %f98, %f200; - mov.f32 %f202, %f201; - .loc 16 301 0 - mul.ftz.f32 %f203, %f109, %f196; - fma.rn.ftz.f32 %f204, %f198, %f93, %f203; - fma.rn.ftz.f32 %f205, %f197, %f101, %f204; - mov.f32 %f206, %f205; - .loc 16 302 0 - mul.ftz.f32 %f207, %f102, %f196; - fma.rn.ftz.f32 %f208, %f198, %f99, %f207; - fma.rn.ftz.f32 %f209, %f197, %f111, %f208; - mov.f32 %f210, %f209; - .loc 16 303 0 - mov.f32 %f211, %f176; - .loc 16 304 0 - mul.ftz.f32 %f212, %f93, %f193; - mul.ftz.f32 %f213, %f101, %f195; - mul.ftz.f32 %f214, %f191, %f109; - mul.ftz.f32 %f215, %f92, %f214; - fma.rn.ftz.f32 %f216, %f107, %f212, %f215; - fma.rn.ftz.f32 %f217, %f213, %f98, %f216; - mov.f32 %f218, %f217; - .loc 16 305 0 - mul.ftz.f32 %f219, %f109, %f214; - fma.rn.ftz.f32 %f220, %f93, %f212, %f219; - fma.rn.ftz.f32 %f221, %f213, %f101, %f220; - mov.f32 %f222, %f221; - .loc 16 306 0 - mul.ftz.f32 %f223, %f102, %f214; - fma.rn.ftz.f32 %f224, %f99, %f212, %f223; - fma.rn.ftz.f32 %f225, %f213, %f111, %f224; - mov.f32 %f226, %f225; - .loc 16 307 0 - mul.ftz.f32 %f227, %f170, %f174; - mov.f32 %f228, %f227; - .loc 16 308 0 - mul.ftz.f32 %f229, %f102, %f191; - mul.ftz.f32 %f230, %f99, %f193; - mul.ftz.f32 %f231, %f195, %f111; - mul.ftz.f32 %f232, %f92, %f229; - fma.rn.ftz.f32 %f233, %f107, %f230, %f232; - fma.rn.ftz.f32 %f234, %f98, %f231, %f233; - mov.f32 %f235, %f234; - .loc 16 309 0 - mul.ftz.f32 %f236, %f109, %f229; - fma.rn.ftz.f32 %f237, %f93, %f230, %f236; - fma.rn.ftz.f32 %f238, %f101, %f231, %f237; - mov.f32 %f239, %f238; - .loc 16 310 0 - mul.ftz.f32 %f240, %f102, %f229; - fma.rn.ftz.f32 %f241, %f99, %f230, %f240; - fma.rn.ftz.f32 %f242, %f111, %f231, %f241; - mov.f32 %f243, %f242; - abs.ftz.f32 %f244, %f217; - abs.ftz.f32 %f245, %f201; - setp.gt.ftz.f32 %p5, %f244, %f245; - @!%p5 bra $Lt_0_52226; - .loc 16 314 0 - mov.f32 %f202, %f217; - mov.f32 %f218, %f201; - .loc 16 315 0 - mov.f32 %f206, %f221; - mov.f32 %f222, %f205; - .loc 16 316 0 - mov.f32 %f210, %f225; - mov.f32 %f226, %f209; - .loc 16 317 0 - mov.f32 %f181, %f176; - mov.f32 %f211, %f175; -$Lt_0_52226: - mov.f32 %f246, %f202; - abs.ftz.f32 %f247, %f246; - abs.ftz.f32 %f248, %f234; - setp.lt.ftz.f32 %p6, %f247, %f248; - @!%p6 bra $Lt_0_52738; - .loc 16 321 0 - mov.f32 %f202, %f234; - mov.f32 %f235, %f246; - .loc 16 322 0 - mov.f32 %f249, %f206; - mov.f32 %f206, %f238; - mov.f32 %f239, %f249; - .loc 16 323 0 - mov.f32 %f250, %f210; - mov.f32 %f210, %f242; - mov.f32 %f243, %f250; - .loc 16 324 0 - mov.f32 %f251, %f181; - mov.f32 %f181, %f227; - mov.f32 %f228, %f251; -$Lt_0_52738: - mov.f32 %f252, %f202; - mov.f32 %f253, 0f00000000; // 0 - setp.neu.ftz.f32 %p7, %f252, %f253; - @!%p7 bra $Lt_0_53506; - bra.uni $Lt_0_54274; -$Lt_0_53506: - mov.f32 %f254, 0f00000000; // 0 - setp.neu.ftz.f32 %p8, %f218, %f254; - @!%p8 bra $Lt_0_54018; - .loc 16 338 0 - mov.f32 %f202, %f218; - mov.f32 %f218, %f252; - .loc 16 339 0 - mov.f32 %f255, %f206; - mov.f32 %f206, %f222; - mov.f32 %f222, %f255; - .loc 16 340 0 - mov.f32 %f256, %f210; - mov.f32 %f210, %f226; - mov.f32 %f226, %f256; - .loc 16 341 0 - mov.f32 %f257, %f181; - mov.f32 %f181, %f211; - mov.f32 %f211, %f257; - bra.uni $Lt_0_54274; -$Lt_0_54018: - mov.f32 %f258, 0f00000000; // 0 - setp.neu.ftz.f32 %p9, %f235, %f258; - @!%p9 bra $Lt_0_54530; - .loc 16 346 0 - mov.f32 %f202, %f235; - mov.f32 %f235, %f252; - .loc 16 347 0 - mov.f32 %f259, %f206; - mov.f32 %f206, %f239; - mov.f32 %f239, %f259; - .loc 16 348 0 - mov.f32 %f260, %f210; - mov.f32 %f210, %f243; - mov.f32 %f243, %f260; - .loc 16 349 0 - mov.f32 %f261, %f181; - mov.f32 %f181, %f228; - mov.f32 %f228, %f261; - bra.uni $Lt_0_54274; -$Lt_0_54530: - .loc 16 352 0 - mov.s32 %r30, 2; - ld.param.u64 %rd40, [__cudaparm_kernel_ellipsoid_sphere_err_flag]; - st.global.s32 [%rd40+0], %r30; -$Lt_0_54274: -$Lt_0_53762: -$Lt_0_53250: - .loc 16 355 0 - div.approx.ftz.f32 %f262, %f218, %f202; - mul.ftz.f32 %f263, %f206, %f262; - sub.ftz.f32 %f264, %f222, %f263; - mov.f32 %f222, %f264; - .loc 16 356 0 - mul.ftz.f32 %f265, %f210, %f262; - sub.ftz.f32 %f266, %f226, %f265; - mov.f32 %f226, %f266; - .loc 16 357 0 - mul.ftz.f32 %f267, %f181, %f262; - sub.ftz.f32 %f268, %f211, %f267; - mov.f32 %f211, %f268; - .loc 16 359 0 - div.approx.ftz.f32 %f269, %f235, %f202; - mul.ftz.f32 %f270, %f206, %f269; - sub.ftz.f32 %f239, %f239, %f270; - .loc 16 360 0 - mul.ftz.f32 %f271, %f210, %f269; - sub.ftz.f32 %f243, %f243, %f271; - .loc 16 361 0 - mul.ftz.f32 %f272, %f181, %f269; - sub.ftz.f32 %f228, %f228, %f272; - abs.ftz.f32 %f273, %f264; - abs.ftz.f32 %f274, %f239; - setp.lt.ftz.f32 %p10, %f273, %f274; - @!%p10 bra $Lt_0_54786; - .loc 16 366 0 - mov.f32 %f222, %f239; - mov.f32 %f239, %f264; - .loc 16 367 0 - mov.f32 %f226, %f243; - mov.f32 %f243, %f266; - .loc 16 368 0 - mov.f32 %f211, %f228; - mov.f32 %f228, %f268; -$Lt_0_54786: - mov.f32 %f275, %f222; - mov.f32 %f276, 0f00000000; // 0 - setp.neu.ftz.f32 %p11, %f275, %f276; - @!%p11 bra $Lt_0_55554; - bra.uni $Lt_0_55810; -$Lt_0_55554: - mov.f32 %f277, 0f00000000; // 0 - setp.neu.ftz.f32 %p12, %f239, %f277; - @!%p12 bra $Lt_0_55810; - .loc 16 383 0 - mov.f32 %f222, %f239; - mov.f32 %f239, %f275; - .loc 16 384 0 - mov.f32 %f278, %f226; - mov.f32 %f226, %f243; - mov.f32 %f243, %f278; - .loc 16 385 0 - mov.f32 %f279, %f211; - mov.f32 %f211, %f228; - mov.f32 %f228, %f279; -$Lt_0_55810: -$Lt_0_55298: - .loc 16 390 0 - div.approx.ftz.f32 %f280, %f239, %f222; - mul.ftz.f32 %f281, %f226, %f280; - sub.ftz.f32 %f243, %f243, %f281; - .loc 16 391 0 - mul.ftz.f32 %f282, %f211, %f280; - sub.ftz.f32 %f228, %f228, %f282; - mov.f32 %f283, 0f00000000; // 0 - setp.eq.ftz.f32 %p13, %f243, %f283; - @!%p13 bra $Lt_0_56322; - .loc 16 394 0 - mov.s32 %r31, 2; - ld.param.u64 %rd41, [__cudaparm_kernel_ellipsoid_sphere_err_flag]; - st.global.s32 [%rd41+0], %r31; -$Lt_0_56322: - .loc 16 396 0 - div.approx.ftz.f32 %f284, %f228, %f243; - .loc 16 399 0 - mul.ftz.f32 %f285, %f284, %f226; - sub.ftz.f32 %f286, %f211, %f285; - div.approx.ftz.f32 %f287, %f286, %f222; - .loc 16 403 0 - mul.ftz.f32 %f288, %f287, %f206; - fma.rn.ftz.f32 %f289, %f210, %f284, %f288; - sub.ftz.f32 %f290, %f181, %f289; - div.approx.ftz.f32 %f291, %f290, %f202; - .loc 17 124 0 - mul.ftz.f32 %f292, %f287, %f176; - fma.rn.ftz.f32 %f293, %f175, %f291, %f292; - fma.rn.ftz.f32 %f294, %f227, %f284, %f293; - mov.f32 %f295, 0f3f000000; // 0.5 - mul.ftz.f32 %f296, %f294, %f295; - rsqrt.approx.ftz.f32 %f297, %f296; - .loc 16 299 0 - mov.f32 %f181, %f175; - .loc 16 300 0 - mov.f32 %f202, %f146; - .loc 16 301 0 - mov.f32 %f206, %f138; - .loc 16 302 0 - mov.f32 %f210, %f137; - .loc 16 303 0 - mov.f32 %f211, %f176; - .loc 16 304 0 - mov.f32 %f218, %f147; - .loc 16 305 0 - mov.f32 %f222, %f156; - .loc 16 306 0 - mov.f32 %f226, %f149; - .loc 16 307 0 - mov.f32 %f228, %f227; - .loc 16 308 0 - mov.f32 %f235, %f142; - .loc 16 309 0 - mov.f32 %f239, %f143; - .loc 16 310 0 - mov.f32 %f243, %f152; - @!%p4 bra $Lt_0_56834; - .loc 16 314 0 - mov.f32 %f202, %f147; - mov.f32 %f218, %f146; - .loc 16 315 0 - mov.f32 %f206, %f156; - mov.f32 %f222, %f138; - .loc 16 316 0 - mov.f32 %f210, %f149; - mov.f32 %f226, %f137; - .loc 16 317 0 - mov.f32 %f181, %f176; - mov.f32 %f211, %f175; -$Lt_0_56834: - mov.f32 %f298, %f202; - abs.ftz.f32 %f299, %f298; - setp.gt.ftz.f32 %p14, %f150, %f299; - @!%p14 bra $Lt_0_57346; - .loc 16 321 0 - mov.f32 %f202, %f142; - mov.f32 %f235, %f298; - .loc 16 322 0 - mov.f32 %f300, %f206; - mov.f32 %f206, %f143; - mov.f32 %f239, %f300; - .loc 16 323 0 - mov.f32 %f301, %f210; - mov.f32 %f210, %f152; - mov.f32 %f243, %f301; - .loc 16 324 0 - mov.f32 %f302, %f181; - mov.f32 %f181, %f227; - mov.f32 %f228, %f302; -$Lt_0_57346: - mov.f32 %f303, %f202; - mov.f32 %f304, 0f00000000; // 0 - setp.neu.ftz.f32 %p15, %f303, %f304; - @!%p15 bra $Lt_0_58114; - bra.uni $Lt_0_58882; -$Lt_0_58114: - mov.f32 %f305, 0f00000000; // 0 - setp.neu.ftz.f32 %p16, %f218, %f305; - @!%p16 bra $Lt_0_58626; - .loc 16 338 0 - mov.f32 %f202, %f218; - mov.f32 %f218, %f303; - .loc 16 339 0 - mov.f32 %f306, %f206; - mov.f32 %f206, %f222; - mov.f32 %f222, %f306; - .loc 16 340 0 - mov.f32 %f307, %f210; - mov.f32 %f210, %f226; - mov.f32 %f226, %f307; - .loc 16 341 0 - mov.f32 %f308, %f181; - mov.f32 %f181, %f211; - mov.f32 %f211, %f308; - bra.uni $Lt_0_58882; -$Lt_0_58626: - mov.f32 %f309, 0f00000000; // 0 - setp.neu.ftz.f32 %p17, %f235, %f309; - @!%p17 bra $Lt_0_59138; - .loc 16 346 0 - mov.f32 %f202, %f235; - mov.f32 %f235, %f303; - .loc 16 347 0 - mov.f32 %f310, %f206; - mov.f32 %f206, %f239; - mov.f32 %f239, %f310; - .loc 16 348 0 - mov.f32 %f311, %f210; - mov.f32 %f210, %f243; - mov.f32 %f243, %f311; - .loc 16 349 0 - mov.f32 %f312, %f181; - mov.f32 %f181, %f228; - mov.f32 %f228, %f312; - bra.uni $Lt_0_58882; -$Lt_0_59138: - .loc 16 352 0 - mov.s32 %r32, 2; - ld.param.u64 %rd42, [__cudaparm_kernel_ellipsoid_sphere_err_flag]; - st.global.s32 [%rd42+0], %r32; -$Lt_0_58882: -$Lt_0_58370: -$Lt_0_57858: - .loc 16 355 0 - div.approx.ftz.f32 %f313, %f218, %f202; - mul.ftz.f32 %f314, %f206, %f313; - sub.ftz.f32 %f315, %f222, %f314; - mov.f32 %f222, %f315; - .loc 16 356 0 - mul.ftz.f32 %f316, %f210, %f313; - sub.ftz.f32 %f317, %f226, %f316; - mov.f32 %f226, %f317; - .loc 16 357 0 - mul.ftz.f32 %f318, %f181, %f313; - sub.ftz.f32 %f319, %f211, %f318; - mov.f32 %f211, %f319; - .loc 16 359 0 - div.approx.ftz.f32 %f320, %f235, %f202; - mul.ftz.f32 %f321, %f206, %f320; - sub.ftz.f32 %f239, %f239, %f321; - .loc 16 360 0 - mul.ftz.f32 %f322, %f210, %f320; - sub.ftz.f32 %f243, %f243, %f322; - .loc 16 361 0 - mul.ftz.f32 %f323, %f181, %f320; - sub.ftz.f32 %f228, %f228, %f323; - abs.ftz.f32 %f324, %f315; - abs.ftz.f32 %f325, %f239; - setp.lt.ftz.f32 %p18, %f324, %f325; - @!%p18 bra $Lt_0_59394; - .loc 16 366 0 - mov.f32 %f222, %f239; - mov.f32 %f239, %f315; - .loc 16 367 0 - mov.f32 %f226, %f243; - mov.f32 %f243, %f317; - .loc 16 368 0 - mov.f32 %f211, %f228; - mov.f32 %f228, %f319; -$Lt_0_59394: - mov.f32 %f326, %f222; - mov.f32 %f327, 0f00000000; // 0 - setp.neu.ftz.f32 %p19, %f326, %f327; - @!%p19 bra $Lt_0_60162; - bra.uni $Lt_0_60418; -$Lt_0_60162: - mov.f32 %f328, 0f00000000; // 0 - setp.neu.ftz.f32 %p20, %f239, %f328; - @!%p20 bra $Lt_0_60418; - .loc 16 383 0 - mov.f32 %f222, %f239; - mov.f32 %f239, %f326; - .loc 16 384 0 - mov.f32 %f329, %f226; - mov.f32 %f226, %f243; - mov.f32 %f243, %f329; - .loc 16 385 0 - mov.f32 %f330, %f211; - mov.f32 %f211, %f228; - mov.f32 %f228, %f330; -$Lt_0_60418: -$Lt_0_59906: - .loc 16 390 0 - div.approx.ftz.f32 %f331, %f239, %f222; - mul.ftz.f32 %f332, %f226, %f331; - sub.ftz.f32 %f243, %f243, %f332; - .loc 16 391 0 - mul.ftz.f32 %f333, %f211, %f331; - sub.ftz.f32 %f228, %f228, %f333; - mov.f32 %f334, 0f00000000; // 0 - setp.eq.ftz.f32 %p21, %f243, %f334; - @!%p21 bra $Lt_0_60930; - .loc 16 394 0 - mov.s32 %r33, 2; - ld.param.u64 %rd43, [__cudaparm_kernel_ellipsoid_sphere_err_flag]; - st.global.s32 [%rd43+0], %r33; -$Lt_0_60930: - .loc 17 133 0 - div.approx.ftz.f32 %f335, %f228, %f243; - mul.ftz.f32 %f336, %f335, %f226; - sub.ftz.f32 %f337, %f211, %f336; - div.approx.ftz.f32 %f338, %f337, %f222; - mul.ftz.f32 %f339, %f338, %f206; - fma.rn.ftz.f32 %f340, %f210, %f335, %f339; - mul.ftz.f32 %f341, %f338, %f176; - sub.ftz.f32 %f342, %f181, %f340; - div.approx.ftz.f32 %f343, %f342, %f202; - fma.rn.ftz.f32 %f344, %f175, %f343, %f341; - fma.rn.ftz.f32 %f345, %f227, %f335, %f344; - add.ftz.f32 %f346, %f345, %f345; - .loc 17 141 0 - rcp.approx.ftz.f32 %f347, %f174; - sub.ftz.f32 %f348, %f347, %f297; - mov.f32 %f349, 0f3f000000; // 0.5 - mul.ftz.f32 %f350, %f348, %f349; - mul.ftz.f32 %f351, %f348, %f348; - mul.ftz.f32 %f352, %f348, %f351; - add.ftz.f32 %f353, %f350, %f64; - add.ftz.f32 %f354, %f350, %f62; - add.ftz.f32 %f355, %f350, %f63; - mul.ftz.f32 %f356, %f354, %f355; - mul.ftz.f32 %f357, %f353, %f356; - mul.ftz.f32 %f358, %f352, %f357; - .loc 17 142 0 - div.approx.ftz.f32 %f359, %f177, %f348; - mul.ftz.f32 %f360, %f359, %f346; - mov.f32 %f361, 0f3f800000; // 1 - mov.f32 %f362, 0f40400000; // 3 - fma.rn.ftz.f32 %f363, %f362, %f360, %f361; - mul.ftz.f32 %f364, %f91, %f363; - .loc 17 146 0 - div.approx.ftz.f32 %f365, %f348, %f17; - add.ftz.f32 %f366, %f365, %f64; - add.ftz.f32 %f367, %f365, %f62; - add.ftz.f32 %f368, %f365, %f63; - mul.ftz.f32 %f369, %f367, %f368; - mul.ftz.f32 %f370, %f366, %f369; - mul.ftz.f32 %f371, %f352, %f370; - .loc 17 148 0 - mov.f32 %f372, 0f3f800000; // 1 - mov.f32 %f373, 0f3f4db6db; // 0.803571 - fma.rn.ftz.f32 %f374, %f373, %f360, %f372; - mul.ftz.f32 %f375, %f91, %f374; - .loc 17 150 0 - mul.ftz.f32 %f376, %f359, %f359; - mul.ftz.f32 %f377, %f359, %f376; - mul.ftz.f32 %f378, %f377, %f377; - .loc 17 153 0 - mul.ftz.f32 %f379, %f177, %f177; - mov.f32 %f380, 0f41000000; // 8 - div.approx.ftz.f32 %f381, %f358, %f380; - mov.f32 %f382, 0f42700000; // 60 - div.approx.ftz.f32 %f383, %f371, %f382; - mul.ftz.f32 %f384, %f379, %f177; - div.approx.ftz.f32 %f385, %f364, %f381; - div.approx.ftz.f32 %f386, %f375, %f383; - mul.ftz.f32 %f387, %f385, %f180; - mul.ftz.f32 %f388, %f386, %f180; - mul.ftz.f32 %f389, %f384, %f387; - mul.ftz.f32 %f390, %f384, %f388; - mul.ftz.f32 %f391, %f389, %f41; - mul.ftz.f32 %f392, %f390, %f378; - mul.ftz.f32 %f393, %f392, %f45; - add.ftz.f32 %f394, %f391, %f393; - add.ftz.f32 %f163, %f163, %f394; - .loc 17 160 0 - mov.f32 %f395, 0f40800000; // 4 - mul.ftz.f32 %f396, %f343, %f395; - .loc 17 167 0 - mov.f32 %f397, 0f40400000; // 3 - div.approx.ftz.f32 %f398, %f397, %f348; - add.ftz.f32 %f399, %f80, %f348; - rcp.approx.ftz.f32 %f400, %f399; - add.ftz.f32 %f401, %f81, %f348; - rcp.approx.ftz.f32 %f402, %f401; - add.ftz.f32 %f403, %f400, %f402; - add.ftz.f32 %f404, %f82, %f348; - rcp.approx.ftz.f32 %f405, %f404; - add.ftz.f32 %f406, %f403, %f405; - add.ftz.f32 %f407, %f398, %f406; - .loc 17 172 0 - mul.ftz.f32 %f408, %f177, %f346; - mov.f32 %f409, 0f40400000; // 3 - fma.rn.ftz.f32 %f410, %f409, %f408, %f348; - rcp.approx.ftz.f32 %f411, %f410; - rcp.approx.ftz.f32 %f412, %f348; - sub.ftz.f32 %f413, %f412, %f411; - add.ftz.f32 %f414, %f407, %f413; - .loc 17 175 0 - fma.rn.ftz.f32 %f415, %f17, %f62, %f348; - rcp.approx.ftz.f32 %f416, %f415; - fma.rn.ftz.f32 %f417, %f17, %f63, %f348; - rcp.approx.ftz.f32 %f418, %f417; - add.ftz.f32 %f419, %f416, %f418; - fma.rn.ftz.f32 %f420, %f17, %f64, %f348; - rcp.approx.ftz.f32 %f421, %f420; - add.ftz.f32 %f422, %f419, %f421; - add.ftz.f32 %f423, %f398, %f422; - .loc 17 186 0 - mul.ftz.f32 %f424, %f175, %f175; - neg.ftz.f32 %f425, %f424; - mov.f32 %f426, %f425; - .loc 17 187 0 - mul.ftz.f32 %f427, %f176, %f175; - neg.ftz.f32 %f428, %f427; - mov.f32 %f429, %f428; - .loc 17 188 0 - mul.ftz.f32 %f430, %f227, %f175; - neg.ftz.f32 %f431, %f430; - mov.f32 %f432, %f431; - .loc 17 189 0 - mov.f32 %f433, 0f3f800000; // 1 - sub.ftz.f32 %f434, %f433, %f424; - mov.f32 %f435, %f434; - .loc 17 190 0 - mul.ftz.f32 %f436, %f174, %f434; - mov.f32 %f437, %f436; - .loc 17 191 0 - mov.f32 %f438, %f429; - mul.ftz.f32 %f439, %f438, %f174; - mov.f32 %f440, %f439; - .loc 17 192 0 - mov.f32 %f441, %f432; - mul.ftz.f32 %f442, %f441, %f174; - mov.f32 %f443, %f442; - .loc 17 196 0 - mul.ftz.f32 %f444, %f297, %f297; - mov.f32 %f445, 0f3f4db6db; // 0.803571 - mul.ftz.f32 %f446, %f177, %f445; - mov.f32 %f447, 0f40800000; // 4 - mul.ftz.f32 %f448, %f335, %f447; - mul.ftz.f32 %f449, %f444, %f297; - mov.f32 %f450, 0f3f000000; // 0.5 - mul.ftz.f32 %f451, %f449, %f450; - mul.ftz.f32 %f452, %f451, %f287; - mul.ftz.f32 %f453, %f451, %f291; - mul.ftz.f32 %f454, %f451, %f284; - mov.f32 %f455, 0f40800000; // 4 - mul.ftz.f32 %f456, %f338, %f455; - mul.ftz.f32 %f457, %f452, %f439; - mul.ftz.f32 %f458, %f456, %f439; - mov.f32 %f459, 0f40e00000; // 7 - div.approx.ftz.f32 %f460, %f459, %f348; - mov.f32 %f461, 0f3f4db6db; // 0.803571 - fma.rn.ftz.f32 %f462, %f461, %f408, %f348; - rcp.approx.ftz.f32 %f463, %f462; - fma.rn.ftz.f32 %f464, %f453, %f436, %f457; - fma.rn.ftz.f32 %f465, %f396, %f436, %f458; - sub.ftz.f32 %f466, %f460, %f463; - mul.ftz.f32 %f467, %f446, %f463; - fma.rn.ftz.f32 %f468, %f454, %f442, %f464; - fma.rn.ftz.f32 %f469, %f448, %f442, %f465; - add.ftz.f32 %f470, %f466, %f423; - add.ftz.f32 %f471, %f468, %f175; - mul.ftz.f32 %f472, %f470, %f471; - mul.ftz.f32 %f473, %f467, %f469; - sub.ftz.f32 %f474, %f473, %f472; - .loc 17 197 0 - mov.f32 %f475, 0f40400000; // 3 - mul.ftz.f32 %f476, %f177, %f475; - mul.ftz.f32 %f477, %f476, %f411; - mul.ftz.f32 %f478, %f393, %f474; - mul.ftz.f32 %f479, %f471, %f414; - mul.ftz.f32 %f480, %f477, %f469; - sub.ftz.f32 %f481, %f480, %f479; - fma.rn.ftz.f32 %f482, %f391, %f481, %f478; - .loc 17 199 0 - add.ftz.f32 %f162, %f482, %f162; - @!%p3 bra $Lt_0_61954; - .loc 17 201 0 - mov.f32 %f483, %f47; - mul.ftz.f32 %f484, %f169, %f482; - sub.ftz.f32 %f485, %f483, %f484; - mov.f32 %f47, %f485; -$Lt_0_61954: - .loc 17 186 0 - mov.f32 %f486, %f428; - .loc 17 187 0 - mul.ftz.f32 %f487, %f176, %f176; - neg.ftz.f32 %f488, %f487; - mov.f32 %f489, %f488; - .loc 17 188 0 - mul.ftz.f32 %f490, %f227, %f176; - neg.ftz.f32 %f491, %f490; - mov.f32 %f492, %f491; - .loc 17 189 0 - mov.f32 %f493, 0f3f800000; // 1 - sub.ftz.f32 %f494, %f493, %f487; - mov.f32 %f495, %f494; - .loc 17 190 0 - mov.f32 %f496, %f486; - mul.ftz.f32 %f497, %f496, %f174; - mov.f32 %f498, %f497; - .loc 17 191 0 - mul.ftz.f32 %f499, %f174, %f494; - mov.f32 %f500, %f499; - .loc 17 192 0 - mov.f32 %f501, %f492; - mul.ftz.f32 %f502, %f501, %f174; - mov.f32 %f503, %f502; - .loc 17 196 0 - mul.ftz.f32 %f504, %f452, %f499; - mul.ftz.f32 %f505, %f456, %f499; - fma.rn.ftz.f32 %f506, %f453, %f497, %f504; - fma.rn.ftz.f32 %f507, %f396, %f497, %f505; - fma.rn.ftz.f32 %f508, %f454, %f502, %f506; - fma.rn.ftz.f32 %f509, %f448, %f502, %f507; - add.ftz.f32 %f510, %f508, %f176; - mul.ftz.f32 %f511, %f470, %f510; - mul.ftz.f32 %f512, %f467, %f509; - sub.ftz.f32 %f513, %f512, %f511; - .loc 17 197 0 - mul.ftz.f32 %f514, %f393, %f513; - mul.ftz.f32 %f515, %f510, %f414; - mul.ftz.f32 %f516, %f477, %f509; - sub.ftz.f32 %f517, %f516, %f515; - fma.rn.ftz.f32 %f482, %f391, %f517, %f514; - .loc 17 203 0 - add.ftz.f32 %f161, %f482, %f161; - @!%p3 bra $Lt_0_65538; - .loc 17 205 0 - mov.f32 %f518, %f49; - mul.ftz.f32 %f519, %f168, %f482; - sub.ftz.f32 %f520, %f518, %f519; - mov.f32 %f49, %f520; - .loc 17 206 0 - mov.f32 %f521, %f53; - mul.ftz.f32 %f522, %f169, %f482; - sub.ftz.f32 %f523, %f521, %f522; - mov.f32 %f53, %f523; -$Lt_0_65538: - .loc 17 186 0 - mov.f32 %f524, %f431; - .loc 17 187 0 - mov.f32 %f525, %f491; - .loc 17 188 0 - mul.ftz.f32 %f526, %f227, %f227; - neg.ftz.f32 %f527, %f526; - mov.f32 %f528, %f527; - .loc 17 189 0 - mov.f32 %f529, 0f3f800000; // 1 - sub.ftz.f32 %f530, %f529, %f526; - mov.f32 %f531, %f530; - .loc 17 190 0 - mov.f32 %f532, %f524; - mul.ftz.f32 %f533, %f532, %f174; - mov.f32 %f534, %f533; - .loc 17 191 0 - mov.f32 %f535, %f525; - mul.ftz.f32 %f536, %f535, %f174; - mov.f32 %f537, %f536; - .loc 17 192 0 - mul.ftz.f32 %f538, %f174, %f530; - mov.f32 %f539, %f538; - .loc 17 196 0 - mul.ftz.f32 %f540, %f452, %f536; - mul.ftz.f32 %f541, %f456, %f536; - fma.rn.ftz.f32 %f542, %f453, %f533, %f540; - fma.rn.ftz.f32 %f543, %f396, %f533, %f541; - fma.rn.ftz.f32 %f544, %f454, %f538, %f542; - fma.rn.ftz.f32 %f545, %f448, %f538, %f543; - add.ftz.f32 %f546, %f544, %f227; - mul.ftz.f32 %f547, %f546, %f470; - mul.ftz.f32 %f548, %f467, %f545; - sub.ftz.f32 %f549, %f548, %f547; - .loc 17 197 0 - mul.ftz.f32 %f550, %f393, %f549; - mul.ftz.f32 %f551, %f546, %f414; - mul.ftz.f32 %f552, %f477, %f545; - sub.ftz.f32 %f553, %f552, %f551; - fma.rn.ftz.f32 %f482, %f391, %f553, %f550; - .loc 17 209 0 - add.ftz.f32 %f160, %f482, %f160; - @!%p3 bra $Lt_0_68610; - .loc 17 211 0 - mov.f32 %f554, %f51; - mul.ftz.f32 %f555, %f170, %f482; - sub.ftz.f32 %f556, %f554, %f555; - mov.f32 %f51, %f556; - .loc 17 212 0 - mov.f32 %f557, %f55; - mul.ftz.f32 %f558, %f169, %f482; - sub.ftz.f32 %f559, %f557, %f558; - mov.f32 %f55, %f559; - .loc 17 213 0 - mul.ftz.f32 %f560, %f168, %f482; - sub.ftz.f32 %f56, %f56, %f560; - mov.f32 %f57, %f56; -$Lt_0_68610: - .loc 17 232 0 - mul.ftz.f32 %f561, %f102, %f338; - mul.ftz.f32 %f562, %f100, %f338; - mul.ftz.f32 %f563, %f111, %f338; - mov.f32 %f564, 0f00000000; // 0 - mov.f32 %f565, 0f00000000; // 0 - fma.rn.ftz.f32 %f566, %f565, %f212, %f564; - mov.f32 %f567, 0f00000000; // 0 - mov.f32 %f568, 0f00000000; // 0 - fma.rn.ftz.f32 %f569, %f568, %f230, %f567; - mov.f32 %f570, 0f00000000; // 0 - mov.f32 %f571, 0f00000000; // 0 - fma.rn.ftz.f32 %f572, %f571, %f198, %f570; - mul.ftz.f32 %f573, %f121, %f456; - mul.ftz.f32 %f574, %f105, %f456; - mul.ftz.f32 %f575, %f115, %f456; - neg.ftz.f32 %f576, %f561; - neg.ftz.f32 %f577, %f563; - neg.ftz.f32 %f578, %f207; - neg.ftz.f32 %f579, %f240; - mov.f32 %f580, 0f00000000; // 0 - fma.rn.ftz.f32 %f581, %f213, %f580, %f566; - mov.f32 %f582, 0f00000000; // 0 - fma.rn.ftz.f32 %f583, %f582, %f231, %f569; - mov.f32 %f584, 0f00000000; // 0 - fma.rn.ftz.f32 %f585, %f197, %f584, %f572; - neg.ftz.f32 %f586, %f223; - fma.rn.ftz.f32 %f587, %f198, %f100, %f578; - fma.rn.ftz.f32 %f588, %f100, %f230, %f579; - fma.rn.ftz.f32 %f589, %f100, %f212, %f586; - fma.rn.ftz.f32 %f590, %f197, %f123, %f587; - fma.rn.ftz.f32 %f591, %f123, %f231, %f588; - fma.rn.ftz.f32 %f592, %f213, %f123, %f589; - mov.f32 %f593, 0f00000000; // 0 - fma.rn.ftz.f32 %f594, %f343, %f593, %f576; - mov.f32 %f595, 0f00000000; // 0 - fma.rn.ftz.f32 %f596, %f595, %f343, %f562; - mov.f32 %f597, 0f00000000; // 0 - fma.rn.ftz.f32 %f598, %f343, %f597, %f577; - mul.ftz.f32 %f599, %f452, %f590; - mul.ftz.f32 %f600, %f452, %f591; - mul.ftz.f32 %f601, %f452, %f592; - fma.rn.ftz.f32 %f602, %f396, %f104, %f573; - fma.rn.ftz.f32 %f603, %f119, %f396, %f574; - fma.rn.ftz.f32 %f604, %f396, %f112, %f575; - fma.rn.ftz.f32 %f605, %f335, %f109, %f594; - fma.rn.ftz.f32 %f606, %f93, %f335, %f596; - fma.rn.ftz.f32 %f607, %f335, %f101, %f598; - fma.rn.ftz.f32 %f608, %f453, %f585, %f599; - fma.rn.ftz.f32 %f609, %f453, %f583, %f600; - fma.rn.ftz.f32 %f610, %f453, %f581, %f601; - fma.rn.ftz.f32 %f611, %f448, %f116, %f602; - fma.rn.ftz.f32 %f612, %f448, %f114, %f603; - fma.rn.ftz.f32 %f613, %f448, %f122, %f604; - fma.rn.ftz.f32 %f614, %f454, %f205, %f608; - fma.rn.ftz.f32 %f615, %f454, %f238, %f609; - fma.rn.ftz.f32 %f616, %f454, %f221, %f610; - mul.ftz.f32 %f617, %f605, %f611; - mul.ftz.f32 %f618, %f616, %f287; - fma.rn.ftz.f32 %f619, %f612, %f606, %f617; - fma.rn.ftz.f32 %f620, %f291, %f614, %f618; - fma.rn.ftz.f32 %f621, %f613, %f607, %f619; - fma.rn.ftz.f32 %f622, %f284, %f615, %f620; - neg.ftz.f32 %f623, %f621; - mul.ftz.f32 %f624, %f470, %f622; - fma.rn.ftz.f32 %f625, %f467, %f623, %f624; - mul.ftz.f32 %f626, %f393, %f625; - mul.ftz.f32 %f627, %f622, %f414; - fma.rn.ftz.f32 %f628, %f477, %f623, %f627; - fma.rn.ftz.f32 %f629, %f391, %f628, %f626; - sub.ftz.f32 %f159, %f159, %f629; - .loc 17 245 0 - mul.ftz.f32 %f630, %f94, %f196; - mul.ftz.f32 %f631, %f94, %f229; - mov.f32 %f632, 0f00000000; // 0 - mov.f32 %f633, 0f00000000; // 0 - fma.rn.ftz.f32 %f634, %f198, %f633, %f632; - mul.ftz.f32 %f635, %f94, %f214; - fma.rn.ftz.f32 %f636, %f99, %f198, %f207; - fma.rn.ftz.f32 %f637, %f198, %f108, %f630; - fma.rn.ftz.f32 %f638, %f108, %f230, %f631; - mov.f32 %f639, 0f00000000; // 0 - fma.rn.ftz.f32 %f640, %f197, %f639, %f634; - fma.rn.ftz.f32 %f641, %f108, %f212, %f635; - fma.rn.ftz.f32 %f642, %f197, %f111, %f636; - fma.rn.ftz.f32 %f643, %f197, %f113, %f637; - fma.rn.ftz.f32 %f644, %f113, %f231, %f638; - mul.ftz.f32 %f645, %f452, %f581; - mul.ftz.f32 %f646, %f452, %f583; - mul.ftz.f32 %f647, %f452, %f640; - fma.rn.ftz.f32 %f648, %f213, %f113, %f641; - fma.rn.ftz.f32 %f649, %f453, %f242, %f646; - fma.rn.ftz.f32 %f650, %f453, %f642, %f647; - fma.rn.ftz.f32 %f651, %f453, %f225, %f645; - mov.f32 %f652, 0f00000000; // 0 - fma.rn.ftz.f32 %f653, %f343, %f102, %f652; - mov.f32 %f654, 0f00000000; // 0 - fma.rn.ftz.f32 %f655, %f99, %f343, %f654; - mov.f32 %f656, 0f00000000; // 0 - fma.rn.ftz.f32 %f657, %f343, %f111, %f656; - fma.rn.ftz.f32 %f658, %f454, %f644, %f649; - fma.rn.ftz.f32 %f659, %f454, %f643, %f650; - fma.rn.ftz.f32 %f660, %f454, %f648, %f651; - fma.rn.ftz.f32 %f661, %f335, %f94, %f653; - fma.rn.ftz.f32 %f662, %f108, %f335, %f655; - fma.rn.ftz.f32 %f663, %f335, %f113, %f657; - mul.ftz.f32 %f664, %f660, %f287; - fma.rn.ftz.f32 %f665, %f291, %f659, %f664; - mul.ftz.f32 %f666, %f661, %f611; - fma.rn.ftz.f32 %f667, %f284, %f658, %f665; - fma.rn.ftz.f32 %f668, %f612, %f662, %f666; - fma.rn.ftz.f32 %f669, %f613, %f663, %f668; - neg.ftz.f32 %f670, %f669; - mul.ftz.f32 %f671, %f470, %f667; - fma.rn.ftz.f32 %f672, %f467, %f670, %f671; - mul.ftz.f32 %f673, %f393, %f672; - mul.ftz.f32 %f674, %f667, %f414; - fma.rn.ftz.f32 %f675, %f477, %f670, %f674; - fma.rn.ftz.f32 %f676, %f391, %f675, %f673; - sub.ftz.f32 %f158, %f158, %f676; - .loc 17 258 0 - mul.ftz.f32 %f677, %f92, %f338; - mul.ftz.f32 %f678, %f107, %f338; - mul.ftz.f32 %f679, %f98, %f338; - mul.ftz.f32 %f680, %f110, %f196; - mul.ftz.f32 %f681, %f110, %f229; - mul.ftz.f32 %f682, %f110, %f214; - fma.rn.ftz.f32 %f683, %f198, %f107, %f199; - fma.rn.ftz.f32 %f684, %f106, %f198, %f680; - fma.rn.ftz.f32 %f685, %f106, %f230, %f681; - fma.rn.ftz.f32 %f686, %f106, %f212, %f682; - fma.rn.ftz.f32 %f687, %f197, %f98, %f683; - fma.rn.ftz.f32 %f688, %f197, %f103, %f684; - fma.rn.ftz.f32 %f689, %f103, %f231, %f685; - fma.rn.ftz.f32 %f690, %f213, %f103, %f686; - mul.ftz.f32 %f691, %f452, %f687; - mul.ftz.f32 %f692, %f452, %f234; - mul.ftz.f32 %f693, %f452, %f217; - fma.rn.ftz.f32 %f694, %f343, %f110, %f677; - fma.rn.ftz.f32 %f695, %f106, %f343, %f678; - fma.rn.ftz.f32 %f696, %f343, %f103, %f679; - fma.rn.ftz.f32 %f697, %f453, %f688, %f691; - fma.rn.ftz.f32 %f698, %f453, %f689, %f692; - fma.rn.ftz.f32 %f699, %f453, %f690, %f693; - mov.f32 %f700, 0f00000000; // 0 - fma.rn.ftz.f32 %f701, %f335, %f700, %f694; - mov.f32 %f702, 0f00000000; // 0 - fma.rn.ftz.f32 %f703, %f702, %f335, %f695; - mov.f32 %f704, 0f00000000; // 0 - fma.rn.ftz.f32 %f705, %f335, %f704, %f696; - fma.rn.ftz.f32 %f706, %f454, %f640, %f697; - fma.rn.ftz.f32 %f707, %f454, %f583, %f698; - fma.rn.ftz.f32 %f708, %f454, %f581, %f699; - mul.ftz.f32 %f709, %f708, %f287; - mul.ftz.f32 %f710, %f701, %f611; - fma.rn.ftz.f32 %f711, %f291, %f706, %f709; - fma.rn.ftz.f32 %f712, %f612, %f703, %f710; - fma.rn.ftz.f32 %f713, %f284, %f707, %f711; - fma.rn.ftz.f32 %f714, %f613, %f705, %f712; - neg.ftz.f32 %f715, %f714; - mul.ftz.f32 %f716, %f470, %f713; - fma.rn.ftz.f32 %f717, %f467, %f715, %f716; - mul.ftz.f32 %f718, %f393, %f717; - mul.ftz.f32 %f719, %f713, %f414; - fma.rn.ftz.f32 %f720, %f477, %f715, %f719; - fma.rn.ftz.f32 %f721, %f391, %f720, %f718; - sub.ftz.f32 %f157, %f157, %f721; - mul.lo.s32 %r34, %r15, %r2; - cvt.s64.s32 %rd44, %r34; - mul.wide.s32 %rd45, %r34, 4; - add.u64 %rd25, %rd25, %rd45; - setp.gt.u64 %p22, %rd28, %rd25; - @%p22 bra $Lt_0_51970; - bra.uni $Lt_0_51458; -$Lt_0_75266: - mov.f32 %f157, 0f00000000; // 0 - mov.f32 %f158, 0f00000000; // 0 - mov.f32 %f159, 0f00000000; // 0 - mov.f32 %f160, 0f00000000; // 0 - mov.f32 %f161, 0f00000000; // 0 - mov.f32 %f162, 0f00000000; // 0 - mov.f32 %f163, 0f00000000; // 0 -$Lt_0_51458: - mov.u32 %r35, 1; - setp.le.s32 %p23, %r2, %r35; - @%p23 bra $Lt_0_71426; - .loc 17 267 0 - mov.u64 %rd46, __cuda___cuda_local_var_33120_37_non_const_red_acc136; - cvt.s64.s32 %rd47, %r3; - mul.wide.s32 %rd48, %r3, 4; - add.u64 %rd49, %rd46, %rd48; - mov.f32 %f722, %f162; - st.shared.f32 [%rd49+0], %f722; - .loc 17 268 0 - mov.f32 %f723, %f161; - st.shared.f32 [%rd49+512], %f723; - .loc 17 269 0 - mov.f32 %f724, %f160; - st.shared.f32 [%rd49+1024], %f724; - .loc 17 270 0 - mov.f32 %f725, %f159; - st.shared.f32 [%rd49+1536], %f725; - .loc 17 271 0 - mov.f32 %f726, %f158; - st.shared.f32 [%rd49+2048], %f726; - .loc 17 272 0 - mov.f32 %f727, %f157; - st.shared.f32 [%rd49+2560], %f727; - .loc 17 274 0 - shr.s32 %r36, %r2, 31; - mov.s32 %r37, 1; - and.b32 %r38, %r36, %r37; - add.s32 %r39, %r38, %r2; - shr.s32 %r40, %r39, 1; - mov.s32 %r41, %r40; - mov.u32 %r42, 0; - setp.ne.u32 %p24, %r40, %r42; - @!%p24 bra $Lt_0_69890; -$Lt_0_70402: - setp.ge.u32 %p25, %r17, %r41; - @%p25 bra $Lt_0_70658; - .loc 17 277 0 - add.u32 %r43, %r3, %r41; - cvt.u64.u32 %rd50, %r43; - mul.wide.u32 %rd51, %r43, 4; - add.u64 %rd52, %rd46, %rd51; - ld.shared.f32 %f728, [%rd52+0]; - add.ftz.f32 %f722, %f728, %f722; - st.shared.f32 [%rd49+0], %f722; - ld.shared.f32 %f729, [%rd52+512]; - add.ftz.f32 %f723, %f729, %f723; - st.shared.f32 [%rd49+512], %f723; - ld.shared.f32 %f730, [%rd52+1024]; - add.ftz.f32 %f724, %f730, %f724; - st.shared.f32 [%rd49+1024], %f724; - ld.shared.f32 %f731, [%rd52+1536]; - add.ftz.f32 %f725, %f731, %f725; - st.shared.f32 [%rd49+1536], %f725; - ld.shared.f32 %f732, [%rd52+2048]; - add.ftz.f32 %f726, %f732, %f726; - st.shared.f32 [%rd49+2048], %f726; - ld.shared.f32 %f733, [%rd52+2560]; - add.ftz.f32 %f727, %f733, %f727; - st.shared.f32 [%rd49+2560], %f727; -$Lt_0_70658: - .loc 17 274 0 - shr.u32 %r41, %r41, 1; - mov.u32 %r44, 0; - setp.ne.u32 %p26, %r41, %r44; - @%p26 bra $Lt_0_70402; -$Lt_0_69890: - .loc 17 281 0 - mov.f32 %f162, %f722; - .loc 17 282 0 - mov.f32 %f161, %f723; - .loc 17 283 0 - mov.f32 %f160, %f724; - .loc 17 284 0 - mov.f32 %f159, %f725; - .loc 17 285 0 - mov.f32 %f158, %f726; - .loc 17 286 0 - mov.f32 %f157, %f727; - ld.param.s32 %r45, [__cudaparm_kernel_ellipsoid_sphere_eflag]; - mov.s32 %r46, 0; - set.gt.u32.s32 %r47, %r45, %r46; - neg.s32 %r48, %r47; - ld.param.s32 %r49, [__cudaparm_kernel_ellipsoid_sphere_vflag]; - mov.s32 %r50, 0; - set.gt.u32.s32 %r51, %r49, %r50; - neg.s32 %r52, %r51; - or.b32 %r53, %r48, %r52; - mov.u32 %r54, 0; - setp.eq.s32 %p27, %r53, %r54; - @%p27 bra $Lt_0_71426; - .loc 17 290 0 - mov.f32 %f722, %f47; - st.shared.f32 [%rd49+0], %f722; - mov.f32 %f723, %f49; - st.shared.f32 [%rd49+512], %f723; - mov.f32 %f724, %f51; - st.shared.f32 [%rd49+1024], %f724; - mov.f32 %f725, %f53; - st.shared.f32 [%rd49+1536], %f725; - mov.f32 %f726, %f55; - st.shared.f32 [%rd49+2048], %f726; - mov.f32 %f727, %f56; - st.shared.f32 [%rd49+2560], %f727; - .loc 17 291 0 - mov.f32 %f734, %f163; - st.shared.f32 [%rd49+3072], %f734; - .loc 17 293 0 - mov.s32 %r55, %r40; - @!%p24 bra $Lt_0_71938; -$Lt_0_72450: - setp.ge.u32 %p28, %r17, %r55; - @%p28 bra $Lt_0_72706; - .loc 17 296 0 - add.u32 %r56, %r3, %r55; - cvt.u64.u32 %rd53, %r56; - mul.wide.u32 %rd54, %r56, 4; - add.u64 %rd55, %rd46, %rd54; - ld.shared.f32 %f735, [%rd55+0]; - add.ftz.f32 %f722, %f735, %f722; - st.shared.f32 [%rd49+0], %f722; - ld.shared.f32 %f736, [%rd55+512]; - add.ftz.f32 %f723, %f736, %f723; - st.shared.f32 [%rd49+512], %f723; - ld.shared.f32 %f737, [%rd55+1024]; - add.ftz.f32 %f724, %f737, %f724; - st.shared.f32 [%rd49+1024], %f724; - ld.shared.f32 %f738, [%rd55+1536]; - add.ftz.f32 %f725, %f738, %f725; - st.shared.f32 [%rd49+1536], %f725; - ld.shared.f32 %f739, [%rd55+2048]; - add.ftz.f32 %f726, %f739, %f726; - st.shared.f32 [%rd49+2048], %f726; - ld.shared.f32 %f740, [%rd55+2560]; - add.ftz.f32 %f727, %f740, %f727; - st.shared.f32 [%rd49+2560], %f727; - ld.shared.f32 %f741, [%rd55+3072]; - add.ftz.f32 %f734, %f741, %f734; - st.shared.f32 [%rd49+3072], %f734; -$Lt_0_72706: - .loc 17 293 0 - shr.u32 %r55, %r55, 1; - mov.u32 %r57, 0; - setp.ne.u32 %p29, %r55, %r57; - @%p29 bra $Lt_0_72450; -$Lt_0_71938: - .loc 17 301 0 - mov.f32 %f47, %f722; - mov.f32 %f49, %f723; - mov.f32 %f51, %f724; - mov.f32 %f53, %f725; - mov.f32 %f55, %f726; - mov.f32 %f57, %f727; - .loc 17 302 0 - mov.f32 %f163, %f734; -$Lt_0_71426: -$Lt_0_69378: - mov.u32 %r58, 0; - setp.ne.s32 %p30, %r17, %r58; - @%p30 bra $Lt_0_73474; - .loc 17 308 0 - ld.param.u64 %rd56, [__cudaparm_kernel_ellipsoid_sphere_engv]; - add.u64 %rd57, %rd56, %rd3; - ld.param.s32 %r59, [__cudaparm_kernel_ellipsoid_sphere_astride]; - ld.param.s32 %r60, [__cudaparm_kernel_ellipsoid_sphere_eflag]; - mov.u32 %r61, 0; - setp.le.s32 %p31, %r60, %r61; - @%p31 bra $Lt_0_73986; - .loc 17 310 0 - ld.global.f32 %f742, [%rd57+0]; - add.ftz.f32 %f743, %f742, %f163; - st.global.f32 [%rd57+0], %f743; - .loc 17 311 0 - cvt.s64.s32 %rd58, %r59; - mul.wide.s32 %rd59, %r59, 4; - add.u64 %rd57, %rd57, %rd59; -$Lt_0_73986: - ld.param.s32 %r62, [__cudaparm_kernel_ellipsoid_sphere_vflag]; - mov.u32 %r63, 0; - setp.le.s32 %p32, %r62, %r63; - @%p32 bra $Lt_0_74498; - .loc 17 315 0 - ld.global.f32 %f744, [%rd57+0]; - mov.f32 %f745, %f47; - add.ftz.f32 %f746, %f744, %f745; - st.global.f32 [%rd57+0], %f746; - .loc 17 316 0 - cvt.s64.s32 %rd60, %r59; - mul.wide.s32 %rd61, %r59, 4; - add.u64 %rd62, %rd61, %rd57; - .loc 17 315 0 - ld.global.f32 %f747, [%rd62+0]; - mov.f32 %f748, %f49; - add.ftz.f32 %f749, %f747, %f748; - st.global.f32 [%rd62+0], %f749; - .loc 17 316 0 - add.u64 %rd63, %rd61, %rd62; - .loc 17 315 0 - ld.global.f32 %f750, [%rd63+0]; - mov.f32 %f751, %f51; - add.ftz.f32 %f752, %f750, %f751; - st.global.f32 [%rd63+0], %f752; - .loc 17 316 0 - add.u64 %rd64, %rd61, %rd63; - .loc 17 315 0 - ld.global.f32 %f753, [%rd64+0]; - mov.f32 %f754, %f53; - add.ftz.f32 %f755, %f753, %f754; - st.global.f32 [%rd64+0], %f755; - .loc 17 316 0 - add.u64 %rd65, %rd61, %rd64; - .loc 17 315 0 - ld.global.f32 %f756, [%rd65+0]; - mov.f32 %f757, %f55; - add.ftz.f32 %f758, %f756, %f757; - st.global.f32 [%rd65+0], %f758; - .loc 17 316 0 - add.u64 %rd57, %rd61, %rd65; - .loc 17 315 0 - ld.global.f32 %f759, [%rd57+0]; - mov.f32 %f760, %f57; - add.ftz.f32 %f761, %f759, %f760; - st.global.f32 [%rd57+0], %f761; -$Lt_0_74498: - .loc 17 319 0 - ld.param.u64 %rd66, [__cudaparm_kernel_ellipsoid_sphere_ans]; - mul.lo.u64 %rd67, %rd2, 16; - add.u64 %rd68, %rd66, %rd67; - ld.global.v4.f32 {%f762,%f763,%f764,%f765}, [%rd68+0]; - .loc 17 321 0 - add.ftz.f32 %f766, %f763, %f161; - .loc 17 322 0 - add.ftz.f32 %f767, %f764, %f160; - .loc 17 323 0 - add.ftz.f32 %f768, %f762, %f162; - st.global.v4.f32 [%rd68+0], {%f768,%f766,%f767,%f765}; - .loc 17 325 0 - add.s32 %r64, %r9, %r59; - cvt.s64.s32 %rd69, %r64; - mul.wide.s32 %rd70, %r64, 16; - add.u64 %rd71, %rd66, %rd70; - ld.global.v4.f32 {%f769,%f770,%f771,%f772}, [%rd71+0]; - .loc 17 327 0 - add.ftz.f32 %f773, %f770, %f158; - .loc 17 328 0 - add.ftz.f32 %f774, %f771, %f157; - .loc 17 329 0 - add.ftz.f32 %f775, %f769, %f159; - st.global.v4.f32 [%rd71+0], {%f775,%f773,%f774,%f772}; -$Lt_0_73474: -$Lt_0_50946: - .loc 17 332 0 - exit; -$LDWend_kernel_ellipsoid_sphere: - } // kernel_ellipsoid_sphere - - .entry kernel_sphere_ellipsoid ( - .param .u64 __cudaparm_kernel_sphere_ellipsoid_x_, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_q, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_shape, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_well, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_splj, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_sig_eps, - .param .s32 __cudaparm_kernel_sphere_ellipsoid_ntypes, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_dev_nbor, - .param .s32 __cudaparm_kernel_sphere_ellipsoid_stride, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_ans, - .param .u64 __cudaparm_kernel_sphere_ellipsoid___val_paramengv, - .param .u64 __cudaparm_kernel_sphere_ellipsoid_err_flag, - .param .s32 __cudaparm_kernel_sphere_ellipsoid_eflag, - .param .s32 __cudaparm_kernel_sphere_ellipsoid_vflag, - .param .s32 __cudaparm_kernel_sphere_ellipsoid_start, - .param .s32 __cudaparm_kernel_sphere_ellipsoid_inum, - .param .s32 __cudaparm_kernel_sphere_ellipsoid_t_per_atom) - { - .reg .u32 %r<58>; - .reg .u64 %rd<70>; - .reg .f32 %f<567>; - .reg .pred %p<34>; - .shared .align 16 .b8 __cuda___cuda_local_var_33201_33_non_const_sp_lj3836[16]; - .shared .align 4 .b8 __cuda___cuda_local_var_33377_55_non_const_red_acc3852[3072]; - .shared .f32 __cuda_local_var_33207_33_non_const_b_alpha; - .shared .f32 __cuda_local_var_33207_42_non_const_cr60; - .shared .f32 __cuda_local_var_33207_48_non_const_solv_f_a; - .shared .f32 __cuda_local_var_33207_58_non_const_solv_f_r; - // __cuda_local_var_33214_10_non_const_f = 80 - // __cuda_local_var_33218_9_non_const_virial = 32 - // __cuda_local_var_33344_15_non_const_u = 56 - .loc 17 341 0 -$LDWbegin_kernel_sphere_ellipsoid: - .loc 17 347 0 - ld.param.u64 %rd1, [__cudaparm_kernel_sphere_ellipsoid_splj]; - ldu.global.f32 %f1, [%rd1+0]; - .loc 17 348 0 - ld.global.f32 %f2, [%rd1+4]; - .loc 17 349 0 - ld.global.f32 %f3, [%rd1+8]; - .loc 17 350 0 - ld.global.f32 %f4, [%rd1+12]; - st.shared.v4.f32 [__cuda___cuda_local_var_33201_33_non_const_sp_lj3836+0], {%f1,%f2,%f3,%f4}; - .loc 17 353 0 - mov.f32 %f5, 0f3f4db6db; // 0.803571 - st.shared.f32 [__cuda_local_var_33207_33_non_const_b_alpha], %f5; - .loc 17 354 0 - mov.f32 %f6, 0f42700000; // 60 - lg2.approx.ftz.f32 %f7, %f6; - mov.f32 %f8, 0f3eaaaaab; // 0.333333 - mul.ftz.f32 %f9, %f7, %f8; - ex2.approx.ftz.f32 %f10, %f9; - mov.f32 %f11, 0f42700000; // 60 - mul.ftz.f32 %f12, %f10, %f10; - div.approx.ftz.f32 %f13, %f11, %f12; - sub.ftz.f32 %f14, %f10, %f13; - mov.f32 %f15, 0f3eaaaaab; // 0.333333 - mul.ftz.f32 %f16, %f14, %f15; - sub.ftz.f32 %f17, %f10, %f16; - st.shared.f32 [__cuda_local_var_33207_42_non_const_cr60], %f17; - .loc 21 544 0 - mov.f32 %f18, 0f3f800000; // 1 - mov.f32 %f19, 0fbf52c7ea; // -0.823363 - mov.f32 %f20, 0fc0b59883; // -5.67487 - fma.rn.ftz.f32 %f21, %f18, %f19, %f20; - mov.f32 %f22, 0f41455dc0; // 12.3354 - mov.f32 %f23, 0f3f800000; // 1 - mov.f32 %f24, 0f41e6bd60; // 28.8425 - fma.rn.ftz.f32 %f25, %f22, %f23, %f24; - mov.f32 %f26, 0f3f800000; // 1 - mov.f32 %f27, 0fc0d21907; // -6.56556 - fma.rn.ftz.f32 %f28, %f21, %f26, %f27; - mov.f32 %f29, 0f3f800000; // 1 - mov.f32 %f30, 0f419d92c8; // 19.6967 - fma.rn.ftz.f32 %f31, %f25, %f29, %f30; - rcp.approx.ftz.f32 %f32, %f31; - mov.f32 %f33, 0f3f800000; // 1 - fma.rn.ftz.f32 %f34, %f28, %f32, %f33; - mov.b32 %r1, %f34; - mov.b32 %f35, %r1; - mov.f32 %f36, 0f41800000; // 16 - mul.ftz.f32 %f37, %f35, %f36; - mov.f32 %f38, 0f40400000; // 3 - mov.f32 %f39, 0fc2100000; // -36 - mul.ftz.f32 %f40, %f37, %f39; - div.approx.ftz.f32 %f41, %f38, %f40; - .loc 17 355 0 - st.shared.f32 [__cuda_local_var_33207_48_non_const_solv_f_a], %f41; - .loc 21 544 0 - mov.f32 %f42, 0f40400000; // 3 - mov.f32 %f43, 0f44fd2000; // 2025 - mul.ftz.f32 %f44, %f37, %f43; - div.approx.ftz.f32 %f45, %f42, %f44; - .loc 17 356 0 - st.shared.f32 [__cuda_local_var_33207_58_non_const_solv_f_r], %f45; - .loc 17 365 0 - mov.f32 %f46, 0f00000000; // 0 - mov.f32 %f47, %f46; - mov.f32 %f48, 0f00000000; // 0 - mov.f32 %f49, %f48; - mov.f32 %f50, 0f00000000; // 0 - mov.f32 %f51, %f50; - mov.f32 %f52, 0f00000000; // 0 - mov.f32 %f53, %f52; - mov.f32 %f54, 0f00000000; // 0 - mov.f32 %f55, %f54; - mov.f32 %f56, 0f00000000; // 0 - mov.f32 %f57, %f56; - ld.param.s32 %r2, [__cudaparm_kernel_sphere_ellipsoid_t_per_atom]; - cvt.s32.u32 %r3, %tid.x; - div.s32 %r4, %r3, %r2; - cvt.s32.u32 %r5, %ntid.x; - div.s32 %r6, %r5, %r2; - cvt.s32.u32 %r7, %ctaid.x; - mul.lo.s32 %r8, %r7, %r6; - add.s32 %r9, %r4, %r8; - ld.param.s32 %r10, [__cudaparm_kernel_sphere_ellipsoid_start]; - add.s32 %r11, %r10, %r9; - ld.param.s32 %r12, [__cudaparm_kernel_sphere_ellipsoid_inum]; - setp.ge.s32 %p1, %r11, %r12; - @%p1 bra $Lt_1_73218; - .loc 17 370 0 - cvt.s64.s32 %rd2, %r11; - mul.wide.s32 %rd3, %r11, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_sphere_ellipsoid_dev_nbor]; - add.u64 %rd5, %rd4, %rd3; - ld.global.s32 %r13, [%rd5+0]; - ld.param.s32 %r14, [__cudaparm_kernel_sphere_ellipsoid_stride]; - cvt.s64.s32 %rd6, %r14; - mul.wide.s32 %rd7, %r14, 4; - add.u64 %rd8, %rd7, %rd5; - ld.global.s32 %r15, [%rd8+0]; - .loc 17 373 0 - ld.param.u64 %rd9, [__cudaparm_kernel_sphere_ellipsoid_x_]; - cvt.s64.s32 %rd10, %r13; - mul.wide.s32 %rd11, %r13, 16; - add.u64 %rd12, %rd9, %rd11; - ld.global.v4.f32 {%f58,%f59,%f60,%f61}, [%rd12+0]; - .loc 17 374 0 - cvt.s32.s64 %r16, %rd6; - sub.s32 %r17, %r2, 1; - and.b32 %r18, %r17, %r3; - add.u64 %rd13, %rd7, %rd8; - mul.lo.s32 %r19, %r16, %r18; - cvt.s64.s32 %rd14, %r19; - mul.wide.s32 %rd15, %r19, 4; - add.u64 %rd16, %rd13, %rd15; - mov.s64 %rd17, %rd16; - mul.lo.s32 %r20, %r16, %r15; - cvt.s64.s32 %rd18, %r20; - mul.wide.s32 %rd19, %r20, 4; - add.u64 %rd20, %rd13, %rd19; - setp.ge.u64 %p2, %rd16, %rd20; - @%p2 bra $Lt_1_75010; - ld.param.s32 %r21, [__cudaparm_kernel_sphere_ellipsoid_vflag]; - mov.s32 %r22, 0; - setp.gt.s32 %p3, %r21, %r22; - cvt.rzi.ftz.s32.f32 %r23, %f61; - ld.param.u64 %rd21, [__cudaparm_kernel_sphere_ellipsoid_sig_eps]; - ld.param.s32 %r24, [__cudaparm_kernel_sphere_ellipsoid_ntypes]; - ld.param.u64 %rd22, [__cudaparm_kernel_sphere_ellipsoid_well]; - ld.param.u64 %rd23, [__cudaparm_kernel_sphere_ellipsoid_q]; - ld.param.u64 %rd24, [__cudaparm_kernel_sphere_ellipsoid_shape]; - mov.f32 %f62, 0f00000000; // 0 - mov.f32 %f63, 0f00000000; // 0 - mov.f32 %f64, 0f00000000; // 0 - mov.f32 %f65, 0f00000000; // 0 - mov.u64 %rd25, __cuda___cuda_local_var_33201_33_non_const_sp_lj3836; -$Lt_1_51714: - // Loop body line 374, nesting depth: 1, estimated iterations: unknown - .loc 17 378 0 - ld.global.s32 %r25, [%rd17+0]; - .loc 17 382 0 - and.b32 %r26, %r25, 1073741823; - cvt.s64.s32 %rd26, %r26; - mul.wide.s32 %rd27, %r26, 16; - add.u64 %rd28, %rd27, %rd9; - ld.global.v4.f32 {%f66,%f67,%f68,%f69}, [%rd28+0]; - .loc 17 389 0 - cvt.rzi.ftz.s32.f32 %r27, %f69; - cvt.s64.s32 %rd29, %r27; - mul.wide.s32 %rd30, %r27, 16; - add.u64 %rd31, %rd30, %rd24; - ld.global.v4.f32 {%f70,%f71,%f72,_}, [%rd31+0]; - .loc 17 390 0 - add.u64 %rd32, %rd27, %rd23; - ld.global.v4.f32 {%f73,%f74,%f75,%f76}, [%rd32+0]; - .loc 17 391 0 - add.u64 %rd33, %rd30, %rd22; - ld.global.v4.f32 {%f77,%f78,%f79,_}, [%rd33+0]; - .loc 17 401 0 - sub.ftz.f32 %f80, %f67, %f59; - sub.ftz.f32 %f81, %f66, %f58; - sub.ftz.f32 %f82, %f68, %f60; - mul.ftz.f32 %f83, %f80, %f80; - fma.rn.ftz.f32 %f84, %f81, %f81, %f83; - fma.rn.ftz.f32 %f85, %f82, %f82, %f84; - rsqrt.approx.ftz.f32 %f86, %f85; - mul.ftz.f32 %f87, %f81, %f86; - .loc 17 402 0 - mul.ftz.f32 %f88, %f80, %f86; - .loc 17 407 0 - mul.lo.s32 %r28, %r27, %r24; - add.s32 %r29, %r23, %r28; - cvt.s64.s32 %rd34, %r29; - mul.wide.s32 %rd35, %r29, 8; - add.u64 %rd36, %rd21, %rd35; - ld.global.v2.f32 {%f89,%f90}, [%rd36+0]; - .loc 17 408 0 - shr.s32 %r30, %r25, 30; - and.b32 %r31, %r30, 3; - cvt.s64.s32 %rd37, %r31; - mul.wide.s32 %rd38, %r31, 4; - add.u64 %rd39, %rd25, %rd38; - ld.shared.f32 %f91, [%rd39+0]; - mul.ftz.f32 %f92, %f91, %f90; - .loc 16 299 0 - mov.f32 %f93, %f87; - .loc 16 300 0 - mov.f32 %f94, 0f3f000000; // 0.5 - mul.ftz.f32 %f95, %f89, %f94; - add.ftz.f32 %f96, %f74, %f74; - add.ftz.f32 %f97, %f76, %f76; - mul.ftz.f32 %f98, %f73, %f73; - mul.ftz.f32 %f99, %f74, %f74; - mul.ftz.f32 %f100, %f75, %f75; - mul.ftz.f32 %f101, %f76, %f76; - add.ftz.f32 %f102, %f75, %f75; - add.ftz.f32 %f103, %f95, %f71; - add.ftz.f32 %f104, %f95, %f70; - add.ftz.f32 %f105, %f95, %f72; - mul.ftz.f32 %f106, %f96, %f75; - mul.ftz.f32 %f107, %f96, %f76; - mul.ftz.f32 %f108, %f97, %f73; - add.ftz.f32 %f109, %f98, %f99; - mul.ftz.f32 %f110, %f102, %f73; - mul.ftz.f32 %f111, %f103, %f103; - mul.ftz.f32 %f112, %f104, %f104; - mul.ftz.f32 %f113, %f105, %f105; - sub.ftz.f32 %f114, %f106, %f108; - sub.ftz.f32 %f115, %f109, %f100; - add.ftz.f32 %f116, %f107, %f110; - mov.f32 %f117, 0f3f000000; // 0.5 - mul.ftz.f32 %f118, %f111, %f117; - mov.f32 %f119, 0f3f000000; // 0.5 - mul.ftz.f32 %f120, %f112, %f119; - mov.f32 %f121, 0f3f000000; // 0.5 - mul.ftz.f32 %f122, %f113, %f121; - sub.ftz.f32 %f123, %f115, %f101; - mul.ftz.f32 %f124, %f114, %f118; - mul.ftz.f32 %f125, %f116, %f122; - mul.ftz.f32 %f126, %f120, %f123; - mul.ftz.f32 %f127, %f114, %f124; - fma.rn.ftz.f32 %f128, %f123, %f126, %f127; - fma.rn.ftz.f32 %f129, %f125, %f116, %f128; - mov.f32 %f130, %f129; - .loc 16 301 0 - mul.ftz.f32 %f131, %f96, %f73; - sub.ftz.f32 %f132, %f98, %f99; - mul.ftz.f32 %f133, %f102, %f76; - add.ftz.f32 %f134, %f106, %f108; - add.ftz.f32 %f135, %f100, %f132; - sub.ftz.f32 %f136, %f133, %f131; - sub.ftz.f32 %f137, %f135, %f101; - mul.ftz.f32 %f138, %f137, %f124; - fma.rn.ftz.f32 %f139, %f126, %f134, %f138; - fma.rn.ftz.f32 %f140, %f125, %f136, %f139; - mov.f32 %f141, %f140; - .loc 16 302 0 - sub.ftz.f32 %f142, %f132, %f100; - sub.ftz.f32 %f143, %f107, %f110; - add.ftz.f32 %f144, %f131, %f133; - add.ftz.f32 %f145, %f101, %f142; - mul.ftz.f32 %f146, %f144, %f124; - fma.rn.ftz.f32 %f147, %f126, %f143, %f146; - fma.rn.ftz.f32 %f148, %f125, %f145, %f147; - mov.f32 %f149, %f148; - .loc 16 303 0 - mov.f32 %f150, %f88; - .loc 16 304 0 - mul.ftz.f32 %f151, %f134, %f120; - mul.ftz.f32 %f152, %f136, %f122; - mul.ftz.f32 %f153, %f118, %f137; - mul.ftz.f32 %f154, %f114, %f153; - fma.rn.ftz.f32 %f155, %f123, %f151, %f154; - fma.rn.ftz.f32 %f156, %f152, %f116, %f155; - mov.f32 %f157, %f156; - .loc 16 305 0 - mul.ftz.f32 %f158, %f137, %f153; - fma.rn.ftz.f32 %f159, %f134, %f151, %f158; - fma.rn.ftz.f32 %f160, %f152, %f136, %f159; - .loc 16 306 0 - mul.ftz.f32 %f161, %f144, %f153; - fma.rn.ftz.f32 %f162, %f143, %f151, %f161; - fma.rn.ftz.f32 %f163, %f152, %f145, %f162; - .loc 16 307 0 - mul.ftz.f32 %f164, %f82, %f86; - mov.f32 %f165, %f164; - .loc 16 308 0 - mul.ftz.f32 %f166, %f144, %f118; - mul.ftz.f32 %f167, %f143, %f120; - mul.ftz.f32 %f168, %f122, %f145; - mul.ftz.f32 %f169, %f114, %f166; - fma.rn.ftz.f32 %f170, %f123, %f167, %f169; - fma.rn.ftz.f32 %f171, %f116, %f168, %f170; - mov.f32 %f172, %f171; - .loc 16 309 0 - mul.ftz.f32 %f173, %f137, %f166; - fma.rn.ftz.f32 %f174, %f134, %f167, %f173; - fma.rn.ftz.f32 %f175, %f136, %f168, %f174; - .loc 16 310 0 - mul.ftz.f32 %f176, %f144, %f166; - fma.rn.ftz.f32 %f177, %f143, %f167, %f176; - fma.rn.ftz.f32 %f178, %f145, %f168, %f177; - abs.ftz.f32 %f179, %f156; - abs.ftz.f32 %f180, %f129; - setp.gt.ftz.f32 %p4, %f179, %f180; - @!%p4 bra $Lt_1_51970; - .loc 16 314 0 - mov.f32 %f130, %f156; - mov.f32 %f157, %f129; - .loc 16 315 0 - mov.f32 %f141, %f160; - mov.f32 %f160, %f140; - .loc 16 316 0 - mov.f32 %f149, %f163; - mov.f32 %f163, %f148; - .loc 16 317 0 - mov.f32 %f93, %f88; - mov.f32 %f150, %f87; -$Lt_1_51970: - mov.f32 %f181, %f130; - abs.ftz.f32 %f182, %f181; - abs.ftz.f32 %f183, %f171; - setp.lt.ftz.f32 %p5, %f182, %f183; - @!%p5 bra $Lt_1_52482; - .loc 16 321 0 - mov.f32 %f130, %f171; - mov.f32 %f172, %f181; - .loc 16 322 0 - mov.f32 %f184, %f141; - mov.f32 %f141, %f175; - mov.f32 %f175, %f184; - .loc 16 323 0 - mov.f32 %f185, %f149; - mov.f32 %f149, %f178; - mov.f32 %f178, %f185; - .loc 16 324 0 - mov.f32 %f186, %f93; - mov.f32 %f93, %f164; - mov.f32 %f165, %f186; -$Lt_1_52482: - mov.f32 %f187, %f130; - mov.f32 %f188, 0f00000000; // 0 - setp.neu.ftz.f32 %p6, %f187, %f188; - @!%p6 bra $Lt_1_53250; - bra.uni $Lt_1_54018; -$Lt_1_53250: - mov.f32 %f189, 0f00000000; // 0 - setp.neu.ftz.f32 %p7, %f157, %f189; - @!%p7 bra $Lt_1_53762; - .loc 16 338 0 - mov.f32 %f130, %f157; - mov.f32 %f157, %f187; - .loc 16 339 0 - mov.f32 %f190, %f141; - mov.f32 %f141, %f160; - mov.f32 %f160, %f190; - .loc 16 340 0 - mov.f32 %f191, %f149; - mov.f32 %f149, %f163; - mov.f32 %f163, %f191; - .loc 16 341 0 - mov.f32 %f192, %f93; - mov.f32 %f93, %f150; - mov.f32 %f150, %f192; - bra.uni $Lt_1_54018; -$Lt_1_53762: - mov.f32 %f193, 0f00000000; // 0 - setp.neu.ftz.f32 %p8, %f172, %f193; - @!%p8 bra $Lt_1_54274; - .loc 16 346 0 - mov.f32 %f130, %f172; - mov.f32 %f172, %f187; - .loc 16 347 0 - mov.f32 %f194, %f141; - mov.f32 %f141, %f175; - mov.f32 %f175, %f194; - .loc 16 348 0 - mov.f32 %f195, %f149; - mov.f32 %f149, %f178; - mov.f32 %f178, %f195; - .loc 16 349 0 - mov.f32 %f196, %f93; - mov.f32 %f93, %f165; - mov.f32 %f165, %f196; - bra.uni $Lt_1_54018; -$Lt_1_54274: - .loc 16 352 0 - mov.s32 %r32, 2; - ld.param.u64 %rd40, [__cudaparm_kernel_sphere_ellipsoid_err_flag]; - st.global.s32 [%rd40+0], %r32; -$Lt_1_54018: -$Lt_1_53506: -$Lt_1_52994: - .loc 16 355 0 - div.approx.ftz.f32 %f197, %f157, %f130; - mul.ftz.f32 %f198, %f141, %f197; - sub.ftz.f32 %f199, %f160, %f198; - mov.f32 %f160, %f199; - .loc 16 356 0 - mul.ftz.f32 %f200, %f149, %f197; - sub.ftz.f32 %f201, %f163, %f200; - mov.f32 %f163, %f201; - .loc 16 357 0 - mul.ftz.f32 %f202, %f93, %f197; - sub.ftz.f32 %f203, %f150, %f202; - mov.f32 %f150, %f203; - .loc 16 359 0 - div.approx.ftz.f32 %f204, %f172, %f130; - mul.ftz.f32 %f205, %f141, %f204; - sub.ftz.f32 %f175, %f175, %f205; - .loc 16 360 0 - mul.ftz.f32 %f206, %f149, %f204; - sub.ftz.f32 %f178, %f178, %f206; - .loc 16 361 0 - mul.ftz.f32 %f207, %f93, %f204; - sub.ftz.f32 %f165, %f165, %f207; - abs.ftz.f32 %f208, %f199; - abs.ftz.f32 %f209, %f175; - setp.lt.ftz.f32 %p9, %f208, %f209; - @!%p9 bra $Lt_1_54530; - .loc 16 366 0 - mov.f32 %f160, %f175; - mov.f32 %f175, %f199; - .loc 16 367 0 - mov.f32 %f163, %f178; - mov.f32 %f178, %f201; - .loc 16 368 0 - mov.f32 %f150, %f165; - mov.f32 %f165, %f203; -$Lt_1_54530: - mov.f32 %f210, %f160; - mov.f32 %f211, 0f00000000; // 0 - setp.neu.ftz.f32 %p10, %f210, %f211; - @!%p10 bra $Lt_1_55298; - bra.uni $Lt_1_55554; -$Lt_1_55298: - mov.f32 %f212, 0f00000000; // 0 - setp.neu.ftz.f32 %p11, %f175, %f212; - @!%p11 bra $Lt_1_55554; - .loc 16 383 0 - mov.f32 %f160, %f175; - mov.f32 %f175, %f210; - .loc 16 384 0 - mov.f32 %f213, %f163; - mov.f32 %f163, %f178; - mov.f32 %f178, %f213; - .loc 16 385 0 - mov.f32 %f214, %f150; - mov.f32 %f150, %f165; - mov.f32 %f165, %f214; -$Lt_1_55554: -$Lt_1_55042: - .loc 16 390 0 - div.approx.ftz.f32 %f215, %f175, %f160; - mul.ftz.f32 %f216, %f163, %f215; - sub.ftz.f32 %f178, %f178, %f216; - .loc 16 391 0 - mul.ftz.f32 %f217, %f150, %f215; - sub.ftz.f32 %f165, %f165, %f217; - mov.f32 %f218, 0f00000000; // 0 - setp.eq.ftz.f32 %p12, %f178, %f218; - @!%p12 bra $Lt_1_56066; - .loc 16 394 0 - mov.s32 %r33, 2; - ld.param.u64 %rd41, [__cudaparm_kernel_sphere_ellipsoid_err_flag]; - st.global.s32 [%rd41+0], %r33; -$Lt_1_56066: - .loc 16 396 0 - div.approx.ftz.f32 %f219, %f165, %f178; - .loc 16 399 0 - mul.ftz.f32 %f220, %f219, %f163; - sub.ftz.f32 %f221, %f150, %f220; - div.approx.ftz.f32 %f222, %f221, %f160; - .loc 16 403 0 - mul.ftz.f32 %f223, %f222, %f141; - fma.rn.ftz.f32 %f224, %f149, %f219, %f223; - sub.ftz.f32 %f225, %f93, %f224; - div.approx.ftz.f32 %f226, %f225, %f130; - .loc 17 427 0 - mul.ftz.f32 %f227, %f222, %f88; - fma.rn.ftz.f32 %f228, %f87, %f226, %f227; - fma.rn.ftz.f32 %f229, %f164, %f219, %f228; - mov.f32 %f230, 0f3f000000; // 0.5 - mul.ftz.f32 %f231, %f229, %f230; - rsqrt.approx.ftz.f32 %f232, %f231; - .loc 16 299 0 - mov.f32 %f93, %f87; - .loc 16 300 0 - mul.ftz.f32 %f233, %f114, %f78; - mul.ftz.f32 %f234, %f116, %f79; - mul.ftz.f32 %f235, %f114, %f233; - mul.ftz.f32 %f236, %f123, %f77; - fma.rn.ftz.f32 %f237, %f123, %f236, %f235; - fma.rn.ftz.f32 %f238, %f234, %f116, %f237; - mov.f32 %f239, 0f3f800000; // 1 - add.ftz.f32 %f240, %f238, %f239; - mov.f32 %f130, %f240; - .loc 16 301 0 - mul.ftz.f32 %f241, %f233, %f137; - fma.rn.ftz.f32 %f242, %f236, %f134, %f241; - fma.rn.ftz.f32 %f243, %f234, %f136, %f242; - mov.f32 %f141, %f243; - .loc 16 302 0 - mul.ftz.f32 %f244, %f144, %f233; - fma.rn.ftz.f32 %f245, %f236, %f143, %f244; - fma.rn.ftz.f32 %f246, %f234, %f145, %f245; - mov.f32 %f149, %f246; - .loc 16 303 0 - mov.f32 %f150, %f88; - .loc 16 304 0 - mul.ftz.f32 %f247, %f134, %f77; - mul.ftz.f32 %f248, %f136, %f79; - mul.ftz.f32 %f249, %f137, %f78; - mul.ftz.f32 %f250, %f114, %f249; - fma.rn.ftz.f32 %f251, %f123, %f247, %f250; - fma.rn.ftz.f32 %f252, %f248, %f116, %f251; - mov.f32 %f157, %f252; - .loc 16 305 0 - mul.ftz.f32 %f253, %f137, %f249; - fma.rn.ftz.f32 %f254, %f134, %f247, %f253; - fma.rn.ftz.f32 %f255, %f248, %f136, %f254; - mov.f32 %f256, 0f3f800000; // 1 - add.ftz.f32 %f160, %f255, %f256; - .loc 16 306 0 - mul.ftz.f32 %f257, %f144, %f249; - fma.rn.ftz.f32 %f258, %f143, %f247, %f257; - fma.rn.ftz.f32 %f163, %f248, %f145, %f258; - .loc 16 307 0 - mov.f32 %f165, %f164; - .loc 16 308 0 - mul.ftz.f32 %f259, %f143, %f77; - mul.ftz.f32 %f260, %f144, %f78; - mul.ftz.f32 %f261, %f145, %f79; - mul.ftz.f32 %f262, %f114, %f260; - fma.rn.ftz.f32 %f263, %f123, %f259, %f262; - fma.rn.ftz.f32 %f264, %f116, %f261, %f263; - mov.f32 %f172, %f264; - .loc 16 309 0 - mul.ftz.f32 %f265, %f137, %f260; - fma.rn.ftz.f32 %f266, %f134, %f259, %f265; - fma.rn.ftz.f32 %f175, %f136, %f261, %f266; - .loc 16 310 0 - mul.ftz.f32 %f267, %f144, %f260; - fma.rn.ftz.f32 %f268, %f143, %f259, %f267; - fma.rn.ftz.f32 %f269, %f145, %f261, %f268; - mov.f32 %f270, 0f3f800000; // 1 - add.ftz.f32 %f178, %f269, %f270; - abs.ftz.f32 %f271, %f252; - abs.ftz.f32 %f272, %f240; - setp.gt.ftz.f32 %p13, %f271, %f272; - @!%p13 bra $Lt_1_56578; - .loc 16 314 0 - mov.f32 %f130, %f252; - mov.f32 %f157, %f240; - .loc 16 315 0 - mov.f32 %f141, %f160; - mov.f32 %f160, %f243; - .loc 16 316 0 - mov.f32 %f149, %f163; - mov.f32 %f163, %f246; - .loc 16 317 0 - mov.f32 %f93, %f88; - mov.f32 %f150, %f87; -$Lt_1_56578: - mov.f32 %f273, %f130; - abs.ftz.f32 %f274, %f273; - abs.ftz.f32 %f275, %f264; - setp.lt.ftz.f32 %p14, %f274, %f275; - @!%p14 bra $Lt_1_57090; - .loc 16 321 0 - mov.f32 %f130, %f264; - mov.f32 %f172, %f273; - .loc 16 322 0 - mov.f32 %f276, %f141; - mov.f32 %f141, %f175; - mov.f32 %f175, %f276; - .loc 16 323 0 - mov.f32 %f277, %f149; - mov.f32 %f149, %f178; - mov.f32 %f178, %f277; - .loc 16 324 0 - mov.f32 %f278, %f93; - mov.f32 %f93, %f164; - mov.f32 %f165, %f278; -$Lt_1_57090: - mov.f32 %f279, %f130; - mov.f32 %f280, 0f00000000; // 0 - setp.neu.ftz.f32 %p15, %f279, %f280; - @!%p15 bra $Lt_1_57858; - bra.uni $Lt_1_58626; -$Lt_1_57858: - mov.f32 %f281, 0f00000000; // 0 - setp.neu.ftz.f32 %p16, %f157, %f281; - @!%p16 bra $Lt_1_58370; - .loc 16 338 0 - mov.f32 %f130, %f157; - mov.f32 %f157, %f279; - .loc 16 339 0 - mov.f32 %f282, %f141; - mov.f32 %f141, %f160; - mov.f32 %f160, %f282; - .loc 16 340 0 - mov.f32 %f283, %f149; - mov.f32 %f149, %f163; - mov.f32 %f163, %f283; - .loc 16 341 0 - mov.f32 %f284, %f93; - mov.f32 %f93, %f150; - mov.f32 %f150, %f284; - bra.uni $Lt_1_58626; -$Lt_1_58370: - mov.f32 %f285, 0f00000000; // 0 - setp.neu.ftz.f32 %p17, %f172, %f285; - @!%p17 bra $Lt_1_58882; - .loc 16 346 0 - mov.f32 %f130, %f172; - mov.f32 %f172, %f279; - .loc 16 347 0 - mov.f32 %f286, %f141; - mov.f32 %f141, %f175; - mov.f32 %f175, %f286; - .loc 16 348 0 - mov.f32 %f287, %f149; - mov.f32 %f149, %f178; - mov.f32 %f178, %f287; - .loc 16 349 0 - mov.f32 %f288, %f93; - mov.f32 %f93, %f165; - mov.f32 %f165, %f288; - bra.uni $Lt_1_58626; -$Lt_1_58882: - .loc 16 352 0 - mov.s32 %r34, 2; - ld.param.u64 %rd42, [__cudaparm_kernel_sphere_ellipsoid_err_flag]; - st.global.s32 [%rd42+0], %r34; -$Lt_1_58626: -$Lt_1_58114: -$Lt_1_57602: - .loc 16 355 0 - div.approx.ftz.f32 %f289, %f157, %f130; - mul.ftz.f32 %f290, %f141, %f289; - sub.ftz.f32 %f291, %f160, %f290; - mov.f32 %f160, %f291; - .loc 16 356 0 - mul.ftz.f32 %f292, %f149, %f289; - sub.ftz.f32 %f293, %f163, %f292; - mov.f32 %f163, %f293; - .loc 16 357 0 - mul.ftz.f32 %f294, %f93, %f289; - sub.ftz.f32 %f295, %f150, %f294; - mov.f32 %f150, %f295; - .loc 16 359 0 - div.approx.ftz.f32 %f296, %f172, %f130; - mul.ftz.f32 %f297, %f141, %f296; - sub.ftz.f32 %f175, %f175, %f297; - .loc 16 360 0 - mul.ftz.f32 %f298, %f149, %f296; - sub.ftz.f32 %f178, %f178, %f298; - .loc 16 361 0 - mul.ftz.f32 %f299, %f93, %f296; - sub.ftz.f32 %f165, %f165, %f299; - abs.ftz.f32 %f300, %f291; - abs.ftz.f32 %f301, %f175; - setp.lt.ftz.f32 %p18, %f300, %f301; - @!%p18 bra $Lt_1_59138; - .loc 16 366 0 - mov.f32 %f160, %f175; - mov.f32 %f175, %f291; - .loc 16 367 0 - mov.f32 %f163, %f178; - mov.f32 %f178, %f293; - .loc 16 368 0 - mov.f32 %f150, %f165; - mov.f32 %f165, %f295; -$Lt_1_59138: - mov.f32 %f302, %f160; - mov.f32 %f303, 0f00000000; // 0 - setp.neu.ftz.f32 %p19, %f302, %f303; - @!%p19 bra $Lt_1_59906; - bra.uni $Lt_1_60162; -$Lt_1_59906: - mov.f32 %f304, 0f00000000; // 0 - setp.neu.ftz.f32 %p20, %f175, %f304; - @!%p20 bra $Lt_1_60162; - .loc 16 383 0 - mov.f32 %f160, %f175; - mov.f32 %f175, %f302; - .loc 16 384 0 - mov.f32 %f305, %f163; - mov.f32 %f163, %f178; - mov.f32 %f178, %f305; - .loc 16 385 0 - mov.f32 %f306, %f150; - mov.f32 %f150, %f165; - mov.f32 %f165, %f306; -$Lt_1_60162: -$Lt_1_59650: - .loc 16 390 0 - div.approx.ftz.f32 %f307, %f175, %f160; - mul.ftz.f32 %f308, %f163, %f307; - sub.ftz.f32 %f178, %f178, %f308; - .loc 16 391 0 - mul.ftz.f32 %f309, %f150, %f307; - sub.ftz.f32 %f165, %f165, %f309; - mov.f32 %f310, 0f00000000; // 0 - setp.eq.ftz.f32 %p21, %f178, %f310; - @!%p21 bra $Lt_1_60674; - .loc 16 394 0 - mov.s32 %r35, 2; - ld.param.u64 %rd43, [__cudaparm_kernel_sphere_ellipsoid_err_flag]; - st.global.s32 [%rd43+0], %r35; -$Lt_1_60674: - .loc 17 436 0 - div.approx.ftz.f32 %f311, %f165, %f178; - mul.ftz.f32 %f312, %f311, %f163; - sub.ftz.f32 %f313, %f150, %f312; - div.approx.ftz.f32 %f314, %f313, %f160; - mul.ftz.f32 %f315, %f314, %f141; - fma.rn.ftz.f32 %f316, %f149, %f311, %f315; - mul.ftz.f32 %f317, %f314, %f88; - sub.ftz.f32 %f318, %f93, %f316; - div.approx.ftz.f32 %f319, %f318, %f130; - fma.rn.ftz.f32 %f320, %f87, %f319, %f317; - fma.rn.ftz.f32 %f321, %f164, %f311, %f320; - add.ftz.f32 %f322, %f321, %f321; - .loc 17 444 0 - rcp.approx.ftz.f32 %f323, %f86; - sub.ftz.f32 %f324, %f323, %f232; - mov.f32 %f325, 0f40000000; // 2 - div.approx.ftz.f32 %f326, %f324, %f325; - mul.ftz.f32 %f327, %f324, %f324; - mul.ftz.f32 %f328, %f324, %f327; - add.ftz.f32 %f329, %f326, %f72; - add.ftz.f32 %f330, %f326, %f70; - add.ftz.f32 %f331, %f326, %f71; - mul.ftz.f32 %f332, %f330, %f331; - mul.ftz.f32 %f333, %f329, %f332; - mul.ftz.f32 %f334, %f328, %f333; - .loc 17 446 0 - mul.ftz.f32 %f335, %f70, %f71; - mul.ftz.f32 %f336, %f335, %f72; - div.approx.ftz.f32 %f337, %f89, %f324; - mul.ftz.f32 %f338, %f337, %f322; - mov.f32 %f339, 0f3f800000; // 1 - mov.f32 %f340, 0f40400000; // 3 - fma.rn.ftz.f32 %f341, %f340, %f338, %f339; - mul.ftz.f32 %f342, %f336, %f341; - .loc 17 450 0 - div.approx.ftz.f32 %f343, %f324, %f17; - add.ftz.f32 %f344, %f343, %f72; - add.ftz.f32 %f345, %f343, %f70; - add.ftz.f32 %f346, %f343, %f71; - mul.ftz.f32 %f347, %f345, %f346; - mul.ftz.f32 %f348, %f344, %f347; - mul.ftz.f32 %f349, %f328, %f348; - .loc 17 452 0 - mov.f32 %f350, 0f3f800000; // 1 - mov.f32 %f351, 0f3f4db6db; // 0.803571 - fma.rn.ftz.f32 %f352, %f351, %f338, %f350; - mul.ftz.f32 %f353, %f336, %f352; - .loc 17 454 0 - mul.ftz.f32 %f354, %f337, %f337; - mul.ftz.f32 %f355, %f337, %f354; - mul.ftz.f32 %f356, %f355, %f355; - .loc 17 457 0 - mul.ftz.f32 %f357, %f89, %f89; - mov.f32 %f358, 0f41000000; // 8 - div.approx.ftz.f32 %f359, %f334, %f358; - mov.f32 %f360, 0f42700000; // 60 - div.approx.ftz.f32 %f361, %f349, %f360; - mul.ftz.f32 %f362, %f357, %f89; - div.approx.ftz.f32 %f363, %f342, %f359; - div.approx.ftz.f32 %f364, %f353, %f361; - mul.ftz.f32 %f365, %f363, %f92; - mul.ftz.f32 %f366, %f364, %f92; - mul.ftz.f32 %f367, %f362, %f365; - mul.ftz.f32 %f368, %f362, %f366; - mul.ftz.f32 %f369, %f367, %f41; - mul.ftz.f32 %f370, %f368, %f356; - mul.ftz.f32 %f371, %f370, %f45; - add.ftz.f32 %f372, %f369, %f371; - add.ftz.f32 %f65, %f65, %f372; - .loc 17 464 0 - mov.f32 %f373, 0f40800000; // 4 - mul.ftz.f32 %f374, %f319, %f373; - .loc 17 471 0 - mov.f32 %f375, 0f40400000; // 3 - div.approx.ftz.f32 %f376, %f375, %f324; - add.ftz.f32 %f377, %f70, %f70; - add.ftz.f32 %f378, %f324, %f377; - rcp.approx.ftz.f32 %f379, %f378; - add.ftz.f32 %f380, %f71, %f71; - add.ftz.f32 %f381, %f324, %f380; - rcp.approx.ftz.f32 %f382, %f381; - add.ftz.f32 %f383, %f379, %f382; - add.ftz.f32 %f384, %f72, %f72; - add.ftz.f32 %f385, %f324, %f384; - rcp.approx.ftz.f32 %f386, %f385; - add.ftz.f32 %f387, %f383, %f386; - add.ftz.f32 %f388, %f376, %f387; - .loc 17 476 0 - mul.ftz.f32 %f389, %f89, %f322; - mov.f32 %f390, 0f40400000; // 3 - fma.rn.ftz.f32 %f391, %f390, %f389, %f324; - rcp.approx.ftz.f32 %f392, %f391; - rcp.approx.ftz.f32 %f393, %f324; - sub.ftz.f32 %f394, %f393, %f392; - add.ftz.f32 %f395, %f388, %f394; - .loc 17 479 0 - fma.rn.ftz.f32 %f396, %f17, %f70, %f324; - rcp.approx.ftz.f32 %f397, %f396; - fma.rn.ftz.f32 %f398, %f17, %f71, %f324; - rcp.approx.ftz.f32 %f399, %f398; - add.ftz.f32 %f400, %f397, %f399; - fma.rn.ftz.f32 %f401, %f17, %f72, %f324; - rcp.approx.ftz.f32 %f402, %f401; - add.ftz.f32 %f403, %f400, %f402; - add.ftz.f32 %f404, %f376, %f403; - .loc 17 490 0 - mul.ftz.f32 %f405, %f87, %f87; - neg.ftz.f32 %f406, %f405; - mov.f32 %f407, %f406; - .loc 17 491 0 - mul.ftz.f32 %f408, %f88, %f87; - neg.ftz.f32 %f409, %f408; - mov.f32 %f410, %f409; - .loc 17 492 0 - mul.ftz.f32 %f411, %f164, %f87; - neg.ftz.f32 %f412, %f411; - mov.f32 %f413, %f412; - .loc 17 493 0 - mov.f32 %f414, 0f3f800000; // 1 - sub.ftz.f32 %f415, %f414, %f405; - mov.f32 %f416, %f415; - .loc 17 494 0 - mul.ftz.f32 %f417, %f86, %f415; - mov.f32 %f418, %f417; - .loc 17 495 0 - mov.f32 %f419, %f410; - mul.ftz.f32 %f420, %f419, %f86; - mov.f32 %f421, %f420; - .loc 17 496 0 - mov.f32 %f422, %f413; - mul.ftz.f32 %f423, %f422, %f86; - mov.f32 %f424, %f423; - .loc 17 500 0 - mul.ftz.f32 %f425, %f232, %f232; - mov.f32 %f426, 0f3f4db6db; // 0.803571 - mul.ftz.f32 %f427, %f89, %f426; - mov.f32 %f428, 0f40800000; // 4 - mul.ftz.f32 %f429, %f311, %f428; - mul.ftz.f32 %f430, %f425, %f232; - mov.f32 %f431, 0f3f000000; // 0.5 - mul.ftz.f32 %f432, %f430, %f431; - mul.ftz.f32 %f433, %f432, %f222; - mul.ftz.f32 %f434, %f432, %f226; - mul.ftz.f32 %f435, %f432, %f219; - mov.f32 %f436, 0f40800000; // 4 - mul.ftz.f32 %f437, %f314, %f436; - mul.ftz.f32 %f438, %f433, %f420; - mul.ftz.f32 %f439, %f437, %f420; - mov.f32 %f440, 0f40e00000; // 7 - div.approx.ftz.f32 %f441, %f440, %f324; - mov.f32 %f442, 0f3f4db6db; // 0.803571 - fma.rn.ftz.f32 %f443, %f442, %f389, %f324; - rcp.approx.ftz.f32 %f444, %f443; - fma.rn.ftz.f32 %f445, %f434, %f417, %f438; - fma.rn.ftz.f32 %f446, %f374, %f417, %f439; - sub.ftz.f32 %f447, %f441, %f444; - mul.ftz.f32 %f448, %f427, %f444; - fma.rn.ftz.f32 %f449, %f435, %f423, %f445; - fma.rn.ftz.f32 %f450, %f429, %f423, %f446; - add.ftz.f32 %f451, %f447, %f404; - add.ftz.f32 %f452, %f449, %f87; - mul.ftz.f32 %f453, %f451, %f452; - mul.ftz.f32 %f454, %f448, %f450; - sub.ftz.f32 %f455, %f454, %f453; - .loc 17 501 0 - mov.f32 %f456, 0f40400000; // 3 - mul.ftz.f32 %f457, %f89, %f456; - mul.ftz.f32 %f458, %f457, %f392; - mul.ftz.f32 %f459, %f371, %f455; - mul.ftz.f32 %f460, %f452, %f395; - mul.ftz.f32 %f461, %f458, %f450; - sub.ftz.f32 %f462, %f461, %f460; - fma.rn.ftz.f32 %f463, %f369, %f462, %f459; - .loc 17 503 0 - add.ftz.f32 %f64, %f463, %f64; - @!%p3 bra $Lt_1_61698; - .loc 17 505 0 - mov.f32 %f464, %f47; - mul.ftz.f32 %f465, %f81, %f463; - sub.ftz.f32 %f466, %f464, %f465; - mov.f32 %f47, %f466; -$Lt_1_61698: - .loc 17 490 0 - mov.f32 %f467, %f409; - .loc 17 491 0 - mul.ftz.f32 %f468, %f88, %f88; - neg.ftz.f32 %f469, %f468; - mov.f32 %f470, %f469; - .loc 17 492 0 - mul.ftz.f32 %f471, %f164, %f88; - neg.ftz.f32 %f472, %f471; - mov.f32 %f473, %f472; - .loc 17 493 0 - mov.f32 %f474, 0f3f800000; // 1 - sub.ftz.f32 %f475, %f474, %f468; - mov.f32 %f476, %f475; - .loc 17 494 0 - mov.f32 %f477, %f467; - mul.ftz.f32 %f478, %f477, %f86; - mov.f32 %f479, %f478; - .loc 17 495 0 - mul.ftz.f32 %f480, %f86, %f475; - mov.f32 %f481, %f480; - .loc 17 496 0 - mov.f32 %f482, %f473; - mul.ftz.f32 %f483, %f482, %f86; - mov.f32 %f484, %f483; - .loc 17 500 0 - mul.ftz.f32 %f485, %f433, %f480; - mul.ftz.f32 %f486, %f437, %f480; - fma.rn.ftz.f32 %f487, %f434, %f478, %f485; - fma.rn.ftz.f32 %f488, %f374, %f478, %f486; - fma.rn.ftz.f32 %f489, %f435, %f483, %f487; - fma.rn.ftz.f32 %f490, %f429, %f483, %f488; - add.ftz.f32 %f491, %f489, %f88; - mul.ftz.f32 %f492, %f451, %f491; - mul.ftz.f32 %f493, %f448, %f490; - sub.ftz.f32 %f494, %f493, %f492; - .loc 17 501 0 - mul.ftz.f32 %f495, %f371, %f494; - mul.ftz.f32 %f496, %f491, %f395; - mul.ftz.f32 %f497, %f458, %f490; - sub.ftz.f32 %f498, %f497, %f496; - fma.rn.ftz.f32 %f463, %f369, %f498, %f495; - .loc 17 507 0 - add.ftz.f32 %f63, %f463, %f63; - @!%p3 bra $Lt_1_65282; - .loc 17 509 0 - mov.f32 %f499, %f49; - mul.ftz.f32 %f500, %f80, %f463; - sub.ftz.f32 %f501, %f499, %f500; - mov.f32 %f49, %f501; - .loc 17 510 0 - mov.f32 %f502, %f53; - mul.ftz.f32 %f503, %f81, %f463; - sub.ftz.f32 %f504, %f502, %f503; - mov.f32 %f53, %f504; -$Lt_1_65282: - .loc 17 490 0 - mov.f32 %f505, %f412; - .loc 17 491 0 - mov.f32 %f506, %f472; - .loc 17 492 0 - mul.ftz.f32 %f507, %f164, %f164; - neg.ftz.f32 %f508, %f507; - mov.f32 %f509, %f508; - .loc 17 493 0 - mov.f32 %f510, 0f3f800000; // 1 - sub.ftz.f32 %f511, %f510, %f507; - mov.f32 %f512, %f511; - .loc 17 494 0 - mov.f32 %f513, %f505; - mul.ftz.f32 %f514, %f513, %f86; - mov.f32 %f515, %f514; - .loc 17 495 0 - mov.f32 %f516, %f506; - mul.ftz.f32 %f517, %f516, %f86; - mov.f32 %f518, %f517; - .loc 17 496 0 - mul.ftz.f32 %f519, %f86, %f511; - mov.f32 %f520, %f519; - .loc 17 500 0 - mul.ftz.f32 %f521, %f433, %f517; - mul.ftz.f32 %f522, %f437, %f517; - fma.rn.ftz.f32 %f523, %f434, %f514, %f521; - fma.rn.ftz.f32 %f524, %f374, %f514, %f522; - fma.rn.ftz.f32 %f525, %f435, %f519, %f523; - fma.rn.ftz.f32 %f526, %f429, %f519, %f524; - add.ftz.f32 %f527, %f525, %f164; - mul.ftz.f32 %f528, %f527, %f451; - mul.ftz.f32 %f529, %f448, %f526; - sub.ftz.f32 %f530, %f529, %f528; - .loc 17 501 0 - mul.ftz.f32 %f531, %f371, %f530; - mul.ftz.f32 %f532, %f527, %f395; - mul.ftz.f32 %f533, %f458, %f526; - sub.ftz.f32 %f534, %f533, %f532; - fma.rn.ftz.f32 %f463, %f369, %f534, %f531; - .loc 17 513 0 - add.ftz.f32 %f62, %f463, %f62; - @!%p3 bra $Lt_1_68354; - .loc 17 515 0 - mov.f32 %f535, %f51; - mul.ftz.f32 %f536, %f82, %f463; - sub.ftz.f32 %f537, %f535, %f536; - mov.f32 %f51, %f537; - .loc 17 516 0 - mov.f32 %f538, %f55; - mul.ftz.f32 %f539, %f81, %f463; - sub.ftz.f32 %f540, %f538, %f539; - mov.f32 %f55, %f540; - .loc 17 517 0 - mul.ftz.f32 %f541, %f80, %f463; - sub.ftz.f32 %f56, %f56, %f541; - mov.f32 %f57, %f56; -$Lt_1_68354: - mul.lo.s32 %r36, %r16, %r2; - cvt.s64.s32 %rd44, %r36; - mul.wide.s32 %rd45, %r36, 4; - add.u64 %rd17, %rd17, %rd45; - setp.gt.u64 %p22, %rd20, %rd17; - @%p22 bra $Lt_1_51714; - bra.uni $Lt_1_51202; -$Lt_1_75010: - mov.f32 %f62, 0f00000000; // 0 - mov.f32 %f63, 0f00000000; // 0 - mov.f32 %f64, 0f00000000; // 0 - mov.f32 %f65, 0f00000000; // 0 -$Lt_1_51202: - mov.u32 %r37, 1; - setp.le.s32 %p23, %r2, %r37; - @%p23 bra $Lt_1_71170; - .loc 17 522 0 - mov.u64 %rd46, __cuda___cuda_local_var_33377_55_non_const_red_acc3852; - cvt.s64.s32 %rd47, %r3; - mul.wide.s32 %rd48, %r3, 4; - add.u64 %rd49, %rd46, %rd48; - mov.f32 %f542, %f64; - st.shared.f32 [%rd49+0], %f542; - mov.f32 %f543, %f63; - st.shared.f32 [%rd49+512], %f543; - mov.f32 %f544, %f62; - st.shared.f32 [%rd49+1024], %f544; - mov.f32 %f545, %f65; - st.shared.f32 [%rd49+1536], %f545; - shr.s32 %r38, %r2, 31; - mov.s32 %r39, 1; - and.b32 %r40, %r38, %r39; - add.s32 %r41, %r40, %r2; - shr.s32 %r42, %r41, 1; - mov.s32 %r43, %r42; - mov.u32 %r44, 0; - setp.ne.u32 %p24, %r42, %r44; - @!%p24 bra $Lt_1_69634; -$Lt_1_70146: - setp.ge.u32 %p25, %r18, %r43; - @%p25 bra $Lt_1_70402; - add.u32 %r45, %r3, %r43; - cvt.u64.u32 %rd50, %r45; - mul.wide.u32 %rd51, %r45, 4; - add.u64 %rd52, %rd46, %rd51; - ld.shared.f32 %f546, [%rd52+0]; - add.ftz.f32 %f542, %f546, %f542; - st.shared.f32 [%rd49+0], %f542; - ld.shared.f32 %f547, [%rd52+512]; - add.ftz.f32 %f543, %f547, %f543; - st.shared.f32 [%rd49+512], %f543; - ld.shared.f32 %f548, [%rd52+1024]; - add.ftz.f32 %f544, %f548, %f544; - st.shared.f32 [%rd49+1024], %f544; - ld.shared.f32 %f549, [%rd52+1536]; - add.ftz.f32 %f545, %f549, %f545; - st.shared.f32 [%rd49+1536], %f545; -$Lt_1_70402: - shr.u32 %r43, %r43, 1; - mov.u32 %r46, 0; - setp.ne.u32 %p26, %r43, %r46; - @%p26 bra $Lt_1_70146; -$Lt_1_69634: - mov.f32 %f64, %f542; - mov.f32 %f63, %f543; - mov.f32 %f62, %f544; - mov.f32 %f65, %f545; - ld.param.s32 %r47, [__cudaparm_kernel_sphere_ellipsoid_vflag]; - mov.u32 %r48, 0; - setp.le.s32 %p27, %r47, %r48; - @%p27 bra $Lt_1_71170; - mov.f32 %f542, %f47; - st.shared.f32 [%rd49+0], %f542; - mov.f32 %f543, %f49; - st.shared.f32 [%rd49+512], %f543; - mov.f32 %f544, %f51; - st.shared.f32 [%rd49+1024], %f544; - mov.f32 %f545, %f53; - st.shared.f32 [%rd49+1536], %f545; - mov.f32 %f550, %f55; - st.shared.f32 [%rd49+2048], %f550; - mov.f32 %f551, %f56; - st.shared.f32 [%rd49+2560], %f551; - mov.s32 %r49, %r42; - @!%p24 bra $Lt_1_71682; -$Lt_1_72194: - setp.ge.u32 %p28, %r18, %r49; - @%p28 bra $Lt_1_72450; - add.u32 %r50, %r3, %r49; - cvt.u64.u32 %rd53, %r50; - mul.wide.u32 %rd54, %r50, 4; - add.u64 %rd55, %rd46, %rd54; - ld.shared.f32 %f552, [%rd55+0]; - add.ftz.f32 %f542, %f552, %f542; - st.shared.f32 [%rd49+0], %f542; - ld.shared.f32 %f553, [%rd55+512]; - add.ftz.f32 %f543, %f553, %f543; - st.shared.f32 [%rd49+512], %f543; - ld.shared.f32 %f554, [%rd55+1024]; - add.ftz.f32 %f544, %f554, %f544; - st.shared.f32 [%rd49+1024], %f544; - ld.shared.f32 %f555, [%rd55+1536]; - add.ftz.f32 %f545, %f555, %f545; - st.shared.f32 [%rd49+1536], %f545; - ld.shared.f32 %f556, [%rd55+2048]; - add.ftz.f32 %f550, %f556, %f550; - st.shared.f32 [%rd49+2048], %f550; - ld.shared.f32 %f557, [%rd55+2560]; - add.ftz.f32 %f551, %f557, %f551; - st.shared.f32 [%rd49+2560], %f551; -$Lt_1_72450: - shr.u32 %r49, %r49, 1; - mov.u32 %r51, 0; - setp.ne.u32 %p29, %r49, %r51; - @%p29 bra $Lt_1_72194; -$Lt_1_71682: - mov.f32 %f47, %f542; - mov.f32 %f49, %f543; - mov.f32 %f51, %f544; - mov.f32 %f53, %f545; - mov.f32 %f55, %f550; - mov.f32 %f57, %f551; -$Lt_1_71170: -$Lt_1_69122: - mov.u32 %r52, 0; - setp.ne.s32 %p30, %r18, %r52; - @%p30 bra $Lt_1_73218; - ld.param.u64 %rd56, [__cudaparm_kernel_sphere_ellipsoid___val_paramengv]; - add.u64 %rd57, %rd56, %rd3; - ld.param.s32 %r53, [__cudaparm_kernel_sphere_ellipsoid_eflag]; - mov.u32 %r54, 0; - setp.le.s32 %p31, %r53, %r54; - @%p31 bra $Lt_1_73730; - st.global.f32 [%rd57+0], %f65; - cvt.s64.s32 %rd58, %r12; - mul.wide.s32 %rd59, %r12, 4; - add.u64 %rd57, %rd57, %rd59; -$Lt_1_73730: - ld.param.s32 %r55, [__cudaparm_kernel_sphere_ellipsoid_vflag]; - mov.u32 %r56, 0; - setp.le.s32 %p32, %r55, %r56; - @%p32 bra $Lt_1_74242; - mov.f32 %f558, %f47; - st.global.f32 [%rd57+0], %f558; - cvt.s64.s32 %rd60, %r12; - mul.wide.s32 %rd61, %r12, 4; - add.u64 %rd62, %rd61, %rd57; - mov.f32 %f559, %f49; - st.global.f32 [%rd62+0], %f559; - add.u64 %rd63, %rd61, %rd62; - mov.f32 %f560, %f51; - st.global.f32 [%rd63+0], %f560; - add.u64 %rd64, %rd61, %rd63; - mov.f32 %f561, %f53; - st.global.f32 [%rd64+0], %f561; - add.u64 %rd57, %rd61, %rd64; - mov.f32 %f562, %f55; - st.global.f32 [%rd57+0], %f562; - mov.f32 %f563, %f57; - add.u64 %rd65, %rd61, %rd57; - st.global.f32 [%rd65+0], %f563; -$Lt_1_74242: - ld.param.u64 %rd66, [__cudaparm_kernel_sphere_ellipsoid_ans]; - mul.lo.u64 %rd67, %rd2, 16; - add.u64 %rd68, %rd66, %rd67; - mov.f32 %f564, %f565; - st.global.v4.f32 [%rd68+0], {%f64,%f63,%f62,%f564}; -$Lt_1_73218: -$Lt_1_50690: - .loc 17 525 0 - exit; -$LDWend_kernel_sphere_ellipsoid: - } // kernel_sphere_ellipsoid - - .entry kernel_lj ( - .param .u64 __cudaparm_kernel_lj_x_, - .param .u64 __cudaparm_kernel_lj_lj1, - .param .u64 __cudaparm_kernel_lj_lj3, - .param .s32 __cudaparm_kernel_lj_lj_types, - .param .u64 __cudaparm_kernel_lj_gum, - .param .s32 __cudaparm_kernel_lj_stride, - .param .u64 __cudaparm_kernel_lj_dev_ij, - .param .u64 __cudaparm_kernel_lj_ans, - .param .u64 __cudaparm_kernel_lj___val_paramengv, - .param .u64 __cudaparm_kernel_lj_err_flag, - .param .s32 __cudaparm_kernel_lj_eflag, - .param .s32 __cudaparm_kernel_lj_vflag, - .param .s32 __cudaparm_kernel_lj_start, - .param .s32 __cudaparm_kernel_lj_inum, - .param .s32 __cudaparm_kernel_lj_t_per_atom) - { - .reg .u32 %r<55>; - .reg .u64 %rd<60>; - .reg .f32 %f<115>; - .reg .pred %p<19>; - .shared .align 16 .b8 __cuda___cuda_local_var_33394_33_non_const_sp_lj7028[16]; - .shared .align 4 .b8 __cuda___cuda_local_var_33459_55_non_const_red_acc7044[3072]; - // __cuda_local_var_33405_9_non_const_virial = 16 - .loc 17 534 0 -$LDWbegin_kernel_lj: - .loc 17 540 0 - ld.param.u64 %rd1, [__cudaparm_kernel_lj_gum]; - ldu.global.f32 %f1, [%rd1+0]; - .loc 17 541 0 - ld.global.f32 %f2, [%rd1+4]; - .loc 17 542 0 - ld.global.f32 %f3, [%rd1+8]; - .loc 17 543 0 - ld.global.f32 %f4, [%rd1+12]; - st.shared.v4.f32 [__cuda___cuda_local_var_33394_33_non_const_sp_lj7028+0], {%f1,%f2,%f3,%f4}; - .loc 17 552 0 - mov.f32 %f5, 0f00000000; // 0 - mov.f32 %f6, %f5; - mov.f32 %f7, 0f00000000; // 0 - mov.f32 %f8, %f7; - mov.f32 %f9, 0f00000000; // 0 - mov.f32 %f10, %f9; - mov.f32 %f11, 0f00000000; // 0 - mov.f32 %f12, %f11; - mov.f32 %f13, 0f00000000; // 0 - mov.f32 %f14, %f13; - mov.f32 %f15, 0f00000000; // 0 - mov.f32 %f16, %f15; - ld.param.s32 %r1, [__cudaparm_kernel_lj_t_per_atom]; - cvt.s32.u32 %r2, %tid.x; - div.s32 %r3, %r2, %r1; - cvt.s32.u32 %r4, %ntid.x; - div.s32 %r5, %r4, %r1; - cvt.s32.u32 %r6, %ctaid.x; - mul.lo.s32 %r7, %r6, %r5; - add.s32 %r8, %r3, %r7; - ld.param.s32 %r9, [__cudaparm_kernel_lj_start]; - add.s32 %r10, %r9, %r8; - ld.param.s32 %r11, [__cudaparm_kernel_lj_inum]; - setp.ge.s32 %p1, %r10, %r11; - @%p1 bra $Lt_2_25346; - .loc 17 557 0 - cvt.s64.s32 %rd2, %r10; - mul.wide.s32 %rd3, %r10, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_lj_dev_ij]; - add.u64 %rd5, %rd4, %rd3; - ld.global.s32 %r12, [%rd5+0]; - ld.param.s32 %r13, [__cudaparm_kernel_lj_stride]; - cvt.s64.s32 %rd6, %r13; - mul.wide.s32 %rd7, %r13, 4; - add.u64 %rd8, %rd7, %rd5; - ld.global.s32 %r14, [%rd8+0]; - .loc 17 560 0 - ld.param.u64 %rd9, [__cudaparm_kernel_lj_x_]; - cvt.s64.s32 %rd10, %r12; - mul.wide.s32 %rd11, %r12, 16; - add.u64 %rd12, %rd9, %rd11; - ld.global.v4.f32 {%f17,%f18,%f19,%f20}, [%rd12+0]; - .loc 17 561 0 - cvt.s32.s64 %r15, %rd6; - sub.s32 %r16, %r1, 1; - and.b32 %r17, %r16, %r2; - add.u64 %rd13, %rd7, %rd8; - mul.lo.s32 %r18, %r15, %r17; - cvt.s64.s32 %rd14, %r18; - mul.wide.s32 %rd15, %r18, 4; - add.u64 %rd16, %rd13, %rd15; - mov.s64 %rd17, %rd16; - mul.lo.s32 %r19, %r15, %r14; - cvt.s64.s32 %rd18, %r19; - mul.wide.s32 %rd19, %r19, 4; - add.u64 %rd20, %rd13, %rd19; - setp.ge.u64 %p2, %rd16, %rd20; - @%p2 bra $Lt_2_26882; - cvt.rzi.ftz.s32.f32 %r20, %f20; - ld.param.s32 %r21, [__cudaparm_kernel_lj_lj_types]; - mul.lo.s32 %r22, %r21, %r20; - ld.param.u64 %rd21, [__cudaparm_kernel_lj_lj1]; - mov.f32 %f21, 0f00000000; // 0 - mov.f32 %f22, 0f00000000; // 0 - mov.f32 %f23, 0f00000000; // 0 - mov.f32 %f24, 0f00000000; // 0 - mov.u64 %rd22, __cuda___cuda_local_var_33394_33_non_const_sp_lj7028; -$Lt_2_19714: - // Loop body line 561, nesting depth: 1, estimated iterations: unknown - .loc 17 566 0 - ld.global.s32 %r23, [%rd17+0]; - .loc 17 567 0 - shr.s32 %r24, %r23, 30; - and.b32 %r25, %r24, 3; - cvt.s64.s32 %rd23, %r25; - mul.wide.s32 %rd24, %r25, 4; - add.u64 %rd25, %rd22, %rd24; - ld.shared.f32 %f25, [%rd25+0]; - .loc 17 570 0 - and.b32 %r26, %r23, 1073741823; - cvt.s64.s32 %rd26, %r26; - mul.wide.s32 %rd27, %r26, 16; - add.u64 %rd28, %rd9, %rd27; - ld.global.v4.f32 {%f26,%f27,%f28,%f29}, [%rd28+0]; - .loc 17 566 0 - cvt.rzi.ftz.s32.f32 %r27, %f29; - sub.ftz.f32 %f30, %f18, %f27; - sub.ftz.f32 %f31, %f17, %f26; - sub.ftz.f32 %f32, %f19, %f28; - mul.ftz.f32 %f33, %f30, %f30; - fma.rn.ftz.f32 %f34, %f31, %f31, %f33; - fma.rn.ftz.f32 %f35, %f32, %f32, %f34; - add.s32 %r28, %r27, %r22; - cvt.s64.s32 %rd29, %r28; - mul.wide.s32 %rd30, %r28, 16; - add.u64 %rd31, %rd30, %rd21; - ld.global.f32 %f36, [%rd31+8]; - setp.gt.ftz.f32 %p3, %f36, %f35; - @!%p3 bra $Lt_2_27138; - ld.global.f32 %f37, [%rd31+12]; - mov.f32 %f38, 0f00000000; // 0 - setp.eq.ftz.f32 %p4, %f37, %f38; - @!%p4 bra $Lt_2_27138; - .loc 17 584 0 - rcp.approx.ftz.f32 %f39, %f35; - mul.ftz.f32 %f40, %f39, %f39; - mul.ftz.f32 %f41, %f39, %f40; - mul.ftz.f32 %f42, %f39, %f41; - ld.global.v2.f32 {%f43,%f44}, [%rd31+0]; - mul.ftz.f32 %f45, %f43, %f41; - sub.ftz.f32 %f46, %f45, %f44; - mul.ftz.f32 %f47, %f42, %f46; - mul.ftz.f32 %f48, %f25, %f47; - .loc 17 586 0 - fma.rn.ftz.f32 %f23, %f31, %f48, %f23; - .loc 17 587 0 - fma.rn.ftz.f32 %f22, %f30, %f48, %f22; - .loc 17 588 0 - fma.rn.ftz.f32 %f21, %f32, %f48, %f21; - ld.param.s32 %r29, [__cudaparm_kernel_lj_eflag]; - mov.u32 %r30, 0; - setp.le.s32 %p5, %r29, %r30; - @%p5 bra $Lt_2_19970; - .loc 17 592 0 - ld.param.u64 %rd32, [__cudaparm_kernel_lj_lj3]; - add.u64 %rd33, %rd32, %rd30; - ld.global.v4.f32 {%f49,%f50,%f51,_}, [%rd33+0]; - mul.ftz.f32 %f52, %f49, %f41; - sub.ftz.f32 %f53, %f52, %f50; - mul.ftz.f32 %f54, %f41, %f53; - sub.ftz.f32 %f55, %f54, %f51; - fma.rn.ftz.f32 %f24, %f25, %f55, %f24; -$Lt_2_19970: - ld.param.s32 %r31, [__cudaparm_kernel_lj_vflag]; - mov.u32 %r32, 0; - setp.le.s32 %p6, %r31, %r32; - @%p6 bra $Lt_2_27138; - .loc 17 595 0 - mov.f32 %f56, %f6; - mul.ftz.f32 %f57, %f31, %f31; - fma.rn.ftz.f32 %f58, %f48, %f57, %f56; - mov.f32 %f6, %f58; - .loc 17 596 0 - mov.f32 %f59, %f8; - fma.rn.ftz.f32 %f60, %f48, %f33, %f59; - mov.f32 %f8, %f60; - .loc 17 597 0 - mov.f32 %f61, %f10; - mul.ftz.f32 %f62, %f32, %f32; - fma.rn.ftz.f32 %f63, %f48, %f62, %f61; - mov.f32 %f10, %f63; - .loc 17 598 0 - mov.f32 %f64, %f12; - mul.ftz.f32 %f65, %f30, %f31; - fma.rn.ftz.f32 %f66, %f48, %f65, %f64; - mov.f32 %f12, %f66; - .loc 17 599 0 - mov.f32 %f67, %f14; - mul.ftz.f32 %f68, %f31, %f32; - fma.rn.ftz.f32 %f69, %f48, %f68, %f67; - mov.f32 %f14, %f69; - .loc 17 600 0 - mul.ftz.f32 %f70, %f30, %f32; - fma.rn.ftz.f32 %f15, %f48, %f70, %f15; - mov.f32 %f16, %f15; -$Lt_2_27138: -$L_2_18178: - .loc 17 594 0 - mul.lo.s32 %r33, %r15, %r1; - cvt.s64.s32 %rd34, %r33; - mul.wide.s32 %rd35, %r33, 4; - add.u64 %rd17, %rd17, %rd35; - setp.gt.u64 %p7, %rd20, %rd17; - @%p7 bra $Lt_2_19714; - bra.uni $Lt_2_19202; -$Lt_2_26882: - mov.f32 %f21, 0f00000000; // 0 - mov.f32 %f22, 0f00000000; // 0 - mov.f32 %f23, 0f00000000; // 0 - mov.f32 %f24, 0f00000000; // 0 -$Lt_2_19202: - mov.u32 %r34, 1; - setp.le.s32 %p8, %r1, %r34; - @%p8 bra $Lt_2_23298; - .loc 17 604 0 - mov.u64 %rd36, __cuda___cuda_local_var_33459_55_non_const_red_acc7044; - cvt.s64.s32 %rd37, %r2; - mul.wide.s32 %rd38, %r2, 4; - add.u64 %rd39, %rd36, %rd38; - mov.f32 %f71, %f23; - st.shared.f32 [%rd39+0], %f71; - mov.f32 %f72, %f22; - st.shared.f32 [%rd39+512], %f72; - mov.f32 %f73, %f21; - st.shared.f32 [%rd39+1024], %f73; - mov.f32 %f74, %f24; - st.shared.f32 [%rd39+1536], %f74; - shr.s32 %r35, %r1, 31; - mov.s32 %r36, 1; - and.b32 %r37, %r35, %r36; - add.s32 %r38, %r37, %r1; - shr.s32 %r39, %r38, 1; - mov.s32 %r40, %r39; - mov.u32 %r41, 0; - setp.ne.u32 %p9, %r39, %r41; - @!%p9 bra $Lt_2_21762; -$Lt_2_22274: - setp.ge.u32 %p10, %r17, %r40; - @%p10 bra $Lt_2_22530; - add.u32 %r42, %r2, %r40; - cvt.u64.u32 %rd40, %r42; - mul.wide.u32 %rd41, %r42, 4; - add.u64 %rd42, %rd36, %rd41; - ld.shared.f32 %f75, [%rd42+0]; - add.ftz.f32 %f71, %f75, %f71; - st.shared.f32 [%rd39+0], %f71; - ld.shared.f32 %f76, [%rd42+512]; - add.ftz.f32 %f72, %f76, %f72; - st.shared.f32 [%rd39+512], %f72; - ld.shared.f32 %f77, [%rd42+1024]; - add.ftz.f32 %f73, %f77, %f73; - st.shared.f32 [%rd39+1024], %f73; - ld.shared.f32 %f78, [%rd42+1536]; - add.ftz.f32 %f74, %f78, %f74; - st.shared.f32 [%rd39+1536], %f74; -$Lt_2_22530: - shr.u32 %r40, %r40, 1; - mov.u32 %r43, 0; - setp.ne.u32 %p11, %r40, %r43; - @%p11 bra $Lt_2_22274; -$Lt_2_21762: - mov.f32 %f23, %f71; - mov.f32 %f22, %f72; - mov.f32 %f21, %f73; - mov.f32 %f24, %f74; - ld.param.s32 %r44, [__cudaparm_kernel_lj_vflag]; - mov.u32 %r45, 0; - setp.le.s32 %p12, %r44, %r45; - @%p12 bra $Lt_2_23298; - mov.f32 %f71, %f6; - st.shared.f32 [%rd39+0], %f71; - mov.f32 %f72, %f8; - st.shared.f32 [%rd39+512], %f72; - mov.f32 %f73, %f10; - st.shared.f32 [%rd39+1024], %f73; - mov.f32 %f74, %f12; - st.shared.f32 [%rd39+1536], %f74; - mov.f32 %f79, %f14; - st.shared.f32 [%rd39+2048], %f79; - mov.f32 %f80, %f15; - st.shared.f32 [%rd39+2560], %f80; - mov.s32 %r46, %r39; - @!%p9 bra $Lt_2_23810; -$Lt_2_24322: - setp.ge.u32 %p13, %r17, %r46; - @%p13 bra $Lt_2_24578; - add.u32 %r47, %r2, %r46; - cvt.u64.u32 %rd43, %r47; - mul.wide.u32 %rd44, %r47, 4; - add.u64 %rd45, %rd36, %rd44; - ld.shared.f32 %f81, [%rd45+0]; - add.ftz.f32 %f71, %f81, %f71; - st.shared.f32 [%rd39+0], %f71; - ld.shared.f32 %f82, [%rd45+512]; - add.ftz.f32 %f72, %f82, %f72; - st.shared.f32 [%rd39+512], %f72; - ld.shared.f32 %f83, [%rd45+1024]; - add.ftz.f32 %f73, %f83, %f73; - st.shared.f32 [%rd39+1024], %f73; - ld.shared.f32 %f84, [%rd45+1536]; - add.ftz.f32 %f74, %f84, %f74; - st.shared.f32 [%rd39+1536], %f74; - ld.shared.f32 %f85, [%rd45+2048]; - add.ftz.f32 %f79, %f85, %f79; - st.shared.f32 [%rd39+2048], %f79; - ld.shared.f32 %f86, [%rd45+2560]; - add.ftz.f32 %f80, %f86, %f80; - st.shared.f32 [%rd39+2560], %f80; -$Lt_2_24578: - shr.u32 %r46, %r46, 1; - mov.u32 %r48, 0; - setp.ne.u32 %p14, %r46, %r48; - @%p14 bra $Lt_2_24322; -$Lt_2_23810: - mov.f32 %f6, %f71; - mov.f32 %f8, %f72; - mov.f32 %f10, %f73; - mov.f32 %f12, %f74; - mov.f32 %f14, %f79; - mov.f32 %f16, %f80; -$Lt_2_23298: -$Lt_2_21250: - mov.u32 %r49, 0; - setp.ne.s32 %p15, %r17, %r49; - @%p15 bra $Lt_2_25346; - ld.param.u64 %rd46, [__cudaparm_kernel_lj___val_paramengv]; - add.u64 %rd47, %rd46, %rd3; - ld.param.s32 %r50, [__cudaparm_kernel_lj_eflag]; - mov.u32 %r51, 0; - setp.le.s32 %p16, %r50, %r51; - @%p16 bra $Lt_2_25858; - ld.global.f32 %f87, [%rd47+0]; - add.ftz.f32 %f88, %f87, %f24; - st.global.f32 [%rd47+0], %f88; - cvt.s64.s32 %rd48, %r11; - mul.wide.s32 %rd49, %r11, 4; - add.u64 %rd47, %rd47, %rd49; -$Lt_2_25858: - ld.param.s32 %r52, [__cudaparm_kernel_lj_vflag]; - mov.u32 %r53, 0; - setp.le.s32 %p17, %r52, %r53; - @%p17 bra $Lt_2_26370; - ld.global.f32 %f89, [%rd47+0]; - mov.f32 %f90, %f6; - add.ftz.f32 %f91, %f89, %f90; - st.global.f32 [%rd47+0], %f91; - cvt.s64.s32 %rd50, %r11; - mul.wide.s32 %rd51, %r11, 4; - add.u64 %rd52, %rd51, %rd47; - ld.global.f32 %f92, [%rd52+0]; - mov.f32 %f93, %f8; - add.ftz.f32 %f94, %f92, %f93; - st.global.f32 [%rd52+0], %f94; - add.u64 %rd53, %rd51, %rd52; - ld.global.f32 %f95, [%rd53+0]; - mov.f32 %f96, %f10; - add.ftz.f32 %f97, %f95, %f96; - st.global.f32 [%rd53+0], %f97; - add.u64 %rd54, %rd51, %rd53; - ld.global.f32 %f98, [%rd54+0]; - mov.f32 %f99, %f12; - add.ftz.f32 %f100, %f98, %f99; - st.global.f32 [%rd54+0], %f100; - add.u64 %rd55, %rd51, %rd54; - ld.global.f32 %f101, [%rd55+0]; - mov.f32 %f102, %f14; - add.ftz.f32 %f103, %f101, %f102; - st.global.f32 [%rd55+0], %f103; - add.u64 %rd47, %rd51, %rd55; - ld.global.f32 %f104, [%rd47+0]; - mov.f32 %f105, %f16; - add.ftz.f32 %f106, %f104, %f105; - st.global.f32 [%rd47+0], %f106; -$Lt_2_26370: - ld.param.u64 %rd56, [__cudaparm_kernel_lj_ans]; - mul.lo.u64 %rd57, %rd2, 16; - add.u64 %rd58, %rd56, %rd57; - ld.global.v4.f32 {%f107,%f108,%f109,%f110}, [%rd58+0]; - add.ftz.f32 %f111, %f108, %f22; - add.ftz.f32 %f112, %f109, %f21; - add.ftz.f32 %f113, %f107, %f23; - st.global.v4.f32 [%rd58+0], {%f113,%f111,%f112,%f110}; -$Lt_2_25346: -$Lt_2_18690: - .loc 17 607 0 - exit; -$LDWend_kernel_lj: - } // kernel_lj - - .entry kernel_lj_fast ( - .param .u64 __cudaparm_kernel_lj_fast_x_, - .param .u64 __cudaparm_kernel_lj_fast_lj1_in, - .param .u64 __cudaparm_kernel_lj_fast_lj3_in, - .param .u64 __cudaparm_kernel_lj_fast_gum, - .param .s32 __cudaparm_kernel_lj_fast_stride, - .param .u64 __cudaparm_kernel_lj_fast_dev_ij, - .param .u64 __cudaparm_kernel_lj_fast_ans, - .param .u64 __cudaparm_kernel_lj_fast___val_paramengv, - .param .u64 __cudaparm_kernel_lj_fast_err_flag, - .param .s32 __cudaparm_kernel_lj_fast_eflag, - .param .s32 __cudaparm_kernel_lj_fast_vflag, - .param .s32 __cudaparm_kernel_lj_fast_start, - .param .s32 __cudaparm_kernel_lj_fast_inum, - .param .s32 __cudaparm_kernel_lj_fast_t_per_atom) - { - .reg .u32 %r<57>; - .reg .u64 %rd<72>; - .reg .f32 %f<122>; - .reg .pred %p<22>; - .shared .align 4 .b8 __cuda___cuda_local_var_33475_33_non_const_sp_lj10212[16]; - .shared .align 16 .b8 __cuda___cuda_local_var_33476_34_non_const_lj110240[1936]; - .shared .align 16 .b8 __cuda___cuda_local_var_33477_34_non_const_lj312176[1936]; - .shared .align 4 .b8 __cuda___cuda_local_var_33547_55_non_const_red_acc14112[3072]; - // __cuda_local_var_33491_9_non_const_virial = 16 - .loc 17 615 0 -$LDWbegin_kernel_lj_fast: - cvt.s32.u32 %r1, %tid.x; - mov.u32 %r2, 3; - setp.gt.s32 %p1, %r1, %r2; - @%p1 bra $Lt_3_20994; - .loc 17 624 0 - mov.u64 %rd1, __cuda___cuda_local_var_33475_33_non_const_sp_lj10212; - cvt.s64.s32 %rd2, %r1; - mul.wide.s32 %rd3, %r1, 4; - ld.param.u64 %rd4, [__cudaparm_kernel_lj_fast_gum]; - add.u64 %rd5, %rd4, %rd3; - ld.global.f32 %f1, [%rd5+0]; - add.u64 %rd6, %rd3, %rd1; - st.shared.f32 [%rd6+0], %f1; -$Lt_3_20994: - mov.u64 %rd1, __cuda___cuda_local_var_33475_33_non_const_sp_lj10212; - mov.u32 %r3, 120; - setp.gt.s32 %p2, %r1, %r3; - @%p2 bra $Lt_3_21506; - .loc 17 626 0 - mov.u64 %rd7, __cuda___cuda_local_var_33476_34_non_const_lj110240; - cvt.s64.s32 %rd8, %r1; - mul.wide.s32 %rd9, %r1, 16; - ld.param.u64 %rd10, [__cudaparm_kernel_lj_fast_lj1_in]; - add.u64 %rd11, %rd10, %rd9; - add.u64 %rd12, %rd9, %rd7; - ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0]; - st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5}; - ld.param.s32 %r4, [__cudaparm_kernel_lj_fast_eflag]; - mov.u32 %r5, 0; - setp.le.s32 %p3, %r4, %r5; - @%p3 bra $Lt_3_22018; - .loc 17 628 0 - mov.u64 %rd13, __cuda___cuda_local_var_33477_34_non_const_lj312176; - ld.param.u64 %rd14, [__cudaparm_kernel_lj_fast_lj3_in]; - add.u64 %rd15, %rd14, %rd9; - add.u64 %rd16, %rd9, %rd13; - ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0]; - st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9}; -$Lt_3_22018: - mov.u64 %rd13, __cuda___cuda_local_var_33477_34_non_const_lj312176; -$Lt_3_21506: - mov.u64 %rd13, __cuda___cuda_local_var_33477_34_non_const_lj312176; - mov.u64 %rd7, __cuda___cuda_local_var_33476_34_non_const_lj110240; - .loc 17 638 0 - mov.f32 %f10, 0f00000000; // 0 - mov.f32 %f11, %f10; - mov.f32 %f12, 0f00000000; // 0 - mov.f32 %f13, %f12; - mov.f32 %f14, 0f00000000; // 0 - mov.f32 %f15, %f14; - mov.f32 %f16, 0f00000000; // 0 - mov.f32 %f17, %f16; - mov.f32 %f18, 0f00000000; // 0 - mov.f32 %f19, %f18; - mov.f32 %f20, 0f00000000; // 0 - mov.f32 %f21, %f20; - .loc 17 640 0 - bar.sync 0; - ld.param.s32 %r6, [__cudaparm_kernel_lj_fast_t_per_atom]; - div.s32 %r7, %r1, %r6; - cvt.s32.u32 %r8, %ntid.x; - div.s32 %r9, %r8, %r6; - cvt.s32.u32 %r10, %ctaid.x; - mul.lo.s32 %r11, %r10, %r9; - add.s32 %r12, %r7, %r11; - ld.param.s32 %r13, [__cudaparm_kernel_lj_fast_start]; - add.s32 %r14, %r13, %r12; - ld.param.s32 %r15, [__cudaparm_kernel_lj_fast_inum]; - setp.ge.s32 %p4, %r14, %r15; - @%p4 bra $Lt_3_29186; - .loc 17 645 0 - cvt.s64.s32 %rd17, %r14; - mul.wide.s32 %rd18, %r14, 4; - ld.param.u64 %rd19, [__cudaparm_kernel_lj_fast_dev_ij]; - add.u64 %rd20, %rd19, %rd18; - ld.global.s32 %r16, [%rd20+0]; - ld.param.s32 %r17, [__cudaparm_kernel_lj_fast_stride]; - cvt.s64.s32 %rd21, %r17; - mul.wide.s32 %rd22, %r17, 4; - add.u64 %rd23, %rd22, %rd20; - ld.global.s32 %r18, [%rd23+0]; - .loc 17 648 0 - ld.param.u64 %rd24, [__cudaparm_kernel_lj_fast_x_]; - cvt.s64.s32 %rd25, %r16; - mul.wide.s32 %rd26, %r16, 16; - add.u64 %rd27, %rd24, %rd26; - ld.global.v4.f32 {%f22,%f23,%f24,%f25}, [%rd27+0]; - .loc 17 650 0 - cvt.s32.s64 %r19, %rd21; - sub.s32 %r20, %r6, 1; - and.b32 %r21, %r20, %r1; - add.u64 %rd28, %rd22, %rd23; - mul.lo.s32 %r22, %r19, %r21; - cvt.s64.s32 %rd29, %r22; - mul.wide.s32 %rd30, %r22, 4; - add.u64 %rd31, %rd28, %rd30; - mov.s64 %rd32, %rd31; - mul.lo.s32 %r23, %r19, %r18; - cvt.s64.s32 %rd33, %r23; - mul.wide.s32 %rd34, %r23, 4; - add.u64 %rd35, %rd28, %rd34; - setp.ge.u64 %p5, %rd31, %rd35; - @%p5 bra $Lt_3_30722; - cvt.rzi.ftz.s32.f32 %r24, %f25; - mul.lo.s32 %r25, %r24, 11; - cvt.rn.f32.s32 %f26, %r25; - mov.f32 %f27, 0f00000000; // 0 - mov.f32 %f28, 0f00000000; // 0 - mov.f32 %f29, 0f00000000; // 0 - mov.f32 %f30, 0f00000000; // 0 -$Lt_3_23554: - // Loop body line 650, nesting depth: 1, estimated iterations: unknown - .loc 17 655 0 - ld.global.s32 %r26, [%rd32+0]; - .loc 17 656 0 - shr.s32 %r27, %r26, 30; - and.b32 %r28, %r27, 3; - cvt.s64.s32 %rd36, %r28; - mul.wide.s32 %rd37, %r28, 4; - add.u64 %rd38, %rd1, %rd37; - ld.shared.f32 %f31, [%rd38+0]; - .loc 17 659 0 - and.b32 %r29, %r26, 1073741823; - cvt.s64.s32 %rd39, %r29; - mul.wide.s32 %rd40, %r29, 16; - add.u64 %rd41, %rd24, %rd40; - ld.global.v4.f32 {%f32,%f33,%f34,%f35}, [%rd41+0]; - .loc 17 655 0 - sub.ftz.f32 %f36, %f23, %f33; - sub.ftz.f32 %f37, %f22, %f32; - sub.ftz.f32 %f38, %f24, %f34; - mul.ftz.f32 %f39, %f36, %f36; - fma.rn.ftz.f32 %f40, %f37, %f37, %f39; - fma.rn.ftz.f32 %f41, %f38, %f38, %f40; - add.ftz.f32 %f42, %f26, %f35; - cvt.rzi.ftz.s32.f32 %r30, %f42; - cvt.s64.s32 %rd42, %r30; - mul.wide.s32 %rd43, %r30, 16; - add.u64 %rd44, %rd43, %rd7; - ld.shared.f32 %f43, [%rd44+8]; - setp.gt.ftz.f32 %p6, %f43, %f41; - @!%p6 bra $Lt_3_30978; - ld.shared.f32 %f44, [%rd44+12]; - mov.f32 %f45, 0f00000000; // 0 - setp.eq.ftz.f32 %p7, %f44, %f45; - @!%p7 bra $Lt_3_30978; - .loc 17 671 0 - rcp.approx.ftz.f32 %f46, %f41; - mul.ftz.f32 %f47, %f46, %f46; - mul.ftz.f32 %f48, %f46, %f47; - mul.ftz.f32 %f49, %f46, %f31; - mul.ftz.f32 %f50, %f48, %f49; - ld.shared.v2.f32 {%f51,%f52}, [%rd44+0]; - mul.ftz.f32 %f53, %f51, %f48; - sub.ftz.f32 %f54, %f53, %f52; - mul.ftz.f32 %f55, %f50, %f54; - .loc 17 673 0 - fma.rn.ftz.f32 %f29, %f37, %f55, %f29; - .loc 17 674 0 - fma.rn.ftz.f32 %f28, %f36, %f55, %f28; - .loc 17 675 0 - fma.rn.ftz.f32 %f27, %f38, %f55, %f27; - ld.param.s32 %r31, [__cudaparm_kernel_lj_fast_eflag]; - mov.u32 %r32, 0; - setp.le.s32 %p8, %r31, %r32; - @%p8 bra $Lt_3_23810; - .loc 17 678 0 - add.u64 %rd45, %rd43, %rd13; - ld.shared.v4.f32 {%f56,%f57,%f58,_}, [%rd45+0]; - mul.ftz.f32 %f59, %f56, %f48; - sub.ftz.f32 %f60, %f59, %f57; - mul.ftz.f32 %f61, %f48, %f60; - .loc 17 679 0 - sub.ftz.f32 %f62, %f61, %f58; - fma.rn.ftz.f32 %f30, %f31, %f62, %f30; -$Lt_3_23810: - ld.param.s32 %r33, [__cudaparm_kernel_lj_fast_vflag]; - mov.u32 %r34, 0; - setp.le.s32 %p9, %r33, %r34; - @%p9 bra $Lt_3_30978; - .loc 17 682 0 - mov.f32 %f63, %f11; - mul.ftz.f32 %f64, %f37, %f37; - fma.rn.ftz.f32 %f65, %f55, %f64, %f63; - mov.f32 %f11, %f65; - .loc 17 683 0 - mov.f32 %f66, %f13; - fma.rn.ftz.f32 %f67, %f55, %f39, %f66; - mov.f32 %f13, %f67; - .loc 17 684 0 - mov.f32 %f68, %f15; - mul.ftz.f32 %f69, %f38, %f38; - fma.rn.ftz.f32 %f70, %f55, %f69, %f68; - mov.f32 %f15, %f70; - .loc 17 685 0 - mov.f32 %f71, %f17; - mul.ftz.f32 %f72, %f36, %f37; - fma.rn.ftz.f32 %f73, %f55, %f72, %f71; - mov.f32 %f17, %f73; - .loc 17 686 0 - mov.f32 %f74, %f19; - mul.ftz.f32 %f75, %f37, %f38; - fma.rn.ftz.f32 %f76, %f55, %f75, %f74; - mov.f32 %f19, %f76; - .loc 17 687 0 - mul.ftz.f32 %f77, %f36, %f38; - fma.rn.ftz.f32 %f20, %f55, %f77, %f20; - mov.f32 %f21, %f20; -$Lt_3_30978: -$L_3_20482: - .loc 17 681 0 - mul.lo.s32 %r35, %r19, %r6; - cvt.s64.s32 %rd46, %r35; - mul.wide.s32 %rd47, %r35, 4; - add.u64 %rd32, %rd32, %rd47; - setp.gt.u64 %p10, %rd35, %rd32; - @%p10 bra $Lt_3_23554; - bra.uni $Lt_3_23042; -$Lt_3_30722: - mov.f32 %f27, 0f00000000; // 0 - mov.f32 %f28, 0f00000000; // 0 - mov.f32 %f29, 0f00000000; // 0 - mov.f32 %f30, 0f00000000; // 0 -$Lt_3_23042: - mov.u32 %r36, 1; - setp.le.s32 %p11, %r6, %r36; - @%p11 bra $Lt_3_27138; - .loc 17 692 0 - mov.u64 %rd48, __cuda___cuda_local_var_33547_55_non_const_red_acc14112; - cvt.s64.s32 %rd49, %r1; - mul.wide.s32 %rd50, %r1, 4; - add.u64 %rd51, %rd48, %rd50; - mov.f32 %f78, %f29; - st.shared.f32 [%rd51+0], %f78; - mov.f32 %f79, %f28; - st.shared.f32 [%rd51+512], %f79; - mov.f32 %f80, %f27; - st.shared.f32 [%rd51+1024], %f80; - mov.f32 %f81, %f30; - st.shared.f32 [%rd51+1536], %f81; - shr.s32 %r37, %r6, 31; - mov.s32 %r38, 1; - and.b32 %r39, %r37, %r38; - add.s32 %r40, %r39, %r6; - shr.s32 %r41, %r40, 1; - mov.s32 %r42, %r41; - mov.u32 %r43, 0; - setp.ne.u32 %p12, %r41, %r43; - @!%p12 bra $Lt_3_25602; -$Lt_3_26114: - setp.ge.u32 %p13, %r21, %r42; - @%p13 bra $Lt_3_26370; - add.u32 %r44, %r1, %r42; - cvt.u64.u32 %rd52, %r44; - mul.wide.u32 %rd53, %r44, 4; - add.u64 %rd54, %rd48, %rd53; - ld.shared.f32 %f82, [%rd54+0]; - add.ftz.f32 %f78, %f82, %f78; - st.shared.f32 [%rd51+0], %f78; - ld.shared.f32 %f83, [%rd54+512]; - add.ftz.f32 %f79, %f83, %f79; - st.shared.f32 [%rd51+512], %f79; - ld.shared.f32 %f84, [%rd54+1024]; - add.ftz.f32 %f80, %f84, %f80; - st.shared.f32 [%rd51+1024], %f80; - ld.shared.f32 %f85, [%rd54+1536]; - add.ftz.f32 %f81, %f85, %f81; - st.shared.f32 [%rd51+1536], %f81; -$Lt_3_26370: - shr.u32 %r42, %r42, 1; - mov.u32 %r45, 0; - setp.ne.u32 %p14, %r42, %r45; - @%p14 bra $Lt_3_26114; -$Lt_3_25602: - mov.f32 %f29, %f78; - mov.f32 %f28, %f79; - mov.f32 %f27, %f80; - mov.f32 %f30, %f81; - ld.param.s32 %r46, [__cudaparm_kernel_lj_fast_vflag]; - mov.u32 %r47, 0; - setp.le.s32 %p15, %r46, %r47; - @%p15 bra $Lt_3_27138; - mov.f32 %f78, %f11; - st.shared.f32 [%rd51+0], %f78; - mov.f32 %f79, %f13; - st.shared.f32 [%rd51+512], %f79; - mov.f32 %f80, %f15; - st.shared.f32 [%rd51+1024], %f80; - mov.f32 %f81, %f17; - st.shared.f32 [%rd51+1536], %f81; - mov.f32 %f86, %f19; - st.shared.f32 [%rd51+2048], %f86; - mov.f32 %f87, %f20; - st.shared.f32 [%rd51+2560], %f87; - mov.s32 %r48, %r41; - @!%p12 bra $Lt_3_27650; -$Lt_3_28162: - setp.ge.u32 %p16, %r21, %r48; - @%p16 bra $Lt_3_28418; - add.u32 %r49, %r1, %r48; - cvt.u64.u32 %rd55, %r49; - mul.wide.u32 %rd56, %r49, 4; - add.u64 %rd57, %rd48, %rd56; - ld.shared.f32 %f88, [%rd57+0]; - add.ftz.f32 %f78, %f88, %f78; - st.shared.f32 [%rd51+0], %f78; - ld.shared.f32 %f89, [%rd57+512]; - add.ftz.f32 %f79, %f89, %f79; - st.shared.f32 [%rd51+512], %f79; - ld.shared.f32 %f90, [%rd57+1024]; - add.ftz.f32 %f80, %f90, %f80; - st.shared.f32 [%rd51+1024], %f80; - ld.shared.f32 %f91, [%rd57+1536]; - add.ftz.f32 %f81, %f91, %f81; - st.shared.f32 [%rd51+1536], %f81; - ld.shared.f32 %f92, [%rd57+2048]; - add.ftz.f32 %f86, %f92, %f86; - st.shared.f32 [%rd51+2048], %f86; - ld.shared.f32 %f93, [%rd57+2560]; - add.ftz.f32 %f87, %f93, %f87; - st.shared.f32 [%rd51+2560], %f87; -$Lt_3_28418: - shr.u32 %r48, %r48, 1; - mov.u32 %r50, 0; - setp.ne.u32 %p17, %r48, %r50; - @%p17 bra $Lt_3_28162; -$Lt_3_27650: - mov.f32 %f11, %f78; - mov.f32 %f13, %f79; - mov.f32 %f15, %f80; - mov.f32 %f17, %f81; - mov.f32 %f19, %f86; - mov.f32 %f21, %f87; -$Lt_3_27138: -$Lt_3_25090: - mov.u32 %r51, 0; - setp.ne.s32 %p18, %r21, %r51; - @%p18 bra $Lt_3_29186; - ld.param.u64 %rd58, [__cudaparm_kernel_lj_fast___val_paramengv]; - add.u64 %rd59, %rd58, %rd18; - ld.param.s32 %r52, [__cudaparm_kernel_lj_fast_eflag]; - mov.u32 %r53, 0; - setp.le.s32 %p19, %r52, %r53; - @%p19 bra $Lt_3_29698; - ld.global.f32 %f94, [%rd59+0]; - add.ftz.f32 %f95, %f94, %f30; - st.global.f32 [%rd59+0], %f95; - cvt.s64.s32 %rd60, %r15; - mul.wide.s32 %rd61, %r15, 4; - add.u64 %rd59, %rd59, %rd61; -$Lt_3_29698: - ld.param.s32 %r54, [__cudaparm_kernel_lj_fast_vflag]; - mov.u32 %r55, 0; - setp.le.s32 %p20, %r54, %r55; - @%p20 bra $Lt_3_30210; - ld.global.f32 %f96, [%rd59+0]; - mov.f32 %f97, %f11; - add.ftz.f32 %f98, %f96, %f97; - st.global.f32 [%rd59+0], %f98; - cvt.s64.s32 %rd62, %r15; - mul.wide.s32 %rd63, %r15, 4; - add.u64 %rd64, %rd63, %rd59; - ld.global.f32 %f99, [%rd64+0]; - mov.f32 %f100, %f13; - add.ftz.f32 %f101, %f99, %f100; - st.global.f32 [%rd64+0], %f101; - add.u64 %rd65, %rd63, %rd64; - ld.global.f32 %f102, [%rd65+0]; - mov.f32 %f103, %f15; - add.ftz.f32 %f104, %f102, %f103; - st.global.f32 [%rd65+0], %f104; - add.u64 %rd66, %rd63, %rd65; - ld.global.f32 %f105, [%rd66+0]; - mov.f32 %f106, %f17; - add.ftz.f32 %f107, %f105, %f106; - st.global.f32 [%rd66+0], %f107; - add.u64 %rd67, %rd63, %rd66; - ld.global.f32 %f108, [%rd67+0]; - mov.f32 %f109, %f19; - add.ftz.f32 %f110, %f108, %f109; - st.global.f32 [%rd67+0], %f110; - add.u64 %rd59, %rd63, %rd67; - ld.global.f32 %f111, [%rd59+0]; - mov.f32 %f112, %f21; - add.ftz.f32 %f113, %f111, %f112; - st.global.f32 [%rd59+0], %f113; -$Lt_3_30210: - ld.param.u64 %rd68, [__cudaparm_kernel_lj_fast_ans]; - mul.lo.u64 %rd69, %rd17, 16; - add.u64 %rd70, %rd68, %rd69; - ld.global.v4.f32 {%f114,%f115,%f116,%f117}, [%rd70+0]; - add.ftz.f32 %f118, %f115, %f28; - add.ftz.f32 %f119, %f116, %f27; - add.ftz.f32 %f120, %f114, %f29; - st.global.v4.f32 [%rd70+0], {%f120,%f118,%f119,%f117}; -$Lt_3_29186: -$Lt_3_22530: - .loc 17 695 0 - exit; -$LDWend_kernel_lj_fast: - } // kernel_lj_fast - diff --git a/lib/gpu/re_squared_lj_ptx.h b/lib/gpu/re_squared_lj_ptx.h deleted file mode 100644 index e1990aee9a..0000000000 --- a/lib/gpu/re_squared_lj_ptx.h +++ /dev/null @@ -1,3489 +0,0 @@ -const char * re_squared_lj = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .entry kernel_ellipsoid_sphere (\n" -" .param .u64 __cudaparm_kernel_ellipsoid_sphere_x_,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_sphere_q,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_sphere_shape,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_sphere_well,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_sphere_splj,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_sphere_sig_eps,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_sphere_ntypes,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_sphere_dev_nbor,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_sphere_stride,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_sphere_ans,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_sphere_astride,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_sphere_engv,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_sphere_err_flag,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_sphere_eflag,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_sphere_vflag,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_sphere_inum,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_sphere_t_per_atom)\n" -" {\n" -" .reg .u32 %r<66>;\n" -" .reg .u64 %rd<73>;\n" -" .reg .f32 %f<777>;\n" -" .reg .pred %p<34>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32886_33_non_const_sp_lj120[16];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_33120_37_non_const_red_acc136[3584];\n" -" .shared .f32 __cuda_local_var_32892_33_non_const_b_alpha;\n" -" .shared .f32 __cuda_local_var_32892_42_non_const_cr60;\n" -" .shared .f32 __cuda_local_var_32892_48_non_const_solv_f_a;\n" -" .shared .f32 __cuda_local_var_32892_58_non_const_solv_f_r;\n" -" .loc 17 27 0\n" -"$LDWbegin_kernel_ellipsoid_sphere:\n" -" .loc 17 32 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_ellipsoid_sphere_splj];\n" -" ldu.global.f32 %f1, [%rd1+0];\n" -" .loc 17 33 0\n" -" ld.global.f32 %f2, [%rd1+4];\n" -" .loc 17 34 0\n" -" ld.global.f32 %f3, [%rd1+8];\n" -" .loc 17 35 0\n" -" ld.global.f32 %f4, [%rd1+12];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32886_33_non_const_sp_lj120+0], {%f1,%f2,%f3,%f4};\n" -" .loc 17 38 0\n" -" mov.f32 %f5, 0f3f4db6db; \n" -" st.shared.f32 [__cuda_local_var_32892_33_non_const_b_alpha], %f5;\n" -" .loc 17 39 0\n" -" mov.f32 %f6, 0f42700000; \n" -" lg2.approx.ftz.f32 %f7, %f6;\n" -" mov.f32 %f8, 0f3eaaaaab; \n" -" mul.ftz.f32 %f9, %f7, %f8;\n" -" ex2.approx.ftz.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f42700000; \n" -" mul.ftz.f32 %f12, %f10, %f10;\n" -" div.approx.ftz.f32 %f13, %f11, %f12;\n" -" sub.ftz.f32 %f14, %f10, %f13;\n" -" mov.f32 %f15, 0f3eaaaaab; \n" -" mul.ftz.f32 %f16, %f14, %f15;\n" -" sub.ftz.f32 %f17, %f10, %f16;\n" -" st.shared.f32 [__cuda_local_var_32892_42_non_const_cr60], %f17;\n" -" .loc 21 544 0\n" -" mov.f32 %f18, 0f3f800000; \n" -" mov.f32 %f19, 0fbf52c7ea; \n" -" mov.f32 %f20, 0fc0b59883; \n" -" fma.rn.ftz.f32 %f21, %f18, %f19, %f20;\n" -" mov.f32 %f22, 0f41455dc0; \n" -" mov.f32 %f23, 0f3f800000; \n" -" mov.f32 %f24, 0f41e6bd60; \n" -" fma.rn.ftz.f32 %f25, %f22, %f23, %f24;\n" -" mov.f32 %f26, 0f3f800000; \n" -" mov.f32 %f27, 0fc0d21907; \n" -" fma.rn.ftz.f32 %f28, %f21, %f26, %f27;\n" -" mov.f32 %f29, 0f3f800000; \n" -" mov.f32 %f30, 0f419d92c8; \n" -" fma.rn.ftz.f32 %f31, %f25, %f29, %f30;\n" -" rcp.approx.ftz.f32 %f32, %f31;\n" -" mov.f32 %f33, 0f3f800000; \n" -" fma.rn.ftz.f32 %f34, %f28, %f32, %f33;\n" -" mov.b32 %r1, %f34;\n" -" mov.b32 %f35, %r1;\n" -" mov.f32 %f36, 0f41800000; \n" -" mul.ftz.f32 %f37, %f35, %f36;\n" -" mov.f32 %f38, 0f40400000; \n" -" mov.f32 %f39, 0fc2100000; \n" -" mul.ftz.f32 %f40, %f37, %f39;\n" -" div.approx.ftz.f32 %f41, %f38, %f40;\n" -" .loc 17 40 0\n" -" st.shared.f32 [__cuda_local_var_32892_48_non_const_solv_f_a], %f41;\n" -" .loc 21 544 0\n" -" mov.f32 %f42, 0f40400000; \n" -" mov.f32 %f43, 0f44fd2000; \n" -" mul.ftz.f32 %f44, %f37, %f43;\n" -" div.approx.ftz.f32 %f45, %f42, %f44;\n" -" .loc 17 41 0\n" -" st.shared.f32 [__cuda_local_var_32892_58_non_const_solv_f_r], %f45;\n" -" .loc 17 54 0\n" -" mov.f32 %f46, 0f00000000; \n" -" mov.f32 %f47, %f46;\n" -" mov.f32 %f48, 0f00000000; \n" -" mov.f32 %f49, %f48;\n" -" mov.f32 %f50, 0f00000000; \n" -" mov.f32 %f51, %f50;\n" -" mov.f32 %f52, 0f00000000; \n" -" mov.f32 %f53, %f52;\n" -" mov.f32 %f54, 0f00000000; \n" -" mov.f32 %f55, %f54;\n" -" mov.f32 %f56, 0f00000000; \n" -" mov.f32 %f57, %f56;\n" -" ld.param.s32 %r2, [__cudaparm_kernel_ellipsoid_sphere_t_per_atom];\n" -" cvt.s32.u32 %r3, %tid.x;\n" -" div.s32 %r4, %r3, %r2;\n" -" cvt.s32.u32 %r5, %ntid.x;\n" -" div.s32 %r6, %r5, %r2;\n" -" cvt.s32.u32 %r7, %ctaid.x;\n" -" mul.lo.s32 %r8, %r7, %r6;\n" -" add.s32 %r9, %r4, %r8;\n" -" ld.param.s32 %r10, [__cudaparm_kernel_ellipsoid_sphere_inum];\n" -" setp.le.s32 %p1, %r10, %r9;\n" -" @%p1 bra $Lt_0_73474;\n" -" .loc 17 59 0\n" -" cvt.s64.s32 %rd2, %r9;\n" -" mul.wide.s32 %rd3, %r9, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_ellipsoid_sphere_dev_nbor];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.s32 %r11, [%rd5+0];\n" -" ld.param.s32 %r12, [__cudaparm_kernel_ellipsoid_sphere_stride];\n" -" cvt.s64.s32 %rd6, %r12;\n" -" mul.wide.s32 %rd7, %r12, 4;\n" -" add.u64 %rd8, %rd7, %rd5;\n" -" ld.global.s32 %r13, [%rd8+0];\n" -" .loc 17 62 0\n" -" cvt.s64.s32 %rd9, %r11;\n" -" mul.wide.s32 %rd10, %r11, 16;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_ellipsoid_sphere_x_];\n" -" add.u64 %rd12, %rd10, %rd11;\n" -" ld.global.v4.f32 {%f58,%f59,%f60,%f61}, [%rd12+0];\n" -" .loc 17 70 0\n" -" cvt.rzi.ftz.s32.f32 %r14, %f61;\n" -" cvt.s64.s32 %rd13, %r14;\n" -" mul.wide.s32 %rd14, %r14, 16;\n" -" ld.param.u64 %rd15, [__cudaparm_kernel_ellipsoid_sphere_shape];\n" -" add.u64 %rd16, %rd15, %rd14;\n" -" ld.global.v4.f32 {%f62,%f63,%f64,_}, [%rd16+0];\n" -" .loc 17 74 0\n" -" ld.param.u64 %rd17, [__cudaparm_kernel_ellipsoid_sphere_q];\n" -" add.u64 %rd18, %rd17, %rd10;\n" -" ld.global.v4.f32 {%f65,%f66,%f67,%f68}, [%rd18+0];\n" -" .loc 17 75 0\n" -" ld.param.u64 %rd19, [__cudaparm_kernel_ellipsoid_sphere_well];\n" -" add.u64 %rd20, %rd19, %rd14;\n" -" ld.global.v4.f32 {%f69,%f70,%f71,_}, [%rd20+0];\n" -" .loc 17 78 0\n" -" cvt.s32.s64 %r15, %rd6;\n" -" sub.s32 %r16, %r2, 1;\n" -" and.b32 %r17, %r16, %r3;\n" -" add.u64 %rd21, %rd7, %rd8;\n" -" mul.lo.s32 %r18, %r15, %r17;\n" -" cvt.s64.s32 %rd22, %r18;\n" -" mul.wide.s32 %rd23, %r18, 4;\n" -" add.u64 %rd24, %rd21, %rd23;\n" -" mov.s64 %rd25, %rd24;\n" -" mul.lo.s32 %r19, %r15, %r13;\n" -" cvt.s64.s32 %rd26, %r19;\n" -" mul.wide.s32 %rd27, %r19, 4;\n" -" add.u64 %rd28, %rd21, %rd27;\n" -" setp.ge.u64 %p2, %rd24, %rd28;\n" -" @%p2 bra $Lt_0_75266;\n" -" ld.param.s32 %r20, [__cudaparm_kernel_ellipsoid_sphere_vflag];\n" -" mov.s32 %r21, 0;\n" -" setp.gt.s32 %p3, %r20, %r21;\n" -" add.ftz.f32 %f72, %f66, %f66;\n" -" add.ftz.f32 %f73, %f68, %f68;\n" -" mul.ftz.f32 %f74, %f65, %f65;\n" -" mul.ftz.f32 %f75, %f66, %f66;\n" -" mul.ftz.f32 %f76, %f67, %f67;\n" -" mul.ftz.f32 %f77, %f68, %f68;\n" -" add.ftz.f32 %f78, %f67, %f67;\n" -" mul.ftz.f32 %f79, %f62, %f63;\n" -" add.ftz.f32 %f80, %f62, %f62;\n" -" add.ftz.f32 %f81, %f63, %f63;\n" -" add.ftz.f32 %f82, %f64, %f64;\n" -" ld.param.s32 %r22, [__cudaparm_kernel_ellipsoid_sphere_ntypes];\n" -" mul.lo.s32 %r23, %r22, %r14;\n" -" mul.ftz.f32 %f83, %f72, %f67;\n" -" mul.ftz.f32 %f84, %f72, %f68;\n" -" mul.ftz.f32 %f85, %f72, %f65;\n" -" mul.ftz.f32 %f86, %f73, %f65;\n" -" add.ftz.f32 %f87, %f74, %f75;\n" -" sub.ftz.f32 %f88, %f74, %f75;\n" -" mul.ftz.f32 %f89, %f78, %f65;\n" -" mul.ftz.f32 %f90, %f78, %f68;\n" -" mul.ftz.f32 %f91, %f79, %f64;\n" -" sub.ftz.f32 %f92, %f83, %f86;\n" -" add.ftz.f32 %f93, %f83, %f86;\n" -" sub.ftz.f32 %f94, %f86, %f83;\n" -" sub.ftz.f32 %f95, %f87, %f76;\n" -" add.ftz.f32 %f96, %f76, %f88;\n" -" sub.ftz.f32 %f97, %f88, %f76;\n" -" add.ftz.f32 %f98, %f84, %f89;\n" -" sub.ftz.f32 %f99, %f84, %f89;\n" -" sub.ftz.f32 %f100, %f89, %f84;\n" -" sub.ftz.f32 %f101, %f90, %f85;\n" -" add.ftz.f32 %f102, %f85, %f90;\n" -" sub.ftz.f32 %f103, %f85, %f90;\n" -" mul.ftz.f32 %f104, %f92, %f70;\n" -" mul.ftz.f32 %f105, %f93, %f69;\n" -" neg.ftz.f32 %f106, %f93;\n" -" sub.ftz.f32 %f107, %f95, %f77;\n" -" sub.ftz.f32 %f108, %f77, %f95;\n" -" sub.ftz.f32 %f109, %f96, %f77;\n" -" sub.ftz.f32 %f110, %f77, %f96;\n" -" add.ftz.f32 %f111, %f77, %f97;\n" -" mul.ftz.f32 %f112, %f98, %f71;\n" -" neg.ftz.f32 %f113, %f98;\n" -" mul.ftz.f32 %f114, %f99, %f69;\n" -" mul.ftz.f32 %f115, %f101, %f71;\n" -" mul.ftz.f32 %f116, %f102, %f70;\n" -" mul.ftz.f32 %f117, %f92, %f104;\n" -" mul.ftz.f32 %f118, %f102, %f104;\n" -" mul.ftz.f32 %f119, %f107, %f69;\n" -" mul.ftz.f32 %f120, %f104, %f109;\n" -" mul.ftz.f32 %f121, %f109, %f70;\n" -" mul.ftz.f32 %f122, %f111, %f71;\n" -" neg.ftz.f32 %f123, %f111;\n" -" mul.ftz.f32 %f124, %f92, %f116;\n" -" mul.ftz.f32 %f125, %f109, %f116;\n" -" mul.ftz.f32 %f126, %f102, %f116;\n" -" fma.rn.ftz.f32 %f127, %f107, %f119, %f117;\n" -" fma.rn.ftz.f32 %f128, %f119, %f99, %f118;\n" -" fma.rn.ftz.f32 %f129, %f119, %f93, %f120;\n" -" mul.ftz.f32 %f130, %f92, %f121;\n" -" mul.ftz.f32 %f131, %f109, %f121;\n" -" mul.ftz.f32 %f132, %f102, %f121;\n" -" fma.rn.ftz.f32 %f133, %f107, %f114, %f124;\n" -" fma.rn.ftz.f32 %f134, %f93, %f114, %f125;\n" -" fma.rn.ftz.f32 %f135, %f99, %f114, %f126;\n" -" fma.rn.ftz.f32 %f136, %f112, %f98, %f127;\n" -" fma.rn.ftz.f32 %f137, %f112, %f111, %f128;\n" -" fma.rn.ftz.f32 %f138, %f112, %f101, %f129;\n" -" fma.rn.ftz.f32 %f139, %f107, %f105, %f130;\n" -" fma.rn.ftz.f32 %f140, %f93, %f105, %f131;\n" -" fma.rn.ftz.f32 %f141, %f99, %f105, %f132;\n" -" fma.rn.ftz.f32 %f142, %f98, %f122, %f133;\n" -" fma.rn.ftz.f32 %f143, %f101, %f122, %f134;\n" -" fma.rn.ftz.f32 %f144, %f111, %f122, %f135;\n" -" mov.f32 %f145, 0f3f800000; \n" -" add.ftz.f32 %f146, %f136, %f145;\n" -" fma.rn.ftz.f32 %f147, %f115, %f98, %f139;\n" -" fma.rn.ftz.f32 %f148, %f115, %f101, %f140;\n" -" fma.rn.ftz.f32 %f149, %f115, %f111, %f141;\n" -" abs.ftz.f32 %f150, %f142;\n" -" mov.f32 %f151, 0f3f800000; \n" -" add.ftz.f32 %f152, %f144, %f151;\n" -" abs.ftz.f32 %f153, %f146;\n" -" abs.ftz.f32 %f154, %f147;\n" -" mov.f32 %f155, 0f3f800000; \n" -" add.ftz.f32 %f156, %f148, %f155;\n" -" setp.lt.ftz.f32 %p4, %f153, %f154;\n" -" ld.param.u64 %rd29, [__cudaparm_kernel_ellipsoid_sphere_sig_eps];\n" -" mov.f32 %f157, 0f00000000; \n" -" mov.f32 %f158, 0f00000000; \n" -" mov.f32 %f159, 0f00000000; \n" -" mov.f32 %f160, 0f00000000; \n" -" mov.f32 %f161, 0f00000000; \n" -" mov.f32 %f162, 0f00000000; \n" -" mov.f32 %f163, 0f00000000; \n" -" mov.u64 %rd30, __cuda___cuda_local_var_32886_33_non_const_sp_lj120;\n" -"$Lt_0_51970:\n" -" .loc 17 83 0\n" -" ld.global.s32 %r24, [%rd25+0];\n" -" .loc 17 87 0\n" -" and.b32 %r25, %r24, 1073741823;\n" -" cvt.s64.s32 %rd31, %r25;\n" -" mul.wide.s32 %rd32, %r25, 16;\n" -" add.u64 %rd33, %rd11, %rd32;\n" -" ld.global.v4.f32 {%f164,%f165,%f166,%f167}, [%rd33+0];\n" -" .loc 17 98 0\n" -" sub.ftz.f32 %f168, %f165, %f59;\n" -" sub.ftz.f32 %f169, %f164, %f58;\n" -" sub.ftz.f32 %f170, %f166, %f60;\n" -" mul.ftz.f32 %f171, %f168, %f168;\n" -" fma.rn.ftz.f32 %f172, %f169, %f169, %f171;\n" -" fma.rn.ftz.f32 %f173, %f170, %f170, %f172;\n" -" rsqrt.approx.ftz.f32 %f174, %f173;\n" -" mul.ftz.f32 %f175, %f169, %f174;\n" -" .loc 17 99 0\n" -" mul.ftz.f32 %f176, %f168, %f174;\n" -" .loc 17 104 0\n" -" cvt.rzi.ftz.s32.f32 %r26, %f167;\n" -" add.s32 %r27, %r26, %r23;\n" -" cvt.s64.s32 %rd34, %r27;\n" -" mul.wide.s32 %rd35, %r27, 8;\n" -" add.u64 %rd36, %rd29, %rd35;\n" -" ld.global.v2.f32 {%f177,%f178}, [%rd36+0];\n" -" .loc 17 105 0\n" -" shr.s32 %r28, %r24, 30;\n" -" and.b32 %r29, %r28, 3;\n" -" cvt.s64.s32 %rd37, %r29;\n" -" mul.wide.s32 %rd38, %r29, 4;\n" -" add.u64 %rd39, %rd30, %rd38;\n" -" ld.shared.f32 %f179, [%rd39+0];\n" -" mul.ftz.f32 %f180, %f179, %f178;\n" -" .loc 16 299 0\n" -" mov.f32 %f181, %f175;\n" -" .loc 16 300 0\n" -" mov.f32 %f182, 0f3f000000; \n" -" mul.ftz.f32 %f183, %f177, %f182;\n" -" add.ftz.f32 %f184, %f183, %f63;\n" -" add.ftz.f32 %f185, %f183, %f62;\n" -" add.ftz.f32 %f186, %f183, %f64;\n" -" mul.ftz.f32 %f187, %f184, %f184;\n" -" mul.ftz.f32 %f188, %f185, %f185;\n" -" mul.ftz.f32 %f189, %f186, %f186;\n" -" mov.f32 %f190, 0f3f000000; \n" -" mul.ftz.f32 %f191, %f187, %f190;\n" -" mov.f32 %f192, 0f3f000000; \n" -" mul.ftz.f32 %f193, %f188, %f192;\n" -" mov.f32 %f194, 0f3f000000; \n" -" mul.ftz.f32 %f195, %f189, %f194;\n" -" mul.ftz.f32 %f196, %f92, %f191;\n" -" mul.ftz.f32 %f197, %f98, %f195;\n" -" mul.ftz.f32 %f198, %f193, %f107;\n" -" mul.ftz.f32 %f199, %f92, %f196;\n" -" fma.rn.ftz.f32 %f200, %f107, %f198, %f199;\n" -" fma.rn.ftz.f32 %f201, %f197, %f98, %f200;\n" -" mov.f32 %f202, %f201;\n" -" .loc 16 301 0\n" -" mul.ftz.f32 %f203, %f109, %f196;\n" -" fma.rn.ftz.f32 %f204, %f198, %f93, %f203;\n" -" fma.rn.ftz.f32 %f205, %f197, %f101, %f204;\n" -" mov.f32 %f206, %f205;\n" -" .loc 16 302 0\n" -" mul.ftz.f32 %f207, %f102, %f196;\n" -" fma.rn.ftz.f32 %f208, %f198, %f99, %f207;\n" -" fma.rn.ftz.f32 %f209, %f197, %f111, %f208;\n" -" mov.f32 %f210, %f209;\n" -" .loc 16 303 0\n" -" mov.f32 %f211, %f176;\n" -" .loc 16 304 0\n" -" mul.ftz.f32 %f212, %f93, %f193;\n" -" mul.ftz.f32 %f213, %f101, %f195;\n" -" mul.ftz.f32 %f214, %f191, %f109;\n" -" mul.ftz.f32 %f215, %f92, %f214;\n" -" fma.rn.ftz.f32 %f216, %f107, %f212, %f215;\n" -" fma.rn.ftz.f32 %f217, %f213, %f98, %f216;\n" -" mov.f32 %f218, %f217;\n" -" .loc 16 305 0\n" -" mul.ftz.f32 %f219, %f109, %f214;\n" -" fma.rn.ftz.f32 %f220, %f93, %f212, %f219;\n" -" fma.rn.ftz.f32 %f221, %f213, %f101, %f220;\n" -" mov.f32 %f222, %f221;\n" -" .loc 16 306 0\n" -" mul.ftz.f32 %f223, %f102, %f214;\n" -" fma.rn.ftz.f32 %f224, %f99, %f212, %f223;\n" -" fma.rn.ftz.f32 %f225, %f213, %f111, %f224;\n" -" mov.f32 %f226, %f225;\n" -" .loc 16 307 0\n" -" mul.ftz.f32 %f227, %f170, %f174;\n" -" mov.f32 %f228, %f227;\n" -" .loc 16 308 0\n" -" mul.ftz.f32 %f229, %f102, %f191;\n" -" mul.ftz.f32 %f230, %f99, %f193;\n" -" mul.ftz.f32 %f231, %f195, %f111;\n" -" mul.ftz.f32 %f232, %f92, %f229;\n" -" fma.rn.ftz.f32 %f233, %f107, %f230, %f232;\n" -" fma.rn.ftz.f32 %f234, %f98, %f231, %f233;\n" -" mov.f32 %f235, %f234;\n" -" .loc 16 309 0\n" -" mul.ftz.f32 %f236, %f109, %f229;\n" -" fma.rn.ftz.f32 %f237, %f93, %f230, %f236;\n" -" fma.rn.ftz.f32 %f238, %f101, %f231, %f237;\n" -" mov.f32 %f239, %f238;\n" -" .loc 16 310 0\n" -" mul.ftz.f32 %f240, %f102, %f229;\n" -" fma.rn.ftz.f32 %f241, %f99, %f230, %f240;\n" -" fma.rn.ftz.f32 %f242, %f111, %f231, %f241;\n" -" mov.f32 %f243, %f242;\n" -" abs.ftz.f32 %f244, %f217;\n" -" abs.ftz.f32 %f245, %f201;\n" -" setp.gt.ftz.f32 %p5, %f244, %f245;\n" -" @!%p5 bra $Lt_0_52226;\n" -" .loc 16 314 0\n" -" mov.f32 %f202, %f217;\n" -" mov.f32 %f218, %f201;\n" -" .loc 16 315 0\n" -" mov.f32 %f206, %f221;\n" -" mov.f32 %f222, %f205;\n" -" .loc 16 316 0\n" -" mov.f32 %f210, %f225;\n" -" mov.f32 %f226, %f209;\n" -" .loc 16 317 0\n" -" mov.f32 %f181, %f176;\n" -" mov.f32 %f211, %f175;\n" -"$Lt_0_52226:\n" -" mov.f32 %f246, %f202;\n" -" abs.ftz.f32 %f247, %f246;\n" -" abs.ftz.f32 %f248, %f234;\n" -" setp.lt.ftz.f32 %p6, %f247, %f248;\n" -" @!%p6 bra $Lt_0_52738;\n" -" .loc 16 321 0\n" -" mov.f32 %f202, %f234;\n" -" mov.f32 %f235, %f246;\n" -" .loc 16 322 0\n" -" mov.f32 %f249, %f206;\n" -" mov.f32 %f206, %f238;\n" -" mov.f32 %f239, %f249;\n" -" .loc 16 323 0\n" -" mov.f32 %f250, %f210;\n" -" mov.f32 %f210, %f242;\n" -" mov.f32 %f243, %f250;\n" -" .loc 16 324 0\n" -" mov.f32 %f251, %f181;\n" -" mov.f32 %f181, %f227;\n" -" mov.f32 %f228, %f251;\n" -"$Lt_0_52738:\n" -" mov.f32 %f252, %f202;\n" -" mov.f32 %f253, 0f00000000; \n" -" setp.neu.ftz.f32 %p7, %f252, %f253;\n" -" @!%p7 bra $Lt_0_53506;\n" -" bra.uni $Lt_0_54274;\n" -"$Lt_0_53506:\n" -" mov.f32 %f254, 0f00000000; \n" -" setp.neu.ftz.f32 %p8, %f218, %f254;\n" -" @!%p8 bra $Lt_0_54018;\n" -" .loc 16 338 0\n" -" mov.f32 %f202, %f218;\n" -" mov.f32 %f218, %f252;\n" -" .loc 16 339 0\n" -" mov.f32 %f255, %f206;\n" -" mov.f32 %f206, %f222;\n" -" mov.f32 %f222, %f255;\n" -" .loc 16 340 0\n" -" mov.f32 %f256, %f210;\n" -" mov.f32 %f210, %f226;\n" -" mov.f32 %f226, %f256;\n" -" .loc 16 341 0\n" -" mov.f32 %f257, %f181;\n" -" mov.f32 %f181, %f211;\n" -" mov.f32 %f211, %f257;\n" -" bra.uni $Lt_0_54274;\n" -"$Lt_0_54018:\n" -" mov.f32 %f258, 0f00000000; \n" -" setp.neu.ftz.f32 %p9, %f235, %f258;\n" -" @!%p9 bra $Lt_0_54530;\n" -" .loc 16 346 0\n" -" mov.f32 %f202, %f235;\n" -" mov.f32 %f235, %f252;\n" -" .loc 16 347 0\n" -" mov.f32 %f259, %f206;\n" -" mov.f32 %f206, %f239;\n" -" mov.f32 %f239, %f259;\n" -" .loc 16 348 0\n" -" mov.f32 %f260, %f210;\n" -" mov.f32 %f210, %f243;\n" -" mov.f32 %f243, %f260;\n" -" .loc 16 349 0\n" -" mov.f32 %f261, %f181;\n" -" mov.f32 %f181, %f228;\n" -" mov.f32 %f228, %f261;\n" -" bra.uni $Lt_0_54274;\n" -"$Lt_0_54530:\n" -" .loc 16 352 0\n" -" mov.s32 %r30, 2;\n" -" ld.param.u64 %rd40, [__cudaparm_kernel_ellipsoid_sphere_err_flag];\n" -" st.global.s32 [%rd40+0], %r30;\n" -"$Lt_0_54274:\n" -"$Lt_0_53762:\n" -"$Lt_0_53250:\n" -" .loc 16 355 0\n" -" div.approx.ftz.f32 %f262, %f218, %f202;\n" -" mul.ftz.f32 %f263, %f206, %f262;\n" -" sub.ftz.f32 %f264, %f222, %f263;\n" -" mov.f32 %f222, %f264;\n" -" .loc 16 356 0\n" -" mul.ftz.f32 %f265, %f210, %f262;\n" -" sub.ftz.f32 %f266, %f226, %f265;\n" -" mov.f32 %f226, %f266;\n" -" .loc 16 357 0\n" -" mul.ftz.f32 %f267, %f181, %f262;\n" -" sub.ftz.f32 %f268, %f211, %f267;\n" -" mov.f32 %f211, %f268;\n" -" .loc 16 359 0\n" -" div.approx.ftz.f32 %f269, %f235, %f202;\n" -" mul.ftz.f32 %f270, %f206, %f269;\n" -" sub.ftz.f32 %f239, %f239, %f270;\n" -" .loc 16 360 0\n" -" mul.ftz.f32 %f271, %f210, %f269;\n" -" sub.ftz.f32 %f243, %f243, %f271;\n" -" .loc 16 361 0\n" -" mul.ftz.f32 %f272, %f181, %f269;\n" -" sub.ftz.f32 %f228, %f228, %f272;\n" -" abs.ftz.f32 %f273, %f264;\n" -" abs.ftz.f32 %f274, %f239;\n" -" setp.lt.ftz.f32 %p10, %f273, %f274;\n" -" @!%p10 bra $Lt_0_54786;\n" -" .loc 16 366 0\n" -" mov.f32 %f222, %f239;\n" -" mov.f32 %f239, %f264;\n" -" .loc 16 367 0\n" -" mov.f32 %f226, %f243;\n" -" mov.f32 %f243, %f266;\n" -" .loc 16 368 0\n" -" mov.f32 %f211, %f228;\n" -" mov.f32 %f228, %f268;\n" -"$Lt_0_54786:\n" -" mov.f32 %f275, %f222;\n" -" mov.f32 %f276, 0f00000000; \n" -" setp.neu.ftz.f32 %p11, %f275, %f276;\n" -" @!%p11 bra $Lt_0_55554;\n" -" bra.uni $Lt_0_55810;\n" -"$Lt_0_55554:\n" -" mov.f32 %f277, 0f00000000; \n" -" setp.neu.ftz.f32 %p12, %f239, %f277;\n" -" @!%p12 bra $Lt_0_55810;\n" -" .loc 16 383 0\n" -" mov.f32 %f222, %f239;\n" -" mov.f32 %f239, %f275;\n" -" .loc 16 384 0\n" -" mov.f32 %f278, %f226;\n" -" mov.f32 %f226, %f243;\n" -" mov.f32 %f243, %f278;\n" -" .loc 16 385 0\n" -" mov.f32 %f279, %f211;\n" -" mov.f32 %f211, %f228;\n" -" mov.f32 %f228, %f279;\n" -"$Lt_0_55810:\n" -"$Lt_0_55298:\n" -" .loc 16 390 0\n" -" div.approx.ftz.f32 %f280, %f239, %f222;\n" -" mul.ftz.f32 %f281, %f226, %f280;\n" -" sub.ftz.f32 %f243, %f243, %f281;\n" -" .loc 16 391 0\n" -" mul.ftz.f32 %f282, %f211, %f280;\n" -" sub.ftz.f32 %f228, %f228, %f282;\n" -" mov.f32 %f283, 0f00000000; \n" -" setp.eq.ftz.f32 %p13, %f243, %f283;\n" -" @!%p13 bra $Lt_0_56322;\n" -" .loc 16 394 0\n" -" mov.s32 %r31, 2;\n" -" ld.param.u64 %rd41, [__cudaparm_kernel_ellipsoid_sphere_err_flag];\n" -" st.global.s32 [%rd41+0], %r31;\n" -"$Lt_0_56322:\n" -" .loc 16 396 0\n" -" div.approx.ftz.f32 %f284, %f228, %f243;\n" -" .loc 16 399 0\n" -" mul.ftz.f32 %f285, %f284, %f226;\n" -" sub.ftz.f32 %f286, %f211, %f285;\n" -" div.approx.ftz.f32 %f287, %f286, %f222;\n" -" .loc 16 403 0\n" -" mul.ftz.f32 %f288, %f287, %f206;\n" -" fma.rn.ftz.f32 %f289, %f210, %f284, %f288;\n" -" sub.ftz.f32 %f290, %f181, %f289;\n" -" div.approx.ftz.f32 %f291, %f290, %f202;\n" -" .loc 17 124 0\n" -" mul.ftz.f32 %f292, %f287, %f176;\n" -" fma.rn.ftz.f32 %f293, %f175, %f291, %f292;\n" -" fma.rn.ftz.f32 %f294, %f227, %f284, %f293;\n" -" mov.f32 %f295, 0f3f000000; \n" -" mul.ftz.f32 %f296, %f294, %f295;\n" -" rsqrt.approx.ftz.f32 %f297, %f296;\n" -" .loc 16 299 0\n" -" mov.f32 %f181, %f175;\n" -" .loc 16 300 0\n" -" mov.f32 %f202, %f146;\n" -" .loc 16 301 0\n" -" mov.f32 %f206, %f138;\n" -" .loc 16 302 0\n" -" mov.f32 %f210, %f137;\n" -" .loc 16 303 0\n" -" mov.f32 %f211, %f176;\n" -" .loc 16 304 0\n" -" mov.f32 %f218, %f147;\n" -" .loc 16 305 0\n" -" mov.f32 %f222, %f156;\n" -" .loc 16 306 0\n" -" mov.f32 %f226, %f149;\n" -" .loc 16 307 0\n" -" mov.f32 %f228, %f227;\n" -" .loc 16 308 0\n" -" mov.f32 %f235, %f142;\n" -" .loc 16 309 0\n" -" mov.f32 %f239, %f143;\n" -" .loc 16 310 0\n" -" mov.f32 %f243, %f152;\n" -" @!%p4 bra $Lt_0_56834;\n" -" .loc 16 314 0\n" -" mov.f32 %f202, %f147;\n" -" mov.f32 %f218, %f146;\n" -" .loc 16 315 0\n" -" mov.f32 %f206, %f156;\n" -" mov.f32 %f222, %f138;\n" -" .loc 16 316 0\n" -" mov.f32 %f210, %f149;\n" -" mov.f32 %f226, %f137;\n" -" .loc 16 317 0\n" -" mov.f32 %f181, %f176;\n" -" mov.f32 %f211, %f175;\n" -"$Lt_0_56834:\n" -" mov.f32 %f298, %f202;\n" -" abs.ftz.f32 %f299, %f298;\n" -" setp.gt.ftz.f32 %p14, %f150, %f299;\n" -" @!%p14 bra $Lt_0_57346;\n" -" .loc 16 321 0\n" -" mov.f32 %f202, %f142;\n" -" mov.f32 %f235, %f298;\n" -" .loc 16 322 0\n" -" mov.f32 %f300, %f206;\n" -" mov.f32 %f206, %f143;\n" -" mov.f32 %f239, %f300;\n" -" .loc 16 323 0\n" -" mov.f32 %f301, %f210;\n" -" mov.f32 %f210, %f152;\n" -" mov.f32 %f243, %f301;\n" -" .loc 16 324 0\n" -" mov.f32 %f302, %f181;\n" -" mov.f32 %f181, %f227;\n" -" mov.f32 %f228, %f302;\n" -"$Lt_0_57346:\n" -" mov.f32 %f303, %f202;\n" -" mov.f32 %f304, 0f00000000; \n" -" setp.neu.ftz.f32 %p15, %f303, %f304;\n" -" @!%p15 bra $Lt_0_58114;\n" -" bra.uni $Lt_0_58882;\n" -"$Lt_0_58114:\n" -" mov.f32 %f305, 0f00000000; \n" -" setp.neu.ftz.f32 %p16, %f218, %f305;\n" -" @!%p16 bra $Lt_0_58626;\n" -" .loc 16 338 0\n" -" mov.f32 %f202, %f218;\n" -" mov.f32 %f218, %f303;\n" -" .loc 16 339 0\n" -" mov.f32 %f306, %f206;\n" -" mov.f32 %f206, %f222;\n" -" mov.f32 %f222, %f306;\n" -" .loc 16 340 0\n" -" mov.f32 %f307, %f210;\n" -" mov.f32 %f210, %f226;\n" -" mov.f32 %f226, %f307;\n" -" .loc 16 341 0\n" -" mov.f32 %f308, %f181;\n" -" mov.f32 %f181, %f211;\n" -" mov.f32 %f211, %f308;\n" -" bra.uni $Lt_0_58882;\n" -"$Lt_0_58626:\n" -" mov.f32 %f309, 0f00000000; \n" -" setp.neu.ftz.f32 %p17, %f235, %f309;\n" -" @!%p17 bra $Lt_0_59138;\n" -" .loc 16 346 0\n" -" mov.f32 %f202, %f235;\n" -" mov.f32 %f235, %f303;\n" -" .loc 16 347 0\n" -" mov.f32 %f310, %f206;\n" -" mov.f32 %f206, %f239;\n" -" mov.f32 %f239, %f310;\n" -" .loc 16 348 0\n" -" mov.f32 %f311, %f210;\n" -" mov.f32 %f210, %f243;\n" -" mov.f32 %f243, %f311;\n" -" .loc 16 349 0\n" -" mov.f32 %f312, %f181;\n" -" mov.f32 %f181, %f228;\n" -" mov.f32 %f228, %f312;\n" -" bra.uni $Lt_0_58882;\n" -"$Lt_0_59138:\n" -" .loc 16 352 0\n" -" mov.s32 %r32, 2;\n" -" ld.param.u64 %rd42, [__cudaparm_kernel_ellipsoid_sphere_err_flag];\n" -" st.global.s32 [%rd42+0], %r32;\n" -"$Lt_0_58882:\n" -"$Lt_0_58370:\n" -"$Lt_0_57858:\n" -" .loc 16 355 0\n" -" div.approx.ftz.f32 %f313, %f218, %f202;\n" -" mul.ftz.f32 %f314, %f206, %f313;\n" -" sub.ftz.f32 %f315, %f222, %f314;\n" -" mov.f32 %f222, %f315;\n" -" .loc 16 356 0\n" -" mul.ftz.f32 %f316, %f210, %f313;\n" -" sub.ftz.f32 %f317, %f226, %f316;\n" -" mov.f32 %f226, %f317;\n" -" .loc 16 357 0\n" -" mul.ftz.f32 %f318, %f181, %f313;\n" -" sub.ftz.f32 %f319, %f211, %f318;\n" -" mov.f32 %f211, %f319;\n" -" .loc 16 359 0\n" -" div.approx.ftz.f32 %f320, %f235, %f202;\n" -" mul.ftz.f32 %f321, %f206, %f320;\n" -" sub.ftz.f32 %f239, %f239, %f321;\n" -" .loc 16 360 0\n" -" mul.ftz.f32 %f322, %f210, %f320;\n" -" sub.ftz.f32 %f243, %f243, %f322;\n" -" .loc 16 361 0\n" -" mul.ftz.f32 %f323, %f181, %f320;\n" -" sub.ftz.f32 %f228, %f228, %f323;\n" -" abs.ftz.f32 %f324, %f315;\n" -" abs.ftz.f32 %f325, %f239;\n" -" setp.lt.ftz.f32 %p18, %f324, %f325;\n" -" @!%p18 bra $Lt_0_59394;\n" -" .loc 16 366 0\n" -" mov.f32 %f222, %f239;\n" -" mov.f32 %f239, %f315;\n" -" .loc 16 367 0\n" -" mov.f32 %f226, %f243;\n" -" mov.f32 %f243, %f317;\n" -" .loc 16 368 0\n" -" mov.f32 %f211, %f228;\n" -" mov.f32 %f228, %f319;\n" -"$Lt_0_59394:\n" -" mov.f32 %f326, %f222;\n" -" mov.f32 %f327, 0f00000000; \n" -" setp.neu.ftz.f32 %p19, %f326, %f327;\n" -" @!%p19 bra $Lt_0_60162;\n" -" bra.uni $Lt_0_60418;\n" -"$Lt_0_60162:\n" -" mov.f32 %f328, 0f00000000; \n" -" setp.neu.ftz.f32 %p20, %f239, %f328;\n" -" @!%p20 bra $Lt_0_60418;\n" -" .loc 16 383 0\n" -" mov.f32 %f222, %f239;\n" -" mov.f32 %f239, %f326;\n" -" .loc 16 384 0\n" -" mov.f32 %f329, %f226;\n" -" mov.f32 %f226, %f243;\n" -" mov.f32 %f243, %f329;\n" -" .loc 16 385 0\n" -" mov.f32 %f330, %f211;\n" -" mov.f32 %f211, %f228;\n" -" mov.f32 %f228, %f330;\n" -"$Lt_0_60418:\n" -"$Lt_0_59906:\n" -" .loc 16 390 0\n" -" div.approx.ftz.f32 %f331, %f239, %f222;\n" -" mul.ftz.f32 %f332, %f226, %f331;\n" -" sub.ftz.f32 %f243, %f243, %f332;\n" -" .loc 16 391 0\n" -" mul.ftz.f32 %f333, %f211, %f331;\n" -" sub.ftz.f32 %f228, %f228, %f333;\n" -" mov.f32 %f334, 0f00000000; \n" -" setp.eq.ftz.f32 %p21, %f243, %f334;\n" -" @!%p21 bra $Lt_0_60930;\n" -" .loc 16 394 0\n" -" mov.s32 %r33, 2;\n" -" ld.param.u64 %rd43, [__cudaparm_kernel_ellipsoid_sphere_err_flag];\n" -" st.global.s32 [%rd43+0], %r33;\n" -"$Lt_0_60930:\n" -" .loc 17 133 0\n" -" div.approx.ftz.f32 %f335, %f228, %f243;\n" -" mul.ftz.f32 %f336, %f335, %f226;\n" -" sub.ftz.f32 %f337, %f211, %f336;\n" -" div.approx.ftz.f32 %f338, %f337, %f222;\n" -" mul.ftz.f32 %f339, %f338, %f206;\n" -" fma.rn.ftz.f32 %f340, %f210, %f335, %f339;\n" -" mul.ftz.f32 %f341, %f338, %f176;\n" -" sub.ftz.f32 %f342, %f181, %f340;\n" -" div.approx.ftz.f32 %f343, %f342, %f202;\n" -" fma.rn.ftz.f32 %f344, %f175, %f343, %f341;\n" -" fma.rn.ftz.f32 %f345, %f227, %f335, %f344;\n" -" add.ftz.f32 %f346, %f345, %f345;\n" -" .loc 17 141 0\n" -" rcp.approx.ftz.f32 %f347, %f174;\n" -" sub.ftz.f32 %f348, %f347, %f297;\n" -" mov.f32 %f349, 0f3f000000; \n" -" mul.ftz.f32 %f350, %f348, %f349;\n" -" mul.ftz.f32 %f351, %f348, %f348;\n" -" mul.ftz.f32 %f352, %f348, %f351;\n" -" add.ftz.f32 %f353, %f350, %f64;\n" -" add.ftz.f32 %f354, %f350, %f62;\n" -" add.ftz.f32 %f355, %f350, %f63;\n" -" mul.ftz.f32 %f356, %f354, %f355;\n" -" mul.ftz.f32 %f357, %f353, %f356;\n" -" mul.ftz.f32 %f358, %f352, %f357;\n" -" .loc 17 142 0\n" -" div.approx.ftz.f32 %f359, %f177, %f348;\n" -" mul.ftz.f32 %f360, %f359, %f346;\n" -" mov.f32 %f361, 0f3f800000; \n" -" mov.f32 %f362, 0f40400000; \n" -" fma.rn.ftz.f32 %f363, %f362, %f360, %f361;\n" -" mul.ftz.f32 %f364, %f91, %f363;\n" -" .loc 17 146 0\n" -" div.approx.ftz.f32 %f365, %f348, %f17;\n" -" add.ftz.f32 %f366, %f365, %f64;\n" -" add.ftz.f32 %f367, %f365, %f62;\n" -" add.ftz.f32 %f368, %f365, %f63;\n" -" mul.ftz.f32 %f369, %f367, %f368;\n" -" mul.ftz.f32 %f370, %f366, %f369;\n" -" mul.ftz.f32 %f371, %f352, %f370;\n" -" .loc 17 148 0\n" -" mov.f32 %f372, 0f3f800000; \n" -" mov.f32 %f373, 0f3f4db6db; \n" -" fma.rn.ftz.f32 %f374, %f373, %f360, %f372;\n" -" mul.ftz.f32 %f375, %f91, %f374;\n" -" .loc 17 150 0\n" -" mul.ftz.f32 %f376, %f359, %f359;\n" -" mul.ftz.f32 %f377, %f359, %f376;\n" -" mul.ftz.f32 %f378, %f377, %f377;\n" -" .loc 17 153 0\n" -" mul.ftz.f32 %f379, %f177, %f177;\n" -" mov.f32 %f380, 0f41000000; \n" -" div.approx.ftz.f32 %f381, %f358, %f380;\n" -" mov.f32 %f382, 0f42700000; \n" -" div.approx.ftz.f32 %f383, %f371, %f382;\n" -" mul.ftz.f32 %f384, %f379, %f177;\n" -" div.approx.ftz.f32 %f385, %f364, %f381;\n" -" div.approx.ftz.f32 %f386, %f375, %f383;\n" -" mul.ftz.f32 %f387, %f385, %f180;\n" -" mul.ftz.f32 %f388, %f386, %f180;\n" -" mul.ftz.f32 %f389, %f384, %f387;\n" -" mul.ftz.f32 %f390, %f384, %f388;\n" -" mul.ftz.f32 %f391, %f389, %f41;\n" -" mul.ftz.f32 %f392, %f390, %f378;\n" -" mul.ftz.f32 %f393, %f392, %f45;\n" -" add.ftz.f32 %f394, %f391, %f393;\n" -" add.ftz.f32 %f163, %f163, %f394;\n" -" .loc 17 160 0\n" -" mov.f32 %f395, 0f40800000; \n" -" mul.ftz.f32 %f396, %f343, %f395;\n" -" .loc 17 167 0\n" -" mov.f32 %f397, 0f40400000; \n" -" div.approx.ftz.f32 %f398, %f397, %f348;\n" -" add.ftz.f32 %f399, %f80, %f348;\n" -" rcp.approx.ftz.f32 %f400, %f399;\n" -" add.ftz.f32 %f401, %f81, %f348;\n" -" rcp.approx.ftz.f32 %f402, %f401;\n" -" add.ftz.f32 %f403, %f400, %f402;\n" -" add.ftz.f32 %f404, %f82, %f348;\n" -" rcp.approx.ftz.f32 %f405, %f404;\n" -" add.ftz.f32 %f406, %f403, %f405;\n" -" add.ftz.f32 %f407, %f398, %f406;\n" -" .loc 17 172 0\n" -" mul.ftz.f32 %f408, %f177, %f346;\n" -" mov.f32 %f409, 0f40400000; \n" -" fma.rn.ftz.f32 %f410, %f409, %f408, %f348;\n" -" rcp.approx.ftz.f32 %f411, %f410;\n" -" rcp.approx.ftz.f32 %f412, %f348;\n" -" sub.ftz.f32 %f413, %f412, %f411;\n" -" add.ftz.f32 %f414, %f407, %f413;\n" -" .loc 17 175 0\n" -" fma.rn.ftz.f32 %f415, %f17, %f62, %f348;\n" -" rcp.approx.ftz.f32 %f416, %f415;\n" -" fma.rn.ftz.f32 %f417, %f17, %f63, %f348;\n" -" rcp.approx.ftz.f32 %f418, %f417;\n" -" add.ftz.f32 %f419, %f416, %f418;\n" -" fma.rn.ftz.f32 %f420, %f17, %f64, %f348;\n" -" rcp.approx.ftz.f32 %f421, %f420;\n" -" add.ftz.f32 %f422, %f419, %f421;\n" -" add.ftz.f32 %f423, %f398, %f422;\n" -" .loc 17 186 0\n" -" mul.ftz.f32 %f424, %f175, %f175;\n" -" neg.ftz.f32 %f425, %f424;\n" -" mov.f32 %f426, %f425;\n" -" .loc 17 187 0\n" -" mul.ftz.f32 %f427, %f176, %f175;\n" -" neg.ftz.f32 %f428, %f427;\n" -" mov.f32 %f429, %f428;\n" -" .loc 17 188 0\n" -" mul.ftz.f32 %f430, %f227, %f175;\n" -" neg.ftz.f32 %f431, %f430;\n" -" mov.f32 %f432, %f431;\n" -" .loc 17 189 0\n" -" mov.f32 %f433, 0f3f800000; \n" -" sub.ftz.f32 %f434, %f433, %f424;\n" -" mov.f32 %f435, %f434;\n" -" .loc 17 190 0\n" -" mul.ftz.f32 %f436, %f174, %f434;\n" -" mov.f32 %f437, %f436;\n" -" .loc 17 191 0\n" -" mov.f32 %f438, %f429;\n" -" mul.ftz.f32 %f439, %f438, %f174;\n" -" mov.f32 %f440, %f439;\n" -" .loc 17 192 0\n" -" mov.f32 %f441, %f432;\n" -" mul.ftz.f32 %f442, %f441, %f174;\n" -" mov.f32 %f443, %f442;\n" -" .loc 17 196 0\n" -" mul.ftz.f32 %f444, %f297, %f297;\n" -" mov.f32 %f445, 0f3f4db6db; \n" -" mul.ftz.f32 %f446, %f177, %f445;\n" -" mov.f32 %f447, 0f40800000; \n" -" mul.ftz.f32 %f448, %f335, %f447;\n" -" mul.ftz.f32 %f449, %f444, %f297;\n" -" mov.f32 %f450, 0f3f000000; \n" -" mul.ftz.f32 %f451, %f449, %f450;\n" -" mul.ftz.f32 %f452, %f451, %f287;\n" -" mul.ftz.f32 %f453, %f451, %f291;\n" -" mul.ftz.f32 %f454, %f451, %f284;\n" -" mov.f32 %f455, 0f40800000; \n" -" mul.ftz.f32 %f456, %f338, %f455;\n" -" mul.ftz.f32 %f457, %f452, %f439;\n" -" mul.ftz.f32 %f458, %f456, %f439;\n" -" mov.f32 %f459, 0f40e00000; \n" -" div.approx.ftz.f32 %f460, %f459, %f348;\n" -" mov.f32 %f461, 0f3f4db6db; \n" -" fma.rn.ftz.f32 %f462, %f461, %f408, %f348;\n" -" rcp.approx.ftz.f32 %f463, %f462;\n" -" fma.rn.ftz.f32 %f464, %f453, %f436, %f457;\n" -" fma.rn.ftz.f32 %f465, %f396, %f436, %f458;\n" -" sub.ftz.f32 %f466, %f460, %f463;\n" -" mul.ftz.f32 %f467, %f446, %f463;\n" -" fma.rn.ftz.f32 %f468, %f454, %f442, %f464;\n" -" fma.rn.ftz.f32 %f469, %f448, %f442, %f465;\n" -" add.ftz.f32 %f470, %f466, %f423;\n" -" add.ftz.f32 %f471, %f468, %f175;\n" -" mul.ftz.f32 %f472, %f470, %f471;\n" -" mul.ftz.f32 %f473, %f467, %f469;\n" -" sub.ftz.f32 %f474, %f473, %f472;\n" -" .loc 17 197 0\n" -" mov.f32 %f475, 0f40400000; \n" -" mul.ftz.f32 %f476, %f177, %f475;\n" -" mul.ftz.f32 %f477, %f476, %f411;\n" -" mul.ftz.f32 %f478, %f393, %f474;\n" -" mul.ftz.f32 %f479, %f471, %f414;\n" -" mul.ftz.f32 %f480, %f477, %f469;\n" -" sub.ftz.f32 %f481, %f480, %f479;\n" -" fma.rn.ftz.f32 %f482, %f391, %f481, %f478;\n" -" .loc 17 199 0\n" -" add.ftz.f32 %f162, %f482, %f162;\n" -" @!%p3 bra $Lt_0_61954;\n" -" .loc 17 201 0\n" -" mov.f32 %f483, %f47;\n" -" mul.ftz.f32 %f484, %f169, %f482;\n" -" sub.ftz.f32 %f485, %f483, %f484;\n" -" mov.f32 %f47, %f485;\n" -"$Lt_0_61954:\n" -" .loc 17 186 0\n" -" mov.f32 %f486, %f428;\n" -" .loc 17 187 0\n" -" mul.ftz.f32 %f487, %f176, %f176;\n" -" neg.ftz.f32 %f488, %f487;\n" -" mov.f32 %f489, %f488;\n" -" .loc 17 188 0\n" -" mul.ftz.f32 %f490, %f227, %f176;\n" -" neg.ftz.f32 %f491, %f490;\n" -" mov.f32 %f492, %f491;\n" -" .loc 17 189 0\n" -" mov.f32 %f493, 0f3f800000; \n" -" sub.ftz.f32 %f494, %f493, %f487;\n" -" mov.f32 %f495, %f494;\n" -" .loc 17 190 0\n" -" mov.f32 %f496, %f486;\n" -" mul.ftz.f32 %f497, %f496, %f174;\n" -" mov.f32 %f498, %f497;\n" -" .loc 17 191 0\n" -" mul.ftz.f32 %f499, %f174, %f494;\n" -" mov.f32 %f500, %f499;\n" -" .loc 17 192 0\n" -" mov.f32 %f501, %f492;\n" -" mul.ftz.f32 %f502, %f501, %f174;\n" -" mov.f32 %f503, %f502;\n" -" .loc 17 196 0\n" -" mul.ftz.f32 %f504, %f452, %f499;\n" -" mul.ftz.f32 %f505, %f456, %f499;\n" -" fma.rn.ftz.f32 %f506, %f453, %f497, %f504;\n" -" fma.rn.ftz.f32 %f507, %f396, %f497, %f505;\n" -" fma.rn.ftz.f32 %f508, %f454, %f502, %f506;\n" -" fma.rn.ftz.f32 %f509, %f448, %f502, %f507;\n" -" add.ftz.f32 %f510, %f508, %f176;\n" -" mul.ftz.f32 %f511, %f470, %f510;\n" -" mul.ftz.f32 %f512, %f467, %f509;\n" -" sub.ftz.f32 %f513, %f512, %f511;\n" -" .loc 17 197 0\n" -" mul.ftz.f32 %f514, %f393, %f513;\n" -" mul.ftz.f32 %f515, %f510, %f414;\n" -" mul.ftz.f32 %f516, %f477, %f509;\n" -" sub.ftz.f32 %f517, %f516, %f515;\n" -" fma.rn.ftz.f32 %f482, %f391, %f517, %f514;\n" -" .loc 17 203 0\n" -" add.ftz.f32 %f161, %f482, %f161;\n" -" @!%p3 bra $Lt_0_65538;\n" -" .loc 17 205 0\n" -" mov.f32 %f518, %f49;\n" -" mul.ftz.f32 %f519, %f168, %f482;\n" -" sub.ftz.f32 %f520, %f518, %f519;\n" -" mov.f32 %f49, %f520;\n" -" .loc 17 206 0\n" -" mov.f32 %f521, %f53;\n" -" mul.ftz.f32 %f522, %f169, %f482;\n" -" sub.ftz.f32 %f523, %f521, %f522;\n" -" mov.f32 %f53, %f523;\n" -"$Lt_0_65538:\n" -" .loc 17 186 0\n" -" mov.f32 %f524, %f431;\n" -" .loc 17 187 0\n" -" mov.f32 %f525, %f491;\n" -" .loc 17 188 0\n" -" mul.ftz.f32 %f526, %f227, %f227;\n" -" neg.ftz.f32 %f527, %f526;\n" -" mov.f32 %f528, %f527;\n" -" .loc 17 189 0\n" -" mov.f32 %f529, 0f3f800000; \n" -" sub.ftz.f32 %f530, %f529, %f526;\n" -" mov.f32 %f531, %f530;\n" -" .loc 17 190 0\n" -" mov.f32 %f532, %f524;\n" -" mul.ftz.f32 %f533, %f532, %f174;\n" -" mov.f32 %f534, %f533;\n" -" .loc 17 191 0\n" -" mov.f32 %f535, %f525;\n" -" mul.ftz.f32 %f536, %f535, %f174;\n" -" mov.f32 %f537, %f536;\n" -" .loc 17 192 0\n" -" mul.ftz.f32 %f538, %f174, %f530;\n" -" mov.f32 %f539, %f538;\n" -" .loc 17 196 0\n" -" mul.ftz.f32 %f540, %f452, %f536;\n" -" mul.ftz.f32 %f541, %f456, %f536;\n" -" fma.rn.ftz.f32 %f542, %f453, %f533, %f540;\n" -" fma.rn.ftz.f32 %f543, %f396, %f533, %f541;\n" -" fma.rn.ftz.f32 %f544, %f454, %f538, %f542;\n" -" fma.rn.ftz.f32 %f545, %f448, %f538, %f543;\n" -" add.ftz.f32 %f546, %f544, %f227;\n" -" mul.ftz.f32 %f547, %f546, %f470;\n" -" mul.ftz.f32 %f548, %f467, %f545;\n" -" sub.ftz.f32 %f549, %f548, %f547;\n" -" .loc 17 197 0\n" -" mul.ftz.f32 %f550, %f393, %f549;\n" -" mul.ftz.f32 %f551, %f546, %f414;\n" -" mul.ftz.f32 %f552, %f477, %f545;\n" -" sub.ftz.f32 %f553, %f552, %f551;\n" -" fma.rn.ftz.f32 %f482, %f391, %f553, %f550;\n" -" .loc 17 209 0\n" -" add.ftz.f32 %f160, %f482, %f160;\n" -" @!%p3 bra $Lt_0_68610;\n" -" .loc 17 211 0\n" -" mov.f32 %f554, %f51;\n" -" mul.ftz.f32 %f555, %f170, %f482;\n" -" sub.ftz.f32 %f556, %f554, %f555;\n" -" mov.f32 %f51, %f556;\n" -" .loc 17 212 0\n" -" mov.f32 %f557, %f55;\n" -" mul.ftz.f32 %f558, %f169, %f482;\n" -" sub.ftz.f32 %f559, %f557, %f558;\n" -" mov.f32 %f55, %f559;\n" -" .loc 17 213 0\n" -" mul.ftz.f32 %f560, %f168, %f482;\n" -" sub.ftz.f32 %f56, %f56, %f560;\n" -" mov.f32 %f57, %f56;\n" -"$Lt_0_68610:\n" -" .loc 17 232 0\n" -" mul.ftz.f32 %f561, %f102, %f338;\n" -" mul.ftz.f32 %f562, %f100, %f338;\n" -" mul.ftz.f32 %f563, %f111, %f338;\n" -" mov.f32 %f564, 0f00000000; \n" -" mov.f32 %f565, 0f00000000; \n" -" fma.rn.ftz.f32 %f566, %f565, %f212, %f564;\n" -" mov.f32 %f567, 0f00000000; \n" -" mov.f32 %f568, 0f00000000; \n" -" fma.rn.ftz.f32 %f569, %f568, %f230, %f567;\n" -" mov.f32 %f570, 0f00000000; \n" -" mov.f32 %f571, 0f00000000; \n" -" fma.rn.ftz.f32 %f572, %f571, %f198, %f570;\n" -" mul.ftz.f32 %f573, %f121, %f456;\n" -" mul.ftz.f32 %f574, %f105, %f456;\n" -" mul.ftz.f32 %f575, %f115, %f456;\n" -" neg.ftz.f32 %f576, %f561;\n" -" neg.ftz.f32 %f577, %f563;\n" -" neg.ftz.f32 %f578, %f207;\n" -" neg.ftz.f32 %f579, %f240;\n" -" mov.f32 %f580, 0f00000000; \n" -" fma.rn.ftz.f32 %f581, %f213, %f580, %f566;\n" -" mov.f32 %f582, 0f00000000; \n" -" fma.rn.ftz.f32 %f583, %f582, %f231, %f569;\n" -" mov.f32 %f584, 0f00000000; \n" -" fma.rn.ftz.f32 %f585, %f197, %f584, %f572;\n" -" neg.ftz.f32 %f586, %f223;\n" -" fma.rn.ftz.f32 %f587, %f198, %f100, %f578;\n" -" fma.rn.ftz.f32 %f588, %f100, %f230, %f579;\n" -" fma.rn.ftz.f32 %f589, %f100, %f212, %f586;\n" -" fma.rn.ftz.f32 %f590, %f197, %f123, %f587;\n" -" fma.rn.ftz.f32 %f591, %f123, %f231, %f588;\n" -" fma.rn.ftz.f32 %f592, %f213, %f123, %f589;\n" -" mov.f32 %f593, 0f00000000; \n" -" fma.rn.ftz.f32 %f594, %f343, %f593, %f576;\n" -" mov.f32 %f595, 0f00000000; \n" -" fma.rn.ftz.f32 %f596, %f595, %f343, %f562;\n" -" mov.f32 %f597, 0f00000000; \n" -" fma.rn.ftz.f32 %f598, %f343, %f597, %f577;\n" -" mul.ftz.f32 %f599, %f452, %f590;\n" -" mul.ftz.f32 %f600, %f452, %f591;\n" -" mul.ftz.f32 %f601, %f452, %f592;\n" -" fma.rn.ftz.f32 %f602, %f396, %f104, %f573;\n" -" fma.rn.ftz.f32 %f603, %f119, %f396, %f574;\n" -" fma.rn.ftz.f32 %f604, %f396, %f112, %f575;\n" -" fma.rn.ftz.f32 %f605, %f335, %f109, %f594;\n" -" fma.rn.ftz.f32 %f606, %f93, %f335, %f596;\n" -" fma.rn.ftz.f32 %f607, %f335, %f101, %f598;\n" -" fma.rn.ftz.f32 %f608, %f453, %f585, %f599;\n" -" fma.rn.ftz.f32 %f609, %f453, %f583, %f600;\n" -" fma.rn.ftz.f32 %f610, %f453, %f581, %f601;\n" -" fma.rn.ftz.f32 %f611, %f448, %f116, %f602;\n" -" fma.rn.ftz.f32 %f612, %f448, %f114, %f603;\n" -" fma.rn.ftz.f32 %f613, %f448, %f122, %f604;\n" -" fma.rn.ftz.f32 %f614, %f454, %f205, %f608;\n" -" fma.rn.ftz.f32 %f615, %f454, %f238, %f609;\n" -" fma.rn.ftz.f32 %f616, %f454, %f221, %f610;\n" -" mul.ftz.f32 %f617, %f605, %f611;\n" -" mul.ftz.f32 %f618, %f616, %f287;\n" -" fma.rn.ftz.f32 %f619, %f612, %f606, %f617;\n" -" fma.rn.ftz.f32 %f620, %f291, %f614, %f618;\n" -" fma.rn.ftz.f32 %f621, %f613, %f607, %f619;\n" -" fma.rn.ftz.f32 %f622, %f284, %f615, %f620;\n" -" neg.ftz.f32 %f623, %f621;\n" -" mul.ftz.f32 %f624, %f470, %f622;\n" -" fma.rn.ftz.f32 %f625, %f467, %f623, %f624;\n" -" mul.ftz.f32 %f626, %f393, %f625;\n" -" mul.ftz.f32 %f627, %f622, %f414;\n" -" fma.rn.ftz.f32 %f628, %f477, %f623, %f627;\n" -" fma.rn.ftz.f32 %f629, %f391, %f628, %f626;\n" -" sub.ftz.f32 %f159, %f159, %f629;\n" -" .loc 17 245 0\n" -" mul.ftz.f32 %f630, %f94, %f196;\n" -" mul.ftz.f32 %f631, %f94, %f229;\n" -" mov.f32 %f632, 0f00000000; \n" -" mov.f32 %f633, 0f00000000; \n" -" fma.rn.ftz.f32 %f634, %f198, %f633, %f632;\n" -" mul.ftz.f32 %f635, %f94, %f214;\n" -" fma.rn.ftz.f32 %f636, %f99, %f198, %f207;\n" -" fma.rn.ftz.f32 %f637, %f198, %f108, %f630;\n" -" fma.rn.ftz.f32 %f638, %f108, %f230, %f631;\n" -" mov.f32 %f639, 0f00000000; \n" -" fma.rn.ftz.f32 %f640, %f197, %f639, %f634;\n" -" fma.rn.ftz.f32 %f641, %f108, %f212, %f635;\n" -" fma.rn.ftz.f32 %f642, %f197, %f111, %f636;\n" -" fma.rn.ftz.f32 %f643, %f197, %f113, %f637;\n" -" fma.rn.ftz.f32 %f644, %f113, %f231, %f638;\n" -" mul.ftz.f32 %f645, %f452, %f581;\n" -" mul.ftz.f32 %f646, %f452, %f583;\n" -" mul.ftz.f32 %f647, %f452, %f640;\n" -" fma.rn.ftz.f32 %f648, %f213, %f113, %f641;\n" -" fma.rn.ftz.f32 %f649, %f453, %f242, %f646;\n" -" fma.rn.ftz.f32 %f650, %f453, %f642, %f647;\n" -" fma.rn.ftz.f32 %f651, %f453, %f225, %f645;\n" -" mov.f32 %f652, 0f00000000; \n" -" fma.rn.ftz.f32 %f653, %f343, %f102, %f652;\n" -" mov.f32 %f654, 0f00000000; \n" -" fma.rn.ftz.f32 %f655, %f99, %f343, %f654;\n" -" mov.f32 %f656, 0f00000000; \n" -" fma.rn.ftz.f32 %f657, %f343, %f111, %f656;\n" -" fma.rn.ftz.f32 %f658, %f454, %f644, %f649;\n" -" fma.rn.ftz.f32 %f659, %f454, %f643, %f650;\n" -" fma.rn.ftz.f32 %f660, %f454, %f648, %f651;\n" -" fma.rn.ftz.f32 %f661, %f335, %f94, %f653;\n" -" fma.rn.ftz.f32 %f662, %f108, %f335, %f655;\n" -" fma.rn.ftz.f32 %f663, %f335, %f113, %f657;\n" -" mul.ftz.f32 %f664, %f660, %f287;\n" -" fma.rn.ftz.f32 %f665, %f291, %f659, %f664;\n" -" mul.ftz.f32 %f666, %f661, %f611;\n" -" fma.rn.ftz.f32 %f667, %f284, %f658, %f665;\n" -" fma.rn.ftz.f32 %f668, %f612, %f662, %f666;\n" -" fma.rn.ftz.f32 %f669, %f613, %f663, %f668;\n" -" neg.ftz.f32 %f670, %f669;\n" -" mul.ftz.f32 %f671, %f470, %f667;\n" -" fma.rn.ftz.f32 %f672, %f467, %f670, %f671;\n" -" mul.ftz.f32 %f673, %f393, %f672;\n" -" mul.ftz.f32 %f674, %f667, %f414;\n" -" fma.rn.ftz.f32 %f675, %f477, %f670, %f674;\n" -" fma.rn.ftz.f32 %f676, %f391, %f675, %f673;\n" -" sub.ftz.f32 %f158, %f158, %f676;\n" -" .loc 17 258 0\n" -" mul.ftz.f32 %f677, %f92, %f338;\n" -" mul.ftz.f32 %f678, %f107, %f338;\n" -" mul.ftz.f32 %f679, %f98, %f338;\n" -" mul.ftz.f32 %f680, %f110, %f196;\n" -" mul.ftz.f32 %f681, %f110, %f229;\n" -" mul.ftz.f32 %f682, %f110, %f214;\n" -" fma.rn.ftz.f32 %f683, %f198, %f107, %f199;\n" -" fma.rn.ftz.f32 %f684, %f106, %f198, %f680;\n" -" fma.rn.ftz.f32 %f685, %f106, %f230, %f681;\n" -" fma.rn.ftz.f32 %f686, %f106, %f212, %f682;\n" -" fma.rn.ftz.f32 %f687, %f197, %f98, %f683;\n" -" fma.rn.ftz.f32 %f688, %f197, %f103, %f684;\n" -" fma.rn.ftz.f32 %f689, %f103, %f231, %f685;\n" -" fma.rn.ftz.f32 %f690, %f213, %f103, %f686;\n" -" mul.ftz.f32 %f691, %f452, %f687;\n" -" mul.ftz.f32 %f692, %f452, %f234;\n" -" mul.ftz.f32 %f693, %f452, %f217;\n" -" fma.rn.ftz.f32 %f694, %f343, %f110, %f677;\n" -" fma.rn.ftz.f32 %f695, %f106, %f343, %f678;\n" -" fma.rn.ftz.f32 %f696, %f343, %f103, %f679;\n" -" fma.rn.ftz.f32 %f697, %f453, %f688, %f691;\n" -" fma.rn.ftz.f32 %f698, %f453, %f689, %f692;\n" -" fma.rn.ftz.f32 %f699, %f453, %f690, %f693;\n" -" mov.f32 %f700, 0f00000000; \n" -" fma.rn.ftz.f32 %f701, %f335, %f700, %f694;\n" -" mov.f32 %f702, 0f00000000; \n" -" fma.rn.ftz.f32 %f703, %f702, %f335, %f695;\n" -" mov.f32 %f704, 0f00000000; \n" -" fma.rn.ftz.f32 %f705, %f335, %f704, %f696;\n" -" fma.rn.ftz.f32 %f706, %f454, %f640, %f697;\n" -" fma.rn.ftz.f32 %f707, %f454, %f583, %f698;\n" -" fma.rn.ftz.f32 %f708, %f454, %f581, %f699;\n" -" mul.ftz.f32 %f709, %f708, %f287;\n" -" mul.ftz.f32 %f710, %f701, %f611;\n" -" fma.rn.ftz.f32 %f711, %f291, %f706, %f709;\n" -" fma.rn.ftz.f32 %f712, %f612, %f703, %f710;\n" -" fma.rn.ftz.f32 %f713, %f284, %f707, %f711;\n" -" fma.rn.ftz.f32 %f714, %f613, %f705, %f712;\n" -" neg.ftz.f32 %f715, %f714;\n" -" mul.ftz.f32 %f716, %f470, %f713;\n" -" fma.rn.ftz.f32 %f717, %f467, %f715, %f716;\n" -" mul.ftz.f32 %f718, %f393, %f717;\n" -" mul.ftz.f32 %f719, %f713, %f414;\n" -" fma.rn.ftz.f32 %f720, %f477, %f715, %f719;\n" -" fma.rn.ftz.f32 %f721, %f391, %f720, %f718;\n" -" sub.ftz.f32 %f157, %f157, %f721;\n" -" mul.lo.s32 %r34, %r15, %r2;\n" -" cvt.s64.s32 %rd44, %r34;\n" -" mul.wide.s32 %rd45, %r34, 4;\n" -" add.u64 %rd25, %rd25, %rd45;\n" -" setp.gt.u64 %p22, %rd28, %rd25;\n" -" @%p22 bra $Lt_0_51970;\n" -" bra.uni $Lt_0_51458;\n" -"$Lt_0_75266:\n" -" mov.f32 %f157, 0f00000000; \n" -" mov.f32 %f158, 0f00000000; \n" -" mov.f32 %f159, 0f00000000; \n" -" mov.f32 %f160, 0f00000000; \n" -" mov.f32 %f161, 0f00000000; \n" -" mov.f32 %f162, 0f00000000; \n" -" mov.f32 %f163, 0f00000000; \n" -"$Lt_0_51458:\n" -" mov.u32 %r35, 1;\n" -" setp.le.s32 %p23, %r2, %r35;\n" -" @%p23 bra $Lt_0_71426;\n" -" .loc 17 267 0\n" -" mov.u64 %rd46, __cuda___cuda_local_var_33120_37_non_const_red_acc136;\n" -" cvt.s64.s32 %rd47, %r3;\n" -" mul.wide.s32 %rd48, %r3, 4;\n" -" add.u64 %rd49, %rd46, %rd48;\n" -" mov.f32 %f722, %f162;\n" -" st.shared.f32 [%rd49+0], %f722;\n" -" .loc 17 268 0\n" -" mov.f32 %f723, %f161;\n" -" st.shared.f32 [%rd49+512], %f723;\n" -" .loc 17 269 0\n" -" mov.f32 %f724, %f160;\n" -" st.shared.f32 [%rd49+1024], %f724;\n" -" .loc 17 270 0\n" -" mov.f32 %f725, %f159;\n" -" st.shared.f32 [%rd49+1536], %f725;\n" -" .loc 17 271 0\n" -" mov.f32 %f726, %f158;\n" -" st.shared.f32 [%rd49+2048], %f726;\n" -" .loc 17 272 0\n" -" mov.f32 %f727, %f157;\n" -" st.shared.f32 [%rd49+2560], %f727;\n" -" .loc 17 274 0\n" -" shr.s32 %r36, %r2, 31;\n" -" mov.s32 %r37, 1;\n" -" and.b32 %r38, %r36, %r37;\n" -" add.s32 %r39, %r38, %r2;\n" -" shr.s32 %r40, %r39, 1;\n" -" mov.s32 %r41, %r40;\n" -" mov.u32 %r42, 0;\n" -" setp.ne.u32 %p24, %r40, %r42;\n" -" @!%p24 bra $Lt_0_69890;\n" -"$Lt_0_70402:\n" -" setp.ge.u32 %p25, %r17, %r41;\n" -" @%p25 bra $Lt_0_70658;\n" -" .loc 17 277 0\n" -" add.u32 %r43, %r3, %r41;\n" -" cvt.u64.u32 %rd50, %r43;\n" -" mul.wide.u32 %rd51, %r43, 4;\n" -" add.u64 %rd52, %rd46, %rd51;\n" -" ld.shared.f32 %f728, [%rd52+0];\n" -" add.ftz.f32 %f722, %f728, %f722;\n" -" st.shared.f32 [%rd49+0], %f722;\n" -" ld.shared.f32 %f729, [%rd52+512];\n" -" add.ftz.f32 %f723, %f729, %f723;\n" -" st.shared.f32 [%rd49+512], %f723;\n" -" ld.shared.f32 %f730, [%rd52+1024];\n" -" add.ftz.f32 %f724, %f730, %f724;\n" -" st.shared.f32 [%rd49+1024], %f724;\n" -" ld.shared.f32 %f731, [%rd52+1536];\n" -" add.ftz.f32 %f725, %f731, %f725;\n" -" st.shared.f32 [%rd49+1536], %f725;\n" -" ld.shared.f32 %f732, [%rd52+2048];\n" -" add.ftz.f32 %f726, %f732, %f726;\n" -" st.shared.f32 [%rd49+2048], %f726;\n" -" ld.shared.f32 %f733, [%rd52+2560];\n" -" add.ftz.f32 %f727, %f733, %f727;\n" -" st.shared.f32 [%rd49+2560], %f727;\n" -"$Lt_0_70658:\n" -" .loc 17 274 0\n" -" shr.u32 %r41, %r41, 1;\n" -" mov.u32 %r44, 0;\n" -" setp.ne.u32 %p26, %r41, %r44;\n" -" @%p26 bra $Lt_0_70402;\n" -"$Lt_0_69890:\n" -" .loc 17 281 0\n" -" mov.f32 %f162, %f722;\n" -" .loc 17 282 0\n" -" mov.f32 %f161, %f723;\n" -" .loc 17 283 0\n" -" mov.f32 %f160, %f724;\n" -" .loc 17 284 0\n" -" mov.f32 %f159, %f725;\n" -" .loc 17 285 0\n" -" mov.f32 %f158, %f726;\n" -" .loc 17 286 0\n" -" mov.f32 %f157, %f727;\n" -" ld.param.s32 %r45, [__cudaparm_kernel_ellipsoid_sphere_eflag];\n" -" mov.s32 %r46, 0;\n" -" set.gt.u32.s32 %r47, %r45, %r46;\n" -" neg.s32 %r48, %r47;\n" -" ld.param.s32 %r49, [__cudaparm_kernel_ellipsoid_sphere_vflag];\n" -" mov.s32 %r50, 0;\n" -" set.gt.u32.s32 %r51, %r49, %r50;\n" -" neg.s32 %r52, %r51;\n" -" or.b32 %r53, %r48, %r52;\n" -" mov.u32 %r54, 0;\n" -" setp.eq.s32 %p27, %r53, %r54;\n" -" @%p27 bra $Lt_0_71426;\n" -" .loc 17 290 0\n" -" mov.f32 %f722, %f47;\n" -" st.shared.f32 [%rd49+0], %f722;\n" -" mov.f32 %f723, %f49;\n" -" st.shared.f32 [%rd49+512], %f723;\n" -" mov.f32 %f724, %f51;\n" -" st.shared.f32 [%rd49+1024], %f724;\n" -" mov.f32 %f725, %f53;\n" -" st.shared.f32 [%rd49+1536], %f725;\n" -" mov.f32 %f726, %f55;\n" -" st.shared.f32 [%rd49+2048], %f726;\n" -" mov.f32 %f727, %f56;\n" -" st.shared.f32 [%rd49+2560], %f727;\n" -" .loc 17 291 0\n" -" mov.f32 %f734, %f163;\n" -" st.shared.f32 [%rd49+3072], %f734;\n" -" .loc 17 293 0\n" -" mov.s32 %r55, %r40;\n" -" @!%p24 bra $Lt_0_71938;\n" -"$Lt_0_72450:\n" -" setp.ge.u32 %p28, %r17, %r55;\n" -" @%p28 bra $Lt_0_72706;\n" -" .loc 17 296 0\n" -" add.u32 %r56, %r3, %r55;\n" -" cvt.u64.u32 %rd53, %r56;\n" -" mul.wide.u32 %rd54, %r56, 4;\n" -" add.u64 %rd55, %rd46, %rd54;\n" -" ld.shared.f32 %f735, [%rd55+0];\n" -" add.ftz.f32 %f722, %f735, %f722;\n" -" st.shared.f32 [%rd49+0], %f722;\n" -" ld.shared.f32 %f736, [%rd55+512];\n" -" add.ftz.f32 %f723, %f736, %f723;\n" -" st.shared.f32 [%rd49+512], %f723;\n" -" ld.shared.f32 %f737, [%rd55+1024];\n" -" add.ftz.f32 %f724, %f737, %f724;\n" -" st.shared.f32 [%rd49+1024], %f724;\n" -" ld.shared.f32 %f738, [%rd55+1536];\n" -" add.ftz.f32 %f725, %f738, %f725;\n" -" st.shared.f32 [%rd49+1536], %f725;\n" -" ld.shared.f32 %f739, [%rd55+2048];\n" -" add.ftz.f32 %f726, %f739, %f726;\n" -" st.shared.f32 [%rd49+2048], %f726;\n" -" ld.shared.f32 %f740, [%rd55+2560];\n" -" add.ftz.f32 %f727, %f740, %f727;\n" -" st.shared.f32 [%rd49+2560], %f727;\n" -" ld.shared.f32 %f741, [%rd55+3072];\n" -" add.ftz.f32 %f734, %f741, %f734;\n" -" st.shared.f32 [%rd49+3072], %f734;\n" -"$Lt_0_72706:\n" -" .loc 17 293 0\n" -" shr.u32 %r55, %r55, 1;\n" -" mov.u32 %r57, 0;\n" -" setp.ne.u32 %p29, %r55, %r57;\n" -" @%p29 bra $Lt_0_72450;\n" -"$Lt_0_71938:\n" -" .loc 17 301 0\n" -" mov.f32 %f47, %f722;\n" -" mov.f32 %f49, %f723;\n" -" mov.f32 %f51, %f724;\n" -" mov.f32 %f53, %f725;\n" -" mov.f32 %f55, %f726;\n" -" mov.f32 %f57, %f727;\n" -" .loc 17 302 0\n" -" mov.f32 %f163, %f734;\n" -"$Lt_0_71426:\n" -"$Lt_0_69378:\n" -" mov.u32 %r58, 0;\n" -" setp.ne.s32 %p30, %r17, %r58;\n" -" @%p30 bra $Lt_0_73474;\n" -" .loc 17 308 0\n" -" ld.param.u64 %rd56, [__cudaparm_kernel_ellipsoid_sphere_engv];\n" -" add.u64 %rd57, %rd56, %rd3;\n" -" ld.param.s32 %r59, [__cudaparm_kernel_ellipsoid_sphere_astride];\n" -" ld.param.s32 %r60, [__cudaparm_kernel_ellipsoid_sphere_eflag];\n" -" mov.u32 %r61, 0;\n" -" setp.le.s32 %p31, %r60, %r61;\n" -" @%p31 bra $Lt_0_73986;\n" -" .loc 17 310 0\n" -" ld.global.f32 %f742, [%rd57+0];\n" -" add.ftz.f32 %f743, %f742, %f163;\n" -" st.global.f32 [%rd57+0], %f743;\n" -" .loc 17 311 0\n" -" cvt.s64.s32 %rd58, %r59;\n" -" mul.wide.s32 %rd59, %r59, 4;\n" -" add.u64 %rd57, %rd57, %rd59;\n" -"$Lt_0_73986:\n" -" ld.param.s32 %r62, [__cudaparm_kernel_ellipsoid_sphere_vflag];\n" -" mov.u32 %r63, 0;\n" -" setp.le.s32 %p32, %r62, %r63;\n" -" @%p32 bra $Lt_0_74498;\n" -" .loc 17 315 0\n" -" ld.global.f32 %f744, [%rd57+0];\n" -" mov.f32 %f745, %f47;\n" -" add.ftz.f32 %f746, %f744, %f745;\n" -" st.global.f32 [%rd57+0], %f746;\n" -" .loc 17 316 0\n" -" cvt.s64.s32 %rd60, %r59;\n" -" mul.wide.s32 %rd61, %r59, 4;\n" -" add.u64 %rd62, %rd61, %rd57;\n" -" .loc 17 315 0\n" -" ld.global.f32 %f747, [%rd62+0];\n" -" mov.f32 %f748, %f49;\n" -" add.ftz.f32 %f749, %f747, %f748;\n" -" st.global.f32 [%rd62+0], %f749;\n" -" .loc 17 316 0\n" -" add.u64 %rd63, %rd61, %rd62;\n" -" .loc 17 315 0\n" -" ld.global.f32 %f750, [%rd63+0];\n" -" mov.f32 %f751, %f51;\n" -" add.ftz.f32 %f752, %f750, %f751;\n" -" st.global.f32 [%rd63+0], %f752;\n" -" .loc 17 316 0\n" -" add.u64 %rd64, %rd61, %rd63;\n" -" .loc 17 315 0\n" -" ld.global.f32 %f753, [%rd64+0];\n" -" mov.f32 %f754, %f53;\n" -" add.ftz.f32 %f755, %f753, %f754;\n" -" st.global.f32 [%rd64+0], %f755;\n" -" .loc 17 316 0\n" -" add.u64 %rd65, %rd61, %rd64;\n" -" .loc 17 315 0\n" -" ld.global.f32 %f756, [%rd65+0];\n" -" mov.f32 %f757, %f55;\n" -" add.ftz.f32 %f758, %f756, %f757;\n" -" st.global.f32 [%rd65+0], %f758;\n" -" .loc 17 316 0\n" -" add.u64 %rd57, %rd61, %rd65;\n" -" .loc 17 315 0\n" -" ld.global.f32 %f759, [%rd57+0];\n" -" mov.f32 %f760, %f57;\n" -" add.ftz.f32 %f761, %f759, %f760;\n" -" st.global.f32 [%rd57+0], %f761;\n" -"$Lt_0_74498:\n" -" .loc 17 319 0\n" -" ld.param.u64 %rd66, [__cudaparm_kernel_ellipsoid_sphere_ans];\n" -" mul.lo.u64 %rd67, %rd2, 16;\n" -" add.u64 %rd68, %rd66, %rd67;\n" -" ld.global.v4.f32 {%f762,%f763,%f764,%f765}, [%rd68+0];\n" -" .loc 17 321 0\n" -" add.ftz.f32 %f766, %f763, %f161;\n" -" .loc 17 322 0\n" -" add.ftz.f32 %f767, %f764, %f160;\n" -" .loc 17 323 0\n" -" add.ftz.f32 %f768, %f762, %f162;\n" -" st.global.v4.f32 [%rd68+0], {%f768,%f766,%f767,%f765};\n" -" .loc 17 325 0\n" -" add.s32 %r64, %r9, %r59;\n" -" cvt.s64.s32 %rd69, %r64;\n" -" mul.wide.s32 %rd70, %r64, 16;\n" -" add.u64 %rd71, %rd66, %rd70;\n" -" ld.global.v4.f32 {%f769,%f770,%f771,%f772}, [%rd71+0];\n" -" .loc 17 327 0\n" -" add.ftz.f32 %f773, %f770, %f158;\n" -" .loc 17 328 0\n" -" add.ftz.f32 %f774, %f771, %f157;\n" -" .loc 17 329 0\n" -" add.ftz.f32 %f775, %f769, %f159;\n" -" st.global.v4.f32 [%rd71+0], {%f775,%f773,%f774,%f772};\n" -"$Lt_0_73474:\n" -"$Lt_0_50946:\n" -" .loc 17 332 0\n" -" exit;\n" -"$LDWend_kernel_ellipsoid_sphere:\n" -" }\n" -" .entry kernel_sphere_ellipsoid (\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_x_,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_q,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_shape,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_well,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_splj,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_sig_eps,\n" -" .param .s32 __cudaparm_kernel_sphere_ellipsoid_ntypes,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_dev_nbor,\n" -" .param .s32 __cudaparm_kernel_sphere_ellipsoid_stride,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_ans,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid___val_paramengv,\n" -" .param .u64 __cudaparm_kernel_sphere_ellipsoid_err_flag,\n" -" .param .s32 __cudaparm_kernel_sphere_ellipsoid_eflag,\n" -" .param .s32 __cudaparm_kernel_sphere_ellipsoid_vflag,\n" -" .param .s32 __cudaparm_kernel_sphere_ellipsoid_start,\n" -" .param .s32 __cudaparm_kernel_sphere_ellipsoid_inum,\n" -" .param .s32 __cudaparm_kernel_sphere_ellipsoid_t_per_atom)\n" -" {\n" -" .reg .u32 %r<58>;\n" -" .reg .u64 %rd<70>;\n" -" .reg .f32 %f<567>;\n" -" .reg .pred %p<34>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_33201_33_non_const_sp_lj3836[16];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_33377_55_non_const_red_acc3852[3072];\n" -" .shared .f32 __cuda_local_var_33207_33_non_const_b_alpha;\n" -" .shared .f32 __cuda_local_var_33207_42_non_const_cr60;\n" -" .shared .f32 __cuda_local_var_33207_48_non_const_solv_f_a;\n" -" .shared .f32 __cuda_local_var_33207_58_non_const_solv_f_r;\n" -" .loc 17 341 0\n" -"$LDWbegin_kernel_sphere_ellipsoid:\n" -" .loc 17 347 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_sphere_ellipsoid_splj];\n" -" ldu.global.f32 %f1, [%rd1+0];\n" -" .loc 17 348 0\n" -" ld.global.f32 %f2, [%rd1+4];\n" -" .loc 17 349 0\n" -" ld.global.f32 %f3, [%rd1+8];\n" -" .loc 17 350 0\n" -" ld.global.f32 %f4, [%rd1+12];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_33201_33_non_const_sp_lj3836+0], {%f1,%f2,%f3,%f4};\n" -" .loc 17 353 0\n" -" mov.f32 %f5, 0f3f4db6db; \n" -" st.shared.f32 [__cuda_local_var_33207_33_non_const_b_alpha], %f5;\n" -" .loc 17 354 0\n" -" mov.f32 %f6, 0f42700000; \n" -" lg2.approx.ftz.f32 %f7, %f6;\n" -" mov.f32 %f8, 0f3eaaaaab; \n" -" mul.ftz.f32 %f9, %f7, %f8;\n" -" ex2.approx.ftz.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f42700000; \n" -" mul.ftz.f32 %f12, %f10, %f10;\n" -" div.approx.ftz.f32 %f13, %f11, %f12;\n" -" sub.ftz.f32 %f14, %f10, %f13;\n" -" mov.f32 %f15, 0f3eaaaaab; \n" -" mul.ftz.f32 %f16, %f14, %f15;\n" -" sub.ftz.f32 %f17, %f10, %f16;\n" -" st.shared.f32 [__cuda_local_var_33207_42_non_const_cr60], %f17;\n" -" .loc 21 544 0\n" -" mov.f32 %f18, 0f3f800000; \n" -" mov.f32 %f19, 0fbf52c7ea; \n" -" mov.f32 %f20, 0fc0b59883; \n" -" fma.rn.ftz.f32 %f21, %f18, %f19, %f20;\n" -" mov.f32 %f22, 0f41455dc0; \n" -" mov.f32 %f23, 0f3f800000; \n" -" mov.f32 %f24, 0f41e6bd60; \n" -" fma.rn.ftz.f32 %f25, %f22, %f23, %f24;\n" -" mov.f32 %f26, 0f3f800000; \n" -" mov.f32 %f27, 0fc0d21907; \n" -" fma.rn.ftz.f32 %f28, %f21, %f26, %f27;\n" -" mov.f32 %f29, 0f3f800000; \n" -" mov.f32 %f30, 0f419d92c8; \n" -" fma.rn.ftz.f32 %f31, %f25, %f29, %f30;\n" -" rcp.approx.ftz.f32 %f32, %f31;\n" -" mov.f32 %f33, 0f3f800000; \n" -" fma.rn.ftz.f32 %f34, %f28, %f32, %f33;\n" -" mov.b32 %r1, %f34;\n" -" mov.b32 %f35, %r1;\n" -" mov.f32 %f36, 0f41800000; \n" -" mul.ftz.f32 %f37, %f35, %f36;\n" -" mov.f32 %f38, 0f40400000; \n" -" mov.f32 %f39, 0fc2100000; \n" -" mul.ftz.f32 %f40, %f37, %f39;\n" -" div.approx.ftz.f32 %f41, %f38, %f40;\n" -" .loc 17 355 0\n" -" st.shared.f32 [__cuda_local_var_33207_48_non_const_solv_f_a], %f41;\n" -" .loc 21 544 0\n" -" mov.f32 %f42, 0f40400000; \n" -" mov.f32 %f43, 0f44fd2000; \n" -" mul.ftz.f32 %f44, %f37, %f43;\n" -" div.approx.ftz.f32 %f45, %f42, %f44;\n" -" .loc 17 356 0\n" -" st.shared.f32 [__cuda_local_var_33207_58_non_const_solv_f_r], %f45;\n" -" .loc 17 365 0\n" -" mov.f32 %f46, 0f00000000; \n" -" mov.f32 %f47, %f46;\n" -" mov.f32 %f48, 0f00000000; \n" -" mov.f32 %f49, %f48;\n" -" mov.f32 %f50, 0f00000000; \n" -" mov.f32 %f51, %f50;\n" -" mov.f32 %f52, 0f00000000; \n" -" mov.f32 %f53, %f52;\n" -" mov.f32 %f54, 0f00000000; \n" -" mov.f32 %f55, %f54;\n" -" mov.f32 %f56, 0f00000000; \n" -" mov.f32 %f57, %f56;\n" -" ld.param.s32 %r2, [__cudaparm_kernel_sphere_ellipsoid_t_per_atom];\n" -" cvt.s32.u32 %r3, %tid.x;\n" -" div.s32 %r4, %r3, %r2;\n" -" cvt.s32.u32 %r5, %ntid.x;\n" -" div.s32 %r6, %r5, %r2;\n" -" cvt.s32.u32 %r7, %ctaid.x;\n" -" mul.lo.s32 %r8, %r7, %r6;\n" -" add.s32 %r9, %r4, %r8;\n" -" ld.param.s32 %r10, [__cudaparm_kernel_sphere_ellipsoid_start];\n" -" add.s32 %r11, %r10, %r9;\n" -" ld.param.s32 %r12, [__cudaparm_kernel_sphere_ellipsoid_inum];\n" -" setp.ge.s32 %p1, %r11, %r12;\n" -" @%p1 bra $Lt_1_73218;\n" -" .loc 17 370 0\n" -" cvt.s64.s32 %rd2, %r11;\n" -" mul.wide.s32 %rd3, %r11, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_sphere_ellipsoid_dev_nbor];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.s32 %r13, [%rd5+0];\n" -" ld.param.s32 %r14, [__cudaparm_kernel_sphere_ellipsoid_stride];\n" -" cvt.s64.s32 %rd6, %r14;\n" -" mul.wide.s32 %rd7, %r14, 4;\n" -" add.u64 %rd8, %rd7, %rd5;\n" -" ld.global.s32 %r15, [%rd8+0];\n" -" .loc 17 373 0\n" -" ld.param.u64 %rd9, [__cudaparm_kernel_sphere_ellipsoid_x_];\n" -" cvt.s64.s32 %rd10, %r13;\n" -" mul.wide.s32 %rd11, %r13, 16;\n" -" add.u64 %rd12, %rd9, %rd11;\n" -" ld.global.v4.f32 {%f58,%f59,%f60,%f61}, [%rd12+0];\n" -" .loc 17 374 0\n" -" cvt.s32.s64 %r16, %rd6;\n" -" sub.s32 %r17, %r2, 1;\n" -" and.b32 %r18, %r17, %r3;\n" -" add.u64 %rd13, %rd7, %rd8;\n" -" mul.lo.s32 %r19, %r16, %r18;\n" -" cvt.s64.s32 %rd14, %r19;\n" -" mul.wide.s32 %rd15, %r19, 4;\n" -" add.u64 %rd16, %rd13, %rd15;\n" -" mov.s64 %rd17, %rd16;\n" -" mul.lo.s32 %r20, %r16, %r15;\n" -" cvt.s64.s32 %rd18, %r20;\n" -" mul.wide.s32 %rd19, %r20, 4;\n" -" add.u64 %rd20, %rd13, %rd19;\n" -" setp.ge.u64 %p2, %rd16, %rd20;\n" -" @%p2 bra $Lt_1_75010;\n" -" ld.param.s32 %r21, [__cudaparm_kernel_sphere_ellipsoid_vflag];\n" -" mov.s32 %r22, 0;\n" -" setp.gt.s32 %p3, %r21, %r22;\n" -" cvt.rzi.ftz.s32.f32 %r23, %f61;\n" -" ld.param.u64 %rd21, [__cudaparm_kernel_sphere_ellipsoid_sig_eps];\n" -" ld.param.s32 %r24, [__cudaparm_kernel_sphere_ellipsoid_ntypes];\n" -" ld.param.u64 %rd22, [__cudaparm_kernel_sphere_ellipsoid_well];\n" -" ld.param.u64 %rd23, [__cudaparm_kernel_sphere_ellipsoid_q];\n" -" ld.param.u64 %rd24, [__cudaparm_kernel_sphere_ellipsoid_shape];\n" -" mov.f32 %f62, 0f00000000; \n" -" mov.f32 %f63, 0f00000000; \n" -" mov.f32 %f64, 0f00000000; \n" -" mov.f32 %f65, 0f00000000; \n" -" mov.u64 %rd25, __cuda___cuda_local_var_33201_33_non_const_sp_lj3836;\n" -"$Lt_1_51714:\n" -" .loc 17 378 0\n" -" ld.global.s32 %r25, [%rd17+0];\n" -" .loc 17 382 0\n" -" and.b32 %r26, %r25, 1073741823;\n" -" cvt.s64.s32 %rd26, %r26;\n" -" mul.wide.s32 %rd27, %r26, 16;\n" -" add.u64 %rd28, %rd27, %rd9;\n" -" ld.global.v4.f32 {%f66,%f67,%f68,%f69}, [%rd28+0];\n" -" .loc 17 389 0\n" -" cvt.rzi.ftz.s32.f32 %r27, %f69;\n" -" cvt.s64.s32 %rd29, %r27;\n" -" mul.wide.s32 %rd30, %r27, 16;\n" -" add.u64 %rd31, %rd30, %rd24;\n" -" ld.global.v4.f32 {%f70,%f71,%f72,_}, [%rd31+0];\n" -" .loc 17 390 0\n" -" add.u64 %rd32, %rd27, %rd23;\n" -" ld.global.v4.f32 {%f73,%f74,%f75,%f76}, [%rd32+0];\n" -" .loc 17 391 0\n" -" add.u64 %rd33, %rd30, %rd22;\n" -" ld.global.v4.f32 {%f77,%f78,%f79,_}, [%rd33+0];\n" -" .loc 17 401 0\n" -" sub.ftz.f32 %f80, %f67, %f59;\n" -" sub.ftz.f32 %f81, %f66, %f58;\n" -" sub.ftz.f32 %f82, %f68, %f60;\n" -" mul.ftz.f32 %f83, %f80, %f80;\n" -" fma.rn.ftz.f32 %f84, %f81, %f81, %f83;\n" -" fma.rn.ftz.f32 %f85, %f82, %f82, %f84;\n" -" rsqrt.approx.ftz.f32 %f86, %f85;\n" -" mul.ftz.f32 %f87, %f81, %f86;\n" -" .loc 17 402 0\n" -" mul.ftz.f32 %f88, %f80, %f86;\n" -" .loc 17 407 0\n" -" mul.lo.s32 %r28, %r27, %r24;\n" -" add.s32 %r29, %r23, %r28;\n" -" cvt.s64.s32 %rd34, %r29;\n" -" mul.wide.s32 %rd35, %r29, 8;\n" -" add.u64 %rd36, %rd21, %rd35;\n" -" ld.global.v2.f32 {%f89,%f90}, [%rd36+0];\n" -" .loc 17 408 0\n" -" shr.s32 %r30, %r25, 30;\n" -" and.b32 %r31, %r30, 3;\n" -" cvt.s64.s32 %rd37, %r31;\n" -" mul.wide.s32 %rd38, %r31, 4;\n" -" add.u64 %rd39, %rd25, %rd38;\n" -" ld.shared.f32 %f91, [%rd39+0];\n" -" mul.ftz.f32 %f92, %f91, %f90;\n" -" .loc 16 299 0\n" -" mov.f32 %f93, %f87;\n" -" .loc 16 300 0\n" -" mov.f32 %f94, 0f3f000000; \n" -" mul.ftz.f32 %f95, %f89, %f94;\n" -" add.ftz.f32 %f96, %f74, %f74;\n" -" add.ftz.f32 %f97, %f76, %f76;\n" -" mul.ftz.f32 %f98, %f73, %f73;\n" -" mul.ftz.f32 %f99, %f74, %f74;\n" -" mul.ftz.f32 %f100, %f75, %f75;\n" -" mul.ftz.f32 %f101, %f76, %f76;\n" -" add.ftz.f32 %f102, %f75, %f75;\n" -" add.ftz.f32 %f103, %f95, %f71;\n" -" add.ftz.f32 %f104, %f95, %f70;\n" -" add.ftz.f32 %f105, %f95, %f72;\n" -" mul.ftz.f32 %f106, %f96, %f75;\n" -" mul.ftz.f32 %f107, %f96, %f76;\n" -" mul.ftz.f32 %f108, %f97, %f73;\n" -" add.ftz.f32 %f109, %f98, %f99;\n" -" mul.ftz.f32 %f110, %f102, %f73;\n" -" mul.ftz.f32 %f111, %f103, %f103;\n" -" mul.ftz.f32 %f112, %f104, %f104;\n" -" mul.ftz.f32 %f113, %f105, %f105;\n" -" sub.ftz.f32 %f114, %f106, %f108;\n" -" sub.ftz.f32 %f115, %f109, %f100;\n" -" add.ftz.f32 %f116, %f107, %f110;\n" -" mov.f32 %f117, 0f3f000000; \n" -" mul.ftz.f32 %f118, %f111, %f117;\n" -" mov.f32 %f119, 0f3f000000; \n" -" mul.ftz.f32 %f120, %f112, %f119;\n" -" mov.f32 %f121, 0f3f000000; \n" -" mul.ftz.f32 %f122, %f113, %f121;\n" -" sub.ftz.f32 %f123, %f115, %f101;\n" -" mul.ftz.f32 %f124, %f114, %f118;\n" -" mul.ftz.f32 %f125, %f116, %f122;\n" -" mul.ftz.f32 %f126, %f120, %f123;\n" -" mul.ftz.f32 %f127, %f114, %f124;\n" -" fma.rn.ftz.f32 %f128, %f123, %f126, %f127;\n" -" fma.rn.ftz.f32 %f129, %f125, %f116, %f128;\n" -" mov.f32 %f130, %f129;\n" -" .loc 16 301 0\n" -" mul.ftz.f32 %f131, %f96, %f73;\n" -" sub.ftz.f32 %f132, %f98, %f99;\n" -" mul.ftz.f32 %f133, %f102, %f76;\n" -" add.ftz.f32 %f134, %f106, %f108;\n" -" add.ftz.f32 %f135, %f100, %f132;\n" -" sub.ftz.f32 %f136, %f133, %f131;\n" -" sub.ftz.f32 %f137, %f135, %f101;\n" -" mul.ftz.f32 %f138, %f137, %f124;\n" -" fma.rn.ftz.f32 %f139, %f126, %f134, %f138;\n" -" fma.rn.ftz.f32 %f140, %f125, %f136, %f139;\n" -" mov.f32 %f141, %f140;\n" -" .loc 16 302 0\n" -" sub.ftz.f32 %f142, %f132, %f100;\n" -" sub.ftz.f32 %f143, %f107, %f110;\n" -" add.ftz.f32 %f144, %f131, %f133;\n" -" add.ftz.f32 %f145, %f101, %f142;\n" -" mul.ftz.f32 %f146, %f144, %f124;\n" -" fma.rn.ftz.f32 %f147, %f126, %f143, %f146;\n" -" fma.rn.ftz.f32 %f148, %f125, %f145, %f147;\n" -" mov.f32 %f149, %f148;\n" -" .loc 16 303 0\n" -" mov.f32 %f150, %f88;\n" -" .loc 16 304 0\n" -" mul.ftz.f32 %f151, %f134, %f120;\n" -" mul.ftz.f32 %f152, %f136, %f122;\n" -" mul.ftz.f32 %f153, %f118, %f137;\n" -" mul.ftz.f32 %f154, %f114, %f153;\n" -" fma.rn.ftz.f32 %f155, %f123, %f151, %f154;\n" -" fma.rn.ftz.f32 %f156, %f152, %f116, %f155;\n" -" mov.f32 %f157, %f156;\n" -" .loc 16 305 0\n" -" mul.ftz.f32 %f158, %f137, %f153;\n" -" fma.rn.ftz.f32 %f159, %f134, %f151, %f158;\n" -" fma.rn.ftz.f32 %f160, %f152, %f136, %f159;\n" -" .loc 16 306 0\n" -" mul.ftz.f32 %f161, %f144, %f153;\n" -" fma.rn.ftz.f32 %f162, %f143, %f151, %f161;\n" -" fma.rn.ftz.f32 %f163, %f152, %f145, %f162;\n" -" .loc 16 307 0\n" -" mul.ftz.f32 %f164, %f82, %f86;\n" -" mov.f32 %f165, %f164;\n" -" .loc 16 308 0\n" -" mul.ftz.f32 %f166, %f144, %f118;\n" -" mul.ftz.f32 %f167, %f143, %f120;\n" -" mul.ftz.f32 %f168, %f122, %f145;\n" -" mul.ftz.f32 %f169, %f114, %f166;\n" -" fma.rn.ftz.f32 %f170, %f123, %f167, %f169;\n" -" fma.rn.ftz.f32 %f171, %f116, %f168, %f170;\n" -" mov.f32 %f172, %f171;\n" -" .loc 16 309 0\n" -" mul.ftz.f32 %f173, %f137, %f166;\n" -" fma.rn.ftz.f32 %f174, %f134, %f167, %f173;\n" -" fma.rn.ftz.f32 %f175, %f136, %f168, %f174;\n" -" .loc 16 310 0\n" -" mul.ftz.f32 %f176, %f144, %f166;\n" -" fma.rn.ftz.f32 %f177, %f143, %f167, %f176;\n" -" fma.rn.ftz.f32 %f178, %f145, %f168, %f177;\n" -" abs.ftz.f32 %f179, %f156;\n" -" abs.ftz.f32 %f180, %f129;\n" -" setp.gt.ftz.f32 %p4, %f179, %f180;\n" -" @!%p4 bra $Lt_1_51970;\n" -" .loc 16 314 0\n" -" mov.f32 %f130, %f156;\n" -" mov.f32 %f157, %f129;\n" -" .loc 16 315 0\n" -" mov.f32 %f141, %f160;\n" -" mov.f32 %f160, %f140;\n" -" .loc 16 316 0\n" -" mov.f32 %f149, %f163;\n" -" mov.f32 %f163, %f148;\n" -" .loc 16 317 0\n" -" mov.f32 %f93, %f88;\n" -" mov.f32 %f150, %f87;\n" -"$Lt_1_51970:\n" -" mov.f32 %f181, %f130;\n" -" abs.ftz.f32 %f182, %f181;\n" -" abs.ftz.f32 %f183, %f171;\n" -" setp.lt.ftz.f32 %p5, %f182, %f183;\n" -" @!%p5 bra $Lt_1_52482;\n" -" .loc 16 321 0\n" -" mov.f32 %f130, %f171;\n" -" mov.f32 %f172, %f181;\n" -" .loc 16 322 0\n" -" mov.f32 %f184, %f141;\n" -" mov.f32 %f141, %f175;\n" -" mov.f32 %f175, %f184;\n" -" .loc 16 323 0\n" -" mov.f32 %f185, %f149;\n" -" mov.f32 %f149, %f178;\n" -" mov.f32 %f178, %f185;\n" -" .loc 16 324 0\n" -" mov.f32 %f186, %f93;\n" -" mov.f32 %f93, %f164;\n" -" mov.f32 %f165, %f186;\n" -"$Lt_1_52482:\n" -" mov.f32 %f187, %f130;\n" -" mov.f32 %f188, 0f00000000; \n" -" setp.neu.ftz.f32 %p6, %f187, %f188;\n" -" @!%p6 bra $Lt_1_53250;\n" -" bra.uni $Lt_1_54018;\n" -"$Lt_1_53250:\n" -" mov.f32 %f189, 0f00000000; \n" -" setp.neu.ftz.f32 %p7, %f157, %f189;\n" -" @!%p7 bra $Lt_1_53762;\n" -" .loc 16 338 0\n" -" mov.f32 %f130, %f157;\n" -" mov.f32 %f157, %f187;\n" -" .loc 16 339 0\n" -" mov.f32 %f190, %f141;\n" -" mov.f32 %f141, %f160;\n" -" mov.f32 %f160, %f190;\n" -" .loc 16 340 0\n" -" mov.f32 %f191, %f149;\n" -" mov.f32 %f149, %f163;\n" -" mov.f32 %f163, %f191;\n" -" .loc 16 341 0\n" -" mov.f32 %f192, %f93;\n" -" mov.f32 %f93, %f150;\n" -" mov.f32 %f150, %f192;\n" -" bra.uni $Lt_1_54018;\n" -"$Lt_1_53762:\n" -" mov.f32 %f193, 0f00000000; \n" -" setp.neu.ftz.f32 %p8, %f172, %f193;\n" -" @!%p8 bra $Lt_1_54274;\n" -" .loc 16 346 0\n" -" mov.f32 %f130, %f172;\n" -" mov.f32 %f172, %f187;\n" -" .loc 16 347 0\n" -" mov.f32 %f194, %f141;\n" -" mov.f32 %f141, %f175;\n" -" mov.f32 %f175, %f194;\n" -" .loc 16 348 0\n" -" mov.f32 %f195, %f149;\n" -" mov.f32 %f149, %f178;\n" -" mov.f32 %f178, %f195;\n" -" .loc 16 349 0\n" -" mov.f32 %f196, %f93;\n" -" mov.f32 %f93, %f165;\n" -" mov.f32 %f165, %f196;\n" -" bra.uni $Lt_1_54018;\n" -"$Lt_1_54274:\n" -" .loc 16 352 0\n" -" mov.s32 %r32, 2;\n" -" ld.param.u64 %rd40, [__cudaparm_kernel_sphere_ellipsoid_err_flag];\n" -" st.global.s32 [%rd40+0], %r32;\n" -"$Lt_1_54018:\n" -"$Lt_1_53506:\n" -"$Lt_1_52994:\n" -" .loc 16 355 0\n" -" div.approx.ftz.f32 %f197, %f157, %f130;\n" -" mul.ftz.f32 %f198, %f141, %f197;\n" -" sub.ftz.f32 %f199, %f160, %f198;\n" -" mov.f32 %f160, %f199;\n" -" .loc 16 356 0\n" -" mul.ftz.f32 %f200, %f149, %f197;\n" -" sub.ftz.f32 %f201, %f163, %f200;\n" -" mov.f32 %f163, %f201;\n" -" .loc 16 357 0\n" -" mul.ftz.f32 %f202, %f93, %f197;\n" -" sub.ftz.f32 %f203, %f150, %f202;\n" -" mov.f32 %f150, %f203;\n" -" .loc 16 359 0\n" -" div.approx.ftz.f32 %f204, %f172, %f130;\n" -" mul.ftz.f32 %f205, %f141, %f204;\n" -" sub.ftz.f32 %f175, %f175, %f205;\n" -" .loc 16 360 0\n" -" mul.ftz.f32 %f206, %f149, %f204;\n" -" sub.ftz.f32 %f178, %f178, %f206;\n" -" .loc 16 361 0\n" -" mul.ftz.f32 %f207, %f93, %f204;\n" -" sub.ftz.f32 %f165, %f165, %f207;\n" -" abs.ftz.f32 %f208, %f199;\n" -" abs.ftz.f32 %f209, %f175;\n" -" setp.lt.ftz.f32 %p9, %f208, %f209;\n" -" @!%p9 bra $Lt_1_54530;\n" -" .loc 16 366 0\n" -" mov.f32 %f160, %f175;\n" -" mov.f32 %f175, %f199;\n" -" .loc 16 367 0\n" -" mov.f32 %f163, %f178;\n" -" mov.f32 %f178, %f201;\n" -" .loc 16 368 0\n" -" mov.f32 %f150, %f165;\n" -" mov.f32 %f165, %f203;\n" -"$Lt_1_54530:\n" -" mov.f32 %f210, %f160;\n" -" mov.f32 %f211, 0f00000000; \n" -" setp.neu.ftz.f32 %p10, %f210, %f211;\n" -" @!%p10 bra $Lt_1_55298;\n" -" bra.uni $Lt_1_55554;\n" -"$Lt_1_55298:\n" -" mov.f32 %f212, 0f00000000; \n" -" setp.neu.ftz.f32 %p11, %f175, %f212;\n" -" @!%p11 bra $Lt_1_55554;\n" -" .loc 16 383 0\n" -" mov.f32 %f160, %f175;\n" -" mov.f32 %f175, %f210;\n" -" .loc 16 384 0\n" -" mov.f32 %f213, %f163;\n" -" mov.f32 %f163, %f178;\n" -" mov.f32 %f178, %f213;\n" -" .loc 16 385 0\n" -" mov.f32 %f214, %f150;\n" -" mov.f32 %f150, %f165;\n" -" mov.f32 %f165, %f214;\n" -"$Lt_1_55554:\n" -"$Lt_1_55042:\n" -" .loc 16 390 0\n" -" div.approx.ftz.f32 %f215, %f175, %f160;\n" -" mul.ftz.f32 %f216, %f163, %f215;\n" -" sub.ftz.f32 %f178, %f178, %f216;\n" -" .loc 16 391 0\n" -" mul.ftz.f32 %f217, %f150, %f215;\n" -" sub.ftz.f32 %f165, %f165, %f217;\n" -" mov.f32 %f218, 0f00000000; \n" -" setp.eq.ftz.f32 %p12, %f178, %f218;\n" -" @!%p12 bra $Lt_1_56066;\n" -" .loc 16 394 0\n" -" mov.s32 %r33, 2;\n" -" ld.param.u64 %rd41, [__cudaparm_kernel_sphere_ellipsoid_err_flag];\n" -" st.global.s32 [%rd41+0], %r33;\n" -"$Lt_1_56066:\n" -" .loc 16 396 0\n" -" div.approx.ftz.f32 %f219, %f165, %f178;\n" -" .loc 16 399 0\n" -" mul.ftz.f32 %f220, %f219, %f163;\n" -" sub.ftz.f32 %f221, %f150, %f220;\n" -" div.approx.ftz.f32 %f222, %f221, %f160;\n" -" .loc 16 403 0\n" -" mul.ftz.f32 %f223, %f222, %f141;\n" -" fma.rn.ftz.f32 %f224, %f149, %f219, %f223;\n" -" sub.ftz.f32 %f225, %f93, %f224;\n" -" div.approx.ftz.f32 %f226, %f225, %f130;\n" -" .loc 17 427 0\n" -" mul.ftz.f32 %f227, %f222, %f88;\n" -" fma.rn.ftz.f32 %f228, %f87, %f226, %f227;\n" -" fma.rn.ftz.f32 %f229, %f164, %f219, %f228;\n" -" mov.f32 %f230, 0f3f000000; \n" -" mul.ftz.f32 %f231, %f229, %f230;\n" -" rsqrt.approx.ftz.f32 %f232, %f231;\n" -" .loc 16 299 0\n" -" mov.f32 %f93, %f87;\n" -" .loc 16 300 0\n" -" mul.ftz.f32 %f233, %f114, %f78;\n" -" mul.ftz.f32 %f234, %f116, %f79;\n" -" mul.ftz.f32 %f235, %f114, %f233;\n" -" mul.ftz.f32 %f236, %f123, %f77;\n" -" fma.rn.ftz.f32 %f237, %f123, %f236, %f235;\n" -" fma.rn.ftz.f32 %f238, %f234, %f116, %f237;\n" -" mov.f32 %f239, 0f3f800000; \n" -" add.ftz.f32 %f240, %f238, %f239;\n" -" mov.f32 %f130, %f240;\n" -" .loc 16 301 0\n" -" mul.ftz.f32 %f241, %f233, %f137;\n" -" fma.rn.ftz.f32 %f242, %f236, %f134, %f241;\n" -" fma.rn.ftz.f32 %f243, %f234, %f136, %f242;\n" -" mov.f32 %f141, %f243;\n" -" .loc 16 302 0\n" -" mul.ftz.f32 %f244, %f144, %f233;\n" -" fma.rn.ftz.f32 %f245, %f236, %f143, %f244;\n" -" fma.rn.ftz.f32 %f246, %f234, %f145, %f245;\n" -" mov.f32 %f149, %f246;\n" -" .loc 16 303 0\n" -" mov.f32 %f150, %f88;\n" -" .loc 16 304 0\n" -" mul.ftz.f32 %f247, %f134, %f77;\n" -" mul.ftz.f32 %f248, %f136, %f79;\n" -" mul.ftz.f32 %f249, %f137, %f78;\n" -" mul.ftz.f32 %f250, %f114, %f249;\n" -" fma.rn.ftz.f32 %f251, %f123, %f247, %f250;\n" -" fma.rn.ftz.f32 %f252, %f248, %f116, %f251;\n" -" mov.f32 %f157, %f252;\n" -" .loc 16 305 0\n" -" mul.ftz.f32 %f253, %f137, %f249;\n" -" fma.rn.ftz.f32 %f254, %f134, %f247, %f253;\n" -" fma.rn.ftz.f32 %f255, %f248, %f136, %f254;\n" -" mov.f32 %f256, 0f3f800000; \n" -" add.ftz.f32 %f160, %f255, %f256;\n" -" .loc 16 306 0\n" -" mul.ftz.f32 %f257, %f144, %f249;\n" -" fma.rn.ftz.f32 %f258, %f143, %f247, %f257;\n" -" fma.rn.ftz.f32 %f163, %f248, %f145, %f258;\n" -" .loc 16 307 0\n" -" mov.f32 %f165, %f164;\n" -" .loc 16 308 0\n" -" mul.ftz.f32 %f259, %f143, %f77;\n" -" mul.ftz.f32 %f260, %f144, %f78;\n" -" mul.ftz.f32 %f261, %f145, %f79;\n" -" mul.ftz.f32 %f262, %f114, %f260;\n" -" fma.rn.ftz.f32 %f263, %f123, %f259, %f262;\n" -" fma.rn.ftz.f32 %f264, %f116, %f261, %f263;\n" -" mov.f32 %f172, %f264;\n" -" .loc 16 309 0\n" -" mul.ftz.f32 %f265, %f137, %f260;\n" -" fma.rn.ftz.f32 %f266, %f134, %f259, %f265;\n" -" fma.rn.ftz.f32 %f175, %f136, %f261, %f266;\n" -" .loc 16 310 0\n" -" mul.ftz.f32 %f267, %f144, %f260;\n" -" fma.rn.ftz.f32 %f268, %f143, %f259, %f267;\n" -" fma.rn.ftz.f32 %f269, %f145, %f261, %f268;\n" -" mov.f32 %f270, 0f3f800000; \n" -" add.ftz.f32 %f178, %f269, %f270;\n" -" abs.ftz.f32 %f271, %f252;\n" -" abs.ftz.f32 %f272, %f240;\n" -" setp.gt.ftz.f32 %p13, %f271, %f272;\n" -" @!%p13 bra $Lt_1_56578;\n" -" .loc 16 314 0\n" -" mov.f32 %f130, %f252;\n" -" mov.f32 %f157, %f240;\n" -" .loc 16 315 0\n" -" mov.f32 %f141, %f160;\n" -" mov.f32 %f160, %f243;\n" -" .loc 16 316 0\n" -" mov.f32 %f149, %f163;\n" -" mov.f32 %f163, %f246;\n" -" .loc 16 317 0\n" -" mov.f32 %f93, %f88;\n" -" mov.f32 %f150, %f87;\n" -"$Lt_1_56578:\n" -" mov.f32 %f273, %f130;\n" -" abs.ftz.f32 %f274, %f273;\n" -" abs.ftz.f32 %f275, %f264;\n" -" setp.lt.ftz.f32 %p14, %f274, %f275;\n" -" @!%p14 bra $Lt_1_57090;\n" -" .loc 16 321 0\n" -" mov.f32 %f130, %f264;\n" -" mov.f32 %f172, %f273;\n" -" .loc 16 322 0\n" -" mov.f32 %f276, %f141;\n" -" mov.f32 %f141, %f175;\n" -" mov.f32 %f175, %f276;\n" -" .loc 16 323 0\n" -" mov.f32 %f277, %f149;\n" -" mov.f32 %f149, %f178;\n" -" mov.f32 %f178, %f277;\n" -" .loc 16 324 0\n" -" mov.f32 %f278, %f93;\n" -" mov.f32 %f93, %f164;\n" -" mov.f32 %f165, %f278;\n" -"$Lt_1_57090:\n" -" mov.f32 %f279, %f130;\n" -" mov.f32 %f280, 0f00000000; \n" -" setp.neu.ftz.f32 %p15, %f279, %f280;\n" -" @!%p15 bra $Lt_1_57858;\n" -" bra.uni $Lt_1_58626;\n" -"$Lt_1_57858:\n" -" mov.f32 %f281, 0f00000000; \n" -" setp.neu.ftz.f32 %p16, %f157, %f281;\n" -" @!%p16 bra $Lt_1_58370;\n" -" .loc 16 338 0\n" -" mov.f32 %f130, %f157;\n" -" mov.f32 %f157, %f279;\n" -" .loc 16 339 0\n" -" mov.f32 %f282, %f141;\n" -" mov.f32 %f141, %f160;\n" -" mov.f32 %f160, %f282;\n" -" .loc 16 340 0\n" -" mov.f32 %f283, %f149;\n" -" mov.f32 %f149, %f163;\n" -" mov.f32 %f163, %f283;\n" -" .loc 16 341 0\n" -" mov.f32 %f284, %f93;\n" -" mov.f32 %f93, %f150;\n" -" mov.f32 %f150, %f284;\n" -" bra.uni $Lt_1_58626;\n" -"$Lt_1_58370:\n" -" mov.f32 %f285, 0f00000000; \n" -" setp.neu.ftz.f32 %p17, %f172, %f285;\n" -" @!%p17 bra $Lt_1_58882;\n" -" .loc 16 346 0\n" -" mov.f32 %f130, %f172;\n" -" mov.f32 %f172, %f279;\n" -" .loc 16 347 0\n" -" mov.f32 %f286, %f141;\n" -" mov.f32 %f141, %f175;\n" -" mov.f32 %f175, %f286;\n" -" .loc 16 348 0\n" -" mov.f32 %f287, %f149;\n" -" mov.f32 %f149, %f178;\n" -" mov.f32 %f178, %f287;\n" -" .loc 16 349 0\n" -" mov.f32 %f288, %f93;\n" -" mov.f32 %f93, %f165;\n" -" mov.f32 %f165, %f288;\n" -" bra.uni $Lt_1_58626;\n" -"$Lt_1_58882:\n" -" .loc 16 352 0\n" -" mov.s32 %r34, 2;\n" -" ld.param.u64 %rd42, [__cudaparm_kernel_sphere_ellipsoid_err_flag];\n" -" st.global.s32 [%rd42+0], %r34;\n" -"$Lt_1_58626:\n" -"$Lt_1_58114:\n" -"$Lt_1_57602:\n" -" .loc 16 355 0\n" -" div.approx.ftz.f32 %f289, %f157, %f130;\n" -" mul.ftz.f32 %f290, %f141, %f289;\n" -" sub.ftz.f32 %f291, %f160, %f290;\n" -" mov.f32 %f160, %f291;\n" -" .loc 16 356 0\n" -" mul.ftz.f32 %f292, %f149, %f289;\n" -" sub.ftz.f32 %f293, %f163, %f292;\n" -" mov.f32 %f163, %f293;\n" -" .loc 16 357 0\n" -" mul.ftz.f32 %f294, %f93, %f289;\n" -" sub.ftz.f32 %f295, %f150, %f294;\n" -" mov.f32 %f150, %f295;\n" -" .loc 16 359 0\n" -" div.approx.ftz.f32 %f296, %f172, %f130;\n" -" mul.ftz.f32 %f297, %f141, %f296;\n" -" sub.ftz.f32 %f175, %f175, %f297;\n" -" .loc 16 360 0\n" -" mul.ftz.f32 %f298, %f149, %f296;\n" -" sub.ftz.f32 %f178, %f178, %f298;\n" -" .loc 16 361 0\n" -" mul.ftz.f32 %f299, %f93, %f296;\n" -" sub.ftz.f32 %f165, %f165, %f299;\n" -" abs.ftz.f32 %f300, %f291;\n" -" abs.ftz.f32 %f301, %f175;\n" -" setp.lt.ftz.f32 %p18, %f300, %f301;\n" -" @!%p18 bra $Lt_1_59138;\n" -" .loc 16 366 0\n" -" mov.f32 %f160, %f175;\n" -" mov.f32 %f175, %f291;\n" -" .loc 16 367 0\n" -" mov.f32 %f163, %f178;\n" -" mov.f32 %f178, %f293;\n" -" .loc 16 368 0\n" -" mov.f32 %f150, %f165;\n" -" mov.f32 %f165, %f295;\n" -"$Lt_1_59138:\n" -" mov.f32 %f302, %f160;\n" -" mov.f32 %f303, 0f00000000; \n" -" setp.neu.ftz.f32 %p19, %f302, %f303;\n" -" @!%p19 bra $Lt_1_59906;\n" -" bra.uni $Lt_1_60162;\n" -"$Lt_1_59906:\n" -" mov.f32 %f304, 0f00000000; \n" -" setp.neu.ftz.f32 %p20, %f175, %f304;\n" -" @!%p20 bra $Lt_1_60162;\n" -" .loc 16 383 0\n" -" mov.f32 %f160, %f175;\n" -" mov.f32 %f175, %f302;\n" -" .loc 16 384 0\n" -" mov.f32 %f305, %f163;\n" -" mov.f32 %f163, %f178;\n" -" mov.f32 %f178, %f305;\n" -" .loc 16 385 0\n" -" mov.f32 %f306, %f150;\n" -" mov.f32 %f150, %f165;\n" -" mov.f32 %f165, %f306;\n" -"$Lt_1_60162:\n" -"$Lt_1_59650:\n" -" .loc 16 390 0\n" -" div.approx.ftz.f32 %f307, %f175, %f160;\n" -" mul.ftz.f32 %f308, %f163, %f307;\n" -" sub.ftz.f32 %f178, %f178, %f308;\n" -" .loc 16 391 0\n" -" mul.ftz.f32 %f309, %f150, %f307;\n" -" sub.ftz.f32 %f165, %f165, %f309;\n" -" mov.f32 %f310, 0f00000000; \n" -" setp.eq.ftz.f32 %p21, %f178, %f310;\n" -" @!%p21 bra $Lt_1_60674;\n" -" .loc 16 394 0\n" -" mov.s32 %r35, 2;\n" -" ld.param.u64 %rd43, [__cudaparm_kernel_sphere_ellipsoid_err_flag];\n" -" st.global.s32 [%rd43+0], %r35;\n" -"$Lt_1_60674:\n" -" .loc 17 436 0\n" -" div.approx.ftz.f32 %f311, %f165, %f178;\n" -" mul.ftz.f32 %f312, %f311, %f163;\n" -" sub.ftz.f32 %f313, %f150, %f312;\n" -" div.approx.ftz.f32 %f314, %f313, %f160;\n" -" mul.ftz.f32 %f315, %f314, %f141;\n" -" fma.rn.ftz.f32 %f316, %f149, %f311, %f315;\n" -" mul.ftz.f32 %f317, %f314, %f88;\n" -" sub.ftz.f32 %f318, %f93, %f316;\n" -" div.approx.ftz.f32 %f319, %f318, %f130;\n" -" fma.rn.ftz.f32 %f320, %f87, %f319, %f317;\n" -" fma.rn.ftz.f32 %f321, %f164, %f311, %f320;\n" -" add.ftz.f32 %f322, %f321, %f321;\n" -" .loc 17 444 0\n" -" rcp.approx.ftz.f32 %f323, %f86;\n" -" sub.ftz.f32 %f324, %f323, %f232;\n" -" mov.f32 %f325, 0f40000000; \n" -" div.approx.ftz.f32 %f326, %f324, %f325;\n" -" mul.ftz.f32 %f327, %f324, %f324;\n" -" mul.ftz.f32 %f328, %f324, %f327;\n" -" add.ftz.f32 %f329, %f326, %f72;\n" -" add.ftz.f32 %f330, %f326, %f70;\n" -" add.ftz.f32 %f331, %f326, %f71;\n" -" mul.ftz.f32 %f332, %f330, %f331;\n" -" mul.ftz.f32 %f333, %f329, %f332;\n" -" mul.ftz.f32 %f334, %f328, %f333;\n" -" .loc 17 446 0\n" -" mul.ftz.f32 %f335, %f70, %f71;\n" -" mul.ftz.f32 %f336, %f335, %f72;\n" -" div.approx.ftz.f32 %f337, %f89, %f324;\n" -" mul.ftz.f32 %f338, %f337, %f322;\n" -" mov.f32 %f339, 0f3f800000; \n" -" mov.f32 %f340, 0f40400000; \n" -" fma.rn.ftz.f32 %f341, %f340, %f338, %f339;\n" -" mul.ftz.f32 %f342, %f336, %f341;\n" -" .loc 17 450 0\n" -" div.approx.ftz.f32 %f343, %f324, %f17;\n" -" add.ftz.f32 %f344, %f343, %f72;\n" -" add.ftz.f32 %f345, %f343, %f70;\n" -" add.ftz.f32 %f346, %f343, %f71;\n" -" mul.ftz.f32 %f347, %f345, %f346;\n" -" mul.ftz.f32 %f348, %f344, %f347;\n" -" mul.ftz.f32 %f349, %f328, %f348;\n" -" .loc 17 452 0\n" -" mov.f32 %f350, 0f3f800000; \n" -" mov.f32 %f351, 0f3f4db6db; \n" -" fma.rn.ftz.f32 %f352, %f351, %f338, %f350;\n" -" mul.ftz.f32 %f353, %f336, %f352;\n" -" .loc 17 454 0\n" -" mul.ftz.f32 %f354, %f337, %f337;\n" -" mul.ftz.f32 %f355, %f337, %f354;\n" -" mul.ftz.f32 %f356, %f355, %f355;\n" -" .loc 17 457 0\n" -" mul.ftz.f32 %f357, %f89, %f89;\n" -" mov.f32 %f358, 0f41000000; \n" -" div.approx.ftz.f32 %f359, %f334, %f358;\n" -" mov.f32 %f360, 0f42700000; \n" -" div.approx.ftz.f32 %f361, %f349, %f360;\n" -" mul.ftz.f32 %f362, %f357, %f89;\n" -" div.approx.ftz.f32 %f363, %f342, %f359;\n" -" div.approx.ftz.f32 %f364, %f353, %f361;\n" -" mul.ftz.f32 %f365, %f363, %f92;\n" -" mul.ftz.f32 %f366, %f364, %f92;\n" -" mul.ftz.f32 %f367, %f362, %f365;\n" -" mul.ftz.f32 %f368, %f362, %f366;\n" -" mul.ftz.f32 %f369, %f367, %f41;\n" -" mul.ftz.f32 %f370, %f368, %f356;\n" -" mul.ftz.f32 %f371, %f370, %f45;\n" -" add.ftz.f32 %f372, %f369, %f371;\n" -" add.ftz.f32 %f65, %f65, %f372;\n" -" .loc 17 464 0\n" -" mov.f32 %f373, 0f40800000; \n" -" mul.ftz.f32 %f374, %f319, %f373;\n" -" .loc 17 471 0\n" -" mov.f32 %f375, 0f40400000; \n" -" div.approx.ftz.f32 %f376, %f375, %f324;\n" -" add.ftz.f32 %f377, %f70, %f70;\n" -" add.ftz.f32 %f378, %f324, %f377;\n" -" rcp.approx.ftz.f32 %f379, %f378;\n" -" add.ftz.f32 %f380, %f71, %f71;\n" -" add.ftz.f32 %f381, %f324, %f380;\n" -" rcp.approx.ftz.f32 %f382, %f381;\n" -" add.ftz.f32 %f383, %f379, %f382;\n" -" add.ftz.f32 %f384, %f72, %f72;\n" -" add.ftz.f32 %f385, %f324, %f384;\n" -" rcp.approx.ftz.f32 %f386, %f385;\n" -" add.ftz.f32 %f387, %f383, %f386;\n" -" add.ftz.f32 %f388, %f376, %f387;\n" -" .loc 17 476 0\n" -" mul.ftz.f32 %f389, %f89, %f322;\n" -" mov.f32 %f390, 0f40400000; \n" -" fma.rn.ftz.f32 %f391, %f390, %f389, %f324;\n" -" rcp.approx.ftz.f32 %f392, %f391;\n" -" rcp.approx.ftz.f32 %f393, %f324;\n" -" sub.ftz.f32 %f394, %f393, %f392;\n" -" add.ftz.f32 %f395, %f388, %f394;\n" -" .loc 17 479 0\n" -" fma.rn.ftz.f32 %f396, %f17, %f70, %f324;\n" -" rcp.approx.ftz.f32 %f397, %f396;\n" -" fma.rn.ftz.f32 %f398, %f17, %f71, %f324;\n" -" rcp.approx.ftz.f32 %f399, %f398;\n" -" add.ftz.f32 %f400, %f397, %f399;\n" -" fma.rn.ftz.f32 %f401, %f17, %f72, %f324;\n" -" rcp.approx.ftz.f32 %f402, %f401;\n" -" add.ftz.f32 %f403, %f400, %f402;\n" -" add.ftz.f32 %f404, %f376, %f403;\n" -" .loc 17 490 0\n" -" mul.ftz.f32 %f405, %f87, %f87;\n" -" neg.ftz.f32 %f406, %f405;\n" -" mov.f32 %f407, %f406;\n" -" .loc 17 491 0\n" -" mul.ftz.f32 %f408, %f88, %f87;\n" -" neg.ftz.f32 %f409, %f408;\n" -" mov.f32 %f410, %f409;\n" -" .loc 17 492 0\n" -" mul.ftz.f32 %f411, %f164, %f87;\n" -" neg.ftz.f32 %f412, %f411;\n" -" mov.f32 %f413, %f412;\n" -" .loc 17 493 0\n" -" mov.f32 %f414, 0f3f800000; \n" -" sub.ftz.f32 %f415, %f414, %f405;\n" -" mov.f32 %f416, %f415;\n" -" .loc 17 494 0\n" -" mul.ftz.f32 %f417, %f86, %f415;\n" -" mov.f32 %f418, %f417;\n" -" .loc 17 495 0\n" -" mov.f32 %f419, %f410;\n" -" mul.ftz.f32 %f420, %f419, %f86;\n" -" mov.f32 %f421, %f420;\n" -" .loc 17 496 0\n" -" mov.f32 %f422, %f413;\n" -" mul.ftz.f32 %f423, %f422, %f86;\n" -" mov.f32 %f424, %f423;\n" -" .loc 17 500 0\n" -" mul.ftz.f32 %f425, %f232, %f232;\n" -" mov.f32 %f426, 0f3f4db6db; \n" -" mul.ftz.f32 %f427, %f89, %f426;\n" -" mov.f32 %f428, 0f40800000; \n" -" mul.ftz.f32 %f429, %f311, %f428;\n" -" mul.ftz.f32 %f430, %f425, %f232;\n" -" mov.f32 %f431, 0f3f000000; \n" -" mul.ftz.f32 %f432, %f430, %f431;\n" -" mul.ftz.f32 %f433, %f432, %f222;\n" -" mul.ftz.f32 %f434, %f432, %f226;\n" -" mul.ftz.f32 %f435, %f432, %f219;\n" -" mov.f32 %f436, 0f40800000; \n" -" mul.ftz.f32 %f437, %f314, %f436;\n" -" mul.ftz.f32 %f438, %f433, %f420;\n" -" mul.ftz.f32 %f439, %f437, %f420;\n" -" mov.f32 %f440, 0f40e00000; \n" -" div.approx.ftz.f32 %f441, %f440, %f324;\n" -" mov.f32 %f442, 0f3f4db6db; \n" -" fma.rn.ftz.f32 %f443, %f442, %f389, %f324;\n" -" rcp.approx.ftz.f32 %f444, %f443;\n" -" fma.rn.ftz.f32 %f445, %f434, %f417, %f438;\n" -" fma.rn.ftz.f32 %f446, %f374, %f417, %f439;\n" -" sub.ftz.f32 %f447, %f441, %f444;\n" -" mul.ftz.f32 %f448, %f427, %f444;\n" -" fma.rn.ftz.f32 %f449, %f435, %f423, %f445;\n" -" fma.rn.ftz.f32 %f450, %f429, %f423, %f446;\n" -" add.ftz.f32 %f451, %f447, %f404;\n" -" add.ftz.f32 %f452, %f449, %f87;\n" -" mul.ftz.f32 %f453, %f451, %f452;\n" -" mul.ftz.f32 %f454, %f448, %f450;\n" -" sub.ftz.f32 %f455, %f454, %f453;\n" -" .loc 17 501 0\n" -" mov.f32 %f456, 0f40400000; \n" -" mul.ftz.f32 %f457, %f89, %f456;\n" -" mul.ftz.f32 %f458, %f457, %f392;\n" -" mul.ftz.f32 %f459, %f371, %f455;\n" -" mul.ftz.f32 %f460, %f452, %f395;\n" -" mul.ftz.f32 %f461, %f458, %f450;\n" -" sub.ftz.f32 %f462, %f461, %f460;\n" -" fma.rn.ftz.f32 %f463, %f369, %f462, %f459;\n" -" .loc 17 503 0\n" -" add.ftz.f32 %f64, %f463, %f64;\n" -" @!%p3 bra $Lt_1_61698;\n" -" .loc 17 505 0\n" -" mov.f32 %f464, %f47;\n" -" mul.ftz.f32 %f465, %f81, %f463;\n" -" sub.ftz.f32 %f466, %f464, %f465;\n" -" mov.f32 %f47, %f466;\n" -"$Lt_1_61698:\n" -" .loc 17 490 0\n" -" mov.f32 %f467, %f409;\n" -" .loc 17 491 0\n" -" mul.ftz.f32 %f468, %f88, %f88;\n" -" neg.ftz.f32 %f469, %f468;\n" -" mov.f32 %f470, %f469;\n" -" .loc 17 492 0\n" -" mul.ftz.f32 %f471, %f164, %f88;\n" -" neg.ftz.f32 %f472, %f471;\n" -" mov.f32 %f473, %f472;\n" -" .loc 17 493 0\n" -" mov.f32 %f474, 0f3f800000; \n" -" sub.ftz.f32 %f475, %f474, %f468;\n" -" mov.f32 %f476, %f475;\n" -" .loc 17 494 0\n" -" mov.f32 %f477, %f467;\n" -" mul.ftz.f32 %f478, %f477, %f86;\n" -" mov.f32 %f479, %f478;\n" -" .loc 17 495 0\n" -" mul.ftz.f32 %f480, %f86, %f475;\n" -" mov.f32 %f481, %f480;\n" -" .loc 17 496 0\n" -" mov.f32 %f482, %f473;\n" -" mul.ftz.f32 %f483, %f482, %f86;\n" -" mov.f32 %f484, %f483;\n" -" .loc 17 500 0\n" -" mul.ftz.f32 %f485, %f433, %f480;\n" -" mul.ftz.f32 %f486, %f437, %f480;\n" -" fma.rn.ftz.f32 %f487, %f434, %f478, %f485;\n" -" fma.rn.ftz.f32 %f488, %f374, %f478, %f486;\n" -" fma.rn.ftz.f32 %f489, %f435, %f483, %f487;\n" -" fma.rn.ftz.f32 %f490, %f429, %f483, %f488;\n" -" add.ftz.f32 %f491, %f489, %f88;\n" -" mul.ftz.f32 %f492, %f451, %f491;\n" -" mul.ftz.f32 %f493, %f448, %f490;\n" -" sub.ftz.f32 %f494, %f493, %f492;\n" -" .loc 17 501 0\n" -" mul.ftz.f32 %f495, %f371, %f494;\n" -" mul.ftz.f32 %f496, %f491, %f395;\n" -" mul.ftz.f32 %f497, %f458, %f490;\n" -" sub.ftz.f32 %f498, %f497, %f496;\n" -" fma.rn.ftz.f32 %f463, %f369, %f498, %f495;\n" -" .loc 17 507 0\n" -" add.ftz.f32 %f63, %f463, %f63;\n" -" @!%p3 bra $Lt_1_65282;\n" -" .loc 17 509 0\n" -" mov.f32 %f499, %f49;\n" -" mul.ftz.f32 %f500, %f80, %f463;\n" -" sub.ftz.f32 %f501, %f499, %f500;\n" -" mov.f32 %f49, %f501;\n" -" .loc 17 510 0\n" -" mov.f32 %f502, %f53;\n" -" mul.ftz.f32 %f503, %f81, %f463;\n" -" sub.ftz.f32 %f504, %f502, %f503;\n" -" mov.f32 %f53, %f504;\n" -"$Lt_1_65282:\n" -" .loc 17 490 0\n" -" mov.f32 %f505, %f412;\n" -" .loc 17 491 0\n" -" mov.f32 %f506, %f472;\n" -" .loc 17 492 0\n" -" mul.ftz.f32 %f507, %f164, %f164;\n" -" neg.ftz.f32 %f508, %f507;\n" -" mov.f32 %f509, %f508;\n" -" .loc 17 493 0\n" -" mov.f32 %f510, 0f3f800000; \n" -" sub.ftz.f32 %f511, %f510, %f507;\n" -" mov.f32 %f512, %f511;\n" -" .loc 17 494 0\n" -" mov.f32 %f513, %f505;\n" -" mul.ftz.f32 %f514, %f513, %f86;\n" -" mov.f32 %f515, %f514;\n" -" .loc 17 495 0\n" -" mov.f32 %f516, %f506;\n" -" mul.ftz.f32 %f517, %f516, %f86;\n" -" mov.f32 %f518, %f517;\n" -" .loc 17 496 0\n" -" mul.ftz.f32 %f519, %f86, %f511;\n" -" mov.f32 %f520, %f519;\n" -" .loc 17 500 0\n" -" mul.ftz.f32 %f521, %f433, %f517;\n" -" mul.ftz.f32 %f522, %f437, %f517;\n" -" fma.rn.ftz.f32 %f523, %f434, %f514, %f521;\n" -" fma.rn.ftz.f32 %f524, %f374, %f514, %f522;\n" -" fma.rn.ftz.f32 %f525, %f435, %f519, %f523;\n" -" fma.rn.ftz.f32 %f526, %f429, %f519, %f524;\n" -" add.ftz.f32 %f527, %f525, %f164;\n" -" mul.ftz.f32 %f528, %f527, %f451;\n" -" mul.ftz.f32 %f529, %f448, %f526;\n" -" sub.ftz.f32 %f530, %f529, %f528;\n" -" .loc 17 501 0\n" -" mul.ftz.f32 %f531, %f371, %f530;\n" -" mul.ftz.f32 %f532, %f527, %f395;\n" -" mul.ftz.f32 %f533, %f458, %f526;\n" -" sub.ftz.f32 %f534, %f533, %f532;\n" -" fma.rn.ftz.f32 %f463, %f369, %f534, %f531;\n" -" .loc 17 513 0\n" -" add.ftz.f32 %f62, %f463, %f62;\n" -" @!%p3 bra $Lt_1_68354;\n" -" .loc 17 515 0\n" -" mov.f32 %f535, %f51;\n" -" mul.ftz.f32 %f536, %f82, %f463;\n" -" sub.ftz.f32 %f537, %f535, %f536;\n" -" mov.f32 %f51, %f537;\n" -" .loc 17 516 0\n" -" mov.f32 %f538, %f55;\n" -" mul.ftz.f32 %f539, %f81, %f463;\n" -" sub.ftz.f32 %f540, %f538, %f539;\n" -" mov.f32 %f55, %f540;\n" -" .loc 17 517 0\n" -" mul.ftz.f32 %f541, %f80, %f463;\n" -" sub.ftz.f32 %f56, %f56, %f541;\n" -" mov.f32 %f57, %f56;\n" -"$Lt_1_68354:\n" -" mul.lo.s32 %r36, %r16, %r2;\n" -" cvt.s64.s32 %rd44, %r36;\n" -" mul.wide.s32 %rd45, %r36, 4;\n" -" add.u64 %rd17, %rd17, %rd45;\n" -" setp.gt.u64 %p22, %rd20, %rd17;\n" -" @%p22 bra $Lt_1_51714;\n" -" bra.uni $Lt_1_51202;\n" -"$Lt_1_75010:\n" -" mov.f32 %f62, 0f00000000; \n" -" mov.f32 %f63, 0f00000000; \n" -" mov.f32 %f64, 0f00000000; \n" -" mov.f32 %f65, 0f00000000; \n" -"$Lt_1_51202:\n" -" mov.u32 %r37, 1;\n" -" setp.le.s32 %p23, %r2, %r37;\n" -" @%p23 bra $Lt_1_71170;\n" -" .loc 17 522 0\n" -" mov.u64 %rd46, __cuda___cuda_local_var_33377_55_non_const_red_acc3852;\n" -" cvt.s64.s32 %rd47, %r3;\n" -" mul.wide.s32 %rd48, %r3, 4;\n" -" add.u64 %rd49, %rd46, %rd48;\n" -" mov.f32 %f542, %f64;\n" -" st.shared.f32 [%rd49+0], %f542;\n" -" mov.f32 %f543, %f63;\n" -" st.shared.f32 [%rd49+512], %f543;\n" -" mov.f32 %f544, %f62;\n" -" st.shared.f32 [%rd49+1024], %f544;\n" -" mov.f32 %f545, %f65;\n" -" st.shared.f32 [%rd49+1536], %f545;\n" -" shr.s32 %r38, %r2, 31;\n" -" mov.s32 %r39, 1;\n" -" and.b32 %r40, %r38, %r39;\n" -" add.s32 %r41, %r40, %r2;\n" -" shr.s32 %r42, %r41, 1;\n" -" mov.s32 %r43, %r42;\n" -" mov.u32 %r44, 0;\n" -" setp.ne.u32 %p24, %r42, %r44;\n" -" @!%p24 bra $Lt_1_69634;\n" -"$Lt_1_70146:\n" -" setp.ge.u32 %p25, %r18, %r43;\n" -" @%p25 bra $Lt_1_70402;\n" -" add.u32 %r45, %r3, %r43;\n" -" cvt.u64.u32 %rd50, %r45;\n" -" mul.wide.u32 %rd51, %r45, 4;\n" -" add.u64 %rd52, %rd46, %rd51;\n" -" ld.shared.f32 %f546, [%rd52+0];\n" -" add.ftz.f32 %f542, %f546, %f542;\n" -" st.shared.f32 [%rd49+0], %f542;\n" -" ld.shared.f32 %f547, [%rd52+512];\n" -" add.ftz.f32 %f543, %f547, %f543;\n" -" st.shared.f32 [%rd49+512], %f543;\n" -" ld.shared.f32 %f548, [%rd52+1024];\n" -" add.ftz.f32 %f544, %f548, %f544;\n" -" st.shared.f32 [%rd49+1024], %f544;\n" -" ld.shared.f32 %f549, [%rd52+1536];\n" -" add.ftz.f32 %f545, %f549, %f545;\n" -" st.shared.f32 [%rd49+1536], %f545;\n" -"$Lt_1_70402:\n" -" shr.u32 %r43, %r43, 1;\n" -" mov.u32 %r46, 0;\n" -" setp.ne.u32 %p26, %r43, %r46;\n" -" @%p26 bra $Lt_1_70146;\n" -"$Lt_1_69634:\n" -" mov.f32 %f64, %f542;\n" -" mov.f32 %f63, %f543;\n" -" mov.f32 %f62, %f544;\n" -" mov.f32 %f65, %f545;\n" -" ld.param.s32 %r47, [__cudaparm_kernel_sphere_ellipsoid_vflag];\n" -" mov.u32 %r48, 0;\n" -" setp.le.s32 %p27, %r47, %r48;\n" -" @%p27 bra $Lt_1_71170;\n" -" mov.f32 %f542, %f47;\n" -" st.shared.f32 [%rd49+0], %f542;\n" -" mov.f32 %f543, %f49;\n" -" st.shared.f32 [%rd49+512], %f543;\n" -" mov.f32 %f544, %f51;\n" -" st.shared.f32 [%rd49+1024], %f544;\n" -" mov.f32 %f545, %f53;\n" -" st.shared.f32 [%rd49+1536], %f545;\n" -" mov.f32 %f550, %f55;\n" -" st.shared.f32 [%rd49+2048], %f550;\n" -" mov.f32 %f551, %f56;\n" -" st.shared.f32 [%rd49+2560], %f551;\n" -" mov.s32 %r49, %r42;\n" -" @!%p24 bra $Lt_1_71682;\n" -"$Lt_1_72194:\n" -" setp.ge.u32 %p28, %r18, %r49;\n" -" @%p28 bra $Lt_1_72450;\n" -" add.u32 %r50, %r3, %r49;\n" -" cvt.u64.u32 %rd53, %r50;\n" -" mul.wide.u32 %rd54, %r50, 4;\n" -" add.u64 %rd55, %rd46, %rd54;\n" -" ld.shared.f32 %f552, [%rd55+0];\n" -" add.ftz.f32 %f542, %f552, %f542;\n" -" st.shared.f32 [%rd49+0], %f542;\n" -" ld.shared.f32 %f553, [%rd55+512];\n" -" add.ftz.f32 %f543, %f553, %f543;\n" -" st.shared.f32 [%rd49+512], %f543;\n" -" ld.shared.f32 %f554, [%rd55+1024];\n" -" add.ftz.f32 %f544, %f554, %f544;\n" -" st.shared.f32 [%rd49+1024], %f544;\n" -" ld.shared.f32 %f555, [%rd55+1536];\n" -" add.ftz.f32 %f545, %f555, %f545;\n" -" st.shared.f32 [%rd49+1536], %f545;\n" -" ld.shared.f32 %f556, [%rd55+2048];\n" -" add.ftz.f32 %f550, %f556, %f550;\n" -" st.shared.f32 [%rd49+2048], %f550;\n" -" ld.shared.f32 %f557, [%rd55+2560];\n" -" add.ftz.f32 %f551, %f557, %f551;\n" -" st.shared.f32 [%rd49+2560], %f551;\n" -"$Lt_1_72450:\n" -" shr.u32 %r49, %r49, 1;\n" -" mov.u32 %r51, 0;\n" -" setp.ne.u32 %p29, %r49, %r51;\n" -" @%p29 bra $Lt_1_72194;\n" -"$Lt_1_71682:\n" -" mov.f32 %f47, %f542;\n" -" mov.f32 %f49, %f543;\n" -" mov.f32 %f51, %f544;\n" -" mov.f32 %f53, %f545;\n" -" mov.f32 %f55, %f550;\n" -" mov.f32 %f57, %f551;\n" -"$Lt_1_71170:\n" -"$Lt_1_69122:\n" -" mov.u32 %r52, 0;\n" -" setp.ne.s32 %p30, %r18, %r52;\n" -" @%p30 bra $Lt_1_73218;\n" -" ld.param.u64 %rd56, [__cudaparm_kernel_sphere_ellipsoid___val_paramengv];\n" -" add.u64 %rd57, %rd56, %rd3;\n" -" ld.param.s32 %r53, [__cudaparm_kernel_sphere_ellipsoid_eflag];\n" -" mov.u32 %r54, 0;\n" -" setp.le.s32 %p31, %r53, %r54;\n" -" @%p31 bra $Lt_1_73730;\n" -" st.global.f32 [%rd57+0], %f65;\n" -" cvt.s64.s32 %rd58, %r12;\n" -" mul.wide.s32 %rd59, %r12, 4;\n" -" add.u64 %rd57, %rd57, %rd59;\n" -"$Lt_1_73730:\n" -" ld.param.s32 %r55, [__cudaparm_kernel_sphere_ellipsoid_vflag];\n" -" mov.u32 %r56, 0;\n" -" setp.le.s32 %p32, %r55, %r56;\n" -" @%p32 bra $Lt_1_74242;\n" -" mov.f32 %f558, %f47;\n" -" st.global.f32 [%rd57+0], %f558;\n" -" cvt.s64.s32 %rd60, %r12;\n" -" mul.wide.s32 %rd61, %r12, 4;\n" -" add.u64 %rd62, %rd61, %rd57;\n" -" mov.f32 %f559, %f49;\n" -" st.global.f32 [%rd62+0], %f559;\n" -" add.u64 %rd63, %rd61, %rd62;\n" -" mov.f32 %f560, %f51;\n" -" st.global.f32 [%rd63+0], %f560;\n" -" add.u64 %rd64, %rd61, %rd63;\n" -" mov.f32 %f561, %f53;\n" -" st.global.f32 [%rd64+0], %f561;\n" -" add.u64 %rd57, %rd61, %rd64;\n" -" mov.f32 %f562, %f55;\n" -" st.global.f32 [%rd57+0], %f562;\n" -" mov.f32 %f563, %f57;\n" -" add.u64 %rd65, %rd61, %rd57;\n" -" st.global.f32 [%rd65+0], %f563;\n" -"$Lt_1_74242:\n" -" ld.param.u64 %rd66, [__cudaparm_kernel_sphere_ellipsoid_ans];\n" -" mul.lo.u64 %rd67, %rd2, 16;\n" -" add.u64 %rd68, %rd66, %rd67;\n" -" mov.f32 %f564, %f565;\n" -" st.global.v4.f32 [%rd68+0], {%f64,%f63,%f62,%f564};\n" -"$Lt_1_73218:\n" -"$Lt_1_50690:\n" -" .loc 17 525 0\n" -" exit;\n" -"$LDWend_kernel_sphere_ellipsoid:\n" -" }\n" -" .entry kernel_lj (\n" -" .param .u64 __cudaparm_kernel_lj_x_,\n" -" .param .u64 __cudaparm_kernel_lj_lj1,\n" -" .param .u64 __cudaparm_kernel_lj_lj3,\n" -" .param .s32 __cudaparm_kernel_lj_lj_types,\n" -" .param .u64 __cudaparm_kernel_lj_gum,\n" -" .param .s32 __cudaparm_kernel_lj_stride,\n" -" .param .u64 __cudaparm_kernel_lj_dev_ij,\n" -" .param .u64 __cudaparm_kernel_lj_ans,\n" -" .param .u64 __cudaparm_kernel_lj___val_paramengv,\n" -" .param .u64 __cudaparm_kernel_lj_err_flag,\n" -" .param .s32 __cudaparm_kernel_lj_eflag,\n" -" .param .s32 __cudaparm_kernel_lj_vflag,\n" -" .param .s32 __cudaparm_kernel_lj_start,\n" -" .param .s32 __cudaparm_kernel_lj_inum,\n" -" .param .s32 __cudaparm_kernel_lj_t_per_atom)\n" -" {\n" -" .reg .u32 %r<55>;\n" -" .reg .u64 %rd<60>;\n" -" .reg .f32 %f<115>;\n" -" .reg .pred %p<19>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_33394_33_non_const_sp_lj7028[16];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_33459_55_non_const_red_acc7044[3072];\n" -" .loc 17 534 0\n" -"$LDWbegin_kernel_lj:\n" -" .loc 17 540 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_lj_gum];\n" -" ldu.global.f32 %f1, [%rd1+0];\n" -" .loc 17 541 0\n" -" ld.global.f32 %f2, [%rd1+4];\n" -" .loc 17 542 0\n" -" ld.global.f32 %f3, [%rd1+8];\n" -" .loc 17 543 0\n" -" ld.global.f32 %f4, [%rd1+12];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_33394_33_non_const_sp_lj7028+0], {%f1,%f2,%f3,%f4};\n" -" .loc 17 552 0\n" -" mov.f32 %f5, 0f00000000; \n" -" mov.f32 %f6, %f5;\n" -" mov.f32 %f7, 0f00000000; \n" -" mov.f32 %f8, %f7;\n" -" mov.f32 %f9, 0f00000000; \n" -" mov.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f00000000; \n" -" mov.f32 %f12, %f11;\n" -" mov.f32 %f13, 0f00000000; \n" -" mov.f32 %f14, %f13;\n" -" mov.f32 %f15, 0f00000000; \n" -" mov.f32 %f16, %f15;\n" -" ld.param.s32 %r1, [__cudaparm_kernel_lj_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_lj_start];\n" -" add.s32 %r10, %r9, %r8;\n" -" ld.param.s32 %r11, [__cudaparm_kernel_lj_inum];\n" -" setp.ge.s32 %p1, %r10, %r11;\n" -" @%p1 bra $Lt_2_25346;\n" -" .loc 17 557 0\n" -" cvt.s64.s32 %rd2, %r10;\n" -" mul.wide.s32 %rd3, %r10, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_lj_dev_ij];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.s32 %r12, [%rd5+0];\n" -" ld.param.s32 %r13, [__cudaparm_kernel_lj_stride];\n" -" cvt.s64.s32 %rd6, %r13;\n" -" mul.wide.s32 %rd7, %r13, 4;\n" -" add.u64 %rd8, %rd7, %rd5;\n" -" ld.global.s32 %r14, [%rd8+0];\n" -" .loc 17 560 0\n" -" ld.param.u64 %rd9, [__cudaparm_kernel_lj_x_];\n" -" cvt.s64.s32 %rd10, %r12;\n" -" mul.wide.s32 %rd11, %r12, 16;\n" -" add.u64 %rd12, %rd9, %rd11;\n" -" ld.global.v4.f32 {%f17,%f18,%f19,%f20}, [%rd12+0];\n" -" .loc 17 561 0\n" -" cvt.s32.s64 %r15, %rd6;\n" -" sub.s32 %r16, %r1, 1;\n" -" and.b32 %r17, %r16, %r2;\n" -" add.u64 %rd13, %rd7, %rd8;\n" -" mul.lo.s32 %r18, %r15, %r17;\n" -" cvt.s64.s32 %rd14, %r18;\n" -" mul.wide.s32 %rd15, %r18, 4;\n" -" add.u64 %rd16, %rd13, %rd15;\n" -" mov.s64 %rd17, %rd16;\n" -" mul.lo.s32 %r19, %r15, %r14;\n" -" cvt.s64.s32 %rd18, %r19;\n" -" mul.wide.s32 %rd19, %r19, 4;\n" -" add.u64 %rd20, %rd13, %rd19;\n" -" setp.ge.u64 %p2, %rd16, %rd20;\n" -" @%p2 bra $Lt_2_26882;\n" -" cvt.rzi.ftz.s32.f32 %r20, %f20;\n" -" ld.param.s32 %r21, [__cudaparm_kernel_lj_lj_types];\n" -" mul.lo.s32 %r22, %r21, %r20;\n" -" ld.param.u64 %rd21, [__cudaparm_kernel_lj_lj1];\n" -" mov.f32 %f21, 0f00000000; \n" -" mov.f32 %f22, 0f00000000; \n" -" mov.f32 %f23, 0f00000000; \n" -" mov.f32 %f24, 0f00000000; \n" -" mov.u64 %rd22, __cuda___cuda_local_var_33394_33_non_const_sp_lj7028;\n" -"$Lt_2_19714:\n" -" .loc 17 566 0\n" -" ld.global.s32 %r23, [%rd17+0];\n" -" .loc 17 567 0\n" -" shr.s32 %r24, %r23, 30;\n" -" and.b32 %r25, %r24, 3;\n" -" cvt.s64.s32 %rd23, %r25;\n" -" mul.wide.s32 %rd24, %r25, 4;\n" -" add.u64 %rd25, %rd22, %rd24;\n" -" ld.shared.f32 %f25, [%rd25+0];\n" -" .loc 17 570 0\n" -" and.b32 %r26, %r23, 1073741823;\n" -" cvt.s64.s32 %rd26, %r26;\n" -" mul.wide.s32 %rd27, %r26, 16;\n" -" add.u64 %rd28, %rd9, %rd27;\n" -" ld.global.v4.f32 {%f26,%f27,%f28,%f29}, [%rd28+0];\n" -" .loc 17 566 0\n" -" cvt.rzi.ftz.s32.f32 %r27, %f29;\n" -" sub.ftz.f32 %f30, %f18, %f27;\n" -" sub.ftz.f32 %f31, %f17, %f26;\n" -" sub.ftz.f32 %f32, %f19, %f28;\n" -" mul.ftz.f32 %f33, %f30, %f30;\n" -" fma.rn.ftz.f32 %f34, %f31, %f31, %f33;\n" -" fma.rn.ftz.f32 %f35, %f32, %f32, %f34;\n" -" add.s32 %r28, %r27, %r22;\n" -" cvt.s64.s32 %rd29, %r28;\n" -" mul.wide.s32 %rd30, %r28, 16;\n" -" add.u64 %rd31, %rd30, %rd21;\n" -" ld.global.f32 %f36, [%rd31+8];\n" -" setp.gt.ftz.f32 %p3, %f36, %f35;\n" -" @!%p3 bra $Lt_2_27138;\n" -" ld.global.f32 %f37, [%rd31+12];\n" -" mov.f32 %f38, 0f00000000; \n" -" setp.eq.ftz.f32 %p4, %f37, %f38;\n" -" @!%p4 bra $Lt_2_27138;\n" -" .loc 17 584 0\n" -" rcp.approx.ftz.f32 %f39, %f35;\n" -" mul.ftz.f32 %f40, %f39, %f39;\n" -" mul.ftz.f32 %f41, %f39, %f40;\n" -" mul.ftz.f32 %f42, %f39, %f41;\n" -" ld.global.v2.f32 {%f43,%f44}, [%rd31+0];\n" -" mul.ftz.f32 %f45, %f43, %f41;\n" -" sub.ftz.f32 %f46, %f45, %f44;\n" -" mul.ftz.f32 %f47, %f42, %f46;\n" -" mul.ftz.f32 %f48, %f25, %f47;\n" -" .loc 17 586 0\n" -" fma.rn.ftz.f32 %f23, %f31, %f48, %f23;\n" -" .loc 17 587 0\n" -" fma.rn.ftz.f32 %f22, %f30, %f48, %f22;\n" -" .loc 17 588 0\n" -" fma.rn.ftz.f32 %f21, %f32, %f48, %f21;\n" -" ld.param.s32 %r29, [__cudaparm_kernel_lj_eflag];\n" -" mov.u32 %r30, 0;\n" -" setp.le.s32 %p5, %r29, %r30;\n" -" @%p5 bra $Lt_2_19970;\n" -" .loc 17 592 0\n" -" ld.param.u64 %rd32, [__cudaparm_kernel_lj_lj3];\n" -" add.u64 %rd33, %rd32, %rd30;\n" -" ld.global.v4.f32 {%f49,%f50,%f51,_}, [%rd33+0];\n" -" mul.ftz.f32 %f52, %f49, %f41;\n" -" sub.ftz.f32 %f53, %f52, %f50;\n" -" mul.ftz.f32 %f54, %f41, %f53;\n" -" sub.ftz.f32 %f55, %f54, %f51;\n" -" fma.rn.ftz.f32 %f24, %f25, %f55, %f24;\n" -"$Lt_2_19970:\n" -" ld.param.s32 %r31, [__cudaparm_kernel_lj_vflag];\n" -" mov.u32 %r32, 0;\n" -" setp.le.s32 %p6, %r31, %r32;\n" -" @%p6 bra $Lt_2_27138;\n" -" .loc 17 595 0\n" -" mov.f32 %f56, %f6;\n" -" mul.ftz.f32 %f57, %f31, %f31;\n" -" fma.rn.ftz.f32 %f58, %f48, %f57, %f56;\n" -" mov.f32 %f6, %f58;\n" -" .loc 17 596 0\n" -" mov.f32 %f59, %f8;\n" -" fma.rn.ftz.f32 %f60, %f48, %f33, %f59;\n" -" mov.f32 %f8, %f60;\n" -" .loc 17 597 0\n" -" mov.f32 %f61, %f10;\n" -" mul.ftz.f32 %f62, %f32, %f32;\n" -" fma.rn.ftz.f32 %f63, %f48, %f62, %f61;\n" -" mov.f32 %f10, %f63;\n" -" .loc 17 598 0\n" -" mov.f32 %f64, %f12;\n" -" mul.ftz.f32 %f65, %f30, %f31;\n" -" fma.rn.ftz.f32 %f66, %f48, %f65, %f64;\n" -" mov.f32 %f12, %f66;\n" -" .loc 17 599 0\n" -" mov.f32 %f67, %f14;\n" -" mul.ftz.f32 %f68, %f31, %f32;\n" -" fma.rn.ftz.f32 %f69, %f48, %f68, %f67;\n" -" mov.f32 %f14, %f69;\n" -" .loc 17 600 0\n" -" mul.ftz.f32 %f70, %f30, %f32;\n" -" fma.rn.ftz.f32 %f15, %f48, %f70, %f15;\n" -" mov.f32 %f16, %f15;\n" -"$Lt_2_27138:\n" -"$L_2_18178:\n" -" .loc 17 594 0\n" -" mul.lo.s32 %r33, %r15, %r1;\n" -" cvt.s64.s32 %rd34, %r33;\n" -" mul.wide.s32 %rd35, %r33, 4;\n" -" add.u64 %rd17, %rd17, %rd35;\n" -" setp.gt.u64 %p7, %rd20, %rd17;\n" -" @%p7 bra $Lt_2_19714;\n" -" bra.uni $Lt_2_19202;\n" -"$Lt_2_26882:\n" -" mov.f32 %f21, 0f00000000; \n" -" mov.f32 %f22, 0f00000000; \n" -" mov.f32 %f23, 0f00000000; \n" -" mov.f32 %f24, 0f00000000; \n" -"$Lt_2_19202:\n" -" mov.u32 %r34, 1;\n" -" setp.le.s32 %p8, %r1, %r34;\n" -" @%p8 bra $Lt_2_23298;\n" -" .loc 17 604 0\n" -" mov.u64 %rd36, __cuda___cuda_local_var_33459_55_non_const_red_acc7044;\n" -" cvt.s64.s32 %rd37, %r2;\n" -" mul.wide.s32 %rd38, %r2, 4;\n" -" add.u64 %rd39, %rd36, %rd38;\n" -" mov.f32 %f71, %f23;\n" -" st.shared.f32 [%rd39+0], %f71;\n" -" mov.f32 %f72, %f22;\n" -" st.shared.f32 [%rd39+512], %f72;\n" -" mov.f32 %f73, %f21;\n" -" st.shared.f32 [%rd39+1024], %f73;\n" -" mov.f32 %f74, %f24;\n" -" st.shared.f32 [%rd39+1536], %f74;\n" -" shr.s32 %r35, %r1, 31;\n" -" mov.s32 %r36, 1;\n" -" and.b32 %r37, %r35, %r36;\n" -" add.s32 %r38, %r37, %r1;\n" -" shr.s32 %r39, %r38, 1;\n" -" mov.s32 %r40, %r39;\n" -" mov.u32 %r41, 0;\n" -" setp.ne.u32 %p9, %r39, %r41;\n" -" @!%p9 bra $Lt_2_21762;\n" -"$Lt_2_22274:\n" -" setp.ge.u32 %p10, %r17, %r40;\n" -" @%p10 bra $Lt_2_22530;\n" -" add.u32 %r42, %r2, %r40;\n" -" cvt.u64.u32 %rd40, %r42;\n" -" mul.wide.u32 %rd41, %r42, 4;\n" -" add.u64 %rd42, %rd36, %rd41;\n" -" ld.shared.f32 %f75, [%rd42+0];\n" -" add.ftz.f32 %f71, %f75, %f71;\n" -" st.shared.f32 [%rd39+0], %f71;\n" -" ld.shared.f32 %f76, [%rd42+512];\n" -" add.ftz.f32 %f72, %f76, %f72;\n" -" st.shared.f32 [%rd39+512], %f72;\n" -" ld.shared.f32 %f77, [%rd42+1024];\n" -" add.ftz.f32 %f73, %f77, %f73;\n" -" st.shared.f32 [%rd39+1024], %f73;\n" -" ld.shared.f32 %f78, [%rd42+1536];\n" -" add.ftz.f32 %f74, %f78, %f74;\n" -" st.shared.f32 [%rd39+1536], %f74;\n" -"$Lt_2_22530:\n" -" shr.u32 %r40, %r40, 1;\n" -" mov.u32 %r43, 0;\n" -" setp.ne.u32 %p11, %r40, %r43;\n" -" @%p11 bra $Lt_2_22274;\n" -"$Lt_2_21762:\n" -" mov.f32 %f23, %f71;\n" -" mov.f32 %f22, %f72;\n" -" mov.f32 %f21, %f73;\n" -" mov.f32 %f24, %f74;\n" -" ld.param.s32 %r44, [__cudaparm_kernel_lj_vflag];\n" -" mov.u32 %r45, 0;\n" -" setp.le.s32 %p12, %r44, %r45;\n" -" @%p12 bra $Lt_2_23298;\n" -" mov.f32 %f71, %f6;\n" -" st.shared.f32 [%rd39+0], %f71;\n" -" mov.f32 %f72, %f8;\n" -" st.shared.f32 [%rd39+512], %f72;\n" -" mov.f32 %f73, %f10;\n" -" st.shared.f32 [%rd39+1024], %f73;\n" -" mov.f32 %f74, %f12;\n" -" st.shared.f32 [%rd39+1536], %f74;\n" -" mov.f32 %f79, %f14;\n" -" st.shared.f32 [%rd39+2048], %f79;\n" -" mov.f32 %f80, %f15;\n" -" st.shared.f32 [%rd39+2560], %f80;\n" -" mov.s32 %r46, %r39;\n" -" @!%p9 bra $Lt_2_23810;\n" -"$Lt_2_24322:\n" -" setp.ge.u32 %p13, %r17, %r46;\n" -" @%p13 bra $Lt_2_24578;\n" -" add.u32 %r47, %r2, %r46;\n" -" cvt.u64.u32 %rd43, %r47;\n" -" mul.wide.u32 %rd44, %r47, 4;\n" -" add.u64 %rd45, %rd36, %rd44;\n" -" ld.shared.f32 %f81, [%rd45+0];\n" -" add.ftz.f32 %f71, %f81, %f71;\n" -" st.shared.f32 [%rd39+0], %f71;\n" -" ld.shared.f32 %f82, [%rd45+512];\n" -" add.ftz.f32 %f72, %f82, %f72;\n" -" st.shared.f32 [%rd39+512], %f72;\n" -" ld.shared.f32 %f83, [%rd45+1024];\n" -" add.ftz.f32 %f73, %f83, %f73;\n" -" st.shared.f32 [%rd39+1024], %f73;\n" -" ld.shared.f32 %f84, [%rd45+1536];\n" -" add.ftz.f32 %f74, %f84, %f74;\n" -" st.shared.f32 [%rd39+1536], %f74;\n" -" ld.shared.f32 %f85, [%rd45+2048];\n" -" add.ftz.f32 %f79, %f85, %f79;\n" -" st.shared.f32 [%rd39+2048], %f79;\n" -" ld.shared.f32 %f86, [%rd45+2560];\n" -" add.ftz.f32 %f80, %f86, %f80;\n" -" st.shared.f32 [%rd39+2560], %f80;\n" -"$Lt_2_24578:\n" -" shr.u32 %r46, %r46, 1;\n" -" mov.u32 %r48, 0;\n" -" setp.ne.u32 %p14, %r46, %r48;\n" -" @%p14 bra $Lt_2_24322;\n" -"$Lt_2_23810:\n" -" mov.f32 %f6, %f71;\n" -" mov.f32 %f8, %f72;\n" -" mov.f32 %f10, %f73;\n" -" mov.f32 %f12, %f74;\n" -" mov.f32 %f14, %f79;\n" -" mov.f32 %f16, %f80;\n" -"$Lt_2_23298:\n" -"$Lt_2_21250:\n" -" mov.u32 %r49, 0;\n" -" setp.ne.s32 %p15, %r17, %r49;\n" -" @%p15 bra $Lt_2_25346;\n" -" ld.param.u64 %rd46, [__cudaparm_kernel_lj___val_paramengv];\n" -" add.u64 %rd47, %rd46, %rd3;\n" -" ld.param.s32 %r50, [__cudaparm_kernel_lj_eflag];\n" -" mov.u32 %r51, 0;\n" -" setp.le.s32 %p16, %r50, %r51;\n" -" @%p16 bra $Lt_2_25858;\n" -" ld.global.f32 %f87, [%rd47+0];\n" -" add.ftz.f32 %f88, %f87, %f24;\n" -" st.global.f32 [%rd47+0], %f88;\n" -" cvt.s64.s32 %rd48, %r11;\n" -" mul.wide.s32 %rd49, %r11, 4;\n" -" add.u64 %rd47, %rd47, %rd49;\n" -"$Lt_2_25858:\n" -" ld.param.s32 %r52, [__cudaparm_kernel_lj_vflag];\n" -" mov.u32 %r53, 0;\n" -" setp.le.s32 %p17, %r52, %r53;\n" -" @%p17 bra $Lt_2_26370;\n" -" ld.global.f32 %f89, [%rd47+0];\n" -" mov.f32 %f90, %f6;\n" -" add.ftz.f32 %f91, %f89, %f90;\n" -" st.global.f32 [%rd47+0], %f91;\n" -" cvt.s64.s32 %rd50, %r11;\n" -" mul.wide.s32 %rd51, %r11, 4;\n" -" add.u64 %rd52, %rd51, %rd47;\n" -" ld.global.f32 %f92, [%rd52+0];\n" -" mov.f32 %f93, %f8;\n" -" add.ftz.f32 %f94, %f92, %f93;\n" -" st.global.f32 [%rd52+0], %f94;\n" -" add.u64 %rd53, %rd51, %rd52;\n" -" ld.global.f32 %f95, [%rd53+0];\n" -" mov.f32 %f96, %f10;\n" -" add.ftz.f32 %f97, %f95, %f96;\n" -" st.global.f32 [%rd53+0], %f97;\n" -" add.u64 %rd54, %rd51, %rd53;\n" -" ld.global.f32 %f98, [%rd54+0];\n" -" mov.f32 %f99, %f12;\n" -" add.ftz.f32 %f100, %f98, %f99;\n" -" st.global.f32 [%rd54+0], %f100;\n" -" add.u64 %rd55, %rd51, %rd54;\n" -" ld.global.f32 %f101, [%rd55+0];\n" -" mov.f32 %f102, %f14;\n" -" add.ftz.f32 %f103, %f101, %f102;\n" -" st.global.f32 [%rd55+0], %f103;\n" -" add.u64 %rd47, %rd51, %rd55;\n" -" ld.global.f32 %f104, [%rd47+0];\n" -" mov.f32 %f105, %f16;\n" -" add.ftz.f32 %f106, %f104, %f105;\n" -" st.global.f32 [%rd47+0], %f106;\n" -"$Lt_2_26370:\n" -" ld.param.u64 %rd56, [__cudaparm_kernel_lj_ans];\n" -" mul.lo.u64 %rd57, %rd2, 16;\n" -" add.u64 %rd58, %rd56, %rd57;\n" -" ld.global.v4.f32 {%f107,%f108,%f109,%f110}, [%rd58+0];\n" -" add.ftz.f32 %f111, %f108, %f22;\n" -" add.ftz.f32 %f112, %f109, %f21;\n" -" add.ftz.f32 %f113, %f107, %f23;\n" -" st.global.v4.f32 [%rd58+0], {%f113,%f111,%f112,%f110};\n" -"$Lt_2_25346:\n" -"$Lt_2_18690:\n" -" .loc 17 607 0\n" -" exit;\n" -"$LDWend_kernel_lj:\n" -" }\n" -" .entry kernel_lj_fast (\n" -" .param .u64 __cudaparm_kernel_lj_fast_x_,\n" -" .param .u64 __cudaparm_kernel_lj_fast_lj1_in,\n" -" .param .u64 __cudaparm_kernel_lj_fast_lj3_in,\n" -" .param .u64 __cudaparm_kernel_lj_fast_gum,\n" -" .param .s32 __cudaparm_kernel_lj_fast_stride,\n" -" .param .u64 __cudaparm_kernel_lj_fast_dev_ij,\n" -" .param .u64 __cudaparm_kernel_lj_fast_ans,\n" -" .param .u64 __cudaparm_kernel_lj_fast___val_paramengv,\n" -" .param .u64 __cudaparm_kernel_lj_fast_err_flag,\n" -" .param .s32 __cudaparm_kernel_lj_fast_eflag,\n" -" .param .s32 __cudaparm_kernel_lj_fast_vflag,\n" -" .param .s32 __cudaparm_kernel_lj_fast_start,\n" -" .param .s32 __cudaparm_kernel_lj_fast_inum,\n" -" .param .s32 __cudaparm_kernel_lj_fast_t_per_atom)\n" -" {\n" -" .reg .u32 %r<57>;\n" -" .reg .u64 %rd<72>;\n" -" .reg .f32 %f<122>;\n" -" .reg .pred %p<22>;\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_33475_33_non_const_sp_lj10212[16];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_33476_34_non_const_lj110240[1936];\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_33477_34_non_const_lj312176[1936];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_33547_55_non_const_red_acc14112[3072];\n" -" .loc 17 615 0\n" -"$LDWbegin_kernel_lj_fast:\n" -" cvt.s32.u32 %r1, %tid.x;\n" -" mov.u32 %r2, 3;\n" -" setp.gt.s32 %p1, %r1, %r2;\n" -" @%p1 bra $Lt_3_20994;\n" -" .loc 17 624 0\n" -" mov.u64 %rd1, __cuda___cuda_local_var_33475_33_non_const_sp_lj10212;\n" -" cvt.s64.s32 %rd2, %r1;\n" -" mul.wide.s32 %rd3, %r1, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_lj_fast_gum];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.f32 %f1, [%rd5+0];\n" -" add.u64 %rd6, %rd3, %rd1;\n" -" st.shared.f32 [%rd6+0], %f1;\n" -"$Lt_3_20994:\n" -" mov.u64 %rd1, __cuda___cuda_local_var_33475_33_non_const_sp_lj10212;\n" -" mov.u32 %r3, 120;\n" -" setp.gt.s32 %p2, %r1, %r3;\n" -" @%p2 bra $Lt_3_21506;\n" -" .loc 17 626 0\n" -" mov.u64 %rd7, __cuda___cuda_local_var_33476_34_non_const_lj110240;\n" -" cvt.s64.s32 %rd8, %r1;\n" -" mul.wide.s32 %rd9, %r1, 16;\n" -" ld.param.u64 %rd10, [__cudaparm_kernel_lj_fast_lj1_in];\n" -" add.u64 %rd11, %rd10, %rd9;\n" -" add.u64 %rd12, %rd9, %rd7;\n" -" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n" -" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n" -" ld.param.s32 %r4, [__cudaparm_kernel_lj_fast_eflag];\n" -" mov.u32 %r5, 0;\n" -" setp.le.s32 %p3, %r4, %r5;\n" -" @%p3 bra $Lt_3_22018;\n" -" .loc 17 628 0\n" -" mov.u64 %rd13, __cuda___cuda_local_var_33477_34_non_const_lj312176;\n" -" ld.param.u64 %rd14, [__cudaparm_kernel_lj_fast_lj3_in];\n" -" add.u64 %rd15, %rd14, %rd9;\n" -" add.u64 %rd16, %rd9, %rd13;\n" -" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n" -" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n" -"$Lt_3_22018:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_33477_34_non_const_lj312176;\n" -"$Lt_3_21506:\n" -" mov.u64 %rd13, __cuda___cuda_local_var_33477_34_non_const_lj312176;\n" -" mov.u64 %rd7, __cuda___cuda_local_var_33476_34_non_const_lj110240;\n" -" .loc 17 638 0\n" -" mov.f32 %f10, 0f00000000; \n" -" mov.f32 %f11, %f10;\n" -" mov.f32 %f12, 0f00000000; \n" -" mov.f32 %f13, %f12;\n" -" mov.f32 %f14, 0f00000000; \n" -" mov.f32 %f15, %f14;\n" -" mov.f32 %f16, 0f00000000; \n" -" mov.f32 %f17, %f16;\n" -" mov.f32 %f18, 0f00000000; \n" -" mov.f32 %f19, %f18;\n" -" mov.f32 %f20, 0f00000000; \n" -" mov.f32 %f21, %f20;\n" -" .loc 17 640 0\n" -" bar.sync 0;\n" -" ld.param.s32 %r6, [__cudaparm_kernel_lj_fast_t_per_atom];\n" -" div.s32 %r7, %r1, %r6;\n" -" cvt.s32.u32 %r8, %ntid.x;\n" -" div.s32 %r9, %r8, %r6;\n" -" cvt.s32.u32 %r10, %ctaid.x;\n" -" mul.lo.s32 %r11, %r10, %r9;\n" -" add.s32 %r12, %r7, %r11;\n" -" ld.param.s32 %r13, [__cudaparm_kernel_lj_fast_start];\n" -" add.s32 %r14, %r13, %r12;\n" -" ld.param.s32 %r15, [__cudaparm_kernel_lj_fast_inum];\n" -" setp.ge.s32 %p4, %r14, %r15;\n" -" @%p4 bra $Lt_3_29186;\n" -" .loc 17 645 0\n" -" cvt.s64.s32 %rd17, %r14;\n" -" mul.wide.s32 %rd18, %r14, 4;\n" -" ld.param.u64 %rd19, [__cudaparm_kernel_lj_fast_dev_ij];\n" -" add.u64 %rd20, %rd19, %rd18;\n" -" ld.global.s32 %r16, [%rd20+0];\n" -" ld.param.s32 %r17, [__cudaparm_kernel_lj_fast_stride];\n" -" cvt.s64.s32 %rd21, %r17;\n" -" mul.wide.s32 %rd22, %r17, 4;\n" -" add.u64 %rd23, %rd22, %rd20;\n" -" ld.global.s32 %r18, [%rd23+0];\n" -" .loc 17 648 0\n" -" ld.param.u64 %rd24, [__cudaparm_kernel_lj_fast_x_];\n" -" cvt.s64.s32 %rd25, %r16;\n" -" mul.wide.s32 %rd26, %r16, 16;\n" -" add.u64 %rd27, %rd24, %rd26;\n" -" ld.global.v4.f32 {%f22,%f23,%f24,%f25}, [%rd27+0];\n" -" .loc 17 650 0\n" -" cvt.s32.s64 %r19, %rd21;\n" -" sub.s32 %r20, %r6, 1;\n" -" and.b32 %r21, %r20, %r1;\n" -" add.u64 %rd28, %rd22, %rd23;\n" -" mul.lo.s32 %r22, %r19, %r21;\n" -" cvt.s64.s32 %rd29, %r22;\n" -" mul.wide.s32 %rd30, %r22, 4;\n" -" add.u64 %rd31, %rd28, %rd30;\n" -" mov.s64 %rd32, %rd31;\n" -" mul.lo.s32 %r23, %r19, %r18;\n" -" cvt.s64.s32 %rd33, %r23;\n" -" mul.wide.s32 %rd34, %r23, 4;\n" -" add.u64 %rd35, %rd28, %rd34;\n" -" setp.ge.u64 %p5, %rd31, %rd35;\n" -" @%p5 bra $Lt_3_30722;\n" -" cvt.rzi.ftz.s32.f32 %r24, %f25;\n" -" mul.lo.s32 %r25, %r24, 11;\n" -" cvt.rn.f32.s32 %f26, %r25;\n" -" mov.f32 %f27, 0f00000000; \n" -" mov.f32 %f28, 0f00000000; \n" -" mov.f32 %f29, 0f00000000; \n" -" mov.f32 %f30, 0f00000000; \n" -"$Lt_3_23554:\n" -" .loc 17 655 0\n" -" ld.global.s32 %r26, [%rd32+0];\n" -" .loc 17 656 0\n" -" shr.s32 %r27, %r26, 30;\n" -" and.b32 %r28, %r27, 3;\n" -" cvt.s64.s32 %rd36, %r28;\n" -" mul.wide.s32 %rd37, %r28, 4;\n" -" add.u64 %rd38, %rd1, %rd37;\n" -" ld.shared.f32 %f31, [%rd38+0];\n" -" .loc 17 659 0\n" -" and.b32 %r29, %r26, 1073741823;\n" -" cvt.s64.s32 %rd39, %r29;\n" -" mul.wide.s32 %rd40, %r29, 16;\n" -" add.u64 %rd41, %rd24, %rd40;\n" -" ld.global.v4.f32 {%f32,%f33,%f34,%f35}, [%rd41+0];\n" -" .loc 17 655 0\n" -" sub.ftz.f32 %f36, %f23, %f33;\n" -" sub.ftz.f32 %f37, %f22, %f32;\n" -" sub.ftz.f32 %f38, %f24, %f34;\n" -" mul.ftz.f32 %f39, %f36, %f36;\n" -" fma.rn.ftz.f32 %f40, %f37, %f37, %f39;\n" -" fma.rn.ftz.f32 %f41, %f38, %f38, %f40;\n" -" add.ftz.f32 %f42, %f26, %f35;\n" -" cvt.rzi.ftz.s32.f32 %r30, %f42;\n" -" cvt.s64.s32 %rd42, %r30;\n" -" mul.wide.s32 %rd43, %r30, 16;\n" -" add.u64 %rd44, %rd43, %rd7;\n" -" ld.shared.f32 %f43, [%rd44+8];\n" -" setp.gt.ftz.f32 %p6, %f43, %f41;\n" -" @!%p6 bra $Lt_3_30978;\n" -" ld.shared.f32 %f44, [%rd44+12];\n" -" mov.f32 %f45, 0f00000000; \n" -" setp.eq.ftz.f32 %p7, %f44, %f45;\n" -" @!%p7 bra $Lt_3_30978;\n" -" .loc 17 671 0\n" -" rcp.approx.ftz.f32 %f46, %f41;\n" -" mul.ftz.f32 %f47, %f46, %f46;\n" -" mul.ftz.f32 %f48, %f46, %f47;\n" -" mul.ftz.f32 %f49, %f46, %f31;\n" -" mul.ftz.f32 %f50, %f48, %f49;\n" -" ld.shared.v2.f32 {%f51,%f52}, [%rd44+0];\n" -" mul.ftz.f32 %f53, %f51, %f48;\n" -" sub.ftz.f32 %f54, %f53, %f52;\n" -" mul.ftz.f32 %f55, %f50, %f54;\n" -" .loc 17 673 0\n" -" fma.rn.ftz.f32 %f29, %f37, %f55, %f29;\n" -" .loc 17 674 0\n" -" fma.rn.ftz.f32 %f28, %f36, %f55, %f28;\n" -" .loc 17 675 0\n" -" fma.rn.ftz.f32 %f27, %f38, %f55, %f27;\n" -" ld.param.s32 %r31, [__cudaparm_kernel_lj_fast_eflag];\n" -" mov.u32 %r32, 0;\n" -" setp.le.s32 %p8, %r31, %r32;\n" -" @%p8 bra $Lt_3_23810;\n" -" .loc 17 678 0\n" -" add.u64 %rd45, %rd43, %rd13;\n" -" ld.shared.v4.f32 {%f56,%f57,%f58,_}, [%rd45+0];\n" -" mul.ftz.f32 %f59, %f56, %f48;\n" -" sub.ftz.f32 %f60, %f59, %f57;\n" -" mul.ftz.f32 %f61, %f48, %f60;\n" -" .loc 17 679 0\n" -" sub.ftz.f32 %f62, %f61, %f58;\n" -" fma.rn.ftz.f32 %f30, %f31, %f62, %f30;\n" -"$Lt_3_23810:\n" -" ld.param.s32 %r33, [__cudaparm_kernel_lj_fast_vflag];\n" -" mov.u32 %r34, 0;\n" -" setp.le.s32 %p9, %r33, %r34;\n" -" @%p9 bra $Lt_3_30978;\n" -" .loc 17 682 0\n" -" mov.f32 %f63, %f11;\n" -" mul.ftz.f32 %f64, %f37, %f37;\n" -" fma.rn.ftz.f32 %f65, %f55, %f64, %f63;\n" -" mov.f32 %f11, %f65;\n" -" .loc 17 683 0\n" -" mov.f32 %f66, %f13;\n" -" fma.rn.ftz.f32 %f67, %f55, %f39, %f66;\n" -" mov.f32 %f13, %f67;\n" -" .loc 17 684 0\n" -" mov.f32 %f68, %f15;\n" -" mul.ftz.f32 %f69, %f38, %f38;\n" -" fma.rn.ftz.f32 %f70, %f55, %f69, %f68;\n" -" mov.f32 %f15, %f70;\n" -" .loc 17 685 0\n" -" mov.f32 %f71, %f17;\n" -" mul.ftz.f32 %f72, %f36, %f37;\n" -" fma.rn.ftz.f32 %f73, %f55, %f72, %f71;\n" -" mov.f32 %f17, %f73;\n" -" .loc 17 686 0\n" -" mov.f32 %f74, %f19;\n" -" mul.ftz.f32 %f75, %f37, %f38;\n" -" fma.rn.ftz.f32 %f76, %f55, %f75, %f74;\n" -" mov.f32 %f19, %f76;\n" -" .loc 17 687 0\n" -" mul.ftz.f32 %f77, %f36, %f38;\n" -" fma.rn.ftz.f32 %f20, %f55, %f77, %f20;\n" -" mov.f32 %f21, %f20;\n" -"$Lt_3_30978:\n" -"$L_3_20482:\n" -" .loc 17 681 0\n" -" mul.lo.s32 %r35, %r19, %r6;\n" -" cvt.s64.s32 %rd46, %r35;\n" -" mul.wide.s32 %rd47, %r35, 4;\n" -" add.u64 %rd32, %rd32, %rd47;\n" -" setp.gt.u64 %p10, %rd35, %rd32;\n" -" @%p10 bra $Lt_3_23554;\n" -" bra.uni $Lt_3_23042;\n" -"$Lt_3_30722:\n" -" mov.f32 %f27, 0f00000000; \n" -" mov.f32 %f28, 0f00000000; \n" -" mov.f32 %f29, 0f00000000; \n" -" mov.f32 %f30, 0f00000000; \n" -"$Lt_3_23042:\n" -" mov.u32 %r36, 1;\n" -" setp.le.s32 %p11, %r6, %r36;\n" -" @%p11 bra $Lt_3_27138;\n" -" .loc 17 692 0\n" -" mov.u64 %rd48, __cuda___cuda_local_var_33547_55_non_const_red_acc14112;\n" -" cvt.s64.s32 %rd49, %r1;\n" -" mul.wide.s32 %rd50, %r1, 4;\n" -" add.u64 %rd51, %rd48, %rd50;\n" -" mov.f32 %f78, %f29;\n" -" st.shared.f32 [%rd51+0], %f78;\n" -" mov.f32 %f79, %f28;\n" -" st.shared.f32 [%rd51+512], %f79;\n" -" mov.f32 %f80, %f27;\n" -" st.shared.f32 [%rd51+1024], %f80;\n" -" mov.f32 %f81, %f30;\n" -" st.shared.f32 [%rd51+1536], %f81;\n" -" shr.s32 %r37, %r6, 31;\n" -" mov.s32 %r38, 1;\n" -" and.b32 %r39, %r37, %r38;\n" -" add.s32 %r40, %r39, %r6;\n" -" shr.s32 %r41, %r40, 1;\n" -" mov.s32 %r42, %r41;\n" -" mov.u32 %r43, 0;\n" -" setp.ne.u32 %p12, %r41, %r43;\n" -" @!%p12 bra $Lt_3_25602;\n" -"$Lt_3_26114:\n" -" setp.ge.u32 %p13, %r21, %r42;\n" -" @%p13 bra $Lt_3_26370;\n" -" add.u32 %r44, %r1, %r42;\n" -" cvt.u64.u32 %rd52, %r44;\n" -" mul.wide.u32 %rd53, %r44, 4;\n" -" add.u64 %rd54, %rd48, %rd53;\n" -" ld.shared.f32 %f82, [%rd54+0];\n" -" add.ftz.f32 %f78, %f82, %f78;\n" -" st.shared.f32 [%rd51+0], %f78;\n" -" ld.shared.f32 %f83, [%rd54+512];\n" -" add.ftz.f32 %f79, %f83, %f79;\n" -" st.shared.f32 [%rd51+512], %f79;\n" -" ld.shared.f32 %f84, [%rd54+1024];\n" -" add.ftz.f32 %f80, %f84, %f80;\n" -" st.shared.f32 [%rd51+1024], %f80;\n" -" ld.shared.f32 %f85, [%rd54+1536];\n" -" add.ftz.f32 %f81, %f85, %f81;\n" -" st.shared.f32 [%rd51+1536], %f81;\n" -"$Lt_3_26370:\n" -" shr.u32 %r42, %r42, 1;\n" -" mov.u32 %r45, 0;\n" -" setp.ne.u32 %p14, %r42, %r45;\n" -" @%p14 bra $Lt_3_26114;\n" -"$Lt_3_25602:\n" -" mov.f32 %f29, %f78;\n" -" mov.f32 %f28, %f79;\n" -" mov.f32 %f27, %f80;\n" -" mov.f32 %f30, %f81;\n" -" ld.param.s32 %r46, [__cudaparm_kernel_lj_fast_vflag];\n" -" mov.u32 %r47, 0;\n" -" setp.le.s32 %p15, %r46, %r47;\n" -" @%p15 bra $Lt_3_27138;\n" -" mov.f32 %f78, %f11;\n" -" st.shared.f32 [%rd51+0], %f78;\n" -" mov.f32 %f79, %f13;\n" -" st.shared.f32 [%rd51+512], %f79;\n" -" mov.f32 %f80, %f15;\n" -" st.shared.f32 [%rd51+1024], %f80;\n" -" mov.f32 %f81, %f17;\n" -" st.shared.f32 [%rd51+1536], %f81;\n" -" mov.f32 %f86, %f19;\n" -" st.shared.f32 [%rd51+2048], %f86;\n" -" mov.f32 %f87, %f20;\n" -" st.shared.f32 [%rd51+2560], %f87;\n" -" mov.s32 %r48, %r41;\n" -" @!%p12 bra $Lt_3_27650;\n" -"$Lt_3_28162:\n" -" setp.ge.u32 %p16, %r21, %r48;\n" -" @%p16 bra $Lt_3_28418;\n" -" add.u32 %r49, %r1, %r48;\n" -" cvt.u64.u32 %rd55, %r49;\n" -" mul.wide.u32 %rd56, %r49, 4;\n" -" add.u64 %rd57, %rd48, %rd56;\n" -" ld.shared.f32 %f88, [%rd57+0];\n" -" add.ftz.f32 %f78, %f88, %f78;\n" -" st.shared.f32 [%rd51+0], %f78;\n" -" ld.shared.f32 %f89, [%rd57+512];\n" -" add.ftz.f32 %f79, %f89, %f79;\n" -" st.shared.f32 [%rd51+512], %f79;\n" -" ld.shared.f32 %f90, [%rd57+1024];\n" -" add.ftz.f32 %f80, %f90, %f80;\n" -" st.shared.f32 [%rd51+1024], %f80;\n" -" ld.shared.f32 %f91, [%rd57+1536];\n" -" add.ftz.f32 %f81, %f91, %f81;\n" -" st.shared.f32 [%rd51+1536], %f81;\n" -" ld.shared.f32 %f92, [%rd57+2048];\n" -" add.ftz.f32 %f86, %f92, %f86;\n" -" st.shared.f32 [%rd51+2048], %f86;\n" -" ld.shared.f32 %f93, [%rd57+2560];\n" -" add.ftz.f32 %f87, %f93, %f87;\n" -" st.shared.f32 [%rd51+2560], %f87;\n" -"$Lt_3_28418:\n" -" shr.u32 %r48, %r48, 1;\n" -" mov.u32 %r50, 0;\n" -" setp.ne.u32 %p17, %r48, %r50;\n" -" @%p17 bra $Lt_3_28162;\n" -"$Lt_3_27650:\n" -" mov.f32 %f11, %f78;\n" -" mov.f32 %f13, %f79;\n" -" mov.f32 %f15, %f80;\n" -" mov.f32 %f17, %f81;\n" -" mov.f32 %f19, %f86;\n" -" mov.f32 %f21, %f87;\n" -"$Lt_3_27138:\n" -"$Lt_3_25090:\n" -" mov.u32 %r51, 0;\n" -" setp.ne.s32 %p18, %r21, %r51;\n" -" @%p18 bra $Lt_3_29186;\n" -" ld.param.u64 %rd58, [__cudaparm_kernel_lj_fast___val_paramengv];\n" -" add.u64 %rd59, %rd58, %rd18;\n" -" ld.param.s32 %r52, [__cudaparm_kernel_lj_fast_eflag];\n" -" mov.u32 %r53, 0;\n" -" setp.le.s32 %p19, %r52, %r53;\n" -" @%p19 bra $Lt_3_29698;\n" -" ld.global.f32 %f94, [%rd59+0];\n" -" add.ftz.f32 %f95, %f94, %f30;\n" -" st.global.f32 [%rd59+0], %f95;\n" -" cvt.s64.s32 %rd60, %r15;\n" -" mul.wide.s32 %rd61, %r15, 4;\n" -" add.u64 %rd59, %rd59, %rd61;\n" -"$Lt_3_29698:\n" -" ld.param.s32 %r54, [__cudaparm_kernel_lj_fast_vflag];\n" -" mov.u32 %r55, 0;\n" -" setp.le.s32 %p20, %r54, %r55;\n" -" @%p20 bra $Lt_3_30210;\n" -" ld.global.f32 %f96, [%rd59+0];\n" -" mov.f32 %f97, %f11;\n" -" add.ftz.f32 %f98, %f96, %f97;\n" -" st.global.f32 [%rd59+0], %f98;\n" -" cvt.s64.s32 %rd62, %r15;\n" -" mul.wide.s32 %rd63, %r15, 4;\n" -" add.u64 %rd64, %rd63, %rd59;\n" -" ld.global.f32 %f99, [%rd64+0];\n" -" mov.f32 %f100, %f13;\n" -" add.ftz.f32 %f101, %f99, %f100;\n" -" st.global.f32 [%rd64+0], %f101;\n" -" add.u64 %rd65, %rd63, %rd64;\n" -" ld.global.f32 %f102, [%rd65+0];\n" -" mov.f32 %f103, %f15;\n" -" add.ftz.f32 %f104, %f102, %f103;\n" -" st.global.f32 [%rd65+0], %f104;\n" -" add.u64 %rd66, %rd63, %rd65;\n" -" ld.global.f32 %f105, [%rd66+0];\n" -" mov.f32 %f106, %f17;\n" -" add.ftz.f32 %f107, %f105, %f106;\n" -" st.global.f32 [%rd66+0], %f107;\n" -" add.u64 %rd67, %rd63, %rd66;\n" -" ld.global.f32 %f108, [%rd67+0];\n" -" mov.f32 %f109, %f19;\n" -" add.ftz.f32 %f110, %f108, %f109;\n" -" st.global.f32 [%rd67+0], %f110;\n" -" add.u64 %rd59, %rd63, %rd67;\n" -" ld.global.f32 %f111, [%rd59+0];\n" -" mov.f32 %f112, %f21;\n" -" add.ftz.f32 %f113, %f111, %f112;\n" -" st.global.f32 [%rd59+0], %f113;\n" -"$Lt_3_30210:\n" -" ld.param.u64 %rd68, [__cudaparm_kernel_lj_fast_ans];\n" -" mul.lo.u64 %rd69, %rd17, 16;\n" -" add.u64 %rd70, %rd68, %rd69;\n" -" ld.global.v4.f32 {%f114,%f115,%f116,%f117}, [%rd70+0];\n" -" add.ftz.f32 %f118, %f115, %f28;\n" -" add.ftz.f32 %f119, %f116, %f27;\n" -" add.ftz.f32 %f120, %f114, %f29;\n" -" st.global.v4.f32 [%rd70+0], {%f120,%f118,%f119,%f117};\n" -"$Lt_3_29186:\n" -"$Lt_3_22530:\n" -" .loc 17 695 0\n" -" exit;\n" -"$LDWend_kernel_lj_fast:\n" -" }\n" -; diff --git a/lib/gpu/re_squared_ptx.h b/lib/gpu/re_squared_ptx.h deleted file mode 100644 index 9343fc5cae..0000000000 --- a/lib/gpu/re_squared_ptx.h +++ /dev/null @@ -1,2306 +0,0 @@ -const char * re_squared = -" .version 2.3\n" -" .target sm_20\n" -" .address_size 64\n" -" .entry kernel_ellipsoid (\n" -" .param .u64 __cudaparm_kernel_ellipsoid_x_,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_q,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_shape,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_well,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_splj,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_sig_eps,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_ntypes,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_dev_nbor,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_stride,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_ans,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_astride,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_engv,\n" -" .param .u64 __cudaparm_kernel_ellipsoid_err_flag,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_eflag,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_vflag,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_inum,\n" -" .param .s32 __cudaparm_kernel_ellipsoid_t_per_atom)\n" -" {\n" -" .reg .u32 %r<65>;\n" -" .reg .u64 %rd<78>;\n" -" .reg .f32 %f<1598>;\n" -" .reg .pred %p<34>;\n" -" .shared .align 16 .b8 __cuda___cuda_local_var_32902_33_non_const_sp_lj120[16];\n" -" .shared .align 4 .b8 __cuda___cuda_local_var_33303_55_non_const_red_acc136[3584];\n" -" .shared .f32 __cuda_local_var_32908_33_non_const_b_alpha;\n" -" .shared .f32 __cuda_local_var_32908_42_non_const_cr60;\n" -" .loc 17 43 0\n" -"$LDWbegin_kernel_ellipsoid:\n" -" .loc 17 48 0\n" -" ld.param.u64 %rd1, [__cudaparm_kernel_ellipsoid_splj];\n" -" ldu.global.f32 %f1, [%rd1+0];\n" -" .loc 17 49 0\n" -" ld.global.f32 %f2, [%rd1+4];\n" -" .loc 17 50 0\n" -" ld.global.f32 %f3, [%rd1+8];\n" -" .loc 17 51 0\n" -" ld.global.f32 %f4, [%rd1+12];\n" -" st.shared.v4.f32 [__cuda___cuda_local_var_32902_33_non_const_sp_lj120+0], {%f1,%f2,%f3,%f4};\n" -" .loc 17 54 0\n" -" mov.f32 %f5, 0f3f4db6db; \n" -" st.shared.f32 [__cuda_local_var_32908_33_non_const_b_alpha], %f5;\n" -" .loc 17 55 0\n" -" mov.f32 %f6, 0f42700000; \n" -" lg2.approx.ftz.f32 %f7, %f6;\n" -" mov.f32 %f8, 0f3eaaaaab; \n" -" mul.ftz.f32 %f9, %f7, %f8;\n" -" ex2.approx.ftz.f32 %f10, %f9;\n" -" mov.f32 %f11, 0f42700000; \n" -" mul.ftz.f32 %f12, %f10, %f10;\n" -" div.approx.ftz.f32 %f13, %f11, %f12;\n" -" sub.ftz.f32 %f14, %f10, %f13;\n" -" mov.f32 %f15, 0f3eaaaaab; \n" -" mul.ftz.f32 %f16, %f14, %f15;\n" -" sub.ftz.f32 %f17, %f10, %f16;\n" -" st.shared.f32 [__cuda_local_var_32908_42_non_const_cr60], %f17;\n" -" .loc 17 68 0\n" -" mov.f32 %f18, 0f00000000; \n" -" mov.f32 %f19, %f18;\n" -" mov.f32 %f20, 0f00000000; \n" -" mov.f32 %f21, %f20;\n" -" mov.f32 %f22, 0f00000000; \n" -" mov.f32 %f23, %f22;\n" -" mov.f32 %f24, 0f00000000; \n" -" mov.f32 %f25, %f24;\n" -" mov.f32 %f26, 0f00000000; \n" -" mov.f32 %f27, %f26;\n" -" mov.f32 %f28, 0f00000000; \n" -" mov.f32 %f29, %f28;\n" -" ld.param.s32 %r1, [__cudaparm_kernel_ellipsoid_t_per_atom];\n" -" cvt.s32.u32 %r2, %tid.x;\n" -" div.s32 %r3, %r2, %r1;\n" -" cvt.s32.u32 %r4, %ntid.x;\n" -" div.s32 %r5, %r4, %r1;\n" -" cvt.s32.u32 %r6, %ctaid.x;\n" -" mul.lo.s32 %r7, %r6, %r5;\n" -" add.s32 %r8, %r3, %r7;\n" -" ld.param.s32 %r9, [__cudaparm_kernel_ellipsoid_inum];\n" -" setp.le.s32 %p1, %r9, %r8;\n" -" @%p1 bra $Lt_0_67842;\n" -" .loc 17 73 0\n" -" cvt.s64.s32 %rd2, %r8;\n" -" mul.wide.s32 %rd3, %r8, 4;\n" -" ld.param.u64 %rd4, [__cudaparm_kernel_ellipsoid_dev_nbor];\n" -" add.u64 %rd5, %rd4, %rd3;\n" -" ld.global.s32 %r10, [%rd5+0];\n" -" ld.param.s32 %r11, [__cudaparm_kernel_ellipsoid_stride];\n" -" cvt.s64.s32 %rd6, %r11;\n" -" mul.wide.s32 %rd7, %r11, 4;\n" -" add.u64 %rd8, %rd7, %rd5;\n" -" ld.global.s32 %r12, [%rd8+0];\n" -" .loc 17 76 0\n" -" cvt.s64.s32 %rd9, %r10;\n" -" mul.wide.s32 %rd10, %r10, 16;\n" -" ld.param.u64 %rd11, [__cudaparm_kernel_ellipsoid_x_];\n" -" add.u64 %rd12, %rd10, %rd11;\n" -" ld.global.v4.f32 {%f30,%f31,%f32,%f33}, [%rd12+0];\n" -" .loc 17 88 0\n" -" cvt.rzi.ftz.s32.f32 %r13, %f33;\n" -" cvt.s64.s32 %rd13, %r13;\n" -" mul.wide.s32 %rd14, %r13, 16;\n" -" ld.param.u64 %rd15, [__cudaparm_kernel_ellipsoid_shape];\n" -" add.u64 %rd16, %rd14, %rd15;\n" -" ld.global.v4.f32 {%f34,%f35,%f36,_}, [%rd16+0];\n" -" .loc 17 97 0\n" -" ld.param.u64 %rd17, [__cudaparm_kernel_ellipsoid_q];\n" -" add.u64 %rd18, %rd10, %rd17;\n" -" ld.global.v4.f32 {%f37,%f38,%f39,%f40}, [%rd18+0];\n" -" .loc 17 98 0\n" -" ld.param.u64 %rd19, [__cudaparm_kernel_ellipsoid_well];\n" -" add.u64 %rd20, %rd14, %rd19;\n" -" ld.global.v4.f32 {%f41,%f42,%f43,_}, [%rd20+0];\n" -" .loc 17 117 0\n" -" cvt.s32.s64 %r14, %rd6;\n" -" sub.s32 %r15, %r1, 1;\n" -" and.b32 %r16, %r15, %r2;\n" -" add.u64 %rd21, %rd7, %rd8;\n" -" mul.lo.s32 %r17, %r14, %r16;\n" -" cvt.s64.s32 %rd22, %r17;\n" -" mul.wide.s32 %rd23, %r17, 4;\n" -" add.u64 %rd24, %rd21, %rd23;\n" -" mov.s64 %rd25, %rd24;\n" -" mul.lo.s32 %r18, %r14, %r12;\n" -" cvt.s64.s32 %rd26, %r18;\n" -" mul.wide.s32 %rd27, %r18, 4;\n" -" add.u64 %rd28, %rd21, %rd27;\n" -" setp.ge.u64 %p2, %rd24, %rd28;\n" -" @%p2 bra $Lt_0_69634;\n" -" ld.param.s32 %r19, [__cudaparm_kernel_ellipsoid_vflag];\n" -" mov.s32 %r20, 0;\n" -" setp.gt.s32 %p3, %r19, %r20;\n" -" mul.ftz.f32 %f44, %f35, %f35;\n" -" add.ftz.f32 %f45, %f38, %f38;\n" -" add.ftz.f32 %f46, %f40, %f40;\n" -" mul.ftz.f32 %f47, %f37, %f37;\n" -" mul.ftz.f32 %f48, %f38, %f38;\n" -" mul.ftz.f32 %f49, %f39, %f39;\n" -" mul.ftz.f32 %f50, %f40, %f40;\n" -" mul.ftz.f32 %f51, %f34, %f34;\n" -" add.ftz.f32 %f52, %f39, %f39;\n" -" mul.ftz.f32 %f53, %f36, %f36;\n" -" mul.ftz.f32 %f54, %f34, %f35;\n" -" add.ftz.f32 %f55, %f34, %f34;\n" -" add.ftz.f32 %f56, %f35, %f35;\n" -" add.ftz.f32 %f57, %f36, %f36;\n" -" ld.param.s32 %r21, [__cudaparm_kernel_ellipsoid_ntypes];\n" -" mul.lo.s32 %r22, %r21, %r13;\n" -" rcp.approx.ftz.f32 %f58, %f44;\n" -" mul.ftz.f32 %f59, %f45, %f39;\n" -" mul.ftz.f32 %f60, %f45, %f40;\n" -" mul.ftz.f32 %f61, %f45, %f37;\n" -" mul.ftz.f32 %f62, %f46, %f37;\n" -" add.ftz.f32 %f63, %f47, %f48;\n" -" sub.ftz.f32 %f64, %f47, %f48;\n" -" rcp.approx.ftz.f32 %f65, %f51;\n" -" mul.ftz.f32 %f66, %f52, %f37;\n" -" mul.ftz.f32 %f67, %f52, %f40;\n" -" rcp.approx.ftz.f32 %f68, %f53;\n" -" mul.ftz.f32 %f69, %f54, %f36;\n" -" sub.ftz.f32 %f70, %f59, %f62;\n" -" add.ftz.f32 %f71, %f59, %f62;\n" -" sub.ftz.f32 %f72, %f62, %f59;\n" -" sub.ftz.f32 %f73, %f63, %f49;\n" -" add.ftz.f32 %f74, %f49, %f64;\n" -" sub.ftz.f32 %f75, %f64, %f49;\n" -" add.ftz.f32 %f76, %f60, %f66;\n" -" sub.ftz.f32 %f77, %f60, %f66;\n" -" sub.ftz.f32 %f78, %f66, %f60;\n" -" sub.ftz.f32 %f79, %f67, %f61;\n" -" add.ftz.f32 %f80, %f61, %f67;\n" -" sub.ftz.f32 %f81, %f61, %f67;\n" -" mul.ftz.f32 %f82, %f44, %f70;\n" -" mul.ftz.f32 %f83, %f70, %f42;\n" -" mul.ftz.f32 %f84, %f51, %f71;\n" -" mul.ftz.f32 %f85, %f71, %f41;\n" -" neg.ftz.f32 %f86, %f71;\n" -" sub.ftz.f32 %f87, %f73, %f50;\n" -" sub.ftz.f32 %f88, %f50, %f73;\n" -" sub.ftz.f32 %f89, %f74, %f50;\n" -" sub.ftz.f32 %f90, %f50, %f74;\n" -" add.ftz.f32 %f91, %f50, %f75;\n" -" mul.ftz.f32 %f92, %f53, %f76;\n" -" mul.ftz.f32 %f93, %f76, %f43;\n" -" neg.ftz.f32 %f94, %f76;\n" -" mul.ftz.f32 %f95, %f51, %f77;\n" -" mul.ftz.f32 %f96, %f77, %f41;\n" -" mul.ftz.f32 %f97, %f53, %f79;\n" -" mul.ftz.f32 %f98, %f79, %f43;\n" -" mul.ftz.f32 %f99, %f44, %f80;\n" -" mul.ftz.f32 %f100, %f80, %f42;\n" -" mul.ftz.f32 %f101, %f70, %f82;\n" -" mul.ftz.f32 %f102, %f80, %f82;\n" -" mul.ftz.f32 %f103, %f72, %f82;\n" -" mul.ftz.f32 %f104, %f70, %f83;\n" -" mul.ftz.f32 %f105, %f80, %f83;\n" -" mov.f32 %f106, 0f00000000; \n" -" mov.f32 %f107, 0f00000000; \n" -" fma.rn.ftz.f32 %f108, %f107, %f84, %f106;\n" -" mov.f32 %f109, 0f00000000; \n" -" mov.f32 %f110, 0f00000000; \n" -" fma.rn.ftz.f32 %f111, %f84, %f110, %f109;\n" -" mul.ftz.f32 %f112, %f51, %f87;\n" -" mul.ftz.f32 %f113, %f87, %f41;\n" -" mul.ftz.f32 %f114, %f82, %f89;\n" -" mul.ftz.f32 %f115, %f44, %f89;\n" -" mul.ftz.f32 %f116, %f83, %f89;\n" -" mul.ftz.f32 %f117, %f89, %f42;\n" -" mul.ftz.f32 %f118, %f82, %f90;\n" -" mul.ftz.f32 %f119, %f53, %f91;\n" -" mul.ftz.f32 %f120, %f91, %f43;\n" -" neg.ftz.f32 %f121, %f91;\n" -" mov.f32 %f122, 0f00000000; \n" -" mov.f32 %f123, 0f00000000; \n" -" fma.rn.ftz.f32 %f124, %f123, %f95, %f122;\n" -" mov.f32 %f125, 0f00000000; \n" -" mov.f32 %f126, 0f00000000; \n" -" fma.rn.ftz.f32 %f127, %f95, %f126, %f125;\n" -" mul.ftz.f32 %f128, %f70, %f99;\n" -" mul.ftz.f32 %f129, %f89, %f99;\n" -" mul.ftz.f32 %f130, %f80, %f99;\n" -" mul.ftz.f32 %f131, %f72, %f99;\n" -" mul.ftz.f32 %f132, %f90, %f99;\n" -" mul.ftz.f32 %f133, %f70, %f100;\n" -" mul.ftz.f32 %f134, %f89, %f100;\n" -" mul.ftz.f32 %f135, %f80, %f100;\n" -" neg.ftz.f32 %f136, %f102;\n" -" mov.f32 %f137, 0f00000000; \n" -" fma.rn.ftz.f32 %f138, %f137, %f97, %f108;\n" -" mov.f32 %f139, 0f00000000; \n" -" fma.rn.ftz.f32 %f140, %f97, %f139, %f108;\n" -" mov.f32 %f141, 0f00000000; \n" -" fma.rn.ftz.f32 %f142, %f97, %f141, %f111;\n" -" fma.rn.ftz.f32 %f143, %f87, %f112, %f101;\n" -" fma.rn.ftz.f32 %f144, %f112, %f77, %f102;\n" -" mov.f32 %f145, 0f00000000; \n" -" mov.f32 %f146, 0f00000000; \n" -" fma.rn.ftz.f32 %f147, %f146, %f112, %f145;\n" -" mov.f32 %f148, 0f00000000; \n" -" mov.f32 %f149, 0f00000000; \n" -" fma.rn.ftz.f32 %f150, %f112, %f149, %f148;\n" -" fma.rn.ftz.f32 %f151, %f77, %f112, %f102;\n" -" fma.rn.ftz.f32 %f152, %f112, %f88, %f103;\n" -" fma.rn.ftz.f32 %f153, %f112, %f87, %f101;\n" -" fma.rn.ftz.f32 %f154, %f87, %f113, %f104;\n" -" fma.rn.ftz.f32 %f155, %f113, %f77, %f105;\n" -" fma.rn.ftz.f32 %f156, %f112, %f71, %f114;\n" -" mul.ftz.f32 %f157, %f70, %f115;\n" -" mul.ftz.f32 %f158, %f89, %f115;\n" -" mul.ftz.f32 %f159, %f80, %f115;\n" -" mul.ftz.f32 %f160, %f72, %f115;\n" -" mul.ftz.f32 %f161, %f90, %f115;\n" -" fma.rn.ftz.f32 %f162, %f113, %f71, %f116;\n" -" mul.ftz.f32 %f163, %f70, %f117;\n" -" mul.ftz.f32 %f164, %f89, %f117;\n" -" mul.ftz.f32 %f165, %f80, %f117;\n" -" fma.rn.ftz.f32 %f166, %f112, %f86, %f118;\n" -" fma.rn.ftz.f32 %f167, %f86, %f112, %f118;\n" -" mov.f32 %f168, 0f00000000; \n" -" fma.rn.ftz.f32 %f169, %f168, %f119, %f124;\n" -" mov.f32 %f170, 0f00000000; \n" -" fma.rn.ftz.f32 %f171, %f119, %f170, %f127;\n" -" fma.rn.ftz.f32 %f172, %f87, %f95, %f128;\n" -" fma.rn.ftz.f32 %f173, %f71, %f95, %f129;\n" -" fma.rn.ftz.f32 %f174, %f95, %f71, %f129;\n" -" fma.rn.ftz.f32 %f175, %f77, %f95, %f130;\n" -" neg.ftz.f32 %f176, %f130;\n" -" fma.rn.ftz.f32 %f177, %f95, %f88, %f131;\n" -" fma.rn.ftz.f32 %f178, %f88, %f95, %f131;\n" -" fma.rn.ftz.f32 %f179, %f86, %f95, %f132;\n" -" fma.rn.ftz.f32 %f180, %f87, %f96, %f133;\n" -" fma.rn.ftz.f32 %f181, %f71, %f96, %f134;\n" -" fma.rn.ftz.f32 %f182, %f77, %f96, %f135;\n" -" fma.rn.ftz.f32 %f183, %f112, %f78, %f136;\n" -" add.ftz.f32 %f184, %f140, %f142;\n" -" fma.rn.ftz.f32 %f185, %f92, %f76, %f143;\n" -" fma.rn.ftz.f32 %f186, %f92, %f91, %f144;\n" -" mov.f32 %f187, 0f00000000; \n" -" fma.rn.ftz.f32 %f188, %f92, %f187, %f147;\n" -" mov.f32 %f189, 0f00000000; \n" -" fma.rn.ftz.f32 %f190, %f92, %f189, %f150;\n" -" fma.rn.ftz.f32 %f191, %f92, %f91, %f151;\n" -" fma.rn.ftz.f32 %f192, %f92, %f94, %f152;\n" -" fma.rn.ftz.f32 %f193, %f92, %f76, %f153;\n" -" fma.rn.ftz.f32 %f194, %f93, %f76, %f154;\n" -" fma.rn.ftz.f32 %f195, %f93, %f91, %f155;\n" -" fma.rn.ftz.f32 %f196, %f92, %f79, %f156;\n" -" fma.rn.ftz.f32 %f197, %f87, %f84, %f157;\n" -" fma.rn.ftz.f32 %f198, %f84, %f87, %f157;\n" -" fma.rn.ftz.f32 %f199, %f71, %f84, %f158;\n" -" fma.rn.ftz.f32 %f200, %f84, %f71, %f158;\n" -" fma.rn.ftz.f32 %f201, %f77, %f84, %f159;\n" -" neg.ftz.f32 %f202, %f159;\n" -" fma.rn.ftz.f32 %f203, %f88, %f84, %f160;\n" -" fma.rn.ftz.f32 %f204, %f84, %f88, %f160;\n" -" fma.rn.ftz.f32 %f205, %f86, %f84, %f161;\n" -" fma.rn.ftz.f32 %f206, %f93, %f79, %f162;\n" -" fma.rn.ftz.f32 %f207, %f87, %f85, %f163;\n" -" fma.rn.ftz.f32 %f208, %f71, %f85, %f164;\n" -" fma.rn.ftz.f32 %f209, %f77, %f85, %f165;\n" -" fma.rn.ftz.f32 %f210, %f92, %f81, %f166;\n" -" fma.rn.ftz.f32 %f211, %f92, %f81, %f167;\n" -" add.ftz.f32 %f212, %f169, %f171;\n" -" fma.rn.ftz.f32 %f213, %f76, %f119, %f172;\n" -" fma.rn.ftz.f32 %f214, %f79, %f119, %f173;\n" -" fma.rn.ftz.f32 %f215, %f119, %f79, %f174;\n" -" fma.rn.ftz.f32 %f216, %f91, %f119, %f175;\n" -" fma.rn.ftz.f32 %f217, %f78, %f95, %f176;\n" -" fma.rn.ftz.f32 %f218, %f119, %f94, %f177;\n" -" fma.rn.ftz.f32 %f219, %f94, %f119, %f178;\n" -" fma.rn.ftz.f32 %f220, %f81, %f119, %f179;\n" -" fma.rn.ftz.f32 %f221, %f76, %f120, %f180;\n" -" fma.rn.ftz.f32 %f222, %f79, %f120, %f181;\n" -" fma.rn.ftz.f32 %f223, %f91, %f120, %f182;\n" -" fma.rn.ftz.f32 %f224, %f92, %f121, %f183;\n" -" add.ftz.f32 %f225, %f188, %f190;\n" -" add.ftz.f32 %f226, %f186, %f191;\n" -" add.ftz.f32 %f227, %f169, %f196;\n" -" fma.rn.ftz.f32 %f228, %f97, %f76, %f197;\n" -" fma.rn.ftz.f32 %f229, %f97, %f76, %f198;\n" -" fma.rn.ftz.f32 %f230, %f97, %f79, %f199;\n" -" fma.rn.ftz.f32 %f231, %f97, %f79, %f200;\n" -" fma.rn.ftz.f32 %f232, %f97, %f91, %f201;\n" -" fma.rn.ftz.f32 %f233, %f91, %f97, %f201;\n" -" fma.rn.ftz.f32 %f234, %f84, %f78, %f202;\n" -" fma.rn.ftz.f32 %f235, %f78, %f84, %f202;\n" -" fma.rn.ftz.f32 %f236, %f97, %f94, %f203;\n" -" fma.rn.ftz.f32 %f237, %f97, %f94, %f204;\n" -" fma.rn.ftz.f32 %f238, %f81, %f97, %f205;\n" -" fma.rn.ftz.f32 %f239, %f97, %f81, %f205;\n" -" fma.rn.ftz.f32 %f240, %f98, %f76, %f207;\n" -" fma.rn.ftz.f32 %f241, %f98, %f79, %f208;\n" -" fma.rn.ftz.f32 %f242, %f98, %f91, %f209;\n" -" add.ftz.f32 %f243, %f210, %f211;\n" -" add.ftz.f32 %f244, %f140, %f213;\n" -" add.ftz.f32 %f245, %f142, %f213;\n" -" add.ftz.f32 %f246, %f214, %f215;\n" -" add.ftz.f32 %f247, %f192, %f216;\n" -" fma.rn.ftz.f32 %f248, %f121, %f119, %f217;\n" -" add.ftz.f32 %f249, %f218, %f219;\n" -" add.ftz.f32 %f250, %f190, %f220;\n" -" add.ftz.f32 %f251, %f138, %f224;\n" -" add.ftz.f32 %f252, %f140, %f224;\n" -" add.ftz.f32 %f253, %f228, %f229;\n" -" add.ftz.f32 %f254, %f190, %f232;\n" -" add.ftz.f32 %f255, %f190, %f233;\n" -" fma.rn.ftz.f32 %f256, %f97, %f121, %f234;\n" -" fma.rn.ftz.f32 %f257, %f97, %f121, %f235;\n" -" add.ftz.f32 %f258, %f169, %f236;\n" -" add.ftz.f32 %f259, %f169, %f237;\n" -" add.ftz.f32 %f260, %f193, %f238;\n" -" add.ftz.f32 %f261, %f193, %f239;\n" -" add.ftz.f32 %f262, %f230, %f248;\n" -" add.ftz.f32 %f263, %f231, %f248;\n" -" add.ftz.f32 %f264, %f256, %f257;\n" -" ld.param.u64 %rd29, [__cudaparm_kernel_ellipsoid_sig_eps];\n" -" mov.f32 %f265, 0f00000000; \n" -" mov.f32 %f266, 0f00000000; \n" -" mov.f32 %f267, 0f00000000; \n" -" mov.f32 %f268, 0f00000000; \n" -" mov.f32 %f269, 0f00000000; \n" -" mov.f32 %f270, 0f00000000; \n" -" mov.f32 %f271, 0f00000000; \n" -" mov.u64 %rd30, __cuda___cuda_local_var_32902_33_non_const_sp_lj120;\n" -"$Lt_0_46338:\n" -" .loc 17 121 0\n" -" ld.global.s32 %r23, [%rd25+0];\n" -" .loc 17 125 0\n" -" and.b32 %r24, %r23, 1073741823;\n" -" cvt.s64.s32 %rd31, %r24;\n" -" mul.wide.s32 %rd32, %r24, 16;\n" -" add.u64 %rd33, %rd32, %rd11;\n" -" ld.global.v4.f32 {%f272,%f273,%f274,%f275}, [%rd33+0];\n" -" .loc 17 136 0\n" -" sub.ftz.f32 %f276, %f273, %f31;\n" -" sub.ftz.f32 %f277, %f272, %f30;\n" -" sub.ftz.f32 %f278, %f274, %f32;\n" -" mul.ftz.f32 %f279, %f276, %f276;\n" -" fma.rn.ftz.f32 %f280, %f277, %f277, %f279;\n" -" fma.rn.ftz.f32 %f281, %f278, %f278, %f280;\n" -" rsqrt.approx.ftz.f32 %f282, %f281;\n" -" mul.ftz.f32 %f283, %f277, %f282;\n" -" .loc 17 137 0\n" -" mul.ftz.f32 %f284, %f276, %f282;\n" -" .loc 17 145 0\n" -" cvt.rzi.ftz.s32.f32 %r25, %f275;\n" -" cvt.s64.s32 %rd34, %r25;\n" -" mul.wide.s32 %rd35, %r25, 16;\n" -" add.u64 %rd36, %rd35, %rd15;\n" -" ld.global.v4.f32 {%f285,%f286,%f287,_}, [%rd36+0];\n" -" .loc 17 152 0\n" -" add.u64 %rd37, %rd32, %rd17;\n" -" ld.global.v4.f32 {%f288,%f289,%f290,%f291}, [%rd37+0];\n" -" .loc 16 299 0\n" -" mov.f32 %f292, %f283;\n" -" .loc 16 300 0\n" -" mul.ftz.f32 %f293, %f286, %f286;\n" -" add.ftz.f32 %f294, %f289, %f289;\n" -" add.ftz.f32 %f295, %f291, %f291;\n" -" mul.ftz.f32 %f296, %f288, %f288;\n" -" mul.ftz.f32 %f297, %f289, %f289;\n" -" mul.ftz.f32 %f298, %f290, %f290;\n" -" mul.ftz.f32 %f299, %f291, %f291;\n" -" mul.ftz.f32 %f300, %f285, %f285;\n" -" add.ftz.f32 %f301, %f290, %f290;\n" -" mul.ftz.f32 %f302, %f287, %f287;\n" -" mul.ftz.f32 %f303, %f294, %f290;\n" -" mul.ftz.f32 %f304, %f294, %f291;\n" -" mul.ftz.f32 %f305, %f295, %f288;\n" -" add.ftz.f32 %f306, %f296, %f297;\n" -" mul.ftz.f32 %f307, %f301, %f288;\n" -" sub.ftz.f32 %f308, %f303, %f305;\n" -" sub.ftz.f32 %f309, %f306, %f298;\n" -" add.ftz.f32 %f310, %f304, %f307;\n" -" mul.ftz.f32 %f311, %f293, %f308;\n" -" sub.ftz.f32 %f312, %f309, %f299;\n" -" mul.ftz.f32 %f313, %f302, %f310;\n" -" mul.ftz.f32 %f314, %f308, %f311;\n" -" mul.ftz.f32 %f315, %f300, %f312;\n" -" fma.rn.ftz.f32 %f316, %f312, %f315, %f314;\n" -" fma.rn.ftz.f32 %f317, %f313, %f310, %f316;\n" -" add.ftz.f32 %f318, %f185, %f317;\n" -" mov.f32 %f319, %f318;\n" -" .loc 16 301 0\n" -" mul.ftz.f32 %f320, %f294, %f288;\n" -" sub.ftz.f32 %f321, %f296, %f297;\n" -" mul.ftz.f32 %f322, %f301, %f291;\n" -" add.ftz.f32 %f323, %f303, %f305;\n" -" add.ftz.f32 %f324, %f298, %f321;\n" -" sub.ftz.f32 %f325, %f322, %f320;\n" -" sub.ftz.f32 %f326, %f324, %f299;\n" -" mul.ftz.f32 %f327, %f311, %f326;\n" -" fma.rn.ftz.f32 %f328, %f315, %f323, %f327;\n" -" fma.rn.ftz.f32 %f329, %f313, %f325, %f328;\n" -" add.ftz.f32 %f330, %f196, %f329;\n" -" mov.f32 %f331, %f330;\n" -" .loc 16 302 0\n" -" sub.ftz.f32 %f332, %f321, %f298;\n" -" sub.ftz.f32 %f333, %f304, %f307;\n" -" add.ftz.f32 %f334, %f320, %f322;\n" -" add.ftz.f32 %f335, %f299, %f332;\n" -" mul.ftz.f32 %f336, %f334, %f311;\n" -" fma.rn.ftz.f32 %f337, %f315, %f333, %f336;\n" -" fma.rn.ftz.f32 %f338, %f313, %f335, %f337;\n" -" add.ftz.f32 %f339, %f186, %f338;\n" -" mov.f32 %f340, %f339;\n" -" .loc 16 303 0\n" -" mov.f32 %f341, %f284;\n" -" .loc 16 304 0\n" -" mul.ftz.f32 %f342, %f300, %f323;\n" -" mul.ftz.f32 %f343, %f302, %f325;\n" -" mul.ftz.f32 %f344, %f293, %f326;\n" -" mul.ftz.f32 %f345, %f308, %f344;\n" -" fma.rn.ftz.f32 %f346, %f312, %f342, %f345;\n" -" fma.rn.ftz.f32 %f347, %f343, %f310, %f346;\n" -" add.ftz.f32 %f348, %f228, %f347;\n" -" mov.f32 %f349, %f348;\n" -" .loc 16 305 0\n" -" mul.ftz.f32 %f350, %f326, %f344;\n" -" fma.rn.ftz.f32 %f351, %f323, %f342, %f350;\n" -" fma.rn.ftz.f32 %f352, %f343, %f325, %f351;\n" -" add.ftz.f32 %f353, %f230, %f352;\n" -" .loc 16 306 0\n" -" mul.ftz.f32 %f354, %f334, %f344;\n" -" fma.rn.ftz.f32 %f355, %f333, %f342, %f354;\n" -" fma.rn.ftz.f32 %f356, %f343, %f335, %f355;\n" -" add.ftz.f32 %f357, %f232, %f356;\n" -" .loc 16 307 0\n" -" mul.ftz.f32 %f358, %f278, %f282;\n" -" mov.f32 %f359, %f358;\n" -" .loc 16 308 0\n" -" mul.ftz.f32 %f360, %f300, %f333;\n" -" mul.ftz.f32 %f361, %f293, %f334;\n" -" mul.ftz.f32 %f362, %f302, %f335;\n" -" mul.ftz.f32 %f363, %f308, %f361;\n" -" fma.rn.ftz.f32 %f364, %f312, %f360, %f363;\n" -" fma.rn.ftz.f32 %f365, %f310, %f362, %f364;\n" -" add.ftz.f32 %f366, %f213, %f365;\n" -" mov.f32 %f367, %f366;\n" -" .loc 16 309 0\n" -" mul.ftz.f32 %f368, %f326, %f361;\n" -" fma.rn.ftz.f32 %f369, %f323, %f360, %f368;\n" -" fma.rn.ftz.f32 %f370, %f325, %f362, %f369;\n" -" add.ftz.f32 %f371, %f214, %f370;\n" -" .loc 16 310 0\n" -" mul.ftz.f32 %f372, %f334, %f361;\n" -" fma.rn.ftz.f32 %f373, %f333, %f360, %f372;\n" -" fma.rn.ftz.f32 %f374, %f335, %f362, %f373;\n" -" add.ftz.f32 %f375, %f216, %f374;\n" -" abs.ftz.f32 %f376, %f348;\n" -" abs.ftz.f32 %f377, %f318;\n" -" setp.gt.ftz.f32 %p4, %f376, %f377;\n" -" @!%p4 bra $Lt_0_46594;\n" -" .loc 16 314 0\n" -" mov.f32 %f319, %f348;\n" -" mov.f32 %f349, %f318;\n" -" .loc 16 315 0\n" -" mov.f32 %f331, %f353;\n" -" mov.f32 %f353, %f330;\n" -" .loc 16 316 0\n" -" mov.f32 %f340, %f357;\n" -" mov.f32 %f357, %f339;\n" -" .loc 16 317 0\n" -" mov.f32 %f292, %f284;\n" -" mov.f32 %f341, %f283;\n" -"$Lt_0_46594:\n" -" mov.f32 %f378, %f319;\n" -" abs.ftz.f32 %f379, %f378;\n" -" abs.ftz.f32 %f380, %f366;\n" -" setp.lt.ftz.f32 %p5, %f379, %f380;\n" -" @!%p5 bra $Lt_0_47106;\n" -" .loc 16 321 0\n" -" mov.f32 %f319, %f366;\n" -" mov.f32 %f367, %f378;\n" -" .loc 16 322 0\n" -" mov.f32 %f381, %f331;\n" -" mov.f32 %f331, %f371;\n" -" mov.f32 %f371, %f381;\n" -" .loc 16 323 0\n" -" mov.f32 %f382, %f340;\n" -" mov.f32 %f340, %f375;\n" -" mov.f32 %f375, %f382;\n" -" .loc 16 324 0\n" -" mov.f32 %f383, %f292;\n" -" mov.f32 %f292, %f358;\n" -" mov.f32 %f359, %f383;\n" -"$Lt_0_47106:\n" -" mov.f32 %f384, %f319;\n" -" mov.f32 %f385, 0f00000000; \n" -" setp.neu.ftz.f32 %p6, %f384, %f385;\n" -" @!%p6 bra $Lt_0_47874;\n" -" bra.uni $Lt_0_48642;\n" -"$Lt_0_47874:\n" -" mov.f32 %f386, 0f00000000; \n" -" setp.neu.ftz.f32 %p7, %f349, %f386;\n" -" @!%p7 bra $Lt_0_48386;\n" -" .loc 16 338 0\n" -" mov.f32 %f319, %f349;\n" -" mov.f32 %f349, %f384;\n" -" .loc 16 339 0\n" -" mov.f32 %f387, %f331;\n" -" mov.f32 %f331, %f353;\n" -" mov.f32 %f353, %f387;\n" -" .loc 16 340 0\n" -" mov.f32 %f388, %f340;\n" -" mov.f32 %f340, %f357;\n" -" mov.f32 %f357, %f388;\n" -" .loc 16 341 0\n" -" mov.f32 %f389, %f292;\n" -" mov.f32 %f292, %f341;\n" -" mov.f32 %f341, %f389;\n" -" bra.uni $Lt_0_48642;\n" -"$Lt_0_48386:\n" -" mov.f32 %f390, 0f00000000; \n" -" setp.neu.ftz.f32 %p8, %f367, %f390;\n" -" @!%p8 bra $Lt_0_48898;\n" -" .loc 16 346 0\n" -" mov.f32 %f319, %f367;\n" -" mov.f32 %f367, %f384;\n" -" .loc 16 347 0\n" -" mov.f32 %f391, %f331;\n" -" mov.f32 %f331, %f371;\n" -" mov.f32 %f371, %f391;\n" -" .loc 16 348 0\n" -" mov.f32 %f392, %f340;\n" -" mov.f32 %f340, %f375;\n" -" mov.f32 %f375, %f392;\n" -" .loc 16 349 0\n" -" mov.f32 %f393, %f292;\n" -" mov.f32 %f292, %f359;\n" -" mov.f32 %f359, %f393;\n" -" bra.uni $Lt_0_48642;\n" -"$Lt_0_48898:\n" -" .loc 16 352 0\n" -" mov.s32 %r26, 2;\n" -" ld.param.u64 %rd38, [__cudaparm_kernel_ellipsoid_err_flag];\n" -" st.global.s32 [%rd38+0], %r26;\n" -"$Lt_0_48642:\n" -"$Lt_0_48130:\n" -"$Lt_0_47618:\n" -" .loc 16 355 0\n" -" div.approx.ftz.f32 %f394, %f349, %f319;\n" -" mul.ftz.f32 %f395, %f331, %f394;\n" -" sub.ftz.f32 %f396, %f353, %f395;\n" -" mov.f32 %f353, %f396;\n" -" .loc 16 356 0\n" -" mul.ftz.f32 %f397, %f340, %f394;\n" -" sub.ftz.f32 %f398, %f357, %f397;\n" -" mov.f32 %f357, %f398;\n" -" .loc 16 357 0\n" -" mul.ftz.f32 %f399, %f292, %f394;\n" -" sub.ftz.f32 %f400, %f341, %f399;\n" -" mov.f32 %f341, %f400;\n" -" .loc 16 359 0\n" -" div.approx.ftz.f32 %f401, %f367, %f319;\n" -" mul.ftz.f32 %f402, %f331, %f401;\n" -" sub.ftz.f32 %f371, %f371, %f402;\n" -" .loc 16 360 0\n" -" mul.ftz.f32 %f403, %f340, %f401;\n" -" sub.ftz.f32 %f375, %f375, %f403;\n" -" .loc 16 361 0\n" -" mul.ftz.f32 %f404, %f292, %f401;\n" -" sub.ftz.f32 %f359, %f359, %f404;\n" -" abs.ftz.f32 %f405, %f396;\n" -" abs.ftz.f32 %f406, %f371;\n" -" setp.lt.ftz.f32 %p9, %f405, %f406;\n" -" @!%p9 bra $Lt_0_49154;\n" -" .loc 16 366 0\n" -" mov.f32 %f353, %f371;\n" -" mov.f32 %f371, %f396;\n" -" .loc 16 367 0\n" -" mov.f32 %f357, %f375;\n" -" mov.f32 %f375, %f398;\n" -" .loc 16 368 0\n" -" mov.f32 %f341, %f359;\n" -" mov.f32 %f359, %f400;\n" -"$Lt_0_49154:\n" -" mov.f32 %f407, %f353;\n" -" mov.f32 %f408, 0f00000000; \n" -" setp.neu.ftz.f32 %p10, %f407, %f408;\n" -" @!%p10 bra $Lt_0_49922;\n" -" bra.uni $Lt_0_50178;\n" -"$Lt_0_49922:\n" -" mov.f32 %f409, 0f00000000; \n" -" setp.neu.ftz.f32 %p11, %f371, %f409;\n" -" @!%p11 bra $Lt_0_50178;\n" -" .loc 16 383 0\n" -" mov.f32 %f353, %f371;\n" -" mov.f32 %f371, %f407;\n" -" .loc 16 384 0\n" -" mov.f32 %f410, %f357;\n" -" mov.f32 %f357, %f375;\n" -" mov.f32 %f375, %f410;\n" -" .loc 16 385 0\n" -" mov.f32 %f411, %f341;\n" -" mov.f32 %f341, %f359;\n" -" mov.f32 %f359, %f411;\n" -"$Lt_0_50178:\n" -"$Lt_0_49666:\n" -" .loc 16 390 0\n" -" div.approx.ftz.f32 %f412, %f371, %f353;\n" -" mul.ftz.f32 %f413, %f357, %f412;\n" -" sub.ftz.f32 %f375, %f375, %f413;\n" -" .loc 16 391 0\n" -" mul.ftz.f32 %f414, %f341, %f412;\n" -" sub.ftz.f32 %f359, %f359, %f414;\n" -" mov.f32 %f415, 0f00000000; \n" -" setp.eq.ftz.f32 %p12, %f375, %f415;\n" -" @!%p12 bra $Lt_0_50690;\n" -" .loc 16 394 0\n" -" mov.s32 %r27, 2;\n" -" ld.param.u64 %rd39, [__cudaparm_kernel_ellipsoid_err_flag];\n" -" st.global.s32 [%rd39+0], %r27;\n" -"$Lt_0_50690:\n" -" .loc 16 396 0\n" -" div.approx.ftz.f32 %f416, %f359, %f375;\n" -" .loc 16 399 0\n" -" mul.ftz.f32 %f417, %f416, %f357;\n" -" sub.ftz.f32 %f418, %f341, %f417;\n" -" div.approx.ftz.f32 %f419, %f418, %f353;\n" -" .loc 16 403 0\n" -" mul.ftz.f32 %f420, %f419, %f331;\n" -" fma.rn.ftz.f32 %f421, %f340, %f416, %f420;\n" -" sub.ftz.f32 %f422, %f292, %f421;\n" -" div.approx.ftz.f32 %f423, %f422, %f319;\n" -" .loc 17 161 0\n" -" mul.ftz.f32 %f424, %f419, %f284;\n" -" fma.rn.ftz.f32 %f425, %f283, %f423, %f424;\n" -" fma.rn.ftz.f32 %f426, %f358, %f416, %f425;\n" -" mov.f32 %f427, 0f3f000000; \n" -" mul.ftz.f32 %f428, %f426, %f427;\n" -" rsqrt.approx.ftz.f32 %f429, %f428;\n" -" .loc 17 170 0\n" -" mul.ftz.f32 %f430, %f89, %f284;\n" -" mul.ftz.f32 %f431, %f71, %f284;\n" -" mul.ftz.f32 %f432, %f79, %f284;\n" -" fma.rn.ftz.f32 %f433, %f283, %f70, %f430;\n" -" fma.rn.ftz.f32 %f434, %f87, %f283, %f431;\n" -" fma.rn.ftz.f32 %f435, %f283, %f76, %f432;\n" -" fma.rn.ftz.f32 %f436, %f358, %f80, %f433;\n" -" fma.rn.ftz.f32 %f437, %f77, %f358, %f434;\n" -" fma.rn.ftz.f32 %f438, %f358, %f91, %f435;\n" -" mul.ftz.f32 %f439, %f58, %f436;\n" -" mul.ftz.f32 %f440, %f65, %f437;\n" -" mul.ftz.f32 %f441, %f68, %f438;\n" -" mul.ftz.f32 %f442, %f436, %f439;\n" -" fma.rn.ftz.f32 %f443, %f437, %f440, %f442;\n" -" fma.rn.ftz.f32 %f444, %f438, %f441, %f443;\n" -" sqrt.approx.ftz.f32 %f445, %f444;\n" -" .loc 17 171 0\n" -" mul.ftz.f32 %f446, %f326, %f284;\n" -" mul.ftz.f32 %f447, %f323, %f284;\n" -" mul.ftz.f32 %f448, %f325, %f284;\n" -" fma.rn.ftz.f32 %f449, %f283, %f308, %f446;\n" -" fma.rn.ftz.f32 %f450, %f283, %f312, %f447;\n" -" fma.rn.ftz.f32 %f451, %f283, %f310, %f448;\n" -" fma.rn.ftz.f32 %f452, %f358, %f334, %f449;\n" -" fma.rn.ftz.f32 %f453, %f358, %f333, %f450;\n" -" fma.rn.ftz.f32 %f454, %f358, %f335, %f451;\n" -" div.approx.ftz.f32 %f455, %f452, %f293;\n" -" div.approx.ftz.f32 %f456, %f453, %f300;\n" -" div.approx.ftz.f32 %f457, %f454, %f302;\n" -" mul.ftz.f32 %f458, %f452, %f455;\n" -" fma.rn.ftz.f32 %f459, %f453, %f456, %f458;\n" -" fma.rn.ftz.f32 %f460, %f454, %f457, %f459;\n" -" sqrt.approx.ftz.f32 %f461, %f460;\n" -" .loc 17 184 0\n" -" mul.ftz.f32 %f462, %f317, %f461;\n" -" mul.ftz.f32 %f463, %f338, %f461;\n" -" mul.ftz.f32 %f464, %f329, %f461;\n" -" mul.ftz.f32 %f465, %f365, %f461;\n" -" mul.ftz.f32 %f466, %f370, %f461;\n" -" mul.ftz.f32 %f467, %f374, %f461;\n" -" fma.rn.ftz.f32 %f468, %f185, %f445, %f462;\n" -" fma.rn.ftz.f32 %f469, %f186, %f445, %f463;\n" -" fma.rn.ftz.f32 %f470, %f196, %f445, %f464;\n" -" mul.ftz.f32 %f471, %f347, %f461;\n" -" mul.ftz.f32 %f472, %f352, %f461;\n" -" mul.ftz.f32 %f473, %f356, %f461;\n" -" fma.rn.ftz.f32 %f474, %f213, %f445, %f465;\n" -" fma.rn.ftz.f32 %f475, %f214, %f445, %f466;\n" -" fma.rn.ftz.f32 %f476, %f216, %f445, %f467;\n" -" fma.rn.ftz.f32 %f477, %f228, %f445, %f471;\n" -" fma.rn.ftz.f32 %f478, %f230, %f445, %f472;\n" -" fma.rn.ftz.f32 %f479, %f232, %f445, %f473;\n" -" mul.ftz.f32 %f480, %f470, %f474;\n" -" mul.ftz.f32 %f481, %f469, %f474;\n" -" mul.ftz.f32 %f482, %f470, %f477;\n" -" mul.ftz.f32 %f483, %f469, %f477;\n" -" mul.ftz.f32 %f484, %f468, %f478;\n" -" mul.ftz.f32 %f485, %f468, %f479;\n" -" mul.ftz.f32 %f486, %f475, %f485;\n" -" mul.ftz.f32 %f487, %f476, %f484;\n" -" sub.ftz.f32 %f488, %f487, %f486;\n" -" mul.ftz.f32 %f489, %f476, %f482;\n" -" sub.ftz.f32 %f490, %f488, %f489;\n" -" fma.rn.ftz.f32 %f491, %f475, %f483, %f490;\n" -" fma.rn.ftz.f32 %f492, %f479, %f480, %f491;\n" -" mul.ftz.f32 %f493, %f478, %f481;\n" -" sub.ftz.f32 %f494, %f492, %f493;\n" -" .loc 17 201 0\n" -" add.s32 %r28, %r25, %r22;\n" -" cvt.s64.s32 %rd40, %r28;\n" -" mul.wide.s32 %rd41, %r28, 8;\n" -" add.u64 %rd42, %rd29, %rd41;\n" -" ld.global.v2.f32 {%f495,%f496}, [%rd42+0];\n" -" .loc 17 202 0\n" -" shr.s32 %r29, %r23, 30;\n" -" and.b32 %r30, %r29, 3;\n" -" cvt.s64.s32 %rd43, %r30;\n" -" mul.wide.s32 %rd44, %r30, 4;\n" -" add.u64 %rd45, %rd30, %rd44;\n" -" ld.shared.f32 %f497, [%rd45+0];\n" -" mul.ftz.f32 %f498, %f497, %f496;\n" -" .loc 17 207 0\n" -" add.u64 %rd46, %rd35, %rd19;\n" -" ld.global.v4.f32 {%f499,%f500,%f501,_}, [%rd46+0];\n" -" .loc 16 299 0\n" -" mov.f32 %f292, %f283;\n" -" .loc 16 300 0\n" -" mul.ftz.f32 %f502, %f308, %f500;\n" -" mul.ftz.f32 %f503, %f310, %f501;\n" -" mul.ftz.f32 %f504, %f308, %f502;\n" -" mul.ftz.f32 %f505, %f312, %f499;\n" -" fma.rn.ftz.f32 %f506, %f312, %f505, %f504;\n" -" fma.rn.ftz.f32 %f507, %f503, %f310, %f506;\n" -" add.ftz.f32 %f508, %f194, %f507;\n" -" mov.f32 %f319, %f508;\n" -" .loc 16 301 0\n" -" mul.ftz.f32 %f509, %f502, %f326;\n" -" fma.rn.ftz.f32 %f510, %f505, %f323, %f509;\n" -" fma.rn.ftz.f32 %f511, %f503, %f325, %f510;\n" -" add.ftz.f32 %f512, %f206, %f511;\n" -" mov.f32 %f331, %f512;\n" -" .loc 16 302 0\n" -" mul.ftz.f32 %f513, %f334, %f502;\n" -" fma.rn.ftz.f32 %f514, %f505, %f333, %f513;\n" -" fma.rn.ftz.f32 %f515, %f503, %f335, %f514;\n" -" add.ftz.f32 %f516, %f195, %f515;\n" -" mov.f32 %f340, %f516;\n" -" .loc 16 303 0\n" -" mov.f32 %f341, %f284;\n" -" .loc 16 304 0\n" -" mul.ftz.f32 %f517, %f323, %f499;\n" -" mul.ftz.f32 %f518, %f325, %f501;\n" -" mul.ftz.f32 %f519, %f326, %f500;\n" -" mul.ftz.f32 %f520, %f308, %f519;\n" -" fma.rn.ftz.f32 %f521, %f312, %f517, %f520;\n" -" fma.rn.ftz.f32 %f522, %f518, %f310, %f521;\n" -" add.ftz.f32 %f523, %f240, %f522;\n" -" mov.f32 %f349, %f523;\n" -" .loc 16 305 0\n" -" mul.ftz.f32 %f524, %f326, %f519;\n" -" fma.rn.ftz.f32 %f525, %f323, %f517, %f524;\n" -" fma.rn.ftz.f32 %f526, %f518, %f325, %f525;\n" -" add.ftz.f32 %f353, %f241, %f526;\n" -" .loc 16 306 0\n" -" mul.ftz.f32 %f527, %f334, %f519;\n" -" fma.rn.ftz.f32 %f528, %f333, %f517, %f527;\n" -" fma.rn.ftz.f32 %f529, %f518, %f335, %f528;\n" -" add.ftz.f32 %f357, %f242, %f529;\n" -" .loc 16 307 0\n" -" mov.f32 %f359, %f358;\n" -" .loc 16 308 0\n" -" mul.ftz.f32 %f530, %f333, %f499;\n" -" mul.ftz.f32 %f531, %f334, %f500;\n" -" mul.ftz.f32 %f532, %f335, %f501;\n" -" mul.ftz.f32 %f533, %f308, %f531;\n" -" fma.rn.ftz.f32 %f534, %f312, %f530, %f533;\n" -" fma.rn.ftz.f32 %f535, %f310, %f532, %f534;\n" -" add.ftz.f32 %f536, %f221, %f535;\n" -" mov.f32 %f367, %f536;\n" -" .loc 16 309 0\n" -" mul.ftz.f32 %f537, %f326, %f531;\n" -" fma.rn.ftz.f32 %f538, %f323, %f530, %f537;\n" -" fma.rn.ftz.f32 %f539, %f325, %f532, %f538;\n" -" add.ftz.f32 %f371, %f222, %f539;\n" -" .loc 16 310 0\n" -" mul.ftz.f32 %f540, %f334, %f531;\n" -" fma.rn.ftz.f32 %f541, %f333, %f530, %f540;\n" -" fma.rn.ftz.f32 %f542, %f335, %f532, %f541;\n" -" add.ftz.f32 %f375, %f223, %f542;\n" -" abs.ftz.f32 %f543, %f523;\n" -" abs.ftz.f32 %f544, %f508;\n" -" setp.gt.ftz.f32 %p13, %f543, %f544;\n" -" @!%p13 bra $Lt_0_51202;\n" -" .loc 16 314 0\n" -" mov.f32 %f319, %f523;\n" -" mov.f32 %f349, %f508;\n" -" .loc 16 315 0\n" -" mov.f32 %f331, %f353;\n" -" mov.f32 %f353, %f512;\n" -" .loc 16 316 0\n" -" mov.f32 %f340, %f357;\n" -" mov.f32 %f357, %f516;\n" -" .loc 16 317 0\n" -" mov.f32 %f292, %f284;\n" -" mov.f32 %f341, %f283;\n" -"$Lt_0_51202:\n" -" mov.f32 %f545, %f319;\n" -" abs.ftz.f32 %f546, %f545;\n" -" abs.ftz.f32 %f547, %f536;\n" -" setp.lt.ftz.f32 %p14, %f546, %f547;\n" -" @!%p14 bra $Lt_0_51714;\n" -" .loc 16 321 0\n" -" mov.f32 %f319, %f536;\n" -" mov.f32 %f367, %f545;\n" -" .loc 16 322 0\n" -" mov.f32 %f548, %f331;\n" -" mov.f32 %f331, %f371;\n" -" mov.f32 %f371, %f548;\n" -" .loc 16 323 0\n" -" mov.f32 %f549, %f340;\n" -" mov.f32 %f340, %f375;\n" -" mov.f32 %f375, %f549;\n" -" .loc 16 324 0\n" -" mov.f32 %f550, %f292;\n" -" mov.f32 %f292, %f358;\n" -" mov.f32 %f359, %f550;\n" -"$Lt_0_51714:\n" -" mov.f32 %f551, %f319;\n" -" mov.f32 %f552, 0f00000000; \n" -" setp.neu.ftz.f32 %p15, %f551, %f552;\n" -" @!%p15 bra $Lt_0_52482;\n" -" bra.uni $Lt_0_53250;\n" -"$Lt_0_52482:\n" -" mov.f32 %f553, 0f00000000; \n" -" setp.neu.ftz.f32 %p16, %f349, %f553;\n" -" @!%p16 bra $Lt_0_52994;\n" -" .loc 16 338 0\n" -" mov.f32 %f319, %f349;\n" -" mov.f32 %f349, %f551;\n" -" .loc 16 339 0\n" -" mov.f32 %f554, %f331;\n" -" mov.f32 %f331, %f353;\n" -" mov.f32 %f353, %f554;\n" -" .loc 16 340 0\n" -" mov.f32 %f555, %f340;\n" -" mov.f32 %f340, %f357;\n" -" mov.f32 %f357, %f555;\n" -" .loc 16 341 0\n" -" mov.f32 %f556, %f292;\n" -" mov.f32 %f292, %f341;\n" -" mov.f32 %f341, %f556;\n" -" bra.uni $Lt_0_53250;\n" -"$Lt_0_52994:\n" -" mov.f32 %f557, 0f00000000; \n" -" setp.neu.ftz.f32 %p17, %f367, %f557;\n" -" @!%p17 bra $Lt_0_53506;\n" -" .loc 16 346 0\n" -" mov.f32 %f319, %f367;\n" -" mov.f32 %f367, %f551;\n" -" .loc 16 347 0\n" -" mov.f32 %f558, %f331;\n" -" mov.f32 %f331, %f371;\n" -" mov.f32 %f371, %f558;\n" -" .loc 16 348 0\n" -" mov.f32 %f559, %f340;\n" -" mov.f32 %f340, %f375;\n" -" mov.f32 %f375, %f559;\n" -" .loc 16 349 0\n" -" mov.f32 %f560, %f292;\n" -" mov.f32 %f292, %f359;\n" -" mov.f32 %f359, %f560;\n" -" bra.uni $Lt_0_53250;\n" -"$Lt_0_53506:\n" -" .loc 16 352 0\n" -" mov.s32 %r31, 2;\n" -" ld.param.u64 %rd47, [__cudaparm_kernel_ellipsoid_err_flag];\n" -" st.global.s32 [%rd47+0], %r31;\n" -"$Lt_0_53250:\n" -"$Lt_0_52738:\n" -"$Lt_0_52226:\n" -" .loc 16 355 0\n" -" div.approx.ftz.f32 %f561, %f349, %f319;\n" -" mul.ftz.f32 %f562, %f331, %f561;\n" -" sub.ftz.f32 %f563, %f353, %f562;\n" -" mov.f32 %f353, %f563;\n" -" .loc 16 356 0\n" -" mul.ftz.f32 %f564, %f340, %f561;\n" -" sub.ftz.f32 %f565, %f357, %f564;\n" -" mov.f32 %f357, %f565;\n" -" .loc 16 357 0\n" -" mul.ftz.f32 %f566, %f292, %f561;\n" -" sub.ftz.f32 %f567, %f341, %f566;\n" -" mov.f32 %f341, %f567;\n" -" .loc 16 359 0\n" -" div.approx.ftz.f32 %f568, %f367, %f319;\n" -" mul.ftz.f32 %f569, %f331, %f568;\n" -" sub.ftz.f32 %f371, %f371, %f569;\n" -" .loc 16 360 0\n" -" mul.ftz.f32 %f570, %f340, %f568;\n" -" sub.ftz.f32 %f375, %f375, %f570;\n" -" .loc 16 361 0\n" -" mul.ftz.f32 %f571, %f292, %f568;\n" -" sub.ftz.f32 %f359, %f359, %f571;\n" -" abs.ftz.f32 %f572, %f563;\n" -" abs.ftz.f32 %f573, %f371;\n" -" setp.lt.ftz.f32 %p18, %f572, %f573;\n" -" @!%p18 bra $Lt_0_53762;\n" -" .loc 16 366 0\n" -" mov.f32 %f353, %f371;\n" -" mov.f32 %f371, %f563;\n" -" .loc 16 367 0\n" -" mov.f32 %f357, %f375;\n" -" mov.f32 %f375, %f565;\n" -" .loc 16 368 0\n" -" mov.f32 %f341, %f359;\n" -" mov.f32 %f359, %f567;\n" -"$Lt_0_53762:\n" -" mov.f32 %f574, %f353;\n" -" mov.f32 %f575, 0f00000000; \n" -" setp.neu.ftz.f32 %p19, %f574, %f575;\n" -" @!%p19 bra $Lt_0_54530;\n" -" bra.uni $Lt_0_54786;\n" -"$Lt_0_54530:\n" -" mov.f32 %f576, 0f00000000; \n" -" setp.neu.ftz.f32 %p20, %f371, %f576;\n" -" @!%p20 bra $Lt_0_54786;\n" -" .loc 16 383 0\n" -" mov.f32 %f353, %f371;\n" -" mov.f32 %f371, %f574;\n" -" .loc 16 384 0\n" -" mov.f32 %f577, %f357;\n" -" mov.f32 %f357, %f375;\n" -" mov.f32 %f375, %f577;\n" -" .loc 16 385 0\n" -" mov.f32 %f578, %f341;\n" -" mov.f32 %f341, %f359;\n" -" mov.f32 %f359, %f578;\n" -"$Lt_0_54786:\n" -"$Lt_0_54274:\n" -" .loc 16 390 0\n" -" div.approx.ftz.f32 %f579, %f371, %f353;\n" -" mul.ftz.f32 %f580, %f357, %f579;\n" -" sub.ftz.f32 %f375, %f375, %f580;\n" -" .loc 16 391 0\n" -" mul.ftz.f32 %f581, %f341, %f579;\n" -" sub.ftz.f32 %f359, %f359, %f581;\n" -" mov.f32 %f582, 0f00000000; \n" -" setp.eq.ftz.f32 %p21, %f375, %f582;\n" -" @!%p21 bra $Lt_0_55298;\n" -" .loc 16 394 0\n" -" mov.s32 %r32, 2;\n" -" ld.param.u64 %rd48, [__cudaparm_kernel_ellipsoid_err_flag];\n" -" st.global.s32 [%rd48+0], %r32;\n" -"$Lt_0_55298:\n" -" .loc 17 213 0\n" -" div.approx.ftz.f32 %f583, %f359, %f375;\n" -" mul.ftz.f32 %f584, %f583, %f357;\n" -" sub.ftz.f32 %f585, %f341, %f584;\n" -" div.approx.ftz.f32 %f586, %f585, %f353;\n" -" mul.ftz.f32 %f587, %f586, %f331;\n" -" fma.rn.ftz.f32 %f588, %f340, %f583, %f587;\n" -" mul.ftz.f32 %f589, %f586, %f284;\n" -" sub.ftz.f32 %f590, %f292, %f588;\n" -" div.approx.ftz.f32 %f591, %f590, %f319;\n" -" fma.rn.ftz.f32 %f592, %f283, %f591, %f589;\n" -" fma.rn.ftz.f32 %f593, %f358, %f583, %f592;\n" -" add.ftz.f32 %f594, %f593, %f593;\n" -" .loc 17 220 0\n" -" rcp.approx.ftz.f32 %f595, %f282;\n" -" sub.ftz.f32 %f596, %f595, %f429;\n" -" mov.f32 %f597, 0f3f000000; \n" -" mul.ftz.f32 %f598, %f596, %f597;\n" -" add.ftz.f32 %f599, %f598, %f287;\n" -" add.ftz.f32 %f600, %f598, %f286;\n" -" add.ftz.f32 %f601, %f598, %f285;\n" -" add.ftz.f32 %f602, %f598, %f36;\n" -" add.ftz.f32 %f603, %f598, %f34;\n" -" add.ftz.f32 %f604, %f598, %f35;\n" -" mul.ftz.f32 %f605, %f603, %f604;\n" -" mul.ftz.f32 %f606, %f602, %f605;\n" -" mul.ftz.f32 %f607, %f601, %f606;\n" -" mul.ftz.f32 %f608, %f600, %f607;\n" -" mul.ftz.f32 %f609, %f599, %f608;\n" -" .loc 17 223 0\n" -" mul.ftz.f32 %f610, %f461, %f461;\n" -" mul.ftz.f32 %f611, %f285, %f286;\n" -" mul.ftz.f32 %f612, %f445, %f445;\n" -" rcp.approx.ftz.f32 %f613, %f445;\n" -" rcp.approx.ftz.f32 %f614, %f461;\n" -" mul.ftz.f32 %f615, %f611, %f287;\n" -" add.ftz.f32 %f616, %f613, %f614;\n" -" mul.ftz.f32 %f617, %f610, %f615;\n" -" mul.ftz.f32 %f618, %f615, %f69;\n" -" div.approx.ftz.f32 %f619, %f616, %f494;\n" -" fma.rn.ftz.f32 %f620, %f69, %f612, %f617;\n" -" rsqrt.approx.ftz.f32 %f621, %f619;\n" -" div.approx.ftz.f32 %f622, %f620, %f621;\n" -" mul.ftz.f32 %f623, %f622, %f594;\n" -" div.approx.ftz.f32 %f624, %f495, %f596;\n" -" mul.ftz.f32 %f625, %f623, %f624;\n" -" mov.f32 %f626, 0f3f800000; \n" -" mov.f32 %f627, 0f40400000; \n" -" fma.rn.ftz.f32 %f628, %f627, %f625, %f626;\n" -" mul.ftz.f32 %f629, %f618, %f628;\n" -" .loc 17 228 0\n" -" div.approx.ftz.f32 %f630, %f596, %f17;\n" -" add.ftz.f32 %f631, %f630, %f287;\n" -" add.ftz.f32 %f632, %f630, %f286;\n" -" add.ftz.f32 %f633, %f630, %f285;\n" -" add.ftz.f32 %f634, %f630, %f36;\n" -" add.ftz.f32 %f635, %f630, %f34;\n" -" add.ftz.f32 %f636, %f630, %f35;\n" -" mul.ftz.f32 %f637, %f635, %f636;\n" -" mul.ftz.f32 %f638, %f634, %f637;\n" -" mul.ftz.f32 %f639, %f633, %f638;\n" -" mul.ftz.f32 %f640, %f632, %f639;\n" -" mul.ftz.f32 %f641, %f631, %f640;\n" -" .loc 17 231 0\n" -" mov.f32 %f642, 0f3f800000; \n" -" mov.f32 %f643, 0f3f4db6db; \n" -" fma.rn.ftz.f32 %f644, %f643, %f625, %f642;\n" -" mul.ftz.f32 %f645, %f618, %f644;\n" -" .loc 17 233 0\n" -" mul.ftz.f32 %f646, %f624, %f624;\n" -" mul.ftz.f32 %f647, %f624, %f646;\n" -" mul.ftz.f32 %f648, %f647, %f647;\n" -" .loc 17 236 0\n" -" div.approx.ftz.f32 %f649, %f629, %f609;\n" -" div.approx.ftz.f32 %f650, %f645, %f641;\n" -" mul.ftz.f32 %f651, %f649, %f498;\n" -" mul.ftz.f32 %f652, %f650, %f498;\n" -" mov.f32 %f653, 0fc2100000; \n" -" div.approx.ftz.f32 %f654, %f651, %f653;\n" -" mul.ftz.f32 %f655, %f652, %f648;\n" -" mov.f32 %f656, 0f44fd2000; \n" -" div.approx.ftz.f32 %f657, %f655, %f656;\n" -" add.ftz.f32 %f658, %f654, %f657;\n" -" add.ftz.f32 %f271, %f271, %f658;\n" -" .loc 17 246 0\n" -" div.approx.ftz.f32 %f659, %f613, %f612;\n" -" mul.ftz.f32 %f660, %f659, %f440;\n" -" neg.ftz.f32 %f661, %f660;\n" -" .loc 17 247 0\n" -" mul.ftz.f32 %f662, %f659, %f439;\n" -" neg.ftz.f32 %f663, %f662;\n" -" .loc 17 248 0\n" -" mul.ftz.f32 %f664, %f659, %f441;\n" -" neg.ftz.f32 %f665, %f664;\n" -" .loc 17 249 0\n" -" div.approx.ftz.f32 %f666, %f614, %f610;\n" -" mul.ftz.f32 %f667, %f666, %f456;\n" -" neg.ftz.f32 %f668, %f667;\n" -" .loc 17 250 0\n" -" mul.ftz.f32 %f669, %f666, %f455;\n" -" neg.ftz.f32 %f670, %f669;\n" -" .loc 17 251 0\n" -" mul.ftz.f32 %f671, %f666, %f457;\n" -" neg.ftz.f32 %f672, %f671;\n" -" .loc 21 544 0\n" -" add.ftz.f32 %f673, %f622, %f622;\n" -" div.approx.ftz.f32 %f674, %f673, %f620;\n" -" mul.ftz.f32 %f675, %f615, %f674;\n" -" div.approx.ftz.f32 %f676, %f675, %f666;\n" -" mul.ftz.f32 %f677, %f69, %f674;\n" -" div.approx.ftz.f32 %f678, %f677, %f659;\n" -" .loc 17 278 0\n" -" mov.f32 %f679, 0f40800000; \n" -" mul.ftz.f32 %f680, %f591, %f679;\n" -" .loc 17 286 0\n" -" add.ftz.f32 %f681, %f55, %f596;\n" -" rcp.approx.ftz.f32 %f682, %f681;\n" -" add.ftz.f32 %f683, %f56, %f596;\n" -" rcp.approx.ftz.f32 %f684, %f683;\n" -" add.ftz.f32 %f685, %f682, %f684;\n" -" add.ftz.f32 %f686, %f57, %f596;\n" -" rcp.approx.ftz.f32 %f687, %f686;\n" -" add.ftz.f32 %f688, %f685, %f687;\n" -" add.ftz.f32 %f689, %f285, %f285;\n" -" add.ftz.f32 %f690, %f596, %f689;\n" -" rcp.approx.ftz.f32 %f691, %f690;\n" -" add.ftz.f32 %f692, %f688, %f691;\n" -" add.ftz.f32 %f693, %f286, %f286;\n" -" add.ftz.f32 %f694, %f596, %f693;\n" -" rcp.approx.ftz.f32 %f695, %f694;\n" -" add.ftz.f32 %f696, %f692, %f695;\n" -" add.ftz.f32 %f697, %f287, %f287;\n" -" add.ftz.f32 %f698, %f596, %f697;\n" -" rcp.approx.ftz.f32 %f699, %f698;\n" -" add.ftz.f32 %f700, %f696, %f699;\n" -" .loc 17 293 0\n" -" mul.ftz.f32 %f701, %f622, %f495;\n" -" mul.ftz.f32 %f702, %f701, %f594;\n" -" mov.f32 %f703, 0f40400000; \n" -" fma.rn.ftz.f32 %f704, %f703, %f702, %f596;\n" -" rcp.approx.ftz.f32 %f705, %f704;\n" -" rcp.approx.ftz.f32 %f706, %f596;\n" -" sub.ftz.f32 %f707, %f706, %f705;\n" -" add.ftz.f32 %f708, %f700, %f707;\n" -" .loc 17 297 0\n" -" fma.rn.ftz.f32 %f709, %f17, %f34, %f596;\n" -" rcp.approx.ftz.f32 %f710, %f709;\n" -" fma.rn.ftz.f32 %f711, %f17, %f35, %f596;\n" -" rcp.approx.ftz.f32 %f712, %f711;\n" -" add.ftz.f32 %f713, %f710, %f712;\n" -" fma.rn.ftz.f32 %f714, %f17, %f36, %f596;\n" -" rcp.approx.ftz.f32 %f715, %f714;\n" -" add.ftz.f32 %f716, %f713, %f715;\n" -" fma.rn.ftz.f32 %f717, %f17, %f285, %f596;\n" -" rcp.approx.ftz.f32 %f718, %f717;\n" -" add.ftz.f32 %f719, %f716, %f718;\n" -" fma.rn.ftz.f32 %f720, %f17, %f286, %f596;\n" -" rcp.approx.ftz.f32 %f721, %f720;\n" -" add.ftz.f32 %f722, %f719, %f721;\n" -" fma.rn.ftz.f32 %f723, %f17, %f287, %f596;\n" -" rcp.approx.ftz.f32 %f724, %f723;\n" -" add.ftz.f32 %f725, %f722, %f724;\n" -" .loc 17 304 0\n" -" mov.f32 %f726, 0f40e00000; \n" -" div.approx.ftz.f32 %f727, %f726, %f596;\n" -" mov.f32 %f728, 0f3f4db6db; \n" -" fma.rn.ftz.f32 %f729, %f728, %f702, %f596;\n" -" rcp.approx.ftz.f32 %f730, %f729;\n" -" sub.ftz.f32 %f731, %f727, %f730;\n" -" add.ftz.f32 %f732, %f731, %f725;\n" -" .loc 17 314 0\n" -" mul.ftz.f32 %f733, %f283, %f283;\n" -" neg.ftz.f32 %f734, %f733;\n" -" mov.f32 %f735, %f734;\n" -" .loc 17 315 0\n" -" mul.ftz.f32 %f736, %f284, %f283;\n" -" neg.ftz.f32 %f737, %f736;\n" -" mov.f32 %f738, %f737;\n" -" .loc 17 316 0\n" -" mul.ftz.f32 %f739, %f358, %f283;\n" -" neg.ftz.f32 %f740, %f739;\n" -" mov.f32 %f741, %f740;\n" -" .loc 17 317 0\n" -" mov.f32 %f742, 0f3f800000; \n" -" sub.ftz.f32 %f743, %f742, %f733;\n" -" mov.f32 %f744, %f743;\n" -" .loc 17 318 0\n" -" mul.ftz.f32 %f745, %f282, %f743;\n" -" mov.f32 %f746, %f745;\n" -" .loc 17 319 0\n" -" mov.f32 %f747, %f738;\n" -" mul.ftz.f32 %f748, %f747, %f282;\n" -" mov.f32 %f749, %f748;\n" -" .loc 17 320 0\n" -" mov.f32 %f750, %f741;\n" -" mul.ftz.f32 %f751, %f750, %f282;\n" -" mov.f32 %f752, %f751;\n" -" .loc 17 325 0\n" -" mul.ftz.f32 %f753, %f71, %f748;\n" -" mul.ftz.f32 %f754, %f79, %f748;\n" -" mul.ftz.f32 %f755, %f323, %f748;\n" -" mul.ftz.f32 %f756, %f325, %f748;\n" -" mul.ftz.f32 %f757, %f89, %f748;\n" -" mul.ftz.f32 %f758, %f326, %f748;\n" -" mul.ftz.f32 %f759, %f612, %f185;\n" -" mul.ftz.f32 %f760, %f610, %f317;\n" -" neg.ftz.f32 %f761, %f759;\n" -" fma.rn.ftz.f32 %f762, %f745, %f308, %f758;\n" -" fma.rn.ftz.f32 %f763, %f312, %f745, %f755;\n" -" fma.rn.ftz.f32 %f764, %f745, %f310, %f756;\n" -" fma.rn.ftz.f32 %f765, %f745, %f70, %f757;\n" -" fma.rn.ftz.f32 %f766, %f87, %f745, %f753;\n" -" fma.rn.ftz.f32 %f767, %f745, %f76, %f754;\n" -" fma.rn.ftz.f32 %f768, %f751, %f334, %f762;\n" -" fma.rn.ftz.f32 %f769, %f333, %f751, %f763;\n" -" fma.rn.ftz.f32 %f770, %f751, %f335, %f764;\n" -" fma.rn.ftz.f32 %f771, %f751, %f80, %f765;\n" -" fma.rn.ftz.f32 %f772, %f77, %f751, %f766;\n" -" fma.rn.ftz.f32 %f773, %f751, %f91, %f767;\n" -" mul.ftz.f32 %f774, %f768, %f670;\n" -" mul.ftz.f32 %f775, %f771, %f663;\n" -" fma.rn.ftz.f32 %f776, %f668, %f769, %f774;\n" -" fma.rn.ftz.f32 %f777, %f661, %f772, %f775;\n" -" fma.rn.ftz.f32 %f778, %f672, %f770, %f776;\n" -" fma.rn.ftz.f32 %f779, %f665, %f773, %f777;\n" -" mul.ftz.f32 %f780, %f760, %f778;\n" -" mul.ftz.f32 %f781, %f761, %f779;\n" -" sub.ftz.f32 %f782, %f781, %f780;\n" -" .loc 17 326 0\n" -" mul.ftz.f32 %f783, %f612, %f196;\n" -" mul.ftz.f32 %f784, %f610, %f329;\n" -" neg.ftz.f32 %f785, %f783;\n" -" mul.ftz.f32 %f786, %f784, %f778;\n" -" mul.ftz.f32 %f787, %f785, %f779;\n" -" sub.ftz.f32 %f788, %f787, %f786;\n" -" .loc 17 327 0\n" -" mul.ftz.f32 %f789, %f612, %f186;\n" -" mul.ftz.f32 %f790, %f610, %f338;\n" -" neg.ftz.f32 %f791, %f789;\n" -" mul.ftz.f32 %f792, %f790, %f778;\n" -" mul.ftz.f32 %f793, %f791, %f779;\n" -" sub.ftz.f32 %f794, %f793, %f792;\n" -" .loc 17 328 0\n" -" mul.ftz.f32 %f795, %f612, %f228;\n" -" mul.ftz.f32 %f796, %f610, %f347;\n" -" neg.ftz.f32 %f797, %f795;\n" -" mul.ftz.f32 %f798, %f796, %f778;\n" -" mul.ftz.f32 %f799, %f797, %f779;\n" -" sub.ftz.f32 %f800, %f799, %f798;\n" -" .loc 17 329 0\n" -" mul.ftz.f32 %f801, %f612, %f230;\n" -" mul.ftz.f32 %f802, %f610, %f352;\n" -" neg.ftz.f32 %f803, %f801;\n" -" mul.ftz.f32 %f804, %f802, %f778;\n" -" mul.ftz.f32 %f805, %f803, %f779;\n" -" sub.ftz.f32 %f806, %f805, %f804;\n" -" .loc 17 330 0\n" -" mul.ftz.f32 %f807, %f612, %f232;\n" -" mul.ftz.f32 %f808, %f610, %f356;\n" -" neg.ftz.f32 %f809, %f807;\n" -" mul.ftz.f32 %f810, %f808, %f778;\n" -" mul.ftz.f32 %f811, %f809, %f779;\n" -" sub.ftz.f32 %f812, %f811, %f810;\n" -" .loc 17 331 0\n" -" mul.ftz.f32 %f813, %f612, %f213;\n" -" mul.ftz.f32 %f814, %f610, %f365;\n" -" neg.ftz.f32 %f815, %f813;\n" -" mul.ftz.f32 %f816, %f814, %f778;\n" -" mul.ftz.f32 %f817, %f815, %f779;\n" -" sub.ftz.f32 %f818, %f817, %f816;\n" -" .loc 17 332 0\n" -" mul.ftz.f32 %f819, %f612, %f214;\n" -" mul.ftz.f32 %f820, %f610, %f370;\n" -" neg.ftz.f32 %f821, %f819;\n" -" mul.ftz.f32 %f822, %f820, %f778;\n" -" mul.ftz.f32 %f823, %f821, %f779;\n" -" sub.ftz.f32 %f824, %f823, %f822;\n" -" .loc 17 333 0\n" -" mul.ftz.f32 %f825, %f612, %f216;\n" -" mul.ftz.f32 %f826, %f610, %f374;\n" -" neg.ftz.f32 %f827, %f825;\n" -" mul.ftz.f32 %f828, %f826, %f778;\n" -" mul.ftz.f32 %f829, %f827, %f779;\n" -" sub.ftz.f32 %f830, %f829, %f828;\n" -" .loc 17 334 0\n" -" mul.ftz.f32 %f831, %f479, %f782;\n" -" mul.ftz.f32 %f832, %f475, %f831;\n" -" mul.ftz.f32 %f833, %f478, %f782;\n" -" mul.ftz.f32 %f834, %f476, %f833;\n" -" sub.ftz.f32 %f835, %f834, %f832;\n" -" mul.ftz.f32 %f836, %f477, %f788;\n" -" mul.ftz.f32 %f837, %f476, %f836;\n" -" sub.ftz.f32 %f838, %f835, %f837;\n" -" mul.ftz.f32 %f839, %f477, %f794;\n" -" fma.rn.ftz.f32 %f840, %f475, %f839, %f838;\n" -" mul.ftz.f32 %f841, %f474, %f788;\n" -" fma.rn.ftz.f32 %f842, %f479, %f841, %f840;\n" -" mul.ftz.f32 %f843, %f474, %f794;\n" -" mul.ftz.f32 %f844, %f478, %f843;\n" -" sub.ftz.f32 %f845, %f842, %f844;\n" -" mul.ftz.f32 %f846, %f468, %f806;\n" -" fma.rn.ftz.f32 %f847, %f476, %f846, %f845;\n" -" mul.ftz.f32 %f848, %f468, %f812;\n" -" mul.ftz.f32 %f849, %f475, %f848;\n" -" sub.ftz.f32 %f850, %f847, %f849;\n" -" mul.ftz.f32 %f851, %f470, %f800;\n" -" mul.ftz.f32 %f852, %f476, %f851;\n" -" sub.ftz.f32 %f853, %f850, %f852;\n" -" mul.ftz.f32 %f854, %f469, %f800;\n" -" fma.rn.ftz.f32 %f855, %f475, %f854, %f853;\n" -" fma.rn.ftz.f32 %f856, %f812, %f480, %f855;\n" -" mul.ftz.f32 %f857, %f806, %f481;\n" -" sub.ftz.f32 %f858, %f856, %f857;\n" -" fma.rn.ftz.f32 %f859, %f830, %f484, %f858;\n" -" mul.ftz.f32 %f860, %f824, %f485;\n" -" sub.ftz.f32 %f861, %f859, %f860;\n" -" mul.ftz.f32 %f862, %f830, %f482;\n" -" sub.ftz.f32 %f863, %f861, %f862;\n" -" fma.rn.ftz.f32 %f864, %f824, %f483, %f863;\n" -" mul.ftz.f32 %f865, %f470, %f818;\n" -" fma.rn.ftz.f32 %f866, %f479, %f865, %f864;\n" -" mul.ftz.f32 %f867, %f469, %f818;\n" -" mul.ftz.f32 %f868, %f478, %f867;\n" -" sub.ftz.f32 %f869, %f866, %f868;\n" -" .loc 17 335 0\n" -" add.ftz.f32 %f870, %f616, %f616;\n" -" div.approx.ftz.f32 %f871, %f622, %f870;\n" -" add.ftz.f32 %f872, %f778, %f779;\n" -" mul.ftz.f32 %f873, %f871, %f872;\n" -" .loc 17 336 0\n" -" add.ftz.f32 %f874, %f494, %f494;\n" -" div.approx.ftz.f32 %f875, %f622, %f874;\n" -" mul.ftz.f32 %f876, %f869, %f875;\n" -" sub.ftz.f32 %f877, %f873, %f876;\n" -" .loc 17 337 0\n" -" mul.ftz.f32 %f878, %f676, %f778;\n" -" fma.rn.ftz.f32 %f879, %f779, %f678, %f878;\n" -" sub.ftz.f32 %f880, %f877, %f879;\n" -" .loc 17 340 0\n" -" mul.ftz.f32 %f881, %f429, %f429;\n" -" mov.f32 %f882, 0f40400000; \n" -" mul.ftz.f32 %f883, %f495, %f882;\n" -" mov.f32 %f884, 0f40800000; \n" -" mul.ftz.f32 %f885, %f583, %f884;\n" -" mul.ftz.f32 %f886, %f881, %f429;\n" -" mov.f32 %f887, 0f3f000000; \n" -" mul.ftz.f32 %f888, %f886, %f887;\n" -" mul.ftz.f32 %f889, %f888, %f419;\n" -" mul.ftz.f32 %f890, %f888, %f423;\n" -" mul.ftz.f32 %f891, %f888, %f416;\n" -" mov.f32 %f892, 0f40800000; \n" -" mul.ftz.f32 %f893, %f586, %f892;\n" -" mul.ftz.f32 %f894, %f889, %f748;\n" -" mul.ftz.f32 %f895, %f893, %f748;\n" -" fma.rn.ftz.f32 %f896, %f890, %f745, %f894;\n" -" fma.rn.ftz.f32 %f897, %f680, %f745, %f895;\n" -" mul.ftz.f32 %f898, %f883, %f705;\n" -" fma.rn.ftz.f32 %f899, %f891, %f751, %f896;\n" -" fma.rn.ftz.f32 %f900, %f885, %f751, %f897;\n" -" add.ftz.f32 %f901, %f899, %f283;\n" -" mul.ftz.f32 %f902, %f622, %f900;\n" -" fma.rn.ftz.f32 %f903, %f594, %f880, %f902;\n" -" mul.ftz.f32 %f904, %f901, %f708;\n" -" mul.ftz.f32 %f905, %f898, %f903;\n" -" sub.ftz.f32 %f906, %f905, %f904;\n" -" .loc 17 341 0\n" -" mov.f32 %f907, 0f3f4db6db; \n" -" mul.ftz.f32 %f908, %f495, %f907;\n" -" mul.ftz.f32 %f909, %f908, %f730;\n" -" mul.ftz.f32 %f910, %f901, %f732;\n" -" mul.ftz.f32 %f911, %f909, %f903;\n" -" sub.ftz.f32 %f912, %f911, %f910;\n" -" .loc 17 344 0\n" -" mul.ftz.f32 %f913, %f657, %f912;\n" -" fma.rn.ftz.f32 %f914, %f906, %f654, %f913;\n" -" add.ftz.f32 %f270, %f914, %f270;\n" -" @!%p3 bra $Lt_0_56322;\n" -" .loc 17 346 0\n" -" mov.f32 %f915, %f19;\n" -" mul.ftz.f32 %f916, %f277, %f914;\n" -" sub.ftz.f32 %f917, %f915, %f916;\n" -" mov.f32 %f19, %f917;\n" -"$Lt_0_56322:\n" -" .loc 17 314 0\n" -" mov.f32 %f918, %f737;\n" -" .loc 17 315 0\n" -" mul.ftz.f32 %f919, %f284, %f284;\n" -" neg.ftz.f32 %f920, %f919;\n" -" mov.f32 %f921, %f920;\n" -" .loc 17 316 0\n" -" mul.ftz.f32 %f922, %f358, %f284;\n" -" neg.ftz.f32 %f923, %f922;\n" -" mov.f32 %f924, %f923;\n" -" .loc 17 317 0\n" -" mov.f32 %f925, 0f3f800000; \n" -" sub.ftz.f32 %f926, %f925, %f919;\n" -" mov.f32 %f927, %f926;\n" -" .loc 17 318 0\n" -" mov.f32 %f928, %f918;\n" -" mul.ftz.f32 %f929, %f928, %f282;\n" -" mov.f32 %f930, %f929;\n" -" .loc 17 319 0\n" -" mul.ftz.f32 %f931, %f282, %f926;\n" -" mov.f32 %f932, %f931;\n" -" .loc 17 320 0\n" -" mov.f32 %f933, %f924;\n" -" mul.ftz.f32 %f934, %f933, %f282;\n" -" mov.f32 %f935, %f934;\n" -" .loc 17 325 0\n" -" mul.ftz.f32 %f936, %f326, %f931;\n" -" mul.ftz.f32 %f937, %f323, %f931;\n" -" mul.ftz.f32 %f938, %f325, %f931;\n" -" mul.ftz.f32 %f939, %f89, %f931;\n" -" mul.ftz.f32 %f940, %f71, %f931;\n" -" mul.ftz.f32 %f941, %f79, %f931;\n" -" fma.rn.ftz.f32 %f942, %f929, %f308, %f936;\n" -" fma.rn.ftz.f32 %f943, %f312, %f929, %f937;\n" -" fma.rn.ftz.f32 %f944, %f929, %f310, %f938;\n" -" fma.rn.ftz.f32 %f945, %f929, %f70, %f939;\n" -" fma.rn.ftz.f32 %f946, %f87, %f929, %f940;\n" -" fma.rn.ftz.f32 %f947, %f929, %f76, %f941;\n" -" fma.rn.ftz.f32 %f948, %f934, %f334, %f942;\n" -" fma.rn.ftz.f32 %f949, %f333, %f934, %f943;\n" -" fma.rn.ftz.f32 %f950, %f934, %f335, %f944;\n" -" fma.rn.ftz.f32 %f951, %f934, %f80, %f945;\n" -" fma.rn.ftz.f32 %f952, %f77, %f934, %f946;\n" -" fma.rn.ftz.f32 %f953, %f934, %f91, %f947;\n" -" mul.ftz.f32 %f954, %f948, %f670;\n" -" mul.ftz.f32 %f955, %f951, %f663;\n" -" fma.rn.ftz.f32 %f956, %f668, %f949, %f954;\n" -" fma.rn.ftz.f32 %f957, %f661, %f952, %f955;\n" -" fma.rn.ftz.f32 %f958, %f672, %f950, %f956;\n" -" fma.rn.ftz.f32 %f959, %f665, %f953, %f957;\n" -" mul.ftz.f32 %f960, %f760, %f958;\n" -" mul.ftz.f32 %f961, %f761, %f959;\n" -" sub.ftz.f32 %f962, %f961, %f960;\n" -" .loc 17 326 0\n" -" mul.ftz.f32 %f963, %f784, %f958;\n" -" mul.ftz.f32 %f964, %f785, %f959;\n" -" sub.ftz.f32 %f965, %f964, %f963;\n" -" .loc 17 327 0\n" -" mul.ftz.f32 %f966, %f790, %f958;\n" -" mul.ftz.f32 %f967, %f791, %f959;\n" -" sub.ftz.f32 %f968, %f967, %f966;\n" -" .loc 17 328 0\n" -" mul.ftz.f32 %f969, %f796, %f958;\n" -" mul.ftz.f32 %f970, %f797, %f959;\n" -" sub.ftz.f32 %f971, %f970, %f969;\n" -" .loc 17 329 0\n" -" mul.ftz.f32 %f972, %f802, %f958;\n" -" mul.ftz.f32 %f973, %f803, %f959;\n" -" sub.ftz.f32 %f974, %f973, %f972;\n" -" .loc 17 330 0\n" -" mul.ftz.f32 %f975, %f808, %f958;\n" -" mul.ftz.f32 %f976, %f809, %f959;\n" -" sub.ftz.f32 %f977, %f976, %f975;\n" -" .loc 17 331 0\n" -" mul.ftz.f32 %f978, %f814, %f958;\n" -" mul.ftz.f32 %f979, %f815, %f959;\n" -" sub.ftz.f32 %f980, %f979, %f978;\n" -" .loc 17 332 0\n" -" mul.ftz.f32 %f981, %f820, %f958;\n" -" mul.ftz.f32 %f982, %f821, %f959;\n" -" sub.ftz.f32 %f983, %f982, %f981;\n" -" .loc 17 333 0\n" -" mul.ftz.f32 %f984, %f826, %f958;\n" -" mul.ftz.f32 %f985, %f827, %f959;\n" -" sub.ftz.f32 %f986, %f985, %f984;\n" -" .loc 17 334 0\n" -" mul.ftz.f32 %f987, %f479, %f962;\n" -" mul.ftz.f32 %f988, %f475, %f987;\n" -" mul.ftz.f32 %f989, %f478, %f962;\n" -" mul.ftz.f32 %f990, %f476, %f989;\n" -" sub.ftz.f32 %f991, %f990, %f988;\n" -" mul.ftz.f32 %f992, %f477, %f965;\n" -" mul.ftz.f32 %f993, %f476, %f992;\n" -" sub.ftz.f32 %f994, %f991, %f993;\n" -" mul.ftz.f32 %f995, %f477, %f968;\n" -" fma.rn.ftz.f32 %f996, %f475, %f995, %f994;\n" -" mul.ftz.f32 %f997, %f474, %f965;\n" -" fma.rn.ftz.f32 %f998, %f479, %f997, %f996;\n" -" mul.ftz.f32 %f999, %f474, %f968;\n" -" mul.ftz.f32 %f1000, %f478, %f999;\n" -" sub.ftz.f32 %f1001, %f998, %f1000;\n" -" mul.ftz.f32 %f1002, %f468, %f974;\n" -" fma.rn.ftz.f32 %f1003, %f476, %f1002, %f1001;\n" -" mul.ftz.f32 %f1004, %f468, %f977;\n" -" mul.ftz.f32 %f1005, %f475, %f1004;\n" -" sub.ftz.f32 %f1006, %f1003, %f1005;\n" -" mul.ftz.f32 %f1007, %f470, %f971;\n" -" mul.ftz.f32 %f1008, %f476, %f1007;\n" -" sub.ftz.f32 %f1009, %f1006, %f1008;\n" -" mul.ftz.f32 %f1010, %f469, %f971;\n" -" fma.rn.ftz.f32 %f1011, %f475, %f1010, %f1009;\n" -" fma.rn.ftz.f32 %f1012, %f977, %f480, %f1011;\n" -" mul.ftz.f32 %f1013, %f974, %f481;\n" -" sub.ftz.f32 %f1014, %f1012, %f1013;\n" -" fma.rn.ftz.f32 %f1015, %f986, %f484, %f1014;\n" -" mul.ftz.f32 %f1016, %f983, %f485;\n" -" sub.ftz.f32 %f1017, %f1015, %f1016;\n" -" mul.ftz.f32 %f1018, %f986, %f482;\n" -" sub.ftz.f32 %f1019, %f1017, %f1018;\n" -" fma.rn.ftz.f32 %f1020, %f983, %f483, %f1019;\n" -" mul.ftz.f32 %f1021, %f470, %f980;\n" -" fma.rn.ftz.f32 %f1022, %f479, %f1021, %f1020;\n" -" mul.ftz.f32 %f1023, %f469, %f980;\n" -" mul.ftz.f32 %f1024, %f478, %f1023;\n" -" sub.ftz.f32 %f1025, %f1022, %f1024;\n" -" .loc 17 335 0\n" -" add.ftz.f32 %f1026, %f958, %f959;\n" -" mul.ftz.f32 %f1027, %f871, %f1026;\n" -" .loc 17 336 0\n" -" mul.ftz.f32 %f1028, %f1025, %f875;\n" -" sub.ftz.f32 %f1029, %f1027, %f1028;\n" -" .loc 17 337 0\n" -" mul.ftz.f32 %f1030, %f676, %f958;\n" -" fma.rn.ftz.f32 %f1031, %f959, %f678, %f1030;\n" -" sub.ftz.f32 %f1032, %f1029, %f1031;\n" -" .loc 17 340 0\n" -" mul.ftz.f32 %f1033, %f889, %f931;\n" -" mul.ftz.f32 %f1034, %f893, %f931;\n" -" fma.rn.ftz.f32 %f1035, %f890, %f929, %f1033;\n" -" fma.rn.ftz.f32 %f1036, %f680, %f929, %f1034;\n" -" fma.rn.ftz.f32 %f1037, %f891, %f934, %f1035;\n" -" fma.rn.ftz.f32 %f1038, %f885, %f934, %f1036;\n" -" add.ftz.f32 %f1039, %f1037, %f284;\n" -" mul.ftz.f32 %f1040, %f622, %f1038;\n" -" fma.rn.ftz.f32 %f1041, %f594, %f1032, %f1040;\n" -" mul.ftz.f32 %f1042, %f1039, %f708;\n" -" mul.ftz.f32 %f1043, %f898, %f1041;\n" -" sub.ftz.f32 %f1044, %f1043, %f1042;\n" -" .loc 17 341 0\n" -" mul.ftz.f32 %f1045, %f1039, %f732;\n" -" mul.ftz.f32 %f1046, %f909, %f1041;\n" -" sub.ftz.f32 %f1047, %f1046, %f1045;\n" -" .loc 17 348 0\n" -" mul.ftz.f32 %f1048, %f657, %f1047;\n" -" fma.rn.ftz.f32 %f914, %f1044, %f654, %f1048;\n" -" add.ftz.f32 %f269, %f914, %f269;\n" -" @!%p3 bra $Lt_0_59906;\n" -" .loc 17 350 0\n" -" mov.f32 %f1049, %f21;\n" -" mul.ftz.f32 %f1050, %f276, %f914;\n" -" sub.ftz.f32 %f1051, %f1049, %f1050;\n" -" mov.f32 %f21, %f1051;\n" -" .loc 17 351 0\n" -" mov.f32 %f1052, %f25;\n" -" mul.ftz.f32 %f1053, %f277, %f914;\n" -" sub.ftz.f32 %f1054, %f1052, %f1053;\n" -" mov.f32 %f25, %f1054;\n" -"$Lt_0_59906:\n" -" .loc 17 314 0\n" -" mov.f32 %f1055, %f740;\n" -" .loc 17 315 0\n" -" mov.f32 %f1056, %f923;\n" -" .loc 17 316 0\n" -" mul.ftz.f32 %f1057, %f358, %f358;\n" -" neg.ftz.f32 %f1058, %f1057;\n" -" mov.f32 %f1059, %f1058;\n" -" .loc 17 317 0\n" -" mov.f32 %f1060, 0f3f800000; \n" -" sub.ftz.f32 %f1061, %f1060, %f1057;\n" -" mov.f32 %f1062, %f1061;\n" -" .loc 17 318 0\n" -" mov.f32 %f1063, %f1055;\n" -" mul.ftz.f32 %f1064, %f1063, %f282;\n" -" mov.f32 %f1065, %f1064;\n" -" .loc 17 319 0\n" -" mov.f32 %f1066, %f1056;\n" -" mul.ftz.f32 %f1067, %f1066, %f282;\n" -" mov.f32 %f1068, %f1067;\n" -" .loc 17 320 0\n" -" mul.ftz.f32 %f1069, %f282, %f1061;\n" -" mov.f32 %f1070, %f1069;\n" -" .loc 17 325 0\n" -" mul.ftz.f32 %f1071, %f71, %f1067;\n" -" mul.ftz.f32 %f1072, %f79, %f1067;\n" -" mul.ftz.f32 %f1073, %f323, %f1067;\n" -" mul.ftz.f32 %f1074, %f325, %f1067;\n" -" fma.rn.ftz.f32 %f1075, %f87, %f1064, %f1071;\n" -" mul.ftz.f32 %f1076, %f89, %f1067;\n" -" fma.rn.ftz.f32 %f1077, %f1064, %f76, %f1072;\n" -" fma.rn.ftz.f32 %f1078, %f312, %f1064, %f1073;\n" -" mul.ftz.f32 %f1079, %f326, %f1067;\n" -" fma.rn.ftz.f32 %f1080, %f1064, %f310, %f1074;\n" -" fma.rn.ftz.f32 %f1081, %f1064, %f70, %f1076;\n" -" fma.rn.ftz.f32 %f1082, %f1064, %f308, %f1079;\n" -" fma.rn.ftz.f32 %f1083, %f1069, %f334, %f1082;\n" -" fma.rn.ftz.f32 %f1084, %f333, %f1069, %f1078;\n" -" fma.rn.ftz.f32 %f1085, %f1069, %f335, %f1080;\n" -" fma.rn.ftz.f32 %f1086, %f1069, %f80, %f1081;\n" -" fma.rn.ftz.f32 %f1087, %f77, %f1069, %f1075;\n" -" fma.rn.ftz.f32 %f1088, %f1069, %f91, %f1077;\n" -" mul.ftz.f32 %f1089, %f1083, %f670;\n" -" mul.ftz.f32 %f1090, %f1086, %f663;\n" -" fma.rn.ftz.f32 %f1091, %f668, %f1084, %f1089;\n" -" fma.rn.ftz.f32 %f1092, %f661, %f1087, %f1090;\n" -" fma.rn.ftz.f32 %f1093, %f672, %f1085, %f1091;\n" -" fma.rn.ftz.f32 %f1094, %f665, %f1088, %f1092;\n" -" mul.ftz.f32 %f1095, %f760, %f1093;\n" -" mul.ftz.f32 %f1096, %f761, %f1094;\n" -" sub.ftz.f32 %f1097, %f1096, %f1095;\n" -" .loc 17 326 0\n" -" mul.ftz.f32 %f1098, %f784, %f1093;\n" -" mul.ftz.f32 %f1099, %f785, %f1094;\n" -" sub.ftz.f32 %f1100, %f1099, %f1098;\n" -" .loc 17 327 0\n" -" mul.ftz.f32 %f1101, %f790, %f1093;\n" -" mul.ftz.f32 %f1102, %f791, %f1094;\n" -" sub.ftz.f32 %f1103, %f1102, %f1101;\n" -" .loc 17 328 0\n" -" mul.ftz.f32 %f1104, %f796, %f1093;\n" -" mul.ftz.f32 %f1105, %f797, %f1094;\n" -" sub.ftz.f32 %f1106, %f1105, %f1104;\n" -" .loc 17 329 0\n" -" mul.ftz.f32 %f1107, %f802, %f1093;\n" -" mul.ftz.f32 %f1108, %f803, %f1094;\n" -" sub.ftz.f32 %f1109, %f1108, %f1107;\n" -" .loc 17 330 0\n" -" mul.ftz.f32 %f1110, %f808, %f1093;\n" -" mul.ftz.f32 %f1111, %f809, %f1094;\n" -" sub.ftz.f32 %f1112, %f1111, %f1110;\n" -" .loc 17 331 0\n" -" mul.ftz.f32 %f1113, %f814, %f1093;\n" -" mul.ftz.f32 %f1114, %f815, %f1094;\n" -" sub.ftz.f32 %f1115, %f1114, %f1113;\n" -" .loc 17 332 0\n" -" mul.ftz.f32 %f1116, %f820, %f1093;\n" -" mul.ftz.f32 %f1117, %f821, %f1094;\n" -" sub.ftz.f32 %f1118, %f1117, %f1116;\n" -" .loc 17 333 0\n" -" mul.ftz.f32 %f1119, %f826, %f1093;\n" -" mul.ftz.f32 %f1120, %f827, %f1094;\n" -" sub.ftz.f32 %f1121, %f1120, %f1119;\n" -" .loc 17 334 0\n" -" mul.ftz.f32 %f1122, %f479, %f1097;\n" -" mul.ftz.f32 %f1123, %f475, %f1122;\n" -" mul.ftz.f32 %f1124, %f478, %f1097;\n" -" mul.ftz.f32 %f1125, %f476, %f1124;\n" -" sub.ftz.f32 %f1126, %f1125, %f1123;\n" -" mul.ftz.f32 %f1127, %f477, %f1100;\n" -" mul.ftz.f32 %f1128, %f476, %f1127;\n" -" sub.ftz.f32 %f1129, %f1126, %f1128;\n" -" mul.ftz.f32 %f1130, %f477, %f1103;\n" -" fma.rn.ftz.f32 %f1131, %f475, %f1130, %f1129;\n" -" mul.ftz.f32 %f1132, %f474, %f1100;\n" -" fma.rn.ftz.f32 %f1133, %f479, %f1132, %f1131;\n" -" mul.ftz.f32 %f1134, %f474, %f1103;\n" -" mul.ftz.f32 %f1135, %f478, %f1134;\n" -" sub.ftz.f32 %f1136, %f1133, %f1135;\n" -" mul.ftz.f32 %f1137, %f468, %f1109;\n" -" fma.rn.ftz.f32 %f1138, %f476, %f1137, %f1136;\n" -" mul.ftz.f32 %f1139, %f468, %f1112;\n" -" mul.ftz.f32 %f1140, %f475, %f1139;\n" -" sub.ftz.f32 %f1141, %f1138, %f1140;\n" -" mul.ftz.f32 %f1142, %f470, %f1106;\n" -" mul.ftz.f32 %f1143, %f476, %f1142;\n" -" sub.ftz.f32 %f1144, %f1141, %f1143;\n" -" mul.ftz.f32 %f1145, %f469, %f1106;\n" -" fma.rn.ftz.f32 %f1146, %f475, %f1145, %f1144;\n" -" fma.rn.ftz.f32 %f1147, %f1112, %f480, %f1146;\n" -" mul.ftz.f32 %f1148, %f1109, %f481;\n" -" sub.ftz.f32 %f1149, %f1147, %f1148;\n" -" fma.rn.ftz.f32 %f1150, %f1121, %f484, %f1149;\n" -" mul.ftz.f32 %f1151, %f1118, %f485;\n" -" sub.ftz.f32 %f1152, %f1150, %f1151;\n" -" mul.ftz.f32 %f1153, %f1121, %f482;\n" -" sub.ftz.f32 %f1154, %f1152, %f1153;\n" -" fma.rn.ftz.f32 %f1155, %f1118, %f483, %f1154;\n" -" mul.ftz.f32 %f1156, %f470, %f1115;\n" -" fma.rn.ftz.f32 %f1157, %f479, %f1156, %f1155;\n" -" mul.ftz.f32 %f1158, %f469, %f1115;\n" -" mul.ftz.f32 %f1159, %f478, %f1158;\n" -" sub.ftz.f32 %f1160, %f1157, %f1159;\n" -" .loc 17 335 0\n" -" add.ftz.f32 %f1161, %f1093, %f1094;\n" -" mul.ftz.f32 %f1162, %f871, %f1161;\n" -" .loc 17 336 0\n" -" mul.ftz.f32 %f1163, %f1160, %f875;\n" -" sub.ftz.f32 %f1164, %f1162, %f1163;\n" -" .loc 17 337 0\n" -" mul.ftz.f32 %f1165, %f676, %f1093;\n" -" fma.rn.ftz.f32 %f1166, %f1094, %f678, %f1165;\n" -" sub.ftz.f32 %f1167, %f1164, %f1166;\n" -" .loc 17 340 0\n" -" mul.ftz.f32 %f1168, %f889, %f1067;\n" -" mul.ftz.f32 %f1169, %f893, %f1067;\n" -" fma.rn.ftz.f32 %f1170, %f890, %f1064, %f1168;\n" -" fma.rn.ftz.f32 %f1171, %f680, %f1064, %f1169;\n" -" fma.rn.ftz.f32 %f1172, %f891, %f1069, %f1170;\n" -" fma.rn.ftz.f32 %f1173, %f885, %f1069, %f1171;\n" -" add.ftz.f32 %f1174, %f1172, %f358;\n" -" mul.ftz.f32 %f1175, %f622, %f1173;\n" -" fma.rn.ftz.f32 %f1176, %f594, %f1167, %f1175;\n" -" mul.ftz.f32 %f1177, %f1174, %f708;\n" -" mul.ftz.f32 %f1178, %f898, %f1176;\n" -" sub.ftz.f32 %f1179, %f1178, %f1177;\n" -" .loc 17 341 0\n" -" mul.ftz.f32 %f1180, %f1174, %f732;\n" -" mul.ftz.f32 %f1181, %f909, %f1176;\n" -" sub.ftz.f32 %f1182, %f1181, %f1180;\n" -" .loc 17 354 0\n" -" mul.ftz.f32 %f1183, %f657, %f1182;\n" -" fma.rn.ftz.f32 %f914, %f1179, %f654, %f1183;\n" -" add.ftz.f32 %f268, %f914, %f268;\n" -" @!%p3 bra $Lt_0_62978;\n" -" .loc 17 356 0\n" -" mov.f32 %f1184, %f23;\n" -" mul.ftz.f32 %f1185, %f278, %f914;\n" -" sub.ftz.f32 %f1186, %f1184, %f1185;\n" -" mov.f32 %f23, %f1186;\n" -" .loc 17 357 0\n" -" mov.f32 %f1187, %f27;\n" -" mul.ftz.f32 %f1188, %f277, %f914;\n" -" sub.ftz.f32 %f1189, %f1187, %f1188;\n" -" mov.f32 %f27, %f1189;\n" -" .loc 17 358 0\n" -" mul.ftz.f32 %f1190, %f276, %f914;\n" -" sub.ftz.f32 %f28, %f28, %f1190;\n" -" mov.f32 %f29, %f28;\n" -"$Lt_0_62978:\n" -" .loc 17 381 0\n" -" mul.ftz.f32 %f1191, %f80, %f284;\n" -" mul.ftz.f32 %f1192, %f78, %f284;\n" -" mul.ftz.f32 %f1193, %f91, %f284;\n" -" neg.ftz.f32 %f1194, %f1191;\n" -" mov.f32 %f1195, 0f00000000; \n" -" fma.rn.ftz.f32 %f1196, %f1195, %f283, %f1192;\n" -" neg.ftz.f32 %f1197, %f1193;\n" -" mov.f32 %f1198, 0f00000000; \n" -" fma.rn.ftz.f32 %f1199, %f283, %f1198, %f1194;\n" -" fma.rn.ftz.f32 %f1200, %f71, %f358, %f1196;\n" -" mov.f32 %f1201, 0f00000000; \n" -" fma.rn.ftz.f32 %f1202, %f283, %f1201, %f1197;\n" -" fma.rn.ftz.f32 %f1203, %f358, %f89, %f1199;\n" -" fma.rn.ftz.f32 %f1204, %f358, %f79, %f1202;\n" -" mul.ftz.f32 %f1205, %f1203, %f662;\n" -" neg.ftz.f32 %f1206, %f1205;\n" -" fma.rn.ftz.f32 %f1207, %f661, %f1200, %f1206;\n" -" fma.rn.ftz.f32 %f1208, %f665, %f1204, %f1207;\n" -" mul.ftz.f32 %f1209, %f759, %f1208;\n" -" mul.ftz.f32 %f1210, %f783, %f1208;\n" -" mul.ftz.f32 %f1211, %f789, %f1208;\n" -" mul.ftz.f32 %f1212, %f801, %f1208;\n" -" mul.ftz.f32 %f1213, %f807, %f1208;\n" -" mul.ftz.f32 %f1214, %f795, %f1208;\n" -" mul.ftz.f32 %f1215, %f825, %f1208;\n" -" mul.ftz.f32 %f1216, %f819, %f1208;\n" -" mul.ftz.f32 %f1217, %f813, %f1208;\n" -" neg.ftz.f32 %f1218, %f1209;\n" -" neg.ftz.f32 %f1219, %f1210;\n" -" neg.ftz.f32 %f1220, %f1211;\n" -" neg.ftz.f32 %f1221, %f1212;\n" -" neg.ftz.f32 %f1222, %f1213;\n" -" neg.ftz.f32 %f1223, %f1214;\n" -" neg.ftz.f32 %f1224, %f1215;\n" -" neg.ftz.f32 %f1225, %f1216;\n" -" neg.ftz.f32 %f1226, %f1217;\n" -" fma.rn.ftz.f32 %f1227, %f225, %f445, %f1218;\n" -" fma.rn.ftz.f32 %f1228, %f251, %f445, %f1219;\n" -" fma.rn.ftz.f32 %f1229, %f227, %f445, %f1220;\n" -" fma.rn.ftz.f32 %f1230, %f264, %f445, %f1221;\n" -" fma.rn.ftz.f32 %f1231, %f262, %f445, %f1222;\n" -" fma.rn.ftz.f32 %f1232, %f252, %f445, %f1223;\n" -" fma.rn.ftz.f32 %f1233, %f246, %f445, %f1224;\n" -" fma.rn.ftz.f32 %f1234, %f263, %f445, %f1225;\n" -" fma.rn.ftz.f32 %f1235, %f227, %f445, %f1226;\n" -" mul.ftz.f32 %f1236, %f479, %f1227;\n" -" mul.ftz.f32 %f1237, %f475, %f1236;\n" -" mul.ftz.f32 %f1238, %f478, %f1227;\n" -" mul.ftz.f32 %f1239, %f476, %f1238;\n" -" sub.ftz.f32 %f1240, %f1239, %f1237;\n" -" mul.ftz.f32 %f1241, %f477, %f1228;\n" -" mul.ftz.f32 %f1242, %f476, %f1241;\n" -" sub.ftz.f32 %f1243, %f1240, %f1242;\n" -" mul.ftz.f32 %f1244, %f477, %f1229;\n" -" fma.rn.ftz.f32 %f1245, %f475, %f1244, %f1243;\n" -" mul.ftz.f32 %f1246, %f474, %f1228;\n" -" fma.rn.ftz.f32 %f1247, %f479, %f1246, %f1245;\n" -" mul.ftz.f32 %f1248, %f474, %f1229;\n" -" mul.ftz.f32 %f1249, %f478, %f1248;\n" -" sub.ftz.f32 %f1250, %f1247, %f1249;\n" -" mul.ftz.f32 %f1251, %f468, %f1230;\n" -" fma.rn.ftz.f32 %f1252, %f476, %f1251, %f1250;\n" -" mul.ftz.f32 %f1253, %f468, %f1231;\n" -" mul.ftz.f32 %f1254, %f475, %f1253;\n" -" sub.ftz.f32 %f1255, %f1252, %f1254;\n" -" mul.ftz.f32 %f1256, %f470, %f1232;\n" -" mul.ftz.f32 %f1257, %f476, %f1256;\n" -" sub.ftz.f32 %f1258, %f1255, %f1257;\n" -" mul.ftz.f32 %f1259, %f469, %f1232;\n" -" fma.rn.ftz.f32 %f1260, %f475, %f1259, %f1258;\n" -" fma.rn.ftz.f32 %f1261, %f1231, %f480, %f1260;\n" -" mul.ftz.f32 %f1262, %f1230, %f481;\n" -" sub.ftz.f32 %f1263, %f1261, %f1262;\n" -" fma.rn.ftz.f32 %f1264, %f1233, %f484, %f1263;\n" -" mul.ftz.f32 %f1265, %f1234, %f485;\n" -" sub.ftz.f32 %f1266, %f1264, %f1265;\n" -" mul.ftz.f32 %f1267, %f1233, %f482;\n" -" sub.ftz.f32 %f1268, %f1266, %f1267;\n" -" fma.rn.ftz.f32 %f1269, %f1234, %f483, %f1268;\n" -" mul.ftz.f32 %f1270, %f470, %f1235;\n" -" fma.rn.ftz.f32 %f1271, %f479, %f1270, %f1269;\n" -" mul.ftz.f32 %f1272, %f469, %f1235;\n" -" mul.ftz.f32 %f1273, %f478, %f1272;\n" -" sub.ftz.f32 %f1274, %f1271, %f1273;\n" -" .loc 17 392 0\n" -" mul.ftz.f32 %f1275, %f80, %f586;\n" -" mul.ftz.f32 %f1276, %f78, %f586;\n" -" mul.ftz.f32 %f1277, %f91, %f586;\n" -" mul.ftz.f32 %f1278, %f117, %f893;\n" -" mul.ftz.f32 %f1279, %f85, %f893;\n" -" mul.ftz.f32 %f1280, %f98, %f893;\n" -" neg.ftz.f32 %f1281, %f1275;\n" -" neg.ftz.f32 %f1282, %f1277;\n" -" mul.ftz.f32 %f1283, %f875, %f1274;\n" -" mul.ftz.f32 %f1284, %f889, %f224;\n" -" mul.ftz.f32 %f1285, %f889, %f248;\n" -" mov.f32 %f1286, 0f00000000; \n" -" fma.rn.ftz.f32 %f1287, %f591, %f1286, %f1281;\n" -" mov.f32 %f1288, 0f00000000; \n" -" fma.rn.ftz.f32 %f1289, %f1288, %f591, %f1276;\n" -" mov.f32 %f1290, 0f00000000; \n" -" fma.rn.ftz.f32 %f1291, %f591, %f1290, %f1282;\n" -" fma.rn.ftz.f32 %f1292, %f188, %f890, %f1284;\n" -" mul.ftz.f32 %f1293, %f889, %f257;\n" -" fma.rn.ftz.f32 %f1294, %f890, %f169, %f1285;\n" -" fma.rn.ftz.f32 %f1295, %f680, %f83, %f1278;\n" -" fma.rn.ftz.f32 %f1296, %f113, %f680, %f1279;\n" -" fma.rn.ftz.f32 %f1297, %f680, %f93, %f1280;\n" -" fma.rn.ftz.f32 %f1298, %f583, %f89, %f1287;\n" -" fma.rn.ftz.f32 %f1299, %f71, %f583, %f1289;\n" -" fma.rn.ftz.f32 %f1300, %f583, %f79, %f1291;\n" -" fma.rn.ftz.f32 %f1301, %f196, %f891, %f1292;\n" -" fma.rn.ftz.f32 %f1302, %f890, %f140, %f1293;\n" -" fma.rn.ftz.f32 %f1303, %f891, %f214, %f1294;\n" -" fma.rn.ftz.f32 %f1304, %f885, %f100, %f1295;\n" -" fma.rn.ftz.f32 %f1305, %f885, %f96, %f1296;\n" -" fma.rn.ftz.f32 %f1306, %f885, %f120, %f1297;\n" -" fma.rn.ftz.f32 %f1307, %f891, %f230, %f1302;\n" -" mul.ftz.f32 %f1308, %f1298, %f1304;\n" -" mul.ftz.f32 %f1309, %f1307, %f419;\n" -" fma.rn.ftz.f32 %f1310, %f1305, %f1299, %f1308;\n" -" fma.rn.ftz.f32 %f1311, %f423, %f1301, %f1309;\n" -" fma.rn.ftz.f32 %f1312, %f1306, %f1300, %f1310;\n" -" fma.rn.ftz.f32 %f1313, %f416, %f1303, %f1311;\n" -" mul.ftz.f32 %f1314, %f622, %f1312;\n" -" mul.ftz.f32 %f1315, %f1208, %f871;\n" -" sub.ftz.f32 %f1316, %f1315, %f1283;\n" -" neg.ftz.f32 %f1317, %f1314;\n" -" mul.ftz.f32 %f1318, %f1208, %f678;\n" -" sub.ftz.f32 %f1319, %f1316, %f1318;\n" -" fma.rn.ftz.f32 %f1320, %f594, %f1319, %f1317;\n" -" mul.ftz.f32 %f1321, %f1313, %f732;\n" -" fma.rn.ftz.f32 %f1322, %f909, %f1320, %f1321;\n" -" mul.ftz.f32 %f1323, %f657, %f1322;\n" -" mul.ftz.f32 %f1324, %f1313, %f708;\n" -" fma.rn.ftz.f32 %f1325, %f898, %f1320, %f1324;\n" -" fma.rn.ftz.f32 %f1326, %f1325, %f654, %f1323;\n" -" sub.ftz.f32 %f267, %f267, %f1326;\n" -" .loc 17 407 0\n" -" mov.f32 %f1327, 0f00000000; \n" -" fma.rn.ftz.f32 %f1328, %f283, %f80, %f1327;\n" -" mov.f32 %f1329, 0f00000000; \n" -" fma.rn.ftz.f32 %f1330, %f77, %f283, %f1329;\n" -" mov.f32 %f1331, 0f00000000; \n" -" fma.rn.ftz.f32 %f1332, %f283, %f91, %f1331;\n" -" fma.rn.ftz.f32 %f1333, %f358, %f72, %f1328;\n" -" fma.rn.ftz.f32 %f1334, %f88, %f358, %f1330;\n" -" fma.rn.ftz.f32 %f1335, %f358, %f94, %f1332;\n" -" mul.ftz.f32 %f1336, %f1333, %f662;\n" -" neg.ftz.f32 %f1337, %f1336;\n" -" fma.rn.ftz.f32 %f1338, %f661, %f1334, %f1337;\n" -" fma.rn.ftz.f32 %f1339, %f665, %f1335, %f1338;\n" -" mul.ftz.f32 %f1340, %f759, %f1339;\n" -" mul.ftz.f32 %f1341, %f783, %f1339;\n" -" mul.ftz.f32 %f1342, %f789, %f1339;\n" -" mul.ftz.f32 %f1343, %f801, %f1339;\n" -" mul.ftz.f32 %f1344, %f807, %f1339;\n" -" mul.ftz.f32 %f1345, %f795, %f1339;\n" -" mul.ftz.f32 %f1346, %f825, %f1339;\n" -" mul.ftz.f32 %f1347, %f819, %f1339;\n" -" mul.ftz.f32 %f1348, %f813, %f1339;\n" -" neg.ftz.f32 %f1349, %f1340;\n" -" neg.ftz.f32 %f1350, %f1341;\n" -" neg.ftz.f32 %f1351, %f1342;\n" -" neg.ftz.f32 %f1352, %f1343;\n" -" neg.ftz.f32 %f1353, %f1344;\n" -" neg.ftz.f32 %f1354, %f1345;\n" -" neg.ftz.f32 %f1355, %f1346;\n" -" neg.ftz.f32 %f1356, %f1347;\n" -" neg.ftz.f32 %f1357, %f1348;\n" -" fma.rn.ftz.f32 %f1358, %f226, %f445, %f1349;\n" -" fma.rn.ftz.f32 %f1359, %f255, %f445, %f1350;\n" -" fma.rn.ftz.f32 %f1360, %f247, %f445, %f1351;\n" -" fma.rn.ftz.f32 %f1361, %f184, %f445, %f1352;\n" -" fma.rn.ftz.f32 %f1362, %f258, %f445, %f1353;\n" -" fma.rn.ftz.f32 %f1363, %f254, %f445, %f1354;\n" -" fma.rn.ftz.f32 %f1364, %f249, %f445, %f1355;\n" -" fma.rn.ftz.f32 %f1365, %f259, %f445, %f1356;\n" -" fma.rn.ftz.f32 %f1366, %f247, %f445, %f1357;\n" -" mul.ftz.f32 %f1367, %f479, %f1358;\n" -" mul.ftz.f32 %f1368, %f475, %f1367;\n" -" mul.ftz.f32 %f1369, %f478, %f1358;\n" -" mul.ftz.f32 %f1370, %f476, %f1369;\n" -" sub.ftz.f32 %f1371, %f1370, %f1368;\n" -" mul.ftz.f32 %f1372, %f477, %f1359;\n" -" mul.ftz.f32 %f1373, %f476, %f1372;\n" -" sub.ftz.f32 %f1374, %f1371, %f1373;\n" -" mul.ftz.f32 %f1375, %f477, %f1360;\n" -" fma.rn.ftz.f32 %f1376, %f475, %f1375, %f1374;\n" -" mul.ftz.f32 %f1377, %f474, %f1359;\n" -" fma.rn.ftz.f32 %f1378, %f479, %f1377, %f1376;\n" -" mul.ftz.f32 %f1379, %f474, %f1360;\n" -" mul.ftz.f32 %f1380, %f478, %f1379;\n" -" sub.ftz.f32 %f1381, %f1378, %f1380;\n" -" mul.ftz.f32 %f1382, %f468, %f1361;\n" -" fma.rn.ftz.f32 %f1383, %f476, %f1382, %f1381;\n" -" mul.ftz.f32 %f1384, %f468, %f1362;\n" -" mul.ftz.f32 %f1385, %f475, %f1384;\n" -" sub.ftz.f32 %f1386, %f1383, %f1385;\n" -" mul.ftz.f32 %f1387, %f470, %f1363;\n" -" mul.ftz.f32 %f1388, %f476, %f1387;\n" -" sub.ftz.f32 %f1389, %f1386, %f1388;\n" -" mul.ftz.f32 %f1390, %f469, %f1363;\n" -" fma.rn.ftz.f32 %f1391, %f475, %f1390, %f1389;\n" -" fma.rn.ftz.f32 %f1392, %f1362, %f480, %f1391;\n" -" mul.ftz.f32 %f1393, %f1361, %f481;\n" -" sub.ftz.f32 %f1394, %f1392, %f1393;\n" -" fma.rn.ftz.f32 %f1395, %f1364, %f484, %f1394;\n" -" mul.ftz.f32 %f1396, %f1365, %f485;\n" -" sub.ftz.f32 %f1397, %f1395, %f1396;\n" -" mul.ftz.f32 %f1398, %f1364, %f482;\n" -" sub.ftz.f32 %f1399, %f1397, %f1398;\n" -" fma.rn.ftz.f32 %f1400, %f1365, %f483, %f1399;\n" -" mul.ftz.f32 %f1401, %f470, %f1366;\n" -" fma.rn.ftz.f32 %f1402, %f479, %f1401, %f1400;\n" -" mul.ftz.f32 %f1403, %f469, %f1366;\n" -" mul.ftz.f32 %f1404, %f478, %f1403;\n" -" sub.ftz.f32 %f1405, %f1402, %f1404;\n" -" .loc 17 418 0\n" -" mul.ftz.f32 %f1406, %f889, %f140;\n" -" mul.ftz.f32 %f1407, %f889, %f169;\n" -" mul.ftz.f32 %f1408, %f875, %f1405;\n" -" mul.ftz.f32 %f1409, %f889, %f190;\n" -" fma.rn.ftz.f32 %f1410, %f890, %f216, %f1407;\n" -" fma.rn.ftz.f32 %f1411, %f191, %f890, %f1409;\n" -" fma.rn.ftz.f32 %f1412, %f890, %f232, %f1406;\n" -" fma.rn.ftz.f32 %f1413, %f891, %f219, %f1410;\n" -" mov.f32 %f1414, 0f00000000; \n" -" fma.rn.ftz.f32 %f1415, %f591, %f80, %f1414;\n" -" mov.f32 %f1416, 0f00000000; \n" -" fma.rn.ftz.f32 %f1417, %f77, %f591, %f1416;\n" -" mov.f32 %f1418, 0f00000000; \n" -" fma.rn.ftz.f32 %f1419, %f591, %f91, %f1418;\n" -" fma.rn.ftz.f32 %f1420, %f192, %f891, %f1411;\n" -" fma.rn.ftz.f32 %f1421, %f891, %f236, %f1412;\n" -" fma.rn.ftz.f32 %f1422, %f583, %f72, %f1415;\n" -" fma.rn.ftz.f32 %f1423, %f88, %f583, %f1417;\n" -" fma.rn.ftz.f32 %f1424, %f583, %f94, %f1419;\n" -" mul.ftz.f32 %f1425, %f1421, %f419;\n" -" fma.rn.ftz.f32 %f1426, %f423, %f1420, %f1425;\n" -" mul.ftz.f32 %f1427, %f1422, %f1304;\n" -" fma.rn.ftz.f32 %f1428, %f416, %f1413, %f1426;\n" -" fma.rn.ftz.f32 %f1429, %f1305, %f1423, %f1427;\n" -" fma.rn.ftz.f32 %f1430, %f1306, %f1424, %f1429;\n" -" mul.ftz.f32 %f1431, %f622, %f1430;\n" -" mul.ftz.f32 %f1432, %f1339, %f871;\n" -" sub.ftz.f32 %f1433, %f1432, %f1408;\n" -" neg.ftz.f32 %f1434, %f1431;\n" -" mul.ftz.f32 %f1435, %f1339, %f678;\n" -" sub.ftz.f32 %f1436, %f1433, %f1435;\n" -" fma.rn.ftz.f32 %f1437, %f594, %f1436, %f1434;\n" -" mul.ftz.f32 %f1438, %f1428, %f732;\n" -" fma.rn.ftz.f32 %f1439, %f909, %f1437, %f1438;\n" -" mul.ftz.f32 %f1440, %f657, %f1439;\n" -" mul.ftz.f32 %f1441, %f1428, %f708;\n" -" fma.rn.ftz.f32 %f1442, %f898, %f1437, %f1441;\n" -" fma.rn.ftz.f32 %f1443, %f1442, %f654, %f1440;\n" -" sub.ftz.f32 %f266, %f266, %f1443;\n" -" .loc 17 433 0\n" -" mul.ftz.f32 %f1444, %f70, %f284;\n" -" mul.ftz.f32 %f1445, %f87, %f284;\n" -" mul.ftz.f32 %f1446, %f76, %f284;\n" -" fma.rn.ftz.f32 %f1447, %f283, %f90, %f1444;\n" -" fma.rn.ftz.f32 %f1448, %f86, %f283, %f1445;\n" -" fma.rn.ftz.f32 %f1449, %f283, %f81, %f1446;\n" -" mov.f32 %f1450, 0f00000000; \n" -" fma.rn.ftz.f32 %f1451, %f358, %f1450, %f1447;\n" -" mov.f32 %f1452, 0f00000000; \n" -" fma.rn.ftz.f32 %f1453, %f1452, %f358, %f1448;\n" -" mov.f32 %f1454, 0f00000000; \n" -" fma.rn.ftz.f32 %f1455, %f358, %f1454, %f1449;\n" -" mul.ftz.f32 %f1456, %f1451, %f662;\n" -" neg.ftz.f32 %f1457, %f1456;\n" -" fma.rn.ftz.f32 %f1458, %f661, %f1453, %f1457;\n" -" fma.rn.ftz.f32 %f1459, %f665, %f1455, %f1458;\n" -" mul.ftz.f32 %f1460, %f759, %f1459;\n" -" mul.ftz.f32 %f1461, %f783, %f1459;\n" -" mul.ftz.f32 %f1462, %f789, %f1459;\n" -" mul.ftz.f32 %f1463, %f801, %f1459;\n" -" mul.ftz.f32 %f1464, %f807, %f1459;\n" -" mul.ftz.f32 %f1465, %f795, %f1459;\n" -" mul.ftz.f32 %f1466, %f825, %f1459;\n" -" mul.ftz.f32 %f1467, %f819, %f1459;\n" -" mul.ftz.f32 %f1468, %f813, %f1459;\n" -" neg.ftz.f32 %f1469, %f1460;\n" -" neg.ftz.f32 %f1470, %f1461;\n" -" neg.ftz.f32 %f1471, %f1462;\n" -" neg.ftz.f32 %f1472, %f1463;\n" -" neg.ftz.f32 %f1473, %f1464;\n" -" neg.ftz.f32 %f1474, %f1465;\n" -" neg.ftz.f32 %f1475, %f1466;\n" -" neg.ftz.f32 %f1476, %f1467;\n" -" neg.ftz.f32 %f1477, %f1468;\n" -" fma.rn.ftz.f32 %f1478, %f243, %f445, %f1469;\n" -" fma.rn.ftz.f32 %f1479, %f260, %f445, %f1470;\n" -" fma.rn.ftz.f32 %f1480, %f250, %f445, %f1471;\n" -" fma.rn.ftz.f32 %f1481, %f253, %f445, %f1472;\n" -" fma.rn.ftz.f32 %f1482, %f244, %f445, %f1473;\n" -" fma.rn.ftz.f32 %f1483, %f261, %f445, %f1474;\n" -" fma.rn.ftz.f32 %f1484, %f212, %f445, %f1475;\n" -" fma.rn.ftz.f32 %f1485, %f245, %f445, %f1476;\n" -" fma.rn.ftz.f32 %f1486, %f250, %f445, %f1477;\n" -" mul.ftz.f32 %f1487, %f479, %f1478;\n" -" mul.ftz.f32 %f1488, %f475, %f1487;\n" -" mul.ftz.f32 %f1489, %f478, %f1478;\n" -" mul.ftz.f32 %f1490, %f476, %f1489;\n" -" sub.ftz.f32 %f1491, %f1490, %f1488;\n" -" mul.ftz.f32 %f1492, %f477, %f1479;\n" -" mul.ftz.f32 %f1493, %f476, %f1492;\n" -" sub.ftz.f32 %f1494, %f1491, %f1493;\n" -" mul.ftz.f32 %f1495, %f477, %f1480;\n" -" fma.rn.ftz.f32 %f1496, %f475, %f1495, %f1494;\n" -" mul.ftz.f32 %f1497, %f474, %f1479;\n" -" fma.rn.ftz.f32 %f1498, %f479, %f1497, %f1496;\n" -" mul.ftz.f32 %f1499, %f474, %f1480;\n" -" mul.ftz.f32 %f1500, %f478, %f1499;\n" -" sub.ftz.f32 %f1501, %f1498, %f1500;\n" -" mul.ftz.f32 %f1502, %f468, %f1481;\n" -" fma.rn.ftz.f32 %f1503, %f476, %f1502, %f1501;\n" -" mul.ftz.f32 %f1504, %f468, %f1482;\n" -" mul.ftz.f32 %f1505, %f475, %f1504;\n" -" sub.ftz.f32 %f1506, %f1503, %f1505;\n" -" mul.ftz.f32 %f1507, %f470, %f1483;\n" -" mul.ftz.f32 %f1508, %f476, %f1507;\n" -" sub.ftz.f32 %f1509, %f1506, %f1508;\n" -" mul.ftz.f32 %f1510, %f469, %f1483;\n" -" fma.rn.ftz.f32 %f1511, %f475, %f1510, %f1509;\n" -" fma.rn.ftz.f32 %f1512, %f1482, %f480, %f1511;\n" -" mul.ftz.f32 %f1513, %f1481, %f481;\n" -" sub.ftz.f32 %f1514, %f1512, %f1513;\n" -" fma.rn.ftz.f32 %f1515, %f1484, %f484, %f1514;\n" -" mul.ftz.f32 %f1516, %f1485, %f485;\n" -" sub.ftz.f32 %f1517, %f1515, %f1516;\n" -" mul.ftz.f32 %f1518, %f1484, %f482;\n" -" sub.ftz.f32 %f1519, %f1517, %f1518;\n" -" fma.rn.ftz.f32 %f1520, %f1485, %f483, %f1519;\n" -" mul.ftz.f32 %f1521, %f470, %f1486;\n" -" fma.rn.ftz.f32 %f1522, %f479, %f1521, %f1520;\n" -" mul.ftz.f32 %f1523, %f469, %f1486;\n" -" mul.ftz.f32 %f1524, %f478, %f1523;\n" -" sub.ftz.f32 %f1525, %f1522, %f1524;\n" -" .loc 17 444 0\n" -" mul.ftz.f32 %f1526, %f70, %f586;\n" -" mul.ftz.f32 %f1527, %f87, %f586;\n" -" mul.ftz.f32 %f1528, %f76, %f586;\n" -" mul.ftz.f32 %f1529, %f875, %f1525;\n" -" mul.ftz.f32 %f1530, %f889, %f193;\n" -" mul.ftz.f32 %f1531, %f889, %f213;\n" -" fma.rn.ftz.f32 %f1532, %f211, %f890, %f1530;\n" -" mul.ftz.f32 %f1533, %f889, %f228;\n" -" fma.rn.ftz.f32 %f1534, %f890, %f220, %f1531;\n" -" fma.rn.ftz.f32 %f1535, %f591, %f90, %f1526;\n" -" fma.rn.ftz.f32 %f1536, %f86, %f591, %f1527;\n" -" fma.rn.ftz.f32 %f1537, %f591, %f81, %f1528;\n" -" fma.rn.ftz.f32 %f1538, %f190, %f891, %f1532;\n" -" fma.rn.ftz.f32 %f1539, %f890, %f239, %f1533;\n" -" fma.rn.ftz.f32 %f1540, %f891, %f169, %f1534;\n" -" mov.f32 %f1541, 0f00000000; \n" -" fma.rn.ftz.f32 %f1542, %f583, %f1541, %f1535;\n" -" mov.f32 %f1543, 0f00000000; \n" -" fma.rn.ftz.f32 %f1544, %f1543, %f583, %f1536;\n" -" mov.f32 %f1545, 0f00000000; \n" -" fma.rn.ftz.f32 %f1546, %f583, %f1545, %f1537;\n" -" fma.rn.ftz.f32 %f1547, %f891, %f140, %f1539;\n" -" mul.ftz.f32 %f1548, %f1547, %f419;\n" -" mul.ftz.f32 %f1549, %f1542, %f1304;\n" -" fma.rn.ftz.f32 %f1550, %f423, %f1538, %f1548;\n" -" fma.rn.ftz.f32 %f1551, %f1305, %f1544, %f1549;\n" -" fma.rn.ftz.f32 %f1552, %f416, %f1540, %f1550;\n" -" fma.rn.ftz.f32 %f1553, %f1306, %f1546, %f1551;\n" -" mul.ftz.f32 %f1554, %f622, %f1553;\n" -" mul.ftz.f32 %f1555, %f1459, %f871;\n" -" sub.ftz.f32 %f1556, %f1555, %f1529;\n" -" neg.ftz.f32 %f1557, %f1554;\n" -" mul.ftz.f32 %f1558, %f1459, %f678;\n" -" sub.ftz.f32 %f1559, %f1556, %f1558;\n" -" fma.rn.ftz.f32 %f1560, %f594, %f1559, %f1557;\n" -" mul.ftz.f32 %f1561, %f1552, %f732;\n" -" fma.rn.ftz.f32 %f1562, %f909, %f1560, %f1561;\n" -" mul.ftz.f32 %f1563, %f657, %f1562;\n" -" mul.ftz.f32 %f1564, %f1552, %f708;\n" -" fma.rn.ftz.f32 %f1565, %f898, %f1560, %f1564;\n" -" fma.rn.ftz.f32 %f1566, %f1565, %f654, %f1563;\n" -" sub.ftz.f32 %f265, %f265, %f1566;\n" -" mul.lo.s32 %r33, %r14, %r1;\n" -" cvt.s64.s32 %rd49, %r33;\n" -" mul.wide.s32 %rd50, %r33, 4;\n" -" add.u64 %rd25, %rd25, %rd50;\n" -" setp.gt.u64 %p22, %rd28, %rd25;\n" -" @%p22 bra $Lt_0_46338;\n" -" bra.uni $Lt_0_45826;\n" -"$Lt_0_69634:\n" -" mov.f32 %f265, 0f00000000; \n" -" mov.f32 %f266, 0f00000000; \n" -" mov.f32 %f267, 0f00000000; \n" -" mov.f32 %f268, 0f00000000; \n" -" mov.f32 %f269, 0f00000000; \n" -" mov.f32 %f270, 0f00000000; \n" -" mov.f32 %f271, 0f00000000; \n" -"$Lt_0_45826:\n" -" mov.u32 %r34, 1;\n" -" setp.le.s32 %p23, %r1, %r34;\n" -" @%p23 bra $Lt_0_65794;\n" -" .loc 17 448 0\n" -" mov.u64 %rd51, __cuda___cuda_local_var_33303_55_non_const_red_acc136;\n" -" cvt.s64.s32 %rd52, %r2;\n" -" mul.wide.s32 %rd53, %r2, 4;\n" -" add.u64 %rd54, %rd51, %rd53;\n" -" mov.f32 %f1567, %f270;\n" -" st.shared.f32 [%rd54+0], %f1567;\n" -" mov.f32 %f1568, %f269;\n" -" st.shared.f32 [%rd54+512], %f1568;\n" -" mov.f32 %f1569, %f268;\n" -" st.shared.f32 [%rd54+1024], %f1569;\n" -" mov.f32 %f1570, %f267;\n" -" st.shared.f32 [%rd54+1536], %f1570;\n" -" mov.f32 %f1571, %f266;\n" -" st.shared.f32 [%rd54+2048], %f1571;\n" -" mov.f32 %f1572, %f265;\n" -" st.shared.f32 [%rd54+2560], %f1572;\n" -" shr.s32 %r35, %r1, 31;\n" -" mov.s32 %r36, 1;\n" -" and.b32 %r37, %r35, %r36;\n" -" add.s32 %r38, %r37, %r1;\n" -" shr.s32 %r39, %r38, 1;\n" -" mov.s32 %r40, %r39;\n" -" mov.u32 %r41, 0;\n" -" setp.ne.u32 %p24, %r39, %r41;\n" -" @!%p24 bra $Lt_0_64258;\n" -"$Lt_0_64770:\n" -" setp.ge.u32 %p25, %r16, %r40;\n" -" @%p25 bra $Lt_0_65026;\n" -" add.u32 %r42, %r2, %r40;\n" -" cvt.u64.u32 %rd55, %r42;\n" -" mul.wide.u32 %rd56, %r42, 4;\n" -" add.u64 %rd57, %rd51, %rd56;\n" -" ld.shared.f32 %f1573, [%rd57+0];\n" -" add.ftz.f32 %f1567, %f1573, %f1567;\n" -" st.shared.f32 [%rd54+0], %f1567;\n" -" ld.shared.f32 %f1574, [%rd57+512];\n" -" add.ftz.f32 %f1568, %f1574, %f1568;\n" -" st.shared.f32 [%rd54+512], %f1568;\n" -" ld.shared.f32 %f1575, [%rd57+1024];\n" -" add.ftz.f32 %f1569, %f1575, %f1569;\n" -" st.shared.f32 [%rd54+1024], %f1569;\n" -" ld.shared.f32 %f1576, [%rd57+1536];\n" -" add.ftz.f32 %f1570, %f1576, %f1570;\n" -" st.shared.f32 [%rd54+1536], %f1570;\n" -" ld.shared.f32 %f1577, [%rd57+2048];\n" -" add.ftz.f32 %f1571, %f1577, %f1571;\n" -" st.shared.f32 [%rd54+2048], %f1571;\n" -" ld.shared.f32 %f1578, [%rd57+2560];\n" -" add.ftz.f32 %f1572, %f1578, %f1572;\n" -" st.shared.f32 [%rd54+2560], %f1572;\n" -"$Lt_0_65026:\n" -" shr.u32 %r40, %r40, 1;\n" -" mov.u32 %r43, 0;\n" -" setp.ne.u32 %p26, %r40, %r43;\n" -" @%p26 bra $Lt_0_64770;\n" -"$Lt_0_64258:\n" -" mov.f32 %f270, %f1567;\n" -" mov.f32 %f269, %f1568;\n" -" mov.f32 %f268, %f1569;\n" -" mov.f32 %f267, %f1570;\n" -" mov.f32 %f266, %f1571;\n" -" mov.f32 %f265, %f1572;\n" -" ld.param.s32 %r44, [__cudaparm_kernel_ellipsoid_eflag];\n" -" mov.s32 %r45, 0;\n" -" set.gt.u32.s32 %r46, %r44, %r45;\n" -" neg.s32 %r47, %r46;\n" -" ld.param.s32 %r48, [__cudaparm_kernel_ellipsoid_vflag];\n" -" mov.s32 %r49, 0;\n" -" set.gt.u32.s32 %r50, %r48, %r49;\n" -" neg.s32 %r51, %r50;\n" -" or.b32 %r52, %r47, %r51;\n" -" mov.u32 %r53, 0;\n" -" setp.eq.s32 %p27, %r52, %r53;\n" -" @%p27 bra $Lt_0_65794;\n" -" mov.f32 %f1567, %f19;\n" -" st.shared.f32 [%rd54+0], %f1567;\n" -" mov.f32 %f1568, %f21;\n" -" st.shared.f32 [%rd54+512], %f1568;\n" -" mov.f32 %f1569, %f23;\n" -" st.shared.f32 [%rd54+1024], %f1569;\n" -" mov.f32 %f1570, %f25;\n" -" st.shared.f32 [%rd54+1536], %f1570;\n" -" mov.f32 %f1571, %f27;\n" -" st.shared.f32 [%rd54+2048], %f1571;\n" -" mov.f32 %f1572, %f28;\n" -" st.shared.f32 [%rd54+2560], %f1572;\n" -" mov.f32 %f1579, %f271;\n" -" st.shared.f32 [%rd54+3072], %f1579;\n" -" mov.s32 %r54, %r39;\n" -" @!%p24 bra $Lt_0_66306;\n" -"$Lt_0_66818:\n" -" setp.ge.u32 %p28, %r16, %r54;\n" -" @%p28 bra $Lt_0_67074;\n" -" add.u32 %r55, %r2, %r54;\n" -" cvt.u64.u32 %rd58, %r55;\n" -" mul.wide.u32 %rd59, %r55, 4;\n" -" add.u64 %rd60, %rd51, %rd59;\n" -" ld.shared.f32 %f1580, [%rd60+0];\n" -" add.ftz.f32 %f1567, %f1580, %f1567;\n" -" st.shared.f32 [%rd54+0], %f1567;\n" -" ld.shared.f32 %f1581, [%rd60+512];\n" -" add.ftz.f32 %f1568, %f1581, %f1568;\n" -" st.shared.f32 [%rd54+512], %f1568;\n" -" ld.shared.f32 %f1582, [%rd60+1024];\n" -" add.ftz.f32 %f1569, %f1582, %f1569;\n" -" st.shared.f32 [%rd54+1024], %f1569;\n" -" ld.shared.f32 %f1583, [%rd60+1536];\n" -" add.ftz.f32 %f1570, %f1583, %f1570;\n" -" st.shared.f32 [%rd54+1536], %f1570;\n" -" ld.shared.f32 %f1584, [%rd60+2048];\n" -" add.ftz.f32 %f1571, %f1584, %f1571;\n" -" st.shared.f32 [%rd54+2048], %f1571;\n" -" ld.shared.f32 %f1585, [%rd60+2560];\n" -" add.ftz.f32 %f1572, %f1585, %f1572;\n" -" st.shared.f32 [%rd54+2560], %f1572;\n" -" ld.shared.f32 %f1586, [%rd60+3072];\n" -" add.ftz.f32 %f1579, %f1586, %f1579;\n" -" st.shared.f32 [%rd54+3072], %f1579;\n" -"$Lt_0_67074:\n" -" shr.u32 %r54, %r54, 1;\n" -" mov.u32 %r56, 0;\n" -" setp.ne.u32 %p29, %r54, %r56;\n" -" @%p29 bra $Lt_0_66818;\n" -"$Lt_0_66306:\n" -" mov.f32 %f19, %f1567;\n" -" mov.f32 %f21, %f1568;\n" -" mov.f32 %f23, %f1569;\n" -" mov.f32 %f25, %f1570;\n" -" mov.f32 %f27, %f1571;\n" -" mov.f32 %f29, %f1572;\n" -" mov.f32 %f271, %f1579;\n" -"$Lt_0_65794:\n" -"$Lt_0_63746:\n" -" mov.u32 %r57, 0;\n" -" setp.ne.s32 %p30, %r16, %r57;\n" -" @%p30 bra $Lt_0_67842;\n" -" ld.param.u64 %rd61, [__cudaparm_kernel_ellipsoid_engv];\n" -" add.u64 %rd62, %rd61, %rd3;\n" -" ld.param.s32 %r58, [__cudaparm_kernel_ellipsoid_astride];\n" -" ld.param.s32 %r59, [__cudaparm_kernel_ellipsoid_eflag];\n" -" mov.u32 %r60, 0;\n" -" setp.le.s32 %p31, %r59, %r60;\n" -" @%p31 bra $Lt_0_68354;\n" -" st.global.f32 [%rd62+0], %f271;\n" -" cvt.s64.s32 %rd63, %r58;\n" -" mul.wide.s32 %rd64, %r58, 4;\n" -" add.u64 %rd62, %rd62, %rd64;\n" -"$Lt_0_68354:\n" -" ld.param.s32 %r61, [__cudaparm_kernel_ellipsoid_vflag];\n" -" mov.u32 %r62, 0;\n" -" setp.le.s32 %p32, %r61, %r62;\n" -" @%p32 bra $Lt_0_68866;\n" -" mov.f32 %f1587, %f19;\n" -" st.global.f32 [%rd62+0], %f1587;\n" -" cvt.s64.s32 %rd65, %r58;\n" -" mul.wide.s32 %rd66, %r58, 4;\n" -" add.u64 %rd67, %rd66, %rd62;\n" -" mov.f32 %f1588, %f21;\n" -" st.global.f32 [%rd67+0], %f1588;\n" -" add.u64 %rd68, %rd66, %rd67;\n" -" mov.f32 %f1589, %f23;\n" -" st.global.f32 [%rd68+0], %f1589;\n" -" add.u64 %rd69, %rd66, %rd68;\n" -" mov.f32 %f1590, %f25;\n" -" st.global.f32 [%rd69+0], %f1590;\n" -" add.u64 %rd62, %rd66, %rd69;\n" -" mov.f32 %f1591, %f27;\n" -" st.global.f32 [%rd62+0], %f1591;\n" -" mov.f32 %f1592, %f29;\n" -" add.u64 %rd70, %rd66, %rd62;\n" -" st.global.f32 [%rd70+0], %f1592;\n" -"$Lt_0_68866:\n" -" ld.param.u64 %rd71, [__cudaparm_kernel_ellipsoid_ans];\n" -" mul.lo.u64 %rd72, %rd2, 16;\n" -" add.u64 %rd73, %rd71, %rd72;\n" -" mov.f32 %f1593, %f1594;\n" -" st.global.v4.f32 [%rd73+0], {%f270,%f269,%f268,%f1593};\n" -" add.s32 %r63, %r8, %r58;\n" -" cvt.s64.s32 %rd74, %r63;\n" -" mul.wide.s32 %rd75, %r63, 16;\n" -" add.u64 %rd76, %rd71, %rd75;\n" -" mov.f32 %f1595, %f1596;\n" -" st.global.v4.f32 [%rd76+0], {%f267,%f266,%f265,%f1595};\n" -"$Lt_0_67842:\n" -"$Lt_0_45314:\n" -" .loc 17 451 0\n" -" exit;\n" -"$LDWend_kernel_ellipsoid:\n" -" }\n" -; diff --git a/lib/gpu/scan_app.cu_o b/lib/gpu/scan_app.cu_o deleted file mode 100644 index 1ce05f2dc2..0000000000 Binary files a/lib/gpu/scan_app.cu_o and /dev/null differ