282 lines
9.3 KiB
C
282 lines
9.3 KiB
C
/*
 * ellipsoid_nbor: PTX source text (.version 2.3, .target sm_20, 64-bit
 * addresses) stored as one C string assembled from adjacent literals.
 * It defines two entry points, kernel_nbor and kernel_nbor_fast.
 *
 * Both kernels give each thread one list slot
 *     slot = start + ctaid.x * ntid.x + tid.x      (skipped when slot >= inum)
 * read that slot from dev_ij (rows are nbor_pitch 4-byte ints apart:
 * row 0 = index i into x_, row 1 = entry count, rows 2.. = entries), and
 * copy to dev_nbor every entry j that passes a form-range test and a
 * squared-distance cutoff test. The number of entries kept is stored at
 * dev_nbor[slot + nbor_pitch]; kept entries start at
 * dev_nbor[slot + 2 * nbor_pitch] and are nbor_pitch ints apart.
 *
 * NOTE(review): the text looks like compiler output (.loc directives,
 * numbered $Lt_ labels); presumably it is regenerated from a CUDA source
 * rather than edited by hand -- confirm before modifying.
 * NOTE(review): the lone pipe-character lines between the literals are not
 * valid C and look like residue of an export or rendering step -- confirm
 * against the upstream file.
 */
const char * ellipsoid_nbor =
|
|
" .version 2.3\n"
|
|
" .target sm_20\n"
|
|
" .address_size 64\n"
|
|
/*
 * kernel_nbor: general variant, cutoff table read from global memory.
 *   x_         : global array of 16-byte elements read as 4 floats; the
 *                first three feed the distance, the fourth is truncated
 *                to an int and used to index cut_form.
 *   cut_form   : global array of 8-byte entries; the float at +0 is
 *                compared with the squared distance, the float at +4 is
 *                tested against [form_low, form_high].
 *   ntypes     : row stride of cut_form (entry = ntypes * w_i + w_j).
 *   dev_nbor   : output list (layout described at the top of this string).
 *   nbor_pitch : row stride, in 4-byte ints, of dev_ij and dev_nbor.
 *   start/inum : first slot offset and exclusive upper slot bound.
 *   dev_ij     : input list.
 */
" .entry kernel_nbor (\n"
|
|
" .param .u64 __cudaparm_kernel_nbor_x_,\n"
|
|
" .param .u64 __cudaparm_kernel_nbor_cut_form,\n"
|
|
" .param .s32 __cudaparm_kernel_nbor_ntypes,\n"
|
|
" .param .u64 __cudaparm_kernel_nbor_dev_nbor,\n"
|
|
" .param .s32 __cudaparm_kernel_nbor_nbor_pitch,\n"
|
|
" .param .s32 __cudaparm_kernel_nbor_start,\n"
|
|
" .param .s32 __cudaparm_kernel_nbor_inum,\n"
|
|
" .param .u64 __cudaparm_kernel_nbor_dev_ij,\n"
|
|
" .param .s32 __cudaparm_kernel_nbor_form_low,\n"
|
|
" .param .s32 __cudaparm_kernel_nbor_form_high)\n"
|
|
" {\n"
|
|
" .reg .u32 %r<26>;\n"
|
|
" .reg .u64 %rd<33>;\n"
|
|
" .reg .f32 %f<20>;\n"
|
|
" .reg .pred %p<8>;\n"
|
|
" .loc 16 29 0\n"
|
|
"$LDWbegin_kernel_nbor:\n"
|
|
/* %r7 = start + ctaid.x * ntid.x + tid.x : the slot this thread handles. */
" cvt.s32.u32 %r1, %ctaid.x;\n"
|
|
" cvt.s32.u32 %r2, %ntid.x;\n"
|
|
" mul24.lo.s32 %r3, %r1, %r2;\n"
|
|
" mov.u32 %r4, %tid.x;\n"
|
|
" add.u32 %r5, %r3, %r4;\n"
|
|
" ld.param.s32 %r6, [__cudaparm_kernel_nbor_start];\n"
|
|
" add.u32 %r7, %r6, %r5;\n"
|
|
" ld.param.s32 %r8, [__cudaparm_kernel_nbor_inum];\n"
|
|
/* Threads whose slot is >= inum jump straight to exit. */
" setp.le.s32 %p1, %r8, %r7;\n"
|
|
" @%p1 bra $Lt_0_4354;\n"
|
|
" .loc 16 36 0\n"
|
|
/* %rd4 = &dev_ij[slot]; %r9 = index i stored there. */
" cvt.s64.s32 %rd1, %r7;\n"
|
|
" ld.param.u64 %rd2, [__cudaparm_kernel_nbor_dev_ij];\n"
|
|
" mul.wide.s32 %rd3, %r7, 4;\n"
|
|
" add.u64 %rd4, %rd2, %rd3;\n"
|
|
" ld.global.s32 %r9, [%rd4+0];\n"
|
|
" .loc 16 38 0\n"
|
|
/* %rd6 = nbor_pitch * 4 bytes; one pitch further holds the entry count %r11. */
" ld.param.s32 %r10, [__cudaparm_kernel_nbor_nbor_pitch];\n"
|
|
" cvt.s64.s32 %rd5, %r10;\n"
|
|
" mul.wide.s32 %rd6, %r10, 4;\n"
|
|
" add.u64 %rd7, %rd6, %rd4;\n"
|
|
" ld.global.s32 %r11, [%rd7+0];\n"
|
|
" .loc 16 39 0\n"
|
|
/* %rd8 = first entry (two pitches past the slot); %rd9 is the read cursor. */
" add.u64 %rd8, %rd6, %rd7;\n"
|
|
" mov.s64 %rd9, %rd8;\n"
|
|
" .loc 16 41 0\n"
|
|
/* %rd14 = &dev_nbor[slot + 2 * nbor_pitch] : write cursor for kept entries. */
" ld.param.u64 %rd10, [__cudaparm_kernel_nbor_dev_nbor];\n"
|
|
" add.u64 %rd11, %rd1, %rd5;\n"
|
|
" add.u64 %rd12, %rd5, %rd11;\n"
|
|
" mul.lo.u64 %rd13, %rd12, 4;\n"
|
|
" add.u64 %rd14, %rd10, %rd13;\n"
|
|
" .loc 16 43 0\n"
|
|
/* Load the four floats of x_[i] into %f1..%f4. */
" ld.param.u64 %rd15, [__cudaparm_kernel_nbor_x_];\n"
|
|
" cvt.s64.s32 %rd16, %r9;\n"
|
|
" mul.wide.s32 %rd17, %r9, 16;\n"
|
|
" add.u64 %rd18, %rd15, %rd17;\n"
|
|
" ld.global.v4.f32 {%f1,%f2,%f3,%f4}, [%rd18+0];\n"
|
|
/* %rd21 = %rd8 + 4 * (nbor_pitch * count) : one-past-the-end read address. */
" cvt.s32.s64 %r12, %rd5;\n"
|
|
" mul.lo.s32 %r13, %r12, %r11;\n"
|
|
" cvt.s64.s32 %rd19, %r13;\n"
|
|
" mul.wide.s32 %rd20, %r13, 4;\n"
|
|
" add.u64 %rd21, %rd8, %rd20;\n"
|
|
/* Nothing to walk when start >= end: take the path that sets the count to 0. */
" setp.ge.u64 %p2, %rd8, %rd21;\n"
|
|
" @%p2 bra $Lt_0_6402;\n"
|
|
/* Loop invariants: %r17 = ntypes * trunc(x_[i].w), %f5 = (float)form_low, */
/* %rd22 = cut_form base, %r18 = kept-entry counter. */
" cvt.rzi.ftz.s32.f32 %r14, %f4;\n"
|
|
" ld.param.s32 %r15, [__cudaparm_kernel_nbor_form_low];\n"
|
|
" cvt.rn.f32.s32 %f5, %r15;\n"
|
|
" ld.param.s32 %r16, [__cudaparm_kernel_nbor_ntypes];\n"
|
|
" mul.lo.s32 %r17, %r16, %r14;\n"
|
|
" ld.param.u64 %rd22, [__cudaparm_kernel_nbor_cut_form];\n"
|
|
" mov.s32 %r18, 0;\n"
|
|
/* Loop head: one iteration per entry of this slot. */
"$Lt_0_5378:\n"
|
|
" .loc 16 49 0\n"
|
|
/* %r20 = entry with only its low 30 bits kept (1073741823 = 0x3FFFFFFF). */
" ld.global.s32 %r19, [%rd9+0];\n"
|
|
" and.b32 %r20, %r19, 1073741823;\n"
|
|
" .loc 16 50 0\n"
|
|
/* Load the four floats of x_[j] into %f6..%f9. */
" cvt.s64.s32 %rd23, %r20;\n"
|
|
" mul.wide.s32 %rd24, %r20, 16;\n"
|
|
" add.u64 %rd25, %rd15, %rd24;\n"
|
|
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd25+0];\n"
|
|
" .loc 16 53 0\n"
|
|
/* %rd28 = &cut_form[ntypes * w_i + trunc(w_j)]; %f10 = its float at +4. */
" cvt.rzi.ftz.s32.f32 %r21, %f9;\n"
|
|
" add.s32 %r22, %r21, %r17;\n"
|
|
" cvt.s64.s32 %rd26, %r22;\n"
|
|
" mul.wide.s32 %rd27, %r22, 8;\n"
|
|
" add.u64 %rd28, %rd22, %rd27;\n"
|
|
" ld.global.f32 %f10, [%rd28+4];\n"
|
|
" .loc 16 48 0\n"
|
|
/* Skip the entry unless form_low <= %f10 <= form_high (compared as floats). */
" setp.le.ftz.f32 %p3, %f5, %f10;\n"
|
|
" @!%p3 bra $Lt_0_6658;\n"
|
|
" ld.param.s32 %r23, [__cudaparm_kernel_nbor_form_high];\n"
|
|
" cvt.rn.f32.s32 %f11, %r23;\n"
|
|
" setp.ge.ftz.f32 %p4, %f11, %f10;\n"
|
|
" @!%p4 bra $Lt_0_6658;\n"
|
|
/* %f18 = dx*dx + dy*dy + dz*dz between x_[j] and x_[i]; %f15 = cutoff at +0. */
" sub.ftz.f32 %f12, %f6, %f1;\n"
|
|
" sub.ftz.f32 %f13, %f7, %f2;\n"
|
|
" sub.ftz.f32 %f14, %f8, %f3;\n"
|
|
" ld.global.f32 %f15, [%rd28+0];\n"
|
|
" mul.ftz.f32 %f16, %f12, %f12;\n"
|
|
" fma.rn.ftz.f32 %f17, %f13, %f13, %f16;\n"
|
|
" fma.rn.ftz.f32 %f18, %f14, %f14, %f17;\n"
|
|
/* Keep the entry only when the cutoff is strictly greater than the distance. */
" setp.gt.ftz.f32 %p5, %f15, %f18;\n"
|
|
" @!%p5 bra $Lt_0_6658;\n"
|
|
" .loc 16 64 0\n"
|
|
/* Write masked j, advance the write cursor by one pitch, bump the counter. */
" st.global.s32 [%rd14+0], %r20;\n"
|
|
" .loc 16 65 0\n"
|
|
" add.u64 %rd14, %rd6, %rd14;\n"
|
|
" .loc 16 66 0\n"
|
|
" add.s32 %r18, %r18, 1;\n"
|
|
"$Lt_0_6658:\n"
|
|
"$L_0_3842:\n"
|
|
" .loc 16 47 0\n"
|
|
/* Advance the read cursor by one pitch; loop while it is below the end. */
" add.u64 %rd9, %rd6, %rd9;\n"
|
|
" setp.gt.u64 %p6, %rd21, %rd9;\n"
|
|
" @%p6 bra $Lt_0_5378;\n"
|
|
" bra.uni $Lt_0_4866;\n"
|
|
/* Empty-list path: the counter was never initialised, so zero it here. */
"$Lt_0_6402:\n"
|
|
" mov.s32 %r18, 0;\n"
|
|
"$Lt_0_4866:\n"
|
|
" .loc 16 70 0\n"
|
|
/* dev_nbor[slot + nbor_pitch] = number of entries kept. */
" add.s32 %r24, %r12, %r7;\n"
|
|
" cvt.s64.s32 %rd29, %r24;\n"
|
|
" mul.wide.s32 %rd30, %r24, 4;\n"
|
|
" add.u64 %rd31, %rd10, %rd30;\n"
|
|
" st.global.s32 [%rd31+0], %r18;\n"
|
|
"$Lt_0_4354:\n"
|
|
" .loc 16 72 0\n"
|
|
" exit;\n"
|
|
"$LDWend_kernel_nbor:\n"
|
|
" }\n"
|
|
/*
 * kernel_nbor_fast: same filtering as kernel_nbor, but the cutoff table is
 * first copied into two 484-byte (121 x 4-byte) shared arrays and indexed
 * with a fixed row stride of 11 instead of an ntypes parameter.
 * Parameters match kernel_nbor except that ntypes is absent.
 * NOTE(review): presumably only valid when at most 11 types are in use and
 * cut_form was laid out with stride 11 by the host -- confirm with caller.
 */
" .entry kernel_nbor_fast (\n"
|
|
" .param .u64 __cudaparm_kernel_nbor_fast_x_,\n"
|
|
" .param .u64 __cudaparm_kernel_nbor_fast_cut_form,\n"
|
|
" .param .u64 __cudaparm_kernel_nbor_fast_dev_nbor,\n"
|
|
" .param .s32 __cudaparm_kernel_nbor_fast_nbor_pitch,\n"
|
|
" .param .s32 __cudaparm_kernel_nbor_fast_start,\n"
|
|
" .param .s32 __cudaparm_kernel_nbor_fast_inum,\n"
|
|
" .param .u64 __cudaparm_kernel_nbor_fast_dev_ij,\n"
|
|
" .param .s32 __cudaparm_kernel_nbor_fast_form_low,\n"
|
|
" .param .s32 __cudaparm_kernel_nbor_fast_form_high)\n"
|
|
" {\n"
|
|
" .reg .u32 %r<28>;\n"
|
|
" .reg .u64 %rd<42>;\n"
|
|
" .reg .f32 %f<19>;\n"
|
|
" .reg .pred %p<9>;\n"
|
|
/* Shared tables: form values (stored as s32) and cutoff values (stored as f32). */
" .shared .align 4 .b8 __cuda___cuda_local_var_32570_31_non_const_form120[484];\n"
|
|
" .shared .align 4 .b8 __cuda___cuda_local_var_32571_33_non_const_cutsq604[484];\n"
|
|
" .loc 16 84 0\n"
|
|
"$LDWbegin_kernel_nbor_fast:\n"
|
|
/* Only threads with tid.x <= 120 take part in filling the shared tables. */
" cvt.s32.u32 %r1, %tid.x;\n"
|
|
" mov.u32 %r2, 120;\n"
|
|
" setp.gt.s32 %p1, %r1, %r2;\n"
|
|
" @%p1 bra $Lt_1_5122;\n"
|
|
" .loc 16 90 0\n"
|
|
" mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120;\n"
|
|
" mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604;\n"
|
|
/* Read cut_form[tid.x] as two floats; the first goes to the cutsq table. */
" cvt.s64.s32 %rd3, %r1;\n"
|
|
" mul.wide.s32 %rd4, %r1, 4;\n"
|
|
" ld.param.u64 %rd5, [__cudaparm_kernel_nbor_fast_cut_form];\n"
|
|
" mul.wide.s32 %rd6, %r1, 8;\n"
|
|
" add.u64 %rd7, %rd5, %rd6;\n"
|
|
" ld.global.v2.f32 {%f1,%f2}, [%rd7+0];\n"
|
|
" add.u64 %rd8, %rd4, %rd2;\n"
|
|
" st.shared.f32 [%rd8+0], %f1;\n"
|
|
" .loc 16 91 0\n"
|
|
/* The second float is truncated to an int and goes to the form table. */
" cvt.rzi.ftz.s32.f32 %r3, %f2;\n"
|
|
" add.u64 %rd9, %rd4, %rd1;\n"
|
|
" st.shared.s32 [%rd9+0], %r3;\n"
|
|
"$Lt_1_5122:\n"
|
|
/* Table addresses are set again here because the guarded path may be skipped. */
" mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120;\n"
|
|
" mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604;\n"
|
|
" .loc 16 94 0\n"
|
|
/* Barrier sits before the bounds check, so every thread reaches it. */
" bar.sync 0;\n"
|
|
/* %r9 = start + ctaid.x * ntid.x + tid.x : the slot this thread handles. */
" cvt.s32.u32 %r4, %ctaid.x;\n"
|
|
" cvt.s32.u32 %r5, %ntid.x;\n"
|
|
" mul.lo.s32 %r6, %r4, %r5;\n"
|
|
" ld.param.s32 %r7, [__cudaparm_kernel_nbor_fast_start];\n"
|
|
" add.s32 %r8, %r7, %r6;\n"
|
|
" add.s32 %r9, %r8, %r1;\n"
|
|
" ld.param.s32 %r10, [__cudaparm_kernel_nbor_fast_inum];\n"
|
|
/* Threads whose slot is >= inum jump straight to exit. */
" setp.le.s32 %p2, %r10, %r9;\n"
|
|
" @%p2 bra $Lt_1_5634;\n"
|
|
" .loc 16 98 0\n"
|
|
/* %rd13 = &dev_ij[slot]; %r11 = index i stored there. */
" cvt.s64.s32 %rd10, %r9;\n"
|
|
" ld.param.u64 %rd11, [__cudaparm_kernel_nbor_fast_dev_ij];\n"
|
|
" mul.wide.s32 %rd12, %r9, 4;\n"
|
|
" add.u64 %rd13, %rd11, %rd12;\n"
|
|
" ld.global.s32 %r11, [%rd13+0];\n"
|
|
" .loc 16 100 0\n"
|
|
/* %rd15 = nbor_pitch * 4 bytes; one pitch further holds the entry count %r13. */
" ld.param.s32 %r12, [__cudaparm_kernel_nbor_fast_nbor_pitch];\n"
|
|
" cvt.s64.s32 %rd14, %r12;\n"
|
|
" mul.wide.s32 %rd15, %r12, 4;\n"
|
|
" add.u64 %rd16, %rd15, %rd13;\n"
|
|
" ld.global.s32 %r13, [%rd16+0];\n"
|
|
" .loc 16 101 0\n"
|
|
/* %rd17 = first entry (two pitches past the slot); %rd18 is the read cursor. */
" add.u64 %rd17, %rd15, %rd16;\n"
|
|
" mov.s64 %rd18, %rd17;\n"
|
|
" .loc 16 103 0\n"
|
|
/* %rd23 = &dev_nbor[slot + 2 * nbor_pitch] : write cursor for kept entries. */
" ld.param.u64 %rd19, [__cudaparm_kernel_nbor_fast_dev_nbor];\n"
|
|
" add.u64 %rd20, %rd10, %rd14;\n"
|
|
" add.u64 %rd21, %rd14, %rd20;\n"
|
|
" mul.lo.u64 %rd22, %rd21, 4;\n"
|
|
" add.u64 %rd23, %rd19, %rd22;\n"
|
|
" .loc 16 105 0\n"
|
|
/* Load the four floats of x_[i] into %f3..%f6. */
" ld.param.u64 %rd24, [__cudaparm_kernel_nbor_fast_x_];\n"
|
|
" cvt.s64.s32 %rd25, %r11;\n"
|
|
" mul.wide.s32 %rd26, %r11, 16;\n"
|
|
" add.u64 %rd27, %rd24, %rd26;\n"
|
|
" ld.global.v4.f32 {%f3,%f4,%f5,%f6}, [%rd27+0];\n"
|
|
/* %rd30 = %rd17 + 4 * (nbor_pitch * count) : one-past-the-end read address. */
" cvt.s32.s64 %r14, %rd14;\n"
|
|
" mul.lo.s32 %r15, %r14, %r13;\n"
|
|
" cvt.s64.s32 %rd28, %r15;\n"
|
|
" mul.wide.s32 %rd29, %r15, 4;\n"
|
|
" add.u64 %rd30, %rd17, %rd29;\n"
|
|
/* Nothing to walk when start >= end: take the path that sets the count to 0. */
" setp.ge.u64 %p3, %rd17, %rd30;\n"
|
|
" @%p3 bra $Lt_1_7682;\n"
|
|
/* Loop invariants: %r17 = 11 * trunc(x_[i].w), %r18 = form_low, %r19 = counter. */
" cvt.rzi.ftz.s32.f32 %r16, %f6;\n"
|
|
" mul.lo.s32 %r17, %r16, 11;\n"
|
|
" ld.param.s32 %r18, [__cudaparm_kernel_nbor_fast_form_low];\n"
|
|
" mov.s32 %r19, 0;\n"
|
|
/* Loop head: one iteration per entry of this slot. */
"$Lt_1_6658:\n"
|
|
" .loc 16 112 0\n"
|
|
/* %r21 = entry with only its low 30 bits kept (1073741823 = 0x3FFFFFFF). */
" ld.global.s32 %r20, [%rd18+0];\n"
|
|
" and.b32 %r21, %r20, 1073741823;\n"
|
|
" .loc 16 113 0\n"
|
|
/* Load the four floats of x_[j] into %f7..%f10. */
" cvt.s64.s32 %rd31, %r21;\n"
|
|
" mul.wide.s32 %rd32, %r21, 16;\n"
|
|
" add.u64 %rd33, %rd24, %rd32;\n"
|
|
" ld.global.v4.f32 {%f7,%f8,%f9,%f10}, [%rd33+0];\n"
|
|
" .loc 16 111 0\n"
|
|
/* %rd35 = 4 * (11 * w_i + trunc(w_j)) : byte offset into both shared tables. */
" cvt.rzi.ftz.s32.f32 %r22, %f10;\n"
|
|
" add.s32 %r23, %r22, %r17;\n"
|
|
" cvt.s64.s32 %rd34, %r23;\n"
|
|
" mul.wide.s32 %rd35, %r23, 4;\n"
|
|
" add.u64 %rd36, %rd35, %rd1;\n"
|
|
" ld.shared.s32 %r24, [%rd36+0];\n"
|
|
/* Skip the entry when the form value is below form_low or above form_high. */
" setp.lt.s32 %p4, %r24, %r18;\n"
|
|
" @%p4 bra $Lt_1_7938;\n"
|
|
" ld.param.s32 %r25, [__cudaparm_kernel_nbor_fast_form_high];\n"
|
|
" setp.lt.s32 %p5, %r25, %r24;\n"
|
|
" @%p5 bra $Lt_1_7938;\n"
|
|
/* %f17 = dx*dx + dy*dy + dz*dz between x_[j] and x_[i]; %f14 = shared cutoff. */
" sub.ftz.f32 %f11, %f7, %f3;\n"
|
|
" sub.ftz.f32 %f12, %f8, %f4;\n"
|
|
" sub.ftz.f32 %f13, %f9, %f5;\n"
|
|
" add.u64 %rd37, %rd35, %rd2;\n"
|
|
" ld.shared.f32 %f14, [%rd37+0];\n"
|
|
" mul.ftz.f32 %f15, %f11, %f11;\n"
|
|
" fma.rn.ftz.f32 %f16, %f12, %f12, %f15;\n"
|
|
" fma.rn.ftz.f32 %f17, %f13, %f13, %f16;\n"
|
|
/* Keep the entry only when the cutoff is strictly greater than the distance. */
" setp.gt.ftz.f32 %p6, %f14, %f17;\n"
|
|
" @!%p6 bra $Lt_1_7938;\n"
|
|
" .loc 16 127 0\n"
|
|
/* Write masked j, advance the write cursor by one pitch, bump the counter. */
" st.global.s32 [%rd23+0], %r21;\n"
|
|
" .loc 16 128 0\n"
|
|
" add.u64 %rd23, %rd15, %rd23;\n"
|
|
" .loc 16 129 0\n"
|
|
" add.s32 %r19, %r19, 1;\n"
|
|
"$Lt_1_7938:\n"
|
|
"$L_1_4610:\n"
|
|
" .loc 16 110 0\n"
|
|
/* Advance the read cursor by one pitch; loop while it is below the end. */
" add.u64 %rd18, %rd15, %rd18;\n"
|
|
" setp.gt.u64 %p7, %rd30, %rd18;\n"
|
|
" @%p7 bra $Lt_1_6658;\n"
|
|
" bra.uni $Lt_1_6146;\n"
|
|
/* Empty-list path: the counter was never initialised, so zero it here. */
"$Lt_1_7682:\n"
|
|
" mov.s32 %r19, 0;\n"
|
|
"$Lt_1_6146:\n"
|
|
" .loc 16 133 0\n"
|
|
/* dev_nbor[slot + nbor_pitch] = number of entries kept. */
" add.s32 %r26, %r14, %r9;\n"
|
|
" cvt.s64.s32 %rd38, %r26;\n"
|
|
" mul.wide.s32 %rd39, %r26, 4;\n"
|
|
" add.u64 %rd40, %rd19, %rd39;\n"
|
|
" st.global.s32 [%rd40+0], %r19;\n"
|
|
"$Lt_1_5634:\n"
|
|
" .loc 16 135 0\n"
|
|
" exit;\n"
|
|
"$LDWend_kernel_nbor_fast:\n"
|
|
" }\n"
|
|
|
;
|