830 lines
26 KiB
C
830 lines
26 KiB
C
/*
 * cmmc_long_gpu_kernel -- PTX assembly text for two GPU entry points, stored
 * as a single C string assembled from adjacent string literals (one PTX line
 * per literal, each ending in "\n").
 *
 * What the PTX module contains, as visible in the literals that follow:
 *   - module header: .version 1.4, .target sm_13
 *   - two module-scope textures, pos_tex and q_tex; both kernels read
 *     per-atom data through integer-indexed 1D fetches on them
 *   - .entry kernel_pair       reads its coefficient tables from global memory
 *   - .entry kernel_pair_fast  first copies the same kind of tables into
 *                              shared memory, then reads them from there
 *
 * The .loc directives and the __cudaparm_ / $Lt_ prefixed names suggest this
 * text is compiler output that a build step turned into a string, not
 * hand-written assembly -- TODO confirm; if so, regenerate it from the
 * original device source instead of editing the literals here.
 *
 * NOTE(review): the code that loads this string onto the device is not part
 * of this file.
 */
const char * cmmc_long_gpu_kernel =
|
|
" .version 1.4\n"
|
|
" .target sm_13\n"
|
|
" .tex .u64 pos_tex;\n"
|
|
" .tex .u64 q_tex;\n"
|
|
/*
 * PTX entry point kernel_pair.
 *
 * Work mapping: ii = ctaid.x * ntid.x + tid.x; a thread with ii >= inum
 * branches directly to exit. Before that test every thread copies the eight
 * floats at sp_lj_in into the shared array __cuda_sp_lj108 (there is no
 * bar.sync in this kernel; each thread stores all eight slots itself before
 * it loads any of them).
 *
 * Parameters the body actually loads:
 *   lj1, lj3    global tables of 16-byte (4 x f32) entries, indexed by
 *               idx = (int)pos_i.w * lj_types + (int)pos_j.w
 *   lj_types    row stride of those tables
 *   sp_lj_in    8 floats: slots 0..3 scale the lj1/lj3 term, slots 4..7
 *               enter the charge term as (1 - value)
 *   dev_nbor    s32 array: [ii] is atom index i, [ii + nbor_pitch] is the
 *               neighbor count, then that many entries starting at
 *               [ii + 2 * nbor_pitch] and spaced nbor_pitch elements apart
 *   ans         16 bytes per thread at ans[ii]: {sum_x, sum_y, sum_z, 4th}
 *   engv        f32 outputs starting at engv[ii], spaced inum elements
 *               apart: two accumulators when eflag > 0, then six more when
 *               vflag > 0 (force * delta products in the order xx, yy, zz,
 *               xy, xz, yz -- presumably energies and the virial)
 *   cut_coulsq, qqrd2e, g_ewald   f32 scalars used by the charge term
 * x_, q_ and nall are declared but never loaded; positions and charges are
 * fetched from pos_tex (x, y, z, w) and q_tex (first component) instead.
 *
 * Per neighbor-list entry:
 *   - bits 30..31 (shr 30, and 3) choose the sp_lj slot; the low 30 bits
 *     (and 1073741823) are the atom index j
 *   - r2 = squared length of pos_i - pos_j; the entry is skipped unless
 *     lj1[idx].x > r2
 *   - if r2 < lj1[idx].y, the value of lj3[idx].x (2.0, 1.0, or anything
 *     else) selects which powers of 1/r2 are combined with lj1[idx].z and
 *     lj1[idx].w; otherwise that term is 0
 *   - if r2 < cut_coulsq, a charge term is built from qqrd2e * q_i * q_j / r
 *     and a five-coefficient polynomial times exp(-(g_ewald * r)^2)
 *     (presumably an erfc() approximation for the real-space part of an
 *     Ewald sum -- confirm against the generating source); otherwise 0
 *   - (charge term + lj term) * rcp(r2), times the x/y/z delta, is added to
 *     the three accumulators that are finally stored to ans
 *
 * NOTE(review): %f154 is only ever read (mov.f32 %f153, %f154), never
 * written, so the 4th component stored to ans is undefined.
 * NOTE(review): with eflag > 0 and r2 >= cut_coulsq, %f102 is read although
 * only the charge branch writes it; it is multiplied by %f112 = 0, which is
 * harmless only if the stale value is finite -- worth checking upstream.
 */
" .entry kernel_pair (\n"
|
|
" .param .u64 __cudaparm_kernel_pair_x_,\n"
|
|
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
|
|
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
|
|
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
|
|
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
|
|
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
|
|
" .param .u64 __cudaparm_kernel_pair_ans,\n"
|
|
" .param .u64 __cudaparm_kernel_pair_engv,\n"
|
|
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
|
|
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
|
|
" .param .s32 __cudaparm_kernel_pair_inum,\n"
|
|
" .param .s32 __cudaparm_kernel_pair_nall,\n"
|
|
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
|
|
" .param .u64 __cudaparm_kernel_pair_q_,\n"
|
|
" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n"
|
|
" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n"
|
|
" .param .f32 __cudaparm_kernel_pair_g_ewald)\n"
|
|
" {\n"
|
|
" .reg .u32 %r<42>;\n"
|
|
" .reg .u64 %rd<38>;\n"
|
|
" .reg .f32 %f<156>;\n"
|
|
" .reg .pred %p<12>;\n"
|
|
" .shared .align 4 .b8 __cuda_sp_lj108[32];\n"
|
|
" .loc 14 107 0\n"
|
|
"$LBB1_kernel_pair:\n"
|
|
" .loc 14 111 0\n"
|
|
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
|
|
" ld.global.f32 %f1, [%rd1+0];\n"
|
|
" st.shared.f32 [__cuda_sp_lj108+0], %f1;\n"
|
|
" .loc 14 112 0\n"
|
|
" ld.global.f32 %f2, [%rd1+4];\n"
|
|
" st.shared.f32 [__cuda_sp_lj108+4], %f2;\n"
|
|
" .loc 14 113 0\n"
|
|
" ld.global.f32 %f3, [%rd1+8];\n"
|
|
" st.shared.f32 [__cuda_sp_lj108+8], %f3;\n"
|
|
" .loc 14 114 0\n"
|
|
" ld.global.f32 %f4, [%rd1+12];\n"
|
|
" st.shared.f32 [__cuda_sp_lj108+12], %f4;\n"
|
|
" .loc 14 115 0\n"
|
|
" ld.global.f32 %f5, [%rd1+16];\n"
|
|
" st.shared.f32 [__cuda_sp_lj108+16], %f5;\n"
|
|
" .loc 14 116 0\n"
|
|
" ld.global.f32 %f6, [%rd1+20];\n"
|
|
" st.shared.f32 [__cuda_sp_lj108+20], %f6;\n"
|
|
" .loc 14 117 0\n"
|
|
" ld.global.f32 %f7, [%rd1+24];\n"
|
|
" st.shared.f32 [__cuda_sp_lj108+24], %f7;\n"
|
|
" .loc 14 118 0\n"
|
|
" ld.global.f32 %f8, [%rd1+28];\n"
|
|
" st.shared.f32 [__cuda_sp_lj108+28], %f8;\n"
|
|
" cvt.s32.u16 %r1, %ctaid.x;\n"
|
|
" cvt.s32.u16 %r2, %ntid.x;\n"
|
|
" mul24.lo.s32 %r3, %r1, %r2;\n"
|
|
" cvt.u32.u16 %r4, %tid.x;\n"
|
|
" add.u32 %r5, %r3, %r4;\n"
|
|
" ld.param.s32 %r6, [__cudaparm_kernel_pair_inum];\n"
|
|
" setp.le.s32 %p1, %r6, %r5;\n"
|
|
" @%p1 bra $Lt_0_11778;\n"
|
|
" .loc 14 129 0\n"
|
|
" mov.f32 %f9, 0f00000000; \n"
|
|
" mov.f32 %f10, %f9;\n"
|
|
" mov.f32 %f11, 0f00000000; \n"
|
|
" mov.f32 %f12, %f11;\n"
|
|
" mov.f32 %f13, 0f00000000; \n"
|
|
" mov.f32 %f14, %f13;\n"
|
|
" mov.f32 %f15, 0f00000000; \n"
|
|
" mov.f32 %f16, %f15;\n"
|
|
" mov.f32 %f17, 0f00000000; \n"
|
|
" mov.f32 %f18, %f17;\n"
|
|
" mov.f32 %f19, 0f00000000; \n"
|
|
" mov.f32 %f20, %f19;\n"
|
|
" .loc 14 132 0\n"
|
|
" cvt.u64.s32 %rd2, %r5;\n"
|
|
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
|
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
|
|
" add.u64 %rd5, %rd4, %rd3;\n"
|
|
" ld.global.s32 %r7, [%rd5+0];\n"
|
|
" .loc 14 134 0\n"
|
|
" ld.param.s32 %r8, [__cudaparm_kernel_pair_nbor_pitch];\n"
|
|
" cvt.u64.s32 %rd6, %r8;\n"
|
|
" mul.lo.u64 %rd7, %rd6, 4;\n"
|
|
" add.u64 %rd8, %rd5, %rd7;\n"
|
|
" ld.global.s32 %r9, [%rd8+0];\n"
|
|
" .loc 14 135 0\n"
|
|
" add.u64 %rd9, %rd8, %rd7;\n"
|
|
" mov.s64 %rd10, %rd9;\n"
|
|
" mov.s32 %r10, %r7;\n"
|
|
" mov.s32 %r11, 0;\n"
|
|
" mov.s32 %r12, 0;\n"
|
|
" mov.s32 %r13, 0;\n"
|
|
" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r10,%r11,%r12,%r13}];\n"
|
|
" .loc 14 138 0\n"
|
|
" mov.f32 %f25, %f21;\n"
|
|
" mov.f32 %f26, %f22;\n"
|
|
" mov.f32 %f27, %f23;\n"
|
|
" mov.f32 %f28, %f24;\n"
|
|
" mov.s32 %r14, %r7;\n"
|
|
" mov.s32 %r15, 0;\n"
|
|
" mov.s32 %r16, 0;\n"
|
|
" mov.s32 %r17, 0;\n"
|
|
" tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r14,%r15,%r16,%r17}];\n"
|
|
" .loc 14 139 0\n"
|
|
" mov.f32 %f33, %f29;\n"
|
|
" mul24.lo.s32 %r18, %r9, %r8;\n"
|
|
" cvt.s64.s32 %rd11, %r18;\n"
|
|
" mul.lo.u64 %rd12, %rd11, 4;\n"
|
|
" add.u64 %rd13, %rd9, %rd12;\n"
|
|
" ld.param.s32 %r19, [__cudaparm_kernel_pair_vflag];\n"
|
|
" ld.param.s32 %r20, [__cudaparm_kernel_pair_eflag];\n"
|
|
" setp.ge.u64 %p2, %rd9, %rd13;\n"
|
|
" mov.f32 %f34, 0f00000000; \n"
|
|
" mov.f32 %f35, 0f00000000; \n"
|
|
" mov.f32 %f36, 0f00000000; \n"
|
|
" mov.f32 %f37, 0f00000000; \n"
|
|
" mov.f32 %f38, 0f00000000; \n"
|
|
" @%p2 bra $Lt_0_18434;\n"
|
|
" mov.s32 %r21, 0;\n"
|
|
" setp.gt.s32 %p3, %r20, %r21;\n"
|
|
" mov.s32 %r22, 0;\n"
|
|
" setp.gt.s32 %p4, %r19, %r22;\n"
|
|
" cvt.rzi.s32.f32 %r23, %f28;\n"
|
|
" ld.param.s32 %r24, [__cudaparm_kernel_pair_lj_types];\n"
|
|
" mul.lo.s32 %r25, %r24, %r23;\n"
|
|
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_lj1];\n"
|
|
" mov.u64 %rd15, __cuda_sp_lj108;\n"
|
|
"$Lt_0_12802:\n"
|
|
" .loc 14 143 0\n"
|
|
" ld.global.s32 %r26, [%rd10+0];\n"
|
|
" .loc 14 146 0\n"
|
|
" shr.s32 %r27, %r26, 30;\n"
|
|
" cvt.s64.s32 %rd16, %r27;\n"
|
|
" and.b64 %rd17, %rd16, 3;\n"
|
|
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
|
" add.u64 %rd19, %rd15, %rd18;\n"
|
|
" ld.shared.f32 %f39, [%rd19+0];\n"
|
|
" .loc 14 147 0\n"
|
|
" mov.f32 %f40, 0f3f800000; \n"
|
|
" ld.shared.f32 %f41, [%rd19+16];\n"
|
|
" sub.f32 %f42, %f40, %f41;\n"
|
|
" and.b32 %r28, %r26, 1073741823;\n"
|
|
" mov.s32 %r29, %r28;\n"
|
|
" mov.s32 %r30, 0;\n"
|
|
" mov.s32 %r31, 0;\n"
|
|
" mov.s32 %r32, 0;\n"
|
|
" tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r29,%r30,%r31,%r32}];\n"
|
|
" .loc 14 150 0\n"
|
|
" mov.f32 %f47, %f43;\n"
|
|
" mov.f32 %f48, %f44;\n"
|
|
" mov.f32 %f49, %f45;\n"
|
|
" mov.f32 %f50, %f46;\n"
|
|
" cvt.rzi.s32.f32 %r33, %f50;\n"
|
|
" sub.f32 %f51, %f26, %f48;\n"
|
|
" sub.f32 %f52, %f25, %f47;\n"
|
|
" sub.f32 %f53, %f27, %f49;\n"
|
|
" mul.f32 %f54, %f51, %f51;\n"
|
|
" mad.f32 %f55, %f52, %f52, %f54;\n"
|
|
" mad.f32 %f56, %f53, %f53, %f55;\n"
|
|
" add.s32 %r34, %r33, %r25;\n"
|
|
" cvt.u64.s32 %rd20, %r34;\n"
|
|
" mul.lo.u64 %rd21, %rd20, 16;\n"
|
|
" add.u64 %rd22, %rd21, %rd14;\n"
|
|
" ld.global.f32 %f57, [%rd22+0];\n"
|
|
" setp.gt.f32 %p5, %f57, %f56;\n"
|
|
" @!%p5 bra $Lt_0_16642;\n"
|
|
" rcp.approx.f32 %f58, %f56;\n"
|
|
" ld.global.f32 %f59, [%rd22+4];\n"
|
|
" setp.lt.f32 %p6, %f56, %f59;\n"
|
|
" @!%p6 bra $Lt_0_13826;\n"
|
|
" ld.param.u64 %rd23, [__cudaparm_kernel_pair_lj3];\n"
|
|
" add.u64 %rd24, %rd23, %rd21;\n"
|
|
" ld.global.f32 %f60, [%rd24+0];\n"
|
|
" mov.f32 %f61, 0f40000000; \n"
|
|
" setp.eq.f32 %p7, %f60, %f61;\n"
|
|
" @!%p7 bra $Lt_0_14338;\n"
|
|
" .loc 14 166 0\n"
|
|
" mul.f32 %f62, %f58, %f58;\n"
|
|
" mov.f32 %f63, %f62;\n"
|
|
" mov.f32 %f64, %f63;\n"
|
|
" .loc 14 167 0\n"
|
|
" mul.f32 %f65, %f62, %f62;\n"
|
|
" mov.f32 %f66, %f65;\n"
|
|
" bra.uni $Lt_0_14594;\n"
|
|
"$Lt_0_14338:\n"
|
|
" mov.f32 %f67, 0f3f800000; \n"
|
|
" setp.eq.f32 %p8, %f60, %f67;\n"
|
|
" @!%p8 bra $Lt_0_14850;\n"
|
|
" .loc 14 169 0\n"
|
|
" sqrt.approx.f32 %f68, %f58;\n"
|
|
" mul.f32 %f69, %f58, %f68;\n"
|
|
" mov.f32 %f65, %f69;\n"
|
|
" mov.f32 %f66, %f65;\n"
|
|
" .loc 14 170 0\n"
|
|
" mul.f32 %f63, %f69, %f69;\n"
|
|
" mov.f32 %f64, %f63;\n"
|
|
" bra.uni $Lt_0_14594;\n"
|
|
"$Lt_0_14850:\n"
|
|
" .loc 14 172 0\n"
|
|
" mul.f32 %f70, %f58, %f58;\n"
|
|
" mul.f32 %f71, %f58, %f70;\n"
|
|
" mov.f32 %f63, %f71;\n"
|
|
" mov.f32 %f64, %f63;\n"
|
|
" .loc 14 173 0\n"
|
|
" mov.f32 %f65, %f71;\n"
|
|
" mov.f32 %f66, %f65;\n"
|
|
"$Lt_0_14594:\n"
|
|
"$Lt_0_14082:\n"
|
|
" .loc 14 146 0\n"
|
|
" ld.shared.f32 %f39, [%rd19+0];\n"
|
|
" .loc 14 175 0\n"
|
|
" mul.f32 %f72, %f39, %f63;\n"
|
|
" ld.global.v2.f32 {%f73,%f74}, [%rd22+8];\n"
|
|
" mul.f32 %f75, %f73, %f65;\n"
|
|
" sub.f32 %f76, %f75, %f74;\n"
|
|
" mul.f32 %f77, %f72, %f76;\n"
|
|
" bra.uni $Lt_0_13570;\n"
|
|
"$Lt_0_13826:\n"
|
|
" .loc 14 177 0\n"
|
|
" mov.f32 %f77, 0f00000000; \n"
|
|
"$Lt_0_13570:\n"
|
|
" ld.param.f32 %f78, [__cudaparm_kernel_pair_cut_coulsq];\n"
|
|
" setp.gt.f32 %p9, %f78, %f56;\n"
|
|
" @!%p9 bra $Lt_0_15362;\n"
|
|
" .loc 14 184 0\n"
|
|
" sqrt.approx.f32 %f79, %f56;\n"
|
|
" ld.param.f32 %f80, [__cudaparm_kernel_pair_g_ewald];\n"
|
|
" mul.f32 %f81, %f80, %f79;\n"
|
|
" mul.f32 %f82, %f81, %f81;\n"
|
|
" mov.f32 %f83, 0f3f800000; \n"
|
|
" mov.f32 %f84, 0f3ea7ba05; \n"
|
|
" mad.f32 %f85, %f84, %f81, %f83;\n"
|
|
" neg.f32 %f86, %f82;\n"
|
|
" rcp.approx.f32 %f87, %f85;\n"
|
|
" mov.f32 %f88, 0f3fb8aa3b; \n"
|
|
" mul.f32 %f89, %f86, %f88;\n"
|
|
" ex2.approx.f32 %f90, %f89;\n"
|
|
" mov.f32 %f91, 0f3e827906; \n"
|
|
" mov.f32 %f92, 0fbe91a98e; \n"
|
|
" mov.f32 %f93, 0f3fb5f0e3; \n"
|
|
" mov.f32 %f94, 0fbfba00e3; \n"
|
|
" mov.f32 %f95, 0f3f87dc22; \n"
|
|
" mad.f32 %f96, %f95, %f87, %f94;\n"
|
|
" mad.f32 %f97, %f87, %f96, %f93;\n"
|
|
" mad.f32 %f98, %f87, %f97, %f92;\n"
|
|
" mad.f32 %f99, %f87, %f98, %f91;\n"
|
|
" mul.f32 %f100, %f87, %f99;\n"
|
|
" mul.f32 %f101, %f90, %f100;\n"
|
|
" mov.f32 %f102, %f101;\n"
|
|
" mov.s32 %r35, %r28;\n"
|
|
" mov.s32 %r36, 0;\n"
|
|
" mov.s32 %r37, 0;\n"
|
|
" mov.s32 %r38, 0;\n"
|
|
" tex.1d.v4.f32.s32 {%f103,%f104,%f105,%f106},[q_tex,{%r35,%r36,%r37,%r38}];\n"
|
|
" .loc 14 185 0\n"
|
|
" mov.f32 %f107, %f103;\n"
|
|
" ld.param.f32 %f108, [__cudaparm_kernel_pair_qqrd2e];\n"
|
|
" mul.f32 %f109, %f108, %f33;\n"
|
|
" mul.f32 %f110, %f109, %f107;\n"
|
|
" div.approx.f32 %f111, %f110, %f79;\n"
|
|
" mov.f32 %f112, %f111;\n"
|
|
" .loc 14 186 0\n"
|
|
" mov.f32 %f113, 0f3f906ebb; \n"
|
|
" mul.f32 %f114, %f81, %f113;\n"
|
|
" mad.f32 %f115, %f90, %f114, %f101;\n"
|
|
" sub.f32 %f116, %f115, %f42;\n"
|
|
" mul.f32 %f117, %f111, %f116;\n"
|
|
" bra.uni $Lt_0_15106;\n"
|
|
"$Lt_0_15362:\n"
|
|
" .loc 14 189 0\n"
|
|
" mov.f32 %f112, 0f00000000; \n"
|
|
" mov.f32 %f117, 0f00000000; \n"
|
|
"$Lt_0_15106:\n"
|
|
" .loc 14 194 0\n"
|
|
" add.f32 %f118, %f117, %f77;\n"
|
|
" mul.f32 %f119, %f118, %f58;\n"
|
|
" mad.f32 %f36, %f52, %f119, %f36;\n"
|
|
" .loc 14 195 0\n"
|
|
" mad.f32 %f35, %f51, %f119, %f35;\n"
|
|
" .loc 14 196 0\n"
|
|
" mad.f32 %f34, %f53, %f119, %f34;\n"
|
|
" @!%p3 bra $Lt_0_16130;\n"
|
|
" .loc 14 199 0\n"
|
|
" mov.f32 %f120, %f102;\n"
|
|
" sub.f32 %f121, %f120, %f42;\n"
|
|
" mad.f32 %f37, %f112, %f121, %f37;\n"
|
|
" @!%p6 bra $Lt_0_16130;\n"
|
|
" .loc 14 201 0\n"
|
|
" ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj3];\n"
|
|
" add.u64 %rd26, %rd25, %rd21;\n"
|
|
" ld.global.v4.f32 {_,%f122,%f123,%f124}, [%rd26+0];\n"
|
|
" mov.f32 %f125, %f64;\n"
|
|
" .loc 14 146 0\n"
|
|
" ld.shared.f32 %f39, [%rd19+0];\n"
|
|
" .loc 14 201 0\n"
|
|
" mul.f32 %f126, %f125, %f39;\n"
|
|
" mov.f32 %f127, %f66;\n"
|
|
" mul.f32 %f128, %f122, %f127;\n"
|
|
" sub.f32 %f129, %f128, %f123;\n"
|
|
" mul.f32 %f130, %f126, %f129;\n"
|
|
" sub.f32 %f131, %f130, %f124;\n"
|
|
" add.f32 %f38, %f38, %f131;\n"
|
|
"$Lt_0_16130:\n"
|
|
"$Lt_0_15618:\n"
|
|
" @!%p4 bra $Lt_0_16642;\n"
|
|
" .loc 14 206 0\n"
|
|
" mov.f32 %f132, %f10;\n"
|
|
" mul.f32 %f133, %f52, %f52;\n"
|
|
" mad.f32 %f134, %f119, %f133, %f132;\n"
|
|
" mov.f32 %f10, %f134;\n"
|
|
" .loc 14 207 0\n"
|
|
" mov.f32 %f135, %f12;\n"
|
|
" mad.f32 %f136, %f119, %f54, %f135;\n"
|
|
" mov.f32 %f12, %f136;\n"
|
|
" .loc 14 208 0\n"
|
|
" mov.f32 %f137, %f14;\n"
|
|
" mul.f32 %f138, %f53, %f53;\n"
|
|
" mad.f32 %f139, %f119, %f138, %f137;\n"
|
|
" mov.f32 %f14, %f139;\n"
|
|
" .loc 14 209 0\n"
|
|
" mov.f32 %f140, %f16;\n"
|
|
" mul.f32 %f141, %f51, %f52;\n"
|
|
" mad.f32 %f142, %f119, %f141, %f140;\n"
|
|
" mov.f32 %f16, %f142;\n"
|
|
" .loc 14 210 0\n"
|
|
" mov.f32 %f143, %f18;\n"
|
|
" mul.f32 %f144, %f52, %f53;\n"
|
|
" mad.f32 %f145, %f119, %f144, %f143;\n"
|
|
" mov.f32 %f18, %f145;\n"
|
|
" .loc 14 211 0\n"
|
|
" mul.f32 %f146, %f51, %f53;\n"
|
|
" mad.f32 %f19, %f119, %f146, %f19;\n"
|
|
" mov.f32 %f147, %f19;\n"
|
|
"$Lt_0_16642:\n"
|
|
"$Lt_0_13058:\n"
|
|
" .loc 14 142 0\n"
|
|
" add.u64 %rd10, %rd7, %rd10;\n"
|
|
" setp.gt.u64 %p10, %rd13, %rd10;\n"
|
|
" @%p10 bra $Lt_0_12802;\n"
|
|
" bra.uni $Lt_0_12290;\n"
|
|
"$Lt_0_18434:\n"
|
|
" mov.s32 %r39, 0;\n"
|
|
" setp.gt.s32 %p3, %r20, %r39;\n"
|
|
" mov.s32 %r40, 0;\n"
|
|
" setp.gt.s32 %p4, %r19, %r40;\n"
|
|
"$Lt_0_12290:\n"
|
|
" .loc 14 218 0\n"
|
|
" ld.param.u64 %rd27, [__cudaparm_kernel_pair_engv];\n"
|
|
" add.u64 %rd28, %rd27, %rd3;\n"
|
|
" @!%p3 bra $Lt_0_17410;\n"
|
|
" .loc 14 220 0\n"
|
|
" st.global.f32 [%rd28+0], %f38;\n"
|
|
" .loc 14 221 0\n"
|
|
" cvt.u64.s32 %rd29, %r6;\n"
|
|
" mul.lo.u64 %rd30, %rd29, 4;\n"
|
|
" add.u64 %rd28, %rd30, %rd28;\n"
|
|
" .loc 14 222 0\n"
|
|
" st.global.f32 [%rd28+0], %f37;\n"
|
|
" .loc 14 223 0\n"
|
|
" add.u64 %rd28, %rd30, %rd28;\n"
|
|
"$Lt_0_17410:\n"
|
|
" @!%p4 bra $Lt_0_17922;\n"
|
|
" .loc 14 227 0\n"
|
|
" mov.f32 %f148, %f10;\n"
|
|
" st.global.f32 [%rd28+0], %f148;\n"
|
|
" .loc 14 228 0\n"
|
|
" cvt.u64.s32 %rd31, %r6;\n"
|
|
" mul.lo.u64 %rd32, %rd31, 4;\n"
|
|
" add.u64 %rd28, %rd32, %rd28;\n"
|
|
" .loc 14 227 0\n"
|
|
" mov.f32 %f149, %f12;\n"
|
|
" st.global.f32 [%rd28+0], %f149;\n"
|
|
" .loc 14 228 0\n"
|
|
" add.u64 %rd28, %rd32, %rd28;\n"
|
|
" .loc 14 227 0\n"
|
|
" mov.f32 %f150, %f14;\n"
|
|
" st.global.f32 [%rd28+0], %f150;\n"
|
|
" .loc 14 228 0\n"
|
|
" add.u64 %rd28, %rd32, %rd28;\n"
|
|
" .loc 14 227 0\n"
|
|
" mov.f32 %f151, %f16;\n"
|
|
" st.global.f32 [%rd28+0], %f151;\n"
|
|
" .loc 14 228 0\n"
|
|
" add.u64 %rd28, %rd32, %rd28;\n"
|
|
" .loc 14 227 0\n"
|
|
" mov.f32 %f152, %f18;\n"
|
|
" st.global.f32 [%rd28+0], %f152;\n"
|
|
" add.u64 %rd33, %rd32, %rd28;\n"
|
|
" st.global.f32 [%rd33+0], %f19;\n"
|
|
"$Lt_0_17922:\n"
|
|
" .loc 14 231 0\n"
|
|
" ld.param.u64 %rd34, [__cudaparm_kernel_pair_ans];\n"
|
|
" mul.lo.u64 %rd35, %rd2, 16;\n"
|
|
" add.u64 %rd36, %rd34, %rd35;\n"
|
|
" mov.f32 %f153, %f154;\n"
|
|
" st.global.v4.f32 [%rd36+0], {%f36,%f35,%f34,%f153};\n"
|
|
"$Lt_0_11778:\n"
|
|
" .loc 14 233 0\n"
|
|
" exit;\n"
|
|
"$LDWend_kernel_pair:\n"
|
|
" }\n"
|
|
/*
 * PTX entry point kernel_pair_fast.
 *
 * Prologue (executed by every thread before the bounds test):
 *   - threads with tid.x <= 7 copy sp_lj_in[tid.x] into the shared array
 *     __cuda_sp_lj244 (8 floats)
 *   - threads with tid.x <= 63 copy the 16-byte entries lj1_in[tid.x] and
 *     lj3_in[tid.x] into the shared arrays __cuda_lj11312 and __cuda_lj3288
 *     (1024 bytes = 64 entries each)
 *   - bar.sync 0, reached by all threads because it precedes the early exit
 * NOTE(review): this assumes the block has at least 64 threads and that
 * lj1_in / lj3_in hold at least 64 entries; with fewer threads part of the
 * shared tables stays unwritten -- confirm against the launch code.
 *
 * Work mapping: ii = ctaid.x * ntid.x + tid.x; a thread with ii >= inum
 * branches to exit after the barrier.
 *
 * Table index: (int)(8 * (int)pos_i.w + pos_j.w), i.e. a fixed row stride
 * of 8 (this entry point has no lj_types parameter), which matches the 64
 * entries staged above.
 *
 * Other parameters the body loads:
 *   dev_nbor    s32 array: [ii] is atom index i, [ii + nbor_pitch] is the
 *               neighbor count, then that many entries starting at
 *               [ii + 2 * nbor_pitch] and spaced nbor_pitch elements apart
 *   ans         16 bytes per thread at ans[ii]: {sum_x, sum_y, sum_z, 4th}
 *   engv        f32 outputs starting at engv[ii], spaced inum elements
 *               apart: two accumulators when eflag > 0, then six more when
 *               vflag > 0 (order xx, yy, zz, xy, xz, yz)
 *   cut_coulsq, qqrd2e, g_ewald   f32 scalars used by the charge term
 * x_, q_ and nall are declared but never loaded; positions and charges are
 * fetched from pos_tex (x, y, z, w) and q_tex (first component) instead.
 *
 * Per neighbor-list entry:
 *   - bits 30..31 choose the sp_lj slot (slots 0..3 scale the table term,
 *     slots 4..7 enter the charge term as 1 - value); the low 30 bits are
 *     the atom index j
 *   - r2 = squared length of pos_i - pos_j; skipped unless lj1[idx].x > r2
 *   - table term only if r2 < lj1[idx].y, with lj3[idx].x (2.0, 1.0, other)
 *     selecting the powers of 1/r2 that multiply lj1[idx].z / lj1[idx].w
 *   - charge term only if r2 < cut_coulsq (polynomial times
 *     exp(-(g_ewald * r)^2); presumably an erfc() approximation -- confirm)
 *   - (charge term + table term) * rcp(r2), times the x/y/z delta, is added
 *     to the three accumulators that are finally stored to ans
 *
 * NOTE(review): %f157 is only ever read (mov.f32 %f156, %f157), never
 * written, so the 4th component stored to ans is undefined.
 * NOTE(review): with eflag > 0 and r2 >= cut_coulsq, %f105 is read although
 * only the charge branch writes it; it is multiplied by %f115 = 0, which is
 * harmless only if the stale value is finite -- worth checking upstream.
 */
" .entry kernel_pair_fast (\n"
|
|
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
|
|
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
|
|
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
|
|
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
|
|
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
|
|
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
|
|
" .param .u64 __cudaparm_kernel_pair_fast_engv,\n"
|
|
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
|
|
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
|
|
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
|
|
" .param .s32 __cudaparm_kernel_pair_fast_nall,\n"
|
|
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
|
|
" .param .u64 __cudaparm_kernel_pair_fast_q_,\n"
|
|
" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n"
|
|
" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n"
|
|
" .param .f32 __cudaparm_kernel_pair_fast_g_ewald)\n"
|
|
" {\n"
|
|
" .reg .u32 %r<43>;\n"
|
|
" .reg .u64 %rd<49>;\n"
|
|
" .reg .f32 %f<159>;\n"
|
|
" .reg .pred %p<14>;\n"
|
|
" .shared .align 4 .b8 __cuda_sp_lj244[32];\n"
|
|
" .shared .align 16 .b8 __cuda_lj3288[1024];\n"
|
|
" .shared .align 16 .b8 __cuda_lj11312[1024];\n"
|
|
" .loc 14 242 0\n"
|
|
"$LBB1_kernel_pair_fast:\n"
|
|
" cvt.s32.u16 %r1, %tid.x;\n"
|
|
" mov.u32 %r2, 7;\n"
|
|
" setp.gt.s32 %p1, %r1, %r2;\n"
|
|
" @%p1 bra $Lt_1_13314;\n"
|
|
" .loc 14 249 0\n"
|
|
" mov.u64 %rd1, __cuda_sp_lj244;\n"
|
|
" cvt.u64.s32 %rd2, %r1;\n"
|
|
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
|
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
|
|
" add.u64 %rd5, %rd4, %rd3;\n"
|
|
" ld.global.f32 %f1, [%rd5+0];\n"
|
|
" add.u64 %rd6, %rd3, %rd1;\n"
|
|
" st.shared.f32 [%rd6+0], %f1;\n"
|
|
"$Lt_1_13314:\n"
|
|
" mov.u64 %rd1, __cuda_sp_lj244;\n"
|
|
" mov.u32 %r3, 63;\n"
|
|
" setp.gt.s32 %p2, %r1, %r3;\n"
|
|
" @%p2 bra $Lt_1_13826;\n"
|
|
" .loc 14 251 0\n"
|
|
" mov.u64 %rd7, __cuda_lj3288;\n"
|
|
" mov.u64 %rd8, __cuda_lj11312;\n"
|
|
" cvt.u64.s32 %rd9, %r1;\n"
|
|
" mul.lo.u64 %rd10, %rd9, 16;\n"
|
|
" ld.param.u64 %rd11, [__cudaparm_kernel_pair_fast_lj1_in];\n"
|
|
" add.u64 %rd12, %rd11, %rd10;\n"
|
|
" add.u64 %rd13, %rd10, %rd8;\n"
|
|
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd12+0];\n"
|
|
" st.shared.f32 [%rd13+0], %f2;\n"
|
|
" st.shared.f32 [%rd13+4], %f3;\n"
|
|
" st.shared.f32 [%rd13+8], %f4;\n"
|
|
" st.shared.f32 [%rd13+12], %f5;\n"
|
|
" .loc 14 252 0\n"
|
|
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
|
|
" add.u64 %rd15, %rd14, %rd10;\n"
|
|
" add.u64 %rd16, %rd10, %rd7;\n"
|
|
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
|
|
" st.shared.f32 [%rd16+0], %f6;\n"
|
|
" st.shared.f32 [%rd16+4], %f7;\n"
|
|
" st.shared.f32 [%rd16+8], %f8;\n"
|
|
" st.shared.f32 [%rd16+12], %f9;\n"
|
|
"$Lt_1_13826:\n"
|
|
" mov.u64 %rd7, __cuda_lj3288;\n"
|
|
" mov.u64 %rd8, __cuda_lj11312;\n"
|
|
" .loc 14 255 0\n"
|
|
" bar.sync 0;\n"
|
|
" cvt.s32.u16 %r4, %ctaid.x;\n"
|
|
" cvt.s32.u16 %r5, %ntid.x;\n"
|
|
" mul24.lo.s32 %r6, %r4, %r5;\n"
|
|
" add.s32 %r7, %r6, %r1;\n"
|
|
" ld.param.s32 %r8, [__cudaparm_kernel_pair_fast_inum];\n"
|
|
" setp.ge.s32 %p3, %r7, %r8;\n"
|
|
" @%p3 bra $Lt_1_14338;\n"
|
|
" .loc 14 267 0\n"
|
|
" mov.f32 %f10, 0f00000000; \n"
|
|
" mov.f32 %f11, %f10;\n"
|
|
" mov.f32 %f12, 0f00000000; \n"
|
|
" mov.f32 %f13, %f12;\n"
|
|
" mov.f32 %f14, 0f00000000; \n"
|
|
" mov.f32 %f15, %f14;\n"
|
|
" mov.f32 %f16, 0f00000000; \n"
|
|
" mov.f32 %f17, %f16;\n"
|
|
" mov.f32 %f18, 0f00000000; \n"
|
|
" mov.f32 %f19, %f18;\n"
|
|
" mov.f32 %f20, 0f00000000; \n"
|
|
" mov.f32 %f21, %f20;\n"
|
|
" .loc 14 270 0\n"
|
|
" cvt.u64.s32 %rd17, %r7;\n"
|
|
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
|
" ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
|
|
" add.u64 %rd20, %rd19, %rd18;\n"
|
|
" ld.global.s32 %r9, [%rd20+0];\n"
|
|
" .loc 14 272 0\n"
|
|
" ld.param.s32 %r10, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
|
|
" cvt.u64.s32 %rd21, %r10;\n"
|
|
" mul.lo.u64 %rd22, %rd21, 4;\n"
|
|
" add.u64 %rd23, %rd20, %rd22;\n"
|
|
" ld.global.s32 %r11, [%rd23+0];\n"
|
|
" .loc 14 273 0\n"
|
|
" add.u64 %rd24, %rd23, %rd22;\n"
|
|
" mov.s64 %rd25, %rd24;\n"
|
|
" mov.s32 %r12, %r9;\n"
|
|
" mov.s32 %r13, 0;\n"
|
|
" mov.s32 %r14, 0;\n"
|
|
" mov.s32 %r15, 0;\n"
|
|
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r12,%r13,%r14,%r15}];\n"
|
|
" .loc 14 276 0\n"
|
|
" mov.f32 %f26, %f22;\n"
|
|
" mov.f32 %f27, %f23;\n"
|
|
" mov.f32 %f28, %f24;\n"
|
|
" mov.f32 %f29, %f25;\n"
|
|
" mov.s32 %r16, %r9;\n"
|
|
" mov.s32 %r17, 0;\n"
|
|
" mov.s32 %r18, 0;\n"
|
|
" mov.s32 %r19, 0;\n"
|
|
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[q_tex,{%r16,%r17,%r18,%r19}];\n"
|
|
" .loc 14 277 0\n"
|
|
" mov.f32 %f34, %f30;\n"
|
|
" mul24.lo.s32 %r20, %r11, %r10;\n"
|
|
" cvt.s64.s32 %rd26, %r20;\n"
|
|
" mul.lo.u64 %rd27, %rd26, 4;\n"
|
|
" add.u64 %rd28, %rd24, %rd27;\n"
|
|
" ld.param.s32 %r21, [__cudaparm_kernel_pair_fast_vflag];\n"
|
|
" ld.param.s32 %r22, [__cudaparm_kernel_pair_fast_eflag];\n"
|
|
" setp.ge.u64 %p4, %rd24, %rd28;\n"
|
|
" mov.f32 %f35, 0f00000000; \n"
|
|
" mov.f32 %f36, 0f00000000; \n"
|
|
" mov.f32 %f37, 0f00000000; \n"
|
|
" mov.f32 %f38, 0f00000000; \n"
|
|
" mov.f32 %f39, 0f00000000; \n"
|
|
" @%p4 bra $Lt_1_20994;\n"
|
|
" mov.s32 %r23, 0;\n"
|
|
" setp.gt.s32 %p5, %r22, %r23;\n"
|
|
" mov.s32 %r24, 0;\n"
|
|
" setp.gt.s32 %p6, %r21, %r24;\n"
|
|
" cvt.rzi.s32.f32 %r25, %f29;\n"
|
|
" mov.s32 %r26, 8;\n"
|
|
" mul24.lo.s32 %r27, %r26, %r25;\n"
|
|
" cvt.rn.f32.s32 %f40, %r27;\n"
|
|
"$Lt_1_15362:\n"
|
|
" .loc 14 282 0\n"
|
|
" ld.global.s32 %r28, [%rd25+0];\n"
|
|
" .loc 14 285 0\n"
|
|
" shr.s32 %r29, %r28, 30;\n"
|
|
" cvt.s64.s32 %rd29, %r29;\n"
|
|
" and.b64 %rd30, %rd29, 3;\n"
|
|
" mul.lo.u64 %rd31, %rd30, 4;\n"
|
|
" add.u64 %rd32, %rd1, %rd31;\n"
|
|
" ld.shared.f32 %f41, [%rd32+0];\n"
|
|
" .loc 14 286 0\n"
|
|
" mov.f32 %f42, 0f3f800000; \n"
|
|
" ld.shared.f32 %f43, [%rd32+16];\n"
|
|
" sub.f32 %f44, %f42, %f43;\n"
|
|
" and.b32 %r30, %r28, 1073741823;\n"
|
|
" mov.s32 %r31, %r30;\n"
|
|
" mov.s32 %r32, 0;\n"
|
|
" mov.s32 %r33, 0;\n"
|
|
" mov.s32 %r34, 0;\n"
|
|
" tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[pos_tex,{%r31,%r32,%r33,%r34}];\n"
|
|
" .loc 14 289 0\n"
|
|
" mov.f32 %f49, %f45;\n"
|
|
" mov.f32 %f50, %f46;\n"
|
|
" mov.f32 %f51, %f47;\n"
|
|
" mov.f32 %f52, %f48;\n"
|
|
" sub.f32 %f53, %f27, %f50;\n"
|
|
" sub.f32 %f54, %f26, %f49;\n"
|
|
" sub.f32 %f55, %f28, %f51;\n"
|
|
" mul.f32 %f56, %f53, %f53;\n"
|
|
" mad.f32 %f57, %f54, %f54, %f56;\n"
|
|
" mad.f32 %f58, %f55, %f55, %f57;\n"
|
|
" add.f32 %f59, %f40, %f52;\n"
|
|
" cvt.rzi.s32.f32 %r35, %f59;\n"
|
|
" cvt.u64.s32 %rd33, %r35;\n"
|
|
" mul.lo.u64 %rd34, %rd33, 16;\n"
|
|
" add.u64 %rd35, %rd34, %rd8;\n"
|
|
" ld.shared.f32 %f60, [%rd35+0];\n"
|
|
" setp.gt.f32 %p7, %f60, %f58;\n"
|
|
" @!%p7 bra $Lt_1_19202;\n"
|
|
" rcp.approx.f32 %f61, %f58;\n"
|
|
" ld.shared.f32 %f62, [%rd35+4];\n"
|
|
" setp.lt.f32 %p8, %f58, %f62;\n"
|
|
" @!%p8 bra $Lt_1_16386;\n"
|
|
" add.u64 %rd36, %rd34, %rd7;\n"
|
|
" ld.shared.f32 %f63, [%rd36+0];\n"
|
|
" mov.f32 %f64, 0f40000000; \n"
|
|
" setp.eq.f32 %p9, %f63, %f64;\n"
|
|
" @!%p9 bra $Lt_1_16898;\n"
|
|
" .loc 14 304 0\n"
|
|
" mul.f32 %f65, %f61, %f61;\n"
|
|
" mov.f32 %f66, %f65;\n"
|
|
" mov.f32 %f67, %f66;\n"
|
|
" .loc 14 305 0\n"
|
|
" mul.f32 %f68, %f65, %f65;\n"
|
|
" mov.f32 %f69, %f68;\n"
|
|
" bra.uni $Lt_1_17154;\n"
|
|
"$Lt_1_16898:\n"
|
|
" mov.f32 %f70, 0f3f800000; \n"
|
|
" .loc 14 289 0\n"
|
|
" ld.shared.f32 %f63, [%rd36+0];\n"
|
|
" .loc 14 305 0\n"
|
|
" setp.eq.f32 %p10, %f63, %f70;\n"
|
|
" @!%p10 bra $Lt_1_17410;\n"
|
|
" .loc 14 307 0\n"
|
|
" sqrt.approx.f32 %f71, %f61;\n"
|
|
" mul.f32 %f72, %f61, %f71;\n"
|
|
" mov.f32 %f68, %f72;\n"
|
|
" mov.f32 %f69, %f68;\n"
|
|
" .loc 14 308 0\n"
|
|
" mul.f32 %f66, %f72, %f72;\n"
|
|
" mov.f32 %f67, %f66;\n"
|
|
" bra.uni $Lt_1_17154;\n"
|
|
"$Lt_1_17410:\n"
|
|
" .loc 14 310 0\n"
|
|
" mul.f32 %f73, %f61, %f61;\n"
|
|
" mul.f32 %f74, %f61, %f73;\n"
|
|
" mov.f32 %f66, %f74;\n"
|
|
" mov.f32 %f67, %f66;\n"
|
|
" .loc 14 311 0\n"
|
|
" mov.f32 %f68, %f74;\n"
|
|
" mov.f32 %f69, %f68;\n"
|
|
"$Lt_1_17154:\n"
|
|
"$Lt_1_16642:\n"
|
|
" .loc 14 285 0\n"
|
|
" ld.shared.f32 %f41, [%rd32+0];\n"
|
|
" .loc 14 313 0\n"
|
|
" mul.f32 %f75, %f41, %f66;\n"
|
|
" ld.shared.f32 %f76, [%rd35+12];\n"
|
|
" ld.shared.f32 %f77, [%rd35+8];\n"
|
|
" mul.f32 %f78, %f77, %f68;\n"
|
|
" sub.f32 %f79, %f78, %f76;\n"
|
|
" mul.f32 %f80, %f75, %f79;\n"
|
|
" bra.uni $Lt_1_16130;\n"
|
|
"$Lt_1_16386:\n"
|
|
" .loc 14 315 0\n"
|
|
" mov.f32 %f80, 0f00000000; \n"
|
|
"$Lt_1_16130:\n"
|
|
" ld.param.f32 %f81, [__cudaparm_kernel_pair_fast_cut_coulsq];\n"
|
|
" setp.gt.f32 %p11, %f81, %f58;\n"
|
|
" @!%p11 bra $Lt_1_17922;\n"
|
|
" .loc 14 322 0\n"
|
|
" sqrt.approx.f32 %f82, %f58;\n"
|
|
" ld.param.f32 %f83, [__cudaparm_kernel_pair_fast_g_ewald];\n"
|
|
" mul.f32 %f84, %f83, %f82;\n"
|
|
" mul.f32 %f85, %f84, %f84;\n"
|
|
" mov.f32 %f86, 0f3f800000; \n"
|
|
" mov.f32 %f87, 0f3ea7ba05; \n"
|
|
" mad.f32 %f88, %f87, %f84, %f86;\n"
|
|
" neg.f32 %f89, %f85;\n"
|
|
" rcp.approx.f32 %f90, %f88;\n"
|
|
" mov.f32 %f91, 0f3fb8aa3b; \n"
|
|
" mul.f32 %f92, %f89, %f91;\n"
|
|
" ex2.approx.f32 %f93, %f92;\n"
|
|
" mov.f32 %f94, 0f3e827906; \n"
|
|
" mov.f32 %f95, 0fbe91a98e; \n"
|
|
" mov.f32 %f96, 0f3fb5f0e3; \n"
|
|
" mov.f32 %f97, 0fbfba00e3; \n"
|
|
" mov.f32 %f98, 0f3f87dc22; \n"
|
|
" mad.f32 %f99, %f98, %f90, %f97;\n"
|
|
" mad.f32 %f100, %f90, %f99, %f96;\n"
|
|
" mad.f32 %f101, %f90, %f100, %f95;\n"
|
|
" mad.f32 %f102, %f90, %f101, %f94;\n"
|
|
" mul.f32 %f103, %f90, %f102;\n"
|
|
" mul.f32 %f104, %f93, %f103;\n"
|
|
" mov.f32 %f105, %f104;\n"
|
|
" mov.s32 %r36, %r30;\n"
|
|
" mov.s32 %r37, 0;\n"
|
|
" mov.s32 %r38, 0;\n"
|
|
" mov.s32 %r39, 0;\n"
|
|
" tex.1d.v4.f32.s32 {%f106,%f107,%f108,%f109},[q_tex,{%r36,%r37,%r38,%r39}];\n"
|
|
" .loc 14 323 0\n"
|
|
" mov.f32 %f110, %f106;\n"
|
|
" ld.param.f32 %f111, [__cudaparm_kernel_pair_fast_qqrd2e];\n"
|
|
" mul.f32 %f112, %f111, %f34;\n"
|
|
" mul.f32 %f113, %f112, %f110;\n"
|
|
" div.approx.f32 %f114, %f113, %f82;\n"
|
|
" mov.f32 %f115, %f114;\n"
|
|
" .loc 14 324 0\n"
|
|
" mov.f32 %f116, 0f3f906ebb; \n"
|
|
" mul.f32 %f117, %f84, %f116;\n"
|
|
" mad.f32 %f118, %f93, %f117, %f104;\n"
|
|
" sub.f32 %f119, %f118, %f44;\n"
|
|
" mul.f32 %f120, %f114, %f119;\n"
|
|
" bra.uni $Lt_1_17666;\n"
|
|
"$Lt_1_17922:\n"
|
|
" .loc 14 327 0\n"
|
|
" mov.f32 %f115, 0f00000000; \n"
|
|
" mov.f32 %f120, 0f00000000; \n"
|
|
"$Lt_1_17666:\n"
|
|
" .loc 14 332 0\n"
|
|
" add.f32 %f121, %f120, %f80;\n"
|
|
" mul.f32 %f122, %f121, %f61;\n"
|
|
" mad.f32 %f37, %f54, %f122, %f37;\n"
|
|
" .loc 14 333 0\n"
|
|
" mad.f32 %f36, %f53, %f122, %f36;\n"
|
|
" .loc 14 334 0\n"
|
|
" mad.f32 %f35, %f55, %f122, %f35;\n"
|
|
" @!%p5 bra $Lt_1_18690;\n"
|
|
" .loc 14 337 0\n"
|
|
" mov.f32 %f123, %f105;\n"
|
|
" sub.f32 %f124, %f123, %f44;\n"
|
|
" mad.f32 %f38, %f115, %f124, %f38;\n"
|
|
" @!%p8 bra $Lt_1_18690;\n"
|
|
" .loc 14 339 0\n"
|
|
" add.u64 %rd37, %rd34, %rd7;\n"
|
|
" ld.shared.f32 %f125, [%rd37+12];\n"
|
|
" mov.f32 %f126, %f67;\n"
|
|
" .loc 14 285 0\n"
|
|
" ld.shared.f32 %f41, [%rd32+0];\n"
|
|
" .loc 14 339 0\n"
|
|
" mul.f32 %f127, %f126, %f41;\n"
|
|
" ld.shared.f32 %f128, [%rd37+8];\n"
|
|
" ld.shared.f32 %f129, [%rd37+4];\n"
|
|
" mov.f32 %f130, %f69;\n"
|
|
" mul.f32 %f131, %f129, %f130;\n"
|
|
" sub.f32 %f132, %f131, %f128;\n"
|
|
" mul.f32 %f133, %f127, %f132;\n"
|
|
" sub.f32 %f134, %f133, %f125;\n"
|
|
" add.f32 %f39, %f39, %f134;\n"
|
|
"$Lt_1_18690:\n"
|
|
"$Lt_1_18178:\n"
|
|
" @!%p6 bra $Lt_1_19202;\n"
|
|
" .loc 14 344 0\n"
|
|
" mov.f32 %f135, %f11;\n"
|
|
" mul.f32 %f136, %f54, %f54;\n"
|
|
" mad.f32 %f137, %f122, %f136, %f135;\n"
|
|
" mov.f32 %f11, %f137;\n"
|
|
" .loc 14 345 0\n"
|
|
" mov.f32 %f138, %f13;\n"
|
|
" mad.f32 %f139, %f122, %f56, %f138;\n"
|
|
" mov.f32 %f13, %f139;\n"
|
|
" .loc 14 346 0\n"
|
|
" mov.f32 %f140, %f15;\n"
|
|
" mul.f32 %f141, %f55, %f55;\n"
|
|
" mad.f32 %f142, %f122, %f141, %f140;\n"
|
|
" mov.f32 %f15, %f142;\n"
|
|
" .loc 14 347 0\n"
|
|
" mov.f32 %f143, %f17;\n"
|
|
" mul.f32 %f144, %f53, %f54;\n"
|
|
" mad.f32 %f145, %f122, %f144, %f143;\n"
|
|
" mov.f32 %f17, %f145;\n"
|
|
" .loc 14 348 0\n"
|
|
" mov.f32 %f146, %f19;\n"
|
|
" mul.f32 %f147, %f54, %f55;\n"
|
|
" mad.f32 %f148, %f122, %f147, %f146;\n"
|
|
" mov.f32 %f19, %f148;\n"
|
|
" .loc 14 349 0\n"
|
|
" mul.f32 %f149, %f53, %f55;\n"
|
|
" mad.f32 %f20, %f122, %f149, %f20;\n"
|
|
" mov.f32 %f150, %f20;\n"
|
|
"$Lt_1_19202:\n"
|
|
"$Lt_1_15618:\n"
|
|
" .loc 14 281 0\n"
|
|
" add.u64 %rd25, %rd22, %rd25;\n"
|
|
" setp.gt.u64 %p12, %rd28, %rd25;\n"
|
|
" @%p12 bra $Lt_1_15362;\n"
|
|
" bra.uni $Lt_1_14850;\n"
|
|
"$Lt_1_20994:\n"
|
|
" mov.s32 %r40, 0;\n"
|
|
" setp.gt.s32 %p5, %r22, %r40;\n"
|
|
" mov.s32 %r41, 0;\n"
|
|
" setp.gt.s32 %p6, %r21, %r41;\n"
|
|
"$Lt_1_14850:\n"
|
|
" .loc 14 356 0\n"
|
|
" ld.param.u64 %rd38, [__cudaparm_kernel_pair_fast_engv];\n"
|
|
" add.u64 %rd39, %rd38, %rd18;\n"
|
|
" @!%p5 bra $Lt_1_19970;\n"
|
|
" .loc 14 358 0\n"
|
|
" st.global.f32 [%rd39+0], %f39;\n"
|
|
" .loc 14 359 0\n"
|
|
" cvt.u64.s32 %rd40, %r8;\n"
|
|
" mul.lo.u64 %rd41, %rd40, 4;\n"
|
|
" add.u64 %rd39, %rd41, %rd39;\n"
|
|
" .loc 14 360 0\n"
|
|
" st.global.f32 [%rd39+0], %f38;\n"
|
|
" .loc 14 361 0\n"
|
|
" add.u64 %rd39, %rd41, %rd39;\n"
|
|
"$Lt_1_19970:\n"
|
|
" @!%p6 bra $Lt_1_20482;\n"
|
|
" .loc 14 365 0\n"
|
|
" mov.f32 %f151, %f11;\n"
|
|
" st.global.f32 [%rd39+0], %f151;\n"
|
|
" .loc 14 366 0\n"
|
|
" cvt.u64.s32 %rd42, %r8;\n"
|
|
" mul.lo.u64 %rd43, %rd42, 4;\n"
|
|
" add.u64 %rd39, %rd43, %rd39;\n"
|
|
" .loc 14 365 0\n"
|
|
" mov.f32 %f152, %f13;\n"
|
|
" st.global.f32 [%rd39+0], %f152;\n"
|
|
" .loc 14 366 0\n"
|
|
" add.u64 %rd39, %rd43, %rd39;\n"
|
|
" .loc 14 365 0\n"
|
|
" mov.f32 %f153, %f15;\n"
|
|
" st.global.f32 [%rd39+0], %f153;\n"
|
|
" .loc 14 366 0\n"
|
|
" add.u64 %rd39, %rd43, %rd39;\n"
|
|
" .loc 14 365 0\n"
|
|
" mov.f32 %f154, %f17;\n"
|
|
" st.global.f32 [%rd39+0], %f154;\n"
|
|
" .loc 14 366 0\n"
|
|
" add.u64 %rd39, %rd43, %rd39;\n"
|
|
" .loc 14 365 0\n"
|
|
" mov.f32 %f155, %f19;\n"
|
|
" st.global.f32 [%rd39+0], %f155;\n"
|
|
" add.u64 %rd44, %rd43, %rd39;\n"
|
|
" st.global.f32 [%rd44+0], %f20;\n"
|
|
"$Lt_1_20482:\n"
|
|
" .loc 14 369 0\n"
|
|
" ld.param.u64 %rd45, [__cudaparm_kernel_pair_fast_ans];\n"
|
|
" mul.lo.u64 %rd46, %rd17, 16;\n"
|
|
" add.u64 %rd47, %rd45, %rd46;\n"
|
|
" mov.f32 %f156, %f157;\n"
|
|
" st.global.v4.f32 [%rd47+0], {%f37,%f36,%f35,%f156};\n"
|
|
"$Lt_1_14338:\n"
|
|
" .loc 14 371 0\n"
|
|
" exit;\n"
|
|
"$LDWend_kernel_pair_fast:\n"
|
|
" }\n"
|
|
;
|