/*
 * File:     lammps/lib/gpu/re_squared_lj_ptx.h
 * Size:     3490 lines, 117 KiB
 * Language: C
 */

const char * re_squared_lj =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
" .entry kernel_ellipsoid_sphere (\n"
" .param .u64 __cudaparm_kernel_ellipsoid_sphere_x_,\n"
" .param .u64 __cudaparm_kernel_ellipsoid_sphere_q,\n"
" .param .u64 __cudaparm_kernel_ellipsoid_sphere_shape,\n"
" .param .u64 __cudaparm_kernel_ellipsoid_sphere_well,\n"
" .param .u64 __cudaparm_kernel_ellipsoid_sphere_splj,\n"
" .param .u64 __cudaparm_kernel_ellipsoid_sphere_sig_eps,\n"
" .param .s32 __cudaparm_kernel_ellipsoid_sphere_ntypes,\n"
" .param .u64 __cudaparm_kernel_ellipsoid_sphere_dev_nbor,\n"
" .param .s32 __cudaparm_kernel_ellipsoid_sphere_stride,\n"
" .param .u64 __cudaparm_kernel_ellipsoid_sphere_ans,\n"
" .param .s32 __cudaparm_kernel_ellipsoid_sphere_astride,\n"
" .param .u64 __cudaparm_kernel_ellipsoid_sphere_engv,\n"
" .param .u64 __cudaparm_kernel_ellipsoid_sphere_err_flag,\n"
" .param .s32 __cudaparm_kernel_ellipsoid_sphere_eflag,\n"
" .param .s32 __cudaparm_kernel_ellipsoid_sphere_vflag,\n"
" .param .s32 __cudaparm_kernel_ellipsoid_sphere_inum,\n"
" .param .s32 __cudaparm_kernel_ellipsoid_sphere_t_per_atom)\n"
" {\n"
" .reg .u32 %r<66>;\n"
" .reg .u64 %rd<73>;\n"
" .reg .f32 %f<777>;\n"
" .reg .pred %p<34>;\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32886_33_non_const_sp_lj120[16];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_33120_37_non_const_red_acc136[3584];\n"
" .shared .f32 __cuda_local_var_32892_33_non_const_b_alpha;\n"
" .shared .f32 __cuda_local_var_32892_42_non_const_cr60;\n"
" .shared .f32 __cuda_local_var_32892_48_non_const_solv_f_a;\n"
" .shared .f32 __cuda_local_var_32892_58_non_const_solv_f_r;\n"
" .loc 17 27 0\n"
"$LDWbegin_kernel_ellipsoid_sphere:\n"
" .loc 17 32 0\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_ellipsoid_sphere_splj];\n"
" ldu.global.f32 %f1, [%rd1+0];\n"
" .loc 17 33 0\n"
" ld.global.f32 %f2, [%rd1+4];\n"
" .loc 17 34 0\n"
" ld.global.f32 %f3, [%rd1+8];\n"
" .loc 17 35 0\n"
" ld.global.f32 %f4, [%rd1+12];\n"
" st.shared.v4.f32 [__cuda___cuda_local_var_32886_33_non_const_sp_lj120+0], {%f1,%f2,%f3,%f4};\n"
" .loc 17 38 0\n"
" mov.f32 %f5, 0f3f4db6db; \n"
" st.shared.f32 [__cuda_local_var_32892_33_non_const_b_alpha], %f5;\n"
" .loc 17 39 0\n"
" mov.f32 %f6, 0f42700000; \n"
" lg2.approx.ftz.f32 %f7, %f6;\n"
" mov.f32 %f8, 0f3eaaaaab; \n"
" mul.ftz.f32 %f9, %f7, %f8;\n"
" ex2.approx.ftz.f32 %f10, %f9;\n"
" mov.f32 %f11, 0f42700000; \n"
" mul.ftz.f32 %f12, %f10, %f10;\n"
" div.approx.ftz.f32 %f13, %f11, %f12;\n"
" sub.ftz.f32 %f14, %f10, %f13;\n"
" mov.f32 %f15, 0f3eaaaaab; \n"
" mul.ftz.f32 %f16, %f14, %f15;\n"
" sub.ftz.f32 %f17, %f10, %f16;\n"
" st.shared.f32 [__cuda_local_var_32892_42_non_const_cr60], %f17;\n"
" .loc 21 544 0\n"
" mov.f32 %f18, 0f3f800000; \n"
" mov.f32 %f19, 0fbf52c7ea; \n"
" mov.f32 %f20, 0fc0b59883; \n"
" fma.rn.ftz.f32 %f21, %f18, %f19, %f20;\n"
" mov.f32 %f22, 0f41455dc0; \n"
" mov.f32 %f23, 0f3f800000; \n"
" mov.f32 %f24, 0f41e6bd60; \n"
" fma.rn.ftz.f32 %f25, %f22, %f23, %f24;\n"
" mov.f32 %f26, 0f3f800000; \n"
" mov.f32 %f27, 0fc0d21907; \n"
" fma.rn.ftz.f32 %f28, %f21, %f26, %f27;\n"
" mov.f32 %f29, 0f3f800000; \n"
" mov.f32 %f30, 0f419d92c8; \n"
" fma.rn.ftz.f32 %f31, %f25, %f29, %f30;\n"
" rcp.approx.ftz.f32 %f32, %f31;\n"
" mov.f32 %f33, 0f3f800000; \n"
" fma.rn.ftz.f32 %f34, %f28, %f32, %f33;\n"
" mov.b32 %r1, %f34;\n"
" mov.b32 %f35, %r1;\n"
" mov.f32 %f36, 0f41800000; \n"
" mul.ftz.f32 %f37, %f35, %f36;\n"
" mov.f32 %f38, 0f40400000; \n"
" mov.f32 %f39, 0fc2100000; \n"
" mul.ftz.f32 %f40, %f37, %f39;\n"
" div.approx.ftz.f32 %f41, %f38, %f40;\n"
" .loc 17 40 0\n"
" st.shared.f32 [__cuda_local_var_32892_48_non_const_solv_f_a], %f41;\n"
" .loc 21 544 0\n"
" mov.f32 %f42, 0f40400000; \n"
" mov.f32 %f43, 0f44fd2000; \n"
" mul.ftz.f32 %f44, %f37, %f43;\n"
" div.approx.ftz.f32 %f45, %f42, %f44;\n"
" .loc 17 41 0\n"
" st.shared.f32 [__cuda_local_var_32892_58_non_const_solv_f_r], %f45;\n"
" .loc 17 54 0\n"
" mov.f32 %f46, 0f00000000; \n"
" mov.f32 %f47, %f46;\n"
" mov.f32 %f48, 0f00000000; \n"
" mov.f32 %f49, %f48;\n"
" mov.f32 %f50, 0f00000000; \n"
" mov.f32 %f51, %f50;\n"
" mov.f32 %f52, 0f00000000; \n"
" mov.f32 %f53, %f52;\n"
" mov.f32 %f54, 0f00000000; \n"
" mov.f32 %f55, %f54;\n"
" mov.f32 %f56, 0f00000000; \n"
" mov.f32 %f57, %f56;\n"
" ld.param.s32 %r2, [__cudaparm_kernel_ellipsoid_sphere_t_per_atom];\n"
" cvt.s32.u32 %r3, %tid.x;\n"
" div.s32 %r4, %r3, %r2;\n"
" cvt.s32.u32 %r5, %ntid.x;\n"
" div.s32 %r6, %r5, %r2;\n"
" cvt.s32.u32 %r7, %ctaid.x;\n"
" mul.lo.s32 %r8, %r7, %r6;\n"
" add.s32 %r9, %r4, %r8;\n"
" ld.param.s32 %r10, [__cudaparm_kernel_ellipsoid_sphere_inum];\n"
" setp.le.s32 %p1, %r10, %r9;\n"
" @%p1 bra $Lt_0_73474;\n"
" .loc 17 59 0\n"
" cvt.s64.s32 %rd2, %r9;\n"
" mul.wide.s32 %rd3, %r9, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_kernel_ellipsoid_sphere_dev_nbor];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.s32 %r11, [%rd5+0];\n"
" ld.param.s32 %r12, [__cudaparm_kernel_ellipsoid_sphere_stride];\n"
" cvt.s64.s32 %rd6, %r12;\n"
" mul.wide.s32 %rd7, %r12, 4;\n"
" add.u64 %rd8, %rd7, %rd5;\n"
" ld.global.s32 %r13, [%rd8+0];\n"
" .loc 17 62 0\n"
" cvt.s64.s32 %rd9, %r11;\n"
" mul.wide.s32 %rd10, %r11, 16;\n"
" ld.param.u64 %rd11, [__cudaparm_kernel_ellipsoid_sphere_x_];\n"
" add.u64 %rd12, %rd10, %rd11;\n"
" ld.global.v4.f32 {%f58,%f59,%f60,%f61}, [%rd12+0];\n"
" .loc 17 70 0\n"
" cvt.rzi.ftz.s32.f32 %r14, %f61;\n"
" cvt.s64.s32 %rd13, %r14;\n"
" mul.wide.s32 %rd14, %r14, 16;\n"
" ld.param.u64 %rd15, [__cudaparm_kernel_ellipsoid_sphere_shape];\n"
" add.u64 %rd16, %rd15, %rd14;\n"
" ld.global.v4.f32 {%f62,%f63,%f64,_}, [%rd16+0];\n"
" .loc 17 74 0\n"
" ld.param.u64 %rd17, [__cudaparm_kernel_ellipsoid_sphere_q];\n"
" add.u64 %rd18, %rd17, %rd10;\n"
" ld.global.v4.f32 {%f65,%f66,%f67,%f68}, [%rd18+0];\n"
" .loc 17 75 0\n"
" ld.param.u64 %rd19, [__cudaparm_kernel_ellipsoid_sphere_well];\n"
" add.u64 %rd20, %rd19, %rd14;\n"
" ld.global.v4.f32 {%f69,%f70,%f71,_}, [%rd20+0];\n"
" .loc 17 78 0\n"
" cvt.s32.s64 %r15, %rd6;\n"
" sub.s32 %r16, %r2, 1;\n"
" and.b32 %r17, %r16, %r3;\n"
" add.u64 %rd21, %rd7, %rd8;\n"
" mul.lo.s32 %r18, %r15, %r17;\n"
" cvt.s64.s32 %rd22, %r18;\n"
" mul.wide.s32 %rd23, %r18, 4;\n"
" add.u64 %rd24, %rd21, %rd23;\n"
" mov.s64 %rd25, %rd24;\n"
" mul.lo.s32 %r19, %r15, %r13;\n"
" cvt.s64.s32 %rd26, %r19;\n"
" mul.wide.s32 %rd27, %r19, 4;\n"
" add.u64 %rd28, %rd21, %rd27;\n"
" setp.ge.u64 %p2, %rd24, %rd28;\n"
" @%p2 bra $Lt_0_75266;\n"
" ld.param.s32 %r20, [__cudaparm_kernel_ellipsoid_sphere_vflag];\n"
" mov.s32 %r21, 0;\n"
" setp.gt.s32 %p3, %r20, %r21;\n"
" add.ftz.f32 %f72, %f66, %f66;\n"
" add.ftz.f32 %f73, %f68, %f68;\n"
" mul.ftz.f32 %f74, %f65, %f65;\n"
" mul.ftz.f32 %f75, %f66, %f66;\n"
" mul.ftz.f32 %f76, %f67, %f67;\n"
" mul.ftz.f32 %f77, %f68, %f68;\n"
" add.ftz.f32 %f78, %f67, %f67;\n"
" mul.ftz.f32 %f79, %f62, %f63;\n"
" add.ftz.f32 %f80, %f62, %f62;\n"
" add.ftz.f32 %f81, %f63, %f63;\n"
" add.ftz.f32 %f82, %f64, %f64;\n"
" ld.param.s32 %r22, [__cudaparm_kernel_ellipsoid_sphere_ntypes];\n"
" mul.lo.s32 %r23, %r22, %r14;\n"
" mul.ftz.f32 %f83, %f72, %f67;\n"
" mul.ftz.f32 %f84, %f72, %f68;\n"
" mul.ftz.f32 %f85, %f72, %f65;\n"
" mul.ftz.f32 %f86, %f73, %f65;\n"
" add.ftz.f32 %f87, %f74, %f75;\n"
" sub.ftz.f32 %f88, %f74, %f75;\n"
" mul.ftz.f32 %f89, %f78, %f65;\n"
" mul.ftz.f32 %f90, %f78, %f68;\n"
" mul.ftz.f32 %f91, %f79, %f64;\n"
" sub.ftz.f32 %f92, %f83, %f86;\n"
" add.ftz.f32 %f93, %f83, %f86;\n"
" sub.ftz.f32 %f94, %f86, %f83;\n"
" sub.ftz.f32 %f95, %f87, %f76;\n"
" add.ftz.f32 %f96, %f76, %f88;\n"
" sub.ftz.f32 %f97, %f88, %f76;\n"
" add.ftz.f32 %f98, %f84, %f89;\n"
" sub.ftz.f32 %f99, %f84, %f89;\n"
" sub.ftz.f32 %f100, %f89, %f84;\n"
" sub.ftz.f32 %f101, %f90, %f85;\n"
" add.ftz.f32 %f102, %f85, %f90;\n"
" sub.ftz.f32 %f103, %f85, %f90;\n"
" mul.ftz.f32 %f104, %f92, %f70;\n"
" mul.ftz.f32 %f105, %f93, %f69;\n"
" neg.ftz.f32 %f106, %f93;\n"
" sub.ftz.f32 %f107, %f95, %f77;\n"
" sub.ftz.f32 %f108, %f77, %f95;\n"
" sub.ftz.f32 %f109, %f96, %f77;\n"
" sub.ftz.f32 %f110, %f77, %f96;\n"
" add.ftz.f32 %f111, %f77, %f97;\n"
" mul.ftz.f32 %f112, %f98, %f71;\n"
" neg.ftz.f32 %f113, %f98;\n"
" mul.ftz.f32 %f114, %f99, %f69;\n"
" mul.ftz.f32 %f115, %f101, %f71;\n"
" mul.ftz.f32 %f116, %f102, %f70;\n"
" mul.ftz.f32 %f117, %f92, %f104;\n"
" mul.ftz.f32 %f118, %f102, %f104;\n"
" mul.ftz.f32 %f119, %f107, %f69;\n"
" mul.ftz.f32 %f120, %f104, %f109;\n"
" mul.ftz.f32 %f121, %f109, %f70;\n"
" mul.ftz.f32 %f122, %f111, %f71;\n"
" neg.ftz.f32 %f123, %f111;\n"
" mul.ftz.f32 %f124, %f92, %f116;\n"
" mul.ftz.f32 %f125, %f109, %f116;\n"
" mul.ftz.f32 %f126, %f102, %f116;\n"
" fma.rn.ftz.f32 %f127, %f107, %f119, %f117;\n"
" fma.rn.ftz.f32 %f128, %f119, %f99, %f118;\n"
" fma.rn.ftz.f32 %f129, %f119, %f93, %f120;\n"
" mul.ftz.f32 %f130, %f92, %f121;\n"
" mul.ftz.f32 %f131, %f109, %f121;\n"
" mul.ftz.f32 %f132, %f102, %f121;\n"
" fma.rn.ftz.f32 %f133, %f107, %f114, %f124;\n"
" fma.rn.ftz.f32 %f134, %f93, %f114, %f125;\n"
" fma.rn.ftz.f32 %f135, %f99, %f114, %f126;\n"
" fma.rn.ftz.f32 %f136, %f112, %f98, %f127;\n"
" fma.rn.ftz.f32 %f137, %f112, %f111, %f128;\n"
" fma.rn.ftz.f32 %f138, %f112, %f101, %f129;\n"
" fma.rn.ftz.f32 %f139, %f107, %f105, %f130;\n"
" fma.rn.ftz.f32 %f140, %f93, %f105, %f131;\n"
" fma.rn.ftz.f32 %f141, %f99, %f105, %f132;\n"
" fma.rn.ftz.f32 %f142, %f98, %f122, %f133;\n"
" fma.rn.ftz.f32 %f143, %f101, %f122, %f134;\n"
" fma.rn.ftz.f32 %f144, %f111, %f122, %f135;\n"
" mov.f32 %f145, 0f3f800000; \n"
" add.ftz.f32 %f146, %f136, %f145;\n"
" fma.rn.ftz.f32 %f147, %f115, %f98, %f139;\n"
" fma.rn.ftz.f32 %f148, %f115, %f101, %f140;\n"
" fma.rn.ftz.f32 %f149, %f115, %f111, %f141;\n"
" abs.ftz.f32 %f150, %f142;\n"
" mov.f32 %f151, 0f3f800000; \n"
" add.ftz.f32 %f152, %f144, %f151;\n"
" abs.ftz.f32 %f153, %f146;\n"
" abs.ftz.f32 %f154, %f147;\n"
" mov.f32 %f155, 0f3f800000; \n"
" add.ftz.f32 %f156, %f148, %f155;\n"
" setp.lt.ftz.f32 %p4, %f153, %f154;\n"
" ld.param.u64 %rd29, [__cudaparm_kernel_ellipsoid_sphere_sig_eps];\n"
" mov.f32 %f157, 0f00000000; \n"
" mov.f32 %f158, 0f00000000; \n"
" mov.f32 %f159, 0f00000000; \n"
" mov.f32 %f160, 0f00000000; \n"
" mov.f32 %f161, 0f00000000; \n"
" mov.f32 %f162, 0f00000000; \n"
" mov.f32 %f163, 0f00000000; \n"
" mov.u64 %rd30, __cuda___cuda_local_var_32886_33_non_const_sp_lj120;\n"
"$Lt_0_51970:\n"
" .loc 17 83 0\n"
" ld.global.s32 %r24, [%rd25+0];\n"
" .loc 17 87 0\n"
" and.b32 %r25, %r24, 1073741823;\n"
" cvt.s64.s32 %rd31, %r25;\n"
" mul.wide.s32 %rd32, %r25, 16;\n"
" add.u64 %rd33, %rd11, %rd32;\n"
" ld.global.v4.f32 {%f164,%f165,%f166,%f167}, [%rd33+0];\n"
" .loc 17 98 0\n"
" sub.ftz.f32 %f168, %f165, %f59;\n"
" sub.ftz.f32 %f169, %f164, %f58;\n"
" sub.ftz.f32 %f170, %f166, %f60;\n"
" mul.ftz.f32 %f171, %f168, %f168;\n"
" fma.rn.ftz.f32 %f172, %f169, %f169, %f171;\n"
" fma.rn.ftz.f32 %f173, %f170, %f170, %f172;\n"
" rsqrt.approx.ftz.f32 %f174, %f173;\n"
" mul.ftz.f32 %f175, %f169, %f174;\n"
" .loc 17 99 0\n"
" mul.ftz.f32 %f176, %f168, %f174;\n"
" .loc 17 104 0\n"
" cvt.rzi.ftz.s32.f32 %r26, %f167;\n"
" add.s32 %r27, %r26, %r23;\n"
" cvt.s64.s32 %rd34, %r27;\n"
" mul.wide.s32 %rd35, %r27, 8;\n"
" add.u64 %rd36, %rd29, %rd35;\n"
" ld.global.v2.f32 {%f177,%f178}, [%rd36+0];\n"
" .loc 17 105 0\n"
" shr.s32 %r28, %r24, 30;\n"
" and.b32 %r29, %r28, 3;\n"
" cvt.s64.s32 %rd37, %r29;\n"
" mul.wide.s32 %rd38, %r29, 4;\n"
" add.u64 %rd39, %rd30, %rd38;\n"
" ld.shared.f32 %f179, [%rd39+0];\n"
" mul.ftz.f32 %f180, %f179, %f178;\n"
" .loc 16 299 0\n"
" mov.f32 %f181, %f175;\n"
" .loc 16 300 0\n"
" mov.f32 %f182, 0f3f000000; \n"
" mul.ftz.f32 %f183, %f177, %f182;\n"
" add.ftz.f32 %f184, %f183, %f63;\n"
" add.ftz.f32 %f185, %f183, %f62;\n"
" add.ftz.f32 %f186, %f183, %f64;\n"
" mul.ftz.f32 %f187, %f184, %f184;\n"
" mul.ftz.f32 %f188, %f185, %f185;\n"
" mul.ftz.f32 %f189, %f186, %f186;\n"
" mov.f32 %f190, 0f3f000000; \n"
" mul.ftz.f32 %f191, %f187, %f190;\n"
" mov.f32 %f192, 0f3f000000; \n"
" mul.ftz.f32 %f193, %f188, %f192;\n"
" mov.f32 %f194, 0f3f000000; \n"
" mul.ftz.f32 %f195, %f189, %f194;\n"
" mul.ftz.f32 %f196, %f92, %f191;\n"
" mul.ftz.f32 %f197, %f98, %f195;\n"
" mul.ftz.f32 %f198, %f193, %f107;\n"
" mul.ftz.f32 %f199, %f92, %f196;\n"
" fma.rn.ftz.f32 %f200, %f107, %f198, %f199;\n"
" fma.rn.ftz.f32 %f201, %f197, %f98, %f200;\n"
" mov.f32 %f202, %f201;\n"
" .loc 16 301 0\n"
" mul.ftz.f32 %f203, %f109, %f196;\n"
" fma.rn.ftz.f32 %f204, %f198, %f93, %f203;\n"
" fma.rn.ftz.f32 %f205, %f197, %f101, %f204;\n"
" mov.f32 %f206, %f205;\n"
" .loc 16 302 0\n"
" mul.ftz.f32 %f207, %f102, %f196;\n"
" fma.rn.ftz.f32 %f208, %f198, %f99, %f207;\n"
" fma.rn.ftz.f32 %f209, %f197, %f111, %f208;\n"
" mov.f32 %f210, %f209;\n"
" .loc 16 303 0\n"
" mov.f32 %f211, %f176;\n"
" .loc 16 304 0\n"
" mul.ftz.f32 %f212, %f93, %f193;\n"
" mul.ftz.f32 %f213, %f101, %f195;\n"
" mul.ftz.f32 %f214, %f191, %f109;\n"
" mul.ftz.f32 %f215, %f92, %f214;\n"
" fma.rn.ftz.f32 %f216, %f107, %f212, %f215;\n"
" fma.rn.ftz.f32 %f217, %f213, %f98, %f216;\n"
" mov.f32 %f218, %f217;\n"
" .loc 16 305 0\n"
" mul.ftz.f32 %f219, %f109, %f214;\n"
" fma.rn.ftz.f32 %f220, %f93, %f212, %f219;\n"
" fma.rn.ftz.f32 %f221, %f213, %f101, %f220;\n"
" mov.f32 %f222, %f221;\n"
" .loc 16 306 0\n"
" mul.ftz.f32 %f223, %f102, %f214;\n"
" fma.rn.ftz.f32 %f224, %f99, %f212, %f223;\n"
" fma.rn.ftz.f32 %f225, %f213, %f111, %f224;\n"
" mov.f32 %f226, %f225;\n"
" .loc 16 307 0\n"
" mul.ftz.f32 %f227, %f170, %f174;\n"
" mov.f32 %f228, %f227;\n"
" .loc 16 308 0\n"
" mul.ftz.f32 %f229, %f102, %f191;\n"
" mul.ftz.f32 %f230, %f99, %f193;\n"
" mul.ftz.f32 %f231, %f195, %f111;\n"
" mul.ftz.f32 %f232, %f92, %f229;\n"
" fma.rn.ftz.f32 %f233, %f107, %f230, %f232;\n"
" fma.rn.ftz.f32 %f234, %f98, %f231, %f233;\n"
" mov.f32 %f235, %f234;\n"
" .loc 16 309 0\n"
" mul.ftz.f32 %f236, %f109, %f229;\n"
" fma.rn.ftz.f32 %f237, %f93, %f230, %f236;\n"
" fma.rn.ftz.f32 %f238, %f101, %f231, %f237;\n"
" mov.f32 %f239, %f238;\n"
" .loc 16 310 0\n"
" mul.ftz.f32 %f240, %f102, %f229;\n"
" fma.rn.ftz.f32 %f241, %f99, %f230, %f240;\n"
" fma.rn.ftz.f32 %f242, %f111, %f231, %f241;\n"
" mov.f32 %f243, %f242;\n"
" abs.ftz.f32 %f244, %f217;\n"
" abs.ftz.f32 %f245, %f201;\n"
" setp.gt.ftz.f32 %p5, %f244, %f245;\n"
" @!%p5 bra $Lt_0_52226;\n"
" .loc 16 314 0\n"
" mov.f32 %f202, %f217;\n"
" mov.f32 %f218, %f201;\n"
" .loc 16 315 0\n"
" mov.f32 %f206, %f221;\n"
" mov.f32 %f222, %f205;\n"
" .loc 16 316 0\n"
" mov.f32 %f210, %f225;\n"
" mov.f32 %f226, %f209;\n"
" .loc 16 317 0\n"
" mov.f32 %f181, %f176;\n"
" mov.f32 %f211, %f175;\n"
"$Lt_0_52226:\n"
" mov.f32 %f246, %f202;\n"
" abs.ftz.f32 %f247, %f246;\n"
" abs.ftz.f32 %f248, %f234;\n"
" setp.lt.ftz.f32 %p6, %f247, %f248;\n"
" @!%p6 bra $Lt_0_52738;\n"
" .loc 16 321 0\n"
" mov.f32 %f202, %f234;\n"
" mov.f32 %f235, %f246;\n"
" .loc 16 322 0\n"
" mov.f32 %f249, %f206;\n"
" mov.f32 %f206, %f238;\n"
" mov.f32 %f239, %f249;\n"
" .loc 16 323 0\n"
" mov.f32 %f250, %f210;\n"
" mov.f32 %f210, %f242;\n"
" mov.f32 %f243, %f250;\n"
" .loc 16 324 0\n"
" mov.f32 %f251, %f181;\n"
" mov.f32 %f181, %f227;\n"
" mov.f32 %f228, %f251;\n"
"$Lt_0_52738:\n"
" mov.f32 %f252, %f202;\n"
" mov.f32 %f253, 0f00000000; \n"
" setp.neu.ftz.f32 %p7, %f252, %f253;\n"
" @!%p7 bra $Lt_0_53506;\n"
" bra.uni $Lt_0_54274;\n"
"$Lt_0_53506:\n"
" mov.f32 %f254, 0f00000000; \n"
" setp.neu.ftz.f32 %p8, %f218, %f254;\n"
" @!%p8 bra $Lt_0_54018;\n"
" .loc 16 338 0\n"
" mov.f32 %f202, %f218;\n"
" mov.f32 %f218, %f252;\n"
" .loc 16 339 0\n"
" mov.f32 %f255, %f206;\n"
" mov.f32 %f206, %f222;\n"
" mov.f32 %f222, %f255;\n"
" .loc 16 340 0\n"
" mov.f32 %f256, %f210;\n"
" mov.f32 %f210, %f226;\n"
" mov.f32 %f226, %f256;\n"
" .loc 16 341 0\n"
" mov.f32 %f257, %f181;\n"
" mov.f32 %f181, %f211;\n"
" mov.f32 %f211, %f257;\n"
" bra.uni $Lt_0_54274;\n"
"$Lt_0_54018:\n"
" mov.f32 %f258, 0f00000000; \n"
" setp.neu.ftz.f32 %p9, %f235, %f258;\n"
" @!%p9 bra $Lt_0_54530;\n"
" .loc 16 346 0\n"
" mov.f32 %f202, %f235;\n"
" mov.f32 %f235, %f252;\n"
" .loc 16 347 0\n"
" mov.f32 %f259, %f206;\n"
" mov.f32 %f206, %f239;\n"
" mov.f32 %f239, %f259;\n"
" .loc 16 348 0\n"
" mov.f32 %f260, %f210;\n"
" mov.f32 %f210, %f243;\n"
" mov.f32 %f243, %f260;\n"
" .loc 16 349 0\n"
" mov.f32 %f261, %f181;\n"
" mov.f32 %f181, %f228;\n"
" mov.f32 %f228, %f261;\n"
" bra.uni $Lt_0_54274;\n"
"$Lt_0_54530:\n"
" .loc 16 352 0\n"
" mov.s32 %r30, 2;\n"
" ld.param.u64 %rd40, [__cudaparm_kernel_ellipsoid_sphere_err_flag];\n"
" st.global.s32 [%rd40+0], %r30;\n"
"$Lt_0_54274:\n"
"$Lt_0_53762:\n"
"$Lt_0_53250:\n"
" .loc 16 355 0\n"
" div.approx.ftz.f32 %f262, %f218, %f202;\n"
" mul.ftz.f32 %f263, %f206, %f262;\n"
" sub.ftz.f32 %f264, %f222, %f263;\n"
" mov.f32 %f222, %f264;\n"
" .loc 16 356 0\n"
" mul.ftz.f32 %f265, %f210, %f262;\n"
" sub.ftz.f32 %f266, %f226, %f265;\n"
" mov.f32 %f226, %f266;\n"
" .loc 16 357 0\n"
" mul.ftz.f32 %f267, %f181, %f262;\n"
" sub.ftz.f32 %f268, %f211, %f267;\n"
" mov.f32 %f211, %f268;\n"
" .loc 16 359 0\n"
" div.approx.ftz.f32 %f269, %f235, %f202;\n"
" mul.ftz.f32 %f270, %f206, %f269;\n"
" sub.ftz.f32 %f239, %f239, %f270;\n"
" .loc 16 360 0\n"
" mul.ftz.f32 %f271, %f210, %f269;\n"
" sub.ftz.f32 %f243, %f243, %f271;\n"
" .loc 16 361 0\n"
" mul.ftz.f32 %f272, %f181, %f269;\n"
" sub.ftz.f32 %f228, %f228, %f272;\n"
" abs.ftz.f32 %f273, %f264;\n"
" abs.ftz.f32 %f274, %f239;\n"
" setp.lt.ftz.f32 %p10, %f273, %f274;\n"
" @!%p10 bra $Lt_0_54786;\n"
" .loc 16 366 0\n"
" mov.f32 %f222, %f239;\n"
" mov.f32 %f239, %f264;\n"
" .loc 16 367 0\n"
" mov.f32 %f226, %f243;\n"
" mov.f32 %f243, %f266;\n"
" .loc 16 368 0\n"
" mov.f32 %f211, %f228;\n"
" mov.f32 %f228, %f268;\n"
"$Lt_0_54786:\n"
" mov.f32 %f275, %f222;\n"
" mov.f32 %f276, 0f00000000; \n"
" setp.neu.ftz.f32 %p11, %f275, %f276;\n"
" @!%p11 bra $Lt_0_55554;\n"
" bra.uni $Lt_0_55810;\n"
"$Lt_0_55554:\n"
" mov.f32 %f277, 0f00000000; \n"
" setp.neu.ftz.f32 %p12, %f239, %f277;\n"
" @!%p12 bra $Lt_0_55810;\n"
" .loc 16 383 0\n"
" mov.f32 %f222, %f239;\n"
" mov.f32 %f239, %f275;\n"
" .loc 16 384 0\n"
" mov.f32 %f278, %f226;\n"
" mov.f32 %f226, %f243;\n"
" mov.f32 %f243, %f278;\n"
" .loc 16 385 0\n"
" mov.f32 %f279, %f211;\n"
" mov.f32 %f211, %f228;\n"
" mov.f32 %f228, %f279;\n"
"$Lt_0_55810:\n"
"$Lt_0_55298:\n"
" .loc 16 390 0\n"
" div.approx.ftz.f32 %f280, %f239, %f222;\n"
" mul.ftz.f32 %f281, %f226, %f280;\n"
" sub.ftz.f32 %f243, %f243, %f281;\n"
" .loc 16 391 0\n"
" mul.ftz.f32 %f282, %f211, %f280;\n"
" sub.ftz.f32 %f228, %f228, %f282;\n"
" mov.f32 %f283, 0f00000000; \n"
" setp.eq.ftz.f32 %p13, %f243, %f283;\n"
" @!%p13 bra $Lt_0_56322;\n"
" .loc 16 394 0\n"
" mov.s32 %r31, 2;\n"
" ld.param.u64 %rd41, [__cudaparm_kernel_ellipsoid_sphere_err_flag];\n"
" st.global.s32 [%rd41+0], %r31;\n"
"$Lt_0_56322:\n"
" .loc 16 396 0\n"
" div.approx.ftz.f32 %f284, %f228, %f243;\n"
" .loc 16 399 0\n"
" mul.ftz.f32 %f285, %f284, %f226;\n"
" sub.ftz.f32 %f286, %f211, %f285;\n"
" div.approx.ftz.f32 %f287, %f286, %f222;\n"
" .loc 16 403 0\n"
" mul.ftz.f32 %f288, %f287, %f206;\n"
" fma.rn.ftz.f32 %f289, %f210, %f284, %f288;\n"
" sub.ftz.f32 %f290, %f181, %f289;\n"
" div.approx.ftz.f32 %f291, %f290, %f202;\n"
" .loc 17 124 0\n"
" mul.ftz.f32 %f292, %f287, %f176;\n"
" fma.rn.ftz.f32 %f293, %f175, %f291, %f292;\n"
" fma.rn.ftz.f32 %f294, %f227, %f284, %f293;\n"
" mov.f32 %f295, 0f3f000000; \n"
" mul.ftz.f32 %f296, %f294, %f295;\n"
" rsqrt.approx.ftz.f32 %f297, %f296;\n"
" .loc 16 299 0\n"
" mov.f32 %f181, %f175;\n"
" .loc 16 300 0\n"
" mov.f32 %f202, %f146;\n"
" .loc 16 301 0\n"
" mov.f32 %f206, %f138;\n"
" .loc 16 302 0\n"
" mov.f32 %f210, %f137;\n"
" .loc 16 303 0\n"
" mov.f32 %f211, %f176;\n"
" .loc 16 304 0\n"
" mov.f32 %f218, %f147;\n"
" .loc 16 305 0\n"
" mov.f32 %f222, %f156;\n"
" .loc 16 306 0\n"
" mov.f32 %f226, %f149;\n"
" .loc 16 307 0\n"
" mov.f32 %f228, %f227;\n"
" .loc 16 308 0\n"
" mov.f32 %f235, %f142;\n"
" .loc 16 309 0\n"
" mov.f32 %f239, %f143;\n"
" .loc 16 310 0\n"
" mov.f32 %f243, %f152;\n"
" @!%p4 bra $Lt_0_56834;\n"
" .loc 16 314 0\n"
" mov.f32 %f202, %f147;\n"
" mov.f32 %f218, %f146;\n"
" .loc 16 315 0\n"
" mov.f32 %f206, %f156;\n"
" mov.f32 %f222, %f138;\n"
" .loc 16 316 0\n"
" mov.f32 %f210, %f149;\n"
" mov.f32 %f226, %f137;\n"
" .loc 16 317 0\n"
" mov.f32 %f181, %f176;\n"
" mov.f32 %f211, %f175;\n"
"$Lt_0_56834:\n"
" mov.f32 %f298, %f202;\n"
" abs.ftz.f32 %f299, %f298;\n"
" setp.gt.ftz.f32 %p14, %f150, %f299;\n"
" @!%p14 bra $Lt_0_57346;\n"
" .loc 16 321 0\n"
" mov.f32 %f202, %f142;\n"
" mov.f32 %f235, %f298;\n"
" .loc 16 322 0\n"
" mov.f32 %f300, %f206;\n"
" mov.f32 %f206, %f143;\n"
" mov.f32 %f239, %f300;\n"
" .loc 16 323 0\n"
" mov.f32 %f301, %f210;\n"
" mov.f32 %f210, %f152;\n"
" mov.f32 %f243, %f301;\n"
" .loc 16 324 0\n"
" mov.f32 %f302, %f181;\n"
" mov.f32 %f181, %f227;\n"
" mov.f32 %f228, %f302;\n"
"$Lt_0_57346:\n"
" mov.f32 %f303, %f202;\n"
" mov.f32 %f304, 0f00000000; \n"
" setp.neu.ftz.f32 %p15, %f303, %f304;\n"
" @!%p15 bra $Lt_0_58114;\n"
" bra.uni $Lt_0_58882;\n"
"$Lt_0_58114:\n"
" mov.f32 %f305, 0f00000000; \n"
" setp.neu.ftz.f32 %p16, %f218, %f305;\n"
" @!%p16 bra $Lt_0_58626;\n"
" .loc 16 338 0\n"
" mov.f32 %f202, %f218;\n"
" mov.f32 %f218, %f303;\n"
" .loc 16 339 0\n"
" mov.f32 %f306, %f206;\n"
" mov.f32 %f206, %f222;\n"
" mov.f32 %f222, %f306;\n"
" .loc 16 340 0\n"
" mov.f32 %f307, %f210;\n"
" mov.f32 %f210, %f226;\n"
" mov.f32 %f226, %f307;\n"
" .loc 16 341 0\n"
" mov.f32 %f308, %f181;\n"
" mov.f32 %f181, %f211;\n"
" mov.f32 %f211, %f308;\n"
" bra.uni $Lt_0_58882;\n"
"$Lt_0_58626:\n"
" mov.f32 %f309, 0f00000000; \n"
" setp.neu.ftz.f32 %p17, %f235, %f309;\n"
" @!%p17 bra $Lt_0_59138;\n"
" .loc 16 346 0\n"
" mov.f32 %f202, %f235;\n"
" mov.f32 %f235, %f303;\n"
" .loc 16 347 0\n"
" mov.f32 %f310, %f206;\n"
" mov.f32 %f206, %f239;\n"
" mov.f32 %f239, %f310;\n"
" .loc 16 348 0\n"
" mov.f32 %f311, %f210;\n"
" mov.f32 %f210, %f243;\n"
" mov.f32 %f243, %f311;\n"
" .loc 16 349 0\n"
" mov.f32 %f312, %f181;\n"
" mov.f32 %f181, %f228;\n"
" mov.f32 %f228, %f312;\n"
" bra.uni $Lt_0_58882;\n"
"$Lt_0_59138:\n"
" .loc 16 352 0\n"
" mov.s32 %r32, 2;\n"
" ld.param.u64 %rd42, [__cudaparm_kernel_ellipsoid_sphere_err_flag];\n"
" st.global.s32 [%rd42+0], %r32;\n"
"$Lt_0_58882:\n"
"$Lt_0_58370:\n"
"$Lt_0_57858:\n"
" .loc 16 355 0\n"
" div.approx.ftz.f32 %f313, %f218, %f202;\n"
" mul.ftz.f32 %f314, %f206, %f313;\n"
" sub.ftz.f32 %f315, %f222, %f314;\n"
" mov.f32 %f222, %f315;\n"
" .loc 16 356 0\n"
" mul.ftz.f32 %f316, %f210, %f313;\n"
" sub.ftz.f32 %f317, %f226, %f316;\n"
" mov.f32 %f226, %f317;\n"
" .loc 16 357 0\n"
" mul.ftz.f32 %f318, %f181, %f313;\n"
" sub.ftz.f32 %f319, %f211, %f318;\n"
" mov.f32 %f211, %f319;\n"
" .loc 16 359 0\n"
" div.approx.ftz.f32 %f320, %f235, %f202;\n"
" mul.ftz.f32 %f321, %f206, %f320;\n"
" sub.ftz.f32 %f239, %f239, %f321;\n"
" .loc 16 360 0\n"
" mul.ftz.f32 %f322, %f210, %f320;\n"
" sub.ftz.f32 %f243, %f243, %f322;\n"
" .loc 16 361 0\n"
" mul.ftz.f32 %f323, %f181, %f320;\n"
" sub.ftz.f32 %f228, %f228, %f323;\n"
" abs.ftz.f32 %f324, %f315;\n"
" abs.ftz.f32 %f325, %f239;\n"
" setp.lt.ftz.f32 %p18, %f324, %f325;\n"
" @!%p18 bra $Lt_0_59394;\n"
" .loc 16 366 0\n"
" mov.f32 %f222, %f239;\n"
" mov.f32 %f239, %f315;\n"
" .loc 16 367 0\n"
" mov.f32 %f226, %f243;\n"
" mov.f32 %f243, %f317;\n"
" .loc 16 368 0\n"
" mov.f32 %f211, %f228;\n"
" mov.f32 %f228, %f319;\n"
"$Lt_0_59394:\n"
" mov.f32 %f326, %f222;\n"
" mov.f32 %f327, 0f00000000; \n"
" setp.neu.ftz.f32 %p19, %f326, %f327;\n"
" @!%p19 bra $Lt_0_60162;\n"
" bra.uni $Lt_0_60418;\n"
"$Lt_0_60162:\n"
" mov.f32 %f328, 0f00000000; \n"
" setp.neu.ftz.f32 %p20, %f239, %f328;\n"
" @!%p20 bra $Lt_0_60418;\n"
" .loc 16 383 0\n"
" mov.f32 %f222, %f239;\n"
" mov.f32 %f239, %f326;\n"
" .loc 16 384 0\n"
" mov.f32 %f329, %f226;\n"
" mov.f32 %f226, %f243;\n"
" mov.f32 %f243, %f329;\n"
" .loc 16 385 0\n"
" mov.f32 %f330, %f211;\n"
" mov.f32 %f211, %f228;\n"
" mov.f32 %f228, %f330;\n"
"$Lt_0_60418:\n"
"$Lt_0_59906:\n"
" .loc 16 390 0\n"
" div.approx.ftz.f32 %f331, %f239, %f222;\n"
" mul.ftz.f32 %f332, %f226, %f331;\n"
" sub.ftz.f32 %f243, %f243, %f332;\n"
" .loc 16 391 0\n"
" mul.ftz.f32 %f333, %f211, %f331;\n"
" sub.ftz.f32 %f228, %f228, %f333;\n"
" mov.f32 %f334, 0f00000000; \n"
" setp.eq.ftz.f32 %p21, %f243, %f334;\n"
" @!%p21 bra $Lt_0_60930;\n"
" .loc 16 394 0\n"
" mov.s32 %r33, 2;\n"
" ld.param.u64 %rd43, [__cudaparm_kernel_ellipsoid_sphere_err_flag];\n"
" st.global.s32 [%rd43+0], %r33;\n"
"$Lt_0_60930:\n"
" .loc 17 133 0\n"
" div.approx.ftz.f32 %f335, %f228, %f243;\n"
" mul.ftz.f32 %f336, %f335, %f226;\n"
" sub.ftz.f32 %f337, %f211, %f336;\n"
" div.approx.ftz.f32 %f338, %f337, %f222;\n"
" mul.ftz.f32 %f339, %f338, %f206;\n"
" fma.rn.ftz.f32 %f340, %f210, %f335, %f339;\n"
" mul.ftz.f32 %f341, %f338, %f176;\n"
" sub.ftz.f32 %f342, %f181, %f340;\n"
" div.approx.ftz.f32 %f343, %f342, %f202;\n"
" fma.rn.ftz.f32 %f344, %f175, %f343, %f341;\n"
" fma.rn.ftz.f32 %f345, %f227, %f335, %f344;\n"
" add.ftz.f32 %f346, %f345, %f345;\n"
" .loc 17 141 0\n"
" rcp.approx.ftz.f32 %f347, %f174;\n"
" sub.ftz.f32 %f348, %f347, %f297;\n"
" mov.f32 %f349, 0f3f000000; \n"
" mul.ftz.f32 %f350, %f348, %f349;\n"
" mul.ftz.f32 %f351, %f348, %f348;\n"
" mul.ftz.f32 %f352, %f348, %f351;\n"
" add.ftz.f32 %f353, %f350, %f64;\n"
" add.ftz.f32 %f354, %f350, %f62;\n"
" add.ftz.f32 %f355, %f350, %f63;\n"
" mul.ftz.f32 %f356, %f354, %f355;\n"
" mul.ftz.f32 %f357, %f353, %f356;\n"
" mul.ftz.f32 %f358, %f352, %f357;\n"
" .loc 17 142 0\n"
" div.approx.ftz.f32 %f359, %f177, %f348;\n"
" mul.ftz.f32 %f360, %f359, %f346;\n"
" mov.f32 %f361, 0f3f800000; \n"
" mov.f32 %f362, 0f40400000; \n"
" fma.rn.ftz.f32 %f363, %f362, %f360, %f361;\n"
" mul.ftz.f32 %f364, %f91, %f363;\n"
" .loc 17 146 0\n"
" div.approx.ftz.f32 %f365, %f348, %f17;\n"
" add.ftz.f32 %f366, %f365, %f64;\n"
" add.ftz.f32 %f367, %f365, %f62;\n"
" add.ftz.f32 %f368, %f365, %f63;\n"
" mul.ftz.f32 %f369, %f367, %f368;\n"
" mul.ftz.f32 %f370, %f366, %f369;\n"
" mul.ftz.f32 %f371, %f352, %f370;\n"
" .loc 17 148 0\n"
" mov.f32 %f372, 0f3f800000; \n"
" mov.f32 %f373, 0f3f4db6db; \n"
" fma.rn.ftz.f32 %f374, %f373, %f360, %f372;\n"
" mul.ftz.f32 %f375, %f91, %f374;\n"
" .loc 17 150 0\n"
" mul.ftz.f32 %f376, %f359, %f359;\n"
" mul.ftz.f32 %f377, %f359, %f376;\n"
" mul.ftz.f32 %f378, %f377, %f377;\n"
" .loc 17 153 0\n"
" mul.ftz.f32 %f379, %f177, %f177;\n"
" mov.f32 %f380, 0f41000000; \n"
" div.approx.ftz.f32 %f381, %f358, %f380;\n"
" mov.f32 %f382, 0f42700000; \n"
" div.approx.ftz.f32 %f383, %f371, %f382;\n"
" mul.ftz.f32 %f384, %f379, %f177;\n"
" div.approx.ftz.f32 %f385, %f364, %f381;\n"
" div.approx.ftz.f32 %f386, %f375, %f383;\n"
" mul.ftz.f32 %f387, %f385, %f180;\n"
" mul.ftz.f32 %f388, %f386, %f180;\n"
" mul.ftz.f32 %f389, %f384, %f387;\n"
" mul.ftz.f32 %f390, %f384, %f388;\n"
" mul.ftz.f32 %f391, %f389, %f41;\n"
" mul.ftz.f32 %f392, %f390, %f378;\n"
" mul.ftz.f32 %f393, %f392, %f45;\n"
" add.ftz.f32 %f394, %f391, %f393;\n"
" add.ftz.f32 %f163, %f163, %f394;\n"
" .loc 17 160 0\n"
" mov.f32 %f395, 0f40800000; \n"
" mul.ftz.f32 %f396, %f343, %f395;\n"
" .loc 17 167 0\n"
" mov.f32 %f397, 0f40400000; \n"
" div.approx.ftz.f32 %f398, %f397, %f348;\n"
" add.ftz.f32 %f399, %f80, %f348;\n"
" rcp.approx.ftz.f32 %f400, %f399;\n"
" add.ftz.f32 %f401, %f81, %f348;\n"
" rcp.approx.ftz.f32 %f402, %f401;\n"
" add.ftz.f32 %f403, %f400, %f402;\n"
" add.ftz.f32 %f404, %f82, %f348;\n"
" rcp.approx.ftz.f32 %f405, %f404;\n"
" add.ftz.f32 %f406, %f403, %f405;\n"
" add.ftz.f32 %f407, %f398, %f406;\n"
" .loc 17 172 0\n"
" mul.ftz.f32 %f408, %f177, %f346;\n"
" mov.f32 %f409, 0f40400000; \n"
" fma.rn.ftz.f32 %f410, %f409, %f408, %f348;\n"
" rcp.approx.ftz.f32 %f411, %f410;\n"
" rcp.approx.ftz.f32 %f412, %f348;\n"
" sub.ftz.f32 %f413, %f412, %f411;\n"
" add.ftz.f32 %f414, %f407, %f413;\n"
" .loc 17 175 0\n"
" fma.rn.ftz.f32 %f415, %f17, %f62, %f348;\n"
" rcp.approx.ftz.f32 %f416, %f415;\n"
" fma.rn.ftz.f32 %f417, %f17, %f63, %f348;\n"
" rcp.approx.ftz.f32 %f418, %f417;\n"
" add.ftz.f32 %f419, %f416, %f418;\n"
" fma.rn.ftz.f32 %f420, %f17, %f64, %f348;\n"
" rcp.approx.ftz.f32 %f421, %f420;\n"
" add.ftz.f32 %f422, %f419, %f421;\n"
" add.ftz.f32 %f423, %f398, %f422;\n"
" .loc 17 186 0\n"
" mul.ftz.f32 %f424, %f175, %f175;\n"
" neg.ftz.f32 %f425, %f424;\n"
" mov.f32 %f426, %f425;\n"
" .loc 17 187 0\n"
" mul.ftz.f32 %f427, %f176, %f175;\n"
" neg.ftz.f32 %f428, %f427;\n"
" mov.f32 %f429, %f428;\n"
" .loc 17 188 0\n"
" mul.ftz.f32 %f430, %f227, %f175;\n"
" neg.ftz.f32 %f431, %f430;\n"
" mov.f32 %f432, %f431;\n"
" .loc 17 189 0\n"
" mov.f32 %f433, 0f3f800000; \n"
" sub.ftz.f32 %f434, %f433, %f424;\n"
" mov.f32 %f435, %f434;\n"
" .loc 17 190 0\n"
" mul.ftz.f32 %f436, %f174, %f434;\n"
" mov.f32 %f437, %f436;\n"
" .loc 17 191 0\n"
" mov.f32 %f438, %f429;\n"
" mul.ftz.f32 %f439, %f438, %f174;\n"
" mov.f32 %f440, %f439;\n"
" .loc 17 192 0\n"
" mov.f32 %f441, %f432;\n"
" mul.ftz.f32 %f442, %f441, %f174;\n"
" mov.f32 %f443, %f442;\n"
" .loc 17 196 0\n"
" mul.ftz.f32 %f444, %f297, %f297;\n"
" mov.f32 %f445, 0f3f4db6db; \n"
" mul.ftz.f32 %f446, %f177, %f445;\n"
" mov.f32 %f447, 0f40800000; \n"
" mul.ftz.f32 %f448, %f335, %f447;\n"
" mul.ftz.f32 %f449, %f444, %f297;\n"
" mov.f32 %f450, 0f3f000000; \n"
" mul.ftz.f32 %f451, %f449, %f450;\n"
" mul.ftz.f32 %f452, %f451, %f287;\n"
" mul.ftz.f32 %f453, %f451, %f291;\n"
" mul.ftz.f32 %f454, %f451, %f284;\n"
" mov.f32 %f455, 0f40800000; \n"
" mul.ftz.f32 %f456, %f338, %f455;\n"
" mul.ftz.f32 %f457, %f452, %f439;\n"
" mul.ftz.f32 %f458, %f456, %f439;\n"
" mov.f32 %f459, 0f40e00000; \n"
" div.approx.ftz.f32 %f460, %f459, %f348;\n"
" mov.f32 %f461, 0f3f4db6db; \n"
" fma.rn.ftz.f32 %f462, %f461, %f408, %f348;\n"
" rcp.approx.ftz.f32 %f463, %f462;\n"
" fma.rn.ftz.f32 %f464, %f453, %f436, %f457;\n"
" fma.rn.ftz.f32 %f465, %f396, %f436, %f458;\n"
" sub.ftz.f32 %f466, %f460, %f463;\n"
" mul.ftz.f32 %f467, %f446, %f463;\n"
" fma.rn.ftz.f32 %f468, %f454, %f442, %f464;\n"
" fma.rn.ftz.f32 %f469, %f448, %f442, %f465;\n"
" add.ftz.f32 %f470, %f466, %f423;\n"
" add.ftz.f32 %f471, %f468, %f175;\n"
" mul.ftz.f32 %f472, %f470, %f471;\n"
" mul.ftz.f32 %f473, %f467, %f469;\n"
" sub.ftz.f32 %f474, %f473, %f472;\n"
" .loc 17 197 0\n"
" mov.f32 %f475, 0f40400000; \n"
" mul.ftz.f32 %f476, %f177, %f475;\n"
" mul.ftz.f32 %f477, %f476, %f411;\n"
" mul.ftz.f32 %f478, %f393, %f474;\n"
" mul.ftz.f32 %f479, %f471, %f414;\n"
" mul.ftz.f32 %f480, %f477, %f469;\n"
" sub.ftz.f32 %f481, %f480, %f479;\n"
" fma.rn.ftz.f32 %f482, %f391, %f481, %f478;\n"
" .loc 17 199 0\n"
" add.ftz.f32 %f162, %f482, %f162;\n"
" @!%p3 bra $Lt_0_61954;\n"
" .loc 17 201 0\n"
" mov.f32 %f483, %f47;\n"
" mul.ftz.f32 %f484, %f169, %f482;\n"
" sub.ftz.f32 %f485, %f483, %f484;\n"
" mov.f32 %f47, %f485;\n"
"$Lt_0_61954:\n"
" .loc 17 186 0\n"
" mov.f32 %f486, %f428;\n"
" .loc 17 187 0\n"
" mul.ftz.f32 %f487, %f176, %f176;\n"
" neg.ftz.f32 %f488, %f487;\n"
" mov.f32 %f489, %f488;\n"
" .loc 17 188 0\n"
" mul.ftz.f32 %f490, %f227, %f176;\n"
" neg.ftz.f32 %f491, %f490;\n"
" mov.f32 %f492, %f491;\n"
" .loc 17 189 0\n"
" mov.f32 %f493, 0f3f800000; \n"
" sub.ftz.f32 %f494, %f493, %f487;\n"
" mov.f32 %f495, %f494;\n"
" .loc 17 190 0\n"
" mov.f32 %f496, %f486;\n"
" mul.ftz.f32 %f497, %f496, %f174;\n"
" mov.f32 %f498, %f497;\n"
" .loc 17 191 0\n"
" mul.ftz.f32 %f499, %f174, %f494;\n"
" mov.f32 %f500, %f499;\n"
" .loc 17 192 0\n"
" mov.f32 %f501, %f492;\n"
" mul.ftz.f32 %f502, %f501, %f174;\n"
" mov.f32 %f503, %f502;\n"
" .loc 17 196 0\n"
" mul.ftz.f32 %f504, %f452, %f499;\n"
" mul.ftz.f32 %f505, %f456, %f499;\n"
" fma.rn.ftz.f32 %f506, %f453, %f497, %f504;\n"
" fma.rn.ftz.f32 %f507, %f396, %f497, %f505;\n"
" fma.rn.ftz.f32 %f508, %f454, %f502, %f506;\n"
" fma.rn.ftz.f32 %f509, %f448, %f502, %f507;\n"
" add.ftz.f32 %f510, %f508, %f176;\n"
" mul.ftz.f32 %f511, %f470, %f510;\n"
" mul.ftz.f32 %f512, %f467, %f509;\n"
" sub.ftz.f32 %f513, %f512, %f511;\n"
" .loc 17 197 0\n"
" mul.ftz.f32 %f514, %f393, %f513;\n"
" mul.ftz.f32 %f515, %f510, %f414;\n"
" mul.ftz.f32 %f516, %f477, %f509;\n"
" sub.ftz.f32 %f517, %f516, %f515;\n"
" fma.rn.ftz.f32 %f482, %f391, %f517, %f514;\n"
" .loc 17 203 0\n"
" add.ftz.f32 %f161, %f482, %f161;\n"
" @!%p3 bra $Lt_0_65538;\n"
" .loc 17 205 0\n"
" mov.f32 %f518, %f49;\n"
" mul.ftz.f32 %f519, %f168, %f482;\n"
" sub.ftz.f32 %f520, %f518, %f519;\n"
" mov.f32 %f49, %f520;\n"
" .loc 17 206 0\n"
" mov.f32 %f521, %f53;\n"
" mul.ftz.f32 %f522, %f169, %f482;\n"
" sub.ftz.f32 %f523, %f521, %f522;\n"
" mov.f32 %f53, %f523;\n"
"$Lt_0_65538:\n"
" .loc 17 186 0\n"
" mov.f32 %f524, %f431;\n"
" .loc 17 187 0\n"
" mov.f32 %f525, %f491;\n"
" .loc 17 188 0\n"
" mul.ftz.f32 %f526, %f227, %f227;\n"
" neg.ftz.f32 %f527, %f526;\n"
" mov.f32 %f528, %f527;\n"
" .loc 17 189 0\n"
" mov.f32 %f529, 0f3f800000; \n"
" sub.ftz.f32 %f530, %f529, %f526;\n"
" mov.f32 %f531, %f530;\n"
" .loc 17 190 0\n"
" mov.f32 %f532, %f524;\n"
" mul.ftz.f32 %f533, %f532, %f174;\n"
" mov.f32 %f534, %f533;\n"
" .loc 17 191 0\n"
" mov.f32 %f535, %f525;\n"
" mul.ftz.f32 %f536, %f535, %f174;\n"
" mov.f32 %f537, %f536;\n"
" .loc 17 192 0\n"
" mul.ftz.f32 %f538, %f174, %f530;\n"
" mov.f32 %f539, %f538;\n"
" .loc 17 196 0\n"
" mul.ftz.f32 %f540, %f452, %f536;\n"
" mul.ftz.f32 %f541, %f456, %f536;\n"
" fma.rn.ftz.f32 %f542, %f453, %f533, %f540;\n"
" fma.rn.ftz.f32 %f543, %f396, %f533, %f541;\n"
" fma.rn.ftz.f32 %f544, %f454, %f538, %f542;\n"
" fma.rn.ftz.f32 %f545, %f448, %f538, %f543;\n"
" add.ftz.f32 %f546, %f544, %f227;\n"
" mul.ftz.f32 %f547, %f546, %f470;\n"
" mul.ftz.f32 %f548, %f467, %f545;\n"
" sub.ftz.f32 %f549, %f548, %f547;\n"
" .loc 17 197 0\n"
" mul.ftz.f32 %f550, %f393, %f549;\n"
" mul.ftz.f32 %f551, %f546, %f414;\n"
" mul.ftz.f32 %f552, %f477, %f545;\n"
" sub.ftz.f32 %f553, %f552, %f551;\n"
" fma.rn.ftz.f32 %f482, %f391, %f553, %f550;\n"
" .loc 17 209 0\n"
" add.ftz.f32 %f160, %f482, %f160;\n"
" @!%p3 bra $Lt_0_68610;\n"
" .loc 17 211 0\n"
" mov.f32 %f554, %f51;\n"
" mul.ftz.f32 %f555, %f170, %f482;\n"
" sub.ftz.f32 %f556, %f554, %f555;\n"
" mov.f32 %f51, %f556;\n"
" .loc 17 212 0\n"
" mov.f32 %f557, %f55;\n"
" mul.ftz.f32 %f558, %f169, %f482;\n"
" sub.ftz.f32 %f559, %f557, %f558;\n"
" mov.f32 %f55, %f559;\n"
" .loc 17 213 0\n"
" mul.ftz.f32 %f560, %f168, %f482;\n"
" sub.ftz.f32 %f56, %f56, %f560;\n"
" mov.f32 %f57, %f56;\n"
"$Lt_0_68610:\n"
" .loc 17 232 0\n"
" mul.ftz.f32 %f561, %f102, %f338;\n"
" mul.ftz.f32 %f562, %f100, %f338;\n"
" mul.ftz.f32 %f563, %f111, %f338;\n"
" mov.f32 %f564, 0f00000000; \n"
" mov.f32 %f565, 0f00000000; \n"
" fma.rn.ftz.f32 %f566, %f565, %f212, %f564;\n"
" mov.f32 %f567, 0f00000000; \n"
" mov.f32 %f568, 0f00000000; \n"
" fma.rn.ftz.f32 %f569, %f568, %f230, %f567;\n"
" mov.f32 %f570, 0f00000000; \n"
" mov.f32 %f571, 0f00000000; \n"
" fma.rn.ftz.f32 %f572, %f571, %f198, %f570;\n"
" mul.ftz.f32 %f573, %f121, %f456;\n"
" mul.ftz.f32 %f574, %f105, %f456;\n"
" mul.ftz.f32 %f575, %f115, %f456;\n"
" neg.ftz.f32 %f576, %f561;\n"
" neg.ftz.f32 %f577, %f563;\n"
" neg.ftz.f32 %f578, %f207;\n"
" neg.ftz.f32 %f579, %f240;\n"
" mov.f32 %f580, 0f00000000; \n"
" fma.rn.ftz.f32 %f581, %f213, %f580, %f566;\n"
" mov.f32 %f582, 0f00000000; \n"
" fma.rn.ftz.f32 %f583, %f582, %f231, %f569;\n"
" mov.f32 %f584, 0f00000000; \n"
" fma.rn.ftz.f32 %f585, %f197, %f584, %f572;\n"
" neg.ftz.f32 %f586, %f223;\n"
" fma.rn.ftz.f32 %f587, %f198, %f100, %f578;\n"
" fma.rn.ftz.f32 %f588, %f100, %f230, %f579;\n"
" fma.rn.ftz.f32 %f589, %f100, %f212, %f586;\n"
" fma.rn.ftz.f32 %f590, %f197, %f123, %f587;\n"
" fma.rn.ftz.f32 %f591, %f123, %f231, %f588;\n"
" fma.rn.ftz.f32 %f592, %f213, %f123, %f589;\n"
" mov.f32 %f593, 0f00000000; \n"
" fma.rn.ftz.f32 %f594, %f343, %f593, %f576;\n"
" mov.f32 %f595, 0f00000000; \n"
" fma.rn.ftz.f32 %f596, %f595, %f343, %f562;\n"
" mov.f32 %f597, 0f00000000; \n"
" fma.rn.ftz.f32 %f598, %f343, %f597, %f577;\n"
" mul.ftz.f32 %f599, %f452, %f590;\n"
" mul.ftz.f32 %f600, %f452, %f591;\n"
" mul.ftz.f32 %f601, %f452, %f592;\n"
" fma.rn.ftz.f32 %f602, %f396, %f104, %f573;\n"
" fma.rn.ftz.f32 %f603, %f119, %f396, %f574;\n"
" fma.rn.ftz.f32 %f604, %f396, %f112, %f575;\n"
" fma.rn.ftz.f32 %f605, %f335, %f109, %f594;\n"
" fma.rn.ftz.f32 %f606, %f93, %f335, %f596;\n"
" fma.rn.ftz.f32 %f607, %f335, %f101, %f598;\n"
" fma.rn.ftz.f32 %f608, %f453, %f585, %f599;\n"
" fma.rn.ftz.f32 %f609, %f453, %f583, %f600;\n"
" fma.rn.ftz.f32 %f610, %f453, %f581, %f601;\n"
" fma.rn.ftz.f32 %f611, %f448, %f116, %f602;\n"
" fma.rn.ftz.f32 %f612, %f448, %f114, %f603;\n"
" fma.rn.ftz.f32 %f613, %f448, %f122, %f604;\n"
" fma.rn.ftz.f32 %f614, %f454, %f205, %f608;\n"
" fma.rn.ftz.f32 %f615, %f454, %f238, %f609;\n"
" fma.rn.ftz.f32 %f616, %f454, %f221, %f610;\n"
" mul.ftz.f32 %f617, %f605, %f611;\n"
" mul.ftz.f32 %f618, %f616, %f287;\n"
" fma.rn.ftz.f32 %f619, %f612, %f606, %f617;\n"
" fma.rn.ftz.f32 %f620, %f291, %f614, %f618;\n"
" fma.rn.ftz.f32 %f621, %f613, %f607, %f619;\n"
" fma.rn.ftz.f32 %f622, %f284, %f615, %f620;\n"
" neg.ftz.f32 %f623, %f621;\n"
" mul.ftz.f32 %f624, %f470, %f622;\n"
" fma.rn.ftz.f32 %f625, %f467, %f623, %f624;\n"
" mul.ftz.f32 %f626, %f393, %f625;\n"
" mul.ftz.f32 %f627, %f622, %f414;\n"
" fma.rn.ftz.f32 %f628, %f477, %f623, %f627;\n"
" fma.rn.ftz.f32 %f629, %f391, %f628, %f626;\n"
" sub.ftz.f32 %f159, %f159, %f629;\n"
" .loc 17 245 0\n"
" mul.ftz.f32 %f630, %f94, %f196;\n"
" mul.ftz.f32 %f631, %f94, %f229;\n"
" mov.f32 %f632, 0f00000000; \n"
" mov.f32 %f633, 0f00000000; \n"
" fma.rn.ftz.f32 %f634, %f198, %f633, %f632;\n"
" mul.ftz.f32 %f635, %f94, %f214;\n"
" fma.rn.ftz.f32 %f636, %f99, %f198, %f207;\n"
" fma.rn.ftz.f32 %f637, %f198, %f108, %f630;\n"
" fma.rn.ftz.f32 %f638, %f108, %f230, %f631;\n"
" mov.f32 %f639, 0f00000000; \n"
" fma.rn.ftz.f32 %f640, %f197, %f639, %f634;\n"
" fma.rn.ftz.f32 %f641, %f108, %f212, %f635;\n"
" fma.rn.ftz.f32 %f642, %f197, %f111, %f636;\n"
" fma.rn.ftz.f32 %f643, %f197, %f113, %f637;\n"
" fma.rn.ftz.f32 %f644, %f113, %f231, %f638;\n"
" mul.ftz.f32 %f645, %f452, %f581;\n"
" mul.ftz.f32 %f646, %f452, %f583;\n"
" mul.ftz.f32 %f647, %f452, %f640;\n"
" fma.rn.ftz.f32 %f648, %f213, %f113, %f641;\n"
" fma.rn.ftz.f32 %f649, %f453, %f242, %f646;\n"
" fma.rn.ftz.f32 %f650, %f453, %f642, %f647;\n"
" fma.rn.ftz.f32 %f651, %f453, %f225, %f645;\n"
" mov.f32 %f652, 0f00000000; \n"
" fma.rn.ftz.f32 %f653, %f343, %f102, %f652;\n"
" mov.f32 %f654, 0f00000000; \n"
" fma.rn.ftz.f32 %f655, %f99, %f343, %f654;\n"
" mov.f32 %f656, 0f00000000; \n"
" fma.rn.ftz.f32 %f657, %f343, %f111, %f656;\n"
" fma.rn.ftz.f32 %f658, %f454, %f644, %f649;\n"
" fma.rn.ftz.f32 %f659, %f454, %f643, %f650;\n"
" fma.rn.ftz.f32 %f660, %f454, %f648, %f651;\n"
" fma.rn.ftz.f32 %f661, %f335, %f94, %f653;\n"
" fma.rn.ftz.f32 %f662, %f108, %f335, %f655;\n"
" fma.rn.ftz.f32 %f663, %f335, %f113, %f657;\n"
" mul.ftz.f32 %f664, %f660, %f287;\n"
" fma.rn.ftz.f32 %f665, %f291, %f659, %f664;\n"
" mul.ftz.f32 %f666, %f661, %f611;\n"
" fma.rn.ftz.f32 %f667, %f284, %f658, %f665;\n"
" fma.rn.ftz.f32 %f668, %f612, %f662, %f666;\n"
" fma.rn.ftz.f32 %f669, %f613, %f663, %f668;\n"
" neg.ftz.f32 %f670, %f669;\n"
" mul.ftz.f32 %f671, %f470, %f667;\n"
" fma.rn.ftz.f32 %f672, %f467, %f670, %f671;\n"
" mul.ftz.f32 %f673, %f393, %f672;\n"
" mul.ftz.f32 %f674, %f667, %f414;\n"
" fma.rn.ftz.f32 %f675, %f477, %f670, %f674;\n"
" fma.rn.ftz.f32 %f676, %f391, %f675, %f673;\n"
" sub.ftz.f32 %f158, %f158, %f676;\n"
" .loc 17 258 0\n"
" mul.ftz.f32 %f677, %f92, %f338;\n"
" mul.ftz.f32 %f678, %f107, %f338;\n"
" mul.ftz.f32 %f679, %f98, %f338;\n"
" mul.ftz.f32 %f680, %f110, %f196;\n"
" mul.ftz.f32 %f681, %f110, %f229;\n"
" mul.ftz.f32 %f682, %f110, %f214;\n"
" fma.rn.ftz.f32 %f683, %f198, %f107, %f199;\n"
" fma.rn.ftz.f32 %f684, %f106, %f198, %f680;\n"
" fma.rn.ftz.f32 %f685, %f106, %f230, %f681;\n"
" fma.rn.ftz.f32 %f686, %f106, %f212, %f682;\n"
" fma.rn.ftz.f32 %f687, %f197, %f98, %f683;\n"
" fma.rn.ftz.f32 %f688, %f197, %f103, %f684;\n"
" fma.rn.ftz.f32 %f689, %f103, %f231, %f685;\n"
" fma.rn.ftz.f32 %f690, %f213, %f103, %f686;\n"
" mul.ftz.f32 %f691, %f452, %f687;\n"
" mul.ftz.f32 %f692, %f452, %f234;\n"
" mul.ftz.f32 %f693, %f452, %f217;\n"
" fma.rn.ftz.f32 %f694, %f343, %f110, %f677;\n"
" fma.rn.ftz.f32 %f695, %f106, %f343, %f678;\n"
" fma.rn.ftz.f32 %f696, %f343, %f103, %f679;\n"
" fma.rn.ftz.f32 %f697, %f453, %f688, %f691;\n"
" fma.rn.ftz.f32 %f698, %f453, %f689, %f692;\n"
" fma.rn.ftz.f32 %f699, %f453, %f690, %f693;\n"
" mov.f32 %f700, 0f00000000; \n"
" fma.rn.ftz.f32 %f701, %f335, %f700, %f694;\n"
" mov.f32 %f702, 0f00000000; \n"
" fma.rn.ftz.f32 %f703, %f702, %f335, %f695;\n"
" mov.f32 %f704, 0f00000000; \n"
" fma.rn.ftz.f32 %f705, %f335, %f704, %f696;\n"
" fma.rn.ftz.f32 %f706, %f454, %f640, %f697;\n"
" fma.rn.ftz.f32 %f707, %f454, %f583, %f698;\n"
" fma.rn.ftz.f32 %f708, %f454, %f581, %f699;\n"
" mul.ftz.f32 %f709, %f708, %f287;\n"
" mul.ftz.f32 %f710, %f701, %f611;\n"
" fma.rn.ftz.f32 %f711, %f291, %f706, %f709;\n"
" fma.rn.ftz.f32 %f712, %f612, %f703, %f710;\n"
" fma.rn.ftz.f32 %f713, %f284, %f707, %f711;\n"
" fma.rn.ftz.f32 %f714, %f613, %f705, %f712;\n"
" neg.ftz.f32 %f715, %f714;\n"
" mul.ftz.f32 %f716, %f470, %f713;\n"
" fma.rn.ftz.f32 %f717, %f467, %f715, %f716;\n"
" mul.ftz.f32 %f718, %f393, %f717;\n"
" mul.ftz.f32 %f719, %f713, %f414;\n"
" fma.rn.ftz.f32 %f720, %f477, %f715, %f719;\n"
" fma.rn.ftz.f32 %f721, %f391, %f720, %f718;\n"
" sub.ftz.f32 %f157, %f157, %f721;\n"
" mul.lo.s32 %r34, %r15, %r2;\n"
" cvt.s64.s32 %rd44, %r34;\n"
" mul.wide.s32 %rd45, %r34, 4;\n"
" add.u64 %rd25, %rd25, %rd45;\n"
" setp.gt.u64 %p22, %rd28, %rd25;\n"
" @%p22 bra $Lt_0_51970;\n"
" bra.uni $Lt_0_51458;\n"
"$Lt_0_75266:\n"
" mov.f32 %f157, 0f00000000; \n"
" mov.f32 %f158, 0f00000000; \n"
" mov.f32 %f159, 0f00000000; \n"
" mov.f32 %f160, 0f00000000; \n"
" mov.f32 %f161, 0f00000000; \n"
" mov.f32 %f162, 0f00000000; \n"
" mov.f32 %f163, 0f00000000; \n"
"$Lt_0_51458:\n"
" mov.u32 %r35, 1;\n"
" setp.le.s32 %p23, %r2, %r35;\n"
" @%p23 bra $Lt_0_71426;\n"
" .loc 17 267 0\n"
" mov.u64 %rd46, __cuda___cuda_local_var_33120_37_non_const_red_acc136;\n"
" cvt.s64.s32 %rd47, %r3;\n"
" mul.wide.s32 %rd48, %r3, 4;\n"
" add.u64 %rd49, %rd46, %rd48;\n"
" mov.f32 %f722, %f162;\n"
" st.shared.f32 [%rd49+0], %f722;\n"
" .loc 17 268 0\n"
" mov.f32 %f723, %f161;\n"
" st.shared.f32 [%rd49+512], %f723;\n"
" .loc 17 269 0\n"
" mov.f32 %f724, %f160;\n"
" st.shared.f32 [%rd49+1024], %f724;\n"
" .loc 17 270 0\n"
" mov.f32 %f725, %f159;\n"
" st.shared.f32 [%rd49+1536], %f725;\n"
" .loc 17 271 0\n"
" mov.f32 %f726, %f158;\n"
" st.shared.f32 [%rd49+2048], %f726;\n"
" .loc 17 272 0\n"
" mov.f32 %f727, %f157;\n"
" st.shared.f32 [%rd49+2560], %f727;\n"
" .loc 17 274 0\n"
" shr.s32 %r36, %r2, 31;\n"
" mov.s32 %r37, 1;\n"
" and.b32 %r38, %r36, %r37;\n"
" add.s32 %r39, %r38, %r2;\n"
" shr.s32 %r40, %r39, 1;\n"
" mov.s32 %r41, %r40;\n"
" mov.u32 %r42, 0;\n"
" setp.ne.u32 %p24, %r40, %r42;\n"
" @!%p24 bra $Lt_0_69890;\n"
"$Lt_0_70402:\n"
" setp.ge.u32 %p25, %r17, %r41;\n"
" @%p25 bra $Lt_0_70658;\n"
" .loc 17 277 0\n"
" add.u32 %r43, %r3, %r41;\n"
" cvt.u64.u32 %rd50, %r43;\n"
" mul.wide.u32 %rd51, %r43, 4;\n"
" add.u64 %rd52, %rd46, %rd51;\n"
" ld.shared.f32 %f728, [%rd52+0];\n"
" add.ftz.f32 %f722, %f728, %f722;\n"
" st.shared.f32 [%rd49+0], %f722;\n"
" ld.shared.f32 %f729, [%rd52+512];\n"
" add.ftz.f32 %f723, %f729, %f723;\n"
" st.shared.f32 [%rd49+512], %f723;\n"
" ld.shared.f32 %f730, [%rd52+1024];\n"
" add.ftz.f32 %f724, %f730, %f724;\n"
" st.shared.f32 [%rd49+1024], %f724;\n"
" ld.shared.f32 %f731, [%rd52+1536];\n"
" add.ftz.f32 %f725, %f731, %f725;\n"
" st.shared.f32 [%rd49+1536], %f725;\n"
" ld.shared.f32 %f732, [%rd52+2048];\n"
" add.ftz.f32 %f726, %f732, %f726;\n"
" st.shared.f32 [%rd49+2048], %f726;\n"
" ld.shared.f32 %f733, [%rd52+2560];\n"
" add.ftz.f32 %f727, %f733, %f727;\n"
" st.shared.f32 [%rd49+2560], %f727;\n"
"$Lt_0_70658:\n"
" .loc 17 274 0\n"
" shr.u32 %r41, %r41, 1;\n"
" mov.u32 %r44, 0;\n"
" setp.ne.u32 %p26, %r41, %r44;\n"
" @%p26 bra $Lt_0_70402;\n"
"$Lt_0_69890:\n"
" .loc 17 281 0\n"
" mov.f32 %f162, %f722;\n"
" .loc 17 282 0\n"
" mov.f32 %f161, %f723;\n"
" .loc 17 283 0\n"
" mov.f32 %f160, %f724;\n"
" .loc 17 284 0\n"
" mov.f32 %f159, %f725;\n"
" .loc 17 285 0\n"
" mov.f32 %f158, %f726;\n"
" .loc 17 286 0\n"
" mov.f32 %f157, %f727;\n"
" ld.param.s32 %r45, [__cudaparm_kernel_ellipsoid_sphere_eflag];\n"
" mov.s32 %r46, 0;\n"
" set.gt.u32.s32 %r47, %r45, %r46;\n"
" neg.s32 %r48, %r47;\n"
" ld.param.s32 %r49, [__cudaparm_kernel_ellipsoid_sphere_vflag];\n"
" mov.s32 %r50, 0;\n"
" set.gt.u32.s32 %r51, %r49, %r50;\n"
" neg.s32 %r52, %r51;\n"
" or.b32 %r53, %r48, %r52;\n"
" mov.u32 %r54, 0;\n"
" setp.eq.s32 %p27, %r53, %r54;\n"
" @%p27 bra $Lt_0_71426;\n"
" .loc 17 290 0\n"
" mov.f32 %f722, %f47;\n"
" st.shared.f32 [%rd49+0], %f722;\n"
" mov.f32 %f723, %f49;\n"
" st.shared.f32 [%rd49+512], %f723;\n"
" mov.f32 %f724, %f51;\n"
" st.shared.f32 [%rd49+1024], %f724;\n"
" mov.f32 %f725, %f53;\n"
" st.shared.f32 [%rd49+1536], %f725;\n"
" mov.f32 %f726, %f55;\n"
" st.shared.f32 [%rd49+2048], %f726;\n"
" mov.f32 %f727, %f56;\n"
" st.shared.f32 [%rd49+2560], %f727;\n"
" .loc 17 291 0\n"
" mov.f32 %f734, %f163;\n"
" st.shared.f32 [%rd49+3072], %f734;\n"
" .loc 17 293 0\n"
" mov.s32 %r55, %r40;\n"
" @!%p24 bra $Lt_0_71938;\n"
"$Lt_0_72450:\n"
" setp.ge.u32 %p28, %r17, %r55;\n"
" @%p28 bra $Lt_0_72706;\n"
" .loc 17 296 0\n"
" add.u32 %r56, %r3, %r55;\n"
" cvt.u64.u32 %rd53, %r56;\n"
" mul.wide.u32 %rd54, %r56, 4;\n"
" add.u64 %rd55, %rd46, %rd54;\n"
" ld.shared.f32 %f735, [%rd55+0];\n"
" add.ftz.f32 %f722, %f735, %f722;\n"
" st.shared.f32 [%rd49+0], %f722;\n"
" ld.shared.f32 %f736, [%rd55+512];\n"
" add.ftz.f32 %f723, %f736, %f723;\n"
" st.shared.f32 [%rd49+512], %f723;\n"
" ld.shared.f32 %f737, [%rd55+1024];\n"
" add.ftz.f32 %f724, %f737, %f724;\n"
" st.shared.f32 [%rd49+1024], %f724;\n"
" ld.shared.f32 %f738, [%rd55+1536];\n"
" add.ftz.f32 %f725, %f738, %f725;\n"
" st.shared.f32 [%rd49+1536], %f725;\n"
" ld.shared.f32 %f739, [%rd55+2048];\n"
" add.ftz.f32 %f726, %f739, %f726;\n"
" st.shared.f32 [%rd49+2048], %f726;\n"
" ld.shared.f32 %f740, [%rd55+2560];\n"
" add.ftz.f32 %f727, %f740, %f727;\n"
" st.shared.f32 [%rd49+2560], %f727;\n"
" ld.shared.f32 %f741, [%rd55+3072];\n"
" add.ftz.f32 %f734, %f741, %f734;\n"
" st.shared.f32 [%rd49+3072], %f734;\n"
"$Lt_0_72706:\n"
" .loc 17 293 0\n"
" shr.u32 %r55, %r55, 1;\n"
" mov.u32 %r57, 0;\n"
" setp.ne.u32 %p29, %r55, %r57;\n"
" @%p29 bra $Lt_0_72450;\n"
"$Lt_0_71938:\n"
" .loc 17 301 0\n"
" mov.f32 %f47, %f722;\n"
" mov.f32 %f49, %f723;\n"
" mov.f32 %f51, %f724;\n"
" mov.f32 %f53, %f725;\n"
" mov.f32 %f55, %f726;\n"
" mov.f32 %f57, %f727;\n"
" .loc 17 302 0\n"
" mov.f32 %f163, %f734;\n"
"$Lt_0_71426:\n"
"$Lt_0_69378:\n"
" mov.u32 %r58, 0;\n"
" setp.ne.s32 %p30, %r17, %r58;\n"
" @%p30 bra $Lt_0_73474;\n"
" .loc 17 308 0\n"
" ld.param.u64 %rd56, [__cudaparm_kernel_ellipsoid_sphere_engv];\n"
" add.u64 %rd57, %rd56, %rd3;\n"
" ld.param.s32 %r59, [__cudaparm_kernel_ellipsoid_sphere_astride];\n"
" ld.param.s32 %r60, [__cudaparm_kernel_ellipsoid_sphere_eflag];\n"
" mov.u32 %r61, 0;\n"
" setp.le.s32 %p31, %r60, %r61;\n"
" @%p31 bra $Lt_0_73986;\n"
" .loc 17 310 0\n"
" ld.global.f32 %f742, [%rd57+0];\n"
" add.ftz.f32 %f743, %f742, %f163;\n"
" st.global.f32 [%rd57+0], %f743;\n"
" .loc 17 311 0\n"
" cvt.s64.s32 %rd58, %r59;\n"
" mul.wide.s32 %rd59, %r59, 4;\n"
" add.u64 %rd57, %rd57, %rd59;\n"
"$Lt_0_73986:\n"
" ld.param.s32 %r62, [__cudaparm_kernel_ellipsoid_sphere_vflag];\n"
" mov.u32 %r63, 0;\n"
" setp.le.s32 %p32, %r62, %r63;\n"
" @%p32 bra $Lt_0_74498;\n"
" .loc 17 315 0\n"
" ld.global.f32 %f744, [%rd57+0];\n"
" mov.f32 %f745, %f47;\n"
" add.ftz.f32 %f746, %f744, %f745;\n"
" st.global.f32 [%rd57+0], %f746;\n"
" .loc 17 316 0\n"
" cvt.s64.s32 %rd60, %r59;\n"
" mul.wide.s32 %rd61, %r59, 4;\n"
" add.u64 %rd62, %rd61, %rd57;\n"
" .loc 17 315 0\n"
" ld.global.f32 %f747, [%rd62+0];\n"
" mov.f32 %f748, %f49;\n"
" add.ftz.f32 %f749, %f747, %f748;\n"
" st.global.f32 [%rd62+0], %f749;\n"
" .loc 17 316 0\n"
" add.u64 %rd63, %rd61, %rd62;\n"
" .loc 17 315 0\n"
" ld.global.f32 %f750, [%rd63+0];\n"
" mov.f32 %f751, %f51;\n"
" add.ftz.f32 %f752, %f750, %f751;\n"
" st.global.f32 [%rd63+0], %f752;\n"
" .loc 17 316 0\n"
" add.u64 %rd64, %rd61, %rd63;\n"
" .loc 17 315 0\n"
" ld.global.f32 %f753, [%rd64+0];\n"
" mov.f32 %f754, %f53;\n"
" add.ftz.f32 %f755, %f753, %f754;\n"
" st.global.f32 [%rd64+0], %f755;\n"
" .loc 17 316 0\n"
" add.u64 %rd65, %rd61, %rd64;\n"
" .loc 17 315 0\n"
" ld.global.f32 %f756, [%rd65+0];\n"
" mov.f32 %f757, %f55;\n"
" add.ftz.f32 %f758, %f756, %f757;\n"
" st.global.f32 [%rd65+0], %f758;\n"
" .loc 17 316 0\n"
" add.u64 %rd57, %rd61, %rd65;\n"
" .loc 17 315 0\n"
" ld.global.f32 %f759, [%rd57+0];\n"
" mov.f32 %f760, %f57;\n"
" add.ftz.f32 %f761, %f759, %f760;\n"
" st.global.f32 [%rd57+0], %f761;\n"
"$Lt_0_74498:\n"
" .loc 17 319 0\n"
" ld.param.u64 %rd66, [__cudaparm_kernel_ellipsoid_sphere_ans];\n"
" mul.lo.u64 %rd67, %rd2, 16;\n"
" add.u64 %rd68, %rd66, %rd67;\n"
" ld.global.v4.f32 {%f762,%f763,%f764,%f765}, [%rd68+0];\n"
" .loc 17 321 0\n"
" add.ftz.f32 %f766, %f763, %f161;\n"
" .loc 17 322 0\n"
" add.ftz.f32 %f767, %f764, %f160;\n"
" .loc 17 323 0\n"
" add.ftz.f32 %f768, %f762, %f162;\n"
" st.global.v4.f32 [%rd68+0], {%f768,%f766,%f767,%f765};\n"
" .loc 17 325 0\n"
" add.s32 %r64, %r9, %r59;\n"
" cvt.s64.s32 %rd69, %r64;\n"
" mul.wide.s32 %rd70, %r64, 16;\n"
" add.u64 %rd71, %rd66, %rd70;\n"
" ld.global.v4.f32 {%f769,%f770,%f771,%f772}, [%rd71+0];\n"
" .loc 17 327 0\n"
" add.ftz.f32 %f773, %f770, %f158;\n"
" .loc 17 328 0\n"
" add.ftz.f32 %f774, %f771, %f157;\n"
" .loc 17 329 0\n"
" add.ftz.f32 %f775, %f769, %f159;\n"
" st.global.v4.f32 [%rd71+0], {%f775,%f773,%f774,%f772};\n"
"$Lt_0_73474:\n"
"$Lt_0_50946:\n"
" .loc 17 332 0\n"
" exit;\n"
"$LDWend_kernel_ellipsoid_sphere:\n"
" }\n"
" .entry kernel_sphere_ellipsoid (\n"
" .param .u64 __cudaparm_kernel_sphere_ellipsoid_x_,\n"
" .param .u64 __cudaparm_kernel_sphere_ellipsoid_q,\n"
" .param .u64 __cudaparm_kernel_sphere_ellipsoid_shape,\n"
" .param .u64 __cudaparm_kernel_sphere_ellipsoid_well,\n"
" .param .u64 __cudaparm_kernel_sphere_ellipsoid_splj,\n"
" .param .u64 __cudaparm_kernel_sphere_ellipsoid_sig_eps,\n"
" .param .s32 __cudaparm_kernel_sphere_ellipsoid_ntypes,\n"
" .param .u64 __cudaparm_kernel_sphere_ellipsoid_dev_nbor,\n"
" .param .s32 __cudaparm_kernel_sphere_ellipsoid_stride,\n"
" .param .u64 __cudaparm_kernel_sphere_ellipsoid_ans,\n"
" .param .u64 __cudaparm_kernel_sphere_ellipsoid___val_paramengv,\n"
" .param .u64 __cudaparm_kernel_sphere_ellipsoid_err_flag,\n"
" .param .s32 __cudaparm_kernel_sphere_ellipsoid_eflag,\n"
" .param .s32 __cudaparm_kernel_sphere_ellipsoid_vflag,\n"
" .param .s32 __cudaparm_kernel_sphere_ellipsoid_start,\n"
" .param .s32 __cudaparm_kernel_sphere_ellipsoid_inum,\n"
" .param .s32 __cudaparm_kernel_sphere_ellipsoid_t_per_atom)\n"
" {\n"
" .reg .u32 %r<58>;\n"
" .reg .u64 %rd<70>;\n"
" .reg .f32 %f<567>;\n"
" .reg .pred %p<34>;\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_33201_33_non_const_sp_lj3836[16];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_33377_55_non_const_red_acc3852[3072];\n"
" .shared .f32 __cuda_local_var_33207_33_non_const_b_alpha;\n"
" .shared .f32 __cuda_local_var_33207_42_non_const_cr60;\n"
" .shared .f32 __cuda_local_var_33207_48_non_const_solv_f_a;\n"
" .shared .f32 __cuda_local_var_33207_58_non_const_solv_f_r;\n"
" .loc 17 341 0\n"
"$LDWbegin_kernel_sphere_ellipsoid:\n"
" .loc 17 347 0\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_sphere_ellipsoid_splj];\n"
" ldu.global.f32 %f1, [%rd1+0];\n"
" .loc 17 348 0\n"
" ld.global.f32 %f2, [%rd1+4];\n"
" .loc 17 349 0\n"
" ld.global.f32 %f3, [%rd1+8];\n"
" .loc 17 350 0\n"
" ld.global.f32 %f4, [%rd1+12];\n"
" st.shared.v4.f32 [__cuda___cuda_local_var_33201_33_non_const_sp_lj3836+0], {%f1,%f2,%f3,%f4};\n"
" .loc 17 353 0\n"
" mov.f32 %f5, 0f3f4db6db; \n"
" st.shared.f32 [__cuda_local_var_33207_33_non_const_b_alpha], %f5;\n"
" .loc 17 354 0\n"
" mov.f32 %f6, 0f42700000; \n"
" lg2.approx.ftz.f32 %f7, %f6;\n"
" mov.f32 %f8, 0f3eaaaaab; \n"
" mul.ftz.f32 %f9, %f7, %f8;\n"
" ex2.approx.ftz.f32 %f10, %f9;\n"
" mov.f32 %f11, 0f42700000; \n"
" mul.ftz.f32 %f12, %f10, %f10;\n"
" div.approx.ftz.f32 %f13, %f11, %f12;\n"
" sub.ftz.f32 %f14, %f10, %f13;\n"
" mov.f32 %f15, 0f3eaaaaab; \n"
" mul.ftz.f32 %f16, %f14, %f15;\n"
" sub.ftz.f32 %f17, %f10, %f16;\n"
" st.shared.f32 [__cuda_local_var_33207_42_non_const_cr60], %f17;\n"
" .loc 21 544 0\n"
" mov.f32 %f18, 0f3f800000; \n"
" mov.f32 %f19, 0fbf52c7ea; \n"
" mov.f32 %f20, 0fc0b59883; \n"
" fma.rn.ftz.f32 %f21, %f18, %f19, %f20;\n"
" mov.f32 %f22, 0f41455dc0; \n"
" mov.f32 %f23, 0f3f800000; \n"
" mov.f32 %f24, 0f41e6bd60; \n"
" fma.rn.ftz.f32 %f25, %f22, %f23, %f24;\n"
" mov.f32 %f26, 0f3f800000; \n"
" mov.f32 %f27, 0fc0d21907; \n"
" fma.rn.ftz.f32 %f28, %f21, %f26, %f27;\n"
" mov.f32 %f29, 0f3f800000; \n"
" mov.f32 %f30, 0f419d92c8; \n"
" fma.rn.ftz.f32 %f31, %f25, %f29, %f30;\n"
" rcp.approx.ftz.f32 %f32, %f31;\n"
" mov.f32 %f33, 0f3f800000; \n"
" fma.rn.ftz.f32 %f34, %f28, %f32, %f33;\n"
" mov.b32 %r1, %f34;\n"
" mov.b32 %f35, %r1;\n"
" mov.f32 %f36, 0f41800000; \n"
" mul.ftz.f32 %f37, %f35, %f36;\n"
" mov.f32 %f38, 0f40400000; \n"
" mov.f32 %f39, 0fc2100000; \n"
" mul.ftz.f32 %f40, %f37, %f39;\n"
" div.approx.ftz.f32 %f41, %f38, %f40;\n"
" .loc 17 355 0\n"
" st.shared.f32 [__cuda_local_var_33207_48_non_const_solv_f_a], %f41;\n"
" .loc 21 544 0\n"
" mov.f32 %f42, 0f40400000; \n"
" mov.f32 %f43, 0f44fd2000; \n"
" mul.ftz.f32 %f44, %f37, %f43;\n"
" div.approx.ftz.f32 %f45, %f42, %f44;\n"
" .loc 17 356 0\n"
" st.shared.f32 [__cuda_local_var_33207_58_non_const_solv_f_r], %f45;\n"
" .loc 17 365 0\n"
" mov.f32 %f46, 0f00000000; \n"
" mov.f32 %f47, %f46;\n"
" mov.f32 %f48, 0f00000000; \n"
" mov.f32 %f49, %f48;\n"
" mov.f32 %f50, 0f00000000; \n"
" mov.f32 %f51, %f50;\n"
" mov.f32 %f52, 0f00000000; \n"
" mov.f32 %f53, %f52;\n"
" mov.f32 %f54, 0f00000000; \n"
" mov.f32 %f55, %f54;\n"
" mov.f32 %f56, 0f00000000; \n"
" mov.f32 %f57, %f56;\n"
" ld.param.s32 %r2, [__cudaparm_kernel_sphere_ellipsoid_t_per_atom];\n"
" cvt.s32.u32 %r3, %tid.x;\n"
" div.s32 %r4, %r3, %r2;\n"
" cvt.s32.u32 %r5, %ntid.x;\n"
" div.s32 %r6, %r5, %r2;\n"
" cvt.s32.u32 %r7, %ctaid.x;\n"
" mul.lo.s32 %r8, %r7, %r6;\n"
" add.s32 %r9, %r4, %r8;\n"
" ld.param.s32 %r10, [__cudaparm_kernel_sphere_ellipsoid_start];\n"
" add.s32 %r11, %r10, %r9;\n"
" ld.param.s32 %r12, [__cudaparm_kernel_sphere_ellipsoid_inum];\n"
" setp.ge.s32 %p1, %r11, %r12;\n"
" @%p1 bra $Lt_1_73218;\n"
" .loc 17 370 0\n"
" cvt.s64.s32 %rd2, %r11;\n"
" mul.wide.s32 %rd3, %r11, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_kernel_sphere_ellipsoid_dev_nbor];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.s32 %r13, [%rd5+0];\n"
" ld.param.s32 %r14, [__cudaparm_kernel_sphere_ellipsoid_stride];\n"
" cvt.s64.s32 %rd6, %r14;\n"
" mul.wide.s32 %rd7, %r14, 4;\n"
" add.u64 %rd8, %rd7, %rd5;\n"
" ld.global.s32 %r15, [%rd8+0];\n"
" .loc 17 373 0\n"
" ld.param.u64 %rd9, [__cudaparm_kernel_sphere_ellipsoid_x_];\n"
" cvt.s64.s32 %rd10, %r13;\n"
" mul.wide.s32 %rd11, %r13, 16;\n"
" add.u64 %rd12, %rd9, %rd11;\n"
" ld.global.v4.f32 {%f58,%f59,%f60,%f61}, [%rd12+0];\n"
" .loc 17 374 0\n"
" cvt.s32.s64 %r16, %rd6;\n"
" sub.s32 %r17, %r2, 1;\n"
" and.b32 %r18, %r17, %r3;\n"
" add.u64 %rd13, %rd7, %rd8;\n"
" mul.lo.s32 %r19, %r16, %r18;\n"
" cvt.s64.s32 %rd14, %r19;\n"
" mul.wide.s32 %rd15, %r19, 4;\n"
" add.u64 %rd16, %rd13, %rd15;\n"
" mov.s64 %rd17, %rd16;\n"
" mul.lo.s32 %r20, %r16, %r15;\n"
" cvt.s64.s32 %rd18, %r20;\n"
" mul.wide.s32 %rd19, %r20, 4;\n"
" add.u64 %rd20, %rd13, %rd19;\n"
" setp.ge.u64 %p2, %rd16, %rd20;\n"
" @%p2 bra $Lt_1_75010;\n"
" ld.param.s32 %r21, [__cudaparm_kernel_sphere_ellipsoid_vflag];\n"
" mov.s32 %r22, 0;\n"
" setp.gt.s32 %p3, %r21, %r22;\n"
" cvt.rzi.ftz.s32.f32 %r23, %f61;\n"
" ld.param.u64 %rd21, [__cudaparm_kernel_sphere_ellipsoid_sig_eps];\n"
" ld.param.s32 %r24, [__cudaparm_kernel_sphere_ellipsoid_ntypes];\n"
" ld.param.u64 %rd22, [__cudaparm_kernel_sphere_ellipsoid_well];\n"
" ld.param.u64 %rd23, [__cudaparm_kernel_sphere_ellipsoid_q];\n"
" ld.param.u64 %rd24, [__cudaparm_kernel_sphere_ellipsoid_shape];\n"
" mov.f32 %f62, 0f00000000; \n"
" mov.f32 %f63, 0f00000000; \n"
" mov.f32 %f64, 0f00000000; \n"
" mov.f32 %f65, 0f00000000; \n"
" mov.u64 %rd25, __cuda___cuda_local_var_33201_33_non_const_sp_lj3836;\n"
"$Lt_1_51714:\n"
" .loc 17 378 0\n"
" ld.global.s32 %r25, [%rd17+0];\n"
" .loc 17 382 0\n"
" and.b32 %r26, %r25, 1073741823;\n"
" cvt.s64.s32 %rd26, %r26;\n"
" mul.wide.s32 %rd27, %r26, 16;\n"
" add.u64 %rd28, %rd27, %rd9;\n"
" ld.global.v4.f32 {%f66,%f67,%f68,%f69}, [%rd28+0];\n"
" .loc 17 389 0\n"
" cvt.rzi.ftz.s32.f32 %r27, %f69;\n"
" cvt.s64.s32 %rd29, %r27;\n"
" mul.wide.s32 %rd30, %r27, 16;\n"
" add.u64 %rd31, %rd30, %rd24;\n"
" ld.global.v4.f32 {%f70,%f71,%f72,_}, [%rd31+0];\n"
" .loc 17 390 0\n"
" add.u64 %rd32, %rd27, %rd23;\n"
" ld.global.v4.f32 {%f73,%f74,%f75,%f76}, [%rd32+0];\n"
" .loc 17 391 0\n"
" add.u64 %rd33, %rd30, %rd22;\n"
" ld.global.v4.f32 {%f77,%f78,%f79,_}, [%rd33+0];\n"
" .loc 17 401 0\n"
" sub.ftz.f32 %f80, %f67, %f59;\n"
" sub.ftz.f32 %f81, %f66, %f58;\n"
" sub.ftz.f32 %f82, %f68, %f60;\n"
" mul.ftz.f32 %f83, %f80, %f80;\n"
" fma.rn.ftz.f32 %f84, %f81, %f81, %f83;\n"
" fma.rn.ftz.f32 %f85, %f82, %f82, %f84;\n"
" rsqrt.approx.ftz.f32 %f86, %f85;\n"
" mul.ftz.f32 %f87, %f81, %f86;\n"
" .loc 17 402 0\n"
" mul.ftz.f32 %f88, %f80, %f86;\n"
" .loc 17 407 0\n"
" mul.lo.s32 %r28, %r27, %r24;\n"
" add.s32 %r29, %r23, %r28;\n"
" cvt.s64.s32 %rd34, %r29;\n"
" mul.wide.s32 %rd35, %r29, 8;\n"
" add.u64 %rd36, %rd21, %rd35;\n"
" ld.global.v2.f32 {%f89,%f90}, [%rd36+0];\n"
" .loc 17 408 0\n"
" shr.s32 %r30, %r25, 30;\n"
" and.b32 %r31, %r30, 3;\n"
" cvt.s64.s32 %rd37, %r31;\n"
" mul.wide.s32 %rd38, %r31, 4;\n"
" add.u64 %rd39, %rd25, %rd38;\n"
" ld.shared.f32 %f91, [%rd39+0];\n"
" mul.ftz.f32 %f92, %f91, %f90;\n"
" .loc 16 299 0\n"
" mov.f32 %f93, %f87;\n"
" .loc 16 300 0\n"
" mov.f32 %f94, 0f3f000000; \n"
" mul.ftz.f32 %f95, %f89, %f94;\n"
" add.ftz.f32 %f96, %f74, %f74;\n"
" add.ftz.f32 %f97, %f76, %f76;\n"
" mul.ftz.f32 %f98, %f73, %f73;\n"
" mul.ftz.f32 %f99, %f74, %f74;\n"
" mul.ftz.f32 %f100, %f75, %f75;\n"
" mul.ftz.f32 %f101, %f76, %f76;\n"
" add.ftz.f32 %f102, %f75, %f75;\n"
" add.ftz.f32 %f103, %f95, %f71;\n"
" add.ftz.f32 %f104, %f95, %f70;\n"
" add.ftz.f32 %f105, %f95, %f72;\n"
" mul.ftz.f32 %f106, %f96, %f75;\n"
" mul.ftz.f32 %f107, %f96, %f76;\n"
" mul.ftz.f32 %f108, %f97, %f73;\n"
" add.ftz.f32 %f109, %f98, %f99;\n"
" mul.ftz.f32 %f110, %f102, %f73;\n"
" mul.ftz.f32 %f111, %f103, %f103;\n"
" mul.ftz.f32 %f112, %f104, %f104;\n"
" mul.ftz.f32 %f113, %f105, %f105;\n"
" sub.ftz.f32 %f114, %f106, %f108;\n"
" sub.ftz.f32 %f115, %f109, %f100;\n"
" add.ftz.f32 %f116, %f107, %f110;\n"
" mov.f32 %f117, 0f3f000000; \n"
" mul.ftz.f32 %f118, %f111, %f117;\n"
" mov.f32 %f119, 0f3f000000; \n"
" mul.ftz.f32 %f120, %f112, %f119;\n"
" mov.f32 %f121, 0f3f000000; \n"
" mul.ftz.f32 %f122, %f113, %f121;\n"
" sub.ftz.f32 %f123, %f115, %f101;\n"
" mul.ftz.f32 %f124, %f114, %f118;\n"
" mul.ftz.f32 %f125, %f116, %f122;\n"
" mul.ftz.f32 %f126, %f120, %f123;\n"
" mul.ftz.f32 %f127, %f114, %f124;\n"
" fma.rn.ftz.f32 %f128, %f123, %f126, %f127;\n"
" fma.rn.ftz.f32 %f129, %f125, %f116, %f128;\n"
" mov.f32 %f130, %f129;\n"
" .loc 16 301 0\n"
" mul.ftz.f32 %f131, %f96, %f73;\n"
" sub.ftz.f32 %f132, %f98, %f99;\n"
" mul.ftz.f32 %f133, %f102, %f76;\n"
" add.ftz.f32 %f134, %f106, %f108;\n"
" add.ftz.f32 %f135, %f100, %f132;\n"
" sub.ftz.f32 %f136, %f133, %f131;\n"
" sub.ftz.f32 %f137, %f135, %f101;\n"
" mul.ftz.f32 %f138, %f137, %f124;\n"
" fma.rn.ftz.f32 %f139, %f126, %f134, %f138;\n"
" fma.rn.ftz.f32 %f140, %f125, %f136, %f139;\n"
" mov.f32 %f141, %f140;\n"
" .loc 16 302 0\n"
" sub.ftz.f32 %f142, %f132, %f100;\n"
" sub.ftz.f32 %f143, %f107, %f110;\n"
" add.ftz.f32 %f144, %f131, %f133;\n"
" add.ftz.f32 %f145, %f101, %f142;\n"
" mul.ftz.f32 %f146, %f144, %f124;\n"
" fma.rn.ftz.f32 %f147, %f126, %f143, %f146;\n"
" fma.rn.ftz.f32 %f148, %f125, %f145, %f147;\n"
" mov.f32 %f149, %f148;\n"
" .loc 16 303 0\n"
" mov.f32 %f150, %f88;\n"
" .loc 16 304 0\n"
" mul.ftz.f32 %f151, %f134, %f120;\n"
" mul.ftz.f32 %f152, %f136, %f122;\n"
" mul.ftz.f32 %f153, %f118, %f137;\n"
" mul.ftz.f32 %f154, %f114, %f153;\n"
" fma.rn.ftz.f32 %f155, %f123, %f151, %f154;\n"
" fma.rn.ftz.f32 %f156, %f152, %f116, %f155;\n"
" mov.f32 %f157, %f156;\n"
" .loc 16 305 0\n"
" mul.ftz.f32 %f158, %f137, %f153;\n"
" fma.rn.ftz.f32 %f159, %f134, %f151, %f158;\n"
" fma.rn.ftz.f32 %f160, %f152, %f136, %f159;\n"
" .loc 16 306 0\n"
" mul.ftz.f32 %f161, %f144, %f153;\n"
" fma.rn.ftz.f32 %f162, %f143, %f151, %f161;\n"
" fma.rn.ftz.f32 %f163, %f152, %f145, %f162;\n"
" .loc 16 307 0\n"
" mul.ftz.f32 %f164, %f82, %f86;\n"
" mov.f32 %f165, %f164;\n"
" .loc 16 308 0\n"
" mul.ftz.f32 %f166, %f144, %f118;\n"
" mul.ftz.f32 %f167, %f143, %f120;\n"
" mul.ftz.f32 %f168, %f122, %f145;\n"
" mul.ftz.f32 %f169, %f114, %f166;\n"
" fma.rn.ftz.f32 %f170, %f123, %f167, %f169;\n"
" fma.rn.ftz.f32 %f171, %f116, %f168, %f170;\n"
" mov.f32 %f172, %f171;\n"
" .loc 16 309 0\n"
" mul.ftz.f32 %f173, %f137, %f166;\n"
" fma.rn.ftz.f32 %f174, %f134, %f167, %f173;\n"
" fma.rn.ftz.f32 %f175, %f136, %f168, %f174;\n"
" .loc 16 310 0\n"
" mul.ftz.f32 %f176, %f144, %f166;\n"
" fma.rn.ftz.f32 %f177, %f143, %f167, %f176;\n"
" fma.rn.ftz.f32 %f178, %f145, %f168, %f177;\n"
" abs.ftz.f32 %f179, %f156;\n"
" abs.ftz.f32 %f180, %f129;\n"
" setp.gt.ftz.f32 %p4, %f179, %f180;\n"
" @!%p4 bra $Lt_1_51970;\n"
" .loc 16 314 0\n"
" mov.f32 %f130, %f156;\n"
" mov.f32 %f157, %f129;\n"
" .loc 16 315 0\n"
" mov.f32 %f141, %f160;\n"
" mov.f32 %f160, %f140;\n"
" .loc 16 316 0\n"
" mov.f32 %f149, %f163;\n"
" mov.f32 %f163, %f148;\n"
" .loc 16 317 0\n"
" mov.f32 %f93, %f88;\n"
" mov.f32 %f150, %f87;\n"
"$Lt_1_51970:\n"
" mov.f32 %f181, %f130;\n"
" abs.ftz.f32 %f182, %f181;\n"
" abs.ftz.f32 %f183, %f171;\n"
" setp.lt.ftz.f32 %p5, %f182, %f183;\n"
" @!%p5 bra $Lt_1_52482;\n"
" .loc 16 321 0\n"
" mov.f32 %f130, %f171;\n"
" mov.f32 %f172, %f181;\n"
" .loc 16 322 0\n"
" mov.f32 %f184, %f141;\n"
" mov.f32 %f141, %f175;\n"
" mov.f32 %f175, %f184;\n"
" .loc 16 323 0\n"
" mov.f32 %f185, %f149;\n"
" mov.f32 %f149, %f178;\n"
" mov.f32 %f178, %f185;\n"
" .loc 16 324 0\n"
" mov.f32 %f186, %f93;\n"
" mov.f32 %f93, %f164;\n"
" mov.f32 %f165, %f186;\n"
"$Lt_1_52482:\n"
" mov.f32 %f187, %f130;\n"
" mov.f32 %f188, 0f00000000; \n"
" setp.neu.ftz.f32 %p6, %f187, %f188;\n"
" @!%p6 bra $Lt_1_53250;\n"
" bra.uni $Lt_1_54018;\n"
"$Lt_1_53250:\n"
" mov.f32 %f189, 0f00000000; \n"
" setp.neu.ftz.f32 %p7, %f157, %f189;\n"
" @!%p7 bra $Lt_1_53762;\n"
" .loc 16 338 0\n"
" mov.f32 %f130, %f157;\n"
" mov.f32 %f157, %f187;\n"
" .loc 16 339 0\n"
" mov.f32 %f190, %f141;\n"
" mov.f32 %f141, %f160;\n"
" mov.f32 %f160, %f190;\n"
" .loc 16 340 0\n"
" mov.f32 %f191, %f149;\n"
" mov.f32 %f149, %f163;\n"
" mov.f32 %f163, %f191;\n"
" .loc 16 341 0\n"
" mov.f32 %f192, %f93;\n"
" mov.f32 %f93, %f150;\n"
" mov.f32 %f150, %f192;\n"
" bra.uni $Lt_1_54018;\n"
"$Lt_1_53762:\n"
" mov.f32 %f193, 0f00000000; \n"
" setp.neu.ftz.f32 %p8, %f172, %f193;\n"
" @!%p8 bra $Lt_1_54274;\n"
" .loc 16 346 0\n"
" mov.f32 %f130, %f172;\n"
" mov.f32 %f172, %f187;\n"
" .loc 16 347 0\n"
" mov.f32 %f194, %f141;\n"
" mov.f32 %f141, %f175;\n"
" mov.f32 %f175, %f194;\n"
" .loc 16 348 0\n"
" mov.f32 %f195, %f149;\n"
" mov.f32 %f149, %f178;\n"
" mov.f32 %f178, %f195;\n"
" .loc 16 349 0\n"
" mov.f32 %f196, %f93;\n"
" mov.f32 %f93, %f165;\n"
" mov.f32 %f165, %f196;\n"
" bra.uni $Lt_1_54018;\n"
"$Lt_1_54274:\n"
" .loc 16 352 0\n"
" mov.s32 %r32, 2;\n"
" ld.param.u64 %rd40, [__cudaparm_kernel_sphere_ellipsoid_err_flag];\n"
" st.global.s32 [%rd40+0], %r32;\n"
"$Lt_1_54018:\n"
"$Lt_1_53506:\n"
"$Lt_1_52994:\n"
" .loc 16 355 0\n"
" div.approx.ftz.f32 %f197, %f157, %f130;\n"
" mul.ftz.f32 %f198, %f141, %f197;\n"
" sub.ftz.f32 %f199, %f160, %f198;\n"
" mov.f32 %f160, %f199;\n"
" .loc 16 356 0\n"
" mul.ftz.f32 %f200, %f149, %f197;\n"
" sub.ftz.f32 %f201, %f163, %f200;\n"
" mov.f32 %f163, %f201;\n"
" .loc 16 357 0\n"
" mul.ftz.f32 %f202, %f93, %f197;\n"
" sub.ftz.f32 %f203, %f150, %f202;\n"
" mov.f32 %f150, %f203;\n"
" .loc 16 359 0\n"
" div.approx.ftz.f32 %f204, %f172, %f130;\n"
" mul.ftz.f32 %f205, %f141, %f204;\n"
" sub.ftz.f32 %f175, %f175, %f205;\n"
" .loc 16 360 0\n"
" mul.ftz.f32 %f206, %f149, %f204;\n"
" sub.ftz.f32 %f178, %f178, %f206;\n"
" .loc 16 361 0\n"
" mul.ftz.f32 %f207, %f93, %f204;\n"
" sub.ftz.f32 %f165, %f165, %f207;\n"
" abs.ftz.f32 %f208, %f199;\n"
" abs.ftz.f32 %f209, %f175;\n"
" setp.lt.ftz.f32 %p9, %f208, %f209;\n"
" @!%p9 bra $Lt_1_54530;\n"
" .loc 16 366 0\n"
" mov.f32 %f160, %f175;\n"
" mov.f32 %f175, %f199;\n"
" .loc 16 367 0\n"
" mov.f32 %f163, %f178;\n"
" mov.f32 %f178, %f201;\n"
" .loc 16 368 0\n"
" mov.f32 %f150, %f165;\n"
" mov.f32 %f165, %f203;\n"
"$Lt_1_54530:\n"
" mov.f32 %f210, %f160;\n"
" mov.f32 %f211, 0f00000000; \n"
" setp.neu.ftz.f32 %p10, %f210, %f211;\n"
" @!%p10 bra $Lt_1_55298;\n"
" bra.uni $Lt_1_55554;\n"
"$Lt_1_55298:\n"
" mov.f32 %f212, 0f00000000; \n"
" setp.neu.ftz.f32 %p11, %f175, %f212;\n"
" @!%p11 bra $Lt_1_55554;\n"
" .loc 16 383 0\n"
" mov.f32 %f160, %f175;\n"
" mov.f32 %f175, %f210;\n"
" .loc 16 384 0\n"
" mov.f32 %f213, %f163;\n"
" mov.f32 %f163, %f178;\n"
" mov.f32 %f178, %f213;\n"
" .loc 16 385 0\n"
" mov.f32 %f214, %f150;\n"
" mov.f32 %f150, %f165;\n"
" mov.f32 %f165, %f214;\n"
"$Lt_1_55554:\n"
"$Lt_1_55042:\n"
" .loc 16 390 0\n"
" div.approx.ftz.f32 %f215, %f175, %f160;\n"
" mul.ftz.f32 %f216, %f163, %f215;\n"
" sub.ftz.f32 %f178, %f178, %f216;\n"
" .loc 16 391 0\n"
" mul.ftz.f32 %f217, %f150, %f215;\n"
" sub.ftz.f32 %f165, %f165, %f217;\n"
" mov.f32 %f218, 0f00000000; \n"
" setp.eq.ftz.f32 %p12, %f178, %f218;\n"
" @!%p12 bra $Lt_1_56066;\n"
" .loc 16 394 0\n"
" mov.s32 %r33, 2;\n"
" ld.param.u64 %rd41, [__cudaparm_kernel_sphere_ellipsoid_err_flag];\n"
" st.global.s32 [%rd41+0], %r33;\n"
"$Lt_1_56066:\n"
" .loc 16 396 0\n"
" div.approx.ftz.f32 %f219, %f165, %f178;\n"
" .loc 16 399 0\n"
" mul.ftz.f32 %f220, %f219, %f163;\n"
" sub.ftz.f32 %f221, %f150, %f220;\n"
" div.approx.ftz.f32 %f222, %f221, %f160;\n"
" .loc 16 403 0\n"
" mul.ftz.f32 %f223, %f222, %f141;\n"
" fma.rn.ftz.f32 %f224, %f149, %f219, %f223;\n"
" sub.ftz.f32 %f225, %f93, %f224;\n"
" div.approx.ftz.f32 %f226, %f225, %f130;\n"
" .loc 17 427 0\n"
" mul.ftz.f32 %f227, %f222, %f88;\n"
" fma.rn.ftz.f32 %f228, %f87, %f226, %f227;\n"
" fma.rn.ftz.f32 %f229, %f164, %f219, %f228;\n"
" mov.f32 %f230, 0f3f000000; \n"
" mul.ftz.f32 %f231, %f229, %f230;\n"
" rsqrt.approx.ftz.f32 %f232, %f231;\n"
" .loc 16 299 0\n"
" mov.f32 %f93, %f87;\n"
" .loc 16 300 0\n"
" mul.ftz.f32 %f233, %f114, %f78;\n"
" mul.ftz.f32 %f234, %f116, %f79;\n"
" mul.ftz.f32 %f235, %f114, %f233;\n"
" mul.ftz.f32 %f236, %f123, %f77;\n"
" fma.rn.ftz.f32 %f237, %f123, %f236, %f235;\n"
" fma.rn.ftz.f32 %f238, %f234, %f116, %f237;\n"
" mov.f32 %f239, 0f3f800000; \n"
" add.ftz.f32 %f240, %f238, %f239;\n"
" mov.f32 %f130, %f240;\n"
" .loc 16 301 0\n"
" mul.ftz.f32 %f241, %f233, %f137;\n"
" fma.rn.ftz.f32 %f242, %f236, %f134, %f241;\n"
" fma.rn.ftz.f32 %f243, %f234, %f136, %f242;\n"
" mov.f32 %f141, %f243;\n"
" .loc 16 302 0\n"
" mul.ftz.f32 %f244, %f144, %f233;\n"
" fma.rn.ftz.f32 %f245, %f236, %f143, %f244;\n"
" fma.rn.ftz.f32 %f246, %f234, %f145, %f245;\n"
" mov.f32 %f149, %f246;\n"
" .loc 16 303 0\n"
" mov.f32 %f150, %f88;\n"
" .loc 16 304 0\n"
" mul.ftz.f32 %f247, %f134, %f77;\n"
" mul.ftz.f32 %f248, %f136, %f79;\n"
" mul.ftz.f32 %f249, %f137, %f78;\n"
" mul.ftz.f32 %f250, %f114, %f249;\n"
" fma.rn.ftz.f32 %f251, %f123, %f247, %f250;\n"
" fma.rn.ftz.f32 %f252, %f248, %f116, %f251;\n"
" mov.f32 %f157, %f252;\n"
" .loc 16 305 0\n"
" mul.ftz.f32 %f253, %f137, %f249;\n"
" fma.rn.ftz.f32 %f254, %f134, %f247, %f253;\n"
" fma.rn.ftz.f32 %f255, %f248, %f136, %f254;\n"
" mov.f32 %f256, 0f3f800000; \n"
" add.ftz.f32 %f160, %f255, %f256;\n"
" .loc 16 306 0\n"
" mul.ftz.f32 %f257, %f144, %f249;\n"
" fma.rn.ftz.f32 %f258, %f143, %f247, %f257;\n"
" fma.rn.ftz.f32 %f163, %f248, %f145, %f258;\n"
" .loc 16 307 0\n"
" mov.f32 %f165, %f164;\n"
" .loc 16 308 0\n"
" mul.ftz.f32 %f259, %f143, %f77;\n"
" mul.ftz.f32 %f260, %f144, %f78;\n"
" mul.ftz.f32 %f261, %f145, %f79;\n"
" mul.ftz.f32 %f262, %f114, %f260;\n"
" fma.rn.ftz.f32 %f263, %f123, %f259, %f262;\n"
" fma.rn.ftz.f32 %f264, %f116, %f261, %f263;\n"
" mov.f32 %f172, %f264;\n"
" .loc 16 309 0\n"
" mul.ftz.f32 %f265, %f137, %f260;\n"
" fma.rn.ftz.f32 %f266, %f134, %f259, %f265;\n"
" fma.rn.ftz.f32 %f175, %f136, %f261, %f266;\n"
" .loc 16 310 0\n"
" mul.ftz.f32 %f267, %f144, %f260;\n"
" fma.rn.ftz.f32 %f268, %f143, %f259, %f267;\n"
" fma.rn.ftz.f32 %f269, %f145, %f261, %f268;\n"
" mov.f32 %f270, 0f3f800000; \n"
" add.ftz.f32 %f178, %f269, %f270;\n"
" abs.ftz.f32 %f271, %f252;\n"
" abs.ftz.f32 %f272, %f240;\n"
" setp.gt.ftz.f32 %p13, %f271, %f272;\n"
" @!%p13 bra $Lt_1_56578;\n"
" .loc 16 314 0\n"
" mov.f32 %f130, %f252;\n"
" mov.f32 %f157, %f240;\n"
" .loc 16 315 0\n"
" mov.f32 %f141, %f160;\n"
" mov.f32 %f160, %f243;\n"
" .loc 16 316 0\n"
" mov.f32 %f149, %f163;\n"
" mov.f32 %f163, %f246;\n"
" .loc 16 317 0\n"
" mov.f32 %f93, %f88;\n"
" mov.f32 %f150, %f87;\n"
"$Lt_1_56578:\n"
" mov.f32 %f273, %f130;\n"
" abs.ftz.f32 %f274, %f273;\n"
" abs.ftz.f32 %f275, %f264;\n"
" setp.lt.ftz.f32 %p14, %f274, %f275;\n"
" @!%p14 bra $Lt_1_57090;\n"
" .loc 16 321 0\n"
" mov.f32 %f130, %f264;\n"
" mov.f32 %f172, %f273;\n"
" .loc 16 322 0\n"
" mov.f32 %f276, %f141;\n"
" mov.f32 %f141, %f175;\n"
" mov.f32 %f175, %f276;\n"
" .loc 16 323 0\n"
" mov.f32 %f277, %f149;\n"
" mov.f32 %f149, %f178;\n"
" mov.f32 %f178, %f277;\n"
" .loc 16 324 0\n"
" mov.f32 %f278, %f93;\n"
" mov.f32 %f93, %f164;\n"
" mov.f32 %f165, %f278;\n"
"$Lt_1_57090:\n"
" mov.f32 %f279, %f130;\n"
" mov.f32 %f280, 0f00000000; \n"
" setp.neu.ftz.f32 %p15, %f279, %f280;\n"
" @!%p15 bra $Lt_1_57858;\n"
" bra.uni $Lt_1_58626;\n"
"$Lt_1_57858:\n"
" mov.f32 %f281, 0f00000000; \n"
" setp.neu.ftz.f32 %p16, %f157, %f281;\n"
" @!%p16 bra $Lt_1_58370;\n"
" .loc 16 338 0\n"
" mov.f32 %f130, %f157;\n"
" mov.f32 %f157, %f279;\n"
" .loc 16 339 0\n"
" mov.f32 %f282, %f141;\n"
" mov.f32 %f141, %f160;\n"
" mov.f32 %f160, %f282;\n"
" .loc 16 340 0\n"
" mov.f32 %f283, %f149;\n"
" mov.f32 %f149, %f163;\n"
" mov.f32 %f163, %f283;\n"
" .loc 16 341 0\n"
" mov.f32 %f284, %f93;\n"
" mov.f32 %f93, %f150;\n"
" mov.f32 %f150, %f284;\n"
" bra.uni $Lt_1_58626;\n"
"$Lt_1_58370:\n"
" mov.f32 %f285, 0f00000000; \n"
" setp.neu.ftz.f32 %p17, %f172, %f285;\n"
" @!%p17 bra $Lt_1_58882;\n"
" .loc 16 346 0\n"
" mov.f32 %f130, %f172;\n"
" mov.f32 %f172, %f279;\n"
" .loc 16 347 0\n"
" mov.f32 %f286, %f141;\n"
" mov.f32 %f141, %f175;\n"
" mov.f32 %f175, %f286;\n"
" .loc 16 348 0\n"
" mov.f32 %f287, %f149;\n"
" mov.f32 %f149, %f178;\n"
" mov.f32 %f178, %f287;\n"
" .loc 16 349 0\n"
" mov.f32 %f288, %f93;\n"
" mov.f32 %f93, %f165;\n"
" mov.f32 %f165, %f288;\n"
" bra.uni $Lt_1_58626;\n"
"$Lt_1_58882:\n"
" .loc 16 352 0\n"
" mov.s32 %r34, 2;\n"
" ld.param.u64 %rd42, [__cudaparm_kernel_sphere_ellipsoid_err_flag];\n"
" st.global.s32 [%rd42+0], %r34;\n"
"$Lt_1_58626:\n"
"$Lt_1_58114:\n"
"$Lt_1_57602:\n"
" .loc 16 355 0\n"
" div.approx.ftz.f32 %f289, %f157, %f130;\n"
" mul.ftz.f32 %f290, %f141, %f289;\n"
" sub.ftz.f32 %f291, %f160, %f290;\n"
" mov.f32 %f160, %f291;\n"
" .loc 16 356 0\n"
" mul.ftz.f32 %f292, %f149, %f289;\n"
" sub.ftz.f32 %f293, %f163, %f292;\n"
" mov.f32 %f163, %f293;\n"
" .loc 16 357 0\n"
" mul.ftz.f32 %f294, %f93, %f289;\n"
" sub.ftz.f32 %f295, %f150, %f294;\n"
" mov.f32 %f150, %f295;\n"
" .loc 16 359 0\n"
" div.approx.ftz.f32 %f296, %f172, %f130;\n"
" mul.ftz.f32 %f297, %f141, %f296;\n"
" sub.ftz.f32 %f175, %f175, %f297;\n"
" .loc 16 360 0\n"
" mul.ftz.f32 %f298, %f149, %f296;\n"
" sub.ftz.f32 %f178, %f178, %f298;\n"
" .loc 16 361 0\n"
" mul.ftz.f32 %f299, %f93, %f296;\n"
" sub.ftz.f32 %f165, %f165, %f299;\n"
" abs.ftz.f32 %f300, %f291;\n"
" abs.ftz.f32 %f301, %f175;\n"
" setp.lt.ftz.f32 %p18, %f300, %f301;\n"
" @!%p18 bra $Lt_1_59138;\n"
" .loc 16 366 0\n"
" mov.f32 %f160, %f175;\n"
" mov.f32 %f175, %f291;\n"
" .loc 16 367 0\n"
" mov.f32 %f163, %f178;\n"
" mov.f32 %f178, %f293;\n"
" .loc 16 368 0\n"
" mov.f32 %f150, %f165;\n"
" mov.f32 %f165, %f295;\n"
"$Lt_1_59138:\n"
" mov.f32 %f302, %f160;\n"
" mov.f32 %f303, 0f00000000; \n"
" setp.neu.ftz.f32 %p19, %f302, %f303;\n"
" @!%p19 bra $Lt_1_59906;\n"
" bra.uni $Lt_1_60162;\n"
"$Lt_1_59906:\n"
" mov.f32 %f304, 0f00000000; \n"
" setp.neu.ftz.f32 %p20, %f175, %f304;\n"
" @!%p20 bra $Lt_1_60162;\n"
" .loc 16 383 0\n"
" mov.f32 %f160, %f175;\n"
" mov.f32 %f175, %f302;\n"
" .loc 16 384 0\n"
" mov.f32 %f305, %f163;\n"
" mov.f32 %f163, %f178;\n"
" mov.f32 %f178, %f305;\n"
" .loc 16 385 0\n"
" mov.f32 %f306, %f150;\n"
" mov.f32 %f150, %f165;\n"
" mov.f32 %f165, %f306;\n"
"$Lt_1_60162:\n"
"$Lt_1_59650:\n"
" .loc 16 390 0\n"
" div.approx.ftz.f32 %f307, %f175, %f160;\n"
" mul.ftz.f32 %f308, %f163, %f307;\n"
" sub.ftz.f32 %f178, %f178, %f308;\n"
" .loc 16 391 0\n"
" mul.ftz.f32 %f309, %f150, %f307;\n"
" sub.ftz.f32 %f165, %f165, %f309;\n"
" mov.f32 %f310, 0f00000000; \n"
" setp.eq.ftz.f32 %p21, %f178, %f310;\n"
" @!%p21 bra $Lt_1_60674;\n"
" .loc 16 394 0\n"
" mov.s32 %r35, 2;\n"
" ld.param.u64 %rd43, [__cudaparm_kernel_sphere_ellipsoid_err_flag];\n"
" st.global.s32 [%rd43+0], %r35;\n"
"$Lt_1_60674:\n"
" .loc 17 436 0\n"
" div.approx.ftz.f32 %f311, %f165, %f178;\n"
" mul.ftz.f32 %f312, %f311, %f163;\n"
" sub.ftz.f32 %f313, %f150, %f312;\n"
" div.approx.ftz.f32 %f314, %f313, %f160;\n"
" mul.ftz.f32 %f315, %f314, %f141;\n"
" fma.rn.ftz.f32 %f316, %f149, %f311, %f315;\n"
" mul.ftz.f32 %f317, %f314, %f88;\n"
" sub.ftz.f32 %f318, %f93, %f316;\n"
" div.approx.ftz.f32 %f319, %f318, %f130;\n"
" fma.rn.ftz.f32 %f320, %f87, %f319, %f317;\n"
" fma.rn.ftz.f32 %f321, %f164, %f311, %f320;\n"
" add.ftz.f32 %f322, %f321, %f321;\n"
" .loc 17 444 0\n"
" rcp.approx.ftz.f32 %f323, %f86;\n"
" sub.ftz.f32 %f324, %f323, %f232;\n"
" mov.f32 %f325, 0f40000000; \n"
" div.approx.ftz.f32 %f326, %f324, %f325;\n"
" mul.ftz.f32 %f327, %f324, %f324;\n"
" mul.ftz.f32 %f328, %f324, %f327;\n"
" add.ftz.f32 %f329, %f326, %f72;\n"
" add.ftz.f32 %f330, %f326, %f70;\n"
" add.ftz.f32 %f331, %f326, %f71;\n"
" mul.ftz.f32 %f332, %f330, %f331;\n"
" mul.ftz.f32 %f333, %f329, %f332;\n"
" mul.ftz.f32 %f334, %f328, %f333;\n"
" .loc 17 446 0\n"
" mul.ftz.f32 %f335, %f70, %f71;\n"
" mul.ftz.f32 %f336, %f335, %f72;\n"
" div.approx.ftz.f32 %f337, %f89, %f324;\n"
" mul.ftz.f32 %f338, %f337, %f322;\n"
" mov.f32 %f339, 0f3f800000; \n"
" mov.f32 %f340, 0f40400000; \n"
" fma.rn.ftz.f32 %f341, %f340, %f338, %f339;\n"
" mul.ftz.f32 %f342, %f336, %f341;\n"
" .loc 17 450 0\n"
" div.approx.ftz.f32 %f343, %f324, %f17;\n"
" add.ftz.f32 %f344, %f343, %f72;\n"
" add.ftz.f32 %f345, %f343, %f70;\n"
" add.ftz.f32 %f346, %f343, %f71;\n"
" mul.ftz.f32 %f347, %f345, %f346;\n"
" mul.ftz.f32 %f348, %f344, %f347;\n"
" mul.ftz.f32 %f349, %f328, %f348;\n"
" .loc 17 452 0\n"
" mov.f32 %f350, 0f3f800000; \n"
" mov.f32 %f351, 0f3f4db6db; \n"
" fma.rn.ftz.f32 %f352, %f351, %f338, %f350;\n"
" mul.ftz.f32 %f353, %f336, %f352;\n"
" .loc 17 454 0\n"
" mul.ftz.f32 %f354, %f337, %f337;\n"
" mul.ftz.f32 %f355, %f337, %f354;\n"
" mul.ftz.f32 %f356, %f355, %f355;\n"
" .loc 17 457 0\n"
" mul.ftz.f32 %f357, %f89, %f89;\n"
" mov.f32 %f358, 0f41000000; \n"
" div.approx.ftz.f32 %f359, %f334, %f358;\n"
" mov.f32 %f360, 0f42700000; \n"
" div.approx.ftz.f32 %f361, %f349, %f360;\n"
" mul.ftz.f32 %f362, %f357, %f89;\n"
" div.approx.ftz.f32 %f363, %f342, %f359;\n"
" div.approx.ftz.f32 %f364, %f353, %f361;\n"
" mul.ftz.f32 %f365, %f363, %f92;\n"
" mul.ftz.f32 %f366, %f364, %f92;\n"
" mul.ftz.f32 %f367, %f362, %f365;\n"
" mul.ftz.f32 %f368, %f362, %f366;\n"
" mul.ftz.f32 %f369, %f367, %f41;\n"
" mul.ftz.f32 %f370, %f368, %f356;\n"
" mul.ftz.f32 %f371, %f370, %f45;\n"
" add.ftz.f32 %f372, %f369, %f371;\n"
" add.ftz.f32 %f65, %f65, %f372;\n"
" .loc 17 464 0\n"
" mov.f32 %f373, 0f40800000; \n"
" mul.ftz.f32 %f374, %f319, %f373;\n"
" .loc 17 471 0\n"
" mov.f32 %f375, 0f40400000; \n"
" div.approx.ftz.f32 %f376, %f375, %f324;\n"
" add.ftz.f32 %f377, %f70, %f70;\n"
" add.ftz.f32 %f378, %f324, %f377;\n"
" rcp.approx.ftz.f32 %f379, %f378;\n"
" add.ftz.f32 %f380, %f71, %f71;\n"
" add.ftz.f32 %f381, %f324, %f380;\n"
" rcp.approx.ftz.f32 %f382, %f381;\n"
" add.ftz.f32 %f383, %f379, %f382;\n"
" add.ftz.f32 %f384, %f72, %f72;\n"
" add.ftz.f32 %f385, %f324, %f384;\n"
" rcp.approx.ftz.f32 %f386, %f385;\n"
" add.ftz.f32 %f387, %f383, %f386;\n"
" add.ftz.f32 %f388, %f376, %f387;\n"
" .loc 17 476 0\n"
" mul.ftz.f32 %f389, %f89, %f322;\n"
" mov.f32 %f390, 0f40400000; \n"
" fma.rn.ftz.f32 %f391, %f390, %f389, %f324;\n"
" rcp.approx.ftz.f32 %f392, %f391;\n"
" rcp.approx.ftz.f32 %f393, %f324;\n"
" sub.ftz.f32 %f394, %f393, %f392;\n"
" add.ftz.f32 %f395, %f388, %f394;\n"
" .loc 17 479 0\n"
" fma.rn.ftz.f32 %f396, %f17, %f70, %f324;\n"
" rcp.approx.ftz.f32 %f397, %f396;\n"
" fma.rn.ftz.f32 %f398, %f17, %f71, %f324;\n"
" rcp.approx.ftz.f32 %f399, %f398;\n"
" add.ftz.f32 %f400, %f397, %f399;\n"
" fma.rn.ftz.f32 %f401, %f17, %f72, %f324;\n"
" rcp.approx.ftz.f32 %f402, %f401;\n"
" add.ftz.f32 %f403, %f400, %f402;\n"
" add.ftz.f32 %f404, %f376, %f403;\n"
" .loc 17 490 0\n"
" mul.ftz.f32 %f405, %f87, %f87;\n"
" neg.ftz.f32 %f406, %f405;\n"
" mov.f32 %f407, %f406;\n"
" .loc 17 491 0\n"
" mul.ftz.f32 %f408, %f88, %f87;\n"
" neg.ftz.f32 %f409, %f408;\n"
" mov.f32 %f410, %f409;\n"
" .loc 17 492 0\n"
" mul.ftz.f32 %f411, %f164, %f87;\n"
" neg.ftz.f32 %f412, %f411;\n"
" mov.f32 %f413, %f412;\n"
" .loc 17 493 0\n"
" mov.f32 %f414, 0f3f800000; \n"
" sub.ftz.f32 %f415, %f414, %f405;\n"
" mov.f32 %f416, %f415;\n"
" .loc 17 494 0\n"
" mul.ftz.f32 %f417, %f86, %f415;\n"
" mov.f32 %f418, %f417;\n"
" .loc 17 495 0\n"
" mov.f32 %f419, %f410;\n"
" mul.ftz.f32 %f420, %f419, %f86;\n"
" mov.f32 %f421, %f420;\n"
" .loc 17 496 0\n"
" mov.f32 %f422, %f413;\n"
" mul.ftz.f32 %f423, %f422, %f86;\n"
" mov.f32 %f424, %f423;\n"
" .loc 17 500 0\n"
" mul.ftz.f32 %f425, %f232, %f232;\n"
" mov.f32 %f426, 0f3f4db6db; \n"
" mul.ftz.f32 %f427, %f89, %f426;\n"
" mov.f32 %f428, 0f40800000; \n"
" mul.ftz.f32 %f429, %f311, %f428;\n"
" mul.ftz.f32 %f430, %f425, %f232;\n"
" mov.f32 %f431, 0f3f000000; \n"
" mul.ftz.f32 %f432, %f430, %f431;\n"
" mul.ftz.f32 %f433, %f432, %f222;\n"
" mul.ftz.f32 %f434, %f432, %f226;\n"
" mul.ftz.f32 %f435, %f432, %f219;\n"
" mov.f32 %f436, 0f40800000; \n"
" mul.ftz.f32 %f437, %f314, %f436;\n"
" mul.ftz.f32 %f438, %f433, %f420;\n"
" mul.ftz.f32 %f439, %f437, %f420;\n"
" mov.f32 %f440, 0f40e00000; \n"
" div.approx.ftz.f32 %f441, %f440, %f324;\n"
" mov.f32 %f442, 0f3f4db6db; \n"
" fma.rn.ftz.f32 %f443, %f442, %f389, %f324;\n"
" rcp.approx.ftz.f32 %f444, %f443;\n"
" fma.rn.ftz.f32 %f445, %f434, %f417, %f438;\n"
" fma.rn.ftz.f32 %f446, %f374, %f417, %f439;\n"
" sub.ftz.f32 %f447, %f441, %f444;\n"
" mul.ftz.f32 %f448, %f427, %f444;\n"
" fma.rn.ftz.f32 %f449, %f435, %f423, %f445;\n"
" fma.rn.ftz.f32 %f450, %f429, %f423, %f446;\n"
" add.ftz.f32 %f451, %f447, %f404;\n"
" add.ftz.f32 %f452, %f449, %f87;\n"
" mul.ftz.f32 %f453, %f451, %f452;\n"
" mul.ftz.f32 %f454, %f448, %f450;\n"
" sub.ftz.f32 %f455, %f454, %f453;\n"
" .loc 17 501 0\n"
" mov.f32 %f456, 0f40400000; \n"
" mul.ftz.f32 %f457, %f89, %f456;\n"
" mul.ftz.f32 %f458, %f457, %f392;\n"
" mul.ftz.f32 %f459, %f371, %f455;\n"
" mul.ftz.f32 %f460, %f452, %f395;\n"
" mul.ftz.f32 %f461, %f458, %f450;\n"
" sub.ftz.f32 %f462, %f461, %f460;\n"
" fma.rn.ftz.f32 %f463, %f369, %f462, %f459;\n"
" .loc 17 503 0\n"
" add.ftz.f32 %f64, %f463, %f64;\n"
" @!%p3 bra $Lt_1_61698;\n"
" .loc 17 505 0\n"
" mov.f32 %f464, %f47;\n"
" mul.ftz.f32 %f465, %f81, %f463;\n"
" sub.ftz.f32 %f466, %f464, %f465;\n"
" mov.f32 %f47, %f466;\n"
"$Lt_1_61698:\n"
" .loc 17 490 0\n"
" mov.f32 %f467, %f409;\n"
" .loc 17 491 0\n"
" mul.ftz.f32 %f468, %f88, %f88;\n"
" neg.ftz.f32 %f469, %f468;\n"
" mov.f32 %f470, %f469;\n"
" .loc 17 492 0\n"
" mul.ftz.f32 %f471, %f164, %f88;\n"
" neg.ftz.f32 %f472, %f471;\n"
" mov.f32 %f473, %f472;\n"
" .loc 17 493 0\n"
" mov.f32 %f474, 0f3f800000; \n"
" sub.ftz.f32 %f475, %f474, %f468;\n"
" mov.f32 %f476, %f475;\n"
" .loc 17 494 0\n"
" mov.f32 %f477, %f467;\n"
" mul.ftz.f32 %f478, %f477, %f86;\n"
" mov.f32 %f479, %f478;\n"
" .loc 17 495 0\n"
" mul.ftz.f32 %f480, %f86, %f475;\n"
" mov.f32 %f481, %f480;\n"
" .loc 17 496 0\n"
" mov.f32 %f482, %f473;\n"
" mul.ftz.f32 %f483, %f482, %f86;\n"
" mov.f32 %f484, %f483;\n"
" .loc 17 500 0\n"
" mul.ftz.f32 %f485, %f433, %f480;\n"
" mul.ftz.f32 %f486, %f437, %f480;\n"
" fma.rn.ftz.f32 %f487, %f434, %f478, %f485;\n"
" fma.rn.ftz.f32 %f488, %f374, %f478, %f486;\n"
" fma.rn.ftz.f32 %f489, %f435, %f483, %f487;\n"
" fma.rn.ftz.f32 %f490, %f429, %f483, %f488;\n"
" add.ftz.f32 %f491, %f489, %f88;\n"
" mul.ftz.f32 %f492, %f451, %f491;\n"
" mul.ftz.f32 %f493, %f448, %f490;\n"
" sub.ftz.f32 %f494, %f493, %f492;\n"
" .loc 17 501 0\n"
" mul.ftz.f32 %f495, %f371, %f494;\n"
" mul.ftz.f32 %f496, %f491, %f395;\n"
" mul.ftz.f32 %f497, %f458, %f490;\n"
" sub.ftz.f32 %f498, %f497, %f496;\n"
" fma.rn.ftz.f32 %f463, %f369, %f498, %f495;\n"
" .loc 17 507 0\n"
" add.ftz.f32 %f63, %f463, %f63;\n"
" @!%p3 bra $Lt_1_65282;\n"
" .loc 17 509 0\n"
" mov.f32 %f499, %f49;\n"
" mul.ftz.f32 %f500, %f80, %f463;\n"
" sub.ftz.f32 %f501, %f499, %f500;\n"
" mov.f32 %f49, %f501;\n"
" .loc 17 510 0\n"
" mov.f32 %f502, %f53;\n"
" mul.ftz.f32 %f503, %f81, %f463;\n"
" sub.ftz.f32 %f504, %f502, %f503;\n"
" mov.f32 %f53, %f504;\n"
"$Lt_1_65282:\n"
" .loc 17 490 0\n"
" mov.f32 %f505, %f412;\n"
" .loc 17 491 0\n"
" mov.f32 %f506, %f472;\n"
" .loc 17 492 0\n"
" mul.ftz.f32 %f507, %f164, %f164;\n"
" neg.ftz.f32 %f508, %f507;\n"
" mov.f32 %f509, %f508;\n"
" .loc 17 493 0\n"
" mov.f32 %f510, 0f3f800000; \n"
" sub.ftz.f32 %f511, %f510, %f507;\n"
" mov.f32 %f512, %f511;\n"
" .loc 17 494 0\n"
" mov.f32 %f513, %f505;\n"
" mul.ftz.f32 %f514, %f513, %f86;\n"
" mov.f32 %f515, %f514;\n"
" .loc 17 495 0\n"
" mov.f32 %f516, %f506;\n"
" mul.ftz.f32 %f517, %f516, %f86;\n"
" mov.f32 %f518, %f517;\n"
" .loc 17 496 0\n"
" mul.ftz.f32 %f519, %f86, %f511;\n"
" mov.f32 %f520, %f519;\n"
" .loc 17 500 0\n"
" mul.ftz.f32 %f521, %f433, %f517;\n"
" mul.ftz.f32 %f522, %f437, %f517;\n"
" fma.rn.ftz.f32 %f523, %f434, %f514, %f521;\n"
" fma.rn.ftz.f32 %f524, %f374, %f514, %f522;\n"
" fma.rn.ftz.f32 %f525, %f435, %f519, %f523;\n"
" fma.rn.ftz.f32 %f526, %f429, %f519, %f524;\n"
" add.ftz.f32 %f527, %f525, %f164;\n"
" mul.ftz.f32 %f528, %f527, %f451;\n"
" mul.ftz.f32 %f529, %f448, %f526;\n"
" sub.ftz.f32 %f530, %f529, %f528;\n"
" .loc 17 501 0\n"
" mul.ftz.f32 %f531, %f371, %f530;\n"
" mul.ftz.f32 %f532, %f527, %f395;\n"
" mul.ftz.f32 %f533, %f458, %f526;\n"
" sub.ftz.f32 %f534, %f533, %f532;\n"
" fma.rn.ftz.f32 %f463, %f369, %f534, %f531;\n"
" .loc 17 513 0\n"
" add.ftz.f32 %f62, %f463, %f62;\n"
" @!%p3 bra $Lt_1_68354;\n"
" .loc 17 515 0\n"
" mov.f32 %f535, %f51;\n"
" mul.ftz.f32 %f536, %f82, %f463;\n"
" sub.ftz.f32 %f537, %f535, %f536;\n"
" mov.f32 %f51, %f537;\n"
" .loc 17 516 0\n"
" mov.f32 %f538, %f55;\n"
" mul.ftz.f32 %f539, %f81, %f463;\n"
" sub.ftz.f32 %f540, %f538, %f539;\n"
" mov.f32 %f55, %f540;\n"
" .loc 17 517 0\n"
" mul.ftz.f32 %f541, %f80, %f463;\n"
" sub.ftz.f32 %f56, %f56, %f541;\n"
" mov.f32 %f57, %f56;\n"
"$Lt_1_68354:\n"
" mul.lo.s32 %r36, %r16, %r2;\n"
" cvt.s64.s32 %rd44, %r36;\n"
" mul.wide.s32 %rd45, %r36, 4;\n"
" add.u64 %rd17, %rd17, %rd45;\n"
" setp.gt.u64 %p22, %rd20, %rd17;\n"
" @%p22 bra $Lt_1_51714;\n"
" bra.uni $Lt_1_51202;\n"
"$Lt_1_75010:\n"
" mov.f32 %f62, 0f00000000; \n"
" mov.f32 %f63, 0f00000000; \n"
" mov.f32 %f64, 0f00000000; \n"
" mov.f32 %f65, 0f00000000; \n"
"$Lt_1_51202:\n"
" mov.u32 %r37, 1;\n"
" setp.le.s32 %p23, %r2, %r37;\n"
" @%p23 bra $Lt_1_71170;\n"
" .loc 17 522 0\n"
" mov.u64 %rd46, __cuda___cuda_local_var_33377_55_non_const_red_acc3852;\n"
" cvt.s64.s32 %rd47, %r3;\n"
" mul.wide.s32 %rd48, %r3, 4;\n"
" add.u64 %rd49, %rd46, %rd48;\n"
" mov.f32 %f542, %f64;\n"
" st.shared.f32 [%rd49+0], %f542;\n"
" mov.f32 %f543, %f63;\n"
" st.shared.f32 [%rd49+512], %f543;\n"
" mov.f32 %f544, %f62;\n"
" st.shared.f32 [%rd49+1024], %f544;\n"
" mov.f32 %f545, %f65;\n"
" st.shared.f32 [%rd49+1536], %f545;\n"
" shr.s32 %r38, %r2, 31;\n"
" mov.s32 %r39, 1;\n"
" and.b32 %r40, %r38, %r39;\n"
" add.s32 %r41, %r40, %r2;\n"
" shr.s32 %r42, %r41, 1;\n"
" mov.s32 %r43, %r42;\n"
" mov.u32 %r44, 0;\n"
" setp.ne.u32 %p24, %r42, %r44;\n"
" @!%p24 bra $Lt_1_69634;\n"
"$Lt_1_70146:\n"
" setp.ge.u32 %p25, %r18, %r43;\n"
" @%p25 bra $Lt_1_70402;\n"
" add.u32 %r45, %r3, %r43;\n"
" cvt.u64.u32 %rd50, %r45;\n"
" mul.wide.u32 %rd51, %r45, 4;\n"
" add.u64 %rd52, %rd46, %rd51;\n"
" ld.shared.f32 %f546, [%rd52+0];\n"
" add.ftz.f32 %f542, %f546, %f542;\n"
" st.shared.f32 [%rd49+0], %f542;\n"
" ld.shared.f32 %f547, [%rd52+512];\n"
" add.ftz.f32 %f543, %f547, %f543;\n"
" st.shared.f32 [%rd49+512], %f543;\n"
" ld.shared.f32 %f548, [%rd52+1024];\n"
" add.ftz.f32 %f544, %f548, %f544;\n"
" st.shared.f32 [%rd49+1024], %f544;\n"
" ld.shared.f32 %f549, [%rd52+1536];\n"
" add.ftz.f32 %f545, %f549, %f545;\n"
" st.shared.f32 [%rd49+1536], %f545;\n"
"$Lt_1_70402:\n"
" shr.u32 %r43, %r43, 1;\n"
" mov.u32 %r46, 0;\n"
" setp.ne.u32 %p26, %r43, %r46;\n"
" @%p26 bra $Lt_1_70146;\n"
"$Lt_1_69634:\n"
" mov.f32 %f64, %f542;\n"
" mov.f32 %f63, %f543;\n"
" mov.f32 %f62, %f544;\n"
" mov.f32 %f65, %f545;\n"
" ld.param.s32 %r47, [__cudaparm_kernel_sphere_ellipsoid_vflag];\n"
" mov.u32 %r48, 0;\n"
" setp.le.s32 %p27, %r47, %r48;\n"
" @%p27 bra $Lt_1_71170;\n"
" mov.f32 %f542, %f47;\n"
" st.shared.f32 [%rd49+0], %f542;\n"
" mov.f32 %f543, %f49;\n"
" st.shared.f32 [%rd49+512], %f543;\n"
" mov.f32 %f544, %f51;\n"
" st.shared.f32 [%rd49+1024], %f544;\n"
" mov.f32 %f545, %f53;\n"
" st.shared.f32 [%rd49+1536], %f545;\n"
" mov.f32 %f550, %f55;\n"
" st.shared.f32 [%rd49+2048], %f550;\n"
" mov.f32 %f551, %f56;\n"
" st.shared.f32 [%rd49+2560], %f551;\n"
" mov.s32 %r49, %r42;\n"
" @!%p24 bra $Lt_1_71682;\n"
"$Lt_1_72194:\n"
" setp.ge.u32 %p28, %r18, %r49;\n"
" @%p28 bra $Lt_1_72450;\n"
" add.u32 %r50, %r3, %r49;\n"
" cvt.u64.u32 %rd53, %r50;\n"
" mul.wide.u32 %rd54, %r50, 4;\n"
" add.u64 %rd55, %rd46, %rd54;\n"
" ld.shared.f32 %f552, [%rd55+0];\n"
" add.ftz.f32 %f542, %f552, %f542;\n"
" st.shared.f32 [%rd49+0], %f542;\n"
" ld.shared.f32 %f553, [%rd55+512];\n"
" add.ftz.f32 %f543, %f553, %f543;\n"
" st.shared.f32 [%rd49+512], %f543;\n"
" ld.shared.f32 %f554, [%rd55+1024];\n"
" add.ftz.f32 %f544, %f554, %f544;\n"
" st.shared.f32 [%rd49+1024], %f544;\n"
" ld.shared.f32 %f555, [%rd55+1536];\n"
" add.ftz.f32 %f545, %f555, %f545;\n"
" st.shared.f32 [%rd49+1536], %f545;\n"
" ld.shared.f32 %f556, [%rd55+2048];\n"
" add.ftz.f32 %f550, %f556, %f550;\n"
" st.shared.f32 [%rd49+2048], %f550;\n"
" ld.shared.f32 %f557, [%rd55+2560];\n"
" add.ftz.f32 %f551, %f557, %f551;\n"
" st.shared.f32 [%rd49+2560], %f551;\n"
"$Lt_1_72450:\n"
" shr.u32 %r49, %r49, 1;\n"
" mov.u32 %r51, 0;\n"
" setp.ne.u32 %p29, %r49, %r51;\n"
" @%p29 bra $Lt_1_72194;\n"
"$Lt_1_71682:\n"
" mov.f32 %f47, %f542;\n"
" mov.f32 %f49, %f543;\n"
" mov.f32 %f51, %f544;\n"
" mov.f32 %f53, %f545;\n"
" mov.f32 %f55, %f550;\n"
" mov.f32 %f57, %f551;\n"
"$Lt_1_71170:\n"
"$Lt_1_69122:\n"
" mov.u32 %r52, 0;\n"
" setp.ne.s32 %p30, %r18, %r52;\n"
" @%p30 bra $Lt_1_73218;\n"
" ld.param.u64 %rd56, [__cudaparm_kernel_sphere_ellipsoid___val_paramengv];\n"
" add.u64 %rd57, %rd56, %rd3;\n"
" ld.param.s32 %r53, [__cudaparm_kernel_sphere_ellipsoid_eflag];\n"
" mov.u32 %r54, 0;\n"
" setp.le.s32 %p31, %r53, %r54;\n"
" @%p31 bra $Lt_1_73730;\n"
" st.global.f32 [%rd57+0], %f65;\n"
" cvt.s64.s32 %rd58, %r12;\n"
" mul.wide.s32 %rd59, %r12, 4;\n"
" add.u64 %rd57, %rd57, %rd59;\n"
"$Lt_1_73730:\n"
" ld.param.s32 %r55, [__cudaparm_kernel_sphere_ellipsoid_vflag];\n"
" mov.u32 %r56, 0;\n"
" setp.le.s32 %p32, %r55, %r56;\n"
" @%p32 bra $Lt_1_74242;\n"
" mov.f32 %f558, %f47;\n"
" st.global.f32 [%rd57+0], %f558;\n"
" cvt.s64.s32 %rd60, %r12;\n"
" mul.wide.s32 %rd61, %r12, 4;\n"
" add.u64 %rd62, %rd61, %rd57;\n"
" mov.f32 %f559, %f49;\n"
" st.global.f32 [%rd62+0], %f559;\n"
" add.u64 %rd63, %rd61, %rd62;\n"
" mov.f32 %f560, %f51;\n"
" st.global.f32 [%rd63+0], %f560;\n"
" add.u64 %rd64, %rd61, %rd63;\n"
" mov.f32 %f561, %f53;\n"
" st.global.f32 [%rd64+0], %f561;\n"
" add.u64 %rd57, %rd61, %rd64;\n"
" mov.f32 %f562, %f55;\n"
" st.global.f32 [%rd57+0], %f562;\n"
" mov.f32 %f563, %f57;\n"
" add.u64 %rd65, %rd61, %rd57;\n"
" st.global.f32 [%rd65+0], %f563;\n"
"$Lt_1_74242:\n"
" ld.param.u64 %rd66, [__cudaparm_kernel_sphere_ellipsoid_ans];\n"
" mul.lo.u64 %rd67, %rd2, 16;\n"
" add.u64 %rd68, %rd66, %rd67;\n"
" mov.f32 %f564, %f565;\n"
" st.global.v4.f32 [%rd68+0], {%f64,%f63,%f62,%f564};\n"
"$Lt_1_73218:\n"
"$Lt_1_50690:\n"
" .loc 17 525 0\n"
" exit;\n"
"$LDWend_kernel_sphere_ellipsoid:\n"
" }\n"
" .entry kernel_lj (\n"
" .param .u64 __cudaparm_kernel_lj_x_,\n"
" .param .u64 __cudaparm_kernel_lj_lj1,\n"
" .param .u64 __cudaparm_kernel_lj_lj3,\n"
" .param .s32 __cudaparm_kernel_lj_lj_types,\n"
" .param .u64 __cudaparm_kernel_lj_gum,\n"
" .param .s32 __cudaparm_kernel_lj_stride,\n"
" .param .u64 __cudaparm_kernel_lj_dev_ij,\n"
" .param .u64 __cudaparm_kernel_lj_ans,\n"
" .param .u64 __cudaparm_kernel_lj___val_paramengv,\n"
" .param .u64 __cudaparm_kernel_lj_err_flag,\n"
" .param .s32 __cudaparm_kernel_lj_eflag,\n"
" .param .s32 __cudaparm_kernel_lj_vflag,\n"
" .param .s32 __cudaparm_kernel_lj_start,\n"
" .param .s32 __cudaparm_kernel_lj_inum,\n"
" .param .s32 __cudaparm_kernel_lj_t_per_atom)\n"
" {\n"
" .reg .u32 %r<55>;\n"
" .reg .u64 %rd<60>;\n"
" .reg .f32 %f<115>;\n"
" .reg .pred %p<19>;\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_33394_33_non_const_sp_lj7028[16];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_33459_55_non_const_red_acc7044[3072];\n"
" .loc 17 534 0\n"
"$LDWbegin_kernel_lj:\n"
" .loc 17 540 0\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_lj_gum];\n"
" ldu.global.f32 %f1, [%rd1+0];\n"
" .loc 17 541 0\n"
" ld.global.f32 %f2, [%rd1+4];\n"
" .loc 17 542 0\n"
" ld.global.f32 %f3, [%rd1+8];\n"
" .loc 17 543 0\n"
" ld.global.f32 %f4, [%rd1+12];\n"
" st.shared.v4.f32 [__cuda___cuda_local_var_33394_33_non_const_sp_lj7028+0], {%f1,%f2,%f3,%f4};\n"
" .loc 17 552 0\n"
" mov.f32 %f5, 0f00000000; \n"
" mov.f32 %f6, %f5;\n"
" mov.f32 %f7, 0f00000000; \n"
" mov.f32 %f8, %f7;\n"
" mov.f32 %f9, 0f00000000; \n"
" mov.f32 %f10, %f9;\n"
" mov.f32 %f11, 0f00000000; \n"
" mov.f32 %f12, %f11;\n"
" mov.f32 %f13, 0f00000000; \n"
" mov.f32 %f14, %f13;\n"
" mov.f32 %f15, 0f00000000; \n"
" mov.f32 %f16, %f15;\n"
" ld.param.s32 %r1, [__cudaparm_kernel_lj_t_per_atom];\n"
" cvt.s32.u32 %r2, %tid.x;\n"
" div.s32 %r3, %r2, %r1;\n"
" cvt.s32.u32 %r4, %ntid.x;\n"
" div.s32 %r5, %r4, %r1;\n"
" cvt.s32.u32 %r6, %ctaid.x;\n"
" mul.lo.s32 %r7, %r6, %r5;\n"
" add.s32 %r8, %r3, %r7;\n"
" ld.param.s32 %r9, [__cudaparm_kernel_lj_start];\n"
" add.s32 %r10, %r9, %r8;\n"
" ld.param.s32 %r11, [__cudaparm_kernel_lj_inum];\n"
" setp.ge.s32 %p1, %r10, %r11;\n"
" @%p1 bra $Lt_2_25346;\n"
" .loc 17 557 0\n"
" cvt.s64.s32 %rd2, %r10;\n"
" mul.wide.s32 %rd3, %r10, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_kernel_lj_dev_ij];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.s32 %r12, [%rd5+0];\n"
" ld.param.s32 %r13, [__cudaparm_kernel_lj_stride];\n"
" cvt.s64.s32 %rd6, %r13;\n"
" mul.wide.s32 %rd7, %r13, 4;\n"
" add.u64 %rd8, %rd7, %rd5;\n"
" ld.global.s32 %r14, [%rd8+0];\n"
" .loc 17 560 0\n"
" ld.param.u64 %rd9, [__cudaparm_kernel_lj_x_];\n"
" cvt.s64.s32 %rd10, %r12;\n"
" mul.wide.s32 %rd11, %r12, 16;\n"
" add.u64 %rd12, %rd9, %rd11;\n"
" ld.global.v4.f32 {%f17,%f18,%f19,%f20}, [%rd12+0];\n"
" .loc 17 561 0\n"
" cvt.s32.s64 %r15, %rd6;\n"
" sub.s32 %r16, %r1, 1;\n"
" and.b32 %r17, %r16, %r2;\n"
" add.u64 %rd13, %rd7, %rd8;\n"
" mul.lo.s32 %r18, %r15, %r17;\n"
" cvt.s64.s32 %rd14, %r18;\n"
" mul.wide.s32 %rd15, %r18, 4;\n"
" add.u64 %rd16, %rd13, %rd15;\n"
" mov.s64 %rd17, %rd16;\n"
" mul.lo.s32 %r19, %r15, %r14;\n"
" cvt.s64.s32 %rd18, %r19;\n"
" mul.wide.s32 %rd19, %r19, 4;\n"
" add.u64 %rd20, %rd13, %rd19;\n"
" setp.ge.u64 %p2, %rd16, %rd20;\n"
" @%p2 bra $Lt_2_26882;\n"
" cvt.rzi.ftz.s32.f32 %r20, %f20;\n"
" ld.param.s32 %r21, [__cudaparm_kernel_lj_lj_types];\n"
" mul.lo.s32 %r22, %r21, %r20;\n"
" ld.param.u64 %rd21, [__cudaparm_kernel_lj_lj1];\n"
" mov.f32 %f21, 0f00000000; \n"
" mov.f32 %f22, 0f00000000; \n"
" mov.f32 %f23, 0f00000000; \n"
" mov.f32 %f24, 0f00000000; \n"
" mov.u64 %rd22, __cuda___cuda_local_var_33394_33_non_const_sp_lj7028;\n"
"$Lt_2_19714:\n"
" .loc 17 566 0\n"
" ld.global.s32 %r23, [%rd17+0];\n"
" .loc 17 567 0\n"
" shr.s32 %r24, %r23, 30;\n"
" and.b32 %r25, %r24, 3;\n"
" cvt.s64.s32 %rd23, %r25;\n"
" mul.wide.s32 %rd24, %r25, 4;\n"
" add.u64 %rd25, %rd22, %rd24;\n"
" ld.shared.f32 %f25, [%rd25+0];\n"
" .loc 17 570 0\n"
" and.b32 %r26, %r23, 1073741823;\n"
" cvt.s64.s32 %rd26, %r26;\n"
" mul.wide.s32 %rd27, %r26, 16;\n"
" add.u64 %rd28, %rd9, %rd27;\n"
" ld.global.v4.f32 {%f26,%f27,%f28,%f29}, [%rd28+0];\n"
" .loc 17 566 0\n"
" cvt.rzi.ftz.s32.f32 %r27, %f29;\n"
" sub.ftz.f32 %f30, %f18, %f27;\n"
" sub.ftz.f32 %f31, %f17, %f26;\n"
" sub.ftz.f32 %f32, %f19, %f28;\n"
" mul.ftz.f32 %f33, %f30, %f30;\n"
" fma.rn.ftz.f32 %f34, %f31, %f31, %f33;\n"
" fma.rn.ftz.f32 %f35, %f32, %f32, %f34;\n"
" add.s32 %r28, %r27, %r22;\n"
" cvt.s64.s32 %rd29, %r28;\n"
" mul.wide.s32 %rd30, %r28, 16;\n"
" add.u64 %rd31, %rd30, %rd21;\n"
" ld.global.f32 %f36, [%rd31+8];\n"
" setp.gt.ftz.f32 %p3, %f36, %f35;\n"
" @!%p3 bra $Lt_2_27138;\n"
" ld.global.f32 %f37, [%rd31+12];\n"
" mov.f32 %f38, 0f00000000; \n"
" setp.eq.ftz.f32 %p4, %f37, %f38;\n"
" @!%p4 bra $Lt_2_27138;\n"
" .loc 17 584 0\n"
" rcp.approx.ftz.f32 %f39, %f35;\n"
" mul.ftz.f32 %f40, %f39, %f39;\n"
" mul.ftz.f32 %f41, %f39, %f40;\n"
" mul.ftz.f32 %f42, %f39, %f41;\n"
" ld.global.v2.f32 {%f43,%f44}, [%rd31+0];\n"
" mul.ftz.f32 %f45, %f43, %f41;\n"
" sub.ftz.f32 %f46, %f45, %f44;\n"
" mul.ftz.f32 %f47, %f42, %f46;\n"
" mul.ftz.f32 %f48, %f25, %f47;\n"
" .loc 17 586 0\n"
" fma.rn.ftz.f32 %f23, %f31, %f48, %f23;\n"
" .loc 17 587 0\n"
" fma.rn.ftz.f32 %f22, %f30, %f48, %f22;\n"
" .loc 17 588 0\n"
" fma.rn.ftz.f32 %f21, %f32, %f48, %f21;\n"
" ld.param.s32 %r29, [__cudaparm_kernel_lj_eflag];\n"
" mov.u32 %r30, 0;\n"
" setp.le.s32 %p5, %r29, %r30;\n"
" @%p5 bra $Lt_2_19970;\n"
" .loc 17 592 0\n"
" ld.param.u64 %rd32, [__cudaparm_kernel_lj_lj3];\n"
" add.u64 %rd33, %rd32, %rd30;\n"
" ld.global.v4.f32 {%f49,%f50,%f51,_}, [%rd33+0];\n"
" mul.ftz.f32 %f52, %f49, %f41;\n"
" sub.ftz.f32 %f53, %f52, %f50;\n"
" mul.ftz.f32 %f54, %f41, %f53;\n"
" sub.ftz.f32 %f55, %f54, %f51;\n"
" fma.rn.ftz.f32 %f24, %f25, %f55, %f24;\n"
"$Lt_2_19970:\n"
" ld.param.s32 %r31, [__cudaparm_kernel_lj_vflag];\n"
" mov.u32 %r32, 0;\n"
" setp.le.s32 %p6, %r31, %r32;\n"
" @%p6 bra $Lt_2_27138;\n"
" .loc 17 595 0\n"
" mov.f32 %f56, %f6;\n"
" mul.ftz.f32 %f57, %f31, %f31;\n"
" fma.rn.ftz.f32 %f58, %f48, %f57, %f56;\n"
" mov.f32 %f6, %f58;\n"
" .loc 17 596 0\n"
" mov.f32 %f59, %f8;\n"
" fma.rn.ftz.f32 %f60, %f48, %f33, %f59;\n"
" mov.f32 %f8, %f60;\n"
" .loc 17 597 0\n"
" mov.f32 %f61, %f10;\n"
" mul.ftz.f32 %f62, %f32, %f32;\n"
" fma.rn.ftz.f32 %f63, %f48, %f62, %f61;\n"
" mov.f32 %f10, %f63;\n"
" .loc 17 598 0\n"
" mov.f32 %f64, %f12;\n"
" mul.ftz.f32 %f65, %f30, %f31;\n"
" fma.rn.ftz.f32 %f66, %f48, %f65, %f64;\n"
" mov.f32 %f12, %f66;\n"
" .loc 17 599 0\n"
" mov.f32 %f67, %f14;\n"
" mul.ftz.f32 %f68, %f31, %f32;\n"
" fma.rn.ftz.f32 %f69, %f48, %f68, %f67;\n"
" mov.f32 %f14, %f69;\n"
" .loc 17 600 0\n"
" mul.ftz.f32 %f70, %f30, %f32;\n"
" fma.rn.ftz.f32 %f15, %f48, %f70, %f15;\n"
" mov.f32 %f16, %f15;\n"
"$Lt_2_27138:\n"
"$L_2_18178:\n"
" .loc 17 594 0\n"
" mul.lo.s32 %r33, %r15, %r1;\n"
" cvt.s64.s32 %rd34, %r33;\n"
" mul.wide.s32 %rd35, %r33, 4;\n"
" add.u64 %rd17, %rd17, %rd35;\n"
" setp.gt.u64 %p7, %rd20, %rd17;\n"
" @%p7 bra $Lt_2_19714;\n"
" bra.uni $Lt_2_19202;\n"
"$Lt_2_26882:\n"
" mov.f32 %f21, 0f00000000; \n"
" mov.f32 %f22, 0f00000000; \n"
" mov.f32 %f23, 0f00000000; \n"
" mov.f32 %f24, 0f00000000; \n"
"$Lt_2_19202:\n"
" mov.u32 %r34, 1;\n"
" setp.le.s32 %p8, %r1, %r34;\n"
" @%p8 bra $Lt_2_23298;\n"
" .loc 17 604 0\n"
" mov.u64 %rd36, __cuda___cuda_local_var_33459_55_non_const_red_acc7044;\n"
" cvt.s64.s32 %rd37, %r2;\n"
" mul.wide.s32 %rd38, %r2, 4;\n"
" add.u64 %rd39, %rd36, %rd38;\n"
" mov.f32 %f71, %f23;\n"
" st.shared.f32 [%rd39+0], %f71;\n"
" mov.f32 %f72, %f22;\n"
" st.shared.f32 [%rd39+512], %f72;\n"
" mov.f32 %f73, %f21;\n"
" st.shared.f32 [%rd39+1024], %f73;\n"
" mov.f32 %f74, %f24;\n"
" st.shared.f32 [%rd39+1536], %f74;\n"
" shr.s32 %r35, %r1, 31;\n"
" mov.s32 %r36, 1;\n"
" and.b32 %r37, %r35, %r36;\n"
" add.s32 %r38, %r37, %r1;\n"
" shr.s32 %r39, %r38, 1;\n"
" mov.s32 %r40, %r39;\n"
" mov.u32 %r41, 0;\n"
" setp.ne.u32 %p9, %r39, %r41;\n"
" @!%p9 bra $Lt_2_21762;\n"
"$Lt_2_22274:\n"
" setp.ge.u32 %p10, %r17, %r40;\n"
" @%p10 bra $Lt_2_22530;\n"
" add.u32 %r42, %r2, %r40;\n"
" cvt.u64.u32 %rd40, %r42;\n"
" mul.wide.u32 %rd41, %r42, 4;\n"
" add.u64 %rd42, %rd36, %rd41;\n"
" ld.shared.f32 %f75, [%rd42+0];\n"
" add.ftz.f32 %f71, %f75, %f71;\n"
" st.shared.f32 [%rd39+0], %f71;\n"
" ld.shared.f32 %f76, [%rd42+512];\n"
" add.ftz.f32 %f72, %f76, %f72;\n"
" st.shared.f32 [%rd39+512], %f72;\n"
" ld.shared.f32 %f77, [%rd42+1024];\n"
" add.ftz.f32 %f73, %f77, %f73;\n"
" st.shared.f32 [%rd39+1024], %f73;\n"
" ld.shared.f32 %f78, [%rd42+1536];\n"
" add.ftz.f32 %f74, %f78, %f74;\n"
" st.shared.f32 [%rd39+1536], %f74;\n"
"$Lt_2_22530:\n"
" shr.u32 %r40, %r40, 1;\n"
" mov.u32 %r43, 0;\n"
" setp.ne.u32 %p11, %r40, %r43;\n"
" @%p11 bra $Lt_2_22274;\n"
"$Lt_2_21762:\n"
" mov.f32 %f23, %f71;\n"
" mov.f32 %f22, %f72;\n"
" mov.f32 %f21, %f73;\n"
" mov.f32 %f24, %f74;\n"
" ld.param.s32 %r44, [__cudaparm_kernel_lj_vflag];\n"
" mov.u32 %r45, 0;\n"
" setp.le.s32 %p12, %r44, %r45;\n"
" @%p12 bra $Lt_2_23298;\n"
" mov.f32 %f71, %f6;\n"
" st.shared.f32 [%rd39+0], %f71;\n"
" mov.f32 %f72, %f8;\n"
" st.shared.f32 [%rd39+512], %f72;\n"
" mov.f32 %f73, %f10;\n"
" st.shared.f32 [%rd39+1024], %f73;\n"
" mov.f32 %f74, %f12;\n"
" st.shared.f32 [%rd39+1536], %f74;\n"
" mov.f32 %f79, %f14;\n"
" st.shared.f32 [%rd39+2048], %f79;\n"
" mov.f32 %f80, %f15;\n"
" st.shared.f32 [%rd39+2560], %f80;\n"
" mov.s32 %r46, %r39;\n"
" @!%p9 bra $Lt_2_23810;\n"
"$Lt_2_24322:\n"
" setp.ge.u32 %p13, %r17, %r46;\n"
" @%p13 bra $Lt_2_24578;\n"
" add.u32 %r47, %r2, %r46;\n"
" cvt.u64.u32 %rd43, %r47;\n"
" mul.wide.u32 %rd44, %r47, 4;\n"
" add.u64 %rd45, %rd36, %rd44;\n"
" ld.shared.f32 %f81, [%rd45+0];\n"
" add.ftz.f32 %f71, %f81, %f71;\n"
" st.shared.f32 [%rd39+0], %f71;\n"
" ld.shared.f32 %f82, [%rd45+512];\n"
" add.ftz.f32 %f72, %f82, %f72;\n"
" st.shared.f32 [%rd39+512], %f72;\n"
" ld.shared.f32 %f83, [%rd45+1024];\n"
" add.ftz.f32 %f73, %f83, %f73;\n"
" st.shared.f32 [%rd39+1024], %f73;\n"
" ld.shared.f32 %f84, [%rd45+1536];\n"
" add.ftz.f32 %f74, %f84, %f74;\n"
" st.shared.f32 [%rd39+1536], %f74;\n"
" ld.shared.f32 %f85, [%rd45+2048];\n"
" add.ftz.f32 %f79, %f85, %f79;\n"
" st.shared.f32 [%rd39+2048], %f79;\n"
" ld.shared.f32 %f86, [%rd45+2560];\n"
" add.ftz.f32 %f80, %f86, %f80;\n"
" st.shared.f32 [%rd39+2560], %f80;\n"
"$Lt_2_24578:\n"
" shr.u32 %r46, %r46, 1;\n"
" mov.u32 %r48, 0;\n"
" setp.ne.u32 %p14, %r46, %r48;\n"
" @%p14 bra $Lt_2_24322;\n"
"$Lt_2_23810:\n"
" mov.f32 %f6, %f71;\n"
" mov.f32 %f8, %f72;\n"
" mov.f32 %f10, %f73;\n"
" mov.f32 %f12, %f74;\n"
" mov.f32 %f14, %f79;\n"
" mov.f32 %f16, %f80;\n"
"$Lt_2_23298:\n"
"$Lt_2_21250:\n"
" mov.u32 %r49, 0;\n"
" setp.ne.s32 %p15, %r17, %r49;\n"
" @%p15 bra $Lt_2_25346;\n"
" ld.param.u64 %rd46, [__cudaparm_kernel_lj___val_paramengv];\n"
" add.u64 %rd47, %rd46, %rd3;\n"
" ld.param.s32 %r50, [__cudaparm_kernel_lj_eflag];\n"
" mov.u32 %r51, 0;\n"
" setp.le.s32 %p16, %r50, %r51;\n"
" @%p16 bra $Lt_2_25858;\n"
" ld.global.f32 %f87, [%rd47+0];\n"
" add.ftz.f32 %f88, %f87, %f24;\n"
" st.global.f32 [%rd47+0], %f88;\n"
" cvt.s64.s32 %rd48, %r11;\n"
" mul.wide.s32 %rd49, %r11, 4;\n"
" add.u64 %rd47, %rd47, %rd49;\n"
"$Lt_2_25858:\n"
" ld.param.s32 %r52, [__cudaparm_kernel_lj_vflag];\n"
" mov.u32 %r53, 0;\n"
" setp.le.s32 %p17, %r52, %r53;\n"
" @%p17 bra $Lt_2_26370;\n"
" ld.global.f32 %f89, [%rd47+0];\n"
" mov.f32 %f90, %f6;\n"
" add.ftz.f32 %f91, %f89, %f90;\n"
" st.global.f32 [%rd47+0], %f91;\n"
" cvt.s64.s32 %rd50, %r11;\n"
" mul.wide.s32 %rd51, %r11, 4;\n"
" add.u64 %rd52, %rd51, %rd47;\n"
" ld.global.f32 %f92, [%rd52+0];\n"
" mov.f32 %f93, %f8;\n"
" add.ftz.f32 %f94, %f92, %f93;\n"
" st.global.f32 [%rd52+0], %f94;\n"
" add.u64 %rd53, %rd51, %rd52;\n"
" ld.global.f32 %f95, [%rd53+0];\n"
" mov.f32 %f96, %f10;\n"
" add.ftz.f32 %f97, %f95, %f96;\n"
" st.global.f32 [%rd53+0], %f97;\n"
" add.u64 %rd54, %rd51, %rd53;\n"
" ld.global.f32 %f98, [%rd54+0];\n"
" mov.f32 %f99, %f12;\n"
" add.ftz.f32 %f100, %f98, %f99;\n"
" st.global.f32 [%rd54+0], %f100;\n"
" add.u64 %rd55, %rd51, %rd54;\n"
" ld.global.f32 %f101, [%rd55+0];\n"
" mov.f32 %f102, %f14;\n"
" add.ftz.f32 %f103, %f101, %f102;\n"
" st.global.f32 [%rd55+0], %f103;\n"
" add.u64 %rd47, %rd51, %rd55;\n"
" ld.global.f32 %f104, [%rd47+0];\n"
" mov.f32 %f105, %f16;\n"
" add.ftz.f32 %f106, %f104, %f105;\n"
" st.global.f32 [%rd47+0], %f106;\n"
"$Lt_2_26370:\n"
" ld.param.u64 %rd56, [__cudaparm_kernel_lj_ans];\n"
" mul.lo.u64 %rd57, %rd2, 16;\n"
" add.u64 %rd58, %rd56, %rd57;\n"
" ld.global.v4.f32 {%f107,%f108,%f109,%f110}, [%rd58+0];\n"
" add.ftz.f32 %f111, %f108, %f22;\n"
" add.ftz.f32 %f112, %f109, %f21;\n"
" add.ftz.f32 %f113, %f107, %f23;\n"
" st.global.v4.f32 [%rd58+0], {%f113,%f111,%f112,%f110};\n"
"$Lt_2_25346:\n"
"$Lt_2_18690:\n"
" .loc 17 607 0\n"
" exit;\n"
"$LDWend_kernel_lj:\n"
" }\n"
/* ---------------------------------------------------------------------------
 * PTX entry point kernel_lj_fast (embedded as adjacent C string literals).
 *
 * What the instructions below do, in order:
 *   1. Threads with %tid.x <= 3 copy one float each from `gum` into a 16-byte
 *      shared array; threads with %tid.x <= 120 copy one 16-byte element each
 *      from `lj1_in` (and from `lj3_in` when eflag > 0) into shared arrays.
 *   2. After a bar.sync, each thread derives a row index
 *        start + %tid.x / t_per_atom + %ctaid.x * (%ntid.x / t_per_atom)
 *      and exits if it is >= inum.
 *   3. It walks a list of 32-bit entries read through `dev_ij`, accumulating
 *      three force components, one energy term (eflag > 0) and six virial
 *      terms (vflag > 0).
 *   4. When t_per_atom > 1 the partial sums are combined through shared
 *      memory; the thread whose lane index (%tid.x & (t_per_atom-1)) is zero
 *      adds the results into `engv` and `ans`.
 *
 * Parameters (as read by ld.param below):
 *   x_         base of 16-byte elements, loaded as four f32
 *   lj1_in     base of 16-byte coefficient elements (staged to shared memory)
 *   lj3_in     base of 16-byte coefficient elements (staged only if eflag > 0)
 *   gum        base of four f32 scale factors (staged to shared memory)
 *   stride     element stride (in 32-bit words) used to step through dev_ij
 *   dev_ij     base of the 32-bit index/neighbor data
 *   ans        base of 16-byte elements; x/y/z are incremented, w is kept
 *   engv       base of f32 accumulators, consecutive quantities inum apart
 *   err_flag   declared but not referenced anywhere in this kernel body
 *   eflag      energy term is computed/stored only when > 0
 *   vflag      virial terms are computed/stored only when > 0
 *   start      offset added to the computed row index
 *   inum       upper bound for the row index and stride between engv slots
 *   t_per_atom number of threads cooperating on one row
 *
 * NOTE(review): the lane index is computed with a bit mask, which equals
 * %tid.x modulo t_per_atom only if t_per_atom is a power of two - confirm
 * against the host-side launch code.
 * ------------------------------------------------------------------------- */
" .entry kernel_lj_fast (\n"
" .param .u64 __cudaparm_kernel_lj_fast_x_,\n"
" .param .u64 __cudaparm_kernel_lj_fast_lj1_in,\n"
" .param .u64 __cudaparm_kernel_lj_fast_lj3_in,\n"
" .param .u64 __cudaparm_kernel_lj_fast_gum,\n"
" .param .s32 __cudaparm_kernel_lj_fast_stride,\n"
" .param .u64 __cudaparm_kernel_lj_fast_dev_ij,\n"
" .param .u64 __cudaparm_kernel_lj_fast_ans,\n"
" .param .u64 __cudaparm_kernel_lj_fast___val_paramengv,\n"
" .param .u64 __cudaparm_kernel_lj_fast_err_flag,\n"
" .param .s32 __cudaparm_kernel_lj_fast_eflag,\n"
" .param .s32 __cudaparm_kernel_lj_fast_vflag,\n"
" .param .s32 __cudaparm_kernel_lj_fast_start,\n"
" .param .s32 __cudaparm_kernel_lj_fast_inum,\n"
" .param .s32 __cudaparm_kernel_lj_fast_t_per_atom)\n"
" {\n"
/* Virtual register pools and shared-memory storage.
 *   sp_lj   : 16 bytes   = 4 f32 (filled by threads 0..3)
 *   lj1/lj3 : 1936 bytes = 121 elements of 16 bytes (filled by threads 0..120)
 *   red_acc : 3072 bytes = 6 planes of 512 bytes used by the reductions */
" .reg .u32 %r<57>;\n"
" .reg .u64 %rd<72>;\n"
" .reg .f32 %f<122>;\n"
" .reg .pred %p<22>;\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_33475_33_non_const_sp_lj10212[16];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_33476_34_non_const_lj110240[1936];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_33477_34_non_const_lj312176[1936];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_33547_55_non_const_red_acc14112[3072];\n"
" .loc 17 615 0\n"
"$LDWbegin_kernel_lj_fast:\n"
/* %r1 = %tid.x. Threads with %r1 <= 3 copy gum[%r1] into sp_lj[%r1]. */
" cvt.s32.u32 %r1, %tid.x;\n"
" mov.u32 %r2, 3;\n"
" setp.gt.s32 %p1, %r1, %r2;\n"
" @%p1 bra $Lt_3_20994;\n"
" .loc 17 624 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_33475_33_non_const_sp_lj10212;\n"
" cvt.s64.s32 %rd2, %r1;\n"
" mul.wide.s32 %rd3, %r1, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_kernel_lj_fast_gum];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.f32 %f1, [%rd5+0];\n"
" add.u64 %rd6, %rd3, %rd1;\n"
" st.shared.f32 [%rd6+0], %f1;\n"
"$Lt_3_20994:\n"
/* %rd1 is re-materialised here so that it holds the sp_lj base on both paths.
 * Threads with %r1 <= 120 copy one 16-byte lj1_in element into shared lj1. */
" mov.u64 %rd1, __cuda___cuda_local_var_33475_33_non_const_sp_lj10212;\n"
" mov.u32 %r3, 120;\n"
" setp.gt.s32 %p2, %r1, %r3;\n"
" @%p2 bra $Lt_3_21506;\n"
" .loc 17 626 0\n"
" mov.u64 %rd7, __cuda___cuda_local_var_33476_34_non_const_lj110240;\n"
" cvt.s64.s32 %rd8, %r1;\n"
" mul.wide.s32 %rd9, %r1, 16;\n"
" ld.param.u64 %rd10, [__cudaparm_kernel_lj_fast_lj1_in];\n"
" add.u64 %rd11, %rd10, %rd9;\n"
" add.u64 %rd12, %rd9, %rd7;\n"
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n"
/* The lj3_in element is staged only when eflag > 0; it is read later only
 * inside the eflag > 0 branch of the main loop. */
" ld.param.s32 %r4, [__cudaparm_kernel_lj_fast_eflag];\n"
" mov.u32 %r5, 0;\n"
" setp.le.s32 %p3, %r4, %r5;\n"
" @%p3 bra $Lt_3_22018;\n"
" .loc 17 628 0\n"
" mov.u64 %rd13, __cuda___cuda_local_var_33477_34_non_const_lj312176;\n"
" ld.param.u64 %rd14, [__cudaparm_kernel_lj_fast_lj3_in];\n"
" add.u64 %rd15, %rd14, %rd9;\n"
" add.u64 %rd16, %rd9, %rd13;\n"
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n"
"$Lt_3_22018:\n"
" mov.u64 %rd13, __cuda___cuda_local_var_33477_34_non_const_lj312176;\n"
"$Lt_3_21506:\n"
/* Join point: %rd13 = shared lj3 base, %rd7 = shared lj1 base on every path. */
" mov.u64 %rd13, __cuda___cuda_local_var_33477_34_non_const_lj312176;\n"
" mov.u64 %rd7, __cuda___cuda_local_var_33476_34_non_const_lj110240;\n"
" .loc 17 638 0\n"
/* Zero the six virial accumulators kept in %f11, %f13, %f15, %f17, %f19 and
 * %f21 (%f20 shadows %f21 inside the loop). */
" mov.f32 %f10, 0f00000000; \n"
" mov.f32 %f11, %f10;\n"
" mov.f32 %f12, 0f00000000; \n"
" mov.f32 %f13, %f12;\n"
" mov.f32 %f14, 0f00000000; \n"
" mov.f32 %f15, %f14;\n"
" mov.f32 %f16, 0f00000000; \n"
" mov.f32 %f17, %f16;\n"
" mov.f32 %f18, 0f00000000; \n"
" mov.f32 %f19, %f18;\n"
" mov.f32 %f20, 0f00000000; \n"
" mov.f32 %f21, %f20;\n"
" .loc 17 640 0\n"
/* Barrier: the shared tables written above are read by other threads below. */
" bar.sync 0;\n"
/* Row index: %r14 = start + %tid.x / t_per_atom
 *                         + %ctaid.x * (%ntid.x / t_per_atom).
 * Threads whose row index is >= inum jump straight to the exit. */
" ld.param.s32 %r6, [__cudaparm_kernel_lj_fast_t_per_atom];\n"
" div.s32 %r7, %r1, %r6;\n"
" cvt.s32.u32 %r8, %ntid.x;\n"
" div.s32 %r9, %r8, %r6;\n"
" cvt.s32.u32 %r10, %ctaid.x;\n"
" mul.lo.s32 %r11, %r10, %r9;\n"
" add.s32 %r12, %r7, %r11;\n"
" ld.param.s32 %r13, [__cudaparm_kernel_lj_fast_start];\n"
" add.s32 %r14, %r13, %r12;\n"
" ld.param.s32 %r15, [__cudaparm_kernel_lj_fast_inum];\n"
" setp.ge.s32 %p4, %r14, %r15;\n"
" @%p4 bra $Lt_3_29186;\n"
" .loc 17 645 0\n"
/* %r16 = dev_ij[row]          (used below as the index into x_)
 * %r18 = dev_ij[row + stride] (used below as the entry count of the list) */
" cvt.s64.s32 %rd17, %r14;\n"
" mul.wide.s32 %rd18, %r14, 4;\n"
" ld.param.u64 %rd19, [__cudaparm_kernel_lj_fast_dev_ij];\n"
" add.u64 %rd20, %rd19, %rd18;\n"
" ld.global.s32 %r16, [%rd20+0];\n"
" ld.param.s32 %r17, [__cudaparm_kernel_lj_fast_stride];\n"
" cvt.s64.s32 %rd21, %r17;\n"
" mul.wide.s32 %rd22, %r17, 4;\n"
" add.u64 %rd23, %rd22, %rd20;\n"
" ld.global.s32 %r18, [%rd23+0];\n"
" .loc 17 648 0\n"
/* Load the four floats of x_[%r16] into %f22..%f25 (x, y, z, w). */
" ld.param.u64 %rd24, [__cudaparm_kernel_lj_fast_x_];\n"
" cvt.s64.s32 %rd25, %r16;\n"
" mul.wide.s32 %rd26, %r16, 16;\n"
" add.u64 %rd27, %rd24, %rd26;\n"
" ld.global.v4.f32 {%f22,%f23,%f24,%f25}, [%rd27+0];\n"
" .loc 17 650 0\n"
/* %r21 = %tid.x & (t_per_atom - 1): this thread's lane within its row.
 * List base %rd28 = &dev_ij[row + 2*stride].
 * %rd32 (cursor) starts at base + lane*stride words;
 * %rd35 (end)    is        base + count*stride words.
 * If the cursor already reaches the end, skip the loop entirely. */
" cvt.s32.s64 %r19, %rd21;\n"
" sub.s32 %r20, %r6, 1;\n"
" and.b32 %r21, %r20, %r1;\n"
" add.u64 %rd28, %rd22, %rd23;\n"
" mul.lo.s32 %r22, %r19, %r21;\n"
" cvt.s64.s32 %rd29, %r22;\n"
" mul.wide.s32 %rd30, %r22, 4;\n"
" add.u64 %rd31, %rd28, %rd30;\n"
" mov.s64 %rd32, %rd31;\n"
" mul.lo.s32 %r23, %r19, %r18;\n"
" cvt.s64.s32 %rd33, %r23;\n"
" mul.wide.s32 %rd34, %r23, 4;\n"
" add.u64 %rd35, %rd28, %rd34;\n"
" setp.ge.u64 %p5, %rd31, %rd35;\n"
" @%p5 bra $Lt_3_30722;\n"
/* %f26 = float(trunc(x_[i].w) * 11): row offset into the 121-entry tables.
 * NOTE(review): .w presumably carries the particle type - confirm.
 * %f27/%f28/%f29 accumulate the z/y/x force sums, %f30 the energy sum. */
" cvt.rzi.ftz.s32.f32 %r24, %f25;\n"
" mul.lo.s32 %r25, %r24, 11;\n"
" cvt.rn.f32.s32 %f26, %r25;\n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" mov.f32 %f29, 0f00000000; \n"
" mov.f32 %f30, 0f00000000; \n"
"$Lt_3_23554:\n"
" .loc 17 655 0\n"
/* ---- main loop body: one list entry per iteration ---- */
" ld.global.s32 %r26, [%rd32+0];\n"
" .loc 17 656 0\n"
/* The top two bits of the entry select one of the four sp_lj factors (%f31). */
" shr.s32 %r27, %r26, 30;\n"
" and.b32 %r28, %r27, 3;\n"
" cvt.s64.s32 %rd36, %r28;\n"
" mul.wide.s32 %rd37, %r28, 4;\n"
" add.u64 %rd38, %rd1, %rd37;\n"
" ld.shared.f32 %f31, [%rd38+0];\n"
" .loc 17 659 0\n"
/* The low 30 bits of the entry (mask 0x3FFFFFFF) index x_ for the partner j. */
" and.b32 %r29, %r26, 1073741823;\n"
" cvt.s64.s32 %rd39, %r29;\n"
" mul.wide.s32 %rd40, %r29, 16;\n"
" add.u64 %rd41, %rd24, %rd40;\n"
" ld.global.v4.f32 {%f32,%f33,%f34,%f35}, [%rd41+0];\n"
" .loc 17 655 0\n"
/* %f37 = dx, %f36 = dy, %f38 = dz (i minus j); %f41 = dx*dx + dy*dy + dz*dz.
 * %f39 = dy*dy is kept for the virial update further down. */
" sub.ftz.f32 %f36, %f23, %f33;\n"
" sub.ftz.f32 %f37, %f22, %f32;\n"
" sub.ftz.f32 %f38, %f24, %f34;\n"
" mul.ftz.f32 %f39, %f36, %f36;\n"
" fma.rn.ftz.f32 %f40, %f37, %f37, %f39;\n"
" fma.rn.ftz.f32 %f41, %f38, %f38, %f40;\n"
/* Table index = trunc(%f26 + x_[j].w); %rd44 -> shared lj1[index]. */
" add.ftz.f32 %f42, %f26, %f35;\n"
" cvt.rzi.ftz.s32.f32 %r30, %f42;\n"
" cvt.s64.s32 %rd42, %r30;\n"
" mul.wide.s32 %rd43, %r30, 16;\n"
" add.u64 %rd44, %rd43, %rd7;\n"
/* Skip this entry unless lj1[index].z > squared distance ... */
" ld.shared.f32 %f43, [%rd44+8];\n"
" setp.gt.ftz.f32 %p6, %f43, %f41;\n"
" @!%p6 bra $Lt_3_30978;\n"
/* ... and lj1[index].w == 0.0f.
 * NOTE(review): the meaning of the zero value in .w is not visible here. */
" ld.shared.f32 %f44, [%rd44+12];\n"
" mov.f32 %f45, 0f00000000; \n"
" setp.eq.ftz.f32 %p7, %f44, %f45;\n"
" @!%p7 bra $Lt_3_30978;\n"
" .loc 17 671 0\n"
/* %f46 = approx 1/rsq, %f48 = %f46 cubed.
 * %f55 = sp_lj_factor * %f46 * %f48 * (lj1.x * %f48 - lj1.y)
 * is the scalar multiplied onto dx, dy, dz below. */
" rcp.approx.ftz.f32 %f46, %f41;\n"
" mul.ftz.f32 %f47, %f46, %f46;\n"
" mul.ftz.f32 %f48, %f46, %f47;\n"
" mul.ftz.f32 %f49, %f46, %f31;\n"
" mul.ftz.f32 %f50, %f48, %f49;\n"
" ld.shared.v2.f32 {%f51,%f52}, [%rd44+0];\n"
" mul.ftz.f32 %f53, %f51, %f48;\n"
" sub.ftz.f32 %f54, %f53, %f52;\n"
" mul.ftz.f32 %f55, %f50, %f54;\n"
" .loc 17 673 0\n"
" fma.rn.ftz.f32 %f29, %f37, %f55, %f29;\n"
" .loc 17 674 0\n"
" fma.rn.ftz.f32 %f28, %f36, %f55, %f28;\n"
" .loc 17 675 0\n"
" fma.rn.ftz.f32 %f27, %f38, %f55, %f27;\n"
/* Energy term, only when eflag > 0:
 * %f30 += sp_lj_factor * (%f48 * (lj3.x * %f48 - lj3.y) - lj3.z). */
" ld.param.s32 %r31, [__cudaparm_kernel_lj_fast_eflag];\n"
" mov.u32 %r32, 0;\n"
" setp.le.s32 %p8, %r31, %r32;\n"
" @%p8 bra $Lt_3_23810;\n"
" .loc 17 678 0\n"
" add.u64 %rd45, %rd43, %rd13;\n"
" ld.shared.v4.f32 {%f56,%f57,%f58,_}, [%rd45+0];\n"
" mul.ftz.f32 %f59, %f56, %f48;\n"
" sub.ftz.f32 %f60, %f59, %f57;\n"
" mul.ftz.f32 %f61, %f48, %f60;\n"
" .loc 17 679 0\n"
" sub.ftz.f32 %f62, %f61, %f58;\n"
" fma.rn.ftz.f32 %f30, %f31, %f62, %f30;\n"
"$Lt_3_23810:\n"
/* Virial terms, only when vflag > 0. Each accumulator gains %f55 * product:
 *   %f11: dx*dx   %f13: dy*dy   %f15: dz*dz
 *   %f17: dy*dx   %f19: dx*dz   %f20/%f21: dy*dz */
" ld.param.s32 %r33, [__cudaparm_kernel_lj_fast_vflag];\n"
" mov.u32 %r34, 0;\n"
" setp.le.s32 %p9, %r33, %r34;\n"
" @%p9 bra $Lt_3_30978;\n"
" .loc 17 682 0\n"
" mov.f32 %f63, %f11;\n"
" mul.ftz.f32 %f64, %f37, %f37;\n"
" fma.rn.ftz.f32 %f65, %f55, %f64, %f63;\n"
" mov.f32 %f11, %f65;\n"
" .loc 17 683 0\n"
" mov.f32 %f66, %f13;\n"
" fma.rn.ftz.f32 %f67, %f55, %f39, %f66;\n"
" mov.f32 %f13, %f67;\n"
" .loc 17 684 0\n"
" mov.f32 %f68, %f15;\n"
" mul.ftz.f32 %f69, %f38, %f38;\n"
" fma.rn.ftz.f32 %f70, %f55, %f69, %f68;\n"
" mov.f32 %f15, %f70;\n"
" .loc 17 685 0\n"
" mov.f32 %f71, %f17;\n"
" mul.ftz.f32 %f72, %f36, %f37;\n"
" fma.rn.ftz.f32 %f73, %f55, %f72, %f71;\n"
" mov.f32 %f17, %f73;\n"
" .loc 17 686 0\n"
" mov.f32 %f74, %f19;\n"
" mul.ftz.f32 %f75, %f37, %f38;\n"
" fma.rn.ftz.f32 %f76, %f55, %f75, %f74;\n"
" mov.f32 %f19, %f76;\n"
" .loc 17 687 0\n"
" mul.ftz.f32 %f77, %f36, %f38;\n"
" fma.rn.ftz.f32 %f20, %f55, %f77, %f20;\n"
" mov.f32 %f21, %f20;\n"
"$Lt_3_30978:\n"
"$L_3_20482:\n"
" .loc 17 681 0\n"
/* Advance the cursor by stride * t_per_atom words and loop while it is
 * still below the end pointer. */
" mul.lo.s32 %r35, %r19, %r6;\n"
" cvt.s64.s32 %rd46, %r35;\n"
" mul.wide.s32 %rd47, %r35, 4;\n"
" add.u64 %rd32, %rd32, %rd47;\n"
" setp.gt.u64 %p10, %rd35, %rd32;\n"
" @%p10 bra $Lt_3_23554;\n"
" bra.uni $Lt_3_23042;\n"
"$Lt_3_30722:\n"
/* Empty-list path: the force and energy sums are simply zero. */
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" mov.f32 %f29, 0f00000000; \n"
" mov.f32 %f30, 0f00000000; \n"
"$Lt_3_23042:\n"
/* With t_per_atom <= 1 there is nothing to combine; skip both reductions. */
" mov.u32 %r36, 1;\n"
" setp.le.s32 %p11, %r6, %r36;\n"
" @%p11 bra $Lt_3_27138;\n"
" .loc 17 692 0\n"
/* Reduction 1 (force x, y, z and energy). Each thread publishes its four
 * partial sums at red_acc + 4*%tid.x in planes 512 bytes apart. */
" mov.u64 %rd48, __cuda___cuda_local_var_33547_55_non_const_red_acc14112;\n"
" cvt.s64.s32 %rd49, %r1;\n"
" mul.wide.s32 %rd50, %r1, 4;\n"
" add.u64 %rd51, %rd48, %rd50;\n"
" mov.f32 %f78, %f29;\n"
" st.shared.f32 [%rd51+0], %f78;\n"
" mov.f32 %f79, %f28;\n"
" st.shared.f32 [%rd51+512], %f79;\n"
" mov.f32 %f80, %f27;\n"
" st.shared.f32 [%rd51+1024], %f80;\n"
" mov.f32 %f81, %f30;\n"
" st.shared.f32 [%rd51+1536], %f81;\n"
/* %r41 = t_per_atom / 2 (signed divide by two, rounding toward zero);
 * %p12 records whether that initial step is non-zero. */
" shr.s32 %r37, %r6, 31;\n"
" mov.s32 %r38, 1;\n"
" and.b32 %r39, %r37, %r38;\n"
" add.s32 %r40, %r39, %r6;\n"
" shr.s32 %r41, %r40, 1;\n"
" mov.s32 %r42, %r41;\n"
" mov.u32 %r43, 0;\n"
" setp.ne.u32 %p12, %r41, %r43;\n"
" @!%p12 bra $Lt_3_25602;\n"
"$Lt_3_26114:\n"
/* Halving loop: lanes below the current step %r42 add the values published
 * by thread (%tid.x + step) into their own slots; the step is then halved.
 * NOTE(review): no bar.sync appears inside this loop; presumably the
 * cooperating threads are assumed to execute in lock-step - confirm. */
" setp.ge.u32 %p13, %r21, %r42;\n"
" @%p13 bra $Lt_3_26370;\n"
" add.u32 %r44, %r1, %r42;\n"
" cvt.u64.u32 %rd52, %r44;\n"
" mul.wide.u32 %rd53, %r44, 4;\n"
" add.u64 %rd54, %rd48, %rd53;\n"
" ld.shared.f32 %f82, [%rd54+0];\n"
" add.ftz.f32 %f78, %f82, %f78;\n"
" st.shared.f32 [%rd51+0], %f78;\n"
" ld.shared.f32 %f83, [%rd54+512];\n"
" add.ftz.f32 %f79, %f83, %f79;\n"
" st.shared.f32 [%rd51+512], %f79;\n"
" ld.shared.f32 %f84, [%rd54+1024];\n"
" add.ftz.f32 %f80, %f84, %f80;\n"
" st.shared.f32 [%rd51+1024], %f80;\n"
" ld.shared.f32 %f85, [%rd54+1536];\n"
" add.ftz.f32 %f81, %f85, %f81;\n"
" st.shared.f32 [%rd51+1536], %f81;\n"
"$Lt_3_26370:\n"
" shr.u32 %r42, %r42, 1;\n"
" mov.u32 %r45, 0;\n"
" setp.ne.u32 %p14, %r42, %r45;\n"
" @%p14 bra $Lt_3_26114;\n"
"$Lt_3_25602:\n"
/* Copy the combined force/energy values back into the working registers. */
" mov.f32 %f29, %f78;\n"
" mov.f32 %f28, %f79;\n"
" mov.f32 %f27, %f80;\n"
" mov.f32 %f30, %f81;\n"
/* Reduction 2 (six virial terms), only when vflag > 0. It reuses the same
 * red_acc slots, now with six planes at offsets 0..2560. */
" ld.param.s32 %r46, [__cudaparm_kernel_lj_fast_vflag];\n"
" mov.u32 %r47, 0;\n"
" setp.le.s32 %p15, %r46, %r47;\n"
" @%p15 bra $Lt_3_27138;\n"
" mov.f32 %f78, %f11;\n"
" st.shared.f32 [%rd51+0], %f78;\n"
" mov.f32 %f79, %f13;\n"
" st.shared.f32 [%rd51+512], %f79;\n"
" mov.f32 %f80, %f15;\n"
" st.shared.f32 [%rd51+1024], %f80;\n"
" mov.f32 %f81, %f17;\n"
" st.shared.f32 [%rd51+1536], %f81;\n"
" mov.f32 %f86, %f19;\n"
" st.shared.f32 [%rd51+2048], %f86;\n"
" mov.f32 %f87, %f20;\n"
" st.shared.f32 [%rd51+2560], %f87;\n"
" mov.s32 %r48, %r41;\n"
" @!%p12 bra $Lt_3_27650;\n"
"$Lt_3_28162:\n"
/* Same halving scheme as reduction 1, applied to all six planes. */
" setp.ge.u32 %p16, %r21, %r48;\n"
" @%p16 bra $Lt_3_28418;\n"
" add.u32 %r49, %r1, %r48;\n"
" cvt.u64.u32 %rd55, %r49;\n"
" mul.wide.u32 %rd56, %r49, 4;\n"
" add.u64 %rd57, %rd48, %rd56;\n"
" ld.shared.f32 %f88, [%rd57+0];\n"
" add.ftz.f32 %f78, %f88, %f78;\n"
" st.shared.f32 [%rd51+0], %f78;\n"
" ld.shared.f32 %f89, [%rd57+512];\n"
" add.ftz.f32 %f79, %f89, %f79;\n"
" st.shared.f32 [%rd51+512], %f79;\n"
" ld.shared.f32 %f90, [%rd57+1024];\n"
" add.ftz.f32 %f80, %f90, %f80;\n"
" st.shared.f32 [%rd51+1024], %f80;\n"
" ld.shared.f32 %f91, [%rd57+1536];\n"
" add.ftz.f32 %f81, %f91, %f81;\n"
" st.shared.f32 [%rd51+1536], %f81;\n"
" ld.shared.f32 %f92, [%rd57+2048];\n"
" add.ftz.f32 %f86, %f92, %f86;\n"
" st.shared.f32 [%rd51+2048], %f86;\n"
" ld.shared.f32 %f93, [%rd57+2560];\n"
" add.ftz.f32 %f87, %f93, %f87;\n"
" st.shared.f32 [%rd51+2560], %f87;\n"
"$Lt_3_28418:\n"
" shr.u32 %r48, %r48, 1;\n"
" mov.u32 %r50, 0;\n"
" setp.ne.u32 %p17, %r48, %r50;\n"
" @%p17 bra $Lt_3_28162;\n"
"$Lt_3_27650:\n"
/* Copy the combined virial values back into their accumulator registers. */
" mov.f32 %f11, %f78;\n"
" mov.f32 %f13, %f79;\n"
" mov.f32 %f15, %f80;\n"
" mov.f32 %f17, %f81;\n"
" mov.f32 %f19, %f86;\n"
" mov.f32 %f21, %f87;\n"
"$Lt_3_27138:\n"
"$Lt_3_25090:\n"
/* Only the thread with lane index 0 writes results; all others exit. */
" mov.u32 %r51, 0;\n"
" setp.ne.s32 %p18, %r21, %r51;\n"
" @%p18 bra $Lt_3_29186;\n"
/* %rd59 = &engv[row]. When eflag > 0 the energy sum is added there and the
 * pointer advances by inum floats to the next quantity. */
" ld.param.u64 %rd58, [__cudaparm_kernel_lj_fast___val_paramengv];\n"
" add.u64 %rd59, %rd58, %rd18;\n"
" ld.param.s32 %r52, [__cudaparm_kernel_lj_fast_eflag];\n"
" mov.u32 %r53, 0;\n"
" setp.le.s32 %p19, %r52, %r53;\n"
" @%p19 bra $Lt_3_29698;\n"
" ld.global.f32 %f94, [%rd59+0];\n"
" add.ftz.f32 %f95, %f94, %f30;\n"
" st.global.f32 [%rd59+0], %f95;\n"
" cvt.s64.s32 %rd60, %r15;\n"
" mul.wide.s32 %rd61, %r15, 4;\n"
" add.u64 %rd59, %rd59, %rd61;\n"
"$Lt_3_29698:\n"
/* When vflag > 0 the six virial sums are added into six consecutive engv
 * slots, each inum floats apart (read-modify-write with plain ld/st). */
" ld.param.s32 %r54, [__cudaparm_kernel_lj_fast_vflag];\n"
" mov.u32 %r55, 0;\n"
" setp.le.s32 %p20, %r54, %r55;\n"
" @%p20 bra $Lt_3_30210;\n"
" ld.global.f32 %f96, [%rd59+0];\n"
" mov.f32 %f97, %f11;\n"
" add.ftz.f32 %f98, %f96, %f97;\n"
" st.global.f32 [%rd59+0], %f98;\n"
" cvt.s64.s32 %rd62, %r15;\n"
" mul.wide.s32 %rd63, %r15, 4;\n"
" add.u64 %rd64, %rd63, %rd59;\n"
" ld.global.f32 %f99, [%rd64+0];\n"
" mov.f32 %f100, %f13;\n"
" add.ftz.f32 %f101, %f99, %f100;\n"
" st.global.f32 [%rd64+0], %f101;\n"
" add.u64 %rd65, %rd63, %rd64;\n"
" ld.global.f32 %f102, [%rd65+0];\n"
" mov.f32 %f103, %f15;\n"
" add.ftz.f32 %f104, %f102, %f103;\n"
" st.global.f32 [%rd65+0], %f104;\n"
" add.u64 %rd66, %rd63, %rd65;\n"
" ld.global.f32 %f105, [%rd66+0];\n"
" mov.f32 %f106, %f17;\n"
" add.ftz.f32 %f107, %f105, %f106;\n"
" st.global.f32 [%rd66+0], %f107;\n"
" add.u64 %rd67, %rd63, %rd66;\n"
" ld.global.f32 %f108, [%rd67+0];\n"
" mov.f32 %f109, %f19;\n"
" add.ftz.f32 %f110, %f108, %f109;\n"
" st.global.f32 [%rd67+0], %f110;\n"
" add.u64 %rd59, %rd63, %rd67;\n"
" ld.global.f32 %f111, [%rd59+0];\n"
" mov.f32 %f112, %f21;\n"
" add.ftz.f32 %f113, %f111, %f112;\n"
" st.global.f32 [%rd59+0], %f113;\n"
"$Lt_3_30210:\n"
/* ans[row] += (fx, fy, fz); the fourth component is written back unchanged. */
" ld.param.u64 %rd68, [__cudaparm_kernel_lj_fast_ans];\n"
" mul.lo.u64 %rd69, %rd17, 16;\n"
" add.u64 %rd70, %rd68, %rd69;\n"
" ld.global.v4.f32 {%f114,%f115,%f116,%f117}, [%rd70+0];\n"
" add.ftz.f32 %f118, %f115, %f28;\n"
" add.ftz.f32 %f119, %f116, %f27;\n"
" add.ftz.f32 %f120, %f114, %f29;\n"
" st.global.v4.f32 [%rd70+0], {%f120,%f118,%f119,%f117};\n"
"$Lt_3_29186:\n"
"$Lt_3_22530:\n"
" .loc 17 695 0\n"
" exit;\n"
"$LDWend_kernel_lj_fast:\n"
" }\n"
;