git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7581 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp
2012-01-25 21:22:15 +00:00
parent 4f7ae6ff9d
commit 2aaf4f6d7e
86 changed files with 0 additions and 46929 deletions

View File

@ -1,101 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_000099dd_00000000-9_lal_atom.cpp3.i (/home/sjplimp/ccBI#.Q6OzuV)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_000099dd_00000000-8_lal_atom.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lal_atom.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
//-----------------------------------------------------------
// kernel_cast_x
//   For every index gid < nall, reads three consecutive f64 values
//   x[3*gid .. 3*gid+2] and one s32 value type[gid], converts all
//   four to f32 and writes them as a single 16-byte vector to
//   x_type[gid] in the order {x0, x1, x2, (float)type}.
//
// Params:
//   x_type : base address of the 16-byte-per-element output
//   x      : base address of the f64 input (3 values per element)
//   type   : base address of the s32 input (1 value per element)
//   nall   : number of elements; threads with gid >= nall do nothing
//-----------------------------------------------------------
.entry kernel_cast_x (
.param .u64 __cudaparm_kernel_cast_x_x_type,
.param .u64 __cudaparm_kernel_cast_x_x,
.param .u64 __cudaparm_kernel_cast_x_type,
.param .s32 __cudaparm_kernel_cast_x_nall)
{
// Named virtual registers, grouped by role.
.reg .u32 %blk, %bdim, %base, %tix, %gid, %nall, %itype, %gid3;
.reg .u64 %type_ptr, %type_off, %type_addr;
.reg .u64 %x_ptr, %x_off, %x_addr;
.reg .u64 %out_ptr, %out_off, %out_addr;
.reg .f32 %px, %py, %pz, %pw;
.reg .f64 %dx, %dy, %dz;
.reg .pred %done;
.loc 16 21 0
$LDWbegin_kernel_cast_x:
// gid = ctaid.x * ntid.x + tid.x (24-bit multiply, as in the original)
mov.u32 %tix, %tid.x;
cvt.s32.u32 %bdim, %ntid.x;
cvt.s32.u32 %blk, %ctaid.x;
mul24.lo.s32 %base, %blk, %bdim;
add.u32 %gid, %base, %tix;
// Out-of-range threads leave immediately.
ld.param.s32 %nall, [__cudaparm_kernel_cast_x_nall];
setp.ge.s32 %done, %gid, %nall;
@%done bra $Lt_0_1026;
.loc 16 26 0
// pw = (float) type[gid]
ld.param.u64 %type_ptr, [__cudaparm_kernel_cast_x_type];
mul.wide.s32 %type_off, %gid, 4;
add.u64 %type_addr, %type_ptr, %type_off;
ld.global.s32 %itype, [%type_addr];
cvt.rn.f32.s32 %pw, %itype;
.loc 16 29 0
// x_addr = x + (3*gid) * sizeof(f64); the 3*gid product is 32-bit
ld.param.u64 %x_ptr, [__cudaparm_kernel_cast_x_x];
mul.lo.s32 %gid3, %gid, 3;
mul.wide.s32 %x_off, %gid3, 8;
add.u64 %x_addr, %x_ptr, %x_off;
ld.global.f64 %dy, [%x_addr+8];
cvt.rn.ftz.f32.f64 %py, %dy;
.loc 16 30 0
ld.global.f64 %dz, [%x_addr+16];
cvt.rn.ftz.f32.f64 %pz, %dz;
.loc 16 31 0
// out_addr = x_type + gid * 16
ld.param.u64 %out_ptr, [__cudaparm_kernel_cast_x_x_type];
mul.wide.s32 %out_off, %gid, 16;
add.u64 %out_addr, %out_ptr, %out_off;
ld.global.f64 %dx, [%x_addr];
cvt.rn.ftz.f32.f64 %px, %dx;
// All loads are issued before this single vector store.
st.global.v4.f32 [%out_addr], {%px,%py,%pz,%pw};
$Lt_0_1026:
.loc 16 33 0
exit;
$LDWend_kernel_cast_x:
} // kernel_cast_x

View File

@ -1,56 +0,0 @@
/*
 * PTX source text for the kernel_cast_x entry point, stored as a single
 * NUL-terminated C string (adjacent literals are concatenated by the
 * compiler).  The text targets PTX ISA 2.3 / sm_20 with 64-bit addresses.
 *
 * kernel_cast_x(x_type, x, type, nall): for each index below nall, loads
 * three f64 values from x and one s32 from type, converts them to f32 and
 * stores them as one v4.f32 into x_type.
 *
 * NOTE(review): the string keeps the ".loc 16 ..." line directives but
 * contains no ".file" directives; presumably it is produced mechanically
 * from a compiled .ptx file with the preamble stripped -- confirm before
 * editing by hand.
 */
const char * atom =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
" .entry kernel_cast_x (\n"
" .param .u64 __cudaparm_kernel_cast_x_x_type,\n"
" .param .u64 __cudaparm_kernel_cast_x_x,\n"
" .param .u64 __cudaparm_kernel_cast_x_type,\n"
" .param .s32 __cudaparm_kernel_cast_x_nall)\n"
" {\n"
" .reg .u32 %r<10>;\n"
" .reg .u64 %rd<13>;\n"
" .reg .f32 %f<6>;\n"
" .reg .f64 %fd<5>;\n"
" .reg .pred %p<3>;\n"
" .loc 16 21 0\n"
"$LDWbegin_kernel_cast_x:\n"
" cvt.s32.u32 %r1, %ctaid.x;\n"
" cvt.s32.u32 %r2, %ntid.x;\n"
" mul24.lo.s32 %r3, %r1, %r2;\n"
" mov.u32 %r4, %tid.x;\n"
" add.u32 %r5, %r3, %r4;\n"
" ld.param.s32 %r6, [__cudaparm_kernel_cast_x_nall];\n"
" setp.le.s32 %p1, %r6, %r5;\n"
" @%p1 bra $Lt_0_1026;\n"
" .loc 16 26 0\n"
" cvt.s64.s32 %rd1, %r5;\n"
" ld.param.u64 %rd2, [__cudaparm_kernel_cast_x_type];\n"
" mul.wide.s32 %rd3, %r5, 4;\n"
" add.u64 %rd4, %rd2, %rd3;\n"
" ld.global.s32 %r7, [%rd4+0];\n"
" cvt.rn.f32.s32 %f1, %r7;\n"
" .loc 16 29 0\n"
" ld.param.u64 %rd5, [__cudaparm_kernel_cast_x_x];\n"
" mul.lo.s32 %r8, %r5, 3;\n"
" cvt.s64.s32 %rd6, %r8;\n"
" mul.wide.s32 %rd7, %r8, 8;\n"
" add.u64 %rd8, %rd5, %rd7;\n"
" ld.global.f64 %fd1, [%rd8+8];\n"
" cvt.rn.ftz.f32.f64 %f2, %fd1;\n"
" .loc 16 30 0\n"
" ld.global.f64 %fd2, [%rd8+16];\n"
" cvt.rn.ftz.f32.f64 %f3, %fd2;\n"
" .loc 16 31 0\n"
" ld.param.u64 %rd9, [__cudaparm_kernel_cast_x_x_type];\n"
" mul.wide.s32 %rd10, %r5, 16;\n"
" add.u64 %rd11, %rd9, %rd10;\n"
" ld.global.f64 %fd3, [%rd8+0];\n"
" cvt.rn.ftz.f32.f64 %f4, %fd3;\n"
" st.global.v4.f32 [%rd11+0], {%f4,%f2,%f3,%f1};\n"
"$Lt_0_1026:\n"
" .loc 16 33 0\n"
" exit;\n"
"$LDWend_kernel_cast_x:\n"
" }\n"
;

View File

@ -1,958 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009eb0_00000000-9_lal_cg_cmm.cpp3.i (/home/sjplimp/ccBI#.oK8Qzh)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009eb0_00000000-8_lal_cg_cmm.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lal_cg_cmm.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
// Module-scope texture reference. kernel_pair and kernel_pair_fast fetch
// four-component f32 values from it with tex.1d.v4.f32.s32, indexed by an
// integer read from the neighbor data. The binding to actual memory is not
// visible in this file.
.global .texref pos_tex;
//-----------------------------------------------------------
// kernel_pair
//   Compiler output (nvopencc, -O3) for an entry point in lal_cg_cmm.cu.
//   Groups of t_per_atom threads share one row index ii; each thread walks
//   a strided slice of that row's neighbor entries, accumulates a
//   3-component force, an energy term and six virial terms, the group
//   combines them through shared memory, and the thread with in-group
//   offset 0 writes the results.
//
// Params (as used by the code below):
//   x_            : not referenced in this kernel body (values with this
//                   role are fetched through pos_tex instead)
//   lj1           : global table of 16-byte records indexed by
//                   (lj_types * w_i + w_j); +0 is compared against r^2,
//                   +4 selects the power-law branch (2.0, 1.0, other),
//                   +8/+12 are the two force coefficients
//   lj3           : global table with the same indexing; first three floats
//                   are read only when eflag > 0
//   lj_types      : row multiplier for the lj1/lj3 index
//   sp_lj_in      : four floats copied to shared memory; selected per
//                   neighbor by bits 30..31 of the neighbor entry
//   dev_nbor      : neighbor bookkeeping array (row pitch = nbor_pitch)
//   dev_packed    : if equal to dev_nbor the neighbor entries are read
//                   strided from dev_nbor, otherwise from dev_packed
//   ans           : per-row 16-byte output (three force components + w)
//   engv          : energy / virial outputs, one column of inum floats each
//   eflag, vflag  : energy / virial terms are computed and stored when > 0
//   inum          : number of rows; threads with ii >= inum exit
//   nbor_pitch    : row pitch of dev_nbor, in 4-byte elements
//   t_per_atom    : threads cooperating on one row
//                   NOTE(review): the in-group offset is computed as
//                   tid & (t_per_atom-1), which presumably requires a
//                   power of two -- confirm against the host code.
//
// NOTE(review): this kernel contains no bar.sync; the shared-memory
// reduction below has no barrier between steps. Presumably the host code
// guarantees each group of t_per_atom threads lies within one warp --
// confirm.
// NOTE(review): red_acc rows are 512 bytes (128 floats) apart and indexed
// by tid.x, so this layout assumes at most 128 threads per block -- confirm.
//-----------------------------------------------------------
.entry kernel_pair (
.param .u64 __cudaparm_kernel_pair_x_,
.param .u64 __cudaparm_kernel_pair_lj1,
.param .u64 __cudaparm_kernel_pair_lj3,
.param .s32 __cudaparm_kernel_pair_lj_types,
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_dev_nbor,
.param .u64 __cudaparm_kernel_pair_dev_packed,
.param .u64 __cudaparm_kernel_pair_ans,
.param .u64 __cudaparm_kernel_pair___val_paramengv,
.param .s32 __cudaparm_kernel_pair_eflag,
.param .s32 __cudaparm_kernel_pair_vflag,
.param .s32 __cudaparm_kernel_pair_inum,
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_t_per_atom)
{
.reg .u32 %r<72>;
.reg .u64 %rd<63>;
.reg .f32 %f<111>;
.reg .pred %p<21>;
// sp_lj: 4 floats; red_acc: 6 rows x 128 floats used by the reductions.
.shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];
.shared .align 4 .b8 __cuda___cuda_local_var_32608_55_non_const_red_acc108[3072];
// __cuda_local_var_32543_10_non_const_f = 48
// __cuda_local_var_32545_9_non_const_virial = 16
.loc 16 31 0
$LDWbegin_kernel_pair:
.loc 16 36 0
// Every thread copies the same four floats from sp_lj_in to shared sp_lj.
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
ldu.global.f32 %f1, [%rd1+0];
.loc 16 37 0
ld.global.f32 %f2, [%rd1+4];
.loc 16 38 0
ld.global.f32 %f3, [%rd1+8];
.loc 16 39 0
ld.global.f32 %f4, [%rd1+12];
st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
.loc 16 46 0
// Zero the six virial accumulators (%f6,%f8,%f10,%f12,%f14,%f16; %f15 is
// the working copy of the sixth).
mov.f32 %f5, 0f00000000; // 0
mov.f32 %f6, %f5;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, %f7;
mov.f32 %f9, 0f00000000; // 0
mov.f32 %f10, %f9;
mov.f32 %f11, 0f00000000; // 0
mov.f32 %f12, %f11;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, %f13;
mov.f32 %f15, 0f00000000; // 0
mov.f32 %f16, %f15;
// ii (%r8) = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom);
// threads with ii >= inum skip straight to exit.
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
cvt.s32.u32 %r6, %ctaid.x;
mul.lo.s32 %r7, %r6, %r5;
add.s32 %r8, %r3, %r7;
ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];
setp.ge.s32 %p1, %r8, %r9;
@%p1 bra $Lt_0_28930;
.loc 16 51 0
// %rd7 = &dev_nbor[ii]; %rd8 = one pitch further; %r11 = count read there.
// %r13 = tid.x & (t_per_atom-1) is this thread's offset within its group.
ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];
cvt.s64.s32 %rd2, %r10;
mul.wide.s32 %rd3, %r10, 4;
cvt.s64.s32 %rd4, %r8;
mul.wide.s32 %rd5, %r8, 4;
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
add.u64 %rd7, %rd5, %rd6;
add.u64 %rd8, %rd3, %rd7;
ld.global.s32 %r11, [%rd8+0];
sub.s32 %r12, %r1, 1;
and.b32 %r13, %r12, %r2;
cvt.s64.s32 %rd9, %r13;
mul.wide.s32 %rd10, %r13, 4;
ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];
setp.ne.u64 %p2, %rd11, %rd6;
@%p2 bra $Lt_0_20994;
// dev_packed == dev_nbor: entries live in dev_nbor itself.
// stride (%r16) = nbor_pitch * t_per_atom elements;
// %rd20 = first entry for this thread, %rd19 = end-of-list address.
cvt.s32.s64 %r14, %rd2;
mul.lo.s32 %r15, %r14, %r1;
mov.s32 %r16, %r15;
mul.lo.s32 %r17, %r12, %r8;
add.s32 %r18, %r14, %r17;
cvt.s64.s32 %rd12, %r18;
mul.wide.s32 %rd13, %r18, 4;
add.u64 %rd14, %rd8, %rd13;
and.b32 %r19, %r12, %r11;
cvt.s64.s32 %rd15, %r19;
div.s32 %r20, %r11, %r1;
mul.lo.s32 %r21, %r15, %r20;
cvt.s64.s32 %rd16, %r21;
add.u64 %rd17, %rd15, %rd16;
mul.lo.u64 %rd18, %rd17, 4;
add.u64 %rd19, %rd14, %rd18;
add.u64 %rd20, %rd10, %rd14;
bra.uni $Lt_0_20738;
$Lt_0_20994:
// dev_packed != dev_nbor: the word two pitches past &dev_nbor[ii] is an
// element offset into dev_packed; stride (%r16) = t_per_atom elements.
add.u64 %rd21, %rd3, %rd8;
ld.global.s32 %r22, [%rd21+0];
cvt.s64.s32 %rd22, %r22;
mul.wide.s32 %rd23, %r22, 4;
add.u64 %rd24, %rd11, %rd23;
cvt.s64.s32 %rd25, %r11;
mul.wide.s32 %rd26, %r11, 4;
add.u64 %rd19, %rd24, %rd26;
mov.s32 %r16, %r1;
add.u64 %rd20, %rd10, %rd24;
$Lt_0_20738:
.loc 16 54 0
// Fetch the 4-float record for index dev_nbor[ii] from pos_tex
// into %f21..%f24 (components 0..3).
ld.global.s32 %r23, [%rd7+0];
mov.u32 %r24, %r23;
mov.s32 %r25, 0;
mov.u32 %r26, %r25;
mov.s32 %r27, 0;
mov.u32 %r28, %r27;
mov.s32 %r29, 0;
mov.u32 %r30, %r29;
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];
mov.f32 %f21, %f17;
mov.f32 %f22, %f18;
mov.f32 %f23, %f19;
mov.f32 %f24, %f20;
// No entries for this thread: go zero the accumulators and skip the loop.
setp.ge.u64 %p3, %rd20, %rd19;
@%p3 bra $Lt_0_30466;
// %r31 = trunc(component 3 of the i record), presumably the atom type --
// confirm; %r33 = lj_types * %r31 is the table row base.
// %f27/%f26/%f25 accumulate force components 0/1/2, %f28 the energy.
cvt.rzi.ftz.s32.f32 %r31, %f24;
cvt.s64.s32 %rd27, %r16;
ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];
mul.lo.s32 %r33, %r32, %r31;
ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;
$Lt_0_21762:
//<loop> Loop body line 54, nesting depth: 1, estimated iterations: unknown
.loc 16 60 0
ld.global.s32 %r34, [%rd20+0];
.loc 16 61 0
// Bits 30..31 of the entry select one of the four sp_lj factors (%f29).
shr.s32 %r35, %r34, 30;
and.b32 %r36, %r35, 3;
cvt.s64.s32 %rd30, %r36;
mul.wide.s32 %rd31, %r36, 4;
add.u64 %rd32, %rd29, %rd31;
ld.shared.f32 %f29, [%rd32+0];
.loc 16 64 0
// Low 30 bits (mask 0x3FFFFFFF) are the index j used for the pos_tex fetch.
and.b32 %r37, %r34, 1073741823;
mov.u32 %r38, %r37;
mov.s32 %r39, 0;
mov.u32 %r40, %r39;
mov.s32 %r41, 0;
mov.u32 %r42, %r41;
mov.s32 %r43, 0;
mov.u32 %r44, %r43;
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];
mov.f32 %f34, %f30;
mov.f32 %f35, %f31;
mov.f32 %f36, %f32;
mov.f32 %f37, %f33;
// %f39/%f38/%f40 = i - j for components 0/1/2; %f43 = squared distance.
cvt.rzi.ftz.s32.f32 %r45, %f37;
sub.ftz.f32 %f38, %f22, %f35;
sub.ftz.f32 %f39, %f21, %f34;
sub.ftz.f32 %f40, %f23, %f36;
mul.ftz.f32 %f41, %f38, %f38;
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
// %rd35 = &lj1[lj_types * w_i + w_j]; skip the pair unless lj1[..].x > r^2.
add.s32 %r46, %r45, %r33;
cvt.s64.s32 %rd33, %r46;
mul.wide.s32 %rd34, %r46, 16;
add.u64 %rd35, %rd34, %rd28;
ld.global.f32 %f44, [%rd35+0];
setp.gt.ftz.f32 %p4, %f44, %f43;
@!%p4 bra $Lt_0_24066;
// %f45 = approximate 1/r^2. lj1[..].y picks how %f49 and %f50 are built:
//   == 2.0 : %f49 = (1/r^2)^2,  %f50 = (1/r^2)^4
//   == 1.0 : %f50 = (1/r^2)^1.5, %f49 = %f50^2
//   else   : %f49 = %f50 = (1/r^2)^3
rcp.approx.ftz.f32 %f45, %f43;
ld.global.f32 %f46, [%rd35+4];
mov.f32 %f47, 0f40000000; // 2
setp.eq.ftz.f32 %p5, %f46, %f47;
@!%p5 bra $Lt_0_22786;
.loc 16 79 0
mul.ftz.f32 %f48, %f45, %f45;
mov.f32 %f49, %f48;
.loc 16 80 0
mul.ftz.f32 %f50, %f48, %f48;
bra.uni $Lt_0_23042;
$Lt_0_22786:
mov.f32 %f51, 0f3f800000; // 1
setp.eq.ftz.f32 %p6, %f46, %f51;
@!%p6 bra $Lt_0_23298;
.loc 16 82 0
sqrt.approx.ftz.f32 %f52, %f45;
mul.ftz.f32 %f53, %f45, %f52;
mov.f32 %f50, %f53;
.loc 16 83 0
mul.ftz.f32 %f49, %f53, %f53;
bra.uni $Lt_0_23042;
$Lt_0_23298:
.loc 16 85 0
mul.ftz.f32 %f54, %f45, %f45;
mul.ftz.f32 %f55, %f45, %f54;
mov.f32 %f49, %f55;
.loc 16 86 0
mov.f32 %f50, %f55;
$Lt_0_23042:
$Lt_0_22530:
.loc 16 88 0
// force (%f62) = factor * (1/r^2) * %f49 * (lj1.z * %f50 - lj1.w)
mul.ftz.f32 %f56, %f45, %f29;
mul.ftz.f32 %f57, %f49, %f56;
ld.global.v2.f32 {%f58,%f59}, [%rd35+8];
mul.ftz.f32 %f60, %f58, %f50;
sub.ftz.f32 %f61, %f60, %f59;
mul.ftz.f32 %f62, %f57, %f61;
.loc 16 90 0
fma.rn.ftz.f32 %f27, %f39, %f62, %f27;
.loc 16 91 0
fma.rn.ftz.f32 %f26, %f38, %f62, %f26;
.loc 16 92 0
fma.rn.ftz.f32 %f25, %f40, %f62, %f25;
ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];
mov.u32 %r48, 0;
setp.le.s32 %p7, %r47, %r48;
@%p7 bra $Lt_0_23554;
.loc 16 94 0
// energy += factor * %f49 * (lj3.x * %f50 - lj3.y) - lj3.z
ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];
add.u64 %rd37, %rd36, %rd34;
ld.global.v4.f32 {%f63,%f64,%f65,_}, [%rd37+0];
mul.ftz.f32 %f66, %f29, %f49;
mul.ftz.f32 %f67, %f63, %f50;
sub.ftz.f32 %f68, %f67, %f64;
mul.ftz.f32 %f69, %f66, %f68;
sub.ftz.f32 %f70, %f69, %f65;
add.ftz.f32 %f28, %f28, %f70;
$Lt_0_23554:
ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];
mov.u32 %r50, 0;
setp.le.s32 %p8, %r49, %r50;
@%p8 bra $Lt_0_24066;
.loc 16 97 0
// Virial terms: force times the products d0*d0, d1*d1, d2*d2, d0*d1,
// d0*d2, d1*d2 of the difference components.
mov.f32 %f71, %f6;
mul.ftz.f32 %f72, %f39, %f39;
fma.rn.ftz.f32 %f73, %f62, %f72, %f71;
mov.f32 %f6, %f73;
.loc 16 98 0
mov.f32 %f74, %f8;
fma.rn.ftz.f32 %f75, %f62, %f41, %f74;
mov.f32 %f8, %f75;
.loc 16 99 0
mov.f32 %f76, %f10;
mul.ftz.f32 %f77, %f40, %f40;
fma.rn.ftz.f32 %f78, %f62, %f77, %f76;
mov.f32 %f10, %f78;
.loc 16 100 0
mov.f32 %f79, %f12;
mul.ftz.f32 %f80, %f38, %f39;
fma.rn.ftz.f32 %f81, %f62, %f80, %f79;
mov.f32 %f12, %f81;
.loc 16 101 0
mov.f32 %f82, %f14;
mul.ftz.f32 %f83, %f39, %f40;
fma.rn.ftz.f32 %f84, %f62, %f83, %f82;
mov.f32 %f14, %f84;
.loc 16 102 0
mul.ftz.f32 %f85, %f38, %f40;
fma.rn.ftz.f32 %f15, %f62, %f85, %f15;
mov.f32 %f16, %f15;
$Lt_0_24066:
$Lt_0_22018:
.loc 16 58 0
// Advance by the stride (in bytes) and loop while below the list end.
mul.lo.u64 %rd38, %rd27, 4;
add.u64 %rd20, %rd20, %rd38;
setp.lt.u64 %p9, %rd20, %rd19;
@%p9 bra $Lt_0_21762;
bra.uni $Lt_0_21250;
$Lt_0_30466:
// Reached when this thread had no entries: accumulators start at zero.
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
$Lt_0_21250:
// With t_per_atom <= 1 there is nothing to combine.
mov.u32 %r51, 1;
setp.le.s32 %p10, %r1, %r51;
@%p10 bra $Lt_0_26882;
.loc 16 107 0
// Stage force (rows 0..2) and energy (row 3) in red_acc at column tid.x.
mov.u64 %rd39, __cuda___cuda_local_var_32608_55_non_const_red_acc108;
cvt.s64.s32 %rd40, %r2;
mul.wide.s32 %rd41, %r2, 4;
add.u64 %rd42, %rd39, %rd41;
mov.f32 %f86, %f27;
st.shared.f32 [%rd42+0], %f86;
mov.f32 %f87, %f26;
st.shared.f32 [%rd42+512], %f87;
mov.f32 %f88, %f25;
st.shared.f32 [%rd42+1024], %f88;
mov.f32 %f89, %f28;
st.shared.f32 [%rd42+1536], %f89;
// %r56 = t_per_atom / 2 (signed divide-by-two sequence).
shr.s32 %r52, %r1, 31;
mov.s32 %r53, 1;
and.b32 %r54, %r52, %r53;
add.s32 %r55, %r54, %r1;
shr.s32 %r56, %r55, 1;
mov.s32 %r57, %r56;
mov.u32 %r58, 0;
setp.ne.u32 %p11, %r56, %r58;
@!%p11 bra $Lt_0_25346;
$Lt_0_25858:
// Halving steps: threads with in-group offset < s add column tid.x + s
// into their own column. There is no barrier between steps.
setp.ge.u32 %p12, %r13, %r57;
@%p12 bra $Lt_0_26114;
add.u32 %r59, %r2, %r57;
cvt.u64.u32 %rd43, %r59;
mul.wide.u32 %rd44, %r59, 4;
add.u64 %rd45, %rd39, %rd44;
ld.shared.f32 %f90, [%rd45+0];
add.ftz.f32 %f86, %f90, %f86;
st.shared.f32 [%rd42+0], %f86;
ld.shared.f32 %f91, [%rd45+512];
add.ftz.f32 %f87, %f91, %f87;
st.shared.f32 [%rd42+512], %f87;
ld.shared.f32 %f92, [%rd45+1024];
add.ftz.f32 %f88, %f92, %f88;
st.shared.f32 [%rd42+1024], %f88;
ld.shared.f32 %f93, [%rd45+1536];
add.ftz.f32 %f89, %f93, %f89;
st.shared.f32 [%rd42+1536], %f89;
$Lt_0_26114:
shr.u32 %r57, %r57, 1;
mov.u32 %r60, 0;
setp.ne.u32 %p13, %r57, %r60;
@%p13 bra $Lt_0_25858;
$Lt_0_25346:
mov.f32 %f27, %f86;
mov.f32 %f26, %f87;
mov.f32 %f25, %f88;
mov.f32 %f28, %f89;
ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];
mov.u32 %r62, 0;
setp.le.s32 %p14, %r61, %r62;
@%p14 bra $Lt_0_26882;
// Same reduction again for the six virial terms (rows 0..5 of red_acc).
mov.f32 %f86, %f6;
st.shared.f32 [%rd42+0], %f86;
mov.f32 %f87, %f8;
st.shared.f32 [%rd42+512], %f87;
mov.f32 %f88, %f10;
st.shared.f32 [%rd42+1024], %f88;
mov.f32 %f89, %f12;
st.shared.f32 [%rd42+1536], %f89;
mov.f32 %f94, %f14;
st.shared.f32 [%rd42+2048], %f94;
mov.f32 %f95, %f15;
st.shared.f32 [%rd42+2560], %f95;
mov.s32 %r63, %r56;
@!%p11 bra $Lt_0_27394;
$Lt_0_27906:
setp.ge.u32 %p15, %r13, %r63;
@%p15 bra $Lt_0_28162;
add.u32 %r64, %r2, %r63;
cvt.u64.u32 %rd46, %r64;
mul.wide.u32 %rd47, %r64, 4;
add.u64 %rd48, %rd39, %rd47;
ld.shared.f32 %f96, [%rd48+0];
add.ftz.f32 %f86, %f96, %f86;
st.shared.f32 [%rd42+0], %f86;
ld.shared.f32 %f97, [%rd48+512];
add.ftz.f32 %f87, %f97, %f87;
st.shared.f32 [%rd42+512], %f87;
ld.shared.f32 %f98, [%rd48+1024];
add.ftz.f32 %f88, %f98, %f88;
st.shared.f32 [%rd42+1024], %f88;
ld.shared.f32 %f99, [%rd48+1536];
add.ftz.f32 %f89, %f99, %f89;
st.shared.f32 [%rd42+1536], %f89;
ld.shared.f32 %f100, [%rd48+2048];
add.ftz.f32 %f94, %f100, %f94;
st.shared.f32 [%rd42+2048], %f94;
ld.shared.f32 %f101, [%rd48+2560];
add.ftz.f32 %f95, %f101, %f95;
st.shared.f32 [%rd42+2560], %f95;
$Lt_0_28162:
shr.u32 %r63, %r63, 1;
mov.u32 %r65, 0;
setp.ne.u32 %p16, %r63, %r65;
@%p16 bra $Lt_0_27906;
$Lt_0_27394:
mov.f32 %f6, %f86;
mov.f32 %f8, %f87;
mov.f32 %f10, %f88;
mov.f32 %f12, %f89;
mov.f32 %f14, %f94;
mov.f32 %f16, %f95;
$Lt_0_26882:
$Lt_0_24834:
// Only the thread with in-group offset 0 writes results.
mov.u32 %r66, 0;
setp.ne.s32 %p17, %r13, %r66;
@%p17 bra $Lt_0_28930;
// %rd50 = &engv[ii]; each stored quantity is followed by a step of
// inum floats to the next column.
ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];
add.u64 %rd50, %rd49, %rd5;
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
mov.u32 %r68, 0;
setp.le.s32 %p18, %r67, %r68;
@%p18 bra $Lt_0_29442;
st.global.f32 [%rd50+0], %f28;
cvt.s64.s32 %rd51, %r9;
mul.wide.s32 %rd52, %r9, 4;
add.u64 %rd50, %rd50, %rd52;
$Lt_0_29442:
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
mov.u32 %r70, 0;
setp.le.s32 %p19, %r69, %r70;
@%p19 bra $Lt_0_29954;
mov.f32 %f102, %f6;
st.global.f32 [%rd50+0], %f102;
cvt.s64.s32 %rd53, %r9;
mul.wide.s32 %rd54, %r9, 4;
add.u64 %rd55, %rd54, %rd50;
mov.f32 %f103, %f8;
st.global.f32 [%rd55+0], %f103;
add.u64 %rd56, %rd54, %rd55;
mov.f32 %f104, %f10;
st.global.f32 [%rd56+0], %f104;
add.u64 %rd57, %rd54, %rd56;
mov.f32 %f105, %f12;
st.global.f32 [%rd57+0], %f105;
add.u64 %rd50, %rd54, %rd57;
mov.f32 %f106, %f14;
st.global.f32 [%rd50+0], %f106;
mov.f32 %f107, %f16;
add.u64 %rd58, %rd54, %rd50;
st.global.f32 [%rd58+0], %f107;
$Lt_0_29954:
// ans[ii] = {force0, force1, force2, w}.
// NOTE(review): %f109 is never written anywhere in this kernel, so the
// fourth component stored here is an undefined value.
ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];
mul.lo.u64 %rd60, %rd4, 16;
add.u64 %rd61, %rd59, %rd60;
mov.f32 %f108, %f109;
st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f108};
$Lt_0_28930:
$Lt_0_20226:
.loc 16 110 0
exit;
$LDWend_kernel_pair:
} // kernel_pair
//-----------------------------------------------------------
// kernel_pair_fast
//   Compiler output (nvopencc, -O3) for an entry point in lal_cg_cmm.cu.
//   Same accumulation as kernel_pair, but the coefficient tables are first
//   copied into shared memory and looked up there.
//
// Params (as used by the code below):
//   x_            : not referenced in this kernel body
//   lj1_in        : global table; threads 0..120 each copy one 16-byte
//                   record into shared lj1 (121 records, 1936 bytes)
//   lj3_in        : global table copied the same way into shared lj3, but
//                   only when eflag > 0
//   sp_lj_in      : four floats; threads 0..3 each copy one to shared sp_lj
//   dev_nbor      : neighbor bookkeeping array (row pitch = nbor_pitch)
//   dev_packed    : if equal to dev_nbor the neighbor entries are read
//                   strided from dev_nbor, otherwise from dev_packed
//   ans           : per-row 16-byte output (three force components + w)
//   engv          : energy / virial outputs, one column of inum floats each
//   eflag, vflag  : energy / virial terms are computed and stored when > 0
//   inum          : number of rows; threads with ii >= inum exit
//   nbor_pitch    : row pitch of dev_nbor, in 4-byte elements
//   t_per_atom    : threads cooperating on one row
//
// The table index is (11 * w_i + w_j), with the multiplier 11 hard-coded
// (11 * 11 = 121 shared records).
//
// NOTE(review): the table copy assumes the block has at least 121 threads,
// while red_acc rows are 512 bytes (128 floats) apart and indexed by
// tid.x, which assumes at most 128 -- confirm against the launch code.
// NOTE(review): the only barrier is the bar.sync after the table copy; the
// shared-memory reductions have none between steps. Presumably each group
// of t_per_atom threads lies within one warp -- confirm.
//-----------------------------------------------------------
.entry kernel_pair_fast (
.param .u64 __cudaparm_kernel_pair_fast_x_,
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
.param .u64 __cudaparm_kernel_pair_fast_ans,
.param .u64 __cudaparm_kernel_pair_fast___val_paramengv,
.param .s32 __cudaparm_kernel_pair_fast_eflag,
.param .s32 __cudaparm_kernel_pair_fast_vflag,
.param .s32 __cudaparm_kernel_pair_fast_inum,
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
{
.reg .u32 %r<74>;
.reg .u64 %rd<75>;
.reg .f32 %f<118>;
.reg .pred %p<24>;
// sp_lj: 4 floats; lj1, lj3: 121 16-byte records each;
// red_acc: 6 rows x 128 floats used by the reductions.
.shared .align 4 .b8 __cuda___cuda_local_var_32625_33_non_const_sp_lj3268[16];
.shared .align 16 .b8 __cuda___cuda_local_var_32623_34_non_const_lj13296[1936];
.shared .align 16 .b8 __cuda___cuda_local_var_32624_34_non_const_lj35232[1936];
.shared .align 4 .b8 __cuda___cuda_local_var_32702_55_non_const_red_acc7168[3072];
// __cuda_local_var_32635_10_non_const_f = 48
// __cuda_local_var_32637_9_non_const_virial = 16
.loc 16 118 0
$LDWbegin_kernel_pair_fast:
// Threads with tid.x <= 3 copy sp_lj_in[tid.x] to shared sp_lj[tid.x].
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 3;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_22530;
.loc 16 126 0
mov.u64 %rd1, __cuda___cuda_local_var_32625_33_non_const_sp_lj3268;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd3, %r1, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_22530:
// The shared-array addresses are re-materialized after each join so that
// %rd1, %rd7 and %rd13 are defined on every path.
mov.u64 %rd1, __cuda___cuda_local_var_32625_33_non_const_sp_lj3268;
// Threads with tid.x <= 120 copy one 16-byte lj1 record (and, when
// eflag > 0, one lj3 record) from global to shared memory.
mov.u32 %r3, 120;
setp.gt.s32 %p2, %r1, %r3;
@%p2 bra $Lt_1_23042;
.loc 16 128 0
mov.u64 %rd7, __cuda___cuda_local_var_32623_34_non_const_lj13296;
cvt.s64.s32 %rd8, %r1;
mul.wide.s32 %rd9, %r1, 16;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
add.u64 %rd11, %rd10, %rd9;
add.u64 %rd12, %rd9, %rd7;
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r5, 0;
setp.le.s32 %p3, %r4, %r5;
@%p3 bra $Lt_1_23554;
.loc 16 130 0
mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
add.u64 %rd15, %rd14, %rd9;
add.u64 %rd16, %rd9, %rd13;
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
$Lt_1_23554:
mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;
$Lt_1_23042:
mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;
mov.u64 %rd7, __cuda___cuda_local_var_32623_34_non_const_lj13296;
.loc 16 138 0
// Zero the six virial accumulators (%f11,%f13,%f15,%f17,%f19,%f21; %f20
// is the working copy of the sixth).
mov.f32 %f10, 0f00000000; // 0
mov.f32 %f11, %f10;
mov.f32 %f12, 0f00000000; // 0
mov.f32 %f13, %f12;
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, %f14;
mov.f32 %f16, 0f00000000; // 0
mov.f32 %f17, %f16;
mov.f32 %f18, 0f00000000; // 0
mov.f32 %f19, %f18;
mov.f32 %f20, 0f00000000; // 0
mov.f32 %f21, %f20;
.loc 16 140 0
// Barrier after the shared-table copy; it is executed before the
// ii >= inum early exit below, so every thread reaches it.
bar.sync 0;
// ii (%r12) = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom);
// threads with ii >= inum skip straight to exit.
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
div.s32 %r7, %r1, %r6;
cvt.s32.u32 %r8, %ntid.x;
div.s32 %r9, %r8, %r6;
cvt.s32.u32 %r10, %ctaid.x;
mul.lo.s32 %r11, %r10, %r9;
add.s32 %r12, %r7, %r11;
ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];
setp.ge.s32 %p4, %r12, %r13;
@%p4 bra $Lt_1_32770;
.loc 16 145 0
// %rd22 = &dev_nbor[ii]; %rd23 = one pitch further; %r15 = count read
// there. %r17 = tid.x & (t_per_atom-1) is this thread's in-group offset.
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];
cvt.s64.s32 %rd17, %r14;
mul.wide.s32 %rd18, %r14, 4;
cvt.s64.s32 %rd19, %r12;
mul.wide.s32 %rd20, %r12, 4;
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
add.u64 %rd22, %rd20, %rd21;
add.u64 %rd23, %rd18, %rd22;
ld.global.s32 %r15, [%rd23+0];
sub.s32 %r16, %r6, 1;
and.b32 %r17, %r16, %r1;
cvt.s64.s32 %rd24, %r17;
mul.wide.s32 %rd25, %r17, 4;
ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];
setp.ne.u64 %p5, %rd26, %rd21;
@%p5 bra $Lt_1_24834;
// dev_packed == dev_nbor: entries live in dev_nbor itself.
// stride (%r20) = nbor_pitch * t_per_atom elements;
// %rd35 = first entry for this thread, %rd34 = end-of-list address.
cvt.s32.s64 %r18, %rd17;
mul.lo.s32 %r19, %r18, %r6;
mov.s32 %r20, %r19;
mul.lo.s32 %r21, %r16, %r12;
add.s32 %r22, %r18, %r21;
cvt.s64.s32 %rd27, %r22;
mul.wide.s32 %rd28, %r22, 4;
add.u64 %rd29, %rd23, %rd28;
and.b32 %r23, %r16, %r15;
cvt.s64.s32 %rd30, %r23;
div.s32 %r24, %r15, %r6;
mul.lo.s32 %r25, %r19, %r24;
cvt.s64.s32 %rd31, %r25;
add.u64 %rd32, %rd30, %rd31;
mul.lo.u64 %rd33, %rd32, 4;
add.u64 %rd34, %rd29, %rd33;
add.u64 %rd35, %rd25, %rd29;
bra.uni $Lt_1_24578;
$Lt_1_24834:
// dev_packed != dev_nbor: the word two pitches past &dev_nbor[ii] is an
// element offset into dev_packed; stride (%r20) = t_per_atom elements.
add.u64 %rd36, %rd18, %rd23;
ld.global.s32 %r26, [%rd36+0];
cvt.s64.s32 %rd37, %r26;
mul.wide.s32 %rd38, %r26, 4;
add.u64 %rd39, %rd26, %rd38;
cvt.s64.s32 %rd40, %r15;
mul.wide.s32 %rd41, %r15, 4;
add.u64 %rd34, %rd39, %rd41;
mov.s32 %r20, %r6;
add.u64 %rd35, %rd25, %rd39;
$Lt_1_24578:
.loc 16 148 0
// Fetch the 4-float record for index dev_nbor[ii] from pos_tex
// into %f26..%f29 (components 0..3).
ld.global.s32 %r27, [%rd22+0];
mov.u32 %r28, %r27;
mov.s32 %r29, 0;
mov.u32 %r30, %r29;
mov.s32 %r31, 0;
mov.u32 %r32, %r31;
mov.s32 %r33, 0;
mov.u32 %r34, %r33;
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];
mov.f32 %f26, %f22;
mov.f32 %f27, %f23;
mov.f32 %f28, %f24;
mov.f32 %f29, %f25;
// No entries for this thread: go zero the accumulators and skip the loop.
setp.ge.u64 %p6, %rd35, %rd34;
@%p6 bra $Lt_1_34306;
// %f30 = (float)(11 * trunc(component 3 of the i record)); the table
// index is later formed in floating point as trunc(%f30 + w_j).
// %f33/%f32/%f31 accumulate force components 0/1/2, %f34 the energy.
cvt.rzi.ftz.s32.f32 %r35, %f29;
cvt.s64.s32 %rd42, %r20;
mul.lo.s32 %r36, %r35, 11;
cvt.rn.f32.s32 %f30, %r36;
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_25602:
//<loop> Loop body line 148, nesting depth: 1, estimated iterations: unknown
.loc 16 155 0
ld.global.s32 %r37, [%rd35+0];
.loc 16 156 0
// Bits 30..31 of the entry select one of the four sp_lj factors (%f35).
shr.s32 %r38, %r37, 30;
and.b32 %r39, %r38, 3;
cvt.s64.s32 %rd43, %r39;
mul.wide.s32 %rd44, %r39, 4;
add.u64 %rd45, %rd1, %rd44;
ld.shared.f32 %f35, [%rd45+0];
.loc 16 159 0
// Low 30 bits (mask 0x3FFFFFFF) are the index j used for the pos_tex fetch.
and.b32 %r40, %r37, 1073741823;
mov.u32 %r41, %r40;
mov.s32 %r42, 0;
mov.u32 %r43, %r42;
mov.s32 %r44, 0;
mov.u32 %r45, %r44;
mov.s32 %r46, 0;
mov.u32 %r47, %r46;
tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}];
mov.f32 %f40, %f36;
mov.f32 %f41, %f37;
mov.f32 %f42, %f38;
mov.f32 %f43, %f39;
// %f45/%f44/%f46 = i - j for components 0/1/2; %f49 = squared distance.
sub.ftz.f32 %f44, %f27, %f41;
sub.ftz.f32 %f45, %f26, %f40;
sub.ftz.f32 %f46, %f28, %f42;
mul.ftz.f32 %f47, %f44, %f44;
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
fma.rn.ftz.f32 %f49, %f46, %f46, %f48;
// %rd48 = &lj1_shared[trunc(%f30 + w_j)]; skip the pair unless
// lj1[..].x > r^2.
add.ftz.f32 %f50, %f30, %f43;
cvt.rzi.ftz.s32.f32 %r48, %f50;
cvt.s64.s32 %rd46, %r48;
mul.wide.s32 %rd47, %r48, 16;
add.u64 %rd48, %rd47, %rd7;
ld.shared.f32 %f51, [%rd48+0];
setp.gt.ftz.f32 %p7, %f51, %f49;
@!%p7 bra $Lt_1_27906;
// %f52 = approximate 1/r^2. lj1[..].y picks how %f56 and %f57 are built:
//   == 2.0 : %f56 = (1/r^2)^2,  %f57 = (1/r^2)^4
//   == 1.0 : %f57 = (1/r^2)^1.5, %f56 = %f57^2
//   else   : %f56 = %f57 = (1/r^2)^3
rcp.approx.ftz.f32 %f52, %f49;
ld.shared.f32 %f53, [%rd48+4];
mov.f32 %f54, 0f40000000; // 2
setp.eq.ftz.f32 %p8, %f53, %f54;
@!%p8 bra $Lt_1_26626;
.loc 16 173 0
mul.ftz.f32 %f55, %f52, %f52;
mov.f32 %f56, %f55;
.loc 16 174 0
mul.ftz.f32 %f57, %f55, %f55;
bra.uni $Lt_1_26882;
$Lt_1_26626:
mov.f32 %f58, 0f3f800000; // 1
setp.eq.ftz.f32 %p9, %f53, %f58;
@!%p9 bra $Lt_1_27138;
.loc 16 176 0
sqrt.approx.ftz.f32 %f59, %f52;
mul.ftz.f32 %f60, %f52, %f59;
mov.f32 %f57, %f60;
.loc 16 177 0
mul.ftz.f32 %f56, %f60, %f60;
bra.uni $Lt_1_26882;
$Lt_1_27138:
.loc 16 179 0
mul.ftz.f32 %f61, %f52, %f52;
mul.ftz.f32 %f62, %f52, %f61;
mov.f32 %f56, %f62;
.loc 16 180 0
mov.f32 %f57, %f62;
$Lt_1_26882:
$Lt_1_26370:
.loc 16 182 0
// force (%f69) = factor * (1/r^2) * %f56 * (lj1.z * %f57 - lj1.w)
mul.ftz.f32 %f63, %f52, %f35;
mul.ftz.f32 %f64, %f56, %f63;
ld.shared.v2.f32 {%f65,%f66}, [%rd48+8];
mul.ftz.f32 %f67, %f65, %f57;
sub.ftz.f32 %f68, %f67, %f66;
mul.ftz.f32 %f69, %f64, %f68;
.loc 16 184 0
fma.rn.ftz.f32 %f33, %f45, %f69, %f33;
.loc 16 185 0
fma.rn.ftz.f32 %f32, %f44, %f69, %f32;
.loc 16 186 0
fma.rn.ftz.f32 %f31, %f46, %f69, %f31;
ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r50, 0;
setp.le.s32 %p10, %r49, %r50;
@%p10 bra $Lt_1_27394;
.loc 16 188 0
// energy += factor * %f56 * (lj3.x * %f57 - lj3.y) - lj3.z
add.u64 %rd49, %rd47, %rd13;
ld.shared.v4.f32 {%f70,%f71,%f72,_}, [%rd49+0];
mul.ftz.f32 %f73, %f35, %f56;
mul.ftz.f32 %f74, %f70, %f57;
sub.ftz.f32 %f75, %f74, %f71;
mul.ftz.f32 %f76, %f73, %f75;
sub.ftz.f32 %f77, %f76, %f72;
add.ftz.f32 %f34, %f34, %f77;
$Lt_1_27394:
ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r52, 0;
setp.le.s32 %p11, %r51, %r52;
@%p11 bra $Lt_1_27906;
.loc 16 191 0
// Virial terms: force times the products d0*d0, d1*d1, d2*d2, d0*d1,
// d0*d2, d1*d2 of the difference components.
mov.f32 %f78, %f11;
mul.ftz.f32 %f79, %f45, %f45;
fma.rn.ftz.f32 %f80, %f69, %f79, %f78;
mov.f32 %f11, %f80;
.loc 16 192 0
mov.f32 %f81, %f13;
fma.rn.ftz.f32 %f82, %f69, %f47, %f81;
mov.f32 %f13, %f82;
.loc 16 193 0
mov.f32 %f83, %f15;
mul.ftz.f32 %f84, %f46, %f46;
fma.rn.ftz.f32 %f85, %f69, %f84, %f83;
mov.f32 %f15, %f85;
.loc 16 194 0
mov.f32 %f86, %f17;
mul.ftz.f32 %f87, %f44, %f45;
fma.rn.ftz.f32 %f88, %f69, %f87, %f86;
mov.f32 %f17, %f88;
.loc 16 195 0
mov.f32 %f89, %f19;
mul.ftz.f32 %f90, %f45, %f46;
fma.rn.ftz.f32 %f91, %f69, %f90, %f89;
mov.f32 %f19, %f91;
.loc 16 196 0
mul.ftz.f32 %f92, %f44, %f46;
fma.rn.ftz.f32 %f20, %f69, %f92, %f20;
mov.f32 %f21, %f20;
$Lt_1_27906:
$Lt_1_25858:
.loc 16 153 0
// Advance by the stride (in bytes) and loop while below the list end.
mul.lo.u64 %rd50, %rd42, 4;
add.u64 %rd35, %rd35, %rd50;
setp.lt.u64 %p12, %rd35, %rd34;
@%p12 bra $Lt_1_25602;
bra.uni $Lt_1_25090;
$Lt_1_34306:
// Reached when this thread had no entries: accumulators start at zero.
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_25090:
// With t_per_atom <= 1 there is nothing to combine.
mov.u32 %r53, 1;
setp.le.s32 %p13, %r6, %r53;
@%p13 bra $Lt_1_30722;
.loc 16 201 0
// Stage force (rows 0..2) and energy (row 3) in red_acc at column tid.x.
mov.u64 %rd51, __cuda___cuda_local_var_32702_55_non_const_red_acc7168;
cvt.s64.s32 %rd52, %r1;
mul.wide.s32 %rd53, %r1, 4;
add.u64 %rd54, %rd51, %rd53;
mov.f32 %f93, %f33;
st.shared.f32 [%rd54+0], %f93;
mov.f32 %f94, %f32;
st.shared.f32 [%rd54+512], %f94;
mov.f32 %f95, %f31;
st.shared.f32 [%rd54+1024], %f95;
mov.f32 %f96, %f34;
st.shared.f32 [%rd54+1536], %f96;
// %r58 = t_per_atom / 2 (signed divide-by-two sequence).
shr.s32 %r54, %r6, 31;
mov.s32 %r55, 1;
and.b32 %r56, %r54, %r55;
add.s32 %r57, %r56, %r6;
shr.s32 %r58, %r57, 1;
mov.s32 %r59, %r58;
mov.u32 %r60, 0;
setp.ne.u32 %p14, %r58, %r60;
@!%p14 bra $Lt_1_29186;
$Lt_1_29698:
// Halving steps: threads with in-group offset < s add column tid.x + s
// into their own column. There is no barrier between steps.
setp.ge.u32 %p15, %r17, %r59;
@%p15 bra $Lt_1_29954;
add.u32 %r61, %r1, %r59;
cvt.u64.u32 %rd55, %r61;
mul.wide.u32 %rd56, %r61, 4;
add.u64 %rd57, %rd51, %rd56;
ld.shared.f32 %f97, [%rd57+0];
add.ftz.f32 %f93, %f97, %f93;
st.shared.f32 [%rd54+0], %f93;
ld.shared.f32 %f98, [%rd57+512];
add.ftz.f32 %f94, %f98, %f94;
st.shared.f32 [%rd54+512], %f94;
ld.shared.f32 %f99, [%rd57+1024];
add.ftz.f32 %f95, %f99, %f95;
st.shared.f32 [%rd54+1024], %f95;
ld.shared.f32 %f100, [%rd57+1536];
add.ftz.f32 %f96, %f100, %f96;
st.shared.f32 [%rd54+1536], %f96;
$Lt_1_29954:
shr.u32 %r59, %r59, 1;
mov.u32 %r62, 0;
setp.ne.u32 %p16, %r59, %r62;
@%p16 bra $Lt_1_29698;
$Lt_1_29186:
mov.f32 %f33, %f93;
mov.f32 %f32, %f94;
mov.f32 %f31, %f95;
mov.f32 %f34, %f96;
ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r64, 0;
setp.le.s32 %p17, %r63, %r64;
@%p17 bra $Lt_1_30722;
// Same reduction again for the six virial terms (rows 0..5 of red_acc).
mov.f32 %f93, %f11;
st.shared.f32 [%rd54+0], %f93;
mov.f32 %f94, %f13;
st.shared.f32 [%rd54+512], %f94;
mov.f32 %f95, %f15;
st.shared.f32 [%rd54+1024], %f95;
mov.f32 %f96, %f17;
st.shared.f32 [%rd54+1536], %f96;
mov.f32 %f101, %f19;
st.shared.f32 [%rd54+2048], %f101;
mov.f32 %f102, %f20;
st.shared.f32 [%rd54+2560], %f102;
mov.s32 %r65, %r58;
@!%p14 bra $Lt_1_31234;
$Lt_1_31746:
setp.ge.u32 %p18, %r17, %r65;
@%p18 bra $Lt_1_32002;
add.u32 %r66, %r1, %r65;
cvt.u64.u32 %rd58, %r66;
mul.wide.u32 %rd59, %r66, 4;
add.u64 %rd60, %rd51, %rd59;
ld.shared.f32 %f103, [%rd60+0];
add.ftz.f32 %f93, %f103, %f93;
st.shared.f32 [%rd54+0], %f93;
ld.shared.f32 %f104, [%rd60+512];
add.ftz.f32 %f94, %f104, %f94;
st.shared.f32 [%rd54+512], %f94;
ld.shared.f32 %f105, [%rd60+1024];
add.ftz.f32 %f95, %f105, %f95;
st.shared.f32 [%rd54+1024], %f95;
ld.shared.f32 %f106, [%rd60+1536];
add.ftz.f32 %f96, %f106, %f96;
st.shared.f32 [%rd54+1536], %f96;
ld.shared.f32 %f107, [%rd60+2048];
add.ftz.f32 %f101, %f107, %f101;
st.shared.f32 [%rd54+2048], %f101;
ld.shared.f32 %f108, [%rd60+2560];
add.ftz.f32 %f102, %f108, %f102;
st.shared.f32 [%rd54+2560], %f102;
$Lt_1_32002:
shr.u32 %r65, %r65, 1;
mov.u32 %r67, 0;
setp.ne.u32 %p19, %r65, %r67;
@%p19 bra $Lt_1_31746;
$Lt_1_31234:
mov.f32 %f11, %f93;
mov.f32 %f13, %f94;
mov.f32 %f15, %f95;
mov.f32 %f17, %f96;
mov.f32 %f19, %f101;
mov.f32 %f21, %f102;
$Lt_1_30722:
$Lt_1_28674:
// Only the thread with in-group offset 0 writes results.
mov.u32 %r68, 0;
setp.ne.s32 %p20, %r17, %r68;
@%p20 bra $Lt_1_32770;
// %rd62 = &engv[ii]; each stored quantity is followed by a step of
// inum floats to the next column.
ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];
add.u64 %rd62, %rd61, %rd20;
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r70, 0;
setp.le.s32 %p21, %r69, %r70;
@%p21 bra $Lt_1_33282;
st.global.f32 [%rd62+0], %f34;
cvt.s64.s32 %rd63, %r13;
mul.wide.s32 %rd64, %r13, 4;
add.u64 %rd62, %rd62, %rd64;
$Lt_1_33282:
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r72, 0;
setp.le.s32 %p22, %r71, %r72;
@%p22 bra $Lt_1_33794;
mov.f32 %f109, %f11;
st.global.f32 [%rd62+0], %f109;
cvt.s64.s32 %rd65, %r13;
mul.wide.s32 %rd66, %r13, 4;
add.u64 %rd67, %rd66, %rd62;
mov.f32 %f110, %f13;
st.global.f32 [%rd67+0], %f110;
add.u64 %rd68, %rd66, %rd67;
mov.f32 %f111, %f15;
st.global.f32 [%rd68+0], %f111;
add.u64 %rd69, %rd66, %rd68;
mov.f32 %f112, %f17;
st.global.f32 [%rd69+0], %f112;
add.u64 %rd62, %rd66, %rd69;
mov.f32 %f113, %f19;
st.global.f32 [%rd62+0], %f113;
mov.f32 %f114, %f21;
add.u64 %rd70, %rd66, %rd62;
st.global.f32 [%rd70+0], %f114;
$Lt_1_33794:
// ans[ii] = {force0, force1, force2, w}.
// NOTE(review): %f116 is never written anywhere in this kernel, so the
// fourth component stored here is an undefined value.
ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];
mul.lo.u64 %rd72, %rd19, 16;
add.u64 %rd73, %rd71, %rd72;
mov.f32 %f115, %f116;
st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f115};
$Lt_1_32770:
$Lt_1_24066:
.loc 16 204 0
exit;
$LDWend_kernel_pair_fast:
} // kernel_pair_fast

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,906 +0,0 @@
/*
 * cg_cmm holds the text of a PTX module as a single C string, written as a
 * run of adjacent string literals (one PTX source line each).  The module
 * targets sm_20 with 64-bit addresses, declares the texture reference
 * pos_tex, and defines two entry points: kernel_pair and kernel_pair_fast.
 *
 * NOTE(review): the .loc markers and $Lt_ labels suggest this text is
 * compiler output rather than hand-written PTX; if so it should be
 * regenerated, not edited by hand -- confirm against the build.
 */
const char * cg_cmm =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
" .global .texref pos_tex;\n"
/*
 * ---- kernel_pair: 14 parameters (eight .u64, six .s32).
 * The x_ parameter is declared but never loaded in this body; the
 * four-component values the kernel works on are fetched through pos_tex.
 */
" .entry kernel_pair (\n"
" .param .u64 __cudaparm_kernel_pair_x_,\n"
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_ans,\n"
" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n"
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_inum,\n"
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
" {\n"
/* Register pools, plus two shared arrays: 16 bytes for the sp_lj copy and
 * 3072 bytes used as scratch by the reductions near the end. */
" .reg .u32 %r<72>;\n"
" .reg .u64 %rd<63>;\n"
" .reg .f32 %f<111>;\n"
" .reg .pred %p<21>;\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32608_55_non_const_red_acc108[3072];\n"
" .loc 16 31 0\n"
"$LDWbegin_kernel_pair:\n"
/* Every thread (the stores are unpredicated) copies the four floats at
 * sp_lj_in into the shared sp_lj array. */
" .loc 16 36 0\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
" ldu.global.f32 %f1, [%rd1+0];\n"
" .loc 16 37 0\n"
" ld.global.f32 %f2, [%rd1+4];\n"
" .loc 16 38 0\n"
" ld.global.f32 %f3, [%rd1+8];\n"
" .loc 16 39 0\n"
" ld.global.f32 %f4, [%rd1+12];\n"
" st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n"
/* Zero the six accumulators that the vflag branch of the loop adds into
 * (%f6, %f8, %f10, %f12, %f14 and the %f15/%f16 pair). */
" .loc 16 46 0\n"
" mov.f32 %f5, 0f00000000; \n"
" mov.f32 %f6, %f5;\n"
" mov.f32 %f7, 0f00000000; \n"
" mov.f32 %f8, %f7;\n"
" mov.f32 %f9, 0f00000000; \n"
" mov.f32 %f10, %f9;\n"
" mov.f32 %f11, 0f00000000; \n"
" mov.f32 %f12, %f11;\n"
" mov.f32 %f13, 0f00000000; \n"
" mov.f32 %f14, %f13;\n"
" mov.f32 %f15, 0f00000000; \n"
" mov.f32 %f16, %f15;\n"
/* %r8 = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom).
 * Threads with %r8 >= inum branch straight to the exit label. */
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
" cvt.s32.u32 %r2, %tid.x;\n"
" div.s32 %r3, %r2, %r1;\n"
" cvt.s32.u32 %r4, %ntid.x;\n"
" div.s32 %r5, %r4, %r1;\n"
" cvt.s32.u32 %r6, %ctaid.x;\n"
" mul.lo.s32 %r7, %r6, %r5;\n"
" add.s32 %r8, %r3, %r7;\n"
" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n"
" setp.ge.s32 %p1, %r8, %r9;\n"
" @%p1 bra $Lt_0_28930;\n"
/* %rd7 = dev_nbor + 4*%r8 and %rd8 = %rd7 + 4*nbor_pitch; %r11 is the int
 * read from %rd8.  %r13 = tid.x & (t_per_atom - 1) -- presumably the
 * thread's position within its group when t_per_atom is a power of two;
 * TODO confirm. */
" .loc 16 51 0\n"
" ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];\n"
" cvt.s64.s32 %rd2, %r10;\n"
" mul.wide.s32 %rd3, %r10, 4;\n"
" cvt.s64.s32 %rd4, %r8;\n"
" mul.wide.s32 %rd5, %r8, 4;\n"
" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n"
" add.u64 %rd7, %rd5, %rd6;\n"
" add.u64 %rd8, %rd3, %rd7;\n"
" ld.global.s32 %r11, [%rd8+0];\n"
" sub.s32 %r12, %r1, 1;\n"
" and.b32 %r13, %r12, %r2;\n"
" cvt.s64.s32 %rd9, %r13;\n"
" mul.wide.s32 %rd10, %r13, 4;\n"
" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n"
" setp.ne.u64 %p2, %rd11, %rd6;\n"
" @%p2 bra $Lt_0_20994;\n"
/* dev_packed == dev_nbor: the stride %r16 is nbor_pitch * t_per_atom ints;
 * the cursor %rd20 and end pointer %rd19 are computed relative to %rd8. */
" cvt.s32.s64 %r14, %rd2;\n"
" mul.lo.s32 %r15, %r14, %r1;\n"
" mov.s32 %r16, %r15;\n"
" mul.lo.s32 %r17, %r12, %r8;\n"
" add.s32 %r18, %r14, %r17;\n"
" cvt.s64.s32 %rd12, %r18;\n"
" mul.wide.s32 %rd13, %r18, 4;\n"
" add.u64 %rd14, %rd8, %rd13;\n"
" and.b32 %r19, %r12, %r11;\n"
" cvt.s64.s32 %rd15, %r19;\n"
" div.s32 %r20, %r11, %r1;\n"
" mul.lo.s32 %r21, %r15, %r20;\n"
" cvt.s64.s32 %rd16, %r21;\n"
" add.u64 %rd17, %rd15, %rd16;\n"
" mul.lo.u64 %rd18, %rd17, 4;\n"
" add.u64 %rd19, %rd14, %rd18;\n"
" add.u64 %rd20, %rd10, %rd14;\n"
" bra.uni $Lt_0_20738;\n"
/* dev_packed != dev_nbor: the int one more pitch past %rd8 is an offset (in
 * ints) into dev_packed; the stride is t_per_atom ints and the end pointer
 * lies %r11 ints past that base. */
"$Lt_0_20994:\n"
" add.u64 %rd21, %rd3, %rd8;\n"
" ld.global.s32 %r22, [%rd21+0];\n"
" cvt.s64.s32 %rd22, %r22;\n"
" mul.wide.s32 %rd23, %r22, 4;\n"
" add.u64 %rd24, %rd11, %rd23;\n"
" cvt.s64.s32 %rd25, %r11;\n"
" mul.wide.s32 %rd26, %r11, 4;\n"
" add.u64 %rd19, %rd24, %rd26;\n"
" mov.s32 %r16, %r1;\n"
" add.u64 %rd20, %rd10, %rd24;\n"
"$Lt_0_20738:\n"
/* Fetch the four-component value for the index stored at %rd7 from pos_tex
 * into %f21..%f24.  If the cursor is already at or past the end pointer the
 * loop is skipped. */
" .loc 16 54 0\n"
" ld.global.s32 %r23, [%rd7+0];\n"
" mov.u32 %r24, %r23;\n"
" mov.s32 %r25, 0;\n"
" mov.u32 %r26, %r25;\n"
" mov.s32 %r27, 0;\n"
" mov.u32 %r28, %r27;\n"
" mov.s32 %r29, 0;\n"
" mov.u32 %r30, %r29;\n"
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n"
" mov.f32 %f21, %f17;\n"
" mov.f32 %f22, %f18;\n"
" mov.f32 %f23, %f19;\n"
" mov.f32 %f24, %f20;\n"
" setp.ge.u64 %p3, %rd20, %rd19;\n"
" @%p3 bra $Lt_0_30466;\n"
/* Loop setup: %r33 = lj_types * trunc(%f24); clear the running sums
 * %f25..%f28; %rd29 = address of the shared sp_lj array. */
" cvt.rzi.ftz.s32.f32 %r31, %f24;\n"
" cvt.s64.s32 %rd27, %r16;\n"
" ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];\n"
" mul.lo.s32 %r33, %r32, %r31;\n"
" ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;\n"
/* Per-entry loop.  Each list word %r34 carries a 2-bit selector in bits
 * 30-31 (index into shared sp_lj, giving the scale %f29) and a 30-bit index
 * in the low bits (used for the pos_tex fetch). */
"$Lt_0_21762:\n"
" .loc 16 60 0\n"
" ld.global.s32 %r34, [%rd20+0];\n"
" .loc 16 61 0\n"
" shr.s32 %r35, %r34, 30;\n"
" and.b32 %r36, %r35, 3;\n"
" cvt.s64.s32 %rd30, %r36;\n"
" mul.wide.s32 %rd31, %r36, 4;\n"
" add.u64 %rd32, %rd29, %rd31;\n"
" ld.shared.f32 %f29, [%rd32+0];\n"
" .loc 16 64 0\n"
" and.b32 %r37, %r34, 1073741823;\n"
" mov.u32 %r38, %r37;\n"
" mov.s32 %r39, 0;\n"
" mov.u32 %r40, %r39;\n"
" mov.s32 %r41, 0;\n"
" mov.u32 %r42, %r41;\n"
" mov.s32 %r43, 0;\n"
" mov.u32 %r44, %r43;\n"
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];\n"
" mov.f32 %f34, %f30;\n"
" mov.f32 %f35, %f31;\n"
" mov.f32 %f36, %f32;\n"
" mov.f32 %f37, %f33;\n"
/* %f39, %f38, %f40 are the differences of components 0, 1, 2 and %f43 is
 * the sum of their squares.  %r46 = trunc(component 3) + %r33 selects a
 * 16-byte entry of lj1. */
" cvt.rzi.ftz.s32.f32 %r45, %f37;\n"
" sub.ftz.f32 %f38, %f22, %f35;\n"
" sub.ftz.f32 %f39, %f21, %f34;\n"
" sub.ftz.f32 %f40, %f23, %f36;\n"
" mul.ftz.f32 %f41, %f38, %f38;\n"
" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n"
" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n"
" add.s32 %r46, %r45, %r33;\n"
" cvt.s64.s32 %rd33, %r46;\n"
" mul.wide.s32 %rd34, %r46, 16;\n"
" add.u64 %rd35, %rd34, %rd28;\n"
/* Only entries whose first lj1 float exceeds %f43 contribute. */
" ld.global.f32 %f44, [%rd35+0];\n"
" setp.gt.ftz.f32 %p4, %f44, %f43;\n"
" @!%p4 bra $Lt_0_24066;\n"
/* %f45 ~= 1/%f43.  The second lj1 float picks the powers used below:
 *   2.0       -> %f49 = %f45^2,  %f50 = %f45^4
 *   1.0       -> %f50 = %f45^1.5, %f49 = %f45^3
 *   otherwise -> %f49 = %f50 = %f45^3 */
" rcp.approx.ftz.f32 %f45, %f43;\n"
" ld.global.f32 %f46, [%rd35+4];\n"
" mov.f32 %f47, 0f40000000; \n"
" setp.eq.ftz.f32 %p5, %f46, %f47;\n"
" @!%p5 bra $Lt_0_22786;\n"
" .loc 16 79 0\n"
" mul.ftz.f32 %f48, %f45, %f45;\n"
" mov.f32 %f49, %f48;\n"
" .loc 16 80 0\n"
" mul.ftz.f32 %f50, %f48, %f48;\n"
" bra.uni $Lt_0_23042;\n"
"$Lt_0_22786:\n"
" mov.f32 %f51, 0f3f800000; \n"
" setp.eq.ftz.f32 %p6, %f46, %f51;\n"
" @!%p6 bra $Lt_0_23298;\n"
" .loc 16 82 0\n"
" sqrt.approx.ftz.f32 %f52, %f45;\n"
" mul.ftz.f32 %f53, %f45, %f52;\n"
" mov.f32 %f50, %f53;\n"
" .loc 16 83 0\n"
" mul.ftz.f32 %f49, %f53, %f53;\n"
" bra.uni $Lt_0_23042;\n"
"$Lt_0_23298:\n"
" .loc 16 85 0\n"
" mul.ftz.f32 %f54, %f45, %f45;\n"
" mul.ftz.f32 %f55, %f45, %f54;\n"
" mov.f32 %f49, %f55;\n"
" .loc 16 86 0\n"
" mov.f32 %f50, %f55;\n"
"$Lt_0_23042:\n"
"$Lt_0_22530:\n"
/* %f62 = %f29 * %f45 * %f49 * (lj1[2] * %f50 - lj1[3]); it scales the
 * three differences that are added into %f27, %f26 and %f25. */
" .loc 16 88 0\n"
" mul.ftz.f32 %f56, %f45, %f29;\n"
" mul.ftz.f32 %f57, %f49, %f56;\n"
" ld.global.v2.f32 {%f58,%f59}, [%rd35+8];\n"
" mul.ftz.f32 %f60, %f58, %f50;\n"
" sub.ftz.f32 %f61, %f60, %f59;\n"
" mul.ftz.f32 %f62, %f57, %f61;\n"
" .loc 16 90 0\n"
" fma.rn.ftz.f32 %f27, %f39, %f62, %f27;\n"
" .loc 16 91 0\n"
" fma.rn.ftz.f32 %f26, %f38, %f62, %f26;\n"
" .loc 16 92 0\n"
" fma.rn.ftz.f32 %f25, %f40, %f62, %f25;\n"
/* eflag > 0: %f28 += %f29 * %f49 * (lj3[0] * %f50 - lj3[1]) - lj3[2],
 * reading the lj3 entry at the same byte offset as the lj1 entry. */
" ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r48, 0;\n"
" setp.le.s32 %p7, %r47, %r48;\n"
" @%p7 bra $Lt_0_23554;\n"
" .loc 16 94 0\n"
" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];\n"
" add.u64 %rd37, %rd36, %rd34;\n"
" ld.global.v4.f32 {%f63,%f64,%f65,_}, [%rd37+0];\n"
" mul.ftz.f32 %f66, %f29, %f49;\n"
" mul.ftz.f32 %f67, %f63, %f50;\n"
" sub.ftz.f32 %f68, %f67, %f64;\n"
" mul.ftz.f32 %f69, %f66, %f68;\n"
" sub.ftz.f32 %f70, %f69, %f65;\n"
" add.ftz.f32 %f28, %f28, %f70;\n"
/* vflag > 0: accumulate %f62 times the six pairwise products of the
 * differences, in component order 00, 11, 22, 01, 02, 12. */
"$Lt_0_23554:\n"
" ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r50, 0;\n"
" setp.le.s32 %p8, %r49, %r50;\n"
" @%p8 bra $Lt_0_24066;\n"
" .loc 16 97 0\n"
" mov.f32 %f71, %f6;\n"
" mul.ftz.f32 %f72, %f39, %f39;\n"
" fma.rn.ftz.f32 %f73, %f62, %f72, %f71;\n"
" mov.f32 %f6, %f73;\n"
" .loc 16 98 0\n"
" mov.f32 %f74, %f8;\n"
" fma.rn.ftz.f32 %f75, %f62, %f41, %f74;\n"
" mov.f32 %f8, %f75;\n"
" .loc 16 99 0\n"
" mov.f32 %f76, %f10;\n"
" mul.ftz.f32 %f77, %f40, %f40;\n"
" fma.rn.ftz.f32 %f78, %f62, %f77, %f76;\n"
" mov.f32 %f10, %f78;\n"
" .loc 16 100 0\n"
" mov.f32 %f79, %f12;\n"
" mul.ftz.f32 %f80, %f38, %f39;\n"
" fma.rn.ftz.f32 %f81, %f62, %f80, %f79;\n"
" mov.f32 %f12, %f81;\n"
" .loc 16 101 0\n"
" mov.f32 %f82, %f14;\n"
" mul.ftz.f32 %f83, %f39, %f40;\n"
" fma.rn.ftz.f32 %f84, %f62, %f83, %f82;\n"
" mov.f32 %f14, %f84;\n"
" .loc 16 102 0\n"
" mul.ftz.f32 %f85, %f38, %f40;\n"
" fma.rn.ftz.f32 %f15, %f62, %f85, %f15;\n"
" mov.f32 %f16, %f15;\n"
"$Lt_0_24066:\n"
"$Lt_0_22018:\n"
/* Advance the cursor by the stride (%r16 ints) and loop while it is still
 * below the end pointer. */
" .loc 16 58 0\n"
" mul.lo.u64 %rd38, %rd27, 4;\n"
" add.u64 %rd20, %rd20, %rd38;\n"
" setp.lt.u64 %p9, %rd20, %rd19;\n"
" @%p9 bra $Lt_0_21762;\n"
" bra.uni $Lt_0_21250;\n"
/* Loop skipped entirely: the four running sums are simply zero. */
"$Lt_0_30466:\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
/* Cross-thread reduction, skipped when t_per_atom <= 1.  Each thread parks
 * its four sums in red_acc at index tid.x, with consecutive quantities 512
 * bytes apart. */
"$Lt_0_21250:\n"
" mov.u32 %r51, 1;\n"
" setp.le.s32 %p10, %r1, %r51;\n"
" @%p10 bra $Lt_0_26882;\n"
" .loc 16 107 0\n"
" mov.u64 %rd39, __cuda___cuda_local_var_32608_55_non_const_red_acc108;\n"
" cvt.s64.s32 %rd40, %r2;\n"
" mul.wide.s32 %rd41, %r2, 4;\n"
" add.u64 %rd42, %rd39, %rd41;\n"
" mov.f32 %f86, %f27;\n"
" st.shared.f32 [%rd42+0], %f86;\n"
" mov.f32 %f87, %f26;\n"
" st.shared.f32 [%rd42+512], %f87;\n"
" mov.f32 %f88, %f25;\n"
" st.shared.f32 [%rd42+1024], %f88;\n"
" mov.f32 %f89, %f28;\n"
" st.shared.f32 [%rd42+1536], %f89;\n"
/* %r56 = t_per_atom / 2 (signed, rounded toward zero) is the first offset. */
" shr.s32 %r52, %r1, 31;\n"
" mov.s32 %r53, 1;\n"
" and.b32 %r54, %r52, %r53;\n"
" add.s32 %r55, %r54, %r1;\n"
" shr.s32 %r56, %r55, 1;\n"
" mov.s32 %r57, %r56;\n"
" mov.u32 %r58, 0;\n"
" setp.ne.u32 %p11, %r56, %r58;\n"
" @!%p11 bra $Lt_0_25346;\n"
/* Halving loop: while the offset is non-zero, threads with %r13 < offset
 * add the slot `offset` entries above their own into their own slot, then
 * the offset is shifted right by one.
 * NOTE(review): there is no bar.sync between steps; presumably this relies
 * on the cooperating threads executing in lockstep -- confirm. */
"$Lt_0_25858:\n"
" setp.ge.u32 %p12, %r13, %r57;\n"
" @%p12 bra $Lt_0_26114;\n"
" add.u32 %r59, %r2, %r57;\n"
" cvt.u64.u32 %rd43, %r59;\n"
" mul.wide.u32 %rd44, %r59, 4;\n"
" add.u64 %rd45, %rd39, %rd44;\n"
" ld.shared.f32 %f90, [%rd45+0];\n"
" add.ftz.f32 %f86, %f90, %f86;\n"
" st.shared.f32 [%rd42+0], %f86;\n"
" ld.shared.f32 %f91, [%rd45+512];\n"
" add.ftz.f32 %f87, %f91, %f87;\n"
" st.shared.f32 [%rd42+512], %f87;\n"
" ld.shared.f32 %f92, [%rd45+1024];\n"
" add.ftz.f32 %f88, %f92, %f88;\n"
" st.shared.f32 [%rd42+1024], %f88;\n"
" ld.shared.f32 %f93, [%rd45+1536];\n"
" add.ftz.f32 %f89, %f93, %f89;\n"
" st.shared.f32 [%rd42+1536], %f89;\n"
"$Lt_0_26114:\n"
" shr.u32 %r57, %r57, 1;\n"
" mov.u32 %r60, 0;\n"
" setp.ne.u32 %p13, %r57, %r60;\n"
" @%p13 bra $Lt_0_25858;\n"
"$Lt_0_25346:\n"
" mov.f32 %f27, %f86;\n"
" mov.f32 %f26, %f87;\n"
" mov.f32 %f25, %f88;\n"
" mov.f32 %f28, %f89;\n"
/* vflag > 0: run the same reduction again for the six vflag sums (slots at
 * byte offsets 0..2560 in 512-byte steps). */
" ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r62, 0;\n"
" setp.le.s32 %p14, %r61, %r62;\n"
" @%p14 bra $Lt_0_26882;\n"
" mov.f32 %f86, %f6;\n"
" st.shared.f32 [%rd42+0], %f86;\n"
" mov.f32 %f87, %f8;\n"
" st.shared.f32 [%rd42+512], %f87;\n"
" mov.f32 %f88, %f10;\n"
" st.shared.f32 [%rd42+1024], %f88;\n"
" mov.f32 %f89, %f12;\n"
" st.shared.f32 [%rd42+1536], %f89;\n"
" mov.f32 %f94, %f14;\n"
" st.shared.f32 [%rd42+2048], %f94;\n"
" mov.f32 %f95, %f15;\n"
" st.shared.f32 [%rd42+2560], %f95;\n"
" mov.s32 %r63, %r56;\n"
" @!%p11 bra $Lt_0_27394;\n"
"$Lt_0_27906:\n"
" setp.ge.u32 %p15, %r13, %r63;\n"
" @%p15 bra $Lt_0_28162;\n"
" add.u32 %r64, %r2, %r63;\n"
" cvt.u64.u32 %rd46, %r64;\n"
" mul.wide.u32 %rd47, %r64, 4;\n"
" add.u64 %rd48, %rd39, %rd47;\n"
" ld.shared.f32 %f96, [%rd48+0];\n"
" add.ftz.f32 %f86, %f96, %f86;\n"
" st.shared.f32 [%rd42+0], %f86;\n"
" ld.shared.f32 %f97, [%rd48+512];\n"
" add.ftz.f32 %f87, %f97, %f87;\n"
" st.shared.f32 [%rd42+512], %f87;\n"
" ld.shared.f32 %f98, [%rd48+1024];\n"
" add.ftz.f32 %f88, %f98, %f88;\n"
" st.shared.f32 [%rd42+1024], %f88;\n"
" ld.shared.f32 %f99, [%rd48+1536];\n"
" add.ftz.f32 %f89, %f99, %f89;\n"
" st.shared.f32 [%rd42+1536], %f89;\n"
" ld.shared.f32 %f100, [%rd48+2048];\n"
" add.ftz.f32 %f94, %f100, %f94;\n"
" st.shared.f32 [%rd42+2048], %f94;\n"
" ld.shared.f32 %f101, [%rd48+2560];\n"
" add.ftz.f32 %f95, %f101, %f95;\n"
" st.shared.f32 [%rd42+2560], %f95;\n"
"$Lt_0_28162:\n"
" shr.u32 %r63, %r63, 1;\n"
" mov.u32 %r65, 0;\n"
" setp.ne.u32 %p16, %r63, %r65;\n"
" @%p16 bra $Lt_0_27906;\n"
"$Lt_0_27394:\n"
" mov.f32 %f6, %f86;\n"
" mov.f32 %f8, %f87;\n"
" mov.f32 %f10, %f88;\n"
" mov.f32 %f12, %f89;\n"
" mov.f32 %f14, %f94;\n"
" mov.f32 %f16, %f95;\n"
"$Lt_0_26882:\n"
"$Lt_0_24834:\n"
/* Only threads with %r13 == 0 write results. */
" mov.u32 %r66, 0;\n"
" setp.ne.s32 %p17, %r13, %r66;\n"
" @%p17 bra $Lt_0_28930;\n"
/* %rd50 starts at engv + 4*%r8.  With eflag > 0, %f28 is stored there and
 * the pointer advances by inum floats. */
" ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];\n"
" add.u64 %rd50, %rd49, %rd5;\n"
" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r68, 0;\n"
" setp.le.s32 %p18, %r67, %r68;\n"
" @%p18 bra $Lt_0_29442;\n"
" st.global.f32 [%rd50+0], %f28;\n"
" cvt.s64.s32 %rd51, %r9;\n"
" mul.wide.s32 %rd52, %r9, 4;\n"
" add.u64 %rd50, %rd50, %rd52;\n"
/* With vflag > 0, the six reduced sums are stored inum floats apart. */
"$Lt_0_29442:\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p19, %r69, %r70;\n"
" @%p19 bra $Lt_0_29954;\n"
" mov.f32 %f102, %f6;\n"
" st.global.f32 [%rd50+0], %f102;\n"
" cvt.s64.s32 %rd53, %r9;\n"
" mul.wide.s32 %rd54, %r9, 4;\n"
" add.u64 %rd55, %rd54, %rd50;\n"
" mov.f32 %f103, %f8;\n"
" st.global.f32 [%rd55+0], %f103;\n"
" add.u64 %rd56, %rd54, %rd55;\n"
" mov.f32 %f104, %f10;\n"
" st.global.f32 [%rd56+0], %f104;\n"
" add.u64 %rd57, %rd54, %rd56;\n"
" mov.f32 %f105, %f12;\n"
" st.global.f32 [%rd57+0], %f105;\n"
" add.u64 %rd50, %rd54, %rd57;\n"
" mov.f32 %f106, %f14;\n"
" st.global.f32 [%rd50+0], %f106;\n"
" mov.f32 %f107, %f16;\n"
" add.u64 %rd58, %rd54, %rd50;\n"
" st.global.f32 [%rd58+0], %f107;\n"
/* Store {%f27, %f26, %f25, %f108} at ans + 16*%r8.  %f109 (copied into
 * %f108) is never written anywhere in this kernel, so the fourth stored
 * component is unspecified. */
"$Lt_0_29954:\n"
" ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];\n"
" mul.lo.u64 %rd60, %rd4, 16;\n"
" add.u64 %rd61, %rd59, %rd60;\n"
" mov.f32 %f108, %f109;\n"
" st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f108};\n"
"$Lt_0_28930:\n"
"$Lt_0_20226:\n"
" .loc 16 110 0\n"
" exit;\n"
"$LDWend_kernel_pair:\n"
" }\n"
/*
 * ---- kernel_pair_fast: 13 parameters (eight .u64, five .s32).
 * Follows the same flow as kernel_pair, with these visible differences:
 * sp_lj, lj1 and lj3 are first staged in shared memory, there is no
 * lj_types parameter (a fixed multiplier of 11 is used instead), and a
 * bar.sync separates the staging from the main work.  x_ is again declared
 * but never loaded.
 */
" .entry kernel_pair_fast (\n"
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n"
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
" {\n"
" .reg .u32 %r<74>;\n"
" .reg .u64 %rd<75>;\n"
" .reg .f32 %f<118>;\n"
" .reg .pred %p<24>;\n"
/* Shared storage: sp_lj (16 bytes), the lj1 and lj3 copies (1936 bytes
 * each, i.e. 121 16-byte entries), and 3072 bytes of reduction scratch. */
" .shared .align 4 .b8 __cuda___cuda_local_var_32625_33_non_const_sp_lj3268[16];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32623_34_non_const_lj13296[1936];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32624_34_non_const_lj35232[1936];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32702_55_non_const_red_acc7168[3072];\n"
" .loc 16 118 0\n"
"$LDWbegin_kernel_pair_fast:\n"
/* Threads with tid.x <= 3 each copy one float of sp_lj_in into shared
 * sp_lj. */
" cvt.s32.u32 %r1, %tid.x;\n"
" mov.u32 %r2, 3;\n"
" setp.gt.s32 %p1, %r1, %r2;\n"
" @%p1 bra $Lt_1_22530;\n"
" .loc 16 126 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32625_33_non_const_sp_lj3268;\n"
" cvt.s64.s32 %rd2, %r1;\n"
" mul.wide.s32 %rd3, %r1, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.f32 %f1, [%rd5+0];\n"
" add.u64 %rd6, %rd3, %rd1;\n"
" st.shared.f32 [%rd6+0], %f1;\n"
/* Threads with tid.x <= 120 each copy one 16-byte lj1_in entry into the
 * shared lj1 array and, when eflag > 0, the matching lj3_in entry into the
 * shared lj3 array. */
"$Lt_1_22530:\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32625_33_non_const_sp_lj3268;\n"
" mov.u32 %r3, 120;\n"
" setp.gt.s32 %p2, %r1, %r3;\n"
" @%p2 bra $Lt_1_23042;\n"
" .loc 16 128 0\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32623_34_non_const_lj13296;\n"
" cvt.s64.s32 %rd8, %r1;\n"
" mul.wide.s32 %rd9, %r1, 16;\n"
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
" add.u64 %rd11, %rd10, %rd9;\n"
" add.u64 %rd12, %rd9, %rd7;\n"
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n"
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r5, 0;\n"
" setp.le.s32 %p3, %r4, %r5;\n"
" @%p3 bra $Lt_1_23554;\n"
" .loc 16 130 0\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;\n"
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
" add.u64 %rd15, %rd14, %rd9;\n"
" add.u64 %rd16, %rd9, %rd13;\n"
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n"
"$Lt_1_23554:\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;\n"
"$Lt_1_23042:\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32623_34_non_const_lj13296;\n"
/* Zero the six vflag accumulators, then execute a block-wide barrier before
 * any thread reads the staged shared tables. */
" .loc 16 138 0\n"
" mov.f32 %f10, 0f00000000; \n"
" mov.f32 %f11, %f10;\n"
" mov.f32 %f12, 0f00000000; \n"
" mov.f32 %f13, %f12;\n"
" mov.f32 %f14, 0f00000000; \n"
" mov.f32 %f15, %f14;\n"
" mov.f32 %f16, 0f00000000; \n"
" mov.f32 %f17, %f16;\n"
" mov.f32 %f18, 0f00000000; \n"
" mov.f32 %f19, %f18;\n"
" mov.f32 %f20, 0f00000000; \n"
" mov.f32 %f21, %f20;\n"
" .loc 16 140 0\n"
" bar.sync 0;\n"
/* %r12 = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom).
 * Threads with %r12 >= inum branch to the exit label. */
" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
" div.s32 %r7, %r1, %r6;\n"
" cvt.s32.u32 %r8, %ntid.x;\n"
" div.s32 %r9, %r8, %r6;\n"
" cvt.s32.u32 %r10, %ctaid.x;\n"
" mul.lo.s32 %r11, %r10, %r9;\n"
" add.s32 %r12, %r7, %r11;\n"
" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n"
" setp.ge.s32 %p4, %r12, %r13;\n"
" @%p4 bra $Lt_1_32770;\n"
/* %rd22 = dev_nbor + 4*%r12 and %rd23 = %rd22 + 4*nbor_pitch; %r15 is the
 * int read from %rd23.  %r17 = tid.x & (t_per_atom - 1). */
" .loc 16 145 0\n"
" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
" cvt.s64.s32 %rd17, %r14;\n"
" mul.wide.s32 %rd18, %r14, 4;\n"
" cvt.s64.s32 %rd19, %r12;\n"
" mul.wide.s32 %rd20, %r12, 4;\n"
" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
" add.u64 %rd22, %rd20, %rd21;\n"
" add.u64 %rd23, %rd18, %rd22;\n"
" ld.global.s32 %r15, [%rd23+0];\n"
" sub.s32 %r16, %r6, 1;\n"
" and.b32 %r17, %r16, %r1;\n"
" cvt.s64.s32 %rd24, %r17;\n"
" mul.wide.s32 %rd25, %r17, 4;\n"
" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n"
" setp.ne.u64 %p5, %rd26, %rd21;\n"
" @%p5 bra $Lt_1_24834;\n"
/* dev_packed == dev_nbor: stride %r20 = nbor_pitch * t_per_atom ints; the
 * cursor %rd35 and end pointer %rd34 are computed relative to %rd23. */
" cvt.s32.s64 %r18, %rd17;\n"
" mul.lo.s32 %r19, %r18, %r6;\n"
" mov.s32 %r20, %r19;\n"
" mul.lo.s32 %r21, %r16, %r12;\n"
" add.s32 %r22, %r18, %r21;\n"
" cvt.s64.s32 %rd27, %r22;\n"
" mul.wide.s32 %rd28, %r22, 4;\n"
" add.u64 %rd29, %rd23, %rd28;\n"
" and.b32 %r23, %r16, %r15;\n"
" cvt.s64.s32 %rd30, %r23;\n"
" div.s32 %r24, %r15, %r6;\n"
" mul.lo.s32 %r25, %r19, %r24;\n"
" cvt.s64.s32 %rd31, %r25;\n"
" add.u64 %rd32, %rd30, %rd31;\n"
" mul.lo.u64 %rd33, %rd32, 4;\n"
" add.u64 %rd34, %rd29, %rd33;\n"
" add.u64 %rd35, %rd25, %rd29;\n"
" bra.uni $Lt_1_24578;\n"
/* dev_packed != dev_nbor: the int one more pitch past %rd23 is an offset
 * (in ints) into dev_packed; stride is t_per_atom ints and the end pointer
 * lies %r15 ints past that base. */
"$Lt_1_24834:\n"
" add.u64 %rd36, %rd18, %rd23;\n"
" ld.global.s32 %r26, [%rd36+0];\n"
" cvt.s64.s32 %rd37, %r26;\n"
" mul.wide.s32 %rd38, %r26, 4;\n"
" add.u64 %rd39, %rd26, %rd38;\n"
" cvt.s64.s32 %rd40, %r15;\n"
" mul.wide.s32 %rd41, %r15, 4;\n"
" add.u64 %rd34, %rd39, %rd41;\n"
" mov.s32 %r20, %r6;\n"
" add.u64 %rd35, %rd25, %rd39;\n"
"$Lt_1_24578:\n"
/* Fetch the four-component value for the index stored at %rd22 from pos_tex
 * into %f26..%f29; skip the loop if the cursor is not below the end. */
" .loc 16 148 0\n"
" ld.global.s32 %r27, [%rd22+0];\n"
" mov.u32 %r28, %r27;\n"
" mov.s32 %r29, 0;\n"
" mov.u32 %r30, %r29;\n"
" mov.s32 %r31, 0;\n"
" mov.u32 %r32, %r31;\n"
" mov.s32 %r33, 0;\n"
" mov.u32 %r34, %r33;\n"
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];\n"
" mov.f32 %f26, %f22;\n"
" mov.f32 %f27, %f23;\n"
" mov.f32 %f28, %f24;\n"
" mov.f32 %f29, %f25;\n"
" setp.ge.u64 %p6, %rd35, %rd34;\n"
" @%p6 bra $Lt_1_34306;\n"
/* Loop setup: %f30 = float(11 * trunc(%f29)); the table index is formed in
 * the loop by adding component 3 of the fetched value in floating point and
 * truncating.  The running sums %f31..%f34 start at zero. */
" cvt.rzi.ftz.s32.f32 %r35, %f29;\n"
" cvt.s64.s32 %rd42, %r20;\n"
" mul.lo.s32 %r36, %r35, 11;\n"
" cvt.rn.f32.s32 %f30, %r36;\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
/* Per-entry loop.  Bits 30-31 of the list word %r37 index shared sp_lj
 * (giving the scale %f35); the low 30 bits are the pos_tex fetch index. */
"$Lt_1_25602:\n"
" .loc 16 155 0\n"
" ld.global.s32 %r37, [%rd35+0];\n"
" .loc 16 156 0\n"
" shr.s32 %r38, %r37, 30;\n"
" and.b32 %r39, %r38, 3;\n"
" cvt.s64.s32 %rd43, %r39;\n"
" mul.wide.s32 %rd44, %r39, 4;\n"
" add.u64 %rd45, %rd1, %rd44;\n"
" ld.shared.f32 %f35, [%rd45+0];\n"
" .loc 16 159 0\n"
" and.b32 %r40, %r37, 1073741823;\n"
" mov.u32 %r41, %r40;\n"
" mov.s32 %r42, 0;\n"
" mov.u32 %r43, %r42;\n"
" mov.s32 %r44, 0;\n"
" mov.u32 %r45, %r44;\n"
" mov.s32 %r46, 0;\n"
" mov.u32 %r47, %r46;\n"
" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}];\n"
" mov.f32 %f40, %f36;\n"
" mov.f32 %f41, %f37;\n"
" mov.f32 %f42, %f38;\n"
" mov.f32 %f43, %f39;\n"
/* %f45, %f44, %f46 are the differences of components 0, 1, 2 and %f49 is
 * the sum of their squares.  %r48 = trunc(%f30 + component 3) selects a
 * 16-byte entry of the shared lj1 copy. */
" sub.ftz.f32 %f44, %f27, %f41;\n"
" sub.ftz.f32 %f45, %f26, %f40;\n"
" sub.ftz.f32 %f46, %f28, %f42;\n"
" mul.ftz.f32 %f47, %f44, %f44;\n"
" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n"
" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n"
" add.ftz.f32 %f50, %f30, %f43;\n"
" cvt.rzi.ftz.s32.f32 %r48, %f50;\n"
" cvt.s64.s32 %rd46, %r48;\n"
" mul.wide.s32 %rd47, %r48, 16;\n"
" add.u64 %rd48, %rd47, %rd7;\n"
/* Only entries whose first lj1 float exceeds %f49 contribute. */
" ld.shared.f32 %f51, [%rd48+0];\n"
" setp.gt.ftz.f32 %p7, %f51, %f49;\n"
" @!%p7 bra $Lt_1_27906;\n"
/* %f52 ~= 1/%f49.  The second lj1 float picks the powers used below:
 *   2.0       -> %f56 = %f52^2,  %f57 = %f52^4
 *   1.0       -> %f57 = %f52^1.5, %f56 = %f52^3
 *   otherwise -> %f56 = %f57 = %f52^3 */
" rcp.approx.ftz.f32 %f52, %f49;\n"
" ld.shared.f32 %f53, [%rd48+4];\n"
" mov.f32 %f54, 0f40000000; \n"
" setp.eq.ftz.f32 %p8, %f53, %f54;\n"
" @!%p8 bra $Lt_1_26626;\n"
" .loc 16 173 0\n"
" mul.ftz.f32 %f55, %f52, %f52;\n"
" mov.f32 %f56, %f55;\n"
" .loc 16 174 0\n"
" mul.ftz.f32 %f57, %f55, %f55;\n"
" bra.uni $Lt_1_26882;\n"
"$Lt_1_26626:\n"
" mov.f32 %f58, 0f3f800000; \n"
" setp.eq.ftz.f32 %p9, %f53, %f58;\n"
" @!%p9 bra $Lt_1_27138;\n"
" .loc 16 176 0\n"
" sqrt.approx.ftz.f32 %f59, %f52;\n"
" mul.ftz.f32 %f60, %f52, %f59;\n"
" mov.f32 %f57, %f60;\n"
" .loc 16 177 0\n"
" mul.ftz.f32 %f56, %f60, %f60;\n"
" bra.uni $Lt_1_26882;\n"
"$Lt_1_27138:\n"
" .loc 16 179 0\n"
" mul.ftz.f32 %f61, %f52, %f52;\n"
" mul.ftz.f32 %f62, %f52, %f61;\n"
" mov.f32 %f56, %f62;\n"
" .loc 16 180 0\n"
" mov.f32 %f57, %f62;\n"
"$Lt_1_26882:\n"
"$Lt_1_26370:\n"
/* %f69 = %f35 * %f52 * %f56 * (lj1[2] * %f57 - lj1[3]); it scales the
 * three differences that are added into %f33, %f32 and %f31. */
" .loc 16 182 0\n"
" mul.ftz.f32 %f63, %f52, %f35;\n"
" mul.ftz.f32 %f64, %f56, %f63;\n"
" ld.shared.v2.f32 {%f65,%f66}, [%rd48+8];\n"
" mul.ftz.f32 %f67, %f65, %f57;\n"
" sub.ftz.f32 %f68, %f67, %f66;\n"
" mul.ftz.f32 %f69, %f64, %f68;\n"
" .loc 16 184 0\n"
" fma.rn.ftz.f32 %f33, %f45, %f69, %f33;\n"
" .loc 16 185 0\n"
" fma.rn.ftz.f32 %f32, %f44, %f69, %f32;\n"
" .loc 16 186 0\n"
" fma.rn.ftz.f32 %f31, %f46, %f69, %f31;\n"
/* eflag > 0: %f34 += %f35 * %f56 * (lj3[0] * %f57 - lj3[1]) - lj3[2],
 * reading the shared lj3 entry at the same byte offset. */
" ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r50, 0;\n"
" setp.le.s32 %p10, %r49, %r50;\n"
" @%p10 bra $Lt_1_27394;\n"
" .loc 16 188 0\n"
" add.u64 %rd49, %rd47, %rd13;\n"
" ld.shared.v4.f32 {%f70,%f71,%f72,_}, [%rd49+0];\n"
" mul.ftz.f32 %f73, %f35, %f56;\n"
" mul.ftz.f32 %f74, %f70, %f57;\n"
" sub.ftz.f32 %f75, %f74, %f71;\n"
" mul.ftz.f32 %f76, %f73, %f75;\n"
" sub.ftz.f32 %f77, %f76, %f72;\n"
" add.ftz.f32 %f34, %f34, %f77;\n"
/* vflag > 0: accumulate %f69 times the six pairwise products of the
 * differences, in component order 00, 11, 22, 01, 02, 12. */
"$Lt_1_27394:\n"
" ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r52, 0;\n"
" setp.le.s32 %p11, %r51, %r52;\n"
" @%p11 bra $Lt_1_27906;\n"
" .loc 16 191 0\n"
" mov.f32 %f78, %f11;\n"
" mul.ftz.f32 %f79, %f45, %f45;\n"
" fma.rn.ftz.f32 %f80, %f69, %f79, %f78;\n"
" mov.f32 %f11, %f80;\n"
" .loc 16 192 0\n"
" mov.f32 %f81, %f13;\n"
" fma.rn.ftz.f32 %f82, %f69, %f47, %f81;\n"
" mov.f32 %f13, %f82;\n"
" .loc 16 193 0\n"
" mov.f32 %f83, %f15;\n"
" mul.ftz.f32 %f84, %f46, %f46;\n"
" fma.rn.ftz.f32 %f85, %f69, %f84, %f83;\n"
" mov.f32 %f15, %f85;\n"
" .loc 16 194 0\n"
" mov.f32 %f86, %f17;\n"
" mul.ftz.f32 %f87, %f44, %f45;\n"
" fma.rn.ftz.f32 %f88, %f69, %f87, %f86;\n"
" mov.f32 %f17, %f88;\n"
" .loc 16 195 0\n"
" mov.f32 %f89, %f19;\n"
" mul.ftz.f32 %f90, %f45, %f46;\n"
" fma.rn.ftz.f32 %f91, %f69, %f90, %f89;\n"
" mov.f32 %f19, %f91;\n"
" .loc 16 196 0\n"
" mul.ftz.f32 %f92, %f44, %f46;\n"
" fma.rn.ftz.f32 %f20, %f69, %f92, %f20;\n"
" mov.f32 %f21, %f20;\n"
"$Lt_1_27906:\n"
"$Lt_1_25858:\n"
/* Advance the cursor by the stride (%r20 ints) and loop while it is still
 * below the end pointer. */
" .loc 16 153 0\n"
" mul.lo.u64 %rd50, %rd42, 4;\n"
" add.u64 %rd35, %rd35, %rd50;\n"
" setp.lt.u64 %p12, %rd35, %rd34;\n"
" @%p12 bra $Lt_1_25602;\n"
" bra.uni $Lt_1_25090;\n"
/* Loop skipped entirely: the four running sums are simply zero. */
"$Lt_1_34306:\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
/* Cross-thread reduction, skipped when t_per_atom <= 1.  Each thread parks
 * its four sums in red_acc at index tid.x, 512 bytes apart per quantity. */
"$Lt_1_25090:\n"
" mov.u32 %r53, 1;\n"
" setp.le.s32 %p13, %r6, %r53;\n"
" @%p13 bra $Lt_1_30722;\n"
" .loc 16 201 0\n"
" mov.u64 %rd51, __cuda___cuda_local_var_32702_55_non_const_red_acc7168;\n"
" cvt.s64.s32 %rd52, %r1;\n"
" mul.wide.s32 %rd53, %r1, 4;\n"
" add.u64 %rd54, %rd51, %rd53;\n"
" mov.f32 %f93, %f33;\n"
" st.shared.f32 [%rd54+0], %f93;\n"
" mov.f32 %f94, %f32;\n"
" st.shared.f32 [%rd54+512], %f94;\n"
" mov.f32 %f95, %f31;\n"
" st.shared.f32 [%rd54+1024], %f95;\n"
" mov.f32 %f96, %f34;\n"
" st.shared.f32 [%rd54+1536], %f96;\n"
/* %r58 = t_per_atom / 2 (signed, rounded toward zero) is the first offset. */
" shr.s32 %r54, %r6, 31;\n"
" mov.s32 %r55, 1;\n"
" and.b32 %r56, %r54, %r55;\n"
" add.s32 %r57, %r56, %r6;\n"
" shr.s32 %r58, %r57, 1;\n"
" mov.s32 %r59, %r58;\n"
" mov.u32 %r60, 0;\n"
" setp.ne.u32 %p14, %r58, %r60;\n"
" @!%p14 bra $Lt_1_29186;\n"
/* Halving loop: threads with %r17 < offset add the slot `offset` entries
 * above their own into their own slot; the offset is then shifted right by
 * one until it reaches zero.
 * NOTE(review): there is no bar.sync between steps; presumably this relies
 * on the cooperating threads executing in lockstep -- confirm. */
"$Lt_1_29698:\n"
" setp.ge.u32 %p15, %r17, %r59;\n"
" @%p15 bra $Lt_1_29954;\n"
" add.u32 %r61, %r1, %r59;\n"
" cvt.u64.u32 %rd55, %r61;\n"
" mul.wide.u32 %rd56, %r61, 4;\n"
" add.u64 %rd57, %rd51, %rd56;\n"
" ld.shared.f32 %f97, [%rd57+0];\n"
" add.ftz.f32 %f93, %f97, %f93;\n"
" st.shared.f32 [%rd54+0], %f93;\n"
" ld.shared.f32 %f98, [%rd57+512];\n"
" add.ftz.f32 %f94, %f98, %f94;\n"
" st.shared.f32 [%rd54+512], %f94;\n"
" ld.shared.f32 %f99, [%rd57+1024];\n"
" add.ftz.f32 %f95, %f99, %f95;\n"
" st.shared.f32 [%rd54+1024], %f95;\n"
" ld.shared.f32 %f100, [%rd57+1536];\n"
" add.ftz.f32 %f96, %f100, %f96;\n"
" st.shared.f32 [%rd54+1536], %f96;\n"
"$Lt_1_29954:\n"
" shr.u32 %r59, %r59, 1;\n"
" mov.u32 %r62, 0;\n"
" setp.ne.u32 %p16, %r59, %r62;\n"
" @%p16 bra $Lt_1_29698;\n"
"$Lt_1_29186:\n"
" mov.f32 %f33, %f93;\n"
" mov.f32 %f32, %f94;\n"
" mov.f32 %f31, %f95;\n"
" mov.f32 %f34, %f96;\n"
/* vflag > 0: run the same reduction again for the six vflag sums (slots at
 * byte offsets 0..2560 in 512-byte steps). */
" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r64, 0;\n"
" setp.le.s32 %p17, %r63, %r64;\n"
" @%p17 bra $Lt_1_30722;\n"
" mov.f32 %f93, %f11;\n"
" st.shared.f32 [%rd54+0], %f93;\n"
" mov.f32 %f94, %f13;\n"
" st.shared.f32 [%rd54+512], %f94;\n"
" mov.f32 %f95, %f15;\n"
" st.shared.f32 [%rd54+1024], %f95;\n"
" mov.f32 %f96, %f17;\n"
" st.shared.f32 [%rd54+1536], %f96;\n"
" mov.f32 %f101, %f19;\n"
" st.shared.f32 [%rd54+2048], %f101;\n"
" mov.f32 %f102, %f20;\n"
" st.shared.f32 [%rd54+2560], %f102;\n"
" mov.s32 %r65, %r58;\n"
" @!%p14 bra $Lt_1_31234;\n"
"$Lt_1_31746:\n"
" setp.ge.u32 %p18, %r17, %r65;\n"
" @%p18 bra $Lt_1_32002;\n"
" add.u32 %r66, %r1, %r65;\n"
" cvt.u64.u32 %rd58, %r66;\n"
" mul.wide.u32 %rd59, %r66, 4;\n"
" add.u64 %rd60, %rd51, %rd59;\n"
" ld.shared.f32 %f103, [%rd60+0];\n"
" add.ftz.f32 %f93, %f103, %f93;\n"
" st.shared.f32 [%rd54+0], %f93;\n"
" ld.shared.f32 %f104, [%rd60+512];\n"
" add.ftz.f32 %f94, %f104, %f94;\n"
" st.shared.f32 [%rd54+512], %f94;\n"
" ld.shared.f32 %f105, [%rd60+1024];\n"
" add.ftz.f32 %f95, %f105, %f95;\n"
" st.shared.f32 [%rd54+1024], %f95;\n"
" ld.shared.f32 %f106, [%rd60+1536];\n"
" add.ftz.f32 %f96, %f106, %f96;\n"
" st.shared.f32 [%rd54+1536], %f96;\n"
" ld.shared.f32 %f107, [%rd60+2048];\n"
" add.ftz.f32 %f101, %f107, %f101;\n"
" st.shared.f32 [%rd54+2048], %f101;\n"
" ld.shared.f32 %f108, [%rd60+2560];\n"
" add.ftz.f32 %f102, %f108, %f102;\n"
" st.shared.f32 [%rd54+2560], %f102;\n"
"$Lt_1_32002:\n"
" shr.u32 %r65, %r65, 1;\n"
" mov.u32 %r67, 0;\n"
" setp.ne.u32 %p19, %r65, %r67;\n"
" @%p19 bra $Lt_1_31746;\n"
"$Lt_1_31234:\n"
" mov.f32 %f11, %f93;\n"
" mov.f32 %f13, %f94;\n"
" mov.f32 %f15, %f95;\n"
" mov.f32 %f17, %f96;\n"
" mov.f32 %f19, %f101;\n"
" mov.f32 %f21, %f102;\n"
"$Lt_1_30722:\n"
"$Lt_1_28674:\n"
/* Only threads with %r17 == 0 write results. */
" mov.u32 %r68, 0;\n"
" setp.ne.s32 %p20, %r17, %r68;\n"
" @%p20 bra $Lt_1_32770;\n"
/* %rd62 starts at engv + 4*%r12.  With eflag > 0, %f34 is stored there and
 * the pointer advances by inum floats. */
" ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];\n"
" add.u64 %rd62, %rd61, %rd20;\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p21, %r69, %r70;\n"
" @%p21 bra $Lt_1_33282;\n"
" st.global.f32 [%rd62+0], %f34;\n"
" cvt.s64.s32 %rd63, %r13;\n"
" mul.wide.s32 %rd64, %r13, 4;\n"
" add.u64 %rd62, %rd62, %rd64;\n"
/* With vflag > 0, the six reduced sums are stored inum floats apart. */
"$Lt_1_33282:\n"
" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r72, 0;\n"
" setp.le.s32 %p22, %r71, %r72;\n"
" @%p22 bra $Lt_1_33794;\n"
" mov.f32 %f109, %f11;\n"
" st.global.f32 [%rd62+0], %f109;\n"
" cvt.s64.s32 %rd65, %r13;\n"
" mul.wide.s32 %rd66, %r13, 4;\n"
" add.u64 %rd67, %rd66, %rd62;\n"
" mov.f32 %f110, %f13;\n"
" st.global.f32 [%rd67+0], %f110;\n"
" add.u64 %rd68, %rd66, %rd67;\n"
" mov.f32 %f111, %f15;\n"
" st.global.f32 [%rd68+0], %f111;\n"
" add.u64 %rd69, %rd66, %rd68;\n"
" mov.f32 %f112, %f17;\n"
" st.global.f32 [%rd69+0], %f112;\n"
" add.u64 %rd62, %rd66, %rd69;\n"
" mov.f32 %f113, %f19;\n"
" st.global.f32 [%rd62+0], %f113;\n"
" mov.f32 %f114, %f21;\n"
" add.u64 %rd70, %rd66, %rd62;\n"
" st.global.f32 [%rd70+0], %f114;\n"
/* Store {%f33, %f32, %f31, %f115} at ans + 16*%r12.  %f116 (copied into
 * %f115) is never written anywhere in this kernel, so the fourth stored
 * component is unspecified. */
"$Lt_1_33794:\n"
" ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];\n"
" mul.lo.u64 %rd72, %rd19, 16;\n"
" add.u64 %rd73, %rd71, %rd72;\n"
" mov.f32 %f115, %f116;\n"
" st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f115};\n"
"$Lt_1_32770:\n"
"$Lt_1_24066:\n"
" .loc 16 204 0\n"
" exit;\n"
"$LDWend_kernel_pair_fast:\n"
" }\n"
;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,957 +0,0 @@
/*
 * coul_long: PTX assembly text (ISA version 2.3, target sm_20, 64-bit
 * addresses) stored as a single C string built from adjacent literals.
 * Every quoted line below is runtime text; only the C comments placed
 * between the literals are documentation.
 *
 * The module declares two global texture references (pos_tex, q_tex) and
 * two entry points:
 *   kernel_pair       - copies the four sp_cl_in values into shared memory
 *                       redundantly in every thread, no barrier.
 *   kernel_pair_fast  - threads with tid.x <= 3 copy one sp_cl_in value
 *                       each, followed by bar.sync 0.
 * Apart from that prologue and register numbering, both bodies perform the
 * same computation.
 *
 * NOTE(review): the name suggests the long-range (Ewald real-space) Coulomb
 * pair kernel and the ".loc 16" markers presumably point into the .cu source
 * it was generated from -- no .file table is present in this string, so
 * confirm against the build that produced it.
 */
const char * coul_long =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
" .global .texref pos_tex;\n"
" .global .texref q_tex;\n"
/*
 * Entry point kernel_pair.
 * Parameters actually read by the body (via ld.param): sp_cl_in, dev_nbor,
 * dev_packed, ans, engv, eflag, vflag, inum, nbor_pitch, cut_coulsq, qqrd2e,
 * g_ewald, t_per_atom.
 * x_, lj1, lj3, lj_types and q_ are declared but never loaded here; positions
 * and charges are fetched through pos_tex / q_tex instead.
 */
" .entry kernel_pair (\n"
" .param .u64 __cudaparm_kernel_pair_x_,\n"
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
" .param .u64 __cudaparm_kernel_pair_sp_cl_in,\n"
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_ans,\n"
" .param .u64 __cudaparm_kernel_pair_engv,\n"
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_inum,\n"
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
" .param .u64 __cudaparm_kernel_pair_q_,\n"
" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n"
" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n"
" .param .f32 __cudaparm_kernel_pair_g_ewald,\n"
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
" {\n"
" .reg .u32 %r<81>;\n"
" .reg .u64 %rd<58>;\n"
" .reg .f32 %f<132>;\n"
" .reg .pred %p<19>;\n"
/*
 * Shared storage: 16 bytes for the four sp_cl values and 3072 bytes of
 * reduction scratch (used below as six rows spaced 512 bytes apart).
 */
" .shared .align 16 .b8 __cuda___cuda_local_var_32541_33_non_const_sp_cl112[16];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32611_37_non_const_red_acc128[3072];\n"
" .loc 16 36 0\n"
"$LDWbegin_kernel_pair:\n"
/* Load sp_cl_in[0..3] from global memory and store them to shared sp_cl. */
" .loc 16 41 0\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_cl_in];\n"
" ldu.global.f32 %f1, [%rd1+0];\n"
" .loc 16 42 0\n"
" ld.global.f32 %f2, [%rd1+4];\n"
" .loc 16 43 0\n"
" ld.global.f32 %f3, [%rd1+8];\n"
" .loc 16 44 0\n"
" ld.global.f32 %f4, [%rd1+12];\n"
" st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_cl112+0], {%f1,%f2,%f3,%f4};\n"
/*
 * Zero the six accumulators %f6, %f8, %f10, %f12, %f14, %f16; they are only
 * updated and written out when vflag > 0.
 */
" .loc 16 51 0\n"
" mov.f32 %f5, 0f00000000; \n"
" mov.f32 %f6, %f5;\n"
" mov.f32 %f7, 0f00000000; \n"
" mov.f32 %f8, %f7;\n"
" mov.f32 %f9, 0f00000000; \n"
" mov.f32 %f10, %f9;\n"
" mov.f32 %f11, 0f00000000; \n"
" mov.f32 %f12, %f11;\n"
" mov.f32 %f13, 0f00000000; \n"
" mov.f32 %f14, %f13;\n"
" mov.f32 %f15, 0f00000000; \n"
" mov.f32 %f16, %f15;\n"
/*
 * Work item index: %r8 = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom).
 * Threads with %r8 >= inum jump straight to the exit label.
 */
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
" cvt.s32.u32 %r2, %tid.x;\n"
" div.s32 %r3, %r2, %r1;\n"
" cvt.s32.u32 %r4, %ntid.x;\n"
" div.s32 %r5, %r4, %r1;\n"
" cvt.s32.u32 %r6, %ctaid.x;\n"
" mul.lo.s32 %r7, %r6, %r5;\n"
" add.s32 %r8, %r3, %r7;\n"
" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n"
" setp.ge.s32 %p1, %r8, %r9;\n"
" @%p1 bra $Lt_0_25858;\n"
/*
 * Read two 32-bit entries from dev_nbor: %r10 at element %r8 and %r12 one
 * nbor_pitch row further on. %r10 is later used as the texture index of this
 * work item; %r12 bounds the neighbor walk (presumably the neighbor count --
 * TODO confirm against the host-side layout).
 * %r14 = (t_per_atom - 1) & tid.x is this thread's offset inside its group.
 * NOTE(review): using a mask here assumes t_per_atom is a power of two.
 */
" .loc 16 56 0\n"
" cvt.s64.s32 %rd2, %r8;\n"
" mul.wide.s32 %rd3, %r8, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
" add.u64 %rd5, %rd3, %rd4;\n"
" ld.global.s32 %r10, [%rd5+0];\n"
" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n"
" cvt.s64.s32 %rd6, %r11;\n"
" mul.wide.s32 %rd7, %r11, 4;\n"
" add.u64 %rd8, %rd7, %rd5;\n"
" ld.global.s32 %r12, [%rd8+0];\n"
" sub.s32 %r13, %r1, 1;\n"
" and.b32 %r14, %r13, %r2;\n"
" cvt.s64.s32 %rd9, %r14;\n"
" mul.wide.s32 %rd10, %r14, 4;\n"
" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n"
" setp.ne.u64 %p2, %rd11, %rd4;\n"
" @%p2 bra $Lt_0_19458;\n"
/*
 * Case dev_packed == dev_nbor: the list is walked inside dev_nbor itself.
 *   stride (%r17) = nbor_pitch * t_per_atom elements
 *   start  (%rd20) = %rd14 + 4 * %r14
 *   end    (%rd19) = %rd14 + 4 * ((%r12 / t_per_atom) * stride
 *                                 + (%r12 & (t_per_atom - 1)))
 * where %rd14 = %rd8 + 4 * (nbor_pitch + (t_per_atom - 1) * %r8).
 */
" cvt.s32.s64 %r15, %rd6;\n"
" mul.lo.s32 %r16, %r15, %r1;\n"
" mov.s32 %r17, %r16;\n"
" mul.lo.s32 %r18, %r13, %r8;\n"
" add.s32 %r19, %r15, %r18;\n"
" cvt.s64.s32 %rd12, %r19;\n"
" mul.wide.s32 %rd13, %r19, 4;\n"
" add.u64 %rd14, %rd8, %rd13;\n"
" and.b32 %r20, %r13, %r12;\n"
" cvt.s64.s32 %rd15, %r20;\n"
" div.s32 %r21, %r12, %r1;\n"
" mul.lo.s32 %r22, %r16, %r21;\n"
" cvt.s64.s32 %rd16, %r22;\n"
" add.u64 %rd17, %rd15, %rd16;\n"
" mul.lo.u64 %rd18, %rd17, 4;\n"
" add.u64 %rd19, %rd14, %rd18;\n"
" add.u64 %rd20, %rd10, %rd14;\n"
" bra.uni $Lt_0_19202;\n"
/*
 * Case dev_packed != dev_nbor: a third dev_nbor entry (%r23, one more
 * nbor_pitch row on) is an element offset into dev_packed.
 *   stride (%r17) = t_per_atom elements
 *   start  (%rd20) = dev_packed + 4 * %r23 + 4 * %r14
 *   end    (%rd19) = dev_packed + 4 * %r23 + 4 * %r12
 */
"$Lt_0_19458:\n"
" add.u64 %rd21, %rd7, %rd8;\n"
" ld.global.s32 %r23, [%rd21+0];\n"
" cvt.s64.s32 %rd22, %r23;\n"
" mul.wide.s32 %rd23, %r23, 4;\n"
" add.u64 %rd24, %rd11, %rd23;\n"
" cvt.s64.s32 %rd25, %r12;\n"
" mul.wide.s32 %rd26, %r12, 4;\n"
" add.u64 %rd19, %rd24, %rd26;\n"
" mov.s32 %r17, %r1;\n"
" add.u64 %rd20, %rd10, %rd24;\n"
"$Lt_0_19202:\n"
/*
 * Fetch pos_tex[%r10] and keep its first three components in %f21..%f23,
 * then fetch q_tex[%r10] and keep its first component in %f28.
 */
" .loc 16 59 0\n"
" mov.u32 %r24, %r10;\n"
" mov.s32 %r25, 0;\n"
" mov.u32 %r26, %r25;\n"
" mov.s32 %r27, 0;\n"
" mov.u32 %r28, %r27;\n"
" mov.s32 %r29, 0;\n"
" mov.u32 %r30, %r29;\n"
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n"
" mov.f32 %f21, %f17;\n"
" mov.f32 %f22, %f18;\n"
" mov.f32 %f23, %f19;\n"
" .loc 16 60 0\n"
" mov.u32 %r31, %r10;\n"
" mov.s32 %r32, 0;\n"
" mov.u32 %r33, %r32;\n"
" mov.s32 %r34, 0;\n"
" mov.u32 %r35, %r34;\n"
" mov.s32 %r36, 0;\n"
" mov.u32 %r37, %r36;\n"
" tex.1d.v4.f32.s32 {%f24,%f25,%f26,%f27},[q_tex,{%r31,%r33,%r35,%r37}];\n"
" mov.f32 %f28, %f24;\n"
/*
 * Skip the loop entirely when start >= end. Otherwise zero the running sums:
 * %f32, %f31, %f30 (three components later stored to ans) and %f33 (value
 * later stored to engv when eflag > 0).
 */
" setp.ge.u64 %p3, %rd20, %rd19;\n"
" @%p3 bra $Lt_0_27394;\n"
" cvt.s64.s32 %rd27, %r17;\n"
" ld.param.f32 %f29, [__cudaparm_kernel_pair_cut_coulsq];\n"
" mov.f32 %f30, 0f00000000; \n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.u64 %rd28, __cuda___cuda_local_var_32541_33_non_const_sp_cl112;\n"
/*
 * Neighbor loop. Each list entry %r38 packs two fields:
 *   bits 31..30 -> index 0..3 into shared sp_cl; %f36 = 1.0 - sp_cl[index]
 *   bits 29..0  -> texture index %r41 (mask 1073741823 = 0x3FFFFFFF)
 */
"$Lt_0_20226:\n"
" .loc 16 63 0\n"
" ld.global.s32 %r38, [%rd20+0];\n"
" .loc 16 66 0\n"
" mov.f32 %f34, 0f3f800000; \n"
" shr.s32 %r39, %r38, 30;\n"
" and.b32 %r40, %r39, 3;\n"
" cvt.s64.s32 %rd29, %r40;\n"
" mul.wide.s32 %rd30, %r40, 4;\n"
" add.u64 %rd31, %rd28, %rd30;\n"
" ld.shared.f32 %f35, [%rd31+0];\n"
" sub.ftz.f32 %f36, %f34, %f35;\n"
/*
 * Component differences against pos_tex[%r41]:
 *   %f45 = first, %f44 = second, %f46 = third component difference.
 * %f49 is the sum of their squares; pairs with %f49 >= cut_coulsq are skipped.
 */
" .loc 16 69 0\n"
" and.b32 %r41, %r38, 1073741823;\n"
" mov.u32 %r42, %r41;\n"
" mov.s32 %r43, 0;\n"
" mov.u32 %r44, %r43;\n"
" mov.s32 %r45, 0;\n"
" mov.u32 %r46, %r45;\n"
" mov.s32 %r47, 0;\n"
" mov.u32 %r48, %r47;\n"
" tex.1d.v4.f32.s32 {%f37,%f38,%f39,%f40},[pos_tex,{%r42,%r44,%r46,%r48}];\n"
" mov.f32 %f41, %f37;\n"
" mov.f32 %f42, %f38;\n"
" mov.f32 %f43, %f39;\n"
" sub.ftz.f32 %f44, %f22, %f42;\n"
" sub.ftz.f32 %f45, %f21, %f41;\n"
" sub.ftz.f32 %f46, %f23, %f43;\n"
" mul.ftz.f32 %f47, %f44, %f44;\n"
" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n"
" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n"
" setp.lt.ftz.f32 %p4, %f49, %f29;\n"
" @!%p4 bra $Lt_0_20994;\n"
/*
 * %f50 = 1 / %f49 and %f51 = rsqrt(%f50), i.e. approximately sqrt(%f49).
 * %f53 = g_ewald * %f51.
 * %f58 = 2^(-%f53^2 * 1.442695) = exp(-%f53^2)   (0f3fb8aa3b is log2(e)).
 */
" .loc 20 518 0\n"
" rcp.approx.ftz.f32 %f50, %f49;\n"
" rsqrt.approx.ftz.f32 %f51, %f50;\n"
" ld.param.f32 %f52, [__cudaparm_kernel_pair_g_ewald];\n"
" mul.ftz.f32 %f53, %f52, %f51;\n"
" mul.ftz.f32 %f54, %f53, %f53;\n"
" neg.ftz.f32 %f55, %f54;\n"
" mov.f32 %f56, 0f3fb8aa3b; \n"
" mul.ftz.f32 %f57, %f55, %f56;\n"
" ex2.approx.ftz.f32 %f58, %f57;\n"
/*
 * %f62 = 1 / (1 + 0.3275911 * %f53), then a fifth-order polynomial in %f62
 * evaluated in Horner form with coefficients (lowest order first)
 * 0f3e827906, 0fbe91a98e, 0f3fb5f0e3, 0fbfba00e3, 0f3f87dc22, and finally
 * %f73 = %f58 * polynomial.
 * NOTE(review): this matches the shape of the Abramowitz-Stegun rational
 * approximation of erfc(x); confirm against the generating source.
 */
" .loc 16 85 0\n"
" mov.f32 %f59, 0f3f800000; \n"
" mov.f32 %f60, 0f3ea7ba05; \n"
" fma.rn.ftz.f32 %f61, %f60, %f53, %f59;\n"
" rcp.approx.ftz.f32 %f62, %f61;\n"
" mov.f32 %f63, 0f3e827906; \n"
" mov.f32 %f64, 0fbe91a98e; \n"
" mov.f32 %f65, 0f3fb5f0e3; \n"
" mov.f32 %f66, 0fbfba00e3; \n"
" mov.f32 %f67, 0f3f87dc22; \n"
" fma.rn.ftz.f32 %f68, %f67, %f62, %f66;\n"
" fma.rn.ftz.f32 %f69, %f62, %f68, %f65;\n"
" fma.rn.ftz.f32 %f70, %f62, %f69, %f64;\n"
" fma.rn.ftz.f32 %f71, %f62, %f70, %f63;\n"
" mul.ftz.f32 %f72, %f62, %f71;\n"
" mul.ftz.f32 %f73, %f58, %f72;\n"
/* Fetch q_tex[%r41]; its first component is kept in %f78. */
" .loc 16 86 0\n"
" mov.u32 %r49, %r41;\n"
" mov.s32 %r50, 0;\n"
" mov.u32 %r51, %r50;\n"
" mov.s32 %r52, 0;\n"
" mov.u32 %r53, %r52;\n"
" mov.s32 %r54, 0;\n"
" mov.u32 %r55, %r54;\n"
" tex.1d.v4.f32.s32 {%f74,%f75,%f76,%f77},[q_tex,{%r49,%r51,%r53,%r55}];\n"
" mov.f32 %f78, %f74;\n"
/*
 * %f82 = qqrd2e * %f28 * %f78 / %f51.
 * %f88 = %f50 * %f82 * (%f73 + 1.12837917 * %f53 * %f58 - %f36)
 *        (0f3f906ebb = 1.12837917, i.e. 2/sqrt(pi)).
 * %f88 scales the three component differences added into %f32, %f31, %f30.
 */
" .loc 16 87 0\n"
" ld.param.f32 %f79, [__cudaparm_kernel_pair_qqrd2e];\n"
" mul.ftz.f32 %f80, %f79, %f28;\n"
" mul.ftz.f32 %f81, %f80, %f78;\n"
" div.approx.ftz.f32 %f82, %f81, %f51;\n"
" mov.f32 %f83, 0f3f906ebb; \n"
" mul.ftz.f32 %f84, %f53, %f83;\n"
" fma.rn.ftz.f32 %f85, %f58, %f84, %f73;\n"
" sub.ftz.f32 %f86, %f85, %f36;\n"
" mul.ftz.f32 %f87, %f82, %f86;\n"
" mul.ftz.f32 %f88, %f50, %f87;\n"
" .loc 16 89 0\n"
" fma.rn.ftz.f32 %f32, %f45, %f88, %f32;\n"
" .loc 16 90 0\n"
" fma.rn.ftz.f32 %f31, %f44, %f88, %f31;\n"
" .loc 16 91 0\n"
" fma.rn.ftz.f32 %f30, %f46, %f88, %f30;\n"
/*
 * %f33 += %f82 * (%f73 - %f36), committed through selp only when eflag > 0.
 * The six vflag accumulators below are skipped when vflag <= 0.
 */
" .loc 16 78 0\n"
" sub.ftz.f32 %f89, %f73, %f36;\n"
" fma.rn.ftz.f32 %f90, %f82, %f89, %f33;\n"
" ld.param.s32 %r56, [__cudaparm_kernel_pair_eflag];\n"
" mov.s32 %r57, 0;\n"
" setp.gt.s32 %p5, %r56, %r57;\n"
" selp.f32 %f33, %f90, %f33, %p5;\n"
" ld.param.s32 %r58, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r59, 0;\n"
" setp.le.s32 %p6, %r58, %r59;\n"
" @%p6 bra $Lt_0_20994;\n"
/*
 * Accumulate %f88 times the products of the component differences:
 *   %f6  += %f45*%f45   %f8  += %f44*%f44   %f10 += %f46*%f46
 *   %f12 += %f44*%f45   %f14 += %f45*%f46   %f15/%f16 += %f44*%f46
 */
" .loc 16 97 0\n"
" mov.f32 %f91, %f6;\n"
" mul.ftz.f32 %f92, %f45, %f45;\n"
" fma.rn.ftz.f32 %f93, %f88, %f92, %f91;\n"
" mov.f32 %f6, %f93;\n"
" .loc 16 98 0\n"
" mov.f32 %f94, %f8;\n"
" fma.rn.ftz.f32 %f95, %f88, %f47, %f94;\n"
" mov.f32 %f8, %f95;\n"
" .loc 16 99 0\n"
" mov.f32 %f96, %f10;\n"
" mul.ftz.f32 %f97, %f46, %f46;\n"
" fma.rn.ftz.f32 %f98, %f88, %f97, %f96;\n"
" mov.f32 %f10, %f98;\n"
" .loc 16 100 0\n"
" mov.f32 %f99, %f12;\n"
" mul.ftz.f32 %f100, %f44, %f45;\n"
" fma.rn.ftz.f32 %f101, %f88, %f100, %f99;\n"
" mov.f32 %f12, %f101;\n"
" .loc 16 101 0\n"
" mov.f32 %f102, %f14;\n"
" mul.ftz.f32 %f103, %f45, %f46;\n"
" fma.rn.ftz.f32 %f104, %f88, %f103, %f102;\n"
" mov.f32 %f14, %f104;\n"
" .loc 16 102 0\n"
" mul.ftz.f32 %f105, %f44, %f46;\n"
" fma.rn.ftz.f32 %f15, %f88, %f105, %f15;\n"
" mov.f32 %f16, %f15;\n"
/* Advance the list pointer by stride * 4 bytes and loop while below end. */
"$Lt_0_20994:\n"
"$Lt_0_20482:\n"
" .loc 16 62 0\n"
" mul.lo.u64 %rd32, %rd27, 4;\n"
" add.u64 %rd20, %rd20, %rd32;\n"
" setp.lt.u64 %p7, %rd20, %rd19;\n"
" @%p7 bra $Lt_0_20226;\n"
" bra.uni $Lt_0_19714;\n"
/* Empty-list path: the four running sums are simply zero. */
"$Lt_0_27394:\n"
" mov.f32 %f30, 0f00000000; \n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
/*
 * Combine the partial sums of the t_per_atom threads sharing one work item
 * (skipped when t_per_atom <= 1). Each thread writes its four sums to
 * red_acc at byte offset 4 * tid.x in rows 512 bytes (128 floats) apart.
 * NOTE(review): the 128-float row spacing presumably caps the block size at
 * 128 threads -- confirm against the launch configuration.
 */
"$Lt_0_19714:\n"
" mov.u32 %r60, 1;\n"
" setp.le.s32 %p8, %r1, %r60;\n"
" @%p8 bra $Lt_0_23810;\n"
" .loc 16 112 0\n"
" mov.u64 %rd33, __cuda___cuda_local_var_32611_37_non_const_red_acc128;\n"
" cvt.s64.s32 %rd34, %r2;\n"
" mul.wide.s32 %rd35, %r2, 4;\n"
" add.u64 %rd36, %rd33, %rd35;\n"
" mov.f32 %f106, %f32;\n"
" st.shared.f32 [%rd36+0], %f106;\n"
" .loc 16 113 0\n"
" mov.f32 %f107, %f31;\n"
" st.shared.f32 [%rd36+512], %f107;\n"
" .loc 16 114 0\n"
" mov.f32 %f108, %f30;\n"
" st.shared.f32 [%rd36+1024], %f108;\n"
" .loc 16 115 0\n"
" mov.f32 %f109, %f33;\n"
" st.shared.f32 [%rd36+1536], %f109;\n"
/*
 * %r65 = t_per_atom / 2 (signed divide rounding toward zero). The loop runs
 * s = %r65, %r65 >> 1, ... while s != 0; threads with %r14 < s add the
 * entries at tid.x + s into their own sums and store them back.
 * NOTE(review): no bar.sync is issued inside this loop, so correctness
 * presumably relies on the threads of one group running in lockstep within
 * a warp -- confirm.
 */
" .loc 16 117 0\n"
" shr.s32 %r61, %r1, 31;\n"
" mov.s32 %r62, 1;\n"
" and.b32 %r63, %r61, %r62;\n"
" add.s32 %r64, %r63, %r1;\n"
" shr.s32 %r65, %r64, 1;\n"
" mov.s32 %r66, %r65;\n"
" mov.u32 %r67, 0;\n"
" setp.ne.u32 %p9, %r65, %r67;\n"
" @!%p9 bra $Lt_0_22274;\n"
"$Lt_0_22786:\n"
" setp.ge.u32 %p10, %r14, %r66;\n"
" @%p10 bra $Lt_0_23042;\n"
" .loc 16 120 0\n"
" add.u32 %r68, %r2, %r66;\n"
" cvt.u64.u32 %rd37, %r68;\n"
" mul.wide.u32 %rd38, %r68, 4;\n"
" add.u64 %rd39, %rd33, %rd38;\n"
" ld.shared.f32 %f110, [%rd39+0];\n"
" add.ftz.f32 %f106, %f110, %f106;\n"
" st.shared.f32 [%rd36+0], %f106;\n"
" ld.shared.f32 %f111, [%rd39+512];\n"
" add.ftz.f32 %f107, %f111, %f107;\n"
" st.shared.f32 [%rd36+512], %f107;\n"
" ld.shared.f32 %f112, [%rd39+1024];\n"
" add.ftz.f32 %f108, %f112, %f108;\n"
" st.shared.f32 [%rd36+1024], %f108;\n"
" ld.shared.f32 %f113, [%rd39+1536];\n"
" add.ftz.f32 %f109, %f113, %f109;\n"
" st.shared.f32 [%rd36+1536], %f109;\n"
"$Lt_0_23042:\n"
" .loc 16 117 0\n"
" shr.u32 %r66, %r66, 1;\n"
" mov.u32 %r69, 0;\n"
" setp.ne.u32 %p11, %r66, %r69;\n"
" @%p11 bra $Lt_0_22786;\n"
/* Copy the reduced values back into %f32, %f31, %f30, %f33. */
"$Lt_0_22274:\n"
" .loc 16 124 0\n"
" mov.f32 %f32, %f106;\n"
" .loc 16 125 0\n"
" mov.f32 %f31, %f107;\n"
" .loc 16 126 0\n"
" mov.f32 %f30, %f108;\n"
" .loc 16 127 0\n"
" mov.f32 %f33, %f109;\n"
" ld.param.s32 %r70, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r71, 0;\n"
" setp.le.s32 %p12, %r70, %r71;\n"
" @%p12 bra $Lt_0_23810;\n"
/*
 * vflag > 0: run the same halving reduction over the six accumulators,
 * using all six 512-byte rows of red_acc (offsets 0 .. 2560).
 */
" .loc 16 131 0\n"
" mov.f32 %f106, %f6;\n"
" st.shared.f32 [%rd36+0], %f106;\n"
" mov.f32 %f107, %f8;\n"
" st.shared.f32 [%rd36+512], %f107;\n"
" mov.f32 %f108, %f10;\n"
" st.shared.f32 [%rd36+1024], %f108;\n"
" mov.f32 %f109, %f12;\n"
" st.shared.f32 [%rd36+1536], %f109;\n"
" mov.f32 %f114, %f14;\n"
" st.shared.f32 [%rd36+2048], %f114;\n"
" mov.f32 %f115, %f15;\n"
" st.shared.f32 [%rd36+2560], %f115;\n"
" .loc 16 133 0\n"
" mov.s32 %r72, %r65;\n"
" @!%p9 bra $Lt_0_24322;\n"
"$Lt_0_24834:\n"
" setp.ge.u32 %p13, %r14, %r72;\n"
" @%p13 bra $Lt_0_25090;\n"
" .loc 16 136 0\n"
" add.u32 %r73, %r2, %r72;\n"
" cvt.u64.u32 %rd40, %r73;\n"
" mul.wide.u32 %rd41, %r73, 4;\n"
" add.u64 %rd42, %rd33, %rd41;\n"
" ld.shared.f32 %f116, [%rd42+0];\n"
" add.ftz.f32 %f106, %f116, %f106;\n"
" st.shared.f32 [%rd36+0], %f106;\n"
" ld.shared.f32 %f117, [%rd42+512];\n"
" add.ftz.f32 %f107, %f117, %f107;\n"
" st.shared.f32 [%rd36+512], %f107;\n"
" ld.shared.f32 %f118, [%rd42+1024];\n"
" add.ftz.f32 %f108, %f118, %f108;\n"
" st.shared.f32 [%rd36+1024], %f108;\n"
" ld.shared.f32 %f119, [%rd42+1536];\n"
" add.ftz.f32 %f109, %f119, %f109;\n"
" st.shared.f32 [%rd36+1536], %f109;\n"
" ld.shared.f32 %f120, [%rd42+2048];\n"
" add.ftz.f32 %f114, %f120, %f114;\n"
" st.shared.f32 [%rd36+2048], %f114;\n"
" ld.shared.f32 %f121, [%rd42+2560];\n"
" add.ftz.f32 %f115, %f121, %f115;\n"
" st.shared.f32 [%rd36+2560], %f115;\n"
"$Lt_0_25090:\n"
" .loc 16 133 0\n"
" shr.u32 %r72, %r72, 1;\n"
" mov.u32 %r74, 0;\n"
" setp.ne.u32 %p14, %r72, %r74;\n"
" @%p14 bra $Lt_0_24834;\n"
"$Lt_0_24322:\n"
" .loc 16 141 0\n"
" mov.f32 %f6, %f106;\n"
" mov.f32 %f8, %f107;\n"
" mov.f32 %f10, %f108;\n"
" mov.f32 %f12, %f109;\n"
" mov.f32 %f14, %f114;\n"
" mov.f32 %f16, %f115;\n"
/*
 * Output stage: only the thread with %r14 == 0 in each group writes.
 * %rd44 starts at engv + 4 * %r8 and advances in steps of 4 * inum bytes.
 */
"$Lt_0_23810:\n"
"$Lt_0_21762:\n"
" mov.u32 %r75, 0;\n"
" setp.ne.s32 %p15, %r14, %r75;\n"
" @%p15 bra $Lt_0_25858;\n"
" .loc 16 147 0\n"
" ld.param.u64 %rd43, [__cudaparm_kernel_pair_engv];\n"
" add.u64 %rd44, %rd43, %rd3;\n"
" ld.param.s32 %r76, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r77, 0;\n"
" setp.le.s32 %p16, %r76, %r77;\n"
" @%p16 bra $Lt_0_26370;\n"
/* eflag > 0: store 0.0 in the first slot and %f33 in the next one. */
" .loc 16 149 0\n"
" mov.f32 %f122, 0f00000000; \n"
" st.global.f32 [%rd44+0], %f122;\n"
" .loc 16 150 0\n"
" cvt.s64.s32 %rd45, %r9;\n"
" mul.wide.s32 %rd46, %r9, 4;\n"
" add.u64 %rd47, %rd46, %rd44;\n"
" .loc 16 151 0\n"
" st.global.f32 [%rd47+0], %f33;\n"
" .loc 16 152 0\n"
" add.u64 %rd44, %rd46, %rd47;\n"
"$Lt_0_26370:\n"
" ld.param.s32 %r78, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r79, 0;\n"
" setp.le.s32 %p17, %r78, %r79;\n"
" @%p17 bra $Lt_0_26882;\n"
/* vflag > 0: store %f6, %f8, %f10, %f12, %f14, %f16 in six further slots. */
" .loc 16 156 0\n"
" mov.f32 %f123, %f6;\n"
" st.global.f32 [%rd44+0], %f123;\n"
" .loc 16 157 0\n"
" cvt.s64.s32 %rd48, %r9;\n"
" mul.wide.s32 %rd49, %r9, 4;\n"
" add.u64 %rd50, %rd49, %rd44;\n"
" .loc 16 156 0\n"
" mov.f32 %f124, %f8;\n"
" st.global.f32 [%rd50+0], %f124;\n"
" .loc 16 157 0\n"
" add.u64 %rd51, %rd49, %rd50;\n"
" .loc 16 156 0\n"
" mov.f32 %f125, %f10;\n"
" st.global.f32 [%rd51+0], %f125;\n"
" .loc 16 157 0\n"
" add.u64 %rd52, %rd49, %rd51;\n"
" .loc 16 156 0\n"
" mov.f32 %f126, %f12;\n"
" st.global.f32 [%rd52+0], %f126;\n"
" .loc 16 157 0\n"
" add.u64 %rd44, %rd49, %rd52;\n"
" .loc 16 156 0\n"
" mov.f32 %f127, %f14;\n"
" st.global.f32 [%rd44+0], %f127;\n"
" mov.f32 %f128, %f16;\n"
" add.u64 %rd53, %rd49, %rd44;\n"
" st.global.f32 [%rd53+0], %f128;\n"
/*
 * Store {%f32, %f31, %f30, %f129} as one 16-byte vector at ans + 16 * %r8.
 * NOTE(review): %f129 is copied from %f130, which is never written in this
 * kernel, so the fourth stored component is unspecified -- presumably the
 * consumer ignores it; confirm.
 */
"$Lt_0_26882:\n"
" .loc 16 160 0\n"
" ld.param.u64 %rd54, [__cudaparm_kernel_pair_ans];\n"
" mul.lo.u64 %rd55, %rd2, 16;\n"
" add.u64 %rd56, %rd54, %rd55;\n"
" mov.f32 %f129, %f130;\n"
" st.global.v4.f32 [%rd56+0], {%f32,%f31,%f30,%f129};\n"
"$Lt_0_25858:\n"
"$Lt_0_18690:\n"
" .loc 16 163 0\n"
" exit;\n"
"$LDWend_kernel_pair:\n"
" }\n"
/*
 * Entry point kernel_pair_fast.
 * Parameters actually read by the body: sp_cl_in, dev_nbor, dev_packed, ans,
 * engv, eflag, vflag, inum, nbor_pitch, cut_coulsq, qqrd2e, g_ewald,
 * t_per_atom. x_, lj1_in, lj3_in and q_ are declared but never loaded.
 * Unlike kernel_pair there is no lj_types parameter.
 */
" .entry kernel_pair_fast (\n"
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_sp_cl_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
" .param .u64 __cudaparm_kernel_pair_fast_engv,\n"
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
" .param .u64 __cudaparm_kernel_pair_fast_q_,\n"
" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n"
" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n"
" .param .f32 __cudaparm_kernel_pair_fast_g_ewald,\n"
" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
" {\n"
" .reg .u32 %r<82>;\n"
" .reg .u64 %rd<62>;\n"
" .reg .f32 %f<129>;\n"
" .reg .pred %p<20>;\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32678_33_non_const_sp_cl3304[16];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32748_37_non_const_red_acc3320[3072];\n"
" .loc 16 173 0\n"
"$LDWbegin_kernel_pair_fast:\n"
/*
 * Cooperative load: threads with tid.x <= 3 each copy sp_cl_in[tid.x] into
 * shared sp_cl[tid.x]; all other threads skip to the label below.
 */
" cvt.s32.u32 %r1, %tid.x;\n"
" mov.u32 %r2, 3;\n"
" setp.gt.s32 %p1, %r1, %r2;\n"
" @%p1 bra $Lt_1_19458;\n"
" .loc 16 179 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32678_33_non_const_sp_cl3304;\n"
" cvt.s64.s32 %rd2, %r1;\n"
" mul.wide.s32 %rd3, %r1, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_cl_in];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.f32 %f1, [%rd5+0];\n"
" add.u64 %rd6, %rd3, %rd1;\n"
" st.shared.f32 [%rd6+0], %f1;\n"
"$Lt_1_19458:\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32678_33_non_const_sp_cl3304;\n"
/* Zero the six vflag accumulators %f3, %f5, %f7, %f9, %f11, %f13. */
" .loc 16 186 0\n"
" mov.f32 %f2, 0f00000000; \n"
" mov.f32 %f3, %f2;\n"
" mov.f32 %f4, 0f00000000; \n"
" mov.f32 %f5, %f4;\n"
" mov.f32 %f6, 0f00000000; \n"
" mov.f32 %f7, %f6;\n"
" mov.f32 %f8, 0f00000000; \n"
" mov.f32 %f9, %f8;\n"
" mov.f32 %f10, 0f00000000; \n"
" mov.f32 %f11, %f10;\n"
" mov.f32 %f12, 0f00000000; \n"
" mov.f32 %f13, %f12;\n"
/*
 * Barrier so the shared sp_cl values are visible to every thread, then the
 * work item index %r9 = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom).
 * Threads with %r9 >= inum exit; this happens after the barrier.
 */
" .loc 16 188 0\n"
" bar.sync 0;\n"
" ld.param.s32 %r3, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
" div.s32 %r4, %r1, %r3;\n"
" cvt.s32.u32 %r5, %ntid.x;\n"
" div.s32 %r6, %r5, %r3;\n"
" cvt.s32.u32 %r7, %ctaid.x;\n"
" mul.lo.s32 %r8, %r7, %r6;\n"
" add.s32 %r9, %r4, %r8;\n"
" ld.param.s32 %r10, [__cudaparm_kernel_pair_fast_inum];\n"
" setp.ge.s32 %p2, %r9, %r10;\n"
" @%p2 bra $Lt_1_27138;\n"
/*
 * %r11 = dev_nbor[%r9], %r13 = the entry one nbor_pitch row further on,
 * %r15 = (t_per_atom - 1) & tid.x (offset of this thread inside its group).
 */
" .loc 16 193 0\n"
" cvt.s64.s32 %rd7, %r9;\n"
" mul.wide.s32 %rd8, %r9, 4;\n"
" ld.param.u64 %rd9, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
" add.u64 %rd10, %rd8, %rd9;\n"
" ld.global.s32 %r11, [%rd10+0];\n"
" ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
" cvt.s64.s32 %rd11, %r12;\n"
" mul.wide.s32 %rd12, %r12, 4;\n"
" add.u64 %rd13, %rd12, %rd10;\n"
" ld.global.s32 %r13, [%rd13+0];\n"
" sub.s32 %r14, %r3, 1;\n"
" and.b32 %r15, %r14, %r1;\n"
" cvt.s64.s32 %rd14, %r15;\n"
" mul.wide.s32 %rd15, %r15, 4;\n"
" ld.param.u64 %rd16, [__cudaparm_kernel_pair_fast_dev_packed];\n"
" setp.ne.u64 %p3, %rd16, %rd9;\n"
" @%p3 bra $Lt_1_20738;\n"
/*
 * dev_packed == dev_nbor: walk the list inside dev_nbor with a stride (%r18)
 * of nbor_pitch * t_per_atom elements; start is %rd25, end is %rd24.
 */
" cvt.s32.s64 %r16, %rd11;\n"
" mul.lo.s32 %r17, %r16, %r3;\n"
" mov.s32 %r18, %r17;\n"
" mul.lo.s32 %r19, %r14, %r9;\n"
" add.s32 %r20, %r16, %r19;\n"
" cvt.s64.s32 %rd17, %r20;\n"
" mul.wide.s32 %rd18, %r20, 4;\n"
" add.u64 %rd19, %rd13, %rd18;\n"
" and.b32 %r21, %r14, %r13;\n"
" cvt.s64.s32 %rd20, %r21;\n"
" div.s32 %r22, %r13, %r3;\n"
" mul.lo.s32 %r23, %r17, %r22;\n"
" cvt.s64.s32 %rd21, %r23;\n"
" add.u64 %rd22, %rd20, %rd21;\n"
" mul.lo.u64 %rd23, %rd22, 4;\n"
" add.u64 %rd24, %rd19, %rd23;\n"
" add.u64 %rd25, %rd15, %rd19;\n"
" bra.uni $Lt_1_20482;\n"
/*
 * dev_packed != dev_nbor: a third dev_nbor entry (%r24) is an element offset
 * into dev_packed; the stride (%r18) is t_per_atom elements.
 */
"$Lt_1_20738:\n"
" add.u64 %rd26, %rd12, %rd13;\n"
" ld.global.s32 %r24, [%rd26+0];\n"
" cvt.s64.s32 %rd27, %r24;\n"
" mul.wide.s32 %rd28, %r24, 4;\n"
" add.u64 %rd29, %rd16, %rd28;\n"
" cvt.s64.s32 %rd30, %r13;\n"
" mul.wide.s32 %rd31, %r13, 4;\n"
" add.u64 %rd24, %rd29, %rd31;\n"
" mov.s32 %r18, %r3;\n"
" add.u64 %rd25, %rd15, %rd29;\n"
"$Lt_1_20482:\n"
/*
 * Fetch pos_tex[%r11] (first three components kept in %f18..%f20) and
 * q_tex[%r11] (first component kept in %f25).
 */
" .loc 16 196 0\n"
" mov.u32 %r25, %r11;\n"
" mov.s32 %r26, 0;\n"
" mov.u32 %r27, %r26;\n"
" mov.s32 %r28, 0;\n"
" mov.u32 %r29, %r28;\n"
" mov.s32 %r30, 0;\n"
" mov.u32 %r31, %r30;\n"
" tex.1d.v4.f32.s32 {%f14,%f15,%f16,%f17},[pos_tex,{%r25,%r27,%r29,%r31}];\n"
" mov.f32 %f18, %f14;\n"
" mov.f32 %f19, %f15;\n"
" mov.f32 %f20, %f16;\n"
" .loc 16 197 0\n"
" mov.u32 %r32, %r11;\n"
" mov.s32 %r33, 0;\n"
" mov.u32 %r34, %r33;\n"
" mov.s32 %r35, 0;\n"
" mov.u32 %r36, %r35;\n"
" mov.s32 %r37, 0;\n"
" mov.u32 %r38, %r37;\n"
" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[q_tex,{%r32,%r34,%r36,%r38}];\n"
" mov.f32 %f25, %f21;\n"
/*
 * Skip the loop when start >= end; otherwise zero the running sums
 * %f29, %f28, %f27 (stored to ans) and %f30 (stored to engv if eflag > 0).
 */
" setp.ge.u64 %p4, %rd25, %rd24;\n"
" @%p4 bra $Lt_1_28674;\n"
" cvt.s64.s32 %rd32, %r18;\n"
" ld.param.f32 %f26, [__cudaparm_kernel_pair_fast_cut_coulsq];\n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" mov.f32 %f29, 0f00000000; \n"
" mov.f32 %f30, 0f00000000; \n"
/*
 * Neighbor loop. Entry %r39: bits 31..30 select sp_cl[0..3]
 * (%f33 = 1.0 - sp_cl[index]); bits 29..0 give the texture index %r42.
 */
"$Lt_1_21506:\n"
" .loc 16 200 0\n"
" ld.global.s32 %r39, [%rd25+0];\n"
" .loc 16 203 0\n"
" mov.f32 %f31, 0f3f800000; \n"
" shr.s32 %r40, %r39, 30;\n"
" and.b32 %r41, %r40, 3;\n"
" cvt.s64.s32 %rd33, %r41;\n"
" mul.wide.s32 %rd34, %r41, 4;\n"
" add.u64 %rd35, %rd1, %rd34;\n"
" ld.shared.f32 %f32, [%rd35+0];\n"
" sub.ftz.f32 %f33, %f31, %f32;\n"
/*
 * Component differences %f42 (first), %f41 (second), %f43 (third) against
 * pos_tex[%r42]; %f46 is the sum of squares, tested against cut_coulsq.
 */
" .loc 16 206 0\n"
" and.b32 %r42, %r39, 1073741823;\n"
" mov.u32 %r43, %r42;\n"
" mov.s32 %r44, 0;\n"
" mov.u32 %r45, %r44;\n"
" mov.s32 %r46, 0;\n"
" mov.u32 %r47, %r46;\n"
" mov.s32 %r48, 0;\n"
" mov.u32 %r49, %r48;\n"
" tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r43,%r45,%r47,%r49}];\n"
" mov.f32 %f38, %f34;\n"
" mov.f32 %f39, %f35;\n"
" mov.f32 %f40, %f36;\n"
" sub.ftz.f32 %f41, %f19, %f39;\n"
" sub.ftz.f32 %f42, %f18, %f38;\n"
" sub.ftz.f32 %f43, %f20, %f40;\n"
" mul.ftz.f32 %f44, %f41, %f41;\n"
" fma.rn.ftz.f32 %f45, %f42, %f42, %f44;\n"
" fma.rn.ftz.f32 %f46, %f43, %f43, %f45;\n"
" setp.lt.ftz.f32 %p5, %f46, %f26;\n"
" @!%p5 bra $Lt_1_22274;\n"
/*
 * %f47 = 1 / %f46, %f48 = rsqrt(%f47) (approximately sqrt(%f46)),
 * %f50 = g_ewald * %f48, %f55 = exp(-%f50^2) computed through ex2.
 */
" .loc 20 518 0\n"
" rcp.approx.ftz.f32 %f47, %f46;\n"
" rsqrt.approx.ftz.f32 %f48, %f47;\n"
" ld.param.f32 %f49, [__cudaparm_kernel_pair_fast_g_ewald];\n"
" mul.ftz.f32 %f50, %f49, %f48;\n"
" mul.ftz.f32 %f51, %f50, %f50;\n"
" neg.ftz.f32 %f52, %f51;\n"
" mov.f32 %f53, 0f3fb8aa3b; \n"
" mul.ftz.f32 %f54, %f52, %f53;\n"
" ex2.approx.ftz.f32 %f55, %f54;\n"
/*
 * %f59 = 1 / (1 + 0.3275911 * %f50); %f70 = %f55 times a fifth-order
 * polynomial in %f59 (Horner form), using the same five coefficients as the
 * corresponding step of kernel_pair.
 */
" .loc 16 222 0\n"
" mov.f32 %f56, 0f3f800000; \n"
" mov.f32 %f57, 0f3ea7ba05; \n"
" fma.rn.ftz.f32 %f58, %f57, %f50, %f56;\n"
" rcp.approx.ftz.f32 %f59, %f58;\n"
" mov.f32 %f60, 0f3e827906; \n"
" mov.f32 %f61, 0fbe91a98e; \n"
" mov.f32 %f62, 0f3fb5f0e3; \n"
" mov.f32 %f63, 0fbfba00e3; \n"
" mov.f32 %f64, 0f3f87dc22; \n"
" fma.rn.ftz.f32 %f65, %f64, %f59, %f63;\n"
" fma.rn.ftz.f32 %f66, %f59, %f65, %f62;\n"
" fma.rn.ftz.f32 %f67, %f59, %f66, %f61;\n"
" fma.rn.ftz.f32 %f68, %f59, %f67, %f60;\n"
" mul.ftz.f32 %f69, %f59, %f68;\n"
" mul.ftz.f32 %f70, %f55, %f69;\n"
/* Fetch q_tex[%r42]; its first component is kept in %f75. */
" .loc 16 223 0\n"
" mov.u32 %r50, %r42;\n"
" mov.s32 %r51, 0;\n"
" mov.u32 %r52, %r51;\n"
" mov.s32 %r53, 0;\n"
" mov.u32 %r54, %r53;\n"
" mov.s32 %r55, 0;\n"
" mov.u32 %r56, %r55;\n"
" tex.1d.v4.f32.s32 {%f71,%f72,%f73,%f74},[q_tex,{%r50,%r52,%r54,%r56}];\n"
" mov.f32 %f75, %f71;\n"
/*
 * %f79 = qqrd2e * %f25 * %f75 / %f48.
 * %f85 = %f47 * %f79 * (%f70 + 1.12837917 * %f50 * %f55 - %f33), which
 * scales the component differences added into %f29, %f28, %f27.
 */
" .loc 16 224 0\n"
" ld.param.f32 %f76, [__cudaparm_kernel_pair_fast_qqrd2e];\n"
" mul.ftz.f32 %f77, %f76, %f25;\n"
" mul.ftz.f32 %f78, %f77, %f75;\n"
" div.approx.ftz.f32 %f79, %f78, %f48;\n"
" mov.f32 %f80, 0f3f906ebb; \n"
" mul.ftz.f32 %f81, %f50, %f80;\n"
" fma.rn.ftz.f32 %f82, %f55, %f81, %f70;\n"
" sub.ftz.f32 %f83, %f82, %f33;\n"
" mul.ftz.f32 %f84, %f79, %f83;\n"
" mul.ftz.f32 %f85, %f47, %f84;\n"
" .loc 16 226 0\n"
" fma.rn.ftz.f32 %f29, %f42, %f85, %f29;\n"
" .loc 16 227 0\n"
" fma.rn.ftz.f32 %f28, %f41, %f85, %f28;\n"
" .loc 16 228 0\n"
" fma.rn.ftz.f32 %f27, %f43, %f85, %f27;\n"
/* %f30 += %f79 * (%f70 - %f33), committed only when eflag > 0. */
" .loc 16 215 0\n"
" sub.ftz.f32 %f86, %f70, %f33;\n"
" fma.rn.ftz.f32 %f87, %f79, %f86, %f30;\n"
" ld.param.s32 %r57, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.s32 %r58, 0;\n"
" setp.gt.s32 %p6, %r57, %r58;\n"
" selp.f32 %f30, %f87, %f30, %p6;\n"
" ld.param.s32 %r59, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r60, 0;\n"
" setp.le.s32 %p7, %r59, %r60;\n"
" @%p7 bra $Lt_1_22274;\n"
/*
 * vflag > 0: accumulate %f85 times the pairwise products of the component
 * differences into %f3, %f5, %f7, %f9, %f11 and %f12/%f13.
 */
" .loc 16 234 0\n"
" mov.f32 %f88, %f3;\n"
" mul.ftz.f32 %f89, %f42, %f42;\n"
" fma.rn.ftz.f32 %f90, %f85, %f89, %f88;\n"
" mov.f32 %f3, %f90;\n"
" .loc 16 235 0\n"
" mov.f32 %f91, %f5;\n"
" fma.rn.ftz.f32 %f92, %f85, %f44, %f91;\n"
" mov.f32 %f5, %f92;\n"
" .loc 16 236 0\n"
" mov.f32 %f93, %f7;\n"
" mul.ftz.f32 %f94, %f43, %f43;\n"
" fma.rn.ftz.f32 %f95, %f85, %f94, %f93;\n"
" mov.f32 %f7, %f95;\n"
" .loc 16 237 0\n"
" mov.f32 %f96, %f9;\n"
" mul.ftz.f32 %f97, %f41, %f42;\n"
" fma.rn.ftz.f32 %f98, %f85, %f97, %f96;\n"
" mov.f32 %f9, %f98;\n"
" .loc 16 238 0\n"
" mov.f32 %f99, %f11;\n"
" mul.ftz.f32 %f100, %f42, %f43;\n"
" fma.rn.ftz.f32 %f101, %f85, %f100, %f99;\n"
" mov.f32 %f11, %f101;\n"
" .loc 16 239 0\n"
" mul.ftz.f32 %f102, %f41, %f43;\n"
" fma.rn.ftz.f32 %f12, %f85, %f102, %f12;\n"
" mov.f32 %f13, %f12;\n"
/* Advance the list pointer by stride * 4 bytes and loop while below end. */
"$Lt_1_22274:\n"
"$Lt_1_21762:\n"
" .loc 16 199 0\n"
" mul.lo.u64 %rd36, %rd32, 4;\n"
" add.u64 %rd25, %rd25, %rd36;\n"
" setp.lt.u64 %p8, %rd25, %rd24;\n"
" @%p8 bra $Lt_1_21506;\n"
" bra.uni $Lt_1_20994;\n"
/* Empty-list path: the four running sums are simply zero. */
"$Lt_1_28674:\n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" mov.f32 %f29, 0f00000000; \n"
" mov.f32 %f30, 0f00000000; \n"
/*
 * Group reduction (skipped when t_per_atom <= 1): each thread writes its
 * four sums to red_acc at byte offset 4 * tid.x in rows 512 bytes apart,
 * then a halving loop starting at %r66 = t_per_atom / 2 folds them together.
 * NOTE(review): as in kernel_pair, no bar.sync appears inside the loop.
 */
"$Lt_1_20994:\n"
" mov.u32 %r61, 1;\n"
" setp.le.s32 %p9, %r3, %r61;\n"
" @%p9 bra $Lt_1_25090;\n"
" .loc 16 249 0\n"
" mov.u64 %rd37, __cuda___cuda_local_var_32748_37_non_const_red_acc3320;\n"
" cvt.s64.s32 %rd38, %r1;\n"
" mul.wide.s32 %rd39, %r1, 4;\n"
" add.u64 %rd40, %rd37, %rd39;\n"
" mov.f32 %f103, %f29;\n"
" st.shared.f32 [%rd40+0], %f103;\n"
" .loc 16 250 0\n"
" mov.f32 %f104, %f28;\n"
" st.shared.f32 [%rd40+512], %f104;\n"
" .loc 16 251 0\n"
" mov.f32 %f105, %f27;\n"
" st.shared.f32 [%rd40+1024], %f105;\n"
" .loc 16 252 0\n"
" mov.f32 %f106, %f30;\n"
" st.shared.f32 [%rd40+1536], %f106;\n"
" .loc 16 254 0\n"
" shr.s32 %r62, %r3, 31;\n"
" mov.s32 %r63, 1;\n"
" and.b32 %r64, %r62, %r63;\n"
" add.s32 %r65, %r64, %r3;\n"
" shr.s32 %r66, %r65, 1;\n"
" mov.s32 %r67, %r66;\n"
" mov.u32 %r68, 0;\n"
" setp.ne.u32 %p10, %r66, %r68;\n"
" @!%p10 bra $Lt_1_23554;\n"
"$Lt_1_24066:\n"
" setp.ge.u32 %p11, %r15, %r67;\n"
" @%p11 bra $Lt_1_24322;\n"
" .loc 16 257 0\n"
" add.u32 %r69, %r1, %r67;\n"
" cvt.u64.u32 %rd41, %r69;\n"
" mul.wide.u32 %rd42, %r69, 4;\n"
" add.u64 %rd43, %rd37, %rd42;\n"
" ld.shared.f32 %f107, [%rd43+0];\n"
" add.ftz.f32 %f103, %f107, %f103;\n"
" st.shared.f32 [%rd40+0], %f103;\n"
" ld.shared.f32 %f108, [%rd43+512];\n"
" add.ftz.f32 %f104, %f108, %f104;\n"
" st.shared.f32 [%rd40+512], %f104;\n"
" ld.shared.f32 %f109, [%rd43+1024];\n"
" add.ftz.f32 %f105, %f109, %f105;\n"
" st.shared.f32 [%rd40+1024], %f105;\n"
" ld.shared.f32 %f110, [%rd43+1536];\n"
" add.ftz.f32 %f106, %f110, %f106;\n"
" st.shared.f32 [%rd40+1536], %f106;\n"
"$Lt_1_24322:\n"
" .loc 16 254 0\n"
" shr.u32 %r67, %r67, 1;\n"
" mov.u32 %r70, 0;\n"
" setp.ne.u32 %p12, %r67, %r70;\n"
" @%p12 bra $Lt_1_24066;\n"
/* Copy the reduced values back into %f29, %f28, %f27, %f30. */
"$Lt_1_23554:\n"
" .loc 16 261 0\n"
" mov.f32 %f29, %f103;\n"
" .loc 16 262 0\n"
" mov.f32 %f28, %f104;\n"
" .loc 16 263 0\n"
" mov.f32 %f27, %f105;\n"
" .loc 16 264 0\n"
" mov.f32 %f30, %f106;\n"
" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r72, 0;\n"
" setp.le.s32 %p13, %r71, %r72;\n"
" @%p13 bra $Lt_1_25090;\n"
/* vflag > 0: the same halving reduction over the six accumulators. */
" .loc 16 268 0\n"
" mov.f32 %f103, %f3;\n"
" st.shared.f32 [%rd40+0], %f103;\n"
" mov.f32 %f104, %f5;\n"
" st.shared.f32 [%rd40+512], %f104;\n"
" mov.f32 %f105, %f7;\n"
" st.shared.f32 [%rd40+1024], %f105;\n"
" mov.f32 %f106, %f9;\n"
" st.shared.f32 [%rd40+1536], %f106;\n"
" mov.f32 %f111, %f11;\n"
" st.shared.f32 [%rd40+2048], %f111;\n"
" mov.f32 %f112, %f12;\n"
" st.shared.f32 [%rd40+2560], %f112;\n"
" .loc 16 270 0\n"
" mov.s32 %r73, %r66;\n"
" @!%p10 bra $Lt_1_25602;\n"
"$Lt_1_26114:\n"
" setp.ge.u32 %p14, %r15, %r73;\n"
" @%p14 bra $Lt_1_26370;\n"
" .loc 16 273 0\n"
" add.u32 %r74, %r1, %r73;\n"
" cvt.u64.u32 %rd44, %r74;\n"
" mul.wide.u32 %rd45, %r74, 4;\n"
" add.u64 %rd46, %rd37, %rd45;\n"
" ld.shared.f32 %f113, [%rd46+0];\n"
" add.ftz.f32 %f103, %f113, %f103;\n"
" st.shared.f32 [%rd40+0], %f103;\n"
" ld.shared.f32 %f114, [%rd46+512];\n"
" add.ftz.f32 %f104, %f114, %f104;\n"
" st.shared.f32 [%rd40+512], %f104;\n"
" ld.shared.f32 %f115, [%rd46+1024];\n"
" add.ftz.f32 %f105, %f115, %f105;\n"
" st.shared.f32 [%rd40+1024], %f105;\n"
" ld.shared.f32 %f116, [%rd46+1536];\n"
" add.ftz.f32 %f106, %f116, %f106;\n"
" st.shared.f32 [%rd40+1536], %f106;\n"
" ld.shared.f32 %f117, [%rd46+2048];\n"
" add.ftz.f32 %f111, %f117, %f111;\n"
" st.shared.f32 [%rd40+2048], %f111;\n"
" ld.shared.f32 %f118, [%rd46+2560];\n"
" add.ftz.f32 %f112, %f118, %f112;\n"
" st.shared.f32 [%rd40+2560], %f112;\n"
"$Lt_1_26370:\n"
" .loc 16 270 0\n"
" shr.u32 %r73, %r73, 1;\n"
" mov.u32 %r75, 0;\n"
" setp.ne.u32 %p15, %r73, %r75;\n"
" @%p15 bra $Lt_1_26114;\n"
"$Lt_1_25602:\n"
" .loc 16 278 0\n"
" mov.f32 %f3, %f103;\n"
" mov.f32 %f5, %f104;\n"
" mov.f32 %f7, %f105;\n"
" mov.f32 %f9, %f106;\n"
" mov.f32 %f11, %f111;\n"
" mov.f32 %f13, %f112;\n"
/*
 * Output stage: only the thread with %r15 == 0 in each group writes.
 * %rd48 starts at engv + 4 * %r9 and advances in steps of 4 * inum bytes.
 */
"$Lt_1_25090:\n"
"$Lt_1_23042:\n"
" mov.u32 %r76, 0;\n"
" setp.ne.s32 %p16, %r15, %r76;\n"
" @%p16 bra $Lt_1_27138;\n"
" .loc 16 284 0\n"
" ld.param.u64 %rd47, [__cudaparm_kernel_pair_fast_engv];\n"
" add.u64 %rd48, %rd47, %rd8;\n"
" ld.param.s32 %r77, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r78, 0;\n"
" setp.le.s32 %p17, %r77, %r78;\n"
" @%p17 bra $Lt_1_27650;\n"
/* eflag > 0: store 0.0 in the first slot and %f30 in the next one. */
" .loc 16 286 0\n"
" mov.f32 %f119, 0f00000000; \n"
" st.global.f32 [%rd48+0], %f119;\n"
" .loc 16 287 0\n"
" cvt.s64.s32 %rd49, %r10;\n"
" mul.wide.s32 %rd50, %r10, 4;\n"
" add.u64 %rd51, %rd50, %rd48;\n"
" .loc 16 288 0\n"
" st.global.f32 [%rd51+0], %f30;\n"
" .loc 16 289 0\n"
" add.u64 %rd48, %rd50, %rd51;\n"
"$Lt_1_27650:\n"
" ld.param.s32 %r79, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r80, 0;\n"
" setp.le.s32 %p18, %r79, %r80;\n"
" @%p18 bra $Lt_1_28162;\n"
/* vflag > 0: store %f3, %f5, %f7, %f9, %f11, %f13 in six further slots. */
" .loc 16 293 0\n"
" mov.f32 %f120, %f3;\n"
" st.global.f32 [%rd48+0], %f120;\n"
" .loc 16 294 0\n"
" cvt.s64.s32 %rd52, %r10;\n"
" mul.wide.s32 %rd53, %r10, 4;\n"
" add.u64 %rd54, %rd53, %rd48;\n"
" .loc 16 293 0\n"
" mov.f32 %f121, %f5;\n"
" st.global.f32 [%rd54+0], %f121;\n"
" .loc 16 294 0\n"
" add.u64 %rd55, %rd53, %rd54;\n"
" .loc 16 293 0\n"
" mov.f32 %f122, %f7;\n"
" st.global.f32 [%rd55+0], %f122;\n"
" .loc 16 294 0\n"
" add.u64 %rd56, %rd53, %rd55;\n"
" .loc 16 293 0\n"
" mov.f32 %f123, %f9;\n"
" st.global.f32 [%rd56+0], %f123;\n"
" .loc 16 294 0\n"
" add.u64 %rd48, %rd53, %rd56;\n"
" .loc 16 293 0\n"
" mov.f32 %f124, %f11;\n"
" st.global.f32 [%rd48+0], %f124;\n"
" mov.f32 %f125, %f13;\n"
" add.u64 %rd57, %rd53, %rd48;\n"
" st.global.f32 [%rd57+0], %f125;\n"
/*
 * Store {%f29, %f28, %f27, %f126} as one 16-byte vector at ans + 16 * %r9.
 * NOTE(review): %f126 is copied from %f127, which is never written in this
 * kernel, so the fourth stored component is unspecified -- presumably the
 * consumer ignores it; confirm.
 */
"$Lt_1_28162:\n"
" .loc 16 297 0\n"
" ld.param.u64 %rd58, [__cudaparm_kernel_pair_fast_ans];\n"
" mul.lo.u64 %rd59, %rd7, 16;\n"
" add.u64 %rd60, %rd58, %rd59;\n"
" mov.f32 %f126, %f127;\n"
" st.global.v4.f32 [%rd60+0], {%f29,%f28,%f27,%f126};\n"
"$Lt_1_27138:\n"
"$Lt_1_19970:\n"
" .loc 16 300 0\n"
" exit;\n"
"$LDWend_kernel_pair_fast:\n"
" }\n"
;

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,134 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009a81_00000000-9_lal_device.cpp3.i (/home/sjplimp/ccBI#.zwVkZj)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009a81_00000000-8_lal_device.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lal_device.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
// kernel_zero: each thread stores a 32-bit zero into one element of `mem`.
//   mem   (.u64) : base address of the buffer; elements are 4 bytes wide (index * 4).
//   numel (.s32) : element count; a thread whose index is >= numel stores nothing.
.entry kernel_zero (
.param .u64 __cudaparm_kernel_zero_mem,
.param .s32 __cudaparm_kernel_zero_numel)
{
.reg .u32 %r<9>;
.reg .u64 %rd<6>;
.reg .pred %p<3>;
.loc 16 20 0
$LDWbegin_kernel_zero:
// %r5 = ctaid.x * ntid.x + tid.x (the product uses the 24-bit multiply mul24.lo).
cvt.s32.u32 %r1, %ctaid.x;
cvt.s32.u32 %r2, %ntid.x;
mul24.lo.s32 %r3, %r1, %r2;
mov.u32 %r4, %tid.x;
add.u32 %r5, %r3, %r4;
// Bounds guard: branch straight to exit when numel <= index.
ld.param.s32 %r6, [__cudaparm_kernel_zero_numel];
setp.le.s32 %p1, %r6, %r5;
@%p1 bra $Lt_0_1026;
.loc 16 24 0
// mem[index] = 0. %rd2 (the sign-extended index) is written but never read below;
// the byte offset comes from mul.wide instead.
mov.s32 %r7, 0;
ld.param.u64 %rd1, [__cudaparm_kernel_zero_mem];
cvt.s64.s32 %rd2, %r5;
mul.wide.s32 %rd3, %r5, 4;
add.u64 %rd4, %rd1, %rd3;
st.global.s32 [%rd4+0], %r7;
$Lt_0_1026:
.loc 16 25 0
exit;
$LDWend_kernel_zero:
} // kernel_zero
// kernel_info: writes fourteen 32-bit integer constants to consecutive 4-byte
// slots of `info` (byte offsets 0 through 52).
//   info (.u64) : base address of the output array; must have room for 14 s32 values.
// The values are immediates baked in at compile time; per the .loc directives they
// come from lines 28-41 of file index 16. What each slot means is not visible here.
// NOTE(review): slot 0 (200) presumably encodes the target architecture and the
// rest are tuning parameters; confirm against the source file.
// Every thread that runs this kernel performs the same stores; there is no
// thread-index guard.
.entry kernel_info (
.param .u64 __cudaparm_kernel_info_info)
{
.reg .u32 %r<16>;
.reg .u64 %rd<3>;
.loc 16 27 0
$LDWbegin_kernel_info:
.loc 16 28 0
// info[0] = 200
ld.param.u64 %rd1, [__cudaparm_kernel_info_info];
mov.s32 %r1, 200;
st.global.s32 [%rd1+0], %r1;
.loc 16 29 0
// info[1] = 32
mov.s32 %r2, 32;
st.global.s32 [%rd1+4], %r2;
.loc 16 30 0
// info[2] = 32
mov.s32 %r3, 32;
st.global.s32 [%rd1+8], %r3;
.loc 16 31 0
// info[3] = 4
mov.s32 %r4, 4;
st.global.s32 [%rd1+12], %r4;
.loc 16 32 0
// info[4] = 8
mov.s32 %r5, 8;
st.global.s32 [%rd1+16], %r5;
.loc 16 33 0
// info[5] = 64
mov.s32 %r6, 64;
st.global.s32 [%rd1+20], %r6;
.loc 16 34 0
// info[6] = 128
mov.s32 %r7, 128;
st.global.s32 [%rd1+24], %r7;
.loc 16 35 0
// info[7] = 11
mov.s32 %r8, 11;
st.global.s32 [%rd1+28], %r8;
.loc 16 36 0
// info[8] = 8
mov.s32 %r9, 8;
st.global.s32 [%rd1+32], %r9;
.loc 16 37 0
// info[9] = 128
mov.s32 %r10, 128;
st.global.s32 [%rd1+36], %r10;
.loc 16 38 0
// info[10] = 128
mov.s32 %r11, 128;
st.global.s32 [%rd1+40], %r11;
.loc 16 39 0
// info[11] = 128
mov.s32 %r12, 128;
st.global.s32 [%rd1+44], %r12;
.loc 16 40 0
// info[12] = 128
mov.s32 %r13, 128;
st.global.s32 [%rd1+48], %r13;
.loc 16 41 0
// info[13] = 8
mov.s32 %r14, 8;
st.global.s32 [%rd1+52], %r14;
.loc 16 42 0
exit;
$LDWend_kernel_info:
} // kernel_info

View File

@ -1,88 +0,0 @@
// PTX module text (version 2.3, target sm_20, 64-bit addresses) embedded as a
// single C string built from adjacent string literals. It defines two entry
// points: kernel_zero and kernel_info.
// The string contains .loc directives that reference file index 16 but no
// .file directives. NOTE(review): confirm the PTX loader accepts .loc without
// a matching .file table.
// The literals below are runtime data; do not edit their contents by hand.
const char * device =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
// kernel_zero(mem, numel): each thread with index < numel stores a 32-bit 0
// to mem + index*4, where index = ctaid.x * ntid.x + tid.x.
" .entry kernel_zero (\n"
" .param .u64 __cudaparm_kernel_zero_mem,\n"
" .param .s32 __cudaparm_kernel_zero_numel)\n"
" {\n"
" .reg .u32 %r<9>;\n"
" .reg .u64 %rd<6>;\n"
" .reg .pred %p<3>;\n"
" .loc 16 20 0\n"
"$LDWbegin_kernel_zero:\n"
" cvt.s32.u32 %r1, %ctaid.x;\n"
" cvt.s32.u32 %r2, %ntid.x;\n"
" mul24.lo.s32 %r3, %r1, %r2;\n"
" mov.u32 %r4, %tid.x;\n"
" add.u32 %r5, %r3, %r4;\n"
// Bounds guard: skip the store when numel <= index.
" ld.param.s32 %r6, [__cudaparm_kernel_zero_numel];\n"
" setp.le.s32 %p1, %r6, %r5;\n"
" @%p1 bra $Lt_0_1026;\n"
" .loc 16 24 0\n"
" mov.s32 %r7, 0;\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_zero_mem];\n"
" cvt.s64.s32 %rd2, %r5;\n"
" mul.wide.s32 %rd3, %r5, 4;\n"
" add.u64 %rd4, %rd1, %rd3;\n"
" st.global.s32 [%rd4+0], %r7;\n"
"$Lt_0_1026:\n"
" .loc 16 25 0\n"
" exit;\n"
"$LDWend_kernel_zero:\n"
" }\n"
// kernel_info(info): stores fourteen immediate 32-bit integers to byte offsets
// 0..52 of info, in order: 200, 32, 32, 4, 8, 64, 128, 11, 8, 128, 128, 128,
// 128, 8. The meaning of each slot is not visible in this file.
" .entry kernel_info (\n"
" .param .u64 __cudaparm_kernel_info_info)\n"
" {\n"
" .reg .u32 %r<16>;\n"
" .reg .u64 %rd<3>;\n"
" .loc 16 27 0\n"
"$LDWbegin_kernel_info:\n"
" .loc 16 28 0\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_info_info];\n"
" mov.s32 %r1, 200;\n"
" st.global.s32 [%rd1+0], %r1;\n"
" .loc 16 29 0\n"
" mov.s32 %r2, 32;\n"
" st.global.s32 [%rd1+4], %r2;\n"
" .loc 16 30 0\n"
" mov.s32 %r3, 32;\n"
" st.global.s32 [%rd1+8], %r3;\n"
" .loc 16 31 0\n"
" mov.s32 %r4, 4;\n"
" st.global.s32 [%rd1+12], %r4;\n"
" .loc 16 32 0\n"
" mov.s32 %r5, 8;\n"
" st.global.s32 [%rd1+16], %r5;\n"
" .loc 16 33 0\n"
" mov.s32 %r6, 64;\n"
" st.global.s32 [%rd1+20], %r6;\n"
" .loc 16 34 0\n"
" mov.s32 %r7, 128;\n"
" st.global.s32 [%rd1+24], %r7;\n"
" .loc 16 35 0\n"
" mov.s32 %r8, 11;\n"
" st.global.s32 [%rd1+28], %r8;\n"
" .loc 16 36 0\n"
" mov.s32 %r9, 8;\n"
" st.global.s32 [%rd1+32], %r9;\n"
" .loc 16 37 0\n"
" mov.s32 %r10, 128;\n"
" st.global.s32 [%rd1+36], %r10;\n"
" .loc 16 38 0\n"
" mov.s32 %r11, 128;\n"
" st.global.s32 [%rd1+40], %r11;\n"
" .loc 16 39 0\n"
" mov.s32 %r12, 128;\n"
" st.global.s32 [%rd1+44], %r12;\n"
" .loc 16 40 0\n"
" mov.s32 %r13, 128;\n"
" st.global.s32 [%rd1+48], %r13;\n"
" .loc 16 41 0\n"
" mov.s32 %r14, 8;\n"
" st.global.s32 [%rd1+52], %r14;\n"
" .loc 16 42 0\n"
" exit;\n"
"$LDWend_kernel_info:\n"
" }\n"
;

View File

@ -1,329 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009ad9_00000000-9_lal_ellipsoid_nbor.cpp3.i (/home/sjplimp/ccBI#.7CLzz0)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009ad9_00000000-8_lal_ellipsoid_nbor.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lal_ellipsoid_nbor.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
// kernel_nbor: one thread per list entry. Each thread walks a strided list of
// candidate indices read from dev_ij, keeps the candidates that pass a "form"
// range test and a squared-distance cutoff test, writes the survivors to
// dev_nbor, and finally writes how many it kept.
//
//   x_         (.u64) : base of 16-byte records read as four f32 values. The first
//                       three are differenced to get a squared distance; the fourth
//                       is truncated to an integer and used as a table index
//                       (presumably the atom type, TODO confirm).
//   cut_form   (.u64) : table of 8-byte entries indexed by ntypes*w_i + w_j. The f32
//                       at +0 is compared against the squared distance; the f32 at +4
//                       is compared against form_low / form_high.
//   ntypes     (.s32) : row width of the cut_form table.
//   dev_nbor   (.u64) : output buffer of 4-byte entries (see layout below).
//   nbor_pitch (.s32) : stride, in 4-byte elements, between successive entries
//                       belonging to the same thread.
//   start      (.s32) : added to the global thread index to form the list index.
//   inum       (.s32) : threads whose list index is >= inum exit immediately.
//   dev_ij     (.u64) : input buffer; for list index ii, dev_ij[ii] is the central
//                       index, dev_ij[ii + pitch] the candidate count, and the
//                       candidates start at dev_ij[ii + 2*pitch] with stride pitch.
//   form_low, form_high (.s32) : inclusive bounds, converted to f32 before comparing.
//
// Output layout: kept indices go to dev_nbor[ii + 2*pitch], dev_nbor[ii + 3*pitch], ...
// and the kept count goes to dev_nbor[ii + pitch].
.entry kernel_nbor (
.param .u64 __cudaparm_kernel_nbor_x_,
.param .u64 __cudaparm_kernel_nbor_cut_form,
.param .s32 __cudaparm_kernel_nbor_ntypes,
.param .u64 __cudaparm_kernel_nbor_dev_nbor,
.param .s32 __cudaparm_kernel_nbor_nbor_pitch,
.param .s32 __cudaparm_kernel_nbor_start,
.param .s32 __cudaparm_kernel_nbor_inum,
.param .u64 __cudaparm_kernel_nbor_dev_ij,
.param .s32 __cudaparm_kernel_nbor_form_low,
.param .s32 __cudaparm_kernel_nbor_form_high)
{
.reg .u32 %r<26>;
.reg .u64 %rd<33>;
.reg .f32 %f<20>;
.reg .pred %p<8>;
.loc 16 29 0
$LDWbegin_kernel_nbor:
// %r7 = start + ctaid.x * ntid.x + tid.x : this thread's list index (ii).
cvt.s32.u32 %r1, %ctaid.x;
cvt.s32.u32 %r2, %ntid.x;
mul24.lo.s32 %r3, %r1, %r2;
mov.u32 %r4, %tid.x;
add.u32 %r5, %r3, %r4;
ld.param.s32 %r6, [__cudaparm_kernel_nbor_start];
add.u32 %r7, %r6, %r5;
// Exit when inum <= ii.
ld.param.s32 %r8, [__cudaparm_kernel_nbor_inum];
setp.le.s32 %p1, %r8, %r7;
@%p1 bra $Lt_0_4354;
.loc 16 36 0
// %r9 = dev_ij[ii] : central index.
cvt.s64.s32 %rd1, %r7;
ld.param.u64 %rd2, [__cudaparm_kernel_nbor_dev_ij];
mul.wide.s32 %rd3, %r7, 4;
add.u64 %rd4, %rd2, %rd3;
ld.global.s32 %r9, [%rd4+0];
.loc 16 38 0
// %rd6 = nbor_pitch * 4 (stride in bytes); %r11 = dev_ij[ii + pitch] : candidate count.
ld.param.s32 %r10, [__cudaparm_kernel_nbor_nbor_pitch];
cvt.s64.s32 %rd5, %r10;
mul.wide.s32 %rd6, %r10, 4;
add.u64 %rd7, %rd6, %rd4;
ld.global.s32 %r11, [%rd7+0];
.loc 16 39 0
// %rd8 = address of the first candidate; %rd9 is the moving read cursor.
add.u64 %rd8, %rd6, %rd7;
mov.s64 %rd9, %rd8;
.loc 16 41 0
// %rd14 = &dev_nbor[ii + 2*pitch] : moving write cursor for kept indices.
ld.param.u64 %rd10, [__cudaparm_kernel_nbor_dev_nbor];
add.u64 %rd11, %rd1, %rd5;
add.u64 %rd12, %rd5, %rd11;
mul.lo.u64 %rd13, %rd12, 4;
add.u64 %rd14, %rd10, %rd13;
.loc 16 43 0
// Load the central record x_[%r9] as {%f1,%f2,%f3,%f4}.
ld.param.u64 %rd15, [__cudaparm_kernel_nbor_x_];
cvt.s64.s32 %rd16, %r9;
mul.wide.s32 %rd17, %r9, 16;
add.u64 %rd18, %rd15, %rd17;
ld.global.v4.f32 {%f1,%f2,%f3,%f4}, [%rd18+0];
// %rd21 = end-of-list address = first candidate + (pitch * count) * 4 bytes.
// The product pitch * count is computed in 32 bits before widening.
cvt.s32.s64 %r12, %rd5;
mul.lo.s32 %r13, %r12, %r11;
cvt.s64.s32 %rd19, %r13;
mul.wide.s32 %rd20, %r13, 4;
add.u64 %rd21, %rd8, %rd20;
// Empty list (start >= end): skip the loop and report a count of 0.
setp.ge.u64 %p2, %rd8, %rd21;
@%p2 bra $Lt_0_6402;
// Loop-invariant setup: %r17 = ntypes * (int)%f4 (table row offset),
// %f5 = (float)form_low, %rd22 = cut_form base, %r18 = kept count = 0.
cvt.rzi.ftz.s32.f32 %r14, %f4;
ld.param.s32 %r15, [__cudaparm_kernel_nbor_form_low];
cvt.rn.f32.s32 %f5, %r15;
ld.param.s32 %r16, [__cudaparm_kernel_nbor_ntypes];
mul.lo.s32 %r17, %r16, %r14;
ld.param.u64 %rd22, [__cudaparm_kernel_nbor_cut_form];
mov.s32 %r18, 0;
$Lt_0_5378:
//<loop> Loop body line 43, nesting depth: 1, estimated iterations: unknown
.loc 16 49 0
// %r20 = candidate index with the top two bits cleared (1073741823 = 2^30 - 1).
ld.global.s32 %r19, [%rd9+0];
and.b32 %r20, %r19, 1073741823;
.loc 16 50 0
// Load the candidate record x_[%r20] as {%f6,%f7,%f8,%f9}.
cvt.s64.s32 %rd23, %r20;
mul.wide.s32 %rd24, %r20, 16;
add.u64 %rd25, %rd15, %rd24;
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd25+0];
.loc 16 53 0
// %rd28 = &cut_form[ntypes * w_i + w_j]; %f10 = second float of that entry.
cvt.rzi.ftz.s32.f32 %r21, %f9;
add.s32 %r22, %r21, %r17;
cvt.s64.s32 %rd26, %r22;
mul.wide.s32 %rd27, %r22, 8;
add.u64 %rd28, %rd22, %rd27;
ld.global.f32 %f10, [%rd28+4];
.loc 16 48 0
// Reject unless form_low <= %f10 ...
setp.le.ftz.f32 %p3, %f5, %f10;
@!%p3 bra $Lt_0_6658;
// ... and form_high >= %f10.
ld.param.s32 %r23, [__cudaparm_kernel_nbor_form_high];
cvt.rn.f32.s32 %f11, %r23;
setp.ge.ftz.f32 %p4, %f11, %f10;
@!%p4 bra $Lt_0_6658;
// %f18 = dx*dx + dy*dy + dz*dz; reject unless the entry's first float > %f18.
sub.ftz.f32 %f12, %f6, %f1;
sub.ftz.f32 %f13, %f7, %f2;
sub.ftz.f32 %f14, %f8, %f3;
ld.global.f32 %f15, [%rd28+0];
mul.ftz.f32 %f16, %f12, %f12;
fma.rn.ftz.f32 %f17, %f13, %f13, %f16;
fma.rn.ftz.f32 %f18, %f14, %f14, %f17;
setp.gt.ftz.f32 %p5, %f15, %f18;
@!%p5 bra $Lt_0_6658;
.loc 16 64 0
// Keep: store the masked index, advance the write cursor by one pitch, count it.
st.global.s32 [%rd14+0], %r20;
.loc 16 65 0
add.u64 %rd14, %rd6, %rd14;
.loc 16 66 0
add.s32 %r18, %r18, 1;
$Lt_0_6658:
$L_0_3842:
.loc 16 47 0
// Advance the read cursor by one pitch; loop while it is below the end address.
add.u64 %rd9, %rd6, %rd9;
setp.gt.u64 %p6, %rd21, %rd9;
@%p6 bra $Lt_0_5378;
bra.uni $Lt_0_4866;
$Lt_0_6402:
// Empty-list path: kept count is 0.
mov.s32 %r18, 0;
$Lt_0_4866:
.loc 16 70 0
// dev_nbor[ii + pitch] = kept count.
add.s32 %r24, %r12, %r7;
cvt.s64.s32 %rd29, %r24;
mul.wide.s32 %rd30, %r24, 4;
add.u64 %rd31, %rd10, %rd30;
st.global.s32 [%rd31+0], %r18;
$Lt_0_4354:
.loc 16 72 0
exit;
$LDWend_kernel_nbor:
} // kernel_nbor
// kernel_nbor_fast: same filtering as a per-entry neighbor scan over dev_ij, but the
// cut_form table is first copied into two per-block .shared arrays and indexed with a
// hard-coded row width of 11 (there is no ntypes parameter).
//
//   x_         (.u64) : base of 16-byte records read as four f32 values; the fourth is
//                       truncated to an integer and used as a table index
//                       (presumably the atom type, TODO confirm).
//   cut_form   (.u64) : table of 8-byte {f32, f32} entries; the first 121 entries are
//                       copied to shared memory.
//   dev_nbor   (.u64) : output; kept indices start at dev_nbor[ii + 2*pitch] with stride
//                       pitch, and the kept count goes to dev_nbor[ii + pitch].
//   nbor_pitch (.s32) : stride in 4-byte elements.
//   start, inum (.s32): list index ii = start + global thread index; threads with
//                       ii >= inum skip the scan (after the barrier).
//   dev_ij     (.u64) : input; dev_ij[ii] is the central index, dev_ij[ii + pitch] the
//                       candidate count, candidates start at dev_ij[ii + 2*pitch].
//   form_low, form_high (.s32) : inclusive bounds, compared as integers against the
//                       shared "form" value.
//
// NOTE(review): only threads with tid.x in 0..120 fill the shared tables, so this
// presumably requires at least 121 threads per block; confirm against the launcher.
.entry kernel_nbor_fast (
.param .u64 __cudaparm_kernel_nbor_fast_x_,
.param .u64 __cudaparm_kernel_nbor_fast_cut_form,
.param .u64 __cudaparm_kernel_nbor_fast_dev_nbor,
.param .s32 __cudaparm_kernel_nbor_fast_nbor_pitch,
.param .s32 __cudaparm_kernel_nbor_fast_start,
.param .s32 __cudaparm_kernel_nbor_fast_inum,
.param .u64 __cudaparm_kernel_nbor_fast_dev_ij,
.param .s32 __cudaparm_kernel_nbor_fast_form_low,
.param .s32 __cudaparm_kernel_nbor_fast_form_high)
{
.reg .u32 %r<28>;
.reg .u64 %rd<42>;
.reg .f32 %f<19>;
.reg .pred %p<9>;
// Two shared tables of 484 bytes each = 121 four-byte slots (11 x 11).
.shared .align 4 .b8 __cuda___cuda_local_var_32570_31_non_const_form120[484];
.shared .align 4 .b8 __cuda___cuda_local_var_32571_33_non_const_cutsq604[484];
.loc 16 84 0
$LDWbegin_kernel_nbor_fast:
// Threads with tid.x > 120 skip the table fill.
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 120;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_5122;
.loc 16 90 0
// Load cut_form[tid] as {%f1,%f2}; cutsq[tid] = %f1.
mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120;
mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604;
cvt.s64.s32 %rd3, %r1;
mul.wide.s32 %rd4, %r1, 4;
ld.param.u64 %rd5, [__cudaparm_kernel_nbor_fast_cut_form];
mul.wide.s32 %rd6, %r1, 8;
add.u64 %rd7, %rd5, %rd6;
ld.global.v2.f32 {%f1,%f2}, [%rd7+0];
add.u64 %rd8, %rd4, %rd2;
st.shared.f32 [%rd8+0], %f1;
.loc 16 91 0
// form[tid] = (int)%f2 (round toward zero).
cvt.rzi.ftz.s32.f32 %r3, %f2;
add.u64 %rd9, %rd4, %rd1;
st.shared.s32 [%rd9+0], %r3;
$Lt_1_5122:
// Re-materialize the shared base addresses so they are defined on both paths.
mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120;
mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604;
.loc 16 94 0
// Barrier 0: reached by every thread before any thread reads the shared tables.
bar.sync 0;
// %r9 = start + ctaid.x * ntid.x + tid.x : this thread's list index (ii).
cvt.s32.u32 %r4, %ctaid.x;
cvt.s32.u32 %r5, %ntid.x;
mul.lo.s32 %r6, %r4, %r5;
ld.param.s32 %r7, [__cudaparm_kernel_nbor_fast_start];
add.s32 %r8, %r7, %r6;
add.s32 %r9, %r8, %r1;
// Exit when inum <= ii.
ld.param.s32 %r10, [__cudaparm_kernel_nbor_fast_inum];
setp.le.s32 %p2, %r10, %r9;
@%p2 bra $Lt_1_5634;
.loc 16 98 0
// %r11 = dev_ij[ii] : central index.
cvt.s64.s32 %rd10, %r9;
ld.param.u64 %rd11, [__cudaparm_kernel_nbor_fast_dev_ij];
mul.wide.s32 %rd12, %r9, 4;
add.u64 %rd13, %rd11, %rd12;
ld.global.s32 %r11, [%rd13+0];
.loc 16 100 0
// %rd15 = nbor_pitch * 4 (stride in bytes); %r13 = dev_ij[ii + pitch] : candidate count.
ld.param.s32 %r12, [__cudaparm_kernel_nbor_fast_nbor_pitch];
cvt.s64.s32 %rd14, %r12;
mul.wide.s32 %rd15, %r12, 4;
add.u64 %rd16, %rd15, %rd13;
ld.global.s32 %r13, [%rd16+0];
.loc 16 101 0
// %rd17 = address of the first candidate; %rd18 is the moving read cursor.
add.u64 %rd17, %rd15, %rd16;
mov.s64 %rd18, %rd17;
.loc 16 103 0
// %rd23 = &dev_nbor[ii + 2*pitch] : moving write cursor for kept indices.
ld.param.u64 %rd19, [__cudaparm_kernel_nbor_fast_dev_nbor];
add.u64 %rd20, %rd10, %rd14;
add.u64 %rd21, %rd14, %rd20;
mul.lo.u64 %rd22, %rd21, 4;
add.u64 %rd23, %rd19, %rd22;
.loc 16 105 0
// Load the central record x_[%r11] as {%f3,%f4,%f5,%f6}.
ld.param.u64 %rd24, [__cudaparm_kernel_nbor_fast_x_];
cvt.s64.s32 %rd25, %r11;
mul.wide.s32 %rd26, %r11, 16;
add.u64 %rd27, %rd24, %rd26;
ld.global.v4.f32 {%f3,%f4,%f5,%f6}, [%rd27+0];
// %rd30 = end-of-list address = first candidate + (pitch * count) * 4 bytes.
cvt.s32.s64 %r14, %rd14;
mul.lo.s32 %r15, %r14, %r13;
cvt.s64.s32 %rd28, %r15;
mul.wide.s32 %rd29, %r15, 4;
add.u64 %rd30, %rd17, %rd29;
// Empty list (start >= end): skip the loop and report a count of 0.
setp.ge.u64 %p3, %rd17, %rd30;
@%p3 bra $Lt_1_7682;
// Loop-invariant setup: %r17 = (int)%f6 * 11 (shared-table row offset),
// %r18 = form_low, %r19 = kept count = 0.
cvt.rzi.ftz.s32.f32 %r16, %f6;
mul.lo.s32 %r17, %r16, 11;
ld.param.s32 %r18, [__cudaparm_kernel_nbor_fast_form_low];
mov.s32 %r19, 0;
$Lt_1_6658:
//<loop> Loop body line 105, nesting depth: 1, estimated iterations: unknown
.loc 16 112 0
// %r21 = candidate index with the top two bits cleared (1073741823 = 2^30 - 1).
ld.global.s32 %r20, [%rd18+0];
and.b32 %r21, %r20, 1073741823;
.loc 16 113 0
// Load the candidate record x_[%r21] as {%f7,%f8,%f9,%f10}.
cvt.s64.s32 %rd31, %r21;
mul.wide.s32 %rd32, %r21, 16;
add.u64 %rd33, %rd24, %rd32;
ld.global.v4.f32 {%f7,%f8,%f9,%f10}, [%rd33+0];
.loc 16 111 0
// %r24 = form[w_i * 11 + w_j] from shared memory.
cvt.rzi.ftz.s32.f32 %r22, %f10;
add.s32 %r23, %r22, %r17;
cvt.s64.s32 %rd34, %r23;
mul.wide.s32 %rd35, %r23, 4;
add.u64 %rd36, %rd35, %rd1;
ld.shared.s32 %r24, [%rd36+0];
// Reject when form < form_low ...
setp.lt.s32 %p4, %r24, %r18;
@%p4 bra $Lt_1_7938;
// ... or form_high < form.
ld.param.s32 %r25, [__cudaparm_kernel_nbor_fast_form_high];
setp.lt.s32 %p5, %r25, %r24;
@%p5 bra $Lt_1_7938;
// %f17 = dx*dx + dy*dy + dz*dz; reject unless cutsq[w_i * 11 + w_j] > %f17.
sub.ftz.f32 %f11, %f7, %f3;
sub.ftz.f32 %f12, %f8, %f4;
sub.ftz.f32 %f13, %f9, %f5;
add.u64 %rd37, %rd35, %rd2;
ld.shared.f32 %f14, [%rd37+0];
mul.ftz.f32 %f15, %f11, %f11;
fma.rn.ftz.f32 %f16, %f12, %f12, %f15;
fma.rn.ftz.f32 %f17, %f13, %f13, %f16;
setp.gt.ftz.f32 %p6, %f14, %f17;
@!%p6 bra $Lt_1_7938;
.loc 16 127 0
// Keep: store the masked index, advance the write cursor by one pitch, count it.
st.global.s32 [%rd23+0], %r21;
.loc 16 128 0
add.u64 %rd23, %rd15, %rd23;
.loc 16 129 0
add.s32 %r19, %r19, 1;
$Lt_1_7938:
$L_1_4610:
.loc 16 110 0
// Advance the read cursor by one pitch; loop while it is below the end address.
add.u64 %rd18, %rd15, %rd18;
setp.gt.u64 %p7, %rd30, %rd18;
@%p7 bra $Lt_1_6658;
bra.uni $Lt_1_6146;
$Lt_1_7682:
// Empty-list path: kept count is 0.
mov.s32 %r19, 0;
$Lt_1_6146:
.loc 16 133 0
// dev_nbor[ii + pitch] = kept count.
add.s32 %r26, %r14, %r9;
cvt.s64.s32 %rd38, %r26;
mul.wide.s32 %rd39, %r26, 4;
add.u64 %rd40, %rd19, %rd39;
st.global.s32 [%rd40+0], %r19;
$Lt_1_5634:
.loc 16 135 0
exit;
$LDWend_kernel_nbor_fast:
} // kernel_nbor_fast

View File

@ -1,281 +0,0 @@
// PTX module text (version 2.3, target sm_20, 64-bit addresses) embedded as a
// single C string built from adjacent string literals. It defines two entry
// points: kernel_nbor and kernel_nbor_fast. Both walk a strided candidate list
// read from dev_ij, keep candidates that pass a "form" range test and a
// squared-distance cutoff test, and write the kept indices and their count to
// dev_nbor.
// The string contains .loc directives that reference file index 16 but no
// .file directives. NOTE(review): confirm the PTX loader accepts .loc without
// a matching .file table.
// The literals below are runtime data; do not edit their contents by hand.
const char * ellipsoid_nbor =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
// kernel_nbor: cutoff/form table (cut_form) is read from global memory and
// indexed by ntypes * w_i + w_j, 8 bytes per entry.
" .entry kernel_nbor (\n"
" .param .u64 __cudaparm_kernel_nbor_x_,\n"
" .param .u64 __cudaparm_kernel_nbor_cut_form,\n"
" .param .s32 __cudaparm_kernel_nbor_ntypes,\n"
" .param .u64 __cudaparm_kernel_nbor_dev_nbor,\n"
" .param .s32 __cudaparm_kernel_nbor_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_nbor_start,\n"
" .param .s32 __cudaparm_kernel_nbor_inum,\n"
" .param .u64 __cudaparm_kernel_nbor_dev_ij,\n"
" .param .s32 __cudaparm_kernel_nbor_form_low,\n"
" .param .s32 __cudaparm_kernel_nbor_form_high)\n"
" {\n"
" .reg .u32 %r<26>;\n"
" .reg .u64 %rd<33>;\n"
" .reg .f32 %f<20>;\n"
" .reg .pred %p<8>;\n"
" .loc 16 29 0\n"
"$LDWbegin_kernel_nbor:\n"
// List index ii (%r7) = start + ctaid.x * ntid.x + tid.x; exit when inum <= ii.
" cvt.s32.u32 %r1, %ctaid.x;\n"
" cvt.s32.u32 %r2, %ntid.x;\n"
" mul24.lo.s32 %r3, %r1, %r2;\n"
" mov.u32 %r4, %tid.x;\n"
" add.u32 %r5, %r3, %r4;\n"
" ld.param.s32 %r6, [__cudaparm_kernel_nbor_start];\n"
" add.u32 %r7, %r6, %r5;\n"
" ld.param.s32 %r8, [__cudaparm_kernel_nbor_inum];\n"
" setp.le.s32 %p1, %r8, %r7;\n"
" @%p1 bra $Lt_0_4354;\n"
" .loc 16 36 0\n"
// Central index = dev_ij[ii]; candidate count = dev_ij[ii + pitch];
// candidates begin at dev_ij[ii + 2*pitch] with stride pitch.
" cvt.s64.s32 %rd1, %r7;\n"
" ld.param.u64 %rd2, [__cudaparm_kernel_nbor_dev_ij];\n"
" mul.wide.s32 %rd3, %r7, 4;\n"
" add.u64 %rd4, %rd2, %rd3;\n"
" ld.global.s32 %r9, [%rd4+0];\n"
" .loc 16 38 0\n"
" ld.param.s32 %r10, [__cudaparm_kernel_nbor_nbor_pitch];\n"
" cvt.s64.s32 %rd5, %r10;\n"
" mul.wide.s32 %rd6, %r10, 4;\n"
" add.u64 %rd7, %rd6, %rd4;\n"
" ld.global.s32 %r11, [%rd7+0];\n"
" .loc 16 39 0\n"
" add.u64 %rd8, %rd6, %rd7;\n"
" mov.s64 %rd9, %rd8;\n"
" .loc 16 41 0\n"
// Write cursor %rd14 = &dev_nbor[ii + 2*pitch].
" ld.param.u64 %rd10, [__cudaparm_kernel_nbor_dev_nbor];\n"
" add.u64 %rd11, %rd1, %rd5;\n"
" add.u64 %rd12, %rd5, %rd11;\n"
" mul.lo.u64 %rd13, %rd12, 4;\n"
" add.u64 %rd14, %rd10, %rd13;\n"
" .loc 16 43 0\n"
// Load the central 16-byte record and compute the end-of-list address %rd21.
" ld.param.u64 %rd15, [__cudaparm_kernel_nbor_x_];\n"
" cvt.s64.s32 %rd16, %r9;\n"
" mul.wide.s32 %rd17, %r9, 16;\n"
" add.u64 %rd18, %rd15, %rd17;\n"
" ld.global.v4.f32 {%f1,%f2,%f3,%f4}, [%rd18+0];\n"
" cvt.s32.s64 %r12, %rd5;\n"
" mul.lo.s32 %r13, %r12, %r11;\n"
" cvt.s64.s32 %rd19, %r13;\n"
" mul.wide.s32 %rd20, %r13, 4;\n"
" add.u64 %rd21, %rd8, %rd20;\n"
// Empty list: skip the loop.
" setp.ge.u64 %p2, %rd8, %rd21;\n"
" @%p2 bra $Lt_0_6402;\n"
" cvt.rzi.ftz.s32.f32 %r14, %f4;\n"
" ld.param.s32 %r15, [__cudaparm_kernel_nbor_form_low];\n"
" cvt.rn.f32.s32 %f5, %r15;\n"
" ld.param.s32 %r16, [__cudaparm_kernel_nbor_ntypes];\n"
" mul.lo.s32 %r17, %r16, %r14;\n"
" ld.param.u64 %rd22, [__cudaparm_kernel_nbor_cut_form];\n"
" mov.s32 %r18, 0;\n"
// Loop over candidates; %r18 counts the kept ones.
"$Lt_0_5378:\n"
" .loc 16 49 0\n"
// Mask off the top two bits of the stored candidate index (1073741823 = 2^30 - 1).
" ld.global.s32 %r19, [%rd9+0];\n"
" and.b32 %r20, %r19, 1073741823;\n"
" .loc 16 50 0\n"
" cvt.s64.s32 %rd23, %r20;\n"
" mul.wide.s32 %rd24, %r20, 16;\n"
" add.u64 %rd25, %rd15, %rd24;\n"
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd25+0];\n"
" .loc 16 53 0\n"
" cvt.rzi.ftz.s32.f32 %r21, %f9;\n"
" add.s32 %r22, %r21, %r17;\n"
" cvt.s64.s32 %rd26, %r22;\n"
" mul.wide.s32 %rd27, %r22, 8;\n"
" add.u64 %rd28, %rd22, %rd27;\n"
" ld.global.f32 %f10, [%rd28+4];\n"
" .loc 16 48 0\n"
// Form range test: keep only when form_low <= form <= form_high (as floats).
" setp.le.ftz.f32 %p3, %f5, %f10;\n"
" @!%p3 bra $Lt_0_6658;\n"
" ld.param.s32 %r23, [__cudaparm_kernel_nbor_form_high];\n"
" cvt.rn.f32.s32 %f11, %r23;\n"
" setp.ge.ftz.f32 %p4, %f11, %f10;\n"
" @!%p4 bra $Lt_0_6658;\n"
// Squared-distance test against the first float of the table entry.
" sub.ftz.f32 %f12, %f6, %f1;\n"
" sub.ftz.f32 %f13, %f7, %f2;\n"
" sub.ftz.f32 %f14, %f8, %f3;\n"
" ld.global.f32 %f15, [%rd28+0];\n"
" mul.ftz.f32 %f16, %f12, %f12;\n"
" fma.rn.ftz.f32 %f17, %f13, %f13, %f16;\n"
" fma.rn.ftz.f32 %f18, %f14, %f14, %f17;\n"
" setp.gt.ftz.f32 %p5, %f15, %f18;\n"
" @!%p5 bra $Lt_0_6658;\n"
" .loc 16 64 0\n"
// Keep this candidate: store it, advance the write cursor, bump the count.
" st.global.s32 [%rd14+0], %r20;\n"
" .loc 16 65 0\n"
" add.u64 %rd14, %rd6, %rd14;\n"
" .loc 16 66 0\n"
" add.s32 %r18, %r18, 1;\n"
"$Lt_0_6658:\n"
"$L_0_3842:\n"
" .loc 16 47 0\n"
" add.u64 %rd9, %rd6, %rd9;\n"
" setp.gt.u64 %p6, %rd21, %rd9;\n"
" @%p6 bra $Lt_0_5378;\n"
" bra.uni $Lt_0_4866;\n"
"$Lt_0_6402:\n"
" mov.s32 %r18, 0;\n"
"$Lt_0_4866:\n"
" .loc 16 70 0\n"
// dev_nbor[ii + pitch] = kept count.
" add.s32 %r24, %r12, %r7;\n"
" cvt.s64.s32 %rd29, %r24;\n"
" mul.wide.s32 %rd30, %r24, 4;\n"
" add.u64 %rd31, %rd10, %rd30;\n"
" st.global.s32 [%rd31+0], %r18;\n"
"$Lt_0_4354:\n"
" .loc 16 72 0\n"
" exit;\n"
"$LDWend_kernel_nbor:\n"
" }\n"
// kernel_nbor_fast: the first 121 cut_form entries are copied into two shared
// tables (121 four-byte slots each) and indexed by w_i * 11 + w_j; the form
// test is done on integers.
" .entry kernel_nbor_fast (\n"
" .param .u64 __cudaparm_kernel_nbor_fast_x_,\n"
" .param .u64 __cudaparm_kernel_nbor_fast_cut_form,\n"
" .param .u64 __cudaparm_kernel_nbor_fast_dev_nbor,\n"
" .param .s32 __cudaparm_kernel_nbor_fast_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_nbor_fast_start,\n"
" .param .s32 __cudaparm_kernel_nbor_fast_inum,\n"
" .param .u64 __cudaparm_kernel_nbor_fast_dev_ij,\n"
" .param .s32 __cudaparm_kernel_nbor_fast_form_low,\n"
" .param .s32 __cudaparm_kernel_nbor_fast_form_high)\n"
" {\n"
" .reg .u32 %r<28>;\n"
" .reg .u64 %rd<42>;\n"
" .reg .f32 %f<19>;\n"
" .reg .pred %p<9>;\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32570_31_non_const_form120[484];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32571_33_non_const_cutsq604[484];\n"
" .loc 16 84 0\n"
"$LDWbegin_kernel_nbor_fast:\n"
// Threads with tid.x <= 120 each copy one cut_form entry into shared memory.
" cvt.s32.u32 %r1, %tid.x;\n"
" mov.u32 %r2, 120;\n"
" setp.gt.s32 %p1, %r1, %r2;\n"
" @%p1 bra $Lt_1_5122;\n"
" .loc 16 90 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120;\n"
" mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604;\n"
" cvt.s64.s32 %rd3, %r1;\n"
" mul.wide.s32 %rd4, %r1, 4;\n"
" ld.param.u64 %rd5, [__cudaparm_kernel_nbor_fast_cut_form];\n"
" mul.wide.s32 %rd6, %r1, 8;\n"
" add.u64 %rd7, %rd5, %rd6;\n"
" ld.global.v2.f32 {%f1,%f2}, [%rd7+0];\n"
" add.u64 %rd8, %rd4, %rd2;\n"
" st.shared.f32 [%rd8+0], %f1;\n"
" .loc 16 91 0\n"
" cvt.rzi.ftz.s32.f32 %r3, %f2;\n"
" add.u64 %rd9, %rd4, %rd1;\n"
" st.shared.s32 [%rd9+0], %r3;\n"
"$Lt_1_5122:\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120;\n"
" mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604;\n"
" .loc 16 94 0\n"
// Barrier before any thread reads the shared tables.
" bar.sync 0;\n"
// List index ii (%r9) = start + ctaid.x * ntid.x + tid.x; exit when inum <= ii.
" cvt.s32.u32 %r4, %ctaid.x;\n"
" cvt.s32.u32 %r5, %ntid.x;\n"
" mul.lo.s32 %r6, %r4, %r5;\n"
" ld.param.s32 %r7, [__cudaparm_kernel_nbor_fast_start];\n"
" add.s32 %r8, %r7, %r6;\n"
" add.s32 %r9, %r8, %r1;\n"
" ld.param.s32 %r10, [__cudaparm_kernel_nbor_fast_inum];\n"
" setp.le.s32 %p2, %r10, %r9;\n"
" @%p2 bra $Lt_1_5634;\n"
" .loc 16 98 0\n"
// Central index = dev_ij[ii]; candidate count = dev_ij[ii + pitch];
// candidates begin at dev_ij[ii + 2*pitch] with stride pitch.
" cvt.s64.s32 %rd10, %r9;\n"
" ld.param.u64 %rd11, [__cudaparm_kernel_nbor_fast_dev_ij];\n"
" mul.wide.s32 %rd12, %r9, 4;\n"
" add.u64 %rd13, %rd11, %rd12;\n"
" ld.global.s32 %r11, [%rd13+0];\n"
" .loc 16 100 0\n"
" ld.param.s32 %r12, [__cudaparm_kernel_nbor_fast_nbor_pitch];\n"
" cvt.s64.s32 %rd14, %r12;\n"
" mul.wide.s32 %rd15, %r12, 4;\n"
" add.u64 %rd16, %rd15, %rd13;\n"
" ld.global.s32 %r13, [%rd16+0];\n"
" .loc 16 101 0\n"
" add.u64 %rd17, %rd15, %rd16;\n"
" mov.s64 %rd18, %rd17;\n"
" .loc 16 103 0\n"
// Write cursor %rd23 = &dev_nbor[ii + 2*pitch].
" ld.param.u64 %rd19, [__cudaparm_kernel_nbor_fast_dev_nbor];\n"
" add.u64 %rd20, %rd10, %rd14;\n"
" add.u64 %rd21, %rd14, %rd20;\n"
" mul.lo.u64 %rd22, %rd21, 4;\n"
" add.u64 %rd23, %rd19, %rd22;\n"
" .loc 16 105 0\n"
// Load the central 16-byte record and compute the end-of-list address %rd30.
" ld.param.u64 %rd24, [__cudaparm_kernel_nbor_fast_x_];\n"
" cvt.s64.s32 %rd25, %r11;\n"
" mul.wide.s32 %rd26, %r11, 16;\n"
" add.u64 %rd27, %rd24, %rd26;\n"
" ld.global.v4.f32 {%f3,%f4,%f5,%f6}, [%rd27+0];\n"
" cvt.s32.s64 %r14, %rd14;\n"
" mul.lo.s32 %r15, %r14, %r13;\n"
" cvt.s64.s32 %rd28, %r15;\n"
" mul.wide.s32 %rd29, %r15, 4;\n"
" add.u64 %rd30, %rd17, %rd29;\n"
// Empty list: skip the loop.
" setp.ge.u64 %p3, %rd17, %rd30;\n"
" @%p3 bra $Lt_1_7682;\n"
" cvt.rzi.ftz.s32.f32 %r16, %f6;\n"
" mul.lo.s32 %r17, %r16, 11;\n"
" ld.param.s32 %r18, [__cudaparm_kernel_nbor_fast_form_low];\n"
" mov.s32 %r19, 0;\n"
// Loop over candidates; %r19 counts the kept ones.
"$Lt_1_6658:\n"
" .loc 16 112 0\n"
// Mask off the top two bits of the stored candidate index (1073741823 = 2^30 - 1).
" ld.global.s32 %r20, [%rd18+0];\n"
" and.b32 %r21, %r20, 1073741823;\n"
" .loc 16 113 0\n"
" cvt.s64.s32 %rd31, %r21;\n"
" mul.wide.s32 %rd32, %r21, 16;\n"
" add.u64 %rd33, %rd24, %rd32;\n"
" ld.global.v4.f32 {%f7,%f8,%f9,%f10}, [%rd33+0];\n"
" .loc 16 111 0\n"
// Form range test on the shared integer table: reject when form < form_low
// or form_high < form.
" cvt.rzi.ftz.s32.f32 %r22, %f10;\n"
" add.s32 %r23, %r22, %r17;\n"
" cvt.s64.s32 %rd34, %r23;\n"
" mul.wide.s32 %rd35, %r23, 4;\n"
" add.u64 %rd36, %rd35, %rd1;\n"
" ld.shared.s32 %r24, [%rd36+0];\n"
" setp.lt.s32 %p4, %r24, %r18;\n"
" @%p4 bra $Lt_1_7938;\n"
" ld.param.s32 %r25, [__cudaparm_kernel_nbor_fast_form_high];\n"
" setp.lt.s32 %p5, %r25, %r24;\n"
" @%p5 bra $Lt_1_7938;\n"
// Squared-distance test against the shared cutoff table.
" sub.ftz.f32 %f11, %f7, %f3;\n"
" sub.ftz.f32 %f12, %f8, %f4;\n"
" sub.ftz.f32 %f13, %f9, %f5;\n"
" add.u64 %rd37, %rd35, %rd2;\n"
" ld.shared.f32 %f14, [%rd37+0];\n"
" mul.ftz.f32 %f15, %f11, %f11;\n"
" fma.rn.ftz.f32 %f16, %f12, %f12, %f15;\n"
" fma.rn.ftz.f32 %f17, %f13, %f13, %f16;\n"
" setp.gt.ftz.f32 %p6, %f14, %f17;\n"
" @!%p6 bra $Lt_1_7938;\n"
" .loc 16 127 0\n"
// Keep this candidate: store it, advance the write cursor, bump the count.
" st.global.s32 [%rd23+0], %r21;\n"
" .loc 16 128 0\n"
" add.u64 %rd23, %rd15, %rd23;\n"
" .loc 16 129 0\n"
" add.s32 %r19, %r19, 1;\n"
"$Lt_1_7938:\n"
"$L_1_4610:\n"
" .loc 16 110 0\n"
" add.u64 %rd18, %rd15, %rd18;\n"
" setp.gt.u64 %p7, %rd30, %rd18;\n"
" @%p7 bra $Lt_1_6658;\n"
" bra.uni $Lt_1_6146;\n"
"$Lt_1_7682:\n"
" mov.s32 %r19, 0;\n"
"$Lt_1_6146:\n"
" .loc 16 133 0\n"
// dev_nbor[ii + pitch] = kept count.
" add.s32 %r26, %r14, %r9;\n"
" cvt.s64.s32 %rd38, %r26;\n"
" mul.wide.s32 %rd39, %r26, 4;\n"
" add.u64 %rd40, %rd19, %rd39;\n"
" st.global.s32 [%rd40+0], %r19;\n"
"$Lt_1_5634:\n"
" .loc 16 135 0\n"
" exit;\n"
"$LDWend_kernel_nbor_fast:\n"
" }\n"
;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,901 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009c40_00000000-9_lal_lj.cpp3.i (/home/sjplimp/ccBI#.N4UW9Z)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009c40_00000000-8_lal_lj.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lal_lj.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
.global .texref pos_tex;
.entry kernel_pair (
.param .u64 __cudaparm_kernel_pair_x_,
.param .u64 __cudaparm_kernel_pair_lj1,
.param .u64 __cudaparm_kernel_pair_lj3,
.param .s32 __cudaparm_kernel_pair_lj_types,
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_dev_nbor,
.param .u64 __cudaparm_kernel_pair_dev_packed,
.param .u64 __cudaparm_kernel_pair_ans,
.param .u64 __cudaparm_kernel_pair___val_paramengv,
.param .s32 __cudaparm_kernel_pair_eflag,
.param .s32 __cudaparm_kernel_pair_vflag,
.param .s32 __cudaparm_kernel_pair_inum,
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_t_per_atom)
{
.reg .u32 %r<72>;
.reg .u64 %rd<63>;
.reg .f32 %f<102>;
.reg .pred %p<19>;
.shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];
.shared .align 4 .b8 __cuda___cuda_local_var_32600_55_non_const_red_acc108[3072];
// __cuda_local_var_32543_10_non_const_f = 48
// __cuda_local_var_32545_9_non_const_virial = 16
.loc 16 31 0
$LDWbegin_kernel_pair:
.loc 16 36 0
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
ldu.global.f32 %f1, [%rd1+0];
.loc 16 37 0
ld.global.f32 %f2, [%rd1+4];
.loc 16 38 0
ld.global.f32 %f3, [%rd1+8];
.loc 16 39 0
ld.global.f32 %f4, [%rd1+12];
st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
.loc 16 46 0
mov.f32 %f5, 0f00000000; // 0
mov.f32 %f6, %f5;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, %f7;
mov.f32 %f9, 0f00000000; // 0
mov.f32 %f10, %f9;
mov.f32 %f11, 0f00000000; // 0
mov.f32 %f12, %f11;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, %f13;
mov.f32 %f15, 0f00000000; // 0
mov.f32 %f16, %f15;
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
cvt.s32.u32 %r6, %ctaid.x;
mul.lo.s32 %r7, %r6, %r5;
add.s32 %r8, %r3, %r7;
ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];
setp.ge.s32 %p1, %r8, %r9;
@%p1 bra $Lt_0_26370;
.loc 16 51 0
ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];
cvt.s64.s32 %rd2, %r10;
mul.wide.s32 %rd3, %r10, 4;
cvt.s64.s32 %rd4, %r8;
mul.wide.s32 %rd5, %r8, 4;
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
add.u64 %rd7, %rd5, %rd6;
add.u64 %rd8, %rd3, %rd7;
ld.global.s32 %r11, [%rd8+0];
sub.s32 %r12, %r1, 1;
and.b32 %r13, %r12, %r2;
cvt.s64.s32 %rd9, %r13;
mul.wide.s32 %rd10, %r13, 4;
ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];
setp.ne.u64 %p2, %rd11, %rd6;
@%p2 bra $Lt_0_19458;
cvt.s32.s64 %r14, %rd2;
mul.lo.s32 %r15, %r14, %r1;
mov.s32 %r16, %r15;
mul.lo.s32 %r17, %r12, %r8;
add.s32 %r18, %r14, %r17;
cvt.s64.s32 %rd12, %r18;
mul.wide.s32 %rd13, %r18, 4;
add.u64 %rd14, %rd8, %rd13;
and.b32 %r19, %r12, %r11;
cvt.s64.s32 %rd15, %r19;
div.s32 %r20, %r11, %r1;
mul.lo.s32 %r21, %r15, %r20;
cvt.s64.s32 %rd16, %r21;
add.u64 %rd17, %rd15, %rd16;
mul.lo.u64 %rd18, %rd17, 4;
add.u64 %rd19, %rd14, %rd18;
add.u64 %rd20, %rd10, %rd14;
bra.uni $Lt_0_19202;
$Lt_0_19458:
add.u64 %rd21, %rd3, %rd8;
ld.global.s32 %r22, [%rd21+0];
cvt.s64.s32 %rd22, %r22;
mul.wide.s32 %rd23, %r22, 4;
add.u64 %rd24, %rd11, %rd23;
cvt.s64.s32 %rd25, %r11;
mul.wide.s32 %rd26, %r11, 4;
add.u64 %rd19, %rd24, %rd26;
mov.s32 %r16, %r1;
add.u64 %rd20, %rd10, %rd24;
$Lt_0_19202:
.loc 16 54 0
ld.global.s32 %r23, [%rd7+0];
mov.u32 %r24, %r23;
mov.s32 %r25, 0;
mov.u32 %r26, %r25;
mov.s32 %r27, 0;
mov.u32 %r28, %r27;
mov.s32 %r29, 0;
mov.u32 %r30, %r29;
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];
mov.f32 %f21, %f17;
mov.f32 %f22, %f18;
mov.f32 %f23, %f19;
mov.f32 %f24, %f20;
setp.ge.u64 %p3, %rd20, %rd19;
@%p3 bra $Lt_0_27906;
cvt.rzi.ftz.s32.f32 %r31, %f24;
cvt.s64.s32 %rd27, %r16;
ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];
mul.lo.s32 %r33, %r32, %r31;
ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;
$Lt_0_20226:
//<loop> Loop body line 54, nesting depth: 1, estimated iterations: unknown
.loc 16 60 0
ld.global.s32 %r34, [%rd20+0];
.loc 16 61 0
shr.s32 %r35, %r34, 30;
and.b32 %r36, %r35, 3;
cvt.s64.s32 %rd30, %r36;
mul.wide.s32 %rd31, %r36, 4;
add.u64 %rd32, %rd29, %rd31;
ld.shared.f32 %f29, [%rd32+0];
.loc 16 64 0
and.b32 %r37, %r34, 1073741823;
mov.u32 %r38, %r37;
mov.s32 %r39, 0;
mov.u32 %r40, %r39;
mov.s32 %r41, 0;
mov.u32 %r42, %r41;
mov.s32 %r43, 0;
mov.u32 %r44, %r43;
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];
mov.f32 %f34, %f30;
mov.f32 %f35, %f31;
mov.f32 %f36, %f32;
mov.f32 %f37, %f33;
cvt.rzi.ftz.s32.f32 %r45, %f37;
sub.ftz.f32 %f38, %f22, %f35;
sub.ftz.f32 %f39, %f21, %f34;
sub.ftz.f32 %f40, %f23, %f36;
mul.ftz.f32 %f41, %f38, %f38;
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
add.s32 %r46, %r45, %r33;
cvt.s64.s32 %rd33, %r46;
mul.wide.s32 %rd34, %r46, 16;
add.u64 %rd35, %rd34, %rd28;
ld.global.f32 %f44, [%rd35+8];
setp.gt.ftz.f32 %p4, %f44, %f43;
@!%p4 bra $Lt_0_21506;
.loc 16 78 0
rcp.approx.ftz.f32 %f45, %f43;
mul.ftz.f32 %f46, %f45, %f45;
mul.ftz.f32 %f47, %f45, %f46;
mul.ftz.f32 %f48, %f45, %f47;
ld.global.v2.f32 {%f49,%f50}, [%rd35+0];
mul.ftz.f32 %f51, %f49, %f47;
sub.ftz.f32 %f52, %f51, %f50;
mul.ftz.f32 %f53, %f48, %f52;
mul.ftz.f32 %f54, %f29, %f53;
.loc 16 80 0
fma.rn.ftz.f32 %f27, %f39, %f54, %f27;
.loc 16 81 0
fma.rn.ftz.f32 %f26, %f38, %f54, %f26;
.loc 16 82 0
fma.rn.ftz.f32 %f25, %f40, %f54, %f25;
ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];
mov.u32 %r48, 0;
setp.le.s32 %p5, %r47, %r48;
@%p5 bra $Lt_0_20994;
.loc 16 86 0
ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];
add.u64 %rd37, %rd36, %rd34;
ld.global.v4.f32 {%f55,%f56,%f57,_}, [%rd37+0];
mul.ftz.f32 %f58, %f55, %f47;
sub.ftz.f32 %f59, %f58, %f56;
mul.ftz.f32 %f60, %f47, %f59;
sub.ftz.f32 %f61, %f60, %f57;
fma.rn.ftz.f32 %f28, %f29, %f61, %f28;
$Lt_0_20994:
ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];
mov.u32 %r50, 0;
setp.le.s32 %p6, %r49, %r50;
@%p6 bra $Lt_0_21506;
.loc 16 89 0
mov.f32 %f62, %f6;
mul.ftz.f32 %f63, %f39, %f39;
fma.rn.ftz.f32 %f64, %f54, %f63, %f62;
mov.f32 %f6, %f64;
.loc 16 90 0
mov.f32 %f65, %f8;
fma.rn.ftz.f32 %f66, %f54, %f41, %f65;
mov.f32 %f8, %f66;
.loc 16 91 0
mov.f32 %f67, %f10;
mul.ftz.f32 %f68, %f40, %f40;
fma.rn.ftz.f32 %f69, %f54, %f68, %f67;
mov.f32 %f10, %f69;
.loc 16 92 0
mov.f32 %f70, %f12;
mul.ftz.f32 %f71, %f38, %f39;
fma.rn.ftz.f32 %f72, %f54, %f71, %f70;
mov.f32 %f12, %f72;
.loc 16 93 0
mov.f32 %f73, %f14;
mul.ftz.f32 %f74, %f39, %f40;
fma.rn.ftz.f32 %f75, %f54, %f74, %f73;
mov.f32 %f14, %f75;
.loc 16 94 0
mul.ftz.f32 %f76, %f38, %f40;
fma.rn.ftz.f32 %f15, %f54, %f76, %f15;
mov.f32 %f16, %f15;
$Lt_0_21506:
$Lt_0_20482:
.loc 16 58 0
mul.lo.u64 %rd38, %rd27, 4;
add.u64 %rd20, %rd20, %rd38;
setp.lt.u64 %p7, %rd20, %rd19;
@%p7 bra $Lt_0_20226;
bra.uni $Lt_0_19714;
$Lt_0_27906:
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
$Lt_0_19714:
mov.u32 %r51, 1;
setp.le.s32 %p8, %r1, %r51;
@%p8 bra $Lt_0_24322;
.loc 16 99 0
mov.u64 %rd39, __cuda___cuda_local_var_32600_55_non_const_red_acc108;
cvt.s64.s32 %rd40, %r2;
mul.wide.s32 %rd41, %r2, 4;
add.u64 %rd42, %rd39, %rd41;
mov.f32 %f77, %f27;
st.shared.f32 [%rd42+0], %f77;
mov.f32 %f78, %f26;
st.shared.f32 [%rd42+512], %f78;
mov.f32 %f79, %f25;
st.shared.f32 [%rd42+1024], %f79;
mov.f32 %f80, %f28;
st.shared.f32 [%rd42+1536], %f80;
shr.s32 %r52, %r1, 31;
mov.s32 %r53, 1;
and.b32 %r54, %r52, %r53;
add.s32 %r55, %r54, %r1;
shr.s32 %r56, %r55, 1;
mov.s32 %r57, %r56;
mov.u32 %r58, 0;
setp.ne.u32 %p9, %r56, %r58;
@!%p9 bra $Lt_0_22786;
$Lt_0_23298:
setp.ge.u32 %p10, %r13, %r57;
@%p10 bra $Lt_0_23554;
add.u32 %r59, %r2, %r57;
cvt.u64.u32 %rd43, %r59;
mul.wide.u32 %rd44, %r59, 4;
add.u64 %rd45, %rd39, %rd44;
ld.shared.f32 %f81, [%rd45+0];
add.ftz.f32 %f77, %f81, %f77;
st.shared.f32 [%rd42+0], %f77;
ld.shared.f32 %f82, [%rd45+512];
add.ftz.f32 %f78, %f82, %f78;
st.shared.f32 [%rd42+512], %f78;
ld.shared.f32 %f83, [%rd45+1024];
add.ftz.f32 %f79, %f83, %f79;
st.shared.f32 [%rd42+1024], %f79;
ld.shared.f32 %f84, [%rd45+1536];
add.ftz.f32 %f80, %f84, %f80;
st.shared.f32 [%rd42+1536], %f80;
$Lt_0_23554:
shr.u32 %r57, %r57, 1;
mov.u32 %r60, 0;
setp.ne.u32 %p11, %r57, %r60;
@%p11 bra $Lt_0_23298;
$Lt_0_22786:
mov.f32 %f27, %f77;
mov.f32 %f26, %f78;
mov.f32 %f25, %f79;
mov.f32 %f28, %f80;
ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];
mov.u32 %r62, 0;
setp.le.s32 %p12, %r61, %r62;
@%p12 bra $Lt_0_24322;
mov.f32 %f77, %f6;
st.shared.f32 [%rd42+0], %f77;
mov.f32 %f78, %f8;
st.shared.f32 [%rd42+512], %f78;
mov.f32 %f79, %f10;
st.shared.f32 [%rd42+1024], %f79;
mov.f32 %f80, %f12;
st.shared.f32 [%rd42+1536], %f80;
mov.f32 %f85, %f14;
st.shared.f32 [%rd42+2048], %f85;
mov.f32 %f86, %f15;
st.shared.f32 [%rd42+2560], %f86;
mov.s32 %r63, %r56;
@!%p9 bra $Lt_0_24834;
$Lt_0_25346:
setp.ge.u32 %p13, %r13, %r63;
@%p13 bra $Lt_0_25602;
add.u32 %r64, %r2, %r63;
cvt.u64.u32 %rd46, %r64;
mul.wide.u32 %rd47, %r64, 4;
add.u64 %rd48, %rd39, %rd47;
ld.shared.f32 %f87, [%rd48+0];
add.ftz.f32 %f77, %f87, %f77;
st.shared.f32 [%rd42+0], %f77;
ld.shared.f32 %f88, [%rd48+512];
add.ftz.f32 %f78, %f88, %f78;
st.shared.f32 [%rd42+512], %f78;
ld.shared.f32 %f89, [%rd48+1024];
add.ftz.f32 %f79, %f89, %f79;
st.shared.f32 [%rd42+1024], %f79;
ld.shared.f32 %f90, [%rd48+1536];
add.ftz.f32 %f80, %f90, %f80;
st.shared.f32 [%rd42+1536], %f80;
ld.shared.f32 %f91, [%rd48+2048];
add.ftz.f32 %f85, %f91, %f85;
st.shared.f32 [%rd42+2048], %f85;
ld.shared.f32 %f92, [%rd48+2560];
add.ftz.f32 %f86, %f92, %f86;
st.shared.f32 [%rd42+2560], %f86;
$Lt_0_25602:
shr.u32 %r63, %r63, 1;
mov.u32 %r65, 0;
setp.ne.u32 %p14, %r63, %r65;
@%p14 bra $Lt_0_25346;
$Lt_0_24834:
mov.f32 %f6, %f77;
mov.f32 %f8, %f78;
mov.f32 %f10, %f79;
mov.f32 %f12, %f80;
mov.f32 %f14, %f85;
mov.f32 %f16, %f86;
$Lt_0_24322:
$Lt_0_22274:
mov.u32 %r66, 0;
setp.ne.s32 %p15, %r13, %r66;
@%p15 bra $Lt_0_26370;
ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];
add.u64 %rd50, %rd49, %rd5;
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
mov.u32 %r68, 0;
setp.le.s32 %p16, %r67, %r68;
@%p16 bra $Lt_0_26882;
st.global.f32 [%rd50+0], %f28;
cvt.s64.s32 %rd51, %r9;
mul.wide.s32 %rd52, %r9, 4;
add.u64 %rd50, %rd50, %rd52;
$Lt_0_26882:
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
mov.u32 %r70, 0;
setp.le.s32 %p17, %r69, %r70;
@%p17 bra $Lt_0_27394;
mov.f32 %f93, %f6;
st.global.f32 [%rd50+0], %f93;
cvt.s64.s32 %rd53, %r9;
mul.wide.s32 %rd54, %r9, 4;
add.u64 %rd55, %rd54, %rd50;
mov.f32 %f94, %f8;
st.global.f32 [%rd55+0], %f94;
add.u64 %rd56, %rd54, %rd55;
mov.f32 %f95, %f10;
st.global.f32 [%rd56+0], %f95;
add.u64 %rd57, %rd54, %rd56;
mov.f32 %f96, %f12;
st.global.f32 [%rd57+0], %f96;
add.u64 %rd50, %rd54, %rd57;
mov.f32 %f97, %f14;
st.global.f32 [%rd50+0], %f97;
mov.f32 %f98, %f16;
add.u64 %rd58, %rd54, %rd50;
st.global.f32 [%rd58+0], %f98;
$Lt_0_27394:
ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];
mul.lo.u64 %rd60, %rd4, 16;
add.u64 %rd61, %rd59, %rd60;
mov.f32 %f99, %f100;
st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f99};
$Lt_0_26370:
$Lt_0_18690:
.loc 16 102 0
exit;
$LDWend_kernel_pair:
} // kernel_pair
//-----------------------------------------------------------
// kernel_pair_fast
//
// Per-atom pair accumulation that first copies the coefficient
// tables into shared memory and then walks one atom's neighbor
// list with t_per_atom cooperating threads.
//
// Parameters (as used by the code below):
//   x_           : never read in this kernel; positions are fetched
//                  through the module-level texture pos_tex
//   lj1_in       : global table of 16-byte entries; the first 121 are
//                  copied to shared memory by threads 0..120
//   lj3_in       : same layout; copied only when eflag > 0
//   sp_lj_in     : four floats, copied to shared by threads 0..3
//   dev_nbor     : neighbor bookkeeping array (s32 entries)
//   dev_packed   : neighbor list storage; when equal to dev_nbor the
//                  list is read strided out of dev_nbor itself
//   ans          : output, one 16-byte {x,y,z,w} vector per atom
//   engv         : output, float columns of length inum
//                  (1 column if eflag > 0, then 6 if vflag > 0)
//   eflag, vflag : > 0 enables the energy / six-component accumulation
//   inum         : number of atoms handled by the grid
//   nbor_pitch   : row pitch (in elements) of dev_nbor
//   t_per_atom   : threads cooperating on one atom
//
// NOTE(review): the lane index is computed as tid & (t_per_atom-1),
// which matches tid % t_per_atom only for power-of-two t_per_atom --
// presumably guaranteed by the host; confirm.
// NOTE(review): red_acc is 6 rows of 512 bytes (128 floats) indexed by
// tid, so this assumes ntid.x <= 128 -- confirm against the launcher.
//
// Fix relative to the compiler output: the 4th component stored to
// ans used to come from %f107, a register that is never written; it
// is now an explicit 0.0 so the stored value is defined.
//-----------------------------------------------------------
.entry kernel_pair_fast (
.param .u64 __cudaparm_kernel_pair_fast_x_,
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
.param .u64 __cudaparm_kernel_pair_fast_ans,
.param .u64 __cudaparm_kernel_pair_fast___val_paramengv,
.param .s32 __cudaparm_kernel_pair_fast_eflag,
.param .s32 __cudaparm_kernel_pair_fast_vflag,
.param .s32 __cudaparm_kernel_pair_fast_inum,
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
{
.reg .u32 %r<74>;
.reg .u64 %rd<75>;
.reg .f32 %f<109>;
.reg .pred %p<22>;
// Shared staging: 4 scale factors, two 121-entry (11*11) tables of
// 16 bytes each, and the 6 x 128-float reduction scratch area.
.shared .align 4 .b8 __cuda___cuda_local_var_32617_33_non_const_sp_lj3268[16];
.shared .align 16 .b8 __cuda___cuda_local_var_32615_34_non_const_lj13296[1936];
.shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_lj35232[1936];
.shared .align 4 .b8 __cuda___cuda_local_var_32685_55_non_const_red_acc7168[3072];
// __cuda_local_var_32627_10_non_const_f = 48
// __cuda_local_var_32629_9_non_const_virial = 16
.loc 16 110 0
$LDWbegin_kernel_pair_fast:
// %r1 = tid.x. Threads 0..3 copy sp_lj_in[tid] into shared sp_lj.
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 3;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_20994;
.loc 16 118 0
mov.u64 %rd1, __cuda___cuda_local_var_32617_33_non_const_sp_lj3268;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd3, %r1, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_20994:
mov.u64 %rd1, __cuda___cuda_local_var_32617_33_non_const_sp_lj3268;
// Threads 0..120 copy one 16-byte lj1 entry each (and the matching
// lj3 entry when eflag > 0) from global to shared memory.
mov.u32 %r3, 120;
setp.gt.s32 %p2, %r1, %r3;
@%p2 bra $Lt_1_21506;
.loc 16 120 0
mov.u64 %rd7, __cuda___cuda_local_var_32615_34_non_const_lj13296;
cvt.s64.s32 %rd8, %r1;
mul.wide.s32 %rd9, %r1, 16;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
add.u64 %rd11, %rd10, %rd9;
add.u64 %rd12, %rd9, %rd7;
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r5, 0;
setp.le.s32 %p3, %r4, %r5;
@%p3 bra $Lt_1_22018;
.loc 16 122 0
mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232;
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
add.u64 %rd15, %rd14, %rd9;
add.u64 %rd16, %rd9, %rd13;
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
$Lt_1_22018:
mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232;
$Lt_1_21506:
// Re-materialise the shared table base addresses on every path.
mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232;
mov.u64 %rd7, __cuda___cuda_local_var_32615_34_non_const_lj13296;
.loc 16 130 0
// Zero the six vflag accumulators (%f11,%f13,%f15,%f17,%f19,%f21).
mov.f32 %f10, 0f00000000; // 0
mov.f32 %f11, %f10;
mov.f32 %f12, 0f00000000; // 0
mov.f32 %f13, %f12;
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, %f14;
mov.f32 %f16, 0f00000000; // 0
mov.f32 %f17, %f16;
mov.f32 %f18, 0f00000000; // 0
mov.f32 %f19, %f18;
mov.f32 %f20, 0f00000000; // 0
mov.f32 %f21, %f20;
.loc 16 132 0
// Block-wide barrier: the shared tables are complete past this point.
bar.sync 0;
// %r12 = atom index = tid/t_per_atom + ctaid.x*(ntid.x/t_per_atom).
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
div.s32 %r7, %r1, %r6;
cvt.s32.u32 %r8, %ntid.x;
div.s32 %r9, %r8, %r6;
cvt.s32.u32 %r10, %ctaid.x;
mul.lo.s32 %r11, %r10, %r9;
add.s32 %r12, %r7, %r11;
ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];
setp.ge.s32 %p4, %r12, %r13;
// Threads whose atom index is >= inum have nothing to do.
@%p4 bra $Lt_1_30210;
.loc 16 137 0
// %rd22 = &dev_nbor[atom], %rd23 = one pitch row further.
// %r15 = dev_nbor[atom + nbor_pitch] (used below as the neighbor count).
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];
cvt.s64.s32 %rd17, %r14;
mul.wide.s32 %rd18, %r14, 4;
cvt.s64.s32 %rd19, %r12;
mul.wide.s32 %rd20, %r12, 4;
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
add.u64 %rd22, %rd20, %rd21;
add.u64 %rd23, %rd18, %rd22;
ld.global.s32 %r15, [%rd23+0];
// %r17 = lane of this thread within its atom group: tid & (t_per_atom-1).
sub.s32 %r16, %r6, 1;
and.b32 %r17, %r16, %r1;
cvt.s64.s32 %rd24, %r17;
mul.wide.s32 %rd25, %r17, 4;
ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];
setp.ne.u64 %p5, %rd26, %rd21;
@%p5 bra $Lt_1_23298;
// dev_packed == dev_nbor: list lives in dev_nbor itself.
//   stride (%r20) = nbor_pitch * t_per_atom elements
//   base   (%rd29) = %rd23 + 4*(nbor_pitch + (t_per_atom-1)*atom)
//   end    (%rd34) = base + 4*((count & (t_per_atom-1)) + stride*(count/t_per_atom))
//   cursor (%rd35) = base + 4*lane
cvt.s32.s64 %r18, %rd17;
mul.lo.s32 %r19, %r18, %r6;
mov.s32 %r20, %r19;
mul.lo.s32 %r21, %r16, %r12;
add.s32 %r22, %r18, %r21;
cvt.s64.s32 %rd27, %r22;
mul.wide.s32 %rd28, %r22, 4;
add.u64 %rd29, %rd23, %rd28;
and.b32 %r23, %r16, %r15;
cvt.s64.s32 %rd30, %r23;
div.s32 %r24, %r15, %r6;
mul.lo.s32 %r25, %r19, %r24;
cvt.s64.s32 %rd31, %r25;
add.u64 %rd32, %rd30, %rd31;
mul.lo.u64 %rd33, %rd32, 4;
add.u64 %rd34, %rd29, %rd33;
add.u64 %rd35, %rd25, %rd29;
bra.uni $Lt_1_23042;
$Lt_1_23298:
// Separate packed list: the next pitch row of dev_nbor holds an
// element offset into dev_packed; the list is contiguous there.
//   base = dev_packed + 4*offset, end = base + 4*count,
//   stride = t_per_atom, cursor = base + 4*lane
add.u64 %rd36, %rd18, %rd23;
ld.global.s32 %r26, [%rd36+0];
cvt.s64.s32 %rd37, %r26;
mul.wide.s32 %rd38, %r26, 4;
add.u64 %rd39, %rd26, %rd38;
cvt.s64.s32 %rd40, %r15;
mul.wide.s32 %rd41, %r15, 4;
add.u64 %rd34, %rd39, %rd41;
mov.s32 %r20, %r6;
add.u64 %rd35, %rd25, %rd39;
$Lt_1_23042:
.loc 16 140 0
// Fetch this atom's 4-float record from pos_tex, indexed by
// dev_nbor[atom]: (%f26,%f27,%f28) = x,y,z and %f29 = w.
ld.global.s32 %r27, [%rd22+0];
mov.u32 %r28, %r27;
mov.s32 %r29, 0;
mov.u32 %r30, %r29;
mov.s32 %r31, 0;
mov.u32 %r32, %r31;
mov.s32 %r33, 0;
mov.u32 %r34, %r33;
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];
mov.f32 %f26, %f22;
mov.f32 %f27, %f23;
mov.f32 %f28, %f24;
mov.f32 %f29, %f25;
// Empty slice for this lane: skip the loop with zeroed accumulators.
setp.ge.u64 %p6, %rd35, %rd34;
@%p6 bra $Lt_1_31746;
// %f30 = float(11 * int(w)): row offset into the 11x11 shared tables.
cvt.rzi.ftz.s32.f32 %r35, %f29;
cvt.s64.s32 %rd42, %r20;
mul.lo.s32 %r36, %r35, 11;
cvt.rn.f32.s32 %f30, %r36;
// Per-thread partial sums: %f33,%f32,%f31 -> ans.x,.y,.z; %f34 -> engv.
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_24066:
 //<loop> Loop body line 140, nesting depth: 1, estimated iterations: unknown
.loc 16 147 0
// %r37 = packed neighbor entry.
ld.global.s32 %r37, [%rd35+0];
.loc 16 148 0
// Top two bits select one of the four sp_lj scale factors (%f35).
shr.s32 %r38, %r37, 30;
and.b32 %r39, %r38, 3;
cvt.s64.s32 %rd43, %r39;
mul.wide.s32 %rd44, %r39, 4;
add.u64 %rd45, %rd1, %rd44;
ld.shared.f32 %f35, [%rd45+0];
.loc 16 151 0
// Low 30 bits are the neighbor's index into pos_tex.
and.b32 %r40, %r37, 1073741823;
mov.u32 %r41, %r40;
mov.s32 %r42, 0;
mov.u32 %r43, %r42;
mov.s32 %r44, 0;
mov.u32 %r45, %r44;
mov.s32 %r46, 0;
mov.u32 %r47, %r46;
tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}];
mov.f32 %f40, %f36;
mov.f32 %f41, %f37;
mov.f32 %f42, %f38;
mov.f32 %f43, %f39;
// %f45,%f44,%f46 = dx,dy,dz (this atom minus neighbor); %f49 = r^2.
sub.ftz.f32 %f44, %f27, %f41;
sub.ftz.f32 %f45, %f26, %f40;
sub.ftz.f32 %f46, %f28, %f42;
mul.ftz.f32 %f47, %f44, %f44;
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
fma.rn.ftz.f32 %f49, %f46, %f46, %f48;
// Table index = int(11*w_i + w_j); %rd48 = &lj1[index] in shared.
add.ftz.f32 %f50, %f30, %f43;
cvt.rzi.ftz.s32.f32 %r48, %f50;
cvt.s64.s32 %rd46, %r48;
mul.wide.s32 %rd47, %r48, 16;
add.u64 %rd48, %rd47, %rd7;
// Process the pair only if lj1[index].z > r^2
// (presumably the squared cutoff -- confirm against lal_lj.cu).
ld.shared.f32 %f51, [%rd48+8];
setp.gt.ftz.f32 %p7, %f51, %f49;
@!%p7 bra $Lt_1_25346;
.loc 16 163 0
// %f52 = 1/r^2 (approx), %f54 = (1/r^2)^3,
// %f61 = sp * (1/r^2) * (1/r^6) * (lj1.x * (1/r^6) - lj1.y).
rcp.approx.ftz.f32 %f52, %f49;
mul.ftz.f32 %f53, %f52, %f52;
mul.ftz.f32 %f54, %f52, %f53;
mul.ftz.f32 %f55, %f52, %f35;
mul.ftz.f32 %f56, %f54, %f55;
ld.shared.v2.f32 {%f57,%f58}, [%rd48+0];
mul.ftz.f32 %f59, %f57, %f54;
sub.ftz.f32 %f60, %f59, %f58;
mul.ftz.f32 %f61, %f56, %f60;
.loc 16 165 0
fma.rn.ftz.f32 %f33, %f45, %f61, %f33;
.loc 16 166 0
fma.rn.ftz.f32 %f32, %f44, %f61, %f32;
.loc 16 167 0
fma.rn.ftz.f32 %f31, %f46, %f61, %f31;
ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r50, 0;
setp.le.s32 %p8, %r49, %r50;
@%p8 bra $Lt_1_24834;
.loc 16 170 0
// eflag > 0: %f34 += sp * ((1/r^6)*(lj3.x*(1/r^6) - lj3.y) - lj3.z).
add.u64 %rd49, %rd47, %rd13;
ld.shared.v4.f32 {%f62,%f63,%f64,_}, [%rd49+0];
mul.ftz.f32 %f65, %f62, %f54;
sub.ftz.f32 %f66, %f65, %f63;
mul.ftz.f32 %f67, %f54, %f66;
.loc 16 171 0
sub.ftz.f32 %f68, %f67, %f64;
fma.rn.ftz.f32 %f34, %f35, %f68, %f34;
$Lt_1_24834:
ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r52, 0;
setp.le.s32 %p9, %r51, %r52;
@%p9 bra $Lt_1_25346;
.loc 16 174 0
// vflag > 0: accumulate %f61 times dx*dx, dy*dy, dz*dz, dx*dy,
// dx*dz, dy*dz into %f11,%f13,%f15,%f17,%f19,%f20 respectively.
mov.f32 %f69, %f11;
mul.ftz.f32 %f70, %f45, %f45;
fma.rn.ftz.f32 %f71, %f61, %f70, %f69;
mov.f32 %f11, %f71;
.loc 16 175 0
mov.f32 %f72, %f13;
fma.rn.ftz.f32 %f73, %f61, %f47, %f72;
mov.f32 %f13, %f73;
.loc 16 176 0
mov.f32 %f74, %f15;
mul.ftz.f32 %f75, %f46, %f46;
fma.rn.ftz.f32 %f76, %f61, %f75, %f74;
mov.f32 %f15, %f76;
.loc 16 177 0
mov.f32 %f77, %f17;
mul.ftz.f32 %f78, %f44, %f45;
fma.rn.ftz.f32 %f79, %f61, %f78, %f77;
mov.f32 %f17, %f79;
.loc 16 178 0
mov.f32 %f80, %f19;
mul.ftz.f32 %f81, %f45, %f46;
fma.rn.ftz.f32 %f82, %f61, %f81, %f80;
mov.f32 %f19, %f82;
.loc 16 179 0
mul.ftz.f32 %f83, %f44, %f46;
fma.rn.ftz.f32 %f20, %f61, %f83, %f20;
// %f21 mirrors %f20; %f21 is the copy read by the final store.
mov.f32 %f21, %f20;
$Lt_1_25346:
$Lt_1_24322:
.loc 16 145 0
// Advance the cursor by stride elements (4 bytes each); loop until end.
mul.lo.u64 %rd50, %rd42, 4;
add.u64 %rd35, %rd35, %rd50;
setp.lt.u64 %p10, %rd35, %rd34;
@%p10 bra $Lt_1_24066;
bra.uni $Lt_1_23554;
$Lt_1_31746:
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_23554:
// With t_per_atom <= 1 there is nothing to combine across threads.
mov.u32 %r53, 1;
setp.le.s32 %p11, %r6, %r53;
@%p11 bra $Lt_1_28162;
.loc 16 184 0
// Publish the four partial sums to red_acc rows 0..3 at column tid
// (rows are 512 bytes apart).
mov.u64 %rd51, __cuda___cuda_local_var_32685_55_non_const_red_acc7168;
cvt.s64.s32 %rd52, %r1;
mul.wide.s32 %rd53, %r1, 4;
add.u64 %rd54, %rd51, %rd53;
mov.f32 %f84, %f33;
st.shared.f32 [%rd54+0], %f84;
mov.f32 %f85, %f32;
st.shared.f32 [%rd54+512], %f85;
mov.f32 %f86, %f31;
st.shared.f32 [%rd54+1024], %f86;
mov.f32 %f87, %f34;
st.shared.f32 [%rd54+1536], %f87;
// %r58 = t_per_atom / 2 (signed divide by two, rounding toward zero).
shr.s32 %r54, %r6, 31;
mov.s32 %r55, 1;
and.b32 %r56, %r54, %r55;
add.s32 %r57, %r56, %r6;
shr.s32 %r58, %r57, 1;
mov.s32 %r59, %r58;
mov.u32 %r60, 0;
setp.ne.u32 %p12, %r58, %r60;
@!%p12 bra $Lt_1_26626;
$Lt_1_27138:
// Halving loop: for s = t_per_atom/2; s != 0; s >>= 1, lanes below s
// add column tid+s into their running sums and re-publish them.
// No barrier is issued inside this loop.
// NOTE(review): presumably relies on all lanes of one atom executing
// in lock-step within a warp -- confirm.
setp.ge.u32 %p13, %r17, %r59;
@%p13 bra $Lt_1_27394;
add.u32 %r61, %r1, %r59;
cvt.u64.u32 %rd55, %r61;
mul.wide.u32 %rd56, %r61, 4;
add.u64 %rd57, %rd51, %rd56;
ld.shared.f32 %f88, [%rd57+0];
add.ftz.f32 %f84, %f88, %f84;
st.shared.f32 [%rd54+0], %f84;
ld.shared.f32 %f89, [%rd57+512];
add.ftz.f32 %f85, %f89, %f85;
st.shared.f32 [%rd54+512], %f85;
ld.shared.f32 %f90, [%rd57+1024];
add.ftz.f32 %f86, %f90, %f86;
st.shared.f32 [%rd54+1024], %f86;
ld.shared.f32 %f91, [%rd57+1536];
add.ftz.f32 %f87, %f91, %f87;
st.shared.f32 [%rd54+1536], %f87;
$Lt_1_27394:
shr.u32 %r59, %r59, 1;
mov.u32 %r62, 0;
setp.ne.u32 %p14, %r59, %r62;
@%p14 bra $Lt_1_27138;
$Lt_1_26626:
// Copy the combined sums back into the accumulator registers.
mov.f32 %f33, %f84;
mov.f32 %f32, %f85;
mov.f32 %f31, %f86;
mov.f32 %f34, %f87;
ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r64, 0;
setp.le.s32 %p15, %r63, %r64;
@%p15 bra $Lt_1_28162;
// vflag > 0: repeat the same halving scheme for the six vflag
// accumulators, now using all six red_acc rows.
mov.f32 %f84, %f11;
st.shared.f32 [%rd54+0], %f84;
mov.f32 %f85, %f13;
st.shared.f32 [%rd54+512], %f85;
mov.f32 %f86, %f15;
st.shared.f32 [%rd54+1024], %f86;
mov.f32 %f87, %f17;
st.shared.f32 [%rd54+1536], %f87;
mov.f32 %f92, %f19;
st.shared.f32 [%rd54+2048], %f92;
mov.f32 %f93, %f20;
st.shared.f32 [%rd54+2560], %f93;
mov.s32 %r65, %r58;
@!%p12 bra $Lt_1_28674;
$Lt_1_29186:
setp.ge.u32 %p16, %r17, %r65;
@%p16 bra $Lt_1_29442;
add.u32 %r66, %r1, %r65;
cvt.u64.u32 %rd58, %r66;
mul.wide.u32 %rd59, %r66, 4;
add.u64 %rd60, %rd51, %rd59;
ld.shared.f32 %f94, [%rd60+0];
add.ftz.f32 %f84, %f94, %f84;
st.shared.f32 [%rd54+0], %f84;
ld.shared.f32 %f95, [%rd60+512];
add.ftz.f32 %f85, %f95, %f85;
st.shared.f32 [%rd54+512], %f85;
ld.shared.f32 %f96, [%rd60+1024];
add.ftz.f32 %f86, %f96, %f86;
st.shared.f32 [%rd54+1024], %f86;
ld.shared.f32 %f97, [%rd60+1536];
add.ftz.f32 %f87, %f97, %f87;
st.shared.f32 [%rd54+1536], %f87;
ld.shared.f32 %f98, [%rd60+2048];
add.ftz.f32 %f92, %f98, %f92;
st.shared.f32 [%rd54+2048], %f92;
ld.shared.f32 %f99, [%rd60+2560];
add.ftz.f32 %f93, %f99, %f93;
st.shared.f32 [%rd54+2560], %f93;
$Lt_1_29442:
shr.u32 %r65, %r65, 1;
mov.u32 %r67, 0;
setp.ne.u32 %p17, %r65, %r67;
@%p17 bra $Lt_1_29186;
$Lt_1_28674:
mov.f32 %f11, %f84;
mov.f32 %f13, %f85;
mov.f32 %f15, %f86;
mov.f32 %f17, %f87;
mov.f32 %f19, %f92;
mov.f32 %f21, %f93;
$Lt_1_28162:
$Lt_1_26114:
// Only lane 0 of each atom group writes results.
mov.u32 %r68, 0;
setp.ne.s32 %p18, %r17, %r68;
@%p18 bra $Lt_1_30210;
// %rd62 = &engv[atom]; consecutive outputs are inum floats apart.
ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];
add.u64 %rd62, %rd61, %rd20;
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r70, 0;
setp.le.s32 %p19, %r69, %r70;
@%p19 bra $Lt_1_30722;
// eflag > 0: store %f34, then step to the next engv column.
st.global.f32 [%rd62+0], %f34;
cvt.s64.s32 %rd63, %r13;
mul.wide.s32 %rd64, %r13, 4;
add.u64 %rd62, %rd62, %rd64;
$Lt_1_30722:
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r72, 0;
setp.le.s32 %p20, %r71, %r72;
@%p20 bra $Lt_1_31234;
// vflag > 0: store the six accumulators into six successive columns.
mov.f32 %f100, %f11;
st.global.f32 [%rd62+0], %f100;
cvt.s64.s32 %rd65, %r13;
mul.wide.s32 %rd66, %r13, 4;
add.u64 %rd67, %rd66, %rd62;
mov.f32 %f101, %f13;
st.global.f32 [%rd67+0], %f101;
add.u64 %rd68, %rd66, %rd67;
mov.f32 %f102, %f15;
st.global.f32 [%rd68+0], %f102;
add.u64 %rd69, %rd66, %rd68;
mov.f32 %f103, %f17;
st.global.f32 [%rd69+0], %f103;
add.u64 %rd62, %rd66, %rd69;
mov.f32 %f104, %f19;
st.global.f32 [%rd62+0], %f104;
mov.f32 %f105, %f21;
add.u64 %rd70, %rd66, %rd62;
st.global.f32 [%rd70+0], %f105;
$Lt_1_31234:
// ans[atom] = {%f33, %f32, %f31, 0}. The 4th component is given a
// defined value (0.0) instead of the never-written register %f107.
ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];
mul.lo.u64 %rd72, %rd19, 16;
add.u64 %rd73, %rd71, %rd72;
mov.f32 %f106, 0f00000000; // 0
st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f106};
$Lt_1_30210:
$Lt_1_22530:
.loc 16 187 0
exit;
$LDWend_kernel_pair_fast:
} // kernel_pair_fast

View File

@ -1,901 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009c89_00000000-9_lal_lj96.cpp3.i (/home/sjplimp/ccBI#.pOwwSL)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009c89_00000000-8_lal_lj96.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lal_lj96.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
.global .texref pos_tex;
//-----------------------------------------------------------
// kernel_pair (lal_lj96.cu)
//
// Per-atom pair accumulation reading its coefficient tables directly
// from global memory; t_per_atom threads cooperate on one atom's
// neighbor list.
//
// Parameters (as used by the code below):
//   x_           : never read in this kernel; positions are fetched
//                  through the module-level texture pos_tex
//   lj1, lj3     : global tables of 16-byte entries, indexed by
//                  lj_types * int(w_i) + int(w_j)
//   lj_types     : row length of the lj1/lj3 tables
//   sp_lj_in     : four floats, copied to shared by every thread
//   dev_nbor     : neighbor bookkeeping array (s32 entries)
//   dev_packed   : neighbor list storage; when equal to dev_nbor the
//                  list is read strided out of dev_nbor itself
//   ans          : output, one 16-byte {x,y,z,w} vector per atom
//   engv         : output, float columns of length inum
//                  (1 column if eflag > 0, then 6 if vflag > 0)
//   eflag, vflag : > 0 enables the energy / six-component accumulation
//   inum         : number of atoms handled by the grid
//   nbor_pitch   : row pitch (in elements) of dev_nbor
//   t_per_atom   : threads cooperating on one atom
//
// NOTE(review): the lane index is computed as tid & (t_per_atom-1),
// which matches tid % t_per_atom only for power-of-two t_per_atom --
// presumably guaranteed by the host; confirm.
// NOTE(review): red_acc is 6 rows of 512 bytes (128 floats) indexed by
// tid, so this assumes ntid.x <= 128 -- confirm against the launcher.
//
// Fix relative to the compiler output: the 4th component stored to
// ans used to come from %f101, a register that is never written; it
// is now an explicit 0.0 so the stored value is defined.
//-----------------------------------------------------------
.entry kernel_pair (
.param .u64 __cudaparm_kernel_pair_x_,
.param .u64 __cudaparm_kernel_pair_lj1,
.param .u64 __cudaparm_kernel_pair_lj3,
.param .s32 __cudaparm_kernel_pair_lj_types,
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_dev_nbor,
.param .u64 __cudaparm_kernel_pair_dev_packed,
.param .u64 __cudaparm_kernel_pair_ans,
.param .u64 __cudaparm_kernel_pair___val_paramengv,
.param .s32 __cudaparm_kernel_pair_eflag,
.param .s32 __cudaparm_kernel_pair_vflag,
.param .s32 __cudaparm_kernel_pair_inum,
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_t_per_atom)
{
.reg .u32 %r<72>;
.reg .u64 %rd<63>;
.reg .f32 %f<103>;
.reg .pred %p<19>;
// Shared staging: 4 scale factors and the 6 x 128-float reduction scratch.
.shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];
.shared .align 4 .b8 __cuda___cuda_local_var_32601_55_non_const_red_acc108[3072];
// __cuda_local_var_32543_10_non_const_f = 48
// __cuda_local_var_32545_9_non_const_virial = 16
.loc 16 31 0
$LDWbegin_kernel_pair:
.loc 16 36 0
// Every thread loads the four sp_lj_in floats and stores all four to
// shared sp_lj; no barrier follows, each thread has written the full
// table itself before it reads from it.
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
ldu.global.f32 %f1, [%rd1+0];
.loc 16 37 0
ld.global.f32 %f2, [%rd1+4];
.loc 16 38 0
ld.global.f32 %f3, [%rd1+8];
.loc 16 39 0
ld.global.f32 %f4, [%rd1+12];
st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
.loc 16 46 0
// Zero the six vflag accumulators (%f6,%f8,%f10,%f12,%f14,%f16).
mov.f32 %f5, 0f00000000; // 0
mov.f32 %f6, %f5;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, %f7;
mov.f32 %f9, 0f00000000; // 0
mov.f32 %f10, %f9;
mov.f32 %f11, 0f00000000; // 0
mov.f32 %f12, %f11;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, %f13;
mov.f32 %f15, 0f00000000; // 0
mov.f32 %f16, %f15;
// %r2 = tid.x; %r8 = atom index = tid/t_per_atom + ctaid.x*(ntid.x/t_per_atom).
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
cvt.s32.u32 %r6, %ctaid.x;
mul.lo.s32 %r7, %r6, %r5;
add.s32 %r8, %r3, %r7;
ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];
setp.ge.s32 %p1, %r8, %r9;
// Threads whose atom index is >= inum have nothing to do.
@%p1 bra $Lt_0_26370;
.loc 16 51 0
// %rd7 = &dev_nbor[atom], %rd8 = one pitch row further.
// %r11 = dev_nbor[atom + nbor_pitch] (used below as the neighbor count).
ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];
cvt.s64.s32 %rd2, %r10;
mul.wide.s32 %rd3, %r10, 4;
cvt.s64.s32 %rd4, %r8;
mul.wide.s32 %rd5, %r8, 4;
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
add.u64 %rd7, %rd5, %rd6;
add.u64 %rd8, %rd3, %rd7;
ld.global.s32 %r11, [%rd8+0];
// %r13 = lane of this thread within its atom group: tid & (t_per_atom-1).
sub.s32 %r12, %r1, 1;
and.b32 %r13, %r12, %r2;
cvt.s64.s32 %rd9, %r13;
mul.wide.s32 %rd10, %r13, 4;
ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];
setp.ne.u64 %p2, %rd11, %rd6;
@%p2 bra $Lt_0_19458;
// dev_packed == dev_nbor: list lives in dev_nbor itself.
//   stride (%r16) = nbor_pitch * t_per_atom elements
//   base   (%rd14) = %rd8 + 4*(nbor_pitch + (t_per_atom-1)*atom)
//   end    (%rd19) = base + 4*((count & (t_per_atom-1)) + stride*(count/t_per_atom))
//   cursor (%rd20) = base + 4*lane
cvt.s32.s64 %r14, %rd2;
mul.lo.s32 %r15, %r14, %r1;
mov.s32 %r16, %r15;
mul.lo.s32 %r17, %r12, %r8;
add.s32 %r18, %r14, %r17;
cvt.s64.s32 %rd12, %r18;
mul.wide.s32 %rd13, %r18, 4;
add.u64 %rd14, %rd8, %rd13;
and.b32 %r19, %r12, %r11;
cvt.s64.s32 %rd15, %r19;
div.s32 %r20, %r11, %r1;
mul.lo.s32 %r21, %r15, %r20;
cvt.s64.s32 %rd16, %r21;
add.u64 %rd17, %rd15, %rd16;
mul.lo.u64 %rd18, %rd17, 4;
add.u64 %rd19, %rd14, %rd18;
add.u64 %rd20, %rd10, %rd14;
bra.uni $Lt_0_19202;
$Lt_0_19458:
// Separate packed list: the next pitch row of dev_nbor holds an
// element offset into dev_packed; the list is contiguous there.
//   base = dev_packed + 4*offset, end = base + 4*count,
//   stride = t_per_atom, cursor = base + 4*lane
add.u64 %rd21, %rd3, %rd8;
ld.global.s32 %r22, [%rd21+0];
cvt.s64.s32 %rd22, %r22;
mul.wide.s32 %rd23, %r22, 4;
add.u64 %rd24, %rd11, %rd23;
cvt.s64.s32 %rd25, %r11;
mul.wide.s32 %rd26, %r11, 4;
add.u64 %rd19, %rd24, %rd26;
mov.s32 %r16, %r1;
add.u64 %rd20, %rd10, %rd24;
$Lt_0_19202:
.loc 16 54 0
// Fetch this atom's 4-float record from pos_tex, indexed by
// dev_nbor[atom]: (%f21,%f22,%f23) = x,y,z and %f24 = w.
ld.global.s32 %r23, [%rd7+0];
mov.u32 %r24, %r23;
mov.s32 %r25, 0;
mov.u32 %r26, %r25;
mov.s32 %r27, 0;
mov.u32 %r28, %r27;
mov.s32 %r29, 0;
mov.u32 %r30, %r29;
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];
mov.f32 %f21, %f17;
mov.f32 %f22, %f18;
mov.f32 %f23, %f19;
mov.f32 %f24, %f20;
// Empty slice for this lane: skip the loop with zeroed accumulators.
setp.ge.u64 %p3, %rd20, %rd19;
@%p3 bra $Lt_0_27906;
// %r33 = lj_types * int(w): row offset into the lj1/lj3 tables.
cvt.rzi.ftz.s32.f32 %r31, %f24;
cvt.s64.s32 %rd27, %r16;
ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];
mul.lo.s32 %r33, %r32, %r31;
ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];
// Per-thread partial sums: %f27,%f26,%f25 -> ans.x,.y,.z; %f28 -> engv.
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;
$Lt_0_20226:
 //<loop> Loop body line 54, nesting depth: 1, estimated iterations: unknown
.loc 16 60 0
// %r34 = packed neighbor entry.
ld.global.s32 %r34, [%rd20+0];
.loc 16 61 0
// Top two bits select one of the four sp_lj scale factors (%f29).
shr.s32 %r35, %r34, 30;
and.b32 %r36, %r35, 3;
cvt.s64.s32 %rd30, %r36;
mul.wide.s32 %rd31, %r36, 4;
add.u64 %rd32, %rd29, %rd31;
ld.shared.f32 %f29, [%rd32+0];
.loc 16 64 0
// Low 30 bits are the neighbor's index into pos_tex.
and.b32 %r37, %r34, 1073741823;
mov.u32 %r38, %r37;
mov.s32 %r39, 0;
mov.u32 %r40, %r39;
mov.s32 %r41, 0;
mov.u32 %r42, %r41;
mov.s32 %r43, 0;
mov.u32 %r44, %r43;
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];
mov.f32 %f34, %f30;
mov.f32 %f35, %f31;
mov.f32 %f36, %f32;
mov.f32 %f37, %f33;
cvt.rzi.ftz.s32.f32 %r45, %f37;
// %f39,%f38,%f40 = dx,dy,dz (this atom minus neighbor); %f43 = r^2.
sub.ftz.f32 %f38, %f22, %f35;
sub.ftz.f32 %f39, %f21, %f34;
sub.ftz.f32 %f40, %f23, %f36;
mul.ftz.f32 %f41, %f38, %f38;
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
// Table index = lj_types*int(w_i) + int(w_j); %rd35 = &lj1[index].
add.s32 %r46, %r45, %r33;
cvt.s64.s32 %rd33, %r46;
mul.wide.s32 %rd34, %r46, 16;
add.u64 %rd35, %rd34, %rd28;
// Process the pair only if lj1[index].z > r^2
// (presumably the squared cutoff -- confirm against lal_lj96.cu).
ld.global.f32 %f44, [%rd35+8];
setp.gt.ftz.f32 %p4, %f44, %f43;
@!%p4 bra $Lt_0_21506;
.loc 16 79 0
// %f45 = 1/r^2 (approx), %f47 = (1/r^2)^3, %f48 = sqrt(%f47),
// %f55 = sp * (1/r^2) * (1/r^6) * (lj1.x * %f48 - lj1.y).
rcp.approx.ftz.f32 %f45, %f43;
mul.ftz.f32 %f46, %f45, %f45;
mul.ftz.f32 %f47, %f45, %f46;
sqrt.approx.ftz.f32 %f48, %f47;
mul.ftz.f32 %f49, %f45, %f47;
ld.global.v2.f32 {%f50,%f51}, [%rd35+0];
mul.ftz.f32 %f52, %f50, %f48;
sub.ftz.f32 %f53, %f52, %f51;
mul.ftz.f32 %f54, %f49, %f53;
mul.ftz.f32 %f55, %f29, %f54;
.loc 16 81 0
fma.rn.ftz.f32 %f27, %f39, %f55, %f27;
.loc 16 82 0
fma.rn.ftz.f32 %f26, %f38, %f55, %f26;
.loc 16 83 0
fma.rn.ftz.f32 %f25, %f40, %f55, %f25;
ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];
mov.u32 %r48, 0;
setp.le.s32 %p5, %r47, %r48;
@%p5 bra $Lt_0_20994;
.loc 16 87 0
// eflag > 0: %f28 += sp * ((1/r^6)*(lj3.x*%f48 - lj3.y) - lj3.z).
ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];
add.u64 %rd37, %rd36, %rd34;
ld.global.v4.f32 {%f56,%f57,%f58,_}, [%rd37+0];
mul.ftz.f32 %f59, %f56, %f48;
sub.ftz.f32 %f60, %f59, %f57;
mul.ftz.f32 %f61, %f47, %f60;
sub.ftz.f32 %f62, %f61, %f58;
fma.rn.ftz.f32 %f28, %f29, %f62, %f28;
$Lt_0_20994:
ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];
mov.u32 %r50, 0;
setp.le.s32 %p6, %r49, %r50;
@%p6 bra $Lt_0_21506;
.loc 16 90 0
// vflag > 0: accumulate %f55 times dx*dx, dy*dy, dz*dz, dx*dy,
// dx*dz, dy*dz into %f6,%f8,%f10,%f12,%f14,%f15 respectively.
mov.f32 %f63, %f6;
mul.ftz.f32 %f64, %f39, %f39;
fma.rn.ftz.f32 %f65, %f55, %f64, %f63;
mov.f32 %f6, %f65;
.loc 16 91 0
mov.f32 %f66, %f8;
fma.rn.ftz.f32 %f67, %f55, %f41, %f66;
mov.f32 %f8, %f67;
.loc 16 92 0
mov.f32 %f68, %f10;
mul.ftz.f32 %f69, %f40, %f40;
fma.rn.ftz.f32 %f70, %f55, %f69, %f68;
mov.f32 %f10, %f70;
.loc 16 93 0
mov.f32 %f71, %f12;
mul.ftz.f32 %f72, %f38, %f39;
fma.rn.ftz.f32 %f73, %f55, %f72, %f71;
mov.f32 %f12, %f73;
.loc 16 94 0
mov.f32 %f74, %f14;
mul.ftz.f32 %f75, %f39, %f40;
fma.rn.ftz.f32 %f76, %f55, %f75, %f74;
mov.f32 %f14, %f76;
.loc 16 95 0
mul.ftz.f32 %f77, %f38, %f40;
fma.rn.ftz.f32 %f15, %f55, %f77, %f15;
// %f16 mirrors %f15; %f16 is the copy read by the final store.
mov.f32 %f16, %f15;
$Lt_0_21506:
$Lt_0_20482:
.loc 16 58 0
// Advance the cursor by stride elements (4 bytes each); loop until end.
mul.lo.u64 %rd38, %rd27, 4;
add.u64 %rd20, %rd20, %rd38;
setp.lt.u64 %p7, %rd20, %rd19;
@%p7 bra $Lt_0_20226;
bra.uni $Lt_0_19714;
$Lt_0_27906:
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
$Lt_0_19714:
// With t_per_atom <= 1 there is nothing to combine across threads.
mov.u32 %r51, 1;
setp.le.s32 %p8, %r1, %r51;
@%p8 bra $Lt_0_24322;
.loc 16 100 0
// Publish the four partial sums to red_acc rows 0..3 at column tid
// (rows are 512 bytes apart).
mov.u64 %rd39, __cuda___cuda_local_var_32601_55_non_const_red_acc108;
cvt.s64.s32 %rd40, %r2;
mul.wide.s32 %rd41, %r2, 4;
add.u64 %rd42, %rd39, %rd41;
mov.f32 %f78, %f27;
st.shared.f32 [%rd42+0], %f78;
mov.f32 %f79, %f26;
st.shared.f32 [%rd42+512], %f79;
mov.f32 %f80, %f25;
st.shared.f32 [%rd42+1024], %f80;
mov.f32 %f81, %f28;
st.shared.f32 [%rd42+1536], %f81;
// %r56 = t_per_atom / 2 (signed divide by two, rounding toward zero).
shr.s32 %r52, %r1, 31;
mov.s32 %r53, 1;
and.b32 %r54, %r52, %r53;
add.s32 %r55, %r54, %r1;
shr.s32 %r56, %r55, 1;
mov.s32 %r57, %r56;
mov.u32 %r58, 0;
setp.ne.u32 %p9, %r56, %r58;
@!%p9 bra $Lt_0_22786;
$Lt_0_23298:
// Halving loop: for s = t_per_atom/2; s != 0; s >>= 1, lanes below s
// add column tid+s into their running sums and re-publish them.
// No barrier is issued inside this loop.
// NOTE(review): presumably relies on all lanes of one atom executing
// in lock-step within a warp -- confirm.
setp.ge.u32 %p10, %r13, %r57;
@%p10 bra $Lt_0_23554;
add.u32 %r59, %r2, %r57;
cvt.u64.u32 %rd43, %r59;
mul.wide.u32 %rd44, %r59, 4;
add.u64 %rd45, %rd39, %rd44;
ld.shared.f32 %f82, [%rd45+0];
add.ftz.f32 %f78, %f82, %f78;
st.shared.f32 [%rd42+0], %f78;
ld.shared.f32 %f83, [%rd45+512];
add.ftz.f32 %f79, %f83, %f79;
st.shared.f32 [%rd42+512], %f79;
ld.shared.f32 %f84, [%rd45+1024];
add.ftz.f32 %f80, %f84, %f80;
st.shared.f32 [%rd42+1024], %f80;
ld.shared.f32 %f85, [%rd45+1536];
add.ftz.f32 %f81, %f85, %f81;
st.shared.f32 [%rd42+1536], %f81;
$Lt_0_23554:
shr.u32 %r57, %r57, 1;
mov.u32 %r60, 0;
setp.ne.u32 %p11, %r57, %r60;
@%p11 bra $Lt_0_23298;
$Lt_0_22786:
// Copy the combined sums back into the accumulator registers.
mov.f32 %f27, %f78;
mov.f32 %f26, %f79;
mov.f32 %f25, %f80;
mov.f32 %f28, %f81;
ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];
mov.u32 %r62, 0;
setp.le.s32 %p12, %r61, %r62;
@%p12 bra $Lt_0_24322;
// vflag > 0: repeat the same halving scheme for the six vflag
// accumulators, now using all six red_acc rows.
mov.f32 %f78, %f6;
st.shared.f32 [%rd42+0], %f78;
mov.f32 %f79, %f8;
st.shared.f32 [%rd42+512], %f79;
mov.f32 %f80, %f10;
st.shared.f32 [%rd42+1024], %f80;
mov.f32 %f81, %f12;
st.shared.f32 [%rd42+1536], %f81;
mov.f32 %f86, %f14;
st.shared.f32 [%rd42+2048], %f86;
mov.f32 %f87, %f15;
st.shared.f32 [%rd42+2560], %f87;
mov.s32 %r63, %r56;
@!%p9 bra $Lt_0_24834;
$Lt_0_25346:
setp.ge.u32 %p13, %r13, %r63;
@%p13 bra $Lt_0_25602;
add.u32 %r64, %r2, %r63;
cvt.u64.u32 %rd46, %r64;
mul.wide.u32 %rd47, %r64, 4;
add.u64 %rd48, %rd39, %rd47;
ld.shared.f32 %f88, [%rd48+0];
add.ftz.f32 %f78, %f88, %f78;
st.shared.f32 [%rd42+0], %f78;
ld.shared.f32 %f89, [%rd48+512];
add.ftz.f32 %f79, %f89, %f79;
st.shared.f32 [%rd42+512], %f79;
ld.shared.f32 %f90, [%rd48+1024];
add.ftz.f32 %f80, %f90, %f80;
st.shared.f32 [%rd42+1024], %f80;
ld.shared.f32 %f91, [%rd48+1536];
add.ftz.f32 %f81, %f91, %f81;
st.shared.f32 [%rd42+1536], %f81;
ld.shared.f32 %f92, [%rd48+2048];
add.ftz.f32 %f86, %f92, %f86;
st.shared.f32 [%rd42+2048], %f86;
ld.shared.f32 %f93, [%rd48+2560];
add.ftz.f32 %f87, %f93, %f87;
st.shared.f32 [%rd42+2560], %f87;
$Lt_0_25602:
shr.u32 %r63, %r63, 1;
mov.u32 %r65, 0;
setp.ne.u32 %p14, %r63, %r65;
@%p14 bra $Lt_0_25346;
$Lt_0_24834:
mov.f32 %f6, %f78;
mov.f32 %f8, %f79;
mov.f32 %f10, %f80;
mov.f32 %f12, %f81;
mov.f32 %f14, %f86;
mov.f32 %f16, %f87;
$Lt_0_24322:
$Lt_0_22274:
// Only lane 0 of each atom group writes results.
mov.u32 %r66, 0;
setp.ne.s32 %p15, %r13, %r66;
@%p15 bra $Lt_0_26370;
// %rd50 = &engv[atom]; consecutive outputs are inum floats apart.
ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];
add.u64 %rd50, %rd49, %rd5;
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
mov.u32 %r68, 0;
setp.le.s32 %p16, %r67, %r68;
@%p16 bra $Lt_0_26882;
// eflag > 0: store %f28, then step to the next engv column.
st.global.f32 [%rd50+0], %f28;
cvt.s64.s32 %rd51, %r9;
mul.wide.s32 %rd52, %r9, 4;
add.u64 %rd50, %rd50, %rd52;
$Lt_0_26882:
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
mov.u32 %r70, 0;
setp.le.s32 %p17, %r69, %r70;
@%p17 bra $Lt_0_27394;
// vflag > 0: store the six accumulators into six successive columns.
mov.f32 %f94, %f6;
st.global.f32 [%rd50+0], %f94;
cvt.s64.s32 %rd53, %r9;
mul.wide.s32 %rd54, %r9, 4;
add.u64 %rd55, %rd54, %rd50;
mov.f32 %f95, %f8;
st.global.f32 [%rd55+0], %f95;
add.u64 %rd56, %rd54, %rd55;
mov.f32 %f96, %f10;
st.global.f32 [%rd56+0], %f96;
add.u64 %rd57, %rd54, %rd56;
mov.f32 %f97, %f12;
st.global.f32 [%rd57+0], %f97;
add.u64 %rd50, %rd54, %rd57;
mov.f32 %f98, %f14;
st.global.f32 [%rd50+0], %f98;
mov.f32 %f99, %f16;
add.u64 %rd58, %rd54, %rd50;
st.global.f32 [%rd58+0], %f99;
$Lt_0_27394:
// ans[atom] = {%f27, %f26, %f25, 0}. The 4th component is given a
// defined value (0.0) instead of the never-written register %f101.
ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];
mul.lo.u64 %rd60, %rd4, 16;
add.u64 %rd61, %rd59, %rd60;
mov.f32 %f100, 0f00000000; // 0
st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f100};
$Lt_0_26370:
$Lt_0_18690:
.loc 16 103 0
exit;
$LDWend_kernel_pair:
} // kernel_pair
//-----------------------------------------------------------
// kernel_pair_fast
//
// Pair-interaction kernel that stages its coefficient tables in shared
// memory before the neighbor loop.  Groups of t_per_atom consecutive threads
// share one work index
//     ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom)
// (presumably one atom per index, going by the parameter names).  Each thread
// of a group walks a strided subset of that index's neighbor entries; the
// partial sums are then combined through shared memory and the thread with
// (tid.x & (t_per_atom-1)) == 0 writes the results.
//
// Parameters:
//   x_           not read by this kernel; coordinates are fetched via pos_tex.
//   lj1_in       global table of 16-byte entries; entries 0..120 are copied
//                to shared memory.  Floats 0 and 1 enter the force expression,
//                float 2 is the bound the squared distance is compared to.
//   lj3_in       global table of 16-byte entries; entries 0..120 are copied
//                to shared memory only when eflag > 0.  Floats 0..2 enter the
//                energy expression.
//   sp_lj_in     four floats, indexed by the top two bits of a neighbor word.
//   dev_nbor     s32 array addressed in rows of nbor_pitch elements:
//                row 0 [ii] = index of the central entry for the texture fetch,
//                row 1 [ii] = value that fixes the end of the neighbor range,
//                row 2 [ii] = offset into dev_packed (only read when
//                             dev_packed != dev_nbor).
//   dev_packed   base of the neighbor words when it differs from dev_nbor.
//   ans          output, 16 bytes per ii: three force sums plus a 4th
//                component that this kernel never initialises.
//   engv         f32 output starting at engv[ii]; successive values are inum
//                floats apart: energy (eflag > 0) then six virial sums
//                (vflag > 0).
//   eflag, vflag values > 0 enable the energy / virial accumulation.
//   inum         threads with ii >= inum exit without writing anything.
//   nbor_pitch   row pitch of dev_nbor, in elements.
//   t_per_atom   threads per work index.  The in-group lane is computed as
//                (t_per_atom-1) & tid.x, so this is presumably required to be
//                a power of two -- TODO confirm against the host code.
//
// NOTE(review): the shared reduction buffer holds 6 rows spaced 512 bytes
// apart and is indexed by tid.x*4, so rows only stay disjoint for
// tid.x < 128 -- presumably the host caps the block size at 128; confirm.
//
// NOTE(review): this file is compiler output.  The special-factor fix marked
// "FIX" below was applied by hand; the CUDA source this PTX was generated
// from, and any embedded copy of this PTX, need the same change.
//-----------------------------------------------------------
.entry kernel_pair_fast (
.param .u64 __cudaparm_kernel_pair_fast_x_,
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
.param .u64 __cudaparm_kernel_pair_fast_ans,
.param .u64 __cudaparm_kernel_pair_fast___val_paramengv,
.param .s32 __cudaparm_kernel_pair_fast_eflag,
.param .s32 __cudaparm_kernel_pair_fast_vflag,
.param .s32 __cudaparm_kernel_pair_fast_inum,
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
{
.reg .u32 %r<74>;
.reg .u64 %rd<75>;
.reg .f32 %f<109>;
.reg .pred %p<22>;
// Shared staging areas: sp_lj (4 floats), lj1 and lj3 (121 x 16 bytes each),
// and red_acc (6 rows x 128 floats) used for the in-group reduction.
.shared .align 4 .b8 __cuda___cuda_local_var_32618_33_non_const_sp_lj3268[16];
.shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_lj13296[1936];
.shared .align 16 .b8 __cuda___cuda_local_var_32617_34_non_const_lj35232[1936];
.shared .align 4 .b8 __cuda___cuda_local_var_32687_55_non_const_red_acc7168[3072];
// __cuda_local_var_32628_10_non_const_f = 48
// __cuda_local_var_32630_9_non_const_virial = 16
.loc 16 111 0
$LDWbegin_kernel_pair_fast:
// ---- Stage sp_lj: threads 0..3 copy one float each. ----
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 3;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_20994;
.loc 16 119 0
mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd3, %r1, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_20994:
// %rd1 = shared sp_lj base, set on every path (used in the neighbor loop).
mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;
// ---- Stage lj1 (and lj3 when eflag > 0): threads 0..120 copy one 16-byte entry each. ----
mov.u32 %r3, 120;
setp.gt.s32 %p2, %r1, %r3;
@%p2 bra $Lt_1_21506;
.loc 16 121 0
mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_lj13296;
cvt.s64.s32 %rd8, %r1;
mul.wide.s32 %rd9, %r1, 16;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
add.u64 %rd11, %rd10, %rd9;
add.u64 %rd12, %rd9, %rd7;
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r5, 0;
setp.le.s32 %p3, %r4, %r5;
@%p3 bra $Lt_1_22018;
.loc 16 123 0
mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232;
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
add.u64 %rd15, %rd14, %rd9;
add.u64 %rd16, %rd9, %rd13;
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
$Lt_1_22018:
mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232;
$Lt_1_21506:
// %rd13 = shared lj3 base, %rd7 = shared lj1 base, set on every path.
mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232;
mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_lj13296;
.loc 16 131 0
// Zero the six virial accumulators (%f11, %f13, %f15, %f17, %f19, %f20/%f21).
mov.f32 %f10, 0f00000000; // 0
mov.f32 %f11, %f10;
mov.f32 %f12, 0f00000000; // 0
mov.f32 %f13, %f12;
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, %f14;
mov.f32 %f16, 0f00000000; // 0
mov.f32 %f17, %f16;
mov.f32 %f18, 0f00000000; // 0
mov.f32 %f19, %f18;
mov.f32 %f20, 0f00000000; // 0
mov.f32 %f21, %f20;
.loc 16 133 0
// Block-wide barrier: the shared tables written above are read below.
bar.sync 0;
// ---- Work index: %r12 = ii = tid/t_per_atom + ctaid*(ntid/t_per_atom). ----
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
div.s32 %r7, %r1, %r6;
cvt.s32.u32 %r8, %ntid.x;
div.s32 %r9, %r8, %r6;
cvt.s32.u32 %r10, %ctaid.x;
mul.lo.s32 %r11, %r10, %r9;
add.s32 %r12, %r7, %r11;
ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];
setp.ge.s32 %p4, %r12, %r13;
// Threads whose ii is out of range do no further work.
@%p4 bra $Lt_1_30210;
.loc 16 138 0
// ---- Neighbor range setup. ----
// %rd22 = &dev_nbor[ii] (row 0), %rd23 = row 1, %r15 = dev_nbor row 1 value.
// %r16 = t_per_atom-1, %r17 = lane within the group = (t_per_atom-1) & tid.
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];
cvt.s64.s32 %rd17, %r14;
mul.wide.s32 %rd18, %r14, 4;
cvt.s64.s32 %rd19, %r12;
mul.wide.s32 %rd20, %r12, 4;
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
add.u64 %rd22, %rd20, %rd21;
add.u64 %rd23, %rd18, %rd22;
ld.global.s32 %r15, [%rd23+0];
sub.s32 %r16, %r6, 1;
and.b32 %r17, %r16, %r1;
cvt.s64.s32 %rd24, %r17;
mul.wide.s32 %rd25, %r17, 4;
ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];
setp.ne.u64 %p5, %rd26, %rd21;
@%p5 bra $Lt_1_23298;
// Case dev_packed == dev_nbor: neighbor words live in dev_nbor itself.
//   stride (%r20) = nbor_pitch * t_per_atom elements
//   base   (%rd29) = row 1 + 4*(nbor_pitch + (t_per_atom-1)*ii)
//   end    (%rd34) = base + 4*((%r15 & (t_per_atom-1)) + stride*(%r15 / t_per_atom))
//   start  (%rd35) = base + 4*lane
cvt.s32.s64 %r18, %rd17;
mul.lo.s32 %r19, %r18, %r6;
mov.s32 %r20, %r19;
mul.lo.s32 %r21, %r16, %r12;
add.s32 %r22, %r18, %r21;
cvt.s64.s32 %rd27, %r22;
mul.wide.s32 %rd28, %r22, 4;
add.u64 %rd29, %rd23, %rd28;
and.b32 %r23, %r16, %r15;
cvt.s64.s32 %rd30, %r23;
div.s32 %r24, %r15, %r6;
mul.lo.s32 %r25, %r19, %r24;
cvt.s64.s32 %rd31, %r25;
add.u64 %rd32, %rd30, %rd31;
mul.lo.u64 %rd33, %rd32, 4;
add.u64 %rd34, %rd29, %rd33;
add.u64 %rd35, %rd25, %rd29;
bra.uni $Lt_1_23042;
$Lt_1_23298:
// Case dev_packed != dev_nbor: row 2 of dev_nbor gives an offset into dev_packed.
//   base (%rd39) = dev_packed + 4*offset, end = base + 4*%r15,
//   stride = t_per_atom elements, start = base + 4*lane
add.u64 %rd36, %rd18, %rd23;
ld.global.s32 %r26, [%rd36+0];
cvt.s64.s32 %rd37, %r26;
mul.wide.s32 %rd38, %r26, 4;
add.u64 %rd39, %rd26, %rd38;
cvt.s64.s32 %rd40, %r15;
mul.wide.s32 %rd41, %r15, 4;
add.u64 %rd34, %rd39, %rd41;
mov.s32 %r20, %r6;
add.u64 %rd35, %rd25, %rd39;
$Lt_1_23042:
.loc 16 141 0
// Fetch the 4-component record of the central entry from pos_tex.
// %f26,%f27,%f28 are used as coordinates below (presumably x,y,z);
// %f29 is truncated to an integer row selector for the coefficient tables.
ld.global.s32 %r27, [%rd22+0];
mov.u32 %r28, %r27;
mov.s32 %r29, 0;
mov.u32 %r30, %r29;
mov.s32 %r31, 0;
mov.u32 %r32, %r31;
mov.s32 %r33, 0;
mov.u32 %r34, %r33;
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];
mov.f32 %f26, %f22;
mov.f32 %f27, %f23;
mov.f32 %f28, %f24;
mov.f32 %f29, %f25;
// Empty range for this thread: skip the loop with zeroed accumulators.
setp.ge.u64 %p6, %rd35, %rd34;
@%p6 bra $Lt_1_31746;
// %f30 = 11 * (int)%f29 as float: row offset into the 11x11 (121-entry) tables.
// %rd42 = loop stride in elements.
cvt.rzi.ftz.s32.f32 %r35, %f29;
cvt.s64.s32 %rd42, %r20;
mul.lo.s32 %r36, %r35, 11;
cvt.rn.f32.s32 %f30, %r36;
// Per-thread accumulators: %f33,%f32,%f31 = force sums, %f34 = energy sum.
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_24066:
//<loop> Loop body line 141, nesting depth: 1, estimated iterations: unknown
.loc 16 148 0
// %r37 = neighbor word: low 30 bits index pos_tex, top 2 bits index sp_lj.
ld.global.s32 %r37, [%rd35+0];
.loc 16 152 0
and.b32 %r38, %r37, 1073741823;
mov.u32 %r39, %r38;
mov.s32 %r40, 0;
mov.u32 %r41, %r40;
mov.s32 %r42, 0;
mov.u32 %r43, %r42;
mov.s32 %r44, 0;
mov.u32 %r45, %r44;
tex.1d.v4.f32.s32 {%f35,%f36,%f37,%f38},[pos_tex,{%r39,%r41,%r43,%r45}];
mov.f32 %f39, %f35;
mov.f32 %f40, %f36;
mov.f32 %f41, %f37;
mov.f32 %f42, %f38;
// Component differences (%f44, %f43, %f45) and squared distance %f48.
sub.ftz.f32 %f43, %f27, %f40;
sub.ftz.f32 %f44, %f26, %f39;
sub.ftz.f32 %f45, %f28, %f41;
mul.ftz.f32 %f46, %f43, %f43;
fma.rn.ftz.f32 %f47, %f44, %f44, %f46;
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
// Table index = (int)(row offset + 4th component of the neighbor record);
// %rd45 = &lj1[index] in shared memory.
add.ftz.f32 %f49, %f30, %f42;
cvt.rzi.ftz.s32.f32 %r46, %f49;
cvt.s64.s32 %rd43, %r46;
mul.wide.s32 %rd44, %r46, 16;
add.u64 %rd45, %rd44, %rd7;
// Skip the pair unless squared distance < lj1[index] float 2.
ld.shared.f32 %f50, [%rd45+8];
setp.gt.ftz.f32 %p7, %f50, %f48;
@!%p7 bra $Lt_1_25346;
.loc 16 165 0
// %f51 = 1/rsq, %f53 = (1/rsq)^3, %f54 = sqrt(%f53), %f55 = (1/rsq)^4
// %f60 = %f55 * (lj1.x * %f54 - lj1.y)
rcp.approx.ftz.f32 %f51, %f48;
mul.ftz.f32 %f52, %f51, %f51;
mul.ftz.f32 %f53, %f51, %f52;
sqrt.approx.ftz.f32 %f54, %f53;
mul.ftz.f32 %f55, %f51, %f53;
ld.shared.v2.f32 {%f56,%f57}, [%rd45+0];
mul.ftz.f32 %f58, %f56, %f54;
sub.ftz.f32 %f59, %f58, %f57;
mul.ftz.f32 %f60, %f55, %f59;
// FIX: scale the force scalar by the special factor sp_lj[top two bits of
// the neighbor word], as kernel_pair does.  Previously the factor was loaded
// only inside the eflag > 0 branch and applied to the energy alone, so the
// force sums and the virial terms derived from %f60 were left unscaled.
shr.s32 %r49, %r37, 30;
and.b32 %r50, %r49, 3;
cvt.s64.s32 %rd47, %r50;
mul.wide.s32 %rd48, %r50, 4;
add.u64 %rd49, %rd1, %rd48;
ld.shared.f32 %f67, [%rd49+0];
mul.ftz.f32 %f60, %f67, %f60;
.loc 16 167 0
fma.rn.ftz.f32 %f33, %f44, %f60, %f33;
.loc 16 168 0
fma.rn.ftz.f32 %f32, %f43, %f60, %f32;
.loc 16 169 0
fma.rn.ftz.f32 %f31, %f45, %f60, %f31;
ld.param.s32 %r47, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r48, 0;
setp.le.s32 %p8, %r47, %r48;
@%p8 bra $Lt_1_24834;
.loc 16 172 0
// Energy: %f34 += factor * (%f53 * (lj3.x * %f54 - lj3.y) - lj3.z)
add.u64 %rd46, %rd44, %rd13;
ld.shared.v4.f32 {%f61,%f62,%f63,_}, [%rd46+0];
mul.ftz.f32 %f64, %f61, %f54;
sub.ftz.f32 %f65, %f64, %f62;
mul.ftz.f32 %f66, %f53, %f65;
.loc 16 173 0
sub.ftz.f32 %f68, %f66, %f63;
fma.rn.ftz.f32 %f34, %f67, %f68, %f34;
$Lt_1_24834:
ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r52, 0;
setp.le.s32 %p9, %r51, %r52;
@%p9 bra $Lt_1_25346;
// Virial: six products of difference components, each weighted by %f60.
.loc 16 176 0
mov.f32 %f69, %f11;
mul.ftz.f32 %f70, %f44, %f44;
fma.rn.ftz.f32 %f71, %f60, %f70, %f69;
mov.f32 %f11, %f71;
.loc 16 177 0
mov.f32 %f72, %f13;
fma.rn.ftz.f32 %f73, %f60, %f46, %f72;
mov.f32 %f13, %f73;
.loc 16 178 0
mov.f32 %f74, %f15;
mul.ftz.f32 %f75, %f45, %f45;
fma.rn.ftz.f32 %f76, %f60, %f75, %f74;
mov.f32 %f15, %f76;
.loc 16 179 0
mov.f32 %f77, %f17;
mul.ftz.f32 %f78, %f43, %f44;
fma.rn.ftz.f32 %f79, %f60, %f78, %f77;
mov.f32 %f17, %f79;
.loc 16 180 0
mov.f32 %f80, %f19;
mul.ftz.f32 %f81, %f44, %f45;
fma.rn.ftz.f32 %f82, %f60, %f81, %f80;
mov.f32 %f19, %f82;
.loc 16 181 0
// %f20 is the running sum; %f21 mirrors it for the paths that read %f21.
mul.ftz.f32 %f83, %f43, %f45;
fma.rn.ftz.f32 %f20, %f60, %f83, %f20;
mov.f32 %f21, %f20;
$Lt_1_25346:
$Lt_1_24322:
.loc 16 146 0
// Advance by the stride (in bytes) and loop while still before the end pointer.
mul.lo.u64 %rd50, %rd42, 4;
add.u64 %rd35, %rd35, %rd50;
setp.lt.u64 %p10, %rd35, %rd34;
@%p10 bra $Lt_1_24066;
bra.uni $Lt_1_23554;
$Lt_1_31746:
// No neighbor entries for this thread: force and energy sums are zero.
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_23554:
// ---- In-group reduction, only needed when t_per_atom > 1. ----
mov.u32 %r53, 1;
setp.le.s32 %p11, %r6, %r53;
@%p11 bra $Lt_1_28162;
.loc 16 186 0
// %rd54 = this thread's slot in red_acc; rows are 512 bytes apart.
// Pass 1 reduces the three force sums and the energy sum.
mov.u64 %rd51, __cuda___cuda_local_var_32687_55_non_const_red_acc7168;
cvt.s64.s32 %rd52, %r1;
mul.wide.s32 %rd53, %r1, 4;
add.u64 %rd54, %rd51, %rd53;
mov.f32 %f84, %f33;
st.shared.f32 [%rd54+0], %f84;
mov.f32 %f85, %f32;
st.shared.f32 [%rd54+512], %f85;
mov.f32 %f86, %f31;
st.shared.f32 [%rd54+1024], %f86;
mov.f32 %f87, %f34;
st.shared.f32 [%rd54+1536], %f87;
// %r58 = t_per_atom / 2 (signed, rounded toward zero); %r59 = current step s.
shr.s32 %r54, %r6, 31;
mov.s32 %r55, 1;
and.b32 %r56, %r54, %r55;
add.s32 %r57, %r56, %r6;
shr.s32 %r58, %r57, 1;
mov.s32 %r59, %r58;
mov.u32 %r60, 0;
setp.ne.u32 %p12, %r58, %r60;
@!%p12 bra $Lt_1_26626;
$Lt_1_27138:
// Lanes below s add the slot of thread tid+s into their own running sums.
// NOTE(review): there is no bar.sync between steps; this presumably relies
// on the threads of a group executing in lockstep within a warp -- confirm.
setp.ge.u32 %p13, %r17, %r59;
@%p13 bra $Lt_1_27394;
add.u32 %r61, %r1, %r59;
cvt.u64.u32 %rd55, %r61;
mul.wide.u32 %rd56, %r61, 4;
add.u64 %rd57, %rd51, %rd56;
ld.shared.f32 %f88, [%rd57+0];
add.ftz.f32 %f84, %f88, %f84;
st.shared.f32 [%rd54+0], %f84;
ld.shared.f32 %f89, [%rd57+512];
add.ftz.f32 %f85, %f89, %f85;
st.shared.f32 [%rd54+512], %f85;
ld.shared.f32 %f90, [%rd57+1024];
add.ftz.f32 %f86, %f90, %f86;
st.shared.f32 [%rd54+1024], %f86;
ld.shared.f32 %f91, [%rd57+1536];
add.ftz.f32 %f87, %f91, %f87;
st.shared.f32 [%rd54+1536], %f87;
$Lt_1_27394:
shr.u32 %r59, %r59, 1;
mov.u32 %r62, 0;
setp.ne.u32 %p14, %r59, %r62;
@%p14 bra $Lt_1_27138;
$Lt_1_26626:
// Copy the reduced force/energy sums back to their accumulator registers.
mov.f32 %f33, %f84;
mov.f32 %f32, %f85;
mov.f32 %f31, %f86;
mov.f32 %f34, %f87;
ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r64, 0;
setp.le.s32 %p15, %r63, %r64;
@%p15 bra $Lt_1_28162;
// Pass 2 (vflag > 0): same reduction over the six virial sums, using all
// six rows of red_acc.
mov.f32 %f84, %f11;
st.shared.f32 [%rd54+0], %f84;
mov.f32 %f85, %f13;
st.shared.f32 [%rd54+512], %f85;
mov.f32 %f86, %f15;
st.shared.f32 [%rd54+1024], %f86;
mov.f32 %f87, %f17;
st.shared.f32 [%rd54+1536], %f87;
mov.f32 %f92, %f19;
st.shared.f32 [%rd54+2048], %f92;
mov.f32 %f93, %f20;
st.shared.f32 [%rd54+2560], %f93;
mov.s32 %r65, %r58;
@!%p12 bra $Lt_1_28674;
$Lt_1_29186:
setp.ge.u32 %p16, %r17, %r65;
@%p16 bra $Lt_1_29442;
add.u32 %r66, %r1, %r65;
cvt.u64.u32 %rd58, %r66;
mul.wide.u32 %rd59, %r66, 4;
add.u64 %rd60, %rd51, %rd59;
ld.shared.f32 %f94, [%rd60+0];
add.ftz.f32 %f84, %f94, %f84;
st.shared.f32 [%rd54+0], %f84;
ld.shared.f32 %f95, [%rd60+512];
add.ftz.f32 %f85, %f95, %f85;
st.shared.f32 [%rd54+512], %f85;
ld.shared.f32 %f96, [%rd60+1024];
add.ftz.f32 %f86, %f96, %f86;
st.shared.f32 [%rd54+1024], %f86;
ld.shared.f32 %f97, [%rd60+1536];
add.ftz.f32 %f87, %f97, %f87;
st.shared.f32 [%rd54+1536], %f87;
ld.shared.f32 %f98, [%rd60+2048];
add.ftz.f32 %f92, %f98, %f92;
st.shared.f32 [%rd54+2048], %f92;
ld.shared.f32 %f99, [%rd60+2560];
add.ftz.f32 %f93, %f99, %f93;
st.shared.f32 [%rd54+2560], %f93;
$Lt_1_29442:
shr.u32 %r65, %r65, 1;
mov.u32 %r67, 0;
setp.ne.u32 %p17, %r65, %r67;
@%p17 bra $Lt_1_29186;
$Lt_1_28674:
// Copy the reduced virial sums back; the sixth goes to %f21, which is the
// register the store section reads.
mov.f32 %f11, %f84;
mov.f32 %f13, %f85;
mov.f32 %f15, %f86;
mov.f32 %f17, %f87;
mov.f32 %f19, %f92;
mov.f32 %f21, %f93;
$Lt_1_28162:
$Lt_1_26114:
// ---- Output: only lane 0 of each group stores results. ----
mov.u32 %r68, 0;
setp.ne.s32 %p18, %r17, %r68;
@%p18 bra $Lt_1_30210;
// %rd62 = &engv[ii]; it advances by inum floats after each stored value.
ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];
add.u64 %rd62, %rd61, %rd20;
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r70, 0;
setp.le.s32 %p19, %r69, %r70;
@%p19 bra $Lt_1_30722;
// eflag > 0: store the energy sum first.
st.global.f32 [%rd62+0], %f34;
cvt.s64.s32 %rd63, %r13;
mul.wide.s32 %rd64, %r13, 4;
add.u64 %rd62, %rd62, %rd64;
$Lt_1_30722:
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r72, 0;
setp.le.s32 %p20, %r71, %r72;
@%p20 bra $Lt_1_31234;
// vflag > 0: store the six virial sums, each inum floats after the previous.
mov.f32 %f100, %f11;
st.global.f32 [%rd62+0], %f100;
cvt.s64.s32 %rd65, %r13;
mul.wide.s32 %rd66, %r13, 4;
add.u64 %rd67, %rd66, %rd62;
mov.f32 %f101, %f13;
st.global.f32 [%rd67+0], %f101;
add.u64 %rd68, %rd66, %rd67;
mov.f32 %f102, %f15;
st.global.f32 [%rd68+0], %f102;
add.u64 %rd69, %rd66, %rd68;
mov.f32 %f103, %f17;
st.global.f32 [%rd69+0], %f103;
add.u64 %rd62, %rd66, %rd69;
mov.f32 %f104, %f19;
st.global.f32 [%rd62+0], %f104;
mov.f32 %f105, %f21;
add.u64 %rd70, %rd66, %rd62;
st.global.f32 [%rd70+0], %f105;
$Lt_1_31234:
// Store the force sums to ans[ii] (16-byte stride).
// NOTE(review): %f107 is never written in this kernel, so the 4th component
// stored here is unspecified; presumably the consumer ignores it -- confirm.
ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];
mul.lo.u64 %rd72, %rd19, 16;
add.u64 %rd73, %rd71, %rd72;
mov.f32 %f106, %f107;
st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f106};
$Lt_1_30210:
$Lt_1_22530:
.loc 16 189 0
exit;
$LDWend_kernel_pair_fast:
} // kernel_pair_fast

View File

@ -1,849 +0,0 @@
const char * lj96 =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
" .global .texref pos_tex;\n"
" .entry kernel_pair (\n"
" .param .u64 __cudaparm_kernel_pair_x_,\n"
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_ans,\n"
" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n"
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_inum,\n"
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
" {\n"
" .reg .u32 %r<72>;\n"
" .reg .u64 %rd<63>;\n"
" .reg .f32 %f<103>;\n"
" .reg .pred %p<19>;\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32601_55_non_const_red_acc108[3072];\n"
" .loc 16 31 0\n"
"$LDWbegin_kernel_pair:\n"
" .loc 16 36 0\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
" ldu.global.f32 %f1, [%rd1+0];\n"
" .loc 16 37 0\n"
" ld.global.f32 %f2, [%rd1+4];\n"
" .loc 16 38 0\n"
" ld.global.f32 %f3, [%rd1+8];\n"
" .loc 16 39 0\n"
" ld.global.f32 %f4, [%rd1+12];\n"
" st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n"
" .loc 16 46 0\n"
" mov.f32 %f5, 0f00000000; \n"
" mov.f32 %f6, %f5;\n"
" mov.f32 %f7, 0f00000000; \n"
" mov.f32 %f8, %f7;\n"
" mov.f32 %f9, 0f00000000; \n"
" mov.f32 %f10, %f9;\n"
" mov.f32 %f11, 0f00000000; \n"
" mov.f32 %f12, %f11;\n"
" mov.f32 %f13, 0f00000000; \n"
" mov.f32 %f14, %f13;\n"
" mov.f32 %f15, 0f00000000; \n"
" mov.f32 %f16, %f15;\n"
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
" cvt.s32.u32 %r2, %tid.x;\n"
" div.s32 %r3, %r2, %r1;\n"
" cvt.s32.u32 %r4, %ntid.x;\n"
" div.s32 %r5, %r4, %r1;\n"
" cvt.s32.u32 %r6, %ctaid.x;\n"
" mul.lo.s32 %r7, %r6, %r5;\n"
" add.s32 %r8, %r3, %r7;\n"
" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n"
" setp.ge.s32 %p1, %r8, %r9;\n"
" @%p1 bra $Lt_0_26370;\n"
" .loc 16 51 0\n"
" ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];\n"
" cvt.s64.s32 %rd2, %r10;\n"
" mul.wide.s32 %rd3, %r10, 4;\n"
" cvt.s64.s32 %rd4, %r8;\n"
" mul.wide.s32 %rd5, %r8, 4;\n"
" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n"
" add.u64 %rd7, %rd5, %rd6;\n"
" add.u64 %rd8, %rd3, %rd7;\n"
" ld.global.s32 %r11, [%rd8+0];\n"
" sub.s32 %r12, %r1, 1;\n"
" and.b32 %r13, %r12, %r2;\n"
" cvt.s64.s32 %rd9, %r13;\n"
" mul.wide.s32 %rd10, %r13, 4;\n"
" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n"
" setp.ne.u64 %p2, %rd11, %rd6;\n"
" @%p2 bra $Lt_0_19458;\n"
" cvt.s32.s64 %r14, %rd2;\n"
" mul.lo.s32 %r15, %r14, %r1;\n"
" mov.s32 %r16, %r15;\n"
" mul.lo.s32 %r17, %r12, %r8;\n"
" add.s32 %r18, %r14, %r17;\n"
" cvt.s64.s32 %rd12, %r18;\n"
" mul.wide.s32 %rd13, %r18, 4;\n"
" add.u64 %rd14, %rd8, %rd13;\n"
" and.b32 %r19, %r12, %r11;\n"
" cvt.s64.s32 %rd15, %r19;\n"
" div.s32 %r20, %r11, %r1;\n"
" mul.lo.s32 %r21, %r15, %r20;\n"
" cvt.s64.s32 %rd16, %r21;\n"
" add.u64 %rd17, %rd15, %rd16;\n"
" mul.lo.u64 %rd18, %rd17, 4;\n"
" add.u64 %rd19, %rd14, %rd18;\n"
" add.u64 %rd20, %rd10, %rd14;\n"
" bra.uni $Lt_0_19202;\n"
"$Lt_0_19458:\n"
" add.u64 %rd21, %rd3, %rd8;\n"
" ld.global.s32 %r22, [%rd21+0];\n"
" cvt.s64.s32 %rd22, %r22;\n"
" mul.wide.s32 %rd23, %r22, 4;\n"
" add.u64 %rd24, %rd11, %rd23;\n"
" cvt.s64.s32 %rd25, %r11;\n"
" mul.wide.s32 %rd26, %r11, 4;\n"
" add.u64 %rd19, %rd24, %rd26;\n"
" mov.s32 %r16, %r1;\n"
" add.u64 %rd20, %rd10, %rd24;\n"
"$Lt_0_19202:\n"
" .loc 16 54 0\n"
" ld.global.s32 %r23, [%rd7+0];\n"
" mov.u32 %r24, %r23;\n"
" mov.s32 %r25, 0;\n"
" mov.u32 %r26, %r25;\n"
" mov.s32 %r27, 0;\n"
" mov.u32 %r28, %r27;\n"
" mov.s32 %r29, 0;\n"
" mov.u32 %r30, %r29;\n"
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n"
" mov.f32 %f21, %f17;\n"
" mov.f32 %f22, %f18;\n"
" mov.f32 %f23, %f19;\n"
" mov.f32 %f24, %f20;\n"
" setp.ge.u64 %p3, %rd20, %rd19;\n"
" @%p3 bra $Lt_0_27906;\n"
" cvt.rzi.ftz.s32.f32 %r31, %f24;\n"
" cvt.s64.s32 %rd27, %r16;\n"
" ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];\n"
" mul.lo.s32 %r33, %r32, %r31;\n"
" ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;\n"
"$Lt_0_20226:\n"
" .loc 16 60 0\n"
" ld.global.s32 %r34, [%rd20+0];\n"
" .loc 16 61 0\n"
" shr.s32 %r35, %r34, 30;\n"
" and.b32 %r36, %r35, 3;\n"
" cvt.s64.s32 %rd30, %r36;\n"
" mul.wide.s32 %rd31, %r36, 4;\n"
" add.u64 %rd32, %rd29, %rd31;\n"
" ld.shared.f32 %f29, [%rd32+0];\n"
" .loc 16 64 0\n"
" and.b32 %r37, %r34, 1073741823;\n"
" mov.u32 %r38, %r37;\n"
" mov.s32 %r39, 0;\n"
" mov.u32 %r40, %r39;\n"
" mov.s32 %r41, 0;\n"
" mov.u32 %r42, %r41;\n"
" mov.s32 %r43, 0;\n"
" mov.u32 %r44, %r43;\n"
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];\n"
" mov.f32 %f34, %f30;\n"
" mov.f32 %f35, %f31;\n"
" mov.f32 %f36, %f32;\n"
" mov.f32 %f37, %f33;\n"
" cvt.rzi.ftz.s32.f32 %r45, %f37;\n"
" sub.ftz.f32 %f38, %f22, %f35;\n"
" sub.ftz.f32 %f39, %f21, %f34;\n"
" sub.ftz.f32 %f40, %f23, %f36;\n"
" mul.ftz.f32 %f41, %f38, %f38;\n"
" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n"
" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n"
" add.s32 %r46, %r45, %r33;\n"
" cvt.s64.s32 %rd33, %r46;\n"
" mul.wide.s32 %rd34, %r46, 16;\n"
" add.u64 %rd35, %rd34, %rd28;\n"
" ld.global.f32 %f44, [%rd35+8];\n"
" setp.gt.ftz.f32 %p4, %f44, %f43;\n"
" @!%p4 bra $Lt_0_21506;\n"
" .loc 16 79 0\n"
" rcp.approx.ftz.f32 %f45, %f43;\n"
" mul.ftz.f32 %f46, %f45, %f45;\n"
" mul.ftz.f32 %f47, %f45, %f46;\n"
" sqrt.approx.ftz.f32 %f48, %f47;\n"
" mul.ftz.f32 %f49, %f45, %f47;\n"
" ld.global.v2.f32 {%f50,%f51}, [%rd35+0];\n"
" mul.ftz.f32 %f52, %f50, %f48;\n"
" sub.ftz.f32 %f53, %f52, %f51;\n"
" mul.ftz.f32 %f54, %f49, %f53;\n"
" mul.ftz.f32 %f55, %f29, %f54;\n"
" .loc 16 81 0\n"
" fma.rn.ftz.f32 %f27, %f39, %f55, %f27;\n"
" .loc 16 82 0\n"
" fma.rn.ftz.f32 %f26, %f38, %f55, %f26;\n"
" .loc 16 83 0\n"
" fma.rn.ftz.f32 %f25, %f40, %f55, %f25;\n"
" ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r48, 0;\n"
" setp.le.s32 %p5, %r47, %r48;\n"
" @%p5 bra $Lt_0_20994;\n"
" .loc 16 87 0\n"
" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];\n"
" add.u64 %rd37, %rd36, %rd34;\n"
" ld.global.v4.f32 {%f56,%f57,%f58,_}, [%rd37+0];\n"
" mul.ftz.f32 %f59, %f56, %f48;\n"
" sub.ftz.f32 %f60, %f59, %f57;\n"
" mul.ftz.f32 %f61, %f47, %f60;\n"
" sub.ftz.f32 %f62, %f61, %f58;\n"
" fma.rn.ftz.f32 %f28, %f29, %f62, %f28;\n"
"$Lt_0_20994:\n"
" ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r50, 0;\n"
" setp.le.s32 %p6, %r49, %r50;\n"
" @%p6 bra $Lt_0_21506;\n"
" .loc 16 90 0\n"
" mov.f32 %f63, %f6;\n"
" mul.ftz.f32 %f64, %f39, %f39;\n"
" fma.rn.ftz.f32 %f65, %f55, %f64, %f63;\n"
" mov.f32 %f6, %f65;\n"
" .loc 16 91 0\n"
" mov.f32 %f66, %f8;\n"
" fma.rn.ftz.f32 %f67, %f55, %f41, %f66;\n"
" mov.f32 %f8, %f67;\n"
" .loc 16 92 0\n"
" mov.f32 %f68, %f10;\n"
" mul.ftz.f32 %f69, %f40, %f40;\n"
" fma.rn.ftz.f32 %f70, %f55, %f69, %f68;\n"
" mov.f32 %f10, %f70;\n"
" .loc 16 93 0\n"
" mov.f32 %f71, %f12;\n"
" mul.ftz.f32 %f72, %f38, %f39;\n"
" fma.rn.ftz.f32 %f73, %f55, %f72, %f71;\n"
" mov.f32 %f12, %f73;\n"
" .loc 16 94 0\n"
" mov.f32 %f74, %f14;\n"
" mul.ftz.f32 %f75, %f39, %f40;\n"
" fma.rn.ftz.f32 %f76, %f55, %f75, %f74;\n"
" mov.f32 %f14, %f76;\n"
" .loc 16 95 0\n"
" mul.ftz.f32 %f77, %f38, %f40;\n"
" fma.rn.ftz.f32 %f15, %f55, %f77, %f15;\n"
" mov.f32 %f16, %f15;\n"
"$Lt_0_21506:\n"
"$Lt_0_20482:\n"
" .loc 16 58 0\n"
" mul.lo.u64 %rd38, %rd27, 4;\n"
" add.u64 %rd20, %rd20, %rd38;\n"
" setp.lt.u64 %p7, %rd20, %rd19;\n"
" @%p7 bra $Lt_0_20226;\n"
" bra.uni $Lt_0_19714;\n"
"$Lt_0_27906:\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
"$Lt_0_19714:\n"
" mov.u32 %r51, 1;\n"
" setp.le.s32 %p8, %r1, %r51;\n"
" @%p8 bra $Lt_0_24322;\n"
" .loc 16 100 0\n"
" mov.u64 %rd39, __cuda___cuda_local_var_32601_55_non_const_red_acc108;\n"
" cvt.s64.s32 %rd40, %r2;\n"
" mul.wide.s32 %rd41, %r2, 4;\n"
" add.u64 %rd42, %rd39, %rd41;\n"
" mov.f32 %f78, %f27;\n"
" st.shared.f32 [%rd42+0], %f78;\n"
" mov.f32 %f79, %f26;\n"
" st.shared.f32 [%rd42+512], %f79;\n"
" mov.f32 %f80, %f25;\n"
" st.shared.f32 [%rd42+1024], %f80;\n"
" mov.f32 %f81, %f28;\n"
" st.shared.f32 [%rd42+1536], %f81;\n"
" shr.s32 %r52, %r1, 31;\n"
" mov.s32 %r53, 1;\n"
" and.b32 %r54, %r52, %r53;\n"
" add.s32 %r55, %r54, %r1;\n"
" shr.s32 %r56, %r55, 1;\n"
" mov.s32 %r57, %r56;\n"
" mov.u32 %r58, 0;\n"
" setp.ne.u32 %p9, %r56, %r58;\n"
" @!%p9 bra $Lt_0_22786;\n"
"$Lt_0_23298:\n"
" setp.ge.u32 %p10, %r13, %r57;\n"
" @%p10 bra $Lt_0_23554;\n"
" add.u32 %r59, %r2, %r57;\n"
" cvt.u64.u32 %rd43, %r59;\n"
" mul.wide.u32 %rd44, %r59, 4;\n"
" add.u64 %rd45, %rd39, %rd44;\n"
" ld.shared.f32 %f82, [%rd45+0];\n"
" add.ftz.f32 %f78, %f82, %f78;\n"
" st.shared.f32 [%rd42+0], %f78;\n"
" ld.shared.f32 %f83, [%rd45+512];\n"
" add.ftz.f32 %f79, %f83, %f79;\n"
" st.shared.f32 [%rd42+512], %f79;\n"
" ld.shared.f32 %f84, [%rd45+1024];\n"
" add.ftz.f32 %f80, %f84, %f80;\n"
" st.shared.f32 [%rd42+1024], %f80;\n"
" ld.shared.f32 %f85, [%rd45+1536];\n"
" add.ftz.f32 %f81, %f85, %f81;\n"
" st.shared.f32 [%rd42+1536], %f81;\n"
"$Lt_0_23554:\n"
" shr.u32 %r57, %r57, 1;\n"
" mov.u32 %r60, 0;\n"
" setp.ne.u32 %p11, %r57, %r60;\n"
" @%p11 bra $Lt_0_23298;\n"
"$Lt_0_22786:\n"
" mov.f32 %f27, %f78;\n"
" mov.f32 %f26, %f79;\n"
" mov.f32 %f25, %f80;\n"
" mov.f32 %f28, %f81;\n"
" ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r62, 0;\n"
" setp.le.s32 %p12, %r61, %r62;\n"
" @%p12 bra $Lt_0_24322;\n"
" mov.f32 %f78, %f6;\n"
" st.shared.f32 [%rd42+0], %f78;\n"
" mov.f32 %f79, %f8;\n"
" st.shared.f32 [%rd42+512], %f79;\n"
" mov.f32 %f80, %f10;\n"
" st.shared.f32 [%rd42+1024], %f80;\n"
" mov.f32 %f81, %f12;\n"
" st.shared.f32 [%rd42+1536], %f81;\n"
" mov.f32 %f86, %f14;\n"
" st.shared.f32 [%rd42+2048], %f86;\n"
" mov.f32 %f87, %f15;\n"
" st.shared.f32 [%rd42+2560], %f87;\n"
" mov.s32 %r63, %r56;\n"
" @!%p9 bra $Lt_0_24834;\n"
"$Lt_0_25346:\n"
" setp.ge.u32 %p13, %r13, %r63;\n"
" @%p13 bra $Lt_0_25602;\n"
" add.u32 %r64, %r2, %r63;\n"
" cvt.u64.u32 %rd46, %r64;\n"
" mul.wide.u32 %rd47, %r64, 4;\n"
" add.u64 %rd48, %rd39, %rd47;\n"
" ld.shared.f32 %f88, [%rd48+0];\n"
" add.ftz.f32 %f78, %f88, %f78;\n"
" st.shared.f32 [%rd42+0], %f78;\n"
" ld.shared.f32 %f89, [%rd48+512];\n"
" add.ftz.f32 %f79, %f89, %f79;\n"
" st.shared.f32 [%rd42+512], %f79;\n"
" ld.shared.f32 %f90, [%rd48+1024];\n"
" add.ftz.f32 %f80, %f90, %f80;\n"
" st.shared.f32 [%rd42+1024], %f80;\n"
" ld.shared.f32 %f91, [%rd48+1536];\n"
" add.ftz.f32 %f81, %f91, %f81;\n"
" st.shared.f32 [%rd42+1536], %f81;\n"
" ld.shared.f32 %f92, [%rd48+2048];\n"
" add.ftz.f32 %f86, %f92, %f86;\n"
" st.shared.f32 [%rd42+2048], %f86;\n"
" ld.shared.f32 %f93, [%rd48+2560];\n"
" add.ftz.f32 %f87, %f93, %f87;\n"
" st.shared.f32 [%rd42+2560], %f87;\n"
"$Lt_0_25602:\n"
" shr.u32 %r63, %r63, 1;\n"
" mov.u32 %r65, 0;\n"
" setp.ne.u32 %p14, %r63, %r65;\n"
" @%p14 bra $Lt_0_25346;\n"
"$Lt_0_24834:\n"
" mov.f32 %f6, %f78;\n"
" mov.f32 %f8, %f79;\n"
" mov.f32 %f10, %f80;\n"
" mov.f32 %f12, %f81;\n"
" mov.f32 %f14, %f86;\n"
" mov.f32 %f16, %f87;\n"
"$Lt_0_24322:\n"
"$Lt_0_22274:\n"
" mov.u32 %r66, 0;\n"
" setp.ne.s32 %p15, %r13, %r66;\n"
" @%p15 bra $Lt_0_26370;\n"
" ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];\n"
" add.u64 %rd50, %rd49, %rd5;\n"
" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r68, 0;\n"
" setp.le.s32 %p16, %r67, %r68;\n"
" @%p16 bra $Lt_0_26882;\n"
" st.global.f32 [%rd50+0], %f28;\n"
" cvt.s64.s32 %rd51, %r9;\n"
" mul.wide.s32 %rd52, %r9, 4;\n"
" add.u64 %rd50, %rd50, %rd52;\n"
"$Lt_0_26882:\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p17, %r69, %r70;\n"
" @%p17 bra $Lt_0_27394;\n"
" mov.f32 %f94, %f6;\n"
" st.global.f32 [%rd50+0], %f94;\n"
" cvt.s64.s32 %rd53, %r9;\n"
" mul.wide.s32 %rd54, %r9, 4;\n"
" add.u64 %rd55, %rd54, %rd50;\n"
" mov.f32 %f95, %f8;\n"
" st.global.f32 [%rd55+0], %f95;\n"
" add.u64 %rd56, %rd54, %rd55;\n"
" mov.f32 %f96, %f10;\n"
" st.global.f32 [%rd56+0], %f96;\n"
" add.u64 %rd57, %rd54, %rd56;\n"
" mov.f32 %f97, %f12;\n"
" st.global.f32 [%rd57+0], %f97;\n"
" add.u64 %rd50, %rd54, %rd57;\n"
" mov.f32 %f98, %f14;\n"
" st.global.f32 [%rd50+0], %f98;\n"
" mov.f32 %f99, %f16;\n"
" add.u64 %rd58, %rd54, %rd50;\n"
" st.global.f32 [%rd58+0], %f99;\n"
"$Lt_0_27394:\n"
" ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];\n"
" mul.lo.u64 %rd60, %rd4, 16;\n"
" add.u64 %rd61, %rd59, %rd60;\n"
" mov.f32 %f100, %f101;\n"
" st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f100};\n"
"$Lt_0_26370:\n"
"$Lt_0_18690:\n"
" .loc 16 103 0\n"
" exit;\n"
"$LDWend_kernel_pair:\n"
" }\n"
" .entry kernel_pair_fast (\n"
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n"
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
" {\n"
" .reg .u32 %r<74>;\n"
" .reg .u64 %rd<75>;\n"
" .reg .f32 %f<109>;\n"
" .reg .pred %p<22>;\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32618_33_non_const_sp_lj3268[16];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_lj13296[1936];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32617_34_non_const_lj35232[1936];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32687_55_non_const_red_acc7168[3072];\n"
" .loc 16 111 0\n"
"$LDWbegin_kernel_pair_fast:\n"
" cvt.s32.u32 %r1, %tid.x;\n"
" mov.u32 %r2, 3;\n"
" setp.gt.s32 %p1, %r1, %r2;\n"
" @%p1 bra $Lt_1_20994;\n"
" .loc 16 119 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;\n"
" cvt.s64.s32 %rd2, %r1;\n"
" mul.wide.s32 %rd3, %r1, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.f32 %f1, [%rd5+0];\n"
" add.u64 %rd6, %rd3, %rd1;\n"
" st.shared.f32 [%rd6+0], %f1;\n"
"$Lt_1_20994:\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;\n"
" mov.u32 %r3, 120;\n"
" setp.gt.s32 %p2, %r1, %r3;\n"
" @%p2 bra $Lt_1_21506;\n"
" .loc 16 121 0\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_lj13296;\n"
" cvt.s64.s32 %rd8, %r1;\n"
" mul.wide.s32 %rd9, %r1, 16;\n"
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
" add.u64 %rd11, %rd10, %rd9;\n"
" add.u64 %rd12, %rd9, %rd7;\n"
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n"
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r5, 0;\n"
" setp.le.s32 %p3, %r4, %r5;\n"
" @%p3 bra $Lt_1_22018;\n"
" .loc 16 123 0\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232;\n"
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
" add.u64 %rd15, %rd14, %rd9;\n"
" add.u64 %rd16, %rd9, %rd13;\n"
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n"
"$Lt_1_22018:\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232;\n"
"$Lt_1_21506:\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232;\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_lj13296;\n"
" .loc 16 131 0\n"
" mov.f32 %f10, 0f00000000; \n"
" mov.f32 %f11, %f10;\n"
" mov.f32 %f12, 0f00000000; \n"
" mov.f32 %f13, %f12;\n"
" mov.f32 %f14, 0f00000000; \n"
" mov.f32 %f15, %f14;\n"
" mov.f32 %f16, 0f00000000; \n"
" mov.f32 %f17, %f16;\n"
" mov.f32 %f18, 0f00000000; \n"
" mov.f32 %f19, %f18;\n"
" mov.f32 %f20, 0f00000000; \n"
" mov.f32 %f21, %f20;\n"
" .loc 16 133 0\n"
" bar.sync 0;\n"
" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
" div.s32 %r7, %r1, %r6;\n"
" cvt.s32.u32 %r8, %ntid.x;\n"
" div.s32 %r9, %r8, %r6;\n"
" cvt.s32.u32 %r10, %ctaid.x;\n"
" mul.lo.s32 %r11, %r10, %r9;\n"
" add.s32 %r12, %r7, %r11;\n"
" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n"
" setp.ge.s32 %p4, %r12, %r13;\n"
" @%p4 bra $Lt_1_30210;\n"
" .loc 16 138 0\n"
" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
" cvt.s64.s32 %rd17, %r14;\n"
" mul.wide.s32 %rd18, %r14, 4;\n"
" cvt.s64.s32 %rd19, %r12;\n"
" mul.wide.s32 %rd20, %r12, 4;\n"
" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
" add.u64 %rd22, %rd20, %rd21;\n"
" add.u64 %rd23, %rd18, %rd22;\n"
" ld.global.s32 %r15, [%rd23+0];\n"
" sub.s32 %r16, %r6, 1;\n"
" and.b32 %r17, %r16, %r1;\n"
" cvt.s64.s32 %rd24, %r17;\n"
" mul.wide.s32 %rd25, %r17, 4;\n"
" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n"
" setp.ne.u64 %p5, %rd26, %rd21;\n"
" @%p5 bra $Lt_1_23298;\n"
" cvt.s32.s64 %r18, %rd17;\n"
" mul.lo.s32 %r19, %r18, %r6;\n"
" mov.s32 %r20, %r19;\n"
" mul.lo.s32 %r21, %r16, %r12;\n"
" add.s32 %r22, %r18, %r21;\n"
" cvt.s64.s32 %rd27, %r22;\n"
" mul.wide.s32 %rd28, %r22, 4;\n"
" add.u64 %rd29, %rd23, %rd28;\n"
" and.b32 %r23, %r16, %r15;\n"
" cvt.s64.s32 %rd30, %r23;\n"
" div.s32 %r24, %r15, %r6;\n"
" mul.lo.s32 %r25, %r19, %r24;\n"
" cvt.s64.s32 %rd31, %r25;\n"
" add.u64 %rd32, %rd30, %rd31;\n"
" mul.lo.u64 %rd33, %rd32, 4;\n"
" add.u64 %rd34, %rd29, %rd33;\n"
" add.u64 %rd35, %rd25, %rd29;\n"
" bra.uni $Lt_1_23042;\n"
"$Lt_1_23298:\n"
" add.u64 %rd36, %rd18, %rd23;\n"
" ld.global.s32 %r26, [%rd36+0];\n"
" cvt.s64.s32 %rd37, %r26;\n"
" mul.wide.s32 %rd38, %r26, 4;\n"
" add.u64 %rd39, %rd26, %rd38;\n"
" cvt.s64.s32 %rd40, %r15;\n"
" mul.wide.s32 %rd41, %r15, 4;\n"
" add.u64 %rd34, %rd39, %rd41;\n"
" mov.s32 %r20, %r6;\n"
" add.u64 %rd35, %rd25, %rd39;\n"
"$Lt_1_23042:\n"
" .loc 16 141 0\n"
" ld.global.s32 %r27, [%rd22+0];\n"
" mov.u32 %r28, %r27;\n"
" mov.s32 %r29, 0;\n"
" mov.u32 %r30, %r29;\n"
" mov.s32 %r31, 0;\n"
" mov.u32 %r32, %r31;\n"
" mov.s32 %r33, 0;\n"
" mov.u32 %r34, %r33;\n"
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];\n"
" mov.f32 %f26, %f22;\n"
" mov.f32 %f27, %f23;\n"
" mov.f32 %f28, %f24;\n"
" mov.f32 %f29, %f25;\n"
" setp.ge.u64 %p6, %rd35, %rd34;\n"
" @%p6 bra $Lt_1_31746;\n"
" cvt.rzi.ftz.s32.f32 %r35, %f29;\n"
" cvt.s64.s32 %rd42, %r20;\n"
" mul.lo.s32 %r36, %r35, 11;\n"
" cvt.rn.f32.s32 %f30, %r36;\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
"$Lt_1_24066:\n"
" .loc 16 148 0\n"
" ld.global.s32 %r37, [%rd35+0];\n"
" .loc 16 152 0\n"
" and.b32 %r38, %r37, 1073741823;\n"
" mov.u32 %r39, %r38;\n"
" mov.s32 %r40, 0;\n"
" mov.u32 %r41, %r40;\n"
" mov.s32 %r42, 0;\n"
" mov.u32 %r43, %r42;\n"
" mov.s32 %r44, 0;\n"
" mov.u32 %r45, %r44;\n"
" tex.1d.v4.f32.s32 {%f35,%f36,%f37,%f38},[pos_tex,{%r39,%r41,%r43,%r45}];\n"
" mov.f32 %f39, %f35;\n"
" mov.f32 %f40, %f36;\n"
" mov.f32 %f41, %f37;\n"
" mov.f32 %f42, %f38;\n"
" sub.ftz.f32 %f43, %f27, %f40;\n"
" sub.ftz.f32 %f44, %f26, %f39;\n"
" sub.ftz.f32 %f45, %f28, %f41;\n"
" mul.ftz.f32 %f46, %f43, %f43;\n"
" fma.rn.ftz.f32 %f47, %f44, %f44, %f46;\n"
" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n"
" add.ftz.f32 %f49, %f30, %f42;\n"
" cvt.rzi.ftz.s32.f32 %r46, %f49;\n"
" cvt.s64.s32 %rd43, %r46;\n"
" mul.wide.s32 %rd44, %r46, 16;\n"
" add.u64 %rd45, %rd44, %rd7;\n"
" ld.shared.f32 %f50, [%rd45+8];\n"
" setp.gt.ftz.f32 %p7, %f50, %f48;\n"
" @!%p7 bra $Lt_1_25346;\n"
" .loc 16 165 0\n"
" rcp.approx.ftz.f32 %f51, %f48;\n"
" mul.ftz.f32 %f52, %f51, %f51;\n"
" mul.ftz.f32 %f53, %f51, %f52;\n"
" sqrt.approx.ftz.f32 %f54, %f53;\n"
" mul.ftz.f32 %f55, %f51, %f53;\n"
" ld.shared.v2.f32 {%f56,%f57}, [%rd45+0];\n"
" mul.ftz.f32 %f58, %f56, %f54;\n"
" sub.ftz.f32 %f59, %f58, %f57;\n"
" mul.ftz.f32 %f60, %f55, %f59;\n"
" .loc 16 167 0\n"
" fma.rn.ftz.f32 %f33, %f44, %f60, %f33;\n"
" .loc 16 168 0\n"
" fma.rn.ftz.f32 %f32, %f43, %f60, %f32;\n"
" .loc 16 169 0\n"
" fma.rn.ftz.f32 %f31, %f45, %f60, %f31;\n"
" ld.param.s32 %r47, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r48, 0;\n"
" setp.le.s32 %p8, %r47, %r48;\n"
" @%p8 bra $Lt_1_24834;\n"
" .loc 16 172 0\n"
" add.u64 %rd46, %rd44, %rd13;\n"
" ld.shared.v4.f32 {%f61,%f62,%f63,_}, [%rd46+0];\n"
" mul.ftz.f32 %f64, %f61, %f54;\n"
" sub.ftz.f32 %f65, %f64, %f62;\n"
" mul.ftz.f32 %f66, %f53, %f65;\n"
" .loc 16 173 0\n"
" shr.s32 %r49, %r37, 30;\n"
" and.b32 %r50, %r49, 3;\n"
" cvt.s64.s32 %rd47, %r50;\n"
" mul.wide.s32 %rd48, %r50, 4;\n"
" add.u64 %rd49, %rd1, %rd48;\n"
" ld.shared.f32 %f67, [%rd49+0];\n"
" sub.ftz.f32 %f68, %f66, %f63;\n"
" fma.rn.ftz.f32 %f34, %f67, %f68, %f34;\n"
"$Lt_1_24834:\n"
" ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r52, 0;\n"
" setp.le.s32 %p9, %r51, %r52;\n"
" @%p9 bra $Lt_1_25346;\n"
" .loc 16 176 0\n"
" mov.f32 %f69, %f11;\n"
" mul.ftz.f32 %f70, %f44, %f44;\n"
" fma.rn.ftz.f32 %f71, %f60, %f70, %f69;\n"
" mov.f32 %f11, %f71;\n"
" .loc 16 177 0\n"
" mov.f32 %f72, %f13;\n"
" fma.rn.ftz.f32 %f73, %f60, %f46, %f72;\n"
" mov.f32 %f13, %f73;\n"
" .loc 16 178 0\n"
" mov.f32 %f74, %f15;\n"
" mul.ftz.f32 %f75, %f45, %f45;\n"
" fma.rn.ftz.f32 %f76, %f60, %f75, %f74;\n"
" mov.f32 %f15, %f76;\n"
" .loc 16 179 0\n"
" mov.f32 %f77, %f17;\n"
" mul.ftz.f32 %f78, %f43, %f44;\n"
" fma.rn.ftz.f32 %f79, %f60, %f78, %f77;\n"
" mov.f32 %f17, %f79;\n"
" .loc 16 180 0\n"
" mov.f32 %f80, %f19;\n"
" mul.ftz.f32 %f81, %f44, %f45;\n"
" fma.rn.ftz.f32 %f82, %f60, %f81, %f80;\n"
" mov.f32 %f19, %f82;\n"
" .loc 16 181 0\n"
" mul.ftz.f32 %f83, %f43, %f45;\n"
" fma.rn.ftz.f32 %f20, %f60, %f83, %f20;\n"
" mov.f32 %f21, %f20;\n"
"$Lt_1_25346:\n"
"$Lt_1_24322:\n"
" .loc 16 146 0\n"
" mul.lo.u64 %rd50, %rd42, 4;\n"
" add.u64 %rd35, %rd35, %rd50;\n"
" setp.lt.u64 %p10, %rd35, %rd34;\n"
" @%p10 bra $Lt_1_24066;\n"
" bra.uni $Lt_1_23554;\n"
"$Lt_1_31746:\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
"$Lt_1_23554:\n"
" mov.u32 %r53, 1;\n"
" setp.le.s32 %p11, %r6, %r53;\n"
" @%p11 bra $Lt_1_28162;\n"
" .loc 16 186 0\n"
" mov.u64 %rd51, __cuda___cuda_local_var_32687_55_non_const_red_acc7168;\n"
" cvt.s64.s32 %rd52, %r1;\n"
" mul.wide.s32 %rd53, %r1, 4;\n"
" add.u64 %rd54, %rd51, %rd53;\n"
" mov.f32 %f84, %f33;\n"
" st.shared.f32 [%rd54+0], %f84;\n"
" mov.f32 %f85, %f32;\n"
" st.shared.f32 [%rd54+512], %f85;\n"
" mov.f32 %f86, %f31;\n"
" st.shared.f32 [%rd54+1024], %f86;\n"
" mov.f32 %f87, %f34;\n"
" st.shared.f32 [%rd54+1536], %f87;\n"
" shr.s32 %r54, %r6, 31;\n"
" mov.s32 %r55, 1;\n"
" and.b32 %r56, %r54, %r55;\n"
" add.s32 %r57, %r56, %r6;\n"
" shr.s32 %r58, %r57, 1;\n"
" mov.s32 %r59, %r58;\n"
" mov.u32 %r60, 0;\n"
" setp.ne.u32 %p12, %r58, %r60;\n"
" @!%p12 bra $Lt_1_26626;\n"
"$Lt_1_27138:\n"
" setp.ge.u32 %p13, %r17, %r59;\n"
" @%p13 bra $Lt_1_27394;\n"
" add.u32 %r61, %r1, %r59;\n"
" cvt.u64.u32 %rd55, %r61;\n"
" mul.wide.u32 %rd56, %r61, 4;\n"
" add.u64 %rd57, %rd51, %rd56;\n"
" ld.shared.f32 %f88, [%rd57+0];\n"
" add.ftz.f32 %f84, %f88, %f84;\n"
" st.shared.f32 [%rd54+0], %f84;\n"
" ld.shared.f32 %f89, [%rd57+512];\n"
" add.ftz.f32 %f85, %f89, %f85;\n"
" st.shared.f32 [%rd54+512], %f85;\n"
" ld.shared.f32 %f90, [%rd57+1024];\n"
" add.ftz.f32 %f86, %f90, %f86;\n"
" st.shared.f32 [%rd54+1024], %f86;\n"
" ld.shared.f32 %f91, [%rd57+1536];\n"
" add.ftz.f32 %f87, %f91, %f87;\n"
" st.shared.f32 [%rd54+1536], %f87;\n"
"$Lt_1_27394:\n"
" shr.u32 %r59, %r59, 1;\n"
" mov.u32 %r62, 0;\n"
" setp.ne.u32 %p14, %r59, %r62;\n"
" @%p14 bra $Lt_1_27138;\n"
"$Lt_1_26626:\n"
" mov.f32 %f33, %f84;\n"
" mov.f32 %f32, %f85;\n"
" mov.f32 %f31, %f86;\n"
" mov.f32 %f34, %f87;\n"
" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r64, 0;\n"
" setp.le.s32 %p15, %r63, %r64;\n"
" @%p15 bra $Lt_1_28162;\n"
" mov.f32 %f84, %f11;\n"
" st.shared.f32 [%rd54+0], %f84;\n"
" mov.f32 %f85, %f13;\n"
" st.shared.f32 [%rd54+512], %f85;\n"
" mov.f32 %f86, %f15;\n"
" st.shared.f32 [%rd54+1024], %f86;\n"
" mov.f32 %f87, %f17;\n"
" st.shared.f32 [%rd54+1536], %f87;\n"
" mov.f32 %f92, %f19;\n"
" st.shared.f32 [%rd54+2048], %f92;\n"
" mov.f32 %f93, %f20;\n"
" st.shared.f32 [%rd54+2560], %f93;\n"
" mov.s32 %r65, %r58;\n"
" @!%p12 bra $Lt_1_28674;\n"
"$Lt_1_29186:\n"
" setp.ge.u32 %p16, %r17, %r65;\n"
" @%p16 bra $Lt_1_29442;\n"
" add.u32 %r66, %r1, %r65;\n"
" cvt.u64.u32 %rd58, %r66;\n"
" mul.wide.u32 %rd59, %r66, 4;\n"
" add.u64 %rd60, %rd51, %rd59;\n"
" ld.shared.f32 %f94, [%rd60+0];\n"
" add.ftz.f32 %f84, %f94, %f84;\n"
" st.shared.f32 [%rd54+0], %f84;\n"
" ld.shared.f32 %f95, [%rd60+512];\n"
" add.ftz.f32 %f85, %f95, %f85;\n"
" st.shared.f32 [%rd54+512], %f85;\n"
" ld.shared.f32 %f96, [%rd60+1024];\n"
" add.ftz.f32 %f86, %f96, %f86;\n"
" st.shared.f32 [%rd54+1024], %f86;\n"
" ld.shared.f32 %f97, [%rd60+1536];\n"
" add.ftz.f32 %f87, %f97, %f87;\n"
" st.shared.f32 [%rd54+1536], %f87;\n"
" ld.shared.f32 %f98, [%rd60+2048];\n"
" add.ftz.f32 %f92, %f98, %f92;\n"
" st.shared.f32 [%rd54+2048], %f92;\n"
" ld.shared.f32 %f99, [%rd60+2560];\n"
" add.ftz.f32 %f93, %f99, %f93;\n"
" st.shared.f32 [%rd54+2560], %f93;\n"
"$Lt_1_29442:\n"
" shr.u32 %r65, %r65, 1;\n"
" mov.u32 %r67, 0;\n"
" setp.ne.u32 %p17, %r65, %r67;\n"
" @%p17 bra $Lt_1_29186;\n"
"$Lt_1_28674:\n"
" mov.f32 %f11, %f84;\n"
" mov.f32 %f13, %f85;\n"
" mov.f32 %f15, %f86;\n"
" mov.f32 %f17, %f87;\n"
" mov.f32 %f19, %f92;\n"
" mov.f32 %f21, %f93;\n"
"$Lt_1_28162:\n"
"$Lt_1_26114:\n"
" mov.u32 %r68, 0;\n"
" setp.ne.s32 %p18, %r17, %r68;\n"
" @%p18 bra $Lt_1_30210;\n"
" ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];\n"
" add.u64 %rd62, %rd61, %rd20;\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p19, %r69, %r70;\n"
" @%p19 bra $Lt_1_30722;\n"
" st.global.f32 [%rd62+0], %f34;\n"
" cvt.s64.s32 %rd63, %r13;\n"
" mul.wide.s32 %rd64, %r13, 4;\n"
" add.u64 %rd62, %rd62, %rd64;\n"
"$Lt_1_30722:\n"
" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r72, 0;\n"
" setp.le.s32 %p20, %r71, %r72;\n"
" @%p20 bra $Lt_1_31234;\n"
" mov.f32 %f100, %f11;\n"
" st.global.f32 [%rd62+0], %f100;\n"
" cvt.s64.s32 %rd65, %r13;\n"
" mul.wide.s32 %rd66, %r13, 4;\n"
" add.u64 %rd67, %rd66, %rd62;\n"
" mov.f32 %f101, %f13;\n"
" st.global.f32 [%rd67+0], %f101;\n"
" add.u64 %rd68, %rd66, %rd67;\n"
" mov.f32 %f102, %f15;\n"
" st.global.f32 [%rd68+0], %f102;\n"
" add.u64 %rd69, %rd66, %rd68;\n"
" mov.f32 %f103, %f17;\n"
" st.global.f32 [%rd69+0], %f103;\n"
" add.u64 %rd62, %rd66, %rd69;\n"
" mov.f32 %f104, %f19;\n"
" st.global.f32 [%rd62+0], %f104;\n"
" mov.f32 %f105, %f21;\n"
" add.u64 %rd70, %rd66, %rd62;\n"
" st.global.f32 [%rd70+0], %f105;\n"
"$Lt_1_31234:\n"
" ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];\n"
" mul.lo.u64 %rd72, %rd19, 16;\n"
" add.u64 %rd73, %rd71, %rd72;\n"
" mov.f32 %f106, %f107;\n"
" st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f106};\n"
"$Lt_1_30210:\n"
"$Lt_1_22530:\n"
" .loc 16 189 0\n"
" exit;\n"
"$LDWend_kernel_pair_fast:\n"
" }\n"
;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,912 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009ccd_00000000-9_lal_lj_expand.cpp3.i (/home/sjplimp/ccBI#.06ur5E)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009ccd_00000000-8_lal_lj_expand.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lal_lj_expand.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
// Module-scope texture reference. The kernels in this file sample it with
// tex.1d.v4.f32.s32, i.e. a 1D fetch by signed integer index returning four
// f32 components; the first three are used as coordinates and the fourth is
// converted to an integer table index.
.global .texref pos_tex;
//-----------------------------------------------------------
// kernel_pair
//
// Compiler-generated (nvopencc) PTX for the pair kernel in lal_lj_expand.cu
// (.file 16; the ".loc 16 N 0" markers give the source line).
//
// Work decomposition: t_per_atom consecutive threads cooperate on one
// output slot ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom).
// Each thread walks a strided subset of that slot's neighbor entries,
// accumulates partial sums, the group reduces them through shared memory,
// and the thread with (tid.x & (t_per_atom-1)) == 0 writes the results.
//
// Parameters (as used by the code below):
//   x_          not loaded anywhere in this kernel; coordinates are fetched
//               through the pos_tex texture reference instead.
//   lj1         global table of 16-byte (4 x f32) entries indexed by
//               lj_types * int(w_i) + int(w_j). Components 0 and 1 are
//               coefficients, component 2 is compared against the squared
//               distance, component 3 is subtracted from the distance.
//   lj3         global table with the same indexing; components 0..2 are
//               read only when eflag > 0.
//   lj_types    row stride (in entries) of the lj1/lj3 tables.
//   sp_lj_in    four f32 factors copied to shared memory; one is selected
//               per neighbor by the top two bits of the neighbor entry.
//   dev_nbor    neighbor bookkeeping array of s32, rows nbor_pitch apart.
//   dev_packed  neighbor storage; compared by address against dev_nbor to
//               choose between two list layouts (see below).
//   ans         output, 16 bytes per slot: three accumulated sums plus a
//               fourth component that is never initialised (see NOTE at end).
//   engv        output for the eflag/vflag accumulators, stride inum floats.
//   eflag,vflag values > 0 enable the respective accumulation and stores.
//   inum        number of slots; threads with ii >= inum exit immediately.
//   nbor_pitch  stride, in s32 elements, between rows of dev_nbor.
//   t_per_atom  threads cooperating per slot.
//               NOTE(review): the "& (t_per_atom-1)" lane computation only
//               equals tid % t_per_atom for powers of two - confirm callers.
//-----------------------------------------------------------
.entry kernel_pair (
.param .u64 __cudaparm_kernel_pair_x_,
.param .u64 __cudaparm_kernel_pair_lj1,
.param .u64 __cudaparm_kernel_pair_lj3,
.param .s32 __cudaparm_kernel_pair_lj_types,
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_dev_nbor,
.param .u64 __cudaparm_kernel_pair_dev_packed,
.param .u64 __cudaparm_kernel_pair_ans,
.param .u64 __cudaparm_kernel_pair___val_paramengv,
.param .s32 __cudaparm_kernel_pair_eflag,
.param .s32 __cudaparm_kernel_pair_vflag,
.param .s32 __cudaparm_kernel_pair_inum,
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_t_per_atom)
{
.reg .u32 %r<72>;
.reg .u64 %rd<63>;
.reg .f32 %f<107>;
.reg .pred %p<19>;
// Shared copy of the four sp_lj_in factors (16 bytes).
.shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];
// Shared reduction scratch: 3072 bytes, addressed below as six rows that
// are 512 bytes (128 f32) apart, one column per thread.
.shared .align 4 .b8 __cuda___cuda_local_var_32603_55_non_const_red_acc108[3072];
// __cuda_local_var_32543_10_non_const_f = 48
// __cuda_local_var_32545_9_non_const_virial = 16
.loc 16 31 0
$LDWbegin_kernel_pair:
// Every thread copies sp_lj_in[0..3] from global to shared memory. There is
// no thread-id guard and no bar.sync in this kernel; all threads store the
// same four values.
.loc 16 36 0
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
ldu.global.f32 %f1, [%rd1+0];
.loc 16 37 0
ld.global.f32 %f2, [%rd1+4];
.loc 16 38 0
ld.global.f32 %f3, [%rd1+8];
.loc 16 39 0
ld.global.f32 %f4, [%rd1+12];
st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
// Zero the six accumulators that are updated in the vflag branch of the
// loop: %f6, %f8, %f10, %f12, %f14, %f16 (with %f15 mirroring %f16).
.loc 16 46 0
mov.f32 %f5, 0f00000000; // 0
mov.f32 %f6, %f5;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, %f7;
mov.f32 %f9, 0f00000000; // 0
mov.f32 %f10, %f9;
mov.f32 %f11, 0f00000000; // 0
mov.f32 %f12, %f11;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, %f13;
mov.f32 %f15, 0f00000000; // 0
mov.f32 %f16, %f15;
// Slot index: %r8 = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom).
// Threads whose slot is >= inum branch straight to the exit label.
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
cvt.s32.u32 %r6, %ctaid.x;
mul.lo.s32 %r7, %r6, %r5;
add.s32 %r8, %r3, %r7;
ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];
setp.ge.s32 %p1, %r8, %r9;
@%p1 bra $Lt_0_26370;
// Neighbor bookkeeping for this slot:
//   %rd7 = &dev_nbor[ii]               (row 0)
//   %rd8 = &dev_nbor[ii + nbor_pitch]  (row 1), its value goes to %r11 and
//          is used below to compute the end of the neighbor range
//   %r13 = (t_per_atom - 1) & tid.x    (this thread's lane within the group)
.loc 16 51 0
ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];
cvt.s64.s32 %rd2, %r10;
mul.wide.s32 %rd3, %r10, 4;
cvt.s64.s32 %rd4, %r8;
mul.wide.s32 %rd5, %r8, 4;
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
add.u64 %rd7, %rd5, %rd6;
add.u64 %rd8, %rd3, %rd7;
ld.global.s32 %r11, [%rd8+0];
sub.s32 %r12, %r1, 1;
and.b32 %r13, %r12, %r2;
cvt.s64.s32 %rd9, %r13;
mul.wide.s32 %rd10, %r13, 4;
ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];
setp.ne.u64 %p2, %rd11, %rd6;
@%p2 bra $Lt_0_19458;
// Layout A (dev_packed == dev_nbor): entries live inside dev_nbor.
//   stride %r16 = nbor_pitch * t_per_atom elements
//   base   %rd14 = %rd8 + 4 * (nbor_pitch + (t_per_atom-1) * ii)
//   end    %rd19 = base + 4 * ((%r11 & (t_per_atom-1))
//                              + stride * (%r11 / t_per_atom))
//   start  %rd20 = base + 4 * lane
cvt.s32.s64 %r14, %rd2;
mul.lo.s32 %r15, %r14, %r1;
mov.s32 %r16, %r15;
mul.lo.s32 %r17, %r12, %r8;
add.s32 %r18, %r14, %r17;
cvt.s64.s32 %rd12, %r18;
mul.wide.s32 %rd13, %r18, 4;
add.u64 %rd14, %rd8, %rd13;
and.b32 %r19, %r12, %r11;
cvt.s64.s32 %rd15, %r19;
div.s32 %r20, %r11, %r1;
mul.lo.s32 %r21, %r15, %r20;
cvt.s64.s32 %rd16, %r21;
add.u64 %rd17, %rd15, %rd16;
mul.lo.u64 %rd18, %rd17, 4;
add.u64 %rd19, %rd14, %rd18;
add.u64 %rd20, %rd10, %rd14;
bra.uni $Lt_0_19202;
$Lt_0_19458:
// Layout B (dev_packed != dev_nbor): row 2 of dev_nbor holds an element
// offset (%r22) into dev_packed.
//   base   %rd24 = dev_packed + 4 * %r22
//   end    %rd19 = base + 4 * %r11
//   stride %r16  = t_per_atom elements
//   start  %rd20 = base + 4 * lane
add.u64 %rd21, %rd3, %rd8;
ld.global.s32 %r22, [%rd21+0];
cvt.s64.s32 %rd22, %r22;
mul.wide.s32 %rd23, %r22, 4;
add.u64 %rd24, %rd11, %rd23;
cvt.s64.s32 %rd25, %r11;
mul.wide.s32 %rd26, %r11, 4;
add.u64 %rd19, %rd24, %rd26;
mov.s32 %r16, %r1;
add.u64 %rd20, %rd10, %rd24;
$Lt_0_19202:
// Row 0 of dev_nbor gives the texture index for this slot; fetch its four
// components into %f21..%f24 (x, y, z, w).
.loc 16 54 0
ld.global.s32 %r23, [%rd7+0];
mov.u32 %r24, %r23;
mov.s32 %r25, 0;
mov.u32 %r26, %r25;
mov.s32 %r27, 0;
mov.u32 %r28, %r27;
mov.s32 %r29, 0;
mov.u32 %r30, %r29;
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];
mov.f32 %f21, %f17;
mov.f32 %f22, %f18;
mov.f32 %f23, %f19;
mov.f32 %f24, %f20;
// Nothing to do for this thread when start >= end; that path only zeroes
// the four running sums.
setp.ge.u64 %p3, %rd20, %rd19;
@%p3 bra $Lt_0_27906;
// Loop invariants: %r33 = lj_types * int(w_i) (table row offset, w
// truncated toward zero), %rd27 = stride, %rd28 = lj1 base, %rd29 = shared
// sp_lj base. %f27/%f26/%f25 accumulate the x/y/z sums, %f28 the eflag sum.
cvt.rzi.ftz.s32.f32 %r31, %f24;
cvt.s64.s32 %rd27, %r16;
ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];
mul.lo.s32 %r33, %r32, %r31;
ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;
$Lt_0_20226:
//<loop> Loop body line 54, nesting depth: 1, estimated iterations: unknown
// Load one packed neighbor entry.
.loc 16 60 0
ld.global.s32 %r34, [%rd20+0];
// Bits 31:30 of the entry select one of the four shared sp_lj factors.
.loc 16 61 0
shr.s32 %r35, %r34, 30;
and.b32 %r36, %r35, 3;
cvt.s64.s32 %rd30, %r36;
mul.wide.s32 %rd31, %r36, 4;
add.u64 %rd32, %rd29, %rd31;
ld.shared.f32 %f29, [%rd32+0];
// Low 30 bits (mask 0x3FFFFFFF) are the neighbor's texture index.
.loc 16 64 0
and.b32 %r37, %r34, 1073741823;
mov.u32 %r38, %r37;
mov.s32 %r39, 0;
mov.u32 %r40, %r39;
mov.s32 %r41, 0;
mov.u32 %r42, %r41;
mov.s32 %r43, 0;
mov.u32 %r44, %r43;
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];
mov.f32 %f34, %f30;
mov.f32 %f35, %f31;
mov.f32 %f36, %f32;
mov.f32 %f37, %f33;
// %f39/%f38/%f40 = dx/dy/dz (slot minus neighbor); %f43 = dx^2+dy^2+dz^2.
cvt.rzi.ftz.s32.f32 %r45, %f37;
sub.ftz.f32 %f38, %f22, %f35;
sub.ftz.f32 %f39, %f21, %f34;
sub.ftz.f32 %f40, %f23, %f36;
mul.ftz.f32 %f41, %f38, %f38;
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
// Table entry index = int(w_j) + lj_types * int(w_i); entries are 16 bytes.
// Component 2 of the lj1 entry is the threshold for the squared distance;
// pairs with %f43 at or beyond it (or unordered) skip the interaction.
add.s32 %r46, %r45, %r33;
cvt.s64.s32 %rd33, %r46;
mul.wide.s32 %rd34, %r46, 16;
add.u64 %rd35, %rd34, %rd28;
ld.global.f32 %f44, [%rd35+8];
setp.gt.ftz.f32 %p4, %f44, %f43;
@!%p4 bra $Lt_0_21506;
// r = sqrt(rsq); shifted distance %f49 = r - lj1.w.
.loc 16 76 0
sqrt.approx.ftz.f32 %f45, %f43;
ld.global.v4.f32 {%f46,%f47,_,%f48}, [%rd35+0];
sub.ftz.f32 %f49, %f45, %f48;
// %f53 = (1 / %f49^2)^3;
// %f59 = (factor / %f49 / r) * %f53 * (lj1.x * %f53 - lj1.y)
// is the scalar that multiplies dx/dy/dz below.
.loc 16 81 0
mul.ftz.f32 %f50, %f49, %f49;
rcp.approx.ftz.f32 %f51, %f50;
mul.ftz.f32 %f52, %f51, %f51;
mul.ftz.f32 %f53, %f51, %f52;
div.approx.ftz.f32 %f54, %f29, %f49;
div.approx.ftz.f32 %f55, %f54, %f45;
mul.ftz.f32 %f56, %f46, %f53;
sub.ftz.f32 %f57, %f56, %f47;
mul.ftz.f32 %f58, %f53, %f57;
mul.ftz.f32 %f59, %f55, %f58;
.loc 16 83 0
fma.rn.ftz.f32 %f27, %f39, %f59, %f27;
.loc 16 84 0
fma.rn.ftz.f32 %f26, %f38, %f59, %f26;
.loc 16 85 0
fma.rn.ftz.f32 %f25, %f40, %f59, %f25;
// eflag > 0: %f28 += factor * (%f53 * (lj3.x * %f53 - lj3.y) - lj3.z).
ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];
mov.u32 %r48, 0;
setp.le.s32 %p5, %r47, %r48;
@%p5 bra $Lt_0_20994;
.loc 16 89 0
ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];
add.u64 %rd37, %rd36, %rd34;
ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd37+0];
mul.ftz.f32 %f63, %f60, %f53;
sub.ftz.f32 %f64, %f63, %f61;
mul.ftz.f32 %f65, %f53, %f64;
sub.ftz.f32 %f66, %f65, %f62;
fma.rn.ftz.f32 %f28, %f29, %f66, %f28;
$Lt_0_20994:
// vflag > 0: accumulate %f59 times the six pairwise products, in order
// dx*dx, dy*dy, dz*dz, dx*dy, dx*dz, dy*dz.
ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];
mov.u32 %r50, 0;
setp.le.s32 %p6, %r49, %r50;
@%p6 bra $Lt_0_21506;
.loc 16 92 0
mov.f32 %f67, %f6;
mul.ftz.f32 %f68, %f39, %f39;
fma.rn.ftz.f32 %f69, %f59, %f68, %f67;
mov.f32 %f6, %f69;
.loc 16 93 0
mov.f32 %f70, %f8;
fma.rn.ftz.f32 %f71, %f59, %f41, %f70;
mov.f32 %f8, %f71;
.loc 16 94 0
mov.f32 %f72, %f10;
mul.ftz.f32 %f73, %f40, %f40;
fma.rn.ftz.f32 %f74, %f59, %f73, %f72;
mov.f32 %f10, %f74;
.loc 16 95 0
mov.f32 %f75, %f12;
mul.ftz.f32 %f76, %f38, %f39;
fma.rn.ftz.f32 %f77, %f59, %f76, %f75;
mov.f32 %f12, %f77;
.loc 16 96 0
mov.f32 %f78, %f14;
mul.ftz.f32 %f79, %f39, %f40;
fma.rn.ftz.f32 %f80, %f59, %f79, %f78;
mov.f32 %f14, %f80;
.loc 16 97 0
mul.ftz.f32 %f81, %f38, %f40;
fma.rn.ftz.f32 %f15, %f59, %f81, %f15;
mov.f32 %f16, %f15;
$Lt_0_21506:
$Lt_0_20482:
// Advance this thread's pointer by stride * 4 bytes; loop while below end.
.loc 16 58 0
mul.lo.u64 %rd38, %rd27, 4;
add.u64 %rd20, %rd20, %rd38;
setp.lt.u64 %p7, %rd20, %rd19;
@%p7 bra $Lt_0_20226;
bra.uni $Lt_0_19714;
$Lt_0_27906:
// Empty-range path: running sums start (and stay) at zero.
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
$Lt_0_19714:
// The cross-thread reduction is skipped entirely when t_per_atom <= 1.
mov.u32 %r51, 1;
setp.le.s32 %p8, %r1, %r51;
@%p8 bra $Lt_0_24322;
// Publish this thread's four sums into column tid.x of red_acc rows 0..3
// (rows are 512 bytes apart).
.loc 16 102 0
mov.u64 %rd39, __cuda___cuda_local_var_32603_55_non_const_red_acc108;
cvt.s64.s32 %rd40, %r2;
mul.wide.s32 %rd41, %r2, 4;
add.u64 %rd42, %rd39, %rd41;
mov.f32 %f82, %f27;
st.shared.f32 [%rd42+0], %f82;
mov.f32 %f83, %f26;
st.shared.f32 [%rd42+512], %f83;
mov.f32 %f84, %f25;
st.shared.f32 [%rd42+1024], %f84;
mov.f32 %f85, %f28;
st.shared.f32 [%rd42+1536], %f85;
// %r56 = t_per_atom / 2 as a signed divide (sign-bit correction, then
// arithmetic shift); %p9 remembers whether that initial step is non-zero.
shr.s32 %r52, %r1, 31;
mov.s32 %r53, 1;
and.b32 %r54, %r52, %r53;
add.s32 %r55, %r54, %r1;
shr.s32 %r56, %r55, 1;
mov.s32 %r57, %r56;
mov.u32 %r58, 0;
setp.ne.u32 %p9, %r56, %r58;
@!%p9 bra $Lt_0_22786;
$Lt_0_23298:
// Halving reduction: lanes below the current step %r57 add the column at
// tid.x + step into their own column, then the step is halved.
// NOTE(review): there is no barrier between steps; this presumably relies
// on the cooperating threads executing in lock-step - confirm.
setp.ge.u32 %p10, %r13, %r57;
@%p10 bra $Lt_0_23554;
add.u32 %r59, %r2, %r57;
cvt.u64.u32 %rd43, %r59;
mul.wide.u32 %rd44, %r59, 4;
add.u64 %rd45, %rd39, %rd44;
ld.shared.f32 %f86, [%rd45+0];
add.ftz.f32 %f82, %f86, %f82;
st.shared.f32 [%rd42+0], %f82;
ld.shared.f32 %f87, [%rd45+512];
add.ftz.f32 %f83, %f87, %f83;
st.shared.f32 [%rd42+512], %f83;
ld.shared.f32 %f88, [%rd45+1024];
add.ftz.f32 %f84, %f88, %f84;
st.shared.f32 [%rd42+1024], %f84;
ld.shared.f32 %f89, [%rd45+1536];
add.ftz.f32 %f85, %f89, %f85;
st.shared.f32 [%rd42+1536], %f85;
$Lt_0_23554:
shr.u32 %r57, %r57, 1;
mov.u32 %r60, 0;
setp.ne.u32 %p11, %r57, %r60;
@%p11 bra $Lt_0_23298;
$Lt_0_22786:
// Copy the reduced values back into the running-sum registers.
mov.f32 %f27, %f82;
mov.f32 %f26, %f83;
mov.f32 %f25, %f84;
mov.f32 %f28, %f85;
// Second reduction, only when vflag > 0: same scheme over all six rows of
// red_acc, seeded with the six vflag accumulators.
ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];
mov.u32 %r62, 0;
setp.le.s32 %p12, %r61, %r62;
@%p12 bra $Lt_0_24322;
mov.f32 %f82, %f6;
st.shared.f32 [%rd42+0], %f82;
mov.f32 %f83, %f8;
st.shared.f32 [%rd42+512], %f83;
mov.f32 %f84, %f10;
st.shared.f32 [%rd42+1024], %f84;
mov.f32 %f85, %f12;
st.shared.f32 [%rd42+1536], %f85;
mov.f32 %f90, %f14;
st.shared.f32 [%rd42+2048], %f90;
mov.f32 %f91, %f15;
st.shared.f32 [%rd42+2560], %f91;
mov.s32 %r63, %r56;
@!%p9 bra $Lt_0_24834;
$Lt_0_25346:
setp.ge.u32 %p13, %r13, %r63;
@%p13 bra $Lt_0_25602;
add.u32 %r64, %r2, %r63;
cvt.u64.u32 %rd46, %r64;
mul.wide.u32 %rd47, %r64, 4;
add.u64 %rd48, %rd39, %rd47;
ld.shared.f32 %f92, [%rd48+0];
add.ftz.f32 %f82, %f92, %f82;
st.shared.f32 [%rd42+0], %f82;
ld.shared.f32 %f93, [%rd48+512];
add.ftz.f32 %f83, %f93, %f83;
st.shared.f32 [%rd42+512], %f83;
ld.shared.f32 %f94, [%rd48+1024];
add.ftz.f32 %f84, %f94, %f84;
st.shared.f32 [%rd42+1024], %f84;
ld.shared.f32 %f95, [%rd48+1536];
add.ftz.f32 %f85, %f95, %f85;
st.shared.f32 [%rd42+1536], %f85;
ld.shared.f32 %f96, [%rd48+2048];
add.ftz.f32 %f90, %f96, %f90;
st.shared.f32 [%rd42+2048], %f90;
ld.shared.f32 %f97, [%rd48+2560];
add.ftz.f32 %f91, %f97, %f91;
st.shared.f32 [%rd42+2560], %f91;
$Lt_0_25602:
shr.u32 %r63, %r63, 1;
mov.u32 %r65, 0;
setp.ne.u32 %p14, %r63, %r65;
@%p14 bra $Lt_0_25346;
$Lt_0_24834:
mov.f32 %f6, %f82;
mov.f32 %f8, %f83;
mov.f32 %f10, %f84;
mov.f32 %f12, %f85;
mov.f32 %f14, %f90;
mov.f32 %f16, %f91;
$Lt_0_24322:
$Lt_0_22274:
// Only lane 0 of each group writes results.
mov.u32 %r66, 0;
setp.ne.s32 %p15, %r13, %r66;
@%p15 bra $Lt_0_26370;
// %rd50 = &engv[ii]; each value written advances the pointer by inum floats.
ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];
add.u64 %rd50, %rd49, %rd5;
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
mov.u32 %r68, 0;
setp.le.s32 %p16, %r67, %r68;
@%p16 bra $Lt_0_26882;
st.global.f32 [%rd50+0], %f28;
cvt.s64.s32 %rd51, %r9;
mul.wide.s32 %rd52, %r9, 4;
add.u64 %rd50, %rd50, %rd52;
$Lt_0_26882:
// vflag > 0: store the six reduced accumulators at successive inum strides.
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
mov.u32 %r70, 0;
setp.le.s32 %p17, %r69, %r70;
@%p17 bra $Lt_0_27394;
mov.f32 %f98, %f6;
st.global.f32 [%rd50+0], %f98;
cvt.s64.s32 %rd53, %r9;
mul.wide.s32 %rd54, %r9, 4;
add.u64 %rd55, %rd54, %rd50;
mov.f32 %f99, %f8;
st.global.f32 [%rd55+0], %f99;
add.u64 %rd56, %rd54, %rd55;
mov.f32 %f100, %f10;
st.global.f32 [%rd56+0], %f100;
add.u64 %rd57, %rd54, %rd56;
mov.f32 %f101, %f12;
st.global.f32 [%rd57+0], %f101;
add.u64 %rd50, %rd54, %rd57;
mov.f32 %f102, %f14;
st.global.f32 [%rd50+0], %f102;
mov.f32 %f103, %f16;
add.u64 %rd58, %rd54, %rd50;
st.global.f32 [%rd58+0], %f103;
$Lt_0_27394:
// ans[ii] = {x sum, y sum, z sum, <undefined>} as one 16-byte store.
// NOTE(review): %f105 is never written anywhere in this kernel, so the
// fourth component stored here has no defined value. Presumably consumers
// ignore it - confirm, or initialise it in the .cu source.
ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];
mul.lo.u64 %rd60, %rd4, 16;
add.u64 %rd61, %rd59, %rd60;
mov.f32 %f104, %f105;
st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f104};
$Lt_0_26370:
$Lt_0_18690:
.loc 16 105 0
exit;
$LDWend_kernel_pair:
} // kernel_pair
.entry kernel_pair_fast (
.param .u64 __cudaparm_kernel_pair_fast_x_,
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
.param .u64 __cudaparm_kernel_pair_fast_ans,
.param .u64 __cudaparm_kernel_pair_fast___val_paramengv,
.param .s32 __cudaparm_kernel_pair_fast_eflag,
.param .s32 __cudaparm_kernel_pair_fast_vflag,
.param .s32 __cudaparm_kernel_pair_fast_inum,
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
{
.reg .u32 %r<74>;
.reg .u64 %rd<75>;
.reg .f32 %f<114>;
.reg .pred %p<22>;
.shared .align 4 .b8 __cuda___cuda_local_var_32620_33_non_const_sp_lj3268[16];
.shared .align 16 .b8 __cuda___cuda_local_var_32618_34_non_const_lj13296[1936];
.shared .align 16 .b8 __cuda___cuda_local_var_32619_34_non_const_lj35232[1936];
.shared .align 4 .b8 __cuda___cuda_local_var_32692_55_non_const_red_acc7168[3072];
// __cuda_local_var_32630_10_non_const_f = 48
// __cuda_local_var_32632_9_non_const_virial = 16
.loc 16 113 0
$LDWbegin_kernel_pair_fast:
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 3;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_20994;
.loc 16 121 0
mov.u64 %rd1, __cuda___cuda_local_var_32620_33_non_const_sp_lj3268;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd3, %r1, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_20994:
mov.u64 %rd1, __cuda___cuda_local_var_32620_33_non_const_sp_lj3268;
mov.u32 %r3, 120;
setp.gt.s32 %p2, %r1, %r3;
@%p2 bra $Lt_1_21506;
.loc 16 123 0
mov.u64 %rd7, __cuda___cuda_local_var_32618_34_non_const_lj13296;
cvt.s64.s32 %rd8, %r1;
mul.wide.s32 %rd9, %r1, 16;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
add.u64 %rd11, %rd10, %rd9;
add.u64 %rd12, %rd9, %rd7;
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r5, 0;
setp.le.s32 %p3, %r4, %r5;
@%p3 bra $Lt_1_22018;
.loc 16 125 0
mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232;
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
add.u64 %rd15, %rd14, %rd9;
add.u64 %rd16, %rd9, %rd13;
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
$Lt_1_22018:
mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232;
$Lt_1_21506:
mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232;
mov.u64 %rd7, __cuda___cuda_local_var_32618_34_non_const_lj13296;
.loc 16 133 0
mov.f32 %f10, 0f00000000; // 0
mov.f32 %f11, %f10;
mov.f32 %f12, 0f00000000; // 0
mov.f32 %f13, %f12;
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, %f14;
mov.f32 %f16, 0f00000000; // 0
mov.f32 %f17, %f16;
mov.f32 %f18, 0f00000000; // 0
mov.f32 %f19, %f18;
mov.f32 %f20, 0f00000000; // 0
mov.f32 %f21, %f20;
.loc 16 135 0
bar.sync 0;
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
div.s32 %r7, %r1, %r6;
cvt.s32.u32 %r8, %ntid.x;
div.s32 %r9, %r8, %r6;
cvt.s32.u32 %r10, %ctaid.x;
mul.lo.s32 %r11, %r10, %r9;
add.s32 %r12, %r7, %r11;
ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];
setp.ge.s32 %p4, %r12, %r13;
@%p4 bra $Lt_1_30210;
.loc 16 140 0
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];
cvt.s64.s32 %rd17, %r14;
mul.wide.s32 %rd18, %r14, 4;
cvt.s64.s32 %rd19, %r12;
mul.wide.s32 %rd20, %r12, 4;
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
add.u64 %rd22, %rd20, %rd21;
add.u64 %rd23, %rd18, %rd22;
ld.global.s32 %r15, [%rd23+0];
sub.s32 %r16, %r6, 1;
and.b32 %r17, %r16, %r1;
cvt.s64.s32 %rd24, %r17;
mul.wide.s32 %rd25, %r17, 4;
ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];
setp.ne.u64 %p5, %rd26, %rd21;
@%p5 bra $Lt_1_23298;
cvt.s32.s64 %r18, %rd17;
mul.lo.s32 %r19, %r18, %r6;
mov.s32 %r20, %r19;
mul.lo.s32 %r21, %r16, %r12;
add.s32 %r22, %r18, %r21;
cvt.s64.s32 %rd27, %r22;
mul.wide.s32 %rd28, %r22, 4;
add.u64 %rd29, %rd23, %rd28;
and.b32 %r23, %r16, %r15;
cvt.s64.s32 %rd30, %r23;
div.s32 %r24, %r15, %r6;
mul.lo.s32 %r25, %r19, %r24;
cvt.s64.s32 %rd31, %r25;
add.u64 %rd32, %rd30, %rd31;
mul.lo.u64 %rd33, %rd32, 4;
add.u64 %rd34, %rd29, %rd33;
add.u64 %rd35, %rd25, %rd29;
bra.uni $Lt_1_23042;
$Lt_1_23298:
add.u64 %rd36, %rd18, %rd23;
ld.global.s32 %r26, [%rd36+0];
cvt.s64.s32 %rd37, %r26;
mul.wide.s32 %rd38, %r26, 4;
add.u64 %rd39, %rd26, %rd38;
cvt.s64.s32 %rd40, %r15;
mul.wide.s32 %rd41, %r15, 4;
add.u64 %rd34, %rd39, %rd41;
mov.s32 %r20, %r6;
add.u64 %rd35, %rd25, %rd39;
$Lt_1_23042:
.loc 16 143 0
ld.global.s32 %r27, [%rd22+0];
mov.u32 %r28, %r27;
mov.s32 %r29, 0;
mov.u32 %r30, %r29;
mov.s32 %r31, 0;
mov.u32 %r32, %r31;
mov.s32 %r33, 0;
mov.u32 %r34, %r33;
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];
mov.f32 %f26, %f22;
mov.f32 %f27, %f23;
mov.f32 %f28, %f24;
mov.f32 %f29, %f25;
setp.ge.u64 %p6, %rd35, %rd34;
@%p6 bra $Lt_1_31746;
cvt.rzi.ftz.s32.f32 %r35, %f29;
cvt.s64.s32 %rd42, %r20;
mul.lo.s32 %r36, %r35, 11;
cvt.rn.f32.s32 %f30, %r36;
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_24066:
//<loop> Loop body line 143, nesting depth: 1, estimated iterations: unknown
.loc 16 150 0
ld.global.s32 %r37, [%rd35+0];
.loc 16 151 0
shr.s32 %r38, %r37, 30;
and.b32 %r39, %r38, 3;
cvt.s64.s32 %rd43, %r39;
mul.wide.s32 %rd44, %r39, 4;
add.u64 %rd45, %rd1, %rd44;
ld.shared.f32 %f35, [%rd45+0];
.loc 16 154 0
and.b32 %r40, %r37, 1073741823;
mov.u32 %r41, %r40;
mov.s32 %r42, 0;
mov.u32 %r43, %r42;
mov.s32 %r44, 0;
mov.u32 %r45, %r44;
mov.s32 %r46, 0;
mov.u32 %r47, %r46;
tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}];
mov.f32 %f40, %f36;
mov.f32 %f41, %f37;
mov.f32 %f42, %f38;
mov.f32 %f43, %f39;
sub.ftz.f32 %f44, %f27, %f41;
sub.ftz.f32 %f45, %f26, %f40;
sub.ftz.f32 %f46, %f28, %f42;
mul.ftz.f32 %f47, %f44, %f44;
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
fma.rn.ftz.f32 %f49, %f46, %f46, %f48;
add.ftz.f32 %f50, %f30, %f43;
cvt.rzi.ftz.s32.f32 %r48, %f50;
cvt.s64.s32 %rd46, %r48;
mul.wide.s32 %rd47, %r48, 16;
add.u64 %rd48, %rd47, %rd7;
ld.shared.f32 %f51, [%rd48+8];
setp.gt.ftz.f32 %p7, %f51, %f49;
@!%p7 bra $Lt_1_25346;
.loc 16 165 0
sqrt.approx.ftz.f32 %f52, %f49;
ld.shared.v4.f32 {%f53,%f54,_,%f55}, [%rd48+0];
sub.ftz.f32 %f56, %f52, %f55;
.loc 16 169 0
mul.ftz.f32 %f57, %f56, %f56;
rcp.approx.ftz.f32 %f58, %f57;
mul.ftz.f32 %f59, %f58, %f58;
mul.ftz.f32 %f60, %f58, %f59;
mul.ftz.f32 %f61, %f53, %f60;
sub.ftz.f32 %f62, %f61, %f54;
mul.ftz.f32 %f63, %f60, %f62;
.loc 16 170 0
div.approx.ftz.f32 %f64, %f35, %f56;
div.approx.ftz.f32 %f65, %f64, %f52;
mul.ftz.f32 %f66, %f63, %f65;
.loc 16 172 0
fma.rn.ftz.f32 %f33, %f45, %f66, %f33;
.loc 16 173 0
fma.rn.ftz.f32 %f32, %f44, %f66, %f32;
.loc 16 174 0
fma.rn.ftz.f32 %f31, %f46, %f66, %f31;
ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r50, 0;
setp.le.s32 %p8, %r49, %r50;
@%p8 bra $Lt_1_24834;
.loc 16 177 0
add.u64 %rd49, %rd47, %rd13;
ld.shared.v4.f32 {%f67,%f68,%f69,_}, [%rd49+0];
mul.ftz.f32 %f70, %f67, %f60;
sub.ftz.f32 %f71, %f70, %f68;
mul.ftz.f32 %f72, %f60, %f71;
.loc 16 178 0
sub.ftz.f32 %f73, %f72, %f69;
fma.rn.ftz.f32 %f34, %f35, %f73, %f34;
$Lt_1_24834:
ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r52, 0;
setp.le.s32 %p9, %r51, %r52;
@%p9 bra $Lt_1_25346;
.loc 16 181 0
mov.f32 %f74, %f11;
mul.ftz.f32 %f75, %f45, %f45;
fma.rn.ftz.f32 %f76, %f66, %f75, %f74;
mov.f32 %f11, %f76;
.loc 16 182 0
mov.f32 %f77, %f13;
fma.rn.ftz.f32 %f78, %f66, %f47, %f77;
mov.f32 %f13, %f78;
.loc 16 183 0
mov.f32 %f79, %f15;
mul.ftz.f32 %f80, %f46, %f46;
fma.rn.ftz.f32 %f81, %f66, %f80, %f79;
mov.f32 %f15, %f81;
.loc 16 184 0
mov.f32 %f82, %f17;
mul.ftz.f32 %f83, %f44, %f45;
fma.rn.ftz.f32 %f84, %f66, %f83, %f82;
mov.f32 %f17, %f84;
.loc 16 185 0
mov.f32 %f85, %f19;
mul.ftz.f32 %f86, %f45, %f46;
fma.rn.ftz.f32 %f87, %f66, %f86, %f85;
mov.f32 %f19, %f87;
.loc 16 186 0
mul.ftz.f32 %f88, %f44, %f46;
fma.rn.ftz.f32 %f20, %f66, %f88, %f20;
mov.f32 %f21, %f20;
$Lt_1_25346:
$Lt_1_24322:
.loc 16 148 0
mul.lo.u64 %rd50, %rd42, 4;
add.u64 %rd35, %rd35, %rd50;
setp.lt.u64 %p10, %rd35, %rd34;
@%p10 bra $Lt_1_24066;
bra.uni $Lt_1_23554;
$Lt_1_31746:
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_23554:
mov.u32 %r53, 1;
setp.le.s32 %p11, %r6, %r53;
@%p11 bra $Lt_1_28162;
.loc 16 191 0
mov.u64 %rd51, __cuda___cuda_local_var_32692_55_non_const_red_acc7168;
cvt.s64.s32 %rd52, %r1;
mul.wide.s32 %rd53, %r1, 4;
add.u64 %rd54, %rd51, %rd53;
mov.f32 %f89, %f33;
st.shared.f32 [%rd54+0], %f89;
mov.f32 %f90, %f32;
st.shared.f32 [%rd54+512], %f90;
mov.f32 %f91, %f31;
st.shared.f32 [%rd54+1024], %f91;
mov.f32 %f92, %f34;
st.shared.f32 [%rd54+1536], %f92;
shr.s32 %r54, %r6, 31;
mov.s32 %r55, 1;
and.b32 %r56, %r54, %r55;
add.s32 %r57, %r56, %r6;
shr.s32 %r58, %r57, 1;
mov.s32 %r59, %r58;
mov.u32 %r60, 0;
setp.ne.u32 %p12, %r58, %r60;
@!%p12 bra $Lt_1_26626;
$Lt_1_27138:
setp.ge.u32 %p13, %r17, %r59;
@%p13 bra $Lt_1_27394;
add.u32 %r61, %r1, %r59;
cvt.u64.u32 %rd55, %r61;
mul.wide.u32 %rd56, %r61, 4;
add.u64 %rd57, %rd51, %rd56;
ld.shared.f32 %f93, [%rd57+0];
add.ftz.f32 %f89, %f93, %f89;
st.shared.f32 [%rd54+0], %f89;
ld.shared.f32 %f94, [%rd57+512];
add.ftz.f32 %f90, %f94, %f90;
st.shared.f32 [%rd54+512], %f90;
ld.shared.f32 %f95, [%rd57+1024];
add.ftz.f32 %f91, %f95, %f91;
st.shared.f32 [%rd54+1024], %f91;
ld.shared.f32 %f96, [%rd57+1536];
add.ftz.f32 %f92, %f96, %f92;
st.shared.f32 [%rd54+1536], %f92;
$Lt_1_27394:
shr.u32 %r59, %r59, 1;
mov.u32 %r62, 0;
setp.ne.u32 %p14, %r59, %r62;
@%p14 bra $Lt_1_27138;
$Lt_1_26626:
mov.f32 %f33, %f89;
mov.f32 %f32, %f90;
mov.f32 %f31, %f91;
mov.f32 %f34, %f92;
ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r64, 0;
setp.le.s32 %p15, %r63, %r64;
@%p15 bra $Lt_1_28162;
mov.f32 %f89, %f11;
st.shared.f32 [%rd54+0], %f89;
mov.f32 %f90, %f13;
st.shared.f32 [%rd54+512], %f90;
mov.f32 %f91, %f15;
st.shared.f32 [%rd54+1024], %f91;
mov.f32 %f92, %f17;
st.shared.f32 [%rd54+1536], %f92;
mov.f32 %f97, %f19;
st.shared.f32 [%rd54+2048], %f97;
mov.f32 %f98, %f20;
st.shared.f32 [%rd54+2560], %f98;
mov.s32 %r65, %r58;
@!%p12 bra $Lt_1_28674;
$Lt_1_29186:
setp.ge.u32 %p16, %r17, %r65;
@%p16 bra $Lt_1_29442;
add.u32 %r66, %r1, %r65;
cvt.u64.u32 %rd58, %r66;
mul.wide.u32 %rd59, %r66, 4;
add.u64 %rd60, %rd51, %rd59;
ld.shared.f32 %f99, [%rd60+0];
add.ftz.f32 %f89, %f99, %f89;
st.shared.f32 [%rd54+0], %f89;
ld.shared.f32 %f100, [%rd60+512];
add.ftz.f32 %f90, %f100, %f90;
st.shared.f32 [%rd54+512], %f90;
ld.shared.f32 %f101, [%rd60+1024];
add.ftz.f32 %f91, %f101, %f91;
st.shared.f32 [%rd54+1024], %f91;
ld.shared.f32 %f102, [%rd60+1536];
add.ftz.f32 %f92, %f102, %f92;
st.shared.f32 [%rd54+1536], %f92;
ld.shared.f32 %f103, [%rd60+2048];
add.ftz.f32 %f97, %f103, %f97;
st.shared.f32 [%rd54+2048], %f97;
ld.shared.f32 %f104, [%rd60+2560];
add.ftz.f32 %f98, %f104, %f98;
st.shared.f32 [%rd54+2560], %f98;
$Lt_1_29442:
shr.u32 %r65, %r65, 1;
mov.u32 %r67, 0;
setp.ne.u32 %p17, %r65, %r67;
@%p17 bra $Lt_1_29186;
$Lt_1_28674:
mov.f32 %f11, %f89;
mov.f32 %f13, %f90;
mov.f32 %f15, %f91;
mov.f32 %f17, %f92;
mov.f32 %f19, %f97;
mov.f32 %f21, %f98;
$Lt_1_28162:
$Lt_1_26114:
mov.u32 %r68, 0;
setp.ne.s32 %p18, %r17, %r68;
@%p18 bra $Lt_1_30210;
ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];
add.u64 %rd62, %rd61, %rd20;
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r70, 0;
setp.le.s32 %p19, %r69, %r70;
@%p19 bra $Lt_1_30722;
st.global.f32 [%rd62+0], %f34;
cvt.s64.s32 %rd63, %r13;
mul.wide.s32 %rd64, %r13, 4;
add.u64 %rd62, %rd62, %rd64;
$Lt_1_30722:
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r72, 0;
setp.le.s32 %p20, %r71, %r72;
@%p20 bra $Lt_1_31234;
mov.f32 %f105, %f11;
st.global.f32 [%rd62+0], %f105;
cvt.s64.s32 %rd65, %r13;
mul.wide.s32 %rd66, %r13, 4;
add.u64 %rd67, %rd66, %rd62;
mov.f32 %f106, %f13;
st.global.f32 [%rd67+0], %f106;
add.u64 %rd68, %rd66, %rd67;
mov.f32 %f107, %f15;
st.global.f32 [%rd68+0], %f107;
add.u64 %rd69, %rd66, %rd68;
mov.f32 %f108, %f17;
st.global.f32 [%rd69+0], %f108;
add.u64 %rd62, %rd66, %rd69;
mov.f32 %f109, %f19;
st.global.f32 [%rd62+0], %f109;
mov.f32 %f110, %f21;
add.u64 %rd70, %rd66, %rd62;
st.global.f32 [%rd70+0], %f110;
$Lt_1_31234:
ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];
mul.lo.u64 %rd72, %rd19, 16;
add.u64 %rd73, %rd71, %rd72;
mov.f32 %f111, %f112;
st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f111};
$Lt_1_30210:
$Lt_1_22530:
.loc 16 194 0
exit;
$LDWend_kernel_pair_fast:
} // kernel_pair_fast

View File

@ -1,860 +0,0 @@
/*
 * PTX text defining the kernel_pair and kernel_pair_fast entry points, kept
 * as one C string constant assembled from adjacent string literals.
 * The module header below selects PTX ISA 2.3, the sm_20 target and 64-bit
 * addresses, and declares the global texture reference pos_tex that the
 * kernels fetch from.
 */
const char * lj_expand =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
" .global .texref pos_tex;\n"
/*
 * kernel_pair: pair kernel that reads its coefficient tables (lj1, lj3)
 * straight from global memory.
 *
 * Work split: the row handled by a thread is
 *   tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom);
 * threads whose row is >= inum branch directly to the exit label.
 *
 * Outputs, written only by the lane whose (t_per_atom - 1) & tid.x is zero:
 *   - ans + row * 16: one 16-byte record with the three summed components;
 *   - __val_paramengv + row * 4: the energy sum when eflag > 0, then six more
 *     sums when vflag > 0 (presumably the virial terms), each slot inum floats
 *     after the previous one.
 *
 * NOTE(review): parameter x_ is declared but never loaded in this kernel;
 * coordinates are fetched through the pos_tex texture instead.
 */
" .entry kernel_pair (\n"
" .param .u64 __cudaparm_kernel_pair_x_,\n"
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_ans,\n"
" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n"
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_inum,\n"
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
" {\n"
/* Virtual register pools, then shared scratch: 16 bytes for the four sp_lj
   factors and 3072 bytes (six rows of 512 bytes) for the reduction below. */
" .reg .u32 %r<72>;\n"
" .reg .u64 %rd<63>;\n"
" .reg .f32 %f<107>;\n"
" .reg .pred %p<19>;\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32603_55_non_const_red_acc108[3072];\n"
" .loc 16 31 0\n"
"$LDWbegin_kernel_pair:\n"
/* Every thread loads the four floats at sp_lj_in and stores them into the
   shared sp_lj array (no thread-index guard on this copy). */
" .loc 16 36 0\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
" ldu.global.f32 %f1, [%rd1+0];\n"
" .loc 16 37 0\n"
" ld.global.f32 %f2, [%rd1+4];\n"
" .loc 16 38 0\n"
" ld.global.f32 %f3, [%rd1+8];\n"
" .loc 16 39 0\n"
" ld.global.f32 %f4, [%rd1+12];\n"
" st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n"
/* Zero the six accumulators %f6, %f8, %f10, %f12, %f14 and %f15/%f16; they
   are only added to inside the vflag > 0 branch of the neighbor loop. */
" .loc 16 46 0\n"
" mov.f32 %f5, 0f00000000; \n"
" mov.f32 %f6, %f5;\n"
" mov.f32 %f7, 0f00000000; \n"
" mov.f32 %f8, %f7;\n"
" mov.f32 %f9, 0f00000000; \n"
" mov.f32 %f10, %f9;\n"
" mov.f32 %f11, 0f00000000; \n"
" mov.f32 %f12, %f11;\n"
" mov.f32 %f13, 0f00000000; \n"
" mov.f32 %f14, %f13;\n"
" mov.f32 %f15, 0f00000000; \n"
" mov.f32 %f16, %f15;\n"
/* %r1 = t_per_atom, %r2 = tid.x, %r8 = row handled by this thread.
   Rows at or beyond inum skip all work. */
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
" cvt.s32.u32 %r2, %tid.x;\n"
" div.s32 %r3, %r2, %r1;\n"
" cvt.s32.u32 %r4, %ntid.x;\n"
" div.s32 %r5, %r4, %r1;\n"
" cvt.s32.u32 %r6, %ctaid.x;\n"
" mul.lo.s32 %r7, %r6, %r5;\n"
" add.s32 %r8, %r3, %r7;\n"
" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n"
" setp.ge.s32 %p1, %r8, %r9;\n"
" @%p1 bra $Lt_0_26370;\n"
/* %rd7 = dev_nbor + row * 4, %rd8 = %rd7 + nbor_pitch * 4.
   %r11 is the int stored at %rd8 (presumably the neighbor count - confirm).
   %r13 = (t_per_atom - 1) & tid.x is the lane offset inside the group; it
   equals tid.x mod t_per_atom only when t_per_atom is a power of two. */
" .loc 16 51 0\n"
" ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];\n"
" cvt.s64.s32 %rd2, %r10;\n"
" mul.wide.s32 %rd3, %r10, 4;\n"
" cvt.s64.s32 %rd4, %r8;\n"
" mul.wide.s32 %rd5, %r8, 4;\n"
" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n"
" add.u64 %rd7, %rd5, %rd6;\n"
" add.u64 %rd8, %rd3, %rd7;\n"
" ld.global.s32 %r11, [%rd8+0];\n"
" sub.s32 %r12, %r1, 1;\n"
" and.b32 %r13, %r12, %r2;\n"
" cvt.s64.s32 %rd9, %r13;\n"
" mul.wide.s32 %rd10, %r13, 4;\n"
" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n"
" setp.ne.u64 %p2, %rd11, %rd6;\n"
" @%p2 bra $Lt_0_19458;\n"
/* Case dev_packed == dev_nbor: the list lives inside dev_nbor.
   Loop stride %r16 = nbor_pitch * t_per_atom entries; %rd20 is this lane's
   first entry and %rd19 the end address of the list. */
" cvt.s32.s64 %r14, %rd2;\n"
" mul.lo.s32 %r15, %r14, %r1;\n"
" mov.s32 %r16, %r15;\n"
" mul.lo.s32 %r17, %r12, %r8;\n"
" add.s32 %r18, %r14, %r17;\n"
" cvt.s64.s32 %rd12, %r18;\n"
" mul.wide.s32 %rd13, %r18, 4;\n"
" add.u64 %rd14, %rd8, %rd13;\n"
" and.b32 %r19, %r12, %r11;\n"
" cvt.s64.s32 %rd15, %r19;\n"
" div.s32 %r20, %r11, %r1;\n"
" mul.lo.s32 %r21, %r15, %r20;\n"
" cvt.s64.s32 %rd16, %r21;\n"
" add.u64 %rd17, %rd15, %rd16;\n"
" mul.lo.u64 %rd18, %rd17, 4;\n"
" add.u64 %rd19, %rd14, %rd18;\n"
" add.u64 %rd20, %rd10, %rd14;\n"
" bra.uni $Lt_0_19202;\n"
/* Case dev_packed != dev_nbor: the next int in dev_nbor (one more pitch down)
   is an offset into dev_packed; the list is contiguous there, so the loop
   stride %r16 is t_per_atom entries. */
"$Lt_0_19458:\n"
" add.u64 %rd21, %rd3, %rd8;\n"
" ld.global.s32 %r22, [%rd21+0];\n"
" cvt.s64.s32 %rd22, %r22;\n"
" mul.wide.s32 %rd23, %r22, 4;\n"
" add.u64 %rd24, %rd11, %rd23;\n"
" cvt.s64.s32 %rd25, %r11;\n"
" mul.wide.s32 %rd26, %r11, 4;\n"
" add.u64 %rd19, %rd24, %rd26;\n"
" mov.s32 %r16, %r1;\n"
" add.u64 %rd20, %rd10, %rd24;\n"
/* Fetch the four-float texel for the index stored at %rd7 into %f21..%f24.
   The fourth component is later truncated to an integer and used as a table
   row (presumably the atom type). */
"$Lt_0_19202:\n"
" .loc 16 54 0\n"
" ld.global.s32 %r23, [%rd7+0];\n"
" mov.u32 %r24, %r23;\n"
" mov.s32 %r25, 0;\n"
" mov.u32 %r26, %r25;\n"
" mov.s32 %r27, 0;\n"
" mov.u32 %r28, %r27;\n"
" mov.s32 %r29, 0;\n"
" mov.u32 %r30, %r29;\n"
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n"
" mov.f32 %f21, %f17;\n"
" mov.f32 %f22, %f18;\n"
" mov.f32 %f23, %f19;\n"
" mov.f32 %f24, %f20;\n"
/* Nothing to do for this lane when its start is already at or past the end. */
" setp.ge.u64 %p3, %rd20, %rd19;\n"
" @%p3 bra $Lt_0_27906;\n"
/* Loop invariants: %r33 = lj_types * int(fourth component), %rd28 = lj1,
   %rd29 = shared sp_lj; %f25..%f28 are the per-lane sums, started at zero. */
" cvt.rzi.ftz.s32.f32 %r31, %f24;\n"
" cvt.s64.s32 %rd27, %r16;\n"
" ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];\n"
" mul.lo.s32 %r33, %r32, %r31;\n"
" ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;\n"
/* ---- neighbor loop ---- */
"$Lt_0_20226:\n"
" .loc 16 60 0\n"
" ld.global.s32 %r34, [%rd20+0];\n"
/* Bits 30-31 of the list entry pick one of the four sp_lj factors (%f29). */
" .loc 16 61 0\n"
" shr.s32 %r35, %r34, 30;\n"
" and.b32 %r36, %r35, 3;\n"
" cvt.s64.s32 %rd30, %r36;\n"
" mul.wide.s32 %rd31, %r36, 4;\n"
" add.u64 %rd32, %rd29, %rd31;\n"
" ld.shared.f32 %f29, [%rd32+0];\n"
/* The low 30 bits (mask 1073741823) are the texture index of the neighbor. */
" .loc 16 64 0\n"
" and.b32 %r37, %r34, 1073741823;\n"
" mov.u32 %r38, %r37;\n"
" mov.s32 %r39, 0;\n"
" mov.u32 %r40, %r39;\n"
" mov.s32 %r41, 0;\n"
" mov.u32 %r42, %r41;\n"
" mov.s32 %r43, 0;\n"
" mov.u32 %r44, %r43;\n"
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];\n"
" mov.f32 %f34, %f30;\n"
" mov.f32 %f35, %f31;\n"
" mov.f32 %f36, %f32;\n"
" mov.f32 %f37, %f33;\n"
/* Component differences: %f39 (first), %f38 (second), %f40 (third);
   %f43 = sum of their squares. */
" cvt.rzi.ftz.s32.f32 %r45, %f37;\n"
" sub.ftz.f32 %f38, %f22, %f35;\n"
" sub.ftz.f32 %f39, %f21, %f34;\n"
" sub.ftz.f32 %f40, %f23, %f36;\n"
" mul.ftz.f32 %f41, %f38, %f38;\n"
" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n"
" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n"
/* Table entry = %r33 + int(neighbor fourth component), 16 bytes per entry.
   The float at byte offset 8 of the lj1 entry is the threshold: the pair is
   skipped unless threshold > squared distance. */
" add.s32 %r46, %r45, %r33;\n"
" cvt.s64.s32 %rd33, %r46;\n"
" mul.wide.s32 %rd34, %r46, 16;\n"
" add.u64 %rd35, %rd34, %rd28;\n"
" ld.global.f32 %f44, [%rd35+8];\n"
" setp.gt.ftz.f32 %p4, %f44, %f43;\n"
" @!%p4 bra $Lt_0_21506;\n"
/* %f45 = sqrt(squared distance); %f49 = %f45 minus the fourth float of the
   lj1 entry (the shifted distance). */
" .loc 16 76 0\n"
" sqrt.approx.ftz.f32 %f45, %f43;\n"
" ld.global.v4.f32 {%f46,%f47,_,%f48}, [%rd35+0];\n"
" sub.ftz.f32 %f49, %f45, %f48;\n"
/* %f53 = (1 / %f49^2)^3.
   %f59 = (%f29 / %f49 / %f45) * %f53 * (lj1[0] * %f53 - lj1[1]). */
" .loc 16 81 0\n"
" mul.ftz.f32 %f50, %f49, %f49;\n"
" rcp.approx.ftz.f32 %f51, %f50;\n"
" mul.ftz.f32 %f52, %f51, %f51;\n"
" mul.ftz.f32 %f53, %f51, %f52;\n"
" div.approx.ftz.f32 %f54, %f29, %f49;\n"
" div.approx.ftz.f32 %f55, %f54, %f45;\n"
" mul.ftz.f32 %f56, %f46, %f53;\n"
" sub.ftz.f32 %f57, %f56, %f47;\n"
" mul.ftz.f32 %f58, %f53, %f57;\n"
" mul.ftz.f32 %f59, %f55, %f58;\n"
/* Accumulate difference * %f59 into %f27, %f26, %f25 (first, second, third). */
" .loc 16 83 0\n"
" fma.rn.ftz.f32 %f27, %f39, %f59, %f27;\n"
" .loc 16 84 0\n"
" fma.rn.ftz.f32 %f26, %f38, %f59, %f26;\n"
" .loc 16 85 0\n"
" fma.rn.ftz.f32 %f25, %f40, %f59, %f25;\n"
" ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r48, 0;\n"
" setp.le.s32 %p5, %r47, %r48;\n"
" @%p5 bra $Lt_0_20994;\n"
/* eflag > 0: %f28 += %f29 * (%f53 * (lj3[0] * %f53 - lj3[1]) - lj3[2]),
   using the lj3 entry at the same byte offset as the lj1 entry. */
" .loc 16 89 0\n"
" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];\n"
" add.u64 %rd37, %rd36, %rd34;\n"
" ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd37+0];\n"
" mul.ftz.f32 %f63, %f60, %f53;\n"
" sub.ftz.f32 %f64, %f63, %f61;\n"
" mul.ftz.f32 %f65, %f53, %f64;\n"
" sub.ftz.f32 %f66, %f65, %f62;\n"
" fma.rn.ftz.f32 %f28, %f29, %f66, %f28;\n"
"$Lt_0_20994:\n"
" ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r50, 0;\n"
" setp.le.s32 %p6, %r49, %r50;\n"
" @%p6 bra $Lt_0_21506;\n"
/* vflag > 0: add %f59 times each pairwise product of the differences.
   Order: first^2, second^2, third^2, first*second, first*third, second*third
   into %f6, %f8, %f10, %f12, %f14, %f15 (with %f16 mirroring %f15). */
" .loc 16 92 0\n"
" mov.f32 %f67, %f6;\n"
" mul.ftz.f32 %f68, %f39, %f39;\n"
" fma.rn.ftz.f32 %f69, %f59, %f68, %f67;\n"
" mov.f32 %f6, %f69;\n"
" .loc 16 93 0\n"
" mov.f32 %f70, %f8;\n"
" fma.rn.ftz.f32 %f71, %f59, %f41, %f70;\n"
" mov.f32 %f8, %f71;\n"
" .loc 16 94 0\n"
" mov.f32 %f72, %f10;\n"
" mul.ftz.f32 %f73, %f40, %f40;\n"
" fma.rn.ftz.f32 %f74, %f59, %f73, %f72;\n"
" mov.f32 %f10, %f74;\n"
" .loc 16 95 0\n"
" mov.f32 %f75, %f12;\n"
" mul.ftz.f32 %f76, %f38, %f39;\n"
" fma.rn.ftz.f32 %f77, %f59, %f76, %f75;\n"
" mov.f32 %f12, %f77;\n"
" .loc 16 96 0\n"
" mov.f32 %f78, %f14;\n"
" mul.ftz.f32 %f79, %f39, %f40;\n"
" fma.rn.ftz.f32 %f80, %f59, %f79, %f78;\n"
" mov.f32 %f14, %f80;\n"
" .loc 16 97 0\n"
" mul.ftz.f32 %f81, %f38, %f40;\n"
" fma.rn.ftz.f32 %f15, %f59, %f81, %f15;\n"
" mov.f32 %f16, %f15;\n"
/* Advance by the stride (%rd27 entries of 4 bytes) and loop while the
   pointer is still below the end address. */
"$Lt_0_21506:\n"
"$Lt_0_20482:\n"
" .loc 16 58 0\n"
" mul.lo.u64 %rd38, %rd27, 4;\n"
" add.u64 %rd20, %rd20, %rd38;\n"
" setp.lt.u64 %p7, %rd20, %rd19;\n"
" @%p7 bra $Lt_0_20226;\n"
" bra.uni $Lt_0_19714;\n"
/* Empty-range path: the four per-lane sums are simply zero. */
"$Lt_0_27906:\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
/* ---- reduction across the lanes of a group (skipped if t_per_atom <= 1) ----
   Each lane stores its four sums at red_acc + tid.x * 4, in rows 512 bytes
   apart. Starting from s = t_per_atom / 2 and halving s each step, lanes with
   offset %r13 < s add the values stored by thread tid.x + s.
   NOTE(review): no bar.sync appears between steps; presumably the lanes of a
   group are expected to run in lockstep within one warp - confirm.
   NOTE(review): 512-byte rows hold 128 floats, so tid.x values of 128 or more
   would spill into the next row - confirm the launch block size. */
"$Lt_0_19714:\n"
" mov.u32 %r51, 1;\n"
" setp.le.s32 %p8, %r1, %r51;\n"
" @%p8 bra $Lt_0_24322;\n"
" .loc 16 102 0\n"
" mov.u64 %rd39, __cuda___cuda_local_var_32603_55_non_const_red_acc108;\n"
" cvt.s64.s32 %rd40, %r2;\n"
" mul.wide.s32 %rd41, %r2, 4;\n"
" add.u64 %rd42, %rd39, %rd41;\n"
" mov.f32 %f82, %f27;\n"
" st.shared.f32 [%rd42+0], %f82;\n"
" mov.f32 %f83, %f26;\n"
" st.shared.f32 [%rd42+512], %f83;\n"
" mov.f32 %f84, %f25;\n"
" st.shared.f32 [%rd42+1024], %f84;\n"
" mov.f32 %f85, %f28;\n"
" st.shared.f32 [%rd42+1536], %f85;\n"
/* %r56 = t_per_atom / 2, computed as a signed divide rounding toward zero. */
" shr.s32 %r52, %r1, 31;\n"
" mov.s32 %r53, 1;\n"
" and.b32 %r54, %r52, %r53;\n"
" add.s32 %r55, %r54, %r1;\n"
" shr.s32 %r56, %r55, 1;\n"
" mov.s32 %r57, %r56;\n"
" mov.u32 %r58, 0;\n"
" setp.ne.u32 %p9, %r56, %r58;\n"
" @!%p9 bra $Lt_0_22786;\n"
"$Lt_0_23298:\n"
" setp.ge.u32 %p10, %r13, %r57;\n"
" @%p10 bra $Lt_0_23554;\n"
" add.u32 %r59, %r2, %r57;\n"
" cvt.u64.u32 %rd43, %r59;\n"
" mul.wide.u32 %rd44, %r59, 4;\n"
" add.u64 %rd45, %rd39, %rd44;\n"
" ld.shared.f32 %f86, [%rd45+0];\n"
" add.ftz.f32 %f82, %f86, %f82;\n"
" st.shared.f32 [%rd42+0], %f82;\n"
" ld.shared.f32 %f87, [%rd45+512];\n"
" add.ftz.f32 %f83, %f87, %f83;\n"
" st.shared.f32 [%rd42+512], %f83;\n"
" ld.shared.f32 %f88, [%rd45+1024];\n"
" add.ftz.f32 %f84, %f88, %f84;\n"
" st.shared.f32 [%rd42+1024], %f84;\n"
" ld.shared.f32 %f89, [%rd45+1536];\n"
" add.ftz.f32 %f85, %f89, %f85;\n"
" st.shared.f32 [%rd42+1536], %f85;\n"
"$Lt_0_23554:\n"
" shr.u32 %r57, %r57, 1;\n"
" mov.u32 %r60, 0;\n"
" setp.ne.u32 %p11, %r57, %r60;\n"
" @%p11 bra $Lt_0_23298;\n"
/* Copy the reduced values back into the sum registers. */
"$Lt_0_22786:\n"
" mov.f32 %f27, %f82;\n"
" mov.f32 %f26, %f83;\n"
" mov.f32 %f25, %f84;\n"
" mov.f32 %f28, %f85;\n"
" ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r62, 0;\n"
" setp.le.s32 %p12, %r61, %r62;\n"
" @%p12 bra $Lt_0_24322;\n"
/* vflag > 0: repeat the same reduction for the six vflag sums, reusing the
   scratch area with all six rows (byte offsets 0 through 2560). */
" mov.f32 %f82, %f6;\n"
" st.shared.f32 [%rd42+0], %f82;\n"
" mov.f32 %f83, %f8;\n"
" st.shared.f32 [%rd42+512], %f83;\n"
" mov.f32 %f84, %f10;\n"
" st.shared.f32 [%rd42+1024], %f84;\n"
" mov.f32 %f85, %f12;\n"
" st.shared.f32 [%rd42+1536], %f85;\n"
" mov.f32 %f90, %f14;\n"
" st.shared.f32 [%rd42+2048], %f90;\n"
" mov.f32 %f91, %f15;\n"
" st.shared.f32 [%rd42+2560], %f91;\n"
" mov.s32 %r63, %r56;\n"
" @!%p9 bra $Lt_0_24834;\n"
"$Lt_0_25346:\n"
" setp.ge.u32 %p13, %r13, %r63;\n"
" @%p13 bra $Lt_0_25602;\n"
" add.u32 %r64, %r2, %r63;\n"
" cvt.u64.u32 %rd46, %r64;\n"
" mul.wide.u32 %rd47, %r64, 4;\n"
" add.u64 %rd48, %rd39, %rd47;\n"
" ld.shared.f32 %f92, [%rd48+0];\n"
" add.ftz.f32 %f82, %f92, %f82;\n"
" st.shared.f32 [%rd42+0], %f82;\n"
" ld.shared.f32 %f93, [%rd48+512];\n"
" add.ftz.f32 %f83, %f93, %f83;\n"
" st.shared.f32 [%rd42+512], %f83;\n"
" ld.shared.f32 %f94, [%rd48+1024];\n"
" add.ftz.f32 %f84, %f94, %f84;\n"
" st.shared.f32 [%rd42+1024], %f84;\n"
" ld.shared.f32 %f95, [%rd48+1536];\n"
" add.ftz.f32 %f85, %f95, %f85;\n"
" st.shared.f32 [%rd42+1536], %f85;\n"
" ld.shared.f32 %f96, [%rd48+2048];\n"
" add.ftz.f32 %f90, %f96, %f90;\n"
" st.shared.f32 [%rd42+2048], %f90;\n"
" ld.shared.f32 %f97, [%rd48+2560];\n"
" add.ftz.f32 %f91, %f97, %f91;\n"
" st.shared.f32 [%rd42+2560], %f91;\n"
"$Lt_0_25602:\n"
" shr.u32 %r63, %r63, 1;\n"
" mov.u32 %r65, 0;\n"
" setp.ne.u32 %p14, %r63, %r65;\n"
" @%p14 bra $Lt_0_25346;\n"
"$Lt_0_24834:\n"
" mov.f32 %f6, %f82;\n"
" mov.f32 %f8, %f83;\n"
" mov.f32 %f10, %f84;\n"
" mov.f32 %f12, %f85;\n"
" mov.f32 %f14, %f90;\n"
" mov.f32 %f16, %f91;\n"
/* ---- write-back: only the lane with offset %r13 == 0 stores results ---- */
"$Lt_0_24322:\n"
"$Lt_0_22274:\n"
" mov.u32 %r66, 0;\n"
" setp.ne.s32 %p15, %r13, %r66;\n"
" @%p15 bra $Lt_0_26370;\n"
/* %rd50 = __val_paramengv + row * 4. With eflag > 0 the energy sum goes
   first and the pointer then moves on by inum floats. */
" ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];\n"
" add.u64 %rd50, %rd49, %rd5;\n"
" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r68, 0;\n"
" setp.le.s32 %p16, %r67, %r68;\n"
" @%p16 bra $Lt_0_26882;\n"
" st.global.f32 [%rd50+0], %f28;\n"
" cvt.s64.s32 %rd51, %r9;\n"
" mul.wide.s32 %rd52, %r9, 4;\n"
" add.u64 %rd50, %rd50, %rd52;\n"
"$Lt_0_26882:\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p17, %r69, %r70;\n"
" @%p17 bra $Lt_0_27394;\n"
/* vflag > 0: store the six sums, each inum floats after the previous one. */
" mov.f32 %f98, %f6;\n"
" st.global.f32 [%rd50+0], %f98;\n"
" cvt.s64.s32 %rd53, %r9;\n"
" mul.wide.s32 %rd54, %r9, 4;\n"
" add.u64 %rd55, %rd54, %rd50;\n"
" mov.f32 %f99, %f8;\n"
" st.global.f32 [%rd55+0], %f99;\n"
" add.u64 %rd56, %rd54, %rd55;\n"
" mov.f32 %f100, %f10;\n"
" st.global.f32 [%rd56+0], %f100;\n"
" add.u64 %rd57, %rd54, %rd56;\n"
" mov.f32 %f101, %f12;\n"
" st.global.f32 [%rd57+0], %f101;\n"
" add.u64 %rd50, %rd54, %rd57;\n"
" mov.f32 %f102, %f14;\n"
" st.global.f32 [%rd50+0], %f102;\n"
" mov.f32 %f103, %f16;\n"
" add.u64 %rd58, %rd54, %rd50;\n"
" st.global.f32 [%rd58+0], %f103;\n"
/* Store the three sums at ans + row * 16. The fourth component comes from
   %f105, which has no earlier assignment in this kernel, so its value is
   unspecified. */
"$Lt_0_27394:\n"
" ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];\n"
" mul.lo.u64 %rd60, %rd4, 16;\n"
" add.u64 %rd61, %rd59, %rd60;\n"
" mov.f32 %f104, %f105;\n"
" st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f104};\n"
"$Lt_0_26370:\n"
"$Lt_0_18690:\n"
" .loc 16 105 0\n"
" exit;\n"
"$LDWend_kernel_pair:\n"
" }\n"
/*
 * kernel_pair_fast: variant of the pair kernel that first copies the
 * coefficient tables into shared memory and then reads them from there.
 *
 * Staging: threads with tid.x <= 3 copy one sp_lj float each; threads with
 * tid.x <= 120 copy one 16-byte lj1 entry each and, when eflag > 0, one lj3
 * entry. The shared tables are 1936 bytes, i.e. 121 entries of 16 bytes, and
 * rows are addressed with a fixed multiplier of 11 (an 11 x 11 table).
 * A bar.sync 0 follows the staging and precedes the inum early-out, so every
 * thread of the block reaches the barrier.
 *
 * Work split and outputs match the description visible in this kernel's own
 * code: row = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom); the lane
 * with (t_per_atom - 1) & tid.x == 0 writes ans + row * 16 and the optional
 * eflag / vflag values into __val_paramengv.
 *
 * NOTE(review): parameter x_ is declared but never loaded in this kernel;
 * coordinates are fetched through the pos_tex texture instead.
 */
" .entry kernel_pair_fast (\n"
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n"
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
" {\n"
/* Virtual register pools, then shared storage: sp_lj (16 bytes), the lj1 and
   lj3 tables (1936 bytes each) and the 3072-byte reduction scratch. */
" .reg .u32 %r<74>;\n"
" .reg .u64 %rd<75>;\n"
" .reg .f32 %f<114>;\n"
" .reg .pred %p<22>;\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32620_33_non_const_sp_lj3268[16];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32618_34_non_const_lj13296[1936];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32619_34_non_const_lj35232[1936];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32692_55_non_const_red_acc7168[3072];\n"
" .loc 16 113 0\n"
"$LDWbegin_kernel_pair_fast:\n"
/* Threads 0..3: copy sp_lj_in[tid.x] into the shared sp_lj array. */
" cvt.s32.u32 %r1, %tid.x;\n"
" mov.u32 %r2, 3;\n"
" setp.gt.s32 %p1, %r1, %r2;\n"
" @%p1 bra $Lt_1_20994;\n"
" .loc 16 121 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32620_33_non_const_sp_lj3268;\n"
" cvt.s64.s32 %rd2, %r1;\n"
" mul.wide.s32 %rd3, %r1, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.f32 %f1, [%rd5+0];\n"
" add.u64 %rd6, %rd3, %rd1;\n"
" st.shared.f32 [%rd6+0], %f1;\n"
/* Threads 0..120: copy the 16-byte lj1_in entry at index tid.x into the
   shared lj1 table; the lj3 entry is copied only when eflag > 0. */
"$Lt_1_20994:\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32620_33_non_const_sp_lj3268;\n"
" mov.u32 %r3, 120;\n"
" setp.gt.s32 %p2, %r1, %r3;\n"
" @%p2 bra $Lt_1_21506;\n"
" .loc 16 123 0\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32618_34_non_const_lj13296;\n"
" cvt.s64.s32 %rd8, %r1;\n"
" mul.wide.s32 %rd9, %r1, 16;\n"
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
" add.u64 %rd11, %rd10, %rd9;\n"
" add.u64 %rd12, %rd9, %rd7;\n"
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n"
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r5, 0;\n"
" setp.le.s32 %p3, %r4, %r5;\n"
" @%p3 bra $Lt_1_22018;\n"
" .loc 16 125 0\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232;\n"
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
" add.u64 %rd15, %rd14, %rd9;\n"
" add.u64 %rd16, %rd9, %rd13;\n"
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n"
/* The table base addresses are re-materialised on every path so that %rd7
   and %rd13 are defined no matter which branches were taken above. */
"$Lt_1_22018:\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232;\n"
"$Lt_1_21506:\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232;\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32618_34_non_const_lj13296;\n"
/* Zero the six accumulators %f11, %f13, %f15, %f17, %f19 and %f20/%f21; they
   are only added to inside the vflag > 0 branch of the neighbor loop. */
" .loc 16 133 0\n"
" mov.f32 %f10, 0f00000000; \n"
" mov.f32 %f11, %f10;\n"
" mov.f32 %f12, 0f00000000; \n"
" mov.f32 %f13, %f12;\n"
" mov.f32 %f14, 0f00000000; \n"
" mov.f32 %f15, %f14;\n"
" mov.f32 %f16, 0f00000000; \n"
" mov.f32 %f17, %f16;\n"
" mov.f32 %f18, 0f00000000; \n"
" mov.f32 %f19, %f18;\n"
" mov.f32 %f20, 0f00000000; \n"
" mov.f32 %f21, %f20;\n"
/* Barrier after the shared-memory staging, before any thread can exit. */
" .loc 16 135 0\n"
" bar.sync 0;\n"
/* %r6 = t_per_atom, %r12 = row handled by this thread; rows at or beyond
   inum skip all work. */
" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
" div.s32 %r7, %r1, %r6;\n"
" cvt.s32.u32 %r8, %ntid.x;\n"
" div.s32 %r9, %r8, %r6;\n"
" cvt.s32.u32 %r10, %ctaid.x;\n"
" mul.lo.s32 %r11, %r10, %r9;\n"
" add.s32 %r12, %r7, %r11;\n"
" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n"
" setp.ge.s32 %p4, %r12, %r13;\n"
" @%p4 bra $Lt_1_30210;\n"
/* %rd22 = dev_nbor + row * 4, %rd23 = %rd22 + nbor_pitch * 4.
   %r15 is the int stored at %rd23 (presumably the neighbor count - confirm).
   %r17 = (t_per_atom - 1) & tid.x is the lane offset inside the group; it
   equals tid.x mod t_per_atom only when t_per_atom is a power of two. */
" .loc 16 140 0\n"
" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
" cvt.s64.s32 %rd17, %r14;\n"
" mul.wide.s32 %rd18, %r14, 4;\n"
" cvt.s64.s32 %rd19, %r12;\n"
" mul.wide.s32 %rd20, %r12, 4;\n"
" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
" add.u64 %rd22, %rd20, %rd21;\n"
" add.u64 %rd23, %rd18, %rd22;\n"
" ld.global.s32 %r15, [%rd23+0];\n"
" sub.s32 %r16, %r6, 1;\n"
" and.b32 %r17, %r16, %r1;\n"
" cvt.s64.s32 %rd24, %r17;\n"
" mul.wide.s32 %rd25, %r17, 4;\n"
" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n"
" setp.ne.u64 %p5, %rd26, %rd21;\n"
" @%p5 bra $Lt_1_23298;\n"
/* Case dev_packed == dev_nbor: the list lives inside dev_nbor.
   Loop stride %r20 = nbor_pitch * t_per_atom entries; %rd35 is this lane's
   first entry and %rd34 the end address of the list. */
" cvt.s32.s64 %r18, %rd17;\n"
" mul.lo.s32 %r19, %r18, %r6;\n"
" mov.s32 %r20, %r19;\n"
" mul.lo.s32 %r21, %r16, %r12;\n"
" add.s32 %r22, %r18, %r21;\n"
" cvt.s64.s32 %rd27, %r22;\n"
" mul.wide.s32 %rd28, %r22, 4;\n"
" add.u64 %rd29, %rd23, %rd28;\n"
" and.b32 %r23, %r16, %r15;\n"
" cvt.s64.s32 %rd30, %r23;\n"
" div.s32 %r24, %r15, %r6;\n"
" mul.lo.s32 %r25, %r19, %r24;\n"
" cvt.s64.s32 %rd31, %r25;\n"
" add.u64 %rd32, %rd30, %rd31;\n"
" mul.lo.u64 %rd33, %rd32, 4;\n"
" add.u64 %rd34, %rd29, %rd33;\n"
" add.u64 %rd35, %rd25, %rd29;\n"
" bra.uni $Lt_1_23042;\n"
/* Case dev_packed != dev_nbor: the next int in dev_nbor (one more pitch down)
   is an offset into dev_packed; the list is contiguous there, so the loop
   stride %r20 is t_per_atom entries. */
"$Lt_1_23298:\n"
" add.u64 %rd36, %rd18, %rd23;\n"
" ld.global.s32 %r26, [%rd36+0];\n"
" cvt.s64.s32 %rd37, %r26;\n"
" mul.wide.s32 %rd38, %r26, 4;\n"
" add.u64 %rd39, %rd26, %rd38;\n"
" cvt.s64.s32 %rd40, %r15;\n"
" mul.wide.s32 %rd41, %r15, 4;\n"
" add.u64 %rd34, %rd39, %rd41;\n"
" mov.s32 %r20, %r6;\n"
" add.u64 %rd35, %rd25, %rd39;\n"
/* Fetch the four-float texel for the index stored at %rd22 into %f26..%f29.
   The fourth component is later truncated to an integer and used as a table
   row (presumably the atom type). */
"$Lt_1_23042:\n"
" .loc 16 143 0\n"
" ld.global.s32 %r27, [%rd22+0];\n"
" mov.u32 %r28, %r27;\n"
" mov.s32 %r29, 0;\n"
" mov.u32 %r30, %r29;\n"
" mov.s32 %r31, 0;\n"
" mov.u32 %r32, %r31;\n"
" mov.s32 %r33, 0;\n"
" mov.u32 %r34, %r33;\n"
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];\n"
" mov.f32 %f26, %f22;\n"
" mov.f32 %f27, %f23;\n"
" mov.f32 %f28, %f24;\n"
" mov.f32 %f29, %f25;\n"
/* Nothing to do for this lane when its start is already at or past the end. */
" setp.ge.u64 %p6, %rd35, %rd34;\n"
" @%p6 bra $Lt_1_31746;\n"
/* Loop invariants: %f30 = float(int(fourth component) * 11), the row base in
   the 11-wide shared tables; %f31..%f34 are the per-lane sums, started at 0. */
" cvt.rzi.ftz.s32.f32 %r35, %f29;\n"
" cvt.s64.s32 %rd42, %r20;\n"
" mul.lo.s32 %r36, %r35, 11;\n"
" cvt.rn.f32.s32 %f30, %r36;\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
/* ---- neighbor loop ---- */
"$Lt_1_24066:\n"
" .loc 16 150 0\n"
" ld.global.s32 %r37, [%rd35+0];\n"
/* Bits 30-31 of the list entry pick one of the four sp_lj factors (%f35). */
" .loc 16 151 0\n"
" shr.s32 %r38, %r37, 30;\n"
" and.b32 %r39, %r38, 3;\n"
" cvt.s64.s32 %rd43, %r39;\n"
" mul.wide.s32 %rd44, %r39, 4;\n"
" add.u64 %rd45, %rd1, %rd44;\n"
" ld.shared.f32 %f35, [%rd45+0];\n"
/* The low 30 bits (mask 1073741823) are the texture index of the neighbor. */
" .loc 16 154 0\n"
" and.b32 %r40, %r37, 1073741823;\n"
" mov.u32 %r41, %r40;\n"
" mov.s32 %r42, 0;\n"
" mov.u32 %r43, %r42;\n"
" mov.s32 %r44, 0;\n"
" mov.u32 %r45, %r44;\n"
" mov.s32 %r46, 0;\n"
" mov.u32 %r47, %r46;\n"
" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}];\n"
" mov.f32 %f40, %f36;\n"
" mov.f32 %f41, %f37;\n"
" mov.f32 %f42, %f38;\n"
" mov.f32 %f43, %f39;\n"
/* Component differences: %f45 (first), %f44 (second), %f46 (third);
   %f49 = sum of their squares. */
" sub.ftz.f32 %f44, %f27, %f41;\n"
" sub.ftz.f32 %f45, %f26, %f40;\n"
" sub.ftz.f32 %f46, %f28, %f42;\n"
" mul.ftz.f32 %f47, %f44, %f44;\n"
" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n"
" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n"
/* Table entry = int(%f30 + neighbor fourth component); the sum is formed in
   floating point and then truncated. The float at byte offset 8 of the shared
   lj1 entry is the threshold: the pair is skipped unless it exceeds %f49. */
" add.ftz.f32 %f50, %f30, %f43;\n"
" cvt.rzi.ftz.s32.f32 %r48, %f50;\n"
" cvt.s64.s32 %rd46, %r48;\n"
" mul.wide.s32 %rd47, %r48, 16;\n"
" add.u64 %rd48, %rd47, %rd7;\n"
" ld.shared.f32 %f51, [%rd48+8];\n"
" setp.gt.ftz.f32 %p7, %f51, %f49;\n"
" @!%p7 bra $Lt_1_25346;\n"
/* %f52 = sqrt(squared distance); %f56 = %f52 minus the fourth float of the
   lj1 entry (the shifted distance). */
" .loc 16 165 0\n"
" sqrt.approx.ftz.f32 %f52, %f49;\n"
" ld.shared.v4.f32 {%f53,%f54,_,%f55}, [%rd48+0];\n"
" sub.ftz.f32 %f56, %f52, %f55;\n"
/* %f60 = (1 / %f56^2)^3; %f63 = %f60 * (lj1[0] * %f60 - lj1[1]). */
" .loc 16 169 0\n"
" mul.ftz.f32 %f57, %f56, %f56;\n"
" rcp.approx.ftz.f32 %f58, %f57;\n"
" mul.ftz.f32 %f59, %f58, %f58;\n"
" mul.ftz.f32 %f60, %f58, %f59;\n"
" mul.ftz.f32 %f61, %f53, %f60;\n"
" sub.ftz.f32 %f62, %f61, %f54;\n"
" mul.ftz.f32 %f63, %f60, %f62;\n"
/* %f66 = %f63 * (%f35 / %f56 / %f52). */
" .loc 16 170 0\n"
" div.approx.ftz.f32 %f64, %f35, %f56;\n"
" div.approx.ftz.f32 %f65, %f64, %f52;\n"
" mul.ftz.f32 %f66, %f63, %f65;\n"
/* Accumulate difference * %f66 into %f33, %f32, %f31 (first, second, third). */
" .loc 16 172 0\n"
" fma.rn.ftz.f32 %f33, %f45, %f66, %f33;\n"
" .loc 16 173 0\n"
" fma.rn.ftz.f32 %f32, %f44, %f66, %f32;\n"
" .loc 16 174 0\n"
" fma.rn.ftz.f32 %f31, %f46, %f66, %f31;\n"
" ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r50, 0;\n"
" setp.le.s32 %p8, %r49, %r50;\n"
" @%p8 bra $Lt_1_24834;\n"
/* eflag > 0: %f34 += %f35 * (%f60 * (lj3[0] * %f60 - lj3[1]) - lj3[2]),
   using the shared lj3 entry at the same byte offset as the lj1 entry. */
" .loc 16 177 0\n"
" add.u64 %rd49, %rd47, %rd13;\n"
" ld.shared.v4.f32 {%f67,%f68,%f69,_}, [%rd49+0];\n"
" mul.ftz.f32 %f70, %f67, %f60;\n"
" sub.ftz.f32 %f71, %f70, %f68;\n"
" mul.ftz.f32 %f72, %f60, %f71;\n"
" .loc 16 178 0\n"
" sub.ftz.f32 %f73, %f72, %f69;\n"
" fma.rn.ftz.f32 %f34, %f35, %f73, %f34;\n"
"$Lt_1_24834:\n"
" ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r52, 0;\n"
" setp.le.s32 %p9, %r51, %r52;\n"
" @%p9 bra $Lt_1_25346;\n"
/* vflag > 0: add %f66 times each pairwise product of the differences.
   Order: first^2, second^2, third^2, first*second, first*third, second*third
   into %f11, %f13, %f15, %f17, %f19, %f20 (with %f21 mirroring %f20). */
" .loc 16 181 0\n"
" mov.f32 %f74, %f11;\n"
" mul.ftz.f32 %f75, %f45, %f45;\n"
" fma.rn.ftz.f32 %f76, %f66, %f75, %f74;\n"
" mov.f32 %f11, %f76;\n"
" .loc 16 182 0\n"
" mov.f32 %f77, %f13;\n"
" fma.rn.ftz.f32 %f78, %f66, %f47, %f77;\n"
" mov.f32 %f13, %f78;\n"
" .loc 16 183 0\n"
" mov.f32 %f79, %f15;\n"
" mul.ftz.f32 %f80, %f46, %f46;\n"
" fma.rn.ftz.f32 %f81, %f66, %f80, %f79;\n"
" mov.f32 %f15, %f81;\n"
" .loc 16 184 0\n"
" mov.f32 %f82, %f17;\n"
" mul.ftz.f32 %f83, %f44, %f45;\n"
" fma.rn.ftz.f32 %f84, %f66, %f83, %f82;\n"
" mov.f32 %f17, %f84;\n"
" .loc 16 185 0\n"
" mov.f32 %f85, %f19;\n"
" mul.ftz.f32 %f86, %f45, %f46;\n"
" fma.rn.ftz.f32 %f87, %f66, %f86, %f85;\n"
" mov.f32 %f19, %f87;\n"
" .loc 16 186 0\n"
" mul.ftz.f32 %f88, %f44, %f46;\n"
" fma.rn.ftz.f32 %f20, %f66, %f88, %f20;\n"
" mov.f32 %f21, %f20;\n"
/* Advance by the stride (%rd42 entries of 4 bytes) and loop while the
   pointer is still below the end address. */
"$Lt_1_25346:\n"
"$Lt_1_24322:\n"
" .loc 16 148 0\n"
" mul.lo.u64 %rd50, %rd42, 4;\n"
" add.u64 %rd35, %rd35, %rd50;\n"
" setp.lt.u64 %p10, %rd35, %rd34;\n"
" @%p10 bra $Lt_1_24066;\n"
" bra.uni $Lt_1_23554;\n"
/* Empty-range path: the four per-lane sums are simply zero. */
"$Lt_1_31746:\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
/* ---- reduction across the lanes of a group (skipped if t_per_atom <= 1) ----
   Each lane stores its four sums at red_acc + tid.x * 4, in rows 512 bytes
   apart. Starting from s = t_per_atom / 2 and halving s each step, lanes with
   offset %r17 < s add the values stored by thread tid.x + s.
   NOTE(review): no bar.sync appears between steps; presumably the lanes of a
   group are expected to run in lockstep within one warp - confirm.
   NOTE(review): 512-byte rows hold 128 floats, so tid.x values of 128 or more
   would spill into the next row - confirm the launch block size. */
"$Lt_1_23554:\n"
" mov.u32 %r53, 1;\n"
" setp.le.s32 %p11, %r6, %r53;\n"
" @%p11 bra $Lt_1_28162;\n"
" .loc 16 191 0\n"
" mov.u64 %rd51, __cuda___cuda_local_var_32692_55_non_const_red_acc7168;\n"
" cvt.s64.s32 %rd52, %r1;\n"
" mul.wide.s32 %rd53, %r1, 4;\n"
" add.u64 %rd54, %rd51, %rd53;\n"
" mov.f32 %f89, %f33;\n"
" st.shared.f32 [%rd54+0], %f89;\n"
" mov.f32 %f90, %f32;\n"
" st.shared.f32 [%rd54+512], %f90;\n"
" mov.f32 %f91, %f31;\n"
" st.shared.f32 [%rd54+1024], %f91;\n"
" mov.f32 %f92, %f34;\n"
" st.shared.f32 [%rd54+1536], %f92;\n"
/* %r58 = t_per_atom / 2, computed as a signed divide rounding toward zero. */
" shr.s32 %r54, %r6, 31;\n"
" mov.s32 %r55, 1;\n"
" and.b32 %r56, %r54, %r55;\n"
" add.s32 %r57, %r56, %r6;\n"
" shr.s32 %r58, %r57, 1;\n"
" mov.s32 %r59, %r58;\n"
" mov.u32 %r60, 0;\n"
" setp.ne.u32 %p12, %r58, %r60;\n"
" @!%p12 bra $Lt_1_26626;\n"
"$Lt_1_27138:\n"
" setp.ge.u32 %p13, %r17, %r59;\n"
" @%p13 bra $Lt_1_27394;\n"
" add.u32 %r61, %r1, %r59;\n"
" cvt.u64.u32 %rd55, %r61;\n"
" mul.wide.u32 %rd56, %r61, 4;\n"
" add.u64 %rd57, %rd51, %rd56;\n"
" ld.shared.f32 %f93, [%rd57+0];\n"
" add.ftz.f32 %f89, %f93, %f89;\n"
" st.shared.f32 [%rd54+0], %f89;\n"
" ld.shared.f32 %f94, [%rd57+512];\n"
" add.ftz.f32 %f90, %f94, %f90;\n"
" st.shared.f32 [%rd54+512], %f90;\n"
" ld.shared.f32 %f95, [%rd57+1024];\n"
" add.ftz.f32 %f91, %f95, %f91;\n"
" st.shared.f32 [%rd54+1024], %f91;\n"
" ld.shared.f32 %f96, [%rd57+1536];\n"
" add.ftz.f32 %f92, %f96, %f92;\n"
" st.shared.f32 [%rd54+1536], %f92;\n"
"$Lt_1_27394:\n"
" shr.u32 %r59, %r59, 1;\n"
" mov.u32 %r62, 0;\n"
" setp.ne.u32 %p14, %r59, %r62;\n"
" @%p14 bra $Lt_1_27138;\n"
/* Copy the reduced values back into the sum registers. */
"$Lt_1_26626:\n"
" mov.f32 %f33, %f89;\n"
" mov.f32 %f32, %f90;\n"
" mov.f32 %f31, %f91;\n"
" mov.f32 %f34, %f92;\n"
" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r64, 0;\n"
" setp.le.s32 %p15, %r63, %r64;\n"
" @%p15 bra $Lt_1_28162;\n"
/* vflag > 0: repeat the same reduction for the six vflag sums, reusing the
   scratch area with all six rows (byte offsets 0 through 2560). */
" mov.f32 %f89, %f11;\n"
" st.shared.f32 [%rd54+0], %f89;\n"
" mov.f32 %f90, %f13;\n"
" st.shared.f32 [%rd54+512], %f90;\n"
" mov.f32 %f91, %f15;\n"
" st.shared.f32 [%rd54+1024], %f91;\n"
" mov.f32 %f92, %f17;\n"
" st.shared.f32 [%rd54+1536], %f92;\n"
" mov.f32 %f97, %f19;\n"
" st.shared.f32 [%rd54+2048], %f97;\n"
" mov.f32 %f98, %f20;\n"
" st.shared.f32 [%rd54+2560], %f98;\n"
" mov.s32 %r65, %r58;\n"
" @!%p12 bra $Lt_1_28674;\n"
"$Lt_1_29186:\n"
" setp.ge.u32 %p16, %r17, %r65;\n"
" @%p16 bra $Lt_1_29442;\n"
" add.u32 %r66, %r1, %r65;\n"
" cvt.u64.u32 %rd58, %r66;\n"
" mul.wide.u32 %rd59, %r66, 4;\n"
" add.u64 %rd60, %rd51, %rd59;\n"
" ld.shared.f32 %f99, [%rd60+0];\n"
" add.ftz.f32 %f89, %f99, %f89;\n"
" st.shared.f32 [%rd54+0], %f89;\n"
" ld.shared.f32 %f100, [%rd60+512];\n"
" add.ftz.f32 %f90, %f100, %f90;\n"
" st.shared.f32 [%rd54+512], %f90;\n"
" ld.shared.f32 %f101, [%rd60+1024];\n"
" add.ftz.f32 %f91, %f101, %f91;\n"
" st.shared.f32 [%rd54+1024], %f91;\n"
" ld.shared.f32 %f102, [%rd60+1536];\n"
" add.ftz.f32 %f92, %f102, %f92;\n"
" st.shared.f32 [%rd54+1536], %f92;\n"
" ld.shared.f32 %f103, [%rd60+2048];\n"
" add.ftz.f32 %f97, %f103, %f97;\n"
" st.shared.f32 [%rd54+2048], %f97;\n"
" ld.shared.f32 %f104, [%rd60+2560];\n"
" add.ftz.f32 %f98, %f104, %f98;\n"
" st.shared.f32 [%rd54+2560], %f98;\n"
"$Lt_1_29442:\n"
" shr.u32 %r65, %r65, 1;\n"
" mov.u32 %r67, 0;\n"
" setp.ne.u32 %p17, %r65, %r67;\n"
" @%p17 bra $Lt_1_29186;\n"
"$Lt_1_28674:\n"
" mov.f32 %f11, %f89;\n"
" mov.f32 %f13, %f90;\n"
" mov.f32 %f15, %f91;\n"
" mov.f32 %f17, %f92;\n"
" mov.f32 %f19, %f97;\n"
" mov.f32 %f21, %f98;\n"
/* ---- write-back: only the lane with offset %r17 == 0 stores results ---- */
"$Lt_1_28162:\n"
"$Lt_1_26114:\n"
" mov.u32 %r68, 0;\n"
" setp.ne.s32 %p18, %r17, %r68;\n"
" @%p18 bra $Lt_1_30210;\n"
/* %rd62 = __val_paramengv + row * 4. With eflag > 0 the energy sum goes
   first and the pointer then moves on by inum floats. */
" ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];\n"
" add.u64 %rd62, %rd61, %rd20;\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p19, %r69, %r70;\n"
" @%p19 bra $Lt_1_30722;\n"
" st.global.f32 [%rd62+0], %f34;\n"
" cvt.s64.s32 %rd63, %r13;\n"
" mul.wide.s32 %rd64, %r13, 4;\n"
" add.u64 %rd62, %rd62, %rd64;\n"
"$Lt_1_30722:\n"
" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r72, 0;\n"
" setp.le.s32 %p20, %r71, %r72;\n"
" @%p20 bra $Lt_1_31234;\n"
/* vflag > 0: store the six sums, each inum floats after the previous one. */
" mov.f32 %f105, %f11;\n"
" st.global.f32 [%rd62+0], %f105;\n"
" cvt.s64.s32 %rd65, %r13;\n"
" mul.wide.s32 %rd66, %r13, 4;\n"
" add.u64 %rd67, %rd66, %rd62;\n"
" mov.f32 %f106, %f13;\n"
" st.global.f32 [%rd67+0], %f106;\n"
" add.u64 %rd68, %rd66, %rd67;\n"
" mov.f32 %f107, %f15;\n"
" st.global.f32 [%rd68+0], %f107;\n"
" add.u64 %rd69, %rd66, %rd68;\n"
" mov.f32 %f108, %f17;\n"
" st.global.f32 [%rd69+0], %f108;\n"
" add.u64 %rd62, %rd66, %rd69;\n"
" mov.f32 %f109, %f19;\n"
" st.global.f32 [%rd62+0], %f109;\n"
" mov.f32 %f110, %f21;\n"
" add.u64 %rd70, %rd66, %rd62;\n"
" st.global.f32 [%rd70+0], %f110;\n"
/* Store the three sums at ans + row * 16. The fourth component comes from
   %f112, which has no earlier assignment in this kernel, so its value is
   unspecified. */
"$Lt_1_31234:\n"
" ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];\n"
" mul.lo.u64 %rd72, %rd19, 16;\n"
" add.u64 %rd73, %rd71, %rd72;\n"
" mov.f32 %f111, %f112;\n"
" st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f111};\n"
"$Lt_1_30210:\n"
"$Lt_1_22530:\n"
" .loc 16 194 0\n"
" exit;\n"
"$LDWend_kernel_pair_fast:\n"
" }\n"
;

View File

@ -1,849 +0,0 @@
/*
 * lj: PTX source text (.version 2.3, .target sm_20, 64-bit addresses) stored
 * as a single C string built from adjacent string literals.
 *
 * The module declares one global texture reference (pos_tex) and two entry
 * points: kernel_pair and kernel_pair_fast.
 *
 * NOTE(review): the text looks like compiler output (machine-style labels such
 * as $Lt_0_26370, ".loc 16 <line>" markers); presumably it is generated from a
 * .cu file and should be regenerated rather than hand-edited -- confirm against
 * the build.  The arithmetic has the shape of a 12-6 pair potential, presumably
 * Lennard-Jones given the lj1/lj3 parameter names -- confirm with the source.
 *
 * All comments below are C comments placed between literals, so the string
 * contents are unchanged.
 */
const char * lj =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
" .global .texref pos_tex;\n"
/*
 * kernel_pair
 *   Pointer parameters (.u64): x_, lj1, lj3, sp_lj_in, dev_nbor, dev_packed,
 *                              ans, __val_paramengv.
 *   Integer parameters (.s32): lj_types, eflag, vflag, inum, nbor_pitch,
 *                              t_per_atom.
 *   The x_ parameter is never loaded in this kernel; positions are fetched
 *   through pos_tex instead.
 *   Shared storage: a 16-byte table (4 floats) and a 3072-byte reduction
 *   buffer that is addressed below as six rows spaced 512 bytes apart.
 */
" .entry kernel_pair (\n"
" .param .u64 __cudaparm_kernel_pair_x_,\n"
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_ans,\n"
" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n"
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_inum,\n"
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
" {\n"
" .reg .u32 %r<72>;\n"
" .reg .u64 %rd<63>;\n"
" .reg .f32 %f<102>;\n"
" .reg .pred %p<19>;\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32600_55_non_const_red_acc108[3072];\n"
" .loc 16 31 0\n"
"$LDWbegin_kernel_pair:\n"
/* Every thread copies the four floats at sp_lj_in into the shared sp_lj table. */
" .loc 16 36 0\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
" ldu.global.f32 %f1, [%rd1+0];\n"
" .loc 16 37 0\n"
" ld.global.f32 %f2, [%rd1+4];\n"
" .loc 16 38 0\n"
" ld.global.f32 %f3, [%rd1+8];\n"
" .loc 16 39 0\n"
" ld.global.f32 %f4, [%rd1+12];\n"
" st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n"
/* Zero the six accumulators %f6,%f8,%f10,%f12,%f14,%f16 (filled only when vflag > 0). */
" .loc 16 46 0\n"
" mov.f32 %f5, 0f00000000; \n"
" mov.f32 %f6, %f5;\n"
" mov.f32 %f7, 0f00000000; \n"
" mov.f32 %f8, %f7;\n"
" mov.f32 %f9, 0f00000000; \n"
" mov.f32 %f10, %f9;\n"
" mov.f32 %f11, 0f00000000; \n"
" mov.f32 %f12, %f11;\n"
" mov.f32 %f13, 0f00000000; \n"
" mov.f32 %f14, %f13;\n"
" mov.f32 %f15, 0f00000000; \n"
" mov.f32 %f16, %f15;\n"
/*
 * Work index: %r8 = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom).
 * Threads whose index is >= inum branch straight to the exit label.
 */
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
" cvt.s32.u32 %r2, %tid.x;\n"
" div.s32 %r3, %r2, %r1;\n"
" cvt.s32.u32 %r4, %ntid.x;\n"
" div.s32 %r5, %r4, %r1;\n"
" cvt.s32.u32 %r6, %ctaid.x;\n"
" mul.lo.s32 %r7, %r6, %r5;\n"
" add.s32 %r8, %r3, %r7;\n"
" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n"
" setp.ge.s32 %p1, %r8, %r9;\n"
" @%p1 bra $Lt_0_26370;\n"
/*
 * Neighbor-list addressing.
 *   %rd7 = dev_nbor + 4*index            (entry in the first row)
 *   %rd8 = %rd7 + 4*nbor_pitch           (entry in the second row)
 *   %r11 = value at %rd8                 (presumably the neighbor count -- TODO confirm)
 *   %r13 = tid.x & (t_per_atom - 1)      (this thread's offset within its group;
 *                                         the mask form assumes t_per_atom is a
 *                                         power of two -- TODO confirm with caller)
 */
" .loc 16 51 0\n"
" ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];\n"
" cvt.s64.s32 %rd2, %r10;\n"
" mul.wide.s32 %rd3, %r10, 4;\n"
" cvt.s64.s32 %rd4, %r8;\n"
" mul.wide.s32 %rd5, %r8, 4;\n"
" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n"
" add.u64 %rd7, %rd5, %rd6;\n"
" add.u64 %rd8, %rd3, %rd7;\n"
" ld.global.s32 %r11, [%rd8+0];\n"
" sub.s32 %r12, %r1, 1;\n"
" and.b32 %r13, %r12, %r2;\n"
" cvt.s64.s32 %rd9, %r13;\n"
" mul.wide.s32 %rd10, %r13, 4;\n"
" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n"
" setp.ne.u64 %p2, %rd11, %rd6;\n"
" @%p2 bra $Lt_0_19458;\n"
/*
 * Case dev_packed == dev_nbor: the list is walked inside dev_nbor itself.
 * Stride %r16 = nbor_pitch * t_per_atom elements; %rd20 = first entry for
 * this thread, %rd19 = end address.
 */
" cvt.s32.s64 %r14, %rd2;\n"
" mul.lo.s32 %r15, %r14, %r1;\n"
" mov.s32 %r16, %r15;\n"
" mul.lo.s32 %r17, %r12, %r8;\n"
" add.s32 %r18, %r14, %r17;\n"
" cvt.s64.s32 %rd12, %r18;\n"
" mul.wide.s32 %rd13, %r18, 4;\n"
" add.u64 %rd14, %rd8, %rd13;\n"
" and.b32 %r19, %r12, %r11;\n"
" cvt.s64.s32 %rd15, %r19;\n"
" div.s32 %r20, %r11, %r1;\n"
" mul.lo.s32 %r21, %r15, %r20;\n"
" cvt.s64.s32 %rd16, %r21;\n"
" add.u64 %rd17, %rd15, %rd16;\n"
" mul.lo.u64 %rd18, %rd17, 4;\n"
" add.u64 %rd19, %rd14, %rd18;\n"
" add.u64 %rd20, %rd10, %rd14;\n"
" bra.uni $Lt_0_19202;\n"
/*
 * Case dev_packed != dev_nbor: the third dev_nbor row supplies an element
 * offset (%r22) into dev_packed.  Stride %r16 = t_per_atom elements;
 * %rd19 = start + 4*%r11 is the end address.
 */
"$Lt_0_19458:\n"
" add.u64 %rd21, %rd3, %rd8;\n"
" ld.global.s32 %r22, [%rd21+0];\n"
" cvt.s64.s32 %rd22, %r22;\n"
" mul.wide.s32 %rd23, %r22, 4;\n"
" add.u64 %rd24, %rd11, %rd23;\n"
" cvt.s64.s32 %rd25, %r11;\n"
" mul.wide.s32 %rd26, %r11, 4;\n"
" add.u64 %rd19, %rd24, %rd26;\n"
" mov.s32 %r16, %r1;\n"
" add.u64 %rd20, %rd10, %rd24;\n"
/*
 * Fetch the 4-float record of the central item from pos_tex, using the index
 * stored in the first dev_nbor row.  %f21..%f23 are used as coordinates; the
 * 4th component %f24 is truncated to an integer below and scaled by lj_types
 * to form the row offset %r33 into the coefficient tables.
 */
"$Lt_0_19202:\n"
" .loc 16 54 0\n"
" ld.global.s32 %r23, [%rd7+0];\n"
" mov.u32 %r24, %r23;\n"
" mov.s32 %r25, 0;\n"
" mov.u32 %r26, %r25;\n"
" mov.s32 %r27, 0;\n"
" mov.u32 %r28, %r27;\n"
" mov.s32 %r29, 0;\n"
" mov.u32 %r30, %r29;\n"
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n"
" mov.f32 %f21, %f17;\n"
" mov.f32 %f22, %f18;\n"
" mov.f32 %f23, %f19;\n"
" mov.f32 %f24, %f20;\n"
" setp.ge.u64 %p3, %rd20, %rd19;\n"
" @%p3 bra $Lt_0_27906;\n"
" cvt.rzi.ftz.s32.f32 %r31, %f24;\n"
" cvt.s64.s32 %rd27, %r16;\n"
" ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];\n"
" mul.lo.s32 %r33, %r32, %r31;\n"
" ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;\n"
/*
 * Neighbor loop.  Each list entry %r34 is split into:
 *   bits 30..31 -> index into the shared sp_lj table (scale factor %f29)
 *   bits 0..29  -> texture index of the neighbor (mask 1073741823 = 0x3FFFFFFF)
 * %f39/%f38/%f40 are the coordinate differences and %f43 their squared length.
 * The pair is processed only if the value at lj1[entry]+8 is greater than %f43.
 */
"$Lt_0_20226:\n"
" .loc 16 60 0\n"
" ld.global.s32 %r34, [%rd20+0];\n"
" .loc 16 61 0\n"
" shr.s32 %r35, %r34, 30;\n"
" and.b32 %r36, %r35, 3;\n"
" cvt.s64.s32 %rd30, %r36;\n"
" mul.wide.s32 %rd31, %r36, 4;\n"
" add.u64 %rd32, %rd29, %rd31;\n"
" ld.shared.f32 %f29, [%rd32+0];\n"
" .loc 16 64 0\n"
" and.b32 %r37, %r34, 1073741823;\n"
" mov.u32 %r38, %r37;\n"
" mov.s32 %r39, 0;\n"
" mov.u32 %r40, %r39;\n"
" mov.s32 %r41, 0;\n"
" mov.u32 %r42, %r41;\n"
" mov.s32 %r43, 0;\n"
" mov.u32 %r44, %r43;\n"
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];\n"
" mov.f32 %f34, %f30;\n"
" mov.f32 %f35, %f31;\n"
" mov.f32 %f36, %f32;\n"
" mov.f32 %f37, %f33;\n"
" cvt.rzi.ftz.s32.f32 %r45, %f37;\n"
" sub.ftz.f32 %f38, %f22, %f35;\n"
" sub.ftz.f32 %f39, %f21, %f34;\n"
" sub.ftz.f32 %f40, %f23, %f36;\n"
" mul.ftz.f32 %f41, %f38, %f38;\n"
" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n"
" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n"
" add.s32 %r46, %r45, %r33;\n"
" cvt.s64.s32 %rd33, %r46;\n"
" mul.wide.s32 %rd34, %r46, 16;\n"
" add.u64 %rd35, %rd34, %rd28;\n"
" ld.global.f32 %f44, [%rd35+8];\n"
" setp.gt.ftz.f32 %p4, %f44, %f43;\n"
" @!%p4 bra $Lt_0_21506;\n"
/*
 * With a = approximate 1/%f43 and b = a*a*a, the pair term is
 *   %f54 = %f29 * a * b * (lj1[entry].x * b - lj1[entry].y)
 * and is accumulated, multiplied by each coordinate difference, into
 * %f27, %f26 and %f25.
 */
" .loc 16 78 0\n"
" rcp.approx.ftz.f32 %f45, %f43;\n"
" mul.ftz.f32 %f46, %f45, %f45;\n"
" mul.ftz.f32 %f47, %f45, %f46;\n"
" mul.ftz.f32 %f48, %f45, %f47;\n"
" ld.global.v2.f32 {%f49,%f50}, [%rd35+0];\n"
" mul.ftz.f32 %f51, %f49, %f47;\n"
" sub.ftz.f32 %f52, %f51, %f50;\n"
" mul.ftz.f32 %f53, %f48, %f52;\n"
" mul.ftz.f32 %f54, %f29, %f53;\n"
" .loc 16 80 0\n"
" fma.rn.ftz.f32 %f27, %f39, %f54, %f27;\n"
" .loc 16 81 0\n"
" fma.rn.ftz.f32 %f26, %f38, %f54, %f26;\n"
" .loc 16 82 0\n"
" fma.rn.ftz.f32 %f25, %f40, %f54, %f25;\n"
" ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r48, 0;\n"
" setp.le.s32 %p5, %r47, %r48;\n"
" @%p5 bra $Lt_0_20994;\n"
/* eflag > 0: %f28 += %f29 * (b * (lj3[entry].x * b - lj3[entry].y) - lj3[entry].z). */
" .loc 16 86 0\n"
" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];\n"
" add.u64 %rd37, %rd36, %rd34;\n"
" ld.global.v4.f32 {%f55,%f56,%f57,_}, [%rd37+0];\n"
" mul.ftz.f32 %f58, %f55, %f47;\n"
" sub.ftz.f32 %f59, %f58, %f56;\n"
" mul.ftz.f32 %f60, %f47, %f59;\n"
" sub.ftz.f32 %f61, %f60, %f57;\n"
" fma.rn.ftz.f32 %f28, %f29, %f61, %f28;\n"
"$Lt_0_20994:\n"
" ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r50, 0;\n"
" setp.le.s32 %p6, %r49, %r50;\n"
" @%p6 bra $Lt_0_21506;\n"
/*
 * vflag > 0: accumulate %f54 times the six products of the coordinate
 * differences (%f39*%f39, %f38*%f38, %f40*%f40, %f38*%f39, %f39*%f40,
 * %f38*%f40) into %f6, %f8, %f10, %f12, %f14 and %f15/%f16.
 */
" .loc 16 89 0\n"
" mov.f32 %f62, %f6;\n"
" mul.ftz.f32 %f63, %f39, %f39;\n"
" fma.rn.ftz.f32 %f64, %f54, %f63, %f62;\n"
" mov.f32 %f6, %f64;\n"
" .loc 16 90 0\n"
" mov.f32 %f65, %f8;\n"
" fma.rn.ftz.f32 %f66, %f54, %f41, %f65;\n"
" mov.f32 %f8, %f66;\n"
" .loc 16 91 0\n"
" mov.f32 %f67, %f10;\n"
" mul.ftz.f32 %f68, %f40, %f40;\n"
" fma.rn.ftz.f32 %f69, %f54, %f68, %f67;\n"
" mov.f32 %f10, %f69;\n"
" .loc 16 92 0\n"
" mov.f32 %f70, %f12;\n"
" mul.ftz.f32 %f71, %f38, %f39;\n"
" fma.rn.ftz.f32 %f72, %f54, %f71, %f70;\n"
" mov.f32 %f12, %f72;\n"
" .loc 16 93 0\n"
" mov.f32 %f73, %f14;\n"
" mul.ftz.f32 %f74, %f39, %f40;\n"
" fma.rn.ftz.f32 %f75, %f54, %f74, %f73;\n"
" mov.f32 %f14, %f75;\n"
" .loc 16 94 0\n"
" mul.ftz.f32 %f76, %f38, %f40;\n"
" fma.rn.ftz.f32 %f15, %f54, %f76, %f15;\n"
" mov.f32 %f16, %f15;\n"
"$Lt_0_21506:\n"
"$Lt_0_20482:\n"
/* Advance the list pointer by the stride (4 * %r16 bytes) and loop while below the end address. */
" .loc 16 58 0\n"
" mul.lo.u64 %rd38, %rd27, 4;\n"
" add.u64 %rd20, %rd20, %rd38;\n"
" setp.lt.u64 %p7, %rd20, %rd19;\n"
" @%p7 bra $Lt_0_20226;\n"
" bra.uni $Lt_0_19714;\n"
/* Reached when the loop body is never entered: the four per-thread sums start at zero. */
"$Lt_0_27906:\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
/*
 * Reduction across the t_per_atom threads of a group; skipped entirely when
 * t_per_atom <= 1.  Each thread stores %f27, %f26, %f25, %f28 into four rows
 * of red_acc (512 bytes apart, indexed by tid.x).  Then, for s = t_per_atom/2
 * halved down to 1, threads whose group offset %r13 is below s add the values
 * held at tid.x + s.
 * NOTE(review): there is no bar.sync anywhere in this reduction; presumably
 * it relies on a group never spanning more than one warp -- confirm.
 */
"$Lt_0_19714:\n"
" mov.u32 %r51, 1;\n"
" setp.le.s32 %p8, %r1, %r51;\n"
" @%p8 bra $Lt_0_24322;\n"
" .loc 16 99 0\n"
" mov.u64 %rd39, __cuda___cuda_local_var_32600_55_non_const_red_acc108;\n"
" cvt.s64.s32 %rd40, %r2;\n"
" mul.wide.s32 %rd41, %r2, 4;\n"
" add.u64 %rd42, %rd39, %rd41;\n"
" mov.f32 %f77, %f27;\n"
" st.shared.f32 [%rd42+0], %f77;\n"
" mov.f32 %f78, %f26;\n"
" st.shared.f32 [%rd42+512], %f78;\n"
" mov.f32 %f79, %f25;\n"
" st.shared.f32 [%rd42+1024], %f79;\n"
" mov.f32 %f80, %f28;\n"
" st.shared.f32 [%rd42+1536], %f80;\n"
" shr.s32 %r52, %r1, 31;\n"
" mov.s32 %r53, 1;\n"
" and.b32 %r54, %r52, %r53;\n"
" add.s32 %r55, %r54, %r1;\n"
" shr.s32 %r56, %r55, 1;\n"
" mov.s32 %r57, %r56;\n"
" mov.u32 %r58, 0;\n"
" setp.ne.u32 %p9, %r56, %r58;\n"
" @!%p9 bra $Lt_0_22786;\n"
"$Lt_0_23298:\n"
" setp.ge.u32 %p10, %r13, %r57;\n"
" @%p10 bra $Lt_0_23554;\n"
" add.u32 %r59, %r2, %r57;\n"
" cvt.u64.u32 %rd43, %r59;\n"
" mul.wide.u32 %rd44, %r59, 4;\n"
" add.u64 %rd45, %rd39, %rd44;\n"
" ld.shared.f32 %f81, [%rd45+0];\n"
" add.ftz.f32 %f77, %f81, %f77;\n"
" st.shared.f32 [%rd42+0], %f77;\n"
" ld.shared.f32 %f82, [%rd45+512];\n"
" add.ftz.f32 %f78, %f82, %f78;\n"
" st.shared.f32 [%rd42+512], %f78;\n"
" ld.shared.f32 %f83, [%rd45+1024];\n"
" add.ftz.f32 %f79, %f83, %f79;\n"
" st.shared.f32 [%rd42+1024], %f79;\n"
" ld.shared.f32 %f84, [%rd45+1536];\n"
" add.ftz.f32 %f80, %f84, %f80;\n"
" st.shared.f32 [%rd42+1536], %f80;\n"
"$Lt_0_23554:\n"
" shr.u32 %r57, %r57, 1;\n"
" mov.u32 %r60, 0;\n"
" setp.ne.u32 %p11, %r57, %r60;\n"
" @%p11 bra $Lt_0_23298;\n"
/*
 * Copy the reduced sums back to %f27, %f26, %f25, %f28.  If vflag > 0, run the
 * same halving reduction over the six vflag accumulators, using all six rows
 * of red_acc (byte offsets 0 through 2560).
 */
"$Lt_0_22786:\n"
" mov.f32 %f27, %f77;\n"
" mov.f32 %f26, %f78;\n"
" mov.f32 %f25, %f79;\n"
" mov.f32 %f28, %f80;\n"
" ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r62, 0;\n"
" setp.le.s32 %p12, %r61, %r62;\n"
" @%p12 bra $Lt_0_24322;\n"
" mov.f32 %f77, %f6;\n"
" st.shared.f32 [%rd42+0], %f77;\n"
" mov.f32 %f78, %f8;\n"
" st.shared.f32 [%rd42+512], %f78;\n"
" mov.f32 %f79, %f10;\n"
" st.shared.f32 [%rd42+1024], %f79;\n"
" mov.f32 %f80, %f12;\n"
" st.shared.f32 [%rd42+1536], %f80;\n"
" mov.f32 %f85, %f14;\n"
" st.shared.f32 [%rd42+2048], %f85;\n"
" mov.f32 %f86, %f15;\n"
" st.shared.f32 [%rd42+2560], %f86;\n"
" mov.s32 %r63, %r56;\n"
" @!%p9 bra $Lt_0_24834;\n"
"$Lt_0_25346:\n"
" setp.ge.u32 %p13, %r13, %r63;\n"
" @%p13 bra $Lt_0_25602;\n"
" add.u32 %r64, %r2, %r63;\n"
" cvt.u64.u32 %rd46, %r64;\n"
" mul.wide.u32 %rd47, %r64, 4;\n"
" add.u64 %rd48, %rd39, %rd47;\n"
" ld.shared.f32 %f87, [%rd48+0];\n"
" add.ftz.f32 %f77, %f87, %f77;\n"
" st.shared.f32 [%rd42+0], %f77;\n"
" ld.shared.f32 %f88, [%rd48+512];\n"
" add.ftz.f32 %f78, %f88, %f78;\n"
" st.shared.f32 [%rd42+512], %f78;\n"
" ld.shared.f32 %f89, [%rd48+1024];\n"
" add.ftz.f32 %f79, %f89, %f79;\n"
" st.shared.f32 [%rd42+1024], %f79;\n"
" ld.shared.f32 %f90, [%rd48+1536];\n"
" add.ftz.f32 %f80, %f90, %f80;\n"
" st.shared.f32 [%rd42+1536], %f80;\n"
" ld.shared.f32 %f91, [%rd48+2048];\n"
" add.ftz.f32 %f85, %f91, %f85;\n"
" st.shared.f32 [%rd42+2048], %f85;\n"
" ld.shared.f32 %f92, [%rd48+2560];\n"
" add.ftz.f32 %f86, %f92, %f86;\n"
" st.shared.f32 [%rd42+2560], %f86;\n"
"$Lt_0_25602:\n"
" shr.u32 %r63, %r63, 1;\n"
" mov.u32 %r65, 0;\n"
" setp.ne.u32 %p14, %r63, %r65;\n"
" @%p14 bra $Lt_0_25346;\n"
"$Lt_0_24834:\n"
" mov.f32 %f6, %f77;\n"
" mov.f32 %f8, %f78;\n"
" mov.f32 %f10, %f79;\n"
" mov.f32 %f12, %f80;\n"
" mov.f32 %f14, %f85;\n"
" mov.f32 %f16, %f86;\n"
/*
 * Write-out, performed only by the thread whose group offset %r13 is 0.
 * The output pointer starts at __val_paramengv + 4*index and advances by
 * 4*inum bytes per stored value: %f28 first when eflag > 0, then the six
 * vflag sums when vflag > 0.
 */
"$Lt_0_24322:\n"
"$Lt_0_22274:\n"
" mov.u32 %r66, 0;\n"
" setp.ne.s32 %p15, %r13, %r66;\n"
" @%p15 bra $Lt_0_26370;\n"
" ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];\n"
" add.u64 %rd50, %rd49, %rd5;\n"
" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r68, 0;\n"
" setp.le.s32 %p16, %r67, %r68;\n"
" @%p16 bra $Lt_0_26882;\n"
" st.global.f32 [%rd50+0], %f28;\n"
" cvt.s64.s32 %rd51, %r9;\n"
" mul.wide.s32 %rd52, %r9, 4;\n"
" add.u64 %rd50, %rd50, %rd52;\n"
"$Lt_0_26882:\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p17, %r69, %r70;\n"
" @%p17 bra $Lt_0_27394;\n"
" mov.f32 %f93, %f6;\n"
" st.global.f32 [%rd50+0], %f93;\n"
" cvt.s64.s32 %rd53, %r9;\n"
" mul.wide.s32 %rd54, %r9, 4;\n"
" add.u64 %rd55, %rd54, %rd50;\n"
" mov.f32 %f94, %f8;\n"
" st.global.f32 [%rd55+0], %f94;\n"
" add.u64 %rd56, %rd54, %rd55;\n"
" mov.f32 %f95, %f10;\n"
" st.global.f32 [%rd56+0], %f95;\n"
" add.u64 %rd57, %rd54, %rd56;\n"
" mov.f32 %f96, %f12;\n"
" st.global.f32 [%rd57+0], %f96;\n"
" add.u64 %rd50, %rd54, %rd57;\n"
" mov.f32 %f97, %f14;\n"
" st.global.f32 [%rd50+0], %f97;\n"
" mov.f32 %f98, %f16;\n"
" add.u64 %rd58, %rd54, %rd50;\n"
" st.global.f32 [%rd58+0], %f98;\n"
/*
 * Store {%f27, %f26, %f25, %f99} as one 16-byte vector at ans + 16*index.
 * NOTE(review): %f99 is copied from %f100, which is never written anywhere
 * in this kernel, so the 4th stored component is undefined.  Presumably the
 * consumer ignores it -- confirm against the host-side reader of ans.
 */
"$Lt_0_27394:\n"
" ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];\n"
" mul.lo.u64 %rd60, %rd4, 16;\n"
" add.u64 %rd61, %rd59, %rd60;\n"
" mov.f32 %f99, %f100;\n"
" st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f99};\n"
"$Lt_0_26370:\n"
"$Lt_0_18690:\n"
" .loc 16 102 0\n"
" exit;\n"
"$LDWend_kernel_pair:\n"
" }\n"
/*
 * kernel_pair_fast
 *   Same outputs and neighbor-list walk as kernel_pair, but the coefficient
 *   tables are first copied into shared memory and there is no lj_types
 *   parameter: the table row stride is the constant 11.
 *   Shared storage: sp_lj (16 bytes), lj1 and lj3 copies (1936 bytes each,
 *   i.e. 121 entries of 16 bytes) and the 3072-byte reduction buffer.
 *   The x_ parameter is never loaded here either.
 */
" .entry kernel_pair_fast (\n"
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n"
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
" {\n"
" .reg .u32 %r<74>;\n"
" .reg .u64 %rd<75>;\n"
" .reg .f32 %f<109>;\n"
" .reg .pred %p<22>;\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32617_33_non_const_sp_lj3268[16];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32615_34_non_const_lj13296[1936];\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_lj35232[1936];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32685_55_non_const_red_acc7168[3072];\n"
" .loc 16 110 0\n"
"$LDWbegin_kernel_pair_fast:\n"
/* Threads with tid.x <= 3 each copy one float of sp_lj_in into the shared sp_lj table. */
" cvt.s32.u32 %r1, %tid.x;\n"
" mov.u32 %r2, 3;\n"
" setp.gt.s32 %p1, %r1, %r2;\n"
" @%p1 bra $Lt_1_20994;\n"
" .loc 16 118 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32617_33_non_const_sp_lj3268;\n"
" cvt.s64.s32 %rd2, %r1;\n"
" mul.wide.s32 %rd3, %r1, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.f32 %f1, [%rd5+0];\n"
" add.u64 %rd6, %rd3, %rd1;\n"
" st.shared.f32 [%rd6+0], %f1;\n"
/*
 * Threads with tid.x <= 120 each copy one 16-byte entry of lj1_in into the
 * shared lj1 table and, only when eflag > 0, the matching entry of lj3_in
 * into the shared lj3 table.
 */
"$Lt_1_20994:\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32617_33_non_const_sp_lj3268;\n"
" mov.u32 %r3, 120;\n"
" setp.gt.s32 %p2, %r1, %r3;\n"
" @%p2 bra $Lt_1_21506;\n"
" .loc 16 120 0\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32615_34_non_const_lj13296;\n"
" cvt.s64.s32 %rd8, %r1;\n"
" mul.wide.s32 %rd9, %r1, 16;\n"
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
" add.u64 %rd11, %rd10, %rd9;\n"
" add.u64 %rd12, %rd9, %rd7;\n"
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n"
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r5, 0;\n"
" setp.le.s32 %p3, %r4, %r5;\n"
" @%p3 bra $Lt_1_22018;\n"
" .loc 16 122 0\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232;\n"
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
" add.u64 %rd15, %rd14, %rd9;\n"
" add.u64 %rd16, %rd9, %rd13;\n"
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n"
"$Lt_1_22018:\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232;\n"
"$Lt_1_21506:\n"
" mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232;\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32615_34_non_const_lj13296;\n"
/* Zero the six accumulators %f11,%f13,%f15,%f17,%f19,%f21 (filled only when vflag > 0). */
" .loc 16 130 0\n"
" mov.f32 %f10, 0f00000000; \n"
" mov.f32 %f11, %f10;\n"
" mov.f32 %f12, 0f00000000; \n"
" mov.f32 %f13, %f12;\n"
" mov.f32 %f14, 0f00000000; \n"
" mov.f32 %f15, %f14;\n"
" mov.f32 %f16, 0f00000000; \n"
" mov.f32 %f17, %f16;\n"
" mov.f32 %f18, 0f00000000; \n"
" mov.f32 %f19, %f18;\n"
" mov.f32 %f20, 0f00000000; \n"
" mov.f32 %f21, %f20;\n"
/*
 * Barrier: placed after the shared-table copies above and before any thread
 * reads them.  Then the work index
 *   %r12 = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom)
 * is formed; threads with index >= inum branch to the exit label.
 */
" .loc 16 132 0\n"
" bar.sync 0;\n"
" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
" div.s32 %r7, %r1, %r6;\n"
" cvt.s32.u32 %r8, %ntid.x;\n"
" div.s32 %r9, %r8, %r6;\n"
" cvt.s32.u32 %r10, %ctaid.x;\n"
" mul.lo.s32 %r11, %r10, %r9;\n"
" add.s32 %r12, %r7, %r11;\n"
" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n"
" setp.ge.s32 %p4, %r12, %r13;\n"
" @%p4 bra $Lt_1_30210;\n"
/*
 * Neighbor-list addressing, same scheme as kernel_pair.
 *   %rd22 = dev_nbor + 4*index, %rd23 = %rd22 + 4*nbor_pitch,
 *   %r15  = value at %rd23 (presumably the neighbor count -- TODO confirm),
 *   %r17  = tid.x & (t_per_atom - 1)  (mask form assumes a power-of-two
 *                                      t_per_atom -- TODO confirm).
 */
" .loc 16 137 0\n"
" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
" cvt.s64.s32 %rd17, %r14;\n"
" mul.wide.s32 %rd18, %r14, 4;\n"
" cvt.s64.s32 %rd19, %r12;\n"
" mul.wide.s32 %rd20, %r12, 4;\n"
" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
" add.u64 %rd22, %rd20, %rd21;\n"
" add.u64 %rd23, %rd18, %rd22;\n"
" ld.global.s32 %r15, [%rd23+0];\n"
" sub.s32 %r16, %r6, 1;\n"
" and.b32 %r17, %r16, %r1;\n"
" cvt.s64.s32 %rd24, %r17;\n"
" mul.wide.s32 %rd25, %r17, 4;\n"
" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n"
" setp.ne.u64 %p5, %rd26, %rd21;\n"
" @%p5 bra $Lt_1_23298;\n"
/* Case dev_packed == dev_nbor: stride %r20 = nbor_pitch * t_per_atom; %rd35 = start, %rd34 = end. */
" cvt.s32.s64 %r18, %rd17;\n"
" mul.lo.s32 %r19, %r18, %r6;\n"
" mov.s32 %r20, %r19;\n"
" mul.lo.s32 %r21, %r16, %r12;\n"
" add.s32 %r22, %r18, %r21;\n"
" cvt.s64.s32 %rd27, %r22;\n"
" mul.wide.s32 %rd28, %r22, 4;\n"
" add.u64 %rd29, %rd23, %rd28;\n"
" and.b32 %r23, %r16, %r15;\n"
" cvt.s64.s32 %rd30, %r23;\n"
" div.s32 %r24, %r15, %r6;\n"
" mul.lo.s32 %r25, %r19, %r24;\n"
" cvt.s64.s32 %rd31, %r25;\n"
" add.u64 %rd32, %rd30, %rd31;\n"
" mul.lo.u64 %rd33, %rd32, 4;\n"
" add.u64 %rd34, %rd29, %rd33;\n"
" add.u64 %rd35, %rd25, %rd29;\n"
" bra.uni $Lt_1_23042;\n"
/* Case dev_packed != dev_nbor: third dev_nbor row gives the offset into dev_packed; stride %r20 = t_per_atom. */
"$Lt_1_23298:\n"
" add.u64 %rd36, %rd18, %rd23;\n"
" ld.global.s32 %r26, [%rd36+0];\n"
" cvt.s64.s32 %rd37, %r26;\n"
" mul.wide.s32 %rd38, %r26, 4;\n"
" add.u64 %rd39, %rd26, %rd38;\n"
" cvt.s64.s32 %rd40, %r15;\n"
" mul.wide.s32 %rd41, %r15, 4;\n"
" add.u64 %rd34, %rd39, %rd41;\n"
" mov.s32 %r20, %r6;\n"
" add.u64 %rd35, %rd25, %rd39;\n"
/*
 * Fetch the central item's 4-float record from pos_tex.  Its 4th component is
 * truncated to an integer, multiplied by 11 and converted back to float
 * (%f30); the neighbor's 4th component is added to it inside the loop to form
 * the shared-table entry index.
 */
"$Lt_1_23042:\n"
" .loc 16 140 0\n"
" ld.global.s32 %r27, [%rd22+0];\n"
" mov.u32 %r28, %r27;\n"
" mov.s32 %r29, 0;\n"
" mov.u32 %r30, %r29;\n"
" mov.s32 %r31, 0;\n"
" mov.u32 %r32, %r31;\n"
" mov.s32 %r33, 0;\n"
" mov.u32 %r34, %r33;\n"
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];\n"
" mov.f32 %f26, %f22;\n"
" mov.f32 %f27, %f23;\n"
" mov.f32 %f28, %f24;\n"
" mov.f32 %f29, %f25;\n"
" setp.ge.u64 %p6, %rd35, %rd34;\n"
" @%p6 bra $Lt_1_31746;\n"
" cvt.rzi.ftz.s32.f32 %r35, %f29;\n"
" cvt.s64.s32 %rd42, %r20;\n"
" mul.lo.s32 %r36, %r35, 11;\n"
" cvt.rn.f32.s32 %f30, %r36;\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
/*
 * Neighbor loop.  Entry %r37: bits 30..31 select the shared sp_lj scale
 * factor %f35, bits 0..29 (mask 0x3FFFFFFF) are the neighbor's texture index.
 * %f45/%f44/%f46 are the coordinate differences, %f49 their squared length.
 * The pair is processed only if the value at shared lj1[entry]+8 exceeds %f49.
 */
"$Lt_1_24066:\n"
" .loc 16 147 0\n"
" ld.global.s32 %r37, [%rd35+0];\n"
" .loc 16 148 0\n"
" shr.s32 %r38, %r37, 30;\n"
" and.b32 %r39, %r38, 3;\n"
" cvt.s64.s32 %rd43, %r39;\n"
" mul.wide.s32 %rd44, %r39, 4;\n"
" add.u64 %rd45, %rd1, %rd44;\n"
" ld.shared.f32 %f35, [%rd45+0];\n"
" .loc 16 151 0\n"
" and.b32 %r40, %r37, 1073741823;\n"
" mov.u32 %r41, %r40;\n"
" mov.s32 %r42, 0;\n"
" mov.u32 %r43, %r42;\n"
" mov.s32 %r44, 0;\n"
" mov.u32 %r45, %r44;\n"
" mov.s32 %r46, 0;\n"
" mov.u32 %r47, %r46;\n"
" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}];\n"
" mov.f32 %f40, %f36;\n"
" mov.f32 %f41, %f37;\n"
" mov.f32 %f42, %f38;\n"
" mov.f32 %f43, %f39;\n"
" sub.ftz.f32 %f44, %f27, %f41;\n"
" sub.ftz.f32 %f45, %f26, %f40;\n"
" sub.ftz.f32 %f46, %f28, %f42;\n"
" mul.ftz.f32 %f47, %f44, %f44;\n"
" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n"
" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n"
" add.ftz.f32 %f50, %f30, %f43;\n"
" cvt.rzi.ftz.s32.f32 %r48, %f50;\n"
" cvt.s64.s32 %rd46, %r48;\n"
" mul.wide.s32 %rd47, %r48, 16;\n"
" add.u64 %rd48, %rd47, %rd7;\n"
" ld.shared.f32 %f51, [%rd48+8];\n"
" setp.gt.ftz.f32 %p7, %f51, %f49;\n"
" @!%p7 bra $Lt_1_25346;\n"
/*
 * With a = approximate 1/%f49 and b = a*a*a, the pair term is
 *   %f61 = (b * a * %f35) * (lj1[entry].x * b - lj1[entry].y)
 * accumulated, multiplied by each coordinate difference, into %f33, %f32, %f31.
 */
" .loc 16 163 0\n"
" rcp.approx.ftz.f32 %f52, %f49;\n"
" mul.ftz.f32 %f53, %f52, %f52;\n"
" mul.ftz.f32 %f54, %f52, %f53;\n"
" mul.ftz.f32 %f55, %f52, %f35;\n"
" mul.ftz.f32 %f56, %f54, %f55;\n"
" ld.shared.v2.f32 {%f57,%f58}, [%rd48+0];\n"
" mul.ftz.f32 %f59, %f57, %f54;\n"
" sub.ftz.f32 %f60, %f59, %f58;\n"
" mul.ftz.f32 %f61, %f56, %f60;\n"
" .loc 16 165 0\n"
" fma.rn.ftz.f32 %f33, %f45, %f61, %f33;\n"
" .loc 16 166 0\n"
" fma.rn.ftz.f32 %f32, %f44, %f61, %f32;\n"
" .loc 16 167 0\n"
" fma.rn.ftz.f32 %f31, %f46, %f61, %f31;\n"
" ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r50, 0;\n"
" setp.le.s32 %p8, %r49, %r50;\n"
" @%p8 bra $Lt_1_24834;\n"
/* eflag > 0: %f34 += %f35 * (b * (lj3[entry].x * b - lj3[entry].y) - lj3[entry].z), read from the shared lj3 table. */
" .loc 16 170 0\n"
" add.u64 %rd49, %rd47, %rd13;\n"
" ld.shared.v4.f32 {%f62,%f63,%f64,_}, [%rd49+0];\n"
" mul.ftz.f32 %f65, %f62, %f54;\n"
" sub.ftz.f32 %f66, %f65, %f63;\n"
" mul.ftz.f32 %f67, %f54, %f66;\n"
" .loc 16 171 0\n"
" sub.ftz.f32 %f68, %f67, %f64;\n"
" fma.rn.ftz.f32 %f34, %f35, %f68, %f34;\n"
"$Lt_1_24834:\n"
" ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r52, 0;\n"
" setp.le.s32 %p9, %r51, %r52;\n"
" @%p9 bra $Lt_1_25346;\n"
/*
 * vflag > 0: accumulate %f61 times the six products of the coordinate
 * differences into %f11, %f13, %f15, %f17, %f19 and %f20/%f21.
 */
" .loc 16 174 0\n"
" mov.f32 %f69, %f11;\n"
" mul.ftz.f32 %f70, %f45, %f45;\n"
" fma.rn.ftz.f32 %f71, %f61, %f70, %f69;\n"
" mov.f32 %f11, %f71;\n"
" .loc 16 175 0\n"
" mov.f32 %f72, %f13;\n"
" fma.rn.ftz.f32 %f73, %f61, %f47, %f72;\n"
" mov.f32 %f13, %f73;\n"
" .loc 16 176 0\n"
" mov.f32 %f74, %f15;\n"
" mul.ftz.f32 %f75, %f46, %f46;\n"
" fma.rn.ftz.f32 %f76, %f61, %f75, %f74;\n"
" mov.f32 %f15, %f76;\n"
" .loc 16 177 0\n"
" mov.f32 %f77, %f17;\n"
" mul.ftz.f32 %f78, %f44, %f45;\n"
" fma.rn.ftz.f32 %f79, %f61, %f78, %f77;\n"
" mov.f32 %f17, %f79;\n"
" .loc 16 178 0\n"
" mov.f32 %f80, %f19;\n"
" mul.ftz.f32 %f81, %f45, %f46;\n"
" fma.rn.ftz.f32 %f82, %f61, %f81, %f80;\n"
" mov.f32 %f19, %f82;\n"
" .loc 16 179 0\n"
" mul.ftz.f32 %f83, %f44, %f46;\n"
" fma.rn.ftz.f32 %f20, %f61, %f83, %f20;\n"
" mov.f32 %f21, %f20;\n"
"$Lt_1_25346:\n"
"$Lt_1_24322:\n"
/* Advance the list pointer by the stride (4 * %r20 bytes) and loop while below the end address. */
" .loc 16 145 0\n"
" mul.lo.u64 %rd50, %rd42, 4;\n"
" add.u64 %rd35, %rd35, %rd50;\n"
" setp.lt.u64 %p10, %rd35, %rd34;\n"
" @%p10 bra $Lt_1_24066;\n"
" bra.uni $Lt_1_23554;\n"
/* Reached when the loop body is never entered: the four per-thread sums start at zero. */
"$Lt_1_31746:\n"
" mov.f32 %f31, 0f00000000; \n"
" mov.f32 %f32, 0f00000000; \n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
/*
 * Reduction across the t_per_atom threads of a group (skipped when
 * t_per_atom <= 1): %f33, %f32, %f31, %f34 go into four rows of red_acc
 * (512 bytes apart, indexed by tid.x) and are combined with a halving step
 * s = t_per_atom/2 down to 1 by threads whose group offset %r17 is below s.
 * NOTE(review): the only bar.sync in this kernel precedes the neighbor loop;
 * none appears in this reduction.  Presumably a group never spans more than
 * one warp -- confirm.
 */
"$Lt_1_23554:\n"
" mov.u32 %r53, 1;\n"
" setp.le.s32 %p11, %r6, %r53;\n"
" @%p11 bra $Lt_1_28162;\n"
" .loc 16 184 0\n"
" mov.u64 %rd51, __cuda___cuda_local_var_32685_55_non_const_red_acc7168;\n"
" cvt.s64.s32 %rd52, %r1;\n"
" mul.wide.s32 %rd53, %r1, 4;\n"
" add.u64 %rd54, %rd51, %rd53;\n"
" mov.f32 %f84, %f33;\n"
" st.shared.f32 [%rd54+0], %f84;\n"
" mov.f32 %f85, %f32;\n"
" st.shared.f32 [%rd54+512], %f85;\n"
" mov.f32 %f86, %f31;\n"
" st.shared.f32 [%rd54+1024], %f86;\n"
" mov.f32 %f87, %f34;\n"
" st.shared.f32 [%rd54+1536], %f87;\n"
" shr.s32 %r54, %r6, 31;\n"
" mov.s32 %r55, 1;\n"
" and.b32 %r56, %r54, %r55;\n"
" add.s32 %r57, %r56, %r6;\n"
" shr.s32 %r58, %r57, 1;\n"
" mov.s32 %r59, %r58;\n"
" mov.u32 %r60, 0;\n"
" setp.ne.u32 %p12, %r58, %r60;\n"
" @!%p12 bra $Lt_1_26626;\n"
"$Lt_1_27138:\n"
" setp.ge.u32 %p13, %r17, %r59;\n"
" @%p13 bra $Lt_1_27394;\n"
" add.u32 %r61, %r1, %r59;\n"
" cvt.u64.u32 %rd55, %r61;\n"
" mul.wide.u32 %rd56, %r61, 4;\n"
" add.u64 %rd57, %rd51, %rd56;\n"
" ld.shared.f32 %f88, [%rd57+0];\n"
" add.ftz.f32 %f84, %f88, %f84;\n"
" st.shared.f32 [%rd54+0], %f84;\n"
" ld.shared.f32 %f89, [%rd57+512];\n"
" add.ftz.f32 %f85, %f89, %f85;\n"
" st.shared.f32 [%rd54+512], %f85;\n"
" ld.shared.f32 %f90, [%rd57+1024];\n"
" add.ftz.f32 %f86, %f90, %f86;\n"
" st.shared.f32 [%rd54+1024], %f86;\n"
" ld.shared.f32 %f91, [%rd57+1536];\n"
" add.ftz.f32 %f87, %f91, %f87;\n"
" st.shared.f32 [%rd54+1536], %f87;\n"
"$Lt_1_27394:\n"
" shr.u32 %r59, %r59, 1;\n"
" mov.u32 %r62, 0;\n"
" setp.ne.u32 %p14, %r59, %r62;\n"
" @%p14 bra $Lt_1_27138;\n"
/*
 * Copy the reduced sums back to %f33, %f32, %f31, %f34.  If vflag > 0, run the
 * same halving reduction over the six vflag accumulators, using all six rows
 * of red_acc (byte offsets 0 through 2560).
 */
"$Lt_1_26626:\n"
" mov.f32 %f33, %f84;\n"
" mov.f32 %f32, %f85;\n"
" mov.f32 %f31, %f86;\n"
" mov.f32 %f34, %f87;\n"
" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r64, 0;\n"
" setp.le.s32 %p15, %r63, %r64;\n"
" @%p15 bra $Lt_1_28162;\n"
" mov.f32 %f84, %f11;\n"
" st.shared.f32 [%rd54+0], %f84;\n"
" mov.f32 %f85, %f13;\n"
" st.shared.f32 [%rd54+512], %f85;\n"
" mov.f32 %f86, %f15;\n"
" st.shared.f32 [%rd54+1024], %f86;\n"
" mov.f32 %f87, %f17;\n"
" st.shared.f32 [%rd54+1536], %f87;\n"
" mov.f32 %f92, %f19;\n"
" st.shared.f32 [%rd54+2048], %f92;\n"
" mov.f32 %f93, %f20;\n"
" st.shared.f32 [%rd54+2560], %f93;\n"
" mov.s32 %r65, %r58;\n"
" @!%p12 bra $Lt_1_28674;\n"
"$Lt_1_29186:\n"
" setp.ge.u32 %p16, %r17, %r65;\n"
" @%p16 bra $Lt_1_29442;\n"
" add.u32 %r66, %r1, %r65;\n"
" cvt.u64.u32 %rd58, %r66;\n"
" mul.wide.u32 %rd59, %r66, 4;\n"
" add.u64 %rd60, %rd51, %rd59;\n"
" ld.shared.f32 %f94, [%rd60+0];\n"
" add.ftz.f32 %f84, %f94, %f84;\n"
" st.shared.f32 [%rd54+0], %f84;\n"
" ld.shared.f32 %f95, [%rd60+512];\n"
" add.ftz.f32 %f85, %f95, %f85;\n"
" st.shared.f32 [%rd54+512], %f85;\n"
" ld.shared.f32 %f96, [%rd60+1024];\n"
" add.ftz.f32 %f86, %f96, %f86;\n"
" st.shared.f32 [%rd54+1024], %f86;\n"
" ld.shared.f32 %f97, [%rd60+1536];\n"
" add.ftz.f32 %f87, %f97, %f87;\n"
" st.shared.f32 [%rd54+1536], %f87;\n"
" ld.shared.f32 %f98, [%rd60+2048];\n"
" add.ftz.f32 %f92, %f98, %f92;\n"
" st.shared.f32 [%rd54+2048], %f92;\n"
" ld.shared.f32 %f99, [%rd60+2560];\n"
" add.ftz.f32 %f93, %f99, %f93;\n"
" st.shared.f32 [%rd54+2560], %f93;\n"
"$Lt_1_29442:\n"
" shr.u32 %r65, %r65, 1;\n"
" mov.u32 %r67, 0;\n"
" setp.ne.u32 %p17, %r65, %r67;\n"
" @%p17 bra $Lt_1_29186;\n"
"$Lt_1_28674:\n"
" mov.f32 %f11, %f84;\n"
" mov.f32 %f13, %f85;\n"
" mov.f32 %f15, %f86;\n"
" mov.f32 %f17, %f87;\n"
" mov.f32 %f19, %f92;\n"
" mov.f32 %f21, %f93;\n"
/*
 * Write-out, performed only by the thread whose group offset %r17 is 0.
 * The output pointer starts at __val_paramengv + 4*index and advances by
 * 4*inum bytes per stored value: %f34 first when eflag > 0, then the six
 * vflag sums when vflag > 0.
 */
"$Lt_1_28162:\n"
"$Lt_1_26114:\n"
" mov.u32 %r68, 0;\n"
" setp.ne.s32 %p18, %r17, %r68;\n"
" @%p18 bra $Lt_1_30210;\n"
" ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];\n"
" add.u64 %rd62, %rd61, %rd20;\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p19, %r69, %r70;\n"
" @%p19 bra $Lt_1_30722;\n"
" st.global.f32 [%rd62+0], %f34;\n"
" cvt.s64.s32 %rd63, %r13;\n"
" mul.wide.s32 %rd64, %r13, 4;\n"
" add.u64 %rd62, %rd62, %rd64;\n"
"$Lt_1_30722:\n"
" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n"
" mov.u32 %r72, 0;\n"
" setp.le.s32 %p20, %r71, %r72;\n"
" @%p20 bra $Lt_1_31234;\n"
" mov.f32 %f100, %f11;\n"
" st.global.f32 [%rd62+0], %f100;\n"
" cvt.s64.s32 %rd65, %r13;\n"
" mul.wide.s32 %rd66, %r13, 4;\n"
" add.u64 %rd67, %rd66, %rd62;\n"
" mov.f32 %f101, %f13;\n"
" st.global.f32 [%rd67+0], %f101;\n"
" add.u64 %rd68, %rd66, %rd67;\n"
" mov.f32 %f102, %f15;\n"
" st.global.f32 [%rd68+0], %f102;\n"
" add.u64 %rd69, %rd66, %rd68;\n"
" mov.f32 %f103, %f17;\n"
" st.global.f32 [%rd69+0], %f103;\n"
" add.u64 %rd62, %rd66, %rd69;\n"
" mov.f32 %f104, %f19;\n"
" st.global.f32 [%rd62+0], %f104;\n"
" mov.f32 %f105, %f21;\n"
" add.u64 %rd70, %rd66, %rd62;\n"
" st.global.f32 [%rd70+0], %f105;\n"
/*
 * Store {%f33, %f32, %f31, %f106} as one 16-byte vector at ans + 16*index.
 * NOTE(review): %f106 is copied from %f107, which is never written anywhere
 * in this kernel, so the 4th stored component is undefined.  Presumably the
 * consumer ignores it -- confirm against the host-side reader of ans.
 */
"$Lt_1_31234:\n"
" ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];\n"
" mul.lo.u64 %rd72, %rd19, 16;\n"
" add.u64 %rd73, %rd71, %rd72;\n"
" mov.f32 %f106, %f107;\n"
" st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f106};\n"
"$Lt_1_30210:\n"
"$Lt_1_22530:\n"
" .loc 16 187 0\n"
" exit;\n"
"$LDWend_kernel_pair_fast:\n"
" }\n"
;

View File

@ -1,921 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009e26_00000000-9_lal_morse.cpp3.i (/home/sjplimp/ccBI#.ffCTdB)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009e26_00000000-8_lal_morse.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lal_morse.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
.global .texref pos_tex;
// ---------------------------------------------------------------------------
// kernel_pair
//
// Compiler-generated PTX; the ".loc 16 N 0" directives give the line of the
// originating source statement.
//
// Per-thread work, as established by the instructions below:
//   * ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom); threads with
//     ii >= inum exit immediately.
//   * A list of packed 32-bit neighbor entries is walked.  The top two bits of
//     each entry select one of four factors copied from sp_lj_in, the low 30
//     bits are the texture index of the partner position.
//   * For every partner whose squared distance is below the first component of
//     the selected mor1 record, a 3-component force is accumulated; an energy
//     term is added when eflag > 0 and six pairwise products when vflag > 0.
//   * When t_per_atom > 1 the partial sums of the threads sharing one ii are
//     combined through shared memory; the thread with lane offset 0 stores
//     the results to engv and ans.
//
// Parameters (all read with ld.param below unless noted):
//   x_            not read in this body; positions come from pos_tex instead
//   mor1          global array of 16-byte records, indexed by
//                 lj_types * (int)w_i + (int)w_j
//   mor2          global array of 8-byte records, same index, used if eflag > 0
//   lj_types      row stride of the mor1/mor2 tables
//   sp_lj_in      four f32 factors
//   dev_nbor      neighbor bookkeeping array with row pitch nbor_pitch
//   dev_packed    neighbor list storage; may equal dev_nbor (tested below)
//   ans           output, one 16-byte record per ii
//   engv          output, f32 values spaced inum elements apart
//   eflag, vflag  enable energy / six-product accumulation when > 0
//   inum          number of ii values processed
//   nbor_pitch    row pitch (in 4-byte elements) of dev_nbor
//   t_per_atom    number of threads cooperating on one ii
//
// NOTE(review): the exp(-a*(r - r0)) form and the mor1/mor2 names suggest a
// Morse pair potential -- confirm against the originating .cu source.
// ---------------------------------------------------------------------------
.entry kernel_pair (
.param .u64 __cudaparm_kernel_pair_x_,
.param .u64 __cudaparm_kernel_pair_mor1,
.param .u64 __cudaparm_kernel_pair_mor2,
.param .s32 __cudaparm_kernel_pair_lj_types,
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_dev_nbor,
.param .u64 __cudaparm_kernel_pair_dev_packed,
.param .u64 __cudaparm_kernel_pair_ans,
.param .u64 __cudaparm_kernel_pair___val_paramengv,
.param .s32 __cudaparm_kernel_pair_eflag,
.param .s32 __cudaparm_kernel_pair_vflag,
.param .s32 __cudaparm_kernel_pair_inum,
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_t_per_atom)
{
.reg .u32 %r<72>;
.reg .u64 %rd<64>;
.reg .f32 %f<104>;
.reg .f64 %fd<10>;
.reg .pred %p<19>;
// Shared copy of the four sp_lj_in factors (4 x f32 = 16 bytes).
.shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];
// Reduction scratch: six slots of 512 bytes each (offsets 0..2560 below),
// indexed by tid.x * 4.
// NOTE(review): assumes ntid.x <= 128 so that slots do not overlap -- confirm
// against the launch configuration.
.shared .align 4 .b8 __cuda___cuda_local_var_32601_55_non_const_red_acc108[3072];
// __cuda_local_var_32543_10_non_const_f = 48
// __cuda_local_var_32545_9_non_const_virial = 16
.loc 16 31 0
$LDWbegin_kernel_pair:
.loc 16 36 0
// Every thread copies the four sp_lj_in factors into shared memory with one
// vector store.  No barrier instruction follows in this kernel.
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
ldu.global.f32 %f1, [%rd1+0];
.loc 16 37 0
ld.global.f32 %f2, [%rd1+4];
.loc 16 38 0
ld.global.f32 %f3, [%rd1+8];
.loc 16 39 0
ld.global.f32 %f4, [%rd1+12];
st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
.loc 16 46 0
// Zero the six accumulators that are updated under vflag > 0:
// %f6, %f8, %f10, %f12, %f14, %f16 (with %f15 shadowing %f16).
mov.f32 %f5, 0f00000000; // 0
mov.f32 %f6, %f5;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, %f7;
mov.f32 %f9, 0f00000000; // 0
mov.f32 %f10, %f9;
mov.f32 %f11, 0f00000000; // 0
mov.f32 %f12, %f11;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, %f13;
mov.f32 %f15, 0f00000000; // 0
mov.f32 %f16, %f15;
// %r1 = t_per_atom, %r2 = tid.x
// %r8 = ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom)
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
cvt.s32.u32 %r6, %ctaid.x;
mul.lo.s32 %r7, %r6, %r5;
add.s32 %r8, %r3, %r7;
// %r9 = inum; threads with ii >= inum go straight to exit.
ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];
setp.ge.s32 %p1, %r8, %r9;
@%p1 bra $Lt_0_26370;
.loc 16 51 0
// %rd3 = nbor_pitch * 4 (byte pitch), %rd5 = ii * 4
// %rd7 = &dev_nbor[ii]              (row 0)
// %rd8 = &dev_nbor[ii + nbor_pitch] (row 1); %r11 is the value stored there.
ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];
cvt.s64.s32 %rd2, %r10;
mul.wide.s32 %rd3, %r10, 4;
cvt.s64.s32 %rd4, %r8;
mul.wide.s32 %rd5, %r8, 4;
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
add.u64 %rd7, %rd5, %rd6;
add.u64 %rd8, %rd3, %rd7;
ld.global.s32 %r11, [%rd8+0];
// %r12 = t_per_atom - 1, %r13 = tid.x & (t_per_atom - 1): lane offset of this
// thread within its group (equals tid.x % t_per_atom only when t_per_atom is
// a power of two).  %rd10 = lane offset in bytes.
sub.s32 %r12, %r1, 1;
and.b32 %r13, %r12, %r2;
cvt.s64.s32 %rd9, %r13;
mul.wide.s32 %rd10, %r13, 4;
// Two list layouts, selected by whether dev_packed aliases dev_nbor.
ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];
setp.ne.u64 %p2, %rd11, %rd6;
@%p2 bra $Lt_0_19458;
// --- dev_packed == dev_nbor -------------------------------------------------
// stride  (%r16) = nbor_pitch * t_per_atom elements
// base    (%rd14) = row 1 + (nbor_pitch + (t_per_atom - 1) * ii) elements
// end     (%rd19) = base + ((%r11 & (t_per_atom-1)) + stride * (%r11 / t_per_atom)) elements
// current (%rd20) = base + lane offset
cvt.s32.s64 %r14, %rd2;
mul.lo.s32 %r15, %r14, %r1;
mov.s32 %r16, %r15;
mul.lo.s32 %r17, %r12, %r8;
add.s32 %r18, %r14, %r17;
cvt.s64.s32 %rd12, %r18;
mul.wide.s32 %rd13, %r18, 4;
add.u64 %rd14, %rd8, %rd13;
and.b32 %r19, %r12, %r11;
cvt.s64.s32 %rd15, %r19;
div.s32 %r20, %r11, %r1;
mul.lo.s32 %r21, %r15, %r20;
cvt.s64.s32 %rd16, %r21;
add.u64 %rd17, %rd15, %rd16;
mul.lo.u64 %rd18, %rd17, 4;
add.u64 %rd19, %rd14, %rd18;
add.u64 %rd20, %rd10, %rd14;
bra.uni $Lt_0_19202;
$Lt_0_19458:
// --- dev_packed != dev_nbor -------------------------------------------------
// %r22 = dev_nbor[ii + 2 * nbor_pitch] is an element offset into dev_packed.
// base (%rd24) = dev_packed + %r22 elements, end (%rd19) = base + %r11 elements,
// stride (%r16) = t_per_atom, current (%rd20) = base + lane offset.
add.u64 %rd21, %rd3, %rd8;
ld.global.s32 %r22, [%rd21+0];
cvt.s64.s32 %rd22, %r22;
mul.wide.s32 %rd23, %r22, 4;
add.u64 %rd24, %rd11, %rd23;
cvt.s64.s32 %rd25, %r11;
mul.wide.s32 %rd26, %r11, 4;
add.u64 %rd19, %rd24, %rd26;
mov.s32 %r16, %r1;
add.u64 %rd20, %rd10, %rd24;
$Lt_0_19202:
.loc 16 54 0
// Fetch the four-component record for index dev_nbor[ii] from pos_tex:
// %f21..%f23 are the coordinates used in the differences below, %f24 is the
// fourth component (later truncated to an integer table row).
ld.global.s32 %r23, [%rd7+0];
mov.u32 %r24, %r23;
mov.s32 %r25, 0;
mov.u32 %r26, %r25;
mov.s32 %r27, 0;
mov.u32 %r28, %r27;
mov.s32 %r29, 0;
mov.u32 %r30, %r29;
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];
mov.f32 %f21, %f17;
mov.f32 %f22, %f18;
mov.f32 %f23, %f19;
mov.f32 %f24, %f20;
// Empty list for this thread: skip the loop and zero the accumulators.
setp.ge.u64 %p3, %rd20, %rd19;
@%p3 bra $Lt_0_27906;
// Loop invariants: %r33 = lj_types * (int)%f24 (table row base),
// %rd27 = stride in elements, %rd28 = mor1, %rd29 = shared sp_lj address.
// Accumulators: %f27/%f26/%f25 = force components, %f28 = energy.
cvt.rzi.ftz.s32.f32 %r31, %f24;
cvt.s64.s32 %rd27, %r16;
ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];
mul.lo.s32 %r33, %r32, %r31;
ld.param.u64 %rd28, [__cudaparm_kernel_pair_mor1];
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;
$Lt_0_20226:
//<loop> Loop body line 54, nesting depth: 1, estimated iterations: unknown
.loc 16 60 0
// %r34 = packed neighbor entry.
ld.global.s32 %r34, [%rd20+0];
.loc 16 61 0
// Bits 31:30 of the entry select the factor %f29 = sp_lj[(entry >> 30) & 3].
shr.s32 %r35, %r34, 30;
and.b32 %r36, %r35, 3;
cvt.s64.s32 %rd30, %r36;
mul.wide.s32 %rd31, %r36, 4;
add.u64 %rd32, %rd29, %rd31;
ld.shared.f32 %f29, [%rd32+0];
.loc 16 64 0
// Low 30 bits (mask 1073741823 = 2^30 - 1) are the texture index of the partner.
and.b32 %r37, %r34, 1073741823;
mov.u32 %r38, %r37;
mov.s32 %r39, 0;
mov.u32 %r40, %r39;
mov.s32 %r41, 0;
mov.u32 %r42, %r41;
mov.s32 %r43, 0;
mov.u32 %r44, %r43;
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];
mov.f32 %f34, %f30;
mov.f32 %f35, %f31;
mov.f32 %f36, %f32;
mov.f32 %f37, %f33;
// %r45 = (int) fourth component of the partner record.
// %f39/%f38/%f40 = component differences (first, second, third coordinate);
// %f43 = sum of their squares (squared distance).
// %r46 = table index = %r45 + lj_types * (int)%f24; %rd35 = &mor1[%r46].
cvt.rzi.ftz.s32.f32 %r45, %f37;
sub.ftz.f32 %f38, %f22, %f35;
sub.ftz.f32 %f39, %f21, %f34;
sub.ftz.f32 %f40, %f23, %f36;
mul.ftz.f32 %f41, %f38, %f38;
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
add.s32 %r46, %r45, %r33;
cvt.s64.s32 %rd33, %r46;
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
mul.wide.s32 %rd34, %r46, 16;
add.u64 %rd35, %rd28, %rd34;
// Cutoff test: process the pair only if squared distance < mor1[idx] component 0.
ld.global.f32 %f44, [%rd35+0];
setp.gt.ftz.f32 %p4, %f44, %f43;
@!%p4 bra $Lt_0_21506;
.loc 16 77 0
// %f45 = r = sqrt(squared distance) (approximate).
// mor1[idx] components 1..3 -> %f46, %f47, %f48; %f51 = -%f48 * (r - %f47).
sqrt.approx.ftz.f32 %f45, %f43;
ld.global.v4.f32 {_,%f46,%f47,%f48}, [%rd35+0];
sub.ftz.f32 %f49, %f45, %f47;
mul.ftz.f32 %f50, %f48, %f49;
neg.ftz.f32 %f51, %f50;
.loc 16 79 0
// exp(x) evaluated as 2^(x * log2(e)); 0f3fb8aa3b is log2(e) ~ 1.4427.
// %f54 = e = exp(-%f48 * (r - %f47)); %f57 = %f46 * (e*e - e).
mov.f32 %f52, 0f3fb8aa3b; // 1.4427
mul.ftz.f32 %f53, %f51, %f52;
ex2.approx.ftz.f32 %f54, %f53;
mul.ftz.f32 %f55, %f54, %f54;
sub.ftz.f32 %f56, %f55, %f54;
mul.ftz.f32 %f57, %f46, %f56;
.loc 16 81 0
// %f59 = (%f57 / r) * sp_lj factor; force components += difference * %f59.
div.approx.ftz.f32 %f58, %f57, %f45;
mul.ftz.f32 %f59, %f58, %f29;
fma.rn.ftz.f32 %f27, %f39, %f59, %f27;
.loc 16 82 0
fma.rn.ftz.f32 %f26, %f38, %f59, %f26;
.loc 16 83 0
fma.rn.ftz.f32 %f25, %f40, %f59, %f25;
ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];
mov.u32 %r48, 0;
setp.le.s32 %p5, %r47, %r48;
@%p5 bra $Lt_0_20994;
.loc 16 87 0
// Energy term (eflag > 0), evaluated partly in double precision:
//   {%f60,%f61} = mor2[idx]
//   %f28 += factor * (float)(%f60 * (e*e - 2*e) - %f61)
cvt.ftz.f64.f32 %fd1, %f54;
ld.param.u64 %rd36, [__cudaparm_kernel_pair_mor2];
mul.lo.u64 %rd37, %rd33, 8;
add.u64 %rd38, %rd36, %rd37;
ld.global.v2.f32 {%f60,%f61}, [%rd38+0];
cvt.ftz.f64.f32 %fd2, %f61;
cvt.ftz.f64.f32 %fd3, %f60;
mul.ftz.f32 %f62, %f54, %f54;
cvt.ftz.f64.f32 %fd4, %f62;
add.f64 %fd5, %fd1, %fd1;
sub.f64 %fd6, %fd4, %fd5;
mul.f64 %fd7, %fd3, %fd6;
sub.f64 %fd8, %fd7, %fd2;
cvt.rn.ftz.f32.f64 %f63, %fd8;
fma.rn.ftz.f32 %f28, %f29, %f63, %f28;
$Lt_0_20994:
ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];
mov.u32 %r50, 0;
setp.le.s32 %p6, %r49, %r50;
@%p6 bra $Lt_0_21506;
.loc 16 90 0
// Six products (vflag > 0), each scaled by %f59, with d0/d1/d2 = %f39/%f38/%f40:
//   %f6 += d0*d0, %f8 += d1*d1, %f10 += d2*d2,
//   %f12 += d0*d1, %f14 += d0*d2, %f15 (= %f16) += d1*d2
mov.f32 %f64, %f6;
mul.ftz.f32 %f65, %f39, %f39;
fma.rn.ftz.f32 %f66, %f59, %f65, %f64;
mov.f32 %f6, %f66;
.loc 16 91 0
mov.f32 %f67, %f8;
fma.rn.ftz.f32 %f68, %f59, %f41, %f67;
mov.f32 %f8, %f68;
.loc 16 92 0
mov.f32 %f69, %f10;
mul.ftz.f32 %f70, %f40, %f40;
fma.rn.ftz.f32 %f71, %f59, %f70, %f69;
mov.f32 %f10, %f71;
.loc 16 93 0
mov.f32 %f72, %f12;
mul.ftz.f32 %f73, %f38, %f39;
fma.rn.ftz.f32 %f74, %f59, %f73, %f72;
mov.f32 %f12, %f74;
.loc 16 94 0
mov.f32 %f75, %f14;
mul.ftz.f32 %f76, %f39, %f40;
fma.rn.ftz.f32 %f77, %f59, %f76, %f75;
mov.f32 %f14, %f77;
.loc 16 95 0
mul.ftz.f32 %f78, %f38, %f40;
fma.rn.ftz.f32 %f15, %f59, %f78, %f15;
mov.f32 %f16, %f15;
$Lt_0_21506:
$Lt_0_20482:
.loc 16 58 0
// Advance the list pointer by stride elements; loop while it is below the end.
mul.lo.u64 %rd39, %rd27, 4;
add.u64 %rd20, %rd20, %rd39;
setp.lt.u64 %p7, %rd20, %rd19;
@%p7 bra $Lt_0_20226;
bra.uni $Lt_0_19714;
$Lt_0_27906:
// Empty-list path: force and energy accumulators are zero.
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
$Lt_0_19714:
// Cross-thread reduction is only needed when t_per_atom > 1.
mov.u32 %r51, 1;
setp.le.s32 %p8, %r1, %r51;
@%p8 bra $Lt_0_24322;
.loc 16 100 0
// %rd43 = &red_acc[tid.x]; publish the three force components and the energy
// into slots 0..3 (byte offsets 0, 512, 1024, 1536).
mov.u64 %rd40, __cuda___cuda_local_var_32601_55_non_const_red_acc108;
cvt.s64.s32 %rd41, %r2;
mul.wide.s32 %rd42, %r2, 4;
add.u64 %rd43, %rd40, %rd42;
mov.f32 %f79, %f27;
st.shared.f32 [%rd43+0], %f79;
mov.f32 %f80, %f26;
st.shared.f32 [%rd43+512], %f80;
mov.f32 %f81, %f25;
st.shared.f32 [%rd43+1024], %f81;
mov.f32 %f82, %f28;
st.shared.f32 [%rd43+1536], %f82;
// %r56 = t_per_atom / 2 (signed division by two, rounding toward zero).
shr.s32 %r52, %r1, 31;
mov.s32 %r53, 1;
and.b32 %r54, %r52, %r53;
add.s32 %r55, %r54, %r1;
shr.s32 %r56, %r55, 1;
mov.s32 %r57, %r56;
mov.u32 %r58, 0;
setp.ne.u32 %p9, %r56, %r58;
@!%p9 bra $Lt_0_22786;
$Lt_0_23298:
// Reduction step with offset s = %r57: threads whose lane offset is < s add
// the values held by thread tid.x + s and write the sums back to their slots.
// No barrier instruction appears between steps.
// NOTE(review): presumably relies on all cooperating threads belonging to the
// same warp -- confirm.
setp.ge.u32 %p10, %r13, %r57;
@%p10 bra $Lt_0_23554;
add.u32 %r59, %r2, %r57;
cvt.u64.u32 %rd44, %r59;
mul.wide.u32 %rd45, %r59, 4;
add.u64 %rd46, %rd40, %rd45;
ld.shared.f32 %f83, [%rd46+0];
add.ftz.f32 %f79, %f83, %f79;
st.shared.f32 [%rd43+0], %f79;
ld.shared.f32 %f84, [%rd46+512];
add.ftz.f32 %f80, %f84, %f80;
st.shared.f32 [%rd43+512], %f80;
ld.shared.f32 %f85, [%rd46+1024];
add.ftz.f32 %f81, %f85, %f81;
st.shared.f32 [%rd43+1024], %f81;
ld.shared.f32 %f86, [%rd46+1536];
add.ftz.f32 %f82, %f86, %f82;
st.shared.f32 [%rd43+1536], %f82;
$Lt_0_23554:
// s >>= 1; repeat while s != 0.
shr.u32 %r57, %r57, 1;
mov.u32 %r60, 0;
setp.ne.u32 %p11, %r57, %r60;
@%p11 bra $Lt_0_23298;
$Lt_0_22786:
// Copy the reduced force/energy back into the accumulator registers.
mov.f32 %f27, %f79;
mov.f32 %f26, %f80;
mov.f32 %f25, %f81;
mov.f32 %f28, %f82;
ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];
mov.u32 %r62, 0;
setp.le.s32 %p12, %r61, %r62;
@%p12 bra $Lt_0_24322;
// vflag > 0: same reduction for the six products, using slots 0..5
// (byte offsets 0..2560) of the same scratch array.
mov.f32 %f79, %f6;
st.shared.f32 [%rd43+0], %f79;
mov.f32 %f80, %f8;
st.shared.f32 [%rd43+512], %f80;
mov.f32 %f81, %f10;
st.shared.f32 [%rd43+1024], %f81;
mov.f32 %f82, %f12;
st.shared.f32 [%rd43+1536], %f82;
mov.f32 %f87, %f14;
st.shared.f32 [%rd43+2048], %f87;
mov.f32 %f88, %f15;
st.shared.f32 [%rd43+2560], %f88;
mov.s32 %r63, %r56;
@!%p9 bra $Lt_0_24834;
$Lt_0_25346:
setp.ge.u32 %p13, %r13, %r63;
@%p13 bra $Lt_0_25602;
add.u32 %r64, %r2, %r63;
cvt.u64.u32 %rd47, %r64;
mul.wide.u32 %rd48, %r64, 4;
add.u64 %rd49, %rd40, %rd48;
ld.shared.f32 %f89, [%rd49+0];
add.ftz.f32 %f79, %f89, %f79;
st.shared.f32 [%rd43+0], %f79;
ld.shared.f32 %f90, [%rd49+512];
add.ftz.f32 %f80, %f90, %f80;
st.shared.f32 [%rd43+512], %f80;
ld.shared.f32 %f91, [%rd49+1024];
add.ftz.f32 %f81, %f91, %f81;
st.shared.f32 [%rd43+1024], %f81;
ld.shared.f32 %f92, [%rd49+1536];
add.ftz.f32 %f82, %f92, %f82;
st.shared.f32 [%rd43+1536], %f82;
ld.shared.f32 %f93, [%rd49+2048];
add.ftz.f32 %f87, %f93, %f87;
st.shared.f32 [%rd43+2048], %f87;
ld.shared.f32 %f94, [%rd49+2560];
add.ftz.f32 %f88, %f94, %f88;
st.shared.f32 [%rd43+2560], %f88;
$Lt_0_25602:
shr.u32 %r63, %r63, 1;
mov.u32 %r65, 0;
setp.ne.u32 %p14, %r63, %r65;
@%p14 bra $Lt_0_25346;
$Lt_0_24834:
// Copy the reduced six products back into their accumulator registers.
mov.f32 %f6, %f79;
mov.f32 %f8, %f80;
mov.f32 %f10, %f81;
mov.f32 %f12, %f82;
mov.f32 %f14, %f87;
mov.f32 %f16, %f88;
$Lt_0_24322:
$Lt_0_22274:
// Only the thread with lane offset 0 writes results.
mov.u32 %r66, 0;
setp.ne.s32 %p15, %r13, %r66;
@%p15 bra $Lt_0_26370;
// %rd51 = &engv[ii]; successive outputs are inum elements (inum * 4 bytes) apart.
ld.param.u64 %rd50, [__cudaparm_kernel_pair___val_paramengv];
add.u64 %rd51, %rd50, %rd5;
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
mov.u32 %r68, 0;
setp.le.s32 %p16, %r67, %r68;
@%p16 bra $Lt_0_26882;
// eflag > 0: store the energy, then advance the output pointer by inum elements.
st.global.f32 [%rd51+0], %f28;
cvt.s64.s32 %rd52, %r9;
mul.wide.s32 %rd53, %r9, 4;
add.u64 %rd51, %rd51, %rd53;
$Lt_0_26882:
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
mov.u32 %r70, 0;
setp.le.s32 %p17, %r69, %r70;
@%p17 bra $Lt_0_27394;
// vflag > 0: store the six products in the order %f6, %f8, %f10, %f12, %f14,
// %f16, each inum elements after the previous one.
mov.f32 %f95, %f6;
st.global.f32 [%rd51+0], %f95;
cvt.s64.s32 %rd54, %r9;
mul.wide.s32 %rd55, %r9, 4;
add.u64 %rd56, %rd55, %rd51;
mov.f32 %f96, %f8;
st.global.f32 [%rd56+0], %f96;
add.u64 %rd57, %rd55, %rd56;
mov.f32 %f97, %f10;
st.global.f32 [%rd57+0], %f97;
add.u64 %rd58, %rd55, %rd57;
mov.f32 %f98, %f12;
st.global.f32 [%rd58+0], %f98;
add.u64 %rd51, %rd55, %rd58;
mov.f32 %f99, %f14;
st.global.f32 [%rd51+0], %f99;
mov.f32 %f100, %f16;
add.u64 %rd59, %rd55, %rd51;
st.global.f32 [%rd59+0], %f100;
$Lt_0_27394:
// Store the force record at ans + ii * 16 bytes.
// NOTE(review): %f102 is never written in this kernel, so the fourth stored
// component is unspecified; consumers should only rely on the first three.
ld.param.u64 %rd60, [__cudaparm_kernel_pair_ans];
mul.lo.u64 %rd61, %rd4, 16;
add.u64 %rd62, %rd60, %rd61;
mov.f32 %f101, %f102;
st.global.v4.f32 [%rd62+0], {%f27,%f26,%f25,%f101};
$Lt_0_26370:
$Lt_0_18690:
.loc 16 103 0
exit;
$LDWend_kernel_pair:
} // kernel_pair
// ---------------------------------------------------------------------------
// kernel_pair_fast
//
// Compiler-generated PTX; the ".loc 16 N 0" directives give the line of the
// originating source statement.
//
// Same per-pair arithmetic as kernel_pair, with these differences visible in
// the instructions below:
//   * The coefficient tables are first staged into shared memory: threads
//     with tid.x <= 3 copy one sp_lj_in factor each, threads with
//     tid.x <= 120 copy one 16-byte mor1_in record each and, when eflag > 0,
//     one 8-byte mor2_in record each.  A bar.sync 0 follows the staging.
//   * There is no lj_types parameter; the table row stride is the constant 11
//     (121 records = 11 * 11, matching the 1936- and 968-byte shared arrays).
//   * The table index is formed in floating point and then truncated.
//   * The energy term is evaluated entirely in single precision.
//
// Parameters (all read with ld.param below unless noted):
//   x_            not read in this body; positions come from pos_tex instead
//   mor1_in       global array of 16-byte records (first 121 are staged)
//   mor2_in       global array of 8-byte records (first 121 staged if eflag > 0)
//   sp_lj_in      four f32 factors
//   dev_nbor      neighbor bookkeeping array with row pitch nbor_pitch
//   dev_packed    neighbor list storage; may equal dev_nbor (tested below)
//   ans           output, one 16-byte record per ii
//   engv          output, f32 values spaced inum elements apart
//   eflag, vflag  enable energy / six-product accumulation when > 0
//   inum          number of ii values processed
//   nbor_pitch    row pitch (in 4-byte elements) of dev_nbor
//   t_per_atom    number of threads cooperating on one ii
//
// NOTE(review): staging 121 records by thread index assumes ntid.x >= 121 --
// confirm against the launch configuration.
// ---------------------------------------------------------------------------
.entry kernel_pair_fast (
.param .u64 __cudaparm_kernel_pair_fast_x_,
.param .u64 __cudaparm_kernel_pair_fast_mor1_in,
.param .u64 __cudaparm_kernel_pair_fast_mor2_in,
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
.param .u64 __cudaparm_kernel_pair_fast_ans,
.param .u64 __cudaparm_kernel_pair_fast___val_paramengv,
.param .s32 __cudaparm_kernel_pair_fast_eflag,
.param .s32 __cudaparm_kernel_pair_fast_vflag,
.param .s32 __cudaparm_kernel_pair_fast_inum,
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
{
.reg .u32 %r<74>;
.reg .u64 %rd<77>;
.reg .f32 %f<110>;
.reg .pred %p<22>;
// Shared staging areas: 4 x f32 factors, 121 x 16-byte records,
// 121 x 8-byte records, and the reduction scratch (six 512-byte slots).
.shared .align 4 .b8 __cuda___cuda_local_var_32618_33_non_const_sp_lj3268[16];
.shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_mor13296[1936];
.shared .align 8 .b8 __cuda___cuda_local_var_32617_34_non_const_mor25232[968];
.shared .align 4 .b8 __cuda___cuda_local_var_32688_55_non_const_red_acc6200[3072];
// __cuda_local_var_32628_10_non_const_f = 48
// __cuda_local_var_32630_9_non_const_virial = 16
.loc 16 111 0
$LDWbegin_kernel_pair_fast:
// %r1 = tid.x.  Threads 0..3 each copy one sp_lj_in factor to shared memory.
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 3;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_20994;
.loc 16 119 0
mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd3, %r1, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_20994:
// %rd1 is (re)materialized here so it is defined on both paths.
mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;
// Threads 0..120 each copy one mor1_in record (16 bytes) to shared memory.
mov.u32 %r3, 120;
setp.gt.s32 %p2, %r1, %r3;
@%p2 bra $Lt_1_21506;
.loc 16 121 0
mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_mor13296;
cvt.s64.s32 %rd8, %r1;
mul.wide.s32 %rd9, %r1, 16;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_mor1_in];
add.u64 %rd11, %rd10, %rd9;
add.u64 %rd12, %rd9, %rd7;
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
// The mor2_in records (8 bytes each) are staged only when eflag > 0.
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r5, 0;
setp.le.s32 %p3, %r4, %r5;
@%p3 bra $Lt_1_22018;
.loc 16 123 0
mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232;
mul.lo.u64 %rd14, %rd8, 8;
ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast_mor2_in];
add.u64 %rd16, %rd15, %rd14;
add.u64 %rd17, %rd14, %rd13;
ld.global.v2.f32 {%f6,%f7}, [%rd16+0];
st.shared.v2.f32 [%rd17+0], {%f6,%f7};
$Lt_1_22018:
mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232;
$Lt_1_21506:
// %rd13 / %rd7 = shared mor2 / mor1 base addresses, defined on every path.
mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232;
mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_mor13296;
.loc 16 131 0
// Zero the six accumulators that are updated under vflag > 0:
// %f9, %f11, %f13, %f15, %f17, %f19 (with %f18 shadowing %f19).
mov.f32 %f8, 0f00000000; // 0
mov.f32 %f9, %f8;
mov.f32 %f10, 0f00000000; // 0
mov.f32 %f11, %f10;
mov.f32 %f12, 0f00000000; // 0
mov.f32 %f13, %f12;
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, %f14;
mov.f32 %f16, 0f00000000; // 0
mov.f32 %f17, %f16;
mov.f32 %f18, 0f00000000; // 0
mov.f32 %f19, %f18;
.loc 16 133 0
// Block-wide barrier after the shared tables are written and before any
// thread reads them (it precedes the early exit below).
bar.sync 0;
// %r6 = t_per_atom
// %r12 = ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom)
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
div.s32 %r7, %r1, %r6;
cvt.s32.u32 %r8, %ntid.x;
div.s32 %r9, %r8, %r6;
cvt.s32.u32 %r10, %ctaid.x;
mul.lo.s32 %r11, %r10, %r9;
add.s32 %r12, %r7, %r11;
// %r13 = inum; threads with ii >= inum go straight to exit.
ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];
setp.ge.s32 %p4, %r12, %r13;
@%p4 bra $Lt_1_30210;
.loc 16 138 0
// %rd19 = nbor_pitch * 4 (byte pitch), %rd21 = ii * 4
// %rd23 = &dev_nbor[ii]              (row 0)
// %rd24 = &dev_nbor[ii + nbor_pitch] (row 1); %r15 is the value stored there.
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];
cvt.s64.s32 %rd18, %r14;
mul.wide.s32 %rd19, %r14, 4;
cvt.s64.s32 %rd20, %r12;
mul.wide.s32 %rd21, %r12, 4;
ld.param.u64 %rd22, [__cudaparm_kernel_pair_fast_dev_nbor];
add.u64 %rd23, %rd21, %rd22;
add.u64 %rd24, %rd19, %rd23;
ld.global.s32 %r15, [%rd24+0];
// %r16 = t_per_atom - 1, %r17 = tid.x & (t_per_atom - 1): lane offset of this
// thread within its group (equals tid.x % t_per_atom only when t_per_atom is
// a power of two).  %rd26 = lane offset in bytes.
sub.s32 %r16, %r6, 1;
and.b32 %r17, %r16, %r1;
cvt.s64.s32 %rd25, %r17;
mul.wide.s32 %rd26, %r17, 4;
// Two list layouts, selected by whether dev_packed aliases dev_nbor.
ld.param.u64 %rd27, [__cudaparm_kernel_pair_fast_dev_packed];
setp.ne.u64 %p5, %rd27, %rd22;
@%p5 bra $Lt_1_23298;
// --- dev_packed == dev_nbor -------------------------------------------------
// stride  (%r20) = nbor_pitch * t_per_atom elements
// base    (%rd30) = row 1 + (nbor_pitch + (t_per_atom - 1) * ii) elements
// end     (%rd35) = base + ((%r15 & (t_per_atom-1)) + stride * (%r15 / t_per_atom)) elements
// current (%rd36) = base + lane offset
cvt.s32.s64 %r18, %rd18;
mul.lo.s32 %r19, %r18, %r6;
mov.s32 %r20, %r19;
mul.lo.s32 %r21, %r16, %r12;
add.s32 %r22, %r18, %r21;
cvt.s64.s32 %rd28, %r22;
mul.wide.s32 %rd29, %r22, 4;
add.u64 %rd30, %rd24, %rd29;
and.b32 %r23, %r16, %r15;
cvt.s64.s32 %rd31, %r23;
div.s32 %r24, %r15, %r6;
mul.lo.s32 %r25, %r19, %r24;
cvt.s64.s32 %rd32, %r25;
add.u64 %rd33, %rd31, %rd32;
mul.lo.u64 %rd34, %rd33, 4;
add.u64 %rd35, %rd30, %rd34;
add.u64 %rd36, %rd26, %rd30;
bra.uni $Lt_1_23042;
$Lt_1_23298:
// --- dev_packed != dev_nbor -------------------------------------------------
// %r26 = dev_nbor[ii + 2 * nbor_pitch] is an element offset into dev_packed.
// base (%rd40) = dev_packed + %r26 elements, end (%rd35) = base + %r15 elements,
// stride (%r20) = t_per_atom, current (%rd36) = base + lane offset.
add.u64 %rd37, %rd19, %rd24;
ld.global.s32 %r26, [%rd37+0];
cvt.s64.s32 %rd38, %r26;
mul.wide.s32 %rd39, %r26, 4;
add.u64 %rd40, %rd27, %rd39;
cvt.s64.s32 %rd41, %r15;
mul.wide.s32 %rd42, %r15, 4;
add.u64 %rd35, %rd40, %rd42;
mov.s32 %r20, %r6;
add.u64 %rd36, %rd26, %rd40;
$Lt_1_23042:
.loc 16 141 0
// Fetch the four-component record for index dev_nbor[ii] from pos_tex:
// %f24..%f26 are the coordinates used in the differences below, %f27 is the
// fourth component (later truncated to an integer table row).
ld.global.s32 %r27, [%rd23+0];
mov.u32 %r28, %r27;
mov.s32 %r29, 0;
mov.u32 %r30, %r29;
mov.s32 %r31, 0;
mov.u32 %r32, %r31;
mov.s32 %r33, 0;
mov.u32 %r34, %r33;
tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[pos_tex,{%r28,%r30,%r32,%r34}];
mov.f32 %f24, %f20;
mov.f32 %f25, %f21;
mov.f32 %f26, %f22;
mov.f32 %f27, %f23;
// Empty list for this thread: skip the loop and zero the accumulators.
setp.ge.u64 %p6, %rd36, %rd35;
@%p6 bra $Lt_1_31746;
// Loop invariants: %rd43 = stride in elements,
// %f28 = (float)(11 * (int)%f27) = table row base kept in floating point.
// Accumulators: %f31/%f30/%f29 = force components, %f32 = energy.
cvt.rzi.ftz.s32.f32 %r35, %f27;
cvt.s64.s32 %rd43, %r20;
mul.lo.s32 %r36, %r35, 11;
cvt.rn.f32.s32 %f28, %r36;
mov.f32 %f29, 0f00000000; // 0
mov.f32 %f30, 0f00000000; // 0
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
$Lt_1_24066:
//<loop> Loop body line 141, nesting depth: 1, estimated iterations: unknown
.loc 16 148 0
// %r37 = packed neighbor entry.
ld.global.s32 %r37, [%rd36+0];
.loc 16 149 0
// Bits 31:30 of the entry select the factor %f33 = sp_lj[(entry >> 30) & 3].
shr.s32 %r38, %r37, 30;
and.b32 %r39, %r38, 3;
cvt.s64.s32 %rd44, %r39;
mul.wide.s32 %rd45, %r39, 4;
add.u64 %rd46, %rd1, %rd45;
ld.shared.f32 %f33, [%rd46+0];
.loc 16 152 0
// Low 30 bits (mask 1073741823 = 2^30 - 1) are the texture index of the partner.
and.b32 %r40, %r37, 1073741823;
mov.u32 %r41, %r40;
mov.s32 %r42, 0;
mov.u32 %r43, %r42;
mov.s32 %r44, 0;
mov.u32 %r45, %r44;
mov.s32 %r46, 0;
mov.u32 %r47, %r46;
tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r41,%r43,%r45,%r47}];
mov.f32 %f38, %f34;
mov.f32 %f39, %f35;
mov.f32 %f40, %f36;
mov.f32 %f41, %f37;
// %f43/%f42/%f44 = component differences (first, second, third coordinate);
// %f47 = sum of their squares (squared distance).
sub.ftz.f32 %f42, %f25, %f39;
sub.ftz.f32 %f43, %f24, %f38;
sub.ftz.f32 %f44, %f26, %f40;
mul.ftz.f32 %f45, %f42, %f42;
fma.rn.ftz.f32 %f46, %f43, %f43, %f45;
fma.rn.ftz.f32 %f47, %f44, %f44, %f46;
// Table index %r48 = (int)(%f28 + partner's fourth component);
// %rd49 = address of shared mor1 record %r48.
add.ftz.f32 %f48, %f28, %f41;
cvt.rzi.ftz.s32.f32 %r48, %f48;
cvt.s64.s32 %rd47, %r48;
mul.wide.s32 %rd48, %r48, 16;
add.u64 %rd49, %rd7, %rd48;
// Cutoff test: process the pair only if squared distance < record component 0.
ld.shared.f32 %f49, [%rd49+0];
setp.gt.ftz.f32 %p7, %f49, %f47;
@!%p7 bra $Lt_1_25346;
.loc 16 163 0
// %f50 = r = sqrt(squared distance) (approximate).
// Record components 1..3 -> %f51, %f52, %f53; %f56 = -%f53 * (r - %f52).
sqrt.approx.ftz.f32 %f50, %f47;
ld.shared.v4.f32 {_,%f51,%f52,%f53}, [%rd49+0];
sub.ftz.f32 %f54, %f50, %f52;
.loc 16 164 0
mul.ftz.f32 %f55, %f53, %f54;
neg.ftz.f32 %f56, %f55;
.loc 16 166 0
// exp(x) evaluated as 2^(x * log2(e)); 0f3fb8aa3b is log2(e) ~ 1.4427.
// %f59 = e = exp(-%f53 * (r - %f52)); %f61 = e*e - e; %f62 = %f51 * %f61.
mov.f32 %f57, 0f3fb8aa3b; // 1.4427
mul.ftz.f32 %f58, %f56, %f57;
ex2.approx.ftz.f32 %f59, %f58;
mul.ftz.f32 %f60, %f59, %f59;
sub.ftz.f32 %f61, %f60, %f59;
mul.ftz.f32 %f62, %f51, %f61;
.loc 16 168 0
// %f64 = (%f62 / r) * sp_lj factor; force components += difference * %f64.
div.approx.ftz.f32 %f63, %f62, %f50;
mul.ftz.f32 %f64, %f63, %f33;
fma.rn.ftz.f32 %f31, %f43, %f64, %f31;
.loc 16 169 0
fma.rn.ftz.f32 %f30, %f42, %f64, %f30;
.loc 16 170 0
fma.rn.ftz.f32 %f29, %f44, %f64, %f29;
ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r50, 0;
setp.le.s32 %p8, %r49, %r50;
@%p8 bra $Lt_1_24834;
.loc 16 173 0
// Energy term (eflag > 0), single precision:
//   {%f65,%f66} = shared mor2 record %r48
//   %f67 = (e*e - e) - e = e*e - 2*e
//   %f32 += factor * (%f65 * %f67 - %f66)
mul.lo.u64 %rd50, %rd47, 8;
add.u64 %rd51, %rd13, %rd50;
ld.shared.v2.f32 {%f65,%f66}, [%rd51+0];
sub.ftz.f32 %f67, %f61, %f59;
mul.ftz.f32 %f68, %f65, %f67;
sub.ftz.f32 %f69, %f68, %f66;
.loc 16 174 0
fma.rn.ftz.f32 %f32, %f33, %f69, %f32;
$Lt_1_24834:
ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r52, 0;
setp.le.s32 %p9, %r51, %r52;
@%p9 bra $Lt_1_25346;
.loc 16 177 0
// Six products (vflag > 0), each scaled by %f64, with d0/d1/d2 = %f43/%f42/%f44:
//   %f9 += d0*d0, %f11 += d1*d1, %f13 += d2*d2,
//   %f15 += d0*d1, %f17 += d0*d2, %f18 (= %f19) += d1*d2
mov.f32 %f70, %f9;
mul.ftz.f32 %f71, %f43, %f43;
fma.rn.ftz.f32 %f72, %f64, %f71, %f70;
mov.f32 %f9, %f72;
.loc 16 178 0
mov.f32 %f73, %f11;
fma.rn.ftz.f32 %f74, %f64, %f45, %f73;
mov.f32 %f11, %f74;
.loc 16 179 0
mov.f32 %f75, %f13;
mul.ftz.f32 %f76, %f44, %f44;
fma.rn.ftz.f32 %f77, %f64, %f76, %f75;
mov.f32 %f13, %f77;
.loc 16 180 0
mov.f32 %f78, %f15;
mul.ftz.f32 %f79, %f42, %f43;
fma.rn.ftz.f32 %f80, %f64, %f79, %f78;
mov.f32 %f15, %f80;
.loc 16 181 0
mov.f32 %f81, %f17;
mul.ftz.f32 %f82, %f43, %f44;
fma.rn.ftz.f32 %f83, %f64, %f82, %f81;
mov.f32 %f17, %f83;
.loc 16 182 0
mul.ftz.f32 %f84, %f42, %f44;
fma.rn.ftz.f32 %f18, %f64, %f84, %f18;
mov.f32 %f19, %f18;
$Lt_1_25346:
$Lt_1_24322:
.loc 16 146 0
// Advance the list pointer by stride elements; loop while it is below the end.
mul.lo.u64 %rd52, %rd43, 4;
add.u64 %rd36, %rd36, %rd52;
setp.lt.u64 %p10, %rd36, %rd35;
@%p10 bra $Lt_1_24066;
bra.uni $Lt_1_23554;
$Lt_1_31746:
// Empty-list path: force and energy accumulators are zero.
mov.f32 %f29, 0f00000000; // 0
mov.f32 %f30, 0f00000000; // 0
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
$Lt_1_23554:
// Cross-thread reduction is only needed when t_per_atom > 1.
mov.u32 %r53, 1;
setp.le.s32 %p11, %r6, %r53;
@%p11 bra $Lt_1_28162;
.loc 16 187 0
// %rd56 = &red_acc[tid.x]; publish the three force components and the energy
// into slots 0..3 (byte offsets 0, 512, 1024, 1536).
// NOTE(review): assumes ntid.x <= 128 so that slots do not overlap -- confirm
// against the launch configuration.
mov.u64 %rd53, __cuda___cuda_local_var_32688_55_non_const_red_acc6200;
cvt.s64.s32 %rd54, %r1;
mul.wide.s32 %rd55, %r1, 4;
add.u64 %rd56, %rd53, %rd55;
mov.f32 %f85, %f31;
st.shared.f32 [%rd56+0], %f85;
mov.f32 %f86, %f30;
st.shared.f32 [%rd56+512], %f86;
mov.f32 %f87, %f29;
st.shared.f32 [%rd56+1024], %f87;
mov.f32 %f88, %f32;
st.shared.f32 [%rd56+1536], %f88;
// %r58 = t_per_atom / 2 (signed division by two, rounding toward zero).
shr.s32 %r54, %r6, 31;
mov.s32 %r55, 1;
and.b32 %r56, %r54, %r55;
add.s32 %r57, %r56, %r6;
shr.s32 %r58, %r57, 1;
mov.s32 %r59, %r58;
mov.u32 %r60, 0;
setp.ne.u32 %p12, %r58, %r60;
@!%p12 bra $Lt_1_26626;
$Lt_1_27138:
// Reduction step with offset s = %r59: threads whose lane offset is < s add
// the values held by thread tid.x + s and write the sums back to their slots.
// No barrier instruction appears between steps.
// NOTE(review): presumably relies on all cooperating threads belonging to the
// same warp -- confirm.
setp.ge.u32 %p13, %r17, %r59;
@%p13 bra $Lt_1_27394;
add.u32 %r61, %r1, %r59;
cvt.u64.u32 %rd57, %r61;
mul.wide.u32 %rd58, %r61, 4;
add.u64 %rd59, %rd53, %rd58;
ld.shared.f32 %f89, [%rd59+0];
add.ftz.f32 %f85, %f89, %f85;
st.shared.f32 [%rd56+0], %f85;
ld.shared.f32 %f90, [%rd59+512];
add.ftz.f32 %f86, %f90, %f86;
st.shared.f32 [%rd56+512], %f86;
ld.shared.f32 %f91, [%rd59+1024];
add.ftz.f32 %f87, %f91, %f87;
st.shared.f32 [%rd56+1024], %f87;
ld.shared.f32 %f92, [%rd59+1536];
add.ftz.f32 %f88, %f92, %f88;
st.shared.f32 [%rd56+1536], %f88;
$Lt_1_27394:
// s >>= 1; repeat while s != 0.
shr.u32 %r59, %r59, 1;
mov.u32 %r62, 0;
setp.ne.u32 %p14, %r59, %r62;
@%p14 bra $Lt_1_27138;
$Lt_1_26626:
// Copy the reduced force/energy back into the accumulator registers.
mov.f32 %f31, %f85;
mov.f32 %f30, %f86;
mov.f32 %f29, %f87;
mov.f32 %f32, %f88;
ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r64, 0;
setp.le.s32 %p15, %r63, %r64;
@%p15 bra $Lt_1_28162;
// vflag > 0: same reduction for the six products, using slots 0..5
// (byte offsets 0..2560) of the same scratch array.
mov.f32 %f85, %f9;
st.shared.f32 [%rd56+0], %f85;
mov.f32 %f86, %f11;
st.shared.f32 [%rd56+512], %f86;
mov.f32 %f87, %f13;
st.shared.f32 [%rd56+1024], %f87;
mov.f32 %f88, %f15;
st.shared.f32 [%rd56+1536], %f88;
mov.f32 %f93, %f17;
st.shared.f32 [%rd56+2048], %f93;
mov.f32 %f94, %f18;
st.shared.f32 [%rd56+2560], %f94;
mov.s32 %r65, %r58;
@!%p12 bra $Lt_1_28674;
$Lt_1_29186:
setp.ge.u32 %p16, %r17, %r65;
@%p16 bra $Lt_1_29442;
add.u32 %r66, %r1, %r65;
cvt.u64.u32 %rd60, %r66;
mul.wide.u32 %rd61, %r66, 4;
add.u64 %rd62, %rd53, %rd61;
ld.shared.f32 %f95, [%rd62+0];
add.ftz.f32 %f85, %f95, %f85;
st.shared.f32 [%rd56+0], %f85;
ld.shared.f32 %f96, [%rd62+512];
add.ftz.f32 %f86, %f96, %f86;
st.shared.f32 [%rd56+512], %f86;
ld.shared.f32 %f97, [%rd62+1024];
add.ftz.f32 %f87, %f97, %f87;
st.shared.f32 [%rd56+1024], %f87;
ld.shared.f32 %f98, [%rd62+1536];
add.ftz.f32 %f88, %f98, %f88;
st.shared.f32 [%rd56+1536], %f88;
ld.shared.f32 %f99, [%rd62+2048];
add.ftz.f32 %f93, %f99, %f93;
st.shared.f32 [%rd56+2048], %f93;
ld.shared.f32 %f100, [%rd62+2560];
add.ftz.f32 %f94, %f100, %f94;
st.shared.f32 [%rd56+2560], %f94;
$Lt_1_29442:
shr.u32 %r65, %r65, 1;
mov.u32 %r67, 0;
setp.ne.u32 %p17, %r65, %r67;
@%p17 bra $Lt_1_29186;
$Lt_1_28674:
// Copy the reduced six products back into their accumulator registers.
mov.f32 %f9, %f85;
mov.f32 %f11, %f86;
mov.f32 %f13, %f87;
mov.f32 %f15, %f88;
mov.f32 %f17, %f93;
mov.f32 %f19, %f94;
$Lt_1_28162:
$Lt_1_26114:
// Only the thread with lane offset 0 writes results.
mov.u32 %r68, 0;
setp.ne.s32 %p18, %r17, %r68;
@%p18 bra $Lt_1_30210;
// %rd64 = &engv[ii]; successive outputs are inum elements (inum * 4 bytes) apart.
ld.param.u64 %rd63, [__cudaparm_kernel_pair_fast___val_paramengv];
add.u64 %rd64, %rd63, %rd21;
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r70, 0;
setp.le.s32 %p19, %r69, %r70;
@%p19 bra $Lt_1_30722;
// eflag > 0: store the energy, then advance the output pointer by inum elements.
st.global.f32 [%rd64+0], %f32;
cvt.s64.s32 %rd65, %r13;
mul.wide.s32 %rd66, %r13, 4;
add.u64 %rd64, %rd64, %rd66;
$Lt_1_30722:
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r72, 0;
setp.le.s32 %p20, %r71, %r72;
@%p20 bra $Lt_1_31234;
// vflag > 0: store the six products in the order %f9, %f11, %f13, %f15, %f17,
// %f19, each inum elements after the previous one.
mov.f32 %f101, %f9;
st.global.f32 [%rd64+0], %f101;
cvt.s64.s32 %rd67, %r13;
mul.wide.s32 %rd68, %r13, 4;
add.u64 %rd69, %rd68, %rd64;
mov.f32 %f102, %f11;
st.global.f32 [%rd69+0], %f102;
add.u64 %rd70, %rd68, %rd69;
mov.f32 %f103, %f13;
st.global.f32 [%rd70+0], %f103;
add.u64 %rd71, %rd68, %rd70;
mov.f32 %f104, %f15;
st.global.f32 [%rd71+0], %f104;
add.u64 %rd64, %rd68, %rd71;
mov.f32 %f105, %f17;
st.global.f32 [%rd64+0], %f105;
mov.f32 %f106, %f19;
add.u64 %rd72, %rd68, %rd64;
st.global.f32 [%rd72+0], %f106;
$Lt_1_31234:
// Store the force record at ans + ii * 16 bytes.
// NOTE(review): %f108 is never written in this kernel, so the fourth stored
// component is unspecified; consumers should only rely on the first three.
ld.param.u64 %rd73, [__cudaparm_kernel_pair_fast_ans];
mul.lo.u64 %rd74, %rd20, 16;
add.u64 %rd75, %rd73, %rd74;
mov.f32 %f107, %f108;
st.global.v4.f32 [%rd75+0], {%f31,%f30,%f29,%f107};
$Lt_1_30210:
$Lt_1_22530:
.loc 16 190 0
exit;
$LDWend_kernel_pair_fast:
} // kernel_pair_fast

View File

@ -1,869 +0,0 @@
const char * morse =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
" .global .texref pos_tex;\n"
" .entry kernel_pair (\n"
" .param .u64 __cudaparm_kernel_pair_x_,\n"
" .param .u64 __cudaparm_kernel_pair_mor1,\n"
" .param .u64 __cudaparm_kernel_pair_mor2,\n"
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
" .param .u64 __cudaparm_kernel_pair_ans,\n"
" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n"
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
" .param .s32 __cudaparm_kernel_pair_inum,\n"
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
" {\n"
" .reg .u32 %r<72>;\n"
" .reg .u64 %rd<64>;\n"
" .reg .f32 %f<104>;\n"
" .reg .f64 %fd<10>;\n"
" .reg .pred %p<19>;\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32601_55_non_const_red_acc108[3072];\n"
" .loc 16 31 0\n"
"$LDWbegin_kernel_pair:\n"
" .loc 16 36 0\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
" ldu.global.f32 %f1, [%rd1+0];\n"
" .loc 16 37 0\n"
" ld.global.f32 %f2, [%rd1+4];\n"
" .loc 16 38 0\n"
" ld.global.f32 %f3, [%rd1+8];\n"
" .loc 16 39 0\n"
" ld.global.f32 %f4, [%rd1+12];\n"
" st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n"
" .loc 16 46 0\n"
" mov.f32 %f5, 0f00000000; \n"
" mov.f32 %f6, %f5;\n"
" mov.f32 %f7, 0f00000000; \n"
" mov.f32 %f8, %f7;\n"
" mov.f32 %f9, 0f00000000; \n"
" mov.f32 %f10, %f9;\n"
" mov.f32 %f11, 0f00000000; \n"
" mov.f32 %f12, %f11;\n"
" mov.f32 %f13, 0f00000000; \n"
" mov.f32 %f14, %f13;\n"
" mov.f32 %f15, 0f00000000; \n"
" mov.f32 %f16, %f15;\n"
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
" cvt.s32.u32 %r2, %tid.x;\n"
" div.s32 %r3, %r2, %r1;\n"
" cvt.s32.u32 %r4, %ntid.x;\n"
" div.s32 %r5, %r4, %r1;\n"
" cvt.s32.u32 %r6, %ctaid.x;\n"
" mul.lo.s32 %r7, %r6, %r5;\n"
" add.s32 %r8, %r3, %r7;\n"
" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n"
" setp.ge.s32 %p1, %r8, %r9;\n"
" @%p1 bra $Lt_0_26370;\n"
" .loc 16 51 0\n"
" ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];\n"
" cvt.s64.s32 %rd2, %r10;\n"
" mul.wide.s32 %rd3, %r10, 4;\n"
" cvt.s64.s32 %rd4, %r8;\n"
" mul.wide.s32 %rd5, %r8, 4;\n"
" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n"
" add.u64 %rd7, %rd5, %rd6;\n"
" add.u64 %rd8, %rd3, %rd7;\n"
" ld.global.s32 %r11, [%rd8+0];\n"
" sub.s32 %r12, %r1, 1;\n"
" and.b32 %r13, %r12, %r2;\n"
" cvt.s64.s32 %rd9, %r13;\n"
" mul.wide.s32 %rd10, %r13, 4;\n"
" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n"
" setp.ne.u64 %p2, %rd11, %rd6;\n"
" @%p2 bra $Lt_0_19458;\n"
" cvt.s32.s64 %r14, %rd2;\n"
" mul.lo.s32 %r15, %r14, %r1;\n"
" mov.s32 %r16, %r15;\n"
" mul.lo.s32 %r17, %r12, %r8;\n"
" add.s32 %r18, %r14, %r17;\n"
" cvt.s64.s32 %rd12, %r18;\n"
" mul.wide.s32 %rd13, %r18, 4;\n"
" add.u64 %rd14, %rd8, %rd13;\n"
" and.b32 %r19, %r12, %r11;\n"
" cvt.s64.s32 %rd15, %r19;\n"
" div.s32 %r20, %r11, %r1;\n"
" mul.lo.s32 %r21, %r15, %r20;\n"
" cvt.s64.s32 %rd16, %r21;\n"
" add.u64 %rd17, %rd15, %rd16;\n"
" mul.lo.u64 %rd18, %rd17, 4;\n"
" add.u64 %rd19, %rd14, %rd18;\n"
" add.u64 %rd20, %rd10, %rd14;\n"
" bra.uni $Lt_0_19202;\n"
"$Lt_0_19458:\n"
" add.u64 %rd21, %rd3, %rd8;\n"
" ld.global.s32 %r22, [%rd21+0];\n"
" cvt.s64.s32 %rd22, %r22;\n"
" mul.wide.s32 %rd23, %r22, 4;\n"
" add.u64 %rd24, %rd11, %rd23;\n"
" cvt.s64.s32 %rd25, %r11;\n"
" mul.wide.s32 %rd26, %r11, 4;\n"
" add.u64 %rd19, %rd24, %rd26;\n"
" mov.s32 %r16, %r1;\n"
" add.u64 %rd20, %rd10, %rd24;\n"
"$Lt_0_19202:\n"
" .loc 16 54 0\n"
" ld.global.s32 %r23, [%rd7+0];\n"
" mov.u32 %r24, %r23;\n"
" mov.s32 %r25, 0;\n"
" mov.u32 %r26, %r25;\n"
" mov.s32 %r27, 0;\n"
" mov.u32 %r28, %r27;\n"
" mov.s32 %r29, 0;\n"
" mov.u32 %r30, %r29;\n"
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n"
" mov.f32 %f21, %f17;\n"
" mov.f32 %f22, %f18;\n"
" mov.f32 %f23, %f19;\n"
" mov.f32 %f24, %f20;\n"
" setp.ge.u64 %p3, %rd20, %rd19;\n"
" @%p3 bra $Lt_0_27906;\n"
" cvt.rzi.ftz.s32.f32 %r31, %f24;\n"
" cvt.s64.s32 %rd27, %r16;\n"
" ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];\n"
" mul.lo.s32 %r33, %r32, %r31;\n"
" ld.param.u64 %rd28, [__cudaparm_kernel_pair_mor1];\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
" mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;\n"
"$Lt_0_20226:\n"
" .loc 16 60 0\n"
" ld.global.s32 %r34, [%rd20+0];\n"
" .loc 16 61 0\n"
" shr.s32 %r35, %r34, 30;\n"
" and.b32 %r36, %r35, 3;\n"
" cvt.s64.s32 %rd30, %r36;\n"
" mul.wide.s32 %rd31, %r36, 4;\n"
" add.u64 %rd32, %rd29, %rd31;\n"
" ld.shared.f32 %f29, [%rd32+0];\n"
" .loc 16 64 0\n"
" and.b32 %r37, %r34, 1073741823;\n"
" mov.u32 %r38, %r37;\n"
" mov.s32 %r39, 0;\n"
" mov.u32 %r40, %r39;\n"
" mov.s32 %r41, 0;\n"
" mov.u32 %r42, %r41;\n"
" mov.s32 %r43, 0;\n"
" mov.u32 %r44, %r43;\n"
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];\n"
" mov.f32 %f34, %f30;\n"
" mov.f32 %f35, %f31;\n"
" mov.f32 %f36, %f32;\n"
" mov.f32 %f37, %f33;\n"
" cvt.rzi.ftz.s32.f32 %r45, %f37;\n"
" sub.ftz.f32 %f38, %f22, %f35;\n"
" sub.ftz.f32 %f39, %f21, %f34;\n"
" sub.ftz.f32 %f40, %f23, %f36;\n"
" mul.ftz.f32 %f41, %f38, %f38;\n"
" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n"
" add.s32 %r46, %r45, %r33;\n"
" cvt.s64.s32 %rd33, %r46;\n"
" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n"
" mul.wide.s32 %rd34, %r46, 16;\n"
" add.u64 %rd35, %rd28, %rd34;\n"
" ld.global.f32 %f44, [%rd35+0];\n"
" setp.gt.ftz.f32 %p4, %f44, %f43;\n"
" @!%p4 bra $Lt_0_21506;\n"
" .loc 16 77 0\n"
" sqrt.approx.ftz.f32 %f45, %f43;\n"
" ld.global.v4.f32 {_,%f46,%f47,%f48}, [%rd35+0];\n"
" sub.ftz.f32 %f49, %f45, %f47;\n"
" mul.ftz.f32 %f50, %f48, %f49;\n"
" neg.ftz.f32 %f51, %f50;\n"
" .loc 16 79 0\n"
" mov.f32 %f52, 0f3fb8aa3b; \n"
" mul.ftz.f32 %f53, %f51, %f52;\n"
" ex2.approx.ftz.f32 %f54, %f53;\n"
" mul.ftz.f32 %f55, %f54, %f54;\n"
" sub.ftz.f32 %f56, %f55, %f54;\n"
" mul.ftz.f32 %f57, %f46, %f56;\n"
" .loc 16 81 0\n"
" div.approx.ftz.f32 %f58, %f57, %f45;\n"
" mul.ftz.f32 %f59, %f58, %f29;\n"
" fma.rn.ftz.f32 %f27, %f39, %f59, %f27;\n"
" .loc 16 82 0\n"
" fma.rn.ftz.f32 %f26, %f38, %f59, %f26;\n"
" .loc 16 83 0\n"
" fma.rn.ftz.f32 %f25, %f40, %f59, %f25;\n"
" ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r48, 0;\n"
" setp.le.s32 %p5, %r47, %r48;\n"
" @%p5 bra $Lt_0_20994;\n"
" .loc 16 87 0\n"
" cvt.ftz.f64.f32 %fd1, %f54;\n"
" ld.param.u64 %rd36, [__cudaparm_kernel_pair_mor2];\n"
" mul.lo.u64 %rd37, %rd33, 8;\n"
" add.u64 %rd38, %rd36, %rd37;\n"
" ld.global.v2.f32 {%f60,%f61}, [%rd38+0];\n"
" cvt.ftz.f64.f32 %fd2, %f61;\n"
" cvt.ftz.f64.f32 %fd3, %f60;\n"
" mul.ftz.f32 %f62, %f54, %f54;\n"
" cvt.ftz.f64.f32 %fd4, %f62;\n"
" add.f64 %fd5, %fd1, %fd1;\n"
" sub.f64 %fd6, %fd4, %fd5;\n"
" mul.f64 %fd7, %fd3, %fd6;\n"
" sub.f64 %fd8, %fd7, %fd2;\n"
" cvt.rn.ftz.f32.f64 %f63, %fd8;\n"
" fma.rn.ftz.f32 %f28, %f29, %f63, %f28;\n"
"$Lt_0_20994:\n"
" ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r50, 0;\n"
" setp.le.s32 %p6, %r49, %r50;\n"
" @%p6 bra $Lt_0_21506;\n"
" .loc 16 90 0\n"
" mov.f32 %f64, %f6;\n"
" mul.ftz.f32 %f65, %f39, %f39;\n"
" fma.rn.ftz.f32 %f66, %f59, %f65, %f64;\n"
" mov.f32 %f6, %f66;\n"
" .loc 16 91 0\n"
" mov.f32 %f67, %f8;\n"
" fma.rn.ftz.f32 %f68, %f59, %f41, %f67;\n"
" mov.f32 %f8, %f68;\n"
" .loc 16 92 0\n"
" mov.f32 %f69, %f10;\n"
" mul.ftz.f32 %f70, %f40, %f40;\n"
" fma.rn.ftz.f32 %f71, %f59, %f70, %f69;\n"
" mov.f32 %f10, %f71;\n"
" .loc 16 93 0\n"
" mov.f32 %f72, %f12;\n"
" mul.ftz.f32 %f73, %f38, %f39;\n"
" fma.rn.ftz.f32 %f74, %f59, %f73, %f72;\n"
" mov.f32 %f12, %f74;\n"
" .loc 16 94 0\n"
" mov.f32 %f75, %f14;\n"
" mul.ftz.f32 %f76, %f39, %f40;\n"
" fma.rn.ftz.f32 %f77, %f59, %f76, %f75;\n"
" mov.f32 %f14, %f77;\n"
" .loc 16 95 0\n"
" mul.ftz.f32 %f78, %f38, %f40;\n"
" fma.rn.ftz.f32 %f15, %f59, %f78, %f15;\n"
" mov.f32 %f16, %f15;\n"
"$Lt_0_21506:\n"
"$Lt_0_20482:\n"
" .loc 16 58 0\n"
" mul.lo.u64 %rd39, %rd27, 4;\n"
" add.u64 %rd20, %rd20, %rd39;\n"
" setp.lt.u64 %p7, %rd20, %rd19;\n"
" @%p7 bra $Lt_0_20226;\n"
" bra.uni $Lt_0_19714;\n"
"$Lt_0_27906:\n"
" mov.f32 %f25, 0f00000000; \n"
" mov.f32 %f26, 0f00000000; \n"
" mov.f32 %f27, 0f00000000; \n"
" mov.f32 %f28, 0f00000000; \n"
"$Lt_0_19714:\n"
" mov.u32 %r51, 1;\n"
" setp.le.s32 %p8, %r1, %r51;\n"
" @%p8 bra $Lt_0_24322;\n"
" .loc 16 100 0\n"
" mov.u64 %rd40, __cuda___cuda_local_var_32601_55_non_const_red_acc108;\n"
" cvt.s64.s32 %rd41, %r2;\n"
" mul.wide.s32 %rd42, %r2, 4;\n"
" add.u64 %rd43, %rd40, %rd42;\n"
" mov.f32 %f79, %f27;\n"
" st.shared.f32 [%rd43+0], %f79;\n"
" mov.f32 %f80, %f26;\n"
" st.shared.f32 [%rd43+512], %f80;\n"
" mov.f32 %f81, %f25;\n"
" st.shared.f32 [%rd43+1024], %f81;\n"
" mov.f32 %f82, %f28;\n"
" st.shared.f32 [%rd43+1536], %f82;\n"
" shr.s32 %r52, %r1, 31;\n"
" mov.s32 %r53, 1;\n"
" and.b32 %r54, %r52, %r53;\n"
" add.s32 %r55, %r54, %r1;\n"
" shr.s32 %r56, %r55, 1;\n"
" mov.s32 %r57, %r56;\n"
" mov.u32 %r58, 0;\n"
" setp.ne.u32 %p9, %r56, %r58;\n"
" @!%p9 bra $Lt_0_22786;\n"
"$Lt_0_23298:\n"
" setp.ge.u32 %p10, %r13, %r57;\n"
" @%p10 bra $Lt_0_23554;\n"
" add.u32 %r59, %r2, %r57;\n"
" cvt.u64.u32 %rd44, %r59;\n"
" mul.wide.u32 %rd45, %r59, 4;\n"
" add.u64 %rd46, %rd40, %rd45;\n"
" ld.shared.f32 %f83, [%rd46+0];\n"
" add.ftz.f32 %f79, %f83, %f79;\n"
" st.shared.f32 [%rd43+0], %f79;\n"
" ld.shared.f32 %f84, [%rd46+512];\n"
" add.ftz.f32 %f80, %f84, %f80;\n"
" st.shared.f32 [%rd43+512], %f80;\n"
" ld.shared.f32 %f85, [%rd46+1024];\n"
" add.ftz.f32 %f81, %f85, %f81;\n"
" st.shared.f32 [%rd43+1024], %f81;\n"
" ld.shared.f32 %f86, [%rd46+1536];\n"
" add.ftz.f32 %f82, %f86, %f82;\n"
" st.shared.f32 [%rd43+1536], %f82;\n"
"$Lt_0_23554:\n"
" shr.u32 %r57, %r57, 1;\n"
" mov.u32 %r60, 0;\n"
" setp.ne.u32 %p11, %r57, %r60;\n"
" @%p11 bra $Lt_0_23298;\n"
"$Lt_0_22786:\n"
" mov.f32 %f27, %f79;\n"
" mov.f32 %f26, %f80;\n"
" mov.f32 %f25, %f81;\n"
" mov.f32 %f28, %f82;\n"
" ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r62, 0;\n"
" setp.le.s32 %p12, %r61, %r62;\n"
" @%p12 bra $Lt_0_24322;\n"
" mov.f32 %f79, %f6;\n"
" st.shared.f32 [%rd43+0], %f79;\n"
" mov.f32 %f80, %f8;\n"
" st.shared.f32 [%rd43+512], %f80;\n"
" mov.f32 %f81, %f10;\n"
" st.shared.f32 [%rd43+1024], %f81;\n"
" mov.f32 %f82, %f12;\n"
" st.shared.f32 [%rd43+1536], %f82;\n"
" mov.f32 %f87, %f14;\n"
" st.shared.f32 [%rd43+2048], %f87;\n"
" mov.f32 %f88, %f15;\n"
" st.shared.f32 [%rd43+2560], %f88;\n"
" mov.s32 %r63, %r56;\n"
" @!%p9 bra $Lt_0_24834;\n"
"$Lt_0_25346:\n"
" setp.ge.u32 %p13, %r13, %r63;\n"
" @%p13 bra $Lt_0_25602;\n"
" add.u32 %r64, %r2, %r63;\n"
" cvt.u64.u32 %rd47, %r64;\n"
" mul.wide.u32 %rd48, %r64, 4;\n"
" add.u64 %rd49, %rd40, %rd48;\n"
" ld.shared.f32 %f89, [%rd49+0];\n"
" add.ftz.f32 %f79, %f89, %f79;\n"
" st.shared.f32 [%rd43+0], %f79;\n"
" ld.shared.f32 %f90, [%rd49+512];\n"
" add.ftz.f32 %f80, %f90, %f80;\n"
" st.shared.f32 [%rd43+512], %f80;\n"
" ld.shared.f32 %f91, [%rd49+1024];\n"
" add.ftz.f32 %f81, %f91, %f81;\n"
" st.shared.f32 [%rd43+1024], %f81;\n"
" ld.shared.f32 %f92, [%rd49+1536];\n"
" add.ftz.f32 %f82, %f92, %f82;\n"
" st.shared.f32 [%rd43+1536], %f82;\n"
" ld.shared.f32 %f93, [%rd49+2048];\n"
" add.ftz.f32 %f87, %f93, %f87;\n"
" st.shared.f32 [%rd43+2048], %f87;\n"
" ld.shared.f32 %f94, [%rd49+2560];\n"
" add.ftz.f32 %f88, %f94, %f88;\n"
" st.shared.f32 [%rd43+2560], %f88;\n"
"$Lt_0_25602:\n"
" shr.u32 %r63, %r63, 1;\n"
" mov.u32 %r65, 0;\n"
" setp.ne.u32 %p14, %r63, %r65;\n"
" @%p14 bra $Lt_0_25346;\n"
"$Lt_0_24834:\n"
" mov.f32 %f6, %f79;\n"
" mov.f32 %f8, %f80;\n"
" mov.f32 %f10, %f81;\n"
" mov.f32 %f12, %f82;\n"
" mov.f32 %f14, %f87;\n"
" mov.f32 %f16, %f88;\n"
"$Lt_0_24322:\n"
"$Lt_0_22274:\n"
" mov.u32 %r66, 0;\n"
" setp.ne.s32 %p15, %r13, %r66;\n"
" @%p15 bra $Lt_0_26370;\n"
" ld.param.u64 %rd50, [__cudaparm_kernel_pair___val_paramengv];\n"
" add.u64 %rd51, %rd50, %rd5;\n"
" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n"
" mov.u32 %r68, 0;\n"
" setp.le.s32 %p16, %r67, %r68;\n"
" @%p16 bra $Lt_0_26882;\n"
" st.global.f32 [%rd51+0], %f28;\n"
" cvt.s64.s32 %rd52, %r9;\n"
" mul.wide.s32 %rd53, %r9, 4;\n"
" add.u64 %rd51, %rd51, %rd53;\n"
"$Lt_0_26882:\n"
" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n"
" mov.u32 %r70, 0;\n"
" setp.le.s32 %p17, %r69, %r70;\n"
" @%p17 bra $Lt_0_27394;\n"
" mov.f32 %f95, %f6;\n"
" st.global.f32 [%rd51+0], %f95;\n"
" cvt.s64.s32 %rd54, %r9;\n"
" mul.wide.s32 %rd55, %r9, 4;\n"
" add.u64 %rd56, %rd55, %rd51;\n"
" mov.f32 %f96, %f8;\n"
" st.global.f32 [%rd56+0], %f96;\n"
" add.u64 %rd57, %rd55, %rd56;\n"
" mov.f32 %f97, %f10;\n"
" st.global.f32 [%rd57+0], %f97;\n"
" add.u64 %rd58, %rd55, %rd57;\n"
" mov.f32 %f98, %f12;\n"
" st.global.f32 [%rd58+0], %f98;\n"
" add.u64 %rd51, %rd55, %rd58;\n"
" mov.f32 %f99, %f14;\n"
" st.global.f32 [%rd51+0], %f99;\n"
" mov.f32 %f100, %f16;\n"
" add.u64 %rd59, %rd55, %rd51;\n"
" st.global.f32 [%rd59+0], %f100;\n"
"$Lt_0_27394:\n"
" ld.param.u64 %rd60, [__cudaparm_kernel_pair_ans];\n"
" mul.lo.u64 %rd61, %rd4, 16;\n"
" add.u64 %rd62, %rd60, %rd61;\n"
" mov.f32 %f101, %f102;\n"
" st.global.v4.f32 [%rd62+0], {%f27,%f26,%f25,%f101};\n"
"$Lt_0_26370:\n"
"$Lt_0_18690:\n"
" .loc 16 103 0\n"
" exit;\n"
"$LDWend_kernel_pair:\n"
" }\n"
/*
 * kernel_pair_fast -- PTX entry point, stored as adjacent C string fragments.
 *
 * Each thread accumulates a 3-component force, an energy term and six virial
 * terms over a strided slice of one atom's neighbor list, then the
 * t_per_atom threads that share an atom combine their partial sums through
 * shared memory and the thread with offset 0 writes the results out.
 *
 * Parameters (all read with ld.param below):
 *   x_          - declared but never loaded in this kernel; positions are
 *                 fetched through the pos_tex texture reference instead.
 *   mor1_in     - global table of 16-byte (v4.f32) entries, staged to shared.
 *   mor2_in     - global table of 8-byte (v2.f32) entries, staged to shared
 *                 only when eflag > 0.
 *                 NOTE(review): the mor1/mor2 names suggest Morse-potential
 *                 coefficients -- confirm against the originating .cu file.
 *   sp_lj_in    - 4 floats, staged to shared; indexed by the top 2 bits of
 *                 each neighbor entry.
 *   dev_nbor    - neighbor bookkeeping array (atom index, count, list).
 *   dev_packed  - packed neighbor list; compared against dev_nbor to pick
 *                 the list layout.
 *   ans         - output, one v4.f32 (16 bytes) per atom.
 *   engv        - output energy/virial array, rows inum floats apart.
 *   eflag/vflag - energy / virial are computed and stored only when > 0.
 *   inum        - number of atoms handled; also the engv row length.
 *   nbor_pitch  - row pitch (in ints) of dev_nbor.
 *   t_per_atom  - threads cooperating on one atom.
 */
"	.entry kernel_pair_fast (\n"
"		.param .u64 __cudaparm_kernel_pair_fast_x_,\n"
"		.param .u64 __cudaparm_kernel_pair_fast_mor1_in,\n"
"		.param .u64 __cudaparm_kernel_pair_fast_mor2_in,\n"
"		.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
"		.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
"		.param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
"		.param .u64 __cudaparm_kernel_pair_fast_ans,\n"
"		.param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n"
"		.param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
"		.param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
"		.param .s32 __cudaparm_kernel_pair_fast_inum,\n"
"		.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
"		.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
"	{\n"
"	.reg .u32 %r<74>;\n"
"	.reg .u64 %rd<77>;\n"
"	.reg .f32 %f<110>;\n"
"	.reg .pred %p<22>;\n"
/* Shared staging buffers: sp_lj = 4 floats, mor1 = 121 x 16 bytes,
   mor2 = 121 x 8 bytes, red_acc = 6 rows x 512 bytes of reduction scratch. */
"	.shared .align 4 .b8 __cuda___cuda_local_var_32618_33_non_const_sp_lj3268[16];\n"
"	.shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_mor13296[1936];\n"
"	.shared .align 8 .b8 __cuda___cuda_local_var_32617_34_non_const_mor25232[968];\n"
"	.shared .align 4 .b8 __cuda___cuda_local_var_32688_55_non_const_red_acc6200[3072];\n"
"	.loc	16	111	0\n"
"$LDWbegin_kernel_pair_fast:\n"
/* %r1 = tid.x.  Threads with tid <= 3 copy sp_lj_in[tid] into shared sp_lj. */
"	cvt.s32.u32 	%r1, %tid.x;\n"
"	mov.u32 	%r2, 3;\n"
"	setp.gt.s32 	%p1, %r1, %r2;\n"
"	@%p1 bra 	$Lt_1_20994;\n"
"	.loc	16	119	0\n"
"	mov.u64 	%rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;\n"
"	cvt.s64.s32 	%rd2, %r1;\n"
"	mul.wide.s32 	%rd3, %r1, 4;\n"
"	ld.param.u64 	%rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
"	add.u64 	%rd5, %rd4, %rd3;\n"
"	ld.global.f32 	%f1, [%rd5+0];\n"
"	add.u64 	%rd6, %rd3, %rd1;\n"
"	st.shared.f32 	[%rd6+0], %f1;\n"
"$Lt_1_20994:\n"
/* Threads with tid <= 120 copy one mor1 entry (and, when eflag > 0, one mor2
   entry) from global to shared memory. */
"	mov.u64 	%rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;\n"
"	mov.u32 	%r3, 120;\n"
"	setp.gt.s32 	%p2, %r1, %r3;\n"
"	@%p2 bra 	$Lt_1_21506;\n"
"	.loc	16	121	0\n"
"	mov.u64 	%rd7, __cuda___cuda_local_var_32616_34_non_const_mor13296;\n"
"	cvt.s64.s32 	%rd8, %r1;\n"
"	mul.wide.s32 	%rd9, %r1, 16;\n"
"	ld.param.u64 	%rd10, [__cudaparm_kernel_pair_fast_mor1_in];\n"
"	add.u64 	%rd11, %rd10, %rd9;\n"
"	add.u64 	%rd12, %rd9, %rd7;\n"
"	ld.global.v4.f32 	{%f2,%f3,%f4,%f5}, [%rd11+0];\n"
"	st.shared.v4.f32 	[%rd12+0], {%f2,%f3,%f4,%f5};\n"
"	ld.param.s32 	%r4, [__cudaparm_kernel_pair_fast_eflag];\n"
"	mov.u32 	%r5, 0;\n"
"	setp.le.s32 	%p3, %r4, %r5;\n"
"	@%p3 bra 	$Lt_1_22018;\n"
"	.loc	16	123	0\n"
"	mov.u64 	%rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232;\n"
"	mul.lo.u64 	%rd14, %rd8, 8;\n"
"	ld.param.u64 	%rd15, [__cudaparm_kernel_pair_fast_mor2_in];\n"
"	add.u64 	%rd16, %rd15, %rd14;\n"
"	add.u64 	%rd17, %rd14, %rd13;\n"
"	ld.global.v2.f32 	{%f6,%f7}, [%rd16+0];\n"
"	st.shared.v2.f32 	[%rd17+0], {%f6,%f7};\n"
"$Lt_1_22018:\n"
"	mov.u64 	%rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232;\n"
"$Lt_1_21506:\n"
"	mov.u64 	%rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232;\n"
"	mov.u64 	%rd7, __cuda___cuda_local_var_32616_34_non_const_mor13296;\n"
"	.loc	16	131	0\n"
/* Zero the six virial accumulators (%f9, %f11, %f13, %f15, %f17, %f19). */
"	mov.f32 	%f8, 0f00000000;     	\n"
"	mov.f32 	%f9, %f8;\n"
"	mov.f32 	%f10, 0f00000000;    	\n"
"	mov.f32 	%f11, %f10;\n"
"	mov.f32 	%f12, 0f00000000;    	\n"
"	mov.f32 	%f13, %f12;\n"
"	mov.f32 	%f14, 0f00000000;    	\n"
"	mov.f32 	%f15, %f14;\n"
"	mov.f32 	%f16, 0f00000000;    	\n"
"	mov.f32 	%f17, %f16;\n"
"	mov.f32 	%f18, 0f00000000;    	\n"
"	mov.f32 	%f19, %f18;\n"
"	.loc	16	133	0\n"
/* Barrier so the shared tables written above are visible to every thread in
   the block before they are read. */
"	bar.sync 	0;\n"
/* %r12 = ctaid.x * (ntid.x / t_per_atom) + tid.x / t_per_atom : the atom slot
   this thread works on.  Threads whose slot is >= inum exit immediately. */
"	ld.param.s32 	%r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
"	div.s32 	%r7, %r1, %r6;\n"
"	cvt.s32.u32 	%r8, %ntid.x;\n"
"	div.s32 	%r9, %r8, %r6;\n"
"	cvt.s32.u32 	%r10, %ctaid.x;\n"
"	mul.lo.s32 	%r11, %r10, %r9;\n"
"	add.s32 	%r12, %r7, %r11;\n"
"	ld.param.s32 	%r13, [__cudaparm_kernel_pair_fast_inum];\n"
"	setp.ge.s32 	%p4, %r12, %r13;\n"
"	@%p4 bra 	$Lt_1_30210;\n"
"	.loc	16	138	0\n"
/* %rd23 = &dev_nbor[slot]; %rd24 = one pitch row further, where the neighbor
   count %r15 is read.  %r17 = (t_per_atom - 1) & tid is this thread's offset
   within its atom group.
   NOTE(review): the mask equals tid % t_per_atom only if t_per_atom is a
   power of two -- confirm the host enforces that. */
"	ld.param.s32 	%r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
"	cvt.s64.s32 	%rd18, %r14;\n"
"	mul.wide.s32 	%rd19, %r14, 4;\n"
"	cvt.s64.s32 	%rd20, %r12;\n"
"	mul.wide.s32 	%rd21, %r12, 4;\n"
"	ld.param.u64 	%rd22, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
"	add.u64 	%rd23, %rd21, %rd22;\n"
"	add.u64 	%rd24, %rd19, %rd23;\n"
"	ld.global.s32 	%r15, [%rd24+0];\n"
"	sub.s32 	%r16, %r6, 1;\n"
"	and.b32 	%r17, %r16, %r1;\n"
"	cvt.s64.s32 	%rd25, %r17;\n"
"	mul.wide.s32 	%rd26, %r17, 4;\n"
"	ld.param.u64 	%rd27, [__cudaparm_kernel_pair_fast_dev_packed];\n"
"	setp.ne.u64 	%p5, %rd27, %rd22;\n"
"	@%p5 bra 	$Lt_1_23298;\n"
/* dev_packed == dev_nbor: the list is read in place from dev_nbor.
   Stride %r20 = nbor_pitch * t_per_atom elements; %rd36 = first entry for
   this thread, %rd35 = end-of-list address. */
"	cvt.s32.s64 	%r18, %rd18;\n"
"	mul.lo.s32 	%r19, %r18, %r6;\n"
"	mov.s32 	%r20, %r19;\n"
"	mul.lo.s32 	%r21, %r16, %r12;\n"
"	add.s32 	%r22, %r18, %r21;\n"
"	cvt.s64.s32 	%rd28, %r22;\n"
"	mul.wide.s32 	%rd29, %r22, 4;\n"
"	add.u64 	%rd30, %rd24, %rd29;\n"
"	and.b32 	%r23, %r16, %r15;\n"
"	cvt.s64.s32 	%rd31, %r23;\n"
"	div.s32 	%r24, %r15, %r6;\n"
"	mul.lo.s32 	%r25, %r19, %r24;\n"
"	cvt.s64.s32 	%rd32, %r25;\n"
"	add.u64 	%rd33, %rd31, %rd32;\n"
"	mul.lo.u64 	%rd34, %rd33, 4;\n"
"	add.u64 	%rd35, %rd30, %rd34;\n"
"	add.u64 	%rd36, %rd26, %rd30;\n"
"	bra.uni 	$Lt_1_23042;\n"
"$Lt_1_23298:\n"
/* dev_packed != dev_nbor: a start index is read from the next pitch row of
   dev_nbor and the list is walked inside dev_packed with stride t_per_atom. */
"	add.u64 	%rd37, %rd19, %rd24;\n"
"	ld.global.s32 	%r26, [%rd37+0];\n"
"	cvt.s64.s32 	%rd38, %r26;\n"
"	mul.wide.s32 	%rd39, %r26, 4;\n"
"	add.u64 	%rd40, %rd27, %rd39;\n"
"	cvt.s64.s32 	%rd41, %r15;\n"
"	mul.wide.s32 	%rd42, %r15, 4;\n"
"	add.u64 	%rd35, %rd40, %rd42;\n"
"	mov.s32 	%r20, %r6;\n"
"	add.u64 	%rd36, %rd26, %rd40;\n"
"$Lt_1_23042:\n"
"	.loc	16	141	0\n"
/* Read the atom index stored at dev_nbor[slot] and fetch its 4-float record
   from pos_tex into %f24..%f27. */
"	ld.global.s32 	%r27, [%rd23+0];\n"
"	mov.u32 	%r28, %r27;\n"
"	mov.s32 	%r29, 0;\n"
"	mov.u32 	%r30, %r29;\n"
"	mov.s32 	%r31, 0;\n"
"	mov.u32 	%r32, %r31;\n"
"	mov.s32 	%r33, 0;\n"
"	mov.u32 	%r34, %r33;\n"
"	tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[pos_tex,{%r28,%r30,%r32,%r34}];\n"
"	mov.f32 	%f24, %f20;\n"
"	mov.f32 	%f25, %f21;\n"
"	mov.f32 	%f26, %f22;\n"
"	mov.f32 	%f27, %f23;\n"
/* Empty slice for this thread: jump to the block that zeroes the force and
   energy accumulators. */
"	setp.ge.u64 	%p6, %rd36, %rd35;\n"
"	@%p6 bra 	$Lt_1_31746;\n"
/* %f28 = float(int(4th texel component) * 11): row base into the shared
   tables (11 entries per row; 11 x 11 = the 121 entries staged above).
   %f29/%f30/%f31 = force accumulators, %f32 = energy accumulator. */
"	cvt.rzi.ftz.s32.f32 	%r35, %f27;\n"
"	cvt.s64.s32 	%rd43, %r20;\n"
"	mul.lo.s32 	%r36, %r35, 11;\n"
"	cvt.rn.f32.s32 	%f28, %r36;\n"
"	mov.f32 	%f29, 0f00000000;    	\n"
"	mov.f32 	%f30, 0f00000000;    	\n"
"	mov.f32 	%f31, 0f00000000;    	\n"
"	mov.f32 	%f32, 0f00000000;    	\n"
"$Lt_1_24066:\n"
"	.loc	16	148	0\n"
/* Neighbor loop body.  %r37 = raw list entry; its top two bits select a
   factor %f33 from shared sp_lj, its low 30 bits are the texture index. */
"	ld.global.s32 	%r37, [%rd36+0];\n"
"	.loc	16	149	0\n"
"	shr.s32 	%r38, %r37, 30;\n"
"	and.b32 	%r39, %r38, 3;\n"
"	cvt.s64.s32 	%rd44, %r39;\n"
"	mul.wide.s32 	%rd45, %r39, 4;\n"
"	add.u64 	%rd46, %rd1, %rd45;\n"
"	ld.shared.f32 	%f33, [%rd46+0];\n"
"	.loc	16	152	0\n"
"	and.b32 	%r40, %r37, 1073741823;\n"
"	mov.u32 	%r41, %r40;\n"
"	mov.s32 	%r42, 0;\n"
"	mov.u32 	%r43, %r42;\n"
"	mov.s32 	%r44, 0;\n"
"	mov.u32 	%r45, %r44;\n"
"	mov.s32 	%r46, 0;\n"
"	mov.u32 	%r47, %r46;\n"
"	tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r41,%r43,%r45,%r47}];\n"
"	mov.f32 	%f38, %f34;\n"
"	mov.f32 	%f39, %f35;\n"
"	mov.f32 	%f40, %f36;\n"
"	mov.f32 	%f41, %f37;\n"
/* %f43/%f42/%f44 = component differences (this atom minus neighbor);
   %f47 = their squared length. */
"	sub.ftz.f32 	%f42, %f25, %f39;\n"
"	sub.ftz.f32 	%f43, %f24, %f38;\n"
"	sub.ftz.f32 	%f44, %f26, %f40;\n"
"	mul.ftz.f32 	%f45, %f42, %f42;\n"
"	fma.rn.ftz.f32 	%f46, %f43, %f43, %f45;\n"
"	fma.rn.ftz.f32 	%f47, %f44, %f44, %f46;\n"
/* Table index %r48 = int(row base + neighbor's 4th component).  The first
   float of the mor1 entry is compared with the squared distance; the pair is
   skipped unless that value is greater. */
"	add.ftz.f32 	%f48, %f28, %f41;\n"
"	cvt.rzi.ftz.s32.f32 	%r48, %f48;\n"
"	cvt.s64.s32 	%rd47, %r48;\n"
"	mul.wide.s32 	%rd48, %r48, 16;\n"
"	add.u64 	%rd49, %rd7, %rd48;\n"
"	ld.shared.f32 	%f49, [%rd49+0];\n"
"	setp.gt.ftz.f32 	%p7, %f49, %f47;\n"
"	@!%p7 bra 	$Lt_1_25346;\n"
"	.loc	16	163	0\n"
/* With r = sqrt(%f47) and (y, z, w) the remaining mor1 fields:
     %f59 = exp(-w * (r - z))   (ex2 of the argument scaled by 0f3fb8aa3b,
                                 i.e. log2(e))
     %f64 = sp_lj factor * y * (%f59^2 - %f59) / r
   %f64 scales the difference vector into the force accumulators. */
"	sqrt.approx.ftz.f32 	%f50, %f47;\n"
"	ld.shared.v4.f32 	{_,%f51,%f52,%f53}, [%rd49+0];\n"
"	sub.ftz.f32 	%f54, %f50, %f52;\n"
"	.loc	16	164	0\n"
"	mul.ftz.f32 	%f55, %f53, %f54;\n"
"	neg.ftz.f32 	%f56, %f55;\n"
"	.loc	16	166	0\n"
"	mov.f32 	%f57, 0f3fb8aa3b;    	\n"
"	mul.ftz.f32 	%f58, %f56, %f57;\n"
"	ex2.approx.ftz.f32 	%f59, %f58;\n"
"	mul.ftz.f32 	%f60, %f59, %f59;\n"
"	sub.ftz.f32 	%f61, %f60, %f59;\n"
"	mul.ftz.f32 	%f62, %f51, %f61;\n"
"	.loc	16	168	0\n"
"	div.approx.ftz.f32 	%f63, %f62, %f50;\n"
"	mul.ftz.f32 	%f64, %f63, %f33;\n"
"	fma.rn.ftz.f32 	%f31, %f43, %f64, %f31;\n"
"	.loc	16	169	0\n"
"	fma.rn.ftz.f32 	%f30, %f42, %f64, %f30;\n"
"	.loc	16	170	0\n"
"	fma.rn.ftz.f32 	%f29, %f44, %f64, %f29;\n"
"	ld.param.s32 	%r49, [__cudaparm_kernel_pair_fast_eflag];\n"
"	mov.u32 	%r50, 0;\n"
"	setp.le.s32 	%p8, %r49, %r50;\n"
"	@%p8 bra 	$Lt_1_24834;\n"
"	.loc	16	173	0\n"
/* Energy (eflag > 0): with (a, b) the mor2 entry,
   %f32 += sp_lj factor * (a * (%f59^2 - 2 * %f59) - b). */
"	mul.lo.u64 	%rd50, %rd47, 8;\n"
"	add.u64 	%rd51, %rd13, %rd50;\n"
"	ld.shared.v2.f32 	{%f65,%f66}, [%rd51+0];\n"
"	sub.ftz.f32 	%f67, %f61, %f59;\n"
"	mul.ftz.f32 	%f68, %f65, %f67;\n"
"	sub.ftz.f32 	%f69, %f68, %f66;\n"
"	.loc	16	174	0\n"
"	fma.rn.ftz.f32 	%f32, %f33, %f69, %f32;\n"
"$Lt_1_24834:\n"
"	ld.param.s32 	%r51, [__cudaparm_kernel_pair_fast_vflag];\n"
"	mov.u32 	%r52, 0;\n"
"	setp.le.s32 	%p9, %r51, %r52;\n"
"	@%p9 bra 	$Lt_1_25346;\n"
"	.loc	16	177	0\n"
/* Virial (vflag > 0): each of the six accumulators gains %f64 times a
   product of two difference components, in this order:
   %f43*%f43, %f42*%f42, %f44*%f44, %f42*%f43, %f43*%f44, %f42*%f44. */
"	mov.f32 	%f70, %f9;\n"
"	mul.ftz.f32 	%f71, %f43, %f43;\n"
"	fma.rn.ftz.f32 	%f72, %f64, %f71, %f70;\n"
"	mov.f32 	%f9, %f72;\n"
"	.loc	16	178	0\n"
"	mov.f32 	%f73, %f11;\n"
"	fma.rn.ftz.f32 	%f74, %f64, %f45, %f73;\n"
"	mov.f32 	%f11, %f74;\n"
"	.loc	16	179	0\n"
"	mov.f32 	%f75, %f13;\n"
"	mul.ftz.f32 	%f76, %f44, %f44;\n"
"	fma.rn.ftz.f32 	%f77, %f64, %f76, %f75;\n"
"	mov.f32 	%f13, %f77;\n"
"	.loc	16	180	0\n"
"	mov.f32 	%f78, %f15;\n"
"	mul.ftz.f32 	%f79, %f42, %f43;\n"
"	fma.rn.ftz.f32 	%f80, %f64, %f79, %f78;\n"
"	mov.f32 	%f15, %f80;\n"
"	.loc	16	181	0\n"
"	mov.f32 	%f81, %f17;\n"
"	mul.ftz.f32 	%f82, %f43, %f44;\n"
"	fma.rn.ftz.f32 	%f83, %f64, %f82, %f81;\n"
"	mov.f32 	%f17, %f83;\n"
"	.loc	16	182	0\n"
"	mul.ftz.f32 	%f84, %f42, %f44;\n"
"	fma.rn.ftz.f32 	%f18, %f64, %f84, %f18;\n"
"	mov.f32 	%f19, %f18;\n"
"$Lt_1_25346:\n"
"$Lt_1_24322:\n"
"	.loc	16	146	0\n"
/* Advance the list pointer by the stride chosen earlier (%rd43 elements of
   4 bytes) and loop while it is still below the end address. */
"	mul.lo.u64 	%rd52, %rd43, 4;\n"
"	add.u64 	%rd36, %rd36, %rd52;\n"
"	setp.lt.u64 	%p10, %rd36, %rd35;\n"
"	@%p10 bra 	$Lt_1_24066;\n"
"	bra.uni 	$Lt_1_23554;\n"
"$Lt_1_31746:\n"
"	mov.f32 	%f29, 0f00000000;    	\n"
"	mov.f32 	%f30, 0f00000000;    	\n"
"	mov.f32 	%f31, 0f00000000;    	\n"
"	mov.f32 	%f32, 0f00000000;    	\n"
"$Lt_1_23554:\n"
/* The cross-thread reduction is only needed when t_per_atom > 1. */
"	mov.u32 	%r53, 1;\n"
"	setp.le.s32 	%p11, %r6, %r53;\n"
"	@%p11 bra 	$Lt_1_28162;\n"
"	.loc	16	187	0\n"
/* Publish this thread's three force sums and energy into red_acc: one float
   per thread at byte offset tid*4, successive quantities 512 bytes apart. */
"	mov.u64 	%rd53, __cuda___cuda_local_var_32688_55_non_const_red_acc6200;\n"
"	cvt.s64.s32 	%rd54, %r1;\n"
"	mul.wide.s32 	%rd55, %r1, 4;\n"
"	add.u64 	%rd56, %rd53, %rd55;\n"
"	mov.f32 	%f85, %f31;\n"
"	st.shared.f32 	[%rd56+0], %f85;\n"
"	mov.f32 	%f86, %f30;\n"
"	st.shared.f32 	[%rd56+512], %f86;\n"
"	mov.f32 	%f87, %f29;\n"
"	st.shared.f32 	[%rd56+1024], %f87;\n"
"	mov.f32 	%f88, %f32;\n"
"	st.shared.f32 	[%rd56+1536], %f88;\n"
/* %r58 = t_per_atom / 2 (signed divide, rounding toward zero): the initial
   reduction step. */
"	shr.s32 	%r54, %r6, 31;\n"
"	mov.s32 	%r55, 1;\n"
"	and.b32 	%r56, %r54, %r55;\n"
"	add.s32 	%r57, %r56, %r6;\n"
"	shr.s32 	%r58, %r57, 1;\n"
"	mov.s32 	%r59, %r58;\n"
"	mov.u32 	%r60, 0;\n"
"	setp.ne.u32 	%p12, %r58, %r60;\n"
"	@!%p12 bra 	$Lt_1_26626;\n"
"$Lt_1_27138:\n"
/* Halving reduction: while step s != 0, threads with offset < s add the
   values published by thread tid+s and re-publish their own sums.
   NOTE(review): there is no bar.sync inside this loop; it appears to rely on
   the cooperating threads running in lockstep -- confirm. */
"	setp.ge.u32 	%p13, %r17, %r59;\n"
"	@%p13 bra 	$Lt_1_27394;\n"
"	add.u32 	%r61, %r1, %r59;\n"
"	cvt.u64.u32 	%rd57, %r61;\n"
"	mul.wide.u32 	%rd58, %r61, 4;\n"
"	add.u64 	%rd59, %rd53, %rd58;\n"
"	ld.shared.f32 	%f89, [%rd59+0];\n"
"	add.ftz.f32 	%f85, %f89, %f85;\n"
"	st.shared.f32 	[%rd56+0], %f85;\n"
"	ld.shared.f32 	%f90, [%rd59+512];\n"
"	add.ftz.f32 	%f86, %f90, %f86;\n"
"	st.shared.f32 	[%rd56+512], %f86;\n"
"	ld.shared.f32 	%f91, [%rd59+1024];\n"
"	add.ftz.f32 	%f87, %f91, %f87;\n"
"	st.shared.f32 	[%rd56+1024], %f87;\n"
"	ld.shared.f32 	%f92, [%rd59+1536];\n"
"	add.ftz.f32 	%f88, %f92, %f88;\n"
"	st.shared.f32 	[%rd56+1536], %f88;\n"
"$Lt_1_27394:\n"
"	shr.u32 	%r59, %r59, 1;\n"
"	mov.u32 	%r62, 0;\n"
"	setp.ne.u32 	%p14, %r59, %r62;\n"
"	@%p14 bra 	$Lt_1_27138;\n"
"$Lt_1_26626:\n"
/* Copy the reduced force/energy sums back into their accumulators. */
"	mov.f32 	%f31, %f85;\n"
"	mov.f32 	%f30, %f86;\n"
"	mov.f32 	%f29, %f87;\n"
"	mov.f32 	%f32, %f88;\n"
"	ld.param.s32 	%r63, [__cudaparm_kernel_pair_fast_vflag];\n"
"	mov.u32 	%r64, 0;\n"
"	setp.le.s32 	%p15, %r63, %r64;\n"
"	@%p15 bra 	$Lt_1_28162;\n"
/* vflag > 0: repeat the same publish + halving reduction for the six virial
   sums, using six 512-byte rows of red_acc (offsets 0..2560). */
"	mov.f32 	%f85, %f9;\n"
"	st.shared.f32 	[%rd56+0], %f85;\n"
"	mov.f32 	%f86, %f11;\n"
"	st.shared.f32 	[%rd56+512], %f86;\n"
"	mov.f32 	%f87, %f13;\n"
"	st.shared.f32 	[%rd56+1024], %f87;\n"
"	mov.f32 	%f88, %f15;\n"
"	st.shared.f32 	[%rd56+1536], %f88;\n"
"	mov.f32 	%f93, %f17;\n"
"	st.shared.f32 	[%rd56+2048], %f93;\n"
"	mov.f32 	%f94, %f18;\n"
"	st.shared.f32 	[%rd56+2560], %f94;\n"
"	mov.s32 	%r65, %r58;\n"
"	@!%p12 bra 	$Lt_1_28674;\n"
"$Lt_1_29186:\n"
"	setp.ge.u32 	%p16, %r17, %r65;\n"
"	@%p16 bra 	$Lt_1_29442;\n"
"	add.u32 	%r66, %r1, %r65;\n"
"	cvt.u64.u32 	%rd60, %r66;\n"
"	mul.wide.u32 	%rd61, %r66, 4;\n"
"	add.u64 	%rd62, %rd53, %rd61;\n"
"	ld.shared.f32 	%f95, [%rd62+0];\n"
"	add.ftz.f32 	%f85, %f95, %f85;\n"
"	st.shared.f32 	[%rd56+0], %f85;\n"
"	ld.shared.f32 	%f96, [%rd62+512];\n"
"	add.ftz.f32 	%f86, %f96, %f86;\n"
"	st.shared.f32 	[%rd56+512], %f86;\n"
"	ld.shared.f32 	%f97, [%rd62+1024];\n"
"	add.ftz.f32 	%f87, %f97, %f87;\n"
"	st.shared.f32 	[%rd56+1024], %f87;\n"
"	ld.shared.f32 	%f98, [%rd62+1536];\n"
"	add.ftz.f32 	%f88, %f98, %f88;\n"
"	st.shared.f32 	[%rd56+1536], %f88;\n"
"	ld.shared.f32 	%f99, [%rd62+2048];\n"
"	add.ftz.f32 	%f93, %f99, %f93;\n"
"	st.shared.f32 	[%rd56+2048], %f93;\n"
"	ld.shared.f32 	%f100, [%rd62+2560];\n"
"	add.ftz.f32 	%f94, %f100, %f94;\n"
"	st.shared.f32 	[%rd56+2560], %f94;\n"
"$Lt_1_29442:\n"
"	shr.u32 	%r65, %r65, 1;\n"
"	mov.u32 	%r67, 0;\n"
"	setp.ne.u32 	%p17, %r65, %r67;\n"
"	@%p17 bra 	$Lt_1_29186;\n"
"$Lt_1_28674:\n"
"	mov.f32 	%f9, %f85;\n"
"	mov.f32 	%f11, %f86;\n"
"	mov.f32 	%f13, %f87;\n"
"	mov.f32 	%f15, %f88;\n"
"	mov.f32 	%f17, %f93;\n"
"	mov.f32 	%f19, %f94;\n"
"$Lt_1_28162:\n"
"$Lt_1_26114:\n"
/* Only the thread with offset 0 in each atom group writes results. */
"	mov.u32 	%r68, 0;\n"
"	setp.ne.s32 	%p18, %r17, %r68;\n"
"	@%p18 bra 	$Lt_1_30210;\n"
/* %rd64 = &engv[slot].  When eflag > 0 the energy is stored there and the
   pointer moves on by inum floats. */
"	ld.param.u64 	%rd63, [__cudaparm_kernel_pair_fast___val_paramengv];\n"
"	add.u64 	%rd64, %rd63, %rd21;\n"
"	ld.param.s32 	%r69, [__cudaparm_kernel_pair_fast_eflag];\n"
"	mov.u32 	%r70, 0;\n"
"	setp.le.s32 	%p19, %r69, %r70;\n"
"	@%p19 bra 	$Lt_1_30722;\n"
"	st.global.f32 	[%rd64+0], %f32;\n"
"	cvt.s64.s32 	%rd65, %r13;\n"
"	mul.wide.s32 	%rd66, %r13, 4;\n"
"	add.u64 	%rd64, %rd64, %rd66;\n"
"$Lt_1_30722:\n"
"	ld.param.s32 	%r71, [__cudaparm_kernel_pair_fast_vflag];\n"
"	mov.u32 	%r72, 0;\n"
"	setp.le.s32 	%p20, %r71, %r72;\n"
"	@%p20 bra 	$Lt_1_31234;\n"
/* vflag > 0: store the six virial sums, each inum floats after the last. */
"	mov.f32 	%f101, %f9;\n"
"	st.global.f32 	[%rd64+0], %f101;\n"
"	cvt.s64.s32 	%rd67, %r13;\n"
"	mul.wide.s32 	%rd68, %r13, 4;\n"
"	add.u64 	%rd69, %rd68, %rd64;\n"
"	mov.f32 	%f102, %f11;\n"
"	st.global.f32 	[%rd69+0], %f102;\n"
"	add.u64 	%rd70, %rd68, %rd69;\n"
"	mov.f32 	%f103, %f13;\n"
"	st.global.f32 	[%rd70+0], %f103;\n"
"	add.u64 	%rd71, %rd68, %rd70;\n"
"	mov.f32 	%f104, %f15;\n"
"	st.global.f32 	[%rd71+0], %f104;\n"
"	add.u64 	%rd64, %rd68, %rd71;\n"
"	mov.f32 	%f105, %f17;\n"
"	st.global.f32 	[%rd64+0], %f105;\n"
"	mov.f32 	%f106, %f19;\n"
"	add.u64 	%rd72, %rd68, %rd64;\n"
"	st.global.f32 	[%rd72+0], %f106;\n"
"$Lt_1_31234:\n"
/* Store the force as one 16-byte vector at ans[slot].
   NOTE(review): the 4th lane comes from %f108, which no instruction in this
   kernel writes, so that component is undefined. */
"	ld.param.u64 	%rd73, [__cudaparm_kernel_pair_fast_ans];\n"
"	mul.lo.u64 	%rd74, %rd20, 16;\n"
"	add.u64 	%rd75, %rd73, %rd74;\n"
"	mov.f32 	%f107, %f108;\n"
"	st.global.v4.f32 	[%rd75+0], {%f31,%f30,%f29,%f107};\n"
"$Lt_1_30210:\n"
"$Lt_1_22530:\n"
"	.loc	16	190	0\n"
"	exit;\n"
"$LDWend_kernel_pair_fast:\n"
"	}\n"
;

View File

@ -1,132 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009a34_00000000-9_lal_neighbor_cpu.cpp3.i (/home/sjplimp/ccBI#.V8lyjI)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009a34_00000000-8_lal_neighbor_cpu.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lal_neighbor_cpu.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
	// kernel_unpack: copies one atom's contiguous neighbor list out of dev_ij
	// into a strided region of dev_nbor.
	//   dev_nbor   - int array; read for the per-atom count and start index,
	//                written with the unpacked neighbor entries.
	//   dev_ij     - int array holding the contiguous source lists.
	//   inum       - number of atoms; also the row length used for dev_nbor.
	//   t_per_atom - threads assigned to each atom.
	.entry kernel_unpack (
		.param .u64 __cudaparm_kernel_unpack_dev_nbor,
		.param .u64 __cudaparm_kernel_unpack_dev_ij,
		.param .s32 __cudaparm_kernel_unpack_inum,
		.param .s32 __cudaparm_kernel_unpack_t_per_atom)
	{
	.reg .u32 %r<19>;
	.reg .u64 %rd<33>;
	.reg .pred %p<5>;
	.loc	16	21	0
$LDWbegin_kernel_unpack:
	// %r8 = ctaid.x * (ntid.x / t_per_atom) + tid.x / t_per_atom : the atom
	// slot for this thread.  Threads whose slot is >= inum exit.
	ld.param.s32 	%r1, [__cudaparm_kernel_unpack_t_per_atom];
	cvt.s32.u32 	%r2, %tid.x;
	div.s32 	%r3, %r2, %r1;
	cvt.s32.u32 	%r4, %ntid.x;
	div.s32 	%r5, %r4, %r1;
	cvt.s32.u32 	%r6, %ctaid.x;
	mul.lo.s32 	%r7, %r6, %r5;
	add.s32 	%r8, %r3, %r7;
	ld.param.s32 	%r9, [__cudaparm_kernel_unpack_inum];
	setp.ge.s32 	%p1, %r8, %r9;
	@%p1 bra 	$Lt_0_2050;
	.loc	16	30	0
	// %rd6 = &dev_nbor[slot + inum]   (neighbor count, read below into %r11)
	// %rd8 = &dev_nbor[slot + 2*inum] (start index into dev_ij, %r10)
	// %rd12 = dev_ij + start index = beginning of the source list.
	cvt.s64.s32 	%rd1, %r9;
	ld.param.u64 	%rd2, [__cudaparm_kernel_unpack_dev_nbor];
	cvt.s64.s32 	%rd3, %r8;
	add.u64 	%rd4, %rd3, %rd1;
	mul.lo.u64 	%rd5, %rd4, 4;
	add.u64 	%rd6, %rd2, %rd5;
	mul.wide.s32 	%rd7, %r9, 4;
	add.u64 	%rd8, %rd6, %rd7;
	ld.param.u64 	%rd9, [__cudaparm_kernel_unpack_dev_ij];
	ld.global.s32 	%r10, [%rd8+0];
	cvt.s64.s32 	%rd10, %r10;
	mul.wide.s32 	%rd11, %r10, 4;
	add.u64 	%rd12, %rd9, %rd11;
	.loc	16	31	0
	// %rd15 = source list end (start + count entries).
	ld.global.s32 	%r11, [%rd6+0];
	cvt.s64.s32 	%rd13, %r11;
	mul.wide.s32 	%rd14, %r11, 4;
	add.u64 	%rd15, %rd12, %rd14;
	.loc	16	33	0
	// %r13 = (t_per_atom - 1) & tid : this thread's offset within its group.
	// NOTE(review): equals tid % t_per_atom only for power-of-two
	// t_per_atom -- confirm the host guarantees that.
	// %rd18 = destination: %rd8 + (offset + (t_per_atom - 1) * slot) ints.
	sub.s32 	%r12, %r1, 1;
	and.b32 	%r13, %r12, %r2;
	mul.lo.s32 	%r14, %r12, %r8;
	add.s32 	%r15, %r13, %r14;
	cvt.s64.s32 	%rd16, %r15;
	mul.wide.s32 	%rd17, %r15, 4;
	add.u64 	%rd18, %rd8, %rd17;
	.loc	16	34	0
	// %rd21 = source cursor, starting `offset` entries into the list.
	// Skip the copy loop when the cursor is already at or past the end.
	cvt.s64.s32 	%rd19, %r13;
	mul.wide.s32 	%rd20, %r13, 4;
	add.u64 	%rd21, %rd12, %rd20;
	setp.ge.u64 	%p2, %rd21, %rd15;
	@%p2 bra 	$Lt_0_2562;
	// %rd22..%rd29 compute a trip count, (end - cursor + 3) / 4, that the loop
	// below never reads; the loop terminates on a pointer comparison instead.
	sub.u64 	%rd22, %rd15, %rd21;
	add.u64 	%rd23, %rd22, 3;
	shr.s64 	%rd24, %rd23, 63;
	mov.s64 	%rd25, 3;
	and.b64 	%rd26, %rd24, %rd25;
	add.s64 	%rd27, %rd26, %rd23;
	shr.s64 	%rd28, %rd27, 2;
	// %r16 = inum * t_per_atom : destination stride in ints.
	mul.lo.s32 	%r16, %r9, %r1;
	mov.s64 	%rd29, %rd28;
$Lt_0_3074:
 //<loop> Loop body line 34, nesting depth: 1, estimated iterations: unknown
	.loc	16	37	0
	// Copy one entry, then advance the destination by the stride and the
	// source by a single int (4 bytes).
	ld.global.s32 	%r17, [%rd21+0];
	st.global.s32 	[%rd18+0], %r17;
	.loc	16	38	0
	cvt.s64.s32 	%rd30, %r16;
	mul.wide.s32 	%rd31, %r16, 4;
	add.u64 	%rd18, %rd18, %rd31;
	add.u64 	%rd21, %rd21, 4;
	// Exit test is inequality: the cursor and the end both sit a whole number
	// of ints from %rd12, and the cursor started below the end, so stepping by
	// 4 bytes reaches the end exactly.
	setp.ne.u64 	%p3, %rd21, %rd15;
	@%p3 bra 	$Lt_0_3074;
$Lt_0_2562:
$Lt_0_2050:
	.loc	16	41	0
	exit;
$LDWend_kernel_unpack:
	} // kernel_unpack

View File

@ -1,86 +0,0 @@
/*
 * neighbor_cpu: PTX source text for the kernel_unpack entry point, held as a
 * single C string built from adjacent literals (one PTX line per literal).
 * The text targets PTX ISA 2.3 / sm_20 with 64-bit addresses, as stated by
 * its own .version/.target/.address_size directives.
 * NOTE(review): presumably handed to a run-time PTX loader -- confirm at the
 * point of use.
 */
const char * neighbor_cpu = 
"	.version 2.3\n"
"	.target sm_20\n"
"	.address_size 64\n"
/* kernel_unpack(dev_nbor, dev_ij, inum, t_per_atom): copies one atom's
   contiguous list from dev_ij into a strided region of dev_nbor. */
"	.entry kernel_unpack (\n"
"		.param .u64 __cudaparm_kernel_unpack_dev_nbor,\n"
"		.param .u64 __cudaparm_kernel_unpack_dev_ij,\n"
"		.param .s32 __cudaparm_kernel_unpack_inum,\n"
"		.param .s32 __cudaparm_kernel_unpack_t_per_atom)\n"
"	{\n"
"	.reg .u32 %r<19>;\n"
"	.reg .u64 %rd<33>;\n"
"	.reg .pred %p<5>;\n"
"	.loc	16	21	0\n"
"$LDWbegin_kernel_unpack:\n"
/* %r8 = ctaid.x * (ntid.x / t_per_atom) + tid.x / t_per_atom : atom slot.
   Threads whose slot is >= inum exit. */
"	ld.param.s32 	%r1, [__cudaparm_kernel_unpack_t_per_atom];\n"
"	cvt.s32.u32 	%r2, %tid.x;\n"
"	div.s32 	%r3, %r2, %r1;\n"
"	cvt.s32.u32 	%r4, %ntid.x;\n"
"	div.s32 	%r5, %r4, %r1;\n"
"	cvt.s32.u32 	%r6, %ctaid.x;\n"
"	mul.lo.s32 	%r7, %r6, %r5;\n"
"	add.s32 	%r8, %r3, %r7;\n"
"	ld.param.s32 	%r9, [__cudaparm_kernel_unpack_inum];\n"
"	setp.ge.s32 	%p1, %r8, %r9;\n"
"	@%p1 bra 	$Lt_0_2050;\n"
"	.loc	16	30	0\n"
/* %rd6 = &dev_nbor[slot + inum] (count); %rd8 = &dev_nbor[slot + 2*inum]
   (start index); %rd12 = dev_ij + start index = source list begin. */
"	cvt.s64.s32 	%rd1, %r9;\n"
"	ld.param.u64 	%rd2, [__cudaparm_kernel_unpack_dev_nbor];\n"
"	cvt.s64.s32 	%rd3, %r8;\n"
"	add.u64 	%rd4, %rd3, %rd1;\n"
"	mul.lo.u64 	%rd5, %rd4, 4;\n"
"	add.u64 	%rd6, %rd2, %rd5;\n"
"	mul.wide.s32 	%rd7, %r9, 4;\n"
"	add.u64 	%rd8, %rd6, %rd7;\n"
"	ld.param.u64 	%rd9, [__cudaparm_kernel_unpack_dev_ij];\n"
"	ld.global.s32 	%r10, [%rd8+0];\n"
"	cvt.s64.s32 	%rd10, %r10;\n"
"	mul.wide.s32 	%rd11, %r10, 4;\n"
"	add.u64 	%rd12, %rd9, %rd11;\n"
"	.loc	16	31	0\n"
/* %rd15 = source list end (begin + count entries). */
"	ld.global.s32 	%r11, [%rd6+0];\n"
"	cvt.s64.s32 	%rd13, %r11;\n"
"	mul.wide.s32 	%rd14, %r11, 4;\n"
"	add.u64 	%rd15, %rd12, %rd14;\n"
"	.loc	16	33	0\n"
/* %r13 = (t_per_atom - 1) & tid : offset of this thread within its group.
   %rd18 = destination = %rd8 + (offset + (t_per_atom - 1) * slot) ints. */
"	sub.s32 	%r12, %r1, 1;\n"
"	and.b32 	%r13, %r12, %r2;\n"
"	mul.lo.s32 	%r14, %r12, %r8;\n"
"	add.s32 	%r15, %r13, %r14;\n"
"	cvt.s64.s32 	%rd16, %r15;\n"
"	mul.wide.s32 	%rd17, %r15, 4;\n"
"	add.u64 	%rd18, %rd8, %rd17;\n"
"	.loc	16	34	0\n"
/* %rd21 = source cursor, `offset` entries into the list; skip the loop when
   it is already at or past the end. */
"	cvt.s64.s32 	%rd19, %r13;\n"
"	mul.wide.s32 	%rd20, %r13, 4;\n"
"	add.u64 	%rd21, %rd12, %rd20;\n"
"	setp.ge.u64 	%p2, %rd21, %rd15;\n"
"	@%p2 bra 	$Lt_0_2562;\n"
/* %rd22..%rd29: a trip count that the loop below never reads. */
"	sub.u64 	%rd22, %rd15, %rd21;\n"
"	add.u64 	%rd23, %rd22, 3;\n"
"	shr.s64 	%rd24, %rd23, 63;\n"
"	mov.s64 	%rd25, 3;\n"
"	and.b64 	%rd26, %rd24, %rd25;\n"
"	add.s64 	%rd27, %rd26, %rd23;\n"
"	shr.s64 	%rd28, %rd27, 2;\n"
/* %r16 = inum * t_per_atom : destination stride in ints. */
"	mul.lo.s32 	%r16, %r9, %r1;\n"
"	mov.s64 	%rd29, %rd28;\n"
"$Lt_0_3074:\n"
"	.loc	16	37	0\n"
/* Copy loop: move one int, advance the destination by the stride and the
   source by 4 bytes, until the source cursor equals the end address. */
"	ld.global.s32 	%r17, [%rd21+0];\n"
"	st.global.s32 	[%rd18+0], %r17;\n"
"	.loc	16	38	0\n"
"	cvt.s64.s32 	%rd30, %r16;\n"
"	mul.wide.s32 	%rd31, %r16, 4;\n"
"	add.u64 	%rd18, %rd18, %rd31;\n"
"	add.u64 	%rd21, %rd21, 4;\n"
"	setp.ne.u64 	%p3, %rd21, %rd15;\n"
"	@%p3 bra 	$Lt_0_3074;\n"
"$Lt_0_2562:\n"
"$Lt_0_2050:\n"
"	.loc	16	41	0\n"
"	exit;\n"
"$LDWend_kernel_unpack:\n"
"	}\n"
;

View File

@ -1,870 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009a53_00000000-9_lal_neighbor_gpu.cpp3.i (/home/sjplimp/ccBI#.a5G2Mh)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009a53_00000000-8_lal_neighbor_gpu.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lal_neighbor_gpu.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
.global .texref neigh_tex;
	// calc_cell_id: one thread per particle; computes a linear cell index from
	// the particle position and records it together with the particle index.
	//   pos          - declared but never loaded here; positions are fetched
	//                  through the neigh_tex texture reference.
	//   cell_id      - output, u32 linear cell index per particle.
	//   particle_id  - output, s32 particle index per particle (identity).
	//   boxlo0..2    - lower box bound per axis.
	//   boxhi0..2    - upper box bound per axis.
	//   cell_size    - cell edge length used as the divisor on every axis.
	//   ncellx/ncelly- cell counts used to linearize the index.
	//   nall         - number of particles; threads at or past it exit.
	.entry calc_cell_id (
		.param .u64 __cudaparm_calc_cell_id_pos,
		.param .u64 __cudaparm_calc_cell_id_cell_id,
		.param .u64 __cudaparm_calc_cell_id_particle_id,
		.param .f32 __cudaparm_calc_cell_id_boxlo0,
		.param .f32 __cudaparm_calc_cell_id_boxlo1,
		.param .f32 __cudaparm_calc_cell_id_boxlo2,
		.param .f32 __cudaparm_calc_cell_id_boxhi0,
		.param .f32 __cudaparm_calc_cell_id_boxhi1,
		.param .f32 __cudaparm_calc_cell_id_boxhi2,
		.param .f32 __cudaparm_calc_cell_id_cell_size,
		.param .s32 __cudaparm_calc_cell_id_ncellx,
		.param .s32 __cudaparm_calc_cell_id_ncelly,
		.param .s32 __cudaparm_calc_cell_id_nall)
	{
	.reg .u32 %r<25>;
	.reg .u64 %rd<8>;
	.reg .f32 %f<35>;
	.reg .f64 %fd<11>;
	.reg .pred %p<3>;
	.loc	16	29	0
$LDWbegin_calc_cell_id:
	// %r5 = tid.x + ctaid.x * ntid.x : global particle index.
	mov.u32 	%r1, %tid.x;
	mov.u32 	%r2, %ctaid.x;
	mov.u32 	%r3, %ntid.x;
	mul.lo.u32 	%r4, %r2, %r3;
	add.u32 	%r5, %r1, %r4;
	ld.param.s32 	%r6, [__cudaparm_calc_cell_id_nall];
	setp.le.s32 	%p1, %r6, %r5;
	@%p1 bra 	$Lt_0_1026;
	.loc	16	33	0
	// Fetch the particle's record from neigh_tex; only the first three
	// components (%f5, %f6, %f7) are used below.
	mov.u32 	%r7, %r5;
	mov.s32 	%r8, 0;
	mov.u32 	%r9, %r8;
	mov.s32 	%r10, 0;
	mov.u32 	%r11, %r10;
	mov.s32 	%r12, 0;
	mov.u32 	%r13, %r12;
	tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[neigh_tex,{%r7,%r9,%r11,%r13}];
	mov.f32 	%f5, %f1;
	mov.f32 	%f6, %f2;
	mov.f32 	%f7, %f3;
	.loc	16	46	0
	// Per axis: clamp (coord - boxlo) to [-cell_size, (boxhi - boxlo) +
	// cell_size], divide by cell_size, add 1.0 in double precision and
	// truncate toward zero to an unsigned cell coordinate.
	ld.param.f32 	%f8, [__cudaparm_calc_cell_id_cell_size];
	neg.ftz.f32 	%f9, %f8;
	ld.param.f32 	%f10, [__cudaparm_calc_cell_id_boxlo0];
	ld.param.f32 	%f11, [__cudaparm_calc_cell_id_boxlo2];
	ld.param.f32 	%f12, [__cudaparm_calc_cell_id_boxlo1];
	ld.param.s32 	%r14, [__cudaparm_calc_cell_id_ncellx];
	ld.param.s32 	%r15, [__cudaparm_calc_cell_id_ncelly];
	// Third axis (boxlo2/boxhi2): %r16 = cell coordinate,
	// %r18 = %r16 * ncellx * ncelly.
	ld.param.f32 	%f13, [__cudaparm_calc_cell_id_boxhi2];
	sub.ftz.f32 	%f14, %f13, %f11;
	add.ftz.f32 	%f15, %f8, %f14;
	sub.ftz.f32 	%f16, %f7, %f11;
	max.ftz.f32 	%f17, %f9, %f16;
	min.ftz.f32 	%f18, %f15, %f17;
	div.approx.ftz.f32 	%f19, %f18, %f8;
	cvt.ftz.f64.f32 	%fd1, %f19;
	mov.f64 	%fd2, 0d3ff0000000000000;	// 1
	add.f64 	%fd3, %fd1, %fd2;
	cvt.rzi.u32.f64 	%r16, %fd3;
	mul.lo.u32 	%r17, %r14, %r16;
	mul.lo.u32 	%r18, %r15, %r17;
	// Second axis (boxlo1/boxhi1): %r19 = cell coordinate,
	// %r21 = %r18 + %r19 * ncellx.
	ld.param.f32 	%f20, [__cudaparm_calc_cell_id_boxhi1];
	sub.ftz.f32 	%f21, %f20, %f12;
	add.ftz.f32 	%f22, %f8, %f21;
	sub.ftz.f32 	%f23, %f6, %f12;
	max.ftz.f32 	%f24, %f9, %f23;
	min.ftz.f32 	%f25, %f22, %f24;
	div.approx.ftz.f32 	%f26, %f25, %f8;
	cvt.ftz.f64.f32 	%fd4, %f26;
	mov.f64 	%fd5, 0d3ff0000000000000;	// 1
	add.f64 	%fd6, %fd4, %fd5;
	cvt.rzi.u32.f64 	%r19, %fd6;
	mul.lo.u32 	%r20, %r14, %r19;
	add.u32 	%r21, %r18, %r20;
	// First axis (boxlo0/boxhi0): %r22 = cell coordinate,
	// %r23 = %r21 + %r22 = final linear cell index.
	ld.param.f32 	%f27, [__cudaparm_calc_cell_id_boxhi0];
	sub.ftz.f32 	%f28, %f27, %f10;
	add.ftz.f32 	%f29, %f8, %f28;
	sub.ftz.f32 	%f30, %f5, %f10;
	max.ftz.f32 	%f31, %f9, %f30;
	min.ftz.f32 	%f32, %f29, %f31;
	div.approx.ftz.f32 	%f33, %f32, %f8;
	cvt.ftz.f64.f32 	%fd7, %f33;
	mov.f64 	%fd8, 0d3ff0000000000000;	// 1
	add.f64 	%fd9, %fd7, %fd8;
	cvt.rzi.u32.f64 	%r22, %fd9;
	add.u32 	%r23, %r21, %r22;
	.loc	16	50	0
	// cell_id[index] = linear cell index.
	cvt.s64.s32 	%rd1, %r5;
	mul.wide.s32 	%rd2, %r5, 4;
	ld.param.u64 	%rd3, [__cudaparm_calc_cell_id_cell_id];
	add.u64 	%rd4, %rd3, %rd2;
	st.global.u32 	[%rd4+0], %r23;
	.loc	16	51	0
	// particle_id[index] = index.
	ld.param.u64 	%rd5, [__cudaparm_calc_cell_id_particle_id];
	add.u64 	%rd6, %rd5, %rd2;
	st.global.s32 	[%rd6+0], %r5;
$Lt_0_1026:
	.loc	16	53	0
	exit;
$LDWend_calc_cell_id:
	} // calc_cell_id
//-----------------------------------------------------------
// kernel_calc_cell_counts
// One thread per entry of cell_id; idx = ctaid.x * ntid.x + tid.x.
// Parameters:
//   cell_id     - address of a global array of 32-bit values, read at [idx] and [idx-1]
//   cell_counts - address of a global array of 32-bit values, written only
//   nall        - number of cell_id entries; threads with idx >= nall exit immediately
//   ncell       - last cell_counts index filled by the thread with idx == nall-1
// Fills performed (ranges are inclusive, empty ranges are skipped):
//   idx == 0        : cell_counts[0 .. cell_id[0]]                  = 0
//   idx == nall-1   : cell_counts[cell_id[idx]+1 .. ncell]          = nall
//   idx > 0 and cell_id[idx] != cell_id[idx-1]
//                   : cell_counts[cell_id[idx-1]+1 .. cell_id[idx]] = idx
// NOTE(review): the third fill looks like it expects cell_id to be sorted in
// ascending order - confirm against the caller.
//-----------------------------------------------------------
.entry kernel_calc_cell_counts (
.param .u64 __cudaparm_kernel_calc_cell_counts_cell_id,
.param .u64 __cudaparm_kernel_calc_cell_counts_cell_counts,
.param .s32 __cudaparm_kernel_calc_cell_counts_nall,
.param .s32 __cudaparm_kernel_calc_cell_counts_ncell)
{
.reg .u32 %r<33>;
.reg .u64 %rd<15>;
.reg .pred %p<13>;
.loc 16 56 0
$LDWbegin_kernel_calc_cell_counts:
// r5 = idx = ctaid.x * ntid.x + tid.x
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %ntid.x;
mul.lo.u32 %r3, %r1, %r2;
mov.u32 %r4, %tid.x;
add.u32 %r5, %r4, %r3;
// r6 = nall; p1 = (nall > idx). Threads past the end leave the kernel.
ld.param.s32 %r6, [__cudaparm_kernel_calc_cell_counts_nall];
setp.gt.s32 %p1, %r6, %r5;
@!%p1 bra $Lt_1_7426;
.loc 16 59 0
// rd4 = &cell_id[idx]; r7 = cell_id[idx]
ld.param.u64 %rd1, [__cudaparm_kernel_calc_cell_counts_cell_id];
cvt.s64.s32 %rd2, %r5;
mul.wide.s32 %rd3, %r5, 4;
add.u64 %rd4, %rd1, %rd3;
ld.global.u32 %r7, [%rd4+0];
// --- Fill 1: only the thread with idx == 0 ---
mov.u32 %r8, 0;
setp.ne.s32 %p2, %r5, %r8;
@%p2 bra $Lt_1_7938;
// r9 = cell_id[0] + 1 = number of leading entries to zero; skip when <= 0
add.s32 %r9, %r7, 1;
mov.u32 %r10, 0;
setp.le.s32 %p3, %r9, %r10;
@%p3 bra $Lt_1_8450;
mov.s32 %r11, %r9;
// rd5 walks cell_counts from element 0; r12 counts iterations
ld.param.u64 %rd5, [__cudaparm_kernel_calc_cell_counts_cell_counts];
mov.s32 %r12, 0;
mov.s32 %r13, %r11;
$Lt_1_8962:
//<loop> Loop body line 59, nesting depth: 1, estimated iterations: unknown
.loc 16 64 0
// cell_counts[r12] = 0
mov.s32 %r14, 0;
st.global.s32 [%rd5+0], %r14;
add.s32 %r12, %r12, 1;
add.u64 %rd5, %rd5, 4;
setp.ne.s32 %p4, %r9, %r12;
@%p4 bra $Lt_1_8962;
$Lt_1_8450:
$Lt_1_7938:
// --- Fill 2: only the thread with idx == nall-1 ---
sub.s32 %r15, %r6, 1;
setp.ne.s32 %p5, %r5, %r15;
@%p5 bra $Lt_1_9474;
.loc 16 67 0
// r16 runs from cell_id[idx]+1 up to and including ncell (r17); skip when already past ncell
add.s32 %r9, %r7, 1;
mov.s32 %r16, %r9;
ld.param.s32 %r17, [__cudaparm_kernel_calc_cell_counts_ncell];
setp.gt.s32 %p6, %r9, %r17;
@%p6 bra $Lt_1_9986;
sub.s32 %r18, %r17, %r7;
// r19 = ncell + 1 is the loop's exclusive end
add.s32 %r19, %r17, 1;
// rd9 = &cell_counts[cell_id[idx] + 1]
ld.param.u64 %rd6, [__cudaparm_kernel_calc_cell_counts_cell_counts];
cvt.s64.s32 %rd7, %r9;
mul.wide.s32 %rd8, %r9, 4;
add.u64 %rd9, %rd6, %rd8;
mov.s32 %r20, %r18;
$Lt_1_10498:
//<loop> Loop body line 67, nesting depth: 1, estimated iterations: unknown
.loc 16 68 0
// cell_counts[r16] = nall
st.global.s32 [%rd9+0], %r6;
add.s32 %r16, %r16, 1;
add.u64 %rd9, %rd9, 4;
setp.ne.s32 %p7, %r19, %r16;
@%p7 bra $Lt_1_10498;
$Lt_1_9986:
$Lt_1_9474:
// --- Fill 3: threads with idx > 0 ---
// r25 = (idx < nall) & (idx > 0), built as integers: selp yields 1/0, and
// set.gt yields 0xFFFFFFFF/0 which neg turns into 1/0.
selp.s32 %r21, 1, 0, %p1;
mov.s32 %r22, 0;
set.gt.u32.s32 %r23, %r5, %r22;
neg.s32 %r24, %r23;
and.b32 %r25, %r21, %r24;
mov.u32 %r26, 0;
setp.eq.s32 %p8, %r25, %r26;
@%p8 bra $Lt_1_11010;
.loc 16 72 0
// r27 = cell_id[idx-1]; nothing to do when it equals cell_id[idx]
ld.global.u32 %r27, [%rd4+-4];
setp.eq.s32 %p9, %r7, %r27;
@%p9 bra $Lt_1_11522;
.loc 16 74 0
// r29 runs from cell_id[idx-1]+1 up to and including cell_id[idx]; skip when the range is empty
add.s32 %r28, %r27, 1;
mov.s32 %r29, %r28;
setp.gt.s32 %p10, %r28, %r7;
@%p10 bra $Lt_1_12034;
sub.s32 %r30, %r7, %r27;
// r9 = cell_id[idx] + 1 is the loop's exclusive end
add.s32 %r9, %r7, 1;
// rd13 = &cell_counts[cell_id[idx-1] + 1]
ld.param.u64 %rd10, [__cudaparm_kernel_calc_cell_counts_cell_counts];
cvt.s64.s32 %rd11, %r28;
mul.wide.s32 %rd12, %r28, 4;
add.u64 %rd13, %rd10, %rd12;
mov.s32 %r31, %r30;
$Lt_1_12546:
//<loop> Loop body line 74, nesting depth: 1, estimated iterations: unknown
.loc 16 75 0
// cell_counts[r29] = idx
st.global.s32 [%rd13+0], %r5;
add.s32 %r29, %r29, 1;
add.u64 %rd13, %rd13, 4;
setp.ne.s32 %p11, %r9, %r29;
@%p11 bra $Lt_1_12546;
$Lt_1_12034:
$Lt_1_11522:
$Lt_1_11010:
$Lt_1_7426:
.loc 16 79 0
exit;
$LDWend_kernel_calc_cell_counts:
} // kernel_calc_cell_counts
//-----------------------------------------------------------
// transpose
// Transposes a rows_in x columns_in matrix of 32-bit integers, one 8x8 tile
// per thread block, staging each tile in shared memory.
// Parameters:
//   out        - address of the global destination; element (c, r) is written
//                at index c * rows_in + r
//   in         - address of the global source; element (r, c) is read from
//                index r * columns_in + c
//   columns_in - number of columns of the source matrix
//   rows_in    - number of rows of the source matrix
// The block origin is (ctaid.x * 8, ctaid.y * 8); tid.x / tid.y select the
// element inside the tile. Out-of-range threads skip their load/store but
// still take part in the barrier.
// The shared tile is 288 bytes = 8 rows of 9 words: the row pitch used in the
// index arithmetic is 9, one more than the tile width (presumably padding to
// avoid shared-memory bank conflicts on the transposed read).
// The tile is staged as s32. The earlier code converted each integer to f32
// for staging and back with round-toward-zero, which is not exact for every
// integer whose magnitude exceeds 2^24.
//-----------------------------------------------------------
.entry transpose (
.param .u64 __cudaparm_transpose_out,
.param .u64 __cudaparm_transpose_in,
.param .s32 __cudaparm_transpose_columns_in,
.param .s32 __cudaparm_transpose_rows_in)
{
.reg .u32 %r<32>;
.reg .u64 %rd<23>;
.reg .pred %p<4>;
.shared .align 4 .b8 __cuda___cuda_local_var_32571_32_non_const_block112[288];
.loc 16 86 0
$LDWbegin_transpose:
// r2 = ctaid.x * 8, r4 = ctaid.y * 8 (tile origin)
mov.u32 %r1, %ctaid.x;
mul.lo.u32 %r2, %r1, 8;
mov.u32 %r3, %ctaid.y;
mul.lo.u32 %r4, %r3, 8;
// r6 = source column = r2 + tid.x, r8 = source row = r4 + tid.y
mov.u32 %r5, %tid.x;
add.u32 %r6, %r2, %r5;
mov.u32 %r7, %tid.y;
add.u32 %r8, %r4, %r7;
ld.param.s32 %r9, [__cudaparm_transpose_rows_in];
ld.param.s32 %r10, [__cudaparm_transpose_columns_in];
// r15 != 0 iff (rows_in > row) && (columns_in > column), compared as unsigned
set.gt.u32.u32 %r11, %r9, %r8;
neg.s32 %r12, %r11;
set.gt.u32.u32 %r13, %r10, %r6;
neg.s32 %r14, %r13;
and.b32 %r15, %r12, %r14;
mov.u32 %r16, 0;
setp.eq.s32 %p1, %r15, %r16;
@%p1 bra $Lt_2_2306;
.loc 16 98 0
mov.u64 %rd1, __cuda___cuda_local_var_32571_32_non_const_block112;
// r19 = in[row * columns_in + column]
ld.param.u64 %rd2, [__cudaparm_transpose_in];
mul.lo.u32 %r17, %r10, %r8;
add.u32 %r18, %r6, %r17;
cvt.u64.u32 %rd3, %r18;
mul.wide.u32 %rd4, %r18, 4;
add.u64 %rd5, %rd2, %rd4;
ld.global.s32 %r19, [%rd5+0];
// tile[tid.y][tid.x] = r19, with a row pitch of 9 words
cvt.u64.u32 %rd6, %r5;
cvt.u64.u32 %rd7, %r7;
mul.wide.u32 %rd8, %r7, 9;
add.u64 %rd9, %rd6, %rd8;
mul.lo.u64 %rd10, %rd9, 4;
add.u64 %rd11, %rd1, %rd10;
// stored as an integer so the value survives the staging bit-for-bit
st.shared.s32 [%rd11+0], %r19;
$Lt_2_2306:
mov.u64 %rd1, __cuda___cuda_local_var_32571_32_non_const_block112;
.loc 16 100 0
// every thread of the block reaches this barrier, whether or not it loaded
bar.sync 0;
// r20 = r2 + tid.y, r21 = r4 + tid.x: thread roles are swapped for the write
add.u32 %r20, %r2, %r7;
add.u32 %r21, %r4, %r5;
// r26 != 0 iff (rows_in > r21) && (columns_in > r20), compared as unsigned
set.gt.u32.u32 %r22, %r9, %r21;
neg.s32 %r23, %r22;
set.gt.u32.u32 %r24, %r10, %r20;
neg.s32 %r25, %r24;
and.b32 %r26, %r23, %r25;
mov.u32 %r27, 0;
setp.eq.s32 %p2, %r26, %r27;
@%p2 bra $Lt_2_2818;
.loc 16 105 0
// r28 = tile[tid.x][tid.y], with a row pitch of 9 words
cvt.u64.u32 %rd12, %r7;
cvt.u64.u32 %rd13, %r5;
mul.wide.u32 %rd14, %r5, 9;
add.u64 %rd15, %rd12, %rd14;
mul.lo.u64 %rd16, %rd15, 4;
add.u64 %rd17, %rd1, %rd16;
ld.shared.s32 %r28, [%rd17+0];
// out[r20 * rows_in + r21] = r28
ld.param.u64 %rd18, [__cudaparm_transpose_out];
mul.lo.u32 %r29, %r9, %r20;
add.u32 %r30, %r21, %r29;
cvt.u64.u32 %rd19, %r30;
mul.wide.u32 %rd20, %r30, 4;
add.u64 %rd21, %rd18, %rd20;
st.global.s32 [%rd21+0], %r28;
$Lt_2_2818:
.loc 16 106 0
exit;
$LDWend_transpose:
} // transpose
//-----------------------------------------------------------
// calc_neigh_list_cell
// Builds per-atom neighbor lists from a cell decomposition. One thread block
// handles one cell: ctaid.x selects the cell along x, ctaid.y % ncelly along y
// and ctaid.y / ncelly along z. Each thread handles the atoms of that cell at
// positions tid.x, tid.x + ntid.x, ... and scans the 3x3x3 block of
// surrounding cells (clamped to the grid), 128 candidate atoms at a time
// through shared memory.
// Parameters:
//   x_               - not read anywhere in this kernel; coordinates are
//                      fetched through the texture reference neigh_tex
//   cell_particle_id - global s32 array: atom id for each slot of the cell ordering
//   cell_counts      - global s32 array: [c] and [c+1] are read as the first and
//                      one-past-last slot of cell c
//   nbor_list        - global s32 array written for atoms with id < inum
//   host_nbor_list   - global s32 array written for atoms with id >= inum
//   host_numj        - global s32 array receiving the count for atoms with id >= inum
//   neigh_bin_size   - maximum number of neighbors stored per atom; also the
//                      per-atom row length in host_nbor_list
//   cell_size        - cutoff distance; compared squared against the squared distance
//   ncellx, ncelly, ncellz - grid dimensions in cells
//   inum             - atoms with id < inum use nbor_list, the others the host arrays
//   nt               - only atoms with id < nt get coordinates fetched and a count stored
//   nall             - id given to a thread that has no atom in the current pass
//   t_per_atom       - used for the interleaved layout of nbor_list
// NOTE(review): (count & (t_per_atom - 1)) is used as a wrap test below, which
// presumably assumes t_per_atom is a power of two - confirm.
// NOTE(review): the shared buffers hold 128 entries and are indexed by tid.x,
// which presumably assumes at most 128 threads per block - confirm.
//-----------------------------------------------------------
.entry calc_neigh_list_cell (
.param .u64 __cudaparm_calc_neigh_list_cell_x_,
.param .u64 __cudaparm_calc_neigh_list_cell_cell_particle_id,
.param .u64 __cudaparm_calc_neigh_list_cell_cell_counts,
.param .u64 __cudaparm_calc_neigh_list_cell_nbor_list,
.param .u64 __cudaparm_calc_neigh_list_cell_host_nbor_list,
.param .u64 __cudaparm_calc_neigh_list_cell_host_numj,
.param .s32 __cudaparm_calc_neigh_list_cell_neigh_bin_size,
.param .f32 __cudaparm_calc_neigh_list_cell_cell_size,
.param .s32 __cudaparm_calc_neigh_list_cell_ncellx,
.param .s32 __cudaparm_calc_neigh_list_cell_ncelly,
.param .s32 __cudaparm_calc_neigh_list_cell_ncellz,
.param .s32 __cudaparm_calc_neigh_list_cell_inum,
.param .s32 __cudaparm_calc_neigh_list_cell_nt,
.param .s32 __cudaparm_calc_neigh_list_cell_nall,
.param .s32 __cudaparm_calc_neigh_list_cell_t_per_atom)
{
.reg .u32 %r<118>;
.reg .u64 %rd<52>;
.reg .f32 %f<41>;
.reg .f64 %fd<4>;
.reg .pred %p<23>;
// pos_sh: 2048 bytes = 128 entries of 16 bytes (three floats are written per entry)
.shared .align 16 .b8 __cuda___cuda_local_var_32609_34_non_const_pos_sh496[2048];
// cell_list_sh: 512 bytes = 128 atom ids of 4 bytes
.shared .align 4 .b8 __cuda___cuda_local_var_32608_31_non_const_cell_list_sh2544[512];
// __cuda_local_var_32624_12_non_const_atom_i = 16
.loc 16 116 0
$LDWbegin_calc_neigh_list_cell:
.loc 16 128 0
// r3 = ctaid.y % ncelly (cell y), r4 = ctaid.y / ncelly (cell z), r9 = ctaid.x (cell x)
ld.param.s32 %r1, [__cudaparm_calc_neigh_list_cell_ncelly];
mov.u32 %r2, %ctaid.y;
rem.u32 %r3, %r2, %r1;
div.u32 %r4, %r2, %r1;
ld.param.s32 %r5, [__cudaparm_calc_neigh_list_cell_ncellx];
mul.lo.s32 %r6, %r5, %r3;
mul.lo.s32 %r7, %r5, %r4;
mul.lo.s32 %r8, %r7, %r1;
cvt.s32.u32 %r9, %ctaid.x;
ld.param.u64 %rd1, [__cudaparm_calc_neigh_list_cell_cell_counts];
// r11 = x + y*ncellx + z*ncellx*ncelly = linear index of this block's cell
add.s32 %r10, %r6, %r8;
add.s32 %r11, %r9, %r10;
cvt.s64.s32 %rd2, %r11;
mul.wide.s32 %rd3, %r11, 4;
add.u64 %rd4, %rd1, %rd3;
// r12 = first slot of this cell, r13 = one past its last slot
ldu.global.s32 %r12, [%rd4+0];
.loc 16 129 0
ldu.global.s32 %r13, [%rd4+4];
.loc 16 137 0
// r16 = ceil((r13 - r12) / ntid.x), computed in f32 with an approximate divide;
// this is the number of passes needed to cover the cell. Exit when it is <= 0.
sub.s32 %r14, %r13, %r12;
mov.u32 %r15, %ntid.x;
cvt.rn.f32.u32 %f1, %r15;
cvt.rn.f32.s32 %f2, %r14;
div.approx.ftz.f32 %f3, %f2, %f1;
cvt.rpi.ftz.f32.f32 %f4, %f3;
cvt.rzi.ftz.s32.f32 %r16, %f4;
mov.u32 %r17, 0;
setp.le.s32 %p1, %r16, %r17;
@%p1 bra $Lt_3_14082;
// Neighbor-cell bounds, clamped to the grid:
//   y: r20 = max(y-1, 0) .. r23 = min(ncelly-1, y+1)
//   z: r42 = max(z-1, 0) .. r27 = min(ncellz-1, z+1)
//   x: r30 = max(x-1, 0) .. r33 = min(x+1, ncellx-1)
sub.s32 %r18, %r3, 1;
mov.s32 %r19, 0;
max.s32 %r20, %r18, %r19;
sub.s32 %r21, %r1, 1;
add.s32 %r22, %r3, 1;
min.s32 %r23, %r21, %r22;
ld.param.s32 %r24, [__cudaparm_calc_neigh_list_cell_ncellz];
sub.s32 %r25, %r24, 1;
add.s32 %r26, %r4, 1;
min.s32 %r27, %r25, %r26;
sub.s32 %r28, %r9, 1;
mov.s32 %r29, 0;
max.s32 %r30, %r28, %r29;
add.s32 %r31, %r9, 1;
sub.s32 %r32, %r5, 1;
min.s32 %r33, %r31, %r32;
mov.s32 %r34, %r16;
// r35 = tid.x; r36 = first slot handled by this thread; r37 = slot offset of the current pass
cvt.s32.u32 %r35, %tid.x;
add.s32 %r36, %r12, %r35;
mov.u32 %r37, 0;
// r38 = inum, rd5 = inum sign-extended to 64 bits
ld.param.s32 %r38, [__cudaparm_calc_neigh_list_cell_inum];
cvt.s64.s32 %rd5, %r38;
sub.s32 %r39, %r4, 1;
// r40 = slot handled in the current pass (advanced by ntid.x per pass)
mov.s32 %r40, %r36;
mov.s32 %r41, 0;
max.s32 %r42, %r39, %r41;
// p2 = the z range is non-empty
setp.ge.s32 %p2, %r27, %r42;
ld.param.s32 %r43, [__cudaparm_calc_neigh_list_cell_nt];
ld.param.s32 %r44, [__cudaparm_calc_neigh_list_cell_nall];
// r45 = pass counter; rd6 / rd7 = addresses of pos_sh / cell_list_sh
mov.s32 %r45, 0;
mov.u64 %rd6, __cuda___cuda_local_var_32609_34_non_const_pos_sh496;
mov.u64 %rd7, __cuda___cuda_local_var_32608_31_non_const_cell_list_sh2544;
mov.s32 %r46, %r34;
$Lt_3_14594:
//<loop> Loop body line 137, nesting depth: 1, estimated iterations: unknown
.loc 16 140 0
// r47 = id of this thread's atom; stays nall when the slot is past the end of the cell
mov.s32 %r47, %r44;
setp.ge.s32 %p3, %r40, %r13;
@%p3 bra $Lt_3_14850;
.loc 16 146 0
// r47 = cell_particle_id[r36 + r37]
ld.param.u64 %rd8, [__cudaparm_calc_neigh_list_cell_cell_particle_id];
add.u32 %r48, %r36, %r37;
cvt.s64.s32 %rd9, %r48;
mul.wide.s32 %rd10, %r48, 4;
add.u64 %rd11, %rd8, %rd10;
ld.global.s32 %r47, [%rd11+0];
$Lt_3_14850:
// p4 = (atom id < nt); only such atoms fetch coordinates and store a count
setp.lt.s32 %p4, %r47, %r43;
@!%p4 bra $Lt_3_15362;
.loc 16 149 0
// Fetch four floats for atom r47 from neigh_tex; the first three
// (presumably x, y, z) are kept in f12, f13, f14.
mov.u32 %r49, %r47;
mov.s32 %r50, 0;
mov.u32 %r51, %r50;
mov.s32 %r52, 0;
mov.u32 %r53, %r52;
mov.s32 %r54, 0;
mov.u32 %r55, %r54;
tex.1d.v4.f32.s32 {%f5,%f6,%f7,%f8},[neigh_tex,{%r49,%r51,%r53,%r55}];
mov.f32 %f9, %f5;
mov.f32 %f10, %f6;
mov.f32 %f11, %f7;
mov.f32 %f12, %f9;
mov.f32 %f13, %f10;
mov.f32 %f14, %f11;
$Lt_3_15362:
// rd12 = atom id (64-bit), rd13 = atom id * 4 (byte offset)
cvt.s64.s32 %rd12, %r47;
mul.wide.s32 %rd13, %r47, 4;
// Choose the output location: nbor_list for id < inum, host arrays otherwise.
setp.ge.s32 %p5, %r47, %r38;
@%p5 bra $Lt_3_16130;
.loc 16 153 0
// rd18 = &nbor_list[id + inum]: where the neighbor count is stored
ld.param.u64 %rd14, [__cudaparm_calc_neigh_list_cell_nbor_list];
add.u64 %rd15, %rd12, %rd5;
mul.lo.u64 %rd16, %rd15, 4;
add.u64 %rd17, %rd14, %rd16;
mov.s64 %rd18, %rd17;
.loc 16 154 0
// rd22 = rd17 + (id * (t_per_atom - 1) + inum) elements: first neighbor slot
ld.param.s32 %r56, [__cudaparm_calc_neigh_list_cell_t_per_atom];
sub.s32 %r57, %r56, 1;
mul.lo.s32 %r58, %r47, %r57;
cvt.s64.s32 %rd19, %r58;
add.u64 %rd20, %rd19, %rd5;
mul.lo.u64 %rd21, %rd20, 4;
add.u64 %rd22, %rd17, %rd21;
.loc 16 155 0
// r60 = t_per_atom * inum - t_per_atom: extra element skip applied on wrap (see store below)
mul.lo.s32 %r59, %r56, %r38;
sub.s32 %r60, %r59, %r56;
.loc 16 156 0
// nbor_list[id] = id
add.u64 %rd23, %rd13, %rd14;
st.global.s32 [%rd23+0], %r47;
bra.uni $Lt_3_15874;
$Lt_3_16130:
.loc 16 159 0
// rd18 = &host_numj[id - inum], formed as host_numj + id*4 - inum*4
ld.param.u64 %rd24, [__cudaparm_calc_neigh_list_cell_host_numj];
add.u64 %rd25, %rd24, %rd13;
mul.lo.u64 %rd26, %rd5, 4;
sub.u64 %rd18, %rd25, %rd26;
.loc 16 160 0
// rd22 = &host_nbor_list[neigh_bin_size * (id - inum)]; r60 = 0 (contiguous row)
ld.param.u64 %rd27, [__cudaparm_calc_neigh_list_cell_host_nbor_list];
ld.param.s32 %r61, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];
sub.s32 %r62, %r47, %r38;
mul.lo.s32 %r63, %r61, %r62;
cvt.s64.s32 %rd28, %r63;
mul.wide.s32 %rd29, %r63, 4;
add.u64 %rd22, %rd27, %rd29;
mov.s32 %r60, 0;
$Lt_3_15874:
.loc 16 165 0
// z loop: r64 runs from r42 to r27 inclusive (r67 = r27 + 1 is the exclusive end)
mov.s32 %r64, %r42;
@!%p2 bra $Lt_3_24066;
sub.s32 %r65, %r27, %r42;
add.s32 %r66, %r65, 1;
// p6 = the y range is non-empty
setp.le.s32 %p6, %r20, %r23;
add.s32 %r67, %r27, 1;
// r68 = number of neighbors found so far for this thread's atom
mov.s32 %r68, 0;
mov.s32 %r69, %r66;
$Lt_3_16898:
//<loop> Loop body line 165, nesting depth: 2, estimated iterations: unknown
.loc 16 166 0
// y loop: r70 runs from r20 to r23 inclusive (r73 = r23 + 1 is the exclusive end)
mov.s32 %r70, %r20;
@!%p6 bra $Lt_3_17154;
sub.s32 %r71, %r23, %r20;
add.s32 %r72, %r71, 1;
// p7 = the x range is non-empty
setp.ge.s32 %p7, %r33, %r30;
add.s32 %r73, %r23, 1;
mov.s32 %r74, %r72;
$Lt_3_17666:
//<loop> Loop body line 166, nesting depth: 3, estimated iterations: unknown
@!%p7 bra $Lt_3_17922;
sub.s32 %r75, %r33, %r30;
add.s32 %r76, %r75, 1;
// r81 = y*ncellx + z*ncellx*ncelly; the x loop walks linear cell indices
// r82 = r81 + r30 up to r83 = r81 + r33 + 1 (exclusive)
mul.lo.s32 %r77, %r70, %r5;
mul.lo.s32 %r78, %r64, %r5;
mul.lo.s32 %r79, %r78, %r1;
add.s32 %r80, %r33, 1;
add.s32 %r81, %r77, %r79;
add.s32 %r82, %r81, %r30;
add.s32 %r83, %r80, %r81;
// rd32 = &cell_counts[r82], advanced together with r82
cvt.s64.s32 %rd30, %r82;
mul.wide.s32 %rd31, %r82, 4;
add.u64 %rd32, %rd1, %rd31;
mov.s32 %r84, %r76;
$Lt_3_18434:
//<loop> Loop body line 166, nesting depth: 4, estimated iterations: unknown
.loc 16 171 0
// r85 = first slot of the neighbor cell
ld.global.s32 %r85, [%rd32+0];
.loc 16 172 0
// r86 = one past its last slot
ld.global.s32 %r86, [%rd32+4];
.loc 16 176 0
// r87 = atoms in the neighbor cell; r88 = ceil(r87 / 128) chunks (f32, approximate divide)
sub.s32 %r87, %r86, %r85;
cvt.rn.f32.s32 %f15, %r87;
mov.f32 %f16, 0f43000000; // 128
div.approx.ftz.f32 %f17, %f15, %f16;
cvt.rpi.ftz.f32.f32 %f18, %f17;
cvt.rzi.ftz.s32.f32 %r88, %f18;
mov.u32 %r89, 0;
setp.le.s32 %p8, %r88, %r89;
@%p8 bra $Lt_3_18690;
mov.s32 %r90, %r88;
// r91 = offset of the current chunk inside the neighbor cell; r92 = r88 * 128 ends the loop
mov.s32 %r91, 0;
// p9 = (this thread's atom id < nt), same condition as p4
setp.lt.s32 %p9, %r47, %r43;
mul.lo.s32 %r92, %r88, 128;
mov.s32 %r93, %r90;
$Lt_3_19202:
//<loop> Loop body line 176, nesting depth: 5, estimated iterations: unknown
// r96 = number of atoms in this chunk = min(r87 - r91, 128)
sub.s32 %r94, %r87, %r91;
mov.s32 %r95, 128;
min.s32 %r96, %r94, %r95;
// Threads with tid.x < r96 each stage one candidate atom into shared memory.
setp.le.s32 %p10, %r96, %r35;
@%p10 bra $Lt_3_19458;
.loc 16 183 0
// r99 = cell_particle_id[r85 + r91 + tid.x]
ld.param.u64 %rd33, [__cudaparm_calc_neigh_list_cell_cell_particle_id];
add.s32 %r97, %r91, %r35;
add.s32 %r98, %r85, %r97;
cvt.s64.s32 %rd34, %r98;
mul.wide.s32 %rd35, %r98, 4;
add.u64 %rd36, %rd33, %rd35;
ld.global.s32 %r99, [%rd36+0];
.loc 16 184 0
// cell_list_sh[tid.x] = candidate atom id
cvt.s64.s32 %rd37, %r35;
mul.wide.s32 %rd38, %r35, 4;
add.u64 %rd39, %rd7, %rd38;
st.shared.s32 [%rd39+0], %r99;
.loc 16 185 0
// Fetch the candidate's four floats from neigh_tex; the first three are kept.
mov.u32 %r100, %r99;
mov.s32 %r101, 0;
mov.u32 %r102, %r101;
mov.s32 %r103, 0;
mov.u32 %r104, %r103;
mov.s32 %r105, 0;
mov.u32 %r106, %r105;
tex.1d.v4.f32.s32 {%f19,%f20,%f21,%f22},[neigh_tex,{%r100,%r102,%r104,%r106}];
mov.f32 %f23, %f19;
mov.f32 %f24, %f20;
mov.f32 %f25, %f21;
.loc 16 186 0
// pos_sh[tid.x] (16-byte entry): first two components at +0 ...
mul.lo.u64 %rd40, %rd37, 16;
add.u64 %rd41, %rd6, %rd40;
st.shared.v2.f32 [%rd41+0], {%f23,%f24};
.loc 16 188 0
// ... third component at +8; bytes +12..+15 are not written
st.shared.f32 [%rd41+8], %f25;
$Lt_3_19458:
.loc 16 190 0
// Barrier: staged data must be visible before any thread scans it.
bar.sync 0;
// Scan the chunk only when this thread's atom id < nt and the chunk is non-empty.
@!%p9 bra $Lt_3_20482;
mov.u32 %r107, 0;
setp.le.s32 %p11, %r96, %r107;
@%p11 bra $Lt_3_20482;
mov.s32 %r108, %r96;
// rd42 = candidate index j; rd43 = &pos_sh[j]; f27 = cell_size^2
mov.s64 %rd42, 0;
ld.param.f32 %f26, [__cudaparm_calc_neigh_list_cell_cell_size];
mul.ftz.f32 %f27, %f26, %f26;
mov.s64 %rd43, %rd6;
// f30, f29, f28 = the three coordinates of this thread's atom
mov.f32 %f28, %f14;
mov.f32 %f29, %f13;
mov.f32 %f30, %f12;
mov.s32 %r109, 0;
mov.s32 %r110, %r108;
$Lt_3_20994:
//<loop> Loop body line 190, nesting depth: 6, estimated iterations: unknown
// Load candidate j's coordinates; the fourth word of the entry is discarded.
ld.shared.v4.f32 {%f31,%f32,%f33,_}, [%rd43+0];
.loc 16 196 0
sub.ftz.f32 %f34, %f30, %f31;
.loc 16 197 0
sub.ftz.f32 %f35, %f29, %f32;
.loc 16 198 0
sub.ftz.f32 %f36, %f28, %f33;
.loc 16 195 0
// f39 = squared distance between the two atoms
mul.ftz.f32 %f37, %f35, %f35;
fma.rn.ftz.f32 %f38, %f34, %f34, %f37;
fma.rn.ftz.f32 %f39, %f36, %f36, %f38;
// Accept only when 1e-05 < squared distance < cell_size^2. The lower bound is
// compared in f64 (presumably it keeps an atom from listing itself - confirm).
setp.gt.ftz.f32 %p12, %f27, %f39;
@!%p12 bra $Lt_3_25346;
cvt.ftz.f64.f32 %fd1, %f39;
mov.f64 %fd2, 0d3ee4f8b588e368f1; // 1e-05
setp.gt.f64 %p13, %fd1, %fd2;
@!%p13 bra $Lt_3_25346;
.loc 16 202 0
// The count is incremented before the capacity check, so it keeps growing past
// neigh_bin_size even though nothing more is stored.
add.s32 %r68, %r68, 1;
ld.param.s32 %r111, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];
setp.lt.s32 %p14, %r111, %r68;
@%p14 bra $Lt_3_25346;
.loc 16 204 0
// *rd22 = cell_list_sh[j]
mul.lo.u64 %rd44, %rd42, 4;
add.u64 %rd45, %rd7, %rd44;
ld.shared.s32 %r112, [%rd45+0];
st.global.s32 [%rd22+0], %r112;
// Advance the write pointer by one element, plus r60 more elements when
// (count & (t_per_atom - 1)) == 0.
cvt.s64.s32 %rd46, %r60;
mul.wide.s32 %rd47, %r60, 4;
add.u64 %rd48, %rd22, %rd47;
add.u64 %rd49, %rd48, 4;
add.u64 %rd50, %rd22, 4;
ld.param.s32 %r113, [__cudaparm_calc_neigh_list_cell_t_per_atom];
sub.s32 %r114, %r113, 1;
and.b32 %r115, %r68, %r114;
mov.s32 %r116, 0;
setp.eq.s32 %p15, %r115, %r116;
selp.u64 %rd22, %rd49, %rd50, %p15;
$Lt_3_25346:
$L_3_13570:
.loc 16 202 0
// next candidate: j++, pos_sh pointer += 16 bytes
add.s32 %r109, %r109, 1;
add.s64 %rd42, %rd42, 1;
add.u64 %rd43, %rd43, 16;
setp.ne.s32 %p16, %r96, %r109;
@%p16 bra $Lt_3_20994;
$Lt_3_20482:
$Lt_3_19970:
.loc 16 212 0
// Barrier: all threads must finish scanning before the buffers are overwritten.
bar.sync 0;
// next chunk of 128 candidates
add.s32 %r91, %r91, 128;
setp.ne.s32 %p17, %r91, %r92;
@%p17 bra $Lt_3_19202;
$Lt_3_18690:
// next cell along x
add.s32 %r82, %r82, 1;
add.u64 %rd32, %rd32, 4;
setp.ne.s32 %p18, %r82, %r83;
@%p18 bra $Lt_3_18434;
$Lt_3_17922:
// next cell row along y
add.s32 %r70, %r70, 1;
setp.ne.s32 %p19, %r73, %r70;
@%p19 bra $Lt_3_17666;
$Lt_3_17154:
// next cell layer along z
add.s32 %r64, %r64, 1;
setp.ne.s32 %p20, %r67, %r64;
@%p20 bra $Lt_3_16898;
bra.uni $Lt_3_16386;
$Lt_3_24066:
// empty z range: no neighbors
mov.s32 %r68, 0;
$Lt_3_16386:
@!%p4 bra $Lt_3_23042;
.loc 16 218 0
// Store the neighbor count at rd18 (only for atoms with id < nt).
st.global.s32 [%rd18+0], %r68;
$Lt_3_23042:
// next pass: move on by ntid.x slots within this block's cell
add.s32 %r45, %r45, 1;
add.u32 %r37, %r37, %r15;
add.s32 %r40, %r40, %r15;
setp.ne.s32 %p21, %r16, %r45;
@%p21 bra $Lt_3_14594;
$Lt_3_14082:
.loc 16 220 0
exit;
$LDWend_calc_neigh_list_cell:
} // calc_neigh_list_cell
//-----------------------------------------------------------
// kernel_special
// Walks the neighbor list of one atom and tags every neighbor whose tag
// appears in that atom's "special" list by XOR-ing a 2-bit code into bits
// 30-31 of the stored neighbor index.
// The atom index is ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom),
// so t_per_atom consecutive threads share one atom; atoms with ii >= nt exit.
// Parameters:
//   dev_nbor       - global s32 neighbor storage used when ii < inum
//   host_nbor_list - global s32 neighbor storage used when ii >= inum
//   host_numj      - global s32 neighbor counts used when ii >= inum
//   tag            - global s32 array indexed by the stored neighbor value
//   nspecial       - global s32 array, three values per atom read at [3*ii .. 3*ii+2]
//   special        - global s32 array read at [ii + k*nt] for k = 0 .. nspecial[3*ii+2]-1
//   inum           - atoms with ii < inum use dev_nbor, the others the host arrays
//   nt             - number of atoms processed; also the stride of special
//   max_nbors      - row length of host_nbor_list
//   t_per_atom     - threads cooperating on one atom
// NOTE(review): (t_per_atom - 1) is used as a bit mask below, which presumably
// assumes t_per_atom is a power of two - confirm.
//-----------------------------------------------------------
.entry kernel_special (
.param .u64 __cudaparm_kernel_special_dev_nbor,
.param .u64 __cudaparm_kernel_special_host_nbor_list,
.param .u64 __cudaparm_kernel_special_host_numj,
.param .u64 __cudaparm_kernel_special_tag,
.param .u64 __cudaparm_kernel_special_nspecial,
.param .u64 __cudaparm_kernel_special_special,
.param .s32 __cudaparm_kernel_special_inum,
.param .s32 __cudaparm_kernel_special_nt,
.param .s32 __cudaparm_kernel_special_max_nbors,
.param .s32 __cudaparm_kernel_special_t_per_atom)
{
.reg .u32 %r<45>;
.reg .u64 %rd<45>;
.reg .pred %p<11>;
.loc 16 226 0
$LDWbegin_kernel_special:
// r8 = ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom)
ld.param.s32 %r1, [__cudaparm_kernel_special_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
cvt.s32.u32 %r6, %ctaid.x;
mul.lo.s32 %r7, %r6, %r5;
add.s32 %r8, %r3, %r7;
// exit when ii >= nt
ld.param.s32 %r9, [__cudaparm_kernel_special_nt];
setp.ge.s32 %p1, %r8, %r9;
@%p1 bra $Lt_4_6146;
.loc 16 236 0
// r11, r12, r13 = nspecial[3*ii], nspecial[3*ii+1], nspecial[3*ii+2]
ld.param.u64 %rd1, [__cudaparm_kernel_special_nspecial];
mul.lo.s32 %r10, %r8, 3;
cvt.s64.s32 %rd2, %r10;
mul.wide.s32 %rd3, %r10, 4;
add.u64 %rd4, %rd1, %rd3;
ld.global.s32 %r11, [%rd4+0];
.loc 16 237 0
ld.global.s32 %r12, [%rd4+4];
.loc 16 238 0
ld.global.s32 %r13, [%rd4+8];
// Select the neighbor storage: dev_nbor for ii < inum, host arrays otherwise.
ld.param.s32 %r14, [__cudaparm_kernel_special_inum];
setp.ge.s32 %p2, %r8, %r14;
@%p2 bra $Lt_4_6914;
.loc 16 244 0
// r15 = neighbor count = dev_nbor[ii + inum]
ld.param.u64 %rd5, [__cudaparm_kernel_special_dev_nbor];
cvt.s64.s32 %rd6, %r8;
cvt.s64.s32 %rd7, %r14;
add.u64 %rd8, %rd6, %rd7;
mul.lo.u64 %rd9, %rd8, 4;
add.u64 %rd10, %rd5, %rd9;
ld.global.s32 %r15, [%rd10+0];
.loc 16 246 0
// r17 = stride between this thread's successive neighbors = inum * t_per_atom elements
mul.lo.s32 %r16, %r14, %r1;
mov.s32 %r17, %r16;
.loc 16 248 0
// rd13 = rd10 + (inum + (t_per_atom - 1) * ii) elements: base of this atom's list
sub.s32 %r18, %r1, 1;
mul.lo.s32 %r19, %r18, %r8;
add.s32 %r20, %r14, %r19;
cvt.s64.s32 %rd11, %r20;
mul.wide.s32 %rd12, %r20, 4;
add.u64 %rd13, %rd10, %rd12;
// rd18 = end pointer = rd13 + ((count & (t_per_atom-1)) + stride * (count / t_per_atom)) elements
and.b32 %r21, %r18, %r15;
cvt.s64.s32 %rd14, %r21;
div.s32 %r22, %r15, %r1;
mul.lo.s32 %r23, %r16, %r22;
cvt.s64.s32 %rd15, %r23;
add.u64 %rd16, %rd14, %rd15;
mul.lo.u64 %rd17, %rd16, 4;
add.u64 %rd18, %rd13, %rd17;
.loc 16 249 0
// rd21 = start pointer = rd13 + (tid.x & (t_per_atom-1)) elements
and.b32 %r24, %r18, %r2;
cvt.s64.s32 %rd19, %r24;
mul.wide.s32 %rd20, %r24, 4;
add.u64 %rd21, %rd13, %rd20;
bra.uni $Lt_4_6658;
$Lt_4_6914:
.loc 16 252 0
// Host path: r25 = ii - inum; rd21 = &host_nbor_list[max_nbors * r25]
sub.s32 %r25, %r8, %r14;
ld.param.u64 %rd22, [__cudaparm_kernel_special_host_nbor_list];
ld.param.s32 %r26, [__cudaparm_kernel_special_max_nbors];
mul.lo.s32 %r27, %r26, %r25;
cvt.s64.s32 %rd23, %r27;
mul.wide.s32 %rd24, %r27, 4;
add.u64 %rd25, %rd22, %rd24;
mov.s64 %rd21, %rd25;
.loc 16 254 0
// rd18 = end pointer = start + host_numj[r25] elements; stride r17 = 1
ld.param.u64 %rd26, [__cudaparm_kernel_special_host_numj];
cvt.s64.s32 %rd27, %r25;
mul.wide.s32 %rd28, %r25, 4;
add.u64 %rd29, %rd26, %rd28;
ld.global.s32 %r28, [%rd29+0];
cvt.s64.s32 %rd30, %r28;
mul.wide.s32 %rd31, %r28, 4;
add.u64 %rd18, %rd25, %rd31;
mov.s32 %r17, 1;
$Lt_4_6658:
// Nothing to do when the start pointer is already at or past the end (unsigned compare).
setp.ge.u64 %p3, %rd21, %rd18;
@%p3 bra $Lt_4_7170;
// p4 = (nspecial[3*ii+2] > 0): the special list is non-empty
mov.s32 %r29, 0;
setp.gt.s32 %p4, %r13, %r29;
// rd32 = stride in elements (64-bit); rd33 = tag base address
cvt.s64.s32 %rd32, %r17;
ld.param.u64 %rd33, [__cudaparm_kernel_special_tag];
$Lt_4_7682:
//<loop> Loop body line 254, nesting depth: 1, estimated iterations: unknown
.loc 16 258 0
// r30 = stored neighbor value at the current list position
ld.global.s32 %r30, [%rd21+0];
.loc 16 259 0
// r31 = tag[r30]
cvt.s64.s32 %rd34, %r30;
mul.wide.s32 %rd35, %r30, 4;
add.u64 %rd36, %rd33, %rd35;
ld.global.s32 %r31, [%rd36+0];
@!%p4 bra $Lt_4_7938;
mov.s32 %r32, %r13;
cvt.s64.s32 %rd37, %r8;
cvt.s64.s32 %rd38, %r9;
// rd39 = nt * 4 bytes: distance between successive special entries of one atom
mul.wide.s32 %rd39, %r9, 4;
// rd42 = &special[ii]; r33 = index k into the special list
ld.param.u64 %rd40, [__cudaparm_kernel_special_special];
mul.wide.s32 %rd41, %r8, 4;
add.u64 %rd42, %rd40, %rd41;
mov.s32 %r33, 0;
mov.s32 %r34, %r32;
$Lt_4_8450:
//<loop> Loop body line 259, nesting depth: 1, estimated iterations: unknown
// r35 = special[ii + k*nt]; skip unless it equals the neighbor's tag
ld.global.s32 %r35, [%rd42+0];
setp.ne.s32 %p5, %r35, %r31;
@%p5 bra $Lt_4_8706;
.loc 16 269 0
// r42 = 1 + (k >= nspecial[3*ii]) + (k >= nspecial[3*ii+1]), built from two selects:
//   p6 = (k >= nspecial[3*ii]), p7 = (k >= nspecial[3*ii+1])
//   r42 = p7 ? (p6 ? 3 : 2) : (p6 ? 2 : 1)
setp.le.s32 %p6, %r11, %r33;
mov.s32 %r36, 3;
mov.s32 %r37, 2;
selp.s32 %r38, %r36, %r37, %p6;
mov.s32 %r39, 2;
mov.s32 %r40, 1;
selp.s32 %r41, %r39, %r40, %p6;
setp.le.s32 %p7, %r12, %r33;
selp.s32 %r42, %r38, %r41, %p7;
// XOR the code into bits 30-31 of the neighbor value and write it back.
// The loop does not stop at the first match: r30 keeps the updated value, so
// a later match would XOR again on top of it.
shl.b32 %r43, %r42, 30;
xor.b32 %r30, %r30, %r43;
.loc 16 270 0
st.global.s32 [%rd21+0], %r30;
$Lt_4_8706:
// next special entry: k++, pointer += nt elements
add.s32 %r33, %r33, 1;
add.u64 %rd42, %rd39, %rd42;
setp.ne.s32 %p8, %r13, %r33;
@%p8 bra $Lt_4_8450;
$Lt_4_7938:
.loc 16 257 0
// next neighbor: pointer += stride elements; continue while below the end (unsigned compare)
mul.lo.u64 %rd43, %rd32, 4;
add.u64 %rd21, %rd21, %rd43;
setp.lt.u64 %p9, %rd21, %rd18;
@%p9 bra $Lt_4_7682;
$Lt_4_7170:
$Lt_4_6146:
.loc 16 276 0
exit;
$LDWend_kernel_special:
} // kernel_special

View File

@ -1,809 +0,0 @@
const char * neighbor_gpu =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
" .global .texref neigh_tex;\n"
" .entry calc_cell_id (\n"
" .param .u64 __cudaparm_calc_cell_id_pos,\n"
" .param .u64 __cudaparm_calc_cell_id_cell_id,\n"
" .param .u64 __cudaparm_calc_cell_id_particle_id,\n"
" .param .f32 __cudaparm_calc_cell_id_boxlo0,\n"
" .param .f32 __cudaparm_calc_cell_id_boxlo1,\n"
" .param .f32 __cudaparm_calc_cell_id_boxlo2,\n"
" .param .f32 __cudaparm_calc_cell_id_boxhi0,\n"
" .param .f32 __cudaparm_calc_cell_id_boxhi1,\n"
" .param .f32 __cudaparm_calc_cell_id_boxhi2,\n"
" .param .f32 __cudaparm_calc_cell_id_cell_size,\n"
" .param .s32 __cudaparm_calc_cell_id_ncellx,\n"
" .param .s32 __cudaparm_calc_cell_id_ncelly,\n"
" .param .s32 __cudaparm_calc_cell_id_nall)\n"
" {\n"
" .reg .u32 %r<25>;\n"
" .reg .u64 %rd<8>;\n"
" .reg .f32 %f<35>;\n"
" .reg .f64 %fd<11>;\n"
" .reg .pred %p<3>;\n"
" .loc 16 29 0\n"
"$LDWbegin_calc_cell_id:\n"
" mov.u32 %r1, %tid.x;\n"
" mov.u32 %r2, %ctaid.x;\n"
" mov.u32 %r3, %ntid.x;\n"
" mul.lo.u32 %r4, %r2, %r3;\n"
" add.u32 %r5, %r1, %r4;\n"
" ld.param.s32 %r6, [__cudaparm_calc_cell_id_nall];\n"
" setp.le.s32 %p1, %r6, %r5;\n"
" @%p1 bra $Lt_0_1026;\n"
" .loc 16 33 0\n"
" mov.u32 %r7, %r5;\n"
" mov.s32 %r8, 0;\n"
" mov.u32 %r9, %r8;\n"
" mov.s32 %r10, 0;\n"
" mov.u32 %r11, %r10;\n"
" mov.s32 %r12, 0;\n"
" mov.u32 %r13, %r12;\n"
" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[neigh_tex,{%r7,%r9,%r11,%r13}];\n"
" mov.f32 %f5, %f1;\n"
" mov.f32 %f6, %f2;\n"
" mov.f32 %f7, %f3;\n"
" .loc 16 46 0\n"
" ld.param.f32 %f8, [__cudaparm_calc_cell_id_cell_size];\n"
" neg.ftz.f32 %f9, %f8;\n"
" ld.param.f32 %f10, [__cudaparm_calc_cell_id_boxlo0];\n"
" ld.param.f32 %f11, [__cudaparm_calc_cell_id_boxlo2];\n"
" ld.param.f32 %f12, [__cudaparm_calc_cell_id_boxlo1];\n"
" ld.param.s32 %r14, [__cudaparm_calc_cell_id_ncellx];\n"
" ld.param.s32 %r15, [__cudaparm_calc_cell_id_ncelly];\n"
" ld.param.f32 %f13, [__cudaparm_calc_cell_id_boxhi2];\n"
" sub.ftz.f32 %f14, %f13, %f11;\n"
" add.ftz.f32 %f15, %f8, %f14;\n"
" sub.ftz.f32 %f16, %f7, %f11;\n"
" max.ftz.f32 %f17, %f9, %f16;\n"
" min.ftz.f32 %f18, %f15, %f17;\n"
" div.approx.ftz.f32 %f19, %f18, %f8;\n"
" cvt.ftz.f64.f32 %fd1, %f19;\n"
" mov.f64 %fd2, 0d3ff0000000000000; \n"
" add.f64 %fd3, %fd1, %fd2;\n"
" cvt.rzi.u32.f64 %r16, %fd3;\n"
" mul.lo.u32 %r17, %r14, %r16;\n"
" mul.lo.u32 %r18, %r15, %r17;\n"
" ld.param.f32 %f20, [__cudaparm_calc_cell_id_boxhi1];\n"
" sub.ftz.f32 %f21, %f20, %f12;\n"
" add.ftz.f32 %f22, %f8, %f21;\n"
" sub.ftz.f32 %f23, %f6, %f12;\n"
" max.ftz.f32 %f24, %f9, %f23;\n"
" min.ftz.f32 %f25, %f22, %f24;\n"
" div.approx.ftz.f32 %f26, %f25, %f8;\n"
" cvt.ftz.f64.f32 %fd4, %f26;\n"
" mov.f64 %fd5, 0d3ff0000000000000; \n"
" add.f64 %fd6, %fd4, %fd5;\n"
" cvt.rzi.u32.f64 %r19, %fd6;\n"
" mul.lo.u32 %r20, %r14, %r19;\n"
" add.u32 %r21, %r18, %r20;\n"
" ld.param.f32 %f27, [__cudaparm_calc_cell_id_boxhi0];\n"
" sub.ftz.f32 %f28, %f27, %f10;\n"
" add.ftz.f32 %f29, %f8, %f28;\n"
" sub.ftz.f32 %f30, %f5, %f10;\n"
" max.ftz.f32 %f31, %f9, %f30;\n"
" min.ftz.f32 %f32, %f29, %f31;\n"
" div.approx.ftz.f32 %f33, %f32, %f8;\n"
" cvt.ftz.f64.f32 %fd7, %f33;\n"
" mov.f64 %fd8, 0d3ff0000000000000; \n"
" add.f64 %fd9, %fd7, %fd8;\n"
" cvt.rzi.u32.f64 %r22, %fd9;\n"
" add.u32 %r23, %r21, %r22;\n"
" .loc 16 50 0\n"
" cvt.s64.s32 %rd1, %r5;\n"
" mul.wide.s32 %rd2, %r5, 4;\n"
" ld.param.u64 %rd3, [__cudaparm_calc_cell_id_cell_id];\n"
" add.u64 %rd4, %rd3, %rd2;\n"
" st.global.u32 [%rd4+0], %r23;\n"
" .loc 16 51 0\n"
" ld.param.u64 %rd5, [__cudaparm_calc_cell_id_particle_id];\n"
" add.u64 %rd6, %rd5, %rd2;\n"
" st.global.s32 [%rd6+0], %r5;\n"
"$Lt_0_1026:\n"
" .loc 16 53 0\n"
" exit;\n"
"$LDWend_calc_cell_id:\n"
" }\n"
" .entry kernel_calc_cell_counts (\n"
" .param .u64 __cudaparm_kernel_calc_cell_counts_cell_id,\n"
" .param .u64 __cudaparm_kernel_calc_cell_counts_cell_counts,\n"
" .param .s32 __cudaparm_kernel_calc_cell_counts_nall,\n"
" .param .s32 __cudaparm_kernel_calc_cell_counts_ncell)\n"
" {\n"
" .reg .u32 %r<33>;\n"
" .reg .u64 %rd<15>;\n"
" .reg .pred %p<13>;\n"
" .loc 16 56 0\n"
"$LDWbegin_kernel_calc_cell_counts:\n"
" mov.u32 %r1, %ctaid.x;\n"
" mov.u32 %r2, %ntid.x;\n"
" mul.lo.u32 %r3, %r1, %r2;\n"
" mov.u32 %r4, %tid.x;\n"
" add.u32 %r5, %r4, %r3;\n"
" ld.param.s32 %r6, [__cudaparm_kernel_calc_cell_counts_nall];\n"
" setp.gt.s32 %p1, %r6, %r5;\n"
" @!%p1 bra $Lt_1_7426;\n"
" .loc 16 59 0\n"
" ld.param.u64 %rd1, [__cudaparm_kernel_calc_cell_counts_cell_id];\n"
" cvt.s64.s32 %rd2, %r5;\n"
" mul.wide.s32 %rd3, %r5, 4;\n"
" add.u64 %rd4, %rd1, %rd3;\n"
" ld.global.u32 %r7, [%rd4+0];\n"
" mov.u32 %r8, 0;\n"
" setp.ne.s32 %p2, %r5, %r8;\n"
" @%p2 bra $Lt_1_7938;\n"
" add.s32 %r9, %r7, 1;\n"
" mov.u32 %r10, 0;\n"
" setp.le.s32 %p3, %r9, %r10;\n"
" @%p3 bra $Lt_1_8450;\n"
" mov.s32 %r11, %r9;\n"
" ld.param.u64 %rd5, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n"
" mov.s32 %r12, 0;\n"
" mov.s32 %r13, %r11;\n"
"$Lt_1_8962:\n"
" .loc 16 64 0\n"
" mov.s32 %r14, 0;\n"
" st.global.s32 [%rd5+0], %r14;\n"
" add.s32 %r12, %r12, 1;\n"
" add.u64 %rd5, %rd5, 4;\n"
" setp.ne.s32 %p4, %r9, %r12;\n"
" @%p4 bra $Lt_1_8962;\n"
"$Lt_1_8450:\n"
"$Lt_1_7938:\n"
" sub.s32 %r15, %r6, 1;\n"
" setp.ne.s32 %p5, %r5, %r15;\n"
" @%p5 bra $Lt_1_9474;\n"
" .loc 16 67 0\n"
" add.s32 %r9, %r7, 1;\n"
" mov.s32 %r16, %r9;\n"
" ld.param.s32 %r17, [__cudaparm_kernel_calc_cell_counts_ncell];\n"
" setp.gt.s32 %p6, %r9, %r17;\n"
" @%p6 bra $Lt_1_9986;\n"
" sub.s32 %r18, %r17, %r7;\n"
" add.s32 %r19, %r17, 1;\n"
" ld.param.u64 %rd6, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n"
" cvt.s64.s32 %rd7, %r9;\n"
" mul.wide.s32 %rd8, %r9, 4;\n"
" add.u64 %rd9, %rd6, %rd8;\n"
" mov.s32 %r20, %r18;\n"
"$Lt_1_10498:\n"
" .loc 16 68 0\n"
" st.global.s32 [%rd9+0], %r6;\n"
" add.s32 %r16, %r16, 1;\n"
" add.u64 %rd9, %rd9, 4;\n"
" setp.ne.s32 %p7, %r19, %r16;\n"
" @%p7 bra $Lt_1_10498;\n"
"$Lt_1_9986:\n"
"$Lt_1_9474:\n"
" selp.s32 %r21, 1, 0, %p1;\n"
" mov.s32 %r22, 0;\n"
" set.gt.u32.s32 %r23, %r5, %r22;\n"
" neg.s32 %r24, %r23;\n"
" and.b32 %r25, %r21, %r24;\n"
" mov.u32 %r26, 0;\n"
" setp.eq.s32 %p8, %r25, %r26;\n"
" @%p8 bra $Lt_1_11010;\n"
" .loc 16 72 0\n"
" ld.global.u32 %r27, [%rd4+-4];\n"
" setp.eq.s32 %p9, %r7, %r27;\n"
" @%p9 bra $Lt_1_11522;\n"
" .loc 16 74 0\n"
" add.s32 %r28, %r27, 1;\n"
" mov.s32 %r29, %r28;\n"
" setp.gt.s32 %p10, %r28, %r7;\n"
" @%p10 bra $Lt_1_12034;\n"
" sub.s32 %r30, %r7, %r27;\n"
" add.s32 %r9, %r7, 1;\n"
" ld.param.u64 %rd10, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n"
" cvt.s64.s32 %rd11, %r28;\n"
" mul.wide.s32 %rd12, %r28, 4;\n"
" add.u64 %rd13, %rd10, %rd12;\n"
" mov.s32 %r31, %r30;\n"
"$Lt_1_12546:\n"
" .loc 16 75 0\n"
" st.global.s32 [%rd13+0], %r5;\n"
" add.s32 %r29, %r29, 1;\n"
" add.u64 %rd13, %rd13, 4;\n"
" setp.ne.s32 %p11, %r9, %r29;\n"
" @%p11 bra $Lt_1_12546;\n"
"$Lt_1_12034:\n"
"$Lt_1_11522:\n"
"$Lt_1_11010:\n"
"$Lt_1_7426:\n"
" .loc 16 79 0\n"
" exit;\n"
"$LDWend_kernel_calc_cell_counts:\n"
" }\n"
" .entry transpose (\n"
" .param .u64 __cudaparm_transpose_out,\n"
" .param .u64 __cudaparm_transpose_in,\n"
" .param .s32 __cudaparm_transpose_columns_in,\n"
" .param .s32 __cudaparm_transpose_rows_in)\n"
" {\n"
" .reg .u32 %r<32>;\n"
" .reg .u64 %rd<23>;\n"
" .reg .f32 %f<4>;\n"
" .reg .pred %p<4>;\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32571_32_non_const_block112[288];\n"
" .loc 16 86 0\n"
"$LDWbegin_transpose:\n"
" mov.u32 %r1, %ctaid.x;\n"
" mul.lo.u32 %r2, %r1, 8;\n"
" mov.u32 %r3, %ctaid.y;\n"
" mul.lo.u32 %r4, %r3, 8;\n"
" mov.u32 %r5, %tid.x;\n"
" add.u32 %r6, %r2, %r5;\n"
" mov.u32 %r7, %tid.y;\n"
" add.u32 %r8, %r4, %r7;\n"
" ld.param.s32 %r9, [__cudaparm_transpose_rows_in];\n"
" ld.param.s32 %r10, [__cudaparm_transpose_columns_in];\n"
" set.gt.u32.u32 %r11, %r9, %r8;\n"
" neg.s32 %r12, %r11;\n"
" set.gt.u32.u32 %r13, %r10, %r6;\n"
" neg.s32 %r14, %r13;\n"
" and.b32 %r15, %r12, %r14;\n"
" mov.u32 %r16, 0;\n"
" setp.eq.s32 %p1, %r15, %r16;\n"
" @%p1 bra $Lt_2_2306;\n"
" .loc 16 98 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32571_32_non_const_block112;\n"
" ld.param.u64 %rd2, [__cudaparm_transpose_in];\n"
" mul.lo.u32 %r17, %r10, %r8;\n"
" add.u32 %r18, %r6, %r17;\n"
" cvt.u64.u32 %rd3, %r18;\n"
" mul.wide.u32 %rd4, %r18, 4;\n"
" add.u64 %rd5, %rd2, %rd4;\n"
" ld.global.s32 %r19, [%rd5+0];\n"
" cvt.rn.f32.s32 %f1, %r19;\n"
" cvt.u64.u32 %rd6, %r5;\n"
" cvt.u64.u32 %rd7, %r7;\n"
" mul.wide.u32 %rd8, %r7, 9;\n"
" add.u64 %rd9, %rd6, %rd8;\n"
" mul.lo.u64 %rd10, %rd9, 4;\n"
" add.u64 %rd11, %rd1, %rd10;\n"
" st.shared.f32 [%rd11+0], %f1;\n"
"$Lt_2_2306:\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32571_32_non_const_block112;\n"
" .loc 16 100 0\n"
" bar.sync 0;\n"
" add.u32 %r20, %r2, %r7;\n"
" add.u32 %r21, %r4, %r5;\n"
" set.gt.u32.u32 %r22, %r9, %r21;\n"
" neg.s32 %r23, %r22;\n"
" set.gt.u32.u32 %r24, %r10, %r20;\n"
" neg.s32 %r25, %r24;\n"
" and.b32 %r26, %r23, %r25;\n"
" mov.u32 %r27, 0;\n"
" setp.eq.s32 %p2, %r26, %r27;\n"
" @%p2 bra $Lt_2_2818;\n"
" .loc 16 105 0\n"
" cvt.u64.u32 %rd12, %r7;\n"
" cvt.u64.u32 %rd13, %r5;\n"
" mul.wide.u32 %rd14, %r5, 9;\n"
" add.u64 %rd15, %rd12, %rd14;\n"
" mul.lo.u64 %rd16, %rd15, 4;\n"
" add.u64 %rd17, %rd1, %rd16;\n"
" ld.shared.f32 %f2, [%rd17+0];\n"
" cvt.rzi.ftz.s32.f32 %r28, %f2;\n"
" ld.param.u64 %rd18, [__cudaparm_transpose_out];\n"
" mul.lo.u32 %r29, %r9, %r20;\n"
" add.u32 %r30, %r21, %r29;\n"
" cvt.u64.u32 %rd19, %r30;\n"
" mul.wide.u32 %rd20, %r30, 4;\n"
" add.u64 %rd21, %rd18, %rd20;\n"
" st.global.s32 [%rd21+0], %r28;\n"
"$Lt_2_2818:\n"
" .loc 16 106 0\n"
" exit;\n"
"$LDWend_transpose:\n"
" }\n"
" .entry calc_neigh_list_cell (\n"
" .param .u64 __cudaparm_calc_neigh_list_cell_x_,\n"
" .param .u64 __cudaparm_calc_neigh_list_cell_cell_particle_id,\n"
" .param .u64 __cudaparm_calc_neigh_list_cell_cell_counts,\n"
" .param .u64 __cudaparm_calc_neigh_list_cell_nbor_list,\n"
" .param .u64 __cudaparm_calc_neigh_list_cell_host_nbor_list,\n"
" .param .u64 __cudaparm_calc_neigh_list_cell_host_numj,\n"
" .param .s32 __cudaparm_calc_neigh_list_cell_neigh_bin_size,\n"
" .param .f32 __cudaparm_calc_neigh_list_cell_cell_size,\n"
" .param .s32 __cudaparm_calc_neigh_list_cell_ncellx,\n"
" .param .s32 __cudaparm_calc_neigh_list_cell_ncelly,\n"
" .param .s32 __cudaparm_calc_neigh_list_cell_ncellz,\n"
" .param .s32 __cudaparm_calc_neigh_list_cell_inum,\n"
" .param .s32 __cudaparm_calc_neigh_list_cell_nt,\n"
" .param .s32 __cudaparm_calc_neigh_list_cell_nall,\n"
" .param .s32 __cudaparm_calc_neigh_list_cell_t_per_atom)\n"
" {\n"
" .reg .u32 %r<118>;\n"
" .reg .u64 %rd<52>;\n"
" .reg .f32 %f<41>;\n"
" .reg .f64 %fd<4>;\n"
" .reg .pred %p<23>;\n"
" .shared .align 16 .b8 __cuda___cuda_local_var_32609_34_non_const_pos_sh496[2048];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32608_31_non_const_cell_list_sh2544[512];\n"
" .loc 16 116 0\n"
"$LDWbegin_calc_neigh_list_cell:\n"
" .loc 16 128 0\n"
" ld.param.s32 %r1, [__cudaparm_calc_neigh_list_cell_ncelly];\n"
" mov.u32 %r2, %ctaid.y;\n"
" rem.u32 %r3, %r2, %r1;\n"
" div.u32 %r4, %r2, %r1;\n"
" ld.param.s32 %r5, [__cudaparm_calc_neigh_list_cell_ncellx];\n"
" mul.lo.s32 %r6, %r5, %r3;\n"
" mul.lo.s32 %r7, %r5, %r4;\n"
" mul.lo.s32 %r8, %r7, %r1;\n"
" cvt.s32.u32 %r9, %ctaid.x;\n"
" ld.param.u64 %rd1, [__cudaparm_calc_neigh_list_cell_cell_counts];\n"
" add.s32 %r10, %r6, %r8;\n"
" add.s32 %r11, %r9, %r10;\n"
" cvt.s64.s32 %rd2, %r11;\n"
" mul.wide.s32 %rd3, %r11, 4;\n"
" add.u64 %rd4, %rd1, %rd3;\n"
" ldu.global.s32 %r12, [%rd4+0];\n"
" .loc 16 129 0\n"
" ldu.global.s32 %r13, [%rd4+4];\n"
" .loc 16 137 0\n"
" sub.s32 %r14, %r13, %r12;\n"
" mov.u32 %r15, %ntid.x;\n"
" cvt.rn.f32.u32 %f1, %r15;\n"
" cvt.rn.f32.s32 %f2, %r14;\n"
" div.approx.ftz.f32 %f3, %f2, %f1;\n"
" cvt.rpi.ftz.f32.f32 %f4, %f3;\n"
" cvt.rzi.ftz.s32.f32 %r16, %f4;\n"
" mov.u32 %r17, 0;\n"
" setp.le.s32 %p1, %r16, %r17;\n"
" @%p1 bra $Lt_3_14082;\n"
" sub.s32 %r18, %r3, 1;\n"
" mov.s32 %r19, 0;\n"
" max.s32 %r20, %r18, %r19;\n"
" sub.s32 %r21, %r1, 1;\n"
" add.s32 %r22, %r3, 1;\n"
" min.s32 %r23, %r21, %r22;\n"
" ld.param.s32 %r24, [__cudaparm_calc_neigh_list_cell_ncellz];\n"
" sub.s32 %r25, %r24, 1;\n"
" add.s32 %r26, %r4, 1;\n"
" min.s32 %r27, %r25, %r26;\n"
" sub.s32 %r28, %r9, 1;\n"
" mov.s32 %r29, 0;\n"
" max.s32 %r30, %r28, %r29;\n"
" add.s32 %r31, %r9, 1;\n"
" sub.s32 %r32, %r5, 1;\n"
" min.s32 %r33, %r31, %r32;\n"
" mov.s32 %r34, %r16;\n"
" cvt.s32.u32 %r35, %tid.x;\n"
" add.s32 %r36, %r12, %r35;\n"
" mov.u32 %r37, 0;\n"
" ld.param.s32 %r38, [__cudaparm_calc_neigh_list_cell_inum];\n"
" cvt.s64.s32 %rd5, %r38;\n"
" sub.s32 %r39, %r4, 1;\n"
" mov.s32 %r40, %r36;\n"
" mov.s32 %r41, 0;\n"
" max.s32 %r42, %r39, %r41;\n"
" setp.ge.s32 %p2, %r27, %r42;\n"
" ld.param.s32 %r43, [__cudaparm_calc_neigh_list_cell_nt];\n"
" ld.param.s32 %r44, [__cudaparm_calc_neigh_list_cell_nall];\n"
" mov.s32 %r45, 0;\n"
" mov.u64 %rd6, __cuda___cuda_local_var_32609_34_non_const_pos_sh496;\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32608_31_non_const_cell_list_sh2544;\n"
" mov.s32 %r46, %r34;\n"
"$Lt_3_14594:\n"
" .loc 16 140 0\n"
" mov.s32 %r47, %r44;\n"
" setp.ge.s32 %p3, %r40, %r13;\n"
" @%p3 bra $Lt_3_14850;\n"
" .loc 16 146 0\n"
" ld.param.u64 %rd8, [__cudaparm_calc_neigh_list_cell_cell_particle_id];\n"
" add.u32 %r48, %r36, %r37;\n"
" cvt.s64.s32 %rd9, %r48;\n"
" mul.wide.s32 %rd10, %r48, 4;\n"
" add.u64 %rd11, %rd8, %rd10;\n"
" ld.global.s32 %r47, [%rd11+0];\n"
"$Lt_3_14850:\n"
" setp.lt.s32 %p4, %r47, %r43;\n"
" @!%p4 bra $Lt_3_15362;\n"
" .loc 16 149 0\n"
" mov.u32 %r49, %r47;\n"
" mov.s32 %r50, 0;\n"
" mov.u32 %r51, %r50;\n"
" mov.s32 %r52, 0;\n"
" mov.u32 %r53, %r52;\n"
" mov.s32 %r54, 0;\n"
" mov.u32 %r55, %r54;\n"
" tex.1d.v4.f32.s32 {%f5,%f6,%f7,%f8},[neigh_tex,{%r49,%r51,%r53,%r55}];\n"
" mov.f32 %f9, %f5;\n"
" mov.f32 %f10, %f6;\n"
" mov.f32 %f11, %f7;\n"
" mov.f32 %f12, %f9;\n"
" mov.f32 %f13, %f10;\n"
" mov.f32 %f14, %f11;\n"
"$Lt_3_15362:\n"
" cvt.s64.s32 %rd12, %r47;\n"
" mul.wide.s32 %rd13, %r47, 4;\n"
" setp.ge.s32 %p5, %r47, %r38;\n"
" @%p5 bra $Lt_3_16130;\n"
" .loc 16 153 0\n"
" ld.param.u64 %rd14, [__cudaparm_calc_neigh_list_cell_nbor_list];\n"
" add.u64 %rd15, %rd12, %rd5;\n"
" mul.lo.u64 %rd16, %rd15, 4;\n"
" add.u64 %rd17, %rd14, %rd16;\n"
" mov.s64 %rd18, %rd17;\n"
" .loc 16 154 0\n"
" ld.param.s32 %r56, [__cudaparm_calc_neigh_list_cell_t_per_atom];\n"
" sub.s32 %r57, %r56, 1;\n"
" mul.lo.s32 %r58, %r47, %r57;\n"
" cvt.s64.s32 %rd19, %r58;\n"
" add.u64 %rd20, %rd19, %rd5;\n"
" mul.lo.u64 %rd21, %rd20, 4;\n"
" add.u64 %rd22, %rd17, %rd21;\n"
" .loc 16 155 0\n"
" mul.lo.s32 %r59, %r56, %r38;\n"
" sub.s32 %r60, %r59, %r56;\n"
" .loc 16 156 0\n"
" add.u64 %rd23, %rd13, %rd14;\n"
" st.global.s32 [%rd23+0], %r47;\n"
" bra.uni $Lt_3_15874;\n"
"$Lt_3_16130:\n"
" .loc 16 159 0\n"
" ld.param.u64 %rd24, [__cudaparm_calc_neigh_list_cell_host_numj];\n"
" add.u64 %rd25, %rd24, %rd13;\n"
" mul.lo.u64 %rd26, %rd5, 4;\n"
" sub.u64 %rd18, %rd25, %rd26;\n"
" .loc 16 160 0\n"
" ld.param.u64 %rd27, [__cudaparm_calc_neigh_list_cell_host_nbor_list];\n"
" ld.param.s32 %r61, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];\n"
" sub.s32 %r62, %r47, %r38;\n"
" mul.lo.s32 %r63, %r61, %r62;\n"
" cvt.s64.s32 %rd28, %r63;\n"
" mul.wide.s32 %rd29, %r63, 4;\n"
" add.u64 %rd22, %rd27, %rd29;\n"
" mov.s32 %r60, 0;\n"
"$Lt_3_15874:\n"
" .loc 16 165 0\n"
" mov.s32 %r64, %r42;\n"
" @!%p2 bra $Lt_3_24066;\n"
" sub.s32 %r65, %r27, %r42;\n"
" add.s32 %r66, %r65, 1;\n"
" setp.le.s32 %p6, %r20, %r23;\n"
" add.s32 %r67, %r27, 1;\n"
" mov.s32 %r68, 0;\n"
" mov.s32 %r69, %r66;\n"
"$Lt_3_16898:\n"
" .loc 16 166 0\n"
" mov.s32 %r70, %r20;\n"
" @!%p6 bra $Lt_3_17154;\n"
" sub.s32 %r71, %r23, %r20;\n"
" add.s32 %r72, %r71, 1;\n"
" setp.ge.s32 %p7, %r33, %r30;\n"
" add.s32 %r73, %r23, 1;\n"
" mov.s32 %r74, %r72;\n"
"$Lt_3_17666:\n"
" @!%p7 bra $Lt_3_17922;\n"
" sub.s32 %r75, %r33, %r30;\n"
" add.s32 %r76, %r75, 1;\n"
" mul.lo.s32 %r77, %r70, %r5;\n"
" mul.lo.s32 %r78, %r64, %r5;\n"
" mul.lo.s32 %r79, %r78, %r1;\n"
" add.s32 %r80, %r33, 1;\n"
" add.s32 %r81, %r77, %r79;\n"
" add.s32 %r82, %r81, %r30;\n"
" add.s32 %r83, %r80, %r81;\n"
" cvt.s64.s32 %rd30, %r82;\n"
" mul.wide.s32 %rd31, %r82, 4;\n"
" add.u64 %rd32, %rd1, %rd31;\n"
" mov.s32 %r84, %r76;\n"
"$Lt_3_18434:\n"
" .loc 16 171 0\n"
" ld.global.s32 %r85, [%rd32+0];\n"
" .loc 16 172 0\n"
" ld.global.s32 %r86, [%rd32+4];\n"
" .loc 16 176 0\n"
" sub.s32 %r87, %r86, %r85;\n"
" cvt.rn.f32.s32 %f15, %r87;\n"
" mov.f32 %f16, 0f43000000; \n"
" div.approx.ftz.f32 %f17, %f15, %f16;\n"
" cvt.rpi.ftz.f32.f32 %f18, %f17;\n"
" cvt.rzi.ftz.s32.f32 %r88, %f18;\n"
" mov.u32 %r89, 0;\n"
" setp.le.s32 %p8, %r88, %r89;\n"
" @%p8 bra $Lt_3_18690;\n"
" mov.s32 %r90, %r88;\n"
" mov.s32 %r91, 0;\n"
" setp.lt.s32 %p9, %r47, %r43;\n"
" mul.lo.s32 %r92, %r88, 128;\n"
" mov.s32 %r93, %r90;\n"
"$Lt_3_19202:\n"
" sub.s32 %r94, %r87, %r91;\n"
" mov.s32 %r95, 128;\n"
" min.s32 %r96, %r94, %r95;\n"
" setp.le.s32 %p10, %r96, %r35;\n"
" @%p10 bra $Lt_3_19458;\n"
" .loc 16 183 0\n"
" ld.param.u64 %rd33, [__cudaparm_calc_neigh_list_cell_cell_particle_id];\n"
" add.s32 %r97, %r91, %r35;\n"
" add.s32 %r98, %r85, %r97;\n"
" cvt.s64.s32 %rd34, %r98;\n"
" mul.wide.s32 %rd35, %r98, 4;\n"
" add.u64 %rd36, %rd33, %rd35;\n"
" ld.global.s32 %r99, [%rd36+0];\n"
" .loc 16 184 0\n"
" cvt.s64.s32 %rd37, %r35;\n"
" mul.wide.s32 %rd38, %r35, 4;\n"
" add.u64 %rd39, %rd7, %rd38;\n"
" st.shared.s32 [%rd39+0], %r99;\n"
" .loc 16 185 0\n"
" mov.u32 %r100, %r99;\n"
" mov.s32 %r101, 0;\n"
" mov.u32 %r102, %r101;\n"
" mov.s32 %r103, 0;\n"
" mov.u32 %r104, %r103;\n"
" mov.s32 %r105, 0;\n"
" mov.u32 %r106, %r105;\n"
" tex.1d.v4.f32.s32 {%f19,%f20,%f21,%f22},[neigh_tex,{%r100,%r102,%r104,%r106}];\n"
" mov.f32 %f23, %f19;\n"
" mov.f32 %f24, %f20;\n"
" mov.f32 %f25, %f21;\n"
" .loc 16 186 0\n"
" mul.lo.u64 %rd40, %rd37, 16;\n"
" add.u64 %rd41, %rd6, %rd40;\n"
" st.shared.v2.f32 [%rd41+0], {%f23,%f24};\n"
" .loc 16 188 0\n"
" st.shared.f32 [%rd41+8], %f25;\n"
"$Lt_3_19458:\n"
" .loc 16 190 0\n"
" bar.sync 0;\n"
" @!%p9 bra $Lt_3_20482;\n"
" mov.u32 %r107, 0;\n"
" setp.le.s32 %p11, %r96, %r107;\n"
" @%p11 bra $Lt_3_20482;\n"
" mov.s32 %r108, %r96;\n"
" mov.s64 %rd42, 0;\n"
" ld.param.f32 %f26, [__cudaparm_calc_neigh_list_cell_cell_size];\n"
" mul.ftz.f32 %f27, %f26, %f26;\n"
" mov.s64 %rd43, %rd6;\n"
" mov.f32 %f28, %f14;\n"
" mov.f32 %f29, %f13;\n"
" mov.f32 %f30, %f12;\n"
" mov.s32 %r109, 0;\n"
" mov.s32 %r110, %r108;\n"
"$Lt_3_20994:\n"
" ld.shared.v4.f32 {%f31,%f32,%f33,_}, [%rd43+0];\n"
" .loc 16 196 0\n"
" sub.ftz.f32 %f34, %f30, %f31;\n"
" .loc 16 197 0\n"
" sub.ftz.f32 %f35, %f29, %f32;\n"
" .loc 16 198 0\n"
" sub.ftz.f32 %f36, %f28, %f33;\n"
" .loc 16 195 0\n"
" mul.ftz.f32 %f37, %f35, %f35;\n"
" fma.rn.ftz.f32 %f38, %f34, %f34, %f37;\n"
" fma.rn.ftz.f32 %f39, %f36, %f36, %f38;\n"
" setp.gt.ftz.f32 %p12, %f27, %f39;\n"
" @!%p12 bra $Lt_3_25346;\n"
" cvt.ftz.f64.f32 %fd1, %f39;\n"
" mov.f64 %fd2, 0d3ee4f8b588e368f1; \n"
" setp.gt.f64 %p13, %fd1, %fd2;\n"
" @!%p13 bra $Lt_3_25346;\n"
" .loc 16 202 0\n"
" add.s32 %r68, %r68, 1;\n"
" ld.param.s32 %r111, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];\n"
" setp.lt.s32 %p14, %r111, %r68;\n"
" @%p14 bra $Lt_3_25346;\n"
" .loc 16 204 0\n"
" mul.lo.u64 %rd44, %rd42, 4;\n"
" add.u64 %rd45, %rd7, %rd44;\n"
" ld.shared.s32 %r112, [%rd45+0];\n"
" st.global.s32 [%rd22+0], %r112;\n"
" cvt.s64.s32 %rd46, %r60;\n"
" mul.wide.s32 %rd47, %r60, 4;\n"
" add.u64 %rd48, %rd22, %rd47;\n"
" add.u64 %rd49, %rd48, 4;\n"
" add.u64 %rd50, %rd22, 4;\n"
" ld.param.s32 %r113, [__cudaparm_calc_neigh_list_cell_t_per_atom];\n"
" sub.s32 %r114, %r113, 1;\n"
" and.b32 %r115, %r68, %r114;\n"
" mov.s32 %r116, 0;\n"
" setp.eq.s32 %p15, %r115, %r116;\n"
" selp.u64 %rd22, %rd49, %rd50, %p15;\n"
"$Lt_3_25346:\n"
"$L_3_13570:\n"
" .loc 16 202 0\n"
" add.s32 %r109, %r109, 1;\n"
" add.s64 %rd42, %rd42, 1;\n"
" add.u64 %rd43, %rd43, 16;\n"
" setp.ne.s32 %p16, %r96, %r109;\n"
" @%p16 bra $Lt_3_20994;\n"
"$Lt_3_20482:\n"
"$Lt_3_19970:\n"
" .loc 16 212 0\n"
" bar.sync 0;\n"
" add.s32 %r91, %r91, 128;\n"
" setp.ne.s32 %p17, %r91, %r92;\n"
" @%p17 bra $Lt_3_19202;\n"
"$Lt_3_18690:\n"
" add.s32 %r82, %r82, 1;\n"
" add.u64 %rd32, %rd32, 4;\n"
" setp.ne.s32 %p18, %r82, %r83;\n"
" @%p18 bra $Lt_3_18434;\n"
"$Lt_3_17922:\n"
" add.s32 %r70, %r70, 1;\n"
" setp.ne.s32 %p19, %r73, %r70;\n"
" @%p19 bra $Lt_3_17666;\n"
"$Lt_3_17154:\n"
" add.s32 %r64, %r64, 1;\n"
" setp.ne.s32 %p20, %r67, %r64;\n"
" @%p20 bra $Lt_3_16898;\n"
" bra.uni $Lt_3_16386;\n"
"$Lt_3_24066:\n"
" mov.s32 %r68, 0;\n"
"$Lt_3_16386:\n"
" @!%p4 bra $Lt_3_23042;\n"
" .loc 16 218 0\n"
" st.global.s32 [%rd18+0], %r68;\n"
"$Lt_3_23042:\n"
" add.s32 %r45, %r45, 1;\n"
" add.u32 %r37, %r37, %r15;\n"
" add.s32 %r40, %r40, %r15;\n"
" setp.ne.s32 %p21, %r16, %r45;\n"
" @%p21 bra $Lt_3_14594;\n"
"$Lt_3_14082:\n"
" .loc 16 220 0\n"
" exit;\n"
"$LDWend_calc_neigh_list_cell:\n"
" }\n"
/* ----------------------------------------------------------------------
 * PTX for the entry point kernel_special, stored as adjacent C string
 * literals.  The notes below are C comments placed outside the literals,
 * so the PTX text handed to the driver is unchanged.
 *
 * Work index:  r8 = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom).
 * Threads with r8 >= nt exit immediately.
 *
 * Each thread walks a range [rd21, rd18) of 32-bit entries (stride r17
 * elements).  For every entry j it loads tag[j] and scans r13 entries of
 * the special array (starting at special[r8], element stride nt).  On
 * each match the entry is XORed with (1, 2 or 3) << 30 and stored back.
 * ---------------------------------------------------------------------- */
" .entry kernel_special (\n"
" .param .u64 __cudaparm_kernel_special_dev_nbor,\n"
" .param .u64 __cudaparm_kernel_special_host_nbor_list,\n"
" .param .u64 __cudaparm_kernel_special_host_numj,\n"
" .param .u64 __cudaparm_kernel_special_tag,\n"
" .param .u64 __cudaparm_kernel_special_nspecial,\n"
" .param .u64 __cudaparm_kernel_special_special,\n"
" .param .s32 __cudaparm_kernel_special_inum,\n"
" .param .s32 __cudaparm_kernel_special_nt,\n"
" .param .s32 __cudaparm_kernel_special_max_nbors,\n"
" .param .s32 __cudaparm_kernel_special_t_per_atom)\n"
" {\n"
" .reg .u32 %r<45>;\n"
" .reg .u64 %rd<45>;\n"
" .reg .pred %p<11>;\n"
" .loc 16 226 0\n"
"$LDWbegin_kernel_special:\n"
/* r1 = t_per_atom, r2 = tid.x, r8 = work index (see header); r9 = nt. */
" ld.param.s32 %r1, [__cudaparm_kernel_special_t_per_atom];\n"
" cvt.s32.u32 %r2, %tid.x;\n"
" div.s32 %r3, %r2, %r1;\n"
" cvt.s32.u32 %r4, %ntid.x;\n"
" div.s32 %r5, %r4, %r1;\n"
" cvt.s32.u32 %r6, %ctaid.x;\n"
" mul.lo.s32 %r7, %r6, %r5;\n"
" add.s32 %r8, %r3, %r7;\n"
" ld.param.s32 %r9, [__cudaparm_kernel_special_nt];\n"
" setp.ge.s32 %p1, %r8, %r9;\n"
" @%p1 bra $Lt_4_6146;\n"
" .loc 16 236 0\n"
/* Load three consecutive ints nspecial[3*r8 + 0..2] into r11, r12, r13. */
" ld.param.u64 %rd1, [__cudaparm_kernel_special_nspecial];\n"
" mul.lo.s32 %r10, %r8, 3;\n"
" cvt.s64.s32 %rd2, %r10;\n"
" mul.wide.s32 %rd3, %r10, 4;\n"
" add.u64 %rd4, %rd1, %rd3;\n"
" ld.global.s32 %r11, [%rd4+0];\n"
" .loc 16 237 0\n"
" ld.global.s32 %r12, [%rd4+4];\n"
" .loc 16 238 0\n"
" ld.global.s32 %r13, [%rd4+8];\n"
/* r8 >= inum selects the host_* arrays below; otherwise dev_nbor is used. */
" ld.param.s32 %r14, [__cudaparm_kernel_special_inum];\n"
" setp.ge.s32 %p2, %r8, %r14;\n"
" @%p2 bra $Lt_4_6914;\n"
" .loc 16 244 0\n"
/* dev_nbor path: r15 = dev_nbor[r8 + inum]. */
" ld.param.u64 %rd5, [__cudaparm_kernel_special_dev_nbor];\n"
" cvt.s64.s32 %rd6, %r8;\n"
" cvt.s64.s32 %rd7, %r14;\n"
" add.u64 %rd8, %rd6, %rd7;\n"
" mul.lo.u64 %rd9, %rd8, 4;\n"
" add.u64 %rd10, %rd5, %rd9;\n"
" ld.global.s32 %r15, [%rd10+0];\n"
" .loc 16 246 0\n"
/* Walk stride r17 = inum * t_per_atom elements. */
" mul.lo.s32 %r16, %r14, %r1;\n"
" mov.s32 %r17, %r16;\n"
" .loc 16 248 0\n"
/* rd13 = &dev_nbor[r8 + inum] + 4 * (inum + (t_per_atom - 1) * r8).
 * End pointer rd18 = rd13 + 4 * ((r15 & (t_per_atom - 1))
 *                                + inum * t_per_atom * (r15 / t_per_atom)).
 * NOTE(review): the AND with t_per_atom - 1 equals a modulo only when
 * t_per_atom is a power of two - presumably guaranteed by the caller. */
" sub.s32 %r18, %r1, 1;\n"
" mul.lo.s32 %r19, %r18, %r8;\n"
" add.s32 %r20, %r14, %r19;\n"
" cvt.s64.s32 %rd11, %r20;\n"
" mul.wide.s32 %rd12, %r20, 4;\n"
" add.u64 %rd13, %rd10, %rd12;\n"
" and.b32 %r21, %r18, %r15;\n"
" cvt.s64.s32 %rd14, %r21;\n"
" div.s32 %r22, %r15, %r1;\n"
" mul.lo.s32 %r23, %r16, %r22;\n"
" cvt.s64.s32 %rd15, %r23;\n"
" add.u64 %rd16, %rd14, %rd15;\n"
" mul.lo.u64 %rd17, %rd16, 4;\n"
" add.u64 %rd18, %rd13, %rd17;\n"
" .loc 16 249 0\n"
/* Start pointer rd21 = rd13 + 4 * (tid.x & (t_per_atom - 1)). */
" and.b32 %r24, %r18, %r2;\n"
" cvt.s64.s32 %rd19, %r24;\n"
" mul.wide.s32 %rd20, %r24, 4;\n"
" add.u64 %rd21, %rd13, %rd20;\n"
" bra.uni $Lt_4_6658;\n"
"$Lt_4_6914:\n"
" .loc 16 252 0\n"
/* host path: r25 = r8 - inum;
 * start rd21 = &host_nbor_list[max_nbors * r25]. */
" sub.s32 %r25, %r8, %r14;\n"
" ld.param.u64 %rd22, [__cudaparm_kernel_special_host_nbor_list];\n"
" ld.param.s32 %r26, [__cudaparm_kernel_special_max_nbors];\n"
" mul.lo.s32 %r27, %r26, %r25;\n"
" cvt.s64.s32 %rd23, %r27;\n"
" mul.wide.s32 %rd24, %r27, 4;\n"
" add.u64 %rd25, %rd22, %rd24;\n"
" mov.s64 %rd21, %rd25;\n"
" .loc 16 254 0\n"
/* End rd18 = start + 4 * host_numj[r25]; walk stride r17 = 1 element. */
" ld.param.u64 %rd26, [__cudaparm_kernel_special_host_numj];\n"
" cvt.s64.s32 %rd27, %r25;\n"
" mul.wide.s32 %rd28, %r25, 4;\n"
" add.u64 %rd29, %rd26, %rd28;\n"
" ld.global.s32 %r28, [%rd29+0];\n"
" cvt.s64.s32 %rd30, %r28;\n"
" mul.wide.s32 %rd31, %r28, 4;\n"
" add.u64 %rd18, %rd25, %rd31;\n"
" mov.s32 %r17, 1;\n"
"$Lt_4_6658:\n"
/* Skip the walk when the range is empty (unsigned pointer compare).
 * p4 = (r13 > 0) guards the inner scan; rd32 = stride; rd33 = tag. */
" setp.ge.u64 %p3, %rd21, %rd18;\n"
" @%p3 bra $Lt_4_7170;\n"
" mov.s32 %r29, 0;\n"
" setp.gt.s32 %p4, %r13, %r29;\n"
" cvt.s64.s32 %rd32, %r17;\n"
" ld.param.u64 %rd33, [__cudaparm_kernel_special_tag];\n"
"$Lt_4_7682:\n"
" .loc 16 258 0\n"
/* Outer loop: r30 = current entry, r31 = tag[r30]. */
" ld.global.s32 %r30, [%rd21+0];\n"
" .loc 16 259 0\n"
" cvt.s64.s32 %rd34, %r30;\n"
" mul.wide.s32 %rd35, %r30, 4;\n"
" add.u64 %rd36, %rd33, %rd35;\n"
" ld.global.s32 %r31, [%rd36+0];\n"
" @!%p4 bra $Lt_4_7938;\n"
/* Inner scan setup: rd42 = &special[r8], advanced by nt elements
 * (rd39 = 4 * nt bytes) per step; r33 counts steps from 0 to r13. */
" mov.s32 %r32, %r13;\n"
" cvt.s64.s32 %rd37, %r8;\n"
" cvt.s64.s32 %rd38, %r9;\n"
" mul.wide.s32 %rd39, %r9, 4;\n"
" ld.param.u64 %rd40, [__cudaparm_kernel_special_special];\n"
" mul.wide.s32 %rd41, %r8, 4;\n"
" add.u64 %rd42, %rd40, %rd41;\n"
" mov.s32 %r33, 0;\n"
" mov.s32 %r34, %r32;\n"
"$Lt_4_8450:\n"
" ld.global.s32 %r35, [%rd42+0];\n"
" setp.ne.s32 %p5, %r35, %r31;\n"
" @%p5 bra $Lt_4_8706;\n"
" .loc 16 269 0\n"
/* Match: r42 = 1 + (r33 >= r11) + (r33 >= r12), built from two selp
 * levels; the entry is XORed with r42 << 30. */
" setp.le.s32 %p6, %r11, %r33;\n"
" mov.s32 %r36, 3;\n"
" mov.s32 %r37, 2;\n"
" selp.s32 %r38, %r36, %r37, %p6;\n"
" mov.s32 %r39, 2;\n"
" mov.s32 %r40, 1;\n"
" selp.s32 %r41, %r39, %r40, %p6;\n"
" setp.le.s32 %p7, %r12, %r33;\n"
" selp.s32 %r42, %r38, %r41, %p7;\n"
" shl.b32 %r43, %r42, 30;\n"
" xor.b32 %r30, %r30, %r43;\n"
" .loc 16 270 0\n"
/* The updated entry is written back in place; the scan continues, so a
 * later match would XOR the already-modified value again. */
" st.global.s32 [%rd21+0], %r30;\n"
"$Lt_4_8706:\n"
" add.s32 %r33, %r33, 1;\n"
" add.u64 %rd42, %rd39, %rd42;\n"
" setp.ne.s32 %p8, %r13, %r33;\n"
" @%p8 bra $Lt_4_8450;\n"
"$Lt_4_7938:\n"
" .loc 16 257 0\n"
/* Advance the walk pointer by r17 elements; loop while rd21 < rd18. */
" mul.lo.u64 %rd43, %rd32, 4;\n"
" add.u64 %rd21, %rd21, %rd43;\n"
" setp.lt.u64 %p9, %rd21, %rd18;\n"
" @%p9 bra $Lt_4_7682;\n"
"$Lt_4_7170:\n"
"$Lt_4_6146:\n"
" .loc 16 276 0\n"
" exit;\n"
"$LDWend_kernel_special:\n"
" }\n"
;

View File

@ -1,900 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009b29_00000000-9_lal_pppm.cpp3.i (/home/sjplimp/ccBI#.sIoydv)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009b29_00000000-8_lal_pppm.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 17 "lal_pppm.cu"
.file 18 "/usr/local/cuda/include/common_functions.h"
.file 19 "/usr/local/cuda/include/math_functions.h"
.file 20 "/usr/local/cuda/include/math_constants.h"
.file 21 "/usr/local/cuda/include/device_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
.global .texref pos_tex;
.global .texref q_tex;
// ----------------------------------------------------------------------
// particle_map: each thread processes a single index r12.
//   - Fetches a 4-float texel from pos_tex and from q_tex at index r12.
//   - Scales the first q_tex component by delvolinv; exits if that is 0.
//   - Converts the first three pos_tex components to grid coordinates
//     (delXinv * (coord - b_lo_X)), truncated toward zero.
//   - Out-of-range coordinates store 1 to *error.
//   - Otherwise atomically increments counts[cell]; if the previous
//     count is >= max_atoms, stores 2 to *error and undoes the increment.
//   - Otherwise writes four doubles to ans at element index
//     cell + atom_stride * previous_count (32 bytes per element).
// NOTE(review): x_ and q_ are declared but never read in this body; the
// data comes from the pos_tex / q_tex texture references.
// ----------------------------------------------------------------------
.entry particle_map (
.param .u64 __cudaparm_particle_map_x_,
.param .u64 __cudaparm_particle_map_q_,
.param .f64 __cudaparm_particle_map_delvolinv,
.param .s32 __cudaparm_particle_map_nlocal,
.param .u64 __cudaparm_particle_map_counts,
.param .u64 __cudaparm_particle_map_ans,
.param .f64 __cudaparm_particle_map_b_lo_x,
.param .f64 __cudaparm_particle_map_b_lo_y,
.param .f64 __cudaparm_particle_map_b_lo_z,
.param .f64 __cudaparm_particle_map_delxinv,
.param .f64 __cudaparm_particle_map_delyinv,
.param .f64 __cudaparm_particle_map_delzinv,
.param .s32 __cudaparm_particle_map_nlocal_x,
.param .s32 __cudaparm_particle_map_nlocal_y,
.param .s32 __cudaparm_particle_map_nlocal_z,
.param .s32 __cudaparm_particle_map_atom_stride,
.param .s32 __cudaparm_particle_map_max_atoms,
.param .u64 __cudaparm_particle_map_error)
{
.reg .u32 %r<50>;
.reg .u64 %rd<12>;
.reg .f32 %f<14>;
.reg .f64 %fd<36>;
.reg .pred %p<11>;
.loc 17 50 0
$LDWbegin_particle_map:
// r7 = ctaid.x * ntid.x + tid.x (24-bit multiply); r5 = nctaid.x * ntid.x.
// Index r12 = 64*r7 - (r5 - 1) * ((64*r7) / r5).
cvt.s32.u32 %r1, %ntid.x;
cvt.s32.u32 %r2, %ctaid.x;
mul24.lo.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %nctaid.x;
mul24.lo.s32 %r5, %r4, %r1;
mov.u32 %r6, %tid.x;
add.u32 %r7, %r3, %r6;
sub.s32 %r8, %r5, 1;
mul.lo.s32 %r9, %r7, 64;
div.s32 %r10, %r9, %r5;
mul.lo.s32 %r11, %r8, %r10;
sub.s32 %r12, %r9, %r11;
// Exit when nlocal <= r12.
ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];
setp.le.s32 %p1, %r13, %r12;
@%p1 bra $Lt_0_7426;
.loc 17 62 0
// Texture fetch from pos_tex at r12; f5, f6, f7 keep components 0..2.
mov.u32 %r14, %r12;
mov.s32 %r15, 0;
mov.u32 %r16, %r15;
mov.s32 %r17, 0;
mov.u32 %r18, %r17;
mov.s32 %r19, 0;
mov.u32 %r20, %r19;
tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}];
mov.f32 %f5, %f1;
mov.f32 %f6, %f2;
mov.f32 %f7, %f3;
.loc 17 64 0
// Texture fetch from q_tex at r12; only component 0 (f12) is used.
mov.u32 %r21, %r12;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
mov.s32 %r26, 0;
mov.u32 %r27, %r26;
tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}];
mov.f32 %f12, %f8;
// fd3 = (double)f12 * delvolinv; exit when fd3 compares equal to 0.
cvt.ftz.f64.f32 %fd1, %f12;
ld.param.f64 %fd2, [__cudaparm_particle_map_delvolinv];
mul.f64 %fd3, %fd1, %fd2;
mov.f64 %fd4, 0d0000000000000000; // 0
setp.neu.f64 %p2, %fd3, %fd4;
@!%p2 bra $Lt_0_7426;
.loc 17 67 0
// fd9 = delxinv * (f5 - b_lo_x); negative -> error path.
ld.param.f64 %fd5, [__cudaparm_particle_map_delxinv];
cvt.ftz.f64.f32 %fd6, %f5;
ld.param.f64 %fd7, [__cudaparm_particle_map_b_lo_x];
sub.f64 %fd8, %fd6, %fd7;
mul.f64 %fd9, %fd5, %fd8;
mov.f64 %fd10, 0d0000000000000000; // 0
setp.lt.f64 %p3, %fd9, %fd10;
@%p3 bra $Lt_0_8706;
// fd15 = delyinv * (f6 - b_lo_y); negative -> error path.
ld.param.f64 %fd11, [__cudaparm_particle_map_delyinv];
cvt.ftz.f64.f32 %fd12, %f6;
ld.param.f64 %fd13, [__cudaparm_particle_map_b_lo_y];
sub.f64 %fd14, %fd12, %fd13;
mul.f64 %fd15, %fd11, %fd14;
mov.f64 %fd16, 0d0000000000000000; // 0
setp.lt.f64 %p4, %fd15, %fd16;
@%p4 bra $Lt_0_8706;
// fd21 = delzinv * (f7 - b_lo_z); negative -> error path.
ld.param.f64 %fd17, [__cudaparm_particle_map_delzinv];
cvt.ftz.f64.f32 %fd18, %f7;
ld.param.f64 %fd19, [__cudaparm_particle_map_b_lo_z];
sub.f64 %fd20, %fd18, %fd19;
mul.f64 %fd21, %fd17, %fd20;
mov.f64 %fd22, 0d0000000000000000; // 0
setp.lt.f64 %p5, %fd21, %fd22;
@%p5 bra $Lt_0_8706;
// Truncate toward zero to integer cell coordinates r28, r30, r32 and
// check each against nlocal_x / nlocal_y / nlocal_z (upper bound exclusive).
cvt.rzi.s32.f64 %r28, %fd9;
ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];
setp.ge.s32 %p6, %r28, %r29;
@%p6 bra $Lt_0_8706;
cvt.rzi.s32.f64 %r30, %fd15;
ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];
setp.ge.s32 %p7, %r30, %r31;
@%p7 bra $Lt_0_8706;
cvt.rzi.s32.f64 %r32, %fd21;
ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];
setp.gt.s32 %p8, %r33, %r32;
@%p8 bra $L_0_4866;
$Lt_0_8706:
$L_0_5122:
.loc 17 76 0
// Error path: coordinate out of range -> *error = 1, then exit.
mov.s32 %r34, 1;
ld.param.u64 %rd1, [__cudaparm_particle_map_error];
st.global.s32 [%rd1+0], %r34;
bra.uni $Lt_0_7426;
$L_0_4866:
.loc 17 83 0
// Cell index r38 = r28 + nlocal_x * (r30 + nlocal_y * r32).
mul.lo.s32 %r35, %r32, %r31;
add.s32 %r36, %r30, %r35;
mul.lo.s32 %r37, %r36, %r29;
add.s32 %r38, %r28, %r37;
ld.param.u64 %rd2, [__cudaparm_particle_map_counts];
cvt.s64.s32 %rd3, %r38;
mul.wide.s32 %rd4, %r38, 4;
add.u64 %rd5, %rd2, %rd4;
// Atomic counts[r38] += 1; r41 receives the value before the add.
mov.s32 %r39, 1;
atom.global.add.s32 %r40, [%rd5], %r39;
mov.s32 %r41, %r40;
ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];
setp.gt.s32 %p9, %r42, %r41;
@%p9 bra $Lt_0_7682;
.loc 17 85 0
// Previous count >= max_atoms: *error = 2 and roll the counter back.
mov.s32 %r43, 2;
ld.param.u64 %rd6, [__cudaparm_particle_map_error];
st.global.s32 [%rd6+0], %r43;
.loc 16 118 0
mov.s32 %r44, -1;
atom.global.add.s32 %r45, [%rd5], %r44;
bra.uni $Lt_0_7426;
$Lt_0_7682:
.loc 17 88 0
// Output slot rd10 = ans + 32 * (r38 + atom_stride * r41).
ld.param.u64 %rd7, [__cudaparm_particle_map_ans];
ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];
mul.lo.s32 %r47, %r46, %r41;
add.s32 %r48, %r38, %r47;
cvt.s64.s32 %rd8, %r48;
mul.wide.s32 %rd9, %r48, 32;
add.u64 %rd10, %rd7, %rd9;
// Stored values: (cell + 0.5) - scaled coordinate for each axis, then fd3.
cvt.rn.f64.s32 %fd23, %r28;
mov.f64 %fd24, 0d3fe0000000000000; // 0.5
add.f64 %fd25, %fd23, %fd24;
sub.f64 %fd26, %fd25, %fd9;
cvt.rn.f64.s32 %fd27, %r30;
mov.f64 %fd28, 0d3fe0000000000000; // 0.5
add.f64 %fd29, %fd27, %fd28;
sub.f64 %fd30, %fd29, %fd15;
st.global.v2.f64 [%rd10+0], {%fd26,%fd30};
cvt.rn.f64.s32 %fd31, %r32;
mov.f64 %fd32, 0d3fe0000000000000; // 0.5
add.f64 %fd33, %fd31, %fd32;
sub.f64 %fd34, %fd33, %fd21;
st.global.v2.f64 [%rd10+16], {%fd34,%fd3};
$Lt_0_7426:
$L_0_4610:
$Lt_0_6914:
$Lt_0_6402:
.loc 17 92 0
exit;
$LDWend_particle_map:
} // particle_map
// ----------------------------------------------------------------------
// make_rho: accumulates per-cell records from `atoms` into `brick`.
//   - Stages (order + order2) doubles of _rho_coeff into shared memory.
//   - Each group of 32 threads (tid.x / 32) handles one (r38, r48) pair
//     derived from ctaid.x * 2 + tid.x / 32 via / and % by npts_y.
//   - The outer loop advances r58 in steps of 32 over npts_x / 32 + 1
//     chunks; per chunk, each thread accumulates `order` partial sums in
//     the shared `ans` array, which are then folded into the shared
//     `front` array and one value per thread is written to brick.
// Shared layout as indexed below: ans[k * 64 + tid.x] (512-byte stride per
// k), front[(tid.x % 32) + 40 * (tid.x / 32) (+32)].
// NOTE(review): the 64-element ans stride and the ctaid.x * 2 factor
// suggest a 64-thread block - confirm against the launch configuration.
// ----------------------------------------------------------------------
.entry make_rho (
.param .u64 __cudaparm_make_rho_counts,
.param .u64 __cudaparm_make_rho_atoms,
.param .u64 __cudaparm_make_rho_brick,
.param .u64 __cudaparm_make_rho__rho_coeff,
.param .s32 __cudaparm_make_rho_atom_stride,
.param .s32 __cudaparm_make_rho_npts_x,
.param .s32 __cudaparm_make_rho_npts_y,
.param .s32 __cudaparm_make_rho_npts_z,
.param .s32 __cudaparm_make_rho_nlocal_x,
.param .s32 __cudaparm_make_rho_nlocal_y,
.param .s32 __cudaparm_make_rho_nlocal_z,
.param .s32 __cudaparm_make_rho_order_m_1,
.param .s32 __cudaparm_make_rho_order,
.param .s32 __cudaparm_make_rho_order2)
{
.reg .u32 %r<119>;
.reg .u64 %rd<57>;
.reg .f64 %fd<26>;
.reg .pred %p<27>;
.shared .align 8 .b8 __cuda___cuda_local_var_32578_34_non_const_rho_coeff200[512];
.shared .align 8 .b8 __cuda___cuda_local_var_32579_34_non_const_front712[640];
.shared .align 8 .b8 __cuda___cuda_local_var_32580_34_non_const_ans1352[4096];
.loc 17 101 0
$LDWbegin_make_rho:
// r1 = order2, r2 = order, r3 = order2 + order, r4 = tid.x.
// Threads with tid.x < r3 copy _rho_coeff[tid.x] into shared rho_coeff.
ld.param.s32 %r1, [__cudaparm_make_rho_order2];
ld.param.s32 %r2, [__cudaparm_make_rho_order];
add.s32 %r3, %r1, %r2;
cvt.s32.u32 %r4, %tid.x;
setp.le.s32 %p1, %r3, %r4;
@%p1 bra $Lt_1_16898;
.loc 17 108 0
mov.u64 %rd1, __cuda___cuda_local_var_32578_34_non_const_rho_coeff200;
cvt.s64.s32 %rd2, %r4;
mul.wide.s32 %rd3, %r4, 8;
ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];
add.u64 %rd5, %rd4, %rd3;
ld.global.f64 %fd1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f64 [%rd6+0], %fd1;
$Lt_1_16898:
mov.u64 %rd1, __cuda___cuda_local_var_32578_34_non_const_rho_coeff200;
// Signed divide-by-32 idiom: r9 = tid.x / 32, r11 = tid.x % 32.
shr.s32 %r5, %r4, 31;
mov.s32 %r6, 31;
and.b32 %r7, %r5, %r6;
add.s32 %r8, %r7, %r4;
shr.s32 %r9, %r8, 5;
mul.lo.s32 %r10, %r9, 32;
sub.s32 %r11, %r4, %r10;
// p2 = (tid.x % 32 < order); those threads zero front[r11 + 40*r16 + 32]
// (the +256 byte offset is 32 doubles).
setp.lt.s32 %p2, %r11, %r2;
@!%p2 bra $Lt_1_17410;
.loc 17 114 0
mov.u64 %rd7, __cuda___cuda_local_var_32579_34_non_const_front712;
mov.f64 %fd2, 0d0000000000000000; // 0
cvt.s64.s32 %rd8, %r11;
shr.s32 %r12, %r4, 31;
mov.s32 %r13, 31;
and.b32 %r14, %r12, %r13;
add.s32 %r15, %r14, %r4;
shr.s32 %r16, %r15, 5;
cvt.s64.s32 %rd9, %r16;
mul.wide.s32 %rd10, %r16, 40;
add.u64 %rd11, %rd8, %rd10;
mul.lo.u64 %rd12, %rd11, 8;
add.u64 %rd13, %rd7, %rd12;
st.shared.f64 [%rd13+256], %fd2;
$Lt_1_17410:
mov.u64 %rd7, __cuda___cuda_local_var_32579_34_non_const_front712;
.loc 17 116 0
// Barrier: shared rho_coeff and front initialisation visible to the block.
bar.sync 0;
// r23 = npts_x / 32 + 1 = number of 32-wide chunks; exit if <= 0.
ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];
shr.s32 %r18, %r17, 31;
mov.s32 %r19, 31;
and.b32 %r20, %r18, %r19;
add.s32 %r21, %r20, %r17;
shr.s32 %r22, %r21, 5;
add.s32 %r23, %r22, 1;
mov.u32 %r24, 0;
setp.le.s32 %p3, %r23, %r24;
@%p3 bra $Lt_1_17922;
// Loop-invariant setup.
//   r29 = tid.x / 32, r33 = nlocal_y * nlocal_x,
//   r36 = r29 + 2 * ctaid.x, r38 = r36 / npts_y, r48 = r36 % npts_y.
shr.s32 %r25, %r4, 31;
mov.s32 %r26, 31;
and.b32 %r27, %r25, %r26;
add.s32 %r28, %r27, %r4;
shr.s32 %r29, %r28, 5;
add.s32 %r30, %r11, 32;
ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];
ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];
mul.lo.s32 %r33, %r31, %r32;
mov.u32 %r34, %ctaid.x;
mul.lo.u32 %r35, %r34, 2;
add.u32 %r36, %r29, %r35;
ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];
div.s32 %r38, %r36, %r37;
// Range [r42, r47) for the r71 loop:
//   r42 = (r38 < order_m_1) ? order_m_1 - r38 : 0
//   r47 = (r38 >= nlocal_z) ? nlocal_z - r38 + order - 1 : order
ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];
setp.lt.s32 %p4, %r38, %r39;
sub.s32 %r40, %r39, %r38;
mov.s32 %r41, 0;
selp.s32 %r42, %r40, %r41, %p4;
ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];
setp.ge.s32 %p5, %r38, %r43;
sub.s32 %r44, %r43, %r38;
add.s32 %r45, %r44, %r2;
sub.s32 %r46, %r45, 1;
selp.s32 %r47, %r46, %r2, %p5;
// Range [r51, r55) for the r74 loop, same construction with r48 and nlocal_y.
rem.s32 %r48, %r36, %r37;
setp.lt.s32 %p6, %r48, %r39;
sub.s32 %r49, %r39, %r48;
mov.s32 %r50, 0;
selp.s32 %r51, %r49, %r50, %p6;
setp.ge.s32 %p7, %r48, %r31;
sub.s32 %r52, %r31, %r48;
add.s32 %r53, %r52, %r2;
sub.s32 %r54, %r53, 1;
selp.s32 %r55, %r54, %r2, %p7;
// p8 = (order > 0); r58 = chunk offset (0, 32, ...); r59 = 32 * r23 = loop end.
// rd19 = &front[r11 + 40 * r29]; r61 = (npts_z > r38) as 0/1; rd20 = ans.
mov.s32 %r56, %r23;
mov.s32 %r57, 0;
setp.gt.s32 %p8, %r2, %r57;
mov.s32 %r58, 0;
cvt.s64.s32 %rd14, %r11;
cvt.s64.s32 %rd15, %r29;
mul.lo.s32 %r59, %r23, 32;
mul.wide.s32 %rd16, %r29, 40;
add.u64 %rd17, %rd14, %rd16;
ld.param.s32 %r60, [__cudaparm_make_rho_npts_z];
setp.gt.s32 %p9, %r60, %r38;
mul.lo.u64 %rd18, %rd17, 8;
selp.s32 %r61, 1, 0, %p9;
add.u64 %rd19, %rd18, %rd7;
mov.u64 %rd20, __cuda___cuda_local_var_32580_34_non_const_ans1352;
mov.s32 %r62, %r56;
$Lt_1_18434:
 //<loop> Loop body line 116, nesting depth: 1, estimated iterations: unknown
// Zero this thread's `order` accumulators: ans[k * 64 + tid.x], k = 0..order-1.
@!%p8 bra $Lt_1_18690;
mov.s32 %r63, %r2;
cvt.s64.s32 %rd21, %r4;
mul.wide.s32 %rd22, %r4, 8;
add.u64 %rd23, %rd20, %rd22;
mov.s32 %r64, 0;
mov.s32 %r65, %r63;
$Lt_1_19202:
 //<loop> Loop body line 116, nesting depth: 2, estimated iterations: unknown
.loc 17 140 0
mov.f64 %fd3, 0d0000000000000000; // 0
st.shared.f64 [%rd23+0], %fd3;
add.s32 %r64, %r64, 1;
add.u64 %rd23, %rd23, 512;
setp.ne.s32 %p10, %r64, %r2;
@%p10 bra $Lt_1_19202;
$Lt_1_18690:
// r66 = r11 + r58.  Accumulate only if r66 < nlocal_x and r38 < npts_z
// (set.lt yields all-ones for true; neg turns that into 1).
add.s32 %r66, %r11, %r58;
set.lt.u32.s32 %r67, %r66, %r32;
neg.s32 %r68, %r67;
and.b32 %r69, %r61, %r68;
mov.u32 %r70, 0;
setp.eq.s32 %p11, %r69, %r70;
@%p11 bra $Lt_1_20226;
.loc 17 143 0
// Loop r71 over [r42, r47).
mov.s32 %r71, %r42;
setp.ge.s32 %p12, %r42, %r47;
@%p12 bra $Lt_1_20226;
sub.s32 %r72, %r47, %r42;
setp.lt.s32 %p13, %r51, %r55;
mov.s32 %r73, %r72;
$Lt_1_20738:
 //<loop> Loop body line 143, nesting depth: 2, estimated iterations: unknown
.loc 17 145 0
// Loop r74 over [r51, r55).  r82 = r33 * (r38 + r71 - order_m_1);
// r80 = r48 + r51 - order_m_1.
mov.s32 %r74, %r51;
@!%p13 bra $Lt_1_20994;
sub.s32 %r75, %r55, %r51;
sub.s32 %r76, %r71, %r42;
add.s32 %r77, %r38, %r42;
add.s32 %r78, %r48, %r51;
sub.s32 %r79, %r77, %r39;
sub.s32 %r80, %r78, %r39;
add.s32 %r81, %r76, %r79;
mul.lo.s32 %r82, %r33, %r81;
ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];
ld.param.u64 %rd24, [__cudaparm_make_rho_counts];
mov.s32 %r84, %r75;
$Lt_1_21506:
 //<loop> Loop body line 145, nesting depth: 3, estimated iterations: unknown
.loc 17 147 0
// Cell index r89 = r66 + r82 + nlocal_x * (r48 + r74 - order_m_1).
// r91 = counts[r89] * atom_stride is the upper bound for the record loop.
sub.s32 %r85, %r74, %r51;
add.s32 %r86, %r85, %r80;
mul.lo.s32 %r87, %r86, %r32;
add.s32 %r88, %r82, %r87;
add.s32 %r89, %r66, %r88;
cvt.s64.s32 %rd25, %r89;
mul.wide.s32 %rd26, %r89, 4;
add.u64 %rd27, %rd24, %rd26;
ld.global.s32 %r90, [%rd27+0];
mul.lo.s32 %r91, %r90, %r83;
.loc 17 148 0
// Record loop: r92 starts at r89 and advances by atom_stride while r92 < r91;
// rd32 = atoms + 32 * r92 (32-byte records).
mov.s32 %r92, %r89;
setp.ge.s32 %p14, %r89, %r91;
@%p14 bra $Lt_1_21762;
sub.s32 %r93, %r3, 1;
cvt.s64.s32 %rd28, %r83;
mul.wide.s32 %rd29, %r83, 32;
mov.s32 %r94, -1;
setp.gt.s32 %p15, %r93, %r94;
ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];
mul.lo.u64 %rd31, %rd25, 32;
add.u64 %rd32, %rd30, %rd31;
$Lt_1_22274:
 //<loop> Loop body line 148, nesting depth: 4, estimated iterations: unknown
.loc 17 149 0
// fd4 = record double 0.  If order + order2 - 1 >= 0, evaluate two
// polynomials Horner-style from shared rho_coeff, stepping down by `order`
// elements per term:
//   fd8: argument = record double 1 (fd5), first coeff index r93 - r74
//   fd7: argument = record double 2 (fd6), first coeff index r93 - r71
mov.s32 %r96, -1;
mov.s32 %r96, -1;
sub.s32 %r97, %r96, %r74;
cvt.s64.s32 %rd33, %r2;
mul.wide.s32 %rd34, %r2, 8;
ld.global.f64 %fd5, [%rd32+8];
ld.global.f64 %fd6, [%rd32+16];
cvt.s64.s32 %rd35, %r95;
mul.wide.s32 %rd36, %r95, 8;
add.u64 %rd37, %rd1, %rd36;
sub.s32 %r98, %r93, %r71;
cvt.s64.s32 %rd38, %r98;
mul.wide.s32 %rd39, %r98, 8;
add.u64 %rd40, %rd1, %rd39;
mov.f64 %fd7, 0d0000000000000000; // 0
mov.f64 %fd8, 0d0000000000000000; // 0
$Lt_1_23042:
 //<loop> Loop body line 149, nesting depth: 5, estimated iterations: unknown
.loc 17 154 0
ld.shared.f64 %fd9, [%rd37+0];
mad.rn.f64 %fd8, %fd8, %fd5, %fd9;
.loc 17 155 0
ld.shared.f64 %fd10, [%rd40+0];
mad.rn.f64 %fd7, %fd7, %fd6, %fd10;
sub.u64 %rd40, %rd40, %rd34;
sub.s32 %r95, %r95, %r2;
sub.u64 %rd37, %rd37, %rd34;
setp.gt.s32 %p16, %r95, %r97;
@%p16 bra $Lt_1_23042;
bra.uni $Lt_1_22530;
$Lt_1_29954:
mov.f64 %fd7, 0d0000000000000000; // 0
mov.f64 %fd8, 0d0000000000000000; // 0
$Lt_1_22530:
.loc 17 157 0
// fd13 = record double 3 * (fd7 * fd8).
ld.global.f64 %fd11, [%rd32+24];
mul.f64 %fd12, %fd7, %fd8;
mul.f64 %fd13, %fd11, %fd12;
@!%p8 bra $Lt_1_23554;
// For k (r100) = 0..order-1: evaluate a third polynomial in fd4 starting at
// coeff index k + order2, then ans[k * 64 + tid.x] += poly * fd13.
mov.s32 %r99, %r2;
cvt.s64.s32 %rd41, %r4;
mul.wide.s32 %rd42, %r4, 8;
add.u64 %rd43, %rd20, %rd42;
mov.s32 %r100, 0;
mov.s32 %r101, %r99;
$Lt_1_24066:
 //<loop> Loop body line 157, nesting depth: 5, estimated iterations: unknown
.loc 17 161 0
add.s32 %r102, %r100, %r1;
mov.s32 %r103, %r102;
setp.lt.s32 %p17, %r102, %r100;
@%p17 bra $Lt_1_30466;
cvt.s64.s32 %rd44, %r2;
mul.wide.s32 %rd34, %r2, 8;
cvt.s64.s32 %rd45, %r102;
mul.wide.s32 %rd46, %r102, 8;
add.u64 %rd47, %rd1, %rd46;
mov.f64 %fd14, 0d0000000000000000; // 0
$Lt_1_24834:
 //<loop> Loop body line 161, nesting depth: 6, estimated iterations: unknown
.loc 17 162 0
ld.shared.f64 %fd15, [%rd47+0];
mad.rn.f64 %fd14, %fd4, %fd14, %fd15;
sub.s32 %r103, %r103, %r2;
sub.u64 %rd47, %rd47, %rd34;
setp.ge.s32 %p18, %r103, %r100;
@%p18 bra $Lt_1_24834;
bra.uni $Lt_1_24322;
$Lt_1_30466:
mov.f64 %fd14, 0d0000000000000000; // 0
$Lt_1_24322:
.loc 17 163 0
ld.shared.f64 %fd16, [%rd43+0];
mad.rn.f64 %fd17, %fd14, %fd13, %fd16;
st.shared.f64 [%rd43+0], %fd17;
add.s32 %r100, %r100, 1;
add.u64 %rd43, %rd43, 512;
setp.ne.s32 %p19, %r100, %r2;
@%p19 bra $Lt_1_24066;
$Lt_1_23554:
// Next record in this cell.
add.s32 %r92, %r92, %r83;
add.u64 %rd32, %rd29, %rd32;
setp.gt.s32 %p20, %r91, %r92;
@%p20 bra $Lt_1_22274;
$Lt_1_21762:
add.s32 %r74, %r74, 1;
setp.ne.s32 %p21, %r55, %r74;
@%p21 bra $Lt_1_21506;
$Lt_1_20994:
add.s32 %r71, %r71, 1;
setp.ne.s32 %p22, %r47, %r71;
@%p22 bra $Lt_1_20738;
$Lt_1_20226:
$Lt_1_19714:
.loc 17 172 0
// Barrier before the shared `front` array is shifted.
bar.sync 0;
// Threads with tid.x % 32 < order: front[i] = front[i + 32]; front[i + 32] = 0.
// All other threads: front[i] = 0.  (i = r11 + 40 * r29.)
@!%p2 bra $Lt_1_26626;
.loc 17 174 0
ld.shared.f64 %fd18, [%rd19+256];
st.shared.f64 [%rd19+0], %fd18;
.loc 17 175 0
mov.f64 %fd19, 0d0000000000000000; // 0
st.shared.f64 [%rd19+256], %fd19;
bra.uni $Lt_1_26370;
$Lt_1_26626:
.loc 17 177 0
mov.f64 %fd20, 0d0000000000000000; // 0
st.shared.f64 [%rd19+0], %fd20;
$Lt_1_26370:
// Fold the accumulators into front: for k = 0..order-1,
// front[i + k] += ans[k * 64 + tid.x], with a block barrier after each step.
@!%p8 bra $Lt_1_26882;
mov.s32 %r104, %r2;
cvt.s64.s32 %rd48, %r4;
mov.s32 %r105, %r11;
add.s32 %r106, %r11, %r2;
mul.wide.s32 %rd49, %r4, 8;
add.u64 %rd50, %rd20, %rd49;
mov.s64 %rd51, %rd19;
mov.s32 %r107, %r104;
$Lt_1_27394:
 //<loop> Loop body line 177, nesting depth: 2, estimated iterations: unknown
.loc 17 180 0
ld.shared.f64 %fd21, [%rd50+0];
ld.shared.f64 %fd22, [%rd51+0];
add.f64 %fd23, %fd21, %fd22;
st.shared.f64 [%rd51+0], %fd23;
.loc 17 181 0
bar.sync 0;
add.s32 %r105, %r105, 1;
add.u64 %rd51, %rd51, 8;
add.u64 %rd50, %rd50, 512;
setp.ne.s32 %p23, %r105, %r106;
@%p23 bra $Lt_1_27394;
$Lt_1_26882:
// Write-out guard: r66 < npts_x and r38 < npts_z.
set.lt.u32.s32 %r108, %r66, %r17;
neg.s32 %r109, %r108;
and.b32 %r110, %r61, %r109;
mov.u32 %r111, 0;
setp.eq.s32 %p24, %r110, %r111;
@%p24 bra $Lt_1_27906;
.loc 17 185 0
// brick[(r11 + r58) + npts_x * (r48 + npts_y * r38)] = front[i].
ld.shared.f64 %fd24, [%rd19+0];
ld.param.u64 %rd52, [__cudaparm_make_rho_brick];
add.s32 %r112, %r11, %r58;
mul.lo.s32 %r113, %r37, %r17;
mul.lo.s32 %r114, %r38, %r113;
mul.lo.s32 %r115, %r48, %r17;
add.s32 %r116, %r114, %r115;
add.s32 %r117, %r112, %r116;
cvt.s64.s32 %rd53, %r117;
mul.wide.s32 %rd54, %r117, 8;
add.u64 %rd55, %rd52, %rd54;
st.global.f64 [%rd55+0], %fd24;
$Lt_1_27906:
// Advance to the next 32-wide chunk.
add.s32 %r58, %r58, 32;
setp.ne.s32 %p25, %r58, %r59;
@%p25 bra $Lt_1_18434;
$Lt_1_17922:
.loc 17 189 0
exit;
$LDWend_make_rho:
} // make_rho
//-----------------------------------------------------------
// Kernel: interp  (compiler-generated PTX; do not hand-edit logic)
//
// Each thread handles the particle index
//     i = %ctaid.x * %ntid.x + %tid.x
// and, when i < nlocal, stores one 16-byte float4 to ans[i].
//
// Parameters as used by this body:
//   x_, q_          declared but never loaded here; position and charge
//                   are fetched from the pos_tex / q_tex texture refs
//   nlocal          particles with i >= nlocal produce no store
//   brick           read-only; 32-byte elements, doubles at +0, +8, +16
//   _rho_coeff      doubles; the first (order + order2) entries are
//                   staged into shared memory by the low thread ids
//   npts_x, npts_yx element strides of brick for one step in the second
//                   and third grid index respectively
//   b_lo_*, del*inv grid coordinate = del*inv * (coordinate - b_lo_*)
//   order, order2   loop bounds / coefficient indexing (see loops below)
//   qqrd2e_scale    multiplies the fetched charge
//   ans             output, 16 bytes per particle
//
// NOTE(review): shared rows are 512 bytes apart and each thread owns the
// 8-byte slot tid*8 within a row, and the two weight tables are 4096
// bytes each. That suggests <= 64 threads per block and order <= 8 --
// confirm against the host launch code.
//-----------------------------------------------------------
.entry interp (
.param .u64 __cudaparm_interp_x_,
.param .u64 __cudaparm_interp_q_,
.param .s32 __cudaparm_interp_nlocal,
.param .u64 __cudaparm_interp_brick,
.param .u64 __cudaparm_interp__rho_coeff,
.param .s32 __cudaparm_interp_npts_x,
.param .s32 __cudaparm_interp_npts_yx,
.param .f64 __cudaparm_interp_b_lo_x,
.param .f64 __cudaparm_interp_b_lo_y,
.param .f64 __cudaparm_interp_b_lo_z,
.param .f64 __cudaparm_interp_delxinv,
.param .f64 __cudaparm_interp_delyinv,
.param .f64 __cudaparm_interp_delzinv,
.param .s32 __cudaparm_interp_order,
.param .s32 __cudaparm_interp_order2,
.param .f64 __cudaparm_interp_qqrd2e_scale,
.param .u64 __cudaparm_interp_ans)
{
.reg .u32 %r<56>;
.reg .u64 %rd<37>;
.reg .f32 %f<19>;
.reg .f64 %fd<63>;
.reg .pred %p<14>;
// Shared staging: coefficient table (64 doubles) and two per-thread
// weight tables (8 rows of 64 doubles each).
.shared .align 8 .b8 __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568[512];
.shared .align 8 .b8 __cuda___cuda_local_var_32677_34_non_const_rho1d_06080[4096];
.shared .align 8 .b8 __cuda___cuda_local_var_32678_34_non_const_rho1d_110176[4096];
// __cuda_local_var_32694_12_non_const_ek = 16
.loc 17 199 0
$LDWbegin_interp:
// %r1 = order2, %r2 = order, %r3 = order + order2, %r4 = tid.x.
// Threads with tid < order + order2 copy rho_coeff[tid] global -> shared.
ld.param.s32 %r1, [__cudaparm_interp_order2];
ld.param.s32 %r2, [__cudaparm_interp_order];
add.s32 %r3, %r1, %r2;
cvt.s32.u32 %r4, %tid.x;
setp.le.s32 %p1, %r3, %r4;
@%p1 bra $Lt_2_8706;
.loc 17 206 0
mov.u64 %rd1, __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568;
cvt.s64.s32 %rd2, %r4;
mul.wide.s32 %rd3, %r4, 8;
ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];
add.u64 %rd5, %rd4, %rd3;
ld.global.f64 %fd1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f64 [%rd6+0], %fd1;
$Lt_2_8706:
// %rd1 is re-materialised here because the copy above may be skipped.
mov.u64 %rd1, __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568;
.loc 17 207 0
// Barrier is reached by every thread before any of them can exit below.
bar.sync 0;
// %r8 = global particle index; threads past nlocal go straight to exit.
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mul.lo.u32 %r7, %r5, %r6;
add.u32 %r8, %r4, %r7;
ld.param.s32 %r9, [__cudaparm_interp_nlocal];
setp.le.s32 %p2, %r9, %r8;
@%p2 bra $Lt_2_9218;
.loc 17 215 0
// Fetch pos_tex[%r8]; %f5/%f6/%f7 keep the first three components.
mov.u32 %r10, %r8;
mov.s32 %r11, 0;
mov.u32 %r12, %r11;
mov.s32 %r13, 0;
mov.u32 %r14, %r13;
mov.s32 %r15, 0;
mov.u32 %r16, %r15;
tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r10,%r12,%r14,%r16}];
mov.f32 %f5, %f1;
mov.f32 %f6, %f2;
mov.f32 %f7, %f3;
.loc 17 216 0
// Fetch q_tex[%r8]; only the first component (%f12) is used.
mov.u32 %r17, %r8;
mov.s32 %r18, 0;
mov.u32 %r19, %r18;
mov.s32 %r20, 0;
mov.u32 %r21, %r20;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r17,%r19,%r21,%r23}];
mov.f32 %f12, %f8;
// %fd4 = charge * qqrd2e_scale; when it compares equal to 0 the grid walk
// is skipped and zeros are stored ($Lt_2_9986).
cvt.ftz.f64.f32 %fd2, %f12;
ld.param.f64 %fd3, [__cudaparm_interp_qqrd2e_scale];
mul.f64 %fd4, %fd2, %fd3;
mov.f64 %fd5, 0d0000000000000000; // 0
setp.neu.f64 %p3, %fd4, %fd5;
@!%p3 bra $Lt_2_9986;
// %p4 = (order > 0); it guards both loop nests below.
// %fd10 = delxinv * (pos.x - b_lo_x).
mov.s32 %r24, 0;
setp.gt.s32 %p4, %r2, %r24;
ld.param.f64 %fd6, [__cudaparm_interp_delxinv];
cvt.ftz.f64.f32 %fd7, %f5;
ld.param.f64 %fd8, [__cudaparm_interp_b_lo_x];
sub.f64 %fd9, %fd7, %fd8;
mul.f64 %fd10, %fd6, %fd9;
@!%p4 bra $Lt_2_16386;
mov.u64 %rd7, __cuda___cuda_local_var_32677_34_non_const_rho1d_06080;
mov.u64 %rd8, __cuda___cuda_local_var_32678_34_non_const_rho1d_110176;
// %r25 = grid index from %fd10 (truncated toward zero);
// %fd14 = (%r25 + 0.5) - %fd10 is the polynomial argument for that axis.
cvt.rzi.s32.f64 %r25, %fd10;
cvt.rn.f64.s32 %fd11, %r25;
mov.f64 %fd12, 0d3fe0000000000000; // 0.5
add.f64 %fd13, %fd11, %fd12;
sub.f64 %fd14, %fd13, %fd10;
// Same for the second coordinate: %r26 = index, %fd23 = argument.
ld.param.f64 %fd15, [__cudaparm_interp_delyinv];
cvt.ftz.f64.f32 %fd16, %f6;
ld.param.f64 %fd17, [__cudaparm_interp_b_lo_y];
sub.f64 %fd18, %fd16, %fd17;
mul.f64 %fd19, %fd15, %fd18;
cvt.rzi.s32.f64 %r26, %fd19;
cvt.rn.f64.s32 %fd20, %r26;
mov.f64 %fd21, 0d3fe0000000000000; // 0.5
add.f64 %fd22, %fd20, %fd21;
sub.f64 %fd23, %fd22, %fd19;
// %rd10 / %rd11 point at this thread's slot in row 0 of rho1d_0 / rho1d_1.
// Loop state: %r29 = k (row, from 0), %r28 = order2 + k.
mov.s32 %r27, %r2;
cvt.s64.s32 %rd9, %r4;
mov.s32 %r28, %r1;
mul.wide.s32 %rd3, %r4, 8;
add.u64 %rd10, %rd3, %rd7;
add.u64 %rd11, %rd3, %rd8;
mov.s32 %r29, 0;
mov.s32 %r30, %r27;
$Lt_2_10754:
//<loop> Loop body line 216, nesting depth: 1, estimated iterations: unknown
// Row k: clear both weight slots, then evaluate the polynomial below.
.loc 17 235 0
mov.f64 %fd24, 0d0000000000000000; // 0
mov.f64 %fd25, 0d0000000000000000; // 0
st.shared.f64 [%rd10+0], %fd25;
.loc 17 236 0
mov.f64 %fd26, 0d0000000000000000; // 0
mov.f64 %fd27, 0d0000000000000000; // 0
st.shared.f64 [%rd11+0], %fd27;
.loc 17 237 0
// Skip the inner loop when order2 + k < k.
mov.s32 %r31, %r28;
setp.lt.s32 %p5, %r28, %r29;
@%p5 bra $Lt_2_11010;
cvt.s64.s32 %rd12, %r2;
mul.wide.s32 %rd13, %r2, 8;
cvt.s64.s32 %rd14, %r28;
mul.wide.s32 %rd15, %r28, 8;
add.u64 %rd16, %rd1, %rd15;
$Lt_2_11522:
//<loop> Loop body line 237, nesting depth: 2, estimated iterations: unknown
// Horner step with coefficient index l = order2 + k, stepping down by
// `order` while l >= k:  w = w * arg + rho_coeff[l].
// The running value is written back to shared memory on every step.
.loc 17 238 0
ld.shared.f64 %fd28, [%rd16+0];
mad.rn.f64 %fd24, %fd24, %fd14, %fd28;
st.shared.f64 [%rd10+0], %fd24;
.loc 17 239 0
mad.rn.f64 %fd26, %fd26, %fd23, %fd28;
st.shared.f64 [%rd11+0], %fd26;
sub.s32 %r31, %r31, %r2;
sub.u64 %rd16, %rd16, %rd13;
setp.ge.s32 %p6, %r31, %r29;
@%p6 bra $Lt_2_11522;
$Lt_2_11010:
// Next row: k++, advance both slot pointers by one 512-byte row.
// Terminates when order2 + k == order + order2, i.e. after `order` rows.
add.s32 %r29, %r29, 1;
add.s32 %r28, %r28, 1;
add.u64 %rd11, %rd11, 512;
add.u64 %rd10, %rd10, 512;
setp.ne.s32 %p7, %r28, %r3;
@%p7 bra $Lt_2_10754;
bra.uni $Lt_2_10242;
$Lt_2_16386:
// order <= 0 path: only the values needed after the join are set up.
cvt.rzi.s32.f64 %r25, %fd10;
mov.u64 %rd8, __cuda___cuda_local_var_32678_34_non_const_rho1d_110176;
mov.u64 %rd7, __cuda___cuda_local_var_32677_34_non_const_rho1d_06080;
$Lt_2_10242:
.loc 17 243 0
// Third coordinate: %fd33 = delzinv * (pos.z - b_lo_z), %r32 = truncated
// index, %r35 = %r25 + %r32 * npts_yx.
ld.param.f64 %fd29, [__cudaparm_interp_delzinv];
cvt.ftz.f64.f32 %fd30, %f7;
ld.param.f64 %fd31, [__cudaparm_interp_b_lo_z];
sub.f64 %fd32, %fd30, %fd31;
mul.f64 %fd33, %fd29, %fd32;
cvt.rzi.s32.f64 %r32, %fd33;
ld.param.s32 %r33, [__cudaparm_interp_npts_yx];
mul.lo.s32 %r34, %r32, %r33;
add.s32 %r35, %r25, %r34;
@!%p4 bra $Lt_2_16898;
// %fd37 = (%r32 + 0.5) - %fd33: polynomial argument for the third axis.
cvt.rn.f64.s32 %fd34, %r32;
mov.f64 %fd35, 0d3fe0000000000000; // 0.5
add.f64 %fd36, %fd34, %fd35;
sub.f64 %fd37, %fd36, %fd33;
mov.s32 %r36, %r2;
// The second-axis index is recomputed here (%r37) with the same
// expression used earlier for %r26.
cvt.ftz.f64.f32 %fd38, %f6;
cvt.s64.s32 %rd17, %r4;
ld.param.f64 %fd39, [__cudaparm_interp_delyinv];
ld.param.f64 %fd40, [__cudaparm_interp_b_lo_y];
sub.f64 %fd41, %fd38, %fd40;
mul.f64 %fd42, %fd39, %fd41;
cvt.rzi.s32.f64 %r37, %fd42;
mul.wide.s32 %rd3, %r4, 8;
ld.param.s32 %r38, [__cudaparm_interp_npts_x];
mul.lo.s32 %r39, %r37, %r38;
// %rd18 / %rd19 = this thread's row-0 slots in rho1d_0 / rho1d_1.
// %rd21 = byte stride in brick for one step of npts_x elements.
// %r41 = starting brick element = %r25 + %r37*npts_x + %r32*npts_yx.
add.u64 %rd18, %rd3, %rd7;
add.u64 %rd19, %rd3, %rd8;
cvt.s64.s32 %rd20, %r38;
mul.wide.s32 %rd21, %r38, 32;
add.s32 %r40, %r39, %r35;
mov.s32 %r41, %r40;
ld.param.u64 %rd22, [__cudaparm_interp_brick];
// Accumulators %f13/%f14/%f15 start at zero; %r42 = outer index n.
mov.s32 %r42, 0;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, 0f00000000; // 0
mov.s32 %r43, %r36;
$Lt_2_12802:
//<loop> Loop body line 243, nesting depth: 1, estimated iterations: unknown
.loc 17 246 0
// Third-axis weight for row n (%fd43): same Horner scheme as above,
// coefficient index from order2 + n down by `order` while >= n.
add.s32 %r44, %r42, %r1;
mov.s32 %r45, %r44;
setp.lt.s32 %p8, %r44, %r42;
@%p8 bra $Lt_2_17154;
cvt.s64.s32 %rd23, %r2;
mul.wide.s32 %rd13, %r2, 8;
cvt.s64.s32 %rd24, %r44;
mul.wide.s32 %rd25, %r44, 8;
add.u64 %rd26, %rd1, %rd25;
mov.f64 %fd43, 0d0000000000000000; // 0
$Lt_2_13570:
//<loop> Loop body line 246, nesting depth: 2, estimated iterations: unknown
.loc 17 247 0
ld.shared.f64 %fd44, [%rd26+0];
mad.rn.f64 %fd43, %fd37, %fd43, %fd44;
sub.s32 %r45, %r45, %r2;
sub.u64 %rd26, %rd26, %rd13;
setp.ge.s32 %p9, %r45, %r42;
@%p9 bra $Lt_2_13570;
bra.uni $Lt_2_13058;
$Lt_2_17154:
mov.f64 %fd43, 0d0000000000000000; // 0
$Lt_2_13058:
.loc 17 249 0
// %fd45 = scaled charge * third-axis weight.
// %r48 / %rd29 = element index / byte offset of the current brick row;
// %rd27 walks the rho1d_1 rows; %r49 = middle index m.
mov.s32 %r46, %r41;
mov.s32 %r47, %r2;
mov.s32 %r48, %r46;
mul.f64 %fd45, %fd4, %fd43;
mov.s64 %rd27, %rd19;
cvt.s64.s32 %rd28, %r46;
mul.wide.s32 %rd29, %r46, 32;
mov.s32 %r49, 0;
mov.s32 %r50, %r47;
$Lt_2_14594:
//<loop> Loop body line 249, nesting depth: 2, estimated iterations: unknown
// %fd47 = %fd45 * rho1d_1[m][tid]; inner loop covers `order` consecutive
// brick elements starting at %r48 (%r52 runs from %r48 to %r53).
mov.s32 %r51, %r2;
mov.s32 %r52, %r48;
add.s32 %r53, %r48, %r2;
mov.s64 %rd30, %rd18;
ld.shared.f64 %fd46, [%rd27+0];
add.u64 %rd31, %rd29, %rd22;
mul.f64 %fd47, %fd45, %fd46;
mov.s32 %r54, %r51;
$Lt_2_15362:
//<loop> Loop body line 249, nesting depth: 3, estimated iterations: unknown
.loc 17 253 0
// %fd49 = rho1d_0[l][tid] * %fd47 (combined weight for this grid point).
ld.shared.f64 %fd48, [%rd30+0];
mul.f64 %fd49, %fd48, %fd47;
.loc 17 255 0
// Each accumulator is widened to f64, decremented by weight * brick
// component, and rounded back to f32 on every iteration.
cvt.ftz.f64.f32 %fd50, %f15;
ld.global.v2.f64 {%fd51,%fd52}, [%rd31+0];
mul.f64 %fd53, %fd49, %fd51;
sub.f64 %fd54, %fd50, %fd53;
cvt.rn.ftz.f32.f64 %f15, %fd54;
.loc 17 256 0
cvt.ftz.f64.f32 %fd55, %f14;
mul.f64 %fd56, %fd49, %fd52;
sub.f64 %fd57, %fd55, %fd56;
cvt.rn.ftz.f32.f64 %f14, %fd57;
.loc 17 257 0
cvt.ftz.f64.f32 %fd58, %f13;
ld.global.f64 %fd59, [%rd31+16];
mul.f64 %fd60, %fd49, %fd59;
sub.f64 %fd61, %fd58, %fd60;
cvt.rn.ftz.f32.f64 %f13, %fd61;
// Next brick element (32 bytes) and next rho1d_0 row (512 bytes).
add.s32 %r52, %r52, 1;
add.u64 %rd31, %rd31, 32;
add.u64 %rd30, %rd30, 512;
setp.ne.s32 %p10, %r52, %r53;
@%p10 bra $Lt_2_15362;
// Middle loop step: m++, brick position += npts_x elements,
// rho1d_1 pointer += one row.
add.s32 %r49, %r49, 1;
add.s32 %r48, %r48, %r38;
add.u64 %rd29, %rd29, %rd21;
add.u64 %rd27, %rd27, 512;
setp.ne.s32 %p11, %r49, %r2;
@%p11 bra $Lt_2_14594;
// Outer loop step: n++, starting brick element += npts_yx.
add.s32 %r42, %r42, 1;
add.s32 %r41, %r46, %r33;
setp.ne.s32 %p12, %r42, %r2;
@%p12 bra $Lt_2_12802;
bra.uni $Lt_2_9730;
$Lt_2_16898:
// order <= 0: nothing accumulated, store zeros.
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, 0f00000000; // 0
bra.uni $Lt_2_9730;
$Lt_2_9986:
// Scaled charge compared equal to zero: store zeros.
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, 0f00000000; // 0
$Lt_2_9730:
.loc 17 264 0
// ans[%r8] = {%f15, %f14, %f13, %f16}, 16 bytes per particle.
// NOTE(review): %f17 is never written anywhere in this kernel, so the
// fourth stored component (%f16) is an undefined value. If that matters,
// the fix belongs in the generating .cu source, not in this PTX.
ld.param.u64 %rd32, [__cudaparm_interp_ans];
cvt.s64.s32 %rd33, %r8;
mul.wide.s32 %rd34, %r8, 16;
add.u64 %rd35, %rd32, %rd34;
mov.f32 %f16, %f17;
st.global.v4.f32 [%rd35+0], {%f15,%f14,%f13,%f16};
$Lt_2_9218:
.loc 17 266 0
exit;
$LDWend_interp:
} // interp

View File

@ -1,837 +0,0 @@
/*
 * pppm_d: PTX module text embedded as one C string (adjacent literals are
 * concatenated by the compiler; each fragment carries its own "\n").
 *
 * The module targets PTX ISA 2.3 / sm_20 with 64-bit addresses, declares
 * two global texture references (pos_tex, q_tex) and three kernels:
 * particle_map, make_rho and interp.
 *
 * The string content is runtime data: do not reformat or edit the
 * fragments by hand. Comments may only live between fragments, like the
 * section markers below.
 */
const char * pppm_d =
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
" .global .texref pos_tex;\n"
" .global .texref q_tex;\n"
/*
 * Kernel particle_map: for a particle with non-zero (charge * delvolinv),
 * computes its grid cell and atomically increments counts[cell].
 *   - *error = 1 when a scaled coordinate is negative or the cell index
 *     is outside nlocal_x / nlocal_y / nlocal_z.
 *   - *error = 2 when the previous count is >= max_atoms; the increment
 *     is then undone with an atomic add of -1.
 *   - otherwise a 32-byte record of four doubles is written to
 *     ans[cell + atom_stride * previous_count].
 */
" .entry particle_map (\n"
" .param .u64 __cudaparm_particle_map_x_,\n"
" .param .u64 __cudaparm_particle_map_q_,\n"
" .param .f64 __cudaparm_particle_map_delvolinv,\n"
" .param .s32 __cudaparm_particle_map_nlocal,\n"
" .param .u64 __cudaparm_particle_map_counts,\n"
" .param .u64 __cudaparm_particle_map_ans,\n"
" .param .f64 __cudaparm_particle_map_b_lo_x,\n"
" .param .f64 __cudaparm_particle_map_b_lo_y,\n"
" .param .f64 __cudaparm_particle_map_b_lo_z,\n"
" .param .f64 __cudaparm_particle_map_delxinv,\n"
" .param .f64 __cudaparm_particle_map_delyinv,\n"
" .param .f64 __cudaparm_particle_map_delzinv,\n"
" .param .s32 __cudaparm_particle_map_nlocal_x,\n"
" .param .s32 __cudaparm_particle_map_nlocal_y,\n"
" .param .s32 __cudaparm_particle_map_nlocal_z,\n"
" .param .s32 __cudaparm_particle_map_atom_stride,\n"
" .param .s32 __cudaparm_particle_map_max_atoms,\n"
" .param .u64 __cudaparm_particle_map_error)\n"
" {\n"
" .reg .u32 %r<50>;\n"
" .reg .u64 %rd<12>;\n"
" .reg .f32 %f<14>;\n"
" .reg .f64 %fd<36>;\n"
" .reg .pred %p<11>;\n"
" .loc 17 50 0\n"
"$LDWbegin_particle_map:\n"
" cvt.s32.u32 %r1, %ntid.x;\n"
" cvt.s32.u32 %r2, %ctaid.x;\n"
" mul24.lo.s32 %r3, %r2, %r1;\n"
" cvt.s32.u32 %r4, %nctaid.x;\n"
" mul24.lo.s32 %r5, %r4, %r1;\n"
" mov.u32 %r6, %tid.x;\n"
" add.u32 %r7, %r3, %r6;\n"
" sub.s32 %r8, %r5, 1;\n"
" mul.lo.s32 %r9, %r7, 64;\n"
" div.s32 %r10, %r9, %r5;\n"
" mul.lo.s32 %r11, %r8, %r10;\n"
" sub.s32 %r12, %r9, %r11;\n"
" ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];\n"
" setp.le.s32 %p1, %r13, %r12;\n"
" @%p1 bra $Lt_0_7426;\n"
" .loc 17 62 0\n"
" mov.u32 %r14, %r12;\n"
" mov.s32 %r15, 0;\n"
" mov.u32 %r16, %r15;\n"
" mov.s32 %r17, 0;\n"
" mov.u32 %r18, %r17;\n"
" mov.s32 %r19, 0;\n"
" mov.u32 %r20, %r19;\n"
" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}];\n"
" mov.f32 %f5, %f1;\n"
" mov.f32 %f6, %f2;\n"
" mov.f32 %f7, %f3;\n"
" .loc 17 64 0\n"
" mov.u32 %r21, %r12;\n"
" mov.s32 %r22, 0;\n"
" mov.u32 %r23, %r22;\n"
" mov.s32 %r24, 0;\n"
" mov.u32 %r25, %r24;\n"
" mov.s32 %r26, 0;\n"
" mov.u32 %r27, %r26;\n"
" tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}];\n"
" mov.f32 %f12, %f8;\n"
" cvt.ftz.f64.f32 %fd1, %f12;\n"
" ld.param.f64 %fd2, [__cudaparm_particle_map_delvolinv];\n"
" mul.f64 %fd3, %fd1, %fd2;\n"
" mov.f64 %fd4, 0d0000000000000000; \n"
" setp.neu.f64 %p2, %fd3, %fd4;\n"
" @!%p2 bra $Lt_0_7426;\n"
" .loc 17 67 0\n"
" ld.param.f64 %fd5, [__cudaparm_particle_map_delxinv];\n"
" cvt.ftz.f64.f32 %fd6, %f5;\n"
" ld.param.f64 %fd7, [__cudaparm_particle_map_b_lo_x];\n"
" sub.f64 %fd8, %fd6, %fd7;\n"
" mul.f64 %fd9, %fd5, %fd8;\n"
" mov.f64 %fd10, 0d0000000000000000; \n"
" setp.lt.f64 %p3, %fd9, %fd10;\n"
" @%p3 bra $Lt_0_8706;\n"
" ld.param.f64 %fd11, [__cudaparm_particle_map_delyinv];\n"
" cvt.ftz.f64.f32 %fd12, %f6;\n"
" ld.param.f64 %fd13, [__cudaparm_particle_map_b_lo_y];\n"
" sub.f64 %fd14, %fd12, %fd13;\n"
" mul.f64 %fd15, %fd11, %fd14;\n"
" mov.f64 %fd16, 0d0000000000000000; \n"
" setp.lt.f64 %p4, %fd15, %fd16;\n"
" @%p4 bra $Lt_0_8706;\n"
" ld.param.f64 %fd17, [__cudaparm_particle_map_delzinv];\n"
" cvt.ftz.f64.f32 %fd18, %f7;\n"
" ld.param.f64 %fd19, [__cudaparm_particle_map_b_lo_z];\n"
" sub.f64 %fd20, %fd18, %fd19;\n"
" mul.f64 %fd21, %fd17, %fd20;\n"
" mov.f64 %fd22, 0d0000000000000000; \n"
" setp.lt.f64 %p5, %fd21, %fd22;\n"
" @%p5 bra $Lt_0_8706;\n"
" cvt.rzi.s32.f64 %r28, %fd9;\n"
" ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];\n"
" setp.ge.s32 %p6, %r28, %r29;\n"
" @%p6 bra $Lt_0_8706;\n"
" cvt.rzi.s32.f64 %r30, %fd15;\n"
" ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];\n"
" setp.ge.s32 %p7, %r30, %r31;\n"
" @%p7 bra $Lt_0_8706;\n"
" cvt.rzi.s32.f64 %r32, %fd21;\n"
" ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];\n"
" setp.gt.s32 %p8, %r33, %r32;\n"
" @%p8 bra $L_0_4866;\n"
"$Lt_0_8706:\n"
"$L_0_5122:\n"
" .loc 17 76 0\n"
" mov.s32 %r34, 1;\n"
" ld.param.u64 %rd1, [__cudaparm_particle_map_error];\n"
" st.global.s32 [%rd1+0], %r34;\n"
" bra.uni $Lt_0_7426;\n"
"$L_0_4866:\n"
" .loc 17 83 0\n"
" mul.lo.s32 %r35, %r32, %r31;\n"
" add.s32 %r36, %r30, %r35;\n"
" mul.lo.s32 %r37, %r36, %r29;\n"
" add.s32 %r38, %r28, %r37;\n"
" ld.param.u64 %rd2, [__cudaparm_particle_map_counts];\n"
" cvt.s64.s32 %rd3, %r38;\n"
" mul.wide.s32 %rd4, %r38, 4;\n"
" add.u64 %rd5, %rd2, %rd4;\n"
" mov.s32 %r39, 1;\n"
" atom.global.add.s32 %r40, [%rd5], %r39;\n"
" mov.s32 %r41, %r40;\n"
" ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];\n"
" setp.gt.s32 %p9, %r42, %r41;\n"
" @%p9 bra $Lt_0_7682;\n"
" .loc 17 85 0\n"
" mov.s32 %r43, 2;\n"
" ld.param.u64 %rd6, [__cudaparm_particle_map_error];\n"
" st.global.s32 [%rd6+0], %r43;\n"
" .loc 16 118 0\n"
" mov.s32 %r44, -1;\n"
" atom.global.add.s32 %r45, [%rd5], %r44;\n"
" bra.uni $Lt_0_7426;\n"
"$Lt_0_7682:\n"
" .loc 17 88 0\n"
" ld.param.u64 %rd7, [__cudaparm_particle_map_ans];\n"
" ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];\n"
" mul.lo.s32 %r47, %r46, %r41;\n"
" add.s32 %r48, %r38, %r47;\n"
" cvt.s64.s32 %rd8, %r48;\n"
" mul.wide.s32 %rd9, %r48, 32;\n"
" add.u64 %rd10, %rd7, %rd9;\n"
" cvt.rn.f64.s32 %fd23, %r28;\n"
" mov.f64 %fd24, 0d3fe0000000000000; \n"
" add.f64 %fd25, %fd23, %fd24;\n"
" sub.f64 %fd26, %fd25, %fd9;\n"
" cvt.rn.f64.s32 %fd27, %r30;\n"
" mov.f64 %fd28, 0d3fe0000000000000; \n"
" add.f64 %fd29, %fd27, %fd28;\n"
" sub.f64 %fd30, %fd29, %fd15;\n"
" st.global.v2.f64 [%rd10+0], {%fd26,%fd30};\n"
" cvt.rn.f64.s32 %fd31, %r32;\n"
" mov.f64 %fd32, 0d3fe0000000000000; \n"
" add.f64 %fd33, %fd31, %fd32;\n"
" sub.f64 %fd34, %fd33, %fd21;\n"
" st.global.v2.f64 [%rd10+16], {%fd34,%fd3};\n"
"$Lt_0_7426:\n"
"$L_0_4610:\n"
"$Lt_0_6914:\n"
"$Lt_0_6402:\n"
" .loc 17 92 0\n"
" exit;\n"
"$LDWend_particle_map:\n"
" }\n"
/*
 * Kernel make_rho: reads the per-cell counts and the 32-byte atom records,
 * accumulates partial sums in shared memory (with bar.sync between
 * phases) and stores doubles into brick.
 */
" .entry make_rho (\n"
" .param .u64 __cudaparm_make_rho_counts,\n"
" .param .u64 __cudaparm_make_rho_atoms,\n"
" .param .u64 __cudaparm_make_rho_brick,\n"
" .param .u64 __cudaparm_make_rho__rho_coeff,\n"
" .param .s32 __cudaparm_make_rho_atom_stride,\n"
" .param .s32 __cudaparm_make_rho_npts_x,\n"
" .param .s32 __cudaparm_make_rho_npts_y,\n"
" .param .s32 __cudaparm_make_rho_npts_z,\n"
" .param .s32 __cudaparm_make_rho_nlocal_x,\n"
" .param .s32 __cudaparm_make_rho_nlocal_y,\n"
" .param .s32 __cudaparm_make_rho_nlocal_z,\n"
" .param .s32 __cudaparm_make_rho_order_m_1,\n"
" .param .s32 __cudaparm_make_rho_order,\n"
" .param .s32 __cudaparm_make_rho_order2)\n"
" {\n"
" .reg .u32 %r<119>;\n"
" .reg .u64 %rd<57>;\n"
" .reg .f64 %fd<26>;\n"
" .reg .pred %p<27>;\n"
" .shared .align 8 .b8 __cuda___cuda_local_var_32578_34_non_const_rho_coeff200[512];\n"
" .shared .align 8 .b8 __cuda___cuda_local_var_32579_34_non_const_front712[640];\n"
" .shared .align 8 .b8 __cuda___cuda_local_var_32580_34_non_const_ans1352[4096];\n"
" .loc 17 101 0\n"
"$LDWbegin_make_rho:\n"
" ld.param.s32 %r1, [__cudaparm_make_rho_order2];\n"
" ld.param.s32 %r2, [__cudaparm_make_rho_order];\n"
" add.s32 %r3, %r1, %r2;\n"
" cvt.s32.u32 %r4, %tid.x;\n"
" setp.le.s32 %p1, %r3, %r4;\n"
" @%p1 bra $Lt_1_16898;\n"
" .loc 17 108 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32578_34_non_const_rho_coeff200;\n"
" cvt.s64.s32 %rd2, %r4;\n"
" mul.wide.s32 %rd3, %r4, 8;\n"
" ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.f64 %fd1, [%rd5+0];\n"
" add.u64 %rd6, %rd3, %rd1;\n"
" st.shared.f64 [%rd6+0], %fd1;\n"
"$Lt_1_16898:\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32578_34_non_const_rho_coeff200;\n"
" shr.s32 %r5, %r4, 31;\n"
" mov.s32 %r6, 31;\n"
" and.b32 %r7, %r5, %r6;\n"
" add.s32 %r8, %r7, %r4;\n"
" shr.s32 %r9, %r8, 5;\n"
" mul.lo.s32 %r10, %r9, 32;\n"
" sub.s32 %r11, %r4, %r10;\n"
" setp.lt.s32 %p2, %r11, %r2;\n"
" @!%p2 bra $Lt_1_17410;\n"
" .loc 17 114 0\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32579_34_non_const_front712;\n"
" mov.f64 %fd2, 0d0000000000000000; \n"
" cvt.s64.s32 %rd8, %r11;\n"
" shr.s32 %r12, %r4, 31;\n"
" mov.s32 %r13, 31;\n"
" and.b32 %r14, %r12, %r13;\n"
" add.s32 %r15, %r14, %r4;\n"
" shr.s32 %r16, %r15, 5;\n"
" cvt.s64.s32 %rd9, %r16;\n"
" mul.wide.s32 %rd10, %r16, 40;\n"
" add.u64 %rd11, %rd8, %rd10;\n"
" mul.lo.u64 %rd12, %rd11, 8;\n"
" add.u64 %rd13, %rd7, %rd12;\n"
" st.shared.f64 [%rd13+256], %fd2;\n"
"$Lt_1_17410:\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32579_34_non_const_front712;\n"
" .loc 17 116 0\n"
" bar.sync 0;\n"
" ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];\n"
" shr.s32 %r18, %r17, 31;\n"
" mov.s32 %r19, 31;\n"
" and.b32 %r20, %r18, %r19;\n"
" add.s32 %r21, %r20, %r17;\n"
" shr.s32 %r22, %r21, 5;\n"
" add.s32 %r23, %r22, 1;\n"
" mov.u32 %r24, 0;\n"
" setp.le.s32 %p3, %r23, %r24;\n"
" @%p3 bra $Lt_1_17922;\n"
" shr.s32 %r25, %r4, 31;\n"
" mov.s32 %r26, 31;\n"
" and.b32 %r27, %r25, %r26;\n"
" add.s32 %r28, %r27, %r4;\n"
" shr.s32 %r29, %r28, 5;\n"
" add.s32 %r30, %r11, 32;\n"
" ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];\n"
" ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];\n"
" mul.lo.s32 %r33, %r31, %r32;\n"
" mov.u32 %r34, %ctaid.x;\n"
" mul.lo.u32 %r35, %r34, 2;\n"
" add.u32 %r36, %r29, %r35;\n"
" ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];\n"
" div.s32 %r38, %r36, %r37;\n"
" ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];\n"
" setp.lt.s32 %p4, %r38, %r39;\n"
" sub.s32 %r40, %r39, %r38;\n"
" mov.s32 %r41, 0;\n"
" selp.s32 %r42, %r40, %r41, %p4;\n"
" ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];\n"
" setp.ge.s32 %p5, %r38, %r43;\n"
" sub.s32 %r44, %r43, %r38;\n"
" add.s32 %r45, %r44, %r2;\n"
" sub.s32 %r46, %r45, 1;\n"
" selp.s32 %r47, %r46, %r2, %p5;\n"
" rem.s32 %r48, %r36, %r37;\n"
" setp.lt.s32 %p6, %r48, %r39;\n"
" sub.s32 %r49, %r39, %r48;\n"
" mov.s32 %r50, 0;\n"
" selp.s32 %r51, %r49, %r50, %p6;\n"
" setp.ge.s32 %p7, %r48, %r31;\n"
" sub.s32 %r52, %r31, %r48;\n"
" add.s32 %r53, %r52, %r2;\n"
" sub.s32 %r54, %r53, 1;\n"
" selp.s32 %r55, %r54, %r2, %p7;\n"
" mov.s32 %r56, %r23;\n"
" mov.s32 %r57, 0;\n"
" setp.gt.s32 %p8, %r2, %r57;\n"
" mov.s32 %r58, 0;\n"
" cvt.s64.s32 %rd14, %r11;\n"
" cvt.s64.s32 %rd15, %r29;\n"
" mul.lo.s32 %r59, %r23, 32;\n"
" mul.wide.s32 %rd16, %r29, 40;\n"
" add.u64 %rd17, %rd14, %rd16;\n"
" ld.param.s32 %r60, [__cudaparm_make_rho_npts_z];\n"
" setp.gt.s32 %p9, %r60, %r38;\n"
" mul.lo.u64 %rd18, %rd17, 8;\n"
" selp.s32 %r61, 1, 0, %p9;\n"
" add.u64 %rd19, %rd18, %rd7;\n"
" mov.u64 %rd20, __cuda___cuda_local_var_32580_34_non_const_ans1352;\n"
" mov.s32 %r62, %r56;\n"
"$Lt_1_18434:\n"
" @!%p8 bra $Lt_1_18690;\n"
" mov.s32 %r63, %r2;\n"
" cvt.s64.s32 %rd21, %r4;\n"
" mul.wide.s32 %rd22, %r4, 8;\n"
" add.u64 %rd23, %rd20, %rd22;\n"
" mov.s32 %r64, 0;\n"
" mov.s32 %r65, %r63;\n"
"$Lt_1_19202:\n"
" .loc 17 140 0\n"
" mov.f64 %fd3, 0d0000000000000000; \n"
" st.shared.f64 [%rd23+0], %fd3;\n"
" add.s32 %r64, %r64, 1;\n"
" add.u64 %rd23, %rd23, 512;\n"
" setp.ne.s32 %p10, %r64, %r2;\n"
" @%p10 bra $Lt_1_19202;\n"
"$Lt_1_18690:\n"
" add.s32 %r66, %r11, %r58;\n"
" set.lt.u32.s32 %r67, %r66, %r32;\n"
" neg.s32 %r68, %r67;\n"
" and.b32 %r69, %r61, %r68;\n"
" mov.u32 %r70, 0;\n"
" setp.eq.s32 %p11, %r69, %r70;\n"
" @%p11 bra $Lt_1_20226;\n"
" .loc 17 143 0\n"
" mov.s32 %r71, %r42;\n"
" setp.ge.s32 %p12, %r42, %r47;\n"
" @%p12 bra $Lt_1_20226;\n"
" sub.s32 %r72, %r47, %r42;\n"
" setp.lt.s32 %p13, %r51, %r55;\n"
" mov.s32 %r73, %r72;\n"
"$Lt_1_20738:\n"
" .loc 17 145 0\n"
" mov.s32 %r74, %r51;\n"
" @!%p13 bra $Lt_1_20994;\n"
" sub.s32 %r75, %r55, %r51;\n"
" sub.s32 %r76, %r71, %r42;\n"
" add.s32 %r77, %r38, %r42;\n"
" add.s32 %r78, %r48, %r51;\n"
" sub.s32 %r79, %r77, %r39;\n"
" sub.s32 %r80, %r78, %r39;\n"
" add.s32 %r81, %r76, %r79;\n"
" mul.lo.s32 %r82, %r33, %r81;\n"
" ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];\n"
" ld.param.u64 %rd24, [__cudaparm_make_rho_counts];\n"
" mov.s32 %r84, %r75;\n"
"$Lt_1_21506:\n"
" .loc 17 147 0\n"
" sub.s32 %r85, %r74, %r51;\n"
" add.s32 %r86, %r85, %r80;\n"
" mul.lo.s32 %r87, %r86, %r32;\n"
" add.s32 %r88, %r82, %r87;\n"
" add.s32 %r89, %r66, %r88;\n"
" cvt.s64.s32 %rd25, %r89;\n"
" mul.wide.s32 %rd26, %r89, 4;\n"
" add.u64 %rd27, %rd24, %rd26;\n"
" ld.global.s32 %r90, [%rd27+0];\n"
" mul.lo.s32 %r91, %r90, %r83;\n"
" .loc 17 148 0\n"
" mov.s32 %r92, %r89;\n"
" setp.ge.s32 %p14, %r89, %r91;\n"
" @%p14 bra $Lt_1_21762;\n"
" sub.s32 %r93, %r3, 1;\n"
" cvt.s64.s32 %rd28, %r83;\n"
" mul.wide.s32 %rd29, %r83, 32;\n"
" mov.s32 %r94, -1;\n"
" setp.gt.s32 %p15, %r93, %r94;\n"
" ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];\n"
" mul.lo.u64 %rd31, %rd25, 32;\n"
" add.u64 %rd32, %rd30, %rd31;\n"
"$Lt_1_22274:\n"
" .loc 17 149 0\n"
" ld.global.f64 %fd4, [%rd32+0];\n"
" @!%p15 bra $Lt_1_29954;\n"
" sub.s32 %r95, %r93, %r74;\n"
" mov.s32 %r96, -1;\n"
" sub.s32 %r97, %r96, %r74;\n"
" cvt.s64.s32 %rd33, %r2;\n"
" mul.wide.s32 %rd34, %r2, 8;\n"
" ld.global.f64 %fd5, [%rd32+8];\n"
" ld.global.f64 %fd6, [%rd32+16];\n"
" cvt.s64.s32 %rd35, %r95;\n"
" mul.wide.s32 %rd36, %r95, 8;\n"
" add.u64 %rd37, %rd1, %rd36;\n"
" sub.s32 %r98, %r93, %r71;\n"
" cvt.s64.s32 %rd38, %r98;\n"
" mul.wide.s32 %rd39, %r98, 8;\n"
" add.u64 %rd40, %rd1, %rd39;\n"
" mov.f64 %fd7, 0d0000000000000000; \n"
" mov.f64 %fd8, 0d0000000000000000; \n"
"$Lt_1_23042:\n"
" .loc 17 154 0\n"
" ld.shared.f64 %fd9, [%rd37+0];\n"
" mad.rn.f64 %fd8, %fd8, %fd5, %fd9;\n"
" .loc 17 155 0\n"
" ld.shared.f64 %fd10, [%rd40+0];\n"
" mad.rn.f64 %fd7, %fd7, %fd6, %fd10;\n"
" sub.u64 %rd40, %rd40, %rd34;\n"
" sub.s32 %r95, %r95, %r2;\n"
" sub.u64 %rd37, %rd37, %rd34;\n"
" setp.gt.s32 %p16, %r95, %r97;\n"
" @%p16 bra $Lt_1_23042;\n"
" bra.uni $Lt_1_22530;\n"
"$Lt_1_29954:\n"
" mov.f64 %fd7, 0d0000000000000000; \n"
" mov.f64 %fd8, 0d0000000000000000; \n"
"$Lt_1_22530:\n"
" .loc 17 157 0\n"
" ld.global.f64 %fd11, [%rd32+24];\n"
" mul.f64 %fd12, %fd7, %fd8;\n"
" mul.f64 %fd13, %fd11, %fd12;\n"
" @!%p8 bra $Lt_1_23554;\n"
" mov.s32 %r99, %r2;\n"
" cvt.s64.s32 %rd41, %r4;\n"
" mul.wide.s32 %rd42, %r4, 8;\n"
" add.u64 %rd43, %rd20, %rd42;\n"
" mov.s32 %r100, 0;\n"
" mov.s32 %r101, %r99;\n"
"$Lt_1_24066:\n"
" .loc 17 161 0\n"
" add.s32 %r102, %r100, %r1;\n"
" mov.s32 %r103, %r102;\n"
" setp.lt.s32 %p17, %r102, %r100;\n"
" @%p17 bra $Lt_1_30466;\n"
" cvt.s64.s32 %rd44, %r2;\n"
" mul.wide.s32 %rd34, %r2, 8;\n"
" cvt.s64.s32 %rd45, %r102;\n"
" mul.wide.s32 %rd46, %r102, 8;\n"
" add.u64 %rd47, %rd1, %rd46;\n"
" mov.f64 %fd14, 0d0000000000000000; \n"
"$Lt_1_24834:\n"
" .loc 17 162 0\n"
" ld.shared.f64 %fd15, [%rd47+0];\n"
" mad.rn.f64 %fd14, %fd4, %fd14, %fd15;\n"
" sub.s32 %r103, %r103, %r2;\n"
" sub.u64 %rd47, %rd47, %rd34;\n"
" setp.ge.s32 %p18, %r103, %r100;\n"
" @%p18 bra $Lt_1_24834;\n"
" bra.uni $Lt_1_24322;\n"
"$Lt_1_30466:\n"
" mov.f64 %fd14, 0d0000000000000000; \n"
"$Lt_1_24322:\n"
" .loc 17 163 0\n"
" ld.shared.f64 %fd16, [%rd43+0];\n"
" mad.rn.f64 %fd17, %fd14, %fd13, %fd16;\n"
" st.shared.f64 [%rd43+0], %fd17;\n"
" add.s32 %r100, %r100, 1;\n"
" add.u64 %rd43, %rd43, 512;\n"
" setp.ne.s32 %p19, %r100, %r2;\n"
" @%p19 bra $Lt_1_24066;\n"
"$Lt_1_23554:\n"
" add.s32 %r92, %r92, %r83;\n"
" add.u64 %rd32, %rd29, %rd32;\n"
" setp.gt.s32 %p20, %r91, %r92;\n"
" @%p20 bra $Lt_1_22274;\n"
"$Lt_1_21762:\n"
" add.s32 %r74, %r74, 1;\n"
" setp.ne.s32 %p21, %r55, %r74;\n"
" @%p21 bra $Lt_1_21506;\n"
"$Lt_1_20994:\n"
" add.s32 %r71, %r71, 1;\n"
" setp.ne.s32 %p22, %r47, %r71;\n"
" @%p22 bra $Lt_1_20738;\n"
"$Lt_1_20226:\n"
"$Lt_1_19714:\n"
" .loc 17 172 0\n"
" bar.sync 0;\n"
" @!%p2 bra $Lt_1_26626;\n"
" .loc 17 174 0\n"
" ld.shared.f64 %fd18, [%rd19+256];\n"
" st.shared.f64 [%rd19+0], %fd18;\n"
" .loc 17 175 0\n"
" mov.f64 %fd19, 0d0000000000000000; \n"
" st.shared.f64 [%rd19+256], %fd19;\n"
" bra.uni $Lt_1_26370;\n"
"$Lt_1_26626:\n"
" .loc 17 177 0\n"
" mov.f64 %fd20, 0d0000000000000000; \n"
" st.shared.f64 [%rd19+0], %fd20;\n"
"$Lt_1_26370:\n"
" @!%p8 bra $Lt_1_26882;\n"
" mov.s32 %r104, %r2;\n"
" cvt.s64.s32 %rd48, %r4;\n"
" mov.s32 %r105, %r11;\n"
" add.s32 %r106, %r11, %r2;\n"
" mul.wide.s32 %rd49, %r4, 8;\n"
" add.u64 %rd50, %rd20, %rd49;\n"
" mov.s64 %rd51, %rd19;\n"
" mov.s32 %r107, %r104;\n"
"$Lt_1_27394:\n"
" .loc 17 180 0\n"
" ld.shared.f64 %fd21, [%rd50+0];\n"
" ld.shared.f64 %fd22, [%rd51+0];\n"
" add.f64 %fd23, %fd21, %fd22;\n"
" st.shared.f64 [%rd51+0], %fd23;\n"
" .loc 17 181 0\n"
" bar.sync 0;\n"
" add.s32 %r105, %r105, 1;\n"
" add.u64 %rd51, %rd51, 8;\n"
" add.u64 %rd50, %rd50, 512;\n"
" setp.ne.s32 %p23, %r105, %r106;\n"
" @%p23 bra $Lt_1_27394;\n"
"$Lt_1_26882:\n"
" set.lt.u32.s32 %r108, %r66, %r17;\n"
" neg.s32 %r109, %r108;\n"
" and.b32 %r110, %r61, %r109;\n"
" mov.u32 %r111, 0;\n"
" setp.eq.s32 %p24, %r110, %r111;\n"
" @%p24 bra $Lt_1_27906;\n"
" .loc 17 185 0\n"
" ld.shared.f64 %fd24, [%rd19+0];\n"
" ld.param.u64 %rd52, [__cudaparm_make_rho_brick];\n"
" add.s32 %r112, %r11, %r58;\n"
" mul.lo.s32 %r113, %r37, %r17;\n"
" mul.lo.s32 %r114, %r38, %r113;\n"
" mul.lo.s32 %r115, %r48, %r17;\n"
" add.s32 %r116, %r114, %r115;\n"
" add.s32 %r117, %r112, %r116;\n"
" cvt.s64.s32 %rd53, %r117;\n"
" mul.wide.s32 %rd54, %r117, 8;\n"
" add.u64 %rd55, %rd52, %rd54;\n"
" st.global.f64 [%rd55+0], %fd24;\n"
"$Lt_1_27906:\n"
" add.s32 %r58, %r58, 32;\n"
" setp.ne.s32 %p25, %r58, %r59;\n"
" @%p25 bra $Lt_1_18434;\n"
"$Lt_1_17922:\n"
" .loc 17 189 0\n"
" exit;\n"
"$LDWend_make_rho:\n"
" }\n"
/*
 * Kernel interp: one thread per particle index
 * (ctaid.x * ntid.x + tid.x, guarded by nlocal). Builds per-thread weight
 * tables in shared memory from rho_coeff, walks an order^3 neighbourhood
 * of brick (32-byte elements) and stores a 16-byte float4 to ans.
 * NOTE(review): the fourth stored component comes from %f17, which this
 * kernel never writes, so its value is undefined.
 */
" .entry interp (\n"
" .param .u64 __cudaparm_interp_x_,\n"
" .param .u64 __cudaparm_interp_q_,\n"
" .param .s32 __cudaparm_interp_nlocal,\n"
" .param .u64 __cudaparm_interp_brick,\n"
" .param .u64 __cudaparm_interp__rho_coeff,\n"
" .param .s32 __cudaparm_interp_npts_x,\n"
" .param .s32 __cudaparm_interp_npts_yx,\n"
" .param .f64 __cudaparm_interp_b_lo_x,\n"
" .param .f64 __cudaparm_interp_b_lo_y,\n"
" .param .f64 __cudaparm_interp_b_lo_z,\n"
" .param .f64 __cudaparm_interp_delxinv,\n"
" .param .f64 __cudaparm_interp_delyinv,\n"
" .param .f64 __cudaparm_interp_delzinv,\n"
" .param .s32 __cudaparm_interp_order,\n"
" .param .s32 __cudaparm_interp_order2,\n"
" .param .f64 __cudaparm_interp_qqrd2e_scale,\n"
" .param .u64 __cudaparm_interp_ans)\n"
" {\n"
" .reg .u32 %r<56>;\n"
" .reg .u64 %rd<37>;\n"
" .reg .f32 %f<19>;\n"
" .reg .f64 %fd<63>;\n"
" .reg .pred %p<14>;\n"
" .shared .align 8 .b8 __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568[512];\n"
" .shared .align 8 .b8 __cuda___cuda_local_var_32677_34_non_const_rho1d_06080[4096];\n"
" .shared .align 8 .b8 __cuda___cuda_local_var_32678_34_non_const_rho1d_110176[4096];\n"
" .loc 17 199 0\n"
"$LDWbegin_interp:\n"
" ld.param.s32 %r1, [__cudaparm_interp_order2];\n"
" ld.param.s32 %r2, [__cudaparm_interp_order];\n"
" add.s32 %r3, %r1, %r2;\n"
" cvt.s32.u32 %r4, %tid.x;\n"
" setp.le.s32 %p1, %r3, %r4;\n"
" @%p1 bra $Lt_2_8706;\n"
" .loc 17 206 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568;\n"
" cvt.s64.s32 %rd2, %r4;\n"
" mul.wide.s32 %rd3, %r4, 8;\n"
" ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.f64 %fd1, [%rd5+0];\n"
" add.u64 %rd6, %rd3, %rd1;\n"
" st.shared.f64 [%rd6+0], %fd1;\n"
"$Lt_2_8706:\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568;\n"
" .loc 17 207 0\n"
" bar.sync 0;\n"
" mov.u32 %r5, %ctaid.x;\n"
" mov.u32 %r6, %ntid.x;\n"
" mul.lo.u32 %r7, %r5, %r6;\n"
" add.u32 %r8, %r4, %r7;\n"
" ld.param.s32 %r9, [__cudaparm_interp_nlocal];\n"
" setp.le.s32 %p2, %r9, %r8;\n"
" @%p2 bra $Lt_2_9218;\n"
" .loc 17 215 0\n"
" mov.u32 %r10, %r8;\n"
" mov.s32 %r11, 0;\n"
" mov.u32 %r12, %r11;\n"
" mov.s32 %r13, 0;\n"
" mov.u32 %r14, %r13;\n"
" mov.s32 %r15, 0;\n"
" mov.u32 %r16, %r15;\n"
" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r10,%r12,%r14,%r16}];\n"
" mov.f32 %f5, %f1;\n"
" mov.f32 %f6, %f2;\n"
" mov.f32 %f7, %f3;\n"
" .loc 17 216 0\n"
" mov.u32 %r17, %r8;\n"
" mov.s32 %r18, 0;\n"
" mov.u32 %r19, %r18;\n"
" mov.s32 %r20, 0;\n"
" mov.u32 %r21, %r20;\n"
" mov.s32 %r22, 0;\n"
" mov.u32 %r23, %r22;\n"
" tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r17,%r19,%r21,%r23}];\n"
" mov.f32 %f12, %f8;\n"
" cvt.ftz.f64.f32 %fd2, %f12;\n"
" ld.param.f64 %fd3, [__cudaparm_interp_qqrd2e_scale];\n"
" mul.f64 %fd4, %fd2, %fd3;\n"
" mov.f64 %fd5, 0d0000000000000000; \n"
" setp.neu.f64 %p3, %fd4, %fd5;\n"
" @!%p3 bra $Lt_2_9986;\n"
" mov.s32 %r24, 0;\n"
" setp.gt.s32 %p4, %r2, %r24;\n"
" ld.param.f64 %fd6, [__cudaparm_interp_delxinv];\n"
" cvt.ftz.f64.f32 %fd7, %f5;\n"
" ld.param.f64 %fd8, [__cudaparm_interp_b_lo_x];\n"
" sub.f64 %fd9, %fd7, %fd8;\n"
" mul.f64 %fd10, %fd6, %fd9;\n"
" @!%p4 bra $Lt_2_16386;\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32677_34_non_const_rho1d_06080;\n"
" mov.u64 %rd8, __cuda___cuda_local_var_32678_34_non_const_rho1d_110176;\n"
" cvt.rzi.s32.f64 %r25, %fd10;\n"
" cvt.rn.f64.s32 %fd11, %r25;\n"
" mov.f64 %fd12, 0d3fe0000000000000; \n"
" add.f64 %fd13, %fd11, %fd12;\n"
" sub.f64 %fd14, %fd13, %fd10;\n"
" ld.param.f64 %fd15, [__cudaparm_interp_delyinv];\n"
" cvt.ftz.f64.f32 %fd16, %f6;\n"
" ld.param.f64 %fd17, [__cudaparm_interp_b_lo_y];\n"
" sub.f64 %fd18, %fd16, %fd17;\n"
" mul.f64 %fd19, %fd15, %fd18;\n"
" cvt.rzi.s32.f64 %r26, %fd19;\n"
" cvt.rn.f64.s32 %fd20, %r26;\n"
" mov.f64 %fd21, 0d3fe0000000000000; \n"
" add.f64 %fd22, %fd20, %fd21;\n"
" sub.f64 %fd23, %fd22, %fd19;\n"
" mov.s32 %r27, %r2;\n"
" cvt.s64.s32 %rd9, %r4;\n"
" mov.s32 %r28, %r1;\n"
" mul.wide.s32 %rd3, %r4, 8;\n"
" add.u64 %rd10, %rd3, %rd7;\n"
" add.u64 %rd11, %rd3, %rd8;\n"
" mov.s32 %r29, 0;\n"
" mov.s32 %r30, %r27;\n"
"$Lt_2_10754:\n"
" .loc 17 235 0\n"
" mov.f64 %fd24, 0d0000000000000000; \n"
" mov.f64 %fd25, 0d0000000000000000; \n"
" st.shared.f64 [%rd10+0], %fd25;\n"
" .loc 17 236 0\n"
" mov.f64 %fd26, 0d0000000000000000; \n"
" mov.f64 %fd27, 0d0000000000000000; \n"
" st.shared.f64 [%rd11+0], %fd27;\n"
" .loc 17 237 0\n"
" mov.s32 %r31, %r28;\n"
" setp.lt.s32 %p5, %r28, %r29;\n"
" @%p5 bra $Lt_2_11010;\n"
" cvt.s64.s32 %rd12, %r2;\n"
" mul.wide.s32 %rd13, %r2, 8;\n"
" cvt.s64.s32 %rd14, %r28;\n"
" mul.wide.s32 %rd15, %r28, 8;\n"
" add.u64 %rd16, %rd1, %rd15;\n"
"$Lt_2_11522:\n"
" .loc 17 238 0\n"
" ld.shared.f64 %fd28, [%rd16+0];\n"
" mad.rn.f64 %fd24, %fd24, %fd14, %fd28;\n"
" st.shared.f64 [%rd10+0], %fd24;\n"
" .loc 17 239 0\n"
" mad.rn.f64 %fd26, %fd26, %fd23, %fd28;\n"
" st.shared.f64 [%rd11+0], %fd26;\n"
" sub.s32 %r31, %r31, %r2;\n"
" sub.u64 %rd16, %rd16, %rd13;\n"
" setp.ge.s32 %p6, %r31, %r29;\n"
" @%p6 bra $Lt_2_11522;\n"
"$Lt_2_11010:\n"
" add.s32 %r29, %r29, 1;\n"
" add.s32 %r28, %r28, 1;\n"
" add.u64 %rd11, %rd11, 512;\n"
" add.u64 %rd10, %rd10, 512;\n"
" setp.ne.s32 %p7, %r28, %r3;\n"
" @%p7 bra $Lt_2_10754;\n"
" bra.uni $Lt_2_10242;\n"
"$Lt_2_16386:\n"
" cvt.rzi.s32.f64 %r25, %fd10;\n"
" mov.u64 %rd8, __cuda___cuda_local_var_32678_34_non_const_rho1d_110176;\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32677_34_non_const_rho1d_06080;\n"
"$Lt_2_10242:\n"
" .loc 17 243 0\n"
" ld.param.f64 %fd29, [__cudaparm_interp_delzinv];\n"
" cvt.ftz.f64.f32 %fd30, %f7;\n"
" ld.param.f64 %fd31, [__cudaparm_interp_b_lo_z];\n"
" sub.f64 %fd32, %fd30, %fd31;\n"
" mul.f64 %fd33, %fd29, %fd32;\n"
" cvt.rzi.s32.f64 %r32, %fd33;\n"
" ld.param.s32 %r33, [__cudaparm_interp_npts_yx];\n"
" mul.lo.s32 %r34, %r32, %r33;\n"
" add.s32 %r35, %r25, %r34;\n"
" @!%p4 bra $Lt_2_16898;\n"
" cvt.rn.f64.s32 %fd34, %r32;\n"
" mov.f64 %fd35, 0d3fe0000000000000; \n"
" add.f64 %fd36, %fd34, %fd35;\n"
" sub.f64 %fd37, %fd36, %fd33;\n"
" mov.s32 %r36, %r2;\n"
" cvt.ftz.f64.f32 %fd38, %f6;\n"
" cvt.s64.s32 %rd17, %r4;\n"
" ld.param.f64 %fd39, [__cudaparm_interp_delyinv];\n"
" ld.param.f64 %fd40, [__cudaparm_interp_b_lo_y];\n"
" sub.f64 %fd41, %fd38, %fd40;\n"
" mul.f64 %fd42, %fd39, %fd41;\n"
" cvt.rzi.s32.f64 %r37, %fd42;\n"
" mul.wide.s32 %rd3, %r4, 8;\n"
" ld.param.s32 %r38, [__cudaparm_interp_npts_x];\n"
" mul.lo.s32 %r39, %r37, %r38;\n"
" add.u64 %rd18, %rd3, %rd7;\n"
" add.u64 %rd19, %rd3, %rd8;\n"
" cvt.s64.s32 %rd20, %r38;\n"
" mul.wide.s32 %rd21, %r38, 32;\n"
" add.s32 %r40, %r39, %r35;\n"
" mov.s32 %r41, %r40;\n"
" ld.param.u64 %rd22, [__cudaparm_interp_brick];\n"
" mov.s32 %r42, 0;\n"
" mov.f32 %f13, 0f00000000; \n"
" mov.f32 %f14, 0f00000000; \n"
" mov.f32 %f15, 0f00000000; \n"
" mov.s32 %r43, %r36;\n"
"$Lt_2_12802:\n"
" .loc 17 246 0\n"
" add.s32 %r44, %r42, %r1;\n"
" mov.s32 %r45, %r44;\n"
" setp.lt.s32 %p8, %r44, %r42;\n"
" @%p8 bra $Lt_2_17154;\n"
" cvt.s64.s32 %rd23, %r2;\n"
" mul.wide.s32 %rd13, %r2, 8;\n"
" cvt.s64.s32 %rd24, %r44;\n"
" mul.wide.s32 %rd25, %r44, 8;\n"
" add.u64 %rd26, %rd1, %rd25;\n"
" mov.f64 %fd43, 0d0000000000000000; \n"
"$Lt_2_13570:\n"
" .loc 17 247 0\n"
" ld.shared.f64 %fd44, [%rd26+0];\n"
" mad.rn.f64 %fd43, %fd37, %fd43, %fd44;\n"
" sub.s32 %r45, %r45, %r2;\n"
" sub.u64 %rd26, %rd26, %rd13;\n"
" setp.ge.s32 %p9, %r45, %r42;\n"
" @%p9 bra $Lt_2_13570;\n"
" bra.uni $Lt_2_13058;\n"
"$Lt_2_17154:\n"
" mov.f64 %fd43, 0d0000000000000000; \n"
"$Lt_2_13058:\n"
" .loc 17 249 0\n"
" mov.s32 %r46, %r41;\n"
" mov.s32 %r47, %r2;\n"
" mov.s32 %r48, %r46;\n"
" mul.f64 %fd45, %fd4, %fd43;\n"
" mov.s64 %rd27, %rd19;\n"
" cvt.s64.s32 %rd28, %r46;\n"
" mul.wide.s32 %rd29, %r46, 32;\n"
" mov.s32 %r49, 0;\n"
" mov.s32 %r50, %r47;\n"
"$Lt_2_14594:\n"
" mov.s32 %r51, %r2;\n"
" mov.s32 %r52, %r48;\n"
" add.s32 %r53, %r48, %r2;\n"
" mov.s64 %rd30, %rd18;\n"
" ld.shared.f64 %fd46, [%rd27+0];\n"
" add.u64 %rd31, %rd29, %rd22;\n"
" mul.f64 %fd47, %fd45, %fd46;\n"
" mov.s32 %r54, %r51;\n"
"$Lt_2_15362:\n"
" .loc 17 253 0\n"
" ld.shared.f64 %fd48, [%rd30+0];\n"
" mul.f64 %fd49, %fd48, %fd47;\n"
" .loc 17 255 0\n"
" cvt.ftz.f64.f32 %fd50, %f15;\n"
" ld.global.v2.f64 {%fd51,%fd52}, [%rd31+0];\n"
" mul.f64 %fd53, %fd49, %fd51;\n"
" sub.f64 %fd54, %fd50, %fd53;\n"
" cvt.rn.ftz.f32.f64 %f15, %fd54;\n"
" .loc 17 256 0\n"
" cvt.ftz.f64.f32 %fd55, %f14;\n"
" mul.f64 %fd56, %fd49, %fd52;\n"
" sub.f64 %fd57, %fd55, %fd56;\n"
" cvt.rn.ftz.f32.f64 %f14, %fd57;\n"
" .loc 17 257 0\n"
" cvt.ftz.f64.f32 %fd58, %f13;\n"
" ld.global.f64 %fd59, [%rd31+16];\n"
" mul.f64 %fd60, %fd49, %fd59;\n"
" sub.f64 %fd61, %fd58, %fd60;\n"
" cvt.rn.ftz.f32.f64 %f13, %fd61;\n"
" add.s32 %r52, %r52, 1;\n"
" add.u64 %rd31, %rd31, 32;\n"
" add.u64 %rd30, %rd30, 512;\n"
" setp.ne.s32 %p10, %r52, %r53;\n"
" @%p10 bra $Lt_2_15362;\n"
" add.s32 %r49, %r49, 1;\n"
" add.s32 %r48, %r48, %r38;\n"
" add.u64 %rd29, %rd29, %rd21;\n"
" add.u64 %rd27, %rd27, 512;\n"
" setp.ne.s32 %p11, %r49, %r2;\n"
" @%p11 bra $Lt_2_14594;\n"
" add.s32 %r42, %r42, 1;\n"
" add.s32 %r41, %r46, %r33;\n"
" setp.ne.s32 %p12, %r42, %r2;\n"
" @%p12 bra $Lt_2_12802;\n"
" bra.uni $Lt_2_9730;\n"
"$Lt_2_16898:\n"
" mov.f32 %f13, 0f00000000; \n"
" mov.f32 %f14, 0f00000000; \n"
" mov.f32 %f15, 0f00000000; \n"
" bra.uni $Lt_2_9730;\n"
"$Lt_2_9986:\n"
" mov.f32 %f13, 0f00000000; \n"
" mov.f32 %f14, 0f00000000; \n"
" mov.f32 %f15, 0f00000000; \n"
"$Lt_2_9730:\n"
" .loc 17 264 0\n"
" ld.param.u64 %rd32, [__cudaparm_interp_ans];\n"
" cvt.s64.s32 %rd33, %r8;\n"
" mul.wide.s32 %rd34, %r8, 16;\n"
" add.u64 %rd35, %rd32, %rd34;\n"
" mov.f32 %f16, %f17;\n"
" st.global.v4.f32 [%rd35+0], {%f15,%f14,%f13,%f16};\n"
"$Lt_2_9218:\n"
" .loc 17 266 0\n"
" exit;\n"
"$LDWend_interp:\n"
" }\n"
;

View File

@ -1,881 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00009b0b_00000000-9_lal_pppm.cpp3.i (/home/sjplimp/ccBI#.wCkpTI)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00009b0b_00000000-8_lal_pppm.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 17 "lal_pppm.cu"
.file 18 "/usr/local/cuda/include/common_functions.h"
.file 19 "/usr/local/cuda/include/math_functions.h"
.file 20 "/usr/local/cuda/include/math_constants.h"
.file 21 "/usr/local/cuda/include/device_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
// Module-scope texture references. In this file both are read with
// tex.1d.v4.f32.s32 using a single integer index (remaining coordinates 0):
// pos_tex by particle_map and interp (first three returned floats are used),
// q_tex by the same two kernels (only the first returned float is used).
.global .texref pos_tex;
.global .texref q_tex;
// particle_map
// Each thread handles one remapped index ii. It fetches a 4-float position
// from pos_tex and a value from q_tex, converts the position to integer grid
// cell indices, atomically reserves a slot in counts[cell] and writes a
// 4-float record {off_x, off_y, off_z, delvolinv*q} into ans.
//
// Parameters (as used by the code below):
//   x_, q_              never loaded in this body; data are read through the
//                       pos_tex / q_tex texture references instead.
//   delvolinv           multiplied with the value fetched from q_tex.
//   nlocal              threads whose ii >= nlocal exit immediately.
//   counts              s32 array, one counter per cell, updated atomically.
//   ans                 array of 16-byte records, index atom_stride*slot+cell.
//   b_lo_{x,y,z}        subtracted from the fetched position.
//   del{x,y,z}inv       scale applied after the subtraction.
//   nlocal_{x,y,z}      exclusive upper bounds for the integer cell indices.
//   atom_stride         record stride between successive slots of one cell.
//   max_atoms           slot index must be strictly below this value.
//   error               s32 flag: 1 = position outside the grid,
//                       2 = cell already holds max_atoms entries.
.entry particle_map (
.param .u64 __cudaparm_particle_map_x_,
.param .u64 __cudaparm_particle_map_q_,
.param .f32 __cudaparm_particle_map_delvolinv,
.param .s32 __cudaparm_particle_map_nlocal,
.param .u64 __cudaparm_particle_map_counts,
.param .u64 __cudaparm_particle_map_ans,
.param .f32 __cudaparm_particle_map_b_lo_x,
.param .f32 __cudaparm_particle_map_b_lo_y,
.param .f32 __cudaparm_particle_map_b_lo_z,
.param .f32 __cudaparm_particle_map_delxinv,
.param .f32 __cudaparm_particle_map_delyinv,
.param .f32 __cudaparm_particle_map_delzinv,
.param .s32 __cudaparm_particle_map_nlocal_x,
.param .s32 __cudaparm_particle_map_nlocal_y,
.param .s32 __cudaparm_particle_map_nlocal_z,
.param .s32 __cudaparm_particle_map_atom_stride,
.param .s32 __cudaparm_particle_map_max_atoms,
.param .u64 __cudaparm_particle_map_error)
{
.reg .u32 %r<50>;
.reg .u64 %rd<12>;
.reg .f32 %f<44>;
.reg .pred %p<11>;
.loc 17 50 0
$LDWbegin_particle_map:
// g = ctaid.x*ntid.x + tid.x, T = nctaid.x*ntid.x (mul24 uses only the low
// 24 bits of each operand). Remapped index:
//   ii = %r12 = g*64 - ((g*64) / T) * (T - 1)
// NOTE(review): 64 is a literal here; presumably a block-size macro in the
// .cu source, and the remap presumably spreads neighbouring threads over
// distant particles to reduce atomic contention -- confirm in lal_pppm.cu.
cvt.s32.u32 %r1, %ntid.x;
cvt.s32.u32 %r2, %ctaid.x;
mul24.lo.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %nctaid.x;
mul24.lo.s32 %r5, %r4, %r1;
mov.u32 %r6, %tid.x;
add.u32 %r7, %r3, %r6;
sub.s32 %r8, %r5, 1;
mul.lo.s32 %r9, %r7, 64;
div.s32 %r10, %r9, %r5;
mul.lo.s32 %r11, %r8, %r10;
sub.s32 %r12, %r9, %r11;
// Exit when nlocal <= ii.
ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];
setp.le.s32 %p1, %r13, %r12;
@%p1 bra $Lt_0_7426;
.loc 17 62 0
// Position fetch: %f5,%f6,%f7 = first three floats of pos_tex[ii].
mov.u32 %r14, %r12;
mov.s32 %r15, 0;
mov.u32 %r16, %r15;
mov.s32 %r17, 0;
mov.u32 %r18, %r17;
mov.s32 %r19, 0;
mov.u32 %r20, %r19;
tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}];
mov.f32 %f5, %f1;
mov.f32 %f6, %f2;
mov.f32 %f7, %f3;
.loc 17 64 0
// %f14 = delvolinv * q_tex[ii].x; the thread exits when this product is
// exactly zero (setp.neu is true for "not equal or unordered").
mov.u32 %r21, %r12;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
mov.s32 %r26, 0;
mov.u32 %r27, %r26;
tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}];
mov.f32 %f12, %f8;
ld.param.f32 %f13, [__cudaparm_particle_map_delvolinv];
mul.ftz.f32 %f14, %f13, %f12;
mov.f32 %f15, 0f00000000; // 0
setp.neu.ftz.f32 %p2, %f14, %f15;
@!%p2 bra $Lt_0_7426;
.loc 17 67 0
// Scaled coordinates: %f19 = (x - b_lo_x)*delxinv, %f24 and %f29 likewise
// for y and z. Any negative coordinate jumps to the error path.
ld.param.f32 %f16, [__cudaparm_particle_map_b_lo_x];
sub.ftz.f32 %f17, %f5, %f16;
ld.param.f32 %f18, [__cudaparm_particle_map_delxinv];
mul.ftz.f32 %f19, %f18, %f17;
mov.f32 %f20, 0f00000000; // 0
setp.lt.ftz.f32 %p3, %f19, %f20;
@%p3 bra $Lt_0_8706;
ld.param.f32 %f21, [__cudaparm_particle_map_b_lo_y];
sub.ftz.f32 %f22, %f6, %f21;
ld.param.f32 %f23, [__cudaparm_particle_map_delyinv];
mul.ftz.f32 %f24, %f23, %f22;
mov.f32 %f25, 0f00000000; // 0
setp.lt.ftz.f32 %p4, %f24, %f25;
@%p4 bra $Lt_0_8706;
ld.param.f32 %f26, [__cudaparm_particle_map_b_lo_z];
sub.ftz.f32 %f27, %f7, %f26;
ld.param.f32 %f28, [__cudaparm_particle_map_delzinv];
mul.ftz.f32 %f29, %f28, %f27;
mov.f32 %f30, 0f00000000; // 0
setp.lt.ftz.f32 %p5, %f29, %f30;
@%p5 bra $Lt_0_8706;
// Integer cell indices by truncation toward zero: nx=%r28, ny=%r30, nz=%r32.
// Each must be strictly below nlocal_x / nlocal_y / nlocal_z.
cvt.rzi.ftz.s32.f32 %r28, %f19;
ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];
setp.ge.s32 %p6, %r28, %r29;
@%p6 bra $Lt_0_8706;
cvt.rzi.ftz.s32.f32 %r30, %f24;
ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];
setp.ge.s32 %p7, %r30, %r31;
@%p7 bra $Lt_0_8706;
cvt.rzi.ftz.s32.f32 %r32, %f29;
ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];
setp.gt.s32 %p8, %r33, %r32;
@%p8 bra $L_0_4866;
$Lt_0_8706:
$L_0_5122:
.loc 17 76 0
// Out-of-range path: *error = 1, then exit.
mov.s32 %r34, 1;
ld.param.u64 %rd1, [__cudaparm_particle_map_error];
st.global.s32 [%rd1+0], %r34;
bra.uni $Lt_0_7426;
$L_0_4866:
.loc 17 83 0
// cell = %r38 = (nz*nlocal_y + ny)*nlocal_x + nx.
// slot = %r41 = previous value of counts[cell] returned by the atomic +1.
mul.lo.s32 %r35, %r32, %r31;
add.s32 %r36, %r30, %r35;
mul.lo.s32 %r37, %r36, %r29;
add.s32 %r38, %r28, %r37;
ld.param.u64 %rd2, [__cudaparm_particle_map_counts];
cvt.s64.s32 %rd3, %r38;
mul.wide.s32 %rd4, %r38, 4;
add.u64 %rd5, %rd2, %rd4;
mov.s32 %r39, 1;
atom.global.add.s32 %r40, [%rd5], %r39;
mov.s32 %r41, %r40;
ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];
setp.gt.s32 %p9, %r42, %r41;
@%p9 bra $Lt_0_7682;
.loc 17 85 0
// Cell full (slot >= max_atoms): *error = 2 and undo the increment.
mov.s32 %r43, 2;
ld.param.u64 %rd6, [__cudaparm_particle_map_error];
st.global.s32 [%rd6+0], %r43;
.loc 16 118 0
mov.s32 %r44, -1;
atom.global.add.s32 %r45, [%rd5], %r44;
bra.uni $Lt_0_7426;
$Lt_0_7682:
.loc 17 88 0
// Store the 16-byte record at ans[atom_stride*slot + cell]:
//   {nx+0.5-%f19, ny+0.5-%f24, nz+0.5-%f29, %f14}
ld.param.u64 %rd7, [__cudaparm_particle_map_ans];
ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];
mul.lo.s32 %r47, %r46, %r41;
add.s32 %r48, %r38, %r47;
cvt.s64.s32 %rd8, %r48;
mul.wide.s32 %rd9, %r48, 16;
add.u64 %rd10, %rd7, %rd9;
cvt.rn.f32.s32 %f31, %r28;
mov.f32 %f32, 0f3f000000; // 0.5
add.ftz.f32 %f33, %f31, %f32;
sub.ftz.f32 %f34, %f33, %f19;
cvt.rn.f32.s32 %f35, %r30;
mov.f32 %f36, 0f3f000000; // 0.5
add.ftz.f32 %f37, %f35, %f36;
sub.ftz.f32 %f38, %f37, %f24;
cvt.rn.f32.s32 %f39, %r32;
mov.f32 %f40, 0f3f000000; // 0.5
add.ftz.f32 %f41, %f39, %f40;
sub.ftz.f32 %f42, %f41, %f29;
st.global.v4.f32 [%rd10+0], {%f34,%f38,%f42,%f14};
// Common exit for every path.
$Lt_0_7426:
$L_0_4610:
$Lt_0_6914:
$Lt_0_6402:
.loc 17 92 0
exit;
$LDWend_particle_map:
} // particle_map
// make_rho
// Sums weighted contributions of the 4-float records in atoms[] onto a line
// of grid points and writes the results to brick[].
//
// Thread layout used below: pid = tid/32, fid = tid%32. A block covers two
// (ny,nz) lines: bt = ctaid.x*2 + pid, ny = bt % npts_y, nz = bt / npts_y.
// The line is walked in steps of 32 points along x.
//
// Shared memory:
//   rho_coeff (256 B)  first order2+order floats of _rho_coeff.
//   front     (320 B)  rows of 40 floats per pid; floats 32.. of a row are
//                      the carry-over ("+128 bytes") region.
//   ans       (2048 B) rows spaced 256 bytes apart, one float per thread,
//                      row n holds the contribution for x offset n.
//
// Parameters (as used by the code below):
//   counts       s32 per-cell entry counts.
//   atoms        16-byte records; record index advances by atom_stride.
//   brick        f32 output, index nz*npts_y*npts_x + ny*npts_x + nx.
//   _rho_coeff   f32 polynomial coefficients copied to shared memory.
//   npts_{x,y,z} output grid extents; nlocal_{x,y,z} cell-grid extents.
//   order_m_1, order, order2  loop bounds / strides of the polynomial
//                evaluation (the coefficient index steps by `order`).
.entry make_rho (
.param .u64 __cudaparm_make_rho_counts,
.param .u64 __cudaparm_make_rho_atoms,
.param .u64 __cudaparm_make_rho_brick,
.param .u64 __cudaparm_make_rho__rho_coeff,
.param .s32 __cudaparm_make_rho_atom_stride,
.param .s32 __cudaparm_make_rho_npts_x,
.param .s32 __cudaparm_make_rho_npts_y,
.param .s32 __cudaparm_make_rho_npts_z,
.param .s32 __cudaparm_make_rho_nlocal_x,
.param .s32 __cudaparm_make_rho_nlocal_y,
.param .s32 __cudaparm_make_rho_nlocal_z,
.param .s32 __cudaparm_make_rho_order_m_1,
.param .s32 __cudaparm_make_rho_order,
.param .s32 __cudaparm_make_rho_order2)
{
.reg .u32 %r<119>;
.reg .u64 %rd<57>;
.reg .f32 %f<26>;
.reg .pred %p<27>;
.shared .align 4 .b8 __cuda___cuda_local_var_32578_33_non_const_rho_coeff168[256];
.shared .align 4 .b8 __cuda___cuda_local_var_32579_33_non_const_front424[320];
.shared .align 4 .b8 __cuda___cuda_local_var_32580_33_non_const_ans744[2048];
.loc 17 101 0
$LDWbegin_make_rho:
// %r1 = order2, %r2 = order, %r3 = order2+order, %r4 = tid.
// Threads with tid < order2+order copy _rho_coeff[tid] into shared memory.
ld.param.s32 %r1, [__cudaparm_make_rho_order2];
ld.param.s32 %r2, [__cudaparm_make_rho_order];
add.s32 %r3, %r1, %r2;
cvt.s32.u32 %r4, %tid.x;
setp.le.s32 %p1, %r3, %r4;
@%p1 bra $Lt_1_16898;
.loc 17 108 0
mov.u64 %rd1, __cuda___cuda_local_var_32578_33_non_const_rho_coeff168;
cvt.s64.s32 %rd2, %r4;
mul.wide.s32 %rd3, %r4, 4;
ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_16898:
mov.u64 %rd1, __cuda___cuda_local_var_32578_33_non_const_rho_coeff168;
// Signed divide by 32 (shr/and/add/shr sequence): %r9 = tid/32,
// fid = %r11 = tid - 32*(tid/32). %p2 = (fid < order).
shr.s32 %r5, %r4, 31;
mov.s32 %r6, 31;
and.b32 %r7, %r5, %r6;
add.s32 %r8, %r7, %r4;
shr.s32 %r9, %r8, 5;
mul.lo.s32 %r10, %r9, 32;
sub.s32 %r11, %r4, %r10;
setp.lt.s32 %p2, %r11, %r2;
@!%p2 bra $Lt_1_17410;
.loc 17 114 0
// For fid < order: front[pid][fid+32] = 0 (row stride 40 floats, +128 B).
mov.u64 %rd7, __cuda___cuda_local_var_32579_33_non_const_front424;
mov.f32 %f2, 0f00000000; // 0
cvt.s64.s32 %rd8, %r11;
shr.s32 %r12, %r4, 31;
mov.s32 %r13, 31;
and.b32 %r14, %r12, %r13;
add.s32 %r15, %r14, %r4;
shr.s32 %r16, %r15, 5;
cvt.s64.s32 %rd9, %r16;
mul.wide.s32 %rd10, %r16, 40;
add.u64 %rd11, %rd8, %rd10;
mul.lo.u64 %rd12, %rd11, 4;
add.u64 %rd13, %rd7, %rd12;
st.shared.f32 [%rd13+128], %f2;
$Lt_1_17410:
mov.u64 %rd7, __cuda___cuda_local_var_32579_33_non_const_front424;
.loc 17 116 0
// Barrier after the shared-memory initialisation above.
bar.sync 0;
// loop_count = %r23 = npts_x/32 + 1; the kernel exits if it is <= 0.
ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];
shr.s32 %r18, %r17, 31;
mov.s32 %r19, 31;
and.b32 %r20, %r18, %r19;
add.s32 %r21, %r20, %r17;
shr.s32 %r22, %r21, 5;
add.s32 %r23, %r22, 1;
mov.u32 %r24, 0;
setp.le.s32 %p3, %r23, %r24;
@%p3 bra $Lt_1_17922;
// Loop-invariant setup:
//   pid = %r29, bt = %r36 = pid + 2*ctaid.x
//   nz = %r38 = bt / npts_y, ny = %r48 = bt % npts_y
//   z_start = %r42 = (nz < order_m_1) ? order_m_1 - nz : 0
//   z_stop  = %r47 = (nz >= nlocal_z) ? nlocal_z - nz + order - 1 : order
//   y_start = %r51, y_stop = %r55: same formulas with ny / nlocal_y
//   %r33 = nlocal_y*nlocal_x, %p8 = (order > 0)
//   %r58 = running x offset (0,32,...), %r59 = loop_count*32 (end value)
//   %r61 = (npts_z > nz) as 0/1, %rd19 = &front[pid][fid], %rd20 = ans
shr.s32 %r25, %r4, 31;
mov.s32 %r26, 31;
and.b32 %r27, %r25, %r26;
add.s32 %r28, %r27, %r4;
shr.s32 %r29, %r28, 5;
add.s32 %r30, %r11, 32;
ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];
ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];
mul.lo.s32 %r33, %r31, %r32;
mov.u32 %r34, %ctaid.x;
mul.lo.u32 %r35, %r34, 2;
add.u32 %r36, %r29, %r35;
ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];
div.s32 %r38, %r36, %r37;
ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];
setp.lt.s32 %p4, %r38, %r39;
sub.s32 %r40, %r39, %r38;
mov.s32 %r41, 0;
selp.s32 %r42, %r40, %r41, %p4;
ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];
setp.ge.s32 %p5, %r38, %r43;
sub.s32 %r44, %r43, %r38;
add.s32 %r45, %r44, %r2;
sub.s32 %r46, %r45, 1;
selp.s32 %r47, %r46, %r2, %p5;
rem.s32 %r48, %r36, %r37;
setp.lt.s32 %p6, %r48, %r39;
sub.s32 %r49, %r39, %r48;
mov.s32 %r50, 0;
selp.s32 %r51, %r49, %r50, %p6;
setp.ge.s32 %p7, %r48, %r31;
sub.s32 %r52, %r31, %r48;
add.s32 %r53, %r52, %r2;
sub.s32 %r54, %r53, 1;
selp.s32 %r55, %r54, %r2, %p7;
mov.s32 %r56, %r23;
mov.s32 %r57, 0;
setp.gt.s32 %p8, %r2, %r57;
mov.s32 %r58, 0;
cvt.s64.s32 %rd14, %r11;
cvt.s64.s32 %rd15, %r29;
mul.lo.s32 %r59, %r23, 32;
mul.wide.s32 %rd16, %r29, 40;
add.u64 %rd17, %rd14, %rd16;
ld.param.s32 %r60, [__cudaparm_make_rho_npts_z];
setp.gt.s32 %p9, %r60, %r38;
mul.lo.u64 %rd18, %rd17, 4;
selp.s32 %r61, 1, 0, %p9;
add.u64 %rd19, %rd18, %rd7;
mov.u64 %rd20, __cuda___cuda_local_var_32580_33_non_const_ans744;
mov.s32 %r62, %r56;
// ---- Outer loop over x in steps of 32 ----
$Lt_1_18434:
//<loop> Loop body line 116, nesting depth: 1, estimated iterations: unknown
@!%p8 bra $Lt_1_18690;
// Clear this thread's column: ans[n][tid] = 0 for n in [0, order).
mov.s32 %r63, %r2;
cvt.s64.s32 %rd21, %r4;
mul.wide.s32 %rd22, %r4, 4;
add.u64 %rd23, %rd20, %rd22;
mov.s32 %r64, 0;
mov.s32 %r65, %r63;
$Lt_1_19202:
//<loop> Loop body line 116, nesting depth: 2, estimated iterations: unknown
.loc 17 140 0
mov.f32 %f3, 0f00000000; // 0
st.shared.f32 [%rd23+0], %f3;
add.s32 %r64, %r64, 1;
add.u64 %rd23, %rd23, 256;
setp.ne.s32 %p10, %r64, %r2;
@%p10 bra $Lt_1_19202;
$Lt_1_18690:
// nx = %r66 = fid + x offset. Accumulate only if nx < nlocal_x and
// nz < npts_z (set.lt yields all-ones, neg turns it into 1).
add.s32 %r66, %r11, %r58;
set.lt.u32.s32 %r67, %r66, %r32;
neg.s32 %r68, %r67;
and.b32 %r69, %r61, %r68;
mov.u32 %r70, 0;
setp.eq.s32 %p11, %r69, %r70;
@%p11 bra $Lt_1_20226;
.loc 17 143 0
// z loop: m = %r71 runs over [z_start, z_stop).
mov.s32 %r71, %r42;
setp.ge.s32 %p12, %r42, %r47;
@%p12 bra $Lt_1_20226;
sub.s32 %r72, %r47, %r42;
setp.lt.s32 %p13, %r51, %r55;
mov.s32 %r73, %r72;
$Lt_1_20738:
//<loop> Loop body line 143, nesting depth: 2, estimated iterations: unknown
.loc 17 145 0
// y loop: l = %r74 runs over [y_start, y_stop).
// %r82 = (nz + m - order_m_1) * nlocal_y*nlocal_x
// %r80 = ny + y_start - order_m_1
mov.s32 %r74, %r51;
@!%p13 bra $Lt_1_20994;
sub.s32 %r75, %r55, %r51;
sub.s32 %r76, %r71, %r42;
add.s32 %r77, %r38, %r42;
add.s32 %r78, %r48, %r51;
sub.s32 %r79, %r77, %r39;
sub.s32 %r80, %r78, %r39;
add.s32 %r81, %r76, %r79;
mul.lo.s32 %r82, %r33, %r81;
ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];
ld.param.u64 %rd24, [__cudaparm_make_rho_counts];
mov.s32 %r84, %r75;
$Lt_1_21506:
//<loop> Loop body line 145, nesting depth: 3, estimated iterations: unknown
.loc 17 147 0
// pos = %r89 = %r82 + (ny + l - order_m_1)*nlocal_x + nx
// %r91 = counts[pos] * atom_stride (upper bound for the record index).
sub.s32 %r85, %r74, %r51;
add.s32 %r86, %r85, %r80;
mul.lo.s32 %r87, %r86, %r32;
add.s32 %r88, %r82, %r87;
add.s32 %r89, %r66, %r88;
cvt.s64.s32 %rd25, %r89;
mul.wide.s32 %rd26, %r89, 4;
add.u64 %rd27, %rd24, %rd26;
ld.global.s32 %r90, [%rd27+0];
mul.lo.s32 %r91, %r90, %r83;
.loc 17 148 0
// Record loop: row = %r92 starts at pos and advances by atom_stride while
// row < %r91; %rd32 is the matching byte address atoms + 16*row.
mov.s32 %r92, %r89;
setp.ge.s32 %p14, %r89, %r91;
@%p14 bra $Lt_1_21762;
sub.s32 %r93, %r3, 1;
cvt.s64.s32 %rd28, %r83;
mul.wide.s32 %rd29, %r83, 16;
mov.s32 %r94, -1;
setp.gt.s32 %p15, %r93, %r94;
ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];
mul.lo.u64 %rd31, %rd25, 16;
add.u64 %rd32, %rd30, %rd31;
$Lt_1_22274:
//<loop> Loop body line 148, nesting depth: 4, estimated iterations: unknown
.loc 17 149 0
// %f4 = record float at +0. If order2+order-1 > -1 also load the floats at
// +4 (%f5) and +8 (%f6) and evaluate two polynomials in Horner form:
//   %f8 = %f8*%f5 + rho_coeff[k-l],  %f7 = %f7*%f6 + rho_coeff[k-m]
// for k = order2+order-1 stepping down by order while k > -1.
ld.global.f32 %f4, [%rd32+0];
@!%p15 bra $Lt_1_29954;
sub.s32 %r95, %r93, %r74;
mov.s32 %r96, -1;
sub.s32 %r97, %r96, %r74;
cvt.s64.s32 %rd33, %r2;
mul.wide.s32 %rd34, %r2, 4;
ld.global.f32 %f5, [%rd32+4];
ld.global.f32 %f6, [%rd32+8];
cvt.s64.s32 %rd35, %r95;
mul.wide.s32 %rd36, %r95, 4;
add.u64 %rd37, %rd1, %rd36;
sub.s32 %r98, %r93, %r71;
cvt.s64.s32 %rd38, %r98;
mul.wide.s32 %rd39, %r98, 4;
add.u64 %rd40, %rd1, %rd39;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, 0f00000000; // 0
$Lt_1_23042:
//<loop> Loop body line 149, nesting depth: 5, estimated iterations: unknown
.loc 17 154 0
ld.shared.f32 %f9, [%rd37+0];
fma.rn.ftz.f32 %f8, %f8, %f5, %f9;
.loc 17 155 0
ld.shared.f32 %f10, [%rd40+0];
fma.rn.ftz.f32 %f7, %f7, %f6, %f10;
sub.u64 %rd40, %rd40, %rd34;
sub.s32 %r95, %r95, %r2;
sub.u64 %rd37, %rd37, %rd34;
setp.gt.s32 %p16, %r95, %r97;
@%p16 bra $Lt_1_23042;
bra.uni $Lt_1_22530;
$Lt_1_29954:
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, 0f00000000; // 0
$Lt_1_22530:
.loc 17 157 0
// %f13 = (record float at +12) * %f7 * %f8: weight shared by all x offsets.
ld.global.f32 %f11, [%rd32+12];
mul.ftz.f32 %f12, %f7, %f8;
mul.ftz.f32 %f13, %f11, %f12;
@!%p8 bra $Lt_1_23554;
// n loop (%r100 = n in [0, order)): evaluate the x polynomial %f14 and add
// %f14*%f13 into ans[n][tid] (%rd43 walks the rows, 256 bytes apart).
mov.s32 %r99, %r2;
cvt.s64.s32 %rd41, %r4;
mul.wide.s32 %rd42, %r4, 4;
add.u64 %rd43, %rd20, %rd42;
mov.s32 %r100, 0;
mov.s32 %r101, %r99;
$Lt_1_24066:
//<loop> Loop body line 157, nesting depth: 5, estimated iterations: unknown
.loc 17 161 0
// k = %r103 starts at order2+n and steps down by order while k >= n:
//   %f14 = %f4*%f14 + rho_coeff[k]
add.s32 %r102, %r100, %r1;
mov.s32 %r103, %r102;
setp.lt.s32 %p17, %r102, %r100;
@%p17 bra $Lt_1_30466;
cvt.s64.s32 %rd44, %r2;
mul.wide.s32 %rd34, %r2, 4;
cvt.s64.s32 %rd45, %r102;
mul.wide.s32 %rd46, %r102, 4;
add.u64 %rd47, %rd1, %rd46;
mov.f32 %f14, 0f00000000; // 0
$Lt_1_24834:
//<loop> Loop body line 161, nesting depth: 6, estimated iterations: unknown
.loc 17 162 0
ld.shared.f32 %f15, [%rd47+0];
fma.rn.ftz.f32 %f14, %f4, %f14, %f15;
sub.s32 %r103, %r103, %r2;
sub.u64 %rd47, %rd47, %rd34;
setp.ge.s32 %p18, %r103, %r100;
@%p18 bra $Lt_1_24834;
bra.uni $Lt_1_24322;
$Lt_1_30466:
mov.f32 %f14, 0f00000000; // 0
$Lt_1_24322:
.loc 17 163 0
ld.shared.f32 %f16, [%rd43+0];
fma.rn.ftz.f32 %f17, %f14, %f13, %f16;
st.shared.f32 [%rd43+0], %f17;
add.s32 %r100, %r100, 1;
add.u64 %rd43, %rd43, 256;
setp.ne.s32 %p19, %r100, %r2;
@%p19 bra $Lt_1_24066;
$Lt_1_23554:
// Next record of this cell.
add.s32 %r92, %r92, %r83;
add.u64 %rd32, %rd29, %rd32;
setp.gt.s32 %p20, %r91, %r92;
@%p20 bra $Lt_1_22274;
$Lt_1_21762:
add.s32 %r74, %r74, 1;
setp.ne.s32 %p21, %r55, %r74;
@%p21 bra $Lt_1_21506;
$Lt_1_20994:
add.s32 %r71, %r71, 1;
setp.ne.s32 %p22, %r47, %r71;
@%p22 bra $Lt_1_20738;
$Lt_1_20226:
$Lt_1_19714:
.loc 17 172 0
// Barrier reached by every thread, whether or not it accumulated above.
bar.sync 0;
// Shift the carry-over region: for fid < order,
//   front[pid][fid] = front[pid][fid+32]; front[pid][fid+32] = 0
// otherwise front[pid][fid] = 0.
@!%p2 bra $Lt_1_26626;
.loc 17 174 0
ld.shared.f32 %f18, [%rd19+128];
st.shared.f32 [%rd19+0], %f18;
.loc 17 175 0
mov.f32 %f19, 0f00000000; // 0
st.shared.f32 [%rd19+128], %f19;
bra.uni $Lt_1_26370;
$Lt_1_26626:
.loc 17 177 0
mov.f32 %f20, 0f00000000; // 0
st.shared.f32 [%rd19+0], %f20;
$Lt_1_26370:
@!%p8 bra $Lt_1_26882;
// For n in [0, order): front[pid][fid+n] += ans[n][tid], with a barrier
// after every step. %p8 depends only on the `order` parameter, so all
// threads execute the same number of barriers.
mov.s32 %r104, %r2;
cvt.s64.s32 %rd48, %r4;
mov.s32 %r105, %r11;
add.s32 %r106, %r11, %r2;
mul.wide.s32 %rd49, %r4, 4;
add.u64 %rd50, %rd20, %rd49;
mov.s64 %rd51, %rd19;
mov.s32 %r107, %r104;
$Lt_1_27394:
//<loop> Loop body line 177, nesting depth: 2, estimated iterations: unknown
.loc 17 180 0
ld.shared.f32 %f21, [%rd50+0];
ld.shared.f32 %f22, [%rd51+0];
add.ftz.f32 %f23, %f21, %f22;
st.shared.f32 [%rd51+0], %f23;
.loc 17 181 0
bar.sync 0;
add.s32 %r105, %r105, 1;
add.u64 %rd51, %rd51, 4;
add.u64 %rd50, %rd50, 256;
setp.ne.s32 %p23, %r105, %r106;
@%p23 bra $Lt_1_27394;
$Lt_1_26882:
// Write-out only if nx < npts_x and nz < npts_z.
set.lt.u32.s32 %r108, %r66, %r17;
neg.s32 %r109, %r108;
and.b32 %r110, %r61, %r109;
mov.u32 %r111, 0;
setp.eq.s32 %p24, %r110, %r111;
@%p24 bra $Lt_1_27906;
.loc 17 185 0
// brick[nz*(npts_y*npts_x) + ny*npts_x + nx] = front[pid][fid]
ld.shared.f32 %f24, [%rd19+0];
ld.param.u64 %rd52, [__cudaparm_make_rho_brick];
add.s32 %r112, %r11, %r58;
mul.lo.s32 %r113, %r37, %r17;
mul.lo.s32 %r114, %r38, %r113;
mul.lo.s32 %r115, %r48, %r17;
add.s32 %r116, %r114, %r115;
add.s32 %r117, %r112, %r116;
cvt.s64.s32 %rd53, %r117;
mul.wide.s32 %rd54, %r117, 4;
add.u64 %rd55, %rd52, %rd54;
st.global.f32 [%rd55+0], %f24;
$Lt_1_27906:
// Advance x by 32; stop once the offset equals loop_count*32.
add.s32 %r58, %r58, 32;
setp.ne.s32 %p25, %r58, %r59;
@%p25 bra $Lt_1_18434;
$Lt_1_17922:
.loc 17 189 0
exit;
$LDWend_make_rho:
} // make_rho
// interp
// Each thread handles index ii = tid + ctaid.x*ntid.x. It fetches a position
// from pos_tex and a value from q_tex, builds per-thread polynomial weights
// along x, y and z, and accumulates the negated weighted sum of an
// order x order x order neighbourhood of 16-byte brick[] entries into three
// f32 accumulators, which are stored to ans[ii].
//
// Shared memory:
//   rho_coeff (256 B)   first order2+order floats of _rho_coeff.
//   rho1d_0 / rho1d_1   (2048 B each) rows spaced 256 bytes apart, one float
//                       per thread; row k holds the x / y weight number k.
//
// Parameters (as used by the code below):
//   x_, q_          never loaded in this body; data are read through the
//                   pos_tex / q_tex texture references instead.
//   nlocal          threads with ii >= nlocal skip all work and the store.
//   brick           16-byte records; only the first three floats are used.
//   _rho_coeff      f32 coefficients copied to shared memory.
//   npts_x, npts_yx index strides of brick along y and z.
//   b_lo_*, del*inv offset and scale applied to the fetched position.
//   order, order2   loop bounds / strides of the polynomial evaluation.
//   qqrd2e_scale    multiplied with the value fetched from q_tex.
//   ans             16-byte output records, one per ii.
.entry interp (
.param .u64 __cudaparm_interp_x_,
.param .u64 __cudaparm_interp_q_,
.param .s32 __cudaparm_interp_nlocal,
.param .u64 __cudaparm_interp_brick,
.param .u64 __cudaparm_interp__rho_coeff,
.param .s32 __cudaparm_interp_npts_x,
.param .s32 __cudaparm_interp_npts_yx,
.param .f32 __cudaparm_interp_b_lo_x,
.param .f32 __cudaparm_interp_b_lo_y,
.param .f32 __cudaparm_interp_b_lo_z,
.param .f32 __cudaparm_interp_delxinv,
.param .f32 __cudaparm_interp_delyinv,
.param .f32 __cudaparm_interp_delzinv,
.param .s32 __cudaparm_interp_order,
.param .s32 __cudaparm_interp_order2,
.param .f32 __cudaparm_interp_qqrd2e_scale,
.param .u64 __cudaparm_interp_ans)
{
.reg .u32 %r<56>;
.reg .u64 %rd<37>;
.reg .f32 %f<69>;
.reg .pred %p<14>;
.shared .align 4 .b8 __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888[256];
.shared .align 4 .b8 __cuda___cuda_local_var_32677_33_non_const_rho1d_03144[2048];
.shared .align 4 .b8 __cuda___cuda_local_var_32678_33_non_const_rho1d_15192[2048];
// __cuda_local_var_32694_12_non_const_ek = 16
.loc 17 199 0
$LDWbegin_interp:
// %r1 = order2, %r2 = order, %r3 = order2+order, %r4 = tid.
// Threads with tid < order2+order copy _rho_coeff[tid] into shared memory.
ld.param.s32 %r1, [__cudaparm_interp_order2];
ld.param.s32 %r2, [__cudaparm_interp_order];
add.s32 %r3, %r1, %r2;
cvt.s32.u32 %r4, %tid.x;
setp.le.s32 %p1, %r3, %r4;
@%p1 bra $Lt_2_8706;
.loc 17 206 0
mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888;
cvt.s64.s32 %rd2, %r4;
mul.wide.s32 %rd3, %r4, 4;
ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_2_8706:
mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888;
.loc 17 207 0
// Barrier after the coefficient copy, before any thread exits.
bar.sync 0;
// ii = %r8 = tid + ctaid.x*ntid.x; exit when nlocal <= ii.
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mul.lo.u32 %r7, %r5, %r6;
add.u32 %r8, %r4, %r7;
ld.param.s32 %r9, [__cudaparm_interp_nlocal];
setp.le.s32 %p2, %r9, %r8;
@%p2 bra $Lt_2_9218;
.loc 17 215 0
// Position fetch: %f6,%f7,%f8 = first three floats of pos_tex[ii].
mov.u32 %r10, %r8;
mov.s32 %r11, 0;
mov.u32 %r12, %r11;
mov.s32 %r13, 0;
mov.u32 %r14, %r13;
mov.s32 %r15, 0;
mov.u32 %r16, %r15;
tex.1d.v4.f32.s32 {%f2,%f3,%f4,%f5},[pos_tex,{%r10,%r12,%r14,%r16}];
mov.f32 %f6, %f2;
mov.f32 %f7, %f3;
mov.f32 %f8, %f4;
.loc 17 216 0
// qs = %f15 = qqrd2e_scale * q_tex[ii].x; when qs is exactly zero the
// thread skips to $Lt_2_9986 and stores zeros.
mov.u32 %r17, %r8;
mov.s32 %r18, 0;
mov.u32 %r19, %r18;
mov.s32 %r20, 0;
mov.u32 %r21, %r20;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
tex.1d.v4.f32.s32 {%f9,%f10,%f11,%f12},[q_tex,{%r17,%r19,%r21,%r23}];
mov.f32 %f13, %f9;
ld.param.f32 %f14, [__cudaparm_interp_qqrd2e_scale];
mul.ftz.f32 %f15, %f14, %f13;
mov.f32 %f16, 0f00000000; // 0
setp.neu.ftz.f32 %p3, %f15, %f16;
@!%p3 bra $Lt_2_9986;
// %p4 = (order > 0); tx = %f20 = (x - b_lo_x)*delxinv.
mov.s32 %r24, 0;
setp.gt.s32 %p4, %r2, %r24;
ld.param.f32 %f17, [__cudaparm_interp_b_lo_x];
sub.ftz.f32 %f18, %f6, %f17;
ld.param.f32 %f19, [__cudaparm_interp_delxinv];
mul.ftz.f32 %f20, %f19, %f18;
@!%p4 bra $Lt_2_16386;
// nx = %r25 = trunc(tx), dx = %f24 = nx + 0.5 - tx
// ty = %f28 = (y - b_lo_y)*delyinv, ny = %r26, dy = %f32 = ny + 0.5 - ty
// %rd10 / %rd11 = &rho1d_0[0][tid] / &rho1d_1[0][tid]
// %r29 = k (starts at 0), %r28 = order2 + k
mov.u64 %rd7, __cuda___cuda_local_var_32677_33_non_const_rho1d_03144;
mov.u64 %rd8, __cuda___cuda_local_var_32678_33_non_const_rho1d_15192;
cvt.rzi.ftz.s32.f32 %r25, %f20;
cvt.rn.f32.s32 %f21, %r25;
mov.f32 %f22, 0f3f000000; // 0.5
add.ftz.f32 %f23, %f21, %f22;
sub.ftz.f32 %f24, %f23, %f20;
ld.param.f32 %f25, [__cudaparm_interp_b_lo_y];
sub.ftz.f32 %f26, %f7, %f25;
ld.param.f32 %f27, [__cudaparm_interp_delyinv];
mul.ftz.f32 %f28, %f27, %f26;
cvt.rzi.ftz.s32.f32 %r26, %f28;
cvt.rn.f32.s32 %f29, %r26;
mov.f32 %f30, 0f3f000000; // 0.5
add.ftz.f32 %f31, %f29, %f30;
sub.ftz.f32 %f32, %f31, %f28;
mov.s32 %r27, %r2;
cvt.s64.s32 %rd9, %r4;
mov.s32 %r28, %r1;
mul.wide.s32 %rd3, %r4, 4;
add.u64 %rd10, %rd3, %rd7;
add.u64 %rd11, %rd3, %rd8;
mov.s32 %r29, 0;
mov.s32 %r30, %r27;
// ---- Weight loop: k in [0, order) ----
$Lt_2_10754:
//<loop> Loop body line 216, nesting depth: 1, estimated iterations: unknown
.loc 17 235 0
// rho1d_0[k][tid] = 0 and rho1d_1[k][tid] = 0 before accumulation.
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
st.shared.f32 [%rd10+0], %f34;
.loc 17 236 0
mov.f32 %f35, 0f00000000; // 0
mov.f32 %f36, 0f00000000; // 0
st.shared.f32 [%rd11+0], %f36;
.loc 17 237 0
// l = %r31 starts at order2+k and steps down by order while l >= k.
mov.s32 %r31, %r28;
setp.lt.s32 %p5, %r28, %r29;
@%p5 bra $Lt_2_11010;
cvt.s64.s32 %rd12, %r2;
mul.wide.s32 %rd13, %r2, 4;
cvt.s64.s32 %rd14, %r28;
mul.wide.s32 %rd15, %r28, 4;
add.u64 %rd16, %rd1, %rd15;
$Lt_2_11522:
//<loop> Loop body line 237, nesting depth: 2, estimated iterations: unknown
.loc 17 238 0
// Horner steps, written back to shared memory on every iteration:
//   rho1d_0[k][tid] = rho1d_0[k][tid]*dx + rho_coeff[l]
//   rho1d_1[k][tid] = rho1d_1[k][tid]*dy + rho_coeff[l]
ld.shared.f32 %f37, [%rd16+0];
fma.rn.ftz.f32 %f33, %f33, %f24, %f37;
st.shared.f32 [%rd10+0], %f33;
.loc 17 239 0
fma.rn.ftz.f32 %f35, %f35, %f32, %f37;
st.shared.f32 [%rd11+0], %f35;
sub.s32 %r31, %r31, %r2;
sub.u64 %rd16, %rd16, %rd13;
setp.ge.s32 %p6, %r31, %r29;
@%p6 bra $Lt_2_11522;
$Lt_2_11010:
add.s32 %r29, %r29, 1;
add.s32 %r28, %r28, 1;
add.u64 %rd11, %rd11, 256;
add.u64 %rd10, %rd10, 256;
setp.ne.s32 %p7, %r28, %r3;
@%p7 bra $Lt_2_10754;
bra.uni $Lt_2_10242;
$Lt_2_16386:
// order <= 0: only nx and the shared base addresses are needed below.
cvt.rzi.ftz.s32.f32 %r25, %f20;
mov.u64 %rd8, __cuda___cuda_local_var_32678_33_non_const_rho1d_15192;
mov.u64 %rd7, __cuda___cuda_local_var_32677_33_non_const_rho1d_03144;
$Lt_2_10242:
.loc 17 243 0
// tz = %f41 = (z - b_lo_z)*delzinv, nz = %r32, %r35 = nz*npts_yx + nx.
ld.param.f32 %f38, [__cudaparm_interp_b_lo_z];
sub.ftz.f32 %f39, %f8, %f38;
ld.param.f32 %f40, [__cudaparm_interp_delzinv];
mul.ftz.f32 %f41, %f40, %f39;
cvt.rzi.ftz.s32.f32 %r32, %f41;
ld.param.s32 %r33, [__cudaparm_interp_npts_yx];
mul.lo.s32 %r34, %r32, %r33;
add.s32 %r35, %r25, %r34;
@!%p4 bra $Lt_2_16898;
// dz = %f45 = nz + 0.5 - tz. ny is recomputed into %r37.
// %r41 = ny*npts_x + nz*npts_yx + nx (brick index of the first entry)
// %rd18 / %rd19 = &rho1d_0[0][tid] / &rho1d_1[0][tid]
// %rd21 = npts_x*16 (byte step between brick rows), %rd22 = brick
// Accumulators: %f52, %f51, %f50 (stored as components 0, 1, 2).
cvt.rn.f32.s32 %f42, %r32;
mov.f32 %f43, 0f3f000000; // 0.5
add.ftz.f32 %f44, %f42, %f43;
sub.ftz.f32 %f45, %f44, %f41;
mov.s32 %r36, %r2;
ld.param.f32 %f46, [__cudaparm_interp_b_lo_y];
sub.ftz.f32 %f47, %f7, %f46;
cvt.s64.s32 %rd17, %r4;
ld.param.f32 %f48, [__cudaparm_interp_delyinv];
mul.ftz.f32 %f49, %f48, %f47;
cvt.rzi.ftz.s32.f32 %r37, %f49;
ld.param.s32 %r38, [__cudaparm_interp_npts_x];
mul.lo.s32 %r39, %r37, %r38;
mul.wide.s32 %rd3, %r4, 4;
add.s32 %r40, %r39, %r35;
add.u64 %rd18, %rd3, %rd7;
add.u64 %rd19, %rd3, %rd8;
cvt.s64.s32 %rd20, %r38;
mul.wide.s32 %rd21, %r38, 16;
mov.s32 %r41, %r40;
ld.param.u64 %rd22, [__cudaparm_interp_brick];
mov.s32 %r42, 0;
mov.f32 %f50, 0f00000000; // 0
mov.f32 %f51, 0f00000000; // 0
mov.f32 %f52, 0f00000000; // 0
mov.s32 %r43, %r36;
// ---- z loop: n = %r42 in [0, order) ----
$Lt_2_12802:
//<loop> Loop body line 243, nesting depth: 1, estimated iterations: unknown
.loc 17 246 0
// z weight %f53: k starts at order2+n and steps down by order while k >= n,
//   %f53 = dz*%f53 + rho_coeff[k]
add.s32 %r44, %r42, %r1;
mov.s32 %r45, %r44;
setp.lt.s32 %p8, %r44, %r42;
@%p8 bra $Lt_2_17154;
cvt.s64.s32 %rd23, %r2;
mul.wide.s32 %rd13, %r2, 4;
cvt.s64.s32 %rd24, %r44;
mul.wide.s32 %rd25, %r44, 4;
add.u64 %rd26, %rd1, %rd25;
mov.f32 %f53, 0f00000000; // 0
$Lt_2_13570:
//<loop> Loop body line 246, nesting depth: 2, estimated iterations: unknown
.loc 17 247 0
ld.shared.f32 %f54, [%rd26+0];
fma.rn.ftz.f32 %f53, %f45, %f53, %f54;
sub.s32 %r45, %r45, %r2;
sub.u64 %rd26, %rd26, %rd13;
setp.ge.s32 %p9, %r45, %r42;
@%p9 bra $Lt_2_13570;
bra.uni $Lt_2_13058;
$Lt_2_17154:
mov.f32 %f53, 0f00000000; // 0
$Lt_2_13058:
.loc 17 249 0
// z0 = %f55 = qs * %f53. %r48 / %rd29 track the brick index / byte offset
// of the current y row; %rd27 walks rho1d_1[m][tid].
mov.s32 %r46, %r41;
mov.s32 %r47, %r2;
mul.ftz.f32 %f55, %f15, %f53;
mov.s32 %r48, %r46;
mov.s64 %rd27, %rd19;
cvt.s64.s32 %rd28, %r46;
mul.wide.s32 %rd29, %r46, 16;
mov.s32 %r49, 0;
mov.s32 %r50, %r47;
// ---- y loop: m = %r49 in [0, order) ----
$Lt_2_14594:
//<loop> Loop body line 249, nesting depth: 2, estimated iterations: unknown
// y0 = %f57 = z0 * rho1d_1[m][tid]; %rd31 = address of the first brick
// entry of this row; %rd30 walks rho1d_0[l][tid].
mov.s32 %r51, %r2;
mov.s32 %r52, %r48;
add.s32 %r53, %r48, %r2;
mov.s64 %rd30, %rd18;
ld.shared.f32 %f56, [%rd27+0];
add.u64 %rd31, %rd29, %rd22;
mul.ftz.f32 %f57, %f55, %f56;
mov.s32 %r54, %r51;
// ---- x loop: order consecutive brick entries ----
$Lt_2_15362:
//<loop> Loop body line 249, nesting depth: 3, estimated iterations: unknown
.loc 17 253 0
// x0 = %f59 = rho1d_0[l][tid] * y0; the fourth brick float is discarded.
ld.shared.f32 %f58, [%rd30+0];
mul.ftz.f32 %f59, %f58, %f57;
ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd31+0];
.loc 17 255 0
mul.ftz.f32 %f63, %f59, %f60;
sub.ftz.f32 %f52, %f52, %f63;
.loc 17 256 0
mul.ftz.f32 %f64, %f59, %f61;
sub.ftz.f32 %f51, %f51, %f64;
.loc 17 257 0
mul.ftz.f32 %f65, %f59, %f62;
sub.ftz.f32 %f50, %f50, %f65;
add.s32 %r52, %r52, 1;
add.u64 %rd31, %rd31, 16;
add.u64 %rd30, %rd30, 256;
setp.ne.s32 %p10, %r52, %r53;
@%p10 bra $Lt_2_15362;
// Next y row: brick index += npts_x.
add.s32 %r49, %r49, 1;
add.s32 %r48, %r48, %r38;
add.u64 %rd29, %rd29, %rd21;
add.u64 %rd27, %rd27, 256;
setp.ne.s32 %p11, %r49, %r2;
@%p11 bra $Lt_2_14594;
// Next z plane: base brick index += npts_yx.
add.s32 %r42, %r42, 1;
add.s32 %r41, %r46, %r33;
setp.ne.s32 %p12, %r42, %r2;
@%p12 bra $Lt_2_12802;
bra.uni $Lt_2_9730;
$Lt_2_16898:
// order <= 0: result is zero.
mov.f32 %f50, 0f00000000; // 0
mov.f32 %f51, 0f00000000; // 0
mov.f32 %f52, 0f00000000; // 0
bra.uni $Lt_2_9730;
$Lt_2_9986:
// qs == 0: result is zero.
mov.f32 %f50, 0f00000000; // 0
mov.f32 %f51, 0f00000000; // 0
mov.f32 %f52, 0f00000000; // 0
$Lt_2_9730:
.loc 17 264 0
// ans[ii] = {%f52, %f51, %f50, %f66}.
// NOTE(review): %f67 is never written anywhere in this kernel, so the
// fourth stored component is an undefined value. Presumably the host never
// reads it -- confirm, or zero-initialise it in the .cu source and
// regenerate this file rather than patching the PTX by hand.
ld.param.u64 %rd32, [__cudaparm_interp_ans];
cvt.s64.s32 %rd33, %r8;
mul.wide.s32 %rd34, %r8, 16;
add.u64 %rd35, %rd32, %rd34;
mov.f32 %f66, %f67;
st.global.v4.f32 [%rd35+0], {%f52,%f51,%f50,%f66};
$Lt_2_9218:
.loc 17 266 0
exit;
$LDWend_interp:
} // interp

View File

@ -1,818 +0,0 @@
/*
 * pppm_f: PTX assembly text stored as one C string constant (adjacent string
 * literals are concatenated by the compiler; the comments between them are
 * not part of the string).
 *
 * The PTX declares ISA version 2.3, target sm_20, 64-bit addresses, two global
 * texture references (pos_tex, q_tex) and three kernel entry points:
 *   particle_map, make_rho, interp.
 *
 * NOTE(review): the text looks like compiler output (register-to-register
 * moves, $Lt_* labels, .loc directives); presumably it is regenerated from a
 * .cu source rather than edited by hand - confirm before changing any line.
 * How this constant is consumed is not visible in this file.
 */
const char * pppm_f = 
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
" .global .texref pos_tex;\n"
" .global .texref q_tex;\n"
/*
 * ---------------------------------------------------------------------------
 * Entry particle_map: one atom index per thread.
 *   - exits when the atom index >= nlocal or when delvolinv * q_tex value == 0
 *   - computes scaled coordinates del{x,y,z}inv * (pos - b_lo_{x,y,z})
 *   - if a scaled coordinate is negative or its truncated index is
 *     >= nlocal_{x,y,z}: stores 1 to *error and exits
 *   - otherwise atomically increments counts[cell]; if the previous count is
 *     >= max_atoms it stores 2 to *error and undoes the increment
 *   - else writes a 4-float record to ans[cell + atom_stride * previous_count]
 * The x_ and q_ parameters are declared but never loaded in this body; the
 * position and charge values come from the pos_tex / q_tex fetches.
 * ---------------------------------------------------------------------------
 */
" .entry particle_map (\n"
" .param .u64 __cudaparm_particle_map_x_,\n"
" .param .u64 __cudaparm_particle_map_q_,\n"
" .param .f32 __cudaparm_particle_map_delvolinv,\n"
" .param .s32 __cudaparm_particle_map_nlocal,\n"
" .param .u64 __cudaparm_particle_map_counts,\n"
" .param .u64 __cudaparm_particle_map_ans,\n"
" .param .f32 __cudaparm_particle_map_b_lo_x,\n"
" .param .f32 __cudaparm_particle_map_b_lo_y,\n"
" .param .f32 __cudaparm_particle_map_b_lo_z,\n"
" .param .f32 __cudaparm_particle_map_delxinv,\n"
" .param .f32 __cudaparm_particle_map_delyinv,\n"
" .param .f32 __cudaparm_particle_map_delzinv,\n"
" .param .s32 __cudaparm_particle_map_nlocal_x,\n"
" .param .s32 __cudaparm_particle_map_nlocal_y,\n"
" .param .s32 __cudaparm_particle_map_nlocal_z,\n"
" .param .s32 __cudaparm_particle_map_atom_stride,\n"
" .param .s32 __cudaparm_particle_map_max_atoms,\n"
" .param .u64 __cudaparm_particle_map_error)\n"
" {\n"
" .reg .u32 %r<50>;\n"
" .reg .u64 %rd<12>;\n"
" .reg .f32 %f<44>;\n"
" .reg .pred %p<11>;\n"
" .loc 17 50 0\n"
"$LDWbegin_particle_map:\n"
/* %r7 = ctaid.x*ntid.x + tid.x, %r5 = nctaid.x*ntid.x (24-bit multiplies). */
/* Atom index %r12 = 64*%r7 - (%r5 - 1) * ((64*%r7) / %r5). */
" cvt.s32.u32 %r1, %ntid.x;\n"
" cvt.s32.u32 %r2, %ctaid.x;\n"
" mul24.lo.s32 %r3, %r2, %r1;\n"
" cvt.s32.u32 %r4, %nctaid.x;\n"
" mul24.lo.s32 %r5, %r4, %r1;\n"
" mov.u32 %r6, %tid.x;\n"
" add.u32 %r7, %r3, %r6;\n"
" sub.s32 %r8, %r5, 1;\n"
" mul.lo.s32 %r9, %r7, 64;\n"
" div.s32 %r10, %r9, %r5;\n"
" mul.lo.s32 %r11, %r8, %r10;\n"
" sub.s32 %r12, %r9, %r11;\n"
/* Exit when nlocal <= atom index. */
" ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];\n"
" setp.le.s32 %p1, %r13, %r12;\n"
" @%p1 bra $Lt_0_7426;\n"
" .loc 17 62 0\n"
/* Fetch pos_tex[%r12]; %f5,%f6,%f7 keep the first three components. */
" mov.u32 %r14, %r12;\n"
" mov.s32 %r15, 0;\n"
" mov.u32 %r16, %r15;\n"
" mov.s32 %r17, 0;\n"
" mov.u32 %r18, %r17;\n"
" mov.s32 %r19, 0;\n"
" mov.u32 %r20, %r19;\n"
" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}];\n"
" mov.f32 %f5, %f1;\n"
" mov.f32 %f6, %f2;\n"
" mov.f32 %f7, %f3;\n"
" .loc 17 64 0\n"
/* Fetch q_tex[%r12]; %f14 = delvolinv * first component. Exit when %f14 == 0. */
" mov.u32 %r21, %r12;\n"
" mov.s32 %r22, 0;\n"
" mov.u32 %r23, %r22;\n"
" mov.s32 %r24, 0;\n"
" mov.u32 %r25, %r24;\n"
" mov.s32 %r26, 0;\n"
" mov.u32 %r27, %r26;\n"
" tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}];\n"
" mov.f32 %f12, %f8;\n"
" ld.param.f32 %f13, [__cudaparm_particle_map_delvolinv];\n"
" mul.ftz.f32 %f14, %f13, %f12;\n"
" mov.f32 %f15, 0f00000000; \n"
" setp.neu.ftz.f32 %p2, %f14, %f15;\n"
" @!%p2 bra $Lt_0_7426;\n"
" .loc 17 67 0\n"
/* Scaled coordinates %f19 (x), %f24 (y), %f29 (z); any value < 0 takes the error path. */
" ld.param.f32 %f16, [__cudaparm_particle_map_b_lo_x];\n"
" sub.ftz.f32 %f17, %f5, %f16;\n"
" ld.param.f32 %f18, [__cudaparm_particle_map_delxinv];\n"
" mul.ftz.f32 %f19, %f18, %f17;\n"
" mov.f32 %f20, 0f00000000; \n"
" setp.lt.ftz.f32 %p3, %f19, %f20;\n"
" @%p3 bra $Lt_0_8706;\n"
" ld.param.f32 %f21, [__cudaparm_particle_map_b_lo_y];\n"
" sub.ftz.f32 %f22, %f6, %f21;\n"
" ld.param.f32 %f23, [__cudaparm_particle_map_delyinv];\n"
" mul.ftz.f32 %f24, %f23, %f22;\n"
" mov.f32 %f25, 0f00000000; \n"
" setp.lt.ftz.f32 %p4, %f24, %f25;\n"
" @%p4 bra $Lt_0_8706;\n"
" ld.param.f32 %f26, [__cudaparm_particle_map_b_lo_z];\n"
" sub.ftz.f32 %f27, %f7, %f26;\n"
" ld.param.f32 %f28, [__cudaparm_particle_map_delzinv];\n"
" mul.ftz.f32 %f29, %f28, %f27;\n"
" mov.f32 %f30, 0f00000000; \n"
" setp.lt.ftz.f32 %p5, %f29, %f30;\n"
" @%p5 bra $Lt_0_8706;\n"
/* Truncate to integer cell indices %r28/%r30/%r32; an index >= nlocal_{x,y,z} takes the error path. */
" cvt.rzi.ftz.s32.f32 %r28, %f19;\n"
" ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];\n"
" setp.ge.s32 %p6, %r28, %r29;\n"
" @%p6 bra $Lt_0_8706;\n"
" cvt.rzi.ftz.s32.f32 %r30, %f24;\n"
" ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];\n"
" setp.ge.s32 %p7, %r30, %r31;\n"
" @%p7 bra $Lt_0_8706;\n"
" cvt.rzi.ftz.s32.f32 %r32, %f29;\n"
" ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];\n"
" setp.gt.s32 %p8, %r33, %r32;\n"
" @%p8 bra $L_0_4866;\n"
/* Error path: store 1 to *error and exit. */
"$Lt_0_8706:\n"
"$L_0_5122:\n"
" .loc 17 76 0\n"
" mov.s32 %r34, 1;\n"
" ld.param.u64 %rd1, [__cudaparm_particle_map_error];\n"
" st.global.s32 [%rd1+0], %r34;\n"
" bra.uni $Lt_0_7426;\n"
/* In range: cell %r38 = %r28 + nlocal_x * (%r30 + nlocal_y * %r32). */
/* Atomically add 1 to counts[%r38]; the previous value lands in %r41. */
"$L_0_4866:\n"
" .loc 17 83 0\n"
" mul.lo.s32 %r35, %r32, %r31;\n"
" add.s32 %r36, %r30, %r35;\n"
" mul.lo.s32 %r37, %r36, %r29;\n"
" add.s32 %r38, %r28, %r37;\n"
" ld.param.u64 %rd2, [__cudaparm_particle_map_counts];\n"
" cvt.s64.s32 %rd3, %r38;\n"
" mul.wide.s32 %rd4, %r38, 4;\n"
" add.u64 %rd5, %rd2, %rd4;\n"
" mov.s32 %r39, 1;\n"
" atom.global.add.s32 %r40, [%rd5], %r39;\n"
" mov.s32 %r41, %r40;\n"
" ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];\n"
" setp.gt.s32 %p9, %r42, %r41;\n"
" @%p9 bra $Lt_0_7682;\n"
" .loc 17 85 0\n"
/* Previous count >= max_atoms: store 2 to *error, then undo the increment with an atomic add of -1. */
" mov.s32 %r43, 2;\n"
" ld.param.u64 %rd6, [__cudaparm_particle_map_error];\n"
" st.global.s32 [%rd6+0], %r43;\n"
" .loc 16 118 0\n"
" mov.s32 %r44, -1;\n"
" atom.global.add.s32 %r45, [%rd5], %r44;\n"
" bra.uni $Lt_0_7426;\n"
/* Store {%r28+0.5-%f19, %r30+0.5-%f24, %r32+0.5-%f29, %f14} as one 16-byte */
/* element at ans[%r38 + atom_stride * %r41]; 0f3f000000 is 0.5f. */
"$Lt_0_7682:\n"
" .loc 17 88 0\n"
" ld.param.u64 %rd7, [__cudaparm_particle_map_ans];\n"
" ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];\n"
" mul.lo.s32 %r47, %r46, %r41;\n"
" add.s32 %r48, %r38, %r47;\n"
" cvt.s64.s32 %rd8, %r48;\n"
" mul.wide.s32 %rd9, %r48, 16;\n"
" add.u64 %rd10, %rd7, %rd9;\n"
" cvt.rn.f32.s32 %f31, %r28;\n"
" mov.f32 %f32, 0f3f000000; \n"
" add.ftz.f32 %f33, %f31, %f32;\n"
" sub.ftz.f32 %f34, %f33, %f19;\n"
" cvt.rn.f32.s32 %f35, %r30;\n"
" mov.f32 %f36, 0f3f000000; \n"
" add.ftz.f32 %f37, %f35, %f36;\n"
" sub.ftz.f32 %f38, %f37, %f24;\n"
" cvt.rn.f32.s32 %f39, %r32;\n"
" mov.f32 %f40, 0f3f000000; \n"
" add.ftz.f32 %f41, %f39, %f40;\n"
" sub.ftz.f32 %f42, %f41, %f29;\n"
" st.global.v4.f32 [%rd10+0], {%f34,%f38,%f42,%f14};\n"
/* Common exit for every path of particle_map. */
"$Lt_0_7426:\n"
"$L_0_4610:\n"
"$Lt_0_6914:\n"
"$Lt_0_6402:\n"
" .loc 17 92 0\n"
" exit;\n"
"$LDWend_particle_map:\n"
" }\n"
/*
 * ---------------------------------------------------------------------------
 * Entry make_rho: accumulates per-atom polynomial weights into brick.
 * Shared arrays declared in the body: rho_coeff (256 bytes), front (320 bytes)
 * and ans (2048 bytes), all accessed as 4-byte floats.
 * Thread roles are derived from tid.x % 32 (%r11) and tid.x / 32 (%r29);
 * %r36 = tid.x/32 + 2*ctaid.x selects the (%r38, %r48) = (%r36 / npts_y,
 * %r36 % npts_y) pair the thread works on.
 * NOTE(review): the array sizes suggest a fixed block size and a maximum
 * order, but neither limit is checked in this body - confirm with the host
 * launch code.
 * ---------------------------------------------------------------------------
 */
" .entry make_rho (\n"
" .param .u64 __cudaparm_make_rho_counts,\n"
" .param .u64 __cudaparm_make_rho_atoms,\n"
" .param .u64 __cudaparm_make_rho_brick,\n"
" .param .u64 __cudaparm_make_rho__rho_coeff,\n"
" .param .s32 __cudaparm_make_rho_atom_stride,\n"
" .param .s32 __cudaparm_make_rho_npts_x,\n"
" .param .s32 __cudaparm_make_rho_npts_y,\n"
" .param .s32 __cudaparm_make_rho_npts_z,\n"
" .param .s32 __cudaparm_make_rho_nlocal_x,\n"
" .param .s32 __cudaparm_make_rho_nlocal_y,\n"
" .param .s32 __cudaparm_make_rho_nlocal_z,\n"
" .param .s32 __cudaparm_make_rho_order_m_1,\n"
" .param .s32 __cudaparm_make_rho_order,\n"
" .param .s32 __cudaparm_make_rho_order2)\n"
" {\n"
" .reg .u32 %r<119>;\n"
" .reg .u64 %rd<57>;\n"
" .reg .f32 %f<26>;\n"
" .reg .pred %p<27>;\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32578_33_non_const_rho_coeff168[256];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32579_33_non_const_front424[320];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32580_33_non_const_ans744[2048];\n"
" .loc 17 101 0\n"
"$LDWbegin_make_rho:\n"
/* %r3 = order2 + order. Threads with tid.x < %r3 copy _rho_coeff[tid.x] from global memory into shared rho_coeff[tid.x]. */
" ld.param.s32 %r1, [__cudaparm_make_rho_order2];\n"
" ld.param.s32 %r2, [__cudaparm_make_rho_order];\n"
" add.s32 %r3, %r1, %r2;\n"
" cvt.s32.u32 %r4, %tid.x;\n"
" setp.le.s32 %p1, %r3, %r4;\n"
" @%p1 bra $Lt_1_16898;\n"
" .loc 17 108 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32578_33_non_const_rho_coeff168;\n"
" cvt.s64.s32 %rd2, %r4;\n"
" mul.wide.s32 %rd3, %r4, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.f32 %f1, [%rd5+0];\n"
" add.u64 %rd6, %rd3, %rd1;\n"
" st.shared.f32 [%rd6+0], %f1;\n"
/* %r11 = tid.x % 32 (shift/mask signed-division idiom); %p2 = (%r11 < order). */
"$Lt_1_16898:\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32578_33_non_const_rho_coeff168;\n"
" shr.s32 %r5, %r4, 31;\n"
" mov.s32 %r6, 31;\n"
" and.b32 %r7, %r5, %r6;\n"
" add.s32 %r8, %r7, %r4;\n"
" shr.s32 %r9, %r8, 5;\n"
" mul.lo.s32 %r10, %r9, 32;\n"
" sub.s32 %r11, %r4, %r10;\n"
" setp.lt.s32 %p2, %r11, %r2;\n"
" @!%p2 bra $Lt_1_17410;\n"
" .loc 17 114 0\n"
/* When %p2: zero front at float index (tid.x/32)*40 + %r11 + 32 (the +128 byte offset is 32 floats). */
" mov.u64 %rd7, __cuda___cuda_local_var_32579_33_non_const_front424;\n"
" mov.f32 %f2, 0f00000000; \n"
" cvt.s64.s32 %rd8, %r11;\n"
" shr.s32 %r12, %r4, 31;\n"
" mov.s32 %r13, 31;\n"
" and.b32 %r14, %r12, %r13;\n"
" add.s32 %r15, %r14, %r4;\n"
" shr.s32 %r16, %r15, 5;\n"
" cvt.s64.s32 %rd9, %r16;\n"
" mul.wide.s32 %rd10, %r16, 40;\n"
" add.u64 %rd11, %rd8, %rd10;\n"
" mul.lo.u64 %rd12, %rd11, 4;\n"
" add.u64 %rd13, %rd7, %rd12;\n"
" st.shared.f32 [%rd13+128], %f2;\n"
"$Lt_1_17410:\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32579_33_non_const_front424;\n"
" .loc 17 116 0\n"
/* Barrier after the shared-memory initialisation above. */
" bar.sync 0;\n"
/* %r23 = npts_x / 32 + 1 is the number of 32-wide passes; exit when it is <= 0. */
" ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];\n"
" shr.s32 %r18, %r17, 31;\n"
" mov.s32 %r19, 31;\n"
" and.b32 %r20, %r18, %r19;\n"
" add.s32 %r21, %r20, %r17;\n"
" shr.s32 %r22, %r21, 5;\n"
" add.s32 %r23, %r22, 1;\n"
" mov.u32 %r24, 0;\n"
" setp.le.s32 %p3, %r23, %r24;\n"
" @%p3 bra $Lt_1_17922;\n"
/* Loop-invariant setup. %r29 = tid.x / 32; %r33 = nlocal_y * nlocal_x; %r36 = %r29 + 2*ctaid.x. */
/* %r38 = %r36 / npts_y and %r48 = %r36 % npts_y. */
/* %r42 = max(order_m_1 - %r38, 0); %r47 = order, or nlocal_z - %r38 + order - 1 when %r38 >= nlocal_z. */
/* %r51 = max(order_m_1 - %r48, 0); %r55 = order, or nlocal_y - %r48 + order - 1 when %r48 >= nlocal_y. */
/* %p8 = (order > 0); %r61 = (npts_z > %r38); %rd19 -> front[%r29*40 + %r11]; %rd20 -> ans; %r59 = 32 * %r23. */
" shr.s32 %r25, %r4, 31;\n"
" mov.s32 %r26, 31;\n"
" and.b32 %r27, %r25, %r26;\n"
" add.s32 %r28, %r27, %r4;\n"
" shr.s32 %r29, %r28, 5;\n"
" add.s32 %r30, %r11, 32;\n"
" ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];\n"
" ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];\n"
" mul.lo.s32 %r33, %r31, %r32;\n"
" mov.u32 %r34, %ctaid.x;\n"
" mul.lo.u32 %r35, %r34, 2;\n"
" add.u32 %r36, %r29, %r35;\n"
" ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];\n"
" div.s32 %r38, %r36, %r37;\n"
" ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];\n"
" setp.lt.s32 %p4, %r38, %r39;\n"
" sub.s32 %r40, %r39, %r38;\n"
" mov.s32 %r41, 0;\n"
" selp.s32 %r42, %r40, %r41, %p4;\n"
" ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];\n"
" setp.ge.s32 %p5, %r38, %r43;\n"
" sub.s32 %r44, %r43, %r38;\n"
" add.s32 %r45, %r44, %r2;\n"
" sub.s32 %r46, %r45, 1;\n"
" selp.s32 %r47, %r46, %r2, %p5;\n"
" rem.s32 %r48, %r36, %r37;\n"
" setp.lt.s32 %p6, %r48, %r39;\n"
" sub.s32 %r49, %r39, %r48;\n"
" mov.s32 %r50, 0;\n"
" selp.s32 %r51, %r49, %r50, %p6;\n"
" setp.ge.s32 %p7, %r48, %r31;\n"
" sub.s32 %r52, %r31, %r48;\n"
" add.s32 %r53, %r52, %r2;\n"
" sub.s32 %r54, %r53, 1;\n"
" selp.s32 %r55, %r54, %r2, %p7;\n"
" mov.s32 %r56, %r23;\n"
" mov.s32 %r57, 0;\n"
" setp.gt.s32 %p8, %r2, %r57;\n"
" mov.s32 %r58, 0;\n"
" cvt.s64.s32 %rd14, %r11;\n"
" cvt.s64.s32 %rd15, %r29;\n"
" mul.lo.s32 %r59, %r23, 32;\n"
" mul.wide.s32 %rd16, %r29, 40;\n"
" add.u64 %rd17, %rd14, %rd16;\n"
" ld.param.s32 %r60, [__cudaparm_make_rho_npts_z];\n"
" setp.gt.s32 %p9, %r60, %r38;\n"
" mul.lo.u64 %rd18, %rd17, 4;\n"
" selp.s32 %r61, 1, 0, %p9;\n"
" add.u64 %rd19, %rd18, %rd7;\n"
" mov.u64 %rd20, __cuda___cuda_local_var_32580_33_non_const_ans744;\n"
" mov.s32 %r62, %r56;\n"
/* Pass loop: %r58 = 0, 32, ... until it equals %r59. */
/* Each pass first clears ans[tid.x + 64*k] for k in [0, order) (256-byte stride = 64 floats). */
"$Lt_1_18434:\n"
" @!%p8 bra $Lt_1_18690;\n"
" mov.s32 %r63, %r2;\n"
" cvt.s64.s32 %rd21, %r4;\n"
" mul.wide.s32 %rd22, %r4, 4;\n"
" add.u64 %rd23, %rd20, %rd22;\n"
" mov.s32 %r64, 0;\n"
" mov.s32 %r65, %r63;\n"
"$Lt_1_19202:\n"
" .loc 17 140 0\n"
" mov.f32 %f3, 0f00000000; \n"
" st.shared.f32 [%rd23+0], %f3;\n"
" add.s32 %r64, %r64, 1;\n"
" add.u64 %rd23, %rd23, 256;\n"
" setp.ne.s32 %p10, %r64, %r2;\n"
" @%p10 bra $Lt_1_19202;\n"
/* %r66 = %r11 + %r58. Accumulation is skipped unless %r66 < nlocal_x and %r61 is set. */
"$Lt_1_18690:\n"
" add.s32 %r66, %r11, %r58;\n"
" set.lt.u32.s32 %r67, %r66, %r32;\n"
" neg.s32 %r68, %r67;\n"
" and.b32 %r69, %r61, %r68;\n"
" mov.u32 %r70, 0;\n"
" setp.eq.s32 %p11, %r69, %r70;\n"
" @%p11 bra $Lt_1_20226;\n"
" .loc 17 143 0\n"
" mov.s32 %r71, %r42;\n"
" setp.ge.s32 %p12, %r42, %r47;\n"
" @%p12 bra $Lt_1_20226;\n"
" sub.s32 %r72, %r47, %r42;\n"
" setp.lt.s32 %p13, %r51, %r55;\n"
" mov.s32 %r73, %r72;\n"
/* Outer stencil loop: %r71 runs over [%r42, %r47). */
"$Lt_1_20738:\n"
" .loc 17 145 0\n"
" mov.s32 %r74, %r51;\n"
" @!%p13 bra $Lt_1_20994;\n"
" sub.s32 %r75, %r55, %r51;\n"
" sub.s32 %r76, %r71, %r42;\n"
" add.s32 %r77, %r38, %r42;\n"
" add.s32 %r78, %r48, %r51;\n"
" sub.s32 %r79, %r77, %r39;\n"
" sub.s32 %r80, %r78, %r39;\n"
" add.s32 %r81, %r76, %r79;\n"
" mul.lo.s32 %r82, %r33, %r81;\n"
" ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];\n"
" ld.param.u64 %rd24, [__cudaparm_make_rho_counts];\n"
" mov.s32 %r84, %r75;\n"
/* Inner stencil loop: %r74 runs over [%r51, %r55). */
/* %r89 = %r66 + nlocal_x*(%r74 + %r48 - order_m_1) + nlocal_x*nlocal_y*(%r71 + %r38 - order_m_1). */
/* %r91 = counts[%r89] * atom_stride is the end bound of the atom loop below. */
"$Lt_1_21506:\n"
" .loc 17 147 0\n"
" sub.s32 %r85, %r74, %r51;\n"
" add.s32 %r86, %r85, %r80;\n"
" mul.lo.s32 %r87, %r86, %r32;\n"
" add.s32 %r88, %r82, %r87;\n"
" add.s32 %r89, %r66, %r88;\n"
" cvt.s64.s32 %rd25, %r89;\n"
" mul.wide.s32 %rd26, %r89, 4;\n"
" add.u64 %rd27, %rd24, %rd26;\n"
" ld.global.s32 %r90, [%rd27+0];\n"
" mul.lo.s32 %r91, %r90, %r83;\n"
" .loc 17 148 0\n"
" mov.s32 %r92, %r89;\n"
" setp.ge.s32 %p14, %r89, %r91;\n"
" @%p14 bra $Lt_1_21762;\n"
" sub.s32 %r93, %r3, 1;\n"
" cvt.s64.s32 %rd28, %r83;\n"
" mul.wide.s32 %rd29, %r83, 16;\n"
" mov.s32 %r94, -1;\n"
" setp.gt.s32 %p15, %r93, %r94;\n"
" ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];\n"
" mul.lo.u64 %rd31, %rd25, 16;\n"
" add.u64 %rd32, %rd30, %rd31;\n"
/* Atom loop: %r92 starts at %r89 and advances by atom_stride while %r92 < %r91; */
/* %rd32 points at the 16-byte atoms element for %r92. %f4 = its first float. */
"$Lt_1_22274:\n"
" .loc 17 149 0\n"
" ld.global.f32 %f4, [%rd32+0];\n"
" @!%p15 bra $Lt_1_29954;\n"
" sub.s32 %r95, %r93, %r74;\n"
" mov.s32 %r96, -1;\n"
" sub.s32 %r97, %r96, %r74;\n"
" cvt.s64.s32 %rd33, %r2;\n"
" mul.wide.s32 %rd34, %r2, 4;\n"
" ld.global.f32 %f5, [%rd32+4];\n"
" ld.global.f32 %f6, [%rd32+8];\n"
" cvt.s64.s32 %rd35, %r95;\n"
" mul.wide.s32 %rd36, %r95, 4;\n"
" add.u64 %rd37, %rd1, %rd36;\n"
" sub.s32 %r98, %r93, %r71;\n"
" cvt.s64.s32 %rd38, %r98;\n"
" mul.wide.s32 %rd39, %r98, 4;\n"
" add.u64 %rd40, %rd1, %rd39;\n"
" mov.f32 %f7, 0f00000000; \n"
" mov.f32 %f8, 0f00000000; \n"
/* Horner-style evaluation over shared rho_coeff, stepping the coefficient index by -order: */
/* %f8 = %f8*%f5 + coeff (index starts at %r3-1-%r74); %f7 = %f7*%f6 + coeff (index starts at %r3-1-%r71). */
"$Lt_1_23042:\n"
" .loc 17 154 0\n"
" ld.shared.f32 %f9, [%rd37+0];\n"
" fma.rn.ftz.f32 %f8, %f8, %f5, %f9;\n"
" .loc 17 155 0\n"
" ld.shared.f32 %f10, [%rd40+0];\n"
" fma.rn.ftz.f32 %f7, %f7, %f6, %f10;\n"
" sub.u64 %rd40, %rd40, %rd34;\n"
" sub.s32 %r95, %r95, %r2;\n"
" sub.u64 %rd37, %rd37, %rd34;\n"
" setp.gt.s32 %p16, %r95, %r97;\n"
" @%p16 bra $Lt_1_23042;\n"
" bra.uni $Lt_1_22530;\n"
"$Lt_1_29954:\n"
" mov.f32 %f7, 0f00000000; \n"
" mov.f32 %f8, 0f00000000; \n"
/* %f13 = (fourth float of the atoms element) * %f7 * %f8. */
"$Lt_1_22530:\n"
" .loc 17 157 0\n"
" ld.global.f32 %f11, [%rd32+12];\n"
" mul.ftz.f32 %f12, %f7, %f8;\n"
" mul.ftz.f32 %f13, %f11, %f12;\n"
" @!%p8 bra $Lt_1_23554;\n"
" mov.s32 %r99, %r2;\n"
" cvt.s64.s32 %rd41, %r4;\n"
" mul.wide.s32 %rd42, %r4, 4;\n"
" add.u64 %rd43, %rd20, %rd42;\n"
" mov.s32 %r100, 0;\n"
" mov.s32 %r101, %r99;\n"
/* For k = %r100 in [0, order): %f14 is a polynomial in %f4 whose coefficient index starts */
/* at k + order2 and steps by -order while it stays >= k; then ans[tid.x + 64*k] += %f14 * %f13. */
"$Lt_1_24066:\n"
" .loc 17 161 0\n"
" add.s32 %r102, %r100, %r1;\n"
" mov.s32 %r103, %r102;\n"
" setp.lt.s32 %p17, %r102, %r100;\n"
" @%p17 bra $Lt_1_30466;\n"
" cvt.s64.s32 %rd44, %r2;\n"
" mul.wide.s32 %rd34, %r2, 4;\n"
" cvt.s64.s32 %rd45, %r102;\n"
" mul.wide.s32 %rd46, %r102, 4;\n"
" add.u64 %rd47, %rd1, %rd46;\n"
" mov.f32 %f14, 0f00000000; \n"
"$Lt_1_24834:\n"
" .loc 17 162 0\n"
" ld.shared.f32 %f15, [%rd47+0];\n"
" fma.rn.ftz.f32 %f14, %f4, %f14, %f15;\n"
" sub.s32 %r103, %r103, %r2;\n"
" sub.u64 %rd47, %rd47, %rd34;\n"
" setp.ge.s32 %p18, %r103, %r100;\n"
" @%p18 bra $Lt_1_24834;\n"
" bra.uni $Lt_1_24322;\n"
"$Lt_1_30466:\n"
" mov.f32 %f14, 0f00000000; \n"
"$Lt_1_24322:\n"
" .loc 17 163 0\n"
" ld.shared.f32 %f16, [%rd43+0];\n"
" fma.rn.ftz.f32 %f17, %f14, %f13, %f16;\n"
" st.shared.f32 [%rd43+0], %f17;\n"
" add.s32 %r100, %r100, 1;\n"
" add.u64 %rd43, %rd43, 256;\n"
" setp.ne.s32 %p19, %r100, %r2;\n"
" @%p19 bra $Lt_1_24066;\n"
/* Advance to the next atom of this cell, then close the two stencil loops. */
"$Lt_1_23554:\n"
" add.s32 %r92, %r92, %r83;\n"
" add.u64 %rd32, %rd29, %rd32;\n"
" setp.gt.s32 %p20, %r91, %r92;\n"
" @%p20 bra $Lt_1_22274;\n"
"$Lt_1_21762:\n"
" add.s32 %r74, %r74, 1;\n"
" setp.ne.s32 %p21, %r55, %r74;\n"
" @%p21 bra $Lt_1_21506;\n"
"$Lt_1_20994:\n"
" add.s32 %r71, %r71, 1;\n"
" setp.ne.s32 %p22, %r47, %r71;\n"
" @%p22 bra $Lt_1_20738;\n"
/* Barrier, then shift front (i = %r29*40 + %r11): when %p2, front[i] = front[i+32] and */
/* front[i+32] = 0; otherwise front[i] = 0. */
"$Lt_1_20226:\n"
"$Lt_1_19714:\n"
" .loc 17 172 0\n"
" bar.sync 0;\n"
" @!%p2 bra $Lt_1_26626;\n"
" .loc 17 174 0\n"
" ld.shared.f32 %f18, [%rd19+128];\n"
" st.shared.f32 [%rd19+0], %f18;\n"
" .loc 17 175 0\n"
" mov.f32 %f19, 0f00000000; \n"
" st.shared.f32 [%rd19+128], %f19;\n"
" bra.uni $Lt_1_26370;\n"
"$Lt_1_26626:\n"
" .loc 17 177 0\n"
" mov.f32 %f20, 0f00000000; \n"
" st.shared.f32 [%rd19+0], %f20;\n"
/* For k in [0, order): front[i + k] += ans[tid.x + 64*k], with a barrier after every step. */
"$Lt_1_26370:\n"
" @!%p8 bra $Lt_1_26882;\n"
" mov.s32 %r104, %r2;\n"
" cvt.s64.s32 %rd48, %r4;\n"
" mov.s32 %r105, %r11;\n"
" add.s32 %r106, %r11, %r2;\n"
" mul.wide.s32 %rd49, %r4, 4;\n"
" add.u64 %rd50, %rd20, %rd49;\n"
" mov.s64 %rd51, %rd19;\n"
" mov.s32 %r107, %r104;\n"
"$Lt_1_27394:\n"
" .loc 17 180 0\n"
" ld.shared.f32 %f21, [%rd50+0];\n"
" ld.shared.f32 %f22, [%rd51+0];\n"
" add.ftz.f32 %f23, %f21, %f22;\n"
" st.shared.f32 [%rd51+0], %f23;\n"
" .loc 17 181 0\n"
" bar.sync 0;\n"
" add.s32 %r105, %r105, 1;\n"
" add.u64 %rd51, %rd51, 4;\n"
" add.u64 %rd50, %rd50, 256;\n"
" setp.ne.s32 %p23, %r105, %r106;\n"
" @%p23 bra $Lt_1_27394;\n"
/* When %r66 < npts_x and %r61 is set: brick[%r66 + npts_x*(%r48 + npts_y*%r38)] = front[i]. */
"$Lt_1_26882:\n"
" set.lt.u32.s32 %r108, %r66, %r17;\n"
" neg.s32 %r109, %r108;\n"
" and.b32 %r110, %r61, %r109;\n"
" mov.u32 %r111, 0;\n"
" setp.eq.s32 %p24, %r110, %r111;\n"
" @%p24 bra $Lt_1_27906;\n"
" .loc 17 185 0\n"
" ld.shared.f32 %f24, [%rd19+0];\n"
" ld.param.u64 %rd52, [__cudaparm_make_rho_brick];\n"
" add.s32 %r112, %r11, %r58;\n"
" mul.lo.s32 %r113, %r37, %r17;\n"
" mul.lo.s32 %r114, %r38, %r113;\n"
" mul.lo.s32 %r115, %r48, %r17;\n"
" add.s32 %r116, %r114, %r115;\n"
" add.s32 %r117, %r112, %r116;\n"
" cvt.s64.s32 %rd53, %r117;\n"
" mul.wide.s32 %rd54, %r117, 4;\n"
" add.u64 %rd55, %rd52, %rd54;\n"
" st.global.f32 [%rd55+0], %f24;\n"
/* Next 32-wide pass. */
"$Lt_1_27906:\n"
" add.s32 %r58, %r58, 32;\n"
" setp.ne.s32 %p25, %r58, %r59;\n"
" @%p25 bra $Lt_1_18434;\n"
"$Lt_1_17922:\n"
" .loc 17 189 0\n"
" exit;\n"
"$LDWend_make_rho:\n"
" }\n"
/*
 * ---------------------------------------------------------------------------
 * Entry interp: one atom per thread (index %r8 = ctaid.x*ntid.x + tid.x).
 *   - copies _rho_coeff into shared memory, then exits if %r8 >= nlocal
 *   - fetches position (pos_tex) and charge (q_tex); %f15 = qqrd2e_scale * q
 *   - builds per-thread polynomial weights in the shared rho1d_0 / rho1d_1
 *     arrays (element tid.x + 64*k) and a third weight in a register
 *   - walks an order x order x order block of 16-byte brick elements and
 *     subtracts weight * brick.{x,y,z} from three accumulators
 *   - stores the accumulators to the 16-byte element ans[%r8]
 * The x_ and q_ parameters are declared but never loaded in this body.
 * NOTE(review): the fourth component of the final store comes from %f67,
 * which is never written anywhere in this entry, so that value is undefined.
 * ---------------------------------------------------------------------------
 */
" .entry interp (\n"
" .param .u64 __cudaparm_interp_x_,\n"
" .param .u64 __cudaparm_interp_q_,\n"
" .param .s32 __cudaparm_interp_nlocal,\n"
" .param .u64 __cudaparm_interp_brick,\n"
" .param .u64 __cudaparm_interp__rho_coeff,\n"
" .param .s32 __cudaparm_interp_npts_x,\n"
" .param .s32 __cudaparm_interp_npts_yx,\n"
" .param .f32 __cudaparm_interp_b_lo_x,\n"
" .param .f32 __cudaparm_interp_b_lo_y,\n"
" .param .f32 __cudaparm_interp_b_lo_z,\n"
" .param .f32 __cudaparm_interp_delxinv,\n"
" .param .f32 __cudaparm_interp_delyinv,\n"
" .param .f32 __cudaparm_interp_delzinv,\n"
" .param .s32 __cudaparm_interp_order,\n"
" .param .s32 __cudaparm_interp_order2,\n"
" .param .f32 __cudaparm_interp_qqrd2e_scale,\n"
" .param .u64 __cudaparm_interp_ans)\n"
" {\n"
" .reg .u32 %r<56>;\n"
" .reg .u64 %rd<37>;\n"
" .reg .f32 %f<69>;\n"
" .reg .pred %p<14>;\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888[256];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32677_33_non_const_rho1d_03144[2048];\n"
" .shared .align 4 .b8 __cuda___cuda_local_var_32678_33_non_const_rho1d_15192[2048];\n"
" .loc 17 199 0\n"
"$LDWbegin_interp:\n"
/* %r3 = order2 + order. Threads with tid.x < %r3 copy _rho_coeff[tid.x] into shared rho_coeff[tid.x]. */
" ld.param.s32 %r1, [__cudaparm_interp_order2];\n"
" ld.param.s32 %r2, [__cudaparm_interp_order];\n"
" add.s32 %r3, %r1, %r2;\n"
" cvt.s32.u32 %r4, %tid.x;\n"
" setp.le.s32 %p1, %r3, %r4;\n"
" @%p1 bra $Lt_2_8706;\n"
" .loc 17 206 0\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888;\n"
" cvt.s64.s32 %rd2, %r4;\n"
" mul.wide.s32 %rd3, %r4, 4;\n"
" ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];\n"
" add.u64 %rd5, %rd4, %rd3;\n"
" ld.global.f32 %f1, [%rd5+0];\n"
" add.u64 %rd6, %rd3, %rd1;\n"
" st.shared.f32 [%rd6+0], %f1;\n"
/* Barrier, then atom index %r8 = tid.x + ctaid.x*ntid.x; exit (no store) when nlocal <= %r8. */
"$Lt_2_8706:\n"
" mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888;\n"
" .loc 17 207 0\n"
" bar.sync 0;\n"
" mov.u32 %r5, %ctaid.x;\n"
" mov.u32 %r6, %ntid.x;\n"
" mul.lo.u32 %r7, %r5, %r6;\n"
" add.u32 %r8, %r4, %r7;\n"
" ld.param.s32 %r9, [__cudaparm_interp_nlocal];\n"
" setp.le.s32 %p2, %r9, %r8;\n"
" @%p2 bra $Lt_2_9218;\n"
" .loc 17 215 0\n"
/* Fetch pos_tex[%r8]; %f6,%f7,%f8 keep the first three components. */
" mov.u32 %r10, %r8;\n"
" mov.s32 %r11, 0;\n"
" mov.u32 %r12, %r11;\n"
" mov.s32 %r13, 0;\n"
" mov.u32 %r14, %r13;\n"
" mov.s32 %r15, 0;\n"
" mov.u32 %r16, %r15;\n"
" tex.1d.v4.f32.s32 {%f2,%f3,%f4,%f5},[pos_tex,{%r10,%r12,%r14,%r16}];\n"
" mov.f32 %f6, %f2;\n"
" mov.f32 %f7, %f3;\n"
" mov.f32 %f8, %f4;\n"
" .loc 17 216 0\n"
/* Fetch q_tex[%r8]; %f15 = qqrd2e_scale * first component. When %f15 == 0, jump to the zero-result store. */
" mov.u32 %r17, %r8;\n"
" mov.s32 %r18, 0;\n"
" mov.u32 %r19, %r18;\n"
" mov.s32 %r20, 0;\n"
" mov.u32 %r21, %r20;\n"
" mov.s32 %r22, 0;\n"
" mov.u32 %r23, %r22;\n"
" tex.1d.v4.f32.s32 {%f9,%f10,%f11,%f12},[q_tex,{%r17,%r19,%r21,%r23}];\n"
" mov.f32 %f13, %f9;\n"
" ld.param.f32 %f14, [__cudaparm_interp_qqrd2e_scale];\n"
" mul.ftz.f32 %f15, %f14, %f13;\n"
" mov.f32 %f16, 0f00000000; \n"
" setp.neu.ftz.f32 %p3, %f15, %f16;\n"
" @!%p3 bra $Lt_2_9986;\n"
/* %p4 = (order > 0); %f20 = delxinv * (pos.x - b_lo_x). */
" mov.s32 %r24, 0;\n"
" setp.gt.s32 %p4, %r2, %r24;\n"
" ld.param.f32 %f17, [__cudaparm_interp_b_lo_x];\n"
" sub.ftz.f32 %f18, %f6, %f17;\n"
" ld.param.f32 %f19, [__cudaparm_interp_delxinv];\n"
" mul.ftz.f32 %f20, %f19, %f18;\n"
" @!%p4 bra $Lt_2_16386;\n"
/* %r25 = trunc(%f20), %f24 = %r25 + 0.5 - %f20; likewise %r26 / %f32 for the y coordinate %f28. */
/* %rd10 -> rho1d_0[tid.x], %rd11 -> rho1d_1[tid.x]. */
" mov.u64 %rd7, __cuda___cuda_local_var_32677_33_non_const_rho1d_03144;\n"
" mov.u64 %rd8, __cuda___cuda_local_var_32678_33_non_const_rho1d_15192;\n"
" cvt.rzi.ftz.s32.f32 %r25, %f20;\n"
" cvt.rn.f32.s32 %f21, %r25;\n"
" mov.f32 %f22, 0f3f000000; \n"
" add.ftz.f32 %f23, %f21, %f22;\n"
" sub.ftz.f32 %f24, %f23, %f20;\n"
" ld.param.f32 %f25, [__cudaparm_interp_b_lo_y];\n"
" sub.ftz.f32 %f26, %f7, %f25;\n"
" ld.param.f32 %f27, [__cudaparm_interp_delyinv];\n"
" mul.ftz.f32 %f28, %f27, %f26;\n"
" cvt.rzi.ftz.s32.f32 %r26, %f28;\n"
" cvt.rn.f32.s32 %f29, %r26;\n"
" mov.f32 %f30, 0f3f000000; \n"
" add.ftz.f32 %f31, %f29, %f30;\n"
" sub.ftz.f32 %f32, %f31, %f28;\n"
" mov.s32 %r27, %r2;\n"
" cvt.s64.s32 %rd9, %r4;\n"
" mov.s32 %r28, %r1;\n"
" mul.wide.s32 %rd3, %r4, 4;\n"
" add.u64 %rd10, %rd3, %rd7;\n"
" add.u64 %rd11, %rd3, %rd8;\n"
" mov.s32 %r29, 0;\n"
" mov.s32 %r30, %r27;\n"
/* Weight loop, k = %r29 in [0, order) with %r28 = k + order2: */
/* zero rho1d_0[tid.x + 64*k] and rho1d_1[tid.x + 64*k], then accumulate Horner-style. */
"$Lt_2_10754:\n"
" .loc 17 235 0\n"
" mov.f32 %f33, 0f00000000; \n"
" mov.f32 %f34, 0f00000000; \n"
" st.shared.f32 [%rd10+0], %f34;\n"
" .loc 17 236 0\n"
" mov.f32 %f35, 0f00000000; \n"
" mov.f32 %f36, 0f00000000; \n"
" st.shared.f32 [%rd11+0], %f36;\n"
" .loc 17 237 0\n"
" mov.s32 %r31, %r28;\n"
" setp.lt.s32 %p5, %r28, %r29;\n"
" @%p5 bra $Lt_2_11010;\n"
" cvt.s64.s32 %rd12, %r2;\n"
" mul.wide.s32 %rd13, %r2, 4;\n"
" cvt.s64.s32 %rd14, %r28;\n"
" mul.wide.s32 %rd15, %r28, 4;\n"
" add.u64 %rd16, %rd1, %rd15;\n"
/* Coefficient index %r31 steps by -order while >= k; both running sums are written back to shared memory each step. */
"$Lt_2_11522:\n"
" .loc 17 238 0\n"
" ld.shared.f32 %f37, [%rd16+0];\n"
" fma.rn.ftz.f32 %f33, %f33, %f24, %f37;\n"
" st.shared.f32 [%rd10+0], %f33;\n"
" .loc 17 239 0\n"
" fma.rn.ftz.f32 %f35, %f35, %f32, %f37;\n"
" st.shared.f32 [%rd11+0], %f35;\n"
" sub.s32 %r31, %r31, %r2;\n"
" sub.u64 %rd16, %rd16, %rd13;\n"
" setp.ge.s32 %p6, %r31, %r29;\n"
" @%p6 bra $Lt_2_11522;\n"
"$Lt_2_11010:\n"
" add.s32 %r29, %r29, 1;\n"
" add.s32 %r28, %r28, 1;\n"
" add.u64 %rd11, %rd11, 256;\n"
" add.u64 %rd10, %rd10, 256;\n"
" setp.ne.s32 %p7, %r28, %r3;\n"
" @%p7 bra $Lt_2_10754;\n"
" bra.uni $Lt_2_10242;\n"
/* order <= 0 path: only %r25 and the shared-array base addresses are set up. */
"$Lt_2_16386:\n"
" cvt.rzi.ftz.s32.f32 %r25, %f20;\n"
" mov.u64 %rd8, __cuda___cuda_local_var_32678_33_non_const_rho1d_15192;\n"
" mov.u64 %rd7, __cuda___cuda_local_var_32677_33_non_const_rho1d_03144;\n"
/* %f41 = delzinv * (pos.z - b_lo_z), %r32 = trunc(%f41), %r35 = %r25 + %r32 * npts_yx. */
"$Lt_2_10242:\n"
" .loc 17 243 0\n"
" ld.param.f32 %f38, [__cudaparm_interp_b_lo_z];\n"
" sub.ftz.f32 %f39, %f8, %f38;\n"
" ld.param.f32 %f40, [__cudaparm_interp_delzinv];\n"
" mul.ftz.f32 %f41, %f40, %f39;\n"
" cvt.rzi.ftz.s32.f32 %r32, %f41;\n"
" ld.param.s32 %r33, [__cudaparm_interp_npts_yx];\n"
" mul.lo.s32 %r34, %r32, %r33;\n"
" add.s32 %r35, %r25, %r34;\n"
" @!%p4 bra $Lt_2_16898;\n"
/* %f45 = %r32 + 0.5 - %f41. The y scaled coordinate is recomputed here (%f49, %r37). */
/* Start index %r40 = %r37 * npts_x + %r35; %rd21 = npts_x * 16 bytes; %rd22 = brick. */
/* Accumulators %f52, %f51, %f50 start at 0. */
" cvt.rn.f32.s32 %f42, %r32;\n"
" mov.f32 %f43, 0f3f000000; \n"
" add.ftz.f32 %f44, %f42, %f43;\n"
" sub.ftz.f32 %f45, %f44, %f41;\n"
" mov.s32 %r36, %r2;\n"
" ld.param.f32 %f46, [__cudaparm_interp_b_lo_y];\n"
" sub.ftz.f32 %f47, %f7, %f46;\n"
" cvt.s64.s32 %rd17, %r4;\n"
" ld.param.f32 %f48, [__cudaparm_interp_delyinv];\n"
" mul.ftz.f32 %f49, %f48, %f47;\n"
" cvt.rzi.ftz.s32.f32 %r37, %f49;\n"
" ld.param.s32 %r38, [__cudaparm_interp_npts_x];\n"
" mul.lo.s32 %r39, %r37, %r38;\n"
" mul.wide.s32 %rd3, %r4, 4;\n"
" add.s32 %r40, %r39, %r35;\n"
" add.u64 %rd18, %rd3, %rd7;\n"
" add.u64 %rd19, %rd3, %rd8;\n"
" cvt.s64.s32 %rd20, %r38;\n"
" mul.wide.s32 %rd21, %r38, 16;\n"
" mov.s32 %r41, %r40;\n"
" ld.param.u64 %rd22, [__cudaparm_interp_brick];\n"
" mov.s32 %r42, 0;\n"
" mov.f32 %f50, 0f00000000; \n"
" mov.f32 %f51, 0f00000000; \n"
" mov.f32 %f52, 0f00000000; \n"
" mov.s32 %r43, %r36;\n"
/* Outer loop, n = %r42 in [0, order): %f53 is a polynomial in %f45 whose coefficient index */
/* starts at n + order2 and steps by -order while >= n. */
"$Lt_2_12802:\n"
" .loc 17 246 0\n"
" add.s32 %r44, %r42, %r1;\n"
" mov.s32 %r45, %r44;\n"
" setp.lt.s32 %p8, %r44, %r42;\n"
" @%p8 bra $Lt_2_17154;\n"
" cvt.s64.s32 %rd23, %r2;\n"
" mul.wide.s32 %rd13, %r2, 4;\n"
" cvt.s64.s32 %rd24, %r44;\n"
" mul.wide.s32 %rd25, %r44, 4;\n"
" add.u64 %rd26, %rd1, %rd25;\n"
" mov.f32 %f53, 0f00000000; \n"
"$Lt_2_13570:\n"
" .loc 17 247 0\n"
" ld.shared.f32 %f54, [%rd26+0];\n"
" fma.rn.ftz.f32 %f53, %f45, %f53, %f54;\n"
" sub.s32 %r45, %r45, %r2;\n"
" sub.u64 %rd26, %rd26, %rd13;\n"
" setp.ge.s32 %p9, %r45, %r42;\n"
" @%p9 bra $Lt_2_13570;\n"
" bra.uni $Lt_2_13058;\n"
"$Lt_2_17154:\n"
" mov.f32 %f53, 0f00000000; \n"
/* %f55 = %f15 * %f53; %rd29 = byte offset of brick element %r41 (16 bytes each). */
"$Lt_2_13058:\n"
" .loc 17 249 0\n"
" mov.s32 %r46, %r41;\n"
" mov.s32 %r47, %r2;\n"
" mul.ftz.f32 %f55, %f15, %f53;\n"
" mov.s32 %r48, %r46;\n"
" mov.s64 %rd27, %rd19;\n"
" cvt.s64.s32 %rd28, %r46;\n"
" mul.wide.s32 %rd29, %r46, 16;\n"
" mov.s32 %r49, 0;\n"
" mov.s32 %r50, %r47;\n"
/* Middle loop, m = %r49 in [0, order): %f57 = %f55 * rho1d_1[tid.x + 64*m]; the brick offset advances by npts_x elements per step. */
"$Lt_2_14594:\n"
" mov.s32 %r51, %r2;\n"
" mov.s32 %r52, %r48;\n"
" add.s32 %r53, %r48, %r2;\n"
" mov.s64 %rd30, %rd18;\n"
" ld.shared.f32 %f56, [%rd27+0];\n"
" add.u64 %rd31, %rd29, %rd22;\n"
" mul.ftz.f32 %f57, %f55, %f56;\n"
" mov.s32 %r54, %r51;\n"
/* Inner loop, order iterations over consecutive brick elements: %f59 = rho1d_0[tid.x + 64*l] * %f57; */
/* %f52 -= %f59 * brick.x, %f51 -= %f59 * brick.y, %f50 -= %f59 * brick.z (fourth component discarded). */
"$Lt_2_15362:\n"
" .loc 17 253 0\n"
" ld.shared.f32 %f58, [%rd30+0];\n"
" mul.ftz.f32 %f59, %f58, %f57;\n"
" ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd31+0];\n"
" .loc 17 255 0\n"
" mul.ftz.f32 %f63, %f59, %f60;\n"
" sub.ftz.f32 %f52, %f52, %f63;\n"
" .loc 17 256 0\n"
" mul.ftz.f32 %f64, %f59, %f61;\n"
" sub.ftz.f32 %f51, %f51, %f64;\n"
" .loc 17 257 0\n"
" mul.ftz.f32 %f65, %f59, %f62;\n"
" sub.ftz.f32 %f50, %f50, %f65;\n"
" add.s32 %r52, %r52, 1;\n"
" add.u64 %rd31, %rd31, 16;\n"
" add.u64 %rd30, %rd30, 256;\n"
" setp.ne.s32 %p10, %r52, %r53;\n"
" @%p10 bra $Lt_2_15362;\n"
" add.s32 %r49, %r49, 1;\n"
" add.s32 %r48, %r48, %r38;\n"
" add.u64 %rd29, %rd29, %rd21;\n"
" add.u64 %rd27, %rd27, 256;\n"
" setp.ne.s32 %p11, %r49, %r2;\n"
" @%p11 bra $Lt_2_14594;\n"
/* Next outer step: the base brick index advances by npts_yx. */
" add.s32 %r42, %r42, 1;\n"
" add.s32 %r41, %r46, %r33;\n"
" setp.ne.s32 %p12, %r42, %r2;\n"
" @%p12 bra $Lt_2_12802;\n"
" bra.uni $Lt_2_9730;\n"
/* order <= 0: results are zero. */
"$Lt_2_16898:\n"
" mov.f32 %f50, 0f00000000; \n"
" mov.f32 %f51, 0f00000000; \n"
" mov.f32 %f52, 0f00000000; \n"
" bra.uni $Lt_2_9730;\n"
/* %f15 == 0: results are zero. */
"$Lt_2_9986:\n"
" mov.f32 %f50, 0f00000000; \n"
" mov.f32 %f51, 0f00000000; \n"
" mov.f32 %f52, 0f00000000; \n"
/* Store {%f52, %f51, %f50, %f66} to the 16-byte element ans[%r8]. */
/* NOTE(review): %f66 is copied from %f67, which has no prior write in this entry. */
"$Lt_2_9730:\n"
" .loc 17 264 0\n"
" ld.param.u64 %rd32, [__cudaparm_interp_ans];\n"
" cvt.s64.s32 %rd33, %r8;\n"
" mul.wide.s32 %rd34, %r8, 16;\n"
" add.u64 %rd35, %rd32, %rd34;\n"
" mov.f32 %f66, %f67;\n"
" st.global.v4.f32 [%rd35+0], {%f52,%f51,%f50,%f66};\n"
"$Lt_2_9218:\n"
" .loc 17 266 0\n"
" exit;\n"
"$LDWend_interp:\n"
" }\n"
;

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.