git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7581 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
101
lib/gpu/atom.ptx
101
lib/gpu/atom.ptx
@ -1,101 +0,0 @@
|
||||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_000099dd_00000000-9_lal_atom.cpp3.i (/home/sjplimp/ccBI#.Q6OzuV)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_000099dd_00000000-8_lal_atom.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "lal_atom.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
|
||||
// kernel_cast_x: one thread per element index i = ctaid.x * ntid.x + tid.x.
// For i < nall, loads three consecutive f64 values starting at x + 24*i and the
// s32 at type + 4*i, converts all four to f32, and stores them as a single
// v4.f32 {x[3i], x[3i+1], x[3i+2], (float)type[i]} at x_type + 16*i.
// Params: x_type (u64 address written to), x and type (u64 addresses read from),
// nall (s32 exclusive upper bound on i).
.entry kernel_cast_x (
|
||||
.param .u64 __cudaparm_kernel_cast_x_x_type,
|
||||
.param .u64 __cudaparm_kernel_cast_x_x,
|
||||
.param .u64 __cudaparm_kernel_cast_x_type,
|
||||
.param .s32 __cudaparm_kernel_cast_x_nall)
|
||||
{
|
||||
.reg .u32 %r<10>;
|
||||
.reg .u64 %rd<13>;
|
||||
.reg .f32 %f<6>;
|
||||
.reg .f64 %fd<5>;
|
||||
.reg .pred %p<3>;
|
||||
.loc 16 21 0
|
||||
$LDWbegin_kernel_cast_x:
|
||||
// %r5 = ctaid.x * ntid.x + tid.x (the product is formed with a 24-bit multiply).
cvt.s32.u32 %r1, %ctaid.x;
|
||||
cvt.s32.u32 %r2, %ntid.x;
|
||||
mul24.lo.s32 %r3, %r1, %r2;
|
||||
mov.u32 %r4, %tid.x;
|
||||
add.u32 %r5, %r3, %r4;
|
||||
// Threads whose index is >= nall branch straight to exit.
ld.param.s32 %r6, [__cudaparm_kernel_cast_x_nall];
|
||||
setp.le.s32 %p1, %r6, %r5;
|
||||
@%p1 bra $Lt_0_1026;
|
||||
.loc 16 26 0
|
||||
cvt.s64.s32 %rd1, %r5;
|
||||
// %f1 = s32 at type + 4*i, converted to f32.
ld.param.u64 %rd2, [__cudaparm_kernel_cast_x_type];
|
||||
mul.wide.s32 %rd3, %r5, 4;
|
||||
add.u64 %rd4, %rd2, %rd3;
|
||||
ld.global.s32 %r7, [%rd4+0];
|
||||
cvt.rn.f32.s32 %f1, %r7;
|
||||
.loc 16 29 0
|
||||
// %rd8 = x + 8 * (3*i): address of the first of three f64 values for this index.
ld.param.u64 %rd5, [__cudaparm_kernel_cast_x_x];
|
||||
mul.lo.s32 %r8, %r5, 3;
|
||||
cvt.s64.s32 %rd6, %r8;
|
||||
mul.wide.s32 %rd7, %r8, 8;
|
||||
add.u64 %rd8, %rd5, %rd7;
|
||||
// %f2 = second f64 (offset 8) rounded to f32 with flush-to-zero.
ld.global.f64 %fd1, [%rd8+8];
|
||||
cvt.rn.ftz.f32.f64 %f2, %fd1;
|
||||
.loc 16 30 0
|
||||
// %f3 = third f64 (offset 16) rounded to f32.
ld.global.f64 %fd2, [%rd8+16];
|
||||
cvt.rn.ftz.f32.f64 %f3, %fd2;
|
||||
.loc 16 31 0
|
||||
// %rd11 = x_type + 16*i: destination of the packed 4-float record.
ld.param.u64 %rd9, [__cudaparm_kernel_cast_x_x_type];
|
||||
mul.wide.s32 %rd10, %r5, 16;
|
||||
add.u64 %rd11, %rd9, %rd10;
|
||||
// %f4 = first f64 (offset 0) rounded to f32.
ld.global.f64 %fd3, [%rd8+0];
|
||||
cvt.rn.ftz.f32.f64 %f4, %fd3;
|
||||
// Single 16-byte store in the order {first, second, third, type}.
st.global.v4.f32 [%rd11+0], {%f4,%f2,%f3,%f1};
|
||||
$Lt_0_1026:
|
||||
.loc 16 33 0
|
||||
exit;
|
||||
$LDWend_kernel_cast_x:
|
||||
} // kernel_cast_x
|
||||
|
||||
@ -1,56 +0,0 @@
|
||||
// PTX text (version 2.3, target sm_20, 64-bit addresses) for the kernel_cast_x
// entry, held as one concatenated C string literal with one PTX line per fragment.
// NOTE(review): presumably handed to the GPU runtime for load-time compilation;
// the consumer is not visible here - confirm against the caller.
const char * atom =
|
||||
" .version 2.3\n"
|
||||
" .target sm_20\n"
|
||||
" .address_size 64\n"
|
||||
" .entry kernel_cast_x (\n"
|
||||
" .param .u64 __cudaparm_kernel_cast_x_x_type,\n"
|
||||
" .param .u64 __cudaparm_kernel_cast_x_x,\n"
|
||||
" .param .u64 __cudaparm_kernel_cast_x_type,\n"
|
||||
" .param .s32 __cudaparm_kernel_cast_x_nall)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<10>;\n"
|
||||
" .reg .u64 %rd<13>;\n"
|
||||
" .reg .f32 %f<6>;\n"
|
||||
" .reg .f64 %fd<5>;\n"
|
||||
" .reg .pred %p<3>;\n"
|
||||
" .loc 16 21 0\n"
|
||||
"$LDWbegin_kernel_cast_x:\n"
|
||||
" cvt.s32.u32 %r1, %ctaid.x;\n"
|
||||
" cvt.s32.u32 %r2, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r3, %r1, %r2;\n"
|
||||
" mov.u32 %r4, %tid.x;\n"
|
||||
" add.u32 %r5, %r3, %r4;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_kernel_cast_x_nall];\n"
|
||||
" setp.le.s32 %p1, %r6, %r5;\n"
|
||||
" @%p1 bra $Lt_0_1026;\n"
|
||||
" .loc 16 26 0\n"
|
||||
" cvt.s64.s32 %rd1, %r5;\n"
|
||||
" ld.param.u64 %rd2, [__cudaparm_kernel_cast_x_type];\n"
|
||||
" mul.wide.s32 %rd3, %r5, 4;\n"
|
||||
" add.u64 %rd4, %rd2, %rd3;\n"
|
||||
" ld.global.s32 %r7, [%rd4+0];\n"
|
||||
" cvt.rn.f32.s32 %f1, %r7;\n"
|
||||
" .loc 16 29 0\n"
|
||||
" ld.param.u64 %rd5, [__cudaparm_kernel_cast_x_x];\n"
|
||||
" mul.lo.s32 %r8, %r5, 3;\n"
|
||||
" cvt.s64.s32 %rd6, %r8;\n"
|
||||
" mul.wide.s32 %rd7, %r8, 8;\n"
|
||||
" add.u64 %rd8, %rd5, %rd7;\n"
|
||||
" ld.global.f64 %fd1, [%rd8+8];\n"
|
||||
" cvt.rn.ftz.f32.f64 %f2, %fd1;\n"
|
||||
" .loc 16 30 0\n"
|
||||
" ld.global.f64 %fd2, [%rd8+16];\n"
|
||||
" cvt.rn.ftz.f32.f64 %f3, %fd2;\n"
|
||||
" .loc 16 31 0\n"
|
||||
" ld.param.u64 %rd9, [__cudaparm_kernel_cast_x_x_type];\n"
|
||||
" mul.wide.s32 %rd10, %r5, 16;\n"
|
||||
" add.u64 %rd11, %rd9, %rd10;\n"
|
||||
" ld.global.f64 %fd3, [%rd8+0];\n"
|
||||
" cvt.rn.ftz.f32.f64 %f4, %fd3;\n"
|
||||
" st.global.v4.f32 [%rd11+0], {%f4,%f2,%f3,%f1};\n"
|
||||
"$Lt_0_1026:\n"
|
||||
" .loc 16 33 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_cast_x:\n"
|
||||
" }\n"
|
||||
;
|
||||
@ -1,958 +0,0 @@
|
||||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_00009eb0_00000000-9_lal_cg_cmm.cpp3.i (/home/sjplimp/ccBI#.oK8Qzh)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_00009eb0_00000000-8_lal_cg_cmm.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "lal_cg_cmm.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
.global .texref pos_tex;
|
||||
|
||||
// kernel_pair: t_per_atom threads cooperate on one index
//   ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom);  threads with ii >= inum exit.
// Each thread walks a strided slice of an s32 index list reached through
// dev_nbor / dev_packed, fetches a v4.f32 per entry from pos_tex, and accumulates:
//   - a 3-component sum (%f27, %f26, %f25), stored to ans + 16*ii;
//   - a scalar sum %f28, written to the engv buffer only when eflag > 0;
//   - six further sums (%f6..%f16), written to the engv buffer only when vflag > 0.
// The compiler's local-variable comments below name these "f" and "virial".
// Partial sums of the t_per_atom threads are combined through shared memory, and
// only the thread with (tid.x & (t_per_atom - 1)) == 0 writes results.
// NOTE(review): the lane masking only partitions threads correctly if t_per_atom
// is a power of two - not checked in this kernel; confirm against the host code.
.entry kernel_pair (
|
||||
.param .u64 __cudaparm_kernel_pair_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_lj1,
|
||||
.param .u64 __cudaparm_kernel_pair_lj3,
|
||||
.param .s32 __cudaparm_kernel_pair_lj_types,
|
||||
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_ans,
|
||||
.param .u64 __cudaparm_kernel_pair___val_paramengv,
|
||||
.param .s32 __cudaparm_kernel_pair_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<72>;
|
||||
.reg .u64 %rd<63>;
|
||||
.reg .f32 %f<111>;
|
||||
.reg .pred %p<21>;
|
||||
// 16-byte shared array: four f32 factors copied from sp_lj_in.
.shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];
|
||||
// 3072-byte shared scratch for the reductions: six 512-byte planes, each indexed by 4*tid.x.
.shared .align 4 .b8 __cuda___cuda_local_var_32608_55_non_const_red_acc108[3072];
|
||||
// __cuda_local_var_32543_10_non_const_f = 48
|
||||
// __cuda_local_var_32545_9_non_const_virial = 16
|
||||
.loc 16 31 0
|
||||
$LDWbegin_kernel_pair:
|
||||
.loc 16 36 0
|
||||
// Copy the four f32 values at sp_lj_in[0..3] into shared memory.
// This runs before any branch, so every thread performs the same store.
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
|
||||
ldu.global.f32 %f1, [%rd1+0];
|
||||
.loc 16 37 0
|
||||
ld.global.f32 %f2, [%rd1+4];
|
||||
.loc 16 38 0
|
||||
ld.global.f32 %f3, [%rd1+8];
|
||||
.loc 16 39 0
|
||||
ld.global.f32 %f4, [%rd1+12];
|
||||
st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
|
||||
.loc 16 46 0
|
||||
// Zero the six accumulators %f6, %f8, %f10, %f12, %f14, %f16 (updated under vflag > 0 in the loop).
mov.f32 %f5, 0f00000000; // 0
|
||||
mov.f32 %f6, %f5;
|
||||
mov.f32 %f7, 0f00000000; // 0
|
||||
mov.f32 %f8, %f7;
|
||||
mov.f32 %f9, 0f00000000; // 0
|
||||
mov.f32 %f10, %f9;
|
||||
mov.f32 %f11, 0f00000000; // 0
|
||||
mov.f32 %f12, %f11;
|
||||
mov.f32 %f13, 0f00000000; // 0
|
||||
mov.f32 %f14, %f13;
|
||||
mov.f32 %f15, 0f00000000; // 0
|
||||
mov.f32 %f16, %f15;
|
||||
// %r8 = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom); exit when %r8 >= inum.
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
|
||||
cvt.s32.u32 %r2, %tid.x;
|
||||
div.s32 %r3, %r2, %r1;
|
||||
cvt.s32.u32 %r4, %ntid.x;
|
||||
div.s32 %r5, %r4, %r1;
|
||||
cvt.s32.u32 %r6, %ctaid.x;
|
||||
mul.lo.s32 %r7, %r6, %r5;
|
||||
add.s32 %r8, %r3, %r7;
|
||||
ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];
|
||||
setp.ge.s32 %p1, %r8, %r9;
|
||||
@%p1 bra $Lt_0_28930;
|
||||
.loc 16 51 0
|
||||
// %rd7 = dev_nbor + 4*%r8;  %rd8 = %rd7 + 4*nbor_pitch;  %r11 = s32 at %rd8 (used below as the entry count).
ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];
|
||||
cvt.s64.s32 %rd2, %r10;
|
||||
mul.wide.s32 %rd3, %r10, 4;
|
||||
cvt.s64.s32 %rd4, %r8;
|
||||
mul.wide.s32 %rd5, %r8, 4;
|
||||
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
|
||||
add.u64 %rd7, %rd5, %rd6;
|
||||
add.u64 %rd8, %rd3, %rd7;
|
||||
ld.global.s32 %r11, [%rd8+0];
|
||||
// %r13 = (t_per_atom - 1) & tid.x: this thread's lane; %rd10 = 4 * lane.
sub.s32 %r12, %r1, 1;
|
||||
and.b32 %r13, %r12, %r2;
|
||||
cvt.s64.s32 %rd9, %r13;
|
||||
mul.wide.s32 %rd10, %r13, 4;
|
||||
// Two list layouts, selected by whether dev_packed is the same address as dev_nbor.
ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];
|
||||
setp.ne.u64 %p2, %rd11, %rd6;
|
||||
@%p2 bra $Lt_0_20994;
|
||||
// dev_packed == dev_nbor: the list is read in place.
//   stride %r16 = nbor_pitch * t_per_atom entries
//   base   %rd14 = %rd8 + 4 * (nbor_pitch + (t_per_atom - 1) * %r8)
//   end    %rd19 = %rd14 + 4 * ((%r11 & (t_per_atom - 1)) + stride * (%r11 / t_per_atom))
//   start  %rd20 = %rd14 + 4 * lane
cvt.s32.s64 %r14, %rd2;
|
||||
mul.lo.s32 %r15, %r14, %r1;
|
||||
mov.s32 %r16, %r15;
|
||||
mul.lo.s32 %r17, %r12, %r8;
|
||||
add.s32 %r18, %r14, %r17;
|
||||
cvt.s64.s32 %rd12, %r18;
|
||||
mul.wide.s32 %rd13, %r18, 4;
|
||||
add.u64 %rd14, %rd8, %rd13;
|
||||
and.b32 %r19, %r12, %r11;
|
||||
cvt.s64.s32 %rd15, %r19;
|
||||
div.s32 %r20, %r11, %r1;
|
||||
mul.lo.s32 %r21, %r15, %r20;
|
||||
cvt.s64.s32 %rd16, %r21;
|
||||
add.u64 %rd17, %rd15, %rd16;
|
||||
mul.lo.u64 %rd18, %rd17, 4;
|
||||
add.u64 %rd19, %rd14, %rd18;
|
||||
add.u64 %rd20, %rd10, %rd14;
|
||||
bra.uni $Lt_0_20738;
|
||||
$Lt_0_20994:
|
||||
// dev_packed != dev_nbor: the s32 at %rd8 + 4*nbor_pitch is an element offset into dev_packed.
//   base %rd24 = dev_packed + 4*offset;  end %rd19 = base + 4*%r11;
//   stride %r16 = t_per_atom;  start %rd20 = base + 4*lane
add.u64 %rd21, %rd3, %rd8;
|
||||
ld.global.s32 %r22, [%rd21+0];
|
||||
cvt.s64.s32 %rd22, %r22;
|
||||
mul.wide.s32 %rd23, %r22, 4;
|
||||
add.u64 %rd24, %rd11, %rd23;
|
||||
cvt.s64.s32 %rd25, %r11;
|
||||
mul.wide.s32 %rd26, %r11, 4;
|
||||
add.u64 %rd19, %rd24, %rd26;
|
||||
mov.s32 %r16, %r1;
|
||||
add.u64 %rd20, %rd10, %rd24;
|
||||
$Lt_0_20738:
|
||||
.loc 16 54 0
|
||||
// Fetch the v4.f32 at texture index [%rd7] from pos_tex:
// %f21, %f22, %f23 = first three components, %f24 = fourth component.
ld.global.s32 %r23, [%rd7+0];
|
||||
mov.u32 %r24, %r23;
|
||||
mov.s32 %r25, 0;
|
||||
mov.u32 %r26, %r25;
|
||||
mov.s32 %r27, 0;
|
||||
mov.u32 %r28, %r27;
|
||||
mov.s32 %r29, 0;
|
||||
mov.u32 %r30, %r29;
|
||||
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];
|
||||
mov.f32 %f21, %f17;
|
||||
mov.f32 %f22, %f18;
|
||||
mov.f32 %f23, %f19;
|
||||
mov.f32 %f24, %f20;
|
||||
// Skip the loop entirely when start >= end (no entries for this lane).
setp.ge.u64 %p3, %rd20, %rd19;
|
||||
@%p3 bra $Lt_0_30466;
|
||||
// %r31 = fourth component truncated to s32; %r33 = lj_types * %r31 (row offset into lj1 / lj3).
cvt.rzi.ftz.s32.f32 %r31, %f24;
|
||||
cvt.s64.s32 %rd27, %r16;
|
||||
ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];
|
||||
mul.lo.s32 %r33, %r32, %r31;
|
||||
ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];
|
||||
// Per-thread sums: %f27/%f26/%f25 (3-vector) and %f28 (scalar).
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;
|
||||
$Lt_0_21762:
|
||||
//<loop> Loop body line 54, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 60 0
|
||||
// %r34 = packed list entry.
ld.global.s32 %r34, [%rd20+0];
|
||||
.loc 16 61 0
|
||||
// Bits 30-31 of the entry select one of the four shared sp_lj factors -> %f29.
shr.s32 %r35, %r34, 30;
|
||||
and.b32 %r36, %r35, 3;
|
||||
cvt.s64.s32 %rd30, %r36;
|
||||
mul.wide.s32 %rd31, %r36, 4;
|
||||
add.u64 %rd32, %rd29, %rd31;
|
||||
ld.shared.f32 %f29, [%rd32+0];
|
||||
.loc 16 64 0
|
||||
// Low 30 bits (mask 0x3FFFFFFF) are the texture index of the other element.
and.b32 %r37, %r34, 1073741823;
|
||||
mov.u32 %r38, %r37;
|
||||
mov.s32 %r39, 0;
|
||||
mov.u32 %r40, %r39;
|
||||
mov.s32 %r41, 0;
|
||||
mov.u32 %r42, %r41;
|
||||
mov.s32 %r43, 0;
|
||||
mov.u32 %r44, %r43;
|
||||
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];
|
||||
mov.f32 %f34, %f30;
|
||||
mov.f32 %f35, %f31;
|
||||
mov.f32 %f36, %f32;
|
||||
mov.f32 %f37, %f33;
|
||||
// %r45 = fetched fourth component truncated to s32.
cvt.rzi.ftz.s32.f32 %r45, %f37;
|
||||
// Component differences (own - fetched): %f39 = 1st, %f38 = 2nd, %f40 = 3rd.
// %f41 = %f38^2 (reused later); %f43 = sum of the three squares.
sub.ftz.f32 %f38, %f22, %f35;
|
||||
sub.ftz.f32 %f39, %f21, %f34;
|
||||
sub.ftz.f32 %f40, %f23, %f36;
|
||||
mul.ftz.f32 %f41, %f38, %f38;
|
||||
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
|
||||
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
|
||||
// %rd35 = lj1 + 16 * (%r45 + %r33); process this entry only if lj1[..] component 0 > %f43.
add.s32 %r46, %r45, %r33;
|
||||
cvt.s64.s32 %rd33, %r46;
|
||||
mul.wide.s32 %rd34, %r46, 16;
|
||||
add.u64 %rd35, %rd34, %rd28;
|
||||
ld.global.f32 %f44, [%rd35+0];
|
||||
setp.gt.ftz.f32 %p4, %f44, %f43;
|
||||
@!%p4 bra $Lt_0_24066;
|
||||
// %f45 = approximate 1 / %f43. lj1 component 1 (%f46) selects how %f49 and %f50 are formed:
//   == 2.0 : %f49 = %f45^2,             %f50 = %f45^4
//   == 1.0 : %f50 = %f45 * sqrt(%f45),  %f49 = %f50^2
//   other  : %f49 = %f50 = %f45^3
rcp.approx.ftz.f32 %f45, %f43;
|
||||
ld.global.f32 %f46, [%rd35+4];
|
||||
mov.f32 %f47, 0f40000000; // 2
|
||||
setp.eq.ftz.f32 %p5, %f46, %f47;
|
||||
@!%p5 bra $Lt_0_22786;
|
||||
.loc 16 79 0
|
||||
mul.ftz.f32 %f48, %f45, %f45;
|
||||
mov.f32 %f49, %f48;
|
||||
.loc 16 80 0
|
||||
mul.ftz.f32 %f50, %f48, %f48;
|
||||
bra.uni $Lt_0_23042;
|
||||
$Lt_0_22786:
|
||||
mov.f32 %f51, 0f3f800000; // 1
|
||||
setp.eq.ftz.f32 %p6, %f46, %f51;
|
||||
@!%p6 bra $Lt_0_23298;
|
||||
.loc 16 82 0
|
||||
sqrt.approx.ftz.f32 %f52, %f45;
|
||||
mul.ftz.f32 %f53, %f45, %f52;
|
||||
mov.f32 %f50, %f53;
|
||||
.loc 16 83 0
|
||||
mul.ftz.f32 %f49, %f53, %f53;
|
||||
bra.uni $Lt_0_23042;
|
||||
$Lt_0_23298:
|
||||
.loc 16 85 0
|
||||
mul.ftz.f32 %f54, %f45, %f45;
|
||||
mul.ftz.f32 %f55, %f45, %f54;
|
||||
mov.f32 %f49, %f55;
|
||||
.loc 16 86 0
|
||||
mov.f32 %f50, %f55;
|
||||
$Lt_0_23042:
|
||||
$Lt_0_22530:
|
||||
.loc 16 88 0
|
||||
// %f62 = %f45 * %f29 * %f49 * (lj1 component 2 * %f50 - lj1 component 3).
mul.ftz.f32 %f56, %f45, %f29;
|
||||
mul.ftz.f32 %f57, %f49, %f56;
|
||||
ld.global.v2.f32 {%f58,%f59}, [%rd35+8];
|
||||
mul.ftz.f32 %f60, %f58, %f50;
|
||||
sub.ftz.f32 %f61, %f60, %f59;
|
||||
mul.ftz.f32 %f62, %f57, %f61;
|
||||
.loc 16 90 0
|
||||
// Accumulate %f62 times each difference component into %f27 / %f26 / %f25.
fma.rn.ftz.f32 %f27, %f39, %f62, %f27;
|
||||
.loc 16 91 0
|
||||
fma.rn.ftz.f32 %f26, %f38, %f62, %f26;
|
||||
.loc 16 92 0
|
||||
fma.rn.ftz.f32 %f25, %f40, %f62, %f25;
|
||||
// eflag > 0: %f28 += %f29 * %f49 * (lj3[0] * %f50 - lj3[1]) - lj3[2], with lj3 indexed like lj1.
ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];
|
||||
mov.u32 %r48, 0;
|
||||
setp.le.s32 %p7, %r47, %r48;
|
||||
@%p7 bra $Lt_0_23554;
|
||||
.loc 16 94 0
|
||||
ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];
|
||||
add.u64 %rd37, %rd36, %rd34;
|
||||
ld.global.v4.f32 {%f63,%f64,%f65,_}, [%rd37+0];
|
||||
mul.ftz.f32 %f66, %f29, %f49;
|
||||
mul.ftz.f32 %f67, %f63, %f50;
|
||||
sub.ftz.f32 %f68, %f67, %f64;
|
||||
mul.ftz.f32 %f69, %f66, %f68;
|
||||
sub.ftz.f32 %f70, %f69, %f65;
|
||||
add.ftz.f32 %f28, %f28, %f70;
|
||||
$Lt_0_23554:
|
||||
// vflag > 0: accumulate %f62 times the pairwise products of the difference components,
// in the order 1*1, 2*2, 3*3, 1*2, 1*3, 2*3 -> %f6, %f8, %f10, %f12, %f14, %f15/%f16.
ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r50, 0;
|
||||
setp.le.s32 %p8, %r49, %r50;
|
||||
@%p8 bra $Lt_0_24066;
|
||||
.loc 16 97 0
|
||||
mov.f32 %f71, %f6;
|
||||
mul.ftz.f32 %f72, %f39, %f39;
|
||||
fma.rn.ftz.f32 %f73, %f62, %f72, %f71;
|
||||
mov.f32 %f6, %f73;
|
||||
.loc 16 98 0
|
||||
mov.f32 %f74, %f8;
|
||||
fma.rn.ftz.f32 %f75, %f62, %f41, %f74;
|
||||
mov.f32 %f8, %f75;
|
||||
.loc 16 99 0
|
||||
mov.f32 %f76, %f10;
|
||||
mul.ftz.f32 %f77, %f40, %f40;
|
||||
fma.rn.ftz.f32 %f78, %f62, %f77, %f76;
|
||||
mov.f32 %f10, %f78;
|
||||
.loc 16 100 0
|
||||
mov.f32 %f79, %f12;
|
||||
mul.ftz.f32 %f80, %f38, %f39;
|
||||
fma.rn.ftz.f32 %f81, %f62, %f80, %f79;
|
||||
mov.f32 %f12, %f81;
|
||||
.loc 16 101 0
|
||||
mov.f32 %f82, %f14;
|
||||
mul.ftz.f32 %f83, %f39, %f40;
|
||||
fma.rn.ftz.f32 %f84, %f62, %f83, %f82;
|
||||
mov.f32 %f14, %f84;
|
||||
.loc 16 102 0
|
||||
mul.ftz.f32 %f85, %f38, %f40;
|
||||
fma.rn.ftz.f32 %f15, %f62, %f85, %f15;
|
||||
mov.f32 %f16, %f15;
|
||||
$Lt_0_24066:
|
||||
$Lt_0_22018:
|
||||
.loc 16 58 0
|
||||
// Advance the list pointer by the stride (%r16 entries, 4 bytes each); loop while below the end pointer.
mul.lo.u64 %rd38, %rd27, 4;
|
||||
add.u64 %rd20, %rd20, %rd38;
|
||||
setp.lt.u64 %p9, %rd20, %rd19;
|
||||
@%p9 bra $Lt_0_21762;
|
||||
bra.uni $Lt_0_21250;
|
||||
$Lt_0_30466:
|
||||
// Loop skipped: the per-thread sums are simply zero.
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
$Lt_0_21250:
|
||||
// The cross-thread reduction is needed only when t_per_atom > 1.
mov.u32 %r51, 1;
|
||||
setp.le.s32 %p10, %r1, %r51;
|
||||
@%p10 bra $Lt_0_26882;
|
||||
.loc 16 107 0
|
||||
// %rd42 = red_acc + 4*tid.x. Publish %f27, %f26, %f25, %f28 in planes at byte offsets 0, 512, 1024, 1536.
mov.u64 %rd39, __cuda___cuda_local_var_32608_55_non_const_red_acc108;
|
||||
cvt.s64.s32 %rd40, %r2;
|
||||
mul.wide.s32 %rd41, %r2, 4;
|
||||
add.u64 %rd42, %rd39, %rd41;
|
||||
mov.f32 %f86, %f27;
|
||||
st.shared.f32 [%rd42+0], %f86;
|
||||
mov.f32 %f87, %f26;
|
||||
st.shared.f32 [%rd42+512], %f87;
|
||||
mov.f32 %f88, %f25;
|
||||
st.shared.f32 [%rd42+1024], %f88;
|
||||
mov.f32 %f89, %f28;
|
||||
st.shared.f32 [%rd42+1536], %f89;
|
||||
// %r56 = t_per_atom / 2 as a signed division rounding toward zero (sign bit added before the shift).
shr.s32 %r52, %r1, 31;
|
||||
mov.s32 %r53, 1;
|
||||
and.b32 %r54, %r52, %r53;
|
||||
add.s32 %r55, %r54, %r1;
|
||||
shr.s32 %r56, %r55, 1;
|
||||
mov.s32 %r57, %r56;
|
||||
mov.u32 %r58, 0;
|
||||
setp.ne.u32 %p11, %r56, %r58;
|
||||
@!%p11 bra $Lt_0_25346;
|
||||
$Lt_0_25858:
|
||||
// Halving loop: lanes with lane < offset (%r57) add in the values published by thread tid.x + offset.
// NOTE(review): there is no bar.sync inside this loop - presumably it relies on the
// cooperating threads running in lock-step within a warp; confirm.
setp.ge.u32 %p12, %r13, %r57;
|
||||
@%p12 bra $Lt_0_26114;
|
||||
add.u32 %r59, %r2, %r57;
|
||||
cvt.u64.u32 %rd43, %r59;
|
||||
mul.wide.u32 %rd44, %r59, 4;
|
||||
add.u64 %rd45, %rd39, %rd44;
|
||||
ld.shared.f32 %f90, [%rd45+0];
|
||||
add.ftz.f32 %f86, %f90, %f86;
|
||||
st.shared.f32 [%rd42+0], %f86;
|
||||
ld.shared.f32 %f91, [%rd45+512];
|
||||
add.ftz.f32 %f87, %f91, %f87;
|
||||
st.shared.f32 [%rd42+512], %f87;
|
||||
ld.shared.f32 %f92, [%rd45+1024];
|
||||
add.ftz.f32 %f88, %f92, %f88;
|
||||
st.shared.f32 [%rd42+1024], %f88;
|
||||
ld.shared.f32 %f93, [%rd45+1536];
|
||||
add.ftz.f32 %f89, %f93, %f89;
|
||||
st.shared.f32 [%rd42+1536], %f89;
|
||||
$Lt_0_26114:
|
||||
shr.u32 %r57, %r57, 1;
|
||||
mov.u32 %r60, 0;
|
||||
setp.ne.u32 %p13, %r57, %r60;
|
||||
@%p13 bra $Lt_0_25858;
|
||||
$Lt_0_25346:
|
||||
// Copy the reduced values back into the per-thread sum registers.
mov.f32 %f27, %f86;
|
||||
mov.f32 %f26, %f87;
|
||||
mov.f32 %f25, %f88;
|
||||
mov.f32 %f28, %f89;
|
||||
// vflag > 0: repeat the same reduction for the six extra sums, using all six 512-byte planes.
ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r62, 0;
|
||||
setp.le.s32 %p14, %r61, %r62;
|
||||
@%p14 bra $Lt_0_26882;
|
||||
mov.f32 %f86, %f6;
|
||||
st.shared.f32 [%rd42+0], %f86;
|
||||
mov.f32 %f87, %f8;
|
||||
st.shared.f32 [%rd42+512], %f87;
|
||||
mov.f32 %f88, %f10;
|
||||
st.shared.f32 [%rd42+1024], %f88;
|
||||
mov.f32 %f89, %f12;
|
||||
st.shared.f32 [%rd42+1536], %f89;
|
||||
mov.f32 %f94, %f14;
|
||||
st.shared.f32 [%rd42+2048], %f94;
|
||||
mov.f32 %f95, %f15;
|
||||
st.shared.f32 [%rd42+2560], %f95;
|
||||
mov.s32 %r63, %r56;
|
||||
@!%p11 bra $Lt_0_27394;
|
||||
$Lt_0_27906:
|
||||
// Same halving pattern as above (again without a barrier), now over six planes.
setp.ge.u32 %p15, %r13, %r63;
|
||||
@%p15 bra $Lt_0_28162;
|
||||
add.u32 %r64, %r2, %r63;
|
||||
cvt.u64.u32 %rd46, %r64;
|
||||
mul.wide.u32 %rd47, %r64, 4;
|
||||
add.u64 %rd48, %rd39, %rd47;
|
||||
ld.shared.f32 %f96, [%rd48+0];
|
||||
add.ftz.f32 %f86, %f96, %f86;
|
||||
st.shared.f32 [%rd42+0], %f86;
|
||||
ld.shared.f32 %f97, [%rd48+512];
|
||||
add.ftz.f32 %f87, %f97, %f87;
|
||||
st.shared.f32 [%rd42+512], %f87;
|
||||
ld.shared.f32 %f98, [%rd48+1024];
|
||||
add.ftz.f32 %f88, %f98, %f88;
|
||||
st.shared.f32 [%rd42+1024], %f88;
|
||||
ld.shared.f32 %f99, [%rd48+1536];
|
||||
add.ftz.f32 %f89, %f99, %f89;
|
||||
st.shared.f32 [%rd42+1536], %f89;
|
||||
ld.shared.f32 %f100, [%rd48+2048];
|
||||
add.ftz.f32 %f94, %f100, %f94;
|
||||
st.shared.f32 [%rd42+2048], %f94;
|
||||
ld.shared.f32 %f101, [%rd48+2560];
|
||||
add.ftz.f32 %f95, %f101, %f95;
|
||||
st.shared.f32 [%rd42+2560], %f95;
|
||||
$Lt_0_28162:
|
||||
shr.u32 %r63, %r63, 1;
|
||||
mov.u32 %r65, 0;
|
||||
setp.ne.u32 %p16, %r63, %r65;
|
||||
@%p16 bra $Lt_0_27906;
|
||||
$Lt_0_27394:
|
||||
mov.f32 %f6, %f86;
|
||||
mov.f32 %f8, %f87;
|
||||
mov.f32 %f10, %f88;
|
||||
mov.f32 %f12, %f89;
|
||||
mov.f32 %f14, %f94;
|
||||
mov.f32 %f16, %f95;
|
||||
$Lt_0_26882:
|
||||
$Lt_0_24834:
|
||||
// Only the thread with lane (%r13) == 0 writes results; all others exit.
mov.u32 %r66, 0;
|
||||
setp.ne.s32 %p17, %r13, %r66;
|
||||
@%p17 bra $Lt_0_28930;
|
||||
// %rd50 = engv + 4*%r8; successive outputs for the same index are 4*inum bytes apart.
ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];
|
||||
add.u64 %rd50, %rd49, %rd5;
|
||||
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
|
||||
mov.u32 %r68, 0;
|
||||
setp.le.s32 %p18, %r67, %r68;
|
||||
@%p18 bra $Lt_0_29442;
|
||||
// eflag > 0: store %f28 and advance the output pointer by one 4*inum step.
st.global.f32 [%rd50+0], %f28;
|
||||
cvt.s64.s32 %rd51, %r9;
|
||||
mul.wide.s32 %rd52, %r9, 4;
|
||||
add.u64 %rd50, %rd50, %rd52;
|
||||
$Lt_0_29442:
|
||||
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p19, %r69, %r70;
|
||||
@%p19 bra $Lt_0_29954;
|
||||
// vflag > 0: store %f6, %f8, %f10, %f12, %f14, %f16 at consecutive 4*inum-byte steps.
mov.f32 %f102, %f6;
|
||||
st.global.f32 [%rd50+0], %f102;
|
||||
cvt.s64.s32 %rd53, %r9;
|
||||
mul.wide.s32 %rd54, %r9, 4;
|
||||
add.u64 %rd55, %rd54, %rd50;
|
||||
mov.f32 %f103, %f8;
|
||||
st.global.f32 [%rd55+0], %f103;
|
||||
add.u64 %rd56, %rd54, %rd55;
|
||||
mov.f32 %f104, %f10;
|
||||
st.global.f32 [%rd56+0], %f104;
|
||||
add.u64 %rd57, %rd54, %rd56;
|
||||
mov.f32 %f105, %f12;
|
||||
st.global.f32 [%rd57+0], %f105;
|
||||
add.u64 %rd50, %rd54, %rd57;
|
||||
mov.f32 %f106, %f14;
|
||||
st.global.f32 [%rd50+0], %f106;
|
||||
mov.f32 %f107, %f16;
|
||||
add.u64 %rd58, %rd54, %rd50;
|
||||
st.global.f32 [%rd58+0], %f107;
|
||||
$Lt_0_29954:
|
||||
// ans + 16*%r8 receives {%f27, %f26, %f25, %f108}.
// NOTE(review): %f109, the source of %f108, is never written anywhere in this kernel,
// so the fourth stored component is undefined - confirm that no consumer reads it.
ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];
|
||||
mul.lo.u64 %rd60, %rd4, 16;
|
||||
add.u64 %rd61, %rd59, %rd60;
|
||||
mov.f32 %f108, %f109;
|
||||
st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f108};
|
||||
$Lt_0_28930:
|
||||
$Lt_0_20226:
|
||||
.loc 16 110 0
|
||||
exit;
|
||||
$LDWend_kernel_pair:
|
||||
} // kernel_pair
|
||||
|
||||
.entry kernel_pair_fast (
|
||||
.param .u64 __cudaparm_kernel_pair_fast_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_ans,
|
||||
.param .u64 __cudaparm_kernel_pair_fast___val_paramengv,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<74>;
|
||||
.reg .u64 %rd<75>;
|
||||
.reg .f32 %f<118>;
|
||||
.reg .pred %p<24>;
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32625_33_non_const_sp_lj3268[16];
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32623_34_non_const_lj13296[1936];
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32624_34_non_const_lj35232[1936];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32702_55_non_const_red_acc7168[3072];
|
||||
// __cuda_local_var_32635_10_non_const_f = 48
|
||||
// __cuda_local_var_32637_9_non_const_virial = 16
|
||||
.loc 16 118 0
|
||||
$LDWbegin_kernel_pair_fast:
|
||||
cvt.s32.u32 %r1, %tid.x;
|
||||
mov.u32 %r2, 3;
|
||||
setp.gt.s32 %p1, %r1, %r2;
|
||||
@%p1 bra $Lt_1_22530;
|
||||
.loc 16 126 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32625_33_non_const_sp_lj3268;
|
||||
cvt.s64.s32 %rd2, %r1;
|
||||
mul.wide.s32 %rd3, %r1, 4;
|
||||
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f32 %f1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f32 [%rd6+0], %f1;
|
||||
$Lt_1_22530:
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32625_33_non_const_sp_lj3268;
|
||||
mov.u32 %r3, 120;
|
||||
setp.gt.s32 %p2, %r1, %r3;
|
||||
@%p2 bra $Lt_1_23042;
|
||||
.loc 16 128 0
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32623_34_non_const_lj13296;
|
||||
cvt.s64.s32 %rd8, %r1;
|
||||
mul.wide.s32 %rd9, %r1, 16;
|
||||
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
|
||||
add.u64 %rd11, %rd10, %rd9;
|
||||
add.u64 %rd12, %rd9, %rd7;
|
||||
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
|
||||
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
|
||||
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r5, 0;
|
||||
setp.le.s32 %p3, %r4, %r5;
|
||||
@%p3 bra $Lt_1_23554;
|
||||
.loc 16 130 0
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;
|
||||
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
|
||||
add.u64 %rd15, %rd14, %rd9;
|
||||
add.u64 %rd16, %rd9, %rd13;
|
||||
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
|
||||
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
|
||||
$Lt_1_23554:
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;
|
||||
$Lt_1_23042:
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32623_34_non_const_lj13296;
|
||||
.loc 16 138 0
|
||||
mov.f32 %f10, 0f00000000; // 0
|
||||
mov.f32 %f11, %f10;
|
||||
mov.f32 %f12, 0f00000000; // 0
|
||||
mov.f32 %f13, %f12;
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
mov.f32 %f15, %f14;
|
||||
mov.f32 %f16, 0f00000000; // 0
|
||||
mov.f32 %f17, %f16;
|
||||
mov.f32 %f18, 0f00000000; // 0
|
||||
mov.f32 %f19, %f18;
|
||||
mov.f32 %f20, 0f00000000; // 0
|
||||
mov.f32 %f21, %f20;
|
||||
.loc 16 140 0
|
||||
bar.sync 0;
|
||||
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
|
||||
div.s32 %r7, %r1, %r6;
|
||||
cvt.s32.u32 %r8, %ntid.x;
|
||||
div.s32 %r9, %r8, %r6;
|
||||
cvt.s32.u32 %r10, %ctaid.x;
|
||||
mul.lo.s32 %r11, %r10, %r9;
|
||||
add.s32 %r12, %r7, %r11;
|
||||
ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];
|
||||
setp.ge.s32 %p4, %r12, %r13;
|
||||
@%p4 bra $Lt_1_32770;
|
||||
.loc 16 145 0
|
||||
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];
|
||||
cvt.s64.s32 %rd17, %r14;
|
||||
mul.wide.s32 %rd18, %r14, 4;
|
||||
cvt.s64.s32 %rd19, %r12;
|
||||
mul.wide.s32 %rd20, %r12, 4;
|
||||
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
|
||||
add.u64 %rd22, %rd20, %rd21;
|
||||
add.u64 %rd23, %rd18, %rd22;
|
||||
ld.global.s32 %r15, [%rd23+0];
|
||||
sub.s32 %r16, %r6, 1;
|
||||
and.b32 %r17, %r16, %r1;
|
||||
cvt.s64.s32 %rd24, %r17;
|
||||
mul.wide.s32 %rd25, %r17, 4;
|
||||
ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];
|
||||
setp.ne.u64 %p5, %rd26, %rd21;
|
||||
@%p5 bra $Lt_1_24834;
|
||||
cvt.s32.s64 %r18, %rd17;
|
||||
mul.lo.s32 %r19, %r18, %r6;
|
||||
mov.s32 %r20, %r19;
|
||||
mul.lo.s32 %r21, %r16, %r12;
|
||||
add.s32 %r22, %r18, %r21;
|
||||
cvt.s64.s32 %rd27, %r22;
|
||||
mul.wide.s32 %rd28, %r22, 4;
|
||||
add.u64 %rd29, %rd23, %rd28;
|
||||
and.b32 %r23, %r16, %r15;
|
||||
cvt.s64.s32 %rd30, %r23;
|
||||
div.s32 %r24, %r15, %r6;
|
||||
mul.lo.s32 %r25, %r19, %r24;
|
||||
cvt.s64.s32 %rd31, %r25;
|
||||
add.u64 %rd32, %rd30, %rd31;
|
||||
mul.lo.u64 %rd33, %rd32, 4;
|
||||
add.u64 %rd34, %rd29, %rd33;
|
||||
add.u64 %rd35, %rd25, %rd29;
|
||||
bra.uni $Lt_1_24578;
|
||||
$Lt_1_24834:
|
||||
add.u64 %rd36, %rd18, %rd23;
|
||||
ld.global.s32 %r26, [%rd36+0];
|
||||
cvt.s64.s32 %rd37, %r26;
|
||||
mul.wide.s32 %rd38, %r26, 4;
|
||||
add.u64 %rd39, %rd26, %rd38;
|
||||
cvt.s64.s32 %rd40, %r15;
|
||||
mul.wide.s32 %rd41, %r15, 4;
|
||||
add.u64 %rd34, %rd39, %rd41;
|
||||
mov.s32 %r20, %r6;
|
||||
add.u64 %rd35, %rd25, %rd39;
|
||||
$Lt_1_24578:
|
||||
.loc 16 148 0
|
||||
ld.global.s32 %r27, [%rd22+0];
|
||||
mov.u32 %r28, %r27;
|
||||
mov.s32 %r29, 0;
|
||||
mov.u32 %r30, %r29;
|
||||
mov.s32 %r31, 0;
|
||||
mov.u32 %r32, %r31;
|
||||
mov.s32 %r33, 0;
|
||||
mov.u32 %r34, %r33;
|
||||
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];
|
||||
mov.f32 %f26, %f22;
|
||||
mov.f32 %f27, %f23;
|
||||
mov.f32 %f28, %f24;
|
||||
mov.f32 %f29, %f25;
|
||||
setp.ge.u64 %p6, %rd35, %rd34;
|
||||
@%p6 bra $Lt_1_34306;
|
||||
cvt.rzi.ftz.s32.f32 %r35, %f29;
|
||||
cvt.s64.s32 %rd42, %r20;
|
||||
mul.lo.s32 %r36, %r35, 11;
|
||||
cvt.rn.f32.s32 %f30, %r36;
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
$Lt_1_25602:
|
||||
//<loop> Loop body line 148, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 155 0
|
||||
ld.global.s32 %r37, [%rd35+0];
|
||||
.loc 16 156 0
|
||||
shr.s32 %r38, %r37, 30;
|
||||
and.b32 %r39, %r38, 3;
|
||||
cvt.s64.s32 %rd43, %r39;
|
||||
mul.wide.s32 %rd44, %r39, 4;
|
||||
add.u64 %rd45, %rd1, %rd44;
|
||||
ld.shared.f32 %f35, [%rd45+0];
|
||||
.loc 16 159 0
|
||||
and.b32 %r40, %r37, 1073741823;
|
||||
mov.u32 %r41, %r40;
|
||||
mov.s32 %r42, 0;
|
||||
mov.u32 %r43, %r42;
|
||||
mov.s32 %r44, 0;
|
||||
mov.u32 %r45, %r44;
|
||||
mov.s32 %r46, 0;
|
||||
mov.u32 %r47, %r46;
|
||||
tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}];
|
||||
mov.f32 %f40, %f36;
|
||||
mov.f32 %f41, %f37;
|
||||
mov.f32 %f42, %f38;
|
||||
mov.f32 %f43, %f39;
|
||||
sub.ftz.f32 %f44, %f27, %f41;
|
||||
sub.ftz.f32 %f45, %f26, %f40;
|
||||
sub.ftz.f32 %f46, %f28, %f42;
|
||||
mul.ftz.f32 %f47, %f44, %f44;
|
||||
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
|
||||
fma.rn.ftz.f32 %f49, %f46, %f46, %f48;
|
||||
add.ftz.f32 %f50, %f30, %f43;
|
||||
cvt.rzi.ftz.s32.f32 %r48, %f50;
|
||||
cvt.s64.s32 %rd46, %r48;
|
||||
mul.wide.s32 %rd47, %r48, 16;
|
||||
add.u64 %rd48, %rd47, %rd7;
|
||||
ld.shared.f32 %f51, [%rd48+0];
|
||||
setp.gt.ftz.f32 %p7, %f51, %f49;
|
||||
@!%p7 bra $Lt_1_27906;
|
||||
rcp.approx.ftz.f32 %f52, %f49;
|
||||
ld.shared.f32 %f53, [%rd48+4];
|
||||
mov.f32 %f54, 0f40000000; // 2
|
||||
setp.eq.ftz.f32 %p8, %f53, %f54;
|
||||
@!%p8 bra $Lt_1_26626;
|
||||
.loc 16 173 0
|
||||
mul.ftz.f32 %f55, %f52, %f52;
|
||||
mov.f32 %f56, %f55;
|
||||
.loc 16 174 0
|
||||
mul.ftz.f32 %f57, %f55, %f55;
|
||||
bra.uni $Lt_1_26882;
|
||||
$Lt_1_26626:
|
||||
mov.f32 %f58, 0f3f800000; // 1
|
||||
setp.eq.ftz.f32 %p9, %f53, %f58;
|
||||
@!%p9 bra $Lt_1_27138;
|
||||
.loc 16 176 0
|
||||
sqrt.approx.ftz.f32 %f59, %f52;
|
||||
mul.ftz.f32 %f60, %f52, %f59;
|
||||
mov.f32 %f57, %f60;
|
||||
.loc 16 177 0
|
||||
mul.ftz.f32 %f56, %f60, %f60;
|
||||
bra.uni $Lt_1_26882;
|
||||
$Lt_1_27138:
|
||||
.loc 16 179 0
|
||||
mul.ftz.f32 %f61, %f52, %f52;
|
||||
mul.ftz.f32 %f62, %f52, %f61;
|
||||
mov.f32 %f56, %f62;
|
||||
.loc 16 180 0
|
||||
mov.f32 %f57, %f62;
|
||||
$Lt_1_26882:
|
||||
$Lt_1_26370:
|
||||
.loc 16 182 0
|
||||
mul.ftz.f32 %f63, %f52, %f35;
|
||||
mul.ftz.f32 %f64, %f56, %f63;
|
||||
ld.shared.v2.f32 {%f65,%f66}, [%rd48+8];
|
||||
mul.ftz.f32 %f67, %f65, %f57;
|
||||
sub.ftz.f32 %f68, %f67, %f66;
|
||||
mul.ftz.f32 %f69, %f64, %f68;
|
||||
.loc 16 184 0
|
||||
fma.rn.ftz.f32 %f33, %f45, %f69, %f33;
|
||||
.loc 16 185 0
|
||||
fma.rn.ftz.f32 %f32, %f44, %f69, %f32;
|
||||
.loc 16 186 0
|
||||
fma.rn.ftz.f32 %f31, %f46, %f69, %f31;
|
||||
ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r50, 0;
|
||||
setp.le.s32 %p10, %r49, %r50;
|
||||
@%p10 bra $Lt_1_27394;
|
||||
.loc 16 188 0
|
||||
add.u64 %rd49, %rd47, %rd13;
|
||||
ld.shared.v4.f32 {%f70,%f71,%f72,_}, [%rd49+0];
|
||||
mul.ftz.f32 %f73, %f35, %f56;
|
||||
mul.ftz.f32 %f74, %f70, %f57;
|
||||
sub.ftz.f32 %f75, %f74, %f71;
|
||||
mul.ftz.f32 %f76, %f73, %f75;
|
||||
sub.ftz.f32 %f77, %f76, %f72;
|
||||
add.ftz.f32 %f34, %f34, %f77;
|
||||
$Lt_1_27394:
|
||||
ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r52, 0;
|
||||
setp.le.s32 %p11, %r51, %r52;
|
||||
@%p11 bra $Lt_1_27906;
|
||||
.loc 16 191 0
|
||||
mov.f32 %f78, %f11;
|
||||
mul.ftz.f32 %f79, %f45, %f45;
|
||||
fma.rn.ftz.f32 %f80, %f69, %f79, %f78;
|
||||
mov.f32 %f11, %f80;
|
||||
.loc 16 192 0
|
||||
mov.f32 %f81, %f13;
|
||||
fma.rn.ftz.f32 %f82, %f69, %f47, %f81;
|
||||
mov.f32 %f13, %f82;
|
||||
.loc 16 193 0
|
||||
mov.f32 %f83, %f15;
|
||||
mul.ftz.f32 %f84, %f46, %f46;
|
||||
fma.rn.ftz.f32 %f85, %f69, %f84, %f83;
|
||||
mov.f32 %f15, %f85;
|
||||
.loc 16 194 0
|
||||
mov.f32 %f86, %f17;
|
||||
mul.ftz.f32 %f87, %f44, %f45;
|
||||
fma.rn.ftz.f32 %f88, %f69, %f87, %f86;
|
||||
mov.f32 %f17, %f88;
|
||||
.loc 16 195 0
|
||||
mov.f32 %f89, %f19;
|
||||
mul.ftz.f32 %f90, %f45, %f46;
|
||||
fma.rn.ftz.f32 %f91, %f69, %f90, %f89;
|
||||
mov.f32 %f19, %f91;
|
||||
.loc 16 196 0
|
||||
mul.ftz.f32 %f92, %f44, %f46;
|
||||
fma.rn.ftz.f32 %f20, %f69, %f92, %f20;
|
||||
mov.f32 %f21, %f20;
|
||||
$Lt_1_27906:
|
||||
$Lt_1_25858:
|
||||
.loc 16 153 0
|
||||
mul.lo.u64 %rd50, %rd42, 4;
|
||||
add.u64 %rd35, %rd35, %rd50;
|
||||
setp.lt.u64 %p12, %rd35, %rd34;
|
||||
@%p12 bra $Lt_1_25602;
|
||||
bra.uni $Lt_1_25090;
|
||||
$Lt_1_34306:
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
$Lt_1_25090:
|
||||
mov.u32 %r53, 1;
|
||||
setp.le.s32 %p13, %r6, %r53;
|
||||
@%p13 bra $Lt_1_30722;
|
||||
.loc 16 201 0
|
||||
mov.u64 %rd51, __cuda___cuda_local_var_32702_55_non_const_red_acc7168;
|
||||
cvt.s64.s32 %rd52, %r1;
|
||||
mul.wide.s32 %rd53, %r1, 4;
|
||||
add.u64 %rd54, %rd51, %rd53;
|
||||
mov.f32 %f93, %f33;
|
||||
st.shared.f32 [%rd54+0], %f93;
|
||||
mov.f32 %f94, %f32;
|
||||
st.shared.f32 [%rd54+512], %f94;
|
||||
mov.f32 %f95, %f31;
|
||||
st.shared.f32 [%rd54+1024], %f95;
|
||||
mov.f32 %f96, %f34;
|
||||
st.shared.f32 [%rd54+1536], %f96;
|
||||
shr.s32 %r54, %r6, 31;
|
||||
mov.s32 %r55, 1;
|
||||
and.b32 %r56, %r54, %r55;
|
||||
add.s32 %r57, %r56, %r6;
|
||||
shr.s32 %r58, %r57, 1;
|
||||
mov.s32 %r59, %r58;
|
||||
mov.u32 %r60, 0;
|
||||
setp.ne.u32 %p14, %r58, %r60;
|
||||
@!%p14 bra $Lt_1_29186;
|
||||
$Lt_1_29698:
|
||||
setp.ge.u32 %p15, %r17, %r59;
|
||||
@%p15 bra $Lt_1_29954;
|
||||
add.u32 %r61, %r1, %r59;
|
||||
cvt.u64.u32 %rd55, %r61;
|
||||
mul.wide.u32 %rd56, %r61, 4;
|
||||
add.u64 %rd57, %rd51, %rd56;
|
||||
ld.shared.f32 %f97, [%rd57+0];
|
||||
add.ftz.f32 %f93, %f97, %f93;
|
||||
st.shared.f32 [%rd54+0], %f93;
|
||||
ld.shared.f32 %f98, [%rd57+512];
|
||||
add.ftz.f32 %f94, %f98, %f94;
|
||||
st.shared.f32 [%rd54+512], %f94;
|
||||
ld.shared.f32 %f99, [%rd57+1024];
|
||||
add.ftz.f32 %f95, %f99, %f95;
|
||||
st.shared.f32 [%rd54+1024], %f95;
|
||||
ld.shared.f32 %f100, [%rd57+1536];
|
||||
add.ftz.f32 %f96, %f100, %f96;
|
||||
st.shared.f32 [%rd54+1536], %f96;
|
||||
$Lt_1_29954:
|
||||
shr.u32 %r59, %r59, 1;
|
||||
mov.u32 %r62, 0;
|
||||
setp.ne.u32 %p16, %r59, %r62;
|
||||
@%p16 bra $Lt_1_29698;
|
||||
$Lt_1_29186:
|
||||
mov.f32 %f33, %f93;
|
||||
mov.f32 %f32, %f94;
|
||||
mov.f32 %f31, %f95;
|
||||
mov.f32 %f34, %f96;
|
||||
ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r64, 0;
|
||||
setp.le.s32 %p17, %r63, %r64;
|
||||
@%p17 bra $Lt_1_30722;
|
||||
mov.f32 %f93, %f11;
|
||||
st.shared.f32 [%rd54+0], %f93;
|
||||
mov.f32 %f94, %f13;
|
||||
st.shared.f32 [%rd54+512], %f94;
|
||||
mov.f32 %f95, %f15;
|
||||
st.shared.f32 [%rd54+1024], %f95;
|
||||
mov.f32 %f96, %f17;
|
||||
st.shared.f32 [%rd54+1536], %f96;
|
||||
mov.f32 %f101, %f19;
|
||||
st.shared.f32 [%rd54+2048], %f101;
|
||||
mov.f32 %f102, %f20;
|
||||
st.shared.f32 [%rd54+2560], %f102;
|
||||
mov.s32 %r65, %r58;
|
||||
@!%p14 bra $Lt_1_31234;
|
||||
$Lt_1_31746:
|
||||
setp.ge.u32 %p18, %r17, %r65;
|
||||
@%p18 bra $Lt_1_32002;
|
||||
add.u32 %r66, %r1, %r65;
|
||||
cvt.u64.u32 %rd58, %r66;
|
||||
mul.wide.u32 %rd59, %r66, 4;
|
||||
add.u64 %rd60, %rd51, %rd59;
|
||||
ld.shared.f32 %f103, [%rd60+0];
|
||||
add.ftz.f32 %f93, %f103, %f93;
|
||||
st.shared.f32 [%rd54+0], %f93;
|
||||
ld.shared.f32 %f104, [%rd60+512];
|
||||
add.ftz.f32 %f94, %f104, %f94;
|
||||
st.shared.f32 [%rd54+512], %f94;
|
||||
ld.shared.f32 %f105, [%rd60+1024];
|
||||
add.ftz.f32 %f95, %f105, %f95;
|
||||
st.shared.f32 [%rd54+1024], %f95;
|
||||
ld.shared.f32 %f106, [%rd60+1536];
|
||||
add.ftz.f32 %f96, %f106, %f96;
|
||||
st.shared.f32 [%rd54+1536], %f96;
|
||||
ld.shared.f32 %f107, [%rd60+2048];
|
||||
add.ftz.f32 %f101, %f107, %f101;
|
||||
st.shared.f32 [%rd54+2048], %f101;
|
||||
ld.shared.f32 %f108, [%rd60+2560];
|
||||
add.ftz.f32 %f102, %f108, %f102;
|
||||
st.shared.f32 [%rd54+2560], %f102;
|
||||
$Lt_1_32002:
|
||||
shr.u32 %r65, %r65, 1;
|
||||
mov.u32 %r67, 0;
|
||||
setp.ne.u32 %p19, %r65, %r67;
|
||||
@%p19 bra $Lt_1_31746;
|
||||
$Lt_1_31234:
|
||||
mov.f32 %f11, %f93;
|
||||
mov.f32 %f13, %f94;
|
||||
mov.f32 %f15, %f95;
|
||||
mov.f32 %f17, %f96;
|
||||
mov.f32 %f19, %f101;
|
||||
mov.f32 %f21, %f102;
|
||||
$Lt_1_30722:
|
||||
$Lt_1_28674:
|
||||
mov.u32 %r68, 0;
|
||||
setp.ne.s32 %p20, %r17, %r68;
|
||||
@%p20 bra $Lt_1_32770;
|
||||
ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];
|
||||
add.u64 %rd62, %rd61, %rd20;
|
||||
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p21, %r69, %r70;
|
||||
@%p21 bra $Lt_1_33282;
|
||||
st.global.f32 [%rd62+0], %f34;
|
||||
cvt.s64.s32 %rd63, %r13;
|
||||
mul.wide.s32 %rd64, %r13, 4;
|
||||
add.u64 %rd62, %rd62, %rd64;
|
||||
$Lt_1_33282:
|
||||
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r72, 0;
|
||||
setp.le.s32 %p22, %r71, %r72;
|
||||
@%p22 bra $Lt_1_33794;
|
||||
mov.f32 %f109, %f11;
|
||||
st.global.f32 [%rd62+0], %f109;
|
||||
cvt.s64.s32 %rd65, %r13;
|
||||
mul.wide.s32 %rd66, %r13, 4;
|
||||
add.u64 %rd67, %rd66, %rd62;
|
||||
mov.f32 %f110, %f13;
|
||||
st.global.f32 [%rd67+0], %f110;
|
||||
add.u64 %rd68, %rd66, %rd67;
|
||||
mov.f32 %f111, %f15;
|
||||
st.global.f32 [%rd68+0], %f111;
|
||||
add.u64 %rd69, %rd66, %rd68;
|
||||
mov.f32 %f112, %f17;
|
||||
st.global.f32 [%rd69+0], %f112;
|
||||
add.u64 %rd62, %rd66, %rd69;
|
||||
mov.f32 %f113, %f19;
|
||||
st.global.f32 [%rd62+0], %f113;
|
||||
mov.f32 %f114, %f21;
|
||||
add.u64 %rd70, %rd66, %rd62;
|
||||
st.global.f32 [%rd70+0], %f114;
|
||||
$Lt_1_33794:
|
||||
ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];
|
||||
mul.lo.u64 %rd72, %rd19, 16;
|
||||
add.u64 %rd73, %rd71, %rd72;
|
||||
mov.f32 %f115, %f116;
|
||||
st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f115};
|
||||
$Lt_1_32770:
|
||||
$Lt_1_24066:
|
||||
.loc 16 204 0
|
||||
exit;
|
||||
$LDWend_kernel_pair_fast:
|
||||
} // kernel_pair_fast
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,906 +0,0 @@
|
||||
const char * cg_cmm =
|
||||
" .version 2.3\n"
|
||||
" .target sm_20\n"
|
||||
" .address_size 64\n"
|
||||
" .global .texref pos_tex;\n"
|
||||
" .entry kernel_pair (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<72>;\n"
|
||||
" .reg .u64 %rd<63>;\n"
|
||||
" .reg .f32 %f<111>;\n"
|
||||
" .reg .pred %p<21>;\n"
|
||||
" .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32608_55_non_const_red_acc108[3072];\n"
|
||||
" .loc 16 31 0\n"
|
||||
"$LDWbegin_kernel_pair:\n"
|
||||
" .loc 16 36 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
|
||||
" ldu.global.f32 %f1, [%rd1+0];\n"
|
||||
" .loc 16 37 0\n"
|
||||
" ld.global.f32 %f2, [%rd1+4];\n"
|
||||
" .loc 16 38 0\n"
|
||||
" ld.global.f32 %f3, [%rd1+8];\n"
|
||||
" .loc 16 39 0\n"
|
||||
" ld.global.f32 %f4, [%rd1+12];\n"
|
||||
" st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n"
|
||||
" .loc 16 46 0\n"
|
||||
" mov.f32 %f5, 0f00000000; \n"
|
||||
" mov.f32 %f6, %f5;\n"
|
||||
" mov.f32 %f7, 0f00000000; \n"
|
||||
" mov.f32 %f8, %f7;\n"
|
||||
" mov.f32 %f9, 0f00000000; \n"
|
||||
" mov.f32 %f10, %f9;\n"
|
||||
" mov.f32 %f11, 0f00000000; \n"
|
||||
" mov.f32 %f12, %f11;\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, %f13;\n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
|
||||
" cvt.s32.u32 %r2, %tid.x;\n"
|
||||
" div.s32 %r3, %r2, %r1;\n"
|
||||
" cvt.s32.u32 %r4, %ntid.x;\n"
|
||||
" div.s32 %r5, %r4, %r1;\n"
|
||||
" cvt.s32.u32 %r6, %ctaid.x;\n"
|
||||
" mul.lo.s32 %r7, %r6, %r5;\n"
|
||||
" add.s32 %r8, %r3, %r7;\n"
|
||||
" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n"
|
||||
" setp.ge.s32 %p1, %r8, %r9;\n"
|
||||
" @%p1 bra $Lt_0_28930;\n"
|
||||
" .loc 16 51 0\n"
|
||||
" ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];\n"
|
||||
" cvt.s64.s32 %rd2, %r10;\n"
|
||||
" mul.wide.s32 %rd3, %r10, 4;\n"
|
||||
" cvt.s64.s32 %rd4, %r8;\n"
|
||||
" mul.wide.s32 %rd5, %r8, 4;\n"
|
||||
" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n"
|
||||
" add.u64 %rd7, %rd5, %rd6;\n"
|
||||
" add.u64 %rd8, %rd3, %rd7;\n"
|
||||
" ld.global.s32 %r11, [%rd8+0];\n"
|
||||
" sub.s32 %r12, %r1, 1;\n"
|
||||
" and.b32 %r13, %r12, %r2;\n"
|
||||
" cvt.s64.s32 %rd9, %r13;\n"
|
||||
" mul.wide.s32 %rd10, %r13, 4;\n"
|
||||
" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n"
|
||||
" setp.ne.u64 %p2, %rd11, %rd6;\n"
|
||||
" @%p2 bra $Lt_0_20994;\n"
|
||||
" cvt.s32.s64 %r14, %rd2;\n"
|
||||
" mul.lo.s32 %r15, %r14, %r1;\n"
|
||||
" mov.s32 %r16, %r15;\n"
|
||||
" mul.lo.s32 %r17, %r12, %r8;\n"
|
||||
" add.s32 %r18, %r14, %r17;\n"
|
||||
" cvt.s64.s32 %rd12, %r18;\n"
|
||||
" mul.wide.s32 %rd13, %r18, 4;\n"
|
||||
" add.u64 %rd14, %rd8, %rd13;\n"
|
||||
" and.b32 %r19, %r12, %r11;\n"
|
||||
" cvt.s64.s32 %rd15, %r19;\n"
|
||||
" div.s32 %r20, %r11, %r1;\n"
|
||||
" mul.lo.s32 %r21, %r15, %r20;\n"
|
||||
" cvt.s64.s32 %rd16, %r21;\n"
|
||||
" add.u64 %rd17, %rd15, %rd16;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" add.u64 %rd19, %rd14, %rd18;\n"
|
||||
" add.u64 %rd20, %rd10, %rd14;\n"
|
||||
" bra.uni $Lt_0_20738;\n"
|
||||
"$Lt_0_20994:\n"
|
||||
" add.u64 %rd21, %rd3, %rd8;\n"
|
||||
" ld.global.s32 %r22, [%rd21+0];\n"
|
||||
" cvt.s64.s32 %rd22, %r22;\n"
|
||||
" mul.wide.s32 %rd23, %r22, 4;\n"
|
||||
" add.u64 %rd24, %rd11, %rd23;\n"
|
||||
" cvt.s64.s32 %rd25, %r11;\n"
|
||||
" mul.wide.s32 %rd26, %r11, 4;\n"
|
||||
" add.u64 %rd19, %rd24, %rd26;\n"
|
||||
" mov.s32 %r16, %r1;\n"
|
||||
" add.u64 %rd20, %rd10, %rd24;\n"
|
||||
"$Lt_0_20738:\n"
|
||||
" .loc 16 54 0\n"
|
||||
" ld.global.s32 %r23, [%rd7+0];\n"
|
||||
" mov.u32 %r24, %r23;\n"
|
||||
" mov.s32 %r25, 0;\n"
|
||||
" mov.u32 %r26, %r25;\n"
|
||||
" mov.s32 %r27, 0;\n"
|
||||
" mov.u32 %r28, %r27;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.u32 %r30, %r29;\n"
|
||||
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n"
|
||||
" mov.f32 %f21, %f17;\n"
|
||||
" mov.f32 %f22, %f18;\n"
|
||||
" mov.f32 %f23, %f19;\n"
|
||||
" mov.f32 %f24, %f20;\n"
|
||||
" setp.ge.u64 %p3, %rd20, %rd19;\n"
|
||||
" @%p3 bra $Lt_0_30466;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r31, %f24;\n"
|
||||
" cvt.s64.s32 %rd27, %r16;\n"
|
||||
" ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];\n"
|
||||
" mul.lo.s32 %r33, %r32, %r31;\n"
|
||||
" ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];\n"
|
||||
" mov.f32 %f25, 0f00000000; \n"
|
||||
" mov.f32 %f26, 0f00000000; \n"
|
||||
" mov.f32 %f27, 0f00000000; \n"
|
||||
" mov.f32 %f28, 0f00000000; \n"
|
||||
" mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;\n"
|
||||
"$Lt_0_21762:\n"
|
||||
" .loc 16 60 0\n"
|
||||
" ld.global.s32 %r34, [%rd20+0];\n"
|
||||
" .loc 16 61 0\n"
|
||||
" shr.s32 %r35, %r34, 30;\n"
|
||||
" and.b32 %r36, %r35, 3;\n"
|
||||
" cvt.s64.s32 %rd30, %r36;\n"
|
||||
" mul.wide.s32 %rd31, %r36, 4;\n"
|
||||
" add.u64 %rd32, %rd29, %rd31;\n"
|
||||
" ld.shared.f32 %f29, [%rd32+0];\n"
|
||||
" .loc 16 64 0\n"
|
||||
" and.b32 %r37, %r34, 1073741823;\n"
|
||||
" mov.u32 %r38, %r37;\n"
|
||||
" mov.s32 %r39, 0;\n"
|
||||
" mov.u32 %r40, %r39;\n"
|
||||
" mov.s32 %r41, 0;\n"
|
||||
" mov.u32 %r42, %r41;\n"
|
||||
" mov.s32 %r43, 0;\n"
|
||||
" mov.u32 %r44, %r43;\n"
|
||||
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];\n"
|
||||
" mov.f32 %f34, %f30;\n"
|
||||
" mov.f32 %f35, %f31;\n"
|
||||
" mov.f32 %f36, %f32;\n"
|
||||
" mov.f32 %f37, %f33;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r45, %f37;\n"
|
||||
" sub.ftz.f32 %f38, %f22, %f35;\n"
|
||||
" sub.ftz.f32 %f39, %f21, %f34;\n"
|
||||
" sub.ftz.f32 %f40, %f23, %f36;\n"
|
||||
" mul.ftz.f32 %f41, %f38, %f38;\n"
|
||||
" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n"
|
||||
" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n"
|
||||
" add.s32 %r46, %r45, %r33;\n"
|
||||
" cvt.s64.s32 %rd33, %r46;\n"
|
||||
" mul.wide.s32 %rd34, %r46, 16;\n"
|
||||
" add.u64 %rd35, %rd34, %rd28;\n"
|
||||
" ld.global.f32 %f44, [%rd35+0];\n"
|
||||
" setp.gt.ftz.f32 %p4, %f44, %f43;\n"
|
||||
" @!%p4 bra $Lt_0_24066;\n"
|
||||
" rcp.approx.ftz.f32 %f45, %f43;\n"
|
||||
" ld.global.f32 %f46, [%rd35+4];\n"
|
||||
" mov.f32 %f47, 0f40000000; \n"
|
||||
" setp.eq.ftz.f32 %p5, %f46, %f47;\n"
|
||||
" @!%p5 bra $Lt_0_22786;\n"
|
||||
" .loc 16 79 0\n"
|
||||
" mul.ftz.f32 %f48, %f45, %f45;\n"
|
||||
" mov.f32 %f49, %f48;\n"
|
||||
" .loc 16 80 0\n"
|
||||
" mul.ftz.f32 %f50, %f48, %f48;\n"
|
||||
" bra.uni $Lt_0_23042;\n"
|
||||
"$Lt_0_22786:\n"
|
||||
" mov.f32 %f51, 0f3f800000; \n"
|
||||
" setp.eq.ftz.f32 %p6, %f46, %f51;\n"
|
||||
" @!%p6 bra $Lt_0_23298;\n"
|
||||
" .loc 16 82 0\n"
|
||||
" sqrt.approx.ftz.f32 %f52, %f45;\n"
|
||||
" mul.ftz.f32 %f53, %f45, %f52;\n"
|
||||
" mov.f32 %f50, %f53;\n"
|
||||
" .loc 16 83 0\n"
|
||||
" mul.ftz.f32 %f49, %f53, %f53;\n"
|
||||
" bra.uni $Lt_0_23042;\n"
|
||||
"$Lt_0_23298:\n"
|
||||
" .loc 16 85 0\n"
|
||||
" mul.ftz.f32 %f54, %f45, %f45;\n"
|
||||
" mul.ftz.f32 %f55, %f45, %f54;\n"
|
||||
" mov.f32 %f49, %f55;\n"
|
||||
" .loc 16 86 0\n"
|
||||
" mov.f32 %f50, %f55;\n"
|
||||
"$Lt_0_23042:\n"
|
||||
"$Lt_0_22530:\n"
|
||||
" .loc 16 88 0\n"
|
||||
" mul.ftz.f32 %f56, %f45, %f29;\n"
|
||||
" mul.ftz.f32 %f57, %f49, %f56;\n"
|
||||
" ld.global.v2.f32 {%f58,%f59}, [%rd35+8];\n"
|
||||
" mul.ftz.f32 %f60, %f58, %f50;\n"
|
||||
" sub.ftz.f32 %f61, %f60, %f59;\n"
|
||||
" mul.ftz.f32 %f62, %f57, %f61;\n"
|
||||
" .loc 16 90 0\n"
|
||||
" fma.rn.ftz.f32 %f27, %f39, %f62, %f27;\n"
|
||||
" .loc 16 91 0\n"
|
||||
" fma.rn.ftz.f32 %f26, %f38, %f62, %f26;\n"
|
||||
" .loc 16 92 0\n"
|
||||
" fma.rn.ftz.f32 %f25, %f40, %f62, %f25;\n"
|
||||
" ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" mov.u32 %r48, 0;\n"
|
||||
" setp.le.s32 %p7, %r47, %r48;\n"
|
||||
" @%p7 bra $Lt_0_23554;\n"
|
||||
" .loc 16 94 0\n"
|
||||
" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];\n"
|
||||
" add.u64 %rd37, %rd36, %rd34;\n"
|
||||
" ld.global.v4.f32 {%f63,%f64,%f65,_}, [%rd37+0];\n"
|
||||
" mul.ftz.f32 %f66, %f29, %f49;\n"
|
||||
" mul.ftz.f32 %f67, %f63, %f50;\n"
|
||||
" sub.ftz.f32 %f68, %f67, %f64;\n"
|
||||
" mul.ftz.f32 %f69, %f66, %f68;\n"
|
||||
" sub.ftz.f32 %f70, %f69, %f65;\n"
|
||||
" add.ftz.f32 %f28, %f28, %f70;\n"
|
||||
"$Lt_0_23554:\n"
|
||||
" ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r50, 0;\n"
|
||||
" setp.le.s32 %p8, %r49, %r50;\n"
|
||||
" @%p8 bra $Lt_0_24066;\n"
|
||||
" .loc 16 97 0\n"
|
||||
" mov.f32 %f71, %f6;\n"
|
||||
" mul.ftz.f32 %f72, %f39, %f39;\n"
|
||||
" fma.rn.ftz.f32 %f73, %f62, %f72, %f71;\n"
|
||||
" mov.f32 %f6, %f73;\n"
|
||||
" .loc 16 98 0\n"
|
||||
" mov.f32 %f74, %f8;\n"
|
||||
" fma.rn.ftz.f32 %f75, %f62, %f41, %f74;\n"
|
||||
" mov.f32 %f8, %f75;\n"
|
||||
" .loc 16 99 0\n"
|
||||
" mov.f32 %f76, %f10;\n"
|
||||
" mul.ftz.f32 %f77, %f40, %f40;\n"
|
||||
" fma.rn.ftz.f32 %f78, %f62, %f77, %f76;\n"
|
||||
" mov.f32 %f10, %f78;\n"
|
||||
" .loc 16 100 0\n"
|
||||
" mov.f32 %f79, %f12;\n"
|
||||
" mul.ftz.f32 %f80, %f38, %f39;\n"
|
||||
" fma.rn.ftz.f32 %f81, %f62, %f80, %f79;\n"
|
||||
" mov.f32 %f12, %f81;\n"
|
||||
" .loc 16 101 0\n"
|
||||
" mov.f32 %f82, %f14;\n"
|
||||
" mul.ftz.f32 %f83, %f39, %f40;\n"
|
||||
" fma.rn.ftz.f32 %f84, %f62, %f83, %f82;\n"
|
||||
" mov.f32 %f14, %f84;\n"
|
||||
" .loc 16 102 0\n"
|
||||
" mul.ftz.f32 %f85, %f38, %f40;\n"
|
||||
" fma.rn.ftz.f32 %f15, %f62, %f85, %f15;\n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
"$Lt_0_24066:\n"
|
||||
"$Lt_0_22018:\n"
|
||||
" .loc 16 58 0\n"
|
||||
" mul.lo.u64 %rd38, %rd27, 4;\n"
|
||||
" add.u64 %rd20, %rd20, %rd38;\n"
|
||||
" setp.lt.u64 %p9, %rd20, %rd19;\n"
|
||||
" @%p9 bra $Lt_0_21762;\n"
|
||||
" bra.uni $Lt_0_21250;\n"
|
||||
"$Lt_0_30466:\n"
|
||||
" mov.f32 %f25, 0f00000000; \n"
|
||||
" mov.f32 %f26, 0f00000000; \n"
|
||||
" mov.f32 %f27, 0f00000000; \n"
|
||||
" mov.f32 %f28, 0f00000000; \n"
|
||||
"$Lt_0_21250:\n"
|
||||
" mov.u32 %r51, 1;\n"
|
||||
" setp.le.s32 %p10, %r1, %r51;\n"
|
||||
" @%p10 bra $Lt_0_26882;\n"
|
||||
" .loc 16 107 0\n"
|
||||
" mov.u64 %rd39, __cuda___cuda_local_var_32608_55_non_const_red_acc108;\n"
|
||||
" cvt.s64.s32 %rd40, %r2;\n"
|
||||
" mul.wide.s32 %rd41, %r2, 4;\n"
|
||||
" add.u64 %rd42, %rd39, %rd41;\n"
|
||||
" mov.f32 %f86, %f27;\n"
|
||||
" st.shared.f32 [%rd42+0], %f86;\n"
|
||||
" mov.f32 %f87, %f26;\n"
|
||||
" st.shared.f32 [%rd42+512], %f87;\n"
|
||||
" mov.f32 %f88, %f25;\n"
|
||||
" st.shared.f32 [%rd42+1024], %f88;\n"
|
||||
" mov.f32 %f89, %f28;\n"
|
||||
" st.shared.f32 [%rd42+1536], %f89;\n"
|
||||
" shr.s32 %r52, %r1, 31;\n"
|
||||
" mov.s32 %r53, 1;\n"
|
||||
" and.b32 %r54, %r52, %r53;\n"
|
||||
" add.s32 %r55, %r54, %r1;\n"
|
||||
" shr.s32 %r56, %r55, 1;\n"
|
||||
" mov.s32 %r57, %r56;\n"
|
||||
" mov.u32 %r58, 0;\n"
|
||||
" setp.ne.u32 %p11, %r56, %r58;\n"
|
||||
" @!%p11 bra $Lt_0_25346;\n"
|
||||
"$Lt_0_25858:\n"
|
||||
" setp.ge.u32 %p12, %r13, %r57;\n"
|
||||
" @%p12 bra $Lt_0_26114;\n"
|
||||
" add.u32 %r59, %r2, %r57;\n"
|
||||
" cvt.u64.u32 %rd43, %r59;\n"
|
||||
" mul.wide.u32 %rd44, %r59, 4;\n"
|
||||
" add.u64 %rd45, %rd39, %rd44;\n"
|
||||
" ld.shared.f32 %f90, [%rd45+0];\n"
|
||||
" add.ftz.f32 %f86, %f90, %f86;\n"
|
||||
" st.shared.f32 [%rd42+0], %f86;\n"
|
||||
" ld.shared.f32 %f91, [%rd45+512];\n"
|
||||
" add.ftz.f32 %f87, %f91, %f87;\n"
|
||||
" st.shared.f32 [%rd42+512], %f87;\n"
|
||||
" ld.shared.f32 %f92, [%rd45+1024];\n"
|
||||
" add.ftz.f32 %f88, %f92, %f88;\n"
|
||||
" st.shared.f32 [%rd42+1024], %f88;\n"
|
||||
" ld.shared.f32 %f93, [%rd45+1536];\n"
|
||||
" add.ftz.f32 %f89, %f93, %f89;\n"
|
||||
" st.shared.f32 [%rd42+1536], %f89;\n"
|
||||
"$Lt_0_26114:\n"
|
||||
" shr.u32 %r57, %r57, 1;\n"
|
||||
" mov.u32 %r60, 0;\n"
|
||||
" setp.ne.u32 %p13, %r57, %r60;\n"
|
||||
" @%p13 bra $Lt_0_25858;\n"
|
||||
"$Lt_0_25346:\n"
|
||||
" mov.f32 %f27, %f86;\n"
|
||||
" mov.f32 %f26, %f87;\n"
|
||||
" mov.f32 %f25, %f88;\n"
|
||||
" mov.f32 %f28, %f89;\n"
|
||||
" ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r62, 0;\n"
|
||||
" setp.le.s32 %p14, %r61, %r62;\n"
|
||||
" @%p14 bra $Lt_0_26882;\n"
|
||||
" mov.f32 %f86, %f6;\n"
|
||||
" st.shared.f32 [%rd42+0], %f86;\n"
|
||||
" mov.f32 %f87, %f8;\n"
|
||||
" st.shared.f32 [%rd42+512], %f87;\n"
|
||||
" mov.f32 %f88, %f10;\n"
|
||||
" st.shared.f32 [%rd42+1024], %f88;\n"
|
||||
" mov.f32 %f89, %f12;\n"
|
||||
" st.shared.f32 [%rd42+1536], %f89;\n"
|
||||
" mov.f32 %f94, %f14;\n"
|
||||
" st.shared.f32 [%rd42+2048], %f94;\n"
|
||||
" mov.f32 %f95, %f15;\n"
|
||||
" st.shared.f32 [%rd42+2560], %f95;\n"
|
||||
" mov.s32 %r63, %r56;\n"
|
||||
" @!%p11 bra $Lt_0_27394;\n"
|
||||
"$Lt_0_27906:\n"
|
||||
" setp.ge.u32 %p15, %r13, %r63;\n"
|
||||
" @%p15 bra $Lt_0_28162;\n"
|
||||
" add.u32 %r64, %r2, %r63;\n"
|
||||
" cvt.u64.u32 %rd46, %r64;\n"
|
||||
" mul.wide.u32 %rd47, %r64, 4;\n"
|
||||
" add.u64 %rd48, %rd39, %rd47;\n"
|
||||
" ld.shared.f32 %f96, [%rd48+0];\n"
|
||||
" add.ftz.f32 %f86, %f96, %f86;\n"
|
||||
" st.shared.f32 [%rd42+0], %f86;\n"
|
||||
" ld.shared.f32 %f97, [%rd48+512];\n"
|
||||
" add.ftz.f32 %f87, %f97, %f87;\n"
|
||||
" st.shared.f32 [%rd42+512], %f87;\n"
|
||||
" ld.shared.f32 %f98, [%rd48+1024];\n"
|
||||
" add.ftz.f32 %f88, %f98, %f88;\n"
|
||||
" st.shared.f32 [%rd42+1024], %f88;\n"
|
||||
" ld.shared.f32 %f99, [%rd48+1536];\n"
|
||||
" add.ftz.f32 %f89, %f99, %f89;\n"
|
||||
" st.shared.f32 [%rd42+1536], %f89;\n"
|
||||
" ld.shared.f32 %f100, [%rd48+2048];\n"
|
||||
" add.ftz.f32 %f94, %f100, %f94;\n"
|
||||
" st.shared.f32 [%rd42+2048], %f94;\n"
|
||||
" ld.shared.f32 %f101, [%rd48+2560];\n"
|
||||
" add.ftz.f32 %f95, %f101, %f95;\n"
|
||||
" st.shared.f32 [%rd42+2560], %f95;\n"
|
||||
"$Lt_0_28162:\n"
|
||||
" shr.u32 %r63, %r63, 1;\n"
|
||||
" mov.u32 %r65, 0;\n"
|
||||
" setp.ne.u32 %p16, %r63, %r65;\n"
|
||||
" @%p16 bra $Lt_0_27906;\n"
|
||||
"$Lt_0_27394:\n"
|
||||
" mov.f32 %f6, %f86;\n"
|
||||
" mov.f32 %f8, %f87;\n"
|
||||
" mov.f32 %f10, %f88;\n"
|
||||
" mov.f32 %f12, %f89;\n"
|
||||
" mov.f32 %f14, %f94;\n"
|
||||
" mov.f32 %f16, %f95;\n"
|
||||
"$Lt_0_26882:\n"
|
||||
"$Lt_0_24834:\n"
|
||||
" mov.u32 %r66, 0;\n"
|
||||
" setp.ne.s32 %p17, %r13, %r66;\n"
|
||||
" @%p17 bra $Lt_0_28930;\n"
|
||||
" ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];\n"
|
||||
" add.u64 %rd50, %rd49, %rd5;\n"
|
||||
" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" mov.u32 %r68, 0;\n"
|
||||
" setp.le.s32 %p18, %r67, %r68;\n"
|
||||
" @%p18 bra $Lt_0_29442;\n"
|
||||
" st.global.f32 [%rd50+0], %f28;\n"
|
||||
" cvt.s64.s32 %rd51, %r9;\n"
|
||||
" mul.wide.s32 %rd52, %r9, 4;\n"
|
||||
" add.u64 %rd50, %rd50, %rd52;\n"
|
||||
"$Lt_0_29442:\n"
|
||||
" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r70, 0;\n"
|
||||
" setp.le.s32 %p19, %r69, %r70;\n"
|
||||
" @%p19 bra $Lt_0_29954;\n"
|
||||
" mov.f32 %f102, %f6;\n"
|
||||
" st.global.f32 [%rd50+0], %f102;\n"
|
||||
" cvt.s64.s32 %rd53, %r9;\n"
|
||||
" mul.wide.s32 %rd54, %r9, 4;\n"
|
||||
" add.u64 %rd55, %rd54, %rd50;\n"
|
||||
" mov.f32 %f103, %f8;\n"
|
||||
" st.global.f32 [%rd55+0], %f103;\n"
|
||||
" add.u64 %rd56, %rd54, %rd55;\n"
|
||||
" mov.f32 %f104, %f10;\n"
|
||||
" st.global.f32 [%rd56+0], %f104;\n"
|
||||
" add.u64 %rd57, %rd54, %rd56;\n"
|
||||
" mov.f32 %f105, %f12;\n"
|
||||
" st.global.f32 [%rd57+0], %f105;\n"
|
||||
" add.u64 %rd50, %rd54, %rd57;\n"
|
||||
" mov.f32 %f106, %f14;\n"
|
||||
" st.global.f32 [%rd50+0], %f106;\n"
|
||||
" mov.f32 %f107, %f16;\n"
|
||||
" add.u64 %rd58, %rd54, %rd50;\n"
|
||||
" st.global.f32 [%rd58+0], %f107;\n"
|
||||
"$Lt_0_29954:\n"
|
||||
" ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];\n"
|
||||
" mul.lo.u64 %rd60, %rd4, 16;\n"
|
||||
" add.u64 %rd61, %rd59, %rd60;\n"
|
||||
" mov.f32 %f108, %f109;\n"
|
||||
" st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f108};\n"
|
||||
"$Lt_0_28930:\n"
|
||||
"$Lt_0_20226:\n"
|
||||
" .loc 16 110 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair:\n"
|
||||
" }\n"
|
||||
" .entry kernel_pair_fast (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<74>;\n"
|
||||
" .reg .u64 %rd<75>;\n"
|
||||
" .reg .f32 %f<118>;\n"
|
||||
" .reg .pred %p<24>;\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32625_33_non_const_sp_lj3268[16];\n"
|
||||
" .shared .align 16 .b8 __cuda___cuda_local_var_32623_34_non_const_lj13296[1936];\n"
|
||||
" .shared .align 16 .b8 __cuda___cuda_local_var_32624_34_non_const_lj35232[1936];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32702_55_non_const_red_acc7168[3072];\n"
|
||||
" .loc 16 118 0\n"
|
||||
"$LDWbegin_kernel_pair_fast:\n"
|
||||
" cvt.s32.u32 %r1, %tid.x;\n"
|
||||
" mov.u32 %r2, 3;\n"
|
||||
" setp.gt.s32 %p1, %r1, %r2;\n"
|
||||
" @%p1 bra $Lt_1_22530;\n"
|
||||
" .loc 16 126 0\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32625_33_non_const_sp_lj3268;\n"
|
||||
" cvt.s64.s32 %rd2, %r1;\n"
|
||||
" mul.wide.s32 %rd3, %r1, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f32 %f1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd1;\n"
|
||||
" st.shared.f32 [%rd6+0], %f1;\n"
|
||||
"$Lt_1_22530:\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32625_33_non_const_sp_lj3268;\n"
|
||||
" mov.u32 %r3, 120;\n"
|
||||
" setp.gt.s32 %p2, %r1, %r3;\n"
|
||||
" @%p2 bra $Lt_1_23042;\n"
|
||||
" .loc 16 128 0\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32623_34_non_const_lj13296;\n"
|
||||
" cvt.s64.s32 %rd8, %r1;\n"
|
||||
" mul.wide.s32 %rd9, %r1, 16;\n"
|
||||
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
|
||||
" add.u64 %rd11, %rd10, %rd9;\n"
|
||||
" add.u64 %rd12, %rd9, %rd7;\n"
|
||||
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
|
||||
" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n"
|
||||
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r5, 0;\n"
|
||||
" setp.le.s32 %p3, %r4, %r5;\n"
|
||||
" @%p3 bra $Lt_1_23554;\n"
|
||||
" .loc 16 130 0\n"
|
||||
" mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
|
||||
" add.u64 %rd15, %rd14, %rd9;\n"
|
||||
" add.u64 %rd16, %rd9, %rd13;\n"
|
||||
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
|
||||
" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n"
|
||||
"$Lt_1_23554:\n"
|
||||
" mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;\n"
|
||||
"$Lt_1_23042:\n"
|
||||
" mov.u64 %rd13, __cuda___cuda_local_var_32624_34_non_const_lj35232;\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32623_34_non_const_lj13296;\n"
|
||||
" .loc 16 138 0\n"
|
||||
" mov.f32 %f10, 0f00000000; \n"
|
||||
" mov.f32 %f11, %f10;\n"
|
||||
" mov.f32 %f12, 0f00000000; \n"
|
||||
" mov.f32 %f13, %f12;\n"
|
||||
" mov.f32 %f14, 0f00000000; \n"
|
||||
" mov.f32 %f15, %f14;\n"
|
||||
" mov.f32 %f16, 0f00000000; \n"
|
||||
" mov.f32 %f17, %f16;\n"
|
||||
" mov.f32 %f18, 0f00000000; \n"
|
||||
" mov.f32 %f19, %f18;\n"
|
||||
" mov.f32 %f20, 0f00000000; \n"
|
||||
" mov.f32 %f21, %f20;\n"
|
||||
" .loc 16 140 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
|
||||
" div.s32 %r7, %r1, %r6;\n"
|
||||
" cvt.s32.u32 %r8, %ntid.x;\n"
|
||||
" div.s32 %r9, %r8, %r6;\n"
|
||||
" cvt.s32.u32 %r10, %ctaid.x;\n"
|
||||
" mul.lo.s32 %r11, %r10, %r9;\n"
|
||||
" add.s32 %r12, %r7, %r11;\n"
|
||||
" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n"
|
||||
" setp.ge.s32 %p4, %r12, %r13;\n"
|
||||
" @%p4 bra $Lt_1_32770;\n"
|
||||
" .loc 16 145 0\n"
|
||||
" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
|
||||
" cvt.s64.s32 %rd17, %r14;\n"
|
||||
" mul.wide.s32 %rd18, %r14, 4;\n"
|
||||
" cvt.s64.s32 %rd19, %r12;\n"
|
||||
" mul.wide.s32 %rd20, %r12, 4;\n"
|
||||
" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
|
||||
" add.u64 %rd22, %rd20, %rd21;\n"
|
||||
" add.u64 %rd23, %rd18, %rd22;\n"
|
||||
" ld.global.s32 %r15, [%rd23+0];\n"
|
||||
" sub.s32 %r16, %r6, 1;\n"
|
||||
" and.b32 %r17, %r16, %r1;\n"
|
||||
" cvt.s64.s32 %rd24, %r17;\n"
|
||||
" mul.wide.s32 %rd25, %r17, 4;\n"
|
||||
" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n"
|
||||
" setp.ne.u64 %p5, %rd26, %rd21;\n"
|
||||
" @%p5 bra $Lt_1_24834;\n"
|
||||
" cvt.s32.s64 %r18, %rd17;\n"
|
||||
" mul.lo.s32 %r19, %r18, %r6;\n"
|
||||
" mov.s32 %r20, %r19;\n"
|
||||
" mul.lo.s32 %r21, %r16, %r12;\n"
|
||||
" add.s32 %r22, %r18, %r21;\n"
|
||||
" cvt.s64.s32 %rd27, %r22;\n"
|
||||
" mul.wide.s32 %rd28, %r22, 4;\n"
|
||||
" add.u64 %rd29, %rd23, %rd28;\n"
|
||||
" and.b32 %r23, %r16, %r15;\n"
|
||||
" cvt.s64.s32 %rd30, %r23;\n"
|
||||
" div.s32 %r24, %r15, %r6;\n"
|
||||
" mul.lo.s32 %r25, %r19, %r24;\n"
|
||||
" cvt.s64.s32 %rd31, %r25;\n"
|
||||
" add.u64 %rd32, %rd30, %rd31;\n"
|
||||
" mul.lo.u64 %rd33, %rd32, 4;\n"
|
||||
" add.u64 %rd34, %rd29, %rd33;\n"
|
||||
" add.u64 %rd35, %rd25, %rd29;\n"
|
||||
" bra.uni $Lt_1_24578;\n"
|
||||
"$Lt_1_24834:\n"
|
||||
" add.u64 %rd36, %rd18, %rd23;\n"
|
||||
" ld.global.s32 %r26, [%rd36+0];\n"
|
||||
" cvt.s64.s32 %rd37, %r26;\n"
|
||||
" mul.wide.s32 %rd38, %r26, 4;\n"
|
||||
" add.u64 %rd39, %rd26, %rd38;\n"
|
||||
" cvt.s64.s32 %rd40, %r15;\n"
|
||||
" mul.wide.s32 %rd41, %r15, 4;\n"
|
||||
" add.u64 %rd34, %rd39, %rd41;\n"
|
||||
" mov.s32 %r20, %r6;\n"
|
||||
" add.u64 %rd35, %rd25, %rd39;\n"
|
||||
"$Lt_1_24578:\n"
|
||||
" .loc 16 148 0\n"
|
||||
" ld.global.s32 %r27, [%rd22+0];\n"
|
||||
" mov.u32 %r28, %r27;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.u32 %r30, %r29;\n"
|
||||
" mov.s32 %r31, 0;\n"
|
||||
" mov.u32 %r32, %r31;\n"
|
||||
" mov.s32 %r33, 0;\n"
|
||||
" mov.u32 %r34, %r33;\n"
|
||||
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];\n"
|
||||
" mov.f32 %f26, %f22;\n"
|
||||
" mov.f32 %f27, %f23;\n"
|
||||
" mov.f32 %f28, %f24;\n"
|
||||
" mov.f32 %f29, %f25;\n"
|
||||
" setp.ge.u64 %p6, %rd35, %rd34;\n"
|
||||
" @%p6 bra $Lt_1_34306;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r35, %f29;\n"
|
||||
" cvt.s64.s32 %rd42, %r20;\n"
|
||||
" mul.lo.s32 %r36, %r35, 11;\n"
|
||||
" cvt.rn.f32.s32 %f30, %r36;\n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
" mov.f32 %f34, 0f00000000; \n"
|
||||
"$Lt_1_25602:\n"
|
||||
" .loc 16 155 0\n"
|
||||
" ld.global.s32 %r37, [%rd35+0];\n"
|
||||
" .loc 16 156 0\n"
|
||||
" shr.s32 %r38, %r37, 30;\n"
|
||||
" and.b32 %r39, %r38, 3;\n"
|
||||
" cvt.s64.s32 %rd43, %r39;\n"
|
||||
" mul.wide.s32 %rd44, %r39, 4;\n"
|
||||
" add.u64 %rd45, %rd1, %rd44;\n"
|
||||
" ld.shared.f32 %f35, [%rd45+0];\n"
|
||||
" .loc 16 159 0\n"
|
||||
" and.b32 %r40, %r37, 1073741823;\n"
|
||||
" mov.u32 %r41, %r40;\n"
|
||||
" mov.s32 %r42, 0;\n"
|
||||
" mov.u32 %r43, %r42;\n"
|
||||
" mov.s32 %r44, 0;\n"
|
||||
" mov.u32 %r45, %r44;\n"
|
||||
" mov.s32 %r46, 0;\n"
|
||||
" mov.u32 %r47, %r46;\n"
|
||||
" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}];\n"
|
||||
" mov.f32 %f40, %f36;\n"
|
||||
" mov.f32 %f41, %f37;\n"
|
||||
" mov.f32 %f42, %f38;\n"
|
||||
" mov.f32 %f43, %f39;\n"
|
||||
" sub.ftz.f32 %f44, %f27, %f41;\n"
|
||||
" sub.ftz.f32 %f45, %f26, %f40;\n"
|
||||
" sub.ftz.f32 %f46, %f28, %f42;\n"
|
||||
" mul.ftz.f32 %f47, %f44, %f44;\n"
|
||||
" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n"
|
||||
" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n"
|
||||
" add.ftz.f32 %f50, %f30, %f43;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r48, %f50;\n"
|
||||
" cvt.s64.s32 %rd46, %r48;\n"
|
||||
" mul.wide.s32 %rd47, %r48, 16;\n"
|
||||
" add.u64 %rd48, %rd47, %rd7;\n"
|
||||
" ld.shared.f32 %f51, [%rd48+0];\n"
|
||||
" setp.gt.ftz.f32 %p7, %f51, %f49;\n"
|
||||
" @!%p7 bra $Lt_1_27906;\n"
|
||||
" rcp.approx.ftz.f32 %f52, %f49;\n"
|
||||
" ld.shared.f32 %f53, [%rd48+4];\n"
|
||||
" mov.f32 %f54, 0f40000000; \n"
|
||||
" setp.eq.ftz.f32 %p8, %f53, %f54;\n"
|
||||
" @!%p8 bra $Lt_1_26626;\n"
|
||||
" .loc 16 173 0\n"
|
||||
" mul.ftz.f32 %f55, %f52, %f52;\n"
|
||||
" mov.f32 %f56, %f55;\n"
|
||||
" .loc 16 174 0\n"
|
||||
" mul.ftz.f32 %f57, %f55, %f55;\n"
|
||||
" bra.uni $Lt_1_26882;\n"
|
||||
"$Lt_1_26626:\n"
|
||||
" mov.f32 %f58, 0f3f800000; \n"
|
||||
" setp.eq.ftz.f32 %p9, %f53, %f58;\n"
|
||||
" @!%p9 bra $Lt_1_27138;\n"
|
||||
" .loc 16 176 0\n"
|
||||
" sqrt.approx.ftz.f32 %f59, %f52;\n"
|
||||
" mul.ftz.f32 %f60, %f52, %f59;\n"
|
||||
" mov.f32 %f57, %f60;\n"
|
||||
" .loc 16 177 0\n"
|
||||
" mul.ftz.f32 %f56, %f60, %f60;\n"
|
||||
" bra.uni $Lt_1_26882;\n"
|
||||
"$Lt_1_27138:\n"
|
||||
" .loc 16 179 0\n"
|
||||
" mul.ftz.f32 %f61, %f52, %f52;\n"
|
||||
" mul.ftz.f32 %f62, %f52, %f61;\n"
|
||||
" mov.f32 %f56, %f62;\n"
|
||||
" .loc 16 180 0\n"
|
||||
" mov.f32 %f57, %f62;\n"
|
||||
"$Lt_1_26882:\n"
|
||||
"$Lt_1_26370:\n"
|
||||
" .loc 16 182 0\n"
|
||||
" mul.ftz.f32 %f63, %f52, %f35;\n"
|
||||
" mul.ftz.f32 %f64, %f56, %f63;\n"
|
||||
" ld.shared.v2.f32 {%f65,%f66}, [%rd48+8];\n"
|
||||
" mul.ftz.f32 %f67, %f65, %f57;\n"
|
||||
" sub.ftz.f32 %f68, %f67, %f66;\n"
|
||||
" mul.ftz.f32 %f69, %f64, %f68;\n"
|
||||
" .loc 16 184 0\n"
|
||||
" fma.rn.ftz.f32 %f33, %f45, %f69, %f33;\n"
|
||||
" .loc 16 185 0\n"
|
||||
" fma.rn.ftz.f32 %f32, %f44, %f69, %f32;\n"
|
||||
" .loc 16 186 0\n"
|
||||
" fma.rn.ftz.f32 %f31, %f46, %f69, %f31;\n"
|
||||
" ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r50, 0;\n"
|
||||
" setp.le.s32 %p10, %r49, %r50;\n"
|
||||
" @%p10 bra $Lt_1_27394;\n"
|
||||
" .loc 16 188 0\n"
|
||||
" add.u64 %rd49, %rd47, %rd13;\n"
|
||||
" ld.shared.v4.f32 {%f70,%f71,%f72,_}, [%rd49+0];\n"
|
||||
" mul.ftz.f32 %f73, %f35, %f56;\n"
|
||||
" mul.ftz.f32 %f74, %f70, %f57;\n"
|
||||
" sub.ftz.f32 %f75, %f74, %f71;\n"
|
||||
" mul.ftz.f32 %f76, %f73, %f75;\n"
|
||||
" sub.ftz.f32 %f77, %f76, %f72;\n"
|
||||
" add.ftz.f32 %f34, %f34, %f77;\n"
|
||||
"$Lt_1_27394:\n"
|
||||
" ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r52, 0;\n"
|
||||
" setp.le.s32 %p11, %r51, %r52;\n"
|
||||
" @%p11 bra $Lt_1_27906;\n"
|
||||
" .loc 16 191 0\n"
|
||||
" mov.f32 %f78, %f11;\n"
|
||||
" mul.ftz.f32 %f79, %f45, %f45;\n"
|
||||
" fma.rn.ftz.f32 %f80, %f69, %f79, %f78;\n"
|
||||
" mov.f32 %f11, %f80;\n"
|
||||
" .loc 16 192 0\n"
|
||||
" mov.f32 %f81, %f13;\n"
|
||||
" fma.rn.ftz.f32 %f82, %f69, %f47, %f81;\n"
|
||||
" mov.f32 %f13, %f82;\n"
|
||||
" .loc 16 193 0\n"
|
||||
" mov.f32 %f83, %f15;\n"
|
||||
" mul.ftz.f32 %f84, %f46, %f46;\n"
|
||||
" fma.rn.ftz.f32 %f85, %f69, %f84, %f83;\n"
|
||||
" mov.f32 %f15, %f85;\n"
|
||||
" .loc 16 194 0\n"
|
||||
" mov.f32 %f86, %f17;\n"
|
||||
" mul.ftz.f32 %f87, %f44, %f45;\n"
|
||||
" fma.rn.ftz.f32 %f88, %f69, %f87, %f86;\n"
|
||||
" mov.f32 %f17, %f88;\n"
|
||||
" .loc 16 195 0\n"
|
||||
" mov.f32 %f89, %f19;\n"
|
||||
" mul.ftz.f32 %f90, %f45, %f46;\n"
|
||||
" fma.rn.ftz.f32 %f91, %f69, %f90, %f89;\n"
|
||||
" mov.f32 %f19, %f91;\n"
|
||||
" .loc 16 196 0\n"
|
||||
" mul.ftz.f32 %f92, %f44, %f46;\n"
|
||||
" fma.rn.ftz.f32 %f20, %f69, %f92, %f20;\n"
|
||||
" mov.f32 %f21, %f20;\n"
|
||||
"$Lt_1_27906:\n"
|
||||
"$Lt_1_25858:\n"
|
||||
" .loc 16 153 0\n"
|
||||
" mul.lo.u64 %rd50, %rd42, 4;\n"
|
||||
" add.u64 %rd35, %rd35, %rd50;\n"
|
||||
" setp.lt.u64 %p12, %rd35, %rd34;\n"
|
||||
" @%p12 bra $Lt_1_25602;\n"
|
||||
" bra.uni $Lt_1_25090;\n"
|
||||
"$Lt_1_34306:\n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
" mov.f32 %f34, 0f00000000; \n"
|
||||
"$Lt_1_25090:\n"
|
||||
" mov.u32 %r53, 1;\n"
|
||||
" setp.le.s32 %p13, %r6, %r53;\n"
|
||||
" @%p13 bra $Lt_1_30722;\n"
|
||||
" .loc 16 201 0\n"
|
||||
" mov.u64 %rd51, __cuda___cuda_local_var_32702_55_non_const_red_acc7168;\n"
|
||||
" cvt.s64.s32 %rd52, %r1;\n"
|
||||
" mul.wide.s32 %rd53, %r1, 4;\n"
|
||||
" add.u64 %rd54, %rd51, %rd53;\n"
|
||||
" mov.f32 %f93, %f33;\n"
|
||||
" st.shared.f32 [%rd54+0], %f93;\n"
|
||||
" mov.f32 %f94, %f32;\n"
|
||||
" st.shared.f32 [%rd54+512], %f94;\n"
|
||||
" mov.f32 %f95, %f31;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f95;\n"
|
||||
" mov.f32 %f96, %f34;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f96;\n"
|
||||
" shr.s32 %r54, %r6, 31;\n"
|
||||
" mov.s32 %r55, 1;\n"
|
||||
" and.b32 %r56, %r54, %r55;\n"
|
||||
" add.s32 %r57, %r56, %r6;\n"
|
||||
" shr.s32 %r58, %r57, 1;\n"
|
||||
" mov.s32 %r59, %r58;\n"
|
||||
" mov.u32 %r60, 0;\n"
|
||||
" setp.ne.u32 %p14, %r58, %r60;\n"
|
||||
" @!%p14 bra $Lt_1_29186;\n"
|
||||
"$Lt_1_29698:\n"
|
||||
" setp.ge.u32 %p15, %r17, %r59;\n"
|
||||
" @%p15 bra $Lt_1_29954;\n"
|
||||
" add.u32 %r61, %r1, %r59;\n"
|
||||
" cvt.u64.u32 %rd55, %r61;\n"
|
||||
" mul.wide.u32 %rd56, %r61, 4;\n"
|
||||
" add.u64 %rd57, %rd51, %rd56;\n"
|
||||
" ld.shared.f32 %f97, [%rd57+0];\n"
|
||||
" add.ftz.f32 %f93, %f97, %f93;\n"
|
||||
" st.shared.f32 [%rd54+0], %f93;\n"
|
||||
" ld.shared.f32 %f98, [%rd57+512];\n"
|
||||
" add.ftz.f32 %f94, %f98, %f94;\n"
|
||||
" st.shared.f32 [%rd54+512], %f94;\n"
|
||||
" ld.shared.f32 %f99, [%rd57+1024];\n"
|
||||
" add.ftz.f32 %f95, %f99, %f95;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f95;\n"
|
||||
" ld.shared.f32 %f100, [%rd57+1536];\n"
|
||||
" add.ftz.f32 %f96, %f100, %f96;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f96;\n"
|
||||
"$Lt_1_29954:\n"
|
||||
" shr.u32 %r59, %r59, 1;\n"
|
||||
" mov.u32 %r62, 0;\n"
|
||||
" setp.ne.u32 %p16, %r59, %r62;\n"
|
||||
" @%p16 bra $Lt_1_29698;\n"
|
||||
"$Lt_1_29186:\n"
|
||||
" mov.f32 %f33, %f93;\n"
|
||||
" mov.f32 %f32, %f94;\n"
|
||||
" mov.f32 %f31, %f95;\n"
|
||||
" mov.f32 %f34, %f96;\n"
|
||||
" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r64, 0;\n"
|
||||
" setp.le.s32 %p17, %r63, %r64;\n"
|
||||
" @%p17 bra $Lt_1_30722;\n"
|
||||
" mov.f32 %f93, %f11;\n"
|
||||
" st.shared.f32 [%rd54+0], %f93;\n"
|
||||
" mov.f32 %f94, %f13;\n"
|
||||
" st.shared.f32 [%rd54+512], %f94;\n"
|
||||
" mov.f32 %f95, %f15;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f95;\n"
|
||||
" mov.f32 %f96, %f17;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f96;\n"
|
||||
" mov.f32 %f101, %f19;\n"
|
||||
" st.shared.f32 [%rd54+2048], %f101;\n"
|
||||
" mov.f32 %f102, %f20;\n"
|
||||
" st.shared.f32 [%rd54+2560], %f102;\n"
|
||||
" mov.s32 %r65, %r58;\n"
|
||||
" @!%p14 bra $Lt_1_31234;\n"
|
||||
"$Lt_1_31746:\n"
|
||||
" setp.ge.u32 %p18, %r17, %r65;\n"
|
||||
" @%p18 bra $Lt_1_32002;\n"
|
||||
" add.u32 %r66, %r1, %r65;\n"
|
||||
" cvt.u64.u32 %rd58, %r66;\n"
|
||||
" mul.wide.u32 %rd59, %r66, 4;\n"
|
||||
" add.u64 %rd60, %rd51, %rd59;\n"
|
||||
" ld.shared.f32 %f103, [%rd60+0];\n"
|
||||
" add.ftz.f32 %f93, %f103, %f93;\n"
|
||||
" st.shared.f32 [%rd54+0], %f93;\n"
|
||||
" ld.shared.f32 %f104, [%rd60+512];\n"
|
||||
" add.ftz.f32 %f94, %f104, %f94;\n"
|
||||
" st.shared.f32 [%rd54+512], %f94;\n"
|
||||
" ld.shared.f32 %f105, [%rd60+1024];\n"
|
||||
" add.ftz.f32 %f95, %f105, %f95;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f95;\n"
|
||||
" ld.shared.f32 %f106, [%rd60+1536];\n"
|
||||
" add.ftz.f32 %f96, %f106, %f96;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f96;\n"
|
||||
" ld.shared.f32 %f107, [%rd60+2048];\n"
|
||||
" add.ftz.f32 %f101, %f107, %f101;\n"
|
||||
" st.shared.f32 [%rd54+2048], %f101;\n"
|
||||
" ld.shared.f32 %f108, [%rd60+2560];\n"
|
||||
" add.ftz.f32 %f102, %f108, %f102;\n"
|
||||
" st.shared.f32 [%rd54+2560], %f102;\n"
|
||||
"$Lt_1_32002:\n"
|
||||
" shr.u32 %r65, %r65, 1;\n"
|
||||
" mov.u32 %r67, 0;\n"
|
||||
" setp.ne.u32 %p19, %r65, %r67;\n"
|
||||
" @%p19 bra $Lt_1_31746;\n"
|
||||
"$Lt_1_31234:\n"
|
||||
" mov.f32 %f11, %f93;\n"
|
||||
" mov.f32 %f13, %f94;\n"
|
||||
" mov.f32 %f15, %f95;\n"
|
||||
" mov.f32 %f17, %f96;\n"
|
||||
" mov.f32 %f19, %f101;\n"
|
||||
" mov.f32 %f21, %f102;\n"
|
||||
"$Lt_1_30722:\n"
|
||||
"$Lt_1_28674:\n"
|
||||
" mov.u32 %r68, 0;\n"
|
||||
" setp.ne.s32 %p20, %r17, %r68;\n"
|
||||
" @%p20 bra $Lt_1_32770;\n"
|
||||
" ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];\n"
|
||||
" add.u64 %rd62, %rd61, %rd20;\n"
|
||||
" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r70, 0;\n"
|
||||
" setp.le.s32 %p21, %r69, %r70;\n"
|
||||
" @%p21 bra $Lt_1_33282;\n"
|
||||
" st.global.f32 [%rd62+0], %f34;\n"
|
||||
" cvt.s64.s32 %rd63, %r13;\n"
|
||||
" mul.wide.s32 %rd64, %r13, 4;\n"
|
||||
" add.u64 %rd62, %rd62, %rd64;\n"
|
||||
"$Lt_1_33282:\n"
|
||||
" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r72, 0;\n"
|
||||
" setp.le.s32 %p22, %r71, %r72;\n"
|
||||
" @%p22 bra $Lt_1_33794;\n"
|
||||
" mov.f32 %f109, %f11;\n"
|
||||
" st.global.f32 [%rd62+0], %f109;\n"
|
||||
" cvt.s64.s32 %rd65, %r13;\n"
|
||||
" mul.wide.s32 %rd66, %r13, 4;\n"
|
||||
" add.u64 %rd67, %rd66, %rd62;\n"
|
||||
" mov.f32 %f110, %f13;\n"
|
||||
" st.global.f32 [%rd67+0], %f110;\n"
|
||||
" add.u64 %rd68, %rd66, %rd67;\n"
|
||||
" mov.f32 %f111, %f15;\n"
|
||||
" st.global.f32 [%rd68+0], %f111;\n"
|
||||
" add.u64 %rd69, %rd66, %rd68;\n"
|
||||
" mov.f32 %f112, %f17;\n"
|
||||
" st.global.f32 [%rd69+0], %f112;\n"
|
||||
" add.u64 %rd62, %rd66, %rd69;\n"
|
||||
" mov.f32 %f113, %f19;\n"
|
||||
" st.global.f32 [%rd62+0], %f113;\n"
|
||||
" mov.f32 %f114, %f21;\n"
|
||||
" add.u64 %rd70, %rd66, %rd62;\n"
|
||||
" st.global.f32 [%rd70+0], %f114;\n"
|
||||
"$Lt_1_33794:\n"
|
||||
" ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];\n"
|
||||
" mul.lo.u64 %rd72, %rd19, 16;\n"
|
||||
" add.u64 %rd73, %rd71, %rd72;\n"
|
||||
" mov.f32 %f115, %f116;\n"
|
||||
" st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f115};\n"
|
||||
"$Lt_1_32770:\n"
|
||||
"$Lt_1_24066:\n"
|
||||
" .loc 16 204 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair_fast:\n"
|
||||
" }\n"
|
||||
;
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,957 +0,0 @@
|
||||
const char * coul_long =
|
||||
" .version 2.3\n"
|
||||
" .target sm_20\n"
|
||||
" .address_size 64\n"
|
||||
" .global .texref pos_tex;\n"
|
||||
" .global .texref q_tex;\n"
|
||||
" .entry kernel_pair (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_sp_cl_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_engv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_q_,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_g_ewald,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<81>;\n"
|
||||
" .reg .u64 %rd<58>;\n"
|
||||
" .reg .f32 %f<132>;\n"
|
||||
" .reg .pred %p<19>;\n"
|
||||
" .shared .align 16 .b8 __cuda___cuda_local_var_32541_33_non_const_sp_cl112[16];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32611_37_non_const_red_acc128[3072];\n"
|
||||
" .loc 16 36 0\n"
|
||||
"$LDWbegin_kernel_pair:\n"
|
||||
" .loc 16 41 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_cl_in];\n"
|
||||
" ldu.global.f32 %f1, [%rd1+0];\n"
|
||||
" .loc 16 42 0\n"
|
||||
" ld.global.f32 %f2, [%rd1+4];\n"
|
||||
" .loc 16 43 0\n"
|
||||
" ld.global.f32 %f3, [%rd1+8];\n"
|
||||
" .loc 16 44 0\n"
|
||||
" ld.global.f32 %f4, [%rd1+12];\n"
|
||||
" st.shared.v4.f32 [__cuda___cuda_local_var_32541_33_non_const_sp_cl112+0], {%f1,%f2,%f3,%f4};\n"
|
||||
" .loc 16 51 0\n"
|
||||
" mov.f32 %f5, 0f00000000; \n"
|
||||
" mov.f32 %f6, %f5;\n"
|
||||
" mov.f32 %f7, 0f00000000; \n"
|
||||
" mov.f32 %f8, %f7;\n"
|
||||
" mov.f32 %f9, 0f00000000; \n"
|
||||
" mov.f32 %f10, %f9;\n"
|
||||
" mov.f32 %f11, 0f00000000; \n"
|
||||
" mov.f32 %f12, %f11;\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, %f13;\n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
|
||||
" cvt.s32.u32 %r2, %tid.x;\n"
|
||||
" div.s32 %r3, %r2, %r1;\n"
|
||||
" cvt.s32.u32 %r4, %ntid.x;\n"
|
||||
" div.s32 %r5, %r4, %r1;\n"
|
||||
" cvt.s32.u32 %r6, %ctaid.x;\n"
|
||||
" mul.lo.s32 %r7, %r6, %r5;\n"
|
||||
" add.s32 %r8, %r3, %r7;\n"
|
||||
" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n"
|
||||
" setp.ge.s32 %p1, %r8, %r9;\n"
|
||||
" @%p1 bra $Lt_0_25858;\n"
|
||||
" .loc 16 56 0\n"
|
||||
" cvt.s64.s32 %rd2, %r8;\n"
|
||||
" mul.wide.s32 %rd3, %r8, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
|
||||
" add.u64 %rd5, %rd3, %rd4;\n"
|
||||
" ld.global.s32 %r10, [%rd5+0];\n"
|
||||
" ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];\n"
|
||||
" cvt.s64.s32 %rd6, %r11;\n"
|
||||
" mul.wide.s32 %rd7, %r11, 4;\n"
|
||||
" add.u64 %rd8, %rd7, %rd5;\n"
|
||||
" ld.global.s32 %r12, [%rd8+0];\n"
|
||||
" sub.s32 %r13, %r1, 1;\n"
|
||||
" and.b32 %r14, %r13, %r2;\n"
|
||||
" cvt.s64.s32 %rd9, %r14;\n"
|
||||
" mul.wide.s32 %rd10, %r14, 4;\n"
|
||||
" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n"
|
||||
" setp.ne.u64 %p2, %rd11, %rd4;\n"
|
||||
" @%p2 bra $Lt_0_19458;\n"
|
||||
" cvt.s32.s64 %r15, %rd6;\n"
|
||||
" mul.lo.s32 %r16, %r15, %r1;\n"
|
||||
" mov.s32 %r17, %r16;\n"
|
||||
" mul.lo.s32 %r18, %r13, %r8;\n"
|
||||
" add.s32 %r19, %r15, %r18;\n"
|
||||
" cvt.s64.s32 %rd12, %r19;\n"
|
||||
" mul.wide.s32 %rd13, %r19, 4;\n"
|
||||
" add.u64 %rd14, %rd8, %rd13;\n"
|
||||
" and.b32 %r20, %r13, %r12;\n"
|
||||
" cvt.s64.s32 %rd15, %r20;\n"
|
||||
" div.s32 %r21, %r12, %r1;\n"
|
||||
" mul.lo.s32 %r22, %r16, %r21;\n"
|
||||
" cvt.s64.s32 %rd16, %r22;\n"
|
||||
" add.u64 %rd17, %rd15, %rd16;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" add.u64 %rd19, %rd14, %rd18;\n"
|
||||
" add.u64 %rd20, %rd10, %rd14;\n"
|
||||
" bra.uni $Lt_0_19202;\n"
|
||||
"$Lt_0_19458:\n"
|
||||
" add.u64 %rd21, %rd7, %rd8;\n"
|
||||
" ld.global.s32 %r23, [%rd21+0];\n"
|
||||
" cvt.s64.s32 %rd22, %r23;\n"
|
||||
" mul.wide.s32 %rd23, %r23, 4;\n"
|
||||
" add.u64 %rd24, %rd11, %rd23;\n"
|
||||
" cvt.s64.s32 %rd25, %r12;\n"
|
||||
" mul.wide.s32 %rd26, %r12, 4;\n"
|
||||
" add.u64 %rd19, %rd24, %rd26;\n"
|
||||
" mov.s32 %r17, %r1;\n"
|
||||
" add.u64 %rd20, %rd10, %rd24;\n"
|
||||
"$Lt_0_19202:\n"
|
||||
" .loc 16 59 0\n"
|
||||
" mov.u32 %r24, %r10;\n"
|
||||
" mov.s32 %r25, 0;\n"
|
||||
" mov.u32 %r26, %r25;\n"
|
||||
" mov.s32 %r27, 0;\n"
|
||||
" mov.u32 %r28, %r27;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.u32 %r30, %r29;\n"
|
||||
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n"
|
||||
" mov.f32 %f21, %f17;\n"
|
||||
" mov.f32 %f22, %f18;\n"
|
||||
" mov.f32 %f23, %f19;\n"
|
||||
" .loc 16 60 0\n"
|
||||
" mov.u32 %r31, %r10;\n"
|
||||
" mov.s32 %r32, 0;\n"
|
||||
" mov.u32 %r33, %r32;\n"
|
||||
" mov.s32 %r34, 0;\n"
|
||||
" mov.u32 %r35, %r34;\n"
|
||||
" mov.s32 %r36, 0;\n"
|
||||
" mov.u32 %r37, %r36;\n"
|
||||
" tex.1d.v4.f32.s32 {%f24,%f25,%f26,%f27},[q_tex,{%r31,%r33,%r35,%r37}];\n"
|
||||
" mov.f32 %f28, %f24;\n"
|
||||
" setp.ge.u64 %p3, %rd20, %rd19;\n"
|
||||
" @%p3 bra $Lt_0_27394;\n"
|
||||
" cvt.s64.s32 %rd27, %r17;\n"
|
||||
" ld.param.f32 %f29, [__cudaparm_kernel_pair_cut_coulsq];\n"
|
||||
" mov.f32 %f30, 0f00000000; \n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
" mov.u64 %rd28, __cuda___cuda_local_var_32541_33_non_const_sp_cl112;\n"
|
||||
"$Lt_0_20226:\n"
|
||||
" .loc 16 63 0\n"
|
||||
" ld.global.s32 %r38, [%rd20+0];\n"
|
||||
" .loc 16 66 0\n"
|
||||
" mov.f32 %f34, 0f3f800000; \n"
|
||||
" shr.s32 %r39, %r38, 30;\n"
|
||||
" and.b32 %r40, %r39, 3;\n"
|
||||
" cvt.s64.s32 %rd29, %r40;\n"
|
||||
" mul.wide.s32 %rd30, %r40, 4;\n"
|
||||
" add.u64 %rd31, %rd28, %rd30;\n"
|
||||
" ld.shared.f32 %f35, [%rd31+0];\n"
|
||||
" sub.ftz.f32 %f36, %f34, %f35;\n"
|
||||
" .loc 16 69 0\n"
|
||||
" and.b32 %r41, %r38, 1073741823;\n"
|
||||
" mov.u32 %r42, %r41;\n"
|
||||
" mov.s32 %r43, 0;\n"
|
||||
" mov.u32 %r44, %r43;\n"
|
||||
" mov.s32 %r45, 0;\n"
|
||||
" mov.u32 %r46, %r45;\n"
|
||||
" mov.s32 %r47, 0;\n"
|
||||
" mov.u32 %r48, %r47;\n"
|
||||
" tex.1d.v4.f32.s32 {%f37,%f38,%f39,%f40},[pos_tex,{%r42,%r44,%r46,%r48}];\n"
|
||||
" mov.f32 %f41, %f37;\n"
|
||||
" mov.f32 %f42, %f38;\n"
|
||||
" mov.f32 %f43, %f39;\n"
|
||||
" sub.ftz.f32 %f44, %f22, %f42;\n"
|
||||
" sub.ftz.f32 %f45, %f21, %f41;\n"
|
||||
" sub.ftz.f32 %f46, %f23, %f43;\n"
|
||||
" mul.ftz.f32 %f47, %f44, %f44;\n"
|
||||
" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n"
|
||||
" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n"
|
||||
" setp.lt.ftz.f32 %p4, %f49, %f29;\n"
|
||||
" @!%p4 bra $Lt_0_20994;\n"
|
||||
" .loc 20 518 0\n"
|
||||
" rcp.approx.ftz.f32 %f50, %f49;\n"
|
||||
" rsqrt.approx.ftz.f32 %f51, %f50;\n"
|
||||
" ld.param.f32 %f52, [__cudaparm_kernel_pair_g_ewald];\n"
|
||||
" mul.ftz.f32 %f53, %f52, %f51;\n"
|
||||
" mul.ftz.f32 %f54, %f53, %f53;\n"
|
||||
" neg.ftz.f32 %f55, %f54;\n"
|
||||
" mov.f32 %f56, 0f3fb8aa3b; \n"
|
||||
" mul.ftz.f32 %f57, %f55, %f56;\n"
|
||||
" ex2.approx.ftz.f32 %f58, %f57;\n"
|
||||
" .loc 16 85 0\n"
|
||||
" mov.f32 %f59, 0f3f800000; \n"
|
||||
" mov.f32 %f60, 0f3ea7ba05; \n"
|
||||
" fma.rn.ftz.f32 %f61, %f60, %f53, %f59;\n"
|
||||
" rcp.approx.ftz.f32 %f62, %f61;\n"
|
||||
" mov.f32 %f63, 0f3e827906; \n"
|
||||
" mov.f32 %f64, 0fbe91a98e; \n"
|
||||
" mov.f32 %f65, 0f3fb5f0e3; \n"
|
||||
" mov.f32 %f66, 0fbfba00e3; \n"
|
||||
" mov.f32 %f67, 0f3f87dc22; \n"
|
||||
" fma.rn.ftz.f32 %f68, %f67, %f62, %f66;\n"
|
||||
" fma.rn.ftz.f32 %f69, %f62, %f68, %f65;\n"
|
||||
" fma.rn.ftz.f32 %f70, %f62, %f69, %f64;\n"
|
||||
" fma.rn.ftz.f32 %f71, %f62, %f70, %f63;\n"
|
||||
" mul.ftz.f32 %f72, %f62, %f71;\n"
|
||||
" mul.ftz.f32 %f73, %f58, %f72;\n"
|
||||
" .loc 16 86 0\n"
|
||||
" mov.u32 %r49, %r41;\n"
|
||||
" mov.s32 %r50, 0;\n"
|
||||
" mov.u32 %r51, %r50;\n"
|
||||
" mov.s32 %r52, 0;\n"
|
||||
" mov.u32 %r53, %r52;\n"
|
||||
" mov.s32 %r54, 0;\n"
|
||||
" mov.u32 %r55, %r54;\n"
|
||||
" tex.1d.v4.f32.s32 {%f74,%f75,%f76,%f77},[q_tex,{%r49,%r51,%r53,%r55}];\n"
|
||||
" mov.f32 %f78, %f74;\n"
|
||||
" .loc 16 87 0\n"
|
||||
" ld.param.f32 %f79, [__cudaparm_kernel_pair_qqrd2e];\n"
|
||||
" mul.ftz.f32 %f80, %f79, %f28;\n"
|
||||
" mul.ftz.f32 %f81, %f80, %f78;\n"
|
||||
" div.approx.ftz.f32 %f82, %f81, %f51;\n"
|
||||
" mov.f32 %f83, 0f3f906ebb; \n"
|
||||
" mul.ftz.f32 %f84, %f53, %f83;\n"
|
||||
" fma.rn.ftz.f32 %f85, %f58, %f84, %f73;\n"
|
||||
" sub.ftz.f32 %f86, %f85, %f36;\n"
|
||||
" mul.ftz.f32 %f87, %f82, %f86;\n"
|
||||
" mul.ftz.f32 %f88, %f50, %f87;\n"
|
||||
" .loc 16 89 0\n"
|
||||
" fma.rn.ftz.f32 %f32, %f45, %f88, %f32;\n"
|
||||
" .loc 16 90 0\n"
|
||||
" fma.rn.ftz.f32 %f31, %f44, %f88, %f31;\n"
|
||||
" .loc 16 91 0\n"
|
||||
" fma.rn.ftz.f32 %f30, %f46, %f88, %f30;\n"
|
||||
" .loc 16 78 0\n"
|
||||
" sub.ftz.f32 %f89, %f73, %f36;\n"
|
||||
" fma.rn.ftz.f32 %f90, %f82, %f89, %f33;\n"
|
||||
" ld.param.s32 %r56, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" mov.s32 %r57, 0;\n"
|
||||
" setp.gt.s32 %p5, %r56, %r57;\n"
|
||||
" selp.f32 %f33, %f90, %f33, %p5;\n"
|
||||
" ld.param.s32 %r58, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r59, 0;\n"
|
||||
" setp.le.s32 %p6, %r58, %r59;\n"
|
||||
" @%p6 bra $Lt_0_20994;\n"
|
||||
" .loc 16 97 0\n"
|
||||
" mov.f32 %f91, %f6;\n"
|
||||
" mul.ftz.f32 %f92, %f45, %f45;\n"
|
||||
" fma.rn.ftz.f32 %f93, %f88, %f92, %f91;\n"
|
||||
" mov.f32 %f6, %f93;\n"
|
||||
" .loc 16 98 0\n"
|
||||
" mov.f32 %f94, %f8;\n"
|
||||
" fma.rn.ftz.f32 %f95, %f88, %f47, %f94;\n"
|
||||
" mov.f32 %f8, %f95;\n"
|
||||
" .loc 16 99 0\n"
|
||||
" mov.f32 %f96, %f10;\n"
|
||||
" mul.ftz.f32 %f97, %f46, %f46;\n"
|
||||
" fma.rn.ftz.f32 %f98, %f88, %f97, %f96;\n"
|
||||
" mov.f32 %f10, %f98;\n"
|
||||
" .loc 16 100 0\n"
|
||||
" mov.f32 %f99, %f12;\n"
|
||||
" mul.ftz.f32 %f100, %f44, %f45;\n"
|
||||
" fma.rn.ftz.f32 %f101, %f88, %f100, %f99;\n"
|
||||
" mov.f32 %f12, %f101;\n"
|
||||
" .loc 16 101 0\n"
|
||||
" mov.f32 %f102, %f14;\n"
|
||||
" mul.ftz.f32 %f103, %f45, %f46;\n"
|
||||
" fma.rn.ftz.f32 %f104, %f88, %f103, %f102;\n"
|
||||
" mov.f32 %f14, %f104;\n"
|
||||
" .loc 16 102 0\n"
|
||||
" mul.ftz.f32 %f105, %f44, %f46;\n"
|
||||
" fma.rn.ftz.f32 %f15, %f88, %f105, %f15;\n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
"$Lt_0_20994:\n"
|
||||
"$Lt_0_20482:\n"
|
||||
" .loc 16 62 0\n"
|
||||
" mul.lo.u64 %rd32, %rd27, 4;\n"
|
||||
" add.u64 %rd20, %rd20, %rd32;\n"
|
||||
" setp.lt.u64 %p7, %rd20, %rd19;\n"
|
||||
" @%p7 bra $Lt_0_20226;\n"
|
||||
" bra.uni $Lt_0_19714;\n"
|
||||
"$Lt_0_27394:\n"
|
||||
" mov.f32 %f30, 0f00000000; \n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
"$Lt_0_19714:\n"
|
||||
" mov.u32 %r60, 1;\n"
|
||||
" setp.le.s32 %p8, %r1, %r60;\n"
|
||||
" @%p8 bra $Lt_0_23810;\n"
|
||||
" .loc 16 112 0\n"
|
||||
" mov.u64 %rd33, __cuda___cuda_local_var_32611_37_non_const_red_acc128;\n"
|
||||
" cvt.s64.s32 %rd34, %r2;\n"
|
||||
" mul.wide.s32 %rd35, %r2, 4;\n"
|
||||
" add.u64 %rd36, %rd33, %rd35;\n"
|
||||
" mov.f32 %f106, %f32;\n"
|
||||
" st.shared.f32 [%rd36+0], %f106;\n"
|
||||
" .loc 16 113 0\n"
|
||||
" mov.f32 %f107, %f31;\n"
|
||||
" st.shared.f32 [%rd36+512], %f107;\n"
|
||||
" .loc 16 114 0\n"
|
||||
" mov.f32 %f108, %f30;\n"
|
||||
" st.shared.f32 [%rd36+1024], %f108;\n"
|
||||
" .loc 16 115 0\n"
|
||||
" mov.f32 %f109, %f33;\n"
|
||||
" st.shared.f32 [%rd36+1536], %f109;\n"
|
||||
" .loc 16 117 0\n"
|
||||
" shr.s32 %r61, %r1, 31;\n"
|
||||
" mov.s32 %r62, 1;\n"
|
||||
" and.b32 %r63, %r61, %r62;\n"
|
||||
" add.s32 %r64, %r63, %r1;\n"
|
||||
" shr.s32 %r65, %r64, 1;\n"
|
||||
" mov.s32 %r66, %r65;\n"
|
||||
" mov.u32 %r67, 0;\n"
|
||||
" setp.ne.u32 %p9, %r65, %r67;\n"
|
||||
" @!%p9 bra $Lt_0_22274;\n"
|
||||
"$Lt_0_22786:\n"
|
||||
" setp.ge.u32 %p10, %r14, %r66;\n"
|
||||
" @%p10 bra $Lt_0_23042;\n"
|
||||
" .loc 16 120 0\n"
|
||||
" add.u32 %r68, %r2, %r66;\n"
|
||||
" cvt.u64.u32 %rd37, %r68;\n"
|
||||
" mul.wide.u32 %rd38, %r68, 4;\n"
|
||||
" add.u64 %rd39, %rd33, %rd38;\n"
|
||||
" ld.shared.f32 %f110, [%rd39+0];\n"
|
||||
" add.ftz.f32 %f106, %f110, %f106;\n"
|
||||
" st.shared.f32 [%rd36+0], %f106;\n"
|
||||
" ld.shared.f32 %f111, [%rd39+512];\n"
|
||||
" add.ftz.f32 %f107, %f111, %f107;\n"
|
||||
" st.shared.f32 [%rd36+512], %f107;\n"
|
||||
" ld.shared.f32 %f112, [%rd39+1024];\n"
|
||||
" add.ftz.f32 %f108, %f112, %f108;\n"
|
||||
" st.shared.f32 [%rd36+1024], %f108;\n"
|
||||
" ld.shared.f32 %f113, [%rd39+1536];\n"
|
||||
" add.ftz.f32 %f109, %f113, %f109;\n"
|
||||
" st.shared.f32 [%rd36+1536], %f109;\n"
|
||||
"$Lt_0_23042:\n"
|
||||
" .loc 16 117 0\n"
|
||||
" shr.u32 %r66, %r66, 1;\n"
|
||||
" mov.u32 %r69, 0;\n"
|
||||
" setp.ne.u32 %p11, %r66, %r69;\n"
|
||||
" @%p11 bra $Lt_0_22786;\n"
|
||||
"$Lt_0_22274:\n"
|
||||
" .loc 16 124 0\n"
|
||||
" mov.f32 %f32, %f106;\n"
|
||||
" .loc 16 125 0\n"
|
||||
" mov.f32 %f31, %f107;\n"
|
||||
" .loc 16 126 0\n"
|
||||
" mov.f32 %f30, %f108;\n"
|
||||
" .loc 16 127 0\n"
|
||||
" mov.f32 %f33, %f109;\n"
|
||||
" ld.param.s32 %r70, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r71, 0;\n"
|
||||
" setp.le.s32 %p12, %r70, %r71;\n"
|
||||
" @%p12 bra $Lt_0_23810;\n"
|
||||
" .loc 16 131 0\n"
|
||||
" mov.f32 %f106, %f6;\n"
|
||||
" st.shared.f32 [%rd36+0], %f106;\n"
|
||||
" mov.f32 %f107, %f8;\n"
|
||||
" st.shared.f32 [%rd36+512], %f107;\n"
|
||||
" mov.f32 %f108, %f10;\n"
|
||||
" st.shared.f32 [%rd36+1024], %f108;\n"
|
||||
" mov.f32 %f109, %f12;\n"
|
||||
" st.shared.f32 [%rd36+1536], %f109;\n"
|
||||
" mov.f32 %f114, %f14;\n"
|
||||
" st.shared.f32 [%rd36+2048], %f114;\n"
|
||||
" mov.f32 %f115, %f15;\n"
|
||||
" st.shared.f32 [%rd36+2560], %f115;\n"
|
||||
" .loc 16 133 0\n"
|
||||
" mov.s32 %r72, %r65;\n"
|
||||
" @!%p9 bra $Lt_0_24322;\n"
|
||||
"$Lt_0_24834:\n"
|
||||
" setp.ge.u32 %p13, %r14, %r72;\n"
|
||||
" @%p13 bra $Lt_0_25090;\n"
|
||||
" .loc 16 136 0\n"
|
||||
" add.u32 %r73, %r2, %r72;\n"
|
||||
" cvt.u64.u32 %rd40, %r73;\n"
|
||||
" mul.wide.u32 %rd41, %r73, 4;\n"
|
||||
" add.u64 %rd42, %rd33, %rd41;\n"
|
||||
" ld.shared.f32 %f116, [%rd42+0];\n"
|
||||
" add.ftz.f32 %f106, %f116, %f106;\n"
|
||||
" st.shared.f32 [%rd36+0], %f106;\n"
|
||||
" ld.shared.f32 %f117, [%rd42+512];\n"
|
||||
" add.ftz.f32 %f107, %f117, %f107;\n"
|
||||
" st.shared.f32 [%rd36+512], %f107;\n"
|
||||
" ld.shared.f32 %f118, [%rd42+1024];\n"
|
||||
" add.ftz.f32 %f108, %f118, %f108;\n"
|
||||
" st.shared.f32 [%rd36+1024], %f108;\n"
|
||||
" ld.shared.f32 %f119, [%rd42+1536];\n"
|
||||
" add.ftz.f32 %f109, %f119, %f109;\n"
|
||||
" st.shared.f32 [%rd36+1536], %f109;\n"
|
||||
" ld.shared.f32 %f120, [%rd42+2048];\n"
|
||||
" add.ftz.f32 %f114, %f120, %f114;\n"
|
||||
" st.shared.f32 [%rd36+2048], %f114;\n"
|
||||
" ld.shared.f32 %f121, [%rd42+2560];\n"
|
||||
" add.ftz.f32 %f115, %f121, %f115;\n"
|
||||
" st.shared.f32 [%rd36+2560], %f115;\n"
|
||||
"$Lt_0_25090:\n"
|
||||
" .loc 16 133 0\n"
|
||||
" shr.u32 %r72, %r72, 1;\n"
|
||||
" mov.u32 %r74, 0;\n"
|
||||
" setp.ne.u32 %p14, %r72, %r74;\n"
|
||||
" @%p14 bra $Lt_0_24834;\n"
|
||||
"$Lt_0_24322:\n"
|
||||
" .loc 16 141 0\n"
|
||||
" mov.f32 %f6, %f106;\n"
|
||||
" mov.f32 %f8, %f107;\n"
|
||||
" mov.f32 %f10, %f108;\n"
|
||||
" mov.f32 %f12, %f109;\n"
|
||||
" mov.f32 %f14, %f114;\n"
|
||||
" mov.f32 %f16, %f115;\n"
|
||||
"$Lt_0_23810:\n"
|
||||
"$Lt_0_21762:\n"
|
||||
" mov.u32 %r75, 0;\n"
|
||||
" setp.ne.s32 %p15, %r14, %r75;\n"
|
||||
" @%p15 bra $Lt_0_25858;\n"
|
||||
" .loc 16 147 0\n"
|
||||
" ld.param.u64 %rd43, [__cudaparm_kernel_pair_engv];\n"
|
||||
" add.u64 %rd44, %rd43, %rd3;\n"
|
||||
" ld.param.s32 %r76, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" mov.u32 %r77, 0;\n"
|
||||
" setp.le.s32 %p16, %r76, %r77;\n"
|
||||
" @%p16 bra $Lt_0_26370;\n"
|
||||
" .loc 16 149 0\n"
|
||||
" mov.f32 %f122, 0f00000000; \n"
|
||||
" st.global.f32 [%rd44+0], %f122;\n"
|
||||
" .loc 16 150 0\n"
|
||||
" cvt.s64.s32 %rd45, %r9;\n"
|
||||
" mul.wide.s32 %rd46, %r9, 4;\n"
|
||||
" add.u64 %rd47, %rd46, %rd44;\n"
|
||||
" .loc 16 151 0\n"
|
||||
" st.global.f32 [%rd47+0], %f33;\n"
|
||||
" .loc 16 152 0\n"
|
||||
" add.u64 %rd44, %rd46, %rd47;\n"
|
||||
"$Lt_0_26370:\n"
|
||||
" ld.param.s32 %r78, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r79, 0;\n"
|
||||
" setp.le.s32 %p17, %r78, %r79;\n"
|
||||
" @%p17 bra $Lt_0_26882;\n"
|
||||
" .loc 16 156 0\n"
|
||||
" mov.f32 %f123, %f6;\n"
|
||||
" st.global.f32 [%rd44+0], %f123;\n"
|
||||
" .loc 16 157 0\n"
|
||||
" cvt.s64.s32 %rd48, %r9;\n"
|
||||
" mul.wide.s32 %rd49, %r9, 4;\n"
|
||||
" add.u64 %rd50, %rd49, %rd44;\n"
|
||||
" .loc 16 156 0\n"
|
||||
" mov.f32 %f124, %f8;\n"
|
||||
" st.global.f32 [%rd50+0], %f124;\n"
|
||||
" .loc 16 157 0\n"
|
||||
" add.u64 %rd51, %rd49, %rd50;\n"
|
||||
" .loc 16 156 0\n"
|
||||
" mov.f32 %f125, %f10;\n"
|
||||
" st.global.f32 [%rd51+0], %f125;\n"
|
||||
" .loc 16 157 0\n"
|
||||
" add.u64 %rd52, %rd49, %rd51;\n"
|
||||
" .loc 16 156 0\n"
|
||||
" mov.f32 %f126, %f12;\n"
|
||||
" st.global.f32 [%rd52+0], %f126;\n"
|
||||
" .loc 16 157 0\n"
|
||||
" add.u64 %rd44, %rd49, %rd52;\n"
|
||||
" .loc 16 156 0\n"
|
||||
" mov.f32 %f127, %f14;\n"
|
||||
" st.global.f32 [%rd44+0], %f127;\n"
|
||||
" mov.f32 %f128, %f16;\n"
|
||||
" add.u64 %rd53, %rd49, %rd44;\n"
|
||||
" st.global.f32 [%rd53+0], %f128;\n"
|
||||
"$Lt_0_26882:\n"
|
||||
" .loc 16 160 0\n"
|
||||
" ld.param.u64 %rd54, [__cudaparm_kernel_pair_ans];\n"
|
||||
" mul.lo.u64 %rd55, %rd2, 16;\n"
|
||||
" add.u64 %rd56, %rd54, %rd55;\n"
|
||||
" mov.f32 %f129, %f130;\n"
|
||||
" st.global.v4.f32 [%rd56+0], {%f32,%f31,%f30,%f129};\n"
|
||||
"$Lt_0_25858:\n"
|
||||
"$Lt_0_18690:\n"
|
||||
" .loc 16 163 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair:\n"
|
||||
" }\n"
|
||||
" .entry kernel_pair_fast (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_sp_cl_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_engv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_q_,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_g_ewald,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<82>;\n"
|
||||
" .reg .u64 %rd<62>;\n"
|
||||
" .reg .f32 %f<129>;\n"
|
||||
" .reg .pred %p<20>;\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32678_33_non_const_sp_cl3304[16];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32748_37_non_const_red_acc3320[3072];\n"
|
||||
" .loc 16 173 0\n"
|
||||
"$LDWbegin_kernel_pair_fast:\n"
|
||||
" cvt.s32.u32 %r1, %tid.x;\n"
|
||||
" mov.u32 %r2, 3;\n"
|
||||
" setp.gt.s32 %p1, %r1, %r2;\n"
|
||||
" @%p1 bra $Lt_1_19458;\n"
|
||||
" .loc 16 179 0\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32678_33_non_const_sp_cl3304;\n"
|
||||
" cvt.s64.s32 %rd2, %r1;\n"
|
||||
" mul.wide.s32 %rd3, %r1, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_cl_in];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f32 %f1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd1;\n"
|
||||
" st.shared.f32 [%rd6+0], %f1;\n"
|
||||
"$Lt_1_19458:\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32678_33_non_const_sp_cl3304;\n"
|
||||
" .loc 16 186 0\n"
|
||||
" mov.f32 %f2, 0f00000000; \n"
|
||||
" mov.f32 %f3, %f2;\n"
|
||||
" mov.f32 %f4, 0f00000000; \n"
|
||||
" mov.f32 %f5, %f4;\n"
|
||||
" mov.f32 %f6, 0f00000000; \n"
|
||||
" mov.f32 %f7, %f6;\n"
|
||||
" mov.f32 %f8, 0f00000000; \n"
|
||||
" mov.f32 %f9, %f8;\n"
|
||||
" mov.f32 %f10, 0f00000000; \n"
|
||||
" mov.f32 %f11, %f10;\n"
|
||||
" mov.f32 %f12, 0f00000000; \n"
|
||||
" mov.f32 %f13, %f12;\n"
|
||||
" .loc 16 188 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" ld.param.s32 %r3, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
|
||||
" div.s32 %r4, %r1, %r3;\n"
|
||||
" cvt.s32.u32 %r5, %ntid.x;\n"
|
||||
" div.s32 %r6, %r5, %r3;\n"
|
||||
" cvt.s32.u32 %r7, %ctaid.x;\n"
|
||||
" mul.lo.s32 %r8, %r7, %r6;\n"
|
||||
" add.s32 %r9, %r4, %r8;\n"
|
||||
" ld.param.s32 %r10, [__cudaparm_kernel_pair_fast_inum];\n"
|
||||
" setp.ge.s32 %p2, %r9, %r10;\n"
|
||||
" @%p2 bra $Lt_1_27138;\n"
|
||||
" .loc 16 193 0\n"
|
||||
" cvt.s64.s32 %rd7, %r9;\n"
|
||||
" mul.wide.s32 %rd8, %r9, 4;\n"
|
||||
" ld.param.u64 %rd9, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
|
||||
" add.u64 %rd10, %rd8, %rd9;\n"
|
||||
" ld.global.s32 %r11, [%rd10+0];\n"
|
||||
" ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
|
||||
" cvt.s64.s32 %rd11, %r12;\n"
|
||||
" mul.wide.s32 %rd12, %r12, 4;\n"
|
||||
" add.u64 %rd13, %rd12, %rd10;\n"
|
||||
" ld.global.s32 %r13, [%rd13+0];\n"
|
||||
" sub.s32 %r14, %r3, 1;\n"
|
||||
" and.b32 %r15, %r14, %r1;\n"
|
||||
" cvt.s64.s32 %rd14, %r15;\n"
|
||||
" mul.wide.s32 %rd15, %r15, 4;\n"
|
||||
" ld.param.u64 %rd16, [__cudaparm_kernel_pair_fast_dev_packed];\n"
|
||||
" setp.ne.u64 %p3, %rd16, %rd9;\n"
|
||||
" @%p3 bra $Lt_1_20738;\n"
|
||||
" cvt.s32.s64 %r16, %rd11;\n"
|
||||
" mul.lo.s32 %r17, %r16, %r3;\n"
|
||||
" mov.s32 %r18, %r17;\n"
|
||||
" mul.lo.s32 %r19, %r14, %r9;\n"
|
||||
" add.s32 %r20, %r16, %r19;\n"
|
||||
" cvt.s64.s32 %rd17, %r20;\n"
|
||||
" mul.wide.s32 %rd18, %r20, 4;\n"
|
||||
" add.u64 %rd19, %rd13, %rd18;\n"
|
||||
" and.b32 %r21, %r14, %r13;\n"
|
||||
" cvt.s64.s32 %rd20, %r21;\n"
|
||||
" div.s32 %r22, %r13, %r3;\n"
|
||||
" mul.lo.s32 %r23, %r17, %r22;\n"
|
||||
" cvt.s64.s32 %rd21, %r23;\n"
|
||||
" add.u64 %rd22, %rd20, %rd21;\n"
|
||||
" mul.lo.u64 %rd23, %rd22, 4;\n"
|
||||
" add.u64 %rd24, %rd19, %rd23;\n"
|
||||
" add.u64 %rd25, %rd15, %rd19;\n"
|
||||
" bra.uni $Lt_1_20482;\n"
|
||||
"$Lt_1_20738:\n"
|
||||
" add.u64 %rd26, %rd12, %rd13;\n"
|
||||
" ld.global.s32 %r24, [%rd26+0];\n"
|
||||
" cvt.s64.s32 %rd27, %r24;\n"
|
||||
" mul.wide.s32 %rd28, %r24, 4;\n"
|
||||
" add.u64 %rd29, %rd16, %rd28;\n"
|
||||
" cvt.s64.s32 %rd30, %r13;\n"
|
||||
" mul.wide.s32 %rd31, %r13, 4;\n"
|
||||
" add.u64 %rd24, %rd29, %rd31;\n"
|
||||
" mov.s32 %r18, %r3;\n"
|
||||
" add.u64 %rd25, %rd15, %rd29;\n"
|
||||
"$Lt_1_20482:\n"
|
||||
" .loc 16 196 0\n"
|
||||
" mov.u32 %r25, %r11;\n"
|
||||
" mov.s32 %r26, 0;\n"
|
||||
" mov.u32 %r27, %r26;\n"
|
||||
" mov.s32 %r28, 0;\n"
|
||||
" mov.u32 %r29, %r28;\n"
|
||||
" mov.s32 %r30, 0;\n"
|
||||
" mov.u32 %r31, %r30;\n"
|
||||
" tex.1d.v4.f32.s32 {%f14,%f15,%f16,%f17},[pos_tex,{%r25,%r27,%r29,%r31}];\n"
|
||||
" mov.f32 %f18, %f14;\n"
|
||||
" mov.f32 %f19, %f15;\n"
|
||||
" mov.f32 %f20, %f16;\n"
|
||||
" .loc 16 197 0\n"
|
||||
" mov.u32 %r32, %r11;\n"
|
||||
" mov.s32 %r33, 0;\n"
|
||||
" mov.u32 %r34, %r33;\n"
|
||||
" mov.s32 %r35, 0;\n"
|
||||
" mov.u32 %r36, %r35;\n"
|
||||
" mov.s32 %r37, 0;\n"
|
||||
" mov.u32 %r38, %r37;\n"
|
||||
" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[q_tex,{%r32,%r34,%r36,%r38}];\n"
|
||||
" mov.f32 %f25, %f21;\n"
|
||||
" setp.ge.u64 %p4, %rd25, %rd24;\n"
|
||||
" @%p4 bra $Lt_1_28674;\n"
|
||||
" cvt.s64.s32 %rd32, %r18;\n"
|
||||
" ld.param.f32 %f26, [__cudaparm_kernel_pair_fast_cut_coulsq];\n"
|
||||
" mov.f32 %f27, 0f00000000; \n"
|
||||
" mov.f32 %f28, 0f00000000; \n"
|
||||
" mov.f32 %f29, 0f00000000; \n"
|
||||
" mov.f32 %f30, 0f00000000; \n"
|
||||
"$Lt_1_21506:\n"
|
||||
" .loc 16 200 0\n"
|
||||
" ld.global.s32 %r39, [%rd25+0];\n"
|
||||
" .loc 16 203 0\n"
|
||||
" mov.f32 %f31, 0f3f800000; \n"
|
||||
" shr.s32 %r40, %r39, 30;\n"
|
||||
" and.b32 %r41, %r40, 3;\n"
|
||||
" cvt.s64.s32 %rd33, %r41;\n"
|
||||
" mul.wide.s32 %rd34, %r41, 4;\n"
|
||||
" add.u64 %rd35, %rd1, %rd34;\n"
|
||||
" ld.shared.f32 %f32, [%rd35+0];\n"
|
||||
" sub.ftz.f32 %f33, %f31, %f32;\n"
|
||||
" .loc 16 206 0\n"
|
||||
" and.b32 %r42, %r39, 1073741823;\n"
|
||||
" mov.u32 %r43, %r42;\n"
|
||||
" mov.s32 %r44, 0;\n"
|
||||
" mov.u32 %r45, %r44;\n"
|
||||
" mov.s32 %r46, 0;\n"
|
||||
" mov.u32 %r47, %r46;\n"
|
||||
" mov.s32 %r48, 0;\n"
|
||||
" mov.u32 %r49, %r48;\n"
|
||||
" tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r43,%r45,%r47,%r49}];\n"
|
||||
" mov.f32 %f38, %f34;\n"
|
||||
" mov.f32 %f39, %f35;\n"
|
||||
" mov.f32 %f40, %f36;\n"
|
||||
" sub.ftz.f32 %f41, %f19, %f39;\n"
|
||||
" sub.ftz.f32 %f42, %f18, %f38;\n"
|
||||
" sub.ftz.f32 %f43, %f20, %f40;\n"
|
||||
" mul.ftz.f32 %f44, %f41, %f41;\n"
|
||||
" fma.rn.ftz.f32 %f45, %f42, %f42, %f44;\n"
|
||||
" fma.rn.ftz.f32 %f46, %f43, %f43, %f45;\n"
|
||||
" setp.lt.ftz.f32 %p5, %f46, %f26;\n"
|
||||
" @!%p5 bra $Lt_1_22274;\n"
|
||||
" .loc 20 518 0\n"
|
||||
" rcp.approx.ftz.f32 %f47, %f46;\n"
|
||||
" rsqrt.approx.ftz.f32 %f48, %f47;\n"
|
||||
" ld.param.f32 %f49, [__cudaparm_kernel_pair_fast_g_ewald];\n"
|
||||
" mul.ftz.f32 %f50, %f49, %f48;\n"
|
||||
" mul.ftz.f32 %f51, %f50, %f50;\n"
|
||||
" neg.ftz.f32 %f52, %f51;\n"
|
||||
" mov.f32 %f53, 0f3fb8aa3b; \n"
|
||||
" mul.ftz.f32 %f54, %f52, %f53;\n"
|
||||
" ex2.approx.ftz.f32 %f55, %f54;\n"
|
||||
" .loc 16 222 0\n"
|
||||
" mov.f32 %f56, 0f3f800000; \n"
|
||||
" mov.f32 %f57, 0f3ea7ba05; \n"
|
||||
" fma.rn.ftz.f32 %f58, %f57, %f50, %f56;\n"
|
||||
" rcp.approx.ftz.f32 %f59, %f58;\n"
|
||||
" mov.f32 %f60, 0f3e827906; \n"
|
||||
" mov.f32 %f61, 0fbe91a98e; \n"
|
||||
" mov.f32 %f62, 0f3fb5f0e3; \n"
|
||||
" mov.f32 %f63, 0fbfba00e3; \n"
|
||||
" mov.f32 %f64, 0f3f87dc22; \n"
|
||||
" fma.rn.ftz.f32 %f65, %f64, %f59, %f63;\n"
|
||||
" fma.rn.ftz.f32 %f66, %f59, %f65, %f62;\n"
|
||||
" fma.rn.ftz.f32 %f67, %f59, %f66, %f61;\n"
|
||||
" fma.rn.ftz.f32 %f68, %f59, %f67, %f60;\n"
|
||||
" mul.ftz.f32 %f69, %f59, %f68;\n"
|
||||
" mul.ftz.f32 %f70, %f55, %f69;\n"
|
||||
" .loc 16 223 0\n"
|
||||
" mov.u32 %r50, %r42;\n"
|
||||
" mov.s32 %r51, 0;\n"
|
||||
" mov.u32 %r52, %r51;\n"
|
||||
" mov.s32 %r53, 0;\n"
|
||||
" mov.u32 %r54, %r53;\n"
|
||||
" mov.s32 %r55, 0;\n"
|
||||
" mov.u32 %r56, %r55;\n"
|
||||
" tex.1d.v4.f32.s32 {%f71,%f72,%f73,%f74},[q_tex,{%r50,%r52,%r54,%r56}];\n"
|
||||
" mov.f32 %f75, %f71;\n"
|
||||
" .loc 16 224 0\n"
|
||||
" ld.param.f32 %f76, [__cudaparm_kernel_pair_fast_qqrd2e];\n"
|
||||
" mul.ftz.f32 %f77, %f76, %f25;\n"
|
||||
" mul.ftz.f32 %f78, %f77, %f75;\n"
|
||||
" div.approx.ftz.f32 %f79, %f78, %f48;\n"
|
||||
" mov.f32 %f80, 0f3f906ebb; \n"
|
||||
" mul.ftz.f32 %f81, %f50, %f80;\n"
|
||||
" fma.rn.ftz.f32 %f82, %f55, %f81, %f70;\n"
|
||||
" sub.ftz.f32 %f83, %f82, %f33;\n"
|
||||
" mul.ftz.f32 %f84, %f79, %f83;\n"
|
||||
" mul.ftz.f32 %f85, %f47, %f84;\n"
|
||||
" .loc 16 226 0\n"
|
||||
" fma.rn.ftz.f32 %f29, %f42, %f85, %f29;\n"
|
||||
" .loc 16 227 0\n"
|
||||
" fma.rn.ftz.f32 %f28, %f41, %f85, %f28;\n"
|
||||
" .loc 16 228 0\n"
|
||||
" fma.rn.ftz.f32 %f27, %f43, %f85, %f27;\n"
|
||||
" .loc 16 215 0\n"
|
||||
" sub.ftz.f32 %f86, %f70, %f33;\n"
|
||||
" fma.rn.ftz.f32 %f87, %f79, %f86, %f30;\n"
|
||||
" ld.param.s32 %r57, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.s32 %r58, 0;\n"
|
||||
" setp.gt.s32 %p6, %r57, %r58;\n"
|
||||
" selp.f32 %f30, %f87, %f30, %p6;\n"
|
||||
" ld.param.s32 %r59, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r60, 0;\n"
|
||||
" setp.le.s32 %p7, %r59, %r60;\n"
|
||||
" @%p7 bra $Lt_1_22274;\n"
|
||||
" .loc 16 234 0\n"
|
||||
" mov.f32 %f88, %f3;\n"
|
||||
" mul.ftz.f32 %f89, %f42, %f42;\n"
|
||||
" fma.rn.ftz.f32 %f90, %f85, %f89, %f88;\n"
|
||||
" mov.f32 %f3, %f90;\n"
|
||||
" .loc 16 235 0\n"
|
||||
" mov.f32 %f91, %f5;\n"
|
||||
" fma.rn.ftz.f32 %f92, %f85, %f44, %f91;\n"
|
||||
" mov.f32 %f5, %f92;\n"
|
||||
" .loc 16 236 0\n"
|
||||
" mov.f32 %f93, %f7;\n"
|
||||
" mul.ftz.f32 %f94, %f43, %f43;\n"
|
||||
" fma.rn.ftz.f32 %f95, %f85, %f94, %f93;\n"
|
||||
" mov.f32 %f7, %f95;\n"
|
||||
" .loc 16 237 0\n"
|
||||
" mov.f32 %f96, %f9;\n"
|
||||
" mul.ftz.f32 %f97, %f41, %f42;\n"
|
||||
" fma.rn.ftz.f32 %f98, %f85, %f97, %f96;\n"
|
||||
" mov.f32 %f9, %f98;\n"
|
||||
" .loc 16 238 0\n"
|
||||
" mov.f32 %f99, %f11;\n"
|
||||
" mul.ftz.f32 %f100, %f42, %f43;\n"
|
||||
" fma.rn.ftz.f32 %f101, %f85, %f100, %f99;\n"
|
||||
" mov.f32 %f11, %f101;\n"
|
||||
" .loc 16 239 0\n"
|
||||
" mul.ftz.f32 %f102, %f41, %f43;\n"
|
||||
" fma.rn.ftz.f32 %f12, %f85, %f102, %f12;\n"
|
||||
" mov.f32 %f13, %f12;\n"
|
||||
"$Lt_1_22274:\n"
|
||||
"$Lt_1_21762:\n"
|
||||
" .loc 16 199 0\n"
|
||||
" mul.lo.u64 %rd36, %rd32, 4;\n"
|
||||
" add.u64 %rd25, %rd25, %rd36;\n"
|
||||
" setp.lt.u64 %p8, %rd25, %rd24;\n"
|
||||
" @%p8 bra $Lt_1_21506;\n"
|
||||
" bra.uni $Lt_1_20994;\n"
|
||||
"$Lt_1_28674:\n"
|
||||
" mov.f32 %f27, 0f00000000; \n"
|
||||
" mov.f32 %f28, 0f00000000; \n"
|
||||
" mov.f32 %f29, 0f00000000; \n"
|
||||
" mov.f32 %f30, 0f00000000; \n"
|
||||
"$Lt_1_20994:\n"
|
||||
" mov.u32 %r61, 1;\n"
|
||||
" setp.le.s32 %p9, %r3, %r61;\n"
|
||||
" @%p9 bra $Lt_1_25090;\n"
|
||||
" .loc 16 249 0\n"
|
||||
" mov.u64 %rd37, __cuda___cuda_local_var_32748_37_non_const_red_acc3320;\n"
|
||||
" cvt.s64.s32 %rd38, %r1;\n"
|
||||
" mul.wide.s32 %rd39, %r1, 4;\n"
|
||||
" add.u64 %rd40, %rd37, %rd39;\n"
|
||||
" mov.f32 %f103, %f29;\n"
|
||||
" st.shared.f32 [%rd40+0], %f103;\n"
|
||||
" .loc 16 250 0\n"
|
||||
" mov.f32 %f104, %f28;\n"
|
||||
" st.shared.f32 [%rd40+512], %f104;\n"
|
||||
" .loc 16 251 0\n"
|
||||
" mov.f32 %f105, %f27;\n"
|
||||
" st.shared.f32 [%rd40+1024], %f105;\n"
|
||||
" .loc 16 252 0\n"
|
||||
" mov.f32 %f106, %f30;\n"
|
||||
" st.shared.f32 [%rd40+1536], %f106;\n"
|
||||
" .loc 16 254 0\n"
|
||||
" shr.s32 %r62, %r3, 31;\n"
|
||||
" mov.s32 %r63, 1;\n"
|
||||
" and.b32 %r64, %r62, %r63;\n"
|
||||
" add.s32 %r65, %r64, %r3;\n"
|
||||
" shr.s32 %r66, %r65, 1;\n"
|
||||
" mov.s32 %r67, %r66;\n"
|
||||
" mov.u32 %r68, 0;\n"
|
||||
" setp.ne.u32 %p10, %r66, %r68;\n"
|
||||
" @!%p10 bra $Lt_1_23554;\n"
|
||||
"$Lt_1_24066:\n"
|
||||
" setp.ge.u32 %p11, %r15, %r67;\n"
|
||||
" @%p11 bra $Lt_1_24322;\n"
|
||||
" .loc 16 257 0\n"
|
||||
" add.u32 %r69, %r1, %r67;\n"
|
||||
" cvt.u64.u32 %rd41, %r69;\n"
|
||||
" mul.wide.u32 %rd42, %r69, 4;\n"
|
||||
" add.u64 %rd43, %rd37, %rd42;\n"
|
||||
" ld.shared.f32 %f107, [%rd43+0];\n"
|
||||
" add.ftz.f32 %f103, %f107, %f103;\n"
|
||||
" st.shared.f32 [%rd40+0], %f103;\n"
|
||||
" ld.shared.f32 %f108, [%rd43+512];\n"
|
||||
" add.ftz.f32 %f104, %f108, %f104;\n"
|
||||
" st.shared.f32 [%rd40+512], %f104;\n"
|
||||
" ld.shared.f32 %f109, [%rd43+1024];\n"
|
||||
" add.ftz.f32 %f105, %f109, %f105;\n"
|
||||
" st.shared.f32 [%rd40+1024], %f105;\n"
|
||||
" ld.shared.f32 %f110, [%rd43+1536];\n"
|
||||
" add.ftz.f32 %f106, %f110, %f106;\n"
|
||||
" st.shared.f32 [%rd40+1536], %f106;\n"
|
||||
"$Lt_1_24322:\n"
|
||||
" .loc 16 254 0\n"
|
||||
" shr.u32 %r67, %r67, 1;\n"
|
||||
" mov.u32 %r70, 0;\n"
|
||||
" setp.ne.u32 %p12, %r67, %r70;\n"
|
||||
" @%p12 bra $Lt_1_24066;\n"
|
||||
"$Lt_1_23554:\n"
|
||||
" .loc 16 261 0\n"
|
||||
" mov.f32 %f29, %f103;\n"
|
||||
" .loc 16 262 0\n"
|
||||
" mov.f32 %f28, %f104;\n"
|
||||
" .loc 16 263 0\n"
|
||||
" mov.f32 %f27, %f105;\n"
|
||||
" .loc 16 264 0\n"
|
||||
" mov.f32 %f30, %f106;\n"
|
||||
" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r72, 0;\n"
|
||||
" setp.le.s32 %p13, %r71, %r72;\n"
|
||||
" @%p13 bra $Lt_1_25090;\n"
|
||||
" .loc 16 268 0\n"
|
||||
" mov.f32 %f103, %f3;\n"
|
||||
" st.shared.f32 [%rd40+0], %f103;\n"
|
||||
" mov.f32 %f104, %f5;\n"
|
||||
" st.shared.f32 [%rd40+512], %f104;\n"
|
||||
" mov.f32 %f105, %f7;\n"
|
||||
" st.shared.f32 [%rd40+1024], %f105;\n"
|
||||
" mov.f32 %f106, %f9;\n"
|
||||
" st.shared.f32 [%rd40+1536], %f106;\n"
|
||||
" mov.f32 %f111, %f11;\n"
|
||||
" st.shared.f32 [%rd40+2048], %f111;\n"
|
||||
" mov.f32 %f112, %f12;\n"
|
||||
" st.shared.f32 [%rd40+2560], %f112;\n"
|
||||
" .loc 16 270 0\n"
|
||||
" mov.s32 %r73, %r66;\n"
|
||||
" @!%p10 bra $Lt_1_25602;\n"
|
||||
"$Lt_1_26114:\n"
|
||||
" setp.ge.u32 %p14, %r15, %r73;\n"
|
||||
" @%p14 bra $Lt_1_26370;\n"
|
||||
" .loc 16 273 0\n"
|
||||
" add.u32 %r74, %r1, %r73;\n"
|
||||
" cvt.u64.u32 %rd44, %r74;\n"
|
||||
" mul.wide.u32 %rd45, %r74, 4;\n"
|
||||
" add.u64 %rd46, %rd37, %rd45;\n"
|
||||
" ld.shared.f32 %f113, [%rd46+0];\n"
|
||||
" add.ftz.f32 %f103, %f113, %f103;\n"
|
||||
" st.shared.f32 [%rd40+0], %f103;\n"
|
||||
" ld.shared.f32 %f114, [%rd46+512];\n"
|
||||
" add.ftz.f32 %f104, %f114, %f104;\n"
|
||||
" st.shared.f32 [%rd40+512], %f104;\n"
|
||||
" ld.shared.f32 %f115, [%rd46+1024];\n"
|
||||
" add.ftz.f32 %f105, %f115, %f105;\n"
|
||||
" st.shared.f32 [%rd40+1024], %f105;\n"
|
||||
" ld.shared.f32 %f116, [%rd46+1536];\n"
|
||||
" add.ftz.f32 %f106, %f116, %f106;\n"
|
||||
" st.shared.f32 [%rd40+1536], %f106;\n"
|
||||
" ld.shared.f32 %f117, [%rd46+2048];\n"
|
||||
" add.ftz.f32 %f111, %f117, %f111;\n"
|
||||
" st.shared.f32 [%rd40+2048], %f111;\n"
|
||||
" ld.shared.f32 %f118, [%rd46+2560];\n"
|
||||
" add.ftz.f32 %f112, %f118, %f112;\n"
|
||||
" st.shared.f32 [%rd40+2560], %f112;\n"
|
||||
"$Lt_1_26370:\n"
|
||||
" .loc 16 270 0\n"
|
||||
" shr.u32 %r73, %r73, 1;\n"
|
||||
" mov.u32 %r75, 0;\n"
|
||||
" setp.ne.u32 %p15, %r73, %r75;\n"
|
||||
" @%p15 bra $Lt_1_26114;\n"
|
||||
"$Lt_1_25602:\n"
|
||||
" .loc 16 278 0\n"
|
||||
" mov.f32 %f3, %f103;\n"
|
||||
" mov.f32 %f5, %f104;\n"
|
||||
" mov.f32 %f7, %f105;\n"
|
||||
" mov.f32 %f9, %f106;\n"
|
||||
" mov.f32 %f11, %f111;\n"
|
||||
" mov.f32 %f13, %f112;\n"
|
||||
"$Lt_1_25090:\n"
|
||||
"$Lt_1_23042:\n"
|
||||
" mov.u32 %r76, 0;\n"
|
||||
" setp.ne.s32 %p16, %r15, %r76;\n"
|
||||
" @%p16 bra $Lt_1_27138;\n"
|
||||
" .loc 16 284 0\n"
|
||||
" ld.param.u64 %rd47, [__cudaparm_kernel_pair_fast_engv];\n"
|
||||
" add.u64 %rd48, %rd47, %rd8;\n"
|
||||
" ld.param.s32 %r77, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r78, 0;\n"
|
||||
" setp.le.s32 %p17, %r77, %r78;\n"
|
||||
" @%p17 bra $Lt_1_27650;\n"
|
||||
" .loc 16 286 0\n"
|
||||
" mov.f32 %f119, 0f00000000; \n"
|
||||
" st.global.f32 [%rd48+0], %f119;\n"
|
||||
" .loc 16 287 0\n"
|
||||
" cvt.s64.s32 %rd49, %r10;\n"
|
||||
" mul.wide.s32 %rd50, %r10, 4;\n"
|
||||
" add.u64 %rd51, %rd50, %rd48;\n"
|
||||
" .loc 16 288 0\n"
|
||||
" st.global.f32 [%rd51+0], %f30;\n"
|
||||
" .loc 16 289 0\n"
|
||||
" add.u64 %rd48, %rd50, %rd51;\n"
|
||||
"$Lt_1_27650:\n"
|
||||
" ld.param.s32 %r79, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r80, 0;\n"
|
||||
" setp.le.s32 %p18, %r79, %r80;\n"
|
||||
" @%p18 bra $Lt_1_28162;\n"
|
||||
" .loc 16 293 0\n"
|
||||
" mov.f32 %f120, %f3;\n"
|
||||
" st.global.f32 [%rd48+0], %f120;\n"
|
||||
" .loc 16 294 0\n"
|
||||
" cvt.s64.s32 %rd52, %r10;\n"
|
||||
" mul.wide.s32 %rd53, %r10, 4;\n"
|
||||
" add.u64 %rd54, %rd53, %rd48;\n"
|
||||
" .loc 16 293 0\n"
|
||||
" mov.f32 %f121, %f5;\n"
|
||||
" st.global.f32 [%rd54+0], %f121;\n"
|
||||
" .loc 16 294 0\n"
|
||||
" add.u64 %rd55, %rd53, %rd54;\n"
|
||||
" .loc 16 293 0\n"
|
||||
" mov.f32 %f122, %f7;\n"
|
||||
" st.global.f32 [%rd55+0], %f122;\n"
|
||||
" .loc 16 294 0\n"
|
||||
" add.u64 %rd56, %rd53, %rd55;\n"
|
||||
" .loc 16 293 0\n"
|
||||
" mov.f32 %f123, %f9;\n"
|
||||
" st.global.f32 [%rd56+0], %f123;\n"
|
||||
" .loc 16 294 0\n"
|
||||
" add.u64 %rd48, %rd53, %rd56;\n"
|
||||
" .loc 16 293 0\n"
|
||||
" mov.f32 %f124, %f11;\n"
|
||||
" st.global.f32 [%rd48+0], %f124;\n"
|
||||
" mov.f32 %f125, %f13;\n"
|
||||
" add.u64 %rd57, %rd53, %rd48;\n"
|
||||
" st.global.f32 [%rd57+0], %f125;\n"
|
||||
"$Lt_1_28162:\n"
|
||||
" .loc 16 297 0\n"
|
||||
" ld.param.u64 %rd58, [__cudaparm_kernel_pair_fast_ans];\n"
|
||||
" mul.lo.u64 %rd59, %rd7, 16;\n"
|
||||
" add.u64 %rd60, %rd58, %rd59;\n"
|
||||
" mov.f32 %f126, %f127;\n"
|
||||
" st.global.v4.f32 [%rd60+0], {%f29,%f28,%f27,%f126};\n"
|
||||
"$Lt_1_27138:\n"
|
||||
"$Lt_1_19970:\n"
|
||||
" .loc 16 300 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair_fast:\n"
|
||||
" }\n"
|
||||
;
|
||||
BIN
lib/gpu/cudpp.o
BIN
lib/gpu/cudpp.o
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,134 +0,0 @@
|
||||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_00009a81_00000000-9_lal_device.cpp3.i (/home/sjplimp/ccBI#.zwVkZj)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_00009a81_00000000-8_lal_device.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "lal_device.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
|
||||
// kernel_zero: zero-fills an array of 32-bit words in global memory.
// Parameters:
//   __cudaparm_kernel_zero_mem   - 64-bit address of the array (written with st.global)
//   __cudaparm_kernel_zero_numel - signed 32-bit element count
// Each thread computes index = ctaid.x * ntid.x + tid.x and, when index < numel,
// stores a 32-bit zero at mem + 4*index. Threads with index >= numel exit without storing.
.entry kernel_zero (
|
||||
.param .u64 __cudaparm_kernel_zero_mem,
|
||||
.param .s32 __cudaparm_kernel_zero_numel)
|
||||
{
|
||||
.reg .u32 %r<9>;
|
||||
.reg .u64 %rd<6>;
|
||||
.reg .pred %p<3>;
|
||||
.loc 16 20 0
|
||||
$LDWbegin_kernel_zero:
|
||||
// %r5 = global thread index. mul24 only multiplies the low 24 bits of each
// operand, so the block-index * block-size product is a 24-bit multiply.
cvt.s32.u32 %r1, %ctaid.x;
|
||||
cvt.s32.u32 %r2, %ntid.x;
|
||||
mul24.lo.s32 %r3, %r1, %r2;
|
||||
mov.u32 %r4, %tid.x;
|
||||
add.u32 %r5, %r3, %r4;
|
||||
// Bounds guard: skip the store when numel <= index (signed compare).
ld.param.s32 %r6, [__cudaparm_kernel_zero_numel];
|
||||
setp.le.s32 %p1, %r6, %r5;
|
||||
@%p1 bra $Lt_0_1026;
|
||||
.loc 16 24 0
|
||||
// Store 0 at mem + index*4 (index is sign-extended to 64 bits by mul.wide).
mov.s32 %r7, 0;
|
||||
ld.param.u64 %rd1, [__cudaparm_kernel_zero_mem];
|
||||
// %rd2 is not read again in this kernel.
cvt.s64.s32 %rd2, %r5;
|
||||
mul.wide.s32 %rd3, %r5, 4;
|
||||
add.u64 %rd4, %rd1, %rd3;
|
||||
st.global.s32 [%rd4+0], %r7;
|
||||
// Common exit for in-range and out-of-range threads.
$Lt_0_1026:
|
||||
.loc 16 25 0
|
||||
exit;
|
||||
$LDWend_kernel_zero:
|
||||
} // kernel_zero
|
||||
|
||||
// kernel_info: writes a fixed table of fourteen 32-bit integers to global memory.
// Parameter:
//   __cudaparm_kernel_info_info - 64-bit address of the destination; bytes 0..55 are written
// No thread or block index is read, so every thread that executes this kernel
// performs the same fourteen stores.
// Values by byte offset:
//   0:200  4:32  8:32  12:4  16:8  20:64  24:128
//   28:11  32:8  36:128  40:128  44:128  48:128  52:8
// NOTE(review): what each slot represents is not visible in this listing; the
// .loc directives attribute the stores to lines 28-41 of file index 16 - confirm there.
.entry kernel_info (
|
||||
.param .u64 __cudaparm_kernel_info_info)
|
||||
{
|
||||
.reg .u32 %r<16>;
|
||||
.reg .u64 %rd<3>;
|
||||
.loc 16 27 0
|
||||
$LDWbegin_kernel_info:
|
||||
.loc 16 28 0
|
||||
// %rd1 = destination base address, reused for every store below.
ld.param.u64 %rd1, [__cudaparm_kernel_info_info];
|
||||
// word 0 (offset 0) = 200
mov.s32 %r1, 200;
|
||||
st.global.s32 [%rd1+0], %r1;
|
||||
.loc 16 29 0
|
||||
// word 1 (offset 4) = 32
mov.s32 %r2, 32;
|
||||
st.global.s32 [%rd1+4], %r2;
|
||||
.loc 16 30 0
|
||||
// word 2 (offset 8) = 32
mov.s32 %r3, 32;
|
||||
st.global.s32 [%rd1+8], %r3;
|
||||
.loc 16 31 0
|
||||
// word 3 (offset 12) = 4
mov.s32 %r4, 4;
|
||||
st.global.s32 [%rd1+12], %r4;
|
||||
.loc 16 32 0
|
||||
// word 4 (offset 16) = 8
mov.s32 %r5, 8;
|
||||
st.global.s32 [%rd1+16], %r5;
|
||||
.loc 16 33 0
|
||||
// word 5 (offset 20) = 64
mov.s32 %r6, 64;
|
||||
st.global.s32 [%rd1+20], %r6;
|
||||
.loc 16 34 0
|
||||
// word 6 (offset 24) = 128
mov.s32 %r7, 128;
|
||||
st.global.s32 [%rd1+24], %r7;
|
||||
.loc 16 35 0
|
||||
// word 7 (offset 28) = 11
mov.s32 %r8, 11;
|
||||
st.global.s32 [%rd1+28], %r8;
|
||||
.loc 16 36 0
|
||||
// word 8 (offset 32) = 8
mov.s32 %r9, 8;
|
||||
st.global.s32 [%rd1+32], %r9;
|
||||
.loc 16 37 0
|
||||
// word 9 (offset 36) = 128
mov.s32 %r10, 128;
|
||||
st.global.s32 [%rd1+36], %r10;
|
||||
.loc 16 38 0
|
||||
// word 10 (offset 40) = 128
mov.s32 %r11, 128;
|
||||
st.global.s32 [%rd1+40], %r11;
|
||||
.loc 16 39 0
|
||||
// word 11 (offset 44) = 128
mov.s32 %r12, 128;
|
||||
st.global.s32 [%rd1+44], %r12;
|
||||
.loc 16 40 0
|
||||
// word 12 (offset 48) = 128
mov.s32 %r13, 128;
|
||||
st.global.s32 [%rd1+48], %r13;
|
||||
.loc 16 41 0
|
||||
// word 13 (offset 52) = 8
mov.s32 %r14, 8;
|
||||
st.global.s32 [%rd1+52], %r14;
|
||||
.loc 16 42 0
|
||||
exit;
|
||||
$LDWend_kernel_info:
|
||||
} // kernel_info
|
||||
|
||||
@ -1,88 +0,0 @@
|
||||
// PTX module text (version 2.3, target sm_20, 64-bit addresses) stored as one
// C string built from adjacent string literals, one PTX line per literal.
// It defines two entry points:
//   kernel_zero(mem, numel) - thread index = ctaid.x*ntid.x + tid.x; when
//                             index < numel, stores a 32-bit 0 at mem + 4*index.
//   kernel_info(info)       - stores fourteen constant 32-bit integers at byte
//                             offsets 0..52 of info.
// The text carries ".loc 16 ..." directives but no ".file" directives.
// NOTE(review): how this string is loaded (e.g. by a runtime PTX loader) is not
// visible here - confirm against the code that references `device`.
const char * device =
|
||||
" .version 2.3\n"
|
||||
" .target sm_20\n"
|
||||
" .address_size 64\n"
|
||||
// ---- kernel_zero: zero-fill numel 32-bit words starting at mem ----
" .entry kernel_zero (\n"
|
||||
" .param .u64 __cudaparm_kernel_zero_mem,\n"
|
||||
" .param .s32 __cudaparm_kernel_zero_numel)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<9>;\n"
|
||||
" .reg .u64 %rd<6>;\n"
|
||||
" .reg .pred %p<3>;\n"
|
||||
" .loc 16 20 0\n"
|
||||
"$LDWbegin_kernel_zero:\n"
|
||||
// %r5 = ctaid.x * ntid.x + tid.x (the product uses the 24-bit multiply mul24).
" cvt.s32.u32 %r1, %ctaid.x;\n"
|
||||
" cvt.s32.u32 %r2, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r3, %r1, %r2;\n"
|
||||
" mov.u32 %r4, %tid.x;\n"
|
||||
" add.u32 %r5, %r3, %r4;\n"
|
||||
// Skip the store when numel <= index (signed compare).
" ld.param.s32 %r6, [__cudaparm_kernel_zero_numel];\n"
|
||||
" setp.le.s32 %p1, %r6, %r5;\n"
|
||||
" @%p1 bra $Lt_0_1026;\n"
|
||||
" .loc 16 24 0\n"
|
||||
// Store 0 at mem + index*4; %rd2 is not read again in this kernel.
" mov.s32 %r7, 0;\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_zero_mem];\n"
|
||||
" cvt.s64.s32 %rd2, %r5;\n"
|
||||
" mul.wide.s32 %rd3, %r5, 4;\n"
|
||||
" add.u64 %rd4, %rd1, %rd3;\n"
|
||||
" st.global.s32 [%rd4+0], %r7;\n"
|
||||
"$Lt_0_1026:\n"
|
||||
" .loc 16 25 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_zero:\n"
|
||||
" }\n"
|
||||
// ---- kernel_info: write a fixed 14-word table to info ----
// No thread index is read, so every executing thread performs the same stores.
// NOTE(review): the meaning of each slot is not visible here - confirm against
// the source lines named by the .loc directives (file index 16, lines 28-41).
" .entry kernel_info (\n"
|
||||
" .param .u64 __cudaparm_kernel_info_info)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<16>;\n"
|
||||
" .reg .u64 %rd<3>;\n"
|
||||
" .loc 16 27 0\n"
|
||||
"$LDWbegin_kernel_info:\n"
|
||||
" .loc 16 28 0\n"
|
||||
// %rd1 = destination base address, reused for every store below.
" ld.param.u64 %rd1, [__cudaparm_kernel_info_info];\n"
|
||||
// offset 0 = 200
" mov.s32 %r1, 200;\n"
|
||||
" st.global.s32 [%rd1+0], %r1;\n"
|
||||
" .loc 16 29 0\n"
|
||||
// offset 4 = 32
" mov.s32 %r2, 32;\n"
|
||||
" st.global.s32 [%rd1+4], %r2;\n"
|
||||
" .loc 16 30 0\n"
|
||||
// offset 8 = 32
" mov.s32 %r3, 32;\n"
|
||||
" st.global.s32 [%rd1+8], %r3;\n"
|
||||
" .loc 16 31 0\n"
|
||||
// offset 12 = 4
" mov.s32 %r4, 4;\n"
|
||||
" st.global.s32 [%rd1+12], %r4;\n"
|
||||
" .loc 16 32 0\n"
|
||||
// offset 16 = 8
" mov.s32 %r5, 8;\n"
|
||||
" st.global.s32 [%rd1+16], %r5;\n"
|
||||
" .loc 16 33 0\n"
|
||||
// offset 20 = 64
" mov.s32 %r6, 64;\n"
|
||||
" st.global.s32 [%rd1+20], %r6;\n"
|
||||
" .loc 16 34 0\n"
|
||||
// offset 24 = 128
" mov.s32 %r7, 128;\n"
|
||||
" st.global.s32 [%rd1+24], %r7;\n"
|
||||
" .loc 16 35 0\n"
|
||||
// offset 28 = 11
" mov.s32 %r8, 11;\n"
|
||||
" st.global.s32 [%rd1+28], %r8;\n"
|
||||
" .loc 16 36 0\n"
|
||||
// offset 32 = 8
" mov.s32 %r9, 8;\n"
|
||||
" st.global.s32 [%rd1+32], %r9;\n"
|
||||
" .loc 16 37 0\n"
|
||||
// offset 36 = 128
" mov.s32 %r10, 128;\n"
|
||||
" st.global.s32 [%rd1+36], %r10;\n"
|
||||
" .loc 16 38 0\n"
|
||||
// offset 40 = 128
" mov.s32 %r11, 128;\n"
|
||||
" st.global.s32 [%rd1+40], %r11;\n"
|
||||
" .loc 16 39 0\n"
|
||||
// offset 44 = 128
" mov.s32 %r12, 128;\n"
|
||||
" st.global.s32 [%rd1+44], %r12;\n"
|
||||
" .loc 16 40 0\n"
|
||||
// offset 48 = 128
" mov.s32 %r13, 128;\n"
|
||||
" st.global.s32 [%rd1+48], %r13;\n"
|
||||
" .loc 16 41 0\n"
|
||||
// offset 52 = 8
" mov.s32 %r14, 8;\n"
|
||||
" st.global.s32 [%rd1+52], %r14;\n"
|
||||
" .loc 16 42 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_info:\n"
|
||||
" }\n"
|
||||
;
|
||||
@ -1,329 +0,0 @@
|
||||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_00009ad9_00000000-9_lal_ellipsoid_nbor.cpp3.i (/home/sjplimp/ccBI#.7CLzz0)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_00009ad9_00000000-8_lal_ellipsoid_nbor.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "lal_ellipsoid_nbor.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
|
||||
// kernel_nbor: each thread filters one strided list of 32-bit entries from
// dev_ij into a compacted list in dev_nbor and records how many entries it kept.
//
// Parameters (names as declared below):
//   x_         - base of 16-byte records, each read as four f32 values
//   cut_form   - base of 8-byte records: the f32 at +0 is compared with a squared
//                distance, the f32 at +4 is compared with form_low / form_high
//   ntypes     - multiplier applied to the first record's 4th component when
//                forming the cut_form index
//   dev_nbor   - output array of 32-bit words
//   nbor_pitch - stride, in 32-bit words, between successive rows of a list
//   start      - offset added to the global thread index
//   inum       - threads whose index ii is >= inum exit immediately
//   dev_ij     - input array of 32-bit words
//   form_low, form_high - integer bounds, converted to f32 before comparing
//
// Per thread, with ii = start + ctaid.x*ntid.x + tid.x and P = nbor_pitch:
//   i     = dev_ij[ii]
//   count = dev_ij[ii + P]
//   for k in 0..count-1:
//       j   = dev_ij[ii + (2+k)*P] & 0x3FFFFFFF
//       idx = trunc(x_[j].w) + ntypes * trunc(x_[i].w)
//       keep j when form_low <= cut_form[idx].f32@4 <= form_high
//                and cut_form[idx].f32@0 > |x_[j].xyz - x_[i].xyz|^2
//   kept j values go to dev_nbor[ii + 2*P], dev_nbor[ii + 3*P], ...
//   dev_nbor[ii + P] = number of kept entries
// NOTE(review): the names suggest neighbor lists, atom types and cutoffs, but
// those meanings are not established by this listing - confirm in the .cu source.
.entry kernel_nbor (
|
||||
.param .u64 __cudaparm_kernel_nbor_x_,
|
||||
.param .u64 __cudaparm_kernel_nbor_cut_form,
|
||||
.param .s32 __cudaparm_kernel_nbor_ntypes,
|
||||
.param .u64 __cudaparm_kernel_nbor_dev_nbor,
|
||||
.param .s32 __cudaparm_kernel_nbor_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_nbor_start,
|
||||
.param .s32 __cudaparm_kernel_nbor_inum,
|
||||
.param .u64 __cudaparm_kernel_nbor_dev_ij,
|
||||
.param .s32 __cudaparm_kernel_nbor_form_low,
|
||||
.param .s32 __cudaparm_kernel_nbor_form_high)
|
||||
{
|
||||
.reg .u32 %r<26>;
|
||||
.reg .u64 %rd<33>;
|
||||
.reg .f32 %f<20>;
|
||||
.reg .pred %p<8>;
|
||||
.loc 16 29 0
|
||||
$LDWbegin_kernel_nbor:
|
||||
// %r7 = ii = start + ctaid.x*ntid.x + tid.x (block product via 24-bit mul24).
cvt.s32.u32 %r1, %ctaid.x;
|
||||
cvt.s32.u32 %r2, %ntid.x;
|
||||
mul24.lo.s32 %r3, %r1, %r2;
|
||||
mov.u32 %r4, %tid.x;
|
||||
add.u32 %r5, %r3, %r4;
|
||||
ld.param.s32 %r6, [__cudaparm_kernel_nbor_start];
|
||||
add.u32 %r7, %r6, %r5;
|
||||
// Threads with inum <= ii do nothing.
ld.param.s32 %r8, [__cudaparm_kernel_nbor_inum];
|
||||
setp.le.s32 %p1, %r8, %r7;
|
||||
@%p1 bra $Lt_0_4354;
|
||||
.loc 16 36 0
|
||||
// %r9 = i = dev_ij[ii]; %rd4 keeps the address of that word.
cvt.s64.s32 %rd1, %r7;
|
||||
ld.param.u64 %rd2, [__cudaparm_kernel_nbor_dev_ij];
|
||||
mul.wide.s32 %rd3, %r7, 4;
|
||||
add.u64 %rd4, %rd2, %rd3;
|
||||
ld.global.s32 %r9, [%rd4+0];
|
||||
.loc 16 38 0
|
||||
// %rd6 = row stride in bytes (nbor_pitch * 4); %r11 = dev_ij[ii + P] (entry count).
ld.param.s32 %r10, [__cudaparm_kernel_nbor_nbor_pitch];
|
||||
cvt.s64.s32 %rd5, %r10;
|
||||
mul.wide.s32 %rd6, %r10, 4;
|
||||
add.u64 %rd7, %rd6, %rd4;
|
||||
ld.global.s32 %r11, [%rd7+0];
|
||||
.loc 16 39 0
|
||||
// %rd8 = address of the first list entry, dev_ij[ii + 2*P]; %rd9 is the read cursor.
add.u64 %rd8, %rd6, %rd7;
|
||||
mov.s64 %rd9, %rd8;
|
||||
.loc 16 41 0
|
||||
// %rd14 = write cursor = &dev_nbor[ii + 2*P].
ld.param.u64 %rd10, [__cudaparm_kernel_nbor_dev_nbor];
|
||||
add.u64 %rd11, %rd1, %rd5;
|
||||
add.u64 %rd12, %rd5, %rd11;
|
||||
mul.lo.u64 %rd13, %rd12, 4;
|
||||
add.u64 %rd14, %rd10, %rd13;
|
||||
.loc 16 43 0
|
||||
// Load the 16-byte record x_[i] into %f1..%f4.
ld.param.u64 %rd15, [__cudaparm_kernel_nbor_x_];
|
||||
cvt.s64.s32 %rd16, %r9;
|
||||
mul.wide.s32 %rd17, %r9, 16;
|
||||
add.u64 %rd18, %rd15, %rd17;
|
||||
ld.global.v4.f32 {%f1,%f2,%f3,%f4}, [%rd18+0];
|
||||
// %rd21 = end of list = first entry + (P * count) * 4 bytes.
// The P * count product is formed in 32 bits (mul.lo.s32) before widening.
cvt.s32.s64 %r12, %rd5;
|
||||
mul.lo.s32 %r13, %r12, %r11;
|
||||
cvt.s64.s32 %rd19, %r13;
|
||||
mul.wide.s32 %rd20, %r13, 4;
|
||||
add.u64 %rd21, %rd8, %rd20;
|
||||
// Empty list (start >= end, unsigned): jump to the path that sets the kept count to 0.
setp.ge.u64 %p2, %rd8, %rd21;
|
||||
@%p2 bra $Lt_0_6402;
|
||||
// Loop-invariant setup: %r14 = trunc(x_[i].w), %f5 = (float)form_low,
// %r17 = ntypes * %r14, %rd22 = cut_form base, %r18 = kept count = 0.
cvt.rzi.ftz.s32.f32 %r14, %f4;
|
||||
ld.param.s32 %r15, [__cudaparm_kernel_nbor_form_low];
|
||||
cvt.rn.f32.s32 %f5, %r15;
|
||||
ld.param.s32 %r16, [__cudaparm_kernel_nbor_ntypes];
|
||||
mul.lo.s32 %r17, %r16, %r14;
|
||||
ld.param.u64 %rd22, [__cudaparm_kernel_nbor_cut_form];
|
||||
mov.s32 %r18, 0;
|
||||
$Lt_0_5378:
|
||||
//<loop> Loop body line 43, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 49 0
|
||||
// Read the next entry and keep only its low 30 bits (1073741823 = 0x3FFFFFFF) as j.
ld.global.s32 %r19, [%rd9+0];
|
||||
and.b32 %r20, %r19, 1073741823;
|
||||
.loc 16 50 0
|
||||
// Load the 16-byte record x_[j] into %f6..%f9.
cvt.s64.s32 %rd23, %r20;
|
||||
mul.wide.s32 %rd24, %r20, 16;
|
||||
add.u64 %rd25, %rd15, %rd24;
|
||||
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd25+0];
|
||||
.loc 16 53 0
|
||||
// idx = trunc(x_[j].w) + ntypes*trunc(x_[i].w); %rd28 = &cut_form[idx] (8-byte records).
cvt.rzi.ftz.s32.f32 %r21, %f9;
|
||||
add.s32 %r22, %r21, %r17;
|
||||
cvt.s64.s32 %rd26, %r22;
|
||||
mul.wide.s32 %rd27, %r22, 8;
|
||||
add.u64 %rd28, %rd22, %rd27;
|
||||
// %f10 = second f32 of the cut_form record (byte offset +4).
ld.global.f32 %f10, [%rd28+4];
|
||||
.loc 16 48 0
|
||||
// Reject unless form_low <= %f10 ...
setp.le.ftz.f32 %p3, %f5, %f10;
|
||||
@!%p3 bra $Lt_0_6658;
|
||||
// ... and form_high >= %f10.
ld.param.s32 %r23, [__cudaparm_kernel_nbor_form_high];
|
||||
cvt.rn.f32.s32 %f11, %r23;
|
||||
setp.ge.ftz.f32 %p4, %f11, %f10;
|
||||
@!%p4 bra $Lt_0_6658;
|
||||
// Squared distance between the first three components of x_[j] and x_[i] -> %f18.
sub.ftz.f32 %f12, %f6, %f1;
|
||||
sub.ftz.f32 %f13, %f7, %f2;
|
||||
sub.ftz.f32 %f14, %f8, %f3;
|
||||
// %f15 = first f32 of the cut_form record (byte offset +0).
ld.global.f32 %f15, [%rd28+0];
|
||||
mul.ftz.f32 %f16, %f12, %f12;
|
||||
fma.rn.ftz.f32 %f17, %f13, %f13, %f16;
|
||||
fma.rn.ftz.f32 %f18, %f14, %f14, %f17;
|
||||
// Reject unless %f15 > squared distance.
setp.gt.ftz.f32 %p5, %f15, %f18;
|
||||
@!%p5 bra $Lt_0_6658;
|
||||
.loc 16 64 0
|
||||
// Keep: write the masked j, advance the write cursor by one row, bump the kept count.
st.global.s32 [%rd14+0], %r20;
|
||||
.loc 16 65 0
|
||||
add.u64 %rd14, %rd6, %rd14;
|
||||
.loc 16 66 0
|
||||
add.s32 %r18, %r18, 1;
|
||||
$Lt_0_6658:
|
||||
$L_0_3842:
|
||||
.loc 16 47 0
|
||||
// Advance the read cursor by one row; loop while it is still below the end address.
add.u64 %rd9, %rd6, %rd9;
|
||||
setp.gt.u64 %p6, %rd21, %rd9;
|
||||
@%p6 bra $Lt_0_5378;
|
||||
bra.uni $Lt_0_4866;
|
||||
// Empty-list path: kept count is 0.
$Lt_0_6402:
|
||||
mov.s32 %r18, 0;
|
||||
$Lt_0_4866:
|
||||
.loc 16 70 0
|
||||
// Store the kept count at dev_nbor[ii + P].
add.s32 %r24, %r12, %r7;
|
||||
cvt.s64.s32 %rd29, %r24;
|
||||
mul.wide.s32 %rd30, %r24, 4;
|
||||
add.u64 %rd31, %rd10, %rd30;
|
||||
st.global.s32 [%rd31+0], %r18;
|
||||
// Exit point shared with threads that failed the ii < inum guard.
$Lt_0_4354:
|
||||
.loc 16 72 0
|
||||
exit;
|
||||
$LDWend_kernel_nbor:
|
||||
} // kernel_nbor
|
||||
|
||||
// kernel_nbor_fast: per-thread neighbor-list filter (compiler-generated PTX).
//
// Phase 1: threads with tid.x <= 120 each copy one 8-byte cut_form entry from
//          global memory into two 121-word shared tables (first float kept
//          as-is, second float truncated to an integer).
// Phase 2: after a block barrier, the thread with global index
//          ii = start + ctaid.x*ntid.x + tid.x (only when ii < inum) walks a
//          strided list that begins at dev_ij[ii + 2*nbor_pitch]. Every entry
//          that passes the form_low/form_high window test and the
//          squared-distance test is written to dev_nbor starting at
//          dev_nbor[ii + 2*nbor_pitch]; the count of accepted entries is
//          written to dev_nbor[ii + nbor_pitch].
//
// Parameters (as used by the code below):
//   x_         base of 16-byte records read as four f32 values; the first
//              three are subtracted component-wise, the fourth is truncated
//              to an integer and used as a table index
//   cut_form   base of 8-byte records read as two f32 values
//   dev_nbor   output buffer (s32 stores)
//   nbor_pitch stride, in 4-byte elements, between successive list entries
//   start      offset added to the block/thread index
//   inum       exclusive upper bound on the thread's global index
//   dev_ij     input buffer (s32 loads)
//   form_low, form_high   inclusive integer window for the shared form table
.entry kernel_nbor_fast (
|
||||
.param .u64 __cudaparm_kernel_nbor_fast_x_,
|
||||
.param .u64 __cudaparm_kernel_nbor_fast_cut_form,
|
||||
.param .u64 __cudaparm_kernel_nbor_fast_dev_nbor,
|
||||
.param .s32 __cudaparm_kernel_nbor_fast_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_nbor_fast_start,
|
||||
.param .s32 __cudaparm_kernel_nbor_fast_inum,
|
||||
.param .u64 __cudaparm_kernel_nbor_fast_dev_ij,
|
||||
.param .s32 __cudaparm_kernel_nbor_fast_form_low,
|
||||
.param .s32 __cudaparm_kernel_nbor_fast_form_high)
|
||||
{
|
||||
.reg .u32 %r<28>;
|
||||
.reg .u64 %rd<42>;
|
||||
.reg .f32 %f<19>;
|
||||
.reg .pred %p<9>;
|
||||
// Two shared tables of 484 bytes each = 121 four-byte slots (11 x 11).
.shared .align 4 .b8 __cuda___cuda_local_var_32570_31_non_const_form120[484];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32571_33_non_const_cutsq604[484];
|
||||
.loc 16 84 0
|
||||
$LDWbegin_kernel_nbor_fast:
|
||||
// Only threads 0..120 fill the shared tables (one slot per thread).
cvt.s32.u32 %r1, %tid.x;
|
||||
mov.u32 %r2, 120;
|
||||
setp.gt.s32 %p1, %r1, %r2;
|
||||
@%p1 bra $Lt_1_5122;
|
||||
.loc 16 90 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120;
|
||||
mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604;
|
||||
cvt.s64.s32 %rd3, %r1;
|
||||
// %rd4 = tid*4: byte offset of this thread's slot in either shared table.
mul.wide.s32 %rd4, %r1, 4;
|
||||
ld.param.u64 %rd5, [__cudaparm_kernel_nbor_fast_cut_form];
|
||||
// %rd7 = cut_form + tid*8: this thread's two-float record.
mul.wide.s32 %rd6, %r1, 8;
|
||||
add.u64 %rd7, %rd5, %rd6;
|
||||
ld.global.v2.f32 {%f1,%f2}, [%rd7+0];
|
||||
// First float goes to the cutsq table unchanged.
add.u64 %rd8, %rd4, %rd2;
|
||||
st.shared.f32 [%rd8+0], %f1;
|
||||
.loc 16 91 0
|
||||
// Second float is truncated toward zero and stored in the form table.
cvt.rzi.ftz.s32.f32 %r3, %f2;
|
||||
add.u64 %rd9, %rd4, %rd1;
|
||||
st.shared.s32 [%rd9+0], %r3;
|
||||
$Lt_1_5122:
|
||||
// Table base addresses are reloaded here because threads that skipped the
// fill above never set %rd1/%rd2.
mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120;
|
||||
mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604;
|
||||
.loc 16 94 0
|
||||
// Barrier: all threads wait until the shared tables are written.
bar.sync 0;
|
||||
// %r9 = ii = start + ctaid.x*ntid.x + tid.x
cvt.s32.u32 %r4, %ctaid.x;
|
||||
cvt.s32.u32 %r5, %ntid.x;
|
||||
mul.lo.s32 %r6, %r4, %r5;
|
||||
ld.param.s32 %r7, [__cudaparm_kernel_nbor_fast_start];
|
||||
add.s32 %r8, %r7, %r6;
|
||||
add.s32 %r9, %r8, %r1;
|
||||
ld.param.s32 %r10, [__cudaparm_kernel_nbor_fast_inum];
|
||||
// Threads with ii >= inum have no work and exit.
setp.le.s32 %p2, %r10, %r9;
|
||||
@%p2 bra $Lt_1_5634;
|
||||
.loc 16 98 0
|
||||
cvt.s64.s32 %rd10, %r9;
|
||||
ld.param.u64 %rd11, [__cudaparm_kernel_nbor_fast_dev_ij];
|
||||
mul.wide.s32 %rd12, %r9, 4;
|
||||
add.u64 %rd13, %rd11, %rd12;
|
||||
// %r11 = dev_ij[ii]: index of this thread's own record in x_.
ld.global.s32 %r11, [%rd13+0];
|
||||
.loc 16 100 0
|
||||
ld.param.s32 %r12, [__cudaparm_kernel_nbor_fast_nbor_pitch];
|
||||
cvt.s64.s32 %rd14, %r12;
|
||||
// %rd15 = nbor_pitch*4: byte stride between successive list entries.
mul.wide.s32 %rd15, %r12, 4;
|
||||
add.u64 %rd16, %rd15, %rd13;
|
||||
// %r13 = dev_ij[ii + nbor_pitch]: number of list entries to examine.
ld.global.s32 %r13, [%rd16+0];
|
||||
.loc 16 101 0
|
||||
// %rd17 = &dev_ij[ii + 2*nbor_pitch]: first list entry; %rd18 is the read cursor.
add.u64 %rd17, %rd15, %rd16;
|
||||
mov.s64 %rd18, %rd17;
|
||||
.loc 16 103 0
|
||||
// %rd23 = &dev_nbor[ii + 2*nbor_pitch]: write cursor for accepted entries.
ld.param.u64 %rd19, [__cudaparm_kernel_nbor_fast_dev_nbor];
|
||||
add.u64 %rd20, %rd10, %rd14;
|
||||
add.u64 %rd21, %rd14, %rd20;
|
||||
mul.lo.u64 %rd22, %rd21, 4;
|
||||
add.u64 %rd23, %rd19, %rd22;
|
||||
.loc 16 105 0
|
||||
// Load this thread's own record: {%f3,%f4,%f5} are the three components
// subtracted below, %f6 is the value later truncated to a table row index.
ld.param.u64 %rd24, [__cudaparm_kernel_nbor_fast_x_];
|
||||
cvt.s64.s32 %rd25, %r11;
|
||||
mul.wide.s32 %rd26, %r11, 16;
|
||||
add.u64 %rd27, %rd24, %rd26;
|
||||
ld.global.v4.f32 {%f3,%f4,%f5,%f6}, [%rd27+0];
|
||||
// %rd30 = %rd17 + (nbor_pitch * count)*4: one-past-the-end read address.
cvt.s32.s64 %r14, %rd14;
|
||||
mul.lo.s32 %r15, %r14, %r13;
|
||||
cvt.s64.s32 %rd28, %r15;
|
||||
mul.wide.s32 %rd29, %r15, 4;
|
||||
add.u64 %rd30, %rd17, %rd29;
|
||||
// Empty range: skip the loop and report a count of zero.
setp.ge.u64 %p3, %rd17, %rd30;
|
||||
@%p3 bra $Lt_1_7682;
|
||||
// %r17 = trunc(%f6) * 11: row offset into the 11-wide shared tables.
cvt.rzi.ftz.s32.f32 %r16, %f6;
|
||||
mul.lo.s32 %r17, %r16, 11;
|
||||
ld.param.s32 %r18, [__cudaparm_kernel_nbor_fast_form_low];
|
||||
// %r19 = number of entries accepted so far.
mov.s32 %r19, 0;
|
||||
$Lt_1_6658:
|
||||
//<loop> Loop body line 105, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 112 0
|
||||
ld.global.s32 %r20, [%rd18+0];
|
||||
// Keep the low 30 bits (1073741823 = 2^30 - 1); the top two bits are dropped.
and.b32 %r21, %r20, 1073741823;
|
||||
.loc 16 113 0
|
||||
cvt.s64.s32 %rd31, %r21;
|
||||
mul.wide.s32 %rd32, %r21, 16;
|
||||
add.u64 %rd33, %rd24, %rd32;
|
||||
// Load the candidate's 16-byte record from x_.
ld.global.v4.f32 {%f7,%f8,%f9,%f10}, [%rd33+0];
|
||||
.loc 16 111 0
|
||||
// %r23 = row offset + trunc(%f10): slot index shared by both tables.
cvt.rzi.ftz.s32.f32 %r22, %f10;
|
||||
add.s32 %r23, %r22, %r17;
|
||||
cvt.s64.s32 %rd34, %r23;
|
||||
mul.wide.s32 %rd35, %r23, 4;
|
||||
add.u64 %rd36, %rd35, %rd1;
|
||||
ld.shared.s32 %r24, [%rd36+0];
|
||||
// Reject unless form_low <= form value <= form_high.
setp.lt.s32 %p4, %r24, %r18;
|
||||
@%p4 bra $Lt_1_7938;
|
||||
ld.param.s32 %r25, [__cudaparm_kernel_nbor_fast_form_high];
|
||||
setp.lt.s32 %p5, %r25, %r24;
|
||||
@%p5 bra $Lt_1_7938;
|
||||
// Component-wise difference between the candidate and this thread's record.
sub.ftz.f32 %f11, %f7, %f3;
|
||||
sub.ftz.f32 %f12, %f8, %f4;
|
||||
sub.ftz.f32 %f13, %f9, %f5;
|
||||
add.u64 %rd37, %rd35, %rd2;
|
||||
ld.shared.f32 %f14, [%rd37+0];
|
||||
// %f17 = dx*dx + dy*dy + dz*dz
mul.ftz.f32 %f15, %f11, %f11;
|
||||
fma.rn.ftz.f32 %f16, %f12, %f12, %f15;
|
||||
fma.rn.ftz.f32 %f17, %f13, %f13, %f16;
|
||||
// Accept only when the cutsq table value is strictly greater than %f17.
setp.gt.ftz.f32 %p6, %f14, %f17;
|
||||
@!%p6 bra $Lt_1_7938;
|
||||
.loc 16 127 0
|
||||
// Store the masked index (not the raw word) at the write cursor.
st.global.s32 [%rd23+0], %r21;
|
||||
.loc 16 128 0
|
||||
// Advance the write cursor by one pitch.
add.u64 %rd23, %rd15, %rd23;
|
||||
.loc 16 129 0
|
||||
add.s32 %r19, %r19, 1;
|
||||
$Lt_1_7938:
|
||||
$L_1_4610:
|
||||
.loc 16 110 0
|
||||
// Advance the read cursor by one pitch; loop while it is below the end address.
add.u64 %rd18, %rd15, %rd18;
|
||||
setp.gt.u64 %p7, %rd30, %rd18;
|
||||
@%p7 bra $Lt_1_6658;
|
||||
bra.uni $Lt_1_6146;
|
||||
$Lt_1_7682:
|
||||
// Reached only when the loop was skipped: accepted count is zero.
mov.s32 %r19, 0;
|
||||
$Lt_1_6146:
|
||||
.loc 16 133 0
|
||||
// dev_nbor[ii + nbor_pitch] = accepted count.
add.s32 %r26, %r14, %r9;
|
||||
cvt.s64.s32 %rd38, %r26;
|
||||
mul.wide.s32 %rd39, %r26, 4;
|
||||
add.u64 %rd40, %rd19, %rd39;
|
||||
st.global.s32 [%rd40+0], %r19;
|
||||
$Lt_1_5634:
|
||||
.loc 16 135 0
|
||||
exit;
|
||||
$LDWend_kernel_nbor_fast:
|
||||
} // kernel_nbor_fast
|
||||
|
||||
@ -1,281 +0,0 @@
|
||||
/*
 * ellipsoid_nbor: PTX source text embedded as a C string.
 *
 * The text declares PTX version 2.3, target sm_20, 64-bit addressing, and
 * two entry points: kernel_nbor and kernel_nbor_fast. Each PTX line is its
 * own string literal ending in "\n"; adjacent literals are concatenated by
 * the C compiler into a single string.
 * NOTE(review): presumably this string is handed to a PTX loader/JIT at run
 * time; the consumer is not visible here, so confirm before changing it.
 * The literal text is runtime data: do not edit it by hand.
 */
const char * ellipsoid_nbor =
|
||||
" .version 2.3\n"
|
||||
" .target sm_20\n"
|
||||
" .address_size 64\n"
|
||||
/* ---- entry point 1: kernel_nbor (table lookups read from global memory) ---- */
" .entry kernel_nbor (\n"
|
||||
" .param .u64 __cudaparm_kernel_nbor_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_nbor_cut_form,\n"
|
||||
" .param .s32 __cudaparm_kernel_nbor_ntypes,\n"
|
||||
" .param .u64 __cudaparm_kernel_nbor_dev_nbor,\n"
|
||||
" .param .s32 __cudaparm_kernel_nbor_nbor_pitch,\n"
|
||||
" .param .s32 __cudaparm_kernel_nbor_start,\n"
|
||||
" .param .s32 __cudaparm_kernel_nbor_inum,\n"
|
||||
" .param .u64 __cudaparm_kernel_nbor_dev_ij,\n"
|
||||
" .param .s32 __cudaparm_kernel_nbor_form_low,\n"
|
||||
" .param .s32 __cudaparm_kernel_nbor_form_high)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<26>;\n"
|
||||
" .reg .u64 %rd<33>;\n"
|
||||
" .reg .f32 %f<20>;\n"
|
||||
" .reg .pred %p<8>;\n"
|
||||
" .loc 16 29 0\n"
|
||||
"$LDWbegin_kernel_nbor:\n"
|
||||
/* %r7 = start + ctaid.x*ntid.x + tid.x; the thread exits when inum <= %r7. */
" cvt.s32.u32 %r1, %ctaid.x;\n"
|
||||
" cvt.s32.u32 %r2, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r3, %r1, %r2;\n"
|
||||
" mov.u32 %r4, %tid.x;\n"
|
||||
" add.u32 %r5, %r3, %r4;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_kernel_nbor_start];\n"
|
||||
" add.u32 %r7, %r6, %r5;\n"
|
||||
" ld.param.s32 %r8, [__cudaparm_kernel_nbor_inum];\n"
|
||||
" setp.le.s32 %p1, %r8, %r7;\n"
|
||||
" @%p1 bra $Lt_0_4354;\n"
|
||||
" .loc 16 36 0\n"
|
||||
" cvt.s64.s32 %rd1, %r7;\n"
|
||||
" ld.param.u64 %rd2, [__cudaparm_kernel_nbor_dev_ij];\n"
|
||||
" mul.wide.s32 %rd3, %r7, 4;\n"
|
||||
" add.u64 %rd4, %rd2, %rd3;\n"
|
||||
" ld.global.s32 %r9, [%rd4+0];\n"
|
||||
" .loc 16 38 0\n"
|
||||
" ld.param.s32 %r10, [__cudaparm_kernel_nbor_nbor_pitch];\n"
|
||||
" cvt.s64.s32 %rd5, %r10;\n"
|
||||
" mul.wide.s32 %rd6, %r10, 4;\n"
|
||||
" add.u64 %rd7, %rd6, %rd4;\n"
|
||||
" ld.global.s32 %r11, [%rd7+0];\n"
|
||||
" .loc 16 39 0\n"
|
||||
" add.u64 %rd8, %rd6, %rd7;\n"
|
||||
" mov.s64 %rd9, %rd8;\n"
|
||||
" .loc 16 41 0\n"
|
||||
" ld.param.u64 %rd10, [__cudaparm_kernel_nbor_dev_nbor];\n"
|
||||
" add.u64 %rd11, %rd1, %rd5;\n"
|
||||
" add.u64 %rd12, %rd5, %rd11;\n"
|
||||
" mul.lo.u64 %rd13, %rd12, 4;\n"
|
||||
" add.u64 %rd14, %rd10, %rd13;\n"
|
||||
" .loc 16 43 0\n"
|
||||
" ld.param.u64 %rd15, [__cudaparm_kernel_nbor_x_];\n"
|
||||
" cvt.s64.s32 %rd16, %r9;\n"
|
||||
" mul.wide.s32 %rd17, %r9, 16;\n"
|
||||
" add.u64 %rd18, %rd15, %rd17;\n"
|
||||
" ld.global.v4.f32 {%f1,%f2,%f3,%f4}, [%rd18+0];\n"
|
||||
" cvt.s32.s64 %r12, %rd5;\n"
|
||||
" mul.lo.s32 %r13, %r12, %r11;\n"
|
||||
" cvt.s64.s32 %rd19, %r13;\n"
|
||||
" mul.wide.s32 %rd20, %r13, 4;\n"
|
||||
" add.u64 %rd21, %rd8, %rd20;\n"
|
||||
" setp.ge.u64 %p2, %rd8, %rd21;\n"
|
||||
" @%p2 bra $Lt_0_6402;\n"
|
||||
/* %r17 = ntypes * trunc(%f4): row offset into the cut_form table. */
" cvt.rzi.ftz.s32.f32 %r14, %f4;\n"
|
||||
" ld.param.s32 %r15, [__cudaparm_kernel_nbor_form_low];\n"
|
||||
" cvt.rn.f32.s32 %f5, %r15;\n"
|
||||
" ld.param.s32 %r16, [__cudaparm_kernel_nbor_ntypes];\n"
|
||||
" mul.lo.s32 %r17, %r16, %r14;\n"
|
||||
" ld.param.u64 %rd22, [__cudaparm_kernel_nbor_cut_form];\n"
|
||||
" mov.s32 %r18, 0;\n"
|
||||
/* Loop head: one iteration per list entry, read cursor in %rd9. */
"$Lt_0_5378:\n"
|
||||
" .loc 16 49 0\n"
|
||||
" ld.global.s32 %r19, [%rd9+0];\n"
|
||||
" and.b32 %r20, %r19, 1073741823;\n"
|
||||
" .loc 16 50 0\n"
|
||||
" cvt.s64.s32 %rd23, %r20;\n"
|
||||
" mul.wide.s32 %rd24, %r20, 16;\n"
|
||||
" add.u64 %rd25, %rd15, %rd24;\n"
|
||||
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd25+0];\n"
|
||||
" .loc 16 53 0\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r21, %f9;\n"
|
||||
" add.s32 %r22, %r21, %r17;\n"
|
||||
" cvt.s64.s32 %rd26, %r22;\n"
|
||||
" mul.wide.s32 %rd27, %r22, 8;\n"
|
||||
" add.u64 %rd28, %rd22, %rd27;\n"
|
||||
/* The float at byte offset 4 of the 8-byte cut_form entry is compared, as a
   float, against form_low and form_high. */
" ld.global.f32 %f10, [%rd28+4];\n"
|
||||
" .loc 16 48 0\n"
|
||||
" setp.le.ftz.f32 %p3, %f5, %f10;\n"
|
||||
" @!%p3 bra $Lt_0_6658;\n"
|
||||
" ld.param.s32 %r23, [__cudaparm_kernel_nbor_form_high];\n"
|
||||
" cvt.rn.f32.s32 %f11, %r23;\n"
|
||||
" setp.ge.ftz.f32 %p4, %f11, %f10;\n"
|
||||
" @!%p4 bra $Lt_0_6658;\n"
|
||||
" sub.ftz.f32 %f12, %f6, %f1;\n"
|
||||
" sub.ftz.f32 %f13, %f7, %f2;\n"
|
||||
" sub.ftz.f32 %f14, %f8, %f3;\n"
|
||||
/* The float at byte offset 0 must be strictly greater than the summed
   squares of the three differences for the entry to be accepted. */
" ld.global.f32 %f15, [%rd28+0];\n"
|
||||
" mul.ftz.f32 %f16, %f12, %f12;\n"
|
||||
" fma.rn.ftz.f32 %f17, %f13, %f13, %f16;\n"
|
||||
" fma.rn.ftz.f32 %f18, %f14, %f14, %f17;\n"
|
||||
" setp.gt.ftz.f32 %p5, %f15, %f18;\n"
|
||||
" @!%p5 bra $Lt_0_6658;\n"
|
||||
" .loc 16 64 0\n"
|
||||
" st.global.s32 [%rd14+0], %r20;\n"
|
||||
" .loc 16 65 0\n"
|
||||
" add.u64 %rd14, %rd6, %rd14;\n"
|
||||
" .loc 16 66 0\n"
|
||||
" add.s32 %r18, %r18, 1;\n"
|
||||
"$Lt_0_6658:\n"
|
||||
"$L_0_3842:\n"
|
||||
" .loc 16 47 0\n"
|
||||
" add.u64 %rd9, %rd6, %rd9;\n"
|
||||
" setp.gt.u64 %p6, %rd21, %rd9;\n"
|
||||
" @%p6 bra $Lt_0_5378;\n"
|
||||
" bra.uni $Lt_0_4866;\n"
|
||||
"$Lt_0_6402:\n"
|
||||
" mov.s32 %r18, 0;\n"
|
||||
"$Lt_0_4866:\n"
|
||||
" .loc 16 70 0\n"
|
||||
/* The accepted count in %r18 is stored at dev_nbor[%r7 + nbor_pitch]. */
" add.s32 %r24, %r12, %r7;\n"
|
||||
" cvt.s64.s32 %rd29, %r24;\n"
|
||||
" mul.wide.s32 %rd30, %r24, 4;\n"
|
||||
" add.u64 %rd31, %rd10, %rd30;\n"
|
||||
" st.global.s32 [%rd31+0], %r18;\n"
|
||||
"$Lt_0_4354:\n"
|
||||
" .loc 16 72 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_nbor:\n"
|
||||
" }\n"
|
||||
/* ---- entry point 2: kernel_nbor_fast (table lookups read from shared memory) ---- */
" .entry kernel_nbor_fast (\n"
|
||||
" .param .u64 __cudaparm_kernel_nbor_fast_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_nbor_fast_cut_form,\n"
|
||||
" .param .u64 __cudaparm_kernel_nbor_fast_dev_nbor,\n"
|
||||
" .param .s32 __cudaparm_kernel_nbor_fast_nbor_pitch,\n"
|
||||
" .param .s32 __cudaparm_kernel_nbor_fast_start,\n"
|
||||
" .param .s32 __cudaparm_kernel_nbor_fast_inum,\n"
|
||||
" .param .u64 __cudaparm_kernel_nbor_fast_dev_ij,\n"
|
||||
" .param .s32 __cudaparm_kernel_nbor_fast_form_low,\n"
|
||||
" .param .s32 __cudaparm_kernel_nbor_fast_form_high)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<28>;\n"
|
||||
" .reg .u64 %rd<42>;\n"
|
||||
" .reg .f32 %f<19>;\n"
|
||||
" .reg .pred %p<9>;\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32570_31_non_const_form120[484];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32571_33_non_const_cutsq604[484];\n"
|
||||
" .loc 16 84 0\n"
|
||||
"$LDWbegin_kernel_nbor_fast:\n"
|
||||
/* Threads with tid.x <= 120 each copy one cut_form entry into the two
   484-byte (121-slot) shared tables. */
" cvt.s32.u32 %r1, %tid.x;\n"
|
||||
" mov.u32 %r2, 120;\n"
|
||||
" setp.gt.s32 %p1, %r1, %r2;\n"
|
||||
" @%p1 bra $Lt_1_5122;\n"
|
||||
" .loc 16 90 0\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120;\n"
|
||||
" mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604;\n"
|
||||
" cvt.s64.s32 %rd3, %r1;\n"
|
||||
" mul.wide.s32 %rd4, %r1, 4;\n"
|
||||
" ld.param.u64 %rd5, [__cudaparm_kernel_nbor_fast_cut_form];\n"
|
||||
" mul.wide.s32 %rd6, %r1, 8;\n"
|
||||
" add.u64 %rd7, %rd5, %rd6;\n"
|
||||
" ld.global.v2.f32 {%f1,%f2}, [%rd7+0];\n"
|
||||
" add.u64 %rd8, %rd4, %rd2;\n"
|
||||
" st.shared.f32 [%rd8+0], %f1;\n"
|
||||
" .loc 16 91 0\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r3, %f2;\n"
|
||||
" add.u64 %rd9, %rd4, %rd1;\n"
|
||||
" st.shared.s32 [%rd9+0], %r3;\n"
|
||||
"$Lt_1_5122:\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32570_31_non_const_form120;\n"
|
||||
" mov.u64 %rd2, __cuda___cuda_local_var_32571_33_non_const_cutsq604;\n"
|
||||
" .loc 16 94 0\n"
|
||||
/* Block-wide barrier before any thread reads the shared tables. */
" bar.sync 0;\n"
|
||||
" cvt.s32.u32 %r4, %ctaid.x;\n"
|
||||
" cvt.s32.u32 %r5, %ntid.x;\n"
|
||||
" mul.lo.s32 %r6, %r4, %r5;\n"
|
||||
" ld.param.s32 %r7, [__cudaparm_kernel_nbor_fast_start];\n"
|
||||
" add.s32 %r8, %r7, %r6;\n"
|
||||
" add.s32 %r9, %r8, %r1;\n"
|
||||
" ld.param.s32 %r10, [__cudaparm_kernel_nbor_fast_inum];\n"
|
||||
" setp.le.s32 %p2, %r10, %r9;\n"
|
||||
" @%p2 bra $Lt_1_5634;\n"
|
||||
" .loc 16 98 0\n"
|
||||
" cvt.s64.s32 %rd10, %r9;\n"
|
||||
" ld.param.u64 %rd11, [__cudaparm_kernel_nbor_fast_dev_ij];\n"
|
||||
" mul.wide.s32 %rd12, %r9, 4;\n"
|
||||
" add.u64 %rd13, %rd11, %rd12;\n"
|
||||
" ld.global.s32 %r11, [%rd13+0];\n"
|
||||
" .loc 16 100 0\n"
|
||||
" ld.param.s32 %r12, [__cudaparm_kernel_nbor_fast_nbor_pitch];\n"
|
||||
" cvt.s64.s32 %rd14, %r12;\n"
|
||||
" mul.wide.s32 %rd15, %r12, 4;\n"
|
||||
" add.u64 %rd16, %rd15, %rd13;\n"
|
||||
" ld.global.s32 %r13, [%rd16+0];\n"
|
||||
" .loc 16 101 0\n"
|
||||
" add.u64 %rd17, %rd15, %rd16;\n"
|
||||
" mov.s64 %rd18, %rd17;\n"
|
||||
" .loc 16 103 0\n"
|
||||
" ld.param.u64 %rd19, [__cudaparm_kernel_nbor_fast_dev_nbor];\n"
|
||||
" add.u64 %rd20, %rd10, %rd14;\n"
|
||||
" add.u64 %rd21, %rd14, %rd20;\n"
|
||||
" mul.lo.u64 %rd22, %rd21, 4;\n"
|
||||
" add.u64 %rd23, %rd19, %rd22;\n"
|
||||
" .loc 16 105 0\n"
|
||||
" ld.param.u64 %rd24, [__cudaparm_kernel_nbor_fast_x_];\n"
|
||||
" cvt.s64.s32 %rd25, %r11;\n"
|
||||
" mul.wide.s32 %rd26, %r11, 16;\n"
|
||||
" add.u64 %rd27, %rd24, %rd26;\n"
|
||||
" ld.global.v4.f32 {%f3,%f4,%f5,%f6}, [%rd27+0];\n"
|
||||
" cvt.s32.s64 %r14, %rd14;\n"
|
||||
" mul.lo.s32 %r15, %r14, %r13;\n"
|
||||
" cvt.s64.s32 %rd28, %r15;\n"
|
||||
" mul.wide.s32 %rd29, %r15, 4;\n"
|
||||
" add.u64 %rd30, %rd17, %rd29;\n"
|
||||
" setp.ge.u64 %p3, %rd17, %rd30;\n"
|
||||
" @%p3 bra $Lt_1_7682;\n"
|
||||
/* Row offset uses a fixed width of 11 here (no ntypes parameter). */
" cvt.rzi.ftz.s32.f32 %r16, %f6;\n"
|
||||
" mul.lo.s32 %r17, %r16, 11;\n"
|
||||
" ld.param.s32 %r18, [__cudaparm_kernel_nbor_fast_form_low];\n"
|
||||
" mov.s32 %r19, 0;\n"
|
||||
/* Loop head: one iteration per list entry, read cursor in %rd18. */
"$Lt_1_6658:\n"
|
||||
" .loc 16 112 0\n"
|
||||
" ld.global.s32 %r20, [%rd18+0];\n"
|
||||
" and.b32 %r21, %r20, 1073741823;\n"
|
||||
" .loc 16 113 0\n"
|
||||
" cvt.s64.s32 %rd31, %r21;\n"
|
||||
" mul.wide.s32 %rd32, %r21, 16;\n"
|
||||
" add.u64 %rd33, %rd24, %rd32;\n"
|
||||
" ld.global.v4.f32 {%f7,%f8,%f9,%f10}, [%rd33+0];\n"
|
||||
" .loc 16 111 0\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r22, %f10;\n"
|
||||
" add.s32 %r23, %r22, %r17;\n"
|
||||
" cvt.s64.s32 %rd34, %r23;\n"
|
||||
" mul.wide.s32 %rd35, %r23, 4;\n"
|
||||
" add.u64 %rd36, %rd35, %rd1;\n"
|
||||
" ld.shared.s32 %r24, [%rd36+0];\n"
|
||||
" setp.lt.s32 %p4, %r24, %r18;\n"
|
||||
" @%p4 bra $Lt_1_7938;\n"
|
||||
" ld.param.s32 %r25, [__cudaparm_kernel_nbor_fast_form_high];\n"
|
||||
" setp.lt.s32 %p5, %r25, %r24;\n"
|
||||
" @%p5 bra $Lt_1_7938;\n"
|
||||
" sub.ftz.f32 %f11, %f7, %f3;\n"
|
||||
" sub.ftz.f32 %f12, %f8, %f4;\n"
|
||||
" sub.ftz.f32 %f13, %f9, %f5;\n"
|
||||
" add.u64 %rd37, %rd35, %rd2;\n"
|
||||
" ld.shared.f32 %f14, [%rd37+0];\n"
|
||||
" mul.ftz.f32 %f15, %f11, %f11;\n"
|
||||
" fma.rn.ftz.f32 %f16, %f12, %f12, %f15;\n"
|
||||
" fma.rn.ftz.f32 %f17, %f13, %f13, %f16;\n"
|
||||
" setp.gt.ftz.f32 %p6, %f14, %f17;\n"
|
||||
" @!%p6 bra $Lt_1_7938;\n"
|
||||
" .loc 16 127 0\n"
|
||||
" st.global.s32 [%rd23+0], %r21;\n"
|
||||
" .loc 16 128 0\n"
|
||||
" add.u64 %rd23, %rd15, %rd23;\n"
|
||||
" .loc 16 129 0\n"
|
||||
" add.s32 %r19, %r19, 1;\n"
|
||||
"$Lt_1_7938:\n"
|
||||
"$L_1_4610:\n"
|
||||
" .loc 16 110 0\n"
|
||||
" add.u64 %rd18, %rd15, %rd18;\n"
|
||||
" setp.gt.u64 %p7, %rd30, %rd18;\n"
|
||||
" @%p7 bra $Lt_1_6658;\n"
|
||||
" bra.uni $Lt_1_6146;\n"
|
||||
"$Lt_1_7682:\n"
|
||||
" mov.s32 %r19, 0;\n"
|
||||
"$Lt_1_6146:\n"
|
||||
" .loc 16 133 0\n"
|
||||
/* The accepted count in %r19 is stored at dev_nbor[%r9 + nbor_pitch]. */
" add.s32 %r26, %r14, %r9;\n"
|
||||
" cvt.s64.s32 %rd38, %r26;\n"
|
||||
" mul.wide.s32 %rd39, %r26, 4;\n"
|
||||
" add.u64 %rd40, %rd19, %rd39;\n"
|
||||
" st.global.s32 [%rd40+0], %r19;\n"
|
||||
"$Lt_1_5634:\n"
|
||||
" .loc 16 135 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_nbor_fast:\n"
|
||||
" }\n"
|
||||
;
|
||||
1590
lib/gpu/gayberne.ptx
1590
lib/gpu/gayberne.ptx
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
lib/gpu/lal_lj.o
BIN
lib/gpu/lal_lj.o
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
901
lib/gpu/lj.ptx
901
lib/gpu/lj.ptx
@ -1,901 +0,0 @@
|
||||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_00009c40_00000000-9_lal_lj.cpp3.i (/home/sjplimp/ccBI#.N4UW9Z)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_00009c40_00000000-8_lal_lj.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "lal_lj.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
.global .texref pos_tex;
|
||||
|
||||
.entry kernel_pair (
|
||||
.param .u64 __cudaparm_kernel_pair_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_lj1,
|
||||
.param .u64 __cudaparm_kernel_pair_lj3,
|
||||
.param .s32 __cudaparm_kernel_pair_lj_types,
|
||||
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_ans,
|
||||
.param .u64 __cudaparm_kernel_pair___val_paramengv,
|
||||
.param .s32 __cudaparm_kernel_pair_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<72>;
|
||||
.reg .u64 %rd<63>;
|
||||
.reg .f32 %f<102>;
|
||||
.reg .pred %p<19>;
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32600_55_non_const_red_acc108[3072];
|
||||
// __cuda_local_var_32543_10_non_const_f = 48
|
||||
// __cuda_local_var_32545_9_non_const_virial = 16
|
||||
.loc 16 31 0
|
||||
$LDWbegin_kernel_pair:
|
||||
.loc 16 36 0
|
||||
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
|
||||
ldu.global.f32 %f1, [%rd1+0];
|
||||
.loc 16 37 0
|
||||
ld.global.f32 %f2, [%rd1+4];
|
||||
.loc 16 38 0
|
||||
ld.global.f32 %f3, [%rd1+8];
|
||||
.loc 16 39 0
|
||||
ld.global.f32 %f4, [%rd1+12];
|
||||
st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
|
||||
.loc 16 46 0
|
||||
mov.f32 %f5, 0f00000000; // 0
|
||||
mov.f32 %f6, %f5;
|
||||
mov.f32 %f7, 0f00000000; // 0
|
||||
mov.f32 %f8, %f7;
|
||||
mov.f32 %f9, 0f00000000; // 0
|
||||
mov.f32 %f10, %f9;
|
||||
mov.f32 %f11, 0f00000000; // 0
|
||||
mov.f32 %f12, %f11;
|
||||
mov.f32 %f13, 0f00000000; // 0
|
||||
mov.f32 %f14, %f13;
|
||||
mov.f32 %f15, 0f00000000; // 0
|
||||
mov.f32 %f16, %f15;
|
||||
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
|
||||
cvt.s32.u32 %r2, %tid.x;
|
||||
div.s32 %r3, %r2, %r1;
|
||||
cvt.s32.u32 %r4, %ntid.x;
|
||||
div.s32 %r5, %r4, %r1;
|
||||
cvt.s32.u32 %r6, %ctaid.x;
|
||||
mul.lo.s32 %r7, %r6, %r5;
|
||||
add.s32 %r8, %r3, %r7;
|
||||
ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];
|
||||
setp.ge.s32 %p1, %r8, %r9;
|
||||
@%p1 bra $Lt_0_26370;
|
||||
.loc 16 51 0
|
||||
ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];
|
||||
cvt.s64.s32 %rd2, %r10;
|
||||
mul.wide.s32 %rd3, %r10, 4;
|
||||
cvt.s64.s32 %rd4, %r8;
|
||||
mul.wide.s32 %rd5, %r8, 4;
|
||||
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
|
||||
add.u64 %rd7, %rd5, %rd6;
|
||||
add.u64 %rd8, %rd3, %rd7;
|
||||
ld.global.s32 %r11, [%rd8+0];
|
||||
sub.s32 %r12, %r1, 1;
|
||||
and.b32 %r13, %r12, %r2;
|
||||
cvt.s64.s32 %rd9, %r13;
|
||||
mul.wide.s32 %rd10, %r13, 4;
|
||||
ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];
|
||||
setp.ne.u64 %p2, %rd11, %rd6;
|
||||
@%p2 bra $Lt_0_19458;
|
||||
cvt.s32.s64 %r14, %rd2;
|
||||
mul.lo.s32 %r15, %r14, %r1;
|
||||
mov.s32 %r16, %r15;
|
||||
mul.lo.s32 %r17, %r12, %r8;
|
||||
add.s32 %r18, %r14, %r17;
|
||||
cvt.s64.s32 %rd12, %r18;
|
||||
mul.wide.s32 %rd13, %r18, 4;
|
||||
add.u64 %rd14, %rd8, %rd13;
|
||||
and.b32 %r19, %r12, %r11;
|
||||
cvt.s64.s32 %rd15, %r19;
|
||||
div.s32 %r20, %r11, %r1;
|
||||
mul.lo.s32 %r21, %r15, %r20;
|
||||
cvt.s64.s32 %rd16, %r21;
|
||||
add.u64 %rd17, %rd15, %rd16;
|
||||
mul.lo.u64 %rd18, %rd17, 4;
|
||||
add.u64 %rd19, %rd14, %rd18;
|
||||
add.u64 %rd20, %rd10, %rd14;
|
||||
bra.uni $Lt_0_19202;
|
||||
$Lt_0_19458:
|
||||
add.u64 %rd21, %rd3, %rd8;
|
||||
ld.global.s32 %r22, [%rd21+0];
|
||||
cvt.s64.s32 %rd22, %r22;
|
||||
mul.wide.s32 %rd23, %r22, 4;
|
||||
add.u64 %rd24, %rd11, %rd23;
|
||||
cvt.s64.s32 %rd25, %r11;
|
||||
mul.wide.s32 %rd26, %r11, 4;
|
||||
add.u64 %rd19, %rd24, %rd26;
|
||||
mov.s32 %r16, %r1;
|
||||
add.u64 %rd20, %rd10, %rd24;
|
||||
$Lt_0_19202:
|
||||
.loc 16 54 0
|
||||
ld.global.s32 %r23, [%rd7+0];
|
||||
mov.u32 %r24, %r23;
|
||||
mov.s32 %r25, 0;
|
||||
mov.u32 %r26, %r25;
|
||||
mov.s32 %r27, 0;
|
||||
mov.u32 %r28, %r27;
|
||||
mov.s32 %r29, 0;
|
||||
mov.u32 %r30, %r29;
|
||||
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];
|
||||
mov.f32 %f21, %f17;
|
||||
mov.f32 %f22, %f18;
|
||||
mov.f32 %f23, %f19;
|
||||
mov.f32 %f24, %f20;
|
||||
setp.ge.u64 %p3, %rd20, %rd19;
|
||||
@%p3 bra $Lt_0_27906;
|
||||
cvt.rzi.ftz.s32.f32 %r31, %f24;
|
||||
cvt.s64.s32 %rd27, %r16;
|
||||
ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];
|
||||
mul.lo.s32 %r33, %r32, %r31;
|
||||
ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];
|
||||
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;
|
||||
$Lt_0_20226:
|
||||
//<loop> Loop body line 54, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 60 0
|
||||
ld.global.s32 %r34, [%rd20+0];
|
||||
.loc 16 61 0
|
||||
shr.s32 %r35, %r34, 30;
|
||||
and.b32 %r36, %r35, 3;
|
||||
cvt.s64.s32 %rd30, %r36;
|
||||
mul.wide.s32 %rd31, %r36, 4;
|
||||
add.u64 %rd32, %rd29, %rd31;
|
||||
ld.shared.f32 %f29, [%rd32+0];
|
||||
.loc 16 64 0
|
||||
and.b32 %r37, %r34, 1073741823;
|
||||
mov.u32 %r38, %r37;
|
||||
mov.s32 %r39, 0;
|
||||
mov.u32 %r40, %r39;
|
||||
mov.s32 %r41, 0;
|
||||
mov.u32 %r42, %r41;
|
||||
mov.s32 %r43, 0;
|
||||
mov.u32 %r44, %r43;
|
||||
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];
|
||||
mov.f32 %f34, %f30;
|
||||
mov.f32 %f35, %f31;
|
||||
mov.f32 %f36, %f32;
|
||||
mov.f32 %f37, %f33;
|
||||
cvt.rzi.ftz.s32.f32 %r45, %f37;
|
||||
sub.ftz.f32 %f38, %f22, %f35;
|
||||
sub.ftz.f32 %f39, %f21, %f34;
|
||||
sub.ftz.f32 %f40, %f23, %f36;
|
||||
mul.ftz.f32 %f41, %f38, %f38;
|
||||
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
|
||||
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
|
||||
add.s32 %r46, %r45, %r33;
|
||||
cvt.s64.s32 %rd33, %r46;
|
||||
mul.wide.s32 %rd34, %r46, 16;
|
||||
add.u64 %rd35, %rd34, %rd28;
|
||||
ld.global.f32 %f44, [%rd35+8];
|
||||
setp.gt.ftz.f32 %p4, %f44, %f43;
|
||||
@!%p4 bra $Lt_0_21506;
|
||||
.loc 16 78 0
|
||||
rcp.approx.ftz.f32 %f45, %f43;
|
||||
mul.ftz.f32 %f46, %f45, %f45;
|
||||
mul.ftz.f32 %f47, %f45, %f46;
|
||||
mul.ftz.f32 %f48, %f45, %f47;
|
||||
ld.global.v2.f32 {%f49,%f50}, [%rd35+0];
|
||||
mul.ftz.f32 %f51, %f49, %f47;
|
||||
sub.ftz.f32 %f52, %f51, %f50;
|
||||
mul.ftz.f32 %f53, %f48, %f52;
|
||||
mul.ftz.f32 %f54, %f29, %f53;
|
||||
.loc 16 80 0
|
||||
fma.rn.ftz.f32 %f27, %f39, %f54, %f27;
|
||||
.loc 16 81 0
|
||||
fma.rn.ftz.f32 %f26, %f38, %f54, %f26;
|
||||
.loc 16 82 0
|
||||
fma.rn.ftz.f32 %f25, %f40, %f54, %f25;
|
||||
ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];
|
||||
mov.u32 %r48, 0;
|
||||
setp.le.s32 %p5, %r47, %r48;
|
||||
@%p5 bra $Lt_0_20994;
|
||||
.loc 16 86 0
|
||||
ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];
|
||||
add.u64 %rd37, %rd36, %rd34;
|
||||
ld.global.v4.f32 {%f55,%f56,%f57,_}, [%rd37+0];
|
||||
mul.ftz.f32 %f58, %f55, %f47;
|
||||
sub.ftz.f32 %f59, %f58, %f56;
|
||||
mul.ftz.f32 %f60, %f47, %f59;
|
||||
sub.ftz.f32 %f61, %f60, %f57;
|
||||
fma.rn.ftz.f32 %f28, %f29, %f61, %f28;
|
||||
$Lt_0_20994:
|
||||
ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r50, 0;
|
||||
setp.le.s32 %p6, %r49, %r50;
|
||||
@%p6 bra $Lt_0_21506;
|
||||
.loc 16 89 0
|
||||
mov.f32 %f62, %f6;
|
||||
mul.ftz.f32 %f63, %f39, %f39;
|
||||
fma.rn.ftz.f32 %f64, %f54, %f63, %f62;
|
||||
mov.f32 %f6, %f64;
|
||||
.loc 16 90 0
|
||||
mov.f32 %f65, %f8;
|
||||
fma.rn.ftz.f32 %f66, %f54, %f41, %f65;
|
||||
mov.f32 %f8, %f66;
|
||||
.loc 16 91 0
|
||||
mov.f32 %f67, %f10;
|
||||
mul.ftz.f32 %f68, %f40, %f40;
|
||||
fma.rn.ftz.f32 %f69, %f54, %f68, %f67;
|
||||
mov.f32 %f10, %f69;
|
||||
.loc 16 92 0
|
||||
mov.f32 %f70, %f12;
|
||||
mul.ftz.f32 %f71, %f38, %f39;
|
||||
fma.rn.ftz.f32 %f72, %f54, %f71, %f70;
|
||||
mov.f32 %f12, %f72;
|
||||
.loc 16 93 0
|
||||
mov.f32 %f73, %f14;
|
||||
mul.ftz.f32 %f74, %f39, %f40;
|
||||
fma.rn.ftz.f32 %f75, %f54, %f74, %f73;
|
||||
mov.f32 %f14, %f75;
|
||||
.loc 16 94 0
|
||||
mul.ftz.f32 %f76, %f38, %f40;
|
||||
fma.rn.ftz.f32 %f15, %f54, %f76, %f15;
|
||||
mov.f32 %f16, %f15;
|
||||
$Lt_0_21506:
|
||||
$Lt_0_20482:
|
||||
.loc 16 58 0
|
||||
mul.lo.u64 %rd38, %rd27, 4;
|
||||
add.u64 %rd20, %rd20, %rd38;
|
||||
setp.lt.u64 %p7, %rd20, %rd19;
|
||||
@%p7 bra $Lt_0_20226;
|
||||
bra.uni $Lt_0_19714;
|
||||
$Lt_0_27906:
|
||||
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
$Lt_0_19714:
|
||||
mov.u32 %r51, 1;
|
||||
setp.le.s32 %p8, %r1, %r51;
|
||||
@%p8 bra $Lt_0_24322;
|
||||
.loc 16 99 0
|
||||
mov.u64 %rd39, __cuda___cuda_local_var_32600_55_non_const_red_acc108;
|
||||
cvt.s64.s32 %rd40, %r2;
|
||||
mul.wide.s32 %rd41, %r2, 4;
|
||||
add.u64 %rd42, %rd39, %rd41;
|
||||
mov.f32 %f77, %f27;
|
||||
st.shared.f32 [%rd42+0], %f77;
|
||||
mov.f32 %f78, %f26;
|
||||
st.shared.f32 [%rd42+512], %f78;
|
||||
mov.f32 %f79, %f25;
|
||||
st.shared.f32 [%rd42+1024], %f79;
|
||||
mov.f32 %f80, %f28;
|
||||
st.shared.f32 [%rd42+1536], %f80;
|
||||
shr.s32 %r52, %r1, 31;
|
||||
mov.s32 %r53, 1;
|
||||
and.b32 %r54, %r52, %r53;
|
||||
add.s32 %r55, %r54, %r1;
|
||||
shr.s32 %r56, %r55, 1;
|
||||
mov.s32 %r57, %r56;
|
||||
mov.u32 %r58, 0;
|
||||
setp.ne.u32 %p9, %r56, %r58;
|
||||
@!%p9 bra $Lt_0_22786;
|
||||
$Lt_0_23298:
|
||||
setp.ge.u32 %p10, %r13, %r57;
|
||||
@%p10 bra $Lt_0_23554;
|
||||
add.u32 %r59, %r2, %r57;
|
||||
cvt.u64.u32 %rd43, %r59;
|
||||
mul.wide.u32 %rd44, %r59, 4;
|
||||
add.u64 %rd45, %rd39, %rd44;
|
||||
ld.shared.f32 %f81, [%rd45+0];
|
||||
add.ftz.f32 %f77, %f81, %f77;
|
||||
st.shared.f32 [%rd42+0], %f77;
|
||||
ld.shared.f32 %f82, [%rd45+512];
|
||||
add.ftz.f32 %f78, %f82, %f78;
|
||||
st.shared.f32 [%rd42+512], %f78;
|
||||
ld.shared.f32 %f83, [%rd45+1024];
|
||||
add.ftz.f32 %f79, %f83, %f79;
|
||||
st.shared.f32 [%rd42+1024], %f79;
|
||||
ld.shared.f32 %f84, [%rd45+1536];
|
||||
add.ftz.f32 %f80, %f84, %f80;
|
||||
st.shared.f32 [%rd42+1536], %f80;
|
||||
$Lt_0_23554:
|
||||
shr.u32 %r57, %r57, 1;
|
||||
mov.u32 %r60, 0;
|
||||
setp.ne.u32 %p11, %r57, %r60;
|
||||
@%p11 bra $Lt_0_23298;
|
||||
$Lt_0_22786:
|
||||
mov.f32 %f27, %f77;
|
||||
mov.f32 %f26, %f78;
|
||||
mov.f32 %f25, %f79;
|
||||
mov.f32 %f28, %f80;
|
||||
ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r62, 0;
|
||||
setp.le.s32 %p12, %r61, %r62;
|
||||
@%p12 bra $Lt_0_24322;
|
||||
mov.f32 %f77, %f6;
|
||||
st.shared.f32 [%rd42+0], %f77;
|
||||
mov.f32 %f78, %f8;
|
||||
st.shared.f32 [%rd42+512], %f78;
|
||||
mov.f32 %f79, %f10;
|
||||
st.shared.f32 [%rd42+1024], %f79;
|
||||
mov.f32 %f80, %f12;
|
||||
st.shared.f32 [%rd42+1536], %f80;
|
||||
mov.f32 %f85, %f14;
|
||||
st.shared.f32 [%rd42+2048], %f85;
|
||||
mov.f32 %f86, %f15;
|
||||
st.shared.f32 [%rd42+2560], %f86;
|
||||
mov.s32 %r63, %r56;
|
||||
@!%p9 bra $Lt_0_24834;
|
||||
$Lt_0_25346:
|
||||
setp.ge.u32 %p13, %r13, %r63;
|
||||
@%p13 bra $Lt_0_25602;
|
||||
add.u32 %r64, %r2, %r63;
|
||||
cvt.u64.u32 %rd46, %r64;
|
||||
mul.wide.u32 %rd47, %r64, 4;
|
||||
add.u64 %rd48, %rd39, %rd47;
|
||||
ld.shared.f32 %f87, [%rd48+0];
|
||||
add.ftz.f32 %f77, %f87, %f77;
|
||||
st.shared.f32 [%rd42+0], %f77;
|
||||
ld.shared.f32 %f88, [%rd48+512];
|
||||
add.ftz.f32 %f78, %f88, %f78;
|
||||
st.shared.f32 [%rd42+512], %f78;
|
||||
ld.shared.f32 %f89, [%rd48+1024];
|
||||
add.ftz.f32 %f79, %f89, %f79;
|
||||
st.shared.f32 [%rd42+1024], %f79;
|
||||
ld.shared.f32 %f90, [%rd48+1536];
|
||||
add.ftz.f32 %f80, %f90, %f80;
|
||||
st.shared.f32 [%rd42+1536], %f80;
|
||||
ld.shared.f32 %f91, [%rd48+2048];
|
||||
add.ftz.f32 %f85, %f91, %f85;
|
||||
st.shared.f32 [%rd42+2048], %f85;
|
||||
ld.shared.f32 %f92, [%rd48+2560];
|
||||
add.ftz.f32 %f86, %f92, %f86;
|
||||
st.shared.f32 [%rd42+2560], %f86;
|
||||
$Lt_0_25602:
|
||||
shr.u32 %r63, %r63, 1;
|
||||
mov.u32 %r65, 0;
|
||||
setp.ne.u32 %p14, %r63, %r65;
|
||||
@%p14 bra $Lt_0_25346;
|
||||
$Lt_0_24834:
|
||||
mov.f32 %f6, %f77;
|
||||
mov.f32 %f8, %f78;
|
||||
mov.f32 %f10, %f79;
|
||||
mov.f32 %f12, %f80;
|
||||
mov.f32 %f14, %f85;
|
||||
mov.f32 %f16, %f86;
|
||||
$Lt_0_24322:
|
||||
$Lt_0_22274:
|
||||
mov.u32 %r66, 0;
|
||||
setp.ne.s32 %p15, %r13, %r66;
|
||||
@%p15 bra $Lt_0_26370;
|
||||
ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];
|
||||
add.u64 %rd50, %rd49, %rd5;
|
||||
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
|
||||
mov.u32 %r68, 0;
|
||||
setp.le.s32 %p16, %r67, %r68;
|
||||
@%p16 bra $Lt_0_26882;
|
||||
st.global.f32 [%rd50+0], %f28;
|
||||
cvt.s64.s32 %rd51, %r9;
|
||||
mul.wide.s32 %rd52, %r9, 4;
|
||||
add.u64 %rd50, %rd50, %rd52;
|
||||
$Lt_0_26882:
|
||||
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p17, %r69, %r70;
|
||||
@%p17 bra $Lt_0_27394;
|
||||
mov.f32 %f93, %f6;
|
||||
st.global.f32 [%rd50+0], %f93;
|
||||
cvt.s64.s32 %rd53, %r9;
|
||||
mul.wide.s32 %rd54, %r9, 4;
|
||||
add.u64 %rd55, %rd54, %rd50;
|
||||
mov.f32 %f94, %f8;
|
||||
st.global.f32 [%rd55+0], %f94;
|
||||
add.u64 %rd56, %rd54, %rd55;
|
||||
mov.f32 %f95, %f10;
|
||||
st.global.f32 [%rd56+0], %f95;
|
||||
add.u64 %rd57, %rd54, %rd56;
|
||||
mov.f32 %f96, %f12;
|
||||
st.global.f32 [%rd57+0], %f96;
|
||||
add.u64 %rd50, %rd54, %rd57;
|
||||
mov.f32 %f97, %f14;
|
||||
st.global.f32 [%rd50+0], %f97;
|
||||
mov.f32 %f98, %f16;
|
||||
add.u64 %rd58, %rd54, %rd50;
|
||||
st.global.f32 [%rd58+0], %f98;
|
||||
$Lt_0_27394:
|
||||
ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];
|
||||
mul.lo.u64 %rd60, %rd4, 16;
|
||||
add.u64 %rd61, %rd59, %rd60;
|
||||
mov.f32 %f99, %f100;
|
||||
st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f99};
|
||||
$Lt_0_26370:
|
||||
$Lt_0_18690:
|
||||
.loc 16 102 0
|
||||
exit;
|
||||
$LDWend_kernel_pair:
|
||||
} // kernel_pair
|
||||
|
||||
// kernel_pair_fast
// Pair kernel variant that first stages the sp_lj factors and the lj1 table
// (plus the lj3 table when eflag > 0) from global into shared memory, then,
// for the list entry assigned to each thread, walks a neighbor list,
// accumulates four running sums and (when vflag > 0) six further sums,
// optionally combines the partial sums of cooperating threads through shared
// memory, and writes the results to ans and engv.
// Note: the x_ parameter is declared but never loaded in this body; the
// position-like data is fetched through the pos_tex texture reference.
.entry kernel_pair_fast (
|
||||
.param .u64 __cudaparm_kernel_pair_fast_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_ans,
|
||||
.param .u64 __cudaparm_kernel_pair_fast___val_paramengv,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<74>;
|
||||
.reg .u64 %rd<75>;
|
||||
.reg .f32 %f<109>;
|
||||
.reg .pred %p<22>;
|
||||
// Shared staging buffers: sp_lj (16 bytes), lj1 and lj3 (1936 bytes each,
// i.e. 121 entries of 16 bytes), and red_acc (3072 bytes, addressed below as
// six rows that are 512 bytes apart).
.shared .align 4 .b8 __cuda___cuda_local_var_32617_33_non_const_sp_lj3268[16];
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32615_34_non_const_lj13296[1936];
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_lj35232[1936];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32685_55_non_const_red_acc7168[3072];
|
||||
// __cuda_local_var_32627_10_non_const_f = 48
|
||||
// __cuda_local_var_32629_9_non_const_virial = 16
|
||||
.loc 16 110 0
|
||||
$LDWbegin_kernel_pair_fast:
|
||||
// %r1 = tid.x. Threads with tid.x <= 3 each copy one float of sp_lj_in into
// the shared sp_lj buffer.
cvt.s32.u32 %r1, %tid.x;
|
||||
mov.u32 %r2, 3;
|
||||
setp.gt.s32 %p1, %r1, %r2;
|
||||
@%p1 bra $Lt_1_20994;
|
||||
.loc 16 118 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32617_33_non_const_sp_lj3268;
|
||||
cvt.s64.s32 %rd2, %r1;
|
||||
mul.wide.s32 %rd3, %r1, 4;
|
||||
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f32 %f1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f32 [%rd6+0], %f1;
|
||||
$Lt_1_20994:
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32617_33_non_const_sp_lj3268;
|
||||
// Threads with tid.x <= 120 each copy one 16-byte entry of lj1_in into the
// shared lj1 table and, only when eflag > 0, one entry of lj3_in into lj3.
mov.u32 %r3, 120;
|
||||
setp.gt.s32 %p2, %r1, %r3;
|
||||
@%p2 bra $Lt_1_21506;
|
||||
.loc 16 120 0
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32615_34_non_const_lj13296;
|
||||
cvt.s64.s32 %rd8, %r1;
|
||||
mul.wide.s32 %rd9, %r1, 16;
|
||||
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
|
||||
add.u64 %rd11, %rd10, %rd9;
|
||||
add.u64 %rd12, %rd9, %rd7;
|
||||
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
|
||||
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
|
||||
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r5, 0;
|
||||
setp.le.s32 %p3, %r4, %r5;
|
||||
@%p3 bra $Lt_1_22018;
|
||||
.loc 16 122 0
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232;
|
||||
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
|
||||
add.u64 %rd15, %rd14, %rd9;
|
||||
add.u64 %rd16, %rd9, %rd13;
|
||||
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
|
||||
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
|
||||
$Lt_1_22018:
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232;
|
||||
$Lt_1_21506:
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232;
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32615_34_non_const_lj13296;
|
||||
// Zero the six accumulators %f11, %f13, %f15, %f17, %f19, %f21 (%f20 mirrors
// %f21); they are only updated in the vflag > 0 branch of the loop below.
.loc 16 130 0
|
||||
mov.f32 %f10, 0f00000000; // 0
|
||||
mov.f32 %f11, %f10;
|
||||
mov.f32 %f12, 0f00000000; // 0
|
||||
mov.f32 %f13, %f12;
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
mov.f32 %f15, %f14;
|
||||
mov.f32 %f16, 0f00000000; // 0
|
||||
mov.f32 %f17, %f16;
|
||||
mov.f32 %f18, 0f00000000; // 0
|
||||
mov.f32 %f19, %f18;
|
||||
mov.f32 %f20, 0f00000000; // 0
|
||||
mov.f32 %f21, %f20;
|
||||
.loc 16 132 0
|
||||
// The only barrier in this kernel: it follows the shared-memory staging
// stores above and precedes every read of those tables.
bar.sync 0;
|
||||
// %r12 = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom).
// Threads with %r12 >= inum branch straight to the exit.
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
|
||||
div.s32 %r7, %r1, %r6;
|
||||
cvt.s32.u32 %r8, %ntid.x;
|
||||
div.s32 %r9, %r8, %r6;
|
||||
cvt.s32.u32 %r10, %ctaid.x;
|
||||
mul.lo.s32 %r11, %r10, %r9;
|
||||
add.s32 %r12, %r7, %r11;
|
||||
ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];
|
||||
setp.ge.s32 %p4, %r12, %r13;
|
||||
@%p4 bra $Lt_1_30210;
|
||||
.loc 16 137 0
|
||||
// %rd22 = dev_nbor + 4*%r12; %rd23 = %rd22 + 4*nbor_pitch; %r15 is the int
// loaded from %rd23 and is used below to derive the list end pointer %rd34.
// %r17 = (t_per_atom - 1) & tid.x. This equals tid.x % t_per_atom only when
// t_per_atom is a power of two - assumption, TODO confirm against the host.
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];
|
||||
cvt.s64.s32 %rd17, %r14;
|
||||
mul.wide.s32 %rd18, %r14, 4;
|
||||
cvt.s64.s32 %rd19, %r12;
|
||||
mul.wide.s32 %rd20, %r12, 4;
|
||||
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
|
||||
add.u64 %rd22, %rd20, %rd21;
|
||||
add.u64 %rd23, %rd18, %rd22;
|
||||
ld.global.s32 %r15, [%rd23+0];
|
||||
sub.s32 %r16, %r6, 1;
|
||||
and.b32 %r17, %r16, %r1;
|
||||
cvt.s64.s32 %rd24, %r17;
|
||||
mul.wide.s32 %rd25, %r17, 4;
|
||||
ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];
|
||||
// Two list layouts. Fall-through (dev_packed == dev_nbor): the list is read
// from dev_nbor itself with a per-step stride %r20 = nbor_pitch * t_per_atom.
setp.ne.u64 %p5, %rd26, %rd21;
|
||||
@%p5 bra $Lt_1_23298;
|
||||
cvt.s32.s64 %r18, %rd17;
|
||||
mul.lo.s32 %r19, %r18, %r6;
|
||||
mov.s32 %r20, %r19;
|
||||
mul.lo.s32 %r21, %r16, %r12;
|
||||
add.s32 %r22, %r18, %r21;
|
||||
cvt.s64.s32 %rd27, %r22;
|
||||
mul.wide.s32 %rd28, %r22, 4;
|
||||
add.u64 %rd29, %rd23, %rd28;
|
||||
and.b32 %r23, %r16, %r15;
|
||||
cvt.s64.s32 %rd30, %r23;
|
||||
div.s32 %r24, %r15, %r6;
|
||||
mul.lo.s32 %r25, %r19, %r24;
|
||||
cvt.s64.s32 %rd31, %r25;
|
||||
add.u64 %rd32, %rd30, %rd31;
|
||||
mul.lo.u64 %rd33, %rd32, 4;
|
||||
add.u64 %rd34, %rd29, %rd33;
|
||||
add.u64 %rd35, %rd25, %rd29;
|
||||
bra.uni $Lt_1_23042;
|
||||
$Lt_1_23298:
|
||||
// Separate dev_packed array: the int one more nbor_pitch row down is an
// offset into dev_packed; the list spans %r15 ints from there and is walked
// with stride %r20 = t_per_atom, starting at this thread's lane %r17.
add.u64 %rd36, %rd18, %rd23;
|
||||
ld.global.s32 %r26, [%rd36+0];
|
||||
cvt.s64.s32 %rd37, %r26;
|
||||
mul.wide.s32 %rd38, %r26, 4;
|
||||
add.u64 %rd39, %rd26, %rd38;
|
||||
cvt.s64.s32 %rd40, %r15;
|
||||
mul.wide.s32 %rd41, %r15, 4;
|
||||
add.u64 %rd34, %rd39, %rd41;
|
||||
mov.s32 %r20, %r6;
|
||||
add.u64 %rd35, %rd25, %rd39;
|
||||
$Lt_1_23042:
|
||||
.loc 16 140 0
|
||||
// Fetch the pos_tex element indexed by dev_nbor[%r12] into
// (%f26, %f27, %f28, %f29).
ld.global.s32 %r27, [%rd22+0];
|
||||
mov.u32 %r28, %r27;
|
||||
mov.s32 %r29, 0;
|
||||
mov.u32 %r30, %r29;
|
||||
mov.s32 %r31, 0;
|
||||
mov.u32 %r32, %r31;
|
||||
mov.s32 %r33, 0;
|
||||
mov.u32 %r34, %r33;
|
||||
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];
|
||||
mov.f32 %f26, %f22;
|
||||
mov.f32 %f27, %f23;
|
||||
mov.f32 %f28, %f24;
|
||||
mov.f32 %f29, %f25;
|
||||
// Skip the loop entirely when the start pointer is already at/past the end.
setp.ge.u64 %p6, %rd35, %rd34;
|
||||
@%p6 bra $Lt_1_31746;
|
||||
// %f30 = float(int(%f29) * 11): base index into the shared tables (11 entries
// per row, 11 * 11 = 121 entries). Clear the four running sums %f31..%f34.
cvt.rzi.ftz.s32.f32 %r35, %f29;
|
||||
cvt.s64.s32 %rd42, %r20;
|
||||
mul.lo.s32 %r36, %r35, 11;
|
||||
cvt.rn.f32.s32 %f30, %r36;
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
$Lt_1_24066:
|
||||
//<loop> Loop body line 140, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 147 0
|
||||
// Load one list entry %r37. Its top two bits (bits 30-31) select one of the
// four shared sp_lj factors (%f35); its low 30 bits index pos_tex.
ld.global.s32 %r37, [%rd35+0];
|
||||
.loc 16 148 0
|
||||
shr.s32 %r38, %r37, 30;
|
||||
and.b32 %r39, %r38, 3;
|
||||
cvt.s64.s32 %rd43, %r39;
|
||||
mul.wide.s32 %rd44, %r39, 4;
|
||||
add.u64 %rd45, %rd1, %rd44;
|
||||
ld.shared.f32 %f35, [%rd45+0];
|
||||
.loc 16 151 0
|
||||
and.b32 %r40, %r37, 1073741823;
|
||||
mov.u32 %r41, %r40;
|
||||
mov.s32 %r42, 0;
|
||||
mov.u32 %r43, %r42;
|
||||
mov.s32 %r44, 0;
|
||||
mov.u32 %r45, %r44;
|
||||
mov.s32 %r46, 0;
|
||||
mov.u32 %r47, %r46;
|
||||
tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}];
|
||||
mov.f32 %f40, %f36;
|
||||
mov.f32 %f41, %f37;
|
||||
mov.f32 %f42, %f38;
|
||||
mov.f32 %f43, %f39;
|
||||
// Component differences: %f45 (1st), %f44 (2nd), %f46 (3rd); %f49 is the sum
// of their squares. Table index %r48 = int(%f30 + 4th component of neighbor).
sub.ftz.f32 %f44, %f27, %f41;
|
||||
sub.ftz.f32 %f45, %f26, %f40;
|
||||
sub.ftz.f32 %f46, %f28, %f42;
|
||||
mul.ftz.f32 %f47, %f44, %f44;
|
||||
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
|
||||
fma.rn.ftz.f32 %f49, %f46, %f46, %f48;
|
||||
add.ftz.f32 %f50, %f30, %f43;
|
||||
cvt.rzi.ftz.s32.f32 %r48, %f50;
|
||||
cvt.s64.s32 %rd46, %r48;
|
||||
mul.wide.s32 %rd47, %r48, 16;
|
||||
add.u64 %rd48, %rd47, %rd7;
|
||||
// Process the pair only if lj1[%r48] third float (byte offset 8) > %f49.
ld.shared.f32 %f51, [%rd48+8];
|
||||
setp.gt.ftz.f32 %p7, %f51, %f49;
|
||||
@!%p7 bra $Lt_1_25346;
|
||||
.loc 16 163 0
|
||||
// %f52 = approx 1/%f49, %f54 = %f52^3, (%f57, %f58) = first two floats of
// lj1[%r48]; %f61 = %f52 * %f35 * %f54 * (%f57 * %f54 - %f58).
rcp.approx.ftz.f32 %f52, %f49;
|
||||
mul.ftz.f32 %f53, %f52, %f52;
|
||||
mul.ftz.f32 %f54, %f52, %f53;
|
||||
mul.ftz.f32 %f55, %f52, %f35;
|
||||
mul.ftz.f32 %f56, %f54, %f55;
|
||||
ld.shared.v2.f32 {%f57,%f58}, [%rd48+0];
|
||||
mul.ftz.f32 %f59, %f57, %f54;
|
||||
sub.ftz.f32 %f60, %f59, %f58;
|
||||
mul.ftz.f32 %f61, %f56, %f60;
|
||||
// Accumulate the %f61-scaled differences into %f33, %f32, %f31.
.loc 16 165 0
|
||||
fma.rn.ftz.f32 %f33, %f45, %f61, %f33;
|
||||
.loc 16 166 0
|
||||
fma.rn.ftz.f32 %f32, %f44, %f61, %f32;
|
||||
.loc 16 167 0
|
||||
fma.rn.ftz.f32 %f31, %f46, %f61, %f31;
|
||||
ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r50, 0;
|
||||
setp.le.s32 %p8, %r49, %r50;
|
||||
@%p8 bra $Lt_1_24834;
|
||||
.loc 16 170 0
|
||||
// eflag > 0: with (%f62, %f63, %f64) = first three floats of lj3[%r48],
// %f34 += %f35 * (%f54 * (%f62 * %f54 - %f63) - %f64).
add.u64 %rd49, %rd47, %rd13;
|
||||
ld.shared.v4.f32 {%f62,%f63,%f64,_}, [%rd49+0];
|
||||
mul.ftz.f32 %f65, %f62, %f54;
|
||||
sub.ftz.f32 %f66, %f65, %f63;
|
||||
mul.ftz.f32 %f67, %f54, %f66;
|
||||
.loc 16 171 0
|
||||
sub.ftz.f32 %f68, %f67, %f64;
|
||||
fma.rn.ftz.f32 %f34, %f35, %f68, %f34;
|
||||
$Lt_1_24834:
|
||||
ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r52, 0;
|
||||
setp.le.s32 %p9, %r51, %r52;
|
||||
@%p9 bra $Lt_1_25346;
|
||||
.loc 16 174 0
|
||||
// vflag > 0: add %f61 times each pairwise product of the differences to the
// six accumulators, in the order 1*1, 2*2, 3*3, 1*2, 1*3, 2*3
// (1 = %f45, 2 = %f44, 3 = %f46).
mov.f32 %f69, %f11;
|
||||
mul.ftz.f32 %f70, %f45, %f45;
|
||||
fma.rn.ftz.f32 %f71, %f61, %f70, %f69;
|
||||
mov.f32 %f11, %f71;
|
||||
.loc 16 175 0
|
||||
mov.f32 %f72, %f13;
|
||||
fma.rn.ftz.f32 %f73, %f61, %f47, %f72;
|
||||
mov.f32 %f13, %f73;
|
||||
.loc 16 176 0
|
||||
mov.f32 %f74, %f15;
|
||||
mul.ftz.f32 %f75, %f46, %f46;
|
||||
fma.rn.ftz.f32 %f76, %f61, %f75, %f74;
|
||||
mov.f32 %f15, %f76;
|
||||
.loc 16 177 0
|
||||
mov.f32 %f77, %f17;
|
||||
mul.ftz.f32 %f78, %f44, %f45;
|
||||
fma.rn.ftz.f32 %f79, %f61, %f78, %f77;
|
||||
mov.f32 %f17, %f79;
|
||||
.loc 16 178 0
|
||||
mov.f32 %f80, %f19;
|
||||
mul.ftz.f32 %f81, %f45, %f46;
|
||||
fma.rn.ftz.f32 %f82, %f61, %f81, %f80;
|
||||
mov.f32 %f19, %f82;
|
||||
.loc 16 179 0
|
||||
mul.ftz.f32 %f83, %f44, %f46;
|
||||
fma.rn.ftz.f32 %f20, %f61, %f83, %f20;
|
||||
mov.f32 %f21, %f20;
|
||||
$Lt_1_25346:
|
||||
$Lt_1_24322:
|
||||
.loc 16 145 0
|
||||
// Advance the list pointer by %r20 ints (stride chosen above) and repeat
// while it is still below the end pointer %rd34.
mul.lo.u64 %rd50, %rd42, 4;
|
||||
add.u64 %rd35, %rd35, %rd50;
|
||||
setp.lt.u64 %p10, %rd35, %rd34;
|
||||
@%p10 bra $Lt_1_24066;
|
||||
bra.uni $Lt_1_23554;
|
||||
$Lt_1_31746:
|
||||
// Empty-list path: the four running sums are simply zero.
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
$Lt_1_23554:
|
||||
// When t_per_atom > 1, combine the partial sums of cooperating threads
// through red_acc: slot tid.x of rows 0..3 (512 bytes apart) receives
// %f33, %f32, %f31, %f34.
// NOTE(review): no bar.sync separates these shared stores from the loads of
// other threads' slots below; presumably this relies on the cooperating
// threads executing in lockstep within one warp - confirm.
mov.u32 %r53, 1;
|
||||
setp.le.s32 %p11, %r6, %r53;
|
||||
@%p11 bra $Lt_1_28162;
|
||||
.loc 16 184 0
|
||||
mov.u64 %rd51, __cuda___cuda_local_var_32685_55_non_const_red_acc7168;
|
||||
cvt.s64.s32 %rd52, %r1;
|
||||
mul.wide.s32 %rd53, %r1, 4;
|
||||
add.u64 %rd54, %rd51, %rd53;
|
||||
mov.f32 %f84, %f33;
|
||||
st.shared.f32 [%rd54+0], %f84;
|
||||
mov.f32 %f85, %f32;
|
||||
st.shared.f32 [%rd54+512], %f85;
|
||||
mov.f32 %f86, %f31;
|
||||
st.shared.f32 [%rd54+1024], %f86;
|
||||
mov.f32 %f87, %f34;
|
||||
st.shared.f32 [%rd54+1536], %f87;
|
||||
// %r58 = t_per_atom / 2 (signed divide by two, rounding toward zero).
shr.s32 %r54, %r6, 31;
|
||||
mov.s32 %r55, 1;
|
||||
and.b32 %r56, %r54, %r55;
|
||||
add.s32 %r57, %r56, %r6;
|
||||
shr.s32 %r58, %r57, 1;
|
||||
mov.s32 %r59, %r58;
|
||||
mov.u32 %r60, 0;
|
||||
setp.ne.u32 %p12, %r58, %r60;
|
||||
@!%p12 bra $Lt_1_26626;
|
||||
$Lt_1_27138:
|
||||
// Halving loop over s = %r59: a thread whose lane %r17 < s adds the values
// in slot tid.x + s to its own sums and writes them back to its slot.
setp.ge.u32 %p13, %r17, %r59;
|
||||
@%p13 bra $Lt_1_27394;
|
||||
add.u32 %r61, %r1, %r59;
|
||||
cvt.u64.u32 %rd55, %r61;
|
||||
mul.wide.u32 %rd56, %r61, 4;
|
||||
add.u64 %rd57, %rd51, %rd56;
|
||||
ld.shared.f32 %f88, [%rd57+0];
|
||||
add.ftz.f32 %f84, %f88, %f84;
|
||||
st.shared.f32 [%rd54+0], %f84;
|
||||
ld.shared.f32 %f89, [%rd57+512];
|
||||
add.ftz.f32 %f85, %f89, %f85;
|
||||
st.shared.f32 [%rd54+512], %f85;
|
||||
ld.shared.f32 %f90, [%rd57+1024];
|
||||
add.ftz.f32 %f86, %f90, %f86;
|
||||
st.shared.f32 [%rd54+1024], %f86;
|
||||
ld.shared.f32 %f91, [%rd57+1536];
|
||||
add.ftz.f32 %f87, %f91, %f87;
|
||||
st.shared.f32 [%rd54+1536], %f87;
|
||||
$Lt_1_27394:
|
||||
shr.u32 %r59, %r59, 1;
|
||||
mov.u32 %r62, 0;
|
||||
setp.ne.u32 %p14, %r59, %r62;
|
||||
@%p14 bra $Lt_1_27138;
|
||||
$Lt_1_26626:
|
||||
// Copy the combined sums back into %f33, %f32, %f31, %f34.
mov.f32 %f33, %f84;
|
||||
mov.f32 %f32, %f85;
|
||||
mov.f32 %f31, %f86;
|
||||
mov.f32 %f34, %f87;
|
||||
ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r64, 0;
|
||||
setp.le.s32 %p15, %r63, %r64;
|
||||
@%p15 bra $Lt_1_28162;
|
||||
// vflag > 0: repeat the same halving scheme for the six accumulators, using
// six rows of red_acc (byte offsets 0, 512, ..., 2560).
mov.f32 %f84, %f11;
|
||||
st.shared.f32 [%rd54+0], %f84;
|
||||
mov.f32 %f85, %f13;
|
||||
st.shared.f32 [%rd54+512], %f85;
|
||||
mov.f32 %f86, %f15;
|
||||
st.shared.f32 [%rd54+1024], %f86;
|
||||
mov.f32 %f87, %f17;
|
||||
st.shared.f32 [%rd54+1536], %f87;
|
||||
mov.f32 %f92, %f19;
|
||||
st.shared.f32 [%rd54+2048], %f92;
|
||||
mov.f32 %f93, %f20;
|
||||
st.shared.f32 [%rd54+2560], %f93;
|
||||
mov.s32 %r65, %r58;
|
||||
@!%p12 bra $Lt_1_28674;
|
||||
$Lt_1_29186:
|
||||
setp.ge.u32 %p16, %r17, %r65;
|
||||
@%p16 bra $Lt_1_29442;
|
||||
add.u32 %r66, %r1, %r65;
|
||||
cvt.u64.u32 %rd58, %r66;
|
||||
mul.wide.u32 %rd59, %r66, 4;
|
||||
add.u64 %rd60, %rd51, %rd59;
|
||||
ld.shared.f32 %f94, [%rd60+0];
|
||||
add.ftz.f32 %f84, %f94, %f84;
|
||||
st.shared.f32 [%rd54+0], %f84;
|
||||
ld.shared.f32 %f95, [%rd60+512];
|
||||
add.ftz.f32 %f85, %f95, %f85;
|
||||
st.shared.f32 [%rd54+512], %f85;
|
||||
ld.shared.f32 %f96, [%rd60+1024];
|
||||
add.ftz.f32 %f86, %f96, %f86;
|
||||
st.shared.f32 [%rd54+1024], %f86;
|
||||
ld.shared.f32 %f97, [%rd60+1536];
|
||||
add.ftz.f32 %f87, %f97, %f87;
|
||||
st.shared.f32 [%rd54+1536], %f87;
|
||||
ld.shared.f32 %f98, [%rd60+2048];
|
||||
add.ftz.f32 %f92, %f98, %f92;
|
||||
st.shared.f32 [%rd54+2048], %f92;
|
||||
ld.shared.f32 %f99, [%rd60+2560];
|
||||
add.ftz.f32 %f93, %f99, %f93;
|
||||
st.shared.f32 [%rd54+2560], %f93;
|
||||
$Lt_1_29442:
|
||||
shr.u32 %r65, %r65, 1;
|
||||
mov.u32 %r67, 0;
|
||||
setp.ne.u32 %p17, %r65, %r67;
|
||||
@%p17 bra $Lt_1_29186;
|
||||
$Lt_1_28674:
|
||||
// Copy the combined values back into the six accumulator registers.
mov.f32 %f11, %f84;
|
||||
mov.f32 %f13, %f85;
|
||||
mov.f32 %f15, %f86;
|
||||
mov.f32 %f17, %f87;
|
||||
mov.f32 %f19, %f92;
|
||||
mov.f32 %f21, %f93;
|
||||
$Lt_1_28162:
|
||||
$Lt_1_26114:
|
||||
// Only the thread whose lane %r17 is 0 writes results.
mov.u32 %r68, 0;
|
||||
setp.ne.s32 %p18, %r17, %r68;
|
||||
@%p18 bra $Lt_1_30210;
|
||||
// engv output starts at float index %r12; each successive value is written
// inum floats further on. eflag > 0 writes %f34 first.
ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];
|
||||
add.u64 %rd62, %rd61, %rd20;
|
||||
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p19, %r69, %r70;
|
||||
@%p19 bra $Lt_1_30722;
|
||||
st.global.f32 [%rd62+0], %f34;
|
||||
cvt.s64.s32 %rd63, %r13;
|
||||
mul.wide.s32 %rd64, %r13, 4;
|
||||
add.u64 %rd62, %rd62, %rd64;
|
||||
$Lt_1_30722:
|
||||
// vflag > 0 then writes the six accumulators, again inum floats apart.
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r72, 0;
|
||||
setp.le.s32 %p20, %r71, %r72;
|
||||
@%p20 bra $Lt_1_31234;
|
||||
mov.f32 %f100, %f11;
|
||||
st.global.f32 [%rd62+0], %f100;
|
||||
cvt.s64.s32 %rd65, %r13;
|
||||
mul.wide.s32 %rd66, %r13, 4;
|
||||
add.u64 %rd67, %rd66, %rd62;
|
||||
mov.f32 %f101, %f13;
|
||||
st.global.f32 [%rd67+0], %f101;
|
||||
add.u64 %rd68, %rd66, %rd67;
|
||||
mov.f32 %f102, %f15;
|
||||
st.global.f32 [%rd68+0], %f102;
|
||||
add.u64 %rd69, %rd66, %rd68;
|
||||
mov.f32 %f103, %f17;
|
||||
st.global.f32 [%rd69+0], %f103;
|
||||
add.u64 %rd62, %rd66, %rd69;
|
||||
mov.f32 %f104, %f19;
|
||||
st.global.f32 [%rd62+0], %f104;
|
||||
mov.f32 %f105, %f21;
|
||||
add.u64 %rd70, %rd66, %rd62;
|
||||
st.global.f32 [%rd70+0], %f105;
|
||||
$Lt_1_31234:
|
||||
// ans[%r12] (16 bytes per entry) = {%f33, %f32, %f31, %f106}.
// NOTE(review): %f107 is never assigned anywhere in this kernel, so the
// fourth component stored here is undefined; consumers should ignore it.
ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];
|
||||
mul.lo.u64 %rd72, %rd19, 16;
|
||||
add.u64 %rd73, %rd71, %rd72;
|
||||
mov.f32 %f106, %f107;
|
||||
st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f106};
|
||||
$Lt_1_30210:
|
||||
$Lt_1_22530:
|
||||
.loc 16 187 0
|
||||
exit;
|
||||
$LDWend_kernel_pair_fast:
|
||||
} // kernel_pair_fast
|
||||
|
||||
901
lib/gpu/lj96.ptx
901
lib/gpu/lj96.ptx
@ -1,901 +0,0 @@
|
||||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_00009c89_00000000-9_lal_lj96.cpp3.i (/home/sjplimp/ccBI#.pOwwSL)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_00009c89_00000000-8_lal_lj96.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "lal_lj96.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
.global .texref pos_tex;
|
||||
|
||||
.entry kernel_pair (
|
||||
.param .u64 __cudaparm_kernel_pair_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_lj1,
|
||||
.param .u64 __cudaparm_kernel_pair_lj3,
|
||||
.param .s32 __cudaparm_kernel_pair_lj_types,
|
||||
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_ans,
|
||||
.param .u64 __cudaparm_kernel_pair___val_paramengv,
|
||||
.param .s32 __cudaparm_kernel_pair_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<72>;
|
||||
.reg .u64 %rd<63>;
|
||||
.reg .f32 %f<103>;
|
||||
.reg .pred %p<19>;
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32601_55_non_const_red_acc108[3072];
|
||||
// __cuda_local_var_32543_10_non_const_f = 48
|
||||
// __cuda_local_var_32545_9_non_const_virial = 16
|
||||
.loc 16 31 0
|
||||
$LDWbegin_kernel_pair:
|
||||
.loc 16 36 0
|
||||
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
|
||||
ldu.global.f32 %f1, [%rd1+0];
|
||||
.loc 16 37 0
|
||||
ld.global.f32 %f2, [%rd1+4];
|
||||
.loc 16 38 0
|
||||
ld.global.f32 %f3, [%rd1+8];
|
||||
.loc 16 39 0
|
||||
ld.global.f32 %f4, [%rd1+12];
|
||||
st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
|
||||
.loc 16 46 0
|
||||
mov.f32 %f5, 0f00000000; // 0
|
||||
mov.f32 %f6, %f5;
|
||||
mov.f32 %f7, 0f00000000; // 0
|
||||
mov.f32 %f8, %f7;
|
||||
mov.f32 %f9, 0f00000000; // 0
|
||||
mov.f32 %f10, %f9;
|
||||
mov.f32 %f11, 0f00000000; // 0
|
||||
mov.f32 %f12, %f11;
|
||||
mov.f32 %f13, 0f00000000; // 0
|
||||
mov.f32 %f14, %f13;
|
||||
mov.f32 %f15, 0f00000000; // 0
|
||||
mov.f32 %f16, %f15;
|
||||
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
|
||||
cvt.s32.u32 %r2, %tid.x;
|
||||
div.s32 %r3, %r2, %r1;
|
||||
cvt.s32.u32 %r4, %ntid.x;
|
||||
div.s32 %r5, %r4, %r1;
|
||||
cvt.s32.u32 %r6, %ctaid.x;
|
||||
mul.lo.s32 %r7, %r6, %r5;
|
||||
add.s32 %r8, %r3, %r7;
|
||||
ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];
|
||||
setp.ge.s32 %p1, %r8, %r9;
|
||||
@%p1 bra $Lt_0_26370;
|
||||
.loc 16 51 0
|
||||
ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];
|
||||
cvt.s64.s32 %rd2, %r10;
|
||||
mul.wide.s32 %rd3, %r10, 4;
|
||||
cvt.s64.s32 %rd4, %r8;
|
||||
mul.wide.s32 %rd5, %r8, 4;
|
||||
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
|
||||
add.u64 %rd7, %rd5, %rd6;
|
||||
add.u64 %rd8, %rd3, %rd7;
|
||||
ld.global.s32 %r11, [%rd8+0];
|
||||
sub.s32 %r12, %r1, 1;
|
||||
and.b32 %r13, %r12, %r2;
|
||||
cvt.s64.s32 %rd9, %r13;
|
||||
mul.wide.s32 %rd10, %r13, 4;
|
||||
ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];
|
||||
setp.ne.u64 %p2, %rd11, %rd6;
|
||||
@%p2 bra $Lt_0_19458;
|
||||
cvt.s32.s64 %r14, %rd2;
|
||||
mul.lo.s32 %r15, %r14, %r1;
|
||||
mov.s32 %r16, %r15;
|
||||
mul.lo.s32 %r17, %r12, %r8;
|
||||
add.s32 %r18, %r14, %r17;
|
||||
cvt.s64.s32 %rd12, %r18;
|
||||
mul.wide.s32 %rd13, %r18, 4;
|
||||
add.u64 %rd14, %rd8, %rd13;
|
||||
and.b32 %r19, %r12, %r11;
|
||||
cvt.s64.s32 %rd15, %r19;
|
||||
div.s32 %r20, %r11, %r1;
|
||||
mul.lo.s32 %r21, %r15, %r20;
|
||||
cvt.s64.s32 %rd16, %r21;
|
||||
add.u64 %rd17, %rd15, %rd16;
|
||||
mul.lo.u64 %rd18, %rd17, 4;
|
||||
add.u64 %rd19, %rd14, %rd18;
|
||||
add.u64 %rd20, %rd10, %rd14;
|
||||
bra.uni $Lt_0_19202;
|
||||
$Lt_0_19458:
|
||||
add.u64 %rd21, %rd3, %rd8;
|
||||
ld.global.s32 %r22, [%rd21+0];
|
||||
cvt.s64.s32 %rd22, %r22;
|
||||
mul.wide.s32 %rd23, %r22, 4;
|
||||
add.u64 %rd24, %rd11, %rd23;
|
||||
cvt.s64.s32 %rd25, %r11;
|
||||
mul.wide.s32 %rd26, %r11, 4;
|
||||
add.u64 %rd19, %rd24, %rd26;
|
||||
mov.s32 %r16, %r1;
|
||||
add.u64 %rd20, %rd10, %rd24;
|
||||
$Lt_0_19202:
|
||||
.loc 16 54 0
|
||||
ld.global.s32 %r23, [%rd7+0];
|
||||
mov.u32 %r24, %r23;
|
||||
mov.s32 %r25, 0;
|
||||
mov.u32 %r26, %r25;
|
||||
mov.s32 %r27, 0;
|
||||
mov.u32 %r28, %r27;
|
||||
mov.s32 %r29, 0;
|
||||
mov.u32 %r30, %r29;
|
||||
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];
|
||||
mov.f32 %f21, %f17;
|
||||
mov.f32 %f22, %f18;
|
||||
mov.f32 %f23, %f19;
|
||||
mov.f32 %f24, %f20;
|
||||
setp.ge.u64 %p3, %rd20, %rd19;
|
||||
@%p3 bra $Lt_0_27906;
|
||||
cvt.rzi.ftz.s32.f32 %r31, %f24;
|
||||
cvt.s64.s32 %rd27, %r16;
|
||||
ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];
|
||||
mul.lo.s32 %r33, %r32, %r31;
|
||||
ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];
|
||||
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;
|
||||
$Lt_0_20226:
|
||||
//<loop> Loop body line 54, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 60 0
|
||||
ld.global.s32 %r34, [%rd20+0];
|
||||
.loc 16 61 0
|
||||
shr.s32 %r35, %r34, 30;
|
||||
and.b32 %r36, %r35, 3;
|
||||
cvt.s64.s32 %rd30, %r36;
|
||||
mul.wide.s32 %rd31, %r36, 4;
|
||||
add.u64 %rd32, %rd29, %rd31;
|
||||
ld.shared.f32 %f29, [%rd32+0];
|
||||
.loc 16 64 0
|
||||
and.b32 %r37, %r34, 1073741823;
|
||||
mov.u32 %r38, %r37;
|
||||
mov.s32 %r39, 0;
|
||||
mov.u32 %r40, %r39;
|
||||
mov.s32 %r41, 0;
|
||||
mov.u32 %r42, %r41;
|
||||
mov.s32 %r43, 0;
|
||||
mov.u32 %r44, %r43;
|
||||
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];
|
||||
mov.f32 %f34, %f30;
|
||||
mov.f32 %f35, %f31;
|
||||
mov.f32 %f36, %f32;
|
||||
mov.f32 %f37, %f33;
|
||||
cvt.rzi.ftz.s32.f32 %r45, %f37;
|
||||
sub.ftz.f32 %f38, %f22, %f35;
|
||||
sub.ftz.f32 %f39, %f21, %f34;
|
||||
sub.ftz.f32 %f40, %f23, %f36;
|
||||
mul.ftz.f32 %f41, %f38, %f38;
|
||||
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
|
||||
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
|
||||
add.s32 %r46, %r45, %r33;
|
||||
cvt.s64.s32 %rd33, %r46;
|
||||
mul.wide.s32 %rd34, %r46, 16;
|
||||
add.u64 %rd35, %rd34, %rd28;
|
||||
ld.global.f32 %f44, [%rd35+8];
|
||||
setp.gt.ftz.f32 %p4, %f44, %f43;
|
||||
@!%p4 bra $Lt_0_21506;
|
||||
.loc 16 79 0
|
||||
rcp.approx.ftz.f32 %f45, %f43;
|
||||
mul.ftz.f32 %f46, %f45, %f45;
|
||||
mul.ftz.f32 %f47, %f45, %f46;
|
||||
sqrt.approx.ftz.f32 %f48, %f47;
|
||||
mul.ftz.f32 %f49, %f45, %f47;
|
||||
ld.global.v2.f32 {%f50,%f51}, [%rd35+0];
|
||||
mul.ftz.f32 %f52, %f50, %f48;
|
||||
sub.ftz.f32 %f53, %f52, %f51;
|
||||
mul.ftz.f32 %f54, %f49, %f53;
|
||||
mul.ftz.f32 %f55, %f29, %f54;
|
||||
.loc 16 81 0
|
||||
fma.rn.ftz.f32 %f27, %f39, %f55, %f27;
|
||||
.loc 16 82 0
|
||||
fma.rn.ftz.f32 %f26, %f38, %f55, %f26;
|
||||
.loc 16 83 0
|
||||
fma.rn.ftz.f32 %f25, %f40, %f55, %f25;
|
||||
ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];
|
||||
mov.u32 %r48, 0;
|
||||
setp.le.s32 %p5, %r47, %r48;
|
||||
@%p5 bra $Lt_0_20994;
|
||||
.loc 16 87 0
|
||||
ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];
|
||||
add.u64 %rd37, %rd36, %rd34;
|
||||
ld.global.v4.f32 {%f56,%f57,%f58,_}, [%rd37+0];
|
||||
mul.ftz.f32 %f59, %f56, %f48;
|
||||
sub.ftz.f32 %f60, %f59, %f57;
|
||||
mul.ftz.f32 %f61, %f47, %f60;
|
||||
sub.ftz.f32 %f62, %f61, %f58;
|
||||
fma.rn.ftz.f32 %f28, %f29, %f62, %f28;
|
||||
$Lt_0_20994:
|
||||
ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r50, 0;
|
||||
setp.le.s32 %p6, %r49, %r50;
|
||||
@%p6 bra $Lt_0_21506;
|
||||
.loc 16 90 0
|
||||
mov.f32 %f63, %f6;
|
||||
mul.ftz.f32 %f64, %f39, %f39;
|
||||
fma.rn.ftz.f32 %f65, %f55, %f64, %f63;
|
||||
mov.f32 %f6, %f65;
|
||||
.loc 16 91 0
|
||||
mov.f32 %f66, %f8;
|
||||
fma.rn.ftz.f32 %f67, %f55, %f41, %f66;
|
||||
mov.f32 %f8, %f67;
|
||||
.loc 16 92 0
|
||||
mov.f32 %f68, %f10;
|
||||
mul.ftz.f32 %f69, %f40, %f40;
|
||||
fma.rn.ftz.f32 %f70, %f55, %f69, %f68;
|
||||
mov.f32 %f10, %f70;
|
||||
.loc 16 93 0
|
||||
mov.f32 %f71, %f12;
|
||||
mul.ftz.f32 %f72, %f38, %f39;
|
||||
fma.rn.ftz.f32 %f73, %f55, %f72, %f71;
|
||||
mov.f32 %f12, %f73;
|
||||
.loc 16 94 0
|
||||
mov.f32 %f74, %f14;
|
||||
mul.ftz.f32 %f75, %f39, %f40;
|
||||
fma.rn.ftz.f32 %f76, %f55, %f75, %f74;
|
||||
mov.f32 %f14, %f76;
|
||||
.loc 16 95 0
|
||||
mul.ftz.f32 %f77, %f38, %f40;
|
||||
fma.rn.ftz.f32 %f15, %f55, %f77, %f15;
|
||||
mov.f32 %f16, %f15;
|
||||
$Lt_0_21506:
|
||||
$Lt_0_20482:
|
||||
.loc 16 58 0
|
||||
mul.lo.u64 %rd38, %rd27, 4;
|
||||
add.u64 %rd20, %rd20, %rd38;
|
||||
setp.lt.u64 %p7, %rd20, %rd19;
|
||||
@%p7 bra $Lt_0_20226;
|
||||
bra.uni $Lt_0_19714;
|
||||
$Lt_0_27906:
|
||||
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
$Lt_0_19714:
|
||||
mov.u32 %r51, 1;
|
||||
setp.le.s32 %p8, %r1, %r51;
|
||||
@%p8 bra $Lt_0_24322;
|
||||
.loc 16 100 0
|
||||
mov.u64 %rd39, __cuda___cuda_local_var_32601_55_non_const_red_acc108;
|
||||
cvt.s64.s32 %rd40, %r2;
|
||||
mul.wide.s32 %rd41, %r2, 4;
|
||||
add.u64 %rd42, %rd39, %rd41;
|
||||
mov.f32 %f78, %f27;
|
||||
st.shared.f32 [%rd42+0], %f78;
|
||||
mov.f32 %f79, %f26;
|
||||
st.shared.f32 [%rd42+512], %f79;
|
||||
mov.f32 %f80, %f25;
|
||||
st.shared.f32 [%rd42+1024], %f80;
|
||||
mov.f32 %f81, %f28;
|
||||
st.shared.f32 [%rd42+1536], %f81;
|
||||
shr.s32 %r52, %r1, 31;
|
||||
mov.s32 %r53, 1;
|
||||
and.b32 %r54, %r52, %r53;
|
||||
add.s32 %r55, %r54, %r1;
|
||||
shr.s32 %r56, %r55, 1;
|
||||
mov.s32 %r57, %r56;
|
||||
mov.u32 %r58, 0;
|
||||
setp.ne.u32 %p9, %r56, %r58;
|
||||
@!%p9 bra $Lt_0_22786;
|
||||
$Lt_0_23298:
|
||||
setp.ge.u32 %p10, %r13, %r57;
|
||||
@%p10 bra $Lt_0_23554;
|
||||
add.u32 %r59, %r2, %r57;
|
||||
cvt.u64.u32 %rd43, %r59;
|
||||
mul.wide.u32 %rd44, %r59, 4;
|
||||
add.u64 %rd45, %rd39, %rd44;
|
||||
ld.shared.f32 %f82, [%rd45+0];
|
||||
add.ftz.f32 %f78, %f82, %f78;
|
||||
st.shared.f32 [%rd42+0], %f78;
|
||||
ld.shared.f32 %f83, [%rd45+512];
|
||||
add.ftz.f32 %f79, %f83, %f79;
|
||||
st.shared.f32 [%rd42+512], %f79;
|
||||
ld.shared.f32 %f84, [%rd45+1024];
|
||||
add.ftz.f32 %f80, %f84, %f80;
|
||||
st.shared.f32 [%rd42+1024], %f80;
|
||||
ld.shared.f32 %f85, [%rd45+1536];
|
||||
add.ftz.f32 %f81, %f85, %f81;
|
||||
st.shared.f32 [%rd42+1536], %f81;
|
||||
$Lt_0_23554:
|
||||
shr.u32 %r57, %r57, 1;
|
||||
mov.u32 %r60, 0;
|
||||
setp.ne.u32 %p11, %r57, %r60;
|
||||
@%p11 bra $Lt_0_23298;
|
||||
$Lt_0_22786:
|
||||
mov.f32 %f27, %f78;
|
||||
mov.f32 %f26, %f79;
|
||||
mov.f32 %f25, %f80;
|
||||
mov.f32 %f28, %f81;
|
||||
ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r62, 0;
|
||||
setp.le.s32 %p12, %r61, %r62;
|
||||
@%p12 bra $Lt_0_24322;
|
||||
mov.f32 %f78, %f6;
|
||||
st.shared.f32 [%rd42+0], %f78;
|
||||
mov.f32 %f79, %f8;
|
||||
st.shared.f32 [%rd42+512], %f79;
|
||||
mov.f32 %f80, %f10;
|
||||
st.shared.f32 [%rd42+1024], %f80;
|
||||
mov.f32 %f81, %f12;
|
||||
st.shared.f32 [%rd42+1536], %f81;
|
||||
mov.f32 %f86, %f14;
|
||||
st.shared.f32 [%rd42+2048], %f86;
|
||||
mov.f32 %f87, %f15;
|
||||
st.shared.f32 [%rd42+2560], %f87;
|
||||
mov.s32 %r63, %r56;
|
||||
@!%p9 bra $Lt_0_24834;
|
||||
$Lt_0_25346:
|
||||
setp.ge.u32 %p13, %r13, %r63;
|
||||
@%p13 bra $Lt_0_25602;
|
||||
add.u32 %r64, %r2, %r63;
|
||||
cvt.u64.u32 %rd46, %r64;
|
||||
mul.wide.u32 %rd47, %r64, 4;
|
||||
add.u64 %rd48, %rd39, %rd47;
|
||||
ld.shared.f32 %f88, [%rd48+0];
|
||||
add.ftz.f32 %f78, %f88, %f78;
|
||||
st.shared.f32 [%rd42+0], %f78;
|
||||
ld.shared.f32 %f89, [%rd48+512];
|
||||
add.ftz.f32 %f79, %f89, %f79;
|
||||
st.shared.f32 [%rd42+512], %f79;
|
||||
ld.shared.f32 %f90, [%rd48+1024];
|
||||
add.ftz.f32 %f80, %f90, %f80;
|
||||
st.shared.f32 [%rd42+1024], %f80;
|
||||
ld.shared.f32 %f91, [%rd48+1536];
|
||||
add.ftz.f32 %f81, %f91, %f81;
|
||||
st.shared.f32 [%rd42+1536], %f81;
|
||||
ld.shared.f32 %f92, [%rd48+2048];
|
||||
add.ftz.f32 %f86, %f92, %f86;
|
||||
st.shared.f32 [%rd42+2048], %f86;
|
||||
ld.shared.f32 %f93, [%rd48+2560];
|
||||
add.ftz.f32 %f87, %f93, %f87;
|
||||
st.shared.f32 [%rd42+2560], %f87;
|
||||
$Lt_0_25602:
|
||||
shr.u32 %r63, %r63, 1;
|
||||
mov.u32 %r65, 0;
|
||||
setp.ne.u32 %p14, %r63, %r65;
|
||||
@%p14 bra $Lt_0_25346;
|
||||
$Lt_0_24834:
|
||||
mov.f32 %f6, %f78;
|
||||
mov.f32 %f8, %f79;
|
||||
mov.f32 %f10, %f80;
|
||||
mov.f32 %f12, %f81;
|
||||
mov.f32 %f14, %f86;
|
||||
mov.f32 %f16, %f87;
|
||||
$Lt_0_24322:
|
||||
$Lt_0_22274:
|
||||
mov.u32 %r66, 0;
|
||||
setp.ne.s32 %p15, %r13, %r66;
|
||||
@%p15 bra $Lt_0_26370;
|
||||
ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];
|
||||
add.u64 %rd50, %rd49, %rd5;
|
||||
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
|
||||
mov.u32 %r68, 0;
|
||||
setp.le.s32 %p16, %r67, %r68;
|
||||
@%p16 bra $Lt_0_26882;
|
||||
st.global.f32 [%rd50+0], %f28;
|
||||
cvt.s64.s32 %rd51, %r9;
|
||||
mul.wide.s32 %rd52, %r9, 4;
|
||||
add.u64 %rd50, %rd50, %rd52;
|
||||
$Lt_0_26882:
|
||||
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p17, %r69, %r70;
|
||||
@%p17 bra $Lt_0_27394;
|
||||
mov.f32 %f94, %f6;
|
||||
st.global.f32 [%rd50+0], %f94;
|
||||
cvt.s64.s32 %rd53, %r9;
|
||||
mul.wide.s32 %rd54, %r9, 4;
|
||||
add.u64 %rd55, %rd54, %rd50;
|
||||
mov.f32 %f95, %f8;
|
||||
st.global.f32 [%rd55+0], %f95;
|
||||
add.u64 %rd56, %rd54, %rd55;
|
||||
mov.f32 %f96, %f10;
|
||||
st.global.f32 [%rd56+0], %f96;
|
||||
add.u64 %rd57, %rd54, %rd56;
|
||||
mov.f32 %f97, %f12;
|
||||
st.global.f32 [%rd57+0], %f97;
|
||||
add.u64 %rd50, %rd54, %rd57;
|
||||
mov.f32 %f98, %f14;
|
||||
st.global.f32 [%rd50+0], %f98;
|
||||
mov.f32 %f99, %f16;
|
||||
add.u64 %rd58, %rd54, %rd50;
|
||||
st.global.f32 [%rd58+0], %f99;
|
||||
$Lt_0_27394:
|
||||
ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];
|
||||
mul.lo.u64 %rd60, %rd4, 16;
|
||||
add.u64 %rd61, %rd59, %rd60;
|
||||
mov.f32 %f100, %f101;
|
||||
st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f100};
|
||||
$Lt_0_26370:
|
||||
$Lt_0_18690:
|
||||
.loc 16 103 0
|
||||
exit;
|
||||
$LDWend_kernel_pair:
|
||||
} // kernel_pair
|
||||
|
||||
// kernel_pair_fast: machine-generated PTX (the .loc directives give the
// originating source lines). Per-atom pair kernel that stages its coefficient
// tables in shared memory before looping over each atom's neighbor list.
//
// Parameters, as used by the body below:
//   x_              - not referenced in this body; positions come from pos_tex
//   lj1_in, lj3_in  - global tables of 16-byte entries; threads with tid.x <= 120
//                     each copy one entry to shared memory (lj3_in only if eflag > 0)
//   sp_lj_in        - four f32 scale factors, copied to shared memory by tid.x <= 3
//   dev_nbor        - neighbor bookkeeping rows, nbor_pitch elements apart
//   dev_packed      - neighbor indices; equal to dev_nbor selects the strided layout
//   ans             - per-atom 16-byte output {f0, f1, f2, unset}
//   __val_paramengv - per-atom energy / six-term output, rows inum elements apart
//   eflag, vflag    - > 0 enables energy / six-term accumulation and stores
//   inum            - work items whose atom index is >= inum exit immediately
//   t_per_atom      - number of threads cooperating on one atom
//
// Hand-applied fix relative to the generated code: the pair force is now scaled
// by the sp_lj factor selected by the top two bits of the neighbor word, the
// same way the sibling kernel_pair scales its force. Previously the factor was
// applied to the energy only.
.entry kernel_pair_fast (
|
||||
.param .u64 __cudaparm_kernel_pair_fast_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_ans,
|
||||
.param .u64 __cudaparm_kernel_pair_fast___val_paramengv,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<74>;
|
||||
.reg .u64 %rd<75>;
|
||||
.reg .f32 %f<109>;
|
||||
.reg .pred %p<22>;
|
||||
// Shared staging: sp_lj = 4 f32; lj1 / lj3 = 121 entries of 16 bytes each;
// red_acc = six 512-byte slots used by the per-atom reduction further down.
.shared .align 4 .b8 __cuda___cuda_local_var_32618_33_non_const_sp_lj3268[16];
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_lj13296[1936];
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32617_34_non_const_lj35232[1936];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32687_55_non_const_red_acc7168[3072];
|
||||
// __cuda_local_var_32628_10_non_const_f = 48
|
||||
// __cuda_local_var_32630_9_non_const_virial = 16
|
||||
.loc 16 111 0
|
||||
$LDWbegin_kernel_pair_fast:
|
||||
// %r1 = tid.x. Threads with tid.x <= 3 copy sp_lj_in[tid.x] into shared sp_lj.
cvt.s32.u32 %r1, %tid.x;
|
||||
mov.u32 %r2, 3;
|
||||
setp.gt.s32 %p1, %r1, %r2;
|
||||
@%p1 bra $Lt_1_20994;
|
||||
.loc 16 119 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;
|
||||
cvt.s64.s32 %rd2, %r1;
|
||||
mul.wide.s32 %rd3, %r1, 4;
|
||||
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f32 %f1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f32 [%rd6+0], %f1;
|
||||
$Lt_1_20994:
|
||||
// %rd1 = base of shared sp_lj for every thread (used inside the neighbor loop).
mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;
|
||||
// Threads with tid.x <= 120 copy one 16-byte lj1_in entry into shared lj1.
mov.u32 %r3, 120;
|
||||
setp.gt.s32 %p2, %r1, %r3;
|
||||
@%p2 bra $Lt_1_21506;
|
||||
.loc 16 121 0
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_lj13296;
|
||||
cvt.s64.s32 %rd8, %r1;
|
||||
mul.wide.s32 %rd9, %r1, 16;
|
||||
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
|
||||
add.u64 %rd11, %rd10, %rd9;
|
||||
add.u64 %rd12, %rd9, %rd7;
|
||||
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
|
||||
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
|
||||
// The matching lj3_in entry is staged only when eflag > 0.
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r5, 0;
|
||||
setp.le.s32 %p3, %r4, %r5;
|
||||
@%p3 bra $Lt_1_22018;
|
||||
.loc 16 123 0
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232;
|
||||
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
|
||||
add.u64 %rd15, %rd14, %rd9;
|
||||
add.u64 %rd16, %rd9, %rd13;
|
||||
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
|
||||
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
|
||||
$Lt_1_22018:
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232;
|
||||
$Lt_1_21506:
|
||||
// %rd13 / %rd7 = bases of shared lj3 / lj1 on every path.
mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232;
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_lj13296;
|
||||
.loc 16 131 0
|
||||
// Zero the six accumulators that are updated only when vflag > 0
// (%f11, %f13, %f15, %f17, %f19 and %f20/%f21).
mov.f32 %f10, 0f00000000; // 0
|
||||
mov.f32 %f11, %f10;
|
||||
mov.f32 %f12, 0f00000000; // 0
|
||||
mov.f32 %f13, %f12;
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
mov.f32 %f15, %f14;
|
||||
mov.f32 %f16, 0f00000000; // 0
|
||||
mov.f32 %f17, %f16;
|
||||
mov.f32 %f18, 0f00000000; // 0
|
||||
mov.f32 %f19, %f18;
|
||||
mov.f32 %f20, 0f00000000; // 0
|
||||
mov.f32 %f21, %f20;
|
||||
.loc 16 133 0
|
||||
// Barrier: the shared tables written above are read by other threads below.
bar.sync 0;
|
||||
// %r12 = atom index = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom).
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
|
||||
div.s32 %r7, %r1, %r6;
|
||||
cvt.s32.u32 %r8, %ntid.x;
|
||||
div.s32 %r9, %r8, %r6;
|
||||
cvt.s32.u32 %r10, %ctaid.x;
|
||||
mul.lo.s32 %r11, %r10, %r9;
|
||||
add.s32 %r12, %r7, %r11;
|
||||
ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];
|
||||
setp.ge.s32 %p4, %r12, %r13;
|
||||
@%p4 bra $Lt_1_30210;
|
||||
.loc 16 138 0
|
||||
// %rd22 = &dev_nbor[atom]; %rd23 = one nbor_pitch row further; %r15 = value
// read there (it bounds the neighbor loop below).
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];
|
||||
cvt.s64.s32 %rd17, %r14;
|
||||
mul.wide.s32 %rd18, %r14, 4;
|
||||
cvt.s64.s32 %rd19, %r12;
|
||||
mul.wide.s32 %rd20, %r12, 4;
|
||||
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
|
||||
add.u64 %rd22, %rd20, %rd21;
|
||||
add.u64 %rd23, %rd18, %rd22;
|
||||
ld.global.s32 %r15, [%rd23+0];
|
||||
// %r17 = tid.x & (t_per_atom - 1): this thread's offset within its atom group.
// NOTE(review): equals tid.x % t_per_atom only for power-of-two t_per_atom - confirm.
sub.s32 %r16, %r6, 1;
|
||||
and.b32 %r17, %r16, %r1;
|
||||
cvt.s64.s32 %rd24, %r17;
|
||||
mul.wide.s32 %rd25, %r17, 4;
|
||||
ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];
|
||||
setp.ne.u64 %p5, %rd26, %rd21;
|
||||
@%p5 bra $Lt_1_23298;
|
||||
// dev_packed == dev_nbor: neighbors live inside dev_nbor. The per-iteration
// stride %r20 is nbor_pitch * t_per_atom; %rd35 = first entry for this thread,
// %rd34 = end of the list.
cvt.s32.s64 %r18, %rd17;
|
||||
mul.lo.s32 %r19, %r18, %r6;
|
||||
mov.s32 %r20, %r19;
|
||||
mul.lo.s32 %r21, %r16, %r12;
|
||||
add.s32 %r22, %r18, %r21;
|
||||
cvt.s64.s32 %rd27, %r22;
|
||||
mul.wide.s32 %rd28, %r22, 4;
|
||||
add.u64 %rd29, %rd23, %rd28;
|
||||
and.b32 %r23, %r16, %r15;
|
||||
cvt.s64.s32 %rd30, %r23;
|
||||
div.s32 %r24, %r15, %r6;
|
||||
mul.lo.s32 %r25, %r19, %r24;
|
||||
cvt.s64.s32 %rd31, %r25;
|
||||
add.u64 %rd32, %rd30, %rd31;
|
||||
mul.lo.u64 %rd33, %rd32, 4;
|
||||
add.u64 %rd34, %rd29, %rd33;
|
||||
add.u64 %rd35, %rd25, %rd29;
|
||||
bra.uni $Lt_1_23042;
|
||||
$Lt_1_23298:
|
||||
// dev_packed != dev_nbor: the next dev_nbor row holds an offset into
// dev_packed; the list is contiguous there and the stride %r20 is t_per_atom.
add.u64 %rd36, %rd18, %rd23;
|
||||
ld.global.s32 %r26, [%rd36+0];
|
||||
cvt.s64.s32 %rd37, %r26;
|
||||
mul.wide.s32 %rd38, %r26, 4;
|
||||
add.u64 %rd39, %rd26, %rd38;
|
||||
cvt.s64.s32 %rd40, %r15;
|
||||
mul.wide.s32 %rd41, %r15, 4;
|
||||
add.u64 %rd34, %rd39, %rd41;
|
||||
mov.s32 %r20, %r6;
|
||||
add.u64 %rd35, %rd25, %rd39;
|
||||
$Lt_1_23042:
|
||||
.loc 16 141 0
|
||||
// Fetch this atom's texel from pos_tex at the index stored in dev_nbor[atom]:
// %f26, %f27, %f28 = components 0..2, %f29 = component 3.
ld.global.s32 %r27, [%rd22+0];
|
||||
mov.u32 %r28, %r27;
|
||||
mov.s32 %r29, 0;
|
||||
mov.u32 %r30, %r29;
|
||||
mov.s32 %r31, 0;
|
||||
mov.u32 %r32, %r31;
|
||||
mov.s32 %r33, 0;
|
||||
mov.u32 %r34, %r33;
|
||||
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];
|
||||
mov.f32 %f26, %f22;
|
||||
mov.f32 %f27, %f23;
|
||||
mov.f32 %f28, %f24;
|
||||
mov.f32 %f29, %f25;
|
||||
// Skip the loop entirely when this thread's start is already at/after the end.
setp.ge.u64 %p6, %rd35, %rd34;
|
||||
@%p6 bra $Lt_1_31746;
|
||||
// %f30 = 11 * trunc(component 3): row offset into the 121-entry shared tables.
cvt.rzi.ftz.s32.f32 %r35, %f29;
|
||||
cvt.s64.s32 %rd42, %r20;
|
||||
mul.lo.s32 %r36, %r35, 11;
|
||||
cvt.rn.f32.s32 %f30, %r36;
|
||||
// Per-thread partial sums: %f33, %f32, %f31 = force components, %f34 = energy.
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
$Lt_1_24066:
|
||||
//<loop> Loop body line 141, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 148 0
|
||||
// %r37 = packed neighbor word.
ld.global.s32 %r37, [%rd35+0];
|
||||
.loc 16 152 0
|
||||
// Low 30 bits (mask 1073741823 = 0x3FFFFFFF) index the neighbor in pos_tex.
and.b32 %r38, %r37, 1073741823;
|
||||
mov.u32 %r39, %r38;
|
||||
mov.s32 %r40, 0;
|
||||
mov.u32 %r41, %r40;
|
||||
mov.s32 %r42, 0;
|
||||
mov.u32 %r43, %r42;
|
||||
mov.s32 %r44, 0;
|
||||
mov.u32 %r45, %r44;
|
||||
tex.1d.v4.f32.s32 {%f35,%f36,%f37,%f38},[pos_tex,{%r39,%r41,%r43,%r45}];
|
||||
mov.f32 %f39, %f35;
|
||||
mov.f32 %f40, %f36;
|
||||
mov.f32 %f41, %f37;
|
||||
mov.f32 %f42, %f38;
|
||||
// Component deltas %f44, %f43, %f45 and their squared length %f48.
sub.ftz.f32 %f43, %f27, %f40;
|
||||
sub.ftz.f32 %f44, %f26, %f39;
|
||||
sub.ftz.f32 %f45, %f28, %f41;
|
||||
mul.ftz.f32 %f46, %f43, %f43;
|
||||
fma.rn.ftz.f32 %f47, %f44, %f44, %f46;
|
||||
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
|
||||
// Table entry index = trunc(%f30 + neighbor component 3); %rd45 = &lj1[index].
add.ftz.f32 %f49, %f30, %f42;
|
||||
cvt.rzi.ftz.s32.f32 %r46, %f49;
|
||||
cvt.s64.s32 %rd43, %r46;
|
||||
mul.wide.s32 %rd44, %r46, 16;
|
||||
add.u64 %rd45, %rd44, %rd7;
|
||||
// Only pairs with squared distance below the entry's third field contribute.
ld.shared.f32 %f50, [%rd45+8];
|
||||
setp.gt.ftz.f32 %p7, %f50, %f48;
|
||||
@!%p7 bra $Lt_1_25346;
|
||||
// FIX: load the scale factor sp_lj[(neighbor word >> 30) & 3] for every pair
// that passes the cutoff, so it can scale the force below. The generated code
// loaded it only inside the eflag > 0 branch and applied it to the energy only.
shr.s32 %r49, %r37, 30;
|
||||
and.b32 %r50, %r49, 3;
|
||||
mul.wide.s32 %rd48, %r50, 4;
|
||||
add.u64 %rd49, %rd1, %rd48;
|
||||
ld.shared.f32 %f67, [%rd49+0];
|
||||
.loc 16 165 0
|
||||
// %f51 = 1/%f48, %f53 = %f51^3, %f54 = sqrt(%f53);
// %f60 = %f51 * %f53 * (lj1.x * %f54 - lj1.y).
rcp.approx.ftz.f32 %f51, %f48;
|
||||
mul.ftz.f32 %f52, %f51, %f51;
|
||||
mul.ftz.f32 %f53, %f51, %f52;
|
||||
sqrt.approx.ftz.f32 %f54, %f53;
|
||||
mul.ftz.f32 %f55, %f51, %f53;
|
||||
ld.shared.v2.f32 {%f56,%f57}, [%rd45+0];
|
||||
mul.ftz.f32 %f58, %f56, %f54;
|
||||
sub.ftz.f32 %f59, %f58, %f57;
|
||||
mul.ftz.f32 %f60, %f55, %f59;
|
||||
// FIX: scale the force by the sp_lj factor (mirrors kernel_pair).
mul.ftz.f32 %f60, %f67, %f60;
|
||||
.loc 16 167 0
|
||||
fma.rn.ftz.f32 %f33, %f44, %f60, %f33;
|
||||
.loc 16 168 0
|
||||
fma.rn.ftz.f32 %f32, %f43, %f60, %f32;
|
||||
.loc 16 169 0
|
||||
fma.rn.ftz.f32 %f31, %f45, %f60, %f31;
|
||||
ld.param.s32 %r47, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r48, 0;
|
||||
setp.le.s32 %p8, %r47, %r48;
|
||||
@%p8 bra $Lt_1_24834;
|
||||
.loc 16 172 0
|
||||
// Energy term: %f34 += factor * (%f53 * (lj3.x * %f54 - lj3.y) - lj3.z).
add.u64 %rd46, %rd44, %rd13;
|
||||
ld.shared.v4.f32 {%f61,%f62,%f63,_}, [%rd46+0];
|
||||
mul.ftz.f32 %f64, %f61, %f54;
|
||||
sub.ftz.f32 %f65, %f64, %f62;
|
||||
mul.ftz.f32 %f66, %f53, %f65;
|
||||
.loc 16 173 0
|
||||
// Original factor load, left untouched; it recomputes the same %f67 as above.
shr.s32 %r49, %r37, 30;
|
||||
and.b32 %r50, %r49, 3;
|
||||
cvt.s64.s32 %rd47, %r50;
|
||||
mul.wide.s32 %rd48, %r50, 4;
|
||||
add.u64 %rd49, %rd1, %rd48;
|
||||
ld.shared.f32 %f67, [%rd49+0];
|
||||
sub.ftz.f32 %f68, %f66, %f63;
|
||||
fma.rn.ftz.f32 %f34, %f67, %f68, %f34;
|
||||
$Lt_1_24834:
|
||||
ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r52, 0;
|
||||
setp.le.s32 %p9, %r51, %r52;
|
||||
@%p9 bra $Lt_1_25346;
|
||||
.loc 16 176 0
|
||||
// vflag > 0: accumulate force * (delta_a * delta_b) for the six component pairs.
mov.f32 %f69, %f11;
|
||||
mul.ftz.f32 %f70, %f44, %f44;
|
||||
fma.rn.ftz.f32 %f71, %f60, %f70, %f69;
|
||||
mov.f32 %f11, %f71;
|
||||
.loc 16 177 0
|
||||
mov.f32 %f72, %f13;
|
||||
fma.rn.ftz.f32 %f73, %f60, %f46, %f72;
|
||||
mov.f32 %f13, %f73;
|
||||
.loc 16 178 0
|
||||
mov.f32 %f74, %f15;
|
||||
mul.ftz.f32 %f75, %f45, %f45;
|
||||
fma.rn.ftz.f32 %f76, %f60, %f75, %f74;
|
||||
mov.f32 %f15, %f76;
|
||||
.loc 16 179 0
|
||||
mov.f32 %f77, %f17;
|
||||
mul.ftz.f32 %f78, %f43, %f44;
|
||||
fma.rn.ftz.f32 %f79, %f60, %f78, %f77;
|
||||
mov.f32 %f17, %f79;
|
||||
.loc 16 180 0
|
||||
mov.f32 %f80, %f19;
|
||||
mul.ftz.f32 %f81, %f44, %f45;
|
||||
fma.rn.ftz.f32 %f82, %f60, %f81, %f80;
|
||||
mov.f32 %f19, %f82;
|
||||
.loc 16 181 0
|
||||
mul.ftz.f32 %f83, %f43, %f45;
|
||||
fma.rn.ftz.f32 %f20, %f60, %f83, %f20;
|
||||
mov.f32 %f21, %f20;
|
||||
$Lt_1_25346:
|
||||
$Lt_1_24322:
|
||||
.loc 16 146 0
|
||||
// Advance by the stride (%rd42 elements of 4 bytes) and loop while before the end.
mul.lo.u64 %rd50, %rd42, 4;
|
||||
add.u64 %rd35, %rd35, %rd50;
|
||||
setp.lt.u64 %p10, %rd35, %rd34;
|
||||
@%p10 bra $Lt_1_24066;
|
||||
bra.uni $Lt_1_23554;
|
||||
$Lt_1_31746:
|
||||
// Empty-list path: the partial sums still have to be defined.
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
$Lt_1_23554:
|
||||
// The reduction is needed only when more than one thread serves each atom.
mov.u32 %r53, 1;
|
||||
setp.le.s32 %p11, %r6, %r53;
|
||||
@%p11 bra $Lt_1_28162;
|
||||
.loc 16 186 0
|
||||
// %rd54 = &red_acc[tid.x]; the three force sums and the energy go into slots
// 512 bytes apart.
mov.u64 %rd51, __cuda___cuda_local_var_32687_55_non_const_red_acc7168;
|
||||
cvt.s64.s32 %rd52, %r1;
|
||||
mul.wide.s32 %rd53, %r1, 4;
|
||||
add.u64 %rd54, %rd51, %rd53;
|
||||
mov.f32 %f84, %f33;
|
||||
st.shared.f32 [%rd54+0], %f84;
|
||||
mov.f32 %f85, %f32;
|
||||
st.shared.f32 [%rd54+512], %f85;
|
||||
mov.f32 %f86, %f31;
|
||||
st.shared.f32 [%rd54+1024], %f86;
|
||||
mov.f32 %f87, %f34;
|
||||
st.shared.f32 [%rd54+1536], %f87;
|
||||
// %r58 = t_per_atom / 2 (signed divide, rounding toward zero) = first step size.
shr.s32 %r54, %r6, 31;
|
||||
mov.s32 %r55, 1;
|
||||
and.b32 %r56, %r54, %r55;
|
||||
add.s32 %r57, %r56, %r6;
|
||||
shr.s32 %r58, %r57, 1;
|
||||
mov.s32 %r59, %r58;
|
||||
mov.u32 %r60, 0;
|
||||
setp.ne.u32 %p12, %r58, %r60;
|
||||
@!%p12 bra $Lt_1_26626;
|
||||
$Lt_1_27138:
|
||||
// Halving reduction: threads whose group offset %r17 is below the step add the
// values stored by thread tid.x + step.
// NOTE(review): no bar.sync separates the steps; presumably the cooperating
// threads are assumed to run in lockstep - confirm.
setp.ge.u32 %p13, %r17, %r59;
|
||||
@%p13 bra $Lt_1_27394;
|
||||
add.u32 %r61, %r1, %r59;
|
||||
cvt.u64.u32 %rd55, %r61;
|
||||
mul.wide.u32 %rd56, %r61, 4;
|
||||
add.u64 %rd57, %rd51, %rd56;
|
||||
ld.shared.f32 %f88, [%rd57+0];
|
||||
add.ftz.f32 %f84, %f88, %f84;
|
||||
st.shared.f32 [%rd54+0], %f84;
|
||||
ld.shared.f32 %f89, [%rd57+512];
|
||||
add.ftz.f32 %f85, %f89, %f85;
|
||||
st.shared.f32 [%rd54+512], %f85;
|
||||
ld.shared.f32 %f90, [%rd57+1024];
|
||||
add.ftz.f32 %f86, %f90, %f86;
|
||||
st.shared.f32 [%rd54+1024], %f86;
|
||||
ld.shared.f32 %f91, [%rd57+1536];
|
||||
add.ftz.f32 %f87, %f91, %f87;
|
||||
st.shared.f32 [%rd54+1536], %f87;
|
||||
$Lt_1_27394:
|
||||
shr.u32 %r59, %r59, 1;
|
||||
mov.u32 %r62, 0;
|
||||
setp.ne.u32 %p14, %r59, %r62;
|
||||
@%p14 bra $Lt_1_27138;
|
||||
$Lt_1_26626:
|
||||
// Copy the reduced force / energy sums back to their working registers.
mov.f32 %f33, %f84;
|
||||
mov.f32 %f32, %f85;
|
||||
mov.f32 %f31, %f86;
|
||||
mov.f32 %f34, %f87;
|
||||
ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r64, 0;
|
||||
setp.le.s32 %p15, %r63, %r64;
|
||||
@%p15 bra $Lt_1_28162;
|
||||
// vflag > 0: run the same reduction over the six vflag accumulators, using all
// six 512-byte slots of red_acc.
mov.f32 %f84, %f11;
|
||||
st.shared.f32 [%rd54+0], %f84;
|
||||
mov.f32 %f85, %f13;
|
||||
st.shared.f32 [%rd54+512], %f85;
|
||||
mov.f32 %f86, %f15;
|
||||
st.shared.f32 [%rd54+1024], %f86;
|
||||
mov.f32 %f87, %f17;
|
||||
st.shared.f32 [%rd54+1536], %f87;
|
||||
mov.f32 %f92, %f19;
|
||||
st.shared.f32 [%rd54+2048], %f92;
|
||||
mov.f32 %f93, %f20;
|
||||
st.shared.f32 [%rd54+2560], %f93;
|
||||
mov.s32 %r65, %r58;
|
||||
@!%p12 bra $Lt_1_28674;
|
||||
$Lt_1_29186:
|
||||
setp.ge.u32 %p16, %r17, %r65;
|
||||
@%p16 bra $Lt_1_29442;
|
||||
add.u32 %r66, %r1, %r65;
|
||||
cvt.u64.u32 %rd58, %r66;
|
||||
mul.wide.u32 %rd59, %r66, 4;
|
||||
add.u64 %rd60, %rd51, %rd59;
|
||||
ld.shared.f32 %f94, [%rd60+0];
|
||||
add.ftz.f32 %f84, %f94, %f84;
|
||||
st.shared.f32 [%rd54+0], %f84;
|
||||
ld.shared.f32 %f95, [%rd60+512];
|
||||
add.ftz.f32 %f85, %f95, %f85;
|
||||
st.shared.f32 [%rd54+512], %f85;
|
||||
ld.shared.f32 %f96, [%rd60+1024];
|
||||
add.ftz.f32 %f86, %f96, %f86;
|
||||
st.shared.f32 [%rd54+1024], %f86;
|
||||
ld.shared.f32 %f97, [%rd60+1536];
|
||||
add.ftz.f32 %f87, %f97, %f87;
|
||||
st.shared.f32 [%rd54+1536], %f87;
|
||||
ld.shared.f32 %f98, [%rd60+2048];
|
||||
add.ftz.f32 %f92, %f98, %f92;
|
||||
st.shared.f32 [%rd54+2048], %f92;
|
||||
ld.shared.f32 %f99, [%rd60+2560];
|
||||
add.ftz.f32 %f93, %f99, %f93;
|
||||
st.shared.f32 [%rd54+2560], %f93;
|
||||
$Lt_1_29442:
|
||||
shr.u32 %r65, %r65, 1;
|
||||
mov.u32 %r67, 0;
|
||||
setp.ne.u32 %p17, %r65, %r67;
|
||||
@%p17 bra $Lt_1_29186;
|
||||
$Lt_1_28674:
|
||||
mov.f32 %f11, %f84;
|
||||
mov.f32 %f13, %f85;
|
||||
mov.f32 %f15, %f86;
|
||||
mov.f32 %f17, %f87;
|
||||
mov.f32 %f19, %f92;
|
||||
mov.f32 %f21, %f93;
|
||||
$Lt_1_28162:
|
||||
$Lt_1_26114:
|
||||
// Only the thread with group offset 0 writes the per-atom results.
mov.u32 %r68, 0;
|
||||
setp.ne.s32 %p18, %r17, %r68;
|
||||
@%p18 bra $Lt_1_30210;
|
||||
// %rd62 = &engv[atom]; successive outputs are inum elements apart.
ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];
|
||||
add.u64 %rd62, %rd61, %rd20;
|
||||
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p19, %r69, %r70;
|
||||
@%p19 bra $Lt_1_30722;
|
||||
// eflag > 0: store the energy sum, then step to the next output row.
st.global.f32 [%rd62+0], %f34;
|
||||
cvt.s64.s32 %rd63, %r13;
|
||||
mul.wide.s32 %rd64, %r13, 4;
|
||||
add.u64 %rd62, %rd62, %rd64;
|
||||
$Lt_1_30722:
|
||||
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r72, 0;
|
||||
setp.le.s32 %p20, %r71, %r72;
|
||||
@%p20 bra $Lt_1_31234;
|
||||
// vflag > 0: store the six accumulators in consecutive output rows.
mov.f32 %f100, %f11;
|
||||
st.global.f32 [%rd62+0], %f100;
|
||||
cvt.s64.s32 %rd65, %r13;
|
||||
mul.wide.s32 %rd66, %r13, 4;
|
||||
add.u64 %rd67, %rd66, %rd62;
|
||||
mov.f32 %f101, %f13;
|
||||
st.global.f32 [%rd67+0], %f101;
|
||||
add.u64 %rd68, %rd66, %rd67;
|
||||
mov.f32 %f102, %f15;
|
||||
st.global.f32 [%rd68+0], %f102;
|
||||
add.u64 %rd69, %rd66, %rd68;
|
||||
mov.f32 %f103, %f17;
|
||||
st.global.f32 [%rd69+0], %f103;
|
||||
add.u64 %rd62, %rd66, %rd69;
|
||||
mov.f32 %f104, %f19;
|
||||
st.global.f32 [%rd62+0], %f104;
|
||||
mov.f32 %f105, %f21;
|
||||
add.u64 %rd70, %rd66, %rd62;
|
||||
st.global.f32 [%rd70+0], %f105;
|
||||
$Lt_1_31234:
|
||||
// Store the force as one 16-byte vector at ans + 16 * atom.
ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];
|
||||
mul.lo.u64 %rd72, %rd19, 16;
|
||||
add.u64 %rd73, %rd71, %rd72;
|
||||
// NOTE(review): %f107 is never written in this kernel, so the fourth stored
// component is unspecified; presumably callers ignore it - confirm.
mov.f32 %f106, %f107;
|
||||
st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f106};
|
||||
$Lt_1_30210:
|
||||
$Lt_1_22530:
|
||||
.loc 16 189 0
|
||||
exit;
|
||||
$LDWend_kernel_pair_fast:
|
||||
} // kernel_pair_fast
|
||||
|
||||
@ -1,849 +0,0 @@
|
||||
const char * lj96 =
|
||||
" .version 2.3\n"
|
||||
" .target sm_20\n"
|
||||
" .address_size 64\n"
|
||||
" .global .texref pos_tex;\n"
|
||||
" .entry kernel_pair (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<72>;\n"
|
||||
" .reg .u64 %rd<63>;\n"
|
||||
" .reg .f32 %f<103>;\n"
|
||||
" .reg .pred %p<19>;\n"
|
||||
" .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32601_55_non_const_red_acc108[3072];\n"
|
||||
" .loc 16 31 0\n"
|
||||
"$LDWbegin_kernel_pair:\n"
|
||||
" .loc 16 36 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
|
||||
" ldu.global.f32 %f1, [%rd1+0];\n"
|
||||
" .loc 16 37 0\n"
|
||||
" ld.global.f32 %f2, [%rd1+4];\n"
|
||||
" .loc 16 38 0\n"
|
||||
" ld.global.f32 %f3, [%rd1+8];\n"
|
||||
" .loc 16 39 0\n"
|
||||
" ld.global.f32 %f4, [%rd1+12];\n"
|
||||
" st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n"
|
||||
" .loc 16 46 0\n"
|
||||
" mov.f32 %f5, 0f00000000; \n"
|
||||
" mov.f32 %f6, %f5;\n"
|
||||
" mov.f32 %f7, 0f00000000; \n"
|
||||
" mov.f32 %f8, %f7;\n"
|
||||
" mov.f32 %f9, 0f00000000; \n"
|
||||
" mov.f32 %f10, %f9;\n"
|
||||
" mov.f32 %f11, 0f00000000; \n"
|
||||
" mov.f32 %f12, %f11;\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, %f13;\n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
|
||||
" cvt.s32.u32 %r2, %tid.x;\n"
|
||||
" div.s32 %r3, %r2, %r1;\n"
|
||||
" cvt.s32.u32 %r4, %ntid.x;\n"
|
||||
" div.s32 %r5, %r4, %r1;\n"
|
||||
" cvt.s32.u32 %r6, %ctaid.x;\n"
|
||||
" mul.lo.s32 %r7, %r6, %r5;\n"
|
||||
" add.s32 %r8, %r3, %r7;\n"
|
||||
" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n"
|
||||
" setp.ge.s32 %p1, %r8, %r9;\n"
|
||||
" @%p1 bra $Lt_0_26370;\n"
|
||||
" .loc 16 51 0\n"
|
||||
" ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];\n"
|
||||
" cvt.s64.s32 %rd2, %r10;\n"
|
||||
" mul.wide.s32 %rd3, %r10, 4;\n"
|
||||
" cvt.s64.s32 %rd4, %r8;\n"
|
||||
" mul.wide.s32 %rd5, %r8, 4;\n"
|
||||
" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n"
|
||||
" add.u64 %rd7, %rd5, %rd6;\n"
|
||||
" add.u64 %rd8, %rd3, %rd7;\n"
|
||||
" ld.global.s32 %r11, [%rd8+0];\n"
|
||||
" sub.s32 %r12, %r1, 1;\n"
|
||||
" and.b32 %r13, %r12, %r2;\n"
|
||||
" cvt.s64.s32 %rd9, %r13;\n"
|
||||
" mul.wide.s32 %rd10, %r13, 4;\n"
|
||||
" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n"
|
||||
" setp.ne.u64 %p2, %rd11, %rd6;\n"
|
||||
" @%p2 bra $Lt_0_19458;\n"
|
||||
" cvt.s32.s64 %r14, %rd2;\n"
|
||||
" mul.lo.s32 %r15, %r14, %r1;\n"
|
||||
" mov.s32 %r16, %r15;\n"
|
||||
" mul.lo.s32 %r17, %r12, %r8;\n"
|
||||
" add.s32 %r18, %r14, %r17;\n"
|
||||
" cvt.s64.s32 %rd12, %r18;\n"
|
||||
" mul.wide.s32 %rd13, %r18, 4;\n"
|
||||
" add.u64 %rd14, %rd8, %rd13;\n"
|
||||
" and.b32 %r19, %r12, %r11;\n"
|
||||
" cvt.s64.s32 %rd15, %r19;\n"
|
||||
" div.s32 %r20, %r11, %r1;\n"
|
||||
" mul.lo.s32 %r21, %r15, %r20;\n"
|
||||
" cvt.s64.s32 %rd16, %r21;\n"
|
||||
" add.u64 %rd17, %rd15, %rd16;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" add.u64 %rd19, %rd14, %rd18;\n"
|
||||
" add.u64 %rd20, %rd10, %rd14;\n"
|
||||
" bra.uni $Lt_0_19202;\n"
|
||||
"$Lt_0_19458:\n"
|
||||
" add.u64 %rd21, %rd3, %rd8;\n"
|
||||
" ld.global.s32 %r22, [%rd21+0];\n"
|
||||
" cvt.s64.s32 %rd22, %r22;\n"
|
||||
" mul.wide.s32 %rd23, %r22, 4;\n"
|
||||
" add.u64 %rd24, %rd11, %rd23;\n"
|
||||
" cvt.s64.s32 %rd25, %r11;\n"
|
||||
" mul.wide.s32 %rd26, %r11, 4;\n"
|
||||
" add.u64 %rd19, %rd24, %rd26;\n"
|
||||
" mov.s32 %r16, %r1;\n"
|
||||
" add.u64 %rd20, %rd10, %rd24;\n"
|
||||
"$Lt_0_19202:\n"
|
||||
" .loc 16 54 0\n"
|
||||
" ld.global.s32 %r23, [%rd7+0];\n"
|
||||
" mov.u32 %r24, %r23;\n"
|
||||
" mov.s32 %r25, 0;\n"
|
||||
" mov.u32 %r26, %r25;\n"
|
||||
" mov.s32 %r27, 0;\n"
|
||||
" mov.u32 %r28, %r27;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.u32 %r30, %r29;\n"
|
||||
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n"
|
||||
" mov.f32 %f21, %f17;\n"
|
||||
" mov.f32 %f22, %f18;\n"
|
||||
" mov.f32 %f23, %f19;\n"
|
||||
" mov.f32 %f24, %f20;\n"
|
||||
" setp.ge.u64 %p3, %rd20, %rd19;\n"
|
||||
" @%p3 bra $Lt_0_27906;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r31, %f24;\n"
|
||||
" cvt.s64.s32 %rd27, %r16;\n"
|
||||
" ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];\n"
|
||||
" mul.lo.s32 %r33, %r32, %r31;\n"
|
||||
" ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];\n"
|
||||
" mov.f32 %f25, 0f00000000; \n"
|
||||
" mov.f32 %f26, 0f00000000; \n"
|
||||
" mov.f32 %f27, 0f00000000; \n"
|
||||
" mov.f32 %f28, 0f00000000; \n"
|
||||
" mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;\n"
|
||||
"$Lt_0_20226:\n"
|
||||
" .loc 16 60 0\n"
|
||||
" ld.global.s32 %r34, [%rd20+0];\n"
|
||||
" .loc 16 61 0\n"
|
||||
" shr.s32 %r35, %r34, 30;\n"
|
||||
" and.b32 %r36, %r35, 3;\n"
|
||||
" cvt.s64.s32 %rd30, %r36;\n"
|
||||
" mul.wide.s32 %rd31, %r36, 4;\n"
|
||||
" add.u64 %rd32, %rd29, %rd31;\n"
|
||||
" ld.shared.f32 %f29, [%rd32+0];\n"
|
||||
" .loc 16 64 0\n"
|
||||
" and.b32 %r37, %r34, 1073741823;\n"
|
||||
" mov.u32 %r38, %r37;\n"
|
||||
" mov.s32 %r39, 0;\n"
|
||||
" mov.u32 %r40, %r39;\n"
|
||||
" mov.s32 %r41, 0;\n"
|
||||
" mov.u32 %r42, %r41;\n"
|
||||
" mov.s32 %r43, 0;\n"
|
||||
" mov.u32 %r44, %r43;\n"
|
||||
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];\n"
|
||||
" mov.f32 %f34, %f30;\n"
|
||||
" mov.f32 %f35, %f31;\n"
|
||||
" mov.f32 %f36, %f32;\n"
|
||||
" mov.f32 %f37, %f33;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r45, %f37;\n"
|
||||
" sub.ftz.f32 %f38, %f22, %f35;\n"
|
||||
" sub.ftz.f32 %f39, %f21, %f34;\n"
|
||||
" sub.ftz.f32 %f40, %f23, %f36;\n"
|
||||
" mul.ftz.f32 %f41, %f38, %f38;\n"
|
||||
" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n"
|
||||
" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n"
|
||||
" add.s32 %r46, %r45, %r33;\n"
|
||||
" cvt.s64.s32 %rd33, %r46;\n"
|
||||
" mul.wide.s32 %rd34, %r46, 16;\n"
|
||||
" add.u64 %rd35, %rd34, %rd28;\n"
|
||||
" ld.global.f32 %f44, [%rd35+8];\n"
|
||||
" setp.gt.ftz.f32 %p4, %f44, %f43;\n"
|
||||
" @!%p4 bra $Lt_0_21506;\n"
|
||||
" .loc 16 79 0\n"
|
||||
" rcp.approx.ftz.f32 %f45, %f43;\n"
|
||||
" mul.ftz.f32 %f46, %f45, %f45;\n"
|
||||
" mul.ftz.f32 %f47, %f45, %f46;\n"
|
||||
" sqrt.approx.ftz.f32 %f48, %f47;\n"
|
||||
" mul.ftz.f32 %f49, %f45, %f47;\n"
|
||||
" ld.global.v2.f32 {%f50,%f51}, [%rd35+0];\n"
|
||||
" mul.ftz.f32 %f52, %f50, %f48;\n"
|
||||
" sub.ftz.f32 %f53, %f52, %f51;\n"
|
||||
" mul.ftz.f32 %f54, %f49, %f53;\n"
|
||||
" mul.ftz.f32 %f55, %f29, %f54;\n"
|
||||
" .loc 16 81 0\n"
|
||||
" fma.rn.ftz.f32 %f27, %f39, %f55, %f27;\n"
|
||||
" .loc 16 82 0\n"
|
||||
" fma.rn.ftz.f32 %f26, %f38, %f55, %f26;\n"
|
||||
" .loc 16 83 0\n"
|
||||
" fma.rn.ftz.f32 %f25, %f40, %f55, %f25;\n"
|
||||
" ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" mov.u32 %r48, 0;\n"
|
||||
" setp.le.s32 %p5, %r47, %r48;\n"
|
||||
" @%p5 bra $Lt_0_20994;\n"
|
||||
" .loc 16 87 0\n"
|
||||
" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];\n"
|
||||
" add.u64 %rd37, %rd36, %rd34;\n"
|
||||
" ld.global.v4.f32 {%f56,%f57,%f58,_}, [%rd37+0];\n"
|
||||
" mul.ftz.f32 %f59, %f56, %f48;\n"
|
||||
" sub.ftz.f32 %f60, %f59, %f57;\n"
|
||||
" mul.ftz.f32 %f61, %f47, %f60;\n"
|
||||
" sub.ftz.f32 %f62, %f61, %f58;\n"
|
||||
" fma.rn.ftz.f32 %f28, %f29, %f62, %f28;\n"
|
||||
"$Lt_0_20994:\n"
|
||||
" ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r50, 0;\n"
|
||||
" setp.le.s32 %p6, %r49, %r50;\n"
|
||||
" @%p6 bra $Lt_0_21506;\n"
|
||||
" .loc 16 90 0\n"
|
||||
" mov.f32 %f63, %f6;\n"
|
||||
" mul.ftz.f32 %f64, %f39, %f39;\n"
|
||||
" fma.rn.ftz.f32 %f65, %f55, %f64, %f63;\n"
|
||||
" mov.f32 %f6, %f65;\n"
|
||||
" .loc 16 91 0\n"
|
||||
" mov.f32 %f66, %f8;\n"
|
||||
" fma.rn.ftz.f32 %f67, %f55, %f41, %f66;\n"
|
||||
" mov.f32 %f8, %f67;\n"
|
||||
" .loc 16 92 0\n"
|
||||
" mov.f32 %f68, %f10;\n"
|
||||
" mul.ftz.f32 %f69, %f40, %f40;\n"
|
||||
" fma.rn.ftz.f32 %f70, %f55, %f69, %f68;\n"
|
||||
" mov.f32 %f10, %f70;\n"
|
||||
" .loc 16 93 0\n"
|
||||
" mov.f32 %f71, %f12;\n"
|
||||
" mul.ftz.f32 %f72, %f38, %f39;\n"
|
||||
" fma.rn.ftz.f32 %f73, %f55, %f72, %f71;\n"
|
||||
" mov.f32 %f12, %f73;\n"
|
||||
" .loc 16 94 0\n"
|
||||
" mov.f32 %f74, %f14;\n"
|
||||
" mul.ftz.f32 %f75, %f39, %f40;\n"
|
||||
" fma.rn.ftz.f32 %f76, %f55, %f75, %f74;\n"
|
||||
" mov.f32 %f14, %f76;\n"
|
||||
" .loc 16 95 0\n"
|
||||
" mul.ftz.f32 %f77, %f38, %f40;\n"
|
||||
" fma.rn.ftz.f32 %f15, %f55, %f77, %f15;\n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
"$Lt_0_21506:\n"
|
||||
"$Lt_0_20482:\n"
|
||||
" .loc 16 58 0\n"
|
||||
" mul.lo.u64 %rd38, %rd27, 4;\n"
|
||||
" add.u64 %rd20, %rd20, %rd38;\n"
|
||||
" setp.lt.u64 %p7, %rd20, %rd19;\n"
|
||||
" @%p7 bra $Lt_0_20226;\n"
|
||||
" bra.uni $Lt_0_19714;\n"
|
||||
"$Lt_0_27906:\n"
|
||||
" mov.f32 %f25, 0f00000000; \n"
|
||||
" mov.f32 %f26, 0f00000000; \n"
|
||||
" mov.f32 %f27, 0f00000000; \n"
|
||||
" mov.f32 %f28, 0f00000000; \n"
|
||||
"$Lt_0_19714:\n"
|
||||
" mov.u32 %r51, 1;\n"
|
||||
" setp.le.s32 %p8, %r1, %r51;\n"
|
||||
" @%p8 bra $Lt_0_24322;\n"
|
||||
" .loc 16 100 0\n"
|
||||
" mov.u64 %rd39, __cuda___cuda_local_var_32601_55_non_const_red_acc108;\n"
|
||||
" cvt.s64.s32 %rd40, %r2;\n"
|
||||
" mul.wide.s32 %rd41, %r2, 4;\n"
|
||||
" add.u64 %rd42, %rd39, %rd41;\n"
|
||||
" mov.f32 %f78, %f27;\n"
|
||||
" st.shared.f32 [%rd42+0], %f78;\n"
|
||||
" mov.f32 %f79, %f26;\n"
|
||||
" st.shared.f32 [%rd42+512], %f79;\n"
|
||||
" mov.f32 %f80, %f25;\n"
|
||||
" st.shared.f32 [%rd42+1024], %f80;\n"
|
||||
" mov.f32 %f81, %f28;\n"
|
||||
" st.shared.f32 [%rd42+1536], %f81;\n"
|
||||
" shr.s32 %r52, %r1, 31;\n"
|
||||
" mov.s32 %r53, 1;\n"
|
||||
" and.b32 %r54, %r52, %r53;\n"
|
||||
" add.s32 %r55, %r54, %r1;\n"
|
||||
" shr.s32 %r56, %r55, 1;\n"
|
||||
" mov.s32 %r57, %r56;\n"
|
||||
" mov.u32 %r58, 0;\n"
|
||||
" setp.ne.u32 %p9, %r56, %r58;\n"
|
||||
" @!%p9 bra $Lt_0_22786;\n"
|
||||
"$Lt_0_23298:\n"
|
||||
" setp.ge.u32 %p10, %r13, %r57;\n"
|
||||
" @%p10 bra $Lt_0_23554;\n"
|
||||
" add.u32 %r59, %r2, %r57;\n"
|
||||
" cvt.u64.u32 %rd43, %r59;\n"
|
||||
" mul.wide.u32 %rd44, %r59, 4;\n"
|
||||
" add.u64 %rd45, %rd39, %rd44;\n"
|
||||
" ld.shared.f32 %f82, [%rd45+0];\n"
|
||||
" add.ftz.f32 %f78, %f82, %f78;\n"
|
||||
" st.shared.f32 [%rd42+0], %f78;\n"
|
||||
" ld.shared.f32 %f83, [%rd45+512];\n"
|
||||
" add.ftz.f32 %f79, %f83, %f79;\n"
|
||||
" st.shared.f32 [%rd42+512], %f79;\n"
|
||||
" ld.shared.f32 %f84, [%rd45+1024];\n"
|
||||
" add.ftz.f32 %f80, %f84, %f80;\n"
|
||||
" st.shared.f32 [%rd42+1024], %f80;\n"
|
||||
" ld.shared.f32 %f85, [%rd45+1536];\n"
|
||||
" add.ftz.f32 %f81, %f85, %f81;\n"
|
||||
" st.shared.f32 [%rd42+1536], %f81;\n"
|
||||
"$Lt_0_23554:\n"
|
||||
" shr.u32 %r57, %r57, 1;\n"
|
||||
" mov.u32 %r60, 0;\n"
|
||||
" setp.ne.u32 %p11, %r57, %r60;\n"
|
||||
" @%p11 bra $Lt_0_23298;\n"
|
||||
"$Lt_0_22786:\n"
|
||||
" mov.f32 %f27, %f78;\n"
|
||||
" mov.f32 %f26, %f79;\n"
|
||||
" mov.f32 %f25, %f80;\n"
|
||||
" mov.f32 %f28, %f81;\n"
|
||||
" ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r62, 0;\n"
|
||||
" setp.le.s32 %p12, %r61, %r62;\n"
|
||||
" @%p12 bra $Lt_0_24322;\n"
|
||||
" mov.f32 %f78, %f6;\n"
|
||||
" st.shared.f32 [%rd42+0], %f78;\n"
|
||||
" mov.f32 %f79, %f8;\n"
|
||||
" st.shared.f32 [%rd42+512], %f79;\n"
|
||||
" mov.f32 %f80, %f10;\n"
|
||||
" st.shared.f32 [%rd42+1024], %f80;\n"
|
||||
" mov.f32 %f81, %f12;\n"
|
||||
" st.shared.f32 [%rd42+1536], %f81;\n"
|
||||
" mov.f32 %f86, %f14;\n"
|
||||
" st.shared.f32 [%rd42+2048], %f86;\n"
|
||||
" mov.f32 %f87, %f15;\n"
|
||||
" st.shared.f32 [%rd42+2560], %f87;\n"
|
||||
" mov.s32 %r63, %r56;\n"
|
||||
" @!%p9 bra $Lt_0_24834;\n"
|
||||
"$Lt_0_25346:\n"
|
||||
" setp.ge.u32 %p13, %r13, %r63;\n"
|
||||
" @%p13 bra $Lt_0_25602;\n"
|
||||
" add.u32 %r64, %r2, %r63;\n"
|
||||
" cvt.u64.u32 %rd46, %r64;\n"
|
||||
" mul.wide.u32 %rd47, %r64, 4;\n"
|
||||
" add.u64 %rd48, %rd39, %rd47;\n"
|
||||
" ld.shared.f32 %f88, [%rd48+0];\n"
|
||||
" add.ftz.f32 %f78, %f88, %f78;\n"
|
||||
" st.shared.f32 [%rd42+0], %f78;\n"
|
||||
" ld.shared.f32 %f89, [%rd48+512];\n"
|
||||
" add.ftz.f32 %f79, %f89, %f79;\n"
|
||||
" st.shared.f32 [%rd42+512], %f79;\n"
|
||||
" ld.shared.f32 %f90, [%rd48+1024];\n"
|
||||
" add.ftz.f32 %f80, %f90, %f80;\n"
|
||||
" st.shared.f32 [%rd42+1024], %f80;\n"
|
||||
" ld.shared.f32 %f91, [%rd48+1536];\n"
|
||||
" add.ftz.f32 %f81, %f91, %f81;\n"
|
||||
" st.shared.f32 [%rd42+1536], %f81;\n"
|
||||
" ld.shared.f32 %f92, [%rd48+2048];\n"
|
||||
" add.ftz.f32 %f86, %f92, %f86;\n"
|
||||
" st.shared.f32 [%rd42+2048], %f86;\n"
|
||||
" ld.shared.f32 %f93, [%rd48+2560];\n"
|
||||
" add.ftz.f32 %f87, %f93, %f87;\n"
|
||||
" st.shared.f32 [%rd42+2560], %f87;\n"
|
||||
"$Lt_0_25602:\n"
|
||||
" shr.u32 %r63, %r63, 1;\n"
|
||||
" mov.u32 %r65, 0;\n"
|
||||
" setp.ne.u32 %p14, %r63, %r65;\n"
|
||||
" @%p14 bra $Lt_0_25346;\n"
|
||||
"$Lt_0_24834:\n"
|
||||
" mov.f32 %f6, %f78;\n"
|
||||
" mov.f32 %f8, %f79;\n"
|
||||
" mov.f32 %f10, %f80;\n"
|
||||
" mov.f32 %f12, %f81;\n"
|
||||
" mov.f32 %f14, %f86;\n"
|
||||
" mov.f32 %f16, %f87;\n"
|
||||
"$Lt_0_24322:\n"
|
||||
"$Lt_0_22274:\n"
|
||||
" mov.u32 %r66, 0;\n"
|
||||
" setp.ne.s32 %p15, %r13, %r66;\n"
|
||||
" @%p15 bra $Lt_0_26370;\n"
|
||||
" ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];\n"
|
||||
" add.u64 %rd50, %rd49, %rd5;\n"
|
||||
" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" mov.u32 %r68, 0;\n"
|
||||
" setp.le.s32 %p16, %r67, %r68;\n"
|
||||
" @%p16 bra $Lt_0_26882;\n"
|
||||
" st.global.f32 [%rd50+0], %f28;\n"
|
||||
" cvt.s64.s32 %rd51, %r9;\n"
|
||||
" mul.wide.s32 %rd52, %r9, 4;\n"
|
||||
" add.u64 %rd50, %rd50, %rd52;\n"
|
||||
"$Lt_0_26882:\n"
|
||||
" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r70, 0;\n"
|
||||
" setp.le.s32 %p17, %r69, %r70;\n"
|
||||
" @%p17 bra $Lt_0_27394;\n"
|
||||
" mov.f32 %f94, %f6;\n"
|
||||
" st.global.f32 [%rd50+0], %f94;\n"
|
||||
" cvt.s64.s32 %rd53, %r9;\n"
|
||||
" mul.wide.s32 %rd54, %r9, 4;\n"
|
||||
" add.u64 %rd55, %rd54, %rd50;\n"
|
||||
" mov.f32 %f95, %f8;\n"
|
||||
" st.global.f32 [%rd55+0], %f95;\n"
|
||||
" add.u64 %rd56, %rd54, %rd55;\n"
|
||||
" mov.f32 %f96, %f10;\n"
|
||||
" st.global.f32 [%rd56+0], %f96;\n"
|
||||
" add.u64 %rd57, %rd54, %rd56;\n"
|
||||
" mov.f32 %f97, %f12;\n"
|
||||
" st.global.f32 [%rd57+0], %f97;\n"
|
||||
" add.u64 %rd50, %rd54, %rd57;\n"
|
||||
" mov.f32 %f98, %f14;\n"
|
||||
" st.global.f32 [%rd50+0], %f98;\n"
|
||||
" mov.f32 %f99, %f16;\n"
|
||||
" add.u64 %rd58, %rd54, %rd50;\n"
|
||||
" st.global.f32 [%rd58+0], %f99;\n"
|
||||
"$Lt_0_27394:\n"
|
||||
" ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];\n"
|
||||
" mul.lo.u64 %rd60, %rd4, 16;\n"
|
||||
" add.u64 %rd61, %rd59, %rd60;\n"
|
||||
" mov.f32 %f100, %f101;\n"
|
||||
" st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f100};\n"
|
||||
"$Lt_0_26370:\n"
|
||||
"$Lt_0_18690:\n"
|
||||
" .loc 16 103 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair:\n"
|
||||
" }\n"
|
||||
// PTX entry point kernel_pair_fast, stored as consecutive C string-literal fragments.
// Outline of the instruction stream below:
//   1. Threads with tid.x <= 3 copy sp_lj_in[tid] (f32) into shared memory; threads with
//      tid.x <= 120 copy lj1_in[tid] (and lj3_in[tid] when eflag > 0) as 16-byte vectors.
//   2. After bar.sync, each thread derives an atom index from tid.x, ntid.x, ctaid.x and
//      t_per_atom, and exits if that index is >= inum.
//   3. It walks a strided neighbor list, accumulating three force components, an optional
//      energy term (eflag > 0) and six optional virial terms (vflag > 0).
//   4. When t_per_atom > 1 the partial sums are combined through the shared red_acc buffer.
//   5. The thread whose (tid.x & (t_per_atom-1)) is 0 stores the results to the engv and ans buffers.
// NOTE(review): the coefficient arithmetic (r^-3 / r^-6 terms) looks like a Lennard-Jones 9-6
// style pair potential -- confirm against the originating .cu source, which is not visible here.
" .entry kernel_pair_fast (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<74>;\n"
|
||||
" .reg .u64 %rd<75>;\n"
|
||||
" .reg .f32 %f<109>;\n"
|
||||
" .reg .pred %p<22>;\n"
|
||||
// Shared-memory buffers: sp_lj (16 bytes = 4 floats), lj1 and lj3 (1936 bytes = 121 sixteen-byte
// entries each), and red_acc (3072 bytes, addressed below as six 512-byte slices).
" .shared .align 4 .b8 __cuda___cuda_local_var_32618_33_non_const_sp_lj3268[16];\n"
|
||||
" .shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_lj13296[1936];\n"
|
||||
" .shared .align 16 .b8 __cuda___cuda_local_var_32617_34_non_const_lj35232[1936];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32687_55_non_const_red_acc7168[3072];\n"
|
||||
" .loc 16 111 0\n"
|
||||
"$LDWbegin_kernel_pair_fast:\n"
|
||||
// Stage 1a: %r1 = tid.x; threads with tid.x > 3 skip the sp_lj copy.
" cvt.s32.u32 %r1, %tid.x;\n"
|
||||
" mov.u32 %r2, 3;\n"
|
||||
" setp.gt.s32 %p1, %r1, %r2;\n"
|
||||
" @%p1 bra $Lt_1_20994;\n"
|
||||
" .loc 16 119 0\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;\n"
|
||||
" cvt.s64.s32 %rd2, %r1;\n"
|
||||
" mul.wide.s32 %rd3, %r1, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f32 %f1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd1;\n"
|
||||
" st.shared.f32 [%rd6+0], %f1;\n"
|
||||
"$Lt_1_20994:\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;\n"
|
||||
// Stage 1b: threads with tid.x <= 120 copy one 16-byte lj1_in entry to shared lj1.
" mov.u32 %r3, 120;\n"
|
||||
" setp.gt.s32 %p2, %r1, %r3;\n"
|
||||
" @%p2 bra $Lt_1_21506;\n"
|
||||
" .loc 16 121 0\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_lj13296;\n"
|
||||
" cvt.s64.s32 %rd8, %r1;\n"
|
||||
" mul.wide.s32 %rd9, %r1, 16;\n"
|
||||
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
|
||||
" add.u64 %rd11, %rd10, %rd9;\n"
|
||||
" add.u64 %rd12, %rd9, %rd7;\n"
|
||||
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
|
||||
" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n"
|
||||
// The lj3_in entry is copied to shared lj3 only when eflag > 0.
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r5, 0;\n"
|
||||
" setp.le.s32 %p3, %r4, %r5;\n"
|
||||
" @%p3 bra $Lt_1_22018;\n"
|
||||
" .loc 16 123 0\n"
|
||||
" mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232;\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
|
||||
" add.u64 %rd15, %rd14, %rd9;\n"
|
||||
" add.u64 %rd16, %rd9, %rd13;\n"
|
||||
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
|
||||
" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n"
|
||||
"$Lt_1_22018:\n"
|
||||
" mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232;\n"
|
||||
"$Lt_1_21506:\n"
|
||||
" mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_lj35232;\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_lj13296;\n"
|
||||
" .loc 16 131 0\n"
|
||||
// Zero the six virial accumulators (%f11, %f13, %f15, %f17, %f19, %f21; %f20 mirrors %f21).
" mov.f32 %f10, 0f00000000; \n"
|
||||
" mov.f32 %f11, %f10;\n"
|
||||
" mov.f32 %f12, 0f00000000; \n"
|
||||
" mov.f32 %f13, %f12;\n"
|
||||
" mov.f32 %f14, 0f00000000; \n"
|
||||
" mov.f32 %f15, %f14;\n"
|
||||
" mov.f32 %f16, 0f00000000; \n"
|
||||
" mov.f32 %f17, %f16;\n"
|
||||
" mov.f32 %f18, 0f00000000; \n"
|
||||
" mov.f32 %f19, %f18;\n"
|
||||
" mov.f32 %f20, 0f00000000; \n"
|
||||
" mov.f32 %f21, %f20;\n"
|
||||
" .loc 16 133 0\n"
|
||||
// Barrier: the shared-memory copies above complete before any thread reads them.
" bar.sync 0;\n"
|
||||
// Stage 2: atom index %r12 = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom).
" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
|
||||
" div.s32 %r7, %r1, %r6;\n"
|
||||
" cvt.s32.u32 %r8, %ntid.x;\n"
|
||||
" div.s32 %r9, %r8, %r6;\n"
|
||||
" cvt.s32.u32 %r10, %ctaid.x;\n"
|
||||
" mul.lo.s32 %r11, %r10, %r9;\n"
|
||||
" add.s32 %r12, %r7, %r11;\n"
|
||||
" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n"
|
||||
" setp.ge.s32 %p4, %r12, %r13;\n"
|
||||
" @%p4 bra $Lt_1_30210;\n"
|
||||
" .loc 16 138 0\n"
|
||||
// %rd22 = dev_nbor + 4*index; %rd23 is one nbor_pitch row further, and %r15 is the value read
// there (used below as the neighbor count). %r17 = tid.x & (t_per_atom - 1) is this thread's
// offset inside its group.
" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
|
||||
" cvt.s64.s32 %rd17, %r14;\n"
|
||||
" mul.wide.s32 %rd18, %r14, 4;\n"
|
||||
" cvt.s64.s32 %rd19, %r12;\n"
|
||||
" mul.wide.s32 %rd20, %r12, 4;\n"
|
||||
" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
|
||||
" add.u64 %rd22, %rd20, %rd21;\n"
|
||||
" add.u64 %rd23, %rd18, %rd22;\n"
|
||||
" ld.global.s32 %r15, [%rd23+0];\n"
|
||||
" sub.s32 %r16, %r6, 1;\n"
|
||||
" and.b32 %r17, %r16, %r1;\n"
|
||||
" cvt.s64.s32 %rd24, %r17;\n"
|
||||
" mul.wide.s32 %rd25, %r17, 4;\n"
|
||||
" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n"
|
||||
// Case dev_packed == dev_nbor: the list lives inside dev_nbor; the loop stride (%r20) is
// nbor_pitch * t_per_atom elements, %rd35 is the start cursor and %rd34 the end pointer.
" setp.ne.u64 %p5, %rd26, %rd21;\n"
|
||||
" @%p5 bra $Lt_1_23298;\n"
|
||||
" cvt.s32.s64 %r18, %rd17;\n"
|
||||
" mul.lo.s32 %r19, %r18, %r6;\n"
|
||||
" mov.s32 %r20, %r19;\n"
|
||||
" mul.lo.s32 %r21, %r16, %r12;\n"
|
||||
" add.s32 %r22, %r18, %r21;\n"
|
||||
" cvt.s64.s32 %rd27, %r22;\n"
|
||||
" mul.wide.s32 %rd28, %r22, 4;\n"
|
||||
" add.u64 %rd29, %rd23, %rd28;\n"
|
||||
" and.b32 %r23, %r16, %r15;\n"
|
||||
" cvt.s64.s32 %rd30, %r23;\n"
|
||||
" div.s32 %r24, %r15, %r6;\n"
|
||||
" mul.lo.s32 %r25, %r19, %r24;\n"
|
||||
" cvt.s64.s32 %rd31, %r25;\n"
|
||||
" add.u64 %rd32, %rd30, %rd31;\n"
|
||||
" mul.lo.u64 %rd33, %rd32, 4;\n"
|
||||
" add.u64 %rd34, %rd29, %rd33;\n"
|
||||
" add.u64 %rd35, %rd25, %rd29;\n"
|
||||
" bra.uni $Lt_1_23042;\n"
|
||||
"$Lt_1_23298:\n"
|
||||
// Otherwise: a start offset is read from the next dev_nbor row, the list lives in dev_packed,
// the stride is t_per_atom elements and the end pointer is start + 4 * %r15.
" add.u64 %rd36, %rd18, %rd23;\n"
|
||||
" ld.global.s32 %r26, [%rd36+0];\n"
|
||||
" cvt.s64.s32 %rd37, %r26;\n"
|
||||
" mul.wide.s32 %rd38, %r26, 4;\n"
|
||||
" add.u64 %rd39, %rd26, %rd38;\n"
|
||||
" cvt.s64.s32 %rd40, %r15;\n"
|
||||
" mul.wide.s32 %rd41, %r15, 4;\n"
|
||||
" add.u64 %rd34, %rd39, %rd41;\n"
|
||||
" mov.s32 %r20, %r6;\n"
|
||||
" add.u64 %rd35, %rd25, %rd39;\n"
|
||||
"$Lt_1_23042:\n"
|
||||
" .loc 16 141 0\n"
|
||||
// Fetch this atom's 4-component texel from pos_tex, indexed by the value stored at %rd22.
" ld.global.s32 %r27, [%rd22+0];\n"
|
||||
" mov.u32 %r28, %r27;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.u32 %r30, %r29;\n"
|
||||
" mov.s32 %r31, 0;\n"
|
||||
" mov.u32 %r32, %r31;\n"
|
||||
" mov.s32 %r33, 0;\n"
|
||||
" mov.u32 %r34, %r33;\n"
|
||||
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];\n"
|
||||
" mov.f32 %f26, %f22;\n"
|
||||
" mov.f32 %f27, %f23;\n"
|
||||
" mov.f32 %f28, %f24;\n"
|
||||
" mov.f32 %f29, %f25;\n"
|
||||
// Skip the loop when the cursor already reaches the end pointer. Otherwise %f30 becomes
// float(11 * int(fourth texel component)), the row base for the coefficient-table index.
" setp.ge.u64 %p6, %rd35, %rd34;\n"
|
||||
" @%p6 bra $Lt_1_31746;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r35, %f29;\n"
|
||||
" cvt.s64.s32 %rd42, %r20;\n"
|
||||
" mul.lo.s32 %r36, %r35, 11;\n"
|
||||
" cvt.rn.f32.s32 %f30, %r36;\n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
" mov.f32 %f34, 0f00000000; \n"
|
||||
"$Lt_1_24066:\n"
|
||||
" .loc 16 148 0\n"
|
||||
// Loop body: read one neighbor entry (%r37); its low 30 bits index pos_tex, its top two bits
// are used further down to select an sp_lj entry.
" ld.global.s32 %r37, [%rd35+0];\n"
|
||||
" .loc 16 152 0\n"
|
||||
" and.b32 %r38, %r37, 1073741823;\n"
|
||||
" mov.u32 %r39, %r38;\n"
|
||||
" mov.s32 %r40, 0;\n"
|
||||
" mov.u32 %r41, %r40;\n"
|
||||
" mov.s32 %r42, 0;\n"
|
||||
" mov.u32 %r43, %r42;\n"
|
||||
" mov.s32 %r44, 0;\n"
|
||||
" mov.u32 %r45, %r44;\n"
|
||||
" tex.1d.v4.f32.s32 {%f35,%f36,%f37,%f38},[pos_tex,{%r39,%r41,%r43,%r45}];\n"
|
||||
" mov.f32 %f39, %f35;\n"
|
||||
" mov.f32 %f40, %f36;\n"
|
||||
" mov.f32 %f41, %f37;\n"
|
||||
" mov.f32 %f42, %f38;\n"
|
||||
// Component differences (%f44, %f43, %f45) and their squared sum %f48.
" sub.ftz.f32 %f43, %f27, %f40;\n"
|
||||
" sub.ftz.f32 %f44, %f26, %f39;\n"
|
||||
" sub.ftz.f32 %f45, %f28, %f41;\n"
|
||||
" mul.ftz.f32 %f46, %f43, %f43;\n"
|
||||
" fma.rn.ftz.f32 %f47, %f44, %f44, %f46;\n"
|
||||
" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n"
|
||||
// Table index = int(%f30 + neighbor's fourth texel component); the pair is processed only if
// the third float of that shared lj1 entry is greater than the squared distance.
" add.ftz.f32 %f49, %f30, %f42;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r46, %f49;\n"
|
||||
" cvt.s64.s32 %rd43, %r46;\n"
|
||||
" mul.wide.s32 %rd44, %r46, 16;\n"
|
||||
" add.u64 %rd45, %rd44, %rd7;\n"
|
||||
" ld.shared.f32 %f50, [%rd45+8];\n"
|
||||
" setp.gt.ftz.f32 %p7, %f50, %f48;\n"
|
||||
" @!%p7 bra $Lt_1_25346;\n"
|
||||
" .loc 16 165 0\n"
|
||||
// With q = 1/%f48: %f53 = q^3, %f54 = sqrt(q^3), %f55 = q^4;
// force factor %f60 = q^4 * (lj1[idx].x * sqrt(q^3) - lj1[idx].y).
// NOTE(review): no sp_lj factor is multiplied into %f60 here; it is applied only to the
// energy term below -- confirm this matches the intended source.
" rcp.approx.ftz.f32 %f51, %f48;\n"
|
||||
" mul.ftz.f32 %f52, %f51, %f51;\n"
|
||||
" mul.ftz.f32 %f53, %f51, %f52;\n"
|
||||
" sqrt.approx.ftz.f32 %f54, %f53;\n"
|
||||
" mul.ftz.f32 %f55, %f51, %f53;\n"
|
||||
" ld.shared.v2.f32 {%f56,%f57}, [%rd45+0];\n"
|
||||
" mul.ftz.f32 %f58, %f56, %f54;\n"
|
||||
" sub.ftz.f32 %f59, %f58, %f57;\n"
|
||||
" mul.ftz.f32 %f60, %f55, %f59;\n"
|
||||
" .loc 16 167 0\n"
|
||||
" fma.rn.ftz.f32 %f33, %f44, %f60, %f33;\n"
|
||||
" .loc 16 168 0\n"
|
||||
" fma.rn.ftz.f32 %f32, %f43, %f60, %f32;\n"
|
||||
" .loc 16 169 0\n"
|
||||
" fma.rn.ftz.f32 %f31, %f45, %f60, %f31;\n"
|
||||
// Energy (eflag > 0 only): %f34 += sp_lj[(entry >> 30) & 3] *
//   (q^3 * (lj3[idx].x * sqrt(q^3) - lj3[idx].y) - lj3[idx].z).
" ld.param.s32 %r47, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r48, 0;\n"
|
||||
" setp.le.s32 %p8, %r47, %r48;\n"
|
||||
" @%p8 bra $Lt_1_24834;\n"
|
||||
" .loc 16 172 0\n"
|
||||
" add.u64 %rd46, %rd44, %rd13;\n"
|
||||
" ld.shared.v4.f32 {%f61,%f62,%f63,_}, [%rd46+0];\n"
|
||||
" mul.ftz.f32 %f64, %f61, %f54;\n"
|
||||
" sub.ftz.f32 %f65, %f64, %f62;\n"
|
||||
" mul.ftz.f32 %f66, %f53, %f65;\n"
|
||||
" .loc 16 173 0\n"
|
||||
" shr.s32 %r49, %r37, 30;\n"
|
||||
" and.b32 %r50, %r49, 3;\n"
|
||||
" cvt.s64.s32 %rd47, %r50;\n"
|
||||
" mul.wide.s32 %rd48, %r50, 4;\n"
|
||||
" add.u64 %rd49, %rd1, %rd48;\n"
|
||||
" ld.shared.f32 %f67, [%rd49+0];\n"
|
||||
" sub.ftz.f32 %f68, %f66, %f63;\n"
|
||||
" fma.rn.ftz.f32 %f34, %f67, %f68, %f34;\n"
|
||||
"$Lt_1_24834:\n"
|
||||
// Virial (vflag > 0 only): accumulate %f60 times the six pairwise products of the component
// differences, in the order 0*0, 1*1, 2*2, 0*1, 0*2, 1*2 (component 0 = %f44, 1 = %f43, 2 = %f45).
" ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r52, 0;\n"
|
||||
" setp.le.s32 %p9, %r51, %r52;\n"
|
||||
" @%p9 bra $Lt_1_25346;\n"
|
||||
" .loc 16 176 0\n"
|
||||
" mov.f32 %f69, %f11;\n"
|
||||
" mul.ftz.f32 %f70, %f44, %f44;\n"
|
||||
" fma.rn.ftz.f32 %f71, %f60, %f70, %f69;\n"
|
||||
" mov.f32 %f11, %f71;\n"
|
||||
" .loc 16 177 0\n"
|
||||
" mov.f32 %f72, %f13;\n"
|
||||
" fma.rn.ftz.f32 %f73, %f60, %f46, %f72;\n"
|
||||
" mov.f32 %f13, %f73;\n"
|
||||
" .loc 16 178 0\n"
|
||||
" mov.f32 %f74, %f15;\n"
|
||||
" mul.ftz.f32 %f75, %f45, %f45;\n"
|
||||
" fma.rn.ftz.f32 %f76, %f60, %f75, %f74;\n"
|
||||
" mov.f32 %f15, %f76;\n"
|
||||
" .loc 16 179 0\n"
|
||||
" mov.f32 %f77, %f17;\n"
|
||||
" mul.ftz.f32 %f78, %f43, %f44;\n"
|
||||
" fma.rn.ftz.f32 %f79, %f60, %f78, %f77;\n"
|
||||
" mov.f32 %f17, %f79;\n"
|
||||
" .loc 16 180 0\n"
|
||||
" mov.f32 %f80, %f19;\n"
|
||||
" mul.ftz.f32 %f81, %f44, %f45;\n"
|
||||
" fma.rn.ftz.f32 %f82, %f60, %f81, %f80;\n"
|
||||
" mov.f32 %f19, %f82;\n"
|
||||
" .loc 16 181 0\n"
|
||||
" mul.ftz.f32 %f83, %f43, %f45;\n"
|
||||
" fma.rn.ftz.f32 %f20, %f60, %f83, %f20;\n"
|
||||
" mov.f32 %f21, %f20;\n"
|
||||
"$Lt_1_25346:\n"
|
||||
"$Lt_1_24322:\n"
|
||||
" .loc 16 146 0\n"
|
||||
// Advance the cursor by the stride (%rd42 elements of 4 bytes) and loop while below the end.
" mul.lo.u64 %rd50, %rd42, 4;\n"
|
||||
" add.u64 %rd35, %rd35, %rd50;\n"
|
||||
" setp.lt.u64 %p10, %rd35, %rd34;\n"
|
||||
" @%p10 bra $Lt_1_24066;\n"
|
||||
" bra.uni $Lt_1_23554;\n"
|
||||
"$Lt_1_31746:\n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
" mov.f32 %f34, 0f00000000; \n"
|
||||
"$Lt_1_23554:\n"
|
||||
// Stage 4 (only when t_per_atom > 1): each thread writes its three force sums and the energy
// sum to red_acc[tid.x] in slices 512 bytes apart, then repeatedly adds the slot %r59 threads
// above it, halving %r59 (initially t_per_atom / 2) until it reaches zero. Only threads whose
// group offset %r17 is below %r59 take part in a step. No bar.sync appears inside this loop.
" mov.u32 %r53, 1;\n"
|
||||
" setp.le.s32 %p11, %r6, %r53;\n"
|
||||
" @%p11 bra $Lt_1_28162;\n"
|
||||
" .loc 16 186 0\n"
|
||||
" mov.u64 %rd51, __cuda___cuda_local_var_32687_55_non_const_red_acc7168;\n"
|
||||
" cvt.s64.s32 %rd52, %r1;\n"
|
||||
" mul.wide.s32 %rd53, %r1, 4;\n"
|
||||
" add.u64 %rd54, %rd51, %rd53;\n"
|
||||
" mov.f32 %f84, %f33;\n"
|
||||
" st.shared.f32 [%rd54+0], %f84;\n"
|
||||
" mov.f32 %f85, %f32;\n"
|
||||
" st.shared.f32 [%rd54+512], %f85;\n"
|
||||
" mov.f32 %f86, %f31;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f86;\n"
|
||||
" mov.f32 %f87, %f34;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f87;\n"
|
||||
" shr.s32 %r54, %r6, 31;\n"
|
||||
" mov.s32 %r55, 1;\n"
|
||||
" and.b32 %r56, %r54, %r55;\n"
|
||||
" add.s32 %r57, %r56, %r6;\n"
|
||||
" shr.s32 %r58, %r57, 1;\n"
|
||||
" mov.s32 %r59, %r58;\n"
|
||||
" mov.u32 %r60, 0;\n"
|
||||
" setp.ne.u32 %p12, %r58, %r60;\n"
|
||||
" @!%p12 bra $Lt_1_26626;\n"
|
||||
"$Lt_1_27138:\n"
|
||||
" setp.ge.u32 %p13, %r17, %r59;\n"
|
||||
" @%p13 bra $Lt_1_27394;\n"
|
||||
" add.u32 %r61, %r1, %r59;\n"
|
||||
" cvt.u64.u32 %rd55, %r61;\n"
|
||||
" mul.wide.u32 %rd56, %r61, 4;\n"
|
||||
" add.u64 %rd57, %rd51, %rd56;\n"
|
||||
" ld.shared.f32 %f88, [%rd57+0];\n"
|
||||
" add.ftz.f32 %f84, %f88, %f84;\n"
|
||||
" st.shared.f32 [%rd54+0], %f84;\n"
|
||||
" ld.shared.f32 %f89, [%rd57+512];\n"
|
||||
" add.ftz.f32 %f85, %f89, %f85;\n"
|
||||
" st.shared.f32 [%rd54+512], %f85;\n"
|
||||
" ld.shared.f32 %f90, [%rd57+1024];\n"
|
||||
" add.ftz.f32 %f86, %f90, %f86;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f86;\n"
|
||||
" ld.shared.f32 %f91, [%rd57+1536];\n"
|
||||
" add.ftz.f32 %f87, %f91, %f87;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f87;\n"
|
||||
"$Lt_1_27394:\n"
|
||||
" shr.u32 %r59, %r59, 1;\n"
|
||||
" mov.u32 %r62, 0;\n"
|
||||
" setp.ne.u32 %p14, %r59, %r62;\n"
|
||||
" @%p14 bra $Lt_1_27138;\n"
|
||||
"$Lt_1_26626:\n"
|
||||
" mov.f32 %f33, %f84;\n"
|
||||
" mov.f32 %f32, %f85;\n"
|
||||
" mov.f32 %f31, %f86;\n"
|
||||
" mov.f32 %f34, %f87;\n"
|
||||
// Same halving reduction for the six virial sums (vflag > 0 only), reusing red_acc with two
// additional 512-byte slices at offsets 2048 and 2560.
" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r64, 0;\n"
|
||||
" setp.le.s32 %p15, %r63, %r64;\n"
|
||||
" @%p15 bra $Lt_1_28162;\n"
|
||||
" mov.f32 %f84, %f11;\n"
|
||||
" st.shared.f32 [%rd54+0], %f84;\n"
|
||||
" mov.f32 %f85, %f13;\n"
|
||||
" st.shared.f32 [%rd54+512], %f85;\n"
|
||||
" mov.f32 %f86, %f15;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f86;\n"
|
||||
" mov.f32 %f87, %f17;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f87;\n"
|
||||
" mov.f32 %f92, %f19;\n"
|
||||
" st.shared.f32 [%rd54+2048], %f92;\n"
|
||||
" mov.f32 %f93, %f20;\n"
|
||||
" st.shared.f32 [%rd54+2560], %f93;\n"
|
||||
" mov.s32 %r65, %r58;\n"
|
||||
" @!%p12 bra $Lt_1_28674;\n"
|
||||
"$Lt_1_29186:\n"
|
||||
" setp.ge.u32 %p16, %r17, %r65;\n"
|
||||
" @%p16 bra $Lt_1_29442;\n"
|
||||
" add.u32 %r66, %r1, %r65;\n"
|
||||
" cvt.u64.u32 %rd58, %r66;\n"
|
||||
" mul.wide.u32 %rd59, %r66, 4;\n"
|
||||
" add.u64 %rd60, %rd51, %rd59;\n"
|
||||
" ld.shared.f32 %f94, [%rd60+0];\n"
|
||||
" add.ftz.f32 %f84, %f94, %f84;\n"
|
||||
" st.shared.f32 [%rd54+0], %f84;\n"
|
||||
" ld.shared.f32 %f95, [%rd60+512];\n"
|
||||
" add.ftz.f32 %f85, %f95, %f85;\n"
|
||||
" st.shared.f32 [%rd54+512], %f85;\n"
|
||||
" ld.shared.f32 %f96, [%rd60+1024];\n"
|
||||
" add.ftz.f32 %f86, %f96, %f86;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f86;\n"
|
||||
" ld.shared.f32 %f97, [%rd60+1536];\n"
|
||||
" add.ftz.f32 %f87, %f97, %f87;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f87;\n"
|
||||
" ld.shared.f32 %f98, [%rd60+2048];\n"
|
||||
" add.ftz.f32 %f92, %f98, %f92;\n"
|
||||
" st.shared.f32 [%rd54+2048], %f92;\n"
|
||||
" ld.shared.f32 %f99, [%rd60+2560];\n"
|
||||
" add.ftz.f32 %f93, %f99, %f93;\n"
|
||||
" st.shared.f32 [%rd54+2560], %f93;\n"
|
||||
"$Lt_1_29442:\n"
|
||||
" shr.u32 %r65, %r65, 1;\n"
|
||||
" mov.u32 %r67, 0;\n"
|
||||
" setp.ne.u32 %p17, %r65, %r67;\n"
|
||||
" @%p17 bra $Lt_1_29186;\n"
|
||||
"$Lt_1_28674:\n"
|
||||
" mov.f32 %f11, %f84;\n"
|
||||
" mov.f32 %f13, %f85;\n"
|
||||
" mov.f32 %f15, %f86;\n"
|
||||
" mov.f32 %f17, %f87;\n"
|
||||
" mov.f32 %f19, %f92;\n"
|
||||
" mov.f32 %f21, %f93;\n"
|
||||
"$Lt_1_28162:\n"
|
||||
"$Lt_1_26114:\n"
|
||||
// Stage 5: only the thread with group offset %r17 == 0 writes results. The engv pointer starts
// at engv + 4*index; the energy (eflag > 0) and then the six virial values (vflag > 0) are
// stored at successive steps of 4*inum bytes.
" mov.u32 %r68, 0;\n"
|
||||
" setp.ne.s32 %p18, %r17, %r68;\n"
|
||||
" @%p18 bra $Lt_1_30210;\n"
|
||||
" ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];\n"
|
||||
" add.u64 %rd62, %rd61, %rd20;\n"
|
||||
" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r70, 0;\n"
|
||||
" setp.le.s32 %p19, %r69, %r70;\n"
|
||||
" @%p19 bra $Lt_1_30722;\n"
|
||||
" st.global.f32 [%rd62+0], %f34;\n"
|
||||
" cvt.s64.s32 %rd63, %r13;\n"
|
||||
" mul.wide.s32 %rd64, %r13, 4;\n"
|
||||
" add.u64 %rd62, %rd62, %rd64;\n"
|
||||
"$Lt_1_30722:\n"
|
||||
" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r72, 0;\n"
|
||||
" setp.le.s32 %p20, %r71, %r72;\n"
|
||||
" @%p20 bra $Lt_1_31234;\n"
|
||||
" mov.f32 %f100, %f11;\n"
|
||||
" st.global.f32 [%rd62+0], %f100;\n"
|
||||
" cvt.s64.s32 %rd65, %r13;\n"
|
||||
" mul.wide.s32 %rd66, %r13, 4;\n"
|
||||
" add.u64 %rd67, %rd66, %rd62;\n"
|
||||
" mov.f32 %f101, %f13;\n"
|
||||
" st.global.f32 [%rd67+0], %f101;\n"
|
||||
" add.u64 %rd68, %rd66, %rd67;\n"
|
||||
" mov.f32 %f102, %f15;\n"
|
||||
" st.global.f32 [%rd68+0], %f102;\n"
|
||||
" add.u64 %rd69, %rd66, %rd68;\n"
|
||||
" mov.f32 %f103, %f17;\n"
|
||||
" st.global.f32 [%rd69+0], %f103;\n"
|
||||
" add.u64 %rd62, %rd66, %rd69;\n"
|
||||
" mov.f32 %f104, %f19;\n"
|
||||
" st.global.f32 [%rd62+0], %f104;\n"
|
||||
" mov.f32 %f105, %f21;\n"
|
||||
" add.u64 %rd70, %rd66, %rd62;\n"
|
||||
" st.global.f32 [%rd70+0], %f105;\n"
|
||||
"$Lt_1_31234:\n"
|
||||
// Store the three force sums as a 16-byte vector at ans + 16*index. The fourth lane comes from
// %f107, which is never assigned anywhere in this kernel, so its stored value is unspecified.
" ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];\n"
|
||||
" mul.lo.u64 %rd72, %rd19, 16;\n"
|
||||
" add.u64 %rd73, %rd71, %rd72;\n"
|
||||
" mov.f32 %f106, %f107;\n"
|
||||
" st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f106};\n"
|
||||
"$Lt_1_30210:\n"
|
||||
"$Lt_1_22530:\n"
|
||||
" .loc 16 189 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair_fast:\n"
|
||||
" }\n"
" sub.s32 %r16, %r6, 1;\n"
|
||||
" and.b32 %r17, %r16, %r1;\n"
|
||||
" cvt.s64.s32 %rd24, %r17;\n"
|
||||
" mul.wide.s32 %rd25, %r17, 4;\n"
|
||||
" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n"
|
||||
" setp.ne.u64 %p5, %rd26, %rd21;\n"
|
||||
" @%p5 bra $Lt_1_23298;\n"
|
||||
" cvt.s32.s64 %r18, %rd17;\n"
|
||||
" mul.lo.s32 %r19, %r18, %r6;\n"
|
||||
" mov.s32 %r20, %r19;\n"
|
||||
" mul.lo.s32 %r21, %r16, %r12;\n"
|
||||
" add.s32 %r22, %r18, %r21;\n"
|
||||
" cvt.s64.s32 %rd27, %r22;\n"
|
||||
" mul.wide.s32 %rd28, %r22, 4;\n"
|
||||
" add.u64 %rd29, %rd23, %rd28;\n"
|
||||
" and.b32 %r23, %r16, %r15;\n"
|
||||
" cvt.s64.s32 %rd30, %r23;\n"
|
||||
" div.s32 %r24, %r15, %r6;\n"
|
||||
" mul.lo.s32 %r25, %r19, %r24;\n"
|
||||
" cvt.s64.s32 %rd31, %r25;\n"
|
||||
" add.u64 %rd32, %rd30, %rd31;\n"
|
||||
" mul.lo.u64 %rd33, %rd32, 4;\n"
|
||||
" add.u64 %rd34, %rd29, %rd33;\n"
|
||||
" add.u64 %rd35, %rd25, %rd29;\n"
|
||||
" bra.uni $Lt_1_23042;\n"
|
||||
"$Lt_1_23298:\n"
|
||||
" add.u64 %rd36, %rd18, %rd23;\n"
|
||||
" ld.global.s32 %r26, [%rd36+0];\n"
|
||||
" cvt.s64.s32 %rd37, %r26;\n"
|
||||
" mul.wide.s32 %rd38, %r26, 4;\n"
|
||||
" add.u64 %rd39, %rd26, %rd38;\n"
|
||||
" cvt.s64.s32 %rd40, %r15;\n"
|
||||
" mul.wide.s32 %rd41, %r15, 4;\n"
|
||||
" add.u64 %rd34, %rd39, %rd41;\n"
|
||||
" mov.s32 %r20, %r6;\n"
|
||||
" add.u64 %rd35, %rd25, %rd39;\n"
|
||||
"$Lt_1_23042:\n"
|
||||
" .loc 16 141 0\n"
|
||||
" ld.global.s32 %r27, [%rd22+0];\n"
|
||||
" mov.u32 %r28, %r27;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.u32 %r30, %r29;\n"
|
||||
" mov.s32 %r31, 0;\n"
|
||||
" mov.u32 %r32, %r31;\n"
|
||||
" mov.s32 %r33, 0;\n"
|
||||
" mov.u32 %r34, %r33;\n"
|
||||
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];\n"
|
||||
" mov.f32 %f26, %f22;\n"
|
||||
" mov.f32 %f27, %f23;\n"
|
||||
" mov.f32 %f28, %f24;\n"
|
||||
" mov.f32 %f29, %f25;\n"
|
||||
" setp.ge.u64 %p6, %rd35, %rd34;\n"
|
||||
" @%p6 bra $Lt_1_31746;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r35, %f29;\n"
|
||||
" cvt.s64.s32 %rd42, %r20;\n"
|
||||
" mul.lo.s32 %r36, %r35, 11;\n"
|
||||
" cvt.rn.f32.s32 %f30, %r36;\n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
" mov.f32 %f34, 0f00000000; \n"
|
||||
"$Lt_1_24066:\n"
|
||||
" .loc 16 148 0\n"
|
||||
" ld.global.s32 %r37, [%rd35+0];\n"
|
||||
" .loc 16 152 0\n"
|
||||
" and.b32 %r38, %r37, 1073741823;\n"
|
||||
" mov.u32 %r39, %r38;\n"
|
||||
" mov.s32 %r40, 0;\n"
|
||||
" mov.u32 %r41, %r40;\n"
|
||||
" mov.s32 %r42, 0;\n"
|
||||
" mov.u32 %r43, %r42;\n"
|
||||
" mov.s32 %r44, 0;\n"
|
||||
" mov.u32 %r45, %r44;\n"
|
||||
" tex.1d.v4.f32.s32 {%f35,%f36,%f37,%f38},[pos_tex,{%r39,%r41,%r43,%r45}];\n"
|
||||
" mov.f32 %f39, %f35;\n"
|
||||
" mov.f32 %f40, %f36;\n"
|
||||
" mov.f32 %f41, %f37;\n"
|
||||
" mov.f32 %f42, %f38;\n"
|
||||
" sub.ftz.f32 %f43, %f27, %f40;\n"
|
||||
" sub.ftz.f32 %f44, %f26, %f39;\n"
|
||||
" sub.ftz.f32 %f45, %f28, %f41;\n"
|
||||
" mul.ftz.f32 %f46, %f43, %f43;\n"
|
||||
" fma.rn.ftz.f32 %f47, %f44, %f44, %f46;\n"
|
||||
" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n"
|
||||
" add.ftz.f32 %f49, %f30, %f42;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r46, %f49;\n"
|
||||
" cvt.s64.s32 %rd43, %r46;\n"
|
||||
" mul.wide.s32 %rd44, %r46, 16;\n"
|
||||
" add.u64 %rd45, %rd44, %rd7;\n"
|
||||
" ld.shared.f32 %f50, [%rd45+8];\n"
|
||||
" setp.gt.ftz.f32 %p7, %f50, %f48;\n"
|
||||
" @!%p7 bra $Lt_1_25346;\n"
|
||||
" .loc 16 165 0\n"
|
||||
" rcp.approx.ftz.f32 %f51, %f48;\n"
|
||||
" mul.ftz.f32 %f52, %f51, %f51;\n"
|
||||
" mul.ftz.f32 %f53, %f51, %f52;\n"
|
||||
" sqrt.approx.ftz.f32 %f54, %f53;\n"
|
||||
" mul.ftz.f32 %f55, %f51, %f53;\n"
|
||||
" ld.shared.v2.f32 {%f56,%f57}, [%rd45+0];\n"
|
||||
" mul.ftz.f32 %f58, %f56, %f54;\n"
|
||||
" sub.ftz.f32 %f59, %f58, %f57;\n"
|
||||
" mul.ftz.f32 %f60, %f55, %f59;\n"
|
||||
" .loc 16 167 0\n"
|
||||
" fma.rn.ftz.f32 %f33, %f44, %f60, %f33;\n"
|
||||
" .loc 16 168 0\n"
|
||||
" fma.rn.ftz.f32 %f32, %f43, %f60, %f32;\n"
|
||||
" .loc 16 169 0\n"
|
||||
" fma.rn.ftz.f32 %f31, %f45, %f60, %f31;\n"
|
||||
" ld.param.s32 %r47, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r48, 0;\n"
|
||||
" setp.le.s32 %p8, %r47, %r48;\n"
|
||||
" @%p8 bra $Lt_1_24834;\n"
|
||||
" .loc 16 172 0\n"
|
||||
" add.u64 %rd46, %rd44, %rd13;\n"
|
||||
" ld.shared.v4.f32 {%f61,%f62,%f63,_}, [%rd46+0];\n"
|
||||
" mul.ftz.f32 %f64, %f61, %f54;\n"
|
||||
" sub.ftz.f32 %f65, %f64, %f62;\n"
|
||||
" mul.ftz.f32 %f66, %f53, %f65;\n"
|
||||
" .loc 16 173 0\n"
|
||||
" shr.s32 %r49, %r37, 30;\n"
|
||||
" and.b32 %r50, %r49, 3;\n"
|
||||
" cvt.s64.s32 %rd47, %r50;\n"
|
||||
" mul.wide.s32 %rd48, %r50, 4;\n"
|
||||
" add.u64 %rd49, %rd1, %rd48;\n"
|
||||
" ld.shared.f32 %f67, [%rd49+0];\n"
|
||||
" sub.ftz.f32 %f68, %f66, %f63;\n"
|
||||
" fma.rn.ftz.f32 %f34, %f67, %f68, %f34;\n"
|
||||
"$Lt_1_24834:\n"
|
||||
" ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r52, 0;\n"
|
||||
" setp.le.s32 %p9, %r51, %r52;\n"
|
||||
" @%p9 bra $Lt_1_25346;\n"
|
||||
" .loc 16 176 0\n"
|
||||
" mov.f32 %f69, %f11;\n"
|
||||
" mul.ftz.f32 %f70, %f44, %f44;\n"
|
||||
" fma.rn.ftz.f32 %f71, %f60, %f70, %f69;\n"
|
||||
" mov.f32 %f11, %f71;\n"
|
||||
" .loc 16 177 0\n"
|
||||
" mov.f32 %f72, %f13;\n"
|
||||
" fma.rn.ftz.f32 %f73, %f60, %f46, %f72;\n"
|
||||
" mov.f32 %f13, %f73;\n"
|
||||
" .loc 16 178 0\n"
|
||||
" mov.f32 %f74, %f15;\n"
|
||||
" mul.ftz.f32 %f75, %f45, %f45;\n"
|
||||
" fma.rn.ftz.f32 %f76, %f60, %f75, %f74;\n"
|
||||
" mov.f32 %f15, %f76;\n"
|
||||
" .loc 16 179 0\n"
|
||||
" mov.f32 %f77, %f17;\n"
|
||||
" mul.ftz.f32 %f78, %f43, %f44;\n"
|
||||
" fma.rn.ftz.f32 %f79, %f60, %f78, %f77;\n"
|
||||
" mov.f32 %f17, %f79;\n"
|
||||
" .loc 16 180 0\n"
|
||||
" mov.f32 %f80, %f19;\n"
|
||||
" mul.ftz.f32 %f81, %f44, %f45;\n"
|
||||
" fma.rn.ftz.f32 %f82, %f60, %f81, %f80;\n"
|
||||
" mov.f32 %f19, %f82;\n"
|
||||
" .loc 16 181 0\n"
|
||||
" mul.ftz.f32 %f83, %f43, %f45;\n"
|
||||
" fma.rn.ftz.f32 %f20, %f60, %f83, %f20;\n"
|
||||
" mov.f32 %f21, %f20;\n"
|
||||
"$Lt_1_25346:\n"
|
||||
"$Lt_1_24322:\n"
|
||||
" .loc 16 146 0\n"
|
||||
" mul.lo.u64 %rd50, %rd42, 4;\n"
|
||||
" add.u64 %rd35, %rd35, %rd50;\n"
|
||||
" setp.lt.u64 %p10, %rd35, %rd34;\n"
|
||||
" @%p10 bra $Lt_1_24066;\n"
|
||||
" bra.uni $Lt_1_23554;\n"
|
||||
"$Lt_1_31746:\n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
" mov.f32 %f34, 0f00000000; \n"
|
||||
"$Lt_1_23554:\n"
|
||||
" mov.u32 %r53, 1;\n"
|
||||
" setp.le.s32 %p11, %r6, %r53;\n"
|
||||
" @%p11 bra $Lt_1_28162;\n"
|
||||
" .loc 16 186 0\n"
|
||||
" mov.u64 %rd51, __cuda___cuda_local_var_32687_55_non_const_red_acc7168;\n"
|
||||
" cvt.s64.s32 %rd52, %r1;\n"
|
||||
" mul.wide.s32 %rd53, %r1, 4;\n"
|
||||
" add.u64 %rd54, %rd51, %rd53;\n"
|
||||
" mov.f32 %f84, %f33;\n"
|
||||
" st.shared.f32 [%rd54+0], %f84;\n"
|
||||
" mov.f32 %f85, %f32;\n"
|
||||
" st.shared.f32 [%rd54+512], %f85;\n"
|
||||
" mov.f32 %f86, %f31;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f86;\n"
|
||||
" mov.f32 %f87, %f34;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f87;\n"
|
||||
" shr.s32 %r54, %r6, 31;\n"
|
||||
" mov.s32 %r55, 1;\n"
|
||||
" and.b32 %r56, %r54, %r55;\n"
|
||||
" add.s32 %r57, %r56, %r6;\n"
|
||||
" shr.s32 %r58, %r57, 1;\n"
|
||||
" mov.s32 %r59, %r58;\n"
|
||||
" mov.u32 %r60, 0;\n"
|
||||
" setp.ne.u32 %p12, %r58, %r60;\n"
|
||||
" @!%p12 bra $Lt_1_26626;\n"
|
||||
"$Lt_1_27138:\n"
|
||||
" setp.ge.u32 %p13, %r17, %r59;\n"
|
||||
" @%p13 bra $Lt_1_27394;\n"
|
||||
" add.u32 %r61, %r1, %r59;\n"
|
||||
" cvt.u64.u32 %rd55, %r61;\n"
|
||||
" mul.wide.u32 %rd56, %r61, 4;\n"
|
||||
" add.u64 %rd57, %rd51, %rd56;\n"
|
||||
" ld.shared.f32 %f88, [%rd57+0];\n"
|
||||
" add.ftz.f32 %f84, %f88, %f84;\n"
|
||||
" st.shared.f32 [%rd54+0], %f84;\n"
|
||||
" ld.shared.f32 %f89, [%rd57+512];\n"
|
||||
" add.ftz.f32 %f85, %f89, %f85;\n"
|
||||
" st.shared.f32 [%rd54+512], %f85;\n"
|
||||
" ld.shared.f32 %f90, [%rd57+1024];\n"
|
||||
" add.ftz.f32 %f86, %f90, %f86;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f86;\n"
|
||||
" ld.shared.f32 %f91, [%rd57+1536];\n"
|
||||
" add.ftz.f32 %f87, %f91, %f87;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f87;\n"
|
||||
"$Lt_1_27394:\n"
|
||||
" shr.u32 %r59, %r59, 1;\n"
|
||||
" mov.u32 %r62, 0;\n"
|
||||
" setp.ne.u32 %p14, %r59, %r62;\n"
|
||||
" @%p14 bra $Lt_1_27138;\n"
|
||||
"$Lt_1_26626:\n"
|
||||
" mov.f32 %f33, %f84;\n"
|
||||
" mov.f32 %f32, %f85;\n"
|
||||
" mov.f32 %f31, %f86;\n"
|
||||
" mov.f32 %f34, %f87;\n"
|
||||
" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r64, 0;\n"
|
||||
" setp.le.s32 %p15, %r63, %r64;\n"
|
||||
" @%p15 bra $Lt_1_28162;\n"
|
||||
" mov.f32 %f84, %f11;\n"
|
||||
" st.shared.f32 [%rd54+0], %f84;\n"
|
||||
" mov.f32 %f85, %f13;\n"
|
||||
" st.shared.f32 [%rd54+512], %f85;\n"
|
||||
" mov.f32 %f86, %f15;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f86;\n"
|
||||
" mov.f32 %f87, %f17;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f87;\n"
|
||||
" mov.f32 %f92, %f19;\n"
|
||||
" st.shared.f32 [%rd54+2048], %f92;\n"
|
||||
" mov.f32 %f93, %f20;\n"
|
||||
" st.shared.f32 [%rd54+2560], %f93;\n"
|
||||
" mov.s32 %r65, %r58;\n"
|
||||
" @!%p12 bra $Lt_1_28674;\n"
|
||||
"$Lt_1_29186:\n"
|
||||
" setp.ge.u32 %p16, %r17, %r65;\n"
|
||||
" @%p16 bra $Lt_1_29442;\n"
|
||||
" add.u32 %r66, %r1, %r65;\n"
|
||||
" cvt.u64.u32 %rd58, %r66;\n"
|
||||
" mul.wide.u32 %rd59, %r66, 4;\n"
|
||||
" add.u64 %rd60, %rd51, %rd59;\n"
|
||||
" ld.shared.f32 %f94, [%rd60+0];\n"
|
||||
" add.ftz.f32 %f84, %f94, %f84;\n"
|
||||
" st.shared.f32 [%rd54+0], %f84;\n"
|
||||
" ld.shared.f32 %f95, [%rd60+512];\n"
|
||||
" add.ftz.f32 %f85, %f95, %f85;\n"
|
||||
" st.shared.f32 [%rd54+512], %f85;\n"
|
||||
" ld.shared.f32 %f96, [%rd60+1024];\n"
|
||||
" add.ftz.f32 %f86, %f96, %f86;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f86;\n"
|
||||
" ld.shared.f32 %f97, [%rd60+1536];\n"
|
||||
" add.ftz.f32 %f87, %f97, %f87;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f87;\n"
|
||||
" ld.shared.f32 %f98, [%rd60+2048];\n"
|
||||
" add.ftz.f32 %f92, %f98, %f92;\n"
|
||||
" st.shared.f32 [%rd54+2048], %f92;\n"
|
||||
" ld.shared.f32 %f99, [%rd60+2560];\n"
|
||||
" add.ftz.f32 %f93, %f99, %f93;\n"
|
||||
" st.shared.f32 [%rd54+2560], %f93;\n"
|
||||
"$Lt_1_29442:\n"
|
||||
" shr.u32 %r65, %r65, 1;\n"
|
||||
" mov.u32 %r67, 0;\n"
|
||||
" setp.ne.u32 %p17, %r65, %r67;\n"
|
||||
" @%p17 bra $Lt_1_29186;\n"
|
||||
"$Lt_1_28674:\n"
|
||||
" mov.f32 %f11, %f84;\n"
|
||||
" mov.f32 %f13, %f85;\n"
|
||||
" mov.f32 %f15, %f86;\n"
|
||||
" mov.f32 %f17, %f87;\n"
|
||||
" mov.f32 %f19, %f92;\n"
|
||||
" mov.f32 %f21, %f93;\n"
|
||||
"$Lt_1_28162:\n"
|
||||
"$Lt_1_26114:\n"
|
||||
" mov.u32 %r68, 0;\n"
|
||||
" setp.ne.s32 %p18, %r17, %r68;\n"
|
||||
" @%p18 bra $Lt_1_30210;\n"
|
||||
" ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];\n"
|
||||
" add.u64 %rd62, %rd61, %rd20;\n"
|
||||
" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r70, 0;\n"
|
||||
" setp.le.s32 %p19, %r69, %r70;\n"
|
||||
" @%p19 bra $Lt_1_30722;\n"
|
||||
" st.global.f32 [%rd62+0], %f34;\n"
|
||||
" cvt.s64.s32 %rd63, %r13;\n"
|
||||
" mul.wide.s32 %rd64, %r13, 4;\n"
|
||||
" add.u64 %rd62, %rd62, %rd64;\n"
|
||||
"$Lt_1_30722:\n"
|
||||
" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r72, 0;\n"
|
||||
" setp.le.s32 %p20, %r71, %r72;\n"
|
||||
" @%p20 bra $Lt_1_31234;\n"
|
||||
" mov.f32 %f100, %f11;\n"
|
||||
" st.global.f32 [%rd62+0], %f100;\n"
|
||||
" cvt.s64.s32 %rd65, %r13;\n"
|
||||
" mul.wide.s32 %rd66, %r13, 4;\n"
|
||||
" add.u64 %rd67, %rd66, %rd62;\n"
|
||||
" mov.f32 %f101, %f13;\n"
|
||||
" st.global.f32 [%rd67+0], %f101;\n"
|
||||
" add.u64 %rd68, %rd66, %rd67;\n"
|
||||
" mov.f32 %f102, %f15;\n"
|
||||
" st.global.f32 [%rd68+0], %f102;\n"
|
||||
" add.u64 %rd69, %rd66, %rd68;\n"
|
||||
" mov.f32 %f103, %f17;\n"
|
||||
" st.global.f32 [%rd69+0], %f103;\n"
|
||||
" add.u64 %rd62, %rd66, %rd69;\n"
|
||||
" mov.f32 %f104, %f19;\n"
|
||||
" st.global.f32 [%rd62+0], %f104;\n"
|
||||
" mov.f32 %f105, %f21;\n"
|
||||
" add.u64 %rd70, %rd66, %rd62;\n"
|
||||
" st.global.f32 [%rd70+0], %f105;\n"
|
||||
"$Lt_1_31234:\n"
|
||||
" ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];\n"
|
||||
" mul.lo.u64 %rd72, %rd19, 16;\n"
|
||||
" add.u64 %rd73, %rd71, %rd72;\n"
|
||||
" mov.f32 %f106, %f107;\n"
|
||||
" st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f106};\n"
|
||||
"$Lt_1_30210:\n"
|
||||
"$Lt_1_22530:\n"
|
||||
" .loc 16 189 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair_fast:\n"
|
||||
" }\n"
|
||||
;
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
1056
lib/gpu/lj_coul.ptx
1056
lib/gpu/lj_coul.ptx
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,912 +0,0 @@
|
||||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_00009ccd_00000000-9_lal_lj_expand.cpp3.i (/home/sjplimp/ccBI#.06ur5E)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_00009ccd_00000000-8_lal_lj_expand.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "lal_lj_expand.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
.global .texref pos_tex;
|
||||
|
||||
.entry kernel_pair (
|
||||
.param .u64 __cudaparm_kernel_pair_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_lj1,
|
||||
.param .u64 __cudaparm_kernel_pair_lj3,
|
||||
.param .s32 __cudaparm_kernel_pair_lj_types,
|
||||
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_ans,
|
||||
.param .u64 __cudaparm_kernel_pair___val_paramengv,
|
||||
.param .s32 __cudaparm_kernel_pair_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<72>;
|
||||
.reg .u64 %rd<63>;
|
||||
.reg .f32 %f<107>;
|
||||
.reg .pred %p<19>;
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32603_55_non_const_red_acc108[3072];
|
||||
// __cuda_local_var_32543_10_non_const_f = 48
|
||||
// __cuda_local_var_32545_9_non_const_virial = 16
|
||||
.loc 16 31 0
|
||||
$LDWbegin_kernel_pair:
|
||||
.loc 16 36 0
|
||||
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
|
||||
ldu.global.f32 %f1, [%rd1+0];
|
||||
.loc 16 37 0
|
||||
ld.global.f32 %f2, [%rd1+4];
|
||||
.loc 16 38 0
|
||||
ld.global.f32 %f3, [%rd1+8];
|
||||
.loc 16 39 0
|
||||
ld.global.f32 %f4, [%rd1+12];
|
||||
st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
|
||||
.loc 16 46 0
|
||||
mov.f32 %f5, 0f00000000; // 0
|
||||
mov.f32 %f6, %f5;
|
||||
mov.f32 %f7, 0f00000000; // 0
|
||||
mov.f32 %f8, %f7;
|
||||
mov.f32 %f9, 0f00000000; // 0
|
||||
mov.f32 %f10, %f9;
|
||||
mov.f32 %f11, 0f00000000; // 0
|
||||
mov.f32 %f12, %f11;
|
||||
mov.f32 %f13, 0f00000000; // 0
|
||||
mov.f32 %f14, %f13;
|
||||
mov.f32 %f15, 0f00000000; // 0
|
||||
mov.f32 %f16, %f15;
|
||||
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
|
||||
cvt.s32.u32 %r2, %tid.x;
|
||||
div.s32 %r3, %r2, %r1;
|
||||
cvt.s32.u32 %r4, %ntid.x;
|
||||
div.s32 %r5, %r4, %r1;
|
||||
cvt.s32.u32 %r6, %ctaid.x;
|
||||
mul.lo.s32 %r7, %r6, %r5;
|
||||
add.s32 %r8, %r3, %r7;
|
||||
ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];
|
||||
setp.ge.s32 %p1, %r8, %r9;
|
||||
@%p1 bra $Lt_0_26370;
|
||||
.loc 16 51 0
|
||||
ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];
|
||||
cvt.s64.s32 %rd2, %r10;
|
||||
mul.wide.s32 %rd3, %r10, 4;
|
||||
cvt.s64.s32 %rd4, %r8;
|
||||
mul.wide.s32 %rd5, %r8, 4;
|
||||
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
|
||||
add.u64 %rd7, %rd5, %rd6;
|
||||
add.u64 %rd8, %rd3, %rd7;
|
||||
ld.global.s32 %r11, [%rd8+0];
|
||||
sub.s32 %r12, %r1, 1;
|
||||
and.b32 %r13, %r12, %r2;
|
||||
cvt.s64.s32 %rd9, %r13;
|
||||
mul.wide.s32 %rd10, %r13, 4;
|
||||
ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];
|
||||
setp.ne.u64 %p2, %rd11, %rd6;
|
||||
@%p2 bra $Lt_0_19458;
|
||||
cvt.s32.s64 %r14, %rd2;
|
||||
mul.lo.s32 %r15, %r14, %r1;
|
||||
mov.s32 %r16, %r15;
|
||||
mul.lo.s32 %r17, %r12, %r8;
|
||||
add.s32 %r18, %r14, %r17;
|
||||
cvt.s64.s32 %rd12, %r18;
|
||||
mul.wide.s32 %rd13, %r18, 4;
|
||||
add.u64 %rd14, %rd8, %rd13;
|
||||
and.b32 %r19, %r12, %r11;
|
||||
cvt.s64.s32 %rd15, %r19;
|
||||
div.s32 %r20, %r11, %r1;
|
||||
mul.lo.s32 %r21, %r15, %r20;
|
||||
cvt.s64.s32 %rd16, %r21;
|
||||
add.u64 %rd17, %rd15, %rd16;
|
||||
mul.lo.u64 %rd18, %rd17, 4;
|
||||
add.u64 %rd19, %rd14, %rd18;
|
||||
add.u64 %rd20, %rd10, %rd14;
|
||||
bra.uni $Lt_0_19202;
|
||||
$Lt_0_19458:
|
||||
add.u64 %rd21, %rd3, %rd8;
|
||||
ld.global.s32 %r22, [%rd21+0];
|
||||
cvt.s64.s32 %rd22, %r22;
|
||||
mul.wide.s32 %rd23, %r22, 4;
|
||||
add.u64 %rd24, %rd11, %rd23;
|
||||
cvt.s64.s32 %rd25, %r11;
|
||||
mul.wide.s32 %rd26, %r11, 4;
|
||||
add.u64 %rd19, %rd24, %rd26;
|
||||
mov.s32 %r16, %r1;
|
||||
add.u64 %rd20, %rd10, %rd24;
|
||||
$Lt_0_19202:
|
||||
.loc 16 54 0
|
||||
ld.global.s32 %r23, [%rd7+0];
|
||||
mov.u32 %r24, %r23;
|
||||
mov.s32 %r25, 0;
|
||||
mov.u32 %r26, %r25;
|
||||
mov.s32 %r27, 0;
|
||||
mov.u32 %r28, %r27;
|
||||
mov.s32 %r29, 0;
|
||||
mov.u32 %r30, %r29;
|
||||
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];
|
||||
mov.f32 %f21, %f17;
|
||||
mov.f32 %f22, %f18;
|
||||
mov.f32 %f23, %f19;
|
||||
mov.f32 %f24, %f20;
|
||||
setp.ge.u64 %p3, %rd20, %rd19;
|
||||
@%p3 bra $Lt_0_27906;
|
||||
cvt.rzi.ftz.s32.f32 %r31, %f24;
|
||||
cvt.s64.s32 %rd27, %r16;
|
||||
ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];
|
||||
mul.lo.s32 %r33, %r32, %r31;
|
||||
ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];
|
||||
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;
|
||||
$Lt_0_20226:
|
||||
//<loop> Loop body line 54, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 60 0
|
||||
ld.global.s32 %r34, [%rd20+0];
|
||||
.loc 16 61 0
|
||||
shr.s32 %r35, %r34, 30;
|
||||
and.b32 %r36, %r35, 3;
|
||||
cvt.s64.s32 %rd30, %r36;
|
||||
mul.wide.s32 %rd31, %r36, 4;
|
||||
add.u64 %rd32, %rd29, %rd31;
|
||||
ld.shared.f32 %f29, [%rd32+0];
|
||||
.loc 16 64 0
|
||||
and.b32 %r37, %r34, 1073741823;
|
||||
mov.u32 %r38, %r37;
|
||||
mov.s32 %r39, 0;
|
||||
mov.u32 %r40, %r39;
|
||||
mov.s32 %r41, 0;
|
||||
mov.u32 %r42, %r41;
|
||||
mov.s32 %r43, 0;
|
||||
mov.u32 %r44, %r43;
|
||||
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];
|
||||
mov.f32 %f34, %f30;
|
||||
mov.f32 %f35, %f31;
|
||||
mov.f32 %f36, %f32;
|
||||
mov.f32 %f37, %f33;
|
||||
cvt.rzi.ftz.s32.f32 %r45, %f37;
|
||||
sub.ftz.f32 %f38, %f22, %f35;
|
||||
sub.ftz.f32 %f39, %f21, %f34;
|
||||
sub.ftz.f32 %f40, %f23, %f36;
|
||||
mul.ftz.f32 %f41, %f38, %f38;
|
||||
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
|
||||
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
|
||||
add.s32 %r46, %r45, %r33;
|
||||
cvt.s64.s32 %rd33, %r46;
|
||||
mul.wide.s32 %rd34, %r46, 16;
|
||||
add.u64 %rd35, %rd34, %rd28;
|
||||
ld.global.f32 %f44, [%rd35+8];
|
||||
setp.gt.ftz.f32 %p4, %f44, %f43;
|
||||
@!%p4 bra $Lt_0_21506;
|
||||
.loc 16 76 0
|
||||
sqrt.approx.ftz.f32 %f45, %f43;
|
||||
ld.global.v4.f32 {%f46,%f47,_,%f48}, [%rd35+0];
|
||||
sub.ftz.f32 %f49, %f45, %f48;
|
||||
.loc 16 81 0
|
||||
mul.ftz.f32 %f50, %f49, %f49;
|
||||
rcp.approx.ftz.f32 %f51, %f50;
|
||||
mul.ftz.f32 %f52, %f51, %f51;
|
||||
mul.ftz.f32 %f53, %f51, %f52;
|
||||
div.approx.ftz.f32 %f54, %f29, %f49;
|
||||
div.approx.ftz.f32 %f55, %f54, %f45;
|
||||
mul.ftz.f32 %f56, %f46, %f53;
|
||||
sub.ftz.f32 %f57, %f56, %f47;
|
||||
mul.ftz.f32 %f58, %f53, %f57;
|
||||
mul.ftz.f32 %f59, %f55, %f58;
|
||||
.loc 16 83 0
|
||||
fma.rn.ftz.f32 %f27, %f39, %f59, %f27;
|
||||
.loc 16 84 0
|
||||
fma.rn.ftz.f32 %f26, %f38, %f59, %f26;
|
||||
.loc 16 85 0
|
||||
fma.rn.ftz.f32 %f25, %f40, %f59, %f25;
|
||||
ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];
|
||||
mov.u32 %r48, 0;
|
||||
setp.le.s32 %p5, %r47, %r48;
|
||||
@%p5 bra $Lt_0_20994;
|
||||
.loc 16 89 0
|
||||
ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];
|
||||
add.u64 %rd37, %rd36, %rd34;
|
||||
ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd37+0];
|
||||
mul.ftz.f32 %f63, %f60, %f53;
|
||||
sub.ftz.f32 %f64, %f63, %f61;
|
||||
mul.ftz.f32 %f65, %f53, %f64;
|
||||
sub.ftz.f32 %f66, %f65, %f62;
|
||||
fma.rn.ftz.f32 %f28, %f29, %f66, %f28;
|
||||
$Lt_0_20994:
|
||||
ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r50, 0;
|
||||
setp.le.s32 %p6, %r49, %r50;
|
||||
@%p6 bra $Lt_0_21506;
|
||||
.loc 16 92 0
|
||||
mov.f32 %f67, %f6;
|
||||
mul.ftz.f32 %f68, %f39, %f39;
|
||||
fma.rn.ftz.f32 %f69, %f59, %f68, %f67;
|
||||
mov.f32 %f6, %f69;
|
||||
.loc 16 93 0
|
||||
mov.f32 %f70, %f8;
|
||||
fma.rn.ftz.f32 %f71, %f59, %f41, %f70;
|
||||
mov.f32 %f8, %f71;
|
||||
.loc 16 94 0
|
||||
mov.f32 %f72, %f10;
|
||||
mul.ftz.f32 %f73, %f40, %f40;
|
||||
fma.rn.ftz.f32 %f74, %f59, %f73, %f72;
|
||||
mov.f32 %f10, %f74;
|
||||
.loc 16 95 0
|
||||
mov.f32 %f75, %f12;
|
||||
mul.ftz.f32 %f76, %f38, %f39;
|
||||
fma.rn.ftz.f32 %f77, %f59, %f76, %f75;
|
||||
mov.f32 %f12, %f77;
|
||||
.loc 16 96 0
|
||||
mov.f32 %f78, %f14;
|
||||
mul.ftz.f32 %f79, %f39, %f40;
|
||||
fma.rn.ftz.f32 %f80, %f59, %f79, %f78;
|
||||
mov.f32 %f14, %f80;
|
||||
.loc 16 97 0
|
||||
mul.ftz.f32 %f81, %f38, %f40;
|
||||
fma.rn.ftz.f32 %f15, %f59, %f81, %f15;
|
||||
mov.f32 %f16, %f15;
|
||||
$Lt_0_21506:
|
||||
$Lt_0_20482:
|
||||
.loc 16 58 0
|
||||
mul.lo.u64 %rd38, %rd27, 4;
|
||||
add.u64 %rd20, %rd20, %rd38;
|
||||
setp.lt.u64 %p7, %rd20, %rd19;
|
||||
@%p7 bra $Lt_0_20226;
|
||||
bra.uni $Lt_0_19714;
|
||||
$Lt_0_27906:
|
||||
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
$Lt_0_19714:
|
||||
mov.u32 %r51, 1;
|
||||
setp.le.s32 %p8, %r1, %r51;
|
||||
@%p8 bra $Lt_0_24322;
|
||||
.loc 16 102 0
|
||||
mov.u64 %rd39, __cuda___cuda_local_var_32603_55_non_const_red_acc108;
|
||||
cvt.s64.s32 %rd40, %r2;
|
||||
mul.wide.s32 %rd41, %r2, 4;
|
||||
add.u64 %rd42, %rd39, %rd41;
|
||||
mov.f32 %f82, %f27;
|
||||
st.shared.f32 [%rd42+0], %f82;
|
||||
mov.f32 %f83, %f26;
|
||||
st.shared.f32 [%rd42+512], %f83;
|
||||
mov.f32 %f84, %f25;
|
||||
st.shared.f32 [%rd42+1024], %f84;
|
||||
mov.f32 %f85, %f28;
|
||||
st.shared.f32 [%rd42+1536], %f85;
|
||||
shr.s32 %r52, %r1, 31;
|
||||
mov.s32 %r53, 1;
|
||||
and.b32 %r54, %r52, %r53;
|
||||
add.s32 %r55, %r54, %r1;
|
||||
shr.s32 %r56, %r55, 1;
|
||||
mov.s32 %r57, %r56;
|
||||
mov.u32 %r58, 0;
|
||||
setp.ne.u32 %p9, %r56, %r58;
|
||||
@!%p9 bra $Lt_0_22786;
|
||||
$Lt_0_23298:
|
||||
setp.ge.u32 %p10, %r13, %r57;
|
||||
@%p10 bra $Lt_0_23554;
|
||||
add.u32 %r59, %r2, %r57;
|
||||
cvt.u64.u32 %rd43, %r59;
|
||||
mul.wide.u32 %rd44, %r59, 4;
|
||||
add.u64 %rd45, %rd39, %rd44;
|
||||
ld.shared.f32 %f86, [%rd45+0];
|
||||
add.ftz.f32 %f82, %f86, %f82;
|
||||
st.shared.f32 [%rd42+0], %f82;
|
||||
ld.shared.f32 %f87, [%rd45+512];
|
||||
add.ftz.f32 %f83, %f87, %f83;
|
||||
st.shared.f32 [%rd42+512], %f83;
|
||||
ld.shared.f32 %f88, [%rd45+1024];
|
||||
add.ftz.f32 %f84, %f88, %f84;
|
||||
st.shared.f32 [%rd42+1024], %f84;
|
||||
ld.shared.f32 %f89, [%rd45+1536];
|
||||
add.ftz.f32 %f85, %f89, %f85;
|
||||
st.shared.f32 [%rd42+1536], %f85;
|
||||
$Lt_0_23554:
|
||||
shr.u32 %r57, %r57, 1;
|
||||
mov.u32 %r60, 0;
|
||||
setp.ne.u32 %p11, %r57, %r60;
|
||||
@%p11 bra $Lt_0_23298;
|
||||
$Lt_0_22786:
|
||||
mov.f32 %f27, %f82;
|
||||
mov.f32 %f26, %f83;
|
||||
mov.f32 %f25, %f84;
|
||||
mov.f32 %f28, %f85;
|
||||
ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r62, 0;
|
||||
setp.le.s32 %p12, %r61, %r62;
|
||||
@%p12 bra $Lt_0_24322;
|
||||
mov.f32 %f82, %f6;
|
||||
st.shared.f32 [%rd42+0], %f82;
|
||||
mov.f32 %f83, %f8;
|
||||
st.shared.f32 [%rd42+512], %f83;
|
||||
mov.f32 %f84, %f10;
|
||||
st.shared.f32 [%rd42+1024], %f84;
|
||||
mov.f32 %f85, %f12;
|
||||
st.shared.f32 [%rd42+1536], %f85;
|
||||
mov.f32 %f90, %f14;
|
||||
st.shared.f32 [%rd42+2048], %f90;
|
||||
mov.f32 %f91, %f15;
|
||||
st.shared.f32 [%rd42+2560], %f91;
|
||||
mov.s32 %r63, %r56;
|
||||
@!%p9 bra $Lt_0_24834;
|
||||
$Lt_0_25346:
|
||||
setp.ge.u32 %p13, %r13, %r63;
|
||||
@%p13 bra $Lt_0_25602;
|
||||
add.u32 %r64, %r2, %r63;
|
||||
cvt.u64.u32 %rd46, %r64;
|
||||
mul.wide.u32 %rd47, %r64, 4;
|
||||
add.u64 %rd48, %rd39, %rd47;
|
||||
ld.shared.f32 %f92, [%rd48+0];
|
||||
add.ftz.f32 %f82, %f92, %f82;
|
||||
st.shared.f32 [%rd42+0], %f82;
|
||||
ld.shared.f32 %f93, [%rd48+512];
|
||||
add.ftz.f32 %f83, %f93, %f83;
|
||||
st.shared.f32 [%rd42+512], %f83;
|
||||
ld.shared.f32 %f94, [%rd48+1024];
|
||||
add.ftz.f32 %f84, %f94, %f84;
|
||||
st.shared.f32 [%rd42+1024], %f84;
|
||||
ld.shared.f32 %f95, [%rd48+1536];
|
||||
add.ftz.f32 %f85, %f95, %f85;
|
||||
st.shared.f32 [%rd42+1536], %f85;
|
||||
ld.shared.f32 %f96, [%rd48+2048];
|
||||
add.ftz.f32 %f90, %f96, %f90;
|
||||
st.shared.f32 [%rd42+2048], %f90;
|
||||
ld.shared.f32 %f97, [%rd48+2560];
|
||||
add.ftz.f32 %f91, %f97, %f91;
|
||||
st.shared.f32 [%rd42+2560], %f91;
|
||||
$Lt_0_25602:
|
||||
shr.u32 %r63, %r63, 1;
|
||||
mov.u32 %r65, 0;
|
||||
setp.ne.u32 %p14, %r63, %r65;
|
||||
@%p14 bra $Lt_0_25346;
|
||||
$Lt_0_24834:
|
||||
mov.f32 %f6, %f82;
|
||||
mov.f32 %f8, %f83;
|
||||
mov.f32 %f10, %f84;
|
||||
mov.f32 %f12, %f85;
|
||||
mov.f32 %f14, %f90;
|
||||
mov.f32 %f16, %f91;
|
||||
$Lt_0_24322:
|
||||
$Lt_0_22274:
|
||||
mov.u32 %r66, 0;
|
||||
setp.ne.s32 %p15, %r13, %r66;
|
||||
@%p15 bra $Lt_0_26370;
|
||||
ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];
|
||||
add.u64 %rd50, %rd49, %rd5;
|
||||
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
|
||||
mov.u32 %r68, 0;
|
||||
setp.le.s32 %p16, %r67, %r68;
|
||||
@%p16 bra $Lt_0_26882;
|
||||
st.global.f32 [%rd50+0], %f28;
|
||||
cvt.s64.s32 %rd51, %r9;
|
||||
mul.wide.s32 %rd52, %r9, 4;
|
||||
add.u64 %rd50, %rd50, %rd52;
|
||||
$Lt_0_26882:
|
||||
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p17, %r69, %r70;
|
||||
@%p17 bra $Lt_0_27394;
|
||||
mov.f32 %f98, %f6;
|
||||
st.global.f32 [%rd50+0], %f98;
|
||||
cvt.s64.s32 %rd53, %r9;
|
||||
mul.wide.s32 %rd54, %r9, 4;
|
||||
add.u64 %rd55, %rd54, %rd50;
|
||||
mov.f32 %f99, %f8;
|
||||
st.global.f32 [%rd55+0], %f99;
|
||||
add.u64 %rd56, %rd54, %rd55;
|
||||
mov.f32 %f100, %f10;
|
||||
st.global.f32 [%rd56+0], %f100;
|
||||
add.u64 %rd57, %rd54, %rd56;
|
||||
mov.f32 %f101, %f12;
|
||||
st.global.f32 [%rd57+0], %f101;
|
||||
add.u64 %rd50, %rd54, %rd57;
|
||||
mov.f32 %f102, %f14;
|
||||
st.global.f32 [%rd50+0], %f102;
|
||||
mov.f32 %f103, %f16;
|
||||
add.u64 %rd58, %rd54, %rd50;
|
||||
st.global.f32 [%rd58+0], %f103;
|
||||
$Lt_0_27394:
|
||||
ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];
|
||||
mul.lo.u64 %rd60, %rd4, 16;
|
||||
add.u64 %rd61, %rd59, %rd60;
|
||||
mov.f32 %f104, %f105;
|
||||
st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f104};
|
||||
$Lt_0_26370:
|
||||
$Lt_0_18690:
|
||||
.loc 16 105 0
|
||||
exit;
|
||||
$LDWend_kernel_pair:
|
||||
} // kernel_pair
|
||||
|
||||
// kernel_pair_fast: pair kernel that first copies its coefficient tables
// (lj1_in, lj3_in, sp_lj_in) into shared memory and then, for one atom per
// group of t_per_atom threads, loops over that atom's neighbor entries.
// Reads : positions through the pos_tex texture, neighbor data through
//         dev_nbor / dev_packed.
// Writes: one v4.f32 per atom to ans, and to the engv array one float when
//         eflag > 0 followed by six floats when vflag > 0.
// NOTE(review): the x_ parameter is declared but this kernel contains no
// ld.param for it; positions are fetched from pos_tex instead.
.entry kernel_pair_fast (
|
||||
.param .u64 __cudaparm_kernel_pair_fast_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_ans,
|
||||
.param .u64 __cudaparm_kernel_pair_fast___val_paramengv,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<74>;
|
||||
.reg .u64 %rd<75>;
|
||||
.reg .f32 %f<114>;
|
||||
.reg .pred %p<22>;
|
||||
// sp_lj: 16 bytes = four f32 values, written by the threads with tid.x <= 3.
.shared .align 4 .b8 __cuda___cuda_local_var_32620_33_non_const_sp_lj3268[16];
|
||||
// lj1: 1936 bytes = 121 entries of 16 bytes, written by the threads with tid.x <= 120.
.shared .align 16 .b8 __cuda___cuda_local_var_32618_34_non_const_lj13296[1936];
|
||||
// lj3: same layout as lj1; it is only filled when eflag > 0.
.shared .align 16 .b8 __cuda___cuda_local_var_32619_34_non_const_lj35232[1936];
|
||||
// red_acc: 3072 bytes, addressed below as tid.x*4 plus plane offsets 0..2560 in steps of 512 bytes.
.shared .align 4 .b8 __cuda___cuda_local_var_32692_55_non_const_red_acc7168[3072];
|
||||
// __cuda_local_var_32630_10_non_const_f = 48
|
||||
// __cuda_local_var_32632_9_non_const_virial = 16
|
||||
.loc 16 113 0
|
||||
$LDWbegin_kernel_pair_fast:
|
||||
// %r1 = tid.x. Threads with tid.x <= 3 each copy one f32, sp_lj_in[tid.x], into the shared sp_lj array.
cvt.s32.u32 %r1, %tid.x;
|
||||
mov.u32 %r2, 3;
|
||||
setp.gt.s32 %p1, %r1, %r2;
|
||||
@%p1 bra $Lt_1_20994;
|
||||
.loc 16 121 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32620_33_non_const_sp_lj3268;
|
||||
cvt.s64.s32 %rd2, %r1;
|
||||
mul.wide.s32 %rd3, %r1, 4;
|
||||
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f32 %f1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f32 [%rd6+0], %f1;
|
||||
$Lt_1_20994:
|
||||
// %rd1 (shared sp_lj base) is set again here so it is defined on the path that skipped the copy.
mov.u64 %rd1, __cuda___cuda_local_var_32620_33_non_const_sp_lj3268;
|
||||
// Threads with tid.x <= 120 each copy one 16-byte entry (v4.f32) of lj1_in into shared lj1.
mov.u32 %r3, 120;
|
||||
setp.gt.s32 %p2, %r1, %r3;
|
||||
@%p2 bra $Lt_1_21506;
|
||||
.loc 16 123 0
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32618_34_non_const_lj13296;
|
||||
cvt.s64.s32 %rd8, %r1;
|
||||
// %rd9 = tid.x * 16: byte offset of this thread's entry in both the global and the shared tables.
mul.wide.s32 %rd9, %r1, 16;
|
||||
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
|
||||
add.u64 %rd11, %rd10, %rd9;
|
||||
add.u64 %rd12, %rd9, %rd7;
|
||||
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
|
||||
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
|
||||
// The matching lj3_in entry is copied only when eflag > 0.
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r5, 0;
|
||||
setp.le.s32 %p3, %r4, %r5;
|
||||
@%p3 bra $Lt_1_22018;
|
||||
.loc 16 125 0
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232;
|
||||
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
|
||||
add.u64 %rd15, %rd14, %rd9;
|
||||
add.u64 %rd16, %rd9, %rd13;
|
||||
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
|
||||
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
|
||||
$Lt_1_22018:
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232;
|
||||
$Lt_1_21506:
|
||||
// All staging paths rejoin here; set the shared base addresses on every path (%rd13 = lj3 table, %rd7 = lj1 table).
mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232;
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32618_34_non_const_lj13296;
|
||||
.loc 16 133 0
|
||||
// Zero %f11, %f13, %f15, %f17, %f19 and %f21 (%f20 is the working copy of the last one).
// These six sums are only updated in the neighbor loop when vflag > 0.
mov.f32 %f10, 0f00000000; // 0
|
||||
mov.f32 %f11, %f10;
|
||||
mov.f32 %f12, 0f00000000; // 0
|
||||
mov.f32 %f13, %f12;
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
mov.f32 %f15, %f14;
|
||||
mov.f32 %f16, 0f00000000; // 0
|
||||
mov.f32 %f17, %f16;
|
||||
mov.f32 %f18, 0f00000000; // 0
|
||||
mov.f32 %f19, %f18;
|
||||
mov.f32 %f20, 0f00000000; // 0
|
||||
mov.f32 %f21, %f20;
|
||||
.loc 16 135 0
|
||||
// Barrier 0 sits between the shared-memory table stores earlier in the kernel and the first reads of those tables.
bar.sync 0;
|
||||
// %r12 (called ii in the comments below) = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom).
// Threads with ii >= inum branch straight to the exit label.
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
|
||||
div.s32 %r7, %r1, %r6;
|
||||
cvt.s32.u32 %r8, %ntid.x;
|
||||
div.s32 %r9, %r8, %r6;
|
||||
cvt.s32.u32 %r10, %ctaid.x;
|
||||
mul.lo.s32 %r11, %r10, %r9;
|
||||
add.s32 %r12, %r7, %r11;
|
||||
ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];
|
||||
setp.ge.s32 %p4, %r12, %r13;
|
||||
@%p4 bra $Lt_1_30210;
|
||||
.loc 16 140 0
|
||||
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];
|
||||
cvt.s64.s32 %rd17, %r14;
|
||||
mul.wide.s32 %rd18, %r14, 4;
|
||||
cvt.s64.s32 %rd19, %r12;
|
||||
mul.wide.s32 %rd20, %r12, 4;
|
||||
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
|
||||
// %rd22 = &dev_nbor[ii]; %rd23 = %rd22 advanced by nbor_pitch ints.
add.u64 %rd22, %rd20, %rd21;
|
||||
add.u64 %rd23, %rd18, %rd22;
|
||||
// %r15 = dev_nbor[ii + nbor_pitch]; it determines the end of this atom's neighbor range below
// (presumably the neighbor count -- TODO confirm against the host-side layout).
ld.global.s32 %r15, [%rd23+0];
|
||||
// %r17 = tid.x & (t_per_atom - 1): this thread's lane inside its atom group.
// NOTE(review): this equals tid.x % t_per_atom only when t_per_atom is a power of two -- confirm callers guarantee that.
sub.s32 %r16, %r6, 1;
|
||||
and.b32 %r17, %r16, %r1;
|
||||
cvt.s64.s32 %rd24, %r17;
|
||||
mul.wide.s32 %rd25, %r17, 4;
|
||||
ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];
|
||||
setp.ne.u64 %p5, %rd26, %rd21;
|
||||
@%p5 bra $Lt_1_23298;
|
||||
// dev_packed == dev_nbor: the neighbor entries are read from dev_nbor itself.
// Loop stride %r20 = nbor_pitch * t_per_atom ints.
cvt.s32.s64 %r18, %rd17;
|
||||
mul.lo.s32 %r19, %r18, %r6;
|
||||
mov.s32 %r20, %r19;
|
||||
// %rd29 = %rd23 + 4 * (nbor_pitch + (t_per_atom - 1) * ii): base of this group's entries.
mul.lo.s32 %r21, %r16, %r12;
|
||||
add.s32 %r22, %r18, %r21;
|
||||
cvt.s64.s32 %rd27, %r22;
|
||||
mul.wide.s32 %rd28, %r22, 4;
|
||||
add.u64 %rd29, %rd23, %rd28;
|
||||
// %rd34 (end pointer) = %rd29 + 4 * ((%r15 & (t_per_atom - 1)) + nbor_pitch * t_per_atom * (%r15 / t_per_atom)).
and.b32 %r23, %r16, %r15;
|
||||
cvt.s64.s32 %rd30, %r23;
|
||||
div.s32 %r24, %r15, %r6;
|
||||
mul.lo.s32 %r25, %r19, %r24;
|
||||
cvt.s64.s32 %rd31, %r25;
|
||||
add.u64 %rd32, %rd30, %rd31;
|
||||
mul.lo.u64 %rd33, %rd32, 4;
|
||||
add.u64 %rd34, %rd29, %rd33;
|
||||
// %rd35 (this lane's first entry) = base + lane * 4.
add.u64 %rd35, %rd25, %rd29;
|
||||
bra.uni $Lt_1_23042;
|
||||
$Lt_1_23298:
|
||||
// dev_packed != dev_nbor: %r26 = dev_nbor[ii + 2 * nbor_pitch] is used as an int offset into dev_packed.
add.u64 %rd36, %rd18, %rd23;
|
||||
ld.global.s32 %r26, [%rd36+0];
|
||||
cvt.s64.s32 %rd37, %r26;
|
||||
mul.wide.s32 %rd38, %r26, 4;
|
||||
add.u64 %rd39, %rd26, %rd38;
|
||||
cvt.s64.s32 %rd40, %r15;
|
||||
mul.wide.s32 %rd41, %r15, 4;
|
||||
// %rd34 (end pointer) = &dev_packed[%r26] + 4 * %r15; loop stride %r20 = t_per_atom ints.
add.u64 %rd34, %rd39, %rd41;
|
||||
mov.s32 %r20, %r6;
|
||||
// %rd35 (this lane's first entry) = &dev_packed[%r26] + lane * 4.
add.u64 %rd35, %rd25, %rd39;
|
||||
$Lt_1_23042:
|
||||
.loc 16 143 0
|
||||
// %r27 = word at %rd22; it is used as the texel index for the central atom's fetch from pos_tex.
// The remaining three texture coordinates are zero.
ld.global.s32 %r27, [%rd22+0];
|
||||
mov.u32 %r28, %r27;
|
||||
mov.s32 %r29, 0;
|
||||
mov.u32 %r30, %r29;
|
||||
mov.s32 %r31, 0;
|
||||
mov.u32 %r32, %r31;
|
||||
mov.s32 %r33, 0;
|
||||
mov.u32 %r34, %r33;
|
||||
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];
|
||||
// %f26..%f29 = the four fetched components of the central atom.
mov.f32 %f26, %f22;
|
||||
mov.f32 %f27, %f23;
|
||||
mov.f32 %f28, %f24;
|
||||
mov.f32 %f29, %f25;
|
||||
// Empty range for this lane (start pointer >= end pointer): branch to the path that only zeroes the four sums.
setp.ge.u64 %p6, %rd35, %rd34;
|
||||
@%p6 bra $Lt_1_31746;
|
||||
// %f30 = float(11 * trunc(%f29)): row offset later added to the neighbor's fourth component to index the tables.
// 11 * 11 = 121 matches the 121-entry shared tables (presumably the fourth component is an atom type -- TODO confirm).
cvt.rzi.ftz.s32.f32 %r35, %f29;
|
||||
// %rd42 = loop stride in ints, sign-extended from %r20.
cvt.s64.s32 %rd42, %r20;
|
||||
mul.lo.s32 %r36, %r35, 11;
|
||||
cvt.rn.f32.s32 %f30, %r36;
|
||||
// %f31, %f32, %f33, %f34: per-lane running sums, cleared before the loop.
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
$Lt_1_24066:
|
||||
//<loop> Loop body line 143, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 150 0
|
||||
// %r37 = packed neighbor word read from the current position %rd35.
ld.global.s32 %r37, [%rd35+0];
|
||||
.loc 16 151 0
|
||||
// Bits 30-31 of the word select one of the four shared sp_lj factors; the selected factor lands in %f35.
shr.s32 %r38, %r37, 30;
|
||||
and.b32 %r39, %r38, 3;
|
||||
cvt.s64.s32 %rd43, %r39;
|
||||
mul.wide.s32 %rd44, %r39, 4;
|
||||
add.u64 %rd45, %rd1, %rd44;
|
||||
ld.shared.f32 %f35, [%rd45+0];
|
||||
.loc 16 154 0
|
||||
// 1073741823 = 0x3FFFFFFF: drop the two selector bits; the remainder is the texel index of the neighbor.
and.b32 %r40, %r37, 1073741823;
|
||||
mov.u32 %r41, %r40;
|
||||
mov.s32 %r42, 0;
|
||||
mov.u32 %r43, %r42;
|
||||
mov.s32 %r44, 0;
|
||||
mov.u32 %r45, %r44;
|
||||
mov.s32 %r46, 0;
|
||||
mov.u32 %r47, %r46;
|
||||
tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}];
|
||||
mov.f32 %f40, %f36;
|
||||
mov.f32 %f41, %f37;
|
||||
mov.f32 %f42, %f38;
|
||||
mov.f32 %f43, %f39;
|
||||
// Component differences central - neighbor: %f45 (first), %f44 (second), %f46 (third).
sub.ftz.f32 %f44, %f27, %f41;
|
||||
sub.ftz.f32 %f45, %f26, %f40;
|
||||
sub.ftz.f32 %f46, %f28, %f42;
|
||||
// %f49 = sum of the three squared differences; %f47 (second difference squared) is reused in the vflag block.
mul.ftz.f32 %f47, %f44, %f44;
|
||||
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
|
||||
fma.rn.ftz.f32 %f49, %f46, %f46, %f48;
|
||||
// Table index %r48 = trunc(%f30 + neighbor's fourth component); %rd47 = index * 16 bytes.
add.ftz.f32 %f50, %f30, %f43;
|
||||
cvt.rzi.ftz.s32.f32 %r48, %f50;
|
||||
cvt.s64.s32 %rd46, %r48;
|
||||
mul.wide.s32 %rd47, %r48, 16;
|
||||
add.u64 %rd48, %rd47, %rd7;
|
||||
// The third float of the lj1 entry (offset 8) is the threshold: the pair is skipped unless it is > %f49.
ld.shared.f32 %f51, [%rd48+8];
|
||||
setp.gt.ftz.f32 %p7, %f51, %f49;
|
||||
@!%p7 bra $Lt_1_25346;
|
||||
.loc 16 165 0
|
||||
// %f52 = sqrt(%f49); %f56 = %f52 - fourth float of the lj1 entry.
sqrt.approx.ftz.f32 %f52, %f49;
|
||||
ld.shared.v4.f32 {%f53,%f54,_,%f55}, [%rd48+0];
|
||||
sub.ftz.f32 %f56, %f52, %f55;
|
||||
.loc 16 169 0
|
||||
// %f60 = (1 / %f56^2)^3; %f63 = %f60 * (lj1[0] * %f60 - lj1[1]).
mul.ftz.f32 %f57, %f56, %f56;
|
||||
rcp.approx.ftz.f32 %f58, %f57;
|
||||
mul.ftz.f32 %f59, %f58, %f58;
|
||||
mul.ftz.f32 %f60, %f58, %f59;
|
||||
mul.ftz.f32 %f61, %f53, %f60;
|
||||
sub.ftz.f32 %f62, %f61, %f54;
|
||||
mul.ftz.f32 %f63, %f60, %f62;
|
||||
.loc 16 170 0
|
||||
// %f66 = %f63 * (%f35 / %f56 / %f52): the scalar multiplied into each component difference below.
div.approx.ftz.f32 %f64, %f35, %f56;
|
||||
div.approx.ftz.f32 %f65, %f64, %f52;
|
||||
mul.ftz.f32 %f66, %f63, %f65;
|
||||
.loc 16 172 0
|
||||
// %f33 / %f32 / %f31 += %f66 * (first / second / third) difference.
fma.rn.ftz.f32 %f33, %f45, %f66, %f33;
|
||||
.loc 16 173 0
|
||||
fma.rn.ftz.f32 %f32, %f44, %f66, %f32;
|
||||
.loc 16 174 0
|
||||
fma.rn.ftz.f32 %f31, %f46, %f66, %f31;
|
||||
// eflag > 0 only: %f34 += %f35 * (%f60 * (lj3[0] * %f60 - lj3[1]) - lj3[2]), using the lj3 entry at the same index.
ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r50, 0;
|
||||
setp.le.s32 %p8, %r49, %r50;
|
||||
@%p8 bra $Lt_1_24834;
|
||||
.loc 16 177 0
|
||||
add.u64 %rd49, %rd47, %rd13;
|
||||
ld.shared.v4.f32 {%f67,%f68,%f69,_}, [%rd49+0];
|
||||
mul.ftz.f32 %f70, %f67, %f60;
|
||||
sub.ftz.f32 %f71, %f70, %f68;
|
||||
mul.ftz.f32 %f72, %f60, %f71;
|
||||
.loc 16 178 0
|
||||
sub.ftz.f32 %f73, %f72, %f69;
|
||||
fma.rn.ftz.f32 %f34, %f35, %f73, %f34;
|
||||
$Lt_1_24834:
|
||||
// vflag > 0 only: add %f66 times the difference products to six sums.
// With d1/d2/d3 = first/second/third difference: %f11 += d1*d1, %f13 += d2*d2, %f15 += d3*d3,
// %f17 += d1*d2, %f19 += d1*d3, %f20 (copied to %f21) += d2*d3.
ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r52, 0;
|
||||
setp.le.s32 %p9, %r51, %r52;
|
||||
@%p9 bra $Lt_1_25346;
|
||||
.loc 16 181 0
|
||||
mov.f32 %f74, %f11;
|
||||
mul.ftz.f32 %f75, %f45, %f45;
|
||||
fma.rn.ftz.f32 %f76, %f66, %f75, %f74;
|
||||
mov.f32 %f11, %f76;
|
||||
.loc 16 182 0
|
||||
mov.f32 %f77, %f13;
|
||||
fma.rn.ftz.f32 %f78, %f66, %f47, %f77;
|
||||
mov.f32 %f13, %f78;
|
||||
.loc 16 183 0
|
||||
mov.f32 %f79, %f15;
|
||||
mul.ftz.f32 %f80, %f46, %f46;
|
||||
fma.rn.ftz.f32 %f81, %f66, %f80, %f79;
|
||||
mov.f32 %f15, %f81;
|
||||
.loc 16 184 0
|
||||
mov.f32 %f82, %f17;
|
||||
mul.ftz.f32 %f83, %f44, %f45;
|
||||
fma.rn.ftz.f32 %f84, %f66, %f83, %f82;
|
||||
mov.f32 %f17, %f84;
|
||||
.loc 16 185 0
|
||||
mov.f32 %f85, %f19;
|
||||
mul.ftz.f32 %f86, %f45, %f46;
|
||||
fma.rn.ftz.f32 %f87, %f66, %f86, %f85;
|
||||
mov.f32 %f19, %f87;
|
||||
.loc 16 186 0
|
||||
mul.ftz.f32 %f88, %f44, %f46;
|
||||
fma.rn.ftz.f32 %f20, %f66, %f88, %f20;
|
||||
mov.f32 %f21, %f20;
|
||||
$Lt_1_25346:
|
||||
$Lt_1_24322:
|
||||
.loc 16 148 0
|
||||
// Advance the entry pointer by %rd42 ints (4 bytes each) and repeat while it is still below the end pointer %rd34.
mul.lo.u64 %rd50, %rd42, 4;
|
||||
add.u64 %rd35, %rd35, %rd50;
|
||||
setp.lt.u64 %p10, %rd35, %rd34;
|
||||
@%p10 bra $Lt_1_24066;
|
||||
bra.uni $Lt_1_23554;
|
||||
$Lt_1_31746:
|
||||
// Reached when this lane's neighbor range was empty: the four sums are simply zero.
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
$Lt_1_23554:
|
||||
// The shared-memory reduction is skipped entirely when t_per_atom <= 1.
mov.u32 %r53, 1;
|
||||
setp.le.s32 %p11, %r6, %r53;
|
||||
@%p11 bra $Lt_1_28162;
|
||||
.loc 16 191 0
|
||||
// %rd54 = &red_acc[tid.x * 4 bytes]; the four sums go to planes 512 bytes apart (offsets 0, 512, 1024, 1536).
mov.u64 %rd51, __cuda___cuda_local_var_32692_55_non_const_red_acc7168;
|
||||
cvt.s64.s32 %rd52, %r1;
|
||||
mul.wide.s32 %rd53, %r1, 4;
|
||||
add.u64 %rd54, %rd51, %rd53;
|
||||
mov.f32 %f89, %f33;
|
||||
st.shared.f32 [%rd54+0], %f89;
|
||||
mov.f32 %f90, %f32;
|
||||
st.shared.f32 [%rd54+512], %f90;
|
||||
mov.f32 %f91, %f31;
|
||||
st.shared.f32 [%rd54+1024], %f91;
|
||||
mov.f32 %f92, %f34;
|
||||
st.shared.f32 [%rd54+1536], %f92;
|
||||
// %r58 = t_per_atom / 2 as a signed divide: the sign bit is added before the arithmetic shift so it rounds toward zero.
shr.s32 %r54, %r6, 31;
|
||||
mov.s32 %r55, 1;
|
||||
and.b32 %r56, %r54, %r55;
|
||||
add.s32 %r57, %r56, %r6;
|
||||
shr.s32 %r58, %r57, 1;
|
||||
// %r59 = current reduction step s; %p12 (s != 0) is reused by the later vflag reduction.
mov.s32 %r59, %r58;
|
||||
mov.u32 %r60, 0;
|
||||
setp.ne.u32 %p12, %r58, %r60;
|
||||
@!%p12 bra $Lt_1_26626;
|
||||
$Lt_1_27138:
|
||||
// Each step: lanes with %r17 < s add the four values stored by thread tid.x + s and store the new sums back.
// NOTE(review): there is no bar.sync between these shared stores and loads; the loop appears to rely on the
// threads of one atom group executing in lockstep -- confirm this holds for the targeted hardware.
setp.ge.u32 %p13, %r17, %r59;
|
||||
@%p13 bra $Lt_1_27394;
|
||||
add.u32 %r61, %r1, %r59;
|
||||
cvt.u64.u32 %rd55, %r61;
|
||||
mul.wide.u32 %rd56, %r61, 4;
|
||||
add.u64 %rd57, %rd51, %rd56;
|
||||
ld.shared.f32 %f93, [%rd57+0];
|
||||
add.ftz.f32 %f89, %f93, %f89;
|
||||
st.shared.f32 [%rd54+0], %f89;
|
||||
ld.shared.f32 %f94, [%rd57+512];
|
||||
add.ftz.f32 %f90, %f94, %f90;
|
||||
st.shared.f32 [%rd54+512], %f90;
|
||||
ld.shared.f32 %f95, [%rd57+1024];
|
||||
add.ftz.f32 %f91, %f95, %f91;
|
||||
st.shared.f32 [%rd54+1024], %f91;
|
||||
ld.shared.f32 %f96, [%rd57+1536];
|
||||
add.ftz.f32 %f92, %f96, %f92;
|
||||
st.shared.f32 [%rd54+1536], %f92;
|
||||
$Lt_1_27394:
|
||||
// Halve the step and repeat until it reaches zero.
shr.u32 %r59, %r59, 1;
|
||||
mov.u32 %r62, 0;
|
||||
setp.ne.u32 %p14, %r59, %r62;
|
||||
@%p14 bra $Lt_1_27138;
|
||||
$Lt_1_26626:
|
||||
// Copy the reduced values back into the registers that the write-out code reads (%f33, %f32, %f31, %f34).
mov.f32 %f33, %f89;
|
||||
mov.f32 %f32, %f90;
|
||||
mov.f32 %f31, %f91;
|
||||
mov.f32 %f34, %f92;
|
||||
// The second reduction below runs only when vflag > 0.
ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r64, 0;
|
||||
setp.le.s32 %p15, %r63, %r64;
|
||||
@%p15 bra $Lt_1_28162;
|
||||
// Store the six vflag sums (%f11, %f13, %f15, %f17, %f19, %f20) to red_acc planes at offsets 0..2560, 512 bytes apart.
mov.f32 %f89, %f11;
|
||||
st.shared.f32 [%rd54+0], %f89;
|
||||
mov.f32 %f90, %f13;
|
||||
st.shared.f32 [%rd54+512], %f90;
|
||||
mov.f32 %f91, %f15;
|
||||
st.shared.f32 [%rd54+1024], %f91;
|
||||
mov.f32 %f92, %f17;
|
||||
st.shared.f32 [%rd54+1536], %f92;
|
||||
mov.f32 %f97, %f19;
|
||||
st.shared.f32 [%rd54+2048], %f97;
|
||||
mov.f32 %f98, %f20;
|
||||
st.shared.f32 [%rd54+2560], %f98;
|
||||
// Restart the step counter at t_per_atom / 2 (%r58); %p12 still holds "%r58 != 0".
mov.s32 %r65, %r58;
|
||||
@!%p12 bra $Lt_1_28674;
|
||||
$Lt_1_29186:
|
||||
// Each step: lanes with %r17 < s add the six values stored by thread tid.x + s and store the new sums back.
// NOTE(review): as in the first reduction there is no bar.sync inside this loop -- confirm lockstep execution is guaranteed.
setp.ge.u32 %p16, %r17, %r65;
|
||||
@%p16 bra $Lt_1_29442;
|
||||
add.u32 %r66, %r1, %r65;
|
||||
cvt.u64.u32 %rd58, %r66;
|
||||
mul.wide.u32 %rd59, %r66, 4;
|
||||
add.u64 %rd60, %rd51, %rd59;
|
||||
ld.shared.f32 %f99, [%rd60+0];
|
||||
add.ftz.f32 %f89, %f99, %f89;
|
||||
st.shared.f32 [%rd54+0], %f89;
|
||||
ld.shared.f32 %f100, [%rd60+512];
|
||||
add.ftz.f32 %f90, %f100, %f90;
|
||||
st.shared.f32 [%rd54+512], %f90;
|
||||
ld.shared.f32 %f101, [%rd60+1024];
|
||||
add.ftz.f32 %f91, %f101, %f91;
|
||||
st.shared.f32 [%rd54+1024], %f91;
|
||||
ld.shared.f32 %f102, [%rd60+1536];
|
||||
add.ftz.f32 %f92, %f102, %f92;
|
||||
st.shared.f32 [%rd54+1536], %f92;
|
||||
ld.shared.f32 %f103, [%rd60+2048];
|
||||
add.ftz.f32 %f97, %f103, %f97;
|
||||
st.shared.f32 [%rd54+2048], %f97;
|
||||
ld.shared.f32 %f104, [%rd60+2560];
|
||||
add.ftz.f32 %f98, %f104, %f98;
|
||||
st.shared.f32 [%rd54+2560], %f98;
|
||||
$Lt_1_29442:
|
||||
// Halve the step and repeat until it reaches zero.
shr.u32 %r65, %r65, 1;
|
||||
mov.u32 %r67, 0;
|
||||
setp.ne.u32 %p17, %r65, %r67;
|
||||
@%p17 bra $Lt_1_29186;
|
||||
$Lt_1_28674:
|
||||
// Copy the six reduced values back; note the last one lands in %f21, which is the register the write-out stores.
mov.f32 %f11, %f89;
|
||||
mov.f32 %f13, %f90;
|
||||
mov.f32 %f15, %f91;
|
||||
mov.f32 %f17, %f92;
|
||||
mov.f32 %f19, %f97;
|
||||
mov.f32 %f21, %f98;
|
||||
$Lt_1_28162:
|
||||
$Lt_1_26114:
|
||||
// Only lane 0 of each atom group (%r17 == 0) writes results; every other lane goes to the exit label.
mov.u32 %r68, 0;
|
||||
setp.ne.s32 %p18, %r17, %r68;
|
||||
@%p18 bra $Lt_1_30210;
|
||||
// %rd62 = engv base + %rd20 (ii * 4 bytes): running output pointer into the engv array.
ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];
|
||||
add.u64 %rd62, %rd61, %rd20;
|
||||
// eflag > 0: store %f34, then advance the output pointer by inum floats.
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p19, %r69, %r70;
|
||||
@%p19 bra $Lt_1_30722;
|
||||
st.global.f32 [%rd62+0], %f34;
|
||||
cvt.s64.s32 %rd63, %r13;
|
||||
mul.wide.s32 %rd64, %r13, 4;
|
||||
add.u64 %rd62, %rd62, %rd64;
|
||||
$Lt_1_30722:
|
||||
// vflag > 0: store %f11, %f13, %f15, %f17, %f19, %f21 in that order, each inum floats (%rd66 bytes) after the previous one.
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r72, 0;
|
||||
setp.le.s32 %p20, %r71, %r72;
|
||||
@%p20 bra $Lt_1_31234;
|
||||
mov.f32 %f105, %f11;
|
||||
st.global.f32 [%rd62+0], %f105;
|
||||
cvt.s64.s32 %rd65, %r13;
|
||||
mul.wide.s32 %rd66, %r13, 4;
|
||||
add.u64 %rd67, %rd66, %rd62;
|
||||
mov.f32 %f106, %f13;
|
||||
st.global.f32 [%rd67+0], %f106;
|
||||
add.u64 %rd68, %rd66, %rd67;
|
||||
mov.f32 %f107, %f15;
|
||||
st.global.f32 [%rd68+0], %f107;
|
||||
add.u64 %rd69, %rd66, %rd68;
|
||||
mov.f32 %f108, %f17;
|
||||
st.global.f32 [%rd69+0], %f108;
|
||||
add.u64 %rd62, %rd66, %rd69;
|
||||
mov.f32 %f109, %f19;
|
||||
st.global.f32 [%rd62+0], %f109;
|
||||
mov.f32 %f110, %f21;
|
||||
add.u64 %rd70, %rd66, %rd62;
|
||||
st.global.f32 [%rd70+0], %f110;
|
||||
$Lt_1_31234:
|
||||
// %rd73 = ans base + ii * 16 bytes: one v4.f32 slot per atom, filled with {%f33, %f32, %f31, %f111}.
ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];
|
||||
mul.lo.u64 %rd72, %rd19, 16;
|
||||
add.u64 %rd73, %rd71, %rd72;
|
||||
// NOTE(review): %f112 is never written anywhere in this kernel, so the fourth component stored below is undefined.
// Confirm that no consumer of ans reads that component.
mov.f32 %f111, %f112;
|
||||
st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f111};
|
||||
$Lt_1_30210:
|
||||
$Lt_1_22530:
|
||||
// Common exit: reached by threads whose atom index is >= inum, by lanes other than lane 0, and by lane 0 after its stores.
.loc 16 194 0
|
||||
exit;
|
||||
$LDWend_kernel_pair_fast:
|
||||
} // kernel_pair_fast
|
||||
|
||||
@ -1,860 +0,0 @@
|
||||
const char * lj_expand =
|
||||
" .version 2.3\n"
|
||||
" .target sm_20\n"
|
||||
" .address_size 64\n"
|
||||
" .global .texref pos_tex;\n"
|
||||
" .entry kernel_pair (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<72>;\n"
|
||||
" .reg .u64 %rd<63>;\n"
|
||||
" .reg .f32 %f<107>;\n"
|
||||
" .reg .pred %p<19>;\n"
|
||||
" .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32603_55_non_const_red_acc108[3072];\n"
|
||||
" .loc 16 31 0\n"
|
||||
"$LDWbegin_kernel_pair:\n"
|
||||
" .loc 16 36 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
|
||||
" ldu.global.f32 %f1, [%rd1+0];\n"
|
||||
" .loc 16 37 0\n"
|
||||
" ld.global.f32 %f2, [%rd1+4];\n"
|
||||
" .loc 16 38 0\n"
|
||||
" ld.global.f32 %f3, [%rd1+8];\n"
|
||||
" .loc 16 39 0\n"
|
||||
" ld.global.f32 %f4, [%rd1+12];\n"
|
||||
" st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n"
|
||||
" .loc 16 46 0\n"
|
||||
" mov.f32 %f5, 0f00000000; \n"
|
||||
" mov.f32 %f6, %f5;\n"
|
||||
" mov.f32 %f7, 0f00000000; \n"
|
||||
" mov.f32 %f8, %f7;\n"
|
||||
" mov.f32 %f9, 0f00000000; \n"
|
||||
" mov.f32 %f10, %f9;\n"
|
||||
" mov.f32 %f11, 0f00000000; \n"
|
||||
" mov.f32 %f12, %f11;\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, %f13;\n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
|
||||
" cvt.s32.u32 %r2, %tid.x;\n"
|
||||
" div.s32 %r3, %r2, %r1;\n"
|
||||
" cvt.s32.u32 %r4, %ntid.x;\n"
|
||||
" div.s32 %r5, %r4, %r1;\n"
|
||||
" cvt.s32.u32 %r6, %ctaid.x;\n"
|
||||
" mul.lo.s32 %r7, %r6, %r5;\n"
|
||||
" add.s32 %r8, %r3, %r7;\n"
|
||||
" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n"
|
||||
" setp.ge.s32 %p1, %r8, %r9;\n"
|
||||
" @%p1 bra $Lt_0_26370;\n"
|
||||
" .loc 16 51 0\n"
|
||||
" ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];\n"
|
||||
" cvt.s64.s32 %rd2, %r10;\n"
|
||||
" mul.wide.s32 %rd3, %r10, 4;\n"
|
||||
" cvt.s64.s32 %rd4, %r8;\n"
|
||||
" mul.wide.s32 %rd5, %r8, 4;\n"
|
||||
" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n"
|
||||
" add.u64 %rd7, %rd5, %rd6;\n"
|
||||
" add.u64 %rd8, %rd3, %rd7;\n"
|
||||
" ld.global.s32 %r11, [%rd8+0];\n"
|
||||
" sub.s32 %r12, %r1, 1;\n"
|
||||
" and.b32 %r13, %r12, %r2;\n"
|
||||
" cvt.s64.s32 %rd9, %r13;\n"
|
||||
" mul.wide.s32 %rd10, %r13, 4;\n"
|
||||
" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n"
|
||||
" setp.ne.u64 %p2, %rd11, %rd6;\n"
|
||||
" @%p2 bra $Lt_0_19458;\n"
|
||||
" cvt.s32.s64 %r14, %rd2;\n"
|
||||
" mul.lo.s32 %r15, %r14, %r1;\n"
|
||||
" mov.s32 %r16, %r15;\n"
|
||||
" mul.lo.s32 %r17, %r12, %r8;\n"
|
||||
" add.s32 %r18, %r14, %r17;\n"
|
||||
" cvt.s64.s32 %rd12, %r18;\n"
|
||||
" mul.wide.s32 %rd13, %r18, 4;\n"
|
||||
" add.u64 %rd14, %rd8, %rd13;\n"
|
||||
" and.b32 %r19, %r12, %r11;\n"
|
||||
" cvt.s64.s32 %rd15, %r19;\n"
|
||||
" div.s32 %r20, %r11, %r1;\n"
|
||||
" mul.lo.s32 %r21, %r15, %r20;\n"
|
||||
" cvt.s64.s32 %rd16, %r21;\n"
|
||||
" add.u64 %rd17, %rd15, %rd16;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" add.u64 %rd19, %rd14, %rd18;\n"
|
||||
" add.u64 %rd20, %rd10, %rd14;\n"
|
||||
" bra.uni $Lt_0_19202;\n"
|
||||
"$Lt_0_19458:\n"
|
||||
" add.u64 %rd21, %rd3, %rd8;\n"
|
||||
" ld.global.s32 %r22, [%rd21+0];\n"
|
||||
" cvt.s64.s32 %rd22, %r22;\n"
|
||||
" mul.wide.s32 %rd23, %r22, 4;\n"
|
||||
" add.u64 %rd24, %rd11, %rd23;\n"
|
||||
" cvt.s64.s32 %rd25, %r11;\n"
|
||||
" mul.wide.s32 %rd26, %r11, 4;\n"
|
||||
" add.u64 %rd19, %rd24, %rd26;\n"
|
||||
" mov.s32 %r16, %r1;\n"
|
||||
" add.u64 %rd20, %rd10, %rd24;\n"
|
||||
"$Lt_0_19202:\n"
|
||||
" .loc 16 54 0\n"
|
||||
" ld.global.s32 %r23, [%rd7+0];\n"
|
||||
" mov.u32 %r24, %r23;\n"
|
||||
" mov.s32 %r25, 0;\n"
|
||||
" mov.u32 %r26, %r25;\n"
|
||||
" mov.s32 %r27, 0;\n"
|
||||
" mov.u32 %r28, %r27;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.u32 %r30, %r29;\n"
|
||||
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n"
|
||||
" mov.f32 %f21, %f17;\n"
|
||||
" mov.f32 %f22, %f18;\n"
|
||||
" mov.f32 %f23, %f19;\n"
|
||||
" mov.f32 %f24, %f20;\n"
|
||||
" setp.ge.u64 %p3, %rd20, %rd19;\n"
|
||||
" @%p3 bra $Lt_0_27906;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r31, %f24;\n"
|
||||
" cvt.s64.s32 %rd27, %r16;\n"
|
||||
" ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];\n"
|
||||
" mul.lo.s32 %r33, %r32, %r31;\n"
|
||||
" ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];\n"
|
||||
" mov.f32 %f25, 0f00000000; \n"
|
||||
" mov.f32 %f26, 0f00000000; \n"
|
||||
" mov.f32 %f27, 0f00000000; \n"
|
||||
" mov.f32 %f28, 0f00000000; \n"
|
||||
" mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;\n"
|
||||
"$Lt_0_20226:\n"
|
||||
" .loc 16 60 0\n"
|
||||
" ld.global.s32 %r34, [%rd20+0];\n"
|
||||
" .loc 16 61 0\n"
|
||||
" shr.s32 %r35, %r34, 30;\n"
|
||||
" and.b32 %r36, %r35, 3;\n"
|
||||
" cvt.s64.s32 %rd30, %r36;\n"
|
||||
" mul.wide.s32 %rd31, %r36, 4;\n"
|
||||
" add.u64 %rd32, %rd29, %rd31;\n"
|
||||
" ld.shared.f32 %f29, [%rd32+0];\n"
|
||||
" .loc 16 64 0\n"
|
||||
" and.b32 %r37, %r34, 1073741823;\n"
|
||||
" mov.u32 %r38, %r37;\n"
|
||||
" mov.s32 %r39, 0;\n"
|
||||
" mov.u32 %r40, %r39;\n"
|
||||
" mov.s32 %r41, 0;\n"
|
||||
" mov.u32 %r42, %r41;\n"
|
||||
" mov.s32 %r43, 0;\n"
|
||||
" mov.u32 %r44, %r43;\n"
|
||||
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];\n"
|
||||
" mov.f32 %f34, %f30;\n"
|
||||
" mov.f32 %f35, %f31;\n"
|
||||
" mov.f32 %f36, %f32;\n"
|
||||
" mov.f32 %f37, %f33;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r45, %f37;\n"
|
||||
" sub.ftz.f32 %f38, %f22, %f35;\n"
|
||||
" sub.ftz.f32 %f39, %f21, %f34;\n"
|
||||
" sub.ftz.f32 %f40, %f23, %f36;\n"
|
||||
" mul.ftz.f32 %f41, %f38, %f38;\n"
|
||||
" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n"
|
||||
" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n"
|
||||
" add.s32 %r46, %r45, %r33;\n"
|
||||
" cvt.s64.s32 %rd33, %r46;\n"
|
||||
" mul.wide.s32 %rd34, %r46, 16;\n"
|
||||
" add.u64 %rd35, %rd34, %rd28;\n"
|
||||
" ld.global.f32 %f44, [%rd35+8];\n"
|
||||
" setp.gt.ftz.f32 %p4, %f44, %f43;\n"
|
||||
" @!%p4 bra $Lt_0_21506;\n"
|
||||
" .loc 16 76 0\n"
|
||||
" sqrt.approx.ftz.f32 %f45, %f43;\n"
|
||||
" ld.global.v4.f32 {%f46,%f47,_,%f48}, [%rd35+0];\n"
|
||||
" sub.ftz.f32 %f49, %f45, %f48;\n"
|
||||
" .loc 16 81 0\n"
|
||||
" mul.ftz.f32 %f50, %f49, %f49;\n"
|
||||
" rcp.approx.ftz.f32 %f51, %f50;\n"
|
||||
" mul.ftz.f32 %f52, %f51, %f51;\n"
|
||||
" mul.ftz.f32 %f53, %f51, %f52;\n"
|
||||
" div.approx.ftz.f32 %f54, %f29, %f49;\n"
|
||||
" div.approx.ftz.f32 %f55, %f54, %f45;\n"
|
||||
" mul.ftz.f32 %f56, %f46, %f53;\n"
|
||||
" sub.ftz.f32 %f57, %f56, %f47;\n"
|
||||
" mul.ftz.f32 %f58, %f53, %f57;\n"
|
||||
" mul.ftz.f32 %f59, %f55, %f58;\n"
|
||||
" .loc 16 83 0\n"
|
||||
" fma.rn.ftz.f32 %f27, %f39, %f59, %f27;\n"
|
||||
" .loc 16 84 0\n"
|
||||
" fma.rn.ftz.f32 %f26, %f38, %f59, %f26;\n"
|
||||
" .loc 16 85 0\n"
|
||||
" fma.rn.ftz.f32 %f25, %f40, %f59, %f25;\n"
|
||||
" ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" mov.u32 %r48, 0;\n"
|
||||
" setp.le.s32 %p5, %r47, %r48;\n"
|
||||
" @%p5 bra $Lt_0_20994;\n"
|
||||
" .loc 16 89 0\n"
|
||||
" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];\n"
|
||||
" add.u64 %rd37, %rd36, %rd34;\n"
|
||||
" ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd37+0];\n"
|
||||
" mul.ftz.f32 %f63, %f60, %f53;\n"
|
||||
" sub.ftz.f32 %f64, %f63, %f61;\n"
|
||||
" mul.ftz.f32 %f65, %f53, %f64;\n"
|
||||
" sub.ftz.f32 %f66, %f65, %f62;\n"
|
||||
" fma.rn.ftz.f32 %f28, %f29, %f66, %f28;\n"
|
||||
"$Lt_0_20994:\n"
|
||||
" ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r50, 0;\n"
|
||||
" setp.le.s32 %p6, %r49, %r50;\n"
|
||||
" @%p6 bra $Lt_0_21506;\n"
|
||||
" .loc 16 92 0\n"
|
||||
" mov.f32 %f67, %f6;\n"
|
||||
" mul.ftz.f32 %f68, %f39, %f39;\n"
|
||||
" fma.rn.ftz.f32 %f69, %f59, %f68, %f67;\n"
|
||||
" mov.f32 %f6, %f69;\n"
|
||||
" .loc 16 93 0\n"
|
||||
" mov.f32 %f70, %f8;\n"
|
||||
" fma.rn.ftz.f32 %f71, %f59, %f41, %f70;\n"
|
||||
" mov.f32 %f8, %f71;\n"
|
||||
" .loc 16 94 0\n"
|
||||
" mov.f32 %f72, %f10;\n"
|
||||
" mul.ftz.f32 %f73, %f40, %f40;\n"
|
||||
" fma.rn.ftz.f32 %f74, %f59, %f73, %f72;\n"
|
||||
" mov.f32 %f10, %f74;\n"
|
||||
" .loc 16 95 0\n"
|
||||
" mov.f32 %f75, %f12;\n"
|
||||
" mul.ftz.f32 %f76, %f38, %f39;\n"
|
||||
" fma.rn.ftz.f32 %f77, %f59, %f76, %f75;\n"
|
||||
" mov.f32 %f12, %f77;\n"
|
||||
" .loc 16 96 0\n"
|
||||
" mov.f32 %f78, %f14;\n"
|
||||
" mul.ftz.f32 %f79, %f39, %f40;\n"
|
||||
" fma.rn.ftz.f32 %f80, %f59, %f79, %f78;\n"
|
||||
" mov.f32 %f14, %f80;\n"
|
||||
" .loc 16 97 0\n"
|
||||
" mul.ftz.f32 %f81, %f38, %f40;\n"
|
||||
" fma.rn.ftz.f32 %f15, %f59, %f81, %f15;\n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
"$Lt_0_21506:\n"
|
||||
"$Lt_0_20482:\n"
|
||||
" .loc 16 58 0\n"
|
||||
" mul.lo.u64 %rd38, %rd27, 4;\n"
|
||||
" add.u64 %rd20, %rd20, %rd38;\n"
|
||||
" setp.lt.u64 %p7, %rd20, %rd19;\n"
|
||||
" @%p7 bra $Lt_0_20226;\n"
|
||||
" bra.uni $Lt_0_19714;\n"
|
||||
"$Lt_0_27906:\n"
|
||||
" mov.f32 %f25, 0f00000000; \n"
|
||||
" mov.f32 %f26, 0f00000000; \n"
|
||||
" mov.f32 %f27, 0f00000000; \n"
|
||||
" mov.f32 %f28, 0f00000000; \n"
|
||||
"$Lt_0_19714:\n"
|
||||
" mov.u32 %r51, 1;\n"
|
||||
" setp.le.s32 %p8, %r1, %r51;\n"
|
||||
" @%p8 bra $Lt_0_24322;\n"
|
||||
" .loc 16 102 0\n"
|
||||
" mov.u64 %rd39, __cuda___cuda_local_var_32603_55_non_const_red_acc108;\n"
|
||||
" cvt.s64.s32 %rd40, %r2;\n"
|
||||
" mul.wide.s32 %rd41, %r2, 4;\n"
|
||||
" add.u64 %rd42, %rd39, %rd41;\n"
|
||||
" mov.f32 %f82, %f27;\n"
|
||||
" st.shared.f32 [%rd42+0], %f82;\n"
|
||||
" mov.f32 %f83, %f26;\n"
|
||||
" st.shared.f32 [%rd42+512], %f83;\n"
|
||||
" mov.f32 %f84, %f25;\n"
|
||||
" st.shared.f32 [%rd42+1024], %f84;\n"
|
||||
" mov.f32 %f85, %f28;\n"
|
||||
" st.shared.f32 [%rd42+1536], %f85;\n"
|
||||
" shr.s32 %r52, %r1, 31;\n"
|
||||
" mov.s32 %r53, 1;\n"
|
||||
" and.b32 %r54, %r52, %r53;\n"
|
||||
" add.s32 %r55, %r54, %r1;\n"
|
||||
" shr.s32 %r56, %r55, 1;\n"
|
||||
" mov.s32 %r57, %r56;\n"
|
||||
" mov.u32 %r58, 0;\n"
|
||||
" setp.ne.u32 %p9, %r56, %r58;\n"
|
||||
" @!%p9 bra $Lt_0_22786;\n"
|
||||
"$Lt_0_23298:\n"
|
||||
" setp.ge.u32 %p10, %r13, %r57;\n"
|
||||
" @%p10 bra $Lt_0_23554;\n"
|
||||
" add.u32 %r59, %r2, %r57;\n"
|
||||
" cvt.u64.u32 %rd43, %r59;\n"
|
||||
" mul.wide.u32 %rd44, %r59, 4;\n"
|
||||
" add.u64 %rd45, %rd39, %rd44;\n"
|
||||
" ld.shared.f32 %f86, [%rd45+0];\n"
|
||||
" add.ftz.f32 %f82, %f86, %f82;\n"
|
||||
" st.shared.f32 [%rd42+0], %f82;\n"
|
||||
" ld.shared.f32 %f87, [%rd45+512];\n"
|
||||
" add.ftz.f32 %f83, %f87, %f83;\n"
|
||||
" st.shared.f32 [%rd42+512], %f83;\n"
|
||||
" ld.shared.f32 %f88, [%rd45+1024];\n"
|
||||
" add.ftz.f32 %f84, %f88, %f84;\n"
|
||||
" st.shared.f32 [%rd42+1024], %f84;\n"
|
||||
" ld.shared.f32 %f89, [%rd45+1536];\n"
|
||||
" add.ftz.f32 %f85, %f89, %f85;\n"
|
||||
" st.shared.f32 [%rd42+1536], %f85;\n"
|
||||
"$Lt_0_23554:\n"
|
||||
" shr.u32 %r57, %r57, 1;\n"
|
||||
" mov.u32 %r60, 0;\n"
|
||||
" setp.ne.u32 %p11, %r57, %r60;\n"
|
||||
" @%p11 bra $Lt_0_23298;\n"
|
||||
"$Lt_0_22786:\n"
|
||||
" mov.f32 %f27, %f82;\n"
|
||||
" mov.f32 %f26, %f83;\n"
|
||||
" mov.f32 %f25, %f84;\n"
|
||||
" mov.f32 %f28, %f85;\n"
|
||||
" ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r62, 0;\n"
|
||||
" setp.le.s32 %p12, %r61, %r62;\n"
|
||||
" @%p12 bra $Lt_0_24322;\n"
|
||||
" mov.f32 %f82, %f6;\n"
|
||||
" st.shared.f32 [%rd42+0], %f82;\n"
|
||||
" mov.f32 %f83, %f8;\n"
|
||||
" st.shared.f32 [%rd42+512], %f83;\n"
|
||||
" mov.f32 %f84, %f10;\n"
|
||||
" st.shared.f32 [%rd42+1024], %f84;\n"
|
||||
" mov.f32 %f85, %f12;\n"
|
||||
" st.shared.f32 [%rd42+1536], %f85;\n"
|
||||
" mov.f32 %f90, %f14;\n"
|
||||
" st.shared.f32 [%rd42+2048], %f90;\n"
|
||||
" mov.f32 %f91, %f15;\n"
|
||||
" st.shared.f32 [%rd42+2560], %f91;\n"
|
||||
" mov.s32 %r63, %r56;\n"
|
||||
" @!%p9 bra $Lt_0_24834;\n"
|
||||
"$Lt_0_25346:\n"
|
||||
" setp.ge.u32 %p13, %r13, %r63;\n"
|
||||
" @%p13 bra $Lt_0_25602;\n"
|
||||
" add.u32 %r64, %r2, %r63;\n"
|
||||
" cvt.u64.u32 %rd46, %r64;\n"
|
||||
" mul.wide.u32 %rd47, %r64, 4;\n"
|
||||
" add.u64 %rd48, %rd39, %rd47;\n"
|
||||
" ld.shared.f32 %f92, [%rd48+0];\n"
|
||||
" add.ftz.f32 %f82, %f92, %f82;\n"
|
||||
" st.shared.f32 [%rd42+0], %f82;\n"
|
||||
" ld.shared.f32 %f93, [%rd48+512];\n"
|
||||
" add.ftz.f32 %f83, %f93, %f83;\n"
|
||||
" st.shared.f32 [%rd42+512], %f83;\n"
|
||||
" ld.shared.f32 %f94, [%rd48+1024];\n"
|
||||
" add.ftz.f32 %f84, %f94, %f84;\n"
|
||||
" st.shared.f32 [%rd42+1024], %f84;\n"
|
||||
" ld.shared.f32 %f95, [%rd48+1536];\n"
|
||||
" add.ftz.f32 %f85, %f95, %f85;\n"
|
||||
" st.shared.f32 [%rd42+1536], %f85;\n"
|
||||
" ld.shared.f32 %f96, [%rd48+2048];\n"
|
||||
" add.ftz.f32 %f90, %f96, %f90;\n"
|
||||
" st.shared.f32 [%rd42+2048], %f90;\n"
|
||||
" ld.shared.f32 %f97, [%rd48+2560];\n"
|
||||
" add.ftz.f32 %f91, %f97, %f91;\n"
|
||||
" st.shared.f32 [%rd42+2560], %f91;\n"
|
||||
"$Lt_0_25602:\n"
|
||||
" shr.u32 %r63, %r63, 1;\n"
|
||||
" mov.u32 %r65, 0;\n"
|
||||
" setp.ne.u32 %p14, %r63, %r65;\n"
|
||||
" @%p14 bra $Lt_0_25346;\n"
|
||||
"$Lt_0_24834:\n"
|
||||
" mov.f32 %f6, %f82;\n"
|
||||
" mov.f32 %f8, %f83;\n"
|
||||
" mov.f32 %f10, %f84;\n"
|
||||
" mov.f32 %f12, %f85;\n"
|
||||
" mov.f32 %f14, %f90;\n"
|
||||
" mov.f32 %f16, %f91;\n"
|
||||
"$Lt_0_24322:\n"
|
||||
"$Lt_0_22274:\n"
|
||||
" mov.u32 %r66, 0;\n"
|
||||
" setp.ne.s32 %p15, %r13, %r66;\n"
|
||||
" @%p15 bra $Lt_0_26370;\n"
|
||||
" ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];\n"
|
||||
" add.u64 %rd50, %rd49, %rd5;\n"
|
||||
" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" mov.u32 %r68, 0;\n"
|
||||
" setp.le.s32 %p16, %r67, %r68;\n"
|
||||
" @%p16 bra $Lt_0_26882;\n"
|
||||
" st.global.f32 [%rd50+0], %f28;\n"
|
||||
" cvt.s64.s32 %rd51, %r9;\n"
|
||||
" mul.wide.s32 %rd52, %r9, 4;\n"
|
||||
" add.u64 %rd50, %rd50, %rd52;\n"
|
||||
"$Lt_0_26882:\n"
|
||||
" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r70, 0;\n"
|
||||
" setp.le.s32 %p17, %r69, %r70;\n"
|
||||
" @%p17 bra $Lt_0_27394;\n"
|
||||
" mov.f32 %f98, %f6;\n"
|
||||
" st.global.f32 [%rd50+0], %f98;\n"
|
||||
" cvt.s64.s32 %rd53, %r9;\n"
|
||||
" mul.wide.s32 %rd54, %r9, 4;\n"
|
||||
" add.u64 %rd55, %rd54, %rd50;\n"
|
||||
" mov.f32 %f99, %f8;\n"
|
||||
" st.global.f32 [%rd55+0], %f99;\n"
|
||||
" add.u64 %rd56, %rd54, %rd55;\n"
|
||||
" mov.f32 %f100, %f10;\n"
|
||||
" st.global.f32 [%rd56+0], %f100;\n"
|
||||
" add.u64 %rd57, %rd54, %rd56;\n"
|
||||
" mov.f32 %f101, %f12;\n"
|
||||
" st.global.f32 [%rd57+0], %f101;\n"
|
||||
" add.u64 %rd50, %rd54, %rd57;\n"
|
||||
" mov.f32 %f102, %f14;\n"
|
||||
" st.global.f32 [%rd50+0], %f102;\n"
|
||||
" mov.f32 %f103, %f16;\n"
|
||||
" add.u64 %rd58, %rd54, %rd50;\n"
|
||||
" st.global.f32 [%rd58+0], %f103;\n"
|
||||
"$Lt_0_27394:\n"
|
||||
" ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];\n"
|
||||
" mul.lo.u64 %rd60, %rd4, 16;\n"
|
||||
" add.u64 %rd61, %rd59, %rd60;\n"
|
||||
" mov.f32 %f104, %f105;\n"
|
||||
" st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f104};\n"
|
||||
"$Lt_0_26370:\n"
|
||||
"$Lt_0_18690:\n"
|
||||
" .loc 16 105 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair:\n"
|
||||
" }\n"
|
||||
// PTX entry point kernel_pair_fast, stored as adjacent C string literals
// (one PTX line per literal). What the instruction stream below does:
//   1. copies three small coefficient tables from global to shared memory,
//   2. maps each thread to an atom slot and to a lane inside that slot,
//   3. walks the lane's share of the slot's neighbor entries, accumulating
//      three force sums, one eflag sum and six vflag sums,
//   4. when t_per_atom > 1, combines the lanes of a slot through shared memory,
//   5. lets lane 0 write the sums to engv and ans.
// The x_ parameter is declared but never loaded in this kernel; coordinates
// are fetched through the texture reference pos_tex, which is not declared
// inside this entry.
" .entry kernel_pair_fast (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<74>;\n"
|
||||
" .reg .u64 %rd<75>;\n"
|
||||
" .reg .f32 %f<114>;\n"
|
||||
" .reg .pred %p<22>;\n"
|
||||
// Shared arrays: sp_lj is 16 bytes (four f32); lj1 and lj3 are 1936 bytes
// each (121 entries of 16 bytes, matching the tid <= 120 guard and the
// multiplier 11 used further down); red_acc is 3072 bytes, addressed below as
// six rows that are 512 bytes apart.
" .shared .align 4 .b8 __cuda___cuda_local_var_32620_33_non_const_sp_lj3268[16];\n"
|
||||
" .shared .align 16 .b8 __cuda___cuda_local_var_32618_34_non_const_lj13296[1936];\n"
|
||||
" .shared .align 16 .b8 __cuda___cuda_local_var_32619_34_non_const_lj35232[1936];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32692_55_non_const_red_acc7168[3072];\n"
|
||||
" .loc 16 113 0\n"
|
||||
"$LDWbegin_kernel_pair_fast:\n"
|
||||
// %r1 = thread index within the block. Threads 0..3 each copy one f32 from
// sp_lj_in into the shared sp_lj array; all other threads skip the copy.
" cvt.s32.u32 %r1, %tid.x;\n"
|
||||
" mov.u32 %r2, 3;\n"
|
||||
" setp.gt.s32 %p1, %r1, %r2;\n"
|
||||
" @%p1 bra $Lt_1_20994;\n"
|
||||
" .loc 16 121 0\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32620_33_non_const_sp_lj3268;\n"
|
||||
" cvt.s64.s32 %rd2, %r1;\n"
|
||||
" mul.wide.s32 %rd3, %r1, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f32 %f1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd1;\n"
|
||||
" st.shared.f32 [%rd6+0], %f1;\n"
|
||||
"$Lt_1_20994:\n"
|
||||
// Threads 0..120 each copy one 16-byte entry of lj1_in into shared lj1.
// The matching lj3_in entry is copied only when eflag > 0.
// NOTE(review): the tables are only completely filled if the block has at
// least 121 threads - confirm against the launch configuration.
" mov.u64 %rd1, __cuda___cuda_local_var_32620_33_non_const_sp_lj3268;\n"
|
||||
" mov.u32 %r3, 120;\n"
|
||||
" setp.gt.s32 %p2, %r1, %r3;\n"
|
||||
" @%p2 bra $Lt_1_21506;\n"
|
||||
" .loc 16 123 0\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32618_34_non_const_lj13296;\n"
|
||||
" cvt.s64.s32 %rd8, %r1;\n"
|
||||
" mul.wide.s32 %rd9, %r1, 16;\n"
|
||||
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
|
||||
" add.u64 %rd11, %rd10, %rd9;\n"
|
||||
" add.u64 %rd12, %rd9, %rd7;\n"
|
||||
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
|
||||
" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n"
|
||||
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r5, 0;\n"
|
||||
" setp.le.s32 %p3, %r4, %r5;\n"
|
||||
" @%p3 bra $Lt_1_22018;\n"
|
||||
" .loc 16 125 0\n"
|
||||
" mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232;\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
|
||||
" add.u64 %rd15, %rd14, %rd9;\n"
|
||||
" add.u64 %rd16, %rd9, %rd13;\n"
|
||||
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
|
||||
" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n"
|
||||
"$Lt_1_22018:\n"
|
||||
// The address moves at the next two labels repeat values already set above;
// they re-materialise the shared base addresses on every incoming path.
" mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232;\n"
|
||||
"$Lt_1_21506:\n"
|
||||
" mov.u64 %rd13, __cuda___cuda_local_var_32619_34_non_const_lj35232;\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32618_34_non_const_lj13296;\n"
|
||||
" .loc 16 133 0\n"
|
||||
// Zero the six vflag accumulators %f11, %f13, %f15, %f17, %f19, %f21.
// %f20 is zeroed as well: the loop updates the sixth sum in %f20 and mirrors
// it into %f21.
" mov.f32 %f10, 0f00000000; \n"
|
||||
" mov.f32 %f11, %f10;\n"
|
||||
" mov.f32 %f12, 0f00000000; \n"
|
||||
" mov.f32 %f13, %f12;\n"
|
||||
" mov.f32 %f14, 0f00000000; \n"
|
||||
" mov.f32 %f15, %f14;\n"
|
||||
" mov.f32 %f16, 0f00000000; \n"
|
||||
" mov.f32 %f17, %f16;\n"
|
||||
" mov.f32 %f18, 0f00000000; \n"
|
||||
" mov.f32 %f19, %f18;\n"
|
||||
" mov.f32 %f20, 0f00000000; \n"
|
||||
" mov.f32 %f21, %f20;\n"
|
||||
|
||||
" .loc 16 135 0\n"
|
||||
// Block-wide barrier: the shared tables written above are read by other
// threads from here on.
" bar.sync 0;\n"
|
||||
// Atom slot for this thread:
//   %r12 = tid / t_per_atom + ctaid * (ntid / t_per_atom)
// Threads whose slot is >= inum branch straight to the exit label.
" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
|
||||
" div.s32 %r7, %r1, %r6;\n"
|
||||
" cvt.s32.u32 %r8, %ntid.x;\n"
|
||||
" div.s32 %r9, %r8, %r6;\n"
|
||||
" cvt.s32.u32 %r10, %ctaid.x;\n"
|
||||
" mul.lo.s32 %r11, %r10, %r9;\n"
|
||||
" add.s32 %r12, %r7, %r11;\n"
|
||||
" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n"
|
||||
" setp.ge.s32 %p4, %r12, %r13;\n"
|
||||
" @%p4 bra $Lt_1_30210;\n"
|
||||
" .loc 16 140 0\n"
|
||||
// %rd22 = dev_nbor + 4*slot. %rd23 is one nbor_pitch row further; the s32
// read there (%r15) is used below as the number of neighbor entries.
" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
|
||||
" cvt.s64.s32 %rd17, %r14;\n"
|
||||
" mul.wide.s32 %rd18, %r14, 4;\n"
|
||||
" cvt.s64.s32 %rd19, %r12;\n"
|
||||
" mul.wide.s32 %rd20, %r12, 4;\n"
|
||||
" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
|
||||
" add.u64 %rd22, %rd20, %rd21;\n"
|
||||
" add.u64 %rd23, %rd18, %rd22;\n"
|
||||
" ld.global.s32 %r15, [%rd23+0];\n"
|
||||
// %r17 = tid & (t_per_atom - 1): this thread's lane inside its atom slot.
// NOTE(review): this equals tid % t_per_atom only when t_per_atom is a power
// of two - confirm that the host side guarantees it.
" sub.s32 %r16, %r6, 1;\n"
|
||||
" and.b32 %r17, %r16, %r1;\n"
|
||||
" cvt.s64.s32 %rd24, %r17;\n"
|
||||
" mul.wide.s32 %rd25, %r17, 4;\n"
|
||||
// Two neighbor layouts, chosen by whether dev_packed is the same pointer as
// dev_nbor.
" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n"
|
||||
" setp.ne.u64 %p5, %rd26, %rd21;\n"
|
||||
" @%p5 bra $Lt_1_23298;\n"
|
||||
// Same pointer: entries live inside dev_nbor.
//   stride %r20 = nbor_pitch * t_per_atom entries
//   start  %rd29 = %rd23 + 4*(nbor_pitch + slot*(t_per_atom - 1))
//   end    %rd34 = start + 4*((count & (t_per_atom-1)) + stride*(count / t_per_atom))
//   first  %rd35 = start + 4*lane
" cvt.s32.s64 %r18, %rd17;\n"
|
||||
" mul.lo.s32 %r19, %r18, %r6;\n"
|
||||
" mov.s32 %r20, %r19;\n"
|
||||
" mul.lo.s32 %r21, %r16, %r12;\n"
|
||||
" add.s32 %r22, %r18, %r21;\n"
|
||||
" cvt.s64.s32 %rd27, %r22;\n"
|
||||
" mul.wide.s32 %rd28, %r22, 4;\n"
|
||||
" add.u64 %rd29, %rd23, %rd28;\n"
|
||||
" and.b32 %r23, %r16, %r15;\n"
|
||||
" cvt.s64.s32 %rd30, %r23;\n"
|
||||
" div.s32 %r24, %r15, %r6;\n"
|
||||
" mul.lo.s32 %r25, %r19, %r24;\n"
|
||||
" cvt.s64.s32 %rd31, %r25;\n"
|
||||
" add.u64 %rd32, %rd30, %rd31;\n"
|
||||
" mul.lo.u64 %rd33, %rd32, 4;\n"
|
||||
" add.u64 %rd34, %rd29, %rd33;\n"
|
||||
" add.u64 %rd35, %rd25, %rd29;\n"
|
||||
" bra.uni $Lt_1_23042;\n"
|
||||
"$Lt_1_23298:\n"
|
||||
// Different pointer: one more nbor_pitch row of dev_nbor holds an s32 element
// offset into dev_packed. Entries are contiguous there: end = base + 4*count,
// stride = t_per_atom entries, first = base + 4*lane.
" add.u64 %rd36, %rd18, %rd23;\n"
|
||||
" ld.global.s32 %r26, [%rd36+0];\n"
|
||||
" cvt.s64.s32 %rd37, %r26;\n"
|
||||
" mul.wide.s32 %rd38, %r26, 4;\n"
|
||||
" add.u64 %rd39, %rd26, %rd38;\n"
|
||||
" cvt.s64.s32 %rd40, %r15;\n"
|
||||
" mul.wide.s32 %rd41, %r15, 4;\n"
|
||||
" add.u64 %rd34, %rd39, %rd41;\n"
|
||||
" mov.s32 %r20, %r6;\n"
|
||||
" add.u64 %rd35, %rd25, %rd39;\n"
|
||||
"$Lt_1_23042:\n"
|
||||
" .loc 16 143 0\n"
|
||||
// Read the s32 stored at dev_nbor[slot] and fetch the four-float texel with
// that index from pos_tex: %f26, %f27, %f28 are the first three components,
// %f29 the fourth.
" ld.global.s32 %r27, [%rd22+0];\n"
|
||||
" mov.u32 %r28, %r27;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.u32 %r30, %r29;\n"
|
||||
" mov.s32 %r31, 0;\n"
|
||||
" mov.u32 %r32, %r31;\n"
|
||||
" mov.s32 %r33, 0;\n"
|
||||
" mov.u32 %r34, %r33;\n"
|
||||
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];\n"
|
||||
" mov.f32 %f26, %f22;\n"
|
||||
" mov.f32 %f27, %f23;\n"
|
||||
" mov.f32 %f28, %f24;\n"
|
||||
" mov.f32 %f29, %f25;\n"
|
||||
// If this lane's first entry is already at or past the end pointer, skip the
// loop and take the path that zeroes the four sums.
" setp.ge.u64 %p6, %rd35, %rd34;\n"
|
||||
" @%p6 bra $Lt_1_31746;\n"
|
||||
// Loop set-up. The fourth texel component is truncated to an integer and
// multiplied by 11 to form the row base into the 11x11 shared tables; it is
// kept as a float in %f30. Presumably that component is the atom type -
// confirm. %f31..%f34 are the running sums (three force components and the
// eflag sum); %rd42 is the loop stride in entries.
" cvt.rzi.ftz.s32.f32 %r35, %f29;\n"
|
||||
" cvt.s64.s32 %rd42, %r20;\n"
|
||||
" mul.lo.s32 %r36, %r35, 11;\n"
|
||||
" cvt.rn.f32.s32 %f30, %r36;\n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
" mov.f32 %f34, 0f00000000; \n"
|
||||
"$Lt_1_24066:\n"
|
||||
" .loc 16 150 0\n"
|
||||
// Loop body, one neighbor entry per iteration. The top two bits of the packed
// s32 select one of the four sp_lj factors (%f35); the low 30 bits
// (mask 1073741823) are the texel index of the neighbor.
" ld.global.s32 %r37, [%rd35+0];\n"
|
||||
" .loc 16 151 0\n"
|
||||
" shr.s32 %r38, %r37, 30;\n"
|
||||
" and.b32 %r39, %r38, 3;\n"
|
||||
" cvt.s64.s32 %rd43, %r39;\n"
|
||||
" mul.wide.s32 %rd44, %r39, 4;\n"
|
||||
" add.u64 %rd45, %rd1, %rd44;\n"
|
||||
" ld.shared.f32 %f35, [%rd45+0];\n"
|
||||
" .loc 16 154 0\n"
|
||||
" and.b32 %r40, %r37, 1073741823;\n"
|
||||
" mov.u32 %r41, %r40;\n"
|
||||
" mov.s32 %r42, 0;\n"
|
||||
" mov.u32 %r43, %r42;\n"
|
||||
" mov.s32 %r44, 0;\n"
|
||||
" mov.u32 %r45, %r44;\n"
|
||||
" mov.s32 %r46, 0;\n"
|
||||
" mov.u32 %r47, %r46;\n"
|
||||
" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}];\n"
|
||||
" mov.f32 %f40, %f36;\n"
|
||||
" mov.f32 %f41, %f37;\n"
|
||||
" mov.f32 %f42, %f38;\n"
|
||||
" mov.f32 %f43, %f39;\n"
|
||||
// %f45, %f44, %f46 = differences of the first, second and third components;
// %f49 = sum of their squares.
" sub.ftz.f32 %f44, %f27, %f41;\n"
|
||||
" sub.ftz.f32 %f45, %f26, %f40;\n"
|
||||
" sub.ftz.f32 %f46, %f28, %f42;\n"
|
||||
" mul.ftz.f32 %f47, %f44, %f44;\n"
|
||||
" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n"
|
||||
" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n"
|
||||
// Table index = trunc(%f30 + fourth component of the neighbor texel);
// %rd47 is its byte offset at 16 bytes per entry.
" add.ftz.f32 %f50, %f30, %f43;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r48, %f50;\n"
|
||||
" cvt.s64.s32 %rd46, %r48;\n"
|
||||
" mul.wide.s32 %rd47, %r48, 16;\n"
|
||||
" add.u64 %rd48, %rd47, %rd7;\n"
|
||||
// Skip this entry unless the squared length %f49 is below the third field of
// the lj1 entry (byte offset +8).
" ld.shared.f32 %f51, [%rd48+8];\n"
|
||||
" setp.gt.ftz.f32 %p7, %f51, %f49;\n"
|
||||
" @!%p7 bra $Lt_1_25346;\n"
|
||||
" .loc 16 165 0\n"
|
||||
// %f52 = sqrt(%f49); %f56 = %f52 minus the fourth lj1 field.
" sqrt.approx.ftz.f32 %f52, %f49;\n"
|
||||
" ld.shared.v4.f32 {%f53,%f54,_,%f55}, [%rd48+0];\n"
|
||||
" sub.ftz.f32 %f56, %f52, %f55;\n"
|
||||
" .loc 16 169 0\n"
|
||||
// %f60 = (1 / %f56^2)^3; %f63 = %f60 * (first lj1 field * %f60 - second field).
" mul.ftz.f32 %f57, %f56, %f56;\n"
|
||||
" rcp.approx.ftz.f32 %f58, %f57;\n"
|
||||
" mul.ftz.f32 %f59, %f58, %f58;\n"
|
||||
" mul.ftz.f32 %f60, %f58, %f59;\n"
|
||||
" mul.ftz.f32 %f61, %f53, %f60;\n"
|
||||
" sub.ftz.f32 %f62, %f61, %f54;\n"
|
||||
" mul.ftz.f32 %f63, %f60, %f62;\n"
|
||||
" .loc 16 170 0\n"
|
||||
// %f66 = %f63 * %f35 / %f56 / %f52: the scalar applied to each difference
// component. The root, reciprocal and both divisions are .approx variants.
" div.approx.ftz.f32 %f64, %f35, %f56;\n"
|
||||
" div.approx.ftz.f32 %f65, %f64, %f52;\n"
|
||||
" mul.ftz.f32 %f66, %f63, %f65;\n"
|
||||
" .loc 16 172 0\n"
|
||||
" fma.rn.ftz.f32 %f33, %f45, %f66, %f33;\n"
|
||||
" .loc 16 173 0\n"
|
||||
" fma.rn.ftz.f32 %f32, %f44, %f66, %f32;\n"
|
||||
" .loc 16 174 0\n"
|
||||
" fma.rn.ftz.f32 %f31, %f46, %f66, %f31;\n"
|
||||
// eflag > 0: add %f35 * (%f60 * (lj3 first * %f60 - lj3 second) - lj3 third)
// to the running sum %f34.
" ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r50, 0;\n"
|
||||
" setp.le.s32 %p8, %r49, %r50;\n"
|
||||
" @%p8 bra $Lt_1_24834;\n"
|
||||
" .loc 16 177 0\n"
|
||||
" add.u64 %rd49, %rd47, %rd13;\n"
|
||||
" ld.shared.v4.f32 {%f67,%f68,%f69,_}, [%rd49+0];\n"
|
||||
" mul.ftz.f32 %f70, %f67, %f60;\n"
|
||||
" sub.ftz.f32 %f71, %f70, %f68;\n"
|
||||
" mul.ftz.f32 %f72, %f60, %f71;\n"
|
||||
" .loc 16 178 0\n"
|
||||
" sub.ftz.f32 %f73, %f72, %f69;\n"
|
||||
" fma.rn.ftz.f32 %f34, %f35, %f73, %f34;\n"
|
||||
"$Lt_1_24834:\n"
|
||||
// vflag > 0: accumulate %f66 times the six pairwise products of the
// difference components (three squares, three cross terms).
" ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r52, 0;\n"
|
||||
" setp.le.s32 %p9, %r51, %r52;\n"
|
||||
" @%p9 bra $Lt_1_25346;\n"
|
||||
" .loc 16 181 0\n"
|
||||
" mov.f32 %f74, %f11;\n"
|
||||
" mul.ftz.f32 %f75, %f45, %f45;\n"
|
||||
" fma.rn.ftz.f32 %f76, %f66, %f75, %f74;\n"
|
||||
" mov.f32 %f11, %f76;\n"
|
||||
" .loc 16 182 0\n"
|
||||
" mov.f32 %f77, %f13;\n"
|
||||
" fma.rn.ftz.f32 %f78, %f66, %f47, %f77;\n"
|
||||
" mov.f32 %f13, %f78;\n"
|
||||
" .loc 16 183 0\n"
|
||||
" mov.f32 %f79, %f15;\n"
|
||||
" mul.ftz.f32 %f80, %f46, %f46;\n"
|
||||
" fma.rn.ftz.f32 %f81, %f66, %f80, %f79;\n"
|
||||
" mov.f32 %f15, %f81;\n"
|
||||
" .loc 16 184 0\n"
|
||||
" mov.f32 %f82, %f17;\n"
|
||||
" mul.ftz.f32 %f83, %f44, %f45;\n"
|
||||
" fma.rn.ftz.f32 %f84, %f66, %f83, %f82;\n"
|
||||
" mov.f32 %f17, %f84;\n"
|
||||
" .loc 16 185 0\n"
|
||||
" mov.f32 %f85, %f19;\n"
|
||||
" mul.ftz.f32 %f86, %f45, %f46;\n"
|
||||
" fma.rn.ftz.f32 %f87, %f66, %f86, %f85;\n"
|
||||
" mov.f32 %f19, %f87;\n"
|
||||
" .loc 16 186 0\n"
|
||||
" mul.ftz.f32 %f88, %f44, %f46;\n"
|
||||
" fma.rn.ftz.f32 %f20, %f66, %f88, %f20;\n"
|
||||
" mov.f32 %f21, %f20;\n"
|
||||
"$Lt_1_25346:\n"
|
||||
"$Lt_1_24322:\n"
|
||||
" .loc 16 148 0\n"
|
||||
// Advance by the stride (4 bytes per entry) and repeat while the pointer is
// still below the end pointer.
" mul.lo.u64 %rd50, %rd42, 4;\n"
|
||||
" add.u64 %rd35, %rd35, %rd50;\n"
|
||||
" setp.lt.u64 %p10, %rd35, %rd34;\n"
|
||||
" @%p10 bra $Lt_1_24066;\n"
|
||||
" bra.uni $Lt_1_23554;\n"
|
||||
"$Lt_1_31746:\n"
|
||||
// Reached only when the lane had no entries: zero the four sums.
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
" mov.f32 %f34, 0f00000000; \n"
|
||||
"$Lt_1_23554:\n"
|
||||
// With t_per_atom <= 1 there is nothing to combine: skip both reductions.
" mov.u32 %r53, 1;\n"
|
||||
" setp.le.s32 %p11, %r6, %r53;\n"
|
||||
" @%p11 bra $Lt_1_28162;\n"
|
||||
" .loc 16 191 0\n"
|
||||
// Each thread parks its four sums in red_acc at byte offset 4*tid of rows
// 0..3 (rows are 512 bytes apart).
// NOTE(review): a 512-byte row holds 128 floats, so this appears to assume at
// most 128 threads per block - confirm.
" mov.u64 %rd51, __cuda___cuda_local_var_32692_55_non_const_red_acc7168;\n"
|
||||
" cvt.s64.s32 %rd52, %r1;\n"
|
||||
" mul.wide.s32 %rd53, %r1, 4;\n"
|
||||
" add.u64 %rd54, %rd51, %rd53;\n"
|
||||
" mov.f32 %f89, %f33;\n"
|
||||
" st.shared.f32 [%rd54+0], %f89;\n"
|
||||
" mov.f32 %f90, %f32;\n"
|
||||
" st.shared.f32 [%rd54+512], %f90;\n"
|
||||
" mov.f32 %f91, %f31;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f91;\n"
|
||||
" mov.f32 %f92, %f34;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f92;\n"
|
||||
// %r58 = t_per_atom / 2, computed as a sign-corrected arithmetic shift; it is
// the initial reduction span. %p12 (span != 0) is reused by the second
// reduction further down.
" shr.s32 %r54, %r6, 31;\n"
|
||||
" mov.s32 %r55, 1;\n"
|
||||
" and.b32 %r56, %r54, %r55;\n"
|
||||
" add.s32 %r57, %r56, %r6;\n"
|
||||
" shr.s32 %r58, %r57, 1;\n"
|
||||
" mov.s32 %r59, %r58;\n"
|
||||
" mov.u32 %r60, 0;\n"
|
||||
" setp.ne.u32 %p12, %r58, %r60;\n"
|
||||
" @!%p12 bra $Lt_1_26626;\n"
|
||||
"$Lt_1_27138:\n"
|
||||
// Halving reduction: lanes below the current span add the values stored span
// slots ahead of their own and write the result back; then the span is
// shifted right by one. No barrier is issued between steps.
// NOTE(review): this presumably relies on the lanes of one atom slot running
// in lockstep inside a single warp - confirm.
" setp.ge.u32 %p13, %r17, %r59;\n"
|
||||
" @%p13 bra $Lt_1_27394;\n"
|
||||
" add.u32 %r61, %r1, %r59;\n"
|
||||
" cvt.u64.u32 %rd55, %r61;\n"
|
||||
" mul.wide.u32 %rd56, %r61, 4;\n"
|
||||
" add.u64 %rd57, %rd51, %rd56;\n"
|
||||
" ld.shared.f32 %f93, [%rd57+0];\n"
|
||||
" add.ftz.f32 %f89, %f93, %f89;\n"
|
||||
" st.shared.f32 [%rd54+0], %f89;\n"
|
||||
" ld.shared.f32 %f94, [%rd57+512];\n"
|
||||
" add.ftz.f32 %f90, %f94, %f90;\n"
|
||||
" st.shared.f32 [%rd54+512], %f90;\n"
|
||||
" ld.shared.f32 %f95, [%rd57+1024];\n"
|
||||
" add.ftz.f32 %f91, %f95, %f91;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f91;\n"
|
||||
" ld.shared.f32 %f96, [%rd57+1536];\n"
|
||||
" add.ftz.f32 %f92, %f96, %f92;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f92;\n"
|
||||
"$Lt_1_27394:\n"
|
||||
" shr.u32 %r59, %r59, 1;\n"
|
||||
" mov.u32 %r62, 0;\n"
|
||||
" setp.ne.u32 %p14, %r59, %r62;\n"
|
||||
" @%p14 bra $Lt_1_27138;\n"
|
||||
"$Lt_1_26626:\n"
|
||||
// Copy the reduced values back into the registers used for the final stores.
" mov.f32 %f33, %f89;\n"
|
||||
" mov.f32 %f32, %f90;\n"
|
||||
" mov.f32 %f31, %f91;\n"
|
||||
" mov.f32 %f34, %f92;\n"
|
||||
// vflag > 0: run the same reduction again for the six vflag sums, this time
// using red_acc rows 0..5 (byte offsets 0 through 2560).
" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r64, 0;\n"
|
||||
" setp.le.s32 %p15, %r63, %r64;\n"
|
||||
" @%p15 bra $Lt_1_28162;\n"
|
||||
" mov.f32 %f89, %f11;\n"
|
||||
" st.shared.f32 [%rd54+0], %f89;\n"
|
||||
" mov.f32 %f90, %f13;\n"
|
||||
" st.shared.f32 [%rd54+512], %f90;\n"
|
||||
" mov.f32 %f91, %f15;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f91;\n"
|
||||
" mov.f32 %f92, %f17;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f92;\n"
|
||||
" mov.f32 %f97, %f19;\n"
|
||||
" st.shared.f32 [%rd54+2048], %f97;\n"
|
||||
" mov.f32 %f98, %f20;\n"
|
||||
" st.shared.f32 [%rd54+2560], %f98;\n"
|
||||
" mov.s32 %r65, %r58;\n"
|
||||
" @!%p12 bra $Lt_1_28674;\n"
|
||||
"$Lt_1_29186:\n"
|
||||
" setp.ge.u32 %p16, %r17, %r65;\n"
|
||||
" @%p16 bra $Lt_1_29442;\n"
|
||||
" add.u32 %r66, %r1, %r65;\n"
|
||||
" cvt.u64.u32 %rd58, %r66;\n"
|
||||
" mul.wide.u32 %rd59, %r66, 4;\n"
|
||||
" add.u64 %rd60, %rd51, %rd59;\n"
|
||||
" ld.shared.f32 %f99, [%rd60+0];\n"
|
||||
" add.ftz.f32 %f89, %f99, %f89;\n"
|
||||
" st.shared.f32 [%rd54+0], %f89;\n"
|
||||
" ld.shared.f32 %f100, [%rd60+512];\n"
|
||||
" add.ftz.f32 %f90, %f100, %f90;\n"
|
||||
" st.shared.f32 [%rd54+512], %f90;\n"
|
||||
" ld.shared.f32 %f101, [%rd60+1024];\n"
|
||||
" add.ftz.f32 %f91, %f101, %f91;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f91;\n"
|
||||
" ld.shared.f32 %f102, [%rd60+1536];\n"
|
||||
" add.ftz.f32 %f92, %f102, %f92;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f92;\n"
|
||||
" ld.shared.f32 %f103, [%rd60+2048];\n"
|
||||
" add.ftz.f32 %f97, %f103, %f97;\n"
|
||||
" st.shared.f32 [%rd54+2048], %f97;\n"
|
||||
" ld.shared.f32 %f104, [%rd60+2560];\n"
|
||||
" add.ftz.f32 %f98, %f104, %f98;\n"
|
||||
" st.shared.f32 [%rd54+2560], %f98;\n"
|
||||
"$Lt_1_29442:\n"
|
||||
" shr.u32 %r65, %r65, 1;\n"
|
||||
" mov.u32 %r67, 0;\n"
|
||||
" setp.ne.u32 %p17, %r65, %r67;\n"
|
||||
" @%p17 bra $Lt_1_29186;\n"
|
||||
"$Lt_1_28674:\n"
|
||||
" mov.f32 %f11, %f89;\n"
|
||||
" mov.f32 %f13, %f90;\n"
|
||||
" mov.f32 %f15, %f91;\n"
|
||||
" mov.f32 %f17, %f92;\n"
|
||||
" mov.f32 %f19, %f97;\n"
|
||||
" mov.f32 %f21, %f98;\n"
|
||||
"$Lt_1_28162:\n"
|
||||
"$Lt_1_26114:\n"
|
||||
// Only lane 0 of each atom slot writes results; every other lane exits.
" mov.u32 %r68, 0;\n"
|
||||
" setp.ne.s32 %p18, %r17, %r68;\n"
|
||||
" @%p18 bra $Lt_1_30210;\n"
|
||||
// %rd62 = engv + 4*slot. Successive values are written inum floats apart.
" ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];\n"
|
||||
" add.u64 %rd62, %rd61, %rd20;\n"
|
||||
// eflag > 0: store the eflag sum first and step the output pointer.
" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r70, 0;\n"
|
||||
" setp.le.s32 %p19, %r69, %r70;\n"
|
||||
" @%p19 bra $Lt_1_30722;\n"
|
||||
" st.global.f32 [%rd62+0], %f34;\n"
|
||||
" cvt.s64.s32 %rd63, %r13;\n"
|
||||
" mul.wide.s32 %rd64, %r13, 4;\n"
|
||||
" add.u64 %rd62, %rd62, %rd64;\n"
|
||||
"$Lt_1_30722:\n"
|
||||
// vflag > 0: store the six vflag sums, each inum floats after the previous.
" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r72, 0;\n"
|
||||
" setp.le.s32 %p20, %r71, %r72;\n"
|
||||
" @%p20 bra $Lt_1_31234;\n"
|
||||
" mov.f32 %f105, %f11;\n"
|
||||
" st.global.f32 [%rd62+0], %f105;\n"
|
||||
" cvt.s64.s32 %rd65, %r13;\n"
|
||||
" mul.wide.s32 %rd66, %r13, 4;\n"
|
||||
" add.u64 %rd67, %rd66, %rd62;\n"
|
||||
" mov.f32 %f106, %f13;\n"
|
||||
" st.global.f32 [%rd67+0], %f106;\n"
|
||||
" add.u64 %rd68, %rd66, %rd67;\n"
|
||||
" mov.f32 %f107, %f15;\n"
|
||||
" st.global.f32 [%rd68+0], %f107;\n"
|
||||
" add.u64 %rd69, %rd66, %rd68;\n"
|
||||
" mov.f32 %f108, %f17;\n"
|
||||
" st.global.f32 [%rd69+0], %f108;\n"
|
||||
" add.u64 %rd62, %rd66, %rd69;\n"
|
||||
" mov.f32 %f109, %f19;\n"
|
||||
" st.global.f32 [%rd62+0], %f109;\n"
|
||||
" mov.f32 %f110, %f21;\n"
|
||||
" add.u64 %rd70, %rd66, %rd62;\n"
|
||||
" st.global.f32 [%rd70+0], %f110;\n"
|
||||
"$Lt_1_31234:\n"
|
||||
// Store four floats at ans + 16*slot: the three force sums plus a fourth value
// copied from %f112. %f112 is never written anywhere in this kernel, so that
// fourth float is undefined.
// NOTE(review): presumably the reader of ans ignores the fourth component -
// confirm, or initialise it in the CUDA source this PTX was generated from.
" ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];\n"
|
||||
" mul.lo.u64 %rd72, %rd19, 16;\n"
|
||||
" add.u64 %rd73, %rd71, %rd72;\n"
|
||||
" mov.f32 %f111, %f112;\n"
|
||||
" st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f111};\n"
|
||||
"$Lt_1_30210:\n"
|
||||
"$Lt_1_22530:\n"
|
||||
" .loc 16 194 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair_fast:\n"
|
||||
" }\n"
|
||||
;
|
||||
849
lib/gpu/lj_ptx.h
849
lib/gpu/lj_ptx.h
@ -1,849 +0,0 @@
|
||||
const char * lj =
|
||||
" .version 2.3\n"
|
||||
" .target sm_20\n"
|
||||
" .address_size 64\n"
|
||||
" .global .texref pos_tex;\n"
|
||||
" .entry kernel_pair (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<72>;\n"
|
||||
" .reg .u64 %rd<63>;\n"
|
||||
" .reg .f32 %f<102>;\n"
|
||||
" .reg .pred %p<19>;\n"
|
||||
" .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32600_55_non_const_red_acc108[3072];\n"
|
||||
" .loc 16 31 0\n"
|
||||
"$LDWbegin_kernel_pair:\n"
|
||||
" .loc 16 36 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
|
||||
" ldu.global.f32 %f1, [%rd1+0];\n"
|
||||
" .loc 16 37 0\n"
|
||||
" ld.global.f32 %f2, [%rd1+4];\n"
|
||||
" .loc 16 38 0\n"
|
||||
" ld.global.f32 %f3, [%rd1+8];\n"
|
||||
" .loc 16 39 0\n"
|
||||
" ld.global.f32 %f4, [%rd1+12];\n"
|
||||
" st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n"
|
||||
" .loc 16 46 0\n"
|
||||
" mov.f32 %f5, 0f00000000; \n"
|
||||
" mov.f32 %f6, %f5;\n"
|
||||
" mov.f32 %f7, 0f00000000; \n"
|
||||
" mov.f32 %f8, %f7;\n"
|
||||
" mov.f32 %f9, 0f00000000; \n"
|
||||
" mov.f32 %f10, %f9;\n"
|
||||
" mov.f32 %f11, 0f00000000; \n"
|
||||
" mov.f32 %f12, %f11;\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, %f13;\n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
|
||||
" cvt.s32.u32 %r2, %tid.x;\n"
|
||||
" div.s32 %r3, %r2, %r1;\n"
|
||||
" cvt.s32.u32 %r4, %ntid.x;\n"
|
||||
" div.s32 %r5, %r4, %r1;\n"
|
||||
" cvt.s32.u32 %r6, %ctaid.x;\n"
|
||||
" mul.lo.s32 %r7, %r6, %r5;\n"
|
||||
" add.s32 %r8, %r3, %r7;\n"
|
||||
" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n"
|
||||
" setp.ge.s32 %p1, %r8, %r9;\n"
|
||||
" @%p1 bra $Lt_0_26370;\n"
|
||||
" .loc 16 51 0\n"
|
||||
" ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];\n"
|
||||
" cvt.s64.s32 %rd2, %r10;\n"
|
||||
" mul.wide.s32 %rd3, %r10, 4;\n"
|
||||
" cvt.s64.s32 %rd4, %r8;\n"
|
||||
" mul.wide.s32 %rd5, %r8, 4;\n"
|
||||
" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n"
|
||||
" add.u64 %rd7, %rd5, %rd6;\n"
|
||||
" add.u64 %rd8, %rd3, %rd7;\n"
|
||||
" ld.global.s32 %r11, [%rd8+0];\n"
|
||||
" sub.s32 %r12, %r1, 1;\n"
|
||||
" and.b32 %r13, %r12, %r2;\n"
|
||||
" cvt.s64.s32 %rd9, %r13;\n"
|
||||
" mul.wide.s32 %rd10, %r13, 4;\n"
|
||||
" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n"
|
||||
" setp.ne.u64 %p2, %rd11, %rd6;\n"
|
||||
" @%p2 bra $Lt_0_19458;\n"
|
||||
" cvt.s32.s64 %r14, %rd2;\n"
|
||||
" mul.lo.s32 %r15, %r14, %r1;\n"
|
||||
" mov.s32 %r16, %r15;\n"
|
||||
" mul.lo.s32 %r17, %r12, %r8;\n"
|
||||
" add.s32 %r18, %r14, %r17;\n"
|
||||
" cvt.s64.s32 %rd12, %r18;\n"
|
||||
" mul.wide.s32 %rd13, %r18, 4;\n"
|
||||
" add.u64 %rd14, %rd8, %rd13;\n"
|
||||
" and.b32 %r19, %r12, %r11;\n"
|
||||
" cvt.s64.s32 %rd15, %r19;\n"
|
||||
" div.s32 %r20, %r11, %r1;\n"
|
||||
" mul.lo.s32 %r21, %r15, %r20;\n"
|
||||
" cvt.s64.s32 %rd16, %r21;\n"
|
||||
" add.u64 %rd17, %rd15, %rd16;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" add.u64 %rd19, %rd14, %rd18;\n"
|
||||
" add.u64 %rd20, %rd10, %rd14;\n"
|
||||
" bra.uni $Lt_0_19202;\n"
|
||||
"$Lt_0_19458:\n"
|
||||
" add.u64 %rd21, %rd3, %rd8;\n"
|
||||
" ld.global.s32 %r22, [%rd21+0];\n"
|
||||
" cvt.s64.s32 %rd22, %r22;\n"
|
||||
" mul.wide.s32 %rd23, %r22, 4;\n"
|
||||
" add.u64 %rd24, %rd11, %rd23;\n"
|
||||
" cvt.s64.s32 %rd25, %r11;\n"
|
||||
" mul.wide.s32 %rd26, %r11, 4;\n"
|
||||
" add.u64 %rd19, %rd24, %rd26;\n"
|
||||
" mov.s32 %r16, %r1;\n"
|
||||
" add.u64 %rd20, %rd10, %rd24;\n"
|
||||
"$Lt_0_19202:\n"
|
||||
" .loc 16 54 0\n"
|
||||
" ld.global.s32 %r23, [%rd7+0];\n"
|
||||
" mov.u32 %r24, %r23;\n"
|
||||
" mov.s32 %r25, 0;\n"
|
||||
" mov.u32 %r26, %r25;\n"
|
||||
" mov.s32 %r27, 0;\n"
|
||||
" mov.u32 %r28, %r27;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.u32 %r30, %r29;\n"
|
||||
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n"
|
||||
" mov.f32 %f21, %f17;\n"
|
||||
" mov.f32 %f22, %f18;\n"
|
||||
" mov.f32 %f23, %f19;\n"
|
||||
" mov.f32 %f24, %f20;\n"
|
||||
" setp.ge.u64 %p3, %rd20, %rd19;\n"
|
||||
" @%p3 bra $Lt_0_27906;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r31, %f24;\n"
|
||||
" cvt.s64.s32 %rd27, %r16;\n"
|
||||
" ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];\n"
|
||||
" mul.lo.s32 %r33, %r32, %r31;\n"
|
||||
" ld.param.u64 %rd28, [__cudaparm_kernel_pair_lj1];\n"
|
||||
" mov.f32 %f25, 0f00000000; \n"
|
||||
" mov.f32 %f26, 0f00000000; \n"
|
||||
" mov.f32 %f27, 0f00000000; \n"
|
||||
" mov.f32 %f28, 0f00000000; \n"
|
||||
" mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;\n"
|
||||
"$Lt_0_20226:\n"
|
||||
" .loc 16 60 0\n"
|
||||
" ld.global.s32 %r34, [%rd20+0];\n"
|
||||
" .loc 16 61 0\n"
|
||||
" shr.s32 %r35, %r34, 30;\n"
|
||||
" and.b32 %r36, %r35, 3;\n"
|
||||
" cvt.s64.s32 %rd30, %r36;\n"
|
||||
" mul.wide.s32 %rd31, %r36, 4;\n"
|
||||
" add.u64 %rd32, %rd29, %rd31;\n"
|
||||
" ld.shared.f32 %f29, [%rd32+0];\n"
|
||||
" .loc 16 64 0\n"
|
||||
" and.b32 %r37, %r34, 1073741823;\n"
|
||||
" mov.u32 %r38, %r37;\n"
|
||||
" mov.s32 %r39, 0;\n"
|
||||
" mov.u32 %r40, %r39;\n"
|
||||
" mov.s32 %r41, 0;\n"
|
||||
" mov.u32 %r42, %r41;\n"
|
||||
" mov.s32 %r43, 0;\n"
|
||||
" mov.u32 %r44, %r43;\n"
|
||||
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];\n"
|
||||
" mov.f32 %f34, %f30;\n"
|
||||
" mov.f32 %f35, %f31;\n"
|
||||
" mov.f32 %f36, %f32;\n"
|
||||
" mov.f32 %f37, %f33;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r45, %f37;\n"
|
||||
" sub.ftz.f32 %f38, %f22, %f35;\n"
|
||||
" sub.ftz.f32 %f39, %f21, %f34;\n"
|
||||
" sub.ftz.f32 %f40, %f23, %f36;\n"
|
||||
" mul.ftz.f32 %f41, %f38, %f38;\n"
|
||||
" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n"
|
||||
" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n"
|
||||
" add.s32 %r46, %r45, %r33;\n"
|
||||
" cvt.s64.s32 %rd33, %r46;\n"
|
||||
" mul.wide.s32 %rd34, %r46, 16;\n"
|
||||
" add.u64 %rd35, %rd34, %rd28;\n"
|
||||
" ld.global.f32 %f44, [%rd35+8];\n"
|
||||
" setp.gt.ftz.f32 %p4, %f44, %f43;\n"
|
||||
" @!%p4 bra $Lt_0_21506;\n"
|
||||
" .loc 16 78 0\n"
|
||||
" rcp.approx.ftz.f32 %f45, %f43;\n"
|
||||
" mul.ftz.f32 %f46, %f45, %f45;\n"
|
||||
" mul.ftz.f32 %f47, %f45, %f46;\n"
|
||||
" mul.ftz.f32 %f48, %f45, %f47;\n"
|
||||
" ld.global.v2.f32 {%f49,%f50}, [%rd35+0];\n"
|
||||
" mul.ftz.f32 %f51, %f49, %f47;\n"
|
||||
" sub.ftz.f32 %f52, %f51, %f50;\n"
|
||||
" mul.ftz.f32 %f53, %f48, %f52;\n"
|
||||
" mul.ftz.f32 %f54, %f29, %f53;\n"
|
||||
" .loc 16 80 0\n"
|
||||
" fma.rn.ftz.f32 %f27, %f39, %f54, %f27;\n"
|
||||
" .loc 16 81 0\n"
|
||||
" fma.rn.ftz.f32 %f26, %f38, %f54, %f26;\n"
|
||||
" .loc 16 82 0\n"
|
||||
" fma.rn.ftz.f32 %f25, %f40, %f54, %f25;\n"
|
||||
" ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" mov.u32 %r48, 0;\n"
|
||||
" setp.le.s32 %p5, %r47, %r48;\n"
|
||||
" @%p5 bra $Lt_0_20994;\n"
|
||||
" .loc 16 86 0\n"
|
||||
" ld.param.u64 %rd36, [__cudaparm_kernel_pair_lj3];\n"
|
||||
" add.u64 %rd37, %rd36, %rd34;\n"
|
||||
" ld.global.v4.f32 {%f55,%f56,%f57,_}, [%rd37+0];\n"
|
||||
" mul.ftz.f32 %f58, %f55, %f47;\n"
|
||||
" sub.ftz.f32 %f59, %f58, %f56;\n"
|
||||
" mul.ftz.f32 %f60, %f47, %f59;\n"
|
||||
" sub.ftz.f32 %f61, %f60, %f57;\n"
|
||||
" fma.rn.ftz.f32 %f28, %f29, %f61, %f28;\n"
|
||||
"$Lt_0_20994:\n"
|
||||
" ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r50, 0;\n"
|
||||
" setp.le.s32 %p6, %r49, %r50;\n"
|
||||
" @%p6 bra $Lt_0_21506;\n"
|
||||
" .loc 16 89 0\n"
|
||||
" mov.f32 %f62, %f6;\n"
|
||||
" mul.ftz.f32 %f63, %f39, %f39;\n"
|
||||
" fma.rn.ftz.f32 %f64, %f54, %f63, %f62;\n"
|
||||
" mov.f32 %f6, %f64;\n"
|
||||
" .loc 16 90 0\n"
|
||||
" mov.f32 %f65, %f8;\n"
|
||||
" fma.rn.ftz.f32 %f66, %f54, %f41, %f65;\n"
|
||||
" mov.f32 %f8, %f66;\n"
|
||||
" .loc 16 91 0\n"
|
||||
" mov.f32 %f67, %f10;\n"
|
||||
" mul.ftz.f32 %f68, %f40, %f40;\n"
|
||||
" fma.rn.ftz.f32 %f69, %f54, %f68, %f67;\n"
|
||||
" mov.f32 %f10, %f69;\n"
|
||||
" .loc 16 92 0\n"
|
||||
" mov.f32 %f70, %f12;\n"
|
||||
" mul.ftz.f32 %f71, %f38, %f39;\n"
|
||||
" fma.rn.ftz.f32 %f72, %f54, %f71, %f70;\n"
|
||||
" mov.f32 %f12, %f72;\n"
|
||||
" .loc 16 93 0\n"
|
||||
" mov.f32 %f73, %f14;\n"
|
||||
" mul.ftz.f32 %f74, %f39, %f40;\n"
|
||||
" fma.rn.ftz.f32 %f75, %f54, %f74, %f73;\n"
|
||||
" mov.f32 %f14, %f75;\n"
|
||||
" .loc 16 94 0\n"
|
||||
" mul.ftz.f32 %f76, %f38, %f40;\n"
|
||||
" fma.rn.ftz.f32 %f15, %f54, %f76, %f15;\n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
"$Lt_0_21506:\n"
|
||||
"$Lt_0_20482:\n"
|
||||
" .loc 16 58 0\n"
|
||||
" mul.lo.u64 %rd38, %rd27, 4;\n"
|
||||
" add.u64 %rd20, %rd20, %rd38;\n"
|
||||
" setp.lt.u64 %p7, %rd20, %rd19;\n"
|
||||
" @%p7 bra $Lt_0_20226;\n"
|
||||
" bra.uni $Lt_0_19714;\n"
|
||||
"$Lt_0_27906:\n"
|
||||
" mov.f32 %f25, 0f00000000; \n"
|
||||
" mov.f32 %f26, 0f00000000; \n"
|
||||
" mov.f32 %f27, 0f00000000; \n"
|
||||
" mov.f32 %f28, 0f00000000; \n"
|
||||
"$Lt_0_19714:\n"
|
||||
" mov.u32 %r51, 1;\n"
|
||||
" setp.le.s32 %p8, %r1, %r51;\n"
|
||||
" @%p8 bra $Lt_0_24322;\n"
|
||||
" .loc 16 99 0\n"
|
||||
" mov.u64 %rd39, __cuda___cuda_local_var_32600_55_non_const_red_acc108;\n"
|
||||
" cvt.s64.s32 %rd40, %r2;\n"
|
||||
" mul.wide.s32 %rd41, %r2, 4;\n"
|
||||
" add.u64 %rd42, %rd39, %rd41;\n"
|
||||
" mov.f32 %f77, %f27;\n"
|
||||
" st.shared.f32 [%rd42+0], %f77;\n"
|
||||
" mov.f32 %f78, %f26;\n"
|
||||
" st.shared.f32 [%rd42+512], %f78;\n"
|
||||
" mov.f32 %f79, %f25;\n"
|
||||
" st.shared.f32 [%rd42+1024], %f79;\n"
|
||||
" mov.f32 %f80, %f28;\n"
|
||||
" st.shared.f32 [%rd42+1536], %f80;\n"
|
||||
" shr.s32 %r52, %r1, 31;\n"
|
||||
" mov.s32 %r53, 1;\n"
|
||||
" and.b32 %r54, %r52, %r53;\n"
|
||||
" add.s32 %r55, %r54, %r1;\n"
|
||||
" shr.s32 %r56, %r55, 1;\n"
|
||||
" mov.s32 %r57, %r56;\n"
|
||||
" mov.u32 %r58, 0;\n"
|
||||
" setp.ne.u32 %p9, %r56, %r58;\n"
|
||||
" @!%p9 bra $Lt_0_22786;\n"
|
||||
"$Lt_0_23298:\n"
|
||||
" setp.ge.u32 %p10, %r13, %r57;\n"
|
||||
" @%p10 bra $Lt_0_23554;\n"
|
||||
" add.u32 %r59, %r2, %r57;\n"
|
||||
" cvt.u64.u32 %rd43, %r59;\n"
|
||||
" mul.wide.u32 %rd44, %r59, 4;\n"
|
||||
" add.u64 %rd45, %rd39, %rd44;\n"
|
||||
" ld.shared.f32 %f81, [%rd45+0];\n"
|
||||
" add.ftz.f32 %f77, %f81, %f77;\n"
|
||||
" st.shared.f32 [%rd42+0], %f77;\n"
|
||||
" ld.shared.f32 %f82, [%rd45+512];\n"
|
||||
" add.ftz.f32 %f78, %f82, %f78;\n"
|
||||
" st.shared.f32 [%rd42+512], %f78;\n"
|
||||
" ld.shared.f32 %f83, [%rd45+1024];\n"
|
||||
" add.ftz.f32 %f79, %f83, %f79;\n"
|
||||
" st.shared.f32 [%rd42+1024], %f79;\n"
|
||||
" ld.shared.f32 %f84, [%rd45+1536];\n"
|
||||
" add.ftz.f32 %f80, %f84, %f80;\n"
|
||||
" st.shared.f32 [%rd42+1536], %f80;\n"
|
||||
"$Lt_0_23554:\n"
|
||||
" shr.u32 %r57, %r57, 1;\n"
|
||||
" mov.u32 %r60, 0;\n"
|
||||
" setp.ne.u32 %p11, %r57, %r60;\n"
|
||||
" @%p11 bra $Lt_0_23298;\n"
|
||||
"$Lt_0_22786:\n"
|
||||
" mov.f32 %f27, %f77;\n"
|
||||
" mov.f32 %f26, %f78;\n"
|
||||
" mov.f32 %f25, %f79;\n"
|
||||
" mov.f32 %f28, %f80;\n"
|
||||
" ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r62, 0;\n"
|
||||
" setp.le.s32 %p12, %r61, %r62;\n"
|
||||
" @%p12 bra $Lt_0_24322;\n"
|
||||
" mov.f32 %f77, %f6;\n"
|
||||
" st.shared.f32 [%rd42+0], %f77;\n"
|
||||
" mov.f32 %f78, %f8;\n"
|
||||
" st.shared.f32 [%rd42+512], %f78;\n"
|
||||
" mov.f32 %f79, %f10;\n"
|
||||
" st.shared.f32 [%rd42+1024], %f79;\n"
|
||||
" mov.f32 %f80, %f12;\n"
|
||||
" st.shared.f32 [%rd42+1536], %f80;\n"
|
||||
" mov.f32 %f85, %f14;\n"
|
||||
" st.shared.f32 [%rd42+2048], %f85;\n"
|
||||
" mov.f32 %f86, %f15;\n"
|
||||
" st.shared.f32 [%rd42+2560], %f86;\n"
|
||||
" mov.s32 %r63, %r56;\n"
|
||||
" @!%p9 bra $Lt_0_24834;\n"
|
||||
"$Lt_0_25346:\n"
|
||||
" setp.ge.u32 %p13, %r13, %r63;\n"
|
||||
" @%p13 bra $Lt_0_25602;\n"
|
||||
" add.u32 %r64, %r2, %r63;\n"
|
||||
" cvt.u64.u32 %rd46, %r64;\n"
|
||||
" mul.wide.u32 %rd47, %r64, 4;\n"
|
||||
" add.u64 %rd48, %rd39, %rd47;\n"
|
||||
" ld.shared.f32 %f87, [%rd48+0];\n"
|
||||
" add.ftz.f32 %f77, %f87, %f77;\n"
|
||||
" st.shared.f32 [%rd42+0], %f77;\n"
|
||||
" ld.shared.f32 %f88, [%rd48+512];\n"
|
||||
" add.ftz.f32 %f78, %f88, %f78;\n"
|
||||
" st.shared.f32 [%rd42+512], %f78;\n"
|
||||
" ld.shared.f32 %f89, [%rd48+1024];\n"
|
||||
" add.ftz.f32 %f79, %f89, %f79;\n"
|
||||
" st.shared.f32 [%rd42+1024], %f79;\n"
|
||||
" ld.shared.f32 %f90, [%rd48+1536];\n"
|
||||
" add.ftz.f32 %f80, %f90, %f80;\n"
|
||||
" st.shared.f32 [%rd42+1536], %f80;\n"
|
||||
" ld.shared.f32 %f91, [%rd48+2048];\n"
|
||||
" add.ftz.f32 %f85, %f91, %f85;\n"
|
||||
" st.shared.f32 [%rd42+2048], %f85;\n"
|
||||
" ld.shared.f32 %f92, [%rd48+2560];\n"
|
||||
" add.ftz.f32 %f86, %f92, %f86;\n"
|
||||
" st.shared.f32 [%rd42+2560], %f86;\n"
|
||||
"$Lt_0_25602:\n"
|
||||
" shr.u32 %r63, %r63, 1;\n"
|
||||
" mov.u32 %r65, 0;\n"
|
||||
" setp.ne.u32 %p14, %r63, %r65;\n"
|
||||
" @%p14 bra $Lt_0_25346;\n"
|
||||
"$Lt_0_24834:\n"
|
||||
" mov.f32 %f6, %f77;\n"
|
||||
" mov.f32 %f8, %f78;\n"
|
||||
" mov.f32 %f10, %f79;\n"
|
||||
" mov.f32 %f12, %f80;\n"
|
||||
" mov.f32 %f14, %f85;\n"
|
||||
" mov.f32 %f16, %f86;\n"
|
||||
"$Lt_0_24322:\n"
|
||||
"$Lt_0_22274:\n"
|
||||
" mov.u32 %r66, 0;\n"
|
||||
" setp.ne.s32 %p15, %r13, %r66;\n"
|
||||
" @%p15 bra $Lt_0_26370;\n"
|
||||
" ld.param.u64 %rd49, [__cudaparm_kernel_pair___val_paramengv];\n"
|
||||
" add.u64 %rd50, %rd49, %rd5;\n"
|
||||
" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" mov.u32 %r68, 0;\n"
|
||||
" setp.le.s32 %p16, %r67, %r68;\n"
|
||||
" @%p16 bra $Lt_0_26882;\n"
|
||||
" st.global.f32 [%rd50+0], %f28;\n"
|
||||
" cvt.s64.s32 %rd51, %r9;\n"
|
||||
" mul.wide.s32 %rd52, %r9, 4;\n"
|
||||
" add.u64 %rd50, %rd50, %rd52;\n"
|
||||
"$Lt_0_26882:\n"
|
||||
" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r70, 0;\n"
|
||||
" setp.le.s32 %p17, %r69, %r70;\n"
|
||||
" @%p17 bra $Lt_0_27394;\n"
|
||||
" mov.f32 %f93, %f6;\n"
|
||||
" st.global.f32 [%rd50+0], %f93;\n"
|
||||
" cvt.s64.s32 %rd53, %r9;\n"
|
||||
" mul.wide.s32 %rd54, %r9, 4;\n"
|
||||
" add.u64 %rd55, %rd54, %rd50;\n"
|
||||
" mov.f32 %f94, %f8;\n"
|
||||
" st.global.f32 [%rd55+0], %f94;\n"
|
||||
" add.u64 %rd56, %rd54, %rd55;\n"
|
||||
" mov.f32 %f95, %f10;\n"
|
||||
" st.global.f32 [%rd56+0], %f95;\n"
|
||||
" add.u64 %rd57, %rd54, %rd56;\n"
|
||||
" mov.f32 %f96, %f12;\n"
|
||||
" st.global.f32 [%rd57+0], %f96;\n"
|
||||
" add.u64 %rd50, %rd54, %rd57;\n"
|
||||
" mov.f32 %f97, %f14;\n"
|
||||
" st.global.f32 [%rd50+0], %f97;\n"
|
||||
" mov.f32 %f98, %f16;\n"
|
||||
" add.u64 %rd58, %rd54, %rd50;\n"
|
||||
" st.global.f32 [%rd58+0], %f98;\n"
|
||||
"$Lt_0_27394:\n"
|
||||
" ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];\n"
|
||||
" mul.lo.u64 %rd60, %rd4, 16;\n"
|
||||
" add.u64 %rd61, %rd59, %rd60;\n"
|
||||
" mov.f32 %f99, %f100;\n"
|
||||
" st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f99};\n"
|
||||
"$Lt_0_26370:\n"
|
||||
"$Lt_0_18690:\n"
|
||||
" .loc 16 102 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair:\n"
|
||||
" }\n"
|
||||
" .entry kernel_pair_fast (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<74>;\n"
|
||||
" .reg .u64 %rd<75>;\n"
|
||||
" .reg .f32 %f<109>;\n"
|
||||
" .reg .pred %p<22>;\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32617_33_non_const_sp_lj3268[16];\n"
|
||||
" .shared .align 16 .b8 __cuda___cuda_local_var_32615_34_non_const_lj13296[1936];\n"
|
||||
" .shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_lj35232[1936];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32685_55_non_const_red_acc7168[3072];\n"
|
||||
" .loc 16 110 0\n"
|
||||
"$LDWbegin_kernel_pair_fast:\n"
|
||||
" cvt.s32.u32 %r1, %tid.x;\n"
|
||||
" mov.u32 %r2, 3;\n"
|
||||
" setp.gt.s32 %p1, %r1, %r2;\n"
|
||||
" @%p1 bra $Lt_1_20994;\n"
|
||||
" .loc 16 118 0\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32617_33_non_const_sp_lj3268;\n"
|
||||
" cvt.s64.s32 %rd2, %r1;\n"
|
||||
" mul.wide.s32 %rd3, %r1, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f32 %f1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd1;\n"
|
||||
" st.shared.f32 [%rd6+0], %f1;\n"
|
||||
"$Lt_1_20994:\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32617_33_non_const_sp_lj3268;\n"
|
||||
" mov.u32 %r3, 120;\n"
|
||||
" setp.gt.s32 %p2, %r1, %r3;\n"
|
||||
" @%p2 bra $Lt_1_21506;\n"
|
||||
" .loc 16 120 0\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32615_34_non_const_lj13296;\n"
|
||||
" cvt.s64.s32 %rd8, %r1;\n"
|
||||
" mul.wide.s32 %rd9, %r1, 16;\n"
|
||||
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
|
||||
" add.u64 %rd11, %rd10, %rd9;\n"
|
||||
" add.u64 %rd12, %rd9, %rd7;\n"
|
||||
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
|
||||
" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n"
|
||||
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r5, 0;\n"
|
||||
" setp.le.s32 %p3, %r4, %r5;\n"
|
||||
" @%p3 bra $Lt_1_22018;\n"
|
||||
" .loc 16 122 0\n"
|
||||
" mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232;\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
|
||||
" add.u64 %rd15, %rd14, %rd9;\n"
|
||||
" add.u64 %rd16, %rd9, %rd13;\n"
|
||||
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
|
||||
" st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};\n"
|
||||
"$Lt_1_22018:\n"
|
||||
" mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232;\n"
|
||||
"$Lt_1_21506:\n"
|
||||
" mov.u64 %rd13, __cuda___cuda_local_var_32616_34_non_const_lj35232;\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32615_34_non_const_lj13296;\n"
|
||||
" .loc 16 130 0\n"
|
||||
" mov.f32 %f10, 0f00000000; \n"
|
||||
" mov.f32 %f11, %f10;\n"
|
||||
" mov.f32 %f12, 0f00000000; \n"
|
||||
" mov.f32 %f13, %f12;\n"
|
||||
" mov.f32 %f14, 0f00000000; \n"
|
||||
" mov.f32 %f15, %f14;\n"
|
||||
" mov.f32 %f16, 0f00000000; \n"
|
||||
" mov.f32 %f17, %f16;\n"
|
||||
" mov.f32 %f18, 0f00000000; \n"
|
||||
" mov.f32 %f19, %f18;\n"
|
||||
" mov.f32 %f20, 0f00000000; \n"
|
||||
" mov.f32 %f21, %f20;\n"
|
||||
" .loc 16 132 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
|
||||
" div.s32 %r7, %r1, %r6;\n"
|
||||
" cvt.s32.u32 %r8, %ntid.x;\n"
|
||||
" div.s32 %r9, %r8, %r6;\n"
|
||||
" cvt.s32.u32 %r10, %ctaid.x;\n"
|
||||
" mul.lo.s32 %r11, %r10, %r9;\n"
|
||||
" add.s32 %r12, %r7, %r11;\n"
|
||||
" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n"
|
||||
" setp.ge.s32 %p4, %r12, %r13;\n"
|
||||
" @%p4 bra $Lt_1_30210;\n"
|
||||
" .loc 16 137 0\n"
|
||||
" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
|
||||
" cvt.s64.s32 %rd17, %r14;\n"
|
||||
" mul.wide.s32 %rd18, %r14, 4;\n"
|
||||
" cvt.s64.s32 %rd19, %r12;\n"
|
||||
" mul.wide.s32 %rd20, %r12, 4;\n"
|
||||
" ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
|
||||
" add.u64 %rd22, %rd20, %rd21;\n"
|
||||
" add.u64 %rd23, %rd18, %rd22;\n"
|
||||
" ld.global.s32 %r15, [%rd23+0];\n"
|
||||
" sub.s32 %r16, %r6, 1;\n"
|
||||
" and.b32 %r17, %r16, %r1;\n"
|
||||
" cvt.s64.s32 %rd24, %r17;\n"
|
||||
" mul.wide.s32 %rd25, %r17, 4;\n"
|
||||
" ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];\n"
|
||||
" setp.ne.u64 %p5, %rd26, %rd21;\n"
|
||||
" @%p5 bra $Lt_1_23298;\n"
|
||||
" cvt.s32.s64 %r18, %rd17;\n"
|
||||
" mul.lo.s32 %r19, %r18, %r6;\n"
|
||||
" mov.s32 %r20, %r19;\n"
|
||||
" mul.lo.s32 %r21, %r16, %r12;\n"
|
||||
" add.s32 %r22, %r18, %r21;\n"
|
||||
" cvt.s64.s32 %rd27, %r22;\n"
|
||||
" mul.wide.s32 %rd28, %r22, 4;\n"
|
||||
" add.u64 %rd29, %rd23, %rd28;\n"
|
||||
" and.b32 %r23, %r16, %r15;\n"
|
||||
" cvt.s64.s32 %rd30, %r23;\n"
|
||||
" div.s32 %r24, %r15, %r6;\n"
|
||||
" mul.lo.s32 %r25, %r19, %r24;\n"
|
||||
" cvt.s64.s32 %rd31, %r25;\n"
|
||||
" add.u64 %rd32, %rd30, %rd31;\n"
|
||||
" mul.lo.u64 %rd33, %rd32, 4;\n"
|
||||
" add.u64 %rd34, %rd29, %rd33;\n"
|
||||
" add.u64 %rd35, %rd25, %rd29;\n"
|
||||
" bra.uni $Lt_1_23042;\n"
|
||||
"$Lt_1_23298:\n"
|
||||
" add.u64 %rd36, %rd18, %rd23;\n"
|
||||
" ld.global.s32 %r26, [%rd36+0];\n"
|
||||
" cvt.s64.s32 %rd37, %r26;\n"
|
||||
" mul.wide.s32 %rd38, %r26, 4;\n"
|
||||
" add.u64 %rd39, %rd26, %rd38;\n"
|
||||
" cvt.s64.s32 %rd40, %r15;\n"
|
||||
" mul.wide.s32 %rd41, %r15, 4;\n"
|
||||
" add.u64 %rd34, %rd39, %rd41;\n"
|
||||
" mov.s32 %r20, %r6;\n"
|
||||
" add.u64 %rd35, %rd25, %rd39;\n"
|
||||
"$Lt_1_23042:\n"
|
||||
" .loc 16 140 0\n"
|
||||
" ld.global.s32 %r27, [%rd22+0];\n"
|
||||
" mov.u32 %r28, %r27;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.u32 %r30, %r29;\n"
|
||||
" mov.s32 %r31, 0;\n"
|
||||
" mov.u32 %r32, %r31;\n"
|
||||
" mov.s32 %r33, 0;\n"
|
||||
" mov.u32 %r34, %r33;\n"
|
||||
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r28,%r30,%r32,%r34}];\n"
|
||||
" mov.f32 %f26, %f22;\n"
|
||||
" mov.f32 %f27, %f23;\n"
|
||||
" mov.f32 %f28, %f24;\n"
|
||||
" mov.f32 %f29, %f25;\n"
|
||||
" setp.ge.u64 %p6, %rd35, %rd34;\n"
|
||||
" @%p6 bra $Lt_1_31746;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r35, %f29;\n"
|
||||
" cvt.s64.s32 %rd42, %r20;\n"
|
||||
" mul.lo.s32 %r36, %r35, 11;\n"
|
||||
" cvt.rn.f32.s32 %f30, %r36;\n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
" mov.f32 %f34, 0f00000000; \n"
|
||||
"$Lt_1_24066:\n"
|
||||
" .loc 16 147 0\n"
|
||||
" ld.global.s32 %r37, [%rd35+0];\n"
|
||||
" .loc 16 148 0\n"
|
||||
" shr.s32 %r38, %r37, 30;\n"
|
||||
" and.b32 %r39, %r38, 3;\n"
|
||||
" cvt.s64.s32 %rd43, %r39;\n"
|
||||
" mul.wide.s32 %rd44, %r39, 4;\n"
|
||||
" add.u64 %rd45, %rd1, %rd44;\n"
|
||||
" ld.shared.f32 %f35, [%rd45+0];\n"
|
||||
" .loc 16 151 0\n"
|
||||
" and.b32 %r40, %r37, 1073741823;\n"
|
||||
" mov.u32 %r41, %r40;\n"
|
||||
" mov.s32 %r42, 0;\n"
|
||||
" mov.u32 %r43, %r42;\n"
|
||||
" mov.s32 %r44, 0;\n"
|
||||
" mov.u32 %r45, %r44;\n"
|
||||
" mov.s32 %r46, 0;\n"
|
||||
" mov.u32 %r47, %r46;\n"
|
||||
" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r41,%r43,%r45,%r47}];\n"
|
||||
" mov.f32 %f40, %f36;\n"
|
||||
" mov.f32 %f41, %f37;\n"
|
||||
" mov.f32 %f42, %f38;\n"
|
||||
" mov.f32 %f43, %f39;\n"
|
||||
" sub.ftz.f32 %f44, %f27, %f41;\n"
|
||||
" sub.ftz.f32 %f45, %f26, %f40;\n"
|
||||
" sub.ftz.f32 %f46, %f28, %f42;\n"
|
||||
" mul.ftz.f32 %f47, %f44, %f44;\n"
|
||||
" fma.rn.ftz.f32 %f48, %f45, %f45, %f47;\n"
|
||||
" fma.rn.ftz.f32 %f49, %f46, %f46, %f48;\n"
|
||||
" add.ftz.f32 %f50, %f30, %f43;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r48, %f50;\n"
|
||||
" cvt.s64.s32 %rd46, %r48;\n"
|
||||
" mul.wide.s32 %rd47, %r48, 16;\n"
|
||||
" add.u64 %rd48, %rd47, %rd7;\n"
|
||||
" ld.shared.f32 %f51, [%rd48+8];\n"
|
||||
" setp.gt.ftz.f32 %p7, %f51, %f49;\n"
|
||||
" @!%p7 bra $Lt_1_25346;\n"
|
||||
" .loc 16 163 0\n"
|
||||
" rcp.approx.ftz.f32 %f52, %f49;\n"
|
||||
" mul.ftz.f32 %f53, %f52, %f52;\n"
|
||||
" mul.ftz.f32 %f54, %f52, %f53;\n"
|
||||
" mul.ftz.f32 %f55, %f52, %f35;\n"
|
||||
" mul.ftz.f32 %f56, %f54, %f55;\n"
|
||||
" ld.shared.v2.f32 {%f57,%f58}, [%rd48+0];\n"
|
||||
" mul.ftz.f32 %f59, %f57, %f54;\n"
|
||||
" sub.ftz.f32 %f60, %f59, %f58;\n"
|
||||
" mul.ftz.f32 %f61, %f56, %f60;\n"
|
||||
" .loc 16 165 0\n"
|
||||
" fma.rn.ftz.f32 %f33, %f45, %f61, %f33;\n"
|
||||
" .loc 16 166 0\n"
|
||||
" fma.rn.ftz.f32 %f32, %f44, %f61, %f32;\n"
|
||||
" .loc 16 167 0\n"
|
||||
" fma.rn.ftz.f32 %f31, %f46, %f61, %f31;\n"
|
||||
" ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r50, 0;\n"
|
||||
" setp.le.s32 %p8, %r49, %r50;\n"
|
||||
" @%p8 bra $Lt_1_24834;\n"
|
||||
" .loc 16 170 0\n"
|
||||
" add.u64 %rd49, %rd47, %rd13;\n"
|
||||
" ld.shared.v4.f32 {%f62,%f63,%f64,_}, [%rd49+0];\n"
|
||||
" mul.ftz.f32 %f65, %f62, %f54;\n"
|
||||
" sub.ftz.f32 %f66, %f65, %f63;\n"
|
||||
" mul.ftz.f32 %f67, %f54, %f66;\n"
|
||||
" .loc 16 171 0\n"
|
||||
" sub.ftz.f32 %f68, %f67, %f64;\n"
|
||||
" fma.rn.ftz.f32 %f34, %f35, %f68, %f34;\n"
|
||||
"$Lt_1_24834:\n"
|
||||
" ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r52, 0;\n"
|
||||
" setp.le.s32 %p9, %r51, %r52;\n"
|
||||
" @%p9 bra $Lt_1_25346;\n"
|
||||
" .loc 16 174 0\n"
|
||||
" mov.f32 %f69, %f11;\n"
|
||||
" mul.ftz.f32 %f70, %f45, %f45;\n"
|
||||
" fma.rn.ftz.f32 %f71, %f61, %f70, %f69;\n"
|
||||
" mov.f32 %f11, %f71;\n"
|
||||
" .loc 16 175 0\n"
|
||||
" mov.f32 %f72, %f13;\n"
|
||||
" fma.rn.ftz.f32 %f73, %f61, %f47, %f72;\n"
|
||||
" mov.f32 %f13, %f73;\n"
|
||||
" .loc 16 176 0\n"
|
||||
" mov.f32 %f74, %f15;\n"
|
||||
" mul.ftz.f32 %f75, %f46, %f46;\n"
|
||||
" fma.rn.ftz.f32 %f76, %f61, %f75, %f74;\n"
|
||||
" mov.f32 %f15, %f76;\n"
|
||||
" .loc 16 177 0\n"
|
||||
" mov.f32 %f77, %f17;\n"
|
||||
" mul.ftz.f32 %f78, %f44, %f45;\n"
|
||||
" fma.rn.ftz.f32 %f79, %f61, %f78, %f77;\n"
|
||||
" mov.f32 %f17, %f79;\n"
|
||||
" .loc 16 178 0\n"
|
||||
" mov.f32 %f80, %f19;\n"
|
||||
" mul.ftz.f32 %f81, %f45, %f46;\n"
|
||||
" fma.rn.ftz.f32 %f82, %f61, %f81, %f80;\n"
|
||||
" mov.f32 %f19, %f82;\n"
|
||||
" .loc 16 179 0\n"
|
||||
" mul.ftz.f32 %f83, %f44, %f46;\n"
|
||||
" fma.rn.ftz.f32 %f20, %f61, %f83, %f20;\n"
|
||||
" mov.f32 %f21, %f20;\n"
|
||||
"$Lt_1_25346:\n"
|
||||
"$Lt_1_24322:\n"
|
||||
" .loc 16 145 0\n"
|
||||
" mul.lo.u64 %rd50, %rd42, 4;\n"
|
||||
" add.u64 %rd35, %rd35, %rd50;\n"
|
||||
" setp.lt.u64 %p10, %rd35, %rd34;\n"
|
||||
" @%p10 bra $Lt_1_24066;\n"
|
||||
" bra.uni $Lt_1_23554;\n"
|
||||
"$Lt_1_31746:\n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
" mov.f32 %f34, 0f00000000; \n"
|
||||
"$Lt_1_23554:\n"
|
||||
" mov.u32 %r53, 1;\n"
|
||||
" setp.le.s32 %p11, %r6, %r53;\n"
|
||||
" @%p11 bra $Lt_1_28162;\n"
|
||||
" .loc 16 184 0\n"
|
||||
" mov.u64 %rd51, __cuda___cuda_local_var_32685_55_non_const_red_acc7168;\n"
|
||||
" cvt.s64.s32 %rd52, %r1;\n"
|
||||
" mul.wide.s32 %rd53, %r1, 4;\n"
|
||||
" add.u64 %rd54, %rd51, %rd53;\n"
|
||||
" mov.f32 %f84, %f33;\n"
|
||||
" st.shared.f32 [%rd54+0], %f84;\n"
|
||||
" mov.f32 %f85, %f32;\n"
|
||||
" st.shared.f32 [%rd54+512], %f85;\n"
|
||||
" mov.f32 %f86, %f31;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f86;\n"
|
||||
" mov.f32 %f87, %f34;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f87;\n"
|
||||
" shr.s32 %r54, %r6, 31;\n"
|
||||
" mov.s32 %r55, 1;\n"
|
||||
" and.b32 %r56, %r54, %r55;\n"
|
||||
" add.s32 %r57, %r56, %r6;\n"
|
||||
" shr.s32 %r58, %r57, 1;\n"
|
||||
" mov.s32 %r59, %r58;\n"
|
||||
" mov.u32 %r60, 0;\n"
|
||||
" setp.ne.u32 %p12, %r58, %r60;\n"
|
||||
" @!%p12 bra $Lt_1_26626;\n"
|
||||
"$Lt_1_27138:\n"
|
||||
" setp.ge.u32 %p13, %r17, %r59;\n"
|
||||
" @%p13 bra $Lt_1_27394;\n"
|
||||
" add.u32 %r61, %r1, %r59;\n"
|
||||
" cvt.u64.u32 %rd55, %r61;\n"
|
||||
" mul.wide.u32 %rd56, %r61, 4;\n"
|
||||
" add.u64 %rd57, %rd51, %rd56;\n"
|
||||
" ld.shared.f32 %f88, [%rd57+0];\n"
|
||||
" add.ftz.f32 %f84, %f88, %f84;\n"
|
||||
" st.shared.f32 [%rd54+0], %f84;\n"
|
||||
" ld.shared.f32 %f89, [%rd57+512];\n"
|
||||
" add.ftz.f32 %f85, %f89, %f85;\n"
|
||||
" st.shared.f32 [%rd54+512], %f85;\n"
|
||||
" ld.shared.f32 %f90, [%rd57+1024];\n"
|
||||
" add.ftz.f32 %f86, %f90, %f86;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f86;\n"
|
||||
" ld.shared.f32 %f91, [%rd57+1536];\n"
|
||||
" add.ftz.f32 %f87, %f91, %f87;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f87;\n"
|
||||
"$Lt_1_27394:\n"
|
||||
" shr.u32 %r59, %r59, 1;\n"
|
||||
" mov.u32 %r62, 0;\n"
|
||||
" setp.ne.u32 %p14, %r59, %r62;\n"
|
||||
" @%p14 bra $Lt_1_27138;\n"
|
||||
"$Lt_1_26626:\n"
|
||||
" mov.f32 %f33, %f84;\n"
|
||||
" mov.f32 %f32, %f85;\n"
|
||||
" mov.f32 %f31, %f86;\n"
|
||||
" mov.f32 %f34, %f87;\n"
|
||||
" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r64, 0;\n"
|
||||
" setp.le.s32 %p15, %r63, %r64;\n"
|
||||
" @%p15 bra $Lt_1_28162;\n"
|
||||
" mov.f32 %f84, %f11;\n"
|
||||
" st.shared.f32 [%rd54+0], %f84;\n"
|
||||
" mov.f32 %f85, %f13;\n"
|
||||
" st.shared.f32 [%rd54+512], %f85;\n"
|
||||
" mov.f32 %f86, %f15;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f86;\n"
|
||||
" mov.f32 %f87, %f17;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f87;\n"
|
||||
" mov.f32 %f92, %f19;\n"
|
||||
" st.shared.f32 [%rd54+2048], %f92;\n"
|
||||
" mov.f32 %f93, %f20;\n"
|
||||
" st.shared.f32 [%rd54+2560], %f93;\n"
|
||||
" mov.s32 %r65, %r58;\n"
|
||||
" @!%p12 bra $Lt_1_28674;\n"
|
||||
"$Lt_1_29186:\n"
|
||||
" setp.ge.u32 %p16, %r17, %r65;\n"
|
||||
" @%p16 bra $Lt_1_29442;\n"
|
||||
" add.u32 %r66, %r1, %r65;\n"
|
||||
" cvt.u64.u32 %rd58, %r66;\n"
|
||||
" mul.wide.u32 %rd59, %r66, 4;\n"
|
||||
" add.u64 %rd60, %rd51, %rd59;\n"
|
||||
" ld.shared.f32 %f94, [%rd60+0];\n"
|
||||
" add.ftz.f32 %f84, %f94, %f84;\n"
|
||||
" st.shared.f32 [%rd54+0], %f84;\n"
|
||||
" ld.shared.f32 %f95, [%rd60+512];\n"
|
||||
" add.ftz.f32 %f85, %f95, %f85;\n"
|
||||
" st.shared.f32 [%rd54+512], %f85;\n"
|
||||
" ld.shared.f32 %f96, [%rd60+1024];\n"
|
||||
" add.ftz.f32 %f86, %f96, %f86;\n"
|
||||
" st.shared.f32 [%rd54+1024], %f86;\n"
|
||||
" ld.shared.f32 %f97, [%rd60+1536];\n"
|
||||
" add.ftz.f32 %f87, %f97, %f87;\n"
|
||||
" st.shared.f32 [%rd54+1536], %f87;\n"
|
||||
" ld.shared.f32 %f98, [%rd60+2048];\n"
|
||||
" add.ftz.f32 %f92, %f98, %f92;\n"
|
||||
" st.shared.f32 [%rd54+2048], %f92;\n"
|
||||
" ld.shared.f32 %f99, [%rd60+2560];\n"
|
||||
" add.ftz.f32 %f93, %f99, %f93;\n"
|
||||
" st.shared.f32 [%rd54+2560], %f93;\n"
|
||||
"$Lt_1_29442:\n"
|
||||
" shr.u32 %r65, %r65, 1;\n"
|
||||
" mov.u32 %r67, 0;\n"
|
||||
" setp.ne.u32 %p17, %r65, %r67;\n"
|
||||
" @%p17 bra $Lt_1_29186;\n"
|
||||
"$Lt_1_28674:\n"
|
||||
" mov.f32 %f11, %f84;\n"
|
||||
" mov.f32 %f13, %f85;\n"
|
||||
" mov.f32 %f15, %f86;\n"
|
||||
" mov.f32 %f17, %f87;\n"
|
||||
" mov.f32 %f19, %f92;\n"
|
||||
" mov.f32 %f21, %f93;\n"
|
||||
"$Lt_1_28162:\n"
|
||||
"$Lt_1_26114:\n"
|
||||
" mov.u32 %r68, 0;\n"
|
||||
" setp.ne.s32 %p18, %r17, %r68;\n"
|
||||
" @%p18 bra $Lt_1_30210;\n"
|
||||
" ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast___val_paramengv];\n"
|
||||
" add.u64 %rd62, %rd61, %rd20;\n"
|
||||
" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r70, 0;\n"
|
||||
" setp.le.s32 %p19, %r69, %r70;\n"
|
||||
" @%p19 bra $Lt_1_30722;\n"
|
||||
" st.global.f32 [%rd62+0], %f34;\n"
|
||||
" cvt.s64.s32 %rd63, %r13;\n"
|
||||
" mul.wide.s32 %rd64, %r13, 4;\n"
|
||||
" add.u64 %rd62, %rd62, %rd64;\n"
|
||||
"$Lt_1_30722:\n"
|
||||
" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r72, 0;\n"
|
||||
" setp.le.s32 %p20, %r71, %r72;\n"
|
||||
" @%p20 bra $Lt_1_31234;\n"
|
||||
" mov.f32 %f100, %f11;\n"
|
||||
" st.global.f32 [%rd62+0], %f100;\n"
|
||||
" cvt.s64.s32 %rd65, %r13;\n"
|
||||
" mul.wide.s32 %rd66, %r13, 4;\n"
|
||||
" add.u64 %rd67, %rd66, %rd62;\n"
|
||||
" mov.f32 %f101, %f13;\n"
|
||||
" st.global.f32 [%rd67+0], %f101;\n"
|
||||
" add.u64 %rd68, %rd66, %rd67;\n"
|
||||
" mov.f32 %f102, %f15;\n"
|
||||
" st.global.f32 [%rd68+0], %f102;\n"
|
||||
" add.u64 %rd69, %rd66, %rd68;\n"
|
||||
" mov.f32 %f103, %f17;\n"
|
||||
" st.global.f32 [%rd69+0], %f103;\n"
|
||||
" add.u64 %rd62, %rd66, %rd69;\n"
|
||||
" mov.f32 %f104, %f19;\n"
|
||||
" st.global.f32 [%rd62+0], %f104;\n"
|
||||
" mov.f32 %f105, %f21;\n"
|
||||
" add.u64 %rd70, %rd66, %rd62;\n"
|
||||
" st.global.f32 [%rd70+0], %f105;\n"
|
||||
"$Lt_1_31234:\n"
|
||||
" ld.param.u64 %rd71, [__cudaparm_kernel_pair_fast_ans];\n"
|
||||
" mul.lo.u64 %rd72, %rd19, 16;\n"
|
||||
" add.u64 %rd73, %rd71, %rd72;\n"
|
||||
" mov.f32 %f106, %f107;\n"
|
||||
" st.global.v4.f32 [%rd73+0], {%f33,%f32,%f31,%f106};\n"
|
||||
"$Lt_1_30210:\n"
|
||||
"$Lt_1_22530:\n"
|
||||
" .loc 16 187 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair_fast:\n"
|
||||
" }\n"
|
||||
;
|
||||
@ -1,921 +0,0 @@
|
||||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_00009e26_00000000-9_lal_morse.cpp3.i (/home/sjplimp/ccBI#.ffCTdB)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_00009e26_00000000-8_lal_morse.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "lal_morse.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
.global .texref pos_tex;
|
||||
|
||||
.entry kernel_pair (
|
||||
.param .u64 __cudaparm_kernel_pair_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_mor1,
|
||||
.param .u64 __cudaparm_kernel_pair_mor2,
|
||||
.param .s32 __cudaparm_kernel_pair_lj_types,
|
||||
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_ans,
|
||||
.param .u64 __cudaparm_kernel_pair___val_paramengv,
|
||||
.param .s32 __cudaparm_kernel_pair_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<72>;
|
||||
.reg .u64 %rd<64>;
|
||||
.reg .f32 %f<104>;
|
||||
.reg .f64 %fd<10>;
|
||||
.reg .pred %p<19>;
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32601_55_non_const_red_acc108[3072];
|
||||
// __cuda_local_var_32543_10_non_const_f = 48
|
||||
// __cuda_local_var_32545_9_non_const_virial = 16
|
||||
.loc 16 31 0
|
||||
$LDWbegin_kernel_pair:
|
||||
.loc 16 36 0
|
||||
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
|
||||
ldu.global.f32 %f1, [%rd1+0];
|
||||
.loc 16 37 0
|
||||
ld.global.f32 %f2, [%rd1+4];
|
||||
.loc 16 38 0
|
||||
ld.global.f32 %f3, [%rd1+8];
|
||||
.loc 16 39 0
|
||||
ld.global.f32 %f4, [%rd1+12];
|
||||
st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
|
||||
.loc 16 46 0
|
||||
mov.f32 %f5, 0f00000000; // 0
|
||||
mov.f32 %f6, %f5;
|
||||
mov.f32 %f7, 0f00000000; // 0
|
||||
mov.f32 %f8, %f7;
|
||||
mov.f32 %f9, 0f00000000; // 0
|
||||
mov.f32 %f10, %f9;
|
||||
mov.f32 %f11, 0f00000000; // 0
|
||||
mov.f32 %f12, %f11;
|
||||
mov.f32 %f13, 0f00000000; // 0
|
||||
mov.f32 %f14, %f13;
|
||||
mov.f32 %f15, 0f00000000; // 0
|
||||
mov.f32 %f16, %f15;
|
||||
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
|
||||
cvt.s32.u32 %r2, %tid.x;
|
||||
div.s32 %r3, %r2, %r1;
|
||||
cvt.s32.u32 %r4, %ntid.x;
|
||||
div.s32 %r5, %r4, %r1;
|
||||
cvt.s32.u32 %r6, %ctaid.x;
|
||||
mul.lo.s32 %r7, %r6, %r5;
|
||||
add.s32 %r8, %r3, %r7;
|
||||
ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];
|
||||
setp.ge.s32 %p1, %r8, %r9;
|
||||
@%p1 bra $Lt_0_26370;
|
||||
.loc 16 51 0
|
||||
ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];
|
||||
cvt.s64.s32 %rd2, %r10;
|
||||
mul.wide.s32 %rd3, %r10, 4;
|
||||
cvt.s64.s32 %rd4, %r8;
|
||||
mul.wide.s32 %rd5, %r8, 4;
|
||||
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
|
||||
add.u64 %rd7, %rd5, %rd6;
|
||||
add.u64 %rd8, %rd3, %rd7;
|
||||
ld.global.s32 %r11, [%rd8+0];
|
||||
sub.s32 %r12, %r1, 1;
|
||||
and.b32 %r13, %r12, %r2;
|
||||
cvt.s64.s32 %rd9, %r13;
|
||||
mul.wide.s32 %rd10, %r13, 4;
|
||||
ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];
|
||||
setp.ne.u64 %p2, %rd11, %rd6;
|
||||
@%p2 bra $Lt_0_19458;
|
||||
cvt.s32.s64 %r14, %rd2;
|
||||
mul.lo.s32 %r15, %r14, %r1;
|
||||
mov.s32 %r16, %r15;
|
||||
mul.lo.s32 %r17, %r12, %r8;
|
||||
add.s32 %r18, %r14, %r17;
|
||||
cvt.s64.s32 %rd12, %r18;
|
||||
mul.wide.s32 %rd13, %r18, 4;
|
||||
add.u64 %rd14, %rd8, %rd13;
|
||||
and.b32 %r19, %r12, %r11;
|
||||
cvt.s64.s32 %rd15, %r19;
|
||||
div.s32 %r20, %r11, %r1;
|
||||
mul.lo.s32 %r21, %r15, %r20;
|
||||
cvt.s64.s32 %rd16, %r21;
|
||||
add.u64 %rd17, %rd15, %rd16;
|
||||
mul.lo.u64 %rd18, %rd17, 4;
|
||||
add.u64 %rd19, %rd14, %rd18;
|
||||
add.u64 %rd20, %rd10, %rd14;
|
||||
bra.uni $Lt_0_19202;
|
||||
$Lt_0_19458:
|
||||
add.u64 %rd21, %rd3, %rd8;
|
||||
ld.global.s32 %r22, [%rd21+0];
|
||||
cvt.s64.s32 %rd22, %r22;
|
||||
mul.wide.s32 %rd23, %r22, 4;
|
||||
add.u64 %rd24, %rd11, %rd23;
|
||||
cvt.s64.s32 %rd25, %r11;
|
||||
mul.wide.s32 %rd26, %r11, 4;
|
||||
add.u64 %rd19, %rd24, %rd26;
|
||||
mov.s32 %r16, %r1;
|
||||
add.u64 %rd20, %rd10, %rd24;
|
||||
$Lt_0_19202:
|
||||
.loc 16 54 0
|
||||
ld.global.s32 %r23, [%rd7+0];
|
||||
mov.u32 %r24, %r23;
|
||||
mov.s32 %r25, 0;
|
||||
mov.u32 %r26, %r25;
|
||||
mov.s32 %r27, 0;
|
||||
mov.u32 %r28, %r27;
|
||||
mov.s32 %r29, 0;
|
||||
mov.u32 %r30, %r29;
|
||||
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];
|
||||
mov.f32 %f21, %f17;
|
||||
mov.f32 %f22, %f18;
|
||||
mov.f32 %f23, %f19;
|
||||
mov.f32 %f24, %f20;
|
||||
setp.ge.u64 %p3, %rd20, %rd19;
|
||||
@%p3 bra $Lt_0_27906;
|
||||
cvt.rzi.ftz.s32.f32 %r31, %f24;
|
||||
cvt.s64.s32 %rd27, %r16;
|
||||
ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];
|
||||
mul.lo.s32 %r33, %r32, %r31;
|
||||
ld.param.u64 %rd28, [__cudaparm_kernel_pair_mor1];
|
||||
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;
|
||||
$Lt_0_20226:
|
||||
//<loop> Loop body line 54, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 60 0
|
||||
ld.global.s32 %r34, [%rd20+0];
|
||||
.loc 16 61 0
|
||||
shr.s32 %r35, %r34, 30;
|
||||
and.b32 %r36, %r35, 3;
|
||||
cvt.s64.s32 %rd30, %r36;
|
||||
mul.wide.s32 %rd31, %r36, 4;
|
||||
add.u64 %rd32, %rd29, %rd31;
|
||||
ld.shared.f32 %f29, [%rd32+0];
|
||||
.loc 16 64 0
|
||||
and.b32 %r37, %r34, 1073741823;
|
||||
mov.u32 %r38, %r37;
|
||||
mov.s32 %r39, 0;
|
||||
mov.u32 %r40, %r39;
|
||||
mov.s32 %r41, 0;
|
||||
mov.u32 %r42, %r41;
|
||||
mov.s32 %r43, 0;
|
||||
mov.u32 %r44, %r43;
|
||||
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];
|
||||
mov.f32 %f34, %f30;
|
||||
mov.f32 %f35, %f31;
|
||||
mov.f32 %f36, %f32;
|
||||
mov.f32 %f37, %f33;
|
||||
cvt.rzi.ftz.s32.f32 %r45, %f37;
|
||||
sub.ftz.f32 %f38, %f22, %f35;
|
||||
sub.ftz.f32 %f39, %f21, %f34;
|
||||
sub.ftz.f32 %f40, %f23, %f36;
|
||||
mul.ftz.f32 %f41, %f38, %f38;
|
||||
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
|
||||
add.s32 %r46, %r45, %r33;
|
||||
cvt.s64.s32 %rd33, %r46;
|
||||
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
|
||||
mul.wide.s32 %rd34, %r46, 16;
|
||||
add.u64 %rd35, %rd28, %rd34;
|
||||
ld.global.f32 %f44, [%rd35+0];
|
||||
setp.gt.ftz.f32 %p4, %f44, %f43;
|
||||
@!%p4 bra $Lt_0_21506;
|
||||
.loc 16 77 0
|
||||
sqrt.approx.ftz.f32 %f45, %f43;
|
||||
ld.global.v4.f32 {_,%f46,%f47,%f48}, [%rd35+0];
|
||||
sub.ftz.f32 %f49, %f45, %f47;
|
||||
mul.ftz.f32 %f50, %f48, %f49;
|
||||
neg.ftz.f32 %f51, %f50;
|
||||
.loc 16 79 0
|
||||
mov.f32 %f52, 0f3fb8aa3b; // 1.4427
|
||||
mul.ftz.f32 %f53, %f51, %f52;
|
||||
ex2.approx.ftz.f32 %f54, %f53;
|
||||
mul.ftz.f32 %f55, %f54, %f54;
|
||||
sub.ftz.f32 %f56, %f55, %f54;
|
||||
mul.ftz.f32 %f57, %f46, %f56;
|
||||
.loc 16 81 0
|
||||
div.approx.ftz.f32 %f58, %f57, %f45;
|
||||
mul.ftz.f32 %f59, %f58, %f29;
|
||||
fma.rn.ftz.f32 %f27, %f39, %f59, %f27;
|
||||
.loc 16 82 0
|
||||
fma.rn.ftz.f32 %f26, %f38, %f59, %f26;
|
||||
.loc 16 83 0
|
||||
fma.rn.ftz.f32 %f25, %f40, %f59, %f25;
|
||||
ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];
|
||||
mov.u32 %r48, 0;
|
||||
setp.le.s32 %p5, %r47, %r48;
|
||||
@%p5 bra $Lt_0_20994;
|
||||
.loc 16 87 0
|
||||
cvt.ftz.f64.f32 %fd1, %f54;
|
||||
ld.param.u64 %rd36, [__cudaparm_kernel_pair_mor2];
|
||||
mul.lo.u64 %rd37, %rd33, 8;
|
||||
add.u64 %rd38, %rd36, %rd37;
|
||||
ld.global.v2.f32 {%f60,%f61}, [%rd38+0];
|
||||
cvt.ftz.f64.f32 %fd2, %f61;
|
||||
cvt.ftz.f64.f32 %fd3, %f60;
|
||||
mul.ftz.f32 %f62, %f54, %f54;
|
||||
cvt.ftz.f64.f32 %fd4, %f62;
|
||||
add.f64 %fd5, %fd1, %fd1;
|
||||
sub.f64 %fd6, %fd4, %fd5;
|
||||
mul.f64 %fd7, %fd3, %fd6;
|
||||
sub.f64 %fd8, %fd7, %fd2;
|
||||
cvt.rn.ftz.f32.f64 %f63, %fd8;
|
||||
fma.rn.ftz.f32 %f28, %f29, %f63, %f28;
|
||||
$Lt_0_20994:
|
||||
ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r50, 0;
|
||||
setp.le.s32 %p6, %r49, %r50;
|
||||
@%p6 bra $Lt_0_21506;
|
||||
.loc 16 90 0
|
||||
mov.f32 %f64, %f6;
|
||||
mul.ftz.f32 %f65, %f39, %f39;
|
||||
fma.rn.ftz.f32 %f66, %f59, %f65, %f64;
|
||||
mov.f32 %f6, %f66;
|
||||
.loc 16 91 0
|
||||
mov.f32 %f67, %f8;
|
||||
fma.rn.ftz.f32 %f68, %f59, %f41, %f67;
|
||||
mov.f32 %f8, %f68;
|
||||
.loc 16 92 0
|
||||
mov.f32 %f69, %f10;
|
||||
mul.ftz.f32 %f70, %f40, %f40;
|
||||
fma.rn.ftz.f32 %f71, %f59, %f70, %f69;
|
||||
mov.f32 %f10, %f71;
|
||||
.loc 16 93 0
|
||||
mov.f32 %f72, %f12;
|
||||
mul.ftz.f32 %f73, %f38, %f39;
|
||||
fma.rn.ftz.f32 %f74, %f59, %f73, %f72;
|
||||
mov.f32 %f12, %f74;
|
||||
.loc 16 94 0
|
||||
mov.f32 %f75, %f14;
|
||||
mul.ftz.f32 %f76, %f39, %f40;
|
||||
fma.rn.ftz.f32 %f77, %f59, %f76, %f75;
|
||||
mov.f32 %f14, %f77;
|
||||
.loc 16 95 0
|
||||
mul.ftz.f32 %f78, %f38, %f40;
|
||||
fma.rn.ftz.f32 %f15, %f59, %f78, %f15;
|
||||
mov.f32 %f16, %f15;
|
||||
$Lt_0_21506:
|
||||
$Lt_0_20482:
|
||||
.loc 16 58 0
|
||||
mul.lo.u64 %rd39, %rd27, 4;
|
||||
add.u64 %rd20, %rd20, %rd39;
|
||||
setp.lt.u64 %p7, %rd20, %rd19;
|
||||
@%p7 bra $Lt_0_20226;
|
||||
bra.uni $Lt_0_19714;
|
||||
$Lt_0_27906:
|
||||
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
$Lt_0_19714:
|
||||
mov.u32 %r51, 1;
|
||||
setp.le.s32 %p8, %r1, %r51;
|
||||
@%p8 bra $Lt_0_24322;
|
||||
.loc 16 100 0
|
||||
mov.u64 %rd40, __cuda___cuda_local_var_32601_55_non_const_red_acc108;
|
||||
cvt.s64.s32 %rd41, %r2;
|
||||
mul.wide.s32 %rd42, %r2, 4;
|
||||
add.u64 %rd43, %rd40, %rd42;
|
||||
mov.f32 %f79, %f27;
|
||||
st.shared.f32 [%rd43+0], %f79;
|
||||
mov.f32 %f80, %f26;
|
||||
st.shared.f32 [%rd43+512], %f80;
|
||||
mov.f32 %f81, %f25;
|
||||
st.shared.f32 [%rd43+1024], %f81;
|
||||
mov.f32 %f82, %f28;
|
||||
st.shared.f32 [%rd43+1536], %f82;
|
||||
shr.s32 %r52, %r1, 31;
|
||||
mov.s32 %r53, 1;
|
||||
and.b32 %r54, %r52, %r53;
|
||||
add.s32 %r55, %r54, %r1;
|
||||
shr.s32 %r56, %r55, 1;
|
||||
mov.s32 %r57, %r56;
|
||||
mov.u32 %r58, 0;
|
||||
setp.ne.u32 %p9, %r56, %r58;
|
||||
@!%p9 bra $Lt_0_22786;
|
||||
$Lt_0_23298:
|
||||
setp.ge.u32 %p10, %r13, %r57;
|
||||
@%p10 bra $Lt_0_23554;
|
||||
add.u32 %r59, %r2, %r57;
|
||||
cvt.u64.u32 %rd44, %r59;
|
||||
mul.wide.u32 %rd45, %r59, 4;
|
||||
add.u64 %rd46, %rd40, %rd45;
|
||||
ld.shared.f32 %f83, [%rd46+0];
|
||||
add.ftz.f32 %f79, %f83, %f79;
|
||||
st.shared.f32 [%rd43+0], %f79;
|
||||
ld.shared.f32 %f84, [%rd46+512];
|
||||
add.ftz.f32 %f80, %f84, %f80;
|
||||
st.shared.f32 [%rd43+512], %f80;
|
||||
ld.shared.f32 %f85, [%rd46+1024];
|
||||
add.ftz.f32 %f81, %f85, %f81;
|
||||
st.shared.f32 [%rd43+1024], %f81;
|
||||
ld.shared.f32 %f86, [%rd46+1536];
|
||||
add.ftz.f32 %f82, %f86, %f82;
|
||||
st.shared.f32 [%rd43+1536], %f82;
|
||||
$Lt_0_23554:
|
||||
shr.u32 %r57, %r57, 1;
|
||||
mov.u32 %r60, 0;
|
||||
setp.ne.u32 %p11, %r57, %r60;
|
||||
@%p11 bra $Lt_0_23298;
|
||||
$Lt_0_22786:
|
||||
mov.f32 %f27, %f79;
|
||||
mov.f32 %f26, %f80;
|
||||
mov.f32 %f25, %f81;
|
||||
mov.f32 %f28, %f82;
|
||||
ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r62, 0;
|
||||
setp.le.s32 %p12, %r61, %r62;
|
||||
@%p12 bra $Lt_0_24322;
|
||||
mov.f32 %f79, %f6;
|
||||
st.shared.f32 [%rd43+0], %f79;
|
||||
mov.f32 %f80, %f8;
|
||||
st.shared.f32 [%rd43+512], %f80;
|
||||
mov.f32 %f81, %f10;
|
||||
st.shared.f32 [%rd43+1024], %f81;
|
||||
mov.f32 %f82, %f12;
|
||||
st.shared.f32 [%rd43+1536], %f82;
|
||||
mov.f32 %f87, %f14;
|
||||
st.shared.f32 [%rd43+2048], %f87;
|
||||
mov.f32 %f88, %f15;
|
||||
st.shared.f32 [%rd43+2560], %f88;
|
||||
mov.s32 %r63, %r56;
|
||||
@!%p9 bra $Lt_0_24834;
|
||||
$Lt_0_25346:
|
||||
setp.ge.u32 %p13, %r13, %r63;
|
||||
@%p13 bra $Lt_0_25602;
|
||||
add.u32 %r64, %r2, %r63;
|
||||
cvt.u64.u32 %rd47, %r64;
|
||||
mul.wide.u32 %rd48, %r64, 4;
|
||||
add.u64 %rd49, %rd40, %rd48;
|
||||
ld.shared.f32 %f89, [%rd49+0];
|
||||
add.ftz.f32 %f79, %f89, %f79;
|
||||
st.shared.f32 [%rd43+0], %f79;
|
||||
ld.shared.f32 %f90, [%rd49+512];
|
||||
add.ftz.f32 %f80, %f90, %f80;
|
||||
st.shared.f32 [%rd43+512], %f80;
|
||||
ld.shared.f32 %f91, [%rd49+1024];
|
||||
add.ftz.f32 %f81, %f91, %f81;
|
||||
st.shared.f32 [%rd43+1024], %f81;
|
||||
ld.shared.f32 %f92, [%rd49+1536];
|
||||
add.ftz.f32 %f82, %f92, %f82;
|
||||
st.shared.f32 [%rd43+1536], %f82;
|
||||
ld.shared.f32 %f93, [%rd49+2048];
|
||||
add.ftz.f32 %f87, %f93, %f87;
|
||||
st.shared.f32 [%rd43+2048], %f87;
|
||||
ld.shared.f32 %f94, [%rd49+2560];
|
||||
add.ftz.f32 %f88, %f94, %f88;
|
||||
st.shared.f32 [%rd43+2560], %f88;
|
||||
$Lt_0_25602:
|
||||
shr.u32 %r63, %r63, 1;
|
||||
mov.u32 %r65, 0;
|
||||
setp.ne.u32 %p14, %r63, %r65;
|
||||
@%p14 bra $Lt_0_25346;
|
||||
$Lt_0_24834:
|
||||
mov.f32 %f6, %f79;
|
||||
mov.f32 %f8, %f80;
|
||||
mov.f32 %f10, %f81;
|
||||
mov.f32 %f12, %f82;
|
||||
mov.f32 %f14, %f87;
|
||||
mov.f32 %f16, %f88;
|
||||
$Lt_0_24322:
|
||||
$Lt_0_22274:
|
||||
mov.u32 %r66, 0;
|
||||
setp.ne.s32 %p15, %r13, %r66;
|
||||
@%p15 bra $Lt_0_26370;
|
||||
ld.param.u64 %rd50, [__cudaparm_kernel_pair___val_paramengv];
|
||||
add.u64 %rd51, %rd50, %rd5;
|
||||
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
|
||||
mov.u32 %r68, 0;
|
||||
setp.le.s32 %p16, %r67, %r68;
|
||||
@%p16 bra $Lt_0_26882;
|
||||
st.global.f32 [%rd51+0], %f28;
|
||||
cvt.s64.s32 %rd52, %r9;
|
||||
mul.wide.s32 %rd53, %r9, 4;
|
||||
add.u64 %rd51, %rd51, %rd53;
|
||||
$Lt_0_26882:
|
||||
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p17, %r69, %r70;
|
||||
@%p17 bra $Lt_0_27394;
|
||||
mov.f32 %f95, %f6;
|
||||
st.global.f32 [%rd51+0], %f95;
|
||||
cvt.s64.s32 %rd54, %r9;
|
||||
mul.wide.s32 %rd55, %r9, 4;
|
||||
add.u64 %rd56, %rd55, %rd51;
|
||||
mov.f32 %f96, %f8;
|
||||
st.global.f32 [%rd56+0], %f96;
|
||||
add.u64 %rd57, %rd55, %rd56;
|
||||
mov.f32 %f97, %f10;
|
||||
st.global.f32 [%rd57+0], %f97;
|
||||
add.u64 %rd58, %rd55, %rd57;
|
||||
mov.f32 %f98, %f12;
|
||||
st.global.f32 [%rd58+0], %f98;
|
||||
add.u64 %rd51, %rd55, %rd58;
|
||||
mov.f32 %f99, %f14;
|
||||
st.global.f32 [%rd51+0], %f99;
|
||||
mov.f32 %f100, %f16;
|
||||
add.u64 %rd59, %rd55, %rd51;
|
||||
st.global.f32 [%rd59+0], %f100;
|
||||
$Lt_0_27394:
|
||||
ld.param.u64 %rd60, [__cudaparm_kernel_pair_ans];
|
||||
mul.lo.u64 %rd61, %rd4, 16;
|
||||
add.u64 %rd62, %rd60, %rd61;
|
||||
mov.f32 %f101, %f102;
|
||||
st.global.v4.f32 [%rd62+0], {%f27,%f26,%f25,%f101};
|
||||
$Lt_0_26370:
|
||||
$Lt_0_18690:
|
||||
.loc 16 103 0
|
||||
exit;
|
||||
$LDWend_kernel_pair:
|
||||
} // kernel_pair
|
||||
|
||||
// kernel_pair_fast -- compiler-generated PTX entry point.
// Summary of what the instructions below do:
//   * Stage tables in shared memory: threads 0..3 copy sp_lj_in, threads 0..120 copy
//     mor1_in (and mor2_in when eflag > 0); then bar.sync 0.
//   * Each run of t_per_atom consecutive threads handles one item index
//     ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom); threads with ii >= inum exit.
//   * Walk a list of packed s32 entries reached through dev_nbor / dev_packed, fetch
//     float4 values from pos_tex, and accumulate a 3-component sum, an optional scalar
//     (eflag > 0) and six optional product sums (vflag > 0).
//   * When t_per_atom > 1, combine the per-thread sums through the shared red_acc buffer.
//   * The thread with (tid.x & (t_per_atom-1)) == 0 writes results to __val_paramengv and ans.
// The x_ parameter is never loaded in this body; positions are read through pos_tex.
// NOTE(review): physical meanings (force / energy / virial, cutoff, neighbor count) are
// presumed from parameter names and compiler comments only -- confirm against the CUDA source.
.entry kernel_pair_fast (
|
||||
.param .u64 __cudaparm_kernel_pair_fast_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_mor1_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_mor2_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_ans,
|
||||
.param .u64 __cudaparm_kernel_pair_fast___val_paramengv,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<74>;
|
||||
.reg .u64 %rd<77>;
|
||||
.reg .f32 %f<110>;
|
||||
.reg .pred %p<22>;
|
||||
// Shared buffers: sp_lj = 16 B (4 x f32); mor1 = 1936 B (121 x 16 B);
// mor2 = 968 B (121 x 8 B); red_acc = 3072 B (6 rows of 128 x f32, see the +512 byte strides below).
.shared .align 4 .b8 __cuda___cuda_local_var_32618_33_non_const_sp_lj3268[16];
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_mor13296[1936];
|
||||
.shared .align 8 .b8 __cuda___cuda_local_var_32617_34_non_const_mor25232[968];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32688_55_non_const_red_acc6200[3072];
|
||||
// __cuda_local_var_32628_10_non_const_f = 48
|
||||
// __cuda_local_var_32630_9_non_const_virial = 16
|
||||
.loc 16 111 0
|
||||
$LDWbegin_kernel_pair_fast:
|
||||
// ---- Stage 1: threads with tid.x <= 3 copy sp_lj_in[tid.x] into shared sp_lj ----
// %r1 = tid.x (kept for the whole kernel).
cvt.s32.u32 %r1, %tid.x;
|
||||
mov.u32 %r2, 3;
|
||||
setp.gt.s32 %p1, %r1, %r2;
|
||||
@%p1 bra $Lt_1_20994;
|
||||
.loc 16 119 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;
|
||||
cvt.s64.s32 %rd2, %r1;
|
||||
mul.wide.s32 %rd3, %r1, 4;
|
||||
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f32 %f1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f32 [%rd6+0], %f1;
|
||||
$Lt_1_20994:
|
||||
// %rd1 is re-materialised here so it holds the shared sp_lj base on both paths.
mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;
|
||||
// ---- Stage 2: threads with tid.x <= 120 copy one 16-byte mor1_in entry to shared mor1 ----
mov.u32 %r3, 120;
|
||||
setp.gt.s32 %p2, %r1, %r3;
|
||||
@%p2 bra $Lt_1_21506;
|
||||
.loc 16 121 0
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_mor13296;
|
||||
cvt.s64.s32 %rd8, %r1;
|
||||
mul.wide.s32 %rd9, %r1, 16;
|
||||
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_mor1_in];
|
||||
add.u64 %rd11, %rd10, %rd9;
|
||||
add.u64 %rd12, %rd9, %rd7;
|
||||
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
|
||||
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
|
||||
// The 8-byte mor2_in entry is staged only when eflag > 0.
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r5, 0;
|
||||
setp.le.s32 %p3, %r4, %r5;
|
||||
@%p3 bra $Lt_1_22018;
|
||||
.loc 16 123 0
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232;
|
||||
mul.lo.u64 %rd14, %rd8, 8;
|
||||
ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast_mor2_in];
|
||||
add.u64 %rd16, %rd15, %rd14;
|
||||
add.u64 %rd17, %rd14, %rd13;
|
||||
ld.global.v2.f32 {%f6,%f7}, [%rd16+0];
|
||||
st.shared.v2.f32 [%rd17+0], {%f6,%f7};
|
||||
$Lt_1_22018:
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232;
|
||||
$Lt_1_21506:
|
||||
// Join point: %rd13 = shared mor2 base, %rd7 = shared mor1 base on every path.
mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232;
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_mor13296;
|
||||
.loc 16 131 0
|
||||
// Zero the six vflag accumulators: %f9, %f11, %f13, %f15, %f17 and %f18/%f19.
mov.f32 %f8, 0f00000000; // 0
|
||||
mov.f32 %f9, %f8;
|
||||
mov.f32 %f10, 0f00000000; // 0
|
||||
mov.f32 %f11, %f10;
|
||||
mov.f32 %f12, 0f00000000; // 0
|
||||
mov.f32 %f13, %f12;
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
mov.f32 %f15, %f14;
|
||||
mov.f32 %f16, 0f00000000; // 0
|
||||
mov.f32 %f17, %f16;
|
||||
mov.f32 %f18, 0f00000000; // 0
|
||||
mov.f32 %f19, %f18;
|
||||
.loc 16 133 0
|
||||
// Barrier after the shared-memory staging stores above and before the shared loads below.
bar.sync 0;
|
||||
// ---- Stage 3: item index ----
// %r6 = t_per_atom, %r7 = tid.x / t_per_atom, %r9 = ntid.x / t_per_atom,
// %r12 = %r7 + ctaid.x * %r9 (called ii in these comments), %r13 = inum.
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
|
||||
div.s32 %r7, %r1, %r6;
|
||||
cvt.s32.u32 %r8, %ntid.x;
|
||||
div.s32 %r9, %r8, %r6;
|
||||
cvt.s32.u32 %r10, %ctaid.x;
|
||||
mul.lo.s32 %r11, %r10, %r9;
|
||||
add.s32 %r12, %r7, %r11;
|
||||
ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];
|
||||
setp.ge.s32 %p4, %r12, %r13;
|
||||
// Threads whose ii >= inum go straight to exit.
@%p4 bra $Lt_1_30210;
|
||||
.loc 16 138 0
|
||||
// ---- Stage 4: list pointers ----
// %rd19 = nbor_pitch * 4 bytes (one row), %rd21 = ii * 4, %rd23 = dev_nbor + ii*4 (row 0),
// %rd24 = row 1 of the same column; %r15 = s32 loaded from row 1
// (presumably the entry count for ii -- confirm).
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];
|
||||
cvt.s64.s32 %rd18, %r14;
|
||||
mul.wide.s32 %rd19, %r14, 4;
|
||||
cvt.s64.s32 %rd20, %r12;
|
||||
mul.wide.s32 %rd21, %r12, 4;
|
||||
ld.param.u64 %rd22, [__cudaparm_kernel_pair_fast_dev_nbor];
|
||||
add.u64 %rd23, %rd21, %rd22;
|
||||
add.u64 %rd24, %rd19, %rd23;
|
||||
ld.global.s32 %r15, [%rd24+0];
|
||||
// %r16 = t_per_atom - 1; %r17 = tid.x & %r16 is this thread's lane within its group.
// NOTE(review): the mask equals tid.x % t_per_atom only when t_per_atom is a power of two -- confirm.
sub.s32 %r16, %r6, 1;
|
||||
and.b32 %r17, %r16, %r1;
|
||||
cvt.s64.s32 %rd25, %r17;
|
||||
mul.wide.s32 %rd26, %r17, 4;
|
||||
// Two addressing schemes, selected by whether dev_packed is the same pointer as dev_nbor.
ld.param.u64 %rd27, [__cudaparm_kernel_pair_fast_dev_packed];
|
||||
setp.ne.u64 %p5, %rd27, %rd22;
|
||||
@%p5 bra $Lt_1_23298;
|
||||
// Case dev_packed == dev_nbor:
//   stride %r20 = nbor_pitch * t_per_atom elements,
//   base   %rd30 = %rd24 + 4 * (nbor_pitch + (t_per_atom-1) * ii),
//   end    %rd35 = base + 4 * ((%r15 & (t_per_atom-1)) + stride * (%r15 / t_per_atom)),
//   start  %rd36 = base + 4 * lane.
cvt.s32.s64 %r18, %rd18;
|
||||
mul.lo.s32 %r19, %r18, %r6;
|
||||
mov.s32 %r20, %r19;
|
||||
mul.lo.s32 %r21, %r16, %r12;
|
||||
add.s32 %r22, %r18, %r21;
|
||||
cvt.s64.s32 %rd28, %r22;
|
||||
mul.wide.s32 %rd29, %r22, 4;
|
||||
add.u64 %rd30, %rd24, %rd29;
|
||||
and.b32 %r23, %r16, %r15;
|
||||
cvt.s64.s32 %rd31, %r23;
|
||||
div.s32 %r24, %r15, %r6;
|
||||
mul.lo.s32 %r25, %r19, %r24;
|
||||
cvt.s64.s32 %rd32, %r25;
|
||||
add.u64 %rd33, %rd31, %rd32;
|
||||
mul.lo.u64 %rd34, %rd33, 4;
|
||||
add.u64 %rd35, %rd30, %rd34;
|
||||
add.u64 %rd36, %rd26, %rd30;
|
||||
bra.uni $Lt_1_23042;
|
||||
$Lt_1_23298:
|
||||
// Case dev_packed != dev_nbor:
//   %r26 = s32 loaded from row 2 of dev_nbor (used as an element offset into dev_packed),
//   base  = dev_packed + 4 * %r26, end %rd35 = base + 4 * %r15,
//   stride %r20 = t_per_atom elements, start %rd36 = base + 4 * lane.
add.u64 %rd37, %rd19, %rd24;
|
||||
ld.global.s32 %r26, [%rd37+0];
|
||||
cvt.s64.s32 %rd38, %r26;
|
||||
mul.wide.s32 %rd39, %r26, 4;
|
||||
add.u64 %rd40, %rd27, %rd39;
|
||||
cvt.s64.s32 %rd41, %r15;
|
||||
mul.wide.s32 %rd42, %r15, 4;
|
||||
add.u64 %rd35, %rd40, %rd42;
|
||||
mov.s32 %r20, %r6;
|
||||
add.u64 %rd36, %rd26, %rd40;
|
||||
$Lt_1_23042:
|
||||
.loc 16 141 0
|
||||
// ---- Stage 5: fetch the float4 for this item ----
// %r27 = s32 from row 0 of dev_nbor; it indexes pos_tex. Result: %f24,%f25,%f26,%f27 = x,y,z,w components.
ld.global.s32 %r27, [%rd23+0];
|
||||
mov.u32 %r28, %r27;
|
||||
mov.s32 %r29, 0;
|
||||
mov.u32 %r30, %r29;
|
||||
mov.s32 %r31, 0;
|
||||
mov.u32 %r32, %r31;
|
||||
mov.s32 %r33, 0;
|
||||
mov.u32 %r34, %r33;
|
||||
tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[pos_tex,{%r28,%r30,%r32,%r34}];
|
||||
mov.f32 %f24, %f20;
|
||||
mov.f32 %f25, %f21;
|
||||
mov.f32 %f26, %f22;
|
||||
mov.f32 %f27, %f23;
|
||||
// Empty range (start >= end): skip the loop and zero the four sums at $Lt_1_31746.
setp.ge.u64 %p6, %rd36, %rd35;
|
||||
@%p6 bra $Lt_1_31746;
|
||||
// Loop-invariant setup: %r35 = trunc(w of this item), %f28 = float(11 * %r35)
// (11 * 11 = 121 matches the shared table sizes); %rd43 = element stride.
cvt.rzi.ftz.s32.f32 %r35, %f27;
|
||||
cvt.s64.s32 %rd43, %r20;
|
||||
mul.lo.s32 %r36, %r35, 11;
|
||||
cvt.rn.f32.s32 %f28, %r36;
|
||||
// Per-thread sums: %f31,%f30,%f29 = x,y,z components; %f32 = scalar used when eflag > 0.
mov.f32 %f29, 0f00000000; // 0
|
||||
mov.f32 %f30, 0f00000000; // 0
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
$Lt_1_24066:
|
||||
//<loop> Loop body line 141, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 148 0
|
||||
// %r37 = packed list entry at the current pointer.
ld.global.s32 %r37, [%rd36+0];
|
||||
.loc 16 149 0
|
||||
// Top two bits of the entry select one of the four shared sp_lj factors -> %f33.
shr.s32 %r38, %r37, 30;
|
||||
and.b32 %r39, %r38, 3;
|
||||
cvt.s64.s32 %rd44, %r39;
|
||||
mul.wide.s32 %rd45, %r39, 4;
|
||||
add.u64 %rd46, %rd1, %rd45;
|
||||
ld.shared.f32 %f33, [%rd46+0];
|
||||
.loc 16 152 0
|
||||
// Low 30 bits (mask 0x3FFFFFFF) are the pos_tex index of the other item -> %f38..%f41.
and.b32 %r40, %r37, 1073741823;
|
||||
mov.u32 %r41, %r40;
|
||||
mov.s32 %r42, 0;
|
||||
mov.u32 %r43, %r42;
|
||||
mov.s32 %r44, 0;
|
||||
mov.u32 %r45, %r44;
|
||||
mov.s32 %r46, 0;
|
||||
mov.u32 %r47, %r46;
|
||||
tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r41,%r43,%r45,%r47}];
|
||||
mov.f32 %f38, %f34;
|
||||
mov.f32 %f39, %f35;
|
||||
mov.f32 %f40, %f36;
|
||||
mov.f32 %f41, %f37;
|
||||
// Differences: %f43 = dx, %f42 = dy, %f44 = dz; %f45 = dy*dy; %f47 = dx*dx + dy*dy + dz*dz.
sub.ftz.f32 %f42, %f25, %f39;
|
||||
sub.ftz.f32 %f43, %f24, %f38;
|
||||
sub.ftz.f32 %f44, %f26, %f40;
|
||||
mul.ftz.f32 %f45, %f42, %f42;
|
||||
fma.rn.ftz.f32 %f46, %f43, %f43, %f45;
|
||||
fma.rn.ftz.f32 %f47, %f44, %f44, %f46;
|
||||
// Table index %r48 = trunc(%f28 + w of the other item); %rd49 -> 16-byte shared mor1 entry.
add.ftz.f32 %f48, %f28, %f41;
|
||||
cvt.rzi.ftz.s32.f32 %r48, %f48;
|
||||
cvt.s64.s32 %rd47, %r48;
|
||||
mul.wide.s32 %rd48, %r48, 16;
|
||||
add.u64 %rd49, %rd7, %rd48;
|
||||
// Proceed only when the squared distance is below mor1[idx].x (presumably a squared cutoff -- confirm).
ld.shared.f32 %f49, [%rd49+0];
|
||||
setp.gt.ftz.f32 %p7, %f49, %f47;
|
||||
@!%p7 bra $Lt_1_25346;
|
||||
.loc 16 163 0
|
||||
// %f50 = r = sqrt(%f47); %f51,%f52,%f53 = mor1[idx].y,.z,.w; %f54 = r - mor1.z.
sqrt.approx.ftz.f32 %f50, %f47;
|
||||
ld.shared.v4.f32 {_,%f51,%f52,%f53}, [%rd49+0];
|
||||
sub.ftz.f32 %f54, %f50, %f52;
|
||||
.loc 16 164 0
|
||||
mul.ftz.f32 %f55, %f53, %f54;
|
||||
neg.ftz.f32 %f56, %f55;
|
||||
.loc 16 166 0
|
||||
// exp() via ex2: %f59 = 2^(-mor1.w * (r - mor1.z) * 1.4427), i.e. exp(-mor1.w * (r - mor1.z)).
mov.f32 %f57, 0f3fb8aa3b; // 1.4427
|
||||
mul.ftz.f32 %f58, %f56, %f57;
|
||||
ex2.approx.ftz.f32 %f59, %f58;
|
||||
// %f61 = e*e - e (with e = %f59); %f62 = mor1.y * %f61.
mul.ftz.f32 %f60, %f59, %f59;
|
||||
sub.ftz.f32 %f61, %f60, %f59;
|
||||
mul.ftz.f32 %f62, %f51, %f61;
|
||||
.loc 16 168 0
|
||||
// %f64 = (%f62 / r) * sp_lj factor; accumulate dx, dy, dz scaled by %f64.
div.approx.ftz.f32 %f63, %f62, %f50;
|
||||
mul.ftz.f32 %f64, %f63, %f33;
|
||||
fma.rn.ftz.f32 %f31, %f43, %f64, %f31;
|
||||
.loc 16 169 0
|
||||
fma.rn.ftz.f32 %f30, %f42, %f64, %f30;
|
||||
.loc 16 170 0
|
||||
fma.rn.ftz.f32 %f29, %f44, %f64, %f29;
|
||||
ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r50, 0;
|
||||
setp.le.s32 %p8, %r49, %r50;
|
||||
@%p8 bra $Lt_1_24834;
|
||||
.loc 16 173 0
|
||||
// eflag > 0: %f65,%f66 = shared mor2[idx].x,.y; %f67 = e*e - 2e;
// %f32 += sp_lj factor * (mor2.x * %f67 - mor2.y).
mul.lo.u64 %rd50, %rd47, 8;
|
||||
add.u64 %rd51, %rd13, %rd50;
|
||||
ld.shared.v2.f32 {%f65,%f66}, [%rd51+0];
|
||||
sub.ftz.f32 %f67, %f61, %f59;
|
||||
mul.ftz.f32 %f68, %f65, %f67;
|
||||
sub.ftz.f32 %f69, %f68, %f66;
|
||||
.loc 16 174 0
|
||||
fma.rn.ftz.f32 %f32, %f33, %f69, %f32;
|
||||
$Lt_1_24834:
|
||||
ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r52, 0;
|
||||
setp.le.s32 %p9, %r51, %r52;
|
||||
@%p9 bra $Lt_1_25346;
|
||||
.loc 16 177 0
|
||||
// vflag > 0: accumulate %f64 times dx*dx, dy*dy, dz*dz, dx*dy, dx*dz, dy*dz
// into %f9, %f11, %f13, %f15, %f17, %f18 (the last is mirrored into %f19).
mov.f32 %f70, %f9;
|
||||
mul.ftz.f32 %f71, %f43, %f43;
|
||||
fma.rn.ftz.f32 %f72, %f64, %f71, %f70;
|
||||
mov.f32 %f9, %f72;
|
||||
.loc 16 178 0
|
||||
mov.f32 %f73, %f11;
|
||||
fma.rn.ftz.f32 %f74, %f64, %f45, %f73;
|
||||
mov.f32 %f11, %f74;
|
||||
.loc 16 179 0
|
||||
mov.f32 %f75, %f13;
|
||||
mul.ftz.f32 %f76, %f44, %f44;
|
||||
fma.rn.ftz.f32 %f77, %f64, %f76, %f75;
|
||||
mov.f32 %f13, %f77;
|
||||
.loc 16 180 0
|
||||
mov.f32 %f78, %f15;
|
||||
mul.ftz.f32 %f79, %f42, %f43;
|
||||
fma.rn.ftz.f32 %f80, %f64, %f79, %f78;
|
||||
mov.f32 %f15, %f80;
|
||||
.loc 16 181 0
|
||||
mov.f32 %f81, %f17;
|
||||
mul.ftz.f32 %f82, %f43, %f44;
|
||||
fma.rn.ftz.f32 %f83, %f64, %f82, %f81;
|
||||
mov.f32 %f17, %f83;
|
||||
.loc 16 182 0
|
||||
mul.ftz.f32 %f84, %f42, %f44;
|
||||
fma.rn.ftz.f32 %f18, %f64, %f84, %f18;
|
||||
mov.f32 %f19, %f18;
|
||||
$Lt_1_25346:
|
||||
$Lt_1_24322:
|
||||
.loc 16 146 0
|
||||
// Advance the list pointer by stride * 4 bytes and loop while it is below the end pointer.
mul.lo.u64 %rd52, %rd43, 4;
|
||||
add.u64 %rd36, %rd36, %rd52;
|
||||
setp.lt.u64 %p10, %rd36, %rd35;
|
||||
@%p10 bra $Lt_1_24066;
|
||||
bra.uni $Lt_1_23554;
|
||||
$Lt_1_31746:
|
||||
// Empty-range path: the four sums are simply zero.
mov.f32 %f29, 0f00000000; // 0
|
||||
mov.f32 %f30, 0f00000000; // 0
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
$Lt_1_23554:
|
||||
// ---- Stage 6: combine the group's partial sums (skipped when t_per_atom <= 1) ----
mov.u32 %r53, 1;
|
||||
setp.le.s32 %p11, %r6, %r53;
|
||||
@%p11 bra $Lt_1_28162;
|
||||
.loc 16 187 0
|
||||
// %rd56 = &red_acc[tid.x]; the four sums go to rows 0..3, rows being 512 bytes (128 f32) apart.
// NOTE(review): the 128-float row pitch presumably requires ntid.x <= 128 -- confirm with the launch code.
mov.u64 %rd53, __cuda___cuda_local_var_32688_55_non_const_red_acc6200;
|
||||
cvt.s64.s32 %rd54, %r1;
|
||||
mul.wide.s32 %rd55, %r1, 4;
|
||||
add.u64 %rd56, %rd53, %rd55;
|
||||
mov.f32 %f85, %f31;
|
||||
st.shared.f32 [%rd56+0], %f85;
|
||||
mov.f32 %f86, %f30;
|
||||
st.shared.f32 [%rd56+512], %f86;
|
||||
mov.f32 %f87, %f29;
|
||||
st.shared.f32 [%rd56+1024], %f87;
|
||||
mov.f32 %f88, %f32;
|
||||
st.shared.f32 [%rd56+1536], %f88;
|
||||
// %r58 = t_per_atom / 2 (signed divide by two, rounding toward zero); %r59 is the running offset.
shr.s32 %r54, %r6, 31;
|
||||
mov.s32 %r55, 1;
|
||||
and.b32 %r56, %r54, %r55;
|
||||
add.s32 %r57, %r56, %r6;
|
||||
shr.s32 %r58, %r57, 1;
|
||||
mov.s32 %r59, %r58;
|
||||
mov.u32 %r60, 0;
|
||||
// %p12 (offset != 0) is reused later to guard the vflag reduction loop.
setp.ne.u32 %p12, %r58, %r60;
|
||||
@!%p12 bra $Lt_1_26626;
|
||||
$Lt_1_27138:
|
||||
// Halving reduction: lanes below the current offset add the values held at tid.x + offset.
// No bar.sync appears inside this loop.
// NOTE(review): presumably relies on the cooperating threads running in lockstep -- confirm.
setp.ge.u32 %p13, %r17, %r59;
|
||||
@%p13 bra $Lt_1_27394;
|
||||
add.u32 %r61, %r1, %r59;
|
||||
cvt.u64.u32 %rd57, %r61;
|
||||
mul.wide.u32 %rd58, %r61, 4;
|
||||
add.u64 %rd59, %rd53, %rd58;
|
||||
ld.shared.f32 %f89, [%rd59+0];
|
||||
add.ftz.f32 %f85, %f89, %f85;
|
||||
st.shared.f32 [%rd56+0], %f85;
|
||||
ld.shared.f32 %f90, [%rd59+512];
|
||||
add.ftz.f32 %f86, %f90, %f86;
|
||||
st.shared.f32 [%rd56+512], %f86;
|
||||
ld.shared.f32 %f91, [%rd59+1024];
|
||||
add.ftz.f32 %f87, %f91, %f87;
|
||||
st.shared.f32 [%rd56+1024], %f87;
|
||||
ld.shared.f32 %f92, [%rd59+1536];
|
||||
add.ftz.f32 %f88, %f92, %f88;
|
||||
st.shared.f32 [%rd56+1536], %f88;
|
||||
$Lt_1_27394:
|
||||
shr.u32 %r59, %r59, 1;
|
||||
mov.u32 %r62, 0;
|
||||
setp.ne.u32 %p14, %r59, %r62;
|
||||
@%p14 bra $Lt_1_27138;
|
||||
$Lt_1_26626:
|
||||
// Copy the combined values back into the four sum registers.
mov.f32 %f31, %f85;
|
||||
mov.f32 %f30, %f86;
|
||||
mov.f32 %f29, %f87;
|
||||
mov.f32 %f32, %f88;
|
||||
// vflag > 0: repeat the same reduction for the six product sums, using all six red_acc rows.
ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r64, 0;
|
||||
setp.le.s32 %p15, %r63, %r64;
|
||||
@%p15 bra $Lt_1_28162;
|
||||
mov.f32 %f85, %f9;
|
||||
st.shared.f32 [%rd56+0], %f85;
|
||||
mov.f32 %f86, %f11;
|
||||
st.shared.f32 [%rd56+512], %f86;
|
||||
mov.f32 %f87, %f13;
|
||||
st.shared.f32 [%rd56+1024], %f87;
|
||||
mov.f32 %f88, %f15;
|
||||
st.shared.f32 [%rd56+1536], %f88;
|
||||
mov.f32 %f93, %f17;
|
||||
st.shared.f32 [%rd56+2048], %f93;
|
||||
mov.f32 %f94, %f18;
|
||||
st.shared.f32 [%rd56+2560], %f94;
|
||||
mov.s32 %r65, %r58;
|
||||
@!%p12 bra $Lt_1_28674;
|
||||
$Lt_1_29186:
|
||||
// Same halving scheme as above, now over six values per thread; again no bar.sync inside.
setp.ge.u32 %p16, %r17, %r65;
|
||||
@%p16 bra $Lt_1_29442;
|
||||
add.u32 %r66, %r1, %r65;
|
||||
cvt.u64.u32 %rd60, %r66;
|
||||
mul.wide.u32 %rd61, %r66, 4;
|
||||
add.u64 %rd62, %rd53, %rd61;
|
||||
ld.shared.f32 %f95, [%rd62+0];
|
||||
add.ftz.f32 %f85, %f95, %f85;
|
||||
st.shared.f32 [%rd56+0], %f85;
|
||||
ld.shared.f32 %f96, [%rd62+512];
|
||||
add.ftz.f32 %f86, %f96, %f86;
|
||||
st.shared.f32 [%rd56+512], %f86;
|
||||
ld.shared.f32 %f97, [%rd62+1024];
|
||||
add.ftz.f32 %f87, %f97, %f87;
|
||||
st.shared.f32 [%rd56+1024], %f87;
|
||||
ld.shared.f32 %f98, [%rd62+1536];
|
||||
add.ftz.f32 %f88, %f98, %f88;
|
||||
st.shared.f32 [%rd56+1536], %f88;
|
||||
ld.shared.f32 %f99, [%rd62+2048];
|
||||
add.ftz.f32 %f93, %f99, %f93;
|
||||
st.shared.f32 [%rd56+2048], %f93;
|
||||
ld.shared.f32 %f100, [%rd62+2560];
|
||||
add.ftz.f32 %f94, %f100, %f94;
|
||||
st.shared.f32 [%rd56+2560], %f94;
|
||||
$Lt_1_29442:
|
||||
shr.u32 %r65, %r65, 1;
|
||||
mov.u32 %r67, 0;
|
||||
setp.ne.u32 %p17, %r65, %r67;
|
||||
@%p17 bra $Lt_1_29186;
|
||||
$Lt_1_28674:
|
||||
// Copy the six combined values back (%f19 now carries the sixth one).
mov.f32 %f9, %f85;
|
||||
mov.f32 %f11, %f86;
|
||||
mov.f32 %f13, %f87;
|
||||
mov.f32 %f15, %f88;
|
||||
mov.f32 %f17, %f93;
|
||||
mov.f32 %f19, %f94;
|
||||
$Lt_1_28162:
|
||||
$Lt_1_26114:
|
||||
// ---- Stage 7: only lane 0 of each group (%r17 == 0) writes results ----
mov.u32 %r68, 0;
|
||||
setp.ne.s32 %p18, %r17, %r68;
|
||||
@%p18 bra $Lt_1_30210;
|
||||
// %rd64 = __val_paramengv + ii*4; each value written advances the pointer by inum*4 bytes.
ld.param.u64 %rd63, [__cudaparm_kernel_pair_fast___val_paramengv];
|
||||
add.u64 %rd64, %rd63, %rd21;
|
||||
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p19, %r69, %r70;
|
||||
@%p19 bra $Lt_1_30722;
|
||||
// eflag > 0: store the scalar sum first.
st.global.f32 [%rd64+0], %f32;
|
||||
cvt.s64.s32 %rd65, %r13;
|
||||
mul.wide.s32 %rd66, %r13, 4;
|
||||
add.u64 %rd64, %rd64, %rd66;
|
||||
$Lt_1_30722:
|
||||
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r72, 0;
|
||||
setp.le.s32 %p20, %r71, %r72;
|
||||
@%p20 bra $Lt_1_31234;
|
||||
// vflag > 0: store the six product sums at consecutive inum*4-byte strides.
mov.f32 %f101, %f9;
|
||||
st.global.f32 [%rd64+0], %f101;
|
||||
cvt.s64.s32 %rd67, %r13;
|
||||
mul.wide.s32 %rd68, %r13, 4;
|
||||
add.u64 %rd69, %rd68, %rd64;
|
||||
mov.f32 %f102, %f11;
|
||||
st.global.f32 [%rd69+0], %f102;
|
||||
add.u64 %rd70, %rd68, %rd69;
|
||||
mov.f32 %f103, %f13;
|
||||
st.global.f32 [%rd70+0], %f103;
|
||||
add.u64 %rd71, %rd68, %rd70;
|
||||
mov.f32 %f104, %f15;
|
||||
st.global.f32 [%rd71+0], %f104;
|
||||
add.u64 %rd64, %rd68, %rd71;
|
||||
mov.f32 %f105, %f17;
|
||||
st.global.f32 [%rd64+0], %f105;
|
||||
mov.f32 %f106, %f19;
|
||||
add.u64 %rd72, %rd68, %rd64;
|
||||
st.global.f32 [%rd72+0], %f106;
|
||||
$Lt_1_31234:
|
||||
// ans[ii] (16 bytes per item) receives {%f31, %f30, %f29, %f107}.
ld.param.u64 %rd73, [__cudaparm_kernel_pair_fast_ans];
|
||||
mul.lo.u64 %rd74, %rd20, 16;
|
||||
add.u64 %rd75, %rd73, %rd74;
|
||||
// NOTE(review): %f108 is never written anywhere in this kernel, so the fourth stored
// component is undefined -- presumably callers ignore it; confirm.
mov.f32 %f107, %f108;
|
||||
st.global.v4.f32 [%rd75+0], {%f31,%f30,%f29,%f107};
|
||||
$Lt_1_30210:
|
||||
$Lt_1_22530:
|
||||
.loc 16 190 0
|
||||
exit;
|
||||
$LDWend_kernel_pair_fast:
|
||||
} // kernel_pair_fast
|
||||
|
||||
@ -1,869 +0,0 @@
|
||||
const char * morse =
|
||||
" .version 2.3\n"
|
||||
" .target sm_20\n"
|
||||
" .address_size 64\n"
|
||||
" .global .texref pos_tex;\n"
|
||||
" .entry kernel_pair (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_mor1,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_mor2,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_packed,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair___val_paramengv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_t_per_atom)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<72>;\n"
|
||||
" .reg .u64 %rd<64>;\n"
|
||||
" .reg .f32 %f<104>;\n"
|
||||
" .reg .f64 %fd<10>;\n"
|
||||
" .reg .pred %p<19>;\n"
|
||||
" .shared .align 16 .b8 __cuda___cuda_local_var_32536_33_non_const_sp_lj92[16];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32601_55_non_const_red_acc108[3072];\n"
|
||||
" .loc 16 31 0\n"
|
||||
"$LDWbegin_kernel_pair:\n"
|
||||
" .loc 16 36 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
|
||||
" ldu.global.f32 %f1, [%rd1+0];\n"
|
||||
" .loc 16 37 0\n"
|
||||
" ld.global.f32 %f2, [%rd1+4];\n"
|
||||
" .loc 16 38 0\n"
|
||||
" ld.global.f32 %f3, [%rd1+8];\n"
|
||||
" .loc 16 39 0\n"
|
||||
" ld.global.f32 %f4, [%rd1+12];\n"
|
||||
" st.shared.v4.f32 [__cuda___cuda_local_var_32536_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};\n"
|
||||
" .loc 16 46 0\n"
|
||||
" mov.f32 %f5, 0f00000000; \n"
|
||||
" mov.f32 %f6, %f5;\n"
|
||||
" mov.f32 %f7, 0f00000000; \n"
|
||||
" mov.f32 %f8, %f7;\n"
|
||||
" mov.f32 %f9, 0f00000000; \n"
|
||||
" mov.f32 %f10, %f9;\n"
|
||||
" mov.f32 %f11, 0f00000000; \n"
|
||||
" mov.f32 %f12, %f11;\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, %f13;\n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
" ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];\n"
|
||||
" cvt.s32.u32 %r2, %tid.x;\n"
|
||||
" div.s32 %r3, %r2, %r1;\n"
|
||||
" cvt.s32.u32 %r4, %ntid.x;\n"
|
||||
" div.s32 %r5, %r4, %r1;\n"
|
||||
" cvt.s32.u32 %r6, %ctaid.x;\n"
|
||||
" mul.lo.s32 %r7, %r6, %r5;\n"
|
||||
" add.s32 %r8, %r3, %r7;\n"
|
||||
" ld.param.s32 %r9, [__cudaparm_kernel_pair_inum];\n"
|
||||
" setp.ge.s32 %p1, %r8, %r9;\n"
|
||||
" @%p1 bra $Lt_0_26370;\n"
|
||||
" .loc 16 51 0\n"
|
||||
" ld.param.s32 %r10, [__cudaparm_kernel_pair_nbor_pitch];\n"
|
||||
" cvt.s64.s32 %rd2, %r10;\n"
|
||||
" mul.wide.s32 %rd3, %r10, 4;\n"
|
||||
" cvt.s64.s32 %rd4, %r8;\n"
|
||||
" mul.wide.s32 %rd5, %r8, 4;\n"
|
||||
" ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];\n"
|
||||
" add.u64 %rd7, %rd5, %rd6;\n"
|
||||
" add.u64 %rd8, %rd3, %rd7;\n"
|
||||
" ld.global.s32 %r11, [%rd8+0];\n"
|
||||
" sub.s32 %r12, %r1, 1;\n"
|
||||
" and.b32 %r13, %r12, %r2;\n"
|
||||
" cvt.s64.s32 %rd9, %r13;\n"
|
||||
" mul.wide.s32 %rd10, %r13, 4;\n"
|
||||
" ld.param.u64 %rd11, [__cudaparm_kernel_pair_dev_packed];\n"
|
||||
" setp.ne.u64 %p2, %rd11, %rd6;\n"
|
||||
" @%p2 bra $Lt_0_19458;\n"
|
||||
" cvt.s32.s64 %r14, %rd2;\n"
|
||||
" mul.lo.s32 %r15, %r14, %r1;\n"
|
||||
" mov.s32 %r16, %r15;\n"
|
||||
" mul.lo.s32 %r17, %r12, %r8;\n"
|
||||
" add.s32 %r18, %r14, %r17;\n"
|
||||
" cvt.s64.s32 %rd12, %r18;\n"
|
||||
" mul.wide.s32 %rd13, %r18, 4;\n"
|
||||
" add.u64 %rd14, %rd8, %rd13;\n"
|
||||
" and.b32 %r19, %r12, %r11;\n"
|
||||
" cvt.s64.s32 %rd15, %r19;\n"
|
||||
" div.s32 %r20, %r11, %r1;\n"
|
||||
" mul.lo.s32 %r21, %r15, %r20;\n"
|
||||
" cvt.s64.s32 %rd16, %r21;\n"
|
||||
" add.u64 %rd17, %rd15, %rd16;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" add.u64 %rd19, %rd14, %rd18;\n"
|
||||
" add.u64 %rd20, %rd10, %rd14;\n"
|
||||
" bra.uni $Lt_0_19202;\n"
|
||||
"$Lt_0_19458:\n"
|
||||
" add.u64 %rd21, %rd3, %rd8;\n"
|
||||
" ld.global.s32 %r22, [%rd21+0];\n"
|
||||
" cvt.s64.s32 %rd22, %r22;\n"
|
||||
" mul.wide.s32 %rd23, %r22, 4;\n"
|
||||
" add.u64 %rd24, %rd11, %rd23;\n"
|
||||
" cvt.s64.s32 %rd25, %r11;\n"
|
||||
" mul.wide.s32 %rd26, %r11, 4;\n"
|
||||
" add.u64 %rd19, %rd24, %rd26;\n"
|
||||
" mov.s32 %r16, %r1;\n"
|
||||
" add.u64 %rd20, %rd10, %rd24;\n"
|
||||
"$Lt_0_19202:\n"
|
||||
" .loc 16 54 0\n"
|
||||
" ld.global.s32 %r23, [%rd7+0];\n"
|
||||
" mov.u32 %r24, %r23;\n"
|
||||
" mov.s32 %r25, 0;\n"
|
||||
" mov.u32 %r26, %r25;\n"
|
||||
" mov.s32 %r27, 0;\n"
|
||||
" mov.u32 %r28, %r27;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.u32 %r30, %r29;\n"
|
||||
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r24,%r26,%r28,%r30}];\n"
|
||||
" mov.f32 %f21, %f17;\n"
|
||||
" mov.f32 %f22, %f18;\n"
|
||||
" mov.f32 %f23, %f19;\n"
|
||||
" mov.f32 %f24, %f20;\n"
|
||||
" setp.ge.u64 %p3, %rd20, %rd19;\n"
|
||||
" @%p3 bra $Lt_0_27906;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r31, %f24;\n"
|
||||
" cvt.s64.s32 %rd27, %r16;\n"
|
||||
" ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];\n"
|
||||
" mul.lo.s32 %r33, %r32, %r31;\n"
|
||||
" ld.param.u64 %rd28, [__cudaparm_kernel_pair_mor1];\n"
|
||||
" mov.f32 %f25, 0f00000000; \n"
|
||||
" mov.f32 %f26, 0f00000000; \n"
|
||||
" mov.f32 %f27, 0f00000000; \n"
|
||||
" mov.f32 %f28, 0f00000000; \n"
|
||||
" mov.u64 %rd29, __cuda___cuda_local_var_32536_33_non_const_sp_lj92;\n"
|
||||
"$Lt_0_20226:\n"
|
||||
" .loc 16 60 0\n"
|
||||
" ld.global.s32 %r34, [%rd20+0];\n"
|
||||
" .loc 16 61 0\n"
|
||||
" shr.s32 %r35, %r34, 30;\n"
|
||||
" and.b32 %r36, %r35, 3;\n"
|
||||
" cvt.s64.s32 %rd30, %r36;\n"
|
||||
" mul.wide.s32 %rd31, %r36, 4;\n"
|
||||
" add.u64 %rd32, %rd29, %rd31;\n"
|
||||
" ld.shared.f32 %f29, [%rd32+0];\n"
|
||||
" .loc 16 64 0\n"
|
||||
" and.b32 %r37, %r34, 1073741823;\n"
|
||||
" mov.u32 %r38, %r37;\n"
|
||||
" mov.s32 %r39, 0;\n"
|
||||
" mov.u32 %r40, %r39;\n"
|
||||
" mov.s32 %r41, 0;\n"
|
||||
" mov.u32 %r42, %r41;\n"
|
||||
" mov.s32 %r43, 0;\n"
|
||||
" mov.u32 %r44, %r43;\n"
|
||||
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r38,%r40,%r42,%r44}];\n"
|
||||
" mov.f32 %f34, %f30;\n"
|
||||
" mov.f32 %f35, %f31;\n"
|
||||
" mov.f32 %f36, %f32;\n"
|
||||
" mov.f32 %f37, %f33;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r45, %f37;\n"
|
||||
" sub.ftz.f32 %f38, %f22, %f35;\n"
|
||||
" sub.ftz.f32 %f39, %f21, %f34;\n"
|
||||
" sub.ftz.f32 %f40, %f23, %f36;\n"
|
||||
" mul.ftz.f32 %f41, %f38, %f38;\n"
|
||||
" fma.rn.ftz.f32 %f42, %f39, %f39, %f41;\n"
|
||||
" add.s32 %r46, %r45, %r33;\n"
|
||||
" cvt.s64.s32 %rd33, %r46;\n"
|
||||
" fma.rn.ftz.f32 %f43, %f40, %f40, %f42;\n"
|
||||
" mul.wide.s32 %rd34, %r46, 16;\n"
|
||||
" add.u64 %rd35, %rd28, %rd34;\n"
|
||||
" ld.global.f32 %f44, [%rd35+0];\n"
|
||||
" setp.gt.ftz.f32 %p4, %f44, %f43;\n"
|
||||
" @!%p4 bra $Lt_0_21506;\n"
|
||||
" .loc 16 77 0\n"
|
||||
" sqrt.approx.ftz.f32 %f45, %f43;\n"
|
||||
" ld.global.v4.f32 {_,%f46,%f47,%f48}, [%rd35+0];\n"
|
||||
" sub.ftz.f32 %f49, %f45, %f47;\n"
|
||||
" mul.ftz.f32 %f50, %f48, %f49;\n"
|
||||
" neg.ftz.f32 %f51, %f50;\n"
|
||||
" .loc 16 79 0\n"
|
||||
" mov.f32 %f52, 0f3fb8aa3b; \n"
|
||||
" mul.ftz.f32 %f53, %f51, %f52;\n"
|
||||
" ex2.approx.ftz.f32 %f54, %f53;\n"
|
||||
" mul.ftz.f32 %f55, %f54, %f54;\n"
|
||||
" sub.ftz.f32 %f56, %f55, %f54;\n"
|
||||
" mul.ftz.f32 %f57, %f46, %f56;\n"
|
||||
" .loc 16 81 0\n"
|
||||
" div.approx.ftz.f32 %f58, %f57, %f45;\n"
|
||||
" mul.ftz.f32 %f59, %f58, %f29;\n"
|
||||
" fma.rn.ftz.f32 %f27, %f39, %f59, %f27;\n"
|
||||
" .loc 16 82 0\n"
|
||||
" fma.rn.ftz.f32 %f26, %f38, %f59, %f26;\n"
|
||||
" .loc 16 83 0\n"
|
||||
" fma.rn.ftz.f32 %f25, %f40, %f59, %f25;\n"
|
||||
" ld.param.s32 %r47, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" mov.u32 %r48, 0;\n"
|
||||
" setp.le.s32 %p5, %r47, %r48;\n"
|
||||
" @%p5 bra $Lt_0_20994;\n"
|
||||
" .loc 16 87 0\n"
|
||||
" cvt.ftz.f64.f32 %fd1, %f54;\n"
|
||||
" ld.param.u64 %rd36, [__cudaparm_kernel_pair_mor2];\n"
|
||||
" mul.lo.u64 %rd37, %rd33, 8;\n"
|
||||
" add.u64 %rd38, %rd36, %rd37;\n"
|
||||
" ld.global.v2.f32 {%f60,%f61}, [%rd38+0];\n"
|
||||
" cvt.ftz.f64.f32 %fd2, %f61;\n"
|
||||
" cvt.ftz.f64.f32 %fd3, %f60;\n"
|
||||
" mul.ftz.f32 %f62, %f54, %f54;\n"
|
||||
" cvt.ftz.f64.f32 %fd4, %f62;\n"
|
||||
" add.f64 %fd5, %fd1, %fd1;\n"
|
||||
" sub.f64 %fd6, %fd4, %fd5;\n"
|
||||
" mul.f64 %fd7, %fd3, %fd6;\n"
|
||||
" sub.f64 %fd8, %fd7, %fd2;\n"
|
||||
" cvt.rn.ftz.f32.f64 %f63, %fd8;\n"
|
||||
" fma.rn.ftz.f32 %f28, %f29, %f63, %f28;\n"
|
||||
"$Lt_0_20994:\n"
|
||||
" ld.param.s32 %r49, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r50, 0;\n"
|
||||
" setp.le.s32 %p6, %r49, %r50;\n"
|
||||
" @%p6 bra $Lt_0_21506;\n"
|
||||
" .loc 16 90 0\n"
|
||||
" mov.f32 %f64, %f6;\n"
|
||||
" mul.ftz.f32 %f65, %f39, %f39;\n"
|
||||
" fma.rn.ftz.f32 %f66, %f59, %f65, %f64;\n"
|
||||
" mov.f32 %f6, %f66;\n"
|
||||
" .loc 16 91 0\n"
|
||||
" mov.f32 %f67, %f8;\n"
|
||||
" fma.rn.ftz.f32 %f68, %f59, %f41, %f67;\n"
|
||||
" mov.f32 %f8, %f68;\n"
|
||||
" .loc 16 92 0\n"
|
||||
" mov.f32 %f69, %f10;\n"
|
||||
" mul.ftz.f32 %f70, %f40, %f40;\n"
|
||||
" fma.rn.ftz.f32 %f71, %f59, %f70, %f69;\n"
|
||||
" mov.f32 %f10, %f71;\n"
|
||||
" .loc 16 93 0\n"
|
||||
" mov.f32 %f72, %f12;\n"
|
||||
" mul.ftz.f32 %f73, %f38, %f39;\n"
|
||||
" fma.rn.ftz.f32 %f74, %f59, %f73, %f72;\n"
|
||||
" mov.f32 %f12, %f74;\n"
|
||||
" .loc 16 94 0\n"
|
||||
" mov.f32 %f75, %f14;\n"
|
||||
" mul.ftz.f32 %f76, %f39, %f40;\n"
|
||||
" fma.rn.ftz.f32 %f77, %f59, %f76, %f75;\n"
|
||||
" mov.f32 %f14, %f77;\n"
|
||||
" .loc 16 95 0\n"
|
||||
" mul.ftz.f32 %f78, %f38, %f40;\n"
|
||||
" fma.rn.ftz.f32 %f15, %f59, %f78, %f15;\n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
"$Lt_0_21506:\n"
|
||||
"$Lt_0_20482:\n"
|
||||
" .loc 16 58 0\n"
|
||||
" mul.lo.u64 %rd39, %rd27, 4;\n"
|
||||
" add.u64 %rd20, %rd20, %rd39;\n"
|
||||
" setp.lt.u64 %p7, %rd20, %rd19;\n"
|
||||
" @%p7 bra $Lt_0_20226;\n"
|
||||
" bra.uni $Lt_0_19714;\n"
|
||||
"$Lt_0_27906:\n"
|
||||
" mov.f32 %f25, 0f00000000; \n"
|
||||
" mov.f32 %f26, 0f00000000; \n"
|
||||
" mov.f32 %f27, 0f00000000; \n"
|
||||
" mov.f32 %f28, 0f00000000; \n"
|
||||
"$Lt_0_19714:\n"
|
||||
" mov.u32 %r51, 1;\n"
|
||||
" setp.le.s32 %p8, %r1, %r51;\n"
|
||||
" @%p8 bra $Lt_0_24322;\n"
|
||||
" .loc 16 100 0\n"
|
||||
" mov.u64 %rd40, __cuda___cuda_local_var_32601_55_non_const_red_acc108;\n"
|
||||
" cvt.s64.s32 %rd41, %r2;\n"
|
||||
" mul.wide.s32 %rd42, %r2, 4;\n"
|
||||
" add.u64 %rd43, %rd40, %rd42;\n"
|
||||
" mov.f32 %f79, %f27;\n"
|
||||
" st.shared.f32 [%rd43+0], %f79;\n"
|
||||
" mov.f32 %f80, %f26;\n"
|
||||
" st.shared.f32 [%rd43+512], %f80;\n"
|
||||
" mov.f32 %f81, %f25;\n"
|
||||
" st.shared.f32 [%rd43+1024], %f81;\n"
|
||||
" mov.f32 %f82, %f28;\n"
|
||||
" st.shared.f32 [%rd43+1536], %f82;\n"
|
||||
" shr.s32 %r52, %r1, 31;\n"
|
||||
" mov.s32 %r53, 1;\n"
|
||||
" and.b32 %r54, %r52, %r53;\n"
|
||||
" add.s32 %r55, %r54, %r1;\n"
|
||||
" shr.s32 %r56, %r55, 1;\n"
|
||||
" mov.s32 %r57, %r56;\n"
|
||||
" mov.u32 %r58, 0;\n"
|
||||
" setp.ne.u32 %p9, %r56, %r58;\n"
|
||||
" @!%p9 bra $Lt_0_22786;\n"
|
||||
"$Lt_0_23298:\n"
|
||||
" setp.ge.u32 %p10, %r13, %r57;\n"
|
||||
" @%p10 bra $Lt_0_23554;\n"
|
||||
" add.u32 %r59, %r2, %r57;\n"
|
||||
" cvt.u64.u32 %rd44, %r59;\n"
|
||||
" mul.wide.u32 %rd45, %r59, 4;\n"
|
||||
" add.u64 %rd46, %rd40, %rd45;\n"
|
||||
" ld.shared.f32 %f83, [%rd46+0];\n"
|
||||
" add.ftz.f32 %f79, %f83, %f79;\n"
|
||||
" st.shared.f32 [%rd43+0], %f79;\n"
|
||||
" ld.shared.f32 %f84, [%rd46+512];\n"
|
||||
" add.ftz.f32 %f80, %f84, %f80;\n"
|
||||
" st.shared.f32 [%rd43+512], %f80;\n"
|
||||
" ld.shared.f32 %f85, [%rd46+1024];\n"
|
||||
" add.ftz.f32 %f81, %f85, %f81;\n"
|
||||
" st.shared.f32 [%rd43+1024], %f81;\n"
|
||||
" ld.shared.f32 %f86, [%rd46+1536];\n"
|
||||
" add.ftz.f32 %f82, %f86, %f82;\n"
|
||||
" st.shared.f32 [%rd43+1536], %f82;\n"
|
||||
"$Lt_0_23554:\n"
|
||||
" shr.u32 %r57, %r57, 1;\n"
|
||||
" mov.u32 %r60, 0;\n"
|
||||
" setp.ne.u32 %p11, %r57, %r60;\n"
|
||||
" @%p11 bra $Lt_0_23298;\n"
|
||||
"$Lt_0_22786:\n"
|
||||
" mov.f32 %f27, %f79;\n"
|
||||
" mov.f32 %f26, %f80;\n"
|
||||
" mov.f32 %f25, %f81;\n"
|
||||
" mov.f32 %f28, %f82;\n"
|
||||
" ld.param.s32 %r61, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r62, 0;\n"
|
||||
" setp.le.s32 %p12, %r61, %r62;\n"
|
||||
" @%p12 bra $Lt_0_24322;\n"
|
||||
" mov.f32 %f79, %f6;\n"
|
||||
" st.shared.f32 [%rd43+0], %f79;\n"
|
||||
" mov.f32 %f80, %f8;\n"
|
||||
" st.shared.f32 [%rd43+512], %f80;\n"
|
||||
" mov.f32 %f81, %f10;\n"
|
||||
" st.shared.f32 [%rd43+1024], %f81;\n"
|
||||
" mov.f32 %f82, %f12;\n"
|
||||
" st.shared.f32 [%rd43+1536], %f82;\n"
|
||||
" mov.f32 %f87, %f14;\n"
|
||||
" st.shared.f32 [%rd43+2048], %f87;\n"
|
||||
" mov.f32 %f88, %f15;\n"
|
||||
" st.shared.f32 [%rd43+2560], %f88;\n"
|
||||
" mov.s32 %r63, %r56;\n"
|
||||
" @!%p9 bra $Lt_0_24834;\n"
|
||||
"$Lt_0_25346:\n"
|
||||
" setp.ge.u32 %p13, %r13, %r63;\n"
|
||||
" @%p13 bra $Lt_0_25602;\n"
|
||||
" add.u32 %r64, %r2, %r63;\n"
|
||||
" cvt.u64.u32 %rd47, %r64;\n"
|
||||
" mul.wide.u32 %rd48, %r64, 4;\n"
|
||||
" add.u64 %rd49, %rd40, %rd48;\n"
|
||||
" ld.shared.f32 %f89, [%rd49+0];\n"
|
||||
" add.ftz.f32 %f79, %f89, %f79;\n"
|
||||
" st.shared.f32 [%rd43+0], %f79;\n"
|
||||
" ld.shared.f32 %f90, [%rd49+512];\n"
|
||||
" add.ftz.f32 %f80, %f90, %f80;\n"
|
||||
" st.shared.f32 [%rd43+512], %f80;\n"
|
||||
" ld.shared.f32 %f91, [%rd49+1024];\n"
|
||||
" add.ftz.f32 %f81, %f91, %f81;\n"
|
||||
" st.shared.f32 [%rd43+1024], %f81;\n"
|
||||
" ld.shared.f32 %f92, [%rd49+1536];\n"
|
||||
" add.ftz.f32 %f82, %f92, %f82;\n"
|
||||
" st.shared.f32 [%rd43+1536], %f82;\n"
|
||||
" ld.shared.f32 %f93, [%rd49+2048];\n"
|
||||
" add.ftz.f32 %f87, %f93, %f87;\n"
|
||||
" st.shared.f32 [%rd43+2048], %f87;\n"
|
||||
" ld.shared.f32 %f94, [%rd49+2560];\n"
|
||||
" add.ftz.f32 %f88, %f94, %f88;\n"
|
||||
" st.shared.f32 [%rd43+2560], %f88;\n"
|
||||
"$Lt_0_25602:\n"
|
||||
" shr.u32 %r63, %r63, 1;\n"
|
||||
" mov.u32 %r65, 0;\n"
|
||||
" setp.ne.u32 %p14, %r63, %r65;\n"
|
||||
" @%p14 bra $Lt_0_25346;\n"
|
||||
"$Lt_0_24834:\n"
|
||||
" mov.f32 %f6, %f79;\n"
|
||||
" mov.f32 %f8, %f80;\n"
|
||||
" mov.f32 %f10, %f81;\n"
|
||||
" mov.f32 %f12, %f82;\n"
|
||||
" mov.f32 %f14, %f87;\n"
|
||||
" mov.f32 %f16, %f88;\n"
|
||||
"$Lt_0_24322:\n"
|
||||
"$Lt_0_22274:\n"
|
||||
" mov.u32 %r66, 0;\n"
|
||||
" setp.ne.s32 %p15, %r13, %r66;\n"
|
||||
" @%p15 bra $Lt_0_26370;\n"
|
||||
" ld.param.u64 %rd50, [__cudaparm_kernel_pair___val_paramengv];\n"
|
||||
" add.u64 %rd51, %rd50, %rd5;\n"
|
||||
" ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" mov.u32 %r68, 0;\n"
|
||||
" setp.le.s32 %p16, %r67, %r68;\n"
|
||||
" @%p16 bra $Lt_0_26882;\n"
|
||||
" st.global.f32 [%rd51+0], %f28;\n"
|
||||
" cvt.s64.s32 %rd52, %r9;\n"
|
||||
" mul.wide.s32 %rd53, %r9, 4;\n"
|
||||
" add.u64 %rd51, %rd51, %rd53;\n"
|
||||
"$Lt_0_26882:\n"
|
||||
" ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" mov.u32 %r70, 0;\n"
|
||||
" setp.le.s32 %p17, %r69, %r70;\n"
|
||||
" @%p17 bra $Lt_0_27394;\n"
|
||||
" mov.f32 %f95, %f6;\n"
|
||||
" st.global.f32 [%rd51+0], %f95;\n"
|
||||
" cvt.s64.s32 %rd54, %r9;\n"
|
||||
" mul.wide.s32 %rd55, %r9, 4;\n"
|
||||
" add.u64 %rd56, %rd55, %rd51;\n"
|
||||
" mov.f32 %f96, %f8;\n"
|
||||
" st.global.f32 [%rd56+0], %f96;\n"
|
||||
" add.u64 %rd57, %rd55, %rd56;\n"
|
||||
" mov.f32 %f97, %f10;\n"
|
||||
" st.global.f32 [%rd57+0], %f97;\n"
|
||||
" add.u64 %rd58, %rd55, %rd57;\n"
|
||||
" mov.f32 %f98, %f12;\n"
|
||||
" st.global.f32 [%rd58+0], %f98;\n"
|
||||
" add.u64 %rd51, %rd55, %rd58;\n"
|
||||
" mov.f32 %f99, %f14;\n"
|
||||
" st.global.f32 [%rd51+0], %f99;\n"
|
||||
" mov.f32 %f100, %f16;\n"
|
||||
" add.u64 %rd59, %rd55, %rd51;\n"
|
||||
" st.global.f32 [%rd59+0], %f100;\n"
|
||||
"$Lt_0_27394:\n"
|
||||
" ld.param.u64 %rd60, [__cudaparm_kernel_pair_ans];\n"
|
||||
" mul.lo.u64 %rd61, %rd4, 16;\n"
|
||||
" add.u64 %rd62, %rd60, %rd61;\n"
|
||||
" mov.f32 %f101, %f102;\n"
|
||||
" st.global.v4.f32 [%rd62+0], {%f27,%f26,%f25,%f101};\n"
|
||||
"$Lt_0_26370:\n"
|
||||
"$Lt_0_18690:\n"
|
||||
" .loc 16 103 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair:\n"
|
||||
" }\n"
|
||||
" .entry kernel_pair_fast (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_mor1_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_mor2_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_packed,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast___val_paramengv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_t_per_atom)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<74>;\n"
|
||||
" .reg .u64 %rd<77>;\n"
|
||||
" .reg .f32 %f<110>;\n"
|
||||
" .reg .pred %p<22>;\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32618_33_non_const_sp_lj3268[16];\n"
|
||||
" .shared .align 16 .b8 __cuda___cuda_local_var_32616_34_non_const_mor13296[1936];\n"
|
||||
" .shared .align 8 .b8 __cuda___cuda_local_var_32617_34_non_const_mor25232[968];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32688_55_non_const_red_acc6200[3072];\n"
|
||||
" .loc 16 111 0\n"
|
||||
"$LDWbegin_kernel_pair_fast:\n"
|
||||
" cvt.s32.u32 %r1, %tid.x;\n"
|
||||
" mov.u32 %r2, 3;\n"
|
||||
" setp.gt.s32 %p1, %r1, %r2;\n"
|
||||
" @%p1 bra $Lt_1_20994;\n"
|
||||
" .loc 16 119 0\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;\n"
|
||||
" cvt.s64.s32 %rd2, %r1;\n"
|
||||
" mul.wide.s32 %rd3, %r1, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f32 %f1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd1;\n"
|
||||
" st.shared.f32 [%rd6+0], %f1;\n"
|
||||
"$Lt_1_20994:\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32618_33_non_const_sp_lj3268;\n"
|
||||
" mov.u32 %r3, 120;\n"
|
||||
" setp.gt.s32 %p2, %r1, %r3;\n"
|
||||
" @%p2 bra $Lt_1_21506;\n"
|
||||
" .loc 16 121 0\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_mor13296;\n"
|
||||
" cvt.s64.s32 %rd8, %r1;\n"
|
||||
" mul.wide.s32 %rd9, %r1, 16;\n"
|
||||
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_mor1_in];\n"
|
||||
" add.u64 %rd11, %rd10, %rd9;\n"
|
||||
" add.u64 %rd12, %rd9, %rd7;\n"
|
||||
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
|
||||
" st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};\n"
|
||||
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r5, 0;\n"
|
||||
" setp.le.s32 %p3, %r4, %r5;\n"
|
||||
" @%p3 bra $Lt_1_22018;\n"
|
||||
" .loc 16 123 0\n"
|
||||
" mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232;\n"
|
||||
" mul.lo.u64 %rd14, %rd8, 8;\n"
|
||||
" ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast_mor2_in];\n"
|
||||
" add.u64 %rd16, %rd15, %rd14;\n"
|
||||
" add.u64 %rd17, %rd14, %rd13;\n"
|
||||
" ld.global.v2.f32 {%f6,%f7}, [%rd16+0];\n"
|
||||
" st.shared.v2.f32 [%rd17+0], {%f6,%f7};\n"
|
||||
"$Lt_1_22018:\n"
|
||||
" mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232;\n"
|
||||
"$Lt_1_21506:\n"
|
||||
" mov.u64 %rd13, __cuda___cuda_local_var_32617_34_non_const_mor25232;\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32616_34_non_const_mor13296;\n"
|
||||
" .loc 16 131 0\n"
|
||||
" mov.f32 %f8, 0f00000000; \n"
|
||||
" mov.f32 %f9, %f8;\n"
|
||||
" mov.f32 %f10, 0f00000000; \n"
|
||||
" mov.f32 %f11, %f10;\n"
|
||||
" mov.f32 %f12, 0f00000000; \n"
|
||||
" mov.f32 %f13, %f12;\n"
|
||||
" mov.f32 %f14, 0f00000000; \n"
|
||||
" mov.f32 %f15, %f14;\n"
|
||||
" mov.f32 %f16, 0f00000000; \n"
|
||||
" mov.f32 %f17, %f16;\n"
|
||||
" mov.f32 %f18, 0f00000000; \n"
|
||||
" mov.f32 %f19, %f18;\n"
|
||||
" .loc 16 133 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];\n"
|
||||
" div.s32 %r7, %r1, %r6;\n"
|
||||
" cvt.s32.u32 %r8, %ntid.x;\n"
|
||||
" div.s32 %r9, %r8, %r6;\n"
|
||||
" cvt.s32.u32 %r10, %ctaid.x;\n"
|
||||
" mul.lo.s32 %r11, %r10, %r9;\n"
|
||||
" add.s32 %r12, %r7, %r11;\n"
|
||||
" ld.param.s32 %r13, [__cudaparm_kernel_pair_fast_inum];\n"
|
||||
" setp.ge.s32 %p4, %r12, %r13;\n"
|
||||
" @%p4 bra $Lt_1_30210;\n"
|
||||
" .loc 16 138 0\n"
|
||||
" ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
|
||||
" cvt.s64.s32 %rd18, %r14;\n"
|
||||
" mul.wide.s32 %rd19, %r14, 4;\n"
|
||||
" cvt.s64.s32 %rd20, %r12;\n"
|
||||
" mul.wide.s32 %rd21, %r12, 4;\n"
|
||||
" ld.param.u64 %rd22, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
|
||||
" add.u64 %rd23, %rd21, %rd22;\n"
|
||||
" add.u64 %rd24, %rd19, %rd23;\n"
|
||||
" ld.global.s32 %r15, [%rd24+0];\n"
|
||||
" sub.s32 %r16, %r6, 1;\n"
|
||||
" and.b32 %r17, %r16, %r1;\n"
|
||||
" cvt.s64.s32 %rd25, %r17;\n"
|
||||
" mul.wide.s32 %rd26, %r17, 4;\n"
|
||||
" ld.param.u64 %rd27, [__cudaparm_kernel_pair_fast_dev_packed];\n"
|
||||
" setp.ne.u64 %p5, %rd27, %rd22;\n"
|
||||
" @%p5 bra $Lt_1_23298;\n"
|
||||
" cvt.s32.s64 %r18, %rd18;\n"
|
||||
" mul.lo.s32 %r19, %r18, %r6;\n"
|
||||
" mov.s32 %r20, %r19;\n"
|
||||
" mul.lo.s32 %r21, %r16, %r12;\n"
|
||||
" add.s32 %r22, %r18, %r21;\n"
|
||||
" cvt.s64.s32 %rd28, %r22;\n"
|
||||
" mul.wide.s32 %rd29, %r22, 4;\n"
|
||||
" add.u64 %rd30, %rd24, %rd29;\n"
|
||||
" and.b32 %r23, %r16, %r15;\n"
|
||||
" cvt.s64.s32 %rd31, %r23;\n"
|
||||
" div.s32 %r24, %r15, %r6;\n"
|
||||
" mul.lo.s32 %r25, %r19, %r24;\n"
|
||||
" cvt.s64.s32 %rd32, %r25;\n"
|
||||
" add.u64 %rd33, %rd31, %rd32;\n"
|
||||
" mul.lo.u64 %rd34, %rd33, 4;\n"
|
||||
" add.u64 %rd35, %rd30, %rd34;\n"
|
||||
" add.u64 %rd36, %rd26, %rd30;\n"
|
||||
" bra.uni $Lt_1_23042;\n"
|
||||
"$Lt_1_23298:\n"
|
||||
" add.u64 %rd37, %rd19, %rd24;\n"
|
||||
" ld.global.s32 %r26, [%rd37+0];\n"
|
||||
" cvt.s64.s32 %rd38, %r26;\n"
|
||||
" mul.wide.s32 %rd39, %r26, 4;\n"
|
||||
" add.u64 %rd40, %rd27, %rd39;\n"
|
||||
" cvt.s64.s32 %rd41, %r15;\n"
|
||||
" mul.wide.s32 %rd42, %r15, 4;\n"
|
||||
" add.u64 %rd35, %rd40, %rd42;\n"
|
||||
" mov.s32 %r20, %r6;\n"
|
||||
" add.u64 %rd36, %rd26, %rd40;\n"
|
||||
"$Lt_1_23042:\n"
|
||||
" .loc 16 141 0\n"
|
||||
" ld.global.s32 %r27, [%rd23+0];\n"
|
||||
" mov.u32 %r28, %r27;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.u32 %r30, %r29;\n"
|
||||
" mov.s32 %r31, 0;\n"
|
||||
" mov.u32 %r32, %r31;\n"
|
||||
" mov.s32 %r33, 0;\n"
|
||||
" mov.u32 %r34, %r33;\n"
|
||||
" tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[pos_tex,{%r28,%r30,%r32,%r34}];\n"
|
||||
" mov.f32 %f24, %f20;\n"
|
||||
" mov.f32 %f25, %f21;\n"
|
||||
" mov.f32 %f26, %f22;\n"
|
||||
" mov.f32 %f27, %f23;\n"
|
||||
" setp.ge.u64 %p6, %rd36, %rd35;\n"
|
||||
" @%p6 bra $Lt_1_31746;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r35, %f27;\n"
|
||||
" cvt.s64.s32 %rd43, %r20;\n"
|
||||
" mul.lo.s32 %r36, %r35, 11;\n"
|
||||
" cvt.rn.f32.s32 %f28, %r36;\n"
|
||||
" mov.f32 %f29, 0f00000000; \n"
|
||||
" mov.f32 %f30, 0f00000000; \n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
"$Lt_1_24066:\n"
|
||||
" .loc 16 148 0\n"
|
||||
" ld.global.s32 %r37, [%rd36+0];\n"
|
||||
" .loc 16 149 0\n"
|
||||
" shr.s32 %r38, %r37, 30;\n"
|
||||
" and.b32 %r39, %r38, 3;\n"
|
||||
" cvt.s64.s32 %rd44, %r39;\n"
|
||||
" mul.wide.s32 %rd45, %r39, 4;\n"
|
||||
" add.u64 %rd46, %rd1, %rd45;\n"
|
||||
" ld.shared.f32 %f33, [%rd46+0];\n"
|
||||
" .loc 16 152 0\n"
|
||||
" and.b32 %r40, %r37, 1073741823;\n"
|
||||
" mov.u32 %r41, %r40;\n"
|
||||
" mov.s32 %r42, 0;\n"
|
||||
" mov.u32 %r43, %r42;\n"
|
||||
" mov.s32 %r44, 0;\n"
|
||||
" mov.u32 %r45, %r44;\n"
|
||||
" mov.s32 %r46, 0;\n"
|
||||
" mov.u32 %r47, %r46;\n"
|
||||
" tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r41,%r43,%r45,%r47}];\n"
|
||||
" mov.f32 %f38, %f34;\n"
|
||||
" mov.f32 %f39, %f35;\n"
|
||||
" mov.f32 %f40, %f36;\n"
|
||||
" mov.f32 %f41, %f37;\n"
|
||||
" sub.ftz.f32 %f42, %f25, %f39;\n"
|
||||
" sub.ftz.f32 %f43, %f24, %f38;\n"
|
||||
" sub.ftz.f32 %f44, %f26, %f40;\n"
|
||||
" mul.ftz.f32 %f45, %f42, %f42;\n"
|
||||
" fma.rn.ftz.f32 %f46, %f43, %f43, %f45;\n"
|
||||
" fma.rn.ftz.f32 %f47, %f44, %f44, %f46;\n"
|
||||
" add.ftz.f32 %f48, %f28, %f41;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r48, %f48;\n"
|
||||
" cvt.s64.s32 %rd47, %r48;\n"
|
||||
" mul.wide.s32 %rd48, %r48, 16;\n"
|
||||
" add.u64 %rd49, %rd7, %rd48;\n"
|
||||
" ld.shared.f32 %f49, [%rd49+0];\n"
|
||||
" setp.gt.ftz.f32 %p7, %f49, %f47;\n"
|
||||
" @!%p7 bra $Lt_1_25346;\n"
|
||||
" .loc 16 163 0\n"
|
||||
" sqrt.approx.ftz.f32 %f50, %f47;\n"
|
||||
" ld.shared.v4.f32 {_,%f51,%f52,%f53}, [%rd49+0];\n"
|
||||
" sub.ftz.f32 %f54, %f50, %f52;\n"
|
||||
" .loc 16 164 0\n"
|
||||
" mul.ftz.f32 %f55, %f53, %f54;\n"
|
||||
" neg.ftz.f32 %f56, %f55;\n"
|
||||
" .loc 16 166 0\n"
|
||||
" mov.f32 %f57, 0f3fb8aa3b; \n"
|
||||
" mul.ftz.f32 %f58, %f56, %f57;\n"
|
||||
" ex2.approx.ftz.f32 %f59, %f58;\n"
|
||||
" mul.ftz.f32 %f60, %f59, %f59;\n"
|
||||
" sub.ftz.f32 %f61, %f60, %f59;\n"
|
||||
" mul.ftz.f32 %f62, %f51, %f61;\n"
|
||||
" .loc 16 168 0\n"
|
||||
" div.approx.ftz.f32 %f63, %f62, %f50;\n"
|
||||
" mul.ftz.f32 %f64, %f63, %f33;\n"
|
||||
" fma.rn.ftz.f32 %f31, %f43, %f64, %f31;\n"
|
||||
" .loc 16 169 0\n"
|
||||
" fma.rn.ftz.f32 %f30, %f42, %f64, %f30;\n"
|
||||
" .loc 16 170 0\n"
|
||||
" fma.rn.ftz.f32 %f29, %f44, %f64, %f29;\n"
|
||||
" ld.param.s32 %r49, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r50, 0;\n"
|
||||
" setp.le.s32 %p8, %r49, %r50;\n"
|
||||
" @%p8 bra $Lt_1_24834;\n"
|
||||
" .loc 16 173 0\n"
|
||||
" mul.lo.u64 %rd50, %rd47, 8;\n"
|
||||
" add.u64 %rd51, %rd13, %rd50;\n"
|
||||
" ld.shared.v2.f32 {%f65,%f66}, [%rd51+0];\n"
|
||||
" sub.ftz.f32 %f67, %f61, %f59;\n"
|
||||
" mul.ftz.f32 %f68, %f65, %f67;\n"
|
||||
" sub.ftz.f32 %f69, %f68, %f66;\n"
|
||||
" .loc 16 174 0\n"
|
||||
" fma.rn.ftz.f32 %f32, %f33, %f69, %f32;\n"
|
||||
"$Lt_1_24834:\n"
|
||||
" ld.param.s32 %r51, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r52, 0;\n"
|
||||
" setp.le.s32 %p9, %r51, %r52;\n"
|
||||
" @%p9 bra $Lt_1_25346;\n"
|
||||
" .loc 16 177 0\n"
|
||||
" mov.f32 %f70, %f9;\n"
|
||||
" mul.ftz.f32 %f71, %f43, %f43;\n"
|
||||
" fma.rn.ftz.f32 %f72, %f64, %f71, %f70;\n"
|
||||
" mov.f32 %f9, %f72;\n"
|
||||
" .loc 16 178 0\n"
|
||||
" mov.f32 %f73, %f11;\n"
|
||||
" fma.rn.ftz.f32 %f74, %f64, %f45, %f73;\n"
|
||||
" mov.f32 %f11, %f74;\n"
|
||||
" .loc 16 179 0\n"
|
||||
" mov.f32 %f75, %f13;\n"
|
||||
" mul.ftz.f32 %f76, %f44, %f44;\n"
|
||||
" fma.rn.ftz.f32 %f77, %f64, %f76, %f75;\n"
|
||||
" mov.f32 %f13, %f77;\n"
|
||||
" .loc 16 180 0\n"
|
||||
" mov.f32 %f78, %f15;\n"
|
||||
" mul.ftz.f32 %f79, %f42, %f43;\n"
|
||||
" fma.rn.ftz.f32 %f80, %f64, %f79, %f78;\n"
|
||||
" mov.f32 %f15, %f80;\n"
|
||||
" .loc 16 181 0\n"
|
||||
" mov.f32 %f81, %f17;\n"
|
||||
" mul.ftz.f32 %f82, %f43, %f44;\n"
|
||||
" fma.rn.ftz.f32 %f83, %f64, %f82, %f81;\n"
|
||||
" mov.f32 %f17, %f83;\n"
|
||||
" .loc 16 182 0\n"
|
||||
" mul.ftz.f32 %f84, %f42, %f44;\n"
|
||||
" fma.rn.ftz.f32 %f18, %f64, %f84, %f18;\n"
|
||||
" mov.f32 %f19, %f18;\n"
|
||||
"$Lt_1_25346:\n"
|
||||
"$Lt_1_24322:\n"
|
||||
" .loc 16 146 0\n"
|
||||
" mul.lo.u64 %rd52, %rd43, 4;\n"
|
||||
" add.u64 %rd36, %rd36, %rd52;\n"
|
||||
" setp.lt.u64 %p10, %rd36, %rd35;\n"
|
||||
" @%p10 bra $Lt_1_24066;\n"
|
||||
" bra.uni $Lt_1_23554;\n"
|
||||
"$Lt_1_31746:\n"
|
||||
" mov.f32 %f29, 0f00000000; \n"
|
||||
" mov.f32 %f30, 0f00000000; \n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
"$Lt_1_23554:\n"
|
||||
" mov.u32 %r53, 1;\n"
|
||||
" setp.le.s32 %p11, %r6, %r53;\n"
|
||||
" @%p11 bra $Lt_1_28162;\n"
|
||||
" .loc 16 187 0\n"
|
||||
" mov.u64 %rd53, __cuda___cuda_local_var_32688_55_non_const_red_acc6200;\n"
|
||||
" cvt.s64.s32 %rd54, %r1;\n"
|
||||
" mul.wide.s32 %rd55, %r1, 4;\n"
|
||||
" add.u64 %rd56, %rd53, %rd55;\n"
|
||||
" mov.f32 %f85, %f31;\n"
|
||||
" st.shared.f32 [%rd56+0], %f85;\n"
|
||||
" mov.f32 %f86, %f30;\n"
|
||||
" st.shared.f32 [%rd56+512], %f86;\n"
|
||||
" mov.f32 %f87, %f29;\n"
|
||||
" st.shared.f32 [%rd56+1024], %f87;\n"
|
||||
" mov.f32 %f88, %f32;\n"
|
||||
" st.shared.f32 [%rd56+1536], %f88;\n"
|
||||
" shr.s32 %r54, %r6, 31;\n"
|
||||
" mov.s32 %r55, 1;\n"
|
||||
" and.b32 %r56, %r54, %r55;\n"
|
||||
" add.s32 %r57, %r56, %r6;\n"
|
||||
" shr.s32 %r58, %r57, 1;\n"
|
||||
" mov.s32 %r59, %r58;\n"
|
||||
" mov.u32 %r60, 0;\n"
|
||||
" setp.ne.u32 %p12, %r58, %r60;\n"
|
||||
" @!%p12 bra $Lt_1_26626;\n"
|
||||
"$Lt_1_27138:\n"
|
||||
" setp.ge.u32 %p13, %r17, %r59;\n"
|
||||
" @%p13 bra $Lt_1_27394;\n"
|
||||
" add.u32 %r61, %r1, %r59;\n"
|
||||
" cvt.u64.u32 %rd57, %r61;\n"
|
||||
" mul.wide.u32 %rd58, %r61, 4;\n"
|
||||
" add.u64 %rd59, %rd53, %rd58;\n"
|
||||
" ld.shared.f32 %f89, [%rd59+0];\n"
|
||||
" add.ftz.f32 %f85, %f89, %f85;\n"
|
||||
" st.shared.f32 [%rd56+0], %f85;\n"
|
||||
" ld.shared.f32 %f90, [%rd59+512];\n"
|
||||
" add.ftz.f32 %f86, %f90, %f86;\n"
|
||||
" st.shared.f32 [%rd56+512], %f86;\n"
|
||||
" ld.shared.f32 %f91, [%rd59+1024];\n"
|
||||
" add.ftz.f32 %f87, %f91, %f87;\n"
|
||||
" st.shared.f32 [%rd56+1024], %f87;\n"
|
||||
" ld.shared.f32 %f92, [%rd59+1536];\n"
|
||||
" add.ftz.f32 %f88, %f92, %f88;\n"
|
||||
" st.shared.f32 [%rd56+1536], %f88;\n"
|
||||
"$Lt_1_27394:\n"
|
||||
" shr.u32 %r59, %r59, 1;\n"
|
||||
" mov.u32 %r62, 0;\n"
|
||||
" setp.ne.u32 %p14, %r59, %r62;\n"
|
||||
" @%p14 bra $Lt_1_27138;\n"
|
||||
"$Lt_1_26626:\n"
|
||||
" mov.f32 %f31, %f85;\n"
|
||||
" mov.f32 %f30, %f86;\n"
|
||||
" mov.f32 %f29, %f87;\n"
|
||||
" mov.f32 %f32, %f88;\n"
|
||||
" ld.param.s32 %r63, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r64, 0;\n"
|
||||
" setp.le.s32 %p15, %r63, %r64;\n"
|
||||
" @%p15 bra $Lt_1_28162;\n"
|
||||
" mov.f32 %f85, %f9;\n"
|
||||
" st.shared.f32 [%rd56+0], %f85;\n"
|
||||
" mov.f32 %f86, %f11;\n"
|
||||
" st.shared.f32 [%rd56+512], %f86;\n"
|
||||
" mov.f32 %f87, %f13;\n"
|
||||
" st.shared.f32 [%rd56+1024], %f87;\n"
|
||||
" mov.f32 %f88, %f15;\n"
|
||||
" st.shared.f32 [%rd56+1536], %f88;\n"
|
||||
" mov.f32 %f93, %f17;\n"
|
||||
" st.shared.f32 [%rd56+2048], %f93;\n"
|
||||
" mov.f32 %f94, %f18;\n"
|
||||
" st.shared.f32 [%rd56+2560], %f94;\n"
|
||||
" mov.s32 %r65, %r58;\n"
|
||||
" @!%p12 bra $Lt_1_28674;\n"
|
||||
"$Lt_1_29186:\n"
|
||||
" setp.ge.u32 %p16, %r17, %r65;\n"
|
||||
" @%p16 bra $Lt_1_29442;\n"
|
||||
" add.u32 %r66, %r1, %r65;\n"
|
||||
" cvt.u64.u32 %rd60, %r66;\n"
|
||||
" mul.wide.u32 %rd61, %r66, 4;\n"
|
||||
" add.u64 %rd62, %rd53, %rd61;\n"
|
||||
" ld.shared.f32 %f95, [%rd62+0];\n"
|
||||
" add.ftz.f32 %f85, %f95, %f85;\n"
|
||||
" st.shared.f32 [%rd56+0], %f85;\n"
|
||||
" ld.shared.f32 %f96, [%rd62+512];\n"
|
||||
" add.ftz.f32 %f86, %f96, %f86;\n"
|
||||
" st.shared.f32 [%rd56+512], %f86;\n"
|
||||
" ld.shared.f32 %f97, [%rd62+1024];\n"
|
||||
" add.ftz.f32 %f87, %f97, %f87;\n"
|
||||
" st.shared.f32 [%rd56+1024], %f87;\n"
|
||||
" ld.shared.f32 %f98, [%rd62+1536];\n"
|
||||
" add.ftz.f32 %f88, %f98, %f88;\n"
|
||||
" st.shared.f32 [%rd56+1536], %f88;\n"
|
||||
" ld.shared.f32 %f99, [%rd62+2048];\n"
|
||||
" add.ftz.f32 %f93, %f99, %f93;\n"
|
||||
" st.shared.f32 [%rd56+2048], %f93;\n"
|
||||
" ld.shared.f32 %f100, [%rd62+2560];\n"
|
||||
" add.ftz.f32 %f94, %f100, %f94;\n"
|
||||
" st.shared.f32 [%rd56+2560], %f94;\n"
|
||||
"$Lt_1_29442:\n"
|
||||
" shr.u32 %r65, %r65, 1;\n"
|
||||
" mov.u32 %r67, 0;\n"
|
||||
" setp.ne.u32 %p17, %r65, %r67;\n"
|
||||
" @%p17 bra $Lt_1_29186;\n"
|
||||
"$Lt_1_28674:\n"
|
||||
" mov.f32 %f9, %f85;\n"
|
||||
" mov.f32 %f11, %f86;\n"
|
||||
" mov.f32 %f13, %f87;\n"
|
||||
" mov.f32 %f15, %f88;\n"
|
||||
" mov.f32 %f17, %f93;\n"
|
||||
" mov.f32 %f19, %f94;\n"
|
||||
"$Lt_1_28162:\n"
|
||||
"$Lt_1_26114:\n"
|
||||
" mov.u32 %r68, 0;\n"
|
||||
" setp.ne.s32 %p18, %r17, %r68;\n"
|
||||
" @%p18 bra $Lt_1_30210;\n"
|
||||
" ld.param.u64 %rd63, [__cudaparm_kernel_pair_fast___val_paramengv];\n"
|
||||
" add.u64 %rd64, %rd63, %rd21;\n"
|
||||
" ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r70, 0;\n"
|
||||
" setp.le.s32 %p19, %r69, %r70;\n"
|
||||
" @%p19 bra $Lt_1_30722;\n"
|
||||
" st.global.f32 [%rd64+0], %f32;\n"
|
||||
" cvt.s64.s32 %rd65, %r13;\n"
|
||||
" mul.wide.s32 %rd66, %r13, 4;\n"
|
||||
" add.u64 %rd64, %rd64, %rd66;\n"
|
||||
"$Lt_1_30722:\n"
|
||||
" ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" mov.u32 %r72, 0;\n"
|
||||
" setp.le.s32 %p20, %r71, %r72;\n"
|
||||
" @%p20 bra $Lt_1_31234;\n"
|
||||
" mov.f32 %f101, %f9;\n"
|
||||
" st.global.f32 [%rd64+0], %f101;\n"
|
||||
" cvt.s64.s32 %rd67, %r13;\n"
|
||||
" mul.wide.s32 %rd68, %r13, 4;\n"
|
||||
" add.u64 %rd69, %rd68, %rd64;\n"
|
||||
" mov.f32 %f102, %f11;\n"
|
||||
" st.global.f32 [%rd69+0], %f102;\n"
|
||||
" add.u64 %rd70, %rd68, %rd69;\n"
|
||||
" mov.f32 %f103, %f13;\n"
|
||||
" st.global.f32 [%rd70+0], %f103;\n"
|
||||
" add.u64 %rd71, %rd68, %rd70;\n"
|
||||
" mov.f32 %f104, %f15;\n"
|
||||
" st.global.f32 [%rd71+0], %f104;\n"
|
||||
" add.u64 %rd64, %rd68, %rd71;\n"
|
||||
" mov.f32 %f105, %f17;\n"
|
||||
" st.global.f32 [%rd64+0], %f105;\n"
|
||||
" mov.f32 %f106, %f19;\n"
|
||||
" add.u64 %rd72, %rd68, %rd64;\n"
|
||||
" st.global.f32 [%rd72+0], %f106;\n"
|
||||
"$Lt_1_31234:\n"
|
||||
" ld.param.u64 %rd73, [__cudaparm_kernel_pair_fast_ans];\n"
|
||||
" mul.lo.u64 %rd74, %rd20, 16;\n"
|
||||
" add.u64 %rd75, %rd73, %rd74;\n"
|
||||
" mov.f32 %f107, %f108;\n"
|
||||
" st.global.v4.f32 [%rd75+0], {%f31,%f30,%f29,%f107};\n"
|
||||
"$Lt_1_30210:\n"
|
||||
"$Lt_1_22530:\n"
|
||||
" .loc 16 190 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair_fast:\n"
|
||||
" }\n"
|
||||
;
|
||||
@ -1,132 +0,0 @@
|
||||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_00009a34_00000000-9_lal_neighbor_cpu.cpp3.i (/home/sjplimp/ccBI#.V8lyjI)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_00009a34_00000000-8_lal_neighbor_cpu.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "lal_neighbor_cpu.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
|
||||
// kernel_unpack
// Copies a run of 32-bit values from dev_ij into dev_nbor for every group
// index ii < inum, where ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom).
//
// Parameters (all read with ld.param below):
//   dev_nbor   - 64-bit global address. Read at element ii+inum (value count)
//                and element ii+2*inum (element offset into dev_ij); the copy
//                loop writes starting at element ii+2*inum + ii*(t_per_atom-1)
//                + (tid.x & (t_per_atom-1)).
//   dev_ij     - 64-bit global address of the source values.
//   inum       - s32 upper bound on ii; threads with ii >= inum exit at once.
//   t_per_atom - s32 divisor for tid.x / ntid.x; t_per_atom-1 is also used as
//                a bit mask on tid.x.
//                NOTE(review): the mask equals tid.x % t_per_atom only when
//                t_per_atom is a power of two - confirm with the caller.
.entry kernel_unpack (
|
||||
.param .u64 __cudaparm_kernel_unpack_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_unpack_dev_ij,
|
||||
.param .s32 __cudaparm_kernel_unpack_inum,
|
||||
.param .s32 __cudaparm_kernel_unpack_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<19>;
|
||||
.reg .u64 %rd<33>;
|
||||
.reg .pred %p<5>;
|
||||
.loc 16 21 0
|
||||
$LDWbegin_kernel_unpack:
|
||||
// %r8 = ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom)
ld.param.s32 %r1, [__cudaparm_kernel_unpack_t_per_atom];
|
||||
cvt.s32.u32 %r2, %tid.x;
|
||||
div.s32 %r3, %r2, %r1;
|
||||
cvt.s32.u32 %r4, %ntid.x;
|
||||
div.s32 %r5, %r4, %r1;
|
||||
cvt.s32.u32 %r6, %ctaid.x;
|
||||
mul.lo.s32 %r7, %r6, %r5;
|
||||
add.s32 %r8, %r3, %r7;
|
||||
// Guard: nothing to do when ii >= inum.
ld.param.s32 %r9, [__cudaparm_kernel_unpack_inum];
|
||||
setp.ge.s32 %p1, %r8, %r9;
|
||||
@%p1 bra $Lt_0_2050;
|
||||
.loc 16 30 0
|
||||
// %rd6 = &dev_nbor[ii + inum], %rd8 = &dev_nbor[ii + 2*inum]
cvt.s64.s32 %rd1, %r9;
|
||||
ld.param.u64 %rd2, [__cudaparm_kernel_unpack_dev_nbor];
|
||||
cvt.s64.s32 %rd3, %r8;
|
||||
add.u64 %rd4, %rd3, %rd1;
|
||||
mul.lo.u64 %rd5, %rd4, 4;
|
||||
add.u64 %rd6, %rd2, %rd5;
|
||||
mul.wide.s32 %rd7, %r9, 4;
|
||||
add.u64 %rd8, %rd6, %rd7;
|
||||
// %r10 = dev_nbor[ii + 2*inum] is used as an element offset into dev_ij:
// %rd12 = &dev_ij[%r10] (start of the source run).
ld.param.u64 %rd9, [__cudaparm_kernel_unpack_dev_ij];
|
||||
ld.global.s32 %r10, [%rd8+0];
|
||||
cvt.s64.s32 %rd10, %r10;
|
||||
mul.wide.s32 %rd11, %r10, 4;
|
||||
add.u64 %rd12, %rd9, %rd11;
|
||||
.loc 16 31 0
|
||||
// %r11 = dev_nbor[ii + inum] is the run length; %rd15 = end-of-run pointer.
ld.global.s32 %r11, [%rd6+0];
|
||||
cvt.s64.s32 %rd13, %r11;
|
||||
mul.wide.s32 %rd14, %r11, 4;
|
||||
add.u64 %rd15, %rd12, %rd14;
|
||||
.loc 16 33 0
|
||||
// %r13 = tid.x & (t_per_atom-1); destination cursor
// %rd18 = %rd8 + 4 * (%r13 + (t_per_atom-1) * ii).
sub.s32 %r12, %r1, 1;
|
||||
and.b32 %r13, %r12, %r2;
|
||||
mul.lo.s32 %r14, %r12, %r8;
|
||||
add.s32 %r15, %r13, %r14;
|
||||
cvt.s64.s32 %rd16, %r15;
|
||||
mul.wide.s32 %rd17, %r15, 4;
|
||||
add.u64 %rd18, %rd8, %rd17;
|
||||
.loc 16 34 0
|
||||
// Source cursor %rd21 = run start + 4 * %r13; skip the loop when it is
// already at or past the end pointer (unsigned address compare).
cvt.s64.s32 %rd19, %r13;
|
||||
mul.wide.s32 %rd20, %r13, 4;
|
||||
add.u64 %rd21, %rd12, %rd20;
|
||||
setp.ge.u64 %p2, %rd21, %rd15;
|
||||
@%p2 bra $Lt_0_2562;
|
||||
// %rd22..%rd29 derive a rounded element count from the byte distance; the
// result (%rd29) is not read again anywhere in this kernel.
sub.u64 %rd22, %rd15, %rd21;
|
||||
add.u64 %rd23, %rd22, 3;
|
||||
shr.s64 %rd24, %rd23, 63;
|
||||
mov.s64 %rd25, 3;
|
||||
and.b64 %rd26, %rd24, %rd25;
|
||||
add.s64 %rd27, %rd26, %rd23;
|
||||
shr.s64 %rd28, %rd27, 2;
|
||||
// %r16 = inum * t_per_atom = destination stride in elements.
mul.lo.s32 %r16, %r9, %r1;
|
||||
mov.s64 %rd29, %rd28;
|
||||
$Lt_0_3074:
|
||||
//<loop> Loop body line 34, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 37 0
|
||||
// Copy one 32-bit value from the source cursor to the destination cursor.
ld.global.s32 %r17, [%rd21+0];
|
||||
st.global.s32 [%rd18+0], %r17;
|
||||
.loc 16 38 0
|
||||
// Advance destination by the stride (%r16 elements) and source by one
// element; the loop ends when the source cursor equals the end pointer.
cvt.s64.s32 %rd30, %r16;
|
||||
mul.wide.s32 %rd31, %r16, 4;
|
||||
add.u64 %rd18, %rd18, %rd31;
|
||||
add.u64 %rd21, %rd21, 4;
|
||||
setp.ne.u64 %p3, %rd21, %rd15;
|
||||
@%p3 bra $Lt_0_3074;
|
||||
$Lt_0_2562:
|
||||
$Lt_0_2050:
|
||||
.loc 16 41 0
|
||||
exit;
|
||||
$LDWend_kernel_unpack:
|
||||
} // kernel_unpack
|
||||
|
||||
@ -1,86 +0,0 @@
|
||||
// neighbor_cpu: PTX text for the kernel_unpack entry point, stored as a single
// C string built from adjacent string literals (one PTX line per literal, each
// ending in "\n"). The text targets PTX version 2.3, sm_20, 64-bit addresses.
// NOTE(review): the string keeps ".loc 16 ..." directives but contains no
// ".file" directive - confirm the PTX consumer accepts that.
const char * neighbor_cpu =
|
||||
" .version 2.3\n"
|
||||
" .target sm_20\n"
|
||||
" .address_size 64\n"
|
||||
// Entry point signature: (dev_nbor, dev_ij, inum, t_per_atom).
" .entry kernel_unpack (\n"
|
||||
" .param .u64 __cudaparm_kernel_unpack_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_unpack_dev_ij,\n"
|
||||
" .param .s32 __cudaparm_kernel_unpack_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_unpack_t_per_atom)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<19>;\n"
|
||||
" .reg .u64 %rd<33>;\n"
|
||||
" .reg .pred %p<5>;\n"
|
||||
" .loc 16 21 0\n"
|
||||
"$LDWbegin_kernel_unpack:\n"
|
||||
// %r8 = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom); exit if >= inum.
" ld.param.s32 %r1, [__cudaparm_kernel_unpack_t_per_atom];\n"
|
||||
" cvt.s32.u32 %r2, %tid.x;\n"
|
||||
" div.s32 %r3, %r2, %r1;\n"
|
||||
" cvt.s32.u32 %r4, %ntid.x;\n"
|
||||
" div.s32 %r5, %r4, %r1;\n"
|
||||
" cvt.s32.u32 %r6, %ctaid.x;\n"
|
||||
" mul.lo.s32 %r7, %r6, %r5;\n"
|
||||
" add.s32 %r8, %r3, %r7;\n"
|
||||
" ld.param.s32 %r9, [__cudaparm_kernel_unpack_inum];\n"
|
||||
" setp.ge.s32 %p1, %r8, %r9;\n"
|
||||
" @%p1 bra $Lt_0_2050;\n"
|
||||
" .loc 16 30 0\n"
|
||||
// Address arithmetic into dev_nbor and dev_ij (source start / end pointers).
" cvt.s64.s32 %rd1, %r9;\n"
|
||||
" ld.param.u64 %rd2, [__cudaparm_kernel_unpack_dev_nbor];\n"
|
||||
" cvt.s64.s32 %rd3, %r8;\n"
|
||||
" add.u64 %rd4, %rd3, %rd1;\n"
|
||||
" mul.lo.u64 %rd5, %rd4, 4;\n"
|
||||
" add.u64 %rd6, %rd2, %rd5;\n"
|
||||
" mul.wide.s32 %rd7, %r9, 4;\n"
|
||||
" add.u64 %rd8, %rd6, %rd7;\n"
|
||||
" ld.param.u64 %rd9, [__cudaparm_kernel_unpack_dev_ij];\n"
|
||||
" ld.global.s32 %r10, [%rd8+0];\n"
|
||||
" cvt.s64.s32 %rd10, %r10;\n"
|
||||
" mul.wide.s32 %rd11, %r10, 4;\n"
|
||||
" add.u64 %rd12, %rd9, %rd11;\n"
|
||||
" .loc 16 31 0\n"
|
||||
" ld.global.s32 %r11, [%rd6+0];\n"
|
||||
" cvt.s64.s32 %rd13, %r11;\n"
|
||||
" mul.wide.s32 %rd14, %r11, 4;\n"
|
||||
" add.u64 %rd15, %rd12, %rd14;\n"
|
||||
" .loc 16 33 0\n"
|
||||
// Destination cursor (%rd18) and source cursor (%rd21) setup.
" sub.s32 %r12, %r1, 1;\n"
|
||||
" and.b32 %r13, %r12, %r2;\n"
|
||||
" mul.lo.s32 %r14, %r12, %r8;\n"
|
||||
" add.s32 %r15, %r13, %r14;\n"
|
||||
" cvt.s64.s32 %rd16, %r15;\n"
|
||||
" mul.wide.s32 %rd17, %r15, 4;\n"
|
||||
" add.u64 %rd18, %rd8, %rd17;\n"
|
||||
" .loc 16 34 0\n"
|
||||
" cvt.s64.s32 %rd19, %r13;\n"
|
||||
" mul.wide.s32 %rd20, %r13, 4;\n"
|
||||
" add.u64 %rd21, %rd12, %rd20;\n"
|
||||
" setp.ge.u64 %p2, %rd21, %rd15;\n"
|
||||
" @%p2 bra $Lt_0_2562;\n"
|
||||
" sub.u64 %rd22, %rd15, %rd21;\n"
|
||||
" add.u64 %rd23, %rd22, 3;\n"
|
||||
" shr.s64 %rd24, %rd23, 63;\n"
|
||||
" mov.s64 %rd25, 3;\n"
|
||||
" and.b64 %rd26, %rd24, %rd25;\n"
|
||||
" add.s64 %rd27, %rd26, %rd23;\n"
|
||||
" shr.s64 %rd28, %rd27, 2;\n"
|
||||
" mul.lo.s32 %r16, %r9, %r1;\n"
|
||||
" mov.s64 %rd29, %rd28;\n"
|
||||
// Copy loop: one 32-bit value per iteration, destination stride inum*t_per_atom.
"$Lt_0_3074:\n"
|
||||
" .loc 16 37 0\n"
|
||||
" ld.global.s32 %r17, [%rd21+0];\n"
|
||||
" st.global.s32 [%rd18+0], %r17;\n"
|
||||
" .loc 16 38 0\n"
|
||||
" cvt.s64.s32 %rd30, %r16;\n"
|
||||
" mul.wide.s32 %rd31, %r16, 4;\n"
|
||||
" add.u64 %rd18, %rd18, %rd31;\n"
|
||||
" add.u64 %rd21, %rd21, 4;\n"
|
||||
" setp.ne.u64 %p3, %rd21, %rd15;\n"
|
||||
" @%p3 bra $Lt_0_3074;\n"
|
||||
"$Lt_0_2562:\n"
|
||||
"$Lt_0_2050:\n"
|
||||
" .loc 16 41 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_unpack:\n"
|
||||
" }\n"
|
||||
;
|
||||
@ -1,870 +0,0 @@
|
||||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_00009a53_00000000-9_lal_neighbor_gpu.cpp3.i (/home/sjplimp/ccBI#.a5G2Mh)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_00009a53_00000000-8_lal_neighbor_gpu.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "lal_neighbor_gpu.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
// Module-scope texture reference. The kernels in this file fetch four f32
// components per integer index from it with tex.1d.v4.f32.s32.
.global .texref neigh_tex;
|
||||
|
||||
// calc_cell_id
// For each global thread index i = tid.x + ctaid.x * ntid.x with i < nall,
// fetches three f32 components for index i from neigh_tex, converts each to an
// integer cell coordinate and stores
//   cell_id[i]     = cx + cy * ncellx + cz * ncellx * ncelly
//   particle_id[i] = i
// where, per axis, c = trunc( clamp(p - boxlo, -cell_size,
//                                   (boxhi - boxlo) + cell_size) / cell_size + 1.0 ).
//
// Parameters:
//   pos                 - declared but never loaded in this kernel; the
//                         coordinates come from the neigh_tex texture fetch.
//   cell_id             - 64-bit global address, receives one u32 per thread.
//   particle_id         - 64-bit global address, receives one s32 per thread.
//   boxlo0/1/2          - f32 lower bounds subtracted from components 0/1/2.
//   boxhi0/1/2          - f32 upper bounds used for the clamp.
//   cell_size           - f32 divisor and clamp margin.
//   ncellx, ncelly      - s32 multipliers used to linearise the cell index.
//   nall                - s32 number of indices to process.
.entry calc_cell_id (
|
||||
.param .u64 __cudaparm_calc_cell_id_pos,
|
||||
.param .u64 __cudaparm_calc_cell_id_cell_id,
|
||||
.param .u64 __cudaparm_calc_cell_id_particle_id,
|
||||
.param .f32 __cudaparm_calc_cell_id_boxlo0,
|
||||
.param .f32 __cudaparm_calc_cell_id_boxlo1,
|
||||
.param .f32 __cudaparm_calc_cell_id_boxlo2,
|
||||
.param .f32 __cudaparm_calc_cell_id_boxhi0,
|
||||
.param .f32 __cudaparm_calc_cell_id_boxhi1,
|
||||
.param .f32 __cudaparm_calc_cell_id_boxhi2,
|
||||
.param .f32 __cudaparm_calc_cell_id_cell_size,
|
||||
.param .s32 __cudaparm_calc_cell_id_ncellx,
|
||||
.param .s32 __cudaparm_calc_cell_id_ncelly,
|
||||
.param .s32 __cudaparm_calc_cell_id_nall)
|
||||
{
|
||||
.reg .u32 %r<25>;
|
||||
.reg .u64 %rd<8>;
|
||||
.reg .f32 %f<35>;
|
||||
.reg .f64 %fd<11>;
|
||||
.reg .pred %p<3>;
|
||||
.loc 16 29 0
|
||||
$LDWbegin_calc_cell_id:
|
||||
// %r5 = i = tid.x + ctaid.x * ntid.x; exit when nall <= i.
mov.u32 %r1, %tid.x;
|
||||
mov.u32 %r2, %ctaid.x;
|
||||
mov.u32 %r3, %ntid.x;
|
||||
mul.lo.u32 %r4, %r2, %r3;
|
||||
add.u32 %r5, %r1, %r4;
|
||||
ld.param.s32 %r6, [__cudaparm_calc_cell_id_nall];
|
||||
setp.le.s32 %p1, %r6, %r5;
|
||||
@%p1 bra $Lt_0_1026;
|
||||
.loc 16 33 0
|
||||
// Texture fetch at index i (remaining coordinate slots are zero).
// Components 0..2 land in %f5..%f7; the fourth (%f4) is not used.
mov.u32 %r7, %r5;
|
||||
mov.s32 %r8, 0;
|
||||
mov.u32 %r9, %r8;
|
||||
mov.s32 %r10, 0;
|
||||
mov.u32 %r11, %r10;
|
||||
mov.s32 %r12, 0;
|
||||
mov.u32 %r13, %r12;
|
||||
tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[neigh_tex,{%r7,%r9,%r11,%r13}];
|
||||
mov.f32 %f5, %f1;
|
||||
mov.f32 %f6, %f2;
|
||||
mov.f32 %f7, %f3;
|
||||
.loc 16 46 0
|
||||
// %f8 = cell_size, %f9 = -cell_size (lower clamp bound for every axis).
ld.param.f32 %f8, [__cudaparm_calc_cell_id_cell_size];
|
||||
neg.ftz.f32 %f9, %f8;
|
||||
ld.param.f32 %f10, [__cudaparm_calc_cell_id_boxlo0];
|
||||
ld.param.f32 %f11, [__cudaparm_calc_cell_id_boxlo2];
|
||||
ld.param.f32 %f12, [__cudaparm_calc_cell_id_boxlo1];
|
||||
ld.param.s32 %r14, [__cudaparm_calc_cell_id_ncellx];
|
||||
ld.param.s32 %r15, [__cudaparm_calc_cell_id_ncelly];
|
||||
// Axis 2: %r16 = cz, then %r18 = cz * ncellx * ncelly.
ld.param.f32 %f13, [__cudaparm_calc_cell_id_boxhi2];
|
||||
sub.ftz.f32 %f14, %f13, %f11;
|
||||
add.ftz.f32 %f15, %f8, %f14;
|
||||
sub.ftz.f32 %f16, %f7, %f11;
|
||||
max.ftz.f32 %f17, %f9, %f16;
|
||||
min.ftz.f32 %f18, %f15, %f17;
|
||||
div.approx.ftz.f32 %f19, %f18, %f8;
|
||||
cvt.ftz.f64.f32 %fd1, %f19;
|
||||
mov.f64 %fd2, 0d3ff0000000000000; // 1
|
||||
add.f64 %fd3, %fd1, %fd2;
|
||||
cvt.rzi.u32.f64 %r16, %fd3;
|
||||
mul.lo.u32 %r17, %r14, %r16;
|
||||
mul.lo.u32 %r18, %r15, %r17;
|
||||
// Axis 1: %r19 = cy, then %r21 = cz*ncellx*ncelly + cy*ncellx.
ld.param.f32 %f20, [__cudaparm_calc_cell_id_boxhi1];
|
||||
sub.ftz.f32 %f21, %f20, %f12;
|
||||
add.ftz.f32 %f22, %f8, %f21;
|
||||
sub.ftz.f32 %f23, %f6, %f12;
|
||||
max.ftz.f32 %f24, %f9, %f23;
|
||||
min.ftz.f32 %f25, %f22, %f24;
|
||||
div.approx.ftz.f32 %f26, %f25, %f8;
|
||||
cvt.ftz.f64.f32 %fd4, %f26;
|
||||
mov.f64 %fd5, 0d3ff0000000000000; // 1
|
||||
add.f64 %fd6, %fd4, %fd5;
|
||||
cvt.rzi.u32.f64 %r19, %fd6;
|
||||
mul.lo.u32 %r20, %r14, %r19;
|
||||
add.u32 %r21, %r18, %r20;
|
||||
// Axis 0: %r22 = cx, then %r23 = final linear cell index.
ld.param.f32 %f27, [__cudaparm_calc_cell_id_boxhi0];
|
||||
sub.ftz.f32 %f28, %f27, %f10;
|
||||
add.ftz.f32 %f29, %f8, %f28;
|
||||
sub.ftz.f32 %f30, %f5, %f10;
|
||||
max.ftz.f32 %f31, %f9, %f30;
|
||||
min.ftz.f32 %f32, %f29, %f31;
|
||||
div.approx.ftz.f32 %f33, %f32, %f8;
|
||||
cvt.ftz.f64.f32 %fd7, %f33;
|
||||
mov.f64 %fd8, 0d3ff0000000000000; // 1
|
||||
add.f64 %fd9, %fd7, %fd8;
|
||||
cvt.rzi.u32.f64 %r22, %fd9;
|
||||
add.u32 %r23, %r21, %r22;
|
||||
.loc 16 50 0
|
||||
// cell_id[i] = linear cell index (%rd2 = 4*i is reused for both stores).
cvt.s64.s32 %rd1, %r5;
|
||||
mul.wide.s32 %rd2, %r5, 4;
|
||||
ld.param.u64 %rd3, [__cudaparm_calc_cell_id_cell_id];
|
||||
add.u64 %rd4, %rd3, %rd2;
|
||||
st.global.u32 [%rd4+0], %r23;
|
||||
.loc 16 51 0
|
||||
// particle_id[i] = i
ld.param.u64 %rd5, [__cudaparm_calc_cell_id_particle_id];
|
||||
add.u64 %rd6, %rd5, %rd2;
|
||||
st.global.s32 [%rd6+0], %r5;
|
||||
$Lt_0_1026:
|
||||
.loc 16 53 0
|
||||
exit;
|
||||
$LDWend_calc_cell_id:
|
||||
} // calc_cell_id
|
||||
|
||||
// kernel_calc_cell_counts
// Each thread i = tid.x + ctaid.x * ntid.x with i < nall reads c = cell_id[i]
// and fills ranges of cell_counts:
//   * i == 0        : cell_counts[0 .. c]          = 0
//   * i == nall - 1 : cell_counts[c+1 .. ncell]    = nall
//   * i  > 0 and cell_id[i-1] != c :
//                     cell_counts[cell_id[i-1]+1 .. c] = i
// NOTE(review): presumably cell_id is sorted ascending so that cell_counts
// ends up holding per-cell start offsets - confirm against the caller.
//
// Parameters:
//   cell_id     - 64-bit global address of u32 cell indices (read).
//   cell_counts - 64-bit global address of s32 entries (written).
//   nall        - s32 number of cell_id entries to process.
//   ncell       - s32 last cell_counts index written by thread nall-1.
.entry kernel_calc_cell_counts (
|
||||
.param .u64 __cudaparm_kernel_calc_cell_counts_cell_id,
|
||||
.param .u64 __cudaparm_kernel_calc_cell_counts_cell_counts,
|
||||
.param .s32 __cudaparm_kernel_calc_cell_counts_nall,
|
||||
.param .s32 __cudaparm_kernel_calc_cell_counts_ncell)
|
||||
{
|
||||
.reg .u32 %r<33>;
|
||||
.reg .u64 %rd<15>;
|
||||
.reg .pred %p<13>;
|
||||
.loc 16 56 0
|
||||
$LDWbegin_kernel_calc_cell_counts:
|
||||
// %r5 = i; %p1 = (nall > i). Threads with i >= nall exit immediately.
mov.u32 %r1, %ctaid.x;
|
||||
mov.u32 %r2, %ntid.x;
|
||||
mul.lo.u32 %r3, %r1, %r2;
|
||||
mov.u32 %r4, %tid.x;
|
||||
add.u32 %r5, %r4, %r3;
|
||||
ld.param.s32 %r6, [__cudaparm_kernel_calc_cell_counts_nall];
|
||||
setp.gt.s32 %p1, %r6, %r5;
|
||||
@!%p1 bra $Lt_1_7426;
|
||||
.loc 16 59 0
|
||||
// %rd4 = &cell_id[i]; %r7 = c = cell_id[i].
ld.param.u64 %rd1, [__cudaparm_kernel_calc_cell_counts_cell_id];
|
||||
cvt.s64.s32 %rd2, %r5;
|
||||
mul.wide.s32 %rd3, %r5, 4;
|
||||
add.u64 %rd4, %rd1, %rd3;
|
||||
ld.global.u32 %r7, [%rd4+0];
|
||||
// Case i == 0: zero cell_counts[0 .. c] (c+1 entries, skipped if c+1 <= 0).
mov.u32 %r8, 0;
|
||||
setp.ne.s32 %p2, %r5, %r8;
|
||||
@%p2 bra $Lt_1_7938;
|
||||
add.s32 %r9, %r7, 1;
|
||||
mov.u32 %r10, 0;
|
||||
setp.le.s32 %p3, %r9, %r10;
|
||||
@%p3 bra $Lt_1_8450;
|
||||
mov.s32 %r11, %r9;
|
||||
ld.param.u64 %rd5, [__cudaparm_kernel_calc_cell_counts_cell_counts];
|
||||
mov.s32 %r12, 0;
|
||||
mov.s32 %r13, %r11;
|
||||
$Lt_1_8962:
|
||||
//<loop> Loop body line 59, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 64 0
|
||||
mov.s32 %r14, 0;
|
||||
st.global.s32 [%rd5+0], %r14;
|
||||
add.s32 %r12, %r12, 1;
|
||||
add.u64 %rd5, %rd5, 4;
|
||||
setp.ne.s32 %p4, %r9, %r12;
|
||||
@%p4 bra $Lt_1_8962;
|
||||
$Lt_1_8450:
|
||||
$Lt_1_7938:
|
||||
// Case i == nall-1: store nall into cell_counts[c+1 .. ncell]
// (skipped when c+1 > ncell).
sub.s32 %r15, %r6, 1;
|
||||
setp.ne.s32 %p5, %r5, %r15;
|
||||
@%p5 bra $Lt_1_9474;
|
||||
.loc 16 67 0
|
||||
add.s32 %r9, %r7, 1;
|
||||
mov.s32 %r16, %r9;
|
||||
ld.param.s32 %r17, [__cudaparm_kernel_calc_cell_counts_ncell];
|
||||
setp.gt.s32 %p6, %r9, %r17;
|
||||
@%p6 bra $Lt_1_9986;
|
||||
sub.s32 %r18, %r17, %r7;
|
||||
add.s32 %r19, %r17, 1;
|
||||
ld.param.u64 %rd6, [__cudaparm_kernel_calc_cell_counts_cell_counts];
|
||||
cvt.s64.s32 %rd7, %r9;
|
||||
mul.wide.s32 %rd8, %r9, 4;
|
||||
add.u64 %rd9, %rd6, %rd8;
|
||||
mov.s32 %r20, %r18;
|
||||
$Lt_1_10498:
|
||||
//<loop> Loop body line 67, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 68 0
|
||||
st.global.s32 [%rd9+0], %r6;
|
||||
add.s32 %r16, %r16, 1;
|
||||
add.u64 %rd9, %rd9, 4;
|
||||
setp.ne.s32 %p7, %r19, %r16;
|
||||
@%p7 bra $Lt_1_10498;
|
||||
$Lt_1_9986:
|
||||
$Lt_1_9474:
|
||||
// Case (i < nall) && (i > 0): compare against the previous entry.
selp.s32 %r21, 1, 0, %p1;
|
||||
mov.s32 %r22, 0;
|
||||
set.gt.u32.s32 %r23, %r5, %r22;
|
||||
neg.s32 %r24, %r23;
|
||||
and.b32 %r25, %r21, %r24;
|
||||
mov.u32 %r26, 0;
|
||||
setp.eq.s32 %p8, %r25, %r26;
|
||||
@%p8 bra $Lt_1_11010;
|
||||
.loc 16 72 0
|
||||
// %r27 = cell_id[i-1]; nothing to do when it equals c.
ld.global.u32 %r27, [%rd4+-4];
|
||||
setp.eq.s32 %p9, %r7, %r27;
|
||||
@%p9 bra $Lt_1_11522;
|
||||
.loc 16 74 0
|
||||
// Store i into cell_counts[cell_id[i-1]+1 .. c] (skipped when the range is
// empty, i.e. cell_id[i-1]+1 > c).
add.s32 %r28, %r27, 1;
|
||||
mov.s32 %r29, %r28;
|
||||
setp.gt.s32 %p10, %r28, %r7;
|
||||
@%p10 bra $Lt_1_12034;
|
||||
sub.s32 %r30, %r7, %r27;
|
||||
add.s32 %r9, %r7, 1;
|
||||
ld.param.u64 %rd10, [__cudaparm_kernel_calc_cell_counts_cell_counts];
|
||||
cvt.s64.s32 %rd11, %r28;
|
||||
mul.wide.s32 %rd12, %r28, 4;
|
||||
add.u64 %rd13, %rd10, %rd12;
|
||||
mov.s32 %r31, %r30;
|
||||
$Lt_1_12546:
|
||||
//<loop> Loop body line 74, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 75 0
|
||||
st.global.s32 [%rd13+0], %r5;
|
||||
add.s32 %r29, %r29, 1;
|
||||
add.u64 %rd13, %rd13, 4;
|
||||
setp.ne.s32 %p11, %r9, %r29;
|
||||
@%p11 bra $Lt_1_12546;
|
||||
$Lt_1_12034:
|
||||
$Lt_1_11522:
|
||||
$Lt_1_11010:
|
||||
$Lt_1_7426:
|
||||
.loc 16 79 0
|
||||
exit;
|
||||
$LDWend_kernel_calc_cell_counts:
|
||||
} // kernel_calc_cell_counts
|
||||
|
||||
// transpose
// Transposes a rows_in x columns_in array of 32-bit integers through a
// per-block shared-memory tile:
//   out[col * rows_in + row] = in[row * columns_in + col]
// Each block covers an 8 x 8 tile (block origin = ctaid.x*8, ctaid.y*8).
// Phase 1 loads in[] into the tile at (tid.y, tid.x); after bar.sync, phase 2
// reads the tile at (tid.x, tid.y) and writes out[].
//
// Parameters:
//   out        - 64-bit global address of the destination (s32 stores).
//   in         - 64-bit global address of the source (s32 loads).
//   columns_in - s32 row length of in[]; compared as unsigned for bounds.
//   rows_in    - s32 row count of in[] / row length of out[]; compared as unsigned.
//
// NOTE(review): values are converted s32 -> f32 before the shared store and
// f32 -> s32 (round toward zero) after the shared load, so integers that f32
// cannot represent exactly will not round-trip. Confirm the value range, or
// fix the tile type in lal_neighbor_gpu.cu.
.entry transpose (
|
||||
.param .u64 __cudaparm_transpose_out,
|
||||
.param .u64 __cudaparm_transpose_in,
|
||||
.param .s32 __cudaparm_transpose_columns_in,
|
||||
.param .s32 __cudaparm_transpose_rows_in)
|
||||
{
|
||||
.reg .u32 %r<32>;
|
||||
.reg .u64 %rd<23>;
|
||||
.reg .f32 %f<4>;
|
||||
.reg .pred %p<4>;
|
||||
// 288-byte shared tile, addressed below as 4-byte elements at (a*9 + b).
// NOTE(review): the row stride of 9 for an 8-wide tile presumably avoids
// shared-memory bank conflicts - confirm against the .cu source.
.shared .align 4 .b8 __cuda___cuda_local_var_32571_32_non_const_block112[288];
|
||||
.loc 16 86 0
|
||||
$LDWbegin_transpose:
|
||||
// %r6 = input column = ctaid.x*8 + tid.x; %r8 = input row = ctaid.y*8 + tid.y.
mov.u32 %r1, %ctaid.x;
|
||||
mul.lo.u32 %r2, %r1, 8;
|
||||
mov.u32 %r3, %ctaid.y;
|
||||
mul.lo.u32 %r4, %r3, 8;
|
||||
mov.u32 %r5, %tid.x;
|
||||
add.u32 %r6, %r2, %r5;
|
||||
mov.u32 %r7, %tid.y;
|
||||
add.u32 %r8, %r4, %r7;
|
||||
ld.param.s32 %r9, [__cudaparm_transpose_rows_in];
|
||||
ld.param.s32 %r10, [__cudaparm_transpose_columns_in];
|
||||
// Phase 1 guard: load only when row < rows_in and column < columns_in.
set.gt.u32.u32 %r11, %r9, %r8;
|
||||
neg.s32 %r12, %r11;
|
||||
set.gt.u32.u32 %r13, %r10, %r6;
|
||||
neg.s32 %r14, %r13;
|
||||
and.b32 %r15, %r12, %r14;
|
||||
mov.u32 %r16, 0;
|
||||
setp.eq.s32 %p1, %r15, %r16;
|
||||
@%p1 bra $Lt_2_2306;
|
||||
.loc 16 98 0
|
||||
// tile[tid.y*9 + tid.x] = (f32) in[row * columns_in + column]
mov.u64 %rd1, __cuda___cuda_local_var_32571_32_non_const_block112;
|
||||
ld.param.u64 %rd2, [__cudaparm_transpose_in];
|
||||
mul.lo.u32 %r17, %r10, %r8;
|
||||
add.u32 %r18, %r6, %r17;
|
||||
cvt.u64.u32 %rd3, %r18;
|
||||
mul.wide.u32 %rd4, %r18, 4;
|
||||
add.u64 %rd5, %rd2, %rd4;
|
||||
ld.global.s32 %r19, [%rd5+0];
|
||||
cvt.rn.f32.s32 %f1, %r19;
|
||||
cvt.u64.u32 %rd6, %r5;
|
||||
cvt.u64.u32 %rd7, %r7;
|
||||
mul.wide.u32 %rd8, %r7, 9;
|
||||
add.u64 %rd9, %rd6, %rd8;
|
||||
mul.lo.u64 %rd10, %rd9, 4;
|
||||
add.u64 %rd11, %rd1, %rd10;
|
||||
st.shared.f32 [%rd11+0], %f1;
|
||||
$Lt_2_2306:
|
||||
// %rd1 is re-materialised here because the guarded path above may be skipped.
mov.u64 %rd1, __cuda___cuda_local_var_32571_32_non_const_block112;
|
||||
.loc 16 100 0
|
||||
// All threads of the block reach this barrier before any tile read.
bar.sync 0;
|
||||
// Phase 2 indices swap tid.x / tid.y: %r20 = ctaid.x*8 + tid.y (input
// column), %r21 = ctaid.y*8 + tid.x (input row).
add.u32 %r20, %r2, %r7;
|
||||
add.u32 %r21, %r4, %r5;
|
||||
set.gt.u32.u32 %r22, %r9, %r21;
|
||||
neg.s32 %r23, %r22;
|
||||
set.gt.u32.u32 %r24, %r10, %r20;
|
||||
neg.s32 %r25, %r24;
|
||||
and.b32 %r26, %r23, %r25;
|
||||
mov.u32 %r27, 0;
|
||||
setp.eq.s32 %p2, %r26, %r27;
|
||||
@%p2 bra $Lt_2_2818;
|
||||
.loc 16 105 0
|
||||
// out[%r20 * rows_in + %r21] = (s32) tile[tid.x*9 + tid.y]
cvt.u64.u32 %rd12, %r7;
|
||||
cvt.u64.u32 %rd13, %r5;
|
||||
mul.wide.u32 %rd14, %r5, 9;
|
||||
add.u64 %rd15, %rd12, %rd14;
|
||||
mul.lo.u64 %rd16, %rd15, 4;
|
||||
add.u64 %rd17, %rd1, %rd16;
|
||||
ld.shared.f32 %f2, [%rd17+0];
|
||||
cvt.rzi.ftz.s32.f32 %r28, %f2;
|
||||
ld.param.u64 %rd18, [__cudaparm_transpose_out];
|
||||
mul.lo.u32 %r29, %r9, %r20;
|
||||
add.u32 %r30, %r21, %r29;
|
||||
cvt.u64.u32 %rd19, %r30;
|
||||
mul.wide.u32 %rd20, %r30, 4;
|
||||
add.u64 %rd21, %rd18, %rd20;
|
||||
st.global.s32 [%rd21+0], %r28;
|
||||
$Lt_2_2818:
|
||||
.loc 16 106 0
|
||||
exit;
|
||||
$LDWend_transpose:
|
||||
} // transpose
|
||||
|
||||
.entry calc_neigh_list_cell (
|
||||
.param .u64 __cudaparm_calc_neigh_list_cell_x_,
|
||||
.param .u64 __cudaparm_calc_neigh_list_cell_cell_particle_id,
|
||||
.param .u64 __cudaparm_calc_neigh_list_cell_cell_counts,
|
||||
.param .u64 __cudaparm_calc_neigh_list_cell_nbor_list,
|
||||
.param .u64 __cudaparm_calc_neigh_list_cell_host_nbor_list,
|
||||
.param .u64 __cudaparm_calc_neigh_list_cell_host_numj,
|
||||
.param .s32 __cudaparm_calc_neigh_list_cell_neigh_bin_size,
|
||||
.param .f32 __cudaparm_calc_neigh_list_cell_cell_size,
|
||||
.param .s32 __cudaparm_calc_neigh_list_cell_ncellx,
|
||||
.param .s32 __cudaparm_calc_neigh_list_cell_ncelly,
|
||||
.param .s32 __cudaparm_calc_neigh_list_cell_ncellz,
|
||||
.param .s32 __cudaparm_calc_neigh_list_cell_inum,
|
||||
.param .s32 __cudaparm_calc_neigh_list_cell_nt,
|
||||
.param .s32 __cudaparm_calc_neigh_list_cell_nall,
|
||||
.param .s32 __cudaparm_calc_neigh_list_cell_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<118>;
|
||||
.reg .u64 %rd<52>;
|
||||
.reg .f32 %f<41>;
|
||||
.reg .f64 %fd<4>;
|
||||
.reg .pred %p<23>;
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32609_34_non_const_pos_sh496[2048];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32608_31_non_const_cell_list_sh2544[512];
|
||||
// __cuda_local_var_32624_12_non_const_atom_i = 16
|
||||
.loc 16 116 0
|
||||
$LDWbegin_calc_neigh_list_cell:
|
||||
.loc 16 128 0
|
||||
ld.param.s32 %r1, [__cudaparm_calc_neigh_list_cell_ncelly];
|
||||
mov.u32 %r2, %ctaid.y;
|
||||
rem.u32 %r3, %r2, %r1;
|
||||
div.u32 %r4, %r2, %r1;
|
||||
ld.param.s32 %r5, [__cudaparm_calc_neigh_list_cell_ncellx];
|
||||
mul.lo.s32 %r6, %r5, %r3;
|
||||
mul.lo.s32 %r7, %r5, %r4;
|
||||
mul.lo.s32 %r8, %r7, %r1;
|
||||
cvt.s32.u32 %r9, %ctaid.x;
|
||||
ld.param.u64 %rd1, [__cudaparm_calc_neigh_list_cell_cell_counts];
|
||||
add.s32 %r10, %r6, %r8;
|
||||
add.s32 %r11, %r9, %r10;
|
||||
cvt.s64.s32 %rd2, %r11;
|
||||
mul.wide.s32 %rd3, %r11, 4;
|
||||
add.u64 %rd4, %rd1, %rd3;
|
||||
ldu.global.s32 %r12, [%rd4+0];
|
||||
.loc 16 129 0
|
||||
ldu.global.s32 %r13, [%rd4+4];
|
||||
.loc 16 137 0
|
||||
sub.s32 %r14, %r13, %r12;
|
||||
mov.u32 %r15, %ntid.x;
|
||||
cvt.rn.f32.u32 %f1, %r15;
|
||||
cvt.rn.f32.s32 %f2, %r14;
|
||||
div.approx.ftz.f32 %f3, %f2, %f1;
|
||||
cvt.rpi.ftz.f32.f32 %f4, %f3;
|
||||
cvt.rzi.ftz.s32.f32 %r16, %f4;
|
||||
mov.u32 %r17, 0;
|
||||
setp.le.s32 %p1, %r16, %r17;
|
||||
@%p1 bra $Lt_3_14082;
|
||||
sub.s32 %r18, %r3, 1;
|
||||
mov.s32 %r19, 0;
|
||||
max.s32 %r20, %r18, %r19;
|
||||
sub.s32 %r21, %r1, 1;
|
||||
add.s32 %r22, %r3, 1;
|
||||
min.s32 %r23, %r21, %r22;
|
||||
ld.param.s32 %r24, [__cudaparm_calc_neigh_list_cell_ncellz];
|
||||
sub.s32 %r25, %r24, 1;
|
||||
add.s32 %r26, %r4, 1;
|
||||
min.s32 %r27, %r25, %r26;
|
||||
sub.s32 %r28, %r9, 1;
|
||||
mov.s32 %r29, 0;
|
||||
max.s32 %r30, %r28, %r29;
|
||||
add.s32 %r31, %r9, 1;
|
||||
sub.s32 %r32, %r5, 1;
|
||||
min.s32 %r33, %r31, %r32;
|
||||
mov.s32 %r34, %r16;
|
||||
cvt.s32.u32 %r35, %tid.x;
|
||||
add.s32 %r36, %r12, %r35;
|
||||
mov.u32 %r37, 0;
|
||||
ld.param.s32 %r38, [__cudaparm_calc_neigh_list_cell_inum];
|
||||
cvt.s64.s32 %rd5, %r38;
|
||||
sub.s32 %r39, %r4, 1;
|
||||
mov.s32 %r40, %r36;
|
||||
mov.s32 %r41, 0;
|
||||
max.s32 %r42, %r39, %r41;
|
||||
setp.ge.s32 %p2, %r27, %r42;
|
||||
ld.param.s32 %r43, [__cudaparm_calc_neigh_list_cell_nt];
|
||||
ld.param.s32 %r44, [__cudaparm_calc_neigh_list_cell_nall];
|
||||
mov.s32 %r45, 0;
|
||||
mov.u64 %rd6, __cuda___cuda_local_var_32609_34_non_const_pos_sh496;
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32608_31_non_const_cell_list_sh2544;
|
||||
mov.s32 %r46, %r34;
|
||||
$Lt_3_14594:
|
||||
//<loop> Loop body line 137, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 140 0
|
||||
mov.s32 %r47, %r44;
|
||||
setp.ge.s32 %p3, %r40, %r13;
|
||||
@%p3 bra $Lt_3_14850;
|
||||
.loc 16 146 0
|
||||
ld.param.u64 %rd8, [__cudaparm_calc_neigh_list_cell_cell_particle_id];
|
||||
add.u32 %r48, %r36, %r37;
|
||||
cvt.s64.s32 %rd9, %r48;
|
||||
mul.wide.s32 %rd10, %r48, 4;
|
||||
add.u64 %rd11, %rd8, %rd10;
|
||||
ld.global.s32 %r47, [%rd11+0];
|
||||
$Lt_3_14850:
|
||||
setp.lt.s32 %p4, %r47, %r43;
|
||||
@!%p4 bra $Lt_3_15362;
|
||||
.loc 16 149 0
|
||||
mov.u32 %r49, %r47;
|
||||
mov.s32 %r50, 0;
|
||||
mov.u32 %r51, %r50;
|
||||
mov.s32 %r52, 0;
|
||||
mov.u32 %r53, %r52;
|
||||
mov.s32 %r54, 0;
|
||||
mov.u32 %r55, %r54;
|
||||
tex.1d.v4.f32.s32 {%f5,%f6,%f7,%f8},[neigh_tex,{%r49,%r51,%r53,%r55}];
|
||||
mov.f32 %f9, %f5;
|
||||
mov.f32 %f10, %f6;
|
||||
mov.f32 %f11, %f7;
|
||||
mov.f32 %f12, %f9;
|
||||
mov.f32 %f13, %f10;
|
||||
mov.f32 %f14, %f11;
|
||||
$Lt_3_15362:
|
||||
cvt.s64.s32 %rd12, %r47;
|
||||
mul.wide.s32 %rd13, %r47, 4;
|
||||
setp.ge.s32 %p5, %r47, %r38;
|
||||
@%p5 bra $Lt_3_16130;
|
||||
.loc 16 153 0
|
||||
ld.param.u64 %rd14, [__cudaparm_calc_neigh_list_cell_nbor_list];
|
||||
add.u64 %rd15, %rd12, %rd5;
|
||||
mul.lo.u64 %rd16, %rd15, 4;
|
||||
add.u64 %rd17, %rd14, %rd16;
|
||||
mov.s64 %rd18, %rd17;
|
||||
.loc 16 154 0
|
||||
ld.param.s32 %r56, [__cudaparm_calc_neigh_list_cell_t_per_atom];
|
||||
sub.s32 %r57, %r56, 1;
|
||||
mul.lo.s32 %r58, %r47, %r57;
|
||||
cvt.s64.s32 %rd19, %r58;
|
||||
add.u64 %rd20, %rd19, %rd5;
|
||||
mul.lo.u64 %rd21, %rd20, 4;
|
||||
add.u64 %rd22, %rd17, %rd21;
|
||||
.loc 16 155 0
|
||||
mul.lo.s32 %r59, %r56, %r38;
|
||||
sub.s32 %r60, %r59, %r56;
|
||||
.loc 16 156 0
|
||||
add.u64 %rd23, %rd13, %rd14;
|
||||
st.global.s32 [%rd23+0], %r47;
|
||||
bra.uni $Lt_3_15874;
|
||||
$Lt_3_16130:
|
||||
.loc 16 159 0
|
||||
ld.param.u64 %rd24, [__cudaparm_calc_neigh_list_cell_host_numj];
|
||||
add.u64 %rd25, %rd24, %rd13;
|
||||
mul.lo.u64 %rd26, %rd5, 4;
|
||||
sub.u64 %rd18, %rd25, %rd26;
|
||||
.loc 16 160 0
|
||||
ld.param.u64 %rd27, [__cudaparm_calc_neigh_list_cell_host_nbor_list];
|
||||
ld.param.s32 %r61, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];
|
||||
sub.s32 %r62, %r47, %r38;
|
||||
mul.lo.s32 %r63, %r61, %r62;
|
||||
cvt.s64.s32 %rd28, %r63;
|
||||
mul.wide.s32 %rd29, %r63, 4;
|
||||
add.u64 %rd22, %rd27, %rd29;
|
||||
mov.s32 %r60, 0;
|
||||
$Lt_3_15874:
|
||||
.loc 16 165 0
|
||||
mov.s32 %r64, %r42;
|
||||
@!%p2 bra $Lt_3_24066;
|
||||
sub.s32 %r65, %r27, %r42;
|
||||
add.s32 %r66, %r65, 1;
|
||||
setp.le.s32 %p6, %r20, %r23;
|
||||
add.s32 %r67, %r27, 1;
|
||||
mov.s32 %r68, 0;
|
||||
mov.s32 %r69, %r66;
|
||||
$Lt_3_16898:
|
||||
//<loop> Loop body line 165, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 16 166 0
|
||||
mov.s32 %r70, %r20;
|
||||
@!%p6 bra $Lt_3_17154;
|
||||
sub.s32 %r71, %r23, %r20;
|
||||
add.s32 %r72, %r71, 1;
|
||||
setp.ge.s32 %p7, %r33, %r30;
|
||||
add.s32 %r73, %r23, 1;
|
||||
mov.s32 %r74, %r72;
|
||||
$Lt_3_17666:
|
||||
//<loop> Loop body line 166, nesting depth: 3, estimated iterations: unknown
|
||||
@!%p7 bra $Lt_3_17922;
|
||||
sub.s32 %r75, %r33, %r30;
|
||||
add.s32 %r76, %r75, 1;
|
||||
mul.lo.s32 %r77, %r70, %r5;
|
||||
mul.lo.s32 %r78, %r64, %r5;
|
||||
mul.lo.s32 %r79, %r78, %r1;
|
||||
add.s32 %r80, %r33, 1;
|
||||
add.s32 %r81, %r77, %r79;
|
||||
add.s32 %r82, %r81, %r30;
|
||||
add.s32 %r83, %r80, %r81;
|
||||
cvt.s64.s32 %rd30, %r82;
|
||||
mul.wide.s32 %rd31, %r82, 4;
|
||||
add.u64 %rd32, %rd1, %rd31;
|
||||
mov.s32 %r84, %r76;
|
||||
$Lt_3_18434:
|
||||
//<loop> Loop body line 166, nesting depth: 4, estimated iterations: unknown
|
||||
.loc 16 171 0
|
||||
ld.global.s32 %r85, [%rd32+0];
|
||||
.loc 16 172 0
|
||||
ld.global.s32 %r86, [%rd32+4];
|
||||
.loc 16 176 0
|
||||
sub.s32 %r87, %r86, %r85;
|
||||
cvt.rn.f32.s32 %f15, %r87;
|
||||
mov.f32 %f16, 0f43000000; // 128
|
||||
div.approx.ftz.f32 %f17, %f15, %f16;
|
||||
cvt.rpi.ftz.f32.f32 %f18, %f17;
|
||||
cvt.rzi.ftz.s32.f32 %r88, %f18;
|
||||
mov.u32 %r89, 0;
|
||||
setp.le.s32 %p8, %r88, %r89;
|
||||
@%p8 bra $Lt_3_18690;
|
||||
mov.s32 %r90, %r88;
|
||||
mov.s32 %r91, 0;
|
||||
setp.lt.s32 %p9, %r47, %r43;
|
||||
mul.lo.s32 %r92, %r88, 128;
|
||||
mov.s32 %r93, %r90;
|
||||
$Lt_3_19202:
|
||||
//<loop> Loop body line 176, nesting depth: 5, estimated iterations: unknown
|
||||
sub.s32 %r94, %r87, %r91;
|
||||
mov.s32 %r95, 128;
|
||||
min.s32 %r96, %r94, %r95;
|
||||
setp.le.s32 %p10, %r96, %r35;
|
||||
@%p10 bra $Lt_3_19458;
|
||||
.loc 16 183 0
|
||||
ld.param.u64 %rd33, [__cudaparm_calc_neigh_list_cell_cell_particle_id];
|
||||
add.s32 %r97, %r91, %r35;
|
||||
add.s32 %r98, %r85, %r97;
|
||||
cvt.s64.s32 %rd34, %r98;
|
||||
mul.wide.s32 %rd35, %r98, 4;
|
||||
add.u64 %rd36, %rd33, %rd35;
|
||||
ld.global.s32 %r99, [%rd36+0];
|
||||
.loc 16 184 0
|
||||
cvt.s64.s32 %rd37, %r35;
|
||||
mul.wide.s32 %rd38, %r35, 4;
|
||||
add.u64 %rd39, %rd7, %rd38;
|
||||
st.shared.s32 [%rd39+0], %r99;
|
||||
.loc 16 185 0
|
||||
mov.u32 %r100, %r99;
|
||||
mov.s32 %r101, 0;
|
||||
mov.u32 %r102, %r101;
|
||||
mov.s32 %r103, 0;
|
||||
mov.u32 %r104, %r103;
|
||||
mov.s32 %r105, 0;
|
||||
mov.u32 %r106, %r105;
|
||||
tex.1d.v4.f32.s32 {%f19,%f20,%f21,%f22},[neigh_tex,{%r100,%r102,%r104,%r106}];
|
||||
mov.f32 %f23, %f19;
|
||||
mov.f32 %f24, %f20;
|
||||
mov.f32 %f25, %f21;
|
||||
.loc 16 186 0
|
||||
mul.lo.u64 %rd40, %rd37, 16;
|
||||
add.u64 %rd41, %rd6, %rd40;
|
||||
st.shared.v2.f32 [%rd41+0], {%f23,%f24};
|
||||
.loc 16 188 0
|
||||
st.shared.f32 [%rd41+8], %f25;
|
||||
$Lt_3_19458:
|
||||
.loc 16 190 0
|
||||
bar.sync 0;
|
||||
@!%p9 bra $Lt_3_20482;
|
||||
mov.u32 %r107, 0;
|
||||
setp.le.s32 %p11, %r96, %r107;
|
||||
@%p11 bra $Lt_3_20482;
|
||||
mov.s32 %r108, %r96;
|
||||
mov.s64 %rd42, 0;
|
||||
ld.param.f32 %f26, [__cudaparm_calc_neigh_list_cell_cell_size];
|
||||
mul.ftz.f32 %f27, %f26, %f26;
|
||||
mov.s64 %rd43, %rd6;
|
||||
mov.f32 %f28, %f14;
|
||||
mov.f32 %f29, %f13;
|
||||
mov.f32 %f30, %f12;
|
||||
mov.s32 %r109, 0;
|
||||
mov.s32 %r110, %r108;
|
||||
$Lt_3_20994:
|
||||
//<loop> Loop body line 190, nesting depth: 6, estimated iterations: unknown
|
||||
ld.shared.v4.f32 {%f31,%f32,%f33,_}, [%rd43+0];
|
||||
.loc 16 196 0
|
||||
sub.ftz.f32 %f34, %f30, %f31;
|
||||
.loc 16 197 0
|
||||
sub.ftz.f32 %f35, %f29, %f32;
|
||||
.loc 16 198 0
|
||||
sub.ftz.f32 %f36, %f28, %f33;
|
||||
.loc 16 195 0
|
||||
mul.ftz.f32 %f37, %f35, %f35;
|
||||
fma.rn.ftz.f32 %f38, %f34, %f34, %f37;
|
||||
fma.rn.ftz.f32 %f39, %f36, %f36, %f38;
|
||||
setp.gt.ftz.f32 %p12, %f27, %f39;
|
||||
@!%p12 bra $Lt_3_25346;
|
||||
cvt.ftz.f64.f32 %fd1, %f39;
|
||||
mov.f64 %fd2, 0d3ee4f8b588e368f1; // 1e-05
|
||||
setp.gt.f64 %p13, %fd1, %fd2;
|
||||
@!%p13 bra $Lt_3_25346;
|
||||
.loc 16 202 0
|
||||
add.s32 %r68, %r68, 1;
|
||||
ld.param.s32 %r111, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];
|
||||
setp.lt.s32 %p14, %r111, %r68;
|
||||
@%p14 bra $Lt_3_25346;
|
||||
.loc 16 204 0
|
||||
mul.lo.u64 %rd44, %rd42, 4;
|
||||
add.u64 %rd45, %rd7, %rd44;
|
||||
ld.shared.s32 %r112, [%rd45+0];
|
||||
st.global.s32 [%rd22+0], %r112;
|
||||
cvt.s64.s32 %rd46, %r60;
|
||||
mul.wide.s32 %rd47, %r60, 4;
|
||||
add.u64 %rd48, %rd22, %rd47;
|
||||
add.u64 %rd49, %rd48, 4;
|
||||
add.u64 %rd50, %rd22, 4;
|
||||
ld.param.s32 %r113, [__cudaparm_calc_neigh_list_cell_t_per_atom];
|
||||
sub.s32 %r114, %r113, 1;
|
||||
and.b32 %r115, %r68, %r114;
|
||||
mov.s32 %r116, 0;
|
||||
setp.eq.s32 %p15, %r115, %r116;
|
||||
selp.u64 %rd22, %rd49, %rd50, %p15;
|
||||
$Lt_3_25346:
|
||||
$L_3_13570:
|
||||
.loc 16 202 0
|
||||
add.s32 %r109, %r109, 1;
|
||||
add.s64 %rd42, %rd42, 1;
|
||||
add.u64 %rd43, %rd43, 16;
|
||||
setp.ne.s32 %p16, %r96, %r109;
|
||||
@%p16 bra $Lt_3_20994;
|
||||
$Lt_3_20482:
|
||||
$Lt_3_19970:
|
||||
.loc 16 212 0
|
||||
bar.sync 0;
|
||||
add.s32 %r91, %r91, 128;
|
||||
setp.ne.s32 %p17, %r91, %r92;
|
||||
@%p17 bra $Lt_3_19202;
|
||||
$Lt_3_18690:
|
||||
add.s32 %r82, %r82, 1;
|
||||
add.u64 %rd32, %rd32, 4;
|
||||
setp.ne.s32 %p18, %r82, %r83;
|
||||
@%p18 bra $Lt_3_18434;
|
||||
$Lt_3_17922:
|
||||
add.s32 %r70, %r70, 1;
|
||||
setp.ne.s32 %p19, %r73, %r70;
|
||||
@%p19 bra $Lt_3_17666;
|
||||
$Lt_3_17154:
|
||||
add.s32 %r64, %r64, 1;
|
||||
setp.ne.s32 %p20, %r67, %r64;
|
||||
@%p20 bra $Lt_3_16898;
|
||||
bra.uni $Lt_3_16386;
|
||||
$Lt_3_24066:
|
||||
mov.s32 %r68, 0;
|
||||
$Lt_3_16386:
|
||||
@!%p4 bra $Lt_3_23042;
|
||||
.loc 16 218 0
|
||||
st.global.s32 [%rd18+0], %r68;
|
||||
$Lt_3_23042:
|
||||
add.s32 %r45, %r45, 1;
|
||||
add.u32 %r37, %r37, %r15;
|
||||
add.s32 %r40, %r40, %r15;
|
||||
setp.ne.s32 %p21, %r16, %r45;
|
||||
@%p21 bra $Lt_3_14594;
|
||||
$Lt_3_14082:
|
||||
.loc 16 220 0
|
||||
exit;
|
||||
$LDWend_calc_neigh_list_cell:
|
||||
} // calc_neigh_list_cell
|
||||
|
||||
// kernel_special -- PTX entry point (the .loc directives and $Lt_* labels suggest
// this is compiler output rather than hand-written code).
//
// What the instructions below do:
//   i = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom); threads with i >= nt exit.
//   Three 32-bit counts are loaded from nspecial[3*i], nspecial[3*i+1], nspecial[3*i+2].
//   A [start, end) pointer range and an element stride are chosen:
//     i <  inum : range lies inside dev_nbor, stride = inum * t_per_atom elements,
//                 start is offset by (tid.x & (t_per_atom-1)) elements;
//     i >= inum : range lies inside host_nbor_list at (i-inum)*max_nbors,
//                 length host_numj[i-inum], stride = 1 element.
//   For every 32-bit entry e in the range, tag[e] is compared against the values
//   special[i + k*nt] for k = 0 .. nspecial[3*i+2]-1. On each match the entry is
//   XORed with (1 + (k >= nspecial[3*i]) + (k >= nspecial[3*i+1])) << 30 and
//   stored back in place.
//
// Parameters (as used here):
//   dev_nbor, host_nbor_list, host_numj, tag, nspecial, special : 64-bit global
//       addresses, all accessed as arrays of 32-bit signed integers.
//   inum, nt, max_nbors, t_per_atom : signed 32-bit scalars.
//
// NOTE(review): the names suggest neighbor lists and "special" (bonded) pairs,
// but that meaning is not visible in this file -- confirm against the original
// CUDA source referenced by ".loc 16".
.entry kernel_special (
|
||||
.param .u64 __cudaparm_kernel_special_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_special_host_nbor_list,
|
||||
.param .u64 __cudaparm_kernel_special_host_numj,
|
||||
.param .u64 __cudaparm_kernel_special_tag,
|
||||
.param .u64 __cudaparm_kernel_special_nspecial,
|
||||
.param .u64 __cudaparm_kernel_special_special,
|
||||
.param .s32 __cudaparm_kernel_special_inum,
|
||||
.param .s32 __cudaparm_kernel_special_nt,
|
||||
.param .s32 __cudaparm_kernel_special_max_nbors,
|
||||
.param .s32 __cudaparm_kernel_special_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<45>;
|
||||
.reg .u64 %rd<45>;
|
||||
.reg .pred %p<11>;
|
||||
.loc 16 226 0
|
||||
$LDWbegin_kernel_special:
|
||||
ld.param.s32 %r1, [__cudaparm_kernel_special_t_per_atom];  // %r1 = t_per_atom
|
||||
cvt.s32.u32 %r2, %tid.x;  // %r2 = tid.x
|
||||
div.s32 %r3, %r2, %r1;  // %r3 = tid.x / t_per_atom
|
||||
cvt.s32.u32 %r4, %ntid.x;
|
||||
div.s32 %r5, %r4, %r1;  // %r5 = ntid.x / t_per_atom
|
||||
cvt.s32.u32 %r6, %ctaid.x;
|
||||
mul.lo.s32 %r7, %r6, %r5;
|
||||
add.s32 %r8, %r3, %r7;  // %r8 = i = tid.x/t_per_atom + ctaid.x*(ntid.x/t_per_atom)
|
||||
ld.param.s32 %r9, [__cudaparm_kernel_special_nt];  // %r9 = nt
|
||||
setp.ge.s32 %p1, %r8, %r9;
|
||||
@%p1 bra $Lt_4_6146;  // i >= nt: nothing to do, go straight to exit
|
||||
.loc 16 236 0
|
||||
ld.param.u64 %rd1, [__cudaparm_kernel_special_nspecial];
|
||||
mul.lo.s32 %r10, %r8, 3;  // three 32-bit counts per index i
|
||||
cvt.s64.s32 %rd2, %r10;
|
||||
mul.wide.s32 %rd3, %r10, 4;
|
||||
add.u64 %rd4, %rd1, %rd3;  // %rd4 = &nspecial[3*i]
|
||||
ld.global.s32 %r11, [%rd4+0];  // %r11 = nspecial[3*i]
|
||||
.loc 16 237 0
|
||||
ld.global.s32 %r12, [%rd4+4];  // %r12 = nspecial[3*i+1]
|
||||
.loc 16 238 0
|
||||
ld.global.s32 %r13, [%rd4+8];  // %r13 = nspecial[3*i+2], inner-loop trip count
|
||||
ld.param.s32 %r14, [__cudaparm_kernel_special_inum];  // %r14 = inum
|
||||
setp.ge.s32 %p2, %r8, %r14;
|
||||
@%p2 bra $Lt_4_6914;  // i >= inum: use the host_nbor_list / host_numj path
|
||||
.loc 16 244 0
|
||||
ld.param.u64 %rd5, [__cudaparm_kernel_special_dev_nbor];
|
||||
cvt.s64.s32 %rd6, %r8;
|
||||
cvt.s64.s32 %rd7, %r14;
|
||||
add.u64 %rd8, %rd6, %rd7;
|
||||
mul.lo.u64 %rd9, %rd8, 4;
|
||||
add.u64 %rd10, %rd5, %rd9;  // %rd10 = &dev_nbor[i + inum]
|
||||
ld.global.s32 %r15, [%rd10+0];  // %r15 = dev_nbor[i + inum], the entry count for index i
|
||||
.loc 16 246 0
|
||||
mul.lo.s32 %r16, %r14, %r1;  // %r16 = inum * t_per_atom
|
||||
mov.s32 %r17, %r16;  // %r17 = element stride used by the outer loop on this path
|
||||
.loc 16 248 0
|
||||
sub.s32 %r18, %r1, 1;  // %r18 = t_per_atom - 1 (used as a bit mask below)
|
||||
mul.lo.s32 %r19, %r18, %r8;
|
||||
add.s32 %r20, %r14, %r19;  // %r20 = inum + i*(t_per_atom-1)
|
||||
cvt.s64.s32 %rd11, %r20;
|
||||
mul.wide.s32 %rd12, %r20, 4;
|
||||
add.u64 %rd13, %rd10, %rd12;  // %rd13 = base pointer of the list for index i
|
||||
and.b32 %r21, %r18, %r15;  // count & (t_per_atom-1); a remainder only if t_per_atom is a power of two -- NOTE(review): confirm
|
||||
cvt.s64.s32 %rd14, %r21;
|
||||
div.s32 %r22, %r15, %r1;  // %r22 = count / t_per_atom
|
||||
mul.lo.s32 %r23, %r16, %r22;  // (count / t_per_atom) * stride
|
||||
cvt.s64.s32 %rd15, %r23;
|
||||
add.u64 %rd16, %rd14, %rd15;
|
||||
mul.lo.u64 %rd17, %rd16, 4;
|
||||
add.u64 %rd18, %rd13, %rd17;  // %rd18 = end pointer (exclusive bound tested by the outer loop)
|
||||
.loc 16 249 0
|
||||
and.b32 %r24, %r18, %r2;  // tid.x & (t_per_atom-1): this thread's offset from the base
|
||||
cvt.s64.s32 %rd19, %r24;
|
||||
mul.wide.s32 %rd20, %r24, 4;
|
||||
add.u64 %rd21, %rd13, %rd20;  // %rd21 = current-entry pointer for this thread
|
||||
bra.uni $Lt_4_6658;
|
||||
$Lt_4_6914:
|
||||
.loc 16 252 0
|
||||
sub.s32 %r25, %r8, %r14;  // %r25 = i - inum
|
||||
ld.param.u64 %rd22, [__cudaparm_kernel_special_host_nbor_list];
|
||||
ld.param.s32 %r26, [__cudaparm_kernel_special_max_nbors];
|
||||
mul.lo.s32 %r27, %r26, %r25;  // max_nbors * (i - inum), a 32-bit product
|
||||
cvt.s64.s32 %rd23, %r27;
|
||||
mul.wide.s32 %rd24, %r27, 4;
|
||||
add.u64 %rd25, %rd22, %rd24;  // %rd25 = &host_nbor_list[(i-inum)*max_nbors]
|
||||
mov.s64 %rd21, %rd25;  // current-entry pointer starts at the beginning of that row
|
||||
.loc 16 254 0
|
||||
ld.param.u64 %rd26, [__cudaparm_kernel_special_host_numj];
|
||||
cvt.s64.s32 %rd27, %r25;
|
||||
mul.wide.s32 %rd28, %r25, 4;
|
||||
add.u64 %rd29, %rd26, %rd28;
|
||||
ld.global.s32 %r28, [%rd29+0];  // %r28 = host_numj[i - inum]
|
||||
cvt.s64.s32 %rd30, %r28;
|
||||
mul.wide.s32 %rd31, %r28, 4;
|
||||
add.u64 %rd18, %rd25, %rd31;  // %rd18 = end pointer = start + host_numj[i-inum] entries
|
||||
mov.s32 %r17, 1;  // element stride is 1 on this path
|
||||
$Lt_4_6658:
|
||||
setp.ge.u64 %p3, %rd21, %rd18;
|
||||
@%p3 bra $Lt_4_7170;  // start is already at or past end: skip both loops
|
||||
mov.s32 %r29, 0;
|
||||
setp.gt.s32 %p4, %r13, %r29;  // %p4 = (nspecial[3*i+2] > 0), loop-invariant guard for the inner loop
|
||||
cvt.s64.s32 %rd32, %r17;  // %rd32 = stride in elements, widened to 64 bits
|
||||
ld.param.u64 %rd33, [__cudaparm_kernel_special_tag];
|
||||
$Lt_4_7682:
|
||||
//<loop> Loop body line 254, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 258 0
|
||||
ld.global.s32 %r30, [%rd21+0];  // %r30 = current list entry
|
||||
.loc 16 259 0
|
||||
cvt.s64.s32 %rd34, %r30;
|
||||
mul.wide.s32 %rd35, %r30, 4;
|
||||
add.u64 %rd36, %rd33, %rd35;
|
||||
ld.global.s32 %r31, [%rd36+0];  // %r31 = tag[entry]; the entry is used as an index exactly as loaded
|
||||
@!%p4 bra $Lt_4_7938;  // no values to compare against: advance to the next entry
|
||||
mov.s32 %r32, %r13;
|
||||
cvt.s64.s32 %rd37, %r8;  // %rd37 is not read again in this kernel
|
||||
cvt.s64.s32 %rd38, %r9;  // %rd38 is not read again in this kernel
|
||||
mul.wide.s32 %rd39, %r9, 4;  // %rd39 = nt * 4 bytes, the step between compared values
|
||||
ld.param.u64 %rd40, [__cudaparm_kernel_special_special];
|
||||
mul.wide.s32 %rd41, %r8, 4;
|
||||
add.u64 %rd42, %rd40, %rd41;  // %rd42 = &special[i]
|
||||
mov.s32 %r33, 0;  // %r33 = k, inner-loop counter
|
||||
mov.s32 %r34, %r32;
|
||||
$Lt_4_8450:
|
||||
//<loop> Loop body line 259, nesting depth: 1, estimated iterations: unknown
|
||||
ld.global.s32 %r35, [%rd42+0];  // %r35 = special[i + k*nt]
|
||||
setp.ne.s32 %p5, %r35, %r31;
|
||||
@%p5 bra $Lt_4_8706;  // no match with tag[entry]: go to the next k
|
||||
.loc 16 269 0
|
||||
setp.le.s32 %p6, %r11, %r33;  // %p6 = (k >= nspecial[3*i])
|
||||
mov.s32 %r36, 3;
|
||||
mov.s32 %r37, 2;
|
||||
selp.s32 %r38, %r36, %r37, %p6;  // candidate if %p7 holds: 3 when %p6, else 2
|
||||
mov.s32 %r39, 2;
|
||||
mov.s32 %r40, 1;
|
||||
selp.s32 %r41, %r39, %r40, %p6;  // candidate if %p7 fails: 2 when %p6, else 1
|
||||
setp.le.s32 %p7, %r12, %r33;  // %p7 = (k >= nspecial[3*i+1])
|
||||
selp.s32 %r42, %r38, %r41, %p7;  // %r42 = 1 + %p6 + %p7, a code in 1..3
|
||||
shl.b32 %r43, %r42, 30;  // move the code into bits 30-31 of the word
|
||||
xor.b32 %r30, %r30, %r43;  // %r30 is updated in place, so a later match in this loop XORs the already-modified value
|
||||
.loc 16 270 0
|
||||
st.global.s32 [%rd21+0], %r30;  // write the modified entry back to the list
|
||||
$Lt_4_8706:
|
||||
add.s32 %r33, %r33, 1;  // k++
|
||||
add.u64 %rd42, %rd39, %rd42;  // advance by nt elements within special
|
||||
setp.ne.s32 %p8, %r13, %r33;
|
||||
@%p8 bra $Lt_4_8450;  // repeat until k == nspecial[3*i+2]
|
||||
$Lt_4_7938:
|
||||
.loc 16 257 0
|
||||
mul.lo.u64 %rd43, %rd32, 4;  // stride in bytes, recomputed on every outer iteration
|
||||
add.u64 %rd21, %rd21, %rd43;  // step to this thread's next list entry
|
||||
setp.lt.u64 %p9, %rd21, %rd18;
|
||||
@%p9 bra $Lt_4_7682;  // continue while the entry pointer is below the end pointer
|
||||
$Lt_4_7170:
|
||||
$Lt_4_6146:
|
||||
.loc 16 276 0
|
||||
exit;
|
||||
$LDWend_kernel_special:
|
||||
} // kernel_special
|
||||
|
||||
@ -1,809 +0,0 @@
|
||||
const char * neighbor_gpu =
|
||||
" .version 2.3\n"
|
||||
" .target sm_20\n"
|
||||
" .address_size 64\n"
|
||||
" .global .texref neigh_tex;\n"
|
||||
" .entry calc_cell_id (\n"
|
||||
" .param .u64 __cudaparm_calc_cell_id_pos,\n"
|
||||
" .param .u64 __cudaparm_calc_cell_id_cell_id,\n"
|
||||
" .param .u64 __cudaparm_calc_cell_id_particle_id,\n"
|
||||
" .param .f32 __cudaparm_calc_cell_id_boxlo0,\n"
|
||||
" .param .f32 __cudaparm_calc_cell_id_boxlo1,\n"
|
||||
" .param .f32 __cudaparm_calc_cell_id_boxlo2,\n"
|
||||
" .param .f32 __cudaparm_calc_cell_id_boxhi0,\n"
|
||||
" .param .f32 __cudaparm_calc_cell_id_boxhi1,\n"
|
||||
" .param .f32 __cudaparm_calc_cell_id_boxhi2,\n"
|
||||
" .param .f32 __cudaparm_calc_cell_id_cell_size,\n"
|
||||
" .param .s32 __cudaparm_calc_cell_id_ncellx,\n"
|
||||
" .param .s32 __cudaparm_calc_cell_id_ncelly,\n"
|
||||
" .param .s32 __cudaparm_calc_cell_id_nall)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<25>;\n"
|
||||
" .reg .u64 %rd<8>;\n"
|
||||
" .reg .f32 %f<35>;\n"
|
||||
" .reg .f64 %fd<11>;\n"
|
||||
" .reg .pred %p<3>;\n"
|
||||
" .loc 16 29 0\n"
|
||||
"$LDWbegin_calc_cell_id:\n"
|
||||
" mov.u32 %r1, %tid.x;\n"
|
||||
" mov.u32 %r2, %ctaid.x;\n"
|
||||
" mov.u32 %r3, %ntid.x;\n"
|
||||
" mul.lo.u32 %r4, %r2, %r3;\n"
|
||||
" add.u32 %r5, %r1, %r4;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_calc_cell_id_nall];\n"
|
||||
" setp.le.s32 %p1, %r6, %r5;\n"
|
||||
" @%p1 bra $Lt_0_1026;\n"
|
||||
" .loc 16 33 0\n"
|
||||
" mov.u32 %r7, %r5;\n"
|
||||
" mov.s32 %r8, 0;\n"
|
||||
" mov.u32 %r9, %r8;\n"
|
||||
" mov.s32 %r10, 0;\n"
|
||||
" mov.u32 %r11, %r10;\n"
|
||||
" mov.s32 %r12, 0;\n"
|
||||
" mov.u32 %r13, %r12;\n"
|
||||
" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[neigh_tex,{%r7,%r9,%r11,%r13}];\n"
|
||||
" mov.f32 %f5, %f1;\n"
|
||||
" mov.f32 %f6, %f2;\n"
|
||||
" mov.f32 %f7, %f3;\n"
|
||||
" .loc 16 46 0\n"
|
||||
" ld.param.f32 %f8, [__cudaparm_calc_cell_id_cell_size];\n"
|
||||
" neg.ftz.f32 %f9, %f8;\n"
|
||||
" ld.param.f32 %f10, [__cudaparm_calc_cell_id_boxlo0];\n"
|
||||
" ld.param.f32 %f11, [__cudaparm_calc_cell_id_boxlo2];\n"
|
||||
" ld.param.f32 %f12, [__cudaparm_calc_cell_id_boxlo1];\n"
|
||||
" ld.param.s32 %r14, [__cudaparm_calc_cell_id_ncellx];\n"
|
||||
" ld.param.s32 %r15, [__cudaparm_calc_cell_id_ncelly];\n"
|
||||
" ld.param.f32 %f13, [__cudaparm_calc_cell_id_boxhi2];\n"
|
||||
" sub.ftz.f32 %f14, %f13, %f11;\n"
|
||||
" add.ftz.f32 %f15, %f8, %f14;\n"
|
||||
" sub.ftz.f32 %f16, %f7, %f11;\n"
|
||||
" max.ftz.f32 %f17, %f9, %f16;\n"
|
||||
" min.ftz.f32 %f18, %f15, %f17;\n"
|
||||
" div.approx.ftz.f32 %f19, %f18, %f8;\n"
|
||||
" cvt.ftz.f64.f32 %fd1, %f19;\n"
|
||||
" mov.f64 %fd2, 0d3ff0000000000000; \n"
|
||||
" add.f64 %fd3, %fd1, %fd2;\n"
|
||||
" cvt.rzi.u32.f64 %r16, %fd3;\n"
|
||||
" mul.lo.u32 %r17, %r14, %r16;\n"
|
||||
" mul.lo.u32 %r18, %r15, %r17;\n"
|
||||
" ld.param.f32 %f20, [__cudaparm_calc_cell_id_boxhi1];\n"
|
||||
" sub.ftz.f32 %f21, %f20, %f12;\n"
|
||||
" add.ftz.f32 %f22, %f8, %f21;\n"
|
||||
" sub.ftz.f32 %f23, %f6, %f12;\n"
|
||||
" max.ftz.f32 %f24, %f9, %f23;\n"
|
||||
" min.ftz.f32 %f25, %f22, %f24;\n"
|
||||
" div.approx.ftz.f32 %f26, %f25, %f8;\n"
|
||||
" cvt.ftz.f64.f32 %fd4, %f26;\n"
|
||||
" mov.f64 %fd5, 0d3ff0000000000000; \n"
|
||||
" add.f64 %fd6, %fd4, %fd5;\n"
|
||||
" cvt.rzi.u32.f64 %r19, %fd6;\n"
|
||||
" mul.lo.u32 %r20, %r14, %r19;\n"
|
||||
" add.u32 %r21, %r18, %r20;\n"
|
||||
" ld.param.f32 %f27, [__cudaparm_calc_cell_id_boxhi0];\n"
|
||||
" sub.ftz.f32 %f28, %f27, %f10;\n"
|
||||
" add.ftz.f32 %f29, %f8, %f28;\n"
|
||||
" sub.ftz.f32 %f30, %f5, %f10;\n"
|
||||
" max.ftz.f32 %f31, %f9, %f30;\n"
|
||||
" min.ftz.f32 %f32, %f29, %f31;\n"
|
||||
" div.approx.ftz.f32 %f33, %f32, %f8;\n"
|
||||
" cvt.ftz.f64.f32 %fd7, %f33;\n"
|
||||
" mov.f64 %fd8, 0d3ff0000000000000; \n"
|
||||
" add.f64 %fd9, %fd7, %fd8;\n"
|
||||
" cvt.rzi.u32.f64 %r22, %fd9;\n"
|
||||
" add.u32 %r23, %r21, %r22;\n"
|
||||
" .loc 16 50 0\n"
|
||||
" cvt.s64.s32 %rd1, %r5;\n"
|
||||
" mul.wide.s32 %rd2, %r5, 4;\n"
|
||||
" ld.param.u64 %rd3, [__cudaparm_calc_cell_id_cell_id];\n"
|
||||
" add.u64 %rd4, %rd3, %rd2;\n"
|
||||
" st.global.u32 [%rd4+0], %r23;\n"
|
||||
" .loc 16 51 0\n"
|
||||
" ld.param.u64 %rd5, [__cudaparm_calc_cell_id_particle_id];\n"
|
||||
" add.u64 %rd6, %rd5, %rd2;\n"
|
||||
" st.global.s32 [%rd6+0], %r5;\n"
|
||||
"$Lt_0_1026:\n"
|
||||
" .loc 16 53 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_calc_cell_id:\n"
|
||||
" }\n"
|
||||
" .entry kernel_calc_cell_counts (\n"
|
||||
" .param .u64 __cudaparm_kernel_calc_cell_counts_cell_id,\n"
|
||||
" .param .u64 __cudaparm_kernel_calc_cell_counts_cell_counts,\n"
|
||||
" .param .s32 __cudaparm_kernel_calc_cell_counts_nall,\n"
|
||||
" .param .s32 __cudaparm_kernel_calc_cell_counts_ncell)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<33>;\n"
|
||||
" .reg .u64 %rd<15>;\n"
|
||||
" .reg .pred %p<13>;\n"
|
||||
" .loc 16 56 0\n"
|
||||
"$LDWbegin_kernel_calc_cell_counts:\n"
|
||||
" mov.u32 %r1, %ctaid.x;\n"
|
||||
" mov.u32 %r2, %ntid.x;\n"
|
||||
" mul.lo.u32 %r3, %r1, %r2;\n"
|
||||
" mov.u32 %r4, %tid.x;\n"
|
||||
" add.u32 %r5, %r4, %r3;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_kernel_calc_cell_counts_nall];\n"
|
||||
" setp.gt.s32 %p1, %r6, %r5;\n"
|
||||
" @!%p1 bra $Lt_1_7426;\n"
|
||||
" .loc 16 59 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_calc_cell_counts_cell_id];\n"
|
||||
" cvt.s64.s32 %rd2, %r5;\n"
|
||||
" mul.wide.s32 %rd3, %r5, 4;\n"
|
||||
" add.u64 %rd4, %rd1, %rd3;\n"
|
||||
" ld.global.u32 %r7, [%rd4+0];\n"
|
||||
" mov.u32 %r8, 0;\n"
|
||||
" setp.ne.s32 %p2, %r5, %r8;\n"
|
||||
" @%p2 bra $Lt_1_7938;\n"
|
||||
" add.s32 %r9, %r7, 1;\n"
|
||||
" mov.u32 %r10, 0;\n"
|
||||
" setp.le.s32 %p3, %r9, %r10;\n"
|
||||
" @%p3 bra $Lt_1_8450;\n"
|
||||
" mov.s32 %r11, %r9;\n"
|
||||
" ld.param.u64 %rd5, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n"
|
||||
" mov.s32 %r12, 0;\n"
|
||||
" mov.s32 %r13, %r11;\n"
|
||||
"$Lt_1_8962:\n"
|
||||
" .loc 16 64 0\n"
|
||||
" mov.s32 %r14, 0;\n"
|
||||
" st.global.s32 [%rd5+0], %r14;\n"
|
||||
" add.s32 %r12, %r12, 1;\n"
|
||||
" add.u64 %rd5, %rd5, 4;\n"
|
||||
" setp.ne.s32 %p4, %r9, %r12;\n"
|
||||
" @%p4 bra $Lt_1_8962;\n"
|
||||
"$Lt_1_8450:\n"
|
||||
"$Lt_1_7938:\n"
|
||||
" sub.s32 %r15, %r6, 1;\n"
|
||||
" setp.ne.s32 %p5, %r5, %r15;\n"
|
||||
" @%p5 bra $Lt_1_9474;\n"
|
||||
" .loc 16 67 0\n"
|
||||
" add.s32 %r9, %r7, 1;\n"
|
||||
" mov.s32 %r16, %r9;\n"
|
||||
" ld.param.s32 %r17, [__cudaparm_kernel_calc_cell_counts_ncell];\n"
|
||||
" setp.gt.s32 %p6, %r9, %r17;\n"
|
||||
" @%p6 bra $Lt_1_9986;\n"
|
||||
" sub.s32 %r18, %r17, %r7;\n"
|
||||
" add.s32 %r19, %r17, 1;\n"
|
||||
" ld.param.u64 %rd6, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n"
|
||||
" cvt.s64.s32 %rd7, %r9;\n"
|
||||
" mul.wide.s32 %rd8, %r9, 4;\n"
|
||||
" add.u64 %rd9, %rd6, %rd8;\n"
|
||||
" mov.s32 %r20, %r18;\n"
|
||||
"$Lt_1_10498:\n"
|
||||
" .loc 16 68 0\n"
|
||||
" st.global.s32 [%rd9+0], %r6;\n"
|
||||
" add.s32 %r16, %r16, 1;\n"
|
||||
" add.u64 %rd9, %rd9, 4;\n"
|
||||
" setp.ne.s32 %p7, %r19, %r16;\n"
|
||||
" @%p7 bra $Lt_1_10498;\n"
|
||||
"$Lt_1_9986:\n"
|
||||
"$Lt_1_9474:\n"
|
||||
" selp.s32 %r21, 1, 0, %p1;\n"
|
||||
" mov.s32 %r22, 0;\n"
|
||||
" set.gt.u32.s32 %r23, %r5, %r22;\n"
|
||||
" neg.s32 %r24, %r23;\n"
|
||||
" and.b32 %r25, %r21, %r24;\n"
|
||||
" mov.u32 %r26, 0;\n"
|
||||
" setp.eq.s32 %p8, %r25, %r26;\n"
|
||||
" @%p8 bra $Lt_1_11010;\n"
|
||||
" .loc 16 72 0\n"
|
||||
" ld.global.u32 %r27, [%rd4+-4];\n"
|
||||
" setp.eq.s32 %p9, %r7, %r27;\n"
|
||||
" @%p9 bra $Lt_1_11522;\n"
|
||||
" .loc 16 74 0\n"
|
||||
" add.s32 %r28, %r27, 1;\n"
|
||||
" mov.s32 %r29, %r28;\n"
|
||||
" setp.gt.s32 %p10, %r28, %r7;\n"
|
||||
" @%p10 bra $Lt_1_12034;\n"
|
||||
" sub.s32 %r30, %r7, %r27;\n"
|
||||
" add.s32 %r9, %r7, 1;\n"
|
||||
" ld.param.u64 %rd10, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n"
|
||||
" cvt.s64.s32 %rd11, %r28;\n"
|
||||
" mul.wide.s32 %rd12, %r28, 4;\n"
|
||||
" add.u64 %rd13, %rd10, %rd12;\n"
|
||||
" mov.s32 %r31, %r30;\n"
|
||||
"$Lt_1_12546:\n"
|
||||
" .loc 16 75 0\n"
|
||||
" st.global.s32 [%rd13+0], %r5;\n"
|
||||
" add.s32 %r29, %r29, 1;\n"
|
||||
" add.u64 %rd13, %rd13, 4;\n"
|
||||
" setp.ne.s32 %p11, %r9, %r29;\n"
|
||||
" @%p11 bra $Lt_1_12546;\n"
|
||||
"$Lt_1_12034:\n"
|
||||
"$Lt_1_11522:\n"
|
||||
"$Lt_1_11010:\n"
|
||||
"$Lt_1_7426:\n"
|
||||
" .loc 16 79 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_calc_cell_counts:\n"
|
||||
" }\n"
|
||||
" .entry transpose (\n"
|
||||
" .param .u64 __cudaparm_transpose_out,\n"
|
||||
" .param .u64 __cudaparm_transpose_in,\n"
|
||||
" .param .s32 __cudaparm_transpose_columns_in,\n"
|
||||
" .param .s32 __cudaparm_transpose_rows_in)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<32>;\n"
|
||||
" .reg .u64 %rd<23>;\n"
|
||||
" .reg .f32 %f<4>;\n"
|
||||
" .reg .pred %p<4>;\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32571_32_non_const_block112[288];\n"
|
||||
" .loc 16 86 0\n"
|
||||
"$LDWbegin_transpose:\n"
|
||||
" mov.u32 %r1, %ctaid.x;\n"
|
||||
" mul.lo.u32 %r2, %r1, 8;\n"
|
||||
" mov.u32 %r3, %ctaid.y;\n"
|
||||
" mul.lo.u32 %r4, %r3, 8;\n"
|
||||
" mov.u32 %r5, %tid.x;\n"
|
||||
" add.u32 %r6, %r2, %r5;\n"
|
||||
" mov.u32 %r7, %tid.y;\n"
|
||||
" add.u32 %r8, %r4, %r7;\n"
|
||||
" ld.param.s32 %r9, [__cudaparm_transpose_rows_in];\n"
|
||||
" ld.param.s32 %r10, [__cudaparm_transpose_columns_in];\n"
|
||||
" set.gt.u32.u32 %r11, %r9, %r8;\n"
|
||||
" neg.s32 %r12, %r11;\n"
|
||||
" set.gt.u32.u32 %r13, %r10, %r6;\n"
|
||||
" neg.s32 %r14, %r13;\n"
|
||||
" and.b32 %r15, %r12, %r14;\n"
|
||||
" mov.u32 %r16, 0;\n"
|
||||
" setp.eq.s32 %p1, %r15, %r16;\n"
|
||||
" @%p1 bra $Lt_2_2306;\n"
|
||||
" .loc 16 98 0\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32571_32_non_const_block112;\n"
|
||||
" ld.param.u64 %rd2, [__cudaparm_transpose_in];\n"
|
||||
" mul.lo.u32 %r17, %r10, %r8;\n"
|
||||
" add.u32 %r18, %r6, %r17;\n"
|
||||
" cvt.u64.u32 %rd3, %r18;\n"
|
||||
" mul.wide.u32 %rd4, %r18, 4;\n"
|
||||
" add.u64 %rd5, %rd2, %rd4;\n"
|
||||
" ld.global.s32 %r19, [%rd5+0];\n"
|
||||
" cvt.rn.f32.s32 %f1, %r19;\n"
|
||||
" cvt.u64.u32 %rd6, %r5;\n"
|
||||
" cvt.u64.u32 %rd7, %r7;\n"
|
||||
" mul.wide.u32 %rd8, %r7, 9;\n"
|
||||
" add.u64 %rd9, %rd6, %rd8;\n"
|
||||
" mul.lo.u64 %rd10, %rd9, 4;\n"
|
||||
" add.u64 %rd11, %rd1, %rd10;\n"
|
||||
" st.shared.f32 [%rd11+0], %f1;\n"
|
||||
"$Lt_2_2306:\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32571_32_non_const_block112;\n"
|
||||
" .loc 16 100 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" add.u32 %r20, %r2, %r7;\n"
|
||||
" add.u32 %r21, %r4, %r5;\n"
|
||||
" set.gt.u32.u32 %r22, %r9, %r21;\n"
|
||||
" neg.s32 %r23, %r22;\n"
|
||||
" set.gt.u32.u32 %r24, %r10, %r20;\n"
|
||||
" neg.s32 %r25, %r24;\n"
|
||||
" and.b32 %r26, %r23, %r25;\n"
|
||||
" mov.u32 %r27, 0;\n"
|
||||
" setp.eq.s32 %p2, %r26, %r27;\n"
|
||||
" @%p2 bra $Lt_2_2818;\n"
|
||||
" .loc 16 105 0\n"
|
||||
" cvt.u64.u32 %rd12, %r7;\n"
|
||||
" cvt.u64.u32 %rd13, %r5;\n"
|
||||
" mul.wide.u32 %rd14, %r5, 9;\n"
|
||||
" add.u64 %rd15, %rd12, %rd14;\n"
|
||||
" mul.lo.u64 %rd16, %rd15, 4;\n"
|
||||
" add.u64 %rd17, %rd1, %rd16;\n"
|
||||
" ld.shared.f32 %f2, [%rd17+0];\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r28, %f2;\n"
|
||||
" ld.param.u64 %rd18, [__cudaparm_transpose_out];\n"
|
||||
" mul.lo.u32 %r29, %r9, %r20;\n"
|
||||
" add.u32 %r30, %r21, %r29;\n"
|
||||
" cvt.u64.u32 %rd19, %r30;\n"
|
||||
" mul.wide.u32 %rd20, %r30, 4;\n"
|
||||
" add.u64 %rd21, %rd18, %rd20;\n"
|
||||
" st.global.s32 [%rd21+0], %r28;\n"
|
||||
"$Lt_2_2818:\n"
|
||||
" .loc 16 106 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_transpose:\n"
|
||||
" }\n"
|
||||
" .entry calc_neigh_list_cell (\n"
|
||||
" .param .u64 __cudaparm_calc_neigh_list_cell_x_,\n"
|
||||
" .param .u64 __cudaparm_calc_neigh_list_cell_cell_particle_id,\n"
|
||||
" .param .u64 __cudaparm_calc_neigh_list_cell_cell_counts,\n"
|
||||
" .param .u64 __cudaparm_calc_neigh_list_cell_nbor_list,\n"
|
||||
" .param .u64 __cudaparm_calc_neigh_list_cell_host_nbor_list,\n"
|
||||
" .param .u64 __cudaparm_calc_neigh_list_cell_host_numj,\n"
|
||||
" .param .s32 __cudaparm_calc_neigh_list_cell_neigh_bin_size,\n"
|
||||
" .param .f32 __cudaparm_calc_neigh_list_cell_cell_size,\n"
|
||||
" .param .s32 __cudaparm_calc_neigh_list_cell_ncellx,\n"
|
||||
" .param .s32 __cudaparm_calc_neigh_list_cell_ncelly,\n"
|
||||
" .param .s32 __cudaparm_calc_neigh_list_cell_ncellz,\n"
|
||||
" .param .s32 __cudaparm_calc_neigh_list_cell_inum,\n"
|
||||
" .param .s32 __cudaparm_calc_neigh_list_cell_nt,\n"
|
||||
" .param .s32 __cudaparm_calc_neigh_list_cell_nall,\n"
|
||||
" .param .s32 __cudaparm_calc_neigh_list_cell_t_per_atom)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<118>;\n"
|
||||
" .reg .u64 %rd<52>;\n"
|
||||
" .reg .f32 %f<41>;\n"
|
||||
" .reg .f64 %fd<4>;\n"
|
||||
" .reg .pred %p<23>;\n"
|
||||
" .shared .align 16 .b8 __cuda___cuda_local_var_32609_34_non_const_pos_sh496[2048];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32608_31_non_const_cell_list_sh2544[512];\n"
|
||||
" .loc 16 116 0\n"
|
||||
"$LDWbegin_calc_neigh_list_cell:\n"
|
||||
" .loc 16 128 0\n"
|
||||
" ld.param.s32 %r1, [__cudaparm_calc_neigh_list_cell_ncelly];\n"
|
||||
" mov.u32 %r2, %ctaid.y;\n"
|
||||
" rem.u32 %r3, %r2, %r1;\n"
|
||||
" div.u32 %r4, %r2, %r1;\n"
|
||||
" ld.param.s32 %r5, [__cudaparm_calc_neigh_list_cell_ncellx];\n"
|
||||
" mul.lo.s32 %r6, %r5, %r3;\n"
|
||||
" mul.lo.s32 %r7, %r5, %r4;\n"
|
||||
" mul.lo.s32 %r8, %r7, %r1;\n"
|
||||
" cvt.s32.u32 %r9, %ctaid.x;\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_calc_neigh_list_cell_cell_counts];\n"
|
||||
" add.s32 %r10, %r6, %r8;\n"
|
||||
" add.s32 %r11, %r9, %r10;\n"
|
||||
" cvt.s64.s32 %rd2, %r11;\n"
|
||||
" mul.wide.s32 %rd3, %r11, 4;\n"
|
||||
" add.u64 %rd4, %rd1, %rd3;\n"
|
||||
" ldu.global.s32 %r12, [%rd4+0];\n"
|
||||
" .loc 16 129 0\n"
|
||||
" ldu.global.s32 %r13, [%rd4+4];\n"
|
||||
" .loc 16 137 0\n"
|
||||
" sub.s32 %r14, %r13, %r12;\n"
|
||||
" mov.u32 %r15, %ntid.x;\n"
|
||||
" cvt.rn.f32.u32 %f1, %r15;\n"
|
||||
" cvt.rn.f32.s32 %f2, %r14;\n"
|
||||
" div.approx.ftz.f32 %f3, %f2, %f1;\n"
|
||||
" cvt.rpi.ftz.f32.f32 %f4, %f3;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r16, %f4;\n"
|
||||
" mov.u32 %r17, 0;\n"
|
||||
" setp.le.s32 %p1, %r16, %r17;\n"
|
||||
" @%p1 bra $Lt_3_14082;\n"
|
||||
" sub.s32 %r18, %r3, 1;\n"
|
||||
" mov.s32 %r19, 0;\n"
|
||||
" max.s32 %r20, %r18, %r19;\n"
|
||||
" sub.s32 %r21, %r1, 1;\n"
|
||||
" add.s32 %r22, %r3, 1;\n"
|
||||
" min.s32 %r23, %r21, %r22;\n"
|
||||
" ld.param.s32 %r24, [__cudaparm_calc_neigh_list_cell_ncellz];\n"
|
||||
" sub.s32 %r25, %r24, 1;\n"
|
||||
" add.s32 %r26, %r4, 1;\n"
|
||||
" min.s32 %r27, %r25, %r26;\n"
|
||||
" sub.s32 %r28, %r9, 1;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" max.s32 %r30, %r28, %r29;\n"
|
||||
" add.s32 %r31, %r9, 1;\n"
|
||||
" sub.s32 %r32, %r5, 1;\n"
|
||||
" min.s32 %r33, %r31, %r32;\n"
|
||||
" mov.s32 %r34, %r16;\n"
|
||||
" cvt.s32.u32 %r35, %tid.x;\n"
|
||||
" add.s32 %r36, %r12, %r35;\n"
|
||||
" mov.u32 %r37, 0;\n"
|
||||
" ld.param.s32 %r38, [__cudaparm_calc_neigh_list_cell_inum];\n"
|
||||
" cvt.s64.s32 %rd5, %r38;\n"
|
||||
" sub.s32 %r39, %r4, 1;\n"
|
||||
" mov.s32 %r40, %r36;\n"
|
||||
" mov.s32 %r41, 0;\n"
|
||||
" max.s32 %r42, %r39, %r41;\n"
|
||||
" setp.ge.s32 %p2, %r27, %r42;\n"
|
||||
" ld.param.s32 %r43, [__cudaparm_calc_neigh_list_cell_nt];\n"
|
||||
" ld.param.s32 %r44, [__cudaparm_calc_neigh_list_cell_nall];\n"
|
||||
" mov.s32 %r45, 0;\n"
|
||||
" mov.u64 %rd6, __cuda___cuda_local_var_32609_34_non_const_pos_sh496;\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32608_31_non_const_cell_list_sh2544;\n"
|
||||
" mov.s32 %r46, %r34;\n"
|
||||
"$Lt_3_14594:\n"
|
||||
" .loc 16 140 0\n"
|
||||
" mov.s32 %r47, %r44;\n"
|
||||
" setp.ge.s32 %p3, %r40, %r13;\n"
|
||||
" @%p3 bra $Lt_3_14850;\n"
|
||||
" .loc 16 146 0\n"
|
||||
" ld.param.u64 %rd8, [__cudaparm_calc_neigh_list_cell_cell_particle_id];\n"
|
||||
" add.u32 %r48, %r36, %r37;\n"
|
||||
" cvt.s64.s32 %rd9, %r48;\n"
|
||||
" mul.wide.s32 %rd10, %r48, 4;\n"
|
||||
" add.u64 %rd11, %rd8, %rd10;\n"
|
||||
" ld.global.s32 %r47, [%rd11+0];\n"
|
||||
"$Lt_3_14850:\n"
|
||||
" setp.lt.s32 %p4, %r47, %r43;\n"
|
||||
" @!%p4 bra $Lt_3_15362;\n"
|
||||
" .loc 16 149 0\n"
|
||||
" mov.u32 %r49, %r47;\n"
|
||||
" mov.s32 %r50, 0;\n"
|
||||
" mov.u32 %r51, %r50;\n"
|
||||
" mov.s32 %r52, 0;\n"
|
||||
" mov.u32 %r53, %r52;\n"
|
||||
" mov.s32 %r54, 0;\n"
|
||||
" mov.u32 %r55, %r54;\n"
|
||||
" tex.1d.v4.f32.s32 {%f5,%f6,%f7,%f8},[neigh_tex,{%r49,%r51,%r53,%r55}];\n"
|
||||
" mov.f32 %f9, %f5;\n"
|
||||
" mov.f32 %f10, %f6;\n"
|
||||
" mov.f32 %f11, %f7;\n"
|
||||
" mov.f32 %f12, %f9;\n"
|
||||
" mov.f32 %f13, %f10;\n"
|
||||
" mov.f32 %f14, %f11;\n"
|
||||
"$Lt_3_15362:\n"
|
||||
" cvt.s64.s32 %rd12, %r47;\n"
|
||||
" mul.wide.s32 %rd13, %r47, 4;\n"
|
||||
" setp.ge.s32 %p5, %r47, %r38;\n"
|
||||
" @%p5 bra $Lt_3_16130;\n"
|
||||
" .loc 16 153 0\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_calc_neigh_list_cell_nbor_list];\n"
|
||||
" add.u64 %rd15, %rd12, %rd5;\n"
|
||||
" mul.lo.u64 %rd16, %rd15, 4;\n"
|
||||
" add.u64 %rd17, %rd14, %rd16;\n"
|
||||
" mov.s64 %rd18, %rd17;\n"
|
||||
" .loc 16 154 0\n"
|
||||
" ld.param.s32 %r56, [__cudaparm_calc_neigh_list_cell_t_per_atom];\n"
|
||||
" sub.s32 %r57, %r56, 1;\n"
|
||||
" mul.lo.s32 %r58, %r47, %r57;\n"
|
||||
" cvt.s64.s32 %rd19, %r58;\n"
|
||||
" add.u64 %rd20, %rd19, %rd5;\n"
|
||||
" mul.lo.u64 %rd21, %rd20, 4;\n"
|
||||
" add.u64 %rd22, %rd17, %rd21;\n"
|
||||
" .loc 16 155 0\n"
|
||||
" mul.lo.s32 %r59, %r56, %r38;\n"
|
||||
" sub.s32 %r60, %r59, %r56;\n"
|
||||
" .loc 16 156 0\n"
|
||||
" add.u64 %rd23, %rd13, %rd14;\n"
|
||||
" st.global.s32 [%rd23+0], %r47;\n"
|
||||
" bra.uni $Lt_3_15874;\n"
|
||||
"$Lt_3_16130:\n"
|
||||
" .loc 16 159 0\n"
|
||||
" ld.param.u64 %rd24, [__cudaparm_calc_neigh_list_cell_host_numj];\n"
|
||||
" add.u64 %rd25, %rd24, %rd13;\n"
|
||||
" mul.lo.u64 %rd26, %rd5, 4;\n"
|
||||
" sub.u64 %rd18, %rd25, %rd26;\n"
|
||||
" .loc 16 160 0\n"
|
||||
" ld.param.u64 %rd27, [__cudaparm_calc_neigh_list_cell_host_nbor_list];\n"
|
||||
" ld.param.s32 %r61, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];\n"
|
||||
" sub.s32 %r62, %r47, %r38;\n"
|
||||
" mul.lo.s32 %r63, %r61, %r62;\n"
|
||||
" cvt.s64.s32 %rd28, %r63;\n"
|
||||
" mul.wide.s32 %rd29, %r63, 4;\n"
|
||||
" add.u64 %rd22, %rd27, %rd29;\n"
|
||||
" mov.s32 %r60, 0;\n"
|
||||
"$Lt_3_15874:\n"
|
||||
" .loc 16 165 0\n"
|
||||
" mov.s32 %r64, %r42;\n"
|
||||
" @!%p2 bra $Lt_3_24066;\n"
|
||||
" sub.s32 %r65, %r27, %r42;\n"
|
||||
" add.s32 %r66, %r65, 1;\n"
|
||||
" setp.le.s32 %p6, %r20, %r23;\n"
|
||||
" add.s32 %r67, %r27, 1;\n"
|
||||
" mov.s32 %r68, 0;\n"
|
||||
" mov.s32 %r69, %r66;\n"
|
||||
"$Lt_3_16898:\n"
|
||||
" .loc 16 166 0\n"
|
||||
" mov.s32 %r70, %r20;\n"
|
||||
" @!%p6 bra $Lt_3_17154;\n"
|
||||
" sub.s32 %r71, %r23, %r20;\n"
|
||||
" add.s32 %r72, %r71, 1;\n"
|
||||
" setp.ge.s32 %p7, %r33, %r30;\n"
|
||||
" add.s32 %r73, %r23, 1;\n"
|
||||
" mov.s32 %r74, %r72;\n"
|
||||
"$Lt_3_17666:\n"
|
||||
" @!%p7 bra $Lt_3_17922;\n"
|
||||
" sub.s32 %r75, %r33, %r30;\n"
|
||||
" add.s32 %r76, %r75, 1;\n"
|
||||
" mul.lo.s32 %r77, %r70, %r5;\n"
|
||||
" mul.lo.s32 %r78, %r64, %r5;\n"
|
||||
" mul.lo.s32 %r79, %r78, %r1;\n"
|
||||
" add.s32 %r80, %r33, 1;\n"
|
||||
" add.s32 %r81, %r77, %r79;\n"
|
||||
" add.s32 %r82, %r81, %r30;\n"
|
||||
" add.s32 %r83, %r80, %r81;\n"
|
||||
" cvt.s64.s32 %rd30, %r82;\n"
|
||||
" mul.wide.s32 %rd31, %r82, 4;\n"
|
||||
" add.u64 %rd32, %rd1, %rd31;\n"
|
||||
" mov.s32 %r84, %r76;\n"
|
||||
"$Lt_3_18434:\n"
|
||||
" .loc 16 171 0\n"
|
||||
" ld.global.s32 %r85, [%rd32+0];\n"
|
||||
" .loc 16 172 0\n"
|
||||
" ld.global.s32 %r86, [%rd32+4];\n"
|
||||
" .loc 16 176 0\n"
|
||||
" sub.s32 %r87, %r86, %r85;\n"
|
||||
" cvt.rn.f32.s32 %f15, %r87;\n"
|
||||
" mov.f32 %f16, 0f43000000; \n"
|
||||
" div.approx.ftz.f32 %f17, %f15, %f16;\n"
|
||||
" cvt.rpi.ftz.f32.f32 %f18, %f17;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r88, %f18;\n"
|
||||
" mov.u32 %r89, 0;\n"
|
||||
" setp.le.s32 %p8, %r88, %r89;\n"
|
||||
" @%p8 bra $Lt_3_18690;\n"
|
||||
" mov.s32 %r90, %r88;\n"
|
||||
" mov.s32 %r91, 0;\n"
|
||||
" setp.lt.s32 %p9, %r47, %r43;\n"
|
||||
" mul.lo.s32 %r92, %r88, 128;\n"
|
||||
" mov.s32 %r93, %r90;\n"
|
||||
"$Lt_3_19202:\n"
|
||||
" sub.s32 %r94, %r87, %r91;\n"
|
||||
" mov.s32 %r95, 128;\n"
|
||||
" min.s32 %r96, %r94, %r95;\n"
|
||||
" setp.le.s32 %p10, %r96, %r35;\n"
|
||||
" @%p10 bra $Lt_3_19458;\n"
|
||||
" .loc 16 183 0\n"
|
||||
" ld.param.u64 %rd33, [__cudaparm_calc_neigh_list_cell_cell_particle_id];\n"
|
||||
" add.s32 %r97, %r91, %r35;\n"
|
||||
" add.s32 %r98, %r85, %r97;\n"
|
||||
" cvt.s64.s32 %rd34, %r98;\n"
|
||||
" mul.wide.s32 %rd35, %r98, 4;\n"
|
||||
" add.u64 %rd36, %rd33, %rd35;\n"
|
||||
" ld.global.s32 %r99, [%rd36+0];\n"
|
||||
" .loc 16 184 0\n"
|
||||
" cvt.s64.s32 %rd37, %r35;\n"
|
||||
" mul.wide.s32 %rd38, %r35, 4;\n"
|
||||
" add.u64 %rd39, %rd7, %rd38;\n"
|
||||
" st.shared.s32 [%rd39+0], %r99;\n"
|
||||
" .loc 16 185 0\n"
|
||||
" mov.u32 %r100, %r99;\n"
|
||||
" mov.s32 %r101, 0;\n"
|
||||
" mov.u32 %r102, %r101;\n"
|
||||
" mov.s32 %r103, 0;\n"
|
||||
" mov.u32 %r104, %r103;\n"
|
||||
" mov.s32 %r105, 0;\n"
|
||||
" mov.u32 %r106, %r105;\n"
|
||||
" tex.1d.v4.f32.s32 {%f19,%f20,%f21,%f22},[neigh_tex,{%r100,%r102,%r104,%r106}];\n"
|
||||
" mov.f32 %f23, %f19;\n"
|
||||
" mov.f32 %f24, %f20;\n"
|
||||
" mov.f32 %f25, %f21;\n"
|
||||
" .loc 16 186 0\n"
|
||||
" mul.lo.u64 %rd40, %rd37, 16;\n"
|
||||
" add.u64 %rd41, %rd6, %rd40;\n"
|
||||
" st.shared.v2.f32 [%rd41+0], {%f23,%f24};\n"
|
||||
" .loc 16 188 0\n"
|
||||
" st.shared.f32 [%rd41+8], %f25;\n"
|
||||
"$Lt_3_19458:\n"
|
||||
" .loc 16 190 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" @!%p9 bra $Lt_3_20482;\n"
|
||||
" mov.u32 %r107, 0;\n"
|
||||
" setp.le.s32 %p11, %r96, %r107;\n"
|
||||
" @%p11 bra $Lt_3_20482;\n"
|
||||
" mov.s32 %r108, %r96;\n"
|
||||
" mov.s64 %rd42, 0;\n"
|
||||
" ld.param.f32 %f26, [__cudaparm_calc_neigh_list_cell_cell_size];\n"
|
||||
" mul.ftz.f32 %f27, %f26, %f26;\n"
|
||||
" mov.s64 %rd43, %rd6;\n"
|
||||
" mov.f32 %f28, %f14;\n"
|
||||
" mov.f32 %f29, %f13;\n"
|
||||
" mov.f32 %f30, %f12;\n"
|
||||
" mov.s32 %r109, 0;\n"
|
||||
" mov.s32 %r110, %r108;\n"
|
||||
"$Lt_3_20994:\n"
|
||||
" ld.shared.v4.f32 {%f31,%f32,%f33,_}, [%rd43+0];\n"
|
||||
" .loc 16 196 0\n"
|
||||
" sub.ftz.f32 %f34, %f30, %f31;\n"
|
||||
" .loc 16 197 0\n"
|
||||
" sub.ftz.f32 %f35, %f29, %f32;\n"
|
||||
" .loc 16 198 0\n"
|
||||
" sub.ftz.f32 %f36, %f28, %f33;\n"
|
||||
" .loc 16 195 0\n"
|
||||
" mul.ftz.f32 %f37, %f35, %f35;\n"
|
||||
" fma.rn.ftz.f32 %f38, %f34, %f34, %f37;\n"
|
||||
" fma.rn.ftz.f32 %f39, %f36, %f36, %f38;\n"
|
||||
" setp.gt.ftz.f32 %p12, %f27, %f39;\n"
|
||||
" @!%p12 bra $Lt_3_25346;\n"
|
||||
" cvt.ftz.f64.f32 %fd1, %f39;\n"
|
||||
" mov.f64 %fd2, 0d3ee4f8b588e368f1; \n"
|
||||
" setp.gt.f64 %p13, %fd1, %fd2;\n"
|
||||
" @!%p13 bra $Lt_3_25346;\n"
|
||||
" .loc 16 202 0\n"
|
||||
" add.s32 %r68, %r68, 1;\n"
|
||||
" ld.param.s32 %r111, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];\n"
|
||||
" setp.lt.s32 %p14, %r111, %r68;\n"
|
||||
" @%p14 bra $Lt_3_25346;\n"
|
||||
" .loc 16 204 0\n"
|
||||
" mul.lo.u64 %rd44, %rd42, 4;\n"
|
||||
" add.u64 %rd45, %rd7, %rd44;\n"
|
||||
" ld.shared.s32 %r112, [%rd45+0];\n"
|
||||
" st.global.s32 [%rd22+0], %r112;\n"
|
||||
" cvt.s64.s32 %rd46, %r60;\n"
|
||||
" mul.wide.s32 %rd47, %r60, 4;\n"
|
||||
" add.u64 %rd48, %rd22, %rd47;\n"
|
||||
" add.u64 %rd49, %rd48, 4;\n"
|
||||
" add.u64 %rd50, %rd22, 4;\n"
|
||||
" ld.param.s32 %r113, [__cudaparm_calc_neigh_list_cell_t_per_atom];\n"
|
||||
" sub.s32 %r114, %r113, 1;\n"
|
||||
" and.b32 %r115, %r68, %r114;\n"
|
||||
" mov.s32 %r116, 0;\n"
|
||||
" setp.eq.s32 %p15, %r115, %r116;\n"
|
||||
" selp.u64 %rd22, %rd49, %rd50, %p15;\n"
|
||||
"$Lt_3_25346:\n"
|
||||
"$L_3_13570:\n"
|
||||
" .loc 16 202 0\n"
|
||||
" add.s32 %r109, %r109, 1;\n"
|
||||
" add.s64 %rd42, %rd42, 1;\n"
|
||||
" add.u64 %rd43, %rd43, 16;\n"
|
||||
" setp.ne.s32 %p16, %r96, %r109;\n"
|
||||
" @%p16 bra $Lt_3_20994;\n"
|
||||
"$Lt_3_20482:\n"
|
||||
"$Lt_3_19970:\n"
|
||||
" .loc 16 212 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" add.s32 %r91, %r91, 128;\n"
|
||||
" setp.ne.s32 %p17, %r91, %r92;\n"
|
||||
" @%p17 bra $Lt_3_19202;\n"
|
||||
"$Lt_3_18690:\n"
|
||||
" add.s32 %r82, %r82, 1;\n"
|
||||
" add.u64 %rd32, %rd32, 4;\n"
|
||||
" setp.ne.s32 %p18, %r82, %r83;\n"
|
||||
" @%p18 bra $Lt_3_18434;\n"
|
||||
"$Lt_3_17922:\n"
|
||||
" add.s32 %r70, %r70, 1;\n"
|
||||
" setp.ne.s32 %p19, %r73, %r70;\n"
|
||||
" @%p19 bra $Lt_3_17666;\n"
|
||||
"$Lt_3_17154:\n"
|
||||
" add.s32 %r64, %r64, 1;\n"
|
||||
" setp.ne.s32 %p20, %r67, %r64;\n"
|
||||
" @%p20 bra $Lt_3_16898;\n"
|
||||
" bra.uni $Lt_3_16386;\n"
|
||||
"$Lt_3_24066:\n"
|
||||
" mov.s32 %r68, 0;\n"
|
||||
"$Lt_3_16386:\n"
|
||||
" @!%p4 bra $Lt_3_23042;\n"
|
||||
" .loc 16 218 0\n"
|
||||
" st.global.s32 [%rd18+0], %r68;\n"
|
||||
"$Lt_3_23042:\n"
|
||||
" add.s32 %r45, %r45, 1;\n"
|
||||
" add.u32 %r37, %r37, %r15;\n"
|
||||
" add.s32 %r40, %r40, %r15;\n"
|
||||
" setp.ne.s32 %p21, %r16, %r45;\n"
|
||||
" @%p21 bra $Lt_3_14594;\n"
|
||||
"$Lt_3_14082:\n"
|
||||
" .loc 16 220 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_calc_neigh_list_cell:\n"
|
||||
" }\n"
|
||||
// kernel_special: PTX entry point stored as C string fragments.
// For the work index ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom)
// (ii < nt), the kernel walks a list of s32 entries, looks up tag[entry] for
// each one, and compares it against n3 values read from `special` starting at
// special[ii] with a stride of nt elements. On a match the entry is XOR-ed
// with a code (1, 2 or 3) shifted left by 30 bits and written back in place.
// n1, n2, n3 are the three consecutive s32 values at nspecial[3*ii .. 3*ii+2].
// The list comes from dev_nbor when ii < inum, otherwise from
// host_nbor_list / host_numj.
" .entry kernel_special (\n"
|
||||
" .param .u64 __cudaparm_kernel_special_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_special_host_nbor_list,\n"
|
||||
" .param .u64 __cudaparm_kernel_special_host_numj,\n"
|
||||
" .param .u64 __cudaparm_kernel_special_tag,\n"
|
||||
" .param .u64 __cudaparm_kernel_special_nspecial,\n"
|
||||
" .param .u64 __cudaparm_kernel_special_special,\n"
|
||||
" .param .s32 __cudaparm_kernel_special_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_special_nt,\n"
|
||||
" .param .s32 __cudaparm_kernel_special_max_nbors,\n"
|
||||
" .param .s32 __cudaparm_kernel_special_t_per_atom)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<45>;\n"
|
||||
" .reg .u64 %rd<45>;\n"
|
||||
" .reg .pred %p<11>;\n"
|
||||
" .loc 16 226 0\n"
|
||||
"$LDWbegin_kernel_special:\n"
|
||||
// %r8 = ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom); exit when ii >= nt.
" ld.param.s32 %r1, [__cudaparm_kernel_special_t_per_atom];\n"
|
||||
" cvt.s32.u32 %r2, %tid.x;\n"
|
||||
" div.s32 %r3, %r2, %r1;\n"
|
||||
" cvt.s32.u32 %r4, %ntid.x;\n"
|
||||
" div.s32 %r5, %r4, %r1;\n"
|
||||
" cvt.s32.u32 %r6, %ctaid.x;\n"
|
||||
" mul.lo.s32 %r7, %r6, %r5;\n"
|
||||
" add.s32 %r8, %r3, %r7;\n"
|
||||
" ld.param.s32 %r9, [__cudaparm_kernel_special_nt];\n"
|
||||
" setp.ge.s32 %p1, %r8, %r9;\n"
|
||||
" @%p1 bra $Lt_4_6146;\n"
|
||||
// Load n1 (%r11), n2 (%r12), n3 (%r13) from nspecial[3*ii + 0..2].
" .loc 16 236 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_special_nspecial];\n"
|
||||
" mul.lo.s32 %r10, %r8, 3;\n"
|
||||
" cvt.s64.s32 %rd2, %r10;\n"
|
||||
" mul.wide.s32 %rd3, %r10, 4;\n"
|
||||
" add.u64 %rd4, %rd1, %rd3;\n"
|
||||
" ld.global.s32 %r11, [%rd4+0];\n"
|
||||
" .loc 16 237 0\n"
|
||||
" ld.global.s32 %r12, [%rd4+4];\n"
|
||||
" .loc 16 238 0\n"
|
||||
" ld.global.s32 %r13, [%rd4+8];\n"
|
||||
// ii >= inum selects the host-list path at $Lt_4_6914.
" ld.param.s32 %r14, [__cudaparm_kernel_special_inum];\n"
|
||||
" setp.ge.s32 %p2, %r8, %r14;\n"
|
||||
" @%p2 bra $Lt_4_6914;\n"
|
||||
// Device-list path: %r15 = dev_nbor[ii + inum]; element stride %r17 = inum * t_per_atom.
" .loc 16 244 0\n"
|
||||
" ld.param.u64 %rd5, [__cudaparm_kernel_special_dev_nbor];\n"
|
||||
" cvt.s64.s32 %rd6, %r8;\n"
|
||||
" cvt.s64.s32 %rd7, %r14;\n"
|
||||
" add.u64 %rd8, %rd6, %rd7;\n"
|
||||
" mul.lo.u64 %rd9, %rd8, 4;\n"
|
||||
" add.u64 %rd10, %rd5, %rd9;\n"
|
||||
" ld.global.s32 %r15, [%rd10+0];\n"
|
||||
" .loc 16 246 0\n"
|
||||
" mul.lo.s32 %r16, %r14, %r1;\n"
|
||||
" mov.s32 %r17, %r16;\n"
|
||||
// Base %rd13 = &dev_nbor[ii + inum] + 4*(inum + (t_per_atom-1)*ii);
// end pointer %rd18 = base + 4*((%r15 & (t_per_atom-1)) + (%r15 / t_per_atom) * stride).
" .loc 16 248 0\n"
|
||||
" sub.s32 %r18, %r1, 1;\n"
|
||||
" mul.lo.s32 %r19, %r18, %r8;\n"
|
||||
" add.s32 %r20, %r14, %r19;\n"
|
||||
" cvt.s64.s32 %rd11, %r20;\n"
|
||||
" mul.wide.s32 %rd12, %r20, 4;\n"
|
||||
" add.u64 %rd13, %rd10, %rd12;\n"
|
||||
" and.b32 %r21, %r18, %r15;\n"
|
||||
" cvt.s64.s32 %rd14, %r21;\n"
|
||||
" div.s32 %r22, %r15, %r1;\n"
|
||||
" mul.lo.s32 %r23, %r16, %r22;\n"
|
||||
" cvt.s64.s32 %rd15, %r23;\n"
|
||||
" add.u64 %rd16, %rd14, %rd15;\n"
|
||||
" mul.lo.u64 %rd17, %rd16, 4;\n"
|
||||
" add.u64 %rd18, %rd13, %rd17;\n"
|
||||
// Start pointer %rd21 = base + 4*(tid.x & (t_per_atom-1)).
" .loc 16 249 0\n"
|
||||
" and.b32 %r24, %r18, %r2;\n"
|
||||
" cvt.s64.s32 %rd19, %r24;\n"
|
||||
" mul.wide.s32 %rd20, %r24, 4;\n"
|
||||
" add.u64 %rd21, %rd13, %rd20;\n"
|
||||
" bra.uni $Lt_4_6658;\n"
|
||||
"$Lt_4_6914:\n"
|
||||
// Host-list path: start = host_nbor_list + 4*max_nbors*(ii - inum),
// end = start + 4*host_numj[ii - inum], element stride %r17 = 1.
" .loc 16 252 0\n"
|
||||
" sub.s32 %r25, %r8, %r14;\n"
|
||||
" ld.param.u64 %rd22, [__cudaparm_kernel_special_host_nbor_list];\n"
|
||||
" ld.param.s32 %r26, [__cudaparm_kernel_special_max_nbors];\n"
|
||||
" mul.lo.s32 %r27, %r26, %r25;\n"
|
||||
" cvt.s64.s32 %rd23, %r27;\n"
|
||||
" mul.wide.s32 %rd24, %r27, 4;\n"
|
||||
" add.u64 %rd25, %rd22, %rd24;\n"
|
||||
" mov.s64 %rd21, %rd25;\n"
|
||||
" .loc 16 254 0\n"
|
||||
" ld.param.u64 %rd26, [__cudaparm_kernel_special_host_numj];\n"
|
||||
" cvt.s64.s32 %rd27, %r25;\n"
|
||||
" mul.wide.s32 %rd28, %r25, 4;\n"
|
||||
" add.u64 %rd29, %rd26, %rd28;\n"
|
||||
" ld.global.s32 %r28, [%rd29+0];\n"
|
||||
" cvt.s64.s32 %rd30, %r28;\n"
|
||||
" mul.wide.s32 %rd31, %r28, 4;\n"
|
||||
" add.u64 %rd18, %rd25, %rd31;\n"
|
||||
" mov.s32 %r17, 1;\n"
|
||||
"$Lt_4_6658:\n"
|
||||
// Skip everything when the list is empty (start >= end, unsigned compare).
" setp.ge.u64 %p3, %rd21, %rd18;\n"
|
||||
" @%p3 bra $Lt_4_7170;\n"
|
||||
// %p4 = (n3 > 0): the inner comparison loop only runs when there is something to compare.
" mov.s32 %r29, 0;\n"
|
||||
" setp.gt.s32 %p4, %r13, %r29;\n"
|
||||
" cvt.s64.s32 %rd32, %r17;\n"
|
||||
" ld.param.u64 %rd33, [__cudaparm_kernel_special_tag];\n"
|
||||
"$Lt_4_7682:\n"
|
||||
// Outer loop: %r30 = current list entry, %r31 = tag[%r30].
" .loc 16 258 0\n"
|
||||
" ld.global.s32 %r30, [%rd21+0];\n"
|
||||
" .loc 16 259 0\n"
|
||||
" cvt.s64.s32 %rd34, %r30;\n"
|
||||
" mul.wide.s32 %rd35, %r30, 4;\n"
|
||||
" add.u64 %rd36, %rd33, %rd35;\n"
|
||||
" ld.global.s32 %r31, [%rd36+0];\n"
|
||||
" @!%p4 bra $Lt_4_7938;\n"
|
||||
// Inner loop setup: %rd42 = &special[ii], advanced by nt*4 bytes per step; %r33 = k = 0.
" mov.s32 %r32, %r13;\n"
|
||||
" cvt.s64.s32 %rd37, %r8;\n"
|
||||
" cvt.s64.s32 %rd38, %r9;\n"
|
||||
" mul.wide.s32 %rd39, %r9, 4;\n"
|
||||
" ld.param.u64 %rd40, [__cudaparm_kernel_special_special];\n"
|
||||
" mul.wide.s32 %rd41, %r8, 4;\n"
|
||||
" add.u64 %rd42, %rd40, %rd41;\n"
|
||||
" mov.s32 %r33, 0;\n"
|
||||
" mov.s32 %r34, %r32;\n"
|
||||
"$Lt_4_8450:\n"
|
||||
" ld.global.s32 %r35, [%rd42+0];\n"
|
||||
" setp.ne.s32 %p5, %r35, %r31;\n"
|
||||
" @%p5 bra $Lt_4_8706;\n"
|
||||
// Match: code = (n2 <= k) ? ((n1 <= k) ? 3 : 2) : ((n1 <= k) ? 2 : 1);
// entry ^= code << 30, then the entry is stored back to the list.
" .loc 16 269 0\n"
|
||||
" setp.le.s32 %p6, %r11, %r33;\n"
|
||||
" mov.s32 %r36, 3;\n"
|
||||
" mov.s32 %r37, 2;\n"
|
||||
" selp.s32 %r38, %r36, %r37, %p6;\n"
|
||||
" mov.s32 %r39, 2;\n"
|
||||
" mov.s32 %r40, 1;\n"
|
||||
" selp.s32 %r41, %r39, %r40, %p6;\n"
|
||||
" setp.le.s32 %p7, %r12, %r33;\n"
|
||||
" selp.s32 %r42, %r38, %r41, %p7;\n"
|
||||
" shl.b32 %r43, %r42, 30;\n"
|
||||
" xor.b32 %r30, %r30, %r43;\n"
|
||||
" .loc 16 270 0\n"
|
||||
" st.global.s32 [%rd21+0], %r30;\n"
|
||||
"$Lt_4_8706:\n"
|
||||
// k++, advance to the next special value; repeat until k == n3.
" add.s32 %r33, %r33, 1;\n"
|
||||
" add.u64 %rd42, %rd39, %rd42;\n"
|
||||
" setp.ne.s32 %p8, %r13, %r33;\n"
|
||||
" @%p8 bra $Lt_4_8450;\n"
|
||||
"$Lt_4_7938:\n"
|
||||
// Advance the list pointer by stride*4 bytes; loop while it is below the end pointer.
" .loc 16 257 0\n"
|
||||
" mul.lo.u64 %rd43, %rd32, 4;\n"
|
||||
" add.u64 %rd21, %rd21, %rd43;\n"
|
||||
" setp.lt.u64 %p9, %rd21, %rd18;\n"
|
||||
" @%p9 bra $Lt_4_7682;\n"
|
||||
"$Lt_4_7170:\n"
|
||||
"$Lt_4_6146:\n"
|
||||
" .loc 16 276 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_special:\n"
|
||||
" }\n"
|
||||
;
|
||||
@ -1,900 +0,0 @@
|
||||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_00009b29_00000000-9_lal_pppm.cpp3.i (/home/sjplimp/ccBI#.sIoydv)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_00009b29_00000000-8_lal_pppm.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 17 "lal_pppm.cu"
|
||||
.file 18 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 20 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 21 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
.global .texref pos_tex;
|
||||
.global .texref q_tex;
|
||||
|
||||
// particle_map: each thread handles a single atom index and records the atom
// in a grid cell.
//   index = 64*gid - (nthreads-1)*((64*gid)/nthreads), where
//   gid = ctaid.x*ntid.x + tid.x and nthreads = nctaid.x*ntid.x.
// The thread exits early when index >= nlocal or when q*delvolinv == 0.
// Otherwise the scaled coordinates t = del?inv * (pos - b_lo_?) are truncated
// to integer cell indices. If any t is negative or any index is >= nlocal_?
// the kernel stores 1 to *error. If the cell's old count is >= max_atoms it
// stores 2 to *error and undoes the count increment. Otherwise it writes four
// f64 values {ix+0.5-tx, iy+0.5-ty, iz+0.5-tz, q*delvolinv} to `ans`.
.entry particle_map (
|
||||
.param .u64 __cudaparm_particle_map_x_,
|
||||
.param .u64 __cudaparm_particle_map_q_,
|
||||
.param .f64 __cudaparm_particle_map_delvolinv,
|
||||
.param .s32 __cudaparm_particle_map_nlocal,
|
||||
.param .u64 __cudaparm_particle_map_counts,
|
||||
.param .u64 __cudaparm_particle_map_ans,
|
||||
.param .f64 __cudaparm_particle_map_b_lo_x,
|
||||
.param .f64 __cudaparm_particle_map_b_lo_y,
|
||||
.param .f64 __cudaparm_particle_map_b_lo_z,
|
||||
.param .f64 __cudaparm_particle_map_delxinv,
|
||||
.param .f64 __cudaparm_particle_map_delyinv,
|
||||
.param .f64 __cudaparm_particle_map_delzinv,
|
||||
.param .s32 __cudaparm_particle_map_nlocal_x,
|
||||
.param .s32 __cudaparm_particle_map_nlocal_y,
|
||||
.param .s32 __cudaparm_particle_map_nlocal_z,
|
||||
.param .s32 __cudaparm_particle_map_atom_stride,
|
||||
.param .s32 __cudaparm_particle_map_max_atoms,
|
||||
.param .u64 __cudaparm_particle_map_error)
|
||||
{
|
||||
.reg .u32 %r<50>;
|
||||
.reg .u64 %rd<12>;
|
||||
.reg .f32 %f<14>;
|
||||
.reg .f64 %fd<36>;
|
||||
.reg .pred %p<11>;
|
||||
.loc 17 50 0
|
||||
$LDWbegin_particle_map:
|
||||
// %r7 = gid, %r5 = nthreads (both via 24-bit multiplies); %r12 = atom index.
cvt.s32.u32 %r1, %ntid.x;
|
||||
cvt.s32.u32 %r2, %ctaid.x;
|
||||
mul24.lo.s32 %r3, %r2, %r1;
|
||||
cvt.s32.u32 %r4, %nctaid.x;
|
||||
mul24.lo.s32 %r5, %r4, %r1;
|
||||
mov.u32 %r6, %tid.x;
|
||||
add.u32 %r7, %r3, %r6;
|
||||
sub.s32 %r8, %r5, 1;
|
||||
mul.lo.s32 %r9, %r7, 64;
|
||||
div.s32 %r10, %r9, %r5;
|
||||
mul.lo.s32 %r11, %r8, %r10;
|
||||
sub.s32 %r12, %r9, %r11;
|
||||
ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];
|
||||
setp.le.s32 %p1, %r13, %r12;
|
||||
@%p1 bra $Lt_0_7426;
|
||||
// Fetch the first three components of pos_tex[index] into %f5, %f6, %f7.
.loc 17 62 0
|
||||
mov.u32 %r14, %r12;
|
||||
mov.s32 %r15, 0;
|
||||
mov.u32 %r16, %r15;
|
||||
mov.s32 %r17, 0;
|
||||
mov.u32 %r18, %r17;
|
||||
mov.s32 %r19, 0;
|
||||
mov.u32 %r20, %r19;
|
||||
tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}];
|
||||
mov.f32 %f5, %f1;
|
||||
mov.f32 %f6, %f2;
|
||||
mov.f32 %f7, %f3;
|
||||
// Fetch q_tex[index]; %fd3 = (double)q * delvolinv. Nothing is done when %fd3 == 0.
.loc 17 64 0
|
||||
mov.u32 %r21, %r12;
|
||||
mov.s32 %r22, 0;
|
||||
mov.u32 %r23, %r22;
|
||||
mov.s32 %r24, 0;
|
||||
mov.u32 %r25, %r24;
|
||||
mov.s32 %r26, 0;
|
||||
mov.u32 %r27, %r26;
|
||||
tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}];
|
||||
mov.f32 %f12, %f8;
|
||||
cvt.ftz.f64.f32 %fd1, %f12;
|
||||
ld.param.f64 %fd2, [__cudaparm_particle_map_delvolinv];
|
||||
mul.f64 %fd3, %fd1, %fd2;
|
||||
mov.f64 %fd4, 0d0000000000000000; // 0
|
||||
setp.neu.f64 %p2, %fd3, %fd4;
|
||||
@!%p2 bra $Lt_0_7426;
|
||||
// %fd9 = tx = delxinv * (x - b_lo_x); a negative value goes to the error path.
.loc 17 67 0
|
||||
ld.param.f64 %fd5, [__cudaparm_particle_map_delxinv];
|
||||
cvt.ftz.f64.f32 %fd6, %f5;
|
||||
ld.param.f64 %fd7, [__cudaparm_particle_map_b_lo_x];
|
||||
sub.f64 %fd8, %fd6, %fd7;
|
||||
mul.f64 %fd9, %fd5, %fd8;
|
||||
mov.f64 %fd10, 0d0000000000000000; // 0
|
||||
setp.lt.f64 %p3, %fd9, %fd10;
|
||||
@%p3 bra $Lt_0_8706;
|
||||
// %fd15 = ty = delyinv * (y - b_lo_y).
ld.param.f64 %fd11, [__cudaparm_particle_map_delyinv];
|
||||
cvt.ftz.f64.f32 %fd12, %f6;
|
||||
ld.param.f64 %fd13, [__cudaparm_particle_map_b_lo_y];
|
||||
sub.f64 %fd14, %fd12, %fd13;
|
||||
mul.f64 %fd15, %fd11, %fd14;
|
||||
mov.f64 %fd16, 0d0000000000000000; // 0
|
||||
setp.lt.f64 %p4, %fd15, %fd16;
|
||||
@%p4 bra $Lt_0_8706;
|
||||
// %fd21 = tz = delzinv * (z - b_lo_z).
ld.param.f64 %fd17, [__cudaparm_particle_map_delzinv];
|
||||
cvt.ftz.f64.f32 %fd18, %f7;
|
||||
ld.param.f64 %fd19, [__cudaparm_particle_map_b_lo_z];
|
||||
sub.f64 %fd20, %fd18, %fd19;
|
||||
mul.f64 %fd21, %fd17, %fd20;
|
||||
mov.f64 %fd22, 0d0000000000000000; // 0
|
||||
setp.lt.f64 %p5, %fd21, %fd22;
|
||||
@%p5 bra $Lt_0_8706;
|
||||
// Truncate toward zero to cell indices ix (%r28), iy (%r30), iz (%r32) and
// range-check each against nlocal_x / nlocal_y / nlocal_z.
cvt.rzi.s32.f64 %r28, %fd9;
|
||||
ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];
|
||||
setp.ge.s32 %p6, %r28, %r29;
|
||||
@%p6 bra $Lt_0_8706;
|
||||
cvt.rzi.s32.f64 %r30, %fd15;
|
||||
ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];
|
||||
setp.ge.s32 %p7, %r30, %r31;
|
||||
@%p7 bra $Lt_0_8706;
|
||||
cvt.rzi.s32.f64 %r32, %fd21;
|
||||
ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];
|
||||
setp.gt.s32 %p8, %r33, %r32;
|
||||
@%p8 bra $L_0_4866;
|
||||
$Lt_0_8706:
|
||||
|
||||
$L_0_5122:
|
||||
// Out-of-range atom: store 1 to *error and finish.
.loc 17 76 0
|
||||
mov.s32 %r34, 1;
|
||||
ld.param.u64 %rd1, [__cudaparm_particle_map_error];
|
||||
st.global.s32 [%rd1+0], %r34;
|
||||
bra.uni $Lt_0_7426;
|
||||
$L_0_4866:
|
||||
// Cell index %r38 = ix + nlocal_x*(iy + nlocal_y*iz); atomically increment
// counts[cell]. %r41 holds the value before the increment.
.loc 17 83 0
|
||||
mul.lo.s32 %r35, %r32, %r31;
|
||||
add.s32 %r36, %r30, %r35;
|
||||
mul.lo.s32 %r37, %r36, %r29;
|
||||
add.s32 %r38, %r28, %r37;
|
||||
ld.param.u64 %rd2, [__cudaparm_particle_map_counts];
|
||||
cvt.s64.s32 %rd3, %r38;
|
||||
mul.wide.s32 %rd4, %r38, 4;
|
||||
add.u64 %rd5, %rd2, %rd4;
|
||||
mov.s32 %r39, 1;
|
||||
atom.global.add.s32 %r40, [%rd5], %r39;
|
||||
mov.s32 %r41, %r40;
|
||||
ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];
|
||||
setp.gt.s32 %p9, %r42, %r41;
|
||||
@%p9 bra $Lt_0_7682;
|
||||
// Old count >= max_atoms: store 2 to *error and atomically add -1 to undo the increment.
.loc 17 85 0
|
||||
mov.s32 %r43, 2;
|
||||
ld.param.u64 %rd6, [__cudaparm_particle_map_error];
|
||||
st.global.s32 [%rd6+0], %r43;
|
||||
.loc 16 118 0
|
||||
mov.s32 %r44, -1;
|
||||
atom.global.add.s32 %r45, [%rd5], %r44;
|
||||
bra.uni $Lt_0_7426;
|
||||
$Lt_0_7682:
|
||||
// Record slot = cell + atom_stride*old_count, 32 bytes per record in `ans`.
// The record is written as two v2.f64 stores: {ix+0.5-tx, iy+0.5-ty} at +0
// and {iz+0.5-tz, q*delvolinv} at +16.
.loc 17 88 0
|
||||
ld.param.u64 %rd7, [__cudaparm_particle_map_ans];
|
||||
ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];
|
||||
mul.lo.s32 %r47, %r46, %r41;
|
||||
add.s32 %r48, %r38, %r47;
|
||||
cvt.s64.s32 %rd8, %r48;
|
||||
mul.wide.s32 %rd9, %r48, 32;
|
||||
add.u64 %rd10, %rd7, %rd9;
|
||||
cvt.rn.f64.s32 %fd23, %r28;
|
||||
mov.f64 %fd24, 0d3fe0000000000000; // 0.5
|
||||
add.f64 %fd25, %fd23, %fd24;
|
||||
sub.f64 %fd26, %fd25, %fd9;
|
||||
cvt.rn.f64.s32 %fd27, %r30;
|
||||
mov.f64 %fd28, 0d3fe0000000000000; // 0.5
|
||||
add.f64 %fd29, %fd27, %fd28;
|
||||
sub.f64 %fd30, %fd29, %fd15;
|
||||
st.global.v2.f64 [%rd10+0], {%fd26,%fd30};
|
||||
cvt.rn.f64.s32 %fd31, %r32;
|
||||
mov.f64 %fd32, 0d3fe0000000000000; // 0.5
|
||||
add.f64 %fd33, %fd31, %fd32;
|
||||
sub.f64 %fd34, %fd33, %fd21;
|
||||
st.global.v2.f64 [%rd10+16], {%fd34,%fd3};
|
||||
$Lt_0_7426:
|
||||
$L_0_4610:
|
||||
$Lt_0_6914:
|
||||
$Lt_0_6402:
|
||||
.loc 17 92 0
|
||||
exit;
|
||||
$LDWend_particle_map:
|
||||
} // particle_map
|
||||
|
||||
// make_rho: accumulates per-atom records from `atoms` (32 bytes each, four
// f64 fields) into the f64 output array `brick`.
// Shared scratch declared below: rho_coeff (512 B), front (640 B), ans (4096 B).
// Thread decomposition, as computed in the code:
//   lane  = tid.x % 32            (%r11)
//   group = tid.x / 32            (%r29)
//   id    = group + 2*ctaid.x     (%r36)
//   nz    = id / npts_y  (%r38),  ny = id % npts_y  (%r48)
// The outer loop runs npts_x/32 + 1 times, advancing the x offset (%r58) by 32
// per iteration; nx = lane + offset (%r66).
.entry make_rho (
|
||||
.param .u64 __cudaparm_make_rho_counts,
|
||||
.param .u64 __cudaparm_make_rho_atoms,
|
||||
.param .u64 __cudaparm_make_rho_brick,
|
||||
.param .u64 __cudaparm_make_rho__rho_coeff,
|
||||
.param .s32 __cudaparm_make_rho_atom_stride,
|
||||
.param .s32 __cudaparm_make_rho_npts_x,
|
||||
.param .s32 __cudaparm_make_rho_npts_y,
|
||||
.param .s32 __cudaparm_make_rho_npts_z,
|
||||
.param .s32 __cudaparm_make_rho_nlocal_x,
|
||||
.param .s32 __cudaparm_make_rho_nlocal_y,
|
||||
.param .s32 __cudaparm_make_rho_nlocal_z,
|
||||
.param .s32 __cudaparm_make_rho_order_m_1,
|
||||
.param .s32 __cudaparm_make_rho_order,
|
||||
.param .s32 __cudaparm_make_rho_order2)
|
||||
{
|
||||
.reg .u32 %r<119>;
|
||||
.reg .u64 %rd<57>;
|
||||
.reg .f64 %fd<26>;
|
||||
.reg .pred %p<27>;
|
||||
.shared .align 8 .b8 __cuda___cuda_local_var_32578_34_non_const_rho_coeff200[512];
|
||||
.shared .align 8 .b8 __cuda___cuda_local_var_32579_34_non_const_front712[640];
|
||||
.shared .align 8 .b8 __cuda___cuda_local_var_32580_34_non_const_ans1352[4096];
|
||||
.loc 17 101 0
|
||||
$LDWbegin_make_rho:
|
||||
// Threads with tid.x < order + order2 copy _rho_coeff[tid.x] (f64) into shared rho_coeff.
ld.param.s32 %r1, [__cudaparm_make_rho_order2];
|
||||
ld.param.s32 %r2, [__cudaparm_make_rho_order];
|
||||
add.s32 %r3, %r1, %r2;
|
||||
cvt.s32.u32 %r4, %tid.x;
|
||||
setp.le.s32 %p1, %r3, %r4;
|
||||
@%p1 bra $Lt_1_16898;
|
||||
.loc 17 108 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32578_34_non_const_rho_coeff200;
|
||||
cvt.s64.s32 %rd2, %r4;
|
||||
mul.wide.s32 %rd3, %r4, 8;
|
||||
ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f64 %fd1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f64 [%rd6+0], %fd1;
|
||||
$Lt_1_16898:
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32578_34_non_const_rho_coeff200;
|
||||
// %r11 = lane = tid.x % 32, computed as tid.x - 32*(tid.x / 32) with a signed
// divide-by-32 shift sequence; %p2 = (lane < order).
shr.s32 %r5, %r4, 31;
|
||||
mov.s32 %r6, 31;
|
||||
and.b32 %r7, %r5, %r6;
|
||||
add.s32 %r8, %r7, %r4;
|
||||
shr.s32 %r9, %r8, 5;
|
||||
mul.lo.s32 %r10, %r9, 32;
|
||||
sub.s32 %r11, %r4, %r10;
|
||||
setp.lt.s32 %p2, %r11, %r2;
|
||||
@!%p2 bra $Lt_1_17410;
|
||||
// Lanes below `order` zero front[group*40 + lane + 32] (byte offset +256).
.loc 17 114 0
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32579_34_non_const_front712;
|
||||
mov.f64 %fd2, 0d0000000000000000; // 0
|
||||
cvt.s64.s32 %rd8, %r11;
|
||||
shr.s32 %r12, %r4, 31;
|
||||
mov.s32 %r13, 31;
|
||||
and.b32 %r14, %r12, %r13;
|
||||
add.s32 %r15, %r14, %r4;
|
||||
shr.s32 %r16, %r15, 5;
|
||||
cvt.s64.s32 %rd9, %r16;
|
||||
mul.wide.s32 %rd10, %r16, 40;
|
||||
add.u64 %rd11, %rd8, %rd10;
|
||||
mul.lo.u64 %rd12, %rd11, 8;
|
||||
add.u64 %rd13, %rd7, %rd12;
|
||||
st.shared.f64 [%rd13+256], %fd2;
|
||||
$Lt_1_17410:
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32579_34_non_const_front712;
|
||||
.loc 17 116 0
|
||||
bar.sync 0;
|
||||
// %r23 = npts_x/32 + 1 = number of outer-loop iterations; exit if it is <= 0.
ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];
|
||||
shr.s32 %r18, %r17, 31;
|
||||
mov.s32 %r19, 31;
|
||||
and.b32 %r20, %r18, %r19;
|
||||
add.s32 %r21, %r20, %r17;
|
||||
shr.s32 %r22, %r21, 5;
|
||||
add.s32 %r23, %r22, 1;
|
||||
mov.u32 %r24, 0;
|
||||
setp.le.s32 %p3, %r23, %r24;
|
||||
@%p3 bra $Lt_1_17922;
|
||||
// Loop-invariant setup. %r29 = group, %r36 = id = group + 2*ctaid.x,
// %r33 = nlocal_x * nlocal_y.
shr.s32 %r25, %r4, 31;
|
||||
mov.s32 %r26, 31;
|
||||
and.b32 %r27, %r25, %r26;
|
||||
add.s32 %r28, %r27, %r4;
|
||||
shr.s32 %r29, %r28, 5;
|
||||
add.s32 %r30, %r11, 32;
|
||||
ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];
|
||||
ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];
|
||||
mul.lo.s32 %r33, %r31, %r32;
|
||||
mov.u32 %r34, %ctaid.x;
|
||||
mul.lo.u32 %r35, %r34, 2;
|
||||
add.u32 %r36, %r29, %r35;
|
||||
ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];
|
||||
// nz = id / npts_y. z loop bounds: start %r42 = max(order_m_1 - nz, 0);
// end %r47 = (nz >= nlocal_z) ? nlocal_z - nz + order - 1 : order.
div.s32 %r38, %r36, %r37;
|
||||
ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];
|
||||
setp.lt.s32 %p4, %r38, %r39;
|
||||
sub.s32 %r40, %r39, %r38;
|
||||
mov.s32 %r41, 0;
|
||||
selp.s32 %r42, %r40, %r41, %p4;
|
||||
ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];
|
||||
setp.ge.s32 %p5, %r38, %r43;
|
||||
sub.s32 %r44, %r43, %r38;
|
||||
add.s32 %r45, %r44, %r2;
|
||||
sub.s32 %r46, %r45, 1;
|
||||
selp.s32 %r47, %r46, %r2, %p5;
|
||||
// ny = id % npts_y. y loop bounds: start %r51 and end %r55, built the same way
// from order_m_1, nlocal_y and order.
rem.s32 %r48, %r36, %r37;
|
||||
setp.lt.s32 %p6, %r48, %r39;
|
||||
sub.s32 %r49, %r39, %r48;
|
||||
mov.s32 %r50, 0;
|
||||
selp.s32 %r51, %r49, %r50, %p6;
|
||||
setp.ge.s32 %p7, %r48, %r31;
|
||||
sub.s32 %r52, %r31, %r48;
|
||||
add.s32 %r53, %r52, %r2;
|
||||
sub.s32 %r54, %r53, 1;
|
||||
selp.s32 %r55, %r54, %r2, %p7;
|
||||
mov.s32 %r56, %r23;
|
||||
// %p8 = (order > 0); %r58 = x offset (starts at 0); %r59 = 32 * iteration count;
// %rd19 = &front[group*40 + lane]; %r61 = (npts_z > nz) ? 1 : 0.
mov.s32 %r57, 0;
|
||||
setp.gt.s32 %p8, %r2, %r57;
|
||||
mov.s32 %r58, 0;
|
||||
cvt.s64.s32 %rd14, %r11;
|
||||
cvt.s64.s32 %rd15, %r29;
|
||||
mul.lo.s32 %r59, %r23, 32;
|
||||
mul.wide.s32 %rd16, %r29, 40;
|
||||
add.u64 %rd17, %rd14, %rd16;
|
||||
ld.param.s32 %r60, [__cudaparm_make_rho_npts_z];
|
||||
setp.gt.s32 %p9, %r60, %r38;
|
||||
mul.lo.u64 %rd18, %rd17, 8;
|
||||
selp.s32 %r61, 1, 0, %p9;
|
||||
add.u64 %rd19, %rd18, %rd7;
|
||||
mov.u64 %rd20, __cuda___cuda_local_var_32580_34_non_const_ans1352;
|
||||
mov.s32 %r62, %r56;
|
||||
$Lt_1_18434:
|
||||
//<loop> Loop body line 116, nesting depth: 1, estimated iterations: unknown
|
||||
@!%p8 bra $Lt_1_18690;
|
||||
// Zero this thread's `order` slots in ans: ans[tid.x + 64*k], k = 0..order-1
// (512-byte stride).
mov.s32 %r63, %r2;
|
||||
cvt.s64.s32 %rd21, %r4;
|
||||
mul.wide.s32 %rd22, %r4, 8;
|
||||
add.u64 %rd23, %rd20, %rd22;
|
||||
mov.s32 %r64, 0;
|
||||
mov.s32 %r65, %r63;
|
||||
$Lt_1_19202:
|
||||
//<loop> Loop body line 116, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 140 0
|
||||
mov.f64 %fd3, 0d0000000000000000; // 0
|
||||
st.shared.f64 [%rd23+0], %fd3;
|
||||
add.s32 %r64, %r64, 1;
|
||||
add.u64 %rd23, %rd23, 512;
|
||||
setp.ne.s32 %p10, %r64, %r2;
|
||||
@%p10 bra $Lt_1_19202;
|
||||
$Lt_1_18690:
|
||||
// nx = lane + offset. The accumulation is skipped unless nx < nlocal_x and
// npts_z > nz.
add.s32 %r66, %r11, %r58;
|
||||
set.lt.u32.s32 %r67, %r66, %r32;
|
||||
neg.s32 %r68, %r67;
|
||||
and.b32 %r69, %r61, %r68;
|
||||
mov.u32 %r70, 0;
|
||||
setp.eq.s32 %p11, %r69, %r70;
|
||||
@%p11 bra $Lt_1_20226;
|
||||
// z loop: m (%r71) runs from %r42 up to %r47.
.loc 17 143 0
|
||||
mov.s32 %r71, %r42;
|
||||
setp.ge.s32 %p12, %r42, %r47;
|
||||
@%p12 bra $Lt_1_20226;
|
||||
sub.s32 %r72, %r47, %r42;
|
||||
setp.lt.s32 %p13, %r51, %r55;
|
||||
mov.s32 %r73, %r72;
|
||||
$Lt_1_20738:
|
||||
//<loop> Loop body line 143, nesting depth: 2, estimated iterations: unknown
|
||||
// y loop: l (%r74) runs from %r51 up to %r55.
// %r82 = nlocal_x*nlocal_y * (m + nz - order_m_1).
.loc 17 145 0
|
||||
mov.s32 %r74, %r51;
|
||||
@!%p13 bra $Lt_1_20994;
|
||||
sub.s32 %r75, %r55, %r51;
|
||||
sub.s32 %r76, %r71, %r42;
|
||||
add.s32 %r77, %r38, %r42;
|
||||
add.s32 %r78, %r48, %r51;
|
||||
sub.s32 %r79, %r77, %r39;
|
||||
sub.s32 %r80, %r78, %r39;
|
||||
add.s32 %r81, %r76, %r79;
|
||||
mul.lo.s32 %r82, %r33, %r81;
|
||||
ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];
|
||||
ld.param.u64 %rd24, [__cudaparm_make_rho_counts];
|
||||
mov.s32 %r84, %r75;
|
||||
$Lt_1_21506:
|
||||
//<loop> Loop body line 145, nesting depth: 3, estimated iterations: unknown
|
||||
// Cell index %r89 = nx + %r82 + nlocal_x*(l + ny - order_m_1);
// loop limit %r91 = counts[cell] * atom_stride.
.loc 17 147 0
|
||||
sub.s32 %r85, %r74, %r51;
|
||||
add.s32 %r86, %r85, %r80;
|
||||
mul.lo.s32 %r87, %r86, %r32;
|
||||
add.s32 %r88, %r82, %r87;
|
||||
add.s32 %r89, %r66, %r88;
|
||||
cvt.s64.s32 %rd25, %r89;
|
||||
mul.wide.s32 %rd26, %r89, 4;
|
||||
add.u64 %rd27, %rd24, %rd26;
|
||||
ld.global.s32 %r90, [%rd27+0];
|
||||
mul.lo.s32 %r91, %r90, %r83;
|
||||
// Record loop: row (%r92) starts at the cell index and advances by atom_stride
// while row < %r91; %rd32 points at atoms[row] (32 bytes per record).
.loc 17 148 0
|
||||
mov.s32 %r92, %r89;
|
||||
setp.ge.s32 %p14, %r89, %r91;
|
||||
@%p14 bra $Lt_1_21762;
|
||||
sub.s32 %r93, %r3, 1;
|
||||
cvt.s64.s32 %rd28, %r83;
|
||||
mul.wide.s32 %rd29, %r83, 32;
|
||||
mov.s32 %r94, -1;
|
||||
setp.gt.s32 %p15, %r93, %r94;
|
||||
ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];
|
||||
mul.lo.u64 %rd31, %rd25, 32;
|
||||
add.u64 %rd32, %rd30, %rd31;
|
||||
$Lt_1_22274:
|
||||
//<loop> Loop body line 148, nesting depth: 4, estimated iterations: unknown
|
||||
// %fd4 = record field 0. When order+order2-1 >= 0, fields 1 and 2 (%fd5, %fd6)
// drive two multiply-add chains over rho_coeff, starting at indices
// (order+order2-1 - l) and (order+order2-1 - m) and stepping down by `order`.
.loc 17 149 0
|
||||
ld.global.f64 %fd4, [%rd32+0];
|
||||
@!%p15 bra $Lt_1_29954;
|
||||
sub.s32 %r95, %r93, %r74;
|
||||
mov.s32 %r96, -1;
|
||||
sub.s32 %r97, %r96, %r74;
|
||||
cvt.s64.s32 %rd33, %r2;
|
||||
mul.wide.s32 %rd34, %r2, 8;
|
||||
ld.global.f64 %fd5, [%rd32+8];
|
||||
ld.global.f64 %fd6, [%rd32+16];
|
||||
cvt.s64.s32 %rd35, %r95;
|
||||
mul.wide.s32 %rd36, %r95, 8;
|
||||
add.u64 %rd37, %rd1, %rd36;
|
||||
sub.s32 %r98, %r93, %r71;
|
||||
cvt.s64.s32 %rd38, %r98;
|
||||
mul.wide.s32 %rd39, %r98, 8;
|
||||
add.u64 %rd40, %rd1, %rd39;
|
||||
mov.f64 %fd7, 0d0000000000000000; // 0
|
||||
mov.f64 %fd8, 0d0000000000000000; // 0
|
||||
$Lt_1_23042:
|
||||
//<loop> Loop body line 149, nesting depth: 5, estimated iterations: unknown
|
||||
// %fd8 = %fd8*field1 + coeff; %fd7 = %fd7*field2 + coeff (Horner-style accumulation).
.loc 17 154 0
|
||||
ld.shared.f64 %fd9, [%rd37+0];
|
||||
mad.rn.f64 %fd8, %fd8, %fd5, %fd9;
|
||||
.loc 17 155 0
|
||||
ld.shared.f64 %fd10, [%rd40+0];
|
||||
mad.rn.f64 %fd7, %fd7, %fd6, %fd10;
|
||||
sub.u64 %rd40, %rd40, %rd34;
|
||||
sub.s32 %r95, %r95, %r2;
|
||||
sub.u64 %rd37, %rd37, %rd34;
|
||||
setp.gt.s32 %p16, %r95, %r97;
|
||||
@%p16 bra $Lt_1_23042;
|
||||
bra.uni $Lt_1_22530;
|
||||
$Lt_1_29954:
|
||||
mov.f64 %fd7, 0d0000000000000000; // 0
|
||||
mov.f64 %fd8, 0d0000000000000000; // 0
|
||||
$Lt_1_22530:
|
||||
// %fd13 = field3 * (%fd7 * %fd8): the per-record weight used for every x slot below.
.loc 17 157 0
|
||||
ld.global.f64 %fd11, [%rd32+24];
|
||||
mul.f64 %fd12, %fd7, %fd8;
|
||||
mul.f64 %fd13, %fd11, %fd12;
|
||||
@!%p8 bra $Lt_1_23554;
|
||||
// x-slot loop: for k (%r100) in 0..order-1, with %rd43 = &ans[tid.x + 64*k].
mov.s32 %r99, %r2;
|
||||
cvt.s64.s32 %rd41, %r4;
|
||||
mul.wide.s32 %rd42, %r4, 8;
|
||||
add.u64 %rd43, %rd20, %rd42;
|
||||
mov.s32 %r100, 0;
|
||||
mov.s32 %r101, %r99;
|
||||
$Lt_1_24066:
|
||||
//<loop> Loop body line 157, nesting depth: 5, estimated iterations: unknown
|
||||
// Third multiply-add chain in field0: indices start at k + order2 and step
// down by `order` while the index is >= k.
.loc 17 161 0
|
||||
add.s32 %r102, %r100, %r1;
|
||||
mov.s32 %r103, %r102;
|
||||
setp.lt.s32 %p17, %r102, %r100;
|
||||
@%p17 bra $Lt_1_30466;
|
||||
cvt.s64.s32 %rd44, %r2;
|
||||
mul.wide.s32 %rd34, %r2, 8;
|
||||
cvt.s64.s32 %rd45, %r102;
|
||||
mul.wide.s32 %rd46, %r102, 8;
|
||||
add.u64 %rd47, %rd1, %rd46;
|
||||
mov.f64 %fd14, 0d0000000000000000; // 0
|
||||
$Lt_1_24834:
|
||||
//<loop> Loop body line 161, nesting depth: 6, estimated iterations: unknown
|
||||
.loc 17 162 0
|
||||
ld.shared.f64 %fd15, [%rd47+0];
|
||||
mad.rn.f64 %fd14, %fd4, %fd14, %fd15;
|
||||
sub.s32 %r103, %r103, %r2;
|
||||
sub.u64 %rd47, %rd47, %rd34;
|
||||
setp.ge.s32 %p18, %r103, %r100;
|
||||
@%p18 bra $Lt_1_24834;
|
||||
bra.uni $Lt_1_24322;
|
||||
$Lt_1_30466:
|
||||
mov.f64 %fd14, 0d0000000000000000; // 0
|
||||
$Lt_1_24322:
|
||||
// ans[tid.x + 64*k] += %fd14 * %fd13.
.loc 17 163 0
|
||||
ld.shared.f64 %fd16, [%rd43+0];
|
||||
mad.rn.f64 %fd17, %fd14, %fd13, %fd16;
|
||||
st.shared.f64 [%rd43+0], %fd17;
|
||||
add.s32 %r100, %r100, 1;
|
||||
add.u64 %rd43, %rd43, 512;
|
||||
setp.ne.s32 %p19, %r100, %r2;
|
||||
@%p19 bra $Lt_1_24066;
|
||||
$Lt_1_23554:
|
||||
// Next record: row += atom_stride, pointer += 32*atom_stride bytes.
add.s32 %r92, %r92, %r83;
|
||||
add.u64 %rd32, %rd29, %rd32;
|
||||
setp.gt.s32 %p20, %r91, %r92;
|
||||
@%p20 bra $Lt_1_22274;
|
||||
$Lt_1_21762:
|
||||
add.s32 %r74, %r74, 1;
|
||||
setp.ne.s32 %p21, %r55, %r74;
|
||||
@%p21 bra $Lt_1_21506;
|
||||
$Lt_1_20994:
|
||||
add.s32 %r71, %r71, 1;
|
||||
setp.ne.s32 %p22, %r47, %r71;
|
||||
@%p22 bra $Lt_1_20738;
|
||||
$Lt_1_20226:
|
||||
$Lt_1_19714:
|
||||
.loc 17 172 0
|
||||
bar.sync 0;
|
||||
// Shift the `front` window: lanes below `order` move front[+32] into front[+0]
// and clear front[+32]; all other lanes clear front[+0].
@!%p2 bra $Lt_1_26626;
|
||||
.loc 17 174 0
|
||||
ld.shared.f64 %fd18, [%rd19+256];
|
||||
st.shared.f64 [%rd19+0], %fd18;
|
||||
.loc 17 175 0
|
||||
mov.f64 %fd19, 0d0000000000000000; // 0
|
||||
st.shared.f64 [%rd19+256], %fd19;
|
||||
bra.uni $Lt_1_26370;
|
||||
$Lt_1_26626:
|
||||
.loc 17 177 0
|
||||
mov.f64 %fd20, 0d0000000000000000; // 0
|
||||
st.shared.f64 [%rd19+0], %fd20;
|
||||
$Lt_1_26370:
|
||||
@!%p8 bra $Lt_1_26882;
|
||||
// Fold this thread's `order` ans slots into front: iteration k adds
// ans[tid.x + 64*k] into front[group*40 + lane + k], with a barrier after
// every iteration.
mov.s32 %r104, %r2;
|
||||
cvt.s64.s32 %rd48, %r4;
|
||||
mov.s32 %r105, %r11;
|
||||
add.s32 %r106, %r11, %r2;
|
||||
mul.wide.s32 %rd49, %r4, 8;
|
||||
add.u64 %rd50, %rd20, %rd49;
|
||||
mov.s64 %rd51, %rd19;
|
||||
mov.s32 %r107, %r104;
|
||||
$Lt_1_27394:
|
||||
//<loop> Loop body line 177, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 180 0
|
||||
ld.shared.f64 %fd21, [%rd50+0];
|
||||
ld.shared.f64 %fd22, [%rd51+0];
|
||||
add.f64 %fd23, %fd21, %fd22;
|
||||
st.shared.f64 [%rd51+0], %fd23;
|
||||
.loc 17 181 0
|
||||
bar.sync 0;
|
||||
add.s32 %r105, %r105, 1;
|
||||
add.u64 %rd51, %rd51, 8;
|
||||
add.u64 %rd50, %rd50, 512;
|
||||
setp.ne.s32 %p23, %r105, %r106;
|
||||
@%p23 bra $Lt_1_27394;
|
||||
$Lt_1_26882:
|
||||
// Write-out guard: requires nx < npts_x and npts_z > nz.
set.lt.u32.s32 %r108, %r66, %r17;
|
||||
neg.s32 %r109, %r108;
|
||||
and.b32 %r110, %r61, %r109;
|
||||
mov.u32 %r111, 0;
|
||||
setp.eq.s32 %p24, %r110, %r111;
|
||||
@%p24 bra $Lt_1_27906;
|
||||
// brick[nx + npts_x*(ny + npts_y*nz)] = front[group*40 + lane].
.loc 17 185 0
|
||||
ld.shared.f64 %fd24, [%rd19+0];
|
||||
ld.param.u64 %rd52, [__cudaparm_make_rho_brick];
|
||||
add.s32 %r112, %r11, %r58;
|
||||
mul.lo.s32 %r113, %r37, %r17;
|
||||
mul.lo.s32 %r114, %r38, %r113;
|
||||
mul.lo.s32 %r115, %r48, %r17;
|
||||
add.s32 %r116, %r114, %r115;
|
||||
add.s32 %r117, %r112, %r116;
|
||||
cvt.s64.s32 %rd53, %r117;
|
||||
mul.wide.s32 %rd54, %r117, 8;
|
||||
add.u64 %rd55, %rd52, %rd54;
|
||||
st.global.f64 [%rd55+0], %fd24;
|
||||
$Lt_1_27906:
|
||||
// Advance the x offset by 32 and repeat until all npts_x/32 + 1 chunks are done.
add.s32 %r58, %r58, 32;
|
||||
setp.ne.s32 %p25, %r58, %r59;
|
||||
@%p25 bra $Lt_1_18434;
|
||||
$Lt_1_17922:
|
||||
.loc 17 189 0
|
||||
exit;
|
||||
$LDWend_make_rho:
|
||||
} // make_rho
|
||||
|
||||
.entry interp (
|
||||
.param .u64 __cudaparm_interp_x_,
|
||||
.param .u64 __cudaparm_interp_q_,
|
||||
.param .s32 __cudaparm_interp_nlocal,
|
||||
.param .u64 __cudaparm_interp_brick,
|
||||
.param .u64 __cudaparm_interp__rho_coeff,
|
||||
.param .s32 __cudaparm_interp_npts_x,
|
||||
.param .s32 __cudaparm_interp_npts_yx,
|
||||
.param .f64 __cudaparm_interp_b_lo_x,
|
||||
.param .f64 __cudaparm_interp_b_lo_y,
|
||||
.param .f64 __cudaparm_interp_b_lo_z,
|
||||
.param .f64 __cudaparm_interp_delxinv,
|
||||
.param .f64 __cudaparm_interp_delyinv,
|
||||
.param .f64 __cudaparm_interp_delzinv,
|
||||
.param .s32 __cudaparm_interp_order,
|
||||
.param .s32 __cudaparm_interp_order2,
|
||||
.param .f64 __cudaparm_interp_qqrd2e_scale,
|
||||
.param .u64 __cudaparm_interp_ans)
|
||||
{
|
||||
.reg .u32 %r<56>;
|
||||
.reg .u64 %rd<37>;
|
||||
.reg .f32 %f<19>;
|
||||
.reg .f64 %fd<63>;
|
||||
.reg .pred %p<14>;
|
||||
.shared .align 8 .b8 __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568[512];
|
||||
.shared .align 8 .b8 __cuda___cuda_local_var_32677_34_non_const_rho1d_06080[4096];
|
||||
.shared .align 8 .b8 __cuda___cuda_local_var_32678_34_non_const_rho1d_110176[4096];
|
||||
// __cuda_local_var_32694_12_non_const_ek = 16
|
||||
.loc 17 199 0
|
||||
$LDWbegin_interp:
|
||||
ld.param.s32 %r1, [__cudaparm_interp_order2];
|
||||
ld.param.s32 %r2, [__cudaparm_interp_order];
|
||||
add.s32 %r3, %r1, %r2;
|
||||
cvt.s32.u32 %r4, %tid.x;
|
||||
setp.le.s32 %p1, %r3, %r4;
|
||||
@%p1 bra $Lt_2_8706;
|
||||
.loc 17 206 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568;
|
||||
cvt.s64.s32 %rd2, %r4;
|
||||
mul.wide.s32 %rd3, %r4, 8;
|
||||
ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f64 %fd1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f64 [%rd6+0], %fd1;
|
||||
$Lt_2_8706:
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568;
|
||||
.loc 17 207 0
|
||||
bar.sync 0;
|
||||
mov.u32 %r5, %ctaid.x;
|
||||
mov.u32 %r6, %ntid.x;
|
||||
mul.lo.u32 %r7, %r5, %r6;
|
||||
add.u32 %r8, %r4, %r7;
|
||||
ld.param.s32 %r9, [__cudaparm_interp_nlocal];
|
||||
setp.le.s32 %p2, %r9, %r8;
|
||||
@%p2 bra $Lt_2_9218;
|
||||
.loc 17 215 0
|
||||
mov.u32 %r10, %r8;
|
||||
mov.s32 %r11, 0;
|
||||
mov.u32 %r12, %r11;
|
||||
mov.s32 %r13, 0;
|
||||
mov.u32 %r14, %r13;
|
||||
mov.s32 %r15, 0;
|
||||
mov.u32 %r16, %r15;
|
||||
tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r10,%r12,%r14,%r16}];
|
||||
mov.f32 %f5, %f1;
|
||||
mov.f32 %f6, %f2;
|
||||
mov.f32 %f7, %f3;
|
||||
.loc 17 216 0
|
||||
mov.u32 %r17, %r8;
|
||||
mov.s32 %r18, 0;
|
||||
mov.u32 %r19, %r18;
|
||||
mov.s32 %r20, 0;
|
||||
mov.u32 %r21, %r20;
|
||||
mov.s32 %r22, 0;
|
||||
mov.u32 %r23, %r22;
|
||||
tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r17,%r19,%r21,%r23}];
|
||||
mov.f32 %f12, %f8;
|
||||
cvt.ftz.f64.f32 %fd2, %f12;
|
||||
ld.param.f64 %fd3, [__cudaparm_interp_qqrd2e_scale];
|
||||
mul.f64 %fd4, %fd2, %fd3;
|
||||
mov.f64 %fd5, 0d0000000000000000; // 0
|
||||
setp.neu.f64 %p3, %fd4, %fd5;
|
||||
@!%p3 bra $Lt_2_9986;
|
||||
mov.s32 %r24, 0;
|
||||
setp.gt.s32 %p4, %r2, %r24;
|
||||
ld.param.f64 %fd6, [__cudaparm_interp_delxinv];
|
||||
cvt.ftz.f64.f32 %fd7, %f5;
|
||||
ld.param.f64 %fd8, [__cudaparm_interp_b_lo_x];
|
||||
sub.f64 %fd9, %fd7, %fd8;
|
||||
mul.f64 %fd10, %fd6, %fd9;
|
||||
@!%p4 bra $Lt_2_16386;
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32677_34_non_const_rho1d_06080;
|
||||
mov.u64 %rd8, __cuda___cuda_local_var_32678_34_non_const_rho1d_110176;
|
||||
cvt.rzi.s32.f64 %r25, %fd10;
|
||||
cvt.rn.f64.s32 %fd11, %r25;
|
||||
mov.f64 %fd12, 0d3fe0000000000000; // 0.5
|
||||
add.f64 %fd13, %fd11, %fd12;
|
||||
sub.f64 %fd14, %fd13, %fd10;
|
||||
ld.param.f64 %fd15, [__cudaparm_interp_delyinv];
|
||||
cvt.ftz.f64.f32 %fd16, %f6;
|
||||
ld.param.f64 %fd17, [__cudaparm_interp_b_lo_y];
|
||||
sub.f64 %fd18, %fd16, %fd17;
|
||||
mul.f64 %fd19, %fd15, %fd18;
|
||||
cvt.rzi.s32.f64 %r26, %fd19;
|
||||
cvt.rn.f64.s32 %fd20, %r26;
|
||||
mov.f64 %fd21, 0d3fe0000000000000; // 0.5
|
||||
add.f64 %fd22, %fd20, %fd21;
|
||||
sub.f64 %fd23, %fd22, %fd19;
|
||||
mov.s32 %r27, %r2;
|
||||
cvt.s64.s32 %rd9, %r4;
|
||||
mov.s32 %r28, %r1;
|
||||
mul.wide.s32 %rd3, %r4, 8;
|
||||
add.u64 %rd10, %rd3, %rd7;
|
||||
add.u64 %rd11, %rd3, %rd8;
|
||||
mov.s32 %r29, 0;
|
||||
mov.s32 %r30, %r27;
|
||||
$Lt_2_10754:
|
||||
//<loop> Loop body line 216, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 17 235 0
|
||||
mov.f64 %fd24, 0d0000000000000000; // 0
|
||||
mov.f64 %fd25, 0d0000000000000000; // 0
|
||||
st.shared.f64 [%rd10+0], %fd25;
|
||||
.loc 17 236 0
|
||||
mov.f64 %fd26, 0d0000000000000000; // 0
|
||||
mov.f64 %fd27, 0d0000000000000000; // 0
|
||||
st.shared.f64 [%rd11+0], %fd27;
|
||||
.loc 17 237 0
|
||||
mov.s32 %r31, %r28;
|
||||
setp.lt.s32 %p5, %r28, %r29;
|
||||
@%p5 bra $Lt_2_11010;
|
||||
cvt.s64.s32 %rd12, %r2;
|
||||
mul.wide.s32 %rd13, %r2, 8;
|
||||
cvt.s64.s32 %rd14, %r28;
|
||||
mul.wide.s32 %rd15, %r28, 8;
|
||||
add.u64 %rd16, %rd1, %rd15;
|
||||
$Lt_2_11522:
|
||||
//<loop> Loop body line 237, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 238 0
|
||||
ld.shared.f64 %fd28, [%rd16+0];
|
||||
mad.rn.f64 %fd24, %fd24, %fd14, %fd28;
|
||||
st.shared.f64 [%rd10+0], %fd24;
|
||||
.loc 17 239 0
|
||||
mad.rn.f64 %fd26, %fd26, %fd23, %fd28;
|
||||
st.shared.f64 [%rd11+0], %fd26;
|
||||
sub.s32 %r31, %r31, %r2;
|
||||
sub.u64 %rd16, %rd16, %rd13;
|
||||
setp.ge.s32 %p6, %r31, %r29;
|
||||
@%p6 bra $Lt_2_11522;
|
||||
$Lt_2_11010:
|
||||
add.s32 %r29, %r29, 1;
|
||||
add.s32 %r28, %r28, 1;
|
||||
add.u64 %rd11, %rd11, 512;
|
||||
add.u64 %rd10, %rd10, 512;
|
||||
setp.ne.s32 %p7, %r28, %r3;
|
||||
@%p7 bra $Lt_2_10754;
|
||||
bra.uni $Lt_2_10242;
|
||||
$Lt_2_16386:
|
||||
cvt.rzi.s32.f64 %r25, %fd10;
|
||||
mov.u64 %rd8, __cuda___cuda_local_var_32678_34_non_const_rho1d_110176;
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32677_34_non_const_rho1d_06080;
|
||||
$Lt_2_10242:
|
||||
.loc 17 243 0
|
||||
ld.param.f64 %fd29, [__cudaparm_interp_delzinv];
|
||||
cvt.ftz.f64.f32 %fd30, %f7;
|
||||
ld.param.f64 %fd31, [__cudaparm_interp_b_lo_z];
|
||||
sub.f64 %fd32, %fd30, %fd31;
|
||||
mul.f64 %fd33, %fd29, %fd32;
|
||||
cvt.rzi.s32.f64 %r32, %fd33;
|
||||
ld.param.s32 %r33, [__cudaparm_interp_npts_yx];
|
||||
mul.lo.s32 %r34, %r32, %r33;
|
||||
add.s32 %r35, %r25, %r34;
|
||||
@!%p4 bra $Lt_2_16898;
|
||||
cvt.rn.f64.s32 %fd34, %r32;
|
||||
mov.f64 %fd35, 0d3fe0000000000000; // 0.5
|
||||
add.f64 %fd36, %fd34, %fd35;
|
||||
sub.f64 %fd37, %fd36, %fd33;
|
||||
mov.s32 %r36, %r2;
|
||||
cvt.ftz.f64.f32 %fd38, %f6;
|
||||
cvt.s64.s32 %rd17, %r4;
|
||||
ld.param.f64 %fd39, [__cudaparm_interp_delyinv];
|
||||
ld.param.f64 %fd40, [__cudaparm_interp_b_lo_y];
|
||||
sub.f64 %fd41, %fd38, %fd40;
|
||||
mul.f64 %fd42, %fd39, %fd41;
|
||||
cvt.rzi.s32.f64 %r37, %fd42;
|
||||
mul.wide.s32 %rd3, %r4, 8;
|
||||
ld.param.s32 %r38, [__cudaparm_interp_npts_x];
|
||||
mul.lo.s32 %r39, %r37, %r38;
|
||||
add.u64 %rd18, %rd3, %rd7;
|
||||
add.u64 %rd19, %rd3, %rd8;
|
||||
cvt.s64.s32 %rd20, %r38;
|
||||
mul.wide.s32 %rd21, %r38, 32;
|
||||
add.s32 %r40, %r39, %r35;
|
||||
mov.s32 %r41, %r40;
|
||||
ld.param.u64 %rd22, [__cudaparm_interp_brick];
|
||||
mov.s32 %r42, 0;
|
||||
mov.f32 %f13, 0f00000000; // 0
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
mov.f32 %f15, 0f00000000; // 0
|
||||
mov.s32 %r43, %r36;
|
||||
$Lt_2_12802:
|
||||
//<loop> Loop body line 243, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 17 246 0
|
||||
add.s32 %r44, %r42, %r1;
|
||||
mov.s32 %r45, %r44;
|
||||
setp.lt.s32 %p8, %r44, %r42;
|
||||
@%p8 bra $Lt_2_17154;
|
||||
cvt.s64.s32 %rd23, %r2;
|
||||
mul.wide.s32 %rd13, %r2, 8;
|
||||
cvt.s64.s32 %rd24, %r44;
|
||||
mul.wide.s32 %rd25, %r44, 8;
|
||||
add.u64 %rd26, %rd1, %rd25;
|
||||
mov.f64 %fd43, 0d0000000000000000; // 0
|
||||
$Lt_2_13570:
|
||||
//<loop> Loop body line 246, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 247 0
|
||||
ld.shared.f64 %fd44, [%rd26+0];
|
||||
mad.rn.f64 %fd43, %fd37, %fd43, %fd44;
|
||||
sub.s32 %r45, %r45, %r2;
|
||||
sub.u64 %rd26, %rd26, %rd13;
|
||||
setp.ge.s32 %p9, %r45, %r42;
|
||||
@%p9 bra $Lt_2_13570;
|
||||
bra.uni $Lt_2_13058;
|
||||
$Lt_2_17154:
|
||||
mov.f64 %fd43, 0d0000000000000000; // 0
|
||||
$Lt_2_13058:
|
||||
.loc 17 249 0
|
||||
mov.s32 %r46, %r41;
|
||||
mov.s32 %r47, %r2;
|
||||
mov.s32 %r48, %r46;
|
||||
mul.f64 %fd45, %fd4, %fd43;
|
||||
mov.s64 %rd27, %rd19;
|
||||
cvt.s64.s32 %rd28, %r46;
|
||||
mul.wide.s32 %rd29, %r46, 32;
|
||||
mov.s32 %r49, 0;
|
||||
mov.s32 %r50, %r47;
|
||||
$Lt_2_14594:
|
||||
//<loop> Loop body line 249, nesting depth: 2, estimated iterations: unknown
|
||||
mov.s32 %r51, %r2;
|
||||
mov.s32 %r52, %r48;
|
||||
add.s32 %r53, %r48, %r2;
|
||||
mov.s64 %rd30, %rd18;
|
||||
ld.shared.f64 %fd46, [%rd27+0];
|
||||
add.u64 %rd31, %rd29, %rd22;
|
||||
mul.f64 %fd47, %fd45, %fd46;
|
||||
mov.s32 %r54, %r51;
|
||||
$Lt_2_15362:
|
||||
//<loop> Loop body line 249, nesting depth: 3, estimated iterations: unknown
|
||||
.loc 17 253 0
|
||||
ld.shared.f64 %fd48, [%rd30+0];
|
||||
mul.f64 %fd49, %fd48, %fd47;
|
||||
.loc 17 255 0
|
||||
cvt.ftz.f64.f32 %fd50, %f15;
|
||||
ld.global.v2.f64 {%fd51,%fd52}, [%rd31+0];
|
||||
mul.f64 %fd53, %fd49, %fd51;
|
||||
sub.f64 %fd54, %fd50, %fd53;
|
||||
cvt.rn.ftz.f32.f64 %f15, %fd54;
|
||||
.loc 17 256 0
|
||||
cvt.ftz.f64.f32 %fd55, %f14;
|
||||
mul.f64 %fd56, %fd49, %fd52;
|
||||
sub.f64 %fd57, %fd55, %fd56;
|
||||
cvt.rn.ftz.f32.f64 %f14, %fd57;
|
||||
.loc 17 257 0
|
||||
cvt.ftz.f64.f32 %fd58, %f13;
|
||||
ld.global.f64 %fd59, [%rd31+16];
|
||||
mul.f64 %fd60, %fd49, %fd59;
|
||||
sub.f64 %fd61, %fd58, %fd60;
|
||||
cvt.rn.ftz.f32.f64 %f13, %fd61;
|
||||
add.s32 %r52, %r52, 1;
|
||||
add.u64 %rd31, %rd31, 32;
|
||||
add.u64 %rd30, %rd30, 512;
|
||||
setp.ne.s32 %p10, %r52, %r53;
|
||||
@%p10 bra $Lt_2_15362;
|
||||
add.s32 %r49, %r49, 1;
|
||||
add.s32 %r48, %r48, %r38;
|
||||
add.u64 %rd29, %rd29, %rd21;
|
||||
add.u64 %rd27, %rd27, 512;
|
||||
setp.ne.s32 %p11, %r49, %r2;
|
||||
@%p11 bra $Lt_2_14594;
|
||||
add.s32 %r42, %r42, 1;
|
||||
add.s32 %r41, %r46, %r33;
|
||||
setp.ne.s32 %p12, %r42, %r2;
|
||||
@%p12 bra $Lt_2_12802;
|
||||
bra.uni $Lt_2_9730;
|
||||
$Lt_2_16898:
|
||||
mov.f32 %f13, 0f00000000; // 0
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
mov.f32 %f15, 0f00000000; // 0
|
||||
bra.uni $Lt_2_9730;
|
||||
$Lt_2_9986:
|
||||
mov.f32 %f13, 0f00000000; // 0
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
mov.f32 %f15, 0f00000000; // 0
|
||||
$Lt_2_9730:
|
||||
.loc 17 264 0
|
||||
ld.param.u64 %rd32, [__cudaparm_interp_ans];
|
||||
cvt.s64.s32 %rd33, %r8;
|
||||
mul.wide.s32 %rd34, %r8, 16;
|
||||
add.u64 %rd35, %rd32, %rd34;
|
||||
mov.f32 %f16, %f17;
|
||||
st.global.v4.f32 [%rd35+0], {%f15,%f14,%f13,%f16};
|
||||
$Lt_2_9218:
|
||||
.loc 17 266 0
|
||||
exit;
|
||||
$LDWend_interp:
|
||||
} // interp
|
||||
|
||||
@ -1,837 +0,0 @@
|
||||
const char * pppm_d =
|
||||
" .version 2.3\n"
|
||||
" .target sm_20\n"
|
||||
" .address_size 64\n"
|
||||
" .global .texref pos_tex;\n"
|
||||
" .global .texref q_tex;\n"
|
||||
" .entry particle_map (\n"
|
||||
" .param .u64 __cudaparm_particle_map_x_,\n"
|
||||
" .param .u64 __cudaparm_particle_map_q_,\n"
|
||||
" .param .f64 __cudaparm_particle_map_delvolinv,\n"
|
||||
" .param .s32 __cudaparm_particle_map_nlocal,\n"
|
||||
" .param .u64 __cudaparm_particle_map_counts,\n"
|
||||
" .param .u64 __cudaparm_particle_map_ans,\n"
|
||||
" .param .f64 __cudaparm_particle_map_b_lo_x,\n"
|
||||
" .param .f64 __cudaparm_particle_map_b_lo_y,\n"
|
||||
" .param .f64 __cudaparm_particle_map_b_lo_z,\n"
|
||||
" .param .f64 __cudaparm_particle_map_delxinv,\n"
|
||||
" .param .f64 __cudaparm_particle_map_delyinv,\n"
|
||||
" .param .f64 __cudaparm_particle_map_delzinv,\n"
|
||||
" .param .s32 __cudaparm_particle_map_nlocal_x,\n"
|
||||
" .param .s32 __cudaparm_particle_map_nlocal_y,\n"
|
||||
" .param .s32 __cudaparm_particle_map_nlocal_z,\n"
|
||||
" .param .s32 __cudaparm_particle_map_atom_stride,\n"
|
||||
" .param .s32 __cudaparm_particle_map_max_atoms,\n"
|
||||
" .param .u64 __cudaparm_particle_map_error)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<50>;\n"
|
||||
" .reg .u64 %rd<12>;\n"
|
||||
" .reg .f32 %f<14>;\n"
|
||||
" .reg .f64 %fd<36>;\n"
|
||||
" .reg .pred %p<11>;\n"
|
||||
" .loc 17 50 0\n"
|
||||
"$LDWbegin_particle_map:\n"
|
||||
" cvt.s32.u32 %r1, %ntid.x;\n"
|
||||
" cvt.s32.u32 %r2, %ctaid.x;\n"
|
||||
" mul24.lo.s32 %r3, %r2, %r1;\n"
|
||||
" cvt.s32.u32 %r4, %nctaid.x;\n"
|
||||
" mul24.lo.s32 %r5, %r4, %r1;\n"
|
||||
" mov.u32 %r6, %tid.x;\n"
|
||||
" add.u32 %r7, %r3, %r6;\n"
|
||||
" sub.s32 %r8, %r5, 1;\n"
|
||||
" mul.lo.s32 %r9, %r7, 64;\n"
|
||||
" div.s32 %r10, %r9, %r5;\n"
|
||||
" mul.lo.s32 %r11, %r8, %r10;\n"
|
||||
" sub.s32 %r12, %r9, %r11;\n"
|
||||
" ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];\n"
|
||||
" setp.le.s32 %p1, %r13, %r12;\n"
|
||||
" @%p1 bra $Lt_0_7426;\n"
|
||||
" .loc 17 62 0\n"
|
||||
" mov.u32 %r14, %r12;\n"
|
||||
" mov.s32 %r15, 0;\n"
|
||||
" mov.u32 %r16, %r15;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" mov.u32 %r18, %r17;\n"
|
||||
" mov.s32 %r19, 0;\n"
|
||||
" mov.u32 %r20, %r19;\n"
|
||||
" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}];\n"
|
||||
" mov.f32 %f5, %f1;\n"
|
||||
" mov.f32 %f6, %f2;\n"
|
||||
" mov.f32 %f7, %f3;\n"
|
||||
" .loc 17 64 0\n"
|
||||
" mov.u32 %r21, %r12;\n"
|
||||
" mov.s32 %r22, 0;\n"
|
||||
" mov.u32 %r23, %r22;\n"
|
||||
" mov.s32 %r24, 0;\n"
|
||||
" mov.u32 %r25, %r24;\n"
|
||||
" mov.s32 %r26, 0;\n"
|
||||
" mov.u32 %r27, %r26;\n"
|
||||
" tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}];\n"
|
||||
" mov.f32 %f12, %f8;\n"
|
||||
" cvt.ftz.f64.f32 %fd1, %f12;\n"
|
||||
" ld.param.f64 %fd2, [__cudaparm_particle_map_delvolinv];\n"
|
||||
" mul.f64 %fd3, %fd1, %fd2;\n"
|
||||
" mov.f64 %fd4, 0d0000000000000000; \n"
|
||||
" setp.neu.f64 %p2, %fd3, %fd4;\n"
|
||||
" @!%p2 bra $Lt_0_7426;\n"
|
||||
" .loc 17 67 0\n"
|
||||
" ld.param.f64 %fd5, [__cudaparm_particle_map_delxinv];\n"
|
||||
" cvt.ftz.f64.f32 %fd6, %f5;\n"
|
||||
" ld.param.f64 %fd7, [__cudaparm_particle_map_b_lo_x];\n"
|
||||
" sub.f64 %fd8, %fd6, %fd7;\n"
|
||||
" mul.f64 %fd9, %fd5, %fd8;\n"
|
||||
" mov.f64 %fd10, 0d0000000000000000; \n"
|
||||
" setp.lt.f64 %p3, %fd9, %fd10;\n"
|
||||
" @%p3 bra $Lt_0_8706;\n"
|
||||
" ld.param.f64 %fd11, [__cudaparm_particle_map_delyinv];\n"
|
||||
" cvt.ftz.f64.f32 %fd12, %f6;\n"
|
||||
" ld.param.f64 %fd13, [__cudaparm_particle_map_b_lo_y];\n"
|
||||
" sub.f64 %fd14, %fd12, %fd13;\n"
|
||||
" mul.f64 %fd15, %fd11, %fd14;\n"
|
||||
" mov.f64 %fd16, 0d0000000000000000; \n"
|
||||
" setp.lt.f64 %p4, %fd15, %fd16;\n"
|
||||
" @%p4 bra $Lt_0_8706;\n"
|
||||
" ld.param.f64 %fd17, [__cudaparm_particle_map_delzinv];\n"
|
||||
" cvt.ftz.f64.f32 %fd18, %f7;\n"
|
||||
" ld.param.f64 %fd19, [__cudaparm_particle_map_b_lo_z];\n"
|
||||
" sub.f64 %fd20, %fd18, %fd19;\n"
|
||||
" mul.f64 %fd21, %fd17, %fd20;\n"
|
||||
" mov.f64 %fd22, 0d0000000000000000; \n"
|
||||
" setp.lt.f64 %p5, %fd21, %fd22;\n"
|
||||
" @%p5 bra $Lt_0_8706;\n"
|
||||
" cvt.rzi.s32.f64 %r28, %fd9;\n"
|
||||
" ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];\n"
|
||||
" setp.ge.s32 %p6, %r28, %r29;\n"
|
||||
" @%p6 bra $Lt_0_8706;\n"
|
||||
" cvt.rzi.s32.f64 %r30, %fd15;\n"
|
||||
" ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];\n"
|
||||
" setp.ge.s32 %p7, %r30, %r31;\n"
|
||||
" @%p7 bra $Lt_0_8706;\n"
|
||||
" cvt.rzi.s32.f64 %r32, %fd21;\n"
|
||||
" ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];\n"
|
||||
" setp.gt.s32 %p8, %r33, %r32;\n"
|
||||
" @%p8 bra $L_0_4866;\n"
|
||||
"$Lt_0_8706:\n"
|
||||
"$L_0_5122:\n"
|
||||
" .loc 17 76 0\n"
|
||||
" mov.s32 %r34, 1;\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_particle_map_error];\n"
|
||||
" st.global.s32 [%rd1+0], %r34;\n"
|
||||
" bra.uni $Lt_0_7426;\n"
|
||||
"$L_0_4866:\n"
|
||||
" .loc 17 83 0\n"
|
||||
" mul.lo.s32 %r35, %r32, %r31;\n"
|
||||
" add.s32 %r36, %r30, %r35;\n"
|
||||
" mul.lo.s32 %r37, %r36, %r29;\n"
|
||||
" add.s32 %r38, %r28, %r37;\n"
|
||||
" ld.param.u64 %rd2, [__cudaparm_particle_map_counts];\n"
|
||||
" cvt.s64.s32 %rd3, %r38;\n"
|
||||
" mul.wide.s32 %rd4, %r38, 4;\n"
|
||||
" add.u64 %rd5, %rd2, %rd4;\n"
|
||||
" mov.s32 %r39, 1;\n"
|
||||
" atom.global.add.s32 %r40, [%rd5], %r39;\n"
|
||||
" mov.s32 %r41, %r40;\n"
|
||||
" ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];\n"
|
||||
" setp.gt.s32 %p9, %r42, %r41;\n"
|
||||
" @%p9 bra $Lt_0_7682;\n"
|
||||
" .loc 17 85 0\n"
|
||||
" mov.s32 %r43, 2;\n"
|
||||
" ld.param.u64 %rd6, [__cudaparm_particle_map_error];\n"
|
||||
" st.global.s32 [%rd6+0], %r43;\n"
|
||||
" .loc 16 118 0\n"
|
||||
" mov.s32 %r44, -1;\n"
|
||||
" atom.global.add.s32 %r45, [%rd5], %r44;\n"
|
||||
" bra.uni $Lt_0_7426;\n"
|
||||
"$Lt_0_7682:\n"
|
||||
" .loc 17 88 0\n"
|
||||
" ld.param.u64 %rd7, [__cudaparm_particle_map_ans];\n"
|
||||
" ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];\n"
|
||||
" mul.lo.s32 %r47, %r46, %r41;\n"
|
||||
" add.s32 %r48, %r38, %r47;\n"
|
||||
" cvt.s64.s32 %rd8, %r48;\n"
|
||||
" mul.wide.s32 %rd9, %r48, 32;\n"
|
||||
" add.u64 %rd10, %rd7, %rd9;\n"
|
||||
" cvt.rn.f64.s32 %fd23, %r28;\n"
|
||||
" mov.f64 %fd24, 0d3fe0000000000000; \n"
|
||||
" add.f64 %fd25, %fd23, %fd24;\n"
|
||||
" sub.f64 %fd26, %fd25, %fd9;\n"
|
||||
" cvt.rn.f64.s32 %fd27, %r30;\n"
|
||||
" mov.f64 %fd28, 0d3fe0000000000000; \n"
|
||||
" add.f64 %fd29, %fd27, %fd28;\n"
|
||||
" sub.f64 %fd30, %fd29, %fd15;\n"
|
||||
" st.global.v2.f64 [%rd10+0], {%fd26,%fd30};\n"
|
||||
" cvt.rn.f64.s32 %fd31, %r32;\n"
|
||||
" mov.f64 %fd32, 0d3fe0000000000000; \n"
|
||||
" add.f64 %fd33, %fd31, %fd32;\n"
|
||||
" sub.f64 %fd34, %fd33, %fd21;\n"
|
||||
" st.global.v2.f64 [%rd10+16], {%fd34,%fd3};\n"
|
||||
"$Lt_0_7426:\n"
|
||||
"$L_0_4610:\n"
|
||||
"$Lt_0_6914:\n"
|
||||
"$Lt_0_6402:\n"
|
||||
" .loc 17 92 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_particle_map:\n"
|
||||
" }\n"
|
||||
" .entry make_rho (\n"
|
||||
" .param .u64 __cudaparm_make_rho_counts,\n"
|
||||
" .param .u64 __cudaparm_make_rho_atoms,\n"
|
||||
" .param .u64 __cudaparm_make_rho_brick,\n"
|
||||
" .param .u64 __cudaparm_make_rho__rho_coeff,\n"
|
||||
" .param .s32 __cudaparm_make_rho_atom_stride,\n"
|
||||
" .param .s32 __cudaparm_make_rho_npts_x,\n"
|
||||
" .param .s32 __cudaparm_make_rho_npts_y,\n"
|
||||
" .param .s32 __cudaparm_make_rho_npts_z,\n"
|
||||
" .param .s32 __cudaparm_make_rho_nlocal_x,\n"
|
||||
" .param .s32 __cudaparm_make_rho_nlocal_y,\n"
|
||||
" .param .s32 __cudaparm_make_rho_nlocal_z,\n"
|
||||
" .param .s32 __cudaparm_make_rho_order_m_1,\n"
|
||||
" .param .s32 __cudaparm_make_rho_order,\n"
|
||||
" .param .s32 __cudaparm_make_rho_order2)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<119>;\n"
|
||||
" .reg .u64 %rd<57>;\n"
|
||||
" .reg .f64 %fd<26>;\n"
|
||||
" .reg .pred %p<27>;\n"
|
||||
" .shared .align 8 .b8 __cuda___cuda_local_var_32578_34_non_const_rho_coeff200[512];\n"
|
||||
" .shared .align 8 .b8 __cuda___cuda_local_var_32579_34_non_const_front712[640];\n"
|
||||
" .shared .align 8 .b8 __cuda___cuda_local_var_32580_34_non_const_ans1352[4096];\n"
|
||||
" .loc 17 101 0\n"
|
||||
"$LDWbegin_make_rho:\n"
|
||||
" ld.param.s32 %r1, [__cudaparm_make_rho_order2];\n"
|
||||
" ld.param.s32 %r2, [__cudaparm_make_rho_order];\n"
|
||||
" add.s32 %r3, %r1, %r2;\n"
|
||||
" cvt.s32.u32 %r4, %tid.x;\n"
|
||||
" setp.le.s32 %p1, %r3, %r4;\n"
|
||||
" @%p1 bra $Lt_1_16898;\n"
|
||||
" .loc 17 108 0\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32578_34_non_const_rho_coeff200;\n"
|
||||
" cvt.s64.s32 %rd2, %r4;\n"
|
||||
" mul.wide.s32 %rd3, %r4, 8;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f64 %fd1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd1;\n"
|
||||
" st.shared.f64 [%rd6+0], %fd1;\n"
|
||||
"$Lt_1_16898:\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32578_34_non_const_rho_coeff200;\n"
|
||||
" shr.s32 %r5, %r4, 31;\n"
|
||||
" mov.s32 %r6, 31;\n"
|
||||
" and.b32 %r7, %r5, %r6;\n"
|
||||
" add.s32 %r8, %r7, %r4;\n"
|
||||
" shr.s32 %r9, %r8, 5;\n"
|
||||
" mul.lo.s32 %r10, %r9, 32;\n"
|
||||
" sub.s32 %r11, %r4, %r10;\n"
|
||||
" setp.lt.s32 %p2, %r11, %r2;\n"
|
||||
" @!%p2 bra $Lt_1_17410;\n"
|
||||
" .loc 17 114 0\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32579_34_non_const_front712;\n"
|
||||
" mov.f64 %fd2, 0d0000000000000000; \n"
|
||||
" cvt.s64.s32 %rd8, %r11;\n"
|
||||
" shr.s32 %r12, %r4, 31;\n"
|
||||
" mov.s32 %r13, 31;\n"
|
||||
" and.b32 %r14, %r12, %r13;\n"
|
||||
" add.s32 %r15, %r14, %r4;\n"
|
||||
" shr.s32 %r16, %r15, 5;\n"
|
||||
" cvt.s64.s32 %rd9, %r16;\n"
|
||||
" mul.wide.s32 %rd10, %r16, 40;\n"
|
||||
" add.u64 %rd11, %rd8, %rd10;\n"
|
||||
" mul.lo.u64 %rd12, %rd11, 8;\n"
|
||||
" add.u64 %rd13, %rd7, %rd12;\n"
|
||||
" st.shared.f64 [%rd13+256], %fd2;\n"
|
||||
"$Lt_1_17410:\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32579_34_non_const_front712;\n"
|
||||
" .loc 17 116 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];\n"
|
||||
" shr.s32 %r18, %r17, 31;\n"
|
||||
" mov.s32 %r19, 31;\n"
|
||||
" and.b32 %r20, %r18, %r19;\n"
|
||||
" add.s32 %r21, %r20, %r17;\n"
|
||||
" shr.s32 %r22, %r21, 5;\n"
|
||||
" add.s32 %r23, %r22, 1;\n"
|
||||
" mov.u32 %r24, 0;\n"
|
||||
" setp.le.s32 %p3, %r23, %r24;\n"
|
||||
" @%p3 bra $Lt_1_17922;\n"
|
||||
" shr.s32 %r25, %r4, 31;\n"
|
||||
" mov.s32 %r26, 31;\n"
|
||||
" and.b32 %r27, %r25, %r26;\n"
|
||||
" add.s32 %r28, %r27, %r4;\n"
|
||||
" shr.s32 %r29, %r28, 5;\n"
|
||||
" add.s32 %r30, %r11, 32;\n"
|
||||
" ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];\n"
|
||||
" ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];\n"
|
||||
" mul.lo.s32 %r33, %r31, %r32;\n"
|
||||
" mov.u32 %r34, %ctaid.x;\n"
|
||||
" mul.lo.u32 %r35, %r34, 2;\n"
|
||||
" add.u32 %r36, %r29, %r35;\n"
|
||||
" ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];\n"
|
||||
" div.s32 %r38, %r36, %r37;\n"
|
||||
" ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];\n"
|
||||
" setp.lt.s32 %p4, %r38, %r39;\n"
|
||||
" sub.s32 %r40, %r39, %r38;\n"
|
||||
" mov.s32 %r41, 0;\n"
|
||||
" selp.s32 %r42, %r40, %r41, %p4;\n"
|
||||
" ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];\n"
|
||||
" setp.ge.s32 %p5, %r38, %r43;\n"
|
||||
" sub.s32 %r44, %r43, %r38;\n"
|
||||
" add.s32 %r45, %r44, %r2;\n"
|
||||
" sub.s32 %r46, %r45, 1;\n"
|
||||
" selp.s32 %r47, %r46, %r2, %p5;\n"
|
||||
" rem.s32 %r48, %r36, %r37;\n"
|
||||
" setp.lt.s32 %p6, %r48, %r39;\n"
|
||||
" sub.s32 %r49, %r39, %r48;\n"
|
||||
" mov.s32 %r50, 0;\n"
|
||||
" selp.s32 %r51, %r49, %r50, %p6;\n"
|
||||
" setp.ge.s32 %p7, %r48, %r31;\n"
|
||||
" sub.s32 %r52, %r31, %r48;\n"
|
||||
" add.s32 %r53, %r52, %r2;\n"
|
||||
" sub.s32 %r54, %r53, 1;\n"
|
||||
" selp.s32 %r55, %r54, %r2, %p7;\n"
|
||||
" mov.s32 %r56, %r23;\n"
|
||||
" mov.s32 %r57, 0;\n"
|
||||
" setp.gt.s32 %p8, %r2, %r57;\n"
|
||||
" mov.s32 %r58, 0;\n"
|
||||
" cvt.s64.s32 %rd14, %r11;\n"
|
||||
" cvt.s64.s32 %rd15, %r29;\n"
|
||||
" mul.lo.s32 %r59, %r23, 32;\n"
|
||||
" mul.wide.s32 %rd16, %r29, 40;\n"
|
||||
" add.u64 %rd17, %rd14, %rd16;\n"
|
||||
" ld.param.s32 %r60, [__cudaparm_make_rho_npts_z];\n"
|
||||
" setp.gt.s32 %p9, %r60, %r38;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 8;\n"
|
||||
" selp.s32 %r61, 1, 0, %p9;\n"
|
||||
" add.u64 %rd19, %rd18, %rd7;\n"
|
||||
" mov.u64 %rd20, __cuda___cuda_local_var_32580_34_non_const_ans1352;\n"
|
||||
" mov.s32 %r62, %r56;\n"
|
||||
"$Lt_1_18434:\n"
|
||||
" @!%p8 bra $Lt_1_18690;\n"
|
||||
" mov.s32 %r63, %r2;\n"
|
||||
" cvt.s64.s32 %rd21, %r4;\n"
|
||||
" mul.wide.s32 %rd22, %r4, 8;\n"
|
||||
" add.u64 %rd23, %rd20, %rd22;\n"
|
||||
" mov.s32 %r64, 0;\n"
|
||||
" mov.s32 %r65, %r63;\n"
|
||||
"$Lt_1_19202:\n"
|
||||
" .loc 17 140 0\n"
|
||||
" mov.f64 %fd3, 0d0000000000000000; \n"
|
||||
" st.shared.f64 [%rd23+0], %fd3;\n"
|
||||
" add.s32 %r64, %r64, 1;\n"
|
||||
" add.u64 %rd23, %rd23, 512;\n"
|
||||
" setp.ne.s32 %p10, %r64, %r2;\n"
|
||||
" @%p10 bra $Lt_1_19202;\n"
|
||||
"$Lt_1_18690:\n"
|
||||
" add.s32 %r66, %r11, %r58;\n"
|
||||
" set.lt.u32.s32 %r67, %r66, %r32;\n"
|
||||
" neg.s32 %r68, %r67;\n"
|
||||
" and.b32 %r69, %r61, %r68;\n"
|
||||
" mov.u32 %r70, 0;\n"
|
||||
" setp.eq.s32 %p11, %r69, %r70;\n"
|
||||
" @%p11 bra $Lt_1_20226;\n"
|
||||
" .loc 17 143 0\n"
|
||||
" mov.s32 %r71, %r42;\n"
|
||||
" setp.ge.s32 %p12, %r42, %r47;\n"
|
||||
" @%p12 bra $Lt_1_20226;\n"
|
||||
" sub.s32 %r72, %r47, %r42;\n"
|
||||
" setp.lt.s32 %p13, %r51, %r55;\n"
|
||||
" mov.s32 %r73, %r72;\n"
|
||||
"$Lt_1_20738:\n"
|
||||
" .loc 17 145 0\n"
|
||||
" mov.s32 %r74, %r51;\n"
|
||||
" @!%p13 bra $Lt_1_20994;\n"
|
||||
" sub.s32 %r75, %r55, %r51;\n"
|
||||
" sub.s32 %r76, %r71, %r42;\n"
|
||||
" add.s32 %r77, %r38, %r42;\n"
|
||||
" add.s32 %r78, %r48, %r51;\n"
|
||||
" sub.s32 %r79, %r77, %r39;\n"
|
||||
" sub.s32 %r80, %r78, %r39;\n"
|
||||
" add.s32 %r81, %r76, %r79;\n"
|
||||
" mul.lo.s32 %r82, %r33, %r81;\n"
|
||||
" ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];\n"
|
||||
" ld.param.u64 %rd24, [__cudaparm_make_rho_counts];\n"
|
||||
" mov.s32 %r84, %r75;\n"
|
||||
"$Lt_1_21506:\n"
|
||||
" .loc 17 147 0\n"
|
||||
" sub.s32 %r85, %r74, %r51;\n"
|
||||
" add.s32 %r86, %r85, %r80;\n"
|
||||
" mul.lo.s32 %r87, %r86, %r32;\n"
|
||||
" add.s32 %r88, %r82, %r87;\n"
|
||||
" add.s32 %r89, %r66, %r88;\n"
|
||||
" cvt.s64.s32 %rd25, %r89;\n"
|
||||
" mul.wide.s32 %rd26, %r89, 4;\n"
|
||||
" add.u64 %rd27, %rd24, %rd26;\n"
|
||||
" ld.global.s32 %r90, [%rd27+0];\n"
|
||||
" mul.lo.s32 %r91, %r90, %r83;\n"
|
||||
" .loc 17 148 0\n"
|
||||
" mov.s32 %r92, %r89;\n"
|
||||
" setp.ge.s32 %p14, %r89, %r91;\n"
|
||||
" @%p14 bra $Lt_1_21762;\n"
|
||||
" sub.s32 %r93, %r3, 1;\n"
|
||||
" cvt.s64.s32 %rd28, %r83;\n"
|
||||
" mul.wide.s32 %rd29, %r83, 32;\n"
|
||||
" mov.s32 %r94, -1;\n"
|
||||
" setp.gt.s32 %p15, %r93, %r94;\n"
|
||||
" ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];\n"
|
||||
" mul.lo.u64 %rd31, %rd25, 32;\n"
|
||||
" add.u64 %rd32, %rd30, %rd31;\n"
|
||||
"$Lt_1_22274:\n"
|
||||
" .loc 17 149 0\n"
|
||||
" ld.global.f64 %fd4, [%rd32+0];\n"
|
||||
" @!%p15 bra $Lt_1_29954;\n"
|
||||
" sub.s32 %r95, %r93, %r74;\n"
|
||||
" mov.s32 %r96, -1;\n"
|
||||
" sub.s32 %r97, %r96, %r74;\n"
|
||||
" cvt.s64.s32 %rd33, %r2;\n"
|
||||
" mul.wide.s32 %rd34, %r2, 8;\n"
|
||||
" ld.global.f64 %fd5, [%rd32+8];\n"
|
||||
" ld.global.f64 %fd6, [%rd32+16];\n"
|
||||
" cvt.s64.s32 %rd35, %r95;\n"
|
||||
" mul.wide.s32 %rd36, %r95, 8;\n"
|
||||
" add.u64 %rd37, %rd1, %rd36;\n"
|
||||
" sub.s32 %r98, %r93, %r71;\n"
|
||||
" cvt.s64.s32 %rd38, %r98;\n"
|
||||
" mul.wide.s32 %rd39, %r98, 8;\n"
|
||||
" add.u64 %rd40, %rd1, %rd39;\n"
|
||||
" mov.f64 %fd7, 0d0000000000000000; \n"
|
||||
" mov.f64 %fd8, 0d0000000000000000; \n"
|
||||
"$Lt_1_23042:\n"
|
||||
" .loc 17 154 0\n"
|
||||
" ld.shared.f64 %fd9, [%rd37+0];\n"
|
||||
" mad.rn.f64 %fd8, %fd8, %fd5, %fd9;\n"
|
||||
" .loc 17 155 0\n"
|
||||
" ld.shared.f64 %fd10, [%rd40+0];\n"
|
||||
" mad.rn.f64 %fd7, %fd7, %fd6, %fd10;\n"
|
||||
" sub.u64 %rd40, %rd40, %rd34;\n"
|
||||
" sub.s32 %r95, %r95, %r2;\n"
|
||||
" sub.u64 %rd37, %rd37, %rd34;\n"
|
||||
" setp.gt.s32 %p16, %r95, %r97;\n"
|
||||
" @%p16 bra $Lt_1_23042;\n"
|
||||
" bra.uni $Lt_1_22530;\n"
|
||||
"$Lt_1_29954:\n"
|
||||
" mov.f64 %fd7, 0d0000000000000000; \n"
|
||||
" mov.f64 %fd8, 0d0000000000000000; \n"
|
||||
"$Lt_1_22530:\n"
|
||||
" .loc 17 157 0\n"
|
||||
" ld.global.f64 %fd11, [%rd32+24];\n"
|
||||
" mul.f64 %fd12, %fd7, %fd8;\n"
|
||||
" mul.f64 %fd13, %fd11, %fd12;\n"
|
||||
" @!%p8 bra $Lt_1_23554;\n"
|
||||
" mov.s32 %r99, %r2;\n"
|
||||
" cvt.s64.s32 %rd41, %r4;\n"
|
||||
" mul.wide.s32 %rd42, %r4, 8;\n"
|
||||
" add.u64 %rd43, %rd20, %rd42;\n"
|
||||
" mov.s32 %r100, 0;\n"
|
||||
" mov.s32 %r101, %r99;\n"
|
||||
"$Lt_1_24066:\n"
|
||||
" .loc 17 161 0\n"
|
||||
" add.s32 %r102, %r100, %r1;\n"
|
||||
" mov.s32 %r103, %r102;\n"
|
||||
" setp.lt.s32 %p17, %r102, %r100;\n"
|
||||
" @%p17 bra $Lt_1_30466;\n"
|
||||
" cvt.s64.s32 %rd44, %r2;\n"
|
||||
" mul.wide.s32 %rd34, %r2, 8;\n"
|
||||
" cvt.s64.s32 %rd45, %r102;\n"
|
||||
" mul.wide.s32 %rd46, %r102, 8;\n"
|
||||
" add.u64 %rd47, %rd1, %rd46;\n"
|
||||
" mov.f64 %fd14, 0d0000000000000000; \n"
|
||||
"$Lt_1_24834:\n"
|
||||
" .loc 17 162 0\n"
|
||||
" ld.shared.f64 %fd15, [%rd47+0];\n"
|
||||
" mad.rn.f64 %fd14, %fd4, %fd14, %fd15;\n"
|
||||
" sub.s32 %r103, %r103, %r2;\n"
|
||||
" sub.u64 %rd47, %rd47, %rd34;\n"
|
||||
" setp.ge.s32 %p18, %r103, %r100;\n"
|
||||
" @%p18 bra $Lt_1_24834;\n"
|
||||
" bra.uni $Lt_1_24322;\n"
|
||||
"$Lt_1_30466:\n"
|
||||
" mov.f64 %fd14, 0d0000000000000000; \n"
|
||||
"$Lt_1_24322:\n"
|
||||
" .loc 17 163 0\n"
|
||||
" ld.shared.f64 %fd16, [%rd43+0];\n"
|
||||
" mad.rn.f64 %fd17, %fd14, %fd13, %fd16;\n"
|
||||
" st.shared.f64 [%rd43+0], %fd17;\n"
|
||||
" add.s32 %r100, %r100, 1;\n"
|
||||
" add.u64 %rd43, %rd43, 512;\n"
|
||||
" setp.ne.s32 %p19, %r100, %r2;\n"
|
||||
" @%p19 bra $Lt_1_24066;\n"
|
||||
"$Lt_1_23554:\n"
|
||||
" add.s32 %r92, %r92, %r83;\n"
|
||||
" add.u64 %rd32, %rd29, %rd32;\n"
|
||||
" setp.gt.s32 %p20, %r91, %r92;\n"
|
||||
" @%p20 bra $Lt_1_22274;\n"
|
||||
"$Lt_1_21762:\n"
|
||||
" add.s32 %r74, %r74, 1;\n"
|
||||
" setp.ne.s32 %p21, %r55, %r74;\n"
|
||||
" @%p21 bra $Lt_1_21506;\n"
|
||||
"$Lt_1_20994:\n"
|
||||
" add.s32 %r71, %r71, 1;\n"
|
||||
" setp.ne.s32 %p22, %r47, %r71;\n"
|
||||
" @%p22 bra $Lt_1_20738;\n"
|
||||
"$Lt_1_20226:\n"
|
||||
"$Lt_1_19714:\n"
|
||||
" .loc 17 172 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" @!%p2 bra $Lt_1_26626;\n"
|
||||
" .loc 17 174 0\n"
|
||||
" ld.shared.f64 %fd18, [%rd19+256];\n"
|
||||
" st.shared.f64 [%rd19+0], %fd18;\n"
|
||||
" .loc 17 175 0\n"
|
||||
" mov.f64 %fd19, 0d0000000000000000; \n"
|
||||
" st.shared.f64 [%rd19+256], %fd19;\n"
|
||||
" bra.uni $Lt_1_26370;\n"
|
||||
"$Lt_1_26626:\n"
|
||||
" .loc 17 177 0\n"
|
||||
" mov.f64 %fd20, 0d0000000000000000; \n"
|
||||
" st.shared.f64 [%rd19+0], %fd20;\n"
|
||||
"$Lt_1_26370:\n"
|
||||
" @!%p8 bra $Lt_1_26882;\n"
|
||||
" mov.s32 %r104, %r2;\n"
|
||||
" cvt.s64.s32 %rd48, %r4;\n"
|
||||
" mov.s32 %r105, %r11;\n"
|
||||
" add.s32 %r106, %r11, %r2;\n"
|
||||
" mul.wide.s32 %rd49, %r4, 8;\n"
|
||||
" add.u64 %rd50, %rd20, %rd49;\n"
|
||||
" mov.s64 %rd51, %rd19;\n"
|
||||
" mov.s32 %r107, %r104;\n"
|
||||
"$Lt_1_27394:\n"
|
||||
" .loc 17 180 0\n"
|
||||
" ld.shared.f64 %fd21, [%rd50+0];\n"
|
||||
" ld.shared.f64 %fd22, [%rd51+0];\n"
|
||||
" add.f64 %fd23, %fd21, %fd22;\n"
|
||||
" st.shared.f64 [%rd51+0], %fd23;\n"
|
||||
" .loc 17 181 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" add.s32 %r105, %r105, 1;\n"
|
||||
" add.u64 %rd51, %rd51, 8;\n"
|
||||
" add.u64 %rd50, %rd50, 512;\n"
|
||||
" setp.ne.s32 %p23, %r105, %r106;\n"
|
||||
" @%p23 bra $Lt_1_27394;\n"
|
||||
"$Lt_1_26882:\n"
|
||||
" set.lt.u32.s32 %r108, %r66, %r17;\n"
|
||||
" neg.s32 %r109, %r108;\n"
|
||||
" and.b32 %r110, %r61, %r109;\n"
|
||||
" mov.u32 %r111, 0;\n"
|
||||
" setp.eq.s32 %p24, %r110, %r111;\n"
|
||||
" @%p24 bra $Lt_1_27906;\n"
|
||||
" .loc 17 185 0\n"
|
||||
" ld.shared.f64 %fd24, [%rd19+0];\n"
|
||||
" ld.param.u64 %rd52, [__cudaparm_make_rho_brick];\n"
|
||||
" add.s32 %r112, %r11, %r58;\n"
|
||||
" mul.lo.s32 %r113, %r37, %r17;\n"
|
||||
" mul.lo.s32 %r114, %r38, %r113;\n"
|
||||
" mul.lo.s32 %r115, %r48, %r17;\n"
|
||||
" add.s32 %r116, %r114, %r115;\n"
|
||||
" add.s32 %r117, %r112, %r116;\n"
|
||||
" cvt.s64.s32 %rd53, %r117;\n"
|
||||
" mul.wide.s32 %rd54, %r117, 8;\n"
|
||||
" add.u64 %rd55, %rd52, %rd54;\n"
|
||||
" st.global.f64 [%rd55+0], %fd24;\n"
|
||||
"$Lt_1_27906:\n"
|
||||
" add.s32 %r58, %r58, 32;\n"
|
||||
" setp.ne.s32 %p25, %r58, %r59;\n"
|
||||
" @%p25 bra $Lt_1_18434;\n"
|
||||
"$Lt_1_17922:\n"
|
||||
" .loc 17 189 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_make_rho:\n"
|
||||
" }\n"
|
||||
" .entry interp (\n"
|
||||
" .param .u64 __cudaparm_interp_x_,\n"
|
||||
" .param .u64 __cudaparm_interp_q_,\n"
|
||||
" .param .s32 __cudaparm_interp_nlocal,\n"
|
||||
" .param .u64 __cudaparm_interp_brick,\n"
|
||||
" .param .u64 __cudaparm_interp__rho_coeff,\n"
|
||||
" .param .s32 __cudaparm_interp_npts_x,\n"
|
||||
" .param .s32 __cudaparm_interp_npts_yx,\n"
|
||||
" .param .f64 __cudaparm_interp_b_lo_x,\n"
|
||||
" .param .f64 __cudaparm_interp_b_lo_y,\n"
|
||||
" .param .f64 __cudaparm_interp_b_lo_z,\n"
|
||||
" .param .f64 __cudaparm_interp_delxinv,\n"
|
||||
" .param .f64 __cudaparm_interp_delyinv,\n"
|
||||
" .param .f64 __cudaparm_interp_delzinv,\n"
|
||||
" .param .s32 __cudaparm_interp_order,\n"
|
||||
" .param .s32 __cudaparm_interp_order2,\n"
|
||||
" .param .f64 __cudaparm_interp_qqrd2e_scale,\n"
|
||||
" .param .u64 __cudaparm_interp_ans)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<56>;\n"
|
||||
" .reg .u64 %rd<37>;\n"
|
||||
" .reg .f32 %f<19>;\n"
|
||||
" .reg .f64 %fd<63>;\n"
|
||||
" .reg .pred %p<14>;\n"
|
||||
" .shared .align 8 .b8 __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568[512];\n"
|
||||
" .shared .align 8 .b8 __cuda___cuda_local_var_32677_34_non_const_rho1d_06080[4096];\n"
|
||||
" .shared .align 8 .b8 __cuda___cuda_local_var_32678_34_non_const_rho1d_110176[4096];\n"
|
||||
" .loc 17 199 0\n"
|
||||
"$LDWbegin_interp:\n"
|
||||
" ld.param.s32 %r1, [__cudaparm_interp_order2];\n"
|
||||
" ld.param.s32 %r2, [__cudaparm_interp_order];\n"
|
||||
" add.s32 %r3, %r1, %r2;\n"
|
||||
" cvt.s32.u32 %r4, %tid.x;\n"
|
||||
" setp.le.s32 %p1, %r3, %r4;\n"
|
||||
" @%p1 bra $Lt_2_8706;\n"
|
||||
" .loc 17 206 0\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568;\n"
|
||||
" cvt.s64.s32 %rd2, %r4;\n"
|
||||
" mul.wide.s32 %rd3, %r4, 8;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f64 %fd1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd1;\n"
|
||||
" st.shared.f64 [%rd6+0], %fd1;\n"
|
||||
"$Lt_2_8706:\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32676_34_non_const_rho_coeff5568;\n"
|
||||
" .loc 17 207 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" mov.u32 %r5, %ctaid.x;\n"
|
||||
" mov.u32 %r6, %ntid.x;\n"
|
||||
" mul.lo.u32 %r7, %r5, %r6;\n"
|
||||
" add.u32 %r8, %r4, %r7;\n"
|
||||
" ld.param.s32 %r9, [__cudaparm_interp_nlocal];\n"
|
||||
" setp.le.s32 %p2, %r9, %r8;\n"
|
||||
" @%p2 bra $Lt_2_9218;\n"
|
||||
" .loc 17 215 0\n"
|
||||
" mov.u32 %r10, %r8;\n"
|
||||
" mov.s32 %r11, 0;\n"
|
||||
" mov.u32 %r12, %r11;\n"
|
||||
" mov.s32 %r13, 0;\n"
|
||||
" mov.u32 %r14, %r13;\n"
|
||||
" mov.s32 %r15, 0;\n"
|
||||
" mov.u32 %r16, %r15;\n"
|
||||
" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r10,%r12,%r14,%r16}];\n"
|
||||
" mov.f32 %f5, %f1;\n"
|
||||
" mov.f32 %f6, %f2;\n"
|
||||
" mov.f32 %f7, %f3;\n"
|
||||
" .loc 17 216 0\n"
|
||||
" mov.u32 %r17, %r8;\n"
|
||||
" mov.s32 %r18, 0;\n"
|
||||
" mov.u32 %r19, %r18;\n"
|
||||
" mov.s32 %r20, 0;\n"
|
||||
" mov.u32 %r21, %r20;\n"
|
||||
" mov.s32 %r22, 0;\n"
|
||||
" mov.u32 %r23, %r22;\n"
|
||||
" tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r17,%r19,%r21,%r23}];\n"
|
||||
" mov.f32 %f12, %f8;\n"
|
||||
" cvt.ftz.f64.f32 %fd2, %f12;\n"
|
||||
" ld.param.f64 %fd3, [__cudaparm_interp_qqrd2e_scale];\n"
|
||||
" mul.f64 %fd4, %fd2, %fd3;\n"
|
||||
" mov.f64 %fd5, 0d0000000000000000; \n"
|
||||
" setp.neu.f64 %p3, %fd4, %fd5;\n"
|
||||
" @!%p3 bra $Lt_2_9986;\n"
|
||||
" mov.s32 %r24, 0;\n"
|
||||
" setp.gt.s32 %p4, %r2, %r24;\n"
|
||||
" ld.param.f64 %fd6, [__cudaparm_interp_delxinv];\n"
|
||||
" cvt.ftz.f64.f32 %fd7, %f5;\n"
|
||||
" ld.param.f64 %fd8, [__cudaparm_interp_b_lo_x];\n"
|
||||
" sub.f64 %fd9, %fd7, %fd8;\n"
|
||||
" mul.f64 %fd10, %fd6, %fd9;\n"
|
||||
" @!%p4 bra $Lt_2_16386;\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32677_34_non_const_rho1d_06080;\n"
|
||||
" mov.u64 %rd8, __cuda___cuda_local_var_32678_34_non_const_rho1d_110176;\n"
|
||||
" cvt.rzi.s32.f64 %r25, %fd10;\n"
|
||||
" cvt.rn.f64.s32 %fd11, %r25;\n"
|
||||
" mov.f64 %fd12, 0d3fe0000000000000; \n"
|
||||
" add.f64 %fd13, %fd11, %fd12;\n"
|
||||
" sub.f64 %fd14, %fd13, %fd10;\n"
|
||||
" ld.param.f64 %fd15, [__cudaparm_interp_delyinv];\n"
|
||||
" cvt.ftz.f64.f32 %fd16, %f6;\n"
|
||||
" ld.param.f64 %fd17, [__cudaparm_interp_b_lo_y];\n"
|
||||
" sub.f64 %fd18, %fd16, %fd17;\n"
|
||||
" mul.f64 %fd19, %fd15, %fd18;\n"
|
||||
" cvt.rzi.s32.f64 %r26, %fd19;\n"
|
||||
" cvt.rn.f64.s32 %fd20, %r26;\n"
|
||||
" mov.f64 %fd21, 0d3fe0000000000000; \n"
|
||||
" add.f64 %fd22, %fd20, %fd21;\n"
|
||||
" sub.f64 %fd23, %fd22, %fd19;\n"
|
||||
" mov.s32 %r27, %r2;\n"
|
||||
" cvt.s64.s32 %rd9, %r4;\n"
|
||||
" mov.s32 %r28, %r1;\n"
|
||||
" mul.wide.s32 %rd3, %r4, 8;\n"
|
||||
" add.u64 %rd10, %rd3, %rd7;\n"
|
||||
" add.u64 %rd11, %rd3, %rd8;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.s32 %r30, %r27;\n"
|
||||
"$Lt_2_10754:\n"
|
||||
" .loc 17 235 0\n"
|
||||
" mov.f64 %fd24, 0d0000000000000000; \n"
|
||||
" mov.f64 %fd25, 0d0000000000000000; \n"
|
||||
" st.shared.f64 [%rd10+0], %fd25;\n"
|
||||
" .loc 17 236 0\n"
|
||||
" mov.f64 %fd26, 0d0000000000000000; \n"
|
||||
" mov.f64 %fd27, 0d0000000000000000; \n"
|
||||
" st.shared.f64 [%rd11+0], %fd27;\n"
|
||||
" .loc 17 237 0\n"
|
||||
" mov.s32 %r31, %r28;\n"
|
||||
" setp.lt.s32 %p5, %r28, %r29;\n"
|
||||
" @%p5 bra $Lt_2_11010;\n"
|
||||
" cvt.s64.s32 %rd12, %r2;\n"
|
||||
" mul.wide.s32 %rd13, %r2, 8;\n"
|
||||
" cvt.s64.s32 %rd14, %r28;\n"
|
||||
" mul.wide.s32 %rd15, %r28, 8;\n"
|
||||
" add.u64 %rd16, %rd1, %rd15;\n"
|
||||
"$Lt_2_11522:\n"
|
||||
" .loc 17 238 0\n"
|
||||
" ld.shared.f64 %fd28, [%rd16+0];\n"
|
||||
" mad.rn.f64 %fd24, %fd24, %fd14, %fd28;\n"
|
||||
" st.shared.f64 [%rd10+0], %fd24;\n"
|
||||
" .loc 17 239 0\n"
|
||||
" mad.rn.f64 %fd26, %fd26, %fd23, %fd28;\n"
|
||||
" st.shared.f64 [%rd11+0], %fd26;\n"
|
||||
" sub.s32 %r31, %r31, %r2;\n"
|
||||
" sub.u64 %rd16, %rd16, %rd13;\n"
|
||||
" setp.ge.s32 %p6, %r31, %r29;\n"
|
||||
" @%p6 bra $Lt_2_11522;\n"
|
||||
"$Lt_2_11010:\n"
|
||||
" add.s32 %r29, %r29, 1;\n"
|
||||
" add.s32 %r28, %r28, 1;\n"
|
||||
" add.u64 %rd11, %rd11, 512;\n"
|
||||
" add.u64 %rd10, %rd10, 512;\n"
|
||||
" setp.ne.s32 %p7, %r28, %r3;\n"
|
||||
" @%p7 bra $Lt_2_10754;\n"
|
||||
" bra.uni $Lt_2_10242;\n"
|
||||
"$Lt_2_16386:\n"
|
||||
" cvt.rzi.s32.f64 %r25, %fd10;\n"
|
||||
" mov.u64 %rd8, __cuda___cuda_local_var_32678_34_non_const_rho1d_110176;\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32677_34_non_const_rho1d_06080;\n"
|
||||
"$Lt_2_10242:\n"
|
||||
" .loc 17 243 0\n"
|
||||
" ld.param.f64 %fd29, [__cudaparm_interp_delzinv];\n"
|
||||
" cvt.ftz.f64.f32 %fd30, %f7;\n"
|
||||
" ld.param.f64 %fd31, [__cudaparm_interp_b_lo_z];\n"
|
||||
" sub.f64 %fd32, %fd30, %fd31;\n"
|
||||
" mul.f64 %fd33, %fd29, %fd32;\n"
|
||||
" cvt.rzi.s32.f64 %r32, %fd33;\n"
|
||||
" ld.param.s32 %r33, [__cudaparm_interp_npts_yx];\n"
|
||||
" mul.lo.s32 %r34, %r32, %r33;\n"
|
||||
" add.s32 %r35, %r25, %r34;\n"
|
||||
" @!%p4 bra $Lt_2_16898;\n"
|
||||
" cvt.rn.f64.s32 %fd34, %r32;\n"
|
||||
" mov.f64 %fd35, 0d3fe0000000000000; \n"
|
||||
" add.f64 %fd36, %fd34, %fd35;\n"
|
||||
" sub.f64 %fd37, %fd36, %fd33;\n"
|
||||
" mov.s32 %r36, %r2;\n"
|
||||
" cvt.ftz.f64.f32 %fd38, %f6;\n"
|
||||
" cvt.s64.s32 %rd17, %r4;\n"
|
||||
" ld.param.f64 %fd39, [__cudaparm_interp_delyinv];\n"
|
||||
" ld.param.f64 %fd40, [__cudaparm_interp_b_lo_y];\n"
|
||||
" sub.f64 %fd41, %fd38, %fd40;\n"
|
||||
" mul.f64 %fd42, %fd39, %fd41;\n"
|
||||
" cvt.rzi.s32.f64 %r37, %fd42;\n"
|
||||
" mul.wide.s32 %rd3, %r4, 8;\n"
|
||||
" ld.param.s32 %r38, [__cudaparm_interp_npts_x];\n"
|
||||
" mul.lo.s32 %r39, %r37, %r38;\n"
|
||||
" add.u64 %rd18, %rd3, %rd7;\n"
|
||||
" add.u64 %rd19, %rd3, %rd8;\n"
|
||||
" cvt.s64.s32 %rd20, %r38;\n"
|
||||
" mul.wide.s32 %rd21, %r38, 32;\n"
|
||||
" add.s32 %r40, %r39, %r35;\n"
|
||||
" mov.s32 %r41, %r40;\n"
|
||||
" ld.param.u64 %rd22, [__cudaparm_interp_brick];\n"
|
||||
" mov.s32 %r42, 0;\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, 0f00000000; \n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" mov.s32 %r43, %r36;\n"
|
||||
"$Lt_2_12802:\n"
|
||||
" .loc 17 246 0\n"
|
||||
" add.s32 %r44, %r42, %r1;\n"
|
||||
" mov.s32 %r45, %r44;\n"
|
||||
" setp.lt.s32 %p8, %r44, %r42;\n"
|
||||
" @%p8 bra $Lt_2_17154;\n"
|
||||
" cvt.s64.s32 %rd23, %r2;\n"
|
||||
" mul.wide.s32 %rd13, %r2, 8;\n"
|
||||
" cvt.s64.s32 %rd24, %r44;\n"
|
||||
" mul.wide.s32 %rd25, %r44, 8;\n"
|
||||
" add.u64 %rd26, %rd1, %rd25;\n"
|
||||
" mov.f64 %fd43, 0d0000000000000000; \n"
|
||||
"$Lt_2_13570:\n"
|
||||
" .loc 17 247 0\n"
|
||||
" ld.shared.f64 %fd44, [%rd26+0];\n"
|
||||
" mad.rn.f64 %fd43, %fd37, %fd43, %fd44;\n"
|
||||
" sub.s32 %r45, %r45, %r2;\n"
|
||||
" sub.u64 %rd26, %rd26, %rd13;\n"
|
||||
" setp.ge.s32 %p9, %r45, %r42;\n"
|
||||
" @%p9 bra $Lt_2_13570;\n"
|
||||
" bra.uni $Lt_2_13058;\n"
|
||||
"$Lt_2_17154:\n"
|
||||
" mov.f64 %fd43, 0d0000000000000000; \n"
|
||||
"$Lt_2_13058:\n"
|
||||
" .loc 17 249 0\n"
|
||||
" mov.s32 %r46, %r41;\n"
|
||||
" mov.s32 %r47, %r2;\n"
|
||||
" mov.s32 %r48, %r46;\n"
|
||||
" mul.f64 %fd45, %fd4, %fd43;\n"
|
||||
" mov.s64 %rd27, %rd19;\n"
|
||||
" cvt.s64.s32 %rd28, %r46;\n"
|
||||
" mul.wide.s32 %rd29, %r46, 32;\n"
|
||||
" mov.s32 %r49, 0;\n"
|
||||
" mov.s32 %r50, %r47;\n"
|
||||
"$Lt_2_14594:\n"
|
||||
" mov.s32 %r51, %r2;\n"
|
||||
" mov.s32 %r52, %r48;\n"
|
||||
" add.s32 %r53, %r48, %r2;\n"
|
||||
" mov.s64 %rd30, %rd18;\n"
|
||||
" ld.shared.f64 %fd46, [%rd27+0];\n"
|
||||
" add.u64 %rd31, %rd29, %rd22;\n"
|
||||
" mul.f64 %fd47, %fd45, %fd46;\n"
|
||||
" mov.s32 %r54, %r51;\n"
|
||||
"$Lt_2_15362:\n"
|
||||
" .loc 17 253 0\n"
|
||||
" ld.shared.f64 %fd48, [%rd30+0];\n"
|
||||
" mul.f64 %fd49, %fd48, %fd47;\n"
|
||||
" .loc 17 255 0\n"
|
||||
" cvt.ftz.f64.f32 %fd50, %f15;\n"
|
||||
" ld.global.v2.f64 {%fd51,%fd52}, [%rd31+0];\n"
|
||||
" mul.f64 %fd53, %fd49, %fd51;\n"
|
||||
" sub.f64 %fd54, %fd50, %fd53;\n"
|
||||
" cvt.rn.ftz.f32.f64 %f15, %fd54;\n"
|
||||
" .loc 17 256 0\n"
|
||||
" cvt.ftz.f64.f32 %fd55, %f14;\n"
|
||||
" mul.f64 %fd56, %fd49, %fd52;\n"
|
||||
" sub.f64 %fd57, %fd55, %fd56;\n"
|
||||
" cvt.rn.ftz.f32.f64 %f14, %fd57;\n"
|
||||
" .loc 17 257 0\n"
|
||||
" cvt.ftz.f64.f32 %fd58, %f13;\n"
|
||||
" ld.global.f64 %fd59, [%rd31+16];\n"
|
||||
" mul.f64 %fd60, %fd49, %fd59;\n"
|
||||
" sub.f64 %fd61, %fd58, %fd60;\n"
|
||||
" cvt.rn.ftz.f32.f64 %f13, %fd61;\n"
|
||||
" add.s32 %r52, %r52, 1;\n"
|
||||
" add.u64 %rd31, %rd31, 32;\n"
|
||||
" add.u64 %rd30, %rd30, 512;\n"
|
||||
" setp.ne.s32 %p10, %r52, %r53;\n"
|
||||
" @%p10 bra $Lt_2_15362;\n"
|
||||
" add.s32 %r49, %r49, 1;\n"
|
||||
" add.s32 %r48, %r48, %r38;\n"
|
||||
" add.u64 %rd29, %rd29, %rd21;\n"
|
||||
" add.u64 %rd27, %rd27, 512;\n"
|
||||
" setp.ne.s32 %p11, %r49, %r2;\n"
|
||||
" @%p11 bra $Lt_2_14594;\n"
|
||||
" add.s32 %r42, %r42, 1;\n"
|
||||
" add.s32 %r41, %r46, %r33;\n"
|
||||
" setp.ne.s32 %p12, %r42, %r2;\n"
|
||||
" @%p12 bra $Lt_2_12802;\n"
|
||||
" bra.uni $Lt_2_9730;\n"
|
||||
"$Lt_2_16898:\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, 0f00000000; \n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" bra.uni $Lt_2_9730;\n"
|
||||
"$Lt_2_9986:\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, 0f00000000; \n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
"$Lt_2_9730:\n"
|
||||
" .loc 17 264 0\n"
|
||||
" ld.param.u64 %rd32, [__cudaparm_interp_ans];\n"
|
||||
" cvt.s64.s32 %rd33, %r8;\n"
|
||||
" mul.wide.s32 %rd34, %r8, 16;\n"
|
||||
" add.u64 %rd35, %rd32, %rd34;\n"
|
||||
" mov.f32 %f16, %f17;\n"
|
||||
" st.global.v4.f32 [%rd35+0], {%f15,%f14,%f13,%f16};\n"
|
||||
"$Lt_2_9218:\n"
|
||||
" .loc 17 266 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_interp:\n"
|
||||
" }\n"
|
||||
;
|
||||
@ -1,881 +0,0 @@
|
||||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_00009b0b_00000000-9_lal_pppm.cpp3.i (/home/sjplimp/ccBI#.wCkpTI)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_00009b0b_00000000-8_lal_pppm.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 17 "lal_pppm.cu"
|
||||
.file 18 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 20 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 21 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
.global .texref pos_tex;
|
||||
.global .texref q_tex;
|
||||
|
||||
//-----------------------------------------------------------
// particle_map (single-precision build)
// Each thread handles at most one atom index. It fetches one texel from
// pos_tex and one from q_tex at that index, converts the position to an
// integer grid cell via (pos - b_lo_?) * del?inv, atomically reserves a
// slot in counts[cell], and stores a 4-float record
// {ix+0.5-fx, iy+0.5-fy, iz+0.5-fz, delvolinv*q} at
// ans[cell + slot*atom_stride].
// *error is set to 1 when the scaled position is negative or its integer
// cell is >= nlocal_{x,y,z}, and to 2 when the counter had already
// reached max_atoms (the reservation is then undone).
// The x_ and q_ pointer parameters are never loaded in this body.
// NOTE(review): q_tex's first component is presumably the atom charge;
// confirm against the host code that binds the texture.
//-----------------------------------------------------------
.entry particle_map (
|
||||
.param .u64 __cudaparm_particle_map_x_,
|
||||
.param .u64 __cudaparm_particle_map_q_,
|
||||
.param .f32 __cudaparm_particle_map_delvolinv,
|
||||
.param .s32 __cudaparm_particle_map_nlocal,
|
||||
.param .u64 __cudaparm_particle_map_counts,
|
||||
.param .u64 __cudaparm_particle_map_ans,
|
||||
.param .f32 __cudaparm_particle_map_b_lo_x,
|
||||
.param .f32 __cudaparm_particle_map_b_lo_y,
|
||||
.param .f32 __cudaparm_particle_map_b_lo_z,
|
||||
.param .f32 __cudaparm_particle_map_delxinv,
|
||||
.param .f32 __cudaparm_particle_map_delyinv,
|
||||
.param .f32 __cudaparm_particle_map_delzinv,
|
||||
.param .s32 __cudaparm_particle_map_nlocal_x,
|
||||
.param .s32 __cudaparm_particle_map_nlocal_y,
|
||||
.param .s32 __cudaparm_particle_map_nlocal_z,
|
||||
.param .s32 __cudaparm_particle_map_atom_stride,
|
||||
.param .s32 __cudaparm_particle_map_max_atoms,
|
||||
.param .u64 __cudaparm_particle_map_error)
|
||||
{
|
||||
.reg .u32 %r<50>;
|
||||
.reg .u64 %rd<12>;
|
||||
.reg .f32 %f<44>;
|
||||
.reg .pred %p<11>;
|
||||
.loc 17 50 0
|
||||
$LDWbegin_particle_map:
|
||||
// --- derive the atom index handled by this thread ---
cvt.s32.u32 %r1, %ntid.x; // r1 = threads per block (x)
|
||||
cvt.s32.u32 %r2, %ctaid.x; // r2 = block index (x)
|
||||
mul24.lo.s32 %r3, %r2, %r1; // r3 = ctaid.x * ntid.x (24-bit multiply)
|
||||
cvt.s32.u32 %r4, %nctaid.x; // r4 = number of blocks (x)
|
||||
mul24.lo.s32 %r5, %r4, %r1; // r5 = nctaid.x * ntid.x = total threads (x)
|
||||
mov.u32 %r6, %tid.x;
|
||||
add.u32 %r7, %r3, %r6; // r7 = global thread id
|
||||
sub.s32 %r8, %r5, 1; // r8 = total threads - 1
|
||||
mul.lo.s32 %r9, %r7, 64; // r9 = 64 * global thread id
|
||||
div.s32 %r10, %r9, %r5; // r10 = r9 / total threads
|
||||
mul.lo.s32 %r11, %r8, %r10;
|
||||
sub.s32 %r12, %r9, %r11; // r12 = r9 - r10*(total-1): atom index for this thread
|
||||
ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];
|
||||
setp.le.s32 %p1, %r13, %r12;
|
||||
@%p1 bra $Lt_0_7426; // index >= nlocal: nothing to do
|
||||
.loc 17 62 0
|
||||
mov.u32 %r14, %r12;
|
||||
mov.s32 %r15, 0;
|
||||
mov.u32 %r16, %r15;
|
||||
mov.s32 %r17, 0;
|
||||
mov.u32 %r18, %r17;
|
||||
mov.s32 %r19, 0;
|
||||
mov.u32 %r20, %r19;
|
||||
tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}]; // fetch pos_tex[r12]
|
||||
mov.f32 %f5, %f1; // f5 = position x
|
||||
mov.f32 %f6, %f2; // f6 = position y
|
||||
mov.f32 %f7, %f3; // f7 = position z (4th component unused)
|
||||
.loc 17 64 0
|
||||
mov.u32 %r21, %r12;
|
||||
mov.s32 %r22, 0;
|
||||
mov.u32 %r23, %r22;
|
||||
mov.s32 %r24, 0;
|
||||
mov.u32 %r25, %r24;
|
||||
mov.s32 %r26, 0;
|
||||
mov.u32 %r27, %r26;
|
||||
tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}]; // fetch q_tex[r12]
|
||||
mov.f32 %f12, %f8; // only the first component is used
|
||||
ld.param.f32 %f13, [__cudaparm_particle_map_delvolinv];
|
||||
mul.ftz.f32 %f14, %f13, %f12; // f14 = delvolinv * q
|
||||
mov.f32 %f15, 0f00000000; // 0
|
||||
setp.neu.ftz.f32 %p2, %f14, %f15;
|
||||
@!%p2 bra $Lt_0_7426; // scaled value is exactly zero: skip this atom
|
||||
.loc 17 67 0
|
||||
// --- scaled grid coordinates; any negative coordinate is an error ---
ld.param.f32 %f16, [__cudaparm_particle_map_b_lo_x];
|
||||
sub.ftz.f32 %f17, %f5, %f16;
|
||||
ld.param.f32 %f18, [__cudaparm_particle_map_delxinv];
|
||||
mul.ftz.f32 %f19, %f18, %f17; // f19 = (x - b_lo_x) * delxinv
|
||||
mov.f32 %f20, 0f00000000; // 0
|
||||
setp.lt.ftz.f32 %p3, %f19, %f20;
|
||||
@%p3 bra $Lt_0_8706;
|
||||
ld.param.f32 %f21, [__cudaparm_particle_map_b_lo_y];
|
||||
sub.ftz.f32 %f22, %f6, %f21;
|
||||
ld.param.f32 %f23, [__cudaparm_particle_map_delyinv];
|
||||
mul.ftz.f32 %f24, %f23, %f22; // f24 = (y - b_lo_y) * delyinv
|
||||
mov.f32 %f25, 0f00000000; // 0
|
||||
setp.lt.ftz.f32 %p4, %f24, %f25;
|
||||
@%p4 bra $Lt_0_8706;
|
||||
ld.param.f32 %f26, [__cudaparm_particle_map_b_lo_z];
|
||||
sub.ftz.f32 %f27, %f7, %f26;
|
||||
ld.param.f32 %f28, [__cudaparm_particle_map_delzinv];
|
||||
mul.ftz.f32 %f29, %f28, %f27; // f29 = (z - b_lo_z) * delzinv
|
||||
mov.f32 %f30, 0f00000000; // 0
|
||||
setp.lt.ftz.f32 %p5, %f29, %f30;
|
||||
@%p5 bra $Lt_0_8706;
|
||||
// --- truncate to integer cell indices and check the upper bounds ---
cvt.rzi.ftz.s32.f32 %r28, %f19; // r28 = (int) f19, round toward zero
|
||||
ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];
|
||||
setp.ge.s32 %p6, %r28, %r29;
|
||||
@%p6 bra $Lt_0_8706;
|
||||
cvt.rzi.ftz.s32.f32 %r30, %f24; // r30 = (int) f24
|
||||
ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];
|
||||
setp.ge.s32 %p7, %r30, %r31;
|
||||
@%p7 bra $Lt_0_8706;
|
||||
cvt.rzi.ftz.s32.f32 %r32, %f29; // r32 = (int) f29
|
||||
ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];
|
||||
setp.gt.s32 %p8, %r33, %r32;
|
||||
@%p8 bra $L_0_4866; // all three indices in range
|
||||
$Lt_0_8706:
|
||||
$L_0_5122:
|
||||
.loc 17 76 0
|
||||
// out-of-range atom: report error code 1 and finish
mov.s32 %r34, 1;
|
||||
ld.param.u64 %rd1, [__cudaparm_particle_map_error];
|
||||
st.global.s32 [%rd1+0], %r34; // *error = 1
|
||||
bra.uni $Lt_0_7426;
|
||||
$L_0_4866:
|
||||
.loc 17 83 0
|
||||
// --- linear cell index and atomic slot reservation ---
mul.lo.s32 %r35, %r32, %r31;
|
||||
add.s32 %r36, %r30, %r35;
|
||||
mul.lo.s32 %r37, %r36, %r29;
|
||||
add.s32 %r38, %r28, %r37; // r38 = ix + nlocal_x*(iy + nlocal_y*iz)
|
||||
ld.param.u64 %rd2, [__cudaparm_particle_map_counts];
|
||||
cvt.s64.s32 %rd3, %r38;
|
||||
mul.wide.s32 %rd4, %r38, 4;
|
||||
add.u64 %rd5, %rd2, %rd4; // rd5 = &counts[r38] (4-byte entries)
|
||||
mov.s32 %r39, 1;
|
||||
atom.global.add.s32 %r40, [%rd5], %r39; // r40 = old counts[r38]; counter incremented
|
||||
mov.s32 %r41, %r40;
|
||||
ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];
|
||||
setp.gt.s32 %p9, %r42, %r41;
|
||||
@%p9 bra $Lt_0_7682; // old count < max_atoms: slot is usable
|
||||
.loc 17 85 0
|
||||
// cell is full: report error code 2 and undo the increment
mov.s32 %r43, 2;
|
||||
ld.param.u64 %rd6, [__cudaparm_particle_map_error];
|
||||
st.global.s32 [%rd6+0], %r43; // *error = 2
|
||||
.loc 16 118 0
|
||||
mov.s32 %r44, -1;
|
||||
atom.global.add.s32 %r45, [%rd5], %r44; // counts[r38] -= 1
|
||||
bra.uni $Lt_0_7426;
|
||||
$Lt_0_7682:
|
||||
.loc 17 88 0
|
||||
// --- store the per-atom record into the reserved slot ---
ld.param.u64 %rd7, [__cudaparm_particle_map_ans];
|
||||
ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];
|
||||
mul.lo.s32 %r47, %r46, %r41;
|
||||
add.s32 %r48, %r38, %r47; // r48 = cell + slot * atom_stride
|
||||
cvt.s64.s32 %rd8, %r48;
|
||||
mul.wide.s32 %rd9, %r48, 16;
|
||||
add.u64 %rd10, %rd7, %rd9; // rd10 = ans + r48 * 16 bytes
|
||||
cvt.rn.f32.s32 %f31, %r28;
|
||||
mov.f32 %f32, 0f3f000000; // 0.5
|
||||
add.ftz.f32 %f33, %f31, %f32;
|
||||
sub.ftz.f32 %f34, %f33, %f19; // f34 = (ix + 0.5) - f19
|
||||
cvt.rn.f32.s32 %f35, %r30;
|
||||
mov.f32 %f36, 0f3f000000; // 0.5
|
||||
add.ftz.f32 %f37, %f35, %f36;
|
||||
sub.ftz.f32 %f38, %f37, %f24; // f38 = (iy + 0.5) - f24
|
||||
cvt.rn.f32.s32 %f39, %r32;
|
||||
mov.f32 %f40, 0f3f000000; // 0.5
|
||||
add.ftz.f32 %f41, %f39, %f40;
|
||||
sub.ftz.f32 %f42, %f41, %f29; // f42 = (iz + 0.5) - f29
|
||||
st.global.v4.f32 [%rd10+0], {%f34,%f38,%f42,%f14}; // record = {dx, dy, dz, delvolinv*q}
|
||||
$Lt_0_7426:
|
||||
$L_0_4610:
|
||||
$Lt_0_6914:
|
||||
$Lt_0_6402:
|
||||
.loc 17 92 0
|
||||
exit;
|
||||
$LDWend_particle_map:
|
||||
} // particle_map
|
||||
|
||||
//-----------------------------------------------------------
// make_rho (single-precision build)
// Sums contributions of the 16-byte atom records listed per cell in
// counts/atoms and writes one f32 per grid point into brick.
// Work decomposition visible in the code:
//   lane = tid.x % 32, row = tid.x / 32, line = row + 2*ctaid.x,
//   z = line / npts_y, y = line % npts_y; the outer loop walks x in steps
//   of 32 (x = lane + offset) for npts_x/32 + 1 iterations.
// Shared buffers: rho_coeff[64 f32] (copy of _rho_coeff), front[80 f32]
// (40 per row: 32 outputs + carry-over for the next x step), ans[512 f32]
// (one value per thread for each of up to `order` taps, stride 64).
// NOTE(review): the strides 64 and "2*ctaid.x" suggest a 64-thread block;
// the atom records are presumably those written by particle_map — confirm
// both against the launching host code.
//-----------------------------------------------------------
.entry make_rho (
|
||||
.param .u64 __cudaparm_make_rho_counts,
|
||||
.param .u64 __cudaparm_make_rho_atoms,
|
||||
.param .u64 __cudaparm_make_rho_brick,
|
||||
.param .u64 __cudaparm_make_rho__rho_coeff,
|
||||
.param .s32 __cudaparm_make_rho_atom_stride,
|
||||
.param .s32 __cudaparm_make_rho_npts_x,
|
||||
.param .s32 __cudaparm_make_rho_npts_y,
|
||||
.param .s32 __cudaparm_make_rho_npts_z,
|
||||
.param .s32 __cudaparm_make_rho_nlocal_x,
|
||||
.param .s32 __cudaparm_make_rho_nlocal_y,
|
||||
.param .s32 __cudaparm_make_rho_nlocal_z,
|
||||
.param .s32 __cudaparm_make_rho_order_m_1,
|
||||
.param .s32 __cudaparm_make_rho_order,
|
||||
.param .s32 __cudaparm_make_rho_order2)
|
||||
{
|
||||
.reg .u32 %r<119>;
|
||||
.reg .u64 %rd<57>;
|
||||
.reg .f32 %f<26>;
|
||||
.reg .pred %p<27>;
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32578_33_non_const_rho_coeff168[256];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32579_33_non_const_front424[320];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32580_33_non_const_ans744[2048];
|
||||
.loc 17 101 0
|
||||
$LDWbegin_make_rho:
|
||||
// --- stage the polynomial coefficients in shared memory ---
ld.param.s32 %r1, [__cudaparm_make_rho_order2]; // r1 = order2
|
||||
ld.param.s32 %r2, [__cudaparm_make_rho_order]; // r2 = order
|
||||
add.s32 %r3, %r1, %r2; // r3 = order2 + order = number of coefficients copied
|
||||
cvt.s32.u32 %r4, %tid.x; // r4 = tid.x
|
||||
setp.le.s32 %p1, %r3, %r4;
|
||||
@%p1 bra $Lt_1_16898; // only threads with tid < order2+order copy
|
||||
.loc 17 108 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32578_33_non_const_rho_coeff168;
|
||||
cvt.s64.s32 %rd2, %r4;
|
||||
mul.wide.s32 %rd3, %r4, 4;
|
||||
ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f32 %f1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f32 [%rd6+0], %f1; // rho_coeff[tid] = _rho_coeff[tid]
|
||||
$Lt_1_16898:
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32578_33_non_const_rho_coeff168; // rd1 = shared rho_coeff base
|
||||
// signed divide/modulo by 32 of tid.x
shr.s32 %r5, %r4, 31;
|
||||
mov.s32 %r6, 31;
|
||||
and.b32 %r7, %r5, %r6;
|
||||
add.s32 %r8, %r7, %r4;
|
||||
shr.s32 %r9, %r8, 5; // r9 = tid / 32
|
||||
mul.lo.s32 %r10, %r9, 32;
|
||||
sub.s32 %r11, %r4, %r10; // r11 = tid % 32 (lane)
|
||||
setp.lt.s32 %p2, %r11, %r2; // p2 = lane < order
|
||||
@!%p2 bra $Lt_1_17410;
|
||||
.loc 17 114 0
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32579_33_non_const_front424;
|
||||
mov.f32 %f2, 0f00000000; // 0
|
||||
cvt.s64.s32 %rd8, %r11;
|
||||
shr.s32 %r12, %r4, 31;
|
||||
mov.s32 %r13, 31;
|
||||
and.b32 %r14, %r12, %r13;
|
||||
add.s32 %r15, %r14, %r4;
|
||||
shr.s32 %r16, %r15, 5; // r16 = tid / 32 (row)
|
||||
cvt.s64.s32 %rd9, %r16;
|
||||
mul.wide.s32 %rd10, %r16, 40;
|
||||
add.u64 %rd11, %rd8, %rd10; // rd11 = row*40 + lane
|
||||
mul.lo.u64 %rd12, %rd11, 4;
|
||||
add.u64 %rd13, %rd7, %rd12;
|
||||
st.shared.f32 [%rd13+128], %f2; // front[row*40 + lane + 32] = 0 (carry-over area)
|
||||
$Lt_1_17410:
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32579_33_non_const_front424; // rd7 = shared front base
|
||||
.loc 17 116 0
|
||||
bar.sync 0; // barrier: rho_coeff and front initialisation visible to the block
|
||||
ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];
|
||||
shr.s32 %r18, %r17, 31;
|
||||
mov.s32 %r19, 31;
|
||||
and.b32 %r20, %r18, %r19;
|
||||
add.s32 %r21, %r20, %r17;
|
||||
shr.s32 %r22, %r21, 5; // r22 = npts_x / 32
|
||||
add.s32 %r23, %r22, 1; // r23 = number of 32-wide x steps
|
||||
mov.u32 %r24, 0;
|
||||
setp.le.s32 %p3, %r23, %r24;
|
||||
@%p3 bra $Lt_1_17922; // no x steps: exit
|
||||
// --- loop-invariant setup: which (y,z) line and which stencil taps ---
shr.s32 %r25, %r4, 31;
|
||||
mov.s32 %r26, 31;
|
||||
and.b32 %r27, %r25, %r26;
|
||||
add.s32 %r28, %r27, %r4;
|
||||
shr.s32 %r29, %r28, 5; // r29 = tid / 32 (row)
|
||||
add.s32 %r30, %r11, 32;
|
||||
ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];
|
||||
ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];
|
||||
mul.lo.s32 %r33, %r31, %r32; // r33 = nlocal_y * nlocal_x (cells per z plane)
|
||||
mov.u32 %r34, %ctaid.x;
|
||||
mul.lo.u32 %r35, %r34, 2;
|
||||
add.u32 %r36, %r29, %r35; // r36 = row + 2*ctaid.x (line index)
|
||||
ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];
|
||||
div.s32 %r38, %r36, %r37; // r38 = z = line / npts_y
|
||||
ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];
|
||||
setp.lt.s32 %p4, %r38, %r39;
|
||||
sub.s32 %r40, %r39, %r38;
|
||||
mov.s32 %r41, 0;
|
||||
selp.s32 %r42, %r40, %r41, %p4; // r42 = z tap start: (z < order_m_1) ? order_m_1 - z : 0
|
||||
ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];
|
||||
setp.ge.s32 %p5, %r38, %r43;
|
||||
sub.s32 %r44, %r43, %r38;
|
||||
add.s32 %r45, %r44, %r2;
|
||||
sub.s32 %r46, %r45, 1;
|
||||
selp.s32 %r47, %r46, %r2, %p5; // r47 = z tap stop: (z >= nlocal_z) ? nlocal_z - z + order - 1 : order
|
||||
rem.s32 %r48, %r36, %r37; // r48 = y = line % npts_y
|
||||
setp.lt.s32 %p6, %r48, %r39;
|
||||
sub.s32 %r49, %r39, %r48;
|
||||
mov.s32 %r50, 0;
|
||||
selp.s32 %r51, %r49, %r50, %p6; // r51 = y tap start: (y < order_m_1) ? order_m_1 - y : 0
|
||||
setp.ge.s32 %p7, %r48, %r31;
|
||||
sub.s32 %r52, %r31, %r48;
|
||||
add.s32 %r53, %r52, %r2;
|
||||
sub.s32 %r54, %r53, 1;
|
||||
selp.s32 %r55, %r54, %r2, %p7; // r55 = y tap stop: (y >= nlocal_y) ? nlocal_y - y + order - 1 : order
|
||||
mov.s32 %r56, %r23;
|
||||
mov.s32 %r57, 0;
|
||||
setp.gt.s32 %p8, %r2, %r57; // p8 = order > 0
|
||||
mov.s32 %r58, 0; // r58 = x offset of the current step (0, 32, 64, ...)
|
||||
cvt.s64.s32 %rd14, %r11;
|
||||
cvt.s64.s32 %rd15, %r29;
|
||||
mul.lo.s32 %r59, %r23, 32; // r59 = loop end value for r58
|
||||
mul.wide.s32 %rd16, %r29, 40;
|
||||
add.u64 %rd17, %rd14, %rd16;
|
||||
ld.param.s32 %r60, [__cudaparm_make_rho_npts_z];
|
||||
setp.gt.s32 %p9, %r60, %r38;
|
||||
mul.lo.u64 %rd18, %rd17, 4;
|
||||
selp.s32 %r61, 1, 0, %p9; // r61 = (z < npts_z) ? 1 : 0
|
||||
add.u64 %rd19, %rd18, %rd7; // rd19 = &front[row*40 + lane]
|
||||
mov.u64 %rd20, __cuda___cuda_local_var_32580_33_non_const_ans744; // rd20 = shared ans base
|
||||
mov.s32 %r62, %r56;
|
||||
$Lt_1_18434:
|
||||
//<loop> Loop body line 116, nesting depth: 1, estimated iterations: unknown
|
||||
@!%p8 bra $Lt_1_18690;
|
||||
// clear this thread's accumulators: ans[tid + 64*k] = 0 for k in [0, order)
mov.s32 %r63, %r2;
|
||||
cvt.s64.s32 %rd21, %r4;
|
||||
mul.wide.s32 %rd22, %r4, 4;
|
||||
add.u64 %rd23, %rd20, %rd22;
|
||||
mov.s32 %r64, 0;
|
||||
mov.s32 %r65, %r63;
|
||||
$Lt_1_19202:
|
||||
//<loop> Loop body line 116, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 140 0
|
||||
mov.f32 %f3, 0f00000000; // 0
|
||||
st.shared.f32 [%rd23+0], %f3;
|
||||
add.s32 %r64, %r64, 1;
|
||||
add.u64 %rd23, %rd23, 256; // next tap slot: +64 floats
|
||||
setp.ne.s32 %p10, %r64, %r2;
|
||||
@%p10 bra $Lt_1_19202;
|
||||
$Lt_1_18690:
|
||||
add.s32 %r66, %r11, %r58; // r66 = x = lane + step offset
|
||||
set.lt.u32.s32 %r67, %r66, %r32; // all-ones if x < nlocal_x, else 0
|
||||
neg.s32 %r68, %r67; // -> 1 or 0
|
||||
and.b32 %r69, %r61, %r68; // (z < npts_z) && (x < nlocal_x)
|
||||
mov.u32 %r70, 0;
|
||||
setp.eq.s32 %p11, %r69, %r70;
|
||||
@%p11 bra $Lt_1_20226; // no cells to gather for this thread in this step
|
||||
.loc 17 143 0
|
||||
mov.s32 %r71, %r42; // r71 = z tap index, runs r42 .. r47-1
|
||||
setp.ge.s32 %p12, %r42, %r47;
|
||||
@%p12 bra $Lt_1_20226;
|
||||
sub.s32 %r72, %r47, %r42;
|
||||
setp.lt.s32 %p13, %r51, %r55; // p13 = y tap range is non-empty
|
||||
mov.s32 %r73, %r72;
|
||||
$Lt_1_20738:
|
||||
//<loop> Loop body line 143, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 145 0
|
||||
mov.s32 %r74, %r51; // r74 = y tap index, runs r51 .. r55-1
|
||||
@!%p13 bra $Lt_1_20994;
|
||||
sub.s32 %r75, %r55, %r51;
|
||||
sub.s32 %r76, %r71, %r42;
|
||||
add.s32 %r77, %r38, %r42;
|
||||
add.s32 %r78, %r48, %r51;
|
||||
sub.s32 %r79, %r77, %r39;
|
||||
sub.s32 %r80, %r78, %r39; // r80 = y + r51 - order_m_1
|
||||
add.s32 %r81, %r76, %r79; // r81 = z + r71 - order_m_1 (source cell z)
|
||||
mul.lo.s32 %r82, %r33, %r81; // r82 = source z * nlocal_y * nlocal_x
|
||||
ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];
|
||||
ld.param.u64 %rd24, [__cudaparm_make_rho_counts];
|
||||
mov.s32 %r84, %r75;
|
||||
$Lt_1_21506:
|
||||
//<loop> Loop body line 145, nesting depth: 3, estimated iterations: unknown
|
||||
.loc 17 147 0
|
||||
sub.s32 %r85, %r74, %r51;
|
||||
add.s32 %r86, %r85, %r80; // r86 = y + r74 - order_m_1 (source cell y)
|
||||
mul.lo.s32 %r87, %r86, %r32;
|
||||
add.s32 %r88, %r82, %r87;
|
||||
add.s32 %r89, %r66, %r88; // r89 = linear source cell index
|
||||
cvt.s64.s32 %rd25, %r89;
|
||||
mul.wide.s32 %rd26, %r89, 4;
|
||||
add.u64 %rd27, %rd24, %rd26;
|
||||
ld.global.s32 %r90, [%rd27+0]; // r90 = counts[cell]
|
||||
mul.lo.s32 %r91, %r90, %r83; // r91 = counts[cell] * atom_stride (loop bound)
|
||||
.loc 17 148 0
|
||||
mov.s32 %r92, %r89; // r92 = record index, starts at cell, steps by atom_stride
|
||||
setp.ge.s32 %p14, %r89, %r91;
|
||||
@%p14 bra $Lt_1_21762;
|
||||
sub.s32 %r93, %r3, 1; // r93 = order2 + order - 1 (last coefficient index)
|
||||
cvt.s64.s32 %rd28, %r83;
|
||||
mul.wide.s32 %rd29, %r83, 16; // rd29 = byte stride between records of one cell
|
||||
mov.s32 %r94, -1;
|
||||
setp.gt.s32 %p15, %r93, %r94; // p15 = at least one coefficient exists
|
||||
ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];
|
||||
mul.lo.u64 %rd31, %rd25, 16;
|
||||
add.u64 %rd32, %rd30, %rd31; // rd32 = &atoms[cell] (16-byte records)
|
||||
$Lt_1_22274:
|
||||
//<loop> Loop body line 148, nesting depth: 4, estimated iterations: unknown
|
||||
.loc 17 149 0
|
||||
ld.global.f32 %f4, [%rd32+0]; // f4 = record component 0 (x offset)
|
||||
@!%p15 bra $Lt_1_29954;
|
||||
// Horner evaluation of the y and z weights for taps r74 and r71
sub.s32 %r95, %r93, %r74; // starting coefficient index for the y tap
|
||||
mov.s32 %r96, -1;
|
||||
sub.s32 %r97, %r96, %r74; // loop runs while r95 > -1 - r74
|
||||
cvt.s64.s32 %rd33, %r2;
|
||||
mul.wide.s32 %rd34, %r2, 4; // rd34 = order * 4 bytes (coefficient stride)
|
||||
ld.global.f32 %f5, [%rd32+4]; // f5 = record component 1 (y offset)
|
||||
ld.global.f32 %f6, [%rd32+8]; // f6 = record component 2 (z offset)
|
||||
cvt.s64.s32 %rd35, %r95;
|
||||
mul.wide.s32 %rd36, %r95, 4;
|
||||
add.u64 %rd37, %rd1, %rd36; // rd37 = &rho_coeff[r93 - r74]
|
||||
sub.s32 %r98, %r93, %r71;
|
||||
cvt.s64.s32 %rd38, %r98;
|
||||
mul.wide.s32 %rd39, %r98, 4;
|
||||
add.u64 %rd40, %rd1, %rd39; // rd40 = &rho_coeff[r93 - r71]
|
||||
mov.f32 %f7, 0f00000000; // 0
|
||||
mov.f32 %f8, 0f00000000; // 0
|
||||
$Lt_1_23042:
|
||||
//<loop> Loop body line 149, nesting depth: 5, estimated iterations: unknown
|
||||
.loc 17 154 0
|
||||
ld.shared.f32 %f9, [%rd37+0];
|
||||
fma.rn.ftz.f32 %f8, %f8, %f5, %f9; // f8 = f8 * f5 + coeff (y weight)
|
||||
.loc 17 155 0
|
||||
ld.shared.f32 %f10, [%rd40+0];
|
||||
fma.rn.ftz.f32 %f7, %f7, %f6, %f10; // f7 = f7 * f6 + coeff (z weight)
|
||||
sub.u64 %rd40, %rd40, %rd34;
|
||||
sub.s32 %r95, %r95, %r2; // step down by `order` coefficients
|
||||
sub.u64 %rd37, %rd37, %rd34;
|
||||
setp.gt.s32 %p16, %r95, %r97;
|
||||
@%p16 bra $Lt_1_23042;
|
||||
bra.uni $Lt_1_22530;
|
||||
$Lt_1_29954:
|
||||
mov.f32 %f7, 0f00000000; // 0
|
||||
mov.f32 %f8, 0f00000000; // 0
|
||||
$Lt_1_22530:
|
||||
.loc 17 157 0
|
||||
ld.global.f32 %f11, [%rd32+12]; // f11 = record component 3
|
||||
mul.ftz.f32 %f12, %f7, %f8;
|
||||
mul.ftz.f32 %f13, %f11, %f12; // f13 = component3 * zweight * yweight
|
||||
@!%p8 bra $Lt_1_23554;
|
||||
// for each x tap k in [0, order): ans[tid + 64*k] += xweight(k) * f13
mov.s32 %r99, %r2;
|
||||
cvt.s64.s32 %rd41, %r4;
|
||||
mul.wide.s32 %rd42, %r4, 4;
|
||||
add.u64 %rd43, %rd20, %rd42; // rd43 = &ans[tid]
|
||||
mov.s32 %r100, 0; // r100 = x tap index k
|
||||
mov.s32 %r101, %r99;
|
||||
$Lt_1_24066:
|
||||
//<loop> Loop body line 157, nesting depth: 5, estimated iterations: unknown
|
||||
.loc 17 161 0
|
||||
add.s32 %r102, %r100, %r1; // starting coefficient index = k + order2
|
||||
mov.s32 %r103, %r102;
|
||||
setp.lt.s32 %p17, %r102, %r100;
|
||||
@%p17 bra $Lt_1_30466; // empty coefficient range: weight is 0
|
||||
cvt.s64.s32 %rd44, %r2;
|
||||
mul.wide.s32 %rd34, %r2, 4;
|
||||
cvt.s64.s32 %rd45, %r102;
|
||||
mul.wide.s32 %rd46, %r102, 4;
|
||||
add.u64 %rd47, %rd1, %rd46; // rd47 = &rho_coeff[k + order2]
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
$Lt_1_24834:
|
||||
//<loop> Loop body line 161, nesting depth: 6, estimated iterations: unknown
|
||||
.loc 17 162 0
|
||||
ld.shared.f32 %f15, [%rd47+0];
|
||||
fma.rn.ftz.f32 %f14, %f4, %f14, %f15; // f14 = f4 * f14 + coeff (x weight, Horner step)
|
||||
sub.s32 %r103, %r103, %r2;
|
||||
sub.u64 %rd47, %rd47, %rd34;
|
||||
setp.ge.s32 %p18, %r103, %r100;
|
||||
@%p18 bra $Lt_1_24834;
|
||||
bra.uni $Lt_1_24322;
|
||||
$Lt_1_30466:
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
$Lt_1_24322:
|
||||
.loc 17 163 0
|
||||
ld.shared.f32 %f16, [%rd43+0];
|
||||
fma.rn.ftz.f32 %f17, %f14, %f13, %f16;
|
||||
st.shared.f32 [%rd43+0], %f17; // ans[tid + 64*k] += f14 * f13
|
||||
add.s32 %r100, %r100, 1;
|
||||
add.u64 %rd43, %rd43, 256;
|
||||
setp.ne.s32 %p19, %r100, %r2;
|
||||
@%p19 bra $Lt_1_24066;
|
||||
$Lt_1_23554:
|
||||
add.s32 %r92, %r92, %r83; // next record of this cell
|
||||
add.u64 %rd32, %rd29, %rd32;
|
||||
setp.gt.s32 %p20, %r91, %r92;
|
||||
@%p20 bra $Lt_1_22274;
|
||||
$Lt_1_21762:
|
||||
add.s32 %r74, %r74, 1; // next y tap
|
||||
setp.ne.s32 %p21, %r55, %r74;
|
||||
@%p21 bra $Lt_1_21506;
|
||||
$Lt_1_20994:
|
||||
add.s32 %r71, %r71, 1; // next z tap
|
||||
setp.ne.s32 %p22, %r47, %r71;
|
||||
@%p22 bra $Lt_1_20738;
|
||||
$Lt_1_20226:
|
||||
$Lt_1_19714:
|
||||
.loc 17 172 0
|
||||
bar.sync 0; // barrier before front[] is rewritten
|
||||
@!%p2 bra $Lt_1_26626;
|
||||
.loc 17 174 0
|
||||
// lanes < order: move the carry-over from the previous x step to the front
ld.shared.f32 %f18, [%rd19+128];
|
||||
st.shared.f32 [%rd19+0], %f18; // front[row*40+lane] = front[row*40+lane+32]
|
||||
.loc 17 175 0
|
||||
mov.f32 %f19, 0f00000000; // 0
|
||||
st.shared.f32 [%rd19+128], %f19; // and clear the carry-over slot
|
||||
bra.uni $Lt_1_26370;
|
||||
$Lt_1_26626:
|
||||
.loc 17 177 0
|
||||
mov.f32 %f20, 0f00000000; // 0
|
||||
st.shared.f32 [%rd19+0], %f20; // other lanes start from zero
|
||||
$Lt_1_26370:
|
||||
@!%p8 bra $Lt_1_26882;
|
||||
// scatter: front[row*40 + lane + k] += ans[tid + 64*k], one barrier per k
mov.s32 %r104, %r2;
|
||||
cvt.s64.s32 %rd48, %r4;
|
||||
mov.s32 %r105, %r11;
|
||||
add.s32 %r106, %r11, %r2; // loop ends when r105 == lane + order
|
||||
mul.wide.s32 %rd49, %r4, 4;
|
||||
add.u64 %rd50, %rd20, %rd49; // rd50 = &ans[tid]
|
||||
mov.s64 %rd51, %rd19; // rd51 = &front[row*40 + lane]
|
||||
mov.s32 %r107, %r104;
|
||||
$Lt_1_27394:
|
||||
//<loop> Loop body line 177, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 180 0
|
||||
ld.shared.f32 %f21, [%rd50+0];
|
||||
ld.shared.f32 %f22, [%rd51+0];
|
||||
add.ftz.f32 %f23, %f21, %f22;
|
||||
st.shared.f32 [%rd51+0], %f23;
|
||||
.loc 17 181 0
|
||||
bar.sync 0; // barrier between successive taps
|
||||
add.s32 %r105, %r105, 1;
|
||||
add.u64 %rd51, %rd51, 4; // next front element
|
||||
add.u64 %rd50, %rd50, 256; // next tap slot in ans
|
||||
setp.ne.s32 %p23, %r105, %r106;
|
||||
@%p23 bra $Lt_1_27394;
|
||||
$Lt_1_26882:
|
||||
set.lt.u32.s32 %r108, %r66, %r17; // all-ones if x < npts_x, else 0
|
||||
neg.s32 %r109, %r108;
|
||||
and.b32 %r110, %r61, %r109; // (z < npts_z) && (x < npts_x)
|
||||
mov.u32 %r111, 0;
|
||||
setp.eq.s32 %p24, %r110, %r111;
|
||||
@%p24 bra $Lt_1_27906; // grid point outside the brick: no store
|
||||
.loc 17 185 0
|
||||
ld.shared.f32 %f24, [%rd19+0];
|
||||
ld.param.u64 %rd52, [__cudaparm_make_rho_brick];
|
||||
add.s32 %r112, %r11, %r58;
|
||||
mul.lo.s32 %r113, %r37, %r17;
|
||||
mul.lo.s32 %r114, %r38, %r113;
|
||||
mul.lo.s32 %r115, %r48, %r17;
|
||||
add.s32 %r116, %r114, %r115;
|
||||
add.s32 %r117, %r112, %r116; // r117 = z*npts_y*npts_x + y*npts_x + x
|
||||
cvt.s64.s32 %rd53, %r117;
|
||||
mul.wide.s32 %rd54, %r117, 4;
|
||||
add.u64 %rd55, %rd52, %rd54;
|
||||
st.global.f32 [%rd55+0], %f24; // brick[r117] = front[row*40 + lane]
|
||||
$Lt_1_27906:
|
||||
add.s32 %r58, %r58, 32; // advance to the next 32-wide x step
|
||||
setp.ne.s32 %p25, %r58, %r59;
|
||||
@%p25 bra $Lt_1_18434;
|
||||
$Lt_1_17922:
|
||||
.loc 17 189 0
|
||||
exit;
|
||||
$LDWend_make_rho:
|
||||
} // make_rho
|
||||
|
||||
.entry interp (
|
||||
.param .u64 __cudaparm_interp_x_,
|
||||
.param .u64 __cudaparm_interp_q_,
|
||||
.param .s32 __cudaparm_interp_nlocal,
|
||||
.param .u64 __cudaparm_interp_brick,
|
||||
.param .u64 __cudaparm_interp__rho_coeff,
|
||||
.param .s32 __cudaparm_interp_npts_x,
|
||||
.param .s32 __cudaparm_interp_npts_yx,
|
||||
.param .f32 __cudaparm_interp_b_lo_x,
|
||||
.param .f32 __cudaparm_interp_b_lo_y,
|
||||
.param .f32 __cudaparm_interp_b_lo_z,
|
||||
.param .f32 __cudaparm_interp_delxinv,
|
||||
.param .f32 __cudaparm_interp_delyinv,
|
||||
.param .f32 __cudaparm_interp_delzinv,
|
||||
.param .s32 __cudaparm_interp_order,
|
||||
.param .s32 __cudaparm_interp_order2,
|
||||
.param .f32 __cudaparm_interp_qqrd2e_scale,
|
||||
.param .u64 __cudaparm_interp_ans)
|
||||
{
|
||||
.reg .u32 %r<56>;
|
||||
.reg .u64 %rd<37>;
|
||||
.reg .f32 %f<69>;
|
||||
.reg .pred %p<14>;
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888[256];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32677_33_non_const_rho1d_03144[2048];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32678_33_non_const_rho1d_15192[2048];
|
||||
// __cuda_local_var_32694_12_non_const_ek = 16
|
||||
.loc 17 199 0
|
||||
$LDWbegin_interp:
|
||||
ld.param.s32 %r1, [__cudaparm_interp_order2];
|
||||
ld.param.s32 %r2, [__cudaparm_interp_order];
|
||||
add.s32 %r3, %r1, %r2;
|
||||
cvt.s32.u32 %r4, %tid.x;
|
||||
setp.le.s32 %p1, %r3, %r4;
|
||||
@%p1 bra $Lt_2_8706;
|
||||
.loc 17 206 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888;
|
||||
cvt.s64.s32 %rd2, %r4;
|
||||
mul.wide.s32 %rd3, %r4, 4;
|
||||
ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f32 %f1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f32 [%rd6+0], %f1;
|
||||
$Lt_2_8706:
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888;
|
||||
.loc 17 207 0
|
||||
bar.sync 0;
|
||||
mov.u32 %r5, %ctaid.x;
|
||||
mov.u32 %r6, %ntid.x;
|
||||
mul.lo.u32 %r7, %r5, %r6;
|
||||
add.u32 %r8, %r4, %r7;
|
||||
ld.param.s32 %r9, [__cudaparm_interp_nlocal];
|
||||
setp.le.s32 %p2, %r9, %r8;
|
||||
@%p2 bra $Lt_2_9218;
|
||||
.loc 17 215 0
|
||||
mov.u32 %r10, %r8;
|
||||
mov.s32 %r11, 0;
|
||||
mov.u32 %r12, %r11;
|
||||
mov.s32 %r13, 0;
|
||||
mov.u32 %r14, %r13;
|
||||
mov.s32 %r15, 0;
|
||||
mov.u32 %r16, %r15;
|
||||
tex.1d.v4.f32.s32 {%f2,%f3,%f4,%f5},[pos_tex,{%r10,%r12,%r14,%r16}];
|
||||
mov.f32 %f6, %f2;
|
||||
mov.f32 %f7, %f3;
|
||||
mov.f32 %f8, %f4;
|
||||
.loc 17 216 0
|
||||
mov.u32 %r17, %r8;
|
||||
mov.s32 %r18, 0;
|
||||
mov.u32 %r19, %r18;
|
||||
mov.s32 %r20, 0;
|
||||
mov.u32 %r21, %r20;
|
||||
mov.s32 %r22, 0;
|
||||
mov.u32 %r23, %r22;
|
||||
tex.1d.v4.f32.s32 {%f9,%f10,%f11,%f12},[q_tex,{%r17,%r19,%r21,%r23}];
|
||||
mov.f32 %f13, %f9;
|
||||
ld.param.f32 %f14, [__cudaparm_interp_qqrd2e_scale];
|
||||
mul.ftz.f32 %f15, %f14, %f13;
|
||||
mov.f32 %f16, 0f00000000; // 0
|
||||
setp.neu.ftz.f32 %p3, %f15, %f16;
|
||||
@!%p3 bra $Lt_2_9986;
|
||||
mov.s32 %r24, 0;
|
||||
setp.gt.s32 %p4, %r2, %r24;
|
||||
ld.param.f32 %f17, [__cudaparm_interp_b_lo_x];
|
||||
sub.ftz.f32 %f18, %f6, %f17;
|
||||
ld.param.f32 %f19, [__cudaparm_interp_delxinv];
|
||||
mul.ftz.f32 %f20, %f19, %f18;
|
||||
@!%p4 bra $Lt_2_16386;
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32677_33_non_const_rho1d_03144;
|
||||
mov.u64 %rd8, __cuda___cuda_local_var_32678_33_non_const_rho1d_15192;
|
||||
cvt.rzi.ftz.s32.f32 %r25, %f20;
|
||||
cvt.rn.f32.s32 %f21, %r25;
|
||||
mov.f32 %f22, 0f3f000000; // 0.5
|
||||
add.ftz.f32 %f23, %f21, %f22;
|
||||
sub.ftz.f32 %f24, %f23, %f20;
|
||||
ld.param.f32 %f25, [__cudaparm_interp_b_lo_y];
|
||||
sub.ftz.f32 %f26, %f7, %f25;
|
||||
ld.param.f32 %f27, [__cudaparm_interp_delyinv];
|
||||
mul.ftz.f32 %f28, %f27, %f26;
|
||||
cvt.rzi.ftz.s32.f32 %r26, %f28;
|
||||
cvt.rn.f32.s32 %f29, %r26;
|
||||
mov.f32 %f30, 0f3f000000; // 0.5
|
||||
add.ftz.f32 %f31, %f29, %f30;
|
||||
sub.ftz.f32 %f32, %f31, %f28;
|
||||
mov.s32 %r27, %r2;
|
||||
cvt.s64.s32 %rd9, %r4;
|
||||
mov.s32 %r28, %r1;
|
||||
mul.wide.s32 %rd3, %r4, 4;
|
||||
add.u64 %rd10, %rd3, %rd7;
|
||||
add.u64 %rd11, %rd3, %rd8;
|
||||
mov.s32 %r29, 0;
|
||||
mov.s32 %r30, %r27;
|
||||
$Lt_2_10754:
|
||||
//<loop> Loop body line 216, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 17 235 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
st.shared.f32 [%rd10+0], %f34;
|
||||
.loc 17 236 0
|
||||
mov.f32 %f35, 0f00000000; // 0
|
||||
mov.f32 %f36, 0f00000000; // 0
|
||||
st.shared.f32 [%rd11+0], %f36;
|
||||
.loc 17 237 0
|
||||
mov.s32 %r31, %r28;
|
||||
setp.lt.s32 %p5, %r28, %r29;
|
||||
@%p5 bra $Lt_2_11010;
|
||||
cvt.s64.s32 %rd12, %r2;
|
||||
mul.wide.s32 %rd13, %r2, 4;
|
||||
cvt.s64.s32 %rd14, %r28;
|
||||
mul.wide.s32 %rd15, %r28, 4;
|
||||
add.u64 %rd16, %rd1, %rd15;
|
||||
$Lt_2_11522:
|
||||
//<loop> Loop body line 237, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 238 0
|
||||
ld.shared.f32 %f37, [%rd16+0];
|
||||
fma.rn.ftz.f32 %f33, %f33, %f24, %f37;
|
||||
st.shared.f32 [%rd10+0], %f33;
|
||||
.loc 17 239 0
|
||||
fma.rn.ftz.f32 %f35, %f35, %f32, %f37;
|
||||
st.shared.f32 [%rd11+0], %f35;
|
||||
sub.s32 %r31, %r31, %r2;
|
||||
sub.u64 %rd16, %rd16, %rd13;
|
||||
setp.ge.s32 %p6, %r31, %r29;
|
||||
@%p6 bra $Lt_2_11522;
|
||||
$Lt_2_11010:
|
||||
add.s32 %r29, %r29, 1;
|
||||
add.s32 %r28, %r28, 1;
|
||||
add.u64 %rd11, %rd11, 256;
|
||||
add.u64 %rd10, %rd10, 256;
|
||||
setp.ne.s32 %p7, %r28, %r3;
|
||||
@%p7 bra $Lt_2_10754;
|
||||
bra.uni $Lt_2_10242;
|
||||
$Lt_2_16386:
|
||||
cvt.rzi.ftz.s32.f32 %r25, %f20;
|
||||
mov.u64 %rd8, __cuda___cuda_local_var_32678_33_non_const_rho1d_15192;
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32677_33_non_const_rho1d_03144;
|
||||
$Lt_2_10242:
|
||||
.loc 17 243 0
|
||||
ld.param.f32 %f38, [__cudaparm_interp_b_lo_z];
|
||||
sub.ftz.f32 %f39, %f8, %f38;
|
||||
ld.param.f32 %f40, [__cudaparm_interp_delzinv];
|
||||
mul.ftz.f32 %f41, %f40, %f39;
|
||||
cvt.rzi.ftz.s32.f32 %r32, %f41;
|
||||
ld.param.s32 %r33, [__cudaparm_interp_npts_yx];
|
||||
mul.lo.s32 %r34, %r32, %r33;
|
||||
add.s32 %r35, %r25, %r34;
|
||||
@!%p4 bra $Lt_2_16898;
|
||||
cvt.rn.f32.s32 %f42, %r32;
|
||||
mov.f32 %f43, 0f3f000000; // 0.5
|
||||
add.ftz.f32 %f44, %f42, %f43;
|
||||
sub.ftz.f32 %f45, %f44, %f41;
|
||||
mov.s32 %r36, %r2;
|
||||
ld.param.f32 %f46, [__cudaparm_interp_b_lo_y];
|
||||
sub.ftz.f32 %f47, %f7, %f46;
|
||||
cvt.s64.s32 %rd17, %r4;
|
||||
ld.param.f32 %f48, [__cudaparm_interp_delyinv];
|
||||
mul.ftz.f32 %f49, %f48, %f47;
|
||||
cvt.rzi.ftz.s32.f32 %r37, %f49;
|
||||
ld.param.s32 %r38, [__cudaparm_interp_npts_x];
|
||||
mul.lo.s32 %r39, %r37, %r38;
|
||||
mul.wide.s32 %rd3, %r4, 4;
|
||||
add.s32 %r40, %r39, %r35;
|
||||
add.u64 %rd18, %rd3, %rd7;
|
||||
add.u64 %rd19, %rd3, %rd8;
|
||||
cvt.s64.s32 %rd20, %r38;
|
||||
mul.wide.s32 %rd21, %r38, 16;
|
||||
mov.s32 %r41, %r40;
|
||||
ld.param.u64 %rd22, [__cudaparm_interp_brick];
|
||||
mov.s32 %r42, 0;
|
||||
mov.f32 %f50, 0f00000000; // 0
|
||||
mov.f32 %f51, 0f00000000; // 0
|
||||
mov.f32 %f52, 0f00000000; // 0
|
||||
mov.s32 %r43, %r36;
|
||||
$Lt_2_12802:
|
||||
//<loop> Loop body line 243, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 17 246 0
|
||||
add.s32 %r44, %r42, %r1;
|
||||
mov.s32 %r45, %r44;
|
||||
setp.lt.s32 %p8, %r44, %r42;
|
||||
@%p8 bra $Lt_2_17154;
|
||||
cvt.s64.s32 %rd23, %r2;
|
||||
mul.wide.s32 %rd13, %r2, 4;
|
||||
cvt.s64.s32 %rd24, %r44;
|
||||
mul.wide.s32 %rd25, %r44, 4;
|
||||
add.u64 %rd26, %rd1, %rd25;
|
||||
mov.f32 %f53, 0f00000000; // 0
|
||||
$Lt_2_13570:
|
||||
//<loop> Loop body line 246, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 247 0
|
||||
ld.shared.f32 %f54, [%rd26+0];
|
||||
fma.rn.ftz.f32 %f53, %f45, %f53, %f54;
|
||||
sub.s32 %r45, %r45, %r2;
|
||||
sub.u64 %rd26, %rd26, %rd13;
|
||||
setp.ge.s32 %p9, %r45, %r42;
|
||||
@%p9 bra $Lt_2_13570;
|
||||
bra.uni $Lt_2_13058;
|
||||
$Lt_2_17154:
|
||||
mov.f32 %f53, 0f00000000; // 0
|
||||
$Lt_2_13058:
|
||||
.loc 17 249 0
|
||||
mov.s32 %r46, %r41;
|
||||
mov.s32 %r47, %r2;
|
||||
mul.ftz.f32 %f55, %f15, %f53;
|
||||
mov.s32 %r48, %r46;
|
||||
mov.s64 %rd27, %rd19;
|
||||
cvt.s64.s32 %rd28, %r46;
|
||||
mul.wide.s32 %rd29, %r46, 16;
|
||||
mov.s32 %r49, 0;
|
||||
mov.s32 %r50, %r47;
|
||||
$Lt_2_14594:
|
||||
//<loop> Loop body line 249, nesting depth: 2, estimated iterations: unknown
|
||||
mov.s32 %r51, %r2;
|
||||
mov.s32 %r52, %r48;
|
||||
add.s32 %r53, %r48, %r2;
|
||||
mov.s64 %rd30, %rd18;
|
||||
ld.shared.f32 %f56, [%rd27+0];
|
||||
add.u64 %rd31, %rd29, %rd22;
|
||||
mul.ftz.f32 %f57, %f55, %f56;
|
||||
mov.s32 %r54, %r51;
|
||||
$Lt_2_15362:
|
||||
//<loop> Loop body line 249, nesting depth: 3, estimated iterations: unknown
|
||||
.loc 17 253 0
|
||||
ld.shared.f32 %f58, [%rd30+0];
|
||||
mul.ftz.f32 %f59, %f58, %f57;
|
||||
ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd31+0];
|
||||
.loc 17 255 0
|
||||
mul.ftz.f32 %f63, %f59, %f60;
|
||||
sub.ftz.f32 %f52, %f52, %f63;
|
||||
.loc 17 256 0
|
||||
mul.ftz.f32 %f64, %f59, %f61;
|
||||
sub.ftz.f32 %f51, %f51, %f64;
|
||||
.loc 17 257 0
|
||||
mul.ftz.f32 %f65, %f59, %f62;
|
||||
sub.ftz.f32 %f50, %f50, %f65;
|
||||
add.s32 %r52, %r52, 1;
|
||||
add.u64 %rd31, %rd31, 16;
|
||||
add.u64 %rd30, %rd30, 256;
|
||||
setp.ne.s32 %p10, %r52, %r53;
|
||||
@%p10 bra $Lt_2_15362;
|
||||
add.s32 %r49, %r49, 1;
|
||||
add.s32 %r48, %r48, %r38;
|
||||
add.u64 %rd29, %rd29, %rd21;
|
||||
add.u64 %rd27, %rd27, 256;
|
||||
setp.ne.s32 %p11, %r49, %r2;
|
||||
@%p11 bra $Lt_2_14594;
|
||||
add.s32 %r42, %r42, 1;
|
||||
add.s32 %r41, %r46, %r33;
|
||||
setp.ne.s32 %p12, %r42, %r2;
|
||||
@%p12 bra $Lt_2_12802;
|
||||
bra.uni $Lt_2_9730;
|
||||
$Lt_2_16898:
|
||||
mov.f32 %f50, 0f00000000; // 0
|
||||
mov.f32 %f51, 0f00000000; // 0
|
||||
mov.f32 %f52, 0f00000000; // 0
|
||||
bra.uni $Lt_2_9730;
|
||||
$Lt_2_9986:
|
||||
mov.f32 %f50, 0f00000000; // 0
|
||||
mov.f32 %f51, 0f00000000; // 0
|
||||
mov.f32 %f52, 0f00000000; // 0
|
||||
$Lt_2_9730:
|
||||
.loc 17 264 0
|
||||
ld.param.u64 %rd32, [__cudaparm_interp_ans];
|
||||
cvt.s64.s32 %rd33, %r8;
|
||||
mul.wide.s32 %rd34, %r8, 16;
|
||||
add.u64 %rd35, %rd32, %rd34;
|
||||
mov.f32 %f66, %f67;
|
||||
st.global.v4.f32 [%rd35+0], {%f52,%f51,%f50,%f66};
|
||||
$Lt_2_9218:
|
||||
.loc 17 266 0
|
||||
exit;
|
||||
$LDWend_interp:
|
||||
} // interp
|
||||
|
||||
@ -1,818 +0,0 @@
|
||||
const char * pppm_f =
|
||||
" .version 2.3\n"
|
||||
" .target sm_20\n"
|
||||
" .address_size 64\n"
|
||||
" .global .texref pos_tex;\n"
|
||||
" .global .texref q_tex;\n"
|
||||
/* PTX kernel particle_map: one particle index per thread. */
/* Reads a position from pos_tex and a charge from q_tex, skips the particle when charge*delvolinv is 0, */
/* writes 1 to *error when the particle is outside [0,nlocal_x/y/z) in grid units, otherwise atomically */
/* increments counts[cell]; if the previous count is >= max_atoms it writes 2 to *error and undoes the */
/* increment, else it stores a 4-float record into ans. The x_ and q_ parameters are never loaded here. */
" .entry particle_map (\n"
|
||||
" .param .u64 __cudaparm_particle_map_x_,\n"
|
||||
" .param .u64 __cudaparm_particle_map_q_,\n"
|
||||
" .param .f32 __cudaparm_particle_map_delvolinv,\n"
|
||||
" .param .s32 __cudaparm_particle_map_nlocal,\n"
|
||||
" .param .u64 __cudaparm_particle_map_counts,\n"
|
||||
" .param .u64 __cudaparm_particle_map_ans,\n"
|
||||
" .param .f32 __cudaparm_particle_map_b_lo_x,\n"
|
||||
" .param .f32 __cudaparm_particle_map_b_lo_y,\n"
|
||||
" .param .f32 __cudaparm_particle_map_b_lo_z,\n"
|
||||
" .param .f32 __cudaparm_particle_map_delxinv,\n"
|
||||
" .param .f32 __cudaparm_particle_map_delyinv,\n"
|
||||
" .param .f32 __cudaparm_particle_map_delzinv,\n"
|
||||
" .param .s32 __cudaparm_particle_map_nlocal_x,\n"
|
||||
" .param .s32 __cudaparm_particle_map_nlocal_y,\n"
|
||||
" .param .s32 __cudaparm_particle_map_nlocal_z,\n"
|
||||
" .param .s32 __cudaparm_particle_map_atom_stride,\n"
|
||||
" .param .s32 __cudaparm_particle_map_max_atoms,\n"
|
||||
" .param .u64 __cudaparm_particle_map_error)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<50>;\n"
|
||||
" .reg .u64 %rd<12>;\n"
|
||||
" .reg .f32 %f<44>;\n"
|
||||
" .reg .pred %p<11>;\n"
|
||||
" .loc 17 50 0\n"
|
||||
"$LDWbegin_particle_map:\n"
|
||||
/* %r3 = ctaid.x*ntid.x, %r5 = nctaid.x*ntid.x (threads in the grid), %r7 = %r3 + tid.x. */
" cvt.s32.u32 %r1, %ntid.x;\n"
|
||||
" cvt.s32.u32 %r2, %ctaid.x;\n"
|
||||
" mul24.lo.s32 %r3, %r2, %r1;\n"
|
||||
" cvt.s32.u32 %r4, %nctaid.x;\n"
|
||||
" mul24.lo.s32 %r5, %r4, %r1;\n"
|
||||
" mov.u32 %r6, %tid.x;\n"
|
||||
" add.u32 %r7, %r3, %r6;\n"
|
||||
|
||||
/* Particle index %r12 = 64*%r7 - (%r5 - 1) * ((64*%r7) / %r5). */
/* NOTE(review): looks like an index permutation that spreads neighbouring threads 64 apart - confirm against lal_pppm.cu. */
" sub.s32 %r8, %r5, 1;\n"
|
||||
" mul.lo.s32 %r9, %r7, 64;\n"
|
||||
" div.s32 %r10, %r9, %r5;\n"
|
||||
" mul.lo.s32 %r11, %r8, %r10;\n"
|
||||
" sub.s32 %r12, %r9, %r11;\n"
|
||||
/* Exit when nlocal <= %r12. */
" ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];\n"
|
||||
" setp.le.s32 %p1, %r13, %r12;\n"
|
||||
" @%p1 bra $Lt_0_7426;\n"
|
||||
" .loc 17 62 0\n"
|
||||
/* Fetch pos_tex at index %r12; the first three components are kept in %f5, %f6, %f7. */
" mov.u32 %r14, %r12;\n"
|
||||
" mov.s32 %r15, 0;\n"
|
||||
" mov.u32 %r16, %r15;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" mov.u32 %r18, %r17;\n"
|
||||
" mov.s32 %r19, 0;\n"
|
||||
" mov.u32 %r20, %r19;\n"
|
||||
" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}];\n"
|
||||
" mov.f32 %f5, %f1;\n"
|
||||
" mov.f32 %f6, %f2;\n"
|
||||
" mov.f32 %f7, %f3;\n"
|
||||
" .loc 17 64 0\n"
|
||||
/* Fetch q_tex at the same index; only the first component (%f12) is used. */
" mov.u32 %r21, %r12;\n"
|
||||
" mov.s32 %r22, 0;\n"
|
||||
" mov.u32 %r23, %r22;\n"
|
||||
" mov.s32 %r24, 0;\n"
|
||||
" mov.u32 %r25, %r24;\n"
|
||||
" mov.s32 %r26, 0;\n"
|
||||
" mov.u32 %r27, %r26;\n"
|
||||
" tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}];\n"
|
||||
" mov.f32 %f12, %f8;\n"
|
||||
/* %f14 = delvolinv * %f12; exit without touching any output when %f14 compares equal to 0. */
" ld.param.f32 %f13, [__cudaparm_particle_map_delvolinv];\n"
|
||||
" mul.ftz.f32 %f14, %f13, %f12;\n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" setp.neu.ftz.f32 %p2, %f14, %f15;\n"
|
||||
" @!%p2 bra $Lt_0_7426;\n"
|
||||
" .loc 17 67 0\n"
|
||||
/* Scaled coordinates: %f19 = (x - b_lo_x)*delxinv, %f24 and %f29 likewise for y and z. */
/* A negative scaled coordinate branches to the error path at $Lt_0_8706. */
" ld.param.f32 %f16, [__cudaparm_particle_map_b_lo_x];\n"
|
||||
" sub.ftz.f32 %f17, %f5, %f16;\n"
|
||||
" ld.param.f32 %f18, [__cudaparm_particle_map_delxinv];\n"
|
||||
" mul.ftz.f32 %f19, %f18, %f17;\n"
|
||||
" mov.f32 %f20, 0f00000000; \n"
|
||||
" setp.lt.ftz.f32 %p3, %f19, %f20;\n"
|
||||
" @%p3 bra $Lt_0_8706;\n"
|
||||
" ld.param.f32 %f21, [__cudaparm_particle_map_b_lo_y];\n"
|
||||
" sub.ftz.f32 %f22, %f6, %f21;\n"
|
||||
" ld.param.f32 %f23, [__cudaparm_particle_map_delyinv];\n"
|
||||
" mul.ftz.f32 %f24, %f23, %f22;\n"
|
||||
" mov.f32 %f25, 0f00000000; \n"
|
||||
" setp.lt.ftz.f32 %p4, %f24, %f25;\n"
|
||||
" @%p4 bra $Lt_0_8706;\n"
|
||||
" ld.param.f32 %f26, [__cudaparm_particle_map_b_lo_z];\n"
|
||||
" sub.ftz.f32 %f27, %f7, %f26;\n"
|
||||
" ld.param.f32 %f28, [__cudaparm_particle_map_delzinv];\n"
|
||||
" mul.ftz.f32 %f29, %f28, %f27;\n"
|
||||
" mov.f32 %f30, 0f00000000; \n"
|
||||
" setp.lt.ftz.f32 %p5, %f29, %f30;\n"
|
||||
" @%p5 bra $Lt_0_8706;\n"
|
||||
/* Truncate to integer grid indices %r28, %r30, %r32; an index >= nlocal_x / nlocal_y is an error. */
" cvt.rzi.ftz.s32.f32 %r28, %f19;\n"
|
||||
" ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];\n"
|
||||
" setp.ge.s32 %p6, %r28, %r29;\n"
|
||||
" @%p6 bra $Lt_0_8706;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r30, %f24;\n"
|
||||
" ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];\n"
|
||||
" setp.ge.s32 %p7, %r30, %r31;\n"
|
||||
" @%p7 bra $Lt_0_8706;\n"
|
||||
/* z is tested the other way round: jump to the in-range path only when nlocal_z > %r32. */
" cvt.rzi.ftz.s32.f32 %r32, %f29;\n"
|
||||
" ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];\n"
|
||||
" setp.gt.s32 %p8, %r33, %r32;\n"
|
||||
" @%p8 bra $L_0_4866;\n"
|
||||
/* Error path (out of range): store 1 to *error and exit. */
"$Lt_0_8706:\n"
|
||||
"$L_0_5122:\n"
|
||||
" .loc 17 76 0\n"
|
||||
" mov.s32 %r34, 1;\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_particle_map_error];\n"
|
||||
" st.global.s32 [%rd1+0], %r34;\n"
|
||||
" bra.uni $Lt_0_7426;\n"
|
||||
/* In-range path: cell index %r38 = %r28 + (%r30 + %r32*nlocal_y)*nlocal_x. */
"$L_0_4866:\n"
|
||||
" .loc 17 83 0\n"
|
||||
" mul.lo.s32 %r35, %r32, %r31;\n"
|
||||
" add.s32 %r36, %r30, %r35;\n"
|
||||
" mul.lo.s32 %r37, %r36, %r29;\n"
|
||||
" add.s32 %r38, %r28, %r37;\n"
|
||||
/* Atomically add 1 to the 32-bit counter counts[%r38]; %r41 holds the value before the add. */
" ld.param.u64 %rd2, [__cudaparm_particle_map_counts];\n"
|
||||
" cvt.s64.s32 %rd3, %r38;\n"
|
||||
" mul.wide.s32 %rd4, %r38, 4;\n"
|
||||
" add.u64 %rd5, %rd2, %rd4;\n"
|
||||
" mov.s32 %r39, 1;\n"
|
||||
" atom.global.add.s32 %r40, [%rd5], %r39;\n"
|
||||
" mov.s32 %r41, %r40;\n"
|
||||
/* Proceed to the store only while max_atoms > previous count. */
" ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];\n"
|
||||
" setp.gt.s32 %p9, %r42, %r41;\n"
|
||||
" @%p9 bra $Lt_0_7682;\n"
|
||||
" .loc 17 85 0\n"
|
||||
/* Cell already holds max_atoms entries: store 2 to *error, atomically add -1 to undo the increment, exit. */
" mov.s32 %r43, 2;\n"
|
||||
" ld.param.u64 %rd6, [__cudaparm_particle_map_error];\n"
|
||||
" st.global.s32 [%rd6+0], %r43;\n"
|
||||
" .loc 16 118 0\n"
|
||||
" mov.s32 %r44, -1;\n"
|
||||
" atom.global.add.s32 %r45, [%rd5], %r44;\n"
|
||||
" bra.uni $Lt_0_7426;\n"
|
||||
/* Store path: element index %r48 = %r38 + atom_stride * previous count; elements of ans are 16 bytes wide. */
"$Lt_0_7682:\n"
|
||||
" .loc 17 88 0\n"
|
||||
" ld.param.u64 %rd7, [__cudaparm_particle_map_ans];\n"
|
||||
" ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];\n"
|
||||
" mul.lo.s32 %r47, %r46, %r41;\n"
|
||||
" add.s32 %r48, %r38, %r47;\n"
|
||||
" cvt.s64.s32 %rd8, %r48;\n"
|
||||
" mul.wide.s32 %rd9, %r48, 16;\n"
|
||||
" add.u64 %rd10, %rd7, %rd9;\n"
|
||||
/* Record = { (ix + 0.5) - %f19, (iy + 0.5) - %f24, (iz + 0.5) - %f29, %f14 }; 0f3f000000 is 0.5. */
" cvt.rn.f32.s32 %f31, %r28;\n"
|
||||
" mov.f32 %f32, 0f3f000000; \n"
|
||||
" add.ftz.f32 %f33, %f31, %f32;\n"
|
||||
" sub.ftz.f32 %f34, %f33, %f19;\n"
|
||||
" cvt.rn.f32.s32 %f35, %r30;\n"
|
||||
" mov.f32 %f36, 0f3f000000; \n"
|
||||
" add.ftz.f32 %f37, %f35, %f36;\n"
|
||||
" sub.ftz.f32 %f38, %f37, %f24;\n"
|
||||
" cvt.rn.f32.s32 %f39, %r32;\n"
|
||||
" mov.f32 %f40, 0f3f000000; \n"
|
||||
" add.ftz.f32 %f41, %f39, %f40;\n"
|
||||
" sub.ftz.f32 %f42, %f41, %f29;\n"
|
||||
" st.global.v4.f32 [%rd10+0], {%f34,%f38,%f42,%f14};\n"
|
||||
/* Common exit label for every path above. */
"$Lt_0_7426:\n"
|
||||
"$L_0_4610:\n"
|
||||
"$Lt_0_6914:\n"
|
||||
"$Lt_0_6402:\n"
|
||||
" .loc 17 92 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_particle_map:\n"
|
||||
" }\n"
|
||||
/* PTX kernel make_rho. What the body does, in order: */
/*  - threads with tid.x < order + order2 copy _rho_coeff[tid.x] into the shared rho_coeff array; */
/*  - %r11 = tid.x - 32*(tid.x/32) and %r29 = tid.x/32; %r36 = %r29 + 2*ctaid.x is split into */
/*    %r38 = %r36 / npts_y and %r48 = %r36 % npts_y (presumably the z and y grid rows - confirm); */
/*  - an outer loop runs (npts_x/32 + 1) times with %r58 = 0, 32, 64, ...; each pass accumulates */
/*    per-thread sums in the shared ans array, folds them into the shared front array, and stores */
/*    one float per qualifying thread to brick. */
" .entry make_rho (\n"
|
||||
" .param .u64 __cudaparm_make_rho_counts,\n"
|
||||
" .param .u64 __cudaparm_make_rho_atoms,\n"
|
||||
" .param .u64 __cudaparm_make_rho_brick,\n"
|
||||
" .param .u64 __cudaparm_make_rho__rho_coeff,\n"
|
||||
" .param .s32 __cudaparm_make_rho_atom_stride,\n"
|
||||
" .param .s32 __cudaparm_make_rho_npts_x,\n"
|
||||
" .param .s32 __cudaparm_make_rho_npts_y,\n"
|
||||
" .param .s32 __cudaparm_make_rho_npts_z,\n"
|
||||
" .param .s32 __cudaparm_make_rho_nlocal_x,\n"
|
||||
" .param .s32 __cudaparm_make_rho_nlocal_y,\n"
|
||||
" .param .s32 __cudaparm_make_rho_nlocal_z,\n"
|
||||
" .param .s32 __cudaparm_make_rho_order_m_1,\n"
|
||||
" .param .s32 __cudaparm_make_rho_order,\n"
|
||||
" .param .s32 __cudaparm_make_rho_order2)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<119>;\n"
|
||||
" .reg .u64 %rd<57>;\n"
|
||||
" .reg .f32 %f<26>;\n"
|
||||
" .reg .pred %p<27>;\n"
|
||||
/* Shared arrays, sizes in bytes: rho_coeff 256 (64 floats), front 320 (80 floats), ans 2048 (512 floats). */
" .shared .align 4 .b8 __cuda___cuda_local_var_32578_33_non_const_rho_coeff168[256];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32579_33_non_const_front424[320];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32580_33_non_const_ans744[2048];\n"
|
||||
" .loc 17 101 0\n"
|
||||
"$LDWbegin_make_rho:\n"
|
||||
/* %r1 = order2, %r2 = order, %r3 = order2 + order, %r4 = tid.x. */
" ld.param.s32 %r1, [__cudaparm_make_rho_order2];\n"
|
||||
" ld.param.s32 %r2, [__cudaparm_make_rho_order];\n"
|
||||
" add.s32 %r3, %r1, %r2;\n"
|
||||
" cvt.s32.u32 %r4, %tid.x;\n"
|
||||
" setp.le.s32 %p1, %r3, %r4;\n"
|
||||
" @%p1 bra $Lt_1_16898;\n"
|
||||
" .loc 17 108 0\n"
|
||||
/* Threads with tid.x < %r3: shared rho_coeff[tid.x] = global _rho_coeff[tid.x]. */
" mov.u64 %rd1, __cuda___cuda_local_var_32578_33_non_const_rho_coeff168;\n"
|
||||
" cvt.s64.s32 %rd2, %r4;\n"
|
||||
" mul.wide.s32 %rd3, %r4, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f32 %f1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd1;\n"
|
||||
" st.shared.f32 [%rd6+0], %f1;\n"
|
||||
"$Lt_1_16898:\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32578_33_non_const_rho_coeff168;\n"
|
||||
/* Signed divide/remainder by 32 via shifts: %r9 = tid.x / 32, %r11 = tid.x - 32*%r9. */
" shr.s32 %r5, %r4, 31;\n"
|
||||
" mov.s32 %r6, 31;\n"
|
||||
" and.b32 %r7, %r5, %r6;\n"
|
||||
" add.s32 %r8, %r7, %r4;\n"
|
||||
" shr.s32 %r9, %r8, 5;\n"
|
||||
" mul.lo.s32 %r10, %r9, 32;\n"
|
||||
" sub.s32 %r11, %r4, %r10;\n"
|
||||
/* %p2 = (%r11 < order); it is reused after the accumulation phase further down. */
" setp.lt.s32 %p2, %r11, %r2;\n"
|
||||
" @!%p2 bra $Lt_1_17410;\n"
|
||||
" .loc 17 114 0\n"
|
||||
/* For those threads: front[40*(tid.x/32) + %r11 + 32] = 0 (the +128 byte offset is 32 floats). */
" mov.u64 %rd7, __cuda___cuda_local_var_32579_33_non_const_front424;\n"
|
||||
" mov.f32 %f2, 0f00000000; \n"
|
||||
" cvt.s64.s32 %rd8, %r11;\n"
|
||||
" shr.s32 %r12, %r4, 31;\n"
|
||||
" mov.s32 %r13, 31;\n"
|
||||
" and.b32 %r14, %r12, %r13;\n"
|
||||
" add.s32 %r15, %r14, %r4;\n"
|
||||
" shr.s32 %r16, %r15, 5;\n"
|
||||
" cvt.s64.s32 %rd9, %r16;\n"
|
||||
" mul.wide.s32 %rd10, %r16, 40;\n"
|
||||
" add.u64 %rd11, %rd8, %rd10;\n"
|
||||
" mul.lo.u64 %rd12, %rd11, 4;\n"
|
||||
" add.u64 %rd13, %rd7, %rd12;\n"
|
||||
" st.shared.f32 [%rd13+128], %f2;\n"
|
||||
"$Lt_1_17410:\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32579_33_non_const_front424;\n"
|
||||
" .loc 17 116 0\n"
|
||||
/* Barrier: the shared rho_coeff and front writes above are complete before they are read. */
" bar.sync 0;\n"
|
||||
/* %r23 = npts_x/32 + 1 (signed division by shift) is the outer-loop trip count; exit if it is <= 0. */
" ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];\n"
|
||||
" shr.s32 %r18, %r17, 31;\n"
|
||||
" mov.s32 %r19, 31;\n"
|
||||
" and.b32 %r20, %r18, %r19;\n"
|
||||
" add.s32 %r21, %r20, %r17;\n"
|
||||
" shr.s32 %r22, %r21, 5;\n"
|
||||
" add.s32 %r23, %r22, 1;\n"
|
||||
" mov.u32 %r24, 0;\n"
|
||||
" setp.le.s32 %p3, %r23, %r24;\n"
|
||||
" @%p3 bra $Lt_1_17922;\n"
|
||||
/* %r29 = tid.x / 32 (recomputed). */
" shr.s32 %r25, %r4, 31;\n"
|
||||
" mov.s32 %r26, 31;\n"
|
||||
" and.b32 %r27, %r25, %r26;\n"
|
||||
" add.s32 %r28, %r27, %r4;\n"
|
||||
" shr.s32 %r29, %r28, 5;\n"
|
||||
" add.s32 %r30, %r11, 32;\n"
|
||||
/* %r33 = nlocal_y * nlocal_x: stride between consecutive z layers of the counts/atoms cell index. */
" ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];\n"
|
||||
" ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];\n"
|
||||
" mul.lo.s32 %r33, %r31, %r32;\n"
|
||||
/* %r36 = tid.x/32 + 2*ctaid.x. NOTE(review): the factor 2 suggests two 32-thread groups per block - confirm launch config. */
" mov.u32 %r34, %ctaid.x;\n"
|
||||
" mul.lo.u32 %r35, %r34, 2;\n"
|
||||
" add.u32 %r36, %r29, %r35;\n"
|
||||
" ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];\n"
|
||||
" div.s32 %r38, %r36, %r37;\n"
|
||||
/* Loop bounds along the first (%r38) axis: */
/*   %r42 = (%r38 < order_m_1) ? order_m_1 - %r38 : 0 */
/*   %r47 = (%r38 >= nlocal_z) ? nlocal_z - %r38 + order - 1 : order */
" ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];\n"
|
||||
" setp.lt.s32 %p4, %r38, %r39;\n"
|
||||
" sub.s32 %r40, %r39, %r38;\n"
|
||||
" mov.s32 %r41, 0;\n"
|
||||
" selp.s32 %r42, %r40, %r41, %p4;\n"
|
||||
" ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];\n"
|
||||
" setp.ge.s32 %p5, %r38, %r43;\n"
|
||||
" sub.s32 %r44, %r43, %r38;\n"
|
||||
" add.s32 %r45, %r44, %r2;\n"
|
||||
" sub.s32 %r46, %r45, 1;\n"
|
||||
" selp.s32 %r47, %r46, %r2, %p5;\n"
|
||||
/* Same construction along the second (%r48 = %r36 % npts_y) axis, against nlocal_y: bounds %r51 and %r55. */
" rem.s32 %r48, %r36, %r37;\n"
|
||||
" setp.lt.s32 %p6, %r48, %r39;\n"
|
||||
" sub.s32 %r49, %r39, %r48;\n"
|
||||
" mov.s32 %r50, 0;\n"
|
||||
" selp.s32 %r51, %r49, %r50, %p6;\n"
|
||||
" setp.ge.s32 %p7, %r48, %r31;\n"
|
||||
" sub.s32 %r52, %r31, %r48;\n"
|
||||
" add.s32 %r53, %r52, %r2;\n"
|
||||
" sub.s32 %r54, %r53, 1;\n"
|
||||
" selp.s32 %r55, %r54, %r2, %p7;\n"
|
||||
/* Loop-invariant setup: %p8 = (order > 0); %r58 = outer counter; %r59 = 32*%r23 is its end value. */
" mov.s32 %r56, %r23;\n"
|
||||
" mov.s32 %r57, 0;\n"
|
||||
" setp.gt.s32 %p8, %r2, %r57;\n"
|
||||
" mov.s32 %r58, 0;\n"
|
||||
" cvt.s64.s32 %rd14, %r11;\n"
|
||||
" cvt.s64.s32 %rd15, %r29;\n"
|
||||
" mul.lo.s32 %r59, %r23, 32;\n"
|
||||
" mul.wide.s32 %rd16, %r29, 40;\n"
|
||||
" add.u64 %rd17, %rd14, %rd16;\n"
|
||||
/* %r61 = (npts_z > %r38) as 0/1; %rd19 = &front[40*(tid.x/32) + %r11]; %rd20 = base of shared ans. */
" ld.param.s32 %r60, [__cudaparm_make_rho_npts_z];\n"
|
||||
" setp.gt.s32 %p9, %r60, %r38;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" selp.s32 %r61, 1, 0, %p9;\n"
|
||||
" add.u64 %rd19, %rd18, %rd7;\n"
|
||||
" mov.u64 %rd20, __cuda___cuda_local_var_32580_33_non_const_ans744;\n"
|
||||
" mov.s32 %r62, %r56;\n"
|
||||
/* ---- Outer loop over %r58 (step 32) ---- */
"$Lt_1_18434:\n"
|
||||
" @!%p8 bra $Lt_1_18690;\n"
|
||||
/* Zero this thread's accumulators: ans[tid.x + 64*k] = 0 for k = 0 .. order-1 (256-byte stride). */
" mov.s32 %r63, %r2;\n"
|
||||
" cvt.s64.s32 %rd21, %r4;\n"
|
||||
" mul.wide.s32 %rd22, %r4, 4;\n"
|
||||
" add.u64 %rd23, %rd20, %rd22;\n"
|
||||
" mov.s32 %r64, 0;\n"
|
||||
" mov.s32 %r65, %r63;\n"
|
||||
"$Lt_1_19202:\n"
|
||||
" .loc 17 140 0\n"
|
||||
" mov.f32 %f3, 0f00000000; \n"
|
||||
" st.shared.f32 [%rd23+0], %f3;\n"
|
||||
" add.s32 %r64, %r64, 1;\n"
|
||||
" add.u64 %rd23, %rd23, 256;\n"
|
||||
" setp.ne.s32 %p10, %r64, %r2;\n"
|
||||
" @%p10 bra $Lt_1_19202;\n"
|
||||
"$Lt_1_18690:\n"
|
||||
/* %r66 = %r11 + %r58. Accumulate only when %r66 < nlocal_x and npts_z > %r38; otherwise skip to the barrier. */
" add.s32 %r66, %r11, %r58;\n"
|
||||
" set.lt.u32.s32 %r67, %r66, %r32;\n"
|
||||
" neg.s32 %r68, %r67;\n"
|
||||
" and.b32 %r69, %r61, %r68;\n"
|
||||
" mov.u32 %r70, 0;\n"
|
||||
" setp.eq.s32 %p11, %r69, %r70;\n"
|
||||
" @%p11 bra $Lt_1_20226;\n"
|
||||
" .loc 17 143 0\n"
|
||||
/* Loop %r71 from %r42 up to (not including) %r47; skipped entirely when %r42 >= %r47. */
" mov.s32 %r71, %r42;\n"
|
||||
" setp.ge.s32 %p12, %r42, %r47;\n"
|
||||
" @%p12 bra $Lt_1_20226;\n"
|
||||
" sub.s32 %r72, %r47, %r42;\n"
|
||||
" setp.lt.s32 %p13, %r51, %r55;\n"
|
||||
" mov.s32 %r73, %r72;\n"
|
||||
"$Lt_1_20738:\n"
|
||||
" .loc 17 145 0\n"
|
||||
/* Loop %r74 from %r51 up to (not including) %r55; skipped when %r51 >= %r55. */
" mov.s32 %r74, %r51;\n"
|
||||
" @!%p13 bra $Lt_1_20994;\n"
|
||||
/* %r82 = %r33 * (%r38 + %r71 - order_m_1); %r80 = %r48 + %r51 - order_m_1. */
" sub.s32 %r75, %r55, %r51;\n"
|
||||
" sub.s32 %r76, %r71, %r42;\n"
|
||||
" add.s32 %r77, %r38, %r42;\n"
|
||||
" add.s32 %r78, %r48, %r51;\n"
|
||||
" sub.s32 %r79, %r77, %r39;\n"
|
||||
" sub.s32 %r80, %r78, %r39;\n"
|
||||
" add.s32 %r81, %r76, %r79;\n"
|
||||
" mul.lo.s32 %r82, %r33, %r81;\n"
|
||||
" ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];\n"
|
||||
" ld.param.u64 %rd24, [__cudaparm_make_rho_counts];\n"
|
||||
" mov.s32 %r84, %r75;\n"
|
||||
"$Lt_1_21506:\n"
|
||||
" .loc 17 147 0\n"
|
||||
/* Cell index %r89 = %r66 + %r82 + nlocal_x * (%r48 + %r74 - order_m_1). */
" sub.s32 %r85, %r74, %r51;\n"
|
||||
" add.s32 %r86, %r85, %r80;\n"
|
||||
" mul.lo.s32 %r87, %r86, %r32;\n"
|
||||
" add.s32 %r88, %r82, %r87;\n"
|
||||
" add.s32 %r89, %r66, %r88;\n"
|
||||
" cvt.s64.s32 %rd25, %r89;\n"
|
||||
" mul.wide.s32 %rd26, %r89, 4;\n"
|
||||
" add.u64 %rd27, %rd24, %rd26;\n"
|
||||
/* %r91 = counts[%r89] * atom_stride is the exclusive upper bound of the row loop below. */
" ld.global.s32 %r90, [%rd27+0];\n"
|
||||
" mul.lo.s32 %r91, %r90, %r83;\n"
|
||||
" .loc 17 148 0\n"
|
||||
/* Row loop: %r92 starts at %r89 and advances by atom_stride while %r92 < %r91. */
" mov.s32 %r92, %r89;\n"
|
||||
" setp.ge.s32 %p14, %r89, %r91;\n"
|
||||
" @%p14 bra $Lt_1_21762;\n"
|
||||
/* %r93 = order + order2 - 1; %p15 = (%r93 > -1); %rd32 = &atoms[%r89] with 16-byte elements. */
" sub.s32 %r93, %r3, 1;\n"
|
||||
" cvt.s64.s32 %rd28, %r83;\n"
|
||||
" mul.wide.s32 %rd29, %r83, 16;\n"
|
||||
" mov.s32 %r94, -1;\n"
|
||||
" setp.gt.s32 %p15, %r93, %r94;\n"
|
||||
" ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];\n"
|
||||
" mul.lo.u64 %rd31, %rd25, 16;\n"
|
||||
" add.u64 %rd32, %rd30, %rd31;\n"
|
||||
"$Lt_1_22274:\n"
|
||||
" .loc 17 149 0\n"
|
||||
/* %f4 = first float of the atom record (the record layout matches what particle_map stores - confirm). */
" ld.global.f32 %f4, [%rd32+0];\n"
|
||||
" @!%p15 bra $Lt_1_29954;\n"
|
||||
/* Horner evaluation of two polynomials from shared rho_coeff, stepping the coefficient index down by order: */
/*   %f8 in the record's 2nd float (%f5), starting at rho_coeff[%r93 - %r74]; */
/*   %f7 in the record's 3rd float (%f6), starting at rho_coeff[%r93 - %r71]. */
" sub.s32 %r95, %r93, %r74;\n"
|
||||
" mov.s32 %r96, -1;\n"
|
||||
" sub.s32 %r97, %r96, %r74;\n"
|
||||
" cvt.s64.s32 %rd33, %r2;\n"
|
||||
" mul.wide.s32 %rd34, %r2, 4;\n"
|
||||
" ld.global.f32 %f5, [%rd32+4];\n"
|
||||
" ld.global.f32 %f6, [%rd32+8];\n"
|
||||
" cvt.s64.s32 %rd35, %r95;\n"
|
||||
" mul.wide.s32 %rd36, %r95, 4;\n"
|
||||
" add.u64 %rd37, %rd1, %rd36;\n"
|
||||
" sub.s32 %r98, %r93, %r71;\n"
|
||||
" cvt.s64.s32 %rd38, %r98;\n"
|
||||
" mul.wide.s32 %rd39, %r98, 4;\n"
|
||||
" add.u64 %rd40, %rd1, %rd39;\n"
|
||||
" mov.f32 %f7, 0f00000000; \n"
|
||||
" mov.f32 %f8, 0f00000000; \n"
|
||||
"$Lt_1_23042:\n"
|
||||
" .loc 17 154 0\n"
|
||||
" ld.shared.f32 %f9, [%rd37+0];\n"
|
||||
" fma.rn.ftz.f32 %f8, %f8, %f5, %f9;\n"
|
||||
" .loc 17 155 0\n"
|
||||
" ld.shared.f32 %f10, [%rd40+0];\n"
|
||||
" fma.rn.ftz.f32 %f7, %f7, %f6, %f10;\n"
|
||||
" sub.u64 %rd40, %rd40, %rd34;\n"
|
||||
" sub.s32 %r95, %r95, %r2;\n"
|
||||
" sub.u64 %rd37, %rd37, %rd34;\n"
|
||||
" setp.gt.s32 %p16, %r95, %r97;\n"
|
||||
" @%p16 bra $Lt_1_23042;\n"
|
||||
" bra.uni $Lt_1_22530;\n"
|
||||
/* No coefficients (%r93 <= -1): both polynomial values are 0. */
"$Lt_1_29954:\n"
|
||||
" mov.f32 %f7, 0f00000000; \n"
|
||||
" mov.f32 %f8, 0f00000000; \n"
|
||||
"$Lt_1_22530:\n"
|
||||
" .loc 17 157 0\n"
|
||||
/* %f13 = (4th float of the record) * %f7 * %f8. */
" ld.global.f32 %f11, [%rd32+12];\n"
|
||||
" mul.ftz.f32 %f12, %f7, %f8;\n"
|
||||
" mul.ftz.f32 %f13, %f11, %f12;\n"
|
||||
" @!%p8 bra $Lt_1_23554;\n"
|
||||
/* For %r100 = 0 .. order-1: evaluate a polynomial in %f4 and accumulate into ans[tid.x + 64*%r100]. */
" mov.s32 %r99, %r2;\n"
|
||||
" cvt.s64.s32 %rd41, %r4;\n"
|
||||
" mul.wide.s32 %rd42, %r4, 4;\n"
|
||||
" add.u64 %rd43, %rd20, %rd42;\n"
|
||||
" mov.s32 %r100, 0;\n"
|
||||
" mov.s32 %r101, %r99;\n"
|
||||
"$Lt_1_24066:\n"
|
||||
" .loc 17 161 0\n"
|
||||
/* Coefficient index starts at %r100 + order2 and steps down by order while it stays >= %r100. */
" add.s32 %r102, %r100, %r1;\n"
|
||||
" mov.s32 %r103, %r102;\n"
|
||||
" setp.lt.s32 %p17, %r102, %r100;\n"
|
||||
" @%p17 bra $Lt_1_30466;\n"
|
||||
" cvt.s64.s32 %rd44, %r2;\n"
|
||||
" mul.wide.s32 %rd34, %r2, 4;\n"
|
||||
" cvt.s64.s32 %rd45, %r102;\n"
|
||||
" mul.wide.s32 %rd46, %r102, 4;\n"
|
||||
" add.u64 %rd47, %rd1, %rd46;\n"
|
||||
" mov.f32 %f14, 0f00000000; \n"
|
||||
"$Lt_1_24834:\n"
|
||||
" .loc 17 162 0\n"
|
||||
" ld.shared.f32 %f15, [%rd47+0];\n"
|
||||
" fma.rn.ftz.f32 %f14, %f4, %f14, %f15;\n"
|
||||
" sub.s32 %r103, %r103, %r2;\n"
|
||||
" sub.u64 %rd47, %rd47, %rd34;\n"
|
||||
" setp.ge.s32 %p18, %r103, %r100;\n"
|
||||
" @%p18 bra $Lt_1_24834;\n"
|
||||
" bra.uni $Lt_1_24322;\n"
|
||||
"$Lt_1_30466:\n"
|
||||
" mov.f32 %f14, 0f00000000; \n"
|
||||
"$Lt_1_24322:\n"
|
||||
" .loc 17 163 0\n"
|
||||
/* ans[tid.x + 64*%r100] += %f14 * %f13 (single fused multiply-add). */
" ld.shared.f32 %f16, [%rd43+0];\n"
|
||||
" fma.rn.ftz.f32 %f17, %f14, %f13, %f16;\n"
|
||||
" st.shared.f32 [%rd43+0], %f17;\n"
|
||||
" add.s32 %r100, %r100, 1;\n"
|
||||
" add.u64 %rd43, %rd43, 256;\n"
|
||||
" setp.ne.s32 %p19, %r100, %r2;\n"
|
||||
" @%p19 bra $Lt_1_24066;\n"
|
||||
"$Lt_1_23554:\n"
|
||||
/* Advance to the next atom record of this cell: index += atom_stride, pointer += 16*atom_stride. */
" add.s32 %r92, %r92, %r83;\n"
|
||||
" add.u64 %rd32, %rd29, %rd32;\n"
|
||||
" setp.gt.s32 %p20, %r91, %r92;\n"
|
||||
" @%p20 bra $Lt_1_22274;\n"
|
||||
"$Lt_1_21762:\n"
|
||||
" add.s32 %r74, %r74, 1;\n"
|
||||
" setp.ne.s32 %p21, %r55, %r74;\n"
|
||||
" @%p21 bra $Lt_1_21506;\n"
|
||||
"$Lt_1_20994:\n"
|
||||
" add.s32 %r71, %r71, 1;\n"
|
||||
" setp.ne.s32 %p22, %r47, %r71;\n"
|
||||
" @%p22 bra $Lt_1_20738;\n"
|
||||
"$Lt_1_20226:\n"
|
||||
"$Lt_1_19714:\n"
|
||||
" .loc 17 172 0\n"
|
||||
/* Barrier between the per-thread accumulation into ans and the shared front updates below. */
" bar.sync 0;\n"
|
||||
" @!%p2 bra $Lt_1_26626;\n"
|
||||
" .loc 17 174 0\n"
|
||||
/* Threads with %r11 < order: front[i] = front[i + 32], then front[i + 32] = 0, where %rd19 = &front[i]. */
" ld.shared.f32 %f18, [%rd19+128];\n"
|
||||
" st.shared.f32 [%rd19+0], %f18;\n"
|
||||
" .loc 17 175 0\n"
|
||||
" mov.f32 %f19, 0f00000000; \n"
|
||||
" st.shared.f32 [%rd19+128], %f19;\n"
|
||||
" bra.uni $Lt_1_26370;\n"
|
||||
"$Lt_1_26626:\n"
|
||||
" .loc 17 177 0\n"
|
||||
/* All other threads simply clear front[i]. */
" mov.f32 %f20, 0f00000000; \n"
|
||||
" st.shared.f32 [%rd19+0], %f20;\n"
|
||||
"$Lt_1_26370:\n"
|
||||
" @!%p8 bra $Lt_1_26882;\n"
|
||||
/* For k = 0 .. order-1: front[i + k] += ans[tid.x + 64*k], with a block barrier after every step. */
" mov.s32 %r104, %r2;\n"
|
||||
" cvt.s64.s32 %rd48, %r4;\n"
|
||||
" mov.s32 %r105, %r11;\n"
|
||||
" add.s32 %r106, %r11, %r2;\n"
|
||||
" mul.wide.s32 %rd49, %r4, 4;\n"
|
||||
" add.u64 %rd50, %rd20, %rd49;\n"
|
||||
" mov.s64 %rd51, %rd19;\n"
|
||||
" mov.s32 %r107, %r104;\n"
|
||||
"$Lt_1_27394:\n"
|
||||
" .loc 17 180 0\n"
|
||||
" ld.shared.f32 %f21, [%rd50+0];\n"
|
||||
" ld.shared.f32 %f22, [%rd51+0];\n"
|
||||
" add.ftz.f32 %f23, %f21, %f22;\n"
|
||||
" st.shared.f32 [%rd51+0], %f23;\n"
|
||||
" .loc 17 181 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" add.s32 %r105, %r105, 1;\n"
|
||||
" add.u64 %rd51, %rd51, 4;\n"
|
||||
" add.u64 %rd50, %rd50, 256;\n"
|
||||
" setp.ne.s32 %p23, %r105, %r106;\n"
|
||||
" @%p23 bra $Lt_1_27394;\n"
|
||||
"$Lt_1_26882:\n"
|
||||
/* Write-back guard: store only when %r66 < npts_x and npts_z > %r38. */
" set.lt.u32.s32 %r108, %r66, %r17;\n"
|
||||
" neg.s32 %r109, %r108;\n"
|
||||
" and.b32 %r110, %r61, %r109;\n"
|
||||
" mov.u32 %r111, 0;\n"
|
||||
" setp.eq.s32 %p24, %r110, %r111;\n"
|
||||
" @%p24 bra $Lt_1_27906;\n"
|
||||
" .loc 17 185 0\n"
|
||||
/* brick[(%r11 + %r58) + %r38*npts_y*npts_x + %r48*npts_x] = front[i] (4-byte elements). */
" ld.shared.f32 %f24, [%rd19+0];\n"
|
||||
" ld.param.u64 %rd52, [__cudaparm_make_rho_brick];\n"
|
||||
" add.s32 %r112, %r11, %r58;\n"
|
||||
" mul.lo.s32 %r113, %r37, %r17;\n"
|
||||
" mul.lo.s32 %r114, %r38, %r113;\n"
|
||||
" mul.lo.s32 %r115, %r48, %r17;\n"
|
||||
" add.s32 %r116, %r114, %r115;\n"
|
||||
" add.s32 %r117, %r112, %r116;\n"
|
||||
" cvt.s64.s32 %rd53, %r117;\n"
|
||||
" mul.wide.s32 %rd54, %r117, 4;\n"
|
||||
" add.u64 %rd55, %rd52, %rd54;\n"
|
||||
" st.global.f32 [%rd55+0], %f24;\n"
|
||||
"$Lt_1_27906:\n"
|
||||
/* Next outer pass: %r58 += 32, repeat until it reaches %r59 = 32 * (npts_x/32 + 1). */
" add.s32 %r58, %r58, 32;\n"
|
||||
" setp.ne.s32 %p25, %r58, %r59;\n"
|
||||
" @%p25 bra $Lt_1_18434;\n"
|
||||
"$Lt_1_17922:\n"
|
||||
" .loc 17 189 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_make_rho:\n"
|
||||
" }\n"
|
||||
" .entry interp (\n"
|
||||
" .param .u64 __cudaparm_interp_x_,\n"
|
||||
" .param .u64 __cudaparm_interp_q_,\n"
|
||||
" .param .s32 __cudaparm_interp_nlocal,\n"
|
||||
" .param .u64 __cudaparm_interp_brick,\n"
|
||||
" .param .u64 __cudaparm_interp__rho_coeff,\n"
|
||||
" .param .s32 __cudaparm_interp_npts_x,\n"
|
||||
" .param .s32 __cudaparm_interp_npts_yx,\n"
|
||||
" .param .f32 __cudaparm_interp_b_lo_x,\n"
|
||||
" .param .f32 __cudaparm_interp_b_lo_y,\n"
|
||||
" .param .f32 __cudaparm_interp_b_lo_z,\n"
|
||||
" .param .f32 __cudaparm_interp_delxinv,\n"
|
||||
" .param .f32 __cudaparm_interp_delyinv,\n"
|
||||
" .param .f32 __cudaparm_interp_delzinv,\n"
|
||||
" .param .s32 __cudaparm_interp_order,\n"
|
||||
" .param .s32 __cudaparm_interp_order2,\n"
|
||||
" .param .f32 __cudaparm_interp_qqrd2e_scale,\n"
|
||||
" .param .u64 __cudaparm_interp_ans)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<56>;\n"
|
||||
" .reg .u64 %rd<37>;\n"
|
||||
" .reg .f32 %f<69>;\n"
|
||||
" .reg .pred %p<14>;\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888[256];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32677_33_non_const_rho1d_03144[2048];\n"
|
||||
" .shared .align 4 .b8 __cuda___cuda_local_var_32678_33_non_const_rho1d_15192[2048];\n"
|
||||
" .loc 17 199 0\n"
|
||||
"$LDWbegin_interp:\n"
|
||||
" ld.param.s32 %r1, [__cudaparm_interp_order2];\n"
|
||||
" ld.param.s32 %r2, [__cudaparm_interp_order];\n"
|
||||
" add.s32 %r3, %r1, %r2;\n"
|
||||
" cvt.s32.u32 %r4, %tid.x;\n"
|
||||
" setp.le.s32 %p1, %r3, %r4;\n"
|
||||
" @%p1 bra $Lt_2_8706;\n"
|
||||
" .loc 17 206 0\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888;\n"
|
||||
" cvt.s64.s32 %rd2, %r4;\n"
|
||||
" mul.wide.s32 %rd3, %r4, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f32 %f1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd1;\n"
|
||||
" st.shared.f32 [%rd6+0], %f1;\n"
|
||||
"$Lt_2_8706:\n"
|
||||
" mov.u64 %rd1, __cuda___cuda_local_var_32676_33_non_const_rho_coeff2888;\n"
|
||||
" .loc 17 207 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" mov.u32 %r5, %ctaid.x;\n"
|
||||
" mov.u32 %r6, %ntid.x;\n"
|
||||
" mul.lo.u32 %r7, %r5, %r6;\n"
|
||||
" add.u32 %r8, %r4, %r7;\n"
|
||||
" ld.param.s32 %r9, [__cudaparm_interp_nlocal];\n"
|
||||
" setp.le.s32 %p2, %r9, %r8;\n"
|
||||
" @%p2 bra $Lt_2_9218;\n"
|
||||
" .loc 17 215 0\n"
|
||||
" mov.u32 %r10, %r8;\n"
|
||||
" mov.s32 %r11, 0;\n"
|
||||
" mov.u32 %r12, %r11;\n"
|
||||
" mov.s32 %r13, 0;\n"
|
||||
" mov.u32 %r14, %r13;\n"
|
||||
" mov.s32 %r15, 0;\n"
|
||||
" mov.u32 %r16, %r15;\n"
|
||||
" tex.1d.v4.f32.s32 {%f2,%f3,%f4,%f5},[pos_tex,{%r10,%r12,%r14,%r16}];\n"
|
||||
" mov.f32 %f6, %f2;\n"
|
||||
" mov.f32 %f7, %f3;\n"
|
||||
" mov.f32 %f8, %f4;\n"
|
||||
" .loc 17 216 0\n"
|
||||
" mov.u32 %r17, %r8;\n"
|
||||
" mov.s32 %r18, 0;\n"
|
||||
" mov.u32 %r19, %r18;\n"
|
||||
" mov.s32 %r20, 0;\n"
|
||||
" mov.u32 %r21, %r20;\n"
|
||||
" mov.s32 %r22, 0;\n"
|
||||
" mov.u32 %r23, %r22;\n"
|
||||
" tex.1d.v4.f32.s32 {%f9,%f10,%f11,%f12},[q_tex,{%r17,%r19,%r21,%r23}];\n"
|
||||
" mov.f32 %f13, %f9;\n"
|
||||
" ld.param.f32 %f14, [__cudaparm_interp_qqrd2e_scale];\n"
|
||||
" mul.ftz.f32 %f15, %f14, %f13;\n"
|
||||
" mov.f32 %f16, 0f00000000; \n"
|
||||
" setp.neu.ftz.f32 %p3, %f15, %f16;\n"
|
||||
" @!%p3 bra $Lt_2_9986;\n"
|
||||
" mov.s32 %r24, 0;\n"
|
||||
" setp.gt.s32 %p4, %r2, %r24;\n"
|
||||
" ld.param.f32 %f17, [__cudaparm_interp_b_lo_x];\n"
|
||||
" sub.ftz.f32 %f18, %f6, %f17;\n"
|
||||
" ld.param.f32 %f19, [__cudaparm_interp_delxinv];\n"
|
||||
" mul.ftz.f32 %f20, %f19, %f18;\n"
|
||||
" @!%p4 bra $Lt_2_16386;\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32677_33_non_const_rho1d_03144;\n"
|
||||
" mov.u64 %rd8, __cuda___cuda_local_var_32678_33_non_const_rho1d_15192;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r25, %f20;\n"
|
||||
" cvt.rn.f32.s32 %f21, %r25;\n"
|
||||
" mov.f32 %f22, 0f3f000000; \n"
|
||||
" add.ftz.f32 %f23, %f21, %f22;\n"
|
||||
" sub.ftz.f32 %f24, %f23, %f20;\n"
|
||||
" ld.param.f32 %f25, [__cudaparm_interp_b_lo_y];\n"
|
||||
" sub.ftz.f32 %f26, %f7, %f25;\n"
|
||||
" ld.param.f32 %f27, [__cudaparm_interp_delyinv];\n"
|
||||
" mul.ftz.f32 %f28, %f27, %f26;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r26, %f28;\n"
|
||||
" cvt.rn.f32.s32 %f29, %r26;\n"
|
||||
" mov.f32 %f30, 0f3f000000; \n"
|
||||
" add.ftz.f32 %f31, %f29, %f30;\n"
|
||||
" sub.ftz.f32 %f32, %f31, %f28;\n"
|
||||
" mov.s32 %r27, %r2;\n"
|
||||
" cvt.s64.s32 %rd9, %r4;\n"
|
||||
" mov.s32 %r28, %r1;\n"
|
||||
" mul.wide.s32 %rd3, %r4, 4;\n"
|
||||
" add.u64 %rd10, %rd3, %rd7;\n"
|
||||
" add.u64 %rd11, %rd3, %rd8;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.s32 %r30, %r27;\n"
|
||||
"$Lt_2_10754:\n"
|
||||
" .loc 17 235 0\n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
" mov.f32 %f34, 0f00000000; \n"
|
||||
" st.shared.f32 [%rd10+0], %f34;\n"
|
||||
" .loc 17 236 0\n"
|
||||
" mov.f32 %f35, 0f00000000; \n"
|
||||
" mov.f32 %f36, 0f00000000; \n"
|
||||
" st.shared.f32 [%rd11+0], %f36;\n"
|
||||
" .loc 17 237 0\n"
|
||||
" mov.s32 %r31, %r28;\n"
|
||||
" setp.lt.s32 %p5, %r28, %r29;\n"
|
||||
" @%p5 bra $Lt_2_11010;\n"
|
||||
" cvt.s64.s32 %rd12, %r2;\n"
|
||||
" mul.wide.s32 %rd13, %r2, 4;\n"
|
||||
" cvt.s64.s32 %rd14, %r28;\n"
|
||||
" mul.wide.s32 %rd15, %r28, 4;\n"
|
||||
" add.u64 %rd16, %rd1, %rd15;\n"
|
||||
"$Lt_2_11522:\n"
|
||||
" .loc 17 238 0\n"
|
||||
" ld.shared.f32 %f37, [%rd16+0];\n"
|
||||
" fma.rn.ftz.f32 %f33, %f33, %f24, %f37;\n"
|
||||
" st.shared.f32 [%rd10+0], %f33;\n"
|
||||
" .loc 17 239 0\n"
|
||||
" fma.rn.ftz.f32 %f35, %f35, %f32, %f37;\n"
|
||||
" st.shared.f32 [%rd11+0], %f35;\n"
|
||||
" sub.s32 %r31, %r31, %r2;\n"
|
||||
" sub.u64 %rd16, %rd16, %rd13;\n"
|
||||
" setp.ge.s32 %p6, %r31, %r29;\n"
|
||||
" @%p6 bra $Lt_2_11522;\n"
|
||||
"$Lt_2_11010:\n"
|
||||
" add.s32 %r29, %r29, 1;\n"
|
||||
" add.s32 %r28, %r28, 1;\n"
|
||||
" add.u64 %rd11, %rd11, 256;\n"
|
||||
" add.u64 %rd10, %rd10, 256;\n"
|
||||
" setp.ne.s32 %p7, %r28, %r3;\n"
|
||||
" @%p7 bra $Lt_2_10754;\n"
|
||||
" bra.uni $Lt_2_10242;\n"
|
||||
"$Lt_2_16386:\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r25, %f20;\n"
|
||||
" mov.u64 %rd8, __cuda___cuda_local_var_32678_33_non_const_rho1d_15192;\n"
|
||||
" mov.u64 %rd7, __cuda___cuda_local_var_32677_33_non_const_rho1d_03144;\n"
|
||||
"$Lt_2_10242:\n"
|
||||
" .loc 17 243 0\n"
|
||||
" ld.param.f32 %f38, [__cudaparm_interp_b_lo_z];\n"
|
||||
" sub.ftz.f32 %f39, %f8, %f38;\n"
|
||||
" ld.param.f32 %f40, [__cudaparm_interp_delzinv];\n"
|
||||
" mul.ftz.f32 %f41, %f40, %f39;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r32, %f41;\n"
|
||||
" ld.param.s32 %r33, [__cudaparm_interp_npts_yx];\n"
|
||||
" mul.lo.s32 %r34, %r32, %r33;\n"
|
||||
" add.s32 %r35, %r25, %r34;\n"
|
||||
" @!%p4 bra $Lt_2_16898;\n"
|
||||
" cvt.rn.f32.s32 %f42, %r32;\n"
|
||||
" mov.f32 %f43, 0f3f000000; \n"
|
||||
" add.ftz.f32 %f44, %f42, %f43;\n"
|
||||
" sub.ftz.f32 %f45, %f44, %f41;\n"
|
||||
" mov.s32 %r36, %r2;\n"
|
||||
" ld.param.f32 %f46, [__cudaparm_interp_b_lo_y];\n"
|
||||
" sub.ftz.f32 %f47, %f7, %f46;\n"
|
||||
" cvt.s64.s32 %rd17, %r4;\n"
|
||||
" ld.param.f32 %f48, [__cudaparm_interp_delyinv];\n"
|
||||
" mul.ftz.f32 %f49, %f48, %f47;\n"
|
||||
" cvt.rzi.ftz.s32.f32 %r37, %f49;\n"
|
||||
" ld.param.s32 %r38, [__cudaparm_interp_npts_x];\n"
|
||||
" mul.lo.s32 %r39, %r37, %r38;\n"
|
||||
" mul.wide.s32 %rd3, %r4, 4;\n"
|
||||
" add.s32 %r40, %r39, %r35;\n"
|
||||
" add.u64 %rd18, %rd3, %rd7;\n"
|
||||
" add.u64 %rd19, %rd3, %rd8;\n"
|
||||
" cvt.s64.s32 %rd20, %r38;\n"
|
||||
" mul.wide.s32 %rd21, %r38, 16;\n"
|
||||
" mov.s32 %r41, %r40;\n"
|
||||
" ld.param.u64 %rd22, [__cudaparm_interp_brick];\n"
|
||||
" mov.s32 %r42, 0;\n"
|
||||
" mov.f32 %f50, 0f00000000; \n"
|
||||
" mov.f32 %f51, 0f00000000; \n"
|
||||
" mov.f32 %f52, 0f00000000; \n"
|
||||
" mov.s32 %r43, %r36;\n"
|
||||
"$Lt_2_12802:\n"
|
||||
" .loc 17 246 0\n"
|
||||
" add.s32 %r44, %r42, %r1;\n"
|
||||
" mov.s32 %r45, %r44;\n"
|
||||
" setp.lt.s32 %p8, %r44, %r42;\n"
|
||||
" @%p8 bra $Lt_2_17154;\n"
|
||||
" cvt.s64.s32 %rd23, %r2;\n"
|
||||
" mul.wide.s32 %rd13, %r2, 4;\n"
|
||||
" cvt.s64.s32 %rd24, %r44;\n"
|
||||
" mul.wide.s32 %rd25, %r44, 4;\n"
|
||||
" add.u64 %rd26, %rd1, %rd25;\n"
|
||||
" mov.f32 %f53, 0f00000000; \n"
|
||||
"$Lt_2_13570:\n"
|
||||
" .loc 17 247 0\n"
|
||||
" ld.shared.f32 %f54, [%rd26+0];\n"
|
||||
" fma.rn.ftz.f32 %f53, %f45, %f53, %f54;\n"
|
||||
" sub.s32 %r45, %r45, %r2;\n"
|
||||
" sub.u64 %rd26, %rd26, %rd13;\n"
|
||||
" setp.ge.s32 %p9, %r45, %r42;\n"
|
||||
" @%p9 bra $Lt_2_13570;\n"
|
||||
" bra.uni $Lt_2_13058;\n"
|
||||
"$Lt_2_17154:\n"
|
||||
" mov.f32 %f53, 0f00000000; \n"
|
||||
"$Lt_2_13058:\n"
|
||||
" .loc 17 249 0\n"
|
||||
" mov.s32 %r46, %r41;\n"
|
||||
" mov.s32 %r47, %r2;\n"
|
||||
" mul.ftz.f32 %f55, %f15, %f53;\n"
|
||||
" mov.s32 %r48, %r46;\n"
|
||||
" mov.s64 %rd27, %rd19;\n"
|
||||
" cvt.s64.s32 %rd28, %r46;\n"
|
||||
" mul.wide.s32 %rd29, %r46, 16;\n"
|
||||
" mov.s32 %r49, 0;\n"
|
||||
" mov.s32 %r50, %r47;\n"
|
||||
"$Lt_2_14594:\n"
|
||||
" mov.s32 %r51, %r2;\n"
|
||||
" mov.s32 %r52, %r48;\n"
|
||||
" add.s32 %r53, %r48, %r2;\n"
|
||||
" mov.s64 %rd30, %rd18;\n"
|
||||
" ld.shared.f32 %f56, [%rd27+0];\n"
|
||||
" add.u64 %rd31, %rd29, %rd22;\n"
|
||||
" mul.ftz.f32 %f57, %f55, %f56;\n"
|
||||
" mov.s32 %r54, %r51;\n"
|
||||
"$Lt_2_15362:\n"
|
||||
" .loc 17 253 0\n"
|
||||
" ld.shared.f32 %f58, [%rd30+0];\n"
|
||||
" mul.ftz.f32 %f59, %f58, %f57;\n"
|
||||
" ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd31+0];\n"
|
||||
" .loc 17 255 0\n"
|
||||
" mul.ftz.f32 %f63, %f59, %f60;\n"
|
||||
" sub.ftz.f32 %f52, %f52, %f63;\n"
|
||||
" .loc 17 256 0\n"
|
||||
" mul.ftz.f32 %f64, %f59, %f61;\n"
|
||||
" sub.ftz.f32 %f51, %f51, %f64;\n"
|
||||
" .loc 17 257 0\n"
|
||||
" mul.ftz.f32 %f65, %f59, %f62;\n"
|
||||
" sub.ftz.f32 %f50, %f50, %f65;\n"
|
||||
" add.s32 %r52, %r52, 1;\n"
|
||||
" add.u64 %rd31, %rd31, 16;\n"
|
||||
" add.u64 %rd30, %rd30, 256;\n"
|
||||
" setp.ne.s32 %p10, %r52, %r53;\n"
|
||||
" @%p10 bra $Lt_2_15362;\n"
|
||||
" add.s32 %r49, %r49, 1;\n"
|
||||
" add.s32 %r48, %r48, %r38;\n"
|
||||
" add.u64 %rd29, %rd29, %rd21;\n"
|
||||
" add.u64 %rd27, %rd27, 256;\n"
|
||||
" setp.ne.s32 %p11, %r49, %r2;\n"
|
||||
" @%p11 bra $Lt_2_14594;\n"
|
||||
" add.s32 %r42, %r42, 1;\n"
|
||||
" add.s32 %r41, %r46, %r33;\n"
|
||||
" setp.ne.s32 %p12, %r42, %r2;\n"
|
||||
" @%p12 bra $Lt_2_12802;\n"
|
||||
" bra.uni $Lt_2_9730;\n"
|
||||
"$Lt_2_16898:\n"
|
||||
" mov.f32 %f50, 0f00000000; \n"
|
||||
" mov.f32 %f51, 0f00000000; \n"
|
||||
" mov.f32 %f52, 0f00000000; \n"
|
||||
" bra.uni $Lt_2_9730;\n"
|
||||
"$Lt_2_9986:\n"
|
||||
" mov.f32 %f50, 0f00000000; \n"
|
||||
" mov.f32 %f51, 0f00000000; \n"
|
||||
" mov.f32 %f52, 0f00000000; \n"
|
||||
"$Lt_2_9730:\n"
|
||||
" .loc 17 264 0\n"
|
||||
" ld.param.u64 %rd32, [__cudaparm_interp_ans];\n"
|
||||
" cvt.s64.s32 %rd33, %r8;\n"
|
||||
" mul.wide.s32 %rd34, %r8, 16;\n"
|
||||
" add.u64 %rd35, %rd32, %rd34;\n"
|
||||
" mov.f32 %f66, %f67;\n"
|
||||
" st.global.v4.f32 [%rd35+0], {%f52,%f51,%f50,%f66};\n"
|
||||
"$Lt_2_9218:\n"
|
||||
" .loc 17 266 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_interp:\n"
|
||||
" }\n"
|
||||
;
|
||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Reference in New Issue
Block a user