// Listing metadata: 102 lines, 3.5 KiB, plaintext
// PTX module header: ISA version, target architecture and pointer width.
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12

//-----------------------------------------------------------
// Compiling /tmp/tmpxft_0000bafa_00000000-9_pair_gpu_atom_kernel.cpp3.i (/home/sjplimp/ccBI#.kAZxYr)
//-----------------------------------------------------------

//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------

// Source-file table: each entry maps a file index to a path; the .loc
// directives in the code below refer to these indices.
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_0000bafa_00000000-8_pair_gpu_atom_kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "pair_gpu_atom_kernel.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"


//-----------------------------------------------------------
// kernel_cast_x
//
// One thread per element.  With ii = %ctaid.x * %ntid.x + %tid.x, a thread
// whose ii is >= nall exits immediately; otherwise it writes one 16-byte
// vector
//     { (f32) x[3*ii], (f32) x[3*ii+1], (f32) x[3*ii+2], (f32) type[ii] }
// to x_type + 16*ii.
//
// Parameters (all pointers are global-space addresses):
//   __cudaparm_kernel_cast_x_x_type  .u64  output, 4 x f32 (16 bytes) per element
//   __cudaparm_kernel_cast_x_x       .u64  input,  3 x f64 (24 bytes) per element
//   __cudaparm_kernel_cast_x_type    .u64  input,  1 x s32 (4 bytes)  per element
//   __cudaparm_kernel_cast_x_nall    .s32  number of elements to convert
//
// Notes:
//   * The byte offset into x is produced by one widening multiply
//     (ii * 24), i.e. in 64 bits, so it does not wrap for large ii the way
//     a 32-bit 3*ii product would (ii > 715827882).
//   * f64 -> f32 conversions use round-to-nearest-even with .ftz.
//   * The v4.f32 store needs x_type to be 16-byte aligned.
//-----------------------------------------------------------
.entry kernel_cast_x (
        .param .u64 __cudaparm_kernel_cast_x_x_type,
        .param .u64 __cudaparm_kernel_cast_x_x,
        .param .u64 __cudaparm_kernel_cast_x_type,
        .param .s32 __cudaparm_kernel_cast_x_nall)
{
        .reg .u32  %r<8>;
        .reg .u64  %rd<10>;
        .reg .f32  %f<5>;
        .reg .f64  %fd<4>;
        .reg .pred %p<2>;
        .loc 16 34 0
$LDWbegin_kernel_cast_x:
        // ii = ctaid.x * ntid.x + tid.x
        mov.u32         %r1, %ctaid.x;
        mov.u32         %r2, %ntid.x;
        mul.lo.u32      %r3, %r1, %r2;
        mov.u32         %r4, %tid.x;
        add.u32         %r5, %r4, %r3;

        // Bounds guard: skip the body when nall <= ii (signed compare).
        ld.param.s32    %r6, [__cudaparm_kernel_cast_x_nall];
        setp.le.s32     %p1, %r6, %r5;
        @%p1 bra        $Lt_0_1026;

        .loc 16 39 0
        // %f1 = (f32) type[ii]
        ld.param.u64    %rd1, [__cudaparm_kernel_cast_x_type];
        mul.wide.s32    %rd2, %r5, 4;
        add.u64         %rd3, %rd1, %rd2;
        ld.global.s32   %r7, [%rd3+0];
        cvt.rn.f32.s32  %f1, %r7;

        .loc 16 42 0
        // %rd6 = &x[3*ii]; offset is ii * (3 * 8 bytes), formed in 64 bits.
        ld.param.u64    %rd4, [__cudaparm_kernel_cast_x_x];
        mul.wide.s32    %rd5, %r5, 24;
        add.u64         %rd6, %rd4, %rd5;
        // %f2 = (f32) x[3*ii+1]
        ld.global.f64   %fd1, [%rd6+8];
        cvt.rn.ftz.f32.f64 %f2, %fd1;

        .loc 16 43 0
        // %f3 = (f32) x[3*ii+2]
        ld.global.f64   %fd2, [%rd6+16];
        cvt.rn.ftz.f32.f64 %f3, %fd2;

        .loc 16 44 0
        // %rd9 = x_type + 16*ii
        ld.param.u64    %rd7, [__cudaparm_kernel_cast_x_x_type];
        mul.wide.s32    %rd8, %r5, 16;
        add.u64         %rd9, %rd7, %rd8;
        // %f4 = (f32) x[3*ii]
        ld.global.f64   %fd3, [%rd6+0];
        cvt.rn.ftz.f32.f64 %f4, %fd3;
        // Single vector store of {x0, x1, x2, type}.
        st.global.v4.f32 [%rd9+0], {%f4,%f2,%f3,%f1};
$Lt_0_1026:
        .loc 16 46 0
        exit;
$LDWend_kernel_cast_x:
} // kernel_cast_x