/*
 * File: lammps/lib/gpu/pair_gpu_atom_ptx.h
 * 57 lines, 1.7 KiB, C
 */

/*
 * pair_gpu_atom_kernel: PTX assembly text for the entry point
 * `kernel_cast_x`, stored as a single NUL-terminated C string built from
 * adjacent string literals (one PTX line per literal).
 *
 * NOTE(review): the label and parameter naming ($LDWbegin_..., __cudaparm_...)
 * and the `.loc` directives suggest this text is compiler output rather than
 * hand-written PTX -- confirm before editing it by hand.
 *
 * What the kernel does, per thread:
 *   idx = ctaid.x * ntid.x + tid.x
 *   if (nall <= idx) exit
 *   t   = (f32) s32 loaded from  type   + 4 * idx
 *   a,b,c = (f32) of the three f64 values at  x + 8 * (3 * idx) + {0, 8, 16}
 *   store the f32 4-vector {a, b, c, t} to  x_type + 16 * idx
 *
 * Kernel parameters (in declaration order):
 *   x_type  u64 address; destination of the 16-byte vector store
 *   x       u64 address; source of the f64 triples
 *   type    u64 address; source of the s32 values
 *   nall    s32 element count used as the upper bound on idx
 *
 * All loads and the store use the .global state space.
 */
const char * pair_gpu_atom_kernel =
/* Module header: PTX ISA 2.3, sm_20 target, 64-bit addressing. */
" .version 2.3\n"
" .target sm_20\n"
" .address_size 64\n"
/* Entry point signature. */
" .entry kernel_cast_x (\n"
" .param .u64 __cudaparm_kernel_cast_x_x_type,\n"
" .param .u64 __cudaparm_kernel_cast_x_x,\n"
" .param .u64 __cudaparm_kernel_cast_x_type,\n"
" .param .s32 __cudaparm_kernel_cast_x_nall)\n"
" {\n"
/* Virtual register declarations. */
" .reg .u32 %r<10>;\n"
" .reg .u64 %rd<13>;\n"
" .reg .f32 %f<6>;\n"
" .reg .f64 %fd<5>;\n"
" .reg .pred %p<3>;\n"
/* `.loc` lines are debug line-number directives; presumably they refer to
   lines of the original kernel source file -- TODO confirm. */
" .loc 16 34 0\n"
"$LDWbegin_kernel_cast_x:\n"
/* %r5 = ctaid.x * ntid.x + tid.x : the element index for this thread. */
" mov.u32 %r1, %ctaid.x;\n"
" mov.u32 %r2, %ntid.x;\n"
" mul.lo.u32 %r3, %r1, %r2;\n"
" mov.u32 %r4, %tid.x;\n"
" add.u32 %r5, %r4, %r3;\n"
/* Bounds guard: skip to the exit label when nall <= index (signed compare). */
" ld.param.s32 %r6, [__cudaparm_kernel_cast_x_nall];\n"
" setp.le.s32 %p1, %r6, %r5;\n"
" @%p1 bra $Lt_0_1026;\n"
" .loc 16 39 0\n"
/* %f1 = (f32) s32 at type + 4*index.  %rd1 is written here but never read. */
" cvt.s64.s32 %rd1, %r5;\n"
" ld.param.u64 %rd2, [__cudaparm_kernel_cast_x_type];\n"
" mul.wide.s32 %rd3, %r5, 4;\n"
" add.u64 %rd4, %rd2, %rd3;\n"
" ld.global.s32 %r7, [%rd4+0];\n"
" cvt.rn.f32.s32 %f1, %r7;\n"
" .loc 16 42 0\n"
/* %rd8 = x + 8*(3*index): base of this thread's three f64 values.
   %f2 = (f32) of the value at byte offset 8.  %rd6 is written but never read. */
" ld.param.u64 %rd5, [__cudaparm_kernel_cast_x_x];\n"
" mul.lo.s32 %r8, %r5, 3;\n"
" cvt.s64.s32 %rd6, %r8;\n"
" mul.wide.s32 %rd7, %r8, 8;\n"
" add.u64 %rd8, %rd5, %rd7;\n"
" ld.global.f64 %fd1, [%rd8+8];\n"
" cvt.rn.ftz.f32.f64 %f2, %fd1;\n"
" .loc 16 43 0\n"
/* %f3 = (f32) of the value at byte offset 16. */
" ld.global.f64 %fd2, [%rd8+16];\n"
" cvt.rn.ftz.f32.f64 %f3, %fd2;\n"
" .loc 16 44 0\n"
/* %rd11 = x_type + 16*index; %f4 = (f32) of the value at byte offset 0.
   The 4-wide store writes {offset 0, offset 8, offset 16, converted type}. */
" ld.param.u64 %rd9, [__cudaparm_kernel_cast_x_x_type];\n"
" mul.wide.s32 %rd10, %r5, 16;\n"
" add.u64 %rd11, %rd9, %rd10;\n"
" ld.global.f64 %fd3, [%rd8+0];\n"
" cvt.rn.ftz.f32.f64 %f4, %fd3;\n"
" st.global.v4.f32 [%rd11+0], {%f4,%f2,%f3,%f1};\n"
/* Common exit point, also the target of the bounds guard above. */
"$Lt_0_1026:\n"
" .loc 16 46 0\n"
" exit;\n"
"$LDWend_kernel_cast_x:\n"
" }\n"
;