This is an automated email from the ASF dual-hosted git repository. markd pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/systemds.git
commit 29bf8f18ad4893bd22015ab4f5e46b6f8b7c218c Author: Mark Dokter <[email protected]> AuthorDate: Wed Apr 20 14:12:41 2022 +0200 [SYSTEMDS-3352] CUDA code generation binaries Code gen native support compiled on Ubuntu 20 LTS (still on CUDA 10.2 ofc) --- .../cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so | Bin 302880 -> 285976 bytes src/main/cuda/kernels/reduction.ptx | 1185 ++++++++++++-------- 2 files changed, 698 insertions(+), 487 deletions(-) diff --git a/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so index ec5be11087..81d1184b18 100644 Binary files a/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so and b/src/main/cpp/lib/libsystemds_spoof_cuda-Linux-x86_64.so differ diff --git a/src/main/cuda/kernels/reduction.ptx b/src/main/cuda/kernels/reduction.ptx index 72b922596a..8b949f9dba 100644 --- a/src/main/cuda/kernels/reduction.ptx +++ b/src/main/cuda/kernels/reduction.ptx @@ -11,7 +11,14 @@ .address_size 64 // .globl double2float_f +.extern .func (.param .b32 func_retval0) vprintf +( + .param .b64 vprintf_param_0, + .param .b64 vprintf_param_1 +) +; .extern .shared .align 1 .b8 memory[]; +.global .align 1 .b8 $str[28] = {84, 66, 73, 58, 32, 118, 97, 108, 95, 115, 112, 97, 114, 115, 101, 95, 114, 99, 40, 37, 100, 44, 32, 37, 100, 41, 10, 0}; .visible .entry double2float_f( .param .u64 double2float_f_param_0, @@ -95,151 +102,151 @@ BB1_2: .param .u32 reduce_sum_f_param_2 ) { + .local .align 8 .b8 __local_depot2[8]; + .reg .b64 %SP; + .reg .b64 %SPL; .reg .pred %p<25>; .reg .f32 %f<69>; - .reg .b32 %r<57>; - .reg .b64 %rd<36>; - - - ld.param.u64 %rd9, [reduce_sum_f_param_0]; - ld.param.u64 %rd10, [reduce_sum_f_param_1]; - ld.param.u32 %r13, [reduce_sum_f_param_2]; - mov.u32 %r14, %ctaid.x; - shl.b32 %r15, %r14, 1; - mov.u32 %r16, %ntid.x; + .reg .b32 %r<51>; + .reg .b64 %rd<38>; + + + mov.u64 %SPL, __local_depot2; + cvta.local.u64 %SP, %SPL; + ld.param.u64 %rd10, [reduce_sum_f_param_0]; + ld.param.u64 %rd11, [reduce_sum_f_param_1]; + ld.param.u32 %r14, [reduce_sum_f_param_2]; + mov.u32 %r15, %ctaid.x; + shl.b32 %r16, %r15, 1; + mov.u32 %r1, %ntid.x; mov.u32 %r17, %tid.x; - mad.lo.s32 %r56, %r15, %r16, %r17; + mad.lo.s32 %r48, %r16, %r1, %r17; mov.f32 %f51, 0f00000000; - setp.ge.u32 %p1, %r56, %r13; + setp.ge.u32 %p1, %r48, %r14; @%p1 bra BB2_11; - cvta.to.global.u64 %rd11, %rd9; - ld.global.u64 %rd1, [%rd11+16]; + cvta.to.global.u64 %rd12, %rd10; + ld.global.u64 %rd1, [%rd12+16]; setp.eq.s64 %p2, %rd1, 0; - ld.global.u64 %rd12, [%rd11+32]; - cvta.to.global.u64 %rd2, %rd12; + ld.global.u64 %rd2, [%rd12+32]; + mov.u32 %r18, %nctaid.x; + mul.lo.s32 %r19, %r1, %r18; + shl.b32 %r4, %r19, 1; mov.f32 %f51, 0f00000000; @%p2 bra BB2_8; - mad.lo.s32 %r54, %r15, %r16, %r17; - mov.f32 %f51, 0f00000000; - mov.u64 %rd32, %rd1; + mov.u64 %rd34, %rd1; BB2_3: - cvta.to.global.u64 %rd13, %rd32; - mul.wide.u32 %rd14, %r54, 4; - add.s64 %rd15, %rd13, %rd14; - ld.global.u32 %r27, [%rd15]; - mul.wide.u32 %rd16, %r27, 4; - add.s64 %rd17, %rd2, %rd16; - ld.global.f32 %f36, [%rd17]; + mul.wide.u32 %rd13, %r48, 4; + add.s64 %rd14, %rd34, %rd13; + ld.u32 %r20, [%rd14]; + mul.wide.u32 %rd15, %r20, 4; + add.s64 %rd16, %rd2, %rd15; + ld.f32 %f36, [%rd16]; add.f32 %f51, %f51, %f36; - add.s32 %r55, %r54, %r16; - setp.ge.u32 %p3, %r55, %r13; + add.s32 %r49, %r48, %r1; + setp.ge.u32 %p3, %r49, %r14; @%p3 bra BB2_7; - setp.eq.s64 %p4, %rd32, 0; - mov.u64 %rd32, 0; + setp.eq.s64 %p4, %rd34, 0; + mov.u64 %rd34, 0; @%p4 bra BB2_6; - cvta.to.global.u64 %rd19, %rd1; - mul.wide.u32 %rd20, %r55, 4; - add.s64 %rd21, %rd19, %rd20; - ld.global.u32 %r55, [%rd21]; - mov.u64 %rd32, %rd1; + mul.wide.u32 %rd18, %r49, 4; + add.s64 %rd19, %rd1, %rd18; + ld.u32 %r49, [%rd19]; + mov.u64 %rd34, %rd1; BB2_6: - mul.wide.u32 %rd22, %r55, 4; - add.s64 %rd23, %rd2, %rd22; - ld.global.f32 %f37, [%rd23]; + mul.wide.u32 %rd20, %r49, 4; + add.s64 %rd21, %rd2, %rd20; + ld.f32 %f37, [%rd21]; add.f32 %f51, %f51, %f37; BB2_7: - shl.b32 %r30, %r16, 1; - mov.u32 %r31, %nctaid.x; - mad.lo.s32 %r54, %r30, %r31, %r54; - setp.lt.u32 %p5, %r54, %r13; + shl.b32 %r23, %r1, 1; + mad.lo.s32 %r48, %r23, %r18, %r48; + setp.lt.u32 %p5, %r48, %r14; @%p5 bra BB2_3; bra.uni BB2_11; BB2_8: - mul.wide.u32 %rd24, %r56, 4; - add.s64 %rd25, %rd2, %rd24; - ld.global.f32 %f38, [%rd25]; + mul.wide.u32 %rd22, %r48, 4; + add.s64 %rd23, %rd2, %rd22; + ld.f32 %f38, [%rd23]; add.f32 %f51, %f51, %f38; - add.s32 %r10, %r56, %r16; - setp.ge.u32 %p6, %r10, %r13; + add.s32 %r11, %r48, %r1; + setp.ge.u32 %p6, %r11, %r14; @%p6 bra BB2_10; - mul.wide.u32 %rd26, %r10, 4; - add.s64 %rd27, %rd2, %rd26; - ld.global.f32 %f39, [%rd27]; + mul.wide.u32 %rd24, %r11, 4; + add.s64 %rd25, %rd2, %rd24; + ld.f32 %f39, [%rd25]; add.f32 %f51, %f51, %f39; BB2_10: - mov.u32 %r32, %nctaid.x; - shl.b32 %r33, %r16, 1; - mad.lo.s32 %r56, %r33, %r32, %r56; - setp.lt.u32 %p7, %r56, %r13; + add.s32 %r48, %r48, %r4; + setp.lt.u32 %p7, %r48, %r14; @%p7 bra BB2_8; BB2_11: - shl.b32 %r35, %r17, 2; - mov.u32 %r36, memory; - add.s32 %r12, %r36, %r35; - st.shared.f32 [%r12], %f51; + shl.b32 %r26, %r17, 2; + mov.u32 %r27, memory; + add.s32 %r13, %r27, %r26; + st.shared.f32 [%r13], %f51; bar.sync 0; - setp.lt.u32 %p8, %r16, 1024; + setp.lt.u32 %p8, %r1, 1024; @%p8 bra BB2_15; setp.gt.u32 %p9, %r17, 511; @%p9 bra BB2_14; - ld.shared.f32 %f40, [%r12+2048]; + ld.shared.f32 %f40, [%r13+2048]; add.f32 %f51, %f51, %f40; - st.shared.f32 [%r12], %f51; + st.shared.f32 [%r13], %f51; BB2_14: bar.sync 0; BB2_15: - setp.lt.u32 %p10, %r16, 512; + setp.lt.u32 %p10, %r1, 512; @%p10 bra BB2_19; setp.gt.u32 %p11, %r17, 255; @%p11 bra BB2_18; - ld.shared.f32 %f41, [%r12+1024]; + ld.shared.f32 %f41, [%r13+1024]; add.f32 %f51, %f51, %f41; - st.shared.f32 [%r12], %f51; + st.shared.f32 [%r13], %f51; BB2_18: bar.sync 0; BB2_19: - setp.lt.u32 %p12, %r16, 256; + setp.lt.u32 %p12, %r1, 256; @%p12 bra BB2_23; setp.gt.u32 %p13, %r17, 127; @%p13 bra BB2_22; - ld.shared.f32 %f42, [%r12+512]; + ld.shared.f32 %f42, [%r13+512]; add.f32 %f51, %f51, %f42; - st.shared.f32 [%r12], %f51; + st.shared.f32 [%r13], %f51; BB2_22: bar.sync 0; BB2_23: - setp.lt.u32 %p14, %r16, 128; + setp.lt.u32 %p14, %r1, 128; @%p14 bra BB2_27; setp.gt.u32 %p15, %r17, 63; @%p15 bra BB2_26; - ld.shared.f32 %f43, [%r12+256]; + ld.shared.f32 %f43, [%r13+256]; add.f32 %f51, %f51, %f43; - st.shared.f32 [%r12], %f51; + st.shared.f32 [%r13], %f51; BB2_26: bar.sync 0; @@ -248,72 +255,105 @@ BB2_27: setp.gt.u32 %p16, %r17, 31; @%p16 bra BB2_40; - setp.lt.u32 %p17, %r16, 64; + setp.lt.u32 %p17, %r1, 64; @%p17 bra BB2_30; - ld.volatile.shared.f32 %f44, [%r12+128]; + ld.volatile.shared.f32 %f44, [%r13+128]; add.f32 %f51, %f51, %f44; - st.volatile.shared.f32 [%r12], %f51; + st.volatile.shared.f32 [%r13], %f51; BB2_30: - setp.lt.u32 %p18, %r16, 32; + setp.lt.u32 %p18, %r1, 32; @%p18 bra BB2_32; - ld.volatile.shared.f32 %f45, [%r12+64]; + ld.volatile.shared.f32 %f45, [%r13+64]; add.f32 %f51, %f51, %f45; - st.volatile.shared.f32 [%r12], %f51; + st.volatile.shared.f32 [%r13], %f51; BB2_32: - setp.lt.u32 %p19, %r16, 16; + setp.lt.u32 %p19, %r1, 16; @%p19 bra BB2_34; - ld.volatile.shared.f32 %f46, [%r12+32]; + ld.volatile.shared.f32 %f46, [%r13+32]; add.f32 %f51, %f51, %f46; - st.volatile.shared.f32 [%r12], %f51; + st.volatile.shared.f32 [%r13], %f51; BB2_34: - setp.lt.u32 %p20, %r16, 8; + setp.lt.u32 %p20, %r1, 8; @%p20 bra BB2_36; - ld.volatile.shared.f32 %f47, [%r12+16]; + ld.volatile.shared.f32 %f47, [%r13+16]; add.f32 %f51, %f51, %f47; - st.volatile.shared.f32 [%r12], %f51; + st.volatile.shared.f32 [%r13], %f51; BB2_36: - setp.lt.u32 %p21, %r16, 4; + setp.lt.u32 %p21, %r1, 4; @%p21 bra BB2_38; - ld.volatile.shared.f32 %f48, [%r12+8]; + ld.volatile.shared.f32 %f48, [%r13+8]; add.f32 %f51, %f51, %f48; - st.volatile.shared.f32 [%r12], %f51; + st.volatile.shared.f32 [%r13], %f51; BB2_38: - setp.lt.u32 %p22, %r16, 2; + setp.lt.u32 %p22, %r1, 2; @%p22 bra BB2_40; - ld.volatile.shared.f32 %f49, [%r12+4]; + ld.volatile.shared.f32 %f49, [%r13+4]; add.f32 %f50, %f51, %f49; - st.volatile.shared.f32 [%r12], %f50; + st.volatile.shared.f32 [%r13], %f50; BB2_40: setp.ne.s32 %p23, %r17, 0; - @%p23 bra BB2_44; + @%p23 bra BB2_45; ld.shared.f32 %f32, [memory]; - cvta.to.global.u64 %rd28, %rd10; - ld.global.u64 %rd29, [%rd28+16]; - ld.global.u64 %rd30, [%rd28+32]; - cvta.to.global.u64 %rd35, %rd30; - setp.ne.s64 %p24, %rd29, 0; + cvta.to.global.u64 %rd26, %rd11; + add.s64 %rd6, %rd26, 16; + ld.global.u64 %rd27, [%rd26+16]; + setp.eq.s64 %p24, %rd27, 0; @%p24 bra BB2_43; - mul.wide.u32 %rd31, %r14, 4; - add.s64 %rd35, %rd35, %rd31; + mov.u32 %r44, 0; + add.u64 %rd28, %SP, 0; + add.u64 %rd29, %SPL, 0; + st.local.u32 [%rd29], %r44; + st.local.u32 [%rd29+4], %r15; + mov.u64 %rd30, $str; + cvta.global.u64 %rd31, %rd30; + // Callseq Start 0 + { + .reg .b32 temp_param_reg; + // <end>} + .param .b64 param0; + st.param.b64 [param0+0], %rd31; + .param .b64 param1; + st.param.b64 [param1+0], %rd28; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r46, [retval0+0]; + + //{ + }// Callseq End 0 + // inline asm + trap; + // inline asm + ld.global.u64 %rd37, [%rd6+16]; + bra.uni BB2_44; BB2_43: - st.global.f32 [%rd35], %f32; + ld.global.u64 %rd32, [%rd6+16]; + mul.wide.u32 %rd33, %r15, 4; + add.s64 %rd37, %rd32, %rd33; BB2_44: + st.f32 [%rd37], %f32; + +BB2_45: ret; } @@ -324,151 +364,151 @@ BB2_44: .param .u32 reduce_sum_d_param_2 ) { + .local .align 8 .b8 __local_depot3[8]; + .reg .b64 %SP; + .reg .b64 %SPL; .reg .pred %p<25>; - .reg .b32 %r<57>; + .reg .b32 %r<51>; .reg .f64 %fd<69>; - .reg .b64 %rd<36>; + .reg .b64 %rd<38>; - ld.param.u64 %rd9, [reduce_sum_d_param_0]; - ld.param.u64 %rd10, [reduce_sum_d_param_1]; - ld.param.u32 %r13, [reduce_sum_d_param_2]; - mov.u32 %r14, %ctaid.x; - shl.b32 %r15, %r14, 1; - mov.u32 %r16, %ntid.x; + mov.u64 %SPL, __local_depot3; + cvta.local.u64 %SP, %SPL; + ld.param.u64 %rd10, [reduce_sum_d_param_0]; + ld.param.u64 %rd11, [reduce_sum_d_param_1]; + ld.param.u32 %r14, [reduce_sum_d_param_2]; + mov.u32 %r15, %ctaid.x; + shl.b32 %r16, %r15, 1; + mov.u32 %r1, %ntid.x; mov.u32 %r17, %tid.x; - mad.lo.s32 %r56, %r15, %r16, %r17; + mad.lo.s32 %r48, %r16, %r1, %r17; mov.f64 %fd51, 0d0000000000000000; - setp.ge.u32 %p1, %r56, %r13; + setp.ge.u32 %p1, %r48, %r14; @%p1 bra BB3_11; - cvta.to.global.u64 %rd11, %rd9; - ld.global.u64 %rd1, [%rd11+16]; + cvta.to.global.u64 %rd12, %rd10; + ld.global.u64 %rd1, [%rd12+16]; setp.eq.s64 %p2, %rd1, 0; - ld.global.u64 %rd12, [%rd11+32]; - cvta.to.global.u64 %rd2, %rd12; + ld.global.u64 %rd2, [%rd12+32]; + mov.u32 %r18, %nctaid.x; + mul.lo.s32 %r19, %r1, %r18; + shl.b32 %r4, %r19, 1; mov.f64 %fd51, 0d0000000000000000; @%p2 bra BB3_8; - mad.lo.s32 %r54, %r15, %r16, %r17; - mov.f64 %fd51, 0d0000000000000000; - mov.u64 %rd32, %rd1; + mov.u64 %rd34, %rd1; BB3_3: - cvta.to.global.u64 %rd13, %rd32; - mul.wide.u32 %rd14, %r54, 4; - add.s64 %rd15, %rd13, %rd14; - ld.global.u32 %r27, [%rd15]; - mul.wide.u32 %rd16, %r27, 8; - add.s64 %rd17, %rd2, %rd16; - ld.global.f64 %fd36, [%rd17]; + mul.wide.u32 %rd13, %r48, 4; + add.s64 %rd14, %rd34, %rd13; + ld.u32 %r20, [%rd14]; + mul.wide.u32 %rd15, %r20, 8; + add.s64 %rd16, %rd2, %rd15; + ld.f64 %fd36, [%rd16]; add.f64 %fd51, %fd51, %fd36; - add.s32 %r55, %r54, %r16; - setp.ge.u32 %p3, %r55, %r13; + add.s32 %r49, %r48, %r1; + setp.ge.u32 %p3, %r49, %r14; @%p3 bra BB3_7; - setp.eq.s64 %p4, %rd32, 0; - mov.u64 %rd32, 0; + setp.eq.s64 %p4, %rd34, 0; + mov.u64 %rd34, 0; @%p4 bra BB3_6; - cvta.to.global.u64 %rd19, %rd1; - mul.wide.u32 %rd20, %r55, 4; - add.s64 %rd21, %rd19, %rd20; - ld.global.u32 %r55, [%rd21]; - mov.u64 %rd32, %rd1; + mul.wide.u32 %rd18, %r49, 4; + add.s64 %rd19, %rd1, %rd18; + ld.u32 %r49, [%rd19]; + mov.u64 %rd34, %rd1; BB3_6: - mul.wide.u32 %rd22, %r55, 8; - add.s64 %rd23, %rd2, %rd22; - ld.global.f64 %fd37, [%rd23]; + mul.wide.u32 %rd20, %r49, 8; + add.s64 %rd21, %rd2, %rd20; + ld.f64 %fd37, [%rd21]; add.f64 %fd51, %fd51, %fd37; BB3_7: - shl.b32 %r30, %r16, 1; - mov.u32 %r31, %nctaid.x; - mad.lo.s32 %r54, %r30, %r31, %r54; - setp.lt.u32 %p5, %r54, %r13; + shl.b32 %r23, %r1, 1; + mad.lo.s32 %r48, %r23, %r18, %r48; + setp.lt.u32 %p5, %r48, %r14; @%p5 bra BB3_3; bra.uni BB3_11; BB3_8: - mul.wide.u32 %rd24, %r56, 8; - add.s64 %rd25, %rd2, %rd24; - ld.global.f64 %fd38, [%rd25]; + mul.wide.u32 %rd22, %r48, 8; + add.s64 %rd23, %rd2, %rd22; + ld.f64 %fd38, [%rd23]; add.f64 %fd51, %fd51, %fd38; - add.s32 %r10, %r56, %r16; - setp.ge.u32 %p6, %r10, %r13; + add.s32 %r11, %r48, %r1; + setp.ge.u32 %p6, %r11, %r14; @%p6 bra BB3_10; - mul.wide.u32 %rd26, %r10, 8; - add.s64 %rd27, %rd2, %rd26; - ld.global.f64 %fd39, [%rd27]; + mul.wide.u32 %rd24, %r11, 8; + add.s64 %rd25, %rd2, %rd24; + ld.f64 %fd39, [%rd25]; add.f64 %fd51, %fd51, %fd39; BB3_10: - mov.u32 %r32, %nctaid.x; - shl.b32 %r33, %r16, 1; - mad.lo.s32 %r56, %r33, %r32, %r56; - setp.lt.u32 %p7, %r56, %r13; + add.s32 %r48, %r48, %r4; + setp.lt.u32 %p7, %r48, %r14; @%p7 bra BB3_8; BB3_11: - shl.b32 %r35, %r17, 3; - mov.u32 %r36, memory; - add.s32 %r12, %r36, %r35; - st.shared.f64 [%r12], %fd51; + shl.b32 %r26, %r17, 3; + mov.u32 %r27, memory; + add.s32 %r13, %r27, %r26; + st.shared.f64 [%r13], %fd51; bar.sync 0; - setp.lt.u32 %p8, %r16, 1024; + setp.lt.u32 %p8, %r1, 1024; @%p8 bra BB3_15; setp.gt.u32 %p9, %r17, 511; @%p9 bra BB3_14; - ld.shared.f64 %fd40, [%r12+4096]; + ld.shared.f64 %fd40, [%r13+4096]; add.f64 %fd51, %fd51, %fd40; - st.shared.f64 [%r12], %fd51; + st.shared.f64 [%r13], %fd51; BB3_14: bar.sync 0; BB3_15: - setp.lt.u32 %p10, %r16, 512; + setp.lt.u32 %p10, %r1, 512; @%p10 bra BB3_19; setp.gt.u32 %p11, %r17, 255; @%p11 bra BB3_18; - ld.shared.f64 %fd41, [%r12+2048]; + ld.shared.f64 %fd41, [%r13+2048]; add.f64 %fd51, %fd51, %fd41; - st.shared.f64 [%r12], %fd51; + st.shared.f64 [%r13], %fd51; BB3_18: bar.sync 0; BB3_19: - setp.lt.u32 %p12, %r16, 256; + setp.lt.u32 %p12, %r1, 256; @%p12 bra BB3_23; setp.gt.u32 %p13, %r17, 127; @%p13 bra BB3_22; - ld.shared.f64 %fd42, [%r12+1024]; + ld.shared.f64 %fd42, [%r13+1024]; add.f64 %fd51, %fd51, %fd42; - st.shared.f64 [%r12], %fd51; + st.shared.f64 [%r13], %fd51; BB3_22: bar.sync 0; BB3_23: - setp.lt.u32 %p14, %r16, 128; + setp.lt.u32 %p14, %r1, 128; @%p14 bra BB3_27; setp.gt.u32 %p15, %r17, 63; @%p15 bra BB3_26; - ld.shared.f64 %fd43, [%r12+512]; + ld.shared.f64 %fd43, [%r13+512]; add.f64 %fd51, %fd51, %fd43; - st.shared.f64 [%r12], %fd51; + st.shared.f64 [%r13], %fd51; BB3_26: bar.sync 0; @@ -477,72 +517,105 @@ BB3_27: setp.gt.u32 %p16, %r17, 31; @%p16 bra BB3_40; - setp.lt.u32 %p17, %r16, 64; + setp.lt.u32 %p17, %r1, 64; @%p17 bra BB3_30; - ld.volatile.shared.f64 %fd44, [%r12+256]; + ld.volatile.shared.f64 %fd44, [%r13+256]; add.f64 %fd51, %fd51, %fd44; - st.volatile.shared.f64 [%r12], %fd51; + st.volatile.shared.f64 [%r13], %fd51; BB3_30: - setp.lt.u32 %p18, %r16, 32; + setp.lt.u32 %p18, %r1, 32; @%p18 bra BB3_32; - ld.volatile.shared.f64 %fd45, [%r12+128]; + ld.volatile.shared.f64 %fd45, [%r13+128]; add.f64 %fd51, %fd51, %fd45; - st.volatile.shared.f64 [%r12], %fd51; + st.volatile.shared.f64 [%r13], %fd51; BB3_32: - setp.lt.u32 %p19, %r16, 16; + setp.lt.u32 %p19, %r1, 16; @%p19 bra BB3_34; - ld.volatile.shared.f64 %fd46, [%r12+64]; + ld.volatile.shared.f64 %fd46, [%r13+64]; add.f64 %fd51, %fd51, %fd46; - st.volatile.shared.f64 [%r12], %fd51; + st.volatile.shared.f64 [%r13], %fd51; BB3_34: - setp.lt.u32 %p20, %r16, 8; + setp.lt.u32 %p20, %r1, 8; @%p20 bra BB3_36; - ld.volatile.shared.f64 %fd47, [%r12+32]; + ld.volatile.shared.f64 %fd47, [%r13+32]; add.f64 %fd51, %fd51, %fd47; - st.volatile.shared.f64 [%r12], %fd51; + st.volatile.shared.f64 [%r13], %fd51; BB3_36: - setp.lt.u32 %p21, %r16, 4; + setp.lt.u32 %p21, %r1, 4; @%p21 bra BB3_38; - ld.volatile.shared.f64 %fd48, [%r12+16]; + ld.volatile.shared.f64 %fd48, [%r13+16]; add.f64 %fd51, %fd51, %fd48; - st.volatile.shared.f64 [%r12], %fd51; + st.volatile.shared.f64 [%r13], %fd51; BB3_38: - setp.lt.u32 %p22, %r16, 2; + setp.lt.u32 %p22, %r1, 2; @%p22 bra BB3_40; - ld.volatile.shared.f64 %fd49, [%r12+8]; + ld.volatile.shared.f64 %fd49, [%r13+8]; add.f64 %fd50, %fd51, %fd49; - st.volatile.shared.f64 [%r12], %fd50; + st.volatile.shared.f64 [%r13], %fd50; BB3_40: setp.ne.s32 %p23, %r17, 0; - @%p23 bra BB3_44; + @%p23 bra BB3_45; ld.shared.f64 %fd32, [memory]; - cvta.to.global.u64 %rd28, %rd10; - ld.global.u64 %rd29, [%rd28+16]; - ld.global.u64 %rd30, [%rd28+32]; - cvta.to.global.u64 %rd35, %rd30; - setp.ne.s64 %p24, %rd29, 0; + cvta.to.global.u64 %rd26, %rd11; + add.s64 %rd6, %rd26, 16; + ld.global.u64 %rd27, [%rd26+16]; + setp.eq.s64 %p24, %rd27, 0; @%p24 bra BB3_43; - mul.wide.u32 %rd31, %r14, 8; - add.s64 %rd35, %rd35, %rd31; + mov.u32 %r44, 0; + add.u64 %rd28, %SP, 0; + add.u64 %rd29, %SPL, 0; + st.local.u32 [%rd29], %r44; + st.local.u32 [%rd29+4], %r15; + mov.u64 %rd30, $str; + cvta.global.u64 %rd31, %rd30; + // Callseq Start 1 + { + .reg .b32 temp_param_reg; + // <end>} + .param .b64 param0; + st.param.b64 [param0+0], %rd31; + .param .b64 param1; + st.param.b64 [param1+0], %rd28; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r46, [retval0+0]; + + //{ + }// Callseq End 1 + // inline asm + trap; + // inline asm + ld.global.u64 %rd37, [%rd6+16]; + bra.uni BB3_44; BB3_43: - st.global.f64 [%rd35], %fd32; + ld.global.u64 %rd32, [%rd6+16]; + mul.wide.u32 %rd33, %r15, 8; + add.s64 %rd37, %rd32, %rd33; BB3_44: + st.f64 [%rd37], %fd32; + +BB3_45: ret; } @@ -553,151 +626,151 @@ BB3_44: .param .u32 reduce_max_f_param_2 ) { + .local .align 8 .b8 __local_depot4[8]; + .reg .b64 %SP; + .reg .b64 %SPL; .reg .pred %p<25>; .reg .f32 %f<69>; - .reg .b32 %r<57>; - .reg .b64 %rd<36>; - - - ld.param.u64 %rd9, [reduce_max_f_param_0]; - ld.param.u64 %rd10, [reduce_max_f_param_1]; - ld.param.u32 %r13, [reduce_max_f_param_2]; - mov.u32 %r14, %ctaid.x; - shl.b32 %r15, %r14, 1; - mov.u32 %r16, %ntid.x; + .reg .b32 %r<51>; + .reg .b64 %rd<38>; + + + mov.u64 %SPL, __local_depot4; + cvta.local.u64 %SP, %SPL; + ld.param.u64 %rd10, [reduce_max_f_param_0]; + ld.param.u64 %rd11, [reduce_max_f_param_1]; + ld.param.u32 %r14, [reduce_max_f_param_2]; + mov.u32 %r15, %ctaid.x; + shl.b32 %r16, %r15, 1; + mov.u32 %r1, %ntid.x; mov.u32 %r17, %tid.x; - mad.lo.s32 %r56, %r15, %r16, %r17; + mad.lo.s32 %r48, %r16, %r1, %r17; mov.f32 %f51, 0fFF800000; - setp.ge.u32 %p1, %r56, %r13; + setp.ge.u32 %p1, %r48, %r14; @%p1 bra BB4_11; - cvta.to.global.u64 %rd11, %rd9; - ld.global.u64 %rd1, [%rd11+16]; + cvta.to.global.u64 %rd12, %rd10; + ld.global.u64 %rd1, [%rd12+16]; setp.eq.s64 %p2, %rd1, 0; - ld.global.u64 %rd12, [%rd11+32]; - cvta.to.global.u64 %rd2, %rd12; + ld.global.u64 %rd2, [%rd12+32]; + mov.u32 %r18, %nctaid.x; + mul.lo.s32 %r19, %r1, %r18; + shl.b32 %r4, %r19, 1; mov.f32 %f51, 0fFF800000; @%p2 bra BB4_8; - mad.lo.s32 %r54, %r15, %r16, %r17; - mov.f32 %f51, 0fFF800000; - mov.u64 %rd32, %rd1; + mov.u64 %rd34, %rd1; BB4_3: - cvta.to.global.u64 %rd13, %rd32; - mul.wide.u32 %rd14, %r54, 4; - add.s64 %rd15, %rd13, %rd14; - ld.global.u32 %r27, [%rd15]; - mul.wide.u32 %rd16, %r27, 4; - add.s64 %rd17, %rd2, %rd16; - ld.global.f32 %f36, [%rd17]; + mul.wide.u32 %rd13, %r48, 4; + add.s64 %rd14, %rd34, %rd13; + ld.u32 %r20, [%rd14]; + mul.wide.u32 %rd15, %r20, 4; + add.s64 %rd16, %rd2, %rd15; + ld.f32 %f36, [%rd16]; max.f32 %f51, %f51, %f36; - add.s32 %r55, %r54, %r16; - setp.ge.u32 %p3, %r55, %r13; + add.s32 %r49, %r48, %r1; + setp.ge.u32 %p3, %r49, %r14; @%p3 bra BB4_7; - setp.eq.s64 %p4, %rd32, 0; - mov.u64 %rd32, 0; + setp.eq.s64 %p4, %rd34, 0; + mov.u64 %rd34, 0; @%p4 bra BB4_6; - cvta.to.global.u64 %rd19, %rd1; - mul.wide.u32 %rd20, %r55, 4; - add.s64 %rd21, %rd19, %rd20; - ld.global.u32 %r55, [%rd21]; - mov.u64 %rd32, %rd1; + mul.wide.u32 %rd18, %r49, 4; + add.s64 %rd19, %rd1, %rd18; + ld.u32 %r49, [%rd19]; + mov.u64 %rd34, %rd1; BB4_6: - mul.wide.u32 %rd22, %r55, 4; - add.s64 %rd23, %rd2, %rd22; - ld.global.f32 %f37, [%rd23]; + mul.wide.u32 %rd20, %r49, 4; + add.s64 %rd21, %rd2, %rd20; + ld.f32 %f37, [%rd21]; max.f32 %f51, %f51, %f37; BB4_7: - shl.b32 %r30, %r16, 1; - mov.u32 %r31, %nctaid.x; - mad.lo.s32 %r54, %r30, %r31, %r54; - setp.lt.u32 %p5, %r54, %r13; + shl.b32 %r23, %r1, 1; + mad.lo.s32 %r48, %r23, %r18, %r48; + setp.lt.u32 %p5, %r48, %r14; @%p5 bra BB4_3; bra.uni BB4_11; BB4_8: - mul.wide.u32 %rd24, %r56, 4; - add.s64 %rd25, %rd2, %rd24; - ld.global.f32 %f38, [%rd25]; + mul.wide.u32 %rd22, %r48, 4; + add.s64 %rd23, %rd2, %rd22; + ld.f32 %f38, [%rd23]; max.f32 %f51, %f51, %f38; - add.s32 %r10, %r56, %r16; - setp.ge.u32 %p6, %r10, %r13; + add.s32 %r11, %r48, %r1; + setp.ge.u32 %p6, %r11, %r14; @%p6 bra BB4_10; - mul.wide.u32 %rd26, %r10, 4; - add.s64 %rd27, %rd2, %rd26; - ld.global.f32 %f39, [%rd27]; + mul.wide.u32 %rd24, %r11, 4; + add.s64 %rd25, %rd2, %rd24; + ld.f32 %f39, [%rd25]; max.f32 %f51, %f51, %f39; BB4_10: - mov.u32 %r32, %nctaid.x; - shl.b32 %r33, %r16, 1; - mad.lo.s32 %r56, %r33, %r32, %r56; - setp.lt.u32 %p7, %r56, %r13; + add.s32 %r48, %r48, %r4; + setp.lt.u32 %p7, %r48, %r14; @%p7 bra BB4_8; BB4_11: - shl.b32 %r35, %r17, 2; - mov.u32 %r36, memory; - add.s32 %r12, %r36, %r35; - st.shared.f32 [%r12], %f51; + shl.b32 %r26, %r17, 2; + mov.u32 %r27, memory; + add.s32 %r13, %r27, %r26; + st.shared.f32 [%r13], %f51; bar.sync 0; - setp.lt.u32 %p8, %r16, 1024; + setp.lt.u32 %p8, %r1, 1024; @%p8 bra BB4_15; setp.gt.u32 %p9, %r17, 511; @%p9 bra BB4_14; - ld.shared.f32 %f40, [%r12+2048]; + ld.shared.f32 %f40, [%r13+2048]; max.f32 %f51, %f51, %f40; - st.shared.f32 [%r12], %f51; + st.shared.f32 [%r13], %f51; BB4_14: bar.sync 0; BB4_15: - setp.lt.u32 %p10, %r16, 512; + setp.lt.u32 %p10, %r1, 512; @%p10 bra BB4_19; setp.gt.u32 %p11, %r17, 255; @%p11 bra BB4_18; - ld.shared.f32 %f41, [%r12+1024]; + ld.shared.f32 %f41, [%r13+1024]; max.f32 %f51, %f51, %f41; - st.shared.f32 [%r12], %f51; + st.shared.f32 [%r13], %f51; BB4_18: bar.sync 0; BB4_19: - setp.lt.u32 %p12, %r16, 256; + setp.lt.u32 %p12, %r1, 256; @%p12 bra BB4_23; setp.gt.u32 %p13, %r17, 127; @%p13 bra BB4_22; - ld.shared.f32 %f42, [%r12+512]; + ld.shared.f32 %f42, [%r13+512]; max.f32 %f51, %f51, %f42; - st.shared.f32 [%r12], %f51; + st.shared.f32 [%r13], %f51; BB4_22: bar.sync 0; BB4_23: - setp.lt.u32 %p14, %r16, 128; + setp.lt.u32 %p14, %r1, 128; @%p14 bra BB4_27; setp.gt.u32 %p15, %r17, 63; @%p15 bra BB4_26; - ld.shared.f32 %f43, [%r12+256]; + ld.shared.f32 %f43, [%r13+256]; max.f32 %f51, %f51, %f43; - st.shared.f32 [%r12], %f51; + st.shared.f32 [%r13], %f51; BB4_26: bar.sync 0; @@ -706,72 +779,105 @@ BB4_27: setp.gt.u32 %p16, %r17, 31; @%p16 bra BB4_40; - setp.lt.u32 %p17, %r16, 64; + setp.lt.u32 %p17, %r1, 64; @%p17 bra BB4_30; - ld.volatile.shared.f32 %f44, [%r12+128]; + ld.volatile.shared.f32 %f44, [%r13+128]; max.f32 %f51, %f51, %f44; - st.volatile.shared.f32 [%r12], %f51; + st.volatile.shared.f32 [%r13], %f51; BB4_30: - setp.lt.u32 %p18, %r16, 32; + setp.lt.u32 %p18, %r1, 32; @%p18 bra BB4_32; - ld.volatile.shared.f32 %f45, [%r12+64]; + ld.volatile.shared.f32 %f45, [%r13+64]; max.f32 %f51, %f51, %f45; - st.volatile.shared.f32 [%r12], %f51; + st.volatile.shared.f32 [%r13], %f51; BB4_32: - setp.lt.u32 %p19, %r16, 16; + setp.lt.u32 %p19, %r1, 16; @%p19 bra BB4_34; - ld.volatile.shared.f32 %f46, [%r12+32]; + ld.volatile.shared.f32 %f46, [%r13+32]; max.f32 %f51, %f51, %f46; - st.volatile.shared.f32 [%r12], %f51; + st.volatile.shared.f32 [%r13], %f51; BB4_34: - setp.lt.u32 %p20, %r16, 8; + setp.lt.u32 %p20, %r1, 8; @%p20 bra BB4_36; - ld.volatile.shared.f32 %f47, [%r12+16]; + ld.volatile.shared.f32 %f47, [%r13+16]; max.f32 %f51, %f51, %f47; - st.volatile.shared.f32 [%r12], %f51; + st.volatile.shared.f32 [%r13], %f51; BB4_36: - setp.lt.u32 %p21, %r16, 4; + setp.lt.u32 %p21, %r1, 4; @%p21 bra BB4_38; - ld.volatile.shared.f32 %f48, [%r12+8]; + ld.volatile.shared.f32 %f48, [%r13+8]; max.f32 %f51, %f51, %f48; - st.volatile.shared.f32 [%r12], %f51; + st.volatile.shared.f32 [%r13], %f51; BB4_38: - setp.lt.u32 %p22, %r16, 2; + setp.lt.u32 %p22, %r1, 2; @%p22 bra BB4_40; - ld.volatile.shared.f32 %f49, [%r12+4]; + ld.volatile.shared.f32 %f49, [%r13+4]; max.f32 %f50, %f51, %f49; - st.volatile.shared.f32 [%r12], %f50; + st.volatile.shared.f32 [%r13], %f50; BB4_40: setp.ne.s32 %p23, %r17, 0; - @%p23 bra BB4_44; + @%p23 bra BB4_45; ld.shared.f32 %f32, [memory]; - cvta.to.global.u64 %rd28, %rd10; - ld.global.u64 %rd29, [%rd28+16]; - ld.global.u64 %rd30, [%rd28+32]; - cvta.to.global.u64 %rd35, %rd30; - setp.ne.s64 %p24, %rd29, 0; + cvta.to.global.u64 %rd26, %rd11; + add.s64 %rd6, %rd26, 16; + ld.global.u64 %rd27, [%rd26+16]; + setp.eq.s64 %p24, %rd27, 0; @%p24 bra BB4_43; - mul.wide.u32 %rd31, %r14, 4; - add.s64 %rd35, %rd35, %rd31; + mov.u32 %r44, 0; + add.u64 %rd28, %SP, 0; + add.u64 %rd29, %SPL, 0; + st.local.u32 [%rd29], %r44; + st.local.u32 [%rd29+4], %r15; + mov.u64 %rd30, $str; + cvta.global.u64 %rd31, %rd30; + // Callseq Start 2 + { + .reg .b32 temp_param_reg; + // <end>} + .param .b64 param0; + st.param.b64 [param0+0], %rd31; + .param .b64 param1; + st.param.b64 [param1+0], %rd28; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r46, [retval0+0]; + + //{ + }// Callseq End 2 + // inline asm + trap; + // inline asm + ld.global.u64 %rd37, [%rd6+16]; + bra.uni BB4_44; BB4_43: - st.global.f32 [%rd35], %f32; + ld.global.u64 %rd32, [%rd6+16]; + mul.wide.u32 %rd33, %r15, 4; + add.s64 %rd37, %rd32, %rd33; BB4_44: + st.f32 [%rd37], %f32; + +BB4_45: ret; } @@ -782,85 +888,87 @@ BB4_44: .param .u32 reduce_max_d_param_2 ) { + .local .align 8 .b8 __local_depot5[8]; + .reg .b64 %SP; + .reg .b64 %SPL; .reg .pred %p<23>; - .reg .b32 %r<46>; + .reg .b32 %r<49>; .reg .f64 %fd<60>; - .reg .b64 %rd<34>; + .reg .b64 %rd<36>; - ld.param.u64 %rd10, [reduce_max_d_param_0]; - ld.param.u64 %rd11, [reduce_max_d_param_1]; + mov.u64 %SPL, __local_depot5; + cvta.local.u64 %SP, %SPL; + ld.param.u64 %rd11, [reduce_max_d_param_0]; + ld.param.u64 %rd12, [reduce_max_d_param_1]; ld.param.u32 %r10, [reduce_max_d_param_2]; - mov.u32 %r11, %tid.x; - mov.u32 %r12, %ctaid.x; - shl.b32 %r13, %r12, 1; - mov.u32 %r14, %ntid.x; - mad.lo.s32 %r43, %r13, %r14, %r11; + mov.u32 %r11, %ctaid.x; + shl.b32 %r12, %r11, 1; + mov.u32 %r13, %ntid.x; + mov.u32 %r14, %tid.x; + mad.lo.s32 %r46, %r12, %r13, %r14; mov.f64 %fd44, 0dFFF0000000000000; - setp.ge.u32 %p1, %r43, %r10; + setp.ge.u32 %p1, %r46, %r10; @%p1 bra BB5_9; - cvta.to.global.u64 %rd12, %rd10; - ld.global.u64 %rd1, [%rd12+16]; - ld.global.u64 %rd13, [%rd12+32]; - cvta.to.global.u64 %rd2, %rd13; + cvta.to.global.u64 %rd13, %rd11; + ld.global.u64 %rd1, [%rd13+16]; + ld.global.u64 %rd2, [%rd13+32]; mov.f64 %fd44, 0dFFF0000000000000; - mov.u64 %rd30, %rd1; + mov.u64 %rd32, %rd1; BB5_2: setp.eq.s64 %p2, %rd1, 0; - mov.u32 %r44, %r43; + mov.u32 %r47, %r46; @%p2 bra BB5_4; - cvta.to.global.u64 %rd14, %rd1; - mul.wide.u32 %rd15, %r43, 4; - add.s64 %rd16, %rd14, %rd15; - ld.global.u32 %r44, [%rd16]; - mov.u64 %rd30, %rd1; + mul.wide.u32 %rd14, %r46, 4; + add.s64 %rd15, %rd1, %rd14; + ld.u32 %r47, [%rd15]; + mov.u64 %rd32, %rd1; BB5_4: - mul.wide.u32 %rd17, %r44, 8; - add.s64 %rd18, %rd2, %rd17; - ld.global.f64 %fd31, [%rd18]; + mul.wide.u32 %rd16, %r47, 8; + add.s64 %rd17, %rd2, %rd16; + ld.f64 %fd31, [%rd17]; max.f64 %fd44, %fd44, %fd31; - add.s32 %r45, %r43, %r14; - setp.ge.u32 %p3, %r45, %r10; + add.s32 %r48, %r46, %r13; + setp.ge.u32 %p3, %r48, %r10; @%p3 bra BB5_8; - setp.eq.s64 %p4, %rd30, 0; - mov.u64 %rd30, 0; + setp.eq.s64 %p4, %rd32, 0; + mov.u64 %rd32, 0; @%p4 bra BB5_7; - cvta.to.global.u64 %rd20, %rd1; - add.s32 %r19, %r43, %r14; - mul.wide.u32 %rd21, %r19, 4; - add.s64 %rd22, %rd20, %rd21; - ld.global.u32 %r45, [%rd22]; - mov.u64 %rd30, %rd1; + add.s32 %r19, %r46, %r13; + mul.wide.u32 %rd19, %r19, 4; + add.s64 %rd20, %rd1, %rd19; + ld.u32 %r48, [%rd20]; + mov.u64 %rd32, %rd1; BB5_7: - mul.wide.u32 %rd23, %r45, 8; - add.s64 %rd24, %rd2, %rd23; - ld.global.f64 %fd32, [%rd24]; + mul.wide.u32 %rd21, %r48, 8; + add.s64 %rd22, %rd2, %rd21; + ld.f64 %fd32, [%rd22]; max.f64 %fd44, %fd44, %fd32; BB5_8: - shl.b32 %r21, %r14, 1; + shl.b32 %r21, %r13, 1; mov.u32 %r22, %nctaid.x; - mad.lo.s32 %r43, %r21, %r22, %r43; - setp.lt.u32 %p5, %r43, %r10; + mad.lo.s32 %r46, %r21, %r22, %r46; + setp.lt.u32 %p5, %r46, %r10; @%p5 bra BB5_2; BB5_9: - shl.b32 %r24, %r11, 3; + shl.b32 %r24, %r14, 3; mov.u32 %r25, memory; add.s32 %r9, %r25, %r24; st.shared.f64 [%r9], %fd44; bar.sync 0; - setp.lt.u32 %p6, %r14, 1024; + setp.lt.u32 %p6, %r13, 1024; @%p6 bra BB5_13; - setp.gt.u32 %p7, %r11, 511; + setp.gt.u32 %p7, %r14, 511; @%p7 bra BB5_12; ld.shared.f64 %fd33, [%r9+4096]; @@ -871,10 +979,10 @@ BB5_12: bar.sync 0; BB5_13: - setp.lt.u32 %p8, %r14, 512; + setp.lt.u32 %p8, %r13, 512; @%p8 bra BB5_17; - setp.gt.u32 %p9, %r11, 255; + setp.gt.u32 %p9, %r14, 255; @%p9 bra BB5_16; ld.shared.f64 %fd34, [%r9+2048]; @@ -885,10 +993,10 @@ BB5_16: bar.sync 0; BB5_17: - setp.lt.u32 %p10, %r14, 256; + setp.lt.u32 %p10, %r13, 256; @%p10 bra BB5_21; - setp.gt.u32 %p11, %r11, 127; + setp.gt.u32 %p11, %r14, 127; @%p11 bra BB5_20; ld.shared.f64 %fd35, [%r9+1024]; @@ -899,10 +1007,10 @@ BB5_20: bar.sync 0; BB5_21: - setp.lt.u32 %p12, %r14, 128; + setp.lt.u32 %p12, %r13, 128; @%p12 bra BB5_25; - setp.gt.u32 %p13, %r11, 63; + setp.gt.u32 %p13, %r14, 63; @%p13 bra BB5_24; ld.shared.f64 %fd36, [%r9+512]; @@ -913,10 +1021,10 @@ BB5_24: bar.sync 0; BB5_25: - setp.gt.u32 %p14, %r11, 31; + setp.gt.u32 %p14, %r14, 31; @%p14 bra BB5_38; - setp.lt.u32 %p15, %r14, 64; + setp.lt.u32 %p15, %r13, 64; @%p15 bra BB5_28; ld.volatile.shared.f64 %fd37, [%r9+256]; @@ -924,7 +1032,7 @@ BB5_25: st.volatile.shared.f64 [%r9], %fd44; BB5_28: - setp.lt.u32 %p16, %r14, 32; + setp.lt.u32 %p16, %r13, 32; @%p16 bra BB5_30; ld.volatile.shared.f64 %fd38, [%r9+128]; @@ -932,7 +1040,7 @@ BB5_28: st.volatile.shared.f64 [%r9], %fd44; BB5_30: - setp.lt.u32 %p17, %r14, 16; + setp.lt.u32 %p17, %r13, 16; @%p17 bra BB5_32; ld.volatile.shared.f64 %fd39, [%r9+64]; @@ -940,7 +1048,7 @@ BB5_30: st.volatile.shared.f64 [%r9], %fd44; BB5_32: - setp.lt.u32 %p18, %r14, 8; + setp.lt.u32 %p18, %r13, 8; @%p18 bra BB5_34; ld.volatile.shared.f64 %fd40, [%r9+32]; @@ -948,7 +1056,7 @@ BB5_32: st.volatile.shared.f64 [%r9], %fd44; BB5_34: - setp.lt.u32 %p19, %r14, 4; + setp.lt.u32 %p19, %r13, 4; @%p19 bra BB5_36; ld.volatile.shared.f64 %fd41, [%r9+16]; @@ -956,7 +1064,7 @@ BB5_34: st.volatile.shared.f64 [%r9], %fd44; BB5_36: - setp.lt.u32 %p20, %r14, 2; + setp.lt.u32 %p20, %r13, 2; @%p20 bra BB5_38; ld.volatile.shared.f64 %fd42, [%r9+8]; @@ -964,24 +1072,57 @@ BB5_36: st.volatile.shared.f64 [%r9], %fd43; BB5_38: - setp.ne.s32 %p21, %r11, 0; - @%p21 bra BB5_42; + setp.ne.s32 %p21, %r14, 0; + @%p21 bra BB5_43; ld.shared.f64 %fd28, [memory]; - cvta.to.global.u64 %rd25, %rd11; - ld.global.u64 %rd26, [%rd25+16]; - ld.global.u64 %rd27, [%rd25+32]; - cvta.to.global.u64 %rd33, %rd27; - setp.ne.s64 %p22, %rd26, 0; + cvta.to.global.u64 %rd23, %rd12; + add.s64 %rd7, %rd23, 16; + ld.global.u64 %rd24, [%rd23+16]; + setp.eq.s64 %p22, %rd24, 0; @%p22 bra BB5_41; - mul.wide.u32 %rd28, %r12, 8; - add.s64 %rd33, %rd33, %rd28; + mov.u32 %r42, 0; + add.u64 %rd25, %SP, 0; + add.u64 %rd26, %SPL, 0; + st.local.u32 [%rd26], %r42; + st.local.u32 [%rd26+4], %r11; + mov.u64 %rd27, $str; + cvta.global.u64 %rd28, %rd27; + // Callseq Start 3 + { + .reg .b32 temp_param_reg; + // <end>} + .param .b64 param0; + st.param.b64 [param0+0], %rd28; + .param .b64 param1; + st.param.b64 [param1+0], %rd25; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r44, [retval0+0]; + + //{ + }// Callseq End 3 + // inline asm + trap; + // inline asm + ld.global.u64 %rd35, [%rd7+16]; + bra.uni BB5_42; BB5_41: - st.global.f64 [%rd33], %fd28; + ld.global.u64 %rd29, [%rd7+16]; + mul.wide.u32 %rd30, %r11, 8; + add.s64 %rd35, %rd29, %rd30; BB5_42: + st.f64 [%rd35], %fd28; + +BB5_43: ret; } @@ -992,85 +1133,87 @@ BB5_42: .param .u32 reduce_min_f_param_2 ) { + .local .align 8 .b8 __local_depot6[8]; + .reg .b64 %SP; + .reg .b64 %SPL; .reg .pred %p<23>; .reg .f32 %f<60>; - .reg .b32 %r<46>; - .reg .b64 %rd<34>; + .reg .b32 %r<49>; + .reg .b64 %rd<36>; - ld.param.u64 %rd10, [reduce_min_f_param_0]; - ld.param.u64 %rd11, [reduce_min_f_param_1]; + mov.u64 %SPL, __local_depot6; + cvta.local.u64 %SP, %SPL; + ld.param.u64 %rd11, [reduce_min_f_param_0]; + ld.param.u64 %rd12, [reduce_min_f_param_1]; ld.param.u32 %r10, [reduce_min_f_param_2]; - mov.u32 %r11, %tid.x; - mov.u32 %r12, %ctaid.x; - shl.b32 %r13, %r12, 1; - mov.u32 %r14, %ntid.x; - mad.lo.s32 %r43, %r13, %r14, %r11; + mov.u32 %r11, %ctaid.x; + shl.b32 %r12, %r11, 1; + mov.u32 %r13, %ntid.x; + mov.u32 %r14, %tid.x; + mad.lo.s32 %r46, %r12, %r13, %r14; mov.f32 %f44, 0f7F800000; - setp.ge.u32 %p1, %r43, %r10; + setp.ge.u32 %p1, %r46, %r10; @%p1 bra BB6_9; - cvta.to.global.u64 %rd12, %rd10; - ld.global.u64 %rd1, [%rd12+16]; - ld.global.u64 %rd13, [%rd12+32]; - cvta.to.global.u64 %rd2, %rd13; + cvta.to.global.u64 %rd13, %rd11; + ld.global.u64 %rd1, [%rd13+16]; + ld.global.u64 %rd2, [%rd13+32]; mov.f32 %f44, 0f7F800000; - mov.u64 %rd30, %rd1; + mov.u64 %rd32, %rd1; BB6_2: setp.eq.s64 %p2, %rd1, 0; - mov.u32 %r44, %r43; + mov.u32 %r47, %r46; @%p2 bra BB6_4; - cvta.to.global.u64 %rd14, %rd1; - mul.wide.u32 %rd15, %r43, 4; - add.s64 %rd16, %rd14, %rd15; - ld.global.u32 %r44, [%rd16]; - mov.u64 %rd30, %rd1; + mul.wide.u32 %rd14, %r46, 4; + add.s64 %rd15, %rd1, %rd14; + ld.u32 %r47, [%rd15]; + mov.u64 %rd32, %rd1; BB6_4: - mul.wide.u32 %rd17, %r44, 4; - add.s64 %rd18, %rd2, %rd17; - ld.global.f32 %f31, [%rd18]; + mul.wide.u32 %rd16, %r47, 4; + add.s64 %rd17, %rd2, %rd16; + ld.f32 %f31, [%rd17]; min.f32 %f44, %f44, %f31; - add.s32 %r45, %r43, %r14; - setp.ge.u32 %p3, %r45, %r10; + add.s32 %r48, %r46, %r13; + setp.ge.u32 %p3, %r48, %r10; @%p3 bra BB6_8; - setp.eq.s64 %p4, %rd30, 0; - mov.u64 %rd30, 0; + setp.eq.s64 %p4, %rd32, 0; + mov.u64 %rd32, 0; @%p4 bra BB6_7; - cvta.to.global.u64 %rd20, %rd1; - add.s32 %r19, %r43, %r14; - mul.wide.u32 %rd21, %r19, 4; - add.s64 %rd22, %rd20, %rd21; - ld.global.u32 %r45, [%rd22]; - mov.u64 %rd30, %rd1; + add.s32 %r19, %r46, %r13; + mul.wide.u32 %rd19, %r19, 4; + add.s64 %rd20, %rd1, %rd19; + ld.u32 %r48, [%rd20]; + mov.u64 %rd32, %rd1; BB6_7: - mul.wide.u32 %rd23, %r45, 4; - add.s64 %rd24, %rd2, %rd23; - ld.global.f32 %f32, [%rd24]; + mul.wide.u32 %rd21, %r48, 4; + add.s64 %rd22, %rd2, %rd21; + ld.f32 %f32, [%rd22]; min.f32 %f44, %f44, %f32; BB6_8: - shl.b32 %r21, %r14, 1; + shl.b32 %r21, %r13, 1; mov.u32 %r22, %nctaid.x; - mad.lo.s32 %r43, %r21, %r22, %r43; - setp.lt.u32 %p5, %r43, %r10; + mad.lo.s32 %r46, %r21, %r22, %r46; + setp.lt.u32 %p5, %r46, %r10; @%p5 bra BB6_2; BB6_9: - shl.b32 %r24, %r11, 2; + shl.b32 %r24, %r14, 2; mov.u32 %r25, memory; add.s32 %r9, %r25, %r24; st.shared.f32 [%r9], %f44; bar.sync 0; - setp.lt.u32 %p6, %r14, 1024; + setp.lt.u32 %p6, %r13, 1024; @%p6 bra BB6_13; - setp.gt.u32 %p7, %r11, 511; + setp.gt.u32 %p7, %r14, 511; @%p7 bra BB6_12; ld.shared.f32 %f33, [%r9+2048]; @@ -1081,10 +1224,10 @@ BB6_12: bar.sync 0; BB6_13: - setp.lt.u32 %p8, %r14, 512; + setp.lt.u32 %p8, %r13, 512; @%p8 bra BB6_17; - setp.gt.u32 %p9, %r11, 255; + setp.gt.u32 %p9, %r14, 255; @%p9 bra BB6_16; ld.shared.f32 %f34, [%r9+1024]; @@ -1095,10 +1238,10 @@ BB6_16: bar.sync 0; BB6_17: - setp.lt.u32 %p10, %r14, 256; + setp.lt.u32 %p10, %r13, 256; @%p10 bra BB6_21; - setp.gt.u32 %p11, %r11, 127; + setp.gt.u32 %p11, %r14, 127; @%p11 bra BB6_20; ld.shared.f32 %f35, [%r9+512]; @@ -1109,10 +1252,10 @@ BB6_20: bar.sync 0; BB6_21: - setp.lt.u32 %p12, %r14, 128; + setp.lt.u32 %p12, %r13, 128; @%p12 bra BB6_25; - setp.gt.u32 %p13, %r11, 63; + setp.gt.u32 %p13, %r14, 63; @%p13 bra BB6_24; ld.shared.f32 %f36, [%r9+256]; @@ -1123,10 +1266,10 @@ BB6_24: bar.sync 0; BB6_25: - setp.gt.u32 %p14, %r11, 31; + setp.gt.u32 %p14, %r14, 31; @%p14 bra BB6_38; - setp.lt.u32 %p15, %r14, 64; + setp.lt.u32 %p15, %r13, 64; @%p15 bra BB6_28; ld.volatile.shared.f32 %f37, [%r9+128]; @@ -1134,7 +1277,7 @@ BB6_25: st.volatile.shared.f32 [%r9], %f44; BB6_28: - setp.lt.u32 %p16, %r14, 32; + setp.lt.u32 %p16, %r13, 32; @%p16 bra BB6_30; ld.volatile.shared.f32 %f38, [%r9+64]; @@ -1142,7 +1285,7 @@ BB6_28: st.volatile.shared.f32 [%r9], %f44; BB6_30: - setp.lt.u32 %p17, %r14, 16; + setp.lt.u32 %p17, %r13, 16; @%p17 bra BB6_32; ld.volatile.shared.f32 %f39, [%r9+32]; @@ -1150,7 +1293,7 @@ BB6_30: st.volatile.shared.f32 [%r9], %f44; BB6_32: - setp.lt.u32 %p18, %r14, 8; + setp.lt.u32 %p18, %r13, 8; @%p18 bra BB6_34; ld.volatile.shared.f32 %f40, [%r9+16]; @@ -1158,7 +1301,7 @@ BB6_32: st.volatile.shared.f32 [%r9], %f44; BB6_34: - setp.lt.u32 %p19, %r14, 4; + setp.lt.u32 %p19, %r13, 4; @%p19 bra BB6_36; ld.volatile.shared.f32 %f41, [%r9+8]; @@ -1166,7 +1309,7 @@ BB6_34: st.volatile.shared.f32 [%r9], %f44; BB6_36: - setp.lt.u32 %p20, %r14, 2; + setp.lt.u32 %p20, %r13, 2; @%p20 bra BB6_38; ld.volatile.shared.f32 %f42, [%r9+4]; @@ -1174,24 +1317,57 @@ BB6_36: st.volatile.shared.f32 [%r9], %f43; BB6_38: - setp.ne.s32 %p21, %r11, 0; - @%p21 bra BB6_42; + setp.ne.s32 %p21, %r14, 0; + @%p21 bra BB6_43; ld.shared.f32 %f28, [memory]; - cvta.to.global.u64 %rd25, %rd11; - ld.global.u64 %rd26, [%rd25+16]; - ld.global.u64 %rd27, [%rd25+32]; - cvta.to.global.u64 %rd33, %rd27; - setp.ne.s64 %p22, %rd26, 0; + cvta.to.global.u64 %rd23, %rd12; + add.s64 %rd7, %rd23, 16; + ld.global.u64 %rd24, [%rd23+16]; + setp.eq.s64 %p22, %rd24, 0; @%p22 bra BB6_41; - mul.wide.u32 %rd28, %r12, 4; - add.s64 %rd33, %rd33, %rd28; + mov.u32 %r42, 0; + add.u64 %rd25, %SP, 0; + add.u64 %rd26, %SPL, 0; + st.local.u32 [%rd26], %r42; + st.local.u32 [%rd26+4], %r11; + mov.u64 %rd27, $str; + cvta.global.u64 %rd28, %rd27; + // Callseq Start 4 + { + .reg .b32 temp_param_reg; + // <end>} + .param .b64 param0; + st.param.b64 [param0+0], %rd28; + .param .b64 param1; + st.param.b64 [param1+0], %rd25; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r44, [retval0+0]; + + //{ + }// Callseq End 4 + // inline asm + trap; + // inline asm + ld.global.u64 %rd35, [%rd7+16]; + bra.uni BB6_42; BB6_41: - st.global.f32 [%rd33], %f28; + ld.global.u64 %rd29, [%rd7+16]; + mul.wide.u32 %rd30, %r11, 4; + add.s64 %rd35, %rd29, %rd30; BB6_42: + st.f32 [%rd35], %f28; + +BB6_43: ret; } @@ -1202,85 +1378,87 @@ BB6_42: .param .u32 reduce_min_d_param_2 ) { + .local .align 8 .b8 __local_depot7[8]; + .reg .b64 %SP; + .reg .b64 %SPL; .reg .pred %p<23>; - .reg .b32 %r<46>; + .reg .b32 %r<49>; .reg .f64 %fd<60>; - .reg .b64 %rd<34>; + .reg .b64 %rd<36>; - ld.param.u64 %rd10, [reduce_min_d_param_0]; - ld.param.u64 %rd11, [reduce_min_d_param_1]; + mov.u64 %SPL, __local_depot7; + cvta.local.u64 %SP, %SPL; + ld.param.u64 %rd11, [reduce_min_d_param_0]; + ld.param.u64 %rd12, [reduce_min_d_param_1]; ld.param.u32 %r10, [reduce_min_d_param_2]; - mov.u32 %r11, %tid.x; - mov.u32 %r12, %ctaid.x; - shl.b32 %r13, %r12, 1; - mov.u32 %r14, %ntid.x; - mad.lo.s32 %r43, %r13, %r14, %r11; + mov.u32 %r11, %ctaid.x; + shl.b32 %r12, %r11, 1; + mov.u32 %r13, %ntid.x; + mov.u32 %r14, %tid.x; + mad.lo.s32 %r46, %r12, %r13, %r14; mov.f64 %fd44, 0d7FF0000000000000; - setp.ge.u32 %p1, %r43, %r10; + setp.ge.u32 %p1, %r46, %r10; @%p1 bra BB7_9; - cvta.to.global.u64 %rd12, %rd10; - ld.global.u64 %rd1, [%rd12+16]; - ld.global.u64 %rd13, [%rd12+32]; - cvta.to.global.u64 %rd2, %rd13; + cvta.to.global.u64 %rd13, %rd11; + ld.global.u64 %rd1, [%rd13+16]; + ld.global.u64 %rd2, [%rd13+32]; mov.f64 %fd44, 0d7FF0000000000000; - mov.u64 %rd30, %rd1; + mov.u64 %rd32, %rd1; BB7_2: setp.eq.s64 %p2, %rd1, 0; - mov.u32 %r44, %r43; + mov.u32 %r47, %r46; @%p2 bra BB7_4; - cvta.to.global.u64 %rd14, %rd1; - mul.wide.u32 %rd15, %r43, 4; - add.s64 %rd16, %rd14, %rd15; - ld.global.u32 %r44, [%rd16]; - mov.u64 %rd30, %rd1; + mul.wide.u32 %rd14, %r46, 4; + add.s64 %rd15, %rd1, %rd14; + ld.u32 %r47, [%rd15]; + mov.u64 %rd32, %rd1; BB7_4: - mul.wide.u32 %rd17, %r44, 8; - add.s64 %rd18, %rd2, %rd17; - ld.global.f64 %fd31, [%rd18]; + mul.wide.u32 %rd16, %r47, 8; + add.s64 %rd17, %rd2, %rd16; + ld.f64 %fd31, [%rd17]; min.f64 %fd44, %fd44, %fd31; - add.s32 %r45, %r43, %r14; - setp.ge.u32 %p3, %r45, %r10; + add.s32 %r48, %r46, %r13; + setp.ge.u32 %p3, %r48, %r10; @%p3 bra BB7_8; - setp.eq.s64 %p4, %rd30, 0; - mov.u64 %rd30, 0; + setp.eq.s64 %p4, %rd32, 0; + mov.u64 %rd32, 0; @%p4 bra BB7_7; - cvta.to.global.u64 %rd20, %rd1; - add.s32 %r19, %r43, %r14; - mul.wide.u32 %rd21, %r19, 4; - add.s64 %rd22, %rd20, %rd21; - ld.global.u32 %r45, [%rd22]; - mov.u64 %rd30, %rd1; + add.s32 %r19, %r46, %r13; + mul.wide.u32 %rd19, %r19, 4; + add.s64 %rd20, %rd1, %rd19; + ld.u32 %r48, [%rd20]; + mov.u64 %rd32, %rd1; BB7_7: - mul.wide.u32 %rd23, %r45, 8; - add.s64 %rd24, %rd2, %rd23; - ld.global.f64 %fd32, [%rd24]; + mul.wide.u32 %rd21, %r48, 8; + add.s64 %rd22, %rd2, %rd21; + ld.f64 %fd32, [%rd22]; min.f64 %fd44, %fd44, %fd32; BB7_8: - shl.b32 %r21, %r14, 1; + shl.b32 %r21, %r13, 1; mov.u32 %r22, %nctaid.x; - mad.lo.s32 %r43, %r21, %r22, %r43; - setp.lt.u32 %p5, %r43, %r10; + mad.lo.s32 %r46, %r21, %r22, %r46; + setp.lt.u32 %p5, %r46, %r10; @%p5 bra BB7_2; BB7_9: - shl.b32 %r24, %r11, 3; + shl.b32 %r24, %r14, 3; mov.u32 %r25, memory; add.s32 %r9, %r25, %r24; st.shared.f64 [%r9], %fd44; bar.sync 0; - setp.lt.u32 %p6, %r14, 1024; + setp.lt.u32 %p6, %r13, 1024; @%p6 bra BB7_13; - setp.gt.u32 %p7, %r11, 511; + setp.gt.u32 %p7, %r14, 511; @%p7 bra BB7_12; ld.shared.f64 %fd33, [%r9+4096]; @@ -1291,10 +1469,10 @@ BB7_12: bar.sync 0; BB7_13: - setp.lt.u32 %p8, %r14, 512; + setp.lt.u32 %p8, %r13, 512; @%p8 bra BB7_17; - setp.gt.u32 %p9, %r11, 255; + setp.gt.u32 %p9, %r14, 255; @%p9 bra BB7_16; ld.shared.f64 %fd34, [%r9+2048]; @@ -1305,10 +1483,10 @@ BB7_16: bar.sync 0; BB7_17: - setp.lt.u32 %p10, %r14, 256; + setp.lt.u32 %p10, %r13, 256; @%p10 bra BB7_21; - setp.gt.u32 %p11, %r11, 127; + setp.gt.u32 %p11, %r14, 127; @%p11 bra BB7_20; ld.shared.f64 %fd35, [%r9+1024]; @@ -1319,10 +1497,10 @@ BB7_20: bar.sync 0; BB7_21: - setp.lt.u32 %p12, %r14, 128; + setp.lt.u32 %p12, %r13, 128; @%p12 bra BB7_25; - setp.gt.u32 %p13, %r11, 63; + setp.gt.u32 %p13, %r14, 63; @%p13 bra BB7_24; ld.shared.f64 %fd36, [%r9+512]; @@ -1333,10 +1511,10 @@ BB7_24: bar.sync 0; BB7_25: - setp.gt.u32 %p14, %r11, 31; + setp.gt.u32 %p14, %r14, 31; @%p14 bra BB7_38; - setp.lt.u32 %p15, %r14, 64; + setp.lt.u32 %p15, %r13, 64; @%p15 bra BB7_28; ld.volatile.shared.f64 %fd37, [%r9+256]; @@ -1344,7 +1522,7 @@ BB7_25: st.volatile.shared.f64 [%r9], %fd44; BB7_28: - setp.lt.u32 %p16, %r14, 32; + setp.lt.u32 %p16, %r13, 32; @%p16 bra BB7_30; ld.volatile.shared.f64 %fd38, [%r9+128]; @@ -1352,7 +1530,7 @@ BB7_28: st.volatile.shared.f64 [%r9], %fd44; BB7_30: - setp.lt.u32 %p17, %r14, 16; + setp.lt.u32 %p17, %r13, 16; @%p17 bra BB7_32; ld.volatile.shared.f64 %fd39, [%r9+64]; @@ -1360,7 +1538,7 @@ BB7_30: st.volatile.shared.f64 [%r9], %fd44; BB7_32: - setp.lt.u32 %p18, %r14, 8; + setp.lt.u32 %p18, %r13, 8; @%p18 bra BB7_34; ld.volatile.shared.f64 %fd40, [%r9+32]; @@ -1368,7 +1546,7 @@ BB7_32: st.volatile.shared.f64 [%r9], %fd44; BB7_34: - setp.lt.u32 %p19, %r14, 4; + setp.lt.u32 %p19, %r13, 4; @%p19 bra BB7_36; ld.volatile.shared.f64 %fd41, [%r9+16]; @@ -1376,7 +1554,7 @@ BB7_34: st.volatile.shared.f64 [%r9], %fd44; BB7_36: - setp.lt.u32 %p20, %r14, 2; + setp.lt.u32 %p20, %r13, 2; @%p20 bra BB7_38; ld.volatile.shared.f64 %fd42, [%r9+8]; @@ -1384,24 +1562,57 @@ BB7_36: st.volatile.shared.f64 [%r9], %fd43; BB7_38: - setp.ne.s32 %p21, %r11, 0; - @%p21 bra BB7_42; + setp.ne.s32 %p21, %r14, 0; + @%p21 bra BB7_43; ld.shared.f64 %fd28, [memory]; - cvta.to.global.u64 %rd25, %rd11; - ld.global.u64 %rd26, [%rd25+16]; - ld.global.u64 %rd27, [%rd25+32]; - cvta.to.global.u64 %rd33, %rd27; - setp.ne.s64 %p22, %rd26, 0; + cvta.to.global.u64 %rd23, %rd12; + add.s64 %rd7, %rd23, 16; + ld.global.u64 %rd24, [%rd23+16]; + setp.eq.s64 %p22, %rd24, 0; @%p22 bra BB7_41; - mul.wide.u32 %rd28, %r12, 8; - add.s64 %rd33, %rd33, %rd28; + mov.u32 %r42, 0; + add.u64 %rd25, %SP, 0; + add.u64 %rd26, %SPL, 0; + st.local.u32 [%rd26], %r42; + st.local.u32 [%rd26+4], %r11; + mov.u64 %rd27, $str; + cvta.global.u64 %rd28, %rd27; + // Callseq Start 5 + { + .reg .b32 temp_param_reg; + // <end>} + .param .b64 param0; + st.param.b64 [param0+0], %rd28; + .param .b64 param1; + st.param.b64 [param1+0], %rd25; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r44, [retval0+0]; + + //{ + }// Callseq End 5 + // inline asm + trap; + // inline asm + ld.global.u64 %rd35, [%rd7+16]; + bra.uni BB7_42; BB7_41: - st.global.f64 [%rd33], %fd28; + ld.global.u64 %rd29, [%rd7+16]; + mul.wide.u32 %rd30, %r11, 8; + add.s64 %rd35, %rd29, %rd30; BB7_42: + st.f64 [%rd35], %fd28; + +BB7_43: ret; }
